Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
128 changes: 21 additions & 107 deletions cudax/include/cuda/experimental/__driver/driver_api.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -36,145 +36,59 @@
// #define'd version aliases in cuda.h (e.g. #define cuFoo cuFoo_v2).
// The ## operator suppresses macro expansion of the function name, so this is
// safe even for names that are #define'd to versioned variants.
# define _CUDAX_GET_DRIVER_FUNCTION(pfn_name, major, minor) \
reinterpret_cast<PFN_##pfn_name##_v##major##0##minor##0>( \
# define _CUDAX_GET_DRIVER_FUNCTION(pfn_name, major, minor) \
reinterpret_cast<::PFN_##pfn_name##_v##major##0##minor##0>( \
::cuda::__driver::__get_driver_entry_point(#pfn_name, major, minor))

namespace cuda::experimental::__driver
{
// ── Graph: memset node ──────────────────────────────────────────────────────
// ── Graph: polymorphic add node ─────────────────────────────────────────────

//! \brief Appends a memset node to \p __graph and returns the handle of the new node.
//!
//! The fill description is copied field by field into a `CUDA_MEMSET_NODE_PARAMS` record and
//! handed to the `cuGraphAddMemsetNode` entry point together with the context obtained from
//! `__ctxGetCurrent()`.
//!
//! \param __graph Graph that receives the node.
//! \param __deps Pointer to the dependency nodes; forwarded to the driver unchanged.
//! \param __ndeps Number of entries behind \p __deps; forwarded to the driver unchanged.
//! \param __dst Stored in the `dst` field of the parameter record.
//! \param __pitch Stored in the `pitch` field of the parameter record.
//! \param __value Stored in the `value` field of the parameter record.
//! \param __element_size Stored in the `elementSize` field of the parameter record.
//! \param __width Stored in the `width` field of the parameter record.
//! \param __height Stored in the `height` field of the parameter record.
//! \return The node handle filled in by the driver call.
[[nodiscard]] _CCCL_HOST_API inline ::CUgraphNode __graphAddMemsetNode(
  ::CUgraph __graph,
  const ::CUgraphNode* __deps,
  ::cuda::std::size_t __ndeps,
  ::CUdeviceptr __dst,
  ::cuda::std::size_t __pitch,
  unsigned int __value,
  unsigned int __element_size,
  ::cuda::std::size_t __width,
  ::cuda::std::size_t __height)
{
  // Function-local static: the entry-point lookup runs on the first call only.
  static auto __add_memset_fn = _CUDAX_GET_DRIVER_FUNCTION(cuGraphAddMemsetNode, 10, 0);

  // The driver call takes the context explicitly; use whichever one is current.
  ::CUcontext __current_ctx = ::cuda::__driver::__ctxGetCurrent();

  ::CUDA_MEMSET_NODE_PARAMS __memset_params{};
  __memset_params.dst         = __dst;
  __memset_params.pitch       = __pitch;
  __memset_params.value       = __value;
  __memset_params.elementSize = __element_size;
  __memset_params.width       = __width;
  __memset_params.height      = __height;

  ::CUgraphNode __new_node{};
  ::cuda::__driver::__call_driver_fn(
    __add_memset_fn,
    "Failed to add a memset node to graph",
    &__new_node,
    __graph,
    __deps,
    __ndeps,
    &__memset_params,
    __current_ctx);
  return __new_node;
}

// ── Graph: memcpy node (1-D) ────────────────────────────────────────────────
# if _CCCL_CTK_AT_LEAST(12, 2)

[[nodiscard]] _CCCL_HOST_API inline ::CUgraphNode __graphAddMemcpyNode1D(
::CUgraph __graph,
const ::CUgraphNode* __deps,
::cuda::std::size_t __ndeps,
::CUdeviceptr __dst,
::CUdeviceptr __src,
::cuda::std::size_t __byte_count)
[[nodiscard]] _CCCL_HOST_API inline ::CUgraphNode __graphAddNode(
::CUgraph __graph, const ::CUgraphNode* __deps, ::cuda::std::size_t __ndeps, ::CUgraphNodeParams* __params)
{
static auto __driver_fn = _CUDAX_GET_DRIVER_FUNCTION(cuGraphAddMemcpyNode, 10, 0);
static auto __driver_fn = _CUDAX_GET_DRIVER_FUNCTION(cuGraphAddNode, 12, 2);
::CUgraphNode __node{};
::CUDA_MEMCPY3D __params{};
__params.srcMemoryType = ::CU_MEMORYTYPE_UNIFIED;
__params.srcDevice = __src;
__params.dstMemoryType = ::CU_MEMORYTYPE_UNIFIED;
__params.dstDevice = __dst;
__params.WidthInBytes = __byte_count;
__params.Height = 1;
__params.Depth = 1;
::CUcontext __ctx = ::cuda::__driver::__ctxGetCurrent();
::cuda::__driver::__call_driver_fn(
__driver_fn, "Failed to add a memcpy node to graph", &__node, __graph, __deps, __ndeps, &__params, __ctx);
__driver_fn, "Failed to add a node to graph", &__node, __graph, __deps, __ndeps, __params);
return __node;
}

// ── Graph: host node ────────────────────────────────────────────────────────

//! \brief Appends a host-callback node to \p __graph and returns the handle of the new node.
//!
//! \param __graph Graph that receives the node.
//! \param __deps Pointer to the dependency nodes; forwarded to the driver unchanged.
//! \param __ndeps Number of entries behind \p __deps; forwarded to the driver unchanged.
//! \param __fn Stored in the `fn` field of the `CUDA_HOST_NODE_PARAMS` record.
//! \param __user_data Stored in the `userData` field of the `CUDA_HOST_NODE_PARAMS` record.
//! \return The node handle filled in by the driver call.
[[nodiscard]] _CCCL_HOST_API inline ::CUgraphNode __graphAddHostNode(
  ::CUgraph __graph, const ::CUgraphNode* __deps, ::cuda::std::size_t __ndeps, ::CUhostFn __fn, void* __user_data)
{
  // Function-local static: the entry-point lookup runs on the first call only.
  static auto __add_host_fn = _CUDAX_GET_DRIVER_FUNCTION(cuGraphAddHostNode, 10, 0);

  ::CUDA_HOST_NODE_PARAMS __host_params{};
  __host_params.fn       = __fn;
  __host_params.userData = __user_data;

  ::CUgraphNode __new_node{};
  ::cuda::__driver::__call_driver_fn(
    __add_host_fn, "Failed to add a host node to graph", &__new_node, __graph, __deps, __ndeps, &__host_params);
  return __new_node;
}
# endif // _CCCL_CTK_AT_LEAST(12, 2)

// ── Graph: child graph node ─────────────────────────────────────────────────
// ── Graph: user object (ref-counted data lifetime tied to graph) ─────────────

[[nodiscard]] _CCCL_HOST_API inline ::CUgraphNode __graphAddChildGraphNode(
::CUgraph __graph, const ::CUgraphNode* __deps, ::cuda::std::size_t __ndeps, ::CUgraph __child_graph)
_CCCL_HOST_API inline void __graphRetainUserObject(::CUgraph __graph, void* __ptr, ::CUhostFn __destroy)
{
static auto __driver_fn = _CUDAX_GET_DRIVER_FUNCTION(cuGraphAddChildGraphNode, 10, 0);
::CUgraphNode __node{};
::cuda::__driver::__call_driver_fn(
__driver_fn, "Failed to add a child graph node", &__node, __graph, __deps, __ndeps, __child_graph);
return __node;
}

// ── Graph: event record node ────────────────────────────────────────────────
static auto __create_fn = _CUDAX_GET_DRIVER_FUNCTION(cuUserObjectCreate, 11, 3);
static auto __retain_fn = _CUDAX_GET_DRIVER_FUNCTION(cuGraphRetainUserObject, 11, 3);

[[nodiscard]] _CCCL_HOST_API inline ::CUgraphNode
__graphAddEventRecordNode(::CUgraph __graph, const ::CUgraphNode* __deps, ::cuda::std::size_t __ndeps, ::CUevent __ev)
{
static auto __driver_fn = _CUDAX_GET_DRIVER_FUNCTION(cuGraphAddEventRecordNode, 11, 1);
::CUgraphNode __node{};
::CUuserObject __obj{};
::cuda::__driver::__call_driver_fn(
__driver_fn, "Failed to add an event record node to graph", &__node, __graph, __deps, __ndeps, __ev);
return __node;
}

// ── Graph: event wait node ──────────────────────────────────────────────────

[[nodiscard]] _CCCL_HOST_API inline ::CUgraphNode
__graphAddEventWaitNode(::CUgraph __graph, const ::CUgraphNode* __deps, ::cuda::std::size_t __ndeps, ::CUevent __ev)
{
static auto __driver_fn = _CUDAX_GET_DRIVER_FUNCTION(cuGraphAddEventWaitNode, 11, 1);
::CUgraphNode __node{};
__create_fn, "Failed to create user object", &__obj, __ptr, __destroy, 1u, ::CU_USER_OBJECT_NO_DESTRUCTOR_SYNC);
// CU_GRAPH_USER_OBJECT_MOVE transfers our reference to the graph without incrementing.
// After this call, the graph owns the sole reference — do not release.
::cuda::__driver::__call_driver_fn(
__driver_fn, "Failed to add an event wait node to graph", &__node, __graph, __deps, __ndeps, __ev);
return __node;
__retain_fn, "Failed to retain user object on graph", __graph, __obj, 1u, ::CU_GRAPH_USER_OBJECT_MOVE);
}

// ── Graph: conditional handle ───────────────────────────────────────────────

# if _CCCL_CTK_AT_LEAST(12, 4) && _CCCL_CTK_BELOW(13, 0)
# if _CCCL_CTK_AT_LEAST(12, 4)

[[nodiscard]] _CCCL_HOST_API inline ::CUgraphConditionalHandle
__graphConditionalHandleCreate(::CUgraph __graph, unsigned int __default_val, unsigned int __flags)
__graphConditionalHandleCreate(::CUgraph __graph, ::CUcontext __ctx, unsigned int __default_val, unsigned int __flags)
{
static auto __driver_fn = _CUDAX_GET_DRIVER_FUNCTION(cuGraphConditionalHandleCreate, 12, 3);
::CUgraphConditionalHandle __handle{};
::CUcontext __ctx = ::cuda::__driver::__ctxGetCurrent();
::cuda::__driver::__call_driver_fn(
__driver_fn, "Failed to create a conditional handle", &__handle, __graph, __ctx, __default_val, __flags);
return __handle;
}

// ── Graph: generic add node (used for conditional nodes) ────────────────────

//! \brief Appends a node described by a generic `CUgraphNodeParams` record to \p __graph.
//!
//! \param __graph Graph that receives the node.
//! \param __deps Pointer to the dependency nodes; forwarded to the driver unchanged.
//! \param __ndeps Number of entries behind \p __deps; forwarded to the driver unchanged.
//! \param __params Caller-filled node description; the pointer is forwarded to the driver as is.
//! \return The node handle filled in by the driver call.
[[nodiscard]] _CCCL_HOST_API inline ::CUgraphNode __graphAddNode(
  ::CUgraph __graph, const ::CUgraphNode* __deps, ::cuda::std::size_t __ndeps, ::CUgraphNodeParams* __params)
{
  // Function-local static: the entry-point lookup runs on the first call only.
  static auto __add_node_fn = _CUDAX_GET_DRIVER_FUNCTION(cuGraphAddNode, 12, 2);

  ::CUgraphNode __new_node{};
  ::cuda::__driver::__call_driver_fn(
    __add_node_fn, "Failed to add a node to graph", &__new_node, __graph, __deps, __ndeps, __params);
  return __new_node;
}

# endif // _CCCL_CTK_AT_LEAST(12, 4) && _CCCL_CTK_BELOW(13, 0)
# endif // _CCCL_CTK_AT_LEAST(12, 4)

// ── Graph: create ───────────────────────────────────────────────────────────

Expand Down
92 changes: 92 additions & 0 deletions cudax/include/cuda/experimental/__graph/child_graph.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
//===----------------------------------------------------------------------===//
//
// Part of CUDA Experimental in CUDA C++ Core Libraries,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

#ifndef _CUDAX__GRAPH_CHILD_GRAPH_CUH
#define _CUDAX__GRAPH_CHILD_GRAPH_CUH

#include <cuda/std/detail/__config>

#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
# pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
# pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
# pragma system_header
#endif // no system header

#if _CCCL_CTK_AT_LEAST(12, 2)

# include <cuda/experimental/__driver/driver_api.cuh>
# include <cuda/experimental/__graph/graph_builder.cuh>
# include <cuda/experimental/__graph/graph_builder_ref.cuh>
# include <cuda/experimental/__graph/graph_node_ref.cuh>
# include <cuda/experimental/__graph/path_builder.cuh>

# include <cuda/std/__cccl/prologue.h>

namespace cuda::experimental
{
//! \brief Inserts a child-graph node into a CUDA graph path.
//!
//! The graph referenced by \p __child is embedded in the parent graph as one node whose
//! predecessors are the path's current dependencies. Every node of the child graph executes
//! before any successor of the new child-graph node. Once the node exists it is installed
//! as the path's dependency through `__clear_and_set_dependency_node`.
//!
//! \param __pb Path builder to insert the node into.
//! \param __child A `graph_builder_ref` whose underlying graph will become the child.
//! \return A `graph_node_ref` for the newly added child-graph node.
//! \throws cuda::std::cuda_error if node creation fails.
_CCCL_HOST_API inline graph_node_ref insert_child_graph(path_builder& __pb, graph_builder_ref __child)
{
  auto __predecessors = __pb.get_dependencies();

  // Describe the node through the generic parameter record: a graph-typed node wrapping the child.
  ::CUgraphNodeParams __node_params{};
  __node_params.type        = ::CU_GRAPH_NODE_TYPE_GRAPH;
  __node_params.graph.graph = __child.get();

  auto __child_node = ::cuda::experimental::__driver::__graphAddNode(
    __pb.get_native_graph_handle(), __predecessors.data(), __predecessors.size(), &__node_params);

  // Later insertions on this path are ordered after the child-graph node.
  __pb.__clear_and_set_dependency_node(__child_node);
  return graph_node_ref{__child_node, __pb.get_native_graph_handle()};
}

# if _CCCL_CTK_AT_LEAST(12, 9)
//! \brief Inserts a child-graph node into a CUDA graph path and hands the child over to the parent.
//!
//! The node is created with `CU_GRAPH_CHILD_GRAPH_OWNERSHIP_MOVE`, so the child graph is moved
//! into the parent graph node. After a successful call the `graph_builder` is left in a null
//! state and the parent graph owns the child's lifetime.
//!
//! \param __pb Path builder to insert the node into.
//! \param __child An rvalue `graph_builder` whose graph will be moved into the parent.
//! \return A `graph_node_ref` for the newly added child-graph node.
//! \throws cuda::std::cuda_error if node creation fails.
_CCCL_HOST_API inline graph_node_ref insert_child_graph(path_builder& __pb, graph_builder&& __child)
{
  auto __predecessors = __pb.get_dependencies();

  // Graph-typed node whose parameters ask the driver to take the child by move.
  ::CUgraphNodeParams __node_params{};
  __node_params.type            = ::CU_GRAPH_NODE_TYPE_GRAPH;
  __node_params.graph.graph     = __child.get();
  __node_params.graph.ownership = ::CU_GRAPH_CHILD_GRAPH_OWNERSHIP_MOVE;

  auto __child_node = ::cuda::experimental::__driver::__graphAddNode(
    __pb.get_native_graph_handle(), __predecessors.data(), __predecessors.size(), &__node_params);

  // Give up the builder's handle only after the node exists: if node creation throws,
  // this line is never reached and the builder still holds its graph.
  (void) __child.release();

  // Later insertions on this path are ordered after the child-graph node.
  __pb.__clear_and_set_dependency_node(__child_node);
  return graph_node_ref{__child_node, __pb.get_native_graph_handle()};
}
# endif // _CCCL_CTK_AT_LEAST(12, 9)
} // namespace cuda::experimental

# include <cuda/std/__cccl/epilogue.h>

#endif // _CCCL_CTK_AT_LEAST(12, 2)

#endif // _CUDAX__GRAPH_CHILD_GRAPH_CUH
Loading
Loading