diff --git a/cudax/include/cuda/experimental/__driver/driver_api.cuh b/cudax/include/cuda/experimental/__driver/driver_api.cuh index 13973ef9a49..0a37858aa3b 100644 --- a/cudax/include/cuda/experimental/__driver/driver_api.cuh +++ b/cudax/include/cuda/experimental/__driver/driver_api.cuh @@ -36,145 +36,59 @@ // #define'd version aliases in cuda.h (e.g. #define cuFoo cuFoo_v2). // The ## operator suppresses macro expansion of the function name, so this is // safe even for names that are #define'd to versioned variants. -# define _CUDAX_GET_DRIVER_FUNCTION(pfn_name, major, minor) \ - reinterpret_cast( \ +# define _CUDAX_GET_DRIVER_FUNCTION(pfn_name, major, minor) \ + reinterpret_cast<::PFN_##pfn_name##_v##major##0##minor##0>( \ ::cuda::__driver::__get_driver_entry_point(#pfn_name, major, minor)) namespace cuda::experimental::__driver { -// ── Graph: memset node ────────────────────────────────────────────────────── +// ── Graph: polymorphic add node ───────────────────────────────────────────── -[[nodiscard]] _CCCL_HOST_API inline ::CUgraphNode __graphAddMemsetNode( - ::CUgraph __graph, - const ::CUgraphNode* __deps, - ::cuda::std::size_t __ndeps, - ::CUdeviceptr __dst, - ::cuda::std::size_t __pitch, - unsigned int __value, - unsigned int __element_size, - ::cuda::std::size_t __width, - ::cuda::std::size_t __height) -{ - static auto __driver_fn = _CUDAX_GET_DRIVER_FUNCTION(cuGraphAddMemsetNode, 10, 0); - ::CUgraphNode __node{}; - ::CUDA_MEMSET_NODE_PARAMS __params{}; - __params.dst = __dst; - __params.pitch = __pitch; - __params.value = __value; - __params.elementSize = __element_size; - __params.width = __width; - __params.height = __height; - ::CUcontext __ctx = ::cuda::__driver::__ctxGetCurrent(); - ::cuda::__driver::__call_driver_fn( - __driver_fn, "Failed to add a memset node to graph", &__node, __graph, __deps, __ndeps, &__params, __ctx); - return __node; -} - -// ── Graph: memcpy node (1-D) ──────────────────────────────────────────────── +# if 
_CCCL_CTK_AT_LEAST(12, 2) -[[nodiscard]] _CCCL_HOST_API inline ::CUgraphNode __graphAddMemcpyNode1D( - ::CUgraph __graph, - const ::CUgraphNode* __deps, - ::cuda::std::size_t __ndeps, - ::CUdeviceptr __dst, - ::CUdeviceptr __src, - ::cuda::std::size_t __byte_count) +[[nodiscard]] _CCCL_HOST_API inline ::CUgraphNode __graphAddNode( + ::CUgraph __graph, const ::CUgraphNode* __deps, ::cuda::std::size_t __ndeps, ::CUgraphNodeParams* __params) { - static auto __driver_fn = _CUDAX_GET_DRIVER_FUNCTION(cuGraphAddMemcpyNode, 10, 0); + static auto __driver_fn = _CUDAX_GET_DRIVER_FUNCTION(cuGraphAddNode, 12, 2); ::CUgraphNode __node{}; - ::CUDA_MEMCPY3D __params{}; - __params.srcMemoryType = ::CU_MEMORYTYPE_UNIFIED; - __params.srcDevice = __src; - __params.dstMemoryType = ::CU_MEMORYTYPE_UNIFIED; - __params.dstDevice = __dst; - __params.WidthInBytes = __byte_count; - __params.Height = 1; - __params.Depth = 1; - ::CUcontext __ctx = ::cuda::__driver::__ctxGetCurrent(); ::cuda::__driver::__call_driver_fn( - __driver_fn, "Failed to add a memcpy node to graph", &__node, __graph, __deps, __ndeps, &__params, __ctx); + __driver_fn, "Failed to add a node to graph", &__node, __graph, __deps, __ndeps, __params); return __node; } -// ── Graph: host node ──────────────────────────────────────────────────────── - -[[nodiscard]] _CCCL_HOST_API inline ::CUgraphNode __graphAddHostNode( - ::CUgraph __graph, const ::CUgraphNode* __deps, ::cuda::std::size_t __ndeps, ::CUhostFn __fn, void* __user_data) -{ - static auto __driver_fn = _CUDAX_GET_DRIVER_FUNCTION(cuGraphAddHostNode, 10, 0); - ::CUgraphNode __node{}; - ::CUDA_HOST_NODE_PARAMS __params{}; - __params.fn = __fn; - __params.userData = __user_data; - ::cuda::__driver::__call_driver_fn( - __driver_fn, "Failed to add a host node to graph", &__node, __graph, __deps, __ndeps, &__params); - return __node; -} +# endif // _CCCL_CTK_AT_LEAST(12, 2) -// ── Graph: child graph node ───────────────────────────────────────────────── +// ── Graph: user 
object (ref-counted data lifetime tied to graph) ───────────── -[[nodiscard]] _CCCL_HOST_API inline ::CUgraphNode __graphAddChildGraphNode( - ::CUgraph __graph, const ::CUgraphNode* __deps, ::cuda::std::size_t __ndeps, ::CUgraph __child_graph) +_CCCL_HOST_API inline void __graphRetainUserObject(::CUgraph __graph, void* __ptr, ::CUhostFn __destroy) { - static auto __driver_fn = _CUDAX_GET_DRIVER_FUNCTION(cuGraphAddChildGraphNode, 10, 0); - ::CUgraphNode __node{}; - ::cuda::__driver::__call_driver_fn( - __driver_fn, "Failed to add a child graph node", &__node, __graph, __deps, __ndeps, __child_graph); - return __node; -} - -// ── Graph: event record node ──────────────────────────────────────────────── + static auto __create_fn = _CUDAX_GET_DRIVER_FUNCTION(cuUserObjectCreate, 11, 3); + static auto __retain_fn = _CUDAX_GET_DRIVER_FUNCTION(cuGraphRetainUserObject, 11, 3); -[[nodiscard]] _CCCL_HOST_API inline ::CUgraphNode -__graphAddEventRecordNode(::CUgraph __graph, const ::CUgraphNode* __deps, ::cuda::std::size_t __ndeps, ::CUevent __ev) -{ - static auto __driver_fn = _CUDAX_GET_DRIVER_FUNCTION(cuGraphAddEventRecordNode, 11, 1); - ::CUgraphNode __node{}; + ::CUuserObject __obj{}; ::cuda::__driver::__call_driver_fn( - __driver_fn, "Failed to add an event record node to graph", &__node, __graph, __deps, __ndeps, __ev); - return __node; -} - -// ── Graph: event wait node ────────────────────────────────────────────────── - -[[nodiscard]] _CCCL_HOST_API inline ::CUgraphNode -__graphAddEventWaitNode(::CUgraph __graph, const ::CUgraphNode* __deps, ::cuda::std::size_t __ndeps, ::CUevent __ev) -{ - static auto __driver_fn = _CUDAX_GET_DRIVER_FUNCTION(cuGraphAddEventWaitNode, 11, 1); - ::CUgraphNode __node{}; + __create_fn, "Failed to create user object", &__obj, __ptr, __destroy, 1u, ::CU_USER_OBJECT_NO_DESTRUCTOR_SYNC); + // CU_GRAPH_USER_OBJECT_MOVE transfers our reference to the graph without incrementing. 
+ // After this call, the graph owns the sole reference — do not release. ::cuda::__driver::__call_driver_fn( - __driver_fn, "Failed to add an event wait node to graph", &__node, __graph, __deps, __ndeps, __ev); - return __node; + __retain_fn, "Failed to retain user object on graph", __graph, __obj, 1u, ::CU_GRAPH_USER_OBJECT_MOVE); } // ── Graph: conditional handle ─────────────────────────────────────────────── -# if _CCCL_CTK_AT_LEAST(12, 4) && _CCCL_CTK_BELOW(13, 0) +# if _CCCL_CTK_AT_LEAST(12, 4) [[nodiscard]] _CCCL_HOST_API inline ::CUgraphConditionalHandle -__graphConditionalHandleCreate(::CUgraph __graph, unsigned int __default_val, unsigned int __flags) +__graphConditionalHandleCreate(::CUgraph __graph, ::CUcontext __ctx, unsigned int __default_val, unsigned int __flags) { static auto __driver_fn = _CUDAX_GET_DRIVER_FUNCTION(cuGraphConditionalHandleCreate, 12, 3); ::CUgraphConditionalHandle __handle{}; - ::CUcontext __ctx = ::cuda::__driver::__ctxGetCurrent(); ::cuda::__driver::__call_driver_fn( __driver_fn, "Failed to create a conditional handle", &__handle, __graph, __ctx, __default_val, __flags); return __handle; } -// ── Graph: generic add node (used for conditional nodes) ──────────────────── - -[[nodiscard]] _CCCL_HOST_API inline ::CUgraphNode __graphAddNode( - ::CUgraph __graph, const ::CUgraphNode* __deps, ::cuda::std::size_t __ndeps, ::CUgraphNodeParams* __params) -{ - static auto __driver_fn = _CUDAX_GET_DRIVER_FUNCTION(cuGraphAddNode, 12, 2); - ::CUgraphNode __node{}; - ::cuda::__driver::__call_driver_fn( - __driver_fn, "Failed to add a node to graph", &__node, __graph, __deps, __ndeps, __params); - return __node; -} - -# endif // _CCCL_CTK_AT_LEAST(12, 4) && _CCCL_CTK_BELOW(13, 0) +# endif // _CCCL_CTK_AT_LEAST(12, 4) // ── Graph: create ─────────────────────────────────────────────────────────── diff --git a/cudax/include/cuda/experimental/__graph/child_graph.cuh b/cudax/include/cuda/experimental/__graph/child_graph.cuh new file mode 100644 
index 00000000000..722e6b0c78c --- /dev/null +++ b/cudax/include/cuda/experimental/__graph/child_graph.cuh @@ -0,0 +1,92 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDAX__GRAPH_CHILD_GRAPH_CUH +#define _CUDAX__GRAPH_CHILD_GRAPH_CUH + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#if _CCCL_CTK_AT_LEAST(12, 2) + +# include +# include +# include +# include +# include + +# include + +namespace cuda::experimental +{ +//! \brief Adds a child graph node to a CUDA graph path. +//! +//! The entire subgraph described by \p __child is embedded as a single node in the parent +//! graph. All nodes in the child graph execute before any successor of the new child-graph +//! node. +//! +//! \param __pb Path builder to insert the node into. +//! \param __child A `graph_builder_ref` whose underlying graph will become the child. +//! \return A `graph_node_ref` for the newly added child-graph node. +//! \throws cuda::std::cuda_error if node creation fails. 
+_CCCL_HOST_API inline graph_node_ref insert_child_graph(path_builder& __pb, graph_builder_ref __child) +{ + auto __deps = __pb.get_dependencies(); + ::CUgraphNodeParams __params{}; + __params.type = ::CU_GRAPH_NODE_TYPE_GRAPH; + __params.graph.graph = __child.get(); + auto __node = ::cuda::experimental::__driver::__graphAddNode( + __pb.get_native_graph_handle(), __deps.data(), __deps.size(), &__params); + + __pb.__clear_and_set_dependency_node(__node); + return graph_node_ref{__node, __pb.get_native_graph_handle()}; +} + +# if _CCCL_CTK_AT_LEAST(12, 9) +//! \brief Adds a child graph node to a CUDA graph path, transferring ownership. +//! +//! The child graph is moved into the parent graph node. After this call, the +//! `graph_builder` is left in a null state and the parent graph owns the child's +//! lifetime. +//! +//! \param __pb Path builder to insert the node into. +//! \param __child An rvalue `graph_builder` whose graph will be moved into the parent. +//! \return A `graph_node_ref` for the newly added child-graph node. +//! \throws cuda::std::cuda_error if node creation fails. 
+_CCCL_HOST_API inline graph_node_ref insert_child_graph(path_builder& __pb, graph_builder&& __child) +{ + auto __deps = __pb.get_dependencies(); + ::CUgraphNodeParams __params{}; + __params.type = ::CU_GRAPH_NODE_TYPE_GRAPH; + __params.graph.graph = __child.get(); + __params.graph.ownership = ::CU_GRAPH_CHILD_GRAPH_OWNERSHIP_MOVE; + auto __node = ::cuda::experimental::__driver::__graphAddNode( + __pb.get_native_graph_handle(), __deps.data(), __deps.size(), &__params); + + (void) __child.release(); + + __pb.__clear_and_set_dependency_node(__node); + return graph_node_ref{__node, __pb.get_native_graph_handle()}; +} +# endif // _CCCL_CTK_AT_LEAST(12, 9) +} // namespace cuda::experimental + +# include + +#endif // _CCCL_CTK_AT_LEAST(12, 2) + +#endif // _CUDAX__GRAPH_CHILD_GRAPH_CUH diff --git a/cudax/include/cuda/experimental/__graph/conditional_node.cuh b/cudax/include/cuda/experimental/__graph/conditional_node.cuh new file mode 100644 index 00000000000..25da74d359d --- /dev/null +++ b/cudax/include/cuda/experimental/__graph/conditional_node.cuh @@ -0,0 +1,187 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _CUDAX__GRAPH_CONDITIONAL_NODE_CUH +#define _CUDAX__GRAPH_CONDITIONAL_NODE_CUH + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#if _CCCL_CTK_AT_LEAST(12, 4) + +# include +# include +# include +# include + +# include + +namespace cuda::experimental +{ +//! \brief A thin, non-owning wrapper around a `cudaGraphConditionalHandle`. +//! +//! A conditional handle is a graph-scoped token whose value at runtime controls whether +//! the body of an `if` or `while` conditional node executes. The handle is owned by +//! the graph — there is no destroy API — so this wrapper is trivially copyable and +//! safe to pass by value into device kernels. +//! +//! Users can construct a handle directly, or let `make_if_node` / `make_while_node` +//! create one automatically. +//! +//! \rst +//! .. _cudax-graph-conditional-handle: +//! \endrst +struct conditional_handle +{ + //! \brief Creates a conditional handle for the given graph. + //! + //! \param __graph Graph in which the conditional node will be inserted. + //! \param __default_val Initial value of the handle (1 = execute body, 0 = skip). + //! \throws cuda::std::cuda_error if `cudaGraphConditionalHandleCreate` fails. + _CCCL_HOST_API explicit conditional_handle(graph_builder_ref __graph, unsigned int __default_val = 1) + : __handle_(::cuda::experimental::__driver::__graphConditionalHandleCreate( + __graph.get(), __graph.get_device().primary_context(), __default_val, ::cudaGraphCondAssignDefault)) + {} + + //! \brief Sets the runtime value of the conditional handle from device code. + //! + //! \param __value Non-zero to execute the body, zero to skip. 
+ _CCCL_DEVICE void set_value(unsigned int __value) const noexcept + { + ::cudaGraphSetConditional(__handle_, __value); + } + + //! \brief Convenience: enables execution of the conditional body (sets the handle to 1). + _CCCL_DEVICE void enable() const noexcept + { + set_value(1u); + } + + //! \brief Convenience: disables execution of the conditional body (sets the handle to 0). + _CCCL_DEVICE void disable() const noexcept + { + set_value(0u); + } + + //! \brief Returns the underlying `cudaGraphConditionalHandle`. + [[nodiscard]] _CCCL_NODEBUG_HOST_API ::cudaGraphConditionalHandle get() const noexcept + { + return __handle_; + } + +private: + ::cudaGraphConditionalHandle __handle_{}; +}; + +//! \brief Result of adding a conditional node. +//! +//! Contains the newly created conditional node, the body graph that should be +//! populated by the caller, and the conditional handle to pass into body kernels. +struct conditional_node_result +{ + graph_node_ref node; //!< The conditional node in the parent graph. + graph_builder_ref body_graph; //!< The body graph to populate with operations. + conditional_handle handle; //!< The handle to control execution from device code. 
+}; + +_CCCL_HOST_API inline conditional_node_result +__make_conditional_node(path_builder& __pb, conditional_handle __handle, ::CUgraphConditionalNodeType __type) +{ + auto __deps = __pb.get_dependencies(); + + ::CUgraphNodeParams __params{}; + __params.type = ::CU_GRAPH_NODE_TYPE_CONDITIONAL; + __params.conditional.handle = __handle.get(); + __params.conditional.type = __type; + __params.conditional.size = 1; + __params.conditional.ctx = __pb.get_device().primary_context(); + + auto __node = ::cuda::experimental::__driver::__graphAddNode( + __pb.get_native_graph_handle(), __deps.data(), __deps.size(), &__params); + + __pb.__clear_and_set_dependency_node(__node); + + return {graph_node_ref{__node, __pb.get_native_graph_handle()}, + graph_builder_ref{__params.conditional.phGraph_out[0], __pb.get_device()}, + __handle}; +} +//! \brief Adds an `if`-conditional node to a CUDA graph path. +//! +//! At runtime, if the value of the handle is non-zero the body graph executes once; +//! otherwise it is skipped entirely. +//! +//! The caller must populate the returned `body_graph` with all operations that should +//! run conditionally before the parent graph is instantiated. +//! +//! \param __pb Path builder to insert the node into. +//! \param __default_val Initial handle value (1 = execute, 0 = skip). Ignored when +//! \p __handle is provided. +//! \return A `conditional_node_result` containing the node ref, body graph, and handle. +//! \throws cuda::std::cuda_error if node creation fails. +_CCCL_HOST_API inline conditional_node_result make_if_node(path_builder& __pb, unsigned int __default_val = 1) +{ + conditional_handle __handle{__pb.get_graph(), __default_val}; + return __make_conditional_node(__pb, __handle, ::CU_GRAPH_COND_TYPE_IF); +} + +//! \brief Adds an `if`-conditional node reusing an existing conditional handle. +//! +//! \param __pb Path builder to insert the node into. +//! \param __handle An existing conditional handle (e.g. shared with another node). +//! 
\return A `conditional_node_result` containing the node ref, body graph, and handle. +//! \throws cuda::std::cuda_error if node creation fails. +_CCCL_HOST_API inline conditional_node_result make_if_node(path_builder& __pb, conditional_handle __handle) +{ + return __make_conditional_node(__pb, __handle, ::CU_GRAPH_COND_TYPE_IF); +} + +//! \brief Adds a `while`-conditional node to a CUDA graph path. +//! +//! At runtime, the body graph is executed repeatedly as long as the handle value +//! is non-zero at the start of each iteration (including the first). +//! +//! The caller must populate the returned `body_graph` before instantiating the parent +//! graph. The body is responsible for calling `handle.set_value(0)` or `handle.disable()` +//! to terminate the loop. +//! +//! \param __pb Path builder to insert the node into. +//! \param __default_val Initial handle value (1 = enter loop, 0 = skip). +//! \return A `conditional_node_result` containing the node ref, body graph, and handle. +//! \throws cuda::std::cuda_error if node creation fails. +_CCCL_HOST_API inline conditional_node_result make_while_node(path_builder& __pb, unsigned int __default_val = 1) +{ + conditional_handle __handle{__pb.get_graph(), __default_val}; + return __make_conditional_node(__pb, __handle, ::CU_GRAPH_COND_TYPE_WHILE); +} + +//! \brief Adds a `while`-conditional node reusing an existing conditional handle. +//! +//! \param __pb Path builder to insert the node into. +//! \param __handle An existing conditional handle (e.g. shared with another node). +//! \return A `conditional_node_result` containing the node ref, body graph, and handle. +//! \throws cuda::std::cuda_error if node creation fails. 
+_CCCL_HOST_API inline conditional_node_result make_while_node(path_builder& __pb, conditional_handle __handle) +{ + return __make_conditional_node(__pb, __handle, ::CU_GRAPH_COND_TYPE_WHILE); +} +} // namespace cuda::experimental + +# include + +#endif // _CCCL_CTK_AT_LEAST(12, 4) + +#endif // _CUDAX__GRAPH_CONDITIONAL_NODE_CUH diff --git a/cudax/include/cuda/experimental/__graph/copy_bytes.cuh b/cudax/include/cuda/experimental/__graph/copy_bytes.cuh new file mode 100644 index 00000000000..35ed8dde0ff --- /dev/null +++ b/cudax/include/cuda/experimental/__graph/copy_bytes.cuh @@ -0,0 +1,147 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _CUDAX__GRAPH_COPY_BYTES_CUH +#define _CUDAX__GRAPH_COPY_BYTES_CUH + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#if _CCCL_CTK_AT_LEAST(12, 2) + +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include + +# include +# include +# include + +# include + +namespace cuda::experimental +{ +template +_CCCL_HOST_API graph_node_ref +__copy_bytes_graph_impl(path_builder& __pb, ::cuda::std::span<_SrcTy> __src, ::cuda::std::span<_DstTy> __dst) +{ + static_assert(!::cuda::std::is_const_v<_DstTy>, "Copy destination can't be const"); + static_assert(::cuda::std::is_trivially_copyable_v<_SrcTy> && ::cuda::std::is_trivially_copyable_v<_DstTy>, + "Copy source and destination element types must be trivially copyable"); + + if (__src.size_bytes() > __dst.size_bytes()) + { + _CCCL_THROW(::std::invalid_argument, "Copy destination is too small to fit the source data"); + } + + if (__src.size_bytes() == 0) + { + return graph_node_ref{}; + } + + auto __deps = __pb.get_dependencies(); + ::CUgraphNodeParams __params{}; + __params.type = ::CU_GRAPH_NODE_TYPE_MEMCPY; + __params.memcpy.copyCtx = __pb.get_device().primary_context(); + __params.memcpy.copyParams.srcMemoryType = ::CU_MEMORYTYPE_UNIFIED; + __params.memcpy.copyParams.srcDevice = reinterpret_cast<::CUdeviceptr>(__src.data()); + __params.memcpy.copyParams.dstMemoryType = ::CU_MEMORYTYPE_UNIFIED; + __params.memcpy.copyParams.dstDevice = reinterpret_cast<::CUdeviceptr>(__dst.data()); + __params.memcpy.copyParams.WidthInBytes = __src.size_bytes(); + __params.memcpy.copyParams.Height = 1; + __params.memcpy.copyParams.Depth = 1; + auto __node = 
::cuda::experimental::__driver::__graphAddNode( + __pb.get_native_graph_handle(), __deps.data(), __deps.size(), &__params); + + __pb.__clear_and_set_dependency_node(__node); + return graph_node_ref{__node, __pb.get_native_graph_handle()}; +} + +template +_CCCL_HOST_API graph_node_ref __copy_bytes_graph_impl( + path_builder& __pb, + ::cuda::std::mdspan<_SrcElem, _SrcExtents, _SrcLayout, _SrcAccessor> __src, + ::cuda::std::mdspan<_DstElem, _DstExtents, _DstLayout, _DstAccessor> __dst) +{ + static_assert(::cuda::std::is_constructible_v<_DstExtents, _SrcExtents>, + "Multidimensional copy requires both source and destination extents to be compatible"); + static_assert(::cuda::std::is_same_v<_SrcLayout, _DstLayout>, + "Multidimensional copy requires both source and destination layouts to match"); + + if (!__dst.is_exhaustive()) + { + _CCCL_THROW(::std::invalid_argument, "copy_bytes supports only exhaustive mdspans"); + } + + if (__src.extents() != __dst.extents()) + { + _CCCL_THROW(::std::invalid_argument, "Copy destination size differs from the source"); + } + + return __copy_bytes_graph_impl( + __pb, + ::cuda::std::span(__src.data_handle(), __src.mapping().required_span_size()), + ::cuda::std::span(__dst.data_handle(), __dst.mapping().required_span_size())); +} +//! \brief Adds a memcpy node to a CUDA graph path that copies bytes from source to destination. 
+_CCCL_TEMPLATE(typename _SrcTy, typename _DstTy) +_CCCL_REQUIRES(::cuda::__spannable<::cuda::transformed_device_argument_t<_SrcTy>> + _CCCL_AND ::cuda::__spannable<::cuda::transformed_device_argument_t<_DstTy>>) +_CCCL_HOST_API graph_node_ref copy_bytes(path_builder& __pb, _SrcTy&& __src, _DstTy&& __dst) +{ + return __copy_bytes_graph_impl( + __pb, + ::cuda::std::span( + ::cuda::launch_transform(::cuda::stream_ref{::cuda::invalid_stream}, ::cuda::std::forward<_SrcTy>(__src))), + ::cuda::std::span( + ::cuda::launch_transform(::cuda::stream_ref{::cuda::invalid_stream}, ::cuda::std::forward<_DstTy>(__dst)))); +} + +//! \brief Adds a memcpy node for mdspan source and destination. +_CCCL_TEMPLATE(typename _SrcTy, typename _DstTy) +_CCCL_REQUIRES(::cuda::__mdspannable<::cuda::transformed_device_argument_t<_SrcTy>> + _CCCL_AND ::cuda::__mdspannable<::cuda::transformed_device_argument_t<_DstTy>>) +_CCCL_HOST_API graph_node_ref copy_bytes(path_builder& __pb, _SrcTy&& __src, _DstTy&& __dst) +{ + return __copy_bytes_graph_impl( + __pb, + ::cuda::__as_mdspan( + ::cuda::launch_transform(::cuda::stream_ref{::cuda::invalid_stream}, ::cuda::std::forward<_SrcTy>(__src))), + ::cuda::__as_mdspan( + ::cuda::launch_transform(::cuda::stream_ref{::cuda::invalid_stream}, ::cuda::std::forward<_DstTy>(__dst)))); +} +} // namespace cuda::experimental + +# include + +#endif // _CCCL_CTK_AT_LEAST(12, 2) + +#endif // _CUDAX__GRAPH_COPY_BYTES_CUH diff --git a/cudax/include/cuda/experimental/__graph/fill_bytes.cuh b/cudax/include/cuda/experimental/__graph/fill_bytes.cuh new file mode 100644 index 00000000000..ecff1853844 --- /dev/null +++ b/cudax/include/cuda/experimental/__graph/fill_bytes.cuh @@ -0,0 +1,136 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDAX__GRAPH_FILL_BYTES_CUH +#define _CUDAX__GRAPH_FILL_BYTES_CUH + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#if _CCCL_CTK_AT_LEAST(12, 2) + +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include + +# include +# include +# include +# include + +# include + +namespace cuda::experimental +{ +template +_CCCL_HOST_API graph_node_ref +__fill_bytes_graph_impl(path_builder& __pb, ::cuda::std::span<_DstTy, _DstSize> __dst, ::cuda::std::uint8_t __value) +{ + static_assert(!::cuda::std::is_const_v<_DstTy>, "Fill destination can't be const"); + static_assert(::cuda::std::is_trivially_copyable_v<_DstTy>, + "Fill destination element type must be trivially copyable"); + + auto __deps = __pb.get_dependencies(); + ::CUgraphNodeParams __params{}; + __params.type = ::CU_GRAPH_NODE_TYPE_MEMSET; + __params.memset.dst = reinterpret_cast<::CUdeviceptr>(__dst.data()); + __params.memset.pitch = __dst.size_bytes(); + __params.memset.value = __value; + __params.memset.elementSize = 1; + __params.memset.width = __dst.size_bytes(); + __params.memset.height = 1; + __params.memset.ctx = __pb.get_device().primary_context(); + auto __node = ::cuda::experimental::__driver::__graphAddNode( + __pb.get_native_graph_handle(), __deps.data(), __deps.size(), &__params); + + __pb.__clear_and_set_dependency_node(__node); + return graph_node_ref{__node, __pb.get_native_graph_handle()}; +} + +template +_CCCL_HOST_API graph_node_ref __fill_bytes_graph_impl( + path_builder& __pb, + 
::cuda::std::mdspan<_DstElem, _DstExtents, _DstLayout, _DstAccessor> __dst, + ::cuda::std::uint8_t __value) +{ + if (!__dst.is_exhaustive()) + { + _CCCL_THROW(::std::invalid_argument, "fill_bytes supports only exhaustive mdspans"); + } + + return __fill_bytes_graph_impl( + __pb, ::cuda::std::span(__dst.data_handle(), __dst.mapping().required_span_size()), __value); +} +//! \brief Adds a memset node to a CUDA graph path that bytewise-fills the destination. +//! +//! This overload is selected when the destination (after applying `launch_transform`) is +//! a contiguous range convertible to `cuda::std::span`. The element type must be trivially +//! copyable and non-const. The pointer captured in the node must remain valid until the +//! graph executes. +//! +//! \param __pb Path builder to insert the node into. +//! \param __dst Destination memory to fill. +//! \param __value Byte value to write to every byte of the destination. +//! \return A `graph_node_ref` for the newly added memset node. +//! \throws cuda::std::cuda_error if node creation fails. +_CCCL_TEMPLATE(typename _DstTy) +_CCCL_REQUIRES(::cuda::__spannable<::cuda::transformed_device_argument_t<_DstTy>>) +_CCCL_HOST_API graph_node_ref fill_bytes(path_builder& __pb, _DstTy&& __dst, ::cuda::std::uint8_t __value) +{ + return __fill_bytes_graph_impl( + __pb, + ::cuda::std::span( + ::cuda::launch_transform(::cuda::stream_ref{::cuda::invalid_stream}, ::cuda::std::forward<_DstTy>(__dst))), + __value); +} + +//! \brief Adds a memset node to a CUDA graph path that bytewise-fills the destination. +//! +//! This overload is selected when the destination (after applying `launch_transform`) is +//! a `cuda::std::mdspan`. The mdspan must be exhaustive. The element type must be trivially +//! copyable and non-const. The pointer captured in the node must remain valid until the +//! graph executes. +//! +//! \param __pb Path builder to insert the node into. +//! \param __dst Destination memory to fill. +//! 
\param __value Byte value to write to every byte of the destination. +//! \return A `graph_node_ref` for the newly added memset node. +//! \throws cuda::std::cuda_error if node creation fails. +_CCCL_TEMPLATE(typename _DstTy) +_CCCL_REQUIRES(::cuda::__mdspannable<::cuda::transformed_device_argument_t<_DstTy>>) +_CCCL_HOST_API graph_node_ref fill_bytes(path_builder& __pb, _DstTy&& __dst, ::cuda::std::uint8_t __value) +{ + return __fill_bytes_graph_impl( + __pb, + ::cuda::__as_mdspan( + ::cuda::launch_transform(::cuda::stream_ref{::cuda::invalid_stream}, ::cuda::std::forward<_DstTy>(__dst))), + __value); +} +} // namespace cuda::experimental + +# include + +#endif // _CCCL_CTK_AT_LEAST(12, 2) + +#endif // _CUDAX__GRAPH_FILL_BYTES_CUH diff --git a/cudax/include/cuda/experimental/__graph/graph_builder_ref.cuh b/cudax/include/cuda/experimental/__graph/graph_builder_ref.cuh index 4d0673e3d78..c90e6fe7550 100644 --- a/cudax/include/cuda/experimental/__graph/graph_builder_ref.cuh +++ b/cudax/include/cuda/experimental/__graph/graph_builder_ref.cuh @@ -262,16 +262,20 @@ private: //! \param __parent The parent graph to which this graph will be added. //! \return A `graph_node_ref` representing the added child graph. //! \throws cuda::std::cuda_error if `cudaGraphAddChildGraphNode` fails. +#if _CCCL_CTK_AT_LEAST(12, 2) template [[nodiscard]] _CCCL_HOST_API auto __add_to_graph(cudaGraph_t __parent, ::cuda::std::span __deps) -> graph_node_ref { + ::CUgraphNodeParams __params{}; + __params.type = ::CU_GRAPH_NODE_TYPE_GRAPH; + __params.graph.graph = __graph_; graph_node_ref __child; __child.__graph_ = __graph_; - __child.__node_ = - ::cuda::experimental::__driver::__graphAddChildGraphNode(__parent, __deps.data(), __deps.size(), __graph_); + __child.__node_ = ::cuda::experimental::__driver::__graphAddNode(__parent, __deps.data(), __deps.size(), &__params); return __child; } +#endif // _CCCL_CTK_AT_LEAST(12, 2) device_ref __dev_; //!< The device on which the graph is built. 
cudaGraph_t __graph_ = nullptr; //!< The underlying CUDA graph handle. diff --git a/cudax/include/cuda/experimental/__graph/graph_node_type.cuh b/cudax/include/cuda/experimental/__graph/graph_node_type.cuh index 0e51f70bed6..7ace27410f0 100644 --- a/cudax/include/cuda/experimental/__graph/graph_node_type.cuh +++ b/cudax/include/cuda/experimental/__graph/graph_node_type.cuh @@ -4,7 +4,7 @@ // under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. // //===----------------------------------------------------------------------===// diff --git a/cudax/include/cuda/experimental/__graph/host_launch.cuh b/cudax/include/cuda/experimental/__graph/host_launch.cuh new file mode 100644 index 00000000000..86b5b1fb43f --- /dev/null +++ b/cudax/include/cuda/experimental/__graph/host_launch.cuh @@ -0,0 +1,142 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. 
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDAX__GRAPH_HOST_LAUNCH_CUH
+#define _CUDAX__GRAPH_HOST_LAUNCH_CUH
+
+#include
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#if _CCCL_CTK_AT_LEAST(12, 2)
+
+#  include
+#  include
+#  include
+#  include
+#  include
+#  include
+#  include
+#  include
+#  include
+
+#  include
+#  include
+#  include
+
+#  include
+
+#  include
+
+namespace cuda::experimental
+{
+// Launcher for a plain no-arg function pointer.
+template <typename _FuncPtr>
+_CCCL_HOST_API inline void CUDA_CB __graph_func_ptr_launcher(void* __callable_ptr)
+{
+  reinterpret_cast<_FuncPtr>(__callable_ptr)();
+}
+
+// Launcher for a heap-allocated callable + argument pack. The graph host node callback
+// signature is void(void*), unlike the stream callback which is void(CUstream, CUresult, void*),
+// so we can't reuse ::cuda::__stream_callback_launcher here.
+template <typename _CallbackData>
+_CCCL_HOST_API inline void CUDA_CB __graph_callback_launcher(void* __data_ptr)
+{
+  auto* __data = static_cast<_CallbackData*>(__data_ptr);
+  // Copy, not move — the graph may be launched multiple times.
+  // Data lifetime is managed by a graph user object.
+  ::cuda::std::apply(__data->__callable_, __data->__args_);
+}
+
+template <typename _CallbackData>
+_CCCL_HOST_API inline void CUDA_CB __graph_callback_data_destroyer(void* __data_ptr)
+{
+  delete static_cast<_CallbackData*>(__data_ptr);
+}
+
+//! \brief Adds a host node to a CUDA graph path that invokes a callable on the host.
+//!
+//! The callable and its arguments are copied into a heap allocation whose lifetime is
+//! tied to the graph via a CUDA user object. The graph can be launched multiple times.
+//! The rules and restrictions match `cuda::host_launch`:
+//!
- The callable must not call into CUDA Runtime or Driver APIs.
+//! - It must not depend on another thread that could block on asynchronous CUDA work.
+//!
+//! Three dispatch paths (mirroring `cuda::host_launch`):
+//! 1. A bare no-arg function pointer; no allocation.
+//! 2. A `std::reference_wrapper` (no args) passes the address of the referenced object.
+//! 3. Everything else is heap-allocated with lifetime managed by a graph user object.
+//!
+//! \param __pb Path builder to insert the node into.
+//! \param __callable Callable to execute on the host.
+//! \param __args Arguments to forward to the callable.
+//! \return A `graph_node_ref` for the newly added host node.
+//! \throws cuda::std::cuda_error if node creation fails.
+template <typename _Callable, typename... _Args>
+_CCCL_HOST_API graph_node_ref host_launch(path_builder& __pb, _Callable __callable, _Args... __args)
+{
+  static_assert(::cuda::std::is_invocable_v<_Callable, _Args...>,
+                "Callable can't be called with the supplied arguments");
+  static_assert(::cuda::std::is_move_constructible_v<_Callable>, "The callable must be move constructible");
+  static_assert((::cuda::std::is_move_constructible_v<_Args> && ...),
+                "All callback arguments must be move constructible");
+
+  constexpr bool __has_args = sizeof...(_Args) > 0;
+
+  ::CUhostFn __fn = nullptr;
+  void* __user_data = nullptr;
+
+  if constexpr (!__has_args && ::cuda::std::is_pointer_v<_Callable>
+                && ::cuda::std::is_function_v<::cuda::std::remove_pointer_t<_Callable>>)
+  {
+    __fn = __graph_func_ptr_launcher<_Callable>;
+    __user_data = reinterpret_cast<void*>(__callable);
+  }
+  else if constexpr (!__has_args && ::cuda::std::__is_cuda_std_reference_wrapper_v<_Callable>)
+  {
+    __fn = ::cuda::__host_func_launcher;
+    __user_data = static_cast<void*>(::cuda::std::addressof(__callable.get()));
+  }
+  else
+  {
+    // Heap-allocate the callback data. Lifetime is tied to the graph via a user object.
+ using _CallbackData = ::cuda::__stream_callback_data<_Callable, _Args...>; + auto __data = new _CallbackData{::cuda::std::move(__callable), {::cuda::std::move(__args)...}}; + __fn = __graph_callback_launcher<_CallbackData>; + __user_data = __data; + ::cuda::experimental::__driver::__graphRetainUserObject( + __pb.get_native_graph_handle(), __data, __graph_callback_data_destroyer<_CallbackData>); + } + + auto __deps = __pb.get_dependencies(); + ::CUgraphNodeParams __params{}; + __params.type = ::CU_GRAPH_NODE_TYPE_HOST; + __params.host.fn = __fn; + __params.host.userData = __user_data; + auto __node = ::cuda::experimental::__driver::__graphAddNode( + __pb.get_native_graph_handle(), __deps.data(), __deps.size(), &__params); + + __pb.__clear_and_set_dependency_node(__node); + return graph_node_ref{__node, __pb.get_native_graph_handle()}; +} +} // namespace cuda::experimental + +# include + +#endif // _CCCL_CTK_AT_LEAST(12, 2) + +#endif // _CUDAX__GRAPH_HOST_LAUNCH_CUH diff --git a/cudax/include/cuda/experimental/__graph/path_builder.cuh b/cudax/include/cuda/experimental/__graph/path_builder.cuh index 927b349914d..8fead795057 100644 --- a/cudax/include/cuda/experimental/__graph/path_builder.cuh +++ b/cudax/include/cuda/experimental/__graph/path_builder.cuh @@ -13,6 +13,7 @@ #include +#include #include #include #include @@ -117,6 +118,49 @@ struct path_builder __nodes_.insert(__nodes_.end(), __other.__nodes_.begin(), __other.__nodes_.end()); } + //! \brief Adds an event-wait node and makes it the next dependency. + //! + //! The new node waits for \p __ev to be recorded before any successor of this node + //! executes. This mirrors `stream_ref::wait(event_ref)`. + //! + //! \param __ev Event that graph execution should wait on. + //! \return A `graph_node_ref` for the newly added event-wait node. + //! \throws cuda::std::cuda_error if `cudaGraphAddEventWaitNode` fails. 
+#if _CCCL_CTK_AT_LEAST(12, 2) + _CCCL_HOST_API graph_node_ref wait(::cuda::event_ref __ev) + { + auto __deps = get_dependencies(); + ::CUgraphNodeParams __params{}; + __params.type = ::CU_GRAPH_NODE_TYPE_WAIT_EVENT; + __params.eventWait.event = __ev.get(); + auto __node = ::cuda::experimental::__driver::__graphAddNode(__graph_, __deps.data(), __deps.size(), &__params); + + __clear_and_set_dependency_node(__node); + return graph_node_ref{__node, __graph_}; + } + + //! \brief Adds an event-record node and makes it the next dependency. + //! + //! The new node records \p __ev when it executes, signalling any downstream waiters. + //! This mirrors `stream_ref::record_event(event_flags)` but takes an existing event + //! because graph construction happens before execution. + //! + //! \param __ev Event to record. + //! \return A `graph_node_ref` for the newly added event-record node. + //! \throws cuda::std::cuda_error if `cudaGraphAddEventRecordNode` fails. + _CCCL_HOST_API graph_node_ref record_event(::cuda::event_ref __ev) + { + auto __deps = get_dependencies(); + ::CUgraphNodeParams __params{}; + __params.type = ::CU_GRAPH_NODE_TYPE_EVENT_RECORD; + __params.eventRecord.event = __ev.get(); + auto __node = ::cuda::experimental::__driver::__graphAddNode(__graph_, __deps.data(), __deps.size(), &__params); + + __clear_and_set_dependency_node(__node); + return graph_node_ref{__node, __graph_}; + } +#endif // _CCCL_CTK_AT_LEAST(12, 2) + template static constexpr bool __all_dependencies = (graph_dependency && ...); diff --git a/cudax/include/cuda/experimental/graph.cuh b/cudax/include/cuda/experimental/graph.cuh index 0b60303cafd..6055b9aaaf8 100644 --- a/cudax/include/cuda/experimental/graph.cuh +++ b/cudax/include/cuda/experimental/graph.cuh @@ -4,7 +4,7 @@ // under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. // //===----------------------------------------------------------------------===// @@ -12,12 +12,17 @@ #define __CUDAX_GRAPH_CUH // IWYU pragma: begin_exports +#include +#include +#include #include +#include #include #include #include #include #include +#include #include // IWYU pragma: end_exports diff --git a/cudax/test/CMakeLists.txt b/cudax/test/CMakeLists.txt index c5e853470c0..445e01f79ce 100644 --- a/cudax/test/CMakeLists.txt +++ b/cudax/test/CMakeLists.txt @@ -67,6 +67,7 @@ target_compile_options( cudax_add_catch2_test(test_target graph graph/graph_smoke.cu + graph/graph_node_ops_smoke.cu ) cudax_add_catch2_test(test_target stream diff --git a/cudax/test/graph/graph_node_ops_smoke.cu b/cudax/test/graph/graph_node_ops_smoke.cu new file mode 100644 index 00000000000..f19e639ef1b --- /dev/null +++ b/cudax/test/graph/graph_node_ops_smoke.cu @@ -0,0 +1,555 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include + +#if _CCCL_CTK_AT_LEAST(12, 2) + +# include +# include +# include + +# include +# include + +namespace +{ +namespace test +{ +// ─── helpers ─────────────────────────────────────────────────────────────── + +// RAII wrapper around a pinned-memory allocation of N elements of type T. 
+template +struct pinned_array +{ + _malloc_pinned mem; + std::size_t n; + + explicit pinned_array(std::size_t __n, T __init = T{}) + : mem(__n * sizeof(T)) + , n(__n) + { + for (std::size_t i = 0; i < n; ++i) + { + get()[i] = __init; + } + } + + pinned_array(const pinned_array&) = delete; + pinned_array& operator=(const pinned_array&) = delete; + + T* get() const noexcept + { + return mem.get_as(); + } + T& operator[](std::size_t i) const noexcept + { + return get()[i]; + } +}; + +// ─── kernels used in conditional tests ───────────────────────────────────── + +# if _CCCL_CTK_AT_LEAST(12, 4) +// Body kernel for while-loop conditional test: decrements a counter and +// stops the loop when it reaches zero. +struct count_down_and_stop +{ + __device__ void operator()(cudax::conditional_handle handle, int* counter) const noexcept + { + --(*counter); + if (*counter <= 0) + { + handle.disable(); + } + } +}; +# endif // _CCCL_CTK_AT_LEAST(12, 4) +} // namespace test +} // namespace + +// ──────────────────────────────────────────────────────────────────────────── +// fill_bytes +// ──────────────────────────────────────────────────────────────────────────── + +C2H_TEST("graph fill_bytes sets every byte to the requested value", "[graph][fill_bytes]") +{ + cudax::stream s{cuda::device_ref{0}}; + + constexpr std::size_t N = 64; + test::pinned_array mem{N, static_cast(0xDEADBEEF)}; + + cudax::graph_builder g; + cudax::path_builder pb = cudax::start_path(g); + + // Zero-fill via a graph memset node. 
+ cudax::fill_bytes(pb, ::cuda::std::span{mem.get(), N}, ::cuda::std::uint8_t{0}); + + auto exec = g.instantiate(); + exec.launch(s); + s.sync(); + + for (std::size_t i = 0; i < N; ++i) + { + CUDAX_REQUIRE(mem[i] == 0); + } +} + +C2H_TEST("graph fill_bytes with non-zero value", "[graph][fill_bytes]") +{ + cudax::stream s{cuda::device_ref{0}}; + + constexpr std::size_t N = 8; + test::pinned_array mem{N}; + + cudax::graph_builder g; + cudax::path_builder pb = cudax::start_path(g); + + cudax::fill_bytes(pb, ::cuda::std::span{mem.get(), N}, ::cuda::std::uint8_t{0xAB}); + + auto exec = g.instantiate(); + exec.launch(s); + s.sync(); + + for (std::size_t i = 0; i < N; ++i) + { + CUDAX_REQUIRE(mem[i] == static_cast(0xAB)); + } +} + +// ──────────────────────────────────────────────────────────────────────────── +// copy_bytes +// ──────────────────────────────────────────────────────────────────────────── + +C2H_TEST("graph copy_bytes copies data from source to destination", "[graph][copy_bytes]") +{ + cudax::stream s{cuda::device_ref{0}}; + + constexpr std::size_t N = 32; + test::pinned_array src{N}; + test::pinned_array dst{N, -1}; + + for (std::size_t i = 0; i < N; ++i) + { + src[i] = static_cast(i * 7); + } + + cudax::graph_builder g; + cudax::path_builder pb = cudax::start_path(g); + + cudax::copy_bytes(pb, ::cuda::std::span{src.get(), N}, ::cuda::std::span{dst.get(), N}); + + auto exec = g.instantiate(); + exec.launch(s); + s.sync(); + + for (std::size_t i = 0; i < N; ++i) + { + CUDAX_REQUIRE(dst[i] == src[i]); + } +} + +C2H_TEST("graph copy_bytes can be chained after fill_bytes", "[graph][fill_bytes][copy_bytes]") +{ + cudax::stream s{cuda::device_ref{0}}; + + constexpr std::size_t N = 16; + test::pinned_array src{N}; + test::pinned_array dst{N}; + + cudax::graph_builder g; + cudax::path_builder pb = cudax::start_path(g); + + // Fill source with 0xFF, then copy to destination. 
+ cudax::fill_bytes(pb, ::cuda::std::span{src.get(), N}, ::cuda::std::uint8_t{0xFF}); + cudax::copy_bytes(pb, ::cuda::std::span{src.get(), N}, ::cuda::std::span{dst.get(), N}); + + auto exec = g.instantiate(); + exec.launch(s); + s.sync(); + + for (std::size_t i = 0; i < N; ++i) + { + CUDAX_REQUIRE(dst[i] == static_cast(0xFF)); + } +} + +// ──────────────────────────────────────────────────────────────────────────── +// host_launch +// ──────────────────────────────────────────────────────────────────────────── + +C2H_TEST("graph host_launch executes a lambda callback", "[graph][host_launch]") +{ + cudax::stream s{cuda::device_ref{0}}; + // pinned so the host-side increment is visible immediately after sync + test::pinned counter{0}; + + cudax::graph_builder g; + cudax::path_builder pb = cudax::start_path(g); + + // Capture the pointer by value so the callback remains valid after graph build. + int* ptr = counter.get(); + cudax::host_launch(pb, [ptr]() { + *ptr = 42; + }); + + auto exec = g.instantiate(); + exec.launch(s); + s.sync(); + + CUDAX_REQUIRE(*counter == 42); +} + +C2H_TEST("graph host_launch with arguments", "[graph][host_launch]") +{ + cudax::stream s{cuda::device_ref{0}}; + test::pinned a{10}; + test::pinned b{20}; + test::pinned result{0}; + + cudax::graph_builder g; + cudax::path_builder pb = cudax::start_path(g); + + int* pa = a.get(); + int* pb2 = b.get(); + int* pr = result.get(); + cudax::host_launch( + pb, + [](int* x, int* y, int* r) { + *r = *x + *y; + }, + pa, + pb2, + pr); + + auto exec = g.instantiate(); + exec.launch(s); + s.sync(); + + CUDAX_REQUIRE(*result == 30); +} + +C2H_TEST("graph host_launch can be chained with kernel nodes", "[graph][host_launch]") +{ + cudax::stream s{cuda::device_ref{0}}; + test::pinned_array mem{1}; + + cudax::graph_builder g; + cudax::path_builder pb = cudax::start_path(g); + + // Kernel sets value to 42. 
+ int* ptr = mem.get(); + cudax::launch(pb, test::one_thread_dims, test::assign_42{}, ptr); + + // Host callback increments it. + cudax::host_launch(pb, [ptr]() { + *ptr += 1; + }); + + // Kernel verifies the final value is 43. + cudax::launch(pb, test::one_thread_dims, test::verify_n<43>{}, ptr); + + auto exec = g.instantiate(); + exec.launch(s); + s.sync(); + + CUDAX_REQUIRE(mem[0] == 43); +} + +C2H_TEST("graph host_launch can be launched multiple times", "[graph][host_launch]") +{ + cudax::stream s{cuda::device_ref{0}}; + test::pinned_array mem{1}; + int* ptr = mem.get(); + + cudax::graph_builder g; + cudax::path_builder pb = cudax::start_path(g); + + // Host callback increments the value each time. + cudax::host_launch(pb, [ptr]() { + *ptr += 1; + }); + + auto exec = g.instantiate(); + + // Launch 5 times — each launch should increment by 1. + for (int i = 0; i < 5; ++i) + { + exec.launch(s); + s.sync(); + CUDAX_REQUIRE(mem[0] == i + 1); + } +} + +C2H_TEST("graph host_launch data is cleaned up when graph is destroyed", "[graph][host_launch]") +{ + // Use a shared_ptr as a witness: the weak_ptr expires when all copies are gone. + auto witness = ::std::make_shared(42); + ::std::weak_ptr weak = witness; + + { + cudax::graph_builder g; + cudax::path_builder pb = cudax::start_path(g); + + // The lambda captures a copy of the shared_ptr, which gets stored in the graph's user object. + cudax::host_launch(pb, [witness]() { + (void) witness; + }); + + // Release our copy — the graph's user object should keep the shared_ptr alive. + witness.reset(); + CUDAX_REQUIRE(!weak.expired()); + } + // graph_builder destroyed — user object destructor should have deleted the callback data, + // releasing the last shared_ptr copy. 
+ CUDAX_REQUIRE(weak.expired()); +} + +// ──────────────────────────────────────────────────────────────────────────── +// event record / wait +// ──────────────────────────────────────────────────────────────────────────── + +C2H_TEST("graph record_event and wait(event_ref) impose ordering across independent paths", + "[graph][event_record][event_wait]") +{ + cudax::stream s{cuda::device_ref{0}}; + test::pinned_array mem{1}; + + cuda::event ev{cuda::device_ref{0}}; + + cudax::graph_builder g; + + // Path A: assign 42, then record the event. + int* val = mem.get(); + cudax::path_builder path_a = cudax::start_path(g); + cudax::launch(path_a, test::one_thread_dims, test::assign_42{}, val); + path_a.record_event(ev); + + // Path B (independent start): wait on the event, then verify value is 42. + cudax::path_builder path_b = cudax::start_path(g); // no deps from path_a + path_b.wait(ev); + cudax::launch(path_b, test::one_thread_dims, test::verify_42{}, val); + + // Drain both paths. + path_a.wait(path_b); + auto exec = g.instantiate(); + exec.launch(s); + s.sync(); + + CUDAX_REQUIRE(mem[0] == 42); +} + +C2H_TEST("graph record_event node has the correct node type", "[graph][event_record]") +{ + cudax::graph_builder g; + cudax::path_builder pb = cudax::start_path(g); + + cuda::event ev{cuda::device_ref{0}}; + auto node = pb.record_event(ev); + + CUDAX_REQUIRE(node.type() == cudax::graph_node_type::event_record); +} + +C2H_TEST("graph wait(event_ref) node has the correct node type", "[graph][event_wait]") +{ + cudax::graph_builder g; + cudax::path_builder pb = cudax::start_path(g); + + cuda::event ev{cuda::device_ref{0}}; + auto node = pb.wait(ev); + + CUDAX_REQUIRE(node.type() == cudax::graph_node_type::wait_event); +} + +// ──────────────────────────────────────────────────────────────────────────── +// child graph +// ──────────────────────────────────────────────────────────────────────────── + +C2H_TEST("graph insert_child_graph embeds a subgraph", 
"[graph][child_graph]") +{ + cudax::stream s{cuda::device_ref{0}}; + test::pinned_array mem{1}; + int* val = mem.get(); + + // Build the child graph: kernel that assigns 42. + cudax::graph_builder child_g; + { + cudax::path_builder child_pb = cudax::start_path(child_g); + cudax::launch(child_pb, test::one_thread_dims, test::assign_42{}, val); + } + + // Build the parent graph: embed the child, then verify. + cudax::graph_builder parent_g; + cudax::path_builder pb = cudax::start_path(parent_g); + + cudax::insert_child_graph(pb, child_g); + cudax::launch(pb, test::one_thread_dims, test::verify_42{}, val); + + auto exec = parent_g.instantiate(); + exec.launch(s); + s.sync(); + + CUDAX_REQUIRE(mem[0] == 42); +} + +# if _CCCL_CTK_AT_LEAST(12, 9) +C2H_TEST("graph insert_child_graph with ownership transfer", "[graph][child_graph]") +{ + cudax::stream s{cuda::device_ref{0}}; + test::pinned_array mem{1}; + int* val = mem.get(); + + cudax::graph_builder child_g; + { + cudax::path_builder child_pb = cudax::start_path(child_g); + cudax::launch(child_pb, test::one_thread_dims, test::assign_42{}, val); + } + + cudax::graph_builder parent_g; + cudax::path_builder pb = cudax::start_path(parent_g); + + // Move the child graph into the parent — child_g is null afterwards. 
+ cudax::insert_child_graph(pb, std::move(child_g)); + CUDAX_REQUIRE(child_g.get() == nullptr); + + cudax::launch(pb, test::one_thread_dims, test::verify_42{}, val); + + auto exec = parent_g.instantiate(); + exec.launch(s); + s.sync(); + + CUDAX_REQUIRE(mem[0] == 42); +} +# endif // _CCCL_CTK_AT_LEAST(12, 9) + +C2H_TEST("graph insert_child_graph node has the correct node type", "[graph][child_graph]") +{ + cudax::graph_builder child_g; + { + cudax::path_builder child_pb = cudax::start_path(child_g); + cudax::launch(child_pb, test::one_thread_dims, test::empty_kernel{}); + } + + cudax::graph_builder parent_g; + cudax::path_builder pb = cudax::start_path(parent_g); + + auto node = cudax::insert_child_graph(pb, child_g); + + CUDAX_REQUIRE(node.type() == cudax::graph_node_type::graph); +} + +// ──────────────────────────────────────────────────────────────────────────── +// conditional nodes (if / while) +// ──────────────────────────────────────────────────────────────────────────── + +# if _CCCL_CTK_AT_LEAST(12, 4) + +C2H_TEST("graph make_if_node body executes when handle is non-zero", "[graph][conditional][if_node]") +{ + cudax::stream s{cuda::device_ref{0}}; + test::pinned_array mem{1}; + int* val = mem.get(); + + cudax::graph_builder g; + cudax::path_builder pb = cudax::start_path(g); + + // Default value 1 → body executes. + auto [cond_node, body_graph, handle] = cudax::make_if_node(pb, /*__default_val=*/1u); + + // Populate the body graph: assign 42 to val. 
+ { + cudax::path_builder body_pb = cudax::start_path(body_graph); + cudax::launch(body_pb, test::one_thread_dims, test::assign_42{}, val); + } + + auto exec = g.instantiate(); + exec.launch(s); + s.sync(); + + CUDAX_REQUIRE(mem[0] == 42); +} + +C2H_TEST("graph make_if_node body is skipped when handle is zero", "[graph][conditional][if_node]") +{ + cudax::stream s{cuda::device_ref{0}}; + test::pinned_array mem{1}; + int* val = mem.get(); + + cudax::graph_builder g; + cudax::path_builder pb = cudax::start_path(g); + + // Default value 0 → body is skipped. + auto [cond_node, body_graph, handle] = cudax::make_if_node(pb, /*__default_val=*/0u); + + { + cudax::path_builder body_pb = cudax::start_path(body_graph); + cudax::launch(body_pb, test::one_thread_dims, test::assign_42{}, val); + } + + auto exec = g.instantiate(); + exec.launch(s); + s.sync(); + + // val should remain 0 because the body was skipped. + CUDAX_REQUIRE(mem[0] == 0); +} + +C2H_TEST("graph make_while_node body executes the expected number of times", "[graph][conditional][while_node]") +{ + cudax::stream s{cuda::device_ref{0}}; + test::pinned_array mem{1, 5}; // will be decremented to 0 + + cudax::graph_builder g; + cudax::path_builder pb = cudax::start_path(g); + + // Default value 1 → loop runs as long as counter > 0. + auto [while_node, body_graph, handle] = cudax::make_while_node(pb); + + // Body: decrement counter and stop when done. + { + cudax::path_builder body_pb = cudax::start_path(body_graph); + cudax::launch(body_pb, test::one_thread_dims, test::count_down_and_stop{}, handle, mem.get()); + } + + auto exec = g.instantiate(); + exec.launch(s); + s.sync(); + + CUDAX_REQUIRE(mem[0] == 0); +} + +C2H_TEST("graph make_if_node with pre-constructed handle", "[graph][conditional][if_node]") +{ + cudax::stream s{cuda::device_ref{0}}; + test::pinned_array mem{1}; + int* val = mem.get(); + + cudax::graph_builder g; + cudax::path_builder pb = cudax::start_path(g); + + // User constructs handle directly. 
+ cudax::conditional_handle my_handle{g, 1u}; + auto [cond_node, body_graph, handle] = cudax::make_if_node(pb, my_handle); + + { + cudax::path_builder body_pb = cudax::start_path(body_graph); + cudax::launch(body_pb, test::one_thread_dims, test::assign_42{}, val); + } + + auto exec = g.instantiate(); + exec.launch(s); + s.sync(); + + CUDAX_REQUIRE(mem[0] == 42); +} + +# endif // _CCCL_CTK_AT_LEAST(12, 4) + +#endif // _CCCL_CTK_AT_LEAST(12, 2) diff --git a/libcudacxx/include/cuda/__device/device_ref.h b/libcudacxx/include/cuda/__device/device_ref.h index 70ead5ef77b..1aecd3bfe08 100644 --- a/libcudacxx/include/cuda/__device/device_ref.h +++ b/libcudacxx/include/cuda/__device/device_ref.h @@ -4,7 +4,7 @@ // under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. // //===----------------------------------------------------------------------===// @@ -128,6 +128,12 @@ class device_ref //! @brief Initializes the primary context of the device. _CCCL_HOST_API void init() const; // implemented in to avoid circular dependency + //! @brief Retrieve the primary context of this device. + //! + //! @return The primary CUDA context for this device. + [[nodiscard]] _CCCL_HOST_API ::CUcontext primary_context() const; // implemented in + // to avoid circular dependency + //! @brief Retrieve the name of this device. //! //! @return String view containing the name of this device. diff --git a/libcudacxx/include/cuda/__device/physical_device.h b/libcudacxx/include/cuda/__device/physical_device.h index b827294fbb6..d45a8019c8a 100644 --- a/libcudacxx/include/cuda/__device/physical_device.h +++ b/libcudacxx/include/cuda/__device/physical_device.h @@ -4,7 +4,7 @@ // under the Apache License v2.0 with LLVM Exceptions. 
// See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. // //===----------------------------------------------------------------------===// @@ -159,6 +159,11 @@ _CCCL_HOST_API inline void device_ref::init() const (void) ::cuda::__physical_devices()[__id_].__primary_context(); } +[[nodiscard]] _CCCL_HOST_API inline ::CUcontext device_ref::primary_context() const +{ + return ::cuda::__physical_devices()[__id_].__primary_context(); +} + [[nodiscard]] _CCCL_HOST_API inline ::cuda::std::string_view device_ref::name() const { return ::cuda::__physical_devices()[__id_].__name();