diff --git a/cudax/include/cuda/experimental/__driver/driver_api.cuh b/cudax/include/cuda/experimental/__driver/driver_api.cuh index 13973ef9a49..0a37858aa3b 100644 --- a/cudax/include/cuda/experimental/__driver/driver_api.cuh +++ b/cudax/include/cuda/experimental/__driver/driver_api.cuh @@ -36,145 +36,59 @@ // #define'd version aliases in cuda.h (e.g. #define cuFoo cuFoo_v2). // The ## operator suppresses macro expansion of the function name, so this is // safe even for names that are #define'd to versioned variants. -# define _CUDAX_GET_DRIVER_FUNCTION(pfn_name, major, minor) \ - reinterpret_cast( \ +# define _CUDAX_GET_DRIVER_FUNCTION(pfn_name, major, minor) \ + reinterpret_cast<::PFN_##pfn_name##_v##major##0##minor##0>( \ ::cuda::__driver::__get_driver_entry_point(#pfn_name, major, minor)) namespace cuda::experimental::__driver { -// ── Graph: memset node ────────────────────────────────────────────────────── +// ── Graph: polymorphic add node ───────────────────────────────────────────── -[[nodiscard]] _CCCL_HOST_API inline ::CUgraphNode __graphAddMemsetNode( - ::CUgraph __graph, - const ::CUgraphNode* __deps, - ::cuda::std::size_t __ndeps, - ::CUdeviceptr __dst, - ::cuda::std::size_t __pitch, - unsigned int __value, - unsigned int __element_size, - ::cuda::std::size_t __width, - ::cuda::std::size_t __height) -{ - static auto __driver_fn = _CUDAX_GET_DRIVER_FUNCTION(cuGraphAddMemsetNode, 10, 0); - ::CUgraphNode __node{}; - ::CUDA_MEMSET_NODE_PARAMS __params{}; - __params.dst = __dst; - __params.pitch = __pitch; - __params.value = __value; - __params.elementSize = __element_size; - __params.width = __width; - __params.height = __height; - ::CUcontext __ctx = ::cuda::__driver::__ctxGetCurrent(); - ::cuda::__driver::__call_driver_fn( - __driver_fn, "Failed to add a memset node to graph", &__node, __graph, __deps, __ndeps, &__params, __ctx); - return __node; -} - -// ── Graph: memcpy node (1-D) ──────────────────────────────────────────────── +# if 
_CCCL_CTK_AT_LEAST(12, 2) -[[nodiscard]] _CCCL_HOST_API inline ::CUgraphNode __graphAddMemcpyNode1D( - ::CUgraph __graph, - const ::CUgraphNode* __deps, - ::cuda::std::size_t __ndeps, - ::CUdeviceptr __dst, - ::CUdeviceptr __src, - ::cuda::std::size_t __byte_count) +[[nodiscard]] _CCCL_HOST_API inline ::CUgraphNode __graphAddNode( + ::CUgraph __graph, const ::CUgraphNode* __deps, ::cuda::std::size_t __ndeps, ::CUgraphNodeParams* __params) { - static auto __driver_fn = _CUDAX_GET_DRIVER_FUNCTION(cuGraphAddMemcpyNode, 10, 0); + static auto __driver_fn = _CUDAX_GET_DRIVER_FUNCTION(cuGraphAddNode, 12, 2); ::CUgraphNode __node{}; - ::CUDA_MEMCPY3D __params{}; - __params.srcMemoryType = ::CU_MEMORYTYPE_UNIFIED; - __params.srcDevice = __src; - __params.dstMemoryType = ::CU_MEMORYTYPE_UNIFIED; - __params.dstDevice = __dst; - __params.WidthInBytes = __byte_count; - __params.Height = 1; - __params.Depth = 1; - ::CUcontext __ctx = ::cuda::__driver::__ctxGetCurrent(); ::cuda::__driver::__call_driver_fn( - __driver_fn, "Failed to add a memcpy node to graph", &__node, __graph, __deps, __ndeps, &__params, __ctx); + __driver_fn, "Failed to add a node to graph", &__node, __graph, __deps, __ndeps, __params); return __node; } -// ── Graph: host node ──────────────────────────────────────────────────────── - -[[nodiscard]] _CCCL_HOST_API inline ::CUgraphNode __graphAddHostNode( - ::CUgraph __graph, const ::CUgraphNode* __deps, ::cuda::std::size_t __ndeps, ::CUhostFn __fn, void* __user_data) -{ - static auto __driver_fn = _CUDAX_GET_DRIVER_FUNCTION(cuGraphAddHostNode, 10, 0); - ::CUgraphNode __node{}; - ::CUDA_HOST_NODE_PARAMS __params{}; - __params.fn = __fn; - __params.userData = __user_data; - ::cuda::__driver::__call_driver_fn( - __driver_fn, "Failed to add a host node to graph", &__node, __graph, __deps, __ndeps, &__params); - return __node; -} +# endif // _CCCL_CTK_AT_LEAST(12, 2) -// ── Graph: child graph node ───────────────────────────────────────────────── +// ── Graph: user 
object (ref-counted data lifetime tied to graph) ───────────── -[[nodiscard]] _CCCL_HOST_API inline ::CUgraphNode __graphAddChildGraphNode( - ::CUgraph __graph, const ::CUgraphNode* __deps, ::cuda::std::size_t __ndeps, ::CUgraph __child_graph) +_CCCL_HOST_API inline void __graphRetainUserObject(::CUgraph __graph, void* __ptr, ::CUhostFn __destroy) { - static auto __driver_fn = _CUDAX_GET_DRIVER_FUNCTION(cuGraphAddChildGraphNode, 10, 0); - ::CUgraphNode __node{}; - ::cuda::__driver::__call_driver_fn( - __driver_fn, "Failed to add a child graph node", &__node, __graph, __deps, __ndeps, __child_graph); - return __node; -} - -// ── Graph: event record node ──────────────────────────────────────────────── + static auto __create_fn = _CUDAX_GET_DRIVER_FUNCTION(cuUserObjectCreate, 11, 3); + static auto __retain_fn = _CUDAX_GET_DRIVER_FUNCTION(cuGraphRetainUserObject, 11, 3); -[[nodiscard]] _CCCL_HOST_API inline ::CUgraphNode -__graphAddEventRecordNode(::CUgraph __graph, const ::CUgraphNode* __deps, ::cuda::std::size_t __ndeps, ::CUevent __ev) -{ - static auto __driver_fn = _CUDAX_GET_DRIVER_FUNCTION(cuGraphAddEventRecordNode, 11, 1); - ::CUgraphNode __node{}; + ::CUuserObject __obj{}; ::cuda::__driver::__call_driver_fn( - __driver_fn, "Failed to add an event record node to graph", &__node, __graph, __deps, __ndeps, __ev); - return __node; -} - -// ── Graph: event wait node ────────────────────────────────────────────────── - -[[nodiscard]] _CCCL_HOST_API inline ::CUgraphNode -__graphAddEventWaitNode(::CUgraph __graph, const ::CUgraphNode* __deps, ::cuda::std::size_t __ndeps, ::CUevent __ev) -{ - static auto __driver_fn = _CUDAX_GET_DRIVER_FUNCTION(cuGraphAddEventWaitNode, 11, 1); - ::CUgraphNode __node{}; + __create_fn, "Failed to create user object", &__obj, __ptr, __destroy, 1u, ::CU_USER_OBJECT_NO_DESTRUCTOR_SYNC); + // CU_GRAPH_USER_OBJECT_MOVE transfers our reference to the graph without incrementing. 
+ // After this call, the graph owns the sole reference — do not release. ::cuda::__driver::__call_driver_fn( - __driver_fn, "Failed to add an event wait node to graph", &__node, __graph, __deps, __ndeps, __ev); - return __node; + __retain_fn, "Failed to retain user object on graph", __graph, __obj, 1u, ::CU_GRAPH_USER_OBJECT_MOVE); } // ── Graph: conditional handle ─────────────────────────────────────────────── -# if _CCCL_CTK_AT_LEAST(12, 4) && _CCCL_CTK_BELOW(13, 0) +# if _CCCL_CTK_AT_LEAST(12, 4) [[nodiscard]] _CCCL_HOST_API inline ::CUgraphConditionalHandle -__graphConditionalHandleCreate(::CUgraph __graph, unsigned int __default_val, unsigned int __flags) +__graphConditionalHandleCreate(::CUgraph __graph, ::CUcontext __ctx, unsigned int __default_val, unsigned int __flags) { static auto __driver_fn = _CUDAX_GET_DRIVER_FUNCTION(cuGraphConditionalHandleCreate, 12, 3); ::CUgraphConditionalHandle __handle{}; - ::CUcontext __ctx = ::cuda::__driver::__ctxGetCurrent(); ::cuda::__driver::__call_driver_fn( __driver_fn, "Failed to create a conditional handle", &__handle, __graph, __ctx, __default_val, __flags); return __handle; } -// ── Graph: generic add node (used for conditional nodes) ──────────────────── - -[[nodiscard]] _CCCL_HOST_API inline ::CUgraphNode __graphAddNode( - ::CUgraph __graph, const ::CUgraphNode* __deps, ::cuda::std::size_t __ndeps, ::CUgraphNodeParams* __params) -{ - static auto __driver_fn = _CUDAX_GET_DRIVER_FUNCTION(cuGraphAddNode, 12, 2); - ::CUgraphNode __node{}; - ::cuda::__driver::__call_driver_fn( - __driver_fn, "Failed to add a node to graph", &__node, __graph, __deps, __ndeps, __params); - return __node; -} - -# endif // _CCCL_CTK_AT_LEAST(12, 4) && _CCCL_CTK_BELOW(13, 0) +# endif // _CCCL_CTK_AT_LEAST(12, 4) // ── Graph: create ─────────────────────────────────────────────────────────── diff --git a/cudax/include/cuda/experimental/__graph/child_graph.cuh b/cudax/include/cuda/experimental/__graph/child_graph.cuh new file mode 100644 
index 00000000000..722e6b0c78c --- /dev/null +++ b/cudax/include/cuda/experimental/__graph/child_graph.cuh @@ -0,0 +1,92 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDAX__GRAPH_CHILD_GRAPH_CUH +#define _CUDAX__GRAPH_CHILD_GRAPH_CUH + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#if _CCCL_CTK_AT_LEAST(12, 2) + +# include +# include +# include +# include +# include + +# include + +namespace cuda::experimental +{ +//! \brief Adds a child graph node to a CUDA graph path. +//! +//! The entire subgraph described by \p __child is embedded as a single node in the parent +//! graph. All nodes in the child graph execute before any successor of the new child-graph +//! node. +//! +//! \param __pb Path builder to insert the node into. +//! \param __child A `graph_builder_ref` whose underlying graph will become the child. +//! \return A `graph_node_ref` for the newly added child-graph node. +//! \throws cuda::std::cuda_error if node creation fails. 
+_CCCL_HOST_API inline graph_node_ref insert_child_graph(path_builder& __pb, graph_builder_ref __child) +{ + auto __deps = __pb.get_dependencies(); + ::CUgraphNodeParams __params{}; + __params.type = ::CU_GRAPH_NODE_TYPE_GRAPH; + __params.graph.graph = __child.get(); + auto __node = ::cuda::experimental::__driver::__graphAddNode( + __pb.get_native_graph_handle(), __deps.data(), __deps.size(), &__params); + + __pb.__clear_and_set_dependency_node(__node); + return graph_node_ref{__node, __pb.get_native_graph_handle()}; +} + +# if _CCCL_CTK_AT_LEAST(12, 9) +//! \brief Adds a child graph node to a CUDA graph path, transferring ownership. +//! +//! The child graph is moved into the parent graph node. After this call, the +//! `graph_builder` is left in a null state and the parent graph owns the child's +//! lifetime. +//! +//! \param __pb Path builder to insert the node into. +//! \param __child An rvalue `graph_builder` whose graph will be moved into the parent. +//! \return A `graph_node_ref` for the newly added child-graph node. +//! \throws cuda::std::cuda_error if node creation fails. 
+_CCCL_HOST_API inline graph_node_ref insert_child_graph(path_builder& __pb, graph_builder&& __child) +{ + auto __deps = __pb.get_dependencies(); + ::CUgraphNodeParams __params{}; + __params.type = ::CU_GRAPH_NODE_TYPE_GRAPH; + __params.graph.graph = __child.get(); + __params.graph.ownership = ::CU_GRAPH_CHILD_GRAPH_OWNERSHIP_MOVE; + auto __node = ::cuda::experimental::__driver::__graphAddNode( + __pb.get_native_graph_handle(), __deps.data(), __deps.size(), &__params); + + (void) __child.release(); + + __pb.__clear_and_set_dependency_node(__node); + return graph_node_ref{__node, __pb.get_native_graph_handle()}; +} +# endif // _CCCL_CTK_AT_LEAST(12, 9) +} // namespace cuda::experimental + +# include + +#endif // _CCCL_CTK_AT_LEAST(12, 2) + +#endif // _CUDAX__GRAPH_CHILD_GRAPH_CUH diff --git a/cudax/include/cuda/experimental/__graph/conditional_node.cuh b/cudax/include/cuda/experimental/__graph/conditional_node.cuh new file mode 100644 index 00000000000..25da74d359d --- /dev/null +++ b/cudax/include/cuda/experimental/__graph/conditional_node.cuh @@ -0,0 +1,187 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _CUDAX__GRAPH_CONDITIONAL_NODE_CUH +#define _CUDAX__GRAPH_CONDITIONAL_NODE_CUH + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#if _CCCL_CTK_AT_LEAST(12, 4) + +# include +# include +# include +# include + +# include + +namespace cuda::experimental +{ +//! \brief A thin, non-owning wrapper around a `cudaGraphConditionalHandle`. +//! +//! A conditional handle is a graph-scoped token whose value at runtime controls whether +//! the body of an `if` or `while` conditional node executes. The handle is owned by +//! the graph — there is no destroy API — so this wrapper is trivially copyable and +//! safe to pass by value into device kernels. +//! +//! Users can construct a handle directly, or let `make_if_node` / `make_while_node` +//! create one automatically. +//! +//! \rst +//! .. _cudax-graph-conditional-handle: +//! \endrst +struct conditional_handle +{ + //! \brief Creates a conditional handle for the given graph. + //! + //! \param __graph Graph in which the conditional node will be inserted. + //! \param __default_val Initial value of the handle (1 = execute body, 0 = skip). + //! \throws cuda::std::cuda_error if `cudaGraphConditionalHandleCreate` fails. + _CCCL_HOST_API explicit conditional_handle(graph_builder_ref __graph, unsigned int __default_val = 1) + : __handle_(::cuda::experimental::__driver::__graphConditionalHandleCreate( + __graph.get(), __graph.get_device().primary_context(), __default_val, ::cudaGraphCondAssignDefault)) + {} + + //! \brief Sets the runtime value of the conditional handle from device code. + //! + //! \param __value Non-zero to execute the body, zero to skip. 
+ _CCCL_DEVICE void set_value(unsigned int __value) const noexcept + { + ::cudaGraphSetConditional(__handle_, __value); + } + + //! \brief Convenience: enables execution of the conditional body (sets the handle to 1). + _CCCL_DEVICE void enable() const noexcept + { + set_value(1u); + } + + //! \brief Convenience: disables execution of the conditional body (sets the handle to 0). + _CCCL_DEVICE void disable() const noexcept + { + set_value(0u); + } + + //! \brief Returns the underlying `cudaGraphConditionalHandle`. + [[nodiscard]] _CCCL_NODEBUG_HOST_API ::cudaGraphConditionalHandle get() const noexcept + { + return __handle_; + } + +private: + ::cudaGraphConditionalHandle __handle_{}; +}; + +//! \brief Result of adding a conditional node. +//! +//! Contains the newly created conditional node, the body graph that should be +//! populated by the caller, and the conditional handle to pass into body kernels. +struct conditional_node_result +{ + graph_node_ref node; //!< The conditional node in the parent graph. + graph_builder_ref body_graph; //!< The body graph to populate with operations. + conditional_handle handle; //!< The handle to control execution from device code. 
+}; + +_CCCL_HOST_API inline conditional_node_result +__make_conditional_node(path_builder& __pb, conditional_handle __handle, ::CUgraphConditionalNodeType __type) +{ + auto __deps = __pb.get_dependencies(); + + ::CUgraphNodeParams __params{}; + __params.type = ::CU_GRAPH_NODE_TYPE_CONDITIONAL; + __params.conditional.handle = __handle.get(); + __params.conditional.type = __type; + __params.conditional.size = 1; + __params.conditional.ctx = __pb.get_device().primary_context(); + + auto __node = ::cuda::experimental::__driver::__graphAddNode( + __pb.get_native_graph_handle(), __deps.data(), __deps.size(), &__params); + + __pb.__clear_and_set_dependency_node(__node); + + return {graph_node_ref{__node, __pb.get_native_graph_handle()}, + graph_builder_ref{__params.conditional.phGraph_out[0], __pb.get_device()}, + __handle}; +} +//! \brief Adds an `if`-conditional node to a CUDA graph path. +//! +//! At runtime, if the value of the handle is non-zero the body graph executes once; +//! otherwise it is skipped entirely. +//! +//! The caller must populate the returned `body_graph` with all operations that should +//! run conditionally before the parent graph is instantiated. +//! +//! \param __pb Path builder to insert the node into. +//! \param __default_val Initial handle value (1 = execute, 0 = skip). Ignored when +//! \p __handle is provided. +//! \return A `conditional_node_result` containing the node ref, body graph, and handle. +//! \throws cuda::std::cuda_error if node creation fails. +_CCCL_HOST_API inline conditional_node_result make_if_node(path_builder& __pb, unsigned int __default_val = 1) +{ + conditional_handle __handle{__pb.get_graph(), __default_val}; + return __make_conditional_node(__pb, __handle, ::CU_GRAPH_COND_TYPE_IF); +} + +//! \brief Adds an `if`-conditional node reusing an existing conditional handle. +//! +//! \param __pb Path builder to insert the node into. +//! \param __handle An existing conditional handle (e.g. shared with another node). +//! 
\return A `conditional_node_result` containing the node ref, body graph, and handle. +//! \throws cuda::std::cuda_error if node creation fails. +_CCCL_HOST_API inline conditional_node_result make_if_node(path_builder& __pb, conditional_handle __handle) +{ + return __make_conditional_node(__pb, __handle, ::CU_GRAPH_COND_TYPE_IF); +} + +//! \brief Adds a `while`-conditional node to a CUDA graph path. +//! +//! At runtime, the body graph is executed repeatedly as long as the handle value +//! is non-zero at the start of each iteration (including the first). +//! +//! The caller must populate the returned `body_graph` before instantiating the parent +//! graph. The body is responsible for calling `handle.set_value(0)` or `handle.disable()` +//! to terminate the loop. +//! +//! \param __pb Path builder to insert the node into. +//! \param __default_val Initial handle value (1 = enter loop, 0 = skip). +//! \return A `conditional_node_result` containing the node ref, body graph, and handle. +//! \throws cuda::std::cuda_error if node creation fails. +_CCCL_HOST_API inline conditional_node_result make_while_node(path_builder& __pb, unsigned int __default_val = 1) +{ + conditional_handle __handle{__pb.get_graph(), __default_val}; + return __make_conditional_node(__pb, __handle, ::CU_GRAPH_COND_TYPE_WHILE); +} + +//! \brief Adds a `while`-conditional node reusing an existing conditional handle. +//! +//! \param __pb Path builder to insert the node into. +//! \param __handle An existing conditional handle (e.g. shared with another node). +//! \return A `conditional_node_result` containing the node ref, body graph, and handle. +//! \throws cuda::std::cuda_error if node creation fails. 
+_CCCL_HOST_API inline conditional_node_result make_while_node(path_builder& __pb, conditional_handle __handle) +{ + return __make_conditional_node(__pb, __handle, ::CU_GRAPH_COND_TYPE_WHILE); +} +} // namespace cuda::experimental + +# include + +#endif // _CCCL_CTK_AT_LEAST(12, 4) + +#endif // _CUDAX__GRAPH_CONDITIONAL_NODE_CUH diff --git a/cudax/include/cuda/experimental/__graph/copy_bytes.cuh b/cudax/include/cuda/experimental/__graph/copy_bytes.cuh new file mode 100644 index 00000000000..35ed8dde0ff --- /dev/null +++ b/cudax/include/cuda/experimental/__graph/copy_bytes.cuh @@ -0,0 +1,147 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _CUDAX__GRAPH_COPY_BYTES_CUH +#define _CUDAX__GRAPH_COPY_BYTES_CUH + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#if _CCCL_CTK_AT_LEAST(12, 2) + +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include + +# include +# include +# include + +# include + +namespace cuda::experimental +{ +template +_CCCL_HOST_API graph_node_ref +__copy_bytes_graph_impl(path_builder& __pb, ::cuda::std::span<_SrcTy> __src, ::cuda::std::span<_DstTy> __dst) +{ + static_assert(!::cuda::std::is_const_v<_DstTy>, "Copy destination can't be const"); + static_assert(::cuda::std::is_trivially_copyable_v<_SrcTy> && ::cuda::std::is_trivially_copyable_v<_DstTy>, + "Copy source and destination element types must be trivially copyable"); + + if (__src.size_bytes() > __dst.size_bytes()) + { + _CCCL_THROW(::std::invalid_argument, "Copy destination is too small to fit the source data"); + } + + if (__src.size_bytes() == 0) + { + return graph_node_ref{}; + } + + auto __deps = __pb.get_dependencies(); + ::CUgraphNodeParams __params{}; + __params.type = ::CU_GRAPH_NODE_TYPE_MEMCPY; + __params.memcpy.copyCtx = __pb.get_device().primary_context(); + __params.memcpy.copyParams.srcMemoryType = ::CU_MEMORYTYPE_UNIFIED; + __params.memcpy.copyParams.srcDevice = reinterpret_cast<::CUdeviceptr>(__src.data()); + __params.memcpy.copyParams.dstMemoryType = ::CU_MEMORYTYPE_UNIFIED; + __params.memcpy.copyParams.dstDevice = reinterpret_cast<::CUdeviceptr>(__dst.data()); + __params.memcpy.copyParams.WidthInBytes = __src.size_bytes(); + __params.memcpy.copyParams.Height = 1; + __params.memcpy.copyParams.Depth = 1; + auto __node = 
::cuda::experimental::__driver::__graphAddNode( + __pb.get_native_graph_handle(), __deps.data(), __deps.size(), &__params); + + __pb.__clear_and_set_dependency_node(__node); + return graph_node_ref{__node, __pb.get_native_graph_handle()}; +} + +template +_CCCL_HOST_API graph_node_ref __copy_bytes_graph_impl( + path_builder& __pb, + ::cuda::std::mdspan<_SrcElem, _SrcExtents, _SrcLayout, _SrcAccessor> __src, + ::cuda::std::mdspan<_DstElem, _DstExtents, _DstLayout, _DstAccessor> __dst) +{ + static_assert(::cuda::std::is_constructible_v<_DstExtents, _SrcExtents>, + "Multidimensional copy requires both source and destination extents to be compatible"); + static_assert(::cuda::std::is_same_v<_SrcLayout, _DstLayout>, + "Multidimensional copy requires both source and destination layouts to match"); + + if (!__dst.is_exhaustive()) + { + _CCCL_THROW(::std::invalid_argument, "copy_bytes supports only exhaustive mdspans"); + } + + if (__src.extents() != __dst.extents()) + { + _CCCL_THROW(::std::invalid_argument, "Copy destination size differs from the source"); + } + + return __copy_bytes_graph_impl( + __pb, + ::cuda::std::span(__src.data_handle(), __src.mapping().required_span_size()), + ::cuda::std::span(__dst.data_handle(), __dst.mapping().required_span_size())); +} +//! \brief Adds a memcpy node to a CUDA graph path that copies bytes from source to destination. 
+_CCCL_TEMPLATE(typename _SrcTy, typename _DstTy) +_CCCL_REQUIRES(::cuda::__spannable<::cuda::transformed_device_argument_t<_SrcTy>> + _CCCL_AND ::cuda::__spannable<::cuda::transformed_device_argument_t<_DstTy>>) +_CCCL_HOST_API graph_node_ref copy_bytes(path_builder& __pb, _SrcTy&& __src, _DstTy&& __dst) +{ + return __copy_bytes_graph_impl( + __pb, + ::cuda::std::span( + ::cuda::launch_transform(::cuda::stream_ref{::cuda::invalid_stream}, ::cuda::std::forward<_SrcTy>(__src))), + ::cuda::std::span( + ::cuda::launch_transform(::cuda::stream_ref{::cuda::invalid_stream}, ::cuda::std::forward<_DstTy>(__dst)))); +} + +//! \brief Adds a memcpy node for mdspan source and destination. +_CCCL_TEMPLATE(typename _SrcTy, typename _DstTy) +_CCCL_REQUIRES(::cuda::__mdspannable<::cuda::transformed_device_argument_t<_SrcTy>> + _CCCL_AND ::cuda::__mdspannable<::cuda::transformed_device_argument_t<_DstTy>>) +_CCCL_HOST_API graph_node_ref copy_bytes(path_builder& __pb, _SrcTy&& __src, _DstTy&& __dst) +{ + return __copy_bytes_graph_impl( + __pb, + ::cuda::__as_mdspan( + ::cuda::launch_transform(::cuda::stream_ref{::cuda::invalid_stream}, ::cuda::std::forward<_SrcTy>(__src))), + ::cuda::__as_mdspan( + ::cuda::launch_transform(::cuda::stream_ref{::cuda::invalid_stream}, ::cuda::std::forward<_DstTy>(__dst)))); +} +} // namespace cuda::experimental + +# include + +#endif // _CCCL_CTK_AT_LEAST(12, 2) + +#endif // _CUDAX__GRAPH_COPY_BYTES_CUH diff --git a/cudax/include/cuda/experimental/__graph/fill_bytes.cuh b/cudax/include/cuda/experimental/__graph/fill_bytes.cuh new file mode 100644 index 00000000000..ecff1853844 --- /dev/null +++ b/cudax/include/cuda/experimental/__graph/fill_bytes.cuh @@ -0,0 +1,136 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDAX__GRAPH_FILL_BYTES_CUH +#define _CUDAX__GRAPH_FILL_BYTES_CUH + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#if _CCCL_CTK_AT_LEAST(12, 2) + +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include + +# include +# include +# include +# include + +# include + +namespace cuda::experimental +{ +template +_CCCL_HOST_API graph_node_ref +__fill_bytes_graph_impl(path_builder& __pb, ::cuda::std::span<_DstTy, _DstSize> __dst, ::cuda::std::uint8_t __value) +{ + static_assert(!::cuda::std::is_const_v<_DstTy>, "Fill destination can't be const"); + static_assert(::cuda::std::is_trivially_copyable_v<_DstTy>, + "Fill destination element type must be trivially copyable"); + + auto __deps = __pb.get_dependencies(); + ::CUgraphNodeParams __params{}; + __params.type = ::CU_GRAPH_NODE_TYPE_MEMSET; + __params.memset.dst = reinterpret_cast<::CUdeviceptr>(__dst.data()); + __params.memset.pitch = __dst.size_bytes(); + __params.memset.value = __value; + __params.memset.elementSize = 1; + __params.memset.width = __dst.size_bytes(); + __params.memset.height = 1; + __params.memset.ctx = __pb.get_device().primary_context(); + auto __node = ::cuda::experimental::__driver::__graphAddNode( + __pb.get_native_graph_handle(), __deps.data(), __deps.size(), &__params); + + __pb.__clear_and_set_dependency_node(__node); + return graph_node_ref{__node, __pb.get_native_graph_handle()}; +} + +template +_CCCL_HOST_API graph_node_ref __fill_bytes_graph_impl( + path_builder& __pb, + 
::cuda::std::mdspan<_DstElem, _DstExtents, _DstLayout, _DstAccessor> __dst, + ::cuda::std::uint8_t __value) +{ + if (!__dst.is_exhaustive()) + { + _CCCL_THROW(::std::invalid_argument, "fill_bytes supports only exhaustive mdspans"); + } + + return __fill_bytes_graph_impl( + __pb, ::cuda::std::span(__dst.data_handle(), __dst.mapping().required_span_size()), __value); +} +//! \brief Adds a memset node to a CUDA graph path that bytewise-fills the destination. +//! +//! This overload is selected when the destination (after applying `launch_transform`) is +//! a contiguous range convertible to `cuda::std::span`. The element type must be trivially +//! copyable and non-const. The pointer captured in the node must remain valid until the +//! graph executes. +//! +//! \param __pb Path builder to insert the node into. +//! \param __dst Destination memory to fill. +//! \param __value Byte value to write to every byte of the destination. +//! \return A `graph_node_ref` for the newly added memset node. +//! \throws cuda::std::cuda_error if node creation fails. +_CCCL_TEMPLATE(typename _DstTy) +_CCCL_REQUIRES(::cuda::__spannable<::cuda::transformed_device_argument_t<_DstTy>>) +_CCCL_HOST_API graph_node_ref fill_bytes(path_builder& __pb, _DstTy&& __dst, ::cuda::std::uint8_t __value) +{ + return __fill_bytes_graph_impl( + __pb, + ::cuda::std::span( + ::cuda::launch_transform(::cuda::stream_ref{::cuda::invalid_stream}, ::cuda::std::forward<_DstTy>(__dst))), + __value); +} + +//! \brief Adds a memset node to a CUDA graph path that bytewise-fills the destination. +//! +//! This overload is selected when the destination (after applying `launch_transform`) is +//! a `cuda::std::mdspan`. The mdspan must be exhaustive. The element type must be trivially +//! copyable and non-const. The pointer captured in the node must remain valid until the +//! graph executes. +//! +//! \param __pb Path builder to insert the node into. +//! \param __dst Destination memory to fill. +//! 
\param __value Byte value to write to every byte of the destination. +//! \return A `graph_node_ref` for the newly added memset node. +//! \throws cuda::std::cuda_error if node creation fails. +_CCCL_TEMPLATE(typename _DstTy) +_CCCL_REQUIRES(::cuda::__mdspannable<::cuda::transformed_device_argument_t<_DstTy>>) +_CCCL_HOST_API graph_node_ref fill_bytes(path_builder& __pb, _DstTy&& __dst, ::cuda::std::uint8_t __value) +{ + return __fill_bytes_graph_impl( + __pb, + ::cuda::__as_mdspan( + ::cuda::launch_transform(::cuda::stream_ref{::cuda::invalid_stream}, ::cuda::std::forward<_DstTy>(__dst))), + __value); +} +} // namespace cuda::experimental + +# include + +#endif // _CCCL_CTK_AT_LEAST(12, 2) + +#endif // _CUDAX__GRAPH_FILL_BYTES_CUH diff --git a/cudax/include/cuda/experimental/__graph/graph_builder_ref.cuh b/cudax/include/cuda/experimental/__graph/graph_builder_ref.cuh index 4d0673e3d78..c90e6fe7550 100644 --- a/cudax/include/cuda/experimental/__graph/graph_builder_ref.cuh +++ b/cudax/include/cuda/experimental/__graph/graph_builder_ref.cuh @@ -262,16 +262,20 @@ private: //! \param __parent The parent graph to which this graph will be added. //! \return A `graph_node_ref` representing the added child graph. //! \throws cuda::std::cuda_error if `cudaGraphAddChildGraphNode` fails. +#if _CCCL_CTK_AT_LEAST(12, 2) template [[nodiscard]] _CCCL_HOST_API auto __add_to_graph(cudaGraph_t __parent, ::cuda::std::span __deps) -> graph_node_ref { + ::CUgraphNodeParams __params{}; + __params.type = ::CU_GRAPH_NODE_TYPE_GRAPH; + __params.graph.graph = __graph_; graph_node_ref __child; __child.__graph_ = __graph_; - __child.__node_ = - ::cuda::experimental::__driver::__graphAddChildGraphNode(__parent, __deps.data(), __deps.size(), __graph_); + __child.__node_ = ::cuda::experimental::__driver::__graphAddNode(__parent, __deps.data(), __deps.size(), &__params); return __child; } +#endif // _CCCL_CTK_AT_LEAST(12, 2) device_ref __dev_; //!< The device on which the graph is built. 
cudaGraph_t __graph_ = nullptr; //!< The underlying CUDA graph handle. diff --git a/cudax/include/cuda/experimental/__graph/graph_node_type.cuh b/cudax/include/cuda/experimental/__graph/graph_node_type.cuh index 0e51f70bed6..7ace27410f0 100644 --- a/cudax/include/cuda/experimental/__graph/graph_node_type.cuh +++ b/cudax/include/cuda/experimental/__graph/graph_node_type.cuh @@ -4,7 +4,7 @@ // under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. // //===----------------------------------------------------------------------===// diff --git a/cudax/include/cuda/experimental/__graph/host_launch.cuh b/cudax/include/cuda/experimental/__graph/host_launch.cuh new file mode 100644 index 00000000000..86b5b1fb43f --- /dev/null +++ b/cudax/include/cuda/experimental/__graph/host_launch.cuh @@ -0,0 +1,142 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. 
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDAX__GRAPH_HOST_LAUNCH_CUH
+#define _CUDAX__GRAPH_HOST_LAUNCH_CUH
+
+#include
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#if _CCCL_CTK_AT_LEAST(12, 2)
+
+#  include
+#  include
+#  include
+#  include
+#  include
+#  include
+#  include
+#  include
+#  include
+
+#  include
+#  include
+#  include
+
+#  include
+
+#  include
+
+namespace cuda::experimental
+{
+// Launcher for a plain no-arg function pointer.
+template <typename _FuncPtr>
+_CCCL_HOST_API inline void CUDA_CB __graph_func_ptr_launcher(void* __callable_ptr)
+{
+  reinterpret_cast<_FuncPtr>(__callable_ptr)();
+}
+
+// Launcher for a heap-allocated callable + argument pack. The graph host node callback
+// signature is void(void*), unlike the stream callback which is void(CUstream, CUresult, void*),
+// so we can't reuse ::cuda::__stream_callback_launcher here.
+template <typename _CallbackData>
+_CCCL_HOST_API inline void CUDA_CB __graph_callback_launcher(void* __data_ptr)
+{
+  auto* __data = static_cast<_CallbackData*>(__data_ptr);
+  // Copy, not move — the graph may be launched multiple times.
+  // Data lifetime is managed by a graph user object.
+  ::cuda::std::apply(__data->__callable_, __data->__args_);
+}
+
+template <typename _CallbackData>
+_CCCL_HOST_API inline void CUDA_CB __graph_callback_data_destroyer(void* __data_ptr)
+{
+  delete static_cast<_CallbackData*>(__data_ptr);
+}
+
+//! \brief Adds a host node to a CUDA graph path that invokes a callable on the host.
+//!
+//! The callable and its arguments are copied into a heap allocation whose lifetime is
+//! tied to the graph via a CUDA user object. The graph can be launched multiple times.
+//! The rules and restrictions match `cuda::host_launch`:
+//!
- The callable must not call into CUDA Runtime or Driver APIs.
+//! - It must not depend on another thread that could block on asynchronous CUDA work.
+//!
+//! Three dispatch paths (mirroring `cuda::host_launch`):
+//! 1. A bare no-arg function pointer; no allocation.
+//! 2. A `std::reference_wrapper` (no args) passes the address of the referenced object.
+//! 3. Everything else is heap-allocated with lifetime managed by a graph user object.
+//!
+//! \param __pb Path builder to insert the node into.
+//! \param __callable Callable to execute on the host.
+//! \param __args Arguments to forward to the callable.
+//! \return A `graph_node_ref` for the newly added host node.
+//! \throws cuda::std::cuda_error if node creation fails.
+template <typename _Callable, typename... _Args>
+_CCCL_HOST_API graph_node_ref host_launch(path_builder& __pb, _Callable __callable, _Args... __args)
+{
+  static_assert(::cuda::std::is_invocable_v<_Callable, _Args...>,
+                "Callable can't be called with the supplied arguments");
+  static_assert(::cuda::std::is_move_constructible_v<_Callable>, "The callable must be move constructible");
+  static_assert((::cuda::std::is_move_constructible_v<_Args> && ...),
+                "All callback arguments must be move constructible");
+
+  constexpr bool __has_args = sizeof...(_Args) > 0;
+
+  ::CUhostFn __fn = nullptr;
+  void* __user_data = nullptr;
+
+  if constexpr (!__has_args && ::cuda::std::is_pointer_v<_Callable>
+                && ::cuda::std::is_function_v<::cuda::std::remove_pointer_t<_Callable>>)
+  {
+    __fn = __graph_func_ptr_launcher<_Callable>;
+    __user_data = reinterpret_cast<void*>(__callable);
+  }
+  else if constexpr (!__has_args && ::cuda::std::__is_cuda_std_reference_wrapper_v<_Callable>)
+  {
+    __fn = ::cuda::__host_func_launcher;
+    __user_data = static_cast<void*>(::cuda::std::addressof(__callable.get()));
+  }
+  else
+  {
+    // Heap-allocate the callback data. Lifetime is tied to the graph via a user object.
+ using _CallbackData = ::cuda::__stream_callback_data<_Callable, _Args...>; + auto __data = new _CallbackData{::cuda::std::move(__callable), {::cuda::std::move(__args)...}}; + __fn = __graph_callback_launcher<_CallbackData>; + __user_data = __data; + ::cuda::experimental::__driver::__graphRetainUserObject( + __pb.get_native_graph_handle(), __data, __graph_callback_data_destroyer<_CallbackData>); + } + + auto __deps = __pb.get_dependencies(); + ::CUgraphNodeParams __params{}; + __params.type = ::CU_GRAPH_NODE_TYPE_HOST; + __params.host.fn = __fn; + __params.host.userData = __user_data; + auto __node = ::cuda::experimental::__driver::__graphAddNode( + __pb.get_native_graph_handle(), __deps.data(), __deps.size(), &__params); + + __pb.__clear_and_set_dependency_node(__node); + return graph_node_ref{__node, __pb.get_native_graph_handle()}; +} +} // namespace cuda::experimental + +# include + +#endif // _CCCL_CTK_AT_LEAST(12, 2) + +#endif // _CUDAX__GRAPH_HOST_LAUNCH_CUH diff --git a/cudax/include/cuda/experimental/__graph/path_builder.cuh b/cudax/include/cuda/experimental/__graph/path_builder.cuh index 927b349914d..8fead795057 100644 --- a/cudax/include/cuda/experimental/__graph/path_builder.cuh +++ b/cudax/include/cuda/experimental/__graph/path_builder.cuh @@ -13,6 +13,7 @@ #include +#include #include #include #include @@ -117,6 +118,49 @@ struct path_builder __nodes_.insert(__nodes_.end(), __other.__nodes_.begin(), __other.__nodes_.end()); } + //! \brief Adds an event-wait node and makes it the next dependency. + //! + //! The new node waits for \p __ev to be recorded before any successor of this node + //! executes. This mirrors `stream_ref::wait(event_ref)`. + //! + //! \param __ev Event that graph execution should wait on. + //! \return A `graph_node_ref` for the newly added event-wait node. + //! \throws cuda::std::cuda_error if `cudaGraphAddEventWaitNode` fails. 
+#if _CCCL_CTK_AT_LEAST(12, 2) + _CCCL_HOST_API graph_node_ref wait(::cuda::event_ref __ev) + { + auto __deps = get_dependencies(); + ::CUgraphNodeParams __params{}; + __params.type = ::CU_GRAPH_NODE_TYPE_WAIT_EVENT; + __params.eventWait.event = __ev.get(); + auto __node = ::cuda::experimental::__driver::__graphAddNode(__graph_, __deps.data(), __deps.size(), &__params); + + __clear_and_set_dependency_node(__node); + return graph_node_ref{__node, __graph_}; + } + + //! \brief Adds an event-record node and makes it the next dependency. + //! + //! The new node records \p __ev when it executes, signalling any downstream waiters. + //! This mirrors `stream_ref::record_event(event_flags)` but takes an existing event + //! because graph construction happens before execution. + //! + //! \param __ev Event to record. + //! \return A `graph_node_ref` for the newly added event-record node. + //! \throws cuda::std::cuda_error if `cudaGraphAddEventRecordNode` fails. + _CCCL_HOST_API graph_node_ref record_event(::cuda::event_ref __ev) + { + auto __deps = get_dependencies(); + ::CUgraphNodeParams __params{}; + __params.type = ::CU_GRAPH_NODE_TYPE_EVENT_RECORD; + __params.eventRecord.event = __ev.get(); + auto __node = ::cuda::experimental::__driver::__graphAddNode(__graph_, __deps.data(), __deps.size(), &__params); + + __clear_and_set_dependency_node(__node); + return graph_node_ref{__node, __graph_}; + } +#endif // _CCCL_CTK_AT_LEAST(12, 2) + template static constexpr bool __all_dependencies = (graph_dependency && ...); diff --git a/cudax/include/cuda/experimental/graph.cuh b/cudax/include/cuda/experimental/graph.cuh index 0b60303cafd..6055b9aaaf8 100644 --- a/cudax/include/cuda/experimental/graph.cuh +++ b/cudax/include/cuda/experimental/graph.cuh @@ -4,7 +4,7 @@ // under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. // //===----------------------------------------------------------------------===// @@ -12,12 +12,17 @@ #define __CUDAX_GRAPH_CUH // IWYU pragma: begin_exports +#include +#include +#include #include +#include #include #include #include #include #include +#include #include // IWYU pragma: end_exports diff --git a/cudax/test/CMakeLists.txt b/cudax/test/CMakeLists.txt index c5e853470c0..445e01f79ce 100644 --- a/cudax/test/CMakeLists.txt +++ b/cudax/test/CMakeLists.txt @@ -67,6 +67,7 @@ target_compile_options( cudax_add_catch2_test(test_target graph graph/graph_smoke.cu + graph/graph_node_ops_smoke.cu ) cudax_add_catch2_test(test_target stream diff --git a/cudax/test/graph/graph_node_ops_smoke.cu b/cudax/test/graph/graph_node_ops_smoke.cu new file mode 100644 index 00000000000..f19e639ef1b --- /dev/null +++ b/cudax/test/graph/graph_node_ops_smoke.cu @@ -0,0 +1,555 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include + +#if _CCCL_CTK_AT_LEAST(12, 2) + +# include +# include +# include + +# include +# include + +namespace +{ +namespace test +{ +// ─── helpers ─────────────────────────────────────────────────────────────── + +// RAII wrapper around a pinned-memory allocation of N elements of type T. 
+template +struct pinned_array +{ + _malloc_pinned mem; + std::size_t n; + + explicit pinned_array(std::size_t __n, T __init = T{}) + : mem(__n * sizeof(T)) + , n(__n) + { + for (std::size_t i = 0; i < n; ++i) + { + get()[i] = __init; + } + } + + pinned_array(const pinned_array&) = delete; + pinned_array& operator=(const pinned_array&) = delete; + + T* get() const noexcept + { + return mem.get_as(); + } + T& operator[](std::size_t i) const noexcept + { + return get()[i]; + } +}; + +// ─── kernels used in conditional tests ───────────────────────────────────── + +# if _CCCL_CTK_AT_LEAST(12, 4) +// Body kernel for while-loop conditional test: decrements a counter and +// stops the loop when it reaches zero. +struct count_down_and_stop +{ + __device__ void operator()(cudax::conditional_handle handle, int* counter) const noexcept + { + --(*counter); + if (*counter <= 0) + { + handle.disable(); + } + } +}; +# endif // _CCCL_CTK_AT_LEAST(12, 4) +} // namespace test +} // namespace + +// ──────────────────────────────────────────────────────────────────────────── +// fill_bytes +// ──────────────────────────────────────────────────────────────────────────── + +C2H_TEST("graph fill_bytes sets every byte to the requested value", "[graph][fill_bytes]") +{ + cudax::stream s{cuda::device_ref{0}}; + + constexpr std::size_t N = 64; + test::pinned_array mem{N, static_cast(0xDEADBEEF)}; + + cudax::graph_builder g; + cudax::path_builder pb = cudax::start_path(g); + + // Zero-fill via a graph memset node. 
+ cudax::fill_bytes(pb, ::cuda::std::span{mem.get(), N}, ::cuda::std::uint8_t{0}); + + auto exec = g.instantiate(); + exec.launch(s); + s.sync(); + + for (std::size_t i = 0; i < N; ++i) + { + CUDAX_REQUIRE(mem[i] == 0); + } +} + +C2H_TEST("graph fill_bytes with non-zero value", "[graph][fill_bytes]") +{ + cudax::stream s{cuda::device_ref{0}}; + + constexpr std::size_t N = 8; + test::pinned_array mem{N}; + + cudax::graph_builder g; + cudax::path_builder pb = cudax::start_path(g); + + cudax::fill_bytes(pb, ::cuda::std::span{mem.get(), N}, ::cuda::std::uint8_t{0xAB}); + + auto exec = g.instantiate(); + exec.launch(s); + s.sync(); + + for (std::size_t i = 0; i < N; ++i) + { + CUDAX_REQUIRE(mem[i] == static_cast(0xAB)); + } +} + +// ──────────────────────────────────────────────────────────────────────────── +// copy_bytes +// ──────────────────────────────────────────────────────────────────────────── + +C2H_TEST("graph copy_bytes copies data from source to destination", "[graph][copy_bytes]") +{ + cudax::stream s{cuda::device_ref{0}}; + + constexpr std::size_t N = 32; + test::pinned_array src{N}; + test::pinned_array dst{N, -1}; + + for (std::size_t i = 0; i < N; ++i) + { + src[i] = static_cast(i * 7); + } + + cudax::graph_builder g; + cudax::path_builder pb = cudax::start_path(g); + + cudax::copy_bytes(pb, ::cuda::std::span{src.get(), N}, ::cuda::std::span{dst.get(), N}); + + auto exec = g.instantiate(); + exec.launch(s); + s.sync(); + + for (std::size_t i = 0; i < N; ++i) + { + CUDAX_REQUIRE(dst[i] == src[i]); + } +} + +C2H_TEST("graph copy_bytes can be chained after fill_bytes", "[graph][fill_bytes][copy_bytes]") +{ + cudax::stream s{cuda::device_ref{0}}; + + constexpr std::size_t N = 16; + test::pinned_array src{N}; + test::pinned_array dst{N}; + + cudax::graph_builder g; + cudax::path_builder pb = cudax::start_path(g); + + // Fill source with 0xFF, then copy to destination. 
+ cudax::fill_bytes(pb, ::cuda::std::span{src.get(), N}, ::cuda::std::uint8_t{0xFF}); + cudax::copy_bytes(pb, ::cuda::std::span{src.get(), N}, ::cuda::std::span{dst.get(), N}); + + auto exec = g.instantiate(); + exec.launch(s); + s.sync(); + + for (std::size_t i = 0; i < N; ++i) + { + CUDAX_REQUIRE(dst[i] == static_cast(0xFF)); + } +} + +// ──────────────────────────────────────────────────────────────────────────── +// host_launch +// ──────────────────────────────────────────────────────────────────────────── + +C2H_TEST("graph host_launch executes a lambda callback", "[graph][host_launch]") +{ + cudax::stream s{cuda::device_ref{0}}; + // pinned so the host-side increment is visible immediately after sync + test::pinned counter{0}; + + cudax::graph_builder g; + cudax::path_builder pb = cudax::start_path(g); + + // Capture the pointer by value so the callback remains valid after graph build. + int* ptr = counter.get(); + cudax::host_launch(pb, [ptr]() { + *ptr = 42; + }); + + auto exec = g.instantiate(); + exec.launch(s); + s.sync(); + + CUDAX_REQUIRE(*counter == 42); +} + +C2H_TEST("graph host_launch with arguments", "[graph][host_launch]") +{ + cudax::stream s{cuda::device_ref{0}}; + test::pinned a{10}; + test::pinned b{20}; + test::pinned result{0}; + + cudax::graph_builder g; + cudax::path_builder pb = cudax::start_path(g); + + int* pa = a.get(); + int* pb2 = b.get(); + int* pr = result.get(); + cudax::host_launch( + pb, + [](int* x, int* y, int* r) { + *r = *x + *y; + }, + pa, + pb2, + pr); + + auto exec = g.instantiate(); + exec.launch(s); + s.sync(); + + CUDAX_REQUIRE(*result == 30); +} + +C2H_TEST("graph host_launch can be chained with kernel nodes", "[graph][host_launch]") +{ + cudax::stream s{cuda::device_ref{0}}; + test::pinned_array mem{1}; + + cudax::graph_builder g; + cudax::path_builder pb = cudax::start_path(g); + + // Kernel sets value to 42. 
+ int* ptr = mem.get(); + cudax::launch(pb, test::one_thread_dims, test::assign_42{}, ptr); + + // Host callback increments it. + cudax::host_launch(pb, [ptr]() { + *ptr += 1; + }); + + // Kernel verifies the final value is 43. + cudax::launch(pb, test::one_thread_dims, test::verify_n<43>{}, ptr); + + auto exec = g.instantiate(); + exec.launch(s); + s.sync(); + + CUDAX_REQUIRE(mem[0] == 43); +} + +C2H_TEST("graph host_launch can be launched multiple times", "[graph][host_launch]") +{ + cudax::stream s{cuda::device_ref{0}}; + test::pinned_array mem{1}; + int* ptr = mem.get(); + + cudax::graph_builder g; + cudax::path_builder pb = cudax::start_path(g); + + // Host callback increments the value each time. + cudax::host_launch(pb, [ptr]() { + *ptr += 1; + }); + + auto exec = g.instantiate(); + + // Launch 5 times — each launch should increment by 1. + for (int i = 0; i < 5; ++i) + { + exec.launch(s); + s.sync(); + CUDAX_REQUIRE(mem[0] == i + 1); + } +} + +C2H_TEST("graph host_launch data is cleaned up when graph is destroyed", "[graph][host_launch]") +{ + // Use a shared_ptr as a witness: the weak_ptr expires when all copies are gone. + auto witness = ::std::make_shared(42); + ::std::weak_ptr weak = witness; + + { + cudax::graph_builder g; + cudax::path_builder pb = cudax::start_path(g); + + // The lambda captures a copy of the shared_ptr, which gets stored in the graph's user object. + cudax::host_launch(pb, [witness]() { + (void) witness; + }); + + // Release our copy — the graph's user object should keep the shared_ptr alive. + witness.reset(); + CUDAX_REQUIRE(!weak.expired()); + } + // graph_builder destroyed — user object destructor should have deleted the callback data, + // releasing the last shared_ptr copy. 
+ CUDAX_REQUIRE(weak.expired()); +} + +// ──────────────────────────────────────────────────────────────────────────── +// event record / wait +// ──────────────────────────────────────────────────────────────────────────── + +C2H_TEST("graph record_event and wait(event_ref) impose ordering across independent paths", + "[graph][event_record][event_wait]") +{ + cudax::stream s{cuda::device_ref{0}}; + test::pinned_array mem{1}; + + cuda::event ev{cuda::device_ref{0}}; + + cudax::graph_builder g; + + // Path A: assign 42, then record the event. + int* val = mem.get(); + cudax::path_builder path_a = cudax::start_path(g); + cudax::launch(path_a, test::one_thread_dims, test::assign_42{}, val); + path_a.record_event(ev); + + // Path B (independent start): wait on the event, then verify value is 42. + cudax::path_builder path_b = cudax::start_path(g); // no deps from path_a + path_b.wait(ev); + cudax::launch(path_b, test::one_thread_dims, test::verify_42{}, val); + + // Drain both paths. + path_a.wait(path_b); + auto exec = g.instantiate(); + exec.launch(s); + s.sync(); + + CUDAX_REQUIRE(mem[0] == 42); +} + +C2H_TEST("graph record_event node has the correct node type", "[graph][event_record]") +{ + cudax::graph_builder g; + cudax::path_builder pb = cudax::start_path(g); + + cuda::event ev{cuda::device_ref{0}}; + auto node = pb.record_event(ev); + + CUDAX_REQUIRE(node.type() == cudax::graph_node_type::event_record); +} + +C2H_TEST("graph wait(event_ref) node has the correct node type", "[graph][event_wait]") +{ + cudax::graph_builder g; + cudax::path_builder pb = cudax::start_path(g); + + cuda::event ev{cuda::device_ref{0}}; + auto node = pb.wait(ev); + + CUDAX_REQUIRE(node.type() == cudax::graph_node_type::wait_event); +} + +// ──────────────────────────────────────────────────────────────────────────── +// child graph +// ──────────────────────────────────────────────────────────────────────────── + +C2H_TEST("graph insert_child_graph embeds a subgraph", 
"[graph][child_graph]") +{ + cudax::stream s{cuda::device_ref{0}}; + test::pinned_array mem{1}; + int* val = mem.get(); + + // Build the child graph: kernel that assigns 42. + cudax::graph_builder child_g; + { + cudax::path_builder child_pb = cudax::start_path(child_g); + cudax::launch(child_pb, test::one_thread_dims, test::assign_42{}, val); + } + + // Build the parent graph: embed the child, then verify. + cudax::graph_builder parent_g; + cudax::path_builder pb = cudax::start_path(parent_g); + + cudax::insert_child_graph(pb, child_g); + cudax::launch(pb, test::one_thread_dims, test::verify_42{}, val); + + auto exec = parent_g.instantiate(); + exec.launch(s); + s.sync(); + + CUDAX_REQUIRE(mem[0] == 42); +} + +# if _CCCL_CTK_AT_LEAST(12, 9) +C2H_TEST("graph insert_child_graph with ownership transfer", "[graph][child_graph]") +{ + cudax::stream s{cuda::device_ref{0}}; + test::pinned_array mem{1}; + int* val = mem.get(); + + cudax::graph_builder child_g; + { + cudax::path_builder child_pb = cudax::start_path(child_g); + cudax::launch(child_pb, test::one_thread_dims, test::assign_42{}, val); + } + + cudax::graph_builder parent_g; + cudax::path_builder pb = cudax::start_path(parent_g); + + // Move the child graph into the parent — child_g is null afterwards. 
+ cudax::insert_child_graph(pb, std::move(child_g)); + CUDAX_REQUIRE(child_g.get() == nullptr); + + cudax::launch(pb, test::one_thread_dims, test::verify_42{}, val); + + auto exec = parent_g.instantiate(); + exec.launch(s); + s.sync(); + + CUDAX_REQUIRE(mem[0] == 42); +} +# endif // _CCCL_CTK_AT_LEAST(12, 9) + +C2H_TEST("graph insert_child_graph node has the correct node type", "[graph][child_graph]") +{ + cudax::graph_builder child_g; + { + cudax::path_builder child_pb = cudax::start_path(child_g); + cudax::launch(child_pb, test::one_thread_dims, test::empty_kernel{}); + } + + cudax::graph_builder parent_g; + cudax::path_builder pb = cudax::start_path(parent_g); + + auto node = cudax::insert_child_graph(pb, child_g); + + CUDAX_REQUIRE(node.type() == cudax::graph_node_type::graph); +} + +// ──────────────────────────────────────────────────────────────────────────── +// conditional nodes (if / while) +// ──────────────────────────────────────────────────────────────────────────── + +# if _CCCL_CTK_AT_LEAST(12, 4) + +C2H_TEST("graph make_if_node body executes when handle is non-zero", "[graph][conditional][if_node]") +{ + cudax::stream s{cuda::device_ref{0}}; + test::pinned_array mem{1}; + int* val = mem.get(); + + cudax::graph_builder g; + cudax::path_builder pb = cudax::start_path(g); + + // Default value 1 → body executes. + auto [cond_node, body_graph, handle] = cudax::make_if_node(pb, /*__default_val=*/1u); + + // Populate the body graph: assign 42 to val. 
+ { + cudax::path_builder body_pb = cudax::start_path(body_graph); + cudax::launch(body_pb, test::one_thread_dims, test::assign_42{}, val); + } + + auto exec = g.instantiate(); + exec.launch(s); + s.sync(); + + CUDAX_REQUIRE(mem[0] == 42); +} + +C2H_TEST("graph make_if_node body is skipped when handle is zero", "[graph][conditional][if_node]") +{ + cudax::stream s{cuda::device_ref{0}}; + test::pinned_array mem{1}; + int* val = mem.get(); + + cudax::graph_builder g; + cudax::path_builder pb = cudax::start_path(g); + + // Default value 0 → body is skipped. + auto [cond_node, body_graph, handle] = cudax::make_if_node(pb, /*__default_val=*/0u); + + { + cudax::path_builder body_pb = cudax::start_path(body_graph); + cudax::launch(body_pb, test::one_thread_dims, test::assign_42{}, val); + } + + auto exec = g.instantiate(); + exec.launch(s); + s.sync(); + + // val should remain 0 because the body was skipped. + CUDAX_REQUIRE(mem[0] == 0); +} + +C2H_TEST("graph make_while_node body executes the expected number of times", "[graph][conditional][while_node]") +{ + cudax::stream s{cuda::device_ref{0}}; + test::pinned_array mem{1, 5}; // will be decremented to 0 + + cudax::graph_builder g; + cudax::path_builder pb = cudax::start_path(g); + + // Default value 1 → loop runs as long as counter > 0. + auto [while_node, body_graph, handle] = cudax::make_while_node(pb); + + // Body: decrement counter and stop when done. + { + cudax::path_builder body_pb = cudax::start_path(body_graph); + cudax::launch(body_pb, test::one_thread_dims, test::count_down_and_stop{}, handle, mem.get()); + } + + auto exec = g.instantiate(); + exec.launch(s); + s.sync(); + + CUDAX_REQUIRE(mem[0] == 0); +} + +C2H_TEST("graph make_if_node with pre-constructed handle", "[graph][conditional][if_node]") +{ + cudax::stream s{cuda::device_ref{0}}; + test::pinned_array mem{1}; + int* val = mem.get(); + + cudax::graph_builder g; + cudax::path_builder pb = cudax::start_path(g); + + // User constructs handle directly. 
+ cudax::conditional_handle my_handle{g, 1u}; + auto [cond_node, body_graph, handle] = cudax::make_if_node(pb, my_handle); + + { + cudax::path_builder body_pb = cudax::start_path(body_graph); + cudax::launch(body_pb, test::one_thread_dims, test::assign_42{}, val); + } + + auto exec = g.instantiate(); + exec.launch(s); + s.sync(); + + CUDAX_REQUIRE(mem[0] == 42); +} + +# endif // _CCCL_CTK_AT_LEAST(12, 4) + +#endif // _CCCL_CTK_AT_LEAST(12, 2) diff --git a/libcudacxx/include/cuda/__device/device_ref.h b/libcudacxx/include/cuda/__device/device_ref.h index 70ead5ef77b..1aecd3bfe08 100644 --- a/libcudacxx/include/cuda/__device/device_ref.h +++ b/libcudacxx/include/cuda/__device/device_ref.h @@ -4,7 +4,7 @@ // under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. // //===----------------------------------------------------------------------===// @@ -128,6 +128,12 @@ class device_ref //! @brief Initializes the primary context of the device. _CCCL_HOST_API void init() const; // implemented in to avoid circular dependency + //! @brief Retrieve the primary context of this device. + //! + //! @return The primary CUDA context for this device. + [[nodiscard]] _CCCL_HOST_API ::CUcontext primary_context() const; // implemented in + // to avoid circular dependency + //! @brief Retrieve the name of this device. //! //! @return String view containing the name of this device. diff --git a/libcudacxx/include/cuda/__device/physical_device.h b/libcudacxx/include/cuda/__device/physical_device.h index b827294fbb6..d45a8019c8a 100644 --- a/libcudacxx/include/cuda/__device/physical_device.h +++ b/libcudacxx/include/cuda/__device/physical_device.h @@ -4,7 +4,7 @@ // under the Apache License v2.0 with LLVM Exceptions. 
// See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. // //===----------------------------------------------------------------------===// @@ -159,6 +159,11 @@ _CCCL_HOST_API inline void device_ref::init() const (void) ::cuda::__physical_devices()[__id_].__primary_context(); } +[[nodiscard]] _CCCL_HOST_API inline ::CUcontext device_ref::primary_context() const +{ + return ::cuda::__physical_devices()[__id_].__primary_context(); +} + [[nodiscard]] _CCCL_HOST_API inline ::cuda::std::string_view device_ref::name() const { return ::cuda::__physical_devices()[__id_].__name();