Skip to content
Open
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion xla/backends/gpu/runtime/command_buffer_conversion_pass.cc
Original file line number Diff line number Diff line change
Expand Up @@ -443,7 +443,8 @@ ConvertThunksToCommandBuffer(
const DebugOptions& debug_options) {
bool enable_loop_unroll = debug_options.xla_gpu_command_buffer_unroll_loops();
bool enable_va_remapping =
debug_options.xla_gpu_enable_command_buffer_va_remapping();
debug_options.xla_gpu_enable_command_buffer_va_remapping() ||
debug_options.xla_gpu_enable_circular_vmm_pool();
TF_ASSIGN_OR_RETURN(
CommandExecutor cmd_executor,
ConvertToCommands(
Expand Down
61 changes: 44 additions & 17 deletions xla/backends/gpu/runtime/command_buffer_thunk.cc
Original file line number Diff line number Diff line change
Expand Up @@ -198,20 +198,28 @@ absl::Status CommandBufferThunk::Initialize(const InitializeParams& params) {
/*additional_compute_streams=*/{}, params.execution_scoped_state,
/*mock_collectives=*/false);

// If command buffer is in `kCreate` state it means that command buffer
// sequence was never recorded into it. We initialize all command buffers
// before execution, because command buffers when instantiated will allocate
// memory on device and this might lead to deadlocks when we have concurrent
// NCCL operations in flight.
//
// If commands require initialization (and VA remapping is not enabled), we
// also record them into the command buffer before execution. This is required
// to guarantee that collective commands are recorded on all participating
// ranks to avoid deadlocks.
if (cmd_buffer->warmup_done && (cmd_buffer->command_buffer->state() ==
se::CommandBuffer::State::kCreate ||
(!enable_command_buffer_va_remapping_ &&
commands_.requires_initialization()))) {
bool warmup = cmd_buffer->warmup_done;
auto state = cmd_buffer->command_buffer->state();
bool will_record = warmup && (state == se::CommandBuffer::State::kCreate ||
(!enable_command_buffer_va_remapping_ &&
commands_.requires_initialization()));
LOG(INFO) << absl::StrFormat(
"CommandBufferThunk::Initialize: warmup_done=%d state=%d "
"va_remapping=%d requires_init=%d will_record=%d",
warmup, static_cast<int>(state), enable_command_buffer_va_remapping_,
commands_.requires_initialization(), will_record);

// Log the addresses that will be used for recording
if (will_record) {
for (auto idx : commands_.allocs_indices()) {
auto addr = execute_params.buffer_allocations->GetDeviceAddress(idx);
LOG(INFO) << absl::StrFormat(
" Initialize record addr[%d]: %p size=%d", idx, addr.opaque(),
addr.size());
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Cleanup: LOG(INFO) in hot path should be VLOG

Multiple LOG(INFO) statements are added to Initialize and ExecuteOnStream that fire on every iteration. These will produce massive log output in production and add measurable overhead. The existing code already uses VLOG(2)/VLOG(3) for this purpose.

These should be converted to VLOG(2) or VLOG(3) before merging, or removed if they were only needed during development.

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Resolved -- addressed in this revision. The LOG(INFO) statements in Initialize and ExecuteOnStream have been converted to VLOG(3).

}
}

if (will_record) {
VLOG(3) << "Initialize command buffer on device #"
<< params.executor->device_ordinal()
<< " by recoding command buffer cmd sequence"
Expand Down Expand Up @@ -271,22 +279,41 @@ absl::Status CommandBufferThunk::ExecuteOnStream(const ExecuteParams& params) {

// warm up iteration, run through thunks if they are present.
if (!cmd_buffer->warmup_done && thunks_) {
VLOG(2) << "Executing warm up iteration of command buffer thunk";
LOG(INFO) << "CommandBufferThunk: WARMUP - running sequential thunks";
for (auto idx : commands_.allocs_indices()) {
auto addr = params.buffer_allocations->GetDeviceAddress(idx);
LOG(INFO) << absl::StrFormat(
" Warmup addr[%d]: %p size=%d", idx, addr.opaque(), addr.size());
}
TF_RETURN_IF_ERROR(thunks_->ExecuteOnStream(params));
cmd_buffer->warmup_done = true;
return absl::OkStatus();
}

auto updated_allocs = cmd_buffer->UpdateBufferAllocations(commands_, params);

// Determine whether to (re-)record the command buffer and whether this is a
// first-time initialization recording (VA remapping path).
bool is_first_record =
enable_command_buffer_va_remapping_ &&
cmd_buffer->command_buffer->state() == se::CommandBuffer::State::kCreate;
bool needs_update = !enable_command_buffer_va_remapping_ &&
(!updated_allocs.empty() || commands_.force_update());

LOG(INFO) << absl::StrFormat(
"CommandBufferThunk::ExecuteOnStream: va_remapping=%d updated_allocs=%d "
"is_first_record=%d needs_update=%d num_executions=%d state=%d",
enable_command_buffer_va_remapping_, updated_allocs.size(),
is_first_record, needs_update, cmd_buffer->num_executions,
static_cast<int>(cmd_buffer->command_buffer->state()));

// Log addresses on first few executions
if (cmd_buffer->num_executions < 3) {
for (auto idx : commands_.allocs_indices()) {
auto addr = params.buffer_allocations->GetDeviceAddress(idx);
LOG(INFO) << absl::StrFormat(
" Execute addr[%d]: %p size=%d", idx, addr.opaque(), addr.size());
}
}

if (is_first_record || needs_update) {
XLA_VLOG_DEVICE(3, executor->device_ordinal())
<< "Create/Update command buffer"
Expand Down
17 changes: 17 additions & 0 deletions xla/debug_options_flags.cc
Original file line number Diff line number Diff line change
Expand Up @@ -503,6 +503,8 @@ DebugOptions DefaultDebugOptionsIgnoringFlags() {

opts.set_xla_gpu_enable_pdl(true);
opts.set_xla_gpu_enable_command_buffer_va_remapping(false);
opts.set_xla_gpu_enable_circular_vmm_pool(false);
opts.set_xla_gpu_circular_vmm_pool_slots(1);
return opts;
}

Expand Down Expand Up @@ -3012,6 +3014,21 @@ void MakeDebugOptionsFlags(std::vector<tsl::Flag>* flag_list,
"Enable VA remapping for command buffer thunks. When enabled, command "
"buffer thunks use fixed virtual addresses across executions, allowing "
"the command buffer to be recorded once and replayed without updates."));
flag_list->push_back(tsl::Flag(
    "xla_gpu_enable_circular_vmm_pool",
    bool_setter_for(
        &DebugOptions::set_xla_gpu_enable_circular_vmm_pool),
    debug_options->xla_gpu_enable_circular_vmm_pool(),
    "Enable circular VMM pool for command buffer thunks. Pre-allocates N "
    "physical memory slots with permanent VA mappings, using GPU timeline "
    "signaling for safe slot reuse. Eliminates per-iteration map/unmap "
    "overhead entirely after startup."));
flag_list->push_back(tsl::Flag(
    "xla_gpu_circular_vmm_pool_slots",
    int32_setter_for(
        &DebugOptions::set_xla_gpu_circular_vmm_pool_slots),
    debug_options->xla_gpu_circular_vmm_pool_slots(),
    // Keep in sync with DefaultDebugOptionsIgnoringFlags(), which sets
    // xla_gpu_circular_vmm_pool_slots to 1. Note: with a single slot,
    // AcquireNextSlot must wait for GPU completion every iteration (no
    // overlap between consecutive iterations).
    "Number of slots in the circular VMM pool (default 1)."));
} // NOLINT(readability/fn_size)

// Allocates flag_values and flag_objects; this function must not be called more
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Bug: default value / help text mismatch

The actual default is set to 1 (line 507: set_xla_gpu_circular_vmm_pool_slots(1)), but the flag help text here says "(default 2)".

With 1 slot, AcquireNextSlot will spin-wait for GPU completion on every iteration (no overlap), negating the primary benefit of the circular pool. Either fix the help text to say "default 1" or change the default to 2.

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Partially resolved. The flag help text in debug_options_flags.cc now correctly says "(default 1)", but the proto comment in xla.proto still says "(default 2)" -- see xla_gpu_circular_vmm_pool_slots field comment. Please update the proto comment to match the actual default of 1.

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Resolved -- fully addressed in this revision. Both the flag help text in debug_options_flags.cc and the proto comment in xla.proto now correctly say "(default 1)".

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Resolved: both the flag help text (line 3034) and proto comment now correctly say "(default 1)", matching the actual default value.

Expand Down
5 changes: 4 additions & 1 deletion xla/service/gpu/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -703,6 +703,7 @@ cc_library(
name = "gpu_executable",
srcs = ["gpu_executable.cc"],
hdrs = ["gpu_executable.h"],
local_defines = if_rocm_is_configured(["TENSORFLOW_USE_ROCM=1"]),
deps = [
":alias_info",
":backend_configs_cc",
Expand Down Expand Up @@ -804,7 +805,9 @@ cc_library(
"@tsl//tsl/platform:random",
"@tsl//tsl/profiler/lib:scoped_annotation",
"@tsl//tsl/profiler/lib:traceme",
],
] + if_rocm_is_configured([
"//xla/stream_executor/rocm:circular_vmm_pool",
]),
)

tf_proto_library(
Expand Down
171 changes: 165 additions & 6 deletions xla/service/gpu/gpu_executable.cc
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,9 @@ limitations under the License.
#include "xla/stream_executor/stream_executor.h"
#include "xla/stream_executor/sycl/sycl_platform_id.h"
#include "xla/stream_executor/vmm_device_address_allocator.h"
#if TENSORFLOW_USE_ROCM
#include "xla/stream_executor/rocm/circular_vmm_pool.h"
#endif
#include "xla/tsl/platform/env.h"
#include "xla/tsl/platform/env_time.h"
#include "xla/tsl/platform/logging.h"
Expand Down Expand Up @@ -1515,6 +1518,151 @@ absl::Status GpuExecutable::ExecuteThunksWithVaRemapping(
return absl::OkStatus();
}

absl::Status GpuExecutable::ExecuteThunksWithCircularVmmPool(
    const BufferAllocations& buffer_allocations,
    const ServiceExecutableRunOptions* run_options,
    se::StreamExecutor* executor, int64_t unique_id,
    Thunk::ExecutableSource executable_source, bool block_host_until_done) {
#if TENSORFLOW_USE_ROCM
  // Build sets: which buffers go into the pool, which need data copying.
  // Pool ALL non-constant command buffer allocations (params + temps).
  // Constants keep BFC addresses (loaded from module globals, stable).
  absl::btree_set<BufferAllocation::Index> pool_indexes;
  absl::btree_set<BufferAllocation::Index> copy_indexes;
  if (buffer_assignment_) {
    for (BufferAllocation::Index idx : command_buffer_allocation_indexes_) {
      const auto& alloc = buffer_assignment_->GetAllocation(idx);
      if (alloc.is_constant() || alloc.size() == 0) continue;
      pool_indexes.insert(idx);
      if (alloc.is_entry_computation_parameter() || alloc.maybe_live_out()) {
        copy_indexes.insert(idx);
      }
    }
  }

  // Nothing to pool: execute on the plain path. (With no poolable buffers a
  // pool is never created, so every call takes this branch — same behavior
  // as creating none.)
  if (pool_indexes.empty()) {
    return ExecuteThunksImpl(
        has_module() ? &module_config().debug_options() : nullptr,
        module_name_, unique_id, *thunk_executor_, executable_source,
        run_options, buffer_allocations, block_host_until_done,
        execution_stream_ids_, collective_memory_cache_);
  }

  // Look up per-executor pool state, create the pool on first use, and bump
  // the iteration counter — all under `circular_pool_mutex_`. Holding the
  // lock across init prevents two threads from both seeing a null pool and
  // double-creating it (the second std::move would orphan in-flight
  // iterations on the first pool); bumping the counter under the lock
  // guarantees two concurrent iterations never observe the same value, which
  // would make them share a slot and corrupt data. Pool creation allocates
  // GPU memory under the lock, but only once per (module, executor).
  CircularPoolState* pool_state = nullptr;
  uint64_t iteration = 0;
  {
    absl::MutexLock lock(&circular_pool_mutex_);
    pool_state = &circular_pools_[executor];

    if (pool_state->pool == nullptr) {
      int num_slots =
          has_module()
              ? module_config().debug_options().xla_gpu_circular_vmm_pool_slots()
              : 1;

      // Slot layout mirrors the (sorted) pool_indexes order; the mapping
      // loops below rely on this ordering when walking slot_addresses.
      std::vector<uint64_t> buffer_sizes;
      buffer_sizes.reserve(pool_indexes.size());
      for (BufferAllocation::Index idx : pool_indexes) {
        buffer_sizes.push_back(buffer_allocations.GetDeviceAddress(idx).size());
      }

      TF_ASSIGN_OR_RETURN(
          auto pool,
          se::gpu::CircularVmmPool::Create(executor, buffer_sizes, num_slots));

      // One-time creation message; per-iteration logging below is VLOG(3).
      LOG(INFO) << absl::StrFormat(
          "CircularVmmPool: created %d slots for module %s on device %d "
          "(%d command buffer allocations)",
          num_slots, module_name_, executor->device_ordinal(),
          command_buffer_allocation_indexes_.size());

      pool_state->pool = std::move(pool);
    }

    iteration = pool_state->iteration_count++;
  }

  auto* pool = static_cast<se::gpu::CircularVmmPool*>(pool_state->pool.get());

  // Acquire next slot — non-blocking check of GPU timeline counter.
  TF_ASSIGN_OR_RETURN(auto slot_addresses, pool->AcquireNextSlot(iteration));

  VLOG(3) << absl::StrFormat(
      "CircularVmmPool iter=%d slot=%d/%d: %d pool addrs", iteration,
      static_cast<int>(iteration % pool->num_slots()), pool->num_slots(),
      slot_addresses.size());

  // Build remapped buffer allocations: all pooled buffers use pool VA
  // addresses; constants and non-command-buffer buffers keep BFC addresses.
  // For params/live-out, copy data from BFC into pool VA before execution.
  std::vector<se::DeviceAddressBase> mapped_buffers;
  mapped_buffers.reserve(buffer_allocations.size());
  int slot_addr_idx = 0;
  for (BufferAllocation::Index i = 0;
       i < static_cast<BufferAllocation::Index>(buffer_allocations.size());
       ++i) {
    if (pool_indexes.contains(i)) {
      auto pool_addr = slot_addresses[slot_addr_idx++];
      auto bfc_addr = buffer_allocations.GetDeviceAddress(i);

      // Copy param data from BFC into pool before execution. This is needed
      // because the graph uses stable pool VA addresses, but the actual data
      // lives at BFC addresses which may change. For params, the data itself
      // may also change (e.g., optimizer weight updates in training).
      if (copy_indexes.contains(i) && !bfc_addr.is_null() &&
          bfc_addr.size() > 0) {
        se::DeviceAddressBase pool_dst(pool_addr.opaque(), bfc_addr.size());
        TF_RETURN_IF_ERROR(run_options->stream()->MemcpyD2D(
            &pool_dst, bfc_addr, bfc_addr.size()));
      }
      mapped_buffers.push_back(pool_addr);
    } else {
      mapped_buffers.push_back(buffer_allocations.GetDeviceAddress(i));
    }
  }

  BufferAllocations remapped_buffer_allocations(
      mapped_buffers, buffer_allocations.device_ordinal(),
      buffer_allocations.memory_allocator());

  TF_RETURN_IF_ERROR(ExecuteThunksImpl(
      has_module() ? &module_config().debug_options() : nullptr, module_name_,
      unique_id, *thunk_executor_, executable_source, run_options,
      remapped_buffer_allocations, block_host_until_done,
      execution_stream_ids_, collective_memory_cache_));

  // Copy live-out results back from pool to BFC so the output appears at
  // the expected BFC address for downstream consumers.
  slot_addr_idx = 0;
  for (BufferAllocation::Index i = 0;
       i < static_cast<BufferAllocation::Index>(buffer_allocations.size());
       ++i) {
    if (pool_indexes.contains(i)) {
      auto pool_addr = slot_addresses[slot_addr_idx++];
      if (copy_indexes.contains(i) && buffer_assignment_) {
        const auto& alloc = buffer_assignment_->GetAllocation(i);
        if (alloc.maybe_live_out()) {
          auto bfc_addr = buffer_allocations.GetDeviceAddress(i);
          if (!bfc_addr.is_null() && bfc_addr.size() > 0) {
            se::DeviceAddressBase bfc_dst(bfc_addr.opaque(), bfc_addr.size());
            se::DeviceAddressBase pool_src(pool_addr.opaque(), bfc_addr.size());
            TF_RETURN_IF_ERROR(run_options->stream()->MemcpyD2D(
                &bfc_dst, pool_src, bfc_addr.size()));
          }
        }
      }
    }
  }

  // GPU signals slot completion so the CPU knows when this slot is safe to
  // reuse (non-blocking write via hipStreamWriteValue64).
  TF_RETURN_IF_ERROR(pool->ReleaseSlot(run_options->stream(), iteration));

  // Any host sync performed inside ExecuteThunksImpl happened before the
  // copy-back D2D memcpys and the slot-release signal above were enqueued.
  // Sync again here so callers that requested blocking semantics observe the
  // live-out data at the BFC address and a fully-signaled slot on return.
  if (block_host_until_done) {
    TF_RETURN_IF_ERROR(run_options->stream()->BlockHostUntilDone());
  }

  return absl::OkStatus();
#else
  return absl::UnimplementedError(
      "Circular VMM pool is only supported on ROCm.");
#endif
}

absl::Status GpuExecutable::ExecuteThunks(
const BufferAllocations& buffer_allocations,
const ServiceExecutableRunOptions* run_options) {
Expand Down Expand Up @@ -1587,21 +1735,32 @@ absl::Status GpuExecutable::ExecuteThunks(

se::StreamExecutor* executor = run_options->stream()->parent();

bool has_cmd_buffer_allocs = !command_buffer_allocation_indexes_.empty();

// Check if circular VMM pool is enabled (takes priority over VA remapping).
bool enable_circular_vmm_pool =
has_cmd_buffer_allocs && has_module() &&
module_config().debug_options().xla_gpu_enable_circular_vmm_pool();

// Check if command buffer VA remapping is enabled.
bool enable_command_buffer_va_remapping =
(command_buffer_allocation_indexes_.size() > 0) && has_module() &&
!enable_circular_vmm_pool && has_cmd_buffer_allocs && has_module() &&
module_config()
.debug_options()
.xla_gpu_enable_command_buffer_va_remapping() &&
dynamic_cast<se::DeviceAddressVmmAllocator*>(memory_allocator) != nullptr;

XLA_VLOG_DEVICE(3, executor->device_ordinal()) << absl::StreamFormat(
"ExecuteThunks: command_buffer_allocation_indexes_.size()=%d "
"enable_command_buffer_va_remapping=%d",
command_buffer_allocation_indexes_.size(),
LOG(INFO) << absl::StreamFormat(
"ExecuteThunks: cmd_buffer_allocs=%d circular_vmm_pool=%d "
"va_remapping=%d",
command_buffer_allocation_indexes_.size(), enable_circular_vmm_pool,
enable_command_buffer_va_remapping);
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Cleanup: LOG(INFO) in hot path — should be VLOG

This fires on every invocation of ExecuteThunks. The original code used XLA_VLOG_DEVICE(3, ...). Should be restored to VLOG(3) or XLA_VLOG_DEVICE to avoid flooding production logs.

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Resolved -- addressed in this revision. The ExecuteThunks logging now uses XLA_VLOG_DEVICE(3, ...) instead of LOG(INFO).


if (enable_command_buffer_va_remapping) {
if (enable_circular_vmm_pool) {
TF_RETURN_IF_ERROR(ExecuteThunksWithCircularVmmPool(
buffer_allocations, run_options, executor, unique_id, executable_source,
block_host_until_done));
} else if (enable_command_buffer_va_remapping) {
TF_RETURN_IF_ERROR(ExecuteThunksWithVaRemapping(
buffer_allocations, run_options, executor, unique_id, executable_source,
block_host_until_done));
Expand Down
18 changes: 18 additions & 0 deletions xla/service/gpu/gpu_executable.h
Original file line number Diff line number Diff line change
Expand Up @@ -430,6 +430,24 @@ class GpuExecutable : public Executable {
absl::node_hash_map<stream_executor::StreamExecutor*, VaRanges>
module_va_ranges_ ABSL_GUARDED_BY(va_ranges_mutex_);

// Circular VMM pool: pre-allocated slots with permanent VA mappings and
// GPU timeline signaling for safe slot reuse. ROCm-only.
absl::Status ExecuteThunksWithCircularVmmPool(
const BufferAllocations& buffer_allocations,
const ServiceExecutableRunOptions* run_options,
stream_executor::StreamExecutor* executor, int64_t unique_id,
Thunk::ExecutableSource executable_source, bool block_host_until_done);

struct CircularPoolState {
std::shared_ptr<void> pool;
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit: std::shared_ptr<void> for type erasure is fragile

This works correctly because unique_ptr<CircularVmmPool> converting to shared_ptr<void> captures the right deleter. However, it sacrifices type safety and requires static_cast<CircularVmmPool*> at every use site.

Since the header already has a #if TENSORFLOW_USE_ROCM include for CircularVmmPool in the .cc file, consider forward-declaring CircularVmmPool here and using std::unique_ptr<se::gpu::CircularVmmPool> directly (with a custom deleter or forward-declared destructor). This eliminates the type-erased casts.

uint64_t iteration_count = 0;
// Track last-seen BFC addresses to skip redundant D2D memcpy.
absl::flat_hash_map<BufferAllocation::Index, void*> last_param_addrs;
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Unused field: last_param_addrs is never read or written

This map is declared with a comment about skipping redundant D2D memcpy, but it's never used anywhere in the implementation. Either remove it or implement the optimization it's intended for.

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Resolved -- addressed in this revision. The last_param_addrs field has been removed.

};
absl::Mutex circular_pool_mutex_;
absl::node_hash_map<stream_executor::StreamExecutor*, CircularPoolState>
circular_pools_ ABSL_GUARDED_BY(circular_pool_mutex_);

GpuExecutable(const GpuExecutable&) = delete;
GpuExecutable& operator=(const GpuExecutable&) = delete;

Expand Down
7 changes: 5 additions & 2 deletions xla/service/gpu/thunk_emitter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -399,8 +399,11 @@ absl::StatusOr<ThunkSequence> ThunkEmitter::EmitCommandBufferThunk(

bool enable_loop_unroll = ir_emitter_context_->debug_options()
.xla_gpu_command_buffer_unroll_loops();
bool enable_va_remapping = ir_emitter_context_->debug_options()
.xla_gpu_enable_command_buffer_va_remapping();
bool enable_va_remapping =
ir_emitter_context_->debug_options()
.xla_gpu_enable_command_buffer_va_remapping() ||
ir_emitter_context_->debug_options()
.xla_gpu_enable_circular_vmm_pool();
TF_ASSIGN_OR_RETURN(
CommandExecutor cmd_executor,
ConvertToCommands(
Expand Down
Loading
Loading