diff --git a/nvbench_helper/nvbench_helper/nvbench_helper.cuh b/nvbench_helper/nvbench_helper/nvbench_helper.cuh
index 344c261bf8c..9a54355754e 100644
--- a/nvbench_helper/nvbench_helper/nvbench_helper.cuh
+++ b/nvbench_helper/nvbench_helper/nvbench_helper.cuh
@@ -47,8 +47,8 @@ NVBENCH_DECLARE_TYPE_STRINGS(complex64, "C64", "complex64");
 
 NVBENCH_DECLARE_TYPE_STRINGS(::cuda::std::false_type, "false", "false_type");
 NVBENCH_DECLARE_TYPE_STRINGS(::cuda::std::true_type, "true", "true_type");
-NVBENCH_DECLARE_TYPE_STRINGS(cub::ArgMin, "ArgMin", "cub::ArgMin");
-NVBENCH_DECLARE_TYPE_STRINGS(cub::ArgMax, "ArgMax", "cub::ArgMax");
+NVBENCH_DECLARE_TYPE_STRINGS(cub::detail::arg_min, "arg_min", "cub::detail::arg_min");
+NVBENCH_DECLARE_TYPE_STRINGS(cub::detail::arg_max, "arg_max", "cub::detail::arg_max");
 
 template <typename T, T I>
 struct nvbench::type_strings<::cuda::std::integral_constant<T, I>>
diff --git a/thrust/benchmarks/bench/extrema/basic.cu b/thrust/benchmarks/bench/extrema/basic.cu
new file mode 100644
index 00000000000..3bae59603d2
--- /dev/null
+++ b/thrust/benchmarks/bench/extrema/basic.cu
@@ -0,0 +1,67 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include <thrust/device_vector.h>
+#include <thrust/execution_policy.h>
+#include <thrust/extrema.h>
+
+#include "nvbench_helper.cuh"
+
+// Shared driver: times `func(policy, first, last)` on a generated device vector holding "Elements" values of T.
+template <typename T, typename Func>
+static void bench_extremum(nvbench::state& state, nvbench::type_list<T>, Func func)
+{
+  const auto num_items = static_cast<std::size_t>(state.get_int64("Elements"));
+  thrust::device_vector<T> input = generate(num_items);
+  using offset_t = typename decltype(input.cbegin())::difference_type;
+
+  // Throughput accounting: every input element is read, and a single offset is recorded as written.
+  state.add_element_count(num_items);
+  state.add_global_memory_reads<T>(num_items);
+  state.add_global_memory_writes<offset_t>(1);
+
+  caching_allocator_t alloc;
+  const auto tags = nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch | nvbench::exec_tag::sync;
+  state.exec(tags, [&](nvbench::launch& launch) {
+    do_not_optimize(func(policy(alloc, launch), input.cbegin(), input.cend()));
+  });
+}
+
+// Benchmarks thrust::min_element for value type T through the shared bench_extremum driver.
+template <typename T>
+static void min_element(nvbench::state& state, nvbench::type_list<T> list)
+{
+  auto find_min = [](auto&&... args) { return thrust::min_element(args...); };
+  bench_extremum(state, list, find_min);
+}
+
+NVBENCH_BENCH_TYPES(min_element, NVBENCH_TYPE_AXES(fundamental_types))
+  .set_name("min_element")
+  .set_type_axes_names({"T{ct}"})
+  .add_int64_power_of_two_axis("Elements", nvbench::range(16, 28, 4));
+
+// Benchmarks thrust::max_element for value type T through the shared bench_extremum driver.
+template <typename T>
+static void max_element(nvbench::state& state, nvbench::type_list<T> list)
+{
+  auto find_max = [](auto&&... args) { return thrust::max_element(args...); };
+  bench_extremum(state, list, find_max);
+}
+
+NVBENCH_BENCH_TYPES(max_element, NVBENCH_TYPE_AXES(fundamental_types))
+  .set_name("max_element")
+  .set_type_axes_names({"T{ct}"})
+  .add_int64_power_of_two_axis("Elements", nvbench::range(16, 28, 4));
+
+// Benchmarks thrust::minmax_element for value type T through the shared bench_extremum driver.
+template <typename T>
+static void minmax_element(nvbench::state& state, nvbench::type_list<T> list)
+{
+  auto find_minmax = [](auto&&... args) { return thrust::minmax_element(args...); };
+  bench_extremum(state, list, find_minmax);
+}
+
+NVBENCH_BENCH_TYPES(minmax_element, NVBENCH_TYPE_AXES(fundamental_types))
+  .set_name("minmax_element")
+  .set_type_axes_names({"T{ct}"})
+  .add_int64_power_of_two_axis("Elements", nvbench::range(16, 28, 4));
diff --git a/thrust/thrust/system/cuda/detail/extrema.h b/thrust/thrust/system/cuda/detail/extrema.h
index d2d9f69dc80..252ef6a29dc 100644
--- a/thrust/thrust/system/cuda/detail/extrema.h
+++ b/thrust/thrust/system/cuda/detail/extrema.h
@@ -17,15 +17,12 @@
 
 #  include <thrust/system/cuda/config.h>
 
-#  include <cub/util_math.cuh>
-
 #  include <thrust/detail/temporary_array.h>
 #  include <thrust/extrema.h>
-#  include <thrust/iterator/counting_iterator.h>
-#  include <thrust/iterator/transform_iterator.h>
 #  include <thrust/system/cuda/detail/cdp_dispatch.h>
-#  include <thrust/system/cuda/detail/reduce.h>
 
+#  include <cuda/__iterator/counting_iterator.h>
+#  include <cuda/__iterator/zip_iterator.h>
 #  include <cuda/std/__functional/operations.h>
 #  include <cuda/std/__iterator/distance.h>
 #  include <cuda/std/__utility/pair.h>
@@ -36,416 +33,181 @@ namespace cuda_cub
 {
 namespace __extrema
 {
-template <class InputType, class IndexType, class Predicate>
-struct arg_min_f
-{
-  Predicate predicate;
-  using pair_type = ::cuda::std::tuple<InputType, IndexType>;
-
-  _CCCL_HOST_DEVICE arg_min_f(Predicate p)
-      : predicate(p)
-  {}
-
-  pair_type _CCCL_DEVICE operator()(pair_type const& lhs, pair_type const& rhs)
-  {
-    InputType const& rhs_value = ::cuda::std::get<0>(rhs);
-    InputType const& lhs_value = ::cuda::std::get<0>(lhs);
-    IndexType const& rhs_key   = ::cuda::std::get<1>(rhs);
-    IndexType const& lhs_key   = ::cuda::std::get<1>(lhs);
-
-    // check values first
-    if (predicate(lhs_value, rhs_value))
-    {
-      return lhs;
-    }
-    else if (predicate(rhs_value, lhs_value))
-    {
-      return rhs;
-    }
-
-    // values are equivalent, prefer smaller index
-    if (lhs_key < rhs_key)
-    {
-      return lhs;
-    }
-    else
-    {
-      return rhs;
-    }
-  }
-}; // struct arg_min_f
-
-template <class InputType, class IndexType, class Predicate>
-struct arg_max_f
+template <class Derived, class ItemsIt, class BinaryPred>
+ItemsIt CUB_RUNTIME_FUNCTION
+cub_min_element(execution_policy<Derived>& policy, ItemsIt first, ItemsIt last, BinaryPred binary_pred)
 {
-  Predicate predicate;
-  using pair_type = ::cuda::std::tuple<InputType, IndexType>;
+  cudaStream_t stream      = cuda_cub::stream(policy);
+  using offset_t           = thrust::detail::it_difference_t<ItemsIt>;
+  const offset_t num_items = ::cuda::std::distance(first, last);
 
-  _CCCL_HOST_DEVICE arg_max_f(Predicate p)
-      : predicate(p)
-  {}
-
-  pair_type _CCCL_DEVICE operator()(pair_type const& lhs, pair_type const& rhs)
+  if (num_items == 0)
   {
-    InputType const& rhs_value = ::cuda::std::get<0>(rhs);
-    InputType const& lhs_value = ::cuda::std::get<0>(lhs);
-    IndexType const& rhs_key   = ::cuda::std::get<1>(rhs);
-    IndexType const& lhs_key   = ::cuda::std::get<1>(lhs);
-
-    // check values first
-    if (predicate(lhs_value, rhs_value))
-    {
-      return rhs;
-    }
-    else if (predicate(rhs_value, lhs_value))
-    {
-      return lhs;
-    }
-
-    // values are equivalent, prefer smaller index
-    if (lhs_key < rhs_key)
-    {
-      return lhs;
-    }
-    else
-    {
-      return rhs;
-    }
+    return last;
   }
-}; // struct arg_max_f
-
-template <class InputType, class IndexType, class Predicate>
-struct arg_minmax_f
-{
-  Predicate predicate;
 
-  using pair_type      = ::cuda::std::tuple<InputType, IndexType>;
-  using two_pairs_type = ::cuda::std::tuple<pair_type, pair_type>;
-
-  using arg_min_t = arg_min_f<InputType, IndexType, Predicate>;
-  using arg_max_t = arg_max_f<InputType, IndexType, Predicate>;
+  size_t tmp_size = 0;
+  auto error      = cub::DeviceReduce::ArgMin(
+    nullptr,
+    tmp_size,
+    first,
+    ::cuda::discard_iterator{},
+    static_cast<offset_t*>(nullptr),
+    num_items,
+    binary_pred,
+    stream);
+  throw_on_error(error, "min_element failed to allocate temporary storages");
 
-  _CCCL_HOST_DEVICE arg_minmax_f(Predicate p)
-      : predicate(p)
-  {}
+  // One allocation holds the result `offset_t` at the front, followed by the CUB temporary storage.
+  thrust::detail::temporary_array<char, Derived> tmp(policy, sizeof(offset_t) + tmp_size);
+  offset_t* index_ptr = thrust::detail::aligned_reinterpret_cast<offset_t*>(tmp.data().get());
+  void* tmp_ptr       = static_cast<void*>(tmp.data().get() + sizeof(offset_t));
 
-  two_pairs_type _CCCL_DEVICE operator()(two_pairs_type const& lhs, two_pairs_type const& rhs)
-  {
-    pair_type const& rhs_min = ::cuda::std::get<0>(rhs);
-    pair_type const& lhs_min = ::cuda::std::get<0>(lhs);
-    pair_type const& rhs_max = ::cuda::std::get<1>(rhs);
-    pair_type const& lhs_max = ::cuda::std::get<1>(lhs);
+  error = cub::DeviceReduce::ArgMin(
+    tmp_ptr, tmp_size, first, ::cuda::discard_iterator{}, index_ptr, num_items, binary_pred, stream);
+  cuda_cub::throw_on_error(error, "min_element failed to launch cub::DeviceReduce::ArgMin");
 
-    auto result =
-      ::cuda::std::make_tuple(arg_min_t(predicate)(lhs_min, rhs_min), arg_max_t(predicate)(lhs_max, rhs_max));
+  cuda_cub::throw_on_error(cuda_cub::synchronize(policy), "min_element failed to synchronize");
 
-    return result;
-  }
+  return first + get_value(policy, index_ptr);
+}
 
-  struct duplicate_tuple
-  {
-    _CCCL_DEVICE two_pairs_type operator()(pair_type const& t)
-    {
-      return ::cuda::std::make_tuple(t, t);
-    }
-  };
-}; // struct arg_minmax_f
-
-template <class T, class InputIt, class OutputIt, class Size, class ReductionOp>
-cudaError_t THRUST_RUNTIME_FUNCTION doit_step(
-  void* d_temp_storage,
-  size_t& temp_storage_bytes,
-  InputIt input_it,
-  Size num_items,
-  ReductionOp reduction_op,
-  OutputIt output_it,
-  cudaStream_t stream)
+template <typename OffsetT, typename T>
+struct minmax_accum_t
 {
-  using core::detail::AgentLauncher;
-  using core::detail::AgentPlan;
-  using core::detail::cuda_optional;
-  using core::detail::get_agent_plan;
+  ::cuda::std::pair<OffsetT, T> min, max;
+};
 
-  using UnsignedSize = typename detail::make_unsigned_special<Size>::type;
-
-  if (num_items == 0)
+template <typename OffsetT, typename T>
+struct minmax_load_transformation
+{
+  // Converts one dereferenced zip_iterator element (offset, value) into an accumulator with min == max.
+  template <typename TRef>
+  _CCCL_HOST_DEVICE _CCCL_FORCEINLINE auto operator()(::cuda::std::tuple<OffsetT, TRef> input) const
+    -> minmax_accum_t<OffsetT, T>
   {
-    return cudaErrorNotSupported;
+    auto p = ::cuda::std::pair<OffsetT, T>{::cuda::std::get<0>(input), ::cuda::std::get<1>(input)};
+    return {p, p};
   }
+};
 
-  using reduce_agent = AgentLauncher<__reduce::ReduceAgent<InputIt, OutputIt, T, Size, ReductionOp>>;
+template <typename OffsetT>
+struct output_t
+{
+  OffsetT min_offset;
+  OffsetT max_offset;
 
-  typename reduce_agent::Plan reduce_plan = reduce_agent::get_plan(stream);
+  output_t() = default;
 
-  cudaError_t status = cudaSuccess;
+  // convert from accumulator type (during assignment at the end of the kernel)
+  template <typename T>
+  _CCCL_API _CCCL_FORCEINLINE output_t(minmax_accum_t<OffsetT, T> result)
+      : min_offset(result.min.first)
+      , max_offset(result.max.first)
+  {}
+};
 
-  if (num_items <= reduce_plan.items_per_tile)
-  {
-    size_t vshmem_size = core::detail::vshmem_size(reduce_plan.shared_memory_size, 1);
-
-    // small, single tile size
-    if (d_temp_storage == nullptr)
-    {
-      temp_storage_bytes = max<size_t>(1, vshmem_size);
-      return status;
-    }
-    char* vshmem_ptr = vshmem_size > 0 ? (char*) d_temp_storage : nullptr;
-
-    reduce_agent ra(reduce_plan, num_items, stream, vshmem_ptr, "reduce_agent: single_tile only");
-    ra.launch(input_it, output_it, num_items, reduction_op);
-    _CUDA_CUB_RET_IF_FAIL(cudaPeekAtLastError());
-  }
-  else
+template <typename OffsetT, typename T, typename ValueLessThen = ::cuda::std::less<>>
+struct minmax_reduce_op : ValueLessThen
+{
+  _CCCL_HOST_DEVICE _CCCL_FORCEINLINE auto
+  operator()(const minmax_accum_t<OffsetT, T>& a, const minmax_accum_t<OffsetT, T>& b) const
+    -> minmax_accum_t<OffsetT, T>
   {
-    // regular size
-    cuda_optional<int> sm_count = core::detail::get_sm_count();
-    _CUDA_CUB_RET_IF_FAIL(sm_count.status());
-
-    // reduction will not use more cta counts than requested
-    cuda_optional<int> max_blocks_per_sm = reduce_agent::template get_max_blocks_per_sm<
-      InputIt,
-      OutputIt,
-      Size,
-      cub::GridEvenShare<Size>,
-      cub::GridQueue<UnsignedSize>,
-      ReductionOp>(reduce_plan);
-    _CUDA_CUB_RET_IF_FAIL(max_blocks_per_sm.status());
-
-    int reduce_device_occupancy = (int) max_blocks_per_sm * sm_count;
-
-    int sm_oversubscription = 5;
-    int max_blocks          = reduce_device_occupancy * sm_oversubscription;
-
-    cub::GridEvenShare<Size> even_share;
-    even_share.DispatchInit(num_items, max_blocks, reduce_plan.items_per_tile);
-
-    // we will launch at most "max_blocks" blocks in a grid
-    // so preallocate virtual shared memory storage for this if required
-    //
-    size_t vshmem_size = core::detail::vshmem_size(reduce_plan.shared_memory_size, max_blocks);
-
-    // Temporary storage allocation requirements
-    void* allocations[3]       = {nullptr, nullptr, nullptr};
-    size_t allocation_sizes[3] = {
-      max_blocks * sizeof(T), // bytes needed for privatized block reductions
-      cub::GridQueue<UnsignedSize>::AllocationSize(), // bytes needed for grid queue descriptor0
-      vshmem_size // size of virtualized shared memory storage
-    };
-    status = cub::detail::alias_temporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes);
-    _CUDA_CUB_RET_IF_FAIL(status);
-    if (d_temp_storage == nullptr)
-    {
-      return status;
-    }
-
-    T* d_block_reductions = (T*) allocations[0];
-    cub::GridQueue<UnsignedSize> queue(allocations[1]);
-    char* vshmem_ptr = vshmem_size > 0 ? (char*) allocations[2] : nullptr;
-
-    // Get grid size for device_reduce_sweep_kernel
-    int reduce_grid_size = 0;
-    if (reduce_plan.grid_mapping == cub::GRID_MAPPING_RAKE)
-    {
-      // Work is distributed evenly
-      reduce_grid_size = even_share.grid_size;
-    }
-    else if (reduce_plan.grid_mapping == cub::GRID_MAPPING_DYNAMIC)
-    {
-      // Work is distributed dynamically
-      size_t num_tiles = ::cuda::ceil_div(num_items, reduce_plan.items_per_tile);
-
-      // if not enough to fill the device with threadblocks
-      // then fill the device with threadblocks
-      reduce_grid_size = static_cast<int>((min) (num_tiles, static_cast<size_t>(reduce_device_occupancy)));
-
-      using drain_agent    = AgentLauncher<__reduce::DrainAgent<Size>>;
-      AgentPlan drain_plan = drain_agent::get_plan();
-      drain_plan.grid_size = 1;
-      drain_agent da(drain_plan, stream, "__reduce::drain_agent");
-      da.launch(queue, num_items);
-      _CUDA_CUB_RET_IF_FAIL(cudaPeekAtLastError());
-    }
-    else
-    {
-      _CUDA_CUB_RET_IF_FAIL(cudaErrorNotSupported);
-    }
-
-    reduce_plan.grid_size = reduce_grid_size;
-    reduce_agent ra(reduce_plan, stream, vshmem_ptr, "reduce_agent: regular size reduce");
-    ra.launch(input_it, d_block_reductions, num_items, even_share, queue, reduction_op);
-    _CUDA_CUB_RET_IF_FAIL(cudaPeekAtLastError());
-
-    using reduce_agent_single = AgentLauncher<__reduce::ReduceAgent<T*, OutputIt, T, Size, ReductionOp>>;
-
-    reduce_plan.grid_size = 1;
-    reduce_agent_single ra1(reduce_plan, stream, vshmem_ptr, "reduce_agent: single tile reduce");
-
-    ra1.launch(d_block_reductions, output_it, reduce_grid_size, reduction_op);
-    _CUDA_CUB_RET_IF_FAIL(cudaPeekAtLastError());
+    const auto& less = static_cast<const ValueLessThen&>(*this);
+    const auto min   = cub::detail::arg_less<ValueLessThen>{less}(a.min, b.min);
+    const auto max   = cub::detail::arg_less<cub::detail::swap_args<ValueLessThen>>{less}(a.max, b.max);
+    return {min, max};
   }
 
-  return status;
-} // func doit_step
+  // needed for __accumulator_t, never called at runtime
+  _CCCL_HOST_DEVICE _CCCL_FORCEINLINE auto operator()(
+    const cub::detail::reduce::empty_problem_init_t<output_t<OffsetT>>&, const minmax_accum_t<OffsetT, T>&) const
+    -> minmax_accum_t<OffsetT, T>;
+};
 
-// this is an init-less reduce, needed for min/max-element functionality
-// this will avoid copying the first value from device->host
-template <typename Derived, typename InputIt, typename Size, typename BinaryOp, typename T>
-THRUST_RUNTIME_FUNCTION T
-extrema(execution_policy<Derived>& policy, InputIt first, Size num_items, BinaryOp binary_op, T*)
+template <class Derived, class ItemsIt, class BinaryPred>
+::cuda::std::pair<ItemsIt, ItemsIt> CUB_RUNTIME_FUNCTION
+cub_minmax_element(execution_policy<Derived>& policy, ItemsIt first, ItemsIt last, BinaryPred binary_pred)
 {
-  size_t temp_storage_bytes = 0;
-  cudaStream_t stream       = cuda_cub::stream(policy);
-
-  cudaError_t status;
-  THRUST_INDEX_TYPE_DISPATCH(
-    status,
-    doit_step<T>,
-    num_items,
-    (nullptr, temp_storage_bytes, first, num_items_fixed, binary_op, static_cast<T*>(nullptr), stream));
-  cuda_cub::throw_on_error(status, "extrema failed on 1st step");
-
-  size_t allocation_sizes[2] = {sizeof(T*), temp_storage_bytes};
-  void* allocations[2]       = {nullptr, nullptr};
-
-  size_t storage_size = 0;
-  status              = core::detail::alias_storage(nullptr, storage_size, allocations, allocation_sizes);
-  cuda_cub::throw_on_error(status, "extrema failed on 1st alias storage");
-
-  // Allocate temporary storage.
-  thrust::detail::temporary_array<std::uint8_t, Derived> tmp(policy, storage_size);
-  void* ptr = static_cast<void*>(tmp.data().get());
-
-  status = core::detail::alias_storage(ptr, storage_size, allocations, allocation_sizes);
-  cuda_cub::throw_on_error(status, "extrema failed on 2nd alias storage");
+  cudaStream_t stream      = cuda_cub::stream(policy);
+  using offset_t           = thrust::detail::it_difference_t<ItemsIt>;
+  const offset_t num_items = ::cuda::std::distance(first, last);
 
-  T* d_result = thrust::detail::aligned_reinterpret_cast<T*>(allocations[0]);
-
-  THRUST_INDEX_TYPE_DISPATCH(
-    status,
-    doit_step<T>,
-    num_items,
-    (allocations[1], temp_storage_bytes, first, num_items_fixed, binary_op, d_result, stream));
-  cuda_cub::throw_on_error(status, "extrema failed on 2nd step");
-
-  status = cuda_cub::synchronize(policy);
-  cuda_cub::throw_on_error(status, "extrema failed to synchronize");
-
-  T result = cuda_cub::get_value(policy, d_result);
-
-  return result;
-}
-
-template <template <class, class, class> class ArgFunctor, class Derived, class ItemsIt, class BinaryPred>
-ItemsIt THRUST_RUNTIME_FUNCTION
-element(execution_policy<Derived>& policy, ItemsIt first, ItemsIt last, BinaryPred binary_pred)
-{
-  if (first == last)
+  if (num_items == 0)
   {
-    return last;
+    return {first, first};
   }
 
-  using InputType = thrust::detail::it_value_t<ItemsIt>;
-  using IndexType = thrust::detail::it_difference_t<ItemsIt>;
-
-  IndexType num_items = static_cast<IndexType>(::cuda::std::distance(first, last));
-
-  using iterator_tuple = ::cuda::std::tuple<ItemsIt, counting_iterator<IndexType>>;
-  using zip_iterator   = thrust::zip_iterator<iterator_tuple>;
+  using input_t = thrust::detail::it_value_t<ItemsIt>;
+  auto indexed_first =
+    ::cuda::make_zip_iterator(::cuda::counting_iterator<offset_t>(0), thrust::try_unwrap_contiguous_iterator(first));
+  auto reduction_op = minmax_reduce_op<offset_t, input_t, BinaryPred>{binary_pred};
+  auto transform_op = minmax_load_transformation<offset_t, input_t>{};
+  using output_t    = output_t<offset_t>;
+  const auto init   = cub::detail::reduce::empty_problem_init_t<output_t>{};
+
+  size_t tmp_size = 0;
+  auto error      = cub::DeviceReduce::TransformReduce(
+    nullptr,
+    tmp_size,
+    indexed_first,
+    static_cast<output_t*>(nullptr),
+    num_items,
+    reduction_op,
+    transform_op,
+    init,
+    stream);
+  throw_on_error(error, "minmax_element failed to allocate temporary storages");
 
-  iterator_tuple iter_tuple = ::cuda::std::make_tuple(first, counting_iterator<IndexType>(0));
+  // One allocation holds the result `output_t` at the front, followed by the CUB temporary storage.
+  thrust::detail::temporary_array<char, Derived> tmp(policy, sizeof(output_t) + tmp_size);
+  output_t* out_ptr = thrust::detail::aligned_reinterpret_cast<output_t*>(tmp.data().get());
+  void* tmp_ptr     = static_cast<void*>(tmp.data().get() + sizeof(output_t));
 
-  using arg_min_t = ArgFunctor<InputType, IndexType, BinaryPred>;
-  using T         = ::cuda::std::tuple<InputType, IndexType>;
+  error = cub::DeviceReduce::TransformReduce(
+    tmp_ptr, tmp_size, indexed_first, out_ptr, num_items, reduction_op, transform_op, init, stream);
+  cuda_cub::throw_on_error(error, "minmax_element failed to launch cub::DeviceReduce::TransformReduce");
 
-  zip_iterator begin = thrust::make_zip_iterator(iter_tuple);
+  cuda_cub::throw_on_error(cuda_cub::synchronize(policy), "minmax_element failed to synchronize");
 
-  T result = extrema(policy, begin, num_items, arg_min_t(binary_pred), (T*) (nullptr));
-  return first + ::cuda::std::get<1>(result);
+  const auto [min_offset, max_offset] = get_value(policy, out_ptr);
+  return {first + min_offset, first + max_offset};
 }
 } // namespace __extrema
 
 /// min element
 
 _CCCL_EXEC_CHECK_DISABLE
-template <class Derived, class ItemsIt, class BinaryPred>
+template <class Derived, class ItemsIt, class BinaryPred = ::cuda::std::less<thrust::detail::it_value_t<ItemsIt>>>
 ItemsIt _CCCL_HOST_DEVICE
-min_element(execution_policy<Derived>& policy, ItemsIt first, ItemsIt last, BinaryPred binary_pred)
+min_element(execution_policy<Derived>& policy, ItemsIt first, ItemsIt last, BinaryPred binary_pred = {})
 {
-  THRUST_CDP_DISPATCH((last = __extrema::element<__extrema::arg_min_f>(policy, first, last, binary_pred);),
-                      (last = thrust::min_element(cvt_to_seq(derived_cast(policy)), first, last, binary_pred);));
-  return last;
-}
-
-template <class Derived, class ItemsIt>
-ItemsIt _CCCL_HOST_DEVICE min_element(execution_policy<Derived>& policy, ItemsIt first, ItemsIt last)
-{
-  using value_type = thrust::detail::it_value_t<ItemsIt>;
-  return cuda_cub::min_element(policy, first, last, ::cuda::std::less<value_type>());
+  THRUST_CDP_DISPATCH(({ return __extrema::cub_min_element(policy, first, last, binary_pred); }),
+                      ({ return thrust::min_element(cvt_to_seq(derived_cast(policy)), first, last, binary_pred); }));
 }
 
 /// max element
 
 _CCCL_EXEC_CHECK_DISABLE
-template <class Derived, class ItemsIt, class BinaryPred>
+template <class Derived, class ItemsIt, class BinaryPred = ::cuda::std::less<thrust::detail::it_value_t<ItemsIt>>>
 ItemsIt _CCCL_HOST_DEVICE
-max_element(execution_policy<Derived>& policy, ItemsIt first, ItemsIt last, BinaryPred binary_pred)
+max_element(execution_policy<Derived>& policy, ItemsIt first, ItemsIt last, BinaryPred binary_pred = {})
 {
-  THRUST_CDP_DISPATCH((last = __extrema::element<__extrema::arg_max_f>(policy, first, last, binary_pred);),
-                      (last = thrust::max_element(cvt_to_seq(derived_cast(policy)), first, last, binary_pred);));
-  return last;
-}
-
-template <class Derived, class ItemsIt>
-ItemsIt _CCCL_HOST_DEVICE max_element(execution_policy<Derived>& policy, ItemsIt first, ItemsIt last)
-{
-  using value_type = thrust::detail::it_value_t<ItemsIt>;
-  return cuda_cub::max_element(policy, first, last, ::cuda::std::less<value_type>());
+  THRUST_CDP_DISPATCH(
+    ({ return __extrema::cub_min_element(policy, first, last, cub::detail::swap_args{binary_pred}); }),
+    ({ return thrust::max_element(cvt_to_seq(derived_cast(policy)), first, last, binary_pred); }));
 }
 
 /// minmax element
 
 _CCCL_EXEC_CHECK_DISABLE
-template <class Derived, class ItemsIt, class BinaryPred>
-::cuda::std::pair<ItemsIt, ItemsIt> _CCCL_HOST_DEVICE
-minmax_element(execution_policy<Derived>& policy, ItemsIt first, ItemsIt last, BinaryPred binary_pred)
-{
-  auto ret = ::cuda::std::make_pair(last, last);
-  if (first == last)
-  {
-    return ret;
-  }
-
-  THRUST_CDP_DISPATCH(
-    (using InputType = thrust::detail::it_value_t<ItemsIt>; using IndexType = thrust::detail::it_difference_t<ItemsIt>;
-
-     const auto num_items = static_cast<IndexType>(::cuda::std::distance(first, last));
-
-     using iterator_tuple = ::cuda::std::tuple<ItemsIt, counting_iterator<IndexType>>;
-     using zip_iterator   = thrust::zip_iterator<iterator_tuple>;
-
-     iterator_tuple iter_tuple = ::cuda::std::make_tuple(first, counting_iterator<IndexType>(0));
-
-     using arg_minmax_t   = __extrema::arg_minmax_f<InputType, IndexType, BinaryPred>;
-     using two_pairs_type = typename arg_minmax_t::two_pairs_type;
-     using duplicate_t    = typename arg_minmax_t::duplicate_tuple;
-     using transform_t    = thrust::transform_iterator<duplicate_t, zip_iterator, two_pairs_type, two_pairs_type>;
-
-     zip_iterator begin    = thrust::make_zip_iterator(iter_tuple);
-     two_pairs_type result = __extrema::extrema(
-       policy, transform_t(begin, duplicate_t()), num_items, arg_minmax_t(binary_pred), (two_pairs_type*) (nullptr));
-     ret = ::cuda::std::make_pair(first + ::cuda::std::get<1>(::cuda::std::get<0>(result)),
-                                  first + ::cuda::std::get<1>(::cuda::std::get<1>(result)));),
-    // CDP Sequential impl:
-    (ret = thrust::minmax_element(cvt_to_seq(derived_cast(policy)), first, last, binary_pred);));
-  return ret;
-}
-
-template <class Derived, class ItemsIt>
+template <class Derived, class ItemsIt, class BinaryPred = ::cuda::std::less<thrust::detail::it_value_t<ItemsIt>>>
 ::cuda::std::pair<ItemsIt, ItemsIt> _CCCL_HOST_DEVICE
-minmax_element(execution_policy<Derived>& policy, ItemsIt first, ItemsIt last)
+minmax_element(execution_policy<Derived>& policy, ItemsIt first, ItemsIt last, BinaryPred binary_pred = {})
 {
-  using value_type = thrust::detail::it_value_t<ItemsIt>;
-  return cuda_cub::minmax_element(policy, first, last, ::cuda::std::less<value_type>());
+  THRUST_CDP_DISPATCH(({ return __extrema::cub_minmax_element(policy, first, last, binary_pred); }),
+                      ({ return thrust::minmax_element(cvt_to_seq(derived_cast(policy)), first, last, binary_pred); }));
 }
 } // namespace cuda_cub
 THRUST_NAMESPACE_END
diff --git a/thrust/thrust/system/cuda/detail/reduce.h b/thrust/thrust/system/cuda/detail/reduce.h
index b8d326e27ca..c2b0a4f11af 100644
--- a/thrust/thrust/system/cuda/detail/reduce.h
+++ b/thrust/thrust/system/cuda/detail/reduce.h
@@ -18,29 +18,16 @@
 #  include <thrust/system/cuda/config.h>
 
 #  include <cub/device/device_reduce.cuh>
-#  include <cub/grid/grid_queue.cuh>
-#  include <cub/iterator/cache_modified_input_iterator.cuh>
-#  include <cub/util_math.cuh>
 
 #  include <thrust/detail/alignment.h>
-#  include <thrust/detail/raw_reference_cast.h>
 #  include <thrust/detail/temporary_array.h>
-#  include <thrust/functional.h>
 #  include <thrust/system/cuda/detail/cdp_dispatch.h>
-#  include <thrust/system/cuda/detail/core/agent_launcher.h>
-#  include <thrust/system/cuda/detail/dispatch.h>
 #  include <thrust/system/cuda/detail/execution_policy.h>
 #  include <thrust/system/cuda/detail/get_value.h>
-#  include <thrust/system/cuda/detail/make_unsigned_special.h>
 #  include <thrust/system/cuda/detail/util.h>
 
 #  include <cuda/std/__functional/operations.h>
 #  include <cuda/std/__iterator/distance.h>
-#  include <cuda/std/__memory/is_sufficiently_aligned.h>
-#  include <cuda/std/__type_traits/conditional.h>
-#  include <cuda/std/__type_traits/is_arithmetic.h>
-#  include <cuda/std/__type_traits/is_pointer.h>
-#  include <cuda/std/__type_traits/remove_cv.h>
 #  include <cuda/std/cstdint>
 
 THRUST_NAMESPACE_BEGIN
@@ -65,518 +52,6 @@ void _CCCL_HOST_DEVICE reduce_into(
 
 namespace cuda_cub
 {
-namespace __reduce
-{
-template <bool>
-struct is_true : thrust::detail::false_type
-{};
-template <>
-struct is_true<true> : thrust::detail::true_type
-{};
-
-template <int _BLOCK_THREADS,
-          int _ITEMS_PER_THREAD                      = 1,
-          int _VECTOR_LOAD_LENGTH                    = 1,
-          cub::BlockReduceAlgorithm _BLOCK_ALGORITHM = cub::BLOCK_REDUCE_RAKING,
-          cub::CacheLoadModifier _LOAD_MODIFIER      = cub::LOAD_DEFAULT,
-          cub::GridMappingStrategy _GRID_MAPPING     = cub::GRID_MAPPING_DYNAMIC>
-struct PtxPolicy
-{
-  static constexpr int BLOCK_THREADS      = _BLOCK_THREADS;
-  static constexpr int ITEMS_PER_THREAD   = _ITEMS_PER_THREAD;
-  static constexpr int VECTOR_LOAD_LENGTH = _VECTOR_LOAD_LENGTH;
-  static constexpr int ITEMS_PER_TILE     = _BLOCK_THREADS * _ITEMS_PER_THREAD;
-
-  static const cub::BlockReduceAlgorithm BLOCK_ALGORITHM = _BLOCK_ALGORITHM;
-  static const cub::CacheLoadModifier LOAD_MODIFIER      = _LOAD_MODIFIER;
-  static const cub::GridMappingStrategy GRID_MAPPING     = _GRID_MAPPING;
-}; // struct PtxPolicy
-
-template <class, class>
-struct Tuning;
-
-template <class T>
-struct Tuning<core::detail::sm52, T>
-{
-  // Relative size of T type to a 4-byte word
-  static constexpr int SCALE_FACTOR_4B = (sizeof(T) + 3) / 4;
-  // Relative size of T type to a 1-byte word
-  static constexpr int SCALE_FACTOR_1B = sizeof(T);
-
-  // ReducePolicy1B (GTX Titan: 228.7 GB/s @ 192M 1B items)
-  using ReducePolicy1B =
-    PtxPolicy<128,
-              (((24 / Tuning::SCALE_FACTOR_1B) > (1)) ? (24 / Tuning::SCALE_FACTOR_1B) : (1)),
-              4,
-              cub::BLOCK_REDUCE_WARP_REDUCTIONS,
-              cub::LOAD_LDG,
-              cub::GRID_MAPPING_DYNAMIC>;
-
-  // ReducePolicy4B types (GTX Titan: 255.1 GB/s @ 48M 4B items)
-  using ReducePolicy4B =
-    PtxPolicy<256,
-              (((20 / Tuning::SCALE_FACTOR_4B) > (1)) ? (20 / Tuning::SCALE_FACTOR_4B) : (1)),
-              4,
-              cub::BLOCK_REDUCE_WARP_REDUCTIONS,
-              cub::LOAD_LDG,
-              cub::GRID_MAPPING_DYNAMIC>;
-
-  using type = ::cuda::std::conditional_t<(sizeof(T) < 4), ReducePolicy1B, ReducePolicy4B>;
-}; // Tuning sm52
-
-template <class InputIt, class OutputIt, class T, class Size, class ReductionOp>
-struct ReduceAgent
-{
-  using UnsignedSize = typename detail::make_unsigned_special<Size>::type;
-
-  template <class Arch>
-  struct PtxPlan : Tuning<Arch, T>::type
-  {
-    // we need this type definition to indicate "specialize_plan" metafunction
-    // that this PtxPlan may have specializations for different Arch
-    // via Tuning<Arch,T> type.
-    //
-    using tuning = Tuning<Arch, T>;
-
-    using Vector      = cub::CubVector<T, PtxPlan::VECTOR_LOAD_LENGTH>;
-    using LoadIt      = cub::detail::try_make_cache_modified_iterator_t<PtxPlan::LOAD_MODIFIER, InputIt>;
-    using BlockReduce = cub::BlockReduce<T, PtxPlan::BLOCK_THREADS, PtxPlan::BLOCK_ALGORITHM, 1, 1>;
-
-    using VectorLoadIt = cub::CacheModifiedInputIterator<PtxPlan::LOAD_MODIFIER, Vector, Size>;
-
-    struct TempStorage
-    {
-      typename BlockReduce::TempStorage reduce;
-      //
-      Size dequeue_offset;
-    }; // struct TempStorage
-
-  }; // struct PtxPlan
-
-  // Reduction need additional information which is not covered in
-  // default core::AgentPlan. We thus inherit from core::AgentPlan
-  // and add additional member fields that are needed.
-  // Other algorithms, e.g. merge, may not need additional information,
-  // and may use AgentPlan directly, instead of defining their own Plan type.
-  //
-  struct Plan : core::detail::AgentPlan
-  {
-    cub::GridMappingStrategy grid_mapping;
-
-    THRUST_RUNTIME_FUNCTION Plan() {}
-
-    template <class P>
-    THRUST_RUNTIME_FUNCTION Plan(P)
-        : core::detail::AgentPlan(P())
-        , grid_mapping(P::GRID_MAPPING)
-    {}
-  };
-
-  // this specialized PtxPlan for a device-compiled Arch
-  // ptx_plan type *must* only be used from device code
-  // Its use from host code will result in *undefined behaviour*
-  //
-  using ptx_plan = typename core::detail::specialize_plan_msvc10_war<PtxPlan>::type::type;
-
-  using TempStorage  = typename ptx_plan::TempStorage;
-  using Vector       = typename ptx_plan::Vector;
-  using LoadIt       = typename ptx_plan::LoadIt;
-  using BlockReduce  = typename ptx_plan::BlockReduce;
-  using VectorLoadIt = typename ptx_plan::VectorLoadIt;
-
-  static constexpr int ITEMS_PER_THREAD   = ptx_plan::ITEMS_PER_THREAD;
-  static constexpr int BLOCK_THREADS      = ptx_plan::BLOCK_THREADS;
-  static constexpr int ITEMS_PER_TILE     = ptx_plan::ITEMS_PER_TILE;
-  static constexpr int VECTOR_LOAD_LENGTH = ptx_plan::VECTOR_LOAD_LENGTH;
-
-  static constexpr bool ATTEMPT_VECTORIZATION =
-    (VECTOR_LOAD_LENGTH > 1) && (ITEMS_PER_THREAD % VECTOR_LOAD_LENGTH == 0)
-    && ::cuda::std::is_pointer_v<InputIt> && ::cuda::std::is_arithmetic_v<::cuda::std::remove_cv_t<T>>;
-
-  struct impl
-  {
-    //---------------------------------------------------------------------
-    // Per thread data
-    //---------------------------------------------------------------------
-
-    TempStorage& storage;
-    InputIt input_it;
-    LoadIt load_it;
-    ReductionOp reduction_op;
-
-    //---------------------------------------------------------------------
-    // Constructor
-    //---------------------------------------------------------------------
-
-    _CCCL_DEVICE_API _CCCL_FORCEINLINE impl(TempStorage& storage_, InputIt input_it_, ReductionOp reduction_op_)
-        : storage(storage_)
-        , input_it(input_it_)
-        , load_it(cub::detail::try_make_cache_modified_iterator<ptx_plan::LOAD_MODIFIER>(input_it))
-        , reduction_op(reduction_op_)
-    {}
-
-    //---------------------------------------------------------------------
-    // Utility
-    //---------------------------------------------------------------------
-
-    // Whether or not the input is aligned with the vector type
-    // (specialized for types we can vectorize)
-    //
-    template <class Iterator>
-    static _CCCL_DEVICE_API _CCCL_FORCEINLINE bool
-    is_aligned(Iterator d_in, thrust::detail::true_type /* can_vectorize */)
-    {
-      return ::cuda::std::is_sufficiently_aligned<alignof(Vector)>(d_in);
-    }
-
-    // Whether or not the input is aligned with the vector type
-    // (specialized for types we cannot vectorize)
-    //
-    template <class Iterator>
-    static _CCCL_DEVICE_API _CCCL_FORCEINLINE bool is_aligned(Iterator, thrust::detail::false_type /* can_vectorize */)
-    {
-      return false;
-    }
-
-    //---------------------------------------------------------------------
-    // Tile processing
-    //---------------------------------------------------------------------
-
-    // Consume a full tile of input (non-vectorized)
-    //
-    template <int IS_FIRST_TILE>
-    _CCCL_DEVICE_API _CCCL_FORCEINLINE void consume_tile(
-      T& thread_aggregate,
-      Size block_offset,
-      int /*valid_items*/,
-      thrust::detail::true_type /* is_full_tile */,
-      thrust::detail::false_type /* can_vectorize */)
-    {
-      T items[ITEMS_PER_THREAD];
-
-      // Load items in striped fashion
-      cub::LoadDirectStriped<BLOCK_THREADS>(threadIdx.x, load_it + block_offset, items);
-
-      // Reduce items within each thread stripe
-      thread_aggregate = (IS_FIRST_TILE) ? cub::ThreadReduce(items, reduction_op)
-                                         : cub::ThreadReduce(items, reduction_op, thread_aggregate);
-    }
-
-    // Consume a full tile of input (vectorized)
-    //
-    template <int IS_FIRST_TILE>
-    _CCCL_DEVICE_API _CCCL_FORCEINLINE void consume_tile(
-      T& thread_aggregate,
-      Size block_offset,
-      int /*valid_items*/,
-      thrust::detail::true_type /* is_full_tile */,
-      thrust::detail::true_type /* can_vectorize */)
-    {
-      // Alias items as an array of VectorT and load it in striped fashion
-      static constexpr int WORDS = ITEMS_PER_THREAD / VECTOR_LOAD_LENGTH;
-
-      T items[ITEMS_PER_THREAD];
-
-      Vector* vec_items = reinterpret_cast<Vector*>(items);
-
-      // Vector Input iterator wrapper type (for applying cache modifier)
-      T* d_in_unqualified = const_cast<T*>(input_it) + block_offset + (threadIdx.x * VECTOR_LOAD_LENGTH);
-      VectorLoadIt vec_load_it(reinterpret_cast<Vector*>(d_in_unqualified));
-
-      _CCCL_PRAGMA_UNROLL_FULL()
-      for (int i = 0; i < WORDS; ++i)
-      {
-        vec_items[i] = vec_load_it[BLOCK_THREADS * i];
-      }
-
-      // Reduce items within each thread stripe
-      thread_aggregate = (IS_FIRST_TILE) ? cub::ThreadReduce(items, reduction_op)
-                                         : cub::ThreadReduce(items, reduction_op, thread_aggregate);
-    }
-
-    // Consume a partial tile of input
-    //
-    template <int IS_FIRST_TILE, class CAN_VECTORIZE>
-    _CCCL_DEVICE_API _CCCL_FORCEINLINE void consume_tile(
-      T& thread_aggregate,
-      Size block_offset,
-      int valid_items,
-      thrust::detail::false_type /* is_full_tile */,
-      CAN_VECTORIZE)
-    {
-      // Partial tile
-      int thread_offset = threadIdx.x;
-
-      // Read first item
-      if ((IS_FIRST_TILE) && (thread_offset < valid_items))
-      {
-        thread_aggregate = load_it[block_offset + thread_offset];
-        thread_offset += BLOCK_THREADS;
-      }
-
-      // Continue reading items (block-striped)
-      while (thread_offset < valid_items)
-      {
-        thread_aggregate =
-          reduction_op(thread_aggregate, thrust::raw_reference_cast(load_it[block_offset + thread_offset]));
-        thread_offset += BLOCK_THREADS;
-      }
-    }
-
-    //---------------------------------------------------------------
-    // Consume a contiguous segment of tiles
-    //---------------------------------------------------------------------
-
-    // Reduce a contiguous segment of input tiles
-    //
-    template <class CAN_VECTORIZE>
-    _CCCL_DEVICE_API _CCCL_FORCEINLINE T
-    consume_range_impl(Size block_offset, Size block_end, CAN_VECTORIZE can_vectorize)
-    {
-      T thread_aggregate;
-
-      if (block_offset + ITEMS_PER_TILE > block_end)
-      {
-        // First tile isn't full (not all threads have valid items)
-        int valid_items = block_end - block_offset;
-        consume_tile<true>(thread_aggregate, block_offset, valid_items, thrust::detail::false_type(), can_vectorize);
-        return BlockReduce(storage.reduce).Reduce(thread_aggregate, reduction_op, valid_items);
-      }
-
-      // At least one full block
-      consume_tile<true>(thread_aggregate, block_offset, ITEMS_PER_TILE, thrust::detail::true_type(), can_vectorize);
-      block_offset += ITEMS_PER_TILE;
-
-      // Consume subsequent full tiles of input
-      while (block_offset + ITEMS_PER_TILE <= block_end)
-      {
-        consume_tile<false>(thread_aggregate, block_offset, ITEMS_PER_TILE, thrust::detail::true_type(), can_vectorize);
-        block_offset += ITEMS_PER_TILE;
-      }
-
-      // Consume a partially-full tile
-      if (block_offset < block_end)
-      {
-        int valid_items = block_end - block_offset;
-        consume_tile<false>(thread_aggregate, block_offset, valid_items, thrust::detail::false_type(), can_vectorize);
-      }
-
-      // Compute block-wide reduction (all threads have valid items)
-      return BlockReduce(storage.reduce).Reduce(thread_aggregate, reduction_op);
-    }
-
-    // Reduce a contiguous segment of input tiles
-    //
-    _CCCL_DEVICE_API _CCCL_FORCEINLINE T consume_range(Size block_offset, Size block_end)
-    {
-      using attempt_vec = is_true<ATTEMPT_VECTORIZATION>;
-      using path_a      = is_true<true && ATTEMPT_VECTORIZATION>;
-      using path_b      = is_true<false && ATTEMPT_VECTORIZATION>;
-
-      return is_aligned(input_it + block_offset, attempt_vec())
-             ? consume_range_impl(block_offset, block_end, path_a())
-             : consume_range_impl(block_offset, block_end, path_b());
-    }
-
-    // Reduce a contiguous segment of input tiles
-    //
-    _CCCL_DEVICE_API _CCCL_FORCEINLINE T consume_tiles(
-      Size /*num_items*/,
-      cub::GridEvenShare<Size>& even_share,
-      cub::GridQueue<UnsignedSize>& /*queue*/,
-      thrust::detail::integral_constant<cub::GridMappingStrategy, cub::GRID_MAPPING_RAKE> /*is_rake*/)
-    {
-      using attempt_vec = is_true<ATTEMPT_VECTORIZATION>;
-      using path_a      = is_true<true && ATTEMPT_VECTORIZATION>;
-      using path_b      = is_true<false && ATTEMPT_VECTORIZATION>;
-
-      // Initialize even-share descriptor for this thread block
-      even_share.template BlockInit<ITEMS_PER_TILE, cub::GRID_MAPPING_RAKE>();
-
-      return is_aligned(input_it, attempt_vec())
-             ? consume_range_impl(even_share.block_offset, even_share.block_end, path_a())
-             : consume_range_impl(even_share.block_offset, even_share.block_end, path_b());
-    }
-
-    //---------------------------------------------------------------------
-    // Dynamically consume tiles
-    //---------------------------------------------------------------------
-
-    // Dequeue and reduce tiles of items as part of a inter-block reduction
-    //
-    template <class CAN_VECTORIZE>
-    _CCCL_DEVICE_API _CCCL_FORCEINLINE T
-    consume_tiles_impl(Size num_items, cub::GridQueue<UnsignedSize> queue, CAN_VECTORIZE can_vectorize)
-    {
-      // We give each thread block at least one tile of input.
-      T thread_aggregate;
-      Size block_offset    = blockIdx.x * ITEMS_PER_TILE;
-      Size even_share_base = gridDim.x * ITEMS_PER_TILE;
-
-      if (block_offset + ITEMS_PER_TILE > num_items)
-      {
-        // First tile isn't full (not all threads have valid items)
-        int valid_items = num_items - block_offset;
-        consume_tile<true>(thread_aggregate, block_offset, valid_items, thrust::detail::false_type(), can_vectorize);
-        return BlockReduce(storage.reduce).Reduce(thread_aggregate, reduction_op, valid_items);
-      }
-
-      // Consume first full tile of input
-      consume_tile<true>(thread_aggregate, block_offset, ITEMS_PER_TILE, thrust::detail::true_type(), can_vectorize);
-
-      if (num_items > even_share_base)
-      {
-        // Dequeue a tile of items
-        if (threadIdx.x == 0)
-        {
-          storage.dequeue_offset = queue.Drain(ITEMS_PER_TILE) + even_share_base;
-        }
-
-        __syncthreads();
-
-        // Grab tile offset and check if we're done with full tiles
-        block_offset = storage.dequeue_offset;
-
-        // Consume more full tiles
-        while (block_offset + ITEMS_PER_TILE <= num_items)
-        {
-          consume_tile<false>(
-            thread_aggregate, block_offset, ITEMS_PER_TILE, thrust::detail::true_type(), can_vectorize);
-
-          __syncthreads();
-
-          // Dequeue a tile of items
-          if (threadIdx.x == 0)
-          {
-            storage.dequeue_offset = queue.Drain(ITEMS_PER_TILE) + even_share_base;
-          }
-
-          __syncthreads();
-
-          // Grab tile offset and check if we're done with full tiles
-          block_offset = storage.dequeue_offset;
-        }
-
-        // Consume partial tile
-        if (block_offset < num_items)
-        {
-          int valid_items = num_items - block_offset;
-          consume_tile<false>(thread_aggregate, block_offset, valid_items, thrust::detail::false_type(), can_vectorize);
-        }
-      }
-
-      // Compute block-wide reduction (all threads have valid items)
-      return BlockReduce(storage.reduce).Reduce(thread_aggregate, reduction_op);
-    }
-
-    // Dequeue and reduce tiles of items as part of a inter-block reduction
-    //
-    _CCCL_DEVICE_API _CCCL_FORCEINLINE T consume_tiles(
-      Size num_items,
-      cub::GridEvenShare<Size>& /*even_share*/,
-      cub::GridQueue<UnsignedSize>& queue,
-      thrust::detail::integral_constant<cub::GridMappingStrategy, cub::GRID_MAPPING_DYNAMIC>)
-    {
-      using attempt_vec = is_true<ATTEMPT_VECTORIZATION>;
-      using path_a      = is_true<true && ATTEMPT_VECTORIZATION>;
-      using path_b      = is_true<false && ATTEMPT_VECTORIZATION>;
-
-      return is_aligned(input_it, attempt_vec())
-             ? consume_tiles_impl(num_items, queue, path_a())
-             : consume_tiles_impl(num_items, queue, path_b());
-    }
-  }; // struct impl
-
-  //---------------------------------------------------------------------
-  // Agent entry points
-  //---------------------------------------------------------------------
-
-  // single tile reduce entry point
-  //
-  THRUST_AGENT_ENTRY(InputIt input_it, OutputIt output_it, Size num_items, ReductionOp reduction_op, char* shmem)
-  {
-    TempStorage& storage = *reinterpret_cast<TempStorage*>(shmem);
-
-    if (num_items == 0)
-    {
-      return;
-    }
-
-    T block_aggregate = impl(storage, input_it, reduction_op).consume_range((Size) 0, num_items);
-
-    if (threadIdx.x == 0)
-    {
-      *output_it = block_aggregate;
-    }
-  }
-
-  // single tile reduce entry point
-  //
-  THRUST_AGENT_ENTRY(InputIt input_it, OutputIt output_it, Size num_items, ReductionOp reduction_op, T init, char* shmem)
-  {
-    TempStorage& storage = *reinterpret_cast<TempStorage*>(shmem);
-
-    if (num_items == 0)
-    {
-      if (threadIdx.x == 0)
-      {
-        *output_it = init;
-      }
-      return;
-    }
-
-    T block_aggregate = impl(storage, input_it, reduction_op).consume_range((Size) 0, num_items);
-
-    if (threadIdx.x == 0)
-    {
-      *output_it = reduction_op(init, block_aggregate);
-    }
-  }
-
-  THRUST_AGENT_ENTRY(
-    InputIt input_it,
-    OutputIt output_it,
-    Size num_items,
-    cub::GridEvenShare<Size> even_share,
-    cub::GridQueue<UnsignedSize> queue,
-    ReductionOp reduction_op,
-    char* shmem)
-  {
-    TempStorage& storage = *reinterpret_cast<TempStorage*>(shmem);
-
-    using grid_mapping = thrust::detail::integral_constant<cub::GridMappingStrategy, ptx_plan::GRID_MAPPING>;
-
-    T block_aggregate =
-      impl(storage, input_it, reduction_op).consume_tiles(num_items, even_share, queue, grid_mapping());
-
-    if (threadIdx.x == 0)
-    {
-      output_it[blockIdx.x] = block_aggregate;
-    }
-  }
-}; // struct ReduceAgent
-
-template <class Size>
-struct DrainAgent
-{
-  using UnsignedSize = typename detail::make_unsigned_special<Size>::type;
-
-  template <class Arch>
-  struct PtxPlan : PtxPolicy<1>
-  {};
-  using ptx_plan = core::detail::specialize_plan<PtxPlan>;
-
-  //---------------------------------------------------------------------
-  // Agent entry point
-  //---------------------------------------------------------------------
-
-  THRUST_AGENT_ENTRY(cub::GridQueue<UnsignedSize> grid_queue, Size num_items, char* /*shmem*/)
-  {
-    grid_queue.FillAndResetDrain(num_items);
-  }
-}; // struct DrainAgent;
-} // namespace __reduce
-
 namespace detail
 {
 template <typename Derived, typename InputIt, typename Size, typename T, typename BinaryOp>