diff --git a/nvbench_helper/nvbench_helper/nvbench_helper.cuh b/nvbench_helper/nvbench_helper/nvbench_helper.cuh index 344c261bf8c..9a54355754e 100644 --- a/nvbench_helper/nvbench_helper/nvbench_helper.cuh +++ b/nvbench_helper/nvbench_helper/nvbench_helper.cuh @@ -47,8 +47,8 @@ NVBENCH_DECLARE_TYPE_STRINGS(complex64, "C64", "complex64"); NVBENCH_DECLARE_TYPE_STRINGS(::cuda::std::false_type, "false", "false_type"); NVBENCH_DECLARE_TYPE_STRINGS(::cuda::std::true_type, "true", "true_type"); -NVBENCH_DECLARE_TYPE_STRINGS(cub::ArgMin, "ArgMin", "cub::ArgMin"); -NVBENCH_DECLARE_TYPE_STRINGS(cub::ArgMax, "ArgMax", "cub::ArgMax"); +NVBENCH_DECLARE_TYPE_STRINGS(cub::detail::arg_min, "arg_min", "cub::detail::arg_min"); +NVBENCH_DECLARE_TYPE_STRINGS(cub::detail::arg_max, "arg_max", "cub::detail::arg_max"); template struct nvbench::type_strings<::cuda::std::integral_constant> diff --git a/thrust/benchmarks/bench/extrema/basic.cu b/thrust/benchmarks/bench/extrema/basic.cu new file mode 100644 index 00000000000..3bae59603d2 --- /dev/null +++ b/thrust/benchmarks/bench/extrema/basic.cu @@ -0,0 +1,67 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include +#include +#include + +#include "nvbench_helper.cuh" + +template +static void bench_extremum(nvbench::state& state, nvbench::type_list, Func func) +{ + const auto elements = static_cast(state.get_int64("Elements")); + + thrust::device_vector in = generate(elements); + + using offset_t = typename decltype(in.cbegin())::difference_type; + + state.add_element_count(elements); + state.add_global_memory_reads(elements); + state.add_global_memory_writes(1); + + caching_allocator_t alloc; + state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { + do_not_optimize(func(policy(alloc, launch), in.cbegin(), in.cend())); + }); +} + +template +static void min_element(nvbench::state& state, nvbench::type_list list) +{ + bench_extremum(state, list, [](auto&&... args) { + return thrust::min_element(args...); + }); +} + +NVBENCH_BENCH_TYPES(min_element, NVBENCH_TYPE_AXES(fundamental_types)) + .set_name("min_element") + .set_type_axes_names({"T{ct}"}) + .add_int64_power_of_two_axis("Elements", nvbench::range(16, 28, 4)); + +template +static void max_element(nvbench::state& state, nvbench::type_list list) +{ + bench_extremum(state, list, [](auto&&... args) { + return thrust::max_element(args...); + }); +} + +NVBENCH_BENCH_TYPES(max_element, NVBENCH_TYPE_AXES(fundamental_types)) + .set_name("max_element") + .set_type_axes_names({"T{ct}"}) + .add_int64_power_of_two_axis("Elements", nvbench::range(16, 28, 4)); + +template +static void minmax_element(nvbench::state& state, nvbench::type_list list) +{ + bench_extremum(state, list, [](auto&&... args) { + return thrust::minmax_element(args...); + }); +} + +NVBENCH_BENCH_TYPES(minmax_element, NVBENCH_TYPE_AXES(fundamental_types)) + .set_name("minmax_element") + .set_type_axes_names({"T{ct}"}) + .add_int64_power_of_two_axis("Elements", nvbench::range(16, 28, 4)); diff --git a/thrust/thrust/system/cuda/detail/extrema.h b/thrust/thrust/system/cuda/detail/extrema.h index d2d9f69dc80..252ef6a29dc 100644 --- a/thrust/thrust/system/cuda/detail/extrema.h +++ b/thrust/thrust/system/cuda/detail/extrema.h @@ -17,15 +17,12 @@ # include -# include - # include # include -# include -# include # include -# include +# include +# include # include # include # include @@ -36,416 +33,181 @@ namespace cuda_cub { namespace __extrema { -template -struct arg_min_f -{ - Predicate predicate; - using pair_type = ::cuda::std::tuple; - - _CCCL_HOST_DEVICE arg_min_f(Predicate p) - : predicate(p) - {} - - pair_type _CCCL_DEVICE operator()(pair_type const& lhs, pair_type const& rhs) - { - InputType const& rhs_value = ::cuda::std::get<0>(rhs); - InputType const& lhs_value = ::cuda::std::get<0>(lhs); - IndexType const& rhs_key = ::cuda::std::get<1>(rhs); - IndexType const& lhs_key = ::cuda::std::get<1>(lhs); - - // check values first - if (predicate(lhs_value, rhs_value)) - { - return lhs; - } - else if (predicate(rhs_value, lhs_value)) - { - return rhs; - } - - // values are equivalent, prefer smaller index - if (lhs_key < rhs_key) - { - return lhs; - } - else - { - return rhs; - } - } -}; // struct arg_min_f - -template -struct arg_max_f +template +ItemsIt CUB_RUNTIME_FUNCTION +cub_min_element(execution_policy& policy, ItemsIt first, ItemsIt last, BinaryPred binary_pred) { - Predicate predicate; - using pair_type = ::cuda::std::tuple; + cudaStream_t stream = cuda_cub::stream(policy); + using offset_t = thrust::detail::it_difference_t; + const offset_t num_items = ::cuda::std::distance(first, last); - _CCCL_HOST_DEVICE arg_max_f(Predicate p) - : predicate(p) - {} - - pair_type _CCCL_DEVICE operator()(pair_type const& lhs, pair_type const& rhs) + if (num_items == 0) { - InputType const& rhs_value = ::cuda::std::get<0>(rhs); - InputType const& lhs_value = ::cuda::std::get<0>(lhs); - IndexType const& rhs_key = ::cuda::std::get<1>(rhs); - IndexType const& lhs_key = ::cuda::std::get<1>(lhs); - - // check values first - if (predicate(lhs_value, rhs_value)) - { - return rhs; - } - else if (predicate(rhs_value, lhs_value)) - { - return lhs; - } - - // values are equivalent, prefer smaller index - if (lhs_key < rhs_key) - { - return lhs; - } - else - { - return rhs; - } + return last; } -}; // struct arg_max_f - -template -struct arg_minmax_f -{ - Predicate predicate; - using pair_type = ::cuda::std::tuple; - using two_pairs_type = ::cuda::std::tuple; - - using arg_min_t = arg_min_f; - using arg_max_t = arg_max_f; + size_t tmp_size = 0; + auto error = cub::DeviceReduce::ArgMin( + nullptr, + tmp_size, + first, + ::cuda::discard_iterator{}, + static_cast(nullptr), + num_items, + binary_pred, + stream); + throw_on_error(error, "min_element failed to allocate temporary storages"); - _CCCL_HOST_DEVICE arg_minmax_f(Predicate p) - : predicate(p) - {} + // We allocate both the temporary storage needed for the algorithm, and a `size_type` to store the result. + thrust::detail::temporary_array tmp(policy, sizeof(offset_t) + tmp_size); + offset_t* index_ptr = thrust::detail::aligned_reinterpret_cast(tmp.data().get()); + void* tmp_ptr = static_cast(tmp.data().get() + sizeof(offset_t)); - two_pairs_type _CCCL_DEVICE operator()(two_pairs_type const& lhs, two_pairs_type const& rhs) - { - pair_type const& rhs_min = ::cuda::std::get<0>(rhs); - pair_type const& lhs_min = ::cuda::std::get<0>(lhs); - pair_type const& rhs_max = ::cuda::std::get<1>(rhs); - pair_type const& lhs_max = ::cuda::std::get<1>(lhs); + error = cub::DeviceReduce::ArgMin( + tmp_ptr, tmp_size, first, ::cuda::discard_iterator{}, index_ptr, num_items, binary_pred, stream); + cuda_cub::throw_on_error(error, "min_element failed to launch cub::DeviceReduce::ArgMin"); - auto result = - ::cuda::std::make_tuple(arg_min_t(predicate)(lhs_min, rhs_min), arg_max_t(predicate)(lhs_max, rhs_max)); + cuda_cub::throw_on_error(cuda_cub::synchronize(policy), "min_element failed to synchronize"); - return result; - } + return first + get_value(policy, index_ptr); +} - struct duplicate_tuple - { - _CCCL_DEVICE two_pairs_type operator()(pair_type const& t) - { - return ::cuda::std::make_tuple(t, t); - } - }; -}; // struct arg_minmax_f - -template -cudaError_t THRUST_RUNTIME_FUNCTION doit_step( - void* d_temp_storage, - size_t& temp_storage_bytes, - InputIt input_it, - Size num_items, - ReductionOp reduction_op, - OutputIt output_it, - cudaStream_t stream) +template +struct minmax_accum_t { - using core::detail::AgentLauncher; - using core::detail::AgentPlan; - using core::detail::cuda_optional; - using core::detail::get_agent_plan; + ::cuda::std::pair min, max; +}; - using UnsignedSize = typename detail::make_unsigned_special::type; - - if (num_items == 0) +template +struct minmax_load_transformation +{ + // convert from zip_iterator + template + _CCCL_HOST_DEVICE _CCCL_FORCEINLINE auto operator()(::cuda::std::tuple input) const + -> minmax_accum_t { - return cudaErrorNotSupported; + auto p = ::cuda::std::pair{::cuda::std::get<0>(input), ::cuda::std::get<1>(input)}; + return {p, p}; } +}; - using reduce_agent = AgentLauncher<__reduce::ReduceAgent>; +template +struct output_t +{ + OffsetT min_offset; + OffsetT max_offset; - typename reduce_agent::Plan reduce_plan = reduce_agent::get_plan(stream); + output_t() = default; - cudaError_t status = cudaSuccess; + // convert from accumulator type (during assignment at the end of the kernel) + template + _CCCL_API _CCCL_FORCEINLINE output_t(minmax_accum_t result) + : min_offset(result.min.first) + , max_offset(result.max.first) + {} +}; - if (num_items <= reduce_plan.items_per_tile) - { - size_t vshmem_size = core::detail::vshmem_size(reduce_plan.shared_memory_size, 1); - - // small, single tile size - if (d_temp_storage == nullptr) - { - temp_storage_bytes = max(1, vshmem_size); - return status; - } - char* vshmem_ptr = vshmem_size > 0 ? (char*) d_temp_storage : nullptr; - - reduce_agent ra(reduce_plan, num_items, stream, vshmem_ptr, "reduce_agent: single_tile only"); - ra.launch(input_it, output_it, num_items, reduction_op); - _CUDA_CUB_RET_IF_FAIL(cudaPeekAtLastError()); - } - else +template > +struct minmax_reduce_op : ValueLessThen +{ + _CCCL_HOST_DEVICE _CCCL_FORCEINLINE auto + operator()(const minmax_accum_t& a, const minmax_accum_t& b) const + -> minmax_accum_t { - // regular size - cuda_optional sm_count = core::detail::get_sm_count(); - _CUDA_CUB_RET_IF_FAIL(sm_count.status()); - - // reduction will not use more cta counts than requested - cuda_optional max_blocks_per_sm = reduce_agent::template get_max_blocks_per_sm< - InputIt, - OutputIt, - Size, - cub::GridEvenShare, - cub::GridQueue, - ReductionOp>(reduce_plan); - _CUDA_CUB_RET_IF_FAIL(max_blocks_per_sm.status()); - - int reduce_device_occupancy = (int) max_blocks_per_sm * sm_count; - - int sm_oversubscription = 5; - int max_blocks = reduce_device_occupancy * sm_oversubscription; - - cub::GridEvenShare even_share; - even_share.DispatchInit(num_items, max_blocks, reduce_plan.items_per_tile); - - // we will launch at most "max_blocks" blocks in a grid - // so preallocate virtual shared memory storage for this if required - // - size_t vshmem_size = core::detail::vshmem_size(reduce_plan.shared_memory_size, max_blocks); - - // Temporary storage allocation requirements - void* allocations[3] = {nullptr, nullptr, nullptr}; - size_t allocation_sizes[3] = { - max_blocks * sizeof(T), // bytes needed for privatized block reductions - cub::GridQueue::AllocationSize(), // bytes needed for grid queue descriptor0 - vshmem_size // size of virtualized shared memory storage - }; - status = cub::detail::alias_temporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes); - _CUDA_CUB_RET_IF_FAIL(status); - if (d_temp_storage == nullptr) - { - return status; - } - - T* d_block_reductions = (T*) allocations[0]; - cub::GridQueue queue(allocations[1]); - char* vshmem_ptr = vshmem_size > 0 ? (char*) allocations[2] : nullptr; - - // Get grid size for device_reduce_sweep_kernel - int reduce_grid_size = 0; - if (reduce_plan.grid_mapping == cub::GRID_MAPPING_RAKE) - { - // Work is distributed evenly - reduce_grid_size = even_share.grid_size; - } - else if (reduce_plan.grid_mapping == cub::GRID_MAPPING_DYNAMIC) - { - // Work is distributed dynamically - size_t num_tiles = ::cuda::ceil_div(num_items, reduce_plan.items_per_tile); - - // if not enough to fill the device with threadblocks - // then fill the device with threadblocks - reduce_grid_size = static_cast((min) (num_tiles, static_cast(reduce_device_occupancy))); - - using drain_agent = AgentLauncher<__reduce::DrainAgent>; - AgentPlan drain_plan = drain_agent::get_plan(); - drain_plan.grid_size = 1; - drain_agent da(drain_plan, stream, "__reduce::drain_agent"); - da.launch(queue, num_items); - _CUDA_CUB_RET_IF_FAIL(cudaPeekAtLastError()); - } - else - { - _CUDA_CUB_RET_IF_FAIL(cudaErrorNotSupported); - } - - reduce_plan.grid_size = reduce_grid_size; - reduce_agent ra(reduce_plan, stream, vshmem_ptr, "reduce_agent: regular size reduce"); - ra.launch(input_it, d_block_reductions, num_items, even_share, queue, reduction_op); - _CUDA_CUB_RET_IF_FAIL(cudaPeekAtLastError()); - - using reduce_agent_single = AgentLauncher<__reduce::ReduceAgent>; - - reduce_plan.grid_size = 1; - reduce_agent_single ra1(reduce_plan, stream, vshmem_ptr, "reduce_agent: single tile reduce"); - - ra1.launch(d_block_reductions, output_it, reduce_grid_size, reduction_op); - _CUDA_CUB_RET_IF_FAIL(cudaPeekAtLastError()); + const auto& less = static_cast(*this); + const auto min = cub::detail::arg_less{less}(a.min, b.min); + const auto max = cub::detail::arg_less>{less}(a.max, b.max); + return {min, max}; } - return status; -} // func doit_step + // needed for __accumulator_t, never called at runtime + _CCCL_HOST_DEVICE _CCCL_FORCEINLINE auto operator()( + const cub::detail::reduce::empty_problem_init_t>&, const minmax_accum_t&) const + -> minmax_accum_t; +}; -// this is an init-less reduce, needed for min/max-element functionality -// this will avoid copying the first value from device->host -template -THRUST_RUNTIME_FUNCTION T -extrema(execution_policy& policy, InputIt first, Size num_items, BinaryOp binary_op, T*) +template +::cuda::std::pair CUB_RUNTIME_FUNCTION +cub_minmax_element(execution_policy& policy, ItemsIt first, ItemsIt last, BinaryPred binary_pred) { - size_t temp_storage_bytes = 0; - cudaStream_t stream = cuda_cub::stream(policy); - - cudaError_t status; - THRUST_INDEX_TYPE_DISPATCH( - status, - doit_step, - num_items, - (nullptr, temp_storage_bytes, first, num_items_fixed, binary_op, static_cast(nullptr), stream)); - cuda_cub::throw_on_error(status, "extrema failed on 1st step"); - - size_t allocation_sizes[2] = {sizeof(T*), temp_storage_bytes}; - void* allocations[2] = {nullptr, nullptr}; - - size_t storage_size = 0; - status = core::detail::alias_storage(nullptr, storage_size, allocations, allocation_sizes); - cuda_cub::throw_on_error(status, "extrema failed on 1st alias storage"); - - // Allocate temporary storage. - thrust::detail::temporary_array tmp(policy, storage_size); - void* ptr = static_cast(tmp.data().get()); - - status = core::detail::alias_storage(ptr, storage_size, allocations, allocation_sizes); - cuda_cub::throw_on_error(status, "extrema failed on 2nd alias storage"); + cudaStream_t stream = cuda_cub::stream(policy); + using offset_t = thrust::detail::it_difference_t; + const offset_t num_items = ::cuda::std::distance(first, last); - T* d_result = thrust::detail::aligned_reinterpret_cast(allocations[0]); - - THRUST_INDEX_TYPE_DISPATCH( - status, - doit_step, - num_items, - (allocations[1], temp_storage_bytes, first, num_items_fixed, binary_op, d_result, stream)); - cuda_cub::throw_on_error(status, "extrema failed on 2nd step"); - - status = cuda_cub::synchronize(policy); - cuda_cub::throw_on_error(status, "extrema failed to synchronize"); - - T result = cuda_cub::get_value(policy, d_result); - - return result; -} - -template