Skip to content
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 0 additions & 17 deletions libcudacxx/include/cuda/__warp/warp_shuffle.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,15 +28,10 @@
# include <cuda/__cmath/pow2.h>
# include <cuda/__ptx/instructions/get_sreg.h>
# include <cuda/__ptx/instructions/shfl_sync.h>
# include <cuda/std/__concepts/concept_macros.h>
# include <cuda/std/__memory/addressof.h>
# include <cuda/std/__type_traits/enable_if.h>
# include <cuda/std/__type_traits/integral_constant.h>
# include <cuda/std/__type_traits/is_default_constructible.h>
# include <cuda/std/__type_traits/is_pointer.h>
# include <cuda/std/__type_traits/is_trivially_copyable.h>
# include <cuda/std/__type_traits/is_void.h>
# include <cuda/std/__type_traits/remove_cvref.h>
# include <cuda/std/cstdint>

# include <cuda/std/__cccl/prologue.h>
Expand All @@ -60,9 +55,6 @@ template <int _Width = 32, typename _Tp, typename _Up = ::cuda::std::remove_cv_t
[[nodiscard]] _CCCL_DEVICE_API warp_shuffle_result<_Up> warp_shuffle_idx(
const _Tp& __data, int __src_lane, uint32_t __lane_mask = 0xFFFFFFFF, ::cuda::std::integral_constant<int, _Width> = {})
{
static_assert(::cuda::std::is_default_constructible_v<_Tp>, "_Tp must be default constructible");
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

question: Instead of wholesale removing these checks, should we just add explicit exceptions for known types? With the ability to allow people to proclaim types as valid for use with these APIs?

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As @fbusato explained to me, the problem is that if there is a struct containing __half, it won't be trivially copyable.. However, we do the same think for cuda::std::bit_cast and noone has complained yet.

But I think we should keep at least the requirement on default constructibility.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would second default_constructability, because that is a much clearer error message than what a C++ compiler generates 5 lines below

Copy link
Copy Markdown
Collaborator

@jrhemstad jrhemstad Apr 1, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd recommend taking a look at what we did in cuCollections by offering a is_bitwise_comparable custom trait. By default, we use has_unique_object_representation<T>, but that is false for floating-point values due to NaNs. However, for the majority of use cases that doesn't matter, and so we allow an escape hatch of specializing is_bitwise_comparable to opt-in. We emit a helpful diagnostic when this situation arises pointing people towards specializing is_bitwise_comparable.

We could do something similar here.

https://github.com/NVIDIA/cuCollections/blob/6477be2182668015f9a91e3a0bb7e248eceecd09/include/cuco/utility/traits.hpp#L24-L60

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the idea is a bit invasive but nice. The problem affects other warp instructions as well, so this solution applies to all of them. We can specialize the new type traits for reduced precision floating points + array.

I opened an RFE for the compiler nvbug 5497120 a while ago. We can rely on the proposed solution until we don't get an official workaround.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the funny aspect is that has_unique_object_representation<T> recognizes __half, __nv_bfloat16 as unique object representation, while this is not the case

static_assert(::cuda::std::is_trivially_copyable_v<_Tp>, "_Tp must be trivially copyable");

constexpr auto __warp_size = 32u;
constexpr bool __is_void_ptr = ::cuda::std::is_same_v<_Up, void*> || ::cuda::std::is_same_v<_Up, const void*>;
static_assert(!::cuda::std::is_pointer_v<_Up> || __is_void_ptr,
Expand Down Expand Up @@ -107,9 +99,6 @@ template <int _Width = 32, typename _Tp, typename _Up = ::cuda::std::remove_cv_t
[[nodiscard]] _CCCL_DEVICE_API warp_shuffle_result<_Tp> warp_shuffle_up(
const _Tp& __data, int __delta, uint32_t __lane_mask = 0xFFFFFFFF, ::cuda::std::integral_constant<int, _Width> = {})
{
static_assert(::cuda::std::is_default_constructible_v<_Tp>, "_Tp must be default constructible");
static_assert(::cuda::std::is_trivially_copyable_v<_Tp>, "_Tp must be trivially copyable");

constexpr auto __warp_size = 32u;
constexpr bool __is_void_ptr = ::cuda::std::is_same_v<_Up, void*> || ::cuda::std::is_same_v<_Up, const void*>;
static_assert(!::cuda::std::is_pointer_v<_Up> || __is_void_ptr,
Expand Down Expand Up @@ -158,9 +147,6 @@ template <int _Width = 32, typename _Tp, typename _Up = ::cuda::std::remove_cv_t
[[nodiscard]] _CCCL_DEVICE_API warp_shuffle_result<_Up> warp_shuffle_down(
const _Tp& __data, int __delta, uint32_t __lane_mask = 0xFFFFFFFF, ::cuda::std::integral_constant<int, _Width> = {})
{
static_assert(::cuda::std::is_default_constructible_v<_Tp>, "_Tp must be default constructible");
static_assert(::cuda::std::is_trivially_copyable_v<_Tp>, "_Tp must be trivially copyable");

constexpr auto __warp_size = 32u;
constexpr bool __is_void_ptr = ::cuda::std::is_same_v<_Up, void*> || ::cuda::std::is_same_v<_Up, const void*>;
static_assert(!::cuda::std::is_pointer_v<_Up> || __is_void_ptr,
Expand Down Expand Up @@ -209,9 +195,6 @@ template <int _Width = 32, typename _Tp, typename _Up = ::cuda::std::remove_cv_t
[[nodiscard]] _CCCL_DEVICE_API warp_shuffle_result<_Up> warp_shuffle_xor(
const _Tp& __data, int __xor_mask, uint32_t __lane_mask = 0xFFFFFFFF, ::cuda::std::integral_constant<int, _Width> = {})
{
static_assert(::cuda::std::is_default_constructible_v<_Tp>, "_Tp must be default constructible");
static_assert(::cuda::std::is_trivially_copyable_v<_Tp>, "_Tp must be trivially copyable");

constexpr auto __warp_size = 32u;
constexpr bool __is_void_ptr = ::cuda::std::is_same_v<_Up, void*> || ::cuda::std::is_same_v<_Up, const void*>;
static_assert(!::cuda::std::is_pointer_v<_Up> || __is_void_ptr,
Expand Down
Loading