Skip to content

Commit 9cf04b6

Browse files
Migrate cuco HLL (#6666)
* Migrate cuco HLL * Add `sketch_size` strong type * add standard_deviation strong type and its constructors * __uglify detail strong typed names * Rename `_MemoryResource` to `_MemoryResourceRef` * Rename template paramter `_T` to `_Tp` to avoid NASTY macros * suppress GCC visibility hidden attribute warnings for kernels * move __cuco/detail/* to __cuco/* * uglify detail folder names * Refactor detail headers and namespaces * use const west style * use unsigned shifts to avoid U.B * remove usage explicit this->() * MSVC WAR * use ::cuda::std::log * use ::cuda::is_aligned * move strong types inside the classes, making them nested types * enable merge host APIs * replace usage of _MemoryResourceRef with _MemoryResource * remove __int128 tests * use [[nodiscard]] attribute and `::cuda::std::size_t` * include specific headers * update `__register_mask` to be a member function * Improve header guard macro's naming * use noexcept * move __m * __m out of switch statement * replace `and` with `&` * use const keyword * remove usage of explicit `this->` * remove usage of `new` * use pinned memory resource and host buffer for copying sketch to host * Add precision based contructor * Add proper bound check for precision, sd, sketch_size in constructors * accept host mr for estimate * update exceptions macro usage * update CUDAX_CUCO_HLL_TUNING_ARR_DECL macro * fix ::cooperative_groups::invoke_one CTK requirements issues * remove redundant static --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent 1b9a0ee commit 9cf04b6

File tree

13 files changed

+2346
-76
lines changed

13 files changed

+2346
-76
lines changed

cudax/include/cuda/experimental/__cuco/detail/hash_functions/murmurhash3.cuh renamed to cudax/include/cuda/experimental/__cuco/__hash_functions/murmurhash3.cuh

Lines changed: 28 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
// under the Apache License v2.0 with LLVM Exceptions.
55
// See https://llvm.org/LICENSE.txt for license information.
66
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7-
// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
7+
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
88
//
99
//===----------------------------------------------------------------------===//
1010

@@ -19,8 +19,8 @@
1919
* platform, but your performance with the non-native version will be less than optimal.
2020
*/
2121

22-
#ifndef _CUDAX__CUCO_DETAIL_HASH_FUNCTIONS_MURMURHASH3_CUH
23-
#define _CUDAX__CUCO_DETAIL_HASH_FUNCTIONS_MURMURHASH3_CUH
22+
#ifndef _CUDAX___CUCO___HASH_FUNCTIONS_MURMURHASH3_CUH
23+
#define _CUDAX___CUCO___HASH_FUNCTIONS_MURMURHASH3_CUH
2424

2525
#include <cuda/__cccl_config>
2626

@@ -40,11 +40,11 @@
4040
#include <cuda/std/cstdint>
4141
#include <cuda/std/span>
4242

43-
#include <cuda/experimental/__cuco/detail/hash_functions/utils.cuh>
43+
#include <cuda/experimental/__cuco/__hash_functions/utils.cuh>
4444

4545
#include <cuda/std/__cccl/prologue.h>
4646

47-
namespace cuda::experimental::cuco::__detail
47+
namespace cuda::experimental::cuco
4848
{
4949
template <typename _Key>
5050
[[nodiscard]] _CCCL_API constexpr ::cuda::std::uint32_t __fmix32(_Key __key, ::cuda::std::uint32_t __seed = 0) noexcept
@@ -158,7 +158,7 @@ private:
158158
//----------
159159
// finalization
160160
__h1 ^= ::cuda::std::uint32_t{sizeof(_Holder)};
161-
__h1 = ::cuda::experimental::cuco::__detail::__fmix32(__h1);
161+
__h1 = ::cuda::experimental::cuco::__fmix32(__h1);
162162
return __h1;
163163
}
164164

@@ -175,8 +175,7 @@ private:
175175
// body
176176
for (::cuda::std::remove_const_t<decltype(__nblocks)> __i = 0; __i < __nblocks; __i++)
177177
{
178-
::cuda::std::uint32_t __k1 =
179-
::cuda::experimental::cuco::__detail::__load_chunk<::cuda::std::uint32_t>(__bytes, __i);
178+
::cuda::std::uint32_t __k1 = ::cuda::experimental::cuco::__load_chunk<::cuda::std::uint32_t>(__bytes, __i);
180179
__k1 *= __c1;
181180
__k1 = ::cuda::std::rotl(__k1, 15);
182181
__k1 *= __c2;
@@ -205,7 +204,7 @@ private:
205204
//----------
206205
// finalization
207206
__h1 ^= __size;
208-
__h1 = ::cuda::experimental::cuco::__detail::__fmix32(__h1);
207+
__h1 = ::cuda::experimental::cuco::__fmix32(__h1);
209208
return __h1;
210209
}
211210

@@ -393,10 +392,10 @@ private:
393392
__h[2] += __h[0];
394393
__h[3] += __h[0];
395394

396-
__h[0] = ::cuda::experimental::cuco::__detail::__fmix32(__h[0]);
397-
__h[1] = ::cuda::experimental::cuco::__detail::__fmix32(__h[1]);
398-
__h[2] = ::cuda::experimental::cuco::__detail::__fmix32(__h[2]);
399-
__h[3] = ::cuda::experimental::cuco::__detail::__fmix32(__h[3]);
395+
__h[0] = ::cuda::experimental::cuco::__fmix32(__h[0]);
396+
__h[1] = ::cuda::experimental::cuco::__fmix32(__h[1]);
397+
__h[2] = ::cuda::experimental::cuco::__fmix32(__h[2]);
398+
__h[3] = ::cuda::experimental::cuco::__fmix32(__h[3]);
400399

401400
__h[0] += __h[1];
402401
__h[0] += __h[2];
@@ -421,14 +420,13 @@ private:
421420
// body
422421
for (::cuda::std::remove_const_t<decltype(__nchunks)> __i = 0; __size >= __chunk_size && __i < __nchunks; ++__i)
423422
{
424-
::cuda::std::uint32_t __k1 =
425-
::cuda::experimental::cuco::__detail::__load_chunk<::cuda::std::uint32_t>(__bytes, 4 * __i);
423+
::cuda::std::uint32_t __k1 = ::cuda::experimental::cuco::__load_chunk<::cuda::std::uint32_t>(__bytes, 4 * __i);
426424
::cuda::std::uint32_t __k2 =
427-
::cuda::experimental::cuco::__detail::__load_chunk<::cuda::std::uint32_t>(__bytes, 4 * __i + 1);
425+
::cuda::experimental::cuco::__load_chunk<::cuda::std::uint32_t>(__bytes, 4 * __i + 1);
428426
::cuda::std::uint32_t __k3 =
429-
::cuda::experimental::cuco::__detail::__load_chunk<::cuda::std::uint32_t>(__bytes, 4 * __i + 2);
427+
::cuda::experimental::cuco::__load_chunk<::cuda::std::uint32_t>(__bytes, 4 * __i + 2);
430428
::cuda::std::uint32_t __k4 =
431-
::cuda::experimental::cuco::__detail::__load_chunk<::cuda::std::uint32_t>(__bytes, 4 * __i + 3);
429+
::cuda::experimental::cuco::__load_chunk<::cuda::std::uint32_t>(__bytes, 4 * __i + 3);
432430

433431
__k1 *= __c1;
434432
__k1 = ::cuda::std::rotl(__k1, 15);
@@ -555,10 +553,10 @@ private:
555553
__h[2] += __h[0];
556554
__h[3] += __h[0];
557555

558-
__h[0] = ::cuda::experimental::cuco::__detail::__fmix32(__h[0]);
559-
__h[1] = ::cuda::experimental::cuco::__detail::__fmix32(__h[1]);
560-
__h[2] = ::cuda::experimental::cuco::__detail::__fmix32(__h[2]);
561-
__h[3] = ::cuda::experimental::cuco::__detail::__fmix32(__h[3]);
556+
__h[0] = ::cuda::experimental::cuco::__fmix32(__h[0]);
557+
__h[1] = ::cuda::experimental::cuco::__fmix32(__h[1]);
558+
__h[2] = ::cuda::experimental::cuco::__fmix32(__h[2]);
559+
__h[3] = ::cuda::experimental::cuco::__fmix32(__h[3]);
562560

563561
__h[0] += __h[1];
564562
__h[0] += __h[2];
@@ -712,8 +710,8 @@ private:
712710
__h[0] += __h[1];
713711
__h[1] += __h[0];
714712

715-
__h[0] = ::cuda::experimental::cuco::__detail::__fmix64(__h[0]);
716-
__h[1] = ::cuda::experimental::cuco::__detail::__fmix64(__h[1]);
713+
__h[0] = ::cuda::experimental::cuco::__fmix64(__h[0]);
714+
__h[1] = ::cuda::experimental::cuco::__fmix64(__h[1]);
717715

718716
__h[0] += __h[1];
719717
__h[1] += __h[0];
@@ -734,10 +732,9 @@ private:
734732
// body
735733
for (::cuda::std::remove_const_t<decltype(__nchunks)> __i = 0; __size >= __chunk_size && __i < __nchunks; ++__i)
736734
{
737-
::cuda::std::uint64_t __k1 =
738-
::cuda::experimental::cuco::__detail::__load_chunk<::cuda::std::uint64_t>(__bytes, 2 * __i);
735+
::cuda::std::uint64_t __k1 = ::cuda::experimental::cuco::__load_chunk<::cuda::std::uint64_t>(__bytes, 2 * __i);
739736
::cuda::std::uint64_t __k2 =
740-
::cuda::experimental::cuco::__detail::__load_chunk<::cuda::std::uint64_t>(__bytes, 2 * __i + 1);
737+
::cuda::experimental::cuco::__load_chunk<::cuda::std::uint64_t>(__bytes, 2 * __i + 1);
741738

742739
__k1 *= __c1;
743740
__k1 = ::cuda::std::rotl(__k1, 31);
@@ -827,8 +824,8 @@ private:
827824
__h[0] += __h[1];
828825
__h[1] += __h[0];
829826

830-
__h[0] = ::cuda::experimental::cuco::__detail::__fmix64(__h[0]);
831-
__h[1] = ::cuda::experimental::cuco::__detail::__fmix64(__h[1]);
827+
__h[0] = ::cuda::experimental::cuco::__fmix64(__h[0]);
828+
__h[1] = ::cuda::experimental::cuco::__fmix64(__h[1]);
832829

833830
__h[0] += __h[1];
834831
__h[1] += __h[0];
@@ -841,8 +838,8 @@ private:
841838
};
842839

843840
#endif // _CCCL_HAS_INT128()
844-
} // namespace cuda::experimental::cuco::__detail
841+
} // namespace cuda::experimental::cuco
845842

846843
#include <cuda/std/__cccl/epilogue.h>
847844

848-
#endif // _CUDAX__CUCO_DETAIL_HASH_FUNCTIONS_XXHASH_CUH
845+
#endif // _CUDAX___CUCO___HASH_FUNCTIONS_MURMURHASH3_CUH

cudax/include/cuda/experimental/__cuco/detail/hash_functions/utils.cuh renamed to cudax/include/cuda/experimental/__cuco/__hash_functions/utils.cuh

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,12 @@
44
// under the Apache License v2.0 with LLVM Exceptions.
55
// See https://llvm.org/LICENSE.txt for license information.
66
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7-
// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
7+
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
88
//
99
//===----------------------------------------------------------------------===//
1010

11-
#ifndef _CUDAX__CUCO_DETAIL_HASH_FUNCTIONS_UTILS_CUH
12-
#define _CUDAX__CUCO_DETAIL_HASH_FUNCTIONS_UTILS_CUH
11+
#ifndef _CUDAX___CUCO___HASH_FUNCTIONS_UTILS_CUH
12+
#define _CUDAX___CUCO___HASH_FUNCTIONS_UTILS_CUH
1313

1414
#include <cuda/__cccl_config>
1515

@@ -27,7 +27,7 @@
2727

2828
#include <cuda/std/__cccl/prologue.h>
2929

30-
namespace cuda::experimental::cuco::__detail
30+
namespace cuda::experimental::cuco
3131
{
3232
//! @brief Loads a chunk of type _Tp from a byte pointer at a given index, handling alignment
3333
//!
@@ -139,8 +139,8 @@ struct _Byte_holder<_KeySize, _ChunkSize, _BlockSize, _UseTailBlock, _BlockT, tr
139139

140140
_BlockT __blocks[__num_blocks];
141141
};
142-
}; // namespace cuda::experimental::cuco::__detail
142+
} // namespace cuda::experimental::cuco
143143

144144
#include <cuda/std/__cccl/epilogue.h>
145145

146-
#endif // _CUDAX__CUCO_DETAIL_HASH_FUNCTIONS_UTILS_CUH
146+
#endif // _CUDAX___CUCO___HASH_FUNCTIONS_UTILS_CUH

cudax/include/cuda/experimental/__cuco/detail/hash_functions/xxhash.cuh renamed to cudax/include/cuda/experimental/__cuco/__hash_functions/xxhash.cuh

Lines changed: 14 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
// under the Apache License v2.0 with LLVM Exceptions.
55
// See https://llvm.org/LICENSE.txt for license information.
66
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7-
// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
7+
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
88
//
99
//===----------------------------------------------------------------------===//
1010

@@ -42,8 +42,8 @@
4242
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
4343
*/
4444

45-
#ifndef _CUDAX__CUCO_DETAIL_HASH_FUNCTIONS_XXHASH_CUH
46-
#define _CUDAX__CUCO_DETAIL_HASH_FUNCTIONS_XXHASH_CUH
45+
#ifndef _CUDAX___CUCO___HASH_FUNCTIONS_XXHASH_CUH
46+
#define _CUDAX___CUCO___HASH_FUNCTIONS_XXHASH_CUH
4747

4848
#include <cuda/__cccl_config>
4949

@@ -63,11 +63,11 @@
6363
#include <cuda/std/cstdint>
6464
#include <cuda/std/span>
6565

66-
#include <cuda/experimental/__cuco/detail/hash_functions/utils.cuh>
66+
#include <cuda/experimental/__cuco/__hash_functions/utils.cuh>
6767

6868
#include <cuda/std/__cccl/prologue.h>
6969

70-
namespace cuda::experimental::cuco::__detail
70+
namespace cuda::experimental::cuco
7171
{
7272
//! @brief A `_XXHash_32` hash function to hash the given argument on host and device.
7373
//!
@@ -205,9 +205,8 @@ private:
205205
// pipeline 4*4byte computations
206206
const auto __pipeline_offset = __offset / 4;
207207
::cuda::static_for<4>([&](auto i) {
208-
__v[i] +=
209-
::cuda::experimental::cuco::__detail::__load_chunk<::cuda::std::uint32_t>(__bytes, __pipeline_offset + i)
210-
* __prime2;
208+
__v[i] += ::cuda::experimental::cuco::__load_chunk<::cuda::std::uint32_t>(__bytes, __pipeline_offset + i)
209+
* __prime2;
211210
__v[i] = ::cuda::std::rotl(__v[i], 13);
212211
__v[i] *= __prime1;
213212
});
@@ -229,8 +228,7 @@ private:
229228
_CCCL_PRAGMA_UNROLL(4)
230229
for (; __offset <= __size - 4; __offset += 4)
231230
{
232-
__h32 += ::cuda::experimental::cuco::__detail::__load_chunk<::cuda::std::uint32_t>(__bytes, __offset / 4)
233-
* __prime3;
231+
__h32 += ::cuda::experimental::cuco::__load_chunk<::cuda::std::uint32_t>(__bytes, __offset / 4) * __prime3;
234232
__h32 = ::cuda::std::rotl(__h32, 17) * __prime4;
235233
}
236234
}
@@ -342,9 +340,8 @@ private:
342340
// pipeline 4*8byte computations
343341
const auto __pipeline_offset = __offset / 8;
344342
::cuda::static_for<4>([&](auto i) {
345-
__v[i] +=
346-
::cuda::experimental::cuco::__detail::__load_chunk<::cuda::std::uint64_t>(__bytes, __pipeline_offset + i)
347-
* __prime2;
343+
__v[i] += ::cuda::experimental::cuco::__load_chunk<::cuda::std::uint64_t>(__bytes, __pipeline_offset + i)
344+
* __prime2;
348345
__v[i] = ::cuda::std::rotl(__v[i], 31);
349346
__v[i] *= __prime1;
350347
});
@@ -375,7 +372,7 @@ private:
375372
for (; __offset <= __size - 8; __offset += 8)
376373
{
377374
::cuda::std::uint64_t __k1 =
378-
::cuda::experimental::cuco::__detail::__load_chunk<::cuda::std::uint64_t>(__bytes, __offset / 8) * __prime2;
375+
::cuda::experimental::cuco::__load_chunk<::cuda::std::uint64_t>(__bytes, __offset / 8) * __prime2;
379376
__k1 = ::cuda::std::rotl(__k1, 31) * __prime1;
380377
__h64 ^= __k1;
381378
__h64 = ::cuda::std::rotl(__h64, 27) * __prime1 + __prime4;
@@ -387,8 +384,7 @@ private:
387384
{
388385
for (; __offset <= __size - 4; __offset += 4)
389386
{
390-
__h64 ^= (::cuda::experimental::cuco::__detail::__load_chunk<::cuda::std::uint32_t>(__bytes, __offset / 4))
391-
* __prime1;
387+
__h64 ^= (::cuda::experimental::cuco::__load_chunk<::cuda::std::uint32_t>(__bytes, __offset / 4)) * __prime1;
392388
__h64 = ::cuda::std::rotl(__h64, 23) * __prime2 + __prime3;
393389
}
394390
}
@@ -420,8 +416,8 @@ private:
420416

421417
::cuda::std::uint64_t __seed_;
422418
};
423-
} // namespace cuda::experimental::cuco::__detail
419+
} // namespace cuda::experimental::cuco
424420

425421
#include <cuda/std/__cccl/epilogue.h>
426422

427-
#endif // _CUDAX__CUCO_DETAIL_HASH_FUNCTIONS_XXHASH_CUH
423+
#endif // _CUDAX___CUCO___HASH_FUNCTIONS_XXHASH_CUH

0 commit comments

Comments
 (0)