NVIDIA
diff --git a/cudax/include/cuda/experimental/__cuco/detail/hash_functions/murmurhash3.cuh b/cudax/include/cuda/experimental/__cuco/__hash_functions/murmurhash3.cuh
cudax/include/cuda/experimental/__cuco/detail/hash_functions/murmurhash3.cuh renamed to cudax/include/cuda/experimental/__cuco/__hash_functions/murmurhash3.cuh
Lines changed: 28 additions & 31 deletions
diff --git a/cudax/include/cuda/experimental/__cuco/detail/hash_functions/utils.cuh b/cudax/include/cuda/experimental/__cuco/__hash_functions/utils.cuh
cudax/include/cuda/experimental/__cuco/detail/hash_functions/utils.cuh renamed to cudax/include/cuda/experimental/__cuco/__hash_functions/utils.cuh
Lines changed: 6 additions & 6 deletions
diff --git a/cudax/include/cuda/experimental/__cuco/detail/hash_functions/xxhash.cuh b/cudax/include/cuda/experimental/__cuco/__hash_functions/xxhash.cuh
cudax/include/cuda/experimental/__cuco/detail/hash_functions/xxhash.cuh renamed to cudax/include/cuda/experimental/__cuco/__hash_functions/xxhash.cuh
Lines changed: 14 additions & 18 deletions
@@ -4,7 +4,7 @@
 // under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
 //
 //===----------------------------------------------------------------------===//
 
@@ -19,8 +19,8 @@
  * platform, but your performance with the non-native version will be less than optimal.
  */
 
-#ifndef _CUDAX__CUCO_DETAIL_HASH_FUNCTIONS_MURMURHASH3_CUH
-#define _CUDAX__CUCO_DETAIL_HASH_FUNCTIONS_MURMURHASH3_CUH
+#ifndef _CUDAX___CUCO___HASH_FUNCTIONS_MURMURHASH3_CUH
+#define _CUDAX___CUCO___HASH_FUNCTIONS_MURMURHASH3_CUH
 
 #include <cuda/__cccl_config>
 
@@ -40,11 +40,11 @@
 #include <cuda/std/cstdint>
 #include <cuda/std/span>
 
-#include <cuda/experimental/__cuco/detail/hash_functions/utils.cuh>
+#include <cuda/experimental/__cuco/__hash_functions/utils.cuh>
 
 #include <cuda/std/__cccl/prologue.h>
 
-namespace cuda::experimental::cuco::__detail
+namespace cuda::experimental::cuco
 {
 template <typename _Key>
 [[nodiscard]] _CCCL_API constexpr ::cuda::std::uint32_t __fmix32(_Key __key, ::cuda::std::uint32_t __seed = 0) noexcept
@@ -158,7 +158,7 @@ private:
     //----------
     // finalization
     __h1 ^= ::cuda::std::uint32_t{sizeof(_Holder)};
-    __h1 = ::cuda::experimental::cuco::__detail::__fmix32(__h1);
+    __h1 = ::cuda::experimental::cuco::__fmix32(__h1);
     return __h1;
   }
 
@@ -175,8 +175,7 @@ private:
     // body
     for (::cuda::std::remove_const_t<decltype(__nblocks)> __i = 0; __i < __nblocks; __i++)
     {
-      ::cuda::std::uint32_t __k1 =
-        ::cuda::experimental::cuco::__detail::__load_chunk<::cuda::std::uint32_t>(__bytes, __i);
+      ::cuda::std::uint32_t __k1 = ::cuda::experimental::cuco::__load_chunk<::cuda::std::uint32_t>(__bytes, __i);
       __k1 *= __c1;
       __k1 = ::cuda::std::rotl(__k1, 15);
       __k1 *= __c2;
@@ -205,7 +204,7 @@ private:
     //----------
     // finalization
     __h1 ^= __size;
-    __h1 = ::cuda::experimental::cuco::__detail::__fmix32(__h1);
+    __h1 = ::cuda::experimental::cuco::__fmix32(__h1);
     return __h1;
   }
 
@@ -393,10 +392,10 @@ private:
     __h[2] += __h[0];
     __h[3] += __h[0];
 
-    __h[0] = ::cuda::experimental::cuco::__detail::__fmix32(__h[0]);
-    __h[1] = ::cuda::experimental::cuco::__detail::__fmix32(__h[1]);
-    __h[2] = ::cuda::experimental::cuco::__detail::__fmix32(__h[2]);
-    __h[3] = ::cuda::experimental::cuco::__detail::__fmix32(__h[3]);
+    __h[0] = ::cuda::experimental::cuco::__fmix32(__h[0]);
+    __h[1] = ::cuda::experimental::cuco::__fmix32(__h[1]);
+    __h[2] = ::cuda::experimental::cuco::__fmix32(__h[2]);
+    __h[3] = ::cuda::experimental::cuco::__fmix32(__h[3]);
 
     __h[0] += __h[1];
     __h[0] += __h[2];
@@ -421,14 +420,13 @@ private:
     // body
     for (::cuda::std::remove_const_t<decltype(__nchunks)> __i = 0; __size >= __chunk_size && __i < __nchunks; ++__i)
     {
-      ::cuda::std::uint32_t __k1 =
-        ::cuda::experimental::cuco::__detail::__load_chunk<::cuda::std::uint32_t>(__bytes, 4 * __i);
+      ::cuda::std::uint32_t __k1 = ::cuda::experimental::cuco::__load_chunk<::cuda::std::uint32_t>(__bytes, 4 * __i);
       ::cuda::std::uint32_t __k2 =
-        ::cuda::experimental::cuco::__detail::__load_chunk<::cuda::std::uint32_t>(__bytes, 4 * __i + 1);
+        ::cuda::experimental::cuco::__load_chunk<::cuda::std::uint32_t>(__bytes, 4 * __i + 1);
       ::cuda::std::uint32_t __k3 =
-        ::cuda::experimental::cuco::__detail::__load_chunk<::cuda::std::uint32_t>(__bytes, 4 * __i + 2);
+        ::cuda::experimental::cuco::__load_chunk<::cuda::std::uint32_t>(__bytes, 4 * __i + 2);
       ::cuda::std::uint32_t __k4 =
-        ::cuda::experimental::cuco::__detail::__load_chunk<::cuda::std::uint32_t>(__bytes, 4 * __i + 3);
+        ::cuda::experimental::cuco::__load_chunk<::cuda::std::uint32_t>(__bytes, 4 * __i + 3);
 
       __k1 *= __c1;
       __k1 = ::cuda::std::rotl(__k1, 15);
@@ -555,10 +553,10 @@ private:
     __h[2] += __h[0];
     __h[3] += __h[0];
 
-    __h[0] = ::cuda::experimental::cuco::__detail::__fmix32(__h[0]);
-    __h[1] = ::cuda::experimental::cuco::__detail::__fmix32(__h[1]);
-    __h[2] = ::cuda::experimental::cuco::__detail::__fmix32(__h[2]);
-    __h[3] = ::cuda::experimental::cuco::__detail::__fmix32(__h[3]);
+    __h[0] = ::cuda::experimental::cuco::__fmix32(__h[0]);
+    __h[1] = ::cuda::experimental::cuco::__fmix32(__h[1]);
+    __h[2] = ::cuda::experimental::cuco::__fmix32(__h[2]);
+    __h[3] = ::cuda::experimental::cuco::__fmix32(__h[3]);
 
     __h[0] += __h[1];
     __h[0] += __h[2];
@@ -712,8 +710,8 @@ private:
     __h[0] += __h[1];
     __h[1] += __h[0];
 
-    __h[0] = ::cuda::experimental::cuco::__detail::__fmix64(__h[0]);
-    __h[1] = ::cuda::experimental::cuco::__detail::__fmix64(__h[1]);
+    __h[0] = ::cuda::experimental::cuco::__fmix64(__h[0]);
+    __h[1] = ::cuda::experimental::cuco::__fmix64(__h[1]);
 
     __h[0] += __h[1];
     __h[1] += __h[0];
@@ -734,10 +732,9 @@ private:
     // body
     for (::cuda::std::remove_const_t<decltype(__nchunks)> __i = 0; __size >= __chunk_size && __i < __nchunks; ++__i)
     {
-      ::cuda::std::uint64_t __k1 =
-        ::cuda::experimental::cuco::__detail::__load_chunk<::cuda::std::uint64_t>(__bytes, 2 * __i);
+      ::cuda::std::uint64_t __k1 = ::cuda::experimental::cuco::__load_chunk<::cuda::std::uint64_t>(__bytes, 2 * __i);
       ::cuda::std::uint64_t __k2 =
-        ::cuda::experimental::cuco::__detail::__load_chunk<::cuda::std::uint64_t>(__bytes, 2 * __i + 1);
+        ::cuda::experimental::cuco::__load_chunk<::cuda::std::uint64_t>(__bytes, 2 * __i + 1);
 
       __k1 *= __c1;
       __k1 = ::cuda::std::rotl(__k1, 31);
@@ -827,8 +824,8 @@ private:
     __h[0] += __h[1];
     __h[1] += __h[0];
 
-    __h[0] = ::cuda::experimental::cuco::__detail::__fmix64(__h[0]);
-    __h[1] = ::cuda::experimental::cuco::__detail::__fmix64(__h[1]);
+    __h[0] = ::cuda::experimental::cuco::__fmix64(__h[0]);
+    __h[1] = ::cuda::experimental::cuco::__fmix64(__h[1]);
 
     __h[0] += __h[1];
     __h[1] += __h[0];
@@ -841,8 +838,8 @@ private:
 };
 
 #endif // _CCCL_HAS_INT128()
-} // namespace cuda::experimental::cuco::__detail
+} // namespace cuda::experimental::cuco
 
 #include <cuda/std/__cccl/epilogue.h>
 
-#endif // _CUDAX__CUCO_DETAIL_HASH_FUNCTIONS_XXHASH_CUH
+#endif // _CUDAX___CUCO___HASH_FUNCTIONS_MURMURHASH3_CUH
@@ -4,12 +4,12 @@
 // under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef _CUDAX__CUCO_DETAIL_HASH_FUNCTIONS_UTILS_CUH
-#define _CUDAX__CUCO_DETAIL_HASH_FUNCTIONS_UTILS_CUH
+#ifndef _CUDAX___CUCO___HASH_FUNCTIONS_UTILS_CUH
+#define _CUDAX___CUCO___HASH_FUNCTIONS_UTILS_CUH
 
 #include <cuda/__cccl_config>
 
@@ -27,7 +27,7 @@
 
 #include <cuda/std/__cccl/prologue.h>
 
-namespace cuda::experimental::cuco::__detail
+namespace cuda::experimental::cuco
 {
 //! @brief Loads a chunk of type _Tp from a byte pointer at a given index, handling alignment
 //!
@@ -139,8 +139,8 @@ struct _Byte_holder<_KeySize, _ChunkSize, _BlockSize, _UseTailBlock, _BlockT, tr
 
   _BlockT __blocks[__num_blocks];
 };
-}; // namespace cuda::experimental::cuco::__detail
+} // namespace cuda::experimental::cuco
 
 #include <cuda/std/__cccl/epilogue.h>
 
-#endif // _CUDAX__CUCO_DETAIL_HASH_FUNCTIONS_UTILS_CUH
+#endif // _CUDAX___CUCO___HASH_FUNCTIONS_UTILS_CUH
@@ -4,7 +4,7 @@
 // under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
 //
 //===----------------------------------------------------------------------===//
 
@@ -42,8 +42,8 @@
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-#ifndef _CUDAX__CUCO_DETAIL_HASH_FUNCTIONS_XXHASH_CUH
-#define _CUDAX__CUCO_DETAIL_HASH_FUNCTIONS_XXHASH_CUH
+#ifndef _CUDAX___CUCO___HASH_FUNCTIONS_XXHASH_CUH
+#define _CUDAX___CUCO___HASH_FUNCTIONS_XXHASH_CUH
 
 #include <cuda/__cccl_config>
 
@@ -63,11 +63,11 @@
 #include <cuda/std/cstdint>
 #include <cuda/std/span>
 
-#include <cuda/experimental/__cuco/detail/hash_functions/utils.cuh>
+#include <cuda/experimental/__cuco/__hash_functions/utils.cuh>
 
 #include <cuda/std/__cccl/prologue.h>
 
-namespace cuda::experimental::cuco::__detail
+namespace cuda::experimental::cuco
 {
 //! @brief A `_XXHash_32` hash function to hash the given argument on host and device.
 //!
@@ -205,9 +205,8 @@ private:
         // pipeline 4*4byte computations
         const auto __pipeline_offset = __offset / 4;
         ::cuda::static_for<4>([&](auto i) {
-          __v[i] +=
-            ::cuda::experimental::cuco::__detail::__load_chunk<::cuda::std::uint32_t>(__bytes, __pipeline_offset + i)
-            * __prime2;
+          __v[i] += ::cuda::experimental::cuco::__load_chunk<::cuda::std::uint32_t>(__bytes, __pipeline_offset + i)
+                  * __prime2;
           __v[i] = ::cuda::std::rotl(__v[i], 13);
           __v[i] *= __prime1;
         });
@@ -229,8 +228,7 @@ private:
       _CCCL_PRAGMA_UNROLL(4)
       for (; __offset <= __size - 4; __offset += 4)
       {
-        __h32 += ::cuda::experimental::cuco::__detail::__load_chunk<::cuda::std::uint32_t>(__bytes, __offset / 4)
-               * __prime3;
+        __h32 += ::cuda::experimental::cuco::__load_chunk<::cuda::std::uint32_t>(__bytes, __offset / 4) * __prime3;
         __h32 = ::cuda::std::rotl(__h32, 17) * __prime4;
       }
     }
@@ -342,9 +340,8 @@ private:
         // pipeline 4*8byte computations
         const auto __pipeline_offset = __offset / 8;
         ::cuda::static_for<4>([&](auto i) {
-          __v[i] +=
-            ::cuda::experimental::cuco::__detail::__load_chunk<::cuda::std::uint64_t>(__bytes, __pipeline_offset + i)
-            * __prime2;
+          __v[i] += ::cuda::experimental::cuco::__load_chunk<::cuda::std::uint64_t>(__bytes, __pipeline_offset + i)
+                  * __prime2;
           __v[i] = ::cuda::std::rotl(__v[i], 31);
           __v[i] *= __prime1;
         });
@@ -375,7 +372,7 @@ private:
       for (; __offset <= __size - 8; __offset += 8)
       {
         ::cuda::std::uint64_t __k1 =
-          ::cuda::experimental::cuco::__detail::__load_chunk<::cuda::std::uint64_t>(__bytes, __offset / 8) * __prime2;
+          ::cuda::experimental::cuco::__load_chunk<::cuda::std::uint64_t>(__bytes, __offset / 8) * __prime2;
         __k1 = ::cuda::std::rotl(__k1, 31) * __prime1;
         __h64 ^= __k1;
         __h64 = ::cuda::std::rotl(__h64, 27) * __prime1 + __prime4;
@@ -387,8 +384,7 @@ private:
     {
       for (; __offset <= __size - 4; __offset += 4)
       {
-        __h64 ^= (::cuda::experimental::cuco::__detail::__load_chunk<::cuda::std::uint32_t>(__bytes, __offset / 4))
-               * __prime1;
+        __h64 ^= (::cuda::experimental::cuco::__load_chunk<::cuda::std::uint32_t>(__bytes, __offset / 4)) * __prime1;
         __h64 = ::cuda::std::rotl(__h64, 23) * __prime2 + __prime3;
       }
     }
@@ -420,8 +416,8 @@ private:
 
   ::cuda::std::uint64_t __seed_;
 };
-} // namespace cuda::experimental::cuco::__detail
+} // namespace cuda::experimental::cuco
 
 #include <cuda/std/__cccl/epilogue.h>
 
-#endif // _CUDAX__CUCO_DETAIL_HASH_FUNCTIONS_XXHASH_CUH
+#endif // _CUDAX___CUCO___HASH_FUNCTIONS_XXHASH_CUH