Commit c5a7406

[cudax->libcu++] Move the hierarchy type from cudax to libcu++ (#6611)
* Move hierarchy to libcu++
* Fix old GCC and MSVC
* More fixes
* Review feedback
1 parent 2e98940 commit c5a7406

30 files changed (+1321, -1104 lines)
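For orientation, here is a minimal before/after sketch of what this move means for user code, pieced together from the two example diffs below. The functor shape, the span parameters, and the cudax launch header are taken from those examples; everything else (helper names, exact header contents) is an assumption, not the library's documented API.

// Sketch: hierarchy level tags move from cudax (cuda::experimental) to cuda::.
#include <cuda/experimental/launch.cuh>
#include <cuda/std/span>

struct scale_kernel_sketch
{
  template <typename Configuration>
  __device__ void operator()(Configuration config, ::cuda::std::span<const float> src, ::cuda::std::span<float> dst)
  {
    // Linear rank of the calling thread; the level tag is now cuda::thread
    // (previously cudax::thread).
    const auto idx = config.dims.rank(cuda::thread);
    dst[idx] = src[idx] * 2.0f;
  }
};

template <typename Configuration>
int blocks_in_grid_sketch(const Configuration& config)
{
  // Counting one level within another likewise takes the cuda:: tags
  // (previously cudax::block and cudax::grid).
  return config.dims.count(cuda::block, cuda::grid);
}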

cudax/examples/simple_p2p.cu

Lines changed: 1 addition & 1 deletion
@@ -51,7 +51,7 @@ struct simple_kernel
   __device__ void operator()(Configuration config, ::cuda::std::span<const float> src, ::cuda::std::span<float> dst)
   {
     // Just a dummy kernel, doing enough for us to verify that everything worked
-    const auto idx = config.dims.rank(cudax::thread);
+    const auto idx = config.dims.rank(cuda::thread);
     dst[idx] = src[idx] * 2.0f;
   }
 };

cudax/examples/vector_add.cu

Lines changed: 1 addition & 1 deletion
@@ -96,7 +96,7 @@ try
 
   // Launch the vectorAdd kernel
   printf(
-    "CUDA kernel launch with %d blocks of %d threads\n", config.dims.count(cudax::block, cudax::grid), threadsPerBlock);
+    "CUDA kernel launch with %d blocks of %d threads\n", config.dims.count(cuda::block, cuda::grid), threadsPerBlock);
   cudax::launch(stream, config, vectorAdd, in(A), in(B), out(C));
 
   printf("waiting for the stream to finish\n");

cudax/include/cuda/experimental/__execution/stream/adaptor.cuh

Lines changed: 3 additions & 3 deletions
@@ -269,8 +269,8 @@ private:
     // the receiver tell us how to launch the kernel.
     auto const __launch_config = get_launch_config(execution::get_env(__state.__state_.__rcvr_));
     using __launch_dims_t = decltype(__launch_config.dims);
-    constexpr int __block_threads = __launch_dims_t::static_count(experimental::thread, experimental::block);
-    int const __grid_blocks = __launch_config.dims.count(experimental::block, experimental::grid);
+    constexpr int __block_threads = __launch_dims_t::static_count(thread, block);
+    int const __grid_blocks = __launch_config.dims.count(block, grid);
     static_assert(__block_threads != ::cuda::std::dynamic_extent);
 
     // Start the child operation state. This will launch kernels for all the predecessors
@@ -291,7 +291,7 @@ private:
   _CCCL_DEVICE_API void __device_start() noexcept
   {
     using __launch_dims_t = __dims_of_t<__rcvr_config_t>;
-    constexpr int __block_threads = __launch_dims_t::static_count(experimental::thread, experimental::block);
+    constexpr int __block_threads = __launch_dims_t::static_count(thread, block);
     auto& __state = __get_state();
 
     // without the following, the kernel in __host_start will fail to launch with

cudax/include/cuda/experimental/__execution/stream/scheduler.cuh

Lines changed: 3 additions & 3 deletions
@@ -133,8 +133,8 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT stream_scheduler
     // the completion kernel, we will be completing the parent's receiver, so we must let
     // the receiver tell us how to launch the kernel.
     auto const __launch_dims = get_launch_config(execution::get_env(__rcvr_)).dims;
-    constexpr int __block_threads = decltype(__launch_dims)::static_count(experimental::thread, experimental::block);
-    int const __grid_blocks = __launch_dims.count(experimental::block, experimental::grid);
+    constexpr int __block_threads = decltype(__launch_dims)::static_count(cuda::thread, cuda::block);
+    int const __grid_blocks = __launch_dims.count(cuda::block, cuda::grid);
     static_assert(__block_threads != ::cuda::std::dynamic_extent);
 
     // Launch the kernel that completes the receiver with the launch configuration from
@@ -152,7 +152,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT stream_scheduler
   _CCCL_DEVICE_API void __device_start() noexcept
   {
     using __launch_dims_t = decltype(get_launch_config(execution::get_env(__rcvr_)).dims);
-    constexpr int __block_threads = __launch_dims_t::static_count(experimental::thread, experimental::block);
+    constexpr int __block_threads = __launch_dims_t::static_count(cuda::thread, cuda::block);
 
     // without the following, the kernel in __host_start will fail to launch with
     // cudaErrorInvalidDeviceFunction.

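The two execution-stream diffs above lean on the difference between static_count (a compile-time query) and count (a runtime query) on the hierarchy dimensions. A hedged sketch of that distinction follows; the factory names (make_hierarchy, grid_dims, block_dims) are taken from the doc snippets elsewhere in this commit, and the public header after the move is not shown here, so the internal header this commit adds is included directly.

// Sketch of static_count vs count on a hierarchy with a static block size.
#include <cuda/__hierarchy/hierarchy_dimensions.h> // internal header added by this commit
#include <cuda/std/span> // for cuda::std::dynamic_extent

void hierarchy_query_sketch(int num_blocks)
{
  // Block size is a compile-time constant, grid size is a runtime value.
  auto dims = cuda::make_hierarchy(cuda::grid_dims(num_blocks), cuda::block_dims<256>());

  // static_count folds the compile-time extents: with a static block size it is
  // usable in constant expressions, otherwise it yields cuda::std::dynamic_extent.
  constexpr int block_threads = decltype(dims)::static_count(cuda::thread, cuda::block);
  static_assert(block_threads != ::cuda::std::dynamic_extent);

  // count returns the runtime value, here the number of blocks in the grid.
  int const grid_blocks = dims.count(cuda::block, cuda::grid);
  (void) grid_blocks;
}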
cudax/include/cuda/experimental/__launch/configuration.cuh

Lines changed: 84 additions & 53 deletions
@@ -12,6 +12,7 @@
 #define _CUDAX__LAUNCH_CONFIGURATION_CUH
 
 #include <cuda/__driver/driver_api.h>
+#include <cuda/__hierarchy/hierarchy_dimensions.h>
 #include <cuda/__numeric/overflow_cast.h>
 #include <cuda/__ptx/instructions/get_sreg.h>
 #include <cuda/std/__cstddef/types.h>
@@ -23,7 +24,6 @@
 #include <cuda/std/tuple>
 
 #include <cuda/experimental/__detail/utility.cuh>
-#include <cuda/experimental/hierarchy.cuh>
 
 #include <cuda/std/__cccl/prologue.h>
 
@@ -95,10 +95,11 @@ inline constexpr bool no_duplicate_options<Option, Rest...> =
 * @brief Launch option enabling cooperative launch
 *
 * This launch option causes the launched grid to be restricted to a number of
-* blocks that can simultaneously execute on the device. It means that every thread
-* in the launched grid can eventually observe execution of each other thread in the grid.
-* It also enables usage of cooperative_groups::grid_group::sync() function, that
-* synchronizes all threads in the grid.
+* blocks that can simultaneously execute on the device. It means that every
+* thread in the launched grid can eventually observe execution of each other
+* thread in the grid. It also enables usage of
+* cooperative_groups::grid_group::sync() function, that synchronizes all
+* threads in the grid.
 *
 * @par Snippet
 * @code
@@ -181,14 +182,14 @@ inline constexpr ::cuda::std::size_t __max_portable_dyn_smem_size = 48 * 1024;
 /**
 * @brief Launch option specifying dynamic shared memory configuration
 *
-* This launch option causes the launch to allocate amount of shared memory sufficient
-* to store the specified number of object of the specified type.
+* This launch option causes the launch to allocate amount of shared memory
+* sufficient to store the specified number of object of the specified type.
 * This type can be constructed with dynamic_shared_memory helper function.
 *
-* When launch configuration contains this option, that configuration can be then
-* passed to dynamic_shared_memory_view to get the view_type over the dynamic shared memory.
-* It is also possible to obtain that memory through the original
-* extern __shared__ variable[] declaration.
+* When launch configuration contains this option, that configuration can be
+* then passed to dynamic_shared_memory_view to get the view_type over the
+* dynamic shared memory. It is also possible to obtain that memory through the
+* original extern __shared__ variable[] declaration.
 *
 * CUDA guarantees that each device has at least 48kB of shared memory
 * per block, but most devices have more than that.
@@ -209,7 +210,8 @@ inline constexpr ::cuda::std::size_t __max_portable_dyn_smem_size = 48 * 1024;
 *
 * void kernel_launch(cuda::stream_ref stream) {
 *   auto dims = cudax::make_hierarchy(cudax::block<128>(), cudax::grid(4));
-*   auto conf = cudax::make_configuration(dims, dynamic_shared_memory<int[128]>());
+*   auto conf = cudax::make_configuration(dims,
+*                                         dynamic_shared_memory<int[128]>());
 *
 *   cudax::launch(stream, conf, kernel);
 * }
@@ -224,7 +226,8 @@ inline constexpr ::cuda::std::size_t __max_portable_dyn_smem_size = 48 * 1024;
 *   or cuda::std::dynamic_extent, if its dynamic
 *
 * @tparam NonPortableSize
-*   Needs to be enabled to exceed the portable limit of 48kB of shared memory per block
+*   Needs to be enabled to exceed the portable limit of 48kB of shared memory
+*   per block
 */
 template <class _Tp>
 class _CCCL_DECLSPEC_EMPTY_BASES dynamic_shared_memory
@@ -234,14 +237,17 @@ class _CCCL_DECLSPEC_EMPTY_BASES dynamic_shared_memory
   using __base_type = __dyn_smem_option_base<_Tp>;
 
   static_assert(::cuda::std::rank_v<_Tp> <= 1,
-                "multidimensional arrays cannot be used with dynamic shared memory option");
+                "multidimensional arrays cannot be used with dynamic shared "
+                "memory option");
   static_assert(!::cuda::std::is_const_v<typename __base_type::value_type>, "the value type cannot be const");
   static_assert(!::cuda::std::is_reference_v<typename __base_type::value_type>, "the value type cannot be a reference");
 
 public:
-  bool __non_portable_{}; //!< \c true if the object was created with non_portable flag.
+  bool __non_portable_{}; //!< \c true if the object was created with
+                          //!< non_portable flag.
 
-  using typename __base_type::value_type; //!< Value type of the dynamic shared memory elements.
+  using typename __base_type::value_type; //!< Value type of the dynamic shared
+                                          //!< memory elements.
   using typename __base_type::view_type; //!< The view type returned by the
                                          //!< cuda::device::dynamic_shared_memory_view(config).
 
@@ -321,7 +327,8 @@ template <class _Tp>
 {
   ::cudaError_t __status = ::cudaSuccess;
 
-  // Since CUDA 12.4, querying CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES requires the function to be loaded.
+  // Since CUDA 12.4, querying CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES requires the
+  // function to be loaded.
   if (::cuda::__driver::__version_at_least(12, 4))
   {
     __status = ::cuda::__driver::__functionLoadNoThrow(__kernel);
@@ -376,9 +383,10 @@ template <class _Tp>
 /**
 * @brief Launch option specifying launch priority
 *
-* This launch option causes the launched grid to be scheduled with the specified priority.
-* More about stream priorities and valid values can be found in the CUDA programming guide
-* `here <https://docs.nvidia.com/cuda/cuda-c-programming-guide/#stream-priorities>`_
+* This launch option causes the launched grid to be scheduled with the
+* specified priority. More about stream priorities and valid values can be
+* found in the CUDA programming guide `here
+* <https://docs.nvidia.com/cuda/cuda-c-programming-guide/#stream-priorities>`_
 */
 struct launch_priority : public __detail::launch_option
 {
@@ -444,11 +452,12 @@ _CCCL_CONCEPT __kernel_has_default_config =
 /**
 * @brief Type describing a kernel launch configuration
 *
-* This type should not be constructed directly and make_config helper function should be used instead
+* This type should not be constructed directly and make_config helper function
+* should be used instead
 *
 * @tparam Dimensions
-*   cuda::experimental::hierarchy_dimensions instance that describes dimensions of thread hierarchy in this
-*   configuration object
+*   cuda::experimental::hierarchy_dimensions instance that describes dimensions
+*   of thread hierarchy in this configuration object
 *
 * @tparam Options
 *   Types of options that were added to this configuration object
@@ -472,8 +481,8 @@ struct kernel_config
  /**
  * @brief Add a new option to this configuration
  *
-  * Returns a new kernel_config that has all option and dimensions from this kernel_config
-  * with the option from the argument added to it
+  * Returns a new kernel_config that has all option and dimensions from this
+  * kernel_config with the option from the argument added to it
  *
  * @param new_option
  *   Option to be added to the configuration
@@ -488,34 +497,42 @@ struct kernel_config
  /**
  * @brief Combine this configuration with another configuration object
  *
-  * Returns a new `kernel_config` that is a combination of this configuration and the configuration from argument.
-  * It contains dimensions that are combination of dimensions in this object and the other configuration. The resulting
-  * hierarchy holds levels present in both hierarchies. In case of overlap of levels hierarchy from this configuration
-  * is prioritized, so the result always holds all levels from this hierarchy and non-overlapping
-  * levels from the other hierarchy. This behavior is the same as `combine()` member function of the hierarchy type.
-  * The result also contains configuration options from both configurations. In case the same type of a configuration
-  * option is present in both configuration this configuration is copied into the resulting configuration.
+  * Returns a new `kernel_config` that is a combination of this configuration
+  * and the configuration from argument. It contains dimensions that are
+  * combination of dimensions in this object and the other configuration. The
+  * resulting hierarchy holds levels present in both hierarchies. In case of
+  * overlap of levels hierarchy from this configuration is prioritized, so the
+  * result always holds all levels from this hierarchy and non-overlapping
+  * levels from the other hierarchy. This behavior is the same as `combine()`
+  * member function of the hierarchy type. The result also contains
+  * configuration options from both configurations. In case the same type of a
+  * configuration option is present in both configuration this configuration is
+  * copied into the resulting configuration.
  *
  * @param __other_config
  *   Other configuration to combine with this configuration
  */
  template <typename _OtherDimensions, typename... _OtherOptions>
  [[nodiscard]] auto combine(const kernel_config<_OtherDimensions, _OtherOptions...>& __other_config) const
  {
-    // can't use fully qualified kernel_config name here because of nvcc bug, TODO remove __make_config_from_tuple once
-    // fixed
+    // can't use fully qualified kernel_config name here because of nvcc bug,
+    // TODO remove __make_config_from_tuple once fixed
    return __make_config_from_tuple(
      dims.combine(__other_config.dims),
      ::cuda::std::tuple_cat(options, ::cuda::std::apply(__filter_options<Options...>{}, __other_config.options)));
  }
 
  /**
-  * @brief Combine this configuration with default configuration of a kernel functor
+  * @brief Combine this configuration with default configuration of a kernel
+  * functor
  *
-  * Returns a new `kernel_config` that is a combination of this configuration and a default configuration from the
-  * kernel argument. Default configuration is a `kernel_config` object returned from `default_config()` member function
-  * of the kernel type. The configurations are combined using the `combine()` member function of this configuration.
-  * If the kernel has no default configuration, a copy of this configuration is returned without any changes.
+  * Returns a new `kernel_config` that is a combination of this configuration
+  * and a default configuration from the kernel argument. Default configuration
+  * is a `kernel_config` object returned from `default_config()` member
+  * function of the kernel type. The configurations are combined using the
+  * `combine()` member function of this configuration. If the kernel has no
+  * default configuration, a copy of this configuration is returned without any
+  * changes.
  *
  * @param __kernel
  *   Kernel functor to search for the default configuration
@@ -533,18 +550,22 @@ struct kernel_config
     }
   }
 };
+} // namespace cuda::experimental
 
-// We can consider removing the operator&, but its convenient for in-line construction
+_CCCL_BEGIN_NAMESPACE_CUDA
+
+// We can consider removing the operator&, but its convenient for in-line
+// construction
 template <typename Dimensions, typename... Options, typename NewLevel>
 _CCCL_HOST_API constexpr auto
-operator&(const kernel_config<Dimensions, Options...>& config, const NewLevel& new_level) noexcept
+operator&(const experimental::kernel_config<Dimensions, Options...>& config, const NewLevel& new_level) noexcept
 {
   return kernel_config(hierarchy_add_level(config.dims, new_level), config.options);
 }
 
 template <typename NewLevel, typename Dimensions, typename... Options>
 _CCCL_HOST_API constexpr auto
-operator&(const NewLevel& new_level, const kernel_config<Dimensions, Options...>& config) noexcept
+operator&(const NewLevel& new_level, const experimental::kernel_config<Dimensions, Options...>& config) noexcept
 {
   return kernel_config(hierarchy_add_level(config.dims, new_level), config.options);
 }
@@ -553,9 +574,13 @@ template <typename L1, typename Dims1, typename L2, typename Dims2>
 _CCCL_HOST_API constexpr auto
 operator&(const level_dimensions<L1, Dims1>& l1, const level_dimensions<L2, Dims2>& l2) noexcept
 {
-  return kernel_config(make_hierarchy(l1, l2));
+  return experimental::kernel_config(cuda::make_hierarchy(l1, l2));
 }
 
+_CCCL_END_NAMESPACE_CUDA
+
+namespace cuda::experimental
+{
 template <typename _Dimensions, typename... _Options>
 auto __make_config_from_tuple(const _Dimensions& __dims, const ::cuda::std::tuple<_Options...>& __opts)
 {
@@ -583,15 +608,18 @@ template <typename... Levels,
 /**
 * @brief Construct kernel configuration
 *
-* This function takes thread hierarchy dimensions description and any number of launch options and combines
-* them into kernel configuration object. It can be then used along with kernel function and its argument to launch
-* that kernel with the specified dimensions and options
+* This function takes thread hierarchy dimensions description and any number of
+* launch options and combines them into kernel configuration object. It can be
+* then used along with kernel function and its argument to launch that kernel
+* with the specified dimensions and options
 *
 * @param dims
-*   Object describing dimensions of the thread hierarchy in the resulting kernel configuration object
+*   Object describing dimensions of the thread hierarchy in the resulting kernel
+*   configuration object
 *
 * @param opts
-*   Variadic number of launch configuration options to be included in the resulting kernel configuration object
+*   Variadic number of launch configuration options to be included in the
+*   resulting kernel configuration object
 */
 template <typename BottomUnit, typename... Levels, typename... Opts>
 [[nodiscard]] constexpr auto
@@ -601,8 +629,8 @@ make_config(const hierarchy_dimensions<BottomUnit, Levels...>& dims, const Opts&
 }
 
 /**
-* @brief A shorthand for creating a kernel configuration with a hierarchy of CUDA threads evenly
-* distributing elements among blocks and threads.
+* @brief A shorthand for creating a kernel configuration with a hierarchy of
+* CUDA threads evenly distributing elements among blocks and threads.
 *
 * @par Snippet
 * @code
@@ -615,7 +643,8 @@ make_config(const hierarchy_dimensions<BottomUnit, Levels...>& dims, const Opts&
 * // Equivalent to:
 * constexpr int threadsPerBlock = 256;
 * int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
-* auto dims = make_hierarchy(grid_dims(blocksPerGrid), block_dims<threadsPerBlock>());
+* auto dims = make_hierarchy(grid_dims(blocksPerGrid),
+*                            block_dims<threadsPerBlock>());
 * @endcode
 */
 template <int _ThreadsPerBlock>
@@ -685,7 +714,8 @@ template <typename Dimensions, typename... Options>
 
   ::cuda::std::apply(
     [&](auto&... config_options) {
-      // Use short-cutting && to skip the rest on error, is this too convoluted?
+      // Use short-cutting && to skip the rest on error, is this too
+      // convoluted?
      (void) (... && [&](cudaError_t call_status) {
        status = call_status;
        return call_status == cudaSuccess;
@@ -704,7 +734,8 @@ template <typename Dimensions, typename... Options>
 
   ::cuda::std::apply(
     [&](auto&... config_options) {
-      // Use short-cutting && to skip the rest on error, is this too convoluted?
+      // Use short-cutting && to skip the rest on error, is this too
+      // convoluted?
      (void) (... && [&](cudaError_t call_status) {
        status = call_status;
        return call_status == cudaSuccess;

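Putting the namespace split above together: the hierarchy type and the operator& overloads now live in namespace cuda, while kernel_config, make_config, and the launch options stay in cuda::experimental. A hedged usage sketch follows; the header paths, factory spellings, and the launch call mirror the doc snippets in this diff, and any detail beyond them is an assumption.

// Sketch of constructing and using a launch configuration after this commit.
#include <cuda/experimental/launch.cuh>
#include <cuda/stream_ref>

namespace cudax = cuda::experimental;

__global__ void kernel() {}

void configure_and_launch_sketch(cuda::stream_ref stream)
{
  // Dimensions come from the hierarchy type that now lives in libcu++.
  auto dims = cuda::make_hierarchy(cuda::grid_dims(4), cuda::block_dims<128>());

  // Options such as dynamic shared memory remain cudax launch options.
  auto conf = cudax::make_config(dims, cudax::dynamic_shared_memory<int[128]>());

  cudax::launch(stream, conf, kernel);
}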
cudax/include/cuda/experimental/hierarchy.cuh

Lines changed: 0 additions & 16 deletions
This file was deleted.

cudax/test/CMakeLists.txt

Lines changed: 0 additions & 6 deletions
@@ -60,12 +60,6 @@ foreach (cudax_target IN LISTS cudax_TARGETS)
   add_custom_target(${config_meta_target})
   add_dependencies(${config_prefix}.all ${config_meta_target})
 
-  # Add tests:
-  cudax_add_catch2_test(test_target hierarchy ${cudax_target}
-    hierarchy/hierarchy_smoke.cu
-    hierarchy/hierarchy_custom_types.cu
-  )
-
   cudax_add_catch2_test(test_target launch ${cudax_target}
     launch/launch_smoke.cu
   )
