Commit c5a7406

[cudax->libcu++] Move the hierarchy type from cudax to libcu++ (#6611)
* Move hierarchy to libcu++
* Fix old GCC and MSVC
* More fixes
* Review feedback
1 parent 2e98940 commit c5a7406

30 files changed (+1321, -1104 lines)
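For orientation, here is a minimal before/after sketch of what this move means for user code, pieced together from the two example diffs below. The functor shape, the span parameters, and the cudax launch header are taken from those examples; everything else (helper names, exact header contents) is an assumption, not the library's documented API.

// Sketch: hierarchy level tags move from cudax (cuda::experimental) to cuda::.
#include <cuda/experimental/launch.cuh>
#include <cuda/std/span>

struct scale_kernel_sketch
{
  template <typename Configuration>
  __device__ void operator()(Configuration config, ::cuda::std::span<const float> src, ::cuda::std::span<float> dst)
  {
    // Linear rank of the calling thread; the level tag is now cuda::thread
    // (previously cudax::thread).
    const auto idx = config.dims.rank(cuda::thread);
    dst[idx] = src[idx] * 2.0f;
  }
};

template <typename Configuration>
int blocks_in_grid_sketch(const Configuration& config)
{
  // Counting one level within another likewise takes the cuda:: tags
  // (previously cudax::block and cudax::grid).
  return config.dims.count(cuda::block, cuda::grid);
}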

cudax/examples/simple_p2p.cu

Lines changed: 1 addition & 1 deletion
@@ -51,7 +51,7 @@ struct simple_kernel
   __device__ void operator()(Configuration config, ::cuda::std::span<const float> src, ::cuda::std::span<float> dst)
   {
     // Just a dummy kernel, doing enough for us to verify that everything worked
-    const auto idx = config.dims.rank(cudax::thread);
+    const auto idx = config.dims.rank(cuda::thread);
     dst[idx] = src[idx] * 2.0f;
   }
 };

cudax/examples/vector_add.cu

Lines changed: 1 addition & 1 deletion
@@ -96,7 +96,7 @@ try
 
   // Launch the vectorAdd kernel
   printf(
-    "CUDA kernel launch with %d blocks of %d threads\n", config.dims.count(cudax::block, cudax::grid), threadsPerBlock);
+    "CUDA kernel launch with %d blocks of %d threads\n", config.dims.count(cuda::block, cuda::grid), threadsPerBlock);
   cudax::launch(stream, config, vectorAdd, in(A), in(B), out(C));
 
   printf("waiting for the stream to finish\n");

cudax/include/cuda/experimental/__execution/stream/adaptor.cuh

Lines changed: 3 additions & 3 deletions
@@ -269,8 +269,8 @@ private:
     // the receiver tell us how to launch the kernel.
     auto const __launch_config = get_launch_config(execution::get_env(__state.__state_.__rcvr_));
     using __launch_dims_t = decltype(__launch_config.dims);
-    constexpr int __block_threads = __launch_dims_t::static_count(experimental::thread, experimental::block);
-    int const __grid_blocks = __launch_config.dims.count(experimental::block, experimental::grid);
+    constexpr int __block_threads = __launch_dims_t::static_count(thread, block);
+    int const __grid_blocks = __launch_config.dims.count(block, grid);
     static_assert(__block_threads != ::cuda::std::dynamic_extent);
 
     // Start the child operation state. This will launch kernels for all the predecessors
@@ -291,7 +291,7 @@ private:
   _CCCL_DEVICE_API void __device_start() noexcept
   {
     using __launch_dims_t = __dims_of_t<__rcvr_config_t>;
-    constexpr int __block_threads = __launch_dims_t::static_count(experimental::thread, experimental::block);
+    constexpr int __block_threads = __launch_dims_t::static_count(thread, block);
     auto& __state = __get_state();
 
     // without the following, the kernel in __host_start will fail to launch with

cudax/include/cuda/experimental/__execution/stream/scheduler.cuh

Lines changed: 3 additions & 3 deletions
@@ -133,8 +133,8 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT stream_scheduler
     // the completion kernel, we will be completing the parent's receiver, so we must let
     // the receiver tell us how to launch the kernel.
     auto const __launch_dims = get_launch_config(execution::get_env(__rcvr_)).dims;
-    constexpr int __block_threads = decltype(__launch_dims)::static_count(experimental::thread, experimental::block);
-    int const __grid_blocks = __launch_dims.count(experimental::block, experimental::grid);
+    constexpr int __block_threads = decltype(__launch_dims)::static_count(cuda::thread, cuda::block);
+    int const __grid_blocks = __launch_dims.count(cuda::block, cuda::grid);
     static_assert(__block_threads != ::cuda::std::dynamic_extent);
 
     // Launch the kernel that completes the receiver with the launch configuration from
@@ -152,7 +152,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT stream_scheduler
   _CCCL_DEVICE_API void __device_start() noexcept
   {
     using __launch_dims_t = decltype(get_launch_config(execution::get_env(__rcvr_)).dims);
-    constexpr int __block_threads = __launch_dims_t::static_count(experimental::thread, experimental::block);
+    constexpr int __block_threads = __launch_dims_t::static_count(cuda::thread, cuda::block);
 
     // without the following, the kernel in __host_start will fail to launch with
     // cudaErrorInvalidDeviceFunction.

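The two execution-stream diffs above lean on the difference between static_count (a compile-time query) and count (a runtime query) on the hierarchy dimensions. A hedged sketch of that distinction follows; the factory names (make_hierarchy, grid_dims, block_dims) are taken from the doc snippets elsewhere in this commit, and the public header after the move is not shown here, so the internal header this commit adds is included directly.

// Sketch of static_count vs count on a hierarchy with a static block size.
#include <cuda/__hierarchy/hierarchy_dimensions.h> // internal header added by this commit
#include <cuda/std/span> // for cuda::std::dynamic_extent

void hierarchy_query_sketch(int num_blocks)
{
  // Block size is a compile-time constant, grid size is a runtime value.
  auto dims = cuda::make_hierarchy(cuda::grid_dims(num_blocks), cuda::block_dims<256>());

  // static_count folds the compile-time extents: with a static block size it is
  // usable in constant expressions, otherwise it yields cuda::std::dynamic_extent.
  constexpr int block_threads = decltype(dims)::static_count(cuda::thread, cuda::block);
  static_assert(block_threads != ::cuda::std::dynamic_extent);

  // count returns the runtime value, here the number of blocks in the grid.
  int const grid_blocks = dims.count(cuda::block, cuda::grid);
  (void) grid_blocks;
}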
cudax/include/cuda/experimental/__launch/configuration.cuh

Lines changed: 84 additions & 53 deletions
@@ -12,6 +12,7 @@
 #define _CUDAX__LAUNCH_CONFIGURATION_CUH
 
 #include <cuda/__driver/driver_api.h>
+#include <cuda/__hierarchy/hierarchy_dimensions.h>
 #include <cuda/__numeric/overflow_cast.h>
 #include <cuda/__ptx/instructions/get_sreg.h>
 #include <cuda/std/__cstddef/types.h>
@@ -23,7 +24,6 @@
 #include <cuda/std/tuple>
 
 #include <cuda/experimental/__detail/utility.cuh>
-#include <cuda/experimental/hierarchy.cuh>
 
 #include <cuda/std/__cccl/prologue.h>
 
@@ -95,10 +95,11 @@ inline constexpr bool no_duplicate_options<Option, Rest...> =
 * @brief Launch option enabling cooperative launch
 *
 * This launch option causes the launched grid to be restricted to a number of
-* blocks that can simultaneously execute on the device. It means that every thread
-* in the launched grid can eventually observe execution of each other thread in the grid.
-* It also enables usage of cooperative_groups::grid_group::sync() function, that
-* synchronizes all threads in the grid.
+* blocks that can simultaneously execute on the device. It means that every
+* thread in the launched grid can eventually observe execution of each other
+* thread in the grid. It also enables usage of
+* cooperative_groups::grid_group::sync() function, that synchronizes all
+* threads in the grid.
 *
 * @par Snippet
 * @code
@@ -181,14 +182,14 @@ inline constexpr ::cuda::std::size_t __max_portable_dyn_smem_size = 48 * 1024;
 /**
 * @brief Launch option specifying dynamic shared memory configuration
 *
-* This launch option causes the launch to allocate amount of shared memory sufficient
-* to store the specified number of object of the specified type.
+* This launch option causes the launch to allocate amount of shared memory
+* sufficient to store the specified number of object of the specified type.
 * This type can be constructed with dynamic_shared_memory helper function.
 *
-* When launch configuration contains this option, that configuration can be then
-* passed to dynamic_shared_memory_view to get the view_type over the dynamic shared memory.
-* It is also possible to obtain that memory through the original
-* extern __shared__ variable[] declaration.
+* When launch configuration contains this option, that configuration can be
+* then passed to dynamic_shared_memory_view to get the view_type over the
+* dynamic shared memory. It is also possible to obtain that memory through the
+* original extern __shared__ variable[] declaration.
 *
 * CUDA guarantees that each device has at least 48kB of shared memory
 * per block, but most devices have more than that.
@@ -209,7 +210,8 @@ inline constexpr ::cuda::std::size_t __max_portable_dyn_smem_size = 48 * 1024;
 *
 * void kernel_launch(cuda::stream_ref stream) {
 *   auto dims = cudax::make_hierarchy(cudax::block<128>(), cudax::grid(4));
-*   auto conf = cudax::make_configuration(dims, dynamic_shared_memory<int[128]>());
+*   auto conf = cudax::make_configuration(dims,
+*                                         dynamic_shared_memory<int[128]>());
 *
 *   cudax::launch(stream, conf, kernel);
 * }
@@ -224,7 +226,8 @@ inline constexpr ::cuda::std::size_t __max_portable_dyn_smem_size = 48 * 1024;
 *   or cuda::std::dynamic_extent, if its dynamic
 *
 * @tparam NonPortableSize
-*   Needs to be enabled to exceed the portable limit of 48kB of shared memory per block
+*   Needs to be enabled to exceed the portable limit of 48kB of shared memory
+*   per block
 */
 template <class _Tp>
 class _CCCL_DECLSPEC_EMPTY_BASES dynamic_shared_memory
@@ -234,14 +237,17 @@ class _CCCL_DECLSPEC_EMPTY_BASES dynamic_shared_memory
   using __base_type = __dyn_smem_option_base<_Tp>;
 
   static_assert(::cuda::std::rank_v<_Tp> <= 1,
-                "multidimensional arrays cannot be used with dynamic shared memory option");
+                "multidimensional arrays cannot be used with dynamic shared "
+                "memory option");
   static_assert(!::cuda::std::is_const_v<typename __base_type::value_type>, "the value type cannot be const");
   static_assert(!::cuda::std::is_reference_v<typename __base_type::value_type>, "the value type cannot be a reference");
 
 public:
-  bool __non_portable_{}; //!< \c true if the object was created with non_portable flag.
+  bool __non_portable_{}; //!< \c true if the object was created with
+                          //!< non_portable flag.
 
-  using typename __base_type::value_type; //!< Value type of the dynamic shared memory elements.
+  using typename __base_type::value_type; //!< Value type of the dynamic shared
+                                          //!< memory elements.
   using typename __base_type::view_type; //!< The view type returned by the
                                          //!< cuda::device::dynamic_shared_memory_view(config).
 
@@ -321,7 +327,8 @@ template <class _Tp>
 {
   ::cudaError_t __status = ::cudaSuccess;
 
-  // Since CUDA 12.4, querying CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES requires the function to be loaded.
+  // Since CUDA 12.4, querying CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES requires the
+  // function to be loaded.
   if (::cuda::__driver::__version_at_least(12, 4))
   {
     __status = ::cuda::__driver::__functionLoadNoThrow(__kernel);
@@ -376,9 +383,10 @@ template <class _Tp>
 /**
 * @brief Launch option specifying launch priority
 *
-* This launch option causes the launched grid to be scheduled with the specified priority.
-* More about stream priorities and valid values can be found in the CUDA programming guide
-* `here <https://docs.nvidia.com/cuda/cuda-c-programming-guide/#stream-priorities>`_
+* This launch option causes the launched grid to be scheduled with the
+* specified priority. More about stream priorities and valid values can be
+* found in the CUDA programming guide `here
+* <https://docs.nvidia.com/cuda/cuda-c-programming-guide/#stream-priorities>`_
 */
 struct launch_priority : public __detail::launch_option
 {
@@ -444,11 +452,12 @@ _CCCL_CONCEPT __kernel_has_default_config =
 /**
 * @brief Type describing a kernel launch configuration
 *
-* This type should not be constructed directly and make_config helper function should be used instead
+* This type should not be constructed directly and make_config helper function
+* should be used instead
 *
 * @tparam Dimensions
-*   cuda::experimental::hierarchy_dimensions instance that describes dimensions of thread hierarchy in this
-*   configuration object
+*   cuda::experimental::hierarchy_dimensions instance that describes dimensions
+*   of thread hierarchy in this configuration object
 *
 * @tparam Options
 *   Types of options that were added to this configuration object
@@ -472,8 +481,8 @@ struct kernel_config
  /**
  * @brief Add a new option to this configuration
  *
-  * Returns a new kernel_config that has all option and dimensions from this kernel_config
-  * with the option from the argument added to it
+  * Returns a new kernel_config that has all option and dimensions from this
+  * kernel_config with the option from the argument added to it
  *
  * @param new_option
  *   Option to be added to the configuration
@@ -488,34 +497,42 @@ struct kernel_config
  /**
  * @brief Combine this configuration with another configuration object
  *
-  * Returns a new `kernel_config` that is a combination of this configuration and the configuration from argument.
-  * It contains dimensions that are combination of dimensions in this object and the other configuration. The resulting
-  * hierarchy holds levels present in both hierarchies. In case of overlap of levels hierarchy from this configuration
-  * is prioritized, so the result always holds all levels from this hierarchy and non-overlapping
-  * levels from the other hierarchy. This behavior is the same as `combine()` member function of the hierarchy type.
-  * The result also contains configuration options from both configurations. In case the same type of a configuration
-  * option is present in both configuration this configuration is copied into the resulting configuration.
+  * Returns a new `kernel_config` that is a combination of this configuration
+  * and the configuration from argument. It contains dimensions that are
+  * combination of dimensions in this object and the other configuration. The
+  * resulting hierarchy holds levels present in both hierarchies. In case of
+  * overlap of levels hierarchy from this configuration is prioritized, so the
+  * result always holds all levels from this hierarchy and non-overlapping
+  * levels from the other hierarchy. This behavior is the same as `combine()`
+  * member function of the hierarchy type. The result also contains
+  * configuration options from both configurations. In case the same type of a
+  * configuration option is present in both configuration this configuration is
+  * copied into the resulting configuration.
  *
  * @param __other_config
  *   Other configuration to combine with this configuration
  */
  template <typename _OtherDimensions, typename... _OtherOptions>
  [[nodiscard]] auto combine(const kernel_config<_OtherDimensions, _OtherOptions...>& __other_config) const
  {
-    // can't use fully qualified kernel_config name here because of nvcc bug, TODO remove __make_config_from_tuple once
-    // fixed
+    // can't use fully qualified kernel_config name here because of nvcc bug,
+    // TODO remove __make_config_from_tuple once fixed
    return __make_config_from_tuple(
      dims.combine(__other_config.dims),
      ::cuda::std::tuple_cat(options, ::cuda::std::apply(__filter_options<Options...>{}, __other_config.options)));
  }
 
  /**
-  * @brief Combine this configuration with default configuration of a kernel functor
+  * @brief Combine this configuration with default configuration of a kernel
+  * functor
  *
-  * Returns a new `kernel_config` that is a combination of this configuration and a default configuration from the
-  * kernel argument. Default configuration is a `kernel_config` object returned from `default_config()` member function
-  * of the kernel type. The configurations are combined using the `combine()` member function of this configuration.
-  * If the kernel has no default configuration, a copy of this configuration is returned without any changes.
+  * Returns a new `kernel_config` that is a combination of this configuration
+  * and a default configuration from the kernel argument. Default configuration
+  * is a `kernel_config` object returned from `default_config()` member
+  * function of the kernel type. The configurations are combined using the
+  * `combine()` member function of this configuration. If the kernel has no
+  * default configuration, a copy of this configuration is returned without any
+  * changes.
  *
  * @param __kernel
  *   Kernel functor to search for the default configuration
@@ -533,18 +550,22 @@ struct kernel_config
     }
   }
 };
+} // namespace cuda::experimental
 
-// We can consider removing the operator&, but its convenient for in-line construction
+_CCCL_BEGIN_NAMESPACE_CUDA
+
+// We can consider removing the operator&, but its convenient for in-line
+// construction
 template <typename Dimensions, typename... Options, typename NewLevel>
 _CCCL_HOST_API constexpr auto
-operator&(const kernel_config<Dimensions, Options...>& config, const NewLevel& new_level) noexcept
+operator&(const experimental::kernel_config<Dimensions, Options...>& config, const NewLevel& new_level) noexcept
 {
   return kernel_config(hierarchy_add_level(config.dims, new_level), config.options);
 }
 
 template <typename NewLevel, typename Dimensions, typename... Options>
 _CCCL_HOST_API constexpr auto
-operator&(const NewLevel& new_level, const kernel_config<Dimensions, Options...>& config) noexcept
+operator&(const NewLevel& new_level, const experimental::kernel_config<Dimensions, Options...>& config) noexcept
 {
   return kernel_config(hierarchy_add_level(config.dims, new_level), config.options);
 }
@@ -553,9 +574,13 @@ template <typename L1, typename Dims1, typename L2, typename Dims2>
 _CCCL_HOST_API constexpr auto
 operator&(const level_dimensions<L1, Dims1>& l1, const level_dimensions<L2, Dims2>& l2) noexcept
 {
-  return kernel_config(make_hierarchy(l1, l2));
+  return experimental::kernel_config(cuda::make_hierarchy(l1, l2));
 }
 
+_CCCL_END_NAMESPACE_CUDA
+
+namespace cuda::experimental
+{
 template <typename _Dimensions, typename... _Options>
 auto __make_config_from_tuple(const _Dimensions& __dims, const ::cuda::std::tuple<_Options...>& __opts)
 {
@@ -583,15 +608,18 @@ template <typename... Levels,
 /**
 * @brief Construct kernel configuration
 *
-* This function takes thread hierarchy dimensions description and any number of launch options and combines
-* them into kernel configuration object. It can be then used along with kernel function and its argument to launch
-* that kernel with the specified dimensions and options
+* This function takes thread hierarchy dimensions description and any number of
+* launch options and combines them into kernel configuration object. It can be
+* then used along with kernel function and its argument to launch that kernel
+* with the specified dimensions and options
 *
 * @param dims
-*   Object describing dimensions of the thread hierarchy in the resulting kernel configuration object
+*   Object describing dimensions of the thread hierarchy in the resulting kernel
+*   configuration object
 *
 * @param opts
-*   Variadic number of launch configuration options to be included in the resulting kernel configuration object
+*   Variadic number of launch configuration options to be included in the
+*   resulting kernel configuration object
 */
 template <typename BottomUnit, typename... Levels, typename... Opts>
 [[nodiscard]] constexpr auto
@@ -601,8 +629,8 @@ make_config(const hierarchy_dimensions<BottomUnit, Levels...>& dims, const Opts&
 }
 
 /**
-* @brief A shorthand for creating a kernel configuration with a hierarchy of CUDA threads evenly
-* distributing elements among blocks and threads.
+* @brief A shorthand for creating a kernel configuration with a hierarchy of
+* CUDA threads evenly distributing elements among blocks and threads.
 *
 * @par Snippet
 * @code
@@ -615,7 +643,8 @@ make_config(const hierarchy_dimensions<BottomUnit, Levels...>& dims, const Opts&
 * // Equivalent to:
 * constexpr int threadsPerBlock = 256;
 * int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
-* auto dims = make_hierarchy(grid_dims(blocksPerGrid), block_dims<threadsPerBlock>());
+* auto dims = make_hierarchy(grid_dims(blocksPerGrid),
+*                            block_dims<threadsPerBlock>());
 * @endcode
 */
 template <int _ThreadsPerBlock>
@@ -685,7 +714,8 @@ template <typename Dimensions, typename... Options>
 
   ::cuda::std::apply(
     [&](auto&... config_options) {
-      // Use short-cutting && to skip the rest on error, is this too convoluted?
+      // Use short-cutting && to skip the rest on error, is this too
+      // convoluted?
      (void) (... && [&](cudaError_t call_status) {
        status = call_status;
        return call_status == cudaSuccess;
@@ -704,7 +734,8 @@ template <typename Dimensions, typename... Options>
 
   ::cuda::std::apply(
     [&](auto&... config_options) {
-      // Use short-cutting && to skip the rest on error, is this too convoluted?
+      // Use short-cutting && to skip the rest on error, is this too
+      // convoluted?
      (void) (... && [&](cudaError_t call_status) {
        status = call_status;
        return call_status == cudaSuccess;

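Putting the namespace split above together: the hierarchy type and the operator& overloads now live in namespace cuda, while kernel_config, make_config, and the launch options stay in cuda::experimental. A hedged usage sketch follows; the header paths, factory spellings, and the launch call mirror the doc snippets in this diff, and any detail beyond them is an assumption.

// Sketch of constructing and using a launch configuration after this commit.
#include <cuda/experimental/launch.cuh>
#include <cuda/stream_ref>

namespace cudax = cuda::experimental;

__global__ void kernel() {}

void configure_and_launch_sketch(cuda::stream_ref stream)
{
  // Dimensions come from the hierarchy type that now lives in libcu++.
  auto dims = cuda::make_hierarchy(cuda::grid_dims(4), cuda::block_dims<128>());

  // Options such as dynamic shared memory remain cudax launch options.
  auto conf = cudax::make_config(dims, cudax::dynamic_shared_memory<int[128]>());

  cudax::launch(stream, conf, kernel);
}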
cudax/include/cuda/experimental/hierarchy.cuh

Lines changed: 0 additions & 16 deletions
This file was deleted.

cudax/test/CMakeLists.txt

Lines changed: 0 additions & 6 deletions
@@ -60,12 +60,6 @@ foreach (cudax_target IN LISTS cudax_TARGETS)
   add_custom_target(${config_meta_target})
   add_dependencies(${config_prefix}.all ${config_meta_target})
 
-  # Add tests:
-  cudax_add_catch2_test(test_target hierarchy ${cudax_target}
-    hierarchy/hierarchy_smoke.cu
-    hierarchy/hierarchy_custom_types.cu
-  )
-
   cudax_add_catch2_test(test_target launch ${cudax_target}
     launch/launch_smoke.cu
   )
