#define _CUDAX__LAUNCH_CONFIGURATION_CUH

#include <cuda/__driver/driver_api.h>
+ #include <cuda/__hierarchy/hierarchy_dimensions.h>
#include <cuda/__numeric/overflow_cast.h>
#include <cuda/__ptx/instructions/get_sreg.h>
#include <cuda/std/__cstddef/types.h>
#include <cuda/std/tuple>

#include <cuda/experimental/__detail/utility.cuh>
- #include <cuda/experimental/hierarchy.cuh>

#include <cuda/std/__cccl/prologue.h>

@@ -95,10 +95,11 @@ inline constexpr bool no_duplicate_options<Option, Rest...> =
 * @brief Launch option enabling cooperative launch
 *
 * This launch option causes the launched grid to be restricted to a number of
- * blocks that can simultaneously execute on the device. It means that every thread
- * in the launched grid can eventually observe execution of each other thread in the grid.
- * It also enables usage of cooperative_groups::grid_group::sync() function, that
- * synchronizes all threads in the grid.
+ * blocks that can simultaneously execute on the device. It means that every
+ * thread in the launched grid can eventually observe execution of each other
+ * thread in the grid. It also enables usage of
+ * cooperative_groups::grid_group::sync() function, that synchronizes all
+ * threads in the grid.
 *
 * @par Snippet
 * @code
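For orientation, a minimal usage sketch of the cooperative launch option (illustrative only, not part of this change; the kernel body, the dimensions, and the cudax namespace alias are assumed):

__global__ void kernel()
{
  // Grid-wide synchronization is only valid for cooperatively launched grids.
  cooperative_groups::this_grid().sync();
}

void launch_cooperatively(cuda::stream_ref stream)
{
  auto dims = cudax::make_hierarchy(cudax::grid_dims(4), cudax::block_dims<256>());
  auto conf = cudax::make_config(dims, cudax::cooperative_launch());
  cudax::launch(stream, conf, kernel);
}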
@@ -181,14 +182,14 @@ inline constexpr ::cuda::std::size_t __max_portable_dyn_smem_size = 48 * 1024;
/**
 * @brief Launch option specifying dynamic shared memory configuration
 *
- * This launch option causes the launch to allocate amount of shared memory sufficient
- * to store the specified number of object of the specified type.
+ * This launch option causes the launch to allocate amount of shared memory
+ * sufficient to store the specified number of object of the specified type.
 * This type can be constructed with dynamic_shared_memory helper function.
 *
- * When launch configuration contains this option, that configuration can be then
- * passed to dynamic_shared_memory_view to get the view_type over the dynamic shared memory.
- * It is also possible to obtain that memory through the original
- * extern __shared__ variable[] declaration.
+ * When launch configuration contains this option, that configuration can be
+ * then passed to dynamic_shared_memory_view to get the view_type over the
+ * dynamic shared memory. It is also possible to obtain that memory through the
+ * original extern __shared__ variable[] declaration.
 *
 * CUDA guarantees that each device has at least 48kB of shared memory
 * per block, but most devices have more than that.
@@ -209,7 +210,8 @@ inline constexpr ::cuda::std::size_t __max_portable_dyn_smem_size = 48 * 1024;
 *
 * void kernel_launch(cuda::stream_ref stream) {
 * auto dims = cudax::make_hierarchy(cudax::block<128>(), cudax::grid(4));
- * auto conf = cudax::make_configuration(dims, dynamic_shared_memory<int[128]>());
+ * auto conf = cudax::make_configuration(dims,
+ * dynamic_shared_memory<int[128]>());
 *
 * cudax::launch(stream, conf, kernel);
 * }
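A kernel-side sketch of reading that memory through the view (illustrative; it assumes the kernel receives the launch configuration as its first parameter, which is how the dynamic_shared_memory_view call documented further down is meant to be used):

template <typename Config>
__global__ void kernel(Config conf)
{
  // View over the dynamic shared memory described by the configuration.
  auto smem = cuda::device::dynamic_shared_memory_view(conf);
  smem[threadIdx.x] = static_cast<int>(threadIdx.x);
}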
@@ -224,7 +226,8 @@ inline constexpr ::cuda::std::size_t __max_portable_dyn_smem_size = 48 * 1024;
 * or cuda::std::dynamic_extent, if its dynamic
 *
 * @tparam NonPortableSize
- * Needs to be enabled to exceed the portable limit of 48kB of shared memory per block
+ * Needs to be enabled to exceed the portable limit of 48kB of shared memory
+ * per block
 */
template <class _Tp>
class _CCCL_DECLSPEC_EMPTY_BASES dynamic_shared_memory
@@ -234,14 +237,17 @@ class _CCCL_DECLSPEC_EMPTY_BASES dynamic_shared_memory
 using __base_type = __dyn_smem_option_base<_Tp>;

 static_assert(::cuda::std::rank_v<_Tp> <= 1,
- "multidimensional arrays cannot be used with dynamic shared memory option");
+ "multidimensional arrays cannot be used with dynamic shared "
+ "memory option");
 static_assert(!::cuda::std::is_const_v<typename __base_type::value_type>, "the value type cannot be const");
 static_assert(!::cuda::std::is_reference_v<typename __base_type::value_type>, "the value type cannot be a reference");

public:
- bool __non_portable_{}; //!< \c true if the object was created with non_portable flag.
+ bool __non_portable_{}; //!< \c true if the object was created with
+ //!< non_portable flag.

- using typename __base_type::value_type; //!< Value type of the dynamic shared memory elements.
+ using typename __base_type::value_type; //!< Value type of the dynamic shared
+ //!< memory elements.
 using typename __base_type::view_type; //!< The view type returned by the
 //!< cuda::device::dynamic_shared_memory_view(config).

@@ -321,7 +327,8 @@ template <class _Tp>
{
 ::cudaError_t __status = ::cudaSuccess;

- // Since CUDA 12.4, querying CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES requires the function to be loaded.
+ // Since CUDA 12.4, querying CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES requires the
+ // function to be loaded.
 if (::cuda::__driver::__version_at_least(12, 4))
 {
 __status = ::cuda::__driver::__functionLoadNoThrow(__kernel);
@@ -376,9 +383,10 @@ template <class _Tp>
/**
 * @brief Launch option specifying launch priority
 *
- * This launch option causes the launched grid to be scheduled with the specified priority.
- * More about stream priorities and valid values can be found in the CUDA programming guide
- * `here <https://docs.nvidia.com/cuda/cuda-c-programming-guide/#stream-priorities>`_
+ * This launch option causes the launched grid to be scheduled with the
+ * specified priority. More about stream priorities and valid values can be
+ * found in the CUDA programming guide `here
+ * <https://docs.nvidia.com/cuda/cuda-c-programming-guide/#stream-priorities>`_
 */
struct launch_priority : public __detail::launch_option
{
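A minimal usage sketch of this option (illustrative; the priority value is an example and the valid range is device dependent, see the stream-priorities link above):

auto dims = cudax::make_hierarchy(cudax::grid_dims(4), cudax::block_dims<256>());
auto conf = cudax::make_config(dims, cudax::launch_priority(1));
cudax::launch(stream, conf, kernel);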
@@ -444,11 +452,12 @@ _CCCL_CONCEPT __kernel_has_default_config =
/**
 * @brief Type describing a kernel launch configuration
 *
- * This type should not be constructed directly and make_config helper function should be used instead
+ * This type should not be constructed directly and make_config helper function
+ * should be used instead
 *
 * @tparam Dimensions
- * cuda::experimental::hierarchy_dimensions instance that describes dimensions of thread hierarchy in this
- * configuration object
+ * cuda::experimental::hierarchy_dimensions instance that describes dimensions
+ * of thread hierarchy in this configuration object
 *
 * @tparam Options
 * Types of options that were added to this configuration object
@@ -472,8 +481,8 @@ struct kernel_config
 /**
 * @brief Add a new option to this configuration
 *
- * Returns a new kernel_config that has all option and dimensions from this kernel_config
- * with the option from the argument added to it
+ * Returns a new kernel_config that has all option and dimensions from this
+ * kernel_config with the option from the argument added to it
 *
 * @param new_option
 * Option to be added to the configuration
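A minimal sketch of add() (illustrative; dims and the option values are assumed, the option types are the ones defined in this file):

auto base          = cudax::make_config(dims);                       // no options yet
auto with_priority = base.add(cudax::launch_priority(2));            // new config, base is unchanged
auto with_both     = with_priority.add(cudax::cooperative_launch());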
@@ -488,34 +497,42 @@ struct kernel_config
 /**
 * @brief Combine this configuration with another configuration object
 *
- * Returns a new `kernel_config` that is a combination of this configuration and the configuration from argument.
- * It contains dimensions that are combination of dimensions in this object and the other configuration. The resulting
- * hierarchy holds levels present in both hierarchies. In case of overlap of levels hierarchy from this configuration
- * is prioritized, so the result always holds all levels from this hierarchy and non-overlapping
- * levels from the other hierarchy. This behavior is the same as `combine()` member function of the hierarchy type.
- * The result also contains configuration options from both configurations. In case the same type of a configuration
- * option is present in both configuration this configuration is copied into the resulting configuration.
+ * Returns a new `kernel_config` that is a combination of this configuration
+ * and the configuration from argument. It contains dimensions that are
+ * combination of dimensions in this object and the other configuration. The
+ * resulting hierarchy holds levels present in both hierarchies. In case of
+ * overlap of levels hierarchy from this configuration is prioritized, so the
+ * result always holds all levels from this hierarchy and non-overlapping
+ * levels from the other hierarchy. This behavior is the same as `combine()`
+ * member function of the hierarchy type. The result also contains
+ * configuration options from both configurations. In case the same type of a
+ * configuration option is present in both configuration this configuration is
+ * copied into the resulting configuration.
 *
 * @param __other_config
 * Other configuration to combine with this configuration
 */
 template <typename _OtherDimensions, typename... _OtherOptions>
 [[nodiscard]] auto combine(const kernel_config<_OtherDimensions, _OtherOptions...>& __other_config) const
 {
- // can't use fully qualified kernel_config name here because of nvcc bug, TODO remove __make_config_from_tuple once
- // fixed
+ // can't use fully qualified kernel_config name here because of nvcc bug,
+ // TODO remove __make_config_from_tuple once fixed
 return __make_config_from_tuple(
 dims.combine(__other_config.dims),
 ::cuda::std::tuple_cat(options, ::cuda::std::apply(__filter_options<Options...>{}, __other_config.options)));
 }

 /**
- * @brief Combine this configuration with default configuration of a kernel functor
+ * @brief Combine this configuration with default configuration of a kernel
+ * functor
 *
- * Returns a new `kernel_config` that is a combination of this configuration and a default configuration from the
- * kernel argument. Default configuration is a `kernel_config` object returned from `default_config()` member function
- * of the kernel type. The configurations are combined using the `combine()` member function of this configuration.
- * If the kernel has no default configuration, a copy of this configuration is returned without any changes.
+ * Returns a new `kernel_config` that is a combination of this configuration
+ * and a default configuration from the kernel argument. Default configuration
+ * is a `kernel_config` object returned from `default_config()` member
+ * function of the kernel type. The configurations are combined using the
+ * `combine()` member function of this configuration. If the kernel has no
+ * default configuration, a copy of this configuration is returned without any
+ * changes.
 *
 * @param __kernel
 * Kernel functor to search for the default configuration
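A minimal sketch of the combine() precedence rules described above (illustrative; the level helpers and option values are assumed):

auto a = cudax::make_config(cudax::make_hierarchy(cudax::block_dims<128>()), cudax::launch_priority(2));
auto b = cudax::make_config(cudax::make_hierarchy(cudax::grid_dims(4), cudax::block_dims<256>()), cudax::launch_priority(0));
auto c = a.combine(b);
// c keeps the block level from a (128 threads), adds the grid level that only b has,
// and keeps a's launch_priority because the same option type appears in both configurations.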
@@ -533,18 +550,22 @@ struct kernel_config
 }
 }
};
+ } // namespace cuda::experimental

- // We can consider removing the operator&, but its convenient for in-line construction
+ _CCCL_BEGIN_NAMESPACE_CUDA
+
+ // We can consider removing the operator&, but its convenient for in-line
+ // construction
template <typename Dimensions, typename... Options, typename NewLevel>
_CCCL_HOST_API constexpr auto
- operator&(const kernel_config<Dimensions, Options...>& config, const NewLevel& new_level) noexcept
+ operator&(const experimental::kernel_config<Dimensions, Options...>& config, const NewLevel& new_level) noexcept
{
 return kernel_config(hierarchy_add_level(config.dims, new_level), config.options);
}

template <typename NewLevel, typename Dimensions, typename... Options>
_CCCL_HOST_API constexpr auto
- operator&(const NewLevel& new_level, const kernel_config<Dimensions, Options...>& config) noexcept
+ operator&(const NewLevel& new_level, const experimental::kernel_config<Dimensions, Options...>& config) noexcept
{
 return kernel_config(hierarchy_add_level(config.dims, new_level), config.options);
}
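A minimal sketch of the in-line construction these operators allow (illustrative; the level helpers are the ones used elsewhere in this file):

// Combining two levels yields a kernel_config; more levels or an existing
// config can be appended with further uses of &.
auto conf = cudax::grid_dims(4) & cudax::block_dims<256>();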
@@ -553,9 +574,13 @@ template <typename L1, typename Dims1, typename L2, typename Dims2>
_CCCL_HOST_API constexpr auto
operator&(const level_dimensions<L1, Dims1>& l1, const level_dimensions<L2, Dims2>& l2) noexcept
{
- return kernel_config(make_hierarchy(l1, l2));
+ return experimental::kernel_config(cuda::make_hierarchy(l1, l2));
}

+ _CCCL_END_NAMESPACE_CUDA
+
+ namespace cuda::experimental
+ {
template <typename _Dimensions, typename... _Options>
auto __make_config_from_tuple(const _Dimensions& __dims, const ::cuda::std::tuple<_Options...>& __opts)
{
@@ -583,15 +608,18 @@ template <typename... Levels,
/**
 * @brief Construct kernel configuration
 *
- * This function takes thread hierarchy dimensions description and any number of launch options and combines
- * them into kernel configuration object. It can be then used along with kernel function and its argument to launch
- * that kernel with the specified dimensions and options
+ * This function takes thread hierarchy dimensions description and any number of
+ * launch options and combines them into kernel configuration object. It can be
+ * then used along with kernel function and its argument to launch that kernel
+ * with the specified dimensions and options
 *
 * @param dims
- * Object describing dimensions of the thread hierarchy in the resulting kernel configuration object
+ * Object describing dimensions of the thread hierarchy in the resulting kernel
+ * configuration object
 *
 * @param opts
- * Variadic number of launch configuration options to be included in the resulting kernel configuration object
+ * Variadic number of launch configuration options to be included in the
+ * resulting kernel configuration object
 */
template <typename BottomUnit, typename... Levels, typename... Opts>
[[nodiscard]] constexpr auto
@@ -601,8 +629,8 @@ make_config(const hierarchy_dimensions<BottomUnit, Levels...>& dims, const Opts&
}

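A minimal usage sketch of make_config with several options (illustrative; the kernel, its arguments, and the stream are assumed):

auto dims = cudax::make_hierarchy(cudax::grid_dims(4), cudax::block_dims<256>());
auto conf = cudax::make_config(dims, cudax::cooperative_launch(), cudax::launch_priority(1));
cudax::launch(stream, conf, kernel, arg); // arg stands in for whatever arguments kernel takes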
/**
- * @brief A shorthand for creating a kernel configuration with a hierarchy of CUDA threads evenly
- * distributing elements among blocks and threads.
+ * @brief A shorthand for creating a kernel configuration with a hierarchy of
+ * CUDA threads evenly distributing elements among blocks and threads.
 *
 * @par Snippet
 * @code
@@ -615,7 +643,8 @@ make_config(const hierarchy_dimensions<BottomUnit, Levels...>& dims, const Opts&
 * // Equivalent to:
 * constexpr int threadsPerBlock = 256;
 * int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
- * auto dims = make_hierarchy(grid_dims(blocksPerGrid), block_dims<threadsPerBlock>());
+ * auto dims = make_hierarchy(grid_dims(blocksPerGrid),
+ * block_dims<threadsPerBlock>());
 * @endcode
 */
template <int _ThreadsPerBlock>
@@ -685,7 +714,8 @@ template <typename Dimensions, typename... Options>

 ::cuda::std::apply(
 [&](auto&... config_options) {
- // Use short-cutting && to skip the rest on error, is this too convoluted?
+ // Use short-cutting && to skip the rest on error, is this too
+ // convoluted?
 (void) (... && [&](cudaError_t call_status) {
 status = call_status;
 return call_status == cudaSuccess;
@@ -704,7 +734,8 @@ template <typename Dimensions, typename... Options>

 ::cuda::std::apply(
 [&](auto&... config_options) {
- // Use short-cutting && to skip the rest on error, is this too convoluted?
+ // Use short-cutting && to skip the rest on error, is this too
+ // convoluted?
 (void) (... && [&](cudaError_t call_status) {
 status = call_status;
 return call_status == cudaSuccess;