@@ -41,14 +41,18 @@ _CCCL_DIAG_POP
4141# include < cuda/std/__exception/cuda_error.h>
4242# include < cuda/std/__execution/env.h>
4343# include < cuda/std/__execution/policy.h>
44+ # include < cuda/std/__functional/invoke.h>
4445# include < cuda/std/__iterator/distance.h>
4546# include < cuda/std/__iterator/iterator_traits.h>
47+ # include < cuda/std/__iterator/next.h>
4648# include < cuda/std/__memory/addressof.h>
4749# include < cuda/std/__memory/construct_at.h>
4850# include < cuda/std/__new/bad_alloc.h>
4951# include < cuda/std/__numeric/reduce.h>
5052# include < cuda/std/__pstl/dispatch.h>
5153# include < cuda/std/__type_traits/always_false.h>
54+ # include < cuda/std/__type_traits/is_nothrow_constructible.h>
55+ # include < cuda/std/__utility/forward.h>
5256# include < cuda/std/__utility/move.h>
5357
5458# include < cuda_runtime.h>
@@ -76,7 +80,8 @@ struct __pstl_dispatch<__pstl_algorithm::__reduce, __execution_backend::__cuda>
7680 {}
7781
// Call operator of the functor that writes the reduction result (presumably
// `__construct_result`, which is what `__get_result_iter` wraps -- confirm against the full file).
// Diff view: the "-" line is the old signature, the two "+" lines are its replacement.
// The first (unnamed) index argument is ignored; `__value` is perfectly forwarded into
// `::cuda::std::__construct_at` together with `__ptr_`.
// The new signature adds `_CCCL_FORCEINLINE` and a conditional `noexcept` whose condition is
// `is_nothrow_constructible_v<_Tp, _Up>`.
7882 template <class _Index , class _Up >
79- _CCCL_DEVICE_API void operator ()(_Index, _Up&& __value)
83+ _CCCL_DEVICE_API _CCCL_FORCEINLINE void
84+ operator ()(_Index, _Up&& __value) noexcept (is_nothrow_constructible_v<_Tp, _Up>)
8085 {
8186 ::cuda::std::__construct_at (__ptr_, ::cuda::std::forward<_Up>(__value));
8287 }
@@ -97,57 +102,66 @@ struct __pstl_dispatch<__pstl_algorithm::__reduce, __execution_backend::__cuda>
// Destructor of `__allocation_guard`: returns the buffer to `__resource_` by calling
// `deallocate` with `__stream_`, `__ptr_`, `__num_bytes_` and `alignof(_Tp)`.
// Diff view: the "-" line is the `__stream_.sync()` that used to follow the deallocation and
// is removed by this change; on the "+" side `__par_impl` calls `__stream.sync()` itself
// before returning.
// NOTE(review): `deallocate` presumably enqueues the release on `__stream_` (stream-ordered),
// which is what makes dropping the sync here safe -- confirm against the resource interface.
97102 _CCCL_HOST_API ~__allocation_guard ()
98103 {
99104 __resource_.deallocate (__stream_, __ptr_, __num_bytes_, alignof (_Tp));
100- __stream_.sync ();
101105 }
102106
// Returns the output iterator that `__par_impl` passes to `::cub::DeviceReduce::Reduce`.
//  - If `__can_optimize_construct_at<_Tp, _AccumT>` is true: the raw pointer `__ptr_`.
//  - Otherwise: a `::cuda::tabulate_output_iterator` wrapping `__construct_result{__ptr_}`,
//    so that the result is written through that functor instead of by plain assignment.
// Diff view: the "-" lines are the old version, which `reinterpret_cast` `__ptr_` to `_Tp*`;
// the "+" lines use `__ptr_` directly and mark the function `noexcept`.
// NOTE(review): dropping the cast implies `__ptr_` is now declared as `_Tp*`; the member
// declaration is outside this view -- confirm.
103- [[nodiscard]] _CCCL_HOST_API auto __get_result_iter ()
107+ [[nodiscard]] _CCCL_HOST_API auto __get_result_iter () noexcept
104108 {
105109 if constexpr (::cuda::std::__detail::__can_optimize_construct_at<_Tp, _AccumT>)
106110 {
107- return reinterpret_cast <_Tp*>( __ptr_) ;
111+ return __ptr_;
108112 }
109113 else
110114 {
111- return ::cuda::tabulate_output_iterator{__construct_result{reinterpret_cast <_Tp*>( __ptr_) }};
115+ return ::cuda::tabulate_output_iterator{__construct_result{__ptr_}};
112116 }
113117 }
114118
// Returns, as `void*`, the address that follows the result slot at the start of the
// allocation; `__par_impl` passes it to CUB as the temporary-storage pointer.
// Diff view: the old ("-") version advanced an `unsigned char*` by `sizeof(_Tp)` bytes; the
// new ("+") version writes `__ptr_ + 1` and marks the function `noexcept`.
// NOTE(review): the two forms are equivalent only if `__ptr_` is a `_Tp*` (declaration not
// visible here -- confirm). The returned address is only guaranteed to be aligned for `_Tp`;
// any stricter alignment the temporary storage may need is not established in this view.
115- [[nodiscard]] _CCCL_HOST_API void * __get_temp_storage ()
119+ [[nodiscard]] _CCCL_HOST_API void * __get_temp_storage () noexcept
116120 {
117- return static_cast <void *>(reinterpret_cast < unsigned char *>( __ptr_) + sizeof (_Tp) );
121+ return static_cast <void *>(__ptr_ + 1 );
118122 }
119123 };
120124
// Parallel reduction implementation of the CUDA backend.
// Diff view: "-" lines are the old iterator-pair version (`__first`, `__last`), "+" lines the
// new count-based version (`__first`, `__count`).
// Visible steps of the new version:
//  1. Call `::cub::DeviceReduce::Reduce` with a null temp-storage pointer so that the required
//     temporary storage size is reported in `__num_bytes`.
//  2. Obtain the stream and the memory resource through `::cuda::__call_or` with `__policy`;
//     the other arguments are `cudaStreamPerThread` and the default memory pool of the
//     stream's device (presumably used as fallbacks when the policy provides neither -- confirm).
//  3. In a nested scope, create an `__allocation_guard` and run the reduction on `__stream`,
//     writing through `__guard.__get_result_iter()`.
//  4. Call `__stream.sync()` and return `__ret`.
// On the "+" side both CUB calls go through `_CCCL_TRY_CUDA_API` with an error message.
// NOTE(review): `_CCCL_TRY_CUDA_API` presumably throws `::cuda::cuda_error` on failure (the
// caller catches that type) -- confirm against the macro definition.
// NOTE(review): `_Tp __ret;` default-initializes the result, so `_Tp` must be
// default-initializable; the code that fills `__ret` is in the lines elided by the hunk
// boundary further down.
121- template <class _Policy , class _Iter , class _Tp , class _BinaryOp >
125+ template <class _Policy , class _Iter , class _Size , class _Tp , class _BinaryOp >
122126 [[nodiscard]] _CCCL_HOST_API static _Tp
123- __par_impl ([[maybe_unused]] const _Policy& __policy, _Iter __first, _Iter __last , _Tp __init, _BinaryOp __func)
127+ __par_impl ([[maybe_unused]] const _Policy& __policy, _Iter __first, _Size __count , _Tp __init, _BinaryOp __func)
124128 {
125129 _Tp __ret;
126130
131+ // We need to know the accumulator type to determine whether we need construct_at for the return value
132+ using _AccumT = __accumulator_t <_BinaryOp, iter_reference_t <_Iter>, _Tp>;
133+
134+ // Determine temporary device storage requirements for reduce
135+ void * __temp_storage = nullptr ;
136+ size_t __num_bytes = 0 ;
// Size query: no stream argument is passed here, unlike the actual reduction call below.
137+ _CCCL_TRY_CUDA_API (
138+ ::cub::DeviceReduce::Reduce,
139+ " __pstl_cuda_reduce: determination of device storage for cub::DeviceReduce::Reduce failed" ,
140+ __temp_storage,
141+ __num_bytes,
142+ __first,
143+ static_cast <_Tp*>(nullptr ),
144+ __count,
145+ __func,
146+ __init);
147+
148+ // Allocate memory for result
149+ auto __stream = ::cuda::__call_or (::cuda::get_stream, ::cuda::stream_ref{cudaStreamPerThread}, __policy);
150+ auto __resource = ::cuda::__call_or (
151+ ::cuda::mr::get_memory_resource, ::cuda::device_default_memory_pool (__stream.device ()), __policy);
152+
// The nested scope below bounds the lifetime of `__guard`; on the "+" side the declarations
// above were hoisted out of it so that `__stream` is still in scope for the final sync.
127153 {
128- // We need to know the accumulator type to determine whether we need construct_at for the return value
129- using _AccumT = __accumulator_t <_BinaryOp, iter_reference_t <_Iter>, _Tp>;
130-
131- // ! // Determine temporary device storage requirements for reduce
132- void * __temp_storage = nullptr ;
133- size_t __num_bytes = 0 ;
134- const auto __num_items = ::cuda::std::distance (__first, __last);
135- ::cub::DeviceReduce::Reduce (
136- __temp_storage, __num_bytes, __first, static_cast <_Tp*>(nullptr ), __num_items, __func, __init);
137-
138- // Allocate memory for result
139- auto __stream = ::cuda::__call_or (::cuda::get_stream, ::cuda::stream_ref{cudaStreamPerThread}, __policy);
140- auto __resource = ::cuda::__call_or (
141- ::cuda::mr::get_memory_resource, ::cuda::device_default_memory_pool (__stream.device ()), __policy);
142154 __allocation_guard<_Tp, _AccumT, decltype (__resource)> __guard{__stream, __resource, __num_bytes};
143155
144156 // Run the reduction
145- ::cub::DeviceReduce::Reduce (
157+ _CCCL_TRY_CUDA_API (
158+ ::cub::DeviceReduce::Reduce,
159+ " __pstl_cuda_reduce: kernel launch of cub::DeviceReduce::Reduce failed" ,
146160 __guard.__get_temp_storage (),
147161 __num_bytes,
148162 ::cuda::std::move (__first),
149163 __guard.__get_result_iter(),
150- __num_items ,
164+ __count ,
151165 ::cuda::std::move(__func),
152166 ::cuda::std::move(__init),
153167 __stream.get());
// NOTE(review): diff hunk boundary -- the lines between the reduction call above and the
// `__stream.get());` below are not shown. They presumably transfer the result into `__ret`
// on the same stream -- confirm against the full file.
@@ -163,23 +177,20 @@ struct __pstl_dispatch<__pstl_algorithm::__reduce, __execution_backend::__cuda>
163177 __stream.get());
164178 }
165179
// Added on the "+" side: synchronize the stream before `__ret` is returned to the caller
// (the old code synchronized in the `__allocation_guard` destructor instead).
180+ __stream.sync ();
166181 return __ret;
167182 }
168183
// Entry point of the dispatch object.
// Diff view: "-" lines are the old iterator-pair signature, "+" lines turn this overload into
// a (first, count) overload.
//  - If `_Iter` has random access traversal, the call is forwarded to `__par_impl` inside a
//    `try` block; `::cuda::cuda_error` is caught (the handler body is elided by the hunk
//    boundary below).
//  - The `static_assert` on `__always_false_v<_Policy>` is presumably in the `else` branch of
//    the `if constexpr` (the `else` itself is in the elided lines -- confirm); it rejects
//    iterators without random access traversal at compile time. The `return` after it calls
//    the non-policy `::cuda::std::reduce`; on the "+" side the end iterator is computed with
//    `::cuda::std::next(__first, __count)`.
169- template <class _Policy , class _Iter , class _Tp , class _BinaryOp >
184+ template <class _Policy , class _Iter , class _Size , class _Tp , class _BinaryOp >
170185 [[nodiscard]] _CCCL_HOST_API _Tp
171- operator ()([[maybe_unused]] const _Policy& __policy, _Iter __first, _Iter __last , _Tp __init, _BinaryOp __func) const
186+ operator ()([[maybe_unused]] const _Policy& __policy, _Iter __first, _Size __count , _Tp __init, _BinaryOp __func) const
172187 {
173188 if constexpr (::cuda::std::__has_random_access_traversal<_Iter>)
174189 {
175190 try
176191 {
177192 return __par_impl (
178- __policy,
179- ::cuda::std::move (__first),
180- ::cuda::std::move(__last),
181- ::cuda::std::move(__init),
182- ::cuda::std::move(__func));
193+ __policy, ::cuda::std::move (__first), __count, ::cuda::std::move (__init), ::cuda::std::move (__func));
183194 }
184195 catch (const ::cuda::cuda_error& __err)
185196 {
// NOTE(review): diff hunk boundary -- the body of the catch handler and the start of the
// non-random-access branch are not shown here.
@@ -198,9 +209,17 @@ struct __pstl_dispatch<__pstl_algorithm::__reduce, __execution_backend::__cuda>
198209 static_assert (__always_false_v<_Policy>,
199210 " __pstl_dispatch: CUDA backend of cuda::std::reduce requires at least random access iterators" );
// On the "+" side `__first` is passed without `move` because it is also read by
// `::cuda::std::next(__first, __count)` in the same call.
200211 return ::cuda::std::reduce (
201- ::cuda::std::move ( __first) , ::cuda::std::move(__last ), ::cuda::std::move(__init), ::cuda::std::move(__func));
212+ __first, ::cuda::std::next (__first, __count ), ::cuda::std::move (__init), ::cuda::std::move (__func));
202213 }
203214 }
215+
// Iterator-pair overload added by this change (all lines are "+").
// Keeps the previous (first, last) calling convention: it computes the element count with
// `::cuda::std::distance(__first, __last)` and forwards to the (first, count) overload of
// `operator()` on the same object, moving `__first`, `__init` and `__func` into that call.
// `__count` is passed as an lvalue of the type `distance` returns, so the forwarded call has
// different types in the second and third argument positions.
216+ template <class _Policy , class _Iter , class _Tp , class _BinaryOp >
217+ [[nodiscard]] _CCCL_HOST_API _Tp
218+ operator ()([[maybe_unused]] const _Policy& __policy, _Iter __first, _Iter __last, _Tp __init, _BinaryOp __func) const
219+ {
220+ const auto __count = ::cuda::std::distance (__first, __last);
221+ return (*this )(__policy, ::cuda::std::move (__first), __count, ::cuda::std::move (__init), ::cuda::std::move (__func));
222+ }
204223};
205224
206225_CCCL_END_NAMESPACE_ARCH_DEPENDENT
0 commit comments