
Commit 4e6119d

Merge branch 'main' into stf_fix_place_equality
2 parents b012af2 + 7942d8c commit 4e6119d

File tree: 6 files changed (+227 / -47 lines)


cub/cub/device/device_transform.cuh

Lines changed: 103 additions & 0 deletions
@@ -108,7 +108,110 @@ private:
   }
   }

+  // TODO(bgruber): we want to eventually forward the output tuple to the kernel and optimize writing multiple streams
+  template <detail::transform::requires_stable_address StableAddress = detail::transform::requires_stable_address::no,
+            typename... RandomAccessIteratorsIn,
+            typename... RandomAccessIteratorsOut,
+            typename NumItemsT,
+            typename Predicate,
+            typename TransformOp,
+            typename Env>
+  CUB_RUNTIME_FUNCTION static cudaError_t TransformInternal(
+    ::cuda::std::tuple<RandomAccessIteratorsIn...> inputs,
+    ::cuda::std::tuple<RandomAccessIteratorsOut...> outputs,
+    NumItemsT num_items,
+    Predicate predicate,
+    TransformOp transform_op,
+    Env env)
+  {
+    return TransformInternal<StableAddress>(
+      ::cuda::std::move(inputs),
+      ::cuda::make_zip_iterator(::cuda::std::move(outputs)),
+      num_items,
+      ::cuda::std::move(predicate),
+      ::cuda::std::move(transform_op),
+      ::cuda::std::move(env));
+  }
+
 public:
+  //! @rst
+  //! Overview
+  //! +++++++++++++++++++++++++++++++++++++++++++++
+  //! Transforms many input sequences into many output sequences, by applying a transformation operation on corresponding
+  //! input elements and writing the tuple result to the corresponding output elements. No guarantee is given on the
+  //! identity (i.e. address) of the objects passed to the call operator of the transformation operation.
+  //!
+  //! A Simple Example
+  //! +++++++++++++++++++++++++++++++++++++++++++++
+  //!
+  //! .. literalinclude:: ../../../cub/test/catch2_test_device_transform_api.cu
+  //!    :language: c++
+  //!    :dedent:
+  //!    :start-after: example-begin transform-many-many
+  //!    :end-before: example-end transform-many-many
+  //!
+  //! @endrst
+  //!
+  //! @param inputs A tuple of iterators to the input sequences where num_items elements are read from each. The
+  //! iterators' value types must be trivially relocatable.
+  //! @param outputs A tuple of iterators to the output sequences where num_items results are written to each. Each
+  //! sequence may point to the beginning of one of the input sequences, performing the transformation in place. Any
+  //! output sequence must not overlap with any of the input sequences in any other way.
+  //! @param num_items The number of elements in each input and output sequence.
+  //! @param transform_op An n-ary function object, where n is the number of input sequences. The input iterators' value
+  //! types must be convertible to the parameters of the function object's call operator. The return type of the call
+  //! operator must be a tuple where each tuple element is assignable to the corresponding dereferenced output
+  //! iterator.
+  //! @param env Execution environment, or cudaStream_t. Default is ``cuda::std::execution::env{}``, which will run on
+  //! stream\ :sub:`0`
+  template <typename... RandomAccessIteratorsIn,
+            typename... RandomAccessIteratorsOut,
+            typename NumItemsT,
+            typename TransformOp,
+            typename Env = ::cuda::std::execution::env<>>
+  CUB_RUNTIME_FUNCTION static cudaError_t Transform(
+    ::cuda::std::tuple<RandomAccessIteratorsIn...> inputs,
+    ::cuda::std::tuple<RandomAccessIteratorsOut...> outputs,
+    NumItemsT num_items,
+    TransformOp transform_op,
+    Env env = {})
+  {
+    _CCCL_NVTX_RANGE_SCOPE("cub::DeviceTransform::Transform");
+    return TransformInternal(
+      ::cuda::std::move(inputs),
+      ::cuda::std::move(outputs),
+      num_items,
+      detail::transform::always_true_predicate{},
+      ::cuda::std::move(transform_op),
+      ::cuda::std::move(env));
+  }
+
+#ifndef _CCCL_DOXYGEN_INVOKED // Do not document
+  // Overload with additional parameters to specify temporary storage. Provided for compatibility with other CUB APIs.
+  template <typename... RandomAccessIteratorsIn,
+            typename... RandomAccessIteratorsOut,
+            typename NumItemsT,
+            typename TransformOp>
+  CUB_RUNTIME_FUNCTION static cudaError_t Transform(
+    void* d_temp_storage,
+    size_t& temp_storage_bytes,
+    ::cuda::std::tuple<RandomAccessIteratorsIn...> inputs,
+    ::cuda::std::tuple<RandomAccessIteratorsOut...> outputs,
+    NumItemsT num_items,
+    TransformOp transform_op,
+    cudaStream_t stream = nullptr)
+  {
+    if (d_temp_storage == nullptr)
+    {
+      temp_storage_bytes = 1;
+      return cudaSuccess;
+    }
+
+    return Transform(
+      ::cuda::std::move(inputs), ::cuda::std::move(outputs), num_items, ::cuda::std::move(transform_op), stream);
+  }
+#endif // _CCCL_DOXYGEN_INVOKED
+
   //! @rst
   //! Overview
   //! +++++++++++++++++++++++++++++++++++++++++++++
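
For orientation, a call to the new many-to-many overload has the following shape. This is a minimal sketch adapted from the transform-many-many example test added in this commit (see catch2_test_device_transform_api.cu below); variable and function names are illustrative only, and the `__device__` lambda needs nvcc's extended-lambda mode:

#include <cub/device/device_transform.cuh>

#include <thrust/device_vector.h>

#include <cuda/std/tuple>

// Sketch only: two input sequences feed one functor whose tuple result is
// scattered to two output sequences in a single pass.
void transform_many_many_sketch()
{
  thrust::device_vector<int> in1{0, -1, 2, -3, 4, -5};
  thrust::device_vector<double> in2{5.2, 3.1, -1.1, 3.0, 3.2, 0.0};
  thrust::device_vector<double> out1(in1.size());
  thrust::device_vector<bool> out2(in1.size());

  // One parameter per input sequence; one tuple element per output sequence.
  auto op = [] __device__(int a, double b) -> cuda::std::tuple<double, bool> {
    const double product = a * b;
    return {product, product < 0};
  };

  cub::DeviceTransform::Transform(
    cuda::std::tuple{in1.begin(), in2.begin()},   // inputs
    cuda::std::tuple{out1.begin(), out2.begin()}, // outputs
    in1.size(),
    op);
}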

cub/test/catch2_test_device_transform.cu

Lines changed: 27 additions & 7 deletions
@@ -435,22 +435,42 @@ C2H_TEST("DeviceTransform::Transform fancy output iterator type with void value
   REQUIRE(result == c2h::device_vector<type>(num_items, 3));
 }

-C2H_TEST("DeviceTransform::Transform mixed input iterator types", "[device][transform]")
+struct plus_mul_neg
 {
-  using type = int;
+  template <typename T>
+  __host__ __device__ auto operator()(T a, T b) const
+  {
+    return cuda::std::tuple{a + b, a * b, -a};
+  }
+};
+
+C2H_TEST("DeviceTransform::Transform mixed iterator types 2 -> 3", "[device][transform]")
+{
+  using type = unsigned; // overflow is defined
   const int num_items = GENERATE(100, 100'000); // try to hit the small and full tile code paths
   cuda::counting_iterator<type> a{0};
   c2h::device_vector<type> b(num_items, thrust::no_init);
   c2h::gen(C2H_SEED(1), b);

-  c2h::device_vector<type> result(num_items, thrust::no_init);
-  transform_many(cuda::std::make_tuple(a, b.begin()), result.begin(), num_items, cuda::std::plus<type>{});
+  c2h::device_vector<type> result_a(num_items, thrust::no_init);
+  c2h::device_vector<type> result_b(num_items, thrust::no_init);
+  c2h::device_vector<type> result_c(num_items, thrust::no_init);
+  transform_many(
+    cuda::std::make_tuple(a, b.begin()),
+    cuda::std::make_tuple(
+      result_a.begin(), result_b.begin(), thrust::make_transform_output_iterator(result_c.begin(), cuda::std::negate{})),
+    num_items,
+    plus_mul_neg{});

   // compute reference and verify
   c2h::host_vector<type> b_h = b;
-  c2h::host_vector<type> reference_h(num_items);
-  std::transform(a, a + num_items, b_h.begin(), reference_h.begin(), std::plus<type>{});
-  REQUIRE(reference_h == result);
+  c2h::host_vector<type> reference_a_h(num_items, thrust::no_init);
+  std::transform(a, a + num_items, b_h.begin(), reference_a_h.begin(), cuda::std::plus<type>{});
+  c2h::host_vector<type> reference_b_h(num_items, thrust::no_init);
+  std::transform(a, a + num_items, b_h.begin(), reference_b_h.begin(), cuda::std::multiplies<type>{});
+  CHECK(reference_a_h == result_a);
+  CHECK(reference_b_h == result_b);
+  CHECK(thrust::equal(a, a + num_items, result_c.begin()));
 }

 struct plus_needs_stable_address
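
A note on the third output stream in the test above: the functor returns -a, and the corresponding output iterator is a thrust::transform_output_iterator that applies cuda::std::negate on assignment, so result_c ends up holding a again, which is why it is compared directly against the counting iterator. A standalone sketch of that write-side composition (illustrative only, not part of the commit):

#include <thrust/device_vector.h>
#include <thrust/iterator/transform_output_iterator.h>
#include <thrust/transform.h>

#include <cuda/std/functional>

// transform_output_iterator applies its functor to every value written through
// it, so negating an already negated value stores the original value again.
void negate_on_write_sketch()
{
  thrust::device_vector<int> in{1, 2, 3};
  thrust::device_vector<int> out(in.size());

  auto negate_on_write = thrust::make_transform_output_iterator(out.begin(), cuda::std::negate<int>{});
  thrust::transform(in.begin(), in.end(), negate_on_write, cuda::std::negate<int>{});
  // out now equals in: the two negations cancel.
}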

cub/test/catch2_test_device_transform_api.cu

Lines changed: 33 additions & 2 deletions
@@ -10,6 +10,37 @@

 #include <c2h/catch2_test_helper.h>

+// need a separate function because the ext. lambda needs to be enclosed by a function with external linkage on Windows
+void test_transform_many_many_api()
+{
+  // example-begin transform-many-many
+  auto input1 = thrust::device_vector<int>{0, -1, 2, -3, 4, -5};
+  auto input2 = thrust::device_vector<double>{5.2, 3.1, -1.1, 3.0, 3.2, 0.0};
+  auto op = [] __device__(int a, double b) -> cuda::std::tuple<double, bool> {
+    const double product = a * b;
+    return {product, product < 0};
+  };
+
+  auto result1 = thrust::device_vector<double>(input1.size(), thrust::no_init);
+  auto result2 = thrust::device_vector<bool>(input1.size(), thrust::no_init);
+  cub::DeviceTransform::Transform(
+    cuda::std::tuple{input1.begin(), input2.begin()},
+    cuda::std::tuple{result1.begin(), result2.begin()},
+    input1.size(),
+    op);
+
+  const auto expected1 = thrust::host_vector<double>{0, -3.1, -2.2, -9, 12.8, -0};
+  const auto expected2 = thrust::host_vector<bool>{false, true, true, true, false, false};
+  // example-end transform-many-many
+  CHECK(result1 == expected1);
+  CHECK(result2 == expected2);
+}
+
+C2H_TEST("DeviceTransform::Transform many->many API example", "[device][device_transform]")
+{
+  test_transform_many_many_api();
+}
+
 // need a separate function because the ext. lambda needs to be enclosed by a function with external linkage on Windows
 void test_transform_api()
 {

@@ -21,7 +52,7 @@ void test_transform_api()
     return (a + b) * c;
   };

-  auto result = thrust::device_vector<int>(input1.size());
+  auto result = thrust::device_vector<int>(input1.size(), thrust::no_init);
   cub::DeviceTransform::Transform(
     cuda::std::tuple{input1.begin(), input2.begin(), input3}, result.begin(), input1.size(), op);

@@ -74,7 +105,7 @@ void test_transform_stable_api()
     return a + input2_ptr[i];
   };

-  auto result = thrust::device_vector<int>(input1.size());
+  auto result = thrust::device_vector<int>(input1.size(), thrust::no_init);
   cub::DeviceTransform::TransformStableArgumentAddresses(
     cuda::std::tuple{input1_ptr}, result.begin(), input1.size(), op);

libcudacxx/include/cuda/std/__pstl/cuda/reduce.h

Lines changed: 52 additions & 33 deletions
@@ -41,14 +41,18 @@ _CCCL_DIAG_POP
 # include <cuda/std/__exception/cuda_error.h>
 # include <cuda/std/__execution/env.h>
 # include <cuda/std/__execution/policy.h>
+# include <cuda/std/__functional/invoke.h>
 # include <cuda/std/__iterator/distance.h>
 # include <cuda/std/__iterator/iterator_traits.h>
+# include <cuda/std/__iterator/next.h>
 # include <cuda/std/__memory/addressof.h>
 # include <cuda/std/__memory/construct_at.h>
 # include <cuda/std/__new/bad_alloc.h>
 # include <cuda/std/__numeric/reduce.h>
 # include <cuda/std/__pstl/dispatch.h>
 # include <cuda/std/__type_traits/always_false.h>
+# include <cuda/std/__type_traits/is_nothrow_constructible.h>
+# include <cuda/std/__utility/forward.h>
 # include <cuda/std/__utility/move.h>

 # include <cuda_runtime.h>

@@ -76,7 +80,8 @@ struct __pstl_dispatch<__pstl_algorithm::__reduce, __execution_backend::__cuda>
     {}

     template <class _Index, class _Up>
-    _CCCL_DEVICE_API void operator()(_Index, _Up&& __value)
+    _CCCL_DEVICE_API _CCCL_FORCEINLINE void
+    operator()(_Index, _Up&& __value) noexcept(is_nothrow_constructible_v<_Tp, _Up>)
     {
       ::cuda::std::__construct_at(__ptr_, ::cuda::std::forward<_Up>(__value));
     }

@@ -97,57 +102,66 @@ struct __pstl_dispatch<__pstl_algorithm::__reduce, __execution_backend::__cuda>
     _CCCL_HOST_API ~__allocation_guard()
     {
       __resource_.deallocate(__stream_, __ptr_, __num_bytes_, alignof(_Tp));
-      __stream_.sync();
     }

-    [[nodiscard]] _CCCL_HOST_API auto __get_result_iter()
+    [[nodiscard]] _CCCL_HOST_API auto __get_result_iter() noexcept
     {
       if constexpr (::cuda::std::__detail::__can_optimize_construct_at<_Tp, _AccumT>)
       {
-        return reinterpret_cast<_Tp*>(__ptr_);
+        return __ptr_;
       }
       else
       {
-        return ::cuda::tabulate_output_iterator{__construct_result{reinterpret_cast<_Tp*>(__ptr_)}};
+        return ::cuda::tabulate_output_iterator{__construct_result{__ptr_}};
       }
     }

-    [[nodiscard]] _CCCL_HOST_API void* __get_temp_storage()
+    [[nodiscard]] _CCCL_HOST_API void* __get_temp_storage() noexcept
     {
-      return static_cast<void*>(reinterpret_cast<unsigned char*>(__ptr_) + sizeof(_Tp));
+      return static_cast<void*>(__ptr_ + 1);
     }
   };

-  template <class _Policy, class _Iter, class _Tp, class _BinaryOp>
+  template <class _Policy, class _Iter, class _Size, class _Tp, class _BinaryOp>
   [[nodiscard]] _CCCL_HOST_API static _Tp
-  __par_impl([[maybe_unused]] const _Policy& __policy, _Iter __first, _Iter __last, _Tp __init, _BinaryOp __func)
+  __par_impl([[maybe_unused]] const _Policy& __policy, _Iter __first, _Size __count, _Tp __init, _BinaryOp __func)
   {
     _Tp __ret;

+    // We need to know the accumulator type to determine whether we need construct_at for the return value
+    using _AccumT = __accumulator_t<_BinaryOp, iter_reference_t<_Iter>, _Tp>;
+
+    // Determine temporary device storage requirements for reduce
+    void* __temp_storage = nullptr;
+    size_t __num_bytes = 0;
+    _CCCL_TRY_CUDA_API(
+      ::cub::DeviceReduce::Reduce,
+      "__pstl_cuda_reduce: determination of device storage for cub::DeviceReduce::Reduce failed",
+      __temp_storage,
+      __num_bytes,
+      __first,
+      static_cast<_Tp*>(nullptr),
+      __count,
+      __func,
+      __init);
+
+    // Allocate memory for result
+    auto __stream = ::cuda::__call_or(::cuda::get_stream, ::cuda::stream_ref{cudaStreamPerThread}, __policy);
+    auto __resource = ::cuda::__call_or(
+      ::cuda::mr::get_memory_resource, ::cuda::device_default_memory_pool(__stream.device()), __policy);
+
     {
-      // We need to know the accumulator type to determine whether we need construct_at for the return value
-      using _AccumT = __accumulator_t<_BinaryOp, iter_reference_t<_Iter>, _Tp>;
-
-      //! // Determine temporary device storage requirements for reduce
-      void* __temp_storage = nullptr;
-      size_t __num_bytes = 0;
-      const auto __num_items = ::cuda::std::distance(__first, __last);
-      ::cub::DeviceReduce::Reduce(
-        __temp_storage, __num_bytes, __first, static_cast<_Tp*>(nullptr), __num_items, __func, __init);
-
-      // Allocate memory for result
-      auto __stream = ::cuda::__call_or(::cuda::get_stream, ::cuda::stream_ref{cudaStreamPerThread}, __policy);
-      auto __resource = ::cuda::__call_or(
-        ::cuda::mr::get_memory_resource, ::cuda::device_default_memory_pool(__stream.device()), __policy);
       __allocation_guard<_Tp, _AccumT, decltype(__resource)> __guard{__stream, __resource, __num_bytes};

       // Run the reduction
-      ::cub::DeviceReduce::Reduce(
+      _CCCL_TRY_CUDA_API(
+        ::cub::DeviceReduce::Reduce,
+        "__pstl_cuda_reduce: kernel launch of cub::DeviceReduce::Reduce failed",
         __guard.__get_temp_storage(),
         __num_bytes,
         ::cuda::std::move(__first),
         __guard.__get_result_iter(),
-        __num_items,
+        __count,
         ::cuda::std::move(__func),
         ::cuda::std::move(__init),
         __stream.get());

@@ -163,23 +177,20 @@ struct __pstl_dispatch<__pstl_algorithm::__reduce, __execution_backend::__cuda>
        __stream.get());
     }

+    __stream.sync();
     return __ret;
   }

-  template <class _Policy, class _Iter, class _Tp, class _BinaryOp>
+  template <class _Policy, class _Iter, class _Size, class _Tp, class _BinaryOp>
   [[nodiscard]] _CCCL_HOST_API _Tp
-  operator()([[maybe_unused]] const _Policy& __policy, _Iter __first, _Iter __last, _Tp __init, _BinaryOp __func) const
+  operator()([[maybe_unused]] const _Policy& __policy, _Iter __first, _Size __count, _Tp __init, _BinaryOp __func) const
   {
     if constexpr (::cuda::std::__has_random_access_traversal<_Iter>)
     {
       try
       {
         return __par_impl(
-          __policy,
-          ::cuda::std::move(__first),
-          ::cuda::std::move(__last),
-          ::cuda::std::move(__init),
-          ::cuda::std::move(__func));
+          __policy, ::cuda::std::move(__first), __count, ::cuda::std::move(__init), ::cuda::std::move(__func));
       }
       catch (const ::cuda::cuda_error& __err)
       {

@@ -198,9 +209,17 @@ struct __pstl_dispatch<__pstl_algorithm::__reduce, __execution_backend::__cuda>
       static_assert(__always_false_v<_Policy>,
                     "__pstl_dispatch: CUDA backend of cuda::std::reduce requires at least random access iterators");
       return ::cuda::std::reduce(
-        ::cuda::std::move(__first), ::cuda::std::move(__last), ::cuda::std::move(__init), ::cuda::std::move(__func));
+        __first, ::cuda::std::next(__first, __count), ::cuda::std::move(__init), ::cuda::std::move(__func));
     }
   }
+
+  template <class _Policy, class _Iter, class _Tp, class _BinaryOp>
+  [[nodiscard]] _CCCL_HOST_API _Tp
+  operator()([[maybe_unused]] const _Policy& __policy, _Iter __first, _Iter __last, _Tp __init, _BinaryOp __func) const
+  {
+    const auto __count = ::cuda::std::distance(__first, __last);
+    return (*this)(__policy, ::cuda::std::move(__first), __count, ::cuda::std::move(__init), ::cuda::std::move(__func));
+  }
 };

 _CCCL_END_NAMESPACE_ARCH_DEPENDENT
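
Both _CCCL_TRY_CUDA_API call sites above follow CUB's usual two-phase convention for cub::DeviceReduce::Reduce: a first call with a null temporary-storage pointer only reports the required byte count, and a second call with allocated storage launches the reduction. A hedged standalone sketch of that pattern (buffer handling simplified, names illustrative, not the commit's code):

#include <cub/device/device_reduce.cuh>

#include <thrust/device_vector.h>

#include <cuda/std/functional>

// Two-phase CUB pattern: query the temporary-storage size, allocate it, then run.
int sum_on_device(const thrust::device_vector<int>& in, cudaStream_t stream)
{
  thrust::device_vector<int> out(1);
  const int* d_in = thrust::raw_pointer_cast(in.data());
  int* d_out = thrust::raw_pointer_cast(out.data());

  void* d_temp_storage = nullptr;
  size_t temp_storage_bytes = 0;

  // Phase 1: d_temp_storage == nullptr, only temp_storage_bytes is written.
  cub::DeviceReduce::Reduce(
    d_temp_storage, temp_storage_bytes, d_in, d_out, static_cast<int>(in.size()), cuda::std::plus<int>{}, 0, stream);

  // Phase 2: allocate the requested storage and launch the reduction.
  thrust::device_vector<unsigned char> temp(temp_storage_bytes);
  d_temp_storage = thrust::raw_pointer_cast(temp.data());
  cub::DeviceReduce::Reduce(
    d_temp_storage, temp_storage_bytes, d_in, d_out, static_cast<int>(in.size()), cuda::std::plus<int>{}, 0, stream);

  cudaStreamSynchronize(stream);
  return out[0]; // device-to-host copy of the single result
}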
