Skip to content

Commit ff09673

Browse files
committed
Implement parallel cuda::std::replace_copy
This implements the `replace_copy{_if}` algorithms for the CUDA backend. * std::replace_copy, see https://en.cppreference.com/w/cpp/algorithm/replace_copy.html * std::replace_copy_if, see https://en.cppreference.com/w/cpp/algorithm/replace_copy.html It provides tests and benchmarks similar to Thrust, plus some boilerplate for libcu++. The functionality is not publicly available yet and is implemented in a private internal header. Fixes #7409
1 parent 556c103 commit ff09673

File tree

7 files changed

+509
-0
lines changed

7 files changed

+509
-0
lines changed
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
//===----------------------------------------------------------------------===//
2+
//
3+
// Part of CUDA Experimental in CUDA C++ Core Libraries,
4+
// under the Apache License v2.0 with LLVM Exceptions.
5+
// See https://llvm.org/LICENSE.txt for license information.
6+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7+
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
8+
//
9+
//===----------------------------------------------------------------------===//
10+
11+
#include <thrust/device_vector.h>
12+
13+
#include <cuda/memory_pool>
14+
#include <cuda/std/__pstl_algorithm>
15+
#include <cuda/stream_ref>
16+
17+
#include "nvbench_helper.cuh"
18+
19+
// Benchmark for the parallel cuda::std::replace_copy algorithm: copies
// `Elements` values, replacing every element equal to 42 with 1337.
template <typename T>
static void basic(nvbench::state& state, nvbench::type_list<T>)
{
  const auto num_elements = static_cast<std::size_t>(state.get_int64("Elements"));

  thrust::device_vector<T> input = generate(num_elements, bit_entropy::_1_000, T{0}, T{42});
  thrust::device_vector<T> output(num_elements, thrust::no_init);

  // One global read and one global write per element.
  state.add_element_count(num_elements);
  state.add_global_memory_reads<T>(num_elements);
  state.add_global_memory_writes<T>(num_elements);

  caching_allocator_t alloc{};
  auto policy = cuda::execution::__cub_par_unseq.with_memory_resource(alloc);

  state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch | nvbench::exec_tag::sync,
             [&](nvbench::launch& launch) {
               cuda::std::replace_copy(
                 policy.with_stream(launch.get_stream().get_stream()),
                 input.begin(),
                 input.end(),
                 output.begin(),
                 42,
                 1337);
             });
}

NVBENCH_BENCH_TYPES(basic, NVBENCH_TYPE_AXES(fundamental_types))
  .set_name("base")
  .set_type_axes_names({"T{ct}"})
  .add_int64_power_of_two_axis("Elements", nvbench::range(16, 28, 4));
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
//===----------------------------------------------------------------------===//
2+
//
3+
// Part of CUDA Experimental in CUDA C++ Core Libraries,
4+
// under the Apache License v2.0 with LLVM Exceptions.
5+
// See https://llvm.org/LICENSE.txt for license information.
6+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7+
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
8+
//
9+
//===----------------------------------------------------------------------===//
10+
11+
#include <thrust/device_vector.h>
12+
13+
#include <cuda/memory_pool>
14+
#include <cuda/std/__pstl_algorithm>
15+
#include <cuda/stream_ref>
16+
17+
#include "nvbench_helper.cuh"
18+
19+
// Unary predicate: true when the argument compares equal to 42 (converted to T).
struct equal_to_42
{
  template <class T>
  __device__ constexpr bool operator()(const T& val) const noexcept
  {
    const auto reference = static_cast<T>(42);
    return val == reference;
  }
};
27+
28+
// Benchmark for the parallel cuda::std::replace_copy_if algorithm: copies
// `Elements` values, replacing every element matching the predicate with 1337.
template <typename T>
static void basic(nvbench::state& state, nvbench::type_list<T>)
{
  const auto num_elements = static_cast<std::size_t>(state.get_int64("Elements"));

  thrust::device_vector<T> input = generate(num_elements, bit_entropy::_1_000, T{0}, T{42});
  thrust::device_vector<T> output(num_elements, thrust::no_init);

  // One global read and one global write per element.
  state.add_element_count(num_elements);
  state.add_global_memory_reads<T>(num_elements);
  state.add_global_memory_writes<T>(num_elements);

  caching_allocator_t alloc{};
  auto policy = cuda::execution::__cub_par_unseq.with_memory_resource(alloc);

  state.exec(
    nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
      cuda::std::replace_copy_if(
        policy.with_stream(launch.get_stream().get_stream()),
        input.begin(),
        input.end(),
        output.begin(),
        equal_to_42{},
        1337);
    });
}

NVBENCH_BENCH_TYPES(basic, NVBENCH_TYPE_AXES(fundamental_types))
  .set_name("base")
  .set_type_axes_names({"T{ct}"})
  .add_int64_power_of_two_axis("Elements", nvbench::range(16, 28, 4));
Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,114 @@
1+
//===----------------------------------------------------------------------===//
2+
//
3+
// Part of libcu++, the C++ Standard Library for your entire system,
4+
// under the Apache License v2.0 with LLVM Exceptions.
5+
// See https://llvm.org/LICENSE.txt for license information.
6+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7+
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
8+
//
9+
//===----------------------------------------------------------------------===//
10+
11+
#ifndef _CUDA_STD___PSTL_REPLACE_COPY_H
12+
#define _CUDA_STD___PSTL_REPLACE_COPY_H
13+
14+
#include <cuda/std/detail/__config>
15+
16+
#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
17+
# pragma GCC system_header
18+
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
19+
# pragma clang system_header
20+
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
21+
# pragma system_header
22+
#endif // no system header
23+
24+
#if !_CCCL_COMPILER(NVRTC)
25+
26+
# include <cuda/std/__algorithm/replace_copy.h>
27+
# include <cuda/std/__concepts/concept_macros.h>
28+
# include <cuda/std/__execution/policy.h>
29+
# include <cuda/std/__iterator/concepts.h>
30+
# include <cuda/std/__iterator/iterator_traits.h>
31+
# include <cuda/std/__pstl/dispatch.h>
32+
# include <cuda/std/__type_traits/always_false.h>
33+
# include <cuda/std/__type_traits/is_comparable.h>
34+
# include <cuda/std/__type_traits/is_execution_policy.h>
35+
# include <cuda/std/__type_traits/is_nothrow_copy_constructible.h>
36+
# include <cuda/std/__utility/move.h>
37+
38+
# if _CCCL_HAS_BACKEND_CUDA()
39+
# include <cuda/std/__pstl/cuda/transform.h>
40+
# endif // _CCCL_HAS_BACKEND_CUDA()
41+
42+
# include <cuda/std/__cccl/prologue.h>
43+
44+
_CCCL_BEGIN_NAMESPACE_CUDA_STD
45+
46+
//! Transform functor implementing the replace_copy selection: yields the stored
//! replacement value for elements equal to the stored old value, and the element
//! itself (converted to _Tp) otherwise.
template <class _Tp>
struct __replace_copy_select
{
  _Tp __old_value_;
  _Tp __new_value_;

  _CCCL_HOST_API constexpr __replace_copy_select(const _Tp& __old_value,
                                                 const _Tp& __new_value) noexcept(is_nothrow_copy_constructible_v<_Tp>)
      : __old_value_(__old_value)
      , __new_value_(__new_value)
  {}

  template <class _Up>
  [[nodiscard]] _CCCL_DEVICE_API constexpr _Tp operator()(const _Up& __val) const
    noexcept(is_nothrow_copy_constructible_v<_Tp>)
  {
    if (__val == __old_value_)
    {
      return __new_value_;
    }
    return static_cast<_Tp>(__val);
  }
};
65+
66+
_CCCL_BEGIN_NAMESPACE_ARCH_DEPENDENT
67+
68+
//! @brief Parallel version of cuda::std::replace_copy: copies [__first, __last)
//! to __result, writing __new_value in place of every element that compares
//! equal to __old_value.
//! @param __policy Execution policy selecting the backend and resources.
//! @param __first, __last Input range (forward traversal required).
//! @param __result Start of the output range (forward traversal required).
//! @param __old_value Value to be replaced.
//! @param __new_value Replacement value.
//! @return Iterator one past the last element written.
_CCCL_TEMPLATE(class _Policy, class _InputIterator, class _OutputIterator, class _Tp = iter_value_t<_InputIterator>)
_CCCL_REQUIRES(__has_forward_traversal<_InputIterator> _CCCL_AND __has_forward_traversal<_OutputIterator> _CCCL_AND
                 is_execution_policy_v<_Policy>)
_CCCL_HOST_API _OutputIterator replace_copy(
  [[maybe_unused]] const _Policy& __policy,
  _InputIterator __first,
  _InputIterator __last,
  _OutputIterator __result,
  const _Tp& __old_value,
  const _Tp& __new_value)
{
  static_assert(__is_cpp17_equality_comparable_v<_Tp, iter_reference_t<_InputIterator>>,
                "cuda::std::replace_copy requires T to be comparable with iter_reference_t<InputIterator>");

  // Empty range: nothing to copy, skip backend dispatch entirely.
  if (__first == __last)
  {
    return __result;
  }

  // replace_copy is expressed as a transform with a select functor, so we
  // dispatch to the backend's __transform algorithm.
  [[maybe_unused]] auto __dispatch =
    ::cuda::std::execution::__pstl_select_dispatch<::cuda::std::execution::__pstl_algorithm::__transform, _Policy>();
  if constexpr (::cuda::std::execution::__pstl_can_dispatch<decltype(__dispatch)>)
  {
    return __dispatch(
      __policy,
      ::cuda::std::move(__first),
      ::cuda::std::move(__last),
      ::cuda::std::move(__result),
      __replace_copy_select{__old_value, __new_value});
  }
  else
  {
    // No backend available: fail at compile time. The serial fallback below is
    // never executed; it only keeps this branch well-formed.
    static_assert(__always_false_v<_Policy>, "Parallel cuda::std::replace_copy requires at least one selected backend");
    return ::cuda::std::replace_copy(
      ::cuda::std::move(__first), ::cuda::std::move(__last), ::cuda::std::move(__result), __old_value, __new_value);
  }
}
105+
106+
_CCCL_END_NAMESPACE_ARCH_DEPENDENT
107+
108+
_CCCL_END_NAMESPACE_CUDA_STD
109+
110+
# include <cuda/std/__cccl/epilogue.h>
111+
112+
#endif // !_CCCL_COMPILER(NVRTC)
113+
114+
#endif // _CUDA_STD___PSTL_REPLACE_COPY_H
Lines changed: 120 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,120 @@
1+
//===----------------------------------------------------------------------===//
2+
//
3+
// Part of libcu++, the C++ Standard Library for your entire system,
4+
// under the Apache License v2.0 with LLVM Exceptions.
5+
// See https://llvm.org/LICENSE.txt for license information.
6+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7+
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
8+
//
9+
//===----------------------------------------------------------------------===//
10+
11+
#ifndef _CUDA_STD___PSTL_REPLACE_COPY_IF_H
12+
#define _CUDA_STD___PSTL_REPLACE_COPY_IF_H
13+
14+
#include <cuda/std/detail/__config>
15+
16+
#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
17+
# pragma GCC system_header
18+
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
19+
# pragma clang system_header
20+
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
21+
# pragma system_header
22+
#endif // no system header
23+
24+
#if !_CCCL_COMPILER(NVRTC)
25+
26+
# include <cuda/std/__algorithm/replace_copy_if.h>
27+
# include <cuda/std/__concepts/concept_macros.h>
28+
# include <cuda/std/__execution/policy.h>
29+
# include <cuda/std/__functional/invoke.h>
30+
# include <cuda/std/__iterator/concepts.h>
31+
# include <cuda/std/__iterator/iterator_traits.h>
32+
# include <cuda/std/__pstl/dispatch.h>
33+
# include <cuda/std/__type_traits/always_false.h>
34+
# include <cuda/std/__type_traits/is_execution_policy.h>
35+
# include <cuda/std/__type_traits/is_nothrow_copy_constructible.h>
36+
# include <cuda/std/__type_traits/is_nothrow_move_constructible.h>
37+
# include <cuda/std/__utility/move.h>
38+
39+
# if _CCCL_HAS_BACKEND_CUDA()
40+
# include <cuda/std/__pstl/cuda/transform.h>
41+
# endif // _CCCL_HAS_BACKEND_CUDA()
42+
43+
# include <cuda/std/__cccl/prologue.h>
44+
45+
_CCCL_BEGIN_NAMESPACE_CUDA_STD
46+
//! Transform functor implementing the replace_copy_if selection: yields the
//! stored replacement value for elements satisfying the predicate, and the
//! element itself (converted to _Tp) otherwise.
template <class _UnaryPred, class _Tp>
struct __replace_copy_if_select
{
  _UnaryPred __pred_;
  _Tp __new_value_;

  _CCCL_HOST_API constexpr __replace_copy_if_select(_UnaryPred __pred, const _Tp& __new_value) noexcept(
    is_nothrow_move_constructible_v<_UnaryPred> && is_nothrow_copy_constructible_v<_Tp>)
      : __pred_(__pred)
      , __new_value_(__new_value)
  {}

  template <class _Up>
  [[nodiscard]] _CCCL_DEVICE_API constexpr _Tp operator()(const _Up& __val) const
    noexcept(is_nothrow_invocable_v<const _UnaryPred&, const _Up&> && is_nothrow_copy_constructible_v<_Tp>)
  {
    if (::cuda::std::invoke(__pred_, __val))
    {
      return __new_value_;
    }
    return static_cast<_Tp>(__val);
  }
};
65+
66+
_CCCL_BEGIN_NAMESPACE_ARCH_DEPENDENT
67+
68+
//! @brief Parallel version of cuda::std::replace_copy_if: copies
//! [__first, __last) to __result, writing __new_value in place of every
//! element for which __pred returns true.
//! @param __policy Execution policy selecting the backend and resources.
//! @param __first, __last Input range (forward traversal required).
//! @param __result Start of the output range (forward traversal required).
//! @param __pred Unary predicate selecting the elements to replace.
//! @param __new_value Replacement value.
//! @return Iterator one past the last element written.
_CCCL_TEMPLATE(
  class _Policy, class _InputIterator, class _OutputIterator, class _UnaryPred, class _Tp = iter_value_t<_InputIterator>)
_CCCL_REQUIRES(__has_forward_traversal<_InputIterator> _CCCL_AND __has_forward_traversal<_OutputIterator> _CCCL_AND
                 is_execution_policy_v<_Policy>)
_CCCL_HOST_API _OutputIterator replace_copy_if(
  [[maybe_unused]] const _Policy& __policy,
  _InputIterator __first,
  _InputIterator __last,
  _OutputIterator __result,
  _UnaryPred __pred,
  const _Tp& __new_value)
{
  static_assert(indirect_unary_predicate<_UnaryPred, _InputIterator>,
                "cuda::std::replace_copy_if: UnaryPred must satisfy indirect_unary_predicate<InputIterator>");

  // Empty range: nothing to copy, skip backend dispatch entirely.
  if (__first == __last)
  {
    return __result;
  }

  // replace_copy_if is expressed as a transform with a select functor, so we
  // dispatch to the backend's __transform algorithm.
  [[maybe_unused]] auto __dispatch =
    ::cuda::std::execution::__pstl_select_dispatch<::cuda::std::execution::__pstl_algorithm::__transform, _Policy>();
  if constexpr (::cuda::std::execution::__pstl_can_dispatch<decltype(__dispatch)>)
  {
    return __dispatch(
      __policy,
      ::cuda::std::move(__first),
      ::cuda::std::move(__last),
      ::cuda::std::move(__result),
      __replace_copy_if_select{::cuda::std::move(__pred), __new_value});
  }
  else
  {
    // No backend available: fail at compile time. The serial fallback below is
    // never executed; it only keeps this branch well-formed.
    static_assert(__always_false_v<_Policy>,
                  "Parallel cuda::std::replace_copy_if requires at least one selected backend");
    return ::cuda::std::replace_copy_if(
      ::cuda::std::move(__first),
      ::cuda::std::move(__last),
      ::cuda::std::move(__result),
      ::cuda::std::move(__pred),
      __new_value);
  }
}
111+
112+
_CCCL_END_NAMESPACE_ARCH_DEPENDENT
113+
114+
_CCCL_END_NAMESPACE_CUDA_STD
115+
116+
# include <cuda/std/__cccl/epilogue.h>
117+
118+
#endif // !_CCCL_COMPILER(NVRTC)
119+
120+
#endif // _CUDA_STD___PSTL_REPLACE_COPY_IF_H

libcudacxx/include/cuda/std/__pstl_algorithm

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,8 @@
2525
#include <cuda/std/__pstl/for_each_n.h>
2626
#include <cuda/std/__pstl/reduce.h>
2727
#include <cuda/std/__pstl/replace.h>
28+
#include <cuda/std/__pstl/replace_copy.h>
29+
#include <cuda/std/__pstl/replace_copy_if.h>
2830
#include <cuda/std/__pstl/replace_if.h>
2931
#include <cuda/std/__pstl/transform.h>
3032

0 commit comments

Comments
 (0)