Skip to content

Commit ff09673

Browse files
committed
Implement parallel cuda::std::replace_copy
This implements the `replace_copy{_if}` algorithms for the CUDA backend. * std::replace_copy, see https://en.cppreference.com/w/cpp/algorithm/replace_copy.html * std::replace_copy_if, see https://en.cppreference.com/w/cpp/algorithm/replace_copy.html It provides tests and benchmarks similar to Thrust, plus some boilerplate for libcu++. The functionality is not publicly available yet and is implemented in a private internal header. Fixes #7409
1 parent 556c103 commit ff09673

File tree

7 files changed

+509
-0
lines changed

7 files changed

+509
-0
lines changed
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
//===----------------------------------------------------------------------===//
2+
//
3+
// Part of CUDA Experimental in CUDA C++ Core Libraries,
4+
// under the Apache License v2.0 with LLVM Exceptions.
5+
// See https://llvm.org/LICENSE.txt for license information.
6+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7+
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
8+
//
9+
//===----------------------------------------------------------------------===//
10+
11+
#include <thrust/device_vector.h>
12+
13+
#include <cuda/memory_pool>
14+
#include <cuda/std/__pstl_algorithm>
15+
#include <cuda/stream_ref>
16+
17+
#include "nvbench_helper.cuh"
18+
19+
// Benchmark for the parallel cuda::std::replace_copy algorithm: copies
// `Elements` values, replacing every element equal to 42 with 1337.
template <typename T>
static void basic(nvbench::state& state, nvbench::type_list<T>)
{
  const auto num_elements = static_cast<std::size_t>(state.get_int64("Elements"));

  thrust::device_vector<T> input = generate(num_elements, bit_entropy::_1_000, T{0}, T{42});
  thrust::device_vector<T> output(num_elements, thrust::no_init);

  // One global read and one global write per element.
  state.add_element_count(num_elements);
  state.add_global_memory_reads<T>(num_elements);
  state.add_global_memory_writes<T>(num_elements);

  caching_allocator_t alloc{};
  auto policy = cuda::execution::__cub_par_unseq.with_memory_resource(alloc);

  state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch | nvbench::exec_tag::sync,
             [&](nvbench::launch& launch) {
               cuda::std::replace_copy(
                 policy.with_stream(launch.get_stream().get_stream()),
                 input.begin(),
                 input.end(),
                 output.begin(),
                 42,
                 1337);
             });
}

NVBENCH_BENCH_TYPES(basic, NVBENCH_TYPE_AXES(fundamental_types))
  .set_name("base")
  .set_type_axes_names({"T{ct}"})
  .add_int64_power_of_two_axis("Elements", nvbench::range(16, 28, 4));
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
//===----------------------------------------------------------------------===//
2+
//
3+
// Part of CUDA Experimental in CUDA C++ Core Libraries,
4+
// under the Apache License v2.0 with LLVM Exceptions.
5+
// See https://llvm.org/LICENSE.txt for license information.
6+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7+
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
8+
//
9+
//===----------------------------------------------------------------------===//
10+
11+
#include <thrust/device_vector.h>
12+
13+
#include <cuda/memory_pool>
14+
#include <cuda/std/__pstl_algorithm>
15+
#include <cuda/stream_ref>
16+
17+
#include "nvbench_helper.cuh"
18+
19+
// Unary predicate: true when the argument compares equal to 42 (converted to T).
struct equal_to_42
{
  template <class T>
  __device__ constexpr bool operator()(const T& val) const noexcept
  {
    const auto reference = static_cast<T>(42);
    return val == reference;
  }
};
27+
28+
// Benchmark for the parallel cuda::std::replace_copy_if algorithm: copies
// `Elements` values, replacing every element matching the predicate with 1337.
template <typename T>
static void basic(nvbench::state& state, nvbench::type_list<T>)
{
  const auto num_elements = static_cast<std::size_t>(state.get_int64("Elements"));

  thrust::device_vector<T> input = generate(num_elements, bit_entropy::_1_000, T{0}, T{42});
  thrust::device_vector<T> output(num_elements, thrust::no_init);

  // One global read and one global write per element.
  state.add_element_count(num_elements);
  state.add_global_memory_reads<T>(num_elements);
  state.add_global_memory_writes<T>(num_elements);

  caching_allocator_t alloc{};
  auto policy = cuda::execution::__cub_par_unseq.with_memory_resource(alloc);

  state.exec(
    nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
      cuda::std::replace_copy_if(
        policy.with_stream(launch.get_stream().get_stream()),
        input.begin(),
        input.end(),
        output.begin(),
        equal_to_42{},
        1337);
    });
}

NVBENCH_BENCH_TYPES(basic, NVBENCH_TYPE_AXES(fundamental_types))
  .set_name("base")
  .set_type_axes_names({"T{ct}"})
  .add_int64_power_of_two_axis("Elements", nvbench::range(16, 28, 4));
Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,114 @@
1+
//===----------------------------------------------------------------------===//
2+
//
3+
// Part of libcu++, the C++ Standard Library for your entire system,
4+
// under the Apache License v2.0 with LLVM Exceptions.
5+
// See https://llvm.org/LICENSE.txt for license information.
6+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7+
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
8+
//
9+
//===----------------------------------------------------------------------===//
10+
11+
#ifndef _CUDA_STD___PSTL_REPLACE_COPY_H
12+
#define _CUDA_STD___PSTL_REPLACE_COPY_H
13+
14+
#include <cuda/std/detail/__config>
15+
16+
#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
17+
# pragma GCC system_header
18+
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
19+
# pragma clang system_header
20+
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
21+
# pragma system_header
22+
#endif // no system header
23+
24+
#if !_CCCL_COMPILER(NVRTC)
25+
26+
# include <cuda/std/__algorithm/replace_copy.h>
27+
# include <cuda/std/__concepts/concept_macros.h>
28+
# include <cuda/std/__execution/policy.h>
29+
# include <cuda/std/__iterator/concepts.h>
30+
# include <cuda/std/__iterator/iterator_traits.h>
31+
# include <cuda/std/__pstl/dispatch.h>
32+
# include <cuda/std/__type_traits/always_false.h>
33+
# include <cuda/std/__type_traits/is_comparable.h>
34+
# include <cuda/std/__type_traits/is_execution_policy.h>
35+
# include <cuda/std/__type_traits/is_nothrow_copy_constructible.h>
36+
# include <cuda/std/__utility/move.h>
37+
38+
# if _CCCL_HAS_BACKEND_CUDA()
39+
# include <cuda/std/__pstl/cuda/transform.h>
40+
# endif // _CCCL_HAS_BACKEND_CUDA()
41+
42+
# include <cuda/std/__cccl/prologue.h>
43+
44+
_CCCL_BEGIN_NAMESPACE_CUDA_STD
45+
46+
//! Transform functor implementing the replace_copy selection: yields the stored
//! replacement value for elements equal to the stored old value, and the element
//! itself (converted to _Tp) otherwise.
template <class _Tp>
struct __replace_copy_select
{
  _Tp __old_value_;
  _Tp __new_value_;

  _CCCL_HOST_API constexpr __replace_copy_select(const _Tp& __old_value,
                                                 const _Tp& __new_value) noexcept(is_nothrow_copy_constructible_v<_Tp>)
      : __old_value_(__old_value)
      , __new_value_(__new_value)
  {}

  template <class _Up>
  [[nodiscard]] _CCCL_DEVICE_API constexpr _Tp operator()(const _Up& __val) const
    noexcept(is_nothrow_copy_constructible_v<_Tp>)
  {
    if (__val == __old_value_)
    {
      return __new_value_;
    }
    return static_cast<_Tp>(__val);
  }
};
65+
66+
_CCCL_BEGIN_NAMESPACE_ARCH_DEPENDENT
67+
68+
//! @brief Parallel version of cuda::std::replace_copy: copies [__first, __last)
//! to __result, writing __new_value in place of every element that compares
//! equal to __old_value.
//! @param __policy Execution policy selecting the backend and resources.
//! @param __first, __last Input range (forward traversal required).
//! @param __result Start of the output range (forward traversal required).
//! @param __old_value Value to be replaced.
//! @param __new_value Replacement value.
//! @return Iterator one past the last element written.
_CCCL_TEMPLATE(class _Policy, class _InputIterator, class _OutputIterator, class _Tp = iter_value_t<_InputIterator>)
_CCCL_REQUIRES(__has_forward_traversal<_InputIterator> _CCCL_AND __has_forward_traversal<_OutputIterator> _CCCL_AND
                 is_execution_policy_v<_Policy>)
_CCCL_HOST_API _OutputIterator replace_copy(
  [[maybe_unused]] const _Policy& __policy,
  _InputIterator __first,
  _InputIterator __last,
  _OutputIterator __result,
  const _Tp& __old_value,
  const _Tp& __new_value)
{
  static_assert(__is_cpp17_equality_comparable_v<_Tp, iter_reference_t<_InputIterator>>,
                "cuda::std::replace_copy requires T to be comparable with iter_reference_t<InputIterator>");

  // Empty range: nothing to copy, skip backend dispatch entirely.
  if (__first == __last)
  {
    return __result;
  }

  // replace_copy is expressed as a transform with a select functor, so we
  // dispatch to the backend's __transform algorithm.
  [[maybe_unused]] auto __dispatch =
    ::cuda::std::execution::__pstl_select_dispatch<::cuda::std::execution::__pstl_algorithm::__transform, _Policy>();
  if constexpr (::cuda::std::execution::__pstl_can_dispatch<decltype(__dispatch)>)
  {
    return __dispatch(
      __policy,
      ::cuda::std::move(__first),
      ::cuda::std::move(__last),
      ::cuda::std::move(__result),
      __replace_copy_select{__old_value, __new_value});
  }
  else
  {
    // No backend available: fail at compile time. The serial fallback below is
    // never executed; it only keeps this branch well-formed.
    static_assert(__always_false_v<_Policy>, "Parallel cuda::std::replace_copy requires at least one selected backend");
    return ::cuda::std::replace_copy(
      ::cuda::std::move(__first), ::cuda::std::move(__last), ::cuda::std::move(__result), __old_value, __new_value);
  }
}
105+
106+
_CCCL_END_NAMESPACE_ARCH_DEPENDENT
107+
108+
_CCCL_END_NAMESPACE_CUDA_STD
109+
110+
# include <cuda/std/__cccl/epilogue.h>
111+
112+
#endif // !_CCCL_COMPILER(NVRTC)
113+
114+
#endif // _CUDA_STD___PSTL_REPLACE_COPY_H
Lines changed: 120 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,120 @@
1+
//===----------------------------------------------------------------------===//
2+
//
3+
// Part of libcu++, the C++ Standard Library for your entire system,
4+
// under the Apache License v2.0 with LLVM Exceptions.
5+
// See https://llvm.org/LICENSE.txt for license information.
6+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7+
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
8+
//
9+
//===----------------------------------------------------------------------===//
10+
11+
#ifndef _CUDA_STD___PSTL_REPLACE_COPY_IF_H
12+
#define _CUDA_STD___PSTL_REPLACE_COPY_IF_H
13+
14+
#include <cuda/std/detail/__config>
15+
16+
#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
17+
# pragma GCC system_header
18+
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
19+
# pragma clang system_header
20+
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
21+
# pragma system_header
22+
#endif // no system header
23+
24+
#if !_CCCL_COMPILER(NVRTC)
25+
26+
# include <cuda/std/__algorithm/replace_copy_if.h>
27+
# include <cuda/std/__concepts/concept_macros.h>
28+
# include <cuda/std/__execution/policy.h>
29+
# include <cuda/std/__functional/invoke.h>
30+
# include <cuda/std/__iterator/concepts.h>
31+
# include <cuda/std/__iterator/iterator_traits.h>
32+
# include <cuda/std/__pstl/dispatch.h>
33+
# include <cuda/std/__type_traits/always_false.h>
34+
# include <cuda/std/__type_traits/is_execution_policy.h>
35+
# include <cuda/std/__type_traits/is_nothrow_copy_constructible.h>
36+
# include <cuda/std/__type_traits/is_nothrow_move_constructible.h>
37+
# include <cuda/std/__utility/move.h>
38+
39+
# if _CCCL_HAS_BACKEND_CUDA()
40+
# include <cuda/std/__pstl/cuda/transform.h>
41+
# endif // _CCCL_HAS_BACKEND_CUDA()
42+
43+
# include <cuda/std/__cccl/prologue.h>
44+
45+
_CCCL_BEGIN_NAMESPACE_CUDA_STD
46+
//! Transform functor implementing the replace_copy_if selection: yields the
//! stored replacement value for elements satisfying the predicate, and the
//! element itself (converted to _Tp) otherwise.
template <class _UnaryPred, class _Tp>
struct __replace_copy_if_select
{
  _UnaryPred __pred_;
  _Tp __new_value_;

  _CCCL_HOST_API constexpr __replace_copy_if_select(_UnaryPred __pred, const _Tp& __new_value) noexcept(
    is_nothrow_move_constructible_v<_UnaryPred> && is_nothrow_copy_constructible_v<_Tp>)
      : __pred_(__pred)
      , __new_value_(__new_value)
  {}

  template <class _Up>
  [[nodiscard]] _CCCL_DEVICE_API constexpr _Tp operator()(const _Up& __val) const
    noexcept(is_nothrow_invocable_v<const _UnaryPred&, const _Up&> && is_nothrow_copy_constructible_v<_Tp>)
  {
    if (::cuda::std::invoke(__pred_, __val))
    {
      return __new_value_;
    }
    return static_cast<_Tp>(__val);
  }
};
65+
66+
_CCCL_BEGIN_NAMESPACE_ARCH_DEPENDENT
67+
68+
//! @brief Parallel version of cuda::std::replace_copy_if: copies
//! [__first, __last) to __result, writing __new_value in place of every
//! element for which __pred returns true.
//! @param __policy Execution policy selecting the backend and resources.
//! @param __first, __last Input range (forward traversal required).
//! @param __result Start of the output range (forward traversal required).
//! @param __pred Unary predicate selecting the elements to replace.
//! @param __new_value Replacement value.
//! @return Iterator one past the last element written.
_CCCL_TEMPLATE(
  class _Policy, class _InputIterator, class _OutputIterator, class _UnaryPred, class _Tp = iter_value_t<_InputIterator>)
_CCCL_REQUIRES(__has_forward_traversal<_InputIterator> _CCCL_AND __has_forward_traversal<_OutputIterator> _CCCL_AND
                 is_execution_policy_v<_Policy>)
_CCCL_HOST_API _OutputIterator replace_copy_if(
  [[maybe_unused]] const _Policy& __policy,
  _InputIterator __first,
  _InputIterator __last,
  _OutputIterator __result,
  _UnaryPred __pred,
  const _Tp& __new_value)
{
  static_assert(indirect_unary_predicate<_UnaryPred, _InputIterator>,
                "cuda::std::replace_copy_if: UnaryPred must satisfy indirect_unary_predicate<InputIterator>");

  // Empty range: nothing to copy, skip backend dispatch entirely.
  if (__first == __last)
  {
    return __result;
  }

  // replace_copy_if is expressed as a transform with a select functor, so we
  // dispatch to the backend's __transform algorithm.
  [[maybe_unused]] auto __dispatch =
    ::cuda::std::execution::__pstl_select_dispatch<::cuda::std::execution::__pstl_algorithm::__transform, _Policy>();
  if constexpr (::cuda::std::execution::__pstl_can_dispatch<decltype(__dispatch)>)
  {
    return __dispatch(
      __policy,
      ::cuda::std::move(__first),
      ::cuda::std::move(__last),
      ::cuda::std::move(__result),
      __replace_copy_if_select{::cuda::std::move(__pred), __new_value});
  }
  else
  {
    // No backend available: fail at compile time. The serial fallback below is
    // never executed; it only keeps this branch well-formed.
    static_assert(__always_false_v<_Policy>,
                  "Parallel cuda::std::replace_copy_if requires at least one selected backend");
    return ::cuda::std::replace_copy_if(
      ::cuda::std::move(__first),
      ::cuda::std::move(__last),
      ::cuda::std::move(__result),
      ::cuda::std::move(__pred),
      __new_value);
  }
}
111+
112+
_CCCL_END_NAMESPACE_ARCH_DEPENDENT
113+
114+
_CCCL_END_NAMESPACE_CUDA_STD
115+
116+
# include <cuda/std/__cccl/epilogue.h>
117+
118+
#endif // !_CCCL_COMPILER(NVRTC)
119+
120+
#endif // _CUDA_STD___PSTL_REPLACE_COPY_IF_H

libcudacxx/include/cuda/std/__pstl_algorithm

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,8 @@
2525
#include <cuda/std/__pstl/for_each_n.h>
2626
#include <cuda/std/__pstl/reduce.h>
2727
#include <cuda/std/__pstl/replace.h>
28+
#include <cuda/std/__pstl/replace_copy.h>
29+
#include <cuda/std/__pstl/replace_copy_if.h>
2830
#include <cuda/std/__pstl/replace_if.h>
2931
#include <cuda/std/__pstl/transform.h>
3032

0 commit comments

Comments
 (0)