
Commit da4cba1

Implement parallel cuda::std::count

This reuses the `cuda::std::reduce` functionality to implement

* `cuda::std::count`
* `cuda::std::count_if`

It provides tests and benchmarks similar to Thrust, along with some boilerplate for libcu++. The functionality is not yet publicly available and is implemented in a private internal header.

Fixes #7367

1 parent 52b7d36 commit da4cba1
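For orientation, here is a minimal usage sketch modeled on the benchmarks in this commit. The policy spelling (cuda::execution::__cub_par_unseq) is internal and experimental, and the benchmarks additionally attach a memory resource and stream to it, so treat this as illustrative only:

// Illustrative sketch, not a supported public API: per the commit message,
// the functionality lives in a private internal header.
#include <thrust/device_vector.h>

#include <cuda/std/__pstl_algorithm>

int main()
{
  thrust::device_vector<int> in(1 << 20, 42);

  // Internal/experimental policy; the benchmarks below also call
  // .with_memory_resource(...) and .with_stream(...) on it.
  auto policy = cuda::execution::__cub_par_unseq;

  // Counts occurrences of 42 with a parallel reduction on the device.
  auto n = cuda::std::count(policy, in.begin(), in.end(), 42);

  return n == (1 << 20) ? 0 : 1;
}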

File tree

9 files changed: +564 -0 lines changed
Lines changed: 41 additions & 0 deletions
@@ -0,0 +1,41 @@

//===----------------------------------------------------------------------===//
//
// Part of CUDA Experimental in CUDA C++ Core Libraries,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

#include <thrust/device_vector.h>

#include <cuda/memory_pool>
#include <cuda/std/__pstl_algorithm>
#include <cuda/stream_ref>

#include "nvbench_helper.cuh"

template <typename T>
static void basic(nvbench::state& state, nvbench::type_list<T>)
{
  const auto elements = static_cast<std::size_t>(state.get_int64("Elements"));

  thrust::device_vector<T> in = generate(elements);

  state.add_element_count(elements);
  state.add_global_memory_reads<T>(elements);
  state.add_global_memory_writes<T>(1);

  caching_allocator_t alloc{};
  auto policy = cuda::execution::__cub_par_unseq.with_memory_resource(alloc);
  state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
    do_not_optimize(
      cuda::std::count(policy.with_stream(launch.get_stream().get_stream()), in.begin(), in.end(), T{42}));
  });
}

NVBENCH_BENCH_TYPES(basic, NVBENCH_TYPE_AXES(fundamental_types))
  .set_name("base")
  .set_type_axes_names({"T{ct}"})
  .add_int64_power_of_two_axis("Elements", nvbench::range(16, 28, 4));
Lines changed: 50 additions & 0 deletions
@@ -0,0 +1,50 @@

//===----------------------------------------------------------------------===//
//
// Part of CUDA Experimental in CUDA C++ Core Libraries,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

#include <thrust/device_vector.h>

#include <cuda/memory_pool>
#include <cuda/std/__pstl_algorithm>
#include <cuda/stream_ref>

#include "nvbench_helper.cuh"

struct equal_to_42
{
  template <class T>
  __device__ constexpr bool operator()(const T& val) const noexcept
  {
    return val == 42;
  }
};

template <typename T>
static void basic(nvbench::state& state, nvbench::type_list<T>)
{
  const auto elements = static_cast<std::size_t>(state.get_int64("Elements"));

  thrust::device_vector<T> in = generate(elements);

  state.add_element_count(elements);
  state.add_global_memory_reads<T>(elements);
  state.add_global_memory_writes<T>(1);

  caching_allocator_t alloc{};
  auto policy = cuda::execution::__cub_par_unseq.with_memory_resource(alloc);
  state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
    do_not_optimize(
      cuda::std::count_if(policy.with_stream(launch.get_stream().get_stream()), in.begin(), in.end(), equal_to_42{}));
  });
}

NVBENCH_BENCH_TYPES(basic, NVBENCH_TYPE_AXES(fundamental_types))
  .set_name("base")
  .set_type_axes_names({"T{ct}"})
  .add_int64_power_of_two_axis("Elements", nvbench::range(16, 28, 4));
Lines changed: 102 additions & 0 deletions
@@ -0,0 +1,102 @@

//===----------------------------------------------------------------------===//
//
// Part of libcu++, the C++ Standard Library for your entire system,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

#ifndef _CUDA_STD___PSTL_COUNT_H
#define _CUDA_STD___PSTL_COUNT_H

#include <cuda/std/detail/__config>

#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
#  pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
#  pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
#  pragma system_header
#endif // no system header

#if !_CCCL_COMPILER(NVRTC)

#  include <cuda/__iterator/transform_iterator.h>
#  include <cuda/std/__algorithm/count.h>
#  include <cuda/std/__concepts/concept_macros.h>
#  include <cuda/std/__execution/policy.h>
#  include <cuda/std/__functional/operations.h>
#  include <cuda/std/__iterator/distance.h>
#  include <cuda/std/__iterator/incrementable_traits.h>
#  include <cuda/std/__iterator/iterator_traits.h>
#  include <cuda/std/__pstl/dispatch.h>
#  include <cuda/std/__type_traits/always_false.h>
#  include <cuda/std/__type_traits/is_comparable.h>
#  include <cuda/std/__type_traits/is_execution_policy.h>
#  include <cuda/std/__type_traits/is_nothrow_copy_constructible.h>
#  include <cuda/std/__utility/move.h>

#  if _CCCL_HAS_BACKEND_CUDA()
#    include <cuda/std/__pstl/cuda/reduce.h>
#  endif // _CCCL_HAS_BACKEND_CUDA()

#  include <cuda/std/__cccl/prologue.h>

_CCCL_BEGIN_NAMESPACE_CUDA_STD

template <class _Tp>
struct __count_compare_eq
{
  _Tp __val_;

  _CCCL_API constexpr __count_compare_eq(const _Tp& __val) noexcept(is_nothrow_copy_constructible_v<_Tp>)
      : __val_(__val)
  {}

  template <class _Up>
  [[nodiscard]] _CCCL_API _CCCL_FORCEINLINE constexpr int operator()(const _Up& __rhs) const
    noexcept(__is_cpp17_nothrow_equality_comparable_v<_Tp, _Up>)
  {
    return static_cast<bool>(__val_ == __rhs) ? 1 : 0;
  }
};

_CCCL_BEGIN_NAMESPACE_ARCH_DEPENDENT

_CCCL_TEMPLATE(class _Policy, class _Iter, class _Tp)
_CCCL_REQUIRES(__has_forward_traversal<_Iter> _CCCL_AND is_execution_policy_v<_Policy>)
[[nodiscard]] _CCCL_HOST_API iter_difference_t<_Iter>
count([[maybe_unused]] const _Policy& __policy, _Iter __first, _Iter __last, const _Tp& __value)
{
  static_assert(__is_cpp17_equality_comparable_v<iter_reference_t<_Iter>, _Tp>,
                "cuda::std::count: T must be equality comparable to Iter's value type.");
  [[maybe_unused]] auto __dispatch =
    ::cuda::std::execution::__pstl_select_dispatch<::cuda::std::execution::__pstl_algorithm::__reduce, _Policy>();
  if constexpr (::cuda::std::execution::__pstl_can_dispatch<decltype(__dispatch)>)
  {
    const auto __count = ::cuda::std::distance(__first, __last);
    return __dispatch(
      __policy,
      ::cuda::transform_iterator{::cuda::std::move(__first), __count_compare_eq{__value}},
      __count,
      iter_difference_t<_Iter>{0},
      ::cuda::std::plus<>{});
  }
  else
  {
    static_assert(__always_false_v<_Policy>, "Parallel cuda::std::count requires at least one selected backend");
    return ::cuda::std::count(::cuda::std::move(__first), ::cuda::std::move(__last), __value);
  }
}

_CCCL_END_NAMESPACE_ARCH_DEPENDENT

_CCCL_END_NAMESPACE_CUDA_STD

#  include <cuda/std/__cccl/epilogue.h>

#endif // !_CCCL_COMPILER(NVRTC)

#endif // _CUDA_STD___PSTL_COUNT_H
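The dispatch above expresses count as a reduce: __count_compare_eq maps every element to 0 or 1, a cuda::transform_iterator applies that mapping lazily, and plus<> sums the results starting from iter_difference_t<_Iter>{0}. A host-only sketch of the same shape in standard C++, purely to illustrate the mapping (not the actual dispatch path):

#include <cstddef>
#include <functional>
#include <numeric>
#include <vector>

// count(first, last, value) as a sum over a 0/1 transform, mirroring the
// transform_iterator + plus<> dispatch above.
template <class It, class T>
std::ptrdiff_t count_via_reduce(It first, It last, const T& value)
{
  return std::transform_reduce(
    first,
    last,
    std::ptrdiff_t{0},                                  // like iter_difference_t<_Iter>{0}
    std::plus<>{},                                      // like ::cuda::std::plus<>{}
    [&](const auto& x) { return x == value ? 1 : 0; }); // like __count_compare_eq
}

int main()
{
  std::vector<int> v{1, 42, 3, 42, 5};
  return count_via_reduce(v.begin(), v.end(), 42) == 2 ? 0 : 1;
}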
Lines changed: 103 additions & 0 deletions
@@ -0,0 +1,103 @@

//===----------------------------------------------------------------------===//
//
// Part of libcu++, the C++ Standard Library for your entire system,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

#ifndef _CUDA_STD___PSTL_COUNT_IF_H
#define _CUDA_STD___PSTL_COUNT_IF_H

#include <cuda/std/detail/__config>

#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
#  pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
#  pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
#  pragma system_header
#endif // no system header

#if !_CCCL_COMPILER(NVRTC)

#  include <cuda/__iterator/transform_iterator.h>
#  include <cuda/std/__algorithm/count_if.h>
#  include <cuda/std/__concepts/concept_macros.h>
#  include <cuda/std/__execution/policy.h>
#  include <cuda/std/__functional/operations.h>
#  include <cuda/std/__iterator/concepts.h>
#  include <cuda/std/__iterator/distance.h>
#  include <cuda/std/__iterator/incrementable_traits.h>
#  include <cuda/std/__iterator/iterator_traits.h>
#  include <cuda/std/__pstl/dispatch.h>
#  include <cuda/std/__type_traits/always_false.h>
#  include <cuda/std/__type_traits/is_callable.h>
#  include <cuda/std/__type_traits/is_comparable.h>
#  include <cuda/std/__type_traits/is_execution_policy.h>
#  include <cuda/std/__utility/move.h>

#  if _CCCL_HAS_BACKEND_CUDA()
#    include <cuda/std/__pstl/cuda/reduce.h>
#  endif // _CCCL_HAS_BACKEND_CUDA()

#  include <cuda/std/__cccl/prologue.h>

_CCCL_BEGIN_NAMESPACE_CUDA_STD

template <class _UnaryPredicate>
struct __count_if_compare_eq
{
  _UnaryPredicate __pred_;

  _CCCL_API constexpr __count_if_compare_eq(_UnaryPredicate __pred)
      : __pred_(__pred)
  {}

  template <class _Up>
  [[nodiscard]] _CCCL_API _CCCL_FORCEINLINE constexpr int operator()(const _Up& __rhs) const
    noexcept(__is_nothrow_callable_v<_UnaryPredicate&, const _Up&>)
  {
    return static_cast<bool>(__pred_(__rhs)) ? 1 : 0;
  }
};

_CCCL_BEGIN_NAMESPACE_ARCH_DEPENDENT

_CCCL_TEMPLATE(class _Policy, class _Iter, class _UnaryPredicate)
_CCCL_REQUIRES(__has_forward_traversal<_Iter> _CCCL_AND is_execution_policy_v<_Policy>)
[[nodiscard]] _CCCL_HOST_API iter_difference_t<_Iter>
count_if([[maybe_unused]] const _Policy& __policy, _Iter __first, _Iter __last, _UnaryPredicate __pred)
{
  static_assert(indirect_unary_predicate<_UnaryPredicate, _Iter>,
                "cuda::std::count_if: UnaryPred must satisfy indirect_unary_predicate<UnaryPred, Iter>");
  [[maybe_unused]] auto __dispatch =
    ::cuda::std::execution::__pstl_select_dispatch<::cuda::std::execution::__pstl_algorithm::__reduce, _Policy>();
  if constexpr (::cuda::std::execution::__pstl_can_dispatch<decltype(__dispatch)>)
  {
    const auto __count = ::cuda::std::distance(__first, __last);
    return __dispatch(
      __policy,
      ::cuda::transform_iterator{::cuda::std::move(__first), __count_if_compare_eq{::cuda::std::move(__pred)}},
      __count,
      iter_difference_t<_Iter>{0},
      ::cuda::std::plus<>{});
  }
  else
  {
    static_assert(__always_false_v<_Policy>, "Parallel cuda::std::count_if requires at least one selected backend");
    return ::cuda::std::count_if(::cuda::std::move(__first), ::cuda::std::move(__last), ::cuda::std::move(__pred));
  }
}

_CCCL_END_NAMESPACE_ARCH_DEPENDENT

_CCCL_END_NAMESPACE_CUDA_STD

#  include <cuda/std/__cccl/epilogue.h>

#endif // !_CCCL_COMPILER(NVRTC)

#endif // _CUDA_STD___PSTL_COUNT_IF_H
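count_if follows the same pattern: __count_if_compare_eq wraps the user predicate and maps each element to an int 0 or 1, so the plus<> reduction counts hits. A hedged usage sketch (internal/experimental policy spelling; the benchmarks in this commit additionally attach a memory resource and stream, which real callers may also need and which are omitted here):

// Illustrative sketch only; not a supported public API yet.
#include <thrust/device_vector.h>

#include <cuda/std/__pstl_algorithm>

struct is_even
{
  template <class T>
  __device__ constexpr bool operator()(const T& v) const noexcept
  {
    return v % 2 == 0;
  }
};

int main()
{
  thrust::device_vector<int> in(1024, 2); // all elements even

  auto policy = cuda::execution::__cub_par_unseq;
  auto n      = cuda::std::count_if(policy, in.begin(), in.end(), is_even{});

  return n == 1024 ? 0 : 1;
}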

libcudacxx/include/cuda/std/__pstl_algorithm

Lines changed: 2 additions & 0 deletions
@@ -21,6 +21,8 @@
 # pragma system_header
 #endif // no system header
 
+#include <cuda/std/__pstl/count.h>
+#include <cuda/std/__pstl/count_if.h>
 #include <cuda/std/__pstl/for_each.h>
 #include <cuda/std/__pstl/for_each_n.h>
 #include <cuda/std/__pstl/reduce.h>
Lines changed: 57 additions & 0 deletions
@@ -0,0 +1,57 @@

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES
//
//===----------------------------------------------------------------------===//

// UNSUPPORTED: nvrtc
// XFAIL: true

// template<class ExecutionPolicy, class ForwardIterator, class T>
//   typename iterator_traits<ForwardIterator>::difference_type
//     count(ExecutionPolicy&& exec, ForwardIterator first, ForwardIterator last, const T& value);

#include <cuda/std/__pstl_algorithm>
#include <cuda/std/algorithm>
#include <cuda/std/cassert>
#include <cuda/std/numeric> // for cuda::std::iota

#include "test_execution_policies.h"
#include "test_iterators.h"
#include "test_macros.h"

EXECUTION_POLICY_SFINAE_TEST(count);

static_assert(!sfinae_test_count<int, int*, int*, int>);
static_assert(sfinae_test_count<cuda::std::execution::parallel_policy, int*, int*, int>);

int data[100];

template <class Iter>
struct Test
{
  template <class Policy>
  void operator()(Policy&& policy)
  {
    int sizes[] = {0, 1, 2, 100};
    for (auto size : sizes)
    {
      cuda::std::iota(data, data + size, 0);
      const auto res = cuda::std::count(policy, Iter(data), Iter(data + size), 42);
      // 42 appears exactly once in [0, size) iff size > 42
      assert(res == (size > 42 ? 1 : 0));
    }
  }
};

__host__ void test()
{
  types::for_each(types::forward_iterator_list<int*>{}, TestIteratorWithPolicies<Test>{});
}

int main(int, char**)
{
  NV_IF_TARGET(NV_IS_HOST, test();)

  return 0;
}
