
Commit da4cba1

Implement parallel cuda::std::count

This reuses the `cuda::std::reduce` functionality to implement

* `cuda::std::count`
* `cuda::std::count_if`

It provides tests and benchmarks similar to Thrust, along with some boilerplate for libcu++. The functionality is not yet publicly available and is implemented in a private internal header.

Fixes #7367

1 parent 52b7d36 commit da4cba1
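For orientation, here is a minimal usage sketch modeled on the benchmarks in this commit. The policy spelling (cuda::execution::__cub_par_unseq) is internal and experimental, and the benchmarks additionally attach a memory resource and stream to it, so treat this as illustrative only:

// Illustrative sketch, not a supported public API: per the commit message,
// the functionality lives in a private internal header.
#include <thrust/device_vector.h>

#include <cuda/std/__pstl_algorithm>

int main()
{
  thrust::device_vector<int> in(1 << 20, 42);

  // Internal/experimental policy; the benchmarks below also call
  // .with_memory_resource(...) and .with_stream(...) on it.
  auto policy = cuda::execution::__cub_par_unseq;

  // Counts occurrences of 42 with a parallel reduction on the device.
  auto n = cuda::std::count(policy, in.begin(), in.end(), 42);

  return n == (1 << 20) ? 0 : 1;
}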

File tree

9 files changed: +564 -0 lines changed
Lines changed: 41 additions & 0 deletions
@@ -0,0 +1,41 @@

//===----------------------------------------------------------------------===//
//
// Part of CUDA Experimental in CUDA C++ Core Libraries,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

#include <thrust/device_vector.h>

#include <cuda/memory_pool>
#include <cuda/std/__pstl_algorithm>
#include <cuda/stream_ref>

#include "nvbench_helper.cuh"

template <typename T>
static void basic(nvbench::state& state, nvbench::type_list<T>)
{
  const auto elements = static_cast<std::size_t>(state.get_int64("Elements"));

  thrust::device_vector<T> in = generate(elements);

  state.add_element_count(elements);
  state.add_global_memory_reads<T>(elements);
  state.add_global_memory_writes<T>(1);

  caching_allocator_t alloc{};
  auto policy = cuda::execution::__cub_par_unseq.with_memory_resource(alloc);
  state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
    do_not_optimize(
      cuda::std::count(policy.with_stream(launch.get_stream().get_stream()), in.begin(), in.end(), T{42}));
  });
}

NVBENCH_BENCH_TYPES(basic, NVBENCH_TYPE_AXES(fundamental_types))
  .set_name("base")
  .set_type_axes_names({"T{ct}"})
  .add_int64_power_of_two_axis("Elements", nvbench::range(16, 28, 4));
Lines changed: 50 additions & 0 deletions
@@ -0,0 +1,50 @@

//===----------------------------------------------------------------------===//
//
// Part of CUDA Experimental in CUDA C++ Core Libraries,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

#include <thrust/device_vector.h>

#include <cuda/memory_pool>
#include <cuda/std/__pstl_algorithm>
#include <cuda/stream_ref>

#include "nvbench_helper.cuh"

struct equal_to_42
{
  template <class T>
  __device__ constexpr bool operator()(const T& val) const noexcept
  {
    return val == 42;
  }
};

template <typename T>
static void basic(nvbench::state& state, nvbench::type_list<T>)
{
  const auto elements = static_cast<std::size_t>(state.get_int64("Elements"));

  thrust::device_vector<T> in = generate(elements);

  state.add_element_count(elements);
  state.add_global_memory_reads<T>(elements);
  state.add_global_memory_writes<T>(1);

  caching_allocator_t alloc{};
  auto policy = cuda::execution::__cub_par_unseq.with_memory_resource(alloc);
  state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
    do_not_optimize(
      cuda::std::count_if(policy.with_stream(launch.get_stream().get_stream()), in.begin(), in.end(), equal_to_42{}));
  });
}

NVBENCH_BENCH_TYPES(basic, NVBENCH_TYPE_AXES(fundamental_types))
  .set_name("base")
  .set_type_axes_names({"T{ct}"})
  .add_int64_power_of_two_axis("Elements", nvbench::range(16, 28, 4));
Lines changed: 102 additions & 0 deletions
@@ -0,0 +1,102 @@

//===----------------------------------------------------------------------===//
//
// Part of libcu++, the C++ Standard Library for your entire system,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

#ifndef _CUDA_STD___PSTL_COUNT_H
#define _CUDA_STD___PSTL_COUNT_H

#include <cuda/std/detail/__config>

#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
#  pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
#  pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
#  pragma system_header
#endif // no system header

#if !_CCCL_COMPILER(NVRTC)

#  include <cuda/__iterator/transform_iterator.h>
#  include <cuda/std/__algorithm/count.h>
#  include <cuda/std/__concepts/concept_macros.h>
#  include <cuda/std/__execution/policy.h>
#  include <cuda/std/__functional/operations.h>
#  include <cuda/std/__iterator/distance.h>
#  include <cuda/std/__iterator/incrementable_traits.h>
#  include <cuda/std/__iterator/iterator_traits.h>
#  include <cuda/std/__pstl/dispatch.h>
#  include <cuda/std/__type_traits/always_false.h>
#  include <cuda/std/__type_traits/is_comparable.h>
#  include <cuda/std/__type_traits/is_execution_policy.h>
#  include <cuda/std/__type_traits/is_nothrow_copy_constructible.h>
#  include <cuda/std/__utility/move.h>

#  if _CCCL_HAS_BACKEND_CUDA()
#    include <cuda/std/__pstl/cuda/reduce.h>
#  endif // _CCCL_HAS_BACKEND_CUDA()

#  include <cuda/std/__cccl/prologue.h>

_CCCL_BEGIN_NAMESPACE_CUDA_STD

template <class _Tp>
struct __count_compare_eq
{
  _Tp __val_;

  _CCCL_API constexpr __count_compare_eq(const _Tp& __val) noexcept(is_nothrow_copy_constructible_v<_Tp>)
      : __val_(__val)
  {}

  template <class _Up>
  [[nodiscard]] _CCCL_API _CCCL_FORCEINLINE constexpr int operator()(const _Up& __rhs) const
    noexcept(__is_cpp17_nothrow_equality_comparable_v<_Tp, _Up>)
  {
    return static_cast<bool>(__val_ == __rhs) ? 1 : 0;
  }
};

_CCCL_BEGIN_NAMESPACE_ARCH_DEPENDENT

_CCCL_TEMPLATE(class _Policy, class _Iter, class _Tp)
_CCCL_REQUIRES(__has_forward_traversal<_Iter> _CCCL_AND is_execution_policy_v<_Policy>)
[[nodiscard]] _CCCL_HOST_API iter_difference_t<_Iter>
count([[maybe_unused]] const _Policy& __policy, _Iter __first, _Iter __last, const _Tp& __value)
{
  static_assert(__is_cpp17_equality_comparable_v<iter_reference_t<_Iter>, _Tp>,
                "cuda::std::count: T must be equality comparable to Iter's value type.");
  [[maybe_unused]] auto __dispatch =
    ::cuda::std::execution::__pstl_select_dispatch<::cuda::std::execution::__pstl_algorithm::__reduce, _Policy>();
  if constexpr (::cuda::std::execution::__pstl_can_dispatch<decltype(__dispatch)>)
  {
    const auto __count = ::cuda::std::distance(__first, __last);
    return __dispatch(
      __policy,
      ::cuda::transform_iterator{::cuda::std::move(__first), __count_compare_eq{__value}},
      __count,
      iter_difference_t<_Iter>{0},
      ::cuda::std::plus<>{});
  }
  else
  {
    static_assert(__always_false_v<_Policy>, "Parallel cuda::std::count requires at least one selected backend");
    return ::cuda::std::count(::cuda::std::move(__first), ::cuda::std::move(__last), __value);
  }
}

_CCCL_END_NAMESPACE_ARCH_DEPENDENT

_CCCL_END_NAMESPACE_CUDA_STD

#  include <cuda/std/__cccl/epilogue.h>

#endif // !_CCCL_COMPILER(NVRTC)

#endif // _CUDA_STD___PSTL_COUNT_H
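The dispatch above expresses count as a reduce: __count_compare_eq maps every element to 0 or 1, a cuda::transform_iterator applies that mapping lazily, and plus<> sums the results starting from iter_difference_t<_Iter>{0}. A host-only sketch of the same shape in standard C++, purely to illustrate the mapping (not the actual dispatch path):

#include <cstddef>
#include <functional>
#include <numeric>
#include <vector>

// count(first, last, value) as a sum over a 0/1 transform, mirroring the
// transform_iterator + plus<> dispatch above.
template <class It, class T>
std::ptrdiff_t count_via_reduce(It first, It last, const T& value)
{
  return std::transform_reduce(
    first,
    last,
    std::ptrdiff_t{0},                                  // like iter_difference_t<_Iter>{0}
    std::plus<>{},                                      // like ::cuda::std::plus<>{}
    [&](const auto& x) { return x == value ? 1 : 0; }); // like __count_compare_eq
}

int main()
{
  std::vector<int> v{1, 42, 3, 42, 5};
  return count_via_reduce(v.begin(), v.end(), 42) == 2 ? 0 : 1;
}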
Lines changed: 103 additions & 0 deletions
@@ -0,0 +1,103 @@

//===----------------------------------------------------------------------===//
//
// Part of libcu++, the C++ Standard Library for your entire system,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

#ifndef _CUDA_STD___PSTL_COUNT_IF_H
#define _CUDA_STD___PSTL_COUNT_IF_H

#include <cuda/std/detail/__config>

#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
#  pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
#  pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
#  pragma system_header
#endif // no system header

#if !_CCCL_COMPILER(NVRTC)

#  include <cuda/__iterator/transform_iterator.h>
#  include <cuda/std/__algorithm/count_if.h>
#  include <cuda/std/__concepts/concept_macros.h>
#  include <cuda/std/__execution/policy.h>
#  include <cuda/std/__functional/operations.h>
#  include <cuda/std/__iterator/concepts.h>
#  include <cuda/std/__iterator/distance.h>
#  include <cuda/std/__iterator/incrementable_traits.h>
#  include <cuda/std/__iterator/iterator_traits.h>
#  include <cuda/std/__pstl/dispatch.h>
#  include <cuda/std/__type_traits/always_false.h>
#  include <cuda/std/__type_traits/is_callable.h>
#  include <cuda/std/__type_traits/is_comparable.h>
#  include <cuda/std/__type_traits/is_execution_policy.h>
#  include <cuda/std/__utility/move.h>

#  if _CCCL_HAS_BACKEND_CUDA()
#    include <cuda/std/__pstl/cuda/reduce.h>
#  endif // _CCCL_HAS_BACKEND_CUDA()

#  include <cuda/std/__cccl/prologue.h>

_CCCL_BEGIN_NAMESPACE_CUDA_STD

template <class _UnaryPredicate>
struct __count_if_compare_eq
{
  _UnaryPredicate __pred_;

  _CCCL_API constexpr __count_if_compare_eq(_UnaryPredicate __pred)
      : __pred_(__pred)
  {}

  template <class _Up>
  [[nodiscard]] _CCCL_API _CCCL_FORCEINLINE constexpr int operator()(const _Up& __rhs) const
    noexcept(__is_nothrow_callable_v<_UnaryPredicate&, const _Up&>)
  {
    return static_cast<bool>(__pred_(__rhs)) ? 1 : 0;
  }
};

_CCCL_BEGIN_NAMESPACE_ARCH_DEPENDENT

_CCCL_TEMPLATE(class _Policy, class _Iter, class _UnaryPredicate)
_CCCL_REQUIRES(__has_forward_traversal<_Iter> _CCCL_AND is_execution_policy_v<_Policy>)
[[nodiscard]] _CCCL_HOST_API iter_difference_t<_Iter>
count_if([[maybe_unused]] const _Policy& __policy, _Iter __first, _Iter __last, _UnaryPredicate __pred)
{
  static_assert(indirect_unary_predicate<_UnaryPredicate, _Iter>,
                "cuda::std::count_if: UnaryPred must satisfy indirect_unary_predicate<UnaryPred, Iter>");
  [[maybe_unused]] auto __dispatch =
    ::cuda::std::execution::__pstl_select_dispatch<::cuda::std::execution::__pstl_algorithm::__reduce, _Policy>();
  if constexpr (::cuda::std::execution::__pstl_can_dispatch<decltype(__dispatch)>)
  {
    const auto __count = ::cuda::std::distance(__first, __last);
    return __dispatch(
      __policy,
      ::cuda::transform_iterator{::cuda::std::move(__first), __count_if_compare_eq{::cuda::std::move(__pred)}},
      __count,
      iter_difference_t<_Iter>{0},
      ::cuda::std::plus<>{});
  }
  else
  {
    static_assert(__always_false_v<_Policy>, "Parallel cuda::std::count_if requires at least one selected backend");
    return ::cuda::std::count_if(::cuda::std::move(__first), ::cuda::std::move(__last), ::cuda::std::move(__pred));
  }
}

_CCCL_END_NAMESPACE_ARCH_DEPENDENT

_CCCL_END_NAMESPACE_CUDA_STD

#  include <cuda/std/__cccl/epilogue.h>

#endif // !_CCCL_COMPILER(NVRTC)

#endif // _CUDA_STD___PSTL_COUNT_IF_H
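count_if follows the same pattern: __count_if_compare_eq wraps the user predicate and maps each element to an int 0 or 1, so the plus<> reduction counts hits. A hedged usage sketch (internal/experimental policy spelling; the benchmarks in this commit additionally attach a memory resource and stream, which real callers may also need and which are omitted here):

// Illustrative sketch only; not a supported public API yet.
#include <thrust/device_vector.h>

#include <cuda/std/__pstl_algorithm>

struct is_even
{
  template <class T>
  __device__ constexpr bool operator()(const T& v) const noexcept
  {
    return v % 2 == 0;
  }
};

int main()
{
  thrust::device_vector<int> in(1024, 2); // all elements even

  auto policy = cuda::execution::__cub_par_unseq;
  auto n      = cuda::std::count_if(policy, in.begin(), in.end(), is_even{});

  return n == 1024 ? 0 : 1;
}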

libcudacxx/include/cuda/std/__pstl_algorithm

Lines changed: 2 additions & 0 deletions
@@ -21,6 +21,8 @@
 # pragma system_header
 #endif // no system header
 
+#include <cuda/std/__pstl/count.h>
+#include <cuda/std/__pstl/count_if.h>
 #include <cuda/std/__pstl/for_each.h>
 #include <cuda/std/__pstl/for_each_n.h>
 #include <cuda/std/__pstl/reduce.h>
Lines changed: 57 additions & 0 deletions
@@ -0,0 +1,57 @@

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES
//
//===----------------------------------------------------------------------===//

// UNSUPPORTED: nvrtc
// XFAIL: true

// template<class ExecutionPolicy, class ForwardIterator, class T>
//   typename iterator_traits<ForwardIterator>::difference_type
//     count(ExecutionPolicy&& exec, ForwardIterator first, ForwardIterator last, const T& value);

#include <cuda/std/__pstl_algorithm>
#include <cuda/std/algorithm>
#include <cuda/std/cassert>
#include <cuda/std/numeric> // for cuda::std::iota

#include "test_execution_policies.h"
#include "test_iterators.h"
#include "test_macros.h"

EXECUTION_POLICY_SFINAE_TEST(count);

static_assert(!sfinae_test_count<int, int*, int*, int>);
static_assert(sfinae_test_count<cuda::std::execution::parallel_policy, int*, int*, int>);

int data[100];

template <class Iter>
struct Test
{
  template <class Policy>
  void operator()(Policy&& policy)
  {
    int sizes[] = {0, 1, 2, 100};
    for (auto size : sizes)
    {
      cuda::std::iota(data, data + size, 0);
      const auto res = cuda::std::count(policy, Iter(data), Iter(data + size), 42);
      // 42 appears exactly once in [0, size) iff size > 42
      assert(res == (size > 42 ? 1 : 0));
    }
  }
};

__host__ void test()
{
  types::for_each(types::forward_iterator_list<int*>{}, TestIteratorWithPolicies<Test>{});
}

int main(int, char**)
{
  NV_IF_TARGET(NV_IS_HOST, test();)

  return 0;
}
