
Commit 1962a21

Move tuning header
1 parent 5d63b36 commit 1962a21

4 files changed: +110 −84 lines


cub/cub/device/dispatch/dispatch_reduce_deterministic.cuh

Lines changed: 1 addition & 0 deletions

@@ -22,6 +22,7 @@
 #include <cub/agent/agent_reduce.cuh>
 #include <cub/detail/rfa.cuh>
 #include <cub/device/dispatch/dispatch_reduce.cuh>
+#include <cub/device/dispatch/tuning/tuning_reduce_deterministic.cuh>
 #include <cub/grid/grid_even_share.cuh>
 #include <cub/iterator/arg_index_input_iterator.cuh>
 #include <cub/thread/thread_operators.cuh>

cub/cub/device/dispatch/kernels/kernel_reduce.cuh

Lines changed: 1 addition & 0 deletions

@@ -16,6 +16,7 @@
 #include <cub/agent/agent_reduce.cuh>
 #include <cub/detail/rfa.cuh>
 #include <cub/device/dispatch/tuning/tuning_reduce.cuh>
+#include <cub/device/dispatch/tuning/tuning_reduce_deterministic.cuh>
 #include <cub/grid/grid_even_share.cuh>

 #include <thrust/type_traits/unwrap_contiguous_iterator.h>

cub/cub/device/dispatch/tuning/tuning_reduce.cuh

Lines changed: 0 additions & 84 deletions

@@ -415,90 +415,6 @@ struct policy_selector_from_types
 };
 } // namespace reduce

-namespace rfa
-{
-struct reduce_policy
-{
-  int block_threads;
-  int items_per_thread;
-  BlockReduceAlgorithm block_algorithm;
-};
-
-struct single_tile_policy
-{
-  int block_threads;
-  int items_per_thread;
-  BlockReduceAlgorithm block_algorithm;
-};
-
-struct rfa_policy
-{
-  reduce_policy reduce;
-  single_tile_policy single_tile;
-};
-
-struct policy_selector
-{
-  type_t accum_t;
-  int accum_size;
-
-  [[nodiscard]] _CCCL_API constexpr auto operator()(::cuda::arch_id arch) const -> rfa_policy
-  {
-    if (arch >= ::cuda::arch_id::sm_90)
-    {
-      // only tuned for float, fall through for other types
-      if (accum_t == type_t::float32)
-      {
-        // ipt_13.tpb_224 1.107188 1.009709 1.097114 1.316820
-        const auto scaled = scale_mem_bound(224, 13, accum_size);
-        return {{scaled.block_threads, scaled.items_per_thread, BLOCK_REDUCE_RAKING},
-                {scaled.block_threads, scaled.items_per_thread, BLOCK_REDUCE_RAKING}};
-      }
-    }
-
-    if (arch >= ::cuda::arch_id::sm_86)
-    {
-      // only tuned for float and double, fall through for other types
-      if (accum_t == type_t::float32)
-      {
-        // ipt_6.tpb_224 1.034383 1.000000 1.032097 1.090909
-        const auto scaled = scale_mem_bound(224, 6, accum_size);
-        return {{scaled.block_threads, scaled.items_per_thread, BLOCK_REDUCE_RAKING},
-                {scaled.block_threads, scaled.items_per_thread, BLOCK_REDUCE_RAKING}};
-      }
-      if (accum_t == type_t::float64)
-      {
-        // ipt_11.tpb_128 () 1.232089 1.002124 1.245336 1.582279
-        const auto scaled = scale_mem_bound(128, 11, accum_size);
-        return {{scaled.block_threads, scaled.items_per_thread, BLOCK_REDUCE_RAKING},
-                {scaled.block_threads, scaled.items_per_thread, BLOCK_REDUCE_RAKING}};
-      }
-    }
-
-    if (arch >= ::cuda::arch_id::sm_60)
-    {
-      const auto scaled = scale_mem_bound(256, 16, accum_size);
-      return {{scaled.block_threads, scaled.items_per_thread, BLOCK_REDUCE_RAKING},
-              {scaled.block_threads, scaled.items_per_thread, BLOCK_REDUCE_RAKING}};
-    }
-
-    const auto scaled = scale_mem_bound(256, 20, accum_size);
-    return {{scaled.block_threads, scaled.items_per_thread, BLOCK_REDUCE_RAKING},
-            {scaled.block_threads, scaled.items_per_thread, BLOCK_REDUCE_RAKING}};
-  }
-};
-
-// stateless version which can be passed to kernels
-template <typename AccumT>
-struct policy_selector_from_types
-{
-  [[nodiscard]] _CCCL_API constexpr auto operator()(::cuda::arch_id arch) const -> rfa_policy
-  {
-    return policy_selector{classify_type<AccumT>, int{sizeof(AccumT)}}(arch);
-  }
-};
-} // namespace rfa
-
 namespace fixed_size_segmented_reduce
 {
 template <typename AccumT, typename OffsetT, typename ReductionOpT>
cub/cub/device/dispatch/tuning/tuning_reduce_deterministic.cuh (new file)

Lines changed: 108 additions & 0 deletions

@@ -0,0 +1,108 @@
+// SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+// SPDX-License-Identifier: BSD-3
+
+#pragma once
+
+#include <cub/config.cuh>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cub/agent/agent_reduce.cuh>
+#include <cub/device/dispatch/tuning/common.cuh>
+#include <cub/util_arch.cuh>
+
+#include <cuda/__device/arch_id.h>
+
+CUB_NAMESPACE_BEGIN
+
+namespace detail::rfa
+{
+struct reduce_policy
+{
+  int block_threads;
+  int items_per_thread;
+  BlockReduceAlgorithm block_algorithm;
+};
+
+struct single_tile_policy
+{
+  int block_threads;
+  int items_per_thread;
+  BlockReduceAlgorithm block_algorithm;
+};
+
+struct rfa_policy
+{
+  reduce_policy reduce;
+  single_tile_policy single_tile;
+};
+
+struct policy_selector
+{
+  type_t accum_t;
+  int accum_size;
+
+  [[nodiscard]] _CCCL_API constexpr auto operator()(::cuda::arch_id arch) const -> rfa_policy
+  {
+    if (arch >= ::cuda::arch_id::sm_90)
+    {
+      // only tuned for float, fall through for other types
+      if (accum_t == type_t::float32)
+      {
+        // ipt_13.tpb_224 1.107188 1.009709 1.097114 1.316820
+        const auto scaled = scale_mem_bound(224, 13, accum_size);
+        return {{scaled.block_threads, scaled.items_per_thread, BLOCK_REDUCE_RAKING},
+                {scaled.block_threads, scaled.items_per_thread, BLOCK_REDUCE_RAKING}};
+      }
+    }
+
+    if (arch >= ::cuda::arch_id::sm_86)
+    {
+      // only tuned for float and double, fall through for other types
+      if (accum_t == type_t::float32)
+      {
+        // ipt_6.tpb_224 1.034383 1.000000 1.032097 1.090909
+        const auto scaled = scale_mem_bound(224, 6, accum_size);
+        return {{scaled.block_threads, scaled.items_per_thread, BLOCK_REDUCE_RAKING},
+                {scaled.block_threads, scaled.items_per_thread, BLOCK_REDUCE_RAKING}};
+      }
+      if (accum_t == type_t::float64)
+      {
+        // ipt_11.tpb_128 () 1.232089 1.002124 1.245336 1.582279
+        const auto scaled = scale_mem_bound(128, 11, accum_size);
+        return {{scaled.block_threads, scaled.items_per_thread, BLOCK_REDUCE_RAKING},
+                {scaled.block_threads, scaled.items_per_thread, BLOCK_REDUCE_RAKING}};
+      }
+    }
+
+    if (arch >= ::cuda::arch_id::sm_60)
+    {
+      const auto scaled = scale_mem_bound(256, 16, accum_size);
+      return {{scaled.block_threads, scaled.items_per_thread, BLOCK_REDUCE_RAKING},
+              {scaled.block_threads, scaled.items_per_thread, BLOCK_REDUCE_RAKING}};
+    }
+
+    const auto scaled = scale_mem_bound(256, 20, accum_size);
+    return {{scaled.block_threads, scaled.items_per_thread, BLOCK_REDUCE_RAKING},
+            {scaled.block_threads, scaled.items_per_thread, BLOCK_REDUCE_RAKING}};
+  }
+};
+
+// stateless version which can be passed to kernels
+template <typename AccumT>
+struct policy_selector_from_types
+{
+  [[nodiscard]] _CCCL_API constexpr auto operator()(::cuda::arch_id arch) const -> rfa_policy
+  {
+    return policy_selector{classify_type<AccumT>, int{sizeof(AccumT)}}(arch);
+  }
+};
+} // namespace detail::rfa
+
+CUB_NAMESPACE_END
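
For orientation: the moved selector picks a base (threads per block, items per thread) pair per architecture tier and accumulator type, then hands it to scale_mem_bound (from tuning/common.cuh) before building the rfa_policy. The standalone sketch below mirrors only that tiered fallthrough so it can be compiled outside CUB; it is not the library implementation — the tuning constants are copied from the diff above, and scale_mem_bound is stubbed as a pass-through placeholder rather than the real scaling heuristic.

// Standalone sketch of the tiered fallthrough in rfa::policy_selector (not the CUB implementation).
#include <cstdio>

enum class arch { sm_50 = 50, sm_60 = 60, sm_86 = 86, sm_90 = 90 };
enum class type_t { float32, float64, other };

struct tuning
{
  int block_threads;
  int items_per_thread;
};

// Placeholder: the real scale_mem_bound lives in tuning/common.cuh and adjusts the
// pair based on the accumulator size; here it simply passes the values through.
constexpr tuning scale_mem_bound(int block_threads, int items_per_thread, int /*accum_size*/)
{
  return {block_threads, items_per_thread};
}

constexpr tuning select(arch a, type_t t, int accum_size)
{
  // sm_90: only tuned for float; other types fall through to the sm_86 tier
  if (a >= arch::sm_90 && t == type_t::float32)
  {
    return scale_mem_bound(224, 13, accum_size);
  }
  // sm_86: tuned for float and double
  if (a >= arch::sm_86 && t == type_t::float32)
  {
    return scale_mem_bound(224, 6, accum_size);
  }
  if (a >= arch::sm_86 && t == type_t::float64)
  {
    return scale_mem_bound(128, 11, accum_size);
  }
  // generic fallbacks
  if (a >= arch::sm_60)
  {
    return scale_mem_bound(256, 16, accum_size);
  }
  return scale_mem_bound(256, 20, accum_size);
}

int main()
{
  constexpr tuning t = select(arch::sm_90, type_t::float32, static_cast<int>(sizeof(float)));
  std::printf("block_threads=%d, items_per_thread=%d\n", t.block_threads, t.items_per_thread);
}

As in the moved header, accumulator types without a dedicated tuning fall through to the next lower tier, so every type still receives a policy.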
