
Commit 797e34a

Move tuning header
1 parent: 38a781c

File tree: 4 files changed (+110 lines, -84 lines)


cub/cub/device/dispatch/dispatch_reduce_deterministic.cuh

Lines changed: 1 addition & 0 deletions
@@ -22,6 +22,7 @@
 #include <cub/agent/agent_reduce.cuh>
 #include <cub/detail/rfa.cuh>
 #include <cub/device/dispatch/dispatch_reduce.cuh>
+#include <cub/device/dispatch/tuning/tuning_reduce_deterministic.cuh>
 #include <cub/grid/grid_even_share.cuh>
 #include <cub/iterator/arg_index_input_iterator.cuh>
 #include <cub/thread/thread_operators.cuh>

cub/cub/device/dispatch/kernels/kernel_reduce.cuh

Lines changed: 1 addition & 0 deletions
@@ -16,6 +16,7 @@
 #include <cub/agent/agent_reduce.cuh>
 #include <cub/detail/rfa.cuh>
 #include <cub/device/dispatch/tuning/tuning_reduce.cuh>
+#include <cub/device/dispatch/tuning/tuning_reduce_deterministic.cuh>
 #include <cub/grid/grid_even_share.cuh>

 #include <thrust/type_traits/unwrap_contiguous_iterator.h>

cub/cub/device/dispatch/tuning/tuning_reduce.cuh

Lines changed: 0 additions & 84 deletions
@@ -464,90 +464,6 @@ struct policy_selector_from_types
 };
 } // namespace reduce

-namespace rfa
-{
-struct reduce_policy
-{
-  int block_threads;
-  int items_per_thread;
-  BlockReduceAlgorithm block_algorithm;
-};
-
-struct single_tile_policy
-{
-  int block_threads;
-  int items_per_thread;
-  BlockReduceAlgorithm block_algorithm;
-};
-
-struct rfa_policy
-{
-  reduce_policy reduce;
-  single_tile_policy single_tile;
-};
-
-struct policy_selector
-{
-  type_t accum_t;
-  int accum_size;
-
-  [[nodiscard]] _CCCL_API constexpr auto operator()(::cuda::arch_id arch) const -> rfa_policy
-  {
-    if (arch >= ::cuda::arch_id::sm_90)
-    {
-      // only tuned for float, fall through for other types
-      if (accum_t == type_t::float32)
-      {
-        // ipt_13.tpb_224 1.107188 1.009709 1.097114 1.316820
-        const auto scaled = scale_mem_bound(224, 13, accum_size);
-        return {{scaled.block_threads, scaled.items_per_thread, BLOCK_REDUCE_RAKING},
-                {scaled.block_threads, scaled.items_per_thread, BLOCK_REDUCE_RAKING}};
-      }
-    }
-
-    if (arch >= ::cuda::arch_id::sm_86)
-    {
-      // only tuned for float and double, fall through for other types
-      if (accum_t == type_t::float32)
-      {
-        // ipt_6.tpb_224 1.034383 1.000000 1.032097 1.090909
-        const auto scaled = scale_mem_bound(224, 6, accum_size);
-        return {{scaled.block_threads, scaled.items_per_thread, BLOCK_REDUCE_RAKING},
-                {scaled.block_threads, scaled.items_per_thread, BLOCK_REDUCE_RAKING}};
-      }
-      if (accum_t == type_t::float64)
-      {
-        // ipt_11.tpb_128 () 1.232089 1.002124 1.245336 1.582279
-        const auto scaled = scale_mem_bound(128, 11, accum_size);
-        return {{scaled.block_threads, scaled.items_per_thread, BLOCK_REDUCE_RAKING},
-                {scaled.block_threads, scaled.items_per_thread, BLOCK_REDUCE_RAKING}};
-      }
-    }
-
-    if (arch >= ::cuda::arch_id::sm_60)
-    {
-      const auto scaled = scale_mem_bound(256, 16, accum_size);
-      return {{scaled.block_threads, scaled.items_per_thread, BLOCK_REDUCE_RAKING},
-              {scaled.block_threads, scaled.items_per_thread, BLOCK_REDUCE_RAKING}};
-    }
-
-    const auto scaled = scale_mem_bound(256, 20, accum_size);
-    return {{scaled.block_threads, scaled.items_per_thread, BLOCK_REDUCE_RAKING},
-            {scaled.block_threads, scaled.items_per_thread, BLOCK_REDUCE_RAKING}};
-  }
-};
-
-// stateless version which can be passed to kernels
-template <typename AccumT>
-struct policy_selector_from_types
-{
-  [[nodiscard]] _CCCL_API constexpr auto operator()(::cuda::arch_id arch) const -> rfa_policy
-  {
-    return policy_selector{classify_type<AccumT>, int{sizeof(AccumT)}}(arch);
-  }
-};
-} // namespace rfa
-
 namespace fixed_size_segmented_reduce
 {
 template <typename AccumT, typename OffsetT, typename ReductionOpT>
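
Note on the tuning logic above: scale_mem_bound is not defined in this diff; it presumably comes from tuning/common.cuh, which the moved code depends on. As a purely hypothetical illustration (the helper name scale_for_accum_size and the 4-byte baseline are assumptions, not CUB's actual implementation), a memory-bound scaling rule of this kind typically reduces items_per_thread for wider accumulators so that a tile tuned for float keeps roughly the same byte footprint:

// Hypothetical sketch only; scale_for_accum_size is an invented name and the
// 4-byte baseline is an assumption. This is not CUB's scale_mem_bound.
struct scaled_tuning
{
  int block_threads;
  int items_per_thread;
};

// Scale a (block_threads, items_per_thread) pair measured for a 4-byte type
// so that a larger accumulator still moves a similar number of bytes per tile.
constexpr scaled_tuning scale_for_accum_size(int block_threads, int items_per_thread, int accum_size)
{
  const int ipt = items_per_thread * 4 / accum_size;
  return {block_threads, ipt < 1 ? 1 : ipt};
}

// Under this assumed rule, the 224x13 float tuning becomes 224x6 for double.
static_assert(scale_for_accum_size(224, 13, 8).items_per_thread == 6, "13 float items scale to 6 double items");

Under this assumed rule, the SM90 float tuning of 224 threads by 13 items would scale to 224 by 6 for a double accumulator; the real scale_mem_bound may differ.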
cub/cub/device/dispatch/tuning/tuning_reduce_deterministic.cuh (new file)

Lines changed: 108 additions & 0 deletions
@@ -0,0 +1,108 @@
+// SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+// SPDX-License-Identifier: BSD-3
+
+#pragma once
+
+#include <cub/config.cuh>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+# pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+# pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+# pragma system_header
+#endif // no system header
+
+#include <cub/agent/agent_reduce.cuh>
+#include <cub/device/dispatch/tuning/common.cuh>
+#include <cub/util_arch.cuh>
+
+#include <cuda/__device/arch_id.h>
+
+CUB_NAMESPACE_BEGIN
+
+namespace detail::rfa
+{
+struct reduce_policy
+{
+  int block_threads;
+  int items_per_thread;
+  BlockReduceAlgorithm block_algorithm;
+};
+
+struct single_tile_policy
+{
+  int block_threads;
+  int items_per_thread;
+  BlockReduceAlgorithm block_algorithm;
+};
+
+struct rfa_policy
+{
+  reduce_policy reduce;
+  single_tile_policy single_tile;
+};
+
+struct policy_selector
+{
+  type_t accum_t;
+  int accum_size;
+
+  [[nodiscard]] _CCCL_API constexpr auto operator()(::cuda::arch_id arch) const -> rfa_policy
+  {
+    if (arch >= ::cuda::arch_id::sm_90)
+    {
+      // only tuned for float, fall through for other types
+      if (accum_t == type_t::float32)
+      {
+        // ipt_13.tpb_224 1.107188 1.009709 1.097114 1.316820
+        const auto scaled = scale_mem_bound(224, 13, accum_size);
+        return {{scaled.block_threads, scaled.items_per_thread, BLOCK_REDUCE_RAKING},
+                {scaled.block_threads, scaled.items_per_thread, BLOCK_REDUCE_RAKING}};
+      }
+    }
+
+    if (arch >= ::cuda::arch_id::sm_86)
+    {
+      // only tuned for float and double, fall through for other types
+      if (accum_t == type_t::float32)
+      {
+        // ipt_6.tpb_224 1.034383 1.000000 1.032097 1.090909
+        const auto scaled = scale_mem_bound(224, 6, accum_size);
+        return {{scaled.block_threads, scaled.items_per_thread, BLOCK_REDUCE_RAKING},
+                {scaled.block_threads, scaled.items_per_thread, BLOCK_REDUCE_RAKING}};
+      }
+      if (accum_t == type_t::float64)
+      {
+        // ipt_11.tpb_128 () 1.232089 1.002124 1.245336 1.582279
+        const auto scaled = scale_mem_bound(128, 11, accum_size);
+        return {{scaled.block_threads, scaled.items_per_thread, BLOCK_REDUCE_RAKING},
+                {scaled.block_threads, scaled.items_per_thread, BLOCK_REDUCE_RAKING}};
+      }
+    }
+
+    if (arch >= ::cuda::arch_id::sm_60)
+    {
+      const auto scaled = scale_mem_bound(256, 16, accum_size);
+      return {{scaled.block_threads, scaled.items_per_thread, BLOCK_REDUCE_RAKING},
+              {scaled.block_threads, scaled.items_per_thread, BLOCK_REDUCE_RAKING}};
+    }
+
+    const auto scaled = scale_mem_bound(256, 20, accum_size);
+    return {{scaled.block_threads, scaled.items_per_thread, BLOCK_REDUCE_RAKING},
+            {scaled.block_threads, scaled.items_per_thread, BLOCK_REDUCE_RAKING}};
+  }
+};
+
+// stateless version which can be passed to kernels
+template <typename AccumT>
+struct policy_selector_from_types
+{
+  [[nodiscard]] _CCCL_API constexpr auto operator()(::cuda::arch_id arch) const -> rfa_policy
+  {
+    return policy_selector{classify_type<AccumT>, int{sizeof(AccumT)}}(arch);
+  }
+};
+} // namespace detail::rfa
+
+CUB_NAMESPACE_END
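
Once the new header is on the include path, the moved selector can be queried directly from host code. A minimal sketch, assuming a CUDA/CCCL toolchain where CUB_NAMESPACE_BEGIN opens the cub namespace, and using SM90 with a float accumulator purely as an example:

// Minimal usage sketch, not part of this commit; assumes a CUDA/CCCL build.
#include <cub/device/dispatch/tuning/tuning_reduce_deterministic.cuh>

#include <cstdio>

int main()
{
  // Ask the stateless selector which RFA reduction tile shape is tuned for
  // float accumulators on SM90; other types fall through to the generic policy.
  const auto policy = cub::detail::rfa::policy_selector_from_types<float>{}(::cuda::arch_id::sm_90);
  std::printf("reduce: %d threads x %d items per thread\n",
              policy.reduce.block_threads,
              policy.reduce.items_per_thread);
  return 0;
}

Because operator() is constexpr and the functor carries no state, the "stateless version which can be passed to kernels" comment in the header suggests the dispatch layer can evaluate the same query at compile time when instantiating kernels.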
