Commit b0e3135
[cuda.coop]: add device-side coop.warp.sum benchmark with pynvbench (#6846)
* Add cuda-bench to benchmark dependencies
1 parent 0231796 commit b0e3135

File tree

4 files changed, +256 -0 lines changed
Lines changed: 17 additions & 0 deletions
@@ -0,0 +1,17 @@
We benchmark block- and warp-level algorithms with a kernel that does
nothing else. The challenge is keeping the compiler from optimizing
away the work while ensuring the algorithm dominates runtime.

To avoid memory traffic, we generate input data from `clock()`. This
keeps the values unknown at compile time without paying the cost of
global loads. We also avoid constants, which would enable further
optimization and skew results away from realistic workloads.

To make the algorithm dominate execution time, we call it in an
unrolled loop. To prevent the compiler from collapsing identical calls,
each iteration depends on the previous one: the output of one call
becomes the input to the next.

Finally, we introduce a side effect so the compiler must keep the code.
We do this with a write behind a condition that is never true, avoiding
the cost of an actual store while still preventing dead-code removal.
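
Putting the three tricks together, the resulting kernel has roughly the following shape (a minimal runnable Numba CUDA sketch; the seed array, the `x * 3 + 1` stand-in algorithm, and the block-index guard are illustrative assumptions, whereas the real implementation below reads `%clock` and `%smid` via inline PTX instead):

import numpy as np
from numba import cuda

UNROLL = 8  # the real benchmark unrolls far more aggressively

@cuda.jit
def pattern_kernel(seed, sink_buffer):
    # Input comes from a runtime value, so it is unknown at compile time.
    data = seed[0]
    # Each call consumes the previous result, so the UNROLL identical
    # calls form a dependency chain and cannot be collapsed into one.
    for _ in range(UNROLL):
        data = data * 3 + 1  # stand-in for the benchmarked algorithm
    # A guard that is never true at runtime keeps `data` observable
    # without paying for an actual store.
    if cuda.blockIdx.x == 0xFFFFFFFF:
        sink_buffer[0] = data

seed = cuda.to_device(np.array([42], dtype=np.int64))
sink = cuda.device_array(1, dtype=np.int64)
pattern_kernel[1, 32](seed, sink)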
Lines changed: 68 additions & 0 deletions
@@ -0,0 +1,68 @@
# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
#
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

import os
import sys

import numba
import numpy as np
from numba import cuda

import cuda.bench as bench

sys.path.insert(0, os.path.abspath(os.path.dirname(__file__)))
from device_side_benchmark import (  # isort: skip # type: ignore[import-not-found] # noqa: E402
    make_unrolled_kernel,
    get_grid_size,
)

numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0


def bench_warp_reduce(state: bench.State):
    dtype_str = state.get_string("T{ct}")
    algorithm_str = state.get_string("Algorithm{ct}")

    if algorithm_str == "warp_min" and dtype_str == "F16":
        state.skip("warp_min requires a custom op, which is not supported for F16")
        return

    types_map = {
        "I8": np.int8,
        "I16": np.int16,
        "I32": np.int32,
        "I64": np.int64,
        "F16": np.float16,
        "F32": np.float32,
        "F64": np.float64,
    }

    dtype = types_map[dtype_str]

    numba_dtype = numba.from_dtype(dtype)
    block_size = 256
    unroll_factor = 128

    benchmark_kernel = make_unrolled_kernel(
        block_size, algorithm_str, unroll_factor, numba_dtype
    )

    sink_buffer = cuda.device_array(16, dtype=np.int32)

    # This calls the kernel (and then immediately synchronizes the device) to
    # force compilation so we can extract occupancy info.
    grid_size = get_grid_size(
        state.get_device(), block_size, benchmark_kernel, sink_buffer
    )

    def launcher(_: bench.Launch):
        benchmark_kernel[grid_size, block_size](sink_buffer)

    state.exec(launcher, batched=False)


if __name__ == "__main__":
    b = bench.register(bench_warp_reduce)
    b.add_string_axis("T{ct}", ["I8", "I16", "I32", "I64", "F16", "F32", "F64"])
    b.add_string_axis("Algorithm{ct}", ["warp_sum", "warp_min"])
    bench.run_all_benchmarks(sys.argv)
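
The script is self-registering, so (assuming it is saved next to device_side_benchmark.py, the helper module it imports) it can be run directly with `python <script>.py`; as the last line shows, `sys.argv` is handed to `bench.run_all_benchmarks`, so command-line arguments are forwarded to the benchmark runner.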
Lines changed: 169 additions & 0 deletions
@@ -0,0 +1,169 @@
# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
#
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

import math

from llvmlite import ir
from numba import cuda, types
from numba.core.extending import intrinsic

import cuda.bindings.driver as driver
from cuda import coop
from cuda.core import Device


@intrinsic
def get_smid(typingctx):
    """Read the %smid special register (SM ID)."""
    sig = types.uint32()

    def codegen(context, builder, signature, args):
        ftype = ir.FunctionType(ir.IntType(32), [])
        asm_ir = ir.InlineAsm(ftype, "mov.u32 $0, %smid;", "=r", side_effect=True)
        return builder.call(asm_ir, [])

    return sig, codegen


def make_unrolled_kernel(block_size, algorithm_name, unroll_factor, numba_dtype):
    """Generate a kernel with a manually unrolled loop."""

    @intrinsic
    def generate_random_data(typingctx, dtype_type):
        """
        Generate random data of the specified type using the local array + memcpy pattern.

        Equivalent to C++:
            uint32_t data[sizeof(T) / sizeof(uint32_t)];
            for (...) data[i] = clock();
            T ret;
            memcpy(&ret, data, sizeof(T));
            return ret;

        Usage: generate_random_data(numba.float64)
        """
        target_type = dtype_type.dtype  # Extract the actual type from Type[T]

        def codegen(context, builder, signature, args):
            # Get LLVM type info
            target_llvm = context.get_value_type(target_type)
            size_bytes = target_llvm.get_abi_size(context.target_data)
            num_u32s = math.ceil(size_bytes / 4)

            # 1. Allocate local array: uint32_t data[num_u32s]
            u32_type = ir.IntType(32)
            array_type = ir.ArrayType(u32_type, num_u32s)
            data_ptr = builder.alloca(array_type, name="data")

            # 2. Fill array with clock values
            # Clock read inline asm
            asm_ftype = ir.FunctionType(ir.IntType(32), [])
            asm_ir = ir.InlineAsm(
                asm_ftype, "mov.u32 $0, %clock;", "=r", side_effect=True
            )

            for i in range(num_u32s):
                clock_val = builder.call(asm_ir, [])
                # GEP to get &data[i]
                elem_ptr = builder.gep(
                    data_ptr,
                    [ir.Constant(ir.IntType(32), 0), ir.Constant(ir.IntType(32), i)],
                )
                builder.store(clock_val, elem_ptr)

            # 3. Allocate result: T ret
            ret_ptr = builder.alloca(target_llvm, name="ret")

            # 4. memcpy(&ret, data, sizeof(T))
            # Cast both pointers to i8* for memcpy
            i8_ptr_type = ir.PointerType(ir.IntType(8))
            dest = builder.bitcast(ret_ptr, i8_ptr_type)
            src = builder.bitcast(data_ptr, i8_ptr_type)

            # Call LLVM memcpy intrinsic
            memcpy_fn = builder.module.declare_intrinsic(
                "llvm.memcpy", [i8_ptr_type, i8_ptr_type, ir.IntType(64)]
            )
            builder.call(
                memcpy_fn,
                [
                    dest,
                    src,
                    ir.Constant(ir.IntType(64), size_bytes),
                    # not volatile, meaning the copy may be optimized away
                    ir.Constant(ir.IntType(1), 0),
                ],
            )

            # 5. return ret
            return builder.load(ret_ptr)

        sig = target_type(dtype_type)
        return sig, codegen

    @cuda.jit(device=True)
    def sink(value, sink_buffer):
        """Prevent dead code elimination. Condition is always false."""
        if get_smid() == 0xFFFFFFFF:
            sink_buffer[0] = value

    # Generate unrolled code as a string
    unrolled_body = "\n    ".join(
        f"data = {algorithm_name}(data)  # iteration {i}" for i in range(unroll_factor)
    )

    kernel_code = f"""
@cuda.jit(link={algorithm_name}.files, launch_bounds={block_size})
def benchmark_kernel(sink_buffer):
    data = generate_random_data(target_dtype)

    # Manually unrolled {unroll_factor} iterations:
    {unrolled_body}

    sink(data, sink_buffer)
"""

    if algorithm_name == "warp_sum":
        algorithm = coop.warp.sum(numba_dtype)
    elif algorithm_name == "warp_min":

        def min_op(a, b):
            return a if a < b else b

        algorithm = coop.warp.reduce(numba_dtype, min_op)

    # Create local namespace with required functions
    local_ns = {
        "cuda": cuda,
        algorithm_name: algorithm,
        "generate_random_data": generate_random_data,
        "get_smid": get_smid,
        "sink": sink,
        "target_dtype": numba_dtype,
    }

    exec(kernel_code, local_ns)
    return local_ns["benchmark_kernel"]


def get_grid_size(device_id, block_size, kernel, sink_buffer):
    """Get the grid size for the given kernel and block size."""

    # Warmup launch to force compilation so we can extract occupancy info
    kernel[1, block_size](sink_buffer)

    device = Device(device_id)
    device.sync()
    num_SMs = device.properties.multiprocessor_count

    sig = kernel.signatures[0]
    cufunc = kernel.overloads[sig].library.get_cufunc()

    err, max_blocks_per_sm = driver.cuOccupancyMaxActiveBlocksPerMultiprocessor(
        cufunc.handle, block_size, 0
    )
    if err != driver.CUresult.CUDA_SUCCESS:
        raise RuntimeError(f"Failed to get occupancy info: {err}")

    return max_blocks_per_sm * num_SMs
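
To put the grid-size formula in concrete terms (numbers hypothetical): if the occupancy API reports 6 resident blocks per SM for this kernel at block_size 256 on a device with 108 SMs, the benchmark launches 6 * 108 = 648 blocks, i.e. just enough to fill every SM to its maximum occupancy without oversubscribing the device.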

python/cuda_cccl/pyproject.toml

Lines changed: 2 additions & 0 deletions
@@ -64,6 +64,8 @@ test-cu13 = [
     "cupy-cuda13x",
     "pytest-benchmark",
 ]
+bench-cu12 = ["cuda-cccl[cu12]", "cuda-bench[cu12]"]
+bench-cu13 = ["cuda-cccl[cu13]", "cuda-bench[cu13]"]

 [project.urls]
 Homepage = "https://github.com/NVIDIA/cccl"
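
With these extras defined, the benchmark dependencies can be pulled in through pip's standard extras syntax, for example from a repository checkout (command illustrative; CUDA 12 shown):

pip install -e "python/cuda_cccl[bench-cu12]"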
