Commit b0e3135
[cuda.coop]: add device-side coop.warp.sum benchmark with pynvbench (#6846)
* Add cuda-bench to benchmark dependencies
1 parent 0231796 commit b0e3135

File tree

4 files changed, +256 -0 lines changed
Lines changed: 17 additions & 0 deletions
@@ -0,0 +1,17 @@
We benchmark block- and warp-level algorithms with a kernel that does
nothing else. The challenge is keeping the compiler from optimizing
away the work while ensuring the algorithm dominates runtime.

To avoid memory traffic, we generate input data from `clock()`. This
keeps the values unknown at compile time without paying the cost of
global loads. We also avoid constants, which would enable further
optimization and skew results away from realistic workloads.

To make the algorithm dominate execution time, we call it in an
unrolled loop. To prevent the compiler from collapsing identical calls,
each iteration depends on the previous one: the output of one call
becomes the input to the next.

Finally, we introduce a side effect so the compiler must keep the code.
We do this with a write behind a condition that is never true, avoiding
the cost of an actual store while still preventing dead-code removal.
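
Putting the three tricks together, the resulting kernel has roughly the following shape (a minimal runnable Numba CUDA sketch; the seed array, the `x * 3 + 1` stand-in algorithm, and the block-index guard are illustrative assumptions, whereas the real implementation below reads `%clock` and `%smid` via inline PTX instead):

import numpy as np
from numba import cuda

UNROLL = 8  # the real benchmark unrolls far more aggressively

@cuda.jit
def pattern_kernel(seed, sink_buffer):
    # Input comes from a runtime value, so it is unknown at compile time.
    data = seed[0]
    # Each call consumes the previous result, so the UNROLL identical
    # calls form a dependency chain and cannot be collapsed into one.
    for _ in range(UNROLL):
        data = data * 3 + 1  # stand-in for the benchmarked algorithm
    # A guard that is never true at runtime keeps `data` observable
    # without paying for an actual store.
    if cuda.blockIdx.x == 0xFFFFFFFF:
        sink_buffer[0] = data

seed = cuda.to_device(np.array([42], dtype=np.int64))
sink = cuda.device_array(1, dtype=np.int64)
pattern_kernel[1, 32](seed, sink)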
Lines changed: 68 additions & 0 deletions
@@ -0,0 +1,68 @@
# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
#
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

import os
import sys

import numba
import numpy as np
from numba import cuda

import cuda.bench as bench

sys.path.insert(0, os.path.abspath(os.path.dirname(__file__)))
from device_side_benchmark import (  # isort: skip # type: ignore[import-not-found] # noqa: E402
    make_unrolled_kernel,
    get_grid_size,
)

numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0


def bench_warp_reduce(state: bench.State):
    dtype_str = state.get_string("T{ct}")
    algorithm_str = state.get_string("Algorithm{ct}")

    if algorithm_str == "warp_min" and dtype_str == "F16":
        state.skip("warp_min requires a custom op, which is not supported for F16")
        return

    types_map = {
        "I8": np.int8,
        "I16": np.int16,
        "I32": np.int32,
        "I64": np.int64,
        "F16": np.float16,
        "F32": np.float32,
        "F64": np.float64,
    }

    dtype = types_map[dtype_str]

    numba_dtype = numba.from_dtype(dtype)
    block_size = 256
    unroll_factor = 128

    benchmark_kernel = make_unrolled_kernel(
        block_size, algorithm_str, unroll_factor, numba_dtype
    )

    sink_buffer = cuda.device_array(16, dtype=np.int32)

    # This calls the kernel (and then immediately synchronizes the device) to
    # force compilation so we can extract occupancy info.
    grid_size = get_grid_size(
        state.get_device(), block_size, benchmark_kernel, sink_buffer
    )

    def launcher(_: bench.Launch):
        benchmark_kernel[grid_size, block_size](sink_buffer)

    state.exec(launcher, batched=False)


if __name__ == "__main__":
    b = bench.register(bench_warp_reduce)
    b.add_string_axis("T{ct}", ["I8", "I16", "I32", "I64", "F16", "F32", "F64"])
    b.add_string_axis("Algorithm{ct}", ["warp_sum", "warp_min"])
    bench.run_all_benchmarks(sys.argv)
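
The script is self-registering, so (assuming it is saved next to device_side_benchmark.py, the helper module it imports) it can be run directly with `python <script>.py`; as the last line shows, `sys.argv` is handed to `bench.run_all_benchmarks`, so command-line arguments are forwarded to the benchmark runner.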
Lines changed: 169 additions & 0 deletions
@@ -0,0 +1,169 @@
# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
#
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

import math

from llvmlite import ir
from numba import cuda, types
from numba.core.extending import intrinsic

import cuda.bindings.driver as driver
from cuda import coop
from cuda.core import Device


@intrinsic
def get_smid(typingctx):
    """Read the %smid special register (SM ID)."""
    sig = types.uint32()

    def codegen(context, builder, signature, args):
        ftype = ir.FunctionType(ir.IntType(32), [])
        asm_ir = ir.InlineAsm(ftype, "mov.u32 $0, %smid;", "=r", side_effect=True)
        return builder.call(asm_ir, [])

    return sig, codegen


def make_unrolled_kernel(block_size, algorithm_name, unroll_factor, numba_dtype):
    """Generate a kernel with a manually unrolled loop."""

    @intrinsic
    def generate_random_data(typingctx, dtype_type):
        """
        Generate random data of the specified type using the local array + memcpy pattern.

        Equivalent to C++:
            uint32_t data[sizeof(T) / sizeof(uint32_t)];
            for (...) data[i] = clock();
            T ret;
            memcpy(&ret, data, sizeof(T));
            return ret;

        Usage: generate_random_data(numba.float64)
        """
        target_type = dtype_type.dtype  # Extract the actual type from Type[T]

        def codegen(context, builder, signature, args):
            # Get LLVM type info
            target_llvm = context.get_value_type(target_type)
            size_bytes = target_llvm.get_abi_size(context.target_data)
            num_u32s = math.ceil(size_bytes / 4)

            # 1. Allocate local array: uint32_t data[num_u32s]
            u32_type = ir.IntType(32)
            array_type = ir.ArrayType(u32_type, num_u32s)
            data_ptr = builder.alloca(array_type, name="data")

            # 2. Fill array with clock values
            # Clock read inline asm
            asm_ftype = ir.FunctionType(ir.IntType(32), [])
            asm_ir = ir.InlineAsm(
                asm_ftype, "mov.u32 $0, %clock;", "=r", side_effect=True
            )

            for i in range(num_u32s):
                clock_val = builder.call(asm_ir, [])
                # GEP to get &data[i]
                elem_ptr = builder.gep(
                    data_ptr,
                    [ir.Constant(ir.IntType(32), 0), ir.Constant(ir.IntType(32), i)],
                )
                builder.store(clock_val, elem_ptr)

            # 3. Allocate result: T ret
            ret_ptr = builder.alloca(target_llvm, name="ret")

            # 4. memcpy(&ret, data, sizeof(T))
            # Cast both pointers to i8* for memcpy
            i8_ptr_type = ir.PointerType(ir.IntType(8))
            dest = builder.bitcast(ret_ptr, i8_ptr_type)
            src = builder.bitcast(data_ptr, i8_ptr_type)

            # Call LLVM memcpy intrinsic
            memcpy_fn = builder.module.declare_intrinsic(
                "llvm.memcpy", [i8_ptr_type, i8_ptr_type, ir.IntType(64)]
            )
            builder.call(
                memcpy_fn,
                [
                    dest,
                    src,
                    ir.Constant(ir.IntType(64), size_bytes),
                    # not volatile, meaning the copy may be optimized away
                    ir.Constant(ir.IntType(1), 0),
                ],
            )

            # 5. return ret
            return builder.load(ret_ptr)

        sig = target_type(dtype_type)
        return sig, codegen

    @cuda.jit(device=True)
    def sink(value, sink_buffer):
        """Prevent dead code elimination. Condition is always false."""
        if get_smid() == 0xFFFFFFFF:
            sink_buffer[0] = value

    # Generate unrolled code as a string
    unrolled_body = "\n    ".join(
        f"data = {algorithm_name}(data)  # iteration {i}" for i in range(unroll_factor)
    )

    kernel_code = f"""
@cuda.jit(link={algorithm_name}.files, launch_bounds={block_size})
def benchmark_kernel(sink_buffer):
    data = generate_random_data(target_dtype)

    # Manually unrolled {unroll_factor} iterations:
    {unrolled_body}

    sink(data, sink_buffer)
"""

    if algorithm_name == "warp_sum":
        algorithm = coop.warp.sum(numba_dtype)
    elif algorithm_name == "warp_min":

        def min_op(a, b):
            return a if a < b else b

        algorithm = coop.warp.reduce(numba_dtype, min_op)

    # Create local namespace with required functions
    local_ns = {
        "cuda": cuda,
        algorithm_name: algorithm,
        "generate_random_data": generate_random_data,
        "get_smid": get_smid,
        "sink": sink,
        "target_dtype": numba_dtype,
    }

    exec(kernel_code, local_ns)
    return local_ns["benchmark_kernel"]


def get_grid_size(device_id, block_size, kernel, sink_buffer):
    """Get the grid size for the given kernel and block size."""

    # Warmup launch to force compilation so we can extract occupancy info
    kernel[1, block_size](sink_buffer)

    device = Device(device_id)
    device.sync()
    num_SMs = device.properties.multiprocessor_count

    sig = kernel.signatures[0]
    cufunc = kernel.overloads[sig].library.get_cufunc()

    err, max_blocks_per_sm = driver.cuOccupancyMaxActiveBlocksPerMultiprocessor(
        cufunc.handle, block_size, 0
    )
    if err != driver.CUresult.CUDA_SUCCESS:
        raise RuntimeError(f"Failed to get occupancy info: {err}")

    return max_blocks_per_sm * num_SMs
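
To put the grid-size formula in concrete terms (numbers hypothetical): if the occupancy API reports 6 resident blocks per SM for this kernel at block_size 256 on a device with 108 SMs, the benchmark launches 6 * 108 = 648 blocks, i.e. just enough to fill every SM to its maximum occupancy without oversubscribing the device.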

python/cuda_cccl/pyproject.toml

Lines changed: 2 additions & 0 deletions
@@ -64,6 +64,8 @@ test-cu13 = [
     "cupy-cuda13x",
     "pytest-benchmark",
 ]
+bench-cu12 = ["cuda-cccl[cu12]", "cuda-bench[cu12]"]
+bench-cu13 = ["cuda-cccl[cu13]", "cuda-bench[cu13]"]

 [project.urls]
 Homepage = "https://github.com/NVIDIA/cccl"
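
With these extras defined, the benchmark dependencies can be pulled in through pip's standard extras syntax, for example from a repository checkout (command illustrative; CUDA 12 shown):

pip install -e "python/cuda_cccl[bench-cu12]"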
