Skip to content

Commit 00858e5

Browse files
committed
Add ubench to test LRC merge size
1 parent 650220a commit 00858e5

2 files changed

Lines changed: 107 additions & 0 deletions

File tree

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
SRC = lrc_merge_size.cu
2+
3+
EXE = lrc_merge_size
4+
5+
# NVCC_FLGAS = -Xptxas -dlcm=cg -Xptxas -dscm=wt
6+
# LRC is supported on SM_90 and above
7+
ARCH?=sm_90a sm_100a sm_101 sm_120
8+
# Unset the CUDA_CPPFLAGS which is set based on CUDA version
9+
# but LRC is only supported on SM_90 and above
10+
CUDA_CPPFLAGS=
11+
# Generate code for both sm_XXX and compute_XXX (SASS and PTX)
12+
NVCC_FLAGS := $(foreach arch,$(ARCH),-gencode=arch=compute_$(subst sm_,,$(arch)),code=$(arch) -gencode=arch=compute_$(subst sm_,,$(arch)),code=compute_$(subst sm_,,$(arch))) -std=c++17 -O0
13+
14+
include ../../../common/common.mk
Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
1+
// LRC Merge Size Microbenchmark
2+
// Use mbarrier and threadblock cluster to ensure
3+
// best synchronization among warps sending request
4+
// to the same L2 sector
5+
6+
#include <assert.h>
7+
#include <bits/getopt_core.h>
8+
#include <cstdint>
9+
#include <stdio.h>
10+
#include <stdlib.h>
11+
#include <string.h>
12+
#include "../../../hw_def/common/gpuConfig.h"
13+
#include "../../../hw_def/hw_def.h"
14+
15+
#include <cuda.h>
16+
#include <cuda/ptx>
17+
#include <cooperative_groups.h>
18+
namespace cg = cooperative_groups;
19+
20+
/**
21+
* @brief Kernel to test LRC merge size being sector or cacheline
22+
*
23+
* @param data Same sector data for all warps to access for LRC merge
24+
* @param dsink Global sink to avoid optimization
25+
* @return __global__
26+
*/
27+
__global__ void __cluster_dims__(4, 1, 1) lrc_merge_size_kernel(uint8_t *data, uint8_t *dsink) {
28+
// The LRC merge size should be reflected from NCU metrics
29+
30+
// Shmem buffer
31+
__shared__ volatile uint8_t smem_buffer[16];
32+
33+
// Get thread index within a cluster
34+
cg::cluster_group cluster = cg::this_cluster();
35+
unsigned block_rank = cluster.block_rank();
36+
37+
38+
/**
39+
* LRC merge size test
40+
*/
41+
// Base on the block rank in a cluster, each block have one warp to access the data + i
42+
// with i being the block rank
43+
// Only one sector access per warp
44+
if (threadIdx.x % 32 == 0) {
45+
uint64_t data_value;
46+
// As sector size is 32B, so we access at 32B stride
47+
uint8_t *ptr = data + block_rank * 32;
48+
asm volatile("{\t\n"
49+
"ld.global.cg.u64 %0, [%1];\n\t"
50+
"}"
51+
: "=l"(data_value)
52+
: "l"(ptr));
53+
smem_buffer[0] = (uint8_t)data_value;
54+
}
55+
__syncthreads();
56+
57+
// Write to global sink to prevent optimization
58+
dsink[0] = smem_buffer[0];
59+
}
60+
61+
int main(int argc, char *argv[]) {
62+
initializeDeviceProp(0, argc, argv);
63+
64+
// Initialize the data and global sink with single value
65+
uint8_t *data_g;
66+
const unsigned SECTOR_SIZE = 32;
67+
const unsigned CACHELINE_SIZE = 128;
68+
69+
// cudaMalloc is aligned to 256B, so this array is in a single cacheline
70+
gpuErrchk(cudaMalloc(&data_g, sizeof(uint8_t) * CACHELINE_SIZE));
71+
gpuErrchk(cudaMemset(data_g, 0, sizeof(uint8_t) * CACHELINE_SIZE));
72+
uint8_t *dsink_g;
73+
gpuErrchk(cudaMalloc(&dsink_g, sizeof(uint8_t)));
74+
gpuErrchk(cudaMemset(dsink_g, 0, sizeof(uint8_t)));
75+
76+
printf("=== LRC Merge Size ===\n");
77+
printf("Profile with ncu to measure LRC merge size.\n");
78+
79+
printf("Launching threadblocks normally...\n");
80+
// 2 clusters, 8 threadblocks, each TB has 1 warp, each warp has 32 threads
81+
lrc_merge_size_kernel<<<8, 32>>>(data_g, dsink_g);
82+
83+
gpuErrchk(cudaPeekAtLastError());
84+
gpuErrchk(cudaDeviceSynchronize());
85+
86+
printf("Kernel completed. Use ncu to analyze LRC merge size.\n");
87+
88+
// Cleanup
89+
cudaFree(data_g);
90+
cudaFree(dsink_g);
91+
92+
return 0;
93+
}

0 commit comments

Comments
 (0)