Skip to content

Commit c511dd7

Browse files
William-AnJRPan
andauthored
Add TMA unittest app (#58)
* Add tma unittest * add regular load to TMA benchmark * make the regular load to have same access pattern as TMA load * avoid compiler optimization * move cuda mempcy to be before kernel launch * add iteration count for tma ubench * minor formatting * move tma to ubench folder * make setup script works with zsh * fix the issue that ubench all return 1 even without issue * add a sample test kernel for mbarrier PTX mapping to SASS * update gitignore * add gmma kernels for latency measurement * increase iter to 1024 * add missed kernels * add maxflops for gmma * update block size * update prints for MaxFlops_gmma * fix a bug * fix include after updating it * fix for cpp and c source * fix compile * fix for pattern matching * fix compilation for mbarrier * Fix makefile for tma app * generate SASS and PTX for TMA and GMMA workloads * update makefile to force PTX to be embedded in final fat bin * change naming * comment out parboil as it is using python2 * Add GPU ubench to clean target * Use dynamic linking by default for GPU apps * Add test binaries for GMMA instruction * Checkout CUTLASS during ci * Use type to specify gmma ubench iteration count and update test code * Fix typos * Update Makefiles and setup_environment to use C++17 standard * missed one rename * Remove unused clean target and tma build steps from Makefile --------- Co-authored-by: JRPAN <25518778+JRPan@users.noreply.github.com>
1 parent 6a9bc05 commit c511dd7

337 files changed

Lines changed: 3519 additions & 94 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.github/workflows/test-build.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ jobs:
3030
run: |
3131
git config --global --add safe.directory /__w/gpu-app-collection/gpu-app-collection
3232
git submodule update --init -- src/cuda/cuda-samples
33+
git submodule update --init -- src/cuda/cutlass-bench
3334
/bin/bash test-build.sh ci
3435
3536
- name: Print Successful Apps

.gitignore

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,4 +15,13 @@ src/cuda/rodinia/3.1/cuda/particlefilter/particlefilter_naive
1515
src/cuda/rodinia/3.1/cuda/pathfinder/pathfinder
1616
4.2
1717
.venv/
18-
__pycache__/
18+
__pycache__/
19+
compile_commands.json
20+
.cache/
21+
tmp/
22+
23+
# Ignoring files without extension (but keep Makefile and files with extensions)
24+
src/cuda/GPU_Microbenchmark/ubench/**/*
25+
!src/cuda/GPU_Microbenchmark/ubench/**/*/
26+
!src/cuda/GPU_Microbenchmark/ubench/**/*.*
27+
!src/cuda/GPU_Microbenchmark/ubench/**/Makefile

src/Makefile

Lines changed: 19 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -109,8 +109,6 @@ GPU_Microbenchmark:
109109
mkdir -p $(BINDIR)/$(BINSUBDIR)/
110110
$(SETENV) $(MAKE) $(MAKE_ARGS) -C cuda/GPU_Microbenchmark
111111
mv cuda/GPU_Microbenchmark/bin/* $(BINDIR)/$(BINSUBDIR)/
112-
clean_GPU_Microbenchmark:
113-
find cuda/GPU_Microbenchmark/ubench -type f -executable -delete
114112

115113

116114
Deepbench_nvidia:
@@ -540,22 +538,23 @@ clean_shoc:
540538
cd cuda/shoc-master/; $(MAKE) clean; $(MAKE) distclean
541539

542540
clean_parboil:
543-
$(SETENV) cd cuda/parboil; ./parboil clean cutcp cuda
544-
$(SETENV) cd cuda/parboil; ./parboil clean bfs cuda
545-
$(SETENV) cd cuda/parboil; ./parboil clean histo cuda
546-
$(SETENV) cd cuda/parboil; ./parboil clean lbm cuda
547-
$(SETENV) cd cuda/parboil; ./parboil clean mri-gridding cuda
548-
$(SETENV) cd cuda/parboil; ./parboil clean mri-q cuda
549-
$(SETENV) cd cuda/parboil; ./parboil clean sad cuda
550-
$(SETENV) cd cuda/parboil; ./parboil clean sgemm cuda
551-
$(SETENV) cd cuda/parboil; ./parboil clean spmv cuda
552-
$(SETENV) cd cuda/parboil; ./parboil clean stencil cuda
553-
$(SETENV) cd cuda/parboil; ./parboil clean tpacf cuda
541+
# Commented out as parboil uses Python2
542+
# $(SETENV) cd cuda/parboil; ./parboil clean cutcp cuda
543+
# $(SETENV) cd cuda/parboil; ./parboil clean bfs cuda
544+
# $(SETENV) cd cuda/parboil; ./parboil clean histo cuda
545+
# $(SETENV) cd cuda/parboil; ./parboil clean lbm cuda
546+
# $(SETENV) cd cuda/parboil; ./parboil clean mri-gridding cuda
547+
# $(SETENV) cd cuda/parboil; ./parboil clean mri-q cuda
548+
# $(SETENV) cd cuda/parboil; ./parboil clean sad cuda
549+
# $(SETENV) cd cuda/parboil; ./parboil clean sgemm cuda
550+
# $(SETENV) cd cuda/parboil; ./parboil clean spmv cuda
551+
# $(SETENV) cd cuda/parboil; ./parboil clean stencil cuda
552+
# $(SETENV) cd cuda/parboil; ./parboil clean tpacf cuda
554553

555554
clean_parboil_hw_power:
556-
$(SETENV) cd cuda/parboil; ./parboil clean mri-q cuda_k1
557-
$(SETENV) cd cuda/parboil; ./parboil clean sad cuda_k1
558-
$(SETENV) cd cuda/parboil; ./parboil clean sgemm cuda_k1
555+
# $(SETENV) cd cuda/parboil; ./parboil clean mri-q cuda_k1
556+
# $(SETENV) cd cuda/parboil; ./parboil clean sad cuda_k1
557+
# $(SETENV) cd cuda/parboil; ./parboil clean sgemm cuda_k1
559558

560559
clean_lonestargpu-2.0:
561560
$(setenv) $(MAKE) $(make_args) noinline=$(noinline) -C cuda/lonestargpu-2.0 clean
@@ -698,4 +697,7 @@ clean_cuda_samples:
698697
$(MAKE) clean -C ./cuda/cuda-samples/build
699698

700699
clean_huggingface:
701-
rm -rf $(BINDIR)/$(BINSUBDIR)/huggingface
700+
rm -rf $(BINDIR)/$(BINSUBDIR)/huggingface
701+
702+
clean_GPU_Microbenchmark:
703+
$(MAKE) clean -C ./cuda/GPU_Microbenchmark
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
11
bin/
22
*.o
33
*.out
4+
*.a
5+
*.ptx

src/cuda/GPU_Microbenchmark/common/common.mk

Lines changed: 32 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -9,16 +9,42 @@ CC := nvcc
99

1010
LIB :=
1111

12-
release:
13-
$(CC) $(NVCC_FLAGS) $(CUOPTS) $(SRC) -o $(EXE) -I$(INCLUDE) -L$(LIB) -lcudart
12+
# Generate object file list from SRC (for parallel compilation)
13+
CUDA_SRC_FILES := $(filter %.cu, $(SRC))
14+
CPP_SRC_FILES := $(filter %.cpp, $(SRC))
15+
C_SRC_FILES := $(filter %.c, $(SRC))
16+
17+
# To preserve PTX in multi-step compilation, we have to compile the CUDA source files to .a files
18+
CUDA_LIB_FILES := $(CUDA_SRC_FILES:.cu=.a)
19+
20+
# Host side source files
21+
CPP_OBJECT_FILES := $(CPP_SRC_FILES:.cpp=.o)
22+
C_OBJECT_FILES := $(C_SRC_FILES:.c=.o)
23+
OBJECT_FILES := $(CPP_OBJECT_FILES) $(C_OBJECT_FILES)
24+
25+
# If multiple source files are provided, compile them separately and link
26+
# To preserve PTX in final binary: First create static library, then link to executable
27+
# This avoids nvlink stripping PTX during device linking
28+
release: $(CUDA_LIB_FILES) $(OBJECT_FILES)
29+
$(CC) $(NVCC_FLAGS) $^ -o $(EXE) -L$(LIB) -lcudart --cudart shared
1430
mv $(EXE) $(BIN_DIR)
1531

32+
# Pattern rule for compiling individual .cu files to .o files
33+
%.a: %.cu
34+
$(CC) $(NVCC_FLAGS) $(INCLUDE) $(CUOPTS) --lib $< -o $@
35+
36+
%.o: %.cpp
37+
$(CC) $(NVCC_FLAGS) $(INCLUDE) $(CUOPTS) -dc $< -o $@
38+
39+
%.o: %.c
40+
$(CC) $(NVCC_FLAGS) $(INCLUDE) $(CUOPTS) -dc $< -o $@
41+
1642
tuner:
17-
$(CC) $(NVCC_FLAGS) $(CUOPTS) -DTUNER $(SRC) -o $(EXE) -I$(INCLUDE) -L$(LIB) -lcudart
43+
$(CC) $(NVCC_FLAGS) $(CUOPTS) -DTUNER $(SRC) -o $(EXE) $(INCLUDE) -L$(LIB) -lcudart --cudart shared
1844
mv $(EXE) $(BIN_DIR)
1945

2046
clean:
21-
rm -f *.o; rm -f $(EXE)
47+
rm -f *.o *.a *.ptx *.sass $(OBJECTS) $(CUDA_LIB_FILES); rm -f $(EXE) $(LIB_FILE)
2248

2349
run:
2450
./$(EXE)
@@ -36,7 +62,7 @@ nvsight:
3662
nv-nsight-cu-cli --metrics gpc__cycles_elapsed.avg,sm__cycles_elapsed.sum,smsp__inst_executed.sum,sm__warps_active.avg.pct_of_peak_sustained_active,l1tex__t_sectors_pipe_lsu_mem_global_op_ld_lookup_hit.sum,l1tex__t_sectors_pipe_lsu_mem_global_op_ld.sum,l1tex__t_sectors_pipe_lsu_mem_global_op_st_lookup_hit.sum,l1tex__t_sectors_pipe_lsu_mem_global_op_st.sum,lts__t_sectors_srcunit_tex_op_read.sum,lts__t_sectors_srcunit_tex_op_write.sum,lts__t_sectors_srcunit_tex_op_read_lookup_hit.sum,lts__t_sectors_srcunit_tex_op_write_lookup_hit.sum,lts__t_sector_op_read_hit_rate.pct,lts__t_sector_op_write_hit_rate.pct,lts__t_sectors_srcunit_tex_op_read.sum.per_second,dram__sectors_read.sum,dram__sectors_write.sum,dram__bytes_read.sum --csv --page raw ./$(EXE) | tee nsight.csv
3763

3864
ptx:
39-
cuobjdump -ptx ./$(EXE) tee ptx.txt
65+
cuobjdump -ptx ./$(EXE) | tee $(EXE).ptx
4066

4167
sass:
42-
cuobjdump -sass ./$(EXE) tee sass.txt
68+
cuobjdump -sass ./$(EXE) | tee $(EXE).sass

src/cuda/GPU_Microbenchmark/hw_def/common/common.h

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ enum dram_model { GDDR5 = 1, GDDR5X = 2, GDDR6 = 3, HBM = 4 };
2222

2323
// source:
2424
// https://stackoverflow.com/questions/466204/rounding-up-to-next-power-of-2
25-
unsigned round_up_2n(unsigned v) {
25+
inline unsigned round_up_2n(unsigned v) {
2626
v--;
2727
v |= v >> 1;
2828
v |= v >> 2;
@@ -34,9 +34,9 @@ unsigned round_up_2n(unsigned v) {
3434
return v;
3535
}
3636

37-
unsigned round_up_2n(float n) { return round_up_2n((unsigned)ceil(n)); }
37+
inline unsigned round_up_2n(float n) { return round_up_2n((unsigned)ceil(n)); }
3838

39-
bool isPowerOfTwo(int n) {
39+
inline bool isPowerOfTwo(int n) {
4040
if (n == 0)
4141
return false;
4242

@@ -51,12 +51,12 @@ static const unsigned dram_model_burst_length[] = {0, 8, 8, 16, 2};
5151
static const unsigned dram_model_freq_ratio[] = {0, 4, 4, 4, 2};
5252
// atom size =
5353
// dram_model_channel_width*dram_model_mem_per_ctrlr*dram_model_burst_length
54-
unsigned get_atom_size_inByte(enum dram_model model) {
54+
inline unsigned get_atom_size_inByte(enum dram_model model) {
5555
return (dram_model_bus_width[model] / 8) * dram_model_mem_per_ctrlr[model] *
5656
dram_model_burst_length[model];
5757
}
5858
// CCD = dram_model_burst_length/dram_model_freq_ratio
59-
unsigned get_adjusted_CCD(enum dram_model model) {
59+
inline unsigned get_adjusted_CCD(enum dram_model model) {
6060
assert(dram_model_burst_length[model] % dram_model_freq_ratio[model] == 0);
6161
return dram_model_burst_length[model] / dram_model_freq_ratio[model];
6262
}

src/cuda/GPU_Microbenchmark/hw_def/common/gpuConfig.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ struct GpuConfig
4545
unsigned FBP_COUNT = 0; // Frame Buffer Partitions
4646
unsigned L2_BANKS = 0; // L2 Cache Banks (LTCs)
4747
};
48-
GpuConfig config;
48+
inline GpuConfig config;
4949
// Parses short flags like --sm 80 into a GpuConfig object
5050
inline void parseGpuConfigArgs(int argc, char *argv[])
5151
{
@@ -158,7 +158,7 @@ inline void gpuAssert(cudaError_t code, const char *file, int line,
158158
}
159159
}
160160

161-
cudaDeviceProp deviceProp;
161+
inline cudaDeviceProp deviceProp;
162162

163163
// NVIDIA RM API defines
164164
#define NV_IOCTL_MAGIC 'F'
@@ -237,7 +237,7 @@ inline unsigned queryGrInfo(uint32_t info_index)
237237
return result;
238238
}
239239

240-
unsigned intilizeDeviceProp(unsigned deviceID, int argc, char *argv[])
240+
inline unsigned initializeDeviceProp(unsigned deviceID, int argc, char *argv[])
241241
{
242242
// Check if running in GPGPU-Sim by looking for gpgpusim.config
243243
std::ifstream configFile("gpgpusim.config");

src/cuda/GPU_Microbenchmark/ubench/atomics/Atomic_add_bw/atomic_add_bw.cu

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ __global__ void atomic_bw(uint64_t *startClk, uint64_t *stopClk, T *data1,
5151
int main(int argc, char *argv[])
5252
{
5353

54-
intilizeDeviceProp(0, argc, argv);
54+
initializeDeviceProp(0, argc, argv);
5555

5656
// Parse command line arguments for --fast flag
5757
uint32_t repeat_times = 2048; // default

src/cuda/GPU_Microbenchmark/ubench/atomics/Atomic_add_bw_conflict/atomic_add_bw_conflict.cu

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ __global__ void atomic_bw(uint32_t *startClk, uint32_t *stopClk, T *data1,
4040
int main(int argc, char *argv[])
4141
{
4242

43-
intilizeDeviceProp(0, argc, argv);
43+
initializeDeviceProp(0, argc, argv);
4444
config.BLOCKS_NUM = config.SM_NUMBER * 2;
4545
config.TOTAL_THREADS = config.THREADS_PER_BLOCK * config.BLOCKS_NUM;
4646

src/cuda/GPU_Microbenchmark/ubench/atomics/Atomic_add_lat/atomic_add_lat.cu

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ __global__ void atmoic_latency(uint32_t *startClk, uint32_t *stopClk, T *data1,
4242
int main(int argc, char *argv[])
4343
{
4444

45-
intilizeDeviceProp(0, argc, argv);
45+
initializeDeviceProp(0, argc, argv);
4646

4747
config.THREADS_PER_BLOCK = 1;
4848
config.THREADS_PER_SM = 1;

0 commit comments

Comments
 (0)