Skip to content
This repository was archived by the owner on May 27, 2021. It is now read-only.

Commit 6561ad7

Browse files
Add benchmark scripts and results
1 parent cdadcbf commit 6561ad7

File tree

14 files changed

+252
-0
lines changed

14 files changed

+252
-0
lines changed
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
N,runtime
2+
128,9241.600000
3+
256,13564.800000
4+
512,23936.000000
5+
1024,69990.400000
6+
2048,459043.200000
7+
4096,3187926.400000
8+
8192,24734774.400000
9+
16384,192036652.800000
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
using CUDAapi
2+
using CUDAdrv
3+
using CUDAnative
4+
using CUDAnative.MatMul
5+
using CuArrays
6+
7+
# GEMM problem size, read from the command line: A is M×K, B is K×N and
# C is M×N. Fail early with a usage message rather than a BoundsError on ARGS.
length(ARGS) >= 3 || throw(ArgumentError("usage: julia cublas.jl <M> <N> <K>"))

# `const` so that code reading these globals can be type-inferred.
const M = parse(Int, ARGS[1])
const N = parse(Int, ARGS[2])
const K = parse(Int, ARGS[3])
10+
11+
"""
    benchmark_matmul(a, b, c, d)

Issue one `cublasGemmEx` call on `a`, `b` and `c` inside `CuArrays.@sync`.

`a` and `b` are passed as `R_16F` operands and `c` as an `R_32F` operand with
`R_32F` compute type; both scalar arguments are `Float32(1)`. The math mode of
the cuBLAS handle is set to `CUBLAS_TENSOR_OP_MATH` before the call.

`d` is not used by this function.
"""
function benchmark_matmul(a, b, c, d)
    # Take the problem size from the operands themselves instead of reading
    # the script's untyped globals; `a` is m×k and `b` is k×n.
    m, k = size(a)
    n = size(b, 2)

    CuArrays.@sync begin
        CUBLAS.cublasSetMathMode(CUBLAS.handle(), CUBLAS.CUBLAS_TENSOR_OP_MATH)
        CUBLAS.cublasGemmEx(
            CUBLAS.handle(), CUBLAS.CUBLAS_OP_N, CUBLAS.CUBLAS_OP_N,
            m, n, k,
            [Float32(1)],
            a, CUDAapi.R_16F, m,   # leading dimension of a
            b, CUDAapi.R_16F, k,   # leading dimension of b
            [Float32(1)],
            c, CUDAapi.R_32F, m,   # leading dimension of c
            CUDAapi.R_32F, CUBLAS.CUBLAS_GEMM_DEFAULT,
        )
    end
end
17+
18+
# Host-side operands: A (M×K) and B (K×N) in Float16, each divided by sqrt(K);
# C (M×N) in Float32.
a_h = rand(Float16, M, K) ./ sqrt(Float16(K))
b_h = rand(Float16, K, N) ./ sqrt(Float16(K))
c_h = rand(Float32, M, N)

# Device copies of the operands, plus an output buffer shaped like `c`.
a, b, c = CuArray(a_h), CuArray(b_h), CuArray(c_h)
d = similar(c)

# warmup: one call outside the profiled region
benchmark_matmul(a, b, c, d)

# profile: ten calls, each wrapped in CUDAdrv.@profile
for _ in 1:10
    CUDAdrv.@profile benchmark_matmul(a, b, c, d)
end
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
#!/usr/bin/env bash
# Profile cublas.jl with nv-nsight-cu-cli for square problem sizes
# N = 2^7 .. 2^14 and write one "N,runtime" row per size to cublas.csv.
#
# Argument:
#   JULIA_PATH  directory that contains the `julia` executable and `usr/lib`
set -Eeuo pipefail

# Arithmetic test: `<` inside [[ ]] compares strings lexicographically.
if (( $# < 1 )); then
    echo "Usage $0 <JULIA_PATH>" 1>&2
    exit 1
fi

JULIA_PATH=$1

# Work from the script's own directory so cublas.jl and cublas.csv resolve
# regardless of where the script was invoked from.
cd "$( dirname "${BASH_SOURCE[0]}" )"

printf 'N,runtime\n' >cublas.csv

for i in {7..14}; do
    N=$((2**i))

    # runtime in ns: last CSV field of the last 'gpu__time_duration' row,
    # with the surrounding double quotes stripped.
    runtime=$(LD_LIBRARY_PATH="${JULIA_PATH}/usr/lib" nv-nsight-cu-cli \
        --profile-from-start off -f --summary per-kernel --csv --units base \
        "${JULIA_PATH}/julia" cublas.jl "$N" "$N" "$N" 2>/dev/null \
        | grep 'gpu__time_duration' | tail -1 | awk -F',' '{print $NF}' | sed 's/"//g')

    # Fixed format string: never let measured data be interpreted by printf.
    printf '%s,%s\n' "$N" "$runtime" >>cublas.csv
done
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
N,runtime
2+
128,17462.400000
3+
256,26332.800000
4+
512,43344.000000
5+
1024,87014.400000
6+
2048,540777.600000
7+
4096,3967702.400000
8+
8192,30435030.400000
9+
16384,236893779.200000
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
using CUDAdrv
2+
using CUDAnative
3+
using CUDAnative.MatMul
4+
using CuArrays
5+
6+
# GEMM problem size, read from the command line: A is M×K, B is K×N and
# C is M×N. Fail early with a usage message rather than a BoundsError on ARGS.
length(ARGS) >= 3 || throw(ArgumentError("usage: julia cudanative.jl <M> <N> <K>"))

# `const` so that code reading these globals can be type-inferred.
const M = parse(Int, ARGS[1])
const N = parse(Int, ARGS[2])
const K = parse(Int, ARGS[3])
9+
10+
"""
    benchmark_matmul(a, b, c, d)

Build a `MatMul` configuration for the given operands and run
`MatMul.matmul(a, b, c, d, conf)` inside `CuArrays.@sync`.

The configuration uses `Operator.WMMAOp{16, 16, 16}`, an
`AlignedColMajor{Float16}` layout for A and an `AlignedColMajor{Float32}`
layout for C.
"""
function benchmark_matmul(a, b, c, d)
    # Take the problem size from the operands themselves instead of reading
    # the script's untyped globals; `a` is m×k and `b` is k×n.
    m, k = size(a)
    n = size(b, 2)

    CuArrays.@sync begin
        conf = MatMul.get_config(
            gemm_shape = (M = m, N = n, K = k),
            operator = Operator.WMMAOp{16, 16, 16},
            global_a_layout = Layout.AlignedColMajor{Float16},
            global_c_layout = Layout.AlignedColMajor{Float32},
        )
        MatMul.matmul(a, b, c, d, conf)
    end
end
21+
22+
# Host-side operands: A (M×K) and B (K×N) in Float16, each divided by sqrt(K);
# C (M×N) in Float32.
a_h = rand(Float16, M, K) ./ sqrt(Float16(K))
b_h = rand(Float16, K, N) ./ sqrt(Float16(K))
c_h = rand(Float32, M, N)

# Device copies of the operands, plus an output buffer shaped like `c`.
a, b, c = CuArray(a_h), CuArray(b_h), CuArray(c_h)
d = similar(c)

# warmup: one call outside the profiled region
benchmark_matmul(a, b, c, d)

# profile: ten calls, each wrapped in CUDAdrv.@profile
for _ in 1:10
    CUDAdrv.@profile benchmark_matmul(a, b, c, d)
end
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
#!/usr/bin/env bash
# Profile cudanative.jl with nv-nsight-cu-cli for square problem sizes
# N = 2^7 .. 2^14 and write one "N,runtime" row per size to cudanative.csv.
#
# Argument:
#   JULIA_PATH  directory that contains the `julia` executable and `usr/lib`
set -Eeuo pipefail

# Arithmetic test: `<` inside [[ ]] compares strings lexicographically.
if (( $# < 1 )); then
    echo "Usage $0 <JULIA_PATH>" 1>&2
    exit 1
fi

JULIA_PATH=$1

# Work from the script's own directory so cudanative.jl and cudanative.csv
# resolve regardless of where the script was invoked from.
cd "$( dirname "${BASH_SOURCE[0]}" )"

printf 'N,runtime\n' >cudanative.csv

for i in {7..14}; do
    N=$((2**i))

    # runtime in ns: last CSV field of the last 'gpu__time_duration' row,
    # with the surrounding double quotes stripped.
    runtime=$(LD_LIBRARY_PATH="${JULIA_PATH}/usr/lib" nv-nsight-cu-cli \
        --profile-from-start off -f --summary per-kernel --csv --units base \
        "${JULIA_PATH}/julia" cudanative.jl "$N" "$N" "$N" 2>/dev/null \
        | grep 'gpu__time_duration' | tail -1 | awk -F',' '{print $NF}' | sed 's/"//g')

    # Fixed format string: never let measured data be interpreted by printf.
    printf '%s,%s\n' "$N" "$runtime" >>cudanative.csv
done
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
N,runtime
2+
128,20493.333333
3+
256,36458.666667
4+
512,62733.333333
5+
1024,119813.333333
6+
2048,465450.666667
7+
4096,3440157.333333
8+
8192,26701152.000000
9+
16384,215024610.666667
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
#!/usr/bin/env bash
# Profile cutlass_profiler (tensor-op GEMM, instruction shape 16x8x8) with
# nv-nsight-cu-cli for square problem sizes N = 2^7 .. 2^14 and write one
# "N,runtime" row per size to cutlass-mma-turing.csv.
#
# Argument:
#   CUTLASS_BUILD_PATH  directory that contains tools/profiler/cutlass_profiler
set -Eeuo pipefail

# Arithmetic test: `<` inside [[ ]] compares strings lexicographically.
if (( $# < 1 )); then
    echo "Usage $0 <CUTLASS_BUILD_PATH>" 1>&2
    exit 1
fi

CUTLASS_BUILD_PATH=$1

# Work from the script's own directory so the CSV lands next to the script
# regardless of where it was invoked from.
cd "$( dirname "${BASH_SOURCE[0]}" )"

printf 'N,runtime\n' >cutlass-mma-turing.csv

for i in {7..14}; do
    N=$((2**i))

    # runtime in ns: last CSV field of the last 'gpu__time_duration' row,
    # with the surrounding double quotes stripped.
    runtime=$(nv-nsight-cu-cli -f --summary per-kernel --csv --units base -k Kernel \
        "${CUTLASS_BUILD_PATH}/tools/profiler/cutlass_profiler" \
        --op_class=tensorop --A=f16:col --B=f16:col --C=f32 --accum=f32 \
        --m="$N" --n="$N" --k="$N" --inst_m=16 --inst_n=8 --inst_k=8 \
        --warmup-iterations=1 --profiling-iterations=10 2>/dev/null \
        | grep 'gpu__time_duration' | tail -1 | awk -F',' '{print $NF}' | sed 's/"//g')

    # Fixed format string: never let measured data be interpreted by printf.
    printf '%s,%s\n' "$N" "$runtime" >>cutlass-mma-turing.csv
done
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
N,runtime
2+
128,20309.333333
3+
256,33522.666667
4+
512,59837.333333
5+
1024,118997.333333
6+
2048,827818.666667
7+
4096,6395536.000000
8+
8192,49197301.333333
9+
16384,400406416.000000
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
#!/usr/bin/env bash
# Profile cutlass_profiler (tensor-op GEMM, instruction shape 8x8x4) with
# nv-nsight-cu-cli for square problem sizes N = 2^7 .. 2^14 and write one
# "N,runtime" row per size to cutlass-mma.csv.
#
# Argument:
#   CUTLASS_BUILD_PATH  directory that contains tools/profiler/cutlass_profiler
set -Eeuo pipefail

# Arithmetic test: `<` inside [[ ]] compares strings lexicographically.
if (( $# < 1 )); then
    echo "Usage $0 <CUTLASS_BUILD_PATH>" 1>&2
    exit 1
fi

CUTLASS_BUILD_PATH=$1

# Work from the script's own directory so the CSV lands next to the script
# regardless of where it was invoked from.
cd "$( dirname "${BASH_SOURCE[0]}" )"

printf 'N,runtime\n' >cutlass-mma.csv

for i in {7..14}; do
    N=$((2**i))

    # runtime in ns: last CSV field of the last 'gpu__time_duration' row,
    # with the surrounding double quotes stripped.
    runtime=$(nv-nsight-cu-cli -f --summary per-kernel --csv --units base -k Kernel \
        "${CUTLASS_BUILD_PATH}/tools/profiler/cutlass_profiler" \
        --op_class=tensorop --A=f16:col --B=f16:col --C=f32 --accum=f32 \
        --m="$N" --n="$N" --k="$N" --inst_m=8 --inst_n=8 --inst_k=4 \
        --warmup-iterations=1 --profiling-iterations=10 2>/dev/null \
        | grep 'gpu__time_duration' | tail -1 | awk -F',' '{print $NF}' | sed 's/"//g')

    # Fixed format string: never let measured data be interpreted by printf.
    printf '%s,%s\n' "$N" "$runtime" >>cutlass-mma.csv
done

0 commit comments

Comments
 (0)