Add benchmark scripts and results

thomasfaingnaert · thomasfaingnaert · commit 6561ad79de7b · 2020-04-25T19:18:57.000+02:00
diff --git a/test/perf/matmul_kernels/wmma/cublas.csv b/test/perf/matmul_kernels/wmma/cublas.csv
@@ -0,0 +1,9 @@
+N,runtime
+128,9241.600000
+256,13564.800000
+512,23936.000000
+1024,69990.400000
+2048,459043.200000
+4096,3187926.400000
+8192,24734774.400000
+16384,192036652.800000
diff --git a/test/perf/matmul_kernels/wmma/cublas.jl b/test/perf/matmul_kernels/wmma/cublas.jl
@@ -0,0 +1,65 @@
+# Benchmark: mixed-precision GEMM (Float16 inputs, Float32 result) through cuBLAS.
+#
+# Usage: julia cublas.jl <M> <N> <K>
+#
+# One warm-up call is followed by ten calls wrapped in `CUDAdrv.@profile`; cublas.sh
+# launches the profiler with `--profile-from-start off` so only those are recorded.
+
+using CUDAapi
+using CUDAdrv
+using CUDAnative
+using CUDAnative.MatMul
+using CuArrays
+
+# Fail with a usage message instead of a BoundsError when arguments are missing.
+length(ARGS) == 3 || error("Usage: julia $(PROGRAM_FILE) <M> <N> <K>")
+
+# GEMM problem size. `const` keeps the globals read by `benchmark_matmul` typed.
+const M = parse(Int, ARGS[1])
+const N = parse(Int, ARGS[2])
+const K = parse(Int, ARGS[3])
+
+"""
+    benchmark_matmul(a, b, c, d)
+
+Issue a single `cublasGemmEx` call with tensor-op math enabled, inside `CuArrays.@sync`.
+
+`a` and `b` are passed as 16-bit float operands with leading dimensions `M` and `K`,
+`c` as the 32-bit float output with leading dimension `M`. Both scalar arguments
+(alpha and beta in the cuBLAS GEMM signature) are `1`, so `c` is also read as an input.
+`d` is not used; it is accepted so the signature matches the call sites below.
+"""
+function benchmark_matmul(a, b, c, d)
+    CuArrays.@sync begin
+        CUBLAS.cublasSetMathMode(CUBLAS.handle(), CUBLAS.CUBLAS_TENSOR_OP_MATH)
+        CUBLAS.cublasGemmEx(
+            CUBLAS.handle(), CUBLAS.CUBLAS_OP_N, CUBLAS.CUBLAS_OP_N,
+            M, N, K,
+            [Float32(1)],                 # alpha
+            a, CUDAapi.R_16F, M,          # A, its data type, lda
+            b, CUDAapi.R_16F, K,          # B, its data type, ldb
+            [Float32(1)],                 # beta
+            c, CUDAapi.R_32F, M,          # C, its data type, ldc
+            CUDAapi.R_32F, CUBLAS.CUBLAS_GEMM_DEFAULT
+        )
+    end
+end
+
+# Host inputs; A and B are scaled by 1/sqrt(K).
+a_h = rand(Float16, (M, K)) / sqrt(Float16(K))
+b_h = rand(Float16, (K, N)) / sqrt(Float16(K))
+c_h = rand(Float32, (M, N))
+
+# Device copies; `d` is allocated with `similar(c)`.
+a   = CuArray(a_h)
+b   = CuArray(b_h)
+c   = CuArray(c_h)
+d   = similar(c)
+
+# warmup
+benchmark_matmul(a, b, c, d)
+
+# profile
+for _ in 1:10
+    CUDAdrv.@profile benchmark_matmul(a, b, c, d)
+end
diff --git a/test/perf/matmul_kernels/wmma/cublas.sh b/test/perf/matmul_kernels/wmma/cublas.sh
@@ -0,0 +1,32 @@
+#!/usr/bin/env bash
+# Profile cublas.jl with Nsight Compute for N = 2^7 .. 2^14 (M = N = K) and write
+# one "N,runtime" row per size to cublas.csv next to this script.
+#
+# Usage: cublas.sh <JULIA_PATH>
+#   JULIA_PATH  directory that contains the `julia` executable and `usr/lib`
+set -Eeuo pipefail
+
+# Arithmetic test: inside [[ ]] the `<` operator compares strings, not numbers.
+if (( $# < 1 )); then
+    echo "Usage $0 <JULIA_PATH>" 1>&2
+    exit 1
+fi
+
+# Make the path absolute now; a relative one would no longer resolve after the `cd`.
+JULIA_PATH=$(cd -- "$1" >/dev/null && pwd)
+
+# Work from the script's directory so cublas.jl and cublas.csv are found there.
+cd "$( dirname "${BASH_SOURCE[0]}" )"
+
+printf "N,runtime\n" >cublas.csv
+
+for i in {7..14}; do
+    N=$((2**i))
+
+    # runtime in ns: last comma-separated field of the last `gpu__time_duration`
+    # line of the profiler's CSV output, with double quotes removed.
+    runtime=$(LD_LIBRARY_PATH="${JULIA_PATH}/usr/lib" nv-nsight-cu-cli --profile-from-start off -f --summary per-kernel --csv --units base "${JULIA_PATH}/julia" cublas.jl "$N" "$N" "$N" 2>/dev/null | grep 'gpu__time_duration' | tail -1 | awk -F',' '{print $NF}' | sed 's/"//g')
+
+    # Values are passed as arguments so they are never interpreted as a format string.
+    printf '%s,%s\n' "$N" "$runtime" >>cublas.csv
+done
diff --git a/test/perf/matmul_kernels/wmma/cudanative.csv b/test/perf/matmul_kernels/wmma/cudanative.csv
@@ -0,0 +1,9 @@
+N,runtime
+128,17462.400000
+256,26332.800000
+512,43344.000000
+1024,87014.400000
+2048,540777.600000
+4096,3967702.400000
+8192,30435030.400000
+16384,236893779.200000
diff --git a/test/perf/matmul_kernels/wmma/cudanative.jl b/test/perf/matmul_kernels/wmma/cudanative.jl
@@ -0,0 +1,60 @@
+# Benchmark: mixed-precision GEMM (Float16 inputs, Float32 result) through
+# CUDAnative's MatMul kernel with the 16x16x16 WMMA operator.
+#
+# Usage: julia cudanative.jl <M> <N> <K>
+#
+# One warm-up call is followed by ten calls wrapped in `CUDAdrv.@profile`; cudanative.sh
+# launches the profiler with `--profile-from-start off` so only those are recorded.
+
+using CUDAdrv
+using CUDAnative
+using CUDAnative.MatMul
+using CuArrays
+
+# Fail with a usage message instead of a BoundsError when arguments are missing.
+length(ARGS) == 3 || error("Usage: julia $(PROGRAM_FILE) <M> <N> <K>")
+
+# GEMM problem size. `const` keeps the globals read by `benchmark_matmul` typed.
+const M = parse(Int, ARGS[1])
+const N = parse(Int, ARGS[2])
+const K = parse(Int, ARGS[3])
+
+"""
+    benchmark_matmul(a, b, c, d)
+
+Build a `MatMul` configuration for the `M` x `N` x `K` problem and run
+`MatMul.matmul(a, b, c, d, conf)` once, inside `CuArrays.@sync`.
+
+The configuration selects `Operator.WMMAOp{16, 16, 16}`, an aligned column-major
+`Float16` layout for global A and an aligned column-major `Float32` layout for global C.
+"""
+function benchmark_matmul(a, b, c, d)
+    CuArrays.@sync begin
+        conf = MatMul.get_config(
+            gemm_shape = (M = M, N = N, K = K),
+            operator = Operator.WMMAOp{16, 16, 16},
+            global_a_layout = Layout.AlignedColMajor{Float16},
+            global_c_layout = Layout.AlignedColMajor{Float32},
+        )
+        MatMul.matmul(a, b, c, d, conf)
+    end
+end
+
+# Host inputs; A and B are scaled by 1/sqrt(K).
+a_h = rand(Float16, (M, K)) / sqrt(Float16(K))
+b_h = rand(Float16, (K, N)) / sqrt(Float16(K))
+c_h = rand(Float32, (M, N))
+
+# Device copies; `d` is allocated with `similar(c)` (presumably the output buffer).
+a   = CuArray(a_h)
+b   = CuArray(b_h)
+c   = CuArray(c_h)
+d   = similar(c)
+
+# warmup
+benchmark_matmul(a, b, c, d)
+
+# profile
+for _ in 1:10
+    CUDAdrv.@profile benchmark_matmul(a, b, c, d)
+end
diff --git a/test/perf/matmul_kernels/wmma/cudanative.sh b/test/perf/matmul_kernels/wmma/cudanative.sh
@@ -0,0 +1,32 @@
+#!/usr/bin/env bash
+# Profile cudanative.jl with Nsight Compute for N = 2^7 .. 2^14 (M = N = K) and write
+# one "N,runtime" row per size to cudanative.csv next to this script.
+#
+# Usage: cudanative.sh <JULIA_PATH>
+#   JULIA_PATH  directory that contains the `julia` executable and `usr/lib`
+set -Eeuo pipefail
+
+# Arithmetic test: inside [[ ]] the `<` operator compares strings, not numbers.
+if (( $# < 1 )); then
+    echo "Usage $0 <JULIA_PATH>" 1>&2
+    exit 1
+fi
+
+# Make the path absolute now; a relative one would no longer resolve after the `cd`.
+JULIA_PATH=$(cd -- "$1" >/dev/null && pwd)
+
+# Work from the script's directory so cudanative.jl and cudanative.csv are found there.
+cd "$( dirname "${BASH_SOURCE[0]}" )"
+
+printf "N,runtime\n" >cudanative.csv
+
+for i in {7..14}; do
+    N=$((2**i))
+
+    # runtime in ns: last comma-separated field of the last `gpu__time_duration`
+    # line of the profiler's CSV output, with double quotes removed.
+    runtime=$(LD_LIBRARY_PATH="${JULIA_PATH}/usr/lib" nv-nsight-cu-cli --profile-from-start off -f --summary per-kernel --csv --units base "${JULIA_PATH}/julia" cudanative.jl "$N" "$N" "$N" 2>/dev/null | grep 'gpu__time_duration' | tail -1 | awk -F',' '{print $NF}' | sed 's/"//g')
+
+    # Values are passed as arguments so they are never interpreted as a format string.
+    printf '%s,%s\n' "$N" "$runtime" >>cudanative.csv
+done
diff --git a/test/perf/matmul_kernels/wmma/cutlass-mma-turing.csv b/test/perf/matmul_kernels/wmma/cutlass-mma-turing.csv
@@ -0,0 +1,9 @@
+N,runtime
+128,20493.333333
+256,36458.666667
+512,62733.333333
+1024,119813.333333
+2048,465450.666667
+4096,3440157.333333
+8192,26701152.000000
+16384,215024610.666667
diff --git a/test/perf/matmul_kernels/wmma/cutlass-mma-turing.sh b/test/perf/matmul_kernels/wmma/cutlass-mma-turing.sh
@@ -0,0 +1,33 @@
+#!/usr/bin/env bash
+# Profile cutlass_profiler with Nsight Compute for N = 2^7 .. 2^14 (m = n = k),
+# using --op_class=tensorop and instruction shape 16x8x8, and write one "N,runtime"
+# row per size to cutlass-mma-turing.csv next to this script.
+#
+# Usage: cutlass-mma-turing.sh <CUTLASS_BUILD_PATH>
+#   CUTLASS_BUILD_PATH  directory that contains tools/profiler/cutlass_profiler
+set -Eeuo pipefail
+
+# Arithmetic test: inside [[ ]] the `<` operator compares strings, not numbers.
+if (( $# < 1 )); then
+    echo "Usage $0 <CUTLASS_BUILD_PATH>" 1>&2
+    exit 1
+fi
+
+# Make the path absolute now; a relative one would no longer resolve after the `cd`.
+CUTLASS_BUILD_PATH=$(cd -- "$1" >/dev/null && pwd)
+
+# Work from the script's directory so the CSV file is written there.
+cd "$( dirname "${BASH_SOURCE[0]}" )"
+
+printf "N,runtime\n" >cutlass-mma-turing.csv
+
+for i in {7..14}; do
+    N=$((2**i))
+
+    # runtime in ns: last comma-separated field of the last `gpu__time_duration`
+    # line of the profiler's CSV output, with double quotes removed.
+    runtime=$(nv-nsight-cu-cli -f --summary per-kernel --csv --units base -k Kernel "${CUTLASS_BUILD_PATH}/tools/profiler/cutlass_profiler" --op_class=tensorop --A=f16:col --B=f16:col --C=f32 --accum=f32 --m="$N" --n="$N" --k="$N" --inst_m=16 --inst_n=8 --inst_k=8 --warmup-iterations=1 --profiling-iterations=10 2>/dev/null | grep 'gpu__time_duration' | tail -1 | awk -F',' '{print $NF}' | sed 's/"//g')
+
+    # Values are passed as arguments so they are never interpreted as a format string.
+    printf '%s,%s\n' "$N" "$runtime" >>cutlass-mma-turing.csv
+done
diff --git a/test/perf/matmul_kernels/wmma/cutlass-mma.csv b/test/perf/matmul_kernels/wmma/cutlass-mma.csv
@@ -0,0 +1,9 @@
+N,runtime
+128,20309.333333
+256,33522.666667
+512,59837.333333
+1024,118997.333333
+2048,827818.666667
+4096,6395536.000000
+8192,49197301.333333
+16384,400406416.000000
diff --git a/test/perf/matmul_kernels/wmma/cutlass-mma.sh b/test/perf/matmul_kernels/wmma/cutlass-mma.sh
@@ -0,0 +1,33 @@
+#!/usr/bin/env bash
+# Profile cutlass_profiler with Nsight Compute for N = 2^7 .. 2^14 (m = n = k),
+# using --op_class=tensorop and instruction shape 8x8x4, and write one "N,runtime"
+# row per size to cutlass-mma.csv next to this script.
+#
+# Usage: cutlass-mma.sh <CUTLASS_BUILD_PATH>
+#   CUTLASS_BUILD_PATH  directory that contains tools/profiler/cutlass_profiler
+set -Eeuo pipefail
+
+# Arithmetic test: inside [[ ]] the `<` operator compares strings, not numbers.
+if (( $# < 1 )); then
+    echo "Usage $0 <CUTLASS_BUILD_PATH>" 1>&2
+    exit 1
+fi
+
+# Make the path absolute now; a relative one would no longer resolve after the `cd`.
+CUTLASS_BUILD_PATH=$(cd -- "$1" >/dev/null && pwd)
+
+# Work from the script's directory so the CSV file is written there.
+cd "$( dirname "${BASH_SOURCE[0]}" )"
+
+printf "N,runtime\n" >cutlass-mma.csv
+
+for i in {7..14}; do
+    N=$((2**i))
+
+    # runtime in ns: last comma-separated field of the last `gpu__time_duration`
+    # line of the profiler's CSV output, with double quotes removed.
+    runtime=$(nv-nsight-cu-cli -f --summary per-kernel --csv --units base -k Kernel "${CUTLASS_BUILD_PATH}/tools/profiler/cutlass_profiler" --op_class=tensorop --A=f16:col --B=f16:col --C=f32 --accum=f32 --m="$N" --n="$N" --k="$N" --inst_m=8 --inst_n=8 --inst_k=4 --warmup-iterations=1 --profiling-iterations=10 2>/dev/null | grep 'gpu__time_duration' | tail -1 | awk -F',' '{print $NF}' | sed 's/"//g')
+
+    # Values are passed as arguments so they are never interpreted as a format string.
+    printf '%s,%s\n' "$N" "$runtime" >>cutlass-mma.csv
+done
diff --git a/test/perf/matmul_kernels/wmma/cutlass-wmma.csv b/test/perf/matmul_kernels/wmma/cutlass-wmma.csv
@@ -0,0 +1,9 @@
+N,runtime
+128,14274.666667
+256,22589.333333
+512,38648.000000
+1024,79410.666667
+2048,560162.666667
+4096,4084114.666667
+8192,31448712.000000
+16384,406712666.666667
diff --git a/test/perf/matmul_kernels/wmma/cutlass-wmma.sh b/test/perf/matmul_kernels/wmma/cutlass-wmma.sh
@@ -0,0 +1,33 @@
+#!/usr/bin/env bash
+# Profile cutlass_profiler with Nsight Compute for N = 2^7 .. 2^14 (m = n = k),
+# using --op_class=wmmatensorop, and write one "N,runtime" row per size to
+# cutlass-wmma.csv next to this script.
+#
+# Usage: cutlass-wmma.sh <CUTLASS_BUILD_PATH>
+#   CUTLASS_BUILD_PATH  directory that contains tools/profiler/cutlass_profiler
+set -Eeuo pipefail
+
+# Arithmetic test: inside [[ ]] the `<` operator compares strings, not numbers.
+if (( $# < 1 )); then
+    echo "Usage $0 <CUTLASS_BUILD_PATH>" 1>&2
+    exit 1
+fi
+
+# Make the path absolute now; a relative one would no longer resolve after the `cd`.
+CUTLASS_BUILD_PATH=$(cd -- "$1" >/dev/null && pwd)
+
+# Work from the script's directory so the CSV file is written there.
+cd "$( dirname "${BASH_SOURCE[0]}" )"
+
+printf "N,runtime\n" >cutlass-wmma.csv
+
+for i in {7..14}; do
+    N=$((2**i))
+
+    # runtime in ns: last comma-separated field of the last `gpu__time_duration`
+    # line of the profiler's CSV output, with double quotes removed.
+    runtime=$(nv-nsight-cu-cli -f --summary per-kernel --csv --units base -k Kernel "${CUTLASS_BUILD_PATH}/tools/profiler/cutlass_profiler" --op_class=wmmatensorop --A=f16:col --B=f16:col --C=f32 --accum=f32 --m="$N" --n="$N" --k="$N" --warmup-iterations=1 --profiling-iterations=10 2>/dev/null | grep 'gpu__time_duration' | tail -1 | awk -F',' '{print $NF}' | sed 's/"//g')
+
+    # Values are passed as arguments so they are never interpreted as a format string.
+    printf '%s,%s\n' "$N" "$runtime" >>cutlass-wmma.csv
+done
diff --git a/test/perf/matmul_kernels/wmma/plot.jl b/test/perf/matmul_kernels/wmma/plot.jl
@@ -0,0 +1,43 @@
+# Plot the throughput of every benchmarked GEMM implementation against the problem
+# size N, reading the CSV files stored next to this script, and save it as plot.pdf.
+
+using CSV
+using DataFrames
+using Plots
+
+pyplot()
+
+# Inputs and output are resolved relative to this script rather than the current
+# working directory, matching the benchmark scripts, which `cd` to their own directory.
+const RESULTS_DIR = @__DIR__
+
+"""
+    plot_results(file, label)
+
+Add one series labelled `label` to the current plot from the CSV file `file`.
+
+`file` must provide the columns `N` and `runtime`; the benchmark scripts record
+`runtime` in nanoseconds. Each run is counted as `2N^3` floating-point operations.
+"""
+function plot_results(file, label)
+    df = DataFrame(CSV.File(file))
+
+    N = df[!, :N]
+    runtime_ps = df[!, :runtime] .* 1e3 # ns -> ps
+
+    # Operations per picosecond are numerically equal to TFLOPS (1e12 FLOP/s).
+    tflops = (2 .* N .^ 3) ./ runtime_ps
+
+    return plot!(N, tflops, label=label, xscale=:log2, markershape=:circle)
+end
+
+plot_results(joinpath(RESULTS_DIR, "cudanative.csv"), "CUDAnative")
+plot_results(joinpath(RESULTS_DIR, "cublas.csv"), "cuBLAS")
+plot_results(joinpath(RESULTS_DIR, "cutlass-wmma.csv"), "CUTLASS (WMMA)")
+plot_results(joinpath(RESULTS_DIR, "cutlass-mma.csv"), "CUTLASS (mma.m8n8k4)")
+plot_results(joinpath(RESULTS_DIR, "cutlass-mma-turing.csv"), "CUTLASS (mma.m16n8k8)")
+
+title!("Performance of mixed-precision GEMM\nProblem size: N x N x N")
+xlabel!("N")
+ylabel!("TFLOPS")
+savefig(joinpath(RESULTS_DIR, "plot.pdf"))
diff --git a/test/perf/matmul_kernels/wmma/plot.pdf b/test/perf/matmul_kernels/wmma/plot.pdf