
Commit 1ed7fca — Completed Python program
Parent: 1c0b872
10 files changed: +283 −0 lines

.gitignore

+2
.idea
**/__pycache__

README.md

+44
# MatrixMultBenchmark

This project benchmarks the matrix multiplication algorithm across different hardware and language combinations.
For hardware there are three options: single-core, multi-core, and GPU.

Adjust the matrix size so each run takes ~10s.

For CPU testing, take a screenshot of HWiNFO when the benchmark is done.
For GPU testing, start logging when you choose the device to benchmark; stop logging when the benchmark is done.
Make sure there are at least 3 rows with 100% core utilization.

# Results
`Power` measures the clock speed and power consumption under certain workloads.
These are measured using the HWiNFO program.

Bandwidth is measured in Gbps. It serves as a metric of the data transfer rate: if the recorded bandwidth reaches the design limit, it bottlenecks performance and the run should be disqualified.

For CPU:
* Power refers to the "Core+SoC power".
* Single-core frequency refers to the frequency of the core that is under stress.

GPU benchmarks are more involved because data has to be copied to and from main memory.
I've decided to record the average and the max of several metrics.
Max shows the performance in a single event. For example, my dGPU clocks at 1.8-1.9GHz when it's doing actual computation.
Average shows the performance with all factors considered. For example, the dGPU clocks at just 300MHz when receiving data from main memory, and the average clock from the CPU sending data to the CPU receiving results is ~1.5GHz. This gives an estimate of the relative performance over the entire test.
* Main memory bandwidth DOES NOT mean video RAM bandwidth, but rather the rate at which the CPU pulls from/pushes to main memory.
* PCIe bandwidth: in this benchmark I only count uni-directional bandwidth. The equation is `Link speed * Encoding(128/130) * Lanes`.
* VRAM bandwidth: the equation is `Clock * Bus width * Pump rate`.
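As a quick sanity check, the two equations can be evaluated with the Gen3 x16 link and 128-bit GDDR6 figures from `result/legion-r7000-2020.md`. The 8 GT/s link speed is the PCIe Gen3 rate, and a pump rate of 8 is the value that reproduces the VRAM figure in the result file (both are my assumptions, not stated in this README):

```python
# PCIe: Link speed * Encoding(128/130) * Lanes
link_speed = 8.0   # GT/s per lane for PCIe Gen3 (assumption)
lanes = 16
pcie_gbps = link_speed * 128 / 130 * lanes
print(f'PCIe: {pcie_gbps:.0f} Gbps')   # ~126 Gbps, matching the result file

# VRAM: Clock * Bus width * Pump rate
clock = 1.5        # GHz, GDDR6 max clock from the result file
bus_width = 128    # bits
pump_rate = 8      # transfers per clock (assumption that reproduces the recorded figure)
vram_gbps = clock * bus_width * pump_rate
print(f'VRAM: {vram_gbps:.0f} Gbps')   # 1536 Gbps
```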
For dGPU:
* Power is the sum of the CPU "Core+SoC power" and the "GPU power". Avg only.
* PCIe transfer rate: much lower when idle. Avg & Max.
* VRAM bandwidth: much lower when idle. Avg & Max.

For iGPU:
* Power refers to the "CPU Package power". [See this](https://www.hwinfo.com/forum/threads/how-to-read-apu-power-consumption-properly.8206/)
* Bandwidth: the iGPU doesn't use PCIe (I think); it uses main memory instead, and the memory clock is constant.

# Tools
The `tool` folder contains tools for processing benchmark results.
* calc.xlsx: a spreadsheet that computes the average of several runs. The sample data is from my Python iGPU benchmark. Ignore the colored columns if you're just using it.
* bandwidth-calc.py: calculates PCIe/VRAM bandwidth.
* efficiency-calc.py: calculates computations per second and computations per joule.
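The efficiency calculation can be reproduced by hand. This sketch uses the single-core Python figures recorded in `result/legion-r7000-2020.md` (1200x100 matrices, 10.458s, 9.703W); note the script divides by 1000, so the reported values are in thousands:

```python
# Same arithmetic as efficiency-calc.py, applied to the single-core run.
n, m = 1200, 100                    # the two numbers given to the Matrices class
computation = n ** 2 * m            # 144,000,000 multiply-accumulates
t, power = 10.458, 9.703            # seconds, watts

comp_per_s = int(computation / t / 1000)            # reported in thousands
comp_per_j = int(computation / (power * t) / 1000)  # reported in thousands
print(comp_per_s, comp_per_j)       # 13769 1419, matching the result file
```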

gpu/opencl.cl

+11
__kernel void multiply(int n, int m, int p,
                       __global int *a, __global int *b, __global int *c)
{
    int t = get_global_id(0);   // one work-item per element of c (n*p total)
    int row_a = t / p;          // row in c (n == p in this benchmark, so t/n is equivalent)
    int col_b = t % p;          // column in c

    int sum = 0;                // accumulate locally so c need not be zero-initialized
    for (int i = 0; i < m; i++) {
        sum += a[row_a*m + i] * b[i*p + col_b];
    }
    c[t] = sum;
}
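The kernel's flattened indexing can be sanity-checked on the host. Below is a minimal NumPy port of the same per-element loop (the function name `multiply_host` is mine); it uses `t // p` for the row and `t % p` for the column, which in this benchmark coincides with dividing by `n` since n == p:

```python
import numpy as np

def multiply_host(a, b):
    """Host-side port of the kernel's per-work-item loop (row-major, flattened c)."""
    n, m = a.shape
    _, p = b.shape
    c = np.zeros(n * p, dtype=np.int32)
    for t in range(n * p):              # t plays the role of get_global_id(0)
        row_a, col_b = t // p, t % p
        for i in range(m):
            c[t] += a[row_a, i] * b[i, col_b]
    return c.reshape(n, p)

a = np.arange(6, dtype=np.int32).reshape(2, 3)
b = np.arange(6, dtype=np.int32).reshape(3, 2)
assert np.array_equal(multiply_host(a, b), a @ b)  # agrees with NumPy's matmul
```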

lang/py3/benchmark.py

+114
from multiprocessing import Pool
import pyopencl as ocl
import pyopencl.array
import numpy as np
import time
from random import randint

with open('../../gpu/opencl.cl') as f:
    cl_prg = f.read()


class Matrix:

    def __init__(self):
        self.data = [[]]
        self.rows = 0
        self.cols = 0

    def print(self):
        for row in self.data:
            print(' '.join(str(c) for c in row) + ';\n')
        print(f'{self.rows}x{self.cols}')

    @staticmethod
    def __gen__(col):
        return [randint(0, 1000) for _ in range(col)]

    def resize(self, row, col):
        if self.rows != row or self.cols != col:
            with Pool() as pool:
                self.data = pool.map(self.__gen__, [col] * row)
            self.rows = row
            self.cols = col


class Matrices:

    def __init__(self):
        self.a = Matrix()
        self.b = Matrix()

    def resize(self, n, m):  # renamed from `np` to avoid shadowing the numpy import
        self.a.resize(n, m)
        self.b.resize(m, n)


class Stopwatch:

    def __init__(self):
        self.msgs = []
        self.start = time.perf_counter()

    def lap(self, msg: str):
        t = time.perf_counter()
        duration = t - self.start
        self.start = t
        self.msgs.append(f'{msg}: {duration:.4f}s')

    def print(self):
        print('\n'.join(self.msgs))


def __row__(row, b):
    cl = []
    for column in range(len(b[0])):
        s = 0
        for i, other in enumerate(b):
            s += row[i] * other[column]
        cl.append(s)
    return cl


def single(m: Matrices) -> list[list[int]]:
    """Multiplies using a single core"""
    start = time.perf_counter()
    n = []
    for row in m.a.data:
        n.append(__row__(row, m.b.data))
    duration = time.perf_counter() - start
    print(f'Single-core: {duration:.4f}s')
    return n


def multiple(m: Matrices) -> list[list[int]]:
    """Multiplies using multiple cores"""
    start = time.perf_counter()
    with Pool() as p:
        result = p.starmap(__row__, ((row, m.b.data) for row in m.a.data))
    duration = time.perf_counter() - start
    print(f'Multi-core: {duration:.4f}s')
    return result


def opencl(m: Matrices, dev: ocl.Device) -> list[list[int]]:
    """Multiplies using OpenCL"""
    ctx = ocl.Context(devices=(dev,))
    sw = Stopwatch()
    with ocl.CommandQueue(ctx) as q:
        # int32 matches the kernel's `int *` arguments
        a = ocl.array.to_device(q, np.array(m.a.data, dtype=np.int32))
        b = ocl.array.to_device(q, np.array(m.b.data, dtype=np.int32))
        # zero-filled so the kernel can safely accumulate into it
        s = ocl.array.zeros(q, m.a.rows * m.b.cols, np.int32)
        sw.lap('->GPU')

        prg = ocl.Program(ctx, cl_prg).build()
        prg.multiply(q, s.shape, None,
                     np.int32(m.a.rows), np.int32(m.a.cols), np.int32(m.b.cols),
                     a.data, b.data, s.data)
        s = s.reshape(m.a.rows, m.b.cols)
        q.finish()
        sw.lap('GPU compute')

        result = s.map_to_host().tolist()
        sw.lap('->CPU')
    sw.print()
    return result

lang/py3/main.py

+38
import os

import pyopencl as ocl

import benchmark

os.environ['PYOPENCL_NO_CACHE'] = '1'

if __name__ == '__main__':
    m = benchmark.Matrices()
    while True:
        hw_choice = int(input(
            '1. Single-core\n'
            '2. Multi-core\n'
            '3. GPU (OpenCL)\n'
        ))
        if hw_choice == 1:
            m.resize(1200, 100)
            input('Start')
            benchmark.single(m)
        elif hw_choice == 2:
            m.resize(1900, 190)
            input('Start')
            benchmark.multiple(m)
        else:
            m.resize(8250, 1500)
            devices = []
            print('The following devices in your system support OpenCL:')
            for platform in ocl.get_platforms():
                print(platform.name + ' | ' + platform.version)
                for device in platform.get_devices():
                    print(f'{len(devices)}. {device.name} | {device.version} | {device.max_compute_units} CU')
                    devices.append(device)
            device_choice = int(input('Enter device number to benchmark: '))
            device = devices[device_choice]

            benchmark.opencl(m, device)

lang/py3/requirements.txt

+1
pyopencl
numpy

result/legion-r7000-2020.md

+45
# System
* Name: Lenovo Legion R7000 2020
* Class: Laptop
* CPU: AMD Ryzen 5 4600H (TDP 45W)
* RAM: Dual-channel DDR4-3200 CL22
* GPU0: Integrated AMD Vega 'gfx902'
  * 6 CU, 384 shader units, 512MB 128bit DDR4
* GPU1: Nvidia GeForce GTX 1650 Ti (TDP 50W)
  * 16 CU, 1024 shader units, 4GB 128bit GDDR6
  * Max PCIe bandwidth: Gen3 x16 -> 126Gbps

# Note
iGPU
* Memory capacity: with most of my background tasks closed, it still consumes ~300MB. When running the tests, it reaches the 512MB limit.
* The results fluctuate a lot for some reason: GPU processing time varies between 5s and 9s, so I ran 20 benchmarks for the iGPU. Other devices produce consistent results, and I only ran 3 times per device. See the yellow columns in `calc.xlsx` for details.

# Power
* CPU
  * Single core: 4.0GHz, 9.703W
    * Main memory bandwidth: RAvg 1.198 RMax 3.661 WAvg 0.368 WMax 0.903
  * All cores: ~3.9GHz, 49.567W
    * Main memory bandwidth: RAvg 5.517 RMax 20.396 WAvg 2.180 WMax 3.255
* GPU0
  * Core: ~1.104GHz, max 1.5GHz. 15.233W
  * Main memory bandwidth: RAvg 36.104 RMax 47.743 WAvg 0.413 WMax 2.285
  * 'VRAM' bandwidth: 1.6GHz -> 409.6Gbps
* GPU1
  * Core: ~1.601GHz, max 1.9GHz. CPU 5.278W + GPU 33.389W = Total 38.667W
  * Main memory bandwidth: RAvg 2.177 RMax 11.036 WAvg 0.606 WMax 2.758
  * PCIe bandwidth: Avg 7.228GHz -> 114Gbps, Max 8GHz -> 126Gbps
  * VRAM bandwidth: Avg 1.304GHz -> 1335Gbps, Max 1.5GHz -> 1536Gbps

# Results (Windows 10 21H2)
## Python 3
* Single-core: 10.458s, 1200x100 | 100x1200
  * 13769 comp/s, 1419 comp/j
* All cores: 10.016s, 1900x190 | 190x1900
  * 68480 comp/s, 1381 comp/j
* GPU0: ->GPU 1.666s, Compute 7.821s, ->CPU 1.264s, Total 10.752s, 8250x1500 | 1500x8250
  * 9495326 comp/s, 623339 comp/j
* GPU1: ->GPU 2.293s, Compute 5.542s, ->CPU 1.916s, Total 9.751s, 9000x2100 | 2100x9000
  * 17444364 comp/s, 451143 comp/j

tool/bandwidth-calc.py

+14
choice = int(input(
    '1. PCIe\n'
    '2. VRAM\n'
))

if choice == 1:
    link_speed = float(input('Enter link speed in GHz: '))
    lane_count = int(input('Enter number of lanes: '))
    # 128/130 is the encoding scheme of PCIe Gen3+; result is in Gbps
    print(link_speed * 128 / 130 * lane_count)
elif choice == 2:
    clock_speed = float(input('Enter clock speed in GHz: '))
    bus_width = int(input('Enter bus width in bits: '))
    pump_rate = int(input('Enter pump rate: '))
    # result is in Gbps
    print(clock_speed * bus_width * pump_rate)

tool/calc.xlsx

11.7 KB
Binary file not shown.

tool/efficiency-calc.py

+14
n = int(input('Enter the first number of the Matrices class: '))
m = int(input('Enter the second number of the Matrices class: '))

computation = n ** 2 * m
print(str(computation) + ' computations')

t = float(input('Enter time: '))
power = float(input('Enter power: '))

# divided by 1000: reported values are in thousands of computations
comp_per_time = int(computation / t / 1000)
print(str(comp_per_time) + ' computations per second')

comp_per_j = int(computation / (power * t) / 1000)
print(str(comp_per_j) + ' computations per joule')
