|
#include <algorithm>
#include <chrono>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <limits>
#include <vector>

#include <omp.h>

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
// Abort on any CUDA runtime error, reporting the failing expression, the
// error code with its human-readable description, and the source location.
// Wrap every CUDA API call (cudaMalloc, cudaMemcpy, cudaDeviceSynchronize,
// cudaGetLastError, ...) in this macro.
#define checkCudaErrors(x)                                                          \
    do {                                                                            \
        cudaError_t __ret = (x);                                                    \
        if (__ret != cudaSuccess) {                                                 \
            fprintf(stderr, "CUDA ERROR %d (%s) at %s:%d: " #x "\n", (int)__ret,    \
                    cudaGetErrorString(__ret), __FILE__, __LINE__);                 \
            abort();                                                                \
        }                                                                           \
    } while (0)
| 10 | + |
// Element type used throughout (double keeps GPU and CPU results comparable).
#define TYPE double
// Image dimensions in pixels and the total element count.
#define imgW 2448
#define imgH 2048
#define N (imgW*imgH)

// Coefficient tables in GPU constant memory.
// NOTE(review): these __constant__ arrays are never referenced anywhere in
// this file — both kernels receive their parameters through global-memory
// pointers instead. Confirm whether they are leftovers from an earlier
// constant-memory variant or intended for future use.
__constant__ TYPE c_para0[] = {1.5, 1.5, 1.5, 1.5, 1.5, 1.5};
__constant__ TYPE c_para1[] = {1.5, 1.5, 1.5, 1.5, 1.5, 1.5};
__constant__ TYPE c_para2[] = {1246, 1037, 2448, 2048};
| 19 | + |
#if 1
// Rational per-pixel correction: output[i] = a(x, y, data) / b(x, y, data).
//
// para0/para1: 6 polynomial coefficients each (numerator / denominator).
// para2:       {cx, cy, sx, sy} — offsets and scales for the coordinate map.
//
// Expects a 2D launch; the 2D grid-stride loop makes any configuration
// (including <<<1, 1>>>) produce correct results for a width x height image.
//
// The coordinate mapping must match CPU_Cal exactly: x is derived from the
// COLUMN index and y from the ROW index.
__global__ void GPU_Cal(const TYPE *__restrict__ input, TYPE *__restrict__ output,
                        int width, int height, const TYPE *__restrict__ para0,
                        const TYPE *__restrict__ para1, const TYPE *__restrict__ para2) {
    // 2D grid-stride loop over all pixels.
    for (int row = blockIdx.y * blockDim.y + threadIdx.y; row < height; row += blockDim.y * gridDim.y) {
        for (int col = blockIdx.x * blockDim.x + threadIdx.x; col < width; col += blockDim.x * gridDim.x) {
            const int i = row * width + col;
            const TYPE data = input[i];
            // BUGFIX: x/y were previously derived from row/col respectively —
            // the opposite of CPU_Cal — so GPU and CPU results disagreed and
            // the host-side verification aborted.
            const TYPE x = (col - para2[0]) * para2[2];
            const TYPE y = (row - para2[1]) * para2[3];

            const TYPE a = para0[0] + para0[2] * x + data * (para0[1] + para0[3] * x) + para0[4] * y + data * para0[5] * y;
            const TYPE b = para1[0] + para1[2] * x + data * (para1[1] + para1[3] * x) + para1[4] * y + data * para1[5] * y;

            output[i] = a / b;
        }
    }
}
#else
// 1D flat-index variant of the same computation (disabled).
__global__ void GPU_Cal(const TYPE *__restrict__ input, TYPE *__restrict__ output,
                        int width, int height, const TYPE *__restrict__ para0,
                        const TYPE *__restrict__ para1, const TYPE *__restrict__ para2) {
    // 1D grid-stride loop over the flattened image.
    for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < width * height; i += gridDim.x * blockDim.x) {
        const TYPE data = input[i];
        const int row = i / width;
        // BUGFIX: column index was computed as `i % height`; rows have
        // `width` elements, so the modulus must be `width`.
        const int col = i % width;
        const TYPE x = (col - para2[0]) * para2[2];
        const TYPE y = (row - para2[1]) * para2[3];

        const TYPE a = para0[0] + para0[2] * x + data * (para0[1] + para0[3] * x) + para0[4] * y + data * para0[5] * y;
        const TYPE b = para1[0] + para1[2] * x + data * (para1[1] + para1[3] * x) + para1[4] * y + data * para1[5] * y;

        output[i] = a / b;
    }
}
#endif
| 55 | + |
// Reference CPU implementation of the rational per-pixel correction:
// output[r][c] = num(x, y, v) / den(x, y, v), where v is the input pixel,
// x = (c - para2[0]) * para2[2] and y = (r - para2[1]) * para2[3].
// para0/para1 hold 6 coefficients each; rows are processed in parallel
// with OpenMP.
void CPU_Cal(const TYPE *input, TYPE *output, int width, int height, TYPE *para0, TYPE *para1, TYPE *para2) {
#pragma omp parallel for
    for (int r = 0; r < height; ++r) {
        const int base = r * width;
        // y depends only on the row, so it is computed once per row.
        const TYPE y = (r - para2[1]) * para2[3];
        for (int c = 0; c < width; ++c) {
            const TYPE v = input[base + c];
            const TYPE x = (c - para2[0]) * para2[2];

            const TYPE num =
                para0[0] + para0[2] * x + v * (para0[1] + para0[3] * x) + para0[4] * y + v * para0[5] * y;
            const TYPE den =
                para1[0] + para1[2] * x + v * (para1[1] + para1[3] * x) + para1[4] * y + v * para1[5] * y;

            output[base + c] = num / den;
        }
    }
}
| 77 | + |
// Benchmarks GPU_Cal against CPU_Cal on an imgW x imgH image (best-of-50
// timing for each), then verifies the two results agree element-wise.
// Returns 0 on success; aborts on any CUDA error or result mismatch.
int main() {
    // Host buffers: image-sized input/output plus coefficient tables.
    std::vector<TYPE> input(N, 2);
    std::vector<TYPE> output(N, 0);
    std::vector<TYPE> para0(30, 1.5);
    std::vector<TYPE> para1(30, 1.5);
    // {cx, cy, sx, sy} used by the coordinate mapping (copied to d_para2).
    std::vector<TYPE> para2{1246, 1037, 2448, 2048};
    // Deterministic pseudo-data so GPU and CPU runs are directly comparable.
    for (int i = 0; i < N; ++i) {
        input[i] = (double)i / N;
        output[i] = (double)i / N + 2;
    }
    for (int i = 0; i < 30; ++i) {
        para0[i] = (double)i / 30;
        para1[i] = (double)i / 30 + 4.0;
    }

    // Device buffers — every CUDA call is checked so a failure is reported
    // immediately instead of surfacing as a mysterious later error.
    TYPE *d_input = nullptr;
    TYPE *d_output = nullptr;
    TYPE *d_para0 = nullptr;
    TYPE *d_para1 = nullptr;
    TYPE *d_para2 = nullptr;
    checkCudaErrors(cudaMalloc((void **)&d_input, N * sizeof(TYPE)));
    checkCudaErrors(cudaMalloc((void **)&d_output, N * sizeof(TYPE)));
    checkCudaErrors(cudaMalloc((void **)&d_para0, 30 * sizeof(TYPE)));
    checkCudaErrors(cudaMalloc((void **)&d_para1, 30 * sizeof(TYPE)));
    checkCudaErrors(cudaMalloc((void **)&d_para2, 4 * sizeof(TYPE)));
    checkCudaErrors(cudaMemcpy(d_input, input.data(), N * sizeof(TYPE), cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemcpy(d_output, output.data(), N * sizeof(TYPE), cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemcpy(d_para0, para0.data(), 30 * sizeof(TYPE), cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemcpy(d_para1, para1.data(), 30 * sizeof(TYPE), cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemcpy(d_para2, para2.data(), 4 * sizeof(TYPE), cudaMemcpyHostToDevice));

    // GPU timing: best of 50 runs (the first iteration absorbs warm-up cost;
    // taking the minimum discards it).
    dim3 thread_num(32, 32, 1);
    dim3 block_num(256, 256, 1);
    double gpu_time = std::numeric_limits<double>::max();
    checkCudaErrors(cudaDeviceSynchronize());
    for (int iter = 0; iter < 50; ++iter) {
        auto t0 = std::chrono::steady_clock::now();
        GPU_Cal<<<block_num, thread_num>>>(d_input, d_output, imgW, imgH, d_para0, d_para1, d_para2);
        checkCudaErrors(cudaGetLastError());  // catch launch-configuration errors
        checkCudaErrors(cudaDeviceSynchronize());
        double elapsed =
            std::chrono::duration_cast<std::chrono::duration<double>>(std::chrono::steady_clock::now() - t0).count();
        gpu_time = std::min(gpu_time, elapsed);
    }
    std::cout << "GPU time: " << gpu_time << std::endl;

    // Copy the GPU result back, then time the CPU reference (best of 50).
    // std::vector replaces the previously leaked malloc'd buffer.
    std::vector<TYPE> h_output(N);
    checkCudaErrors(cudaMemcpy(h_output.data(), d_output, N * sizeof(TYPE), cudaMemcpyDeviceToHost));
    double cpu_time = std::numeric_limits<double>::max();
    for (int iter = 0; iter < 50; ++iter) {
        auto t0 = std::chrono::steady_clock::now();
        CPU_Cal(input.data(), output.data(), imgW, imgH, para0.data(), para1.data(), para2.data());
        double elapsed =
            std::chrono::duration_cast<std::chrono::duration<double>>(std::chrono::steady_clock::now() - t0).count();
        cpu_time = std::min(cpu_time, elapsed);
    }
    std::cout << "CPU time: " << cpu_time << std::endl;
    std::cout << "ratio: " << cpu_time / gpu_time << std::endl;

    // Verify GPU vs CPU with an absolute tolerance; positions where BOTH
    // results are NaN are treated as matching (same policy as before).
    for (int i = 0; i < N; ++i) {
        if (std::isnan(h_output[i]) && std::isnan(output[i])) {
            continue;
        }
        if (std::fabs(h_output[i] - output[i]) > 1e-2) {
            printf("Error! i: %d, cpu: %f, gpu:%f.\n", i, output[i], h_output[i]);
            abort();
        }
    }

    checkCudaErrors(cudaFree(d_input));
    checkCudaErrors(cudaFree(d_output));
    checkCudaErrors(cudaFree(d_para0));
    checkCudaErrors(cudaFree(d_para1));
    checkCudaErrors(cudaFree(d_para2));
    return 0;
}
0 commit comments