|
#include <algorithm>
#include <chrono>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <limits>
#include <vector>

#include <omp.h>

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
// Abort on any CUDA runtime error, reporting the failing expression, the
// error code with its human-readable description, and the source location.
// Wrap every CUDA API call (cudaMalloc, cudaMemcpy, cudaDeviceSynchronize,
// cudaGetLastError, ...) in this macro.
#define checkCudaErrors(x)                                                          \
    do {                                                                            \
        cudaError_t __ret = (x);                                                    \
        if (__ret != cudaSuccess) {                                                 \
            fprintf(stderr, "CUDA ERROR %d (%s) at %s:%d: " #x "\n", (int)__ret,    \
                    cudaGetErrorString(__ret), __FILE__, __LINE__);                 \
            abort();                                                                \
        }                                                                           \
    } while (0)
| 10 | + |
// Element type used throughout (double keeps GPU and CPU results comparable).
#define TYPE double
// Image dimensions in pixels and the total element count.
#define imgW 2448
#define imgH 2048
#define N (imgW*imgH)

// Coefficient tables in GPU constant memory.
// NOTE(review): these __constant__ arrays are never referenced anywhere in
// this file — both kernels receive their parameters through global-memory
// pointers instead. Confirm whether they are leftovers from an earlier
// constant-memory variant or intended for future use.
__constant__ TYPE c_para0[] = {1.5, 1.5, 1.5, 1.5, 1.5, 1.5};
__constant__ TYPE c_para1[] = {1.5, 1.5, 1.5, 1.5, 1.5, 1.5};
__constant__ TYPE c_para2[] = {1246, 1037, 2448, 2048};
| 19 | + |
#if 1
// Rational per-pixel correction: output[i] = a(x, y, data) / b(x, y, data).
//
// para0/para1: 6 polynomial coefficients each (numerator / denominator).
// para2:       {cx, cy, sx, sy} — offsets and scales for the coordinate map.
//
// Expects a 2D launch; the 2D grid-stride loop makes any configuration
// (including <<<1, 1>>>) produce correct results for a width x height image.
//
// The coordinate mapping must match CPU_Cal exactly: x is derived from the
// COLUMN index and y from the ROW index.
__global__ void GPU_Cal(const TYPE *__restrict__ input, TYPE *__restrict__ output,
                        int width, int height, const TYPE *__restrict__ para0,
                        const TYPE *__restrict__ para1, const TYPE *__restrict__ para2) {
    // 2D grid-stride loop over all pixels.
    for (int row = blockIdx.y * blockDim.y + threadIdx.y; row < height; row += blockDim.y * gridDim.y) {
        for (int col = blockIdx.x * blockDim.x + threadIdx.x; col < width; col += blockDim.x * gridDim.x) {
            const int i = row * width + col;
            const TYPE data = input[i];
            // BUGFIX: x/y were previously derived from row/col respectively —
            // the opposite of CPU_Cal — so GPU and CPU results disagreed and
            // the host-side verification aborted.
            const TYPE x = (col - para2[0]) * para2[2];
            const TYPE y = (row - para2[1]) * para2[3];

            const TYPE a = para0[0] + para0[2] * x + data * (para0[1] + para0[3] * x) + para0[4] * y + data * para0[5] * y;
            const TYPE b = para1[0] + para1[2] * x + data * (para1[1] + para1[3] * x) + para1[4] * y + data * para1[5] * y;

            output[i] = a / b;
        }
    }
}
#else
// 1D flat-index variant of the same computation (disabled).
__global__ void GPU_Cal(const TYPE *__restrict__ input, TYPE *__restrict__ output,
                        int width, int height, const TYPE *__restrict__ para0,
                        const TYPE *__restrict__ para1, const TYPE *__restrict__ para2) {
    // 1D grid-stride loop over the flattened image.
    for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < width * height; i += gridDim.x * blockDim.x) {
        const TYPE data = input[i];
        const int row = i / width;
        // BUGFIX: column index was computed as `i % height`; rows have
        // `width` elements, so the modulus must be `width`.
        const int col = i % width;
        const TYPE x = (col - para2[0]) * para2[2];
        const TYPE y = (row - para2[1]) * para2[3];

        const TYPE a = para0[0] + para0[2] * x + data * (para0[1] + para0[3] * x) + para0[4] * y + data * para0[5] * y;
        const TYPE b = para1[0] + para1[2] * x + data * (para1[1] + para1[3] * x) + para1[4] * y + data * para1[5] * y;

        output[i] = a / b;
    }
}
#endif
| 55 | + |
// Reference CPU implementation of the rational per-pixel correction:
// output[r][c] = num(x, y, v) / den(x, y, v), where v is the input pixel,
// x = (c - para2[0]) * para2[2] and y = (r - para2[1]) * para2[3].
// para0/para1 hold 6 coefficients each; rows are processed in parallel
// with OpenMP.
void CPU_Cal(const TYPE *input, TYPE *output, int width, int height, TYPE *para0, TYPE *para1, TYPE *para2) {
#pragma omp parallel for
    for (int r = 0; r < height; ++r) {
        const int base = r * width;
        // y depends only on the row, so it is computed once per row.
        const TYPE y = (r - para2[1]) * para2[3];
        for (int c = 0; c < width; ++c) {
            const TYPE v = input[base + c];
            const TYPE x = (c - para2[0]) * para2[2];

            const TYPE num =
                para0[0] + para0[2] * x + v * (para0[1] + para0[3] * x) + para0[4] * y + v * para0[5] * y;
            const TYPE den =
                para1[0] + para1[2] * x + v * (para1[1] + para1[3] * x) + para1[4] * y + v * para1[5] * y;

            output[base + c] = num / den;
        }
    }
}
| 77 | + |
// Benchmarks GPU_Cal against CPU_Cal on an imgW x imgH image (best-of-50
// timing for each), then verifies the two results agree element-wise.
// Returns 0 on success; aborts on any CUDA error or result mismatch.
int main() {
    // Host buffers: image-sized input/output plus coefficient tables.
    std::vector<TYPE> input(N, 2);
    std::vector<TYPE> output(N, 0);
    std::vector<TYPE> para0(30, 1.5);
    std::vector<TYPE> para1(30, 1.5);
    // {cx, cy, sx, sy} used by the coordinate mapping (copied to d_para2).
    std::vector<TYPE> para2{1246, 1037, 2448, 2048};
    // Deterministic pseudo-data so GPU and CPU runs are directly comparable.
    for (int i = 0; i < N; ++i) {
        input[i] = (double)i / N;
        output[i] = (double)i / N + 2;
    }
    for (int i = 0; i < 30; ++i) {
        para0[i] = (double)i / 30;
        para1[i] = (double)i / 30 + 4.0;
    }

    // Device buffers — every CUDA call is checked so a failure is reported
    // immediately instead of surfacing as a mysterious later error.
    TYPE *d_input = nullptr;
    TYPE *d_output = nullptr;
    TYPE *d_para0 = nullptr;
    TYPE *d_para1 = nullptr;
    TYPE *d_para2 = nullptr;
    checkCudaErrors(cudaMalloc((void **)&d_input, N * sizeof(TYPE)));
    checkCudaErrors(cudaMalloc((void **)&d_output, N * sizeof(TYPE)));
    checkCudaErrors(cudaMalloc((void **)&d_para0, 30 * sizeof(TYPE)));
    checkCudaErrors(cudaMalloc((void **)&d_para1, 30 * sizeof(TYPE)));
    checkCudaErrors(cudaMalloc((void **)&d_para2, 4 * sizeof(TYPE)));
    checkCudaErrors(cudaMemcpy(d_input, input.data(), N * sizeof(TYPE), cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemcpy(d_output, output.data(), N * sizeof(TYPE), cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemcpy(d_para0, para0.data(), 30 * sizeof(TYPE), cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemcpy(d_para1, para1.data(), 30 * sizeof(TYPE), cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemcpy(d_para2, para2.data(), 4 * sizeof(TYPE), cudaMemcpyHostToDevice));

    // GPU timing: best of 50 runs (the first iteration absorbs warm-up cost;
    // taking the minimum discards it).
    dim3 thread_num(32, 32, 1);
    dim3 block_num(256, 256, 1);
    double gpu_time = std::numeric_limits<double>::max();
    checkCudaErrors(cudaDeviceSynchronize());
    for (int iter = 0; iter < 50; ++iter) {
        auto t0 = std::chrono::steady_clock::now();
        GPU_Cal<<<block_num, thread_num>>>(d_input, d_output, imgW, imgH, d_para0, d_para1, d_para2);
        checkCudaErrors(cudaGetLastError());  // catch launch-configuration errors
        checkCudaErrors(cudaDeviceSynchronize());
        double elapsed =
            std::chrono::duration_cast<std::chrono::duration<double>>(std::chrono::steady_clock::now() - t0).count();
        gpu_time = std::min(gpu_time, elapsed);
    }
    std::cout << "GPU time: " << gpu_time << std::endl;

    // Copy the GPU result back, then time the CPU reference (best of 50).
    // std::vector replaces the previously leaked malloc'd buffer.
    std::vector<TYPE> h_output(N);
    checkCudaErrors(cudaMemcpy(h_output.data(), d_output, N * sizeof(TYPE), cudaMemcpyDeviceToHost));
    double cpu_time = std::numeric_limits<double>::max();
    for (int iter = 0; iter < 50; ++iter) {
        auto t0 = std::chrono::steady_clock::now();
        CPU_Cal(input.data(), output.data(), imgW, imgH, para0.data(), para1.data(), para2.data());
        double elapsed =
            std::chrono::duration_cast<std::chrono::duration<double>>(std::chrono::steady_clock::now() - t0).count();
        cpu_time = std::min(cpu_time, elapsed);
    }
    std::cout << "CPU time: " << cpu_time << std::endl;
    std::cout << "ratio: " << cpu_time / gpu_time << std::endl;

    // Verify GPU vs CPU with an absolute tolerance; positions where BOTH
    // results are NaN are treated as matching (same policy as before).
    for (int i = 0; i < N; ++i) {
        if (std::isnan(h_output[i]) && std::isnan(output[i])) {
            continue;
        }
        if (std::fabs(h_output[i] - output[i]) > 1e-2) {
            printf("Error! i: %d, cpu: %f, gpu:%f.\n", i, output[i], h_output[i]);
            abort();
        }
    }

    checkCudaErrors(cudaFree(d_input));
    checkCudaErrors(cudaFree(d_output));
    checkCudaErrors(cudaFree(d_para0));
    checkCudaErrors(cudaFree(d_para1));
    checkCudaErrors(cudaFree(d_para2));
    return 0;
}
0 commit comments