Skip to content

Commit c5701ee

Browse files
committed
updatecode
1 parent 6cedbe4 commit c5701ee

File tree

11 files changed

+455
-122
lines changed

11 files changed

+455
-122
lines changed

customers/8_cuda_omp_accel/a.out

827 KB
Binary file not shown.

customers/8_cuda_omp_accel/main.cu

Lines changed: 158 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,158 @@
#include <chrono>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <vector>

#include <omp.h>

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
8+
9+
// Abort-on-error wrapper for CUDA runtime calls: prints the failing
// expression plus the numeric cudaError_t code, then aborts.
#define checkCudaErrors(x) do { cudaError_t __ret = (x); if (__ret) { printf("CUDA ERROR %d: " #x "\n", __ret); abort(); } } while (0)

// Element type and fixed image geometry used throughout this benchmark.
#define TYPE double
#define imgW 2448  // image width in pixels
#define imgH 2048  // image height in pixels
#define N (imgW*imgH)  // total pixel count
15+
16+
// Constant-memory copies of the filter parameters.
// NOTE(review): nothing in the visible code references c_para0/c_para1/c_para2 —
// GPU_Cal receives its parameters through global-memory pointer arguments
// instead. Presumably left over from an experiment; confirm before removing.
__constant__ TYPE c_para0[] = {1.5, 1.5, 1.5, 1.5, 1.5, 1.5};
__constant__ TYPE c_para1[] = {1.5, 1.5, 1.5, 1.5, 1.5, 1.5};
__constant__ TYPE c_para2[] = {1246, 1037, 2448, 2048};
19+
20+
#if 1
// 2-D grid-stride kernel. Per pixel (row, col):
//   x = (col - para2[0]) * para2[2]
//   y = (row - para2[1]) * para2[3]
//   output[i] = a / b, with a/b the bilinear-in-(x, y, data) forms below
// (para0 holds the 6 numerator coefficients, para1 the 6 denominator ones;
// para2 = {x-center, y-center, x-scale, y-scale}).
// Fix: the original derived x from `row` and y from `col` — the opposite of
// CPU_Cal and of the 1-D variant below — so GPU and CPU results disagreed
// whenever width != height.
__global__ void GPU_Cal(const TYPE *__restrict__ input, TYPE *__restrict__ output, int width, int height,
                        const TYPE *__restrict__ para0, const TYPE *__restrict__ para1,
                        const TYPE *__restrict__ para2) {
    // 2d grid stride loop: correct for any launch configuration.
    for (int row = blockIdx.y * blockDim.y + threadIdx.y; row < height; row += blockDim.y * gridDim.y) {
        for (int col = blockIdx.x * blockDim.x + threadIdx.x; col < width; col += blockDim.x * gridDim.x) {
            const int i = row * width + col;
            const TYPE data = input[i];
            const TYPE x = (col - para2[0]) * para2[2];
            const TYPE y = (row - para2[1]) * para2[3];

            const TYPE a = para0[0] + para0[2] * x + data * (para0[1] + para0[3] * x) + para0[4] * y + data * para0[5] * y;
            const TYPE b = para1[0] + para1[2] * x + data * (para1[1] + para1[3] * x) + para1[4] * y + data * para1[5] * y;

            output[i] = a / b;
        }
    }
}
#else
// 1-D grid-stride variant of the same computation.
// Fix: col must be `i % width` — the original used `i % height`, which is
// wrong whenever width != height.
__global__ void GPU_Cal(const TYPE *__restrict__ input, TYPE *__restrict__ output, int width, int height,
                        const TYPE *__restrict__ para0, const TYPE *__restrict__ para1,
                        const TYPE *__restrict__ para2) {
    for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < width * height; i += gridDim.x * blockDim.x) {
        const TYPE data = input[i];
        const int row = i / width;
        const int col = i % width;
        const TYPE x = (col - para2[0]) * para2[2];
        const TYPE y = (row - para2[1]) * para2[3];

        const TYPE a = para0[0] + para0[2] * x + data * (para0[1] + para0[3] * x) + para0[4] * y + data * para0[5] * y;
        const TYPE b = para1[0] + para1[2] * x + data * (para1[1] + para1[3] * x) + para1[4] * y + data * para1[5] * y;

        output[i] = a / b;
    }
}
#endif
55+
56+
// CPU reference implementation of the per-pixel rational transform, mirroring
// GPU_Cal: x is derived from the column, y from the row, using
// para2 = {x-center, y-center, x-scale, y-scale}. Rows are distributed across
// threads with OpenMP.
void CPU_Cal(const TYPE *input, TYPE *output, int width, int height, TYPE *para0, TYPE *para1, TYPE *para2) {
#pragma omp parallel for
    for (int row = 0; row < height; ++row) {
        const int base = row * width;
        // y depends only on the row, so compute it once per row.
        const TYPE y = (row - para2[1]) * para2[3];
        for (int col = 0; col < width; ++col) {
            const TYPE data = input[base + col];
            const TYPE x = (col - para2[0]) * para2[2];

            const TYPE num =
                para0[0] + para0[2] * x + data * (para0[1] + para0[3] * x) + para0[4] * y + data * para0[5] * y;
            const TYPE den =
                para1[0] + para1[2] * x + data * (para1[1] + para1[3] * x) + para1[4] * y + data * para1[5] * y;

            output[base + col] = num / den;
        }
    }
}
77+
78+
// Benchmark driver: runs the same per-pixel rational transform on the GPU
// (GPU_Cal) and the CPU (CPU_Cal), reports the best-of-50 wall time for each,
// and verifies that the two results agree within an absolute tolerance.
// Fixes vs. the original: every CUDA API call is now checked, launch errors
// are surfaced via cudaGetLastError(), the unchecked malloc is checked, and
// h_output is freed (it previously leaked).
int main() {
    // Host-side data: an imgW x imgH image, two 6-coefficient parameter sets
    // (padded to 30 entries) and the 4-entry geometry vector
    // {x-center, y-center, x-scale, y-scale}.
    std::vector<TYPE> input(N, 2);
    std::vector<TYPE> output(N, 0);
    std::vector<TYPE> para0(30, 1.5);
    std::vector<TYPE> para1(30, 1.5);
    std::vector<TYPE> para2{1246, 1037, 2448, 2048};
    // Deterministic, non-trivial fill so the verification is meaningful.
    for (int i = 0; i < N; ++i) {
        input[i] = (double)i / N;
        output[i] = (double)i / N + 2;
    }
    for (int i = 0; i < 30; ++i) {
        para0[i] = (double)i / 30;
        para1[i] = (double)i / 30 + 4.0;
    }

    // Device buffers.
    TYPE *d_input = nullptr;
    TYPE *d_output = nullptr;
    TYPE *d_para0 = nullptr;
    TYPE *d_para1 = nullptr;
    TYPE *d_para2 = nullptr;
    checkCudaErrors(cudaMalloc((void **)&d_input, N * sizeof(TYPE)));
    checkCudaErrors(cudaMalloc((void **)&d_output, N * sizeof(TYPE)));
    checkCudaErrors(cudaMalloc((void **)&d_para0, 30 * sizeof(TYPE)));
    checkCudaErrors(cudaMalloc((void **)&d_para1, 30 * sizeof(TYPE)));
    checkCudaErrors(cudaMalloc((void **)&d_para2, 4 * sizeof(TYPE)));
    checkCudaErrors(cudaMemcpy(d_input, input.data(), N * sizeof(TYPE), cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemcpy(d_output, output.data(), N * sizeof(TYPE), cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemcpy(d_para0, para0.data(), 30 * sizeof(TYPE), cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemcpy(d_para1, para1.data(), 30 * sizeof(TYPE), cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemcpy(d_para2, para2.data(), 4 * sizeof(TYPE), cudaMemcpyHostToDevice));

    // GPU timing: best of 50 launches. The kernel uses a grid-stride loop, so
    // this fixed 256x256 grid of 32x32 blocks is correct for any image size.
    dim3 thread_num(32, 32, 1);
    dim3 block_num(256, 256, 1);
    double gpu_time = 10000000;
    checkCudaErrors(cudaDeviceSynchronize());
    for (size_t i = 0; i < 50; i++) {
        auto t0 = std::chrono::steady_clock::now();
        GPU_Cal<<<block_num, thread_num>>>(d_input, d_output, imgW, imgH, d_para0, d_para1, d_para2);
        checkCudaErrors(cudaGetLastError());  // catch launch-configuration errors
        checkCudaErrors(cudaDeviceSynchronize());
        double time =
            std::chrono::duration_cast<std::chrono::duration<double>>(std::chrono::steady_clock::now() - t0).count();
        gpu_time = std::min(gpu_time, time);
    }
    std::cout << "GPU time: " << gpu_time << std::endl;

    // Fetch the GPU result before the CPU loop overwrites `output`.
    TYPE *h_output = (TYPE *)malloc(N * sizeof(TYPE));
    if (!h_output) {
        fprintf(stderr, "host allocation failed\n");
        abort();
    }
    checkCudaErrors(cudaMemcpy(h_output, d_output, N * sizeof(TYPE), cudaMemcpyDeviceToHost));

    // CPU timing: best of 50 runs.
    double cpu_time = 10000000;
    for (size_t i = 0; i < 50; i++) {
        auto t0 = std::chrono::steady_clock::now();
        CPU_Cal(input.data(), output.data(), imgW, imgH, para0.data(), para1.data(), para2.data());
        double time =
            std::chrono::duration_cast<std::chrono::duration<double>>(std::chrono::steady_clock::now() - t0).count();
        cpu_time = std::min(cpu_time, time);
    }
    std::cout << "CPU time: " << cpu_time << std::endl;
    std::cout << "ratio: " << cpu_time / gpu_time << std::endl;

    // Verify GPU vs CPU; a pixel where both sides are NaN counts as a match.
    for (int i = 0; i < N; i++) {
        if (h_output[i] != h_output[i] && output[i] != output[i]) {
            continue;  // both NaN
        }
        if (fabs(h_output[i] - output[i]) > 1e-2) {
            printf("Error! i: %d, cpu: %f, gpu:%f.\n", i, output[i], h_output[i]);
            abort();
        }
    }

    // Release resources (the original leaked h_output).
    free(h_output);
    checkCudaErrors(cudaFree(d_input));
    checkCudaErrors(cudaFree(d_output));
    checkCudaErrors(cudaFree(d_para0));
    checkCudaErrors(cudaFree(d_para1));
    checkCudaErrors(cudaFree(d_para2));
    return 0;
}
File renamed without changes.
File renamed without changes.

customers/issue6_i22filter/main.cpp renamed to customers/issue6_i22convolvefilter/main.cpp

Lines changed: 56 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -34,10 +34,32 @@ s16 tbl_filt_list[3][4][4] =
3434
}
3535
};
3636

37-
static void i22(pel* src, s16* dst, int i_dst, int width, int height, const int td)
37+
// Applies a 4-tap FIR filter: out[i] = (offset + sum_{j<4} in[i+j]*filter[j]) >> shift,
// for i in [0, n). `in` must provide n+3 readable elements.
// Fixes vs. the original: (1) the `offset` parameter was accepted but never
// added before the shift, unlike the loops this helper replaced; (2) the
// accumulator was uint16_t, which both overflows (four int16 products need
// more than 16 bits) and turns the final shift into an unsigned shift of a
// wrapped value — the replaced code summed in promoted `int` and performed a
// signed shift. A 32-bit signed accumulator restores that behavior.
static void convolve(int16_t const *__restrict in, int16_t *__restrict out, int16_t const *__restrict filter, size_t n, uint8_t shift, uint16_t offset) {
    for (size_t i = 0; i < n; i++) {
        int32_t acc = offset;
        for (size_t j = 0; j < 4; j++) {
            acc += (int32_t)in[i + j] * filter[j];
        }
        out[i] = (int16_t)(acc >> shift);
    }
}
49+
50+
// Copies n 16-bit samples from `in` to `out`. The __restrict qualifiers
// promise the ranges do not overlap, so a flat memcpy is equivalent to the
// element-wise loop.
void s16copy(int16_t *__restrict out, int16_t const *__restrict in, size_t n) {
    if (n != 0) {
        memcpy(out, in, n * sizeof(int16_t));
    }
}
58+
59+
static void i22(pel*__restrict src, s16*__restrict dst, int i_dst, int width, int height, const int td)
3860
{
3961
const int is_small = width * height <= ((td - 1) ? 64 : 32);
40-
s16* filter;
62+
s16 *__restrict filter;
4163
s8 offset, shift_r;
4264

4365
// i < td
@@ -46,24 +68,26 @@ static void i22(pel* src, s16* dst, int i_dst, int width, int height, const int
4668
s16 col_0[64], col_1_td2[64];
4769

4870
filter = tbl_filt_list[is_small + 1][1];
49-
for (int j = 0; j < height; j++) {
50-
col_0[j] = (s16)((
51-
src[j - height - 1] * filter[0] +
52-
src[j - height - 1 + 1] * filter[1] +
53-
src[j - height - 1 + 2] * filter[2] +
54-
src[j - height - 1 + 3] * filter[3] +
55-
offset) >> shift_r);
56-
}
71+
convolve(src - height - 1, col_0, filter, height, shift_r, offset);
72+
/* for (int j = 0; j < height; j++) { */
73+
/* col_0[j] = (s16)(( */
74+
/* src[j - height - 1] * filter[0] + */
75+
/* src[j - height - 1 + 1] * filter[1] + */
76+
/* src[j - height - 1 + 2] * filter[2] + */
77+
/* src[j - height - 1 + 3] * filter[3] + */
78+
/* offset) >> shift_r); */
79+
/* } */
5780
if (2 == td) {
5881
filter = tbl_filt_list[is_small + 1][2];
59-
for (int j = 0; j < height; j++) {
60-
col_1_td2[j] = (s16)((
61-
src[j - height - 1] * filter[0] +
62-
src[j - height - 1 + 1] * filter[1] +
63-
src[j - height - 1 + 2] * filter[2] +
64-
src[j - height - 1 + 3] * filter[3] +
65-
offset) >> shift_r);
66-
}
82+
convolve(src - height - 1, col_1_td2, filter, height, shift_r, offset);
83+
/* for (int j = 0; j < height; j++) { */
84+
/* col_1_td2[j] = (s16)(( */
85+
/* src[j - height - 1] * filter[0] + */
86+
/* src[j - height - 1 + 1] * filter[1] + */
87+
/* src[j - height - 1 + 2] * filter[2] + */
88+
/* src[j - height - 1 + 3] * filter[3] + */
89+
/* offset) >> shift_r); */
90+
/* } */
6791
}
6892

6993
// i >= td
@@ -186,21 +210,24 @@ static void i22(pel* src, s16* dst, int i_dst, int width, int height, const int
186210
if (width > 4) {
187211
for (int j = 0; j < height; j++) {
188212
dst[0] = col_0[height - 1 - j];
189-
if (2 == td) {
213+
}
214+
if (2 == td) {
215+
for (int j = 0; j < height; j++) {
190216
dst[1] = col_1_td2[height - 1 - j];
191217
}
218+
}
192219

193-
if ((3 + 4 * j) < width) {
194-
memcpy(dst + td, ref_left + (4 * (height - 1) + rem_rl - 1) - (4 * j + rem_rl - 1), (rem_rl + 4 * j) * sizeof(s16));
195-
memcpy(dst + 3 + 4 * j, ref_above, (width - (3 + 4 * j)) * sizeof(s16));
196-
}
197-
else {
198-
// w - 3
199-
memcpy(dst + td, ref_left + (4 * (height - 1) + rem_rl - 1) - (4 * j + rem_rl - 1), (width - td) * sizeof(s16));
200-
}
201-
202-
dst += i_dst;
220+
auto mid = std::min((width - 3) / 4, height);
221+
auto rlp = ref_left + (4 * (height - 1) + rem_rl - 1) - (rem_rl - 1);
222+
s16copy(dst + td, rlp - 4 * mid, rem_rl + 4 * mid);
223+
for (int j = 0; j < mid; j++) {
224+
s16copy(dst + 3 + 4 * j, ref_above, width - 3 - 4 * j);
203225
}
226+
for (int j = mid; j < height; j++) {
227+
s16copy(dst + td, rlp - 4 * j, width - td);
228+
}
229+
230+
dst += i_dst;
204231
}
205232
else {
206233
for (int j = 0; j < height; j++) {
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
.cache/
2+
build/
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
cmake_minimum_required(VERSION 3.18)
2+
3+
if (NOT CMAKE_BUILD_TYPE)
4+
set(CMAKE_BUILD_TYPE Release)
5+
endif()
6+
set(CMAKE_CXX_STANDARD 20)
7+
8+
project(main LANGUAGES CXX)
9+
10+
add_executable(main main.cpp)
Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
#pragma once
2+
3+
#include <typeinfo>
4+
#include <type_traits>
5+
#include <string>
6+
#if (defined(__GNUC__) || defined(__clang__)) && __has_include(<cxxabi.h>)
7+
#include <cxxabi.h>
8+
#include <cstdlib>
9+
#endif
10+
11+
namespace _cppdemangle_details {

// Demangles a raw (mangled) symbol name via the Itanium ABI demangler when it
// is available; otherwise — or when demangling fails — returns the input
// unchanged.
static std::string cppdemangle(const char *name) {
#if (defined(__GNUC__) || defined(__clang__)) && __has_include(<cxxabi.h>)
    int status;
    char *demangled = abi::__cxa_demangle(name, 0, 0, &status);
    std::string result = demangled ? demangled : name;
    std::free(demangled);
    return result;
#else
    return std::string{name};
#endif
}

// Convenience overload for a std::type_info object.
static std::string cppdemangle(std::type_info const &type) {
    return cppdemangle(type.name());
}

// Returns a human-readable spelling of T, restoring the const/volatile and
// reference qualifiers that typeid() strips.
template <class T>
static std::string cppdemangle() {
    using Referee = std::remove_reference_t<T>;
    std::string result = cppdemangle(typeid(std::remove_cv_t<Referee>));
    if (std::is_const_v<Referee>)
        result += " const";
    if (std::is_volatile_v<Referee>)
        result += " volatile";
    if (std::is_lvalue_reference_v<T>)
        result += " &";
    if (std::is_rvalue_reference_v<T>)
        result += " &&";
    return result;
}

}  // namespace _cppdemangle_details

using _cppdemangle_details::cppdemangle;
46+
47+
// Usage:
48+
//
49+
// cppdemangle<int>()
50+
// => "int"
51+
//
52+
// int i;
53+
// cppdemangle<decltype(i)>()
54+
// => "int"
55+
//
56+
// int i;
57+
// cppdemangle<decltype(std::as_const(i))>()
58+
// => "int const &"

0 commit comments

Comments
 (0)