Skip to content

Commit 6cedbe4

Browse files
committed
add vtunefile
1 parent 52911d2 commit 6cedbe4

File tree

8 files changed

+290
-0
lines changed

8 files changed

+290
-0
lines changed

.gitignore

+1
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,3 @@
11
.cache/
22
build/
3+
build-vtune/

customers/issue6_i22filter/.gitignore

+2
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
.cache/
2+
build/
+14
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
# Build script for the benchmark: one executable, `main`, built from main.cpp.
cmake_minimum_required(VERSION 3.18)
2+
3+
# Fall back to a Release build when no build type was requested.
if (NOT CMAKE_BUILD_TYPE)
4+
set(CMAKE_BUILD_TYPE Release)
5+
endif()
6+
set(CMAKE_CXX_STANDARD 20)
7+
8+
project(main LANGUAGES CXX)
9+
10+
add_executable(main main.cpp)
11+
12+
find_package(OpenMP REQUIRED)
13+
target_link_libraries(main OpenMP::OpenMP_CXX)
14+
target_compile_options(main PUBLIC $<$<COMPILE_LANG_AND_ID:CXX,GNU>:-mavx2$<SEMICOLON>-mfma> $<$<COMPILE_LANG_AND_ID:CXX,MSVC>:/arch:AVX2>) # Delete this line if your machine does not support AVX2

customers/issue6_i22filter/main.cpp

+255
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,255 @@
1+
#include <cstring>
2+
#include <immintrin.h>
3+
#include <chrono>
4+
#include <iostream>
5+
// Minimal wall-clock timing helpers.
//
// TICK(x) declares a local `bench_x` holding the current steady_clock time.
// TOCK(x) prints "x: <seconds>\n" to std::cerr, where <seconds> is the time
// elapsed since the matching TICK(x) as a double.
//
// TOCK expands to a single statement so that the whole print (including the
// trailing newline) is controlled by an enclosing unbraced `if`/`for`.
// Both macros keep their trailing semicolon, so existing uses with or without
// a semicolon after the invocation continue to compile.
#define TICK(x) auto bench_##x = std::chrono::steady_clock::now();
#define TOCK(x) std::cerr << #x ": " << std::chrono::duration_cast<std::chrono::duration<double>>(std::chrono::steady_clock::now() - bench_##x).count() << "\n";
7+
8+
typedef signed short s16;
typedef signed char s8;
typedef s16 pel;
typedef const s8 tab_s8;

// Rounding offset added to a filtered sum before it is shifted right.
// Entry k pairs with filter_bits_list[k]; each offset is half of (1 << bits).
tab_s8 filter_offset_list[4] = { 32, 32, 64, 32 };
// Right-shift amount applied to a filtered sum.
tab_s8 filter_bits_list[4] = { 6, 6, 7, 6 };
// 4-tap filter coefficients, indexed as [set][phase][tap].
s16 tbl_filt_list[3][4][4] =
{
    {
        { 10, 100, 11, -3 },
        { 1, 65, 15, -4 },
        { -4, 21, 44, -5 },
        { -9, 65, 24, -3 }
    },
    {
        { -5, 21, 55, -6 },
        { -2, 11, 23, -3 },
        { -4, 9, 64, -6 },
        { 2, 3, 42, -6 }
    },
    {
        { -7, 5, 55, -6 },
        { -3, 11, 3, -3 },
        { -7, 5, 64, -6 },
        { 2, 3, 73, -6 }
    }
};

// Applies one 4-tap filter to p[0..3]: (sum of p[i] * taps[i] + offset) >> shift,
// evaluated in int and narrowed to s16.
static inline s16 i22_filter4(const pel* p, const s16* taps, int offset, int shift)
{
    return static_cast<s16>((
        p[0] * taps[0] +
        p[1] * taps[1] +
        p[2] * taps[2] +
        p[3] * taps[3] +
        offset) >> shift);
}

// Fills a width x height block at dst (row stride i_dst, in elements) from
// filtered reference samples around src.
//
// src     reference samples; the function reads src[-height - 1] .. src[width - 2],
//         so the caller must provide valid storage on both sides of src.
// dst     output block; rows are written top to bottom.
// i_dst   distance between consecutive output rows, in s16 elements.
// width   block width,  supported range 4..64.
// height  block height, supported range 1..64.
// td      number of leading columns taken from the per-row column filters (1 or 2).
//
// The scratch buffers below are sized for those ranges. Arguments outside them
// used to overrun the buffers (or store an uninitialised value for width < 4);
// such calls now return without touching dst.
static void i22(pel* src, s16* dst, int i_dst, int width, int height, const int td)
{
    if (td < 1 || td > 2 || width < 4 || width > 64 || height < 1 || height > 64) {
        return;
    }

    const int is_small = width * height <= ((td - 1) ? 64 : 32);
    const int rem_rl = 3 - td; // left-reference samples produced from the first source position

    // Columns i < td: one value per row, using filter set is_small + 1.
    int offset = filter_offset_list[is_small + 1];
    int shift_r = filter_bits_list[is_small + 1];
    s16 col_0[64];
    s16 col_1_td2[64];

    const s16* taps = tbl_filt_list[is_small + 1][1];
    for (int j = 0; j < height; ++j) {
        col_0[j] = i22_filter4(src + (j - height - 1), taps, offset, shift_r);
    }
    if (2 == td) {
        taps = tbl_filt_list[is_small + 1][2];
        for (int j = 0; j < height; ++j) {
            col_1_td2[j] = i22_filter4(src + (j - height - 1), taps, offset, shift_r);
        }
    }

    // Columns i >= td use filter set is_small.
    offset = filter_offset_list[is_small];
    shift_r = filter_bits_list[is_small];
    s16 (*const inner)[4] = tbl_filt_list[is_small];

    // Left reference: rem_rl leading samples, then `stride` samples per source
    // position. The phase lists are the filter indices the original code
    // selected for each of those samples.
    static const int wide_phases[4] = { 0, 1, 2, 3 };
    static const int narrow_td1_phases[3] = { 0, 2, 3 };
    static const int narrow_td2_phases[2] = { 0, 3 };
    const bool wide = width > 4;
    const int stride = wide ? 4 : rem_rl + 1;
    const int* const phases =
        wide ? wide_phases : ((1 == td) ? narrow_td1_phases : narrow_td2_phases);

    s16 ref_left[254];
    for (int k = 0; k < rem_rl; ++k) {
        ref_left[k] = i22_filter4(src + (-height - 1), inner[k + 1 + td], offset, shift_r);
    }
    for (int j = 0; j < height - 1; ++j) {
        const pel* const in = src + (j - height);
        s16* const out = ref_left + rem_rl + stride * j;
        for (int t = 0; t < stride; ++t) {
            out[t] = i22_filter4(in, inner[phases[t]], offset, shift_r);
        }
    }

    // Above reference: the taps run over decreasing source positions.
    taps = inner[0];
    s16 ref_above[61];
    for (int i = 0; i < width - 3; ++i) {
        ref_above[i] = static_cast<s16>((
            src[i + 1] * taps[0] +
            src[i] * taps[1] +
            src[i - 1] * taps[2] +
            src[i - 2] * taps[3] +
            offset) >> shift_r);
    }

    // Store: row j takes its column values from the end of col_* and its left
    // reference from `stride * (height - 1 - j)` onward.
    for (int j = 0; j < height; ++j, dst += i_dst) {
        dst[0] = col_0[height - 1 - j];
        if (2 == td) {
            dst[1] = col_1_td2[height - 1 - j];
        }

        const s16* const left = ref_left + stride * (height - 1 - j);
        if (wide) {
            const int left_len = rem_rl + 4 * j;
            if (td + left_len < width) {
                // Left reference ends inside the row; the rest comes from ref_above.
                std::memcpy(dst + td, left, left_len * sizeof(s16));
                std::memcpy(dst + td + left_len, ref_above, (width - td - left_len) * sizeof(s16));
            }
            else {
                // The left reference alone covers the remainder of the row.
                std::memcpy(dst + td, left, (width - td) * sizeof(s16));
            }
        }
        else if (0 == j) {
            std::memcpy(dst + td, left, rem_rl * sizeof(s16));
            dst[3] = ref_above[0];
        }
        else {
            std::memcpy(dst + td, left, (rem_rl + 1) * sizeof(s16));
        }
    }
}
224+
225+
int main(void) {
226+
pel src[3][64 * 64];
227+
s16 d[64 * 64];
228+
229+
int w, h, l = 64, td = 2;
230+
int whl_size[7] = { 4, 8, 12, 16, 24, 32, 64 };
231+
232+
for (int k = 0; k < 64 * 64; k++)
233+
{
234+
int i_pixel = k % 1024;
235+
src[0][k] = i_pixel;
236+
src[1][k] = i_pixel;
237+
src[2][k] = i_pixel;
238+
}
239+
240+
TICK(i22);
241+
for (int times = 0; times < 100000; times++) {
242+
for (int j = 0; j < 7; j++) {
243+
for (int f = 0; f < 7; f++) {
244+
w = whl_size[j], h = whl_size[f];
245+
i22(src[1], d, l, w, h, td);
246+
}
247+
}
248+
#ifdef __GNUC__
249+
asm volatile ("" ::: "cc", "memory");
250+
#endif
251+
}
252+
TOCK(i22);
253+
254+
return 0;
255+
}

vtune/Dockerfile

+13
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
# Development image based on the Intel oneAPI base kit, with extra system
# libraries, X11 utilities and basic build tools installed on top.
FROM intel/oneapi-basekit

# The package index is refreshed and used in the same layer, so a cached
# `update` layer can never be paired with newer `install` steps.
# DEBIAN_FRONTEND is set for the installs as well, so no package can stop the
# build with an interactive prompt.
RUN apt-get update \
 && DEBIAN_FRONTEND=noninteractive apt-get install -y \
        libnss3-dev \
        libatk1.0-dev \
        libatk-bridge2.0-dev \
        libcups2-dev \
        libdrm-dev \
        libgtk-3-dev \
        libasound2-dev \
        x11-apps x11-xserver-utils \
        vim cmake make

RUN mkdir -p /root/workspace
WORKDIR /root/workspace

vtune/attach.sh

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
# Open an interactive bash shell inside the most recently created container.
container_id="$(docker ps -lq)"
if [ -z "$container_id" ]; then
    echo "attach.sh: no container found" >&2
    exit 1
fi
docker exec -it "$container_id" bash

vtune/build.sh

+2
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
# Build the archibate/oneapi image from the Dockerfile in this script's directory.
# Paths are quoted so the script also works from a directory containing spaces;
# stop if the directory cannot be entered rather than building from elsewhere.
cd "$(realpath "$(dirname "$0")")" || exit 1
docker build -t archibate/oneapi .

vtune/run.sh

+2
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
# Start an interactive archibate/oneapi container that can open windows on the
# host X server, with the repository root mounted at /root/workspace.

# Allow local (non-network) clients only. A lone "+" argument would switch X
# access control off for every host.
xhost +local:

# Host checkout mounted at /root/zeno3; override with ZENO3_DIR.
zeno3_dir="${ZENO3_DIR:-/home/bate/Codes/zeno3}"
repo_root="$(realpath "$(dirname "$0")")/.."

docker run --cap-add CAP_SYS_ADMIN \
    -e DISPLAY -e XAUTHORITY=/root/.Xauthority \
    -v /tmp/.X11-unix:/tmp/.X11-unix:rw \
    -v "$HOME/.Xauthority:/root/.Xauthority:rw" \
    -v "$repo_root:/root/workspace" \
    -v /tmp:/tmp/host \
    -v "$zeno3_dir:/root/zeno3" \
    -it --rm archibate/oneapi

0 commit comments

Comments
 (0)