Skip to content

Commit f4fed57

Browse files
committedAug 17, 2023
fixed
1 parent f426a4a commit f4fed57

File tree

5 files changed

+322
-51
lines changed

5 files changed

+322
-51
lines changed
 

‎README.md

+2-2
Original file line numberDiff line numberDiff line change
@@ -118,9 +118,9 @@ cp source/findp.cpp main.cpp # 测试 findp 任务
118118
## 实验需求
119119

120120
- 硬件要求:支持 AVX2 的 x86 CPU,Intel 和 AMD 均可,8 GB 内存。
121-
- 硬件最低要求:64 位的 x86 CPU,Intel 和 AMD 均可,2 GB 内存。
121+
- 硬件最低要求:支持 SSE4.1 的 64 位的 x86 CPU,Intel 和 AMD 均可,2 GB 内存。
122122

123-
> 注:所有 64 位 CPU 均支持 SSE,过老的硬件可能无法运行部分含 AVX 的实验代码
123+
> 注:所有 64 位 x86 CPU 均至少支持到 SSE2;Intel 从 Haswell 架构开始、AMD 从 Excavator 架构开始的 CPU 大多能支持到 AVX2(部分低端奔腾/赛扬型号除外)
124124
125125
Linux 做实验所需包(以 Arch Linux 为例):
126126

‎main.cpp

+32-24
Original file line numberDiff line numberDiff line change
@@ -6,41 +6,49 @@
66
#include <immintrin.h>
77

88
// BEGIN CODE
9-
void rgba2rgb(uint8_t const *in_rgba, uint8_t *out_rgb, size_t n) {
10-
__m256i shuf12 = _mm256_setr_epi8(0,1,2,4,5,6,8,9,10,12,13,14,3,7,11,15,
11-
5,6,8,9,10,12,13,14,3,7,11,15,0,1,2,4);
12-
__m256i shuf34 = _mm256_setr_epi8(10,12,13,14,3,7,11,15,0,1,2,4,5,6,8,9,
13-
3,7,11,15,0,1,2,4,5,6,8,9,10,12,13,14);
14-
__m256i perm12 = _mm256_setr_epi32(0, 1, 2, 7, 4, 5, 3, 6);
15-
__m256i perm34 = _mm256_setr_epi32(0, 5, 6, 7, 1, 4, 2, 3);
9+
void simd_rgba2rgb(uint8_t const *in_rgba, uint8_t *out_rgb, size_t n) {
10+
const __m128i shuf1 = _mm_setr_epi8(0,1,2,4,5,6,8,9,10,12,13,14,3,7,11,15);
11+
const __m128i shuf2 = _mm_setr_epi8(5,6,8,9,10,12,13,14,3,7,11,15,0,1,2,4);
12+
const __m128i shuf3 = _mm_setr_epi8(10,12,13,14,3,7,11,15,0,1,2,4,5,6,8,9);
13+
const __m128i shuf4 = _mm_setr_epi8(3,7,11,15,0,1,2,4,5,6,8,9,10,12,13,14);
1614
auto in_rgba_end = in_rgba + ((n - 16) / 16 * 16) * 4;
1715
auto in_rgba_true_end = in_rgba + n * 4;
1816
while (in_rgba < in_rgba_end) {
19-
// rgbargbargbargba RGBaRgbargbargba rgbargbargBaRGBa rgbargbargbargba
20-
// rgbrgbrgbrgb.... gbrgbrgb RGBR BRGB rgbrgbrg ....rgbrgbrgbrgb
21-
__m256i v12_rgba = _mm256_loadu_si256((__m256i *)in_rgba);
22-
in_rgba += 32;
23-
__m256i v34_rgba = _mm256_loadu_si256((__m256i *)in_rgba);
24-
in_rgba += 32;
25-
__m256i v12_rgb = _mm256_shuffle_epi8(v12_rgba, shuf12);
26-
__m256i v34_rgb = _mm256_shuffle_epi8(v34_rgba, shuf34);
27-
__m256i v12t_rgb = _mm256_permutevar8x32_epi32(v12_rgb, perm12);
28-
__m256i v34t_rgb = _mm256_permutevar8x32_epi32(v34_rgb, perm34);
29-
__m256i v12e_rgb = _mm256_blend_epi32(v12t_rgb, v34t_rgb, 0b11000000);
30-
__m128i v3e_rgb = _mm256_castsi256_si128(v34t_rgb);
31-
_mm256_storeu_si256((__m256i *)out_rgb, v12e_rgb);
32-
out_rgb += 32;
33-
_mm_storeu_si128((__m128i *)out_rgb, v3e_rgb);
34-
out_rgb += 16;
17+
// v1~4_rgba rgbargbargbargba RGBaRgbargbargba rgbargbargBaRGBa rgbargbargbargba
18+
// v1~4_rgb rgbrgbrgbrgbaaaa gbrgbrgbaaaaRGBR BRGBaaaargbrgbrg aaaargbrgbrgbrgb
19+
// v1~4e_rgb rgbrgbrgbrgbRGBR gbrgbrgbrgbrgbrg BRGBrgbrgbrgbrgb
20+
__m128i v1_rgba = _mm_loadu_si128((__m128i *)in_rgba); in_rgba += 16;
21+
__m128i v2_rgba = _mm_loadu_si128((__m128i *)in_rgba); in_rgba += 16;
22+
__m128i v3_rgba = _mm_loadu_si128((__m128i *)in_rgba); in_rgba += 16;
23+
__m128i v4_rgba = _mm_loadu_si128((__m128i *)in_rgba); in_rgba += 16;
24+
// 核心代码开始
25+
__m128i v1_rgb = _mm_shuffle_epi8(v1_rgba, shuf1);
26+
__m128i v2_rgb = _mm_shuffle_epi8(v2_rgba, shuf2);
27+
__m128i v3_rgb = _mm_shuffle_epi8(v3_rgba, shuf3);
28+
__m128i v4_rgb = _mm_shuffle_epi8(v4_rgba, shuf4);
29+
__m128i v1e_rgb = _mm_blend_epi32(v1_rgb, v2_rgb, 0b1000);
30+
__m128i v2e_rgb = _mm_blend_epi32(v2_rgb, v3_rgb, 0b1100);
31+
__m128i v3e_rgb = _mm_blend_epi32(v3_rgb, v4_rgb, 0b1110);
32+
// 核心代码结束
33+
_mm_storeu_si128((__m128i *)out_rgb, v1e_rgb); out_rgb += 16;
34+
_mm_storeu_si128((__m128i *)out_rgb, v2e_rgb); out_rgb += 16;
35+
_mm_storeu_si128((__m128i *)out_rgb, v3e_rgb); out_rgb += 16;
3536
}
36-
3737
while (in_rgba != in_rgba_true_end) {
3838
*out_rgb++ = *in_rgba++;
3939
*out_rgb++ = *in_rgba++;
4040
*out_rgb++ = *in_rgba++;
4141
in_rgba++;
4242
}
4343
}
44+
45+
void rgba2rgb(uint8_t const *in_rgba, uint8_t *out_rgb, size_t n) {
46+
const size_t chunk = 65536;
47+
#pragma omp parallel for
48+
for (size_t i = 0; i < n; i += chunk) {
49+
simd_rgba2rgb(in_rgba + i * 4, out_rgb + i * 3, std::min(chunk, n - i));
50+
}
51+
}
4452
// END CODE
4553

4654
static void bench(benchmark::State &s) {

‎practices/dispatch.md

+196
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,196 @@
1+
编译期静态分发,根据是否指定了 -mavx2 参数:
2+
3+
```cpp
4+
#ifdef __AVX2__
5+
#include <immintrin.h>
6+
#endif
7+
8+
void rgba2rgb(uint8_t const *in_rgba, uint8_t *out_rgb, size_t n) {
9+
#ifdef __AVX2__
10+
const __m128i shuf1 = _mm_setr_epi8(0,1,2,4,5,6,8,9,10,12,13,14,3,7,11,15);
11+
const __m128i shuf2 = _mm_setr_epi8(5,6,8,9,10,12,13,14,3,7,11,15,0,1,2,4);
12+
const __m128i shuf3 = _mm_setr_epi8(10,12,13,14,3,7,11,15,0,1,2,4,5,6,8,9);
13+
const __m128i shuf4 = _mm_setr_epi8(3,7,11,15,0,1,2,4,5,6,8,9,10,12,13,14);
14+
auto in_rgba_end = in_rgba + ((n - 16) / 16 * 16) * 4;
15+
auto in_rgba_true_end = in_rgba + n * 4;
16+
while (in_rgba < in_rgba_end) {
17+
__m128i v1_rgba = _mm_loadu_si128((__m128i *)in_rgba); in_rgba += 16;
18+
__m128i v2_rgba = _mm_loadu_si128((__m128i *)in_rgba); in_rgba += 16;
19+
__m128i v3_rgba = _mm_loadu_si128((__m128i *)in_rgba); in_rgba += 16;
20+
__m128i v4_rgba = _mm_loadu_si128((__m128i *)in_rgba); in_rgba += 16;
21+
__m128i v1_rgb = _mm_shuffle_epi8(v1_rgba, shuf1);
22+
__m128i v2_rgb = _mm_shuffle_epi8(v2_rgba, shuf2);
23+
__m128i v3_rgb = _mm_shuffle_epi8(v3_rgba, shuf3);
24+
__m128i v4_rgb = _mm_shuffle_epi8(v4_rgba, shuf4);
25+
__m128i v1e_rgb = _mm_blend_epi32(v1_rgb, v2_rgb, 0b1000);
26+
__m128i v2e_rgb = _mm_blend_epi32(v2_rgb, v3_rgb, 0b1100);
27+
__m128i v3e_rgb = _mm_blend_epi32(v3_rgb, v4_rgb, 0b1110);
28+
_mm_storeu_si128((__m128i *)out_rgb, v1e_rgb); out_rgb += 16;
29+
_mm_storeu_si128((__m128i *)out_rgb, v2e_rgb); out_rgb += 16;
30+
_mm_storeu_si128((__m128i *)out_rgb, v3e_rgb); out_rgb += 16;
31+
}
32+
while (in_rgba != in_rgba_true_end) {
33+
*out_rgb++ = *in_rgba++;
34+
*out_rgb++ = *in_rgba++;
35+
*out_rgb++ = *in_rgba++;
36+
in_rgba++;
37+
}
38+
#else
39+
for (size_t i = 0; i < n; i++) {
40+
out_rgb[i * 3 + 0] = in_rgba[i * 4 + 0];
41+
out_rgb[i * 3 + 1] = in_rgba[i * 4 + 1];
42+
out_rgb[i * 3 + 2] = in_rgba[i * 4 + 2];
43+
}
44+
#endif
45+
}
46+
```
47+
48+
运行时动态分发,根据运行时检测到的 cpuid 自动决定调用哪个版本:
49+
50+
```cpp
51+
#ifdef __x86_64__
52+
#include <immintrin.h>
53+
#endif
54+
55+
__attribute__((__target__("avx2"))) void rgba2rgb(uint8_t const *in_rgba, uint8_t *out_rgb, size_t n) {
56+
const __m128i shuf1 = _mm_setr_epi8(0,1,2,4,5,6,8,9,10,12,13,14,3,7,11,15);
57+
const __m128i shuf2 = _mm_setr_epi8(5,6,8,9,10,12,13,14,3,7,11,15,0,1,2,4);
58+
const __m128i shuf3 = _mm_setr_epi8(10,12,13,14,3,7,11,15,0,1,2,4,5,6,8,9);
59+
const __m128i shuf4 = _mm_setr_epi8(3,7,11,15,0,1,2,4,5,6,8,9,10,12,13,14);
60+
auto in_rgba_end = in_rgba + ((n - 16) / 16 * 16) * 4;
61+
auto in_rgba_true_end = in_rgba + n * 4;
62+
while (in_rgba < in_rgba_end) {
63+
__m128i v1_rgba = _mm_loadu_si128((__m128i *)in_rgba); in_rgba += 16;
64+
__m128i v2_rgba = _mm_loadu_si128((__m128i *)in_rgba); in_rgba += 16;
65+
__m128i v3_rgba = _mm_loadu_si128((__m128i *)in_rgba); in_rgba += 16;
66+
__m128i v4_rgba = _mm_loadu_si128((__m128i *)in_rgba); in_rgba += 16;
67+
__m128i v1_rgb = _mm_shuffle_epi8(v1_rgba, shuf1);
68+
__m128i v2_rgb = _mm_shuffle_epi8(v2_rgba, shuf2);
69+
__m128i v3_rgb = _mm_shuffle_epi8(v3_rgba, shuf3);
70+
__m128i v4_rgb = _mm_shuffle_epi8(v4_rgba, shuf4);
71+
__m128i v1e_rgb = _mm_blend_epi32(v1_rgb, v2_rgb, 0b1000);
72+
__m128i v2e_rgb = _mm_blend_epi32(v2_rgb, v3_rgb, 0b1100);
73+
__m128i v3e_rgb = _mm_blend_epi32(v3_rgb, v4_rgb, 0b1110);
74+
_mm_storeu_si128((__m128i *)out_rgb, v1e_rgb); out_rgb += 16;
75+
_mm_storeu_si128((__m128i *)out_rgb, v2e_rgb); out_rgb += 16;
76+
_mm_storeu_si128((__m128i *)out_rgb, v3e_rgb); out_rgb += 16;
77+
}
78+
while (in_rgba != in_rgba_true_end) {
79+
*out_rgb++ = *in_rgba++;
80+
*out_rgb++ = *in_rgba++;
81+
*out_rgb++ = *in_rgba++;
82+
in_rgba++;
83+
}
84+
}
85+
86+
__attribute__((__target__("sse4.1"))) void rgba2rgb(uint8_t const *in_rgba, uint8_t *out_rgb, size_t n) {
87+
const __m128i shuf1 = _mm_setr_epi8(0,1,2,4,5,6,8,9,10,12,13,14,3,7,11,15);
88+
const __m128i shuf2 = _mm_setr_epi8(5,6,8,9,10,12,13,14,3,7,11,15,0,1,2,4);
89+
const __m128i shuf3 = _mm_setr_epi8(10,12,13,14,3,7,11,15,0,1,2,4,5,6,8,9);
90+
const __m128i shuf4 = _mm_setr_epi8(3,7,11,15,0,1,2,4,5,6,8,9,10,12,13,14);
91+
auto in_rgba_end = in_rgba + ((n - 16) / 16 * 16) * 4;
92+
auto in_rgba_true_end = in_rgba + n * 4;
93+
while (in_rgba < in_rgba_end) {
94+
__m128i v1_rgba = _mm_loadu_si128((__m128i *)in_rgba); in_rgba += 16;
95+
__m128i v2_rgba = _mm_loadu_si128((__m128i *)in_rgba); in_rgba += 16;
96+
__m128i v3_rgba = _mm_loadu_si128((__m128i *)in_rgba); in_rgba += 16;
97+
__m128i v4_rgba = _mm_loadu_si128((__m128i *)in_rgba); in_rgba += 16;
98+
__m128i v1_rgb = _mm_shuffle_epi8(v1_rgba, shuf1);
99+
__m128i v2_rgb = _mm_shuffle_epi8(v2_rgba, shuf2);
100+
__m128i v3_rgb = _mm_shuffle_epi8(v3_rgba, shuf3);
101+
__m128i v4_rgb = _mm_shuffle_epi8(v4_rgba, shuf4);
102+
__m128i v1e_rgb = _mm_blend_epi16(v1_rgb, v2_rgb, 0b11000000);
103+
__m128i v2e_rgb = _mm_blend_epi16(v2_rgb, v3_rgb, 0b11110000);
104+
__m128i v3e_rgb = _mm_blend_epi16(v3_rgb, v4_rgb, 0b11111100);
105+
_mm_storeu_si128((__m128i *)out_rgb, v1e_rgb); out_rgb += 16;
106+
_mm_storeu_si128((__m128i *)out_rgb, v2e_rgb); out_rgb += 16;
107+
_mm_storeu_si128((__m128i *)out_rgb, v3e_rgb); out_rgb += 16;
108+
}
109+
while (in_rgba != in_rgba_true_end) {
110+
*out_rgb++ = *in_rgba++;
111+
*out_rgb++ = *in_rgba++;
112+
*out_rgb++ = *in_rgba++;
113+
in_rgba++;
114+
}
115+
}
116+
117+
__attribute__((__target__("default"))) void rgba2rgb(uint8_t const *in_rgba, uint8_t *out_rgb, size_t n) {
118+
for (size_t i = 0; i < n; i++) {
119+
out_rgb[i * 3 + 0] = in_rgba[i * 4 + 0];
120+
out_rgb[i * 3 + 1] = in_rgba[i * 4 + 1];
121+
out_rgb[i * 3 + 2] = in_rgba[i * 4 + 2];
122+
}
123+
}
124+
```
125+
126+
运行时动态分发,但只需写一份普通代码,由编译器为列出的每个目标指令集各自生成一个针对性优化的版本,并在运行时根据检测到的 CPU 自动选择:
127+
128+
```cpp
129+
/**
 * Copies the R, G and B bytes of n RGBA pixels from in_rgba into the packed
 * RGB buffer out_rgb, dropping every fourth (alpha) byte.
 *
 * The body is plain scalar code; target_clones asks the compiler to emit one
 * clone per listed target plus a resolver that picks a clone at run time.
 * The "default" entry is mandatory: it is the clone used on CPUs that support
 * none of the other listed targets, and both GCC and Clang reject a
 * target_clones list that omits it.
 *
 * @param in_rgba  input buffer, read as 4 bytes per pixel
 * @param out_rgb  output buffer, written as 3 bytes per pixel
 * @param n        number of pixels to convert
 */
__attribute__((target_clones("default,sse4.1,avx")))
void rgba2rgb(uint8_t const *in_rgba, uint8_t *out_rgb, size_t n) {
    for (size_t i = 0; i < n; i++) {
        out_rgb[i * 3 + 0] = in_rgba[i * 4 + 0];
        out_rgb[i * 3 + 1] = in_rgba[i * 4 + 1];
        out_rgb[i * 3 + 2] = in_rgba[i * 4 + 2];
    }
}
136+
```
137+
138+
用户自定义的运行时分发规则,手动使用 `__builtin_cpu_supports` 检测:
139+
140+
```cpp
141+
__attribute__((ifunc("rgba2rgb_dispatch"))) void rgba2rgb(uint8_t const *in_rgba, uint8_t *out_rgb, size_t n);
142+
143+
void rgba2rgb_avx2(uint8_t const *in_rgba, uint8_t *out_rgb, size_t n);
144+
void rgba2rgb_default(uint8_t const *in_rgba, uint8_t *out_rgb, size_t n);
145+
146+
typedef void rgba2rgb_t(uint8_t const *in_rgba, uint8_t *out_rgb, size_t n);
147+
148+
/**
 * ifunc resolver for rgba2rgb: returns the implementation that calls to
 * rgba2rgb are bound to.
 *
 * @return &rgba2rgb_avx2 when the running CPU reports AVX2 support,
 *         otherwise &rgba2rgb_default.
 */
extern "C" rgba2rgb_t *rgba2rgb_dispatch() {
    // An ifunc resolver runs while the dynamic loader is resolving symbols,
    // i.e. before any constructors. GCC requires an explicit
    // __builtin_cpu_init() before __builtin_cpu_supports() in such code.
    __builtin_cpu_init();
    if (__builtin_cpu_supports("avx2")) {
        return &rgba2rgb_avx2;
    }
    return &rgba2rgb_default;
}
155+
156+
/**
 * AVX2 implementation of rgba2rgb: copies the R, G and B bytes of n RGBA
 * pixels from in_rgba into the packed RGB buffer out_rgb, dropping alpha.
 *
 * The target("avx2") attribute enables the required instruction sets for this
 * function only, so the file can be built without -mavx2 and the fallback
 * implementation stays runnable on CPUs without AVX2. Callers must only reach
 * this function after checking that the CPU supports AVX2.
 *
 * @param in_rgba  input buffer, read as 4 bytes per pixel
 * @param out_rgb  output buffer, written as 3 bytes per pixel
 * @param n        number of pixels to convert (any value, including 0)
 */
__attribute__((__target__("avx2")))
void rgba2rgb_avx2(uint8_t const *in_rgba, uint8_t *out_rgb, size_t n) {
    // Each mask packs the 12 RGB bytes of a 4-pixel vector together and parks
    // the 4 alpha bytes in one 32-bit lane; the masks differ in where the RGB
    // bytes are rotated to, so neighbouring vectors can be merged with a
    // 32-bit blend.
    const __m128i shuf1 = _mm_setr_epi8(0,1,2,4,5,6,8,9,10,12,13,14,3,7,11,15);
    const __m128i shuf2 = _mm_setr_epi8(5,6,8,9,10,12,13,14,3,7,11,15,0,1,2,4);
    const __m128i shuf3 = _mm_setr_epi8(10,12,13,14,3,7,11,15,0,1,2,4,5,6,8,9);
    const __m128i shuf4 = _mm_setr_epi8(3,7,11,15,0,1,2,4,5,6,8,9,10,12,13,14);

    // One iteration reads exactly 64 bytes (16 pixels) and writes exactly
    // 48 bytes, so every complete group of 16 pixels can take the SIMD path.
    // Computing the bound as n / 16 * 16 avoids the unsigned wrap-around of
    // (n - 16) when n < 16.
    const size_t simd_pixels = n / 16 * 16;
    uint8_t const *const in_simd_end = in_rgba + simd_pixels * 4;
    uint8_t const *const in_true_end = in_rgba + n * 4;

    while (in_rgba < in_simd_end) {
        // v1~4_rgba  rgbargbargbargba rgbargbargbargba rgbargbargbargba rgbargbargbargba
        // v1~4_rgb   rgbrgbrgbrgbaaaa gbrgbrgbaaaargbr brgbaaaargbrgbrg aaaargbrgbrgbrgb
        // v1~3e_rgb  rgbrgbrgbrgbrgbr gbrgbrgbrgbrgbrg brgbrgbrgbrgbrgb
        __m128i v1_rgba = _mm_loadu_si128(reinterpret_cast<__m128i const *>(in_rgba)); in_rgba += 16;
        __m128i v2_rgba = _mm_loadu_si128(reinterpret_cast<__m128i const *>(in_rgba)); in_rgba += 16;
        __m128i v3_rgba = _mm_loadu_si128(reinterpret_cast<__m128i const *>(in_rgba)); in_rgba += 16;
        __m128i v4_rgba = _mm_loadu_si128(reinterpret_cast<__m128i const *>(in_rgba)); in_rgba += 16;

        __m128i v1_rgb = _mm_shuffle_epi8(v1_rgba, shuf1);
        __m128i v2_rgb = _mm_shuffle_epi8(v2_rgba, shuf2);
        __m128i v3_rgb = _mm_shuffle_epi8(v3_rgba, shuf3);
        __m128i v4_rgb = _mm_shuffle_epi8(v4_rgba, shuf4);

        // Overwrite the lane(s) holding alpha bytes (or already-stored RGB
        // bytes) with RGB bytes taken from the following vector.
        __m128i v1e_rgb = _mm_blend_epi32(v1_rgb, v2_rgb, 0b1000);
        __m128i v2e_rgb = _mm_blend_epi32(v2_rgb, v3_rgb, 0b1100);
        __m128i v3e_rgb = _mm_blend_epi32(v3_rgb, v4_rgb, 0b1110);

        _mm_storeu_si128(reinterpret_cast<__m128i *>(out_rgb), v1e_rgb); out_rgb += 16;
        _mm_storeu_si128(reinterpret_cast<__m128i *>(out_rgb), v2e_rgb); out_rgb += 16;
        _mm_storeu_si128(reinterpret_cast<__m128i *>(out_rgb), v3e_rgb); out_rgb += 16;
    }

    // Scalar tail for the remaining n % 16 pixels.
    while (in_rgba != in_true_end) {
        *out_rgb++ = *in_rgba++;
        *out_rgb++ = *in_rgba++;
        *out_rgb++ = *in_rgba++;
        in_rgba++;  // skip alpha
    }
}
186+
187+
void rgba2rgb_default(uint8_t const *in_rgba, uint8_t *out_rgb, size_t n) {
188+
for (size_t i = 0; i < n; i++) {
189+
out_rgb[i * 3 + 0] = in_rgba[i * 4 + 0];
190+
out_rgb[i * 3 + 1] = in_rgba[i * 4 + 1];
191+
out_rgb[i * 3 + 2] = in_rgba[i * 4 + 2];
192+
}
193+
}
194+
```
195+
196+
注意:MSVC 不支持 `target`、`target_clones`、`ifunc` 这些属性,上述由编译器/链接器完成的动态分发是 GCC 和 Clang 才有的特性;在 MSVC 上只能编译期分发,或者自己用 `__cpuid` 检测 CPU 后通过函数指针手动分发。

‎practices/u8rgba2rgb.md

+67-3
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,20 @@
1-
# u8rgba2rgb 最佳实践
1+
# u8rgba2rgb 优化最佳实践
2+
3+
## 原版
4+
5+
测试数据:1920x1080 的 u8rgba 图像
6+
7+
测试结果:584902 ns 1.13 cpi 23.16 GB/s
8+
9+
```cpp
10+
void rgba2rgb(uint8_t const *in_rgba, uint8_t *out_rgb, size_t n) {
11+
for (size_t i = 0; i < n; i++) {
12+
out_rgb[i * 3 + 0] = in_rgba[i * 4 + 0];
13+
out_rgb[i * 3 + 1] = in_rgba[i * 4 + 1];
14+
out_rgb[i * 3 + 2] = in_rgba[i * 4 + 2];
15+
}
16+
}
17+
```
218

319
## SSE4.1 版
420

@@ -10,8 +26,6 @@
1026

1127
实现思路:使用 shuffle 进行压缩,由于 4x4 到 4x3 空出来了 4 格空间,用 blend 从下一组 4x3 中提取前 4 格过来。
1228

13-
测试数据:1920x1080 的 u8rgba 图像
14-
1529
测试结果:408279 ns 0.79 cpi 33.18 GB/s
1630

1731
```cpp
@@ -77,3 +91,53 @@ for (...) {
7791
_mm_storeu_si128((__m128i *)out_rgb, v3e_rgb); out_rgb += 16;
7892
}
7993
```
94+
95+
## AVX2 + 并行
96+
97+
测试结果:228981 ns 0.44 cpi 59.88 GB/s
98+
99+
最终完整代码:
100+
101+
```cpp
102+
void simd_rgba2rgb(uint8_t const *in_rgba, uint8_t *out_rgb, size_t n) {
103+
__m256i shuf12 = _mm256_setr_epi8(0,1,2,4,5,6,8,9,10,12,13,14,3,7,11,15,
104+
5,6,8,9,10,12,13,14,3,7,11,15,0,1,2,4);
105+
__m256i shuf34 = _mm256_setr_epi8(10,12,13,14,3,7,11,15,0,1,2,4,5,6,8,9,
106+
3,7,11,15,0,1,2,4,5,6,8,9,10,12,13,14);
107+
__m256i perm12 = _mm256_setr_epi32(0, 1, 2, 7, 4, 5, 3, 6);
108+
__m256i perm34 = _mm256_setr_epi32(0, 5, 6, 7, 1, 4, 2, 3);
109+
auto in_rgba_end = in_rgba + ((n - 16) / 16 * 16) * 4;
110+
auto in_rgba_true_end = in_rgba + n * 4;
111+
while (in_rgba < in_rgba_end) {
112+
__m256i v12_rgba = _mm256_loadu_si256((__m256i *)in_rgba);
113+
in_rgba += 32;
114+
__m256i v34_rgba = _mm256_loadu_si256((__m256i *)in_rgba);
115+
in_rgba += 32;
116+
__m256i v12_rgb = _mm256_shuffle_epi8(v12_rgba, shuf12);
117+
__m256i v34_rgb = _mm256_shuffle_epi8(v34_rgba, shuf34);
118+
__m256i v12t_rgb = _mm256_permutevar8x32_epi32(v12_rgb, perm12);
119+
__m256i v34t_rgb = _mm256_permutevar8x32_epi32(v34_rgb, perm34);
120+
__m256i v12e_rgb = _mm256_blend_epi32(v12t_rgb, v34t_rgb, 0b11000000);
121+
__m128i v3e_rgb = _mm256_castsi256_si128(v34t_rgb);
122+
_mm256_storeu_si256((__m256i *)out_rgb, v12e_rgb);
123+
out_rgb += 32;
124+
_mm_storeu_si128((__m128i *)out_rgb, v3e_rgb);
125+
out_rgb += 16;
126+
}
127+
128+
while (in_rgba != in_rgba_true_end) {
129+
*out_rgb++ = *in_rgba++;
130+
*out_rgb++ = *in_rgba++;
131+
*out_rgb++ = *in_rgba++;
132+
in_rgba++;
133+
}
134+
}
135+
136+
void rgba2rgb(uint8_t const *in_rgba, uint8_t *out_rgb, size_t n) {
137+
const size_t chunk = 65536;
138+
#pragma omp parallel for
139+
for (size_t i = 0; i < n; i += chunk) {
140+
simd_rgba2rgb(in_rgba + i * 4, out_rgb + i * 3, std::min(chunk, n - i));
141+
}
142+
}
143+
```

‎source/rgba2rgb.cpp

+25-22
Original file line numberDiff line numberDiff line change
@@ -6,46 +6,49 @@
66
#include <immintrin.h>
77

88
// BEGIN CODE
9-
void rgba2rgb(uint8_t const *in_rgba, uint8_t *out_rgb, size_t n) {
10-
__m128i shuf1 = _mm_setr_epi8(0,1,2, 4,5,6, 8,9,10, 12,13,14, 3,7,11,15);
11-
__m128i shuf2 = _mm_setr_epi8(5,6, 8,9,10, 12,13,14, 3,7,11,15, 0,1,2,4);
12-
__m128i shuf3 = _mm_setr_epi8(10,12,13,14, 3,7,11,15, 0,1,2, 4,5,6, 8,9);
13-
__m128i shuf4 = _mm_setr_epi8(3,7,11,15, 0,1,2, 4,5,6, 8,9,10, 12,13,14);
9+
void simd_rgba2rgb(uint8_t const *in_rgba, uint8_t *out_rgb, size_t n) {
10+
const __m128i shuf1 = _mm_setr_epi8(0,1,2,4,5,6,8,9,10,12,13,14,3,7,11,15);
11+
const __m128i shuf2 = _mm_setr_epi8(5,6,8,9,10,12,13,14,3,7,11,15,0,1,2,4);
12+
const __m128i shuf3 = _mm_setr_epi8(10,12,13,14,3,7,11,15,0,1,2,4,5,6,8,9);
13+
const __m128i shuf4 = _mm_setr_epi8(3,7,11,15,0,1,2,4,5,6,8,9,10,12,13,14);
1414
auto in_rgba_end = in_rgba + ((n - 16) / 16 * 16) * 4;
1515
auto in_rgba_true_end = in_rgba + n * 4;
1616
while (in_rgba < in_rgba_end) {
17-
// rgbargbargbargba RGBaRgbargbargba rgbargbargBaRGBa rgbargbargbargba
18-
// rgbrgbrgbrgb.... gbrgbrgb RGBR BRGB rgbrgbrg ....rgbrgbrgbrgb
19-
__m128i v1_rgba = _mm_loadu_si128((__m128i *)in_rgba);
20-
in_rgba += 16;
21-
__m128i v2_rgba = _mm_loadu_si128((__m128i *)in_rgba);
22-
in_rgba += 16;
23-
__m128i v3_rgba = _mm_loadu_si128((__m128i *)in_rgba);
24-
in_rgba += 16;
25-
__m128i v4_rgba = _mm_loadu_si128((__m128i *)in_rgba);
26-
in_rgba += 16;
17+
// v1~4_rgba rgbargbargbargba RGBaRgbargbargba rgbargbargBaRGBa rgbargbargbargba
18+
// v1~4_rgb rgbrgbrgbrgbaaaa gbrgbrgbaaaaRGBR BRGBaaaargbrgbrg aaaargbrgbrgbrgb
19+
// v1~4e_rgb rgbrgbrgbrgbRGBR gbrgbrgbrgbrgbrg BRGBrgbrgbrgbrgb
20+
__m128i v1_rgba = _mm_loadu_si128((__m128i *)in_rgba); in_rgba += 16;
21+
__m128i v2_rgba = _mm_loadu_si128((__m128i *)in_rgba); in_rgba += 16;
22+
__m128i v3_rgba = _mm_loadu_si128((__m128i *)in_rgba); in_rgba += 16;
23+
__m128i v4_rgba = _mm_loadu_si128((__m128i *)in_rgba); in_rgba += 16;
24+
// 核心代码开始
2725
__m128i v1_rgb = _mm_shuffle_epi8(v1_rgba, shuf1);
2826
__m128i v2_rgb = _mm_shuffle_epi8(v2_rgba, shuf2);
2927
__m128i v3_rgb = _mm_shuffle_epi8(v3_rgba, shuf3);
3028
__m128i v4_rgb = _mm_shuffle_epi8(v4_rgba, shuf4);
3129
__m128i v1e_rgb = _mm_blend_epi32(v1_rgb, v2_rgb, 0b1000);
3230
__m128i v2e_rgb = _mm_blend_epi32(v2_rgb, v3_rgb, 0b1100);
3331
__m128i v3e_rgb = _mm_blend_epi32(v3_rgb, v4_rgb, 0b1110);
34-
_mm_storeu_si128((__m128i *)out_rgb, v1e_rgb);
35-
out_rgb += 16;
36-
_mm_storeu_si128((__m128i *)out_rgb, v2e_rgb);
37-
out_rgb += 16;
38-
_mm_storeu_si128((__m128i *)out_rgb, v3e_rgb);
39-
out_rgb += 16;
32+
// 核心代码结束
33+
_mm_storeu_si128((__m128i *)out_rgb, v1e_rgb); out_rgb += 16;
34+
_mm_storeu_si128((__m128i *)out_rgb, v2e_rgb); out_rgb += 16;
35+
_mm_storeu_si128((__m128i *)out_rgb, v3e_rgb); out_rgb += 16;
4036
}
41-
4237
while (in_rgba != in_rgba_true_end) {
4338
*out_rgb++ = *in_rgba++;
4439
*out_rgb++ = *in_rgba++;
4540
*out_rgb++ = *in_rgba++;
4641
in_rgba++;
4742
}
4843
}
44+
45+
void rgba2rgb(uint8_t const *in_rgba, uint8_t *out_rgb, size_t n) {
46+
const size_t chunk = 65536;
47+
#pragma omp parallel for
48+
for (size_t i = 0; i < n; i += chunk) {
49+
simd_rgba2rgb(in_rgba + i * 4, out_rgb + i * 3, std::min(chunk, n - i));
50+
}
51+
}
4952
// END CODE
5053

5154
static void bench(benchmark::State &s) {

0 commit comments

Comments
 (0)
Please sign in to comment.