|
| 1 | +编译期静态分发,根据是否指定了 -mavx2 参数: |
| 2 | + |
| 3 | +```cpp |
| 4 | +#ifdef __AVX2__ |
| 5 | +#include <immintrin.h> |
| 6 | +#endif |
| 7 | + |
| 8 | +void rgba2rgb(uint8_t const *in_rgba, uint8_t *out_rgb, size_t n) { |
| 9 | +#ifdef __AVX2__ |
| 10 | + const __m128i shuf1 = _mm_setr_epi8(0,1,2,4,5,6,8,9,10,12,13,14,3,7,11,15); |
| 11 | + const __m128i shuf2 = _mm_setr_epi8(5,6,8,9,10,12,13,14,3,7,11,15,0,1,2,4); |
| 12 | + const __m128i shuf3 = _mm_setr_epi8(10,12,13,14,3,7,11,15,0,1,2,4,5,6,8,9); |
| 13 | + const __m128i shuf4 = _mm_setr_epi8(3,7,11,15,0,1,2,4,5,6,8,9,10,12,13,14); |
| 14 | + auto in_rgba_end = in_rgba + ((n - 16) / 16 * 16) * 4; |
| 15 | + auto in_rgba_true_end = in_rgba + n * 4; |
| 16 | + while (in_rgba < in_rgba_end) { |
| 17 | + __m128i v1_rgba = _mm_loadu_si128((__m128i *)in_rgba); in_rgba += 16; |
| 18 | + __m128i v2_rgba = _mm_loadu_si128((__m128i *)in_rgba); in_rgba += 16; |
| 19 | + __m128i v3_rgba = _mm_loadu_si128((__m128i *)in_rgba); in_rgba += 16; |
| 20 | + __m128i v4_rgba = _mm_loadu_si128((__m128i *)in_rgba); in_rgba += 16; |
| 21 | + __m128i v1_rgb = _mm_shuffle_epi8(v1_rgba, shuf1); |
| 22 | + __m128i v2_rgb = _mm_shuffle_epi8(v2_rgba, shuf2); |
| 23 | + __m128i v3_rgb = _mm_shuffle_epi8(v3_rgba, shuf3); |
| 24 | + __m128i v4_rgb = _mm_shuffle_epi8(v4_rgba, shuf4); |
| 25 | + __m128i v1e_rgb = _mm_blend_epi32(v1_rgb, v2_rgb, 0b1000); |
| 26 | + __m128i v2e_rgb = _mm_blend_epi32(v2_rgb, v3_rgb, 0b1100); |
| 27 | + __m128i v3e_rgb = _mm_blend_epi32(v3_rgb, v4_rgb, 0b1110); |
| 28 | + _mm_storeu_si128((__m128i *)out_rgb, v1e_rgb); out_rgb += 16; |
| 29 | + _mm_storeu_si128((__m128i *)out_rgb, v2e_rgb); out_rgb += 16; |
| 30 | + _mm_storeu_si128((__m128i *)out_rgb, v3e_rgb); out_rgb += 16; |
| 31 | + } |
| 32 | + while (in_rgba != in_rgba_true_end) { |
| 33 | + *out_rgb++ = *in_rgba++; |
| 34 | + *out_rgb++ = *in_rgba++; |
| 35 | + *out_rgb++ = *in_rgba++; |
| 36 | + in_rgba++; |
| 37 | + } |
| 38 | +#else |
| 39 | + for (size_t i = 0; i < n; i++) { |
| 40 | + out_rgb[i * 3 + 0] = in_rgba[i * 4 + 0]; |
| 41 | + out_rgb[i * 3 + 1] = in_rgba[i * 4 + 1]; |
| 42 | + out_rgb[i * 3 + 2] = in_rgba[i * 4 + 2]; |
| 43 | + } |
| 44 | +#endif |
| 45 | +} |
| 46 | +``` |
| 47 | +
|
| 48 | +运行时动态分发,根据运行时检测到的 cpuid 自动决定调用哪个版本: |
| 49 | +
|
| 50 | +```cpp |
| 51 | +#ifdef __x86_64__ |
| 52 | +#include <immintrin.h> |
| 53 | +#endif |
| 54 | +
|
| 55 | +__attribute__((__target__("avx2"))) void rgba2rgb(uint8_t const *in_rgba, uint8_t *out_rgb, size_t n) { |
| 56 | + const __m128i shuf1 = _mm_setr_epi8(0,1,2,4,5,6,8,9,10,12,13,14,3,7,11,15); |
| 57 | + const __m128i shuf2 = _mm_setr_epi8(5,6,8,9,10,12,13,14,3,7,11,15,0,1,2,4); |
| 58 | + const __m128i shuf3 = _mm_setr_epi8(10,12,13,14,3,7,11,15,0,1,2,4,5,6,8,9); |
| 59 | + const __m128i shuf4 = _mm_setr_epi8(3,7,11,15,0,1,2,4,5,6,8,9,10,12,13,14); |
| 60 | + auto in_rgba_end = in_rgba + ((n - 16) / 16 * 16) * 4; |
| 61 | + auto in_rgba_true_end = in_rgba + n * 4; |
| 62 | + while (in_rgba < in_rgba_end) { |
| 63 | + __m128i v1_rgba = _mm_loadu_si128((__m128i *)in_rgba); in_rgba += 16; |
| 64 | + __m128i v2_rgba = _mm_loadu_si128((__m128i *)in_rgba); in_rgba += 16; |
| 65 | + __m128i v3_rgba = _mm_loadu_si128((__m128i *)in_rgba); in_rgba += 16; |
| 66 | + __m128i v4_rgba = _mm_loadu_si128((__m128i *)in_rgba); in_rgba += 16; |
| 67 | + __m128i v1_rgb = _mm_shuffle_epi8(v1_rgba, shuf1); |
| 68 | + __m128i v2_rgb = _mm_shuffle_epi8(v2_rgba, shuf2); |
| 69 | + __m128i v3_rgb = _mm_shuffle_epi8(v3_rgba, shuf3); |
| 70 | + __m128i v4_rgb = _mm_shuffle_epi8(v4_rgba, shuf4); |
| 71 | + __m128i v1e_rgb = _mm_blend_epi32(v1_rgb, v2_rgb, 0b1000); |
| 72 | + __m128i v2e_rgb = _mm_blend_epi32(v2_rgb, v3_rgb, 0b1100); |
| 73 | + __m128i v3e_rgb = _mm_blend_epi32(v3_rgb, v4_rgb, 0b1110); |
| 74 | + _mm_storeu_si128((__m128i *)out_rgb, v1e_rgb); out_rgb += 16; |
| 75 | + _mm_storeu_si128((__m128i *)out_rgb, v2e_rgb); out_rgb += 16; |
| 76 | + _mm_storeu_si128((__m128i *)out_rgb, v3e_rgb); out_rgb += 16; |
| 77 | + } |
| 78 | + while (in_rgba != in_rgba_true_end) { |
| 79 | + *out_rgb++ = *in_rgba++; |
| 80 | + *out_rgb++ = *in_rgba++; |
| 81 | + *out_rgb++ = *in_rgba++; |
| 82 | + in_rgba++; |
| 83 | + } |
| 84 | +} |
| 85 | +
|
| 86 | +__attribute__((__target__("sse4.1"))) void rgba2rgb(uint8_t const *in_rgba, uint8_t *out_rgb, size_t n) { |
| 87 | + const __m128i shuf1 = _mm_setr_epi8(0,1,2,4,5,6,8,9,10,12,13,14,3,7,11,15); |
| 88 | + const __m128i shuf2 = _mm_setr_epi8(5,6,8,9,10,12,13,14,3,7,11,15,0,1,2,4); |
| 89 | + const __m128i shuf3 = _mm_setr_epi8(10,12,13,14,3,7,11,15,0,1,2,4,5,6,8,9); |
| 90 | + const __m128i shuf4 = _mm_setr_epi8(3,7,11,15,0,1,2,4,5,6,8,9,10,12,13,14); |
| 91 | + auto in_rgba_end = in_rgba + ((n - 16) / 16 * 16) * 4; |
| 92 | + auto in_rgba_true_end = in_rgba + n * 4; |
| 93 | + while (in_rgba < in_rgba_end) { |
| 94 | + __m128i v1_rgba = _mm_loadu_si128((__m128i *)in_rgba); in_rgba += 16; |
| 95 | + __m128i v2_rgba = _mm_loadu_si128((__m128i *)in_rgba); in_rgba += 16; |
| 96 | + __m128i v3_rgba = _mm_loadu_si128((__m128i *)in_rgba); in_rgba += 16; |
| 97 | + __m128i v4_rgba = _mm_loadu_si128((__m128i *)in_rgba); in_rgba += 16; |
| 98 | + __m128i v1_rgb = _mm_shuffle_epi8(v1_rgba, shuf1); |
| 99 | + __m128i v2_rgb = _mm_shuffle_epi8(v2_rgba, shuf2); |
| 100 | + __m128i v3_rgb = _mm_shuffle_epi8(v3_rgba, shuf3); |
| 101 | + __m128i v4_rgb = _mm_shuffle_epi8(v4_rgba, shuf4); |
| 102 | + __m128i v1e_rgb = _mm_blend_epi16(v1_rgb, v2_rgb, 0b11000000); |
| 103 | + __m128i v2e_rgb = _mm_blend_epi16(v2_rgb, v3_rgb, 0b11110000); |
| 104 | + __m128i v3e_rgb = _mm_blend_epi16(v3_rgb, v4_rgb, 0b11111100); |
| 105 | + _mm_storeu_si128((__m128i *)out_rgb, v1e_rgb); out_rgb += 16; |
| 106 | + _mm_storeu_si128((__m128i *)out_rgb, v2e_rgb); out_rgb += 16; |
| 107 | + _mm_storeu_si128((__m128i *)out_rgb, v3e_rgb); out_rgb += 16; |
| 108 | + } |
| 109 | + while (in_rgba != in_rgba_true_end) { |
| 110 | + *out_rgb++ = *in_rgba++; |
| 111 | + *out_rgb++ = *in_rgba++; |
| 112 | + *out_rgb++ = *in_rgba++; |
| 113 | + in_rgba++; |
| 114 | + } |
| 115 | +} |
| 116 | +
|
| 117 | +__attribute__((__target__("default"))) void rgba2rgb(uint8_t const *in_rgba, uint8_t *out_rgb, size_t n) { |
| 118 | + for (size_t i = 0; i < n; i++) { |
| 119 | + out_rgb[i * 3 + 0] = in_rgba[i * 4 + 0]; |
| 120 | + out_rgb[i * 3 + 1] = in_rgba[i * 4 + 1]; |
| 121 | + out_rgb[i * 3 + 2] = in_rgba[i * 4 + 2]; |
| 122 | + } |
| 123 | +} |
| 124 | +``` |
| 125 | + |
| 126 | +运行时动态分发,但都让编译器自动根据检测到的 CPU 架构针对性地优化: |
| 127 | + |
| 128 | +```cpp |
| 129 | +__attribute__((target_clones("sse4.1,avx"))) void rgba2rgb(uint8_t const *in_rgba, uint8_t *out_rgb, size_t n) { |
| 130 | + for (size_t i = 0; i < n; i++) { |
| 131 | + out_rgb[i * 3 + 0] = in_rgba[i * 4 + 0]; |
| 132 | + out_rgb[i * 3 + 1] = in_rgba[i * 4 + 1]; |
| 133 | + out_rgb[i * 3 + 2] = in_rgba[i * 4 + 2]; |
| 134 | + } |
| 135 | +} |
| 136 | +``` |
| 137 | +
|
| 138 | +用户自定义的运行时分发规则,手动使用 `__builtin_cpu_supports` 检测: |
| 139 | +
|
| 140 | +```cpp |
| 141 | +__attribute__((ifunc("rgba2rgb_dispatch"))) void rgba2rgb(uint8_t const *in_rgba, uint8_t *out_rgb, size_t n); |
| 142 | +
|
| 143 | +void rgba2rgb_avx2(uint8_t const *in_rgba, uint8_t *out_rgb, size_t n); |
| 144 | +void rgba2rgb_default(uint8_t const *in_rgba, uint8_t *out_rgb, size_t n); |
| 145 | +
|
| 146 | +typedef void rgba2rgb_t(uint8_t const *in_rgba, uint8_t *out_rgb, size_t n); |
| 147 | +
|
| 148 | +extern "C" rgba2rgb_t *rgba2rgb_dispatch() { |
| 149 | + if (__builtin_cpu_supports("avx2")) { |
| 150 | + return &rgba2rgb_avx2; |
| 151 | + } else { |
| 152 | + return &rgba2rgb_default; |
| 153 | + } |
| 154 | +} |
| 155 | +
|
| 156 | +void rgba2rgb_avx2(uint8_t const *in_rgba, uint8_t *out_rgb, size_t n) { |
| 157 | + const __m128i shuf1 = _mm_setr_epi8(0,1,2,4,5,6,8,9,10,12,13,14,3,7,11,15); |
| 158 | + const __m128i shuf2 = _mm_setr_epi8(5,6,8,9,10,12,13,14,3,7,11,15,0,1,2,4); |
| 159 | + const __m128i shuf3 = _mm_setr_epi8(10,12,13,14,3,7,11,15,0,1,2,4,5,6,8,9); |
| 160 | + const __m128i shuf4 = _mm_setr_epi8(3,7,11,15,0,1,2,4,5,6,8,9,10,12,13,14); |
| 161 | + auto in_rgba_end = in_rgba + ((n - 16) / 16 * 16) * 4; |
| 162 | + auto in_rgba_true_end = in_rgba + n * 4; |
| 163 | + while (in_rgba < in_rgba_end) { |
| 164 | + __m128i v1_rgba = _mm_loadu_si128((__m128i *)in_rgba); in_rgba += 16; |
| 165 | + __m128i v2_rgba = _mm_loadu_si128((__m128i *)in_rgba); in_rgba += 16; |
| 166 | + __m128i v3_rgba = _mm_loadu_si128((__m128i *)in_rgba); in_rgba += 16; |
| 167 | + __m128i v4_rgba = _mm_loadu_si128((__m128i *)in_rgba); in_rgba += 16; |
| 168 | + __m128i v1_rgb = _mm_shuffle_epi8(v1_rgba, shuf1); |
| 169 | + __m128i v2_rgb = _mm_shuffle_epi8(v2_rgba, shuf2); |
| 170 | + __m128i v3_rgb = _mm_shuffle_epi8(v3_rgba, shuf3); |
| 171 | + __m128i v4_rgb = _mm_shuffle_epi8(v4_rgba, shuf4); |
| 172 | + __m128i v1e_rgb = _mm_blend_epi32(v1_rgb, v2_rgb, 0b1000); |
| 173 | + __m128i v2e_rgb = _mm_blend_epi32(v2_rgb, v3_rgb, 0b1100); |
| 174 | + __m128i v3e_rgb = _mm_blend_epi32(v3_rgb, v4_rgb, 0b1110); |
| 175 | + _mm_storeu_si128((__m128i *)out_rgb, v1e_rgb); out_rgb += 16; |
| 176 | + _mm_storeu_si128((__m128i *)out_rgb, v2e_rgb); out_rgb += 16; |
| 177 | + _mm_storeu_si128((__m128i *)out_rgb, v3e_rgb); out_rgb += 16; |
| 178 | + } |
| 179 | + while (in_rgba != in_rgba_true_end) { |
| 180 | + *out_rgb++ = *in_rgba++; |
| 181 | + *out_rgb++ = *in_rgba++; |
| 182 | + *out_rgb++ = *in_rgba++; |
| 183 | + in_rgba++; |
| 184 | + } |
| 185 | +} |
| 186 | +
|
| 187 | +void rgba2rgb_default(uint8_t const *in_rgba, uint8_t *out_rgb, size_t n) { |
| 188 | + for (size_t i = 0; i < n; i++) { |
| 189 | + out_rgb[i * 3 + 0] = in_rgba[i * 4 + 0]; |
| 190 | + out_rgb[i * 3 + 1] = in_rgba[i * 4 + 1]; |
| 191 | + out_rgb[i * 3 + 2] = in_rgba[i * 4 + 2]; |
| 192 | + } |
| 193 | +} |
| 194 | +``` |
| 195 | + |
| 196 | +注意:MSVC 不支持动态分发,只能编译期分发,动态分发是 GCC 和 Clang 才有的特性。 |
0 commit comments