Skip to content

Commit 1a8d416

Browse files
committed
c: use AVX and SSE to bulk scan
Use fancy instructions to scan 16, 32, and 64 byte ranges instead of only inspecting a single byte at a time. While this was a lot of fun to do, it turns out to not be as efficient as being clever about avoiding comparisons whenever possible. That is, reading 1/10 bytes is better than reading 10 at once even if they are the same number of instructions. This is because there is overhead in loading the 128 and 256 bit registers and that overhead reduces the gains enough to give us a net speed that is slightly slower.
1 parent 473f3f3 commit 1a8d416

File tree

1 file changed

+230
-0
lines changed

1 file changed

+230
-0
lines changed

main.c

Lines changed: 230 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,9 @@ limitations under the License.
2222
#include <stdbool.h>
2323
#include <stdint.h>
2424
#include <assert.h>
25+
#if USE_SIMD && (__SSE4__ || __AVX2__)
26+
#include <immintrin.h>
27+
#endif
2528

2629
#ifndef DO_INST
2730
#define DO_INST 0
@@ -43,6 +46,10 @@ limitations under the License.
4346
static int_fast32_t runlens[4096] = {0};
4447
static int_fast32_t skips[128] = {0};
4548
static int_fast32_t remainders[64] = {0};
49+
static int_fast32_t non_asciis32 = 0;
50+
static int_fast32_t non_asciis16 = 0;
51+
static int_fast32_t non_asciis8 = 0;
52+
static int_fast32_t non_asciis4 = 0;
4653
#endif
4754

4855
#ifndef NDEBUG
@@ -58,6 +65,32 @@ limitations under the License.
5865
# define likely(x) __builtin_expect((x), 1)
5966
# define unlikely(x) __builtin_expect((x), 0)
6067

68+
typedef union mask {
69+
uint8_t bytes[32];
70+
uint8_t u8;
71+
uint16_t u16;
72+
uint32_t u32;
73+
uint64_t u64;
74+
__uint128_t u128;
75+
#if USE_SIMD && __SSE4__
76+
__m128i m128i;
77+
#endif
78+
#if USE_SIMD && __AVX2__
79+
__m256i m128i;
80+
#endif
81+
} mask;
82+
83+
#if USE_SIMD
84+
const static mask non_ascii = {
85+
.bytes = {
86+
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
87+
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
88+
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
89+
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
90+
}
91+
};
92+
#endif
93+
6194
#if USE_HEX_TABLE
6295
static const bool lhex[256] = {
6396
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
@@ -106,6 +139,55 @@ static void print_hit(const unsigned char *buf) {
106139
printf("%.40s\n", buf);
107140
}
108141

142+
#if USE_NON_ASCII4
143+
// perform (0x80 & C) on next 4 bytes at once
144+
// 32-bit operation
145+
static bool non_ascii4(const unsigned char *b) {
146+
uint32_t *bs = (uint32_t*)b;
147+
const uint32_t non_ascii = 0xffffffff;
148+
return (*bs & non_ascii) == non_ascii;
149+
}
150+
#endif
151+
152+
#if USE_SIMD
153+
// perform (0x80 & C) on next 8 bytes at once
154+
// 64-bit operation
155+
static bool non_ascii8(const unsigned char *b) {
156+
uint64_t *bs = (uint64_t*)b;
157+
return (*bs & non_ascii.u64) == non_ascii.u64;
158+
}
159+
#endif
160+
161+
#if USE_SIMD && __SSE4__
162+
// perform (0x80 & C) on next 16 bytes at once
163+
// 128-bit SSE operation
164+
static bool non_ascii16(const unsigned char *b) {
165+
// load 16 bytes as packed 16x8 byte value
166+
__m128i bs = _mm_loadu_si128((const __m128i *)b);
167+
// get the high bit (0x80) from each byte and put it in
168+
// to a the low 16 bites of a 32-bit int as a mask
169+
int high_bits = _mm_movemask_epi8(bs) & 0xffff;
170+
// any high bits mean a non-ascii
171+
// we only care if they are all high.
172+
return high_bits == 0xffff;
173+
}
174+
#endif
175+
176+
#if MAX_STEP >= 32 && USE_SIMD && __AVX2__
177+
// perform (0x80 & C) on next 16 bytes at once 256-bit AVX operation
178+
static bool non_ascii32(const unsigned char *b) {
179+
// load 32 bytes
180+
__m256i bs = _mm256_loadu_si256((const __m256i *)b);
181+
// vpmovmskb is an awesome instruction. It gathers the MSBs from the input
182+
// as packed bytes and returns it as a mask. That's equivalent to &0x80 on
183+
// 32 bytes at once!
184+
int high_bits = _mm256_movemask_epi8(bs);
185+
// any byte with the high bit set cannot be a hex because
186+
// it is outside of the main ascii range;
187+
return high_bits == 0xffffffff;
188+
}
189+
#endif
190+
109191
// At the start of this function, buf is pointing at a non-hex character and the
110192
// goal is to find the next hex character.
111193
static const unsigned char * scan_skip(const unsigned char *buf, const unsigned char *end) {
@@ -114,13 +196,75 @@ static const unsigned char * scan_skip(const unsigned char *buf, const unsigned
114196
#ifndef NDEBUG
115197
const unsigned char * io = buf;
116198
#endif
199+
200+
#if USE_SIMD
201+
if (unlikely(buf + 41 >= end)) {
202+
return buf;
203+
}
204+
205+
#define MAX_STEP 8
206+
207+
while (skip > 0 && buf + skip + MAX_STEP < end) {
208+
// Runs of 32+ and 16+ non-ascii bytes are not common
209+
// enough to justify the overhead of using these
210+
#if MAX_STEP >= 32
211+
if (non_ascii32(buf+skip)) {
212+
buf += skip + 32;
213+
skip = 40;
214+
INST(non_asciis32++);
215+
continue;
216+
}
217+
#endif
218+
#if MAX_STEP >= 16
219+
if (non_ascii16(buf+skip)) {
220+
buf += skip + 16;
221+
skip = 40;
222+
INST(non_asciis16++);
223+
continue;
224+
}
225+
#endif
226+
#if MAX_STEP >= 8
227+
if (non_ascii8(buf+skip)) {
228+
buf += skip + 8;
229+
skip = 40;
230+
INST(non_asciis8++);
231+
continue;
232+
}
233+
#endif
234+
// this works but hits so few cases that it doesn't give any benefit
235+
#if USE_NON_ASCII4
236+
if (non_ascii4(buf+skip)) {
237+
buf += skip + 4;
238+
skip = 40;
239+
INST(non_asciis4++);
240+
continue;
241+
}
242+
#endif
243+
if (!is_lower_hex(buf+skip)) {
244+
buf += skip;
245+
skip = 40;
246+
continue;
247+
}
248+
skip /= 2;
249+
}
250+
251+
while (skip > 0 && buf + skip < end) {
252+
if (!is_lower_hex(buf+skip)) {
253+
buf += skip;
254+
skip = 40;
255+
continue;
256+
}
257+
skip /= 2;
258+
}
259+
#else
117260
do {
118261
while (buf + skip < end && !is_lower_hex(buf+skip)) {
119262
buf += skip;
120263
skip = 40;
121264
}
122265
skip /= 2;
123266
} while (skip > 1 && buf + skip < end);
267+
#endif
124268
assert(io <= buf);
125269
assert(buf < end);
126270
return buf+1;
@@ -183,6 +327,11 @@ static const unsigned char * scan_hit_long(const unsigned char *buf, const unsig
183327
// at 50 we know that the current run ends before then and that any runs
184328
// between here and there are too short to care about.
185329

330+
// a sha256 would have ended at buf+24 so buf+25 wouldn't be a hex
331+
if (!is_lower_hex(buf+25) ) {
332+
return scan_skip(buf+25, end);
333+
}
334+
186335
assert(buf +30 < end);
187336

188337
if (!is_lower_hex(buf+30)) {
@@ -207,6 +356,66 @@ static const unsigned char * scan_hit_long(const unsigned char *buf, const unsig
207356
return scan_hit_short(start, end);
208357
}
209358

359+
#if USE_SIMD && __AVX2__
360+
static int is_hex64(const unsigned char *start) {
361+
uint64_t mask, res;
362+
int pos;
363+
364+
const __m256i b0 = _mm256_loadu_si256((void*)start);
365+
const __m256i b1 = _mm256_loadu_si256((void*)(start+32));
366+
367+
const __m256i rr0 = _mm256_set1_epi8('0'-1);
368+
const __m256i rr1 = _mm256_set1_epi8('9');
369+
const __m256i rr2 = _mm256_set1_epi8('a'-1);
370+
const __m256i rr3 = _mm256_set1_epi8('f');
371+
372+
// x > 0x29
373+
__m256i gz0 = _mm256_cmpgt_epi8(b0, rr0);
374+
__m256i gz1 = _mm256_cmpgt_epi8(b1, rr0);
375+
// .. &! (>0x39)
376+
__m256i le9_0 = _mm256_andnot_si256(_mm256_cmpgt_epi8(b0, rr1), gz0);
377+
__m256i le9_1 = _mm256_andnot_si256(_mm256_cmpgt_epi8(b1, rr1), gz1);
378+
// x > 0x60
379+
__m256i ga0 = _mm256_cmpgt_epi8(b0, rr2);
380+
__m256i ga1 = _mm256_cmpgt_epi8(b1, rr2);
381+
// .. &!(>0x66)
382+
__m256i lef0 = _mm256_andnot_si256(_mm256_cmpgt_epi8(b0, rr3), ga0);
383+
__m256i lef1 = _mm256_andnot_si256(_mm256_cmpgt_epi8(b1, rr3), ga1);
384+
385+
/* Generate bit masks */
386+
unsigned int numeric0 = _mm256_movemask_epi8(le9_0);
387+
unsigned int numeric1 = _mm256_movemask_epi8(le9_1);
388+
unsigned int alpha1 = _mm256_movemask_epi8(lef1);
389+
unsigned int alpha0 = _mm256_movemask_epi8(lef0);
390+
391+
// x > 0x29 && !(x > 0x39) || x > 0x60 && !(x > 0x66)
392+
uint64_t res0 = numeric0 | alpha0;
393+
uint64_t res1 = numeric1 | alpha1;
394+
// [0-31] | [32-63]
395+
res = res0 | (res1 << 32);
396+
397+
// yay little endian! :-/
398+
// 64.............0
399+
// 0x00000080ffffffff
400+
// 0x ffffffff 0-32
401+
// 0x ff 33-40
402+
// 0x 1 41
403+
// 0x000001ffffffffff = mask
404+
// 0x???????????????? & res
405+
// 0x000000ffffffffff = hit!
406+
407+
// bool hit = (res & 0x000001ffffffffff) == 0x000000ffffffffff;
408+
409+
mask = 1;
410+
pos = 0;
411+
while (res & mask) {
412+
pos++;
413+
mask <<= 1;
414+
}
415+
return pos;
416+
}
417+
#endif
418+
210419
// We are at the first hex character. The goal is to determine as efficiently as
211420
// possible if this is a 40 hex character run terminated by a non-hex, something
212421
// shorter, or something longer.
@@ -220,6 +429,23 @@ static const unsigned char * scan_hit_short(const unsigned char *buf, const unsi
220429
return buf;
221430
}
222431

432+
// Use AVX2 instructions to check 32 bytes + 32 bytes
433+
#if USE_SIMD && __AVX2__
434+
if (likely(buf + 64 < end)) {
435+
int len = is_hex64(buf);
436+
assert(len > 0);
437+
assert(len <= 64);
438+
if (len == 40) {
439+
print_hit(buf);
440+
return scan_skip(buf+len, end);
441+
}
442+
if (len < 64) {
443+
return scan_skip(buf+len, end);
444+
}
445+
return scan_hit_long(buf+40, end);
446+
}
447+
#endif
448+
223449
// We know offset 0 is a hex because that's why we're here.
224450
// We know offset 40 needs to be a non-hex otherwise we're in a 41+ run.
225451
// We know 1-39 all need to be hex characters.
@@ -372,6 +598,10 @@ int main(int argc, const char *argv[]) {
372598
for (int i = 0; i < arr_len(runlens); i++)
373599
if (runlens[i])
374600
dprintf(2, " [%4d] %10d%s\n", i, runlens[i], i==40 ? " *" : "");
601+
dprintf(2, "non-ascii32: %10d\n", non_asciis32);
602+
dprintf(2, "non-ascii16: %10d\n", non_asciis16);
603+
dprintf(2, "non-ascii8: %10d\n", non_asciis8);
604+
dprintf(2, "non-ascii4: %10d\n", non_asciis4);
375605
#endif
376606

377607
return nread;

0 commit comments

Comments
 (0)