@@ -22,6 +22,9 @@ limitations under the License.
22
22
#include <stdbool.h>
23
23
#include <stdint.h>
24
24
#include <assert.h>
25
+ #if USE_SIMD && (__SSE4__ || __AVX2__ )
26
+ #include <immintrin.h>
27
+ #endif
25
28
26
29
#ifndef DO_INST
27
30
#define DO_INST 0
@@ -43,6 +46,10 @@ limitations under the License.
43
46
static int_fast32_t runlens [4096 ] = {0 };
44
47
static int_fast32_t skips [128 ] = {0 };
45
48
static int_fast32_t remainders [64 ] = {0 };
49
+ static int_fast32_t non_asciis32 = 0 ;
50
+ static int_fast32_t non_asciis16 = 0 ;
51
+ static int_fast32_t non_asciis8 = 0 ;
52
+ static int_fast32_t non_asciis4 = 0 ;
46
53
#endif
47
54
48
55
#ifndef NDEBUG
@@ -58,6 +65,32 @@ limitations under the License.
58
65
# define likely (x ) __builtin_expect((x), 1)
59
66
# define unlikely (x ) __builtin_expect((x), 0)
60
67
68
+ typedef union mask {
69
+ uint8_t bytes [32 ];
70
+ uint8_t u8 ;
71
+ uint16_t u16 ;
72
+ uint32_t u32 ;
73
+ uint64_t u64 ;
74
+ __uint128_t u128 ;
75
+ #if USE_SIMD && __SSE4__
76
+ __m128i m128i ;
77
+ #endif
78
+ #if USE_SIMD && __AVX2__
79
+ __m256i m128i ;
80
+ #endif
81
+ } mask ;
82
+
83
+ #if USE_SIMD
84
+ const static mask non_ascii = {
85
+ .bytes = {
86
+ 0x80 , 0x80 , 0x80 , 0x80 , 0x80 , 0x80 , 0x80 , 0x80 ,
87
+ 0x80 , 0x80 , 0x80 , 0x80 , 0x80 , 0x80 , 0x80 , 0x80 ,
88
+ 0x80 , 0x80 , 0x80 , 0x80 , 0x80 , 0x80 , 0x80 , 0x80 ,
89
+ 0x80 , 0x80 , 0x80 , 0x80 , 0x80 , 0x80 , 0x80 , 0x80 ,
90
+ }
91
+ };
92
+ #endif
93
+
61
94
#if USE_HEX_TABLE
62
95
static const bool lhex [256 ] = {
63
96
0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
@@ -106,6 +139,55 @@ static void print_hit(const unsigned char *buf) {
106
139
printf ("%.40s\n" , buf );
107
140
}
108
141
142
+ #if USE_NON_ASCII4
143
+ // perform (0x80 & C) on next 4 bytes at once
144
+ // 32-bit operation
145
+ static bool non_ascii4 (const unsigned char * b ) {
146
+ uint32_t * bs = (uint32_t * )b ;
147
+ const uint32_t non_ascii = 0xffffffff ;
148
+ return (* bs & non_ascii ) == non_ascii ;
149
+ }
150
+ #endif
151
+
152
+ #if USE_SIMD
153
+ // perform (0x80 & C) on next 8 bytes at once
154
+ // 64-bit operation
155
+ static bool non_ascii8 (const unsigned char * b ) {
156
+ uint64_t * bs = (uint64_t * )b ;
157
+ return (* bs & non_ascii .u64 ) == non_ascii .u64 ;
158
+ }
159
+ #endif
160
+
161
+ #if USE_SIMD && __SSE4__
162
+ // perform (0x80 & C) on next 16 bytes at once
163
+ // 128-bit SSE operation
164
+ static bool non_ascii16 (const unsigned char * b ) {
165
+ // load 16 bytes as packed 16x8 byte value
166
+ __m128i bs = _mm_loadu_si128 ((const __m128i * )b );
167
+ // get the high bit (0x80) from each byte and put it in
168
+ // to a the low 16 bites of a 32-bit int as a mask
169
+ int high_bits = _mm_movemask_epi8 (bs ) & 0xffff ;
170
+ // any high bits mean a non-ascii
171
+ // we only care if they are all high.
172
+ return high_bits == 0xffff ;
173
+ }
174
+ #endif
175
+
176
+ #if MAX_STEP >= 32 && USE_SIMD && __AVX2__
177
+ // perform (0x80 & C) on next 16 bytes at once 256-bit AVX operation
178
+ static bool non_ascii32 (const unsigned char * b ) {
179
+ // load 32 bytes
180
+ __m256i bs = _mm256_loadu_si256 ((const __m256i * )b );
181
+ // vpmovmskb is an awesome instruction. It gathers the MSBs from the input
182
+ // as packed bytes and returns it as a mask. That's equivalent to &0x80 on
183
+ // 32 bytes at once!
184
+ int high_bits = _mm256_movemask_epi8 (bs );
185
+ // any byte with the high bit set cannot be a hex because
186
+ // it is outside of the main ascii range;
187
+ return high_bits == 0xffffffff ;
188
+ }
189
+ #endif
190
+
109
191
// At the start of this function, buf is pointing at a non-hex character and the
110
192
// goal is to find the next hex character.
111
193
static const unsigned char * scan_skip (const unsigned char * buf , const unsigned char * end ) {
@@ -114,13 +196,75 @@ static const unsigned char * scan_skip(const unsigned char *buf, const unsigned
114
196
#ifndef NDEBUG
115
197
const unsigned char * io = buf ;
116
198
#endif
199
+
200
+ #if USE_SIMD
201
+ if (unlikely (buf + 41 >= end )) {
202
+ return buf ;
203
+ }
204
+
205
+ #define MAX_STEP 8
206
+
207
+ while (skip > 0 && buf + skip + MAX_STEP < end ) {
208
+ // Runs of 32+ and 16+ non-ascii bytes are not common
209
+ // enough to justify the overhead of using these
210
+ #if MAX_STEP >= 32
211
+ if (non_ascii32 (buf + skip )) {
212
+ buf += skip + 32 ;
213
+ skip = 40 ;
214
+ INST (non_asciis32 ++ );
215
+ continue ;
216
+ }
217
+ #endif
218
+ #if MAX_STEP >= 16
219
+ if (non_ascii16 (buf + skip )) {
220
+ buf += skip + 16 ;
221
+ skip = 40 ;
222
+ INST (non_asciis16 ++ );
223
+ continue ;
224
+ }
225
+ #endif
226
+ #if MAX_STEP >= 8
227
+ if (non_ascii8 (buf + skip )) {
228
+ buf += skip + 8 ;
229
+ skip = 40 ;
230
+ INST (non_asciis8 ++ );
231
+ continue ;
232
+ }
233
+ #endif
234
+ // this works but hits so few cases that it doesn't give any benefit
235
+ #if USE_NON_ASCII4
236
+ if (non_ascii4 (buf + skip )) {
237
+ buf += skip + 4 ;
238
+ skip = 40 ;
239
+ INST (non_asciis4 ++ );
240
+ continue ;
241
+ }
242
+ #endif
243
+ if (!is_lower_hex (buf + skip )) {
244
+ buf += skip ;
245
+ skip = 40 ;
246
+ continue ;
247
+ }
248
+ skip /= 2 ;
249
+ }
250
+
251
+ while (skip > 0 && buf + skip < end ) {
252
+ if (!is_lower_hex (buf + skip )) {
253
+ buf += skip ;
254
+ skip = 40 ;
255
+ continue ;
256
+ }
257
+ skip /= 2 ;
258
+ }
259
+ #else
117
260
do {
118
261
while (buf + skip < end && !is_lower_hex (buf + skip )) {
119
262
buf += skip ;
120
263
skip = 40 ;
121
264
}
122
265
skip /= 2 ;
123
266
} while (skip > 1 && buf + skip < end );
267
+ #endif
124
268
assert (io <= buf );
125
269
assert (buf < end );
126
270
return buf + 1 ;
@@ -183,6 +327,11 @@ static const unsigned char * scan_hit_long(const unsigned char *buf, const unsig
183
327
// at 50 we know that the current run ends before then and that any runs
184
328
// between here and there are too short to care about.
185
329
330
+ // a sha256 would have ended at buf+24 so buf+25 wouldn't be a hex
331
+ if (!is_lower_hex (buf + 25 ) ) {
332
+ return scan_skip (buf + 25 , end );
333
+ }
334
+
186
335
assert (buf + 30 < end );
187
336
188
337
if (!is_lower_hex (buf + 30 )) {
@@ -207,6 +356,66 @@ static const unsigned char * scan_hit_long(const unsigned char *buf, const unsig
207
356
return scan_hit_short (start , end );
208
357
}
209
358
359
+ #if USE_SIMD && __AVX2__
360
+ static int is_hex64 (const unsigned char * start ) {
361
+ uint64_t mask , res ;
362
+ int pos ;
363
+
364
+ const __m256i b0 = _mm256_loadu_si256 ((void * )start );
365
+ const __m256i b1 = _mm256_loadu_si256 ((void * )(start + 32 ));
366
+
367
+ const __m256i rr0 = _mm256_set1_epi8 ('0' - 1 );
368
+ const __m256i rr1 = _mm256_set1_epi8 ('9' );
369
+ const __m256i rr2 = _mm256_set1_epi8 ('a' - 1 );
370
+ const __m256i rr3 = _mm256_set1_epi8 ('f' );
371
+
372
+ // x > 0x29
373
+ __m256i gz0 = _mm256_cmpgt_epi8 (b0 , rr0 );
374
+ __m256i gz1 = _mm256_cmpgt_epi8 (b1 , rr0 );
375
+ // .. &! (>0x39)
376
+ __m256i le9_0 = _mm256_andnot_si256 (_mm256_cmpgt_epi8 (b0 , rr1 ), gz0 );
377
+ __m256i le9_1 = _mm256_andnot_si256 (_mm256_cmpgt_epi8 (b1 , rr1 ), gz1 );
378
+ // x > 0x60
379
+ __m256i ga0 = _mm256_cmpgt_epi8 (b0 , rr2 );
380
+ __m256i ga1 = _mm256_cmpgt_epi8 (b1 , rr2 );
381
+ // .. &!(>0x66)
382
+ __m256i lef0 = _mm256_andnot_si256 (_mm256_cmpgt_epi8 (b0 , rr3 ), ga0 );
383
+ __m256i lef1 = _mm256_andnot_si256 (_mm256_cmpgt_epi8 (b1 , rr3 ), ga1 );
384
+
385
+ /* Generate bit masks */
386
+ unsigned int numeric0 = _mm256_movemask_epi8 (le9_0 );
387
+ unsigned int numeric1 = _mm256_movemask_epi8 (le9_1 );
388
+ unsigned int alpha1 = _mm256_movemask_epi8 (lef1 );
389
+ unsigned int alpha0 = _mm256_movemask_epi8 (lef0 );
390
+
391
+ // x > 0x29 && !(x > 0x39) || x > 0x60 && !(x > 0x66)
392
+ uint64_t res0 = numeric0 | alpha0 ;
393
+ uint64_t res1 = numeric1 | alpha1 ;
394
+ // [0-31] | [32-63]
395
+ res = res0 | (res1 << 32 );
396
+
397
+ // yay little endian! :-/
398
+ // 64.............0
399
+ // 0x00000080ffffffff
400
+ // 0x ffffffff 0-32
401
+ // 0x ff 33-40
402
+ // 0x 1 41
403
+ // 0x000001ffffffffff = mask
404
+ // 0x???????????????? & res
405
+ // 0x000000ffffffffff = hit!
406
+
407
+ // bool hit = (res & 0x000001ffffffffff) == 0x000000ffffffffff;
408
+
409
+ mask = 1 ;
410
+ pos = 0 ;
411
+ while (res & mask ) {
412
+ pos ++ ;
413
+ mask <<= 1 ;
414
+ }
415
+ return pos ;
416
+ }
417
+ #endif
418
+
210
419
// We are at the first hex character. The goal is to determine as efficiently as
211
420
// possible if this is a 40 hex character run terminated by a non-hex, something
212
421
// shorter, or something longer.
@@ -220,6 +429,23 @@ static const unsigned char * scan_hit_short(const unsigned char *buf, const unsi
220
429
return buf ;
221
430
}
222
431
432
+ // Use AVX2 instructions to check 32 bytes + 32 bytes
433
+ #if USE_SIMD && __AVX2__
434
+ if (likely (buf + 64 < end )) {
435
+ int len = is_hex64 (buf );
436
+ assert (len > 0 );
437
+ assert (len <= 64 );
438
+ if (len == 40 ) {
439
+ print_hit (buf );
440
+ return scan_skip (buf + len , end );
441
+ }
442
+ if (len < 64 ) {
443
+ return scan_skip (buf + len , end );
444
+ }
445
+ return scan_hit_long (buf + 40 , end );
446
+ }
447
+ #endif
448
+
223
449
// We know offset 0 is a hex because that's why we're here.
224
450
// We know offset 40 needs to be a non-hex otherwise we're in a 41+ run.
225
451
// We know 1-39 all need to be hex characters.
@@ -372,6 +598,10 @@ int main(int argc, const char *argv[]) {
372
598
for (int i = 0 ; i < arr_len (runlens ); i ++ )
373
599
if (runlens [i ])
374
600
dprintf (2 , " [%4d] %10d%s\n" , i , runlens [i ], i == 40 ? " *" : "" );
601
+ dprintf (2 , "non-ascii32: %10d\n" , non_asciis32 );
602
+ dprintf (2 , "non-ascii16: %10d\n" , non_asciis16 );
603
+ dprintf (2 , "non-ascii8: %10d\n" , non_asciis8 );
604
+ dprintf (2 , "non-ascii4: %10d\n" , non_asciis4 );
375
605
#endif
376
606
377
607
return nread ;
0 commit comments