/* @(#) $Id$ */

#include "zutil.h"

-#include <xmmintrin.h>
-#include <tmmintrin.h>

-#include <immintrin.h>
+#define local static

-#ifdef __x86_64__
-#include "cpuid.h"
-#endif
-
-static uLong adler32_combine_ OF((uLong adler1, uLong adler2, z_off64_t len2));
+local uLong adler32_combine_ OF((uLong adler1, uLong adler2, z_off64_t len2));

#define BASE 65521      /* largest prime smaller than 65536 */
#define NMAX 5552
/* NMAX is the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 */

-/*
- * As we are using _signed_ integer arithmetic for the SSE/AVX2 implementations,
- * we consider the max as 2^31-1
- */
-#define NMAX_VEC 5552
-
-#define NMAX_VEC2 5552
-
#define DO1(buf,i)  {adler += (buf)[i]; sum2 += adler;}
#define DO2(buf,i)  DO1(buf,i); DO1(buf,i+1);
#define DO4(buf,i)  DO2(buf,i); DO2(buf,i+2);
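The NMAX comment above can be sanity-checked by brute force: NMAX is the largest n for which the worst-case sums (starting from adler = BASE-1 and feeding n bytes of 0xff) still fit in 32 bits. The following standalone snippet is illustrative only, not part of adler32.c:

/* Brute-force check of the NMAX bound stated above (illustrative only). */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    const uint64_t base  = 65521;           /* BASE */
    const uint64_t limit = 0xffffffffULL;   /* 2^32 - 1 */
    uint64_t n;

    /* advance n while 255*n*(n+1)/2 + (n+1)*(BASE-1) still fits in 32 bits */
    for (n = 1; 255 * n * (n + 1) / 2 + (n + 1) * (base - 1) <= limit; n++)
        ;
    printf("largest n = %llu\n", (unsigned long long)(n - 1));  /* prints 5552 */
    return 0;
}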
@@ -76,9 +62,11 @@ static uLong adler32_combine_ OF((uLong adler1, uLong adler2, z_off64_t len2));
#endif

/* ========================================================================= */
-uLong ZEXPORT adler32_default(uLong adler, const Bytef *buf, uInt len)
+uLong ZEXPORT adler32(adler, buf, len)
+    uLong adler;
+    const Bytef *buf;
+    uInt len;
{
-
    unsigned long sum2;
    unsigned n;

@@ -144,246 +132,11 @@ uLong ZEXPORT adler32_default(uLong adler, const Bytef *buf, uInt len)
    return adler | (sum2 << 16);
}

-#define likely(x)       __builtin_expect(!!(x), 1)
-#define unlikely(x)     __builtin_expect(!!(x), 0)
-
-/* ========================================================================= */
-__attribute__ ((target ("sse4.2")))
-uLong ZEXPORT adler32_sse42(uLong adler, const Bytef *buf, uInt len)
-{
-    unsigned long sum2;
-
-    /* split Adler-32 into component sums */
-    sum2 = (adler >> 16) & 0xffff;
-    adler &= 0xffff;
-
-    /* in case user likes doing a byte at a time, keep it fast */
-    if (unlikely(len == 1)) {
-        adler += buf[0];
-        if (adler >= BASE)
-            adler -= BASE;
-        sum2 += adler;
-        if (sum2 >= BASE)
-            sum2 -= BASE;
-        return adler | (sum2 << 16);
-    }
-
-    /* initial Adler-32 value (deferred check for len == 1 speed) */
-    if (unlikely(buf == Z_NULL))
-        return 1L;
-
-    /* in case short lengths are provided, keep it somewhat fast */
-    if (unlikely(len < 16)) {
-        while (len--) {
-            adler += *buf++;
-            sum2 += adler;
-        }
-        if (adler >= BASE)
-            adler -= BASE;
-        MOD28(sum2);            /* only added so many BASE's */
-        return adler | (sum2 << 16);
-    }
-
-    uint32_t __attribute__ ((aligned(16))) s1[4], s2[4];
-    s1[0] = s1[1] = s1[2] = 0; s1[3] = adler;
-    s2[0] = s2[1] = s2[2] = 0; s2[3] = sum2;
-    char __attribute__ ((aligned(16))) dot1[16] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
-    __m128i dot1v = _mm_load_si128((__m128i*)dot1);
-    char __attribute__ ((aligned(16))) dot2[16] = {16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1};
-    __m128i dot2v = _mm_load_si128((__m128i*)dot2);
-    short __attribute__ ((aligned(16))) dot3[8] = {1, 1, 1, 1, 1, 1, 1, 1};
-    __m128i dot3v = _mm_load_si128((__m128i*)dot3);
-    // We will need to multiply by
-    //char __attribute__ ((aligned(16))) shift[4] = {0, 0, 0, 4}; //{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4};
-    char __attribute__ ((aligned(16))) shift[16] = {4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
-    __m128i shiftv = _mm_load_si128((__m128i*)shift);
-    while (len >= 16) {
-        __m128i vs1 = _mm_load_si128((__m128i*)s1);
-        __m128i vs2 = _mm_load_si128((__m128i*)s2);
-        __m128i vs1_0 = vs1;
-        int k = (len < NMAX_VEC ? (int)len : NMAX_VEC);
-        k -= k % 16;
-        len -= k;
-        while (k >= 16) {
-            /*
-               vs1 = adler + sum(c[i])
-               vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] )
-
-               NOTE: 256-bit equivalents are:
-                 _mm256_maddubs_epi16 <- operates on 32 bytes to 16 shorts
-                 _mm256_madd_epi16    <- Sums 16 shorts to 8 int32_t.
-               We could rewrite the below to use 256-bit instructions instead of 128-bit.
-            */
-            __m128i vbuf = _mm_loadu_si128((__m128i*)buf);
-            buf += 16;
-            k -= 16;
-            __m128i v_short_sum1 = _mm_maddubs_epi16(vbuf, dot1v); // multiply-add, resulting in 8 shorts.
-            __m128i vsum1 = _mm_madd_epi16(v_short_sum1, dot3v);   // sum 8 shorts to 4 int32_t;
-            __m128i v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v);
-            vs1 = _mm_add_epi32(vsum1, vs1);
-            __m128i vsum2 = _mm_madd_epi16(v_short_sum2, dot3v);
-            vs1_0 = _mm_sll_epi32(vs1_0, shiftv);
-            vsum2 = _mm_add_epi32(vsum2, vs2);
-            vs2 = _mm_add_epi32(vsum2, vs1_0);
-            vs1_0 = vs1;
-        }
-        // At this point, we have partial sums stored in vs1 and vs2. There are AVX512 instructions that
-        // would allow us to sum these quickly (VP4DPWSSD). For now, just unpack and move on.
-        uint32_t __attribute__ ((aligned(16))) s1_unpack[4];
-        uint32_t __attribute__ ((aligned(16))) s2_unpack[4];
-        _mm_store_si128((__m128i*)s1_unpack, vs1);
-        _mm_store_si128((__m128i*)s2_unpack, vs2);
-        adler = (s1_unpack[0] % BASE) + (s1_unpack[1] % BASE) + (s1_unpack[2] % BASE) + (s1_unpack[3] % BASE);
-        MOD(adler);
-        s1[3] = adler;
-        sum2 = (s2_unpack[0] % BASE) + (s2_unpack[1] % BASE) + (s2_unpack[2] % BASE) + (s2_unpack[3] % BASE);
-        MOD(sum2);
-        s2[3] = sum2;
-    }
-
-    while (len--) {
-        adler += *buf++;
-        sum2 += adler;
-    }
-    MOD(adler);
-    MOD(sum2);
-
-    /* return recombined sums */
-    return adler | (sum2 << 16);
-}
-
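The identity quoted in the inner-loop comment, vs1 = adler + sum(c[i]) and vs2 = sum2 + 16 vs1 + sum((16-i+1) c[i]), is what lets the removed code fold 16 bytes per iteration with _mm_maddubs_epi16/_mm_madd_epi16. A scalar sketch (illustrative only, not from the patch) checking that block update against the plain byte-at-a-time recurrence:

/* Scalar check of the 16-byte block update used by the SSE4.2 loop
 * (illustrative only, not part of the patch). */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    const unsigned char c[16] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16};
    uint32_t s1 = 1, s2 = 0;          /* byte-at-a-time sums */
    uint32_t sum = 0, wsum = 0;       /* block form: plain and weighted byte sums */
    int i;

    for (i = 0; i < 16; i++) {
        s1 += c[i];                            /* reference recurrence */
        s2 += s1;
        sum  += c[i];
        wsum += (uint32_t)(16 - i) * c[i];     /* weights 16..1, matching dot2[] */
    }

    /* block update: s2' = s2_old + 16*s1_old + wsum, s1' = s1_old + sum */
    uint32_t b2 = 0 + 16 * 1 + wsum;
    uint32_t b1 = 1 + sum;

    printf("%u %u vs %u %u\n", s1, s2, b1, b2);   /* both pairs agree */
    return 0;
}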
-/* ========================================================================= */
-__attribute__ ((target ("avx2")))
-uLong ZEXPORT adler32_avx2(uLong adler, const Bytef *buf, uInt len)
-{
-    unsigned long sum2;
-
-    /* split Adler-32 into component sums */
-    sum2 = (adler >> 16) & 0xffff;
-    adler &= 0xffff;
-
-    /* in case user likes doing a byte at a time, keep it fast */
-    if (unlikely(len == 1)) {
-        adler += buf[0];
-        if (adler >= BASE)
-            adler -= BASE;
-        sum2 += adler;
-        if (sum2 >= BASE)
-            sum2 -= BASE;
-        return adler | (sum2 << 16);
-    }
-
-    /* initial Adler-32 value (deferred check for len == 1 speed) */
-    if (unlikely(buf == Z_NULL))
-        return 1L;
-
-    /* in case short lengths are provided, keep it somewhat fast */
-    if (unlikely(len < 32)) {
-        while (len--) {
-            adler += *buf++;
-            sum2 += adler;
-        }
-        if (adler >= BASE)
-            adler -= BASE;
-        MOD28(sum2);            /* only added so many BASE's */
-        return adler | (sum2 << 16);
-    }
-
-    uint32_t __attribute__ ((aligned(32))) s1[8], s2[8];
-    memset(s1, '\0', sizeof(uint32_t)*7); s1[7] = adler; // TODO: would a masked load be faster?
-    memset(s2, '\0', sizeof(uint32_t)*7); s2[7] = sum2;
-    char __attribute__ ((aligned(32))) dot1[32] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
-    __m256i dot1v = _mm256_load_si256((__m256i*)dot1);
-    char __attribute__ ((aligned(32))) dot2[32] = {32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1};
-    __m256i dot2v = _mm256_load_si256((__m256i*)dot2);
-    short __attribute__ ((aligned(32))) dot3[16] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
-    __m256i dot3v = _mm256_load_si256((__m256i*)dot3);
-    // We will need to multiply by
-    char __attribute__ ((aligned(16))) shift[16] = {5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
-    __m128i shiftv = _mm_load_si128((__m128i*)shift);
-    while (len >= 32) {
-        __m256i vs1 = _mm256_load_si256((__m256i*)s1);
-        __m256i vs2 = _mm256_load_si256((__m256i*)s2);
-        __m256i vs1_0 = vs1;
-        int k = (len < NMAX_VEC ? (int)len : NMAX_VEC);
-        k -= k % 32;
-        len -= k;
-        while (k >= 32) {
-            /*
-               vs1 = adler + sum(c[i])
-               vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] )
-            */
-            __m256i vbuf = _mm256_loadu_si256((__m256i*)buf);
-            buf += 32;
-            k -= 32;
-            __m256i v_short_sum1 = _mm256_maddubs_epi16(vbuf, dot1v); // multiply-add, resulting in 8 shorts.
-            __m256i vsum1 = _mm256_madd_epi16(v_short_sum1, dot3v);   // sum 8 shorts to 4 int32_t;
-            __m256i v_short_sum2 = _mm256_maddubs_epi16(vbuf, dot2v);
-            vs1 = _mm256_add_epi32(vsum1, vs1);
-            __m256i vsum2 = _mm256_madd_epi16(v_short_sum2, dot3v);
-            vs1_0 = _mm256_sll_epi32(vs1_0, shiftv);
-            vsum2 = _mm256_add_epi32(vsum2, vs2);
-            vs2 = _mm256_add_epi32(vsum2, vs1_0);
-            vs1_0 = vs1;
-        }
-        // At this point, we have partial sums stored in vs1 and vs2. There are AVX512 instructions that
-        // would allow us to sum these quickly (VP4DPWSSD). For now, just unpack and move on.
-        uint32_t __attribute__ ((aligned(32))) s1_unpack[8];
-        uint32_t __attribute__ ((aligned(32))) s2_unpack[8];
-        _mm256_store_si256((__m256i*)s1_unpack, vs1);
-        _mm256_store_si256((__m256i*)s2_unpack, vs2);
-        adler = (s1_unpack[0] % BASE) + (s1_unpack[1] % BASE) + (s1_unpack[2] % BASE) + (s1_unpack[3] % BASE) + (s1_unpack[4] % BASE) + (s1_unpack[5] % BASE) + (s1_unpack[6] % BASE) + (s1_unpack[7] % BASE);
-        MOD(adler);
-        s1[7] = adler;
-        sum2 = (s2_unpack[0] % BASE) + (s2_unpack[1] % BASE) + (s2_unpack[2] % BASE) + (s2_unpack[3] % BASE) + (s2_unpack[4] % BASE) + (s2_unpack[5] % BASE) + (s2_unpack[6] % BASE) + (s2_unpack[7] % BASE);
-        MOD(sum2);
-        s2[7] = sum2;
-    }
-
-    while (len--) {
-        adler += *buf++;
-        sum2 += adler;
-    }
-    MOD(adler);
-    MOD(sum2);
-
-    /* return recombined sums */
-    return adler | (sum2 << 16);
-}
-
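Both removed functions end by spilling the vector of partial sums to memory and reducing each lane modulo BASE. An in-register horizontal add is another option; the sketch below is a hypothetical AVX2 formulation, not something the patch used, and it omits the per-lane % BASE that guards against 32-bit overflow:

/* Hypothetical in-register horizontal add of 8 x 32-bit partial sums
 * (not from the patch; the removed code stores the vector and reduces
 * each lane modulo BASE instead, which also avoids overflow). */
#include <immintrin.h>
#include <stdint.h>

static inline uint32_t hsum_epi32_avx2(__m256i v)
{
    __m128i lo = _mm256_castsi256_si128(v);        /* lanes 0..3 */
    __m128i hi = _mm256_extracti128_si256(v, 1);   /* lanes 4..7 */
    __m128i s  = _mm_add_epi32(lo, hi);
    s = _mm_add_epi32(s, _mm_shuffle_epi32(s, _MM_SHUFFLE(1, 0, 3, 2)));
    s = _mm_add_epi32(s, _mm_shuffle_epi32(s, _MM_SHUFFLE(2, 3, 0, 1)));
    return (uint32_t)_mm_cvtsi128_si32(s);
}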
-uLong ZEXPORT adler32(uLong adler, const Bytef *buf, uInt len) __attribute__ ((ifunc ("resolve_adler32")));
-
-void *resolve_adler32(void)
-{
-    unsigned int eax, ebx, ecx, edx;
-    signed char has_sse42 = 0;
-    signed char has_avx2 = 0;
-
-    /* Collect CPU features */
-    if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
-        return adler32_default;
-    has_sse42 = ((ecx & bit_SSE4_2) != 0);
-#if defined(bit_AVX2)
-    if (__get_cpuid_max(0, NULL) < 7)
-        return adler32_default;
-    __cpuid_count(7, 0, eax, ebx, ecx, edx);
-    has_avx2 = ((ebx & bit_AVX2) != 0);
-#endif /* defined(bit_AVX2) */
-
-    /* Pick AVX2 version */
-    if (has_avx2)
-        return adler32_avx2;
-
-    /* Pick SSE4.2 version */
-    if (has_sse42)
-        return adler32_sse42;
-
-    /* Fallback to default implementation */
-    return adler32_default;
-}
-
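The removed resolver relies on the GNU ifunc attribute, which lets the dynamic loader pick an implementation once at load time. A rough equivalent without ifunc is a lazily initialized function pointer; the sketch below is hypothetical (adler32_dispatch is an invented name, and the initialization is not thread-safe as written):

/* Hypothetical function-pointer dispatch reusing resolve_adler32() from the
 * removed code; not part of this file. */
typedef uLong (*adler32_fn)(uLong adler, const Bytef *buf, uInt len);

static adler32_fn adler32_impl = NULL;

uLong ZEXPORT adler32_dispatch(uLong adler, const Bytef *buf, uInt len)
{
    if (adler32_impl == NULL)                       /* lazy init, not thread-safe */
        adler32_impl = (adler32_fn)resolve_adler32();
    return adler32_impl(adler, buf, len);
}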
/* ========================================================================= */
-static uLong adler32_combine_(uLong adler1, uLong adler2, z_off64_t len2)
+local uLong adler32_combine_(adler1, adler2, len2)
+    uLong adler1;
+    uLong adler2;
+    z_off64_t len2;
{
    unsigned long sum1;
    unsigned long sum2;
@@ -409,12 +162,18 @@ static uLong adler32_combine_(uLong adler1, uLong adler2, z_off64_t len2)
}

/* ========================================================================= */
-uLong adler32_combine(uLong adler1, uLong adler2, z_off_t len2)
+uLong ZEXPORT adler32_combine(adler1, adler2, len2)
+    uLong adler1;
+    uLong adler2;
+    z_off_t len2;
{
    return adler32_combine_(adler1, adler2, len2);
}

-uLong adler32_combine64(uLong adler1, uLong adler2, z_off64_t len2)
+uLong ZEXPORT adler32_combine64(adler1, adler2, len2)
+    uLong adler1;
+    uLong adler2;
+    z_off64_t len2;
{
    return adler32_combine_(adler1, adler2, len2);
}
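For context on the combine functions kept here: adler32_combine(adler1, adler2, len2) returns the checksum of the concatenation of two buffers whose individual checksums and second length are known. A small usage sketch (illustrative, assuming only the public zlib API from zlib.h):

/* Usage sketch for adler32_combine() (illustrative, not part of this file). */
#include <assert.h>
#include <string.h>
#include "zlib.h"

void combine_demo(void)
{
    const Bytef part1[] = "hello ";
    const Bytef part2[] = "world";
    uInt len1 = (uInt)strlen((const char *)part1);
    uInt len2 = (uInt)strlen((const char *)part2);

    uLong a1 = adler32(adler32(0L, Z_NULL, 0), part1, len1);   /* checksum of part1 */
    uLong a2 = adler32(adler32(0L, Z_NULL, 0), part2, len2);   /* checksum of part2 */

    uLong whole = adler32(adler32(0L, Z_NULL, 0), part1, len1);
    whole = adler32(whole, part2, len2);                       /* checksum of the whole */

    assert(adler32_combine(a1, a2, (z_off_t)len2) == whole);
}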