Commit 62d04a8

oshadura authored and pcanal committed
Revert "Updating Zlib/Cloudflare/CMS-externals to latest version (SIMD), together with new updates from Cloudflare (support of aarch64 and MSVC)"
This reverts commit c3bb0c2.
1 parent eaf493f · commit 62d04a8

File tree

12 files changed: +746 −1506 lines

builtins/zlib/adler32.c (+18 −259)
@@ -6,29 +6,15 @@
 /* @(#) $Id$ */
 
 #include "zutil.h"
-#include <xmmintrin.h>
-#include <tmmintrin.h>
 
-#include <immintrin.h>
+#define local static
 
-#ifdef __x86_64__
-#include "cpuid.h"
-#endif
-
-static uLong adler32_combine_ OF((uLong adler1, uLong adler2, z_off64_t len2));
+local uLong adler32_combine_ OF((uLong adler1, uLong adler2, z_off64_t len2));
 
 #define BASE 65521 /* largest prime smaller than 65536 */
 #define NMAX 5552
 /* NMAX is the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 */
 
-/*
- * As we are using _signed_ integer arithmetic for the SSE/AVX2 implementations,
- * we consider the max as 2^31-1
- */
-#define NMAX_VEC 5552
-
-#define NMAX_VEC2 5552
-
 #define DO1(buf,i) {adler += (buf)[i]; sum2 += adler;}
 #define DO2(buf,i) DO1(buf,i); DO1(buf,i+1);
 #define DO4(buf,i) DO2(buf,i); DO2(buf,i+2);
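Note that this hunk also drops the <immintrin.h> and "cpuid.h" includes that only the reverted SIMD code needed; those headers fed the load-time dispatch removed further down in this diff, where adler32 is bound through the GNU ifunc attribute to a resolver that probes CPUID once and returns the AVX2, SSE4.2, or default implementation. A minimal self-contained sketch of that dispatch pattern follows (illustrative names only, not taken from the diff; ifunc is a GNU extension, so this assumes GCC or Clang on an x86 ELF target):

#include <cpuid.h>
#include <stdio.h>

static int bump_default(int x) { return x + 1; }
static int bump_sse42(int x)   { return x + 1; }  /* stand-in for a SIMD-optimized body */

/* Called once by the dynamic loader; its return value becomes the definition of bump(). */
static int (*resolve_bump(void))(int)
{
    unsigned int eax, ebx, ecx, edx;
    if (__get_cpuid(1, &eax, &ebx, &ecx, &edx) && (ecx & bit_SSE4_2))
        return bump_sse42;
    return bump_default;
}

int bump(int x) __attribute__ ((ifunc ("resolve_bump")));

int main(void)
{
    printf("%d\n", bump(41));  /* prints 42 whichever implementation was selected */
    return 0;
}

The removed resolve_adler32() in this file followed the same shape, additionally checking leaf 7 for bit_AVX2 before falling back to SSE4.2 or the portable loop.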
@@ -76,9 +62,11 @@ static uLong adler32_combine_ OF((uLong adler1, uLong adler2, z_off64_t len2));
 #endif
 
 /* ========================================================================= */
-uLong ZEXPORT adler32_default(uLong adler, const Bytef *buf, uInt len)
+uLong ZEXPORT adler32(adler, buf, len)
+    uLong adler;
+    const Bytef *buf;
+    uInt len;
 {
-
     unsigned long sum2;
     unsigned n;
 
@@ -144,246 +132,11 @@ uLong ZEXPORT adler32_default(uLong adler, const Bytef *buf, uInt len)
     return adler | (sum2 << 16);
 }
 
-#define likely(x) __builtin_expect(!!(x), 1)
-#define unlikely(x) __builtin_expect(!!(x), 0)
-
-/* ========================================================================= */
-__attribute__ ((target ("sse4.2")))
-uLong ZEXPORT adler32_sse42(uLong adler, const Bytef *buf, uInt len)
-{
-    unsigned long sum2;
-
-    /* split Adler-32 into component sums */
-    sum2 = (adler >> 16) & 0xffff;
-    adler &= 0xffff;
-
-    /* in case user likes doing a byte at a time, keep it fast */
-    if (unlikely(len == 1)) {
-        adler += buf[0];
-        if (adler >= BASE)
-            adler -= BASE;
-        sum2 += adler;
-        if (sum2 >= BASE)
-            sum2 -= BASE;
-        return adler | (sum2 << 16);
-    }
-
-    /* initial Adler-32 value (deferred check for len == 1 speed) */
-    if (unlikely(buf == Z_NULL))
-        return 1L;
-
-    /* in case short lengths are provided, keep it somewhat fast */
-    if (unlikely(len < 16)) {
-        while (len--) {
-            adler += *buf++;
-            sum2 += adler;
-        }
-        if (adler >= BASE)
-            adler -= BASE;
-        MOD28(sum2); /* only added so many BASE's */
-        return adler | (sum2 << 16);
-    }
-
-    uint32_t __attribute__ ((aligned(16))) s1[4], s2[4];
-    s1[0] = s1[1] = s1[2] = 0; s1[3] = adler;
-    s2[0] = s2[1] = s2[2] = 0; s2[3] = sum2;
-    char __attribute__ ((aligned(16))) dot1[16] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
-    __m128i dot1v = _mm_load_si128((__m128i*)dot1);
-    char __attribute__ ((aligned(16))) dot2[16] = {16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1};
-    __m128i dot2v = _mm_load_si128((__m128i*)dot2);
-    short __attribute__ ((aligned(16))) dot3[8] = {1, 1, 1, 1, 1, 1, 1, 1};
-    __m128i dot3v = _mm_load_si128((__m128i*)dot3);
-    // We will need to multiply by
-    //char __attribute__ ((aligned(16))) shift[4] = {0, 0, 0, 4}; //{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4};
-    char __attribute__ ((aligned(16))) shift[16] = {4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
-    __m128i shiftv = _mm_load_si128((__m128i*)shift);
-    while (len >= 16) {
-        __m128i vs1 = _mm_load_si128((__m128i*)s1);
-        __m128i vs2 = _mm_load_si128((__m128i*)s2);
-        __m128i vs1_0 = vs1;
-        int k = (len < NMAX_VEC ? (int)len : NMAX_VEC);
-        k -= k % 16;
-        len -= k;
-        while (k >= 16) {
-            /*
-               vs1 = adler + sum(c[i])
-               vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] )
-
-               NOTE: 256-bit equivalents are:
-               _mm256_maddubs_epi16 <- operates on 32 bytes to 16 shorts
-               _mm256_madd_epi16 <- Sums 16 shorts to 8 int32_t.
-               We could rewrite the below to use 256-bit instructions instead of 128-bit.
-            */
-            __m128i vbuf = _mm_loadu_si128((__m128i*)buf);
-            buf += 16;
-            k -= 16;
-            __m128i v_short_sum1 = _mm_maddubs_epi16(vbuf, dot1v); // multiply-add, resulting in 8 shorts.
-            __m128i vsum1 = _mm_madd_epi16(v_short_sum1, dot3v); // sum 8 shorts to 4 int32_t;
-            __m128i v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v);
-            vs1 = _mm_add_epi32(vsum1, vs1);
-            __m128i vsum2 = _mm_madd_epi16(v_short_sum2, dot3v);
-            vs1_0 = _mm_sll_epi32(vs1_0, shiftv);
-            vsum2 = _mm_add_epi32(vsum2, vs2);
-            vs2 = _mm_add_epi32(vsum2, vs1_0);
-            vs1_0 = vs1;
-        }
-        // At this point, we have partial sums stored in vs1 and vs2. There are AVX512 instructions that
-        // would allow us to sum these quickly (VP4DPWSSD). For now, just unpack and move on.
-        uint32_t __attribute__((aligned(16))) s1_unpack[4];
-        uint32_t __attribute__((aligned(16))) s2_unpack[4];
-        _mm_store_si128((__m128i*)s1_unpack, vs1);
-        _mm_store_si128((__m128i*)s2_unpack, vs2);
-        adler = (s1_unpack[0] % BASE) + (s1_unpack[1] % BASE) + (s1_unpack[2] % BASE) + (s1_unpack[3] % BASE);
-        MOD(adler);
-        s1[3] = adler;
-        sum2 = (s2_unpack[0] % BASE) + (s2_unpack[1] % BASE) + (s2_unpack[2] % BASE) + (s2_unpack[3] % BASE);
-        MOD(sum2);
-        s2[3] = sum2;
-    }
-
-    while (len--) {
-        adler += *buf++;
-        sum2 += adler;
-    }
-    MOD(adler);
-    MOD(sum2);
-
-    /* return recombined sums */
-    return adler | (sum2 << 16);
-}
-
-/* ========================================================================= */
-__attribute__ ((target ("avx2")))
-uLong ZEXPORT adler32_avx2(uLong adler, const Bytef *buf, uInt len)
-{
-    unsigned long sum2;
-
-    /* split Adler-32 into component sums */
-    sum2 = (adler >> 16) & 0xffff;
-    adler &= 0xffff;
-
-    /* in case user likes doing a byte at a time, keep it fast */
-    if (unlikely(len == 1)) {
-        adler += buf[0];
-        if (adler >= BASE)
-            adler -= BASE;
-        sum2 += adler;
-        if (sum2 >= BASE)
-            sum2 -= BASE;
-        return adler | (sum2 << 16);
-    }
-
-    /* initial Adler-32 value (deferred check for len == 1 speed) */
-    if (unlikely(buf == Z_NULL))
-        return 1L;
-
-    /* in case short lengths are provided, keep it somewhat fast */
-    if (unlikely(len < 32)) {
-        while (len--) {
-            adler += *buf++;
-            sum2 += adler;
-        }
-        if (adler >= BASE)
-            adler -= BASE;
-        MOD28(sum2); /* only added so many BASE's */
-        return adler | (sum2 << 16);
-    }
-
-    uint32_t __attribute__ ((aligned(32))) s1[8], s2[8];
-    memset(s1, '\0', sizeof(uint32_t)*7); s1[7] = adler; // TODO: would a masked load be faster?
-    memset(s2, '\0', sizeof(uint32_t)*7); s2[7] = sum2;
-    char __attribute__ ((aligned(32))) dot1[32] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
-    __m256i dot1v = _mm256_load_si256((__m256i*)dot1);
-    char __attribute__ ((aligned(32))) dot2[32] = {32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1};
-    __m256i dot2v = _mm256_load_si256((__m256i*)dot2);
-    short __attribute__ ((aligned(32))) dot3[16] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
-    __m256i dot3v = _mm256_load_si256((__m256i*)dot3);
-    // We will need to multiply by
-    char __attribute__ ((aligned(16))) shift[16] = {5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
-    __m128i shiftv = _mm_load_si128((__m128i*)shift);
-    while (len >= 32) {
-        __m256i vs1 = _mm256_load_si256((__m256i*)s1);
-        __m256i vs2 = _mm256_load_si256((__m256i*)s2);
-        __m256i vs1_0 = vs1;
-        int k = (len < NMAX_VEC ? (int)len : NMAX_VEC);
-        k -= k % 32;
-        len -= k;
-        while (k >= 32) {
-            /*
-               vs1 = adler + sum(c[i])
-               vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] )
-            */
-            __m256i vbuf = _mm256_loadu_si256((__m256i*)buf);
-            buf += 32;
-            k -= 32;
-            __m256i v_short_sum1 = _mm256_maddubs_epi16(vbuf, dot1v); // multiply-add, resulting in 8 shorts.
-            __m256i vsum1 = _mm256_madd_epi16(v_short_sum1, dot3v); // sum 8 shorts to 4 int32_t;
-            __m256i v_short_sum2 = _mm256_maddubs_epi16(vbuf, dot2v);
-            vs1 = _mm256_add_epi32(vsum1, vs1);
-            __m256i vsum2 = _mm256_madd_epi16(v_short_sum2, dot3v);
-            vs1_0 = _mm256_sll_epi32(vs1_0, shiftv);
-            vsum2 = _mm256_add_epi32(vsum2, vs2);
-            vs2 = _mm256_add_epi32(vsum2, vs1_0);
-            vs1_0 = vs1;
-        }
-        // At this point, we have partial sums stored in vs1 and vs2. There are AVX512 instructions that
-        // would allow us to sum these quickly (VP4DPWSSD). For now, just unpack and move on.
-        uint32_t __attribute__((aligned(32))) s1_unpack[8];
-        uint32_t __attribute__((aligned(32))) s2_unpack[8];
-        _mm256_store_si256((__m256i*)s1_unpack, vs1);
-        _mm256_store_si256((__m256i*)s2_unpack, vs2);
-        adler = (s1_unpack[0] % BASE) + (s1_unpack[1] % BASE) + (s1_unpack[2] % BASE) + (s1_unpack[3] % BASE) + (s1_unpack[4] % BASE) + (s1_unpack[5] % BASE) + (s1_unpack[6] % BASE) + (s1_unpack[7] % BASE);
-        MOD(adler);
-        s1[7] = adler;
-        sum2 = (s2_unpack[0] % BASE) + (s2_unpack[1] % BASE) + (s2_unpack[2] % BASE) + (s2_unpack[3] % BASE) + (s2_unpack[4] % BASE) + (s2_unpack[5] % BASE) + (s2_unpack[6] % BASE) + (s2_unpack[7] % BASE);
-        MOD(sum2);
-        s2[7] = sum2;
-    }
-
-    while (len--) {
-        adler += *buf++;
-        sum2 += adler;
-    }
-    MOD(adler);
-    MOD(sum2);
-
-    /* return recombined sums */
-    return adler | (sum2 << 16);
-}
-
-uLong ZEXPORT adler32(uLong adler, const Bytef *buf, uInt len) __attribute__ ((ifunc ("resolve_adler32")));
-
-void *resolve_adler32(void)
-{
-    unsigned int eax, ebx, ecx, edx;
-    signed char has_sse42 = 0;
-    signed char has_avx2 = 0;
-
-    /* Collect CPU features */
-    if (!__get_cpuid (1, &eax, &ebx, &ecx, &edx))
-        return adler32_default;
-    has_sse42 = ((ecx & bit_SSE4_2) != 0);
-#if defined(bit_AVX2)
-    if (__get_cpuid_max (0, NULL) < 7)
-        return adler32_default;
-    __cpuid_count (7, 0, eax, ebx, ecx, edx);
-    has_avx2 = ((ebx & bit_AVX2) != 0);
-#endif /* defined(bit_AVX2) */
-
-    /* Pick AVX2 version */
-    if (has_avx2)
-        return adler32_avx2;
-
-    /* Pick SSE4.2 version */
-    if (has_sse42)
-        return adler32_sse42;
-
-    /* Fallback to default implementation */
-    return adler32_default;
-}
-
 /* ========================================================================= */
-static uLong adler32_combine_(uLong adler1, uLong adler2, z_off64_t len2)
+local uLong adler32_combine_(adler1, adler2, len2)
+    uLong adler1;
+    uLong adler2;
+    z_off64_t len2;
 {
     unsigned long sum1;
     unsigned long sum2;
@@ -409,12 +162,18 @@ static uLong adler32_combine_(uLong adler1, uLong adler2, z_off64_t len2)
 }
 
 /* ========================================================================= */
-uLong adler32_combine(uLong adler1, uLong adler2, z_off_t len2)
+uLong ZEXPORT adler32_combine(adler1, adler2, len2)
+    uLong adler1;
+    uLong adler2;
+    z_off_t len2;
 {
     return adler32_combine_(adler1, adler2, len2);
 }
 
-uLong adler32_combine64(uLong adler1, uLong adler2, z_off64_t len2)
+uLong ZEXPORT adler32_combine64(adler1, adler2, len2)
+    uLong adler1;
+    uLong adler2;
+    z_off64_t len2;
 {
     return adler32_combine_(adler1, adler2, len2);
 }
