Skip to content

Commit 6eca783

Browse files
TimWollacmb69nielsdos
authored
hash: Add SHA-NI implementation of SHA-256 (#15152)
* hash: Add SSE2 implementation of SHA-256 Implementation taken from Tarsnap/libcperciva@661752a. Co-authored-by: Christoph M. Becker <[email protected]> Co-authored-by: Niels Dossche <[email protected]> * zend_cpuinfo: Add ZEND_CPU_FEATURE_SHA * hash: Add SHA-NI implementation of SHA-256 Implementation taken from Tarsnap/libcperciva@661752a. Co-authored-by: Christoph M. Becker <[email protected]> * NEWS / UPGRADING --------- Co-authored-by: Christoph M. Becker <[email protected]> Co-authored-by: Niels Dossche <[email protected]>
1 parent a355c35 commit 6eca783

10 files changed

+483
-3
lines changed

NEWS

+2
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@ PHP NEWS
2626
- Hash:
2727
. Deprecated passing incorrect data types for options to ext/hash functions.
2828
(nielsdos)
29+
. Added SSE2 and SHA-NI implementation of SHA-256. (timwolla, Colin Percival,
30+
Graham Percival)
2931

3032
- PHPDBG:
3133
. array out of bounds, stack overflow handled for segfault handler on windows.

README.REDIST.BINS

+1-1
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
18. avifinfo (ext/standard/libavifinfo) see ext/standard/libavifinfo/LICENSE
1919
19. xxHash (ext/hash/xxhash)
2020
20. Lexbor (ext/dom/lexbor/lexbor) see ext/dom/lexbor/LICENSE
21-
21+
21. Portions of libcperciva (ext/hash/hash_sha_{ni,sse2}.c) see the header in the source file
2222

2323
3. pcre2lib (ext/pcre)
2424

UPGRADING

+4
Original file line numberDiff line numberDiff line change
@@ -955,6 +955,10 @@ PHP 8.4 UPGRADE NOTES
955955
. Improved the performance of FTP uploads up to a factor of 10x for large
956956
uploads.
957957

958+
- Hash:
959+
. Added SSE2 and SHA-NI implementations of SHA-256. This improves the performance
960+
on supported CPUs by ~1.3x (SSE2) and 3x - 5x (SHA-NI).
961+
958962
- MBString:
959963
. The performance of strspn() and strcspn() is greatly improved.
960964
They now run in linear time instead of being bounded by quadratic time.

Zend/zend_cpuinfo.h

+1
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@ typedef enum _zend_cpu_feature {
6464
ZEND_CPU_FEATURE_AVX512F = (1<<16 | ZEND_CPU_EBX_MASK),
6565
ZEND_CPU_FEATURE_AVX512DQ = (1<<17 | ZEND_CPU_EBX_MASK),
6666
ZEND_CPU_FEATURE_AVX512CD = (1<<28 | ZEND_CPU_EBX_MASK),
67+
ZEND_CPU_FEATURE_SHA = (1<<29 | ZEND_CPU_EBX_MASK),
6768
/* intentionally don't support = (1<<30 | ZEND_CPU_EBX_MASK) */
6869
/* intentionally don't support = (1<<31 | ZEND_CPU_EBX_MASK) */
6970

ext/hash/config.m4

+1-1
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ else
3434
PHP_HASH_CFLAGS="$PHP_HASH_CFLAGS -I@ext_srcdir@/$SHA3_DIR -DKeccakP200_excluded -DKeccakP400_excluded -DKeccakP800_excluded -DZEND_ENABLE_STATIC_TSRMLS_CACHE=1"
3535
fi
3636

37-
EXT_HASH_SOURCES="hash.c hash_md.c hash_sha.c hash_ripemd.c hash_haval.c \
37+
EXT_HASH_SOURCES="hash.c hash_md.c hash_sha.c hash_sha_sse2.c hash_sha_ni.c hash_ripemd.c hash_haval.c \
3838
hash_tiger.c hash_gost.c hash_snefru.c hash_whirlpool.c hash_adler32.c \
3939
hash_crc32.c hash_fnv.c hash_joaat.c $EXT_HASH_SHA3_SOURCES
4040
murmur/PMurHash.c murmur/PMurHash128.c hash_murmur.c hash_xxhash.c"

ext/hash/config.w32

+1-1
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ if (PHP_MHASH != 'no') {
99

1010
PHP_HASH = 'yes';
1111

12-
EXTENSION('hash', 'hash.c hash_md.c hash_sha.c hash_ripemd.c hash_haval.c ' +
12+
EXTENSION('hash', 'hash.c hash_md.c hash_sha.c hash_sha_sse2.c hash_sha_ni.c hash_ripemd.c hash_haval.c ' +
1313
'hash_tiger.c hash_gost.c hash_snefru.c hash_whirlpool.c ' +
1414
'hash_adler32.c hash_crc32.c hash_joaat.c hash_fnv.c ' +
1515
'hash_sha3.c hash_murmur.c hash_xxhash.c', false);

ext/hash/hash_sha.c

+19
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717

1818
#include "php_hash.h"
1919
#include "php_hash_sha.h"
20+
#include "Zend/zend_cpuinfo.h"
2021

2122
static const unsigned char PADDING[128] =
2223
{
@@ -160,6 +161,24 @@ PHP_HASH_API void PHP_SHA256InitArgs(PHP_SHA256_CTX * context, ZEND_ATTRIBUTE_UN
160161
*/
161162
static void SHA256Transform(uint32_t state[8], const unsigned char block[64])
162163
{
164+
#if defined(PHP_HASH_INTRIN_SHA_NATIVE)
165+
SHA256_Transform_shani(state, block);
166+
return;
167+
#elif defined(PHP_HASH_INTRIN_SHA_RESOLVER)
168+
if (zend_cpu_supports(ZEND_CPU_FEATURE_SSSE3) && zend_cpu_supports(ZEND_CPU_FEATURE_SHA)) {
169+
SHA256_Transform_shani(state, block);
170+
return;
171+
}
172+
#endif
173+
174+
#if defined(__SSE2__)
175+
uint32_t tmp32[72];
176+
177+
SHA256_Transform_sse2(state, block, &tmp32[0], &tmp32[64]);
178+
ZEND_SECURE_ZERO((unsigned char*) tmp32, sizeof(tmp32));
179+
return;
180+
#endif
181+
163182
uint32_t a = state[0], b = state[1], c = state[2], d = state[3];
164183
uint32_t e = state[4], f = state[5], g = state[6], h = state[7];
165184
uint32_t x[16], T1, T2, W[64];

ext/hash/hash_sha_ni.c

+176
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,176 @@
1+
/*-
2+
* Copyright 2018 Tarsnap Backup Inc.
3+
* All rights reserved.
4+
*
5+
* Redistribution and use in source and binary forms, with or without
6+
* modification, are permitted provided that the following conditions
7+
* are met:
8+
* 1. Redistributions of source code must retain the above copyright
9+
* notice, this list of conditions and the following disclaimer.
10+
* 2. Redistributions in binary form must reproduce the above copyright
11+
* notice, this list of conditions and the following disclaimer in the
12+
* documentation and/or other materials provided with the distribution.
13+
*
14+
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15+
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16+
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17+
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18+
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19+
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20+
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21+
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22+
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23+
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24+
* SUCH DAMAGE.
25+
*/
26+
27+
#include "php_hash.h"
28+
#include "php_hash_sha.h"
29+
30+
#if (defined(__i386__) || defined(__x86_64__)) && defined(HAVE_IMMINTRIN_H)
31+
32+
# include <immintrin.h>
33+
34+
# if PHP_HASH_INTRIN_SHA_RESOLVER
35+
static __m128i be32dec_128(const uint8_t * src) __attribute__((target("ssse3")));
36+
void SHA256_Transform_shani(uint32_t state[PHP_STATIC_RESTRICT 8], const uint8_t block[PHP_STATIC_RESTRICT 64]) __attribute__((target("ssse3,sha")));
37+
# endif
38+
39+
/* Original implementation from libcperciva follows.
40+
*
41+
* Modified to use `PHP_STATIC_RESTRICT` for MSVC compatibility.
42+
*/
43+
44+
/**
45+
* This code uses intrinsics from the following feature sets:
46+
* SHANI: _mm_sha256msg1_epu32, _mm_sha256msg2_epu32, _mm_sha256rnds2_epu32
47+
* SSSE3: _mm_shuffle_epi8, _mm_alignr_epi8
48+
* SSE2: Everything else
49+
*
50+
* The SSSE3 intrinsics could be avoided at a slight cost by using a few SSE2
51+
* instructions in their place; we have not done this since to our knowledge
52+
* there are presently no CPUs which support the SHANI instruction set but do
53+
* not support SSSE3.
54+
*/
55+
56+
/* Load 32-bit big-endian words. */
57+
static __m128i
58+
be32dec_128(const uint8_t * src)
59+
{
60+
const __m128i SHUF = _mm_set_epi8(12, 13, 14, 15, 8, 9, 10, 11,
61+
4, 5, 6, 7, 0, 1, 2, 3);
62+
__m128i x;
63+
64+
/* Load four 32-bit words. */
65+
x = _mm_loadu_si128((const __m128i *)src);
66+
67+
/* Reverse the order of the bytes in each word. */
68+
return (_mm_shuffle_epi8(x, SHUF));
69+
}
70+
71+
/* Convert an unsigned 32-bit immediate into a signed value. */
72+
#define I32(a) ((UINT32_C(a) >= UINT32_C(0x80000000)) ? \
73+
-(int32_t)(UINT32_C(0xffffffff) - UINT32_C(a)) - 1 : (int32_t)INT32_C(a))
74+
75+
/* Load four unsigned 32-bit immediates into a vector register. */
76+
#define IMM4(a, b, c, d) _mm_set_epi32(I32(a), I32(b), I32(c), I32(d))
77+
78+
/* Run four rounds of SHA256. */
79+
#define RND4(S, W, K0, K1, K2, K3) do { \
80+
__m128i M; \
81+
\
82+
/* Add the next four words of message schedule and round constants. */ \
83+
M = _mm_add_epi32(W, IMM4(K3, K2, K1, K0)); \
84+
\
85+
/* Perform two rounds of SHA256, using the low two words in M. */ \
86+
S[1] = _mm_sha256rnds2_epu32(S[1], S[0], M); \
87+
\
88+
/* Shift the two words of M down and perform the next two rounds. */ \
89+
M = _mm_srli_si128(M, 8); \
90+
S[0] = _mm_sha256rnds2_epu32(S[0], S[1], M); \
91+
} while (0)
92+
93+
/* Compute the ith set of four words of message schedule. */
94+
#define MSG4(W, i) do { \
95+
W[(i + 0) % 4] = _mm_sha256msg1_epu32(W[(i + 0) % 4], W[(i + 1) % 4]); \
96+
W[(i + 0) % 4] = _mm_add_epi32(W[(i + 0) % 4], \
97+
_mm_alignr_epi8(W[(i + 3) % 4], W[(i + 2) % 4], 4)); \
98+
W[(i + 0) % 4] = _mm_sha256msg2_epu32(W[(i + 0) % 4], W[(i + 3) % 4]); \
99+
} while (0)
100+
101+
/* Perform 4 rounds of SHA256 and generate more message schedule if needed. */
102+
#define RNDMSG(S, W, i, K0, K1, K2, K3) do { \
103+
RND4(S, W[i % 4], K0, K1, K2, K3); \
104+
if (i < 12) \
105+
MSG4(W, i + 4); \
106+
} while (0)
107+
108+
/**
109+
* SHA256_Transform_shani(state, block):
110+
* Compute the SHA256 block compression function, transforming ${state} using
111+
* the data in ${block}. This implementation uses x86 SHANI and SSSE3
112+
* instructions, and should only be used if CPUSUPPORT_X86_SHANI and _SSSE3
113+
* are defined and cpusupport_x86_shani() and _ssse3() return nonzero.
114+
*/
115+
void
116+
SHA256_Transform_shani(uint32_t state[PHP_STATIC_RESTRICT 8],
117+
const uint8_t block[PHP_STATIC_RESTRICT 64])
118+
{
119+
__m128i S3210, S7654;
120+
__m128i S0123, S4567;
121+
__m128i S0145, S2367;
122+
__m128i W[4];
123+
__m128i S[2];
124+
125+
/* Load state. */
126+
S3210 = _mm_loadu_si128((const __m128i *)&state[0]);
127+
S7654 = _mm_loadu_si128((const __m128i *)&state[4]);
128+
129+
/* Shuffle the 8 32-bit values into the order we need them. */
130+
S0123 = _mm_shuffle_epi32(S3210, 0x1B);
131+
S4567 = _mm_shuffle_epi32(S7654, 0x1B);
132+
S0145 = _mm_unpackhi_epi64(S4567, S0123);
133+
S2367 = _mm_unpacklo_epi64(S4567, S0123);
134+
135+
/* Load input block; this is the start of the message schedule. */
136+
W[0] = be32dec_128(&block[0]);
137+
W[1] = be32dec_128(&block[16]);
138+
W[2] = be32dec_128(&block[32]);
139+
W[3] = be32dec_128(&block[48]);
140+
141+
/* Initialize working variables. */
142+
S[0] = S0145;
143+
S[1] = S2367;
144+
145+
/* Perform 64 rounds, 4 at a time. */
146+
RNDMSG(S, W, 0, 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5);
147+
RNDMSG(S, W, 1, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5);
148+
RNDMSG(S, W, 2, 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3);
149+
RNDMSG(S, W, 3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174);
150+
RNDMSG(S, W, 4, 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc);
151+
RNDMSG(S, W, 5, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da);
152+
RNDMSG(S, W, 6, 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7);
153+
RNDMSG(S, W, 7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967);
154+
RNDMSG(S, W, 8, 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13);
155+
RNDMSG(S, W, 9, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85);
156+
RNDMSG(S, W, 10, 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3);
157+
RNDMSG(S, W, 11, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070);
158+
RNDMSG(S, W, 12, 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5);
159+
RNDMSG(S, W, 13, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3);
160+
RNDMSG(S, W, 14, 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208);
161+
RNDMSG(S, W, 15, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2);
162+
163+
/* Mix local working variables into global state. */
164+
S0145 = _mm_add_epi32(S0145, S[0]);
165+
S2367 = _mm_add_epi32(S2367, S[1]);
166+
167+
/* Shuffle state back to the original word order and store. */
168+
S0123 = _mm_unpackhi_epi64(S2367, S0145);
169+
S4567 = _mm_unpacklo_epi64(S2367, S0145);
170+
S3210 = _mm_shuffle_epi32(S0123, 0x1B);
171+
S7654 = _mm_shuffle_epi32(S4567, 0x1B);
172+
_mm_storeu_si128((__m128i *)&state[0], S3210);
173+
_mm_storeu_si128((__m128i *)&state[4], S7654);
174+
}
175+
176+
#endif

0 commit comments

Comments
 (0)