Commit a543ede

Add intel simd
1 parent ba13656 commit a543ede

4 files changed: +189 additions, -13 deletions


src/field_5x52_impl.h

Lines changed: 91 additions & 0 deletions
@@ -14,6 +14,10 @@
 
 #include "field_5x52_int128_impl.h"
 
+#ifdef X86
+# include <immintrin.h>
+#endif
+
 #ifdef VERIFY
 static void secp256k1_fe_impl_verify(const secp256k1_fe *a) {
     const uint64_t *d = a->n;
@@ -37,10 +41,15 @@ static void secp256k1_fe_impl_get_bounds(secp256k1_fe *r, int m) {
     const uint64_t bound1 = 0xFFFFFFFFFFFFFULL * two_m;
     const uint64_t bound2 = 0x0FFFFFFFFFFFFULL * two_m;
 
+#ifdef __AVX__
+    __m256i vec = _mm256_set1_epi64x(bound1);
+    _mm256_storeu_si256((__m256i *)r->n, vec);
+#else
     r->n[0] = bound1;
     r->n[1] = bound1;
     r->n[2] = bound1;
     r->n[3] = bound1;
+#endif
     r->n[4] = bound2;
 }
 
@@ -239,6 +248,8 @@ static void secp256k1_fe_impl_set_b32_mod(secp256k1_fe *r, const unsigned char *
     limbs[3] = BYTESWAP_64(limbs[3]);
 #endif
 
+    /* TODO: parallelize avx2 */
+
     r->n[0] = (limbs[3] & 0xFFFFFFFFFFFFFULL);
     r->n[1] = (limbs[3] >> 52) | ((limbs[2] & 0xFFFFFFFFFFULL) << 12);
     r->n[2] = (limbs[2] >> 40) | ((limbs[1] & 0xFFFFFFFULL) << 24);
@@ -291,6 +302,10 @@ static void secp256k1_fe_impl_get_b32(unsigned char *r, const secp256k1_fe *a) {
 }
 
 SECP256K1_INLINE static void secp256k1_fe_impl_negate_unchecked(secp256k1_fe *r, const secp256k1_fe *a, int m) {
+#if defined(__AVX__) && defined(__AVX2__)
+    /* load here to mitigate load latency */
+    __m256i vec_a = _mm256_loadu_si256((__m256i *)a->n);
+#endif
     const uint32_t two_m1 = 2 * (m + 1);
     const uint64_t bound1 = 0xFFFFEFFFFFC2FULL * two_m1;
     const uint64_t bound2 = 0xFFFFFFFFFFFFFULL * two_m1;
@@ -303,10 +318,18 @@ SECP256K1_INLINE static void secp256k1_fe_impl_negate_unchecked(secp256k1_fe *r,
 
     /* Due to the properties above, the left hand in the subtractions below is never less than
      * the right hand. */
+#if defined(__AVX__) && defined(__AVX2__)
+    {
+        __m256i vec_bounds = _mm256_setr_epi64x(bound1, bound2, bound2, bound2);
+        __m256i out = _mm256_sub_epi64(vec_bounds, vec_a);
+        _mm256_storeu_si256((__m256i *)r->n, out);
+    }
+#else
     r->n[0] = bound1 - a->n[0];
     r->n[1] = bound2 - a->n[1];
     r->n[2] = bound2 - a->n[2];
     r->n[3] = bound2 - a->n[3];
+#endif
     r->n[4] = bound3 - a->n[4];
 }
 
@@ -339,15 +362,32 @@ SECP256K1_INLINE static void secp256k1_fe_impl_sqr(secp256k1_fe *r, const secp25
 }
 
 SECP256K1_INLINE static void secp256k1_fe_impl_cmov(secp256k1_fe *r, const secp256k1_fe *a, int flag) {
+#if defined(__AVX__) && defined(__AVX2__)
+    /* load here to mitigate load latency */
+    __m256i vec_r = _mm256_loadu_si256((__m256i *)(r->n));
+    __m256i vec_a = _mm256_loadu_si256((__m256i *)(a->n));
+#endif
+
     uint64_t mask0, mask1;
     volatile int vflag = flag;
     SECP256K1_CHECKMEM_CHECK_VERIFY(r->n, sizeof(r->n));
     mask0 = vflag + ~((uint64_t)0);
     mask1 = ~mask0;
+
+#if defined(__AVX__) && defined(__AVX2__)
+    {
+        __m256i vec_mask0 = _mm256_set1_epi64x(mask0);
+        __m256i vec_mask1 = _mm256_set1_epi64x(mask1);
+        vec_r = _mm256_and_si256(vec_r, vec_mask0);
+        vec_a = _mm256_and_si256(vec_a, vec_mask1);
+        _mm256_storeu_si256((__m256i *)r->n, _mm256_or_si256(vec_r, vec_a));
+    }
+#else
     r->n[0] = (r->n[0] & mask0) | (a->n[0] & mask1);
     r->n[1] = (r->n[1] & mask0) | (a->n[1] & mask1);
     r->n[2] = (r->n[2] & mask0) | (a->n[2] & mask1);
     r->n[3] = (r->n[3] & mask0) | (a->n[3] & mask1);
+#endif
     r->n[4] = (r->n[4] & mask0) | (a->n[4] & mask1);
 }
 
@@ -418,19 +458,42 @@ static SECP256K1_INLINE void secp256k1_fe_storage_cmov(secp256k1_fe_storage *r,
 }
 
 static void secp256k1_fe_impl_to_storage(secp256k1_fe_storage *r, const secp256k1_fe *a) {
+#if defined(__AVX__) && defined(__AVX2__)
+    __m256i limbs_0123 = _mm256_loadu_si256((__m256i *)a->n);
+    __m256i limbs_1234 = _mm256_loadu_si256((__m256i *)(a->n + 1));
+    const __m256i shift_lhs = _mm256_setr_epi64x(0, 12, 24, 36); /* TODO: precompute */
+    const __m256i shift_rhs = _mm256_setr_epi64x(52, 40, 28, 16); /* TODO: precompute */
+    __m256i rhs = _mm256_sllv_epi64(limbs_1234, shift_rhs);
+    __m256i lhs = _mm256_srlv_epi64(limbs_0123, shift_lhs);
+    _mm256_storeu_si256((__m256i *)r->n, _mm256_or_si256(lhs, rhs));
+#else
     r->n[0] = a->n[0] | a->n[1] << 52;
     r->n[1] = a->n[1] >> 12 | a->n[2] << 40;
     r->n[2] = a->n[2] >> 24 | a->n[3] << 28;
     r->n[3] = a->n[3] >> 36 | a->n[4] << 16;
+#endif
 }
 
 static SECP256K1_INLINE void secp256k1_fe_impl_from_storage(secp256k1_fe *r, const secp256k1_fe_storage *a) {
     const uint64_t a0 = a->n[0], a1 = a->n[1], a2 = a->n[2], a3 = a->n[3];
 
+#if defined(__AVX__) && defined(__AVX2__)
+    {
+        __m256i limbs_0123 = _mm256_setr_epi64x(a0, a1, a2, a3);
+        __m256i limbs_0012 = _mm256_setr_epi64x(a0, a0, a1, a2);
+        const __m256i shift_lhs = _mm256_setr_epi64x(64, 52, 40, 28); /* TODO: precompute */
+        const __m256i shift_rhs = _mm256_setr_epi64x(0, 12, 24, 36); /* TODO: precompute */
+        const __m256i mask52 = _mm256_set1_epi64x(0xFFFFFFFFFFFFFULL); /* TODO: precompute */
+        __m256i rhs = _mm256_and_si256(_mm256_sllv_epi64(limbs_0123, shift_rhs), mask52);
+        __m256i lhs = _mm256_srlv_epi64(limbs_0012, shift_lhs);
+        _mm256_storeu_si256((__m256i*)r->n, _mm256_or_si256(lhs, rhs));
+    }
+#else
     r->n[0] = a0 & 0xFFFFFFFFFFFFFULL;
     r->n[1] = a0 >> 52 | ((a1 << 12) & 0xFFFFFFFFFFFFFULL);
     r->n[2] = a1 >> 40 | ((a2 << 24) & 0xFFFFFFFFFFFFFULL);
     r->n[3] = a2 >> 28 | ((a3 << 36) & 0xFFFFFFFFFFFFFULL);
+#endif
     r->n[4] = a3 >> 16;
 }
 
@@ -447,21 +510,49 @@ static void secp256k1_fe_from_signed62(secp256k1_fe *r, const secp256k1_modinv64
     VERIFY_CHECK(a3 >> 62 == 0);
     VERIFY_CHECK(a4 >> 8 == 0);
 
+#if defined(__AVX__) && defined(__AVX2__)
+    {
+        __m256i limbs_0123 = _mm256_setr_epi64x(a0, a1, a2, a3);
+        __m256i limbs_0012 = _mm256_setr_epi64x(a0, a0, a1, a2);
+        const __m256i shift_lhs = _mm256_setr_epi64x(64, 52, 42, 32); /*TODO: precompute */
+        const __m256i shift_rhs = _mm256_setr_epi64x(0, 10, 20, 30); /*TODO: precompute */
+        const __m256i mask52 = _mm256_set1_epi64x(M52); /*TODO: precompute */
+        __m256i rhs = _mm256_sllv_epi64(limbs_0123, shift_rhs);
+        __m256i lhs = _mm256_srlv_epi64(limbs_0012, shift_lhs);
+        __m256i out = _mm256_or_si256(lhs, rhs);
+        _mm256_storeu_si256((__m256i*)r->n, _mm256_and_si256(out, mask52));
+    }
+#else
     r->n[0] = a0 & M52;
     r->n[1] = (a0 >> 52 | a1 << 10) & M52;
     r->n[2] = (a1 >> 42 | a2 << 20) & M52;
     r->n[3] = (a2 >> 32 | a3 << 30) & M52;
+#endif
     r->n[4] = (a3 >> 22 | a4 << 40);
 }
 
 static void secp256k1_fe_to_signed62(secp256k1_modinv64_signed62 *r, const secp256k1_fe *a) {
     const uint64_t M62 = UINT64_MAX >> 2;
     const uint64_t a0 = a->n[0], a1 = a->n[1], a2 = a->n[2], a3 = a->n[3], a4 = a->n[4];
 
+#if defined(__AVX__) && defined(__AVX2__)
+    {
+        __m256i limbs_0123 = _mm256_setr_epi64x(a0, a1, a2, a3);
+        __m256i limbs_1234 = _mm256_setr_epi64x(a1, a2, a3, a4);
+        const __m256i shift_lhs = _mm256_setr_epi64x(0, 10, 20, 30); /*TODO: precompute */
+        const __m256i shift_rhs = _mm256_setr_epi64x(52, 42, 32, 22); /*TODO: precompute */
+        const __m256i mask62 = _mm256_set1_epi64x(M62); /*TODO: precompute */
+        __m256i lhs = _mm256_srlv_epi64(limbs_0123, shift_lhs);
+        __m256i rhs = _mm256_sllv_epi64(limbs_1234, shift_rhs);
+        __m256i out = _mm256_or_si256(lhs, rhs);
+        _mm256_storeu_si256((__m256i *)r->v, _mm256_and_si256(out, mask62));
+    }
+#else
     r->v[0] = (a0 | a1 << 52) & M62;
     r->v[1] = (a1 >> 10 | a2 << 42) & M62;
     r->v[2] = (a2 >> 20 | a3 << 32) & M62;
     r->v[3] = (a3 >> 30 | a4 << 22) & M62;
+#endif
     r->v[4] = a4 >> 40;
 }
 
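Note on the repacking pattern: the to_storage/from_storage and signed62 conversions above all compute output word i as (limb[i] >> c[i]) | (limb[i+1] << d[i]), which maps directly onto AVX2's per-lane variable shifts (_mm256_srlv_epi64 / _mm256_sllv_epi64); shift counts of 64 or more produce zero, which the 64-shift lanes above rely on. Below is a minimal standalone sketch of that pattern for the 5x52 -> 4x64 packing, with hypothetical names, checked against the plain scalar code; it is an illustration under those assumptions, not part of the commit. Build with e.g. gcc -mavx2 -O2.

#include <stdint.h>
#include <stdio.h>
#include <immintrin.h>

/* Scalar reference: pack five 52-bit limbs into four 64-bit words. */
static void pack_scalar(uint64_t out[4], const uint64_t n[5]) {
    out[0] = n[0] | n[1] << 52;
    out[1] = n[1] >> 12 | n[2] << 40;
    out[2] = n[2] >> 24 | n[3] << 28;
    out[3] = n[3] >> 36 | n[4] << 16;
}

/* Same packing with AVX2 per-lane variable shifts. */
static void pack_avx2(uint64_t out[4], const uint64_t n[5]) {
    __m256i limbs_0123 = _mm256_loadu_si256((const __m256i *)n);       /* n[0..3] */
    __m256i limbs_1234 = _mm256_loadu_si256((const __m256i *)(n + 1)); /* n[1..4] */
    const __m256i shift_r = _mm256_setr_epi64x(0, 12, 24, 36);  /* per-lane right shifts */
    const __m256i shift_l = _mm256_setr_epi64x(52, 40, 28, 16); /* per-lane left shifts */
    __m256i lo = _mm256_srlv_epi64(limbs_0123, shift_r);
    __m256i hi = _mm256_sllv_epi64(limbs_1234, shift_l);
    _mm256_storeu_si256((__m256i *)out, _mm256_or_si256(lo, hi));
}

int main(void) {
    /* arbitrary limbs, each below 2^52 */
    const uint64_t n[5] = { 0xFFFFEFFFFFC2EULL, 0x123456789ABCDULL, 0xCBA9876543210ULL,
                            0x0011223344556ULL, 0x0FFFFFFFFFFFFULL };
    uint64_t a[4], b[4];
    int i;
    pack_scalar(a, n);
    pack_avx2(b, n);
    for (i = 0; i < 4; i++) {
        printf("%016llx %016llx %s\n", (unsigned long long)a[i],
               (unsigned long long)b[i], a[i] == b[i] ? "ok" : "MISMATCH");
    }
    return 0;
}
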
src/hash_impl.h

Lines changed: 4 additions & 0 deletions
@@ -14,6 +14,10 @@
 #include <stdint.h>
 #include <string.h>
 
+#ifdef X86
+# include <immintrin.h>
+#endif
+
 #define Ch(x,y,z) ((z) ^ ((x) & ((y) ^ (z))))
 #define Maj(x,y,z) (((x) & (y)) | ((z) & ((x) | (y))))
 #define Sigma0(x) (((x) >> 2 | (x) << 30) ^ ((x) >> 13 | (x) << 19) ^ ((x) >> 22 | (x) << 10))
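
Note on the X86 guard: the macro that gates this include (and the identical guards in field_5x52_impl.h and scalar_4x64_impl.h) is not defined anywhere in this diff, so it presumably comes from the build system (e.g. a -DX86 compiler flag). As a hedged sketch only, not part of the commit, such a guard could alternatively be derived from compiler-predefined architecture macros:

/* Sketch: assume X86 is normally supplied by the build system (e.g. -DX86).
 * A header-only fallback could derive it from predefined macros instead. */
#if !defined(X86) && (defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(_M_IX86))
# define X86 1
#endif

#ifdef X86
# include <immintrin.h>
#endif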

src/scalar_4x64_impl.h

Lines changed: 89 additions & 13 deletions
@@ -12,6 +12,10 @@
 #include "modinv64_impl.h"
 #include "util.h"
 
+#ifdef X86
+# include <immintrin.h>
+#endif
+
 /* Limbs of the secp256k1 order. */
 #define SECP256K1_N_0 ((uint64_t)0xBFD25E8CD0364141ULL)
 #define SECP256K1_N_1 ((uint64_t)0xBAAEDCE6AF48A03BULL)
@@ -143,10 +147,25 @@ static void secp256k1_scalar_cadd_bit(secp256k1_scalar *r, unsigned int bit, int
 
 static void secp256k1_scalar_set_b32(secp256k1_scalar *r, const unsigned char *b32, int *overflow) {
     int over;
+
+#if defined(__AVX__) && defined(__AVX2__)
+    {
+        __m256i input = _mm256_loadu_si256((const __m256i*)b32);
+        input = _mm256_permute4x64_epi64(input, _MM_SHUFFLE(0,1,2,3));
+        const __m256i bswap_mask = _mm256_setr_epi8( /* TODO: precompute */
+            7,6,5,4,3,2,1,0,
+            15,14,13,12,11,10,9,8,
+            23,22,21,20,19,18,17,16,
+            31,30,29,28,27,26,25,24);
+        __m256i output = _mm256_shuffle_epi8(input, bswap_mask);
+        _mm256_storeu_si256((__m256i*)r->d, output);
+    }
+#else
     r->d[0] = secp256k1_read_be64(&b32[24]);
     r->d[1] = secp256k1_read_be64(&b32[16]);
     r->d[2] = secp256k1_read_be64(&b32[8]);
     r->d[3] = secp256k1_read_be64(&b32[0]);
+#endif
     over = secp256k1_scalar_reduce(r, secp256k1_scalar_check_overflow(r));
     if (overflow) {
         *overflow = over;
@@ -158,6 +177,8 @@ static void secp256k1_scalar_set_b32(secp256k1_scalar *r, const unsigned char *b
 static void secp256k1_scalar_get_b32(unsigned char *bin, const secp256k1_scalar* a) {
     SECP256K1_SCALAR_VERIFY(a);
 
+    /* TODO: parallelize */
+
     secp256k1_write_be64(&bin[0], a->d[3]);
     secp256k1_write_be64(&bin[8], a->d[2]);
     secp256k1_write_be64(&bin[16], a->d[1]);
@@ -166,7 +187,6 @@ static void secp256k1_scalar_get_b32(unsigned char *bin, const secp256k1_scalar*
 
 SECP256K1_INLINE static int secp256k1_scalar_is_zero(const secp256k1_scalar *a) {
     SECP256K1_SCALAR_VERIFY(a);
-
     return (a->d[0] | a->d[1] | a->d[2] | a->d[3]) == 0;
 }
 
@@ -882,8 +902,16 @@ static void secp256k1_scalar_split_128(secp256k1_scalar *r1, secp256k1_scalar *r
 SECP256K1_INLINE static int secp256k1_scalar_eq(const secp256k1_scalar *a, const secp256k1_scalar *b) {
     SECP256K1_SCALAR_VERIFY(a);
     SECP256K1_SCALAR_VERIFY(b);
-
+#if defined(__AVX__) && defined(__AVX2__)
+    {
+        __m256i vec_a = _mm256_loadu_si256((__m256i *)a->d);
+        __m256i vec_b = _mm256_loadu_si256((__m256i *)b->d);
+        __m256i vec_xor = _mm256_xor_si256(vec_a, vec_b);
+        return _mm256_testz_si256(vec_xor, vec_xor);
+    }
+#else
     return ((a->d[0] ^ b->d[0]) | (a->d[1] ^ b->d[1]) | (a->d[2] ^ b->d[2]) | (a->d[3] ^ b->d[3])) == 0;
+#endif
 }
 
 SECP256K1_INLINE static void secp256k1_scalar_mul_shift_var(secp256k1_scalar *r, const secp256k1_scalar *a, const secp256k1_scalar *b, unsigned int shift) {
@@ -899,6 +927,9 @@ SECP256K1_INLINE static void secp256k1_scalar_mul_shift_var(secp256k1_scalar *r,
     shiftlimbs = shift >> 6;
     shiftlow = shift & 0x3F;
     shifthigh = 64 - shiftlow;
+
+    /* TODO: parallelize */
+
     r->d[0] = shift < 512 ? (l[0 + shiftlimbs] >> shiftlow | (shift < 448 && shiftlow ? (l[1 + shiftlimbs] << shifthigh) : 0)) : 0;
     r->d[1] = shift < 448 ? (l[1 + shiftlimbs] >> shiftlow | (shift < 384 && shiftlow ? (l[2 + shiftlimbs] << shifthigh) : 0)) : 0;
     r->d[2] = shift < 384 ? (l[2 + shiftlimbs] >> shiftlow | (shift < 320 && shiftlow ? (l[3 + shiftlimbs] << shifthigh) : 0)) : 0;
@@ -909,37 +940,68 @@ SECP256K1_INLINE static void secp256k1_scalar_mul_shift_var(secp256k1_scalar *r,
 }
 
 static SECP256K1_INLINE void secp256k1_scalar_cmov(secp256k1_scalar *r, const secp256k1_scalar *a, int flag) {
+#if defined(__AVX__) && defined(__AVX2__)
+    /* load here to mitigate load latency */
+    __m256i vec_r = _mm256_loadu_si256((__m256i *)(r->d));
+    __m256i vec_a = _mm256_loadu_si256((__m256i *)(a->d));
+#endif
+
     uint64_t mask0, mask1;
     volatile int vflag = flag;
     SECP256K1_SCALAR_VERIFY(a);
     SECP256K1_CHECKMEM_CHECK_VERIFY(r->d, sizeof(r->d));
 
     mask0 = vflag + ~((uint64_t)0);
     mask1 = ~mask0;
+
+#if defined(__AVX__) && defined(__AVX2__)
+    {
+        __m256i vec_mask0 = _mm256_set1_epi64x(mask0);
+        __m256i vec_mask1 = _mm256_set1_epi64x(mask1);
+        vec_r = _mm256_and_si256(vec_r, vec_mask0);
+        vec_a = _mm256_and_si256(vec_a, vec_mask1);
+        _mm256_storeu_si256((__m256i *)(r->d), _mm256_or_si256(vec_r, vec_a));
+    }
+#else
     r->d[0] = (r->d[0] & mask0) | (a->d[0] & mask1);
     r->d[1] = (r->d[1] & mask0) | (a->d[1] & mask1);
     r->d[2] = (r->d[2] & mask0) | (a->d[2] & mask1);
     r->d[3] = (r->d[3] & mask0) | (a->d[3] & mask1);
+#endif
 
     SECP256K1_SCALAR_VERIFY(r);
 }
 
 static void secp256k1_scalar_from_signed62(secp256k1_scalar *r, const secp256k1_modinv64_signed62 *a) {
-    const uint64_t a0 = a->v[0], a1 = a->v[1], a2 = a->v[2], a3 = a->v[3], a4 = a->v[4];
-
     /* The output from secp256k1_modinv64{_var} should be normalized to range [0,modulus), and
      * have limbs in [0,2^62). The modulus is < 2^256, so the top limb must be below 2^(256-62*4).
      */
-    VERIFY_CHECK(a0 >> 62 == 0);
-    VERIFY_CHECK(a1 >> 62 == 0);
-    VERIFY_CHECK(a2 >> 62 == 0);
-    VERIFY_CHECK(a3 >> 62 == 0);
-    VERIFY_CHECK(a4 >> 8 == 0);
+    VERIFY_CHECK(a->v[0] >> 62 == 0);
+    VERIFY_CHECK(a->v[1] >> 62 == 0);
+    VERIFY_CHECK(a->v[2] >> 62 == 0);
+    VERIFY_CHECK(a->v[3] >> 62 == 0);
+    VERIFY_CHECK(a->v[4] >> 8 == 0);
+
+#if defined(__AVX__) && defined(__AVX2__)
+    {
+        __m256i limbs_0123 = _mm256_loadu_si256((__m256i *)a->v);
+        __m256i limbs_1234 = _mm256_loadu_si256((__m256i *)(a->v + 1));
+        const __m256i shift_lhs = _mm256_setr_epi64x(0, 2, 4, 6); /* TODO: precompute */
+        const __m256i shift_rhs = _mm256_setr_epi64x(62, 60, 58, 56); /* TODO: precompute */
+        __m256i lhs = _mm256_srlv_epi64(limbs_0123, shift_lhs);
+        __m256i rhs = _mm256_sllv_epi64(limbs_1234, shift_rhs);
+        _mm256_storeu_si256((__m256i *)(r->d), _mm256_or_si256(lhs, rhs));
+    }
+#else
+    {
+        const uint64_t a0 = a->v[0], a1 = a->v[1], a2 = a->v[2], a3 = a->v[3], a4 = a->v[4];
 
-    r->d[0] = a0 | a1 << 62;
-    r->d[1] = a1 >> 2 | a2 << 60;
-    r->d[2] = a2 >> 4 | a3 << 58;
-    r->d[3] = a3 >> 6 | a4 << 56;
+        r->d[0] = a0 | a1 << 62;
+        r->d[1] = a1 >> 2 | a2 << 60;
+        r->d[2] = a2 >> 4 | a3 << 58;
+        r->d[3] = a3 >> 6 | a4 << 56;
+    }
+#endif
 
     SECP256K1_SCALAR_VERIFY(r);
 }
@@ -949,10 +1011,24 @@ static void secp256k1_scalar_to_signed62(secp256k1_modinv64_signed62 *r, const s
     const uint64_t a0 = a->d[0], a1 = a->d[1], a2 = a->d[2], a3 = a->d[3];
     SECP256K1_SCALAR_VERIFY(a);
 
+#if defined(__AVX__) && defined(__AVX2__)
+    {
+        __m256i limbs_0012 = _mm256_setr_epi64x(a0, a0, a1, a2);
+        __m256i limbs_0123 = _mm256_setr_epi64x(a0, a1, a2, a3);
+        const __m256i shift_lhs = _mm256_setr_epi64x(0, 62, 60, 58); /*TODO: precompute */
+        const __m256i shift_rhs = _mm256_setr_epi64x(64, 2, 4, 6); /*TODO: precompute */
+        const __m256i mask62 = _mm256_set1_epi64x(M62); /*TODO: precompute */
+        __m256i lhs = _mm256_srlv_epi64(limbs_0012, shift_lhs);
+        __m256i rhs = _mm256_sllv_epi64(limbs_0123, shift_rhs);
+        __m256i out = _mm256_or_si256(lhs, rhs);
+        _mm256_storeu_si256((__m256i *)r->v, _mm256_and_si256(out, mask62));
+    }
+#else
     r->v[0] = a0 & M62;
     r->v[1] = (a0 >> 62 | a1 << 2) & M62;
     r->v[2] = (a1 >> 60 | a2 << 4) & M62;
     r->v[3] = (a2 >> 58 | a3 << 6) & M62;
+#endif
     r->v[4] = a3 >> 56;
 }
 
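Note on the big-endian load: the trickiest transform in this file is the 32-byte big-endian load added to secp256k1_scalar_set_b32. _mm256_permute4x64_epi64 with _MM_SHUFFLE(0,1,2,3) reverses the order of the four 64-bit lanes, and _mm256_shuffle_epi8 then byte-reverses each lane; shuffle_epi8 indexes within each 128-bit half and only uses the low four bits of every control byte, which is why the 16..31 indices in the commit's mask behave the same as 0..15. Below is a standalone sketch of the same idea (hypothetical names, not the commit's code), checked against portable big-endian reads; build with e.g. gcc -mavx2 -O2.

#include <stdint.h>
#include <stdio.h>
#include <immintrin.h>

/* Portable big-endian 64-bit read, for reference. */
static uint64_t read_be64(const unsigned char *p) {
    uint64_t x = 0;
    int i;
    for (i = 0; i < 8; i++) x = (x << 8) | p[i];
    return x;
}

/* d[0] = be64(b32[24..31]), d[1] = be64(b32[16..23]), d[2] = be64(b32[8..15]), d[3] = be64(b32[0..7]). */
static void load_be256_avx2(uint64_t d[4], const unsigned char *b32) {
    const __m256i bswap = _mm256_setr_epi8(
        7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8,
        7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8);
    __m256i in = _mm256_loadu_si256((const __m256i *)b32);
    in = _mm256_permute4x64_epi64(in, _MM_SHUFFLE(0, 1, 2, 3));         /* lane order 3,2,1,0 */
    _mm256_storeu_si256((__m256i *)d, _mm256_shuffle_epi8(in, bswap));  /* byte-swap each lane */
}

int main(void) {
    unsigned char b32[32];
    uint64_t want[4], got[4];
    int i;
    for (i = 0; i < 32; i++) b32[i] = (unsigned char)(i * 7 + 3);
    want[0] = read_be64(&b32[24]);
    want[1] = read_be64(&b32[16]);
    want[2] = read_be64(&b32[8]);
    want[3] = read_be64(&b32[0]);
    load_be256_avx2(got, b32);
    for (i = 0; i < 4; i++) {
        printf("%d: %016llx %016llx %s\n", i, (unsigned long long)want[i],
               (unsigned long long)got[i], want[i] == got[i] ? "ok" : "MISMATCH");
    }
    return 0;
}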