Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 10 additions & 2 deletions backends/zk-cuda-backend/cuda/include/curve.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,13 @@ __host__ __device__ void fp2_zero(Fp2 &a);

// G1 point: (x, y) coordinates in Fp
// Curve equation: y^2 = x^3 + b (short Weierstrass form with a = 0)
struct G1Affine {
//
// alignas(sizeof(uint64_t)): The bool infinity field causes the struct to be
// padded to the largest field alignment (4 bytes in 32-bit limb mode, 8 bytes
// in 64-bit). Forcing alignment to sizeof(uint64_t) ensures
// sizeof(G1Affine)==120 in both modes, matching the Rust FFI bindings which
// are always generated from the 64-bit layout regardless of LIMB_BITS_CONFIG.
struct alignas(sizeof(uint64_t)) G1Affine {
Fp x;
Fp y;
bool infinity; // true if point at infinity (identity element)
Expand All @@ -36,7 +42,9 @@ struct G1Affine {

// G2 point: (x, y) coordinates in Fp2
// Curve equation: y^2 = x^3 + b' (twisted curve over Fp2)
struct G2Affine {
//
// alignas(sizeof(uint64_t)): same ABI-stability reason as G1Affine above.
struct alignas(sizeof(uint64_t)) G2Affine {
Fp2 x;
Fp2 y;
bool infinity; // true if point at infinity (identity element)
Expand Down
18 changes: 17 additions & 1 deletion backends/zk-cuda-backend/cuda/include/fp.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
// Supported values: 32, 64.
// ============================================================================
#ifndef LIMB_BITS_CONFIG
#define LIMB_BITS_CONFIG 64
#define LIMB_BITS_CONFIG 32
#endif

#if LIMB_BITS_CONFIG == 64
Comment thread
pdroalves marked this conversation as resolved.
Expand Down Expand Up @@ -209,6 +209,17 @@ __host__ __device__ void fp_add(Fp &c, const Fp &a, const Fp &b);
// MONTGOMERY: Both inputs and output must be in Montgomery form
__host__ __device__ void fp_sub(Fp &c, const Fp &a, const Fp &b);

// Lazy addition: c = a + b, output in [0, 2p) for inputs in [0, p).
// Skips the final conditional subtraction of fp_add.
// Safe as input to fp_mont_mul (CIOS accepts [0, 2p)); NOT safe for final
// results or as input to fp_sub/fp_neg which require [0, p) inputs.
__host__ __device__ void fp_add_lazy(Fp &c, const Fp &a, const Fp &b);

// Lazy subtraction: c ≡ a - b (mod p), output in [0, 2p) for inputs in [0, p).
// Adds p unconditionally, skipping the borrow-select of fp_sub.
// Same safety concerns as fp_add_lazy.
__host__ __device__ void fp_sub_lazy(Fp &c, const Fp &a, const Fp &b);

// Multiplication: c = a * b (without reduction)
// "Raw" means the operation is performed without modular reduction modulo p.
// The result is stored in double-width (2*FP_LIMBS limbs) and may be >= p.
Expand All @@ -225,6 +236,11 @@ __host__ __device__ void fp_mont_reduce(Fp &c, const UNSIGNED_LIMB *a);
// Both a and b are in Montgomery form, result is in Montgomery form
__host__ __device__ void fp_mont_mul(Fp &c, const Fp &a, const Fp &b);

// Montgomery squaring: c = (a^2 * R_INV) mod p
// Both input and output in Montgomery form.
// On device uses a triangular MAD chain (fewer multiplications).
__host__ __device__ void fp_mont_sqr(Fp &c, const Fp &a);

// CONVERSION: Input is normal form, output is Montgomery form
__host__ __device__ void fp_to_montgomery(Fp &c, const Fp &a);

Expand Down
7 changes: 6 additions & 1 deletion backends/zk-cuda-backend/cuda/include/fp2.h
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,11 @@ __host__ __device__ void fp2_add(Fp2 &c, const Fp2 &a, const Fp2 &b);
// Subtraction: c = a - b
__host__ __device__ void fp2_sub(Fp2 &c, const Fp2 &a, const Fp2 &b);

// Lazy add/sub: each component output in [0, 2p) for inputs in [0, p).
// Safe as input to fp2_mont_mul; same contract as fp_add_lazy / fp_sub_lazy.
__host__ __device__ void fp2_add_lazy(Fp2 &c, const Fp2 &a, const Fp2 &b);
__host__ __device__ void fp2_sub_lazy(Fp2 &c, const Fp2 &a, const Fp2 &b);

// Multiplication: c = a * b
// (a0 + a1*i) * (b0 + b1*i) = (a0*b0 - a1*b1) + (a0*b1 + a1*b0)*i
// NOTE: Assumes inputs are in normal form and converts to/from Montgomery
Expand All @@ -84,7 +89,7 @@ __host__ __device__ void fp2_mont_mul(Fp2 &c, const Fp2 &a, const Fp2 &b);
// Montgomery squaring: c = a^2 (all in Montgomery form)
// Uses the complex-squaring identity: c0 = (a0+a1)(a0-a1), c1 = 2*a0*a1
// Only 2 Fp multiplications vs 3 for fp2_mont_mul(c, a, a).
// NOTE: All inputs and outputs are in Montgomery form (no conversions)
// NOTE: All inputs and outputs should be in Montgomery form
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What about outputs?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, both should be in a Montgomery form

__host__ __device__ void fp2_mont_square(Fp2 &c, const Fp2 &a);

// Squaring: c = a^2
Expand Down
30 changes: 18 additions & 12 deletions backends/zk-cuda-backend/cuda/src/curve.cu
Original file line number Diff line number Diff line change
Expand Up @@ -1413,7 +1413,7 @@ __host__ __device__ void projective_point_add(G1Projective &result,
u = Y2Z1 - Y1Z2;

// uu = u^2
fp_mont_mul(uu, u, u);
fp_mont_sqr(uu, u);

// v = X2 * Z1 - X1 * Z2 = X2*Z1 - X1Z2
Fp X2Z1;
Expand All @@ -1428,7 +1428,7 @@ __host__ __device__ void projective_point_add(G1Projective &result,
}

// vv = v^2
fp_mont_mul(vv, v, v);
fp_mont_sqr(vv, v);
// vvv = v * vv
fp_mont_mul(vvv, v, vv);

Expand Down Expand Up @@ -1568,9 +1568,9 @@ __host__ __device__ void projective_mixed_add(G1Projective &result,
}

// uu = u^2
fp_mont_mul(uu, u, u);
fp_mont_sqr(uu, u);
// vv = v^2
fp_mont_mul(vv, v, v);
fp_mont_sqr(vv, v);
// vvv = v * vv
fp_mont_mul(vvv, v, vv);

Expand Down Expand Up @@ -1692,7 +1692,7 @@ __host__ __device__ void projective_point_double(G1Projective &result,

// A = 3 * X^2
Fp X_sq, A;
fp_mont_mul(X_sq, p.X, p.X);
fp_mont_sqr(X_sq, p.X);
fp_mul3(A, X_sq);

// B = Y * Z
Expand All @@ -1706,7 +1706,7 @@ __host__ __device__ void projective_point_double(G1Projective &result,

// D = A^2 - 8*C
Fp A_sq, eight_C;
fp_mont_mul(A_sq, A, A);
fp_mont_sqr(A_sq, A);
fp_mul8(eight_C, C);
Fp D = A_sq - eight_C;

Expand All @@ -1716,14 +1716,16 @@ __host__ __device__ void projective_point_double(G1Projective &result,
fp_double(result.X, BD);

// Y3 = A * (4*C - D) - 8 * Y^2 * B^2
Fp four_C, A_times_diff;
Fp four_C, four_C_minus_D, A_times_diff;
fp_mul4(four_C, C);
Fp four_C_minus_D = four_C - D;
// Lazy sub: four_C_minus_D feeds fp_mont_mul, so skip the conditional
// subtract and output in [0, 2p) instead of [0, p).
fp_sub_lazy(four_C_minus_D, four_C, D);
fp_mont_mul(A_times_diff, A, four_C_minus_D);

Fp Y_sq, B_sq, Y_sq_B_sq, eight_Y_sq_B_sq;
fp_mont_mul(Y_sq, p.Y, p.Y);
fp_mont_mul(B_sq, B, B);
fp_mont_sqr(Y_sq, p.Y);
fp_mont_sqr(B_sq, B);
fp_mont_mul(Y_sq_B_sq, Y_sq, B_sq);
fp_mul8(eight_Y_sq_B_sq, Y_sq_B_sq);
result.Y = A_times_diff - eight_Y_sq_B_sq;
Expand Down Expand Up @@ -1773,9 +1775,13 @@ __host__ __device__ void projective_point_double(G2Projective &result,
fp2_double(result.X, BD);

// Y3 = A * (4*C - D) - 8 * Y^2 * B^2
Fp2 four_C, A_times_diff;
Fp2 four_C, four_C_minus_D, A_times_diff;
fp2_mul4(four_C, C);
Fp2 four_C_minus_D = four_C - D;

// We can't use lazy subtraction here: with the Karatsuba path in fp2, inputs
// in [0, 2p) would produce values in [0, 4p) instead of [0, 2p), which would
// break the final result.
fp2_sub(four_C_minus_D, four_C, D);
fp2_mont_mul(A_times_diff, A, four_C_minus_D);

Fp2 Y_sq, B_sq, Y_sq_B_sq, eight_Y_sq_B_sq;
Expand Down
Loading
Loading