Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 10 additions & 2 deletions backends/zk-cuda-backend/cuda/include/curve.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,13 @@ __host__ __device__ void fp2_zero(Fp2 &a);

// G1 point: (x, y) coordinates in Fp
// Curve equation: y^2 = x^3 + b (short Weierstrass form with a = 0)
struct G1Affine {
//
// alignas(sizeof(uint64_t)): The bool infinity field causes the struct to be
// padded to the largest field alignment (4 bytes in 32-bit limb mode, 8 bytes
// in 64-bit). Forcing alignment to sizeof(uint64_t) ensures
// sizeof(G1Affine)==120 in both modes, matching the Rust FFI bindings which
// are always generated from the 64-bit layout regardless of LIMB_BITS_CONFIG.
struct alignas(sizeof(uint64_t)) G1Affine {
Fp x;
Fp y;
bool infinity; // true if point at infinity (identity element)
Expand All @@ -36,7 +42,9 @@ struct G1Affine {

// G2 point: (x, y) coordinates in Fp2
// Curve equation: y^2 = x^3 + b' (twisted curve over Fp2)
struct G2Affine {
//
// alignas(sizeof(uint64_t)): same ABI-stability reason as G1Affine above.
struct alignas(sizeof(uint64_t)) G2Affine {
Fp2 x;
Fp2 y;
bool infinity; // true if point at infinity (identity element)
Expand Down
18 changes: 17 additions & 1 deletion backends/zk-cuda-backend/cuda/include/fp.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
// Supported values: 32, 64.
// ============================================================================
#ifndef LIMB_BITS_CONFIG
#define LIMB_BITS_CONFIG 64
#define LIMB_BITS_CONFIG 32
#endif

#if LIMB_BITS_CONFIG == 64
Comment thread
pdroalves marked this conversation as resolved.
Expand Down Expand Up @@ -209,6 +209,17 @@ __host__ __device__ void fp_add(Fp &c, const Fp &a, const Fp &b);
// MONTGOMERY: Both inputs and output must be in Montgomery form
__host__ __device__ void fp_sub(Fp &c, const Fp &a, const Fp &b);

// Lazy addition: c = a + b, output in [0, 2p) for inputs in [0, p).
// Skips the final conditional subtraction of fp_add.
// Safe as input to fp_mont_mul (CIOS accepts [0, 2p)); NOT safe for final
// results or as input to fp_sub/fp_neg which require [0, p) inputs.
__host__ __device__ void fp_add_lazy(Fp &c, const Fp &a, const Fp &b);

// Lazy subtraction: c ≡ a - b (mod p), output in [0, 2p) for inputs in [0, p).
// Adds p unconditionally, skipping the borrow-select of fp_sub.
// Same safety concerns as fp_add_lazy.
__host__ __device__ void fp_sub_lazy(Fp &c, const Fp &a, const Fp &b);

// Multiplication: c = a * b (without reduction)
// "Raw" means the operation is performed without modular reduction modulo p.
// The result is stored in double-width (2*FP_LIMBS limbs) and may be >= p.
Expand All @@ -225,6 +236,11 @@ __host__ __device__ void fp_mont_reduce(Fp &c, const UNSIGNED_LIMB *a);
// Both a and b are in Montgomery form, result is in Montgomery form
__host__ __device__ void fp_mont_mul(Fp &c, const Fp &a, const Fp &b);

// Montgomery squaring: c = (a^2 * R_INV) mod p
// Both input and output in Montgomery form.
// On device uses a triangular MAD chain (fewer multiplications).
__host__ __device__ void fp_mont_sqr(Fp &c, const Fp &a);

// CONVERSION: Input is normal form, output is Montgomery form
__host__ __device__ void fp_to_montgomery(Fp &c, const Fp &a);

Expand Down
7 changes: 6 additions & 1 deletion backends/zk-cuda-backend/cuda/include/fp2.h
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,11 @@ __host__ __device__ void fp2_add(Fp2 &c, const Fp2 &a, const Fp2 &b);
// Subtraction: c = a - b
__host__ __device__ void fp2_sub(Fp2 &c, const Fp2 &a, const Fp2 &b);

// Lazy add/sub: each component output in [0, 2p) for inputs in [0, p).
// Safe as input to fp2_mont_mul; same contract as fp_add_lazy / fp_sub_lazy.
__host__ __device__ void fp2_add_lazy(Fp2 &c, const Fp2 &a, const Fp2 &b);
__host__ __device__ void fp2_sub_lazy(Fp2 &c, const Fp2 &a, const Fp2 &b);

// Multiplication: c = a * b
// (a0 + a1*i) * (b0 + b1*i) = (a0*b0 - a1*b1) + (a0*b1 + a1*b0)*i
// NOTE: Assumes inputs are in normal form and converts to/from Montgomery
Expand All @@ -84,7 +89,7 @@ __host__ __device__ void fp2_mont_mul(Fp2 &c, const Fp2 &a, const Fp2 &b);
// Montgomery squaring: c = a^2 (all in Montgomery form)
// Uses the complex-squaring identity: c0 = (a0+a1)(a0-a1), c1 = 2*a0*a1
// Only 2 Fp multiplications vs 3 for fp2_mont_mul(c, a, a).
// NOTE: All inputs and outputs are in Montgomery form (no conversions)
// NOTE: All inputs and outputs should be in Montgomery form
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What about outputs?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, both should be in a Montgomery form

__host__ __device__ void fp2_mont_square(Fp2 &c, const Fp2 &a);

// Squaring: c = a^2
Expand Down
30 changes: 18 additions & 12 deletions backends/zk-cuda-backend/cuda/src/curve.cu
Original file line number Diff line number Diff line change
Expand Up @@ -1413,7 +1413,7 @@ __host__ __device__ void projective_point_add(G1Projective &result,
u = Y2Z1 - Y1Z2;

// uu = u^2
fp_mont_mul(uu, u, u);
fp_mont_sqr(uu, u);

// v = X2 * Z1 - X1 * Z2 = X2*Z1 - X1Z2
Fp X2Z1;
Expand All @@ -1428,7 +1428,7 @@ __host__ __device__ void projective_point_add(G1Projective &result,
}

// vv = v^2
fp_mont_mul(vv, v, v);
fp_mont_sqr(vv, v);
// vvv = v * vv
fp_mont_mul(vvv, v, vv);

Expand Down Expand Up @@ -1568,9 +1568,9 @@ __host__ __device__ void projective_mixed_add(G1Projective &result,
}

// uu = u^2
fp_mont_mul(uu, u, u);
fp_mont_sqr(uu, u);
// vv = v^2
fp_mont_mul(vv, v, v);
fp_mont_sqr(vv, v);
// vvv = v * vv
fp_mont_mul(vvv, v, vv);

Expand Down Expand Up @@ -1692,7 +1692,7 @@ __host__ __device__ void projective_point_double(G1Projective &result,

// A = 3 * X^2
Fp X_sq, A;
fp_mont_mul(X_sq, p.X, p.X);
fp_mont_sqr(X_sq, p.X);
fp_mul3(A, X_sq);

// B = Y * Z
Expand All @@ -1706,7 +1706,7 @@ __host__ __device__ void projective_point_double(G1Projective &result,

// D = A^2 - 8*C
Fp A_sq, eight_C;
fp_mont_mul(A_sq, A, A);
fp_mont_sqr(A_sq, A);
fp_mul8(eight_C, C);
Fp D = A_sq - eight_C;

Expand All @@ -1716,14 +1716,16 @@ __host__ __device__ void projective_point_double(G1Projective &result,
fp_double(result.X, BD);

// Y3 = A * (4*C - D) - 8 * Y^2 * B^2
Fp four_C, A_times_diff;
Fp four_C, four_C_minus_D, A_times_diff;
fp_mul4(four_C, C);
Fp four_C_minus_D = four_C - D;
// Lazy sub: four_C_minus_D feeds fp_mont_mul, so skip the conditional
// subtract and output in [0, 2p) instead of [0, p).
fp_sub_lazy(four_C_minus_D, four_C, D);
fp_mont_mul(A_times_diff, A, four_C_minus_D);

Fp Y_sq, B_sq, Y_sq_B_sq, eight_Y_sq_B_sq;
fp_mont_mul(Y_sq, p.Y, p.Y);
fp_mont_mul(B_sq, B, B);
fp_mont_sqr(Y_sq, p.Y);
fp_mont_sqr(B_sq, B);
fp_mont_mul(Y_sq_B_sq, Y_sq, B_sq);
fp_mul8(eight_Y_sq_B_sq, Y_sq_B_sq);
result.Y = A_times_diff - eight_Y_sq_B_sq;
Expand Down Expand Up @@ -1773,9 +1775,13 @@ __host__ __device__ void projective_point_double(G2Projective &result,
fp2_double(result.X, BD);

// Y3 = A * (4*C - D) - 8 * Y^2 * B^2
Fp2 four_C, A_times_diff;
Fp2 four_C, four_C_minus_D, A_times_diff;
fp2_mul4(four_C, C);
Fp2 four_C_minus_D = four_C - D;

// We can't use lazy subtraction here: with the Karatsuba path in fp2, inputs
// in [0, 2p) would produce values in [0, 4p) instead of [0, 2p), which would
// break the final result.
fp2_sub(four_C_minus_D, four_C, D);
fp2_mont_mul(A_times_diff, A, four_C_minus_D);

Fp2 Y_sq, B_sq, Y_sq_B_sq, eight_Y_sq_B_sq;
Expand Down
Loading
Loading