Commit 456dfce

fix(gpu): indexes in gemm ks
1 parent fbe2f86 commit 456dfce

9 files changed: +258, -98 lines changed

backends/tfhe-cuda-backend/cuda/include/keyswitch/keyswitch.h

Lines changed: 1 addition & 1 deletion

@@ -17,7 +17,7 @@ void cuda_keyswitch_lwe_ciphertext_vector_64(
     void const *lwe_output_indexes, void const *lwe_array_in,
     void const *lwe_input_indexes, void const *ksk, uint32_t lwe_dimension_in,
     uint32_t lwe_dimension_out, uint32_t base_log, uint32_t level_count,
-    uint32_t num_samples, int8_t *ksk_tmp_buffer);
+    uint32_t num_samples, int8_t *ksk_tmp_buffer, bool uses_trivial_indexes);

 uint64_t scratch_packing_keyswitch_lwe_list_to_glwe_64(
     void *stream, uint32_t gpu_index, int8_t **fp_ks_buffer,
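
The new uses_trivial_indexes flag tells the keyswitch entry point whether the index arrays are the identity mapping, so it can take the faster non-indexed path. A minimal host-side sketch of how a caller might derive the flag (the helper name and the host-resident index copies are hypothetical, not part of this commit):

#include <cstdint>

// Hypothetical helper: the indexes are "trivial" when both arrays are the
// identity permutation 0, 1, ..., num_samples - 1; only then can the
// non-indexed kernels be used safely.
static bool indexes_are_trivial(const uint64_t *h_output_indexes,
                                const uint64_t *h_input_indexes,
                                uint32_t num_samples) {
  for (uint32_t i = 0; i < num_samples; ++i)
    if (h_output_indexes[i] != i || h_input_indexes[i] != i)
      return false;
  return true;
}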

backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cu

Lines changed: 4 additions & 4 deletions

@@ -10,15 +10,15 @@ void cuda_keyswitch_lwe_ciphertext_vector_32(
     void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes,
     void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
     uint32_t base_log, uint32_t level_count, uint32_t num_samples,
-    void *ksk_tmp_buffer) {
+    void *ksk_tmp_buffer, bool uses_trivial_indices) {
   host_gemm_keyswitch_lwe_ciphertext_vector<uint32_t>(
       static_cast<cudaStream_t>(stream), gpu_index,
       static_cast<uint32_t *>(lwe_array_out),
       static_cast<uint32_t *>(lwe_output_indexes),
       static_cast<uint32_t *>(lwe_array_in),
       static_cast<uint32_t *>(lwe_input_indexes), static_cast<uint32_t *>(ksk),
       lwe_dimension_in, lwe_dimension_out, base_log, level_count, num_samples,
-      static_cast<uint32_t *>(ksk_tmp_buffer));
+      static_cast<uint32_t *>(ksk_tmp_buffer), uses_trivial_indices);
 }

 /* Perform keyswitch on a batch of 64 bits input LWE ciphertexts.
@@ -42,7 +42,7 @@ void cuda_keyswitch_lwe_ciphertext_vector_64(
     void const *lwe_output_indexes, void const *lwe_array_in,
     void const *lwe_input_indexes, void const *ksk, uint32_t lwe_dimension_in,
     uint32_t lwe_dimension_out, uint32_t base_log, uint32_t level_count,
-    uint32_t num_samples, int8_t *ksk_tmp_buffer) {
+    uint32_t num_samples, int8_t *ksk_tmp_buffer, bool uses_trivial_indices) {
   host_gemm_keyswitch_lwe_ciphertext_vector<uint64_t>(
       static_cast<cudaStream_t>(stream), gpu_index,
       static_cast<uint64_t *>(lwe_array_out),
@@ -51,7 +51,7 @@ void cuda_keyswitch_lwe_ciphertext_vector_64(
       static_cast<const uint64_t *>(lwe_input_indexes),
       static_cast<const uint64_t *>(ksk), lwe_dimension_in, lwe_dimension_out,
       base_log, level_count, num_samples,
-      (uint64_t *)((ks_mem *)ksk_tmp_buffer)->buffer);
+      (uint64_t *)((ks_mem *)ksk_tmp_buffer)->buffer, uses_trivial_indices);
 }

 uint64_t scratch_packing_keyswitch_lwe_list_to_glwe_64(
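
Note that the 64-bit entry point reinterprets the opaque int8_t *ksk_tmp_buffer as a ks_mem and passes its inner buffer pointer down. The struct is defined elsewhere in the backend; a minimal sketch of the shape this cast relies on (anything beyond the buffer field is an assumption):

// Sketch only: the real ks_mem lives elsewhere in the backend. The cast in
// cuda_keyswitch_lwe_ciphertext_vector_64 only requires that it expose a
// raw device scratch pointer named `buffer`.
struct ks_mem {
  int8_t *buffer; // device scratch used by the gemm keyswitch
};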

backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh

Lines changed: 68 additions & 31 deletions

@@ -113,6 +113,26 @@ __global__ void keyswitch_gemm_copy_message(const Torus *lwe_in, Torus *lwe_out,
       -lwe_in[lwe_id * (lwe_dimension_in + 1) + lwe_dimension_in];
 }

+template <typename Torus>
+__global__ void keyswitch_gemm_copy_message_with_indices(
+    const Torus *__restrict__ lwe_in,
+    const Torus *__restrict__ lwe_input_indices, Torus *__restrict__ lwe_out,
+    const Torus *__restrict__ lwe_output_indices,
+    uint32_t lwe_dimension_in, uint32_t num_lwes, uint32_t lwe_dimension_out) {
+
+  uint32_t lwe_id = blockIdx.x * blockDim.x + threadIdx.x;
+
+  if (lwe_id >= num_lwes)
+    return;
+
+  uint32_t lwe_in_idx = lwe_input_indices[lwe_id];
+  uint32_t lwe_out_idx = lwe_output_indices[lwe_id];
+
+  lwe_out[lwe_out_idx * (lwe_dimension_out + 1) + lwe_dimension_out] =
+      -lwe_in[lwe_in_idx * (lwe_dimension_in + 1) + lwe_dimension_in];
+}
+
 // Continue decomposition of an array of Torus elements in place. Supposes
 // that the array contains already decomposed elements and
 // computes the new decomposed level in place.
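
Read per thread, the kernel routes one ciphertext body through the two index arrays. A plain CPU restatement for illustration (hypothetical helper, not part of the commit):

// Hypothetical CPU reference for the indexed body copy: for each sample j,
// negate the body of input ciphertext lwe_input_indices[j] and write it as
// the body of output ciphertext lwe_output_indices[j].
template <typename Torus>
void copy_message_with_indices_ref(
    const Torus *lwe_in, const Torus *lwe_input_indices, Torus *lwe_out,
    const Torus *lwe_output_indices, uint32_t lwe_dimension_in,
    uint32_t num_lwes, uint32_t lwe_dimension_out) {
  for (uint32_t j = 0; j < num_lwes; ++j) {
    Torus body = lwe_in[lwe_input_indices[j] * (lwe_dimension_in + 1) +
                        lwe_dimension_in];
    lwe_out[lwe_output_indices[j] * (lwe_dimension_out + 1) +
            lwe_dimension_out] = static_cast<Torus>(0) - body;
  }
}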
@@ -256,10 +276,10 @@ __host__ void host_keyswitch_lwe_ciphertext_vector(
 template <typename Torus>
 __host__ int host_gemm_keyswitch_lwe_ciphertext_vector(
     cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
-    Torus const *lwe_output_indexes, Torus const *lwe_array_in,
-    Torus const *lwe_input_indexes, Torus const *ksk, uint32_t lwe_dimension_in,
+    Torus const *lwe_output_indices, Torus const *lwe_array_in,
+    Torus const *lwe_input_indices, Torus const *ksk, uint32_t lwe_dimension_in,
     uint32_t lwe_dimension_out, uint32_t base_log, uint32_t level_count,
-    uint32_t num_samples, Torus *fp_tmp_buffer) {
+    uint32_t num_samples, Torus *fp_tmp_buffer, bool uses_trivial_indices) {
   cuda_set_device(gpu_index);
   check_cuda_error(cudaGetLastError());

@@ -280,10 +300,18 @@ __host__ int host_gemm_keyswitch_lwe_ciphertext_vector(
   // lwe_array_out is num_samples x (lwe_dimension_out + 1). copy the bodies
   // lwe_array_in[:,lwe_dimension_in] to lwe_array_out[:,lwe_dimension_out]
   // and negate
-  keyswitch_gemm_copy_message<Torus><<<grid_copy, threads_copy, 0, stream>>>(
-      lwe_array_in, lwe_array_out, lwe_dimension_in, num_samples,
-      lwe_dimension_out);
-  check_cuda_error(cudaGetLastError());
+  if (uses_trivial_indices) {
+    keyswitch_gemm_copy_message<Torus><<<grid_copy, threads_copy, 0, stream>>>(
+        lwe_array_in, lwe_array_out, lwe_dimension_in, num_samples,
+        lwe_dimension_out);
+    check_cuda_error(cudaGetLastError());
+  } else {
+    keyswitch_gemm_copy_message_with_indices<Torus>
+        <<<grid_copy, threads_copy, 0, stream>>>(
+            lwe_array_in, lwe_input_indices, lwe_array_out, lwe_output_indices,
+            lwe_dimension_in, num_samples, lwe_dimension_out);
+    check_cuda_error(cudaGetLastError());
+  }

   // dump_2d_gpu_to_file(lwe_array_out, num_samples, lwe_dimension_out + 1,
   // "lwe_out_only_body", prefix, stream, gpu_index);
@@ -322,10 +350,19 @@ __host__ int host_gemm_keyswitch_lwe_ciphertext_vector(
                      lwe_dimension_in, "state_init", prefix, stream,
                      gpu_index);*/

-  tgemm<Torus><<<grid_gemm, threads_gemm, shared_mem_size, stream>>>(
-      num_samples, (lwe_dimension_out + 1), lwe_dimension_in, d_mem_0, ksk,
-      stride_KSK_buffer, lwe_array_out, lwe_dimension_out + 1);
-  check_cuda_error(cudaGetLastError());
+  if (uses_trivial_indices) {
+    tgemm<Torus><<<grid_gemm, threads_gemm, shared_mem_size, stream>>>(
+        num_samples, (lwe_dimension_out + 1), lwe_dimension_in, d_mem_0, ksk,
+        stride_KSK_buffer, lwe_array_out, lwe_dimension_out + 1);
+    check_cuda_error(cudaGetLastError());
+  } else {
+    tgemm_with_indices<Torus>
+        <<<grid_gemm, threads_gemm, shared_mem_size, stream>>>(
+            num_samples, (lwe_dimension_out + 1), lwe_dimension_in, d_mem_0,
+            lwe_input_indices, ksk, stride_KSK_buffer, lwe_array_out,
+            lwe_dimension_out + 1, lwe_output_indices);
+    check_cuda_error(cudaGetLastError());
+  }

   /* dump_2d_gpu_to_file(lwe_array_out, num_samples, lwe_dimension_out + 1,
      "tgemm0", prefix, stream, gpu_index);*/
@@ -400,35 +437,35 @@ void execute_keyswitch_async(CudaStreams streams,
     Torus *current_lwe_input_indexes =
         get_variant_element(lwe_input_indexes, i);

-    if (uses_trivial_indices && num_samples >= 19) {
+    if (num_samples >= 144) {
       // Compute Keyswitch
-      /* Torus *dup_out = (Torus *)cuda_malloc_async(
-          num_samples_on_gpu * (lwe_dimension_out + 1) * sizeof(Torus),
-          streams.stream(i), streams.gpu_index(i));
-      uint64_t buffer_size = scratch_cuda_keyswitch_size<Torus>(
-          lwe_dimension_in, lwe_dimension_out, num_samples_on_gpu);
-      Torus *tmp_buf = (Torus *)cuda_malloc_async(
-          buffer_size, streams.stream(i), streams.gpu_index(i));*/
+      /* Torus *dup_out = (Torus *)cuda_malloc_async(
+         num_samples_on_gpu * (lwe_dimension_out + 1) * sizeof(Torus),
+         streams.stream(i), streams.gpu_index(i));
+      uint64_t buffer_size = scratch_cuda_keyswitch_size<Torus>(
+          lwe_dimension_in, lwe_dimension_out, num_samples_on_gpu);
+      Torus *tmp_buf = (Torus *)cuda_malloc_async(
+          buffer_size, streams.stream(i), streams.gpu_index(i));*/

       host_gemm_keyswitch_lwe_ciphertext_vector<Torus>(
           streams.stream(i), streams.gpu_index(i), current_lwe_array_out,
           current_lwe_output_indexes, current_lwe_array_in,
           current_lwe_input_indexes, ksks[i], lwe_dimension_in,
           lwe_dimension_out, base_log, level_count, num_samples_on_gpu,
-          fp_tmp_buffer[i]);
+          fp_tmp_buffer[i], uses_trivial_indices);

       // Compute Keyswitch
-      /* host_keyswitch_lwe_ciphertext_vector<Torus>(
-          streams.stream(i), streams.gpu_index(i), dup_out,
-          current_lwe_output_indexes, current_lwe_array_in,
-          current_lwe_input_indexes, ksks[i], lwe_dimension_in,
-          lwe_dimension_out, base_log, level_count, num_samples_on_gpu);*/
-
-      /* compare_2d_arrays(dup_out, current_lwe_array_out, num_samples_on_gpu,
-         lwe_dimension_out + 1, streams.stream(i),
-         streams.gpu_index(i));*/
-      /* cuda_drop_async(dup_out, streams.stream(i), streams.gpu_index(i));
-         cuda_drop_async(tmp_buf, streams.stream(i), streams.gpu_index(i));*/
+      /* host_keyswitch_lwe_ciphertext_vector<Torus>(
+          streams.stream(i), streams.gpu_index(i), dup_out,
+          current_lwe_output_indexes, current_lwe_array_in,
+          current_lwe_input_indexes, ksks[i], lwe_dimension_in,
+          lwe_dimension_out, base_log, level_count, num_samples_on_gpu);*/
+
+      /* compare_2d_arrays(dup_out, current_lwe_array_out,
+         num_samples_on_gpu, lwe_dimension_out + 1, streams.stream(i),
+         streams.gpu_index(i));*/
+      /* cuda_drop_async(dup_out, streams.stream(i), streams.gpu_index(i));
+         cuda_drop_async(tmp_buf, streams.stream(i), streams.gpu_index(i));*/
       ;
     } else {
       // Compute Keyswitch

backends/tfhe-cuda-backend/cuda/src/linearalgebra/multiplication.cuh

Lines changed: 102 additions & 0 deletions

@@ -192,4 +192,106 @@ __global__ void tgemm(uint M, uint N, uint K, const Torus *A, const Torus *B,
   }
 }

+// Multiply matrices A, B of size (M, K), (K, N) respectively,
+// with K as the inner dimension.
+//
+// A block of threads processes blocks of size (BLOCK_SIZE_GEMM,
+// BLOCK_SIZE_GEMM), splitting them into multiple tiles:
+// (BLOCK_SIZE_GEMM, THREADS_GEMM)-shaped tiles of values from A, and
+// (THREADS_GEMM, BLOCK_SIZE_GEMM)-shaped tiles of values from B.
+//
+// This code is adapted from the 1D block-tiling kernel in
+// https://github.com/siboehm/SGEMM_CUDA, generalized to arbitrary
+// matrix dimensions.
+template <typename Torus>
+__global__ void tgemm_with_indices(uint M, uint N, uint K, const Torus *A,
+                                   const Torus *__restrict__ A_indices,
+                                   const Torus *B, uint stride_B, Torus *C,
+                                   uint stride_C,
+                                   const Torus *__restrict__ C_indices) {
+
+  const int BM = BLOCK_SIZE_GEMM;
+  const int BN = BLOCK_SIZE_GEMM;
+  const int BK = THREADS_GEMM;
+  const int TM = THREADS_GEMM;
+
+  const uint cRow = blockIdx.y;
+  const uint cCol = blockIdx.x;
+
+  const int threadCol = threadIdx.x % BN;
+  const int threadRow = threadIdx.x / BN;
+
+  // Allocate space for the current block tile in shared memory
+  __shared__ Torus As[BM * BK];
+  __shared__ Torus Bs[BK * BN];
+
+  // Initialize the pointer to the input block from B; tiles from this
+  // block are loaded to shared memory. Rows of A are fetched through
+  // A_indices instead of advancing a pointer.
+  B += cCol * BN;
+
+  // Each thread will handle multiple sub-blocks
+  const uint innerColA = threadIdx.x % BK;
+  const uint innerRowA = threadIdx.x / BK;
+  const uint innerColB = threadIdx.x % BN;
+  const uint innerRowB = threadIdx.x / BN;
+
+  // allocate thread-local cache for results in registerfile
+  Torus threadResults[TM] = {0};
+
+  auto row_A = cRow * BM + innerRowA;
+  auto col_B = cCol * BN + innerColB;
+
+  // For each thread, loop over block tiles
+  for (uint bkIdx = 0; bkIdx < K; bkIdx += BK) {
+    auto col_A = bkIdx + innerColA;
+    auto row_B = bkIdx + innerRowB;
+
+    if (row_A < M && col_A < K) {
+      As[innerRowA * BK + innerColA] = A[A_indices[row_A] * K + col_A];
+    } else {
+      As[innerRowA * BK + innerColA] = 0;
+    }
+
+    if (col_B < N && row_B < K) {
+      Bs[innerRowB * BN + innerColB] = B[innerRowB * stride_B + innerColB];
+    } else {
+      Bs[innerRowB * BN + innerColB] = 0;
+    }
+    __syncthreads();
+
+    // Advance blocktile for the next iteration of this loop
+    B += BK * stride_B;
+
+    // calculate per-thread results
+    for (uint dotIdx = 0; dotIdx < BK; ++dotIdx) {
+      // we make the dotproduct loop the outside loop, which facilitates
+      // reuse of the Bs entry, which we can cache in a tmp var.
+      Torus tmp = Bs[dotIdx * BN + threadCol];
+      for (uint resIdx = 0; resIdx < TM; ++resIdx) {
+        threadResults[resIdx] +=
+            As[(threadRow * TM + resIdx) * BK + dotIdx] * tmp;
+      }
+    }
+    __syncthreads();
+  }
+
+  // Initialize the pointer to the output block of size (BLOCK_SIZE_GEMM,
+  // BLOCK_SIZE_GEMM)
+  // C += cRow * BM * stride_C + cCol * BN;
+
+  // write out the results, routing each output row through C_indices
+  for (uint resIdx = 0; resIdx < TM; ++resIdx) {
+    int outRow = cRow * BM + threadRow * TM + resIdx;
+    int outCol = cCol * BN + threadCol;
+
+    if (outRow >= M)
+      continue;
+    if (outCol >= N)
+      continue;
+
+    C[C_indices[outRow] * stride_C + cCol * BN + threadCol] +=
+        threadResults[resIdx];
+  }
+}
+
 #endif // CUDA_MULT_H
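
The kernel's tiling implies blockDim.x == BLOCK_SIZE_GEMM * BLOCK_SIZE_GEMM / THREADS_GEMM. A hedged launch sketch, with assumed values for BLOCK_SIZE_GEMM and THREADS_GEMM (the backend defines the real constants elsewhere), matching the SGEMM_CUDA 1D block-tiling layout the kernel is adapted from:

// Assumptions: BLOCK_SIZE_GEMM = 64 (BM = BN), THREADS_GEMM = 8 (BK = TM);
// each thread accumulates TM partial results of one output column.
constexpr uint32_t BM = 64, BN = 64, TM = 8;

uint32_t M = num_samples;           // rows of A: one per LWE sample
uint32_t N = lwe_dimension_out + 1; // columns of B: output mask + body
uint32_t K = lwe_dimension_in;      // inner (reduction) dimension

dim3 grid_gemm((N + BN - 1) / BN, (M + BM - 1) / BM);
dim3 threads_gemm(BM * BN / TM); // 512 threads per block

tgemm_with_indices<uint64_t><<<grid_gemm, threads_gemm, 0, stream>>>(
    M, N, K, d_mem_0, lwe_input_indices, ksk, stride_KSK_buffer,
    lwe_array_out, lwe_dimension_out + 1, lwe_output_indices);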

backends/tfhe-cuda-backend/src/bindings.rs

Lines changed: 1 addition & 0 deletions

@@ -1933,6 +1933,7 @@ unsafe extern "C" {
         level_count: u32,
         num_samples: u32,
         ksk_tmp_buffer: *mut i8,
+        uses_trivial_indexes: bool,
     );
 }
 unsafe extern "C" {
