zama-ai
diff --git a/‎Makefile‎
Lines changed: 3 additions & 3 deletions b/‎Makefile‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h‎
Lines changed: 27 additions & 0 deletions b/‎backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h‎
Lines changed: 27 additions & 0 deletions
diff --git a/‎backends/tfhe-cuda-backend/cuda/include/integer/rerand_utilities.h‎
Lines changed: 25 additions & 0 deletions b/‎backends/tfhe-cuda-backend/cuda/include/integer/rerand_utilities.h‎
Lines changed: 25 additions & 0 deletions
diff --git a/‎backends/tfhe-cuda-backend/cuda/include/keyswitch/keyswitch.h‎
Lines changed: 10 additions & 1 deletion b/‎backends/tfhe-cuda-backend/cuda/include/keyswitch/keyswitch.h‎
Lines changed: 10 additions & 1 deletion
diff --git a/‎backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cu‎
Lines changed: 30 additions & 6 deletions b/‎backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cu‎
Lines changed: 30 additions & 6 deletions
@@ -705,9 +705,9 @@ test_gpu: test_core_crypto_gpu test_integer_gpu test_cuda_backend
 .PHONY: test_core_crypto_gpu # Run the tests of the core_crypto module including experimental on the gpu backend
 test_core_crypto_gpu: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
 		--features=gpu -p tfhe -- core_crypto::gpu::
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --doc --profile $(CARGO_PROFILE) \
 		--features=gpu -p tfhe -- core_crypto::gpu::
 
 .PHONY: test_integer_gpu # Run the tests of the integer module including experimental on the gpu backend
 test_integer_gpu: install_rs_build_toolchain
 
@@ -13,6 +13,8 @@
 
 #include <stdio.h>
 
+#include "crypto/keyswitch.cuh"
+
 class NoiseLevel {
 public:
   // Constants equivalent to the Rust code
@@ -336,6 +338,8 @@ struct int_radix_lut_custom_input_output {
   std::vector<InputTorus *> lwe_after_ks_vec;
   std::vector<OutputTorus *> lwe_after_pbs_vec;
   std::vector<InputTorus *> lwe_trivial_indexes_vec;
+  std::vector<InputTorus *>
+      ks_tmp_buf_vec; // buffers on each GPU to store keyswitch temporary data
   std::vector<InputTorus *> lwe_aligned_vec;
 
   bool gpu_memory_allocated;
@@ -439,6 +443,21 @@ struct int_radix_lut_custom_input_output {
     multi_gpu_copy_array_async(active_streams, lwe_trivial_indexes_vec,
                                lwe_trivial_indexes, num_radix_blocks,
                                allocate_gpu_memory);
+
+    // The keyswitch scratch size is computed from the LWE dimensions and the
+    // block count only, none of which change per GPU: evaluate it once.
+    const uint64_t ks_buffer_size = scratch_cuda_keyswitch_size<InputTorus>(
+        params.small_lwe_dimension, params.big_lwe_dimension, num_radix_blocks);
+    // One keyswitch temporary buffer per active stream/GPU.
+    for (auto i = 0; i < active_streams.count(); ++i) {
+      uint64_t sub_size_tracker = 0;
+      auto *gpu_ks_buffer = (InputTorus *)cuda_malloc_with_size_tracking_async(
+          ks_buffer_size, active_streams.stream(i), active_streams.gpu_index(i),
+          sub_size_tracker, allocate_gpu_memory);
+
+      // Only the allocation made for the first stream is added to size_tracker.
+      if (i == 0) {
+        size_tracker += sub_size_tracker;
+      }
+      ks_tmp_buf_vec.push_back(gpu_ks_buffer);
+    }
   }
 
   void setup_mem_reuse(uint32_t num_radix_blocks,
@@ -459,6 +478,8 @@ struct int_radix_lut_custom_input_output {
     lwe_after_pbs_vec = base_lut_object->lwe_after_pbs_vec;
     lwe_trivial_indexes_vec = base_lut_object->lwe_trivial_indexes_vec;
 
+    ks_tmp_buf_vec = base_lut_object->ks_tmp_buf_vec;
+
     mem_reuse = true;
   }
 
@@ -861,6 +882,12 @@ struct int_radix_lut_custom_input_output {
         }
         lwe_aligned_vec.clear();
       }
+
+      // Drop the keyswitch temporary buffer held for each GPU, then forget the
+      // stale pointers so they cannot be dropped a second time.
+      // NOTE(review): buffers are matched to active_streams by position; this
+      // assumes the stream set is the same one used at allocation -- confirm.
+      // NOTE(review): setup_mem_reuse copies these pointers into other LUT
+      // objects; confirm this branch is not executed for those objects.
+      for (size_t i = 0; i < ks_tmp_buf_vec.size(); i++) {
+        cuda_drop_with_size_tracking_async(
+            ks_tmp_buf_vec[i], active_streams.stream(i),
+            active_streams.gpu_index(i), gpu_memory_allocated);
+      }
+      ks_tmp_buf_vec.clear();
     }
     free(h_lut_indexes);
     free(degrees);
 
@@ -15,6 +15,9 @@ template <typename Torus> struct int_rerand_mem {
 
   bool gpu_memory_allocated;
 
+  std::vector<Torus *>
+      ks_tmp_buf_vec; // buffers on each GPU to store keyswitch temporary data
+
   expand_job<Torus> *d_expand_jobs;
   expand_job<Torus> *h_expand_jobs;
 
@@ -54,6 +57,20 @@ template <typename Torus> struct int_rerand_mem {
                              num_lwes * sizeof(Torus), streams.stream(0),
                              streams.gpu_index(0));
 
+    // The keyswitch scratch size is computed from the LWE dimensions and the
+    // number of LWEs only, none of which change per GPU: evaluate it once.
+    const uint64_t ks_buffer_size = scratch_cuda_keyswitch_size<Torus>(
+        params.small_lwe_dimension, params.big_lwe_dimension, num_lwes);
+    // One keyswitch temporary buffer per stream/GPU.
+    for (auto i = 0; i < streams.count(); ++i) {
+      uint64_t sub_size_tracker = 0;
+      auto *gpu_ks_buffer = (Torus *)cuda_malloc_with_size_tracking_async(
+          ks_buffer_size, streams.stream(i), streams.gpu_index(i),
+          sub_size_tracker, allocate_gpu_memory);
+
+      // Only the allocation made for the first stream is added to size_tracker.
+      if (i == 0) {
+        size_tracker += sub_size_tracker;
+      }
+      ks_tmp_buf_vec.push_back(gpu_ks_buffer);
+    }
+
+
     streams.synchronize();
 
     free(h_lwe_trivial_indexes);
@@ -69,6 +86,14 @@ template <typename Torus> struct int_rerand_mem {
     cuda_drop_with_size_tracking_async(d_expand_jobs, streams.stream(0),
                                        streams.gpu_index(0),
                                        gpu_memory_allocated);
+
+    // Drop the keyswitch temporary buffer held for each GPU and empty the
+    // vector so no stale device pointer is kept around.
+    // NOTE(review): the drops are issued on streams.stream(i), but only
+    // stream 0 is synchronized right after this loop -- confirm that is enough.
+    for (size_t i = 0; i < ks_tmp_buf_vec.size(); i++) {
+      cuda_drop_with_size_tracking_async(ks_tmp_buf_vec[i], streams.stream(i),
+                                         streams.gpu_index(i),
+                                         gpu_memory_allocated);
+    }
+    ks_tmp_buf_vec.clear();
+
+
     cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
     free(h_expand_jobs);
   }
 
@@ -17,13 +17,22 @@ void cuda_keyswitch_lwe_ciphertext_vector_64(
     void const *lwe_output_indexes, void const *lwe_array_in,
     void const *lwe_input_indexes, void const *ksk, uint32_t lwe_dimension_in,
     uint32_t lwe_dimension_out, uint32_t base_log, uint32_t level_count,
-    uint32_t num_samples);
+    uint32_t num_samples, int8_t *ksk_tmp_buffer, bool uses_trivial_indexes);
 
 uint64_t scratch_packing_keyswitch_lwe_list_to_glwe_64(
     void *stream, uint32_t gpu_index, int8_t **fp_ks_buffer,
     uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
     uint32_t num_lwes, bool allocate_gpu_memory);
 
+uint64_t scratch_cuda_keyswitch_64(void *stream, uint32_t gpu_index,
+                                   int8_t **fp_ks_buffer,
+                                   uint32_t lwe_dimension_in,
+                                   uint32_t lwe_dimension_out,
+                                   uint32_t num_lwes, bool allocate_gpu_memory);
+
+void cleanup_cuda_keyswitch_64(void *stream, uint32_t gpu_index,
+                               int8_t **fp_ks_buffer, bool allocate_gpu_memory);
+
 void cuda_packing_keyswitch_lwe_list_to_glwe_64(
     void *stream, uint32_t gpu_index, void *glwe_array_out,
     void const *lwe_array_in, void const *fp_ksk_array, int8_t *fp_ks_buffer,
 
@@ -9,14 +9,16 @@ void cuda_keyswitch_lwe_ciphertext_vector_32(
     void *stream, uint32_t gpu_index, void *lwe_array_out,
     void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes,
     void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
-    uint32_t base_log, uint32_t level_count, uint32_t num_samples) {
-  host_keyswitch_lwe_ciphertext_vector<uint32_t>(
+    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
+    void *ksk_tmp_buffer, bool uses_trivial_indices) {
+  host_gemm_keyswitch_lwe_ciphertext_vector<uint32_t>(
       static_cast<cudaStream_t>(stream), gpu_index,
       static_cast<uint32_t *>(lwe_array_out),
       static_cast<uint32_t *>(lwe_output_indexes),
       static_cast<uint32_t *>(lwe_array_in),
       static_cast<uint32_t *>(lwe_input_indexes), static_cast<uint32_t *>(ksk),
-      lwe_dimension_in, lwe_dimension_out, base_log, level_count, num_samples);
+      lwe_dimension_in, lwe_dimension_out, base_log, level_count, num_samples,
+      static_cast<uint32_t *>(ksk_tmp_buffer), uses_trivial_indices);
 }
 
 /* Perform keyswitch on a batch of 64 bits input LWE ciphertexts.
@@ -40,15 +42,16 @@ void cuda_keyswitch_lwe_ciphertext_vector_64(
     void const *lwe_output_indexes, void const *lwe_array_in,
     void const *lwe_input_indexes, void const *ksk, uint32_t lwe_dimension_in,
     uint32_t lwe_dimension_out, uint32_t base_log, uint32_t level_count,
-    uint32_t num_samples) {
-  host_keyswitch_lwe_ciphertext_vector<uint64_t>(
+    uint32_t num_samples, int8_t *ksk_tmp_buffer, bool uses_trivial_indices) {
+  host_gemm_keyswitch_lwe_ciphertext_vector<uint64_t>(
       static_cast<cudaStream_t>(stream), gpu_index,
       static_cast<uint64_t *>(lwe_array_out),
       static_cast<const uint64_t *>(lwe_output_indexes),
       static_cast<const uint64_t *>(lwe_array_in),
       static_cast<const uint64_t *>(lwe_input_indexes),
       static_cast<const uint64_t *>(ksk), lwe_dimension_in, lwe_dimension_out,
-      base_log, level_count, num_samples);
+      base_log, level_count, num_samples,
+      (uint64_t *)((ks_mem *)ksk_tmp_buffer)->buffer, uses_trivial_indices);
 }
 
 uint64_t scratch_packing_keyswitch_lwe_list_to_glwe_64(
@@ -60,6 +63,27 @@ uint64_t scratch_packing_keyswitch_lwe_list_to_glwe_64(
       glwe_dimension, polynomial_size, num_lwes, allocate_gpu_memory);
 }
 
+/* C entry point that prepares the scratch object for the 64-bit LWE
+ * keyswitch. The opaque `fp_ks_buffer` handle is reinterpreted as `ks_mem **`
+ * and everything is forwarded to scratch_cuda_keyswitch<uint64_t>; the value
+ * returned by that call is returned unchanged (presumably the tracked
+ * allocation size -- confirm against the template).
+ */
+uint64_t scratch_cuda_keyswitch_64(void *stream, uint32_t gpu_index,
+                                   int8_t **fp_ks_buffer,
+                                   uint32_t lwe_dimension_in,
+                                   uint32_t lwe_dimension_out,
+                                   uint32_t num_lwes,
+                                   bool allocate_gpu_memory) {
+  auto cuda_stream = static_cast<cudaStream_t>(stream);
+  auto **ks_buffer = reinterpret_cast<ks_mem **>(fp_ks_buffer);
+  return scratch_cuda_keyswitch<uint64_t>(cuda_stream, gpu_index, ks_buffer,
+                                          lwe_dimension_in, lwe_dimension_out,
+                                          num_lwes, allocate_gpu_memory);
+}
+
+/* C entry point that releases the 64-bit keyswitch scratch object.
+ *
+ * `*fp_ks_buffer` is treated as a `ks_mem *` (presumably the object filled in
+ * by scratch_cuda_keyswitch_64 -- confirm). Its resources are released through
+ * cleanup_cuda_keyswitch<uint64_t>, the object itself is deleted, and the
+ * handle is reset to nullptr. Calling this with a null handle, e.g. a second
+ * time after a successful cleanup, is a no-op.
+ */
+void cleanup_cuda_keyswitch_64(void *stream, uint32_t gpu_index,
+                               int8_t **fp_ks_buffer,
+                               bool allocate_gpu_memory) {
+  // Nothing to release: no handle, or the handle was already reset below.
+  if (fp_ks_buffer == nullptr || *fp_ks_buffer == nullptr)
+    return;
+
+  auto *ks_buffer = reinterpret_cast<ks_mem *>(*fp_ks_buffer);
+  cleanup_cuda_keyswitch<uint64_t>(static_cast<cudaStream_t>(stream), gpu_index,
+                                   ks_buffer, allocate_gpu_memory);
+  delete ks_buffer;
+  *fp_ks_buffer = nullptr;
+}
+
 /* Perform functional packing keyswitch on a batch of 64 bits input LWE
  * ciphertexts.
  */