zama-ai
diff --git a/backends/tfhe-cuda-backend/cuda/include/linear_algebra.h b/backends/tfhe-cuda-backend/cuda/include/linear_algebra.h
Lines changed: 6 additions & 10 deletions
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/bitwise_ops.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/bitwise_ops.cuh
Lines changed: 5 additions & 5 deletions
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh
Lines changed: 53 additions & 23 deletions
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh
Lines changed: 1 addition & 1 deletion
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh
Lines changed: 15 additions & 19 deletions
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
Lines changed: 6 additions & 8 deletions
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/negation.cu b/backends/tfhe-cuda-backend/cuda/src/integer/negation.cu
Lines changed: 1 addition & 1 deletion
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/negation.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/negation.cuh
Lines changed: 1 addition & 1 deletion
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_addition.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_addition.cuh
Lines changed: 3 additions & 2 deletions
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh
Lines changed: 2 additions & 2 deletions
@@ -7,13 +7,11 @@
 extern "C" {
 
 void cuda_negate_lwe_ciphertext_vector_32(
-    void *stream, uint32_t gpu_index, void *lwe_array_out,
-    void const *lwe_array_in, const uint32_t input_lwe_dimension,
-    const uint32_t input_lwe_ciphertext_count);
+    void *stream, uint32_t gpu_index, CudaRadixCiphertextFFI *lwe_array_out,
+    CudaRadixCiphertextFFI const *lwe_array_in);
 void cuda_negate_lwe_ciphertext_vector_64(
-    void *stream, uint32_t gpu_index, void *lwe_array_out,
-    void const *lwe_array_in, const uint32_t input_lwe_dimension,
-    const uint32_t input_lwe_ciphertext_count);
+    void *stream, uint32_t gpu_index, CudaRadixCiphertextFFI *lwe_array_out,
+    CudaRadixCiphertextFFI const *lwe_array_in);
 void cuda_add_lwe_ciphertext_vector_32(void *stream, uint32_t gpu_index,
                                        CudaRadixCiphertextFFI *output,
                                        CudaRadixCiphertextFFI const *input_1,
@@ -60,10 +58,8 @@ void cuda_glwe_wrapping_polynomial_mul_one_to_many_64_async(
     int8_t *circulant, void const *poly_rhs, uint32_t polynomial_size,
     uint32_t glwe_dimension, uint32_t n_rhs);
 void cuda_add_lwe_ciphertext_vector_plaintext_64(
-    void *stream, uint32_t gpu_index, void *lwe_array_out,
-    void const *lwe_array_in, const uint64_t plaintext_in,
-    const uint32_t input_lwe_dimension,
-    const uint32_t input_lwe_ciphertext_count);
+    void *stream, uint32_t gpu_index, CudaRadixCiphertextFFI *lwe_array_out,
+    CudaRadixCiphertextFFI const *lwe_array_in, const uint64_t plaintext_in);
 void cuda_add_lwe_ciphertext_vector_inplace_32(
     void *stream, uint32_t gpu_index, CudaRadixCiphertextFFI *lwe_array_inout,
     CudaRadixCiphertextFFI const *input_2);
 
@@ -176,14 +176,14 @@ host_bitnot(CudaStreams streams, CudaRadixCiphertextFFI *radix_ciphertext,
       (ct_message_modulus - 1);
 
   host_negation<Torus>(
-      streams.stream(0), streams.gpu_index(0), (Torus *)radix_ciphertext->ptr,
-      (Torus *)radix_ciphertext->ptr, radix_ciphertext->lwe_dimension,
+      streams.stream(0), streams.gpu_index(0), radix_ciphertext,
+      radix_ciphertext, radix_ciphertext->lwe_dimension,
       radix_ciphertext->num_radix_blocks);
 
   host_addition_plaintext_scalar<Torus>(
-      streams.stream(0), streams.gpu_index(0), (Torus *)radix_ciphertext->ptr,
-      (Torus *)radix_ciphertext->ptr, encoded_scalar,
-      radix_ciphertext->lwe_dimension, radix_ciphertext->num_radix_blocks);
+      streams.stream(0), streams.gpu_index(0), radix_ciphertext,
+      radix_ciphertext, encoded_scalar, radix_ciphertext->lwe_dimension,
+      radix_ciphertext->num_radix_blocks);
 
   for (size_t i = 0; i < radix_ciphertext->num_radix_blocks; ++i) {
     radix_ciphertext->degrees[i] = ct_message_modulus - 1;
 
@@ -35,7 +35,8 @@ device_accumulate_all_blocks(Torus *output, Torus const *input_block,
 
 template <typename Torus>
 __host__ void accumulate_all_blocks(cudaStream_t stream, uint32_t gpu_index,
-                                    Torus *output, Torus const *input,
+                                    CudaRadixCiphertextFFI *output,
+                                    CudaRadixCiphertextFFI const *input,
                                     uint32_t lwe_dimension,
                                     uint32_t num_radix_blocks) {
 
@@ -45,7 +46,8 @@ __host__ void accumulate_all_blocks(cudaStream_t stream, uint32_t gpu_index,
   getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
   // Add all blocks and store in sum
   device_accumulate_all_blocks<Torus><<<num_blocks, num_threads, 0, stream>>>(
-      output, input, lwe_dimension, num_radix_blocks);
+      (Torus *)output->ptr, (Torus const *)input->ptr, lwe_dimension,
+      num_radix_blocks);
   check_cuda_error(cudaGetLastError());
 }
 
@@ -102,23 +104,33 @@ __host__ void are_all_comparisons_block_true(
 
     // Since all blocks encrypt either 0 or 1, we can sum max_value of them
     // as in the worst case we will be adding `max_value` ones
-    auto input_blocks = (Torus *)tmp_out->ptr;
-    auto accumulator_ptr =
-        (Torus *)are_all_block_true_buffer->tmp_block_accumulated->ptr;
     auto is_max_value_lut = are_all_block_true_buffer->is_max_value;
+    GPU_ASSERT(are_all_block_true_buffer->tmp_block_accumulated->lwe_dimension ==
+                   big_lwe_dimension,
+               "lwe_dimension mismatch between tmp_block_accumulated and "
+               "big_lwe_dimension");
+    GPU_ASSERT(tmp_out->lwe_dimension == big_lwe_dimension,
+               "lwe_dimension mismatch between tmp_out and big_lwe_dimension");
     uint32_t chunk_lengths[num_chunks];
     auto begin_remaining_blocks = remaining_blocks;
+    uint32_t acc_offset = 0, inp_offset = 0;
     for (int i = 0; i < num_chunks; i++) {
       uint32_t chunk_length =
           std::min(max_value, begin_remaining_blocks - i * max_value);
       chunk_lengths[i] = chunk_length;
+      CudaRadixCiphertextFFI acc_slice, inp_slice;
+      as_radix_ciphertext_slice<Torus>(
+          &acc_slice, are_all_block_true_buffer->tmp_block_accumulated,
+          acc_offset, acc_offset + 1);
+      as_radix_ciphertext_slice<Torus>(&inp_slice, tmp_out, inp_offset,
+                                       inp_offset + chunk_length);
       accumulate_all_blocks<Torus>(streams.stream(0), streams.gpu_index(0),
-                                   accumulator_ptr, input_blocks,
-                                   big_lwe_dimension, chunk_length);
+                                   &acc_slice, &inp_slice, big_lwe_dimension,
+                                   chunk_length);
 
-      accumulator_ptr += (big_lwe_dimension + 1);
+      acc_offset += 1;
       remaining_blocks -= (chunk_length - 1);
-      input_blocks += (big_lwe_dimension + 1) * chunk_length;
+      inp_offset += chunk_length;
     }
     auto accumulator = are_all_block_true_buffer->tmp_block_accumulated;
 
@@ -219,21 +231,31 @@ __host__ void is_at_least_one_comparisons_block_true(
 
     // Since all blocks encrypt either 0 or 1, we can sum max_value of them
     // as in the worst case we will be adding `max_value` ones
-    auto input_blocks = (Torus *)mem_ptr->tmp_lwe_array_out->ptr;
-    auto accumulator = (Torus *)buffer->tmp_block_accumulated->ptr;
+    GPU_ASSERT(buffer->tmp_block_accumulated->lwe_dimension == big_lwe_dimension,
+               "lwe_dimension mismatch between tmp_block_accumulated and "
+               "big_lwe_dimension");
+    GPU_ASSERT(mem_ptr->tmp_lwe_array_out->lwe_dimension == big_lwe_dimension,
+               "lwe_dimension mismatch between tmp_lwe_array_out and "
+               "big_lwe_dimension");
     uint32_t chunk_lengths[num_chunks];
     auto begin_remaining_blocks = remaining_blocks;
+    uint32_t acc_offset = 0, inp_offset = 0;
     for (int i = 0; i < num_chunks; i++) {
       uint32_t chunk_length =
           std::min(max_value, begin_remaining_blocks - i * max_value);
       chunk_lengths[i] = chunk_length;
+      CudaRadixCiphertextFFI acc_slice, inp_slice;
+      as_radix_ciphertext_slice<Torus>(&acc_slice, buffer->tmp_block_accumulated,
+                                       acc_offset, acc_offset + 1);
+      as_radix_ciphertext_slice<Torus>(&inp_slice, mem_ptr->tmp_lwe_array_out,
+                                       inp_offset, inp_offset + chunk_length);
       accumulate_all_blocks<Torus>(streams.stream(0), streams.gpu_index(0),
-                                   accumulator, input_blocks, big_lwe_dimension,
+                                   &acc_slice, &inp_slice, big_lwe_dimension,
                                    chunk_length);
 
-      accumulator += (big_lwe_dimension + 1);
+      acc_offset += 1;
       remaining_blocks -= (chunk_length - 1);
-      input_blocks += (big_lwe_dimension + 1) * chunk_length;
+      inp_offset += chunk_length;
     }
 
     // Selects a LUT
@@ -296,22 +318,31 @@ __host__ void host_compare_blocks_with_zero(
         streams.stream(0), streams.gpu_index(0), sum, 0, 1, lwe_array_in, 0, 1);
     num_sum_blocks = 1;
   } else {
+    GPU_ASSERT(sum->lwe_dimension == big_lwe_dimension,
+               "lwe_dimension mismatch between sum and big_lwe_dimension");
+    GPU_ASSERT(lwe_array_in->lwe_dimension == big_lwe_dimension,
+               "lwe_dimension mismatch between lwe_array_in and "
+               "big_lwe_dimension");
     uint32_t remainder_blocks = num_radix_blocks;
-    auto sum_i = (Torus *)sum->ptr;
-    auto chunk = (Torus *)lwe_array_in->ptr;
+    uint32_t sum_offset = 0, inp_offset = 0;
     while (remainder_blocks > 1) {
       uint32_t chunk_size =
           std::min(remainder_blocks, num_elements_to_fill_carry);
-
+      CudaRadixCiphertextFFI sum_slice, inp_slice;
+      as_radix_ciphertext_slice<Torus>(&sum_slice, sum, sum_offset,
+                                       sum_offset + 1);
+      as_radix_ciphertext_slice<Torus>(&inp_slice, lwe_array_in, inp_offset,
+                                       inp_offset + chunk_size);
       accumulate_all_blocks<Torus>(streams.stream(0), streams.gpu_index(0),
-                                   sum_i, chunk, big_lwe_dimension, chunk_size);
+                                   &sum_slice, &inp_slice, big_lwe_dimension,
+                                   chunk_size);
 
       num_sum_blocks++;
       remainder_blocks -= (chunk_size - 1);
 
       // Update operands
-      chunk += (chunk_size - 1) * big_lwe_size;
-      sum_i += big_lwe_size;
+      inp_offset += chunk_size - 1;
+      sum_offset += 1;
     }
   }
 
@@ -381,9 +412,8 @@ compare_radix_blocks(CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
 
   // Subtract
   host_subtraction<Torus>(
-      streams.stream(0), streams.gpu_index(0), (Torus *)lwe_array_out->ptr,
-      (Torus *)lwe_array_left->ptr, (Torus *)lwe_array_right->ptr,
-      big_lwe_dimension, num_radix_blocks);
+      streams.stream(0), streams.gpu_index(0), lwe_array_out, lwe_array_left,
+      lwe_array_right, big_lwe_dimension, num_radix_blocks);
 
   // Apply LUT to compare to 0
   auto is_non_zero_lut = mem_ptr->eq_buffer->is_non_zero_lut;
 
@@ -214,7 +214,7 @@ host_integer_compress(CudaStreams streams,
 
   if constexpr (std::is_same_v<Torus, uint64_t>) {
     lwe_pksk_input = mem_ptr->tmp_lwe;
-    host_cleartext_multiplication<Torus>(
+    host_cleartext_multiplication_unsafe_no_degrees<Torus>(
         streams.stream(0), streams.gpu_index(0), lwe_pksk_input, lwe_array_in,
         (uint64_t)compression_params.message_modulus);
   }
 
@@ -192,16 +192,16 @@ __host__ void host_unsigned_integer_div_rem_block_by_block_2_2(
 
         host_negation<Torus>(
             streams.stream(gpu_index), streams.gpu_index(gpu_index),
-            (Torus *)out_boolean_block->ptr, (Torus *)out_boolean_block->ptr,
+            out_boolean_block, out_boolean_block,
             radix_params.big_lwe_dimension, 1);
 
         // we calculate encoding because this block works only for
         // message_modulus = 4 and carry_modulus = 4.
         const Torus encoded_scalar = 1ULL << (sizeof(Torus) * 8 - 5);
         host_addition_plaintext_scalar<Torus>(
             streams.stream(gpu_index), streams.gpu_index(gpu_index),
-            (Torus *)out_boolean_block->ptr, (Torus *)out_boolean_block->ptr,
-            encoded_scalar, radix_params.big_lwe_dimension, 1);
+            out_boolean_block, out_boolean_block, encoded_scalar,
+            radix_params.big_lwe_dimension, 1);
       }
     };
 
@@ -289,35 +289,32 @@ __host__ void host_unsigned_integer_div_rem_block_by_block_2_2(
     // c3 = !o3
     copy_radix_ciphertext_slice_async<Torus>(
         streams.stream(0), streams.gpu_index(0), c3, 0, 1, o3, 0, 1);
-    host_negation<Torus>(streams.stream(0), streams.gpu_index(0),
-                         (Torus *)c3->ptr, (Torus *)c3->ptr,
+    host_negation<Torus>(streams.stream(0), streams.gpu_index(0), c3, c3,
                          radix_params.big_lwe_dimension, 1);
     const Torus encoded_scalar = 1ULL << (sizeof(Torus) * 8 - 5);
     host_addition_plaintext_scalar<Torus>(
-        streams.stream(0), streams.gpu_index(0), (Torus *)c3->ptr,
-        (Torus *)c3->ptr, encoded_scalar, radix_params.big_lwe_dimension, 1);
+        streams.stream(0), streams.gpu_index(0), c3, c3, encoded_scalar,
+        radix_params.big_lwe_dimension, 1);
 
     // c2 = !o2 + o3
     copy_radix_ciphertext_slice_async<Torus>(
         streams.stream(1), streams.gpu_index(1), c2, 0, 1, o2, 0, 1);
-    host_negation<Torus>(streams.stream(1), streams.gpu_index(1),
-                         (Torus *)c2->ptr, (Torus *)c2->ptr,
+    host_negation<Torus>(streams.stream(1), streams.gpu_index(1), c2, c2,
                          radix_params.big_lwe_dimension, 1);
     host_addition_plaintext_scalar<Torus>(
-        streams.stream(1), streams.gpu_index(1), (Torus *)c2->ptr,
-        (Torus *)c2->ptr, encoded_scalar, radix_params.big_lwe_dimension, 1);
+        streams.stream(1), streams.gpu_index(1), c2, c2, encoded_scalar,
+        radix_params.big_lwe_dimension, 1);
     host_addition<Torus>(streams.stream(1), streams.gpu_index(1), c2, c2,
                          o3_gpu_1, 1, 4, 4);
 
     // c1 = !o1 + o2
     copy_radix_ciphertext_slice_async<Torus>(
         streams.stream(2), streams.gpu_index(2), c1, 0, 1, o1, 0, 1);
-    host_negation<Torus>(streams.stream(2), streams.gpu_index(2),
-                         (Torus *)c1->ptr, (Torus *)c1->ptr,
+    host_negation<Torus>(streams.stream(2), streams.gpu_index(2), c1, c1,
                          radix_params.big_lwe_dimension, 1);
     host_addition_plaintext_scalar<Torus>(
-        streams.stream(2), streams.gpu_index(2), (Torus *)c1->ptr,
-        (Torus *)c1->ptr, encoded_scalar, radix_params.big_lwe_dimension, 1);
+        streams.stream(2), streams.gpu_index(2), c1, c1, encoded_scalar,
+        radix_params.big_lwe_dimension, 1);
     host_addition<Torus>(streams.stream(2), streams.gpu_index(2), c1, c1,
                          o2_gpu_2, 1, 4, 4);
 
@@ -330,10 +327,9 @@ __host__ void host_unsigned_integer_div_rem_block_by_block_2_2(
                                   CudaRadixCiphertextFFI *cx,
                                   CudaRadixCiphertextFFI *rx,
                                   int_radix_lut<Torus> *lut, Torus factor) {
-      auto rx_list = to_lwe_ciphertext_list(rx);
       host_cleartext_multiplication<Torus>(streams.stream(gpu_index),
                                            streams.gpu_index(gpu_index),
-                                           (Torus *)rx->ptr, &rx_list, factor);
+                                           rx, rx, factor);
       host_add_the_same_block_to_all_blocks<Torus>(streams.stream(gpu_index),
                                                    streams.gpu_index(gpu_index),
                                                    rx, rx, cx, 4, 4);
@@ -954,7 +950,7 @@ __host__ void host_integer_div_rem(
     int_mem_ptr->sub_streams_1.synchronize();
     int_mem_ptr->sub_streams_2.synchronize();
 
-    host_negation<Torus>(
+    host_integer_negation<Torus>(
         int_mem_ptr->sub_streams_1, int_mem_ptr->negated_quotient, quotient,
         radix_params.message_modulus, radix_params.carry_modulus, num_blocks);
 
@@ -965,7 +961,7 @@ __host__ void host_integer_div_rem(
                                        nullptr, int_mem_ptr->scp_mem_1, bsks,
                                        ksks, requested_flag, uses_carry);
 
-    host_negation<Torus>(
+    host_integer_negation<Torus>(
         int_mem_ptr->sub_streams_2, int_mem_ptr->negated_remainder, remainder,
         radix_params.message_modulus, radix_params.carry_modulus, num_blocks);
 
 
@@ -2267,14 +2267,13 @@ void host_single_borrow_propagate(CudaStreams streams,
       streams, borrow_states, params, mem->prop_simu_group_carries_mem, bsks,
       ksks, num_radix_blocks, num_groups);
 
-  auto shifted_blocks =
-      (Torus *)mem->shifted_blocks_borrow_state_mem->shifted_blocks->ptr;
   auto prepared_blocks = mem->prop_simu_group_carries_mem->prepared_blocks;
-  auto simulators = (Torus *)mem->prop_simu_group_carries_mem->simulators->ptr;
 
-  host_subtraction<Torus>(streams.stream(0), streams.gpu_index(0),
-                          (Torus *)prepared_blocks->ptr, shifted_blocks,
-                          simulators, big_lwe_dimension, num_radix_blocks);
+  host_subtraction<Torus>(
+      streams.stream(0), streams.gpu_index(0), prepared_blocks,
+      mem->shifted_blocks_borrow_state_mem->shifted_blocks,
+      mem->prop_simu_group_carries_mem->simulators, big_lwe_dimension,
+      num_radix_blocks);
 
   host_add_scalar_one_inplace<Torus>(streams, prepared_blocks, message_modulus,
                                      carry_modulus);
@@ -2318,8 +2317,7 @@ void host_single_borrow_propagate(CudaStreams streams,
 
   auto resolved_carries = mem->prop_simu_group_carries_mem->resolved_carries;
   host_negation<Torus>(sub_streams_2.stream(0), sub_streams_2.gpu_index(0),
-                       (Torus *)resolved_carries->ptr,
-                       (Torus *)resolved_carries->ptr, big_lwe_dimension,
+                       resolved_carries, resolved_carries, big_lwe_dimension,
                        num_groups);
 
   host_radix_sum_in_groups<Torus>(
 
@@ -10,7 +10,7 @@ void cuda_negate_ciphertext_64(CudaStreamsFFI streams,
                  "operations");
 
   auto cuda_streams = CudaStreams(streams);
-  host_negation<uint64_t>(cuda_streams, lwe_array_out, lwe_array_in,
+  host_integer_negation<uint64_t>(cuda_streams, lwe_array_out, lwe_array_in,
                           message_modulus, carry_modulus, num_radix_blocks);
   cuda_synchronize_stream(cuda_streams.stream(0), cuda_streams.gpu_index(0));
 }
@@ -48,7 +48,7 @@ __global__ void device_negation(Torus *output, Torus const *input,
 }
 
 template <typename Torus>
-__host__ void host_negation(CudaStreams streams,
+__host__ void host_integer_negation(CudaStreams streams,
                             CudaRadixCiphertextFFI *lwe_array_out,
                             CudaRadixCiphertextFFI const *lwe_array_in,
                             uint64_t message_modulus, uint64_t carry_modulus,
 
@@ -112,7 +112,7 @@ device_scalar_subtraction_inplace(Torus *lwe_array, Torus *scalar_input,
 
 template <typename Torus>
 __host__ void host_scalar_subtraction_inplace(
-    CudaStreams streams, Torus *lwe_array, Torus *scalar_input,
+    CudaStreams streams, CudaRadixCiphertextFFI *lwe_array, Torus *scalar_input,
     uint32_t lwe_dimension, uint32_t input_lwe_ciphertext_count,
     uint32_t message_modulus, uint32_t carry_modulus) {
   cuda_set_device(streams.gpu_index(0));
@@ -130,7 +130,8 @@ __host__ void host_scalar_subtraction_inplace(
   uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus);
 
   device_scalar_subtraction_inplace<Torus>
-      <<<grid, thds, 0, streams.stream(0)>>>(lwe_array, scalar_input,
+      <<<grid, thds, 0, streams.stream(0)>>>((Torus *)lwe_array->ptr,
+                                             scalar_input,
                                              input_lwe_ciphertext_count,
                                              lwe_dimension, delta);
   check_cuda_error(cudaGetLastError());
 
@@ -63,8 +63,8 @@ __host__ void scalar_compare_radix_blocks(
   // Subtract
   // Here we need the true lwe sub, not the one that comes from shortint.
   host_scalar_subtraction_inplace<Torus>(
-      streams, (Torus *)subtracted_blocks->ptr, scalar_blocks,
-      big_lwe_dimension, num_radix_blocks, message_modulus, carry_modulus);
+      streams, subtracted_blocks, scalar_blocks, big_lwe_dimension,
+      num_radix_blocks, message_modulus, carry_modulus);
 
   // Apply LUT to compare to 0
   auto sign_lut = mem_ptr->eq_buffer->is_non_zero_lut;
Original file line number	Diff line number	Diff line change
`@@ -214,7 +214,7 @@ host_integer_compress(CudaStreams streams,`
`214`	`214`
`215`	`215`	`if constexpr (std::is_same_v<Torus, uint64_t>) {`
`216`	`216`	`lwe_pksk_input = mem_ptr->tmp_lwe;`
`217`		`- host_cleartext_multiplication<Torus>(`
	`217`	`+ host_cleartext_multiplication_unsafe_no_degrees<Torus>(`
`218`	`218`	`streams.stream(0), streams.gpu_index(0), lwe_pksk_input, lwe_array_in,`
`219`	`219`	`(uint64_t)compression_params.message_modulus);`
`220`	`220`	`}`
Original file line number	Diff line number	Diff line change
`@@ -10,7 +10,7 @@ void cuda_negate_ciphertext_64(CudaStreamsFFI streams,`
`10`	`10`	`"operations");`
`11`	`11`
`12`	`12`	`auto cuda_streams = CudaStreams(streams);`
`13`		`- host_negation<uint64_t>(cuda_streams, lwe_array_out, lwe_array_in,`
	`13`	`+ host_integer_negation<uint64_t>(cuda_streams, lwe_array_out, lwe_array_in,`
`14`	`14`	`message_modulus, carry_modulus, num_radix_blocks);`
`15`	`15`	`cuda_synchronize_stream(cuda_streams.stream(0), cuda_streams.gpu_index(0));`
`16`	`16`	`}`
Original file line number	Diff line number	Diff line change
`@@ -48,7 +48,7 @@ __global__ void device_negation(Torus *output, Torus const *input,`
`48`	`48`	`}`
`49`	`49`
`50`	`50`	`template <typename Torus>`
`51`		`-__host__ void host_negation(CudaStreams streams,`
	`51`	`+__host__ void host_integer_negation(CudaStreams streams,`
`52`	`52`	`CudaRadixCiphertextFFI *lwe_array_out,`
`53`	`53`	`CudaRadixCiphertextFFI const *lwe_array_in,`
`54`	`54`	`uint64_t message_modulus, uint64_t carry_modulus,`