fix(gpu): div rem size (use stream GPU indexes instead of hard-coded device ids)

andrei-stoian-zama · andrei-stoian-zama · commit 44732ffd75e5 · 2026-04-20T16:38:43.000+02:00
diff --git a/.github/workflows/gpu_fast_tests.yml b/.github/workflows/gpu_fast_tests.yml
@@ -133,6 +133,10 @@ jobs:
         run: |
           nvidia-cuda-mps-control -d
 
+      - name: Run High Level API Tests
+        run: |
+          make test_high_level_api_fake_multi_gpu
+
       - name: Run core crypto and internal CUDA backend tests
         run: |
           make test_core_crypto_gpu
@@ -147,9 +151,6 @@ jobs:
         run: |
           make test_c_api_gpu
 
-      - name: Run High Level API Tests
-        run: |
-          make test_high_level_api_fake_multi_gpu
 
   slack-notify:
     name: gpu_fast_tests/slack-notify
diff --git a/backends/tfhe-cuda-backend/cuda/include/integer/div_rem.h b/backends/tfhe-cuda-backend/cuda/include/integer/div_rem.h
@@ -381,16 +381,16 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
     bool use_seq = overflow_sub_mem_1->prop_simu_group_carries_mem
                        ->use_sequential_algorithm_to_resolve_group_carries;
 
-    cuda_set_device(0);
+    cuda_set_device(streams.gpu_index(0));
     check_cuda_error(
         cudaEventCreateWithFlags(&create_indexes_done, cudaEventDisableTiming));
     create_indexes_for_overflow_sub(streams.get_ith(0), num_blocks, group_size,
                                     use_seq, allocate_gpu_memory, size_tracker);
     check_cuda_error(cudaEventRecord(create_indexes_done, streams.stream(0)));
-    cuda_set_device(1);
+    cuda_set_device(streams.gpu_index(1));
     check_cuda_error(
         cudaStreamWaitEvent(streams.stream(1), create_indexes_done, 0));
-    cuda_set_device(2);
+    cuda_set_device(streams.gpu_index(2));
     check_cuda_error(
         cudaStreamWaitEvent(streams.stream(2), create_indexes_done, 0));