JeffreyXiang · cuzelac · Mar 8, 2026
diff --git a/src/atlas.cu b/src/atlas.cu
diff --git a/src/clean_up.cu b/src/clean_up.cu
diff --git a/src/connectivity.cu b/src/connectivity.cu
diff --git a/src/geometry.cu b/src/geometry.cu
@@ -24,9 +24,10 @@ static __global__ void compute_face_areas_kernel(
 
 
 void CuMesh::compute_face_areas() {
+    cudaStream_t stream = current_stream();
     size_t F = this->faces.size;
     this->face_areas.resize(F);
-    compute_face_areas_kernel<<<(F + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>(
+    compute_face_areas_kernel<<<(F + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(
         this->vertices.ptr,
         this->faces.ptr,
         F,
@@ -57,9 +58,10 @@ static __global__ void compute_face_normals_kernel(
 
 
 void CuMesh::compute_face_normals() {
+    cudaStream_t stream = current_stream();
     size_t F = this->faces.size;
     this->face_normals.resize(F);
-    compute_face_normals_kernel<<<(F + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>(
+    compute_face_normals_kernel<<<(F + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(
         this->vertices.ptr,
         this->faces.ptr,
         F,
@@ -113,9 +115,10 @@ void CuMesh::compute_vertex_normals() {
         this->get_vertex_face_adjacency();
     }
 
+    cudaStream_t stream = current_stream();
     size_t V = this->vertices.size;
     this->vertex_normals.resize(V);
-    compute_vertex_normals_kernel<<<(V + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>(
+    compute_vertex_normals_kernel<<<(V + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(
         this->vertices.ptr,
         this->faces.ptr,
         this->vert2face.ptr,

diff --git a/src/hash/hash.cu b/src/hash/hash.cu
@@ -1,6 +1,7 @@
 #include <torch/extension.h>
 #include <cuda.h>
 #include <cuda_runtime.h>
+#include <c10/cuda/CUDAStream.h>
 
 #include "api.h"
 #include "hash.cuh"
@@ -32,9 +33,12 @@ static void dispatch_hashmap_insert_cuda(
     const torch::Tensor& keys,
     const torch::Tensor& values
 ) {
+    cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
     hashmap_insert_cuda_kernel<<<
         (keys.size(0) + BLOCK_SIZE - 1) / BLOCK_SIZE,
-        BLOCK_SIZE
+        BLOCK_SIZE,
+        0,
+        stream
     >>>(
         hashmap_keys.size(0),
         keys.size(0),
@@ -111,9 +115,12 @@ static void dispatch_hashmap_lookup_cuda(
     const torch::Tensor& keys,
     torch::Tensor& values
 ) {
+    cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
     hashmap_lookup_cuda_kernel<<<
         (keys.size(0) + BLOCK_SIZE - 1) / BLOCK_SIZE,
-        BLOCK_SIZE
+        BLOCK_SIZE,
+        0,
+        stream
     >>>(
         hashmap_keys.size(0),
         keys.size(0),
@@ -205,9 +212,12 @@ static void dispatch_hashmap_insert_3d_cuda(
     const torch::Tensor& values,
     int W, int H, int D
 ) {
+    cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
     hashmap_insert_3d_cuda_kernel<<<
         (coords.size(0) + BLOCK_SIZE - 1) / BLOCK_SIZE,
-        BLOCK_SIZE
+        BLOCK_SIZE,
+        0,
+        stream
     >>>(
         hashmap_keys.size(0),
         coords.size(0),
@@ -303,9 +313,12 @@ static void dispatch_hashmap_lookup_3d_cuda(
     torch::Tensor& values,
     int W, int H, int D
 ) {
+    cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
     hashmap_lookup_3d_cuda_kernel<<<
         (coords.size(0) + BLOCK_SIZE - 1) / BLOCK_SIZE,
-        BLOCK_SIZE
+        BLOCK_SIZE,
+        0,
+        stream
     >>>(
         hashmap_keys.size(0),
         coords.size(0),
@@ -395,9 +408,12 @@ static void dispatch_hashmap_insert_3d_idx_as_val_cuda(
     const torch::Tensor& coords,
     int W, int H, int D
 ) {
+    cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
     hashmap_insert_3d_idx_as_val_cuda_kernel<<<
         (coords.size(0) + BLOCK_SIZE - 1) / BLOCK_SIZE,
-        BLOCK_SIZE
+        BLOCK_SIZE,
+        0,
+        stream
     >>>(
         hashmap_keys.size(0),
         coords.size(0),

diff --git a/src/remesh/simple_dual_contour.cu b/src/remesh/simple_dual_contour.cu
@@ -1,6 +1,7 @@
 #include <torch/extension.h>
 #include <cuda.h>
 #include <cuda_runtime.h>
+#include <c10/cuda/CUDAStream.h>
 #include <vector>
 
 #include "api.h"
@@ -181,6 +182,7 @@ std::tuple<torch::Tensor, torch::Tensor> cumesh::simple_dual_contour(
 ) {
     const size_t M = coords.size(0);
     const size_t N_vert = hashmap_keys.size(0);
+    cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
 
     auto vertices = torch::empty({(long)M, 3}, torch::dtype(torch::kFloat32).device(coords.device()));
     auto intersected = torch::empty({(long)M, 3}, torch::dtype(torch::kInt32).device(coords.device()));
@@ -189,7 +191,7 @@ std::tuple<torch::Tensor, torch::Tensor> cumesh::simple_dual_contour(
     dim3 blocks((M + BLOCK_SIZE - 1) / BLOCK_SIZE);
 
     if (hashmap_keys.dtype() == torch::kUInt32) {
-        simple_dual_contour_kernel<<<blocks, threads>>>(
+        simple_dual_contour_kernel<<<blocks, threads, 0, stream>>>(
             N_vert,
             M,
             W, H, D,
@@ -200,9 +202,9 @@ std::tuple<torch::Tensor, torch::Tensor> cumesh::simple_dual_contour(
             vertices.data_ptr<float>(),
             intersected.data_ptr<int32_t>()
         );
-    } 
+    }
     else if (hashmap_keys.dtype() == torch::kUInt64) {
-        simple_dual_contour_kernel<<<blocks, threads>>>(
+        simple_dual_contour_kernel<<<blocks, threads, 0, stream>>>(
             N_vert,
             M,
             W, H, D,
@@ -213,7 +215,7 @@ std::tuple<torch::Tensor, torch::Tensor> cumesh::simple_dual_contour(
             vertices.data_ptr<float>(),
             intersected.data_ptr<int32_t>()
         );
-    } 
+    }
     else {
         TORCH_CHECK(false, "Unsupported hashmap data type");
     }

diff --git a/src/remesh/svox2vert.cu b/src/remesh/svox2vert.cu
@@ -2,6 +2,7 @@
 #include <cuda.h>
 #include <cuda_runtime.h>
 #include <cub/cub.cuh>
+#include <c10/cuda/CUDAStream.h>
 
 #include "api.h"
 #include "../utils.h"
@@ -147,10 +148,11 @@ torch::Tensor cumesh::get_sparse_voxel_grid_active_vertices(
 
     // Get the number of active vertices for each voxel
     size_t N = hashmap_keys.size(0);
+    cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
     int* num_vertices;
     CUDA_CHECK(cudaMalloc(&num_vertices, (M + 1) * sizeof(int)));
     if (hashmap_keys.dtype() == torch::kUInt32) {
-        get_vertex_num<<<(M + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>(
+        get_vertex_num<<<(M + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(
             N,
             M,
             W,
@@ -162,7 +164,7 @@ torch::Tensor cumesh::get_sparse_voxel_grid_active_vertices(
             num_vertices
         );
     } else if (hashmap_keys.dtype() == torch::kUInt64) {
-        get_vertex_num<<<(M + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>(
+        get_vertex_num<<<(M + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(
             N,
             M,
             W,
@@ -180,18 +182,19 @@ torch::Tensor cumesh::get_sparse_voxel_grid_active_vertices(
 
     // Compute the offset
     size_t temp_storage_bytes = 0;
-    cub::DeviceScan::ExclusiveSum(nullptr, temp_storage_bytes, num_vertices, M + 1);
+    cub::DeviceScan::ExclusiveSum(nullptr, temp_storage_bytes, num_vertices, M + 1, stream);
     void* d_temp_storage = nullptr;
     CUDA_CHECK(cudaMalloc(&d_temp_storage, temp_storage_bytes));
-    cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, num_vertices, M + 1);
-    CUDA_CHECK(cudaFree(d_temp_storage));
+    cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, num_vertices, M + 1, stream);
     int total_vertices;
-    CUDA_CHECK(cudaMemcpy(&total_vertices, num_vertices + M, sizeof(int), cudaMemcpyDeviceToHost));
+    CUDA_CHECK(cudaMemcpyAsync(&total_vertices, num_vertices + M, sizeof(int), cudaMemcpyDeviceToHost, stream));
+    CUDA_CHECK(cudaStreamSynchronize(stream));
+    CUDA_CHECK(cudaFree(d_temp_storage));
 
     // Set the active vertices for each voxel
     auto vertices = torch::empty({total_vertices, 3}, torch::dtype(torch::kInt32).device(hashmap_keys.device()));
     if (hashmap_keys.dtype() == torch::kUInt32) {
-        set_vertex<<<(M + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>(
+        set_vertex<<<(M + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(
             N,
             M,
             W,
@@ -205,7 +208,7 @@ torch::Tensor cumesh::get_sparse_voxel_grid_active_vertices(
         );
     }
     else if (hashmap_keys.dtype() == torch::kUInt64) {
-        set_vertex<<<(M + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>(
+        set_vertex<<<(M + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(
             N,
             M,
             W,
@@ -220,7 +223,8 @@ torch::Tensor cumesh::get_sparse_voxel_grid_active_vertices(
     }
     CUDA_CHECK(cudaGetLastError());
 
-    // Free the temporary memory
+    // Free the temporary memory — sync stream first so set_vertex kernel is done
+    CUDA_CHECK(cudaStreamSynchronize(stream));
     CUDA_CHECK(cudaFree(num_vertices));
 
     return vertices;