diff --git a/src/clean_up.cu b/src/clean_up.cu
index 3c12cd7..7deb921 100644
--- a/src/clean_up.cu
+++ b/src/clean_up.cu
@@ -2,10 +2,59 @@
 #include "dtypes.cuh"
 #include "shared.h"
 #include <cub/cub.cuh>
-
+#if defined(CUDART_VERSION) && (CUDART_VERSION < 12040)
+#include <thrust/sort.h>
+#include <thrust/sequence.h>
+#include <thrust/execution_policy.h>
+#endif
 
 namespace cumesh {
 
+#if defined(CUDART_VERSION) && (CUDART_VERSION < 12040)
+// Marks faces as 1 (keep) or 0 (remove) by comparing adjacent sorted faces
+__global__ void mark_duplicates_from_indices_kernel(
+    const int* sorted_indices,
+    const int3* faces,
+    uint8_t* mask_original,
+    int n
+) {
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx >= n) return;
+
+    int current_original_idx = sorted_indices[idx];
+
+    // The first element in the sorted list is always unique
+    if (idx == 0) {
+        mask_original[current_original_idx] = 1;
+        return;
+    }
+
+    // Compare with the previous element in the sorted list
+    int prev_original_idx = sorted_indices[idx - 1];
+    
+    int3 curr_f = faces[current_original_idx];
+    int3 prev_f = faces[prev_original_idx];
+
+    // If identical to previous, it's a duplicate -> mark 0 (remove)
+    // Otherwise -> mark 1 (keep)
+    bool is_duplicate = (curr_f.x == prev_f.x && curr_f.y == prev_f.y && curr_f.z == prev_f.z);
+    mask_original[current_original_idx] = is_duplicate ? 0 : 1;
+}
+
+// Comparator for Thrust to sort indices based on face values
+struct FaceComparator {
+    const int3* faces;
+    FaceComparator(const int3* f) : faces(f) {}
+
+    __device__ bool operator()(int i, int j) const {
+        const int3& a = faces[i];
+        const int3& b = faces[j];
+        if (a.x != b.x) return a.x < b.x;
+        if (a.y != b.y) return a.y < b.y;
+        return a.z < b.z;
+    }
+};
+#endif
 
 static __global__ void copy_vec3f_to_float3_kernel(
     const Vec3f* vec3f,
@@ -152,12 +201,12 @@ void CuMesh::remove_unreferenced_vertices() {
     size_t temp_storage_bytes = 0;
     CUDA_CHECK(cub::DeviceScan::ExclusiveSum(
         nullptr, temp_storage_bytes,
-        cu_vertex_is_referenced, V+1
+        cu_vertex_is_referenced,cu_vertex_is_referenced, V+1
     ));
     this->cub_temp_storage.resize(temp_storage_bytes);
     CUDA_CHECK(cub::DeviceScan::ExclusiveSum(
         this->cub_temp_storage.ptr, temp_storage_bytes,
-        cu_vertex_is_referenced, V+1
+        cu_vertex_is_referenced,cu_vertex_is_referenced, V+1
     ));
     int new_num_vertices;
     CUDA_CHECK(cudaMemcpy(&new_num_vertices, cu_vertex_is_referenced + V, sizeof(int), cudaMemcpyDeviceToHost));
@@ -226,7 +275,6 @@ static __global__ void select_first_in_each_group_kernel(
     }
 }
 
-
 struct int3_decomposer
 {
     __host__ __device__ ::cuda::std::tuple<int&, int&, int&> operator()(int3& key) const
@@ -252,7 +300,38 @@ void CuMesh::remove_duplicate_faces() {
     );
     CUDA_CHECK(cudaGetLastError());
 
-    // Sort all faces globally by their sorted vertex indices
+#if defined(CUDART_VERSION) && (CUDART_VERSION < 12040)
+    // CUDA < 12.4: use Thrust implementation
+    int *cu_sorted_face_indices;
+    CUDA_CHECK(cudaMalloc(&cu_sorted_face_indices, F * sizeof(int)));
+
+    thrust::sequence(thrust::device,
+                     thrust::device_pointer_cast(cu_sorted_face_indices),
+                     thrust::device_pointer_cast(cu_sorted_face_indices + F));
+
+    thrust::sort(thrust::device,
+                 thrust::device_pointer_cast(cu_sorted_face_indices),
+                 thrust::device_pointer_cast(cu_sorted_face_indices + F),
+                 FaceComparator(cu_sorted_faces));
+
+    uint8_t* cu_face_mask_original;
+    CUDA_CHECK(cudaMalloc(&cu_face_mask_original, F * sizeof(uint8_t)));
+
+    mark_duplicates_from_indices_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(
+        cu_sorted_face_indices,
+        cu_sorted_faces,
+        cu_face_mask_original,
+        (int)F
+    );
+    CUDA_CHECK(cudaGetLastError());
+
+    CUDA_CHECK(cudaFree(cu_sorted_faces));
+    CUDA_CHECK(cudaFree(cu_sorted_face_indices));
+
+    this->_remove_faces(cu_face_mask_original);
+    CUDA_CHECK(cudaFree(cu_face_mask_original));
+#else
+    // CUDA >= 12.4: keep existing CUB behavior
     size_t temp_storage_bytes = 0;
     int *cu_sorted_face_indices;
     CUDA_CHECK(cudaMalloc(&cu_sorted_face_indices, F * sizeof(int)));
@@ -282,7 +361,6 @@ void CuMesh::remove_duplicate_faces() {
     CUDA_CHECK(cudaFree(cu_sorted_faces));
     CUDA_CHECK(cudaFree(cu_sorted_face_indices));
 
-    // Select first in each group of duplicate faces (based on sorted faces)
     uint8_t* cu_face_mask_sorted;
     CUDA_CHECK(cudaMalloc(&cu_face_mask_sorted, F * sizeof(uint8_t)));
     select_first_in_each_group_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(
@@ -293,27 +371,23 @@ void CuMesh::remove_duplicate_faces() {
     CUDA_CHECK(cudaGetLastError());
     CUDA_CHECK(cudaFree(cu_sorted_faces_output));
 
-    // Map the mask back to original face order using scatter
-    // scatter: output[indices[i]] = values[i]
-    // This maps: cu_face_mask_original[original_idx] = cu_face_mask_sorted[sorted_position]
     uint8_t* cu_face_mask_original;
     CUDA_CHECK(cudaMalloc(&cu_face_mask_original, F * sizeof(uint8_t)));
     scatter_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(
-        cu_sorted_indices_output,  // indices: sorted_position -> original_idx
-        cu_face_mask_sorted,       // values: mask at sorted_position
+        cu_sorted_indices_output,
+        cu_face_mask_sorted,
         F,
-        cu_face_mask_original      // output: mask at original position
+        cu_face_mask_original
     );
     CUDA_CHECK(cudaGetLastError());
     CUDA_CHECK(cudaFree(cu_face_mask_sorted));
     CUDA_CHECK(cudaFree(cu_sorted_indices_output));
 
-    // Select faces to keep (preserving original vertex order)
     this->_remove_faces(cu_face_mask_original);
     CUDA_CHECK(cudaFree(cu_face_mask_original));
+#endif
 }
 
-
 static __global__ void mark_degenerate_faces_kernel(
     const float3* vertices,
     const int3* faces,
@@ -542,13 +616,13 @@ void CuMesh::fill_holes(float max_hole_perimeter) {
     temp_storage_bytes = 0;
     CUDA_CHECK(cub::DeviceScan::InclusiveSum(
         nullptr, temp_storage_bytes,
-        cu_loop_bound_loop_ids,
+        cu_loop_bound_loop_ids,cu_loop_bound_loop_ids,
         E
     ));
     this->cub_temp_storage.resize(temp_storage_bytes);
     CUDA_CHECK(cub::DeviceScan::InclusiveSum(
         this->cub_temp_storage.ptr, temp_storage_bytes,
-        cu_loop_bound_loop_ids,
+        cu_loop_bound_loop_ids,cu_loop_bound_loop_ids,
         E
     ));
 
@@ -614,13 +688,13 @@ void CuMesh::fill_holes(float max_hole_perimeter) {
     temp_storage_bytes = 0;
     CUDA_CHECK(cub::DeviceScan::InclusiveSum(
         nullptr, temp_storage_bytes,
-        cu_new_loop_bound_loop_ids,
+        cu_new_loop_bound_loop_ids,cu_new_loop_bound_loop_ids,
         new_num_loop_boundaries
     ));
     this->cub_temp_storage.resize(temp_storage_bytes);
     CUDA_CHECK(cub::DeviceScan::InclusiveSum(
         this->cub_temp_storage.ptr, temp_storage_bytes,
-        cu_new_loop_bound_loop_ids,
+        cu_new_loop_bound_loop_ids,cu_new_loop_bound_loop_ids,
         new_num_loop_boundaries
     ));
 
@@ -1216,4 +1290,4 @@ void CuMesh::unify_face_orientations() {
 }
 
 
-} // namespace cumesh
\ No newline at end of file
+} // namespace cumesh
diff --git a/src/connectivity.cu b/src/connectivity.cu
index 6e2f5fe..c061163 100644
--- a/src/connectivity.cu
+++ b/src/connectivity.cu
@@ -1074,13 +1074,13 @@ void CuMesh::get_boundary_loops() {
     temp_storage_bytes = 0;
     CUDA_CHECK(cub::DeviceScan::ExclusiveSum(
         nullptr, temp_storage_bytes,
-        this->loop_boundaries_offset.ptr,
+        this->loop_boundaries_offset.ptr,this->loop_boundaries_offset.ptr,
         this->num_bound_loops + 1
     ));
     this->cub_temp_storage.resize(temp_storage_bytes);
     CUDA_CHECK(cub::DeviceScan::ExclusiveSum(
         this->cub_temp_storage.ptr, temp_storage_bytes,
-        this->loop_boundaries_offset.ptr,
+        this->loop_boundaries_offset.ptr,this->loop_boundaries_offset.ptr,
         this->num_bound_loops + 1
     ));
 }
diff --git a/src/remesh/svox2vert.cu b/src/remesh/svox2vert.cu
index 6f1d517..4c69981 100644
--- a/src/remesh/svox2vert.cu
+++ b/src/remesh/svox2vert.cu
@@ -180,10 +180,10 @@ torch::Tensor cumesh::get_sparse_voxel_grid_active_vertices(
 
     // Compute the offset
     size_t temp_storage_bytes = 0;
-    cub::DeviceScan::ExclusiveSum(nullptr, temp_storage_bytes, num_vertices, M + 1);
+    cub::DeviceScan::ExclusiveSum(nullptr, temp_storage_bytes, num_vertices,num_vertices, M + 1);
     void* d_temp_storage = nullptr;
     CUDA_CHECK(cudaMalloc(&d_temp_storage, temp_storage_bytes));
-    cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, num_vertices, M + 1);
+    cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, num_vertices,num_vertices, M + 1);
     CUDA_CHECK(cudaFree(d_temp_storage));
     int total_vertices;
     CUDA_CHECK(cudaMemcpy(&total_vertices, num_vertices + M, sizeof(int), cudaMemcpyDeviceToHost));
diff --git a/src/shared.h b/src/shared.h
index 66ecac7..039eb88 100644
--- a/src/shared.h
+++ b/src/shared.h
@@ -215,13 +215,13 @@ int compress_ids(T* ids, size_t N, Buffer<char>& cub_temp_storage, T* inverse=nu
     temp_storage_bytes = 0;
     CUDA_CHECK(cub::DeviceScan::ExclusiveSum(
         nullptr, temp_storage_bytes,
-        cu_new_ids,
+        cu_new_ids,cu_new_ids,
         N
     ));
     cub_temp_storage.resize(temp_storage_bytes);
     CUDA_CHECK(cub::DeviceScan::ExclusiveSum(
         cub_temp_storage.ptr, temp_storage_bytes,
-        cu_new_ids,
+        cu_new_ids,cu_new_ids,
         N
     ));
     
diff --git a/src/simplify.cu b/src/simplify.cu
index 9efde9e..b83fc38 100644
--- a/src/simplify.cu
+++ b/src/simplify.cu
@@ -473,12 +473,12 @@ void collapse_edges(
     size_t temp_storage_bytes = 0;
     CUDA_CHECK(cub::DeviceScan::ExclusiveSum(
         nullptr, temp_storage_bytes,
-        ctx.vertices_map.ptr, V+1
+        ctx.vertices_map.ptr,ctx.vertices_map.ptr, V+1
     ));
     ctx.cub_temp_storage.resize(temp_storage_bytes);
     CUDA_CHECK(cub::DeviceScan::ExclusiveSum(
         ctx.cub_temp_storage.ptr, temp_storage_bytes,
-        ctx.vertices_map.ptr, V+1
+        ctx.vertices_map.ptr,ctx.vertices_map.ptr, V+1
     ));
     int new_num_vertices;
     CUDA_CHECK(cudaMemcpy(&new_num_vertices, ctx.vertices_map.ptr + V, sizeof(int), cudaMemcpyDeviceToHost));
@@ -497,12 +497,12 @@ void collapse_edges(
     // get faces map
     CUDA_CHECK(cub::DeviceScan::ExclusiveSum(
         nullptr, temp_storage_bytes,
-        ctx.faces_map.ptr, F+1
+        ctx.faces_map.ptr,ctx.faces_map.ptr,F+1
     ));
     ctx.cub_temp_storage.resize(temp_storage_bytes);
     CUDA_CHECK(cub::DeviceScan::ExclusiveSum(
         ctx.cub_temp_storage.ptr, temp_storage_bytes,
-        ctx.faces_map.ptr, F+1
+        ctx.faces_map.ptr,ctx.faces_map.ptr, F+1
     ));
     int new_num_faces;
     CUDA_CHECK(cudaMemcpy(&new_num_faces, ctx.faces_map.ptr + F, sizeof(int), cudaMemcpyDeviceToHost));