Skip to content
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
159 changes: 90 additions & 69 deletions src/clean_up.cu
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It looks like remove_duplicate_faces() has gone through a significant refactor and now introduces Thrust as a dependency.

Could you share more context on the motivation behind this change? Specifically, which CUB feature is broken or unavailable in CUDA 11.8 that necessitated this switch?

Also, were any alternative approaches considered to avoid adding a new dependency?

Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,56 @@
#include "dtypes.cuh"
#include "shared.h"
#include <cub/cub.cuh>

#include <thrust/device_vector.h>
#include <thrust/sort.h>
#include <thrust/sequence.h>
#include <thrust/execution_policy.h>

namespace cumesh {

// Marks faces as 1 (keep) or 0 (remove) by comparing adjacent sorted faces
__global__ void mark_duplicates_from_indices_kernel(
const int* sorted_indices,
const int3* faces,
uint8_t* mask_original,
int n
) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= n) return;

int current_original_idx = sorted_indices[idx];

// The first element in the sorted list is always unique
if (idx == 0) {
mask_original[current_original_idx] = 1;
return;
}

// Compare with the previous element in the sorted list
int prev_original_idx = sorted_indices[idx - 1];

int3 curr_f = faces[current_original_idx];
int3 prev_f = faces[prev_original_idx];

// If identical to previous, it's a duplicate -> mark 0 (remove)
// Otherwise -> mark 1 (keep)
bool is_duplicate = (curr_f.x == prev_f.x && curr_f.y == prev_f.y && curr_f.z == prev_f.z);
mask_original[current_original_idx] = is_duplicate ? 0 : 1;
}

// Comparator for Thrust to sort indices based on face values
struct FaceComparator {
const int3* faces;
FaceComparator(const int3* f) : faces(f) {}

__device__ bool operator()(int i, int j) const {
const int3& a = faces[i];
const int3& b = faces[j];
if (a.x != b.x) return a.x < b.x;
if (a.y != b.y) return a.y < b.y;
return a.z < b.z;
}
};

static __global__ void copy_vec3f_to_float3_kernel(
const Vec3f* vec3f,
Expand Down Expand Up @@ -152,12 +198,12 @@ void CuMesh::remove_unreferenced_vertices() {
size_t temp_storage_bytes = 0;
CUDA_CHECK(cub::DeviceScan::ExclusiveSum(
nullptr, temp_storage_bytes,
cu_vertex_is_referenced, V+1
cu_vertex_is_referenced,cu_vertex_is_referenced, V+1
));
this->cub_temp_storage.resize(temp_storage_bytes);
CUDA_CHECK(cub::DeviceScan::ExclusiveSum(
this->cub_temp_storage.ptr, temp_storage_bytes,
cu_vertex_is_referenced, V+1
cu_vertex_is_referenced,cu_vertex_is_referenced, V+1
));
int new_num_vertices;
CUDA_CHECK(cudaMemcpy(&new_num_vertices, cu_vertex_is_referenced + V, sizeof(int), cudaMemcpyDeviceToHost));
Expand Down Expand Up @@ -226,94 +272,69 @@ static __global__ void select_first_in_each_group_kernel(
}
}


struct int3_decomposer
struct int3_comparator
{
__host__ __device__ ::cuda::std::tuple<int&, int&, int&> operator()(int3& key) const
__host__ __device__ bool operator()(const int3& a, const int3& b) const
{
return {key.x, key.y, key.z};
// Lexicographical comparison: check x, then y, then z
if (a.x != b.x) return a.x < b.x;
if (a.y != b.y) return a.y < b.y;
return a.z < b.z;
}
};


void CuMesh::remove_duplicate_faces() {
size_t F = this->faces.size;

// Create a temporary sorted copy of faces for duplicate detection
// Do NOT modify the original faces to preserve vertex order and normals
// 1. Create a temporary copy of faces for canonicalization
int3 *cu_sorted_faces;
CUDA_CHECK(cudaMalloc(&cu_sorted_faces, F * sizeof(int3)));
CUDA_CHECK(cudaMemcpy(cu_sorted_faces, this->faces.ptr, F * sizeof(int3), cudaMemcpyDeviceToDevice));

// Sort vertices within each face (in the temporary copy)
sort_faces_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(
cu_sorted_faces,
F
);
// 2. Sort vertices within each face (canonical form)
// (This ensures that face [0,1,2] is treated same as [2,0,1])
sort_faces_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(cu_sorted_faces, F);
CUDA_CHECK(cudaGetLastError());

// Sort all faces globally by their sorted vertex indices
size_t temp_storage_bytes = 0;
// 3. Create indices [0, 1, 2, ... F-1]
int *cu_sorted_face_indices;
CUDA_CHECK(cudaMalloc(&cu_sorted_face_indices, F * sizeof(int)));
arange_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(cu_sorted_face_indices, F);
CUDA_CHECK(cudaGetLastError());

int *cu_sorted_indices_output;
int3 *cu_sorted_faces_output;
CUDA_CHECK(cudaMalloc(&cu_sorted_indices_output, F * sizeof(int)));
CUDA_CHECK(cudaMalloc(&cu_sorted_faces_output, F * sizeof(int3)));

CUDA_CHECK(cub::DeviceRadixSort::SortPairs(
nullptr, temp_storage_bytes,
cu_sorted_faces, cu_sorted_faces_output,
cu_sorted_face_indices, cu_sorted_indices_output,
F,
int3_decomposer{}
));
this->cub_temp_storage.resize(temp_storage_bytes);
CUDA_CHECK(cub::DeviceRadixSort::SortPairs(
this->cub_temp_storage.ptr, temp_storage_bytes,
cu_sorted_faces, cu_sorted_faces_output,
cu_sorted_face_indices, cu_sorted_indices_output,
F,
int3_decomposer{}
));
CUDA_CHECK(cudaFree(cu_sorted_faces));
CUDA_CHECK(cudaFree(cu_sorted_face_indices));

// Select first in each group of duplicate faces (based on sorted faces)
uint8_t* cu_face_mask_sorted;
CUDA_CHECK(cudaMalloc(&cu_face_mask_sorted, F * sizeof(uint8_t)));
select_first_in_each_group_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(
cu_sorted_faces_output,
F,
cu_face_mask_sorted
);
CUDA_CHECK(cudaGetLastError());
CUDA_CHECK(cudaFree(cu_sorted_faces_output));

// Map the mask back to original face order using scatter
// scatter: output[indices[i]] = values[i]
// This maps: cu_face_mask_original[original_idx] = cu_face_mask_sorted[sorted_position]

// Use Thrust to generate sequence
thrust::sequence(thrust::device,
thrust::device_pointer_cast(cu_sorted_face_indices),
thrust::device_pointer_cast(cu_sorted_face_indices + F));

// 4. Sort the INDICES based on the face values using Thrust
// This groups identical faces together in the index list
thrust::sort(thrust::device,
thrust::device_pointer_cast(cu_sorted_face_indices),
thrust::device_pointer_cast(cu_sorted_face_indices + F),
FaceComparator(cu_sorted_faces));

// 5. Mark duplicates
// We traverse the sorted indices. If neighbors are equal, mark original index for removal.
uint8_t* cu_face_mask_original;
CUDA_CHECK(cudaMalloc(&cu_face_mask_original, F * sizeof(uint8_t)));
scatter_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(
cu_sorted_indices_output, // indices: sorted_position -> original_idx
cu_face_mask_sorted, // values: mask at sorted_position
F,
cu_face_mask_original // output: mask at original position

mark_duplicates_from_indices_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(
cu_sorted_face_indices,
cu_sorted_faces,
cu_face_mask_original,
(int)F
);
CUDA_CHECK(cudaGetLastError());
CUDA_CHECK(cudaFree(cu_face_mask_sorted));
CUDA_CHECK(cudaFree(cu_sorted_indices_output));

// Select faces to keep (preserving original vertex order)
// 6. Cleanup temporary memory
CUDA_CHECK(cudaFree(cu_sorted_faces));
CUDA_CHECK(cudaFree(cu_sorted_face_indices));

// 7. Remove faces using the generated mask
this->_remove_faces(cu_face_mask_original);
CUDA_CHECK(cudaFree(cu_face_mask_original));
}


static __global__ void mark_degenerate_faces_kernel(
const float3* vertices,
const int3* faces,
Expand Down Expand Up @@ -542,13 +563,13 @@ void CuMesh::fill_holes(float max_hole_perimeter) {
temp_storage_bytes = 0;
CUDA_CHECK(cub::DeviceScan::InclusiveSum(
nullptr, temp_storage_bytes,
cu_loop_bound_loop_ids,
cu_loop_bound_loop_ids,cu_loop_bound_loop_ids,
E
));
this->cub_temp_storage.resize(temp_storage_bytes);
CUDA_CHECK(cub::DeviceScan::InclusiveSum(
this->cub_temp_storage.ptr, temp_storage_bytes,
cu_loop_bound_loop_ids,
cu_loop_bound_loop_ids,cu_loop_bound_loop_ids,
E
));

Expand Down Expand Up @@ -614,13 +635,13 @@ void CuMesh::fill_holes(float max_hole_perimeter) {
temp_storage_bytes = 0;
CUDA_CHECK(cub::DeviceScan::InclusiveSum(
nullptr, temp_storage_bytes,
cu_new_loop_bound_loop_ids,
cu_new_loop_bound_loop_ids,cu_new_loop_bound_loop_ids,
new_num_loop_boundaries
));
this->cub_temp_storage.resize(temp_storage_bytes);
CUDA_CHECK(cub::DeviceScan::InclusiveSum(
this->cub_temp_storage.ptr, temp_storage_bytes,
cu_new_loop_bound_loop_ids,
cu_new_loop_bound_loop_ids,cu_new_loop_bound_loop_ids,
new_num_loop_boundaries
));

Expand Down
4 changes: 2 additions & 2 deletions src/connectivity.cu
Original file line number Diff line number Diff line change
Expand Up @@ -1074,13 +1074,13 @@ void CuMesh::get_boundary_loops() {
temp_storage_bytes = 0;
CUDA_CHECK(cub::DeviceScan::ExclusiveSum(
nullptr, temp_storage_bytes,
this->loop_boundaries_offset.ptr,
this->loop_boundaries_offset.ptr,this->loop_boundaries_offset.ptr,
this->num_bound_loops + 1
));
this->cub_temp_storage.resize(temp_storage_bytes);
CUDA_CHECK(cub::DeviceScan::ExclusiveSum(
this->cub_temp_storage.ptr, temp_storage_bytes,
this->loop_boundaries_offset.ptr,
this->loop_boundaries_offset.ptr,this->loop_boundaries_offset.ptr,
this->num_bound_loops + 1
));
}
Expand Down
4 changes: 2 additions & 2 deletions src/remesh/svox2vert.cu
Original file line number Diff line number Diff line change
Expand Up @@ -180,10 +180,10 @@ torch::Tensor cumesh::get_sparse_voxel_grid_active_vertices(

// Compute the offset
size_t temp_storage_bytes = 0;
cub::DeviceScan::ExclusiveSum(nullptr, temp_storage_bytes, num_vertices, M + 1);
cub::DeviceScan::ExclusiveSum(nullptr, temp_storage_bytes, num_vertices,num_vertices, M + 1);
void* d_temp_storage = nullptr;
CUDA_CHECK(cudaMalloc(&d_temp_storage, temp_storage_bytes));
cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, num_vertices, M + 1);
cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, num_vertices,num_vertices, M + 1);
CUDA_CHECK(cudaFree(d_temp_storage));
int total_vertices;
CUDA_CHECK(cudaMemcpy(&total_vertices, num_vertices + M, sizeof(int), cudaMemcpyDeviceToHost));
Expand Down
4 changes: 2 additions & 2 deletions src/shared.h
Original file line number Diff line number Diff line change
Expand Up @@ -215,13 +215,13 @@ int compress_ids(T* ids, size_t N, Buffer<char>& cub_temp_storage, T* inverse=nu
temp_storage_bytes = 0;
CUDA_CHECK(cub::DeviceScan::ExclusiveSum(
nullptr, temp_storage_bytes,
cu_new_ids,
cu_new_ids,cu_new_ids,
N
));
cub_temp_storage.resize(temp_storage_bytes);
CUDA_CHECK(cub::DeviceScan::ExclusiveSum(
cub_temp_storage.ptr, temp_storage_bytes,
cu_new_ids,
cu_new_ids,cu_new_ids,
N
));

Expand Down
8 changes: 4 additions & 4 deletions src/simplify.cu
Original file line number Diff line number Diff line change
Expand Up @@ -473,12 +473,12 @@ void collapse_edges(
size_t temp_storage_bytes = 0;
CUDA_CHECK(cub::DeviceScan::ExclusiveSum(
nullptr, temp_storage_bytes,
ctx.vertices_map.ptr, V+1
ctx.vertices_map.ptr,ctx.vertices_map.ptr, V+1
));
ctx.cub_temp_storage.resize(temp_storage_bytes);
CUDA_CHECK(cub::DeviceScan::ExclusiveSum(
ctx.cub_temp_storage.ptr, temp_storage_bytes,
ctx.vertices_map.ptr, V+1
ctx.vertices_map.ptr,ctx.vertices_map.ptr, V+1
));
int new_num_vertices;
CUDA_CHECK(cudaMemcpy(&new_num_vertices, ctx.vertices_map.ptr + V, sizeof(int), cudaMemcpyDeviceToHost));
Expand All @@ -497,12 +497,12 @@ void collapse_edges(
// get faces map
CUDA_CHECK(cub::DeviceScan::ExclusiveSum(
nullptr, temp_storage_bytes,
ctx.faces_map.ptr, F+1
ctx.faces_map.ptr,ctx.faces_map.ptr,F+1
));
ctx.cub_temp_storage.resize(temp_storage_bytes);
CUDA_CHECK(cub::DeviceScan::ExclusiveSum(
ctx.cub_temp_storage.ptr, temp_storage_bytes,
ctx.faces_map.ptr, F+1
ctx.faces_map.ptr,ctx.faces_map.ptr, F+1
));
int new_num_faces;
CUDA_CHECK(cudaMemcpy(&new_num_faces, ctx.faces_map.ptr + F, sizeof(int), cudaMemcpyDeviceToHost));
Expand Down