Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
112 changes: 93 additions & 19 deletions src/clean_up.cu
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It looks like remove_duplicate_faces() has gone through a significant refactor and now introduces Thrust as a dependency.

Could you share more context on the motivation behind this change? Specifically, which CUB feature is broken or unavailable in CUDA 11.8 that necessitated this switch?

Also, were any alternative approaches considered to avoid adding a new dependency?

Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,59 @@
#include "dtypes.cuh"
#include "shared.h"
#include <cub/cub.cuh>

#if defined(CUDART_VERSION) && (CUDART_VERSION < 12040)
#include <thrust/sort.h>
#include <thrust/sequence.h>
#include <thrust/execution_policy.h>
#endif

namespace cumesh {

#if defined(CUDART_VERSION) && (CUDART_VERSION < 12040)
// Marks faces as 1 (keep) or 0 (remove) by comparing adjacent sorted faces
__global__ void mark_duplicates_from_indices_kernel(
const int* sorted_indices,
const int3* faces,
uint8_t* mask_original,
int n
) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= n) return;

int current_original_idx = sorted_indices[idx];

// The first element in the sorted list is always unique
if (idx == 0) {
mask_original[current_original_idx] = 1;
return;
}

// Compare with the previous element in the sorted list
int prev_original_idx = sorted_indices[idx - 1];

int3 curr_f = faces[current_original_idx];
int3 prev_f = faces[prev_original_idx];

// If identical to previous, it's a duplicate -> mark 0 (remove)
// Otherwise -> mark 1 (keep)
bool is_duplicate = (curr_f.x == prev_f.x && curr_f.y == prev_f.y && curr_f.z == prev_f.z);
mask_original[current_original_idx] = is_duplicate ? 0 : 1;
}

// Comparator for Thrust to sort indices based on face values
struct FaceComparator {
const int3* faces;
FaceComparator(const int3* f) : faces(f) {}

__device__ bool operator()(int i, int j) const {
const int3& a = faces[i];
const int3& b = faces[j];
if (a.x != b.x) return a.x < b.x;
if (a.y != b.y) return a.y < b.y;
return a.z < b.z;
}
};
#endif

static __global__ void copy_vec3f_to_float3_kernel(
const Vec3f* vec3f,
Expand Down Expand Up @@ -152,12 +201,12 @@ void CuMesh::remove_unreferenced_vertices() {
size_t temp_storage_bytes = 0;
CUDA_CHECK(cub::DeviceScan::ExclusiveSum(
nullptr, temp_storage_bytes,
cu_vertex_is_referenced, V+1
cu_vertex_is_referenced,cu_vertex_is_referenced, V+1
));
this->cub_temp_storage.resize(temp_storage_bytes);
CUDA_CHECK(cub::DeviceScan::ExclusiveSum(
this->cub_temp_storage.ptr, temp_storage_bytes,
cu_vertex_is_referenced, V+1
cu_vertex_is_referenced,cu_vertex_is_referenced, V+1
));
int new_num_vertices;
CUDA_CHECK(cudaMemcpy(&new_num_vertices, cu_vertex_is_referenced + V, sizeof(int), cudaMemcpyDeviceToHost));
Expand Down Expand Up @@ -226,7 +275,6 @@ static __global__ void select_first_in_each_group_kernel(
}
}


struct int3_decomposer
{
__host__ __device__ ::cuda::std::tuple<int&, int&, int&> operator()(int3& key) const
Expand All @@ -252,7 +300,38 @@ void CuMesh::remove_duplicate_faces() {
);
CUDA_CHECK(cudaGetLastError());

// Sort all faces globally by their sorted vertex indices
#if defined(CUDART_VERSION) && (CUDART_VERSION < 12040)
// CUDA < 12.4: use Thrust implementation
int *cu_sorted_face_indices;
CUDA_CHECK(cudaMalloc(&cu_sorted_face_indices, F * sizeof(int)));

thrust::sequence(thrust::device,
thrust::device_pointer_cast(cu_sorted_face_indices),
thrust::device_pointer_cast(cu_sorted_face_indices + F));

thrust::sort(thrust::device,
thrust::device_pointer_cast(cu_sorted_face_indices),
thrust::device_pointer_cast(cu_sorted_face_indices + F),
FaceComparator(cu_sorted_faces));

uint8_t* cu_face_mask_original;
CUDA_CHECK(cudaMalloc(&cu_face_mask_original, F * sizeof(uint8_t)));

mark_duplicates_from_indices_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(
cu_sorted_face_indices,
cu_sorted_faces,
cu_face_mask_original,
(int)F
);
CUDA_CHECK(cudaGetLastError());

CUDA_CHECK(cudaFree(cu_sorted_faces));
CUDA_CHECK(cudaFree(cu_sorted_face_indices));

this->_remove_faces(cu_face_mask_original);
CUDA_CHECK(cudaFree(cu_face_mask_original));
#else
// CUDA >= 12.4: keep existing CUB behavior
size_t temp_storage_bytes = 0;
int *cu_sorted_face_indices;
CUDA_CHECK(cudaMalloc(&cu_sorted_face_indices, F * sizeof(int)));
Expand Down Expand Up @@ -282,7 +361,6 @@ void CuMesh::remove_duplicate_faces() {
CUDA_CHECK(cudaFree(cu_sorted_faces));
CUDA_CHECK(cudaFree(cu_sorted_face_indices));

// Select first in each group of duplicate faces (based on sorted faces)
uint8_t* cu_face_mask_sorted;
CUDA_CHECK(cudaMalloc(&cu_face_mask_sorted, F * sizeof(uint8_t)));
select_first_in_each_group_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(
Expand All @@ -293,27 +371,23 @@ void CuMesh::remove_duplicate_faces() {
CUDA_CHECK(cudaGetLastError());
CUDA_CHECK(cudaFree(cu_sorted_faces_output));

// Map the mask back to original face order using scatter
// scatter: output[indices[i]] = values[i]
// This maps: cu_face_mask_original[original_idx] = cu_face_mask_sorted[sorted_position]
uint8_t* cu_face_mask_original;
CUDA_CHECK(cudaMalloc(&cu_face_mask_original, F * sizeof(uint8_t)));
scatter_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(
cu_sorted_indices_output, // indices: sorted_position -> original_idx
cu_face_mask_sorted, // values: mask at sorted_position
cu_sorted_indices_output,
cu_face_mask_sorted,
F,
cu_face_mask_original // output: mask at original position
cu_face_mask_original
);
CUDA_CHECK(cudaGetLastError());
CUDA_CHECK(cudaFree(cu_face_mask_sorted));
CUDA_CHECK(cudaFree(cu_sorted_indices_output));

// Select faces to keep (preserving original vertex order)
this->_remove_faces(cu_face_mask_original);
CUDA_CHECK(cudaFree(cu_face_mask_original));
#endif
}


static __global__ void mark_degenerate_faces_kernel(
const float3* vertices,
const int3* faces,
Expand Down Expand Up @@ -542,13 +616,13 @@ void CuMesh::fill_holes(float max_hole_perimeter) {
temp_storage_bytes = 0;
CUDA_CHECK(cub::DeviceScan::InclusiveSum(
nullptr, temp_storage_bytes,
cu_loop_bound_loop_ids,
cu_loop_bound_loop_ids,cu_loop_bound_loop_ids,
E
));
this->cub_temp_storage.resize(temp_storage_bytes);
CUDA_CHECK(cub::DeviceScan::InclusiveSum(
this->cub_temp_storage.ptr, temp_storage_bytes,
cu_loop_bound_loop_ids,
cu_loop_bound_loop_ids,cu_loop_bound_loop_ids,
E
));

Expand Down Expand Up @@ -614,13 +688,13 @@ void CuMesh::fill_holes(float max_hole_perimeter) {
temp_storage_bytes = 0;
CUDA_CHECK(cub::DeviceScan::InclusiveSum(
nullptr, temp_storage_bytes,
cu_new_loop_bound_loop_ids,
cu_new_loop_bound_loop_ids,cu_new_loop_bound_loop_ids,
new_num_loop_boundaries
));
this->cub_temp_storage.resize(temp_storage_bytes);
CUDA_CHECK(cub::DeviceScan::InclusiveSum(
this->cub_temp_storage.ptr, temp_storage_bytes,
cu_new_loop_bound_loop_ids,
cu_new_loop_bound_loop_ids,cu_new_loop_bound_loop_ids,
new_num_loop_boundaries
));

Expand Down Expand Up @@ -1216,4 +1290,4 @@ void CuMesh::unify_face_orientations() {
}


} // namespace cumesh
} // namespace cumesh
4 changes: 2 additions & 2 deletions src/connectivity.cu
Original file line number Diff line number Diff line change
Expand Up @@ -1074,13 +1074,13 @@ void CuMesh::get_boundary_loops() {
temp_storage_bytes = 0;
CUDA_CHECK(cub::DeviceScan::ExclusiveSum(
nullptr, temp_storage_bytes,
this->loop_boundaries_offset.ptr,
this->loop_boundaries_offset.ptr,this->loop_boundaries_offset.ptr,
this->num_bound_loops + 1
));
this->cub_temp_storage.resize(temp_storage_bytes);
CUDA_CHECK(cub::DeviceScan::ExclusiveSum(
this->cub_temp_storage.ptr, temp_storage_bytes,
this->loop_boundaries_offset.ptr,
this->loop_boundaries_offset.ptr,this->loop_boundaries_offset.ptr,
this->num_bound_loops + 1
));
}
Expand Down
4 changes: 2 additions & 2 deletions src/remesh/svox2vert.cu
Original file line number Diff line number Diff line change
Expand Up @@ -180,10 +180,10 @@ torch::Tensor cumesh::get_sparse_voxel_grid_active_vertices(

// Compute the offset
size_t temp_storage_bytes = 0;
cub::DeviceScan::ExclusiveSum(nullptr, temp_storage_bytes, num_vertices, M + 1);
cub::DeviceScan::ExclusiveSum(nullptr, temp_storage_bytes, num_vertices,num_vertices, M + 1);
void* d_temp_storage = nullptr;
CUDA_CHECK(cudaMalloc(&d_temp_storage, temp_storage_bytes));
cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, num_vertices, M + 1);
cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, num_vertices,num_vertices, M + 1);
CUDA_CHECK(cudaFree(d_temp_storage));
int total_vertices;
CUDA_CHECK(cudaMemcpy(&total_vertices, num_vertices + M, sizeof(int), cudaMemcpyDeviceToHost));
Expand Down
4 changes: 2 additions & 2 deletions src/shared.h
Original file line number Diff line number Diff line change
Expand Up @@ -215,13 +215,13 @@ int compress_ids(T* ids, size_t N, Buffer<char>& cub_temp_storage, T* inverse=nu
temp_storage_bytes = 0;
CUDA_CHECK(cub::DeviceScan::ExclusiveSum(
nullptr, temp_storage_bytes,
cu_new_ids,
cu_new_ids,cu_new_ids,
N
));
cub_temp_storage.resize(temp_storage_bytes);
CUDA_CHECK(cub::DeviceScan::ExclusiveSum(
cub_temp_storage.ptr, temp_storage_bytes,
cu_new_ids,
cu_new_ids,cu_new_ids,
N
));

Expand Down
8 changes: 4 additions & 4 deletions src/simplify.cu
Original file line number Diff line number Diff line change
Expand Up @@ -473,12 +473,12 @@ void collapse_edges(
size_t temp_storage_bytes = 0;
CUDA_CHECK(cub::DeviceScan::ExclusiveSum(
nullptr, temp_storage_bytes,
ctx.vertices_map.ptr, V+1
ctx.vertices_map.ptr,ctx.vertices_map.ptr, V+1
));
ctx.cub_temp_storage.resize(temp_storage_bytes);
CUDA_CHECK(cub::DeviceScan::ExclusiveSum(
ctx.cub_temp_storage.ptr, temp_storage_bytes,
ctx.vertices_map.ptr, V+1
ctx.vertices_map.ptr,ctx.vertices_map.ptr, V+1
));
int new_num_vertices;
CUDA_CHECK(cudaMemcpy(&new_num_vertices, ctx.vertices_map.ptr + V, sizeof(int), cudaMemcpyDeviceToHost));
Expand All @@ -497,12 +497,12 @@ void collapse_edges(
// get faces map
CUDA_CHECK(cub::DeviceScan::ExclusiveSum(
nullptr, temp_storage_bytes,
ctx.faces_map.ptr, F+1
ctx.faces_map.ptr,ctx.faces_map.ptr,F+1
));
ctx.cub_temp_storage.resize(temp_storage_bytes);
CUDA_CHECK(cub::DeviceScan::ExclusiveSum(
ctx.cub_temp_storage.ptr, temp_storage_bytes,
ctx.faces_map.ptr, F+1
ctx.faces_map.ptr,ctx.faces_map.ptr, F+1
));
int new_num_faces;
CUDA_CHECK(cudaMemcpy(&new_num_faces, ctx.faces_map.ptr + F, sizeof(int), cudaMemcpyDeviceToHost));
Expand Down