diff --git a/src/clean_up.cu b/src/clean_up.cu index 3c12cd7..7deb921 100644 --- a/src/clean_up.cu +++ b/src/clean_up.cu @@ -2,10 +2,59 @@ #include "dtypes.cuh" #include "shared.h" #include - +#if defined(CUDART_VERSION) && (CUDART_VERSION < 12040) +#include +#include +#include +#endif namespace cumesh { +#if defined(CUDART_VERSION) && (CUDART_VERSION < 12040) +// Marks faces as 1 (keep) or 0 (remove) by comparing adjacent sorted faces +__global__ void mark_duplicates_from_indices_kernel( + const int* sorted_indices, + const int3* faces, + uint8_t* mask_original, + int n +) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= n) return; + + int current_original_idx = sorted_indices[idx]; + + // The first element in the sorted list is always unique + if (idx == 0) { + mask_original[current_original_idx] = 1; + return; + } + + // Compare with the previous element in the sorted list + int prev_original_idx = sorted_indices[idx - 1]; + + int3 curr_f = faces[current_original_idx]; + int3 prev_f = faces[prev_original_idx]; + + // If identical to previous, it's a duplicate -> mark 0 (remove) + // Otherwise -> mark 1 (keep) + bool is_duplicate = (curr_f.x == prev_f.x && curr_f.y == prev_f.y && curr_f.z == prev_f.z); + mask_original[current_original_idx] = is_duplicate ? 0 : 1; +} + +// Comparator for Thrust to sort indices based on face values +struct FaceComparator { + const int3* faces; + FaceComparator(const int3* f) : faces(f) {} + + __device__ bool operator()(int i, int j) const { + const int3& a = faces[i]; + const int3& b = faces[j]; + if (a.x != b.x) return a.x < b.x; + if (a.y != b.y) return a.y < b.y; + return a.z < b.z; + } +}; +#endif static __global__ void copy_vec3f_to_float3_kernel( const Vec3f* vec3f, @@ -152,12 +201,12 @@ void CuMesh::remove_unreferenced_vertices() { size_t temp_storage_bytes = 0; CUDA_CHECK(cub::DeviceScan::ExclusiveSum( nullptr, temp_storage_bytes, - cu_vertex_is_referenced, V+1 + cu_vertex_is_referenced,cu_vertex_is_referenced, V+1 )); this->cub_temp_storage.resize(temp_storage_bytes); CUDA_CHECK(cub::DeviceScan::ExclusiveSum( this->cub_temp_storage.ptr, temp_storage_bytes, - cu_vertex_is_referenced, V+1 + cu_vertex_is_referenced,cu_vertex_is_referenced, V+1 )); int new_num_vertices; CUDA_CHECK(cudaMemcpy(&new_num_vertices, cu_vertex_is_referenced + V, sizeof(int), cudaMemcpyDeviceToHost)); @@ -226,7 +275,6 @@ static __global__ void select_first_in_each_group_kernel( } } - struct int3_decomposer { __host__ __device__ ::cuda::std::tuple operator()(int3& key) const @@ -252,7 +300,38 @@ void CuMesh::remove_duplicate_faces() { ); CUDA_CHECK(cudaGetLastError()); - // Sort all faces globally by their sorted vertex indices +#if defined(CUDART_VERSION) && (CUDART_VERSION < 12040) + // CUDA < 12.4: use Thrust implementation + int *cu_sorted_face_indices; + CUDA_CHECK(cudaMalloc(&cu_sorted_face_indices, F * sizeof(int))); + + thrust::sequence(thrust::device, + thrust::device_pointer_cast(cu_sorted_face_indices), + thrust::device_pointer_cast(cu_sorted_face_indices + F)); + + thrust::sort(thrust::device, + thrust::device_pointer_cast(cu_sorted_face_indices), + thrust::device_pointer_cast(cu_sorted_face_indices + F), + FaceComparator(cu_sorted_faces)); + + uint8_t* cu_face_mask_original; + CUDA_CHECK(cudaMalloc(&cu_face_mask_original, F * sizeof(uint8_t))); + + mark_duplicates_from_indices_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( + cu_sorted_face_indices, + cu_sorted_faces, + cu_face_mask_original, + (int)F + ); + CUDA_CHECK(cudaGetLastError()); + + CUDA_CHECK(cudaFree(cu_sorted_faces)); + CUDA_CHECK(cudaFree(cu_sorted_face_indices)); + + this->_remove_faces(cu_face_mask_original); + CUDA_CHECK(cudaFree(cu_face_mask_original)); +#else + // CUDA >= 12.4: keep existing CUB behavior size_t temp_storage_bytes = 0; int *cu_sorted_face_indices; CUDA_CHECK(cudaMalloc(&cu_sorted_face_indices, F * sizeof(int))); @@ -282,7 +361,6 @@ void CuMesh::remove_duplicate_faces() { CUDA_CHECK(cudaFree(cu_sorted_faces)); CUDA_CHECK(cudaFree(cu_sorted_face_indices)); - // Select first in each group of duplicate faces (based on sorted faces) uint8_t* cu_face_mask_sorted; CUDA_CHECK(cudaMalloc(&cu_face_mask_sorted, F * sizeof(uint8_t))); select_first_in_each_group_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( @@ -293,27 +371,23 @@ void CuMesh::remove_duplicate_faces() { CUDA_CHECK(cudaGetLastError()); CUDA_CHECK(cudaFree(cu_sorted_faces_output)); - // Map the mask back to original face order using scatter - // scatter: output[indices[i]] = values[i] - // This maps: cu_face_mask_original[original_idx] = cu_face_mask_sorted[sorted_position] uint8_t* cu_face_mask_original; CUDA_CHECK(cudaMalloc(&cu_face_mask_original, F * sizeof(uint8_t))); scatter_kernel<<<(F+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>( - cu_sorted_indices_output, // indices: sorted_position -> original_idx - cu_face_mask_sorted, // values: mask at sorted_position + cu_sorted_indices_output, + cu_face_mask_sorted, F, - cu_face_mask_original // output: mask at original position + cu_face_mask_original ); CUDA_CHECK(cudaGetLastError()); CUDA_CHECK(cudaFree(cu_face_mask_sorted)); CUDA_CHECK(cudaFree(cu_sorted_indices_output)); - // Select faces to keep (preserving original vertex order) this->_remove_faces(cu_face_mask_original); CUDA_CHECK(cudaFree(cu_face_mask_original)); +#endif } - static __global__ void mark_degenerate_faces_kernel( const float3* vertices, const int3* faces, @@ -542,13 +616,13 @@ void CuMesh::fill_holes(float max_hole_perimeter) { temp_storage_bytes = 0; CUDA_CHECK(cub::DeviceScan::InclusiveSum( nullptr, temp_storage_bytes, - cu_loop_bound_loop_ids, + cu_loop_bound_loop_ids,cu_loop_bound_loop_ids, E )); this->cub_temp_storage.resize(temp_storage_bytes); CUDA_CHECK(cub::DeviceScan::InclusiveSum( this->cub_temp_storage.ptr, temp_storage_bytes, - cu_loop_bound_loop_ids, + cu_loop_bound_loop_ids,cu_loop_bound_loop_ids, E )); @@ -614,13 +688,13 @@ void CuMesh::fill_holes(float max_hole_perimeter) { temp_storage_bytes = 0; CUDA_CHECK(cub::DeviceScan::InclusiveSum( nullptr, temp_storage_bytes, - cu_new_loop_bound_loop_ids, + cu_new_loop_bound_loop_ids,cu_new_loop_bound_loop_ids, new_num_loop_boundaries )); this->cub_temp_storage.resize(temp_storage_bytes); CUDA_CHECK(cub::DeviceScan::InclusiveSum( this->cub_temp_storage.ptr, temp_storage_bytes, - cu_new_loop_bound_loop_ids, + cu_new_loop_bound_loop_ids,cu_new_loop_bound_loop_ids, new_num_loop_boundaries )); @@ -1216,4 +1290,4 @@ void CuMesh::unify_face_orientations() { } -} // namespace cumesh \ No newline at end of file +} // namespace cumesh diff --git a/src/connectivity.cu b/src/connectivity.cu index 6e2f5fe..c061163 100644 --- a/src/connectivity.cu +++ b/src/connectivity.cu @@ -1074,13 +1074,13 @@ void CuMesh::get_boundary_loops() { temp_storage_bytes = 0; CUDA_CHECK(cub::DeviceScan::ExclusiveSum( nullptr, temp_storage_bytes, - this->loop_boundaries_offset.ptr, + this->loop_boundaries_offset.ptr,this->loop_boundaries_offset.ptr, this->num_bound_loops + 1 )); this->cub_temp_storage.resize(temp_storage_bytes); CUDA_CHECK(cub::DeviceScan::ExclusiveSum( this->cub_temp_storage.ptr, temp_storage_bytes, - this->loop_boundaries_offset.ptr, + this->loop_boundaries_offset.ptr,this->loop_boundaries_offset.ptr, this->num_bound_loops + 1 )); } diff --git a/src/remesh/svox2vert.cu b/src/remesh/svox2vert.cu index 6f1d517..4c69981 100644 --- a/src/remesh/svox2vert.cu +++ b/src/remesh/svox2vert.cu @@ -180,10 +180,10 @@ torch::Tensor cumesh::get_sparse_voxel_grid_active_vertices( // Compute the offset size_t temp_storage_bytes = 0; - cub::DeviceScan::ExclusiveSum(nullptr, temp_storage_bytes, num_vertices, M + 1); + cub::DeviceScan::ExclusiveSum(nullptr, temp_storage_bytes, num_vertices,num_vertices, M + 1); void* d_temp_storage = nullptr; CUDA_CHECK(cudaMalloc(&d_temp_storage, temp_storage_bytes)); - cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, num_vertices, M + 1); + cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, num_vertices,num_vertices, M + 1); CUDA_CHECK(cudaFree(d_temp_storage)); int total_vertices; CUDA_CHECK(cudaMemcpy(&total_vertices, num_vertices + M, sizeof(int), cudaMemcpyDeviceToHost)); diff --git a/src/shared.h b/src/shared.h index 66ecac7..039eb88 100644 --- a/src/shared.h +++ b/src/shared.h @@ -215,13 +215,13 @@ int compress_ids(T* ids, size_t N, Buffer& cub_temp_storage, T* inverse=nu temp_storage_bytes = 0; CUDA_CHECK(cub::DeviceScan::ExclusiveSum( nullptr, temp_storage_bytes, - cu_new_ids, + cu_new_ids,cu_new_ids, N )); cub_temp_storage.resize(temp_storage_bytes); CUDA_CHECK(cub::DeviceScan::ExclusiveSum( cub_temp_storage.ptr, temp_storage_bytes, - cu_new_ids, + cu_new_ids,cu_new_ids, N )); diff --git a/src/simplify.cu b/src/simplify.cu index 9efde9e..b83fc38 100644 --- a/src/simplify.cu +++ b/src/simplify.cu @@ -473,12 +473,12 @@ void collapse_edges( size_t temp_storage_bytes = 0; CUDA_CHECK(cub::DeviceScan::ExclusiveSum( nullptr, temp_storage_bytes, - ctx.vertices_map.ptr, V+1 + ctx.vertices_map.ptr,ctx.vertices_map.ptr, V+1 )); ctx.cub_temp_storage.resize(temp_storage_bytes); CUDA_CHECK(cub::DeviceScan::ExclusiveSum( ctx.cub_temp_storage.ptr, temp_storage_bytes, - ctx.vertices_map.ptr, V+1 + ctx.vertices_map.ptr,ctx.vertices_map.ptr, V+1 )); int new_num_vertices; CUDA_CHECK(cudaMemcpy(&new_num_vertices, ctx.vertices_map.ptr + V, sizeof(int), cudaMemcpyDeviceToHost)); @@ -497,12 +497,12 @@ void collapse_edges( // get faces map CUDA_CHECK(cub::DeviceScan::ExclusiveSum( nullptr, temp_storage_bytes, - ctx.faces_map.ptr, F+1 + ctx.faces_map.ptr,ctx.faces_map.ptr,F+1 )); ctx.cub_temp_storage.resize(temp_storage_bytes); CUDA_CHECK(cub::DeviceScan::ExclusiveSum( ctx.cub_temp_storage.ptr, temp_storage_bytes, - ctx.faces_map.ptr, F+1 + ctx.faces_map.ptr,ctx.faces_map.ptr, F+1 )); int new_num_faces; CUDA_CHECK(cudaMemcpy(&new_num_faces, ctx.faces_map.ptr + F, sizeof(int), cudaMemcpyDeviceToHost));