Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
226 changes: 130 additions & 96 deletions src/atlas.cu

Large diffs are not rendered by default.

231 changes: 133 additions & 98 deletions src/clean_up.cu

Large diffs are not rendered by default.

198 changes: 117 additions & 81 deletions src/connectivity.cu

Large diffs are not rendered by default.

9 changes: 6 additions & 3 deletions src/geometry.cu
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,10 @@ static __global__ void compute_face_areas_kernel(


void CuMesh::compute_face_areas() {
cudaStream_t stream = current_stream();
size_t F = this->faces.size;
this->face_areas.resize(F);
compute_face_areas_kernel<<<(F + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>(
compute_face_areas_kernel<<<(F + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(
this->vertices.ptr,
this->faces.ptr,
F,
Expand Down Expand Up @@ -57,9 +58,10 @@ static __global__ void compute_face_normals_kernel(


void CuMesh::compute_face_normals() {
cudaStream_t stream = current_stream();
size_t F = this->faces.size;
this->face_normals.resize(F);
compute_face_normals_kernel<<<(F + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>(
compute_face_normals_kernel<<<(F + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(
this->vertices.ptr,
this->faces.ptr,
F,
Expand Down Expand Up @@ -113,9 +115,10 @@ void CuMesh::compute_vertex_normals() {
this->get_vertex_face_adjacency();
}

cudaStream_t stream = current_stream();
size_t V = this->vertices.size;
this->vertex_normals.resize(V);
compute_vertex_normals_kernel<<<(V + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>(
compute_vertex_normals_kernel<<<(V + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(
this->vertices.ptr,
this->faces.ptr,
this->vert2face.ptr,
Expand Down
26 changes: 21 additions & 5 deletions src/hash/hash.cu
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#include <torch/extension.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <c10/cuda/CUDAStream.h>

#include "api.h"
#include "hash.cuh"
Expand Down Expand Up @@ -32,9 +33,12 @@ static void dispatch_hashmap_insert_cuda(
const torch::Tensor& keys,
const torch::Tensor& values
) {
cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
hashmap_insert_cuda_kernel<<<
(keys.size(0) + BLOCK_SIZE - 1) / BLOCK_SIZE,
BLOCK_SIZE
BLOCK_SIZE,
0,
stream
>>>(
hashmap_keys.size(0),
keys.size(0),
Expand Down Expand Up @@ -111,9 +115,12 @@ static void dispatch_hashmap_lookup_cuda(
const torch::Tensor& keys,
torch::Tensor& values
) {
cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
hashmap_lookup_cuda_kernel<<<
(keys.size(0) + BLOCK_SIZE - 1) / BLOCK_SIZE,
BLOCK_SIZE
BLOCK_SIZE,
0,
stream
>>>(
hashmap_keys.size(0),
keys.size(0),
Expand Down Expand Up @@ -205,9 +212,12 @@ static void dispatch_hashmap_insert_3d_cuda(
const torch::Tensor& values,
int W, int H, int D
) {
cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
hashmap_insert_3d_cuda_kernel<<<
(coords.size(0) + BLOCK_SIZE - 1) / BLOCK_SIZE,
BLOCK_SIZE
BLOCK_SIZE,
0,
stream
>>>(
hashmap_keys.size(0),
coords.size(0),
Expand Down Expand Up @@ -303,9 +313,12 @@ static void dispatch_hashmap_lookup_3d_cuda(
torch::Tensor& values,
int W, int H, int D
) {
cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
hashmap_lookup_3d_cuda_kernel<<<
(coords.size(0) + BLOCK_SIZE - 1) / BLOCK_SIZE,
BLOCK_SIZE
BLOCK_SIZE,
0,
stream
>>>(
hashmap_keys.size(0),
coords.size(0),
Expand Down Expand Up @@ -395,9 +408,12 @@ static void dispatch_hashmap_insert_3d_idx_as_val_cuda(
const torch::Tensor& coords,
int W, int H, int D
) {
cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
hashmap_insert_3d_idx_as_val_cuda_kernel<<<
(coords.size(0) + BLOCK_SIZE - 1) / BLOCK_SIZE,
BLOCK_SIZE
BLOCK_SIZE,
0,
stream
>>>(
hashmap_keys.size(0),
coords.size(0),
Expand Down
10 changes: 6 additions & 4 deletions src/remesh/simple_dual_contour.cu
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#include <torch/extension.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <c10/cuda/CUDAStream.h>
#include <vector>

#include "api.h"
Expand Down Expand Up @@ -181,6 +182,7 @@ std::tuple<torch::Tensor, torch::Tensor> cumesh::simple_dual_contour(
) {
const size_t M = coords.size(0);
const size_t N_vert = hashmap_keys.size(0);
cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();

auto vertices = torch::empty({(long)M, 3}, torch::dtype(torch::kFloat32).device(coords.device()));
auto intersected = torch::empty({(long)M, 3}, torch::dtype(torch::kInt32).device(coords.device()));
Expand All @@ -189,7 +191,7 @@ std::tuple<torch::Tensor, torch::Tensor> cumesh::simple_dual_contour(
dim3 blocks((M + BLOCK_SIZE - 1) / BLOCK_SIZE);

if (hashmap_keys.dtype() == torch::kUInt32) {
simple_dual_contour_kernel<<<blocks, threads>>>(
simple_dual_contour_kernel<<<blocks, threads, 0, stream>>>(
N_vert,
M,
W, H, D,
Expand All @@ -200,9 +202,9 @@ std::tuple<torch::Tensor, torch::Tensor> cumesh::simple_dual_contour(
vertices.data_ptr<float>(),
intersected.data_ptr<int32_t>()
);
}
}
else if (hashmap_keys.dtype() == torch::kUInt64) {
simple_dual_contour_kernel<<<blocks, threads>>>(
simple_dual_contour_kernel<<<blocks, threads, 0, stream>>>(
N_vert,
M,
W, H, D,
Expand All @@ -213,7 +215,7 @@ std::tuple<torch::Tensor, torch::Tensor> cumesh::simple_dual_contour(
vertices.data_ptr<float>(),
intersected.data_ptr<int32_t>()
);
}
}
else {
TORCH_CHECK(false, "Unsupported hashmap data type");
}
Expand Down
22 changes: 13 additions & 9 deletions src/remesh/svox2vert.cu
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
#include <cuda.h>
#include <cuda_runtime.h>
#include <cub/cub.cuh>
#include <c10/cuda/CUDAStream.h>

#include "api.h"
#include "../utils.h"
Expand Down Expand Up @@ -147,10 +148,11 @@ torch::Tensor cumesh::get_sparse_voxel_grid_active_vertices(

// Get the number of active vertices for each voxel
size_t N = hashmap_keys.size(0);
cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
int* num_vertices;
CUDA_CHECK(cudaMalloc(&num_vertices, (M + 1) * sizeof(int)));
if (hashmap_keys.dtype() == torch::kUInt32) {
get_vertex_num<<<(M + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>(
get_vertex_num<<<(M + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(
N,
M,
W,
Expand All @@ -162,7 +164,7 @@ torch::Tensor cumesh::get_sparse_voxel_grid_active_vertices(
num_vertices
);
} else if (hashmap_keys.dtype() == torch::kUInt64) {
get_vertex_num<<<(M + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>(
get_vertex_num<<<(M + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(
N,
M,
W,
Expand All @@ -180,18 +182,19 @@ torch::Tensor cumesh::get_sparse_voxel_grid_active_vertices(

// Compute the offset
size_t temp_storage_bytes = 0;
cub::DeviceScan::ExclusiveSum(nullptr, temp_storage_bytes, num_vertices, M + 1);
cub::DeviceScan::ExclusiveSum(nullptr, temp_storage_bytes, num_vertices, M + 1, stream);
void* d_temp_storage = nullptr;
CUDA_CHECK(cudaMalloc(&d_temp_storage, temp_storage_bytes));
cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, num_vertices, M + 1);
CUDA_CHECK(cudaFree(d_temp_storage));
cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, num_vertices, M + 1, stream);
int total_vertices;
CUDA_CHECK(cudaMemcpy(&total_vertices, num_vertices + M, sizeof(int), cudaMemcpyDeviceToHost));
CUDA_CHECK(cudaMemcpyAsync(&total_vertices, num_vertices + M, sizeof(int), cudaMemcpyDeviceToHost, stream));
CUDA_CHECK(cudaStreamSynchronize(stream));
CUDA_CHECK(cudaFree(d_temp_storage));

// Set the active vertices for each voxel
auto vertices = torch::empty({total_vertices, 3}, torch::dtype(torch::kInt32).device(hashmap_keys.device()));
if (hashmap_keys.dtype() == torch::kUInt32) {
set_vertex<<<(M + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>(
set_vertex<<<(M + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(
N,
M,
W,
Expand All @@ -205,7 +208,7 @@ torch::Tensor cumesh::get_sparse_voxel_grid_active_vertices(
);
}
else if (hashmap_keys.dtype() == torch::kUInt64) {
set_vertex<<<(M + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>(
set_vertex<<<(M + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(
N,
M,
W,
Expand All @@ -220,7 +223,8 @@ torch::Tensor cumesh::get_sparse_voxel_grid_active_vertices(
}
CUDA_CHECK(cudaGetLastError());

// Free the temporary memory
// Free the temporary memory — sync stream first so set_vertex kernel is done
CUDA_CHECK(cudaStreamSynchronize(stream));
CUDA_CHECK(cudaFree(num_vertices));

return vertices;
Expand Down
Loading