diff --git a/o-voxel/notebooks/test_shape.glb b/o-voxel/notebooks/test_shape.glb new file mode 100644 index 00000000..671c1b89 Binary files /dev/null and b/o-voxel/notebooks/test_shape.glb differ diff --git a/o-voxel/notebooks/verify_decoded_shape.ipynb b/o-voxel/notebooks/verify_decoded_shape.ipynb new file mode 100644 index 00000000..d2cc569c --- /dev/null +++ b/o-voxel/notebooks/verify_decoded_shape.ipynb @@ -0,0 +1,1313 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "e3d7eccb", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import sys\n", + "import zipfile\n", + "from pathlib import Path\n", + "\n", + "import numpy as np\n", + "import o_voxel\n", + "import torch\n", + "import trimesh" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "51a80d2b", + "metadata": {}, + "outputs": [], + "source": [ + "mesh_path = './test_shape.glb'\n", + "mesh = trimesh.load_mesh(mesh_path)" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "c4455691", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "scene = trimesh.Scene([mesh])\n", + "scene.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "3344e227", + "metadata": {}, + "outputs": [], + "source": [ + "verts = torch.from_numpy(np.array(mesh.vertices)).cuda()\n", + "faces = torch.from_numpy(np.array(mesh.faces)).cuda()\n", + "\n", + "vmin = verts.min(dim=0).values\n", + "vmax = verts.max(dim=0).values\n", + "center = 0.5 * (vmin + vmax)\n", + "scale = 0.99999 / (vmax - vmin).max()\n", + "\n", + "vertices = (verts - center) * scale\n", + "\n", + "INPUT_RES = 1024\n", + "AABB = [[-0.5, -0.5, -0.5], [0.5, 0.5, 0.5]]" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "648149e0", + "metadata": {}, + "outputs": [], + "source": [ + "coords, dual_vertices, intersected = o_voxel.convert.mesh_to_flexible_dual_grid(\n", + " vertices=verts,\n", + " faces=faces,\n", + " voxel_size=1 / INPUT_RES,\n", + " grid_size=INPUT_RES,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "54c84976", + "metadata": {}, + "outputs": [], + "source": [ + "verts_decoded, faces_decoded = o_voxel.convert.flexible_dual_grid_to_mesh(\n", + " coords=coords,\n", + " dual_vertices=dual_vertices,\n", + " intersected_flag=intersected,\n", + " split_weight=None,\n", + " aabb=AABB,\n", + " voxel_size=1 / INPUT_RES,\n", + " grid_size=INPUT_RES,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "4f060f3d", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Simplifying [thres=1.00e-05]: 100%|██████████| 7010662/7010662 [00:01<00:00, 3897064.44it/s]\n" + ] + } + ], + "source": [ + "import cumesh\n", + "\n", + "cu_mesh = cumesh.CuMesh()\n", + "cu_mesh.init(verts_decoded, faces_decoded)\n", + "\n", + "cu_mesh.remove_duplicate_faces()\n", + 
"cu_mesh.repair_non_manifold_edges()\n", + "cu_mesh.remove_small_connected_components(1e-5)\n", + "cu_mesh.fill_holes(max_hole_perimeter=3e-2)\n", + "\n", + "cu_mesh.simplify(200_000, verbose=True)\n", + "\n", + "cu_mesh.remove_duplicate_faces()\n", + "cu_mesh.repair_non_manifold_edges()\n", + "cu_mesh.remove_small_connected_components(1e-5)\n", + "cu_mesh.fill_holes(max_hole_perimeter=3e-2)\n", + "cu_mesh.unify_face_orientations()\n", + "\n", + "vertices_clean, faces_clean = cu_mesh.read()\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "0c510ee8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "mesh_decoded_clean = trimesh.Trimesh(\n", + " vertices=vertices_clean.detach().cpu().numpy(),\n", + " faces=faces_clean.detach().cpu().numpy(),\n", + " process=False,\n", + ")\n", + "\n", + "scene = trimesh.Scene([mesh_decoded_clean])\n", + "scene.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "71d17350", + "metadata": {}, + "outputs": [], + "source": [ + "coords_cpu, dual_vertices_cpu, intersecteds_cpu = o_voxel.convert.mesh_to_flexible_dual_grid(\n", + " vertices=verts.cpu(),\n", + " faces=faces.cpu(),\n", + " voxel_size=1 / INPUT_RES,\n", + " grid_size=INPUT_RES,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "93ff3b69", + "metadata": {}, + "outputs": [], + "source": [ + "verts_cpu_decoded, faces_cpu_decoded = o_voxel.convert.flexible_dual_grid_to_mesh(\n", + " coords=coords_cpu.cuda(),\n", + " dual_vertices=dual_vertices_cpu.cuda(),\n", + " intersected_flag=intersecteds_cpu.cuda(),\n", + " split_weight=None,\n", + " aabb=AABB,\n", + " voxel_size=1 / INPUT_RES,\n", + " grid_size=INPUT_RES,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "420dd3e8", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Simplifying [thres=1.00e-05]: 100%|██████████| 7010662/7010662 [00:01<00:00, 5011057.90it/s]\n" + ] + } + ], + "source": [ + "cu_mesh = cumesh.CuMesh()\n", + "cu_mesh.init(verts_cpu_decoded, faces_cpu_decoded)\n", + "\n", + "cu_mesh.remove_duplicate_faces()\n", + "cu_mesh.repair_non_manifold_edges()\n", + "cu_mesh.remove_small_connected_components(1e-5)\n", + "cu_mesh.fill_holes(max_hole_perimeter=3e-2)\n", + "\n", + "cu_mesh.simplify(200_000, verbose=True)\n", + "\n", + "cu_mesh.remove_duplicate_faces()\n", + "cu_mesh.repair_non_manifold_edges()\n", + 
"cu_mesh.remove_small_connected_components(1e-5)\n", + "cu_mesh.fill_holes(max_hole_perimeter=3e-2)\n", + "cu_mesh.unify_face_orientations()\n", + "\n", + "vertices_cpu_clean, faces_cpu_clean = cu_mesh.read()\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "0f235d44", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "mesh_cpu_decoded_clean = trimesh.Trimesh(\n", + " vertices=vertices_cpu_clean.detach().cpu().numpy(),\n", + " faces=faces_cpu_clean.detach().cpu().numpy(),\n", + " process=False,\n", + ")\n", + "\n", + "scene = trimesh.Scene([mesh_decoded_clean])\n", + "scene.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e1153e2f", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "symm-enforce", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.19" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/o-voxel/notebooks/verify_fdg_jit_cpu_gpu.ipynb b/o-voxel/notebooks/verify_fdg_jit_cpu_gpu.ipynb new file mode 100644 index 00000000..99c62bb7 --- /dev/null +++ b/o-voxel/notebooks/verify_fdg_jit_cpu_gpu.ipynb @@ -0,0 +1,708 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "b125f8bb", + "metadata": {}, + "source": [ + "# Verify FDG Build + CPU/GPU Integrity\n", + "\n", + "This notebook performs a fast verification flow for `mesh_to_flexible_dual_grid`:\n", + "\n", + "1. Choose whether to JIT compile the current local sources or directly import an already-installed `o_voxel` package.\n", + "2. Generate Gaussian point cloud on GPU and random triangle indices on GPU.\n", + "3. Run CPU path and GPU path through the selected extension module.\n", + "4. 
Validate output dtypes, shapes, and CPU/GPU consistency.\n", + "\n", + "No local mesh file is loaded.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "c3d9ec7f", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import sys\n", + "import time\n", + "import types\n", + "import importlib\n", + "import importlib.util\n", + "from pathlib import Path\n", + "\n", + "import torch\n", + "from torch.utils.cpp_extension import load\n" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "e8ab51ca", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ROOT = /mnt/nvmefs/Projects/Part Generation/TRELLIS.2-o-voxel-gpu-mod/o-voxel\n", + "USE_JIT = False\n", + "torch = 2.6.0+cu124\n", + "cuda available = True\n", + "cuda device = NVIDIA GeForce RTX 4090\n", + "installed package path = /home/quanta/.conda/envs/symm-enforce/lib/python3.10/site-packages/o_voxel/__init__.py\n", + "installed extension path = /home/quanta/.conda/envs/symm-enforce/lib/python3.10/site-packages/o_voxel/_C.cpython-310-x86_64-linux-gnu.so\n", + "installed API path = /home/quanta/.conda/envs/symm-enforce/lib/python3.10/site-packages/o_voxel/convert/flexible_dual_grid.py\n", + "api_mode = installed\n", + "ext_mod = \n", + "fdg_api = \n" + ] + } + ], + "source": [ + "ROOT = Path(r'/mnt/nvmefs/Projects/Part Generation/TRELLIS.2-o-voxel-gpu-mod/o-voxel').resolve()\n", + "USE_JIT = False\n", + "INSTALLED_IMPORT_NAME = 'o_voxel'\n", + "\n", + "print('ROOT =', ROOT)\n", + "print('USE_JIT =', USE_JIT)\n", + "print('torch =', torch.__version__)\n", + "print('cuda available =', torch.cuda.is_available())\n", + "if torch.cuda.is_available():\n", + " print('cuda device =', torch.cuda.get_device_name(0))\n", + "\n", + "\n", + "def build_jit_extension():\n", + " sources = [\n", + " 'src/hash/hash.cu',\n", + " 'src/convert/flexible_dual_grid.cpp',\n", + " 'src/convert/volumetic_attr.cpp',\n", + " 
'src/convert/mesh_to_flexible_dual_grid_gpu/torch_bindings.cu',\n", + " 'src/convert/mesh_to_flexible_dual_grid_gpu/flexible_dual_grid_gpu.cu',\n", + " 'src/convert/mesh_to_flexible_dual_grid_gpu/intersection_qef.cu',\n", + " 'src/convert/mesh_to_flexible_dual_grid_gpu/voxelize_mesh_oct.cu',\n", + " 'src/convert/mesh_to_flexible_dual_grid_gpu/voxel_traverse_edge_dda.cu',\n", + " 'src/serialize/api.cu',\n", + " 'src/serialize/hilbert.cu',\n", + " 'src/serialize/z_order.cu',\n", + " 'src/io/svo.cpp',\n", + " 'src/io/filter_parent.cpp',\n", + " 'src/io/filter_neighbor.cpp',\n", + " 'src/rasterize/rasterize.cu',\n", + " 'src/ext.cpp',\n", + "]\n", + " full_sources = [str(ROOT / s) for s in sources]\n", + " missing = [s for s in full_sources if not Path(s).exists()]\n", + " if missing:\n", + " raise FileNotFoundError(f'Missing sources: {missing}')\n", + "\n", + " build_dir = ROOT / '.verify_build'\n", + " build_dir.mkdir(parents=True, exist_ok=True)\n", + "\n", + " unique_suffix = f\"{os.getpid()}_{time.time_ns()}_{os.urandom(4).hex()}\"\n", + " mod_name = f\"o_voxel_verify_{unique_suffix}\"\n", + "\n", + " max_jobs = max(1, os.cpu_count() or 1)\n", + " os.environ['MAX_JOBS'] = str(max_jobs)\n", + " print('MAX_JOBS =', os.environ['MAX_JOBS'])\n", + " print('JIT module name =', mod_name)\n", + "\n", + " ext_mod = load(\n", + " name=mod_name,\n", + " sources=full_sources,\n", + " extra_include_paths=[str(ROOT / 'third_party/eigen')],\n", + " extra_cflags=['-O3', '-std=c++17'],\n", + " extra_cuda_cflags=['-O3', '-std=c++17', '--expt-relaxed-constexpr'],\n", + " with_cuda=True,\n", + " build_directory=str(build_dir),\n", + " verbose=True,\n", + " )\n", + " print('JIT build/link: OK')\n", + " print('jit module path =', ext_mod.__file__)\n", + " return ext_mod\n", + "\n", + "\n", + "def load_local_flexible_dual_grid(ext_mod):\n", + " pkg = types.ModuleType('o_voxel')\n", + " pkg.__path__ = [str(ROOT / 'o_voxel')]\n", + " pkg._C = ext_mod\n", + " sys.modules['o_voxel'] = 
pkg\n", + " sys.modules['o_voxel._C'] = ext_mod\n", + "\n", + " convert_pkg = types.ModuleType('o_voxel.convert')\n", + " convert_pkg.__path__ = [str(ROOT / 'o_voxel' / 'convert')]\n", + " sys.modules['o_voxel.convert'] = convert_pkg\n", + "\n", + " spec = importlib.util.spec_from_file_location(\n", + " 'o_voxel.convert.flexible_dual_grid',\n", + " ROOT / 'o_voxel' / 'convert' / 'flexible_dual_grid.py',\n", + " )\n", + " mod = importlib.util.module_from_spec(spec)\n", + " sys.modules['o_voxel.convert.flexible_dual_grid'] = mod\n", + " spec.loader.exec_module(mod)\n", + " return mod\n", + "\n", + "\n", + "if USE_JIT:\n", + " ext_mod = build_jit_extension()\n", + " fdg_api = load_local_flexible_dual_grid(ext_mod)\n", + " api_mode = 'jit'\n", + "else:\n", + " installed_pkg = importlib.import_module(INSTALLED_IMPORT_NAME)\n", + " ext_mod = installed_pkg._C\n", + " fdg_api = importlib.import_module(f'{INSTALLED_IMPORT_NAME}.convert.flexible_dual_grid')\n", + " api_mode = 'installed'\n", + " print('installed package path =', getattr(installed_pkg, '__file__', ''))\n", + " print('installed extension path =', getattr(ext_mod, '__file__', ''))\n", + " print('installed API path =', getattr(fdg_api, '__file__', ''))\n", + "\n", + "print('api_mode =', api_mode)\n", + "print('ext_mod =', ext_mod)\n", + "print('fdg_api =', fdg_api)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "3a6b0493", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "has mesh_to_flexible_dual_grid_cpu = True\n", + "has mesh_to_flexible_dual_grid_gpu = True\n", + "has intersection_occ = True\n", + "has intersect_qef = True\n", + "has voxelize_mesh_gpu = True\n", + "has voxelize_edge_gpu = True\n", + "has face_qef = True\n", + "has voxel_traverse_edge_dda_gpu = True\n", + "has boundary_qef = True\n" + ] + } + ], + "source": [ + "print('has mesh_to_flexible_dual_grid_cpu =', hasattr(ext_mod, 'mesh_to_flexible_dual_grid_cpu'))\n", + 
"print('has mesh_to_flexible_dual_grid_gpu =', hasattr(ext_mod, 'mesh_to_flexible_dual_grid_gpu'))\n", + "print('has intersection_occ =', hasattr(fdg_api, 'intersection_occ'))\n", + "print('has intersect_qef =', hasattr(fdg_api, 'intersect_qef'))\n", + "print('has voxelize_mesh_gpu =', hasattr(fdg_api, 'voxelize_mesh_gpu'))\n", + "print('has voxelize_edge_gpu =', hasattr(fdg_api, 'voxelize_edge_gpu'))\n", + "print('has face_qef =', hasattr(fdg_api, 'face_qef'))\n", + "print('has voxel_traverse_edge_dda_gpu =', hasattr(fdg_api, 'voxel_traverse_edge_dda_gpu'))\n", + "print('has boundary_qef =', hasattr(fdg_api, 'boundary_qef'))\n" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "526b81fb", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "vertices_gpu : torch.Size([12000, 3]) torch.float32 cuda:0\n", + "faces_gpu : torch.Size([30000, 3]) torch.int32 cuda:0\n", + "voxel_size : tensor([0.0078, 0.0078, 0.0078], device='cuda:0') torch.float32 cuda:0\n", + "grid_range : torch.Size([2, 3]) torch.int32 cuda:0\n", + "vertices_local: torch.Size([12000, 3]) torch.float32 cuda:0\n" + ] + } + ], + "source": [ + "# Synthetic geometry: Gaussian points + random triangles built on GPU.\n", + "assert torch.cuda.is_available(), 'CUDA is required for this notebook workflow.'\n", + "device = torch.device('cuda:0')\n", + "\n", + "N_VERT = 12000\n", + "N_FACE = 30000\n", + "GRID = 128\n", + "\n", + "assert N_VERT >= 3, 'Need at least 3 vertices to build non-degenerate triangles.'\n", + "\n", + "torch.manual_seed(7)\n", + "vertices_gpu = (torch.randn(N_VERT, 3, device=device, dtype=torch.float32) * 0.20).contiguous()\n", + "\n", + "# # Construct faces with guaranteed distinct vertex ids per triangle.\n", + "# v0 = torch.randint(0, N_VERT, (N_FACE,), device=device, dtype=torch.int64)\n", + "# o1 = torch.randint(1, N_VERT, (N_FACE,), device=device, dtype=torch.int64)\n", + "# v1 = (v0 + o1) % N_VERT # guaranteed v1 != 
v0\n", + "\n", + "# # Draw an index in [0, N_VERT - 2), then map it to [0, N_VERT) excluding {v0, v1}.\n", + "# lo = torch.minimum(v0, v1)\n", + "# hi = torch.maximum(v0, v1)\n", + "# u = torch.randint(0, N_VERT - 2, (N_FACE,), device=device, dtype=torch.int64)\n", + "# v2 = u + (u >= lo).to(torch.int64)\n", + "# v2 = v2 + (v2 >= hi).to(torch.int64)\n", + "\n", + "# Construct faces with guaranteed distinct vertex ids per triangle.\n", + "v0 = torch.randint(0, N_VERT, (N_FACE,), device=device, dtype=torch.int64)\n", + "v1 = torch.randint(0, N_VERT, (N_FACE,), device=device, dtype=torch.int64)\n", + "v2 = torch.randint(0, N_VERT, (N_FACE,), device=device, dtype=torch.int64)\n", + "\n", + "\n", + "faces_gpu = torch.stack([v0, v1, v2], dim=1)\n", + "# assert torch.all(faces_gpu[:, 0] != faces_gpu[:, 1])\n", + "# assert torch.all(faces_gpu[:, 0] != faces_gpu[:, 2])\n", + "# assert torch.all(faces_gpu[:, 1] != faces_gpu[:, 2])\n", + "\n", + "faces_gpu = faces_gpu.to(torch.int32).contiguous()\n", + "aabb = torch.tensor([[-0.5, -0.5, -0.5], [0.5, 0.5, 0.5]], device=device, dtype=torch.float32)\n", + "voxel_size = ((aabb[1] - aabb[0]) / GRID).to(torch.float32).contiguous()\n", + "grid_range = torch.tensor([[0, 0, 0], [GRID, GRID, GRID]], device=device, dtype=torch.int32)\n", + "vertices_local = (vertices_gpu - aabb[0].view(1, 3)).contiguous()\n", + "\n", + "print('vertices_gpu :', vertices_gpu.shape, vertices_gpu.dtype, vertices_gpu.device)\n", + "print('faces_gpu :', faces_gpu.shape, faces_gpu.dtype, faces_gpu.device)\n", + "print('voxel_size :', voxel_size, voxel_size.dtype, voxel_size.device)\n", + "print('grid_range :', grid_range.shape, grid_range.dtype, grid_range.device)\n", + "print('vertices_local:', vertices_local.shape, vertices_local.dtype, vertices_local.device)" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "d2b377ff", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 36, + 
"metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# CPU verification (through JIT module API directly)\n", + "cpu_out = ext_mod.mesh_to_flexible_dual_grid_cpu(\n", + " vertices_local.cpu(),\n", + " faces_gpu.cpu(),\n", + " voxel_size.cpu(),\n", + " grid_range.cpu(),\n", + " 1.0,\n", + " 0.2,\n", + " 1e-2,\n", + " False,\n", + ")\n", + "cpu_ok = True\n", + "\n", + "cpu_ok" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "id": "9e1d8c6c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# GPU verification (through JIT module API directly)\n", + "gpu_out = ext_mod.mesh_to_flexible_dual_grid_gpu(\n", + " vertices_local,\n", + " faces_gpu,\n", + " voxel_size,\n", + " grid_range,\n", + " 1.0,\n", + " 0.2,\n", + " 1e-2,\n", + " 262144,\n", + " 1024,\n", + ")\n", + "torch.cuda.synchronize(device)\n", + "gpu_ok = True\n", + "\n", + "gpu_ok" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "id": "439fc96f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[CPU] coords : (1441050, 3) torch.int32 cpu\n", + "[CPU] dual_vertices: (1441050, 3) torch.float32 cpu\n", + "[CPU] intersected : (1441050, 3) torch.bool cpu\n", + "[GPU] coords : (1441050, 3) torch.int32 cuda:0\n", + "[GPU] dual_vertices: (1441050, 3) torch.float32 cuda:0\n", + "[GPU] intersected : (1441050, 3) torch.bool cuda:0\n", + "voxel_count cpu/gpu = 1441050 1441050\n", + "relative voxel count gap = 0.0\n" + ] + } + ], + "source": [ + "def summarize_out(tag, out):\n", + " coords, dual_vertices, intersected = out\n", + " print(f'[{tag}] coords :', tuple(coords.shape), coords.dtype, coords.device)\n", + " print(f'[{tag}] dual_vertices:', tuple(dual_vertices.shape), dual_vertices.dtype, dual_vertices.device)\n", + " print(f'[{tag}] intersected :', tuple(intersected.shape), 
intersected.dtype, intersected.device)\n", + " assert coords.dim() == 2 and coords.size(1) == 3\n", + " assert dual_vertices.dim() == 2 and dual_vertices.size(1) == 3\n", + " assert intersected.dim() == 2 and intersected.size(1) == 3\n", + "\n", + "if cpu_ok:\n", + " summarize_out('CPU', cpu_out)\n", + "if gpu_ok:\n", + " summarize_out('GPU', gpu_out)\n", + "\n", + "if cpu_ok and gpu_ok:\n", + " c_cpu = int(cpu_out[0].shape[0])\n", + " c_gpu = int(gpu_out[0].shape[0])\n", + " print('voxel_count cpu/gpu =', c_cpu, c_gpu)\n", + " if max(c_cpu, c_gpu) > 0:\n", + " rel = abs(c_cpu - c_gpu) / max(c_cpu, c_gpu)\n", + " print('relative voxel count gap =', rel)" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "id": "88322a94", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "voxel geometry check\n", + " cpu voxel count: 1441050 (duplicate coords: 0 )\n", + " gpu voxel count: 1441050 (duplicate coords: 0 )\n", + " matched voxels : 1441050\n", + " cpu-only voxels: 0\n", + " gpu-only voxels: 0\n", + "\n", + "dual vertices finite-value check on matched voxels\n", + " cpu non-finite elements : 0\n", + " gpu non-finite elements : 0\n", + " either-side non-finite : 0\n", + "\n", + "dual vertices per-element error on matched voxels (finite elements only)\n", + " mean MSE: 1.7465549007056325e-09\n", + " max MSE: 1.5312387404264882e-05\n", + " RMSE : 4.1791805415414274e-05\n", + "\n", + "intersected consistency on matched voxels\n", + " matched rows : 1441050\n", + " equal rows : 1441050\n", + " mismatch rows: 0\n", + " mismatch ratio: 0.0\n" + ] + } + ], + "source": [ + "assert cpu_ok and gpu_ok, 'Run CPU/GPU cells first.'\n", + "\n", + "cpu_coords, cpu_dual, cpu_inter = cpu_out\n", + "gpu_coords, gpu_dual, gpu_inter = gpu_out\n", + "\n", + "cpu_coords_i64 = cpu_coords.to(dtype=torch.int64, device='cpu').contiguous()\n", + "gpu_coords_i64 = gpu_coords.to(dtype=torch.int64, device='cpu').contiguous()\n", + 
"cpu_dual_f32 = cpu_dual.to(dtype=torch.float32, device='cpu').contiguous()\n", + "gpu_dual_f32 = gpu_dual.to(dtype=torch.float32, device='cpu').contiguous()\n", + "cpu_inter_cpu = cpu_inter.to(device='cpu').contiguous()\n", + "gpu_inter_cpu = gpu_inter.to(device='cpu').contiguous()\n", + "\n", + "def build_coord_map(coords_i64):\n", + " # key: (x, y, z) -> row index\n", + " m = {}\n", + " dup = 0\n", + " for i in range(coords_i64.shape[0]):\n", + " k = tuple(int(v) for v in coords_i64[i].tolist())\n", + " if k in m:\n", + " dup += 1\n", + " m[k] = i\n", + " return m, dup\n", + "\n", + "cpu_map, cpu_dup = build_coord_map(cpu_coords_i64)\n", + "gpu_map, gpu_dup = build_coord_map(gpu_coords_i64)\n", + "\n", + "cpu_keys = set(cpu_map.keys())\n", + "gpu_keys = set(gpu_map.keys())\n", + "common_keys = sorted(cpu_keys & gpu_keys)\n", + "only_cpu = sorted(cpu_keys - gpu_keys)\n", + "only_gpu = sorted(gpu_keys - cpu_keys)\n", + "\n", + "print('voxel geometry check')\n", + "print(' cpu voxel count:', len(cpu_keys), '(duplicate coords:', cpu_dup, ')')\n", + "print(' gpu voxel count:', len(gpu_keys), '(duplicate coords:', gpu_dup, ')')\n", + "print(' matched voxels :', len(common_keys))\n", + "print(' cpu-only voxels:', len(only_cpu))\n", + "print(' gpu-only voxels:', len(only_gpu))\n", + "\n", + "if len(common_keys) > 0:\n", + " cpu_idx = torch.tensor([cpu_map[k] for k in common_keys], dtype=torch.long)\n", + " gpu_idx = torch.tensor([gpu_map[k] for k in common_keys], dtype=torch.long)\n", + "\n", + " cpu_dual_match = cpu_dual_f32[cpu_idx]\n", + " gpu_dual_match = gpu_dual_f32[gpu_idx]\n", + "\n", + " cpu_finite = torch.isfinite(cpu_dual_match)\n", + " gpu_finite = torch.isfinite(gpu_dual_match)\n", + " both_finite = cpu_finite & gpu_finite\n", + "\n", + " cpu_nonfinite = int((~cpu_finite).sum().item())\n", + " gpu_nonfinite = int((~gpu_finite).sum().item())\n", + " either_nonfinite = int((~both_finite).sum().item())\n", + "\n", + " print('\\ndual vertices finite-value check 
on matched voxels')\n", + " print(' cpu non-finite elements :', cpu_nonfinite)\n", + " print(' gpu non-finite elements :', gpu_nonfinite)\n", + " print(' either-side non-finite :', either_nonfinite)\n", + "\n", + " if int(both_finite.sum().item()) > 0:\n", + " diff = cpu_dual_match - gpu_dual_match\n", + " sq_err = diff.pow(2)\n", + " sq_err_finite = sq_err[both_finite]\n", + "\n", + " mean_mse = float(sq_err_finite.mean().item())\n", + " max_mse = float(sq_err_finite.max().item())\n", + " rmse = float(torch.sqrt(sq_err_finite.mean()).item())\n", + "\n", + " print('\\ndual vertices per-element error on matched voxels (finite elements only)')\n", + " print(' mean MSE:', mean_mse)\n", + " print(' max MSE:', max_mse)\n", + " print(' RMSE :', rmse)\n", + " else:\n", + " print('\\nNo finite dual-vertex elements to compare.')\n", + "\n", + " cpu_inter_match = cpu_inter_cpu[cpu_idx]\n", + " gpu_inter_match = gpu_inter_cpu[gpu_idx]\n", + " inter_same = (cpu_inter_match == gpu_inter_match).all(dim=1)\n", + " inter_mismatch = int((~inter_same).sum().item())\n", + "\n", + " print('\\nintersected consistency on matched voxels')\n", + " print(' matched rows :', len(common_keys))\n", + " print(' equal rows :', int(inter_same.sum().item()))\n", + " print(' mismatch rows:', inter_mismatch)\n", + " print(' mismatch ratio:', inter_mismatch / len(common_keys))\n", + "else:\n", + " print('\\nNo matched voxels between CPU and GPU outputs; cannot compute dual/intersected comparisons.')" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "id": "010acf78", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "matched voxels = 1441050\n", + "finite dual elements = 4323150 / 4323150\n", + "finite ratio = 1.0\n", + "\n", + "absolute error summary\n", + " max abs err = 0.00391310453414917\n", + " mean abs err = 2.9527145670726895e-06\n", + " p95 abs err = 3.0994415283203125e-06\n", + " p99 abs err = 1.1920928955078125e-05\n", + "\n", + 
"relative error summary (|cpu-gpu| / max(|cpu|, 1e-8))\n", + " max rel err = 187565.953125\n", + " mean rel err = 0.5353795289993286\n", + " p95 rel err = 9.835954188019969e-06\n", + " p99 rel err = 0.00011704320786520839\n", + "\n", + "ratio above thresholds\n", + " > 1e-3: 9227 / 4323150 = 0.002134323352185328\n", + " > 1e-2: 1165 / 4323150 = 0.0002694794305078473\n", + " > 1e-1: 155 / 4323150 = 3.585348646241745e-05\n" + ] + } + ], + "source": [ + "assert cpu_ok and gpu_ok, 'Run CPU/GPU cells first.'\n", + "\n", + "cpu_coords, cpu_dual, cpu_inter = cpu_out\n", + "gpu_coords, gpu_dual, gpu_inter = gpu_out\n", + "\n", + "cpu_coords_i64 = cpu_coords.to(dtype=torch.int64, device='cpu').contiguous()\n", + "gpu_coords_i64 = gpu_coords.to(dtype=torch.int64, device='cpu').contiguous()\n", + "cpu_dual_f32 = cpu_dual.to(dtype=torch.float32, device='cpu').contiguous()\n", + "gpu_dual_f32 = gpu_dual.to(dtype=torch.float32, device='cpu').contiguous()\n", + "\n", + "cpu_map = {tuple(int(v) for v in cpu_coords_i64[i].tolist()): i for i in range(cpu_coords_i64.shape[0])}\n", + "gpu_map = {tuple(int(v) for v in gpu_coords_i64[i].tolist()): i for i in range(gpu_coords_i64.shape[0])}\n", + "\n", + "common_keys = sorted(set(cpu_map.keys()) & set(gpu_map.keys()))\n", + "assert len(common_keys) > 0, 'No matched voxels.'\n", + "\n", + "cpu_idx = torch.tensor([cpu_map[k] for k in common_keys], dtype=torch.long)\n", + "gpu_idx = torch.tensor([gpu_map[k] for k in common_keys], dtype=torch.long)\n", + "\n", + "cpu_dual_match = cpu_dual_f32[cpu_idx]\n", + "gpu_dual_match = gpu_dual_f32[gpu_idx]\n", + "\n", + "both_finite = torch.isfinite(cpu_dual_match) & torch.isfinite(gpu_dual_match)\n", + "finite_count = int(both_finite.sum().item())\n", + "all_count = int(both_finite.numel())\n", + "\n", + "abs_err = (cpu_dual_match - gpu_dual_match).abs()\n", + "abs_err_finite = abs_err[both_finite]\n", + "\n", + "eps = 1e-8\n", + "den = torch.maximum(cpu_dual_match.abs(), 
torch.full_like(cpu_dual_match, eps))\n", + "rel_err = abs_err / den\n", + "rel_err_finite = rel_err[both_finite]\n", + "\n", + "print('matched voxels =', len(common_keys))\n", + "print('finite dual elements =', finite_count, '/', all_count)\n", + "print('finite ratio =', finite_count / all_count)\n", + "\n", + "print('\\nabsolute error summary')\n", + "print(' max abs err =', float(abs_err_finite.max().item()))\n", + "print(' mean abs err =', float(abs_err_finite.mean().item()))\n", + "print(' p95 abs err =', float(torch.quantile(abs_err_finite, 0.95).item()))\n", + "print(' p99 abs err =', float(torch.quantile(abs_err_finite, 0.99).item()))\n", + "\n", + "print('\\nrelative error summary (|cpu-gpu| / max(|cpu|, 1e-8))')\n", + "print(' max rel err =', float(rel_err_finite.max().item()))\n", + "print(' mean rel err =', float(rel_err_finite.mean().item()))\n", + "print(' p95 rel err =', float(torch.quantile(rel_err_finite, 0.95).item()))\n", + "print(' p99 rel err =', float(torch.quantile(rel_err_finite, 0.99).item()))\n", + "\n", + "bad_1e3 = int((rel_err_finite > 1e-3).sum().item())\n", + "bad_1e2 = int((rel_err_finite > 1e-2).sum().item())\n", + "bad_1e1 = int((rel_err_finite > 1e-1).sum().item())\n", + "print('\\nratio above thresholds')\n", + "print(' > 1e-3:', bad_1e3, '/', finite_count, '=', bad_1e3 / finite_count)\n", + "print(' > 1e-2:', bad_1e2, '/', finite_count, '=', bad_1e2 / finite_count)\n", + "print(' > 1e-1:', bad_1e1, '/', finite_count, '=', bad_1e1 / finite_count)" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "id": "0456f91d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "symmetric relative error (|cpu-gpu| / max(|cpu|,|gpu|,1e-8))\n", + " max = 1.0\n", + " p95 = 9.835891432885546e-06\n", + " p99 = 0.00011704320786520839\n", + " > 0.001: 9224 / 4323150 = 0.002133629413737668\n", + " > 0.01: 1155 / 4323150 = 0.0002671663023489816\n", + " > 0.1: 149 / 4323150 = 
3.446560956709807e-05\n", + "\n", + "non-tiny entries (max(|cpu|,|gpu|) > 1e-3)\n", + " count = 4319275\n", + " max rel = 1.0\n", + " p95 rel = 9.817938007472549e-06\n", + " p99 rel = 0.00011523898865561932\n" + ] + } + ], + "source": [ + "assert cpu_ok and gpu_ok\n", + "\n", + "cpu_coords, cpu_dual, _ = cpu_out\n", + "gpu_coords, gpu_dual, _ = gpu_out\n", + "\n", + "cpu_coords_i64 = cpu_coords.to(dtype=torch.int64, device='cpu').contiguous()\n", + "gpu_coords_i64 = gpu_coords.to(dtype=torch.int64, device='cpu').contiguous()\n", + "cpu_dual_f32 = cpu_dual.to(dtype=torch.float32, device='cpu').contiguous()\n", + "gpu_dual_f32 = gpu_dual.to(dtype=torch.float32, device='cpu').contiguous()\n", + "\n", + "cpu_map = {tuple(int(v) for v in cpu_coords_i64[i].tolist()): i for i in range(cpu_coords_i64.shape[0])}\n", + "gpu_map = {tuple(int(v) for v in gpu_coords_i64[i].tolist()): i for i in range(gpu_coords_i64.shape[0])}\n", + "common_keys = sorted(set(cpu_map.keys()) & set(gpu_map.keys()))\n", + "\n", + "cpu_idx = torch.tensor([cpu_map[k] for k in common_keys], dtype=torch.long)\n", + "gpu_idx = torch.tensor([gpu_map[k] for k in common_keys], dtype=torch.long)\n", + "\n", + "cpu_dual_match = cpu_dual_f32[cpu_idx]\n", + "gpu_dual_match = gpu_dual_f32[gpu_idx]\n", + "\n", + "finite = torch.isfinite(cpu_dual_match) & torch.isfinite(gpu_dual_match)\n", + "abs_err = (cpu_dual_match - gpu_dual_match).abs()[finite]\n", + "scale = torch.maximum(cpu_dual_match.abs(), gpu_dual_match.abs())[finite]\n", + "\n", + "# Symmetric relative error; better behaved than cpu-only denominator.\n", + "sym_rel = abs_err / torch.maximum(scale, torch.full_like(scale, 1e-8))\n", + "\n", + "print('symmetric relative error (|cpu-gpu| / max(|cpu|,|gpu|,1e-8))')\n", + "print(' max =', float(sym_rel.max().item()))\n", + "print(' p95 =', float(torch.quantile(sym_rel, 0.95).item()))\n", + "print(' p99 =', float(torch.quantile(sym_rel, 0.99).item()))\n", + "\n", + "for thr in [1e-3, 1e-2, 1e-1]:\n", + " c = 
int((sym_rel > thr).sum().item())\n", + " print(f' > {thr}: {c} / {sym_rel.numel()} = {c / sym_rel.numel()}')\n", + "\n", + "# Report on non-tiny magnitude entries to avoid near-zero blow-up.\n", + "mask_non_tiny = scale > 1e-3\n", + "if int(mask_non_tiny.sum().item()) > 0:\n", + " rel_non_tiny = (abs_err[mask_non_tiny] / scale[mask_non_tiny])\n", + " print('\\nnon-tiny entries (max(|cpu|,|gpu|) > 1e-3)')\n", + " print(' count =', int(rel_non_tiny.numel()))\n", + " print(' max rel =', float(rel_non_tiny.max().item()))\n", + " print(' p95 rel =', float(torch.quantile(rel_non_tiny, 0.95).item()))\n", + " print(' p99 rel =', float(torch.quantile(rel_non_tiny, 0.99).item()))" + ] + }, + { + "cell_type": "markdown", + "id": "c597cddf", + "metadata": {}, + "source": [ + "## How To Read Failures\n", + "\n", + "- If Cell 3 (JIT build) fails with unresolved symbols, native declarations/registrations are ahead of implementations.\n", + "- If CPU passes and GPU fails in Cells 7/8, Python dispatch is correct but GPU binding implementation/build wiring is incomplete.\n", + "- If both pass, the end-to-end API is likely usable for further numeric comparison and profiling." 
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "symm-enforce", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.19" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/o-voxel/notebooks/verify_fdg_stage_profile_jit_cpu_gpu.ipynb b/o-voxel/notebooks/verify_fdg_stage_profile_jit_cpu_gpu.ipynb new file mode 100644 index 00000000..65bb9ed2 --- /dev/null +++ b/o-voxel/notebooks/verify_fdg_stage_profile_jit_cpu_gpu.ipynb @@ -0,0 +1,815 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "261359a5", + "metadata": {}, + "source": [ + "# Verify FDG Stage Profile JIT CPU/GPU\n", + "\n", + "This notebook profiles the flexible dual grid implementation stage by stage:\n", + "\n", + "1. Choose whether to JIT compile the current local sources or directly import an already-installed `o_voxel` package.\n", + "2. Load the selected Python API.\n", + "3. Generate synthetic Gaussian triangle soup directly in the notebook, including a few degenerate triangles.\n", + "4. Time `intersect_qef`, `face_qef`, `boundary_qef`, and the full `mesh_to_flexible_dual_grid` pipeline on both CPU and GPU.\n", + "5. 
Report the residual runtime between the summed three stages and the full pipeline.\n", + "\n", + "No local mesh file is loaded.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "7697cc7f", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import sys\n", + "import time\n", + "import types\n", + "import importlib\n", + "import importlib.util\n", + "from pathlib import Path\n", + "\n", + "import numpy as np\n", + "import torch\n", + "from torch.utils.cpp_extension import load\n", + "\n", + "import pandas as pd\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "ee33f45f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ROOT = /mnt/nvmefs/Projects/Part Generation/TRELLIS.2-o-voxel-gpu-mod/o-voxel\n", + "USE_JIT = False\n", + "torch = 2.6.0+cu124\n", + "cuda available = True\n", + "cuda device = NVIDIA GeForce RTX 4090\n", + "installed package path = /home/quanta/.conda/envs/symm-enforce/lib/python3.10/site-packages/o_voxel/__init__.py\n", + "installed extension path = /home/quanta/.conda/envs/symm-enforce/lib/python3.10/site-packages/o_voxel/_C.cpython-310-x86_64-linux-gnu.so\n", + "installed API path = /home/quanta/.conda/envs/symm-enforce/lib/python3.10/site-packages/o_voxel/convert/flexible_dual_grid.py\n", + "api_mode = installed\n", + "ext_mod = \n", + "fdg_api = \n" + ] + } + ], + "source": [ + "ROOT = Path(r'/mnt/nvmefs/Projects/Part Generation/TRELLIS.2-o-voxel-gpu-mod/o-voxel').resolve()\n", + "USE_JIT = False\n", + "INSTALLED_IMPORT_NAME = 'o_voxel'\n", + "\n", + "print('ROOT =', ROOT)\n", + "print('USE_JIT =', USE_JIT)\n", + "print('torch =', torch.__version__)\n", + "print('cuda available =', torch.cuda.is_available())\n", + "if torch.cuda.is_available():\n", + " print('cuda device =', torch.cuda.get_device_name(0))\n", + "\n", + "\n", + "def build_jit_extension():\n", + " sources = [\n", + " 'src/hash/hash.cu',\n", + " 
'src/convert/flexible_dual_grid.cpp',\n", + " 'src/convert/volumetic_attr.cpp',\n", + " 'src/convert/mesh_to_flexible_dual_grid_gpu/torch_bindings.cu',\n", + " 'src/convert/mesh_to_flexible_dual_grid_gpu/flexible_dual_grid_gpu.cu',\n", + " 'src/convert/mesh_to_flexible_dual_grid_gpu/intersection_qef.cu',\n", + " 'src/convert/mesh_to_flexible_dual_grid_gpu/voxelize_mesh_oct.cu',\n", + " 'src/convert/mesh_to_flexible_dual_grid_gpu/voxel_traverse_edge_dda.cu',\n", + " 'src/serialize/api.cu',\n", + " 'src/serialize/hilbert.cu',\n", + " 'src/serialize/z_order.cu',\n", + " 'src/io/svo.cpp',\n", + " 'src/io/filter_parent.cpp',\n", + " 'src/io/filter_neighbor.cpp',\n", + " 'src/rasterize/rasterize.cu',\n", + " 'src/ext.cpp',\n", + "]\n", + " full_sources = [str(ROOT / s) for s in sources]\n", + " missing = [s for s in full_sources if not Path(s).exists()]\n", + " if missing:\n", + " raise FileNotFoundError(f'Missing sources: {missing}')\n", + "\n", + " build_dir = ROOT / '.verify_build'\n", + " build_dir.mkdir(parents=True, exist_ok=True)\n", + "\n", + " unique_suffix = f\"{os.getpid()}_{time.time_ns()}_{os.urandom(4).hex()}\"\n", + " mod_name = f\"o_voxel_verify_{unique_suffix}\"\n", + "\n", + " max_jobs = max(1, os.cpu_count() or 1)\n", + " os.environ['MAX_JOBS'] = str(max_jobs)\n", + " print('MAX_JOBS =', os.environ['MAX_JOBS'])\n", + " print('JIT module name =', mod_name)\n", + "\n", + " ext_mod = load(\n", + " name=mod_name,\n", + " sources=full_sources,\n", + " extra_include_paths=[str(ROOT / 'third_party/eigen')],\n", + " extra_cflags=['-O3', '-std=c++17'],\n", + " extra_cuda_cflags=['-O3', '-std=c++17', '--expt-relaxed-constexpr'],\n", + " with_cuda=True,\n", + " build_directory=str(build_dir),\n", + " verbose=True,\n", + " )\n", + " print('JIT build/link: OK')\n", + " print('jit module path =', ext_mod.__file__)\n", + " return ext_mod\n", + "\n", + "\n", + "def load_local_flexible_dual_grid(ext_mod):\n", + " pkg = types.ModuleType('o_voxel')\n", + " pkg.__path__ = 
[str(ROOT / 'o_voxel')]\n", + " pkg._C = ext_mod\n", + " sys.modules['o_voxel'] = pkg\n", + " sys.modules['o_voxel._C'] = ext_mod\n", + "\n", + " convert_pkg = types.ModuleType('o_voxel.convert')\n", + " convert_pkg.__path__ = [str(ROOT / 'o_voxel' / 'convert')]\n", + " sys.modules['o_voxel.convert'] = convert_pkg\n", + "\n", + " spec = importlib.util.spec_from_file_location(\n", + " 'o_voxel.convert.flexible_dual_grid',\n", + " ROOT / 'o_voxel' / 'convert' / 'flexible_dual_grid.py',\n", + " )\n", + " mod = importlib.util.module_from_spec(spec)\n", + " sys.modules['o_voxel.convert.flexible_dual_grid'] = mod\n", + " spec.loader.exec_module(mod)\n", + " return mod\n", + "\n", + "\n", + "if USE_JIT:\n", + " ext_mod = build_jit_extension()\n", + " fdg_api = load_local_flexible_dual_grid(ext_mod)\n", + " api_mode = 'jit'\n", + "else:\n", + " installed_pkg = importlib.import_module(INSTALLED_IMPORT_NAME)\n", + " ext_mod = installed_pkg._C\n", + " fdg_api = importlib.import_module(f'{INSTALLED_IMPORT_NAME}.convert.flexible_dual_grid')\n", + " api_mode = 'installed'\n", + " print('installed package path =', getattr(installed_pkg, '__file__', ''))\n", + " print('installed extension path =', getattr(ext_mod, '__file__', ''))\n", + " print('installed API path =', getattr(fdg_api, '__file__', ''))\n", + "\n", + "print('api_mode =', api_mode)\n", + "print('ext_mod =', ext_mod)\n", + "print('fdg_api =', fdg_api)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "1948f45d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "has mesh_to_flexible_dual_grid_cpu = True\n", + "has mesh_to_flexible_dual_grid_gpu = True\n", + "has intersection_occ = True\n", + "has intersect_qef = True\n", + "has voxelize_mesh_gpu = True\n", + "has voxelize_edge_gpu = True\n", + "has face_qef = True\n", + "has voxel_traverse_edge_dda_gpu = True\n", + "has boundary_qef = True\n" + ] + } + ], + "source": [ + "print('has 
mesh_to_flexible_dual_grid_cpu =', hasattr(ext_mod, 'mesh_to_flexible_dual_grid_cpu'))\n", + "print('has mesh_to_flexible_dual_grid_gpu =', hasattr(ext_mod, 'mesh_to_flexible_dual_grid_gpu'))\n", + "print('has intersection_occ =', hasattr(fdg_api, 'intersection_occ'))\n", + "print('has intersect_qef =', hasattr(fdg_api, 'intersect_qef'))\n", + "print('has voxelize_mesh_gpu =', hasattr(fdg_api, 'voxelize_mesh_gpu'))\n", + "print('has voxelize_edge_gpu =', hasattr(fdg_api, 'voxelize_edge_gpu'))\n", + "print('has face_qef =', hasattr(fdg_api, 'face_qef'))\n", + "print('has voxel_traverse_edge_dda_gpu =', hasattr(fdg_api, 'voxel_traverse_edge_dda_gpu'))\n", + "print('has boundary_qef =', hasattr(fdg_api, 'boundary_qef'))\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "70206a67", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "vertices_gpu: torch.Size([12000, 3])\n", + "faces_gpu: torch.Size([36000, 3])\n", + "triangles_gpu: torch.Size([36000, 3, 3])\n", + "voxel_size_gpu: tensor([0.0078, 0.0078, 0.0078], device='cuda:0')\n", + "grid_range_gpu: tensor([[ 0, 0, 0],\n", + " [128, 128, 128]], device='cuda:0', dtype=torch.int32)\n" + ] + } + ], + "source": [ + "assert torch.cuda.is_available(), 'CUDA is required for this notebook.'\n", + "device = torch.device('cuda:0')\n", + "torch.cuda.set_device(device)\n", + "\n", + "GRID = 128\n", + "N_VERT = 12000\n", + "N_FACE = 36000\n", + "FACE_WEIGHT = 1.0\n", + "BOUNDARY_WEIGHT = 0.2\n", + "REGULARIZATION_WEIGHT = 1e-2\n", + "INTERSECT_CHUNK_TRIANGLES = 262144\n", + "BOUNDARY_CHUNK_STEPS = 1024\n", + "WARMUP = 3\n", + "ITERS = 10\n", + "AABB = [[0.0, 0.0, 0.0], [1.0, 1.0, 1.0]]\n", + "\n", + "torch.manual_seed(13)\n", + "vertices_gpu = (0.5 + 0.18 * torch.randn(N_VERT, 3, device=device, dtype=torch.float32)).clamp_(0.0, 1.0)\n", + "vertices_gpu[1] = vertices_gpu[0]\n", + "vertices_gpu[3] = vertices_gpu[2]\n", + "faces_gpu = torch.randint(0, N_VERT, 
(N_FACE, 3), device=device, dtype=torch.int64)\n", + "faces_gpu[:3] = torch.tensor([[0, 0, 1], [2, 3, 3], [4, 4, 4]], device=device, dtype=torch.int64)\n", + "faces_gpu = faces_gpu.to(torch.int32).contiguous()\n", + "\n", + "vertices_cpu = vertices_gpu.cpu().contiguous()\n", + "faces_cpu = faces_gpu.cpu().contiguous()\n", + "\n", + "aabb_gpu = torch.tensor(AABB, dtype=torch.float32, device=device)\n", + "aabb_cpu = aabb_gpu.cpu()\n", + "voxel_size_gpu = ((aabb_gpu[1] - aabb_gpu[0]) / GRID).to(torch.float32).contiguous()\n", + "voxel_size_cpu = voxel_size_gpu.cpu().contiguous()\n", + "grid_range_gpu = torch.tensor([[0, 0, 0], [GRID, GRID, GRID]], dtype=torch.int32, device=device)\n", + "grid_range_cpu = grid_range_gpu.cpu().contiguous()\n", + "\n", + "triangles_gpu = vertices_gpu[faces_gpu.long()].contiguous().to(torch.float32)\n", + "triangles_cpu = triangles_gpu.cpu().contiguous()\n", + "\n", + "print('vertices_gpu:', vertices_gpu.shape)\n", + "print('faces_gpu:', faces_gpu.shape)\n", + "print('triangles_gpu:', triangles_gpu.shape)\n", + "print('voxel_size_gpu:', voxel_size_gpu)\n", + "print('grid_range_gpu:', grid_range_gpu)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "487de893", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "boundaries_gpu: torch.Size([107821, 2, 3])\n" + ] + } + ], + "source": [ + "def build_boundary_segments(vertices: torch.Tensor, faces: torch.Tensor):\n", + " faces_i64 = faces.to(torch.int64).cpu()\n", + " e01 = faces_i64[:, [0, 1]]\n", + " e12 = faces_i64[:, [1, 2]]\n", + " e20 = faces_i64[:, [2, 0]]\n", + " edges = torch.cat([e01, e12, e20], dim=0)\n", + " edges = torch.sort(edges, dim=1).values\n", + " unique_edges, counts = torch.unique(edges, dim=0, return_counts=True)\n", + " boundary_edges = unique_edges[counts == 1]\n", + " boundary_edges = boundary_edges.to(device=vertices.device, dtype=torch.long)\n", + " return 
vertices[boundary_edges].contiguous().to(torch.float32)\n", + "\n", + "boundaries_gpu = build_boundary_segments(vertices_gpu, faces_gpu)\n", + "boundaries_cpu = boundaries_gpu.cpu().contiguous()\n", + "print('boundaries_gpu:', boundaries_gpu.shape)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "c9b3b74b", + "metadata": {}, + "outputs": [], + "source": [ + "def time_call(fn, warmup=WARMUP, iters=ITERS, sync_device=None):\n", + " out = None\n", + " for _ in range(warmup):\n", + " out = fn()\n", + " if sync_device is not None:\n", + " torch.cuda.synchronize(sync_device)\n", + "\n", + " times_ms = []\n", + " for _ in range(iters):\n", + " if sync_device is not None:\n", + " torch.cuda.synchronize(sync_device)\n", + " t0 = time.perf_counter()\n", + " out = fn()\n", + " if sync_device is not None:\n", + " torch.cuda.synchronize(sync_device)\n", + " t1 = time.perf_counter()\n", + " times_ms.append((t1 - t0) * 1000.0)\n", + " return out, np.asarray(times_ms, dtype=np.float64)\n", + "\n", + "\n", + "def summarize_times(device_name, stage_name, times_ms):\n", + " return {\n", + " 'device': device_name,\n", + " 'stage': stage_name,\n", + " 'mean_ms': float(times_ms.mean()),\n", + " 'std_ms': float(times_ms.std(ddof=0)),\n", + " 'min_ms': float(times_ms.min()),\n", + " 'max_ms': float(times_ms.max()),\n", + " }\n" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "a17118c8", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "intersect_qef gpu/cpu voxel counts: 1118854 1118854\n" + ] + } + ], + "source": [ + "(voxels_gpu, mean_sum_gpu, cnt_gpu, intersected_gpu, qefs_gpu), t_intersect_gpu = time_call(\n", + " lambda: fdg_api.intersect_qef(\n", + " triangles_gpu,\n", + " voxel_size_gpu,\n", + " grid_range_gpu,\n", + " chunk_triangles=INTERSECT_CHUNK_TRIANGLES,\n", + " ),\n", + " sync_device=device,\n", + ")\n", + "(voxels_cpu, mean_sum_cpu, cnt_cpu, intersected_cpu, qefs_cpu), 
t_intersect_cpu = time_call(\n", + " lambda: fdg_api.intersect_qef(\n", + " triangles_cpu,\n", + " voxel_size_cpu,\n", + " grid_range_cpu,\n", + " chunk_triangles=INTERSECT_CHUNK_TRIANGLES,\n", + " ),\n", + ")\n", + "print(\"intersect_qef gpu/cpu voxel counts:\", voxels_gpu.shape[0], voxels_cpu.shape[0])" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "74f3428c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "face_qef gpu/cpu shapes: torch.Size([1118854, 4, 4]) torch.Size([1118854, 4, 4])\n" + ] + } + ], + "source": [ + "qefs_face_gpu, t_face_gpu = time_call(\n", + " lambda: fdg_api.face_qef(triangles_gpu, voxel_size_gpu, grid_range_gpu, voxels_gpu),\n", + " sync_device=device,\n", + ")\n", + "qefs_face_cpu, t_face_cpu = time_call(\n", + " lambda: fdg_api.face_qef(triangles_cpu, voxel_size_cpu, grid_range_cpu, voxels_cpu),\n", + ")\n", + "print('face_qef gpu/cpu shapes:', qefs_face_gpu.shape, qefs_face_cpu.shape)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "49e5fbcf", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "boundary_qef gpu/cpu shapes: torch.Size([1118854, 4, 4]) torch.Size([1118854, 4, 4])\n" + ] + } + ], + "source": [ + "qefs_boundary_gpu, t_boundary_gpu = time_call(\n", + " lambda: fdg_api.boundary_qef(boundaries_gpu, voxel_size_gpu, grid_range_gpu, BOUNDARY_WEIGHT, voxels_gpu, chunk_steps=BOUNDARY_CHUNK_STEPS),\n", + " sync_device=device,\n", + ")\n", + "qefs_boundary_cpu, t_boundary_cpu = time_call(\n", + " lambda: fdg_api.boundary_qef(boundaries_cpu, voxel_size_cpu, grid_range_cpu, BOUNDARY_WEIGHT, voxels_cpu, chunk_steps=BOUNDARY_CHUNK_STEPS),\n", + ")\n", + "print('boundary_qef gpu/cpu shapes:', qefs_boundary_gpu.shape, qefs_boundary_cpu.shape)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "854cc87d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": 
"stream", + "text": [ + "full gpu voxel count: 1118854\n", + "full cpu voxel count: 1118854\n" + ] + } + ], + "source": [ + "full_gpu, t_full_gpu = time_call(\n", + " lambda: fdg_api.mesh_to_flexible_dual_grid(\n", + " vertices_gpu,\n", + " faces_gpu,\n", + " grid_size=GRID,\n", + " aabb=AABB,\n", + " face_weight=FACE_WEIGHT,\n", + " boundary_weight=BOUNDARY_WEIGHT,\n", + " regularization_weight=REGULARIZATION_WEIGHT,\n", + " intersect_chunk_triangles=INTERSECT_CHUNK_TRIANGLES,\n", + " boundary_chunk_steps=BOUNDARY_CHUNK_STEPS,\n", + " ),\n", + " sync_device=device,\n", + ")\n", + "full_cpu, t_full_cpu = time_call(\n", + " lambda: fdg_api.mesh_to_flexible_dual_grid(\n", + " vertices_cpu,\n", + " faces_cpu,\n", + " grid_size=GRID,\n", + " aabb=AABB,\n", + " face_weight=FACE_WEIGHT,\n", + " boundary_weight=BOUNDARY_WEIGHT,\n", + " regularization_weight=REGULARIZATION_WEIGHT,\n", + " timing=False,\n", + " ),\n", + ")\n", + "print('full gpu voxel count:', full_gpu[0].shape[0])\n", + "print('full cpu voxel count:', full_cpu[0].shape[0])\n" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "04db1677", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
devicestagemean_msstd_msmin_msmax_ms
0gpuintersect_qef1281.98030516.9903251257.2011761312.670635
1cpuintersect_qef10661.798837398.5747799799.40937011291.610871
2gpuface_qef315.7292651.217875314.277277317.836051
3cpuface_qef8524.037188153.0392988334.8954618864.845227
4gpuboundary_qef58.0988251.00747457.12511560.802159
5cpuboundary_qef851.0242933.636328843.272538855.898882
6gpufull_fdg1551.46451517.7605161533.1264351582.916710
7cpufull_fdg21210.323991452.19601220212.20365421949.401679
\n", + "
" + ], + "text/plain": [ + " device stage mean_ms std_ms min_ms max_ms\n", + "0 gpu intersect_qef 1281.980305 16.990325 1257.201176 1312.670635\n", + "1 cpu intersect_qef 10661.798837 398.574779 9799.409370 11291.610871\n", + "2 gpu face_qef 315.729265 1.217875 314.277277 317.836051\n", + "3 cpu face_qef 8524.037188 153.039298 8334.895461 8864.845227\n", + "4 gpu boundary_qef 58.098825 1.007474 57.125115 60.802159\n", + "5 cpu boundary_qef 851.024293 3.636328 843.272538 855.898882\n", + "6 gpu full_fdg 1551.464515 17.760516 1533.126435 1582.916710\n", + "7 cpu full_fdg 21210.323991 452.196012 20212.203654 21949.401679" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "rows = [\n", + " summarize_times('gpu', 'intersect_qef', t_intersect_gpu),\n", + " summarize_times('cpu', 'intersect_qef', t_intersect_cpu),\n", + " summarize_times('gpu', 'face_qef', t_face_gpu),\n", + " summarize_times('cpu', 'face_qef', t_face_cpu),\n", + " summarize_times('gpu', 'boundary_qef', t_boundary_gpu),\n", + " summarize_times('cpu', 'boundary_qef', t_boundary_cpu),\n", + " summarize_times('gpu', 'full_fdg', t_full_gpu),\n", + " summarize_times('cpu', 'full_fdg', t_full_cpu),\n", + "]\n", + "\n", + "stage_df = pd.DataFrame(rows)\n", + "stage_df\n" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "e5ae597f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
devicethree_stage_sum_mean_msfull_mean_msresidual_mean_ms
0gpu1655.8083951551.464515-104.343879
1cpu20036.86031721210.3239911173.463674
\n", + "
" + ], + "text/plain": [ + " device three_stage_sum_mean_ms full_mean_ms residual_mean_ms\n", + "0 gpu 1655.808395 1551.464515 -104.343879\n", + "1 cpu 20036.860317 21210.323991 1173.463674" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "summary = pd.DataFrame([\n", + " {\n", + " 'device': 'gpu',\n", + " 'three_stage_sum_mean_ms': float(t_intersect_gpu.mean() + t_face_gpu.mean() + t_boundary_gpu.mean()),\n", + " 'full_mean_ms': float(t_full_gpu.mean()),\n", + " 'residual_mean_ms': float(t_full_gpu.mean() - (t_intersect_gpu.mean() + t_face_gpu.mean() + t_boundary_gpu.mean())),\n", + " },\n", + " {\n", + " 'device': 'cpu',\n", + " 'three_stage_sum_mean_ms': float(t_intersect_cpu.mean() + t_face_cpu.mean() + t_boundary_cpu.mean()),\n", + " 'full_mean_ms': float(t_full_cpu.mean()),\n", + " 'residual_mean_ms': float(t_full_cpu.mean() - (t_intersect_cpu.mean() + t_face_cpu.mean() + t_boundary_cpu.mean())),\n", + " },\n", + "])\n", + "summary\n" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "df7a68d0", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
stagecpu_over_gpu
0intersect_qef8.316664
1face_qef26.997932
2boundary_qef14.647874
3full_fdg13.671163
\n", + "
" + ], + "text/plain": [ + " stage cpu_over_gpu\n", + "0 intersect_qef 8.316664\n", + "1 face_qef 26.997932\n", + "2 boundary_qef 14.647874\n", + "3 full_fdg 13.671163" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "speedup_df = pd.DataFrame([\n", + " {\n", + " 'stage': 'intersect_qef',\n", + " 'cpu_over_gpu': float(t_intersect_cpu.mean() / t_intersect_gpu.mean()),\n", + " },\n", + " {\n", + " 'stage': 'face_qef',\n", + " 'cpu_over_gpu': float(t_face_cpu.mean() / t_face_gpu.mean()),\n", + " },\n", + " {\n", + " 'stage': 'boundary_qef',\n", + " 'cpu_over_gpu': float(t_boundary_cpu.mean() / t_boundary_gpu.mean()),\n", + " },\n", + " {\n", + " 'stage': 'full_fdg',\n", + " 'cpu_over_gpu': float(t_full_cpu.mean() / t_full_gpu.mean()),\n", + " },\n", + "])\n", + "speedup_df\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5c6ca585", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "symm-enforce", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.19" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/o-voxel/notebooks/verify_voxelize_edge_jit_oct_vs_dda.ipynb b/o-voxel/notebooks/verify_voxelize_edge_jit_oct_vs_dda.ipynb new file mode 100644 index 00000000..a70a2400 --- /dev/null +++ b/o-voxel/notebooks/verify_voxelize_edge_jit_oct_vs_dda.ipynb @@ -0,0 +1,502 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Verify `voxelize_edge_gpu` vs `voxel_traverse_edge_dda_gpu`\n", + "\n", + "This notebook compares the two edge voxelization paths exposed by the selected API:\n", + "\n", + "1. 
Choose whether to JIT compile the current local sources or directly import an already-installed `o_voxel` package.\n", + "2. Load the selected Python API.\n", + "3. Generate synthetic Gaussian triangle soup directly in the notebook, including a few degenerate triangles.\n", + "4. Extract unique edges and compare the OCT and DDA voxelization results using pair-level and voxel-level Jaccard scores.\n", + "\n", + "No local mesh file is loaded.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import sys\n", + "import time\n", + "import types\n", + "import importlib\n", + "import importlib.util\n", + "from pathlib import Path\n", + "\n", + "import numpy as np\n", + "import torch\n", + "from torch.utils.cpp_extension import load\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ROOT = /mnt/nvmefs/Projects/Part Generation/TRELLIS.2-o-voxel-gpu-mod/o-voxel\n", + "USE_JIT = False\n", + "torch = 2.6.0+cu124\n", + "cuda available = True\n", + "cuda device = NVIDIA GeForce RTX 4090\n", + "installed package path = /home/quanta/.conda/envs/symm-enforce/lib/python3.10/site-packages/o_voxel/__init__.py\n", + "installed extension path = /home/quanta/.conda/envs/symm-enforce/lib/python3.10/site-packages/o_voxel/_C.cpython-310-x86_64-linux-gnu.so\n", + "installed API path = /home/quanta/.conda/envs/symm-enforce/lib/python3.10/site-packages/o_voxel/convert/flexible_dual_grid.py\n", + "api_mode = installed\n", + "ext_mod = \n", + "fdg_api = \n" + ] + } + ], + "source": [ + "ROOT = Path(r'/mnt/nvmefs/Projects/Part Generation/TRELLIS.2-o-voxel-gpu-mod/o-voxel').resolve()\n", + "USE_JIT = False\n", + "INSTALLED_IMPORT_NAME = 'o_voxel'\n", + "\n", + "print('ROOT =', ROOT)\n", + "print('USE_JIT =', USE_JIT)\n", + "print('torch =', torch.__version__)\n", + "print('cuda available =', 
torch.cuda.is_available())\n", + "if torch.cuda.is_available():\n", + " print('cuda device =', torch.cuda.get_device_name(0))\n", + "\n", + "\n", + "def build_jit_extension():\n", + " sources = [\n", + " 'src/hash/hash.cu',\n", + " 'src/convert/flexible_dual_grid.cpp',\n", + " 'src/convert/volumetic_attr.cpp',\n", + " 'src/convert/mesh_to_flexible_dual_grid_gpu/torch_bindings.cu',\n", + " 'src/convert/mesh_to_flexible_dual_grid_gpu/flexible_dual_grid_gpu.cu',\n", + " 'src/convert/mesh_to_flexible_dual_grid_gpu/intersection_qef.cu',\n", + " 'src/convert/mesh_to_flexible_dual_grid_gpu/voxelize_mesh_oct.cu',\n", + " 'src/convert/mesh_to_flexible_dual_grid_gpu/voxel_traverse_edge_dda.cu',\n", + " 'src/serialize/api.cu',\n", + " 'src/serialize/hilbert.cu',\n", + " 'src/serialize/z_order.cu',\n", + " 'src/io/svo.cpp',\n", + " 'src/io/filter_parent.cpp',\n", + " 'src/io/filter_neighbor.cpp',\n", + " 'src/rasterize/rasterize.cu',\n", + " 'src/ext.cpp',\n", + "]\n", + " full_sources = [str(ROOT / s) for s in sources]\n", + " missing = [s for s in full_sources if not Path(s).exists()]\n", + " if missing:\n", + " raise FileNotFoundError(f'Missing sources: {missing}')\n", + "\n", + " build_dir = ROOT / '.verify_build'\n", + " build_dir.mkdir(parents=True, exist_ok=True)\n", + "\n", + " unique_suffix = f\"{os.getpid()}_{time.time_ns()}_{os.urandom(4).hex()}\"\n", + " mod_name = f\"o_voxel_verify_{unique_suffix}\"\n", + "\n", + " max_jobs = max(1, os.cpu_count() or 1)\n", + " os.environ['MAX_JOBS'] = str(max_jobs)\n", + " print('MAX_JOBS =', os.environ['MAX_JOBS'])\n", + " print('JIT module name =', mod_name)\n", + "\n", + " ext_mod = load(\n", + " name=mod_name,\n", + " sources=full_sources,\n", + " extra_include_paths=[str(ROOT / 'third_party/eigen')],\n", + " extra_cflags=['-O3', '-std=c++17'],\n", + " extra_cuda_cflags=['-O3', '-std=c++17', '--expt-relaxed-constexpr'],\n", + " with_cuda=True,\n", + " build_directory=str(build_dir),\n", + " verbose=True,\n", + " )\n", + " 
print('JIT build/link: OK')\n", + " print('jit module path =', ext_mod.__file__)\n", + " return ext_mod\n", + "\n", + "\n", + "def load_local_flexible_dual_grid(ext_mod):\n", + " pkg = types.ModuleType('o_voxel')\n", + " pkg.__path__ = [str(ROOT / 'o_voxel')]\n", + " pkg._C = ext_mod\n", + " sys.modules['o_voxel'] = pkg\n", + " sys.modules['o_voxel._C'] = ext_mod\n", + "\n", + " convert_pkg = types.ModuleType('o_voxel.convert')\n", + " convert_pkg.__path__ = [str(ROOT / 'o_voxel' / 'convert')]\n", + " sys.modules['o_voxel.convert'] = convert_pkg\n", + "\n", + " spec = importlib.util.spec_from_file_location(\n", + " 'o_voxel.convert.flexible_dual_grid',\n", + " ROOT / 'o_voxel' / 'convert' / 'flexible_dual_grid.py',\n", + " )\n", + " mod = importlib.util.module_from_spec(spec)\n", + " sys.modules['o_voxel.convert.flexible_dual_grid'] = mod\n", + " spec.loader.exec_module(mod)\n", + " return mod\n", + "\n", + "\n", + "if USE_JIT:\n", + " ext_mod = build_jit_extension()\n", + " fdg_api = load_local_flexible_dual_grid(ext_mod)\n", + " api_mode = 'jit'\n", + "else:\n", + " installed_pkg = importlib.import_module(INSTALLED_IMPORT_NAME)\n", + " ext_mod = installed_pkg._C\n", + " fdg_api = importlib.import_module(f'{INSTALLED_IMPORT_NAME}.convert.flexible_dual_grid')\n", + " api_mode = 'installed'\n", + " print('installed package path =', getattr(installed_pkg, '__file__', ''))\n", + " print('installed extension path =', getattr(ext_mod, '__file__', ''))\n", + " print('installed API path =', getattr(fdg_api, '__file__', ''))\n", + "\n", + "print('api_mode =', api_mode)\n", + "print('ext_mod =', ext_mod)\n", + "print('fdg_api =', fdg_api)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "has mesh_to_flexible_dual_grid_cpu = True\n", + "has mesh_to_flexible_dual_grid_gpu = True\n", + "has intersection_occ = True\n", + "has intersect_qef = True\n", + "has 
voxelize_mesh_gpu = True\n", + "has voxelize_edge_gpu = True\n", + "has face_qef = True\n", + "has voxel_traverse_edge_dda_gpu = True\n", + "has boundary_qef = True\n" + ] + } + ], + "source": [ + "print('has mesh_to_flexible_dual_grid_cpu =', hasattr(ext_mod, 'mesh_to_flexible_dual_grid_cpu'))\n", + "print('has mesh_to_flexible_dual_grid_gpu =', hasattr(ext_mod, 'mesh_to_flexible_dual_grid_gpu'))\n", + "print('has intersection_occ =', hasattr(fdg_api, 'intersection_occ'))\n", + "print('has intersect_qef =', hasattr(fdg_api, 'intersect_qef'))\n", + "print('has voxelize_mesh_gpu =', hasattr(fdg_api, 'voxelize_mesh_gpu'))\n", + "print('has voxelize_edge_gpu =', hasattr(fdg_api, 'voxelize_edge_gpu'))\n", + "print('has face_qef =', hasattr(fdg_api, 'face_qef'))\n", + "print('has voxel_traverse_edge_dda_gpu =', hasattr(fdg_api, 'voxel_traverse_edge_dda_gpu'))\n", + "print('has boundary_qef =', hasattr(fdg_api, 'boundary_qef'))\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "vertices: torch.Size([8000, 3])\n", + "faces: torch.Size([24000, 3])\n", + "edges: torch.Size([60000, 2])\n", + "voxel_size: tensor([0.0078, 0.0078, 0.0078], device='cuda:0')\n", + "grid_range: tensor([[ 0, 0, 0],\n", + " [128, 128, 128]], device='cuda:0', dtype=torch.int32)\n" + ] + } + ], + "source": [ + "assert torch.cuda.is_available(), 'CUDA is required for this notebook.'\n", + "device = torch.device('cuda:0')\n", + "torch.cuda.set_device(device)\n", + "\n", + "GRID = 128\n", + "N_VERT = 8000\n", + "N_FACE = 24000\n", + "EDGE_LIMIT = 60000\n", + "CHUNK_STEPS = 256\n", + "AABB = torch.tensor([[0.0, 0.0, 0.0], [1.0, 1.0, 1.0]], dtype=torch.float32, device=device)\n", + "\n", + "torch.manual_seed(11)\n", + "vertices = (0.5 + 0.18 * torch.randn(N_VERT, 3, device=device, dtype=torch.float32)).clamp_(0.0, 1.0)\n", + "vertices[1] = vertices[0]\n", + "vertices[3] = vertices[2]\n", + "\n", + 
"faces = torch.randint(0, N_VERT, (N_FACE, 3), device=device, dtype=torch.int64)\n", + "faces[:3] = torch.tensor([[0, 0, 1], [2, 3, 3], [4, 4, 4]], device=device, dtype=torch.int64)\n", + "faces = faces.to(torch.int32).contiguous()\n", + "\n", + "e01 = faces[:, [0, 1]]\n", + "e12 = faces[:, [1, 2]]\n", + "e20 = faces[:, [2, 0]]\n", + "edges_all = torch.cat([e01, e12, e20], dim=0)\n", + "edges_all = torch.sort(edges_all, dim=1).values\n", + "edges_unique = torch.unique(edges_all, dim=0)\n", + "num_edges = min(EDGE_LIMIT, int(edges_unique.shape[0]))\n", + "perm = torch.randperm(edges_unique.shape[0], device=device)\n", + "edges = edges_unique[perm[:num_edges]].contiguous().to(torch.int32)\n", + "\n", + "voxel_size = ((AABB[1] - AABB[0]) / GRID).to(torch.float32).contiguous()\n", + "grid_range = torch.tensor([[0, 0, 0], [GRID, GRID, GRID]], device=device, dtype=torch.int32)\n", + "\n", + "print('vertices:', vertices.shape)\n", + "print('faces:', faces.shape)\n", + "print('edges:', edges.shape)\n", + "print('voxel_size:', voxel_size)\n", + "print('grid_range:', grid_range)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "oct pairs: 4724738\n", + "dda pairs: 4724664\n" + ] + } + ], + "source": [ + "edge_id_oct, voxel_ijk_oct = fdg_api.voxelize_edge_gpu(\n", + " vertices,\n", + " edges,\n", + " voxel_size=voxel_size,\n", + " grid_range=grid_range,\n", + ")\n", + "edge_id_dda, voxel_ijk_dda = fdg_api.voxel_traverse_edge_dda_gpu(\n", + " vertices,\n", + " edges,\n", + " voxel_size=voxel_size,\n", + " grid_range=grid_range,\n", + " chunk_steps=CHUNK_STEPS,\n", + ")\n", + "\n", + "print('oct pairs:', edge_id_oct.shape[0])\n", + "print('dda pairs:', edge_id_dda.shape[0])\n" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "pair oct: 4724738\n", + "pair dda: 
4724664\n", + "pair intersection: 4724664\n", + "pair only oct: 74\n", + "pair only dda: 0\n", + "pair union: 4724738\n", + "pair jaccard: 0.9999843377558714\n" + ] + } + ], + "source": [ + "gx, gy, gz = [int(x) for x in (grid_range[1] - grid_range[0]).tolist()]\n", + "\n", + "pair_keys_oct = (\n", + " edge_id_oct.to(torch.int64)\n", + " + edges.shape[0]\n", + " * (\n", + " voxel_ijk_oct[:, 0].to(torch.int64)\n", + " + gx * (voxel_ijk_oct[:, 1].to(torch.int64) + gy * voxel_ijk_oct[:, 2].to(torch.int64))\n", + " )\n", + ")\n", + "pair_keys_dda = (\n", + " edge_id_dda.to(torch.int64)\n", + " + edges.shape[0]\n", + " * (\n", + " voxel_ijk_dda[:, 0].to(torch.int64)\n", + " + gx * (voxel_ijk_dda[:, 1].to(torch.int64) + gy * voxel_ijk_dda[:, 2].to(torch.int64))\n", + " )\n", + ")\n", + "\n", + "pair_unique_oct = torch.unique(pair_keys_oct)\n", + "pair_unique_dda = torch.unique(pair_keys_dda)\n", + "num_pair_oct = int(pair_unique_oct.shape[0])\n", + "num_pair_dda = int(pair_unique_dda.shape[0])\n", + "num_pair_oct_only = int((~torch.isin(pair_unique_oct, pair_unique_dda)).sum())\n", + "num_pair_dda_only = int((~torch.isin(pair_unique_dda, pair_unique_oct)).sum())\n", + "num_pair_intersection = num_pair_oct - num_pair_oct_only\n", + "num_pair_union = num_pair_intersection + num_pair_oct_only + num_pair_dda_only\n", + "pair_jaccard = float(num_pair_intersection / num_pair_union) if num_pair_union > 0 else 1.0\n", + "\n", + "print('pair oct:', num_pair_oct)\n", + "print('pair dda:', num_pair_dda)\n", + "print('pair intersection:', num_pair_intersection)\n", + "print('pair only oct:', num_pair_oct_only)\n", + "print('pair only dda:', num_pair_dda_only)\n", + "print('pair union:', num_pair_union)\n", + "print('pair jaccard:', pair_jaccard)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "voxel oct: 731279\n", + "voxel dda: 731216\n", + "voxel intersection: 
731216\n", + "voxel only oct: 63\n", + "voxel only dda: 0\n", + "voxel union: 731279\n", + "voxel jaccard: 0.9999138495704102\n" + ] + } + ], + "source": [ + "vox_unique_oct = torch.unique(voxel_ijk_oct.to(torch.int32), dim=0)\n", + "vox_unique_dda = torch.unique(voxel_ijk_dda.to(torch.int32), dim=0)\n", + "\n", + "vox_keys_oct = (\n", + " vox_unique_oct[:, 0].to(torch.int64)\n", + " + gx * (vox_unique_oct[:, 1].to(torch.int64) + gy * vox_unique_oct[:, 2].to(torch.int64))\n", + ")\n", + "vox_keys_dda = (\n", + " vox_unique_dda[:, 0].to(torch.int64)\n", + " + gx * (vox_unique_dda[:, 1].to(torch.int64) + gy * vox_unique_dda[:, 2].to(torch.int64))\n", + ")\n", + "\n", + "vox_oct_only_mask = ~torch.isin(vox_keys_oct, vox_keys_dda)\n", + "vox_dda_only_mask = ~torch.isin(vox_keys_dda, vox_keys_oct)\n", + "vox_oct_only = vox_unique_oct[vox_oct_only_mask]\n", + "vox_dda_only = vox_unique_dda[vox_dda_only_mask]\n", + "\n", + "num_vox_oct = int(vox_unique_oct.shape[0])\n", + "num_vox_dda = int(vox_unique_dda.shape[0])\n", + "num_vox_oct_only = int(vox_oct_only.shape[0])\n", + "num_vox_dda_only = int(vox_dda_only.shape[0])\n", + "num_vox_intersection = num_vox_oct - num_vox_oct_only\n", + "num_vox_union = num_vox_intersection + num_vox_oct_only + num_vox_dda_only\n", + "vox_jaccard = float(num_vox_intersection / num_vox_union) if num_vox_union > 0 else 1.0\n", + "\n", + "print('voxel oct:', num_vox_oct)\n", + "print('voxel dda:', num_vox_dda)\n", + "print('voxel intersection:', num_vox_intersection)\n", + "print('voxel only oct:', num_vox_oct_only)\n", + "print('voxel only dda:', num_vox_dda_only)\n", + "print('voxel union:', num_vox_union)\n", + "print('voxel jaccard:', vox_jaccard)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "sample voxel only-oct:\n", + "tensor([[ 22, 127, 38],\n", + " [ 22, 127, 39],\n", + " [ 23, 127, 39],\n", + " [ 24, 127, 39],\n", + " [ 
24, 127, 40],\n", + " [ 25, 127, 40],\n", + " [ 26, 127, 40],\n", + " [ 26, 127, 41],\n", + " [ 27, 127, 41],\n", + " [ 28, 127, 41],\n", + " [ 28, 127, 42],\n", + " [ 29, 127, 42],\n", + " [ 30, 127, 42],\n", + " [ 30, 127, 43],\n", + " [ 31, 127, 43],\n", + " [ 32, 127, 43],\n", + " [ 32, 127, 44],\n", + " [ 33, 127, 44],\n", + " [ 33, 127, 45],\n", + " [ 34, 127, 45],\n", + " [ 35, 127, 45],\n", + " [ 35, 127, 46],\n", + " [ 36, 127, 46],\n", + " [ 37, 127, 46],\n", + " [ 37, 127, 47],\n", + " [ 38, 127, 47],\n", + " [ 39, 127, 47],\n", + " [ 39, 127, 48],\n", + " [ 40, 127, 48],\n", + " [ 41, 127, 48],\n", + " [ 41, 127, 49],\n", + " [ 42, 127, 49],\n", + " [ 43, 127, 49],\n", + " [ 43, 127, 50],\n", + " [ 44, 127, 50],\n", + " [ 44, 127, 51],\n", + " [ 45, 127, 51],\n", + " [ 46, 127, 51],\n", + " [ 46, 127, 52],\n", + " [ 47, 127, 52],\n", + " [ 48, 127, 52],\n", + " [ 48, 127, 53],\n", + " [ 49, 127, 53],\n", + " [ 49, 127, 67],\n", + " [ 50, 127, 53],\n", + " [ 50, 127, 54],\n", + " [ 50, 127, 67],\n", + " [ 50, 127, 68],\n", + " [ 51, 127, 54],\n", + " [ 52, 127, 54]], dtype=torch.int32)\n", + "sample voxel only-dda:\n", + "tensor([], size=(0, 3), dtype=torch.int32)\n" + ] + } + ], + "source": [ + "print('sample voxel only-oct:')\n", + "print(vox_oct_only[:50].detach().cpu())\n", + "print('sample voxel only-dda:')\n", + "print(vox_dda_only[:50].detach().cpu())\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2d88204b", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "symm-enforce", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.19" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git 
a/o-voxel/notebooks/verify_voxelize_mesh_gpu_jit_open3d.ipynb b/o-voxel/notebooks/verify_voxelize_mesh_gpu_jit_open3d.ipynb new file mode 100644 index 00000000..0c2e162d --- /dev/null +++ b/o-voxel/notebooks/verify_voxelize_mesh_gpu_jit_open3d.ipynb @@ -0,0 +1,470 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Verify `voxelize_mesh_gpu` vs Open3D\n", + "\n", + "This notebook validates `o_voxel.convert.voxelize_mesh_gpu` against Open3D voxelization:\n", + "\n", + "1. Choose whether to JIT compile the current local sources or directly import an already-installed `o_voxel` package.\n", + "2. Load the selected Python API.\n", + "3. Generate synthetic Gaussian triangle soup directly in the notebook, including a few degenerate triangles.\n", + "4. Compare occupied voxels against Open3D and report the Jaccard score.\n", + "\n", + "No local mesh file is loaded.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Jupyter environment detected. 
Enabling Open3D WebVisualizer.\n", + "[Open3D INFO] WebRTC GUI backend enabled.\n", + "[Open3D INFO] WebRTCWindowSystem: HTTP handshake server disabled.\n" + ] + } + ], + "source": [ + "import os\n", + "import sys\n", + "import time\n", + "import types\n", + "import importlib\n", + "import importlib.util\n", + "from pathlib import Path\n", + "\n", + "import numpy as np\n", + "import torch\n", + "from torch.utils.cpp_extension import load\n", + "\n", + "import open3d as o3d\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ROOT = /mnt/nvmefs/Projects/Part Generation/TRELLIS.2-o-voxel-gpu-mod/o-voxel\n", + "USE_JIT = False\n", + "torch = 2.6.0+cu124\n", + "cuda available = True\n", + "cuda device = NVIDIA GeForce RTX 4090\n", + "installed package path = /home/quanta/.conda/envs/symm-enforce/lib/python3.10/site-packages/o_voxel/__init__.py\n", + "installed extension path = /home/quanta/.conda/envs/symm-enforce/lib/python3.10/site-packages/o_voxel/_C.cpython-310-x86_64-linux-gnu.so\n", + "installed API path = /home/quanta/.conda/envs/symm-enforce/lib/python3.10/site-packages/o_voxel/convert/flexible_dual_grid.py\n", + "api_mode = installed\n", + "ext_mod = \n", + "fdg_api = \n" + ] + } + ], + "source": [ + "ROOT = Path(r'/mnt/nvmefs/Projects/Part Generation/TRELLIS.2-o-voxel-gpu-mod/o-voxel').resolve()\n", + "USE_JIT = False\n", + "INSTALLED_IMPORT_NAME = 'o_voxel'\n", + "\n", + "print('ROOT =', ROOT)\n", + "print('USE_JIT =', USE_JIT)\n", + "print('torch =', torch.__version__)\n", + "print('cuda available =', torch.cuda.is_available())\n", + "if torch.cuda.is_available():\n", + " print('cuda device =', torch.cuda.get_device_name(0))\n", + "\n", + "\n", + "def build_jit_extension():\n", + " sources = [\n", + " 'src/hash/hash.cu',\n", + " 'src/convert/flexible_dual_grid.cpp',\n", + " 'src/convert/volumetic_attr.cpp',\n", + " 
'src/convert/mesh_to_flexible_dual_grid_gpu/torch_bindings.cu',\n", + " 'src/convert/mesh_to_flexible_dual_grid_gpu/flexible_dual_grid_gpu.cu',\n", + " 'src/convert/mesh_to_flexible_dual_grid_gpu/intersection_qef.cu',\n", + " 'src/convert/mesh_to_flexible_dual_grid_gpu/voxelize_mesh_oct.cu',\n", + " 'src/convert/mesh_to_flexible_dual_grid_gpu/voxel_traverse_edge_dda.cu',\n", + " 'src/serialize/api.cu',\n", + " 'src/serialize/hilbert.cu',\n", + " 'src/serialize/z_order.cu',\n", + " 'src/io/svo.cpp',\n", + " 'src/io/filter_parent.cpp',\n", + " 'src/io/filter_neighbor.cpp',\n", + " 'src/rasterize/rasterize.cu',\n", + " 'src/ext.cpp',\n", + "]\n", + " full_sources = [str(ROOT / s) for s in sources]\n", + " missing = [s for s in full_sources if not Path(s).exists()]\n", + " if missing:\n", + " raise FileNotFoundError(f'Missing sources: {missing}')\n", + "\n", + " build_dir = ROOT / '.verify_build'\n", + " build_dir.mkdir(parents=True, exist_ok=True)\n", + "\n", + " unique_suffix = f\"{os.getpid()}_{time.time_ns()}_{os.urandom(4).hex()}\"\n", + " mod_name = f\"o_voxel_verify_{unique_suffix}\"\n", + "\n", + " max_jobs = max(1, os.cpu_count() or 1)\n", + " os.environ['MAX_JOBS'] = str(max_jobs)\n", + " print('MAX_JOBS =', os.environ['MAX_JOBS'])\n", + " print('JIT module name =', mod_name)\n", + "\n", + " ext_mod = load(\n", + " name=mod_name,\n", + " sources=full_sources,\n", + " extra_include_paths=[str(ROOT / 'third_party/eigen')],\n", + " extra_cflags=['-O3', '-std=c++17'],\n", + " extra_cuda_cflags=['-O3', '-std=c++17', '--expt-relaxed-constexpr'],\n", + " with_cuda=True,\n", + " build_directory=str(build_dir),\n", + " verbose=True,\n", + " )\n", + " print('JIT build/link: OK')\n", + " print('jit module path =', ext_mod.__file__)\n", + " return ext_mod\n", + "\n", + "\n", + "def load_local_flexible_dual_grid(ext_mod):\n", + " pkg = types.ModuleType('o_voxel')\n", + " pkg.__path__ = [str(ROOT / 'o_voxel')]\n", + " pkg._C = ext_mod\n", + " sys.modules['o_voxel'] = 
pkg\n", + " sys.modules['o_voxel._C'] = ext_mod\n", + "\n", + " convert_pkg = types.ModuleType('o_voxel.convert')\n", + " convert_pkg.__path__ = [str(ROOT / 'o_voxel' / 'convert')]\n", + " sys.modules['o_voxel.convert'] = convert_pkg\n", + "\n", + " spec = importlib.util.spec_from_file_location(\n", + " 'o_voxel.convert.flexible_dual_grid',\n", + " ROOT / 'o_voxel' / 'convert' / 'flexible_dual_grid.py',\n", + " )\n", + " mod = importlib.util.module_from_spec(spec)\n", + " sys.modules['o_voxel.convert.flexible_dual_grid'] = mod\n", + " spec.loader.exec_module(mod)\n", + " return mod\n", + "\n", + "\n", + "if USE_JIT:\n", + " ext_mod = build_jit_extension()\n", + " fdg_api = load_local_flexible_dual_grid(ext_mod)\n", + " api_mode = 'jit'\n", + "else:\n", + " installed_pkg = importlib.import_module(INSTALLED_IMPORT_NAME)\n", + " ext_mod = installed_pkg._C\n", + " fdg_api = importlib.import_module(f'{INSTALLED_IMPORT_NAME}.convert.flexible_dual_grid')\n", + " api_mode = 'installed'\n", + " print('installed package path =', getattr(installed_pkg, '__file__', ''))\n", + " print('installed extension path =', getattr(ext_mod, '__file__', ''))\n", + " print('installed API path =', getattr(fdg_api, '__file__', ''))\n", + "\n", + "print('api_mode =', api_mode)\n", + "print('ext_mod =', ext_mod)\n", + "print('fdg_api =', fdg_api)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "has mesh_to_flexible_dual_grid_cpu = True\n", + "has mesh_to_flexible_dual_grid_gpu = True\n", + "has intersection_occ = True\n", + "has intersect_qef = True\n", + "has voxelize_mesh_gpu = True\n", + "has voxelize_edge_gpu = True\n", + "has face_qef = True\n", + "has voxel_traverse_edge_dda_gpu = True\n", + "has boundary_qef = True\n" + ] + } + ], + "source": [ + "print('has mesh_to_flexible_dual_grid_cpu =', hasattr(ext_mod, 'mesh_to_flexible_dual_grid_cpu'))\n", + "print('has 
mesh_to_flexible_dual_grid_gpu =', hasattr(ext_mod, 'mesh_to_flexible_dual_grid_gpu'))\n", + "print('has intersection_occ =', hasattr(fdg_api, 'intersection_occ'))\n", + "print('has intersect_qef =', hasattr(fdg_api, 'intersect_qef'))\n", + "print('has voxelize_mesh_gpu =', hasattr(fdg_api, 'voxelize_mesh_gpu'))\n", + "print('has voxelize_edge_gpu =', hasattr(fdg_api, 'voxelize_edge_gpu'))\n", + "print('has face_qef =', hasattr(fdg_api, 'face_qef'))\n", + "print('has voxel_traverse_edge_dda_gpu =', hasattr(fdg_api, 'voxel_traverse_edge_dda_gpu'))\n", + "print('has boundary_qef =', hasattr(fdg_api, 'boundary_qef'))\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "vertices: torch.Size([8000, 3]) torch.float32 cuda:0\n", + "faces : torch.Size([24000, 3]) torch.int32 cuda:0\n", + "voxel_size: tensor([0.0078, 0.0078, 0.0078], device='cuda:0')\n", + "grid_range: tensor([[ 0, 0, 0],\n", + " [128, 128, 128]], device='cuda:0', dtype=torch.int32)\n" + ] + } + ], + "source": [ + "assert torch.cuda.is_available(), 'CUDA is required for this notebook.'\n", + "device = torch.device('cuda:0')\n", + "torch.cuda.set_device(device)\n", + "\n", + "GRID = 128\n", + "N_VERT = 8000\n", + "N_FACE = 24000\n", + "AABB = torch.tensor([[0.0, 0.0, 0.0], [1.0, 1.0, 1.0]], dtype=torch.float32, device=device)\n", + "\n", + "torch.manual_seed(7)\n", + "vertices = (0.5 + 0.18 * torch.randn(N_VERT, 3, device=device, dtype=torch.float32)).clamp_(0.0, 1.0)\n", + "vertices[1] = vertices[0]\n", + "vertices[3] = vertices[2]\n", + "\n", + "faces = torch.randint(0, N_VERT, (N_FACE, 3), device=device, dtype=torch.int64)\n", + "faces[:3] = torch.tensor([[0, 0, 1], [2, 3, 3], [4, 4, 4]], device=device, dtype=torch.int64)\n", + "faces = faces.to(torch.int32).contiguous()\n", + "\n", + "voxel_size = ((AABB[1] - AABB[0]) / GRID).to(torch.float32).contiguous()\n", + "grid_range = torch.tensor([[0, 0, 
0], [GRID, GRID, GRID]], device=device, dtype=torch.int32)\n", + "\n", + "print('vertices:', vertices.shape, vertices.dtype, vertices.device)\n", + "print('faces :', faces.shape, faces.dtype, faces.device)\n", + "print('voxel_size:', voxel_size)\n", + "print('grid_range:', grid_range)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "face_id: torch.Size([36236356]) torch.int32 cuda:0\n", + "voxel_ijk: torch.Size([36236356, 3]) torch.int32 cuda:0\n", + "ours unique voxel count: 1049126\n" + ] + } + ], + "source": [ + "face_id, voxel_ijk = fdg_api.voxelize_mesh_gpu(\n", + " vertices,\n", + " faces,\n", + " voxel_size=voxel_size,\n", + " grid_range=grid_range,\n", + ")\n", + "ours_unique = torch.unique(voxel_ijk.to(torch.int32), dim=0)\n", + "print('face_id:', face_id.shape, face_id.dtype, face_id.device)\n", + "print('voxel_ijk:', voxel_ijk.shape, voxel_ijk.dtype, voxel_ijk.device)\n", + "print('ours unique voxel count:', ours_unique.shape[0])\n" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "open3d unique voxel count: 1049317\n" + ] + } + ], + "source": [ + "mesh = o3d.geometry.TriangleMesh()\n", + "mesh.vertices = o3d.utility.Vector3dVector(vertices.detach().cpu().numpy().astype(np.float64, copy=False))\n", + "mesh.triangles = o3d.utility.Vector3iVector(faces.detach().cpu().numpy().astype(np.int32, copy=False))\n", + "\n", + "voxel_size_scalar = float(voxel_size[0].item())\n", + "voxel_grid = o3d.geometry.VoxelGrid.create_from_triangle_mesh_within_bounds(\n", + " mesh,\n", + " voxel_size=voxel_size_scalar,\n", + " min_bound=AABB[0].detach().cpu().numpy().astype(np.float64, copy=False),\n", + " max_bound=AABB[1].detach().cpu().numpy().astype(np.float64, copy=False),\n", + ")\n", + "\n", + "o3d_voxels = voxel_grid.get_voxels()\n", + "o3d_ijk_np = 
np.asarray([v.grid_index for v in o3d_voxels], dtype=np.int32).reshape(-1, 3)\n", + "o3d_ijk = torch.from_numpy(o3d_ijk_np).to(device=device, dtype=torch.int32)\n", + "o3d_unique = torch.unique(o3d_ijk, dim=0) if o3d_ijk.numel() > 0 else torch.empty((0, 3), device=device, dtype=torch.int32)\n", + "\n", + "print('open3d unique voxel count:', o3d_unique.shape[0])\n" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ours: 1049126\n", + "open3d: 1049317\n", + "intersection: 1049126\n", + "only ours: 0\n", + "only open3d: 183\n", + "union: 1049309\n", + "jaccard: 0.9998255995135846\n" + ] + } + ], + "source": [ + "gx, gy, gz = [int(x) for x in (grid_range[1] - grid_range[0]).tolist()]\n", + "\n", + "ours_keys = (\n", + " ours_unique[:, 0].to(torch.int64)\n", + " + gx * (ours_unique[:, 1].to(torch.int64) + gy * ours_unique[:, 2].to(torch.int64))\n", + ")\n", + "o3d_keys = (\n", + " o3d_unique[:, 0].to(torch.int64)\n", + " + gx * (o3d_unique[:, 1].to(torch.int64) + gy * o3d_unique[:, 2].to(torch.int64))\n", + ")\n", + "\n", + "ours_only_mask = ~torch.isin(ours_keys, o3d_keys)\n", + "o3d_only_mask = ~torch.isin(o3d_keys, ours_keys)\n", + "ours_only = ours_unique[ours_only_mask]\n", + "o3d_only = o3d_unique[o3d_only_mask]\n", + "\n", + "num_ours = int(ours_unique.shape[0])\n", + "num_o3d = int(o3d_unique.shape[0])\n", + "num_ours_only = int(ours_only.shape[0])\n", + "num_o3d_only = int(o3d_only.shape[0])\n", + "num_intersection = num_ours - num_ours_only\n", + "num_union = num_intersection + num_ours_only + num_o3d_only\n", + "jaccard = float(num_intersection / num_union) if num_union > 0 else 1.0\n", + "\n", + "print('ours:', num_ours)\n", + "print('open3d:', num_o3d)\n", + "print('intersection:', num_intersection)\n", + "print('only ours:', num_ours_only)\n", + "print('only open3d:', num_o3d_only)\n", + "print('union:', num_union)\n", + "print('jaccard:', 
jaccard)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "sample only-ours voxels:\n", + "tensor([], size=(0, 3), dtype=torch.int32)\n", + "sample only-open3d voxels:\n", + "tensor([[ 27, 128, 68],\n", + " [ 29, 128, 57],\n", + " [ 39, 128, 44],\n", + " [ 41, 23, 128],\n", + " [ 47, 128, 55],\n", + " [ 48, 128, 40],\n", + " [ 51, 64, 128],\n", + " [ 52, 36, 128],\n", + " [ 52, 128, 80],\n", + " [ 54, 96, 128],\n", + " [ 57, 128, 23],\n", + " [ 58, 42, 128],\n", + " [ 58, 88, 128],\n", + " [ 58, 128, 49],\n", + " [ 63, 128, 85],\n", + " [ 64, 127, 128],\n", + " [ 64, 128, 127],\n", + " [ 64, 128, 128],\n", + " [ 66, 128, 83],\n", + " [ 68, 82, 128],\n", + " [ 71, 128, 49],\n", + " [ 72, 71, 128],\n", + " [ 72, 81, 128],\n", + " [ 73, 85, 128],\n", + " [ 75, 52, 128],\n", + " [ 76, 49, 128],\n", + " [ 81, 68, 128],\n", + " [ 81, 128, 71],\n", + " [ 82, 52, 128],\n", + " [ 83, 128, 50],\n", + " [ 86, 128, 53],\n", + " [ 89, 128, 89],\n", + " [ 91, 128, 69],\n", + " [ 94, 128, 87],\n", + " [ 99, 128, 82],\n", + " [100, 64, 128],\n", + " [123, 57, 128],\n", + " [128, 13, 43],\n", + " [128, 15, 41],\n", + " [128, 23, 37],\n", + " [128, 25, 66],\n", + " [128, 26, 66],\n", + " [128, 27, 66],\n", + " [128, 28, 66],\n", + " [128, 29, 66],\n", + " [128, 30, 66],\n", + " [128, 30, 67],\n", + " [128, 31, 67],\n", + " [128, 32, 67],\n", + " [128, 33, 67]], dtype=torch.int32)\n" + ] + } + ], + "source": [ + "print('sample only-ours voxels:')\n", + "print(ours_only[:50].detach().cpu())\n", + "print('sample only-open3d voxels:')\n", + "print(o3d_only[:50].detach().cpu())\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0e4b9176", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "symm-enforce", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { 
+ "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.19" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/o-voxel/o_voxel/convert/flexible_dual_grid.py b/o-voxel/o_voxel/convert/flexible_dual_grid.py index 7cf1397e..a2cbba83 100644 --- a/o-voxel/o_voxel/convert/flexible_dual_grid.py +++ b/o-voxel/o_voxel/convert/flexible_dual_grid.py @@ -5,6 +5,13 @@ __all__ = [ "mesh_to_flexible_dual_grid", + "intersection_occ", + "intersect_qef", + "voxelize_mesh_gpu", + "voxelize_edge_gpu", + "face_qef", + "voxel_traverse_edge_dda_gpu", + "boundary_qef", "flexible_dual_grid_to_mesh", ] @@ -25,6 +32,251 @@ def _init_hashmap(grid_size, capacity, device): return hashmap_keys, hashmap_vals +def _as_float3_tensor( + value: Union[float, list, tuple, np.ndarray, torch.Tensor], + name: str, + device: torch.device, +): + if isinstance(value, float): + value = [value, value, value] + if isinstance(value, (list, tuple)): + value = np.array(value) + if isinstance(value, np.ndarray): + value = torch.tensor(value, dtype=torch.float32) + assert isinstance(value, torch.Tensor), f"{name} must be a float, list, tuple, np.ndarray, or torch.Tensor, but got {type(value)}" + assert value.dim() == 1, f"{name} must be a 1D tensor, but got {value.shape}" + assert value.size(0) == 3, f"{name} must have 3 elements, but got {value.size(0)}" + return value.to(device=device, dtype=torch.float32).contiguous() + + +def _as_grid_range_tensor( + value: Union[list, tuple, np.ndarray, torch.Tensor], + device: torch.device, +): + if isinstance(value, (list, tuple)): + value = np.array(value) + if isinstance(value, np.ndarray): + value = torch.tensor(value, dtype=torch.int32) + assert isinstance(value, torch.Tensor), f"grid_range must be a list, tuple, np.ndarray, or torch.Tensor, but got {type(value)}" + assert value.dim() == 2, f"grid_range must be a 
2D tensor, but got {value.shape}" + assert value.size(0) == 2, f"grid_range must have 2 rows, but got {value.size(0)}" + assert value.size(1) == 3, f"grid_range must have 3 columns, but got {value.size(1)}" + return value.to(device=device, dtype=torch.int32).contiguous() + + +def _sym10_to_mat4(q: torch.Tensor): + assert q.dim() == 2 and q.size(1) == 10, f"q must have shape [N, 10], got {tuple(q.shape)}" + out = torch.zeros((q.size(0), 4, 4), dtype=torch.float32, device=q.device) + out[:, 0, 0] = q[:, 0] + out[:, 0, 1] = out[:, 1, 0] = q[:, 1] + out[:, 0, 2] = out[:, 2, 0] = q[:, 2] + out[:, 0, 3] = out[:, 3, 0] = q[:, 3] + out[:, 1, 1] = q[:, 4] + out[:, 1, 2] = out[:, 2, 1] = q[:, 5] + out[:, 1, 3] = out[:, 3, 1] = q[:, 6] + out[:, 2, 2] = q[:, 7] + out[:, 2, 3] = out[:, 3, 2] = q[:, 8] + out[:, 3, 3] = q[:, 9] + return out + + +@torch.no_grad() +def intersection_occ( + triangles: torch.Tensor, + voxel_size: Union[float, list, tuple, np.ndarray, torch.Tensor], + grid_range: Union[list, tuple, np.ndarray, torch.Tensor], + chunk_triangles: int = 262144, +): + if triangles.dim() != 3 or triangles.size(1) != 3 or triangles.size(2) != 3: + raise ValueError(f"triangles must have shape [T, 3, 3], got {tuple(triangles.shape)}") + + device = triangles.device + triangles = triangles.to(device=device, dtype=torch.float32).contiguous() + voxel_size = _as_float3_tensor(voxel_size, "voxel_size", device) + grid_range = _as_grid_range_tensor(grid_range, device) + + if triangles.is_cuda: + return _C.intersection_occ_gpu( + triangles, + voxel_size, + grid_range, + int(chunk_triangles), + ) + return _C.intersect_qef_cpu( + triangles, + voxel_size.cpu(), + grid_range.cpu(), + )[0] + + +@torch.no_grad() +def intersect_qef( + triangles: torch.Tensor, + voxel_size: Union[float, list, tuple, np.ndarray, torch.Tensor], + grid_range: Union[list, tuple, np.ndarray, torch.Tensor], + chunk_triangles: int = 262144, +): + if triangles.dim() != 3 or triangles.size(1) != 3 or triangles.size(2) 
!= 3: + raise ValueError(f"triangles must have shape [T, 3, 3], got {tuple(triangles.shape)}") + + device = triangles.device + triangles = triangles.to(device=device, dtype=torch.float32).contiguous() + voxel_size = _as_float3_tensor(voxel_size, "voxel_size", device) + grid_range = _as_grid_range_tensor(grid_range, device) + + if triangles.is_cuda: + voxels, mean_sum, cnt, intersected, qef_sym10 = _C.intersect_qef_gpu( + triangles, + voxel_size, + grid_range, + int(chunk_triangles), + ) + return voxels, mean_sum, cnt, intersected, _sym10_to_mat4(qef_sym10) + return _C.intersect_qef_cpu( + triangles, + voxel_size.cpu(), + grid_range.cpu(), + ) + + +@torch.no_grad() +def voxelize_mesh_gpu( + vertices: torch.Tensor, + faces: torch.Tensor, + voxel_size: Union[float, list, tuple, np.ndarray, torch.Tensor], + grid_range: Union[list, tuple, np.ndarray, torch.Tensor], +): + if vertices.dim() != 2 or vertices.size(1) != 3: + raise ValueError(f"vertices must have shape [V, 3], got {tuple(vertices.shape)}") + if faces.dim() != 2 or faces.size(1) != 3: + raise ValueError(f"faces must have shape [F, 3], got {tuple(faces.shape)}") + if vertices.device != faces.device: + raise ValueError("vertices and faces must be on the same device") + if not vertices.is_cuda: + raise ValueError("vertices and faces must be CUDA tensors") + + device = vertices.device + vertices = vertices.to(device=device, dtype=torch.float32).contiguous() + faces = faces.to(device=device, dtype=torch.int32).contiguous() + voxel_size = _as_float3_tensor(voxel_size, "voxel_size", device) + grid_range = _as_grid_range_tensor(grid_range, device) + return _C.voxelize_mesh_oct_gpu(vertices, faces, voxel_size, grid_range) + + +@torch.no_grad() +def voxelize_edge_gpu( + vertices: torch.Tensor, + edges: torch.Tensor, + voxel_size: Union[float, list, tuple, np.ndarray, torch.Tensor], + grid_range: Union[list, tuple, np.ndarray, torch.Tensor], +): + if vertices.dim() != 2 or vertices.size(1) != 3: + raise 
ValueError(f"vertices must have shape [V, 3], got {tuple(vertices.shape)}") + if edges.dim() != 2 or edges.size(1) != 2: + raise ValueError(f"edges must have shape [E, 2], got {tuple(edges.shape)}") + if vertices.device != edges.device: + raise ValueError("vertices and edges must be on the same device") + if not vertices.is_cuda: + raise ValueError("vertices and edges must be CUDA tensors") + + device = vertices.device + vertices = vertices.to(device=device, dtype=torch.float32).contiguous() + edges = edges.to(device=device, dtype=torch.int32).contiguous() + voxel_size = _as_float3_tensor(voxel_size, "voxel_size", device) + grid_range = _as_grid_range_tensor(grid_range, device) + return _C.voxelize_edge_oct_gpu(vertices, edges, voxel_size, grid_range) + + +@torch.no_grad() +def face_qef( + triangles: torch.Tensor, + voxel_size: Union[float, list, tuple, np.ndarray, torch.Tensor], + grid_range: Union[list, tuple, np.ndarray, torch.Tensor], + voxels: torch.Tensor, +): + if triangles.dim() != 3 or triangles.size(1) != 3 or triangles.size(2) != 3: + raise ValueError(f"triangles must have shape [T, 3, 3], got {tuple(triangles.shape)}") + if voxels.dim() != 2 or voxels.size(1) != 3: + raise ValueError(f"voxels must have shape [N, 3], got {tuple(voxels.shape)}") + if triangles.device != voxels.device: + raise ValueError("triangles and voxels must be on the same device") + + device = triangles.device + triangles = triangles.to(device=device, dtype=torch.float32).contiguous() + voxels = voxels.to(device=device, dtype=torch.int32).contiguous() + voxel_size = _as_float3_tensor(voxel_size, "voxel_size", device) + grid_range = _as_grid_range_tensor(grid_range, device) + + if triangles.is_cuda: + return _sym10_to_mat4(_C.face_qef_gpu(triangles, voxel_size, grid_range, voxels)) + return _C.face_qef_cpu( + triangles, + voxel_size.cpu(), + grid_range.cpu(), + voxels.cpu(), + ) + + +@torch.no_grad() +def voxel_traverse_edge_dda_gpu( + vertices: torch.Tensor, + edges: torch.Tensor, + 
voxel_size: Union[float, list, tuple, np.ndarray, torch.Tensor], + grid_range: Union[list, tuple, np.ndarray, torch.Tensor], + chunk_steps: int = 1024, +): + if vertices.dim() != 2 or vertices.size(1) != 3: + raise ValueError(f"vertices must have shape [V, 3], got {tuple(vertices.shape)}") + if edges.dim() != 2 or edges.size(1) != 2: + raise ValueError(f"edges must have shape [E, 2], got {tuple(edges.shape)}") + if vertices.device != edges.device: + raise ValueError("vertices and edges must be on the same device") + if not vertices.is_cuda: + raise ValueError("vertices and edges must be CUDA tensors") + + device = vertices.device + vertices = vertices.to(device=device, dtype=torch.float32).contiguous() + edges = edges.to(device=device, dtype=torch.int32).contiguous() + voxel_size = _as_float3_tensor(voxel_size, "voxel_size", device) + grid_range = _as_grid_range_tensor(grid_range, device) + return _C.voxel_traverse_edge_dda_gpu(vertices, edges, voxel_size, grid_range, int(chunk_steps)) + + +@torch.no_grad() +def boundary_qef( + boundaries: torch.Tensor, + voxel_size: Union[float, list, tuple, np.ndarray, torch.Tensor], + grid_range: Union[list, tuple, np.ndarray, torch.Tensor], + boundary_weight: float, + voxels: torch.Tensor, + chunk_steps: int = 1024, +): + if boundaries.dim() != 3 or boundaries.size(1) != 2 or boundaries.size(2) != 3: + raise ValueError(f"boundaries must have shape [B, 2, 3], got {tuple(boundaries.shape)}") + if voxels.dim() != 2 or voxels.size(1) != 3: + raise ValueError(f"voxels must have shape [N, 3], got {tuple(voxels.shape)}") + if boundaries.device != voxels.device: + raise ValueError("boundaries and voxels must be on the same device") + + device = boundaries.device + boundaries = boundaries.to(device=device, dtype=torch.float32).contiguous() + voxels = voxels.to(device=device, dtype=torch.int32).contiguous() + voxel_size = _as_float3_tensor(voxel_size, "voxel_size", device) + grid_range = _as_grid_range_tensor(grid_range, device) + + if 
boundaries.is_cuda: + return _sym10_to_mat4( + _C.boundary_qef_gpu(boundaries, voxel_size, grid_range, float(boundary_weight), voxels, int(chunk_steps)) + ) + return _C.boundary_qef_cpu( + boundaries, + voxel_size.cpu(), + grid_range.cpu(), + float(boundary_weight), + voxels.cpu(), + ) + + @torch.no_grad() def mesh_to_flexible_dual_grid( vertices: torch.Tensor, @@ -35,6 +287,8 @@ def mesh_to_flexible_dual_grid( face_weight: float = 1.0, boundary_weight: float = 1.0, regularization_weight: float = 0.1, + intersect_chunk_triangles: int = 262144, + boundary_chunk_steps: int = 1024, timing: bool = False, ) -> Union[torch.Tensor, torch.Tensor, torch.Tensor]: """ @@ -51,7 +305,9 @@ def mesh_to_flexible_dual_grid( face_weight (float): The weight of the face term in the QEF when solving the dual vertices. boundary_weight (float): The weight of the boundary term in the QEF when solving the dual vertices. regularization_weight (float): The weight of the regularization term in the QEF when solving the dual vertices. - timing (bool): Whether to time the voxelization process. + intersect_chunk_triangles (int): Triangle chunk size used by CUDA path. + boundary_chunk_steps (int): Edge-step chunk size used by CUDA path. + timing (bool): Whether to time the voxelization process (CPU path). Returns: torch.Tensor: The indices of the voxels that are occupied by the mesh. @@ -60,9 +316,19 @@ def mesh_to_flexible_dual_grid( torch.Tensor: The intersected flag of each voxel. 
""" + if vertices.dim() != 2 or vertices.size(1) != 3: + raise ValueError(f"vertices must have shape [V, 3], got {tuple(vertices.shape)}") + if faces.dim() != 2 or faces.size(1) != 3: + raise ValueError(f"faces must have shape [F, 3], got {tuple(faces.shape)}") + + use_cuda = vertices.is_cuda or faces.is_cuda + if vertices.device != faces.device: + raise ValueError("vertices and faces must be on the same device") + device = vertices.device if use_cuda else torch.device("cpu") + # Load mesh - vertices = vertices.float() - faces = faces.int() + vertices = vertices.to(device=device, dtype=torch.float32).contiguous() + faces = faces.to(device=device, dtype=torch.int32).contiguous() # Voxelize settings assert voxel_size is not None or grid_size is not None, "Either voxel_size or grid_size must be provided" @@ -77,6 +343,7 @@ def mesh_to_flexible_dual_grid( assert isinstance(voxel_size, torch.Tensor), f"voxel_size must be a float, list, tuple, np.ndarray, or torch.Tensor, but got {type(voxel_size)}" assert voxel_size.dim() == 1, f"voxel_size must be a 1D tensor, but got {voxel_size.shape}" assert voxel_size.size(0) == 3, f"voxel_size must have 3 elements, but got {voxel_size.size(0)}" + voxel_size = voxel_size.to(device=device, dtype=torch.float32).contiguous() if grid_size is not None: if isinstance(grid_size, int): @@ -88,6 +355,7 @@ def mesh_to_flexible_dual_grid( assert isinstance(grid_size, torch.Tensor), f"grid_size must be an int, list, tuple, np.ndarray, or torch.Tensor, but got {type(grid_size)}" assert grid_size.dim() == 1, f"grid_size must be a 1D tensor, but got {grid_size.shape}" assert grid_size.size(0) == 3, f"grid_size must have 3 elements, but got {grid_size.size(0)}" + grid_size = grid_size.to(device=device, dtype=torch.int32).contiguous() if aabb is not None: if isinstance(aabb, (list, tuple)): @@ -98,6 +366,7 @@ def mesh_to_flexible_dual_grid( assert aabb.dim() == 2, f"aabb must be a 2D tensor, but got {aabb.shape}" assert aabb.size(0) == 2, f"aabb 
must have 2 rows, but got {aabb.size(0)}" assert aabb.size(1) == 3, f"aabb must have 3 columns, but got {aabb.size(1)}" + aabb = aabb.to(device=device, dtype=torch.float32).contiguous() # Auto adjust aabb if aabb is None: @@ -113,28 +382,45 @@ def mesh_to_flexible_dual_grid( min_xyz -= padding * 0.5 max_xyz += padding * 0.5 - aabb = torch.stack([min_xyz, max_xyz], dim=0).float().cuda() + aabb = torch.stack([min_xyz, max_xyz], dim=0).to(device=device, dtype=torch.float32).contiguous() # Fill voxel size or grid size if voxel_size is None: - voxel_size = (aabb[1] - aabb[0]) / grid_size + voxel_size = ((aabb[1] - aabb[0]) / grid_size.to(dtype=torch.float32)).to(dtype=torch.float32) if grid_size is None: grid_size = ((aabb[1] - aabb[0]) / voxel_size).round().int() + voxel_size = voxel_size.to(device=device, dtype=torch.float32).contiguous() + grid_size = grid_size.to(device=device, dtype=torch.int32).contiguous() # subdivide mesh - vertices = vertices - aabb[0].reshape(1, 3) - grid_range = torch.stack([torch.zeros_like(grid_size), grid_size], dim=0).int() - - ret = _C.mesh_to_flexible_dual_grid_cpu( - vertices, - faces, - voxel_size, - grid_range, - face_weight, - boundary_weight, - regularization_weight, - timing, - ) + vertices = (vertices - aabb[0].reshape(1, 3)).contiguous() + grid_range = torch.stack([torch.zeros_like(grid_size), grid_size], dim=0).to(dtype=torch.int32).contiguous() + + if use_cuda: + if not hasattr(_C, "mesh_to_flexible_dual_grid_gpu"): + raise RuntimeError("o_voxel._C.mesh_to_flexible_dual_grid_gpu is not available in the current build") + ret = _C.mesh_to_flexible_dual_grid_gpu( + vertices, + faces, + voxel_size, + grid_range, + face_weight, + boundary_weight, + regularization_weight, + int(intersect_chunk_triangles), + int(boundary_chunk_steps), + ) + else: + ret = _C.mesh_to_flexible_dual_grid_cpu( + vertices, + faces, + voxel_size, + grid_range, + face_weight, + boundary_weight, + regularization_weight, + timing, + ) return ret diff --git 
a/o-voxel/setup.py b/o-voxel/setup.py index 91cb5cec..e02a070d 100644 --- a/o-voxel/setup.py +++ b/o-voxel/setup.py @@ -38,6 +38,11 @@ # Convert functions "src/convert/flexible_dual_grid.cpp", "src/convert/volumetic_attr.cpp", + "src/convert/mesh_to_flexible_dual_grid_gpu/torch_bindings.cu", + "src/convert/mesh_to_flexible_dual_grid_gpu/flexible_dual_grid_gpu.cu", + "src/convert/mesh_to_flexible_dual_grid_gpu/intersection_qef.cu", + "src/convert/mesh_to_flexible_dual_grid_gpu/voxelize_mesh_oct.cu", + "src/convert/mesh_to_flexible_dual_grid_gpu/voxel_traverse_edge_dda.cu", ## Serialization functions "src/serialize/api.cu", "src/serialize/hilbert.cu", diff --git a/o-voxel/src/convert/api.h b/o-voxel/src/convert/api.h index b70551c8..f62c2000 100644 --- a/o-voxel/src/convert/api.h +++ b/o-voxel/src/convert/api.h @@ -39,6 +39,135 @@ std::tuple mesh_to_flexible_dual_gr ); +/** + * Extract flexible dual grid from a triangle mesh with CUDA backend. + */ +std::tuple mesh_to_flexible_dual_grid_gpu( + const torch::Tensor& vertices, + const torch::Tensor& faces, + const torch::Tensor& voxel_size, + const torch::Tensor& grid_range, + float face_weight, + float boundary_weight, + float regularization_weight, + int64_t intersect_chunk_triangles, + int boundary_chunk_steps +); + + +/** + * Intersection occupancy only (CUDA). + */ +torch::Tensor intersection_occ_gpu( + const torch::Tensor& triangles, + const torch::Tensor& voxel_size, + const torch::Tensor& grid_range, + int64_t chunk_triangles +); + + +/** + * Intersect and build QEF terms (CPU). + */ +std::tuple intersect_qef_cpu( + const torch::Tensor& triangles, + const torch::Tensor& voxel_size, + const torch::Tensor& grid_range +); + + +/** + * Intersect and build QEF terms (CUDA). + */ +std::tuple intersect_qef_gpu( + const torch::Tensor& triangles, + const torch::Tensor& voxel_size, + const torch::Tensor& grid_range, + int64_t chunk_triangles +); + + +/** + * Octree voxelization against mesh faces (CUDA). 
+ */ +std::tuple voxelize_mesh_oct_gpu( + const torch::Tensor& vertices, + const torch::Tensor& faces, + const torch::Tensor& voxel_size, + const torch::Tensor& grid_range +); + + +/** + * Octree voxelization against edges (CUDA). + */ +std::tuple voxelize_edge_oct_gpu( + const torch::Tensor& vertices, + const torch::Tensor& edges, + const torch::Tensor& voxel_size, + const torch::Tensor& grid_range +); + + +/** + * Face QEF accumulation (CPU). + */ +torch::Tensor face_qef_cpu( + const torch::Tensor& triangles, + const torch::Tensor& voxel_size, + const torch::Tensor& grid_range, + const torch::Tensor& voxels +); + + +/** + * Face QEF accumulation (CUDA). + */ +torch::Tensor face_qef_gpu( + const torch::Tensor& triangles, + const torch::Tensor& voxel_size, + const torch::Tensor& grid_range, + const torch::Tensor& voxels +); + + +/** + * Edge traversal with DDA (CUDA). + */ +std::tuple voxel_traverse_edge_dda_gpu( + const torch::Tensor& vertices, + const torch::Tensor& edges, + const torch::Tensor& voxel_size, + const torch::Tensor& grid_range, + int chunk_steps +); + + +/** + * Boundary QEF accumulation (CPU). + */ +torch::Tensor boundary_qef_cpu( + const torch::Tensor& boundaries, + const torch::Tensor& voxel_size, + const torch::Tensor& grid_range, + float boundary_weight, + const torch::Tensor& voxels +); + + +/** + * Boundary QEF accumulation (CUDA). 
+ */ +torch::Tensor boundary_qef_gpu( + const torch::Tensor& boundaries, + const torch::Tensor& voxel_size, + const torch::Tensor& grid_range, + float boundary_weight, + const torch::Tensor& voxels, + int chunk_steps +); + + /** * Voxelizes a triangle mesh with PBR materials * diff --git a/o-voxel/src/convert/flexible_dual_grid.cpp b/o-voxel/src/convert/flexible_dual_grid.cpp index ad89edc0..f72e4e22 100644 --- a/o-voxel/src/convert/flexible_dual_grid.cpp +++ b/o-voxel/src/convert/flexible_dual_grid.cpp @@ -519,7 +519,11 @@ std::tuple mesh_to_flexible_dual_gr // Face QEF computation if (face_weight > 0.0f) { start = clock(); - face_qef(e_voxel_size, e_grid_min, e_grid_max, triangles, hash_table, qefs); + std::vector face_qefs(voxels.size(), Eigen::Matrix4f::Zero()); + face_qef(e_voxel_size, e_grid_min, e_grid_max, triangles, hash_table, face_qefs); + for (size_t i = 0; i < qefs.size(); ++i) { + qefs[i] += face_weight * face_qefs[i]; + } end = clock(); if (timing) std::cout << "Face QEF computation took " << double(end - start) / CLOCKS_PER_SEC << " seconds." 
<< std::endl; } @@ -772,4 +776,3 @@ std::tuple mesh_to_flexible_dual_gr torch::from_blob(intersected.data(), {int(intersected.size()), 3}, torch::kBool).clone() ); } - diff --git a/o-voxel/src/convert/mesh_to_flexible_dual_grid_gpu/fdg_gpu_common.h b/o-voxel/src/convert/mesh_to_flexible_dual_grid_gpu/fdg_gpu_common.h new file mode 100644 index 00000000..247ffef3 --- /dev/null +++ b/o-voxel/src/convert/mesh_to_flexible_dual_grid_gpu/fdg_gpu_common.h @@ -0,0 +1,152 @@ +#pragma once + +#include + +#include +#include +#include +#include + +namespace fdg_gpu { + +inline void throw_cuda_error(cudaError_t error, const char* context) { + if (error == cudaSuccess) return; + throw std::runtime_error(std::string(context) + ": " + cudaGetErrorString(error)); +} + +struct int2_ { + int x; + int y; +}; + +struct int3_ { + int x; + int y; + int z; + + __host__ __device__ int& operator[](int i) { return (&x)[i]; } + __host__ __device__ int operator[](int i) const { return (&x)[i]; } +}; + +struct bool3_ { + bool x; + bool y; + bool z; + + __host__ __device__ bool& operator[](int i) { return (&x)[i]; } + __host__ __device__ bool operator[](int i) const { return (&x)[i]; } +}; + +template +class DeviceBuffer { +public: + DeviceBuffer() = default; + explicit DeviceBuffer(int64_t count) { allocate(count); } + ~DeviceBuffer() { release(); } + + DeviceBuffer(const DeviceBuffer&) = delete; + DeviceBuffer& operator=(const DeviceBuffer&) = delete; + + DeviceBuffer(DeviceBuffer&& other) noexcept + : ptr_(other.ptr_), size_(other.size_), owns_(other.owns_) { + other.ptr_ = nullptr; + other.size_ = 0; + other.owns_ = true; + } + + DeviceBuffer& operator=(DeviceBuffer&& other) noexcept { + if (this != &other) { + release(); + ptr_ = other.ptr_; + size_ = other.size_; + owns_ = other.owns_; + other.ptr_ = nullptr; + other.size_ = 0; + other.owns_ = true; + } + return *this; + } + + void allocate(int64_t count) { + if (count < 0) { + throw std::invalid_argument("DeviceBuffer::allocate count must 
be non-negative"); + } + release(); + size_ = count; + owns_ = true; + if (count == 0) return; + throw_cuda_error(cudaMalloc(reinterpret_cast(&ptr_), static_cast(count) * sizeof(T)), + "cudaMalloc failed in DeviceBuffer::allocate"); + } + + void adopt(T* ptr, int64_t count) { + release(); + ptr_ = ptr; + size_ = count; + owns_ = true; + } + + void clear_async(cudaStream_t stream = nullptr) { + if (size_ == 0) return; + throw_cuda_error(cudaMemsetAsync(ptr_, 0, static_cast(size_) * sizeof(T), stream), + "cudaMemsetAsync failed in DeviceBuffer::clear_async"); + } + + T* data() noexcept { return ptr_; } + const T* data() const noexcept { return ptr_; } + int64_t size() const noexcept { return size_; } + bool empty() const noexcept { return size_ == 0; } + + T* release_ownership() noexcept { + T* out = ptr_; + ptr_ = nullptr; + size_ = 0; + owns_ = true; + return out; + } + +private: + void release() noexcept { + if (ptr_ != nullptr && owns_) { + cudaFree(ptr_); + } + ptr_ = nullptr; + size_ = 0; + owns_ = true; + } + + T* ptr_ = nullptr; + int64_t size_ = 0; + bool owns_ = true; +}; + +struct SymQEF10 { + float q00, q01, q02, q03; + float q11, q12, q13; + float q22, q23; + float q33; +}; + +struct PrimitivePairResult { + int64_t size = 0; + DeviceBuffer prim_id; + DeviceBuffer voxel_i; + DeviceBuffer voxel_j; + DeviceBuffer voxel_k; +}; + +__host__ __device__ __forceinline__ int ceil_div_i64(int64_t n, int block) { + return static_cast((n + block - 1) / block); +} + +__host__ __device__ __forceinline__ uint64_t pack_voxel_key( + int x, int y, int z, int3_ grid_min, int3_ grid_max) { + const uint64_t sx = static_cast(grid_max.x - grid_min.x); + const uint64_t sy = static_cast(grid_max.y - grid_min.y); + const uint64_t ux = static_cast(x - grid_min.x); + const uint64_t uy = static_cast(y - grid_min.y); + const uint64_t uz = static_cast(z - grid_min.z); + return ux + sx * (uy + sy * uz); +} + +} // namespace fdg_gpu diff --git 
a/o-voxel/src/convert/mesh_to_flexible_dual_grid_gpu/fdg_gpu_small_cpqr_device.cuh b/o-voxel/src/convert/mesh_to_flexible_dual_grid_gpu/fdg_gpu_small_cpqr_device.cuh new file mode 100644 index 00000000..e0394a3d --- /dev/null +++ b/o-voxel/src/convert/mesh_to_flexible_dual_grid_gpu/fdg_gpu_small_cpqr_device.cuh @@ -0,0 +1,316 @@ +#pragma once + +#include +#include + +namespace fdg_gpu::small_cpqr { +namespace detail { + +template +__device__ __forceinline__ float absf(float x) { + return x < 0.0f ? -x : x; +} + +template +__device__ __forceinline__ void swap_cols( + float* qr, + float* col_norms_updated, + float* col_norms_direct, + int* perm, + int c0, + int c1) { + if (c0 == c1) return; + #pragma unroll + for (int r = 0; r < N; ++r) { + const float tmp = qr[r * N + c0]; + qr[r * N + c0] = qr[r * N + c1]; + qr[r * N + c1] = tmp; + } + const float tmp_u = col_norms_updated[c0]; + col_norms_updated[c0] = col_norms_updated[c1]; + col_norms_updated[c1] = tmp_u; + + const float tmp_d = col_norms_direct[c0]; + col_norms_direct[c0] = col_norms_direct[c1]; + col_norms_direct[c1] = tmp_d; + + const int tmp_p = perm[c0]; + perm[c0] = perm[c1]; + perm[c1] = tmp_p; +} + +template +__device__ __forceinline__ void make_householder_real( + float x0, + const float* tail_in, + int tail_len, + float* beta, + float* tau, + float* essential_out) { + float tail_sq_norm = 0.0f; + #pragma unroll + for (int i = 0; i < N - 1; ++i) { + if (i < tail_len) { + tail_sq_norm += tail_in[i] * tail_in[i]; + } + } + + const float tol = FLT_MIN; + if (tail_sq_norm <= tol) { + *beta = x0; + *tau = 0.0f; + #pragma unroll + for (int i = 0; i < N - 1; ++i) { + essential_out[i] = 0.0f; + } + return; + } + + float b = sqrtf(x0 * x0 + tail_sq_norm); + if (x0 >= 0.0f) { + b = -b; + } + const float denom = x0 - b; + #pragma unroll + for (int i = 0; i < N - 1; ++i) { + essential_out[i] = (i < tail_len) ? 
(tail_in[i] / denom) : 0.0f; + } + *beta = b; + *tau = (b - x0) / b; +} + +template +__device__ __forceinline__ void apply_householder_left_matrix( + float* qr, + int row0, + int col0, + const float* essential, + int tail_len, + float tau) { + if (tau == 0.0f) return; + #pragma unroll + for (int j = 0; j < N; ++j) { + if (j < col0) continue; + float tmp = qr[row0 * N + j]; + #pragma unroll + for (int i = 0; i < N - 1; ++i) { + if (i < tail_len) { + tmp += essential[i] * qr[(row0 + 1 + i) * N + j]; + } + } + qr[row0 * N + j] -= tau * tmp; + #pragma unroll + for (int i = 0; i < N - 1; ++i) { + if (i < tail_len) { + qr[(row0 + 1 + i) * N + j] -= tau * essential[i] * tmp; + } + } + } +} + +template +__device__ __forceinline__ void apply_householder_left_vector( + float* c, + int row0, + const float* essential, + int tail_len, + float tau) { + if (tau == 0.0f) return; + float tmp = c[row0]; + #pragma unroll + for (int i = 0; i < N - 1; ++i) { + if (i < tail_len) { + tmp += essential[i] * c[row0 + 1 + i]; + } + } + c[row0] -= tau * tmp; + #pragma unroll + for (int i = 0; i < N - 1; ++i) { + if (i < tail_len) { + c[row0 + 1 + i] -= tau * essential[i] * tmp; + } + } +} + +template +__device__ __forceinline__ void backsolve_upper_ranked( + const float* qr, + int rank, + const float* c, + const int* perm, + float* x_out) { + float y[N]; + #pragma unroll + for (int i = 0; i < N; ++i) { + y[i] = 0.0f; + x_out[i] = 0.0f; + } + for (int i = rank - 1; i >= 0; --i) { + float s = c[i]; + #pragma unroll + for (int j = 0; j < N; ++j) { + if (j > i && j < rank) { + s -= qr[i * N + j] * y[j]; + } + } + y[i] = s / qr[i * N + i]; + } + #pragma unroll + for (int i = 0; i < N; ++i) { + if (i < rank) { + x_out[perm[i]] = y[i]; + } else { + x_out[perm[i]] = 0.0f; + } + } +} + +template +__device__ __forceinline__ void cpqr_solve_small_impl( + const float* A_in, + const float* b_in, + float* x_out) { + float qr[N * N]; + float c[N]; + int perm[N]; + float col_norms_direct[N]; + float 
col_norms_updated[N]; + float essential[N > 1 ? N - 1 : 1]; + + #pragma unroll + for (int i = 0; i < N * N; ++i) { + qr[i] = A_in[i]; + } + #pragma unroll + for (int i = 0; i < N; ++i) { + c[i] = b_in[i]; + perm[i] = i; + x_out[i] = 0.0f; + } + + #pragma unroll + for (int j = 0; j < N; ++j) { + float norm_sq = 0.0f; + #pragma unroll + for (int r = 0; r < N; ++r) { + const float v = qr[r * N + j]; + norm_sq += v * v; + } + const float norm = sqrtf(norm_sq); + col_norms_direct[j] = norm; + col_norms_updated[j] = norm; + } + + float max_norm_updated = col_norms_updated[0]; + #pragma unroll + for (int j = 1; j < N; ++j) { + if (col_norms_updated[j] > max_norm_updated) { + max_norm_updated = col_norms_updated[j]; + } + } + + const float threshold_helper = (max_norm_updated * FLT_EPSILON) * (max_norm_updated * FLT_EPSILON) / float(N); + const float norm_downdate_threshold = sqrtf(FLT_EPSILON); + int nonzero_pivots = N; + float maxpivot = 0.0f; + + #pragma unroll + for (int k = 0; k < N; ++k) { + int biggest_col_index = k; + float best_updated = col_norms_updated[k]; + #pragma unroll + for (int j = 0; j < N; ++j) { + if (j > k && col_norms_updated[j] > best_updated) { + best_updated = col_norms_updated[j]; + biggest_col_index = j; + } + } + const float biggest_col_sq_norm = best_updated * best_updated; + if (nonzero_pivots == N && biggest_col_sq_norm < threshold_helper * float(N - k)) { + nonzero_pivots = k; + } + + swap_cols(qr, col_norms_updated, col_norms_direct, perm, k, biggest_col_index); + + const int tail_len = N - k - 1; + float tail_local[N > 1 ? N - 1 : 1]; + #pragma unroll + for (int i = 0; i < N - 1; ++i) { + tail_local[i] = (i < tail_len) ? 
qr[(k + 1 + i) * N + k] : 0.0f; + } + + float beta = 0.0f; + float tau = 0.0f; + make_householder_real(qr[k * N + k], tail_local, tail_len, &beta, &tau, essential); + + qr[k * N + k] = beta; + #pragma unroll + for (int i = 0; i < N - 1; ++i) { + if (i < tail_len) { + qr[(k + 1 + i) * N + k] = essential[i]; + } + } + const float abs_beta = absf(beta); + if (abs_beta > maxpivot) { + maxpivot = abs_beta; + } + + apply_householder_left_matrix(qr, k, k + 1, essential, tail_len, tau); + if (k < nonzero_pivots) { + apply_householder_left_vector(c, k, essential, tail_len, tau); + } + + #pragma unroll + for (int j = 0; j < N; ++j) { + if (j <= k) continue; + if (col_norms_updated[j] != 0.0f) { + float temp = absf(qr[k * N + j]) / col_norms_updated[j]; + temp = (1.0f + temp) * (1.0f - temp); + if (temp < 0.0f) temp = 0.0f; + const float ratio = col_norms_updated[j] / col_norms_direct[j]; + const float temp2 = temp * ratio * ratio; + if (temp2 <= norm_downdate_threshold) { + float norm_sq = 0.0f; + #pragma unroll + for (int r = 0; r < N; ++r) { + if (r > k) { + const float v = qr[r * N + j]; + norm_sq += v * v; + } + } + const float norm = sqrtf(norm_sq); + col_norms_direct[j] = norm; + col_norms_updated[j] = norm; + } else { + col_norms_updated[j] *= sqrtf(temp); + } + } + } + } + + if (nonzero_pivots == 0) { + #pragma unroll + for (int i = 0; i < N; ++i) { + x_out[i] = 0.0f; + } + return; + } + + backsolve_upper_ranked(qr, nonzero_pivots, c, perm, x_out); +} + +} // namespace detail + +__device__ __forceinline__ void cpqr_solve_3x3(const float A[9], const float b[3], float x[3]) { + detail::cpqr_solve_small_impl<3>(A, b, x); +} + +__device__ __forceinline__ void cpqr_solve_2x2(const float A[4], const float b[2], float x[2]) { + detail::cpqr_solve_small_impl<2>(A, b, x); +} + +__device__ __forceinline__ float solve_1x1_unchecked(float a, float rhs) { + return rhs / a; +} + +} // namespace fdg_gpu::small_cpqr diff --git 
a/o-voxel/src/convert/mesh_to_flexible_dual_grid_gpu/flexible_dual_grid_gpu.cu b/o-voxel/src/convert/mesh_to_flexible_dual_grid_gpu/flexible_dual_grid_gpu.cu new file mode 100644 index 00000000..71549fab --- /dev/null +++ b/o-voxel/src/convert/mesh_to_flexible_dual_grid_gpu/flexible_dual_grid_gpu.cu @@ -0,0 +1,766 @@ +#include "flexible_dual_grid_gpu.h" + +#include "intersection_qef.h" +#include "voxel_traverse_edge_dda.h" +#include "voxelize_mesh_oct.h" + +#include +#include +#include "fdg_gpu_small_cpqr_device.cuh" + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +namespace fdg_gpu { +namespace { + +constexpr int kBlockSize = 128; + +struct FlatTriangles { + int64_t num_triangles = 0; + DeviceBuffer triangles; // [3 * num_triangles, 3] +}; + +struct BoundaryEdgeIndexResult { + int64_t size = 0; + DeviceBuffer edge_vertex_ids; // [size, 2] +}; + +struct BoundarySegments { + int64_t size = 0; + DeviceBuffer segments; // [2 * size, 3] +}; + +struct IsOne { + __host__ __device__ bool operator()(int v) const { return v == 1; } +}; + +__host__ __device__ __forceinline__ uint64_t pack_edge_key(int a, int b) { + return (static_cast(static_cast(a)) << 32) | + static_cast(b); +} + +__host__ __device__ __forceinline__ int edge_key_v0(uint64_t key) { + return static_cast(key >> 32); +} + +__host__ __device__ __forceinline__ int edge_key_v1(uint64_t key) { + return static_cast(key & 0xffffffffu); +} + +__host__ __device__ __forceinline__ SymQEF10 sym10_zero() { + return SymQEF10{0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f}; +} + +__host__ __device__ __forceinline__ SymQEF10 sym10_add(const SymQEF10& a, const SymQEF10& b) { + return SymQEF10{ + a.q00 + b.q00, + a.q01 + b.q01, + a.q02 + b.q02, + a.q03 + b.q03, + a.q11 + b.q11, + a.q12 + b.q12, + a.q13 + b.q13, + a.q22 + b.q22, + a.q23 + b.q23, + a.q33 + b.q33, + }; +} + +__host__ __device__ __forceinline__ SymQEF10 sym10_scale(const SymQEF10& a, 
float s) { + return SymQEF10{ + a.q00 * s, + a.q01 * s, + a.q02 * s, + a.q03 * s, + a.q11 * s, + a.q12 * s, + a.q13 * s, + a.q22 * s, + a.q23 * s, + a.q33 * s, + }; +} + +__global__ void gather_flat_triangles_kernel( + const float* vertices, + const int32_t* faces, + int64_t num_faces, + float* triangles) { + const int64_t tid = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + if (tid >= 3 * num_faces) return; + + const int64_t f = tid / 3; + const int lv = static_cast(tid % 3); + const int32_t vid = faces[3 * f + lv]; + + triangles[3 * tid + 0] = vertices[3 * vid + 0]; + triangles[3 * tid + 1] = vertices[3 * vid + 1]; + triangles[3 * tid + 2] = vertices[3 * vid + 2]; +} + +__global__ void emit_face_edges_kernel( + const int32_t* faces, + int64_t num_faces, + uint64_t* edge_keys) { + const int64_t f = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + if (f >= num_faces) return; + + int32_t e00 = faces[3 * f + 0]; + int32_t e01 = faces[3 * f + 1]; + int32_t e10 = faces[3 * f + 1]; + int32_t e11 = faces[3 * f + 2]; + int32_t e20 = faces[3 * f + 2]; + int32_t e21 = faces[3 * f + 0]; + + if (e00 > e01) { const int32_t t = e00; e00 = e01; e01 = t; } + if (e10 > e11) { const int32_t t = e10; e10 = e11; e11 = t; } + if (e20 > e21) { const int32_t t = e20; e20 = e21; e21 = t; } + + edge_keys[3 * f + 0] = pack_edge_key(e00, e01); + edge_keys[3 * f + 1] = pack_edge_key(e10, e11); + edge_keys[3 * f + 2] = pack_edge_key(e20, e21); +} + +__global__ void unpack_boundary_keys_kernel( + const uint64_t* boundary_keys, + int64_t size, + int32_t* edge_vertex_ids) { + const int64_t i = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + if (i >= size) return; + + const uint64_t key = boundary_keys[i]; + edge_vertex_ids[2 * i + 0] = edge_key_v0(key); + edge_vertex_ids[2 * i + 1] = edge_key_v1(key); +} + +__global__ void gather_boundary_segments_kernel( + const float* vertices, + const int32_t* edge_vertex_ids, + int64_t num_boundary_edges, + float* segments) { + const 
int64_t eid = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + if (eid >= num_boundary_edges) return; + + const int32_t v0 = edge_vertex_ids[2 * eid + 0]; + const int32_t v1 = edge_vertex_ids[2 * eid + 1]; + + segments[6 * eid + 0] = vertices[3 * v0 + 0]; + segments[6 * eid + 1] = vertices[3 * v0 + 1]; + segments[6 * eid + 2] = vertices[3 * v0 + 2]; + segments[6 * eid + 3] = vertices[3 * v1 + 0]; + segments[6 * eid + 4] = vertices[3 * v1 + 1]; + segments[6 * eid + 5] = vertices[3 * v1 + 2]; +} + +__global__ void zero_qef_kernel(SymQEF10* qefs, int64_t n) { + const int64_t i = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + if (i >= n) return; + qefs[i] = sym10_zero(); +} + +__global__ void sum_qef_kernel( + const SymQEF10* qef_init, + const SymQEF10* qef_face, + const SymQEF10* qef_boundary, + int64_t n, + float face_weight, + SymQEF10* qef_total) { + const int64_t i = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + if (i >= n) return; + + SymQEF10 q = qef_init[i]; + q = sym10_add(q, sym10_scale(qef_face[i], face_weight)); + q = sym10_add(q, qef_boundary[i]); + qef_total[i] = q; +} + +__global__ void unpack_intersected_kernel( + const uint8_t* intersected_mask, + int64_t n, + bool* intersected_bool) { + const int64_t i = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + if (i >= n) return; + + const uint8_t m = intersected_mask[i]; + intersected_bool[3 * i + 0] = (m & (1u << 0)) != 0; + intersected_bool[3 * i + 1] = (m & (1u << 1)) != 0; + intersected_bool[3 * i + 2] = (m & (1u << 2)) != 0; +} + + +__device__ __forceinline__ int idx4(int r, int c) { return r * 4 + c; } +__device__ __forceinline__ int idx3(int r, int c) { return r * 3 + c; } +__device__ __forceinline__ int idx2(int r, int c) { return r * 2 + c; } + +__device__ __forceinline__ void sym10_to_dense4x4(const SymQEF10& q, float Q[16]) { + Q[idx4(0,0)] = q.q00; Q[idx4(0,1)] = q.q01; Q[idx4(0,2)] = q.q02; Q[idx4(0,3)] = q.q03; + Q[idx4(1,0)] = q.q01; Q[idx4(1,1)] = q.q11; Q[idx4(1,2)] 
= q.q12; Q[idx4(1,3)] = q.q13; + Q[idx4(2,0)] = q.q02; Q[idx4(2,1)] = q.q12; Q[idx4(2,2)] = q.q22; Q[idx4(2,3)] = q.q23; + Q[idx4(3,0)] = q.q03; Q[idx4(3,1)] = q.q13; Q[idx4(3,2)] = q.q23; Q[idx4(3,3)] = q.q33; +} + +__device__ __forceinline__ bool point_inside_box3( + const float v[3], + const float min_corner[3], + const float max_corner[3]) { + return ( + v[0] >= min_corner[0] && v[0] <= max_corner[0] && + v[1] >= min_corner[1] && v[1] <= max_corner[1] && + v[2] >= min_corner[2] && v[2] <= max_corner[2]); +} + +__device__ __forceinline__ float qef_error4(const float Q[16], const float p[4]) { + const float y0 = Q[idx4(0,0)] * p[0] + Q[idx4(0,1)] * p[1] + Q[idx4(0,2)] * p[2] + Q[idx4(0,3)] * p[3]; + const float y1 = Q[idx4(1,0)] * p[0] + Q[idx4(1,1)] * p[1] + Q[idx4(1,2)] * p[2] + Q[idx4(1,3)] * p[3]; + const float y2 = Q[idx4(2,0)] * p[0] + Q[idx4(2,1)] * p[1] + Q[idx4(2,2)] * p[2] + Q[idx4(2,3)] * p[3]; + const float y3 = Q[idx4(3,0)] * p[0] + Q[idx4(3,1)] * p[1] + Q[idx4(3,2)] * p[2] + Q[idx4(3,3)] * p[3]; + return p[0] * y0 + p[1] * y1 + p[2] * y2 + p[3] * y3; +} + +__device__ __forceinline__ void add_qef_regularization_inplace( + float Q[16], + const float mean_sum[3], + float cnt, + float regularization_weight) { + if (regularization_weight <= 0.0f || cnt <= 0.0f) { + return; + } + + const float px = mean_sum[0] / cnt; + const float py = mean_sum[1] / cnt; + const float pz = mean_sum[2] / cnt; + const float w = regularization_weight * cnt; + + Q[idx4(0,0)] += w; + Q[idx4(1,1)] += w; + Q[idx4(2,2)] += w; + + Q[idx4(0,3)] += -w * px; + Q[idx4(1,3)] += -w * py; + Q[idx4(2,3)] += -w * pz; + + Q[idx4(3,0)] += -w * px; + Q[idx4(3,1)] += -w * py; + Q[idx4(3,2)] += -w * pz; + + Q[idx4(3,3)] += w * (px * px + py * py + pz * pz); +} + +__device__ __forceinline__ void try_single_constraint( + const float Q[16], + int fixed_axis, + const float min_corner[3], + const float max_corner[3], + float& best, + float v_new[3]) { + const int ax1 = (fixed_axis + 1) % 3; + const 
int ax2 = (fixed_axis + 2) % 3; + + float A2[4]; + float B2[4]; + float q2[2]; + float rhs2[2]; + float x2[2]; + + A2[idx2(0,0)] = Q[idx4(ax1, ax1)]; + A2[idx2(0,1)] = Q[idx4(ax1, ax2)]; + A2[idx2(1,0)] = Q[idx4(ax2, ax1)]; + A2[idx2(1,1)] = Q[idx4(ax2, ax2)]; + + B2[idx2(0,0)] = Q[idx4(ax1, fixed_axis)]; + B2[idx2(0,1)] = Q[idx4(ax1, 3)]; + B2[idx2(1,0)] = Q[idx4(ax2, fixed_axis)]; + B2[idx2(1,1)] = Q[idx4(ax2, 3)]; + + q2[0] = min_corner[fixed_axis]; + q2[1] = 1.0f; + rhs2[0] = -(B2[idx2(0,0)] * q2[0] + B2[idx2(0,1)] * q2[1]); + rhs2[1] = -(B2[idx2(1,0)] * q2[0] + B2[idx2(1,1)] * q2[1]); + fdg_gpu::small_cpqr::cpqr_solve_2x2(A2, rhs2, x2); + if (x2[0] >= min_corner[ax1] && x2[0] <= max_corner[ax1] && + x2[1] >= min_corner[ax2] && x2[1] <= max_corner[ax2]) { + float p4[4]; + p4[fixed_axis] = min_corner[fixed_axis]; + p4[ax1] = x2[0]; + p4[ax2] = x2[1]; + p4[3] = 1.0f; + const float err = qef_error4(Q, p4); + if (err < best) { + best = err; + v_new[0] = p4[0]; + v_new[1] = p4[1]; + v_new[2] = p4[2]; + } + } + + q2[0] = max_corner[fixed_axis]; + q2[1] = 1.0f; + rhs2[0] = -(B2[idx2(0,0)] * q2[0] + B2[idx2(0,1)] * q2[1]); + rhs2[1] = -(B2[idx2(1,0)] * q2[0] + B2[idx2(1,1)] * q2[1]); + fdg_gpu::small_cpqr::cpqr_solve_2x2(A2, rhs2, x2); + if (x2[0] >= min_corner[ax1] && x2[0] <= max_corner[ax1] && + x2[1] >= min_corner[ax2] && x2[1] <= max_corner[ax2]) { + float p4[4]; + p4[fixed_axis] = max_corner[fixed_axis]; + p4[ax1] = x2[0]; + p4[ax2] = x2[1]; + p4[3] = 1.0f; + const float err = qef_error4(Q, p4); + if (err < best) { + best = err; + v_new[0] = p4[0]; + v_new[1] = p4[1]; + v_new[2] = p4[2]; + } + } +} + +__device__ __forceinline__ void try_two_constraint( + const float Q[16], + int free_axis, + const float min_corner[3], + const float max_corner[3], + float& best, + float v_new[3]) { + const int ax1 = (free_axis + 1) % 3; + const int ax2 = (free_axis + 2) % 3; + + const float a = Q[idx4(free_axis, free_axis)]; + const float b0 = Q[idx4(free_axis, ax1)]; + const 
float b1 = Q[idx4(free_axis, ax2)]; + const float b2 = Q[idx4(free_axis, 3)]; + + float rhs = -(b0 * min_corner[ax1] + b1 * min_corner[ax2] + b2); + float x = fdg_gpu::small_cpqr::solve_1x1_unchecked(a, rhs); + if (x >= min_corner[free_axis] && x <= max_corner[free_axis]) { + float p4[4]; + p4[free_axis] = x; + p4[ax1] = min_corner[ax1]; + p4[ax2] = min_corner[ax2]; + p4[3] = 1.0f; + const float err = qef_error4(Q, p4); + if (err < best) { + best = err; + v_new[0] = p4[0]; + v_new[1] = p4[1]; + v_new[2] = p4[2]; + } + } + + rhs = -(b0 * min_corner[ax1] + b1 * max_corner[ax2] + b2); + x = fdg_gpu::small_cpqr::solve_1x1_unchecked(a, rhs); + if (x >= min_corner[free_axis] && x <= max_corner[free_axis]) { + float p4[4]; + p4[free_axis] = x; + p4[ax1] = min_corner[ax1]; + p4[ax2] = max_corner[ax2]; + p4[3] = 1.0f; + const float err = qef_error4(Q, p4); + if (err < best) { + best = err; + v_new[0] = p4[0]; + v_new[1] = p4[1]; + v_new[2] = p4[2]; + } + } + + rhs = -(b0 * max_corner[ax1] + b1 * min_corner[ax2] + b2); + x = fdg_gpu::small_cpqr::solve_1x1_unchecked(a, rhs); + if (x >= min_corner[free_axis] && x <= max_corner[free_axis]) { + float p4[4]; + p4[free_axis] = x; + p4[ax1] = max_corner[ax1]; + p4[ax2] = min_corner[ax2]; + p4[3] = 1.0f; + const float err = qef_error4(Q, p4); + if (err < best) { + best = err; + v_new[0] = p4[0]; + v_new[1] = p4[1]; + v_new[2] = p4[2]; + } + } + + rhs = -(b0 * max_corner[ax1] + b1 * max_corner[ax2] + b2); + x = fdg_gpu::small_cpqr::solve_1x1_unchecked(a, rhs); + if (x >= min_corner[free_axis] && x <= max_corner[free_axis]) { + float p4[4]; + p4[free_axis] = x; + p4[ax1] = max_corner[ax1]; + p4[ax2] = max_corner[ax2]; + p4[3] = 1.0f; + const float err = qef_error4(Q, p4); + if (err < best) { + best = err; + v_new[0] = p4[0]; + v_new[1] = p4[1]; + v_new[2] = p4[2]; + } + } +} + +__device__ __forceinline__ void try_three_constraint( + const float Q[16], + const float min_corner[3], + const float max_corner[3], + float& best, + float 
v_new[3]) { + for (int x_constraint = 0; x_constraint < 2; ++x_constraint) { + for (int y_constraint = 0; y_constraint < 2; ++y_constraint) { + for (int z_constraint = 0; z_constraint < 2; ++z_constraint) { + float p4[4]; + p4[0] = x_constraint ? min_corner[0] : max_corner[0]; + p4[1] = y_constraint ? min_corner[1] : max_corner[1]; + p4[2] = z_constraint ? min_corner[2] : max_corner[2]; + p4[3] = 1.0f; + const float err = qef_error4(Q, p4); + if (err < best) { + best = err; + v_new[0] = p4[0]; + v_new[1] = p4[1]; + v_new[2] = p4[2]; + } + } + } + } +} + +__global__ void solve_qef_full_kernel( + const int* voxel_coords, + const float* mean_sum, + const float* cnt, + const SymQEF10* qef_total, + int64_t n, + float3 voxel_size, + float regularization_weight, + float* dual_vertices) { + const int64_t i = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + if (i >= n) return; + + const int x = voxel_coords[3 * i + 0]; + const int y = voxel_coords[3 * i + 1]; + const int z = voxel_coords[3 * i + 2]; + + float min_corner[3] = { + x * voxel_size.x, + y * voxel_size.y, + z * voxel_size.z, + }; + float max_corner[3] = { + (x + 1) * voxel_size.x, + (y + 1) * voxel_size.y, + (z + 1) * voxel_size.z, + }; + + float Q[16]; + sym10_to_dense4x4(qef_total[i], Q); + + const float mean_i[3] = { + mean_sum[3 * i + 0], + mean_sum[3 * i + 1], + mean_sum[3 * i + 2], + }; + add_qef_regularization_inplace(Q, mean_i, cnt[i], regularization_weight); + + float A3[9]; + float b3[3]; + float v_new[3]; + + A3[idx3(0,0)] = Q[idx4(0,0)]; + A3[idx3(0,1)] = Q[idx4(0,1)]; + A3[idx3(0,2)] = Q[idx4(0,2)]; + A3[idx3(1,0)] = Q[idx4(1,0)]; + A3[idx3(1,1)] = Q[idx4(1,1)]; + A3[idx3(1,2)] = Q[idx4(1,2)]; + A3[idx3(2,0)] = Q[idx4(2,0)]; + A3[idx3(2,1)] = Q[idx4(2,1)]; + A3[idx3(2,2)] = Q[idx4(2,2)]; + + b3[0] = -Q[idx4(0,3)]; + b3[1] = -Q[idx4(1,3)]; + b3[2] = -Q[idx4(2,3)]; + + fdg_gpu::small_cpqr::cpqr_solve_3x3(A3, b3, v_new); + + if (!point_inside_box3(v_new, min_corner, max_corner)) { + float best = 
CUDART_INF_F; + try_single_constraint(Q, 0, min_corner, max_corner, best, v_new); + try_single_constraint(Q, 1, min_corner, max_corner, best, v_new); + try_single_constraint(Q, 2, min_corner, max_corner, best, v_new); + try_two_constraint(Q, 0, min_corner, max_corner, best, v_new); + try_two_constraint(Q, 1, min_corner, max_corner, best, v_new); + try_two_constraint(Q, 2, min_corner, max_corner, best, v_new); + try_three_constraint(Q, min_corner, max_corner, best, v_new); + } + + dual_vertices[3 * i + 0] = v_new[0]; + dual_vertices[3 * i + 1] = v_new[1]; + dual_vertices[3 * i + 2] = v_new[2]; +} + +inline FlatTriangles build_flat_triangles_gpu( + const float* vertices, + const int32_t* faces, + int64_t num_faces, + cudaStream_t stream) { + FlatTriangles out; + out.num_triangles = num_faces; + out.triangles.allocate(9 * num_faces); + if (num_faces == 0) return out; + + gather_flat_triangles_kernel<<>>( + vertices, + faces, + num_faces, + out.triangles.data()); + throw_cuda_error(cudaGetLastError(), "gather_flat_triangles_kernel"); + return out; +} + +inline BoundaryEdgeIndexResult detect_boundary_edges_gpu( + const int32_t* faces, + int64_t num_faces, + cudaStream_t stream) { + BoundaryEdgeIndexResult out; + if (num_faces == 0) return out; + + DeviceBuffer edge_keys(3 * num_faces); + emit_face_edges_kernel<<>>( + faces, + num_faces, + edge_keys.data()); + throw_cuda_error(cudaGetLastError(), "emit_face_edges_kernel"); + + auto policy = thrust::cuda::par.on(stream); + auto edge_keys_begin = thrust::device_pointer_cast(edge_keys.data()); + thrust::sort(policy, edge_keys_begin, edge_keys_begin + 3 * num_faces); + + DeviceBuffer unique_keys(3 * num_faces); + DeviceBuffer counts(3 * num_faces); + auto reduce_end = thrust::reduce_by_key( + policy, + edge_keys_begin, + edge_keys_begin + 3 * num_faces, + thrust::make_constant_iterator(1), + thrust::device_pointer_cast(unique_keys.data()), + thrust::device_pointer_cast(counts.data())); + const int64_t unique_size = 
reduce_end.first - thrust::device_pointer_cast(unique_keys.data()); + if (unique_size == 0) return out; + + const int64_t boundary_count = thrust::count_if( + policy, + thrust::device_pointer_cast(counts.data()), + thrust::device_pointer_cast(counts.data()) + unique_size, + IsOne{}); + out.size = boundary_count; + out.edge_vertex_ids.allocate(2 * boundary_count); + if (boundary_count == 0) return out; + + DeviceBuffer boundary_keys(boundary_count); + auto copied_end = thrust::copy_if( + policy, + thrust::device_pointer_cast(unique_keys.data()), + thrust::device_pointer_cast(unique_keys.data()) + unique_size, + thrust::device_pointer_cast(counts.data()), + thrust::device_pointer_cast(boundary_keys.data()), + IsOne{}); + const int64_t copied = copied_end - thrust::device_pointer_cast(boundary_keys.data()); + if (copied != boundary_count) { + throw std::runtime_error("boundary edge count mismatch"); + } + + unpack_boundary_keys_kernel<<>>( + boundary_keys.data(), + boundary_count, + out.edge_vertex_ids.data()); + throw_cuda_error(cudaGetLastError(), "unpack_boundary_keys_kernel"); + return out; +} + +inline BoundarySegments gather_boundary_segments_gpu( + const float* vertices, + const BoundaryEdgeIndexResult& boundary_edges, + cudaStream_t stream) { + BoundarySegments out; + out.size = boundary_edges.size; + out.segments.allocate(6 * out.size); + if (out.size == 0) return out; + + gather_boundary_segments_kernel<<>>( + vertices, + boundary_edges.edge_vertex_ids.data(), + out.size, + out.segments.data()); + throw_cuda_error(cudaGetLastError(), "gather_boundary_segments_kernel"); + return out; +} + +inline DeviceBuffer make_zero_qef_buffer(int64_t n, cudaStream_t stream) { + DeviceBuffer out(n); + if (n > 0) { + zero_qef_kernel<<>>(out.data(), n); + throw_cuda_error(cudaGetLastError(), "zero_qef_kernel"); + } + return out; +} + +} // namespace + +cudaError_t mesh_to_flexible_dual_grid_gpu( + const float* vertices, + int64_t num_vertices, + const int32_t* faces, + 
int64_t num_faces, + float3 voxel_size, + int3_ grid_min, + int3_ grid_max, + float face_weight, + float boundary_weight, + float regularization_weight, + int64_t intersect_chunk_triangles, + int boundary_chunk_steps, + cudaStream_t stream, + FlexibleDualGridGPUOutput* out) { + if (out == nullptr) { + return cudaErrorInvalidValue; + } + out->size = 0; + out->voxel_coords = nullptr; + out->dual_vertices = nullptr; + out->intersected = nullptr; + + if (num_vertices < 0 || num_faces < 0) { + return cudaErrorInvalidValue; + } + if (!(voxel_size.x > 0.0f && voxel_size.y > 0.0f && voxel_size.z > 0.0f)) { + return cudaErrorInvalidValue; + } + if (grid_max.x <= grid_min.x || grid_max.y <= grid_min.y || grid_max.z <= grid_min.z) { + return cudaErrorInvalidValue; + } + if (num_vertices > 0 && vertices == nullptr) { + return cudaErrorInvalidValue; + } + if (num_faces > 0 && faces == nullptr) { + return cudaErrorInvalidValue; + } + if (intersect_chunk_triangles <= 0 || boundary_chunk_steps <= 0) { + return cudaErrorInvalidValue; + } + + try { + FlatTriangles flat = build_flat_triangles_gpu(vertices, faces, num_faces, stream); + + auto surface = intersection_qef::intersect_qef_gpu( + flat.triangles.data(), + flat.num_triangles, + voxel_size, + grid_min, + grid_max, + intersect_chunk_triangles, + stream); + + auto face_qefs = make_zero_qef_buffer(surface.size, stream); + if (surface.size > 0 && face_weight > 0.0f) { + auto face_result = oct_pairs::face_qef_gpu( + voxel_size, + grid_min, + grid_max, + flat.triangles.data(), + flat.num_triangles, + surface.voxels.data(), + surface.size, + stream); + face_qefs = std::move(face_result.qefs); + } + + auto boundary_qefs = make_zero_qef_buffer(surface.size, stream); + if (surface.size > 0 && boundary_weight > 0.0f) { + BoundaryEdgeIndexResult boundary_edges = detect_boundary_edges_gpu(faces, num_faces, stream); + BoundarySegments boundary_segments = gather_boundary_segments_gpu(vertices, boundary_edges, stream); + auto boundary_result 
= edge_dda::boundary_qef_gpu( + voxel_size, + grid_min, + grid_max, + boundary_segments.segments.data(), + boundary_segments.size, + boundary_weight, + surface.voxels.data(), + surface.size, + boundary_chunk_steps, + stream); + boundary_qefs = std::move(boundary_result.qefs); + } + + DeviceBuffer qef_total(surface.size); + if (surface.size > 0) { + sum_qef_kernel<<>>( + surface.qefs.data(), + face_qefs.data(), + boundary_qefs.data(), + surface.size, + face_weight, + qef_total.data()); + throw_cuda_error(cudaGetLastError(), "sum_qef_kernel"); + } + + DeviceBuffer dual_vertices(3 * surface.size); + if (surface.size > 0) { + solve_qef_full_kernel<<>>( + surface.voxels.data(), + surface.mean_sum.data(), + surface.cnt.data(), + qef_total.data(), + surface.size, + voxel_size, + regularization_weight, + dual_vertices.data()); + throw_cuda_error(cudaGetLastError(), "solve_qef_full_kernel"); + } + + DeviceBuffer intersected_bool(3 * surface.size); + if (surface.size > 0) { + unpack_intersected_kernel<<>>( + surface.intersected.data(), + surface.size, + intersected_bool.data()); + throw_cuda_error(cudaGetLastError(), "unpack_intersected_kernel"); + } + + out->size = surface.size; + out->voxel_coords = surface.voxels.release_ownership(); + out->dual_vertices = dual_vertices.release_ownership(); + out->intersected = intersected_bool.release_ownership(); + return cudaSuccess; + } catch (const std::bad_alloc&) { + return cudaErrorMemoryAllocation; + } catch (const std::invalid_argument&) { + return cudaErrorInvalidValue; + } catch (const std::exception&) { + return cudaErrorUnknown; + } +} + +void free_flexible_dual_grid_gpu_output(FlexibleDualGridGPUOutput* out) noexcept { + if (out == nullptr) return; + if (out->voxel_coords) cudaFree(out->voxel_coords); + if (out->dual_vertices) cudaFree(out->dual_vertices); + if (out->intersected) cudaFree(out->intersected); + out->size = 0; + out->voxel_coords = nullptr; + out->dual_vertices = nullptr; + out->intersected = nullptr; +} + +} 
// namespace fdg_gpu diff --git a/o-voxel/src/convert/mesh_to_flexible_dual_grid_gpu/flexible_dual_grid_gpu.h b/o-voxel/src/convert/mesh_to_flexible_dual_grid_gpu/flexible_dual_grid_gpu.h new file mode 100644 index 00000000..04b6729c --- /dev/null +++ b/o-voxel/src/convert/mesh_to_flexible_dual_grid_gpu/flexible_dual_grid_gpu.h @@ -0,0 +1,35 @@ +#pragma once + +#include "fdg_gpu_common.h" + +#include +#include + +namespace fdg_gpu { + +struct FlexibleDualGridGPUOutput { + int64_t size = 0; + int32_t* voxel_coords = nullptr; // [size, 3] + float* dual_vertices = nullptr; // [size, 3] + bool* intersected = nullptr; // [size, 3] +}; + +cudaError_t mesh_to_flexible_dual_grid_gpu( + const float* vertices, + int64_t num_vertices, + const int32_t* faces, + int64_t num_faces, + float3 voxel_size, + int3_ grid_min, + int3_ grid_max, + float face_weight, + float boundary_weight, + float regularization_weight, + int64_t intersect_chunk_triangles, + int boundary_chunk_steps, + cudaStream_t stream, + FlexibleDualGridGPUOutput* out); + +void free_flexible_dual_grid_gpu_output(FlexibleDualGridGPUOutput* out) noexcept; + +} // namespace fdg_gpu diff --git a/o-voxel/src/convert/mesh_to_flexible_dual_grid_gpu/intersection_qef.cu b/o-voxel/src/convert/mesh_to_flexible_dual_grid_gpu/intersection_qef.cu new file mode 100644 index 00000000..692fd522 --- /dev/null +++ b/o-voxel/src/convert/mesh_to_flexible_dual_grid_gpu/intersection_qef.cu @@ -0,0 +1,823 @@ +#include "intersection_qef.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace intersection_qef { +namespace { + +using fdg_gpu::DeviceBuffer; +using fdg_gpu::SymQEF10; +using fdg_gpu::int3_; +using fdg_gpu::throw_cuda_error; + +#define IQ_CUDA_CHECK(expr) ::fdg_gpu::throw_cuda_error((expr), #expr) + +struct D2 { + double x; + double z; +}; + +struct D3 { + double x; + double y; + double z; + + __host__ 
__device__ double operator[](int i) const { return (&x)[i]; } +}; + +struct QEFEventValue { + float mean_sum_x; + float mean_sum_y; + float mean_sum_z; + float cnt; + uint8_t intersected; + SymQEF10 qef; +}; + +struct OccChunk { + DeviceBuffer keys; + int64_t size = 0; +}; + +struct QEFChunk { + DeviceBuffer keys; + DeviceBuffer values; + int64_t size = 0; +}; + +struct AddQEFEventValue { + __host__ __device__ QEFEventValue operator()(const QEFEventValue& a, const QEFEventValue& b) const { + QEFEventValue out; + out.mean_sum_x = a.mean_sum_x + b.mean_sum_x; + out.mean_sum_y = a.mean_sum_y + b.mean_sum_y; + out.mean_sum_z = a.mean_sum_z + b.mean_sum_z; + out.cnt = a.cnt + b.cnt; + out.intersected = static_cast(a.intersected | b.intersected); + out.qef.q00 = a.qef.q00 + b.qef.q00; + out.qef.q01 = a.qef.q01 + b.qef.q01; + out.qef.q02 = a.qef.q02 + b.qef.q02; + out.qef.q03 = a.qef.q03 + b.qef.q03; + out.qef.q11 = a.qef.q11 + b.qef.q11; + out.qef.q12 = a.qef.q12 + b.qef.q12; + out.qef.q13 = a.qef.q13 + b.qef.q13; + out.qef.q22 = a.qef.q22 + b.qef.q22; + out.qef.q23 = a.qef.q23 + b.qef.q23; + out.qef.q33 = a.qef.q33 + b.qef.q33; + return out; + } +}; + +__host__ __device__ inline double lerp_scalar(double a, double b, double t, double va, double vb) { + if (a == b) return va; + const double alpha = (t - a) / (b - a); + return (1.0 - alpha) * va + alpha * vb; +} + +__host__ __device__ inline D2 lerp_vec2(double a, double b, double t, D2 va, D2 vb) { + if (a == b) return va; + const double alpha = (t - a) / (b - a); + return D2{(1.0 - alpha) * va.x + alpha * vb.x, (1.0 - alpha) * va.z + alpha * vb.z}; +} + +__host__ __device__ inline int clamp_int(int x, int lo, int hi) { + return x < lo ? lo : (x > hi ? 
hi : x); +} + +__host__ __device__ inline void swap_d3(D3& a, D3& b) { + D3 t = a; + a = b; + b = t; +} + +__host__ __device__ inline void sort_by_y(D3& t0, D3& t1, D3& t2) { + if (t0.y > t1.y) swap_d3(t0, t1); + if (t1.y > t2.y) swap_d3(t1, t2); + if (t0.y > t1.y) swap_d3(t0, t1); +} + +__device__ inline void normalize3(double& x, double& y, double& z) { + const double n = sqrt(x * x + y * y + z * z); + if (n > 0.0) { + x /= n; + y /= n; + z /= n; + } +} + +__device__ inline SymQEF10 make_plane_qef_from_triangle(const double v0[3], const double v1[3], const double v2[3]) { + const double e0x = v1[0] - v0[0]; + const double e0y = v1[1] - v0[1]; + const double e0z = v1[2] - v0[2]; + + const double e1x = v2[0] - v1[0]; + const double e1y = v2[1] - v1[1]; + const double e1z = v2[2] - v1[2]; + + double nx = e0y * e1z - e0z * e1y; + double ny = e0z * e1x - e0x * e1z; + double nz = e0x * e1y - e0y * e1x; + normalize3(nx, ny, nz); + + const double d = -(nx * v0[0] + ny * v0[1] + nz * v0[2]); + + SymQEF10 q; + q.q00 = static_cast(nx * nx); + q.q01 = static_cast(nx * ny); + q.q02 = static_cast(nx * nz); + q.q03 = static_cast(nx * d); + q.q11 = static_cast(ny * ny); + q.q12 = static_cast(ny * nz); + q.q13 = static_cast(ny * d); + q.q22 = static_cast(nz * nz); + q.q23 = static_cast(nz * d); + q.q33 = static_cast(d * d); + return q; +} + +__device__ inline int64_t count_triangle_axis_surface_voxels( + const float* tri, + int ax2, + const float voxel_size[3], + int3_ grid_min, + int3_ grid_max) { + const double v0[3] = {static_cast(tri[0]), static_cast(tri[1]), static_cast(tri[2])}; + const double v1[3] = {static_cast(tri[3]), static_cast(tri[4]), static_cast(tri[5])}; + const double v2[3] = {static_cast(tri[6]), static_cast(tri[7]), static_cast(tri[8])}; + + const int ax0 = (ax2 + 1) % 3; + const int ax1 = (ax2 + 2) % 3; + + D3 t0{v0[ax0], v0[ax1], v0[ax2]}; + D3 t1{v1[ax0], v1[ax1], v1[ax2]}; + D3 t2{v2[ax0], v2[ax1], v2[ax2]}; + sort_by_y(t0, t1, t2); + + const int start = 
clamp_int(static_cast(t0.y / voxel_size[ax1]), grid_min[ax1], grid_max[ax1] - 1); + const int mid = clamp_int(static_cast(t1.y / voxel_size[ax1]), grid_min[ax1], grid_max[ax1] - 1); + const int end = clamp_int(static_cast(t2.y / voxel_size[ax1]), grid_min[ax1], grid_max[ax1] - 1); + + int64_t total = 0; + auto scan_half = [&](int row_start, int row_end, D3 a, D3 b, D3 c) { + for (int y_idx = row_start; y_idx < row_end; ++y_idx) { + const double y = (static_cast(y_idx) + 1.0) * voxel_size[ax1]; + D2 t3 = lerp_vec2(a.y, b.y, y, D2{a.x, a.z}, D2{b.x, b.z}); + D2 t4 = lerp_vec2(a.y, c.y, y, D2{a.x, a.z}, D2{c.x, c.z}); + if (t3.x > t4.x) { + D2 tmp = t3; + t3 = t4; + t4 = tmp; + } + + const int line_start = clamp_int(static_cast(t3.x / voxel_size[ax0]), grid_min[ax0], grid_max[ax0] - 1); + const int line_end = clamp_int(static_cast(t4.x / voxel_size[ax0]), grid_min[ax0], grid_max[ax0] - 1); + for (int x_idx = line_start; x_idx < line_end; ++x_idx) { + const double x = (static_cast(x_idx) + 1.0) * voxel_size[ax0]; + const double z = lerp_scalar(t3.x, t4.x, x, t3.z, t4.z); + const int z_idx = static_cast(z / voxel_size[ax2]); + if (z_idx < grid_min[ax2] || z_idx >= grid_max[ax2]) continue; + total += 4; + } + } + }; + + scan_half(start, mid, t0, t1, t2); + scan_half(mid, end, t2, t1, t0); + return total; +} + +__global__ void intersection_count_kernel( + const float* triangles, + int64_t tri_begin, + int64_t tri_count, + float vx, + float vy, + float vz, + int3_ grid_min, + int3_ grid_max, + int64_t* counts) { + const int64_t local_t = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + if (local_t >= tri_count) return; + + const int64_t t = tri_begin + local_t; + const float* tri = triangles + t * 9; + const float voxel_size[3] = {vx, vy, vz}; + + int64_t total = 0; + total += count_triangle_axis_surface_voxels(tri, 0, voxel_size, grid_min, grid_max); + total += count_triangle_axis_surface_voxels(tri, 1, voxel_size, grid_min, grid_max); + total += 
count_triangle_axis_surface_voxels(tri, 2, voxel_size, grid_min, grid_max); + counts[local_t] = total; +} + +__global__ void intersection_occ_emit_kernel( + const float* triangles, + int64_t tri_begin, + int64_t tri_count, + float vx, + float vy, + float vz, + int3_ grid_min, + int3_ grid_max, + const int64_t* offsets, + uint64_t* event_keys) { + const int64_t local_t = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + if (local_t >= tri_count) return; + + const int64_t t = tri_begin + local_t; + const float* tri = triangles + t * 9; + const double v0[3] = {static_cast(tri[0]), static_cast(tri[1]), static_cast(tri[2])}; + const double v1[3] = {static_cast(tri[3]), static_cast(tri[4]), static_cast(tri[5])}; + const double v2[3] = {static_cast(tri[6]), static_cast(tri[7]), static_cast(tri[8])}; + const float voxel_size[3] = {vx, vy, vz}; + + int64_t out = offsets[local_t]; + + for (int ax2 = 0; ax2 < 3; ++ax2) { + const int ax0 = (ax2 + 1) % 3; + const int ax1 = (ax2 + 2) % 3; + + D3 t0{v0[ax0], v0[ax1], v0[ax2]}; + D3 t1{v1[ax0], v1[ax1], v1[ax2]}; + D3 t2{v2[ax0], v2[ax1], v2[ax2]}; + sort_by_y(t0, t1, t2); + + const int start = clamp_int(static_cast(t0.y / voxel_size[ax1]), grid_min[ax1], grid_max[ax1] - 1); + const int mid = clamp_int(static_cast(t1.y / voxel_size[ax1]), grid_min[ax1], grid_max[ax1] - 1); + const int end = clamp_int(static_cast(t2.y / voxel_size[ax1]), grid_min[ax1], grid_max[ax1] - 1); + + auto emit_one = [&](int x_idx, int y_idx, int z_idx) { + int coord[3]; + coord[ax0] = x_idx; + coord[ax1] = y_idx; + coord[ax2] = z_idx; + event_keys[out++] = fdg_gpu::pack_voxel_key(coord[0], coord[1], coord[2], grid_min, grid_max); + }; + + auto scan_half = [&](int row_start, int row_end, D3 a, D3 b, D3 c) { + for (int y_idx = row_start; y_idx < row_end; ++y_idx) { + const double y = (static_cast(y_idx) + 1.0) * voxel_size[ax1]; + D2 t3 = lerp_vec2(a.y, b.y, y, D2{a.x, a.z}, D2{b.x, b.z}); + D2 t4 = lerp_vec2(a.y, c.y, y, D2{a.x, a.z}, D2{c.x, c.z}); + 
if (t3.x > t4.x) { + D2 tmp = t3; + t3 = t4; + t4 = tmp; + } + + const int line_start = clamp_int(static_cast(t3.x / voxel_size[ax0]), grid_min[ax0], grid_max[ax0] - 1); + const int line_end = clamp_int(static_cast(t4.x / voxel_size[ax0]), grid_min[ax0], grid_max[ax0] - 1); + + for (int x_idx = line_start; x_idx < line_end; ++x_idx) { + const double x = (static_cast(x_idx) + 1.0) * voxel_size[ax0]; + const double z = lerp_scalar(t3.x, t4.x, x, t3.z, t4.z); + const int z_idx = static_cast(z / voxel_size[ax2]); + if (z_idx < grid_min[ax2] || z_idx >= grid_max[ax2]) continue; + + emit_one(x_idx + 0, y_idx + 0, z_idx); + emit_one(x_idx + 1, y_idx + 0, z_idx); + emit_one(x_idx + 0, y_idx + 1, z_idx); + emit_one(x_idx + 1, y_idx + 1, z_idx); + } + } + }; + + scan_half(start, mid, t0, t1, t2); + scan_half(mid, end, t2, t1, t0); + } +} + +__global__ void intersect_qef_emit_kernel( + const float* triangles, + int64_t tri_begin, + int64_t tri_count, + float vx, + float vy, + float vz, + int3_ grid_min, + int3_ grid_max, + const int64_t* offsets, + uint64_t* event_keys, + QEFEventValue* event_values) { + const int64_t local_t = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + if (local_t >= tri_count) return; + + const int64_t t = tri_begin + local_t; + const float* tri = triangles + t * 9; + const double v0[3] = {static_cast(tri[0]), static_cast(tri[1]), static_cast(tri[2])}; + const double v1[3] = {static_cast(tri[3]), static_cast(tri[4]), static_cast(tri[5])}; + const double v2[3] = {static_cast(tri[6]), static_cast(tri[7]), static_cast(tri[8])}; + const float voxel_size[3] = {vx, vy, vz}; + const SymQEF10 qef = make_plane_qef_from_triangle(v0, v1, v2); + + int64_t out = offsets[local_t]; + + for (int ax2 = 0; ax2 < 3; ++ax2) { + const int ax0 = (ax2 + 1) % 3; + const int ax1 = (ax2 + 2) % 3; + + D3 t0{v0[ax0], v0[ax1], v0[ax2]}; + D3 t1{v1[ax0], v1[ax1], v1[ax2]}; + D3 t2{v2[ax0], v2[ax1], v2[ax2]}; + sort_by_y(t0, t1, t2); + + const int start = 
clamp_int(static_cast(t0.y / voxel_size[ax1]), grid_min[ax1], grid_max[ax1] - 1); + const int mid = clamp_int(static_cast(t1.y / voxel_size[ax1]), grid_min[ax1], grid_max[ax1] - 1); + const int end = clamp_int(static_cast(t2.y / voxel_size[ax1]), grid_min[ax1], grid_max[ax1] - 1); + + auto emit_one = [&](int x_idx, int y_idx, int z_idx, double x, double y, double z, uint8_t mask) { + int coord[3]; + coord[ax0] = x_idx; + coord[ax1] = y_idx; + coord[ax2] = z_idx; + event_keys[out] = fdg_gpu::pack_voxel_key(coord[0], coord[1], coord[2], grid_min, grid_max); + event_values[out].mean_sum_x = static_cast(ax0 == 0 ? x : (ax1 == 0 ? y : z)); + event_values[out].mean_sum_y = static_cast(ax0 == 1 ? x : (ax1 == 1 ? y : z)); + event_values[out].mean_sum_z = static_cast(ax0 == 2 ? x : (ax1 == 2 ? y : z)); + event_values[out].cnt = 1.0f; + event_values[out].intersected = mask; + event_values[out].qef = qef; + ++out; + }; + + auto scan_half = [&](int row_start, int row_end, D3 a, D3 b, D3 c) { + for (int y_idx = row_start; y_idx < row_end; ++y_idx) { + const double y = (static_cast(y_idx) + 1.0) * voxel_size[ax1]; + D2 t3 = lerp_vec2(a.y, b.y, y, D2{a.x, a.z}, D2{b.x, b.z}); + D2 t4 = lerp_vec2(a.y, c.y, y, D2{a.x, a.z}, D2{c.x, c.z}); + if (t3.x > t4.x) { + D2 tmp = t3; + t3 = t4; + t4 = tmp; + } + + const int line_start = clamp_int(static_cast(t3.x / voxel_size[ax0]), grid_min[ax0], grid_max[ax0] - 1); + const int line_end = clamp_int(static_cast(t4.x / voxel_size[ax0]), grid_min[ax0], grid_max[ax0] - 1); + + for (int x_idx = line_start; x_idx < line_end; ++x_idx) { + const double x = (static_cast(x_idx) + 1.0) * voxel_size[ax0]; + const double z = lerp_scalar(t3.x, t4.x, x, t3.z, t4.z); + const int z_idx = static_cast(z / voxel_size[ax2]); + if (z_idx < grid_min[ax2] || z_idx >= grid_max[ax2]) continue; + + emit_one(x_idx + 0, y_idx + 0, z_idx, x, y, z, static_cast(1u << ax2)); + emit_one(x_idx + 1, y_idx + 0, z_idx, x, y, z, static_cast(0u)); + emit_one(x_idx + 0, y_idx + 1, 
z_idx, x, y, z, static_cast(0u)); + emit_one(x_idx + 1, y_idx + 1, z_idx, x, y, z, static_cast(0u)); + } + } + }; + + scan_half(start, mid, t0, t1, t2); + scan_half(mid, end, t2, t1, t0); + } +} + +__global__ void decode_occ_output_kernel( + const uint64_t* keys, + int64_t size, + int3_ grid_min, + int3_ grid_max, + int* out_voxels) { + const int64_t i = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + if (i >= size) return; + int x, y, z; + const uint64_t key = keys[i]; + const uint64_t sx = static_cast(grid_max.x - grid_min.x); + const uint64_t sy = static_cast(grid_max.y - grid_min.y); + const uint64_t yz = sx * sy; + const uint64_t zz = key / yz; + const uint64_t rem = key - zz * yz; + const uint64_t yy = rem / sx; + const uint64_t xx = rem - yy * sx; + x = static_cast(xx) + grid_min.x; + y = static_cast(yy) + grid_min.y; + z = static_cast(zz) + grid_min.z; + out_voxels[3 * i + 0] = x; + out_voxels[3 * i + 1] = y; + out_voxels[3 * i + 2] = z; +} + +__global__ void decode_qef_output_kernel( + const uint64_t* keys, + const QEFEventValue* values, + int64_t size, + int3_ grid_min, + int3_ grid_max, + int* out_voxels, + float* out_mean_sum, + float* out_cnt, + uint8_t* out_intersected, + SymQEF10* out_qefs) { + const int64_t i = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + if (i >= size) return; + + const uint64_t key = keys[i]; + const uint64_t sx = static_cast(grid_max.x - grid_min.x); + const uint64_t sy = static_cast(grid_max.y - grid_min.y); + const uint64_t yz = sx * sy; + const uint64_t zz = key / yz; + const uint64_t rem = key - zz * yz; + const uint64_t yy = rem / sx; + const uint64_t xx = rem - yy * sx; + + out_voxels[3 * i + 0] = static_cast(xx) + grid_min.x; + out_voxels[3 * i + 1] = static_cast(yy) + grid_min.y; + out_voxels[3 * i + 2] = static_cast(zz) + grid_min.z; + + out_mean_sum[3 * i + 0] = values[i].mean_sum_x; + out_mean_sum[3 * i + 1] = values[i].mean_sum_y; + out_mean_sum[3 * i + 2] = values[i].mean_sum_z; + out_cnt[i] = 
values[i].cnt; + out_intersected[i] = values[i].intersected; + out_qefs[i] = values[i].qef; +} + +inline int64_t copy_last_i64(const int64_t* ptr, int64_t count, cudaStream_t stream) { + if (count <= 0) return 0; + int64_t value = 0; + IQ_CUDA_CHECK(cudaMemcpyAsync(&value, ptr + (count - 1), sizeof(int64_t), cudaMemcpyDeviceToHost, stream)); + IQ_CUDA_CHECK(cudaStreamSynchronize(stream)); + return value; +} + +OccChunk make_occ_chunk_exact(DeviceBuffer& src_keys, int64_t size, cudaStream_t stream) { + OccChunk out; + out.size = size; + if (size <= 0) return out; + out.keys.allocate(size); + thrust::copy_n( + thrust::cuda::par.on(stream), + thrust::device_pointer_cast(src_keys.data()), + size, + thrust::device_pointer_cast(out.keys.data())); + return out; +} + +QEFChunk make_qef_chunk_exact( + DeviceBuffer& src_keys, + DeviceBuffer& src_values, + int64_t size, + cudaStream_t stream) { + QEFChunk out; + out.size = size; + if (size <= 0) return out; + out.keys.allocate(size); + out.values.allocate(size); + thrust::copy_n( + thrust::cuda::par.on(stream), + thrust::device_pointer_cast(src_keys.data()), + size, + thrust::device_pointer_cast(out.keys.data())); + thrust::copy_n( + thrust::cuda::par.on(stream), + thrust::device_pointer_cast(src_values.data()), + size, + thrust::device_pointer_cast(out.values.data())); + return out; +} + +OccChunk merge_occ_two_chunks(OccChunk a, OccChunk b, cudaStream_t stream) { + if (a.size == 0) return std::move(b); + if (b.size == 0) return std::move(a); + + DeviceBuffer merged(a.size + b.size); + auto merged_end = thrust::merge( + thrust::cuda::par.on(stream), + thrust::device_pointer_cast(a.keys.data()), + thrust::device_pointer_cast(a.keys.data()) + a.size, + thrust::device_pointer_cast(b.keys.data()), + thrust::device_pointer_cast(b.keys.data()) + b.size, + thrust::device_pointer_cast(merged.data())); + const int64_t merged_size = merged_end - thrust::device_pointer_cast(merged.data()); + + auto unique_end = thrust::unique( + 
thrust::cuda::par.on(stream), + thrust::device_pointer_cast(merged.data()), + thrust::device_pointer_cast(merged.data()) + merged_size); + + OccChunk out; + out.size = unique_end - thrust::device_pointer_cast(merged.data()); + out.keys = std::move(merged); + return out; +} + +QEFChunk merge_qef_two_chunks(QEFChunk a, QEFChunk b, cudaStream_t stream) { + if (a.size == 0) return std::move(b); + if (b.size == 0) return std::move(a); + + DeviceBuffer merged_keys(a.size + b.size); + DeviceBuffer merged_values(a.size + b.size); + + auto merged_end = thrust::merge_by_key( + thrust::cuda::par.on(stream), + thrust::device_pointer_cast(a.keys.data()), + thrust::device_pointer_cast(a.keys.data()) + a.size, + thrust::device_pointer_cast(b.keys.data()), + thrust::device_pointer_cast(b.keys.data()) + b.size, + thrust::device_pointer_cast(a.values.data()), + thrust::device_pointer_cast(b.values.data()), + thrust::device_pointer_cast(merged_keys.data()), + thrust::device_pointer_cast(merged_values.data())); + const int64_t merged_size = merged_end.first - thrust::device_pointer_cast(merged_keys.data()); + + DeviceBuffer next_keys(merged_size); + DeviceBuffer next_values(merged_size); + auto next_end = thrust::reduce_by_key( + thrust::cuda::par.on(stream), + thrust::device_pointer_cast(merged_keys.data()), + thrust::device_pointer_cast(merged_keys.data()) + merged_size, + thrust::device_pointer_cast(merged_values.data()), + thrust::device_pointer_cast(next_keys.data()), + thrust::device_pointer_cast(next_values.data()), + thrust::equal_to(), + AddQEFEventValue()); + + QEFChunk out; + out.size = next_end.first - thrust::device_pointer_cast(next_keys.data()); + out.keys = std::move(next_keys); + out.values = std::move(next_values); + return out; +} + +OccChunk final_merge_occ_chunks(std::vector chunks, cudaStream_t stream) { + if (chunks.empty()) return OccChunk{}; + while (chunks.size() > 1) { + std::vector next_level; + next_level.reserve((chunks.size() + 1) / 2); + for (size_t i = 
0; i < chunks.size(); i += 2) { + if (i + 1 >= chunks.size()) { + next_level.push_back(std::move(chunks[i])); + } else { + next_level.push_back(merge_occ_two_chunks(std::move(chunks[i]), std::move(chunks[i + 1]), stream)); + } + } + chunks = std::move(next_level); + } + return std::move(chunks[0]); +} + +QEFChunk final_merge_qef_chunks(std::vector chunks, cudaStream_t stream) { + if (chunks.empty()) return QEFChunk{}; + while (chunks.size() > 1) { + std::vector next_level; + next_level.reserve((chunks.size() + 1) / 2); + for (size_t i = 0; i < chunks.size(); i += 2) { + if (i + 1 >= chunks.size()) { + next_level.push_back(std::move(chunks[i])); + } else { + next_level.push_back(merge_qef_two_chunks(std::move(chunks[i]), std::move(chunks[i + 1]), stream)); + } + } + chunks = std::move(next_level); + } + return std::move(chunks[0]); +} + +IntersectionOccResult run_occ_impl( + const float* triangles, + int64_t num_triangles, + float3 voxel_size, + int3_ grid_min, + int3_ grid_max, + int64_t chunk_triangles, + cudaStream_t stream) { + if (num_triangles < 0) throw std::invalid_argument("num_triangles must be non-negative"); + if (chunk_triangles <= 0) throw std::invalid_argument("chunk_triangles must be positive"); + + constexpr int threads = 256; + std::vector chunks; + chunks.reserve(static_cast((num_triangles + chunk_triangles - 1) / chunk_triangles)); + + for (int64_t tri_begin = 0; tri_begin < num_triangles; tri_begin += chunk_triangles) { + const int64_t tri_count = std::min(chunk_triangles, num_triangles - tri_begin); + if (tri_count == 0) continue; + + DeviceBuffer counts(tri_count); + const int blocks = fdg_gpu::ceil_div_i64(tri_count, threads); + intersection_count_kernel<<>>( + triangles, + tri_begin, + tri_count, + voxel_size.x, + voxel_size.y, + voxel_size.z, + grid_min, + grid_max, + counts.data()); + IQ_CUDA_CHECK(cudaGetLastError()); + + DeviceBuffer offsets(tri_count); + thrust::exclusive_scan( + thrust::cuda::par.on(stream), + 
thrust::device_pointer_cast(counts.data()), + thrust::device_pointer_cast(counts.data()) + tri_count, + thrust::device_pointer_cast(offsets.data())); + + const int64_t last_count = copy_last_i64(counts.data(), tri_count, stream); + const int64_t last_offset = copy_last_i64(offsets.data(), tri_count, stream); + const int64_t raw_size = last_offset + last_count; + if (raw_size == 0) continue; + + DeviceBuffer partial_keys(raw_size); + intersection_occ_emit_kernel<<>>( + triangles, + tri_begin, + tri_count, + voxel_size.x, + voxel_size.y, + voxel_size.z, + grid_min, + grid_max, + offsets.data(), + partial_keys.data()); + IQ_CUDA_CHECK(cudaGetLastError()); + + thrust::sort( + thrust::cuda::par.on(stream), + thrust::device_pointer_cast(partial_keys.data()), + thrust::device_pointer_cast(partial_keys.data()) + raw_size); + + auto partial_end = thrust::unique( + thrust::cuda::par.on(stream), + thrust::device_pointer_cast(partial_keys.data()), + thrust::device_pointer_cast(partial_keys.data()) + raw_size); + const int64_t partial_size = partial_end - thrust::device_pointer_cast(partial_keys.data()); + if (partial_size == 0) continue; + + chunks.push_back(make_occ_chunk_exact(partial_keys, partial_size, stream)); + } + + OccChunk final_chunk = final_merge_occ_chunks(std::move(chunks), stream); + + IntersectionOccResult out; + out.size = final_chunk.size; + out.voxels.allocate(out.size * 3); + if (out.size == 0) return out; + + const int blocks = fdg_gpu::ceil_div_i64(out.size, threads); + decode_occ_output_kernel<<>>( + final_chunk.keys.data(), out.size, grid_min, grid_max, out.voxels.data()); + IQ_CUDA_CHECK(cudaGetLastError()); + return out; +} + +IntersectQEFResult run_qef_impl( + const float* triangles, + int64_t num_triangles, + float3 voxel_size, + int3_ grid_min, + int3_ grid_max, + int64_t chunk_triangles, + cudaStream_t stream) { + if (num_triangles < 0) throw std::invalid_argument("num_triangles must be non-negative"); + if (chunk_triangles <= 0) throw 
std::invalid_argument("chunk_triangles must be positive"); + + constexpr int threads = 256; + std::vector chunks; + chunks.reserve(static_cast((num_triangles + chunk_triangles - 1) / chunk_triangles)); + + for (int64_t tri_begin = 0; tri_begin < num_triangles; tri_begin += chunk_triangles) { + const int64_t tri_count = std::min(chunk_triangles, num_triangles - tri_begin); + if (tri_count == 0) continue; + + DeviceBuffer counts(tri_count); + const int blocks = fdg_gpu::ceil_div_i64(tri_count, threads); + intersection_count_kernel<<>>( + triangles, + tri_begin, + tri_count, + voxel_size.x, + voxel_size.y, + voxel_size.z, + grid_min, + grid_max, + counts.data()); + IQ_CUDA_CHECK(cudaGetLastError()); + + DeviceBuffer offsets(tri_count); + thrust::exclusive_scan( + thrust::cuda::par.on(stream), + thrust::device_pointer_cast(counts.data()), + thrust::device_pointer_cast(counts.data()) + tri_count, + thrust::device_pointer_cast(offsets.data())); + + const int64_t last_count = copy_last_i64(counts.data(), tri_count, stream); + const int64_t last_offset = copy_last_i64(offsets.data(), tri_count, stream); + const int64_t raw_size = last_offset + last_count; + if (raw_size == 0) continue; + + DeviceBuffer partial_keys(raw_size); + DeviceBuffer partial_values(raw_size); + intersect_qef_emit_kernel<<>>( + triangles, + tri_begin, + tri_count, + voxel_size.x, + voxel_size.y, + voxel_size.z, + grid_min, + grid_max, + offsets.data(), + partial_keys.data(), + partial_values.data()); + IQ_CUDA_CHECK(cudaGetLastError()); + + thrust::sort_by_key( + thrust::cuda::par.on(stream), + thrust::device_pointer_cast(partial_keys.data()), + thrust::device_pointer_cast(partial_keys.data()) + raw_size, + thrust::device_pointer_cast(partial_values.data())); + + DeviceBuffer reduced_keys(raw_size); + DeviceBuffer reduced_values(raw_size); + auto reduce_end = thrust::reduce_by_key( + thrust::cuda::par.on(stream), + thrust::device_pointer_cast(partial_keys.data()), + 
thrust::device_pointer_cast(partial_keys.data()) + raw_size, + thrust::device_pointer_cast(partial_values.data()), + thrust::device_pointer_cast(reduced_keys.data()), + thrust::device_pointer_cast(reduced_values.data()), + thrust::equal_to(), + AddQEFEventValue()); + const int64_t reduced_size = reduce_end.first - thrust::device_pointer_cast(reduced_keys.data()); + if (reduced_size == 0) continue; + + chunks.push_back(make_qef_chunk_exact(reduced_keys, reduced_values, reduced_size, stream)); + } + + QEFChunk final_chunk = final_merge_qef_chunks(std::move(chunks), stream); + + IntersectQEFResult out; + out.size = final_chunk.size; + out.voxels.allocate(out.size * 3); + out.mean_sum.allocate(out.size * 3); + out.cnt.allocate(out.size); + out.intersected.allocate(out.size); + out.qefs.allocate(out.size); + if (out.size == 0) return out; + + const int blocks = fdg_gpu::ceil_div_i64(out.size, threads); + decode_qef_output_kernel<<>>( + final_chunk.keys.data(), + final_chunk.values.data(), + out.size, + grid_min, + grid_max, + out.voxels.data(), + out.mean_sum.data(), + out.cnt.data(), + out.intersected.data(), + out.qefs.data()); + IQ_CUDA_CHECK(cudaGetLastError()); + return out; +} + +} // namespace + +IntersectionOccResult intersection_occ_gpu( + const float* triangles, + int64_t num_triangles, + float3 voxel_size, + int3_ grid_min, + int3_ grid_max, + int64_t chunk_triangles, + cudaStream_t stream) { + if (triangles == nullptr && num_triangles > 0) throw std::invalid_argument("triangles is null"); + if (!(voxel_size.x > 0.0f && voxel_size.y > 0.0f && voxel_size.z > 0.0f)) { + throw std::invalid_argument("voxel_size must be positive"); + } + return run_occ_impl(triangles, num_triangles, voxel_size, grid_min, grid_max, chunk_triangles, stream); +} + +IntersectQEFResult intersect_qef_gpu( + const float* triangles, + int64_t num_triangles, + float3 voxel_size, + int3_ grid_min, + int3_ grid_max, + int64_t chunk_triangles, + cudaStream_t stream) { + if (triangles == 
nullptr && num_triangles > 0) throw std::invalid_argument("triangles is null"); + if (!(voxel_size.x > 0.0f && voxel_size.y > 0.0f && voxel_size.z > 0.0f)) { + throw std::invalid_argument("voxel_size must be positive"); + } + return run_qef_impl(triangles, num_triangles, voxel_size, grid_min, grid_max, chunk_triangles, stream); +} + +} // namespace intersection_qef diff --git a/o-voxel/src/convert/mesh_to_flexible_dual_grid_gpu/intersection_qef.h b/o-voxel/src/convert/mesh_to_flexible_dual_grid_gpu/intersection_qef.h new file mode 100644 index 00000000..c5bebcc5 --- /dev/null +++ b/o-voxel/src/convert/mesh_to_flexible_dual_grid_gpu/intersection_qef.h @@ -0,0 +1,39 @@ +#pragma once + +#include "fdg_gpu_common.h" + +namespace intersection_qef { + +struct IntersectionOccResult { + int64_t size = 0; + fdg_gpu::DeviceBuffer voxels; // [size,3] flattened +}; + +struct IntersectQEFResult { + int64_t size = 0; + fdg_gpu::DeviceBuffer voxels; // [size,3] flattened + fdg_gpu::DeviceBuffer mean_sum; // [size,3] flattened + fdg_gpu::DeviceBuffer cnt; // [size] + fdg_gpu::DeviceBuffer intersected; // [size], bitmask for bool3 + fdg_gpu::DeviceBuffer qefs;// [size] +}; + +IntersectionOccResult intersection_occ_gpu( + const float* triangles, // [num_triangles, 3, 3] flattened + int64_t num_triangles, + float3 voxel_size, + fdg_gpu::int3_ grid_min, + fdg_gpu::int3_ grid_max, + int64_t chunk_triangles = 4096, + cudaStream_t stream = nullptr); + +IntersectQEFResult intersect_qef_gpu( + const float* triangles, // [num_triangles, 3, 3] flattened + int64_t num_triangles, + float3 voxel_size, + fdg_gpu::int3_ grid_min, + fdg_gpu::int3_ grid_max, + int64_t chunk_triangles = 4096, + cudaStream_t stream = nullptr); + +} // namespace intersection_qef diff --git a/o-voxel/src/convert/mesh_to_flexible_dual_grid_gpu/torch_bindings.cu b/o-voxel/src/convert/mesh_to_flexible_dual_grid_gpu/torch_bindings.cu new file mode 100644 index 00000000..2c35c4ed --- /dev/null +++ 
b/o-voxel/src/convert/mesh_to_flexible_dual_grid_gpu/torch_bindings.cu @@ -0,0 +1,936 @@ +#include +#include + +#include + +#include +#include +#include +#include +#include + +#include "../api.h" +#include "flexible_dual_grid_gpu.h" +#include "intersection_qef.h" +#include "voxelize_mesh_oct.h" +#include "voxel_traverse_edge_dda.h" + +struct bool3 { bool x, y, z; bool& operator[](int i) { return (&x)[i]; } }; + +struct VoxelCoord { + int x, y, z; + + int& operator[](int i) { return (&x)[i]; } + + bool operator==(const VoxelCoord& other) const { + return x == other.x && y == other.y && z == other.z; + } +}; + +namespace std { +template <> +struct hash { + size_t operator()(const VoxelCoord& v) const { + const std::size_t p1 = 73856093; + const std::size_t p2 = 19349663; + const std::size_t p3 = 83492791; + return static_cast(v.x) * p1 ^ + static_cast(v.y) * p2 ^ + static_cast(v.z) * p3; + } +}; +} // namespace std + +void intersect_qef( + const Eigen::Vector3f& voxel_size, + const Eigen::Vector3i& grid_min, + const Eigen::Vector3i& grid_max, + const std::vector& triangles, + std::unordered_map& hash_table, + std::vector& voxels, + std::vector& means, + std::vector& cnt, + std::vector& intersected, + std::vector& qefs +); + +void face_qef( + const Eigen::Vector3f& voxel_size, + const Eigen::Vector3i& grid_min, + const Eigen::Vector3i& grid_max, + const std::vector& triangles, + std::unordered_map& hash_table, + std::vector& qefs +); + +void boundry_qef( + const Eigen::Vector3f& voxel_size, + const Eigen::Vector3i& grid_min, + const Eigen::Vector3i& grid_max, + const std::vector& boundries, + float boundary_weight, + std::unordered_map& hash_table, + std::vector& qefs +); + +namespace { + +inline void check_cuda_success(cudaError_t err, const char* context) { + TORCH_CHECK(err == cudaSuccess, context, ": ", cudaGetErrorString(err)); +} + +inline float3 tensor_to_float3_cpu(const torch::Tensor& t) { + auto tc = t.to(torch::kFloat32).contiguous().cpu(); + 
TORCH_CHECK(tc.dim() == 1 && tc.size(0) == 3, "voxel_size must have shape [3]"); + const float* p = tc.data_ptr(); + return float3{p[0], p[1], p[2]}; +} + +inline void tensor_to_grid_min_max_cpu( + const torch::Tensor& t, + fdg_gpu::int3_& grid_min, + fdg_gpu::int3_& grid_max +) { + auto tc = t.to(torch::kInt32).contiguous().cpu(); + TORCH_CHECK(tc.dim() == 2 && tc.size(0) == 2 && tc.size(1) == 3, "grid_range must have shape [2, 3]"); + const int32_t* p = tc.data_ptr(); + grid_min = fdg_gpu::int3_{p[0], p[1], p[2]}; + grid_max = fdg_gpu::int3_{p[3], p[4], p[5]}; +} + +inline fdg_gpu::int3_ grid_size_from_min_max( + const fdg_gpu::int3_& grid_min, + const fdg_gpu::int3_& grid_max +) { + return fdg_gpu::int3_{ + grid_max.x - grid_min.x, + grid_max.y - grid_min.y, + grid_max.z - grid_min.z, + }; +} + +__global__ void unpack_intersected_mask_kernel( + const uint8_t* mask, + int64_t n, + bool* out_bool3 +) { + const int64_t i = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + if (i >= n) return; + const uint8_t m = mask[i]; + out_bool3[3 * i + 0] = (m & (1u << 0)) != 0; + out_bool3[3 * i + 1] = (m & (1u << 1)) != 0; + out_bool3[3 * i + 2] = (m & (1u << 2)) != 0; +} + +inline void check_triangles_tensor(const torch::Tensor& triangles_c) { + TORCH_CHECK( + triangles_c.dim() == 3 && triangles_c.size(1) == 3 && triangles_c.size(2) == 3, + "triangles must have shape [T, 3, 3]" + ); +} + +inline void check_voxels_tensor(const torch::Tensor& voxels_c) { + TORCH_CHECK( + voxels_c.dim() == 2 && voxels_c.size(1) == 3, + "voxels must have shape [N, 3]" + ); +} + +inline void check_edges_tensor(const torch::Tensor& edges_c) { + TORCH_CHECK(edges_c.dim() == 2 && edges_c.size(1) == 2, "edges must have shape [E, 2]"); +} + +inline void check_boundaries_tensor(const torch::Tensor& boundaries_c) { + TORCH_CHECK( + boundaries_c.dim() == 3 && boundaries_c.size(1) == 2 && boundaries_c.size(2) == 3, + "boundaries must have shape [B, 2, 3]" + ); +} + +inline void check_cpu_tensor(const 
torch::Tensor& t, const char* name) { + TORCH_CHECK(!t.is_cuda(), name, " must be a CPU tensor"); +} + +inline Eigen::Vector3f tensor_to_eigen_vec3_cpu(const torch::Tensor& t) { + auto tc = t.to(torch::kFloat32).contiguous().cpu(); + TORCH_CHECK(tc.dim() == 1 && tc.size(0) == 3, "voxel_size must have shape [3]"); + const float* p = tc.data_ptr(); + return Eigen::Vector3f(p[0], p[1], p[2]); +} + +inline void tensor_to_eigen_grid_min_max_cpu( + const torch::Tensor& t, + Eigen::Vector3i& grid_min, + Eigen::Vector3i& grid_max +) { + auto tc = t.to(torch::kInt32).contiguous().cpu(); + TORCH_CHECK(tc.dim() == 2 && tc.size(0) == 2 && tc.size(1) == 3, "grid_range must have shape [2, 3]"); + const int32_t* p = tc.data_ptr(); + grid_min = Eigen::Vector3i(p[0], p[1], p[2]); + grid_max = Eigen::Vector3i(p[3], p[4], p[5]); +} + +inline std::vector triangles_tensor_to_vector_cpu(const torch::Tensor& triangles) { + auto triangles_c = triangles.to(torch::kFloat32).contiguous().cpu(); + check_triangles_tensor(triangles_c); + const float* p = triangles_c.data_ptr(); + const int64_t n = triangles_c.size(0); + std::vector out; + out.reserve(static_cast(n) * 3); + for (int64_t i = 0; i < n; ++i) { + for (int v = 0; v < 3; ++v) { + const int64_t base = (i * 3 + v) * 3; + out.emplace_back(p[base + 0], p[base + 1], p[base + 2]); + } + } + return out; +} + +inline std::vector boundaries_tensor_to_vector_cpu(const torch::Tensor& boundaries) { + auto boundaries_c = boundaries.to(torch::kFloat32).contiguous().cpu(); + check_boundaries_tensor(boundaries_c); + const float* p = boundaries_c.data_ptr(); + const int64_t n = boundaries_c.size(0); + std::vector out; + out.reserve(static_cast(n) * 2); + for (int64_t i = 0; i < n; ++i) { + for (int v = 0; v < 2; ++v) { + const int64_t base = (i * 2 + v) * 3; + out.emplace_back(p[base + 0], p[base + 1], p[base + 2]); + } + } + return out; +} + +inline std::vector voxels_tensor_to_vector_cpu( + const torch::Tensor& voxels, + std::unordered_map& 
hash_table +) { + auto voxels_c = voxels.to(torch::kInt32).contiguous().cpu(); + check_voxels_tensor(voxels_c); + const int32_t* p = voxels_c.data_ptr(); + const int64_t n = voxels_c.size(0); + std::vector out; + out.reserve(static_cast(n)); + hash_table.reserve(static_cast(n)); + for (int64_t i = 0; i < n; ++i) { + const VoxelCoord coord{p[3 * i + 0], p[3 * i + 1], p[3 * i + 2]}; + hash_table[coord] = static_cast(i); + out.push_back(int3{coord.x, coord.y, coord.z}); + } + return out; +} + +inline torch::Tensor int3_vector_to_tensor_cpu(const std::vector& values) { + auto out = torch::empty({static_cast(values.size()), 3}, torch::TensorOptions().dtype(torch::kInt32).device(torch::kCPU)); + int32_t* p = out.data_ptr(); + for (size_t i = 0; i < values.size(); ++i) { + p[3 * i + 0] = values[i].x; + p[3 * i + 1] = values[i].y; + p[3 * i + 2] = values[i].z; + } + return out; +} + +inline torch::Tensor vec3f_vector_to_tensor_cpu(const std::vector& values) { + auto out = torch::empty({static_cast(values.size()), 3}, torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCPU)); + float* p = out.data_ptr(); + for (size_t i = 0; i < values.size(); ++i) { + p[3 * i + 0] = values[i].x(); + p[3 * i + 1] = values[i].y(); + p[3 * i + 2] = values[i].z(); + } + return out; +} + +inline torch::Tensor float_vector_to_tensor_cpu(const std::vector& values) { + auto out = torch::empty({static_cast(values.size())}, torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCPU)); + float* p = out.data_ptr(); + for (size_t i = 0; i < values.size(); ++i) { + p[i] = values[i]; + } + return out; +} + +inline torch::Tensor bool3_vector_to_tensor_cpu(const std::vector& values) { + auto out = torch::empty({static_cast(values.size()), 3}, torch::TensorOptions().dtype(torch::kBool).device(torch::kCPU)); + bool* p = out.data_ptr(); + for (size_t i = 0; i < values.size(); ++i) { + p[3 * i + 0] = values[i].x; + p[3 * i + 1] = values[i].y; + p[3 * i + 2] = values[i].z; + } + return out; +} + 
+inline torch::Tensor matrix4f_vector_to_tensor_cpu(const std::vector& values) { + auto out = torch::empty({static_cast(values.size()), 4, 4}, torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCPU)); + float* p = out.data_ptr(); + for (size_t i = 0; i < values.size(); ++i) { + for (int r = 0; r < 4; ++r) { + for (int c = 0; c < 4; ++c) { + p[i * 16 + r * 4 + c] = values[i](r, c); + } + } + } + return out; +} + +inline std::tuple primitive_pair_to_tensors( + const fdg_gpu::PrimitivePairResult& pairs, + const torch::Device& device, + cudaStream_t stream +) { + auto opts_i32 = torch::TensorOptions().dtype(torch::kInt32).device(device); + torch::Tensor prim_id = torch::empty({pairs.size}, opts_i32); + torch::Tensor voxels_axis_major = torch::empty({3, pairs.size}, opts_i32); + + if (pairs.size > 0) { + check_cuda_success( + cudaMemcpyAsync( + prim_id.data_ptr(), + pairs.prim_id.data(), + static_cast(pairs.size) * sizeof(int32_t), + cudaMemcpyDeviceToDevice, + stream + ), + "cudaMemcpyAsync primitive prim_id" + ); + + check_cuda_success( + cudaMemcpyAsync( + voxels_axis_major.data_ptr() + pairs.size * 0, + pairs.voxel_i.data(), + static_cast(pairs.size) * sizeof(int32_t), + cudaMemcpyDeviceToDevice, + stream + ), + "cudaMemcpyAsync primitive voxel_i" + ); + check_cuda_success( + cudaMemcpyAsync( + voxels_axis_major.data_ptr() + pairs.size * 1, + pairs.voxel_j.data(), + static_cast(pairs.size) * sizeof(int32_t), + cudaMemcpyDeviceToDevice, + stream + ), + "cudaMemcpyAsync primitive voxel_j" + ); + check_cuda_success( + cudaMemcpyAsync( + voxels_axis_major.data_ptr() + pairs.size * 2, + pairs.voxel_k.data(), + static_cast(pairs.size) * sizeof(int32_t), + cudaMemcpyDeviceToDevice, + stream + ), + "cudaMemcpyAsync primitive voxel_k" + ); + + check_cuda_success(cudaStreamSynchronize(stream), "cudaStreamSynchronize primitive_pair_to_tensors"); + } + + torch::Tensor voxels = voxels_axis_major.transpose(0, 1).contiguous(); + return std::make_tuple(prim_id, voxels); 
+} + +} // namespace + + +std::tuple mesh_to_flexible_dual_grid_gpu( + const torch::Tensor& vertices, + const torch::Tensor& faces, + const torch::Tensor& voxel_size, + const torch::Tensor& grid_range, + float face_weight, + float boundary_weight, + float regularization_weight, + int64_t intersect_chunk_triangles, + int boundary_chunk_steps +) { + TORCH_CHECK(vertices.is_cuda(), "vertices must be a CUDA tensor"); + TORCH_CHECK(faces.is_cuda(), "faces must be a CUDA tensor"); + TORCH_CHECK(vertices.device() == faces.device(), "vertices and faces must be on the same CUDA device"); + + auto vertices_c = vertices.to(torch::kFloat32).contiguous(); + auto faces_c = faces.to(torch::kInt32).contiguous(); + + TORCH_CHECK(vertices_c.dim() == 2 && vertices_c.size(1) == 3, "vertices must have shape [V, 3]"); + TORCH_CHECK(faces_c.dim() == 2 && faces_c.size(1) == 3, "faces must have shape [F, 3]"); + + float3 voxel_size_h = tensor_to_float3_cpu(voxel_size); + fdg_gpu::int3_ grid_min{}; + fdg_gpu::int3_ grid_max{}; + tensor_to_grid_min_max_cpu(grid_range, grid_min, grid_max); + + fdg_gpu::FlexibleDualGridGPUOutput out{}; + cudaStream_t stream = nullptr; + + cudaError_t status = fdg_gpu::mesh_to_flexible_dual_grid_gpu( + vertices_c.data_ptr(), + vertices_c.size(0), + faces_c.data_ptr(), + faces_c.size(0), + voxel_size_h, + grid_min, + grid_max, + face_weight, + boundary_weight, + regularization_weight, + intersect_chunk_triangles, + boundary_chunk_steps, + stream, + &out + ); + + if (status != cudaSuccess) { + fdg_gpu::free_flexible_dual_grid_gpu_output(&out); + TORCH_CHECK(false, "mesh_to_flexible_dual_grid_gpu failed: ", cudaGetErrorString(status)); + } + + auto opts_i32 = torch::TensorOptions().dtype(torch::kInt32).device(vertices_c.device()); + auto opts_f32 = torch::TensorOptions().dtype(torch::kFloat32).device(vertices_c.device()); + auto opts_b = torch::TensorOptions().dtype(torch::kBool).device(vertices_c.device()); + + torch::Tensor voxel_coords = torch::empty({out.size, 
3}, opts_i32); + torch::Tensor dual_vertices = torch::empty({out.size, 3}, opts_f32); + torch::Tensor intersected = torch::empty({out.size, 3}, opts_b); + + if (out.size > 0) { + check_cuda_success( + cudaMemcpyAsync( + voxel_coords.data_ptr(), + out.voxel_coords, + static_cast(out.size) * 3 * sizeof(int32_t), + cudaMemcpyDeviceToDevice, + stream + ), + "cudaMemcpyAsync voxel_coords" + ); + + check_cuda_success( + cudaMemcpyAsync( + dual_vertices.data_ptr(), + out.dual_vertices, + static_cast(out.size) * 3 * sizeof(float), + cudaMemcpyDeviceToDevice, + stream + ), + "cudaMemcpyAsync dual_vertices" + ); + + check_cuda_success( + cudaMemcpyAsync( + intersected.data_ptr(), + out.intersected, + static_cast(out.size) * 3 * sizeof(bool), + cudaMemcpyDeviceToDevice, + stream + ), + "cudaMemcpyAsync intersected" + ); + + check_cuda_success(cudaStreamSynchronize(stream), "cudaStreamSynchronize"); + } + + fdg_gpu::free_flexible_dual_grid_gpu_output(&out); + return std::make_tuple(voxel_coords, dual_vertices, intersected); +} + + +std::tuple intersect_qef_cpu( + const torch::Tensor& triangles, + const torch::Tensor& voxel_size, + const torch::Tensor& grid_range +) { + check_cpu_tensor(triangles, "triangles"); + auto triangles_c = triangles.to(torch::kFloat32).contiguous(); + check_triangles_tensor(triangles_c); + + Eigen::Vector3f voxel_size_h = tensor_to_eigen_vec3_cpu(voxel_size); + Eigen::Vector3i grid_min, grid_max; + tensor_to_eigen_grid_min_max_cpu(grid_range, grid_min, grid_max); + + std::vector triangles_vec = triangles_tensor_to_vector_cpu(triangles_c); + std::unordered_map hash_table; + std::vector voxels_vec; + std::vector mean_sum; + std::vector cnt; + std::vector intersected_vec; + std::vector qefs; + + intersect_qef( + voxel_size_h, + grid_min, + grid_max, + triangles_vec, + hash_table, + voxels_vec, + mean_sum, + cnt, + intersected_vec, + qefs + ); + + return std::make_tuple( + int3_vector_to_tensor_cpu(voxels_vec), + vec3f_vector_to_tensor_cpu(mean_sum), + 
float_vector_to_tensor_cpu(cnt), + bool3_vector_to_tensor_cpu(intersected_vec), + matrix4f_vector_to_tensor_cpu(qefs) + ); +} + + +torch::Tensor face_qef_cpu( + const torch::Tensor& triangles, + const torch::Tensor& voxel_size, + const torch::Tensor& grid_range, + const torch::Tensor& voxels +) { + check_cpu_tensor(triangles, "triangles"); + check_cpu_tensor(voxels, "voxels"); + auto triangles_c = triangles.to(torch::kFloat32).contiguous(); + auto voxels_c = voxels.to(torch::kInt32).contiguous(); + check_triangles_tensor(triangles_c); + check_voxels_tensor(voxels_c); + + Eigen::Vector3f voxel_size_h = tensor_to_eigen_vec3_cpu(voxel_size); + Eigen::Vector3i grid_min, grid_max; + tensor_to_eigen_grid_min_max_cpu(grid_range, grid_min, grid_max); + + std::vector triangles_vec = triangles_tensor_to_vector_cpu(triangles_c); + std::unordered_map hash_table; + std::vector voxels_vec = voxels_tensor_to_vector_cpu(voxels_c, hash_table); + std::vector qefs(voxels_vec.size(), Eigen::Matrix4f::Zero()); + + face_qef( + voxel_size_h, + grid_min, + grid_max, + triangles_vec, + hash_table, + qefs + ); + + return matrix4f_vector_to_tensor_cpu(qefs); +} + + +torch::Tensor boundary_qef_cpu( + const torch::Tensor& boundaries, + const torch::Tensor& voxel_size, + const torch::Tensor& grid_range, + float boundary_weight, + const torch::Tensor& voxels +) { + check_cpu_tensor(boundaries, "boundaries"); + check_cpu_tensor(voxels, "voxels"); + auto boundaries_c = boundaries.to(torch::kFloat32).contiguous(); + auto voxels_c = voxels.to(torch::kInt32).contiguous(); + check_boundaries_tensor(boundaries_c); + check_voxels_tensor(voxels_c); + + Eigen::Vector3f voxel_size_h = tensor_to_eigen_vec3_cpu(voxel_size); + Eigen::Vector3i grid_min, grid_max; + tensor_to_eigen_grid_min_max_cpu(grid_range, grid_min, grid_max); + + std::vector boundaries_vec = boundaries_tensor_to_vector_cpu(boundaries_c); + std::unordered_map hash_table; + std::vector voxels_vec = voxels_tensor_to_vector_cpu(voxels_c, 
hash_table); + std::vector qefs(voxels_vec.size(), Eigen::Matrix4f::Zero()); + + boundry_qef( + voxel_size_h, + grid_min, + grid_max, + boundaries_vec, + boundary_weight, + hash_table, + qefs + ); + + return matrix4f_vector_to_tensor_cpu(qefs); +} + + +torch::Tensor intersection_occ_gpu( + const torch::Tensor& triangles, + const torch::Tensor& voxel_size, + const torch::Tensor& grid_range, + int64_t chunk_triangles +) { + TORCH_CHECK(triangles.is_cuda(), "triangles must be a CUDA tensor"); + TORCH_CHECK(chunk_triangles > 0, "chunk_triangles must be > 0"); + + auto triangles_c = triangles.to(torch::kFloat32).contiguous(); + check_triangles_tensor(triangles_c); + + float3 voxel_size_h = tensor_to_float3_cpu(voxel_size); + fdg_gpu::int3_ grid_min{}; + fdg_gpu::int3_ grid_max{}; + tensor_to_grid_min_max_cpu(grid_range, grid_min, grid_max); + + cudaStream_t stream = nullptr; + auto out = intersection_qef::intersection_occ_gpu( + triangles_c.data_ptr(), + triangles_c.size(0), + voxel_size_h, + grid_min, + grid_max, + chunk_triangles, + stream + ); + + auto opts_i32 = torch::TensorOptions().dtype(torch::kInt32).device(triangles_c.device()); + torch::Tensor voxels = torch::empty({out.size, 3}, opts_i32); + if (out.size > 0) { + check_cuda_success( + cudaMemcpyAsync( + voxels.data_ptr(), + out.voxels.data(), + static_cast(out.size) * 3 * sizeof(int32_t), + cudaMemcpyDeviceToDevice, + stream + ), + "cudaMemcpyAsync intersection_occ voxels" + ); + check_cuda_success(cudaStreamSynchronize(stream), "cudaStreamSynchronize intersection_occ"); + } + return voxels; +} + + +std::tuple intersect_qef_gpu( + const torch::Tensor& triangles, + const torch::Tensor& voxel_size, + const torch::Tensor& grid_range, + int64_t chunk_triangles +) { + TORCH_CHECK(triangles.is_cuda(), "triangles must be a CUDA tensor"); + TORCH_CHECK(chunk_triangles > 0, "chunk_triangles must be > 0"); + + auto triangles_c = triangles.to(torch::kFloat32).contiguous(); + check_triangles_tensor(triangles_c); + + 
float3 voxel_size_h = tensor_to_float3_cpu(voxel_size); + fdg_gpu::int3_ grid_min{}; + fdg_gpu::int3_ grid_max{}; + tensor_to_grid_min_max_cpu(grid_range, grid_min, grid_max); + + cudaStream_t stream = nullptr; + auto out = intersection_qef::intersect_qef_gpu( + triangles_c.data_ptr(), + triangles_c.size(0), + voxel_size_h, + grid_min, + grid_max, + chunk_triangles, + stream + ); + + auto opts_i32 = torch::TensorOptions().dtype(torch::kInt32).device(triangles_c.device()); + auto opts_f32 = torch::TensorOptions().dtype(torch::kFloat32).device(triangles_c.device()); + auto opts_u8 = torch::TensorOptions().dtype(torch::kUInt8).device(triangles_c.device()); + auto opts_b = torch::TensorOptions().dtype(torch::kBool).device(triangles_c.device()); + + static_assert(sizeof(fdg_gpu::SymQEF10) == sizeof(float) * 10, "Unexpected SymQEF10 layout"); + + torch::Tensor voxels = torch::empty({out.size, 3}, opts_i32); + torch::Tensor mean_sum = torch::empty({out.size, 3}, opts_f32); + torch::Tensor cnt = torch::empty({out.size}, opts_f32); + torch::Tensor intersected_mask = torch::empty({out.size}, opts_u8); + torch::Tensor qefs = torch::empty({out.size, 10}, opts_f32); + + if (out.size > 0) { + check_cuda_success( + cudaMemcpyAsync( + voxels.data_ptr(), + out.voxels.data(), + static_cast(out.size) * 3 * sizeof(int32_t), + cudaMemcpyDeviceToDevice, + stream + ), + "cudaMemcpyAsync intersect_qef voxels" + ); + check_cuda_success( + cudaMemcpyAsync( + mean_sum.data_ptr(), + out.mean_sum.data(), + static_cast(out.size) * 3 * sizeof(float), + cudaMemcpyDeviceToDevice, + stream + ), + "cudaMemcpyAsync intersect_qef mean_sum" + ); + check_cuda_success( + cudaMemcpyAsync( + cnt.data_ptr(), + out.cnt.data(), + static_cast(out.size) * sizeof(float), + cudaMemcpyDeviceToDevice, + stream + ), + "cudaMemcpyAsync intersect_qef cnt" + ); + check_cuda_success( + cudaMemcpyAsync( + intersected_mask.data_ptr(), + out.intersected.data(), + static_cast(out.size) * sizeof(uint8_t), + 
cudaMemcpyDeviceToDevice, + stream + ), + "cudaMemcpyAsync intersect_qef intersected" + ); + check_cuda_success( + cudaMemcpyAsync( + qefs.data_ptr(), + out.qefs.data(), + static_cast(out.size) * sizeof(fdg_gpu::SymQEF10), + cudaMemcpyDeviceToDevice, + stream + ), + "cudaMemcpyAsync intersect_qef qefs" + ); + } + + torch::Tensor intersected = torch::empty({out.size, 3}, opts_b); + if (out.size > 0) { + const int kBlock = 256; + const int grid = static_cast((out.size + kBlock - 1) / kBlock); + unpack_intersected_mask_kernel<<>>( + intersected_mask.data_ptr(), + out.size, + intersected.data_ptr() + ); + check_cuda_success(cudaGetLastError(), "unpack_intersected_mask_kernel"); + check_cuda_success(cudaStreamSynchronize(stream), "cudaStreamSynchronize intersect_qef"); + } + + return std::make_tuple(voxels, mean_sum, cnt, intersected, qefs); +} + + +std::tuple voxelize_mesh_oct_gpu( + const torch::Tensor& vertices, + const torch::Tensor& faces, + const torch::Tensor& voxel_size, + const torch::Tensor& grid_range +) { + TORCH_CHECK(vertices.is_cuda(), "vertices must be a CUDA tensor"); + TORCH_CHECK(faces.is_cuda(), "faces must be a CUDA tensor"); + TORCH_CHECK(vertices.device() == faces.device(), "vertices and faces must be on the same CUDA device"); + + auto vertices_c = vertices.to(torch::kFloat32).contiguous(); + auto faces_c = faces.to(torch::kInt32).contiguous(); + TORCH_CHECK(vertices_c.dim() == 2 && vertices_c.size(1) == 3, "vertices must have shape [V, 3]"); + TORCH_CHECK(faces_c.dim() == 2 && faces_c.size(1) == 3, "faces must have shape [F, 3]"); + + float3 voxel_size_h = tensor_to_float3_cpu(voxel_size); + fdg_gpu::int3_ grid_min{}; + fdg_gpu::int3_ grid_max{}; + tensor_to_grid_min_max_cpu(grid_range, grid_min, grid_max); + fdg_gpu::int3_ grid_size = grid_size_from_min_max(grid_min, grid_max); + + cudaStream_t stream = nullptr; + auto out = oct_pairs::voxelize_mesh_oct_gpu( + vertices_c.data_ptr(), + vertices_c.size(0), + faces_c.data_ptr(), + faces_c.size(0), 
+ grid_min, + grid_size, + voxel_size_h, + stream + ); + + return primitive_pair_to_tensors(out, vertices_c.device(), stream); +} + + +std::tuple voxelize_edge_oct_gpu( + const torch::Tensor& vertices, + const torch::Tensor& edges, + const torch::Tensor& voxel_size, + const torch::Tensor& grid_range +) { + TORCH_CHECK(vertices.is_cuda(), "vertices must be a CUDA tensor"); + TORCH_CHECK(edges.is_cuda(), "edges must be a CUDA tensor"); + TORCH_CHECK(vertices.device() == edges.device(), "vertices and edges must be on the same CUDA device"); + + auto vertices_c = vertices.to(torch::kFloat32).contiguous(); + auto edges_c = edges.to(torch::kInt32).contiguous(); + TORCH_CHECK(vertices_c.dim() == 2 && vertices_c.size(1) == 3, "vertices must have shape [V, 3]"); + check_edges_tensor(edges_c); + + float3 voxel_size_h = tensor_to_float3_cpu(voxel_size); + fdg_gpu::int3_ grid_min{}; + fdg_gpu::int3_ grid_max{}; + tensor_to_grid_min_max_cpu(grid_range, grid_min, grid_max); + fdg_gpu::int3_ grid_size = grid_size_from_min_max(grid_min, grid_max); + + cudaStream_t stream = nullptr; + auto out = oct_pairs::voxelize_edge_oct_gpu( + vertices_c.data_ptr(), + vertices_c.size(0), + edges_c.data_ptr(), + edges_c.size(0), + grid_min, + grid_size, + voxel_size_h, + stream + ); + + return primitive_pair_to_tensors(out, vertices_c.device(), stream); +} + + +torch::Tensor face_qef_gpu( + const torch::Tensor& triangles, + const torch::Tensor& voxel_size, + const torch::Tensor& grid_range, + const torch::Tensor& voxels +) { + TORCH_CHECK(triangles.is_cuda(), "triangles must be a CUDA tensor"); + TORCH_CHECK(voxels.is_cuda(), "voxels must be a CUDA tensor"); + TORCH_CHECK(triangles.device() == voxels.device(), "triangles and voxels must be on the same CUDA device"); + + auto triangles_c = triangles.to(torch::kFloat32).contiguous(); + auto voxels_c = voxels.to(torch::kInt32).contiguous(); + check_triangles_tensor(triangles_c); + check_voxels_tensor(voxels_c); + + float3 voxel_size_h = 
tensor_to_float3_cpu(voxel_size); + fdg_gpu::int3_ grid_min{}; + fdg_gpu::int3_ grid_max{}; + tensor_to_grid_min_max_cpu(grid_range, grid_min, grid_max); + + cudaStream_t stream = nullptr; + auto out = oct_pairs::face_qef_gpu( + voxel_size_h, + grid_min, + grid_max, + triangles_c.data_ptr(), + triangles_c.size(0), + voxels_c.data_ptr(), + voxels_c.size(0), + stream + ); + + static_assert(sizeof(fdg_gpu::SymQEF10) == sizeof(float) * 10, "Unexpected SymQEF10 layout"); + auto opts_f32 = torch::TensorOptions().dtype(torch::kFloat32).device(triangles_c.device()); + torch::Tensor qefs = torch::empty({out.size, 10}, opts_f32); + if (out.size > 0) { + check_cuda_success( + cudaMemcpyAsync( + qefs.data_ptr(), + out.qefs.data(), + static_cast(out.size) * sizeof(fdg_gpu::SymQEF10), + cudaMemcpyDeviceToDevice, + stream + ), + "cudaMemcpyAsync face_qef qefs" + ); + check_cuda_success(cudaStreamSynchronize(stream), "cudaStreamSynchronize face_qef"); + } + return qefs; +} + + +std::tuple voxel_traverse_edge_dda_gpu( + const torch::Tensor& vertices, + const torch::Tensor& edges, + const torch::Tensor& voxel_size, + const torch::Tensor& grid_range, + int chunk_steps +) { + TORCH_CHECK(vertices.is_cuda(), "vertices must be a CUDA tensor"); + TORCH_CHECK(edges.is_cuda(), "edges must be a CUDA tensor"); + TORCH_CHECK(vertices.device() == edges.device(), "vertices and edges must be on the same CUDA device"); + TORCH_CHECK(chunk_steps > 0, "chunk_steps must be > 0"); + + auto vertices_c = vertices.to(torch::kFloat32).contiguous(); + auto edges_c = edges.to(torch::kInt32).contiguous(); + TORCH_CHECK(vertices_c.dim() == 2 && vertices_c.size(1) == 3, "vertices must have shape [V, 3]"); + check_edges_tensor(edges_c); + + float3 voxel_size_h = tensor_to_float3_cpu(voxel_size); + fdg_gpu::int3_ grid_min{}; + fdg_gpu::int3_ grid_max{}; + tensor_to_grid_min_max_cpu(grid_range, grid_min, grid_max); + + cudaStream_t stream = nullptr; + auto out = edge_dda::voxel_traverse_edge_dda_gpu( + 
vertices_c.data_ptr(), + vertices_c.size(0), + edges_c.data_ptr(), + edges_c.size(0), + voxel_size_h, + grid_min, + grid_max, + chunk_steps, + stream + ); + + return primitive_pair_to_tensors(out, vertices_c.device(), stream); +} + + +torch::Tensor boundary_qef_gpu( + const torch::Tensor& boundaries, + const torch::Tensor& voxel_size, + const torch::Tensor& grid_range, + float boundary_weight, + const torch::Tensor& voxels, + int chunk_steps +) { + TORCH_CHECK(boundaries.is_cuda(), "boundaries must be a CUDA tensor"); + TORCH_CHECK(voxels.is_cuda(), "voxels must be a CUDA tensor"); + TORCH_CHECK(boundaries.device() == voxels.device(), "boundaries and voxels must be on the same CUDA device"); + TORCH_CHECK(chunk_steps > 0, "chunk_steps must be > 0"); + + auto boundaries_c = boundaries.to(torch::kFloat32).contiguous(); + auto voxels_c = voxels.to(torch::kInt32).contiguous(); + check_boundaries_tensor(boundaries_c); + check_voxels_tensor(voxels_c); + + float3 voxel_size_h = tensor_to_float3_cpu(voxel_size); + fdg_gpu::int3_ grid_min{}; + fdg_gpu::int3_ grid_max{}; + tensor_to_grid_min_max_cpu(grid_range, grid_min, grid_max); + + cudaStream_t stream = nullptr; + auto out = edge_dda::boundary_qef_gpu( + voxel_size_h, + grid_min, + grid_max, + boundaries_c.data_ptr(), + boundaries_c.size(0), + boundary_weight, + voxels_c.data_ptr(), + voxels_c.size(0), + chunk_steps, + stream + ); + + static_assert(sizeof(fdg_gpu::SymQEF10) == sizeof(float) * 10, "Unexpected SymQEF10 layout"); + auto opts_f32 = torch::TensorOptions().dtype(torch::kFloat32).device(boundaries_c.device()); + torch::Tensor qefs = torch::empty({out.size, 10}, opts_f32); + if (out.size > 0) { + check_cuda_success( + cudaMemcpyAsync( + qefs.data_ptr(), + out.qefs.data(), + static_cast(out.size) * sizeof(fdg_gpu::SymQEF10), + cudaMemcpyDeviceToDevice, + stream + ), + "cudaMemcpyAsync boundary_qef qefs" + ); + check_cuda_success(cudaStreamSynchronize(stream), "cudaStreamSynchronize boundary_qef"); + } + + return 
qefs; +} diff --git a/o-voxel/src/convert/mesh_to_flexible_dual_grid_gpu/voxel_traverse_edge_dda.cu b/o-voxel/src/convert/mesh_to_flexible_dual_grid_gpu/voxel_traverse_edge_dda.cu new file mode 100644 index 00000000..dbc0d88c --- /dev/null +++ b/o-voxel/src/convert/mesh_to_flexible_dual_grid_gpu/voxel_traverse_edge_dda.cu @@ -0,0 +1,1029 @@ +#include "voxel_traverse_edge_dda.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#include +#include +#include +#include +#include +#include + +namespace voxel_traverse_edge_dda_impl { + +#define VOX_CUDA_CHECK(expr) \ + do { \ + cudaError_t _err = (expr); \ + if (_err != cudaSuccess) { \ + throw std::runtime_error(std::string("CUDA error: ") + \ + cudaGetErrorString(_err) + \ + " at " + __FILE__ + ":" + \ + std::to_string(__LINE__)); \ + } \ + } while (0) + +constexpr int kDefaultBlockSize = 128; + +struct EdgeDesc { + float3 v0_ws; + float3 v1_ws; + + double3 dir_unit; + double segment_length; + + int32_t start_x; + int32_t start_y; + int32_t start_z; + + int8_t step_x; + int8_t step_y; + int8_t step_z; + + double tmax0_x; + double tmax0_y; + double tmax0_z; + + double tdelta_x; + double tdelta_y; + double tdelta_z; +}; + +struct DDAJobQueue { + int32_t* edge_id = nullptr; + + int32_t* cur_x = nullptr; + int32_t* cur_y = nullptr; + int32_t* cur_z = nullptr; + + double* tmax_x = nullptr; + double* tmax_y = nullptr; + double* tmax_z = nullptr; + + int64_t size = 0; + int64_t capacity = 0; +}; + +struct RoundBuffers { + int32_t* pair_count = nullptr; + int32_t* next_job_count = nullptr; + + int32_t* pair_offset = nullptr; + int32_t* next_job_offset = nullptr; + + void* cub_temp_storage = nullptr; + size_t cub_temp_bytes = 0; + int64_t capacity = 0; +}; + +struct ResultBuffer { + int32_t* edge_id = nullptr; + int32_t* vi = nullptr; + int32_t* vj = nullptr; + int32_t* vk = nullptr; + int64_t size = 0; +}; + +struct DeviceResult { + int32_t* edge_id = nullptr; + int32_t* voxel_i = 
nullptr; + int32_t* voxel_j = nullptr; + int32_t* voxel_k = nullptr; + int64_t size = 0; +}; + +struct Workspace { + EdgeDesc* edge_desc = nullptr; + + uint8_t* edge_valid = nullptr; + int32_t* init_count = nullptr; + int32_t* init_offset = nullptr; + + DDAJobQueue queue_a; + DDAJobQueue queue_b; + + RoundBuffers round; + std::vector result_rounds; +}; + +inline int ceil_div_i64(int64_t n, int block) { + return static_cast((n + block - 1) / block); +} + +inline void free_ptr(void* ptr) { + if (ptr != nullptr) { + cudaFree(ptr); + } +} + +inline void alloc_i32(int32_t** ptr, int64_t n) { + *ptr = nullptr; + if (n > 0) { + VOX_CUDA_CHECK(cudaMalloc(reinterpret_cast(ptr), sizeof(int32_t) * n)); + } +} + +inline void alloc_u8(uint8_t** ptr, int64_t n) { + *ptr = nullptr; + if (n > 0) { + VOX_CUDA_CHECK(cudaMalloc(reinterpret_cast(ptr), sizeof(uint8_t) * n)); + } +} + +inline void alloc_double(double** ptr, int64_t n) { + *ptr = nullptr; + if (n > 0) { + VOX_CUDA_CHECK(cudaMalloc(reinterpret_cast(ptr), sizeof(double) * n)); + } +} + +inline void alloc_edge_desc(EdgeDesc** ptr, int64_t n) { + *ptr = nullptr; + if (n > 0) { + VOX_CUDA_CHECK(cudaMalloc(reinterpret_cast(ptr), sizeof(EdgeDesc) * n)); + } +} + +inline void release_dda_job_queue(DDAJobQueue& q) { + free_ptr(q.edge_id); + free_ptr(q.cur_x); + free_ptr(q.cur_y); + free_ptr(q.cur_z); + free_ptr(q.tmax_x); + free_ptr(q.tmax_y); + free_ptr(q.tmax_z); + q = {}; +} + +inline void release_round_buffers(RoundBuffers& b) { + free_ptr(b.pair_count); + free_ptr(b.next_job_count); + free_ptr(b.pair_offset); + free_ptr(b.next_job_offset); + free_ptr(b.cub_temp_storage); + b = {}; +} + +inline void release_result_buffer(ResultBuffer& r) { + free_ptr(r.edge_id); + free_ptr(r.vi); + free_ptr(r.vj); + free_ptr(r.vk); + r = {}; +} + +inline void release_workspace(Workspace& ws) { + free_ptr(ws.edge_desc); + free_ptr(ws.edge_valid); + free_ptr(ws.init_count); + free_ptr(ws.init_offset); + release_dda_job_queue(ws.queue_a); + 
release_dda_job_queue(ws.queue_b); + release_round_buffers(ws.round); + for (auto& r : ws.result_rounds) { + release_result_buffer(r); + } + ws.result_rounds.clear(); +} + +inline void ensure_dda_job_queue_capacity(DDAJobQueue& q, int64_t capacity) { + if (capacity <= q.capacity) { + return; + } + release_dda_job_queue(q); + alloc_i32(&q.edge_id, capacity); + alloc_i32(&q.cur_x, capacity); + alloc_i32(&q.cur_y, capacity); + alloc_i32(&q.cur_z, capacity); + alloc_double(&q.tmax_x, capacity); + alloc_double(&q.tmax_y, capacity); + alloc_double(&q.tmax_z, capacity); + q.capacity = capacity; + q.size = 0; +} + +inline void ensure_round_capacity(RoundBuffers& b, int64_t capacity) { + if (capacity <= b.capacity) { + return; + } + free_ptr(b.pair_count); + free_ptr(b.next_job_count); + free_ptr(b.pair_offset); + free_ptr(b.next_job_offset); + + alloc_i32(&b.pair_count, capacity); + alloc_i32(&b.next_job_count, capacity); + alloc_i32(&b.pair_offset, capacity); + alloc_i32(&b.next_job_offset, capacity); + b.capacity = capacity; +} + +inline void ensure_scan_temp_storage( + RoundBuffers& b, + int32_t* d_in, + int32_t* d_out, + int64_t count, + cudaStream_t stream) { + if (count <= 0) { + return; + } + if (count > INT32_MAX) { + throw std::runtime_error("CUB scan count exceeds int32 range"); + } + size_t bytes = 0; + VOX_CUDA_CHECK(cub::DeviceScan::ExclusiveSum( + nullptr, + bytes, + d_in, + d_out, + static_cast(count), + stream)); + if (bytes > b.cub_temp_bytes) { + free_ptr(b.cub_temp_storage); + VOX_CUDA_CHECK(cudaMalloc(&b.cub_temp_storage, bytes)); + b.cub_temp_bytes = bytes; + } +} + +inline void exclusive_scan_i32( + RoundBuffers& b, + int32_t* d_in, + int32_t* d_out, + int64_t count, + cudaStream_t stream) { + if (count <= 0) { + return; + } + ensure_scan_temp_storage(b, d_in, d_out, count, stream); + VOX_CUDA_CHECK(cub::DeviceScan::ExclusiveSum( + b.cub_temp_storage, + b.cub_temp_bytes, + d_in, + d_out, + static_cast(count), + stream)); +} + +inline int32_t 
copy_last_i32(const int32_t* ptr, int64_t count, cudaStream_t stream) { + if (count <= 0) { + return 0; + } + int32_t value = 0; + VOX_CUDA_CHECK(cudaMemcpyAsync( + &value, + ptr + (count - 1), + sizeof(int32_t), + cudaMemcpyDeviceToHost, + stream)); + VOX_CUDA_CHECK(cudaStreamSynchronize(stream)); + return value; +} + +inline ResultBuffer make_result_buffer(int64_t count) { + ResultBuffer r; + if (count <= 0) { + return r; + } + alloc_i32(&r.edge_id, count); + alloc_i32(&r.vi, count); + alloc_i32(&r.vj, count); + alloc_i32(&r.vk, count); + r.size = count; + return r; +} + +inline DeviceResult gather_result_rounds( + const std::vector& rounds, + cudaStream_t stream) { + DeviceResult out; + int64_t total = 0; + for (const auto& r : rounds) { + total += r.size; + } + out.size = total; + if (total == 0) { + return out; + } + + alloc_i32(&out.edge_id, total); + alloc_i32(&out.voxel_i, total); + alloc_i32(&out.voxel_j, total); + alloc_i32(&out.voxel_k, total); + + int64_t cursor = 0; + for (const auto& r : rounds) { + if (r.size == 0) { + continue; + } + VOX_CUDA_CHECK(cudaMemcpyAsync( + out.edge_id + cursor, + r.edge_id, + sizeof(int32_t) * r.size, + cudaMemcpyDeviceToDevice, + stream)); + VOX_CUDA_CHECK(cudaMemcpyAsync( + out.voxel_i + cursor, + r.vi, + sizeof(int32_t) * r.size, + cudaMemcpyDeviceToDevice, + stream)); + VOX_CUDA_CHECK(cudaMemcpyAsync( + out.voxel_j + cursor, + r.vj, + sizeof(int32_t) * r.size, + cudaMemcpyDeviceToDevice, + stream)); + VOX_CUDA_CHECK(cudaMemcpyAsync( + out.voxel_k + cursor, + r.vk, + sizeof(int32_t) * r.size, + cudaMemcpyDeviceToDevice, + stream)); + cursor += r.size; + } + + VOX_CUDA_CHECK(cudaStreamSynchronize(stream)); + return out; +} + +__device__ inline int argmin_axis_strict(double tx, double ty, double tz) { + if (tx < ty) { + return (tx < tz) ? 0 : 2; + } + return (ty < tz) ? 
1 : 2; +} + +__device__ inline bool in_bounds_voxel_abs( + int x, + int y, + int z, + fdg_gpu::int3_ grid_min, + fdg_gpu::int3_ grid_max) { + return (grid_min.x <= x && x < grid_max.x) && + (grid_min.y <= y && y < grid_max.y) && + (grid_min.z <= z && z < grid_max.z); +} + +__global__ void kernel_build_edge_desc( + const float* __restrict__ vertices, + const int32_t* __restrict__ edges, + int64_t num_edges, + float3 voxel_size, + EdgeDesc* __restrict__ edge_desc, + uint8_t* __restrict__ edge_valid) { + int64_t eid = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + if (eid >= num_edges) { + return; + } + + int v0_id = edges[2 * eid + 0]; + int v1_id = edges[2 * eid + 1]; + + float3 v0 = make_float3(vertices[3 * v0_id + 0], vertices[3 * v0_id + 1], vertices[3 * v0_id + 2]); + float3 v1 = make_float3(vertices[3 * v1_id + 0], vertices[3 * v1_id + 1], vertices[3 * v1_id + 2]); + + double dx = static_cast(v1.x) - static_cast(v0.x); + double dy = static_cast(v1.y) - static_cast(v0.y); + double dz = static_cast(v1.z) - static_cast(v0.z); + double segment_length = sqrt(dx * dx + dy * dy + dz * dz); + + if (segment_length < 1e-6) { + edge_valid[eid] = 0; + return; + } + + double3 dir_unit = make_double3(dx / segment_length, dy / segment_length, dz / segment_length); + + int32_t sx = static_cast(floor(static_cast(v0.x) / static_cast(voxel_size.x))); + int32_t sy = static_cast(floor(static_cast(v0.y) / static_cast(voxel_size.y))); + int32_t sz = static_cast(floor(static_cast(v0.z) / static_cast(voxel_size.z))); + + int8_t step_x = (dir_unit.x > 0.0) ? 1 : -1; + int8_t step_y = (dir_unit.y > 0.0) ? 1 : -1; + int8_t step_z = (dir_unit.z > 0.0) ? 1 : -1; + + double tmax_x, tmax_y, tmax_z; + double tdelta_x, tdelta_y, tdelta_z; + + if (dir_unit.x == 0.0) { + tmax_x = CUDART_INF; + tdelta_x = CUDART_INF; + } else { + double voxel_border = static_cast(voxel_size.x) * static_cast(sx + (step_x > 0 ? 
1 : 0)); + tmax_x = (voxel_border - static_cast(v0.x)) / dir_unit.x; + tdelta_x = static_cast(voxel_size.x) / fabs(dir_unit.x); + } + + if (dir_unit.y == 0.0) { + tmax_y = CUDART_INF; + tdelta_y = CUDART_INF; + } else { + double voxel_border = static_cast(voxel_size.y) * static_cast(sy + (step_y > 0 ? 1 : 0)); + tmax_y = (voxel_border - static_cast(v0.y)) / dir_unit.y; + tdelta_y = static_cast(voxel_size.y) / fabs(dir_unit.y); + } + + if (dir_unit.z == 0.0) { + tmax_z = CUDART_INF; + tdelta_z = CUDART_INF; + } else { + double voxel_border = static_cast(voxel_size.z) * static_cast(sz + (step_z > 0 ? 1 : 0)); + tmax_z = (voxel_border - static_cast(v0.z)) / dir_unit.z; + tdelta_z = static_cast(voxel_size.z) / fabs(dir_unit.z); + } + + EdgeDesc desc; + desc.v0_ws = v0; + desc.v1_ws = v1; + desc.dir_unit = dir_unit; + desc.segment_length = segment_length; + desc.start_x = sx; + desc.start_y = sy; + desc.start_z = sz; + desc.step_x = step_x; + desc.step_y = step_y; + desc.step_z = step_z; + desc.tmax0_x = tmax_x; + desc.tmax0_y = tmax_y; + desc.tmax0_z = tmax_z; + desc.tdelta_x = tdelta_x; + desc.tdelta_y = tdelta_y; + desc.tdelta_z = tdelta_z; + + edge_desc[eid] = desc; + edge_valid[eid] = 1; +} + +__global__ void kernel_count_init_jobs( + const uint8_t* __restrict__ edge_valid, + int64_t num_edges, + int32_t* __restrict__ init_count) { + int64_t eid = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + if (eid >= num_edges) { + return; + } + init_count[eid] = edge_valid[eid] ? 
1 : 0; +} + +__global__ void kernel_emit_init_jobs( + const uint8_t* __restrict__ edge_valid, + const EdgeDesc* __restrict__ edge_desc, + const int32_t* __restrict__ init_offset, + int64_t num_edges, + DDAJobQueue out_q) { + int64_t eid = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + if (eid >= num_edges || !edge_valid[eid]) { + return; + } + + int32_t out = init_offset[eid]; + const EdgeDesc& desc = edge_desc[eid]; + out_q.edge_id[out] = static_cast(eid); + out_q.cur_x[out] = desc.start_x; + out_q.cur_y[out] = desc.start_y; + out_q.cur_z[out] = desc.start_z; + out_q.tmax_x[out] = desc.tmax0_x; + out_q.tmax_y[out] = desc.tmax0_y; + out_q.tmax_z[out] = desc.tmax0_z; +} + +__global__ void kernel_count_dda_jobs( + DDAJobQueue curr_q, + const EdgeDesc* __restrict__ edge_desc, + fdg_gpu::int3_ grid_min, + fdg_gpu::int3_ grid_max, + int chunk_steps, + int32_t* __restrict__ pair_count, + int32_t* __restrict__ next_job_count) { + int64_t jid = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + if (jid >= curr_q.size) { + return; + } + + int32_t eid = curr_q.edge_id[jid]; + const EdgeDesc& desc = edge_desc[eid]; + + int32_t cx = curr_q.cur_x[jid]; + int32_t cy = curr_q.cur_y[jid]; + int32_t cz = curr_q.cur_z[jid]; + double tx = curr_q.tmax_x[jid]; + double ty = curr_q.tmax_y[jid]; + double tz = curr_q.tmax_z[jid]; + + int32_t local_pairs = 0; + bool alive = true; + + if (in_bounds_voxel_abs(cx, cy, cz, grid_min, grid_max)) { + local_pairs += 1; + } + + for (int step_idx = 0; step_idx < chunk_steps; ++step_idx) { + int axis = argmin_axis_strict(tx, ty, tz); + double t_axis = (axis == 0) ? tx : (axis == 1 ? 
ty : tz); + if (t_axis > desc.segment_length) { + alive = false; + break; + } + + if (axis == 0) { + cx += static_cast(desc.step_x); + tx += desc.tdelta_x; + } else if (axis == 1) { + cy += static_cast(desc.step_y); + ty += desc.tdelta_y; + } else { + cz += static_cast(desc.step_z); + tz += desc.tdelta_z; + } + + if (in_bounds_voxel_abs(cx, cy, cz, grid_min, grid_max)) { + local_pairs += 1; + } + } + + pair_count[jid] = local_pairs; + next_job_count[jid] = alive ? 1 : 0; +} + +__global__ void kernel_emit_dda_jobs( + DDAJobQueue curr_q, + const EdgeDesc* __restrict__ edge_desc, + fdg_gpu::int3_ grid_min, + fdg_gpu::int3_ grid_max, + int chunk_steps, + const int32_t* __restrict__ pair_offset, + const int32_t* __restrict__ next_job_offset, + ResultBuffer out_res, + DDAJobQueue next_q) { + int64_t jid = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + if (jid >= curr_q.size) { + return; + } + + int32_t eid = curr_q.edge_id[jid]; + const EdgeDesc& desc = edge_desc[eid]; + + int32_t cx = curr_q.cur_x[jid]; + int32_t cy = curr_q.cur_y[jid]; + int32_t cz = curr_q.cur_z[jid]; + double tx = curr_q.tmax_x[jid]; + double ty = curr_q.tmax_y[jid]; + double tz = curr_q.tmax_z[jid]; + + int32_t out_pair = pair_offset[jid]; + bool alive = true; + + if (in_bounds_voxel_abs(cx, cy, cz, grid_min, grid_max)) { + out_res.edge_id[out_pair] = eid; + out_res.vi[out_pair] = cx; + out_res.vj[out_pair] = cy; + out_res.vk[out_pair] = cz; + out_pair += 1; + } + + for (int step_idx = 0; step_idx < chunk_steps; ++step_idx) { + int axis = argmin_axis_strict(tx, ty, tz); + double t_axis = (axis == 0) ? tx : (axis == 1 ? 
ty : tz); + if (t_axis > desc.segment_length) { + alive = false; + break; + } + + if (axis == 0) { + cx += static_cast(desc.step_x); + tx += desc.tdelta_x; + } else if (axis == 1) { + cy += static_cast(desc.step_y); + ty += desc.tdelta_y; + } else { + cz += static_cast(desc.step_z); + tz += desc.tdelta_z; + } + + if (in_bounds_voxel_abs(cx, cy, cz, grid_min, grid_max)) { + out_res.edge_id[out_pair] = eid; + out_res.vi[out_pair] = cx; + out_res.vj[out_pair] = cy; + out_res.vk[out_pair] = cz; + out_pair += 1; + } + } + + if (alive) { + int32_t out_job = next_job_offset[jid]; + next_q.edge_id[out_job] = eid; + next_q.cur_x[out_job] = cx; + next_q.cur_y[out_job] = cy; + next_q.cur_z[out_job] = cz; + next_q.tmax_x[out_job] = tx; + next_q.tmax_y[out_job] = ty; + next_q.tmax_z[out_job] = tz; + } +} + +inline void release_device_result(DeviceResult& out) { + free_ptr(out.edge_id); + free_ptr(out.voxel_i); + free_ptr(out.voxel_j); + free_ptr(out.voxel_k); + out = {}; +} + +} // namespace voxel_traverse_edge_dda_impl + +namespace { + +inline fdg_gpu::PrimitivePairResult to_primitive_pair(voxel_traverse_edge_dda_impl::DeviceResult&& r) { + fdg_gpu::PrimitivePairResult out; + out.size = r.size; + out.prim_id.adopt(r.edge_id, r.size); + out.voxel_i.adopt(r.voxel_i, r.size); + out.voxel_j.adopt(r.voxel_j, r.size); + out.voxel_k.adopt(r.voxel_k, r.size); + r.edge_id = nullptr; + r.voxel_i = nullptr; + r.voxel_j = nullptr; + r.voxel_k = nullptr; + r.size = 0; + return out; +} + +__host__ __device__ inline fdg_gpu::SymQEF10 symqef10_zero() { + return fdg_gpu::SymQEF10{0,0,0,0,0,0,0,0,0,0}; +} + +struct SymQEF10Add { + __host__ __device__ fdg_gpu::SymQEF10 operator()(const fdg_gpu::SymQEF10& a, const fdg_gpu::SymQEF10& b) const { + return fdg_gpu::SymQEF10{ + a.q00 + b.q00, a.q01 + b.q01, a.q02 + b.q02, a.q03 + b.q03, + a.q11 + b.q11, a.q12 + b.q12, a.q13 + b.q13, + a.q22 + b.q22, a.q23 + b.q23, + a.q33 + b.q33}; + } +}; + +struct SurfaceLookup { + int64_t size = 0; + 
fdg_gpu::DeviceBuffer keys_sorted; + fdg_gpu::DeviceBuffer ids_sorted; +}; + +struct EdgePairKeys { + int64_t size = 0; + fdg_gpu::DeviceBuffer pair_keys; +}; + +struct BoundaryContribStream { + int64_t size = 0; + fdg_gpu::DeviceBuffer voxel_id; + fdg_gpu::DeviceBuffer qef; +}; + + +__global__ void copy_boundaries_to_vertices_kernel(const float* boundaries, int64_t num_boundaries, float* vertices_out) { + int64_t tid = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + if (tid >= 2 * num_boundaries) return; + vertices_out[3 * tid + 0] = boundaries[3 * tid + 0]; + vertices_out[3 * tid + 1] = boundaries[3 * tid + 1]; + vertices_out[3 * tid + 2] = boundaries[3 * tid + 2]; +} + +__global__ void build_synth_edges_kernel(int64_t num_boundaries, int32_t* edges_out) { + int64_t eid = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + if (eid >= num_boundaries) return; + edges_out[2 * eid + 0] = static_cast(2 * eid + 0); + edges_out[2 * eid + 1] = static_cast(2 * eid + 1); +} + +__global__ void build_surface_keys_kernel(const int* voxels, int64_t num_voxels, fdg_gpu::int3_ grid_min, fdg_gpu::int3_ grid_max, uint64_t* keys, int32_t* ids) { + int64_t i = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + if (i >= num_voxels) return; + int x = voxels[3 * i + 0]; + int y = voxels[3 * i + 1]; + int z = voxels[3 * i + 2]; + keys[i] = fdg_gpu::pack_voxel_key(x, y, z, grid_min, grid_max); + ids[i] = static_cast(i); +} + +__global__ void build_raw_pair_voxel_keys_kernel(const int32_t* voxel_i, const int32_t* voxel_j, const int32_t* voxel_k, int64_t num_pairs, fdg_gpu::int3_ grid_min, fdg_gpu::int3_ grid_max, uint64_t* pair_voxel_keys) { + int64_t i = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + if (i >= num_pairs) return; + pair_voxel_keys[i] = fdg_gpu::pack_voxel_key(voxel_i[i], voxel_j[i], voxel_k[i], grid_min, grid_max); +} + +__device__ inline int lower_bound_u64(const uint64_t* arr, int64_t n, uint64_t key) { + int64_t lo = 0; + int64_t hi = n; + while (lo < 
hi) { + int64_t mid = (lo + hi) >> 1; + uint64_t v = arr[mid]; + if (v < key) lo = mid + 1; + else hi = mid; + } + return static_cast(lo); +} + +__global__ void map_pair_to_voxel_id_kernel(const uint64_t* pair_voxel_keys, const int32_t* edge_id, int64_t num_pairs, const uint64_t* surface_keys_sorted, const int32_t* surface_ids_sorted, int64_t num_voxels, int32_t* mapped_voxel_id, int32_t* mapped_edge_id, int32_t* valid) { + int64_t i = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + if (i >= num_pairs) return; + uint64_t key = pair_voxel_keys[i]; + int pos = lower_bound_u64(surface_keys_sorted, num_voxels, key); + if (pos < num_voxels && surface_keys_sorted[pos] == key) { + mapped_voxel_id[i] = surface_ids_sorted[pos]; + mapped_edge_id[i] = edge_id[i]; + valid[i] = 1; + } else { + mapped_voxel_id[i] = -1; + mapped_edge_id[i] = -1; + valid[i] = 0; + } +} + +__host__ __device__ inline uint64_t pack_edge_voxel_pair_key(int32_t edge_id, uint64_t voxel_key) { + return (static_cast(static_cast(edge_id)) << 32) ^ voxel_key; +} + +__global__ void compact_valid_pairs_kernel(const int32_t* mapped_voxel_id, const int32_t* mapped_edge_id, const int32_t* valid, const int32_t* offsets, int64_t num_pairs, uint64_t* pair_keys_out) { + int64_t i = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + if (i >= num_pairs || valid[i] == 0) return; + int32_t out = offsets[i]; + uint64_t voxel_key = static_cast(mapped_voxel_id[i]); + pair_keys_out[out] = (static_cast(static_cast(mapped_edge_id[i])) << 32) | voxel_key; +} + +__global__ void decode_pair_keys_kernel(const uint64_t* pair_keys, int64_t num_pairs, int32_t* voxel_id, int32_t* edge_id) { + int64_t i = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + if (i >= num_pairs) return; + edge_id[i] = static_cast(pair_keys[i] >> 32); + voxel_id[i] = static_cast(pair_keys[i] & 0xffffffffu); +} + +__device__ inline fdg_gpu::SymQEF10 symqef10_from_boundary(float3 p0, float3 p1, float boundary_weight) { + double dx = 
static_cast(p1.x) - static_cast(p0.x); + double dy = static_cast(p1.y) - static_cast(p0.y); + double dz = static_cast(p1.z) - static_cast(p0.z); + double L = sqrt(dx * dx + dy * dy + dz * dz); + if (L < 1e-6) return symqef10_zero(); + double ux = dx / L; + double uy = dy / L; + double uz = dz / L; + double A00 = 1.0 - ux * ux; + double A01 = -ux * uy; + double A02 = -ux * uz; + double A11 = 1.0 - uy * uy; + double A12 = -uy * uz; + double A22 = 1.0 - uz * uz; + double bx = -(A00 * p0.x + A01 * p0.y + A02 * p0.z); + double by = -(A01 * p0.x + A11 * p0.y + A12 * p0.z); + double bz = -(A02 * p0.x + A12 * p0.y + A22 * p0.z); + double c = p0.x * (A00 * p0.x + A01 * p0.y + A02 * p0.z) + + p0.y * (A01 * p0.x + A11 * p0.y + A12 * p0.z) + + p0.z * (A02 * p0.x + A12 * p0.y + A22 * p0.z); + float w = boundary_weight; + return fdg_gpu::SymQEF10{ + static_cast(w * A00), static_cast(w * A01), static_cast(w * A02), static_cast(w * bx), + static_cast(w * A11), static_cast(w * A12), static_cast(w * by), + static_cast(w * A22), static_cast(w * bz), + static_cast(w * c) + }; +} + +__global__ void build_boundary_qef_contrib_kernel(const int32_t* voxel_id, const int32_t* edge_id, int64_t num_pairs, const float* boundary_vertices, const int32_t* boundary_edges, float boundary_weight, int32_t* out_voxel_id, fdg_gpu::SymQEF10* out_qef) { + int64_t i = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + if (i >= num_pairs) return; + int32_t eid = edge_id[i]; + int32_t i0 = boundary_edges[2 * eid + 0]; + int32_t i1 = boundary_edges[2 * eid + 1]; + float3 p0 = make_float3(boundary_vertices[3 * i0 + 0], boundary_vertices[3 * i0 + 1], boundary_vertices[3 * i0 + 2]); + float3 p1 = make_float3(boundary_vertices[3 * i1 + 0], boundary_vertices[3 * i1 + 1], boundary_vertices[3 * i1 + 2]); + out_voxel_id[i] = voxel_id[i]; + out_qef[i] = symqef10_from_boundary(p0, p1, boundary_weight); +} + +__global__ void scatter_reduced_qef_kernel(const int32_t* reduced_voxel_id, const fdg_gpu::SymQEF10* 
reduced_qef, int64_t M, fdg_gpu::SymQEF10* full_qef) { + int64_t i = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + if (i >= M) return; + full_qef[reduced_voxel_id[i]] = reduced_qef[i]; +} + +inline SurfaceLookup build_surface_lookup(const int* voxels, int64_t num_voxels, fdg_gpu::int3_ grid_min, fdg_gpu::int3_ grid_max, cudaStream_t stream) { + SurfaceLookup out; + out.size = num_voxels; + out.keys_sorted.allocate(num_voxels); + out.ids_sorted.allocate(num_voxels); + constexpr int kBlock = 128; + build_surface_keys_kernel<<>>(voxels, num_voxels, grid_min, grid_max, out.keys_sorted.data(), out.ids_sorted.data()); + fdg_gpu::throw_cuda_error(cudaGetLastError(), "build_surface_keys_kernel"); + thrust::sort_by_key(thrust::cuda::par.on(stream), thrust::device_pointer_cast(out.keys_sorted.data()), thrust::device_pointer_cast(out.keys_sorted.data()) + num_voxels, thrust::device_pointer_cast(out.ids_sorted.data())); + return out; +} + +inline EdgePairKeys map_and_unique_edge_pairs(const fdg_gpu::PrimitivePairResult& raw_pairs, const SurfaceLookup& lookup, fdg_gpu::int3_ grid_min, fdg_gpu::int3_ grid_max, cudaStream_t stream) { + EdgePairKeys out; + const int64_t N = raw_pairs.size; + if (N == 0) return out; + fdg_gpu::DeviceBuffer pair_voxel_keys(N); + fdg_gpu::DeviceBuffer mapped_voxel_id(N); + fdg_gpu::DeviceBuffer mapped_edge_id(N); + fdg_gpu::DeviceBuffer valid(N); + fdg_gpu::DeviceBuffer offsets(N); + constexpr int kBlock = 128; + build_raw_pair_voxel_keys_kernel<<>>(raw_pairs.voxel_i.data(), raw_pairs.voxel_j.data(), raw_pairs.voxel_k.data(), N, grid_min, grid_max, pair_voxel_keys.data()); + fdg_gpu::throw_cuda_error(cudaGetLastError(), "build_raw_pair_voxel_keys_kernel"); + map_pair_to_voxel_id_kernel<<>>(pair_voxel_keys.data(), raw_pairs.prim_id.data(), N, lookup.keys_sorted.data(), lookup.ids_sorted.data(), lookup.size, mapped_voxel_id.data(), mapped_edge_id.data(), valid.data()); + fdg_gpu::throw_cuda_error(cudaGetLastError(), 
"map_pair_to_voxel_id_kernel"); + size_t temp_bytes = 0; + void* temp = nullptr; + VOX_CUDA_CHECK(cub::DeviceScan::ExclusiveSum(nullptr, temp_bytes, valid.data(), offsets.data(), static_cast(N), stream)); + VOX_CUDA_CHECK(cudaMalloc(&temp, temp_bytes)); + VOX_CUDA_CHECK(cub::DeviceScan::ExclusiveSum(temp, temp_bytes, valid.data(), offsets.data(), static_cast(N), stream)); + int32_t last_off = voxel_traverse_edge_dda_impl::copy_last_i32(offsets.data(), N, stream); + int32_t last_valid = voxel_traverse_edge_dda_impl::copy_last_i32(valid.data(), N, stream); + cudaFree(temp); + int64_t M = static_cast(last_off) + static_cast(last_valid); + out.size = M; + out.pair_keys.allocate(M); + compact_valid_pairs_kernel<<>>(mapped_voxel_id.data(), mapped_edge_id.data(), valid.data(), offsets.data(), N, out.pair_keys.data()); + fdg_gpu::throw_cuda_error(cudaGetLastError(), "compact_valid_pairs_kernel"); + auto ptr = thrust::device_pointer_cast(out.pair_keys.data()); + thrust::sort(thrust::cuda::par.on(stream), ptr, ptr + M); + auto new_end = thrust::unique(thrust::cuda::par.on(stream), ptr, ptr + M); + out.size = static_cast(new_end - ptr); + return out; +} + +inline BoundaryContribStream build_boundary_contrib_stream(const EdgePairKeys& pair_keys, const float* boundary_vertices, const int32_t* boundary_edges, float boundary_weight, cudaStream_t stream) { + BoundaryContribStream out; + out.size = pair_keys.size; + out.voxel_id.allocate(out.size); + out.qef.allocate(out.size); + fdg_gpu::DeviceBuffer edge_id(out.size); + constexpr int kBlock = 128; + decode_pair_keys_kernel<<>>(pair_keys.pair_keys.data(), out.size, out.voxel_id.data(), edge_id.data()); + fdg_gpu::throw_cuda_error(cudaGetLastError(), "decode_pair_keys_kernel"); + build_boundary_qef_contrib_kernel<<>>(out.voxel_id.data(), edge_id.data(), out.size, boundary_vertices, boundary_edges, boundary_weight, out.voxel_id.data(), out.qef.data()); + fdg_gpu::throw_cuda_error(cudaGetLastError(), 
"build_boundary_qef_contrib_kernel"); + return out; +} + +inline edge_dda::BoundaryQEFResult reduce_boundary_contribs(BoundaryContribStream&& contrib, int64_t num_voxels, cudaStream_t stream) { + edge_dda::BoundaryQEFResult out; + out.size = num_voxels; + out.qefs.allocate(num_voxels); + out.qefs.clear_async(stream); + if (contrib.size == 0) return out; + auto kptr = thrust::device_pointer_cast(contrib.voxel_id.data()); + auto vptr = thrust::device_pointer_cast(contrib.qef.data()); + thrust::sort_by_key(thrust::cuda::par.on(stream), kptr, kptr + contrib.size, vptr); + fdg_gpu::DeviceBuffer reduced_ids(contrib.size); + fdg_gpu::DeviceBuffer reduced_qefs(contrib.size); + auto end_pair = thrust::reduce_by_key(thrust::cuda::par.on(stream), kptr, kptr + contrib.size, vptr, thrust::device_pointer_cast(reduced_ids.data()), thrust::device_pointer_cast(reduced_qefs.data()), thrust::equal_to(), SymQEF10Add{}); + int64_t M = end_pair.first - thrust::device_pointer_cast(reduced_ids.data()); + constexpr int kBlock = 128; + scatter_reduced_qef_kernel<<>>(reduced_ids.data(), reduced_qefs.data(), M, out.qefs.data()); + fdg_gpu::throw_cuda_error(cudaGetLastError(), "scatter_reduced_qef_kernel"); + return out; +} + +inline fdg_gpu::PrimitivePairResult dedup_pairs(fdg_gpu::PrimitivePairResult&& in, cudaStream_t stream) { + auto pid = thrust::device_pointer_cast(in.prim_id.data()); + auto vi = thrust::device_pointer_cast(in.voxel_i.data()); + auto vj = thrust::device_pointer_cast(in.voxel_j.data()); + auto vk = thrust::device_pointer_cast(in.voxel_k.data()); + auto begin = thrust::make_zip_iterator(thrust::make_tuple(pid, vi, vj, vk)); + auto end = thrust::make_zip_iterator(thrust::make_tuple(pid + in.size, vi + in.size, vj + in.size, vk + in.size)); + thrust::sort(thrust::cuda::par.on(stream), begin, end); + auto new_end = thrust::unique(thrust::cuda::par.on(stream), begin, end); + in.size = static_cast(new_end - begin); + return std::move(in); +} + +} // anonymous namespace + 
+namespace edge_dda { + +fdg_gpu::PrimitivePairResult voxel_traverse_edge_dda_gpu( + const float* d_vertices, + int64_t num_vertices, + const int32_t* d_edges, + int64_t num_edges, + float3 voxel_size, + fdg_gpu::int3_ grid_min, + fdg_gpu::int3_ grid_max, + int chunk_steps, + cudaStream_t stream) { + using namespace voxel_traverse_edge_dda_impl; + if (d_vertices == nullptr || d_edges == nullptr || num_vertices < 0 || num_edges < 0) { + throw std::invalid_argument("invalid edge inputs"); + } + if (!(voxel_size.x > 0.0f && voxel_size.y > 0.0f && voxel_size.z > 0.0f)) { + throw std::invalid_argument("invalid voxel_size"); + } + if (grid_max.x <= grid_min.x || grid_max.y <= grid_min.y || grid_max.z <= grid_min.z) { + throw std::invalid_argument("invalid grid range"); + } + if (chunk_steps <= 0) { + throw std::invalid_argument("chunk_steps must be positive"); + } + if (num_vertices == 0 || num_edges == 0) return {}; + + Workspace ws; + DeviceResult gathered{}; + gathered.edge_id = nullptr; + gathered.voxel_i = nullptr; + gathered.voxel_j = nullptr; + gathered.voxel_k = nullptr; + gathered.size = 0; + + VOX_CUDA_CHECK(cudaGetLastError()); + try { + alloc_edge_desc(&ws.edge_desc, num_edges); + alloc_u8(&ws.edge_valid, num_edges); + alloc_i32(&ws.init_count, num_edges); + alloc_i32(&ws.init_offset, num_edges); + kernel_build_edge_desc<<>>(d_vertices, d_edges, num_edges, voxel_size, ws.edge_desc, ws.edge_valid); + VOX_CUDA_CHECK(cudaGetLastError()); + kernel_count_init_jobs<<>>(ws.edge_valid, num_edges, ws.init_count); + VOX_CUDA_CHECK(cudaGetLastError()); + ensure_round_capacity(ws.round, num_edges); + exclusive_scan_i32(ws.round, ws.init_count, ws.init_offset, num_edges, stream); + int32_t last_init_offset = copy_last_i32(ws.init_offset, num_edges, stream); + int32_t last_init_count = copy_last_i32(ws.init_count, num_edges, stream); + int64_t num_init_jobs = static_cast(last_init_offset) + static_cast(last_init_count); + ensure_dda_job_queue_capacity(ws.queue_a, 
num_init_jobs); + ws.queue_a.size = num_init_jobs; + kernel_emit_init_jobs<<>>(ws.edge_valid, ws.edge_desc, ws.init_offset, num_edges, ws.queue_a); + VOX_CUDA_CHECK(cudaGetLastError()); + DDAJobQueue* curr = &ws.queue_a; + DDAJobQueue* next = &ws.queue_b; + while (curr->size > 0) { + int64_t nj = curr->size; + ensure_round_capacity(ws.round, nj); + kernel_count_dda_jobs<<>>(*curr, ws.edge_desc, grid_min, grid_max, chunk_steps, ws.round.pair_count, ws.round.next_job_count); + VOX_CUDA_CHECK(cudaGetLastError()); + exclusive_scan_i32(ws.round, ws.round.pair_count, ws.round.pair_offset, nj, stream); + exclusive_scan_i32(ws.round, ws.round.next_job_count, ws.round.next_job_offset, nj, stream); + int32_t last_pair_offset = copy_last_i32(ws.round.pair_offset, nj, stream); + int32_t last_pair_count = copy_last_i32(ws.round.pair_count, nj, stream); + int64_t num_pairs = static_cast(last_pair_offset) + static_cast(last_pair_count); + int32_t last_next_offset = copy_last_i32(ws.round.next_job_offset, nj, stream); + int32_t last_next_count = copy_last_i32(ws.round.next_job_count, nj, stream); + int64_t num_next_jobs = static_cast(last_next_offset) + static_cast(last_next_count); + ensure_dda_job_queue_capacity(*next, num_next_jobs); + next->size = num_next_jobs; + ResultBuffer round_result = make_result_buffer(num_pairs); + kernel_emit_dda_jobs<<>>(*curr, ws.edge_desc, grid_min, grid_max, chunk_steps, ws.round.pair_offset, ws.round.next_job_offset, round_result, *next); + VOX_CUDA_CHECK(cudaGetLastError()); + if (num_pairs > 0) ws.result_rounds.push_back(round_result); + else release_result_buffer(round_result); + std::swap(curr, next); + } + gathered = gather_result_rounds(ws.result_rounds, stream); + release_workspace(ws); + return dedup_pairs(to_primitive_pair(std::move(gathered)), stream); + } catch (...) 
{ + + release_workspace(ws); + throw; + } +} + +BoundaryQEFResult boundary_qef_gpu( + float3 voxel_size, + fdg_gpu::int3_ grid_min, + fdg_gpu::int3_ grid_max, + const float* boundaries, + int64_t num_boundaries, + float boundary_weight, + const int* voxels, + int64_t num_voxels, + int chunk_steps, + cudaStream_t stream) { + BoundaryQEFResult out; + out.size = num_voxels; + out.qefs.allocate(num_voxels); + out.qefs.clear_async(stream); + if (num_voxels == 0 || num_boundaries == 0) return out; + if (boundaries == nullptr || voxels == nullptr) throw std::invalid_argument("null boundary_qef inputs"); + fdg_gpu::DeviceBuffer boundary_vertices(2 * num_boundaries * 3); + fdg_gpu::DeviceBuffer boundary_edges(num_boundaries * 2); + constexpr int kBlock = 128; + copy_boundaries_to_vertices_kernel<<>>(boundaries, num_boundaries, boundary_vertices.data()); + fdg_gpu::throw_cuda_error(cudaGetLastError(), "copy_boundaries_to_vertices_kernel"); + build_synth_edges_kernel<<>>(num_boundaries, boundary_edges.data()); + fdg_gpu::throw_cuda_error(cudaGetLastError(), "build_synth_edges_kernel"); + auto raw_pairs = voxel_traverse_edge_dda_gpu(boundary_vertices.data(), 2 * num_boundaries, boundary_edges.data(), num_boundaries, voxel_size, grid_min, grid_max, chunk_steps, stream); + auto lookup = build_surface_lookup(voxels, num_voxels, grid_min, grid_max, stream); + auto pair_keys = map_and_unique_edge_pairs(raw_pairs, lookup, grid_min, grid_max, stream); + auto contrib = build_boundary_contrib_stream(pair_keys, boundary_vertices.data(), boundary_edges.data(), boundary_weight, stream); + return reduce_boundary_contribs(std::move(contrib), num_voxels, stream); +} + +} // namespace edge_dda diff --git a/o-voxel/src/convert/mesh_to_flexible_dual_grid_gpu/voxel_traverse_edge_dda.h b/o-voxel/src/convert/mesh_to_flexible_dual_grid_gpu/voxel_traverse_edge_dda.h new file mode 100644 index 00000000..a1e54e0e --- /dev/null +++ 
b/o-voxel/src/convert/mesh_to_flexible_dual_grid_gpu/voxel_traverse_edge_dda.h
@@ -0,0 +1,37 @@
+#pragma once
+
+#include "fdg_gpu_common.h"
// NOTE(review): the two system headers below lost their <...> targets in
// extraction — presumably <cstdint> and <cuda_runtime.h>; confirm upstream.
+#include
+#include
+
+namespace edge_dda {
+
// Traverses every input line segment through the voxel grid with a 3D DDA and
// returns deduplicated (edge id, voxel i/j/k) overlap pairs.
// vertices/edges are device pointers; voxel_size is the world-space voxel
// extent; grid_min/grid_max bound the voxel index range; chunk_steps limits
// DDA steps per job before re-queueing. stream defaults to the null stream.
+fdg_gpu::PrimitivePairResult voxel_traverse_edge_dda_gpu(
+    const float* vertices,
+    int64_t num_vertices,
+    const int32_t* edges,
+    int64_t num_edges,
+    float3 voxel_size,
+    fdg_gpu::int3_ grid_min,
+    fdg_gpu::int3_ grid_max,
+    int chunk_steps,
+    cudaStream_t stream = nullptr);
+
// One accumulated QEF per surface voxel (size entries in qefs).
// NOTE(review): DeviceBuffer's element type was stripped in extraction;
// the .cu scatter path suggests fdg_gpu::SymQEF10.
+struct BoundaryQEFResult {
+    int64_t size = 0;
+    fdg_gpu::DeviceBuffer qefs;
+};
+
// Accumulates weighted boundary-plane QEF contributions per surface voxel.
// boundaries holds num_boundaries segment endpoints (device memory); voxels
// holds num_voxels (x, y, z) int triples identifying the surface voxels.
+BoundaryQEFResult boundary_qef_gpu(
+    float3 voxel_size,
+    fdg_gpu::int3_ grid_min,
+    fdg_gpu::int3_ grid_max,
+    const float* boundaries,
+    int64_t num_boundaries,
+    float boundary_weight,
+    const int* voxels,
+    int64_t num_voxels,
+    int chunk_steps,
+    cudaStream_t stream = nullptr);
+
+} // namespace edge_dda
diff --git a/o-voxel/src/convert/mesh_to_flexible_dual_grid_gpu/voxelize_mesh_oct.cu b/o-voxel/src/convert/mesh_to_flexible_dual_grid_gpu/voxelize_mesh_oct.cu
new file mode 100644
index 00000000..68c16f56
--- /dev/null
+++ b/o-voxel/src/convert/mesh_to_flexible_dual_grid_gpu/voxelize_mesh_oct.cu
@@ -0,0 +1,1502 @@
+#include "voxelize_mesh_oct.h"
+
// NOTE(review): all <...> include targets below were stripped in extraction.
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+namespace voxelize_oct_impl {
+
// Throws std::runtime_error with file/line context on any non-success
// cudaError_t; wraps every runtime call and post-launch cudaGetLastError().
+#define VOX_CUDA_CHECK(expr) \
+    do { \
+        cudaError_t _err = (expr); \
+        if (_err != cudaSuccess) { \
+            throw std::runtime_error(std::string("CUDA error: ") + \
+                cudaGetErrorString(_err) + \
+                " at " + __FILE__ + ":" + \
+                std::to_string(__LINE__)); \
+        } \
+    } while (0)
+
// 3x3x3 neighborhood of a primitive's root octree node.
+constexpr int kRootNeighborCount = 27;
+constexpr int kDefaultBlockSize = 128;
+
// Precomputed per-triangle data: vertices, edge vectors, unit normal, and the
// triangle's axis-aligned bounding box, all in world space.
+struct FaceDesc {
+    float3 v0;
+    float3 v1;
+    float3 v2;
+
+    float3 e0;
+    float3 e1;
+    float3 e2;
+
+    float3 n_unit;
+
+    float3 tri_bmin;
+    float3 tri_bmax;
+};
+
// Precomputed per-segment data: endpoints, segment vector/length/direction.
+struct EdgeDesc {
+    float3 p0;
+    float3 p1;
+
+    float3 seg;
+    float seg_len;
+    float3 dir_unit;
+ 
float3 seg_bmin; + float3 seg_bmax; +}; + +struct JobQueue { + int32_t* prim_id = nullptr; + uint8_t* level = nullptr; + int32_t* i = nullptr; + int32_t* j = nullptr; + int32_t* k = nullptr; + int64_t size = 0; + int64_t capacity = 0; +}; + +struct RoundBuffers { + uint8_t* job_hit = nullptr; + int32_t* child_count = nullptr; + int32_t* result_count = nullptr; + int32_t* child_offset = nullptr; + int32_t* result_offset = nullptr; + + void* cub_temp_storage = nullptr; + size_t cub_temp_bytes = 0; + int64_t capacity = 0; +}; + +struct ResultBuffer { + int32_t* prim_id = nullptr; + int32_t* vi = nullptr; + int32_t* vj = nullptr; + int32_t* vk = nullptr; + int64_t size = 0; +}; + +struct DeviceResult { + int32_t* prim_id = nullptr; + int32_t* voxel_i = nullptr; + int32_t* voxel_j = nullptr; + int32_t* voxel_k = nullptr; + int64_t size = 0; +}; + +struct VoxelizeWorkspace { + int32_t* leaf_ix = nullptr; + int32_t* leaf_iy = nullptr; + int32_t* leaf_iz = nullptr; + + JobQueue queue_a; + JobQueue queue_b; + RoundBuffers round; + std::vector result_rounds; +}; + +inline int ceil_div_i64(int64_t n, int block) { + return static_cast((n + block - 1) / block); +} + +inline int max3_int(int a, int b, int c) { + return a > b ? (a > c ? a : c) : (b > c ? 
b : c); +} + +inline int ceil_log2_pos_int(int x) { + int d = 0; + int v = 1; + while (v < x) { + v <<= 1; + ++d; + } + return d; +} + +inline int compute_grid_depth_from_grid_size(int3 grid_size) { + const int max_dim = max3_int(grid_size.x, grid_size.y, grid_size.z); + return ceil_log2_pos_int(max_dim); +} + +inline float3 reciprocal_voxel_size(float3 voxel_size) { + return make_float3(1.0f / voxel_size.x, 1.0f / voxel_size.y, 1.0f / voxel_size.z); +} + +inline void free_ptr(void* ptr) { + if (ptr != nullptr) { + cudaFree(ptr); + } +} + +inline void alloc_i32(int32_t** ptr, int64_t n) { + *ptr = nullptr; + if (n > 0) { + VOX_CUDA_CHECK(cudaMalloc(reinterpret_cast(ptr), sizeof(int32_t) * n)); + } +} + +inline void alloc_u8(uint8_t** ptr, int64_t n) { + *ptr = nullptr; + if (n > 0) { + VOX_CUDA_CHECK(cudaMalloc(reinterpret_cast(ptr), sizeof(uint8_t) * n)); + } +} + +inline void alloc_face_desc(FaceDesc** ptr, int64_t n) { + *ptr = nullptr; + if (n > 0) { + VOX_CUDA_CHECK(cudaMalloc(reinterpret_cast(ptr), sizeof(FaceDesc) * n)); + } +} + +inline void alloc_edge_desc(EdgeDesc** ptr, int64_t n) { + *ptr = nullptr; + if (n > 0) { + VOX_CUDA_CHECK(cudaMalloc(reinterpret_cast(ptr), sizeof(EdgeDesc) * n)); + } +} + +inline void release_job_queue(JobQueue& q) { + free_ptr(q.prim_id); + free_ptr(q.level); + free_ptr(q.i); + free_ptr(q.j); + free_ptr(q.k); + q = {}; +} + +inline void release_round_buffers(RoundBuffers& b) { + free_ptr(b.job_hit); + free_ptr(b.child_count); + free_ptr(b.result_count); + free_ptr(b.child_offset); + free_ptr(b.result_offset); + free_ptr(b.cub_temp_storage); + b = {}; +} + +inline void release_result_buffer(ResultBuffer& r) { + free_ptr(r.prim_id); + free_ptr(r.vi); + free_ptr(r.vj); + free_ptr(r.vk); + r = {}; +} + +inline void release_workspace(VoxelizeWorkspace& ws) { + free_ptr(ws.leaf_ix); + free_ptr(ws.leaf_iy); + free_ptr(ws.leaf_iz); + release_job_queue(ws.queue_a); + release_job_queue(ws.queue_b); + release_round_buffers(ws.round); + 
for (auto& r : ws.result_rounds) { + release_result_buffer(r); + } + ws.result_rounds.clear(); +} + +inline void ensure_job_queue_capacity(JobQueue& q, int64_t capacity) { + if (capacity <= q.capacity) { + return; + } + release_job_queue(q); + alloc_i32(&q.prim_id, capacity); + alloc_u8(&q.level, capacity); + alloc_i32(&q.i, capacity); + alloc_i32(&q.j, capacity); + alloc_i32(&q.k, capacity); + q.capacity = capacity; + q.size = 0; +} + +inline void ensure_round_capacity(RoundBuffers& b, int64_t capacity) { + if (capacity <= b.capacity) { + return; + } + + free_ptr(b.job_hit); + free_ptr(b.child_count); + free_ptr(b.result_count); + free_ptr(b.child_offset); + free_ptr(b.result_offset); + + alloc_u8(&b.job_hit, capacity); + alloc_i32(&b.child_count, capacity); + alloc_i32(&b.result_count, capacity); + alloc_i32(&b.child_offset, capacity); + alloc_i32(&b.result_offset, capacity); + b.capacity = capacity; +} + +inline void ensure_scan_temp_storage( + RoundBuffers& b, + int32_t* d_in, + int32_t* d_out, + int64_t count, + cudaStream_t stream) { + if (count <= 0) { + return; + } + if (count > INT32_MAX) { + throw std::runtime_error("CUB scan count exceeds int32 range"); + } + size_t bytes = 0; + VOX_CUDA_CHECK(cub::DeviceScan::ExclusiveSum( + nullptr, + bytes, + d_in, + d_out, + static_cast(count), + stream)); + if (bytes > b.cub_temp_bytes) { + free_ptr(b.cub_temp_storage); + VOX_CUDA_CHECK(cudaMalloc(&b.cub_temp_storage, bytes)); + b.cub_temp_bytes = bytes; + } +} + +inline void exclusive_scan_i32( + RoundBuffers& b, + int32_t* d_in, + int32_t* d_out, + int64_t count, + cudaStream_t stream) { + if (count <= 0) { + return; + } + ensure_scan_temp_storage(b, d_in, d_out, count, stream); + VOX_CUDA_CHECK(cub::DeviceScan::ExclusiveSum( + b.cub_temp_storage, + b.cub_temp_bytes, + d_in, + d_out, + static_cast(count), + stream)); +} + +inline int32_t copy_last_i32(const int32_t* ptr, int64_t count, cudaStream_t stream) { + if (count <= 0) { + return 0; + } + int32_t value = 0; 
+ VOX_CUDA_CHECK(cudaMemcpyAsync( + &value, + ptr + (count - 1), + sizeof(int32_t), + cudaMemcpyDeviceToHost, + stream)); + VOX_CUDA_CHECK(cudaStreamSynchronize(stream)); + return value; +} + +inline ResultBuffer make_result_buffer(int64_t count) { + ResultBuffer r; + if (count <= 0) { + return r; + } + alloc_i32(&r.prim_id, count); + alloc_i32(&r.vi, count); + alloc_i32(&r.vj, count); + alloc_i32(&r.vk, count); + r.size = count; + return r; +} + +inline DeviceResult gather_result_rounds( + const std::vector& rounds, + cudaStream_t stream) { + DeviceResult out; + int64_t total = 0; + for (const auto& r : rounds) { + total += r.size; + } + out.size = total; + if (total == 0) { + return out; + } + + alloc_i32(&out.prim_id, total); + alloc_i32(&out.voxel_i, total); + alloc_i32(&out.voxel_j, total); + alloc_i32(&out.voxel_k, total); + + int64_t cursor = 0; + for (const auto& r : rounds) { + if (r.size == 0) { + continue; + } + VOX_CUDA_CHECK(cudaMemcpyAsync( + out.prim_id + cursor, + r.prim_id, + sizeof(int32_t) * r.size, + cudaMemcpyDeviceToDevice, + stream)); + VOX_CUDA_CHECK(cudaMemcpyAsync( + out.voxel_i + cursor, + r.vi, + sizeof(int32_t) * r.size, + cudaMemcpyDeviceToDevice, + stream)); + VOX_CUDA_CHECK(cudaMemcpyAsync( + out.voxel_j + cursor, + r.vj, + sizeof(int32_t) * r.size, + cudaMemcpyDeviceToDevice, + stream)); + VOX_CUDA_CHECK(cudaMemcpyAsync( + out.voxel_k + cursor, + r.vk, + sizeof(int32_t) * r.size, + cudaMemcpyDeviceToDevice, + stream)); + cursor += r.size; + } + + VOX_CUDA_CHECK(cudaStreamSynchronize(stream)); + return out; +} + +__device__ inline float3 add3(const float3& a, const float3& b) { + return make_float3(a.x + b.x, a.y + b.y, a.z + b.z); +} + +__device__ inline float3 sub3(const float3& a, const float3& b) { + return make_float3(a.x - b.x, a.y - b.y, a.z - b.z); +} + +__device__ inline float3 mul3(const float3& a, float s) { + return make_float3(a.x * s, a.y * s, a.z * s); +} + +__device__ inline float3 mul3_comp(const float3& a, const 
float3& b) { + return make_float3(a.x * b.x, a.y * b.y, a.z * b.z); +} + +__device__ inline float dot3(const float3& a, const float3& b) { + return a.x * b.x + a.y * b.y + a.z * b.z; +} + +__device__ inline float2 dot2_pair(const float2& a, const float2& b) { + return make_float2(a.x * b.x, a.y * b.y); +} + +__device__ inline float dot2(const float2& a, const float2& b) { + return a.x * b.x + a.y * b.y; +} + +__device__ inline float3 cross3(const float3& a, const float3& b) { + return make_float3( + a.y * b.z - a.z * b.y, + a.z * b.x - a.x * b.z, + a.x * b.y - a.y * b.x); +} + +__device__ inline float3 min3(const float3& a, const float3& b) { + return make_float3(fminf(a.x, b.x), fminf(a.y, b.y), fminf(a.z, b.z)); +} + +__device__ inline float3 max3(const float3& a, const float3& b) { + return make_float3(fmaxf(a.x, b.x), fmaxf(a.y, b.y), fmaxf(a.z, b.z)); +} + +__device__ inline float3 normalize3(const float3& a) { + const float z = dot3(a, a); + if (z > 0.0f) { + const float n = sqrtf(z); + return make_float3(a.x / n, a.y / n, a.z / n); + } else { + return a; + } +} + +__device__ inline bool bbox_overlap_closed( + const float3& a_min, + const float3& a_max, + const float3& b_min, + const float3& b_max) { + return !(a_max.x < b_min.x || b_max.x < a_min.x || + a_max.y < b_min.y || b_max.y < a_min.y || + a_max.z < b_min.z || b_max.z < a_min.z); +} + +__device__ inline float2 max2_zero(const float2& a) { + return make_float2(fmaxf(a.x, 0.0f), fmaxf(a.y, 0.0f)); +} + +__device__ inline void compute_face_min_level_and_root( + int ix0, int iy0, int iz0, + int ix1, int iy1, int iz1, + int ix2, int iy2, int iz2, + int d, + uint8_t& level, + int& root_i, + int& root_j, + int& root_k) { + uint32_t diff = + static_cast(ix0 ^ ix1) | static_cast(ix0 ^ ix2) | + static_cast(iy0 ^ iy1) | static_cast(iy0 ^ iy2) | + static_cast(iz0 ^ iz1) | static_cast(iz0 ^ iz2); + + if (diff == 0) { + level = static_cast(d); + root_i = ix0; + root_j = iy0; + root_k = iz0; + return; + } + + int 
msb = 31 - __clz(diff); + int l = d - 1 - msb; + level = static_cast(l); + + int shift = d - l; + root_i = ix0 >> shift; + root_j = iy0 >> shift; + root_k = iz0 >> shift; +} + +__device__ inline void compute_edge_min_level_and_root( + int ix0, int iy0, int iz0, + int ix1, int iy1, int iz1, + int d, + uint8_t& level, + int& root_i, + int& root_j, + int& root_k) { + uint32_t diff = + static_cast(ix0 ^ ix1) | + static_cast(iy0 ^ iy1) | + static_cast(iz0 ^ iz1); + + if (diff == 0) { + level = static_cast(d); + root_i = ix0; + root_j = iy0; + root_k = iz0; + return; + } + + int msb = 31 - __clz(diff); + int l = d - 1 - msb; + level = static_cast(l); + + int shift = d - l; + root_i = ix0 >> shift; + root_j = iy0 >> shift; + root_k = iz0 >> shift; +} + +__device__ inline bool node_intersects_valid_domain( + int d, + int level, + int i, + int j, + int k, + int3 grid_size) { + int cells = 1 << level; + if (i < 0 || i >= cells || j < 0 || j >= cells || k < 0 || k >= cells) { + return false; + } + + int node_span = 1 << (d - level); + int x0 = i * node_span; + int y0 = j * node_span; + int z0 = k * node_span; + + return (x0 < grid_size.x) && (y0 < grid_size.y) && (z0 < grid_size.z); +} + +__device__ inline void compute_node_box_world( + int d, + int level, + int i, + int j, + int k, + int3 grid_min, + float3 voxel_size, + float3& box_min, + float3& box_size, + float3& box_max) { + int node_span = 1 << (d - level); + + int global_base_x = grid_min.x + i * node_span; + int global_base_y = grid_min.y + j * node_span; + int global_base_z = grid_min.z + k * node_span; + + int global_end_x = global_base_x + node_span; + int global_end_y = global_base_y + node_span; + int global_end_z = global_base_z + node_span; + + box_min = make_float3( + static_cast(global_base_x) * voxel_size.x, + static_cast(global_base_y) * voxel_size.y, + static_cast(global_base_z) * voxel_size.z); + box_max = make_float3( + static_cast(global_end_x) * voxel_size.x, + static_cast(global_end_y) * 
voxel_size.y, + static_cast(global_end_z) * voxel_size.z); + box_size = sub3(box_max, box_min); +} + +__device__ inline bool face_qef_style_triangle_box_hit( + const FaceDesc& f, + const float3& box_min, + const float3& box_size, + const float3& box_max) { + if (!bbox_overlap_closed(f.tri_bmin, f.tri_bmax, box_min, box_max)) { + return false; + } + + const float3& n = f.n_unit; + + float3 c = make_float3( + n.x > 0.0f ? box_size.x : 0.0f, + n.y > 0.0f ? box_size.y : 0.0f, + n.z > 0.0f ? box_size.z : 0.0f); + + float d1 = dot3(n, sub3(c, f.v0)); + float d2 = dot3(n, sub3(sub3(box_size, c), f.v0)); + + int mul_xy = (n.z < 0.0f) ? -1 : 1; + float2 n_xy_e0 = make_float2(-mul_xy * f.e0.y, mul_xy * f.e0.x); + float2 n_xy_e1 = make_float2(-mul_xy * f.e1.y, mul_xy * f.e1.x); + float2 n_xy_e2 = make_float2(-mul_xy * f.e2.y, mul_xy * f.e2.x); + + float d_xy_e0 = -dot2(n_xy_e0, make_float2(f.v0.x, f.v0.y)) + + dot2(max2_zero(n_xy_e0), make_float2(box_size.x, box_size.y)); + float d_xy_e1 = -dot2(n_xy_e1, make_float2(f.v1.x, f.v1.y)) + + dot2(max2_zero(n_xy_e1), make_float2(box_size.x, box_size.y)); + float d_xy_e2 = -dot2(n_xy_e2, make_float2(f.v2.x, f.v2.y)) + + dot2(max2_zero(n_xy_e2), make_float2(box_size.x, box_size.y)); + + int mul_yz = (n.x < 0.0f) ? -1 : 1; + float2 n_yz_e0 = make_float2(-mul_yz * f.e0.z, mul_yz * f.e0.y); + float2 n_yz_e1 = make_float2(-mul_yz * f.e1.z, mul_yz * f.e1.y); + float2 n_yz_e2 = make_float2(-mul_yz * f.e2.z, mul_yz * f.e2.y); + + float d_yz_e0 = -dot2(n_yz_e0, make_float2(f.v0.y, f.v0.z)) + + dot2(max2_zero(n_yz_e0), make_float2(box_size.y, box_size.z)); + float d_yz_e1 = -dot2(n_yz_e1, make_float2(f.v1.y, f.v1.z)) + + dot2(max2_zero(n_yz_e1), make_float2(box_size.y, box_size.z)); + float d_yz_e2 = -dot2(n_yz_e2, make_float2(f.v2.y, f.v2.z)) + + dot2(max2_zero(n_yz_e2), make_float2(box_size.y, box_size.z)); + + int mul_zx = (n.y < 0.0f) ? 
-1 : 1; + float2 n_zx_e0 = make_float2(-mul_zx * f.e0.x, mul_zx * f.e0.z); + float2 n_zx_e1 = make_float2(-mul_zx * f.e1.x, mul_zx * f.e1.z); + float2 n_zx_e2 = make_float2(-mul_zx * f.e2.x, mul_zx * f.e2.z); + + float d_zx_e0 = -dot2(n_zx_e0, make_float2(f.v0.z, f.v0.x)) + + dot2(max2_zero(n_zx_e0), make_float2(box_size.z, box_size.x)); + float d_zx_e1 = -dot2(n_zx_e1, make_float2(f.v1.z, f.v1.x)) + + dot2(max2_zero(n_zx_e1), make_float2(box_size.z, box_size.x)); + float d_zx_e2 = -dot2(n_zx_e2, make_float2(f.v2.z, f.v2.x)) + + dot2(max2_zero(n_zx_e2), make_float2(box_size.z, box_size.x)); + + float n_dot_p = dot3(n, box_min); + if (((n_dot_p + d1) * (n_dot_p + d2)) > 0.0f) { + return false; + } + + float2 p_xy = make_float2(box_min.x, box_min.y); + if (dot2(n_xy_e0, p_xy) + d_xy_e0 < 0.0f) return false; + if (dot2(n_xy_e1, p_xy) + d_xy_e1 < 0.0f) return false; + if (dot2(n_xy_e2, p_xy) + d_xy_e2 < 0.0f) return false; + + float2 p_yz = make_float2(box_min.y, box_min.z); + if (dot2(n_yz_e0, p_yz) + d_yz_e0 < 0.0f) return false; + if (dot2(n_yz_e1, p_yz) + d_yz_e1 < 0.0f) return false; + if (dot2(n_yz_e2, p_yz) + d_yz_e2 < 0.0f) return false; + + float2 p_zx = make_float2(box_min.z, box_min.x); + if (dot2(n_zx_e0, p_zx) + d_zx_e0 < 0.0f) return false; + if (dot2(n_zx_e1, p_zx) + d_zx_e1 < 0.0f) return false; + if (dot2(n_zx_e2, p_zx) + d_zx_e2 < 0.0f) return false; + + return true; +} + +__device__ inline bool segment_box_overlap_world( + const EdgeDesc& e, + const float3& box_min, + const float3& box_max) { + if (e.seg_len < 1.0e-6f) { + return false; + } + + if (!bbox_overlap_closed(e.seg_bmin, e.seg_bmax, box_min, box_max)) { + return false; + } + + float tmin = 0.0f; + float tmax = 1.0f; + + if (e.seg.x == 0.0f) { + if (!(box_min.x <= e.p0.x && e.p0.x <= box_max.x)) return false; + } else { + float inv_d = 1.0f / e.seg.x; + float t1 = (box_min.x - e.p0.x) * inv_d; + float t2 = (box_max.x - e.p0.x) * inv_d; + if (t1 > t2) { float tmp = t1; t1 = t2; t2 = tmp; } + 
tmin = fmaxf(tmin, t1); + tmax = fminf(tmax, t2); + if (tmin > tmax) return false; + } + + if (e.seg.y == 0.0f) { + if (!(box_min.y <= e.p0.y && e.p0.y <= box_max.y)) return false; + } else { + float inv_d = 1.0f / e.seg.y; + float t1 = (box_min.y - e.p0.y) * inv_d; + float t2 = (box_max.y - e.p0.y) * inv_d; + if (t1 > t2) { float tmp = t1; t1 = t2; t2 = tmp; } + tmin = fmaxf(tmin, t1); + tmax = fminf(tmax, t2); + if (tmin > tmax) return false; + } + + if (e.seg.z == 0.0f) { + if (!(box_min.z <= e.p0.z && e.p0.z <= box_max.z)) return false; + } else { + float inv_d = 1.0f / e.seg.z; + float t1 = (box_min.z - e.p0.z) * inv_d; + float t2 = (box_max.z - e.p0.z) * inv_d; + if (t1 > t2) { float tmp = t1; t1 = t2; t2 = tmp; } + tmin = fmaxf(tmin, t1); + tmax = fminf(tmax, t2); + if (tmin > tmax) return false; + } + + return true; +} + +__global__ void kernel_build_leaf_coords( + const float* __restrict__ vertices, + int64_t num_vertices, + float3 inv_voxel_size, + int3 grid_min, + int3 grid_size, + int32_t* __restrict__ leaf_ix, + int32_t* __restrict__ leaf_iy, + int32_t* __restrict__ leaf_iz) { + int64_t vid = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + if (vid >= num_vertices) { + return; + } + + float x = vertices[3 * vid + 0] * inv_voxel_size.x; + float y = vertices[3 * vid + 1] * inv_voxel_size.y; + float z = vertices[3 * vid + 2] * inv_voxel_size.z; + + int ix = static_cast(floorf(x)) - grid_min.x; + int iy = static_cast(floorf(y)) - grid_min.y; + int iz = static_cast(floorf(z)) - grid_min.z; + + ix = max(0, min(ix, grid_size.x - 1)); + iy = max(0, min(iy, grid_size.y - 1)); + iz = max(0, min(iz, grid_size.z - 1)); + + leaf_ix[vid] = ix; + leaf_iy[vid] = iy; + leaf_iz[vid] = iz; +} + +__global__ void kernel_init_faces_and_emit_root27( + const float* __restrict__ vertices, + const int32_t* __restrict__ faces, + const int32_t* __restrict__ leaf_ix, + const int32_t* __restrict__ leaf_iy, + const int32_t* __restrict__ leaf_iz, + int64_t num_faces, + int d, + 
FaceDesc* __restrict__ face_desc, + JobQueue out_q) { + int64_t fid = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + if (fid >= num_faces) { + return; + } + + int v0_id = faces[3 * fid + 0]; + int v1_id = faces[3 * fid + 1]; + int v2_id = faces[3 * fid + 2]; + + float3 v0 = make_float3(vertices[3 * v0_id + 0], vertices[3 * v0_id + 1], vertices[3 * v0_id + 2]); + float3 v1 = make_float3(vertices[3 * v1_id + 0], vertices[3 * v1_id + 1], vertices[3 * v1_id + 2]); + float3 v2 = make_float3(vertices[3 * v2_id + 0], vertices[3 * v2_id + 1], vertices[3 * v2_id + 2]); + + FaceDesc fd; + fd.v0 = v0; + fd.v1 = v1; + fd.v2 = v2; + fd.e0 = sub3(v1, v0); + fd.e1 = sub3(v2, v1); + fd.e2 = sub3(v0, v2); + fd.n_unit = normalize3(cross3(fd.e0, fd.e1)); + fd.tri_bmin = min3(v0, min3(v1, v2)); + fd.tri_bmax = max3(v0, max3(v1, v2)); + face_desc[fid] = fd; + + uint8_t level; + int root_i, root_j, root_k; + compute_face_min_level_and_root( + leaf_ix[v0_id], leaf_iy[v0_id], leaf_iz[v0_id], + leaf_ix[v1_id], leaf_iy[v1_id], leaf_iz[v1_id], + leaf_ix[v2_id], leaf_iy[v2_id], leaf_iz[v2_id], + d, + level, + root_i, + root_j, + root_k); + + int64_t base = static_cast(kRootNeighborCount) * fid; + int slot = 0; + for (int dz = -1; dz <= 1; ++dz) { + for (int dy = -1; dy <= 1; ++dy) { + for (int dx = -1; dx <= 1; ++dx) { + int64_t out = base + slot++; + out_q.prim_id[out] = static_cast(fid); + out_q.level[out] = level; + out_q.i[out] = root_i + dx; + out_q.j[out] = root_j + dy; + out_q.k[out] = root_k + dz; + } + } + } +} + +__global__ void kernel_init_edges_and_emit_root27( + const float* __restrict__ vertices, + const int32_t* __restrict__ edges, + const int32_t* __restrict__ leaf_ix, + const int32_t* __restrict__ leaf_iy, + const int32_t* __restrict__ leaf_iz, + int64_t num_edges, + int d, + EdgeDesc* __restrict__ edge_desc, + JobQueue out_q) { + int64_t eid = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + if (eid >= num_edges) { + return; + } + + int v0_id = edges[2 * eid + 
0]; + int v1_id = edges[2 * eid + 1]; + + float3 p0 = make_float3(vertices[3 * v0_id + 0], vertices[3 * v0_id + 1], vertices[3 * v0_id + 2]); + float3 p1 = make_float3(vertices[3 * v1_id + 0], vertices[3 * v1_id + 1], vertices[3 * v1_id + 2]); + + EdgeDesc ed; + ed.p0 = p0; + ed.p1 = p1; + ed.seg = sub3(p1, p0); + ed.seg_len = sqrtf(dot3(ed.seg, ed.seg)); + ed.dir_unit = (ed.seg_len >= 1.0e-6f) ? mul3(ed.seg, 1.0f / ed.seg_len) : make_float3(0.0f, 0.0f, 0.0f); + ed.seg_bmin = min3(p0, p1); + ed.seg_bmax = max3(p0, p1); + edge_desc[eid] = ed; + + uint8_t level; + int root_i, root_j, root_k; + compute_edge_min_level_and_root( + leaf_ix[v0_id], leaf_iy[v0_id], leaf_iz[v0_id], + leaf_ix[v1_id], leaf_iy[v1_id], leaf_iz[v1_id], + d, + level, + root_i, + root_j, + root_k); + + int64_t base = static_cast(kRootNeighborCount) * eid; + int slot = 0; + for (int dz = -1; dz <= 1; ++dz) { + for (int dy = -1; dy <= 1; ++dy) { + for (int dx = -1; dx <= 1; ++dx) { + int64_t out = base + slot++; + out_q.prim_id[out] = static_cast(eid); + out_q.level[out] = level; + out_q.i[out] = root_i + dx; + out_q.j[out] = root_j + dy; + out_q.k[out] = root_k + dz; + } + } + } +} + +__global__ void kernel_count_face_jobs( + JobQueue curr_q, + const FaceDesc* __restrict__ face_desc, + int d, + int3 grid_min, + int3 grid_size, + float3 voxel_size, + uint8_t* __restrict__ job_hit, + int32_t* __restrict__ child_count, + int32_t* __restrict__ result_count) { + int64_t idx = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + if (idx >= curr_q.size) { + return; + } + + int fid = curr_q.prim_id[idx]; + int level = static_cast(curr_q.level[idx]); + int i = curr_q.i[idx]; + int j = curr_q.j[idx]; + int k = curr_q.k[idx]; + + if (!node_intersects_valid_domain(d, level, i, j, k, grid_size)) { + job_hit[idx] = 0; + child_count[idx] = 0; + result_count[idx] = 0; + return; + } + + float3 box_min, box_size, box_max; + compute_node_box_world(d, level, i, j, k, grid_min, voxel_size, box_min, box_size, box_max); 
+ + bool hit = face_qef_style_triangle_box_hit(face_desc[fid], box_min, box_size, box_max); + job_hit[idx] = static_cast(hit ? 1 : 0); + + if (!hit) { + child_count[idx] = 0; + result_count[idx] = 0; + } else if (level < d) { + child_count[idx] = 8; + result_count[idx] = 0; + } else { + child_count[idx] = 0; + result_count[idx] = 1; + } +} + +__global__ void kernel_count_edge_jobs( + JobQueue curr_q, + const EdgeDesc* __restrict__ edge_desc, + int d, + int3 grid_min, + int3 grid_size, + float3 voxel_size, + uint8_t* __restrict__ job_hit, + int32_t* __restrict__ child_count, + int32_t* __restrict__ result_count) { + int64_t idx = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + if (idx >= curr_q.size) { + return; + } + + int eid = curr_q.prim_id[idx]; + int level = static_cast(curr_q.level[idx]); + int i = curr_q.i[idx]; + int j = curr_q.j[idx]; + int k = curr_q.k[idx]; + + if (!node_intersects_valid_domain(d, level, i, j, k, grid_size)) { + job_hit[idx] = 0; + child_count[idx] = 0; + result_count[idx] = 0; + return; + } + + float3 box_min, box_size, box_max; + compute_node_box_world(d, level, i, j, k, grid_min, voxel_size, box_min, box_size, box_max); + + bool hit = segment_box_overlap_world(edge_desc[eid], box_min, box_max); + job_hit[idx] = static_cast(hit ? 
1 : 0); + + if (!hit) { + child_count[idx] = 0; + result_count[idx] = 0; + } else if (level < d) { + child_count[idx] = 8; + result_count[idx] = 0; + } else { + child_count[idx] = 0; + result_count[idx] = 1; + } +} + +__global__ void kernel_emit_jobs( + JobQueue curr_q, + const uint8_t* __restrict__ job_hit, + const int32_t* __restrict__ child_offset, + const int32_t* __restrict__ result_offset, + int d, + JobQueue next_q, + ResultBuffer out_res) { + int64_t idx = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + if (idx >= curr_q.size) { + return; + } + if (job_hit[idx] == 0) { + return; + } + + int prim_id = curr_q.prim_id[idx]; + int level = static_cast(curr_q.level[idx]); + int i = curr_q.i[idx]; + int j = curr_q.j[idx]; + int k = curr_q.k[idx]; + + if (level < d) { + int32_t base = child_offset[idx]; + int child_level = level + 1; + int slot = 0; + for (int bz = 0; bz < 2; ++bz) { + for (int by = 0; by < 2; ++by) { + for (int bx = 0; bx < 2; ++bx) { + int32_t out = base + slot++; + next_q.prim_id[out] = prim_id; + next_q.level[out] = static_cast(child_level); + next_q.i[out] = 2 * i + bx; + next_q.j[out] = 2 * j + by; + next_q.k[out] = 2 * k + bz; + } + } + } + } else { + int32_t out = result_offset[idx]; + out_res.prim_id[out] = prim_id; + out_res.vi[out] = i; + out_res.vj[out] = j; + out_res.vk[out] = k; + } +} + +inline void reset_output(DeviceResult& out) { + out.prim_id = nullptr; + out.voxel_i = nullptr; + out.voxel_j = nullptr; + out.voxel_k = nullptr; + out.size = 0; +} + +inline void release_device_result(DeviceResult& out) { + free_ptr(out.prim_id); + free_ptr(out.voxel_i); + free_ptr(out.voxel_j); + free_ptr(out.voxel_k); + out = {}; +} + +} // namespace voxelize_oct_impl + + +namespace { + +inline fdg_gpu::PrimitivePairResult to_primitive_pair(voxelize_oct_impl::DeviceResult&& r) { + fdg_gpu::PrimitivePairResult out; + out.size = r.size; + out.prim_id.adopt(r.prim_id, r.size); + out.voxel_i.adopt(r.voxel_i, r.size); + 
out.voxel_j.adopt(r.voxel_j, r.size); + out.voxel_k.adopt(r.voxel_k, r.size); + r.prim_id = nullptr; + r.voxel_i = nullptr; + r.voxel_j = nullptr; + r.voxel_k = nullptr; + r.size = 0; + return out; +} + +struct SurfaceLookup { + int64_t size = 0; + fdg_gpu::DeviceBuffer keys_sorted; + fdg_gpu::DeviceBuffer ids_sorted; +}; + +struct FacePairKeys { + int64_t size = 0; + fdg_gpu::DeviceBuffer keys; +}; + +struct FaceContribStream { + int64_t size = 0; + fdg_gpu::DeviceBuffer voxel_id; + fdg_gpu::DeviceBuffer qef; +}; + +__host__ __device__ inline fdg_gpu::SymQEF10 symqef10_zero() { + return fdg_gpu::SymQEF10{0,0,0,0,0,0,0,0,0,0}; +} + +struct SymQEF10Add { + __host__ __device__ fdg_gpu::SymQEF10 operator()(const fdg_gpu::SymQEF10& a, const fdg_gpu::SymQEF10& b) const { + return fdg_gpu::SymQEF10{ + a.q00 + b.q00, a.q01 + b.q01, a.q02 + b.q02, a.q03 + b.q03, + a.q11 + b.q11, a.q12 + b.q12, a.q13 + b.q13, + a.q22 + b.q22, a.q23 + b.q23, + a.q33 + b.q33}; + } +}; + +__host__ __device__ inline uint64_t pack_pair_key(int32_t voxel_id, int32_t face_id) { + return (static_cast(static_cast(voxel_id)) << 32) | + static_cast(face_id); +} + +__host__ __device__ inline int32_t unpack_pair_voxel_id(uint64_t k) { + return static_cast(k >> 32); +} + +__host__ __device__ inline int32_t unpack_pair_face_id(uint64_t k) { + return static_cast(k & 0xffffffffu); +} + +__host__ __device__ inline fdg_gpu::SymQEF10 symqef10_from_plane(float4 p) { + const float a = p.x, b = p.y, c = p.z, d = p.w; + return fdg_gpu::SymQEF10{ + a*a, a*b, a*c, a*d, + b*b, b*c, b*d, + c*c, c*d, + d*d + }; +} + +__global__ void build_synth_faces_kernel(int64_t num_triangles, int32_t* __restrict__ faces) { + int64_t fid = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + if (fid >= num_triangles) return; + faces[3 * fid + 0] = static_cast(3 * fid + 0); + faces[3 * fid + 1] = static_cast(3 * fid + 1); + faces[3 * fid + 2] = static_cast(3 * fid + 2); +} + +__global__ void build_surface_keys_kernel( + const int* 
__restrict__ voxels, + int64_t num_voxels, + fdg_gpu::int3_ grid_min, + fdg_gpu::int3_ grid_max, + uint64_t* __restrict__ keys, + int32_t* __restrict__ ids) { + int64_t i = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + if (i >= num_voxels) return; + int x = voxels[3 * i + 0]; + int y = voxels[3 * i + 1]; + int z = voxels[3 * i + 2]; + keys[i] = fdg_gpu::pack_voxel_key(x, y, z, grid_min, grid_max); + ids[i] = static_cast(i); +} + +__global__ void build_raw_pair_keys_kernel( + const int32_t* __restrict__ voxel_i, + const int32_t* __restrict__ voxel_j, + const int32_t* __restrict__ voxel_k, + const int32_t* __restrict__ face_id, + int64_t num_pairs, + fdg_gpu::int3_ grid_min, + fdg_gpu::int3_ grid_max, + uint64_t* __restrict__ voxel_keys, + int32_t* __restrict__ pair_face_ids) { + int64_t i = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + if (i >= num_pairs) return; + const int32_t gx = voxel_i[i] + grid_min.x; + const int32_t gy = voxel_j[i] + grid_min.y; + const int32_t gz = voxel_k[i] + grid_min.z; + voxel_keys[i] = fdg_gpu::pack_voxel_key(gx, gy, gz, grid_min, grid_max); + pair_face_ids[i] = face_id[i]; +} + +__device__ inline int lower_bound_u64(const uint64_t* arr, int64_t n, uint64_t key) { + int64_t lo = 0; + int64_t hi = n; + while (lo < hi) { + int64_t mid = (lo + hi) >> 1; + uint64_t v = arr[mid]; + if (v < key) lo = mid + 1; + else hi = mid; + } + return static_cast(lo); +} + +__global__ void map_pair_to_voxel_id_kernel( + const uint64_t* __restrict__ pair_keys, + const int32_t* __restrict__ pair_face_ids, + int64_t num_pairs, + const uint64_t* __restrict__ surface_keys_sorted, + const int32_t* __restrict__ surface_ids_sorted, + int64_t num_voxels, + int32_t* __restrict__ mapped_voxel_id, + int32_t* __restrict__ mapped_face_id, + int32_t* __restrict__ valid) { + int64_t i = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + if (i >= num_pairs) return; + uint64_t key = pair_keys[i]; + int pos = lower_bound_u64(surface_keys_sorted, 
num_voxels, key); + if (pos < num_voxels && surface_keys_sorted[pos] == key) { + mapped_voxel_id[i] = surface_ids_sorted[pos]; + mapped_face_id[i] = pair_face_ids[i]; + valid[i] = 1; + } else { + mapped_voxel_id[i] = -1; + mapped_face_id[i] = -1; + valid[i] = 0; + } +} + +__global__ void compact_valid_pairs_kernel( + const int32_t* __restrict__ mapped_voxel_id, + const int32_t* __restrict__ mapped_face_id, + const int32_t* __restrict__ valid, + const int32_t* __restrict__ offsets, + int64_t num_pairs, + uint64_t* __restrict__ pair_keys_out) { + int64_t i = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + if (i >= num_pairs || valid[i] == 0) return; + int32_t out = offsets[i]; + pair_keys_out[out] = pack_pair_key(mapped_voxel_id[i], mapped_face_id[i]); +} + +__global__ void build_face_qef_contrib_kernel( + const uint64_t* __restrict__ pair_keys, + int64_t num_pairs, + const float* __restrict__ vertices, + const int32_t* __restrict__ faces, + int32_t* __restrict__ voxel_id_out, + fdg_gpu::SymQEF10* __restrict__ qef_out) { + int64_t i = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + if (i >= num_pairs) return; + + int32_t voxel_id = unpack_pair_voxel_id(pair_keys[i]); + int32_t fid = unpack_pair_face_id(pair_keys[i]); + + int32_t i0 = faces[3 * fid + 0]; + int32_t i1 = faces[3 * fid + 1]; + int32_t i2 = faces[3 * fid + 2]; + + float3 v0 = make_float3(vertices[3 * i0 + 0], vertices[3 * i0 + 1], vertices[3 * i0 + 2]); + float3 v1 = make_float3(vertices[3 * i1 + 0], vertices[3 * i1 + 1], vertices[3 * i1 + 2]); + float3 v2 = make_float3(vertices[3 * i2 + 0], vertices[3 * i2 + 1], vertices[3 * i2 + 2]); + + float3 e0 = voxelize_oct_impl::sub3(v1, v0); + float3 e1 = voxelize_oct_impl::sub3(v2, v1); + float3 n = voxelize_oct_impl::normalize3(voxelize_oct_impl::cross3(e0, e1)); + float4 plane = make_float4(n.x, n.y, n.z, -voxelize_oct_impl::dot3(n, v0)); + + voxel_id_out[i] = voxel_id; + qef_out[i] = symqef10_from_plane(plane); +} + +__global__ void 
scatter_reduced_face_qef_kernel( + const int32_t* __restrict__ reduced_voxel_id, + const fdg_gpu::SymQEF10* __restrict__ reduced_qef, + int64_t num_reduced, + fdg_gpu::SymQEF10* __restrict__ full_qefs) { + int64_t i = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + if (i >= num_reduced) return; + full_qefs[reduced_voxel_id[i]] = reduced_qef[i]; +} + +inline SurfaceLookup build_surface_lookup( + const int* voxels, + int64_t num_voxels, + fdg_gpu::int3_ grid_min, + fdg_gpu::int3_ grid_max, + cudaStream_t stream) { + SurfaceLookup out; + out.size = num_voxels; + out.keys_sorted.allocate(num_voxels); + out.ids_sorted.allocate(num_voxels); + constexpr int kBlock = 256; + build_surface_keys_kernel<<>>( + voxels, num_voxels, grid_min, grid_max, out.keys_sorted.data(), out.ids_sorted.data()); + fdg_gpu::throw_cuda_error(cudaGetLastError(), "build_surface_keys_kernel"); + thrust::device_ptr kptr(out.keys_sorted.data()); + thrust::device_ptr iptr(out.ids_sorted.data()); + thrust::sort_by_key(thrust::cuda::par.on(stream), kptr, kptr + num_voxels, iptr); + return out; +} + +inline FacePairKeys map_and_unique_face_pairs( + const fdg_gpu::PrimitivePairResult& raw_pairs, + const SurfaceLookup& lookup, + fdg_gpu::int3_ grid_min, + fdg_gpu::int3_ grid_max, + cudaStream_t stream) { + FacePairKeys out; + const int64_t N = raw_pairs.size; + if (N == 0) return out; + fdg_gpu::DeviceBuffer pair_voxel_keys(N); + fdg_gpu::DeviceBuffer pair_face_ids(N); + fdg_gpu::DeviceBuffer mapped_voxel_id(N); + fdg_gpu::DeviceBuffer mapped_face_id(N); + fdg_gpu::DeviceBuffer valid(N); + fdg_gpu::DeviceBuffer offsets(N); + constexpr int kBlock = 256; + + build_raw_pair_keys_kernel<<>>( + raw_pairs.voxel_i.data(), raw_pairs.voxel_j.data(), raw_pairs.voxel_k.data(), raw_pairs.prim_id.data(), + N, grid_min, grid_max, pair_voxel_keys.data(), pair_face_ids.data()); + fdg_gpu::throw_cuda_error(cudaGetLastError(), "build_raw_pair_keys_kernel"); + + map_pair_to_voxel_id_kernel<<>>( + 
pair_voxel_keys.data(), pair_face_ids.data(), N, + lookup.keys_sorted.data(), lookup.ids_sorted.data(), lookup.size, + mapped_voxel_id.data(), mapped_face_id.data(), valid.data()); + fdg_gpu::throw_cuda_error(cudaGetLastError(), "map_pair_to_voxel_id_kernel"); + + void* temp = nullptr; + size_t temp_bytes = 0; + VOX_CUDA_CHECK(cub::DeviceScan::ExclusiveSum(nullptr, temp_bytes, valid.data(), offsets.data(), static_cast(N), stream)); + VOX_CUDA_CHECK(cudaMalloc(&temp, temp_bytes)); + VOX_CUDA_CHECK(cub::DeviceScan::ExclusiveSum(temp, temp_bytes, valid.data(), offsets.data(), static_cast(N), stream)); + int32_t last_off = voxelize_oct_impl::copy_last_i32(offsets.data(), N, stream); + int32_t last_valid = voxelize_oct_impl::copy_last_i32(valid.data(), N, stream); + int64_t M = static_cast(last_off + last_valid); + cudaFree(temp); + + out.size = M; + out.keys.allocate(M); + compact_valid_pairs_kernel<<>>( + mapped_voxel_id.data(), mapped_face_id.data(), valid.data(), offsets.data(), N, out.keys.data()); + fdg_gpu::throw_cuda_error(cudaGetLastError(), "compact_valid_pairs_kernel"); + + thrust::device_ptr kptr(out.keys.data()); + thrust::sort(thrust::cuda::par.on(stream), kptr, kptr + M); + auto new_end = thrust::unique(thrust::cuda::par.on(stream), kptr, kptr + M); + out.size = static_cast(new_end - kptr); + return out; +} + +inline FaceContribStream build_face_contrib_stream( + const FacePairKeys& pair_keys, + const float* triangles_world, + const int32_t* faces_synth, + cudaStream_t stream) { + FaceContribStream out; + out.size = pair_keys.size; + if (out.size == 0) return out; + out.voxel_id.allocate(out.size); + out.qef.allocate(out.size); + constexpr int kBlock = 256; + build_face_qef_contrib_kernel<<>>( + pair_keys.keys.data(), out.size, triangles_world, faces_synth, out.voxel_id.data(), out.qef.data()); + fdg_gpu::throw_cuda_error(cudaGetLastError(), "build_face_qef_contrib_kernel"); + return out; +} + +inline oct_pairs::FaceQEFResult reduce_face_contribs( + 
FaceContribStream&& contrib, + int64_t num_voxels, + cudaStream_t stream) { + oct_pairs::FaceQEFResult out; + out.size = num_voxels; + out.qefs.allocate(num_voxels); + out.qefs.clear_async(stream); + if (contrib.size == 0) return out; + + thrust::device_ptr kptr(contrib.voxel_id.data()); + thrust::device_ptr vptr(contrib.qef.data()); + thrust::sort_by_key(thrust::cuda::par.on(stream), kptr, kptr + contrib.size, vptr); + + fdg_gpu::DeviceBuffer reduced_ids(contrib.size); + fdg_gpu::DeviceBuffer reduced_qefs(contrib.size); + auto end_pair = thrust::reduce_by_key( + thrust::cuda::par.on(stream), + kptr, kptr + contrib.size, + vptr, + thrust::device_pointer_cast(reduced_ids.data()), + thrust::device_pointer_cast(reduced_qefs.data()), + thrust::equal_to(), + SymQEF10Add()); + int64_t M = end_pair.first - thrust::device_pointer_cast(reduced_ids.data()); + + constexpr int kBlock = 256; + scatter_reduced_face_qef_kernel<<>>( + reduced_ids.data(), reduced_qefs.data(), M, out.qefs.data()); + fdg_gpu::throw_cuda_error(cudaGetLastError(), "scatter_reduced_face_qef_kernel"); + return out; +} + +} // anonymous namespace + +namespace oct_pairs { + +fdg_gpu::PrimitivePairResult voxelize_mesh_oct_gpu( + const float* d_vertices, + int64_t num_vertices, + const int32_t* d_faces, + int64_t num_faces, + fdg_gpu::int3_ grid_min_, + fdg_gpu::int3_ grid_size_, + float3 voxel_size, + cudaStream_t stream) { + using namespace voxelize_oct_impl; + int3 grid_min{grid_min_.x, grid_min_.y, grid_min_.z}; + int3 grid_size{grid_size_.x, grid_size_.y, grid_size_.z}; + if (d_vertices == nullptr || d_faces == nullptr || num_vertices < 0 || num_faces < 0) { + throw std::invalid_argument("invalid mesh inputs"); + } + if (grid_size.x <= 0 || grid_size.y <= 0 || grid_size.z <= 0) { + throw std::invalid_argument("invalid grid_size"); + } + if (!(voxel_size.x > 0.0f && voxel_size.y > 0.0f && voxel_size.z > 0.0f)) { + throw std::invalid_argument("invalid voxel_size"); + } + if (num_vertices == 0 || num_faces 
== 0) { + return {}; + } + const int d = compute_grid_depth_from_grid_size(grid_size); + if (d < 0 || d > 21) throw std::invalid_argument("grid depth exceeds 21"); + const float3 inv_voxel_size = reciprocal_voxel_size(voxel_size); + + VoxelizeWorkspace ws; + FaceDesc* face_desc = nullptr; + DeviceResult gathered; reset_output(gathered); + + alloc_i32(&ws.leaf_ix, num_vertices); + alloc_i32(&ws.leaf_iy, num_vertices); + alloc_i32(&ws.leaf_iz, num_vertices); + alloc_face_desc(&face_desc, num_faces); + + kernel_build_leaf_coords<<>>( + d_vertices, num_vertices, inv_voxel_size, grid_min, grid_size, ws.leaf_ix, ws.leaf_iy, ws.leaf_iz); + VOX_CUDA_CHECK(cudaGetLastError()); + + ensure_job_queue_capacity(ws.queue_a, static_cast(kRootNeighborCount) * num_faces); + ws.queue_a.size = static_cast(kRootNeighborCount) * num_faces; + + kernel_init_faces_and_emit_root27<<>>( + d_vertices, d_faces, ws.leaf_ix, ws.leaf_iy, ws.leaf_iz, num_faces, d, face_desc, ws.queue_a); + VOX_CUDA_CHECK(cudaGetLastError()); + + JobQueue* curr = &ws.queue_a; JobQueue* next = &ws.queue_b; + while (curr->size > 0) { + int64_t nj = curr->size; + ensure_round_capacity(ws.round, nj); + kernel_count_face_jobs<<>>( + *curr, face_desc, d, grid_min, grid_size, voxel_size, ws.round.job_hit, ws.round.child_count, ws.round.result_count); + VOX_CUDA_CHECK(cudaGetLastError()); + exclusive_scan_i32(ws.round, ws.round.child_count, ws.round.child_offset, nj, stream); + exclusive_scan_i32(ws.round, ws.round.result_count, ws.round.result_offset, nj, stream); + const int32_t num_children_total = copy_last_i32(ws.round.child_offset, nj, stream) + copy_last_i32(ws.round.child_count, nj, stream); + const int32_t num_results_total = copy_last_i32(ws.round.result_offset, nj, stream) + copy_last_i32(ws.round.result_count, nj, stream); + ensure_job_queue_capacity(*next, num_children_total); + next->size = num_children_total; + ResultBuffer round_res = make_result_buffer(num_results_total); + kernel_emit_jobs<<>>( + *curr, 
ws.round.job_hit, ws.round.child_offset, ws.round.result_offset, d, *next, round_res); + VOX_CUDA_CHECK(cudaGetLastError()); + if (num_results_total > 0) ws.result_rounds.push_back(round_res); + std::swap(curr, next); + } + gathered = gather_result_rounds(ws.result_rounds, stream); + free_ptr(face_desc); + release_workspace(ws); + return to_primitive_pair(std::move(gathered)); +} + +fdg_gpu::PrimitivePairResult voxelize_edge_oct_gpu( + const float* d_vertices, + int64_t num_vertices, + const int32_t* d_edges, + int64_t num_edges, + fdg_gpu::int3_ grid_min_, + fdg_gpu::int3_ grid_size_, + float3 voxel_size, + cudaStream_t stream) { + using namespace voxelize_oct_impl; + int3 grid_min{grid_min_.x, grid_min_.y, grid_min_.z}; + int3 grid_size{grid_size_.x, grid_size_.y, grid_size_.z}; + if (d_vertices == nullptr || d_edges == nullptr || num_vertices < 0 || num_edges < 0) { + throw std::invalid_argument("invalid edge inputs"); + } + if (grid_size.x <= 0 || grid_size.y <= 0 || grid_size.z <= 0) { + throw std::invalid_argument("invalid grid_size"); + } + if (!(voxel_size.x > 0.0f && voxel_size.y > 0.0f && voxel_size.z > 0.0f)) { + throw std::invalid_argument("invalid voxel_size"); + } + if (num_vertices == 0 || num_edges == 0) { + return {}; + } + const int d = compute_grid_depth_from_grid_size(grid_size); + if (d < 0 || d > 21) throw std::invalid_argument("grid depth exceeds 21"); + const float3 inv_voxel_size = reciprocal_voxel_size(voxel_size); + + VoxelizeWorkspace ws; + EdgeDesc* edge_desc = nullptr; + DeviceResult gathered; reset_output(gathered); + + alloc_i32(&ws.leaf_ix, num_vertices); + alloc_i32(&ws.leaf_iy, num_vertices); + alloc_i32(&ws.leaf_iz, num_vertices); + alloc_edge_desc(&edge_desc, num_edges); + kernel_build_leaf_coords<<>>( + d_vertices, num_vertices, inv_voxel_size, grid_min, grid_size, ws.leaf_ix, ws.leaf_iy, ws.leaf_iz); + VOX_CUDA_CHECK(cudaGetLastError()); + ensure_job_queue_capacity(ws.queue_a, static_cast(kRootNeighborCount) * num_edges); + 
ws.queue_a.size = static_cast(kRootNeighborCount) * num_edges; + kernel_init_edges_and_emit_root27<<>>( + d_vertices, d_edges, ws.leaf_ix, ws.leaf_iy, ws.leaf_iz, num_edges, d, edge_desc, ws.queue_a); + VOX_CUDA_CHECK(cudaGetLastError()); + JobQueue* curr = &ws.queue_a; JobQueue* next = &ws.queue_b; + while (curr->size > 0) { + int64_t nj = curr->size; + ensure_round_capacity(ws.round, nj); + kernel_count_edge_jobs<<>>( + *curr, edge_desc, d, grid_min, grid_size, voxel_size, ws.round.job_hit, ws.round.child_count, ws.round.result_count); + VOX_CUDA_CHECK(cudaGetLastError()); + exclusive_scan_i32(ws.round, ws.round.child_count, ws.round.child_offset, nj, stream); + exclusive_scan_i32(ws.round, ws.round.result_count, ws.round.result_offset, nj, stream); + const int32_t num_children_total = copy_last_i32(ws.round.child_offset, nj, stream) + copy_last_i32(ws.round.child_count, nj, stream); + const int32_t num_results_total = copy_last_i32(ws.round.result_offset, nj, stream) + copy_last_i32(ws.round.result_count, nj, stream); + ensure_job_queue_capacity(*next, num_children_total); + next->size = num_children_total; + ResultBuffer round_res = make_result_buffer(num_results_total); + kernel_emit_jobs<<>>( + *curr, ws.round.job_hit, ws.round.child_offset, ws.round.result_offset, d, *next, round_res); + VOX_CUDA_CHECK(cudaGetLastError()); + if (num_results_total > 0) ws.result_rounds.push_back(round_res); + std::swap(curr, next); + } + gathered = gather_result_rounds(ws.result_rounds, stream); + free_ptr(edge_desc); + release_workspace(ws); + return to_primitive_pair(std::move(gathered)); +} + +FaceQEFResult face_qef_gpu( + float3 voxel_size, + fdg_gpu::int3_ grid_min, + fdg_gpu::int3_ grid_max, + const float* triangles, + int64_t num_triangles, + const int* voxels, + int64_t num_voxels, + cudaStream_t stream) { + FaceQEFResult out; + out.size = num_voxels; + out.qefs.allocate(num_voxels); + out.qefs.clear_async(stream); + if (num_voxels == 0 || num_triangles == 0) return 
out;
  if (triangles == nullptr || voxels == nullptr) throw std::invalid_argument("null face_qef inputs");

  // Triangle soup: synthesize the trivial index buffer [0 .. 3*num_triangles).
  const int64_t num_tri_vertices = num_triangles * 3;
  fdg_gpu::DeviceBuffer<int32_t> faces_synth(num_triangles * 3);

  constexpr int kBlock = 256;
  // NOTE(review): launch configuration reconstructed; the mangled diff lost
  // the <<<...>>> arguments.
  build_synth_faces_kernel<<<(num_triangles + kBlock - 1) / kBlock, kBlock, 0, stream>>>(
      num_triangles, faces_synth.data());
  fdg_gpu::throw_cuda_error(cudaGetLastError(), "build_synth_faces_kernel");

  // NOTE(review): grid_size = grid_max - grid_min assumes grid_max is
  // exclusive; confirm against pack_voxel_key's convention.
  fdg_gpu::int3_ grid_size{grid_max.x - grid_min.x, grid_max.y - grid_min.y, grid_max.z - grid_min.z};
  auto raw_pairs = voxelize_mesh_oct_gpu(
      triangles, num_tri_vertices, faces_synth.data(), num_triangles, grid_min, grid_size, voxel_size, stream);

  auto lookup = build_surface_lookup(voxels, num_voxels, grid_min, grid_max, stream);
  auto face_pair_keys = map_and_unique_face_pairs(raw_pairs, lookup, grid_min, grid_max, stream);
  auto contrib = build_face_contrib_stream(face_pair_keys, triangles, faces_synth.data(), stream);
  // NOTE(review): the qefs buffer allocated at the top of this function is
  // discarded here — reduce_face_contribs allocates its own dense result.
  // Consider passing `out` through to avoid the duplicate allocation.
  return reduce_face_contribs(std::move(contrib), num_voxels, stream);
}

}  // namespace oct_pairs
diff --git a/o-voxel/src/convert/mesh_to_flexible_dual_grid_gpu/voxelize_mesh_oct.h b/o-voxel/src/convert/mesh_to_flexible_dual_grid_gpu/voxelize_mesh_oct.h
new file mode 100644
index 00000000..20195901
--- /dev/null
+++ b/o-voxel/src/convert/mesh_to_flexible_dual_grid_gpu/voxelize_mesh_oct.h
@@ -0,0 +1,44 @@
+#pragma once
+
+#include "fdg_gpu_common.h"
+#include <cstdint>
+#include <cuda_runtime.h>
+
+namespace oct_pairs {
+
+fdg_gpu::PrimitivePairResult voxelize_mesh_oct_gpu(
+    const float* vertices,
+    int64_t num_vertices,
+    const int32_t* faces,
+    int64_t num_faces,
+    fdg_gpu::int3_ grid_min,
+    fdg_gpu::int3_ grid_size,
+    float3 voxel_size,
+    cudaStream_t stream = nullptr);
+
+fdg_gpu::PrimitivePairResult voxelize_edge_oct_gpu(
+    const float* vertices,
+    int64_t num_vertices,
+    const int32_t* edges,
+    int64_t num_edges,
+    fdg_gpu::int3_ grid_min,
+    fdg_gpu::int3_ grid_size,
+    float3 voxel_size,
+    cudaStream_t stream = nullptr);
+
+struct FaceQEFResult {
+  int64_t size = 0;
+  fdg_gpu::DeviceBuffer<fdg_gpu::SymQEF10> qefs;
+};
+
+FaceQEFResult face_qef_gpu(
+    float3 voxel_size,
+    fdg_gpu::int3_ grid_min,
+    fdg_gpu::int3_ grid_max,
+    const float* triangles,
+    int64_t num_triangles,
+    const int* voxels,
+    int64_t num_voxels,
+    cudaStream_t stream = nullptr);
+
+} // namespace oct_pairs
diff --git a/o-voxel/src/ext.cpp b/o-voxel/src/ext.cpp
index e2ac946d..b23071f8 100644
--- a/o-voxel/src/ext.cpp
+++ b/o-voxel/src/ext.cpp
@@ -15,6 +15,17 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
     m.def("hashmap_insert_3d_idx_as_val_cuda", &hashmap_insert_3d_idx_as_val_cuda);
     // Convert functions
     m.def("mesh_to_flexible_dual_grid_cpu", &mesh_to_flexible_dual_grid_cpu, py::call_guard<py::gil_scoped_release>());
+    m.def("mesh_to_flexible_dual_grid_gpu", &mesh_to_flexible_dual_grid_gpu, py::call_guard<py::gil_scoped_release>());
+    m.def("intersect_qef_cpu", &intersect_qef_cpu, py::call_guard<py::gil_scoped_release>());
+    m.def("intersection_occ_gpu", &intersection_occ_gpu, py::call_guard<py::gil_scoped_release>());
+    m.def("intersect_qef_gpu", &intersect_qef_gpu, py::call_guard<py::gil_scoped_release>());
+    m.def("voxelize_mesh_oct_gpu", &voxelize_mesh_oct_gpu, py::call_guard<py::gil_scoped_release>());
+    m.def("voxelize_edge_oct_gpu", &voxelize_edge_oct_gpu, py::call_guard<py::gil_scoped_release>());
+    m.def("face_qef_cpu", &face_qef_cpu, py::call_guard<py::gil_scoped_release>());
+    m.def("face_qef_gpu", &face_qef_gpu, py::call_guard<py::gil_scoped_release>());
+    m.def("voxel_traverse_edge_dda_gpu", &voxel_traverse_edge_dda_gpu, py::call_guard<py::gil_scoped_release>());
+    m.def("boundary_qef_cpu", &boundary_qef_cpu, py::call_guard<py::gil_scoped_release>());
+    m.def("boundary_qef_gpu", &boundary_qef_gpu, py::call_guard<py::gil_scoped_release>());
     m.def("textured_mesh_to_volumetric_attr_cpu", &textured_mesh_to_volumetric_attr_cpu, py::call_guard<py::gil_scoped_release>());
     // Serialization functions
     m.def("z_order_encode_cpu", &z_order_encode_cpu, py::call_guard<py::gil_scoped_release>());
@@ -34,4 +45,4 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
     m.def("decode_sparse_voxel_octree_attr_neighbor_cpu", &decode_sparse_voxel_octree_attr_neighbor_cpu, py::call_guard<py::gil_scoped_release>());
     // Rasterization functions
     m.def("rasterize_voxels_cuda", &rasterize_voxels_cuda);
-}
\ No newline at end of file
+}