Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file added o-voxel/notebooks/test_shape.glb
Binary file not shown.
1,313 changes: 1,313 additions & 0 deletions o-voxel/notebooks/verify_decoded_shape.ipynb

Large diffs are not rendered by default.

708 changes: 708 additions & 0 deletions o-voxel/notebooks/verify_fdg_jit_cpu_gpu.ipynb

Large diffs are not rendered by default.

815 changes: 815 additions & 0 deletions o-voxel/notebooks/verify_fdg_stage_profile_jit_cpu_gpu.ipynb

Large diffs are not rendered by default.

502 changes: 502 additions & 0 deletions o-voxel/notebooks/verify_voxelize_edge_jit_oct_vs_dda.ipynb

Large diffs are not rendered by default.

470 changes: 470 additions & 0 deletions o-voxel/notebooks/verify_voxelize_mesh_gpu_jit_open3d.ipynb

Large diffs are not rendered by default.

322 changes: 304 additions & 18 deletions o-voxel/o_voxel/convert/flexible_dual_grid.py

Large diffs are not rendered by default.

5 changes: 5 additions & 0 deletions o-voxel/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,11 @@
# Convert functions
"src/convert/flexible_dual_grid.cpp",
"src/convert/volumetic_attr.cpp",
"src/convert/mesh_to_flexible_dual_grid_gpu/torch_bindings.cu",
"src/convert/mesh_to_flexible_dual_grid_gpu/flexible_dual_grid_gpu.cu",
"src/convert/mesh_to_flexible_dual_grid_gpu/intersection_qef.cu",
"src/convert/mesh_to_flexible_dual_grid_gpu/voxelize_mesh_oct.cu",
"src/convert/mesh_to_flexible_dual_grid_gpu/voxel_traverse_edge_dda.cu",
## Serialization functions
"src/serialize/api.cu",
"src/serialize/hilbert.cu",
Expand Down
129 changes: 129 additions & 0 deletions o-voxel/src/convert/api.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,135 @@ std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> mesh_to_flexible_dual_gr
);


/**
 * @brief Extract a flexible dual grid from a triangle mesh (CUDA backend).
 *
 * GPU counterpart of mesh_to_flexible_dual_grid above; returns the same
 * three-tensor tuple layout as the CPU version.
 *
 * @param vertices   Mesh vertex positions.
 * @param faces      Triangle vertex-index tensor.
 * @param voxel_size Voxel edge size tensor.
 * @param grid_range Voxel-grid bounds tensor.
 * @param face_weight           Scale applied to the face QEF contribution.
 * @param boundary_weight       Scale applied to the boundary QEF contribution.
 * @param regularization_weight Scale applied to the regularization QEF term.
 * @param intersect_chunk_triangles Triangles processed per chunk during the
 *        intersection pass (memory vs. throughput trade-off).
 * @param boundary_chunk_steps  Chunking granularity for the boundary pass.
 * @return Three tensors; exact shapes/dtypes are defined by the .cu
 *         implementation — NOTE(review): confirm there.
 */
std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> mesh_to_flexible_dual_grid_gpu(
    const torch::Tensor& vertices,
    const torch::Tensor& faces,
    const torch::Tensor& voxel_size,
    const torch::Tensor& grid_range,
    float face_weight,
    float boundary_weight,
    float regularization_weight,
    int64_t intersect_chunk_triangles,
    int boundary_chunk_steps
);


/**
 * @brief Compute triangle/voxel intersection occupancy only (CUDA).
 *
 * @param triangles       Triangle tensor (pre-gathered vertex positions).
 * @param voxel_size      Voxel edge size tensor.
 * @param grid_range      Voxel-grid bounds tensor.
 * @param chunk_triangles Triangles processed per chunk.
 * @return Occupancy tensor (layout defined in the .cu implementation).
 */
torch::Tensor intersection_occ_gpu(
    const torch::Tensor& triangles,
    const torch::Tensor& voxel_size,
    const torch::Tensor& grid_range,
    int64_t chunk_triangles
);


/**
 * @brief Intersect triangles against the grid and build QEF terms (CPU).
 *
 * Reference implementation of intersect_qef_gpu below; same five-tensor
 * return layout.
 */
std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor> intersect_qef_cpu(
    const torch::Tensor& triangles,
    const torch::Tensor& voxel_size,
    const torch::Tensor& grid_range
);


/**
 * @brief Intersect triangles against the grid and build QEF terms (CUDA).
 *
 * @param chunk_triangles Triangles processed per chunk; otherwise identical
 *        in contract to intersect_qef_cpu above.
 */
std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor> intersect_qef_gpu(
    const torch::Tensor& triangles,
    const torch::Tensor& voxel_size,
    const torch::Tensor& grid_range,
    int64_t chunk_triangles
);


/**
 * @brief Octree-based voxelization of mesh faces (CUDA).
 *
 * @return Two tensors — NOTE(review): presumably voxel coordinates plus a
 *         companion tensor; confirm in voxelize_mesh_oct.cu.
 */
std::tuple<torch::Tensor, torch::Tensor> voxelize_mesh_oct_gpu(
    const torch::Tensor& vertices,
    const torch::Tensor& faces,
    const torch::Tensor& voxel_size,
    const torch::Tensor& grid_range
);


/**
 * @brief Octree-based voxelization of mesh edges (CUDA).
 *
 * Same contract as voxelize_mesh_oct_gpu but driven by an edge list instead
 * of a face list.
 */
std::tuple<torch::Tensor, torch::Tensor> voxelize_edge_oct_gpu(
    const torch::Tensor& vertices,
    const torch::Tensor& edges,
    const torch::Tensor& voxel_size,
    const torch::Tensor& grid_range
);


/**
 * @brief Accumulate per-voxel face QEF terms (CPU).
 *
 * @param voxels Voxels to accumulate into (coordinates/ids as defined by the
 *        implementation).
 * @return Accumulated QEF tensor.
 */
torch::Tensor face_qef_cpu(
    const torch::Tensor& triangles,
    const torch::Tensor& voxel_size,
    const torch::Tensor& grid_range,
    const torch::Tensor& voxels
);


/**
 * @brief Accumulate per-voxel face QEF terms (CUDA).
 *
 * GPU counterpart of face_qef_cpu above; identical contract.
 */
torch::Tensor face_qef_gpu(
    const torch::Tensor& triangles,
    const torch::Tensor& voxel_size,
    const torch::Tensor& grid_range,
    const torch::Tensor& voxels
);


/**
 * @brief Traverse edges through the voxel grid with DDA stepping (CUDA).
 *
 * @param chunk_steps Chunking granularity of the traversal.
 * @return Two tensors (traversed voxels and a companion tensor — confirm in
 *         voxel_traverse_edge_dda.cu).
 */
std::tuple<torch::Tensor, torch::Tensor> voxel_traverse_edge_dda_gpu(
    const torch::Tensor& vertices,
    const torch::Tensor& edges,
    const torch::Tensor& voxel_size,
    const torch::Tensor& grid_range,
    int chunk_steps
);


/**
 * @brief Accumulate per-voxel boundary QEF terms (CPU).
 *
 * @param boundaries      Boundary primitives tensor.
 * @param boundary_weight Scale applied to the boundary QEF contribution.
 * @param voxels          Voxels to accumulate into.
 * @return Accumulated QEF tensor.
 */
torch::Tensor boundary_qef_cpu(
    const torch::Tensor& boundaries,
    const torch::Tensor& voxel_size,
    const torch::Tensor& grid_range,
    float boundary_weight,
    const torch::Tensor& voxels
);


/**
 * @brief Accumulate per-voxel boundary QEF terms (CUDA).
 *
 * GPU counterpart of boundary_qef_cpu above, with an extra @p chunk_steps
 * parameter controlling chunking granularity.
 */
torch::Tensor boundary_qef_gpu(
    const torch::Tensor& boundaries,
    const torch::Tensor& voxel_size,
    const torch::Tensor& grid_range,
    float boundary_weight,
    const torch::Tensor& voxels,
    int chunk_steps
);


/**
* Voxelizes a triangle mesh with PBR materials
*
Expand Down
7 changes: 5 additions & 2 deletions o-voxel/src/convert/flexible_dual_grid.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -519,7 +519,11 @@ std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> mesh_to_flexible_dual_gr
// Face QEF computation
if (face_weight > 0.0f) {
start = clock();
face_qef(e_voxel_size, e_grid_min, e_grid_max, triangles, hash_table, qefs);
std::vector<Eigen::Matrix4f> face_qefs(voxels.size(), Eigen::Matrix4f::Zero());
face_qef(e_voxel_size, e_grid_min, e_grid_max, triangles, hash_table, face_qefs);
for (size_t i = 0; i < qefs.size(); ++i) {
qefs[i] += face_weight * face_qefs[i];
}
end = clock();
if (timing) std::cout << "Face QEF computation took " << double(end - start) / CLOCKS_PER_SEC << " seconds." << std::endl;
}
Expand Down Expand Up @@ -772,4 +776,3 @@ std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> mesh_to_flexible_dual_gr
torch::from_blob(intersected.data(), {int(intersected.size()), 3}, torch::kBool).clone()
);
}

152 changes: 152 additions & 0 deletions o-voxel/src/convert/mesh_to_flexible_dual_grid_gpu/fdg_gpu_common.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
#pragma once

#include <cuda_runtime.h>

#include <cstdint>
#include <stdexcept>
#include <string>
#include <utility>

namespace fdg_gpu {

// Converts a non-success CUDA status into a std::runtime_error whose message
// is "<context>: <CUDA error string>"; does nothing when the call succeeded.
inline void throw_cuda_error(cudaError_t error, const char* context) {
  if (error != cudaSuccess) {
    std::string message(context);
    message += ": ";
    message += cudaGetErrorString(error);
    throw std::runtime_error(message);
  }
}

// Plain 2-component integer pair (host/device POD); avoids depending on
// CUDA's built-in int2 in host-only translation units.
struct int2_ {
  int x;
  int y;
};

// 3-component integer vector with array-style access: [0]=x, [1]=y, [2]=z.
struct int3_ {
  int x;
  int y;
  int z;

  // Indexes via pointer arithmetic over the members; relies on x, y, z being
  // laid out contiguously with no padding (true for three ints).
  __host__ __device__ int& operator[](int i) { return (&x)[i]; }
  __host__ __device__ int operator[](int i) const { return (&x)[i]; }
};

// 3-component bool vector using the same member-pointer indexing as int3_.
struct bool3_ {
  bool x;
  bool y;
  bool z;

  __host__ __device__ bool& operator[](int i) { return (&x)[i]; }
  __host__ __device__ bool operator[](int i) const { return (&x)[i]; }
};

/**
 * @brief Move-only RAII owner of a CUDA device allocation holding `size()`
 *        elements of T.
 *
 * Memory comes from cudaMalloc and is returned with cudaFree in the
 * destructor. Copying is deleted; moving transfers ownership and leaves the
 * source empty. CUDA failures surface as std::runtime_error via
 * throw_cuda_error.
 */
template <typename T>
class DeviceBuffer {
public:
  /// Empty buffer: data() == nullptr, size() == 0.
  DeviceBuffer() = default;
  /// Allocates room for `count` elements; throws on allocation failure.
  explicit DeviceBuffer(int64_t count) { allocate(count); }
  ~DeviceBuffer() { release(); }

  DeviceBuffer(const DeviceBuffer&) = delete;
  DeviceBuffer& operator=(const DeviceBuffer&) = delete;

  /// Move: steal the allocation and reset `other` to the empty state.
  DeviceBuffer(DeviceBuffer&& other) noexcept
      : ptr_(other.ptr_), size_(other.size_), owns_(other.owns_) {
    other.ptr_ = nullptr;
    other.size_ = 0;
    other.owns_ = true;
  }

  DeviceBuffer& operator=(DeviceBuffer&& other) noexcept {
    if (this != &other) {
      release();
      ptr_ = other.ptr_;
      size_ = other.size_;
      owns_ = other.owns_;
      other.ptr_ = nullptr;
      other.size_ = 0;
      other.owns_ = true;
    }
    return *this;
  }

  /**
   * Frees any current allocation and allocates `count` elements.
   *
   * @throws std::invalid_argument if count is negative.
   * @throws std::runtime_error if cudaMalloc fails; the buffer is then left
   *         EMPTY (size() == 0), never half-initialized.
   */
  void allocate(int64_t count) {
    if (count < 0) {
      throw std::invalid_argument("DeviceBuffer::allocate count must be non-negative");
    }
    release();
    if (count == 0) return;
    // Allocate into a local first: committing size_ before cudaMalloc
    // succeeds would, on failure, leave size_ > 0 with ptr_ == nullptr —
    // size()/empty() would lie and clear_async() would memset a null pointer.
    T* fresh = nullptr;
    throw_cuda_error(cudaMalloc(reinterpret_cast<void**>(&fresh), static_cast<size_t>(count) * sizeof(T)),
                     "cudaMalloc failed in DeviceBuffer::allocate");
    ptr_ = fresh;
    size_ = count;
    owns_ = true;
  }

  /// Takes ownership of an externally allocated device pointer of `count`
  /// elements; it will be cudaFree'd by this buffer.
  void adopt(T* ptr, int64_t count) {
    release();
    ptr_ = ptr;
    size_ = count;
    owns_ = true;
  }

  /// Asynchronously zero-fills the allocation on `stream`; no-op when empty.
  void clear_async(cudaStream_t stream = nullptr) {
    if (size_ == 0) return;
    throw_cuda_error(cudaMemsetAsync(ptr_, 0, static_cast<size_t>(size_) * sizeof(T), stream),
                     "cudaMemsetAsync failed in DeviceBuffer::clear_async");
  }

  T* data() noexcept { return ptr_; }
  const T* data() const noexcept { return ptr_; }
  int64_t size() const noexcept { return size_; }
  bool empty() const noexcept { return size_ == 0; }

  /// Relinquishes ownership: returns the device pointer (caller must free
  /// it) and resets this buffer to the empty state.
  T* release_ownership() noexcept {
    T* out = ptr_;
    ptr_ = nullptr;
    size_ = 0;
    owns_ = true;
    return out;
  }

private:
  // Frees the allocation if owned; never throws (safe from the destructor).
  void release() noexcept {
    if (ptr_ != nullptr && owns_) {
      cudaFree(ptr_);
    }
    ptr_ = nullptr;
    size_ = 0;
    owns_ = true;
  }

  T* ptr_ = nullptr;  // device pointer (nullptr when empty)
  int64_t size_ = 0;  // element count, not bytes
  // NOTE(review): every code path leaves owns_ == true, so the flag is
  // currently dead state; kept to preserve the existing layout/interface.
  bool owns_ = true;
};

// Upper triangle of a symmetric 4x4 QEF matrix stored as 10 floats
// (row-major upper-triangular order: q00..q03, q11..q13, q22, q23, q33).
struct SymQEF10 {
  float q00, q01, q02, q03;
  float q11, q12, q13;
  float q22, q23;
  float q33;
};

// Parallel device arrays of length `size` pairing primitive ids with the
// (i, j, k) voxel coordinates they touch.
// NOTE(review): presumably filled by the voxelization kernels — confirm in
// the .cu implementations.
struct PrimitivePairResult {
  int64_t size = 0;               // number of valid entries in each array
  DeviceBuffer<int32_t> prim_id;  // primitive (triangle/edge) index
  DeviceBuffer<int32_t> voxel_i;
  DeviceBuffer<int32_t> voxel_j;
  DeviceBuffer<int32_t> voxel_k;
};

// Number of fixed-size blocks needed to cover n items — ceil(n / block) for
// non-negative n — narrowed to int (e.g. for use as a CUDA launch dimension).
__host__ __device__ __forceinline__ int ceil_div_i64(int64_t n, int block) {
  const int64_t rounded_up = (n + block - 1) / block;
  return static_cast<int>(rounded_up);
}

// Flattens a voxel coordinate (x, y, z) into a single 64-bit linear key with
// x varying fastest: key = ux + sx * (uy + sy * uz), where ux/uy/uz are the
// coordinates shifted to be relative to grid_min.
// NOTE(review): the extents sx/sy are grid_max - grid_min, i.e. grid_max is
// treated as an EXCLUSIVE upper bound; if callers pass an inclusive maximum,
// distinct voxels can map to the same key — confirm against the kernels.
// grid_max.z is unused because z is the slowest (unbounded) axis of the key.
__host__ __device__ __forceinline__ uint64_t pack_voxel_key(
    int x, int y, int z, int3_ grid_min, int3_ grid_max) {
  const uint64_t sx = static_cast<uint64_t>(grid_max.x - grid_min.x);
  const uint64_t sy = static_cast<uint64_t>(grid_max.y - grid_min.y);
  const uint64_t ux = static_cast<uint64_t>(x - grid_min.x);
  const uint64_t uy = static_cast<uint64_t>(y - grid_min.y);
  const uint64_t uz = static_cast<uint64_t>(z - grid_min.z);
  return ux + sx * (uy + sy * uz);
}

} // namespace fdg_gpu
Loading