diff --git a/README.md b/README.md
index d00cc87..4233d6d 100644
--- a/README.md
+++ b/README.md
@@ -197,10 +197,22 @@ prediction/your_model
 ├── ...
 ```
 
-3. Run `ray_metrics.py` to evaluate on the RayIoU:
+3. Install `rayiou_metrics` package and use `evaluate_metrics` to evaluate on the RayIoU:
 
 ```
-python ray_metrics.py --pred-dir prediction/your_model
+cd rayiou_metrics
+python setup.py install
+```
+
+```
+# usage example:
+from rayiou_metrics.evaluation import evaluate_metrics
+
+gt_dir_root = 'data/nuscenes'
+data_type = 'occ3d'  # or 'openocc_v2'
+pred_dir = prediction/your_model
+
+metrics = evaluate_metrics(gt_dir_root, pred_dir, data_type)
 ```
 
 ## Timing
diff --git a/ray_metrics.py b/ray_metrics.py
deleted file mode 100644
index 225f4f4..0000000
--- a/ray_metrics.py
+++ /dev/null
@@ -1,69 +0,0 @@
-import os
-import glob
-import mmcv
-import argparse
-import numpy as np
-import torch
-from torch.utils.data import DataLoader
-from loaders.ray_metrics import main_rayiou
-from loaders.ego_pose_dataset import EgoPoseDataset
-from configs.r50_nuimg_704x256_8f import occ_class_names as occ3d_class_names
-from configs.r50_nuimg_704x256_8f_openocc import occ_class_names as openocc_class_names
-
-
-def main(args):
-    data_infos = mmcv.load(os.path.join(args.data_root, 'nuscenes_infos_val.pkl'))['infos']
-    gt_filepaths = sorted(glob.glob(os.path.join(args.data_root, args.data_type, '*/*/*.npz')))
-
-    # retrieve scene_name
-    token2scene = {}
-    for gt_path in gt_filepaths:
-        token = gt_path.split('/')[-2]
-        scene_name = gt_path.split('/')[-3]
-        token2scene[token] = scene_name
-
-    for i in range(len(data_infos)):
-        scene_name = token2scene[data_infos[i]['token']]
-        data_infos[i]['scene_name'] = scene_name
-
-    lidar_origins = []
-    occ_gts = []
-    occ_preds = []
-
-    for idx, batch in enumerate(DataLoader(EgoPoseDataset(data_infos), num_workers=8)):
-        output_origin = batch[1]
-        info = data_infos[idx]
-
-        occ_path = os.path.join(args.data_root, args.data_type, info['scene_name'], info['token'], 'labels.npz')
-        occ_gt = np.load(occ_path, allow_pickle=True)['semantics']
-        occ_gt = np.reshape(occ_gt, [200, 200, 16]).astype(np.uint8)
-
-        occ_path = os.path.join(args.pred_dir, info['token'] + '.npz')
-        occ_pred = np.load(occ_path, allow_pickle=True)['pred']
-        occ_pred = np.reshape(occ_pred, [200, 200, 16]).astype(np.uint8)
-        
-        lidar_origins.append(output_origin)
-        occ_gts.append(occ_gt)
-        occ_preds.append(occ_pred)
-    
-    if args.data_type == 'occ3d':
-        occ_class_names = occ3d_class_names
-    elif args.data_type == 'openocc_v2':
-        occ_class_names = openocc_class_names
-    else:
-        raise ValueError
-    
-    print(main_rayiou(occ_preds, occ_gts, lidar_origins, occ_class_names=occ_class_names))
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--data-root", type=str, default='data/nuscenes')
-    parser.add_argument("--pred-dir", type=str)
-    parser.add_argument("--data-type", type=str, choices=['occ3d', 'openocc_v2'], default='occ3d')
-    args = parser.parse_args()
-
-    torch.random.manual_seed(0)
-    np.random.seed(0)
-
-    main(args)
diff --git a/rayiou_metrics/lib/dvr_cpu/dvr.cpp b/rayiou_metrics/lib/dvr_cpu/dvr.cpp
new file mode 100644
index 0000000..9c3d722
--- /dev/null
+++ b/rayiou_metrics/lib/dvr_cpu/dvr.cpp
@@ -0,0 +1,71 @@
+// Acknowledgments: https://github.com/tarashakhurana/4d-occ-forecasting
+// Modified by Haisong Liu
+
+#include <string>
+#include <torch/extension.h>
+#include <vector>
+
+/*
+ * CUDA forward declarations
+ */
+
+std::vector<torch::Tensor> render_forward_cpu(torch::Tensor sigma,
+                                               torch::Tensor origin,
+                                               torch::Tensor points,
+                                               torch::Tensor tindex,
+                                               const std::vector<int> grid,
+                                               std::string phase_name);
+
+std::vector<torch::Tensor>
+render_cpu(torch::Tensor sigma, torch::Tensor origin, torch::Tensor points,
+            torch::Tensor tindex, std::string loss_name);
+
+torch::Tensor init_cpu(torch::Tensor points, torch::Tensor tindex,
+                        const std::vector<int> grid);
+
+/*
+ * C++ interface
+ */
+
+#define CHECK_CUDA(x)                                                          \
+  TORCH_CHECK(x.type().is_cuda(), #x " must be a CUDA tensor")
+#define CHECK_CONTIGUOUS(x)                                                    \
+  TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
+#define CHECK_INPUT(x)                                                         \
+  CHECK_CUDA(x);                                                               \
+  CHECK_CONTIGUOUS(x)
+
+std::vector<torch::Tensor>
+render_forward(torch::Tensor sigma, torch::Tensor origin, torch::Tensor points,
+               torch::Tensor tindex, const std::vector<int> grid,
+               std::string phase_name) {
+  CHECK_CONTIGUOUS(sigma);
+  CHECK_CONTIGUOUS(origin);
+  CHECK_CONTIGUOUS(points);
+  CHECK_CONTIGUOUS(tindex);
+  return render_forward_cpu(sigma, origin, points, tindex, grid, phase_name);
+}
+
+
+std::vector<torch::Tensor> render(torch::Tensor sigma, torch::Tensor origin,
+                                  torch::Tensor points, torch::Tensor tindex,
+                                  std::string loss_name) {
+  CHECK_CONTIGUOUS(sigma);
+  CHECK_CONTIGUOUS(origin);
+  CHECK_CONTIGUOUS(points);
+  CHECK_CONTIGUOUS(tindex);
+  return render_cpu(sigma, origin, points, tindex, loss_name);
+}
+
+torch::Tensor init(torch::Tensor points, torch::Tensor tindex,
+                   const std::vector<int> grid) {
+  CHECK_CONTIGUOUS(points);
+  CHECK_CONTIGUOUS(tindex);
+  return init_cpu(points, tindex, grid);
+}
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+  m.def("init", &init, "Initialize");
+  m.def("render", &render, "Render");
+  m.def("render_forward", &render_forward, "Render (forward pass only)");
+}
diff --git a/rayiou_metrics/lib/dvr_cpu/dvr_render_cpu.cpp b/rayiou_metrics/lib/dvr_cpu/dvr_render_cpu.cpp
new file mode 100644
index 0000000..07ace22
--- /dev/null
+++ b/rayiou_metrics/lib/dvr_cpu/dvr_render_cpu.cpp
@@ -0,0 +1,650 @@
+      
+      
+// Acknowledgments: https://github.com/tarashakhurana/4d-occ-forecasting
+// Modified by Haisong Liu
+
+#include <torch/extension.h>
+#include <stdio.h>
+#include <float.h>
+#include <vector>
+#include <string>
+#include <iostream>
+#include <algorithm>
+
+#define MAX_D 1446 // 700 + 700 + 45 + 1
+#define MAX_STEP 1000
+
+
+enum LossType {L1, L2, ABSREL};
+enum PhaseName {TEST, TRAIN};
+
+struct int3{
+    int x, y, z;
+    int3(int _x, int _y, int _z) : x(_x), y(_y), z(_z) {}
+    int3() {}
+};
+
+/*
+ * input shape
+ *   sigma      : N x T x H x L x W
+ *   origin   : N x T x 3
+ *   points   : N x M x 4
+ * output shape
+ *   dist     : N x M
+ */
+std::vector<torch::Tensor> render_forward_cpu(
+    torch::Tensor sigma,
+    torch::Tensor origin,
+    torch::Tensor points,
+    torch::Tensor tindex,
+    const std::vector<int> grid,
+    std::string phase_name) {
+
+    const auto N = points.size(0); // batch size
+    const auto M = points.size(1); // num of rays
+
+    const auto T = grid[0];
+    const auto H = grid[1];
+    const auto L = grid[2];
+    const auto W = grid[3];
+
+    const auto device = sigma.device();
+
+    //
+    // const auto dtype = points.dtype();
+    // const auto options = torch::TensorOptions().dtype(dtype).device(device).requires_grad(false);
+    // auto pog = torch::zeros({N, T, H, L, W}, options);
+
+    // perform rendering
+    auto gt_dist = -torch::ones({N, M}, device);
+    auto pred_dist = -torch::ones({N, M}, device);
+
+    auto coord_index = torch::zeros({N, M, 3}, device);
+
+    PhaseName train_phase;
+    if (phase_name.compare("test") == 0) {
+        train_phase = TEST;
+    } else if (phase_name.compare("train") == 0){
+        train_phase = TRAIN;
+    } else {
+        std::cout << "UNKNOWN PHASE NAME: " << phase_name << std::endl;
+        exit(1);
+    }
+
+    for (int n = 0; n < N; n++) {  // batch index
+        for (int m = 0; m < M; m++) {  // ray index
+            // ray end point
+            const int t = tindex[n][m].item<int>();   // cancel auto to change dtype into int
+
+            // invalid points
+            // assert(t < T);
+            assert(T == 1 || t < T);
+
+            // time index for sigma
+            // when T = 1, we have a static sigma
+            const auto ts = (T == 1) ? 0 : t;
+
+            // if t < 0, it is a padded point
+            if (t < 0) continue;
+
+            // grid shape
+            const int vzsize = sigma.size(2);
+            const int vysize = sigma.size(3);
+            const int vxsize = sigma.size(4);
+            // assert(vzsize + vysize + vxsize <= MAX_D);
+
+            // origin
+            const double xo = origin[n][t][0].item<double>();
+            const double yo = origin[n][t][1].item<double>();
+            const double zo = origin[n][t][2].item<double>();
+
+            // end point
+            const double xe = points[n][m][0].item<double>();
+            const double ye = points[n][m][1].item<double>();
+            const double ze = points[n][m][2].item<double>();
+
+            // locate the voxel where the origin resides
+            const int vxo = int(xo);
+            const int vyo = int(yo);
+            const int vzo = int(zo);
+
+            const int vxe = int(xe);
+            const int vye = int(ye);
+            const int vze = int(ze);
+
+            // NOTE: new
+            int vx = vxo;
+            int vy = vyo;
+            int vz = vzo;
+
+            // origin to end
+            const double rx = xe - xo;
+            const double ry = ye - yo;
+            const double rz = ze - zo;
+            double gt_d = sqrt(rx * rx + ry * ry + rz * rz);
+
+            // directional vector
+            const double dx = rx / gt_d;
+            const double dy = ry / gt_d;
+            const double dz = rz / gt_d;
+
+            // In which direction the voxel ids are incremented.
+            const int stepX = (dx >= 0) ? 1 : -1;
+            const int stepY = (dy >= 0) ? 1 : -1;
+            const int stepZ = (dz >= 0) ? 1 : -1;
+
+            // Distance along the ray to the next voxel border from the current position (tMaxX, tMaxY, tMaxZ).
+            const double next_voxel_boundary_x = vx + (stepX < 0 ? 0 : 1);
+            const double next_voxel_boundary_y = vy + (stepY < 0 ? 0 : 1);
+            const double next_voxel_boundary_z = vz + (stepZ < 0 ? 0 : 1);
+
+            // tMaxX, tMaxY, tMaxZ -- distance until next intersection with voxel-border
+            // the value of t at which the ray crosses the first vertical voxel boundary
+            double tMaxX = (dx!=0) ? (next_voxel_boundary_x - xo)/dx : DBL_MAX; //
+            double tMaxY = (dy!=0) ? (next_voxel_boundary_y - yo)/dy : DBL_MAX; //
+            double tMaxZ = (dz!=0) ? (next_voxel_boundary_z - zo)/dz : DBL_MAX; //
+
+            // tDeltaX, tDeltaY, tDeltaZ --
+            // how far along the ray we must move for the horizontal component to equal the width of a voxel
+            // the direction in which we traverse the grid
+            // can only be FLT_MAX if we never go in that direction
+            const double tDeltaX = (dx!=0) ? stepX/dx : DBL_MAX;
+            const double tDeltaY = (dy!=0) ? stepY/dy : DBL_MAX;
+            const double tDeltaZ = (dz!=0) ? stepZ/dz : DBL_MAX;
+
+            
+            int3 path[MAX_D];
+            double csd[MAX_D];  // cumulative sum of sigma times delta
+            double p[MAX_D];  // alpha
+            double d[MAX_D];
+
+            // forward raymarching with voxel traversal
+            int step = 0;  // total number of voxels traversed
+            int count = 0;  // number of voxels traversed inside the voxel grid
+            double last_d = 0.0;  // correct initialization
+
+            // voxel traversal raycasting
+            bool was_inside = false;
+            while (true) {
+                bool inside = (0 <= vx && vx < vxsize) &&
+                    (0 <= vy && vy < vysize) &&
+                    (0 <= vz && vz < vzsize);
+                if (inside) {
+                    was_inside = true;
+                    path[count] = int3(vx, vy, vz);
+                } else if (was_inside) { // was but no longer inside
+                    // we know we are not coming back so terminate
+                    break;
+                } /*else if (last_d > gt_d) {
+                    break;
+                } */
+                /*else { // has not gone inside yet
+                    // assert(count == 0);
+                    // (1) when we have hit the destination but haven't gone inside the voxel grid
+                    // (2) when we have traveled MAX_D voxels but haven't found one valid voxel
+                    //     handle intersection corner cases in case of infinite loop
+                    bool hit = (vx == vxe && vy == vye && vz == vze);  // this test seems brittle with corner cases
+                    if (hit || step >= MAX_D)
+                        break;
+                    //if (last_d >= gt_d || step >= MAX_D) break;
+                } */
+                // _d represents the ray distance has traveled before escaping the current voxel cell
+                double _d = 0.0;
+                // voxel traversal
+                if (tMaxX < tMaxY) {
+                    if (tMaxX < tMaxZ) {
+                        _d = tMaxX;
+                        vx += stepX;
+                        tMaxX += tDeltaX;
+                    } else {
+                        _d = tMaxZ;
+                        vz += stepZ;
+                        tMaxZ += tDeltaZ;
+                    }
+                } else {
+                    if (tMaxY < tMaxZ) {
+                        _d = tMaxY;
+                        vy += stepY;
+                        tMaxY += tDeltaY;
+                    } else {
+                        _d = tMaxZ;
+                        vz += stepZ;
+                        tMaxZ += tDeltaZ;
+                    }
+                }
+                if (inside) {
+                    // get sigma at the current voxel
+                    const int3 &v = path[count];  // use the recorded index
+                    const double _sigma = sigma[n][ts][v.z][v.y][v.x].item<double>();
+                    const double _delta = std::max(0.0, _d - last_d);  // THIS TURNS OUT IMPORTANT
+                    const double sd = _sigma * _delta;
+                    if (count == 0) { // the first voxel inside
+                        csd[count] = sd;
+                        p[count] = 1 - exp(-sd);
+                    } else {
+                        csd[count] = csd[count-1] + sd;
+                        p[count] = exp(-csd[count-1]) - exp(-csd[count]);
+                    }
+                    // record the traveled distance
+                    d[count] = _d;
+                    // count the number of voxels we have escaped
+                    count ++;
+                }
+                last_d = _d;
+                step ++;
+
+                if (step > MAX_STEP) {
+                    break;
+                }
+            }
+
+            // the total number of voxels visited should not exceed this number
+            assert(count <= MAX_D);
+            
+            if (count > 0) {
+                // compute the expected ray distance
+                //double exp_d = 0.0;
+                double exp_d = d[count-1];
+                
+                const int3 &v_init = path[count-1];
+                int x = v_init.x;
+                int y = v_init.y;
+                int z = v_init.z;
+
+                for (int i = 0; i < count; i++) {
+                    //printf("%f\t%f\n",p[i], d[i]);
+                    //exp_d += p[i] * d[i];
+                    const int3 &v = path[i];
+                    const double occ = sigma[n][ts][v.z][v.y][v.x].item<double>();
+                    if (occ > 0.5) {
+                        exp_d = d[i];
+                        
+                        x = v.x;
+                        y = v.y;
+                        z = v.z;
+                    
+                        break;
+                    }
+
+                }
+                //printf("%f\n",exp_d);
+
+                // add an imaginary sample at the end point should gt_d exceeds max_d
+                double p_out = exp(-csd[count-1]);
+                double max_d = d[count-1];
+
+                // if (gt_d > max_d)
+                //   exp_d += (p_out * gt_d);
+
+                // p_out is the probability the ray escapes the voxel grid
+                //exp_d += (p_out * max_d);
+                if (train_phase == 1) {
+                    gt_d = std::min(gt_d, max_d);
+                }
+
+                // write the rendered ray distance (max_d)
+                pred_dist[n][m] = exp_d;
+                gt_dist[n][m] = gt_d;
+            
+                coord_index[n][m][0] = double(x);
+                coord_index[n][m][1] = double(y);
+                coord_index[n][m][2] = double(z);
+
+                // // write occupancy
+                // for (int i = 0; i < count; i ++) {
+                //     const int3 &v = path[i];
+                //     auto & occ = pog[n][t][v.z][v.y][v.x];
+                //     if (p[i] >= occ) {
+                //         occ = p[i];
+                //     }
+                // }
+            }
+        }
+    }
+
+    // return {pog, pred_dist, gt_dist};
+    return {pred_dist, gt_dist, coord_index};
+}
+
+/*
+ * input shape
+ *   sigma      : N x T x H x L x W
+ *   origin   : N x T x 3
+ *   points   : N x M x 4
+ * output shape
+ *   dist     : N x M
+ *   loss     : N x M
+ *   grad_sigma : N x T x H x L x W
+ */
+std::vector<torch::Tensor> render_cpu(
+    torch::Tensor sigma,
+    torch::Tensor origin,
+    torch::Tensor points,
+    torch::Tensor tindex,
+    std::string loss_name) {
+
+    const auto N = points.size(0); // batch size
+    const auto M = points.size(1); // num of rays
+    const auto T = sigma.size(1); // num of time steps
+
+    const auto device = sigma.device();
+
+    // perform rendering
+    auto gt_dist = -torch::ones({N, M}, device);
+    auto pred_dist = -torch::ones({N, M}, device);
+    auto grad_sigma = torch::zeros_like(sigma);
+    // auto grad_sigma_count = torch::zeros_like(sigma);
+
+    LossType loss_type;
+    if (loss_name.compare("l1") == 0) {
+        loss_type = L1;
+    } else if (loss_name.compare("l2") == 0) {
+        loss_type = L2;
+    } else if (loss_name.compare("absrel") == 0) {
+        loss_type = ABSREL;
+    } else if (loss_name.compare("bce") == 0){
+        loss_type = L1;
+    } else {
+        std::cout << "UNKNOWN LOSS TYPE: " << loss_name << std::endl;
+        exit(1);
+    }
+
+    for (int n = 0; n < N; n++) {  // batch index
+        for (int m = 0; m < M; m++) {  // ray index
+            const int t = tindex[n][m].item<int>();   // cancel auto to change dtype into int
+
+            assert(T == 1 || t < T);
+
+            // time index for sigma
+            // when T = 1, we have a static sigma
+            const auto ts = (T == 1) ? 0 : t;
+
+            // if t < 0, it is a padded point
+            if (t < 0) continue;
+
+            // grid shape
+            const int vzsize = sigma.size(2);
+            const int vysize = sigma.size(3);
+            const int vxsize = sigma.size(4);
+            // assert(vzsize + vysize + vxsize <= MAX_D);
+
+            // origin
+            const double xo = origin[n][t][0].item<double>();
+            const double yo = origin[n][t][1].item<double>();
+            const double zo = origin[n][t][2].item<double>();
+
+            // end point
+            const double xe = points[n][m][0].item<double>();
+            const double ye = points[n][m][1].item<double>();
+            const double ze = points[n][m][2].item<double>();
+
+            // locate the voxel where the origin resides
+            const int vxo = int(xo);
+            const int vyo = int(yo);
+            const int vzo = int(zo);
+
+            //
+            const int vxe = int(xe);
+            const int vye = int(ye);
+            const int vze = int(ze);
+
+            // NOTE: new
+            int vx = vxo;
+            int vy = vyo;
+            int vz = vzo;
+
+            // origin to end
+            const double rx = xe - xo;
+            const double ry = ye - yo;
+            const double rz = ze - zo;
+            double gt_d = sqrt(rx * rx + ry * ry + rz * rz);
+
+            // directional vector
+            const double dx = rx / gt_d;
+            const double dy = ry / gt_d;
+            const double dz = rz / gt_d;
+
+            // In which direction the voxel ids are incremented.
+            const int stepX = (dx >= 0) ? 1 : -1;
+            const int stepY = (dy >= 0) ? 1 : -1;
+            const int stepZ = (dz >= 0) ? 1 : -1;
+
+            // Distance along the ray to the next voxel border from the current position (tMaxX, tMaxY, tMaxZ).
+            const double next_voxel_boundary_x = vx + (stepX < 0 ? 0 : 1);
+            const double next_voxel_boundary_y = vy + (stepY < 0 ? 0 : 1);
+            const double next_voxel_boundary_z = vz + (stepZ < 0 ? 0 : 1);
+
+            // tMaxX, tMaxY, tMaxZ -- distance until next intersection with voxel-border
+            // the value of t at which the ray crosses the first vertical voxel boundary
+            double tMaxX = (dx!=0) ? (next_voxel_boundary_x - xo)/dx : DBL_MAX; //
+            double tMaxY = (dy!=0) ? (next_voxel_boundary_y - yo)/dy : DBL_MAX; //
+            double tMaxZ = (dz!=0) ? (next_voxel_boundary_z - zo)/dz : DBL_MAX; //
+
+            // tDeltaX, tDeltaY, tDeltaZ --
+            // how far along the ray we must move for the horizontal component to equal the width of a voxel
+            // the direction in which we traverse the grid
+            // can only be FLT_MAX if we never go in that direction
+            const double tDeltaX = (dx!=0) ? stepX/dx : DBL_MAX;
+            const double tDeltaY = (dy!=0) ? stepY/dy : DBL_MAX;
+            const double tDeltaZ = (dz!=0) ? stepZ/dz : DBL_MAX;
+
+            int3 path[MAX_D];
+            double csd[MAX_D];  // cumulative sum of sigma times delta
+            double p[MAX_D];  // alpha
+            double d[MAX_D];
+            double dt[MAX_D];
+
+            // forward raymarching with voxel traversal
+            int step = 0;  // total number of voxels traversed
+            int count = 0;  // number of voxels traversed inside the voxel grid
+            double last_d = 0.0;  // correct initialization
+
+            // voxel traversal raycasting
+            bool was_inside = false;
+            while (true) {
+                bool inside = (0 <= vx && vx < vxsize) &&
+                    (0 <= vy && vy < vysize) &&
+                    (0 <= vz && vz < vzsize);
+                if (inside) { // now inside
+                    was_inside = true;
+                    path[count] = int3(vx, vy, vz);
+                } else if (was_inside) { // was inside but no longer
+                    // we know we are not coming back so terminate
+                    break;
+                } else if (last_d > gt_d) {
+                    break;
+                } /* else { // has not gone inside yet
+                    // assert(count == 0);
+                    // (1) when we have hit the destination but haven't gone inside the voxel grid
+                    // (2) when we have traveled MAX_D voxels but haven't found one valid voxel
+                    //     handle intersection corner cases in case of infinite loop
+                    // bool hit = (vx == vxe && vy == vye && vz == vze);
+                    // if (hit || step >= MAX_D)
+                    //     break;
+                    if (last_d >= gt_d || step >= MAX_D) break;
+                } */
+                // _d represents the ray distance has traveled before escaping the current voxel cell
+                double _d = 0.0;
+                // voxel traversal
+                if (tMaxX < tMaxY) {
+                    if (tMaxX < tMaxZ) {
+                        _d = tMaxX;
+                        vx += stepX;
+                        tMaxX += tDeltaX;
+                    } else {
+                        _d = tMaxZ;
+                        vz += stepZ;
+                        tMaxZ += tDeltaZ;
+                    }
+                } else {
+                    if (tMaxY < tMaxZ) {
+                        _d = tMaxY;
+                        vy += stepY;
+                        tMaxY += tDeltaY;
+                    } else {
+                        _d = tMaxZ;
+                        vz += stepZ;
+                        tMaxZ += tDeltaZ;
+                    }
+                }
+                if (inside) {
+                    // get sigma at the current voxel
+                    const int3 &v = path[count];  // use the recorded index
+                    const double _sigma = sigma[n][ts][v.z][v.y][v.x].item<double>();
+                    const double _delta = std::max(0.0, _d - last_d);  // THIS TURNS OUT IMPORTANT
+                    const double sd = _sigma * _delta;
+                    if (count == 0) { // the first voxel inside
+                        csd[count] = sd;
+                        p[count] = 1 - exp(-sd);
+                    } else {
+                        csd[count] = csd[count-1] + sd;
+                        p[count] = exp(-csd[count-1]) - exp(-csd[count]);
+                    }
+                    // record the traveled distance
+                    d[count] = _d;
+                    dt[count] = _delta;
+                    // count the number of voxels we have escaped
+                    count ++;
+                }
+                last_d = _d;
+                step ++;
+
+                if (step > MAX_STEP) {
+                    break;
+                }
+            }
+
+            // the total number of voxels visited should not exceed this number
+            assert(count <= MAX_D);
+
+            // WHEN THERE IS AN INTERSECTION BETWEEN THE RAY AND THE VOXEL GRID
+            if (count > 0) {
+                // compute the expected ray distance
+                double exp_d = 0.0;
+                for (int i = 0; i < count; i ++)
+                    exp_d += p[i] * d[i];
+
+                // add an imaginary sample at the end point should gt_d exceeds max_d
+                double p_out = exp(-csd[count-1]);
+                double max_d = d[count-1];
+
+                exp_d += (p_out * max_d);
+                gt_d = std::min(gt_d, max_d);
+
+                // write the rendered ray distance (max_d)
+                pred_dist[n][m] = exp_d;
+                gt_dist[n][m] = gt_d;
+
+                /* backward raymarching */
+                double dd_dsigma[MAX_D];
+                for (int i = count - 1; i >= 0; i --) {
+                    // NOTE: probably need to double check again
+                    if (i == count - 1)
+                        dd_dsigma[i] = p_out * max_d;
+                    else
+                        dd_dsigma[i] = dd_dsigma[i+1] - exp(-csd[i]) * (d[i+1] - d[i]);
+                }
+
+                for (int i = count - 1; i >= 0; i --)
+                    dd_dsigma[i] *= dt[i];
+
+                // option 2: cap at the boundary
+                for (int i = count - 1; i >= 0; i --)
+                    dd_dsigma[i] -= dt[i] * p_out * max_d;
+
+                double dl_dd = 1.0;
+                if (loss_type == L1)
+                    dl_dd = (exp_d >= gt_d) ? 1 : -1;
+                else if (loss_type == L2)
+                    dl_dd = (exp_d - gt_d);
+                else if (loss_type == ABSREL)
+                    dl_dd = (exp_d >= gt_d) ? (1.0/gt_d) : -(1.0/gt_d);
+
+                // apply chain rule
+                for (int i = 0; i < count; i ++) {
+                    const int3 &v = path[i];
+                    // NOTE: potential race conditions when writing gradients
+                    grad_sigma[n][ts][v.z][v.y][v.x] += dl_dd * dd_dsigma[i];
+                    // grad_sigma_count[n][ts][v.z][v.y][v.x] += 1;
+                }
+            }
+        }
+    }
+
+
+    // grad_sigma_count += (grad_sigma_count == 0);
+    // grad_sigma /= grad_sigma_count;
+
+    return {pred_dist, gt_dist, grad_sigma};
+}
+
+
+/*
+ * input shape
+ *   origin   : N x T x 3
+ *   points   : N x M x 3
+ *   tindex   : N x M
+ * output shape
+ *   occupancy: N x T x H x L x W
+ */
+torch::Tensor init_cpu(
+    torch::Tensor points,
+    torch::Tensor tindex,
+    const std::vector<int> grid) {
+
+    const auto N = points.size(0); // batch size
+    const auto M = points.size(1); // num of rays
+
+    const auto T = grid[0];
+    const auto H = grid[1];
+    const auto L = grid[2];
+    const auto W = grid[3];
+
+    const auto dtype = points.dtype();
+    const auto device = points.device();
+    const auto options = torch::TensorOptions().dtype(dtype).device(device).requires_grad(false);
+    auto occupancy = torch::zeros({N, T, H, L, W}, options);
+
+    // // initialize occupancy such that every voxel with one or more points is occupied
+
+    for (int n = 0; n < N; n++) {  // batch index
+        for (int m = 0; m < M; m++) {  // ray index
+            // ray end point
+            const int t = tindex[n][m].item<int>();   // cancel auto to change dtype into int
+
+            // invalid points
+            assert(T == 1 || t < T);
+
+            // if t < 0, it is a padded point
+            if (t < 0) continue;
+
+            // time index for sigma
+            // when T = 1, we have a static sigma
+            const auto ts = (T == 1) ? 0 : t;
+
+            // grid shape
+            const int vzsize = occupancy.size(2);
+            const int vysize = occupancy.size(3);
+            const int vxsize = occupancy.size(4);
+            // assert(vzsize + vysize + vxsize <= MAX_D);
+
+            // end point
+            const int vx = int(points[n][m][0].item<int>());
+            const int vy = int(points[n][m][1].item<int>());
+            const int vz = int(points[n][m][2].item<int>());
+
+            //
+            if (0 <= vx && vx < vxsize &&
+                0 <= vy && vy < vysize &&
+                0 <= vz && vz < vzsize) {
+                occupancy[n][ts][vz][vy][vx] = 1;
+            }
+        }
+    }
+
+    return occupancy;
+}
+
+    
+
+    
\ No newline at end of file
diff --git a/rayiou_metrics/lib/dvr_cuda/dvr_cuda.cpp b/rayiou_metrics/lib/dvr_cuda/dvr_cuda.cpp
new file mode 100644
index 0000000..b2a7c09
--- /dev/null
+++ b/rayiou_metrics/lib/dvr_cuda/dvr_cuda.cpp
@@ -0,0 +1,72 @@
+// Acknowledgments: https://github.com/tarashakhurana/4d-occ-forecasting
+// Modified by Haisong Liu
+
+#include <string>
+#include <torch/extension.h>
+#include <vector>
+
+/*
+ * CUDA forward declarations
+ */
+
+std::vector<torch::Tensor> render_forward_cuda(torch::Tensor sigma,
+                                               torch::Tensor origin,
+                                               torch::Tensor points,
+                                               torch::Tensor tindex,
+                                               const std::vector<int> grid,
+                                               std::string phase_name);
+
+std::vector<torch::Tensor>
+render_cuda(torch::Tensor sigma, torch::Tensor origin, torch::Tensor points,
+            torch::Tensor tindex, std::string loss_name);
+
+torch::Tensor init_cuda(torch::Tensor points, torch::Tensor tindex,
+                        const std::vector<int> grid);
+
+
+/*
+ * C++ interface
+ */
+
+#define CHECK_CUDA(x)                                                          \
+  TORCH_CHECK(x.type().is_cuda(), #x " must be a CUDA tensor")
+#define CHECK_CONTIGUOUS(x)                                                    \
+  TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
+#define CHECK_INPUT(x)                                                         \
+  CHECK_CUDA(x);                                                               \
+  CHECK_CONTIGUOUS(x)
+
+std::vector<torch::Tensor>
+render_forward(torch::Tensor sigma, torch::Tensor origin, torch::Tensor points,
+               torch::Tensor tindex, const std::vector<int> grid,
+               std::string phase_name) {
+  CHECK_INPUT(sigma);
+  CHECK_INPUT(origin);
+  CHECK_INPUT(points);
+  CHECK_INPUT(tindex);
+  return render_forward_cuda(sigma, origin, points, tindex, grid, phase_name);
+}
+
+
+std::vector<torch::Tensor> render(torch::Tensor sigma, torch::Tensor origin,
+                                  torch::Tensor points, torch::Tensor tindex,
+                                  std::string loss_name) {
+  CHECK_INPUT(sigma);
+  CHECK_INPUT(origin);
+  CHECK_INPUT(points);
+  CHECK_INPUT(tindex);
+  return render_cuda(sigma, origin, points, tindex, loss_name);
+}
+
+torch::Tensor init(torch::Tensor points, torch::Tensor tindex,
+                   const std::vector<int> grid) {
+  CHECK_INPUT(points);
+  CHECK_INPUT(tindex);
+  return init_cuda(points, tindex, grid);
+}
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+  m.def("init", &init, "Initialize");
+  m.def("render", &render, "Render");
+  m.def("render_forward", &render_forward, "Render (forward pass only)");
+}
diff --git a/rayiou_metrics/lib/dvr_cuda/dvr_render_cuda.cu b/rayiou_metrics/lib/dvr_cuda/dvr_render_cuda.cu
new file mode 100644
index 0000000..8010eb8
--- /dev/null
+++ b/rayiou_metrics/lib/dvr_cuda/dvr_render_cuda.cu
@@ -0,0 +1,747 @@
+// Acknowledgments: https://github.com/tarashakhurana/4d-occ-forecasting
+// Modified by Haisong Liu
+
+#include <torch/extension.h>
+#include <stdio.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <vector>
+#include <string>
+#include <iostream>
+
+#define MAX_D 1446 // 700 + 700 + 45 + 1
+#define MAX_STEP 1000
+
+enum LossType {L1, L2, ABSREL};
+enum PhaseName {TEST, TRAIN};
+
+template <typename scalar_t>
+__global__ void init_cuda_kernel(
+    const torch::PackedTensorAccessor32<scalar_t,3,torch::RestrictPtrTraits> points,
+    const torch::PackedTensorAccessor32<scalar_t,2,torch::RestrictPtrTraits> tindex,
+    torch::PackedTensorAccessor32<scalar_t,5,torch::RestrictPtrTraits> occupancy) {
+
+    // batch index
+    const auto n = blockIdx.y;
+
+    // ray index
+    const auto c = blockIdx.x * blockDim.x + threadIdx.x;
+
+    // num of rays
+    const auto M = points.size(1);
+    const auto T = occupancy.size(1);
+
+    // we allocated more threads than num_rays
+    if (c < M) {
+        // ray end point
+        const auto t = tindex[n][c];
+
+        // invalid points
+        assert(T == 1 || t < T);
+
+        // if t < 0, it is a padded point
+        if (t < 0) return;
+
+        // time index for sigma
+        // when T = 1, we have a static sigma
+        const auto ts = (T == 1) ? 0 : t;
+
+        // grid shape
+        const int vzsize = occupancy.size(2);
+        const int vysize = occupancy.size(3);
+        const int vxsize = occupancy.size(4);
+        // assert(vzsize + vysize + vxsize <= MAX_D);
+
+        // end point
+        const int vx = int(points[n][c][0]);
+        const int vy = int(points[n][c][1]);
+        const int vz = int(points[n][c][2]);
+
+        //
+        if (0 <= vx && vx < vxsize &&
+            0 <= vy && vy < vysize &&
+            0 <= vz && vz < vzsize) {
+            occupancy[n][ts][vz][vy][vx] = 1;
+        }
+    }
+}
+
+template <typename scalar_t>
+__global__ void render_forward_cuda_kernel(
+    const torch::PackedTensorAccessor32<scalar_t,5,torch::RestrictPtrTraits> sigma,
+    const torch::PackedTensorAccessor32<scalar_t,3,torch::RestrictPtrTraits> origin,
+    const torch::PackedTensorAccessor32<scalar_t,3,torch::RestrictPtrTraits> points,
+    const torch::PackedTensorAccessor32<scalar_t,2,torch::RestrictPtrTraits> tindex,
+    // torch::PackedTensorAccessor32<scalar_t,5,torch::RestrictPtrTraits> pog,
+    torch::PackedTensorAccessor32<scalar_t,2,torch::RestrictPtrTraits> pred_dist,
+    torch::PackedTensorAccessor32<scalar_t,2,torch::RestrictPtrTraits> gt_dist,
+    torch::PackedTensorAccessor32<scalar_t,3,torch::RestrictPtrTraits> coord_index,
+    PhaseName train_phase) {
+
+    // batch index
+    const auto n = blockIdx.y;
+
+    // ray index
+    const auto c = blockIdx.x * blockDim.x + threadIdx.x;
+
+    // num of rays
+    const auto M = points.size(1);
+    const auto T = sigma.size(1);
+
+    // we allocated more threads than num_rays
+    if (c < M) {
+        // ray end point
+        const auto t = tindex[n][c];
+
+        // invalid points
+        // assert(t < T);
+        assert(T == 1 || t < T);
+
+        // time index for sigma
+        // when T = 1, we have a static sigma
+        const auto ts = (T == 1) ? 0 : t;
+
+        // if t < 0, it is a padded point
+        if (t < 0) return;
+
+        // grid shape
+        const int vzsize = sigma.size(2);
+        const int vysize = sigma.size(3);
+        const int vxsize = sigma.size(4);
+        // assert(vzsize + vysize + vxsize <= MAX_D);
+
+        // origin
+        const double xo = origin[n][t][0];
+        const double yo = origin[n][t][1];
+        const double zo = origin[n][t][2];
+
+        // end point
+        const double xe = points[n][c][0];
+        const double ye = points[n][c][1];
+        const double ze = points[n][c][2];
+
+        // locate the voxel where the origin resides
+        const int vxo = int(xo);
+        const int vyo = int(yo);
+        const int vzo = int(zo);
+
+        const int vxe = int(xe);
+        const int vye = int(ye);
+        const int vze = int(ze);
+
+        // NOTE: new
+        int vx = vxo;
+        int vy = vyo;
+        int vz = vzo;
+
+        // origin to end
+        const double rx = xe - xo;
+        const double ry = ye - yo;
+        const double rz = ze - zo;
+        double gt_d = sqrt(rx * rx + ry * ry + rz * rz);
+
+        // directional vector
+        const double dx = rx / gt_d;
+        const double dy = ry / gt_d;
+        const double dz = rz / gt_d;
+
+        // In which direction the voxel ids are incremented.
+        const int stepX = (dx >= 0) ? 1 : -1;
+        const int stepY = (dy >= 0) ? 1 : -1;
+        const int stepZ = (dz >= 0) ? 1 : -1;
+
+        // Distance along the ray to the next voxel border from the current position (tMaxX, tMaxY, tMaxZ).
+        const double next_voxel_boundary_x = vx + (stepX < 0 ? 0 : 1);
+        const double next_voxel_boundary_y = vy + (stepY < 0 ? 0 : 1);
+        const double next_voxel_boundary_z = vz + (stepZ < 0 ? 0 : 1);
+
+        // tMaxX, tMaxY, tMaxZ -- distance until next intersection with voxel-border
+        // the value of t at which the ray crosses the first vertical voxel boundary
+        double tMaxX = (dx!=0) ? (next_voxel_boundary_x - xo)/dx : DBL_MAX; //
+        double tMaxY = (dy!=0) ? (next_voxel_boundary_y - yo)/dy : DBL_MAX; //
+        double tMaxZ = (dz!=0) ? (next_voxel_boundary_z - zo)/dz : DBL_MAX; //
+
+        // tDeltaX, tDeltaY, tDeltaZ --
+        // how far along the ray we must move for the horizontal component to equal the width of a voxel
+        // the direction in which we traverse the grid
+        // can only be FLT_MAX if we never go in that direction
+        const double tDeltaX = (dx!=0) ? stepX/dx : DBL_MAX;
+        const double tDeltaY = (dy!=0) ? stepY/dy : DBL_MAX;
+        const double tDeltaZ = (dz!=0) ? stepZ/dz : DBL_MAX;
+
+        int3 path[MAX_D];
+        double csd[MAX_D];  // cumulative sum of sigma times delta
+        double p[MAX_D];  // alpha
+        double d[MAX_D];
+
+        // forward raymarching with voxel traversal
+        int step = 0;  // total number of voxels traversed
+        int count = 0;  // number of voxels traversed inside the voxel grid
+        double last_d = 0.0;  // correct initialization
+
+        // voxel traversal raycasting
+        bool was_inside = false;
+        while (true) {
+            bool inside = (0 <= vx && vx < vxsize) &&
+                (0 <= vy && vy < vysize) &&
+                (0 <= vz && vz < vzsize);
+            if (inside) {
+                was_inside = true;
+                path[count] = make_int3(vx, vy, vz);
+            } else if (was_inside) { // was but no longer inside
+                // we know we are not coming back so terminate
+                break;
+            } /*else if (last_d > gt_d) {
+                break;
+            } */
+            /*else { // has not gone inside yet
+                // assert(count == 0);
+                // (1) when we have hit the destination but haven't gone inside the voxel grid
+                // (2) when we have traveled MAX_D voxels but haven't found one valid voxel
+                //     handle intersection corner cases in case of infinite loop
+                bool hit = (vx == vxe && vy == vye && vz == vze);  // this test seems brittle with corner cases
+                if (hit || step >= MAX_D)
+                    break;
+                //if (last_d >= gt_d || step >= MAX_D) break;
+            } */
+            // _d represents the ray distance has traveled before escaping the current voxel cell
+            double _d = 0.0;
+            // voxel traversal
+            if (tMaxX < tMaxY) {
+                if (tMaxX < tMaxZ) {
+                    _d = tMaxX;
+                    vx += stepX;
+                    tMaxX += tDeltaX;
+                } else {
+                    _d = tMaxZ;
+                    vz += stepZ;
+                    tMaxZ += tDeltaZ;
+                }
+            } else {
+                if (tMaxY < tMaxZ) {
+                    _d = tMaxY;
+                    vy += stepY;
+                    tMaxY += tDeltaY;
+                } else {
+                    _d = tMaxZ;
+                    vz += stepZ;
+                    tMaxZ += tDeltaZ;
+                }
+            }
+            if (inside) {
+                // get sigma at the current voxel
+                const int3 &v = path[count];  // use the recorded index
+                const double _sigma = sigma[n][ts][v.z][v.y][v.x];
+                const double _delta = max(0.0, _d - last_d);  // THIS TURNS OUT IMPORTANT
+                const double sd = _sigma * _delta;
+                if (count == 0) { // the first voxel inside
+                    csd[count] = sd;
+                    p[count] = 1 - exp(-sd);
+                } else {
+                    csd[count] = csd[count-1] + sd;
+                    p[count] = exp(-csd[count-1]) - exp(-csd[count]);
+                }
+                // record the traveled distance
+                d[count] = _d;
+                // count the number of voxels we have escaped
+                count ++;
+            }
+            last_d = _d;
+            step ++;
+
+            if (step > MAX_STEP) {
+                break;
+            }
+        }
+
+        // the total number of voxels visited should not exceed this number
+        assert(count <= MAX_D);
+        
+        if (count > 0) {
+            // compute the expected ray distance
+            //double exp_d = 0.0;
+            double exp_d = d[count-1];
+            
+            const int3 &v_init = path[count-1];
+            int x = v_init.x;
+            int y = v_init.y;
+            int z = v_init.z;
+
+            for (int i = 0; i < count; i++) {
+                //printf("%f\t%f\n",p[i], d[i]);
+                //exp_d += p[i] * d[i];
+                const int3 &v = path[i];
+                const double occ = sigma[n][ts][v.z][v.y][v.x];
+                if (occ > 0.5) {
+                    exp_d = d[i];
+                    
+                    x = v.x;
+                    y = v.y;
+                    z = v.z;
+                
+                    break;
+                }
+
+            }
+            //printf("%f\n",exp_d);
+
+            // add an imaginary sample at the end point should gt_d exceeds max_d
+            double p_out = exp(-csd[count-1]);
+            double max_d = d[count-1];
+
+            // if (gt_d > max_d)
+            //   exp_d += (p_out * gt_d);
+
+            // p_out is the probability the ray escapes the voxel grid
+            //exp_d += (p_out * max_d);
+            if (train_phase == 1) {
+                gt_d = min(gt_d, max_d);
+            }
+
+            // write the rendered ray distance (max_d)
+            pred_dist[n][c] = exp_d;
+            gt_dist[n][c] = gt_d;
+          
+            coord_index[n][c][0] = double(x);
+            coord_index[n][c][1] = double(y);
+            coord_index[n][c][2] = double(z);
+
+            // // write occupancy
+            // for (int i = 0; i < count; i ++) {
+            //     const int3 &v = path[i];
+            //     auto & occ = pog[n][t][v.z][v.y][v.x];
+            //     if (p[i] >= occ) {
+            //         occ = p[i];
+            //     }
+            // }
+        }
+    }
+}
+
+/*
+ * input shape
+ *   sigma      : N x T x H x L x W
+ *   origin   : N x T x 3
+ *   points   : N x M x 4
+ * output shape
+ *   dist     : N x M
+ */
+std::vector<torch::Tensor> render_forward_cuda(
+    torch::Tensor sigma,
+    torch::Tensor origin,
+    torch::Tensor points,
+    torch::Tensor tindex,
+    const std::vector<int> grid,
+    std::string phase_name) {
+
+    const auto N = points.size(0); // batch size
+    const auto M = points.size(1); // num of rays
+
+    const auto T = grid[0];
+    const auto H = grid[1];
+    const auto L = grid[2];
+    const auto W = grid[3];
+
+    const auto device = sigma.device();
+
+    const int threads = 1024;
+    const dim3 blocks((M + threads - 1) / threads, N);
+
+    //
+    // const auto dtype = points.dtype();
+    // const auto options = torch::TensorOptions().dtype(dtype).device(device).requires_grad(false);
+    // auto pog = torch::zeros({N, T, H, L, W}, options);
+
+    // perform rendering
+    auto gt_dist = -torch::ones({N, M}, device);
+    auto pred_dist = -torch::ones({N, M}, device);
+
+    auto coord_index = torch::zeros({N, M, 3}, device);
+
+    PhaseName train_phase;
+    if (phase_name.compare("test") == 0) {
+        train_phase = TEST;
+    } else if (phase_name.compare("train") == 0){
+        train_phase = TRAIN;
+    } else {
+        std::cout << "UNKNOWN PHASE NAME: " << phase_name << std::endl;
+        exit(1);
+    }
+
+    AT_DISPATCH_FLOATING_TYPES(sigma.type(), "render_forward_cuda", ([&] {
+                render_forward_cuda_kernel<scalar_t><<<blocks, threads>>>(
+                    sigma.packed_accessor32<scalar_t,5,torch::RestrictPtrTraits>(),
+                    origin.packed_accessor32<scalar_t,3,torch::RestrictPtrTraits>(),
+                    points.packed_accessor32<scalar_t,3,torch::RestrictPtrTraits>(),
+                    tindex.packed_accessor32<scalar_t,2,torch::RestrictPtrTraits>(),
+                    // pog.packed_accessor32<scalar_t,5,torch::RestrictPtrTraits>(),
+                    pred_dist.packed_accessor32<scalar_t,2,torch::RestrictPtrTraits>(),
+                    gt_dist.packed_accessor32<scalar_t,2,torch::RestrictPtrTraits>(),
+                    coord_index.packed_accessor32<scalar_t,3,torch::RestrictPtrTraits>(),
+                    train_phase);
+            }));
+
+    cudaDeviceSynchronize();
+
+    // return {pog, pred_dist, gt_dist};
+    return {pred_dist, gt_dist, coord_index};
+}
+
+template <typename scalar_t>
+__global__ void render_cuda_kernel(
+    const torch::PackedTensorAccessor32<scalar_t,5,torch::RestrictPtrTraits> sigma,
+    const torch::PackedTensorAccessor32<scalar_t,3,torch::RestrictPtrTraits> origin,
+    const torch::PackedTensorAccessor32<scalar_t,3,torch::RestrictPtrTraits> points,
+    const torch::PackedTensorAccessor32<scalar_t,2,torch::RestrictPtrTraits> tindex,
+    // const torch::PackedTensorAccessor32<scalar_t,5,torch::RestrictPtrTraits> occupancy,
+    torch::PackedTensorAccessor32<scalar_t,2,torch::RestrictPtrTraits> pred_dist,
+    torch::PackedTensorAccessor32<scalar_t,2,torch::RestrictPtrTraits> gt_dist,
+    torch::PackedTensorAccessor32<scalar_t,5,torch::RestrictPtrTraits> grad_sigma,
+    // torch::PackedTensorAccessor32<scalar_t,5,torch::RestrictPtrTraits> grad_sigma_count,
+    LossType loss_type) {
+
+    // batch index
+    const auto n = blockIdx.y;
+
+    // ray index
+    const auto c = blockIdx.x * blockDim.x + threadIdx.x;
+
+    // num of rays
+    const auto M = points.size(1);
+    const auto T = sigma.size(1);
+
+    // we allocated more threads than num_rays
+    if (c < M) {
+        // ray end point
+        const auto t = tindex[n][c];
+
+        // invalid points
+        // assert(t < T);
+        assert(T == 1 || t < T);
+
+        // time index for sigma
+        // when T = 1, we have a static sigma
+        const auto ts = (T == 1) ? 0 : t;
+
+        // if t < 0, it is a padded point
+        if (t < 0) return;
+
+        // grid shape
+        const int vzsize = sigma.size(2);
+        const int vysize = sigma.size(3);
+        const int vxsize = sigma.size(4);
+        // assert(vzsize + vysize + vxsize <= MAX_D);
+
+        // origin
+        const double xo = origin[n][t][0];
+        const double yo = origin[n][t][1];
+        const double zo = origin[n][t][2];
+
+        // end point
+        const double xe = points[n][c][0];
+        const double ye = points[n][c][1];
+        const double ze = points[n][c][2];
+
+        // locate the voxel where the origin resides
+        const int vxo = int(xo);
+        const int vyo = int(yo);
+        const int vzo = int(zo);
+
+        //
+        const int vxe = int(xe);
+        const int vye = int(ye);
+        const int vze = int(ze);
+
+        // NOTE: new
+        int vx = vxo;
+        int vy = vyo;
+        int vz = vzo;
+
+        // origin to end
+        const double rx = xe - xo;
+        const double ry = ye - yo;
+        const double rz = ze - zo;
+        double gt_d = sqrt(rx * rx + ry * ry + rz * rz);
+
+        // directional vector
+        const double dx = rx / gt_d;
+        const double dy = ry / gt_d;
+        const double dz = rz / gt_d;
+
+        // In which direction the voxel ids are incremented.
+        const int stepX = (dx >= 0) ? 1 : -1;
+        const int stepY = (dy >= 0) ? 1 : -1;
+        const int stepZ = (dz >= 0) ? 1 : -1;
+
+        // Distance along the ray to the next voxel border from the current position (tMaxX, tMaxY, tMaxZ).
+        const double next_voxel_boundary_x = vx + (stepX < 0 ? 0 : 1);
+        const double next_voxel_boundary_y = vy + (stepY < 0 ? 0 : 1);
+        const double next_voxel_boundary_z = vz + (stepZ < 0 ? 0 : 1);
+
+        // tMaxX, tMaxY, tMaxZ -- distance until next intersection with voxel-border
+        // the value of t at which the ray crosses the first vertical voxel boundary
+        double tMaxX = (dx!=0) ? (next_voxel_boundary_x - xo)/dx : DBL_MAX; //
+        double tMaxY = (dy!=0) ? (next_voxel_boundary_y - yo)/dy : DBL_MAX; //
+        double tMaxZ = (dz!=0) ? (next_voxel_boundary_z - zo)/dz : DBL_MAX; //
+
+        // tDeltaX, tDeltaY, tDeltaZ --
+        // how far along the ray we must move for the horizontal component to equal the width of a voxel
+        // the direction in which we traverse the grid
+        // can only be FLT_MAX if we never go in that direction
+        const double tDeltaX = (dx!=0) ? stepX/dx : DBL_MAX;
+        const double tDeltaY = (dy!=0) ? stepY/dy : DBL_MAX;
+        const double tDeltaZ = (dz!=0) ? stepZ/dz : DBL_MAX;
+
+        int3 path[MAX_D];
+        double csd[MAX_D];  // cumulative sum of sigma times delta
+        double p[MAX_D];  // alpha
+        double d[MAX_D];
+        double dt[MAX_D];
+
+        // forward raymarching with voxel traversal
+        int step = 0;  // total number of voxels traversed
+        int count = 0;  // number of voxels traversed inside the voxel grid
+        double last_d = 0.0;  // correct initialization
+
+        // voxel traversal raycasting
+        bool was_inside = false;
+        while (true) {
+            bool inside = (0 <= vx && vx < vxsize) &&
+                (0 <= vy && vy < vysize) &&
+                (0 <= vz && vz < vzsize);
+            if (inside) { // now inside
+                was_inside = true;
+                path[count] = make_int3(vx, vy, vz);
+            } else if (was_inside) { // was inside but no longer
+                // we know we are not coming back so terminate
+                break;
+            } else if (last_d > gt_d) {
+                break;
+            } /* else { // has not gone inside yet
+                // assert(count == 0);
+                // (1) when we have hit the destination but haven't gone inside the voxel grid
+                // (2) when we have traveled MAX_D voxels but haven't found one valid voxel
+                //     handle intersection corner cases in case of infinite loop
+                // bool hit = (vx == vxe && vy == vye && vz == vze);
+                // if (hit || step >= MAX_D)
+                //     break;
+                if (last_d >= gt_d || step >= MAX_D) break;
+            } */
+            // _d represents the ray distance has traveled before escaping the current voxel cell
+            double _d = 0.0;
+            // voxel traversal
+            if (tMaxX < tMaxY) {
+                if (tMaxX < tMaxZ) {
+                    _d = tMaxX;
+                    vx += stepX;
+                    tMaxX += tDeltaX;
+                } else {
+                    _d = tMaxZ;
+                    vz += stepZ;
+                    tMaxZ += tDeltaZ;
+                }
+            } else {
+                if (tMaxY < tMaxZ) {
+                    _d = tMaxY;
+                    vy += stepY;
+                    tMaxY += tDeltaY;
+                } else {
+                    _d = tMaxZ;
+                    vz += stepZ;
+                    tMaxZ += tDeltaZ;
+                }
+            }
+            if (inside) {
+                // get sigma at the current voxel
+                const int3 &v = path[count];  // use the recorded index
+                const double _sigma = sigma[n][ts][v.z][v.y][v.x];
+                const double _delta = max(0.0, _d - last_d);  // THIS TURNS OUT IMPORTANT
+                const double sd = _sigma * _delta;
+                if (count == 0) { // the first voxel inside
+                    csd[count] = sd;
+                    p[count] = 1 - exp(-sd);
+                } else {
+                    csd[count] = csd[count-1] + sd;
+                    p[count] = exp(-csd[count-1]) - exp(-csd[count]);
+                }
+                // record the traveled distance
+                d[count] = _d;
+                dt[count] = _delta;
+                // count the number of voxels we have escaped
+                count ++;
+            }
+            last_d = _d;
+            step ++;
+
+            if (step > MAX_STEP) {
+                break;
+            }
+        }
+
+        // the total number of voxels visited should not exceed this number
+        assert(count <= MAX_D);
+
+        // WHEN THERE IS AN INTERSECTION BETWEEN THE RAY AND THE VOXEL GRID
+        if (count > 0) {
+            // compute the expected ray distance
+            double exp_d = 0.0;
+            for (int i = 0; i < count; i ++)
+                exp_d += p[i] * d[i];
+
+            // add an imaginary sample at the end point should gt_d exceeds max_d
+            double p_out = exp(-csd[count-1]);
+            double max_d = d[count-1];
+
+            exp_d += (p_out * max_d);
+            gt_d = min(gt_d, max_d);
+
+            // write the rendered ray distance (max_d)
+            pred_dist[n][c] = exp_d;
+            gt_dist[n][c] = gt_d;
+
+            /* backward raymarching */
+            double dd_dsigma[MAX_D];
+            for (int i = count - 1; i >= 0; i --) {
+                // NOTE: probably need to double check again
+                if (i == count - 1)
+                    dd_dsigma[i] = p_out * max_d;
+                else
+                    dd_dsigma[i] = dd_dsigma[i+1] - exp(-csd[i]) * (d[i+1] - d[i]);
+            }
+
+            for (int i = count - 1; i >= 0; i --)
+                dd_dsigma[i] *= dt[i];
+
+            // option 2: cap at the boundary
+            for (int i = count - 1; i >= 0; i --)
+                dd_dsigma[i] -= dt[i] * p_out * max_d;
+
+            double dl_dd = 1.0;
+            if (loss_type == L1)
+                dl_dd = (exp_d >= gt_d) ? 1 : -1;
+            else if (loss_type == L2)
+                dl_dd = (exp_d - gt_d);
+            else if (loss_type == ABSREL)
+                dl_dd = (exp_d >= gt_d) ? (1.0/gt_d) : -(1.0/gt_d);
+
+            // apply chain rule
+            for (int i = 0; i < count; i ++) {
+                const int3 &v = path[i];
+                // NOTE: potential race conditions when writing gradients
+                grad_sigma[n][ts][v.z][v.y][v.x] += dl_dd * dd_dsigma[i];
+                // grad_sigma_count[n][ts][v.z][v.y][v.x] += 1;
+            }
+        }
+    }
+}
+
+/*
+ * input shape
+ *   sigma      : N x T x H x L x W
+ *   origin   : N x T x 3
+ *   points   : N x M x 4
+ * output shape
+ *   dist     : N x M
+ *   loss     : N x M
+ *   grad_sigma : N x T x H x L x W
+ */
+std::vector<torch::Tensor> render_cuda(
+    torch::Tensor sigma,
+    torch::Tensor origin,
+    torch::Tensor points,
+    torch::Tensor tindex,
+    std::string loss_name) {
+
+    const auto N = points.size(0); // batch size
+    const auto M = points.size(1); // num of rays
+
+    const auto device = sigma.device();
+
+    const int threads = 1024;
+    const dim3 blocks((M + threads - 1) / threads, N);
+
+    // perform rendering
+    auto gt_dist = -torch::ones({N, M}, device);
+    auto pred_dist = -torch::ones({N, M}, device);
+    auto grad_sigma = torch::zeros_like(sigma);
+    // auto grad_sigma_count = torch::zeros_like(sigma);
+
+    LossType loss_type;
+    if (loss_name.compare("l1") == 0) {
+        loss_type = L1;
+    } else if (loss_name.compare("l2") == 0) {
+        loss_type = L2;
+    } else if (loss_name.compare("absrel") == 0) {
+        loss_type = ABSREL;
+    } else if (loss_name.compare("bce") == 0){
+        loss_type = L1;
+    } else {
+        std::cout << "UNKNOWN LOSS TYPE: " << loss_name << std::endl;
+        exit(1);
+    }
+
+    AT_DISPATCH_FLOATING_TYPES(sigma.type(), "render_cuda", ([&] {
+                render_cuda_kernel<scalar_t><<<blocks, threads>>>(
+                    sigma.packed_accessor32<scalar_t,5,torch::RestrictPtrTraits>(),
+                    origin.packed_accessor32<scalar_t,3,torch::RestrictPtrTraits>(),
+                    points.packed_accessor32<scalar_t,3,torch::RestrictPtrTraits>(),
+                    tindex.packed_accessor32<scalar_t,2,torch::RestrictPtrTraits>(),
+                    // occupancy.packed_accessor32<scalar_t,5,torch::RestrictPtrTraits>(),
+                    pred_dist.packed_accessor32<scalar_t,2,torch::RestrictPtrTraits>(),
+                    gt_dist.packed_accessor32<scalar_t,2,torch::RestrictPtrTraits>(),
+                    grad_sigma.packed_accessor32<scalar_t,5,torch::RestrictPtrTraits>(),
+                    // grad_sigma_count.packed_accessor32<scalar_t,5,torch::RestrictPtrTraits>(),
+                    loss_type);
+            }));
+
+    cudaDeviceSynchronize();
+
+    // grad_sigma_count += (grad_sigma_count == 0);
+    // grad_sigma /= grad_sigma_count;
+
+    return {pred_dist, gt_dist, grad_sigma};
+}
+
+
+/*
+ * input shape
+ *   origin   : N x T x 3
+ *   points   : N x M x 3
+ *   tindex   : N x M
+ * output shape
+ *   occupancy: N x T x H x L x W
+ */
+torch::Tensor init_cuda(
+    torch::Tensor points,
+    torch::Tensor tindex,
+    const std::vector<int> grid) {
+
+    const auto N = points.size(0); // batch size
+    const auto M = points.size(1); // num of rays
+
+    const auto T = grid[0];
+    const auto H = grid[1];
+    const auto L = grid[2];
+    const auto W = grid[3];
+
+    const auto dtype = points.dtype();
+    const auto device = points.device();
+    const auto options = torch::TensorOptions().dtype(dtype).device(device).requires_grad(false);
+    auto occupancy = torch::zeros({N, T, H, L, W}, options);
+
+    const int threads = 1024;
+    const dim3 blocks((M + threads - 1) / threads, N);
+
+    // initialize occupancy such that every voxel with one or more points is occupied
+    AT_DISPATCH_FLOATING_TYPES(points.type(), "init_cuda", ([&] {
+                init_cuda_kernel<scalar_t><<<blocks, threads>>>(
+                    points.packed_accessor32<scalar_t,3,torch::RestrictPtrTraits>(),
+                    tindex.packed_accessor32<scalar_t,2,torch::RestrictPtrTraits>(),
+                    occupancy.packed_accessor32<scalar_t,5,torch::RestrictPtrTraits>());
+            }));
+
+    // synchronize
+    cudaDeviceSynchronize();
+
+    return occupancy;
+}
\ No newline at end of file
diff --git a/rayiou_metrics/rayiou_metrics/__init__.py b/rayiou_metrics/rayiou_metrics/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/rayiou_metrics/rayiou_metrics/ego_infos_val.pkl b/rayiou_metrics/rayiou_metrics/ego_infos_val.pkl
new file mode 100644
index 0000000..2fa2aad
Binary files /dev/null and b/rayiou_metrics/rayiou_metrics/ego_infos_val.pkl differ
diff --git a/rayiou_metrics/rayiou_metrics/ego_pose_dataset.py b/rayiou_metrics/rayiou_metrics/ego_pose_dataset.py
new file mode 100644
index 0000000..d99e7bb
--- /dev/null
+++ b/rayiou_metrics/rayiou_metrics/ego_pose_dataset.py
@@ -0,0 +1,87 @@
+import torch
+import numpy as np
+from pyquaternion import Quaternion
+from torch.utils.data import Dataset
+np.set_printoptions(precision=3, suppress=True)
+
+
+def trans_matrix(T, R):
+    tm = np.eye(4)
+    tm[:3, :3] = R.rotation_matrix
+    tm[:3, 3] = T
+    return tm
+
+
+# A helper dataset for RayIoU. It is NOT used during training.
+class EgoPoseDataset(Dataset):
+    def __init__(self, data_infos):
+        super(EgoPoseDataset, self).__init__()
+
+        self.data_infos = data_infos
+        self.scene_frames = {}
+
+        for info in data_infos:
+            scene_name = info['scene_name']
+            if scene_name not in self.scene_frames:
+                self.scene_frames[scene_name] = []
+            self.scene_frames[scene_name].append(info)
+
+    def __len__(self):
+        return len(self.data_infos)
+
+    def get_ego_from_lidar(self, info):
+        ego_from_lidar = trans_matrix(
+            np.array(info['lidar2ego_translation']), 
+            Quaternion(info['lidar2ego_rotation']))
+        return ego_from_lidar
+
+    def get_global_pose(self, info, inverse=False):
+        global_from_ego = trans_matrix(
+            np.array(info['ego2global_translation']), 
+            Quaternion(info['ego2global_rotation']))
+        ego_from_lidar = trans_matrix(
+            np.array(info['lidar2ego_translation']), 
+            Quaternion(info['lidar2ego_rotation']))
+        pose = global_from_ego.dot(ego_from_lidar)
+        if inverse:
+            pose = np.linalg.inv(pose)
+        return pose
+
+    def __getitem__(self, idx):
+        info = self.data_infos[idx]
+
+        ref_sample_token = info['token']
+        ref_lidar_from_global = self.get_global_pose(info, inverse=True)
+        ref_ego_from_lidar = self.get_ego_from_lidar(info)
+
+        scene_frame = self.scene_frames[info['scene_name']]
+        ref_index = scene_frame.index(info)
+
+        # NOTE: getting output frames
+        output_origin_list = []
+        for curr_index in range(len(scene_frame)):
+            # if this exists a valid target
+            if curr_index == ref_index:
+                origin_tf = np.array([0.0, 0.0, 0.0], dtype=np.float32)
+            else:
+                # transform from the current lidar frame to global and then to the reference lidar frame
+                global_from_curr = self.get_global_pose(scene_frame[curr_index], inverse=False)
+                ref_from_curr = ref_lidar_from_global.dot(global_from_curr)
+                origin_tf = np.array(ref_from_curr[:3, 3], dtype=np.float32)
+
+            origin_tf_pad = np.ones([4])
+            origin_tf_pad[:3] = origin_tf  # pad to [4]
+            origin_tf = np.dot(ref_ego_from_lidar[:3], origin_tf_pad.T).T  # [3]
+
+            # origin
+            if np.abs(origin_tf[0]) < 39 and np.abs(origin_tf[1]) < 39:
+                output_origin_list.append(origin_tf)
+        
+        # select 8 origins
+        if len(output_origin_list) > 8:
+            select_idx = np.round(np.linspace(0, len(output_origin_list) - 1, 8)).astype(np.int64)
+            output_origin_list = [output_origin_list[i] for i in select_idx]
+
+        output_origin_tensor = torch.from_numpy(np.stack(output_origin_list))  # [T, 3]
+
+        return (ref_sample_token, output_origin_tensor)
diff --git a/rayiou_metrics/rayiou_metrics/evaluation.py b/rayiou_metrics/rayiou_metrics/evaluation.py
new file mode 100644
index 0000000..df4de8e
--- /dev/null
+++ b/rayiou_metrics/rayiou_metrics/evaluation.py
@@ -0,0 +1,72 @@
+import os
+import glob
+import mmcv
+import numpy as np
+import pkg_resources
+from torch.utils.data import DataLoader
+from .ray_metrics import main_rayiou
+from .ego_pose_dataset import EgoPoseDataset
+
+openocc_class_names = [
+    'car', 'truck', 'trailer', 'bus', 'construction_vehicle',
+    'bicycle', 'motorcycle', 'pedestrian', 'traffic_cone', 'barrier',
+    'driveable_surface', 'other_flat', 'sidewalk',
+    'terrain', 'manmade', 'vegetation', 'free'
+]
+occ3d_class_names = [
+    'others', 'barrier', 'bicycle', 'bus', 'car', 'construction_vehicle',
+    'motorcycle', 'pedestrian', 'traffic_cone', 'trailer', 'truck',
+    'driveable_surface', 'other_flat', 'sidewalk',
+    'terrain', 'manmade', 'vegetation', 'free'
+]
+
+def evaluate_metrics(data_root, pred_dir, data_type):
+    data_path = pkg_resources.resource_filename('rayiou_metrics', 'ego_infos_val.pkl')
+    data_infos = mmcv.load(data_path)['infos']
+    gt_filepaths = sorted(glob.glob(os.path.join(data_root, data_type, '*/*/*.npz')))
+
+    # retrieve scene_name
+    token2scene = {}
+    for gt_path in gt_filepaths:
+        token = gt_path.split('/')[-2]
+        scene_name = gt_path.split('/')[-3]
+        token2scene[token] = scene_name
+
+    for i in range(len(data_infos)):
+        scene_name = token2scene[data_infos[i]['token']]
+        data_infos[i]['scene_name'] = scene_name
+
+    lidar_origins = []
+    occ_gts = []
+    occ_preds = []
+
+    for idx, batch in enumerate(DataLoader(EgoPoseDataset(data_infos), num_workers=8)):
+        output_origin = batch[1]
+        info = data_infos[idx]
+
+        occ_path = os.path.join(data_root, data_type, info['scene_name'], info['token'], 'labels.npz')
+        occ_gt = np.load(occ_path, allow_pickle=True)['semantics']
+        occ_gt = np.reshape(occ_gt, [200, 200, 16]).astype(np.uint8)
+
+        occ_path = os.path.join(pred_dir, info['token'] + '.npz')
+        occ_pred = np.load(occ_path, allow_pickle=True)['pred']
+        occ_pred = np.reshape(occ_pred, [200, 200, 16]).astype(np.uint8)
+        
+        lidar_origins.append(output_origin)
+        occ_gts.append(occ_gt)
+        occ_preds.append(occ_pred)
+    
+    if data_type == 'occ3d':
+        occ_class_names = occ3d_class_names
+    elif data_type == 'openocc_v2':
+        occ_class_names = openocc_class_names
+    else:
+        raise ValueError("Invalid data_type. Support ['occ3d', 'openocc_v2']")
+    
+    metrics = main_rayiou(occ_preds, occ_gts, lidar_origins, occ_class_names=occ_class_names)
+
+    print('--- Evaluation Results ---')
+    for k, v in metrics.items():
+        print('%s: %.4f' % (k, v))
+
+    return metrics
diff --git a/rayiou_metrics/rayiou_metrics/ray_metrics.py b/rayiou_metrics/rayiou_metrics/ray_metrics.py
new file mode 100644
index 0000000..c3f8e88
--- /dev/null
+++ b/rayiou_metrics/rayiou_metrics/ray_metrics.py
@@ -0,0 +1,309 @@
+# Acknowledgments: https://github.com/tarashakhurana/4d-occ-forecasting
+# Modified by Haisong Liu
+import functools
+import math
+import copy
+import multiprocessing
+import numpy as np
+import torch
+from torch.utils.cpp_extension import load
+from tqdm import tqdm
+from prettytable import PrettyTable
+from .ray_pq import Metric_RayPQ
+
+
+# Detect the device
+device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+if torch.cuda.is_available():
+    import dvr_cuda_ops as dvr
+else:
+    import dvr_cpu_ops as dvr
+
+_pc_range = [-40, -40, -1.0, 40, 40, 5.4]
+_voxel_size = 0.4
+
+
+# https://github.com/tarashakhurana/4d-occ-forecasting/blob/ff986082cd6ea10e67ab7839bf0e654736b3f4e2/test_fgbg.py#L29C1-L46C16
+def get_rendered_pcds(origin, points, tindex, pred_dist):
+    pcds = []
+    
+    for t in range(len(origin)):
+        mask = (tindex == t)
+        # skip the ones with no data
+        if not mask.any():
+            continue
+        _pts = points[mask, :3]
+        # use ground truth lidar points for the raycasting direction
+        v = _pts - origin[t][None, :]
+        d = v / np.sqrt((v ** 2).sum(axis=1, keepdims=True))
+        pred_pts = origin[t][None, :] + d * pred_dist[mask][:, None]
+        pcds.append(torch.from_numpy(pred_pts))
+        
+    return pcds
+
+
+def meshgrid3d(occ_size, pc_range):
+    W, H, D = occ_size
+    
+    xs = torch.linspace(0.5, W - 0.5, W).view(W, 1, 1).expand(W, H, D) / W
+    ys = torch.linspace(0.5, H - 0.5, H).view(1, H, 1).expand(W, H, D) / H
+    zs = torch.linspace(0.5, D - 0.5, D).view(1, 1, D).expand(W, H, D) / D
+    xs = xs * (pc_range[3] - pc_range[0]) + pc_range[0]
+    ys = ys * (pc_range[4] - pc_range[1]) + pc_range[1]
+    zs = zs * (pc_range[5] - pc_range[2]) + pc_range[2]
+    xyz = torch.stack((xs, ys, zs), -1)
+
+    return xyz
+
+
+def generate_lidar_rays():
+    # prepare lidar ray angles
+    pitch_angles = []
+    for k in range(10):
+        angle = math.pi / 2 - math.atan(k + 1)
+        pitch_angles.append(-angle)
+    
+    # nuscenes lidar fov: [0.2107773983152201, -0.5439104895672159] (rad)
+    while pitch_angles[-1] < 0.21:
+        delta = pitch_angles[-1] - pitch_angles[-2]
+        pitch_angles.append(pitch_angles[-1] + delta)
+
+    lidar_rays = []
+    for pitch_angle in pitch_angles:
+        for azimuth_angle in np.arange(0, 360, 1):
+            azimuth_angle = np.deg2rad(azimuth_angle)
+
+            x = np.cos(pitch_angle) * np.cos(azimuth_angle)
+            y = np.cos(pitch_angle) * np.sin(azimuth_angle)
+            z = np.sin(pitch_angle)
+
+            lidar_rays.append((x, y, z))
+
+    return np.array(lidar_rays, dtype=np.float32)
+
+
+def process_one_sample(sem_pred, lidar_rays, output_origin, instance_pred=None, occ_class_names=None):
+    # lidar origin in ego coordinate
+    # lidar_origin = torch.tensor([[[0.9858, 0.0000, 1.8402]]])
+    T = output_origin.shape[1]
+    pred_pcds_t = []
+
+    free_id = len(occ_class_names) - 1 
+    occ_pred = copy.deepcopy(sem_pred)
+    occ_pred[sem_pred < free_id] = 1
+    occ_pred[sem_pred == free_id] = 0
+    occ_pred = occ_pred.permute(2, 1, 0)
+    occ_pred = occ_pred[None, None, :].contiguous().float()
+
+    offset = torch.Tensor(_pc_range[:3])[None, None, :]
+    scaler = torch.Tensor([_voxel_size] * 3)[None, None, :]
+
+    lidar_tindex = torch.zeros([1, lidar_rays.shape[0]])
+    
+    for t in range(T): 
+        lidar_origin = output_origin[:, t:t+1, :]  # [1, 1, 3]
+        lidar_endpts = lidar_rays[None] + lidar_origin  # [1, 15840, 3]
+
+        output_origin_render = ((lidar_origin - offset) / scaler).float()  # [1, 1, 3]
+        output_points_render = ((lidar_endpts - offset) / scaler).float()  # [1, N, 3]
+        output_tindex_render = lidar_tindex  # [1, N], all zeros
+
+        with torch.no_grad():
+            pred_dist, _, coord_index = dvr.render_forward(
+                occ_pred.to(device),
+                output_origin_render.to(device),
+                output_points_render.to(device),
+                output_tindex_render.to(device),
+                [1, 16, 200, 200],
+                "test"
+            )
+            pred_dist *= _voxel_size
+
+        pred_pcds = get_rendered_pcds(
+            lidar_origin[0].cpu().numpy(),
+            lidar_endpts[0].cpu().numpy(),
+            lidar_tindex[0].cpu().numpy(),
+            pred_dist[0].cpu().numpy()
+        )
+        coord_index = coord_index[0, :, :].int().cpu()  # [N, 3]
+
+        pred_label = sem_pred[coord_index[:, 0], coord_index[:, 1], coord_index[:, 2]][:, None]  # [N, 1]        
+        pred_dist = pred_dist[0, :, None].cpu()
+
+        if instance_pred is not None:
+            pred_instance = instance_pred[coord_index[:, 0], coord_index[:, 1], coord_index[:, 2]][:, None]  # [N, 1]
+            pred_pcds = torch.cat([pred_label.float(), pred_instance.float(), pred_dist], dim=-1)
+        else:
+            pred_pcds = torch.cat([pred_label.float(), pred_dist], dim=-1)
+
+        pred_pcds_t.append(pred_pcds)
+
+    pred_pcds_t = torch.cat(pred_pcds_t, dim=0)
+   
+    return pred_pcds_t.numpy()
+
+# for pool.starmap when device is cpu
+def process_pred_gt(sem_pred, sem_gt, lidar_origins, lidar_rays, occ_class_names):
+    sem_pred = torch.from_numpy(np.reshape(sem_pred, [200, 200, 16]))
+    sem_gt = torch.from_numpy(np.reshape(sem_gt, [200, 200, 16]))
+
+    pcd_pred = process_one_sample(sem_pred, lidar_rays, lidar_origins, occ_class_names=occ_class_names)
+    pcd_gt = process_one_sample(sem_gt, lidar_rays, lidar_origins, occ_class_names=occ_class_names)
+
+    # evalute on non-free rays
+    valid_mask = (pcd_gt[:, 0].astype(np.int32) != len(occ_class_names) - 1)
+    pcd_pred = pcd_pred[valid_mask]
+    pcd_gt = pcd_gt[valid_mask]
+
+    assert pcd_pred.shape == pcd_gt.shape
+    
+    return pcd_pred, pcd_gt
+
+def calc_rayiou(pcd_pred_list, pcd_gt_list, occ_class_names):
+    thresholds = [1, 2, 4]
+
+    gt_cnt = np.zeros([len(occ_class_names)])
+    pred_cnt = np.zeros([len(occ_class_names)])
+    tp_cnt = np.zeros([len(thresholds), len(occ_class_names)])
+
+    for pcd_pred, pcd_gt in zip(pcd_pred_list, pcd_gt_list):
+        for j, threshold in enumerate(thresholds):
+            # L1
+            depth_pred = pcd_pred[:, 1]
+            depth_gt = pcd_gt[:, 1]
+            l1_error = np.abs(depth_pred - depth_gt)
+            tp_dist_mask = (l1_error < threshold)
+            
+            for i, cls in enumerate(occ_class_names):
+                cls_id = occ_class_names.index(cls)
+                cls_mask_pred = (pcd_pred[:, 0] == cls_id)
+                cls_mask_gt = (pcd_gt[:, 0] == cls_id)
+
+                gt_cnt_i = cls_mask_gt.sum()
+                pred_cnt_i = cls_mask_pred.sum()
+                if j == 0:
+                    gt_cnt[i] += gt_cnt_i
+                    pred_cnt[i] += pred_cnt_i
+
+                tp_cls = cls_mask_gt & cls_mask_pred  # [N]
+                tp_mask = np.logical_and(tp_cls, tp_dist_mask)
+                tp_cnt[j][i] += tp_mask.sum()
+    
+    iou_list = []
+    for j, threshold in enumerate(thresholds):
+        iou_list.append((tp_cnt[j] / (gt_cnt + pred_cnt - tp_cnt[j]))[:-1])
+
+    return iou_list
+
+
+def main_rayiou(sem_pred_list, sem_gt_list, lidar_origin_list, occ_class_names):
+    if device.type == 'cuda':
+        torch.cuda.empty_cache()
+
+    # generate lidar rays
+    lidar_rays = generate_lidar_rays()
+    lidar_rays = torch.from_numpy(lidar_rays)
+
+    if device.type == 'cuda':
+        pcd_pred_list, pcd_gt_list = [], []
+        for sem_pred, sem_gt, lidar_origins in tqdm(zip(sem_pred_list, sem_gt_list, lidar_origin_list), ncols=50):
+
+            pcd_pred, pcd_gt = process_pred_gt(sem_pred, sem_gt, lidar_origins, lidar_rays, occ_class_names)
+            
+            pcd_pred_list.append(pcd_pred)
+            pcd_gt_list.append(pcd_gt)
+
+    else:
+        pool = multiprocessing.Pool(multiprocessing.cpu_count())
+
+        iterable_input = zip(sem_pred_list, sem_gt_list, lidar_origin_list)
+        warpped_func = functools.partial(process_pred_gt, lidar_rays=lidar_rays, occ_class_names=occ_class_names)
+        result = pool.starmap(warpped_func, tqdm(iterable_input, total=len(sem_gt_list), ncols=50))
+
+        pool.close()
+        pool.join()
+
+        pcd_pred_list, pcd_gt_list = zip(*result)
+    
+    iou_list = calc_rayiou(pcd_pred_list, pcd_gt_list, occ_class_names)
+    rayiou = np.nanmean(iou_list)
+    rayiou_0 = np.nanmean(iou_list[0])
+    rayiou_1 = np.nanmean(iou_list[1])
+    rayiou_2 = np.nanmean(iou_list[2])
+    
+    table = PrettyTable([
+        'Class Names',
+        'RayIoU@1', 'RayIoU@2', 'RayIoU@4'
+    ])
+    table.float_format = '.3'
+
+    for i in range(len(occ_class_names) - 1):
+        table.add_row([
+            occ_class_names[i],
+            iou_list[0][i], iou_list[1][i], iou_list[2][i]
+        ], divider=(i == len(occ_class_names) - 2))
+    
+    table.add_row(['MEAN', rayiou_0, rayiou_1, rayiou_2])
+
+    print(table)
+
+    if device.type == 'cuda':
+        torch.cuda.empty_cache()
+
+    return {
+        'RayIoU': rayiou,
+        'RayIoU@1': rayiou_0,
+        'RayIoU@2': rayiou_1,
+        'RayIoU@4': rayiou_2,
+    }
+
+
+def main_raypq(sem_pred_list, sem_gt_list, inst_pred_list, inst_gt_list, lidar_origin_list, occ_class_names):
+    if device.type == 'cuda':
+        torch.cuda.empty_cache()
+
+    eval_metrics_pq = Metric_RayPQ(
+        occ_class_names=occ_class_names,
+        num_classes=len(occ_class_names),
+        thresholds=[1, 2, 4]
+    )
+
+    # generate lidar rays
+    lidar_rays = generate_lidar_rays()
+    lidar_rays = torch.from_numpy(lidar_rays)
+
+    for sem_pred, sem_gt, inst_pred, inst_gt, lidar_origins in \
+        tqdm(zip(sem_pred_list, sem_gt_list, inst_pred_list, inst_gt_list, lidar_origin_list), ncols=50):
+        sem_pred = torch.from_numpy(np.reshape(sem_pred, [200, 200, 16]))
+        sem_gt = torch.from_numpy(np.reshape(sem_gt, [200, 200, 16]))
+
+        inst_pred = torch.from_numpy(np.reshape(inst_pred, [200, 200, 16]))
+        inst_gt = torch.from_numpy(np.reshape(inst_gt, [200, 200, 16]))
+
+        pcd_pred = process_one_sample(sem_pred, lidar_rays, lidar_origins, instance_pred=inst_pred, occ_class_names=occ_class_names)
+        pcd_gt = process_one_sample(sem_gt, lidar_rays, lidar_origins, instance_pred=inst_gt, occ_class_names=occ_class_names)
+
+        # evalute on non-free rays
+        valid_mask = (pcd_gt[:, 0].astype(np.int32) != len(occ_class_names) - 1)
+        pcd_pred = pcd_pred[valid_mask]
+        pcd_gt = pcd_gt[valid_mask]
+
+        assert pcd_pred.shape == pcd_gt.shape
+        
+        sem_gt = pcd_gt[:, 0].astype(np.int32)
+        sem_pred = pcd_pred[:, 0].astype(np.int32)
+
+        instances_gt = pcd_gt[:, 1].astype(np.int32)
+        instances_pred = pcd_pred[:, 1].astype(np.int32)
+
+        # L1
+        depth_gt = pcd_gt[:, 2]
+        depth_pred = pcd_pred[:, 2]
+        l1_error = np.abs(depth_pred - depth_gt)
+
+        eval_metrics_pq.add_batch(sem_pred, sem_gt, instances_pred, instances_gt, l1_error)
+
+    if device.type == 'cuda':
+        torch.cuda.empty_cache()
+
+    return eval_metrics_pq.count_pq()
diff --git a/rayiou_metrics/rayiou_metrics/ray_pq.py b/rayiou_metrics/rayiou_metrics/ray_pq.py
new file mode 100644
index 0000000..de66860
--- /dev/null
+++ b/rayiou_metrics/rayiou_metrics/ray_pq.py
@@ -0,0 +1,193 @@
+import numpy as np
+from prettytable import PrettyTable
+
+
+class Metric_RayPQ:
+    def __init__(self,
+                 occ_class_names, 
+                 num_classes=18,
+                 thresholds=[1, 2, 4]):
+        """
+        Args:
+            ignore_index (llist): Class ids that not be considered in pq counting.
+        """
+        if num_classes == 18 or num_classes == 17:
+            self.class_names = occ_class_names
+        else:
+            raise ValueError
+        
+        self.num_classes = num_classes
+        self.id_offset = 2 ** 16
+        self.eps = 1e-5
+        self.thresholds = thresholds
+        
+        self.min_num_points = 10
+        self.include = np.array(
+            [n for n in range(self.num_classes - 1)],
+            dtype=int)
+        self.cnt = 0
+        
+        # panoptic stuff
+        self.pan_tp = np.zeros([len(self.thresholds), num_classes], dtype=int)
+        self.pan_iou = np.zeros([len(self.thresholds), num_classes], dtype=np.double)
+        self.pan_fp = np.zeros([len(self.thresholds), num_classes], dtype=int)
+        self.pan_fn = np.zeros([len(self.thresholds), num_classes], dtype=int)
+        
+    def add_batch(self,semantics_pred,semantics_gt,instances_pred,instances_gt, l1_error):
+        self.cnt += 1
+        self.add_panoptic_sample(semantics_pred, semantics_gt, instances_pred, instances_gt, l1_error) 
+    
+    def add_panoptic_sample(self, semantics_pred, semantics_gt, instances_pred, instances_gt, l1_error):
+        """Add one sample of panoptic predictions and ground truths for
+        evaluation.
+
+        Args:
+            semantics_pred (np.ndarray): Semantic predictions.
+            semantics_gt (np.ndarray): Semantic ground truths.
+            instances_pred (np.ndarray): Instance predictions.
+            instances_gt (np.ndarray): Instance ground truths.
+        """
+        # get instance_class_id from instance_gt
+        instance_class_ids = [self.num_classes - 1]
+        for i in range(1, instances_gt.max() + 1):
+            class_id = np.unique(semantics_gt[instances_gt == i])
+            # assert class_id.shape[0] == 1, "each instance must belong to only one class"
+            if class_id.shape[0] == 1:
+                instance_class_ids.append(class_id[0])
+            else:
+                instance_class_ids.append(self.num_classes - 1)
+        instance_class_ids = np.array(instance_class_ids)
+
+        instance_count = 1
+        final_instance_class_ids = []
+        final_instances = np.zeros_like(instances_gt)  # empty space has instance id "0"
+
+        for class_id in range(self.num_classes - 1):
+            if np.sum(semantics_gt == class_id) == 0:
+                continue
+
+            if self.class_names[class_id] in ['car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'motorcycle', 'bicycle', 'pedestrian']:
+                # treat as instances
+                for instance_id in range(len(instance_class_ids)):
+                    if instance_class_ids[instance_id] != class_id:
+                        continue
+                    final_instances[instances_gt == instance_id] = instance_count
+                    instance_count += 1
+                    final_instance_class_ids.append(class_id)
+            else:
+                # treat as semantics
+                final_instances[semantics_gt == class_id] = instance_count
+                instance_count += 1
+                final_instance_class_ids.append(class_id)
+                
+        instances_gt = final_instances
+        
+        # avoid zero (ignored label)
+        instances_pred = instances_pred + 1
+        instances_gt = instances_gt + 1
+        
+        for j, threshold in enumerate(self.thresholds):
+            tp_dist_mask = l1_error < threshold
+            # for each class (except the ignored ones)
+            for cl in self.include:
+                # get a class mask
+                pred_inst_in_cl_mask = semantics_pred == cl
+                gt_inst_in_cl_mask = semantics_gt == cl
+
+                # get instance points in class (makes outside stuff 0)
+                pred_inst_in_cl = instances_pred * pred_inst_in_cl_mask.astype(int)
+                gt_inst_in_cl = instances_gt * gt_inst_in_cl_mask.astype(int)
+
+                # generate the areas for each unique instance prediction
+                unique_pred, counts_pred = np.unique(
+                    pred_inst_in_cl[pred_inst_in_cl > 0], return_counts=True)
+                id2idx_pred = {id: idx for idx, id in enumerate(unique_pred)}
+                matched_pred = np.array([False] * unique_pred.shape[0])
+
+                # generate the areas for each unique instance gt_np
+                unique_gt, counts_gt = np.unique(
+                    gt_inst_in_cl[gt_inst_in_cl > 0], return_counts=True)
+                id2idx_gt = {id: idx for idx, id in enumerate(unique_gt)}
+                matched_gt = np.array([False] * unique_gt.shape[0])
+
+                # generate intersection using offset
+                valid_combos = np.logical_and(pred_inst_in_cl > 0,
+                                            gt_inst_in_cl > 0)
+                # add dist_mask
+                valid_combos = np.logical_and(valid_combos, tp_dist_mask)
+
+                id_offset_combo = pred_inst_in_cl[
+                    valid_combos] + self.id_offset * gt_inst_in_cl[valid_combos]
+                unique_combo, counts_combo = np.unique(
+                    id_offset_combo, return_counts=True)
+
+                # generate an intersection map
+                # count the intersections with over 0.5 IoU as TP
+                gt_labels = unique_combo // self.id_offset
+                pred_labels = unique_combo % self.id_offset
+                gt_areas = np.array([counts_gt[id2idx_gt[id]] for id in gt_labels])
+                pred_areas = np.array(
+                    [counts_pred[id2idx_pred[id]] for id in pred_labels])
+                intersections = counts_combo
+                unions = gt_areas + pred_areas - intersections
+                ious = intersections.astype(float) / unions.astype(float)
+
+                tp_indexes = ious > 0.5
+                self.pan_tp[j][cl] += np.sum(tp_indexes)
+                self.pan_iou[j][cl] += np.sum(ious[tp_indexes])
+
+                matched_gt[[id2idx_gt[id] for id in gt_labels[tp_indexes]]] = True
+                matched_pred[[id2idx_pred[id]
+                            for id in pred_labels[tp_indexes]]] = True
+
+                # count the FN
+                if len(counts_gt) > 0:
+                    self.pan_fn[j][cl] += np.sum(
+                        np.logical_and(counts_gt >= self.min_num_points,
+                                    ~matched_gt))
+
+                # count the FP
+                if len(matched_pred) > 0:
+                    self.pan_fp[j][cl] += np.sum(
+                        np.logical_and(counts_pred >= self.min_num_points,
+                                    ~matched_pred))
+    
+    def count_pq(self):
+        sq_all = self.pan_iou.astype(np.double) / np.maximum(
+            self.pan_tp.astype(np.double), self.eps)
+        rq_all = self.pan_tp.astype(np.double) / np.maximum(
+            self.pan_tp.astype(np.double) + 0.5 * self.pan_fp.astype(np.double)
+            + 0.5 * self.pan_fn.astype(np.double), self.eps)
+        pq_all = sq_all * rq_all
+        
+        # mask classes not occurring in dataset
+        mask = (self.pan_tp + self.pan_fp + self.pan_fn) > 0
+        pq_all[~mask] = float('nan')
+
+        table = PrettyTable([
+            'Class Names',
+            'RayPQ@%d' % self.thresholds[0],
+            'RayPQ@%d' % self.thresholds[1],
+            'RayPQ@%d' % self.thresholds[2]
+        ])
+        table.float_format = '.3'
+
+        for i in range(len(self.class_names) - 1):
+            table.add_row([
+                self.class_names[i],
+                pq_all[0][i], pq_all[1][i], pq_all[2][i],
+            ], divider=(i == len(self.class_names) - 2))
+        
+        table.add_row([
+            'MEAN',
+            np.nanmean(pq_all[0]), np.nanmean(pq_all[1]), np.nanmean(pq_all[2])
+        ])
+
+        print(table)
+
+        return {
+            'RayPQ': np.nanmean(pq_all),
+            'RayPQ@1': np.nanmean(pq_all[0]),
+            'RayPQ@2': np.nanmean(pq_all[1]),
+            'RayPQ@4': np.nanmean(pq_all[2]),
+        }
diff --git a/rayiou_metrics/setup.py b/rayiou_metrics/setup.py
new file mode 100644
index 0000000..dd21913
--- /dev/null
+++ b/rayiou_metrics/setup.py
@@ -0,0 +1,52 @@
+from setuptools import find_packages, setup
+from torch.utils.cpp_extension import BuildExtension, CUDAExtension, CppExtension
+import torch
+
+ext_modules = []
+
+if torch.cuda.is_available():
+    ext_modules.append(
+        CUDAExtension(
+            name='dvr_cuda_ops',
+            sources=[
+                'lib/dvr_cuda/dvr_cuda.cpp',
+                'lib/dvr_cuda/dvr_render_cuda.cu'
+            ],
+            extra_compile_args={'cxx': ['-g'], 'nvcc': ['-allow-unsupported-compiler']}
+        )
+    )
+
+ext_modules.append(
+    CppExtension(
+        name='dvr_cpu_ops',
+        sources=[
+            'lib/dvr_cpu/dvr.cpp',
+            'lib/dvr_cpu/dvr_render_cpu.cpp'
+        ]
+    )
+)
+
+setup(
+    name='rayiou_metrics',
+    version='1.0',
+    description='A package for calculating ray metrics with CPU and CUDA support.',
+    packages=find_packages(),
+    package_data={
+        'rayiou_metrics': ['ego_infos_val.pkl']  # Include .pkl in the package
+    },
+    include_package_data=True,
+    ext_modules=ext_modules,
+    cmdclass={
+        'build_ext': BuildExtension
+    },
+    install_requires=[
+        'torch',
+        'numpy'
+    ],
+    classifiers=[
+        'Programming Language :: Python :: 3',
+        'License :: OSI Approved :: MIT License',
+        'Operating System :: OS Independent',
+    ],
+    python_requires='>=3.6',
+)