facebookresearch · codepk37 · Nov 30, 2025 · Nov 30, 2025 · Dec 6, 2025
diff --git a/notebook/demo_aligned_pointmap.ipynb b/notebook/demo_aligned_pointmap.ipynb
@@ -0,0 +1,264 @@
+{
+    "cells": [
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "# Copyright (c) Meta Platforms, Inc. and affiliates."
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "## 1. Imports and Model Loading"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "import os\n",
+                "import uuid\n",
+                "import imageio\n",
+                "import numpy as np\n",
+                "import torch\n",
+                "from IPython.display import Image as ImageDisplay\n",
+                "from inference import Inference, ready_gaussian_for_video_rendering, load_image, load_masks, display_image, make_scene, render_video, interactive_visualizer\n",
+                "import imageio.v3 as iio\n",
+                "from pytorch3d.transforms import quaternion_to_matrix, Transform3d\n"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "TAG = \"hf\"  # checkpoint folder name under ../checkpoints\n",
+                "PATH = os.getcwd()  # every path below is built from the working directory\n",
+                "config_path = f\"{PATH}/../checkpoints/{TAG}/pipeline.yaml\"\n",
+                "inference = Inference(config_path, compile=False)"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "## 2. Load input image to lift to 3D (multiple objects)"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "IMAGE_PATH = f\"{PATH}/images/nocs_0003_0354/rgb.png\"\n",
+                "scene_dir = os.path.dirname(IMAGE_PATH)  # folder passed to load_masks below\n",
+                "IMAGE_NAME = os.path.basename(scene_dir)\n",
+                "\n",
+                "image = load_image(IMAGE_PATH)\n",
+                "masks = load_masks(scene_dir, extension=\".png\")\n",
+                "display_image(image, masks)"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "## 3. Generating Pointmap from Depth Image"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "depth_path = f\"{PATH}/images/nocs_0003_0354/depth.png\"\n",
+                "depth = iio.imread(depth_path).astype(np.float32)\n",
+                "depth = depth / 1000.0 #convert to mm -> m\n",
+                "depth[depth <= 0] = np.nan  \n",
+                "\n",
+                "H, W = depth.shape\n",
+                "\n",
+                "K = np.array([\n",
+                "    [591.012500, 0.0,      322.525000],\n",
+                "    [0.0,        590.167750, 244.110840],\n",
+                "    [0.0,        0.0,        1.0]\n",
+                "], dtype=np.float32)\n",
+                "\n",
+                "fx = K[0, 0]\n",
+                "fy = K[1, 1]\n",
+                "cx = K[0, 2]\n",
+                "cy = K[1, 2]\n",
+                "\n",
+                "u = np.arange(W)\n",
+                "v = np.arange(H)\n",
+                "uu, vv = np.meshgrid(u, v)\n",
+                "\n",
+                "Z = depth\n",
+                "X = (uu - cx) * Z / fx\n",
+                "Y = (vv - cy) * Z / fy\n",
+                "\n",
+                "# Convert to right-handed PyTorch3D coordinates\n",
+                "pointmap = np.stack([-X, -Y, Z], axis=-1)\n",
+                "pointmaP = torch.tensor(pointmap, dtype=torch.float32)"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "## 4. Generate Gaussian Splats"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "outputs = [inference(image, mask, seed=42,pointmap=pointmaP) for mask in masks]"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "## 5. Mesh Alignment & Coordinate Frame Conversion"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "R_yup_to_zup = torch.tensor([[-1,0,0],[0,0,1],[0,1,0]], dtype=torch.float32)\n",
+                "R_flip_z = torch.tensor([[1,0,0],[0,1,0],[0,0,-1]], dtype=torch.float32)\n",
+                "R_pytorch3d_to_cam = torch.tensor([[-1,0,0],[0,-1,0],[0,0,1]], dtype=torch.float32)\n",
+                "\n",
+                "def transform_mesh_vertices(vertices, rotation, translation, scale):\n",
+                "    \"\"\"Apply the predicted object pose to mesh vertices and return them as an (N, 3) tensor.\n",
+                "\n",
+                "    Order: R_flip_z, R_yup_to_zup, then scale -> rotation (quaternion) -> translation,\n",
+                "    and finally R_pytorch3d_to_cam. `vertices` may be a NumPy array or a tensor.\n",
+                "    Reads the three module-level matrices defined above.\n",
+                "    \"\"\"\n",
+                "    if isinstance(vertices, np.ndarray):\n",
+                "        vertices = torch.tensor(vertices, dtype=torch.float32)\n",
+                "    vertices = vertices.unsqueeze(0)  # add batch dimension -> [1, N, 3]\n",
+                "    vertices = vertices @ R_flip_z.to(vertices.device)\n",
+                "    vertices = vertices @ R_yup_to_zup.to(vertices.device)\n",
+                "    R_mat = quaternion_to_matrix(rotation.to(vertices.device))\n",
+                "    tfm = (\n",
+                "        Transform3d(dtype=vertices.dtype, device=vertices.device)\n",
+                "        .scale(scale)\n",
+                "        .rotate(R_mat)\n",
+                "        .translate(translation[0], translation[1], translation[2])\n",
+                "    )\n",
+                "    vertices_world = tfm.transform_points(vertices)\n",
+                "    vertices_world = vertices_world @ R_pytorch3d_to_cam.to(vertices_world.device)\n",
+                "    return vertices_world[0]  # remove batch dimension\n",
+                "\n",
+                "\n",
+                "for i, out in enumerate(outputs):\n",
+                "    # Copy so re-running this cell never transforms out[\"glb\"] twice (assumes a trimesh object with .copy()).\n",
+                "    mesh = out[\"glb\"].copy()\n",
+                "    S = out[\"scale\"][0].cpu().float()\n",
+                "    T = out[\"translation\"][0].cpu().float()\n",
+                "    R = out[\"rotation\"].squeeze().cpu().float()\n",
+                "\n",
+                "    vertices_transformed = transform_mesh_vertices(mesh.vertices, R, T, S)\n",
+                "    mesh.vertices = vertices_transformed.cpu().numpy().astype(np.float32)\n",
+                "    save_path = f\"{PATH}/meshes/multi/{IMAGE_NAME}/object_{i}.ply\"\n",
+                "    os.makedirs(os.path.dirname(save_path), exist_ok=True)\n",
+                "    mesh.export(save_path)"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "## 6. Visualize Gaussian Splat of the Scene\n",
+                "### a. Animated Gif"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "scene_gs = make_scene(*outputs)\n",
+                "scene_gs = ready_gaussian_for_video_rendering(scene_gs, fix_alignment=False)\n",
+                "\n",
+                "# export gaussian splatting (as point cloud); create the folder first, as done for the meshes\n",
+                "os.makedirs(f\"{PATH}/gaussians/multi\", exist_ok=True)\n",
+                "scene_gs.save_ply(f\"{PATH}/gaussians/multi/{IMAGE_NAME}.ply\")\n",
+                "\n",
+                "video = render_video(\n",
+                "    scene_gs,\n",
+                "    r=1,\n",
+                "    fov=60,\n",
+                "    resolution=512,\n",
+                ")[\"color\"]\n",
+                "\n",
+                "# save video as gif\n",
+                "imageio.mimsave(\n",
+                "    f\"{PATH}/gaussians/multi/{IMAGE_NAME}.gif\",\n",
+                "    video,\n",
+                "    format=\"GIF\",\n",
+                "    duration=1000 / 30,  # frame duration assuming 30 fps\n",
+                "    loop=0,  # 0 means loop indefinitely\n",
+                ")\n",
+                "# notebook display; the random query string keeps a stale cached GIF from being shown\n",
+                "ImageDisplay(url=f\"gaussians/multi/{IMAGE_NAME}.gif?cache_invalidator={uuid.uuid4()}\",)"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### b. Interactive Visualizer"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "# might take a while to load (black screen); opens the scene .ply exported to gaussians/multi above\n",
+                "interactive_visualizer(f\"{PATH}/gaussians/multi/{IMAGE_NAME}.ply\")"
+            ]
+        }
+    ],
+    "metadata": {
+        "kernelspec": {
+            "display_name": "sam3d-objects",
+            "language": "python",
+            "name": "python3"
+        },
+        "language_info": {
+            "codemirror_mode": {
+                "name": "ipython",
+                "version": 3
+            },
+            "file_extension": ".py",
+            "mimetype": "text/x-python",
+            "name": "python",
+            "nbconvert_exporter": "python",
+            "pygments_lexer": "ipython3",
+            "version": "3.11.0"
+        }
+    },
+    "nbformat": 4,
+    "nbformat_minor": 2
+}
diff --git a/notebook/images/nocs_0003_0354/0.png b/notebook/images/nocs_0003_0354/0.png
diff --git a/notebook/images/nocs_0003_0354/1.png b/notebook/images/nocs_0003_0354/1.png
diff --git a/notebook/images/nocs_0003_0354/2.png b/notebook/images/nocs_0003_0354/2.png
diff --git a/notebook/images/nocs_0003_0354/3.png b/notebook/images/nocs_0003_0354/3.png
diff --git a/notebook/images/nocs_0003_0354/4.png b/notebook/images/nocs_0003_0354/4.png
diff --git a/notebook/images/nocs_0003_0354/5.png b/notebook/images/nocs_0003_0354/5.png
diff --git a/notebook/images/nocs_0003_0354/cam_K.txt b/notebook/images/nocs_0003_0354/cam_K.txt
@@ -0,0 +1,3 @@
+591.012500 0.000000 322.525000
+0.000000 590.167750 244.110840
+0.000000 0.000000 1.000000
diff --git a/notebook/images/nocs_0003_0354/depth.png b/notebook/images/nocs_0003_0354/depth.png
diff --git a/notebook/images/nocs_0003_0354/label.png b/notebook/images/nocs_0003_0354/label.png
diff --git a/notebook/images/nocs_0003_0354/rgb.png b/notebook/images/nocs_0003_0354/rgb.png