diff --git a/DATA.md b/DATA.md index 643b538..92a22dd 100644 --- a/DATA.md +++ b/DATA.md @@ -10,6 +10,8 @@ We list the available data used in the current version of CrossOver in the table | ------------ | ----------------------------- | ----------------------------------- | -------------------------- | -------------------------- | | ScanNet | `[point, rgb, cad, referral]` | `[point, rgb, floorplan, referral]` | ❌ | ✅ | | 3RScan | `[point, rgb, referral]` | `[point, rgb, referral]` | ✅ | ✅ | +| ARKitScenes | `[point, rgb, referral]` | `[point, rgb, referral]` | ❌ | ✅ | +| MultiScan | `[point, rgb, referral]` | `[point, rgb, referral]` | ❌ | ✅ | We detail data download and release instructions for preprocessing with scripts for ScanNet + 3RScan. @@ -110,4 +112,69 @@ Scan3R/ | │ ├── objectsDataMultimodal.pt -> object data combined from data1D.pt + data2D.pt + data3D.pt (for easier loading) | │ └── sel_cams_on_mesh.png (visualisation of the cameras selected for computing RGB features per scan) | └── ... -``` \ No newline at end of file +``` +### MultiScan + +#### Running preprocessing scripts +Adjust the path parameters of `MultiScan` in the config files under `configs/preprocess`. Run the following (after changing the `--config-path` in the bash file): + +```bash +$ bash scripts/preprocess/process_multiscan.sh +``` + +Our script for MultiScan dataset performs the following additional processing: + +- 3D-to-2D projection for 2D segmentation and stores as `gt-projection-seg.pt` for each scan. + +Post running preprocessing, the data structure should look like the following: + +``` +MultiScan/ +├── objects_chunked/ (object data chunked into hdf5 format for instance baseline training) +| ├── train_objects.h5 +| └── val_objects.h5 +├── scans/ +| ├── scene_00000_00/ +| │ ├── gt-projection-seg.pt -> 3D-to-2D projected data consisting of framewise 2D instance segmentation +| │ ├── data1D.pt -> all 1D data + encoded (object referrals + BLIP features) +| │ ├── data2D.pt -> all 2D data + encoded (RGB + floorplan + DinoV2 features) +| │ ├── data2D_all_images.pt (RGB features of every image of every scan) +| │ ├── data3D.pt -> all 3D data + encoded (Point Cloud + I2PMAE features - object only) +| │ ├── object_id_to_label_id_map.pt -> Instance ID to NYU40 Label mapped +| │ ├── objectsDataMultimodal.pt -> object data combined from data1D.pt + data2D.pt + data3D.pt (for easier loading) +| │ └── sel_cams_on_mesh.png (visualisation of the cameras selected for computing RGB features per scan) +| └── ... +``` + +### ARKitScenes + +#### Running preprocessing scripts +Adjust the path parameters of `ARKitScenes` in the config files under `configs/preprocess`. Run the following (after changing the `--config-path` in the bash file): + +```bash +$ bash scripts/preprocess/process_arkit.sh +``` + +Our script for ARKitScenes dataset performs the following additional processing: + +- 3D-to-2D projection for 2D segmentation and stores as `gt-projection-seg.pt` for each scan. 
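+
+To sanity-check the preprocessed output of a single scan, a minimal sketch such as the one below may help. The scan id `40753679`, the paths, and the exact key layout are assumptions based on the per-scan files listed below and on `data/datasets/scanbase.py`; adjust them to your setup:
+
+```python
+import os.path as osp
+import torch
+
+process_dir = "/path/to/processed/ARKitScenes"  # processed dataset root (placeholder)
+scan_dir = osp.join(process_dir, "scans", "40753679")
+
+# Frame-wise 2D instance segmentation produced by the 3D-to-2D projection step
+gt_proj = torch.load(osp.join(scan_dir, "gt-projection-seg.pt"))
+print(type(gt_proj))
+
+# Encoded 3D data; the 'scene' keys mirror those read in data/datasets/scanbase.py
+data_3d = torch.load(osp.join(scan_dir, "data3D.pt"))
+scene = data_3d["scene"]
+print(scene["pcl_coords"].shape, scene["pcl_feats"].shape, scene["scene_label"])
+```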
+ +Post running preprocessing, the data structure should look like the following: + +``` +ARKitScenes/ +├── objects_chunked/ (object data chunked into hdf5 format for instance baseline training) +| ├── train_objects.h5 +| └── val_objects.h5 +├── scans/ +| ├── 40753679/ +| │ ├── gt-projection-seg.pt -> 3D-to-2D projected data consisting of framewise 2D instance segmentation +| │ ├── data1D.pt -> all 1D data + encoded (object referrals + BLIP features) +| │ ├── data2D.pt -> all 2D data + encoded (RGB + floorplan + DinoV2 features) +| │ ├── data2D_all_images.pt (RGB features of every image of every scan ) +| │ ├── data3D.pt -> all 3D data + encoded (Point Cloud + I2PMAE features - object only) +| │ ├── object_id_to_label_id_map.pt -> Instance ID to NYU40 Label mapped +| │ ├── objectsDataMultimodal.pt -> object data combined from data1D.pt + data2D.pt + data3D.pt (for easier loading) +| │ └── sel_cams_on_mesh.png (visualisation of the cameras selected for computing RGB features per scan) +| └── ... +``` diff --git a/README.md b/README.md index 1cb1030..c133ec5 100644 --- a/README.md +++ b/README.md @@ -118,6 +118,9 @@ See [DATA.MD](DATA.md) for detailed instructions on data download, preparation a | ------------ | ----------------------------- | ----------------------------------- | -------------------------- | -------------------------- | | Scannet | `[point, rgb, cad, referral]` | `[point, rgb, floorplan, referral]` | ❌ | ✅ | | 3RScan | `[point, rgb, referral]` | `[point, rgb, referral]` | ✅ | ✅ | +| ARKitScenes | `[point, rgb, referral]` | `[point, rgb, referral]` | ❌ | ✅ | +| MultiScan | `[point, rgb, referral]` | `[point, rgb, referral]` | ❌ | ✅ | + > To run our demo, you only need to download generated embedding data; no need for any data preprocessing. @@ -134,7 +137,7 @@ Various configurable parameters: - `--database_path`: Path to the precomputed embeddings of the database scenes downloaded before (eg: `./release_data/embed_scannet.pt`). - `--query_modality`: Modality of the query scene, Options: `point`, `rgb`, `floorplan`, `referral` - `--database_modality`: Modality used for retrieval. Same options as above. -- `--ckpt`: Path to the pre-trained scene crossover model checkpoint (details [here](#checkpoints)), example_path: `./checkpoints/scene_crossover_scannet+scan3r.pth/`). +- `--ckpt`: Path to the pre-trained scene crossover model checkpoint (details [here](#checkpoints)), example_path: `./checkpoints/scene_crossover_scannet+scan3r.pth/`. For embedding and pre-trained model download, refer to [generated embedding data](DATA.md#generated-embedding-data) and [checkpoints](#checkpoints) sections. diff --git a/TRAIN.md b/TRAIN.md index fd56dcd..5520b7d 100644 --- a/TRAIN.md +++ b/TRAIN.md @@ -21,7 +21,7 @@ $ bash scripts/train/train_instance_crossover.sh ``` #### Train Scene Retrieval Pipeline -Adjust path/configuration parameters in `configs/train/train_scene_crossover.yaml`. You can also add your customised dataset or choose to train on Scannet & 3RScan or either. Run the following: +Adjust path/configuration parameters in `configs/train/train_scene_crossover.yaml`. You can also add your customised dataset or choose to train on Scannet, 3RScan, MultiScan, & ARKitScenes or any combination of the same. 
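+
+For example, instead of editing the YAML by hand, the dataset selection can also be toggled programmatically with OmegaConf (a sketch; it assumes the config remains plain OmegaConf YAML and uses the `task.UnifiedTrain.train`/`val` keys from `configs/train/train_scene_crossover.yaml`):
+
+```python
+from omegaconf import OmegaConf
+
+cfg = OmegaConf.load("configs/train/train_scene_crossover.yaml")
+
+# Train/validate on any combination of the supported datasets
+cfg.task.UnifiedTrain.train = ["Scannet", "Scan3R", "MultiScan", "ARKitScenes"]
+cfg.task.UnifiedTrain.val = ["Scannet", "Scan3R", "MultiScan", "ARKitScenes"]
+
+OmegaConf.save(cfg, "configs/train/train_scene_crossover.yaml")
+```
+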
Run the following: ```bash $ bash scripts/train/train_scene_crossover.sh diff --git a/configs/evaluation/eval_instance.yaml b/configs/evaluation/eval_instance.yaml index a14c626..2b2310b 100644 --- a/configs/evaluation/eval_instance.yaml +++ b/configs/evaluation/eval_instance.yaml @@ -43,14 +43,33 @@ data : max_object_len : 150 voxel_size : 0.02 + ARKitScenes: + base_dir : /media/sayan/Expansion/data/datasets/ARKitScenes + process_dir : ${data.process_dir}/ARKitScenes/ + processor3D : ARKitScenes3DProcessor + processor2D : ARKitScenes2DProcessor + processor1D : ARKitScenes1DProcessor + avail_modalities : ['point', 'cad', 'rgb', 'referral'] + max_object_len : 150 + voxel_size : 0.02 + + MultiScan: + base_dir : /media/sayan/Expansion/data/datasets/MultiScan + process_dir : ${data.process_dir}/MultiScan + processor3D : MultiScan3DProcessor + processor2D : MultiScan2DProcessor + processor1D : MultiScan1DProcessor + avail_modalities : ['point', 'cad', 'rgb', 'referral'] + max_object_len : 150 + voxel_size : 0.02 + task: name : InferenceObjectRetrieval InferenceObjectRetrieval: val : [Scannet] modalities : ['rgb', 'point', 'cad', 'referral'] scene_modalities : ['rgb', 'point', 'referral', 'floorplan'] - ckpt_path : /drive/dumps/multimodal-spaces/runs/release_runs/instance_crossover_scannet+scan3r.pth - + ckpt_path : /drive/dumps/multimodal-spaces/runs/release_runs/instance_crossover_scannet+scan3r+multiscan.pth inference_module: ObjectRetrieval diff --git a/configs/evaluation/eval_scene.yaml b/configs/evaluation/eval_scene.yaml index 0f1b6f2..a666183 100644 --- a/configs/evaluation/eval_scene.yaml +++ b/configs/evaluation/eval_scene.yaml @@ -43,13 +43,32 @@ data : max_object_len : 150 voxel_size : 0.02 + ARKitScenes: + base_dir : /media/sayan/Expansion/data/datasets/ARKitScenes + process_dir : ${data.process_dir}/ARKitScenes/ + processor3D : ARKitScenes3DProcessor + processor2D : ARKitScenes2DProcessor + processor1D : ARKitScenes1DProcessor + max_object_len : 150 + voxel_size : 0.02 + avail_modalities : ['point', 'cad', 'rgb', 'referral'] + MultiScan: + base_dir : /media/sayan/Expansion/data/datasets/MultiScan + process_dir : ${data.process_dir}/MultiScan + processor3D : MultiScan3DProcessor + processor2D : MultiScan2DProcessor + processor1D : MultiScan1DProcessor + avail_modalities : ['point', 'cad', 'rgb', 'referral'] + max_object_len : 150 + voxel_size : 0.02 + task: name : InferenceSceneRetrieval InferenceSceneRetrieval: val : [Scannet] modalities : ['rgb', 'point', 'cad', 'referral'] scene_modalities : ['rgb', 'point', 'referral', 'floorplan'] #, 'point'] - ckpt_path : /drive/dumps/multimodal-spaces/runs/release_runs/scene_crossover_scannet+scan3r.pth + ckpt_path : /drive/dumps/multimodal-spaces/runs/release_runs/scene_crossover_scannet+scan3r+multiscan.pth inference_module: SceneRetrieval model: diff --git a/configs/preprocess/process_1d.yaml b/configs/preprocess/process_1d.yaml index c74b6bc..230643f 100644 --- a/configs/preprocess/process_1d.yaml +++ b/configs/preprocess/process_1d.yaml @@ -25,6 +25,28 @@ data: label_filename : labels.instances.align.annotated.v2.ply skip_frames : 1 + ARKitScenes: + base_dir : /media/sayan/Expansion/data/datasets/ARKitScenes + process_dir : ${data.process_dir}/ARKitScenes/ + processor3D : ARKitScenes3DProcessor + processor2D : ARKitScenes2DProcessor + processor1D : ARKitScenes1DProcessor + skip_frames : 1 + MultiScan: + base_dir : /media/sayan/Expansion/data/datasets/MultiScan + process_dir : ${data.process_dir}/MultiScan + processor3D : 
MultiScan3DProcessor + processor2D : MultiScan2DProcessor + processor1D : MultiScan1DProcessor + skip_frames : 1 + + Structured3D: + base_dir : /Users/gauravpradeep/CrossOver_ScaleUp/Structured3D/ + process_dir : ${data.process_dir}/Structured3D/scans + processor3D : Structured3D_3DProcessor + processor2D : Structured3D_2DProcessor + processor1D : Structured3D_1DProcessor + skip_frames : 1 Shapenet: base_dir : /drive/datasets/Shapenet/ShapeNetCore.v2/ diff --git a/configs/preprocess/process_2d.yaml b/configs/preprocess/process_2d.yaml index 74898cd..fd3422a 100644 --- a/configs/preprocess/process_2d.yaml +++ b/configs/preprocess/process_2d.yaml @@ -27,6 +27,29 @@ data: label_filename : labels.instances.align.annotated.v2.ply skip_frames : 1 + ARKitScenes: + base_dir : /media/sayan/Expansion/data/datasets/ARKitScenes + process_dir : ${data.process_dir}/ARKitScenes/ + processor3D : ARKitScenes3DProcessor + processor2D : ARKitScenes2DProcessor + processor1D : ARKitScenes1DProcessor + skip_frames : 1 + MultiScan: + base_dir : /media/sayan/Expansion/data/datasets/MultiScan + process_dir : ${data.process_dir}/MultiScan + processor3D : MultiScan3DProcessor + processor2D : MultiScan2DProcessor + processor1D : MultiScan1DProcessor + skip_frames : 1 + + Structured3D: + base_dir : /Users/gauravpradeep/CrossOver_ScaleUp/Structured3D/ + process_dir : ${data.process_dir}/Structured3D/scans + processor3D : Structured3D_3DProcessor + processor2D : Structured3D_2DProcessor + processor1D : Structured3D_1DProcessor + skip_frames : 1 + modality_info: 1D : feature_extractor: @@ -60,4 +83,4 @@ task: name : Preprocess Preprocess : modality : '2D' - splits : ['val'] \ No newline at end of file + splits : ['train', 'val'] \ No newline at end of file diff --git a/configs/preprocess/process_3d.yaml b/configs/preprocess/process_3d.yaml index 3d15f23..2a405c5 100644 --- a/configs/preprocess/process_3d.yaml +++ b/configs/preprocess/process_3d.yaml @@ -24,6 +24,28 @@ data: processor1D : Scan3R1DProcessor label_filename : labels.instances.align.annotated.v2.ply + ARKitScenes: + base_dir : /media/sayan/Expansion/data/datasets/ARKitScenes + process_dir : ${data.process_dir}/ARKitScenes/ + processor3D : ARKitScenes3DProcessor + processor2D : ARKitScenes2DProcessor + processor1D : ARKitScenes1DProcessor + + MultiScan: + base_dir : /media/sayan/Expansion/data/datasets/MultiScan + process_dir : ${data.process_dir}/MultiScan + processor3D : MultiScan3DProcessor + processor2D : MultiScan2DProcessor + processor1D : MultiScan1DProcessor + skip_frames : 1 + + Structured3D: + base_dir : /Users/gauravpradeep/CrossOver_ScaleUp/Structured3D/ + process_dir : ${data.process_dir}/Structured3D/scans + processor3D : Structured3D_3DProcessor + processor2D : Structured3D_2DProcessor + processor1D : Structured3D_1DProcessor + modality_info: 1D : feature_extractor: diff --git a/configs/preprocess/process_multimodal.yaml b/configs/preprocess/process_multimodal.yaml index 3eb5ace..0806365 100644 --- a/configs/preprocess/process_multimodal.yaml +++ b/configs/preprocess/process_multimodal.yaml @@ -28,6 +28,33 @@ data: skip_frames : 1 avail_modalities : ['point', 'rgb', 'referral'] + ARKitScenes: + base_dir : /media/sayan/Expansion/data/datasets/ARKitScenes + process_dir : ${data.process_dir}/ARKitScenes/ + chunked_dir : ${data.process_dir}/ARKitScenes/objects_chunked + processor3D : ARKitScenes3DProcessor + processor2D : ARKitScenes2DProcessor + processor1D : ARKitScenes1DProcessor + avail_modalities : ['point', 'rgb', 'referral'] + + MultiScan: + 
base_dir : /media/sayan/Expansion/data/datasets/MultiScan + process_dir : ${data.process_dir}/MultiScan/ + chunked_dir : ${data.process_dir}/MultiScan/objects_chunked + processor3D : Scan3R3DProcessor + processor2D : Scan3R2DProcessor + processor1D : Scan3R1DProcessor + avail_modalities : ['point', 'rgb', 'referral'] + + Structured3D: + base_dir : /Users/gauravpradeep/CrossOver_ScaleUp/Structured3D + process_dir : ${data.process_dir}/Structured3D/scans + chunked_dir : ${data.process_dir}/Structured3D/objects_chunked + processor3D : Structured3D_3DProcessor + processor2D : Structured3D_2DProcessor + processor1D : Structured3D_1DProcessor + avail_modalities : ['point', 'rgb', 'referral'] + modality_info: 1D : feature_extractor: diff --git a/configs/train/train_instance_baseline.yaml b/configs/train/train_instance_baseline.yaml index 8b6bc89..ee70d74 100644 --- a/configs/train/train_instance_baseline.yaml +++ b/configs/train/train_instance_baseline.yaml @@ -44,6 +44,27 @@ data : max_object_len : 150 voxel_size : 0.02 + ARKitScenes: + base_dir : /media/sayan/Expansion/data/datasets/ARKitScenes + process_dir : ${data.process_dir}/ARKitScenes/ + chunked_dir : ${data.process_dir}/ARKitScenes/objects_chunked + processor3D : ARKitScenes3DProcessor + processor2D : ARKitScenes2DProcessor + processor1D : ARKitScenes1DProcessor + avail_modalities : ['point', 'rgb', 'referral'] + max_object_len : 150 + voxel_size : 0.02 + MultiScan: + base_dir : /media/sayan/Expansion/data/datasets/Multiscan + process_dir : ${data.process_dir}/MultiScan/ + chunked_dir : ${data.process_dir}/MultiScan/objects_chunked + processor3D : MultiScan3DProcessor + processor2D : MultiScan2DProcessor + processor1D : MultiScan1DProcessor + avail_modalities : ['point', 'rgb', 'referral'] + max_object_len : 150 + voxel_size : 0.02 + task: name : ObjectLevelGrounding ObjectLevelGrounding : diff --git a/configs/train/train_instance_crossover.yaml b/configs/train/train_instance_crossover.yaml index c54257d..35a6a15 100644 --- a/configs/train/train_instance_crossover.yaml +++ b/configs/train/train_instance_crossover.yaml @@ -44,12 +44,33 @@ data : max_object_len : 150 voxel_size : 0.02 + ARKitScenes: + base_dir : /media/sayan/Expansion/data/datasets/ARKitScenes + process_dir : ${data.process_dir}/ARKitScenes/ + chunked_dir : ${data.process_dir}/ARKitScenes/objects_chunked + processor3D : ARKitScenes3DProcessor + processor2D : ARKitScenes2DProcessor + processor1D : ARKitScenes1DProcessor + avail_modalities : ['point', 'cad', 'rgb', 'referral'] + max_object_len : 150 + voxel_size : 0.02 + MultiScan: + base_dir : /media/sayan/Expansion/data/datasets/Multiscan + process_dir : ${data.process_dir}/MultiScan/ + chunked_dir : ${data.process_dir}/MultiScan/objects_chunked + processor3D : MultiScan3DProcessor + processor2D : MultiScan2DProcessor + processor1D : MultiScan1DProcessor + avail_modalities : ['point', 'cad', 'rgb', 'referral'] + max_object_len : 150 + voxel_size : 0.02 + task: name : SceneLevelGrounding SceneLevelGrounding : modalities : ['rgb', 'point', 'cad', 'referral'] - train : [Scannet, Scan3R] - val : [Scannet, Scan3R] + train : [Scannet, Scan3R, MultiScan, ARKitScenes] + val : [Scannet, Scan3R, MultiScan, ARKitScenes] trainer: GroundingTrainer diff --git a/configs/train/train_scene_crossover.yaml b/configs/train/train_scene_crossover.yaml index f9459da..9886e95 100644 --- a/configs/train/train_scene_crossover.yaml +++ b/configs/train/train_scene_crossover.yaml @@ -44,14 +44,35 @@ data : max_object_len : 150 voxel_size : 0.02 + 
ARKitScenes: + base_dir : /media/sayan/Expansion/data/datasets/ARKitScenes + process_dir : ${data.process_dir}/ARKitScenes/ + chunked_dir : ${data.process_dir}/ARKitScenes/objects_chunked + processor3D : ARKitScenes3DProcessor + processor2D : ARKitScenes2DProcessor + processor1D : ARKitScenes1DProcessor + avail_modalities : ['point', 'cad', 'rgb', 'referral'] + max_object_len : 150 + voxel_size : 0.02 + MultiScan: + base_dir : /media/sayan/Expansion/data/datasets/Multiscan + process_dir : ${data.process_dir}/MultiScan/ + chunked_dir : ${data.process_dir}/MultiScan/objects_chunked + processor3D : MultiScan3DProcessor + processor2D : MultiScan2DProcessor + processor1D : MultiScan1DProcessor + avail_modalities : ['point', 'cad', 'rgb', 'referral'] + max_object_len : 150 + voxel_size : 0.02 + task: name : UnifiedTrain UnifiedTrain : modalities : ['rgb', 'point', 'cad', 'referral'] scene_modalities : ['rgb', 'point', 'floorplan', 'referral'] - train : [Scannet, Scan3R, MultiScan] - val : [Scannet, Scan3R, MultiScan] - object_enc_ckpt : /drive/dumps/multimodal-spaces/runs/release_runs/instance_crossover_scannet+scan3r+multiscan.pth + train : [Scannet, Scan3R, MultiScan, ARKitScenes] + val : [Scannet, Scan3R, MultiScan, ARKitScenes] + object_enc_ckpt : /drive/dumps/multimodal-spaces/runs/release_runs/instance_crossover_scannet+scan3r+multiscan+arkitscenes.pth trainer: UnifiedTrainer @@ -78,7 +99,7 @@ model: base_modality : 'rgb' dataloader: - batch_size : 16 + batch_size : 32 num_workers : 6 eval: diff --git a/data/datasets/__init__.py b/data/datasets/__init__.py index 9a1b744..7db5e81 100644 --- a/data/datasets/__init__.py +++ b/data/datasets/__init__.py @@ -1,2 +1,5 @@ from .scannet import * -from .scan3r import * \ No newline at end of file +from .scan3r import * +from .arkit import * +from .multiscan import * +from .structured3d import * \ No newline at end of file diff --git a/data/datasets/arkit.py b/data/datasets/arkit.py new file mode 100644 index 0000000..4944dae --- /dev/null +++ b/data/datasets/arkit.py @@ -0,0 +1,41 @@ +import os.path as osp +import numpy as np +from typing import List, Any +from omegaconf import DictConfig +import pandas as pd +from ..build import DATASET_REGISTRY +from .scanbase import ScanObjectBase, ScanBase + +@DATASET_REGISTRY.register() +class ARKitScenesObject(ScanObjectBase): + """ARKitScenes dataset class for instance level baseline""" + def __init__(self, data_config: DictConfig, split: str) -> None: + super().__init__(data_config, split) + +@DATASET_REGISTRY.register() +class ARKitScenes(ScanBase): + """ARKitScenes dataset class""" + def __init__(self, data_config: DictConfig, split: str) -> None: + super().__init__(data_config, split) + + filepath = osp.join(self.files_dir, '{}_scans.txt'.format(self.split)) + self.scan_ids = np.genfromtxt(filepath, dtype = str) + + def get_temporal_scan_pairs(self): + """Groups scans into temporal pairs based on shared visit_id.""" + csv_path=osp.join(self.files_dir,'3dod_train_val_splits.csv') + df = pd.read_csv(csv_path) + + df = df[df["visit_id"].notna()] + + grouped_scans = df.groupby("visit_id")["video_id"].apply(list).to_dict() + + scene_pairs = [] + for video_ids in grouped_scans.values(): + if len(video_ids) > 1: + ref_scan_id = video_ids[0] # First video_id as reference + rescan_list = [{"scan_id": rescan_id} for rescan_id in video_ids[1:]] + + scene_pairs.append([ref_scan_id, rescan_list]) + + return scene_pairs \ No newline at end of file diff --git a/data/datasets/multiscan.py b/data/datasets/multiscan.py new 
file mode 100644 index 0000000..a43d8a1 --- /dev/null +++ b/data/datasets/multiscan.py @@ -0,0 +1,42 @@ +import os.path as osp +import numpy as np +from typing import List, Any +from omegaconf import DictConfig + +from ..build import DATASET_REGISTRY +from .scanbase import ScanObjectBase, ScanBase + +@DATASET_REGISTRY.register() +class MultiScanObject(ScanObjectBase): + """MultiScan dataset class for instance level baseline""" + def __init__(self, data_config: DictConfig, split: str) -> None: + super().__init__(data_config, split) + +@DATASET_REGISTRY.register() +class MultiScan(ScanBase): + """MultiScan dataset class""" + def __init__(self, data_config: DictConfig, split: str) -> None: + super().__init__(data_config, split) + + filepath = osp.join(self.files_dir, '{}_scans.txt'.format(self.split)) + self.scan_ids = np.genfromtxt(filepath, dtype = str) + + def get_temporal_scan_pairs(self) -> List[List[Any]]: + """Gets pairs of temporal scans from the dataset.""" + scene_pairs = [] + + ref_scan_ids = [scan_id for scan_id in self.scan_ids if scan_id.endswith('00')] + + for ref_scan_id in ref_scan_ids: + rescan_list = [] + + for rescan_id in self.scan_ids: + rescan = {} + if rescan_id.startswith(ref_scan_id.split('_')[0]) and rescan_id != ref_scan_id: + rescan['scan_id'] = rescan_id + rescan_list.append(rescan) + if len(rescan_list) == 0: + continue + + scene_pairs.append([ref_scan_id, rescan_list]) + return scene_pairs \ No newline at end of file diff --git a/data/datasets/scanbase.py b/data/datasets/scanbase.py index 7f8d3fe..e891266 100644 --- a/data/datasets/scanbase.py +++ b/data/datasets/scanbase.py @@ -131,14 +131,18 @@ def __getitem__(self, index: int) -> Dict[str, Any]: scan_process_dir = osp.join(self.process_dir, 'scans', scan_id) - scan_objects_data = torch.load(osp.join(scan_process_dir, 'objectsDataMultimodal.pt')) + # scan_objects_data = torch.load(osp.join(scan_process_dir, 'objectsDataMultimodal.pt')) + scan_objects_data = np.load(osp.join(scan_process_dir, 'objectsDataMultimodal.npz'), allow_pickle=True) - scandata_1d = torch.load(osp.join(scan_process_dir, 'data1D.pt')) - scandata_2d = torch.load(osp.join(scan_process_dir, 'data2D.pt')) - scandata_3d = torch.load(osp.join(scan_process_dir, 'data3D.pt')) + # scandata_1d = torch.load(osp.join(scan_process_dir, 'data1D.pt')) + scandata_1d = np.load(osp.join(scan_process_dir, 'data1D.npz'), allow_pickle=True) + # scandata_2d = torch.load(osp.join(scan_process_dir, 'data2D.pt')) + scandata_2d = np.load(osp.join(scan_process_dir, 'data2D.npz'), allow_pickle=True) + # scandata_3d = torch.load(osp.join(scan_process_dir, 'data3D.pt')) + scandata_3d = np.load(osp.join(scan_process_dir, 'data3D.npz'), allow_pickle=True) # Point Cloud Data -- Scene - points, feats, scene_label = scandata_3d['scene']['pcl_coords'], scandata_3d['scene']['pcl_feats'], scandata_3d['scene']['scene_label'] + points, feats, scene_label = scandata_3d['scene'].item()['pcl_coords'], scandata_3d['scene'].item()['pcl_feats'], scandata_3d['scene'].item()['scene_label'] feats /= 255. 
feats -= 0.5 @@ -152,9 +156,9 @@ def __getitem__(self, index: int) -> Dict[str, Any]: _, sel = ME.utils.sparse_quantize(points / self.voxel_size, return_index=True) coords, feats = points[sel], feats[sel] - # Get coords, shift to center + # Get coords, already zero centered during preprocessing coords = np.floor(coords / self.voxel_size) - coords-=coords.min(0) + # coords-=coords.min(0) # Object Data scene_dict = {} @@ -185,9 +189,8 @@ def __getitem__(self, index: int) -> Dict[str, Any]: scene_dict['scene_masks'] = {} - rgb_embedding = torch.from_numpy(scandata_2d['scene']['scene_embeddings']) + rgb_embedding = torch.from_numpy(scandata_2d['scene'].item()['scene_embeddings']) rgb_embedding = torch.concatenate([rgb_embedding[:, 0, :], rgb_embedding[:, 1:, :].mean(dim=1)], dim=1) - rgb_embedding = rgb_embedding[list(range(0, rgb_embedding.shape[0], 2)), :] scene_dict['rgb_embedding'] = rgb_embedding scene_dict['scene_masks']['rgb'] = torch.Tensor([1.0]) @@ -195,7 +198,7 @@ def __getitem__(self, index: int) -> Dict[str, Any]: scene_dict['scene_masks']['object'] = torch.Tensor([1.0]) referral_mask = torch.Tensor([0.0]) - referral_embedding = scandata_1d['scene']['referral_embedding'] + referral_embedding = scandata_1d['scene'].item()['referral_embedding'] if referral_embedding is not None: referral_embedding = torch.from_numpy(referral_embedding[0]['feat']).reshape(-1,) @@ -203,7 +206,7 @@ def __getitem__(self, index: int) -> Dict[str, Any]: else: referral_embedding = torch.zeros((scene_dict['rgb_embedding'].shape[-1] // 4, )) - floorplan_embedding = scandata_2d['scene']['floorplan']['embedding'] + floorplan_embedding = scandata_2d['scene'].item()['floorplan']['embedding'] floorplan_mask = torch.Tensor([0.0]) if floorplan_embedding is not None: floorplan_embedding = torch.from_numpy(floorplan_embedding[0, 0]).reshape(-1, ) @@ -258,4 +261,4 @@ def collate_fn(self, batch: List[Dict[str, Any]]) -> Dict[str, Any]: } collated_batch['coordinates'], collated_batch['features'] = coordinates, features - return collated_batch + return collated_batch \ No newline at end of file diff --git a/data/datasets/structured3d.py b/data/datasets/structured3d.py new file mode 100644 index 0000000..2b73b41 --- /dev/null +++ b/data/datasets/structured3d.py @@ -0,0 +1,23 @@ +import os.path as osp +import numpy as np +from typing import List, Any +from omegaconf import DictConfig + +from ..build import DATASET_REGISTRY +from .scanbase import ScanObjectBase, ScanBase + +@DATASET_REGISTRY.register() +class Structured3DObject(ScanObjectBase): + """Structured3D dataset class for instance level baseline""" + def __init__(self, data_config: DictConfig, split: str) -> None: + super().__init__(data_config, split) + +@DATASET_REGISTRY.register() +class Structured3D(ScanBase): + """Structured3D dataset class""" + def __init__(self, data_config: DictConfig, split: str) -> None: + super().__init__(data_config, split) + + filepath = osp.join(self.files_dir, '{}_scans.txt'.format(self.split)) + self.scan_ids = np.genfromtxt(filepath, dtype = str) + \ No newline at end of file diff --git a/prepare_data/README.md b/prepare_data/README.md index dba34f5..c369156 100644 --- a/prepare_data/README.md +++ b/prepare_data/README.md @@ -5,6 +5,8 @@ This document provides instructions for pre-processing different datasets, including - ScanNet - 3RScan +- ARKitScenes +- MultiScan ## Prerequisites @@ -16,20 +18,17 @@ Before you begin, simply activate the `crossover` conda environment. 
#### Original Data - **ScanNet**: Download ScanNet v2 data from the [official website](https://github.com/ScanNet/ScanNet), we use the official training and validation split from [here](https://github.com/ScanNet/ScanNet/tree/master/Tasks/Benchmark). -- **3RScan**: Download 3RScan dataset from the [official website](https://github.com/WaldJohannaU/3RScan), we use the official (full list of scan ids including reference + rescans) training split from [here](https://campar.in.tum.de/public_datasets/3RScan/train_scans.txt) and validation split from [here](https://campar.in.tum.de/public_datasets/3RScan/val_scans.txt). - - Download `3RScan.json` from [here](https://campar.in.tum.de/public_datasets/3RScan/3RScan.json) and `objects.json` from [here](https://campar.in.tum.de/public_datasets/3DSSG/3DSSG/objects.json). - - Download the class mapping file `3RScan.v2 Semantic Classes - Mapping.csv` from [here](https://docs.google.com/spreadsheets/d/1eRTJ2M9OHz7ypXfYD-KTR1AIT-CrVLmhJf8mxgVZWnI/edit?gid=0#gid=0). +- **3RScan**: Download 3RScan dataset from the [official website](https://github.com/WaldJohannaU/3RScan). -- **ShapeNet**: Download ShapenetCore dataset from the [official Huggingface release](https://huggingface.co/datasets/ShapeNet/ShapeNetCore) and unzip. +- **MultiScan**: Download MultiScan dataset from the [official website](https://github.com/smartscenes/multiscan). + +- **ARKitScenes**: Download ARKitScenes dataset from the [official website](https://github.com/apple/ARKitScenes). -#### Referral and CAD annotations -We use [SceneVerse](https://scene-verse.github.io/) for instance referrals (ScanNet & 3RScan) and [Scan2CAD](https://github.com/skanti/Scan2CAD) for CAD annotations (ScanNet). +- **ShapeNet**: Download ShapenetCore dataset from the [official Huggingface release](https://huggingface.co/datasets/ShapeNet/ShapeNetCore) and unzip. -- **SceneVerse** - Download the Scannet and 3RScan data under `annotations/refer` from the [official website](https://scene-verse.github.io/). -- **Scan2CAD** - Download `full_annotations.json` from the [official website](https://github.com/skanti/Scan2CAD?tab=readme-ov-file#download-dataset). +### Download Referral and CAD annotations +We use [SceneVerse](https://scene-verse.github.io/) for instance referrals (ScanNet, 3RScan, MultiScan, & ARKitScenes) and [Scan2CAD](https://github.com/skanti/Scan2CAD) for CAD annotations (ScanNet). Exact instructions for data setup below. -### Prepare The Data -Exact instructions for data setup + preparation below: #### ScanNet 1. Run the following to extract ScanNet data @@ -107,3 +106,81 @@ Scan3R/ └── sceneverse └── ssg_ref_rel2_template.json ``` + +#### ARKitScenes +1. Download ARKitScenes 3dod data using the following command: + +```bash +python ARKitScenes/download_data.py 3dod --video_id_csv PATH_TO_3dod_train_val_splits.csv --download_dir PATH_TO_ARKITSCENES +``` +The files mentioned in the above command - ```download_data.py``` and ```3dod_train_val_splits.csv``` can be found in the official repository [here](https://github.com/apple/ARKitScenes), along with more detailed instructions and descriptions of the data. + +2. Once the data is downloaded, run the following to organize it as per our requirements. + + ```bash +cd ARKitScenes +mv 3dod/Training/* scans +mv 3dod/Validation/* scans +``` + +3. Move the relevant files from `Sceneverse` and `ARKitScenes` under `files/`. 
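+
+As a quick check that the split files are in place, you can reproduce the temporal grouping performed by `data/datasets/arkit.py`: scans that share a `visit_id` in `3dod_train_val_splits.csv` are treated as captures of the same scene, with the first `video_id` acting as the reference scan. A minimal sketch (the `files_dir` path is a placeholder):
+
+```python
+import os.path as osp
+import pandas as pd
+
+files_dir = "/path/to/ARKitScenes/files"
+df = pd.read_csv(osp.join(files_dir, "3dod_train_val_splits.csv"))
+df = df[df["visit_id"].notna()]
+
+# video_ids sharing a visit_id are captures of the same scene
+grouped = df.groupby("visit_id")["video_id"].apply(list).to_dict()
+pairs = [(vids[0], vids[1:]) for vids in grouped.values() if len(vids) > 1]
+print(f"{len(pairs)} scenes with at least one rescan")
+```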
+ +Once completed, the data structure would look like the following: +``` +ARKitScenes/ +├── scans/ +│ ├── 40753679/ +│ │ ├── 40753679_frames/ +│ │ │ ├── lowres_depth/ (folder containing depth images) +│ │ │ ├── lowres_wide/ (folder containing rgb images) +│ │ │ ├── lowres_wide_intrinsics/ (folder containing frame wise camera intrinsics) +│ │ │ ├── lowres_wide.traj (camera trajectory) +│ │ ├── 40753679_3dod_annotation.json +│ │ ├── 40753679_3dod_mesh.ply +| └── +└── files + ├── scannetv2-labels.combined.tsv + ├── train_scans.txt + ├── val_scans.txt + ├── metadata.csv + ├── 3dod_train_val_splits.csv + └── sceneverse + └── ssg_ref_rel2_template.json +``` + +#### MultiScan +1. Download MultiScan data into MultiScan/scenes and run the following to extract MultiScan data + + ```bash +cd MultiScan/scenes +unzip '*.zip' +rm -rf '*.zip' +``` +3. To generate sequence of RGB images and corresponding camera poses from the ```.mp4``` file, run the follwing +```bash +cd prepare_data/multiscan +python preprocess_2d_multiscan.py --base_dir PATH_TO_MULTISCAN --frame_interval {frame_interval} +``` +Once completed, the data structure would look like the following: +``` +MultiScan/ +├── scenes/ +│ ├── scene_00000_00/ +│ │ ├── sequence/ (folder containing rgb images at specified frame interval) +| | ├── frame_ids.txt +│ │ ├── scene_00000_00.annotations.json +│ │ ├── scene_00000_00.jsonl +│ │ ├── scene_00000_00.confidence.zlib +│ │ ├── scene_00000_00.mp4 +│ │ ├── poses.jsonl +│ │ ├── scene_00000_00.ply +│ │ ├── scene_00000_00.align.json +│ │ ├── scene_00000_00.json +| └── +└── files + ├── scannetv2-labels.combined.tsv + ├── train_scans.txt + ├── test_scans.txt + └── sceneverse + └── ssg_ref_rel2_template.json +``` \ No newline at end of file diff --git a/prepare_data/multiscan/preprocess_2d_multiscan.py b/prepare_data/multiscan/preprocess_2d_multiscan.py new file mode 100644 index 0000000..da89da1 --- /dev/null +++ b/prepare_data/multiscan/preprocess_2d_multiscan.py @@ -0,0 +1,94 @@ +import os +import cv2 +import json +import jsonlines +import argparse +import os.path as osp +import shutil + +def process_scene_folders(base_dir, frame_interval=10): + base_dir=osp.join(base_dir, 'scenes') + scene_folders = [d for d in os.listdir(base_dir) if os.path.isdir(os.path.join(base_dir, d))] + + for scene_folder in scene_folders: + scene_path = os.path.join(base_dir, scene_folder) + video_path = os.path.join(scene_path, f"{scene_folder}.mp4") + jsonl_path = os.path.join(scene_path, f"{scene_folder}.jsonl") + frame_output_dir = os.path.join(scene_path, "sequence") + frame_ids_txt_path = os.path.join(scene_path, "frame_ids.txt") + metadata_output_path = os.path.join(scene_path, "poses.jsonl") + + if os.path.exists(frame_output_dir): + shutil.rmtree(frame_output_dir) + os.makedirs(frame_output_dir) + + if not os.path.exists(video_path): + print(f"Video file not found: {video_path}") + continue + if not os.path.exists(jsonl_path): + print(f"Metadata file not found: {jsonl_path}") + continue + + print(f"Processing scene: {scene_folder}") + + frame_ids = extract_frames_from_video(video_path, frame_output_dir, frame_interval) + + with open(frame_ids_txt_path, "w") as f: + for frame_id in frame_ids: + f.write(f"{frame_id}\n") + + selected_metadata = extract_metadata_by_line_number(jsonl_path, frame_ids) + + with jsonlines.open(metadata_output_path, mode="w") as writer: + for entry in selected_metadata: + writer.write(entry) + + print(f"Finished processing scene: {scene_folder}") + + +def extract_frames_from_video(video_path, 
output_dir, frame_interval): + + cap = cv2.VideoCapture(video_path) + if not cap.isOpened(): + raise ValueError(f"Could not open video file: {video_path}") + + frame_ids = [] + frame_count = 0 + + while True: + ret, frame = cap.read() + if not ret: + break # End of video + + if frame_count % frame_interval == 0: + frame_id = frame_count + frame_ids.append(frame_id) + output_path = os.path.join(output_dir, f"frame-{frame_id}.color.jpg") + cv2.imwrite(output_path, frame) # Save frame as an image + + frame_count += 1 + + cap.release() + return frame_ids + + +def extract_metadata_by_line_number(jsonl_path, line_numbers): + + selected_metadata = [] + + with jsonlines.open(jsonl_path) as reader: + for line_idx, entry in enumerate(reader): + if line_idx in line_numbers: + entry["frame_id"] = line_idx + selected_metadata.append(entry) + + return selected_metadata + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Process scene folders.") + parser.add_argument("--base_dir", type=str, required=True, help="Base dataset directory.") + parser.add_argument("--frame_interval", type=int, default=10, help="Interval for saving frames.") + args = parser.parse_args() + + process_scene_folders(args.base_dir, args.frame_interval) \ No newline at end of file diff --git a/prepare_data/structured3d/generate_ply.py b/prepare_data/structured3d/generate_ply.py new file mode 100644 index 0000000..19b3cd9 --- /dev/null +++ b/prepare_data/structured3d/generate_ply.py @@ -0,0 +1,366 @@ +import os +import cv2 +import numpy as np +import open3d as o3d +from plyfile import PlyData, PlyElement +import json +import argparse +import misc.utils +BASE_PATH = "/Users/gauravpradeep/CrossOver_ScaleUp/Structured3D/scans/" + + +def create_color_palette(): + """Returns the NYU40 colormap mapping RGB to class indices.""" + return [ + (0, 0, 0), # Unlabeled (0) + (174, 199, 232), # wall (1) + (152, 223, 138), # floor (2) + (31, 119, 180), # cabinet (3) + (255, 187, 120), # bed (4) + (188, 189, 34), # chair (5) + (140, 86, 75), # sofa (6) + (255, 152, 150), # table (7) + (214, 39, 40), # door (8) + (197, 176, 213), # window (9) + (148, 103, 189), # bookshelf (10) + (196, 156, 148), # picture (11) + (23, 190, 207), # counter (12) + (178, 76, 76), + (247, 182, 210), # desk (14) + (66, 188, 102), + (219, 219, 141), # curtain (16) + (140, 57, 197), + (202, 185, 52), + (51, 176, 203), + (200, 54, 131), + (92, 193, 61), + (78, 71, 183), + (172, 114, 82), + (255, 127, 14), # refrigerator (25) + (91, 163, 138), + (153, 98, 156), + (140, 153, 101), + (158, 218, 229), # shower curtain (28) + (100, 125, 154), + (178, 127, 135), + (120, 185, 128), + (146, 111, 194), + (44, 160, 44), # toilet (33) + (112, 128, 144), # sink (34) + (96, 207, 209), + (227, 119, 194), # bathtub (36) + (213, 92, 176), + (94, 106, 211), + (82, 84, 163), # otherfurn (39) + (100, 85, 144) + ] + +def normalize(vector): + return vector / np.linalg.norm(vector) + +def parse_camera_info(camera_info, height, width): + """ extract intrinsic and extrinsic matrix + """ + lookat = normalize(camera_info[3:6]) + up = normalize(camera_info[6:9]) + + W = lookat + U = np.cross(W, up) + V = np.cross(W, U) + + rot = np.vstack((U, V, W)) + + trans = camera_info[:3] + + xfov = camera_info[9] + yfov = camera_info[10] + + K = np.diag([1, 1, 1]) + + K[0, 2] = width / 2 + K[1, 2] = height / 2 + + K[0, 0] = K[0, 2] / np.tan(xfov) + K[1, 1] = K[1, 2] / np.tan(yfov) + + return rot, trans, K + +def point_inside_bbox(point, bbox_corners): + """Check if a point is inside a 
3D bounding box defined by its 8 corners.""" + min_coords = np.min(bbox_corners, axis=0) + max_coords = np.max(bbox_corners, axis=0) + + return np.all(min_coords <= point) and np.all(point <= max_coords) + +def load_bounding_boxes(bbox_json_path): + """Load 3D bounding boxes from a JSON file.""" + with open(bbox_json_path, 'r') as f: + bboxes = json.load(f) + return bboxes + +def rgb_to_nyu40id(rgb_image): + """Convert RGB values from `semantic.png` to corresponding NYU40 IDs.""" + palette = create_color_palette() + color_to_id = {color: idx for idx, color in enumerate(palette)} + + h, w, _ = rgb_image.shape + rgb_flatten = rgb_image.reshape(-1, 3) + + # Convert each RGB value to corresponding NYU40 ID + nyu40_ids = np.array([color_to_id.get(tuple(rgb), 0) for rgb in rgb_flatten], dtype=np.int32) + + return nyu40_ids.reshape(h, w) + + +def save_ply_with_labels(filename, pointcloud, object_ids, nyu40_ids): + """Save PLY file with object_id and nyu40id.""" + points = np.asarray(pointcloud.points) + colors = (np.asarray(pointcloud.colors) * 255).astype(np.uint8) if pointcloud.has_colors() else np.zeros_like(points, dtype=np.uint8) + + vertex_data = np.array( + list(zip( + points[:, 0], points[:, 1], points[:, 2], # x, y, z + colors[:, 0], colors[:, 1], colors[:, 2], # red, green, blue + np.full(len(points), 255, dtype=np.uint8), # alpha + object_ids, # Object ID + nyu40_ids # NYU40 Semantic ID + )), + dtype=[ + ('x', 'f4'), ('y', 'f4'), ('z', 'f4'), + ('red', 'u1'), ('green', 'u1'), ('blue', 'u1'), ('alpha', 'u1'), + ('object_id', 'i4'), + ('nyu40id', 'i4') + ] + ) + + el = PlyElement.describe(vertex_data, 'vertex') + PlyData([el], text=False).write(filename) + +def process_room(scene_id, room_id, room_path): + """Processes a single room by merging all views and generating a 3D mesh.""" + pcd_list = [] + object_ids_list = [] + nyu40_ids_list = [] + + # Iterate over all views in the room + for view_id in sorted(os.listdir(room_path)): + view_path = os.path.join(room_path, view_id) + + rgb_image_path = os.path.join(view_path, "rgb_rawlight.png") + depth_image_path = os.path.join(view_path, "depth.png") + camera_path = os.path.join(view_path, "camera_pose.txt") + # instance_image_path = os.path.join(view_path, "instance.png") + semantic_image_path = os.path.join(view_path, "semantic.png") + + if not all(os.path.exists(p) for p in [rgb_image_path, depth_image_path, camera_path, semantic_image_path]): + print(f"Skipping Scene {scene_id}, Room {room_id}, View {view_id}: Missing files") + continue + + print(f"Processing Scene {scene_id}, Room {room_id}, View {view_id}...") + + color = cv2.imread(rgb_image_path) + # cv2.imshow("color", color) + # cv2.waitKey(0) + # color = cv2.cvtColor(color, cv2.COLOR_BGR2RGB) + depth = cv2.imread(depth_image_path, cv2.IMREAD_UNCHANGED).astype(np.float32) / 1000.0 # Convert mm to meters + # instance = cv2.imread(instance_image_path, cv2.IMREAD_UNCHANGED) # Object ID image + semantic = cv2.imread(semantic_image_path) # Read as BGR + semantic = cv2.cvtColor(semantic, cv2.COLOR_BGR2RGB) # Convert to RGB + + nyu40_id_image = rgb_to_nyu40id(semantic) + + valid_mask = depth.flatten() > 0 + # object_ids = instance.flatten()[valid_mask] + nyu40_ids = nyu40_id_image.flatten()[valid_mask] + + height, width = color.shape[:2] + camera_info = np.loadtxt(camera_path) + rot, trans, K = parse_camera_info(camera_info, height, width) + trans = np.array(trans) / 1000 + + + color_o3d = o3d.geometry.Image(color) + depth_o3d = o3d.geometry.Image(depth) + rgbd_image = 
o3d.geometry.RGBDImage.create_from_color_and_depth( + color_o3d, depth_o3d, depth_scale=1.0, depth_trunc=10.0, convert_rgb_to_intensity=False + ) + extrinsic = np.eye(4) + extrinsic[:3, :3] = rot.T + extrinsic[:3, -1] = trans + extrinsic = np.linalg.inv(extrinsic) + + intrinsic = o3d.camera.PinholeCameraIntrinsic(width, height, K[0][0], K[1][1], K[0][2], K[1][2]) + pointcloud = o3d.geometry.PointCloud.create_from_rgbd_image(rgbd_image, intrinsic, extrinsic) + + pcd_list.append(pointcloud) + # object_ids_list.append(object_ids) + nyu40_ids_list.append(nyu40_ids) + + if not pcd_list: + print(f"Skipping Scene {scene_id}, Room {room_id}: No valid views.") + return + + pcd_combined = pcd_list[0] + for pcd in pcd_list[1:]: + pcd_combined += pcd + + object_ids_combined = np.array([-1]*len(np.asarray(pcd_combined.points)), dtype=int) # Initialize object IDs + + # Efficient assignment of object IDs based on bounding box inclusion + points = np.asarray(pcd_combined.points) + colors = np.asarray(pcd_combined.colors) + + + bboxes_json_path = os.path.join(BASE_PATH, scene_id, "bbox_3d.json") + bboxes = load_bounding_boxes(bboxes_json_path) + for idx, bbox in enumerate(bboxes): + basis = np.array(bbox['basis']) + coeffs = np.array(bbox['coeffs']) + centroid = np.array(bbox['centroid']) + bbox_corners = misc.utils.get_corners_of_bb3d_no_index(basis, coeffs, centroid) # 8 corners of the bounding box + bbox_corners = bbox_corners / 1000 + # Create mask for points inside this bounding box + box_min = np.min(bbox_corners, axis=0, keepdims=True) + box_max = np.max(bbox_corners, axis=0, keepdims=True) + # print(min_corner, max_corner) + # print(points) + # mask = np.all((points >= box_min) & (points <= max_corner), axis=1) + point_max_mask = np.all(points < box_max, axis=1) + point_min_mask = np.all(points > box_min, axis=1) + point_mask = np.logical_and(point_max_mask, point_min_mask) + points_in_bbox = points[point_mask] + # print(points_in_bbox.shape) + # if points_in_bbox.shape[0] != 0: + # print(bbox['ID']) + # colors_in_bbox = colors[mask] + # object_pcd = o3d.geometry.PointCloud() + # object_pcd.points = o3d.utility.Vector3dVector(points_in_bbox) + # object_pcd.colors = o3d.utility.Vector3dVector(colors_in_bbox) + # o3d.visualization.draw_geometries([object_pcd]) + # print(np.all(points>=min_corner, axis=1)) + # Assign object ID to points inside this bounding box + object_ids_combined[point_mask] = bbox['ID'] + # o3d.visualization.draw_geometries([pcd_combined]) + + + nyu40_ids_combined = np.concatenate(nyu40_ids_list) + # print(np.unique(object_ids_combined)) + # Save the mesh file + output_dir = os.path.join(BASE_PATH, scene_id, "3D_rendering", room_id) + os.makedirs(output_dir, exist_ok=True) + ply_filename = os.path.join(output_dir, "room_mesh.ply") + + save_ply_with_labels(ply_filename, pcd_combined, object_ids_combined, nyu40_ids_combined) + print(f"Saved mesh for Scene {scene_id}, Room {room_id} -> {ply_filename}") + + +# if __name__ == '__main__': +# for scene_id in sorted(os.listdir(BASE_PATH)): +# scene_path = os.path.join(BASE_PATH, scene_id, "2D_rendering") +# if not os.path.isdir(scene_path): +# continue + +# for room_id in sorted(os.listdir(scene_path)): +# room_path = os.path.join(scene_path, room_id, "perspective", "full") +# if os.path.isdir(room_path): + # process_room(scene_id, room_id, room_path) +def parse_args(): + parser = argparse.ArgumentParser(description='Generate PLY files from Structured3D dataset') + parser.add_argument('--base_path', type=str, 
default="/Users/gauravpradeep/CrossOver_ScaleUp/Structured3D/scans/", + help='Base path to the Structured3D dataset') + return parser.parse_args() + +if __name__ == '__main__': + args = parse_args() + BASE_PATH = args.base_path + + for scene_id in sorted(os.listdir(BASE_PATH)): + scene_path = os.path.join(BASE_PATH, scene_id, "2D_rendering") + if not os.path.isdir(scene_path): + continue + + for room_id in sorted(os.listdir(scene_path)): + room_path = os.path.join(scene_path, room_id, "perspective", "full") + if os.path.isdir(room_path): + process_room(scene_id, room_id, room_path) +# --------------------------------------- +# instance image based object id assignment +# --------------------------------------- + +# def process_room(scene_id, room_id, room_path): +# """Processes a single room by merging all views and generating a 3D mesh.""" +# pcd_list = [] +# object_ids_list = [] +# nyu40_ids_list = [] + +# # Iterate over all views in the room +# for view_id in sorted(os.listdir(room_path)): +# view_path = os.path.join(room_path, view_id) + +# rgb_image_path = os.path.join(view_path, "rgb_rawlight.png") +# depth_image_path = os.path.join(view_path, "depth.png") +# camera_path = os.path.join(view_path, "camera_pose.txt") +# instance_image_path = os.path.join(view_path, "instance.png") +# semantic_image_path = os.path.join(view_path, "semantic.png") + +# if not all(os.path.exists(p) for p in [rgb_image_path, depth_image_path, camera_path, instance_image_path, semantic_image_path]): +# print(f"Skipping Scene {scene_id}, Room {room_id}, View {view_id}: Missing files") +# continue + +# print(f"Processing Scene {scene_id}, Room {room_id}, View {view_id}...") + +# color = cv2.imread(rgb_image_path) +# depth = cv2.imread(depth_image_path, cv2.IMREAD_UNCHANGED).astype(np.float32) / 1000.0 # Convert mm to meters +# instance = cv2.imread(instance_image_path, cv2.IMREAD_UNCHANGED) # Object ID image +# semantic = cv2.imread(semantic_image_path) # Read as BGR +# semantic = cv2.cvtColor(semantic, cv2.COLOR_BGR2RGB) # Convert to RGB + +# nyu40_id_image = rgb_to_nyu40id(semantic) + +# valid_mask = depth.flatten() > 0 +# object_ids = instance.flatten()[valid_mask] +# nyu40_ids = nyu40_id_image.flatten()[valid_mask] + +# height, width = color.shape[:2] +# camera_info = np.loadtxt(camera_path) +# rot, trans, K = parse_camera_info(camera_info, height, width) +# trans = np.array(trans) / 1000 + + +# color_o3d = o3d.geometry.Image(color) +# depth_o3d = o3d.geometry.Image(depth) +# rgbd_image = o3d.geometry.RGBDImage.create_from_color_and_depth( +# color_o3d, depth_o3d, depth_scale=1.0, depth_trunc=10.0, convert_rgb_to_intensity=False +# ) +# extrinsic = np.eye(4) +# extrinsic[:3, :3] = rot.T +# extrinsic[:3, -1] = trans +# extrinsic = np.linalg.inv(extrinsic) + +# intrinsic = o3d.camera.PinholeCameraIntrinsic(width, height, K[0][0], K[1][1], K[0][2], K[1][2]) +# pointcloud = o3d.geometry.PointCloud.create_from_rgbd_image(rgbd_image, intrinsic, extrinsic) + +# pcd_list.append(pointcloud) +# object_ids_list.append(object_ids) +# nyu40_ids_list.append(nyu40_ids) + +# if not pcd_list: +# print(f"Skipping Scene {scene_id}, Room {room_id}: No valid views.") +# return + +# pcd_combined = pcd_list[0] +# for pcd in pcd_list[1:]: +# pcd_combined += pcd +# # o3d.visualization.draw_geometries([pcd_combined]) + +# object_ids_combined = np.concatenate(object_ids_list) +# nyu40_ids_combined = np.concatenate(nyu40_ids_list) + +# # Save the mesh file +# output_dir = os.path.join(BASE_PATH, scene_id, "3D_rendering", room_id) +# 
os.makedirs(output_dir, exist_ok=True) +# ply_filename = os.path.join(output_dir, "room_mesh.ply") + +# save_ply_with_labels(ply_filename, pcd_combined, object_ids_combined, nyu40_ids_combined) +# print(f"Saved mesh for Scene {scene_id}, Room {room_id} -> {ply_filename}") + diff --git a/prepare_data/structured3d/misc/colors.py b/prepare_data/structured3d/misc/colors.py new file mode 100644 index 0000000..191f845 --- /dev/null +++ b/prepare_data/structured3d/misc/colors.py @@ -0,0 +1,47 @@ +semantics_cmap = { + 'living room': '#e6194b', + 'kitchen': '#3cb44b', + 'bedroom': '#ffe119', + 'bathroom': '#0082c8', + 'balcony': '#f58230', + 'corridor': '#911eb4', + 'dining room': '#46f0f0', + 'study': '#f032e6', + 'studio': '#d2f53c', + 'store room': '#fabebe', + 'garden': '#008080', + 'laundry room': '#e6beff', + 'office': '#aa6e28', + 'basement': '#fffac8', + 'garage': '#800000', + 'undefined': '#aaffc3', + 'door': '#808000', + 'window': '#ffd7b4', + 'outwall': '#000000', +} + + +colormap_255 = [ + [230, 25, 75], + [ 60, 180, 75], + [255, 225, 25], + [ 0, 130, 200], + [245, 130, 48], + [145, 30, 180], + [ 70, 240, 240], + [240, 50, 230], + [210, 245, 60], + [250, 190, 190], + [ 0, 128, 128], + [230, 190, 255], + [170, 110, 40], + [255, 250, 200], + [128, 0, 0], + [170, 255, 195], + [128, 128, 0], + [255, 215, 180], + [ 0, 0, 128], + [128, 128, 128], + [255, 255, 255], + [ 0, 0, 0] +] \ No newline at end of file diff --git a/prepare_data/structured3d/misc/figures.py b/prepare_data/structured3d/misc/figures.py new file mode 100644 index 0000000..013acbf --- /dev/null +++ b/prepare_data/structured3d/misc/figures.py @@ -0,0 +1,78 @@ +""" +Copy from https://github.com/Toblerity/Shapely/blob/master/docs/code/figures.py +""" + +from math import sqrt +from shapely import affinity + +GM = (sqrt(5)-1.0)/2.0 +W = 8.0 +H = W*GM +SIZE = (W, H) + +BLUE = '#6699cc' +GRAY = '#999999' +DARKGRAY = '#333333' +YELLOW = '#ffcc33' +GREEN = '#339933' +RED = '#ff3333' +BLACK = '#000000' + +COLOR_ISVALID = { + True: BLUE, + False: RED, +} + + +def plot_line(ax, ob, color=GRAY, zorder=1, linewidth=3, alpha=1): + x, y = ob.xy + ax.plot(x, y, color=color, linewidth=linewidth, solid_capstyle='round', zorder=zorder, alpha=alpha) + + +def plot_coords(ax, ob, color=BLACK, zorder=1, alpha=1): + x, y = ob.xy + ax.plot(x, y, color=color, zorder=zorder, alpha=alpha) + + +def color_isvalid(ob, valid=BLUE, invalid=RED): + if ob.is_valid: + return valid + else: + return invalid + + +def color_issimple(ob, simple=BLUE, complex=YELLOW): + if ob.is_simple: + return simple + else: + return complex + + +def plot_line_isvalid(ax, ob, **kwargs): + kwargs["color"] = color_isvalid(ob) + plot_line(ax, ob, **kwargs) + + +def plot_line_issimple(ax, ob, **kwargs): + kwargs["color"] = color_issimple(ob) + plot_line(ax, ob, **kwargs) + + +def plot_bounds(ax, ob, zorder=1, alpha=1): + x, y = zip(*list((p.x, p.y) for p in ob.boundary)) + ax.plot(x, y, 'o', color=BLACK, zorder=zorder, alpha=alpha) + + +def add_origin(ax, geom, origin): + x, y = xy = affinity.interpret_origin(geom, origin, 2) + ax.plot(x, y, 'o', color=GRAY, zorder=1) + ax.annotate(str(xy), xy=xy, ha='center', + textcoords='offset points', xytext=(0, 8)) + + +def set_limits(ax, x0, xN, y0, yN): + ax.set_xlim(x0, xN) + ax.set_xticks(range(x0, xN+1)) + ax.set_ylim(y0, yN) + ax.set_yticks(range(y0, yN+1)) + ax.set_aspect("equal") \ No newline at end of file diff --git a/prepare_data/structured3d/misc/panorama.py b/prepare_data/structured3d/misc/panorama.py new file mode 100644 index 
0000000..ba2feef --- /dev/null +++ b/prepare_data/structured3d/misc/panorama.py @@ -0,0 +1,243 @@ +""" +Copy from https://github.com/sunset1995/pytorch-layoutnet/blob/master/pano.py +""" +import numpy as np +import numpy.matlib as matlib + + +def xyz_2_coorxy(xs, ys, zs, H=512, W=1024): + us = np.arctan2(xs, ys) + vs = -np.arctan(zs / np.sqrt(xs**2 + ys**2)) + coorx = (us / (2 * np.pi) + 0.5) * W + coory = (vs / np.pi + 0.5) * H + return coorx, coory + + +def coords2uv(coords, width, height): + """ + Image coordinates (xy) to uv + """ + middleX = width / 2 + 0.5 + middleY = height / 2 + 0.5 + uv = np.hstack([ + (coords[:, [0]] - middleX) / width * 2 * np.pi, + -(coords[:, [1]] - middleY) / height * np.pi]) + return uv + + +def uv2xyzN(uv, planeID=1): + ID1 = (int(planeID) - 1 + 0) % 3 + ID2 = (int(planeID) - 1 + 1) % 3 + ID3 = (int(planeID) - 1 + 2) % 3 + xyz = np.zeros((uv.shape[0], 3)) + xyz[:, ID1] = np.cos(uv[:, 1]) * np.sin(uv[:, 0]) + xyz[:, ID2] = np.cos(uv[:, 1]) * np.cos(uv[:, 0]) + xyz[:, ID3] = np.sin(uv[:, 1]) + return xyz + + +def uv2xyzN_vec(uv, planeID): + """ + vectorization version of uv2xyzN + @uv N x 2 + @planeID N + """ + assert (planeID.astype(int) != planeID).sum() == 0 + planeID = planeID.astype(int) + ID1 = (planeID - 1 + 0) % 3 + ID2 = (planeID - 1 + 1) % 3 + ID3 = (planeID - 1 + 2) % 3 + ID = np.arange(len(uv)) + xyz = np.zeros((len(uv), 3)) + xyz[ID, ID1] = np.cos(uv[:, 1]) * np.sin(uv[:, 0]) + xyz[ID, ID2] = np.cos(uv[:, 1]) * np.cos(uv[:, 0]) + xyz[ID, ID3] = np.sin(uv[:, 1]) + return xyz + + +def xyz2uvN(xyz, planeID=1): + ID1 = (int(planeID) - 1 + 0) % 3 + ID2 = (int(planeID) - 1 + 1) % 3 + ID3 = (int(planeID) - 1 + 2) % 3 + normXY = np.sqrt(xyz[:, [ID1]] ** 2 + xyz[:, [ID2]] ** 2) + normXY[normXY < 0.000001] = 0.000001 + normXYZ = np.sqrt(xyz[:, [ID1]] ** 2 + xyz[:, [ID2]] ** 2 + xyz[:, [ID3]] ** 2) + v = np.arcsin(xyz[:, [ID3]] / normXYZ) + u = np.arcsin(xyz[:, [ID1]] / normXY) + valid = (xyz[:, [ID2]] < 0) & (u >= 0) + u[valid] = np.pi - u[valid] + valid = (xyz[:, [ID2]] < 0) & (u <= 0) + u[valid] = -np.pi - u[valid] + uv = np.hstack([u, v]) + uv[np.isnan(uv[:, 0]), 0] = 0 + return uv + + +def computeUVN(n, in_, planeID): + """ + compute v given u and normal. 
+ """ + if planeID == 2: + n = np.array([n[1], n[2], n[0]]) + elif planeID == 3: + n = np.array([n[2], n[0], n[1]]) + bc = n[0] * np.sin(in_) + n[1] * np.cos(in_) + bs = n[2] + out = np.arctan(-bc / (bs + 1e-9)) + return out + + +def computeUVN_vec(n, in_, planeID): + """ + vectorization version of computeUVN + @n N x 3 + @in_ MN x 1 + @planeID N + """ + n = n.copy() + if (planeID == 2).sum(): + n[planeID == 2] = np.roll(n[planeID == 2], 2, axis=1) + if (planeID == 3).sum(): + n[planeID == 3] = np.roll(n[planeID == 3], 1, axis=1) + n = np.repeat(n, in_.shape[0] // n.shape[0], axis=0) + assert n.shape[0] == in_.shape[0] + bc = n[:, [0]] * np.sin(in_) + n[:, [1]] * np.cos(in_) + bs = n[:, [2]] + out = np.arctan(-bc / (bs + 1e-9)) + return out + + +def lineFromTwoPoint(pt1, pt2): + """ + Generate line segment based on two points on panorama + pt1, pt2: two points on panorama + line: + 1~3-th dim: normal of the line + 4-th dim: the projection dimension ID + 5~6-th dim: the u of line segment endpoints in projection plane + """ + numLine = pt1.shape[0] + lines = np.zeros((numLine, 6)) + n = np.cross(pt1, pt2) + n = n / (matlib.repmat(np.sqrt(np.sum(n ** 2, 1, keepdims=True)), 1, 3) + 1e-9) + lines[:, 0:3] = n + + areaXY = np.abs(np.sum(n * matlib.repmat([0, 0, 1], numLine, 1), 1, keepdims=True)) + areaYZ = np.abs(np.sum(n * matlib.repmat([1, 0, 0], numLine, 1), 1, keepdims=True)) + areaZX = np.abs(np.sum(n * matlib.repmat([0, 1, 0], numLine, 1), 1, keepdims=True)) + planeIDs = np.argmax(np.hstack([areaXY, areaYZ, areaZX]), axis=1) + 1 + lines[:, 3] = planeIDs + + for i in range(numLine): + uv = xyz2uvN(np.vstack([pt1[i, :], pt2[i, :]]), lines[i, 3]) + umax = uv[:, 0].max() + np.pi + umin = uv[:, 0].min() + np.pi + if umax - umin > np.pi: + lines[i, 4:6] = np.array([umax, umin]) / 2 / np.pi + else: + lines[i, 4:6] = np.array([umin, umax]) / 2 / np.pi + + return lines + + +def lineIdxFromCors(cor_all, im_w, im_h): + assert len(cor_all) % 2 == 0 + uv = coords2uv(cor_all, im_w, im_h) + xyz = uv2xyzN(uv) + lines = lineFromTwoPoint(xyz[0::2], xyz[1::2]) + num_sample = max(im_h, im_w) + + cs, rs = [], [] + for i in range(lines.shape[0]): + n = lines[i, 0:3] + sid = lines[i, 4] * 2 * np.pi + eid = lines[i, 5] * 2 * np.pi + if eid < sid: + x = np.linspace(sid, eid + 2 * np.pi, num_sample) + x = x % (2 * np.pi) + else: + x = np.linspace(sid, eid, num_sample) + + u = -np.pi + x.reshape(-1, 1) + v = computeUVN(n, u, lines[i, 3]) + xyz = uv2xyzN(np.hstack([u, v]), lines[i, 3]) + uv = xyz2uvN(xyz, 1) + + r = np.minimum(np.floor((uv[:, 0] + np.pi) / (2 * np.pi) * im_w) + 1, + im_w).astype(np.int32) + c = np.minimum(np.floor((np.pi / 2 - uv[:, 1]) / np.pi * im_h) + 1, + im_h).astype(np.int32) + cs.extend(r - 1) + rs.extend(c - 1) + return rs, cs + + +def draw_boundary_from_cor_id(cor_id, img_src): + im_h, im_w = img_src.shape[:2] + cor_all = [cor_id] + for i in range(len(cor_id)): + cor_all.append(cor_id[i, :]) + cor_all.append(cor_id[(i+2) % len(cor_id), :]) + cor_all = np.vstack(cor_all) + + rs, cs = lineIdxFromCors(cor_all, im_w, im_h) + rs = np.array(rs) + cs = np.array(cs) + + panoEdgeC = img_src.astype(np.uint8) + for dx, dy in [[-1, 0], [1, 0], [0, 0], [0, 1], [0, -1]]: + panoEdgeC[np.clip(rs + dx, 0, im_h - 1), np.clip(cs + dy, 0, im_w - 1), 0] = 0 + panoEdgeC[np.clip(rs + dx, 0, im_h - 1), np.clip(cs + dy, 0, im_w - 1), 1] = 0 + panoEdgeC[np.clip(rs + dx, 0, im_h - 1), np.clip(cs + dy, 0, im_w - 1), 2] = 255 + + return panoEdgeC + + +def coorx2u(x, w=1024): + return ((x + 0.5) / w - 0.5) * 2 * np.pi + 
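+# Note: coorx2u / coory2v map equirectangular pixel coordinates to spherical
+# angles (column x -> azimuth u in (-pi, pi), row y -> elevation v in
+# (-pi/2, pi/2)); u2coorx / v2coory are the corresponding inverse mappings.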
+ +def coory2v(y, h=512): + return ((y + 0.5) / h - 0.5) * np.pi + + +def u2coorx(u, w=1024): + return (u / (2 * np.pi) + 0.5) * w - 0.5 + + +def v2coory(v, h=512): + return (v / np.pi + 0.5) * h - 0.5 + + +def uv2xy(u, v, z=-50): + c = z / np.tan(v) + x = c * np.cos(u) + y = c * np.sin(u) + return x, y + + +def pano_connect_points(p1, p2, z=-50, w=1024, h=512): + u1 = coorx2u(p1[0], w) + v1 = coory2v(p1[1], h) + u2 = coorx2u(p2[0], w) + v2 = coory2v(p2[1], h) + + x1, y1 = uv2xy(u1, v1, z) + x2, y2 = uv2xy(u2, v2, z) + + if abs(p1[0] - p2[0]) < w / 2: + pstart = np.ceil(min(p1[0], p2[0])) + pend = np.floor(max(p1[0], p2[0])) + else: + pstart = np.ceil(max(p1[0], p2[0])) + pend = np.floor(min(p1[0], p2[0]) + w) + coorxs = (np.arange(pstart, pend + 1) % w).astype(np.float64) + vx = x2 - x1 + vy = y2 - y1 + us = coorx2u(coorxs, w) + ps = (np.tan(us) * x1 - y1) / (vy - np.tan(us) * vx) + cs = np.sqrt((x1 + ps * vx) ** 2 + (y1 + ps * vy) ** 2) + vs = np.arctan2(z, cs) + coorys = v2coory(vs) + + return np.stack([coorxs, coorys], axis=-1) \ No newline at end of file diff --git a/prepare_data/structured3d/misc/utils.py b/prepare_data/structured3d/misc/utils.py new file mode 100644 index 0000000..93c63f9 --- /dev/null +++ b/prepare_data/structured3d/misc/utils.py @@ -0,0 +1,138 @@ +""" +Adapted from https://github.com/thusiyuan/cooperative_scene_parsing/blob/master/utils/sunrgbd_utils.py +""" +import numpy as np + + +def normalize(vector): + return vector / np.linalg.norm(vector) + + +def parse_camera_info(camera_info, height, width): + """ extract intrinsic and extrinsic matrix + """ + lookat = normalize(camera_info[3:6]) + up = normalize(camera_info[6:9]) + + W = lookat + U = np.cross(W, up) + V = -np.cross(W, U) + + rot = np.vstack((U, V, W)) + trans = camera_info[:3] + + xfov = camera_info[9] + yfov = camera_info[10] + + K = np.diag([1, 1, 1]) + + K[0, 2] = width / 2 + K[1, 2] = height / 2 + + K[0, 0] = K[0, 2] / np.tan(xfov) + K[1, 1] = K[1, 2] / np.tan(yfov) + + return rot, trans, K + + +def flip_towards_viewer(normals, points): + points = points / np.linalg.norm(points) + proj = points.dot(normals[:2, :].T) + flip = np.where(proj > 0) + normals[flip, :] = -normals[flip, :] + return normals + + +def get_corners_of_bb3d(basis, coeffs, centroid): + corners = np.zeros((8, 3)) + # order the basis + index = np.argsort(np.abs(basis[:, 0]))[::-1] + # the case that two same value appear the same time + if index[2] != 2: + index[1:] = index[1:][::-1] + basis = basis[index, :] + coeffs = coeffs[index] + # Now, we know the basis vectors are orders X, Y, Z. 
Next, flip the basis vectors towards the viewer + basis = flip_towards_viewer(basis, centroid) + coeffs = np.abs(coeffs) + corners[0, :] = -basis[0, :] * coeffs[0] + basis[1, :] * coeffs[1] + basis[2, :] * coeffs[2] + corners[1, :] = basis[0, :] * coeffs[0] + basis[1, :] * coeffs[1] + basis[2, :] * coeffs[2] + corners[2, :] = basis[0, :] * coeffs[0] + -basis[1, :] * coeffs[1] + basis[2, :] * coeffs[2] + corners[3, :] = -basis[0, :] * coeffs[0] + -basis[1, :] * coeffs[1] + basis[2, :] * coeffs[2] + + corners[4, :] = -basis[0, :] * coeffs[0] + basis[1, :] * coeffs[1] + -basis[2, :] * coeffs[2] + corners[5, :] = basis[0, :] * coeffs[0] + basis[1, :] * coeffs[1] + -basis[2, :] * coeffs[2] + corners[6, :] = basis[0, :] * coeffs[0] + -basis[1, :] * coeffs[1] + -basis[2, :] * coeffs[2] + corners[7, :] = -basis[0, :] * coeffs[0] + -basis[1, :] * coeffs[1] + -basis[2, :] * coeffs[2] + corners = corners + np.tile(centroid, (8, 1)) + return corners + + +def get_corners_of_bb3d_no_index(basis, coeffs, centroid): + corners = np.zeros((8, 3)) + coeffs = np.abs(coeffs) + corners[0, :] = -basis[0, :] * coeffs[0] + basis[1, :] * coeffs[1] + basis[2, :] * coeffs[2] + corners[1, :] = basis[0, :] * coeffs[0] + basis[1, :] * coeffs[1] + basis[2, :] * coeffs[2] + corners[2, :] = basis[0, :] * coeffs[0] + -basis[1, :] * coeffs[1] + basis[2, :] * coeffs[2] + corners[3, :] = -basis[0, :] * coeffs[0] + -basis[1, :] * coeffs[1] + basis[2, :] * coeffs[2] + + corners[4, :] = -basis[0, :] * coeffs[0] + basis[1, :] * coeffs[1] + -basis[2, :] * coeffs[2] + corners[5, :] = basis[0, :] * coeffs[0] + basis[1, :] * coeffs[1] + -basis[2, :] * coeffs[2] + corners[6, :] = basis[0, :] * coeffs[0] + -basis[1, :] * coeffs[1] + -basis[2, :] * coeffs[2] + corners[7, :] = -basis[0, :] * coeffs[0] + -basis[1, :] * coeffs[1] + -basis[2, :] * coeffs[2] + + corners = corners + np.tile(centroid, (8, 1)) + return corners + + +def project_3d_points_to_2d(points3d, R_ex, K): + """ + Project 3d points from camera-centered coordinate to 2D image plane + Parameters + ---------- + points3d: numpy array + 3d location of point + R_ex: numpy array + extrinsic camera parameter + K: numpy array + intrinsic camera parameter + Returns + ------- + points2d: numpy array + 2d location of the point + """ + points3d = R_ex.dot(points3d.T).T + x3 = points3d[:, 0] + y3 = -points3d[:, 1] + z3 = np.abs(points3d[:, 2]) + xx = x3 * K[0, 0] / z3 + K[0, 2] + yy = y3 * K[1, 1] / z3 + K[1, 2] + points2d = np.vstack((xx, yy)) + return points2d + + +def project_struct_bdb_to_2d(basis, coeffs, center, R_ex, K): + """ + Project 3d bounding box to 2d bounding box + Parameters + ---------- + basis, coeffs, center, R_ex, K + : K is the intrinsic camera parameter matrix + : Rtilt is the extrinsic camera parameter matrix in right hand coordinates + Returns + ------- + bdb2d: dict + Keys: {'x1', 'x2', 'y1', 'y2'} + The (x1, y1) position is at the top left corner, + the (x2, y2) position is at the bottom right corner + """ + corners3d = get_corners_of_bb3d(basis, coeffs, center) + corners = project_3d_points_to_2d(corners3d, R_ex, K) + bdb2d = dict() + bdb2d['x1'] = int(max(np.min(corners[0, :]), 1)) # x1 + bdb2d['y1'] = int(max(np.min(corners[1, :]), 1)) # y1 + bdb2d['x2'] = int(min(np.max(corners[0, :]), 2*K[0, 2])) # x2 + bdb2d['y2'] = int(min(np.max(corners[1, :]), 2*K[1, 2])) # y2 + # if not check_bdb(bdb2d, 2*K[0, 2], 2*K[1, 2]): + # bdb2d = None + return bdb2d \ No newline at end of file diff --git a/prepare_data/structured3d/save_floorplan.py 
b/prepare_data/structured3d/save_floorplan.py new file mode 100644 index 0000000..efa2391 --- /dev/null +++ b/prepare_data/structured3d/save_floorplan.py @@ -0,0 +1,170 @@ +import argparse +import json +import os + +import matplotlib.pyplot as plt +import numpy as np +from matplotlib import colors +from shapely.geometry import Polygon, Point +from shapely.plotting import plot_polygon + +from misc.colors import semantics_cmap +from misc.utils import get_corners_of_bb3d_no_index + +rooms = [ + "living room", + "kitchen", + "bedroom", + "bathroom", + "balcony", + "corridor", + "dining room", + "study", + "studio", + "store room", + "garden", + "laundry room", + "office", + "basement", + "garage", + "undefined" +] + +def convert_lines_to_vertices(lines): + """convert line representation to polygon vertices + """ + polygons = [] + lines = np.array(lines) + + polygon = None + while len(lines) != 0: + if polygon is None: + polygon = lines[0].tolist() + lines = np.delete(lines, 0, 0) + + lineID, juncID = np.where(lines == polygon[-1]) + vertex = lines[lineID[0], 1 - juncID[0]] + lines = np.delete(lines, lineID, 0) + + if vertex in polygon: + polygons.append(polygon) + polygon = None + else: + polygon.append(vertex) + + return polygons + + +def visualize_floorplan(scene_path): + """visualize floorplan + """ + with open(os.path.join(scene_path, "annotation_3d.json")) as file: + annos = json.load(file) + + with open(os.path.join(scene_path, "bbox_3d.json")) as file: + boxes = json.load(file) + + # extract the floor in each semantic for floorplan visualization + planes = [] + for semantic in annos['semantics']: + for planeID in semantic['planeID']: + if annos['planes'][planeID]['type'] == 'floor': + planes.append({'planeID': planeID, 'type': semantic['type'], 'room_ID': semantic['ID']}) + + if semantic['type'] == 'outwall': + outerwall_planes = semantic['planeID'] + + # extract hole vertices + lines_holes = [] + for semantic in annos['semantics']: + if semantic['type'] in ['window', 'door']: + for planeID in semantic['planeID']: + lines_holes.extend(np.where(np.array(annos['planeLineMatrix'][planeID]))[0].tolist()) + lines_holes = np.unique(lines_holes) + + # junctions on the floor + junctions = np.array([junc['coordinate'] for junc in annos['junctions']]) + junction_floor = np.where(np.isclose(junctions[:, -1], 0))[0] + + # construct each polygon + polygons = [] + for plane in planes: + lineIDs = np.where(np.array(annos['planeLineMatrix'][plane['planeID']]))[0].tolist() + junction_pairs = [np.where(np.array(annos['lineJunctionMatrix'][lineID]))[0].tolist() for lineID in lineIDs] + polygon = convert_lines_to_vertices(junction_pairs) + polygons.append([polygon[0], plane['type'], plane['room_ID']]) + + outerwall_floor = [] + for planeID in outerwall_planes: + lineIDs = np.where(np.array(annos['planeLineMatrix'][planeID]))[0].tolist() + lineIDs = np.setdiff1d(lineIDs, lines_holes) + junction_pairs = [np.where(np.array(annos['lineJunctionMatrix'][lineID]))[0].tolist() for lineID in lineIDs] + for start, end in junction_pairs: + if start in junction_floor and end in junction_floor: + outerwall_floor.append([start, end]) + + outerwall_polygon = convert_lines_to_vertices(outerwall_floor) + polygons.append([outerwall_polygon[0], 'outwall', 0]) + + junctions = np.array([junc['coordinate'][:2] for junc in annos['junctions']]) + + room_polygons = {} + for (polygon, poly_type, room_id) in polygons: + if poly_type in rooms: + if poly_type not in room_polygons: + room_polygons[room_id] = [] + 
room_polygons[room_id].append(polygon) + + floorplans_dir = os.path.join(scene_path, 'floorplans') + os.makedirs(floorplans_dir, exist_ok=True) + + for room_id, room_polys in room_polygons.items(): + fig = plt.figure() + ax = fig.add_subplot(1, 1, 1) + room_polygon_objects = [] + for polygon in room_polys: + polygon = np.array(polygon + [polygon[0], ]) + polygon = Polygon(junctions[polygon]) + room_polygon_objects.append(polygon) + room_type = next((item['type'] for item in annos['semantics'] if item['ID'] == room_id)) + plot_polygon(polygon, ax=ax, add_points=False, facecolor=semantics_cmap[room_type], alpha=0.5) + + for bbox in boxes: + basis = np.array(bbox['basis']) + coeffs = np.array(bbox['coeffs']) + centroid = np.array(bbox['centroid']) + + corners = get_corners_of_bb3d_no_index(basis, coeffs, centroid) + corners = corners[[0, 1, 2, 3, 0], :2] + + bbox_polygon = Polygon(corners) + for room_polygon in room_polygon_objects: + if room_polygon.contains(Point(centroid[:2])): + plot_polygon(bbox_polygon, ax=ax, add_points=False, facecolor=colors.rgb2hex(np.random.rand(3)), alpha=0.5) + + + plt.axis('equal') + plt.axis('off') + output_file = os.path.join(floorplans_dir, f"{room_id}.png") + plt.savefig(output_file, format='png', dpi=300, bbox_inches='tight', pad_inches=0) + plt.close(fig) + + +def parse_args(): + parser = argparse.ArgumentParser( + description="Structured3D Floorplan Visualization") + parser.add_argument("--path", required=True, + help="dataset path", metavar="DIR") + return parser.parse_args() + + +def main(): + args = parse_args() + scenes = [d for d in os.listdir(args.path) if os.path.isdir(os.path.join(args.path, d)) and d.startswith('scene_')] + for scene in scenes: + scene_path = os.path.join(args.path, scene) + visualize_floorplan(scene_path) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/prepare_data/structured3d/uni3dscene.py b/prepare_data/structured3d/uni3dscene.py new file mode 100644 index 0000000..e1c6ec7 --- /dev/null +++ b/prepare_data/structured3d/uni3dscene.py @@ -0,0 +1,417 @@ +# pylint: disable=no-member +import os +import io +import json +import pickle +from typing import List, Tuple, Dict +import multiprocessing +import cv2 +import numpy as np +from PIL import Image as pil_image + +from utils.config import ProcessUnit, EnvsConfig +from utils.nyu_40 import NYU40 +from utils.s3dutilize import S3DUtilize, Annotations +from utils.base_dataset import DatasetBase +import argparse + +BASE_DIR = '/Users/gauravpradeep/Crossover_ScaleUp/Structured3D/scans' +class Structured3DDataGen(DatasetBase): + """ + Dataset generation for Structured3D. + + Two separated folders will be created in target folder -- points and semantic_mask. + Points will be saved a .bin file with raw shape [N, 6] (3 for XYZ, 3 for RGB) + and data type np.float32. Semantic mask will be saved a .bin file with raw shape + [N] and data type np.int64. 
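+
+    Typical usage (a sketch mirroring the CLI entry point at the bottom of
+    this file; all paths are placeholders, and the module-level BASE_DIR must
+    also point at the scans folder):
+
+        envs = EnvsConfig()
+        envs.in_data_root = '/path/to/Structured3D/scans'
+        envs.out_data_root = '/path/to/Structured3D/uni3d_output'
+        proc_unit = ProcessUnit()
+        proc_unit.out_paths = ['points', 'semantic_mask', 'instance', 'annotations']
+        proc_unit.attrs = {'room_types': ['all']}
+        Structured3DDataGen([proc_unit], envs).format_dataset(proc_unit)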
+ """ + IMAGE_PREFIX = '/2D_rendering' + + PERSPECTIVE_PREFIX = '/perspective/full' + PRSP_CAM_FILE = 'camera_pose.txt' + + PANORAMIC_PREFIX = '/panorama/full' + PANO_CAM_PREFIX = '/panorama' + PANO_CAM_FILE = 'camera_xyz.txt' + + SEMANTIC_FILE = 'semantic.png' + RGB_FILE = 'rgb_rawlight.png' + DEPTH_FILE = 'depth.png' + + ANNO_FILE = 'bbox_3d.json' + + def __init__(self, proc_units: List[ProcessUnit], envs: EnvsConfig) -> None: + super().__init__(proc_units, envs) + self._zip_folder = BASE_DIR + + def _get_rooms_list_by_types(self, room_types: List[str]) -> List[str]: + assert len(room_types) == 1 and 'all' in room_types + scenes_list = [d for d in os.listdir(self._zip_folder) if os.path.isdir(os.path.join(self._zip_folder, d))] + rooms_list = list() + for scene_path in scenes_list: + rooms_name = os.listdir(os.path.join(self._zip_folder, scene_path, __class__.IMAGE_PREFIX.strip('/'))) + rooms_list.extend([os.path.join(scene_path, __class__.IMAGE_PREFIX.strip('/'), _r) for _r in rooms_name]) + return rooms_list + + @staticmethod + def read_camera_and_image(cam_path: str, info_flags: int, info_root: str) -> Tuple[List, List[np.ndarray]]: + """ + Read camera poses and images from the file system + + Args: + cam_path (str): the relative path of camera + info_flags (int): the flag of the type of images to be read + + Returns: + Tuple[List, List[np.ndarray]]: Camera information and a list of images + """ + if info_root is None: + info_root = cam_path[:cam_path.rfind('/')] + + out_cams = list() + if info_flags & 1: + # Load camera poses + z2y_top_m = np.array([[0, 1, 0], [0, 0, 1], [1, 0, 0]], dtype=np.float32) + with open(cam_path, 'r') as f: + cam_extr = np.fromstring(f.read(), dtype=np.float32, sep=' ') + cam_t = np.matmul(z2y_top_m, cam_extr[:3] / 1000) + if cam_extr.shape[0] > 3: + cam_r = S3DUtilize.get_rotation_matrix_from_tu(cam_extr[3:6], cam_extr[6:9]) + cam_r = np.matmul(z2y_top_m, cam_r) + cam_hf = cam_extr[9:11] + else: + cam_r = np.eye(3, dtype=np.float32) + cam_hf = None + out_cams.extend([cam_r, cam_t, cam_hf]) + out_images = list() + if info_flags & 2: + # Load depth image + depth_image = cv2.imread(os.path.join(info_root, __class__.DEPTH_FILE), cv2.IMREAD_UNCHANGED)[..., np.newaxis] + depth_image[depth_image == 0] = 65535 + out_images.append(depth_image) + if info_flags & 4: + # Load RGB image + color_image = cv2.imread(os.path.join(info_root, __class__.RGB_FILE), cv2.IMREAD_UNCHANGED)[..., :3][..., ::-1] + out_images.append(color_image) + if info_flags & 8: + # Load semantic image + smnt_image = np.array(pil_image.open(os.path.join(info_root, __class__.SEMANTIC_FILE)))[..., np.newaxis] + out_images.append(smnt_image) + return out_cams, out_images + + @staticmethod + def normal_from_cross_product(points_2d: np.ndarray) -> np.ndarray: + xyz_points_pad = np.pad(points_2d, ((0, 1), (0, 1), (0, 0)), mode='symmetric') + xyz_points_ver = (xyz_points_pad[:, :-1, :] - xyz_points_pad[:, 1:, :])[:-1, :, :] + xyz_points_hor = (xyz_points_pad[:-1, :, :] - xyz_points_pad[1:, :, :])[:, :-1, :] + xyz_normal = np.cross(xyz_points_hor, xyz_points_ver) + xyz_dist = np.linalg.norm(xyz_normal, axis=-1, keepdims=True) + xyz_normal = np.divide(xyz_normal, xyz_dist, out=np.zeros_like(xyz_normal), where=xyz_dist != 0) + return xyz_normal + + @staticmethod + def view2points_prsp(cam_paras: List[np.ndarray], attr_images: List[np.ndarray], cos_thrsh=0.15): + """ + View to 3D points casting of a single perspective image + + Args: + cam_paras (List[np.ndarray]): camera parameters + attr_images 
(List[np.ndarray]): a list of images to be casted + cos_thrsh (float, optional): the cosine threshold to filtering interpolated depth. Defaults to 0.15. + + Returns: + Tuple[np.ndarray, np.ndarray, np.ndarray] + """ + depth_img, color_img, smnt_img = attr_images + cam_r, cam_t, cam_hf = cam_paras + img_size = np.asarray(depth_img.shape[:2])[::-1] + cam_focal = img_size / 2 / np.tan(cam_hf) + cam_fov_d = S3DUtilize.get_fov_normal(img_size, cam_focal).astype(np.float32) + v_points = S3DUtilize.cast_perspective_to_local_coord(depth_img, cam_fov_d) + vi_normals = __class__.normal_from_cross_product(v_points) + + # Filtering invalid points + view_dist = np.maximum(np.linalg.norm(v_points, axis=-1, keepdims=True), float(10e-5)) + cosine_dist = np.sum((v_points * vi_normals / view_dist), axis=-1, keepdims=True) + cosine_dist = np.abs(cosine_dist) + point_valid = cosine_dist > cos_thrsh + depth_valid = depth_img < 65535 + smnt_valid = smnt_img > 0 + all_valid = (point_valid & depth_valid & smnt_valid)[..., 0] + + v_points = np.matmul(v_points / 1000, cam_r.T) + cam_t + v_normal = __class__.normal_from_cross_product(v_points) + + return v_points[all_valid], color_img[all_valid], v_normal[all_valid], smnt_img[all_valid] + + @staticmethod + def view2points_pano(cam_paras: List[np.ndarray], attr_images: List[np.ndarray], cos_thrsh=0.15): + """ + View to 3D points casting of a single panoramic image + + Args: + cam_paras (List[np.ndarray]): camera parameters + attr_images (List[np.ndarray]): a list of images to be casted + + Returns: + Tuple[np.ndarray, np.ndarray, np.ndarray] + """ + depth_img, color_img, smnt_img = attr_images + _, cam_t, _ = cam_paras + p_h, p_w = attr_images[0].shape[:2] + p_a = np.arange(p_w, dtype=np.float32) / p_w * 2 * np.pi - np.pi + p_b = np.arange(p_h, dtype=np.float32) / p_h * np.pi * -1 + np.pi/2 + p_a = np.tile(p_a[None], [p_h, 1])[..., np.newaxis] + p_b = np.tile(p_b[:, None], [1, p_w])[..., np.newaxis] + p_a_sin, p_a_cos, p_b_sin, p_b_cos = np.sin(p_a), np.cos(p_a), np.sin(p_b), np.cos(p_b) + point_x = depth_img * p_a_cos * p_b_cos + point_y = depth_img * p_b_sin + point_z = depth_img * p_a_sin * p_b_cos + points = np.concatenate([point_x, point_y, point_z], axis=-1) / 1000 + vi_normals = __class__.normal_from_cross_product(points) + # Filtering invalid points + view_dist = np.maximum(np.linalg.norm(points, axis=-1, keepdims=True), float(10e-5)) + cosine_dist = np.sum((points * vi_normals / view_dist), axis=-1, keepdims=True) + cosine_dist = np.abs(cosine_dist) + point_valid = cosine_dist > cos_thrsh + all_valid = (point_valid & (depth_img < 65535) & (smnt_img > 0))[..., 0] + + points = points + cam_t + + return points[all_valid], color_img[all_valid], vi_normals[all_valid], smnt_img[all_valid] + + @staticmethod + def _points2voxel(attr_points: List[np.ndarray], res=0.005) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]: + p_points, p_colors, p_labels, p_normals = attr_points + + try: + vd_points = np.floor(p_points / res).astype(np.int64) + vd_max = np.max(vd_points, axis=0) + vd_min = np.min(vd_points, axis=0) + vd_box = np.cumprod([1, *(vd_max - vd_min)[:2]]) + + vd_indices = np.sum((vd_points - vd_min[np.newaxis, ...]) * vd_box[np.newaxis, ...], axis=-1) + _, vd_uni = np.unique(vd_indices, return_index=True) + except ValueError: + return None, None, None, None + + return p_points[vd_uni], p_colors[vd_uni], p_labels[vd_uni], p_normals[vd_uni] + + @staticmethod + def _view2points(room_path: str) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]: + 
prsp_root = os.path.join(room_path, __class__.PERSPECTIVE_PREFIX.strip('/')) + cam_paths = [os.path.join(prsp_root, f) for f in os.listdir(os.path.join(BASE_DIR,prsp_root)) if f.endswith(__class__.PRSP_CAM_FILE)] + all_infos = list() + for cam_path in cam_paths: + cam_paras, attr_images = __class__.read_camera_and_image(cam_path, 15, None) + r_points, r_colors, r_normal, r_labels = __class__.view2points_prsp(cam_paras, attr_images) + all_infos.append((r_points, r_colors, r_normal, r_labels)) + + pano_cam_root = os.path.join(room_path, __class__.PANO_CAM_PREFIX.strip('/')) + cam_paths = [os.path.join(BASE_DIR, pano_cam_root, f) for f in os.listdir(os.path.join(BASE_DIR, pano_cam_root)) if f.endswith(__class__.PANO_CAM_FILE)] + for cam_path in cam_paths: + pano_root = os.path.dirname(os.path.dirname(cam_path)) + pano_root = os.path.join(pano_root, __class__.PANORAMIC_PREFIX.strip('/')) + cam_paras, attr_images = __class__.read_camera_and_image(cam_path, 15, pano_root) + r_points, r_colors, r_normal, r_labels = __class__.view2points_pano(cam_paras, attr_images) + all_infos.append((r_points, r_colors, r_normal, r_labels)) + + a_points = np.concatenate([_i[0] for _i in all_infos], axis=0) + a_colors = np.concatenate([_i[1] for _i in all_infos], axis=0) + a_normals = np.concatenate([_i[2] for _i in all_infos], axis=0) + a_labels = np.concatenate([_i[3] for _i in all_infos], axis=0) + + a_points = a_points[..., [2, 0, 1]] # Convert Y-top to Z-top + a_normals = a_normals[..., [2, 0, 1]] + # print(len(a_points), len(a_colors), len(a_labels), len(a_normals)) + return a_points, a_colors, a_labels, a_normals + + @staticmethod + def _read_instance_infos(room_path: str, points: np.ndarray, labels: np.ndarray, min_pts=50) -> Dict: + scene_id, _, _ = room_path.split('/') + anno_file = os.path.join(BASE_DIR,scene_id, __class__.ANNO_FILE) + if not os.path.exists(anno_file): + return None + + with open(anno_file, 'r') as f: + boxes_info: List[Dict] = json.load(f) + + anno_infos = Annotations() + rb_idx = 0 # room bounding box ID + obj2tgid={} + for box_info in boxes_info: + + b_id = int(box_info['ID']) + centroid = np.asarray(box_info['centroid'], dtype=np.float32) / 1000 + coeffs = np.asarray(box_info['coeffs'], dtype=np.float32) / 1000 + basis = np.asarray(box_info['basis'], dtype=np.float32) + obb_8pts = S3DUtilize.get_8points_bounding_box(basis, coeffs, centroid) + + box_min = np.min(obb_8pts, axis=0, keepdims=True) + box_max = np.max(obb_8pts, axis=0, keepdims=True) + + point_max_mask = np.all(points < box_max, axis=1) + point_min_mask = np.all(points > box_min, axis=1) + point_mask = np.logical_and(point_max_mask, point_min_mask) + box_points: np.ndarray = points[point_mask] + if box_points.size < min_pts: + continue + + box_instances = labels[point_mask][..., 0] + instance_id, instance_count = np.unique(box_instances, return_counts=True) + instance_id = instance_id[np.argmax(instance_count)] + + instance_points = box_points[box_instances == instance_id] + ip_box_min = np.min(instance_points, axis=0) + ip_box_max = np.max(instance_points, axis=0) + dimension = np.maximum(centroid - ip_box_min, ip_box_max - centroid) + + ur_depth = np.concatenate([centroid, dimension * 2], axis=0) + + anno_infos.index.append(rb_idx) + anno_infos.classes.append(instance_id) + anno_infos.name.append(NYU40.index_to_label(instance_id)) + anno_infos.location.append(centroid) + anno_infos.dimensions.append(dimension) + anno_infos.gt_boxes_upright_depth.append(ur_depth) + anno_infos.unaligned_location.append(centroid) + 
anno_infos.unaligned_dimensions.append(dimension) + anno_infos.unaligned_gt_boxes_upright_depth.append(ur_depth) + obj2tgid[b_id] = rb_idx + rb_idx += 1 + + obj2tgid_path = os.path.join(BASE_DIR, room_path, "obj2tgid.json") + with open(obj2tgid_path, 'w') as json_file: + json.dump(obj2tgid, json_file) + + anno_infos.gt_num = rb_idx + anno_infos.axis_align_matrix = np.eye(4, dtype=np.float64) + return anno_infos.dump() + + def _mp_format_dataset(self, rooms_list: List[str], proc_unit: ProcessUnit, start_index=0, worker_id=0): + del start_index, worker_id + + points_folder = self.envs.get_env_path(proc_unit.out_paths[0]) + os.makedirs(points_folder, exist_ok=True) + semantics_folder = self.envs.get_env_path(proc_unit.out_paths[1]) + os.makedirs(semantics_folder, exist_ok=True) + instance_folder = self.envs.get_env_path(proc_unit.out_paths[2]) + os.makedirs(instance_folder, exist_ok=True) + annotation_folder = self.envs.get_env_path(proc_unit.out_paths[3]) + os.makedirs(annotation_folder, exist_ok=True) + + for _, room_path in enumerate(rooms_list): + if '.' in room_path: + continue + scene_id, _, room_id = room_path.split('/') + dump_name = f'{scene_id}_{room_id}_1cm.bin' + points_path = os.path.join(points_folder, dump_name) + semantics_path = os.path.join(semantics_folder, dump_name) + instance_path = os.path.join(instance_folder, dump_name) + annotation_path = os.path.join(annotation_folder, dump_name) + if np.all([os.path.exists(_path) for _path in [points_path, semantics_path, annotation_path]]): + continue + + # Step 1: Read images and make point clouds + + a_points, a_colors, a_labels, a_normals = self._view2points(room_path) + v_points, v_colors, v_labels, v_normals = self._points2voxel((a_points, a_colors, a_labels, a_normals), 0.01) + if v_points is None: + print(f'Ignore {room_path} with invalid points') + continue + # Step 2: Read bounding box information + anno_infos = self._read_instance_infos(room_path, v_points, v_labels) + if anno_infos is None: + print(f'Ignore {room_path} with invalid annotations') + continue + # print(v_points.shape) + # print(v_colors.shape) + # print(v_labels.shape) + # print(v_normals.shape) + np.concatenate([v_points.astype(np.float32), v_colors.astype(np.float32), v_normals.astype(np.float32)], axis=-1).tofile(points_path) + v_labels.astype(np.int64).tofile(semantics_path) + with open(annotation_path, 'wb') as a_fp: + pickle.dump(anno_infos, a_fp) + + def multiple_processor(func, samples: List, workers, args: Tuple): + samples_per_worker = int((len(samples) - 1) / workers + 1) + processes = list() + for w in range(workers): + start_index = w * samples_per_worker + end_index = min((w + 1) * samples_per_worker, len(samples)) + f_args = (samples[start_index: end_index], ) + args + (start_index, w) + t = multiprocessing.Process(target=func, args=f_args) + processes.append(t) + t.start() + for p in processes: + p.join() + + def format_dataset(self, proc_unit: ProcessUnit): + attrs = proc_unit.attrs + + desc_dir = os.path.join(self.envs.out_data_root, 'desc') + os.makedirs(desc_dir, exist_ok=True) + with open(os.path.join(desc_dir, proc_unit.out_paths[0]), 'wb') as b_fp: + pickle.dump(np.zeros([0, 9], np.float32), b_fp) + with open(os.path.join(desc_dir, proc_unit.out_paths[1]), 'wb') as b_fp: + pickle.dump(np.zeros([0], np.int64), b_fp) + + rooms_list = self._get_rooms_list_by_types(attrs['room_types']) + + __class__.multiple_processor(self._mp_format_dataset, rooms_list, 8, \ + (proc_unit, )) +# def main(): +# # Create the environment configuration 
instance +# envs = EnvsConfig() +# envs.out_data_root = "/Users/gauravpradeep/Crossover_ScaleUp/Structured3D/uni3d_output" +# envs.in_data_root = "/Users/gauravpradeep/Crossover_ScaleUp/Structured3D/scans" +# # Add other necessary environment variables here + +# # Define the process unit +# proc_unit = ProcessUnit() +# proc_unit.in_paths = ["data"] +# proc_unit.out_paths = ["points", "semantic_mask", "instance", "annotations"] +# proc_unit.attrs = {"room_types": ["all"]} + +# # Create the Structured3DDataGen instance +# data_gen = Structured3DDataGen([proc_unit], envs) + +# # Run the dataset formatting +# data_gen.format_dataset(proc_unit) + +# if __name__ == "__main__": +# main() + +def parse_args(): + parser = argparse.ArgumentParser(description='Process Structured3D dataset') + parser.add_argument('--base_dir', type=str, + default='/Users/gauravpradeep/Crossover_ScaleUp/Structured3D/scans', + help='Base directory for scans') + parser.add_argument('--out_data_root', type=str, + default='/Users/gauravpradeep/Crossover_ScaleUp/Structured3D/uni3d_output', + help='Output data root directory') + parser.add_argument('--in_data_root', type=str, + default='/Users/gauravpradeep/Crossover_ScaleUp/Structured3D/scans', + help='Input data root directory') + return parser.parse_args() + +def main(): + args = parse_args() + global BASE_DIR + BASE_DIR = args.base_dir + + envs = EnvsConfig() + envs.out_data_root = args.out_data_root + envs.in_data_root = args.in_data_root + + proc_unit = ProcessUnit() + proc_unit.in_paths = ["data"] + proc_unit.out_paths = ["points", "semantic_mask", "instance", "annotations"] + proc_unit.attrs = {"room_types": ["all"]} + + data_gen = Structured3DDataGen([proc_unit], envs) + + data_gen.format_dataset(proc_unit) + +if __name__ == "__main__": + main() diff --git a/prepare_data/structured3d/utils/base_dataset.py b/prepare_data/structured3d/utils/base_dataset.py new file mode 100644 index 0000000..c59fc7e --- /dev/null +++ b/prepare_data/structured3d/utils/base_dataset.py @@ -0,0 +1,18 @@ +""" +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +""" +from abc import abstractmethod + +from utils.config import ProcessUnit, EntryBase + + +class DatasetBase(EntryBase): + """ + The base class of dataset + """ + @abstractmethod + def format_dataset(self, proc_unit: ProcessUnit): + """ + Construct 3D point cloud from views + """ \ No newline at end of file diff --git a/prepare_data/structured3d/utils/config.py b/prepare_data/structured3d/utils/config.py new file mode 100644 index 0000000..b5f89db --- /dev/null +++ b/prepare_data/structured3d/utils/config.py @@ -0,0 +1,270 @@ +""" +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +""" +# pylint: disable=logging-fstring-interpolation +import logging +import re +import multiprocessing as mp +from abc import abstractmethod +from typing import List, Dict, Tuple +import multiprocessing +import yaml +import json +import copy +import inspect +import logging +import numpy as np +import os + +class DictRecursive(object): + def __init__(self): + pass + + def load(self, kargs: dict, shared_dict=None): + """ + Launch args of class from a dict. All subclass of DictRecursive will call this function automatically. 
Supported + types includes int, float, list, str and DictRecursive + + Args: + kargs: a dict saved the pairs of name/value of attributions + shared_dict: a shared item used by all other items + """ + if shared_dict is None: + shared_dict = {} + for cls_arg_name in self.__dict__.keys(): + arg_value = None + if kargs is not None: + arg_value = kargs[cls_arg_name] if cls_arg_name in kargs.keys() else None + if shared_dict is not None: + arg_value = shared_dict[cls_arg_name] if cls_arg_name in shared_dict.keys() else arg_value + cls_arg = self.__dict__[cls_arg_name] + self.__dict__[cls_arg_name] = self.parse_single_arg(cls_arg, arg_value, shared_dict) + return self + + def save(self): + save_dict = {} + for cls_arg_name in self.__dict__.keys(): + save_dict[cls_arg_name] = self.inverse_single_arg(self.__dict__[cls_arg_name]) + return save_dict + + def load_from_yaml(self, yaml_path, shared_scope=''): + with open(yaml_path, 'r', encoding='utf-8') as fp: + cfg_cxt = yaml.load(fp.read(), Loader=yaml.FullLoader) + self.load(cfg_cxt, cfg_cxt[shared_scope] if shared_scope in cfg_cxt.keys() else dict()) + + def load_from_json(self, json_path): + with open(json_path, 'r', encoding='utf-8') as fp: + self.load(json.load(fp)) + + def save_to_json(self, json_path): + with open(json_path, 'w') as fp: + save_meta = self.save() + json.dump(self.save(), fp) + + @staticmethod + def inverse_single_arg(arg_value): + if issubclass(type(arg_value), DictRecursive): + return arg_value.save() + elif isinstance(arg_value, list): + list_arg_value = list() + for a_v in arg_value: + list_arg_value.append(DictRecursive.inverse_single_arg(a_v)) + return list_arg_value + elif isinstance(arg_value, np.ndarray): + return arg_value.tolist() + else: + return arg_value + + @staticmethod + def parse_single_arg(cls_arg, arg_value, shared_dict=None): + if isinstance(cls_arg, int): + cls_arg_value = int(arg_value) if arg_value is not None else cls_arg + elif isinstance(cls_arg, str): + cls_arg_value = str(arg_value) if arg_value is not None else cls_arg + elif isinstance(cls_arg, float): + cls_arg_value = float(arg_value) if arg_value is not None else cls_arg + elif isinstance(cls_arg, list): + cls_arg_value = list() + cls_arg_e = str() if not cls_arg else cls_arg[0] + if arg_value is not None: + for a_v in arg_value: + cls_arg_value.append(DictRecursive.parse_single_arg(cls_arg_e, a_v, shared_dict)) + elif isinstance(cls_arg, dict): + if arg_value is not None: + cls_arg_value = dict() + for a_v in arg_value: + cls_arg_value[a_v] = arg_value[a_v] + else: + cls_arg_value = cls_arg + elif isinstance(cls_arg, np.ndarray): + if arg_value is not None: + cls_arg_value = np.asarray(arg_value, cls_arg.dtype) + else: + cls_arg_value = cls_arg + elif issubclass(type(cls_arg), DictRecursive): + cls_arg_value = type(cls_arg)() + cls_arg_value.load(arg_value, shared_dict) + else: + raise NotImplementedError + return cls_arg_value + + def match_function_args(self, external_dict, target_func): + args_dict = copy.deepcopy(external_dict) + for func_key in inspect.signature(target_func).parameters.keys(): + if func_key not in self.__dict__.keys(): + continue + if func_key in args_dict.keys(): + continue + args_dict[func_key] = self.__dict__[func_key] + return args_dict + + +class ProcessUnit(DictRecursive): + """ + Pipeline units + """ + def __init__(self): + super().__init__() + self.assemble_function = str() + self.name = str() + self.stride = 1 + self.attrs = dict() + self.in_paths = list() + self.out_paths = list() + + +class 
EntryConfig(DictRecursive): + """ + Main entry of each task + """ + def __init__(self): + super().__init__() + self.assemble_class = str() + self.process_pipelines = list([ProcessUnit()]) + + +class EnvsConfig(DictRecursive): + """ + Global environments + """ + def __init__(self): + super().__init__() + self.in_data_root = str() + self.out_data_root = str() + self.io_paths: Dict[str, str] = dict() + + def get_env_path(self, env_name: str): + """ + Get the absolute folder path by the env name + """ + if 'in_data_root' not in self.io_paths: + self.io_paths['in_data_root'] = self.in_data_root + self.io_paths['out_data_root'] = self.out_data_root + self.io_paths = { + "points": os.path.join(self.out_data_root, "points"), + "semantic_mask": os.path.join(self.out_data_root, "semantic_mask"), + "instance": os.path.join(self.out_data_root, "instance"), + "annotations": os.path.join(self.out_data_root, "annotations"), + } + rel_path = self.io_paths[env_name] + while True: + regex_pattern = r'\$.*\$' + patterns = re.findall(regex_pattern, rel_path) + if not patterns: + break + rel_path = rel_path.replace(patterns[0], self.io_paths[patterns[0][1:-1]]) + return rel_path + + +class StreamingTasks(DictRecursive): + """ + Main entry of streaming tasks + """ + def __init__(self): + super().__init__() + self.envs = EnvsConfig() + self.streaming_lines = list([EntryConfig()]) + + +class EntryBase: + """ + The basic config of entry + """ + def __init__(self, proc_units: List[ProcessUnit], envs: EnvsConfig) -> None: + self.proc_units = proc_units + self.envs = envs + + def execute_pipeline(self): + """ + execute the data processing pipeline + """ + for proc_unit in self.proc_units: + proc_func = getattr(self, proc_unit.assemble_function) + proc_func(proc_unit) + + +class MPEntryBase(EntryBase): + """ + The multi-process config of entry + """ + def __init__(self, proc_units: List[ProcessUnit], envs: EnvsConfig) -> None: + super().__init__(proc_units, envs) + self._enable_mp = True + self._num_worker = 8 + + @abstractmethod + def _sample_list(self): + """ + Return the list of samples to be processed + """ + + def _execute_proc_unit(self, sample: str, proc_unit: ProcessUnit, shared_vars: Dict): + proc_func = getattr(self, proc_unit.assemble_function) + proc_func(sample, proc_unit, shared_vars) + + def _merged_cross_processing(self, ipc_vars): + """ + Merge all shared list information cross all processors + """ + + def _merged_within_processing(self, shared_vars, ipc_vars): + """ + Merge all information within a processor + """ + + def _mp_execute_pipeline(self, samples, ipc_vars: List, worker_offset=0, worker_id=0): + del worker_offset + logging.info(f'worker {worker_id} begin...') + shared_vars = dict() + for s_idx, sample in enumerate(samples): + for proc_unit in self.proc_units: + if s_idx % proc_unit.stride != 0: + continue + self._execute_proc_unit(sample, proc_unit, shared_vars) + self._merged_within_processing(shared_vars, ipc_vars) + + def multiple_processor(func, samples: List, workers, args: Tuple): + samples_per_worker = int((len(samples) - 1) / workers + 1) + processes = list() + for w in range(workers): + start_index = w * samples_per_worker + end_index = min((w + 1) * samples_per_worker, len(samples)) + f_args = (samples[start_index: end_index], ) + args + (start_index, w) + t = multiprocessing.Process(target=func, args=f_args) + processes.append(t) + t.start() + for p in processes: + p.join() + + def execute_pipeline(self): + logging.info(f'- Start to execute pipeline {self.__class__.__name__}') + 
samples = self._sample_list() + ipc_vars = mp.Manager().list() + if self._enable_mp: + __class__.multiple_processor(self._mp_execute_pipeline, samples, workers=8, \ + args=(ipc_vars, )) + else: + self._mp_execute_pipeline(samples, ipc_vars) + self._merged_cross_processing(list(ipc_vars)) + logging.info(f'- Finished to execute pipeline {self.__class__.__name__}') \ No newline at end of file diff --git a/prepare_data/structured3d/utils/label_mapping.txt b/prepare_data/structured3d/utils/label_mapping.txt new file mode 100644 index 0000000..593508b --- /dev/null +++ b/prepare_data/structured3d/utils/label_mapping.txt @@ -0,0 +1,40 @@ +1 wall +2 floor +3 cabinet +4 bed +5 chair +6 sofa +7 table +8 door +9 window +10 bookshelf +11 picture +12 counter +13 blinds +14 desk +15 shelves +16 curtain +17 dresser +18 pillow +19 mirror +20 floor mat +21 clothes +22 ceiling +23 books +24 refrigerator +25 television +26 paper +27 towel +28 shower curtain +29 box +30 whiteboard +31 person +32 nightstand +33 toilet +34 sink +35 lamp +36 bathtub +37 bag +38 otherstructure +39 otherfurniture +40 otherprop diff --git a/prepare_data/structured3d/utils/nyu_40.py b/prepare_data/structured3d/utils/nyu_40.py new file mode 100644 index 0000000..48410ba --- /dev/null +++ b/prepare_data/structured3d/utils/nyu_40.py @@ -0,0 +1,94 @@ +""" +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +""" +import os +from typing import Dict + + +class NYU40: + """ + NYU40 label definition and color scheme + """ + LABEL_DICT: Dict[str, int] = dict() + INDEX_DICT: Dict[int, str] = dict() + + @staticmethod + def load_dict(i2l: bool): + """ + Load global label dictionary + """ + if not __class__.LABEL_DICT: + label_path = os.path.join(os.path.dirname(os.path.abspath(\ + __file__)), 'label_mapping.txt') + with open(label_path, encoding='utf-8') as l_fp: + for line in l_fp.readlines(): + items = line.rstrip('\n').split('\t') + __class__.LABEL_DICT[items[-1]] = int(items[0]) + __class__.INDEX_DICT[int(items[0])] = items[-1] + return __class__.INDEX_DICT if i2l else __class__.LABEL_DICT + + @staticmethod + def label_to_index(label: str): + """ + Mapping index to label + """ + return __class__.load_dict(False)[label] + + @staticmethod + def index_to_label(index: int): + """ + Mapping index to label + """ + return __class__.load_dict(True)[index] + + @staticmethod + def color_scheme(): + """ + Get the color coding scheme + Source from: https://github.com/ScanNet/ScanNet/blob/master/BenchmarkScripts/util.py + Copyright: ScanNet + """ + return [ + (0, 0, 0), + (174, 199, 232), # wall + (152, 223, 138), # floor + (31, 119, 180), # cabinet + (255, 187, 120), # bed + (188, 189, 34), # chair + (140, 86, 75), # sofa + (255, 152, 150), # table + (214, 39, 40), # door + (197, 176, 213), # window + (148, 103, 189), # bookshelf + (196, 156, 148), # picture + (23, 190, 207), # counter + (178, 76, 76), + (247, 182, 210), # desk + (66, 188, 102), + (219, 219, 141), # curtain + (140, 57, 197), + (202, 185, 52), + (51, 176, 203), + (200, 54, 131), + (92, 193, 61), + (78, 71, 183), + (172, 114, 82), + (255, 127, 14), # refrigerator + (91, 163, 138), + (153, 98, 156), + (140, 153, 101), + (158, 218, 229), # shower curtain + (100, 125, 154), + (178, 127, 135), + (120, 185, 128), + (146, 111, 194), + (44, 160, 44), # toilet + (112, 128, 144), # sink + (96, 207, 209), + (227, 119, 194), # bathtub + (213, 92, 176), + (94, 106, 211), + (82, 84, 163), # other furn + (100, 85, 144) + ] \ No newline at end of file diff --git 
a/prepare_data/structured3d/utils/s3dutilize.py b/prepare_data/structured3d/utils/s3dutilize.py new file mode 100644 index 0000000..169c870 --- /dev/null +++ b/prepare_data/structured3d/utils/s3dutilize.py @@ -0,0 +1,118 @@ +import numpy as np + +class Annotations: + """ + Annotation information + """ + def __init__(self) -> None: + self.gt_num = 0 + self.name = list() + self.location = list() + self.dimensions = list() + self.gt_boxes_upright_depth = list() + self.unaligned_location = list() + self.unaligned_dimensions = list() + self.unaligned_gt_boxes_upright_depth = list() + self.index = list() + self.classes = list() + self.axis_align_matrix = list() + + def dump(self): + """ + Dump information into dict + """ + anno_dict = dict() + anno_dict['gt_num'] = int(self.gt_num) + anno_dict['name'] = np.asarray(self.name) + anno_dict['location'] = np.asarray(self.location, dtype=np.float64) + anno_dict['dimensions'] = np.asarray(self.dimensions, dtype=np.float64) + anno_dict['gt_boxes_upright_depth'] = np.asarray(self.gt_boxes_upright_depth, \ + dtype=np.float64) + anno_dict['unaligned_location'] = np.asarray(self.unaligned_location, \ + dtype=np.float64) + anno_dict['unaligned_dimensions'] = np.asarray(self.unaligned_dimensions, \ + dtype=np.float64) + anno_dict['unaligned_gt_boxes_upright_depth'] = np.asarray( + self.unaligned_gt_boxes_upright_depth, dtype=np.float64) + anno_dict['index'] = np.asarray(self.index, dtype=np.int32) + anno_dict['class'] = np.asarray(self.classes, dtype=np.int64) + anno_dict['axis_align_matrix'] = np.asarray(self.axis_align_matrix, dtype=np.float64) + return anno_dict + + +class S3DUtilize(object): + """ + Structured3D utilize functions + """ + @staticmethod + def get_fov_normal(image_size, cam_focal, norm=True): + """ + Get the normal FoV directions + """ + u2x, v2y = [(np.arange(1, image_size[a_i] + 1) - image_size[a_i] / 2) / cam_focal[a_i]\ + for a_i in [0, 1]] + cam_m_u2x = np.tile([u2x], (image_size[1], 1)) + cam_m_v2y = np.tile(v2y[:, np.newaxis], (1, image_size[0])) + cam_m_depth = np.ones(image_size).T + fov_normal = np.stack((cam_m_depth, -1 * cam_m_v2y, cam_m_u2x), axis=-1) + if norm: + fov_normal = fov_normal / np.sqrt(np.sum(np.square(fov_normal), axis=-1, keepdims=True)) + return fov_normal + + @staticmethod + def cast_perspective_to_local_coord(depth_img: np.ndarray, fov_normal): + """ + Cast the perspective image into 3D coordinate system + """ + return depth_img * fov_normal + + @staticmethod + def cast_points_to_voxel(points, labels, room_size=(6.4, 3.2, 6.4), room_stride=0.2): + """ + Voxelize the points + """ + vol_resolution = (np.asarray(room_size) / room_stride).astype(np.int32) + vol_index = np.floor(points / room_stride).astype(np.int32) + in_vol = np.logical_and(np.all(vol_index < vol_resolution, axis=1), \ + np.all(vol_index >= 0, axis=1)) + v_x, v_y, v_z = [d_[..., 0] for d_ in np.split(vol_index[in_vol], 3, axis=-1)] + vol_label = labels[in_vol] + vol_data = np.zeros(vol_resolution, dtype=np.uint8) + vol_data[v_x, v_y, v_z] = vol_label + return vol_data + + @staticmethod + def get_rotation_matrix_from_tu(cam_front, cam_up): + """ + Get the rotation matrix from TU-coords + """ + cam_n = np.cross(cam_front, cam_up) + cam_m = np.stack((cam_front, cam_up, cam_n), axis=1).astype(np.float32) + return cam_m + + @staticmethod + def get_8points_bounding_box(basis, coeffs, centroid): + """ + Get the 8 corners from the bounding box parameters + """ + corners = np.zeros((8, 3)) + coeffs = np.abs(coeffs) + corners[0, :] = -basis[0, :] * coeffs[0] 
+ basis[1, :] * \ + coeffs[1] + basis[2, :] * coeffs[2] + corners[1, :] = basis[0, :] * coeffs[0] + basis[1, :] * \ + coeffs[1] + basis[2, :] * coeffs[2] + corners[2, :] = basis[0, :] * coeffs[0] + -basis[1, :] * \ + coeffs[1] + basis[2, :] * coeffs[2] + corners[3, :] = -basis[0, :] * coeffs[0] + -basis[1, :] * \ + coeffs[1] + basis[2, :] * coeffs[2] + + corners[4, :] = -basis[0, :] * coeffs[0] + basis[1, :] * \ + coeffs[1] + -basis[2, :] * coeffs[2] + corners[5, :] = basis[0, :] * coeffs[0] + basis[1, :] * \ + coeffs[1] + -basis[2, :] * coeffs[2] + corners[6, :] = basis[0, :] * coeffs[0] + -basis[1, :] * \ + coeffs[1] + -basis[2, :] * coeffs[2] + corners[7, :] = -basis[0, :] * coeffs[0] + -basis[1, :] * \ + coeffs[1] + -basis[2, :] * coeffs[2] + corners = corners + np.tile(centroid, (8, 1)) + return corners diff --git a/preprocess/build.py b/preprocess/build.py index 551d97f..fb3445e 100644 --- a/preprocess/build.py +++ b/preprocess/build.py @@ -3,5 +3,6 @@ PROCESSOR_REGISTRY = Registry("Processor") def build_processor(processor_name, data_config, modality_config, split): + print(f"Building processor: {processor_name}") processor = PROCESSOR_REGISTRY.get(processor_name)(data_config, modality_config, split) return processor \ No newline at end of file diff --git a/preprocess/feat1D/__init__.py b/preprocess/feat1D/__init__.py index 9a1b744..7db5e81 100644 --- a/preprocess/feat1D/__init__.py +++ b/preprocess/feat1D/__init__.py @@ -1,2 +1,5 @@ from .scannet import * -from .scan3r import * \ No newline at end of file +from .scan3r import * +from .arkit import * +from .multiscan import * +from .structured3d import * \ No newline at end of file diff --git a/preprocess/feat1D/arkit.py b/preprocess/feat1D/arkit.py new file mode 100644 index 0000000..efab03c --- /dev/null +++ b/preprocess/feat1D/arkit.py @@ -0,0 +1,107 @@ +import os.path as osp +import torch +import numpy as np +from tqdm import tqdm + +from common import load_utils +from util import labelmap, arkit +from util.arkit import ARKITSCENE_SCANNET +from preprocess.build import PROCESSOR_REGISTRY +from preprocess.feat1D.base import Base1DProcessor + +@PROCESSOR_REGISTRY.register() +class ARKitScenes1DProcessor(Base1DProcessor): + def __init__(self, config_data, config_1D, split) -> None: + super(ARKitScenes1DProcessor, self).__init__(config_data, config_1D, split) + self.data_dir = config_data.base_dir + + files_dir = osp.join(config_data.base_dir, 'files') + + self.scan_ids = [] + self.scan_ids = arkit.get_scan_ids(files_dir, split) + + self.out_dir = osp.join(config_data.process_dir, 'scans') + load_utils.ensure_dir(self.out_dir) + # Object Referrals + self.object_referrals = load_utils.load_json(osp.join(files_dir, 'sceneverse/ssg_ref_rel2_template.json')) + + # label map + self.label_map = arkit.read_label_map(files_dir, label_from = 'raw_category', label_to = 'nyu40id') + self.undefined = 0 + + + def load_objects_for_scan(self, scan_id): + """Load and parse the annotations JSON for the given scan ID.""" + objects_path = osp.join(self.data_dir, 'scans', scan_id, f"{scan_id}_3dod_annotation.json") + if not osp.exists(objects_path): + raise FileNotFoundError(f"Annotations file not found for scan ID: {scan_id}") + + annotations = load_utils.load_json(objects_path) + + objects = [] + for _i, label_info in enumerate(annotations["data"]): + obj_label = label_info["label"] + object_id = _i + 1 + scannet_class=ARKITSCENE_SCANNET[obj_label] + nyu40id=self.label_map[scannet_class] + objects.append({ + "objectId": object_id, + "global_id": 
nyu40id + }) + + + return objects + + + + def compute1DFeaturesEachScan(self, scan_id): + scene_out_dir = osp.join(self.out_dir, scan_id) + load_utils.ensure_dir(scene_out_dir) + + objectID_to_labelID_map = torch.load(osp.join(scene_out_dir, 'object_id_to_label_id_map.pt'))['obj_id_to_label_id_map'] + scan_objects = self.load_objects_for_scan(scan_id) + + object_referral_embeddings, scene_referral_embeddings = {}, None + if len(scan_objects) != 0: + object_referral_embeddings = self.computeObjectWise1DFeaturesEachScan(scan_id, scan_objects, objectID_to_labelID_map) + + scene_referrals = [referral for referral in self.object_referrals if referral['scan_id'] == scan_id] + + if len(scene_referrals) != 0: + if len(scene_referrals) > 10: + scene_referrals = np.random.choice(scene_referrals, size=10, replace=False) + + scene_referrals = [scene_referral['utterance'] for scene_referral in scene_referrals] + scene_referrals = ' '.join(scene_referrals) + scene_referral_embeddings = self.extractTextFeats([scene_referrals], return_text=True) + assert scene_referral_embeddings is not None + + data1D = {} + data1D['objects'] = {'referral_embeddings' : object_referral_embeddings} + data1D['scene'] = {'referral_embedding': scene_referral_embeddings} + + torch.save(data1D, osp.join(scene_out_dir, 'data1D.pt')) + + def computeObjectWise1DFeaturesEachScan(self, scan_id, scan_objects, objectID_to_labelID_map): + object_referral_embeddings = {} + + scan_referrals = [referral for referral in self.object_referrals if referral['scan_id'] == scan_id] + + for idx, scan_object in enumerate(scan_objects): + instance_id = int(scan_object['objectId']) + + if instance_id not in objectID_to_labelID_map.keys(): + continue + + # Object Referral + object_referral = [referral['utterance'] for referral in scan_referrals if int(referral['target_id']) == instance_id] + if len(object_referral) != 0: + object_referral_feats = self.extractTextFeats(object_referral) + if object_referral_feats is not None: + object_referral_feats = np.mean(object_referral_feats, axis = 0).reshape(1, -1) + assert object_referral_feats.shape == (1, self.embed_dim) + + object_referral_embeddings[instance_id] = {'referral' : object_referral, 'feats' : object_referral_feats} + + + return object_referral_embeddings diff --git a/preprocess/feat1D/multiscan.py b/preprocess/feat1D/multiscan.py new file mode 100644 index 0000000..58b9ff9 --- /dev/null +++ b/preprocess/feat1D/multiscan.py @@ -0,0 +1,123 @@ +import os.path as osp +import torch +import numpy as np +from tqdm import tqdm + +from common import load_utils +from util import labelmap, multiscan + +from preprocess.build import PROCESSOR_REGISTRY +from preprocess.feat1D.base import Base1DProcessor + +@PROCESSOR_REGISTRY.register() +class MultiScan1DProcessor(Base1DProcessor): + def __init__(self, config_data, config_1D, split) -> None: + super(MultiScan1DProcessor, self).__init__(config_data, config_1D, split) + self.data_dir = config_data.base_dir + + files_dir = osp.join(config_data.base_dir, 'files') + + self.scan_ids = [] + self.scan_ids = multiscan.get_scan_ids(files_dir, split) + + self.out_dir = osp.join(config_data.process_dir, 'scans') + load_utils.ensure_dir(self.out_dir) + # Object Referrals + self.object_referrals = load_utils.load_json(osp.join(files_dir, 'sceneverse/ssg_ref_rel2_template.json')) + + # label map + self.undefined = 0 + + def load_objects_for_scan(self, scan_id): + """Load and parse the annotations JSON for the given scan ID.""" + objects_path = osp.join(self.data_dir, 
'scenes', scan_id, f"{scan_id}.annotations.json") + if not osp.exists(objects_path): + raise FileNotFoundError(f"Annotations file not found for scan ID: {scan_id}") + + annotations = load_utils.load_json(objects_path) + objects = [] + + for obj in annotations["objects"]: + objects.append({ + "objectId": obj["objectId"], + "global_id": obj.get("label") + }) + + return objects + + def extractTextFeats(self, texts, return_text = False): + text_feats = [] + + for text in texts: + encoded_text = self.model.tokenizer(text, padding=True, add_special_tokens=True, return_tensors="pt").to(self.device) + if encoded_text['input_ids'].shape[1] > 512: + continue + + with torch.no_grad(): + encoded_text = self.model.text_encoder(encoded_text.input_ids, attention_mask = encoded_text.attention_mask, + return_dict = True, mode = 'text').last_hidden_state[:, 0].cpu().numpy().reshape(1, -1) + + text_feats.append({'text' : text, 'feat' : encoded_text}) + + if len(text_feats) == 0: + return None + + if return_text: + return text_feats + + text_feats = [text_feat['feat'] for text_feat in text_feats] + text_feats = np.concatenate(text_feats) + return text_feats + + + def compute1DFeaturesEachScan(self, scan_id): + scene_out_dir = osp.join(self.out_dir, scan_id) + load_utils.ensure_dir(scene_out_dir) + + objectID_to_labelID_map = torch.load(osp.join(scene_out_dir, 'object_id_to_label_id_map.pt'))['obj_id_to_label_id_map'] + scan_objects = self.load_objects_for_scan(scan_id) + + object_referral_embeddings, scene_referral_embeddings = {}, None + if len(scan_objects) != 0: + object_referral_embeddings = self.computeObjectWise1DFeaturesEachScan(scan_id, scan_objects, objectID_to_labelID_map) + + scene_referrals = [referral for referral in self.object_referrals if referral['scan_id'] == scan_id] + + if len(scene_referrals) != 0: + if len(scene_referrals) > 10: + scene_referrals = np.random.choice(scene_referrals, size=10, replace=False) + + scene_referrals = [scene_referral['utterance'] for scene_referral in scene_referrals] + scene_referrals = ' '.join(scene_referrals) + scene_referral_embeddings = self.extractTextFeats([scene_referrals], return_text=True) + assert scene_referral_embeddings is not None + + data1D = {} + data1D['objects'] = {'referral_embeddings' : object_referral_embeddings} + data1D['scene'] = {'referral_embedding': scene_referral_embeddings} + + torch.save(data1D, osp.join(scene_out_dir, 'data1D.pt')) + + def computeObjectWise1DFeaturesEachScan(self, scan_id, scan_objects, objectID_to_labelID_map): + object_referral_embeddings = {} + + scan_referrals = [referral for referral in self.object_referrals if referral['scan_id'] == scan_id] + + for idx, scan_object in enumerate(scan_objects): + instance_id = int(scan_object['objectId']) + + if instance_id not in objectID_to_labelID_map.keys(): + continue + + # Object Referral + object_referral = [referral['utterance'] for referral in scan_referrals if int(referral['target_id']) == instance_id] + if len(object_referral) != 0: + object_referral_feats = self.extractTextFeats(object_referral) + if object_referral_feats is not None: + object_referral_feats = np.mean(object_referral_feats, axis = 0).reshape(1, -1) + assert object_referral_feats.shape == (1, self.embed_dim) + + object_referral_embeddings[instance_id] = {'referral' : object_referral, 'feats' : object_referral_feats} + + + return object_referral_embeddings \ No newline at end of file diff --git a/preprocess/feat1D/structured3d.py b/preprocess/feat1D/structured3d.py new file mode 100644 index 
0000000..bca603c --- /dev/null +++ b/preprocess/feat1D/structured3d.py @@ -0,0 +1,135 @@ +import os.path as osp +import torch +import numpy as np +from tqdm import tqdm + +from common import load_utils +from util import structured3d +from util.structured3d import S3D_SCANNET +from preprocess.build import PROCESSOR_REGISTRY +from preprocess.feat1D.base import Base1DProcessor + + +@PROCESSOR_REGISTRY.register() +class Structured3D_1DProcessor(Base1DProcessor): + def __init__(self, config_data, config_1D, split) -> None: + super(Structured3D_1DProcessor, self).__init__(config_data, config_1D, split) + self.data_dir = config_data.base_dir + + files_dir = osp.join(config_data.base_dir, 'files') + + self.scan_ids = [] + self.scan_ids = structured3d.get_scan_ids(files_dir, split) + + self.out_dir = config_data.process_dir + load_utils.ensure_dir(self.out_dir) + # Object Referrals + self.object_referrals = load_utils.load_json(osp.join(files_dir, 'sceneverse/ssg_ref_rel2_template.json')) + + + def compute1DFeaturesEachScan(self, scan_id): + full_scan_id = scan_id + scan_id = scan_id.split('_') + room_id = scan_id[-1] + scan_id = scan_id[0]+'_'+scan_id[1] + obj2tgtid_map = load_utils.load_json(osp.join(self.data_dir,'scans',scan_id,'2D_rendering',room_id,'obj2tgid.json')) + + scene_out_dir = osp.join(self.out_dir, full_scan_id) + load_utils.ensure_dir(scene_out_dir) + + objectID_to_labelID_map = torch.load(osp.join(scene_out_dir, 'object_id_to_label_id_map.pt'))['obj_id_to_label_id_map'] + + object_referral_embeddings, scene_referral_embeddings = {}, None + if len(objectID_to_labelID_map.keys()) != 0: + object_referral_embeddings = self.computeObjectWise1DFeaturesEachScan(full_scan_id, objectID_to_labelID_map, obj2tgtid_map) + + scene_referrals = [referral for referral in self.object_referrals if referral['scan_id'] == full_scan_id] + + if len(scene_referrals) != 0: + if len(scene_referrals) > 10: + scene_referrals = np.random.choice(scene_referrals, size=10, replace=False) + + scene_referrals = [scene_referral['utterance'] for scene_referral in scene_referrals] + scene_referrals = ' '.join(scene_referrals) + scene_referral_embeddings = self.extractTextFeats([scene_referrals], return_text=True) + assert scene_referral_embeddings is not None + + data1D = {} + data1D['objects'] = {'referral_embeddings' : object_referral_embeddings} + data1D['scene'] = {'referral_embedding': scene_referral_embeddings} + + # torch.save(data1D, osp.join(scene_out_dir, 'data1D.pt')) + np.savez_compressed(osp.join(scene_out_dir, 'data1D.npz'), **data1D) + + def computeObjectWise1DFeaturesEachScan(self, scan_id, objectID_to_labelID_map, obj2tgtid): + object_referral_embeddings = {} + matched_objids=[] + scan_referrals = [referral for referral in self.object_referrals if referral['scan_id'] == scan_id] + + for instance_id in objectID_to_labelID_map.keys(): + if str(instance_id) not in obj2tgtid.keys(): + # print(f"Instance ID {instance_id} not found in obj2tgtid mapping for scan {scan_id}. 
Skipping...") + continue + mapped_obj_id = obj2tgtid[str(instance_id)] + nyu40id= objectID_to_labelID_map[instance_id] + if nyu40id==0: + continue + label = S3D_SCANNET[nyu40id] + object_referral = [] + for referral in scan_referrals: + if int(referral['target_id']) == int(mapped_obj_id): + if referral['instance_type'] == label: + # print(referral['utterance']) + matched_objids.append(instance_id) + # print(scan_id,label,referral['instance_type'],referral['target_id'],mapped_obj_id) + object_referral.append(referral['utterance']) + # else: + # print(scan_id,label,referral['instance_type'],referral['target_id'],mapped_obj_id) + + if len(object_referral) != 0: + # print(scan_id,instance_id,len(object_referral)) + object_referral_feats = self.extractTextFeats(object_referral) + if object_referral_feats is not None: + object_referral_feats = np.mean(object_referral_feats, axis = 0).reshape(1, -1) + assert object_referral_feats.shape == (1, self.embed_dim) + + object_referral_embeddings[instance_id] = {'referral' : object_referral, 'feats' : object_referral_feats} + + # finding unmatched referrals + unmatched_referrals = [] + for referral in scan_referrals: + mapped_obj_id = referral['target_id'] + if int(mapped_obj_id) not in [int(obj2tgtid[str(instance_id)]) for instance_id in objectID_to_labelID_map.keys() if str(instance_id) in obj2tgtid]: + unmatched_referrals.append(referral) + elif any(int(mapped_obj_id) == int(obj2tgtid[str(instance_id)]) and S3D_SCANNET[objectID_to_labelID_map[instance_id]] != referral['instance_type'] + for instance_id in objectID_to_labelID_map.keys() if str(instance_id) in obj2tgtid and objectID_to_labelID_map[instance_id] != 0): + unmatched_referrals.append(referral) + + label_to_instances = {} + for instance_id, nyu40id in objectID_to_labelID_map.items(): + if nyu40id == 0: + continue + label = S3D_SCANNET[nyu40id] + if label not in label_to_instances: + label_to_instances[label] = [] + label_to_instances[label].append(instance_id) + + for referral in unmatched_referrals: + instance_type = referral['instance_type'] + if instance_type in label_to_instances and len(label_to_instances[instance_type]) == 1: + instance_id = label_to_instances[instance_type][0] + if instance_id not in matched_objids: + # print(f"Matching unmatched referral to unique instance: {scan_id},{instance_id}, {instance_type}, {referral['target_id']}") + if instance_id not in object_referral_embeddings: + object_referral = [referral['utterance']] + else: + object_referral_embeddings[instance_id]['referral'].append(referral['utterance']) + + object_referral_feats = self.extractTextFeats(object_referral) + if object_referral_feats is not None: + object_referral_feats = np.mean(object_referral_feats, axis=0).reshape(1, -1) + object_referral_embeddings[instance_id] = {'referral': object_referral, 'feats': object_referral_feats} + + + # print(object_referral_embeddings.keys()) + return object_referral_embeddings diff --git a/preprocess/feat2D/__init__.py b/preprocess/feat2D/__init__.py index 9a1b744..7db5e81 100644 --- a/preprocess/feat2D/__init__.py +++ b/preprocess/feat2D/__init__.py @@ -1,2 +1,5 @@ from .scannet import * -from .scan3r import * \ No newline at end of file +from .scan3r import * +from .arkit import * +from .multiscan import * +from .structured3d import * \ No newline at end of file diff --git a/preprocess/feat2D/arkit.py b/preprocess/feat2D/arkit.py new file mode 100644 index 0000000..531b5b6 --- /dev/null +++ b/preprocess/feat2D/arkit.py @@ -0,0 +1,277 @@ +import os.path as osp 
+import open3d as o3d +import numpy as np +import torch +from tqdm import tqdm +import shutil +from PIL import Image +from scipy.spatial.transform import Rotation as R +from omegaconf import DictConfig +from typing import List, Dict, Tuple +import pandas as pd +from common import load_utils +from util import render, arkit, visualisation +from util import image as image_util + + +from preprocess.build import PROCESSOR_REGISTRY +from preprocess.feat2D.base import Base2DProcessor + +@PROCESSOR_REGISTRY.register() +class ARKitScenes2DProcessor(Base2DProcessor): + """ARKitScenes 2D (RGB) feature processor class.""" + def __init__(self, config_data: DictConfig, config_2D: DictConfig, split: str) -> None: + super(ARKitScenes2DProcessor, self).__init__(config_data, config_2D, split) + self.data_dir = config_data.base_dir + files_dir = osp.join(config_data.base_dir, 'files') + + self.scan_ids = [] + self.split = split + self.scan_ids = arkit.get_scan_ids(files_dir, self.split) + + self.out_dir = osp.join(config_data.process_dir, 'scans') + load_utils.ensure_dir(self.out_dir) + + self.orig_image_size = config_2D.image.orig_size + self.model_image_size = config_2D.image.model_size + + self.frame_skip = config_data.skip_frames + self.top_k = config_2D.image.top_k + self.num_levels = config_2D.image.num_levels + self.undefined = 0 + self.metadata = pd.read_csv(osp.join(files_dir,'metadata.csv')) + + self.frame_pose_data = {} + for scan_id in self.scan_ids: + pose_data = arkit.load_poses(osp.join(self.data_dir, 'scans', scan_id),scan_id, skip=self.frame_skip) + self.frame_pose_data[scan_id] = pose_data + + + def compute2DFeatures(self) -> None: + for scan_id in tqdm(self.scan_ids): + self.compute2DImagesAndSeg(scan_id) + self.compute2DFeaturesEachScan(scan_id) + # if self.split == 'val': + # self.computeAllImageFeaturesEachScan(scan_id) + + def compute2DImagesAndSeg(self, scan_id: str) -> None: + scene_folder = osp.join(self.data_dir, 'scans', scan_id) + if osp.exists(osp.join(scene_folder, 'gt-projection-seg.pt')): + return + + objects_path = osp.join(self.data_dir, 'scans', scan_id, f"{scan_id}_3dod_annotation.json") + if not osp.exists(objects_path): + raise FileNotFoundError(f"Annotations file not found for scan ID: {scan_id}") + + annotations = load_utils.load_json(objects_path) + ply_data = arkit.load_ply_data(osp.join(self.data_dir,'scans'), scan_id, annotations) + instance_ids = ply_data['objectId'] + + mesh_file = osp.join(self.data_dir, 'scans', scan_id, f'{scan_id}_3dod_mesh.ply') + mesh = o3d.io.read_triangle_mesh(mesh_file) + mesh_triangles = np.asarray(mesh.triangles) + colors = np.asarray(mesh.vertex_colors)*255.0 + colors = colors.round() + num_triangles = mesh_triangles.shape[0] + + scene = o3d.t.geometry.RaycastingScene() + scene.add_triangles(o3d.t.geometry.TriangleMesh.from_legacy(mesh)) + + # project 3D model + obj_id_imgs = {} + for frame_idx in self.frame_pose_data[scan_id].keys(): + camera_info = arkit.load_intrinsics(osp.join(self.data_dir,'scans'),scan_id,frame_idx) + intrinsics = camera_info['intrinsic_mat'] + img_width = int(camera_info['width']) + img_height = int(camera_info['height']) + img_pose = self.frame_pose_data[scan_id][frame_idx] + img_pose_inv = np.linalg.inv(img_pose) + + obj_id_map = render.project_mesh3DTo2D_with_objectseg( + scene, intrinsics, img_pose_inv, img_width, img_height, + mesh_triangles, num_triangles, instance_ids + ) + obj_id_imgs[frame_idx] = obj_id_map + + scene_folder = osp.join(self.data_dir, 'scans', scan_id) + if 
osp.exists(osp.join(scene_folder, 'gt-projection')): + shutil.rmtree(osp.join(scene_folder, 'gt-projection')) + + # save scene-level file for efficient loading + torch.save(obj_id_imgs, osp.join(scene_folder, 'gt-projection-seg.pt')) + + def compute2DFeaturesEachScan(self, scan_id: str) -> None: + scene_folder = osp.join(self.data_dir, 'scans', scan_id) + color_path = osp.join(scene_folder,f'{scan_id}_frames', 'lowres_wide') + + scene_out_dir = osp.join(self.out_dir, scan_id) + load_utils.ensure_dir(scene_out_dir) + + if osp.exists(osp.join(scene_out_dir, 'data2D.pt')): + return + + obj_id_to_label_id_map = torch.load(osp.join(scene_out_dir, 'object_id_to_label_id_map.pt'))['obj_id_to_label_id_map'] + + # Multi-view Image -- Object (Embeddings) + object_image_embeddings, object_image_votes_topK, frame_idxs = self.computeImageFeaturesAllObjectsEachScan(scene_folder, scene_out_dir, obj_id_to_label_id_map) + + # Multi-view Image -- Scene (Images + Embeddings) + frame_idxs = list(self.frame_pose_data[scan_id].keys()) + pose_data, scene_images_pt, scene_image_embeddings, sampled_frame_idxs = self.computeSelectedImageFeaturesEachScan(scan_id, color_path, frame_idxs) + + # Visualise + for frame_idx in self.frame_pose_data[scan_id].keys(): + camera_info = arkit.load_intrinsics(osp.join(self.data_dir,'scans'),scan_id,frame_idx) + intrinsic_mat = camera_info['intrinsic_mat'] + break + + + scene_mesh = o3d.io.read_triangle_mesh(osp.join(scene_folder, f'{scan_id}_3dod_mesh.ply')) + intrinsics = { 'f' : intrinsic_mat[0, 0], 'cx' : intrinsic_mat[0, 2], 'cy' : intrinsic_mat[1, 2], + 'w' : int(camera_info['width']), 'h' : int(camera_info['height'])} + + cams_visualised_on_mesh = visualisation.visualise_camera_on_mesh(scene_mesh, pose_data[sampled_frame_idxs], intrinsics, stride=1) + image_path = osp.join(scene_out_dir, 'sel_cams_on_mesh.png') + Image.fromarray((cams_visualised_on_mesh * 255).astype(np.uint8)).save(image_path) + + data2D = {} + data2D['objects'] = {'image_embeddings': object_image_embeddings, 'topK_images_votes' : object_image_votes_topK} + data2D['scene'] = {'scene_embeddings': scene_image_embeddings, 'images' : scene_images_pt, + 'frame_idxs' : frame_idxs, 'sampled_cam_idxs' : sampled_frame_idxs} + + # dummy floorplan + floorplan_dict = {'img' : None, 'embedding' : None} + data2D['scene']['floorplan'] = floorplan_dict + + torch.save(data2D, osp.join(scene_out_dir, 'data2D.pt')) + + def computeAllImageFeaturesEachScan(self, scan_id: str) -> None: + scene_folder = osp.join(self.data_dir, 'scans', scan_id) + color_path = osp.join(scene_folder,f'{scan_id}_frames', 'lowres_wide') + + + scene_out_dir = osp.join(self.out_dir, scan_id) + load_utils.ensure_dir(scene_out_dir) + + frame_idxs = list(self.frame_pose_data[scan_id].keys()) + + # Extract Scene Image Features + scene_images_pt = [] + scene_image_embeddings = [] + # sky_direction=self.metadata[self.metadata['video_id']==int(scan_id)]['sky_direction'].values[0] + + for frame_index in frame_idxs: + image = Image.open(osp.join(color_path, f'{scan_id}_{frame_index}.png')) + + image = image.resize((self.model_image_size[1], self.model_image_size[0]), Image.BICUBIC) + image_pt = self.model.base_tf(image) + + scene_image_embeddings.append(self.extractFeatures([image_pt], return_only_cls_mean= False)) + scene_images_pt.append(image_pt) + + scene_image_embeddings = np.concatenate(scene_image_embeddings) + data2D = {} + data2D['scene'] = {'scene_embeddings': scene_image_embeddings, 'images' : scene_images_pt, + 'frame_idxs' : frame_idxs} + 
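+ # stored separately from data2D.pt so features of every frame (not only the grid-sampled views) can be loaded when needed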
torch.save(data2D, osp.join(scene_out_dir, 'data2D_all_images.pt')) + + def computeSelectedImageFeaturesEachScan(self, scan_id: str, color_path: str, frame_idxs: List[int]) -> Tuple[np.ndarray, List[torch.tensor], np.ndarray, List[int]]: + # Sample Camera Indexes Based on Rotation Matrix From Grid + pose_data = [] + for frame_idx in frame_idxs: + pose = self.frame_pose_data[scan_id][frame_idx] + rot_quat = R.from_matrix(pose[:3, :3]).as_quat() + trans = pose[:3, 3] + pose_data.append([trans[0], trans[1], trans[2], rot_quat[0], rot_quat[1], rot_quat[2], rot_quat[3]]) + + pose_data = np.array(pose_data) + + sampled_frame_idxs = image_util.sample_camera_pos_on_grid(pose_data) + # sky_direction=self.metadata[self.metadata['video_id']==int(scan_id)]['sky_direction'].values[0] + + # Extract Scene Image Features + scene_images_pt = [] + for idx in sampled_frame_idxs: + frame_index = frame_idxs[idx] + + image = Image.open(osp.join(color_path, f'{scan_id}_{frame_index}.png')) + image = image.resize((self.model_image_size[1], self.model_image_size[0]), Image.BICUBIC) + image_pt = self.model.base_tf(image) + scene_images_pt.append(image_pt) + + scene_image_embeddings = self.extractFeatures(scene_images_pt, return_only_cls_mean= False) + + return pose_data, scene_images_pt, scene_image_embeddings, sampled_frame_idxs + # return pose_data, None, None, sampled_frame_idxs + + + def computeImageFeaturesAllObjectsEachScan(self, scene_folder: str, scene_out_dir: str, obj_id_to_label_id_map: dict) -> Tuple[Dict[int, Dict[int, np.ndarray]], Dict[int, List[int]], List[str]]: + object_anno_2D = torch.load(osp.join(scene_folder, 'gt-projection-seg.pt')) + object_image_votes = {} + scan_id=scene_folder.split('/')[-1] + # iterate over all frames + for frame_idx in object_anno_2D: + obj_2D_anno_frame = object_anno_2D[frame_idx] + # process 2D anno + obj_ids, counts = np.unique(obj_2D_anno_frame, return_counts=True) + for idx in range(len(obj_ids)): + obj_id = obj_ids[idx] + count = counts[idx] + if obj_id == self.undefined: + continue + + if obj_id not in object_image_votes: + object_image_votes[obj_id] = {} + if frame_idx not in object_image_votes[obj_id]: + object_image_votes[obj_id][frame_idx] = 0 + object_image_votes[obj_id][frame_idx] = count + + # select top K frames for each obj + object_image_votes_topK = {} + for obj_id in object_image_votes: + object_image_votes_topK[obj_id] = [] + obj_image_votes_f = object_image_votes[obj_id] + sorted_frame_idxs = sorted(obj_image_votes_f, key=obj_image_votes_f.get, reverse=True) + if len(sorted_frame_idxs) > self.top_k: + object_image_votes_topK[obj_id] = sorted_frame_idxs[:self.top_k] + else: + object_image_votes_topK[obj_id] = sorted_frame_idxs + + object_ids_in_image_votes = list(object_image_votes_topK.keys()) + for obj_id in object_ids_in_image_votes: + if obj_id not in list(obj_id_to_label_id_map.keys()): + del object_image_votes_topK[obj_id] + + assert len(list(obj_id_to_label_id_map.keys())) >= len(list(object_image_votes_topK.keys())), 'Mapped < Found' + + object_image_embeddings = {} + for object_id in object_image_votes_topK: + object_image_votes_topK_frames = object_image_votes_topK[object_id] + object_image_embeddings[object_id] = {} + + for frame_idx in object_image_votes_topK_frames: + image_path = osp.join(scene_folder, f'{scan_id}_frames', 'lowres_wide', f'{scan_id}_{frame_idx}.png') + color_img = Image.open(image_path) + object_image_embeddings[object_id][frame_idx] = self.computeImageFeaturesEachObject(scan_id, color_img, object_id, 
object_anno_2D[frame_idx]) + + return object_image_embeddings, object_image_votes_topK, object_anno_2D.keys() + + def computeImageFeaturesEachObject(self, scan_id, image: Image.Image, object_id: int, object_anno_2d: np.ndarray) -> np.ndarray: + object_anno_2d = object_anno_2d.transpose(1, 0) + object_anno_2d = np.flip(object_anno_2d, 1) + + object_mask = object_anno_2d == object_id + + images_crops = [] + for level in range(self.num_levels): + mask_tensor = torch.from_numpy(object_mask).float() + x1, y1, x2, y2 = image_util.mask2box_multi_level(mask_tensor, level) + cropped_img = image.crop((x1, y1, x2, y2)) + cropped_img = cropped_img.resize((self.model_image_size[1], self.model_image_size[1]), Image.BICUBIC) + img_pt = self.model.base_tf(cropped_img) + images_crops.append(img_pt) + + if(len(images_crops) > 0): + mean_feats = self.extractFeatures(images_crops, return_only_cls_mean = True) + return mean_feats \ No newline at end of file diff --git a/preprocess/feat2D/multiscan.py b/preprocess/feat2D/multiscan.py new file mode 100644 index 0000000..d95239e --- /dev/null +++ b/preprocess/feat2D/multiscan.py @@ -0,0 +1,240 @@ +import os.path as osp +import open3d as o3d +import numpy as np +import torch +from tqdm import tqdm +from PIL import Image +from scipy.spatial.transform import Rotation as R + +from common import load_utils +from util import render, multiscan, visualisation +from util import image as image_util + +from preprocess.build import PROCESSOR_REGISTRY +from preprocess.feat2D.base import Base2DProcessor + + +@PROCESSOR_REGISTRY.register() +class MultiScan2DProcessor(Base2DProcessor): + def __init__(self, config_data, config_2D, split) -> None: + super(MultiScan2DProcessor, self).__init__(config_data, config_2D, split) + self.data_dir = config_data.base_dir + files_dir = osp.join(config_data.base_dir, 'files') + self.split = split + + self.scan_ids = [] + self.scan_ids = multiscan.get_scan_ids(files_dir, split) + + self.out_dir = osp.join(config_data.process_dir, 'scans') + load_utils.ensure_dir(self.out_dir) + + self.orig_image_size = config_2D.image.orig_size + self.model_image_size = config_2D.image.model_size + + self.frame_skip = config_data.skip_frames + self.top_k = config_2D.image.top_k + self.num_levels = config_2D.image.num_levels + self.undefined = 0 + + + # get frame_indexes + self.frame_pose_data = {} + for scan_id in self.scan_ids: + scene_folder = osp.join(self.data_dir, 'scenes', scan_id) + frame_idxs = multiscan.load_frame_idxs(scene_folder, skip=self.frame_skip) + while(len(frame_idxs) > 500): + self.frame_skip += 2 + frame_idxs = multiscan.load_frame_idxs(scene_folder, skip=self.frame_skip) + # if len(frame_idxs) > 500: + # frame_idxs = multiscan.load_frame_idxs(scene_folder, skip=2) + # if len(frame_idxs) > 500: + # frame_idxs = multiscan.load_frame_idxs(scene_folder, skip=5) + # if len(frame_idxs) > 500: + # frame_idxs = multiscan.load_frame_idxs(scene_folder, skip=10) + # if len(frame_idxs) > 500: + # frame_idxs = multiscan.load_frame_idxs(scene_folder, skip=15) + # if len(frame_idxs) > 500: + # frame_idxs = multiscan.load_frame_idxs(scene_folder, skip=20) + + pose_data = multiscan.load_all_poses(scene_folder, frame_idxs) + self.frame_pose_data[scan_id] = pose_data + + + def compute2DFeatures(self): + for scan_id in tqdm(self.scan_ids): + self.compute2DImagesAndSeg(scan_id) + self.compute2DFeaturesEachScan(scan_id) + + def compute2DImagesAndSeg(self, scan_id): + scene_folder = osp.join(self.data_dir, 'scenes', scan_id) + mesh_file = osp.join(scene_folder, 
'{}.ply'.format(scan_id)) + + ply_data = multiscan.load_ply_data(osp.join(self.data_dir, 'scenes'), scan_id) + instance_ids = ply_data['objectId'] + + mesh = o3d.io.read_triangle_mesh(mesh_file) + mesh_triangles = np.asarray(mesh.triangles) + colors = np.asarray(mesh.vertex_colors)*255.0 + colors = colors.round() + num_triangles = mesh_triangles.shape[0] + + scene = o3d.t.geometry.RaycastingScene() + scene.add_triangles(o3d.t.geometry.TriangleMesh.from_legacy(mesh)) + + # project 3D model + obj_id_imgs = {} + for frame_idx in self.frame_pose_data[scan_id]: + camera_info = multiscan.load_intrinsics(scene_folder,scan_id,int(frame_idx)) + intrinsics = camera_info['intrinsic_mat'] + img_width = int(camera_info['width']) + img_height = int(camera_info['height']) + img_pose = self.frame_pose_data[scan_id][frame_idx] + img_pose_inv = np.linalg.inv(img_pose) + + obj_id_map = render.project_mesh3DTo2D_with_objectseg( + scene, intrinsics, img_pose_inv, img_width, img_height, + mesh_triangles, num_triangles, instance_ids + ) + obj_id_imgs[frame_idx] = obj_id_map + + scene_out_dir = osp.join(self.out_dir, scan_id) + load_utils.ensure_dir(scene_out_dir) + + # save scene-level file for efficient loading + torch.save(obj_id_imgs, osp.join(scene_out_dir, 'gt-projection-seg.pt')) + + def compute2DFeaturesEachScan(self, scan_id): + scene_folder = osp.join(self.data_dir, 'scenes', scan_id) + color_path = osp.join(scene_folder, 'sequence') + + scene_out_dir = osp.join(self.out_dir, scan_id) + load_utils.ensure_dir(scene_out_dir) + + obj_id_to_label_id_map = torch.load(osp.join(scene_out_dir, 'object_id_to_label_id_map.pt'))['obj_id_to_label_id_map'] + + # Multi-view Image -- Object (Embeddings) + object_image_embeddings, object_image_votes_topK, frame_idxs = self.computeImageFeaturesAllObjectsEachScan(scene_folder, scene_out_dir, obj_id_to_label_id_map) + + # Multi-view Image -- Scene (Images + Embeddings) + frame_idxs = list(self.frame_pose_data[scan_id].keys()) + pose_data, scene_images_pt, scene_image_embeddings, sampled_frame_idxs = self.computeSelectedImageFeaturesEachScan(scan_id, color_path, frame_idxs) + + # Visualise + camera_info = multiscan.load_meta_intrinsics(scene_folder,scan_id) + intrinsic_mat = camera_info['intrinsic_mat'] + + scene_mesh = o3d.io.read_triangle_mesh(osp.join(scene_folder,'{}.ply'.format(scan_id))) + intrinsics = { 'f' : intrinsic_mat[0, 0], 'cx' : intrinsic_mat[0, 2], 'cy' : intrinsic_mat[1, 2], + 'w' : int(camera_info['width']), 'h' : int(camera_info['height'])} + + cams_visualised_on_mesh = visualisation.visualise_camera_on_mesh(scene_mesh, pose_data[sampled_frame_idxs], intrinsics, stride=1) + image_path = osp.join(scene_out_dir, 'sel_cams_on_mesh.png') + Image.fromarray((cams_visualised_on_mesh * 255).astype(np.uint8)).save(image_path) + + data2D = {} + data2D['objects'] = {'image_embeddings': object_image_embeddings, 'topK_images_votes' : object_image_votes_topK} + data2D['scene'] = {'scene_embeddings': scene_image_embeddings, 'images' : scene_images_pt, + 'frame_idxs' : frame_idxs, 'sampled_cam_idxs' : sampled_frame_idxs} + + # dummy floorplan + floorplan_dict = {'img' : None, 'embedding' : None} + data2D['scene']['floorplan'] = floorplan_dict + + torch.save(data2D, osp.join(scene_out_dir, 'data2D.pt')) + + def computeSelectedImageFeaturesEachScan(self, scan_id, color_path, frame_idxs): + # Sample Camera Indexes Based on Rotation Matrix From Grid + pose_data = [] + for frame_idx in frame_idxs: + pose = self.frame_pose_data[scan_id][frame_idx] + rot_quat = 
R.from_matrix(pose[:3, :3]).as_quat() + trans = pose[:3, 3] + pose_data.append([trans[0], trans[1], trans[2], rot_quat[0], rot_quat[1], rot_quat[2], rot_quat[3]]) + + pose_data = np.array(pose_data) + + sampled_frame_idxs = image_util.sample_camera_pos_on_grid(pose_data) + + # Extract Scene Image Features + scene_images_pt = [] + for idx in sampled_frame_idxs: + frame_index = frame_idxs[idx] + + image = Image.open(osp.join(color_path, f'frame-{frame_index}.color.jpg')) + image = image.resize((self.model_image_size[1], self.model_image_size[0]), Image.BICUBIC) + image_pt = self.model.base_tf(image) + scene_images_pt.append(image_pt) + + scene_image_embeddings = self.extractFeatures(scene_images_pt, return_only_cls_mean= False) + + return pose_data, scene_images_pt, scene_image_embeddings, sampled_frame_idxs + + def computeImageFeaturesAllObjectsEachScan(self, scene_folder, scene_out_dir, obj_id_to_label_id_map): + object_anno_2D = torch.load(osp.join(scene_out_dir, 'gt-projection-seg.pt')) + object_image_votes = {} + + # iterate over all frames + for frame_idx in object_anno_2D: + obj_2D_anno_frame = object_anno_2D[frame_idx] + # process 2D anno + obj_ids, counts = np.unique(obj_2D_anno_frame, return_counts=True) + for idx in range(len(obj_ids)): + obj_id = obj_ids[idx] + count = counts[idx] + if obj_id == self.undefined: + continue + + if obj_id not in object_image_votes: + object_image_votes[obj_id] = {} + if frame_idx not in object_image_votes[obj_id]: + object_image_votes[obj_id][frame_idx] = 0 + object_image_votes[obj_id][frame_idx] = count + + # select top K frames for each obj + object_image_votes_topK = {} + for obj_id in object_image_votes: + object_image_votes_topK[obj_id] = [] + obj_image_votes_f = object_image_votes[obj_id] + sorted_frame_idxs = sorted(obj_image_votes_f, key=obj_image_votes_f.get, reverse=True) + if len(sorted_frame_idxs) > self.top_k: + object_image_votes_topK[obj_id] = sorted_frame_idxs[:self.top_k] + else: + object_image_votes_topK[obj_id] = sorted_frame_idxs + + object_ids_in_image_votes = list(object_image_votes_topK.keys()) + for obj_id in object_ids_in_image_votes: + if obj_id not in list(obj_id_to_label_id_map.keys()): + del object_image_votes_topK[obj_id] + + assert len(list(obj_id_to_label_id_map.keys())) >= len(list(object_image_votes_topK.keys())), 'Mapped < Found' + + object_image_embeddings = {} + for object_id in object_image_votes_topK: + object_image_votes_topK_frames = object_image_votes_topK[object_id] + object_image_embeddings[object_id] = {} + + for frame_idx in object_image_votes_topK_frames: + image_path = osp.join(scene_folder, 'sequence', f'frame-{frame_idx}.color.jpg') + color_img = Image.open(image_path) + object_image_embeddings[object_id][frame_idx] = self.computeImageFeaturesEachObject(color_img, object_id, object_anno_2D[frame_idx]) + + return object_image_embeddings, object_image_votes_topK, object_anno_2D.keys() + + def computeImageFeaturesEachObject(self, image, object_id, object_anno_2d): + # load image + object_mask = object_anno_2d == object_id + + images_crops = [] + for level in range(self.num_levels): + mask_tensor = torch.from_numpy(object_mask).float() + x1, y1, x2, y2 = image_util.mask2box_multi_level(mask_tensor, level) + cropped_img = image.crop((x1, y1, x2, y2)) + cropped_img = cropped_img.resize((self.model_image_size[1], self.model_image_size[1]), Image.BICUBIC) + img_pt = self.model.base_tf(cropped_img) + images_crops.append(img_pt) + # images_crops.append(cropped_img) + + + if(len(images_crops) > 0): + mean_feats 
= self.extractFeatures(images_crops, return_only_cls_mean = True) + return mean_feats \ No newline at end of file diff --git a/preprocess/feat2D/structured3d.py b/preprocess/feat2D/structured3d.py new file mode 100644 index 0000000..9893260 --- /dev/null +++ b/preprocess/feat2D/structured3d.py @@ -0,0 +1,264 @@ +import os.path as osp +import open3d as o3d +import numpy as np +import torch +from tqdm import tqdm +import shutil +from PIL import Image +from scipy.spatial.transform import Rotation as R +import cv2 +from common import load_utils +from util import render, structured3d, visualisation +from util import image as image_util +import os +from preprocess.build import PROCESSOR_REGISTRY +from preprocess.feat2D.base import Base2DProcessor + + +@PROCESSOR_REGISTRY.register() +class Structured3D_2DProcessor(Base2DProcessor): + def __init__(self, config_data, config_2D, split) -> None: + super(Structured3D_2DProcessor, self).__init__(config_data, config_2D, split) + self.data_dir = config_data.base_dir + files_dir = osp.join(config_data.base_dir, 'files') + self.split = split + + self.scan_ids = [] + self.scan_ids = structured3d.get_scan_ids(files_dir, split) + + self.out_dir = config_data.process_dir + load_utils.ensure_dir(self.out_dir) + + self.model_image_size = config_2D.image.model_size + + self.frame_skip = config_data.skip_frames + self.top_k = config_2D.image.top_k + self.num_levels = config_2D.image.num_levels + + + # get frame_indexes + self.frame_pose_data = {} + for scan_id in self.scan_ids: + full_scan_id = scan_id + scan_id = scan_id.split('_') + room_id = scan_id[-1] + scan_id = scan_id[0]+'_'+scan_id[1] + scene_folder = osp.join(self.data_dir, 'scans', scan_id, '2D_rendering', room_id, 'perspective', 'full') + frame_idxs = [f for f in os.listdir(scene_folder) if f[0] != '.' 
and f[0] != 'g'] + pose_data = structured3d.load_all_poses(scene_folder, frame_idxs) + self.frame_pose_data[full_scan_id] = pose_data + + + def compute2DFeatures(self): + for scan_id in tqdm(self.scan_ids): + self.compute2DImagesAndSeg(scan_id) + self.compute2DFeaturesEachScan(scan_id) + # if self.split == 'val': + # self.computeAllImageFeaturesEachScan(scan_id) + + def compute2DImagesAndSeg(self, scan_id): + full_scan_id = scan_id + scan_id = scan_id.split('_') + room_id = scan_id[-1] + scan_id = scan_id[0]+'_'+scan_id[1] + scene_folder = osp.join(self.data_dir, 'scans', scan_id,'2D_rendering', room_id, 'perspective', 'full') + + obj_id_imgs = {} + for frame_idx in self.frame_pose_data[full_scan_id]: + image_path=osp.join(scene_folder, frame_idx, 'instance.png') + obj_id_map = cv2.imread(image_path, cv2.IMREAD_UNCHANGED) + obj_id_imgs[frame_idx] = obj_id_map + + if osp.exists(osp.join(scene_folder, 'gt-projection')): + shutil.rmtree(osp.join(scene_folder, 'gt-projection')) + + # torch.save(obj_id_imgs, osp.join(scene_folder, 'gt-projection-seg.pt')) + np.savez_compressed(osp.join(scene_folder,'gt-projection-seg.npz'),**obj_id_imgs) + + def compute2DFeaturesEachScan(self, scan_id): + full_scan_id = scan_id + scan_id = scan_id.split('_') + room_id = scan_id[-1] + scan_id = scan_id[0]+'_'+scan_id[1] + scene_folder = osp.join(self.data_dir, 'scans', scan_id,'2D_rendering', room_id, 'perspective', 'full') + + scene_out_dir = osp.join(self.out_dir, full_scan_id) + load_utils.ensure_dir(scene_out_dir) + + # obj_id_to_label_id_map = torch.load(osp.join(scene_out_dir, 'object_id_to_label_id_map.pt'))['obj_id_to_label_id_map'] + obj_id_to_label_id_map = np.load(osp.join(scene_out_dir, 'object_id_to_label_id_map.npz'),allow_pickle=True)['obj_id_to_label_id_map'].item() + + floorplan_img_path = osp.join(self.data_dir,'scans', scan_id, 'floorplans', f'{room_id}.png') + floorplan_img = cv2.imread(floorplan_img_path) + floorplan_img = cv2.cvtColor(floorplan_img, cv2.COLOR_BGR2RGB) + floorplan_img = cv2.cvtColor(floorplan_img, cv2.COLOR_RGB2GRAY) + floorplan_img = cv2.cvtColor(floorplan_img, cv2.COLOR_GRAY2RGB) + floorplan_img = image_util.crop_image(floorplan_img, floorplan_img_path.replace('.png', '_cropped.png')) + floorplan_embeddings = None + + if floorplan_img is not None: + floorplan_img = floorplan_img.resize((self.model_image_size[1], self.model_image_size[0]), Image.BICUBIC) + floorplan_img_pt = self.model.base_tf(floorplan_img) + floorplan_embeddings = self.extractFeatures([floorplan_img_pt], return_only_cls_mean = False) + floorplan_dict = {'img' : floorplan_img, 'embedding' : floorplan_embeddings} + # print(floorplan_dict) + # Multi-view Image -- Object (Embeddings) + object_image_embeddings, object_image_votes_topK, frame_idxs = self.computeImageFeaturesAllObjectsEachScan(scene_folder, obj_id_to_label_id_map) + + # Multi-view Image -- Scene (Images + Embeddings) + frame_idxs = list(self.frame_pose_data[full_scan_id].keys()) + pose_data, scene_images_pt, scene_image_embeddings, sampled_frame_idxs = self.computeSelectedImageFeaturesEachScan(full_scan_id, scene_folder, frame_idxs) + + # Visualise + # camera_info = structured3d.load_intrinsics(scene_folder) + # intrinsic_mat = camera_info['intrinsic_mat'] + + # scene_mesh = o3d.io.read_triangle_mesh(osp.join(self.data_dir, 'scans', scan_id, '3D_rendering', room_id,'room_mesh.ply')) + # intrinsics = { 'f' : intrinsic_mat[0, 0], 'cx' : intrinsic_mat[0, 2], 'cy' : intrinsic_mat[1, 2], + # 'w' : int(camera_info['width']), 'h' : 
int(camera_info['height'])} + + # cams_visualised_on_mesh = visualisation.visualise_camera_on_mesh(scene_mesh, pose_data[sampled_frame_idxs], intrinsics, stride=1) + # image_path = osp.join(scene_out_dir, 'sel_cams_on_mesh.png') + # Image.fromarray((cams_visualised_on_mesh * 255).astype(np.uint8)).save(image_path) + + data2D = {} + data2D['objects'] = {'image_embeddings': object_image_embeddings, 'topK_images_votes' : object_image_votes_topK} + data2D['scene'] = {'scene_embeddings': scene_image_embeddings, 'images' : scene_images_pt, + 'frame_idxs' : frame_idxs, 'sampled_cam_idxs' : sampled_frame_idxs} + + data2D['scene']['floorplan'] = floorplan_dict + + # torch.save(data2D, osp.join(scene_out_dir, 'data2D.pt')) + np.savez_compressed(osp.join(scene_out_dir, 'data2D.npz'), **data2D) + + # def computeAllImageFeaturesEachScan(self, scan_id): + # scene_folder = osp.join(self.data_dir, 'scenes', scan_id) + # color_path = osp.join(scene_folder, 'sequence') + # scene_out_dir = osp.join(self.out_dir, scan_id) + # load_utils.ensure_dir(scene_out_dir) + + # frame_idxs = list(self.frame_pose_data[scan_id].keys()) + + # # Extract Scene Image Features + # scene_images_pt = [] + # scene_image_embeddings = [] + # for frame_index in frame_idxs: + # image = Image.open(osp.join(color_path, f'frame-{frame_index}.color.jpg')) + # image = image.resize((self.model_image_size[1], self.model_image_size[0]), Image.BICUBIC) + # image_pt = self.model.base_tf(image) + # # image_pt = torch.zeros(1, 1536) + + # scene_image_embeddings.append(self.extractFeatures([image_pt], return_only_cls_mean= False)) + # scene_images_pt.append(image_pt) + # scene_image_embeddings = np.concatenate(scene_image_embeddings) + # data2D = {} + # data2D['scene'] = {'scene_embeddings': scene_image_embeddings, 'images' : scene_images_pt, + # 'frame_idxs' : frame_idxs} + # torch.save(data2D, osp.join(scene_out_dir, 'data2D_all_images.pt')) + # np.savez_compressed(osp.join(scene_out_dir, 'data2D_all_images.npz'), **data2D) + + def computeSelectedImageFeaturesEachScan(self, scan_id, color_path, frame_idxs): + # Sample Camera Indexes Based on Rotation Matrix From Grid + pose_data = [] + for frame_idx in frame_idxs: + pose = self.frame_pose_data[scan_id][frame_idx] + rot_quat = R.from_matrix(pose[:3, :3]).as_quat() + trans = pose[:3, 3] + pose_data.append([trans[0], trans[1], trans[2], rot_quat[0], rot_quat[1], rot_quat[2], rot_quat[3]]) + + pose_data = np.array(pose_data) + + sampled_frame_idxs = image_util.sample_camera_pos_on_grid(pose_data) + # print(sampled_frame_idxs) + scene_images_pt = [] + for idx in sampled_frame_idxs: + frame_index = frame_idxs[idx] + + image = Image.open(osp.join(color_path, frame_index, f'rgb_rawlight.png')) + image = image.convert('RGB') + image = image.resize((self.model_image_size[1], self.model_image_size[0]), Image.BICUBIC) + image_pt = self.model.base_tf(image) + scene_images_pt.append(image_pt) + + + scene_image_embeddings = self.extractFeatures(scene_images_pt, return_only_cls_mean= False) + + return pose_data, scene_images_pt, scene_image_embeddings, sampled_frame_idxs + + def computeImageFeaturesAllObjectsEachScan(self, scene_folder, obj_id_to_label_id_map): + # object_anno_2D = torch.load(osp.join(scene_folder, 'gt-projection-seg.pt')) + object_anno_2D = np.load(osp.join(scene_folder, 'gt-projection-seg.npz'),allow_pickle=True) + object_image_votes = {} + + # iterate over all frames + for frame_idx in object_anno_2D: + obj_2D_anno_frame = object_anno_2D[frame_idx] + # process 2D anno + obj_ids, counts = 
np.unique(obj_2D_anno_frame, return_counts=True) + for idx in range(len(obj_ids)): + obj_id = obj_ids[idx] + count = counts[idx] + + if obj_id not in object_image_votes: + object_image_votes[obj_id] = {} + if frame_idx not in object_image_votes[obj_id]: + object_image_votes[obj_id][frame_idx] = 0 + object_image_votes[obj_id][frame_idx] = count + + # select top K frames for each obj + object_image_votes_topK = {} + for obj_id in object_image_votes: + object_image_votes_topK[obj_id] = [] + obj_image_votes_f = object_image_votes[obj_id] + sorted_frame_idxs = sorted(obj_image_votes_f, key=obj_image_votes_f.get, reverse=True) + if len(sorted_frame_idxs) > self.top_k: + object_image_votes_topK[obj_id] = sorted_frame_idxs[:self.top_k] + else: + object_image_votes_topK[obj_id] = sorted_frame_idxs + + object_ids_in_image_votes = list(object_image_votes_topK.keys()) + for obj_id in object_ids_in_image_votes: + if obj_id not in list(obj_id_to_label_id_map.keys()): + del object_image_votes_topK[obj_id] + + assert len(list(obj_id_to_label_id_map.keys())) >= len(list(object_image_votes_topK.keys())), 'Mapped < Found' + + object_image_embeddings = {} + for object_id in object_image_votes_topK: + object_image_votes_topK_frames = object_image_votes_topK[object_id] + object_image_embeddings[object_id] = {} + + for frame_idx in object_image_votes_topK_frames: + image_path = osp.join(scene_folder, frame_idx, 'rgb_rawlight.png') + # print(image_path) + color_img = Image.open(image_path) + # print(color_img.mode) + color_img = color_img.convert('RGB') + object_image_embeddings[object_id][frame_idx] = self.computeImageFeaturesEachObject(color_img, object_id, object_anno_2D[frame_idx]) + + return object_image_embeddings, object_image_votes_topK, object_anno_2D.keys() + + def computeImageFeaturesEachObject(self, image, object_id, object_anno_2d): + # print(np.array(image).shape) + object_anno_2d = object_anno_2d.transpose(1, 0) + object_anno_2d = np.flip(object_anno_2d, 1) + + # load image + object_mask = object_anno_2d == object_id + + images_crops = [] + for level in range(self.num_levels): + mask_tensor = torch.from_numpy(object_mask).float() + x1, y1, x2, y2 = image_util.mask2box_multi_level(mask_tensor, level) + cropped_img = image.crop((x1, y1, x2, y2)) + # print(np.array(cropped_img).shape) + cropped_img = cropped_img.resize((self.model_image_size[1], self.model_image_size[1]), Image.BICUBIC) + img_pt = self.model.base_tf(cropped_img) + images_crops.append(img_pt) + # images_crops.append(cropped_img) + + + if(len(images_crops) > 0): + mean_feats = self.extractFeatures(images_crops, return_only_cls_mean = True) + return mean_feats + diff --git a/preprocess/feat3D/__init__.py b/preprocess/feat3D/__init__.py index 9a1b744..7db5e81 100644 --- a/preprocess/feat3D/__init__.py +++ b/preprocess/feat3D/__init__.py @@ -1,2 +1,5 @@ from .scannet import * -from .scan3r import * \ No newline at end of file +from .scan3r import * +from .arkit import * +from .multiscan import * +from .structured3d import * \ No newline at end of file diff --git a/preprocess/feat3D/arkit.py b/preprocess/feat3D/arkit.py new file mode 100644 index 0000000..6172204 --- /dev/null +++ b/preprocess/feat3D/arkit.py @@ -0,0 +1,97 @@ +import os.path as osp +import open3d as o3d +import numpy as np +import torch +from tqdm import tqdm + +from common import load_utils +from util import point_cloud, arkit +from util.arkit import ARKITSCENE_SCANNET +from preprocess.build import PROCESSOR_REGISTRY +from preprocess.feat3D.base import Base3DProcessor + 
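+# ARKitScenes 3D preprocessing: map each annotated object label to its NYU40 id via ARKITSCENE_SCANNET, extract per-object point clouds from the instance-labelled mesh, compute their features, and save data3D.pt together with the object-id-to-label-id map.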
+@PROCESSOR_REGISTRY.register() +class ARKitScenes3DProcessor(Base3DProcessor): + def __init__(self, config_data, config_3D, split) -> None: + super(ARKitScenes3DProcessor, self).__init__(config_data, config_3D, split) + self.data_dir = config_data.base_dir + + files_dir = osp.join(config_data.base_dir, 'files') + + self.scan_ids = [] + self.scan_ids = arkit.get_scan_ids(files_dir, split) + + self.out_dir = osp.join(config_data.process_dir, 'scans') + load_utils.ensure_dir(self.out_dir) + self.label_map = arkit.read_label_map(files_dir, label_from = 'raw_category', label_to = 'nyu40id') + + self.undefined = 0 + + + def load_objects_for_scan(self, scan_id): + """Load and parse the annotations JSON for the given scan ID.""" + objects_path = osp.join(self.data_dir, 'scans', scan_id, f"{scan_id}_3dod_annotation.json") + if not osp.exists(objects_path): + raise FileNotFoundError(f"Annotations file not found for scan ID: {scan_id}") + + annotations = load_utils.load_json(objects_path) + + objects = [] + for _i, label_info in enumerate(annotations["data"]): + obj_label = label_info["label"] + object_id = _i + 1 + scannet_class=ARKITSCENE_SCANNET[obj_label] + nyu40id=self.label_map[scannet_class] + objects.append({ + "objectId": object_id, + "global_id": nyu40id + }) + + + return objects + + def compute3DFeaturesEachScan(self, scan_id): + objects_path = osp.join(self.data_dir, 'scans', scan_id, f"{scan_id}_3dod_annotation.json") + if not osp.exists(objects_path): + raise FileNotFoundError(f"Annotations file not found for scan ID: {scan_id}") + + annotations = load_utils.load_json(objects_path) + ply_data = arkit.load_ply_data(osp.join(self.data_dir, 'scans'), scan_id, annotations) + mesh_points = np.stack([ply_data['x'], ply_data['y'], ply_data['z']]).transpose((1, 0)) + + mesh = o3d.io.read_triangle_mesh(osp.join(self.data_dir, 'scans', scan_id,'{}_3dod_mesh.ply'.format(scan_id))) + mesh_colors = np.asarray(mesh.vertex_colors)*255.0 + mesh_colors = mesh_colors.round() + + + scan_objects=self.load_objects_for_scan(scan_id) + + object_pcl_embeddings, object_cad_embeddings = {}, {} + object_id_to_label_id = {} + for idx, scan_object in enumerate(scan_objects): + instance_id = int(scan_object['objectId']) + global_object_id = scan_object['global_id'] + + object_pcl = mesh_points[np.where(ply_data['objectId'] == instance_id)] + + if object_pcl.shape[0] <= self.config_3D.min_points_per_object: + continue + + assert instance_id not in object_id_to_label_id + object_id_to_label_id[instance_id] = global_object_id + + if object_pcl.shape[0] >= self.config_3D.min_points_per_object: + object_pcl_embeddings[instance_id] = self.normalizeObjectPCLAndExtractFeats(object_pcl) + + data3D = {} + data3D['objects'] = {'pcl_embeddings' : object_pcl_embeddings, 'cad_embeddings': object_cad_embeddings} + data3D['scene'] = {'pcl_coords': mesh_points[ply_data['objectId'] != self.undefined], 'pcl_feats': mesh_colors[ply_data['objectId'] != self.undefined], 'scene_label' : None} + + object_id_to_label_id_map = { 'obj_id_to_label_id_map' : object_id_to_label_id} + + assert len(list(object_id_to_label_id.keys())) >= len(list(object_pcl_embeddings.keys())), 'PC does not match for {}'.format(scan_id) + scene_out_dir = osp.join(self.out_dir, scan_id) + load_utils.ensure_dir(scene_out_dir) + + torch.save(data3D, osp.join(scene_out_dir, 'data3D.pt')) + torch.save(object_id_to_label_id_map, osp.join(scene_out_dir, 'object_id_to_label_id_map.pt')) \ No newline at end of file diff --git a/preprocess/feat3D/multiscan.py 
b/preprocess/feat3D/multiscan.py new file mode 100644 index 0000000..68ba025 --- /dev/null +++ b/preprocess/feat3D/multiscan.py @@ -0,0 +1,94 @@ +import os.path as osp +import open3d as o3d +import numpy as np +import torch +from tqdm import tqdm + +from common import load_utils +from util import point_cloud, multiscan +from util.multiscan import MULTISCAN_SCANNET +from preprocess.build import PROCESSOR_REGISTRY +from preprocess.feat3D.base import Base3DProcessor + +@PROCESSOR_REGISTRY.register() +class MultiScan3DProcessor(Base3DProcessor): + def __init__(self, config_data, config_3D, split) -> None: + super(MultiScan3DProcessor, self).__init__(config_data, config_3D, split) + self.data_dir = config_data.base_dir + + files_dir = osp.join(config_data.base_dir, 'files') + + self.scan_ids = [] + self.scan_ids = multiscan.get_scan_ids(files_dir, split) + + self.out_dir = osp.join(config_data.process_dir, 'scans') + load_utils.ensure_dir(self.out_dir) + self.label_map = multiscan.read_label_map(files_dir, label_from = 'raw_category', label_to = 'nyu40id') + + self.undefined = 0 + + + def load_objects_for_scan(self, scan_id): + """Load and parse the annotations JSON for the given scan ID.""" + objects_path = osp.join(self.data_dir, 'scenes', scan_id, f"{scan_id}.annotations.json") + if not osp.exists(objects_path): + raise FileNotFoundError(f"Annotations file not found for scan ID: {scan_id}") + + annotations = load_utils.load_json(objects_path) + + objects = [] + + for obj in annotations["objects"]: + object_id=obj["objectId"] + objectName=obj["label"].split('.')[0] + scannet_class=MULTISCAN_SCANNET[objectName] + nyu40id=self.label_map[scannet_class] + objects.append({ + "objectId": object_id, + "global_id": nyu40id + }) + + return objects + + + + def compute3DFeaturesEachScan(self, scan_id): + ply_data = multiscan.load_ply_data(osp.join(self.data_dir, 'scenes'), scan_id) + mesh_points = np.stack([ply_data['x'], ply_data['y'], ply_data['z']]).transpose((1, 0)) + + mesh = o3d.io.read_triangle_mesh(osp.join(self.data_dir, 'scenes', scan_id,'{}.ply'.format(scan_id))) + mesh_colors = np.asarray(mesh.vertex_colors)*255.0 + mesh_colors = mesh_colors.round() + + scan_objects=self.load_objects_for_scan(scan_id) + + object_pcl_embeddings, object_cad_embeddings = {}, {} + object_id_to_label_id = {} + for idx, scan_object in enumerate(scan_objects): + instance_id = int(scan_object['objectId']) + global_object_id = scan_object['global_id'] + + object_pcl = mesh_points[np.where(ply_data['objectId'] == instance_id)] + + if object_pcl.shape[0] <= self.config_3D.min_points_per_object: + continue + + assert instance_id not in object_id_to_label_id + object_id_to_label_id[instance_id] = global_object_id + + if object_pcl.shape[0] >= self.config_3D.min_points_per_object: + object_pcl_embeddings[instance_id] = self.normalizeObjectPCLAndExtractFeats(object_pcl) + + data3D = {} + data3D['objects'] = {'pcl_embeddings' : object_pcl_embeddings, 'cad_embeddings': object_cad_embeddings} + data3D['scene'] = {'pcl_coords': mesh_points[ply_data['objectId'] != self.undefined], 'pcl_feats': mesh_colors[ply_data['objectId'] != self.undefined], 'scene_label' : None} + + object_id_to_label_id_map = { 'obj_id_to_label_id_map' : object_id_to_label_id} + + assert len(list(object_id_to_label_id.keys())) >= len(list(object_pcl_embeddings.keys())), 'PC does not match for {}'.format(scan_id) + scene_out_dir = osp.join(self.out_dir, scan_id) + load_utils.ensure_dir(scene_out_dir) + + torch.save(data3D, osp.join(scene_out_dir, 
'data3D.pt')) + torch.save(object_id_to_label_id_map, osp.join(scene_out_dir, 'object_id_to_label_id_map.pt')) + \ No newline at end of file diff --git a/preprocess/feat3D/structured3d.py b/preprocess/feat3D/structured3d.py new file mode 100644 index 0000000..26fad6e --- /dev/null +++ b/preprocess/feat3D/structured3d.py @@ -0,0 +1,96 @@ +import os.path as osp +import open3d as o3d +import numpy as np +import torch +from tqdm import tqdm +import json +from common import load_utils +from util import structured3d +from preprocess.build import PROCESSOR_REGISTRY +from preprocess.feat3D.base import Base3DProcessor + +@PROCESSOR_REGISTRY.register() +class Structured3D_3DProcessor(Base3DProcessor): + def __init__(self, config_data, config_3D, split) -> None: + super(Structured3D_3DProcessor, self).__init__(config_data, config_3D, split) + self.data_dir = config_data.base_dir + + files_dir = osp.join(config_data.base_dir, 'files') + + self.scan_ids = [] + self.scan_ids = structured3d.get_scan_ids(files_dir, split) + + self.out_dir = config_data.process_dir + load_utils.ensure_dir(self.out_dir) + # self.undefined = 0 + + def compute3DFeaturesEachScan(self, scan_id): + scan_id = scan_id.split('_') + room_id = scan_id[-1] + scan_id = scan_id[0]+'_'+scan_id[1] + ply_data = structured3d.load_ply_data(osp.join(self.data_dir, 'scans'), scan_id, room_id) + mesh_points = np.stack([ply_data['x'], ply_data['y'], ply_data['z']]).transpose((1, 0)) + + # mesh = o3d.io.read_triangle_mesh(osp.join(self.data_dir, 'scans', scan_id, '3D_rendering', room_id, 'room_mesh.ply')) + # mesh_colors = np.asarray(mesh.vertex_colors)*255.0 + mesh_colors = np.stack([ply_data['red'], ply_data['green'], ply_data['blue']]).transpose((1, 0)) + # print(mesh_colors) + + # mesh_colors = mesh_colors.round() + object_ids = ply_data['objectId'] + unique_objects = np.unique(object_ids) + # print(unique_objects) + semantic_ids = ply_data['nyu40id'] + + scene_label = None + with open(osp.join(self.data_dir, 'scans', scan_id, 'annotation_3d.json')) as file: + annotations = json.load(file) + + for annos in annotations['semantics']: + if annos['ID'] == int(room_id): + scene_label = annos['type'].strip() + break + + + object_pcl_embeddings, object_cad_embeddings = {}, {} + object_id_to_label_id = {} + + for idx, instance_id in enumerate(unique_objects): + object_pcl=mesh_points[np.where(ply_data['objectId'] == instance_id)] + if object_pcl.shape[0] <= self.config_3D.min_points_per_object: + continue + + assert instance_id not in object_id_to_label_id + # first_point_idx = np.where(object_ids == instance_id)[0][0] + # nyu40id = semantic_ids[first_point_idx] + # object_id_to_label_id[instance_id] = nyu40id + # Find the most common nyu40id for this object + all_point_indices = np.where(object_ids == instance_id)[0] + nyu40ids_for_object = semantic_ids[all_point_indices] + unique_ids, counts = np.unique(nyu40ids_for_object, return_counts=True) + nyu40id = unique_ids[np.argmax(counts)] + object_id_to_label_id[instance_id] = nyu40id + # if instance_id==0: + # print(nyu40id) + + if object_pcl.shape[0] >= self.config_3D.min_points_per_object: + object_pcl_embeddings[instance_id] = self.normalizeObjectPCLAndExtractFeats(object_pcl) + else: + print("Object {} has less than {} points".format(instance_id, self.config_3D.min_points_per_object)) + + # print(scene_label) + data3D = {} + data3D['objects'] = {'pcl_embeddings' : object_pcl_embeddings, 'cad_embeddings': object_cad_embeddings} + data3D['scene'] = {'pcl_coords': mesh_points, 'pcl_feats': 
mesh_colors, 'scene_label' : scene_label} + # print(object_id_to_label_id) + object_id_to_label_id_map = { 'obj_id_to_label_id_map' : object_id_to_label_id} + + assert len(list(object_id_to_label_id.keys())) >= len(list(object_pcl_embeddings.keys())), 'PC does not match for {}'.format(scan_id) + scene_out_dir = osp.join(self.out_dir, scan_id+'_'+room_id) + load_utils.ensure_dir(scene_out_dir) + + # torch.save(data3D, osp.join(scene_out_dir, 'data3D.pt')) + # torch.save(object_id_to_label_id_map, osp.join(scene_out_dir, 'object_id_to_label_id_map.pt')) + np.savez_compressed(osp.join(scene_out_dir, 'data3D.npz'), **data3D) + np.savez_compressed(osp.join(scene_out_dir, 'object_id_to_label_id_map.npz'), **object_id_to_label_id_map) + diff --git a/preprocess/multimodal_preprocess.py b/preprocess/multimodal_preprocess.py index 822135d..9070228 100644 --- a/preprocess/multimodal_preprocess.py +++ b/preprocess/multimodal_preprocess.py @@ -8,9 +8,9 @@ import h5py from common import load_utils from common.constants import ModalityType -from util import scan3r, scannet +from util import scan3r, scannet, arkit, multiscan, structured3d from typing import Dict, Optional - +import os from preprocess.build import PROCESSOR_REGISTRY @PROCESSOR_REGISTRY.register() @@ -33,6 +33,12 @@ def __init__(self, config_data: DictConfig, modality_config: DictConfig, split: self.scan_ids = scannet.get_scan_ids(self.files_dir, self.split) elif self.dataset_name == 'Scan3R': self.scan_ids = scan3r.get_scan_ids(self.files_dir, self.split) + elif self.dataset_name == 'ARKitScenes': + self.scan_ids = arkit.get_scan_ids(self.files_dir, self.split) + elif self.dataset_name == 'MultiScan': + self.scan_ids = multiscan.get_scan_ids(self.files_dir, self.split) + elif self.dataset_name == 'Structured3D': + self.scan_ids = structured3d.get_scan_ids(self.files_dir, self.split) else: raise NotImplementedError @@ -71,18 +77,20 @@ def prepareObjectWiseDataEachScan(self, data2D: Optional[Dict] = None, data3D: Optional[Dict] = None) -> Dict: """Process object-wise data for a single scan combining features from all modalities.""" - object_id_to_label_id_map = torch.load(osp.join(out_dir, 'object_id_to_label_id_map.pt'))['obj_id_to_label_id_map'] + # object_id_to_label_id_map = torch.load(osp.join(out_dir, 'object_id_to_label_id_map.pt'))['obj_id_to_label_id_map'] + object_id_to_label_id_map = np.load(osp.join(out_dir, 'object_id_to_label_id_map.npz'),allow_pickle=True)['obj_id_to_label_id_map'].item() + map_object_ids = list(object_id_to_label_id_map.keys()) precomputed_feats, inputs = {}, {} if data3D is not None: - precomputed_feats[ModalityType.POINT] = data3D['objects']['pcl_embeddings'] - precomputed_feats[ModalityType.CAD] = data3D['objects']['cad_embeddings'] + precomputed_feats[ModalityType.POINT] = data3D['objects'].item()['pcl_embeddings'] + precomputed_feats[ModalityType.CAD] = data3D['objects'].item()['cad_embeddings'] if data2D is not None: - precomputed_feats[ModalityType.RGB] = data2D['objects']['image_embeddings'] + precomputed_feats[ModalityType.RGB] = data2D['objects'].item()['image_embeddings'] if data1D is not None: - precomputed_feats[ModalityType.REF] = data1D['objects']['referral_embeddings'] + precomputed_feats[ModalityType.REF] = data1D['objects'].item()['referral_embeddings'] object_ids = [] for modalityType in ModalityType.__dict__.values(): @@ -137,19 +145,27 @@ def prepareObjectWiseDataEachScan(self, 'object_id2idx' : object_id2idx, 'object_id_to_label_id_map' : object_id_to_label_id_map, 'object_ids' : 
object_ids, - 'topK_images_votes' : data2D['objects']['topK_images_votes'] + 'topK_images_votes' : data2D['objects'].item()['topK_images_votes'] } - - torch.save(objects_data_pt, osp.join(out_dir, 'objectsDataMultimodal.pt')) + pt_multimodal_path = osp.join(out_dir, 'objectsDataMultimodal.pt') + if osp.exists(pt_multimodal_path): + os.remove(pt_multimodal_path) + # torch.save(objects_data_pt, osp.join(out_dir, 'objectsDataMultimodal.pt')) + np.savez_compressed(osp.join(out_dir, 'objectsDataMultimodal.npz'), **objects_data_pt) return objects_data_pt def prepareDataEachScan(self, scan_id: str, hf_handler: h5py.File) -> None: """Process data for a single scan and store it in the HDF5 file.""" out_dir = osp.join(self.out_dir, scan_id) - data1D = torch.load(osp.join(out_dir, 'data1D.pt')) - data2D = torch.load(osp.join(out_dir, 'data2D.pt')) - data3D = torch.load(osp.join(out_dir, 'data3D.pt')) + # data1D = torch.load(osp.join(out_dir, 'data1D.pt')) + data1D = np.load(osp.join(out_dir, 'data1D.npz'),allow_pickle=True) + + # data2D = torch.load(osp.join(out_dir, 'data2D.pt')) + data2D = np.load(osp.join(out_dir, 'data2D.npz'),allow_pickle=True) + + # data3D = torch.load(osp.join(out_dir, 'data3D.pt')) + data3D = np.load(osp.join(out_dir, 'data3D.npz'),allow_pickle=True) objects_data_pt = self.prepareObjectWiseDataEachScan(out_dir, data1D, data2D, data3D) self.dumpEachObjectDataPerScan(scan_id, objects_data_pt, hf_handler) @@ -182,4 +198,4 @@ def dumpEachObjectDataPerScan(self, def run(self) -> None: """Execute the complete preprocessing pipeline.""" - self.prepareData() + self.prepareData() \ No newline at end of file diff --git a/retrieval/object_retrieval.py b/retrieval/object_retrieval.py index 54c144f..526e5a2 100644 --- a/retrieval/object_retrieval.py +++ b/retrieval/object_retrieval.py @@ -293,6 +293,6 @@ def run(self) -> None: # Object Retrieval Evaluation self.eval(output_dict) - self.logger.info('Scene Retrieval Evaluation (Instance Baseline)...') + self.logger.info('Scene Retrieval Evaluation (Instance CrossOver)...') # Scene Retrieval Evaluation self.scene_eval(output_dict) \ No newline at end of file diff --git a/scripts/preprocess/process_arkit.sh b/scripts/preprocess/process_arkit.sh new file mode 100644 index 0000000..3acdb4a --- /dev/null +++ b/scripts/preprocess/process_arkit.sh @@ -0,0 +1,9 @@ +export PYTHONWARNINGS="ignore" + +# Preprocessing Object Level + Scene Level + Unified Data +python3 preprocessor.py --config-path "$(pwd)/configs/preprocess/" --config-name process_3d.yaml data.sources=['ARKitScenes'] hydra.run.dir=. hydra.output_subdir=null +python3 preprocessor.py --config-path "$(pwd)/configs/preprocess/" --config-name process_2d.yaml data.sources=['ARKitScenes'] hydra.run.dir=. hydra.output_subdir=null +python3 preprocessor.py --config-path "$(pwd)/configs/preprocess/" --config-name process_1d.yaml data.sources=['ARKitScenes'] hydra.run.dir=. hydra.output_subdir=null + +# Multi-modal dumping +python3 preprocessor.py --config-path "$(pwd)/configs/preprocess/" --config-name process_multimodal.yaml data.sources=['ARKitScenes'] hydra.run.dir=. 
hydra.output_subdir=null diff --git a/scripts/preprocess/process_multiscan.sh b/scripts/preprocess/process_multiscan.sh new file mode 100644 index 0000000..a13a93c --- /dev/null +++ b/scripts/preprocess/process_multiscan.sh @@ -0,0 +1,9 @@ +export PYTHONWARNINGS="ignore" + +# Preprocessing Object Level + Scene Level + Unified Data +python preprocessor.py --config-path "$(pwd)/configs/preprocess" --config-name process_3d.yaml data.sources=['MultiScan'] hydra.run.dir=. hydra.output_subdir=null +python preprocessor.py --config-path "$(pwd)/configs/preprocess" --config-name process_2d.yaml data.sources=['MultiScan'] hydra.run.dir=. hydra.output_subdir=null +python preprocessor.py --config-path "$(pwd)/configs/preprocess" --config-name process_1d.yaml data.sources=['MultiScan'] hydra.run.dir=. hydra.output_subdir=null + +# Multi-modal dumping +python preprocessor.py --config-path "$(pwd)/configs/preprocess" --config-name process_multimodal.yaml data.sources=['MultiScan'] hydra.run.dir=. hydra.output_subdir=null \ No newline at end of file diff --git a/scripts/preprocess/process_scan3r.sh b/scripts/preprocess/process_scan3r.sh index 6d8a981..5ac2b71 100644 --- a/scripts/preprocess/process_scan3r.sh +++ b/scripts/preprocess/process_scan3r.sh @@ -1,9 +1,8 @@ export PYTHONWARNINGS="ignore" # Preprocessing Object Level + Scene Level + Unified Data -# python preprocessor.py --config-path "$(pwd)/configs/preprocess/" --config-name process_3d.yaml data.sources=['Scan3R'] hydra.run.dir=. hydra.output_subdir=null -# python preprocessor.py --config-path "$(pwd)/configs/preprocess/" --config-name process_2d.yaml data.sources=['Scan3R'] hydra.run.dir=. hydra.output_subdir=null -# python preprocessor.py --config-path "$(pwd)/configs/preprocess/" --config-name process_1d.yaml data.sources=['Scan3R'] hydra.run.dir=. hydra.output_subdir=null - +python preprocessor.py --config-path "$(pwd)/configs/preprocess/" --config-name process_3d.yaml data.sources=['Scan3R'] hydra.run.dir=. hydra.output_subdir=null +python preprocessor.py --config-path "$(pwd)/configs/preprocess/" --config-name process_2d.yaml data.sources=['Scan3R'] hydra.run.dir=. hydra.output_subdir=null +python preprocessor.py --config-path "$(pwd)/configs/preprocess/" --config-name process_1d.yaml data.sources=['Scan3R'] hydra.run.dir=. hydra.output_subdir=null # Multi-modal dumping python preprocessor.py --config-path "$(pwd)/configs/preprocess/" --config-name process_multimodal.yaml data.sources=['Scan3R'] hydra.run.dir=. hydra.output_subdir=null \ No newline at end of file diff --git a/scripts/preprocess/process_scannet.sh b/scripts/preprocess/process_scannet.sh index 68a2366..47aa945 100644 --- a/scripts/preprocess/process_scannet.sh +++ b/scripts/preprocess/process_scannet.sh @@ -1,9 +1,8 @@ export PYTHONWARNINGS="ignore" # Preprocessing Object Level + Scene Level + Unified Data -python preprocessor.py --config-path /home/sayan/Documents/code/multimodal-reality/CrossOver/configs/preprocess/ --config-name process_3d.yaml data.sources=['Scannet'] hydra.run.dir=. hydra.output_subdir=null -python preprocessor.py --config-path /home/sayan/Documents/code/multimodal-reality/CrossOver/configs/preprocess/ --config-name process_2d.yaml data.sources=['Scannet'] hydra.run.dir=. hydra.output_subdir=null -python preprocessor.py --config-path /home/sayan/Documents/code/multimodal-reality/CrossOver/configs/preprocess/ --config-name process_1d.yaml data.sources=['Scannet'] hydra.run.dir=. 
hydra.output_subdir=null
-
+python preprocessor.py --config-path "$(pwd)/configs/preprocess" --config-name process_3d.yaml data.sources=['Scannet'] hydra.run.dir=. hydra.output_subdir=null
+python preprocessor.py --config-path "$(pwd)/configs/preprocess" --config-name process_2d.yaml data.sources=['Scannet'] hydra.run.dir=. hydra.output_subdir=null
+python preprocessor.py --config-path "$(pwd)/configs/preprocess" --config-name process_1d.yaml data.sources=['Scannet'] hydra.run.dir=. hydra.output_subdir=null
 # Multi-modal dumping
-python preprocessor.py --config-path /home/sayan/Documents/code/multimodal-reality/CrossOver/configs/preprocess/ --config-name process_multimodal.yaml data.sources=['Scannet'] hydra.run.dir=. hydra.output_subdir=null
\ No newline at end of file
+python preprocessor.py --config-path "$(pwd)/configs/preprocess" --config-name process_multimodal.yaml data.sources=['Scannet'] hydra.run.dir=. hydra.output_subdir=null
\ No newline at end of file
diff --git a/scripts/preprocess/process_structured3d.sh b/scripts/preprocess/process_structured3d.sh
new file mode 100644
index 0000000..08c0605
--- /dev/null
+++ b/scripts/preprocess/process_structured3d.sh
@@ -0,0 +1,9 @@
+export PYTHONWARNINGS="ignore"
+
+# Preprocessing Object Level + Scene Level + Unified Data
+python preprocessor.py --config-path "$(pwd)/configs/preprocess" --config-name process_3d.yaml data.sources=['Structured3D'] hydra.run.dir=. hydra.output_subdir=null
+python preprocessor.py --config-path "$(pwd)/configs/preprocess" --config-name process_1d.yaml data.sources=['Structured3D'] hydra.run.dir=. hydra.output_subdir=null
+python preprocessor.py --config-path "$(pwd)/configs/preprocess" --config-name process_2d.yaml data.sources=['Structured3D'] hydra.run.dir=. hydra.output_subdir=null
+
+# Multi-modal dumping
+python preprocessor.py --config-path "$(pwd)/configs/preprocess" --config-name process_multimodal.yaml data.sources=['Structured3D'] hydra.run.dir=. hydra.output_subdir=null
diff --git a/single_inference/datasets/__init__.py b/single_inference/datasets/__init__.py
index 9a1b744..9c7b829 100644
--- a/single_inference/datasets/__init__.py
+++ b/single_inference/datasets/__init__.py
@@ -1,2 +1,4 @@
 from .scannet import *
-from .scan3r import *
\ No newline at end of file
+from .scan3r import *
+from .arkit import *
+from .multiscan import *
diff --git a/single_inference/datasets/arkit.py b/single_inference/datasets/arkit.py
new file mode 100644
index 0000000..6434bde
--- /dev/null
+++ b/single_inference/datasets/arkit.py
@@ -0,0 +1,126 @@
+import os.path as osp
+import numpy as np
+from torch.utils.data import Dataset
+import MinkowskiEngine as ME
+from PIL import Image
+from scipy.spatial.transform import Rotation as R
+from torchvision import transforms as tvf
+import torch
+import open3d as o3d
+import pandas as pd
+from common import load_utils
+from util import arkit
+from util import image as image_util
+
+class ARKitScenesInferDataset(Dataset):
+    def __init__(self, data_dir, voxel_size=0.02, frame_skip=5, image_size=[224, 224]) -> None:
+        self.voxel_size = voxel_size
+        self.frame_skip = frame_skip
+        self.image_size = image_size
+
+        self.scans_dir = osp.join(data_dir, 'scans')
+        self.files_dir = osp.join(data_dir, 'files')
+        self.referrals = load_utils.load_json(osp.join(self.files_dir, 'sceneverse/ssg_ref_rel2_template.json'))
+
+        self.scan_ids = []
+        for split in ['train', 'val']:
+            filepath = osp.join(self.files_dir, '{}_scans.txt'.format(split))
+            self.scan_ids.extend(np.genfromtxt(filepath, dtype = str))
+
+        self.base_tf = tvf.Compose([
+            tvf.ToTensor(),
+            tvf.Normalize(mean=[0.485, 0.456, 0.406],
+                          std=[0.229, 0.224, 0.225])
+        ])
+        self.metadata = pd.read_csv(osp.join(self.files_dir, 'metadata.csv'))
+
+    def extract_images(self, scan_id, color_path):
+        # load poses from the per-scan folder so the .traj file and the lowres_wide images resolve to the same directory
+        pose_data = arkit.load_poses(osp.join(self.scans_dir, scan_id), scan_id, skip=self.frame_skip)
+        frame_idxs = list(pose_data.keys())
+
+        pose_data_arr = []
+        for frame_idx in frame_idxs:
+            pose = pose_data[frame_idx]
+            rot_quat = R.from_matrix(pose[:3, :3]).as_quat()
+            trans = pose[:3, 3]
+            pose_data_arr.append([trans[0], trans[1], trans[2], rot_quat[0], rot_quat[1], rot_quat[2], rot_quat[3]])
+
+        pose_data_arr = np.array(pose_data_arr)
+        sampled_frame_idxs = image_util.sample_camera_pos_on_grid(pose_data_arr)
+        sky_direction = self.metadata[self.metadata['video_id'] == int(scan_id)]['sky_direction'].values[0]
+
+        image_data = None
+        for idx in sampled_frame_idxs:
+            frame_index = frame_idxs[idx]
+            image = Image.open(osp.join(color_path, f'{scan_id}_{frame_index}.png'))
+            if sky_direction == 'Left':
+                image = image.transpose(Image.ROTATE_270)
+            elif sky_direction == 'Right':
+                image = image.transpose(Image.ROTATE_90)
+            image = image.resize((self.image_size[1], self.image_size[0]), Image.BICUBIC)
+            image_pt = self.base_tf(image).unsqueeze(0)
+            image_data = image_pt if image_data is None else torch.cat((image_data, image_pt), dim=0)
+
+        return image_data.unsqueeze(0)
+
+    def __getitem__(self, index):
+        if isinstance(index, int):
+            scan_id = self.scan_ids[index]
+
+        if isinstance(index, str):
+            scan_id = index
+
+        scan_folder = osp.join(self.scans_dir, scan_id)
+        data_dict = {}
+        data_dict['masks'] = {}
+
+        # Point Cloud
+        mesh = o3d.io.read_triangle_mesh(osp.join(scan_folder, '{}_3dod_mesh.ply'.format(scan_id)))
+        points = np.asarray(mesh.vertices)
+        feats = np.asarray(mesh.vertex_colors)*255.0
+        feats = feats.round()
+
+        feats /= 255.
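+        # vertex colors from open3d are in [0, 1]; quantize to 8-bit and center to roughly [-0.5, 0.5] before voxelization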
+ feats -= 0.5 + + _, sel = ME.utils.sparse_quantize(points / self.voxel_size, return_index=True) + coords, feats = points[sel], feats[sel] + coords = np.floor(coords / self.voxel_size) + coords-= coords.min(0) + + coords, feats = ME.utils.sparse_collate([coords], [feats]) + data_dict['masks']['point'] = True + + # RGB + color_path = osp.join(scan_folder, f'{scan_id}_frames','lowres_wide') + image_data = self.extract_images(scan_id, color_path) + data_dict['masks']['rgb'] = True + + # Floorplan (dummy) + floorplan_img = np.zeros((self.image_size[0], self.image_size[1], 3), dtype=np.uint8) + floorplan_img = Image.fromarray(floorplan_img) + data_dict['masks']['floorplan'] = False + + floorplan_img = floorplan_img.resize((self.image_size[1], self.image_size[0]), Image.BICUBIC) + floorplan_data = self.base_tf(floorplan_img).unsqueeze(0) + + # Referral + referrals = [referral for referral in self.referrals if referral['scan_id'] == scan_id] + if len(referrals) != 0: + if len(referrals) > 10: + referrals = np.random.choice(referrals, size=10, replace=False) + referrals = [referral['utterance'] for referral in referrals] + referrals = [' '.join(referrals)] + data_dict['masks']['referral'] = True + else: + referrals = [''] + data_dict['masks']['referral'] = False + + data_dict['coordinates'] = coords + data_dict['features'] = feats + data_dict['rgb'] = image_data + data_dict['floorplan'] = floorplan_data + data_dict['referral'] = referrals + + return data_dict \ No newline at end of file diff --git a/single_inference/datasets/multiscan.py b/single_inference/datasets/multiscan.py new file mode 100644 index 0000000..06538e6 --- /dev/null +++ b/single_inference/datasets/multiscan.py @@ -0,0 +1,120 @@ +import os.path as osp +import numpy as np +from torch.utils.data import Dataset +import MinkowskiEngine as ME +from PIL import Image +from scipy.spatial.transform import Rotation as R +from torchvision import transforms as tvf +import torch +import open3d as o3d + +from common import load_utils +from util import multiscan +from util import image as image_util + +class MultiScanInferDataset(Dataset): + def __init__(self, data_dir, voxel_size=0.02, frame_skip=1, image_size=[224, 224]) -> None: + self.voxel_size = voxel_size + self.frame_skip = frame_skip + self.image_size = image_size + + self.scans_dir = osp.join(data_dir, 'scenes') + self.files_dir = osp.join(data_dir, 'files') + self.referrals = load_utils.load_json(osp.join(self.files_dir, 'sceneverse/ssg_ref_rel2_template.json')) + + self.scan_ids = [] + for split in ['train', 'val']: + filepath = osp.join(self.files_dir, '{}_scans.txt'.format(split)) + self.scan_ids.extend(np.genfromtxt(filepath, dtype = str)) + + self.base_tf = tvf.Compose([ + tvf.ToTensor(), + tvf.Normalize(mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]) + ]) + + def extract_images(self, scan_id, color_path): + frame_idxs = multiscan.load_frame_idxs(osp.join(self.scans_dir, scan_id)) + pose_data = multiscan.load_all_poses(osp.join(self.scans_dir, scan_id), frame_idxs) + frame_idxs = list(pose_data.keys()) + + pose_data_arr = [] + for frame_idx in frame_idxs: + pose = pose_data[frame_idx] + rot_quat = R.from_matrix(pose[:3, :3]).as_quat() + trans = pose[:3, 3] + pose_data_arr.append([trans[0], trans[1], trans[2], rot_quat[0], rot_quat[1], rot_quat[2], rot_quat[3]]) + + pose_data_arr = np.array(pose_data_arr) + sampled_frame_idxs = image_util.sample_camera_pos_on_grid(pose_data_arr) + + image_data = None + for idx in sampled_frame_idxs: + frame_index = frame_idxs[idx] + 
image = Image.open(osp.join(color_path, f'frame-{frame_index}.color.jpg')) + image = image.resize((self.image_size[1], self.image_size[0]), Image.BICUBIC) + image_pt = self.base_tf(image).unsqueeze(0) + image_data = image_pt if image_data is None else torch.cat((image_data, image_pt), dim=0) + + return image_data.unsqueeze(0) + + def __getitem__(self, index): + if isinstance(index, int): + scan_id = self.scan_ids[index] + + if isinstance(index, str): + scan_id = index + + scan_folder = osp.join(self.scans_dir, scan_id) + data_dict = {} + data_dict['masks'] = {} + + # Point Cloud + mesh = o3d.io.read_triangle_mesh(osp.join(scan_folder, f'{scan_id}.ply')) + points = np.asarray(mesh.vertices) + feats = np.asarray(mesh.vertex_colors)*255.0 + feats = feats.round() + + feats /= 255. + feats -= 0.5 + + _, sel = ME.utils.sparse_quantize(points / self.voxel_size, return_index=True) + coords, feats = points[sel], feats[sel] + coords = np.floor(coords / self.voxel_size) + coords-= coords.min(0) + + coords, feats = ME.utils.sparse_collate([coords], [feats]) + data_dict['masks']['point'] = True + + # RGB + color_path = osp.join(scan_folder, 'sequence') + image_data = self.extract_images(scan_id, color_path) + data_dict['masks']['rgb'] = True + + # Floorplan (dummy) + floorplan_img = np.zeros((self.image_size[0], self.image_size[1], 3), dtype=np.uint8) + floorplan_img = Image.fromarray(floorplan_img) + data_dict['masks']['floorplan'] = False + + floorplan_img = floorplan_img.resize((self.image_size[1], self.image_size[0]), Image.BICUBIC) + floorplan_data = self.base_tf(floorplan_img).unsqueeze(0) + + # Referral + referrals = [referral for referral in self.referrals if referral['scan_id'] == scan_id] + if len(referrals) != 0: + if len(referrals) > 10: + referrals = np.random.choice(referrals, size=10, replace=False) + referrals = [referral['utterance'] for referral in referrals] + referrals = [' '.join(referrals)] + data_dict['masks']['referral'] = True + else: + referrals = [''] + data_dict['masks']['referral'] = False + + data_dict['coordinates'] = coords + data_dict['features'] = feats + data_dict['rgb'] = image_data + data_dict['floorplan'] = floorplan_data + data_dict['referral'] = referrals + + return data_dict \ No newline at end of file diff --git a/single_inference/scene_inference.py b/single_inference/scene_inference.py index 9846dd5..1d13b5e 100644 --- a/single_inference/scene_inference.py +++ b/single_inference/scene_inference.py @@ -26,6 +26,10 @@ def run_inference(args, scan_id=None): dataset = datasets.ScannetInferDataset(args.data_dir, args.floorplan_dir) elif args.dataset == 'Scan3R': dataset = datasets.Scan3RInferDataset(args.data_dir) + elif args.dataset == 'ARKitScenes': + dataset = datasets.ARKitScenesInferDataset(args.data_dir) + elif args.dataset == 'MultiScan': + dataset = datasets.MultiScanInferDataset(args.data_dir) else: raise NotImplementedError('Dataset not implemented') diff --git a/trainer/grounding_trainer.py b/trainer/grounding_trainer.py index e0a40b2..7ee201c 100644 --- a/trainer/grounding_trainer.py +++ b/trainer/grounding_trainer.py @@ -1,5 +1,7 @@ +import os.path as osp from tqdm import tqdm from omegaconf import DictConfig +from safetensors.torch import load_file import torch from trainer.build import TRAINER_REGISTRY diff --git a/util/arkit.py b/util/arkit.py new file mode 100644 index 0000000..c4e7593 --- /dev/null +++ b/util/arkit.py @@ -0,0 +1,347 @@ +import os.path as osp +import numpy as np +from plyfile import PlyData +from glob import glob +import csv 
+import jsonlines +import json +import os +import trimesh +import pandas as pd +import cv2 + +ARKITSCENE_SCANNET= { +'bed': 'bed', +'cabinet': 'cabinet', +'refrigerator': 'refrigerator', +'table': 'table', +'chair': 'chair', +'sink': 'sink', +'stove': 'stove', +'oven': 'oven', +'washer': 'washing machine', +'shelf': 'shelf', +'tv_monitor': 'tv', +'bathtub': 'bathtub', +'toilet': 'toilet', +'sofa': 'sofa', +'stool': 'stool', +'fireplace': 'fireplace', +'build_in_cabinet': 'cabinet', +'dishwasher': 'dishwasher', +'stairs': 'stairs' +} + +def get_scan_ids(dirname, split): + filepath = osp.join(dirname, '{}_scans.txt'.format(split)) + scan_ids = np.genfromtxt(filepath, dtype = str) + return scan_ids + +def load_frame_idxs(scan_dir, skip=None): + frames_paths = glob(osp.join(scan_dir, f"{scan_dir.split('/')[-1]}_frames", 'lowres_wide', '*.png')) + frame_names = [osp.basename(frame_path) for frame_path in frames_paths] + frame_idxs = [frame_name.split('.png')[0].split("_")[1] for frame_name in frame_names] + frame_idxs.sort() + + if skip is not None: + frame_idxs = frame_idxs[::skip] + + return frame_idxs + +def TrajStringToMatrix(traj_str): + """ convert traj_str into translation and rotation matrices + Args: + traj_str: A space-delimited file where each line represents a camera position at a particular timestamp. + The file has seven columns: + * Column 1: timestamp + * Columns 2-4: rotation (axis-angle representation in radians) + * Columns 5-7: translation (usually in meters) + + Returns: + ts: translation matrix + Rt: rotation matrix + """ + # line=[float(x) for x in traj_str.split()] + # ts = line[0]; + # R = cv2.Rodrigues(np.array(line[1:4]))[0]; + # t = np.array(line[4:7]); + # Rt = np.concatenate((np.concatenate((R, t[:,np.newaxis]), axis=1), [[0.0,0.0,0.0,1.0]]), axis=0) + tokens = traj_str.split() + assert len(tokens) == 7 + ts = tokens[0] + # Rotation in angle axis + angle_axis = [float(tokens[1]), float(tokens[2]), float(tokens[3])] + r_w_to_p = convert_angle_axis_to_matrix3(np.asarray(angle_axis)) + # Translation + t_w_to_p = np.asarray([float(tokens[4]), float(tokens[5]), float(tokens[6])]) + extrinsics = np.eye(4, 4) + extrinsics[:3, :3] = r_w_to_p + extrinsics[:3, -1] = t_w_to_p + Rt = np.linalg.inv(extrinsics) + return Rt + +def convert_angle_axis_to_matrix3(angle_axis): + """Return a Matrix3 for the angle axis. + Arguments: + angle_axis {Point3} -- a rotation in angle axis form. 
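+    Returns:
+        matrix {np.ndarray} -- the corresponding 3x3 rotation matrix (computed with cv2.Rodrigues).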
+ """ + matrix, jacobian = cv2.Rodrigues(angle_axis) + return matrix + +def load_poses(scan_dir, scan_id, skip=None): + frame_poses = {} + frame_idxs = load_frame_idxs(scan_dir, skip=skip) + traj_file = osp.join(scan_dir, f'{scan_id}_frames', 'lowres_wide.traj') + with open(traj_file) as f: + traj = f.readlines() + for i,line in enumerate(traj): + ts=line.split(" ")[0] + rounded_ts = round(float(ts), 3) + formatted_ts = f"{rounded_ts:.3f}" + if formatted_ts not in frame_idxs: + if f"{rounded_ts - 0.001:.3f}" in frame_idxs: + frame_poses[f"{rounded_ts - 0.001:.3f}"] = TrajStringToMatrix(line) + elif f"{rounded_ts + 0.001:.3f}" in frame_idxs: + frame_poses[f"{rounded_ts + 0.001:.3f}"] = TrajStringToMatrix(line) + else: + print("no matching pose for frame", formatted_ts) + continue + # if f"{round(float(ts), 3):.3f}" not in frame_idxs: + # if f"{round(float(ts), 3)-0.001 :.3f}" in frame_idxs: + # frame_poses[f"{round(float(ts), 3)-0.001:.3f}"] = TrajStringToMatrix(line) + # elif f"{round(float(ts), 3)+0.001 :.3f}" in frame_idxs: + # frame_poses[f"{round(float(ts), 3)+0.001:.3f}"] = TrajStringToMatrix(line) + # else: + # continue + else: + frame_poses[f"{round(float(ts), 3):.3f}"] = TrajStringToMatrix(line) + # data = pd.read_csv(osp.join(scan_dir,f'{scan_id}_frames','lowres_wide.traj'), delim_whitespace=True, header=None) + # for frame_idx,(index, row) in zip(frame_idxs,data.iterrows()): + # if skip is not None and index % skip != 0: + # continue + # rotation_axis = row[1:4].values + # rotation_angle = np.linalg.norm(rotation_axis) + # if rotation_angle != 0: + # rotation_axis = rotation_axis / rotation_angle + # translation = row[4:7].values + # # Convert axis-angle to rotation matrix + # # rotation_matrix = axis_angle_to_rotation_matrix(rotation_axis, rotation_angle) + # rotation_matrix= + # # Construct the 4x4 homogeneous transformation matrix + # homogenous_matrix = np.eye(4) + # homogenous_matrix[:3, :3] = rotation_matrix + # homogenous_matrix[:3, 3] = translation + # frame_poses[frame_idx] = homogenous_matrix + + return frame_poses + +def axis_angle_to_rotation_matrix(axis, angle): + # Normalize the rotation axis + axis = axis / np.linalg.norm(axis) + x, y, z = axis + c = np.cos(angle) + s = np.sin(angle) + t = 1 - c + + # Compute the rotation matrix using the axis-angle formula + rotation_matrix = np.array([ + [t*x*x + c, t*x*y - s*z, t*x*z + s*y], + [t*x*y + s*z, t*y*y + c, t*y*z - s*x], + [t*x*z - s*y, t*y*z + s*x, t*z*z + c] + ]) + + return rotation_matrix + +def load_intrinsics(data_dir, scan_id, frame_id): + ''' + Load ARKit intrinsic information + ''' + pincam_path = osp.join(data_dir, scan_id, f'{scan_id}_frames', 'lowres_wide_intrinsics', f'{scan_id}_{frame_id}.pincam') + if not os.path.exists(pincam_path): + pincam_path = osp.join(data_dir, scan_id, f'{scan_id}_frames', 'lowres_wide_intrinsics', f'{scan_id}_{float(frame_id)-0.001:.3f}.pincam') + if not os.path.exists(pincam_path): + pincam_path = osp.join(data_dir, scan_id, f'{scan_id}_frames', 'lowres_wide_intrinsics', f'{scan_id}_{float(frame_id)+0.001:.3f}.pincam') + + + intrinsics = {} + + # Read the .pincam file + with open(pincam_path, "r") as f: + line = f.readline().strip() + + # Parse the intrinsic parameters + width, height, focal_length_x, focal_length_y, principal_point_x, principal_point_y = map(float, line.split()) + + # Store the width and height + intrinsics['width'] = width + intrinsics['height'] = height + + # Construct the intrinsic matrix + intrinsic_mat = np.array([ + [focal_length_x, 0, 
principal_point_x], + [0, focal_length_y, principal_point_y], + [0, 0, 1] + ]) + intrinsics['intrinsic_mat'] = intrinsic_mat + + return intrinsics + +def read_label_map(metadata_dir, label_from='raw_category', label_to='nyu40id'): + LABEL_MAP_FILE = osp.join(metadata_dir, 'scannetv2-labels.combined.tsv') + assert osp.exists(LABEL_MAP_FILE) + + raw_label_map = read_label_mapping(LABEL_MAP_FILE, label_from=label_from, label_to=label_to) + return raw_label_map + +def read_label_mapping(filename, label_from='raw_category', label_to='nyu40id'): + assert osp.isfile(filename) + mapping = dict() + with open(filename) as csvfile: + reader = csv.DictReader(csvfile, delimiter='\t') + for row in reader: + mapping[row[label_from]] = row[label_to] + + if represents_int(list(mapping.keys())[0]): + mapping = {int(k):v for k,v in mapping.items()} + + return mapping + +def represents_int(s): + ''' if string s represents an int. ''' + try: + int(s) + return True + except ValueError: + return False + +def load_ply_data(data_dir, scan_id, annotations): + filename_in = osp.join(data_dir, scan_id, f'{scan_id}_3dod_mesh.ply') + file = open(filename_in, 'rb') + plydata = PlyData.read(file) + file.close() + vertices = plydata['vertex']['x'], plydata['vertex']['y'], plydata['vertex']['z'] + vertices = np.vstack(vertices).T + + vertex_colors = plydata['vertex']['red'], plydata['vertex']['green'], plydata['vertex']['blue'] + vertex_colors = np.vstack(vertex_colors).T + + vertex_dtype = [('x', 'f4'), ('y', 'f4'), ('z', 'f4'), + ('red', 'u1'), ('green', 'u1'), ('blue', 'u1'), + ('objectId', 'h')] + vertices_structured = np.empty(vertices.shape[0], dtype=vertex_dtype) + + # Assign x, y, z, and color values to the structured array + vertices_structured['red'] = vertex_colors[:, 0] + vertices_structured['green'] = vertex_colors[:, 1] + vertices_structured['blue'] = vertex_colors[:, 2] + + vertex_instance = np.zeros(vertices.shape[0], dtype='h') # Use 'h' for signed 16-bit integer + bbox_list=[] + for _i, label_info in enumerate(annotations["data"]): + object_id = _i + 1 + rotation = np.array(label_info["segments"]["obbAligned"]["normalizedAxes"]).reshape(3, 3) + + transform = np.array(label_info["segments"]["obbAligned"]["centroid"]).reshape(-1, 3) + scale = np.array(label_info["segments"]["obbAligned"]["axesLengths"]).reshape(-1, 3) + + trns = np.eye(4) + trns[0:3, 3] = transform + trns[0:3, 0:3] = rotation.T + + box_trimesh_fmt = trimesh.creation.box(scale.reshape(3,), trns) + obj_containment = np.argwhere(box_trimesh_fmt.contains(vertices)) + + vertex_instance[obj_containment] = object_id + box3d = compute_box_3d(scale.reshape(3).tolist(), transform, rotation) + bbox_list.append(box3d) + + # if len(bbox_list) == 0: + # return + + vertices_structured['objectId'] = vertex_instance + + # align_angle = calc_align_matrix(bbox_list) + + # vertices_aligned = rotate_z_axis_by_degrees(np.array(vertices), align_angle) + + if np.max(vertex_colors) <= 1: + vertex_colors = vertex_colors * 255.0 + + # center_points = np.mean(vertices_aligned, axis=0) + # center_points[2] = np.min(vertices_aligned[:, 2]) + # vertices_aligned = vertices_aligned - center_points + + # vertices_structured['x'] = vertices_aligned[:, 0] + # vertices_structured['y'] = vertices_aligned[:, 1] + # vertices_structured['z'] = vertices_aligned[:, 2] + + vertices_structured['x'] = plydata['vertex']['x'] + vertices_structured['y'] = plydata['vertex']['y'] + vertices_structured['z'] = plydata['vertex']['z'] + + return vertices_structured + +def compute_box_3d(size, 
center, rotmat): + """Compute corners of a single box from rotation matrix + Args: + size: list of float [dx, dy, dz] + center: np.array [x, y, z] + rotmat: np.array (3, 3) + Returns: + corners: (8, 3) + """ + l, h, w = [i / 2 for i in size] + center = np.reshape(center, (-1, 3)) + center = center.reshape(3) + x_corners = [l, l, -l, -l, l, l, -l, -l] + y_corners = [h, -h, -h, h, h, -h, -h, h] + z_corners = [w, w, w, w, -w, -w, -w, -w] + corners_3d = np.dot( + np.transpose(rotmat), np.vstack([x_corners, y_corners, z_corners]) + ) + corners_3d[0, :] += center[0] + corners_3d[1, :] += center[1] + corners_3d[2, :] += center[2] + return np.transpose(corners_3d) + +def rotate_z_axis_by_degrees(pointcloud, theta, clockwise=True): + theta = np.deg2rad(theta) + cos_t = np.cos(theta) + sin_t = np.sin(theta) + rot_matrix = np.array([[cos_t, -sin_t, 0], + [sin_t, cos_t, 0], + [0, 0, 1]], pointcloud.dtype) + if not clockwise: + rot_matrix = rot_matrix.T + return pointcloud.dot(rot_matrix) + +def calc_align_matrix(bbox_list): + RANGE = [-45, 45] + NUM_BIN = 90 + angles = np.linspace(RANGE[0], RANGE[1], NUM_BIN) + angle_counts = {} + for _a in angles: + bucket = round(_a, 3) + for box in bbox_list: + box_r = rotate_z_axis_by_degrees(box, bucket) + bottom = box_r[4:] + if is_axis_aligned(bottom): + angle_counts[bucket] = angle_counts.get(bucket, 0) + 1 + if len(angle_counts) == 0: + RANGE = [-90, 90] + NUM_BIN = 180 + angles = np.linspace(RANGE[0], RANGE[1], NUM_BIN) + for _a in angles: + bucket = round(_a, 3) + for box in bbox_list: + box_r = rotate_z_axis_by_degrees(box, bucket) + bottom = box_r[4:] + if is_axis_aligned(bottom, thres=0.15): + angle_counts[bucket] = angle_counts.get(bucket, 0) + 1 + most_common_angle = max(angle_counts, key=angle_counts.get) + return most_common_angle + +def is_axis_aligned(rotated_box, thres=0.05): + x_diff = abs(rotated_box[0][0] - rotated_box[1][0]) + y_diff = abs(rotated_box[0][1] - rotated_box[3][1]) + return x_diff < thres and y_diff < thres diff --git a/util/multiscan.py b/util/multiscan.py new file mode 100644 index 0000000..8478a7d --- /dev/null +++ b/util/multiscan.py @@ -0,0 +1,672 @@ +import os.path as osp +import numpy as np +from plyfile import PlyData +from glob import glob +import csv +import jsonlines +import json +import os + +MULTISCAN_SCANNET = { + "wall": "wall", + "door": "door", + "slippers": "shoe", + "mop": "broom", + "rug": "rug", + "floor": "floor", + "basin": "sink", + "basin_stand": "sink", + "bucket": "bucket", + "shower": "shower", + "water_tank": "container", + "beam": "wood beam", + "pillar": "pillar", + "ceiling": "ceiling", + "sink": "sink", + "toilet": "toilet", + "cabinet": "cabinet", + "remove": "object", + "towel": "towel", + "pillow": "pillow", + "sofa": "sofa", + "footstool": "footstool", + "picture": "picture", + "window": "window", + "heater": "heater", + "mirror": "mirror", + "pipe": "pipe", + "scarf": "cloth", + "ceiling_light": "ceiling light", + "chair": "chair", + "table": "table", + "vent": "vent", + "bag": "bag", + "wall_cabinet": "cabinet", + "range": "stove", + "ricemaker": "rice cooker", + "pan": "cooking pan", + "coffee_machine": "coffee maker", + "rice_bag": "bag", + "light": "light", + "trashbin": "trash bin", + "kettle": "kettle", + "refrigerator": "refrigerator", + "microwave": "microwave", + "light_switch": "light switch", + "rice_cooker": "rice cooker", + "box": "box", + "shoe": "shoe", + "range_hood": "range hood", + "wok": "cooking pan", + "router": "object", + "paper_towel": "paper towel roll", + "stock_pot": 
"pot", + "cutting_board": "cutting board", + "wall_calendar": "calendar", + "baseboard": "object", + "coke_box": "box", + "printer": "printer", + "bowl": "bowl", + "backpack": "backpack", + "baseboard_heater": "heater", + "broom": "broom", + "dust_pan": "dustpan", + "trash_bin": "trash bin", + "rigid_duct": "vent", + "electric_range": "stove", + "spatula": "object", + "faucet": "faucet", + "bottle": "bottle", + "countertop": "counter", + "railing": "railing", + "suitcase": "suitcase", + "trash": "trash can", + "pot": "pot", + "kitchen_tool": "object", + "vegetable": "object", + "board": "board", + "washing_machine": "washing machine", + "jar": "jar", + "object": "object", + "notebook": "book", + "induction_cooker": "stove", + "instant_pot_lid": "cooking pot", + "oven": "oven", + "air_fryer": "object", + "lid": "pot", + "sponge": "sponge", + "blender": "object", + "spoon": "object", + "dishwasher": "dishwasher", + "detergent": "laundry detergent", + "watermelon": "bananas", + "yard_waste_bag": "garbage bag", + "container": "container", + "newspapers": "paper", + "rag": "cloth", + "ladder": "ladder", + "gate": "door", + "napkin_box": "tissue box", + "jacket": "jacket", + "windowsill": "windowsill", + "water_faucet": "faucet", + "steel_ball": "ball", + "rice_maker": "rice cooker", + "watter_bottle": "water bottle", + "plastic_bag": "bag", + "paper_bag": "paper bag", + "cuttting_board": "cutting board", + "trash_bin_lid": "trash bin", + "hair_dryer": "hair dryer", + "electric_socket": "power outlet", + "electric_panel": "electric panel", + "wash_stand": "sink", + "soap": "soap", + "curtain": "curtain", + "bathtub": "bathtub", + "smoke_detector": "smoke detector", + "roll_paper": "paper towel roll", + "chandelier": "chandelier", + "hand_sanitizer": "hand sanitzer dispenser", + "plate": "plate", + "sticker": "sticker", + "power_socket": "power outlet", + "stacked_cups": "stack of cups", + "stacked_chairs": "stack of chairs", + "air_vent": "vent", + "cornice": "cabinet", + "wine_cabinet": "kitchen cabinet", + "crock": "bowl", + "liquor_box": "cabinet", + "shampoo": "shampoo", + "shower_curtain": "shower curtain", + "wall_light": "wall lamp", + "sink_cabinet": "sink", + "toilet_roll": "toilet paper", + "shelf": "shelf", + "paper_bin": "recycling bin", + "toilet_brush": "toilet brush", + "shower_head": "shower head", + "tv": "tv", + "remote_control": "remote", + "tv_box": "tv stand", + "nightstand": "nightstand", + "bed": "bed", + "quilt": "blanket", + "telephone": "telephone", + "monitor": "monitor", + "desk": "desk", + "radiator_shell": "radiator", + "calendar": "calendar", + "clock": "clock", + "keyboard": "keyboard", + "speaker": "speaker", + "clothes": "clothes", + "door_frame": "doorframe", + "sliding_door": "sliding door", + "ceiling_lamp": "ceiling lamp", + "scale": "scale", + "power_strip": "power strip", + "switch": "light switch", + "basket": "basket", + "stool": "stool", + "shoes": "shoe", + "slipper": "slippers", + "bifold_door": "door", + "rangehood": "range hood", + "books": "books", + "toilet_paper": "toilet paper", + "mouse_pad": "mouse", + "ipad": "ipad", + "scissor": "knife block", + "radiator": "radiator", + "pc": "computer tower", + "bicycle": "bicycle", + "wardrobe": "wardrobe", + "mouse": "mouse", + "advertising_board": "poster", + "banner": "banner", + "ceiling_decoration": "ceiling light", + "whiteboard": "whiteboard", + "wall_storage_set": "shelf", + "traffic_cone": "traffic cone", + "wall_decoration": "decoration", + "papers": "papers", + "hat": "hat", + "velvet_hangers": 
"clothes hanger", + "circular_plate": "plate", + "cellphone": "telephone", + "pen": "keyboard piano", + "paper": "paper", + "lamp": "lamp", + "curtain_box": "curtains", + "woodcarving": "wood", + "scissors": "knife block", + "hand_dryer": "hand dryer", + "machine": "machine", + "vase": "vase", + "plant": "plant", + "power_socket_case": "power outlet", + "gloves": "clothes", + "dishcloth": "cloth", + "painting": "painting", + "shower_wall": "shower wall", + "showerhead": "shower head", + "tooth_mug": "cup", + "map": "map", + "knot_artwork": "decoration", + "fan": "fan", + "sphygmomanometer": "scale", + "electric_kettle": "kettle", + "bread_maker": "oven", + "knife_set": "knife block", + "soup_pot": "cooking pot", + "flatware_set": "cutting board", + "candle": "candle", + "lid_rack": "dish rack", + "flower": "flowerpot", + "can": "can", + "scoop": "bowl", + "laptop": "laptop", + "glass": "glass doors", + "wet_floor_sign": "wet floor sign", + "shower_enclosure": "shower doors", + "jewelry_box": "jewelry box", + "bath_brush": "hair brush", + "sofa_cushion": "couch cushions", + "tv_cabinet": "tv stand", + "wood_fence": "wood beam", + "floor_lamp": "lamp", + "computer_case": "computer tower", + "waste_container": "trash bin", + "roadblock": "barricade", + "trash_can_lids": "trash can", + "hand_sanitizer_stand": "soap dispenser", + "air_conditioner": "conditioner bottle", + "pattern": "rug", + "remote_controller": "remote", + "phone": "telephone", + "speakers": "speaker", + "table_divider": "divider", + "table_card": "card", + "paper_trimmer": "paper cutter", + "stapler": "stapler", + "cup": "cup", + "bathroom_heater": "heater", + "wall_shelf": "shelf", + "towel_rack": "towel", + "sink_drain": "sink", + "floor_drain": "floor", + "broom_head": "broom", + "door_curtain": "curtain", + "refill_pouch": "plastic container", + "bin": "bin", + "stall_wall": "bathroom stall door", + "wall_speaker": "speaker", + "laundry_basket": "laundry basket", + "tissue_box": "tissue box", + "document_holder": "file cabinet", + "yoga_mat": "yoga mat", + "gas_range": "stove", + "chopping_board": "cutting board", + "book_scanner": "scanner", + "payment_terminal": "vending machine", + "napkin_roll": "paper towel roll", + "faucet_switch": "faucet", + "glass_door": "glass doors", + "carpet": "carpet", + "shower_floor": "shower floor", + "toilet_plunger": "plunger", + "plug_panel": "power outlet", + "stand": "stand", + "potted_plant": "potted plant", + "poster": "poster", + "isolation_board": "divider", + "soap_holder": "soap dish", + "plug": "power outlet", + "brush": "hair brush", + "threshold": "doorframe", + "air_conditioner_controller": "remote", + "iron": "iron", + "ironing_board": "ironing board", + "safe": "suitcase", + "gas_cooker": "stove", + "pressure_cooker": "cooking pot", + "steamer_pot": "pot", + "soy_sauce_bottle": "bottle", + "dishwashing_liquid": "dishwashing soap bottle", + "water_ladle": "bowl", + "power_socket_set": "power strip", + "kitchen_tool_holder": "kitchen cabinet", + "case": "case", + "wall_paper": "wall", + "comb": "hair brush", + "paper_cutter": "paper cutter", + "pencil_sharpener": "pen holder", + "sealing_machine": "machine", + "poster_board": "poster", + "shredder": "shredder", + "footstep": "stair", + "planter": "plant", + "floor_light": "lamp", + "paper_cup": "cup", + "divider": "divider", + "hanger": "clothes hanger", + "glove": "clothing", + "blanket": "blanket", + "remote": "remote", + "cloth": "cloth", + "clutter": "object", + "extinguisher": "fire extinguisher", + "dryer": "clothes 
dryer", + "soap_bottle": "soap bottle", + "fabric_softener_box": "box", + "dryer_sheet_box": "box", + "detergent_bottle": "laundry detergent", + "toaster": "toaster", + "stacked_bowls": "bowl", + "pot_lid": "pot", + "electric_pressure_cooker": "rice cooker", + "bread": "food display", + "bagels": "object", + "oranges": "bananas", + "card_reader": "card", + "whiteboard_detergent": "soap dispenser", + "power_outlet": "power outlet", + "bouquet": "vase", + "water_bottle": "water bottle", + "wall_mounted_telephone": "telephone", + "fridge": "refrigerator", + "toy": "toy dinosaur", + "shoe_box": "box", + "hole_puncher": "paper cutter", + "landline_telephone": "telephone", + "base": "stand", + "handkerchief": "cloth", + "cornice_molding": "frame", + "bathtub_base": "bathtub", + "bidet": "toilet", + "pedestal_urinal": "urinal", + "pedestal_urinal_covered": "urinal", + "pit_toilet": "toilet", + "low_wall": "wall", + "rail": "rail", + "bottles": "bottles", + "floor_otherroom": "floor", + "wall_otherroom": "wall", + "canopy": "canopy", + "cable_manager": "cable", + "sneakers": "shoes", + "purse": "purse", + "cushion": "cushion", + "napkin": "towel", + "plush_toy": "stuffed animal", + "adjustable_desk": "desk", + "tableware": "plates", + "computer_desk": "desk", + "cat_kennel": "cat litter box", + "back_cushion": "pillow", + "ukulele_bag": "guitar case", + "litter_box": "trash can", + "storage_box": "storage bin", + "toy_doll": "doll", + "drawer_unit": "drawer", + "doll": "stuffed animal", + "laptop_bag": "messenger bag", + "clothing_rack": "clothing rack", + "bookshelf": "bookshelves", + "mask": "cloth", + "watch": "clock", + "book": "books", + "ashtray": "tray", + "car_key": "car", + "wallet": "purse", + "tea_pot": "tea kettle", + "wire": "cable", + "rake": "broom", + "dispenser": "soap dispenser", + "toilet_tank": "toilet", + "door_sill": "doorframe", + "cleanser": "soap", + "armrest": "armchair", + "short_wall": "wall", + "suspended_ceiling": "ceiling", + "fire_extinguisher_cabinet": "fire extinguisher", + "plastic_box": "plastic container", + "sanitation_station": "soap dispenser", + "plant_pot": "flowerpot", + "fireplace": "fireplace", + "computer_table": "desk", + "tissue_bag": "tissue box", + "wall_frame": "frame", + "map_board": "map", + "automated_teller_machine": "vending machine", + "ticket": "card", + "tablet": "ipad", + "blankets": "blanket", + "bags": "bag", + "flag": "flag", + "blackboard": "blackboard", + "bar_table": "bar", + "cardboard_holder": "cardboard", + "potted_planet": "potted plant", + "tray": "tray", + "utensil_holder": "kitchen counter", + "bird_ceramics": "statue", + "shirt": "shirt", + "clothes_rail": "clothes hanger", + "power_strips": "power strip", + "card_board": "board", + "pile_of_blankets": "blanket", + "bed_net": "bed", + "umbrella": "umbrella", + "dragon_fruit": "bananas", + "tissue": "tissue box", + "electrical_panel": "electric panel", + "panel": "door", + "tube": "tube", + "pile_of_cloth": "cloth", + "surface": "table", + "chair_cushion": "cushion", + "guide": "book", + "parapet": "railing", + "camera": "camera", + "light_base": "lamp base", + "first_aid": "object", + "bench": "bench", + "potted_plants": "potted plant", + "pot_cover": "pot", + "yoga_mat_roll": "yoga mat", + "panda_doll": "stuffed animal", + "window_trim": "window", + "shoe_cabinet": "shoe rack", + "toilet_paper_holder": "toilet paper dispenser", + "shower_faucet": "shower faucet handle", + "bath_sponge": "sponge", + "ornament": "decoration", + "planter_box": "plant", + "cooktop": "stove", + 
"knife_block": "knife block", + "step_stool": "step stool", + "touchpad": "keyboard", + "light_box": "light", + "sound": "speaker", + "exhaust_fan_vent": "vent", + "paperbin": "recycling bin", + "mop_bucket": "bucket", + "sneaker": "shoes", + "objects": "object", + "cd_tray": "cd case", + "wall_board": "board", + "room_divider": "divider", + "paiting": "painting", + "cabinet_otherroom": "cabinet", + "electric_switch": "light switch", + "sign": "exit sign", + "hand_soap": "soap bottle", + "window_blinds": "blinds" +} + +def read_label_map(metadata_dir, label_from='raw_category', label_to='nyu40id'): + LABEL_MAP_FILE = osp.join(metadata_dir, 'scannetv2-labels.combined.tsv') + assert osp.exists(LABEL_MAP_FILE) + + raw_label_map = read_label_mapping(LABEL_MAP_FILE, label_from=label_from, label_to=label_to) + return raw_label_map + +def read_label_mapping(filename, label_from='raw_category', label_to='nyu40id'): + assert osp.isfile(filename) + mapping = dict() + with open(filename) as csvfile: + reader = csv.DictReader(csvfile, delimiter='\t') + for row in reader: + mapping[row[label_from]] = row[label_to] + + if represents_int(list(mapping.keys())[0]): + mapping = {int(k):v for k,v in mapping.items()} + + return mapping + +def get_scan_ids(dirname, split): + filepath = osp.join(dirname, '{}_scans.txt'.format(split)) + scan_ids = np.genfromtxt(filepath, dtype = str) + return scan_ids + +def load_ply_data(data_dir, scan_id): + """ + Load PLY data and propagate object IDs from faces to vertices. + """ + filename_in = osp.join(data_dir, scan_id, '{}.ply'.format(scan_id)) + + if not osp.exists(filename_in): + raise FileNotFoundError(f"PLY file not found: {filename_in}") + + with open(filename_in, 'rb') as file: + ply_data = PlyData.read(file) + + # Extract vertex properties + x = np.array(ply_data['vertex']['x']) + y = np.array(ply_data['vertex']['y']) + z = np.array(ply_data['vertex']['z']) + red = np.array(ply_data['vertex']['red']) + green = np.array(ply_data['vertex']['green']) + blue = np.array(ply_data['vertex']['blue']) + + # Extract normals if available + if 'nx' in ply_data['vertex'] and 'ny' in ply_data['vertex'] and 'nz' in ply_data['vertex']: + nx = np.array(ply_data['vertex']['nx']) + ny = np.array(ply_data['vertex']['ny']) + nz = np.array(ply_data['vertex']['nz']) + normals = np.stack([nx, ny, nz], axis=-1) + else: + normals = None + + # Initialize object IDs for vertices with a default undefined value + vertex_object_ids = np.full(len(x), -1, dtype='int32') # Default: -1 (undefined) + + # Extract face data + faces = ply_data['face'].data + face_vertex_indices = [face['vertex_indices'] for face in faces] + face_object_ids = [face['objectId'] for face in faces] + + # Propagate object IDs to vertices + for face_indices, obj_id in zip(face_vertex_indices, face_object_ids): + vertex_object_ids[face_indices] = obj_id # Assign object ID to all vertices in the face + + vertex_dtype = [ + ('x', 'f4'), ('y', 'f4'), ('z', 'f4'), # Coordinates + ('red', 'u1'), ('green', 'u1'), ('blue', 'u1'), # Colors + ('objectId', 'i4') # Propagated Object ID + ] + + if normals is not None: + vertex_dtype.extend([('nx', 'f4'), ('ny', 'f4'), ('nz', 'f4')]) # Normals + + vertices = np.empty(len(x), dtype=vertex_dtype) + + vertices['x'] = x.astype('f4') + vertices['y'] = y.astype('f4') + vertices['z'] = z.astype('f4') + vertices['red'] = red.astype('u1') + vertices['green'] = green.astype('u1') + vertices['blue'] = blue.astype('u1') + vertices['objectId'] = vertex_object_ids.astype('i4') + + if normals is not 
None: + vertices['nx'] = normals[:, 0].astype('f4') + vertices['ny'] = normals[:, 1].astype('f4') + vertices['nz'] = normals[:, 2].astype('f4') + + return vertices + +def load_meta_intrinsics(scan_dir, scene_id, stream_type="color_camera"): + ''' + Load MultiScan intrinsic information + ''' + meta_intrinsics_path = osp.join(scan_dir, f'{scene_id}.json') + intrinsics = {} + + with open(meta_intrinsics_path,"r") as f: + json_data=json.load(f) + + for stream in json_data.get("streams", []): + if stream.get("type") == stream_type: + intrinsic_mat = np.array(stream.get("intrinsics")) + intrinsic_mat = np.reshape(intrinsic_mat, (3, 3), order='F') + intrinsics['intrinsic_mat']=intrinsic_mat + resolution = stream.get("resolution") + width, height = resolution[1], resolution[0] # [width, height] + intrinsics['width']=float(width) + intrinsics['height']=float(height) + + return intrinsics + +def load_intrinsics(scan_dir, scene_id, frame_id, stream_type="color_camera"): + ''' + Load MultiScan intrinsic information + ''' + intrinsics_path = osp.join(scan_dir, 'poses.jsonl') + resoultion_path = osp.join(scan_dir, f'{scene_id}.json') + intrinsics = {} + + with open(resoultion_path,"r") as f: + json_data=json.load(f) + + for stream in json_data.get("streams", []): + if stream.get("type") == stream_type: + resolution = stream.get("resolution", None) + if resolution: + width, height = resolution[1], resolution[0] # [width, height] + intrinsics['width']=float(width) + intrinsics['height']=float(height) + + + with jsonlines.open(intrinsics_path) as reader: + for entry in reader: + if entry.get("frame_id") == frame_id: + intrinsic_mat = np.asarray(entry.get('intrinsics')) + intrinsic_mat = np.reshape(intrinsic_mat, (3, 3), order='F') + intrinsics['intrinsic_mat']=intrinsic_mat + break + + return intrinsics + +def load_pose(scan_dir, frame_id): + # Find alignment file + alignment_path = None + for file_name in os.listdir(scan_dir): + if file_name.endswith('.align.json'): + alignment_path = osp.join(scan_dir, file_name) + break + + if alignment_path is None: + raise FileNotFoundError(f"No alignment file found in {scan_dir}") + + with open(alignment_path, "r") as f: + alignment_data = json.load(f) + if 'coordinate_transform' not in alignment_data: + raise ValueError(f"Alignment file {alignment_path} does not contain 'coordinate_transform'") + coordinate_transform = np.reshape(alignment_data['coordinate_transform'], (4, 4), order='F') + inv_transform = np.linalg.inv(coordinate_transform) + + pose_path = osp.join(scan_dir, 'poses.jsonl') + with jsonlines.open(pose_path) as reader: + for entry in reader: + if entry.get("frame_id") == frame_id: + transform = np.asarray(entry.get('transform')) + transform = np.reshape(transform, (4, 4), order='F') + transform = np.dot(transform, np.diag([1, -1, -1, 1])) + transform = transform / transform[3][3] + aligned_pose = inv_transform @ transform #align camera poses + return aligned_pose + + raise ValueError(f"Pose for frame_id {frame_id} not found in {pose_path}") + + +def load_all_poses(scan_dir, frame_idxs): + frame_poses = {} + for frame_idx in frame_idxs: + frame_pose = load_pose(scan_dir, int(frame_idx)) + frame_poses[frame_idx] = frame_pose + return frame_poses + +def load_frame_idxs(scan_dir, skip=None): + frames_paths = glob(osp.join(scan_dir, 'sequence', '*.jpg')) + frame_names = [osp.basename(frame_path) for frame_path in frames_paths] + frame_idxs = [frame_name.split('.')[0].split('-')[-1] for frame_name in frame_names] + frame_idxs.sort() + + if skip is None: + 
frame_idxs = frame_idxs + else: + frame_idxs = [frame_idx for frame_idx in frame_idxs[::skip]] + return frame_idxs + + +def represents_int(s): + ''' if string s represents an int. ''' + try: + int(s) + return True + except ValueError: + return False \ No newline at end of file diff --git a/util/structured3d.py b/util/structured3d.py new file mode 100644 index 0000000..6fc9d46 --- /dev/null +++ b/util/structured3d.py @@ -0,0 +1,171 @@ +import os.path as osp +import numpy as np +from plyfile import PlyData +from glob import glob +import cv2 + +S3D_SCANNET = { + 1: 'wall', + 2: 'floor', + 3: 'cabinet', + 4: 'bed', + 5: 'chair', + 6: 'sofa', + 7: 'table', + 8: 'door', + 9: 'window', + 10: 'bookshelf', + 11: 'picture', + 12: 'counter', + 13: 'blinds', + 14: 'desk', + 15: 'shelf', + 16: 'curtain', + 17: 'dresser', + 18: 'pillow', + 19: 'mirror', + 20: 'mat', + 21: 'clothes', + 22: 'ceiling', + 23: 'books', + 24: 'refrigerator', + 25: 'tv', + 26: 'paper', + 27: 'towel', + 28: 'shower curtain', + 29: 'box', + 30: 'whiteboard', + 31: 'person', + 32: 'nightstand', + 33: 'toilet', + 34: 'sink', + 35: 'lamp', + 36: 'bathtub', + 37: 'bag', + 38: 'otherstructure', + 39: 'otherfurniture', + 40: 'otherprop'} + +def get_scan_ids(dirname, split): + filepath = osp.join(dirname, '{}_scans.txt'.format(split)) + scan_ids = np.genfromtxt(filepath, dtype = str) + return scan_ids + +def load_ply_data(data_dir, scan_id, room_id): + + filename_in = osp.join(data_dir, scan_id, '3D_rendering', room_id, 'room_mesh.ply') + print(scan_id) + if not osp.exists(filename_in): + raise FileNotFoundError(f"PLY file not found: {filename_in}") + + with open(filename_in, 'rb') as file: + ply_data = PlyData.read(file) + + x = np.array(ply_data['vertex']['x']) + y = np.array(ply_data['vertex']['y']) + z = np.array(ply_data['vertex']['z']) + red = np.array(ply_data['vertex']['red']) + green = np.array(ply_data['vertex']['green']) + blue = np.array(ply_data['vertex']['blue']) + vertex_object_ids = np.array(ply_data['vertex']['object_id']) + vertex_nyu40ids = np.array(ply_data['vertex']['nyu40id']) + # vertex_targetids = np.array(ply_data['vertex']['target_id']) + + vertex_dtype = [ + ('x', 'f4'), ('y', 'f4'), ('z', 'f4'), + ('red', 'u1'), ('green', 'u1'), ('blue', 'u1'), ('alpha', 'u1'), + ('objectId', 'i4'), + ('nyu40id', 'i4'), + ('targetId', 'i4') + + ] + + scene_vertices = np.column_stack([x, y, z]) + center_points = np.mean(scene_vertices, axis=0) + center_points[2] = np.min(scene_vertices[:, 2]) + scene_vertices = scene_vertices - center_points + + vertices = np.empty(len(x), dtype=vertex_dtype) + + vertices['x'] = scene_vertices[:, 0].astype('f4') + vertices['y'] = scene_vertices[:, 1].astype('f4') + vertices['z'] = scene_vertices[:, 2].astype('f4') + + # vertices['x'] = x.astype('f4') + # vertices['y'] = y.astype('f4') + # vertices['z'] = z.astype('f4') + + vertices['red'] = red.astype('u1') + vertices['green'] = green.astype('u1') + vertices['blue'] = blue.astype('u1') + vertices['objectId'] = vertex_object_ids.astype('i4') + vertices['nyu40id'] = vertex_nyu40ids.astype('i4') + vertices['targetId'] = np.zeros_like(x).astype('i4') + # vertices['targetId'] = vertex_targetids.astype('i4') + return vertices + +def normalize(vector): + return vector / np.linalg.norm(vector) + + +def parse_camera_info(camera_info, height, width): + """ extract intrinsic and extrinsic matrix + """ + lookat = normalize(camera_info[3:6]) + up = normalize(camera_info[6:9]) + + W = lookat + U = np.cross(W, up) + V = np.cross(W, U) + + rot = 
np.vstack((U, V, W))
+
+    trans = camera_info[:3]
+
+    xfov = camera_info[9]
+    yfov = camera_info[10]
+
+    K = np.diag([1.0, 1.0, 1.0])  # float intrinsics so fx/fy are not truncated to integers below
+
+    K[0, 2] = width / 2
+    K[1, 2] = height / 2
+
+    K[0, 0] = K[0, 2] / np.tan(xfov)
+    K[1, 1] = K[1, 2] / np.tan(yfov)
+
+    return rot, trans, K
+
+def load_all_poses(scan_dir, frame_idxs):
+    frame_poses = {}
+    for frame_idx in frame_idxs:
+        frame_pose = load_pose(scan_dir, frame_idx)
+        frame_poses[frame_idx] = frame_pose
+    return frame_poses
+
+def load_pose(scan_dir, frame_id):
+    pose_path = osp.join(scan_dir, frame_id, 'camera_pose.txt')
+    camera_info = np.loadtxt(pose_path)
+    rgb_image_path = osp.join(scan_dir, frame_id, 'rgb_rawlight.png')
+    color = cv2.imread(rgb_image_path)
+    height, width = color.shape[:2]
+    rot, trans, K = parse_camera_info(camera_info, height, width)
+
+    trans = np.array(trans) / 1000
+    extrinsic = np.eye(4)
+    extrinsic[:3, :3] = rot.T
+    extrinsic[:3, -1] = trans
+    extrinsic = np.linalg.inv(extrinsic)
+
+    return extrinsic
+
+def load_intrinsics(scene_folder):
+    camera_info = np.loadtxt(osp.join(scene_folder, '0', 'camera_pose.txt'))
+    rgb_image_path = osp.join(scene_folder, '0', 'rgb_rawlight.png')
+    rgb_img = cv2.imread(rgb_image_path)
+    height, width = rgb_img.shape[:2]
+    _, _, K = parse_camera_info(camera_info, height, width)
+    intrinsics = {}
+    intrinsics['intrinsic_mat'] = K
+    intrinsics['width'] = width
+    intrinsics['height'] = height
+    return intrinsics
\ No newline at end of file