diff --git a/DATA.md b/DATA.md index 643b538..92a22dd 100644 --- a/DATA.md +++ b/DATA.md @@ -10,6 +10,8 @@ We list the available data used in the current version of CrossOver in the table | ------------ | ----------------------------- | ----------------------------------- | -------------------------- | -------------------------- | | ScanNet | `[point, rgb, cad, referral]` | `[point, rgb, floorplan, referral]` | ❌ | ✅ | | 3RScan | `[point, rgb, referral]` | `[point, rgb, referral]` | ✅ | ✅ | +| ARKitScenes | `[point, rgb, referral]` | `[point, rgb, referral]` | ❌ | ✅ | +| MultiScan | `[point, rgb, referral]` | `[point, rgb, referral]` | ❌ | ✅ | We detail data download and release instructions for preprocessing with scripts for ScanNet + 3RScan. @@ -110,4 +112,69 @@ Scan3R/ | │ ├── objectsDataMultimodal.pt -> object data combined from data1D.pt + data2D.pt + data3D.pt (for easier loading) | │ └── sel_cams_on_mesh.png (visualisation of the cameras selected for computing RGB features per scan) | └── ... -``` \ No newline at end of file +``` +### MultiScan + +#### Running preprocessing scripts +Adjust the path parameters of `MultiScan` in the config files under `configs/preprocess`. Run the following (after changing the `--config-path` in the bash file): + +```bash +$ bash scripts/preprocess/process_multiscan.sh +``` + +Our script for MultiScan dataset performs the following additional processing: + +- 3D-to-2D projection for 2D segmentation and stores as `gt-projection-seg.pt` for each scan. + +Post running preprocessing, the data structure should look like the following: + +``` +MultiScan/ +├── objects_chunked/ (object data chunked into hdf5 format for instance baseline training) +| ├── train_objects.h5 +| └── val_objects.h5 +├── scans/ +| ├── scene_00000_00/ +| │ ├── gt-projection-seg.pt -> 3D-to-2D projected data consisting of framewise 2D instance segmentation +| │ ├── data1D.pt -> all 1D data + encoded (object referrals + BLIP features) +| │ ├── data2D.pt -> all 2D data + encoded (RGB + floorplan + DinoV2 features) +| │ ├── data2D_all_images.pt (RGB features of every image of every scan) +| │ ├── data3D.pt -> all 3D data + encoded (Point Cloud + I2PMAE features - object only) +| │ ├── object_id_to_label_id_map.pt -> Instance ID to NYU40 Label mapped +| │ ├── objectsDataMultimodal.pt -> object data combined from data1D.pt + data2D.pt + data3D.pt (for easier loading) +| │ └── sel_cams_on_mesh.png (visualisation of the cameras selected for computing RGB features per scan) +| └── ... +``` + +### ARKitScenes + +#### Running preprocessing scripts +Adjust the path parameters of `ARKitScenes` in the config files under `configs/preprocess`. Run the following (after changing the `--config-path` in the bash file): + +```bash +$ bash scripts/preprocess/process_arkit.sh +``` + +Our script for ARKitScenes dataset performs the following additional processing: + +- 3D-to-2D projection for 2D segmentation and stores as `gt-projection-seg.pt` for each scan. 
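+
+To sanity-check the preprocessed output of a single scan, a minimal sketch such as the one below may help. The scan id `40753679`, the paths, and the exact key layout are assumptions based on the per-scan files listed below and on `data/datasets/scanbase.py`; adjust them to your setup:
+
+```python
+import os.path as osp
+import torch
+
+process_dir = "/path/to/processed/ARKitScenes"  # processed dataset root (placeholder)
+scan_dir = osp.join(process_dir, "scans", "40753679")
+
+# Frame-wise 2D instance segmentation produced by the 3D-to-2D projection step
+gt_proj = torch.load(osp.join(scan_dir, "gt-projection-seg.pt"))
+print(type(gt_proj))
+
+# Encoded 3D data; the 'scene' keys mirror those read in data/datasets/scanbase.py
+data_3d = torch.load(osp.join(scan_dir, "data3D.pt"))
+scene = data_3d["scene"]
+print(scene["pcl_coords"].shape, scene["pcl_feats"].shape, scene["scene_label"])
+```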
+ +Post running preprocessing, the data structure should look like the following: + +``` +ARKitScenes/ +├── objects_chunked/ (object data chunked into hdf5 format for instance baseline training) +| ├── train_objects.h5 +| └── val_objects.h5 +├── scans/ +| ├── 40753679/ +| │ ├── gt-projection-seg.pt -> 3D-to-2D projected data consisting of framewise 2D instance segmentation +| │ ├── data1D.pt -> all 1D data + encoded (object referrals + BLIP features) +| │ ├── data2D.pt -> all 2D data + encoded (RGB + floorplan + DinoV2 features) +| │ ├── data2D_all_images.pt (RGB features of every image of every scan ) +| │ ├── data3D.pt -> all 3D data + encoded (Point Cloud + I2PMAE features - object only) +| │ ├── object_id_to_label_id_map.pt -> Instance ID to NYU40 Label mapped +| │ ├── objectsDataMultimodal.pt -> object data combined from data1D.pt + data2D.pt + data3D.pt (for easier loading) +| │ └── sel_cams_on_mesh.png (visualisation of the cameras selected for computing RGB features per scan) +| └── ... +``` diff --git a/README.md b/README.md index 1cb1030..c133ec5 100644 --- a/README.md +++ b/README.md @@ -118,6 +118,9 @@ See [DATA.MD](DATA.md) for detailed instructions on data download, preparation a | ------------ | ----------------------------- | ----------------------------------- | -------------------------- | -------------------------- | | Scannet | `[point, rgb, cad, referral]` | `[point, rgb, floorplan, referral]` | ❌ | ✅ | | 3RScan | `[point, rgb, referral]` | `[point, rgb, referral]` | ✅ | ✅ | +| ARKitScenes | `[point, rgb, referral]` | `[point, rgb, referral]` | ❌ | ✅ | +| MultiScan | `[point, rgb, referral]` | `[point, rgb, referral]` | ❌ | ✅ | + > To run our demo, you only need to download generated embedding data; no need for any data preprocessing. @@ -134,7 +137,7 @@ Various configurable parameters: - `--database_path`: Path to the precomputed embeddings of the database scenes downloaded before (eg: `./release_data/embed_scannet.pt`). - `--query_modality`: Modality of the query scene, Options: `point`, `rgb`, `floorplan`, `referral` - `--database_modality`: Modality used for retrieval. Same options as above. -- `--ckpt`: Path to the pre-trained scene crossover model checkpoint (details [here](#checkpoints)), example_path: `./checkpoints/scene_crossover_scannet+scan3r.pth/`). +- `--ckpt`: Path to the pre-trained scene crossover model checkpoint (details [here](#checkpoints)), example_path: `./checkpoints/scene_crossover_scannet+scan3r.pth/`. For embedding and pre-trained model download, refer to [generated embedding data](DATA.md#generated-embedding-data) and [checkpoints](#checkpoints) sections. diff --git a/TRAIN.md b/TRAIN.md index fd56dcd..5520b7d 100644 --- a/TRAIN.md +++ b/TRAIN.md @@ -21,7 +21,7 @@ $ bash scripts/train/train_instance_crossover.sh ``` #### Train Scene Retrieval Pipeline -Adjust path/configuration parameters in `configs/train/train_scene_crossover.yaml`. You can also add your customised dataset or choose to train on Scannet & 3RScan or either. Run the following: +Adjust path/configuration parameters in `configs/train/train_scene_crossover.yaml`. You can also add your customised dataset or choose to train on Scannet, 3RScan, MultiScan, & ARKitScenes or any combination of the same. 
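+
+For example, instead of editing the YAML by hand, the dataset selection can also be toggled programmatically with OmegaConf (a sketch; it assumes the config remains plain OmegaConf YAML and uses the `task.UnifiedTrain.train`/`val` keys from `configs/train/train_scene_crossover.yaml`):
+
+```python
+from omegaconf import OmegaConf
+
+cfg = OmegaConf.load("configs/train/train_scene_crossover.yaml")
+
+# Train/validate on any combination of the supported datasets
+cfg.task.UnifiedTrain.train = ["Scannet", "Scan3R", "MultiScan", "ARKitScenes"]
+cfg.task.UnifiedTrain.val = ["Scannet", "Scan3R", "MultiScan", "ARKitScenes"]
+
+OmegaConf.save(cfg, "configs/train/train_scene_crossover.yaml")
+```
+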
Run the following: ```bash $ bash scripts/train/train_scene_crossover.sh diff --git a/configs/evaluation/eval_instance.yaml b/configs/evaluation/eval_instance.yaml index a14c626..2b2310b 100644 --- a/configs/evaluation/eval_instance.yaml +++ b/configs/evaluation/eval_instance.yaml @@ -43,14 +43,33 @@ data : max_object_len : 150 voxel_size : 0.02 + ARKitScenes: + base_dir : /media/sayan/Expansion/data/datasets/ARKitScenes + process_dir : ${data.process_dir}/ARKitScenes/ + processor3D : ARKitScenes3DProcessor + processor2D : ARKitScenes2DProcessor + processor1D : ARKitScenes1DProcessor + avail_modalities : ['point', 'cad', 'rgb', 'referral'] + max_object_len : 150 + voxel_size : 0.02 + + MultiScan: + base_dir : /media/sayan/Expansion/data/datasets/MultiScan + process_dir : ${data.process_dir}/MultiScan + processor3D : MultiScan3DProcessor + processor2D : MultiScan2DProcessor + processor1D : MultiScan1DProcessor + avail_modalities : ['point', 'cad', 'rgb', 'referral'] + max_object_len : 150 + voxel_size : 0.02 + task: name : InferenceObjectRetrieval InferenceObjectRetrieval: val : [Scannet] modalities : ['rgb', 'point', 'cad', 'referral'] scene_modalities : ['rgb', 'point', 'referral', 'floorplan'] - ckpt_path : /drive/dumps/multimodal-spaces/runs/release_runs/instance_crossover_scannet+scan3r.pth - + ckpt_path : /drive/dumps/multimodal-spaces/runs/release_runs/instance_crossover_scannet+scan3r+multiscan.pth inference_module: ObjectRetrieval diff --git a/configs/evaluation/eval_scene.yaml b/configs/evaluation/eval_scene.yaml index 0f1b6f2..a666183 100644 --- a/configs/evaluation/eval_scene.yaml +++ b/configs/evaluation/eval_scene.yaml @@ -43,13 +43,32 @@ data : max_object_len : 150 voxel_size : 0.02 + ARKitScenes: + base_dir : /media/sayan/Expansion/data/datasets/ARKitScenes + process_dir : ${data.process_dir}/ARKitScenes/ + processor3D : ARKitScenes3DProcessor + processor2D : ARKitScenes2DProcessor + processor1D : ARKitScenes1DProcessor + max_object_len : 150 + voxel_size : 0.02 + avail_modalities : ['point', 'cad', 'rgb', 'referral'] + MultiScan: + base_dir : /media/sayan/Expansion/data/datasets/MultiScan + process_dir : ${data.process_dir}/MultiScan + processor3D : MultiScan3DProcessor + processor2D : MultiScan2DProcessor + processor1D : MultiScan1DProcessor + avail_modalities : ['point', 'cad', 'rgb', 'referral'] + max_object_len : 150 + voxel_size : 0.02 + task: name : InferenceSceneRetrieval InferenceSceneRetrieval: val : [Scannet] modalities : ['rgb', 'point', 'cad', 'referral'] scene_modalities : ['rgb', 'point', 'referral', 'floorplan'] #, 'point'] - ckpt_path : /drive/dumps/multimodal-spaces/runs/release_runs/scene_crossover_scannet+scan3r.pth + ckpt_path : /drive/dumps/multimodal-spaces/runs/release_runs/scene_crossover_scannet+scan3r+multiscan.pth inference_module: SceneRetrieval model: diff --git a/configs/preprocess/process_1d.yaml b/configs/preprocess/process_1d.yaml index c74b6bc..230643f 100644 --- a/configs/preprocess/process_1d.yaml +++ b/configs/preprocess/process_1d.yaml @@ -25,6 +25,28 @@ data: label_filename : labels.instances.align.annotated.v2.ply skip_frames : 1 + ARKitScenes: + base_dir : /media/sayan/Expansion/data/datasets/ARKitScenes + process_dir : ${data.process_dir}/ARKitScenes/ + processor3D : ARKitScenes3DProcessor + processor2D : ARKitScenes2DProcessor + processor1D : ARKitScenes1DProcessor + skip_frames : 1 + MultiScan: + base_dir : /media/sayan/Expansion/data/datasets/MultiScan + process_dir : ${data.process_dir}/MultiScan + processor3D : 
MultiScan3DProcessor + processor2D : MultiScan2DProcessor + processor1D : MultiScan1DProcessor + skip_frames : 1 + + Structured3D: + base_dir : /Users/gauravpradeep/CrossOver_ScaleUp/Structured3D/ + process_dir : ${data.process_dir}/Structured3D/scans + processor3D : Structured3D_3DProcessor + processor2D : Structured3D_2DProcessor + processor1D : Structured3D_1DProcessor + skip_frames : 1 Shapenet: base_dir : /drive/datasets/Shapenet/ShapeNetCore.v2/ diff --git a/configs/preprocess/process_2d.yaml b/configs/preprocess/process_2d.yaml index 74898cd..fd3422a 100644 --- a/configs/preprocess/process_2d.yaml +++ b/configs/preprocess/process_2d.yaml @@ -27,6 +27,29 @@ data: label_filename : labels.instances.align.annotated.v2.ply skip_frames : 1 + ARKitScenes: + base_dir : /media/sayan/Expansion/data/datasets/ARKitScenes + process_dir : ${data.process_dir}/ARKitScenes/ + processor3D : ARKitScenes3DProcessor + processor2D : ARKitScenes2DProcessor + processor1D : ARKitScenes1DProcessor + skip_frames : 1 + MultiScan: + base_dir : /media/sayan/Expansion/data/datasets/MultiScan + process_dir : ${data.process_dir}/MultiScan + processor3D : MultiScan3DProcessor + processor2D : MultiScan2DProcessor + processor1D : MultiScan1DProcessor + skip_frames : 1 + + Structured3D: + base_dir : /Users/gauravpradeep/CrossOver_ScaleUp/Structured3D/ + process_dir : ${data.process_dir}/Structured3D/scans + processor3D : Structured3D_3DProcessor + processor2D : Structured3D_2DProcessor + processor1D : Structured3D_1DProcessor + skip_frames : 1 + modality_info: 1D : feature_extractor: @@ -60,4 +83,4 @@ task: name : Preprocess Preprocess : modality : '2D' - splits : ['val'] \ No newline at end of file + splits : ['train', 'val'] \ No newline at end of file diff --git a/configs/preprocess/process_3d.yaml b/configs/preprocess/process_3d.yaml index 3d15f23..2a405c5 100644 --- a/configs/preprocess/process_3d.yaml +++ b/configs/preprocess/process_3d.yaml @@ -24,6 +24,28 @@ data: processor1D : Scan3R1DProcessor label_filename : labels.instances.align.annotated.v2.ply + ARKitScenes: + base_dir : /media/sayan/Expansion/data/datasets/ARKitScenes + process_dir : ${data.process_dir}/ARKitScenes/ + processor3D : ARKitScenes3DProcessor + processor2D : ARKitScenes2DProcessor + processor1D : ARKitScenes1DProcessor + + MultiScan: + base_dir : /media/sayan/Expansion/data/datasets/MultiScan + process_dir : ${data.process_dir}/MultiScan + processor3D : MultiScan3DProcessor + processor2D : MultiScan2DProcessor + processor1D : MultiScan1DProcessor + skip_frames : 1 + + Structured3D: + base_dir : /Users/gauravpradeep/CrossOver_ScaleUp/Structured3D/ + process_dir : ${data.process_dir}/Structured3D/scans + processor3D : Structured3D_3DProcessor + processor2D : Structured3D_2DProcessor + processor1D : Structured3D_1DProcessor + modality_info: 1D : feature_extractor: diff --git a/configs/preprocess/process_multimodal.yaml b/configs/preprocess/process_multimodal.yaml index 3eb5ace..0806365 100644 --- a/configs/preprocess/process_multimodal.yaml +++ b/configs/preprocess/process_multimodal.yaml @@ -28,6 +28,33 @@ data: skip_frames : 1 avail_modalities : ['point', 'rgb', 'referral'] + ARKitScenes: + base_dir : /media/sayan/Expansion/data/datasets/ARKitScenes + process_dir : ${data.process_dir}/ARKitScenes/ + chunked_dir : ${data.process_dir}/ARKitScenes/objects_chunked + processor3D : ARKitScenes3DProcessor + processor2D : ARKitScenes2DProcessor + processor1D : ARKitScenes1DProcessor + avail_modalities : ['point', 'rgb', 'referral'] + + MultiScan: + 
base_dir : /media/sayan/Expansion/data/datasets/MultiScan + process_dir : ${data.process_dir}/MultiScan/ + chunked_dir : ${data.process_dir}/MultiScan/objects_chunked + processor3D : Scan3R3DProcessor + processor2D : Scan3R2DProcessor + processor1D : Scan3R1DProcessor + avail_modalities : ['point', 'rgb', 'referral'] + + Structured3D: + base_dir : /Users/gauravpradeep/CrossOver_ScaleUp/Structured3D + process_dir : ${data.process_dir}/Structured3D/scans + chunked_dir : ${data.process_dir}/Structured3D/objects_chunked + processor3D : Structured3D_3DProcessor + processor2D : Structured3D_2DProcessor + processor1D : Structured3D_1DProcessor + avail_modalities : ['point', 'rgb', 'referral'] + modality_info: 1D : feature_extractor: diff --git a/configs/train/train_instance_baseline.yaml b/configs/train/train_instance_baseline.yaml index 8b6bc89..ee70d74 100644 --- a/configs/train/train_instance_baseline.yaml +++ b/configs/train/train_instance_baseline.yaml @@ -44,6 +44,27 @@ data : max_object_len : 150 voxel_size : 0.02 + ARKitScenes: + base_dir : /media/sayan/Expansion/data/datasets/ARKitScenes + process_dir : ${data.process_dir}/ARKitScenes/ + chunked_dir : ${data.process_dir}/ARKitScenes/objects_chunked + processor3D : ARKitScenes3DProcessor + processor2D : ARKitScenes2DProcessor + processor1D : ARKitScenes1DProcessor + avail_modalities : ['point', 'rgb', 'referral'] + max_object_len : 150 + voxel_size : 0.02 + MultiScan: + base_dir : /media/sayan/Expansion/data/datasets/Multiscan + process_dir : ${data.process_dir}/MultiScan/ + chunked_dir : ${data.process_dir}/MultiScan/objects_chunked + processor3D : MultiScan3DProcessor + processor2D : MultiScan2DProcessor + processor1D : MultiScan1DProcessor + avail_modalities : ['point', 'rgb', 'referral'] + max_object_len : 150 + voxel_size : 0.02 + task: name : ObjectLevelGrounding ObjectLevelGrounding : diff --git a/configs/train/train_instance_crossover.yaml b/configs/train/train_instance_crossover.yaml index c54257d..35a6a15 100644 --- a/configs/train/train_instance_crossover.yaml +++ b/configs/train/train_instance_crossover.yaml @@ -44,12 +44,33 @@ data : max_object_len : 150 voxel_size : 0.02 + ARKitScenes: + base_dir : /media/sayan/Expansion/data/datasets/ARKitScenes + process_dir : ${data.process_dir}/ARKitScenes/ + chunked_dir : ${data.process_dir}/ARKitScenes/objects_chunked + processor3D : ARKitScenes3DProcessor + processor2D : ARKitScenes2DProcessor + processor1D : ARKitScenes1DProcessor + avail_modalities : ['point', 'cad', 'rgb', 'referral'] + max_object_len : 150 + voxel_size : 0.02 + MultiScan: + base_dir : /media/sayan/Expansion/data/datasets/Multiscan + process_dir : ${data.process_dir}/MultiScan/ + chunked_dir : ${data.process_dir}/MultiScan/objects_chunked + processor3D : MultiScan3DProcessor + processor2D : MultiScan2DProcessor + processor1D : MultiScan1DProcessor + avail_modalities : ['point', 'cad', 'rgb', 'referral'] + max_object_len : 150 + voxel_size : 0.02 + task: name : SceneLevelGrounding SceneLevelGrounding : modalities : ['rgb', 'point', 'cad', 'referral'] - train : [Scannet, Scan3R] - val : [Scannet, Scan3R] + train : [Scannet, Scan3R, MultiScan, ARKitScenes] + val : [Scannet, Scan3R, MultiScan, ARKitScenes] trainer: GroundingTrainer diff --git a/configs/train/train_scene_crossover.yaml b/configs/train/train_scene_crossover.yaml index f9459da..9886e95 100644 --- a/configs/train/train_scene_crossover.yaml +++ b/configs/train/train_scene_crossover.yaml @@ -44,14 +44,35 @@ data : max_object_len : 150 voxel_size : 0.02 + 
ARKitScenes: + base_dir : /media/sayan/Expansion/data/datasets/ARKitScenes + process_dir : ${data.process_dir}/ARKitScenes/ + chunked_dir : ${data.process_dir}/ARKitScenes/objects_chunked + processor3D : ARKitScenes3DProcessor + processor2D : ARKitScenes2DProcessor + processor1D : ARKitScenes1DProcessor + avail_modalities : ['point', 'cad', 'rgb', 'referral'] + max_object_len : 150 + voxel_size : 0.02 + MultiScan: + base_dir : /media/sayan/Expansion/data/datasets/Multiscan + process_dir : ${data.process_dir}/MultiScan/ + chunked_dir : ${data.process_dir}/MultiScan/objects_chunked + processor3D : MultiScan3DProcessor + processor2D : MultiScan2DProcessor + processor1D : MultiScan1DProcessor + avail_modalities : ['point', 'cad', 'rgb', 'referral'] + max_object_len : 150 + voxel_size : 0.02 + task: name : UnifiedTrain UnifiedTrain : modalities : ['rgb', 'point', 'cad', 'referral'] scene_modalities : ['rgb', 'point', 'floorplan', 'referral'] - train : [Scannet, Scan3R, MultiScan] - val : [Scannet, Scan3R, MultiScan] - object_enc_ckpt : /drive/dumps/multimodal-spaces/runs/release_runs/instance_crossover_scannet+scan3r+multiscan.pth + train : [Scannet, Scan3R, MultiScan, ARKitScenes] + val : [Scannet, Scan3R, MultiScan, ARKitScenes] + object_enc_ckpt : /drive/dumps/multimodal-spaces/runs/release_runs/instance_crossover_scannet+scan3r+multiscan+arkitscenes.pth trainer: UnifiedTrainer @@ -78,7 +99,7 @@ model: base_modality : 'rgb' dataloader: - batch_size : 16 + batch_size : 32 num_workers : 6 eval: diff --git a/data/datasets/__init__.py b/data/datasets/__init__.py index 9a1b744..7db5e81 100644 --- a/data/datasets/__init__.py +++ b/data/datasets/__init__.py @@ -1,2 +1,5 @@ from .scannet import * -from .scan3r import * \ No newline at end of file +from .scan3r import * +from .arkit import * +from .multiscan import * +from .structured3d import * \ No newline at end of file diff --git a/data/datasets/arkit.py b/data/datasets/arkit.py new file mode 100644 index 0000000..4944dae --- /dev/null +++ b/data/datasets/arkit.py @@ -0,0 +1,41 @@ +import os.path as osp +import numpy as np +from typing import List, Any +from omegaconf import DictConfig +import pandas as pd +from ..build import DATASET_REGISTRY +from .scanbase import ScanObjectBase, ScanBase + +@DATASET_REGISTRY.register() +class ARKitScenesObject(ScanObjectBase): + """ARKitScenes dataset class for instance level baseline""" + def __init__(self, data_config: DictConfig, split: str) -> None: + super().__init__(data_config, split) + +@DATASET_REGISTRY.register() +class ARKitScenes(ScanBase): + """ARKitScenes dataset class""" + def __init__(self, data_config: DictConfig, split: str) -> None: + super().__init__(data_config, split) + + filepath = osp.join(self.files_dir, '{}_scans.txt'.format(self.split)) + self.scan_ids = np.genfromtxt(filepath, dtype = str) + + def get_temporal_scan_pairs(self): + """Groups scans into temporal pairs based on shared visit_id.""" + csv_path=osp.join(self.files_dir,'3dod_train_val_splits.csv') + df = pd.read_csv(csv_path) + + df = df[df["visit_id"].notna()] + + grouped_scans = df.groupby("visit_id")["video_id"].apply(list).to_dict() + + scene_pairs = [] + for video_ids in grouped_scans.values(): + if len(video_ids) > 1: + ref_scan_id = video_ids[0] # First video_id as reference + rescan_list = [{"scan_id": rescan_id} for rescan_id in video_ids[1:]] + + scene_pairs.append([ref_scan_id, rescan_list]) + + return scene_pairs \ No newline at end of file diff --git a/data/datasets/multiscan.py b/data/datasets/multiscan.py new 
file mode 100644 index 0000000..a43d8a1 --- /dev/null +++ b/data/datasets/multiscan.py @@ -0,0 +1,42 @@ +import os.path as osp +import numpy as np +from typing import List, Any +from omegaconf import DictConfig + +from ..build import DATASET_REGISTRY +from .scanbase import ScanObjectBase, ScanBase + +@DATASET_REGISTRY.register() +class MultiScanObject(ScanObjectBase): + """MultiScan dataset class for instance level baseline""" + def __init__(self, data_config: DictConfig, split: str) -> None: + super().__init__(data_config, split) + +@DATASET_REGISTRY.register() +class MultiScan(ScanBase): + """MultiScan dataset class""" + def __init__(self, data_config: DictConfig, split: str) -> None: + super().__init__(data_config, split) + + filepath = osp.join(self.files_dir, '{}_scans.txt'.format(self.split)) + self.scan_ids = np.genfromtxt(filepath, dtype = str) + + def get_temporal_scan_pairs(self) -> List[List[Any]]: + """Gets pairs of temporal scans from the dataset.""" + scene_pairs = [] + + ref_scan_ids = [scan_id for scan_id in self.scan_ids if scan_id.endswith('00')] + + for ref_scan_id in ref_scan_ids: + rescan_list = [] + + for rescan_id in self.scan_ids: + rescan = {} + if rescan_id.startswith(ref_scan_id.split('_')[0]) and rescan_id != ref_scan_id: + rescan['scan_id'] = rescan_id + rescan_list.append(rescan) + if len(rescan_list) == 0: + continue + + scene_pairs.append([ref_scan_id, rescan_list]) + return scene_pairs \ No newline at end of file diff --git a/data/datasets/scanbase.py b/data/datasets/scanbase.py index 7f8d3fe..e891266 100644 --- a/data/datasets/scanbase.py +++ b/data/datasets/scanbase.py @@ -131,14 +131,18 @@ def __getitem__(self, index: int) -> Dict[str, Any]: scan_process_dir = osp.join(self.process_dir, 'scans', scan_id) - scan_objects_data = torch.load(osp.join(scan_process_dir, 'objectsDataMultimodal.pt')) + # scan_objects_data = torch.load(osp.join(scan_process_dir, 'objectsDataMultimodal.pt')) + scan_objects_data = np.load(osp.join(scan_process_dir, 'objectsDataMultimodal.npz'), allow_pickle=True) - scandata_1d = torch.load(osp.join(scan_process_dir, 'data1D.pt')) - scandata_2d = torch.load(osp.join(scan_process_dir, 'data2D.pt')) - scandata_3d = torch.load(osp.join(scan_process_dir, 'data3D.pt')) + # scandata_1d = torch.load(osp.join(scan_process_dir, 'data1D.pt')) + scandata_1d = np.load(osp.join(scan_process_dir, 'data1D.npz'), allow_pickle=True) + # scandata_2d = torch.load(osp.join(scan_process_dir, 'data2D.pt')) + scandata_2d = np.load(osp.join(scan_process_dir, 'data2D.npz'), allow_pickle=True) + # scandata_3d = torch.load(osp.join(scan_process_dir, 'data3D.pt')) + scandata_3d = np.load(osp.join(scan_process_dir, 'data3D.npz'), allow_pickle=True) # Point Cloud Data -- Scene - points, feats, scene_label = scandata_3d['scene']['pcl_coords'], scandata_3d['scene']['pcl_feats'], scandata_3d['scene']['scene_label'] + points, feats, scene_label = scandata_3d['scene'].item()['pcl_coords'], scandata_3d['scene'].item()['pcl_feats'], scandata_3d['scene'].item()['scene_label'] feats /= 255. 
feats -= 0.5 @@ -152,9 +156,9 @@ def __getitem__(self, index: int) -> Dict[str, Any]: _, sel = ME.utils.sparse_quantize(points / self.voxel_size, return_index=True) coords, feats = points[sel], feats[sel] - # Get coords, shift to center + # Get coords, already zero centered during preprocessing coords = np.floor(coords / self.voxel_size) - coords-=coords.min(0) + # coords-=coords.min(0) # Object Data scene_dict = {} @@ -185,9 +189,8 @@ def __getitem__(self, index: int) -> Dict[str, Any]: scene_dict['scene_masks'] = {} - rgb_embedding = torch.from_numpy(scandata_2d['scene']['scene_embeddings']) + rgb_embedding = torch.from_numpy(scandata_2d['scene'].item()['scene_embeddings']) rgb_embedding = torch.concatenate([rgb_embedding[:, 0, :], rgb_embedding[:, 1:, :].mean(dim=1)], dim=1) - rgb_embedding = rgb_embedding[list(range(0, rgb_embedding.shape[0], 2)), :] scene_dict['rgb_embedding'] = rgb_embedding scene_dict['scene_masks']['rgb'] = torch.Tensor([1.0]) @@ -195,7 +198,7 @@ def __getitem__(self, index: int) -> Dict[str, Any]: scene_dict['scene_masks']['object'] = torch.Tensor([1.0]) referral_mask = torch.Tensor([0.0]) - referral_embedding = scandata_1d['scene']['referral_embedding'] + referral_embedding = scandata_1d['scene'].item()['referral_embedding'] if referral_embedding is not None: referral_embedding = torch.from_numpy(referral_embedding[0]['feat']).reshape(-1,) @@ -203,7 +206,7 @@ def __getitem__(self, index: int) -> Dict[str, Any]: else: referral_embedding = torch.zeros((scene_dict['rgb_embedding'].shape[-1] // 4, )) - floorplan_embedding = scandata_2d['scene']['floorplan']['embedding'] + floorplan_embedding = scandata_2d['scene'].item()['floorplan']['embedding'] floorplan_mask = torch.Tensor([0.0]) if floorplan_embedding is not None: floorplan_embedding = torch.from_numpy(floorplan_embedding[0, 0]).reshape(-1, ) @@ -258,4 +261,4 @@ def collate_fn(self, batch: List[Dict[str, Any]]) -> Dict[str, Any]: } collated_batch['coordinates'], collated_batch['features'] = coordinates, features - return collated_batch + return collated_batch \ No newline at end of file diff --git a/data/datasets/structured3d.py b/data/datasets/structured3d.py new file mode 100644 index 0000000..2b73b41 --- /dev/null +++ b/data/datasets/structured3d.py @@ -0,0 +1,23 @@ +import os.path as osp +import numpy as np +from typing import List, Any +from omegaconf import DictConfig + +from ..build import DATASET_REGISTRY +from .scanbase import ScanObjectBase, ScanBase + +@DATASET_REGISTRY.register() +class Structured3DObject(ScanObjectBase): + """Structured3D dataset class for instance level baseline""" + def __init__(self, data_config: DictConfig, split: str) -> None: + super().__init__(data_config, split) + +@DATASET_REGISTRY.register() +class Structured3D(ScanBase): + """Structured3D dataset class""" + def __init__(self, data_config: DictConfig, split: str) -> None: + super().__init__(data_config, split) + + filepath = osp.join(self.files_dir, '{}_scans.txt'.format(self.split)) + self.scan_ids = np.genfromtxt(filepath, dtype = str) + \ No newline at end of file diff --git a/prepare_data/README.md b/prepare_data/README.md index dba34f5..c369156 100644 --- a/prepare_data/README.md +++ b/prepare_data/README.md @@ -5,6 +5,8 @@ This document provides instructions for pre-processing different datasets, including - ScanNet - 3RScan +- ARKitScenes +- MultiScan ## Prerequisites @@ -16,20 +18,17 @@ Before you begin, simply activate the `crossover` conda environment. 
#### Original Data - **ScanNet**: Download ScanNet v2 data from the [official website](https://github.com/ScanNet/ScanNet), we use the official training and validation split from [here](https://github.com/ScanNet/ScanNet/tree/master/Tasks/Benchmark). -- **3RScan**: Download 3RScan dataset from the [official website](https://github.com/WaldJohannaU/3RScan), we use the official (full list of scan ids including reference + rescans) training split from [here](https://campar.in.tum.de/public_datasets/3RScan/train_scans.txt) and validation split from [here](https://campar.in.tum.de/public_datasets/3RScan/val_scans.txt). - - Download `3RScan.json` from [here](https://campar.in.tum.de/public_datasets/3RScan/3RScan.json) and `objects.json` from [here](https://campar.in.tum.de/public_datasets/3DSSG/3DSSG/objects.json). - - Download the class mapping file `3RScan.v2 Semantic Classes - Mapping.csv` from [here](https://docs.google.com/spreadsheets/d/1eRTJ2M9OHz7ypXfYD-KTR1AIT-CrVLmhJf8mxgVZWnI/edit?gid=0#gid=0). +- **3RScan**: Download 3RScan dataset from the [official website](https://github.com/WaldJohannaU/3RScan). -- **ShapeNet**: Download ShapenetCore dataset from the [official Huggingface release](https://huggingface.co/datasets/ShapeNet/ShapeNetCore) and unzip. +- **MultiScan**: Download MultiScan dataset from the [official website](https://github.com/smartscenes/multiscan). + +- **ARKitScenes**: Download ARKitScenes dataset from the [official website](https://github.com/apple/ARKitScenes). -#### Referral and CAD annotations -We use [SceneVerse](https://scene-verse.github.io/) for instance referrals (ScanNet & 3RScan) and [Scan2CAD](https://github.com/skanti/Scan2CAD) for CAD annotations (ScanNet). +- **ShapeNet**: Download ShapenetCore dataset from the [official Huggingface release](https://huggingface.co/datasets/ShapeNet/ShapeNetCore) and unzip. -- **SceneVerse** - Download the Scannet and 3RScan data under `annotations/refer` from the [official website](https://scene-verse.github.io/). -- **Scan2CAD** - Download `full_annotations.json` from the [official website](https://github.com/skanti/Scan2CAD?tab=readme-ov-file#download-dataset). +### Download Referral and CAD annotations +We use [SceneVerse](https://scene-verse.github.io/) for instance referrals (ScanNet, 3RScan, MultiScan, & ARKitScenes) and [Scan2CAD](https://github.com/skanti/Scan2CAD) for CAD annotations (ScanNet). Exact instructions for data setup below. -### Prepare The Data -Exact instructions for data setup + preparation below: #### ScanNet 1. Run the following to extract ScanNet data @@ -107,3 +106,81 @@ Scan3R/ └── sceneverse └── ssg_ref_rel2_template.json ``` + +#### ARKitScenes +1. Download ARKitScenes 3dod data using the following command: + +```bash +python ARKitScenes/download_data.py 3dod --video_id_csv PATH_TO_3dod_train_val_splits.csv --download_dir PATH_TO_ARKITSCENES +``` +The files mentioned in the above command - ```download_data.py``` and ```3dod_train_val_splits.csv``` can be found in the official repository [here](https://github.com/apple/ARKitScenes), along with more detailed instructions and descriptions of the data. + +2. Once the data is downloaded, run the following to organize it as per our requirements. + + ```bash +cd ARKitScenes +mv 3dod/Training/* scans +mv 3dod/Validation/* scans +``` + +3. Move the relevant files from `Sceneverse` and `ARKitScenes` under `files/`. 
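+
+As a quick check that the split files are in place, you can reproduce the temporal grouping performed by `data/datasets/arkit.py`: scans that share a `visit_id` in `3dod_train_val_splits.csv` are treated as captures of the same scene, with the first `video_id` acting as the reference scan. A minimal sketch (the `files_dir` path is a placeholder):
+
+```python
+import os.path as osp
+import pandas as pd
+
+files_dir = "/path/to/ARKitScenes/files"
+df = pd.read_csv(osp.join(files_dir, "3dod_train_val_splits.csv"))
+df = df[df["visit_id"].notna()]
+
+# video_ids sharing a visit_id are captures of the same scene
+grouped = df.groupby("visit_id")["video_id"].apply(list).to_dict()
+pairs = [(vids[0], vids[1:]) for vids in grouped.values() if len(vids) > 1]
+print(f"{len(pairs)} scenes with at least one rescan")
+```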
+ +Once completed, the data structure would look like the following: +``` +ARKitScenes/ +├── scans/ +│ ├── 40753679/ +│ │ ├── 40753679_frames/ +│ │ │ ├── lowres_depth/ (folder containing depth images) +│ │ │ ├── lowres_wide/ (folder containing rgb images) +│ │ │ ├── lowres_wide_intrinsics/ (folder containing frame wise camera intrinsics) +│ │ │ ├── lowres_wide.traj (camera trajectory) +│ │ ├── 40753679_3dod_annotation.json +│ │ ├── 40753679_3dod_mesh.ply +| └── +└── files + ├── scannetv2-labels.combined.tsv + ├── train_scans.txt + ├── val_scans.txt + ├── metadata.csv + ├── 3dod_train_val_splits.csv + └── sceneverse + └── ssg_ref_rel2_template.json +``` + +#### MultiScan +1. Download MultiScan data into MultiScan/scenes and run the following to extract MultiScan data + + ```bash +cd MultiScan/scenes +unzip '*.zip' +rm -rf '*.zip' +``` +3. To generate sequence of RGB images and corresponding camera poses from the ```.mp4``` file, run the follwing +```bash +cd prepare_data/multiscan +python preprocess_2d_multiscan.py --base_dir PATH_TO_MULTISCAN --frame_interval {frame_interval} +``` +Once completed, the data structure would look like the following: +``` +MultiScan/ +├── scenes/ +│ ├── scene_00000_00/ +│ │ ├── sequence/ (folder containing rgb images at specified frame interval) +| | ├── frame_ids.txt +│ │ ├── scene_00000_00.annotations.json +│ │ ├── scene_00000_00.jsonl +│ │ ├── scene_00000_00.confidence.zlib +│ │ ├── scene_00000_00.mp4 +│ │ ├── poses.jsonl +│ │ ├── scene_00000_00.ply +│ │ ├── scene_00000_00.align.json +│ │ ├── scene_00000_00.json +| └── +└── files + ├── scannetv2-labels.combined.tsv + ├── train_scans.txt + ├── test_scans.txt + └── sceneverse + └── ssg_ref_rel2_template.json +``` \ No newline at end of file diff --git a/prepare_data/multiscan/preprocess_2d_multiscan.py b/prepare_data/multiscan/preprocess_2d_multiscan.py new file mode 100644 index 0000000..da89da1 --- /dev/null +++ b/prepare_data/multiscan/preprocess_2d_multiscan.py @@ -0,0 +1,94 @@ +import os +import cv2 +import json +import jsonlines +import argparse +import os.path as osp +import shutil + +def process_scene_folders(base_dir, frame_interval=10): + base_dir=osp.join(base_dir, 'scenes') + scene_folders = [d for d in os.listdir(base_dir) if os.path.isdir(os.path.join(base_dir, d))] + + for scene_folder in scene_folders: + scene_path = os.path.join(base_dir, scene_folder) + video_path = os.path.join(scene_path, f"{scene_folder}.mp4") + jsonl_path = os.path.join(scene_path, f"{scene_folder}.jsonl") + frame_output_dir = os.path.join(scene_path, "sequence") + frame_ids_txt_path = os.path.join(scene_path, "frame_ids.txt") + metadata_output_path = os.path.join(scene_path, "poses.jsonl") + + if os.path.exists(frame_output_dir): + shutil.rmtree(frame_output_dir) + os.makedirs(frame_output_dir) + + if not os.path.exists(video_path): + print(f"Video file not found: {video_path}") + continue + if not os.path.exists(jsonl_path): + print(f"Metadata file not found: {jsonl_path}") + continue + + print(f"Processing scene: {scene_folder}") + + frame_ids = extract_frames_from_video(video_path, frame_output_dir, frame_interval) + + with open(frame_ids_txt_path, "w") as f: + for frame_id in frame_ids: + f.write(f"{frame_id}\n") + + selected_metadata = extract_metadata_by_line_number(jsonl_path, frame_ids) + + with jsonlines.open(metadata_output_path, mode="w") as writer: + for entry in selected_metadata: + writer.write(entry) + + print(f"Finished processing scene: {scene_folder}") + + +def extract_frames_from_video(video_path, 
output_dir, frame_interval): + + cap = cv2.VideoCapture(video_path) + if not cap.isOpened(): + raise ValueError(f"Could not open video file: {video_path}") + + frame_ids = [] + frame_count = 0 + + while True: + ret, frame = cap.read() + if not ret: + break # End of video + + if frame_count % frame_interval == 0: + frame_id = frame_count + frame_ids.append(frame_id) + output_path = os.path.join(output_dir, f"frame-{frame_id}.color.jpg") + cv2.imwrite(output_path, frame) # Save frame as an image + + frame_count += 1 + + cap.release() + return frame_ids + + +def extract_metadata_by_line_number(jsonl_path, line_numbers): + + selected_metadata = [] + + with jsonlines.open(jsonl_path) as reader: + for line_idx, entry in enumerate(reader): + if line_idx in line_numbers: + entry["frame_id"] = line_idx + selected_metadata.append(entry) + + return selected_metadata + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Process scene folders.") + parser.add_argument("--base_dir", type=str, required=True, help="Base dataset directory.") + parser.add_argument("--frame_interval", type=int, default=10, help="Interval for saving frames.") + args = parser.parse_args() + + process_scene_folders(args.base_dir, args.frame_interval) \ No newline at end of file diff --git a/prepare_data/structured3d/generate_ply.py b/prepare_data/structured3d/generate_ply.py new file mode 100644 index 0000000..19b3cd9 --- /dev/null +++ b/prepare_data/structured3d/generate_ply.py @@ -0,0 +1,366 @@ +import os +import cv2 +import numpy as np +import open3d as o3d +from plyfile import PlyData, PlyElement +import json +import argparse +import misc.utils +BASE_PATH = "/Users/gauravpradeep/CrossOver_ScaleUp/Structured3D/scans/" + + +def create_color_palette(): + """Returns the NYU40 colormap mapping RGB to class indices.""" + return [ + (0, 0, 0), # Unlabeled (0) + (174, 199, 232), # wall (1) + (152, 223, 138), # floor (2) + (31, 119, 180), # cabinet (3) + (255, 187, 120), # bed (4) + (188, 189, 34), # chair (5) + (140, 86, 75), # sofa (6) + (255, 152, 150), # table (7) + (214, 39, 40), # door (8) + (197, 176, 213), # window (9) + (148, 103, 189), # bookshelf (10) + (196, 156, 148), # picture (11) + (23, 190, 207), # counter (12) + (178, 76, 76), + (247, 182, 210), # desk (14) + (66, 188, 102), + (219, 219, 141), # curtain (16) + (140, 57, 197), + (202, 185, 52), + (51, 176, 203), + (200, 54, 131), + (92, 193, 61), + (78, 71, 183), + (172, 114, 82), + (255, 127, 14), # refrigerator (25) + (91, 163, 138), + (153, 98, 156), + (140, 153, 101), + (158, 218, 229), # shower curtain (28) + (100, 125, 154), + (178, 127, 135), + (120, 185, 128), + (146, 111, 194), + (44, 160, 44), # toilet (33) + (112, 128, 144), # sink (34) + (96, 207, 209), + (227, 119, 194), # bathtub (36) + (213, 92, 176), + (94, 106, 211), + (82, 84, 163), # otherfurn (39) + (100, 85, 144) + ] + +def normalize(vector): + return vector / np.linalg.norm(vector) + +def parse_camera_info(camera_info, height, width): + """ extract intrinsic and extrinsic matrix + """ + lookat = normalize(camera_info[3:6]) + up = normalize(camera_info[6:9]) + + W = lookat + U = np.cross(W, up) + V = np.cross(W, U) + + rot = np.vstack((U, V, W)) + + trans = camera_info[:3] + + xfov = camera_info[9] + yfov = camera_info[10] + + K = np.diag([1, 1, 1]) + + K[0, 2] = width / 2 + K[1, 2] = height / 2 + + K[0, 0] = K[0, 2] / np.tan(xfov) + K[1, 1] = K[1, 2] / np.tan(yfov) + + return rot, trans, K + +def point_inside_bbox(point, bbox_corners): + """Check if a point is inside a 
3D bounding box defined by its 8 corners.""" + min_coords = np.min(bbox_corners, axis=0) + max_coords = np.max(bbox_corners, axis=0) + + return np.all(min_coords <= point) and np.all(point <= max_coords) + +def load_bounding_boxes(bbox_json_path): + """Load 3D bounding boxes from a JSON file.""" + with open(bbox_json_path, 'r') as f: + bboxes = json.load(f) + return bboxes + +def rgb_to_nyu40id(rgb_image): + """Convert RGB values from `semantic.png` to corresponding NYU40 IDs.""" + palette = create_color_palette() + color_to_id = {color: idx for idx, color in enumerate(palette)} + + h, w, _ = rgb_image.shape + rgb_flatten = rgb_image.reshape(-1, 3) + + # Convert each RGB value to corresponding NYU40 ID + nyu40_ids = np.array([color_to_id.get(tuple(rgb), 0) for rgb in rgb_flatten], dtype=np.int32) + + return nyu40_ids.reshape(h, w) + + +def save_ply_with_labels(filename, pointcloud, object_ids, nyu40_ids): + """Save PLY file with object_id and nyu40id.""" + points = np.asarray(pointcloud.points) + colors = (np.asarray(pointcloud.colors) * 255).astype(np.uint8) if pointcloud.has_colors() else np.zeros_like(points, dtype=np.uint8) + + vertex_data = np.array( + list(zip( + points[:, 0], points[:, 1], points[:, 2], # x, y, z + colors[:, 0], colors[:, 1], colors[:, 2], # red, green, blue + np.full(len(points), 255, dtype=np.uint8), # alpha + object_ids, # Object ID + nyu40_ids # NYU40 Semantic ID + )), + dtype=[ + ('x', 'f4'), ('y', 'f4'), ('z', 'f4'), + ('red', 'u1'), ('green', 'u1'), ('blue', 'u1'), ('alpha', 'u1'), + ('object_id', 'i4'), + ('nyu40id', 'i4') + ] + ) + + el = PlyElement.describe(vertex_data, 'vertex') + PlyData([el], text=False).write(filename) + +def process_room(scene_id, room_id, room_path): + """Processes a single room by merging all views and generating a 3D mesh.""" + pcd_list = [] + object_ids_list = [] + nyu40_ids_list = [] + + # Iterate over all views in the room + for view_id in sorted(os.listdir(room_path)): + view_path = os.path.join(room_path, view_id) + + rgb_image_path = os.path.join(view_path, "rgb_rawlight.png") + depth_image_path = os.path.join(view_path, "depth.png") + camera_path = os.path.join(view_path, "camera_pose.txt") + # instance_image_path = os.path.join(view_path, "instance.png") + semantic_image_path = os.path.join(view_path, "semantic.png") + + if not all(os.path.exists(p) for p in [rgb_image_path, depth_image_path, camera_path, semantic_image_path]): + print(f"Skipping Scene {scene_id}, Room {room_id}, View {view_id}: Missing files") + continue + + print(f"Processing Scene {scene_id}, Room {room_id}, View {view_id}...") + + color = cv2.imread(rgb_image_path) + # cv2.imshow("color", color) + # cv2.waitKey(0) + # color = cv2.cvtColor(color, cv2.COLOR_BGR2RGB) + depth = cv2.imread(depth_image_path, cv2.IMREAD_UNCHANGED).astype(np.float32) / 1000.0 # Convert mm to meters + # instance = cv2.imread(instance_image_path, cv2.IMREAD_UNCHANGED) # Object ID image + semantic = cv2.imread(semantic_image_path) # Read as BGR + semantic = cv2.cvtColor(semantic, cv2.COLOR_BGR2RGB) # Convert to RGB + + nyu40_id_image = rgb_to_nyu40id(semantic) + + valid_mask = depth.flatten() > 0 + # object_ids = instance.flatten()[valid_mask] + nyu40_ids = nyu40_id_image.flatten()[valid_mask] + + height, width = color.shape[:2] + camera_info = np.loadtxt(camera_path) + rot, trans, K = parse_camera_info(camera_info, height, width) + trans = np.array(trans) / 1000 + + + color_o3d = o3d.geometry.Image(color) + depth_o3d = o3d.geometry.Image(depth) + rgbd_image = 
o3d.geometry.RGBDImage.create_from_color_and_depth( + color_o3d, depth_o3d, depth_scale=1.0, depth_trunc=10.0, convert_rgb_to_intensity=False + ) + extrinsic = np.eye(4) + extrinsic[:3, :3] = rot.T + extrinsic[:3, -1] = trans + extrinsic = np.linalg.inv(extrinsic) + + intrinsic = o3d.camera.PinholeCameraIntrinsic(width, height, K[0][0], K[1][1], K[0][2], K[1][2]) + pointcloud = o3d.geometry.PointCloud.create_from_rgbd_image(rgbd_image, intrinsic, extrinsic) + + pcd_list.append(pointcloud) + # object_ids_list.append(object_ids) + nyu40_ids_list.append(nyu40_ids) + + if not pcd_list: + print(f"Skipping Scene {scene_id}, Room {room_id}: No valid views.") + return + + pcd_combined = pcd_list[0] + for pcd in pcd_list[1:]: + pcd_combined += pcd + + object_ids_combined = np.array([-1]*len(np.asarray(pcd_combined.points)), dtype=int) # Initialize object IDs + + # Efficient assignment of object IDs based on bounding box inclusion + points = np.asarray(pcd_combined.points) + colors = np.asarray(pcd_combined.colors) + + + bboxes_json_path = os.path.join(BASE_PATH, scene_id, "bbox_3d.json") + bboxes = load_bounding_boxes(bboxes_json_path) + for idx, bbox in enumerate(bboxes): + basis = np.array(bbox['basis']) + coeffs = np.array(bbox['coeffs']) + centroid = np.array(bbox['centroid']) + bbox_corners = misc.utils.get_corners_of_bb3d_no_index(basis, coeffs, centroid) # 8 corners of the bounding box + bbox_corners = bbox_corners / 1000 + # Create mask for points inside this bounding box + box_min = np.min(bbox_corners, axis=0, keepdims=True) + box_max = np.max(bbox_corners, axis=0, keepdims=True) + # print(min_corner, max_corner) + # print(points) + # mask = np.all((points >= box_min) & (points <= max_corner), axis=1) + point_max_mask = np.all(points < box_max, axis=1) + point_min_mask = np.all(points > box_min, axis=1) + point_mask = np.logical_and(point_max_mask, point_min_mask) + points_in_bbox = points[point_mask] + # print(points_in_bbox.shape) + # if points_in_bbox.shape[0] != 0: + # print(bbox['ID']) + # colors_in_bbox = colors[mask] + # object_pcd = o3d.geometry.PointCloud() + # object_pcd.points = o3d.utility.Vector3dVector(points_in_bbox) + # object_pcd.colors = o3d.utility.Vector3dVector(colors_in_bbox) + # o3d.visualization.draw_geometries([object_pcd]) + # print(np.all(points>=min_corner, axis=1)) + # Assign object ID to points inside this bounding box + object_ids_combined[point_mask] = bbox['ID'] + # o3d.visualization.draw_geometries([pcd_combined]) + + + nyu40_ids_combined = np.concatenate(nyu40_ids_list) + # print(np.unique(object_ids_combined)) + # Save the mesh file + output_dir = os.path.join(BASE_PATH, scene_id, "3D_rendering", room_id) + os.makedirs(output_dir, exist_ok=True) + ply_filename = os.path.join(output_dir, "room_mesh.ply") + + save_ply_with_labels(ply_filename, pcd_combined, object_ids_combined, nyu40_ids_combined) + print(f"Saved mesh for Scene {scene_id}, Room {room_id} -> {ply_filename}") + + +# if __name__ == '__main__': +# for scene_id in sorted(os.listdir(BASE_PATH)): +# scene_path = os.path.join(BASE_PATH, scene_id, "2D_rendering") +# if not os.path.isdir(scene_path): +# continue + +# for room_id in sorted(os.listdir(scene_path)): +# room_path = os.path.join(scene_path, room_id, "perspective", "full") +# if os.path.isdir(room_path): + # process_room(scene_id, room_id, room_path) +def parse_args(): + parser = argparse.ArgumentParser(description='Generate PLY files from Structured3D dataset') + parser.add_argument('--base_path', type=str, 
default="/Users/gauravpradeep/CrossOver_ScaleUp/Structured3D/scans/", + help='Base path to the Structured3D dataset') + return parser.parse_args() + +if __name__ == '__main__': + args = parse_args() + BASE_PATH = args.base_path + + for scene_id in sorted(os.listdir(BASE_PATH)): + scene_path = os.path.join(BASE_PATH, scene_id, "2D_rendering") + if not os.path.isdir(scene_path): + continue + + for room_id in sorted(os.listdir(scene_path)): + room_path = os.path.join(scene_path, room_id, "perspective", "full") + if os.path.isdir(room_path): + process_room(scene_id, room_id, room_path) +# --------------------------------------- +# instance image based object id assignment +# --------------------------------------- + +# def process_room(scene_id, room_id, room_path): +# """Processes a single room by merging all views and generating a 3D mesh.""" +# pcd_list = [] +# object_ids_list = [] +# nyu40_ids_list = [] + +# # Iterate over all views in the room +# for view_id in sorted(os.listdir(room_path)): +# view_path = os.path.join(room_path, view_id) + +# rgb_image_path = os.path.join(view_path, "rgb_rawlight.png") +# depth_image_path = os.path.join(view_path, "depth.png") +# camera_path = os.path.join(view_path, "camera_pose.txt") +# instance_image_path = os.path.join(view_path, "instance.png") +# semantic_image_path = os.path.join(view_path, "semantic.png") + +# if not all(os.path.exists(p) for p in [rgb_image_path, depth_image_path, camera_path, instance_image_path, semantic_image_path]): +# print(f"Skipping Scene {scene_id}, Room {room_id}, View {view_id}: Missing files") +# continue + +# print(f"Processing Scene {scene_id}, Room {room_id}, View {view_id}...") + +# color = cv2.imread(rgb_image_path) +# depth = cv2.imread(depth_image_path, cv2.IMREAD_UNCHANGED).astype(np.float32) / 1000.0 # Convert mm to meters +# instance = cv2.imread(instance_image_path, cv2.IMREAD_UNCHANGED) # Object ID image +# semantic = cv2.imread(semantic_image_path) # Read as BGR +# semantic = cv2.cvtColor(semantic, cv2.COLOR_BGR2RGB) # Convert to RGB + +# nyu40_id_image = rgb_to_nyu40id(semantic) + +# valid_mask = depth.flatten() > 0 +# object_ids = instance.flatten()[valid_mask] +# nyu40_ids = nyu40_id_image.flatten()[valid_mask] + +# height, width = color.shape[:2] +# camera_info = np.loadtxt(camera_path) +# rot, trans, K = parse_camera_info(camera_info, height, width) +# trans = np.array(trans) / 1000 + + +# color_o3d = o3d.geometry.Image(color) +# depth_o3d = o3d.geometry.Image(depth) +# rgbd_image = o3d.geometry.RGBDImage.create_from_color_and_depth( +# color_o3d, depth_o3d, depth_scale=1.0, depth_trunc=10.0, convert_rgb_to_intensity=False +# ) +# extrinsic = np.eye(4) +# extrinsic[:3, :3] = rot.T +# extrinsic[:3, -1] = trans +# extrinsic = np.linalg.inv(extrinsic) + +# intrinsic = o3d.camera.PinholeCameraIntrinsic(width, height, K[0][0], K[1][1], K[0][2], K[1][2]) +# pointcloud = o3d.geometry.PointCloud.create_from_rgbd_image(rgbd_image, intrinsic, extrinsic) + +# pcd_list.append(pointcloud) +# object_ids_list.append(object_ids) +# nyu40_ids_list.append(nyu40_ids) + +# if not pcd_list: +# print(f"Skipping Scene {scene_id}, Room {room_id}: No valid views.") +# return + +# pcd_combined = pcd_list[0] +# for pcd in pcd_list[1:]: +# pcd_combined += pcd +# # o3d.visualization.draw_geometries([pcd_combined]) + +# object_ids_combined = np.concatenate(object_ids_list) +# nyu40_ids_combined = np.concatenate(nyu40_ids_list) + +# # Save the mesh file +# output_dir = os.path.join(BASE_PATH, scene_id, "3D_rendering", room_id) +# 
os.makedirs(output_dir, exist_ok=True) +# ply_filename = os.path.join(output_dir, "room_mesh.ply") + +# save_ply_with_labels(ply_filename, pcd_combined, object_ids_combined, nyu40_ids_combined) +# print(f"Saved mesh for Scene {scene_id}, Room {room_id} -> {ply_filename}") + diff --git a/prepare_data/structured3d/misc/colors.py b/prepare_data/structured3d/misc/colors.py new file mode 100644 index 0000000..191f845 --- /dev/null +++ b/prepare_data/structured3d/misc/colors.py @@ -0,0 +1,47 @@ +semantics_cmap = { + 'living room': '#e6194b', + 'kitchen': '#3cb44b', + 'bedroom': '#ffe119', + 'bathroom': '#0082c8', + 'balcony': '#f58230', + 'corridor': '#911eb4', + 'dining room': '#46f0f0', + 'study': '#f032e6', + 'studio': '#d2f53c', + 'store room': '#fabebe', + 'garden': '#008080', + 'laundry room': '#e6beff', + 'office': '#aa6e28', + 'basement': '#fffac8', + 'garage': '#800000', + 'undefined': '#aaffc3', + 'door': '#808000', + 'window': '#ffd7b4', + 'outwall': '#000000', +} + + +colormap_255 = [ + [230, 25, 75], + [ 60, 180, 75], + [255, 225, 25], + [ 0, 130, 200], + [245, 130, 48], + [145, 30, 180], + [ 70, 240, 240], + [240, 50, 230], + [210, 245, 60], + [250, 190, 190], + [ 0, 128, 128], + [230, 190, 255], + [170, 110, 40], + [255, 250, 200], + [128, 0, 0], + [170, 255, 195], + [128, 128, 0], + [255, 215, 180], + [ 0, 0, 128], + [128, 128, 128], + [255, 255, 255], + [ 0, 0, 0] +] \ No newline at end of file diff --git a/prepare_data/structured3d/misc/figures.py b/prepare_data/structured3d/misc/figures.py new file mode 100644 index 0000000..013acbf --- /dev/null +++ b/prepare_data/structured3d/misc/figures.py @@ -0,0 +1,78 @@ +""" +Copy from https://github.com/Toblerity/Shapely/blob/master/docs/code/figures.py +""" + +from math import sqrt +from shapely import affinity + +GM = (sqrt(5)-1.0)/2.0 +W = 8.0 +H = W*GM +SIZE = (W, H) + +BLUE = '#6699cc' +GRAY = '#999999' +DARKGRAY = '#333333' +YELLOW = '#ffcc33' +GREEN = '#339933' +RED = '#ff3333' +BLACK = '#000000' + +COLOR_ISVALID = { + True: BLUE, + False: RED, +} + + +def plot_line(ax, ob, color=GRAY, zorder=1, linewidth=3, alpha=1): + x, y = ob.xy + ax.plot(x, y, color=color, linewidth=linewidth, solid_capstyle='round', zorder=zorder, alpha=alpha) + + +def plot_coords(ax, ob, color=BLACK, zorder=1, alpha=1): + x, y = ob.xy + ax.plot(x, y, color=color, zorder=zorder, alpha=alpha) + + +def color_isvalid(ob, valid=BLUE, invalid=RED): + if ob.is_valid: + return valid + else: + return invalid + + +def color_issimple(ob, simple=BLUE, complex=YELLOW): + if ob.is_simple: + return simple + else: + return complex + + +def plot_line_isvalid(ax, ob, **kwargs): + kwargs["color"] = color_isvalid(ob) + plot_line(ax, ob, **kwargs) + + +def plot_line_issimple(ax, ob, **kwargs): + kwargs["color"] = color_issimple(ob) + plot_line(ax, ob, **kwargs) + + +def plot_bounds(ax, ob, zorder=1, alpha=1): + x, y = zip(*list((p.x, p.y) for p in ob.boundary)) + ax.plot(x, y, 'o', color=BLACK, zorder=zorder, alpha=alpha) + + +def add_origin(ax, geom, origin): + x, y = xy = affinity.interpret_origin(geom, origin, 2) + ax.plot(x, y, 'o', color=GRAY, zorder=1) + ax.annotate(str(xy), xy=xy, ha='center', + textcoords='offset points', xytext=(0, 8)) + + +def set_limits(ax, x0, xN, y0, yN): + ax.set_xlim(x0, xN) + ax.set_xticks(range(x0, xN+1)) + ax.set_ylim(y0, yN) + ax.set_yticks(range(y0, yN+1)) + ax.set_aspect("equal") \ No newline at end of file diff --git a/prepare_data/structured3d/misc/panorama.py b/prepare_data/structured3d/misc/panorama.py new file mode 100644 index 
0000000..ba2feef --- /dev/null +++ b/prepare_data/structured3d/misc/panorama.py @@ -0,0 +1,243 @@ +""" +Copy from https://github.com/sunset1995/pytorch-layoutnet/blob/master/pano.py +""" +import numpy as np +import numpy.matlib as matlib + + +def xyz_2_coorxy(xs, ys, zs, H=512, W=1024): + us = np.arctan2(xs, ys) + vs = -np.arctan(zs / np.sqrt(xs**2 + ys**2)) + coorx = (us / (2 * np.pi) + 0.5) * W + coory = (vs / np.pi + 0.5) * H + return coorx, coory + + +def coords2uv(coords, width, height): + """ + Image coordinates (xy) to uv + """ + middleX = width / 2 + 0.5 + middleY = height / 2 + 0.5 + uv = np.hstack([ + (coords[:, [0]] - middleX) / width * 2 * np.pi, + -(coords[:, [1]] - middleY) / height * np.pi]) + return uv + + +def uv2xyzN(uv, planeID=1): + ID1 = (int(planeID) - 1 + 0) % 3 + ID2 = (int(planeID) - 1 + 1) % 3 + ID3 = (int(planeID) - 1 + 2) % 3 + xyz = np.zeros((uv.shape[0], 3)) + xyz[:, ID1] = np.cos(uv[:, 1]) * np.sin(uv[:, 0]) + xyz[:, ID2] = np.cos(uv[:, 1]) * np.cos(uv[:, 0]) + xyz[:, ID3] = np.sin(uv[:, 1]) + return xyz + + +def uv2xyzN_vec(uv, planeID): + """ + vectorization version of uv2xyzN + @uv N x 2 + @planeID N + """ + assert (planeID.astype(int) != planeID).sum() == 0 + planeID = planeID.astype(int) + ID1 = (planeID - 1 + 0) % 3 + ID2 = (planeID - 1 + 1) % 3 + ID3 = (planeID - 1 + 2) % 3 + ID = np.arange(len(uv)) + xyz = np.zeros((len(uv), 3)) + xyz[ID, ID1] = np.cos(uv[:, 1]) * np.sin(uv[:, 0]) + xyz[ID, ID2] = np.cos(uv[:, 1]) * np.cos(uv[:, 0]) + xyz[ID, ID3] = np.sin(uv[:, 1]) + return xyz + + +def xyz2uvN(xyz, planeID=1): + ID1 = (int(planeID) - 1 + 0) % 3 + ID2 = (int(planeID) - 1 + 1) % 3 + ID3 = (int(planeID) - 1 + 2) % 3 + normXY = np.sqrt(xyz[:, [ID1]] ** 2 + xyz[:, [ID2]] ** 2) + normXY[normXY < 0.000001] = 0.000001 + normXYZ = np.sqrt(xyz[:, [ID1]] ** 2 + xyz[:, [ID2]] ** 2 + xyz[:, [ID3]] ** 2) + v = np.arcsin(xyz[:, [ID3]] / normXYZ) + u = np.arcsin(xyz[:, [ID1]] / normXY) + valid = (xyz[:, [ID2]] < 0) & (u >= 0) + u[valid] = np.pi - u[valid] + valid = (xyz[:, [ID2]] < 0) & (u <= 0) + u[valid] = -np.pi - u[valid] + uv = np.hstack([u, v]) + uv[np.isnan(uv[:, 0]), 0] = 0 + return uv + + +def computeUVN(n, in_, planeID): + """ + compute v given u and normal. 
+ """ + if planeID == 2: + n = np.array([n[1], n[2], n[0]]) + elif planeID == 3: + n = np.array([n[2], n[0], n[1]]) + bc = n[0] * np.sin(in_) + n[1] * np.cos(in_) + bs = n[2] + out = np.arctan(-bc / (bs + 1e-9)) + return out + + +def computeUVN_vec(n, in_, planeID): + """ + vectorization version of computeUVN + @n N x 3 + @in_ MN x 1 + @planeID N + """ + n = n.copy() + if (planeID == 2).sum(): + n[planeID == 2] = np.roll(n[planeID == 2], 2, axis=1) + if (planeID == 3).sum(): + n[planeID == 3] = np.roll(n[planeID == 3], 1, axis=1) + n = np.repeat(n, in_.shape[0] // n.shape[0], axis=0) + assert n.shape[0] == in_.shape[0] + bc = n[:, [0]] * np.sin(in_) + n[:, [1]] * np.cos(in_) + bs = n[:, [2]] + out = np.arctan(-bc / (bs + 1e-9)) + return out + + +def lineFromTwoPoint(pt1, pt2): + """ + Generate line segment based on two points on panorama + pt1, pt2: two points on panorama + line: + 1~3-th dim: normal of the line + 4-th dim: the projection dimension ID + 5~6-th dim: the u of line segment endpoints in projection plane + """ + numLine = pt1.shape[0] + lines = np.zeros((numLine, 6)) + n = np.cross(pt1, pt2) + n = n / (matlib.repmat(np.sqrt(np.sum(n ** 2, 1, keepdims=True)), 1, 3) + 1e-9) + lines[:, 0:3] = n + + areaXY = np.abs(np.sum(n * matlib.repmat([0, 0, 1], numLine, 1), 1, keepdims=True)) + areaYZ = np.abs(np.sum(n * matlib.repmat([1, 0, 0], numLine, 1), 1, keepdims=True)) + areaZX = np.abs(np.sum(n * matlib.repmat([0, 1, 0], numLine, 1), 1, keepdims=True)) + planeIDs = np.argmax(np.hstack([areaXY, areaYZ, areaZX]), axis=1) + 1 + lines[:, 3] = planeIDs + + for i in range(numLine): + uv = xyz2uvN(np.vstack([pt1[i, :], pt2[i, :]]), lines[i, 3]) + umax = uv[:, 0].max() + np.pi + umin = uv[:, 0].min() + np.pi + if umax - umin > np.pi: + lines[i, 4:6] = np.array([umax, umin]) / 2 / np.pi + else: + lines[i, 4:6] = np.array([umin, umax]) / 2 / np.pi + + return lines + + +def lineIdxFromCors(cor_all, im_w, im_h): + assert len(cor_all) % 2 == 0 + uv = coords2uv(cor_all, im_w, im_h) + xyz = uv2xyzN(uv) + lines = lineFromTwoPoint(xyz[0::2], xyz[1::2]) + num_sample = max(im_h, im_w) + + cs, rs = [], [] + for i in range(lines.shape[0]): + n = lines[i, 0:3] + sid = lines[i, 4] * 2 * np.pi + eid = lines[i, 5] * 2 * np.pi + if eid < sid: + x = np.linspace(sid, eid + 2 * np.pi, num_sample) + x = x % (2 * np.pi) + else: + x = np.linspace(sid, eid, num_sample) + + u = -np.pi + x.reshape(-1, 1) + v = computeUVN(n, u, lines[i, 3]) + xyz = uv2xyzN(np.hstack([u, v]), lines[i, 3]) + uv = xyz2uvN(xyz, 1) + + r = np.minimum(np.floor((uv[:, 0] + np.pi) / (2 * np.pi) * im_w) + 1, + im_w).astype(np.int32) + c = np.minimum(np.floor((np.pi / 2 - uv[:, 1]) / np.pi * im_h) + 1, + im_h).astype(np.int32) + cs.extend(r - 1) + rs.extend(c - 1) + return rs, cs + + +def draw_boundary_from_cor_id(cor_id, img_src): + im_h, im_w = img_src.shape[:2] + cor_all = [cor_id] + for i in range(len(cor_id)): + cor_all.append(cor_id[i, :]) + cor_all.append(cor_id[(i+2) % len(cor_id), :]) + cor_all = np.vstack(cor_all) + + rs, cs = lineIdxFromCors(cor_all, im_w, im_h) + rs = np.array(rs) + cs = np.array(cs) + + panoEdgeC = img_src.astype(np.uint8) + for dx, dy in [[-1, 0], [1, 0], [0, 0], [0, 1], [0, -1]]: + panoEdgeC[np.clip(rs + dx, 0, im_h - 1), np.clip(cs + dy, 0, im_w - 1), 0] = 0 + panoEdgeC[np.clip(rs + dx, 0, im_h - 1), np.clip(cs + dy, 0, im_w - 1), 1] = 0 + panoEdgeC[np.clip(rs + dx, 0, im_h - 1), np.clip(cs + dy, 0, im_w - 1), 2] = 255 + + return panoEdgeC + + +def coorx2u(x, w=1024): + return ((x + 0.5) / w - 0.5) * 2 * np.pi + 
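+# Note: coorx2u / coory2v map equirectangular pixel coordinates to spherical
+# angles (column x -> azimuth u in (-pi, pi), row y -> elevation v in
+# (-pi/2, pi/2)); u2coorx / v2coory are the corresponding inverse mappings.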
+ +def coory2v(y, h=512): + return ((y + 0.5) / h - 0.5) * np.pi + + +def u2coorx(u, w=1024): + return (u / (2 * np.pi) + 0.5) * w - 0.5 + + +def v2coory(v, h=512): + return (v / np.pi + 0.5) * h - 0.5 + + +def uv2xy(u, v, z=-50): + c = z / np.tan(v) + x = c * np.cos(u) + y = c * np.sin(u) + return x, y + + +def pano_connect_points(p1, p2, z=-50, w=1024, h=512): + u1 = coorx2u(p1[0], w) + v1 = coory2v(p1[1], h) + u2 = coorx2u(p2[0], w) + v2 = coory2v(p2[1], h) + + x1, y1 = uv2xy(u1, v1, z) + x2, y2 = uv2xy(u2, v2, z) + + if abs(p1[0] - p2[0]) < w / 2: + pstart = np.ceil(min(p1[0], p2[0])) + pend = np.floor(max(p1[0], p2[0])) + else: + pstart = np.ceil(max(p1[0], p2[0])) + pend = np.floor(min(p1[0], p2[0]) + w) + coorxs = (np.arange(pstart, pend + 1) % w).astype(np.float64) + vx = x2 - x1 + vy = y2 - y1 + us = coorx2u(coorxs, w) + ps = (np.tan(us) * x1 - y1) / (vy - np.tan(us) * vx) + cs = np.sqrt((x1 + ps * vx) ** 2 + (y1 + ps * vy) ** 2) + vs = np.arctan2(z, cs) + coorys = v2coory(vs) + + return np.stack([coorxs, coorys], axis=-1) \ No newline at end of file diff --git a/prepare_data/structured3d/misc/utils.py b/prepare_data/structured3d/misc/utils.py new file mode 100644 index 0000000..93c63f9 --- /dev/null +++ b/prepare_data/structured3d/misc/utils.py @@ -0,0 +1,138 @@ +""" +Adapted from https://github.com/thusiyuan/cooperative_scene_parsing/blob/master/utils/sunrgbd_utils.py +""" +import numpy as np + + +def normalize(vector): + return vector / np.linalg.norm(vector) + + +def parse_camera_info(camera_info, height, width): + """ extract intrinsic and extrinsic matrix + """ + lookat = normalize(camera_info[3:6]) + up = normalize(camera_info[6:9]) + + W = lookat + U = np.cross(W, up) + V = -np.cross(W, U) + + rot = np.vstack((U, V, W)) + trans = camera_info[:3] + + xfov = camera_info[9] + yfov = camera_info[10] + + K = np.diag([1, 1, 1]) + + K[0, 2] = width / 2 + K[1, 2] = height / 2 + + K[0, 0] = K[0, 2] / np.tan(xfov) + K[1, 1] = K[1, 2] / np.tan(yfov) + + return rot, trans, K + + +def flip_towards_viewer(normals, points): + points = points / np.linalg.norm(points) + proj = points.dot(normals[:2, :].T) + flip = np.where(proj > 0) + normals[flip, :] = -normals[flip, :] + return normals + + +def get_corners_of_bb3d(basis, coeffs, centroid): + corners = np.zeros((8, 3)) + # order the basis + index = np.argsort(np.abs(basis[:, 0]))[::-1] + # the case that two same value appear the same time + if index[2] != 2: + index[1:] = index[1:][::-1] + basis = basis[index, :] + coeffs = coeffs[index] + # Now, we know the basis vectors are orders X, Y, Z. 
Next, flip the basis vectors towards the viewer + basis = flip_towards_viewer(basis, centroid) + coeffs = np.abs(coeffs) + corners[0, :] = -basis[0, :] * coeffs[0] + basis[1, :] * coeffs[1] + basis[2, :] * coeffs[2] + corners[1, :] = basis[0, :] * coeffs[0] + basis[1, :] * coeffs[1] + basis[2, :] * coeffs[2] + corners[2, :] = basis[0, :] * coeffs[0] + -basis[1, :] * coeffs[1] + basis[2, :] * coeffs[2] + corners[3, :] = -basis[0, :] * coeffs[0] + -basis[1, :] * coeffs[1] + basis[2, :] * coeffs[2] + + corners[4, :] = -basis[0, :] * coeffs[0] + basis[1, :] * coeffs[1] + -basis[2, :] * coeffs[2] + corners[5, :] = basis[0, :] * coeffs[0] + basis[1, :] * coeffs[1] + -basis[2, :] * coeffs[2] + corners[6, :] = basis[0, :] * coeffs[0] + -basis[1, :] * coeffs[1] + -basis[2, :] * coeffs[2] + corners[7, :] = -basis[0, :] * coeffs[0] + -basis[1, :] * coeffs[1] + -basis[2, :] * coeffs[2] + corners = corners + np.tile(centroid, (8, 1)) + return corners + + +def get_corners_of_bb3d_no_index(basis, coeffs, centroid): + corners = np.zeros((8, 3)) + coeffs = np.abs(coeffs) + corners[0, :] = -basis[0, :] * coeffs[0] + basis[1, :] * coeffs[1] + basis[2, :] * coeffs[2] + corners[1, :] = basis[0, :] * coeffs[0] + basis[1, :] * coeffs[1] + basis[2, :] * coeffs[2] + corners[2, :] = basis[0, :] * coeffs[0] + -basis[1, :] * coeffs[1] + basis[2, :] * coeffs[2] + corners[3, :] = -basis[0, :] * coeffs[0] + -basis[1, :] * coeffs[1] + basis[2, :] * coeffs[2] + + corners[4, :] = -basis[0, :] * coeffs[0] + basis[1, :] * coeffs[1] + -basis[2, :] * coeffs[2] + corners[5, :] = basis[0, :] * coeffs[0] + basis[1, :] * coeffs[1] + -basis[2, :] * coeffs[2] + corners[6, :] = basis[0, :] * coeffs[0] + -basis[1, :] * coeffs[1] + -basis[2, :] * coeffs[2] + corners[7, :] = -basis[0, :] * coeffs[0] + -basis[1, :] * coeffs[1] + -basis[2, :] * coeffs[2] + + corners = corners + np.tile(centroid, (8, 1)) + return corners + + +def project_3d_points_to_2d(points3d, R_ex, K): + """ + Project 3d points from camera-centered coordinate to 2D image plane + Parameters + ---------- + points3d: numpy array + 3d location of point + R_ex: numpy array + extrinsic camera parameter + K: numpy array + intrinsic camera parameter + Returns + ------- + points2d: numpy array + 2d location of the point + """ + points3d = R_ex.dot(points3d.T).T + x3 = points3d[:, 0] + y3 = -points3d[:, 1] + z3 = np.abs(points3d[:, 2]) + xx = x3 * K[0, 0] / z3 + K[0, 2] + yy = y3 * K[1, 1] / z3 + K[1, 2] + points2d = np.vstack((xx, yy)) + return points2d + + +def project_struct_bdb_to_2d(basis, coeffs, center, R_ex, K): + """ + Project 3d bounding box to 2d bounding box + Parameters + ---------- + basis, coeffs, center, R_ex, K + : K is the intrinsic camera parameter matrix + : Rtilt is the extrinsic camera parameter matrix in right hand coordinates + Returns + ------- + bdb2d: dict + Keys: {'x1', 'x2', 'y1', 'y2'} + The (x1, y1) position is at the top left corner, + the (x2, y2) position is at the bottom right corner + """ + corners3d = get_corners_of_bb3d(basis, coeffs, center) + corners = project_3d_points_to_2d(corners3d, R_ex, K) + bdb2d = dict() + bdb2d['x1'] = int(max(np.min(corners[0, :]), 1)) # x1 + bdb2d['y1'] = int(max(np.min(corners[1, :]), 1)) # y1 + bdb2d['x2'] = int(min(np.max(corners[0, :]), 2*K[0, 2])) # x2 + bdb2d['y2'] = int(min(np.max(corners[1, :]), 2*K[1, 2])) # y2 + # if not check_bdb(bdb2d, 2*K[0, 2], 2*K[1, 2]): + # bdb2d = None + return bdb2d \ No newline at end of file diff --git a/prepare_data/structured3d/save_floorplan.py 
b/prepare_data/structured3d/save_floorplan.py new file mode 100644 index 0000000..efa2391 --- /dev/null +++ b/prepare_data/structured3d/save_floorplan.py @@ -0,0 +1,170 @@ +import argparse +import json +import os + +import matplotlib.pyplot as plt +import numpy as np +from matplotlib import colors +from shapely.geometry import Polygon, Point +from shapely.plotting import plot_polygon + +from misc.colors import semantics_cmap +from misc.utils import get_corners_of_bb3d_no_index + +rooms = [ + "living room", + "kitchen", + "bedroom", + "bathroom", + "balcony", + "corridor", + "dining room", + "study", + "studio", + "store room", + "garden", + "laundry room", + "office", + "basement", + "garage", + "undefined" +] + +def convert_lines_to_vertices(lines): + """convert line representation to polygon vertices + """ + polygons = [] + lines = np.array(lines) + + polygon = None + while len(lines) != 0: + if polygon is None: + polygon = lines[0].tolist() + lines = np.delete(lines, 0, 0) + + lineID, juncID = np.where(lines == polygon[-1]) + vertex = lines[lineID[0], 1 - juncID[0]] + lines = np.delete(lines, lineID, 0) + + if vertex in polygon: + polygons.append(polygon) + polygon = None + else: + polygon.append(vertex) + + return polygons + + +def visualize_floorplan(scene_path): + """visualize floorplan + """ + with open(os.path.join(scene_path, "annotation_3d.json")) as file: + annos = json.load(file) + + with open(os.path.join(scene_path, "bbox_3d.json")) as file: + boxes = json.load(file) + + # extract the floor in each semantic for floorplan visualization + planes = [] + for semantic in annos['semantics']: + for planeID in semantic['planeID']: + if annos['planes'][planeID]['type'] == 'floor': + planes.append({'planeID': planeID, 'type': semantic['type'], 'room_ID': semantic['ID']}) + + if semantic['type'] == 'outwall': + outerwall_planes = semantic['planeID'] + + # extract hole vertices + lines_holes = [] + for semantic in annos['semantics']: + if semantic['type'] in ['window', 'door']: + for planeID in semantic['planeID']: + lines_holes.extend(np.where(np.array(annos['planeLineMatrix'][planeID]))[0].tolist()) + lines_holes = np.unique(lines_holes) + + # junctions on the floor + junctions = np.array([junc['coordinate'] for junc in annos['junctions']]) + junction_floor = np.where(np.isclose(junctions[:, -1], 0))[0] + + # construct each polygon + polygons = [] + for plane in planes: + lineIDs = np.where(np.array(annos['planeLineMatrix'][plane['planeID']]))[0].tolist() + junction_pairs = [np.where(np.array(annos['lineJunctionMatrix'][lineID]))[0].tolist() for lineID in lineIDs] + polygon = convert_lines_to_vertices(junction_pairs) + polygons.append([polygon[0], plane['type'], plane['room_ID']]) + + outerwall_floor = [] + for planeID in outerwall_planes: + lineIDs = np.where(np.array(annos['planeLineMatrix'][planeID]))[0].tolist() + lineIDs = np.setdiff1d(lineIDs, lines_holes) + junction_pairs = [np.where(np.array(annos['lineJunctionMatrix'][lineID]))[0].tolist() for lineID in lineIDs] + for start, end in junction_pairs: + if start in junction_floor and end in junction_floor: + outerwall_floor.append([start, end]) + + outerwall_polygon = convert_lines_to_vertices(outerwall_floor) + polygons.append([outerwall_polygon[0], 'outwall', 0]) + + junctions = np.array([junc['coordinate'][:2] for junc in annos['junctions']]) + + room_polygons = {} + for (polygon, poly_type, room_id) in polygons: + if poly_type in rooms: + if poly_type not in room_polygons: + room_polygons[room_id] = [] + 
room_polygons[room_id].append(polygon) + + floorplans_dir = os.path.join(scene_path, 'floorplans') + os.makedirs(floorplans_dir, exist_ok=True) + + for room_id, room_polys in room_polygons.items(): + fig = plt.figure() + ax = fig.add_subplot(1, 1, 1) + room_polygon_objects = [] + for polygon in room_polys: + polygon = np.array(polygon + [polygon[0], ]) + polygon = Polygon(junctions[polygon]) + room_polygon_objects.append(polygon) + room_type = next((item['type'] for item in annos['semantics'] if item['ID'] == room_id)) + plot_polygon(polygon, ax=ax, add_points=False, facecolor=semantics_cmap[room_type], alpha=0.5) + + for bbox in boxes: + basis = np.array(bbox['basis']) + coeffs = np.array(bbox['coeffs']) + centroid = np.array(bbox['centroid']) + + corners = get_corners_of_bb3d_no_index(basis, coeffs, centroid) + corners = corners[[0, 1, 2, 3, 0], :2] + + bbox_polygon = Polygon(corners) + for room_polygon in room_polygon_objects: + if room_polygon.contains(Point(centroid[:2])): + plot_polygon(bbox_polygon, ax=ax, add_points=False, facecolor=colors.rgb2hex(np.random.rand(3)), alpha=0.5) + + + plt.axis('equal') + plt.axis('off') + output_file = os.path.join(floorplans_dir, f"{room_id}.png") + plt.savefig(output_file, format='png', dpi=300, bbox_inches='tight', pad_inches=0) + plt.close(fig) + + +def parse_args(): + parser = argparse.ArgumentParser( + description="Structured3D Floorplan Visualization") + parser.add_argument("--path", required=True, + help="dataset path", metavar="DIR") + return parser.parse_args() + + +def main(): + args = parse_args() + scenes = [d for d in os.listdir(args.path) if os.path.isdir(os.path.join(args.path, d)) and d.startswith('scene_')] + for scene in scenes: + scene_path = os.path.join(args.path, scene) + visualize_floorplan(scene_path) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/prepare_data/structured3d/uni3dscene.py b/prepare_data/structured3d/uni3dscene.py new file mode 100644 index 0000000..e1c6ec7 --- /dev/null +++ b/prepare_data/structured3d/uni3dscene.py @@ -0,0 +1,417 @@ +# pylint: disable=no-member +import os +import io +import json +import pickle +from typing import List, Tuple, Dict +import multiprocessing +import cv2 +import numpy as np +from PIL import Image as pil_image + +from utils.config import ProcessUnit, EnvsConfig +from utils.nyu_40 import NYU40 +from utils.s3dutilize import S3DUtilize, Annotations +from utils.base_dataset import DatasetBase +import argparse + +BASE_DIR = '/Users/gauravpradeep/Crossover_ScaleUp/Structured3D/scans' +class Structured3DDataGen(DatasetBase): + """ + Dataset generation for Structured3D. + + Two separated folders will be created in target folder -- points and semantic_mask. + Points will be saved a .bin file with raw shape [N, 6] (3 for XYZ, 3 for RGB) + and data type np.float32. Semantic mask will be saved a .bin file with raw shape + [N] and data type np.int64. 
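+
+    Typical usage (a sketch mirroring the CLI entry point at the bottom of
+    this file; all paths are placeholders, and the module-level BASE_DIR must
+    also point at the scans folder):
+
+        envs = EnvsConfig()
+        envs.in_data_root = '/path/to/Structured3D/scans'
+        envs.out_data_root = '/path/to/Structured3D/uni3d_output'
+        proc_unit = ProcessUnit()
+        proc_unit.out_paths = ['points', 'semantic_mask', 'instance', 'annotations']
+        proc_unit.attrs = {'room_types': ['all']}
+        Structured3DDataGen([proc_unit], envs).format_dataset(proc_unit)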
+ """ + IMAGE_PREFIX = '/2D_rendering' + + PERSPECTIVE_PREFIX = '/perspective/full' + PRSP_CAM_FILE = 'camera_pose.txt' + + PANORAMIC_PREFIX = '/panorama/full' + PANO_CAM_PREFIX = '/panorama' + PANO_CAM_FILE = 'camera_xyz.txt' + + SEMANTIC_FILE = 'semantic.png' + RGB_FILE = 'rgb_rawlight.png' + DEPTH_FILE = 'depth.png' + + ANNO_FILE = 'bbox_3d.json' + + def __init__(self, proc_units: List[ProcessUnit], envs: EnvsConfig) -> None: + super().__init__(proc_units, envs) + self._zip_folder = BASE_DIR + + def _get_rooms_list_by_types(self, room_types: List[str]) -> List[str]: + assert len(room_types) == 1 and 'all' in room_types + scenes_list = [d for d in os.listdir(self._zip_folder) if os.path.isdir(os.path.join(self._zip_folder, d))] + rooms_list = list() + for scene_path in scenes_list: + rooms_name = os.listdir(os.path.join(self._zip_folder, scene_path, __class__.IMAGE_PREFIX.strip('/'))) + rooms_list.extend([os.path.join(scene_path, __class__.IMAGE_PREFIX.strip('/'), _r) for _r in rooms_name]) + return rooms_list + + @staticmethod + def read_camera_and_image(cam_path: str, info_flags: int, info_root: str) -> Tuple[List, List[np.ndarray]]: + """ + Read camera poses and images from the file system + + Args: + cam_path (str): the relative path of camera + info_flags (int): the flag of the type of images to be read + + Returns: + Tuple[List, List[np.ndarray]]: Camera information and a list of images + """ + if info_root is None: + info_root = cam_path[:cam_path.rfind('/')] + + out_cams = list() + if info_flags & 1: + # Load camera poses + z2y_top_m = np.array([[0, 1, 0], [0, 0, 1], [1, 0, 0]], dtype=np.float32) + with open(cam_path, 'r') as f: + cam_extr = np.fromstring(f.read(), dtype=np.float32, sep=' ') + cam_t = np.matmul(z2y_top_m, cam_extr[:3] / 1000) + if cam_extr.shape[0] > 3: + cam_r = S3DUtilize.get_rotation_matrix_from_tu(cam_extr[3:6], cam_extr[6:9]) + cam_r = np.matmul(z2y_top_m, cam_r) + cam_hf = cam_extr[9:11] + else: + cam_r = np.eye(3, dtype=np.float32) + cam_hf = None + out_cams.extend([cam_r, cam_t, cam_hf]) + out_images = list() + if info_flags & 2: + # Load depth image + depth_image = cv2.imread(os.path.join(info_root, __class__.DEPTH_FILE), cv2.IMREAD_UNCHANGED)[..., np.newaxis] + depth_image[depth_image == 0] = 65535 + out_images.append(depth_image) + if info_flags & 4: + # Load RGB image + color_image = cv2.imread(os.path.join(info_root, __class__.RGB_FILE), cv2.IMREAD_UNCHANGED)[..., :3][..., ::-1] + out_images.append(color_image) + if info_flags & 8: + # Load semantic image + smnt_image = np.array(pil_image.open(os.path.join(info_root, __class__.SEMANTIC_FILE)))[..., np.newaxis] + out_images.append(smnt_image) + return out_cams, out_images + + @staticmethod + def normal_from_cross_product(points_2d: np.ndarray) -> np.ndarray: + xyz_points_pad = np.pad(points_2d, ((0, 1), (0, 1), (0, 0)), mode='symmetric') + xyz_points_ver = (xyz_points_pad[:, :-1, :] - xyz_points_pad[:, 1:, :])[:-1, :, :] + xyz_points_hor = (xyz_points_pad[:-1, :, :] - xyz_points_pad[1:, :, :])[:, :-1, :] + xyz_normal = np.cross(xyz_points_hor, xyz_points_ver) + xyz_dist = np.linalg.norm(xyz_normal, axis=-1, keepdims=True) + xyz_normal = np.divide(xyz_normal, xyz_dist, out=np.zeros_like(xyz_normal), where=xyz_dist != 0) + return xyz_normal + + @staticmethod + def view2points_prsp(cam_paras: List[np.ndarray], attr_images: List[np.ndarray], cos_thrsh=0.15): + """ + View to 3D points casting of a single perspective image + + Args: + cam_paras (List[np.ndarray]): camera parameters + attr_images 
(List[np.ndarray]): a list of images to be casted + cos_thrsh (float, optional): the cosine threshold to filtering interpolated depth. Defaults to 0.15. + + Returns: + Tuple[np.ndarray, np.ndarray, np.ndarray] + """ + depth_img, color_img, smnt_img = attr_images + cam_r, cam_t, cam_hf = cam_paras + img_size = np.asarray(depth_img.shape[:2])[::-1] + cam_focal = img_size / 2 / np.tan(cam_hf) + cam_fov_d = S3DUtilize.get_fov_normal(img_size, cam_focal).astype(np.float32) + v_points = S3DUtilize.cast_perspective_to_local_coord(depth_img, cam_fov_d) + vi_normals = __class__.normal_from_cross_product(v_points) + + # Filtering invalid points + view_dist = np.maximum(np.linalg.norm(v_points, axis=-1, keepdims=True), float(10e-5)) + cosine_dist = np.sum((v_points * vi_normals / view_dist), axis=-1, keepdims=True) + cosine_dist = np.abs(cosine_dist) + point_valid = cosine_dist > cos_thrsh + depth_valid = depth_img < 65535 + smnt_valid = smnt_img > 0 + all_valid = (point_valid & depth_valid & smnt_valid)[..., 0] + + v_points = np.matmul(v_points / 1000, cam_r.T) + cam_t + v_normal = __class__.normal_from_cross_product(v_points) + + return v_points[all_valid], color_img[all_valid], v_normal[all_valid], smnt_img[all_valid] + + @staticmethod + def view2points_pano(cam_paras: List[np.ndarray], attr_images: List[np.ndarray], cos_thrsh=0.15): + """ + View to 3D points casting of a single panoramic image + + Args: + cam_paras (List[np.ndarray]): camera parameters + attr_images (List[np.ndarray]): a list of images to be casted + + Returns: + Tuple[np.ndarray, np.ndarray, np.ndarray] + """ + depth_img, color_img, smnt_img = attr_images + _, cam_t, _ = cam_paras + p_h, p_w = attr_images[0].shape[:2] + p_a = np.arange(p_w, dtype=np.float32) / p_w * 2 * np.pi - np.pi + p_b = np.arange(p_h, dtype=np.float32) / p_h * np.pi * -1 + np.pi/2 + p_a = np.tile(p_a[None], [p_h, 1])[..., np.newaxis] + p_b = np.tile(p_b[:, None], [1, p_w])[..., np.newaxis] + p_a_sin, p_a_cos, p_b_sin, p_b_cos = np.sin(p_a), np.cos(p_a), np.sin(p_b), np.cos(p_b) + point_x = depth_img * p_a_cos * p_b_cos + point_y = depth_img * p_b_sin + point_z = depth_img * p_a_sin * p_b_cos + points = np.concatenate([point_x, point_y, point_z], axis=-1) / 1000 + vi_normals = __class__.normal_from_cross_product(points) + # Filtering invalid points + view_dist = np.maximum(np.linalg.norm(points, axis=-1, keepdims=True), float(10e-5)) + cosine_dist = np.sum((points * vi_normals / view_dist), axis=-1, keepdims=True) + cosine_dist = np.abs(cosine_dist) + point_valid = cosine_dist > cos_thrsh + all_valid = (point_valid & (depth_img < 65535) & (smnt_img > 0))[..., 0] + + points = points + cam_t + + return points[all_valid], color_img[all_valid], vi_normals[all_valid], smnt_img[all_valid] + + @staticmethod + def _points2voxel(attr_points: List[np.ndarray], res=0.005) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]: + p_points, p_colors, p_labels, p_normals = attr_points + + try: + vd_points = np.floor(p_points / res).astype(np.int64) + vd_max = np.max(vd_points, axis=0) + vd_min = np.min(vd_points, axis=0) + vd_box = np.cumprod([1, *(vd_max - vd_min)[:2]]) + + vd_indices = np.sum((vd_points - vd_min[np.newaxis, ...]) * vd_box[np.newaxis, ...], axis=-1) + _, vd_uni = np.unique(vd_indices, return_index=True) + except ValueError: + return None, None, None, None + + return p_points[vd_uni], p_colors[vd_uni], p_labels[vd_uni], p_normals[vd_uni] + + @staticmethod + def _view2points(room_path: str) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]: + 
prsp_root = os.path.join(room_path, __class__.PERSPECTIVE_PREFIX.strip('/')) + cam_paths = [os.path.join(prsp_root, f) for f in os.listdir(os.path.join(BASE_DIR,prsp_root)) if f.endswith(__class__.PRSP_CAM_FILE)] + all_infos = list() + for cam_path in cam_paths: + cam_paras, attr_images = __class__.read_camera_and_image(cam_path, 15, None) + r_points, r_colors, r_normal, r_labels = __class__.view2points_prsp(cam_paras, attr_images) + all_infos.append((r_points, r_colors, r_normal, r_labels)) + + pano_cam_root = os.path.join(room_path, __class__.PANO_CAM_PREFIX.strip('/')) + cam_paths = [os.path.join(BASE_DIR, pano_cam_root, f) for f in os.listdir(os.path.join(BASE_DIR, pano_cam_root)) if f.endswith(__class__.PANO_CAM_FILE)] + for cam_path in cam_paths: + pano_root = os.path.dirname(os.path.dirname(cam_path)) + pano_root = os.path.join(pano_root, __class__.PANORAMIC_PREFIX.strip('/')) + cam_paras, attr_images = __class__.read_camera_and_image(cam_path, 15, pano_root) + r_points, r_colors, r_normal, r_labels = __class__.view2points_pano(cam_paras, attr_images) + all_infos.append((r_points, r_colors, r_normal, r_labels)) + + a_points = np.concatenate([_i[0] for _i in all_infos], axis=0) + a_colors = np.concatenate([_i[1] for _i in all_infos], axis=0) + a_normals = np.concatenate([_i[2] for _i in all_infos], axis=0) + a_labels = np.concatenate([_i[3] for _i in all_infos], axis=0) + + a_points = a_points[..., [2, 0, 1]] # Convert Y-top to Z-top + a_normals = a_normals[..., [2, 0, 1]] + # print(len(a_points), len(a_colors), len(a_labels), len(a_normals)) + return a_points, a_colors, a_labels, a_normals + + @staticmethod + def _read_instance_infos(room_path: str, points: np.ndarray, labels: np.ndarray, min_pts=50) -> Dict: + scene_id, _, _ = room_path.split('/') + anno_file = os.path.join(BASE_DIR,scene_id, __class__.ANNO_FILE) + if not os.path.exists(anno_file): + return None + + with open(anno_file, 'r') as f: + boxes_info: List[Dict] = json.load(f) + + anno_infos = Annotations() + rb_idx = 0 # room bounding box ID + obj2tgid={} + for box_info in boxes_info: + + b_id = int(box_info['ID']) + centroid = np.asarray(box_info['centroid'], dtype=np.float32) / 1000 + coeffs = np.asarray(box_info['coeffs'], dtype=np.float32) / 1000 + basis = np.asarray(box_info['basis'], dtype=np.float32) + obb_8pts = S3DUtilize.get_8points_bounding_box(basis, coeffs, centroid) + + box_min = np.min(obb_8pts, axis=0, keepdims=True) + box_max = np.max(obb_8pts, axis=0, keepdims=True) + + point_max_mask = np.all(points < box_max, axis=1) + point_min_mask = np.all(points > box_min, axis=1) + point_mask = np.logical_and(point_max_mask, point_min_mask) + box_points: np.ndarray = points[point_mask] + if box_points.size < min_pts: + continue + + box_instances = labels[point_mask][..., 0] + instance_id, instance_count = np.unique(box_instances, return_counts=True) + instance_id = instance_id[np.argmax(instance_count)] + + instance_points = box_points[box_instances == instance_id] + ip_box_min = np.min(instance_points, axis=0) + ip_box_max = np.max(instance_points, axis=0) + dimension = np.maximum(centroid - ip_box_min, ip_box_max - centroid) + + ur_depth = np.concatenate([centroid, dimension * 2], axis=0) + + anno_infos.index.append(rb_idx) + anno_infos.classes.append(instance_id) + anno_infos.name.append(NYU40.index_to_label(instance_id)) + anno_infos.location.append(centroid) + anno_infos.dimensions.append(dimension) + anno_infos.gt_boxes_upright_depth.append(ur_depth) + anno_infos.unaligned_location.append(centroid) + 
anno_infos.unaligned_dimensions.append(dimension) + anno_infos.unaligned_gt_boxes_upright_depth.append(ur_depth) + obj2tgid[b_id] = rb_idx + rb_idx += 1 + + obj2tgid_path = os.path.join(BASE_DIR, room_path, "obj2tgid.json") + with open(obj2tgid_path, 'w') as json_file: + json.dump(obj2tgid, json_file) + + anno_infos.gt_num = rb_idx + anno_infos.axis_align_matrix = np.eye(4, dtype=np.float64) + return anno_infos.dump() + + def _mp_format_dataset(self, rooms_list: List[str], proc_unit: ProcessUnit, start_index=0, worker_id=0): + del start_index, worker_id + + points_folder = self.envs.get_env_path(proc_unit.out_paths[0]) + os.makedirs(points_folder, exist_ok=True) + semantics_folder = self.envs.get_env_path(proc_unit.out_paths[1]) + os.makedirs(semantics_folder, exist_ok=True) + instance_folder = self.envs.get_env_path(proc_unit.out_paths[2]) + os.makedirs(instance_folder, exist_ok=True) + annotation_folder = self.envs.get_env_path(proc_unit.out_paths[3]) + os.makedirs(annotation_folder, exist_ok=True) + + for _, room_path in enumerate(rooms_list): + if '.' in room_path: + continue + scene_id, _, room_id = room_path.split('/') + dump_name = f'{scene_id}_{room_id}_1cm.bin' + points_path = os.path.join(points_folder, dump_name) + semantics_path = os.path.join(semantics_folder, dump_name) + instance_path = os.path.join(instance_folder, dump_name) + annotation_path = os.path.join(annotation_folder, dump_name) + if np.all([os.path.exists(_path) for _path in [points_path, semantics_path, annotation_path]]): + continue + + # Step 1: Read images and make point clouds + + a_points, a_colors, a_labels, a_normals = self._view2points(room_path) + v_points, v_colors, v_labels, v_normals = self._points2voxel((a_points, a_colors, a_labels, a_normals), 0.01) + if v_points is None: + print(f'Ignore {room_path} with invalid points') + continue + # Step 2: Read bounding box information + anno_infos = self._read_instance_infos(room_path, v_points, v_labels) + if anno_infos is None: + print(f'Ignore {room_path} with invalid annotations') + continue + # print(v_points.shape) + # print(v_colors.shape) + # print(v_labels.shape) + # print(v_normals.shape) + np.concatenate([v_points.astype(np.float32), v_colors.astype(np.float32), v_normals.astype(np.float32)], axis=-1).tofile(points_path) + v_labels.astype(np.int64).tofile(semantics_path) + with open(annotation_path, 'wb') as a_fp: + pickle.dump(anno_infos, a_fp) + + def multiple_processor(func, samples: List, workers, args: Tuple): + samples_per_worker = int((len(samples) - 1) / workers + 1) + processes = list() + for w in range(workers): + start_index = w * samples_per_worker + end_index = min((w + 1) * samples_per_worker, len(samples)) + f_args = (samples[start_index: end_index], ) + args + (start_index, w) + t = multiprocessing.Process(target=func, args=f_args) + processes.append(t) + t.start() + for p in processes: + p.join() + + def format_dataset(self, proc_unit: ProcessUnit): + attrs = proc_unit.attrs + + desc_dir = os.path.join(self.envs.out_data_root, 'desc') + os.makedirs(desc_dir, exist_ok=True) + with open(os.path.join(desc_dir, proc_unit.out_paths[0]), 'wb') as b_fp: + pickle.dump(np.zeros([0, 9], np.float32), b_fp) + with open(os.path.join(desc_dir, proc_unit.out_paths[1]), 'wb') as b_fp: + pickle.dump(np.zeros([0], np.int64), b_fp) + + rooms_list = self._get_rooms_list_by_types(attrs['room_types']) + + __class__.multiple_processor(self._mp_format_dataset, rooms_list, 8, \ + (proc_unit, )) +# def main(): +# # Create the environment configuration 
instance +# envs = EnvsConfig() +# envs.out_data_root = "/Users/gauravpradeep/Crossover_ScaleUp/Structured3D/uni3d_output" +# envs.in_data_root = "/Users/gauravpradeep/Crossover_ScaleUp/Structured3D/scans" +# # Add other necessary environment variables here + +# # Define the process unit +# proc_unit = ProcessUnit() +# proc_unit.in_paths = ["data"] +# proc_unit.out_paths = ["points", "semantic_mask", "instance", "annotations"] +# proc_unit.attrs = {"room_types": ["all"]} + +# # Create the Structured3DDataGen instance +# data_gen = Structured3DDataGen([proc_unit], envs) + +# # Run the dataset formatting +# data_gen.format_dataset(proc_unit) + +# if __name__ == "__main__": +# main() + +def parse_args(): + parser = argparse.ArgumentParser(description='Process Structured3D dataset') + parser.add_argument('--base_dir', type=str, + default='/Users/gauravpradeep/Crossover_ScaleUp/Structured3D/scans', + help='Base directory for scans') + parser.add_argument('--out_data_root', type=str, + default='/Users/gauravpradeep/Crossover_ScaleUp/Structured3D/uni3d_output', + help='Output data root directory') + parser.add_argument('--in_data_root', type=str, + default='/Users/gauravpradeep/Crossover_ScaleUp/Structured3D/scans', + help='Input data root directory') + return parser.parse_args() + +def main(): + args = parse_args() + global BASE_DIR + BASE_DIR = args.base_dir + + envs = EnvsConfig() + envs.out_data_root = args.out_data_root + envs.in_data_root = args.in_data_root + + proc_unit = ProcessUnit() + proc_unit.in_paths = ["data"] + proc_unit.out_paths = ["points", "semantic_mask", "instance", "annotations"] + proc_unit.attrs = {"room_types": ["all"]} + + data_gen = Structured3DDataGen([proc_unit], envs) + + data_gen.format_dataset(proc_unit) + +if __name__ == "__main__": + main() diff --git a/prepare_data/structured3d/utils/base_dataset.py b/prepare_data/structured3d/utils/base_dataset.py new file mode 100644 index 0000000..c59fc7e --- /dev/null +++ b/prepare_data/structured3d/utils/base_dataset.py @@ -0,0 +1,18 @@ +""" +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +""" +from abc import abstractmethod + +from utils.config import ProcessUnit, EntryBase + + +class DatasetBase(EntryBase): + """ + The base class of dataset + """ + @abstractmethod + def format_dataset(self, proc_unit: ProcessUnit): + """ + Construct 3D point cloud from views + """ \ No newline at end of file diff --git a/prepare_data/structured3d/utils/config.py b/prepare_data/structured3d/utils/config.py new file mode 100644 index 0000000..b5f89db --- /dev/null +++ b/prepare_data/structured3d/utils/config.py @@ -0,0 +1,270 @@ +""" +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +""" +# pylint: disable=logging-fstring-interpolation +import logging +import re +import multiprocessing as mp +from abc import abstractmethod +from typing import List, Dict, Tuple +import multiprocessing +import yaml +import json +import copy +import inspect +import logging +import numpy as np +import os + +class DictRecursive(object): + def __init__(self): + pass + + def load(self, kargs: dict, shared_dict=None): + """ + Launch args of class from a dict. All subclass of DictRecursive will call this function automatically. 
Supported + types includes int, float, list, str and DictRecursive + + Args: + kargs: a dict saved the pairs of name/value of attributions + shared_dict: a shared item used by all other items + """ + if shared_dict is None: + shared_dict = {} + for cls_arg_name in self.__dict__.keys(): + arg_value = None + if kargs is not None: + arg_value = kargs[cls_arg_name] if cls_arg_name in kargs.keys() else None + if shared_dict is not None: + arg_value = shared_dict[cls_arg_name] if cls_arg_name in shared_dict.keys() else arg_value + cls_arg = self.__dict__[cls_arg_name] + self.__dict__[cls_arg_name] = self.parse_single_arg(cls_arg, arg_value, shared_dict) + return self + + def save(self): + save_dict = {} + for cls_arg_name in self.__dict__.keys(): + save_dict[cls_arg_name] = self.inverse_single_arg(self.__dict__[cls_arg_name]) + return save_dict + + def load_from_yaml(self, yaml_path, shared_scope=''): + with open(yaml_path, 'r', encoding='utf-8') as fp: + cfg_cxt = yaml.load(fp.read(), Loader=yaml.FullLoader) + self.load(cfg_cxt, cfg_cxt[shared_scope] if shared_scope in cfg_cxt.keys() else dict()) + + def load_from_json(self, json_path): + with open(json_path, 'r', encoding='utf-8') as fp: + self.load(json.load(fp)) + + def save_to_json(self, json_path): + with open(json_path, 'w') as fp: + save_meta = self.save() + json.dump(self.save(), fp) + + @staticmethod + def inverse_single_arg(arg_value): + if issubclass(type(arg_value), DictRecursive): + return arg_value.save() + elif isinstance(arg_value, list): + list_arg_value = list() + for a_v in arg_value: + list_arg_value.append(DictRecursive.inverse_single_arg(a_v)) + return list_arg_value + elif isinstance(arg_value, np.ndarray): + return arg_value.tolist() + else: + return arg_value + + @staticmethod + def parse_single_arg(cls_arg, arg_value, shared_dict=None): + if isinstance(cls_arg, int): + cls_arg_value = int(arg_value) if arg_value is not None else cls_arg + elif isinstance(cls_arg, str): + cls_arg_value = str(arg_value) if arg_value is not None else cls_arg + elif isinstance(cls_arg, float): + cls_arg_value = float(arg_value) if arg_value is not None else cls_arg + elif isinstance(cls_arg, list): + cls_arg_value = list() + cls_arg_e = str() if not cls_arg else cls_arg[0] + if arg_value is not None: + for a_v in arg_value: + cls_arg_value.append(DictRecursive.parse_single_arg(cls_arg_e, a_v, shared_dict)) + elif isinstance(cls_arg, dict): + if arg_value is not None: + cls_arg_value = dict() + for a_v in arg_value: + cls_arg_value[a_v] = arg_value[a_v] + else: + cls_arg_value = cls_arg + elif isinstance(cls_arg, np.ndarray): + if arg_value is not None: + cls_arg_value = np.asarray(arg_value, cls_arg.dtype) + else: + cls_arg_value = cls_arg + elif issubclass(type(cls_arg), DictRecursive): + cls_arg_value = type(cls_arg)() + cls_arg_value.load(arg_value, shared_dict) + else: + raise NotImplementedError + return cls_arg_value + + def match_function_args(self, external_dict, target_func): + args_dict = copy.deepcopy(external_dict) + for func_key in inspect.signature(target_func).parameters.keys(): + if func_key not in self.__dict__.keys(): + continue + if func_key in args_dict.keys(): + continue + args_dict[func_key] = self.__dict__[func_key] + return args_dict + + +class ProcessUnit(DictRecursive): + """ + Pipeline units + """ + def __init__(self): + super().__init__() + self.assemble_function = str() + self.name = str() + self.stride = 1 + self.attrs = dict() + self.in_paths = list() + self.out_paths = list() + + +class 
EntryConfig(DictRecursive): + """ + Main entry of each task + """ + def __init__(self): + super().__init__() + self.assemble_class = str() + self.process_pipelines = list([ProcessUnit()]) + + +class EnvsConfig(DictRecursive): + """ + Global environments + """ + def __init__(self): + super().__init__() + self.in_data_root = str() + self.out_data_root = str() + self.io_paths: Dict[str, str] = dict() + + def get_env_path(self, env_name: str): + """ + Get the absolute folder path by the env name + """ + if 'in_data_root' not in self.io_paths: + self.io_paths['in_data_root'] = self.in_data_root + self.io_paths['out_data_root'] = self.out_data_root + self.io_paths = { + "points": os.path.join(self.out_data_root, "points"), + "semantic_mask": os.path.join(self.out_data_root, "semantic_mask"), + "instance": os.path.join(self.out_data_root, "instance"), + "annotations": os.path.join(self.out_data_root, "annotations"), + } + rel_path = self.io_paths[env_name] + while True: + regex_pattern = r'\$.*\$' + patterns = re.findall(regex_pattern, rel_path) + if not patterns: + break + rel_path = rel_path.replace(patterns[0], self.io_paths[patterns[0][1:-1]]) + return rel_path + + +class StreamingTasks(DictRecursive): + """ + Main entry of streaming tasks + """ + def __init__(self): + super().__init__() + self.envs = EnvsConfig() + self.streaming_lines = list([EntryConfig()]) + + +class EntryBase: + """ + The basic config of entry + """ + def __init__(self, proc_units: List[ProcessUnit], envs: EnvsConfig) -> None: + self.proc_units = proc_units + self.envs = envs + + def execute_pipeline(self): + """ + execute the data processing pipeline + """ + for proc_unit in self.proc_units: + proc_func = getattr(self, proc_unit.assemble_function) + proc_func(proc_unit) + + +class MPEntryBase(EntryBase): + """ + The multi-process config of entry + """ + def __init__(self, proc_units: List[ProcessUnit], envs: EnvsConfig) -> None: + super().__init__(proc_units, envs) + self._enable_mp = True + self._num_worker = 8 + + @abstractmethod + def _sample_list(self): + """ + Return the list of samples to be processed + """ + + def _execute_proc_unit(self, sample: str, proc_unit: ProcessUnit, shared_vars: Dict): + proc_func = getattr(self, proc_unit.assemble_function) + proc_func(sample, proc_unit, shared_vars) + + def _merged_cross_processing(self, ipc_vars): + """ + Merge all shared list information cross all processors + """ + + def _merged_within_processing(self, shared_vars, ipc_vars): + """ + Merge all information within a processor + """ + + def _mp_execute_pipeline(self, samples, ipc_vars: List, worker_offset=0, worker_id=0): + del worker_offset + logging.info(f'worker {worker_id} begin...') + shared_vars = dict() + for s_idx, sample in enumerate(samples): + for proc_unit in self.proc_units: + if s_idx % proc_unit.stride != 0: + continue + self._execute_proc_unit(sample, proc_unit, shared_vars) + self._merged_within_processing(shared_vars, ipc_vars) + + def multiple_processor(func, samples: List, workers, args: Tuple): + samples_per_worker = int((len(samples) - 1) / workers + 1) + processes = list() + for w in range(workers): + start_index = w * samples_per_worker + end_index = min((w + 1) * samples_per_worker, len(samples)) + f_args = (samples[start_index: end_index], ) + args + (start_index, w) + t = multiprocessing.Process(target=func, args=f_args) + processes.append(t) + t.start() + for p in processes: + p.join() + + def execute_pipeline(self): + logging.info(f'- Start to execute pipeline {self.__class__.__name__}') + 
samples = self._sample_list() + ipc_vars = mp.Manager().list() + if self._enable_mp: + __class__.multiple_processor(self._mp_execute_pipeline, samples, workers=8, \ + args=(ipc_vars, )) + else: + self._mp_execute_pipeline(samples, ipc_vars) + self._merged_cross_processing(list(ipc_vars)) + logging.info(f'- Finished to execute pipeline {self.__class__.__name__}') \ No newline at end of file diff --git a/prepare_data/structured3d/utils/label_mapping.txt b/prepare_data/structured3d/utils/label_mapping.txt new file mode 100644 index 0000000..593508b --- /dev/null +++ b/prepare_data/structured3d/utils/label_mapping.txt @@ -0,0 +1,40 @@ +1 wall +2 floor +3 cabinet +4 bed +5 chair +6 sofa +7 table +8 door +9 window +10 bookshelf +11 picture +12 counter +13 blinds +14 desk +15 shelves +16 curtain +17 dresser +18 pillow +19 mirror +20 floor mat +21 clothes +22 ceiling +23 books +24 refrigerator +25 television +26 paper +27 towel +28 shower curtain +29 box +30 whiteboard +31 person +32 nightstand +33 toilet +34 sink +35 lamp +36 bathtub +37 bag +38 otherstructure +39 otherfurniture +40 otherprop diff --git a/prepare_data/structured3d/utils/nyu_40.py b/prepare_data/structured3d/utils/nyu_40.py new file mode 100644 index 0000000..48410ba --- /dev/null +++ b/prepare_data/structured3d/utils/nyu_40.py @@ -0,0 +1,94 @@ +""" +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +""" +import os +from typing import Dict + + +class NYU40: + """ + NYU40 label definition and color scheme + """ + LABEL_DICT: Dict[str, int] = dict() + INDEX_DICT: Dict[int, str] = dict() + + @staticmethod + def load_dict(i2l: bool): + """ + Load global label dictionary + """ + if not __class__.LABEL_DICT: + label_path = os.path.join(os.path.dirname(os.path.abspath(\ + __file__)), 'label_mapping.txt') + with open(label_path, encoding='utf-8') as l_fp: + for line in l_fp.readlines(): + items = line.rstrip('\n').split('\t') + __class__.LABEL_DICT[items[-1]] = int(items[0]) + __class__.INDEX_DICT[int(items[0])] = items[-1] + return __class__.INDEX_DICT if i2l else __class__.LABEL_DICT + + @staticmethod + def label_to_index(label: str): + """ + Mapping index to label + """ + return __class__.load_dict(False)[label] + + @staticmethod + def index_to_label(index: int): + """ + Mapping index to label + """ + return __class__.load_dict(True)[index] + + @staticmethod + def color_scheme(): + """ + Get the color coding scheme + Source from: https://github.com/ScanNet/ScanNet/blob/master/BenchmarkScripts/util.py + Copyright: ScanNet + """ + return [ + (0, 0, 0), + (174, 199, 232), # wall + (152, 223, 138), # floor + (31, 119, 180), # cabinet + (255, 187, 120), # bed + (188, 189, 34), # chair + (140, 86, 75), # sofa + (255, 152, 150), # table + (214, 39, 40), # door + (197, 176, 213), # window + (148, 103, 189), # bookshelf + (196, 156, 148), # picture + (23, 190, 207), # counter + (178, 76, 76), + (247, 182, 210), # desk + (66, 188, 102), + (219, 219, 141), # curtain + (140, 57, 197), + (202, 185, 52), + (51, 176, 203), + (200, 54, 131), + (92, 193, 61), + (78, 71, 183), + (172, 114, 82), + (255, 127, 14), # refrigerator + (91, 163, 138), + (153, 98, 156), + (140, 153, 101), + (158, 218, 229), # shower curtain + (100, 125, 154), + (178, 127, 135), + (120, 185, 128), + (146, 111, 194), + (44, 160, 44), # toilet + (112, 128, 144), # sink + (96, 207, 209), + (227, 119, 194), # bathtub + (213, 92, 176), + (94, 106, 211), + (82, 84, 163), # other furn + (100, 85, 144) + ] \ No newline at end of file diff --git 
a/prepare_data/structured3d/utils/s3dutilize.py b/prepare_data/structured3d/utils/s3dutilize.py new file mode 100644 index 0000000..169c870 --- /dev/null +++ b/prepare_data/structured3d/utils/s3dutilize.py @@ -0,0 +1,118 @@ +import numpy as np + +class Annotations: + """ + Annotation information + """ + def __init__(self) -> None: + self.gt_num = 0 + self.name = list() + self.location = list() + self.dimensions = list() + self.gt_boxes_upright_depth = list() + self.unaligned_location = list() + self.unaligned_dimensions = list() + self.unaligned_gt_boxes_upright_depth = list() + self.index = list() + self.classes = list() + self.axis_align_matrix = list() + + def dump(self): + """ + Dump information into dict + """ + anno_dict = dict() + anno_dict['gt_num'] = int(self.gt_num) + anno_dict['name'] = np.asarray(self.name) + anno_dict['location'] = np.asarray(self.location, dtype=np.float64) + anno_dict['dimensions'] = np.asarray(self.dimensions, dtype=np.float64) + anno_dict['gt_boxes_upright_depth'] = np.asarray(self.gt_boxes_upright_depth, \ + dtype=np.float64) + anno_dict['unaligned_location'] = np.asarray(self.unaligned_location, \ + dtype=np.float64) + anno_dict['unaligned_dimensions'] = np.asarray(self.unaligned_dimensions, \ + dtype=np.float64) + anno_dict['unaligned_gt_boxes_upright_depth'] = np.asarray( + self.unaligned_gt_boxes_upright_depth, dtype=np.float64) + anno_dict['index'] = np.asarray(self.index, dtype=np.int32) + anno_dict['class'] = np.asarray(self.classes, dtype=np.int64) + anno_dict['axis_align_matrix'] = np.asarray(self.axis_align_matrix, dtype=np.float64) + return anno_dict + + +class S3DUtilize(object): + """ + Structured3D utilize functions + """ + @staticmethod + def get_fov_normal(image_size, cam_focal, norm=True): + """ + Get the normal FoV directions + """ + u2x, v2y = [(np.arange(1, image_size[a_i] + 1) - image_size[a_i] / 2) / cam_focal[a_i]\ + for a_i in [0, 1]] + cam_m_u2x = np.tile([u2x], (image_size[1], 1)) + cam_m_v2y = np.tile(v2y[:, np.newaxis], (1, image_size[0])) + cam_m_depth = np.ones(image_size).T + fov_normal = np.stack((cam_m_depth, -1 * cam_m_v2y, cam_m_u2x), axis=-1) + if norm: + fov_normal = fov_normal / np.sqrt(np.sum(np.square(fov_normal), axis=-1, keepdims=True)) + return fov_normal + + @staticmethod + def cast_perspective_to_local_coord(depth_img: np.ndarray, fov_normal): + """ + Cast the perspective image into 3D coordinate system + """ + return depth_img * fov_normal + + @staticmethod + def cast_points_to_voxel(points, labels, room_size=(6.4, 3.2, 6.4), room_stride=0.2): + """ + Voxelize the points + """ + vol_resolution = (np.asarray(room_size) / room_stride).astype(np.int32) + vol_index = np.floor(points / room_stride).astype(np.int32) + in_vol = np.logical_and(np.all(vol_index < vol_resolution, axis=1), \ + np.all(vol_index >= 0, axis=1)) + v_x, v_y, v_z = [d_[..., 0] for d_ in np.split(vol_index[in_vol], 3, axis=-1)] + vol_label = labels[in_vol] + vol_data = np.zeros(vol_resolution, dtype=np.uint8) + vol_data[v_x, v_y, v_z] = vol_label + return vol_data + + @staticmethod + def get_rotation_matrix_from_tu(cam_front, cam_up): + """ + Get the rotation matrix from TU-coords + """ + cam_n = np.cross(cam_front, cam_up) + cam_m = np.stack((cam_front, cam_up, cam_n), axis=1).astype(np.float32) + return cam_m + + @staticmethod + def get_8points_bounding_box(basis, coeffs, centroid): + """ + Get the 8 corners from the bounding box parameters + """ + corners = np.zeros((8, 3)) + coeffs = np.abs(coeffs) + corners[0, :] = -basis[0, :] * coeffs[0] 
+ basis[1, :] * \ + coeffs[1] + basis[2, :] * coeffs[2] + corners[1, :] = basis[0, :] * coeffs[0] + basis[1, :] * \ + coeffs[1] + basis[2, :] * coeffs[2] + corners[2, :] = basis[0, :] * coeffs[0] + -basis[1, :] * \ + coeffs[1] + basis[2, :] * coeffs[2] + corners[3, :] = -basis[0, :] * coeffs[0] + -basis[1, :] * \ + coeffs[1] + basis[2, :] * coeffs[2] + + corners[4, :] = -basis[0, :] * coeffs[0] + basis[1, :] * \ + coeffs[1] + -basis[2, :] * coeffs[2] + corners[5, :] = basis[0, :] * coeffs[0] + basis[1, :] * \ + coeffs[1] + -basis[2, :] * coeffs[2] + corners[6, :] = basis[0, :] * coeffs[0] + -basis[1, :] * \ + coeffs[1] + -basis[2, :] * coeffs[2] + corners[7, :] = -basis[0, :] * coeffs[0] + -basis[1, :] * \ + coeffs[1] + -basis[2, :] * coeffs[2] + corners = corners + np.tile(centroid, (8, 1)) + return corners diff --git a/preprocess/build.py b/preprocess/build.py index 551d97f..fb3445e 100644 --- a/preprocess/build.py +++ b/preprocess/build.py @@ -3,5 +3,6 @@ PROCESSOR_REGISTRY = Registry("Processor") def build_processor(processor_name, data_config, modality_config, split): + print(f"Building processor: {processor_name}") processor = PROCESSOR_REGISTRY.get(processor_name)(data_config, modality_config, split) return processor \ No newline at end of file diff --git a/preprocess/feat1D/__init__.py b/preprocess/feat1D/__init__.py index 9a1b744..7db5e81 100644 --- a/preprocess/feat1D/__init__.py +++ b/preprocess/feat1D/__init__.py @@ -1,2 +1,5 @@ from .scannet import * -from .scan3r import * \ No newline at end of file +from .scan3r import * +from .arkit import * +from .multiscan import * +from .structured3d import * \ No newline at end of file diff --git a/preprocess/feat1D/arkit.py b/preprocess/feat1D/arkit.py new file mode 100644 index 0000000..efab03c --- /dev/null +++ b/preprocess/feat1D/arkit.py @@ -0,0 +1,107 @@ +import os.path as osp +import torch +import numpy as np +from tqdm import tqdm + +from common import load_utils +from util import labelmap, arkit +from util.arkit import ARKITSCENE_SCANNET +from preprocess.build import PROCESSOR_REGISTRY +from preprocess.feat1D.base import Base1DProcessor + +@PROCESSOR_REGISTRY.register() +class ARKitScenes1DProcessor(Base1DProcessor): + def __init__(self, config_data, config_1D, split) -> None: + super(ARKitScenes1DProcessor, self).__init__(config_data, config_1D, split) + self.data_dir = config_data.base_dir + + files_dir = osp.join(config_data.base_dir, 'files') + + self.scan_ids = [] + self.scan_ids = arkit.get_scan_ids(files_dir, split) + + self.out_dir = osp.join(config_data.process_dir, 'scans') + load_utils.ensure_dir(self.out_dir) + # Object Referrals + self.object_referrals = load_utils.load_json(osp.join(files_dir, 'sceneverse/ssg_ref_rel2_template.json')) + + # label map + self.label_map = arkit.read_label_map(files_dir, label_from = 'raw_category', label_to = 'nyu40id') + self.undefined = 0 + + + def load_objects_for_scan(self, scan_id): + """Load and parse the annotations JSON for the given scan ID.""" + objects_path = osp.join(self.data_dir, 'scans', scan_id, f"{scan_id}_3dod_annotation.json") + if not osp.exists(objects_path): + raise FileNotFoundError(f"Annotations file not found for scan ID: {scan_id}") + + annotations = load_utils.load_json(objects_path) + + objects = [] + for _i, label_info in enumerate(annotations["data"]): + obj_label = label_info["label"] + object_id = _i + 1 + scannet_class=ARKITSCENE_SCANNET[obj_label] + nyu40id=self.label_map[scannet_class] + objects.append({ + "objectId": object_id, + "global_id": 
nyu40id + }) + + + return objects + + + + def compute1DFeaturesEachScan(self, scan_id): + scene_out_dir = osp.join(self.out_dir, scan_id) + load_utils.ensure_dir(scene_out_dir) + + objectID_to_labelID_map = torch.load(osp.join(scene_out_dir, 'object_id_to_label_id_map.pt'))['obj_id_to_label_id_map'] + scan_objects = self.load_objects_for_scan(scan_id) + + object_referral_embeddings, scene_referral_embeddings = {}, None + if len(scan_objects) != 0: + object_referral_embeddings = self.computeObjectWise1DFeaturesEachScan(scan_id, scan_objects, objectID_to_labelID_map) + + scene_referrals = [referral for referral in self.object_referrals if referral['scan_id'] == scan_id] + + if len(scene_referrals) != 0: + if len(scene_referrals) > 10: + scene_referrals = np.random.choice(scene_referrals, size=10, replace=False) + + scene_referrals = [scene_referral['utterance'] for scene_referral in scene_referrals] + scene_referrals = ' '.join(scene_referrals) + scene_referral_embeddings = self.extractTextFeats([scene_referrals], return_text=True) + assert scene_referral_embeddings is not None + + data1D = {} + data1D['objects'] = {'referral_embeddings' : object_referral_embeddings} + data1D['scene'] = {'referral_embedding': scene_referral_embeddings} + + torch.save(data1D, osp.join(scene_out_dir, 'data1D.pt')) + + def computeObjectWise1DFeaturesEachScan(self, scan_id, scan_objects, objectID_to_labelID_map): + object_referral_embeddings = {} + + scan_referrals = [referral for referral in self.object_referrals if referral['scan_id'] == scan_id] + + for idx, scan_object in enumerate(scan_objects): + instance_id = int(scan_object['objectId']) + + if instance_id not in objectID_to_labelID_map.keys(): + continue + + # Object Referral + object_referral = [referral['utterance'] for referral in scan_referrals if int(referral['target_id']) == instance_id] + if len(object_referral) != 0: + object_referral_feats = self.extractTextFeats(object_referral) + if object_referral_feats is not None: + object_referral_feats = np.mean(object_referral_feats, axis = 0).reshape(1, -1) + assert object_referral_feats.shape == (1, self.embed_dim) + + object_referral_embeddings[instance_id] = {'referral' : object_referral, 'feats' : object_referral_feats} + + + return object_referral_embeddings diff --git a/preprocess/feat1D/multiscan.py b/preprocess/feat1D/multiscan.py new file mode 100644 index 0000000..58b9ff9 --- /dev/null +++ b/preprocess/feat1D/multiscan.py @@ -0,0 +1,123 @@ +import os.path as osp +import torch +import numpy as np +from tqdm import tqdm + +from common import load_utils +from util import labelmap, multiscan + +from preprocess.build import PROCESSOR_REGISTRY +from preprocess.feat1D.base import Base1DProcessor + +@PROCESSOR_REGISTRY.register() +class MultiScan1DProcessor(Base1DProcessor): + def __init__(self, config_data, config_1D, split) -> None: + super(MultiScan1DProcessor, self).__init__(config_data, config_1D, split) + self.data_dir = config_data.base_dir + + files_dir = osp.join(config_data.base_dir, 'files') + + self.scan_ids = [] + self.scan_ids = multiscan.get_scan_ids(files_dir, split) + + self.out_dir = osp.join(config_data.process_dir, 'scans') + load_utils.ensure_dir(self.out_dir) + # Object Referrals + self.object_referrals = load_utils.load_json(osp.join(files_dir, 'sceneverse/ssg_ref_rel2_template.json')) + + # label map + self.undefined = 0 + + def load_objects_for_scan(self, scan_id): + """Load and parse the annotations JSON for the given scan ID.""" + objects_path = osp.join(self.data_dir, 
'scenes', scan_id, f"{scan_id}.annotations.json") + if not osp.exists(objects_path): + raise FileNotFoundError(f"Annotations file not found for scan ID: {scan_id}") + + annotations = load_utils.load_json(objects_path) + objects = [] + + for obj in annotations["objects"]: + objects.append({ + "objectId": obj["objectId"], + "global_id": obj.get("label") + }) + + return objects + + def extractTextFeats(self, texts, return_text = False): + text_feats = [] + + for text in texts: + encoded_text = self.model.tokenizer(text, padding=True, add_special_tokens=True, return_tensors="pt").to(self.device) + if encoded_text['input_ids'].shape[1] > 512: + continue + + with torch.no_grad(): + encoded_text = self.model.text_encoder(encoded_text.input_ids, attention_mask = encoded_text.attention_mask, + return_dict = True, mode = 'text').last_hidden_state[:, 0].cpu().numpy().reshape(1, -1) + + text_feats.append({'text' : text, 'feat' : encoded_text}) + + if len(text_feats) == 0: + return None + + if return_text: + return text_feats + + text_feats = [text_feat['feat'] for text_feat in text_feats] + text_feats = np.concatenate(text_feats) + return text_feats + + + def compute1DFeaturesEachScan(self, scan_id): + scene_out_dir = osp.join(self.out_dir, scan_id) + load_utils.ensure_dir(scene_out_dir) + + objectID_to_labelID_map = torch.load(osp.join(scene_out_dir, 'object_id_to_label_id_map.pt'))['obj_id_to_label_id_map'] + scan_objects = self.load_objects_for_scan(scan_id) + + object_referral_embeddings, scene_referral_embeddings = {}, None + if len(scan_objects) != 0: + object_referral_embeddings = self.computeObjectWise1DFeaturesEachScan(scan_id, scan_objects, objectID_to_labelID_map) + + scene_referrals = [referral for referral in self.object_referrals if referral['scan_id'] == scan_id] + + if len(scene_referrals) != 0: + if len(scene_referrals) > 10: + scene_referrals = np.random.choice(scene_referrals, size=10, replace=False) + + scene_referrals = [scene_referral['utterance'] for scene_referral in scene_referrals] + scene_referrals = ' '.join(scene_referrals) + scene_referral_embeddings = self.extractTextFeats([scene_referrals], return_text=True) + assert scene_referral_embeddings is not None + + data1D = {} + data1D['objects'] = {'referral_embeddings' : object_referral_embeddings} + data1D['scene'] = {'referral_embedding': scene_referral_embeddings} + + torch.save(data1D, osp.join(scene_out_dir, 'data1D.pt')) + + def computeObjectWise1DFeaturesEachScan(self, scan_id, scan_objects, objectID_to_labelID_map): + object_referral_embeddings = {} + + scan_referrals = [referral for referral in self.object_referrals if referral['scan_id'] == scan_id] + + for idx, scan_object in enumerate(scan_objects): + instance_id = int(scan_object['objectId']) + + if instance_id not in objectID_to_labelID_map.keys(): + continue + + # Object Referral + object_referral = [referral['utterance'] for referral in scan_referrals if int(referral['target_id']) == instance_id] + if len(object_referral) != 0: + object_referral_feats = self.extractTextFeats(object_referral) + if object_referral_feats is not None: + object_referral_feats = np.mean(object_referral_feats, axis = 0).reshape(1, -1) + assert object_referral_feats.shape == (1, self.embed_dim) + + object_referral_embeddings[instance_id] = {'referral' : object_referral, 'feats' : object_referral_feats} + + + return object_referral_embeddings \ No newline at end of file diff --git a/preprocess/feat1D/structured3d.py b/preprocess/feat1D/structured3d.py new file mode 100644 index 
0000000..bca603c --- /dev/null +++ b/preprocess/feat1D/structured3d.py @@ -0,0 +1,135 @@ +import os.path as osp +import torch +import numpy as np +from tqdm import tqdm + +from common import load_utils +from util import structured3d +from util.structured3d import S3D_SCANNET +from preprocess.build import PROCESSOR_REGISTRY +from preprocess.feat1D.base import Base1DProcessor + + +@PROCESSOR_REGISTRY.register() +class Structured3D_1DProcessor(Base1DProcessor): + def __init__(self, config_data, config_1D, split) -> None: + super(Structured3D_1DProcessor, self).__init__(config_data, config_1D, split) + self.data_dir = config_data.base_dir + + files_dir = osp.join(config_data.base_dir, 'files') + + self.scan_ids = [] + self.scan_ids = structured3d.get_scan_ids(files_dir, split) + + self.out_dir = config_data.process_dir + load_utils.ensure_dir(self.out_dir) + # Object Referrals + self.object_referrals = load_utils.load_json(osp.join(files_dir, 'sceneverse/ssg_ref_rel2_template.json')) + + + def compute1DFeaturesEachScan(self, scan_id): + full_scan_id = scan_id + scan_id = scan_id.split('_') + room_id = scan_id[-1] + scan_id = scan_id[0]+'_'+scan_id[1] + obj2tgtid_map = load_utils.load_json(osp.join(self.data_dir,'scans',scan_id,'2D_rendering',room_id,'obj2tgid.json')) + + scene_out_dir = osp.join(self.out_dir, full_scan_id) + load_utils.ensure_dir(scene_out_dir) + + objectID_to_labelID_map = torch.load(osp.join(scene_out_dir, 'object_id_to_label_id_map.pt'))['obj_id_to_label_id_map'] + + object_referral_embeddings, scene_referral_embeddings = {}, None + if len(objectID_to_labelID_map.keys()) != 0: + object_referral_embeddings = self.computeObjectWise1DFeaturesEachScan(full_scan_id, objectID_to_labelID_map, obj2tgtid_map) + + scene_referrals = [referral for referral in self.object_referrals if referral['scan_id'] == full_scan_id] + + if len(scene_referrals) != 0: + if len(scene_referrals) > 10: + scene_referrals = np.random.choice(scene_referrals, size=10, replace=False) + + scene_referrals = [scene_referral['utterance'] for scene_referral in scene_referrals] + scene_referrals = ' '.join(scene_referrals) + scene_referral_embeddings = self.extractTextFeats([scene_referrals], return_text=True) + assert scene_referral_embeddings is not None + + data1D = {} + data1D['objects'] = {'referral_embeddings' : object_referral_embeddings} + data1D['scene'] = {'referral_embedding': scene_referral_embeddings} + + # torch.save(data1D, osp.join(scene_out_dir, 'data1D.pt')) + np.savez_compressed(osp.join(scene_out_dir, 'data1D.npz'), **data1D) + + def computeObjectWise1DFeaturesEachScan(self, scan_id, objectID_to_labelID_map, obj2tgtid): + object_referral_embeddings = {} + matched_objids=[] + scan_referrals = [referral for referral in self.object_referrals if referral['scan_id'] == scan_id] + + for instance_id in objectID_to_labelID_map.keys(): + if str(instance_id) not in obj2tgtid.keys(): + # print(f"Instance ID {instance_id} not found in obj2tgtid mapping for scan {scan_id}. 
Skipping...") + continue + mapped_obj_id = obj2tgtid[str(instance_id)] + nyu40id= objectID_to_labelID_map[instance_id] + if nyu40id==0: + continue + label = S3D_SCANNET[nyu40id] + object_referral = [] + for referral in scan_referrals: + if int(referral['target_id']) == int(mapped_obj_id): + if referral['instance_type'] == label: + # print(referral['utterance']) + matched_objids.append(instance_id) + # print(scan_id,label,referral['instance_type'],referral['target_id'],mapped_obj_id) + object_referral.append(referral['utterance']) + # else: + # print(scan_id,label,referral['instance_type'],referral['target_id'],mapped_obj_id) + + if len(object_referral) != 0: + # print(scan_id,instance_id,len(object_referral)) + object_referral_feats = self.extractTextFeats(object_referral) + if object_referral_feats is not None: + object_referral_feats = np.mean(object_referral_feats, axis = 0).reshape(1, -1) + assert object_referral_feats.shape == (1, self.embed_dim) + + object_referral_embeddings[instance_id] = {'referral' : object_referral, 'feats' : object_referral_feats} + + # finding unmatched referrals + unmatched_referrals = [] + for referral in scan_referrals: + mapped_obj_id = referral['target_id'] + if int(mapped_obj_id) not in [int(obj2tgtid[str(instance_id)]) for instance_id in objectID_to_labelID_map.keys() if str(instance_id) in obj2tgtid]: + unmatched_referrals.append(referral) + elif any(int(mapped_obj_id) == int(obj2tgtid[str(instance_id)]) and S3D_SCANNET[objectID_to_labelID_map[instance_id]] != referral['instance_type'] + for instance_id in objectID_to_labelID_map.keys() if str(instance_id) in obj2tgtid and objectID_to_labelID_map[instance_id] != 0): + unmatched_referrals.append(referral) + + label_to_instances = {} + for instance_id, nyu40id in objectID_to_labelID_map.items(): + if nyu40id == 0: + continue + label = S3D_SCANNET[nyu40id] + if label not in label_to_instances: + label_to_instances[label] = [] + label_to_instances[label].append(instance_id) + + for referral in unmatched_referrals: + instance_type = referral['instance_type'] + if instance_type in label_to_instances and len(label_to_instances[instance_type]) == 1: + instance_id = label_to_instances[instance_type][0] + if instance_id not in matched_objids: + # print(f"Matching unmatched referral to unique instance: {scan_id},{instance_id}, {instance_type}, {referral['target_id']}") + if instance_id not in object_referral_embeddings: + object_referral = [referral['utterance']] + else: + object_referral_embeddings[instance_id]['referral'].append(referral['utterance']) + + object_referral_feats = self.extractTextFeats(object_referral) + if object_referral_feats is not None: + object_referral_feats = np.mean(object_referral_feats, axis=0).reshape(1, -1) + object_referral_embeddings[instance_id] = {'referral': object_referral, 'feats': object_referral_feats} + + + # print(object_referral_embeddings.keys()) + return object_referral_embeddings diff --git a/preprocess/feat2D/__init__.py b/preprocess/feat2D/__init__.py index 9a1b744..7db5e81 100644 --- a/preprocess/feat2D/__init__.py +++ b/preprocess/feat2D/__init__.py @@ -1,2 +1,5 @@ from .scannet import * -from .scan3r import * \ No newline at end of file +from .scan3r import * +from .arkit import * +from .multiscan import * +from .structured3d import * \ No newline at end of file diff --git a/preprocess/feat2D/arkit.py b/preprocess/feat2D/arkit.py new file mode 100644 index 0000000..531b5b6 --- /dev/null +++ b/preprocess/feat2D/arkit.py @@ -0,0 +1,277 @@ +import os.path as osp 
+import open3d as o3d +import numpy as np +import torch +from tqdm import tqdm +import shutil +from PIL import Image +from scipy.spatial.transform import Rotation as R +from omegaconf import DictConfig +from typing import List, Dict, Tuple +import pandas as pd +from common import load_utils +from util import render, arkit, visualisation +from util import image as image_util + + +from preprocess.build import PROCESSOR_REGISTRY +from preprocess.feat2D.base import Base2DProcessor + +@PROCESSOR_REGISTRY.register() +class ARKitScenes2DProcessor(Base2DProcessor): + """ARKitScenes 2D (RGB) feature processor class.""" + def __init__(self, config_data: DictConfig, config_2D: DictConfig, split: str) -> None: + super(ARKitScenes2DProcessor, self).__init__(config_data, config_2D, split) + self.data_dir = config_data.base_dir + files_dir = osp.join(config_data.base_dir, 'files') + + self.scan_ids = [] + self.split = split + self.scan_ids = arkit.get_scan_ids(files_dir, self.split) + + self.out_dir = osp.join(config_data.process_dir, 'scans') + load_utils.ensure_dir(self.out_dir) + + self.orig_image_size = config_2D.image.orig_size + self.model_image_size = config_2D.image.model_size + + self.frame_skip = config_data.skip_frames + self.top_k = config_2D.image.top_k + self.num_levels = config_2D.image.num_levels + self.undefined = 0 + self.metadata = pd.read_csv(osp.join(files_dir,'metadata.csv')) + + self.frame_pose_data = {} + for scan_id in self.scan_ids: + pose_data = arkit.load_poses(osp.join(self.data_dir, 'scans', scan_id),scan_id, skip=self.frame_skip) + self.frame_pose_data[scan_id] = pose_data + + + def compute2DFeatures(self) -> None: + for scan_id in tqdm(self.scan_ids): + self.compute2DImagesAndSeg(scan_id) + self.compute2DFeaturesEachScan(scan_id) + # if self.split == 'val': + # self.computeAllImageFeaturesEachScan(scan_id) + + def compute2DImagesAndSeg(self, scan_id: str) -> None: + scene_folder = osp.join(self.data_dir, 'scans', scan_id) + if osp.exists(osp.join(scene_folder, 'gt-projection-seg.pt')): + return + + objects_path = osp.join(self.data_dir, 'scans', scan_id, f"{scan_id}_3dod_annotation.json") + if not osp.exists(objects_path): + raise FileNotFoundError(f"Annotations file not found for scan ID: {scan_id}") + + annotations = load_utils.load_json(objects_path) + ply_data = arkit.load_ply_data(osp.join(self.data_dir,'scans'), scan_id, annotations) + instance_ids = ply_data['objectId'] + + mesh_file = osp.join(self.data_dir, 'scans', scan_id, f'{scan_id}_3dod_mesh.ply') + mesh = o3d.io.read_triangle_mesh(mesh_file) + mesh_triangles = np.asarray(mesh.triangles) + colors = np.asarray(mesh.vertex_colors)*255.0 + colors = colors.round() + num_triangles = mesh_triangles.shape[0] + + scene = o3d.t.geometry.RaycastingScene() + scene.add_triangles(o3d.t.geometry.TriangleMesh.from_legacy(mesh)) + + # project 3D model + obj_id_imgs = {} + for frame_idx in self.frame_pose_data[scan_id].keys(): + camera_info = arkit.load_intrinsics(osp.join(self.data_dir,'scans'),scan_id,frame_idx) + intrinsics = camera_info['intrinsic_mat'] + img_width = int(camera_info['width']) + img_height = int(camera_info['height']) + img_pose = self.frame_pose_data[scan_id][frame_idx] + img_pose_inv = np.linalg.inv(img_pose) + + obj_id_map = render.project_mesh3DTo2D_with_objectseg( + scene, intrinsics, img_pose_inv, img_width, img_height, + mesh_triangles, num_triangles, instance_ids + ) + obj_id_imgs[frame_idx] = obj_id_map + + scene_folder = osp.join(self.data_dir, 'scans', scan_id) + if 
osp.exists(osp.join(scene_folder, 'gt-projection')): + shutil.rmtree(osp.join(scene_folder, 'gt-projection')) + + # save scene-level file for efficient loading + torch.save(obj_id_imgs, osp.join(scene_folder, 'gt-projection-seg.pt')) + + def compute2DFeaturesEachScan(self, scan_id: str) -> None: + scene_folder = osp.join(self.data_dir, 'scans', scan_id) + color_path = osp.join(scene_folder,f'{scan_id}_frames', 'lowres_wide') + + scene_out_dir = osp.join(self.out_dir, scan_id) + load_utils.ensure_dir(scene_out_dir) + + if osp.exists(osp.join(scene_out_dir, 'data2D.pt')): + return + + obj_id_to_label_id_map = torch.load(osp.join(scene_out_dir, 'object_id_to_label_id_map.pt'))['obj_id_to_label_id_map'] + + # Multi-view Image -- Object (Embeddings) + object_image_embeddings, object_image_votes_topK, frame_idxs = self.computeImageFeaturesAllObjectsEachScan(scene_folder, scene_out_dir, obj_id_to_label_id_map) + + # Multi-view Image -- Scene (Images + Embeddings) + frame_idxs = list(self.frame_pose_data[scan_id].keys()) + pose_data, scene_images_pt, scene_image_embeddings, sampled_frame_idxs = self.computeSelectedImageFeaturesEachScan(scan_id, color_path, frame_idxs) + + # Visualise + for frame_idx in self.frame_pose_data[scan_id].keys(): + camera_info = arkit.load_intrinsics(osp.join(self.data_dir,'scans'),scan_id,frame_idx) + intrinsic_mat = camera_info['intrinsic_mat'] + break + + + scene_mesh = o3d.io.read_triangle_mesh(osp.join(scene_folder, f'{scan_id}_3dod_mesh.ply')) + intrinsics = { 'f' : intrinsic_mat[0, 0], 'cx' : intrinsic_mat[0, 2], 'cy' : intrinsic_mat[1, 2], + 'w' : int(camera_info['width']), 'h' : int(camera_info['height'])} + + cams_visualised_on_mesh = visualisation.visualise_camera_on_mesh(scene_mesh, pose_data[sampled_frame_idxs], intrinsics, stride=1) + image_path = osp.join(scene_out_dir, 'sel_cams_on_mesh.png') + Image.fromarray((cams_visualised_on_mesh * 255).astype(np.uint8)).save(image_path) + + data2D = {} + data2D['objects'] = {'image_embeddings': object_image_embeddings, 'topK_images_votes' : object_image_votes_topK} + data2D['scene'] = {'scene_embeddings': scene_image_embeddings, 'images' : scene_images_pt, + 'frame_idxs' : frame_idxs, 'sampled_cam_idxs' : sampled_frame_idxs} + + # dummy floorplan + floorplan_dict = {'img' : None, 'embedding' : None} + data2D['scene']['floorplan'] = floorplan_dict + + torch.save(data2D, osp.join(scene_out_dir, 'data2D.pt')) + + def computeAllImageFeaturesEachScan(self, scan_id: str) -> None: + scene_folder = osp.join(self.data_dir, 'scans', scan_id) + color_path = osp.join(scene_folder,f'{scan_id}_frames', 'lowres_wide') + + + scene_out_dir = osp.join(self.out_dir, scan_id) + load_utils.ensure_dir(scene_out_dir) + + frame_idxs = list(self.frame_pose_data[scan_id].keys()) + + # Extract Scene Image Features + scene_images_pt = [] + scene_image_embeddings = [] + # sky_direction=self.metadata[self.metadata['video_id']==int(scan_id)]['sky_direction'].values[0] + + for frame_index in frame_idxs: + image = Image.open(osp.join(color_path, f'{scan_id}_{frame_index}.png')) + + image = image.resize((self.model_image_size[1], self.model_image_size[0]), Image.BICUBIC) + image_pt = self.model.base_tf(image) + + scene_image_embeddings.append(self.extractFeatures([image_pt], return_only_cls_mean= False)) + scene_images_pt.append(image_pt) + + scene_image_embeddings = np.concatenate(scene_image_embeddings) + data2D = {} + data2D['scene'] = {'scene_embeddings': scene_image_embeddings, 'images' : scene_images_pt, + 'frame_idxs' : frame_idxs} + 
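+ # stored separately from data2D.pt so features of every frame (not only the grid-sampled views) can be loaded when needed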
torch.save(data2D, osp.join(scene_out_dir, 'data2D_all_images.pt')) + + def computeSelectedImageFeaturesEachScan(self, scan_id: str, color_path: str, frame_idxs: List[int]) -> Tuple[np.ndarray, List[torch.tensor], np.ndarray, List[int]]: + # Sample Camera Indexes Based on Rotation Matrix From Grid + pose_data = [] + for frame_idx in frame_idxs: + pose = self.frame_pose_data[scan_id][frame_idx] + rot_quat = R.from_matrix(pose[:3, :3]).as_quat() + trans = pose[:3, 3] + pose_data.append([trans[0], trans[1], trans[2], rot_quat[0], rot_quat[1], rot_quat[2], rot_quat[3]]) + + pose_data = np.array(pose_data) + + sampled_frame_idxs = image_util.sample_camera_pos_on_grid(pose_data) + # sky_direction=self.metadata[self.metadata['video_id']==int(scan_id)]['sky_direction'].values[0] + + # Extract Scene Image Features + scene_images_pt = [] + for idx in sampled_frame_idxs: + frame_index = frame_idxs[idx] + + image = Image.open(osp.join(color_path, f'{scan_id}_{frame_index}.png')) + image = image.resize((self.model_image_size[1], self.model_image_size[0]), Image.BICUBIC) + image_pt = self.model.base_tf(image) + scene_images_pt.append(image_pt) + + scene_image_embeddings = self.extractFeatures(scene_images_pt, return_only_cls_mean= False) + + return pose_data, scene_images_pt, scene_image_embeddings, sampled_frame_idxs + # return pose_data, None, None, sampled_frame_idxs + + + def computeImageFeaturesAllObjectsEachScan(self, scene_folder: str, scene_out_dir: str, obj_id_to_label_id_map: dict) -> Tuple[Dict[int, Dict[int, np.ndarray]], Dict[int, List[int]], List[str]]: + object_anno_2D = torch.load(osp.join(scene_folder, 'gt-projection-seg.pt')) + object_image_votes = {} + scan_id=scene_folder.split('/')[-1] + # iterate over all frames + for frame_idx in object_anno_2D: + obj_2D_anno_frame = object_anno_2D[frame_idx] + # process 2D anno + obj_ids, counts = np.unique(obj_2D_anno_frame, return_counts=True) + for idx in range(len(obj_ids)): + obj_id = obj_ids[idx] + count = counts[idx] + if obj_id == self.undefined: + continue + + if obj_id not in object_image_votes: + object_image_votes[obj_id] = {} + if frame_idx not in object_image_votes[obj_id]: + object_image_votes[obj_id][frame_idx] = 0 + object_image_votes[obj_id][frame_idx] = count + + # select top K frames for each obj + object_image_votes_topK = {} + for obj_id in object_image_votes: + object_image_votes_topK[obj_id] = [] + obj_image_votes_f = object_image_votes[obj_id] + sorted_frame_idxs = sorted(obj_image_votes_f, key=obj_image_votes_f.get, reverse=True) + if len(sorted_frame_idxs) > self.top_k: + object_image_votes_topK[obj_id] = sorted_frame_idxs[:self.top_k] + else: + object_image_votes_topK[obj_id] = sorted_frame_idxs + + object_ids_in_image_votes = list(object_image_votes_topK.keys()) + for obj_id in object_ids_in_image_votes: + if obj_id not in list(obj_id_to_label_id_map.keys()): + del object_image_votes_topK[obj_id] + + assert len(list(obj_id_to_label_id_map.keys())) >= len(list(object_image_votes_topK.keys())), 'Mapped < Found' + + object_image_embeddings = {} + for object_id in object_image_votes_topK: + object_image_votes_topK_frames = object_image_votes_topK[object_id] + object_image_embeddings[object_id] = {} + + for frame_idx in object_image_votes_topK_frames: + image_path = osp.join(scene_folder, f'{scan_id}_frames', 'lowres_wide', f'{scan_id}_{frame_idx}.png') + color_img = Image.open(image_path) + object_image_embeddings[object_id][frame_idx] = self.computeImageFeaturesEachObject(scan_id, color_img, object_id, 
object_anno_2D[frame_idx]) + + return object_image_embeddings, object_image_votes_topK, object_anno_2D.keys() + + def computeImageFeaturesEachObject(self, scan_id, image: Image.Image, object_id: int, object_anno_2d: np.ndarray) -> np.ndarray: + object_anno_2d = object_anno_2d.transpose(1, 0) + object_anno_2d = np.flip(object_anno_2d, 1) + + object_mask = object_anno_2d == object_id + + images_crops = [] + for level in range(self.num_levels): + mask_tensor = torch.from_numpy(object_mask).float() + x1, y1, x2, y2 = image_util.mask2box_multi_level(mask_tensor, level) + cropped_img = image.crop((x1, y1, x2, y2)) + cropped_img = cropped_img.resize((self.model_image_size[1], self.model_image_size[1]), Image.BICUBIC) + img_pt = self.model.base_tf(cropped_img) + images_crops.append(img_pt) + + if(len(images_crops) > 0): + mean_feats = self.extractFeatures(images_crops, return_only_cls_mean = True) + return mean_feats \ No newline at end of file diff --git a/preprocess/feat2D/multiscan.py b/preprocess/feat2D/multiscan.py new file mode 100644 index 0000000..d95239e --- /dev/null +++ b/preprocess/feat2D/multiscan.py @@ -0,0 +1,240 @@ +import os.path as osp +import open3d as o3d +import numpy as np +import torch +from tqdm import tqdm +from PIL import Image +from scipy.spatial.transform import Rotation as R + +from common import load_utils +from util import render, multiscan, visualisation +from util import image as image_util + +from preprocess.build import PROCESSOR_REGISTRY +from preprocess.feat2D.base import Base2DProcessor + + +@PROCESSOR_REGISTRY.register() +class MultiScan2DProcessor(Base2DProcessor): + def __init__(self, config_data, config_2D, split) -> None: + super(MultiScan2DProcessor, self).__init__(config_data, config_2D, split) + self.data_dir = config_data.base_dir + files_dir = osp.join(config_data.base_dir, 'files') + self.split = split + + self.scan_ids = [] + self.scan_ids = multiscan.get_scan_ids(files_dir, split) + + self.out_dir = osp.join(config_data.process_dir, 'scans') + load_utils.ensure_dir(self.out_dir) + + self.orig_image_size = config_2D.image.orig_size + self.model_image_size = config_2D.image.model_size + + self.frame_skip = config_data.skip_frames + self.top_k = config_2D.image.top_k + self.num_levels = config_2D.image.num_levels + self.undefined = 0 + + + # get frame_indexes + self.frame_pose_data = {} + for scan_id in self.scan_ids: + scene_folder = osp.join(self.data_dir, 'scenes', scan_id) + frame_idxs = multiscan.load_frame_idxs(scene_folder, skip=self.frame_skip) + while(len(frame_idxs) > 500): + self.frame_skip += 2 + frame_idxs = multiscan.load_frame_idxs(scene_folder, skip=self.frame_skip) + # if len(frame_idxs) > 500: + # frame_idxs = multiscan.load_frame_idxs(scene_folder, skip=2) + # if len(frame_idxs) > 500: + # frame_idxs = multiscan.load_frame_idxs(scene_folder, skip=5) + # if len(frame_idxs) > 500: + # frame_idxs = multiscan.load_frame_idxs(scene_folder, skip=10) + # if len(frame_idxs) > 500: + # frame_idxs = multiscan.load_frame_idxs(scene_folder, skip=15) + # if len(frame_idxs) > 500: + # frame_idxs = multiscan.load_frame_idxs(scene_folder, skip=20) + + pose_data = multiscan.load_all_poses(scene_folder, frame_idxs) + self.frame_pose_data[scan_id] = pose_data + + + def compute2DFeatures(self): + for scan_id in tqdm(self.scan_ids): + self.compute2DImagesAndSeg(scan_id) + self.compute2DFeaturesEachScan(scan_id) + + def compute2DImagesAndSeg(self, scan_id): + scene_folder = osp.join(self.data_dir, 'scenes', scan_id) + mesh_file = osp.join(scene_folder, 
'{}.ply'.format(scan_id)) + + ply_data = multiscan.load_ply_data(osp.join(self.data_dir, 'scenes'), scan_id) + instance_ids = ply_data['objectId'] + + mesh = o3d.io.read_triangle_mesh(mesh_file) + mesh_triangles = np.asarray(mesh.triangles) + colors = np.asarray(mesh.vertex_colors)*255.0 + colors = colors.round() + num_triangles = mesh_triangles.shape[0] + + scene = o3d.t.geometry.RaycastingScene() + scene.add_triangles(o3d.t.geometry.TriangleMesh.from_legacy(mesh)) + + # project 3D model + obj_id_imgs = {} + for frame_idx in self.frame_pose_data[scan_id]: + camera_info = multiscan.load_intrinsics(scene_folder,scan_id,int(frame_idx)) + intrinsics = camera_info['intrinsic_mat'] + img_width = int(camera_info['width']) + img_height = int(camera_info['height']) + img_pose = self.frame_pose_data[scan_id][frame_idx] + img_pose_inv = np.linalg.inv(img_pose) + + obj_id_map = render.project_mesh3DTo2D_with_objectseg( + scene, intrinsics, img_pose_inv, img_width, img_height, + mesh_triangles, num_triangles, instance_ids + ) + obj_id_imgs[frame_idx] = obj_id_map + + scene_out_dir = osp.join(self.out_dir, scan_id) + load_utils.ensure_dir(scene_out_dir) + + # save scene-level file for efficient loading + torch.save(obj_id_imgs, osp.join(scene_out_dir, 'gt-projection-seg.pt')) + + def compute2DFeaturesEachScan(self, scan_id): + scene_folder = osp.join(self.data_dir, 'scenes', scan_id) + color_path = osp.join(scene_folder, 'sequence') + + scene_out_dir = osp.join(self.out_dir, scan_id) + load_utils.ensure_dir(scene_out_dir) + + obj_id_to_label_id_map = torch.load(osp.join(scene_out_dir, 'object_id_to_label_id_map.pt'))['obj_id_to_label_id_map'] + + # Multi-view Image -- Object (Embeddings) + object_image_embeddings, object_image_votes_topK, frame_idxs = self.computeImageFeaturesAllObjectsEachScan(scene_folder, scene_out_dir, obj_id_to_label_id_map) + + # Multi-view Image -- Scene (Images + Embeddings) + frame_idxs = list(self.frame_pose_data[scan_id].keys()) + pose_data, scene_images_pt, scene_image_embeddings, sampled_frame_idxs = self.computeSelectedImageFeaturesEachScan(scan_id, color_path, frame_idxs) + + # Visualise + camera_info = multiscan.load_meta_intrinsics(scene_folder,scan_id) + intrinsic_mat = camera_info['intrinsic_mat'] + + scene_mesh = o3d.io.read_triangle_mesh(osp.join(scene_folder,'{}.ply'.format(scan_id))) + intrinsics = { 'f' : intrinsic_mat[0, 0], 'cx' : intrinsic_mat[0, 2], 'cy' : intrinsic_mat[1, 2], + 'w' : int(camera_info['width']), 'h' : int(camera_info['height'])} + + cams_visualised_on_mesh = visualisation.visualise_camera_on_mesh(scene_mesh, pose_data[sampled_frame_idxs], intrinsics, stride=1) + image_path = osp.join(scene_out_dir, 'sel_cams_on_mesh.png') + Image.fromarray((cams_visualised_on_mesh * 255).astype(np.uint8)).save(image_path) + + data2D = {} + data2D['objects'] = {'image_embeddings': object_image_embeddings, 'topK_images_votes' : object_image_votes_topK} + data2D['scene'] = {'scene_embeddings': scene_image_embeddings, 'images' : scene_images_pt, + 'frame_idxs' : frame_idxs, 'sampled_cam_idxs' : sampled_frame_idxs} + + # dummy floorplan + floorplan_dict = {'img' : None, 'embedding' : None} + data2D['scene']['floorplan'] = floorplan_dict + + torch.save(data2D, osp.join(scene_out_dir, 'data2D.pt')) + + def computeSelectedImageFeaturesEachScan(self, scan_id, color_path, frame_idxs): + # Sample Camera Indexes Based on Rotation Matrix From Grid + pose_data = [] + for frame_idx in frame_idxs: + pose = self.frame_pose_data[scan_id][frame_idx] + rot_quat = 
R.from_matrix(pose[:3, :3]).as_quat() + trans = pose[:3, 3] + pose_data.append([trans[0], trans[1], trans[2], rot_quat[0], rot_quat[1], rot_quat[2], rot_quat[3]]) + + pose_data = np.array(pose_data) + + sampled_frame_idxs = image_util.sample_camera_pos_on_grid(pose_data) + + # Extract Scene Image Features + scene_images_pt = [] + for idx in sampled_frame_idxs: + frame_index = frame_idxs[idx] + + image = Image.open(osp.join(color_path, f'frame-{frame_index}.color.jpg')) + image = image.resize((self.model_image_size[1], self.model_image_size[0]), Image.BICUBIC) + image_pt = self.model.base_tf(image) + scene_images_pt.append(image_pt) + + scene_image_embeddings = self.extractFeatures(scene_images_pt, return_only_cls_mean= False) + + return pose_data, scene_images_pt, scene_image_embeddings, sampled_frame_idxs + + def computeImageFeaturesAllObjectsEachScan(self, scene_folder, scene_out_dir, obj_id_to_label_id_map): + object_anno_2D = torch.load(osp.join(scene_out_dir, 'gt-projection-seg.pt')) + object_image_votes = {} + + # iterate over all frames + for frame_idx in object_anno_2D: + obj_2D_anno_frame = object_anno_2D[frame_idx] + # process 2D anno + obj_ids, counts = np.unique(obj_2D_anno_frame, return_counts=True) + for idx in range(len(obj_ids)): + obj_id = obj_ids[idx] + count = counts[idx] + if obj_id == self.undefined: + continue + + if obj_id not in object_image_votes: + object_image_votes[obj_id] = {} + if frame_idx not in object_image_votes[obj_id]: + object_image_votes[obj_id][frame_idx] = 0 + object_image_votes[obj_id][frame_idx] = count + + # select top K frames for each obj + object_image_votes_topK = {} + for obj_id in object_image_votes: + object_image_votes_topK[obj_id] = [] + obj_image_votes_f = object_image_votes[obj_id] + sorted_frame_idxs = sorted(obj_image_votes_f, key=obj_image_votes_f.get, reverse=True) + if len(sorted_frame_idxs) > self.top_k: + object_image_votes_topK[obj_id] = sorted_frame_idxs[:self.top_k] + else: + object_image_votes_topK[obj_id] = sorted_frame_idxs + + object_ids_in_image_votes = list(object_image_votes_topK.keys()) + for obj_id in object_ids_in_image_votes: + if obj_id not in list(obj_id_to_label_id_map.keys()): + del object_image_votes_topK[obj_id] + + assert len(list(obj_id_to_label_id_map.keys())) >= len(list(object_image_votes_topK.keys())), 'Mapped < Found' + + object_image_embeddings = {} + for object_id in object_image_votes_topK: + object_image_votes_topK_frames = object_image_votes_topK[object_id] + object_image_embeddings[object_id] = {} + + for frame_idx in object_image_votes_topK_frames: + image_path = osp.join(scene_folder, 'sequence', f'frame-{frame_idx}.color.jpg') + color_img = Image.open(image_path) + object_image_embeddings[object_id][frame_idx] = self.computeImageFeaturesEachObject(color_img, object_id, object_anno_2D[frame_idx]) + + return object_image_embeddings, object_image_votes_topK, object_anno_2D.keys() + + def computeImageFeaturesEachObject(self, image, object_id, object_anno_2d): + # load image + object_mask = object_anno_2d == object_id + + images_crops = [] + for level in range(self.num_levels): + mask_tensor = torch.from_numpy(object_mask).float() + x1, y1, x2, y2 = image_util.mask2box_multi_level(mask_tensor, level) + cropped_img = image.crop((x1, y1, x2, y2)) + cropped_img = cropped_img.resize((self.model_image_size[1], self.model_image_size[1]), Image.BICUBIC) + img_pt = self.model.base_tf(cropped_img) + images_crops.append(img_pt) + # images_crops.append(cropped_img) + + + if(len(images_crops) > 0): + mean_feats 
= self.extractFeatures(images_crops, return_only_cls_mean = True) + return mean_feats \ No newline at end of file diff --git a/preprocess/feat2D/structured3d.py b/preprocess/feat2D/structured3d.py new file mode 100644 index 0000000..9893260 --- /dev/null +++ b/preprocess/feat2D/structured3d.py @@ -0,0 +1,264 @@ +import os.path as osp +import open3d as o3d +import numpy as np +import torch +from tqdm import tqdm +import shutil +from PIL import Image +from scipy.spatial.transform import Rotation as R +import cv2 +from common import load_utils +from util import render, structured3d, visualisation +from util import image as image_util +import os +from preprocess.build import PROCESSOR_REGISTRY +from preprocess.feat2D.base import Base2DProcessor + + +@PROCESSOR_REGISTRY.register() +class Structured3D_2DProcessor(Base2DProcessor): + def __init__(self, config_data, config_2D, split) -> None: + super(Structured3D_2DProcessor, self).__init__(config_data, config_2D, split) + self.data_dir = config_data.base_dir + files_dir = osp.join(config_data.base_dir, 'files') + self.split = split + + self.scan_ids = [] + self.scan_ids = structured3d.get_scan_ids(files_dir, split) + + self.out_dir = config_data.process_dir + load_utils.ensure_dir(self.out_dir) + + self.model_image_size = config_2D.image.model_size + + self.frame_skip = config_data.skip_frames + self.top_k = config_2D.image.top_k + self.num_levels = config_2D.image.num_levels + + + # get frame_indexes + self.frame_pose_data = {} + for scan_id in self.scan_ids: + full_scan_id = scan_id + scan_id = scan_id.split('_') + room_id = scan_id[-1] + scan_id = scan_id[0]+'_'+scan_id[1] + scene_folder = osp.join(self.data_dir, 'scans', scan_id, '2D_rendering', room_id, 'perspective', 'full') + frame_idxs = [f for f in os.listdir(scene_folder) if f[0] != '.' 
and f[0] != 'g'] + pose_data = structured3d.load_all_poses(scene_folder, frame_idxs) + self.frame_pose_data[full_scan_id] = pose_data + + + def compute2DFeatures(self): + for scan_id in tqdm(self.scan_ids): + self.compute2DImagesAndSeg(scan_id) + self.compute2DFeaturesEachScan(scan_id) + # if self.split == 'val': + # self.computeAllImageFeaturesEachScan(scan_id) + + def compute2DImagesAndSeg(self, scan_id): + full_scan_id = scan_id + scan_id = scan_id.split('_') + room_id = scan_id[-1] + scan_id = scan_id[0]+'_'+scan_id[1] + scene_folder = osp.join(self.data_dir, 'scans', scan_id,'2D_rendering', room_id, 'perspective', 'full') + + obj_id_imgs = {} + for frame_idx in self.frame_pose_data[full_scan_id]: + image_path=osp.join(scene_folder, frame_idx, 'instance.png') + obj_id_map = cv2.imread(image_path, cv2.IMREAD_UNCHANGED) + obj_id_imgs[frame_idx] = obj_id_map + + if osp.exists(osp.join(scene_folder, 'gt-projection')): + shutil.rmtree(osp.join(scene_folder, 'gt-projection')) + + # torch.save(obj_id_imgs, osp.join(scene_folder, 'gt-projection-seg.pt')) + np.savez_compressed(osp.join(scene_folder,'gt-projection-seg.npz'),**obj_id_imgs) + + def compute2DFeaturesEachScan(self, scan_id): + full_scan_id = scan_id + scan_id = scan_id.split('_') + room_id = scan_id[-1] + scan_id = scan_id[0]+'_'+scan_id[1] + scene_folder = osp.join(self.data_dir, 'scans', scan_id,'2D_rendering', room_id, 'perspective', 'full') + + scene_out_dir = osp.join(self.out_dir, full_scan_id) + load_utils.ensure_dir(scene_out_dir) + + # obj_id_to_label_id_map = torch.load(osp.join(scene_out_dir, 'object_id_to_label_id_map.pt'))['obj_id_to_label_id_map'] + obj_id_to_label_id_map = np.load(osp.join(scene_out_dir, 'object_id_to_label_id_map.npz'),allow_pickle=True)['obj_id_to_label_id_map'].item() + + floorplan_img_path = osp.join(self.data_dir,'scans', scan_id, 'floorplans', f'{room_id}.png') + floorplan_img = cv2.imread(floorplan_img_path) + floorplan_img = cv2.cvtColor(floorplan_img, cv2.COLOR_BGR2RGB) + floorplan_img = cv2.cvtColor(floorplan_img, cv2.COLOR_RGB2GRAY) + floorplan_img = cv2.cvtColor(floorplan_img, cv2.COLOR_GRAY2RGB) + floorplan_img = image_util.crop_image(floorplan_img, floorplan_img_path.replace('.png', '_cropped.png')) + floorplan_embeddings = None + + if floorplan_img is not None: + floorplan_img = floorplan_img.resize((self.model_image_size[1], self.model_image_size[0]), Image.BICUBIC) + floorplan_img_pt = self.model.base_tf(floorplan_img) + floorplan_embeddings = self.extractFeatures([floorplan_img_pt], return_only_cls_mean = False) + floorplan_dict = {'img' : floorplan_img, 'embedding' : floorplan_embeddings} + # print(floorplan_dict) + # Multi-view Image -- Object (Embeddings) + object_image_embeddings, object_image_votes_topK, frame_idxs = self.computeImageFeaturesAllObjectsEachScan(scene_folder, obj_id_to_label_id_map) + + # Multi-view Image -- Scene (Images + Embeddings) + frame_idxs = list(self.frame_pose_data[full_scan_id].keys()) + pose_data, scene_images_pt, scene_image_embeddings, sampled_frame_idxs = self.computeSelectedImageFeaturesEachScan(full_scan_id, scene_folder, frame_idxs) + + # Visualise + # camera_info = structured3d.load_intrinsics(scene_folder) + # intrinsic_mat = camera_info['intrinsic_mat'] + + # scene_mesh = o3d.io.read_triangle_mesh(osp.join(self.data_dir, 'scans', scan_id, '3D_rendering', room_id,'room_mesh.ply')) + # intrinsics = { 'f' : intrinsic_mat[0, 0], 'cx' : intrinsic_mat[0, 2], 'cy' : intrinsic_mat[1, 2], + # 'w' : int(camera_info['width']), 'h' : 
int(camera_info['height'])} + + # cams_visualised_on_mesh = visualisation.visualise_camera_on_mesh(scene_mesh, pose_data[sampled_frame_idxs], intrinsics, stride=1) + # image_path = osp.join(scene_out_dir, 'sel_cams_on_mesh.png') + # Image.fromarray((cams_visualised_on_mesh * 255).astype(np.uint8)).save(image_path) + + data2D = {} + data2D['objects'] = {'image_embeddings': object_image_embeddings, 'topK_images_votes' : object_image_votes_topK} + data2D['scene'] = {'scene_embeddings': scene_image_embeddings, 'images' : scene_images_pt, + 'frame_idxs' : frame_idxs, 'sampled_cam_idxs' : sampled_frame_idxs} + + data2D['scene']['floorplan'] = floorplan_dict + + # torch.save(data2D, osp.join(scene_out_dir, 'data2D.pt')) + np.savez_compressed(osp.join(scene_out_dir, 'data2D.npz'), **data2D) + + # def computeAllImageFeaturesEachScan(self, scan_id): + # scene_folder = osp.join(self.data_dir, 'scenes', scan_id) + # color_path = osp.join(scene_folder, 'sequence') + # scene_out_dir = osp.join(self.out_dir, scan_id) + # load_utils.ensure_dir(scene_out_dir) + + # frame_idxs = list(self.frame_pose_data[scan_id].keys()) + + # # Extract Scene Image Features + # scene_images_pt = [] + # scene_image_embeddings = [] + # for frame_index in frame_idxs: + # image = Image.open(osp.join(color_path, f'frame-{frame_index}.color.jpg')) + # image = image.resize((self.model_image_size[1], self.model_image_size[0]), Image.BICUBIC) + # image_pt = self.model.base_tf(image) + # # image_pt = torch.zeros(1, 1536) + + # scene_image_embeddings.append(self.extractFeatures([image_pt], return_only_cls_mean= False)) + # scene_images_pt.append(image_pt) + # scene_image_embeddings = np.concatenate(scene_image_embeddings) + # data2D = {} + # data2D['scene'] = {'scene_embeddings': scene_image_embeddings, 'images' : scene_images_pt, + # 'frame_idxs' : frame_idxs} + # torch.save(data2D, osp.join(scene_out_dir, 'data2D_all_images.pt')) + # np.savez_compressed(osp.join(scene_out_dir, 'data2D_all_images.npz'), **data2D) + + def computeSelectedImageFeaturesEachScan(self, scan_id, color_path, frame_idxs): + # Sample Camera Indexes Based on Rotation Matrix From Grid + pose_data = [] + for frame_idx in frame_idxs: + pose = self.frame_pose_data[scan_id][frame_idx] + rot_quat = R.from_matrix(pose[:3, :3]).as_quat() + trans = pose[:3, 3] + pose_data.append([trans[0], trans[1], trans[2], rot_quat[0], rot_quat[1], rot_quat[2], rot_quat[3]]) + + pose_data = np.array(pose_data) + + sampled_frame_idxs = image_util.sample_camera_pos_on_grid(pose_data) + # print(sampled_frame_idxs) + scene_images_pt = [] + for idx in sampled_frame_idxs: + frame_index = frame_idxs[idx] + + image = Image.open(osp.join(color_path, frame_index, f'rgb_rawlight.png')) + image = image.convert('RGB') + image = image.resize((self.model_image_size[1], self.model_image_size[0]), Image.BICUBIC) + image_pt = self.model.base_tf(image) + scene_images_pt.append(image_pt) + + + scene_image_embeddings = self.extractFeatures(scene_images_pt, return_only_cls_mean= False) + + return pose_data, scene_images_pt, scene_image_embeddings, sampled_frame_idxs + + def computeImageFeaturesAllObjectsEachScan(self, scene_folder, obj_id_to_label_id_map): + # object_anno_2D = torch.load(osp.join(scene_folder, 'gt-projection-seg.pt')) + object_anno_2D = np.load(osp.join(scene_folder, 'gt-projection-seg.npz'),allow_pickle=True) + object_image_votes = {} + + # iterate over all frames + for frame_idx in object_anno_2D: + obj_2D_anno_frame = object_anno_2D[frame_idx] + # process 2D anno + obj_ids, counts = 
np.unique(obj_2D_anno_frame, return_counts=True) + for idx in range(len(obj_ids)): + obj_id = obj_ids[idx] + count = counts[idx] + + if obj_id not in object_image_votes: + object_image_votes[obj_id] = {} + if frame_idx not in object_image_votes[obj_id]: + object_image_votes[obj_id][frame_idx] = 0 + object_image_votes[obj_id][frame_idx] = count + + # select top K frames for each obj + object_image_votes_topK = {} + for obj_id in object_image_votes: + object_image_votes_topK[obj_id] = [] + obj_image_votes_f = object_image_votes[obj_id] + sorted_frame_idxs = sorted(obj_image_votes_f, key=obj_image_votes_f.get, reverse=True) + if len(sorted_frame_idxs) > self.top_k: + object_image_votes_topK[obj_id] = sorted_frame_idxs[:self.top_k] + else: + object_image_votes_topK[obj_id] = sorted_frame_idxs + + object_ids_in_image_votes = list(object_image_votes_topK.keys()) + for obj_id in object_ids_in_image_votes: + if obj_id not in list(obj_id_to_label_id_map.keys()): + del object_image_votes_topK[obj_id] + + assert len(list(obj_id_to_label_id_map.keys())) >= len(list(object_image_votes_topK.keys())), 'Mapped < Found' + + object_image_embeddings = {} + for object_id in object_image_votes_topK: + object_image_votes_topK_frames = object_image_votes_topK[object_id] + object_image_embeddings[object_id] = {} + + for frame_idx in object_image_votes_topK_frames: + image_path = osp.join(scene_folder, frame_idx, 'rgb_rawlight.png') + # print(image_path) + color_img = Image.open(image_path) + # print(color_img.mode) + color_img = color_img.convert('RGB') + object_image_embeddings[object_id][frame_idx] = self.computeImageFeaturesEachObject(color_img, object_id, object_anno_2D[frame_idx]) + + return object_image_embeddings, object_image_votes_topK, object_anno_2D.keys() + + def computeImageFeaturesEachObject(self, image, object_id, object_anno_2d): + # print(np.array(image).shape) + object_anno_2d = object_anno_2d.transpose(1, 0) + object_anno_2d = np.flip(object_anno_2d, 1) + + # load image + object_mask = object_anno_2d == object_id + + images_crops = [] + for level in range(self.num_levels): + mask_tensor = torch.from_numpy(object_mask).float() + x1, y1, x2, y2 = image_util.mask2box_multi_level(mask_tensor, level) + cropped_img = image.crop((x1, y1, x2, y2)) + # print(np.array(cropped_img).shape) + cropped_img = cropped_img.resize((self.model_image_size[1], self.model_image_size[1]), Image.BICUBIC) + img_pt = self.model.base_tf(cropped_img) + images_crops.append(img_pt) + # images_crops.append(cropped_img) + + + if(len(images_crops) > 0): + mean_feats = self.extractFeatures(images_crops, return_only_cls_mean = True) + return mean_feats + diff --git a/preprocess/feat3D/__init__.py b/preprocess/feat3D/__init__.py index 9a1b744..7db5e81 100644 --- a/preprocess/feat3D/__init__.py +++ b/preprocess/feat3D/__init__.py @@ -1,2 +1,5 @@ from .scannet import * -from .scan3r import * \ No newline at end of file +from .scan3r import * +from .arkit import * +from .multiscan import * +from .structured3d import * \ No newline at end of file diff --git a/preprocess/feat3D/arkit.py b/preprocess/feat3D/arkit.py new file mode 100644 index 0000000..6172204 --- /dev/null +++ b/preprocess/feat3D/arkit.py @@ -0,0 +1,97 @@ +import os.path as osp +import open3d as o3d +import numpy as np +import torch +from tqdm import tqdm + +from common import load_utils +from util import point_cloud, arkit +from util.arkit import ARKITSCENE_SCANNET +from preprocess.build import PROCESSOR_REGISTRY +from preprocess.feat3D.base import Base3DProcessor + 
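+# ARKitScenes 3D preprocessing: map each annotated object label to its NYU40 id via ARKITSCENE_SCANNET, extract per-object point clouds from the instance-labelled mesh, compute their features, and save data3D.pt together with the object-id-to-label-id map.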
+@PROCESSOR_REGISTRY.register() +class ARKitScenes3DProcessor(Base3DProcessor): + def __init__(self, config_data, config_3D, split) -> None: + super(ARKitScenes3DProcessor, self).__init__(config_data, config_3D, split) + self.data_dir = config_data.base_dir + + files_dir = osp.join(config_data.base_dir, 'files') + + self.scan_ids = [] + self.scan_ids = arkit.get_scan_ids(files_dir, split) + + self.out_dir = osp.join(config_data.process_dir, 'scans') + load_utils.ensure_dir(self.out_dir) + self.label_map = arkit.read_label_map(files_dir, label_from = 'raw_category', label_to = 'nyu40id') + + self.undefined = 0 + + + def load_objects_for_scan(self, scan_id): + """Load and parse the annotations JSON for the given scan ID.""" + objects_path = osp.join(self.data_dir, 'scans', scan_id, f"{scan_id}_3dod_annotation.json") + if not osp.exists(objects_path): + raise FileNotFoundError(f"Annotations file not found for scan ID: {scan_id}") + + annotations = load_utils.load_json(objects_path) + + objects = [] + for _i, label_info in enumerate(annotations["data"]): + obj_label = label_info["label"] + object_id = _i + 1 + scannet_class=ARKITSCENE_SCANNET[obj_label] + nyu40id=self.label_map[scannet_class] + objects.append({ + "objectId": object_id, + "global_id": nyu40id + }) + + + return objects + + def compute3DFeaturesEachScan(self, scan_id): + objects_path = osp.join(self.data_dir, 'scans', scan_id, f"{scan_id}_3dod_annotation.json") + if not osp.exists(objects_path): + raise FileNotFoundError(f"Annotations file not found for scan ID: {scan_id}") + + annotations = load_utils.load_json(objects_path) + ply_data = arkit.load_ply_data(osp.join(self.data_dir, 'scans'), scan_id, annotations) + mesh_points = np.stack([ply_data['x'], ply_data['y'], ply_data['z']]).transpose((1, 0)) + + mesh = o3d.io.read_triangle_mesh(osp.join(self.data_dir, 'scans', scan_id,'{}_3dod_mesh.ply'.format(scan_id))) + mesh_colors = np.asarray(mesh.vertex_colors)*255.0 + mesh_colors = mesh_colors.round() + + + scan_objects=self.load_objects_for_scan(scan_id) + + object_pcl_embeddings, object_cad_embeddings = {}, {} + object_id_to_label_id = {} + for idx, scan_object in enumerate(scan_objects): + instance_id = int(scan_object['objectId']) + global_object_id = scan_object['global_id'] + + object_pcl = mesh_points[np.where(ply_data['objectId'] == instance_id)] + + if object_pcl.shape[0] <= self.config_3D.min_points_per_object: + continue + + assert instance_id not in object_id_to_label_id + object_id_to_label_id[instance_id] = global_object_id + + if object_pcl.shape[0] >= self.config_3D.min_points_per_object: + object_pcl_embeddings[instance_id] = self.normalizeObjectPCLAndExtractFeats(object_pcl) + + data3D = {} + data3D['objects'] = {'pcl_embeddings' : object_pcl_embeddings, 'cad_embeddings': object_cad_embeddings} + data3D['scene'] = {'pcl_coords': mesh_points[ply_data['objectId'] != self.undefined], 'pcl_feats': mesh_colors[ply_data['objectId'] != self.undefined], 'scene_label' : None} + + object_id_to_label_id_map = { 'obj_id_to_label_id_map' : object_id_to_label_id} + + assert len(list(object_id_to_label_id.keys())) >= len(list(object_pcl_embeddings.keys())), 'PC does not match for {}'.format(scan_id) + scene_out_dir = osp.join(self.out_dir, scan_id) + load_utils.ensure_dir(scene_out_dir) + + torch.save(data3D, osp.join(scene_out_dir, 'data3D.pt')) + torch.save(object_id_to_label_id_map, osp.join(scene_out_dir, 'object_id_to_label_id_map.pt')) \ No newline at end of file diff --git a/preprocess/feat3D/multiscan.py 
b/preprocess/feat3D/multiscan.py new file mode 100644 index 0000000..68ba025 --- /dev/null +++ b/preprocess/feat3D/multiscan.py @@ -0,0 +1,94 @@ +import os.path as osp +import open3d as o3d +import numpy as np +import torch +from tqdm import tqdm + +from common import load_utils +from util import point_cloud, multiscan +from util.multiscan import MULTISCAN_SCANNET +from preprocess.build import PROCESSOR_REGISTRY +from preprocess.feat3D.base import Base3DProcessor + +@PROCESSOR_REGISTRY.register() +class MultiScan3DProcessor(Base3DProcessor): + def __init__(self, config_data, config_3D, split) -> None: + super(MultiScan3DProcessor, self).__init__(config_data, config_3D, split) + self.data_dir = config_data.base_dir + + files_dir = osp.join(config_data.base_dir, 'files') + + self.scan_ids = [] + self.scan_ids = multiscan.get_scan_ids(files_dir, split) + + self.out_dir = osp.join(config_data.process_dir, 'scans') + load_utils.ensure_dir(self.out_dir) + self.label_map = multiscan.read_label_map(files_dir, label_from = 'raw_category', label_to = 'nyu40id') + + self.undefined = 0 + + + def load_objects_for_scan(self, scan_id): + """Load and parse the annotations JSON for the given scan ID.""" + objects_path = osp.join(self.data_dir, 'scenes', scan_id, f"{scan_id}.annotations.json") + if not osp.exists(objects_path): + raise FileNotFoundError(f"Annotations file not found for scan ID: {scan_id}") + + annotations = load_utils.load_json(objects_path) + + objects = [] + + for obj in annotations["objects"]: + object_id=obj["objectId"] + objectName=obj["label"].split('.')[0] + scannet_class=MULTISCAN_SCANNET[objectName] + nyu40id=self.label_map[scannet_class] + objects.append({ + "objectId": object_id, + "global_id": nyu40id + }) + + return objects + + + + def compute3DFeaturesEachScan(self, scan_id): + ply_data = multiscan.load_ply_data(osp.join(self.data_dir, 'scenes'), scan_id) + mesh_points = np.stack([ply_data['x'], ply_data['y'], ply_data['z']]).transpose((1, 0)) + + mesh = o3d.io.read_triangle_mesh(osp.join(self.data_dir, 'scenes', scan_id,'{}.ply'.format(scan_id))) + mesh_colors = np.asarray(mesh.vertex_colors)*255.0 + mesh_colors = mesh_colors.round() + + scan_objects=self.load_objects_for_scan(scan_id) + + object_pcl_embeddings, object_cad_embeddings = {}, {} + object_id_to_label_id = {} + for idx, scan_object in enumerate(scan_objects): + instance_id = int(scan_object['objectId']) + global_object_id = scan_object['global_id'] + + object_pcl = mesh_points[np.where(ply_data['objectId'] == instance_id)] + + if object_pcl.shape[0] <= self.config_3D.min_points_per_object: + continue + + assert instance_id not in object_id_to_label_id + object_id_to_label_id[instance_id] = global_object_id + + if object_pcl.shape[0] >= self.config_3D.min_points_per_object: + object_pcl_embeddings[instance_id] = self.normalizeObjectPCLAndExtractFeats(object_pcl) + + data3D = {} + data3D['objects'] = {'pcl_embeddings' : object_pcl_embeddings, 'cad_embeddings': object_cad_embeddings} + data3D['scene'] = {'pcl_coords': mesh_points[ply_data['objectId'] != self.undefined], 'pcl_feats': mesh_colors[ply_data['objectId'] != self.undefined], 'scene_label' : None} + + object_id_to_label_id_map = { 'obj_id_to_label_id_map' : object_id_to_label_id} + + assert len(list(object_id_to_label_id.keys())) >= len(list(object_pcl_embeddings.keys())), 'PC does not match for {}'.format(scan_id) + scene_out_dir = osp.join(self.out_dir, scan_id) + load_utils.ensure_dir(scene_out_dir) + + torch.save(data3D, osp.join(scene_out_dir, 
'data3D.pt')) + torch.save(object_id_to_label_id_map, osp.join(scene_out_dir, 'object_id_to_label_id_map.pt')) + \ No newline at end of file diff --git a/preprocess/feat3D/structured3d.py b/preprocess/feat3D/structured3d.py new file mode 100644 index 0000000..26fad6e --- /dev/null +++ b/preprocess/feat3D/structured3d.py @@ -0,0 +1,96 @@ +import os.path as osp +import open3d as o3d +import numpy as np +import torch +from tqdm import tqdm +import json +from common import load_utils +from util import structured3d +from preprocess.build import PROCESSOR_REGISTRY +from preprocess.feat3D.base import Base3DProcessor + +@PROCESSOR_REGISTRY.register() +class Structured3D_3DProcessor(Base3DProcessor): + def __init__(self, config_data, config_3D, split) -> None: + super(Structured3D_3DProcessor, self).__init__(config_data, config_3D, split) + self.data_dir = config_data.base_dir + + files_dir = osp.join(config_data.base_dir, 'files') + + self.scan_ids = [] + self.scan_ids = structured3d.get_scan_ids(files_dir, split) + + self.out_dir = config_data.process_dir + load_utils.ensure_dir(self.out_dir) + # self.undefined = 0 + + def compute3DFeaturesEachScan(self, scan_id): + scan_id = scan_id.split('_') + room_id = scan_id[-1] + scan_id = scan_id[0]+'_'+scan_id[1] + ply_data = structured3d.load_ply_data(osp.join(self.data_dir, 'scans'), scan_id, room_id) + mesh_points = np.stack([ply_data['x'], ply_data['y'], ply_data['z']]).transpose((1, 0)) + + # mesh = o3d.io.read_triangle_mesh(osp.join(self.data_dir, 'scans', scan_id, '3D_rendering', room_id, 'room_mesh.ply')) + # mesh_colors = np.asarray(mesh.vertex_colors)*255.0 + mesh_colors = np.stack([ply_data['red'], ply_data['green'], ply_data['blue']]).transpose((1, 0)) + # print(mesh_colors) + + # mesh_colors = mesh_colors.round() + object_ids = ply_data['objectId'] + unique_objects = np.unique(object_ids) + # print(unique_objects) + semantic_ids = ply_data['nyu40id'] + + scene_label = None + with open(osp.join(self.data_dir, 'scans', scan_id, 'annotation_3d.json')) as file: + annotations = json.load(file) + + for annos in annotations['semantics']: + if annos['ID'] == int(room_id): + scene_label = annos['type'].strip() + break + + + object_pcl_embeddings, object_cad_embeddings = {}, {} + object_id_to_label_id = {} + + for idx, instance_id in enumerate(unique_objects): + object_pcl=mesh_points[np.where(ply_data['objectId'] == instance_id)] + if object_pcl.shape[0] <= self.config_3D.min_points_per_object: + continue + + assert instance_id not in object_id_to_label_id + # first_point_idx = np.where(object_ids == instance_id)[0][0] + # nyu40id = semantic_ids[first_point_idx] + # object_id_to_label_id[instance_id] = nyu40id + # Find the most common nyu40id for this object + all_point_indices = np.where(object_ids == instance_id)[0] + nyu40ids_for_object = semantic_ids[all_point_indices] + unique_ids, counts = np.unique(nyu40ids_for_object, return_counts=True) + nyu40id = unique_ids[np.argmax(counts)] + object_id_to_label_id[instance_id] = nyu40id + # if instance_id==0: + # print(nyu40id) + + if object_pcl.shape[0] >= self.config_3D.min_points_per_object: + object_pcl_embeddings[instance_id] = self.normalizeObjectPCLAndExtractFeats(object_pcl) + else: + print("Object {} has less than {} points".format(instance_id, self.config_3D.min_points_per_object)) + + # print(scene_label) + data3D = {} + data3D['objects'] = {'pcl_embeddings' : object_pcl_embeddings, 'cad_embeddings': object_cad_embeddings} + data3D['scene'] = {'pcl_coords': mesh_points, 'pcl_feats': 
mesh_colors, 'scene_label' : scene_label} + # print(object_id_to_label_id) + object_id_to_label_id_map = { 'obj_id_to_label_id_map' : object_id_to_label_id} + + assert len(list(object_id_to_label_id.keys())) >= len(list(object_pcl_embeddings.keys())), 'PC does not match for {}'.format(scan_id) + scene_out_dir = osp.join(self.out_dir, scan_id+'_'+room_id) + load_utils.ensure_dir(scene_out_dir) + + # torch.save(data3D, osp.join(scene_out_dir, 'data3D.pt')) + # torch.save(object_id_to_label_id_map, osp.join(scene_out_dir, 'object_id_to_label_id_map.pt')) + np.savez_compressed(osp.join(scene_out_dir, 'data3D.npz'), **data3D) + np.savez_compressed(osp.join(scene_out_dir, 'object_id_to_label_id_map.npz'), **object_id_to_label_id_map) + diff --git a/preprocess/multimodal_preprocess.py b/preprocess/multimodal_preprocess.py index 822135d..9070228 100644 --- a/preprocess/multimodal_preprocess.py +++ b/preprocess/multimodal_preprocess.py @@ -8,9 +8,9 @@ import h5py from common import load_utils from common.constants import ModalityType -from util import scan3r, scannet +from util import scan3r, scannet, arkit, multiscan, structured3d from typing import Dict, Optional - +import os from preprocess.build import PROCESSOR_REGISTRY @PROCESSOR_REGISTRY.register() @@ -33,6 +33,12 @@ def __init__(self, config_data: DictConfig, modality_config: DictConfig, split: self.scan_ids = scannet.get_scan_ids(self.files_dir, self.split) elif self.dataset_name == 'Scan3R': self.scan_ids = scan3r.get_scan_ids(self.files_dir, self.split) + elif self.dataset_name == 'ARKitScenes': + self.scan_ids = arkit.get_scan_ids(self.files_dir, self.split) + elif self.dataset_name == 'MultiScan': + self.scan_ids = multiscan.get_scan_ids(self.files_dir, self.split) + elif self.dataset_name == 'Structured3D': + self.scan_ids = structured3d.get_scan_ids(self.files_dir, self.split) else: raise NotImplementedError @@ -71,18 +77,20 @@ def prepareObjectWiseDataEachScan(self, data2D: Optional[Dict] = None, data3D: Optional[Dict] = None) -> Dict: """Process object-wise data for a single scan combining features from all modalities.""" - object_id_to_label_id_map = torch.load(osp.join(out_dir, 'object_id_to_label_id_map.pt'))['obj_id_to_label_id_map'] + # object_id_to_label_id_map = torch.load(osp.join(out_dir, 'object_id_to_label_id_map.pt'))['obj_id_to_label_id_map'] + object_id_to_label_id_map = np.load(osp.join(out_dir, 'object_id_to_label_id_map.npz'),allow_pickle=True)['obj_id_to_label_id_map'].item() + map_object_ids = list(object_id_to_label_id_map.keys()) precomputed_feats, inputs = {}, {} if data3D is not None: - precomputed_feats[ModalityType.POINT] = data3D['objects']['pcl_embeddings'] - precomputed_feats[ModalityType.CAD] = data3D['objects']['cad_embeddings'] + precomputed_feats[ModalityType.POINT] = data3D['objects'].item()['pcl_embeddings'] + precomputed_feats[ModalityType.CAD] = data3D['objects'].item()['cad_embeddings'] if data2D is not None: - precomputed_feats[ModalityType.RGB] = data2D['objects']['image_embeddings'] + precomputed_feats[ModalityType.RGB] = data2D['objects'].item()['image_embeddings'] if data1D is not None: - precomputed_feats[ModalityType.REF] = data1D['objects']['referral_embeddings'] + precomputed_feats[ModalityType.REF] = data1D['objects'].item()['referral_embeddings'] object_ids = [] for modalityType in ModalityType.__dict__.values(): @@ -137,19 +145,27 @@ def prepareObjectWiseDataEachScan(self, 'object_id2idx' : object_id2idx, 'object_id_to_label_id_map' : object_id_to_label_id_map, 'object_ids' : 
object_ids, - 'topK_images_votes' : data2D['objects']['topK_images_votes'] + 'topK_images_votes' : data2D['objects'].item()['topK_images_votes'] } - - torch.save(objects_data_pt, osp.join(out_dir, 'objectsDataMultimodal.pt')) + pt_multimodal_path = osp.join(out_dir, 'objectsDataMultimodal.pt') + if osp.exists(pt_multimodal_path): + os.remove(pt_multimodal_path) + # torch.save(objects_data_pt, osp.join(out_dir, 'objectsDataMultimodal.pt')) + np.savez_compressed(osp.join(out_dir, 'objectsDataMultimodal.npz'), **objects_data_pt) return objects_data_pt def prepareDataEachScan(self, scan_id: str, hf_handler: h5py.File) -> None: """Process data for a single scan and store it in the HDF5 file.""" out_dir = osp.join(self.out_dir, scan_id) - data1D = torch.load(osp.join(out_dir, 'data1D.pt')) - data2D = torch.load(osp.join(out_dir, 'data2D.pt')) - data3D = torch.load(osp.join(out_dir, 'data3D.pt')) + # data1D = torch.load(osp.join(out_dir, 'data1D.pt')) + data1D = np.load(osp.join(out_dir, 'data1D.npz'),allow_pickle=True) + + # data2D = torch.load(osp.join(out_dir, 'data2D.pt')) + data2D = np.load(osp.join(out_dir, 'data2D.npz'),allow_pickle=True) + + # data3D = torch.load(osp.join(out_dir, 'data3D.pt')) + data3D = np.load(osp.join(out_dir, 'data3D.npz'),allow_pickle=True) objects_data_pt = self.prepareObjectWiseDataEachScan(out_dir, data1D, data2D, data3D) self.dumpEachObjectDataPerScan(scan_id, objects_data_pt, hf_handler) @@ -182,4 +198,4 @@ def dumpEachObjectDataPerScan(self, def run(self) -> None: """Execute the complete preprocessing pipeline.""" - self.prepareData() + self.prepareData() \ No newline at end of file diff --git a/retrieval/object_retrieval.py b/retrieval/object_retrieval.py index 54c144f..526e5a2 100644 --- a/retrieval/object_retrieval.py +++ b/retrieval/object_retrieval.py @@ -293,6 +293,6 @@ def run(self) -> None: # Object Retrieval Evaluation self.eval(output_dict) - self.logger.info('Scene Retrieval Evaluation (Instance Baseline)...') + self.logger.info('Scene Retrieval Evaluation (Instance CrossOver)...') # Scene Retrieval Evaluation self.scene_eval(output_dict) \ No newline at end of file diff --git a/scripts/preprocess/process_arkit.sh b/scripts/preprocess/process_arkit.sh new file mode 100644 index 0000000..3acdb4a --- /dev/null +++ b/scripts/preprocess/process_arkit.sh @@ -0,0 +1,9 @@ +export PYTHONWARNINGS="ignore" + +# Preprocessing Object Level + Scene Level + Unified Data +python3 preprocessor.py --config-path "$(pwd)/configs/preprocess/" --config-name process_3d.yaml data.sources=['ARKitScenes'] hydra.run.dir=. hydra.output_subdir=null +python3 preprocessor.py --config-path "$(pwd)/configs/preprocess/" --config-name process_2d.yaml data.sources=['ARKitScenes'] hydra.run.dir=. hydra.output_subdir=null +python3 preprocessor.py --config-path "$(pwd)/configs/preprocess/" --config-name process_1d.yaml data.sources=['ARKitScenes'] hydra.run.dir=. hydra.output_subdir=null + +# Multi-modal dumping +python3 preprocessor.py --config-path "$(pwd)/configs/preprocess/" --config-name process_multimodal.yaml data.sources=['ARKitScenes'] hydra.run.dir=. 
hydra.output_subdir=null diff --git a/scripts/preprocess/process_multiscan.sh b/scripts/preprocess/process_multiscan.sh new file mode 100644 index 0000000..a13a93c --- /dev/null +++ b/scripts/preprocess/process_multiscan.sh @@ -0,0 +1,9 @@ +export PYTHONWARNINGS="ignore" + +# Preprocessing Object Level + Scene Level + Unified Data +python preprocessor.py --config-path "$(pwd)/configs/preprocess" --config-name process_3d.yaml data.sources=['MultiScan'] hydra.run.dir=. hydra.output_subdir=null +python preprocessor.py --config-path "$(pwd)/configs/preprocess" --config-name process_2d.yaml data.sources=['MultiScan'] hydra.run.dir=. hydra.output_subdir=null +python preprocessor.py --config-path "$(pwd)/configs/preprocess" --config-name process_1d.yaml data.sources=['MultiScan'] hydra.run.dir=. hydra.output_subdir=null + +# Multi-modal dumping +python preprocessor.py --config-path "$(pwd)/configs/preprocess" --config-name process_multimodal.yaml data.sources=['MultiScan'] hydra.run.dir=. hydra.output_subdir=null \ No newline at end of file diff --git a/scripts/preprocess/process_scan3r.sh b/scripts/preprocess/process_scan3r.sh index 6d8a981..5ac2b71 100644 --- a/scripts/preprocess/process_scan3r.sh +++ b/scripts/preprocess/process_scan3r.sh @@ -1,9 +1,8 @@ export PYTHONWARNINGS="ignore" # Preprocessing Object Level + Scene Level + Unified Data -# python preprocessor.py --config-path "$(pwd)/configs/preprocess/" --config-name process_3d.yaml data.sources=['Scan3R'] hydra.run.dir=. hydra.output_subdir=null -# python preprocessor.py --config-path "$(pwd)/configs/preprocess/" --config-name process_2d.yaml data.sources=['Scan3R'] hydra.run.dir=. hydra.output_subdir=null -# python preprocessor.py --config-path "$(pwd)/configs/preprocess/" --config-name process_1d.yaml data.sources=['Scan3R'] hydra.run.dir=. hydra.output_subdir=null - +python preprocessor.py --config-path "$(pwd)/configs/preprocess/" --config-name process_3d.yaml data.sources=['Scan3R'] hydra.run.dir=. hydra.output_subdir=null +python preprocessor.py --config-path "$(pwd)/configs/preprocess/" --config-name process_2d.yaml data.sources=['Scan3R'] hydra.run.dir=. hydra.output_subdir=null +python preprocessor.py --config-path "$(pwd)/configs/preprocess/" --config-name process_1d.yaml data.sources=['Scan3R'] hydra.run.dir=. hydra.output_subdir=null # Multi-modal dumping python preprocessor.py --config-path "$(pwd)/configs/preprocess/" --config-name process_multimodal.yaml data.sources=['Scan3R'] hydra.run.dir=. hydra.output_subdir=null \ No newline at end of file diff --git a/scripts/preprocess/process_scannet.sh b/scripts/preprocess/process_scannet.sh index 68a2366..47aa945 100644 --- a/scripts/preprocess/process_scannet.sh +++ b/scripts/preprocess/process_scannet.sh @@ -1,9 +1,8 @@ export PYTHONWARNINGS="ignore" # Preprocessing Object Level + Scene Level + Unified Data -python preprocessor.py --config-path /home/sayan/Documents/code/multimodal-reality/CrossOver/configs/preprocess/ --config-name process_3d.yaml data.sources=['Scannet'] hydra.run.dir=. hydra.output_subdir=null -python preprocessor.py --config-path /home/sayan/Documents/code/multimodal-reality/CrossOver/configs/preprocess/ --config-name process_2d.yaml data.sources=['Scannet'] hydra.run.dir=. hydra.output_subdir=null -python preprocessor.py --config-path /home/sayan/Documents/code/multimodal-reality/CrossOver/configs/preprocess/ --config-name process_1d.yaml data.sources=['Scannet'] hydra.run.dir=. 
hydra.output_subdir=null
-
+python preprocessor.py --config-path "$(pwd)/configs/preprocess" --config-name process_3d.yaml data.sources=['Scannet'] hydra.run.dir=. hydra.output_subdir=null
+python preprocessor.py --config-path "$(pwd)/configs/preprocess" --config-name process_2d.yaml data.sources=['Scannet'] hydra.run.dir=. hydra.output_subdir=null
+python preprocessor.py --config-path "$(pwd)/configs/preprocess" --config-name process_1d.yaml data.sources=['Scannet'] hydra.run.dir=. hydra.output_subdir=null
 # Multi-modal dumping
-python preprocessor.py --config-path /home/sayan/Documents/code/multimodal-reality/CrossOver/configs/preprocess/ --config-name process_multimodal.yaml data.sources=['Scannet'] hydra.run.dir=. hydra.output_subdir=null
\ No newline at end of file
+python preprocessor.py --config-path "$(pwd)/configs/preprocess" --config-name process_multimodal.yaml data.sources=['Scannet'] hydra.run.dir=. hydra.output_subdir=null
\ No newline at end of file
diff --git a/scripts/preprocess/process_structured3d.sh b/scripts/preprocess/process_structured3d.sh
new file mode 100644
index 0000000..08c0605
--- /dev/null
+++ b/scripts/preprocess/process_structured3d.sh
@@ -0,0 +1,9 @@
+export PYTHONWARNINGS="ignore"
+
+# Preprocessing Object Level + Scene Level + Unified Data
+python preprocessor.py --config-path "$(pwd)/configs/preprocess" --config-name process_3d.yaml data.sources=['Structured3D'] hydra.run.dir=. hydra.output_subdir=null
+python preprocessor.py --config-path "$(pwd)/configs/preprocess" --config-name process_1d.yaml data.sources=['Structured3D'] hydra.run.dir=. hydra.output_subdir=null
+python preprocessor.py --config-path "$(pwd)/configs/preprocess" --config-name process_2d.yaml data.sources=['Structured3D'] hydra.run.dir=. hydra.output_subdir=null
+
+# Multi-modal dumping
+python preprocessor.py --config-path "$(pwd)/configs/preprocess" --config-name process_multimodal.yaml data.sources=['Structured3D'] hydra.run.dir=. hydra.output_subdir=null
diff --git a/single_inference/datasets/__init__.py b/single_inference/datasets/__init__.py
index 9a1b744..9c7b829 100644
--- a/single_inference/datasets/__init__.py
+++ b/single_inference/datasets/__init__.py
@@ -1,2 +1,4 @@
 from .scannet import *
-from .scan3r import *
\ No newline at end of file
+from .scan3r import *
+from .arkit import *
+from .multiscan import *
diff --git a/single_inference/datasets/arkit.py b/single_inference/datasets/arkit.py
new file mode 100644
index 0000000..6434bde
--- /dev/null
+++ b/single_inference/datasets/arkit.py
@@ -0,0 +1,126 @@
+import os.path as osp
+import numpy as np
+from torch.utils.data import Dataset
+import MinkowskiEngine as ME
+from PIL import Image
+from scipy.spatial.transform import Rotation as R
+from torchvision import transforms as tvf
+import torch
+import open3d as o3d
+import pandas as pd
+from common import load_utils
+from util import arkit
+from util import image as image_util
+
+class ARKitScenesInferDataset(Dataset):
+    def __init__(self, data_dir, voxel_size=0.02, frame_skip=5, image_size=[224, 224]) -> None:
+        self.voxel_size = voxel_size
+        self.frame_skip = frame_skip
+        self.image_size = image_size
+
+        self.scans_dir = osp.join(data_dir, 'scans')
+        self.files_dir = osp.join(data_dir, 'files')
+        self.referrals = load_utils.load_json(osp.join(self.files_dir, 'sceneverse/ssg_ref_rel2_template.json'))
+
+        self.scan_ids = []
+        for split in ['train', 'val']:
+            filepath = osp.join(self.files_dir, '{}_scans.txt'.format(split))
+            self.scan_ids.extend(np.genfromtxt(filepath, dtype = str))
+
+        self.base_tf = tvf.Compose([
+            tvf.ToTensor(),
+            tvf.Normalize(mean=[0.485, 0.456, 0.406],
+                          std=[0.229, 0.224, 0.225])
+        ])
+        self.metadata = pd.read_csv(osp.join(self.files_dir, 'metadata.csv'))
+
+    def extract_images(self, scan_id, color_path):
+        # load poses from the per-scan folder so the .traj file and the lowres_wide images resolve to the same directory
+        pose_data = arkit.load_poses(osp.join(self.scans_dir, scan_id), scan_id, skip=self.frame_skip)
+        frame_idxs = list(pose_data.keys())
+
+        pose_data_arr = []
+        for frame_idx in frame_idxs:
+            pose = pose_data[frame_idx]
+            rot_quat = R.from_matrix(pose[:3, :3]).as_quat()
+            trans = pose[:3, 3]
+            pose_data_arr.append([trans[0], trans[1], trans[2], rot_quat[0], rot_quat[1], rot_quat[2], rot_quat[3]])
+
+        pose_data_arr = np.array(pose_data_arr)
+        sampled_frame_idxs = image_util.sample_camera_pos_on_grid(pose_data_arr)
+        sky_direction = self.metadata[self.metadata['video_id'] == int(scan_id)]['sky_direction'].values[0]
+
+        image_data = None
+        for idx in sampled_frame_idxs:
+            frame_index = frame_idxs[idx]
+            image = Image.open(osp.join(color_path, f'{scan_id}_{frame_index}.png'))
+            if sky_direction == 'Left':
+                image = image.transpose(Image.ROTATE_270)
+            elif sky_direction == 'Right':
+                image = image.transpose(Image.ROTATE_90)
+            image = image.resize((self.image_size[1], self.image_size[0]), Image.BICUBIC)
+            image_pt = self.base_tf(image).unsqueeze(0)
+            image_data = image_pt if image_data is None else torch.cat((image_data, image_pt), dim=0)
+
+        return image_data.unsqueeze(0)
+
+    def __getitem__(self, index):
+        if isinstance(index, int):
+            scan_id = self.scan_ids[index]
+
+        if isinstance(index, str):
+            scan_id = index
+
+        scan_folder = osp.join(self.scans_dir, scan_id)
+        data_dict = {}
+        data_dict['masks'] = {}
+
+        # Point Cloud
+        mesh = o3d.io.read_triangle_mesh(osp.join(scan_folder, '{}_3dod_mesh.ply'.format(scan_id)))
+        points = np.asarray(mesh.vertices)
+        feats = np.asarray(mesh.vertex_colors)*255.0
+        feats = feats.round()
+
+        feats /= 255.
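+        # vertex colors from open3d are in [0, 1]; quantize to 8-bit and center to roughly [-0.5, 0.5] before voxelization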
+ feats -= 0.5 + + _, sel = ME.utils.sparse_quantize(points / self.voxel_size, return_index=True) + coords, feats = points[sel], feats[sel] + coords = np.floor(coords / self.voxel_size) + coords-= coords.min(0) + + coords, feats = ME.utils.sparse_collate([coords], [feats]) + data_dict['masks']['point'] = True + + # RGB + color_path = osp.join(scan_folder, f'{scan_id}_frames','lowres_wide') + image_data = self.extract_images(scan_id, color_path) + data_dict['masks']['rgb'] = True + + # Floorplan (dummy) + floorplan_img = np.zeros((self.image_size[0], self.image_size[1], 3), dtype=np.uint8) + floorplan_img = Image.fromarray(floorplan_img) + data_dict['masks']['floorplan'] = False + + floorplan_img = floorplan_img.resize((self.image_size[1], self.image_size[0]), Image.BICUBIC) + floorplan_data = self.base_tf(floorplan_img).unsqueeze(0) + + # Referral + referrals = [referral for referral in self.referrals if referral['scan_id'] == scan_id] + if len(referrals) != 0: + if len(referrals) > 10: + referrals = np.random.choice(referrals, size=10, replace=False) + referrals = [referral['utterance'] for referral in referrals] + referrals = [' '.join(referrals)] + data_dict['masks']['referral'] = True + else: + referrals = [''] + data_dict['masks']['referral'] = False + + data_dict['coordinates'] = coords + data_dict['features'] = feats + data_dict['rgb'] = image_data + data_dict['floorplan'] = floorplan_data + data_dict['referral'] = referrals + + return data_dict \ No newline at end of file diff --git a/single_inference/datasets/multiscan.py b/single_inference/datasets/multiscan.py new file mode 100644 index 0000000..06538e6 --- /dev/null +++ b/single_inference/datasets/multiscan.py @@ -0,0 +1,120 @@ +import os.path as osp +import numpy as np +from torch.utils.data import Dataset +import MinkowskiEngine as ME +from PIL import Image +from scipy.spatial.transform import Rotation as R +from torchvision import transforms as tvf +import torch +import open3d as o3d + +from common import load_utils +from util import multiscan +from util import image as image_util + +class MultiScanInferDataset(Dataset): + def __init__(self, data_dir, voxel_size=0.02, frame_skip=1, image_size=[224, 224]) -> None: + self.voxel_size = voxel_size + self.frame_skip = frame_skip + self.image_size = image_size + + self.scans_dir = osp.join(data_dir, 'scenes') + self.files_dir = osp.join(data_dir, 'files') + self.referrals = load_utils.load_json(osp.join(self.files_dir, 'sceneverse/ssg_ref_rel2_template.json')) + + self.scan_ids = [] + for split in ['train', 'val']: + filepath = osp.join(self.files_dir, '{}_scans.txt'.format(split)) + self.scan_ids.extend(np.genfromtxt(filepath, dtype = str)) + + self.base_tf = tvf.Compose([ + tvf.ToTensor(), + tvf.Normalize(mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]) + ]) + + def extract_images(self, scan_id, color_path): + frame_idxs = multiscan.load_frame_idxs(osp.join(self.scans_dir, scan_id)) + pose_data = multiscan.load_all_poses(osp.join(self.scans_dir, scan_id), frame_idxs) + frame_idxs = list(pose_data.keys()) + + pose_data_arr = [] + for frame_idx in frame_idxs: + pose = pose_data[frame_idx] + rot_quat = R.from_matrix(pose[:3, :3]).as_quat() + trans = pose[:3, 3] + pose_data_arr.append([trans[0], trans[1], trans[2], rot_quat[0], rot_quat[1], rot_quat[2], rot_quat[3]]) + + pose_data_arr = np.array(pose_data_arr) + sampled_frame_idxs = image_util.sample_camera_pos_on_grid(pose_data_arr) + + image_data = None + for idx in sampled_frame_idxs: + frame_index = frame_idxs[idx] + 
image = Image.open(osp.join(color_path, f'frame-{frame_index}.color.jpg')) + image = image.resize((self.image_size[1], self.image_size[0]), Image.BICUBIC) + image_pt = self.base_tf(image).unsqueeze(0) + image_data = image_pt if image_data is None else torch.cat((image_data, image_pt), dim=0) + + return image_data.unsqueeze(0) + + def __getitem__(self, index): + if isinstance(index, int): + scan_id = self.scan_ids[index] + + if isinstance(index, str): + scan_id = index + + scan_folder = osp.join(self.scans_dir, scan_id) + data_dict = {} + data_dict['masks'] = {} + + # Point Cloud + mesh = o3d.io.read_triangle_mesh(osp.join(scan_folder, f'{scan_id}.ply')) + points = np.asarray(mesh.vertices) + feats = np.asarray(mesh.vertex_colors)*255.0 + feats = feats.round() + + feats /= 255. + feats -= 0.5 + + _, sel = ME.utils.sparse_quantize(points / self.voxel_size, return_index=True) + coords, feats = points[sel], feats[sel] + coords = np.floor(coords / self.voxel_size) + coords-= coords.min(0) + + coords, feats = ME.utils.sparse_collate([coords], [feats]) + data_dict['masks']['point'] = True + + # RGB + color_path = osp.join(scan_folder, 'sequence') + image_data = self.extract_images(scan_id, color_path) + data_dict['masks']['rgb'] = True + + # Floorplan (dummy) + floorplan_img = np.zeros((self.image_size[0], self.image_size[1], 3), dtype=np.uint8) + floorplan_img = Image.fromarray(floorplan_img) + data_dict['masks']['floorplan'] = False + + floorplan_img = floorplan_img.resize((self.image_size[1], self.image_size[0]), Image.BICUBIC) + floorplan_data = self.base_tf(floorplan_img).unsqueeze(0) + + # Referral + referrals = [referral for referral in self.referrals if referral['scan_id'] == scan_id] + if len(referrals) != 0: + if len(referrals) > 10: + referrals = np.random.choice(referrals, size=10, replace=False) + referrals = [referral['utterance'] for referral in referrals] + referrals = [' '.join(referrals)] + data_dict['masks']['referral'] = True + else: + referrals = [''] + data_dict['masks']['referral'] = False + + data_dict['coordinates'] = coords + data_dict['features'] = feats + data_dict['rgb'] = image_data + data_dict['floorplan'] = floorplan_data + data_dict['referral'] = referrals + + return data_dict \ No newline at end of file diff --git a/single_inference/scene_inference.py b/single_inference/scene_inference.py index 9846dd5..1d13b5e 100644 --- a/single_inference/scene_inference.py +++ b/single_inference/scene_inference.py @@ -26,6 +26,10 @@ def run_inference(args, scan_id=None): dataset = datasets.ScannetInferDataset(args.data_dir, args.floorplan_dir) elif args.dataset == 'Scan3R': dataset = datasets.Scan3RInferDataset(args.data_dir) + elif args.dataset == 'ARKitScenes': + dataset = datasets.ARKitScenesInferDataset(args.data_dir) + elif args.dataset == 'MultiScan': + dataset = datasets.MultiScanInferDataset(args.data_dir) else: raise NotImplementedError('Dataset not implemented') diff --git a/trainer/grounding_trainer.py b/trainer/grounding_trainer.py index e0a40b2..7ee201c 100644 --- a/trainer/grounding_trainer.py +++ b/trainer/grounding_trainer.py @@ -1,5 +1,7 @@ +import os.path as osp from tqdm import tqdm from omegaconf import DictConfig +from safetensors.torch import load_file import torch from trainer.build import TRAINER_REGISTRY diff --git a/util/arkit.py b/util/arkit.py new file mode 100644 index 0000000..c4e7593 --- /dev/null +++ b/util/arkit.py @@ -0,0 +1,347 @@ +import os.path as osp +import numpy as np +from plyfile import PlyData +from glob import glob +import csv 
+import jsonlines +import json +import os +import trimesh +import pandas as pd +import cv2 + +ARKITSCENE_SCANNET= { +'bed': 'bed', +'cabinet': 'cabinet', +'refrigerator': 'refrigerator', +'table': 'table', +'chair': 'chair', +'sink': 'sink', +'stove': 'stove', +'oven': 'oven', +'washer': 'washing machine', +'shelf': 'shelf', +'tv_monitor': 'tv', +'bathtub': 'bathtub', +'toilet': 'toilet', +'sofa': 'sofa', +'stool': 'stool', +'fireplace': 'fireplace', +'build_in_cabinet': 'cabinet', +'dishwasher': 'dishwasher', +'stairs': 'stairs' +} + +def get_scan_ids(dirname, split): + filepath = osp.join(dirname, '{}_scans.txt'.format(split)) + scan_ids = np.genfromtxt(filepath, dtype = str) + return scan_ids + +def load_frame_idxs(scan_dir, skip=None): + frames_paths = glob(osp.join(scan_dir, f"{scan_dir.split('/')[-1]}_frames", 'lowres_wide', '*.png')) + frame_names = [osp.basename(frame_path) for frame_path in frames_paths] + frame_idxs = [frame_name.split('.png')[0].split("_")[1] for frame_name in frame_names] + frame_idxs.sort() + + if skip is not None: + frame_idxs = frame_idxs[::skip] + + return frame_idxs + +def TrajStringToMatrix(traj_str): + """ convert traj_str into translation and rotation matrices + Args: + traj_str: A space-delimited file where each line represents a camera position at a particular timestamp. + The file has seven columns: + * Column 1: timestamp + * Columns 2-4: rotation (axis-angle representation in radians) + * Columns 5-7: translation (usually in meters) + + Returns: + ts: translation matrix + Rt: rotation matrix + """ + # line=[float(x) for x in traj_str.split()] + # ts = line[0]; + # R = cv2.Rodrigues(np.array(line[1:4]))[0]; + # t = np.array(line[4:7]); + # Rt = np.concatenate((np.concatenate((R, t[:,np.newaxis]), axis=1), [[0.0,0.0,0.0,1.0]]), axis=0) + tokens = traj_str.split() + assert len(tokens) == 7 + ts = tokens[0] + # Rotation in angle axis + angle_axis = [float(tokens[1]), float(tokens[2]), float(tokens[3])] + r_w_to_p = convert_angle_axis_to_matrix3(np.asarray(angle_axis)) + # Translation + t_w_to_p = np.asarray([float(tokens[4]), float(tokens[5]), float(tokens[6])]) + extrinsics = np.eye(4, 4) + extrinsics[:3, :3] = r_w_to_p + extrinsics[:3, -1] = t_w_to_p + Rt = np.linalg.inv(extrinsics) + return Rt + +def convert_angle_axis_to_matrix3(angle_axis): + """Return a Matrix3 for the angle axis. + Arguments: + angle_axis {Point3} -- a rotation in angle axis form. 
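+    Returns:
+        matrix {np.ndarray} -- the corresponding 3x3 rotation matrix (computed with cv2.Rodrigues).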
+ """ + matrix, jacobian = cv2.Rodrigues(angle_axis) + return matrix + +def load_poses(scan_dir, scan_id, skip=None): + frame_poses = {} + frame_idxs = load_frame_idxs(scan_dir, skip=skip) + traj_file = osp.join(scan_dir, f'{scan_id}_frames', 'lowres_wide.traj') + with open(traj_file) as f: + traj = f.readlines() + for i,line in enumerate(traj): + ts=line.split(" ")[0] + rounded_ts = round(float(ts), 3) + formatted_ts = f"{rounded_ts:.3f}" + if formatted_ts not in frame_idxs: + if f"{rounded_ts - 0.001:.3f}" in frame_idxs: + frame_poses[f"{rounded_ts - 0.001:.3f}"] = TrajStringToMatrix(line) + elif f"{rounded_ts + 0.001:.3f}" in frame_idxs: + frame_poses[f"{rounded_ts + 0.001:.3f}"] = TrajStringToMatrix(line) + else: + print("no matching pose for frame", formatted_ts) + continue + # if f"{round(float(ts), 3):.3f}" not in frame_idxs: + # if f"{round(float(ts), 3)-0.001 :.3f}" in frame_idxs: + # frame_poses[f"{round(float(ts), 3)-0.001:.3f}"] = TrajStringToMatrix(line) + # elif f"{round(float(ts), 3)+0.001 :.3f}" in frame_idxs: + # frame_poses[f"{round(float(ts), 3)+0.001:.3f}"] = TrajStringToMatrix(line) + # else: + # continue + else: + frame_poses[f"{round(float(ts), 3):.3f}"] = TrajStringToMatrix(line) + # data = pd.read_csv(osp.join(scan_dir,f'{scan_id}_frames','lowres_wide.traj'), delim_whitespace=True, header=None) + # for frame_idx,(index, row) in zip(frame_idxs,data.iterrows()): + # if skip is not None and index % skip != 0: + # continue + # rotation_axis = row[1:4].values + # rotation_angle = np.linalg.norm(rotation_axis) + # if rotation_angle != 0: + # rotation_axis = rotation_axis / rotation_angle + # translation = row[4:7].values + # # Convert axis-angle to rotation matrix + # # rotation_matrix = axis_angle_to_rotation_matrix(rotation_axis, rotation_angle) + # rotation_matrix= + # # Construct the 4x4 homogeneous transformation matrix + # homogenous_matrix = np.eye(4) + # homogenous_matrix[:3, :3] = rotation_matrix + # homogenous_matrix[:3, 3] = translation + # frame_poses[frame_idx] = homogenous_matrix + + return frame_poses + +def axis_angle_to_rotation_matrix(axis, angle): + # Normalize the rotation axis + axis = axis / np.linalg.norm(axis) + x, y, z = axis + c = np.cos(angle) + s = np.sin(angle) + t = 1 - c + + # Compute the rotation matrix using the axis-angle formula + rotation_matrix = np.array([ + [t*x*x + c, t*x*y - s*z, t*x*z + s*y], + [t*x*y + s*z, t*y*y + c, t*y*z - s*x], + [t*x*z - s*y, t*y*z + s*x, t*z*z + c] + ]) + + return rotation_matrix + +def load_intrinsics(data_dir, scan_id, frame_id): + ''' + Load ARKit intrinsic information + ''' + pincam_path = osp.join(data_dir, scan_id, f'{scan_id}_frames', 'lowres_wide_intrinsics', f'{scan_id}_{frame_id}.pincam') + if not os.path.exists(pincam_path): + pincam_path = osp.join(data_dir, scan_id, f'{scan_id}_frames', 'lowres_wide_intrinsics', f'{scan_id}_{float(frame_id)-0.001:.3f}.pincam') + if not os.path.exists(pincam_path): + pincam_path = osp.join(data_dir, scan_id, f'{scan_id}_frames', 'lowres_wide_intrinsics', f'{scan_id}_{float(frame_id)+0.001:.3f}.pincam') + + + intrinsics = {} + + # Read the .pincam file + with open(pincam_path, "r") as f: + line = f.readline().strip() + + # Parse the intrinsic parameters + width, height, focal_length_x, focal_length_y, principal_point_x, principal_point_y = map(float, line.split()) + + # Store the width and height + intrinsics['width'] = width + intrinsics['height'] = height + + # Construct the intrinsic matrix + intrinsic_mat = np.array([ + [focal_length_x, 0, 
principal_point_x], + [0, focal_length_y, principal_point_y], + [0, 0, 1] + ]) + intrinsics['intrinsic_mat'] = intrinsic_mat + + return intrinsics + +def read_label_map(metadata_dir, label_from='raw_category', label_to='nyu40id'): + LABEL_MAP_FILE = osp.join(metadata_dir, 'scannetv2-labels.combined.tsv') + assert osp.exists(LABEL_MAP_FILE) + + raw_label_map = read_label_mapping(LABEL_MAP_FILE, label_from=label_from, label_to=label_to) + return raw_label_map + +def read_label_mapping(filename, label_from='raw_category', label_to='nyu40id'): + assert osp.isfile(filename) + mapping = dict() + with open(filename) as csvfile: + reader = csv.DictReader(csvfile, delimiter='\t') + for row in reader: + mapping[row[label_from]] = row[label_to] + + if represents_int(list(mapping.keys())[0]): + mapping = {int(k):v for k,v in mapping.items()} + + return mapping + +def represents_int(s): + ''' if string s represents an int. ''' + try: + int(s) + return True + except ValueError: + return False + +def load_ply_data(data_dir, scan_id, annotations): + filename_in = osp.join(data_dir, scan_id, f'{scan_id}_3dod_mesh.ply') + file = open(filename_in, 'rb') + plydata = PlyData.read(file) + file.close() + vertices = plydata['vertex']['x'], plydata['vertex']['y'], plydata['vertex']['z'] + vertices = np.vstack(vertices).T + + vertex_colors = plydata['vertex']['red'], plydata['vertex']['green'], plydata['vertex']['blue'] + vertex_colors = np.vstack(vertex_colors).T + + vertex_dtype = [('x', 'f4'), ('y', 'f4'), ('z', 'f4'), + ('red', 'u1'), ('green', 'u1'), ('blue', 'u1'), + ('objectId', 'h')] + vertices_structured = np.empty(vertices.shape[0], dtype=vertex_dtype) + + # Assign x, y, z, and color values to the structured array + vertices_structured['red'] = vertex_colors[:, 0] + vertices_structured['green'] = vertex_colors[:, 1] + vertices_structured['blue'] = vertex_colors[:, 2] + + vertex_instance = np.zeros(vertices.shape[0], dtype='h') # Use 'h' for signed 16-bit integer + bbox_list=[] + for _i, label_info in enumerate(annotations["data"]): + object_id = _i + 1 + rotation = np.array(label_info["segments"]["obbAligned"]["normalizedAxes"]).reshape(3, 3) + + transform = np.array(label_info["segments"]["obbAligned"]["centroid"]).reshape(-1, 3) + scale = np.array(label_info["segments"]["obbAligned"]["axesLengths"]).reshape(-1, 3) + + trns = np.eye(4) + trns[0:3, 3] = transform + trns[0:3, 0:3] = rotation.T + + box_trimesh_fmt = trimesh.creation.box(scale.reshape(3,), trns) + obj_containment = np.argwhere(box_trimesh_fmt.contains(vertices)) + + vertex_instance[obj_containment] = object_id + box3d = compute_box_3d(scale.reshape(3).tolist(), transform, rotation) + bbox_list.append(box3d) + + # if len(bbox_list) == 0: + # return + + vertices_structured['objectId'] = vertex_instance + + # align_angle = calc_align_matrix(bbox_list) + + # vertices_aligned = rotate_z_axis_by_degrees(np.array(vertices), align_angle) + + if np.max(vertex_colors) <= 1: + vertex_colors = vertex_colors * 255.0 + + # center_points = np.mean(vertices_aligned, axis=0) + # center_points[2] = np.min(vertices_aligned[:, 2]) + # vertices_aligned = vertices_aligned - center_points + + # vertices_structured['x'] = vertices_aligned[:, 0] + # vertices_structured['y'] = vertices_aligned[:, 1] + # vertices_structured['z'] = vertices_aligned[:, 2] + + vertices_structured['x'] = plydata['vertex']['x'] + vertices_structured['y'] = plydata['vertex']['y'] + vertices_structured['z'] = plydata['vertex']['z'] + + return vertices_structured + +def compute_box_3d(size, 
center, rotmat): + """Compute corners of a single box from rotation matrix + Args: + size: list of float [dx, dy, dz] + center: np.array [x, y, z] + rotmat: np.array (3, 3) + Returns: + corners: (8, 3) + """ + l, h, w = [i / 2 for i in size] + center = np.reshape(center, (-1, 3)) + center = center.reshape(3) + x_corners = [l, l, -l, -l, l, l, -l, -l] + y_corners = [h, -h, -h, h, h, -h, -h, h] + z_corners = [w, w, w, w, -w, -w, -w, -w] + corners_3d = np.dot( + np.transpose(rotmat), np.vstack([x_corners, y_corners, z_corners]) + ) + corners_3d[0, :] += center[0] + corners_3d[1, :] += center[1] + corners_3d[2, :] += center[2] + return np.transpose(corners_3d) + +def rotate_z_axis_by_degrees(pointcloud, theta, clockwise=True): + theta = np.deg2rad(theta) + cos_t = np.cos(theta) + sin_t = np.sin(theta) + rot_matrix = np.array([[cos_t, -sin_t, 0], + [sin_t, cos_t, 0], + [0, 0, 1]], pointcloud.dtype) + if not clockwise: + rot_matrix = rot_matrix.T + return pointcloud.dot(rot_matrix) + +def calc_align_matrix(bbox_list): + RANGE = [-45, 45] + NUM_BIN = 90 + angles = np.linspace(RANGE[0], RANGE[1], NUM_BIN) + angle_counts = {} + for _a in angles: + bucket = round(_a, 3) + for box in bbox_list: + box_r = rotate_z_axis_by_degrees(box, bucket) + bottom = box_r[4:] + if is_axis_aligned(bottom): + angle_counts[bucket] = angle_counts.get(bucket, 0) + 1 + if len(angle_counts) == 0: + RANGE = [-90, 90] + NUM_BIN = 180 + angles = np.linspace(RANGE[0], RANGE[1], NUM_BIN) + for _a in angles: + bucket = round(_a, 3) + for box in bbox_list: + box_r = rotate_z_axis_by_degrees(box, bucket) + bottom = box_r[4:] + if is_axis_aligned(bottom, thres=0.15): + angle_counts[bucket] = angle_counts.get(bucket, 0) + 1 + most_common_angle = max(angle_counts, key=angle_counts.get) + return most_common_angle + +def is_axis_aligned(rotated_box, thres=0.05): + x_diff = abs(rotated_box[0][0] - rotated_box[1][0]) + y_diff = abs(rotated_box[0][1] - rotated_box[3][1]) + return x_diff < thres and y_diff < thres diff --git a/util/multiscan.py b/util/multiscan.py new file mode 100644 index 0000000..8478a7d --- /dev/null +++ b/util/multiscan.py @@ -0,0 +1,672 @@ +import os.path as osp +import numpy as np +from plyfile import PlyData +from glob import glob +import csv +import jsonlines +import json +import os + +MULTISCAN_SCANNET = { + "wall": "wall", + "door": "door", + "slippers": "shoe", + "mop": "broom", + "rug": "rug", + "floor": "floor", + "basin": "sink", + "basin_stand": "sink", + "bucket": "bucket", + "shower": "shower", + "water_tank": "container", + "beam": "wood beam", + "pillar": "pillar", + "ceiling": "ceiling", + "sink": "sink", + "toilet": "toilet", + "cabinet": "cabinet", + "remove": "object", + "towel": "towel", + "pillow": "pillow", + "sofa": "sofa", + "footstool": "footstool", + "picture": "picture", + "window": "window", + "heater": "heater", + "mirror": "mirror", + "pipe": "pipe", + "scarf": "cloth", + "ceiling_light": "ceiling light", + "chair": "chair", + "table": "table", + "vent": "vent", + "bag": "bag", + "wall_cabinet": "cabinet", + "range": "stove", + "ricemaker": "rice cooker", + "pan": "cooking pan", + "coffee_machine": "coffee maker", + "rice_bag": "bag", + "light": "light", + "trashbin": "trash bin", + "kettle": "kettle", + "refrigerator": "refrigerator", + "microwave": "microwave", + "light_switch": "light switch", + "rice_cooker": "rice cooker", + "box": "box", + "shoe": "shoe", + "range_hood": "range hood", + "wok": "cooking pan", + "router": "object", + "paper_towel": "paper towel roll", + "stock_pot": 
"pot", + "cutting_board": "cutting board", + "wall_calendar": "calendar", + "baseboard": "object", + "coke_box": "box", + "printer": "printer", + "bowl": "bowl", + "backpack": "backpack", + "baseboard_heater": "heater", + "broom": "broom", + "dust_pan": "dustpan", + "trash_bin": "trash bin", + "rigid_duct": "vent", + "electric_range": "stove", + "spatula": "object", + "faucet": "faucet", + "bottle": "bottle", + "countertop": "counter", + "railing": "railing", + "suitcase": "suitcase", + "trash": "trash can", + "pot": "pot", + "kitchen_tool": "object", + "vegetable": "object", + "board": "board", + "washing_machine": "washing machine", + "jar": "jar", + "object": "object", + "notebook": "book", + "induction_cooker": "stove", + "instant_pot_lid": "cooking pot", + "oven": "oven", + "air_fryer": "object", + "lid": "pot", + "sponge": "sponge", + "blender": "object", + "spoon": "object", + "dishwasher": "dishwasher", + "detergent": "laundry detergent", + "watermelon": "bananas", + "yard_waste_bag": "garbage bag", + "container": "container", + "newspapers": "paper", + "rag": "cloth", + "ladder": "ladder", + "gate": "door", + "napkin_box": "tissue box", + "jacket": "jacket", + "windowsill": "windowsill", + "water_faucet": "faucet", + "steel_ball": "ball", + "rice_maker": "rice cooker", + "watter_bottle": "water bottle", + "plastic_bag": "bag", + "paper_bag": "paper bag", + "cuttting_board": "cutting board", + "trash_bin_lid": "trash bin", + "hair_dryer": "hair dryer", + "electric_socket": "power outlet", + "electric_panel": "electric panel", + "wash_stand": "sink", + "soap": "soap", + "curtain": "curtain", + "bathtub": "bathtub", + "smoke_detector": "smoke detector", + "roll_paper": "paper towel roll", + "chandelier": "chandelier", + "hand_sanitizer": "hand sanitzer dispenser", + "plate": "plate", + "sticker": "sticker", + "power_socket": "power outlet", + "stacked_cups": "stack of cups", + "stacked_chairs": "stack of chairs", + "air_vent": "vent", + "cornice": "cabinet", + "wine_cabinet": "kitchen cabinet", + "crock": "bowl", + "liquor_box": "cabinet", + "shampoo": "shampoo", + "shower_curtain": "shower curtain", + "wall_light": "wall lamp", + "sink_cabinet": "sink", + "toilet_roll": "toilet paper", + "shelf": "shelf", + "paper_bin": "recycling bin", + "toilet_brush": "toilet brush", + "shower_head": "shower head", + "tv": "tv", + "remote_control": "remote", + "tv_box": "tv stand", + "nightstand": "nightstand", + "bed": "bed", + "quilt": "blanket", + "telephone": "telephone", + "monitor": "monitor", + "desk": "desk", + "radiator_shell": "radiator", + "calendar": "calendar", + "clock": "clock", + "keyboard": "keyboard", + "speaker": "speaker", + "clothes": "clothes", + "door_frame": "doorframe", + "sliding_door": "sliding door", + "ceiling_lamp": "ceiling lamp", + "scale": "scale", + "power_strip": "power strip", + "switch": "light switch", + "basket": "basket", + "stool": "stool", + "shoes": "shoe", + "slipper": "slippers", + "bifold_door": "door", + "rangehood": "range hood", + "books": "books", + "toilet_paper": "toilet paper", + "mouse_pad": "mouse", + "ipad": "ipad", + "scissor": "knife block", + "radiator": "radiator", + "pc": "computer tower", + "bicycle": "bicycle", + "wardrobe": "wardrobe", + "mouse": "mouse", + "advertising_board": "poster", + "banner": "banner", + "ceiling_decoration": "ceiling light", + "whiteboard": "whiteboard", + "wall_storage_set": "shelf", + "traffic_cone": "traffic cone", + "wall_decoration": "decoration", + "papers": "papers", + "hat": "hat", + "velvet_hangers": 
"clothes hanger", + "circular_plate": "plate", + "cellphone": "telephone", + "pen": "keyboard piano", + "paper": "paper", + "lamp": "lamp", + "curtain_box": "curtains", + "woodcarving": "wood", + "scissors": "knife block", + "hand_dryer": "hand dryer", + "machine": "machine", + "vase": "vase", + "plant": "plant", + "power_socket_case": "power outlet", + "gloves": "clothes", + "dishcloth": "cloth", + "painting": "painting", + "shower_wall": "shower wall", + "showerhead": "shower head", + "tooth_mug": "cup", + "map": "map", + "knot_artwork": "decoration", + "fan": "fan", + "sphygmomanometer": "scale", + "electric_kettle": "kettle", + "bread_maker": "oven", + "knife_set": "knife block", + "soup_pot": "cooking pot", + "flatware_set": "cutting board", + "candle": "candle", + "lid_rack": "dish rack", + "flower": "flowerpot", + "can": "can", + "scoop": "bowl", + "laptop": "laptop", + "glass": "glass doors", + "wet_floor_sign": "wet floor sign", + "shower_enclosure": "shower doors", + "jewelry_box": "jewelry box", + "bath_brush": "hair brush", + "sofa_cushion": "couch cushions", + "tv_cabinet": "tv stand", + "wood_fence": "wood beam", + "floor_lamp": "lamp", + "computer_case": "computer tower", + "waste_container": "trash bin", + "roadblock": "barricade", + "trash_can_lids": "trash can", + "hand_sanitizer_stand": "soap dispenser", + "air_conditioner": "conditioner bottle", + "pattern": "rug", + "remote_controller": "remote", + "phone": "telephone", + "speakers": "speaker", + "table_divider": "divider", + "table_card": "card", + "paper_trimmer": "paper cutter", + "stapler": "stapler", + "cup": "cup", + "bathroom_heater": "heater", + "wall_shelf": "shelf", + "towel_rack": "towel", + "sink_drain": "sink", + "floor_drain": "floor", + "broom_head": "broom", + "door_curtain": "curtain", + "refill_pouch": "plastic container", + "bin": "bin", + "stall_wall": "bathroom stall door", + "wall_speaker": "speaker", + "laundry_basket": "laundry basket", + "tissue_box": "tissue box", + "document_holder": "file cabinet", + "yoga_mat": "yoga mat", + "gas_range": "stove", + "chopping_board": "cutting board", + "book_scanner": "scanner", + "payment_terminal": "vending machine", + "napkin_roll": "paper towel roll", + "faucet_switch": "faucet", + "glass_door": "glass doors", + "carpet": "carpet", + "shower_floor": "shower floor", + "toilet_plunger": "plunger", + "plug_panel": "power outlet", + "stand": "stand", + "potted_plant": "potted plant", + "poster": "poster", + "isolation_board": "divider", + "soap_holder": "soap dish", + "plug": "power outlet", + "brush": "hair brush", + "threshold": "doorframe", + "air_conditioner_controller": "remote", + "iron": "iron", + "ironing_board": "ironing board", + "safe": "suitcase", + "gas_cooker": "stove", + "pressure_cooker": "cooking pot", + "steamer_pot": "pot", + "soy_sauce_bottle": "bottle", + "dishwashing_liquid": "dishwashing soap bottle", + "water_ladle": "bowl", + "power_socket_set": "power strip", + "kitchen_tool_holder": "kitchen cabinet", + "case": "case", + "wall_paper": "wall", + "comb": "hair brush", + "paper_cutter": "paper cutter", + "pencil_sharpener": "pen holder", + "sealing_machine": "machine", + "poster_board": "poster", + "shredder": "shredder", + "footstep": "stair", + "planter": "plant", + "floor_light": "lamp", + "paper_cup": "cup", + "divider": "divider", + "hanger": "clothes hanger", + "glove": "clothing", + "blanket": "blanket", + "remote": "remote", + "cloth": "cloth", + "clutter": "object", + "extinguisher": "fire extinguisher", + "dryer": "clothes 
dryer", + "soap_bottle": "soap bottle", + "fabric_softener_box": "box", + "dryer_sheet_box": "box", + "detergent_bottle": "laundry detergent", + "toaster": "toaster", + "stacked_bowls": "bowl", + "pot_lid": "pot", + "electric_pressure_cooker": "rice cooker", + "bread": "food display", + "bagels": "object", + "oranges": "bananas", + "card_reader": "card", + "whiteboard_detergent": "soap dispenser", + "power_outlet": "power outlet", + "bouquet": "vase", + "water_bottle": "water bottle", + "wall_mounted_telephone": "telephone", + "fridge": "refrigerator", + "toy": "toy dinosaur", + "shoe_box": "box", + "hole_puncher": "paper cutter", + "landline_telephone": "telephone", + "base": "stand", + "handkerchief": "cloth", + "cornice_molding": "frame", + "bathtub_base": "bathtub", + "bidet": "toilet", + "pedestal_urinal": "urinal", + "pedestal_urinal_covered": "urinal", + "pit_toilet": "toilet", + "low_wall": "wall", + "rail": "rail", + "bottles": "bottles", + "floor_otherroom": "floor", + "wall_otherroom": "wall", + "canopy": "canopy", + "cable_manager": "cable", + "sneakers": "shoes", + "purse": "purse", + "cushion": "cushion", + "napkin": "towel", + "plush_toy": "stuffed animal", + "adjustable_desk": "desk", + "tableware": "plates", + "computer_desk": "desk", + "cat_kennel": "cat litter box", + "back_cushion": "pillow", + "ukulele_bag": "guitar case", + "litter_box": "trash can", + "storage_box": "storage bin", + "toy_doll": "doll", + "drawer_unit": "drawer", + "doll": "stuffed animal", + "laptop_bag": "messenger bag", + "clothing_rack": "clothing rack", + "bookshelf": "bookshelves", + "mask": "cloth", + "watch": "clock", + "book": "books", + "ashtray": "tray", + "car_key": "car", + "wallet": "purse", + "tea_pot": "tea kettle", + "wire": "cable", + "rake": "broom", + "dispenser": "soap dispenser", + "toilet_tank": "toilet", + "door_sill": "doorframe", + "cleanser": "soap", + "armrest": "armchair", + "short_wall": "wall", + "suspended_ceiling": "ceiling", + "fire_extinguisher_cabinet": "fire extinguisher", + "plastic_box": "plastic container", + "sanitation_station": "soap dispenser", + "plant_pot": "flowerpot", + "fireplace": "fireplace", + "computer_table": "desk", + "tissue_bag": "tissue box", + "wall_frame": "frame", + "map_board": "map", + "automated_teller_machine": "vending machine", + "ticket": "card", + "tablet": "ipad", + "blankets": "blanket", + "bags": "bag", + "flag": "flag", + "blackboard": "blackboard", + "bar_table": "bar", + "cardboard_holder": "cardboard", + "potted_planet": "potted plant", + "tray": "tray", + "utensil_holder": "kitchen counter", + "bird_ceramics": "statue", + "shirt": "shirt", + "clothes_rail": "clothes hanger", + "power_strips": "power strip", + "card_board": "board", + "pile_of_blankets": "blanket", + "bed_net": "bed", + "umbrella": "umbrella", + "dragon_fruit": "bananas", + "tissue": "tissue box", + "electrical_panel": "electric panel", + "panel": "door", + "tube": "tube", + "pile_of_cloth": "cloth", + "surface": "table", + "chair_cushion": "cushion", + "guide": "book", + "parapet": "railing", + "camera": "camera", + "light_base": "lamp base", + "first_aid": "object", + "bench": "bench", + "potted_plants": "potted plant", + "pot_cover": "pot", + "yoga_mat_roll": "yoga mat", + "panda_doll": "stuffed animal", + "window_trim": "window", + "shoe_cabinet": "shoe rack", + "toilet_paper_holder": "toilet paper dispenser", + "shower_faucet": "shower faucet handle", + "bath_sponge": "sponge", + "ornament": "decoration", + "planter_box": "plant", + "cooktop": "stove", + 
"knife_block": "knife block", + "step_stool": "step stool", + "touchpad": "keyboard", + "light_box": "light", + "sound": "speaker", + "exhaust_fan_vent": "vent", + "paperbin": "recycling bin", + "mop_bucket": "bucket", + "sneaker": "shoes", + "objects": "object", + "cd_tray": "cd case", + "wall_board": "board", + "room_divider": "divider", + "paiting": "painting", + "cabinet_otherroom": "cabinet", + "electric_switch": "light switch", + "sign": "exit sign", + "hand_soap": "soap bottle", + "window_blinds": "blinds" +} + +def read_label_map(metadata_dir, label_from='raw_category', label_to='nyu40id'): + LABEL_MAP_FILE = osp.join(metadata_dir, 'scannetv2-labels.combined.tsv') + assert osp.exists(LABEL_MAP_FILE) + + raw_label_map = read_label_mapping(LABEL_MAP_FILE, label_from=label_from, label_to=label_to) + return raw_label_map + +def read_label_mapping(filename, label_from='raw_category', label_to='nyu40id'): + assert osp.isfile(filename) + mapping = dict() + with open(filename) as csvfile: + reader = csv.DictReader(csvfile, delimiter='\t') + for row in reader: + mapping[row[label_from]] = row[label_to] + + if represents_int(list(mapping.keys())[0]): + mapping = {int(k):v for k,v in mapping.items()} + + return mapping + +def get_scan_ids(dirname, split): + filepath = osp.join(dirname, '{}_scans.txt'.format(split)) + scan_ids = np.genfromtxt(filepath, dtype = str) + return scan_ids + +def load_ply_data(data_dir, scan_id): + """ + Load PLY data and propagate object IDs from faces to vertices. + """ + filename_in = osp.join(data_dir, scan_id, '{}.ply'.format(scan_id)) + + if not osp.exists(filename_in): + raise FileNotFoundError(f"PLY file not found: {filename_in}") + + with open(filename_in, 'rb') as file: + ply_data = PlyData.read(file) + + # Extract vertex properties + x = np.array(ply_data['vertex']['x']) + y = np.array(ply_data['vertex']['y']) + z = np.array(ply_data['vertex']['z']) + red = np.array(ply_data['vertex']['red']) + green = np.array(ply_data['vertex']['green']) + blue = np.array(ply_data['vertex']['blue']) + + # Extract normals if available + if 'nx' in ply_data['vertex'] and 'ny' in ply_data['vertex'] and 'nz' in ply_data['vertex']: + nx = np.array(ply_data['vertex']['nx']) + ny = np.array(ply_data['vertex']['ny']) + nz = np.array(ply_data['vertex']['nz']) + normals = np.stack([nx, ny, nz], axis=-1) + else: + normals = None + + # Initialize object IDs for vertices with a default undefined value + vertex_object_ids = np.full(len(x), -1, dtype='int32') # Default: -1 (undefined) + + # Extract face data + faces = ply_data['face'].data + face_vertex_indices = [face['vertex_indices'] for face in faces] + face_object_ids = [face['objectId'] for face in faces] + + # Propagate object IDs to vertices + for face_indices, obj_id in zip(face_vertex_indices, face_object_ids): + vertex_object_ids[face_indices] = obj_id # Assign object ID to all vertices in the face + + vertex_dtype = [ + ('x', 'f4'), ('y', 'f4'), ('z', 'f4'), # Coordinates + ('red', 'u1'), ('green', 'u1'), ('blue', 'u1'), # Colors + ('objectId', 'i4') # Propagated Object ID + ] + + if normals is not None: + vertex_dtype.extend([('nx', 'f4'), ('ny', 'f4'), ('nz', 'f4')]) # Normals + + vertices = np.empty(len(x), dtype=vertex_dtype) + + vertices['x'] = x.astype('f4') + vertices['y'] = y.astype('f4') + vertices['z'] = z.astype('f4') + vertices['red'] = red.astype('u1') + vertices['green'] = green.astype('u1') + vertices['blue'] = blue.astype('u1') + vertices['objectId'] = vertex_object_ids.astype('i4') + + if normals is not 
None: + vertices['nx'] = normals[:, 0].astype('f4') + vertices['ny'] = normals[:, 1].astype('f4') + vertices['nz'] = normals[:, 2].astype('f4') + + return vertices + +def load_meta_intrinsics(scan_dir, scene_id, stream_type="color_camera"): + ''' + Load MultiScan intrinsic information + ''' + meta_intrinsics_path = osp.join(scan_dir, f'{scene_id}.json') + intrinsics = {} + + with open(meta_intrinsics_path,"r") as f: + json_data=json.load(f) + + for stream in json_data.get("streams", []): + if stream.get("type") == stream_type: + intrinsic_mat = np.array(stream.get("intrinsics")) + intrinsic_mat = np.reshape(intrinsic_mat, (3, 3), order='F') + intrinsics['intrinsic_mat']=intrinsic_mat + resolution = stream.get("resolution") + width, height = resolution[1], resolution[0] # [width, height] + intrinsics['width']=float(width) + intrinsics['height']=float(height) + + return intrinsics + +def load_intrinsics(scan_dir, scene_id, frame_id, stream_type="color_camera"): + ''' + Load MultiScan intrinsic information + ''' + intrinsics_path = osp.join(scan_dir, 'poses.jsonl') + resoultion_path = osp.join(scan_dir, f'{scene_id}.json') + intrinsics = {} + + with open(resoultion_path,"r") as f: + json_data=json.load(f) + + for stream in json_data.get("streams", []): + if stream.get("type") == stream_type: + resolution = stream.get("resolution", None) + if resolution: + width, height = resolution[1], resolution[0] # [width, height] + intrinsics['width']=float(width) + intrinsics['height']=float(height) + + + with jsonlines.open(intrinsics_path) as reader: + for entry in reader: + if entry.get("frame_id") == frame_id: + intrinsic_mat = np.asarray(entry.get('intrinsics')) + intrinsic_mat = np.reshape(intrinsic_mat, (3, 3), order='F') + intrinsics['intrinsic_mat']=intrinsic_mat + break + + return intrinsics + +def load_pose(scan_dir, frame_id): + # Find alignment file + alignment_path = None + for file_name in os.listdir(scan_dir): + if file_name.endswith('.align.json'): + alignment_path = osp.join(scan_dir, file_name) + break + + if alignment_path is None: + raise FileNotFoundError(f"No alignment file found in {scan_dir}") + + with open(alignment_path, "r") as f: + alignment_data = json.load(f) + if 'coordinate_transform' not in alignment_data: + raise ValueError(f"Alignment file {alignment_path} does not contain 'coordinate_transform'") + coordinate_transform = np.reshape(alignment_data['coordinate_transform'], (4, 4), order='F') + inv_transform = np.linalg.inv(coordinate_transform) + + pose_path = osp.join(scan_dir, 'poses.jsonl') + with jsonlines.open(pose_path) as reader: + for entry in reader: + if entry.get("frame_id") == frame_id: + transform = np.asarray(entry.get('transform')) + transform = np.reshape(transform, (4, 4), order='F') + transform = np.dot(transform, np.diag([1, -1, -1, 1])) + transform = transform / transform[3][3] + aligned_pose = inv_transform @ transform #align camera poses + return aligned_pose + + raise ValueError(f"Pose for frame_id {frame_id} not found in {pose_path}") + + +def load_all_poses(scan_dir, frame_idxs): + frame_poses = {} + for frame_idx in frame_idxs: + frame_pose = load_pose(scan_dir, int(frame_idx)) + frame_poses[frame_idx] = frame_pose + return frame_poses + +def load_frame_idxs(scan_dir, skip=None): + frames_paths = glob(osp.join(scan_dir, 'sequence', '*.jpg')) + frame_names = [osp.basename(frame_path) for frame_path in frames_paths] + frame_idxs = [frame_name.split('.')[0].split('-')[-1] for frame_name in frame_names] + frame_idxs.sort() + + if skip is None: + 
frame_idxs = frame_idxs + else: + frame_idxs = [frame_idx for frame_idx in frame_idxs[::skip]] + return frame_idxs + + +def represents_int(s): + ''' if string s represents an int. ''' + try: + int(s) + return True + except ValueError: + return False \ No newline at end of file diff --git a/util/structured3d.py b/util/structured3d.py new file mode 100644 index 0000000..6fc9d46 --- /dev/null +++ b/util/structured3d.py @@ -0,0 +1,171 @@ +import os.path as osp +import numpy as np +from plyfile import PlyData +from glob import glob +import cv2 + +S3D_SCANNET = { + 1: 'wall', + 2: 'floor', + 3: 'cabinet', + 4: 'bed', + 5: 'chair', + 6: 'sofa', + 7: 'table', + 8: 'door', + 9: 'window', + 10: 'bookshelf', + 11: 'picture', + 12: 'counter', + 13: 'blinds', + 14: 'desk', + 15: 'shelf', + 16: 'curtain', + 17: 'dresser', + 18: 'pillow', + 19: 'mirror', + 20: 'mat', + 21: 'clothes', + 22: 'ceiling', + 23: 'books', + 24: 'refrigerator', + 25: 'tv', + 26: 'paper', + 27: 'towel', + 28: 'shower curtain', + 29: 'box', + 30: 'whiteboard', + 31: 'person', + 32: 'nightstand', + 33: 'toilet', + 34: 'sink', + 35: 'lamp', + 36: 'bathtub', + 37: 'bag', + 38: 'otherstructure', + 39: 'otherfurniture', + 40: 'otherprop'} + +def get_scan_ids(dirname, split): + filepath = osp.join(dirname, '{}_scans.txt'.format(split)) + scan_ids = np.genfromtxt(filepath, dtype = str) + return scan_ids + +def load_ply_data(data_dir, scan_id, room_id): + + filename_in = osp.join(data_dir, scan_id, '3D_rendering', room_id, 'room_mesh.ply') + print(scan_id) + if not osp.exists(filename_in): + raise FileNotFoundError(f"PLY file not found: {filename_in}") + + with open(filename_in, 'rb') as file: + ply_data = PlyData.read(file) + + x = np.array(ply_data['vertex']['x']) + y = np.array(ply_data['vertex']['y']) + z = np.array(ply_data['vertex']['z']) + red = np.array(ply_data['vertex']['red']) + green = np.array(ply_data['vertex']['green']) + blue = np.array(ply_data['vertex']['blue']) + vertex_object_ids = np.array(ply_data['vertex']['object_id']) + vertex_nyu40ids = np.array(ply_data['vertex']['nyu40id']) + # vertex_targetids = np.array(ply_data['vertex']['target_id']) + + vertex_dtype = [ + ('x', 'f4'), ('y', 'f4'), ('z', 'f4'), + ('red', 'u1'), ('green', 'u1'), ('blue', 'u1'), ('alpha', 'u1'), + ('objectId', 'i4'), + ('nyu40id', 'i4'), + ('targetId', 'i4') + + ] + + scene_vertices = np.column_stack([x, y, z]) + center_points = np.mean(scene_vertices, axis=0) + center_points[2] = np.min(scene_vertices[:, 2]) + scene_vertices = scene_vertices - center_points + + vertices = np.empty(len(x), dtype=vertex_dtype) + + vertices['x'] = scene_vertices[:, 0].astype('f4') + vertices['y'] = scene_vertices[:, 1].astype('f4') + vertices['z'] = scene_vertices[:, 2].astype('f4') + + # vertices['x'] = x.astype('f4') + # vertices['y'] = y.astype('f4') + # vertices['z'] = z.astype('f4') + + vertices['red'] = red.astype('u1') + vertices['green'] = green.astype('u1') + vertices['blue'] = blue.astype('u1') + vertices['objectId'] = vertex_object_ids.astype('i4') + vertices['nyu40id'] = vertex_nyu40ids.astype('i4') + vertices['targetId'] = np.zeros_like(x).astype('i4') + # vertices['targetId'] = vertex_targetids.astype('i4') + return vertices + +def normalize(vector): + return vector / np.linalg.norm(vector) + + +def parse_camera_info(camera_info, height, width): + """ extract intrinsic and extrinsic matrix + """ + lookat = normalize(camera_info[3:6]) + up = normalize(camera_info[6:9]) + + W = lookat + U = np.cross(W, up) + V = np.cross(W, U) + + rot = 
np.vstack((U, V, W))
+
+    trans = camera_info[:3]
+
+    xfov = camera_info[9]
+    yfov = camera_info[10]
+
+    K = np.diag([1.0, 1.0, 1.0])  # float intrinsics so fx/fy are not truncated to integers below
+
+    K[0, 2] = width / 2
+    K[1, 2] = height / 2
+
+    K[0, 0] = K[0, 2] / np.tan(xfov)
+    K[1, 1] = K[1, 2] / np.tan(yfov)
+
+    return rot, trans, K
+
+def load_all_poses(scan_dir, frame_idxs):
+    frame_poses = {}
+    for frame_idx in frame_idxs:
+        frame_pose = load_pose(scan_dir, frame_idx)
+        frame_poses[frame_idx] = frame_pose
+    return frame_poses
+
+def load_pose(scan_dir, frame_id):
+    pose_path = osp.join(scan_dir, frame_id, 'camera_pose.txt')
+    camera_info = np.loadtxt(pose_path)
+    rgb_image_path = osp.join(scan_dir, frame_id, 'rgb_rawlight.png')
+    color = cv2.imread(rgb_image_path)
+    height, width = color.shape[:2]
+    rot, trans, K = parse_camera_info(camera_info, height, width)
+
+    trans = np.array(trans) / 1000
+    extrinsic = np.eye(4)
+    extrinsic[:3, :3] = rot.T
+    extrinsic[:3, -1] = trans
+    extrinsic = np.linalg.inv(extrinsic)
+
+    return extrinsic
+
+def load_intrinsics(scene_folder):
+    camera_info = np.loadtxt(osp.join(scene_folder, '0', 'camera_pose.txt'))
+    rgb_image_path = osp.join(scene_folder, '0', 'rgb_rawlight.png')
+    rgb_img = cv2.imread(rgb_image_path)
+    height, width = rgb_img.shape[:2]
+    _, _, K = parse_camera_info(camera_info, height, width)
+    intrinsics = {}
+    intrinsics['intrinsic_mat'] = K
+    intrinsics['width'] = width
+    intrinsics['height'] = height
+    return intrinsics
\ No newline at end of file