43 changes: 43 additions & 0 deletions data_juicer/config/config_all.yaml
@@ -529,6 +529,35 @@ process:
overlap_len: 200 # Overlap length of the split texts if not split in the split pattern.
tokenizer: 'gpt-4o' # The tokenizer name of Hugging Face tokenizers. The text length will be calculated as the token count if a tokenizer is provided. Otherwise, the text length equals the string length.
trust_remote_code: True # whether to trust remote code when loading the Hugging Face model.
- vggt_mapper: # Input a video of a single scene, and use VGGT to extract information including Camera Pose, Depth Maps, Point Maps, and 3D Point Tracks.
vggt_model_path: "facebook/VGGT-1B" # the path to the VGGT model.
frame_num: 3 # the number of frames to be extracted uniformly from the video. If it's 1, only the middle frame will be extracted. If it's 2, only the first and the last frames will be extracted. If it's larger than 2, in addition to the first and the last frames, other frames will be extracted uniformly within the video duration.
duration: 4 # the duration of each segment in seconds. If 0, frames are extracted from the entire video. If duration > 0, the video is segmented into multiple segments based on duration, and frames are extracted from each segment.
tag_field_name: 'vggt_tags' # the field name to store the tags. It's "vggt_tags" in default.
frame_dir: None # Output directory to save extracted frames.
if_output_camera_parameters: True # Determines whether to output camera parameters.
if_output_depth_maps: True # Determines whether to output depth maps.
if_output_point_maps_from_projection: True # Determines whether to output point maps directly inferred by VGGT.
if_output_point_maps_from_unprojection: True # Determines whether to output point maps constructed from depth maps and camera parameters.
if_output_point_tracks: True # Determines whether to output point tracks.
- video_camera_calibration_static_deepcalib_mapper: # Compute the camera intrinsics and field of view (FOV) for a static camera using DeepCalib.
model_path: "weights_10_0.02.h5" # The path to the DeepCalib Regression model.
frame_num: 3 # the number of frames to be extracted uniformly from the video. If it's 1, only the middle frame will be extracted. If it's 2, only the first and the last frames will be extracted. If it's larger than 2, in addition to the first and the last frames, other frames will be extracted uniformly within the video duration.
duration: 0 # the duration of each segment in seconds. If 0, frames are extracted from the entire video. If duration > 0, the video is segmented into multiple segments based on duration, and frames are extracted from each segment.
tag_field_name: 'static_camera_calibration_deepcalib_tags' # the field name to store the tags. It's "static_camera_calibration_deepcalib_tags" in default.
frame_dir: None # Output directory to save extracted frames.
output_info_dir: None # Output directory for saving camera parameters.
- video_camera_calibration_static_moge_mapper: # Compute the camera intrinsics and field of view (FOV) for a static camera using MoGe-2 (more accurate than DeepCalib).
model_path: "Ruicheng/moge-2-vitl" # The path to the MoGe-2 model.
frame_num: 3 # the number of frames to be extracted uniformly from the video. If it's 1, only the middle frame will be extracted. If it's 2, only the first and the last frames will be extracted. If it's larger than 2, in addition to the first and the last frames, other frames will be extracted uniformly within the video duration.
duration: 0 # the duration of each segment in seconds. If 0, frames are extracted from the entire video. If duration > 0, the video is segmented into multiple segments based on duration, and frames are extracted from each segment.
tag_field_name: 'static_camera_calibration_moge_tags' # the field name to store the tags. It's "static_camera_calibration_moge_tags" in default.
frame_dir: None # Output directory to save extracted frames.
if_output_info: True # Whether to save the camera parameter results to a JSON file.
output_info_dir: None # Output directory for saving camera parameters.
if_output_points_info: True # Determines whether to output point map in OpenCV camera coordinate system (x right, y down, z forward).
if_output_depth_info: True # Determines whether to output depth maps.
if_output_mask_info: True # Determines whether to output a binary mask for valid pixels.
- video_captioning_from_audio_mapper: # caption a video according to its audio streams based on Qwen-Audio model
keep_original_sample: true # whether to keep the original sample. If it's set to False, there will be only captioned sample in the final datasets and the original sample will be removed. It's True in default.
memory: '30GB' # This operation (Op) utilizes deep neural network models that consume a significant amount of memory for computation, hence the system's available memory might constrain the maximum number of processes that can be launched
@@ -633,6 +662,20 @@ process:
frame_num: 3 # the number of frames to be extracted uniformly from the video. Only works when frame_sampling_method is "uniform". If it's 1, only the middle frame will be extracted. If it's 2, only the first and the last frames will be extracted. If it's larger than 2, in addition to the first and the last frames, other frames will be extracted uniformly within the video duration.
tag_field_name: 'video_frame_tags' # the key name in the meta field to store the tags. It's "video_frame_tags" in default.
memory: '9GB'
- video_undistort_mapper: # Undistort raw videos with corresponding camera intrinsics and distortion coefficients.
output_video_dir: None # Output directory to save undistorted videos.
tag_field_name: 'video_undistortion_tags' # The field name to store the tags. It's "video_undistortion_tags" in default.
batch_size_each_video: 1000 # Number of frames to process and save per temporary TS file batch.
crf: 22 # Constant Rate Factor (CRF) for FFmpeg encoding quality.
- video_whole_body_pose_estimation_mapper: # Input a video containing people, and use the DWPose model to extract the body, hand, foot, and face keypoints of the human subjects in the video, i.e., 2D whole-body pose estimation.
onnx_det_model: 'yolox_l.onnx' # The path to 'yolox_l.onnx'.
onnx_pose_model: 'dw-ll_ucoco_384.onnx' # The path to 'dw-ll_ucoco_384.onnx'.
frame_num: 3 # the number of frames to be extracted uniformly from the video. If it's 1, only the middle frame will be extracted. If it's 2, only the first and the last frames will be extracted. If it's larger than 2, in addition to the first and the last frames, other frames will be extracted uniformly within the video duration.
duration: 0 # the duration of each segment in seconds. If 0, frames are extracted from the entire video. If duration > 0, the video is segmented into multiple segments based on duration, and frames are extracted from each segment.
tag_field_name: 'pose_estimation_tags' # the field name to store the tags. It's "pose_estimation_tags" in default.
frame_dir: None # Output directory to save extracted frames.
if_save_visualization: False # Whether to save visualization results.
save_visualization_dir: None # The path for saving visualization results.
- whitespace_normalization_mapper: # normalize different kinds of whitespaces to English whitespace.

# Filter ops
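For reference, the video_camera_calibration_static_deepcalib_mapper configured above can also be driven directly from Python. The snippet below is a minimal sketch, not part of this PR; the weight file and video path are placeholders.

# Minimal sketch of invoking the new DeepCalib calibration op on one sample.
# The weight and video paths are placeholders, not files shipped with this PR.
from data_juicer.ops.mapper import VideoCameraCalibrationStaticDeepcalibMapper
from data_juicer.utils.constant import Fields

op = VideoCameraCalibrationStaticDeepcalibMapper(
    model_path="weights_10_0.02.h5",  # DeepCalib regression weights (local path)
    frame_num=3,
    duration=0,
)
sample = {"videos": ["example_scene.mp4"], Fields.meta: {}}  # "videos" is the default video_key
sample = op.process_single(sample)
print(sample[Fields.meta]["static_camera_calibration_deepcalib_tags"]["hfov_list"])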
10 changes: 10 additions & 0 deletions data_juicer/ops/mapper/__init__.py
@@ -82,6 +82,12 @@
from .text_chunk_mapper import TextChunkMapper
from .text_tagging_by_prompt_mapper import TextTaggingByPromptMapper
from .vggt_mapper import VggtMapper
from .video_camera_calibration_static_deepcalib_mapper import (
VideoCameraCalibrationStaticDeepcalibMapper,
)
from .video_camera_calibration_static_moge_mapper import (
VideoCameraCalibrationStaticMogeMapper,
)
from .video_captioning_from_audio_mapper import VideoCaptioningFromAudioMapper
from .video_captioning_from_frames_mapper import VideoCaptioningFromFramesMapper
from .video_captioning_from_summarizer_mapper import VideoCaptioningFromSummarizerMapper
@@ -101,6 +107,7 @@
from .video_split_by_scene_mapper import VideoSplitBySceneMapper
from .video_tagging_from_audio_mapper import VideoTaggingFromAudioMapper
from .video_tagging_from_frames_mapper import VideoTaggingFromFramesMapper
from .video_undistort_mapper import VideoUndistortMapper
from .video_whole_body_pose_estimation_mapper import VideoWholeBodyPoseEstimationMapper
from .whitespace_normalization_mapper import WhitespaceNormalizationMapper

@@ -183,6 +190,8 @@
"TextChunkMapper",
"TextTaggingByPromptMapper",
"VggtMapper",
"VideoCameraCalibrationStaticDeepcalibMapper",
"VideoCameraCalibrationStaticMogeMapper",
"VideoCaptioningFromAudioMapper",
"VideoCaptioningFromFramesMapper",
"VideoCaptioningFromSummarizerMapper",
@@ -202,6 +211,7 @@
"VideoSplitBySceneMapper",
"VideoTaggingFromAudioMapper",
"VideoTaggingFromFramesMapper",
"VideoUndistortMapper",
"VideoWholeBodyPoseEstimationMapper",
"WhitespaceNormalizationMapper",
]
180 changes: 180 additions & 0 deletions data_juicer/ops/mapper/video_camera_calibration_static_deepcalib_mapper.py
@@ -0,0 +1,180 @@
import json
import os

import numpy as np
from pydantic import PositiveInt

import data_juicer
from data_juicer.ops.load import load_ops
from data_juicer.utils.cache_utils import DATA_JUICER_ASSETS_CACHE
from data_juicer.utils.constant import Fields, MetaKeys
from data_juicer.utils.lazy_loader import LazyLoader
from data_juicer.utils.mm_utils import SpecialTokens
from data_juicer.utils.model_utils import get_model, prepare_model

from ..base_op import OPERATORS, Mapper
from ..op_fusion import LOADED_VIDEOS

OP_NAME = "video_camera_calibration_static_deepcalib_mapper"

cv2 = LazyLoader("cv2", "opencv-python")


@OPERATORS.register_module(OP_NAME)
@LOADED_VIDEOS.register_module(OP_NAME)
class VideoCameraCalibrationStaticDeepcalibMapper(Mapper):
"""Compute the camera intrinsics and field of view (FOV)
for a static camera using DeepCalib."""

_accelerator = "cuda"

def __init__(
self,
model_path: str = "weights_10_0.02.h5",
frame_num: PositiveInt = 3,
duration: float = 0,
tag_field_name: str = MetaKeys.static_camera_calibration_deepcalib_tags,
frame_dir: str = DATA_JUICER_ASSETS_CACHE,
output_info_dir: str = DATA_JUICER_ASSETS_CACHE,
*args,
**kwargs,
):
"""
Initialization method.

:param model_path: The path to the DeepCalib Regression model.
:param frame_num: The number of frames to be extracted uniformly from
the video. If it's 1, only the middle frame will be extracted. If
it's 2, only the first and the last frames will be extracted. If
it's larger than 2, in addition to the first and the last frames,
other frames will be extracted uniformly within the video duration.
If "duration" > 0, frame_num is the number of frames per segment.
:param duration: The duration of each segment in seconds.
If 0, frames are extracted from the entire video.
If duration > 0, the video is segmented into multiple segments
based on duration, and frames are extracted from each segment.
:param tag_field_name: The field name to store the tags. It's
"static_camera_calibration_deepcalib_tags" in default.
:param frame_dir: Output directory to save extracted frames.
:param output_info_dir: Output directory for saving camera parameters.
:param args: extra args
:param kwargs: extra args

"""

super().__init__(*args, **kwargs)

LazyLoader.check_packages(["tensorflow"])
import keras
from keras.applications.imagenet_utils import preprocess_input

self.keras = keras
self.preprocess_input = preprocess_input

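# Delegate frame extraction to the existing video_extract_frames_mapper op, which
# samples frames uniformly from the whole video (duration == 0) or from each
# duration-second segment, and writes them under frame_dir.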
self.video_extract_frames_mapper_args = {
"frame_sampling_method": "uniform",
"frame_num": frame_num,
"duration": duration,
"frame_dir": frame_dir,
"frame_key": MetaKeys.video_frames,
}
self.fused_ops = load_ops([{"video_extract_frames_mapper": self.video_extract_frames_mapper_args}])
self.model_key = prepare_model(model_type="deepcalib", model_path=model_path)

self.frame_num = frame_num
self.duration = duration
self.tag_field_name = tag_field_name
self.frame_dir = frame_dir
self.output_info_dir = output_info_dir
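# DeepCalib operates on 299x299 inputs (the Inception-v3 input size); the focal
# prediction is mapped onto the [focal_start, focal_end] pixel range at that
# resolution before being rescaled to the original image width below.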
self.INPUT_SIZE = 299
self.focal_start = 40
self.focal_end = 500

def process_single(self, sample=None, rank=None):

# check if it's generated already
if self.tag_field_name in sample[Fields.meta]:
return sample

# there is no video in this sample
if self.video_key not in sample or not sample[self.video_key]:
return []

# load videos
ds_list = [{"text": SpecialTokens.video, "videos": sample[self.video_key]}]

dataset = data_juicer.core.data.NestedDataset.from_list(ds_list)
dataset = self.fused_ops[0].run(dataset)

frames_root = os.path.join(self.frame_dir, os.path.splitext(os.path.basename(sample[self.video_key][0]))[0])
frame_names = sorted(os.listdir(frames_root))  # sort so per-frame results align with frame_names
frames_path = [os.path.join(frames_root, frame_name) for frame_name in frame_names]
model = get_model(self.model_key, rank, self.use_cuda())

final_k_list = []
final_xi_list = []
final_hfov_list = []
final_vfov_list = []

for i, path in enumerate(frames_path):
image = cv2.imread(path)
height, width, channels = image.shape

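# Resize to the 299x299 network input, convert BGR -> RGB, and scale pixel
# values to [-1, 1] before running the regression model.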
image = cv2.resize(image, (self.INPUT_SIZE, self.INPUT_SIZE))
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
image = image / 255.0
image = image - 0.5
image = image * 2.0
image = np.expand_dims(image, 0)

image = self.preprocess_input(image)

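# The DeepCalib regression model returns two output heads: a focal-length
# prediction and a distortion prediction; both are rescaled below.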
prediction = model.predict(image)
prediction_focal = prediction[0]
prediction_dist = prediction[1]

# Scale the focal length based on the original width of the image.
curr_focal_pred = (
(prediction_focal[0][0] * (self.focal_end + 1.0 - self.focal_start * 1.0) + self.focal_start * 1.0)
* (width * 1.0)
/ (self.INPUT_SIZE * 1.0)
)
curr_focal_pred = curr_focal_pred.item()
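# Illustrative example (not from the PR): for a 1920-pixel-wide frame and a raw
# focal prediction of 0.5, the focal length becomes
# (0.5 * (500 + 1 - 40) + 40) * 1920 / 299 ≈ 1737 pixels.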

# Following DeepCalib's official codes
curr_dist_pred = prediction_dist[0][0] * 1.2
curr_dist_pred = curr_dist_pred.item()

temp_k = [[curr_focal_pred, 0, width / 2], [0, curr_focal_pred, height / 2], [0, 0, 1]]
temp_xi = curr_dist_pred

temp_hfov = 2 * np.arctan(width / 2 / curr_focal_pred) # rad
temp_vfov = 2 * np.arctan(height / 2 / curr_focal_pred)
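# Continuing the illustrative numbers above: with f ≈ 1737 px and width = 1920,
# hfov = 2 * arctan(960 / 1737) ≈ 1.01 rad ≈ 57.9 degrees.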

temp_hfov = temp_hfov.item()
temp_vfov = temp_vfov.item()

final_k_list.append(temp_k)
final_xi_list.append(temp_xi)
final_hfov_list.append(temp_hfov)
final_vfov_list.append(temp_vfov)

sample[Fields.meta][self.tag_field_name] = {
"frames_folder": frames_root,
"frame_names": frame_names,
"intrinsics_list": final_k_list,
"xi_list": final_xi_list,
"hfov_list": final_hfov_list,
"vfov_list": final_vfov_list,
}

os.makedirs(self.output_info_dir, exist_ok=True)
with open(
os.path.join(
self.output_info_dir, os.path.splitext(os.path.basename(sample[self.video_key][0]))[0] + ".json"
),
"w",
) as f:
json.dump(sample[Fields.meta][self.tag_field_name], f)

return sample