43 changes: 43 additions & 0 deletions data_juicer/config/config_all.yaml
@@ -529,6 +529,35 @@ process:
overlap_len: 200 # Overlap length of the split texts if not split in the split pattern.
tokenizer: 'gpt-4o' # The tokenizer name of Hugging Face tokenizers. The text length will be calculated as the token count if a tokenizer is provided. Otherwise, the text length equals the string length.
trust_remote_code: True # whether to trust remote code when loading the Hugging Face model.
- vggt_mapper: # Input a video of a single scene, and use VGGT to extract information including Camera Pose, Depth Maps, Point Maps, and 3D Point Tracks.
vggt_model_path: "facebook/VGGT-1B" # the path to the VGGT model.
frame_num: 3 # the number of frames to be extracted uniformly from the video. If it's 1, only the middle frame will be extracted. If it's 2, only the first and the last frames will be extracted. If it's larger than 2, in addition to the first and the last frames, other frames will be extracted uniformly within the video duration.
duration: 4 # the duration of each segment in seconds. If 0, frames are extracted from the entire video. If duration > 0, the video is segmented into multiple segments based on duration, and frames are extracted from each segment.
tag_field_name: 'vggt_tags' # the field name to store the tags. It's "vggt_tags" in default.
frame_dir: None # Output directory to save extracted frames.
if_output_camera_parameters: True # Determines whether to output camera parameters.
if_output_depth_maps: True # Determines whether to output depth maps.
if_output_point_maps_from_projection: True # Determines whether to output point maps directly inferred by VGGT.
if_output_point_maps_from_unprojection: True # Determines whether to output point maps constructed from depth maps and camera parameters.
if_output_point_tracks: True # Determines whether to output point tracks.
- video_camera_calibration_static_deepcalib_mapper: # Compute the camera intrinsics and field of view (FOV) for a static camera using DeepCalib.
model_path: "weights_10_0.02.h5" # The path to the DeepCalib Regression model.
frame_num: 3 # the number of frames to be extracted uniformly from the video. If it's 1, only the middle frame will be extracted. If it's 2, only the first and the last frames will be extracted. If it's larger than 2, in addition to the first and the last frames, other frames will be extracted uniformly within the video duration.
duration: 0 # the duration of each segment in seconds. If 0, frames are extracted from the entire video. If duration > 0, the video is segmented into multiple segments based on duration, and frames are extracted from each segment.
tag_field_name: 'static_camera_calibration_deepcalib_tags' # the field name to store the tags. It's "static_camera_calibration_deepcalib_tags" in default.
frame_dir: None # Output directory to save extracted frames.
output_info_dir: None # Output directory for saving camera parameters.
- video_camera_calibration_static_moge_mapper: # Compute the camera intrinsics and field of view (FOV) for a static camera using MoGe-2 (more accurate than DeepCalib).
model_path: "Ruicheng/moge-2-vitl" # The path to the MoGe-2 model.
frame_num: 3 # the number of frames to be extracted uniformly from the video. If it's 1, only the middle frame will be extracted. If it's 2, only the first and the last frames will be extracted. If it's larger than 2, in addition to the first and the last frames, other frames will be extracted uniformly within the video duration.
duration: 0 # the duration of each segment in seconds. If 0, frames are extracted from the entire video. If duration > 0, the video is segmented into multiple segments based on duration, and frames are extracted from each segment.
tag_field_name: 'static_camera_calibration_moge_tags' # the field name to store the tags. It's "static_camera_calibration_moge_tags" in default.
frame_dir: None # Output directory to save extracted frames.
if_output_info: True # Whether to save the camera parameter results to a JSON file.
output_info_dir: None # Output directory for saving camera parameters.
if_output_points_info: True # Determines whether to output point map in OpenCV camera coordinate system (x right, y down, z forward).
if_output_depth_info: True # Determines whether to output depth maps.
if_output_mask_info: True # Determines whether to output a binary mask for valid pixels.
- video_captioning_from_audio_mapper: # caption a video according to its audio streams based on Qwen-Audio model
keep_original_sample: true # whether to keep the original sample. If it's set to False, there will be only captioned sample in the final datasets and the original sample will be removed. It's True in default.
memory: '30GB' # This operation (Op) utilizes deep neural network models that consume a significant amount of memory for computation, hence the system's available memory might constrain the maximum number of processes that can be launched
@@ -633,6 +662,20 @@ process:
frame_num: 3 # the number of frames to be extracted uniformly from the video. Only works when frame_sampling_method is "uniform". If it's 1, only the middle frame will be extracted. If it's 2, only the first and the last frames will be extracted. If it's larger than 2, in addition to the first and the last frames, other frames will be extracted uniformly within the video duration.
tag_field_name: 'video_frame_tags' # the key name in the meta field to store the tags. It's "video_frame_tags" in default.
memory: '9GB'
- video_undistort_mapper: # Undistort raw videos with corresponding camera intrinsics and distortion coefficients.
output_video_dir: None # Output directory to save undistorted videos.
tag_field_name: 'video_undistortion_tags' # The field name to store the tags. It's "video_undistortion_tags" in default.
batch_size_each_video: 1000 # Number of frames to process and save per temporary TS file batch.
crf: 22 # Constant Rate Factor (CRF) for FFmpeg encoding quality.
- video_whole_body_pose_estimation_mapper: # Input a video containing people, and use the DWPose model to extract the body, hand, foot, and face keypoints of the human subjects in the video, i.e., 2D whole-body pose estimation.
onnx_det_model: 'yolox_l.onnx' # The path to 'yolox_l.onnx'.
onnx_pose_model: 'dw-ll_ucoco_384.onnx' # The path to 'dw-ll_ucoco_384.onnx'.
frame_num: 3 # the number of frames to be extracted uniformly from the video. If it's 1, only the middle frame will be extracted. If it's 2, only the first and the last frames will be extracted. If it's larger than 2, in addition to the first and the last frames, other frames will be extracted uniformly within the video duration.
duration: 0 # the duration of each segment in seconds. If 0, frames are extracted from the entire video. If duration > 0, the video is segmented into multiple segments based on duration, and frames are extracted from each segment.
tag_field_name: 'pose_estimation_tags' # the field name to store the tags. It's "pose_estimation_tags" in default.
frame_dir: None # Output directory to save extracted frames.
if_save_visualization: False # Whether to save visualization results.
save_visualization_dir: None # The path for saving visualization results.
- whitespace_normalization_mapper: # normalize different kinds of whitespaces to English whitespace.

# Filter ops
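For reference, the video_camera_calibration_static_deepcalib_mapper configured above can also be driven directly from Python. The snippet below is a minimal sketch, not part of this PR; the weight file and video path are placeholders.

# Minimal sketch of invoking the new DeepCalib calibration op on one sample.
# The weight and video paths are placeholders, not files shipped with this PR.
from data_juicer.ops.mapper import VideoCameraCalibrationStaticDeepcalibMapper
from data_juicer.utils.constant import Fields

op = VideoCameraCalibrationStaticDeepcalibMapper(
    model_path="weights_10_0.02.h5",  # DeepCalib regression weights (local path)
    frame_num=3,
    duration=0,
)
sample = {"videos": ["example_scene.mp4"], Fields.meta: {}}  # "videos" is the default video_key
sample = op.process_single(sample)
print(sample[Fields.meta]["static_camera_calibration_deepcalib_tags"]["hfov_list"])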
10 changes: 10 additions & 0 deletions data_juicer/ops/mapper/__init__.py
@@ -82,6 +82,12 @@
from .text_chunk_mapper import TextChunkMapper
from .text_tagging_by_prompt_mapper import TextTaggingByPromptMapper
from .vggt_mapper import VggtMapper
from .video_camera_calibration_static_deepcalib_mapper import (
VideoCameraCalibrationStaticDeepcalibMapper,
)
from .video_camera_calibration_static_moge_mapper import (
VideoCameraCalibrationStaticMogeMapper,
)
from .video_captioning_from_audio_mapper import VideoCaptioningFromAudioMapper
from .video_captioning_from_frames_mapper import VideoCaptioningFromFramesMapper
from .video_captioning_from_summarizer_mapper import VideoCaptioningFromSummarizerMapper
@@ -101,6 +107,7 @@
from .video_split_by_scene_mapper import VideoSplitBySceneMapper
from .video_tagging_from_audio_mapper import VideoTaggingFromAudioMapper
from .video_tagging_from_frames_mapper import VideoTaggingFromFramesMapper
from .video_undistort_mapper import VideoUndistortMapper
from .video_whole_body_pose_estimation_mapper import VideoWholeBodyPoseEstimationMapper
from .whitespace_normalization_mapper import WhitespaceNormalizationMapper

@@ -183,6 +190,8 @@
"TextChunkMapper",
"TextTaggingByPromptMapper",
"VggtMapper",
"VideoCameraCalibrationStaticDeepcalibMapper",
"VideoCameraCalibrationStaticMogeMapper",
"VideoCaptioningFromAudioMapper",
"VideoCaptioningFromFramesMapper",
"VideoCaptioningFromSummarizerMapper",
@@ -202,6 +211,7 @@
"VideoSplitBySceneMapper",
"VideoTaggingFromAudioMapper",
"VideoTaggingFromFramesMapper",
"VideoUndistortMapper",
"VideoWholeBodyPoseEstimationMapper",
"WhitespaceNormalizationMapper",
]
180 changes: 180 additions & 0 deletions data_juicer/ops/mapper/video_camera_calibration_static_deepcalib_mapper.py
@@ -0,0 +1,180 @@
import json
import os

import numpy as np
from pydantic import PositiveInt

import data_juicer
from data_juicer.ops.load import load_ops
from data_juicer.utils.cache_utils import DATA_JUICER_ASSETS_CACHE
from data_juicer.utils.constant import Fields, MetaKeys
from data_juicer.utils.lazy_loader import LazyLoader
from data_juicer.utils.mm_utils import SpecialTokens
from data_juicer.utils.model_utils import get_model, prepare_model

from ..base_op import OPERATORS, Mapper
from ..op_fusion import LOADED_VIDEOS

OP_NAME = "video_camera_calibration_static_deepcalib_mapper"

cv2 = LazyLoader("cv2", "opencv-python")


@OPERATORS.register_module(OP_NAME)
@LOADED_VIDEOS.register_module(OP_NAME)
class VideoCameraCalibrationStaticDeepcalibMapper(Mapper):
"""Compute the camera intrinsics and field of view (FOV)
for a static camera using DeepCalib."""

_accelerator = "cuda"

def __init__(
self,
model_path: str = "weights_10_0.02.h5",
frame_num: PositiveInt = 3,
duration: float = 0,
tag_field_name: str = MetaKeys.static_camera_calibration_deepcalib_tags,
frame_dir: str = DATA_JUICER_ASSETS_CACHE,
output_info_dir: str = DATA_JUICER_ASSETS_CACHE,
*args,
**kwargs,
):
"""
Initialization method.

:param model_path: The path to the DeepCalib Regression model.
:param frame_num: The number of frames to be extracted uniformly from
the video. If it's 1, only the middle frame will be extracted. If
it's 2, only the first and the last frames will be extracted. If
it's larger than 2, in addition to the first and the last frames,
other frames will be extracted uniformly within the video duration.
If "duration" > 0, frame_num is the number of frames per segment.
:param duration: The duration of each segment in seconds.
If 0, frames are extracted from the entire video.
If duration > 0, the video is segmented into multiple segments
based on duration, and frames are extracted from each segment.
:param tag_field_name: The field name to store the tags. It's
"static_camera_calibration_deepcalib_tags" in default.
:param frame_dir: Output directory to save extracted frames.
:param output_info_dir: Output directory for saving camera parameters.
:param args: extra args
:param kwargs: extra args

"""

super().__init__(*args, **kwargs)

LazyLoader.check_packages(["tensorflow"])
import keras
from keras.applications.imagenet_utils import preprocess_input

self.keras = keras
self.preprocess_input = preprocess_input

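# Delegate frame extraction to the existing video_extract_frames_mapper op, which
# samples frames uniformly from the whole video (duration == 0) or from each
# duration-second segment, and writes them under frame_dir.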
self.video_extract_frames_mapper_args = {
"frame_sampling_method": "uniform",
"frame_num": frame_num,
"duration": duration,
"frame_dir": frame_dir,
"frame_key": MetaKeys.video_frames,
}
self.fused_ops = load_ops([{"video_extract_frames_mapper": self.video_extract_frames_mapper_args}])
self.model_key = prepare_model(model_type="deepcalib", model_path=model_path)

self.frame_num = frame_num
self.duration = duration
self.tag_field_name = tag_field_name
self.frame_dir = frame_dir
self.output_info_dir = output_info_dir
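# DeepCalib operates on 299x299 inputs (the Inception-v3 input size); the focal
# prediction is mapped onto the [focal_start, focal_end] pixel range at that
# resolution before being rescaled to the original image width below.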
self.INPUT_SIZE = 299
self.focal_start = 40
self.focal_end = 500

def process_single(self, sample=None, rank=None):

# check if it's generated already
if self.tag_field_name in sample[Fields.meta]:
return sample

# there is no video in this sample
if self.video_key not in sample or not sample[self.video_key]:
return []

# load videos
ds_list = [{"text": SpecialTokens.video, "videos": sample[self.video_key]}]

dataset = data_juicer.core.data.NestedDataset.from_list(ds_list)
dataset = self.fused_ops[0].run(dataset)

frames_root = os.path.join(self.frame_dir, os.path.splitext(os.path.basename(sample[self.video_key][0]))[0])
frame_names = sorted(os.listdir(frames_root))  # sort so per-frame results align with frame_names
frames_path = [os.path.join(frames_root, frame_name) for frame_name in frame_names]
model = get_model(self.model_key, rank, self.use_cuda())

final_k_list = []
final_xi_list = []
final_hfov_list = []
final_vfov_list = []

for i, path in enumerate(frames_path):
image = cv2.imread(path)
height, width, channels = image.shape

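# Resize to the 299x299 network input, convert BGR -> RGB, and scale pixel
# values to [-1, 1] before running the regression model.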
image = cv2.resize(image, (self.INPUT_SIZE, self.INPUT_SIZE))
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
image = image / 255.0
image = image - 0.5
image = image * 2.0
image = np.expand_dims(image, 0)

image = self.preprocess_input(image)

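# The DeepCalib regression model returns two output heads: a focal-length
# prediction and a distortion prediction; both are rescaled below.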
prediction = model.predict(image)
prediction_focal = prediction[0]
prediction_dist = prediction[1]

# Scale the focal length based on the original width of the image.
curr_focal_pred = (
(prediction_focal[0][0] * (self.focal_end + 1.0 - self.focal_start * 1.0) + self.focal_start * 1.0)
* (width * 1.0)
/ (self.INPUT_SIZE * 1.0)
)
curr_focal_pred = curr_focal_pred.item()
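# Illustrative example (not from the PR): for a 1920-pixel-wide frame and a raw
# focal prediction of 0.5, the focal length becomes
# (0.5 * (500 + 1 - 40) + 40) * 1920 / 299 ≈ 1737 pixels.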

# Following DeepCalib's official codes
curr_dist_pred = prediction_dist[0][0] * 1.2
curr_dist_pred = curr_dist_pred.item()

temp_k = [[curr_focal_pred, 0, width / 2], [0, curr_focal_pred, height / 2], [0, 0, 1]]
temp_xi = curr_dist_pred

temp_hfov = 2 * np.arctan(width / 2 / curr_focal_pred) # rad
temp_vfov = 2 * np.arctan(height / 2 / curr_focal_pred)
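# Continuing the illustrative numbers above: with f ≈ 1737 px and width = 1920,
# hfov = 2 * arctan(960 / 1737) ≈ 1.01 rad ≈ 57.9 degrees.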

temp_hfov = temp_hfov.item()
temp_vfov = temp_vfov.item()

final_k_list.append(temp_k)
final_xi_list.append(temp_xi)
final_hfov_list.append(temp_hfov)
final_vfov_list.append(temp_vfov)

sample[Fields.meta][self.tag_field_name] = {
"frames_folder": frames_root,
"frame_names": frame_names,
"intrinsics_list": final_k_list,
"xi_list": final_xi_list,
"hfov_list": final_hfov_list,
"vfov_list": final_vfov_list,
}

os.makedirs(self.output_info_dir, exist_ok=True)
with open(
os.path.join(
self.output_info_dir, os.path.splitext(os.path.basename(sample[self.video_key][0]))[0] + ".json"
),
"w",
) as f:
json.dump(sample[Fields.meta][self.tag_field_name], f)

return sample