38 changes: 16 additions & 22 deletions src/lerobot/datasets/aggregate.py
@@ -141,20 +141,14 @@ def update_meta_data(
         df[orig_file_col] = video_idx["file"]

         # Apply per-source-file timestamp offsets
-        src_to_offset = video_idx.get("src_to_offset", {})
-        if src_to_offset:
-            # Apply offset based on original source file
-            for idx in df.index:
-                src_key = (df.at[idx, "_orig_chunk"], df.at[idx, "_orig_file"])
-                offset = src_to_offset.get(src_key, 0)
-                df.at[idx, f"videos/{key}/from_timestamp"] += offset
-                df.at[idx, f"videos/{key}/to_timestamp"] += offset
-        else:
-            # Fallback to simple offset (for backward compatibility)
-            df[f"videos/{key}/from_timestamp"] = (
-                df[f"videos/{key}/from_timestamp"] + video_idx["latest_duration"]
-            )
-            df[f"videos/{key}/to_timestamp"] = df[f"videos/{key}/to_timestamp"] + video_idx["latest_duration"]
+        src_to_offset = video_idx["src_to_offset"]
+
+        # Apply offset based on original source file
+        for idx in df.index:
+            src_key = (df.at[idx, "_orig_chunk"], df.at[idx, "_orig_file"])
+            offset = src_to_offset.get(src_key, 0)
+            df.at[idx, f"videos/{key}/from_timestamp"] += offset
+            df.at[idx, f"videos/{key}/to_timestamp"] += offset

         # Clean up temporary columns
         df = df.drop(columns=["_orig_chunk", "_orig_file"])
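For intuition, here is a minimal sketch of the per-source-file rebasing this hunk performs. The column names follow the diff; the `src_to_offset` entries and durations are made-up values:

```python
import pandas as pd

# Made-up offsets: two source video files were copied into one destination
# file; the second source starts 12.5 s into the concatenated video.
src_to_offset = {(0, 0): 0.0, (0, 1): 12.5}

df = pd.DataFrame(
    {
        "_orig_chunk": [0, 0],
        "_orig_file": [0, 1],
        "videos/cam/from_timestamp": [0.0, 0.0],
        "videos/cam/to_timestamp": [10.0, 8.0],
    }
)

# Same rebasing as the new code above: shift each row's timestamps by the
# offset of the source file it came from, defaulting to 0 if unknown.
for idx in df.index:
    src_key = (df.at[idx, "_orig_chunk"], df.at[idx, "_orig_file"])
    offset = src_to_offset.get(src_key, 0)
    df.at[idx, "videos/cam/from_timestamp"] += offset
    df.at[idx, "videos/cam/to_timestamp"] += offset

print(df[["videos/cam/from_timestamp", "videos/cam/to_timestamp"]])
# Row 1 now spans 12.5-20.5 s instead of 0-8 s.
```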
@@ -208,6 +202,7 @@ def aggregate_datasets(
             LeRobotDatasetMetadata(repo_id, root=root) for repo_id, root in zip(repo_ids, roots, strict=False)
         ]
     )
+
     fps, robot_type, features = validate_all_metadata(all_metadata)
     video_keys = [key for key in features if features[key]["dtype"] == "video"]
@@ -230,15 +225,14 @@ def aggregate_datasets(
     meta_idx = {"chunk": 0, "file": 0}
     data_idx = {"chunk": 0, "file": 0}
     videos_idx = {
-        key: {"chunk": 0, "file": 0, "latest_duration": 0, "episode_duration": 0} for key in video_keys
+        key: {"chunk": 0, "file": 0, "episode_duration": 0, "episode_offset": 0, "src_to_offset": {} } for key in video_keys
Collaborator suggested change, to fix the Quality test:
-        key: {"chunk": 0, "file": 0, "episode_duration": 0, "episode_offset": 0, "src_to_offset": {} } for key in video_keys
+        key: {"chunk": 0, "file": 0, "episode_duration": 0, "episode_offset": 0, "src_to_offset": {} }
+        for key in video_keys

     }

     dst_meta.episodes = {}

     for src_meta in tqdm.tqdm(all_metadata, desc="Copy data and videos"):
         videos_idx = aggregate_videos(src_meta, dst_meta, videos_idx, video_files_size_in_mb, chunk_size)
         data_idx = aggregate_data(src_meta, dst_meta, data_idx, data_files_size_in_mb, chunk_size)

         meta_idx = aggregate_metadata(src_meta, dst_meta, meta_idx, data_idx, videos_idx)

         dst_meta.info["total_episodes"] += src_meta.total_episodes
@@ -282,7 +276,7 @@ def aggregate_videos(src_meta, dst_meta, videos_idx, video_files_size_in_mb, chunk_size):

         chunk_idx = video_idx["chunk"]
         file_idx = video_idx["file"]
-        current_offset = video_idx["latest_duration"]
+        current_offset = video_idx["episode_offset"]

         for src_chunk_idx, src_file_idx in unique_chunk_file_pairs:
             src_path = src_meta.root / DEFAULT_VIDEO_PATH.format(
@@ -304,8 +298,10 @@ def aggregate_videos(src_meta, dst_meta, videos_idx, video_files_size_in_mb, chunk_size):
                 videos_idx[key]["src_to_offset"][(src_chunk_idx, src_file_idx)] = current_offset
                 dst_path.parent.mkdir(parents=True, exist_ok=True)
                 shutil.copy(str(src_path), str(dst_path))
-                videos_idx[key]["episode_duration"] += src_duration
+
                 current_offset += src_duration
+                videos_idx[key]["episode_offset"] = current_offset
+                videos_idx[key]["episode_duration"] += src_duration
                 continue

             # Check file sizes before appending
@@ -324,6 +320,7 @@ def aggregate_videos(src_meta, dst_meta, videos_idx, video_files_size_in_mb, chunk_size):
                 )
                 dst_path.parent.mkdir(parents=True, exist_ok=True)
                 shutil.copy(str(src_path), str(dst_path))
+
                 # Reset offset for next file
                 current_offset = src_duration
             else:
@@ -335,6 +332,7 @@ def aggregate_videos(src_meta, dst_meta, videos_idx, video_files_size_in_mb, chunk_size):
                 )
                 current_offset += src_duration

+            videos_idx[key]["episode_offset"] = current_offset
Author comment (@nicholas-maselli, Oct 16, 2025):
The current offset must be saved on each iteration of the loop in the function above. Without this line, the offset is not saved after it resets to 0; it continues on from the total frame count in the next iteration of the outer loop, which makes the bug tricky to spot.
videos_idx[key]["episode_duration"] += src_duration

videos_idx[key]["chunk"] = chunk_idx
@@ -435,10 +433,6 @@ def aggregate_metadata(src_meta, dst_meta, meta_idx, data_idx, videos_idx):
         aggr_root=dst_meta.root,
     )

-    # Increment latest_duration by the total duration added from this source dataset
-    for k in videos_idx:
-        videos_idx[k]["latest_duration"] += videos_idx[k]["episode_duration"]
-
     return meta_idx