Commits (89)
e63ed61
add wip code
cuichenx Sep 18, 2025
7858117
update utils for transformers config in hydra
yaoyu-33 Sep 19, 2025
457bace
temp save
yaoyu-33 Sep 19, 2025
22233a2
pipeclean conversion (forward wip)
cuichenx Sep 19, 2025
6937da4
Merge branch 'refs/heads/main' into qwen-25vl-training
yaoyu-33 Sep 22, 2025
c67f734
vlm generate script updates for nemotron vl
cuichenx Sep 25, 2025
fcca45c
Merge remote-tracking branch 'refs/remotes/origin/main' into chcui/ne…
cuichenx Sep 25, 2025
790cd8d
fix after merging with main
cuichenx Sep 25, 2025
3a9ab4f
clean up
cuichenx Sep 25, 2025
e0fc7d1
fix forward pass
cuichenx Sep 26, 2025
44faee0
add /no_think sys prompt
cuichenx Sep 29, 2025
8a51440
Merge branch 'refs/heads/main' into qwen-25vl-training
yaoyu-33 Sep 30, 2025
3bc6ba5
lint
yaoyu-33 Sep 30, 2025
8061e0f
revert qwen-vl changes in gpt
yaoyu-33 Sep 30, 2025
df4755a
revert qwen-vl changes in gpt #2
yaoyu-33 Sep 30, 2025
975efd2
Add mock dataset provider for qwen25 vl
yaoyu-33 Sep 30, 2025
be708c2
add qwen25 vl dataset support from auto
yaoyu-33 Sep 30, 2025
6822d34
lint
yaoyu-33 Sep 30, 2025
ec9c7cd
enable multi image and video inputs
cuichenx Sep 30, 2025
bc8c605
update _attn_implementation
yaoyu-33 Oct 1, 2025
689f491
update comments
yaoyu-33 Oct 1, 2025
cf2c769
Merge branch 'chcui/nemotron-nano-v2-vl' into 'dev/nemotron-nano-v2-vl'
cuichenx Oct 1, 2025
4f0e90f
add preloaded dataset provider
yaoyu-33 Oct 1, 2025
4959ea5
enable hf export (need to manually copy over modeling files)
cuichenx Oct 2, 2025
98caa7a
expose strict
cuichenx Oct 2, 2025
2af0c2e
update _processor to a private attr
yaoyu-33 Oct 2, 2025
4a3ef3b
Merge branch 'chcui/hf_export' into 'dev/nemotron-nano-v2-vl'
cuichenx Oct 2, 2025
7f3818e
Merge branch 'refs/heads/main' into chcui/nano-v2-vl-training
cuichenx Oct 2, 2025
ccf6abe
update qwen training utils
yaoyu-33 Oct 2, 2025
94c6192
training bug fix
yaoyu-33 Oct 2, 2025
95d3002
fix finalize grad
yaoyu-33 Oct 3, 2025
4b7ef60
save qwen25 vl recipes
yaoyu-33 Oct 3, 2025
c37ffa0
training WIP
cuichenx Oct 3, 2025
03e3a7c
undo ckpt modification, loading works
cuichenx Oct 3, 2025
b095aae
Merge branch 'chcui/nano-v2-vl-training' into 'dev/nemotron-nano-v2-vl'
cuichenx Oct 3, 2025
608117e
add padding logic for pp
yaoyu-33 Oct 3, 2025
a9f0e15
vlm step general
yaoyu-33 Oct 6, 2025
6ddd4b3
default update
yaoyu-33 Oct 6, 2025
f30aa39
Merge branch 'main' into qwen-25vl-training
yaoyu-33 Oct 6, 2025
e425113
update to model specific visual inputs, also update mock dataset to b…
yaoyu-33 Oct 6, 2025
5bc1f29
Merge branch 'main' into qwen-25vl-training
yaoyu-33 Oct 6, 2025
90a0ff0
add ci tests
yaoyu-33 Oct 7, 2025
49759bc
lint
yaoyu-33 Oct 8, 2025
62ffa88
update dependency
yaoyu-33 Oct 8, 2025
6af4e4c
build: add qwen-vl-utils and update lockfile
yaoyu-33 Oct 8, 2025
7e0ceaf
remove `start_of_response_token` use
yaoyu-33 Oct 8, 2025
a7e5fdc
add few more unit tests
yaoyu-33 Oct 8, 2025
1e44b97
fix wandb reinit issue
yaoyu-33 Oct 8, 2025
18012cd
Revert "fix wandb reinit issue"
yaoyu-33 Oct 9, 2025
b0b910e
lint
yaoyu-33 Oct 9, 2025
d2031ca
update and fix tests for vlm dataset
yaoyu-33 Oct 9, 2025
3d8f4b3
Merge remote-tracking branch 'origin/qwen-25vl-training' into chcui/n…
cuichenx Oct 10, 2025
70aafe2
training works
cuichenx Oct 14, 2025
398a812
add raven and llava-video datasets
cuichenx Oct 14, 2025
a44d26c
push discussion code
cuichenx Oct 15, 2025
cbc25d4
Merge branch 'chcui/nano-v2-vl-training' into 'dev/nemotron-nano-v2-vl'
cuichenx Oct 15, 2025
56f9ad9
support video training
liding-nv Oct 17, 2025
a8ad5fd
add peft merge
cuichenx Oct 17, 2025
46cd9b9
change wording
cuichenx Oct 17, 2025
6008b3e
save every 200
cuichenx Oct 17, 2025
2da5696
clean up internal paths
cuichenx Oct 17, 2025
d3dd155
add merge lora script..
cuichenx Oct 18, 2025
3a13a6c
fix import
liding-nv Oct 20, 2025
b9da6cf
support multi subset video
liding-nv Oct 20, 2025
0bcfcb8
export with copy
cuichenx Oct 26, 2025
e9ee70d
qa fixes
cuichenx Oct 27, 2025
546c233
Merge remote-tracking branch 'refs/remotes/origin/main' into chcui/ne…
cuichenx Oct 28, 2025
e69586d
clean up code
cuichenx Oct 28, 2025
85c6a44
Merge remote-tracking branch 'origin/main' into chcui/nemotron-nano-v…
cuichenx Oct 28, 2025
d31d50f
Merge remote-tracking branch 'origin/main' into chcui/nemotron-nano-v…
cuichenx Oct 28, 2025
2e223e8
change to supported HF architectures
cuichenx Oct 28, 2025
1eb8fa3
add tests
cuichenx Oct 28, 2025
6f739cf
Merge remote-tracking branch 'refs/remotes/origin/main' into chcui/ne…
cuichenx Oct 29, 2025
0abb526
Merge remote-tracking branch 'refs/remotes/origin/main' into chcui/ne…
cuichenx Oct 29, 2025
0567e20
address comments
cuichenx Oct 29, 2025
edc2d98
copy over py and json files only
cuichenx Oct 31, 2025
9e80f35
merge causal lm and vlm so that output saves preprocessor config auto…
cuichenx Oct 31, 2025
bd447ae
move nemotron vlm generation to a new script
cuichenx Oct 31, 2025
bac193a
address comment
cuichenx Oct 31, 2025
c0756ce
move path helper to common utils
cuichenx Oct 31, 2025
707562a
Merge branch 'main' into chcui/nemotron-nano-v2-vl
cuichenx Oct 31, 2025
f7e0d3b
update model name
cuichenx Oct 31, 2025
b6a60d7
Merge branch 'chcui/nemotron-nano-v2-vl' of github.com:NVIDIA-NeMo/Me…
cuichenx Oct 31, 2025
bfda67e
refactor to llava_step
cuichenx Nov 1, 2025
71b4e78
clean up
cuichenx Nov 1, 2025
8813087
Merge branch 'main' into chcui/nemotron-nano-v2-vl
cuichenx Nov 3, 2025
e67e9f1
revert previous export copy code
cuichenx Nov 3, 2025
ced4190
raise error if trying to access validation split for raven and llava …
cuichenx Nov 4, 2025
f603601
Fix typo
cuichenx Nov 4, 2025
6 changes: 5 additions & 1 deletion examples/conversion/convert_checkpoints.py
@@ -141,6 +141,7 @@ def export_megatron_to_hf(
megatron_path: str,
hf_path: str,
show_progress: bool = True,
strict: bool = True,
) -> None:
"""
Export a Megatron checkpoint to HuggingFace format.
@@ -175,14 +176,15 @@ def export_megatron_to_hf(

# For demonstration, we'll create a bridge from a known config
# This would typically be extracted from the checkpoint metadata
bridge = AutoBridge.from_hf_pretrained(hf_model)
bridge = AutoBridge.from_hf_pretrained(hf_model, trust_remote_code=True)

# Export using the convenience method
print("📤 Exporting to HuggingFace format...")
bridge.export_ckpt(
megatron_path=megatron_path,
hf_path=hf_path,
show_progress=show_progress,
strict=strict,
)

print(f"✅ Successfully exported model to: {hf_path}")
@@ -232,6 +234,7 @@ def main():
"--hf-path", required=True, help="Directory path where the HuggingFace model will be saved"
)
export_parser.add_argument("--no-progress", action="store_true", help="Disable progress bar during export")
export_parser.add_argument("--not-strict", action="store_true", help="Allow source and target checkpoint to have different keys")

args = parser.parse_args()

@@ -254,6 +257,7 @@ def main():
megatron_path=args.megatron_path,
hf_path=args.hf_path,
show_progress=not args.no_progress,
strict=not args.not_strict,
)
else:
raise RuntimeError(f"Unknown command: {args.command}")
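Editor's note: the snippet below is a minimal sketch of the non-strict export path that the new `strict` parameter and `--not-strict` flag enable. The model id and checkpoint paths are placeholders rather than values from this PR; only the `AutoBridge` calls shown in the diff above are assumed.

# Minimal sketch of a non-strict export, assuming the AutoBridge API used above.
from megatron.bridge import AutoBridge

# Hypothetical model id and paths, for illustration only.
bridge = AutoBridge.from_hf_pretrained("org/vl-model", trust_remote_code=True)
bridge.export_ckpt(
    megatron_path="/ckpts/megatron_run",  # placeholder Megatron checkpoint directory
    hf_path="/ckpts/hf_export",           # placeholder output directory
    show_progress=True,
    strict=False,  # same effect as passing --not-strict on the CLI
)

Passing strict=False tolerates key mismatches between the source and target checkpoints, which is exactly what the new --not-strict flag requests.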
171 changes: 144 additions & 27 deletions examples/conversion/hf_to_megatron_generate_vlm.py
@@ -28,8 +28,7 @@
"""

import argparse
from typing import Optional

from typing import Optional, List
import requests
import torch
import torch.distributed as dist
@@ -39,6 +38,7 @@
from qwen_vl_utils import process_vision_info
from transformers import AutoProcessor, AutoTokenizer

from megatron.bridge.models.nemotron_vl.nemotron_vl_utils import adjust_image_tokens
from megatron.bridge import AutoBridge
from megatron.bridge.utils.common_utils import get_last_rank, print_rank_0

@@ -52,18 +52,20 @@ class SingleBatchIterator:
then raises StopIteration. Used for single-step inference in the forward pass.
"""

def __init__(self, input_ids, position_ids, attention_mask, pixel_values=None, image_grid_thw=None):
def __init__(self, input_ids, position_ids, attention_mask, **kwargs):
self.batch = dict(
tokens=input_ids,
position_ids=position_ids,
attention_mask=attention_mask,
)

# Add vision inputs if provided
if pixel_values is not None:
self.batch["pixel_values"] = pixel_values
if image_grid_thw is not None:
self.batch["image_grid_thw"] = image_grid_thw
if kwargs.get("images", None) is not None:
self.batch["images"] = kwargs.get("images", None)
elif kwargs.get("pixel_values", None) is not None:
self.batch["pixel_values"] = kwargs.get("pixel_values", None)
if kwargs.get("image_grid_thw", None) is not None:
self.batch["image_grid_thw"] = kwargs.get("image_grid_thw", None)

self._yielded = False

@@ -99,8 +101,9 @@ def vlm_forward_step(data_iterator, model, **kwargs) -> torch.Tensor:
"attention_mask": batch.get("attention_mask", None),
}

# Add vision inputs if present
if "pixel_values" in batch:
if "images" in batch:
forward_args["images"] = batch["images"]
elif "pixel_values" in batch:
forward_args["pixel_values"] = batch["pixel_values"]
if "image_grid_thw" in batch:
forward_args["image_grid_thw"] = batch["image_grid_thw"]
@@ -128,7 +131,7 @@ def load_image(image_path: str) -> Image.Image:
return Image.open(image_path)


def process_image_inputs(processor, image_path: Optional[str], prompt: str):
def process_image_inputs(processor, image_path: Optional[str], prompt: str, system_prompt: Optional[str] = None):
"""Process image inputs for vision-language model.

Args:
@@ -140,16 +143,27 @@ def process_image_inputs(processor, image_path: Optional[str], prompt: str):
Tuple of (input_ids, pixel_values, image_grid_thw, num_patches)
"""
if image_path:
if "," in image_path:
image_paths = image_path.split(",")
content = []
for i, path in enumerate(image_paths):
content.append({"type": "text", "text": f"{'\n' if i > 0 else ''}Image-{i+1}: "})
content.append({"type": "image", "image": path})
content.append({"type": "text", "text": '\n' + prompt})
else:
content = [
{"type": "image", "image": image_path},
{"type": "text", "text": prompt},
]
# Create messages with image and text
messages = [
{
"role": "user",
"content": [
{"type": "image", "image": image_path},
{"type": "text", "text": prompt},
],
"content": content,
}
]
if system_prompt:
messages.insert(0, {"role": "system", "content": system_prompt})

# Process vision info
image_inputs, video_inputs = process_vision_info(messages)
@@ -162,15 +176,73 @@ def process_image_inputs(processor, image_path: Optional[str], prompt: str):
text=[text],
images=image_inputs,
videos=video_inputs,
padding=True,
padding=processor.tokenizer.pad_token is not None,
return_tensors="pt",
)
return inputs.input_ids, inputs.pixel_values, getattr(inputs, "image_grid_thw", None), messages
try:
image_grid_thw = inputs.image_grid_thw
except AttributeError:
image_grid_thw = None
return inputs.input_ids, inputs.pixel_values, image_grid_thw, inputs.num_patches
else:
# Text-only processing
inputs = processor(text=[prompt], return_tensors="pt")
return inputs.input_ids, None, None, None

def process_video_inputs(processor, video_path: Optional[str], prompt: str, system_prompt: Optional[str] = None):
"""Process video inputs for vision-language model.
"""
from megatron.bridge.models.nemotron_vl.nemotron_vl_utils import maybe_path_or_url_to_data_urls, pil_image_from_base64

video_fps = -1
video_nframe = 10
video_nframe_max = -1

# Get frames and metadata
image_urls, metadata = maybe_path_or_url_to_data_urls(
video_path,
fps=max(0, int(video_fps)),
nframe=max(0, int(video_nframe)),
nframe_max=int(video_nframe_max),
)
frames = [pil_image_from_base64(image_url) for image_url in image_urls]

print(f"Video Metadata: {metadata}")

messages = [
{
"role": "user",
"content": [
{
"type": "video",
"video": f"file://{video_path}",
},
{
"type": "text",
"text": "\n" + prompt,
},
],
}
]
if system_prompt:
messages.insert(0, {"role": "system", "content": system_prompt})
prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

# Process with FPS metadata
if metadata:
inputs = processor(
text=[prompt],
videos=frames,
videos_kwargs={'video_metadata': metadata},
return_tensors="pt",
)
else:
inputs = processor(
text=[prompt],
videos=frames,
return_tensors="pt",
)
return inputs.input_ids, inputs.pixel_values_videos, None, inputs.num_patches

def main(args) -> None:
"""Main function for vision-language generation from HuggingFace VL models.
@@ -196,7 +268,7 @@ def main(args) -> None:

# We still need HF config for tokenizer, but we'll load the model from Megatron checkpoint
# Create bridge from HF config only (no weights)
bridge = AutoBridge.from_hf_pretrained(args.hf_model_path)
bridge = AutoBridge.from_hf_pretrained(args.hf_model_path, trust_remote_code=True)

# Initialize model parallel before loading
model_provider = bridge.to_megatron_provider(load_weights=False)
@@ -223,7 +295,7 @@ def main(args) -> None:
else:
# Load from HuggingFace and convert to Megatron
print_rank_0(f"Loading HuggingFace model from: {args.hf_model_path}")
bridge = AutoBridge.from_hf_pretrained(args.hf_model_path)
bridge = AutoBridge.from_hf_pretrained(args.hf_model_path, trust_remote_code=True)
model_provider = bridge.to_megatron_provider(load_weights=True)
model_provider.tensor_model_parallel_size = tp
model_provider.pipeline_model_parallel_size = pp
@@ -240,19 +312,33 @@ def main(args) -> None:
# Initialize tokenizer and processor
tokenizer = AutoTokenizer.from_pretrained(args.hf_model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(args.hf_model_path, trust_remote_code=True)
img_start_token_id = tokenizer.convert_tokens_to_ids("<img>")
img_end_token_id = tokenizer.convert_tokens_to_ids("</img>")

if tokenizer.pad_token is None:
tokenizer.pad_token = tokenizer.eos_token

# Process inputs (text and image if provided)
prompt = args.prompt
input_ids, pixel_values, image_grid_thw, messages = process_image_inputs(processor, args.image_path, prompt)
if args.video_path:
input_ids, pixel_values, image_grid_thw, num_patches = process_video_inputs(processor, args.video_path, args.prompt, args.system_prompt)
else:
input_ids, pixel_values, image_grid_thw, num_patches = process_image_inputs(processor, args.image_path, args.prompt, args.system_prompt)

images = None
if args.use_llava_model:
images = pixel_values.bfloat16()
input_ids = adjust_image_tokens(input_ids, num_patches, img_start_token_id,
img_end_token_id)
if args.video_path:
video_token_id = tokenizer.convert_tokens_to_ids("<video>")
image_token_id = tokenizer.convert_tokens_to_ids("<image>")
input_ids = torch.where(input_ids == video_token_id, image_token_id, input_ids)
pixel_values = None

# Move to GPU
input_ids = input_ids.cuda()
if pixel_values is not None:
pixel_values = pixel_values.cuda()
if image_grid_thw is not None:
image_grid_thw = image_grid_thw.cuda()
if images is not None:
images = images.cuda()

position_ids = (
torch.arange(input_ids.size(1), dtype=torch.long, device=input_ids.device).unsqueeze(0).expand_as(input_ids)
@@ -271,7 +357,14 @@ def main(args) -> None:
# Keep passing vision inputs for all steps to ensure image features are available
# The Megatron VL model only processes vision features when pixel_values is not None,
# so we need to provide them throughout the generation process
iterator = SingleBatchIterator(input_ids, position_ids, attention_mask, pixel_values, image_grid_thw)
iterator = SingleBatchIterator(
input_ids,
position_ids,
attention_mask,
pixel_values=pixel_values,
image_grid_thw=image_grid_thw,
images=images,
)

output = fwd_bwd_function(
forward_step_func=vlm_forward_step,
Expand All @@ -285,6 +378,9 @@ def main(args) -> None:
)
if isinstance(output, list) and len(output) > 0:
output = output[0]
if isinstance(output, tuple):
# for LlavaModel
output = output[0]

if parallel_state.is_pipeline_last_stage():
world_size = parallel_state.get_tensor_model_parallel_world_size()
@@ -328,7 +424,7 @@ def main(args) -> None:
print_rank_0("======== GENERATED TEXT OUTPUT ========")
if args.image_path:
print_rank_0(f"Image: {args.image_path}")
print_rank_0(f"Prompt: {prompt}")
print_rank_0(f"Prompt: {args.prompt}")
print_rank_0(f"Generated: {generated_text}")
print_rank_0("=======================================")

@@ -347,6 +443,12 @@
default="Describe this image.",
help="Input prompt for vision-language generation.",
)
parser.add_argument(
"--system_prompt",
type=str,
default=None,
help="System prompt for vision-language generation.",
)
parser.add_argument(
"--max_new_tokens",
type=int,
@@ -362,10 +464,25 @@
"--image_path",
type=str,
default=None,
help="Path or URL to the image for vision-language generation (optional).",
help="Path or URL to the image for vision-language generation (optional). Multiple images paths can be separated"
"with commas.",
)
parser.add_argument(
"--video_path",
type=str,
default=None,
help="Path or URL to the video for vision-language generation (optional).",
)
parser.add_argument(
"--use_llava_model",
action="store_true",
default=False,
help="Specify whether model uses Megatron vision model (i.e. LLaVAModel)",
)
args = parser.parse_args()

if args.use_llava_model:
args.system_prompt = "/no_think" # Nemotron Nano V2 VL model requires this system prompt
main(args)

if torch.distributed.is_initialized():
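Usage note: the snippet below is a minimal sketch of how the reworked SingleBatchIterator routes the new LLaVA-style input; when an `images` tensor is supplied it takes precedence and `pixel_values` is not added to the batch. The tensors and the import path are illustrative assumptions; in the script itself the inputs come from the HF processor.

# Minimal sketch, assuming SingleBatchIterator from the script above is importable
# (e.g. with the examples/conversion directory on PYTHONPATH).
import torch

from hf_to_megatron_generate_vlm import SingleBatchIterator  # assumed import path

input_ids = torch.randint(0, 1000, (1, 16))
position_ids = torch.arange(16).unsqueeze(0).expand_as(input_ids)
images = torch.zeros(2048, 3, 14, 14, dtype=torch.bfloat16)  # dummy patch tensor

iterator = SingleBatchIterator(input_ids, position_ids, attention_mask=None, images=images)
batch = next(iter(iterator))
# "images" was provided, so the LLaVA branch is taken and "pixel_values" is omitted.
assert "images" in batch and "pixel_values" not in batch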