Commits (89)
e63ed61
add wip code
cuichenx Sep 18, 2025
7858117
update utils for transformers config in hydra
yaoyu-33 Sep 19, 2025
457bace
temp save
yaoyu-33 Sep 19, 2025
22233a2
pipeclean conversion (forward wip)
cuichenx Sep 19, 2025
6937da4
Merge branch 'refs/heads/main' into qwen-25vl-training
yaoyu-33 Sep 22, 2025
c67f734
vlm generate script updates for nemotron vl
cuichenx Sep 25, 2025
fcca45c
Merge remote-tracking branch 'refs/remotes/origin/main' into chcui/ne…
cuichenx Sep 25, 2025
790cd8d
fix after merging with main
cuichenx Sep 25, 2025
3a9ab4f
clean up
cuichenx Sep 25, 2025
e0fc7d1
fix forward pass
cuichenx Sep 26, 2025
44faee0
add /no_think sys prompt
cuichenx Sep 29, 2025
8a51440
Merge branch 'refs/heads/main' into qwen-25vl-training
yaoyu-33 Sep 30, 2025
3bc6ba5
lint
yaoyu-33 Sep 30, 2025
8061e0f
revert qwen-vl changes in gpt
yaoyu-33 Sep 30, 2025
df4755a
revert qwen-vl changes in gpt #2
yaoyu-33 Sep 30, 2025
975efd2
Add mock dataset provider for qwen25 vl
yaoyu-33 Sep 30, 2025
be708c2
add qwen25 vl dataset support from auto
yaoyu-33 Sep 30, 2025
6822d34
lint
yaoyu-33 Sep 30, 2025
ec9c7cd
enable multi image and video inputs
cuichenx Sep 30, 2025
bc8c605
update _attn_implementation
yaoyu-33 Oct 1, 2025
689f491
update comments
yaoyu-33 Oct 1, 2025
cf2c769
Merge branch 'chcui/nemotron-nano-v2-vl' into 'dev/nemotron-nano-v2-vl'
cuichenx Oct 1, 2025
4f0e90f
add preloaded dataset provider
yaoyu-33 Oct 1, 2025
4959ea5
enable hf export (need to manually copy over modeling files)
cuichenx Oct 2, 2025
98caa7a
expose strict
cuichenx Oct 2, 2025
2af0c2e
update _processor to a private attr
yaoyu-33 Oct 2, 2025
4a3ef3b
Merge branch 'chcui/hf_export' into 'dev/nemotron-nano-v2-vl'
cuichenx Oct 2, 2025
7f3818e
Merge branch 'refs/heads/main' into chcui/nano-v2-vl-training
cuichenx Oct 2, 2025
ccf6abe
update qwen training utils
yaoyu-33 Oct 2, 2025
94c6192
training bug fix
yaoyu-33 Oct 2, 2025
95d3002
fix finalize grad
yaoyu-33 Oct 3, 2025
4b7ef60
save qwen25 vl recipes
yaoyu-33 Oct 3, 2025
c37ffa0
training WIP
cuichenx Oct 3, 2025
03e3a7c
undo ckpt modification, loading works
cuichenx Oct 3, 2025
b095aae
Merge branch 'chcui/nano-v2-vl-training' into 'dev/nemotron-nano-v2-vl'
cuichenx Oct 3, 2025
608117e
add padding logic for pp
yaoyu-33 Oct 3, 2025
a9f0e15
vlm step general
yaoyu-33 Oct 6, 2025
6ddd4b3
default update
yaoyu-33 Oct 6, 2025
f30aa39
Merge branch 'main' into qwen-25vl-training
yaoyu-33 Oct 6, 2025
e425113
update to model specific visual inputs, also update mock dataset to b…
yaoyu-33 Oct 6, 2025
5bc1f29
Merge branch 'main' into qwen-25vl-training
yaoyu-33 Oct 6, 2025
90a0ff0
add ci tests
yaoyu-33 Oct 7, 2025
49759bc
lint
yaoyu-33 Oct 8, 2025
62ffa88
update dependency
yaoyu-33 Oct 8, 2025
6af4e4c
build: add qwen-vl-utils and update lockfile
yaoyu-33 Oct 8, 2025
7e0ceaf
remove `start_of_response_token` use
yaoyu-33 Oct 8, 2025
a7e5fdc
add few more unit tests
yaoyu-33 Oct 8, 2025
1e44b97
fix wandb reinit issue
yaoyu-33 Oct 8, 2025
18012cd
Revert "fix wandb reinit issue"
yaoyu-33 Oct 9, 2025
b0b910e
lint
yaoyu-33 Oct 9, 2025
d2031ca
update and fix tests for vlm dataset
yaoyu-33 Oct 9, 2025
3d8f4b3
Merge remote-tracking branch 'origin/qwen-25vl-training' into chcui/n…
cuichenx Oct 10, 2025
70aafe2
training works
cuichenx Oct 14, 2025
398a812
add raven and llava-video datasets
cuichenx Oct 14, 2025
a44d26c
push discussion code
cuichenx Oct 15, 2025
cbc25d4
Merge branch 'chcui/nano-v2-vl-training' into 'dev/nemotron-nano-v2-vl'
cuichenx Oct 15, 2025
56f9ad9
support video training
liding-nv Oct 17, 2025
a8ad5fd
add peft merge
cuichenx Oct 17, 2025
46cd9b9
change wording
cuichenx Oct 17, 2025
6008b3e
save every 200
cuichenx Oct 17, 2025
2da5696
clean up internal paths
cuichenx Oct 17, 2025
d3dd155
add merge lora script..
cuichenx Oct 18, 2025
3a13a6c
fix import
liding-nv Oct 20, 2025
b9da6cf
support multi subset video
liding-nv Oct 20, 2025
0bcfcb8
export with copy
cuichenx Oct 26, 2025
e9ee70d
qa fixes
cuichenx Oct 27, 2025
546c233
Merge remote-tracking branch 'refs/remotes/origin/main' into chcui/ne…
cuichenx Oct 28, 2025
e69586d
clean up code
cuichenx Oct 28, 2025
85c6a44
Merge remote-tracking branch 'origin/main' into chcui/nemotron-nano-v…
cuichenx Oct 28, 2025
d31d50f
Merge remote-tracking branch 'origin/main' into chcui/nemotron-nano-v…
cuichenx Oct 28, 2025
2e223e8
change to supported HF architectures
cuichenx Oct 28, 2025
1eb8fa3
add tests
cuichenx Oct 28, 2025
6f739cf
Merge remote-tracking branch 'refs/remotes/origin/main' into chcui/ne…
cuichenx Oct 29, 2025
0abb526
Merge remote-tracking branch 'refs/remotes/origin/main' into chcui/ne…
cuichenx Oct 29, 2025
0567e20
address comments
cuichenx Oct 29, 2025
edc2d98
copy over py and json files only
cuichenx Oct 31, 2025
9e80f35
merge causal lm and vlm so that output saves preprocessor config auto…
cuichenx Oct 31, 2025
bd447ae
move nemotron vlm generation to a new script
cuichenx Oct 31, 2025
bac193a
address comment
cuichenx Oct 31, 2025
c0756ce
move path helper to common utils
cuichenx Oct 31, 2025
707562a
Merge branch 'main' into chcui/nemotron-nano-v2-vl
cuichenx Oct 31, 2025
f7e0d3b
update model name
cuichenx Oct 31, 2025
b6a60d7
Merge branch 'chcui/nemotron-nano-v2-vl' of github.com:NVIDIA-NeMo/Me…
cuichenx Oct 31, 2025
bfda67e
refactor to llava_step
cuichenx Nov 1, 2025
71b4e78
clean up
cuichenx Nov 1, 2025
8813087
Merge branch 'main' into chcui/nemotron-nano-v2-vl
cuichenx Nov 3, 2025
e67e9f1
revert previous export copy code
cuichenx Nov 3, 2025
ced4190
raise error if trying to access validation split for raven and llava …
cuichenx Nov 4, 2025
f603601
Fix typo
cuichenx Nov 4, 2025
6 changes: 5 additions & 1 deletion examples/conversion/convert_checkpoints.py
@@ -141,6 +141,7 @@ def export_megatron_to_hf(
megatron_path: str,
hf_path: str,
show_progress: bool = True,
strict: bool = True,
) -> None:
"""
Export a Megatron checkpoint to HuggingFace format.
@@ -175,14 +176,15 @@ def export_megatron_to_hf(

# For demonstration, we'll create a bridge from a known config
# This would typically be extracted from the checkpoint metadata
bridge = AutoBridge.from_hf_pretrained(hf_model)
bridge = AutoBridge.from_hf_pretrained(hf_model, trust_remote_code=True)

# Export using the convenience method
print("📤 Exporting to HuggingFace format...")
bridge.export_ckpt(
megatron_path=megatron_path,
hf_path=hf_path,
show_progress=show_progress,
strict=strict,
)

print(f"✅ Successfully exported model to: {hf_path}")
@@ -232,6 +234,7 @@ def main():
"--hf-path", required=True, help="Directory path where the HuggingFace model will be saved"
)
export_parser.add_argument("--no-progress", action="store_true", help="Disable progress bar during export")
export_parser.add_argument("--not-strict", action="store_true", help="Allow source and target checkpoint to have different keys")

args = parser.parse_args()

@@ -254,6 +257,7 @@ def main():
megatron_path=args.megatron_path,
hf_path=args.hf_path,
show_progress=not args.no_progress,
strict=not args.not_strict,
)
else:
raise RuntimeError(f"Unknown command: {args.command}")
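Editor's note: the snippet below is a minimal sketch of the non-strict export path that the new `strict` parameter and `--not-strict` flag enable. The model id and checkpoint paths are placeholders rather than values from this PR; only the `AutoBridge` calls shown in the diff above are assumed.

# Minimal sketch of a non-strict export, assuming the AutoBridge API used above.
from megatron.bridge import AutoBridge

# Hypothetical model id and paths, for illustration only.
bridge = AutoBridge.from_hf_pretrained("org/vl-model", trust_remote_code=True)
bridge.export_ckpt(
    megatron_path="/ckpts/megatron_run",  # placeholder Megatron checkpoint directory
    hf_path="/ckpts/hf_export",           # placeholder output directory
    show_progress=True,
    strict=False,  # same effect as passing --not-strict on the CLI
)

Passing strict=False tolerates key mismatches between the source and target checkpoints, which is exactly what the new --not-strict flag requests.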
171 changes: 144 additions & 27 deletions examples/conversion/hf_to_megatron_generate_vlm.py
@@ -28,8 +28,7 @@
"""

import argparse
from typing import Optional

from typing import Optional, List
import requests
import torch
import torch.distributed as dist
@@ -39,6 +38,7 @@
from qwen_vl_utils import process_vision_info
from transformers import AutoProcessor, AutoTokenizer

from megatron.bridge.models.nemotron_vl.nemotron_vl_utils import adjust_image_tokens
from megatron.bridge import AutoBridge
from megatron.bridge.utils.common_utils import get_last_rank, print_rank_0

@@ -52,18 +52,20 @@ class SingleBatchIterator:
then raises StopIteration. Used for single-step inference in the forward pass.
"""

def __init__(self, input_ids, position_ids, attention_mask, pixel_values=None, image_grid_thw=None):
def __init__(self, input_ids, position_ids, attention_mask, **kwargs):
self.batch = dict(
tokens=input_ids,
position_ids=position_ids,
attention_mask=attention_mask,
)

# Add vision inputs if provided
if pixel_values is not None:
self.batch["pixel_values"] = pixel_values
if image_grid_thw is not None:
self.batch["image_grid_thw"] = image_grid_thw
if kwargs.get("images", None) is not None:
self.batch["images"] = kwargs.get("images", None)
elif kwargs.get("pixel_values", None) is not None:
self.batch["pixel_values"] = kwargs.get("pixel_values", None)
if kwargs.get("image_grid_thw", None) is not None:
self.batch["image_grid_thw"] = kwargs.get("image_grid_thw", None)

self._yielded = False

@@ -99,8 +101,9 @@ def vlm_forward_step(data_iterator, model, **kwargs) -> torch.Tensor:
"attention_mask": batch.get("attention_mask", None),
}

# Add vision inputs if present
if "pixel_values" in batch:
if "images" in batch:
forward_args["images"] = batch["images"]
elif "pixel_values" in batch:
forward_args["pixel_values"] = batch["pixel_values"]
if "image_grid_thw" in batch:
forward_args["image_grid_thw"] = batch["image_grid_thw"]
@@ -128,7 +131,7 @@ def load_image(image_path: str) -> Image.Image:
return Image.open(image_path)


def process_image_inputs(processor, image_path: Optional[str], prompt: str):
def process_image_inputs(processor, image_path: Optional[str], prompt: str, system_prompt: Optional[str] = None):
"""Process image inputs for vision-language model.

Args:
@@ -140,16 +143,27 @@ def process_image_inputs(processor, image_path: Optional[str], prompt: str):
Tuple of (input_ids, pixel_values, image_grid_thw, num_patches)
"""
if image_path:
if "," in image_path:
image_paths = image_path.split(",")
content = []
for i, path in enumerate(image_paths):
content.append({"type": "text", "text": f"{'\n' if i > 0 else ''}Image-{i+1}: "})
content.append({"type": "image", "image": path})
content.append({"type": "text", "text": '\n' + prompt})
else:
content = [
{"type": "image", "image": image_path},
{"type": "text", "text": prompt},
]
# Create messages with image and text
messages = [
{
"role": "user",
"content": [
{"type": "image", "image": image_path},
{"type": "text", "text": prompt},
],
"content": content,
}
]
if system_prompt:
messages.insert(0, {"role": "system", "content": system_prompt})

# Process vision info
image_inputs, video_inputs = process_vision_info(messages)
@@ -162,15 +176,73 @@ def process_image_inputs(processor, image_path: Optional[str], prompt: str):
text=[text],
images=image_inputs,
videos=video_inputs,
padding=True,
padding=processor.tokenizer.pad_token is not None,
return_tensors="pt",
)
return inputs.input_ids, inputs.pixel_values, getattr(inputs, "image_grid_thw", None), messages
try:
image_grid_thw = inputs.image_grid_thw
except AttributeError:
image_grid_thw = None
return inputs.input_ids, inputs.pixel_values, image_grid_thw, inputs.num_patches
else:
# Text-only processing
inputs = processor(text=[prompt], return_tensors="pt")
return inputs.input_ids, None, None, None

def process_video_inputs(processor, video_path: Optional[str], prompt: str, system_prompt: Optional[str] = None):
"""Process video inputs for vision-language model.
"""
from megatron.bridge.models.nemotron_vl.nemotron_vl_utils import maybe_path_or_url_to_data_urls, pil_image_from_base64

video_fps = -1
video_nframe = 10
video_nframe_max = -1

# Get frames and metadata
image_urls, metadata = maybe_path_or_url_to_data_urls(
video_path,
fps=max(0, int(video_fps)),
nframe=max(0, int(video_nframe)),
nframe_max=int(video_nframe_max),
)
frames = [pil_image_from_base64(image_url) for image_url in image_urls]

print(f"Video Metadata: {metadata}")

messages = [
{
"role": "user",
"content": [
{
"type": "video",
"video": f"file://{video_path}",
},
{
"type": "text",
"text": "\n" + prompt,
},
],
}
]
if system_prompt:
messages.insert(0, {"role": "system", "content": system_prompt})
prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

# Process with FPS metadata
if metadata:
inputs = processor(
text=[prompt],
videos=frames,
videos_kwargs={'video_metadata': metadata},
return_tensors="pt",
)
else:
inputs = processor(
text=[prompt],
videos=frames,
return_tensors="pt",
)
return inputs.input_ids, inputs.pixel_values_videos, None, inputs.num_patches

def main(args) -> None:
"""Main function for vision-language generation from HuggingFace VL models.
@@ -196,7 +268,7 @@ def main(args) -> None:

# We still need HF config for tokenizer, but we'll load the model from Megatron checkpoint
# Create bridge from HF config only (no weights)
bridge = AutoBridge.from_hf_pretrained(args.hf_model_path)
bridge = AutoBridge.from_hf_pretrained(args.hf_model_path, trust_remote_code=True)

# Initialize model parallel before loading
model_provider = bridge.to_megatron_provider(load_weights=False)
@@ -223,7 +295,7 @@ def main(args) -> None:
else:
# Load from HuggingFace and convert to Megatron
print_rank_0(f"Loading HuggingFace model from: {args.hf_model_path}")
bridge = AutoBridge.from_hf_pretrained(args.hf_model_path)
bridge = AutoBridge.from_hf_pretrained(args.hf_model_path, trust_remote_code=True)
model_provider = bridge.to_megatron_provider(load_weights=True)
model_provider.tensor_model_parallel_size = tp
model_provider.pipeline_model_parallel_size = pp
@@ -240,19 +312,33 @@ def main(args) -> None:
# Initialize tokenizer and processor
tokenizer = AutoTokenizer.from_pretrained(args.hf_model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(args.hf_model_path, trust_remote_code=True)
img_start_token_id = tokenizer.convert_tokens_to_ids("<img>")
img_end_token_id = tokenizer.convert_tokens_to_ids("</img>")

if tokenizer.pad_token is None:
tokenizer.pad_token = tokenizer.eos_token

# Process inputs (text and image if provided)
prompt = args.prompt
input_ids, pixel_values, image_grid_thw, messages = process_image_inputs(processor, args.image_path, prompt)
if args.video_path:
input_ids, pixel_values, image_grid_thw, num_patches = process_video_inputs(processor, args.video_path, args.prompt, args.system_prompt)
else:
input_ids, pixel_values, image_grid_thw, num_patches = process_image_inputs(processor, args.image_path, args.prompt, args.system_prompt)

images = None
if args.use_llava_model:
images = pixel_values.bfloat16()
input_ids = adjust_image_tokens(input_ids, num_patches, img_start_token_id,
img_end_token_id)
if args.video_path:
video_token_id = tokenizer.convert_tokens_to_ids("<video>")
image_token_id = tokenizer.convert_tokens_to_ids("<image>")
input_ids = torch.where(input_ids == video_token_id, image_token_id, input_ids)
pixel_values = None

# Move to GPU
input_ids = input_ids.cuda()
if pixel_values is not None:
pixel_values = pixel_values.cuda()
if image_grid_thw is not None:
image_grid_thw = image_grid_thw.cuda()
if images is not None:
images = images.cuda()

position_ids = (
torch.arange(input_ids.size(1), dtype=torch.long, device=input_ids.device).unsqueeze(0).expand_as(input_ids)
@@ -271,7 +357,14 @@ def main(args) -> None:
# Keep passing vision inputs for all steps to ensure image features are available
# The Megatron VL model only processes vision features when pixel_values is not None,
# so we need to provide them throughout the generation process
iterator = SingleBatchIterator(input_ids, position_ids, attention_mask, pixel_values, image_grid_thw)
iterator = SingleBatchIterator(
input_ids,
position_ids,
attention_mask,
pixel_values=pixel_values,
image_grid_thw=image_grid_thw,
images=images,
)

output = fwd_bwd_function(
forward_step_func=vlm_forward_step,
Expand All @@ -285,6 +378,9 @@ def main(args) -> None:
)
if isinstance(output, list) and len(output) > 0:
output = output[0]
if isinstance(output, tuple):
# for LlavaModel
output = output[0]

if parallel_state.is_pipeline_last_stage():
world_size = parallel_state.get_tensor_model_parallel_world_size()
@@ -328,7 +424,7 @@ def main(args) -> None:
print_rank_0("======== GENERATED TEXT OUTPUT ========")
if args.image_path:
print_rank_0(f"Image: {args.image_path}")
print_rank_0(f"Prompt: {prompt}")
print_rank_0(f"Prompt: {args.prompt}")
print_rank_0(f"Generated: {generated_text}")
print_rank_0("=======================================")

@@ -347,6 +443,12 @@
default="Describe this image.",
help="Input prompt for vision-language generation.",
)
parser.add_argument(
"--system_prompt",
type=str,
default=None,
help="System prompt for vision-language generation.",
)
parser.add_argument(
"--max_new_tokens",
type=int,
@@ -362,10 +464,25 @@
"--image_path",
type=str,
default=None,
help="Path or URL to the image for vision-language generation (optional).",
help="Path or URL to the image for vision-language generation (optional). Multiple images paths can be separated"
"with commas.",
)
parser.add_argument(
"--video_path",
type=str,
default=None,
help="Path or URL to the video for vision-language generation (optional).",
)
parser.add_argument(
"--use_llava_model",
action="store_true",
default=False,
help="Specify whether model uses Megatron vision model (i.e. LLaVAModel)",
)
args = parser.parse_args()

if args.use_llava_model:
args.system_prompt = "/no_think" # Nemotron Nano V2 VL model requires this system prompt
main(args)

if torch.distributed.is_initialized():
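Usage note: the snippet below is a minimal sketch of how the reworked SingleBatchIterator routes the new LLaVA-style input; when an `images` tensor is supplied it takes precedence and `pixel_values` is not added to the batch. The tensors and the import path are illustrative assumptions; in the script itself the inputs come from the HF processor.

# Minimal sketch, assuming SingleBatchIterator from the script above is importable
# (e.g. with the examples/conversion directory on PYTHONPATH).
import torch

from hf_to_megatron_generate_vlm import SingleBatchIterator  # assumed import path

input_ids = torch.randint(0, 1000, (1, 16))
position_ids = torch.arange(16).unsqueeze(0).expand_as(input_ids)
images = torch.zeros(2048, 3, 14, 14, dtype=torch.bfloat16)  # dummy patch tensor

iterator = SingleBatchIterator(input_ids, position_ids, attention_mask=None, images=images)
batch = next(iter(iterator))
# "images" was provided, so the LLaVA branch is taken and "pixel_values" is omitted.
assert "images" in batch and "pixel_values" not in batch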