Commit 2c9e1b4

Author: Mohit Soni (committed)
Testing Changes
Signed-off-by: Mohit Soni <[email protected]>
1 parent 849aee7 commit 2c9e1b4

4 files changed · +160 −48 lines changed


QEfficient/transformers/models/modeling_auto.py

Lines changed: 2 additions & 5 deletions
@@ -818,12 +818,9 @@ def kv_offload_generate(
             in {"pixel_values", "image_masks", "image_input_idx", "valid_idx", "aspect_ratio_ids", "aspect_ratio_mask"}
         }

-        molmo = hasattr(self.model.config, "model_type") and self.model.config.model_type == "molmo"
+        vision_inputs_fp16 = {"pixel_values", "image_masks"}
+        vision_inputs.update({k: vision_inputs[k].astype("float16") for k in vision_inputs_fp16 if k in vision_inputs})

-        if vision_inputs:
-            vision_inputs["pixel_values"] = vision_inputs["pixel_values"].astype("float16")
-            if molmo:
-                vision_inputs["image_masks"] = vision_inputs["image_masks"].astype("float16")
         vision_start = perf_counter()

         vision_outputs = {}
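For context, the replacement collapses the Molmo-specific branching into a single allow-list-driven cast: any vision input named in vision_inputs_fp16 is converted to float16, and keys the processor did not produce (e.g. image_masks for non-Molmo models) are simply skipped. A minimal standalone sketch of the same keyed-cast pattern, using NumPy arrays with made-up shapes rather than the real vision_inputs dict:

import numpy as np

# Hypothetical stand-in for the vision inputs gathered earlier in kv_offload_generate.
vision_inputs = {
    "pixel_values": np.zeros((1, 5, 588, 1024), dtype=np.float32),
    "image_input_idx": np.zeros((1, 5, 144), dtype=np.int64),  # integer inputs are left untouched
}

# Only the keys listed here are cast to fp16, and only if they are present.
vision_inputs_fp16 = {"pixel_values", "image_masks"}
vision_inputs.update({k: vision_inputs[k].astype("float16") for k in vision_inputs_fp16 if k in vision_inputs})

print({k: v.dtype for k, v in vision_inputs.items()})
# {'pixel_values': dtype('float16'), 'image_input_idx': dtype('int64')}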

QEfficient/transformers/models/molmo/modeling_molmo.py

Lines changed: 16 additions & 42 deletions
@@ -235,10 +235,7 @@ def forward(
         **kwargs,
     ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, torch.Tensor]]]:
         if not self.config.norm_after:
-            if self._activation_checkpoint_fn is not None:
-                atten_in = self._activation_checkpoint_fn(self.attn_norm, x)
-            else:
-                atten_in = self.attn_norm(x)
+            atten_in = self.attn_norm(x)
         else:
             atten_in = x
         qkv = self.att_proj(atten_in)
@@ -249,34 +246,19 @@ def forward(
         q, k, v = qkv.split(self.fused_dims, dim=-1)

         # Get attention scores.
-        if self._activation_checkpoint_fn is not None:
-            att, cache = self._activation_checkpoint_fn( # type: ignore
-                self.attention,
-                q,
-                k,
-                v,
-                attention_bias,
-                position_ids=position_ids,
-                layer_past=layer_past,
-                use_cache=use_cache,
-            )
-        else:
-            att, cache = self.attention(
-                q,
-                k,
-                v,
-                attention_bias,
-                position_ids=position_ids,
-                layer_past=layer_past,
-                batch_index=batch_index,
-                use_cache=use_cache,
-            )
+        att, cache = self.attention(
+            q,
+            k,
+            v,
+            attention_bias,
+            position_ids=position_ids,
+            layer_past=layer_past,
+            batch_index=batch_index,
+            use_cache=use_cache,
+        )

         if self.config.norm_after:
-            if self._activation_checkpoint_fn is not None:
-                att = self._activation_checkpoint_fn(self.attn_norm, att)
-            else:
-                att = self.attn_norm(att)
+            att = self.attn_norm(att)

         # Add attention scores.
         # shape: (B, T, C)
@@ -287,23 +269,15 @@ def forward(
         og_x = x

         if not self.config.norm_after:
-            if self._activation_checkpoint_fn is not None:
-                x = self._activation_checkpoint_fn(self.ff_norm, x) # type: ignore
-            else:
-                x = self.ff_norm(x)
+            x = self.ff_norm(x)

         x = self.ff_proj(x)
-        if self._activation_checkpoint_fn is not None:
-            x = self._activation_checkpoint_fn(self.act, x) # type: ignore
-        else:
-            x = self.act(x)
+
+        x = self.act(x)
         x = self.ff_out(x)

         if self.config.norm_after:
-            if self._activation_checkpoint_fn is not None:
-                x = self._activation_checkpoint_fn(self.ff_norm, x) # type: ignore
-            else:
-                x = self.ff_norm(x)
+            x = self.ff_norm(x)

         x = self.dropout(x)
         x = og_x + x

QEfficient/utils/run_utils.py

Lines changed: 33 additions & 0 deletions
@@ -439,3 +439,36 @@ def run_vlm_hf_model_on_pytorch(self, model, inputs, generation_config):
         print("Original HF Model Outputs (Torch CPU):")
         print("Completion:", repr(py_output))
         return generated_ids
+
+
+class ApiRunnerMolmo(ApiRunnerVlm):
+    """
+    ApiRunner for Molmo models:
+    ---------
+
+    1. HuggingFace ``PyTorch`` model
+    2. Transformed KV Pytorch Model
+    3. ``ONNX`` model on ONNXRT
+    4. ``ONNX`` model on Cloud AI 100
+    """
+
+    def __init__(self, batch_size, processor, config, image, prompt, prompt_len, ctx_len, max_gen_len, n_layer):
+        self.processor = processor
+        self.ctx_len = ctx_len
+        self.prompt_len = prompt_len
+        self.batch_size = batch_size
+        self.config = config
+        self.gen_len = max_gen_len
+
+    @torch.no_grad()
+    def run_vlm_hf_model_on_pytorch(self, model, inputs, generation_config):
+        outputs = model.generate_from_batch(
+            inputs, generation_config, tokenizer=self.processor.tokenizer, do_sample=False
+        )
+
+        generated_ids = outputs[0, inputs["input_ids"].size(1) :]
+
+        py_output = self.processor.tokenizer.decode(generated_ids, skip_special_tokens=True).strip()
+        print("Original HF Model Outputs (Torch CPU):")
+        print("Completion:", repr(py_output))
+        return generated_ids
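For orientation, the sketch below shows how the new runner is driven. It mirrors the Molmo test added later in this commit and is not self-contained: processor, config, image, query, and the loaded HF model (model_hf) are assumed to have been prepared beforehand, exactly as in that test.

from transformers import GenerationConfig

from QEfficient.utils.run_utils import ApiRunnerMolmo

# Sketch only: processor, config, image, query, and model_hf are assumed to exist.
api_runner = ApiRunnerMolmo(
    batch_size=1,
    processor=processor,
    config=config,
    image=image,
    prompt=query,
    prompt_len=128,
    ctx_len=4096,
    max_gen_len=10,
    n_layer=(2, 2),
)

inputs = processor.process(images=[image], text=query)
inputs = {k: v.unsqueeze(0) for k, v in inputs.items()}  # add a batch dimension

generation_config = GenerationConfig(max_new_tokens=10, stop_strings="<|endoftext|>")
hf_tokens = api_runner.run_vlm_hf_model_on_pytorch(model_hf, inputs, generation_config)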

tests/transformers/models/test_image_text_to_text_models.py

Lines changed: 109 additions & 1 deletion
@@ -19,6 +19,7 @@
     AutoModelForImageTextToText,
     AutoProcessor,
     AutoTokenizer,
+    GenerationConfig,
     TextStreamer,
 )

@@ -27,7 +28,7 @@
 from QEfficient.utils._utils import create_json, get_num_layers_vlm
 from QEfficient.utils.constants import QnnConstants
 from QEfficient.utils.device_utils import get_available_device_id
-from QEfficient.utils.run_utils import ApiRunnerInternVL, ApiRunnerVlm
+from QEfficient.utils.run_utils import ApiRunnerInternVL, ApiRunnerMolmo, ApiRunnerVlm
 from QEfficient.utils.test_utils import InternProcessor

 NEW_GENERATION_TOKENS = 10
@@ -146,6 +147,19 @@
     # ), # commented becuase QNN Convertor is not supported for this model yet.
 ]

+molmo_model_config = [
+    (
+        "allenai/Molmo-7B-D-0924",
+        True,
+        1,
+        128,
+        4096,
+        "https://picsum.photos/id/237/536/354",
+        "Can you describe the image in detail.",
+        2,
+    ),
+]
+

 def load_image_text_to_text_model(model_config):
     model_path = hf_download(
@@ -185,6 +199,8 @@ def set_num_layers(config, n_layer=1):
     elif hasattr(config, "llm_config"):
         config.llm_config.num_hidden_layers = n_layer
         config.vision_config.num_hidden_layers = n_layer
+    else:
+        config.num_hidden_layers = n_layer
     return config


@@ -276,6 +292,77 @@ def check_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100(
     return


+def check_molmo_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100(
+    model_name: str,
+    img_url: str,
+    query: str,
+    prompt_len: int,
+    ctx_len: int,
+    max_gen_len: int = 20,
+    batch_size: int = 1,
+    n_layer: int = 1,
+    kv_offload: bool = False,
+    num_devices: int = 1,
+    enable_qnn: Optional[bool] = False,
+    qnn_config: Optional[str] = None,
+):
+    model_config = {"model_name": model_name}
+
+    config = AutoConfig.from_pretrained(model_config["model_name"], trust_remote_code=True)
+    config._attn_implementation = "eager"
+    config = set_num_layers(config, n_layer=n_layer)
+    model_hf, _ = load_image_text_to_text_model(config)
+    n_layer = (n_layer, n_layer)
+
+    processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True, padding=True)
+    img = requests.get(img_url, stream=True)
+    image = Image.open(BytesIO(img.content)).convert("RGB")
+    image = image.resize((536, 354))
+
+    api_runner = ApiRunnerMolmo(
+        batch_size,
+        processor,
+        config,
+        image,
+        query,
+        prompt_len,
+        ctx_len,
+        max_gen_len,
+        n_layer,
+    )
+
+    inputs = processor.process(images=[image], text=query)
+    inputs = {k: v.unsqueeze(0) for k, v in inputs.items()}
+
+    generation_config = GenerationConfig(max_new_tokens=NEW_GENERATION_TOKENS, stop_strings="<|endoftext|>")
+    pytorch_hf_tokens = api_runner.run_vlm_hf_model_on_pytorch(model_hf, inputs, generation_config)
+
+    batch_size, prompt_len = inputs["input_ids"].shape
+    inputs["attention_mask"] = torch.ones((inputs["input_ids"].shape), dtype=torch.int64)
+    valid = inputs["image_input_idx"] > 0
+    valid = valid.reshape(1, -1)
+    inputs["valid_idx"] = torch.nonzero(valid)[:, 1].unsqueeze(0)
+    inputs["pixel_values"] = inputs.pop("images")
+
+    qeff_model = QEFFAutoModelForCausalLM.from_pretrained(
+        model_config["model_name"],
+        kv_offload=kv_offload,
+        config=config,
+    )
+
+    streamer = TextStreamer(processor.tokenizer)
+    qeff_model.export()
+
+    if not get_available_device_id():
+        pytest.skip("No available devices to run model on Cloud AI 100")
+    qeff_model.compile(num_devices=num_devices, prefill_seq_len=prompt_len, ctx_len=ctx_len, mxfp6=False)
+    print("QPC Outputs (QAIC):")
+    output = qeff_model.generate(inputs=inputs, generation_len=NEW_GENERATION_TOKENS, streamer=streamer)
+    qpc_tokens = output.generated_ids[:, :-1]
+    assert (pytorch_hf_tokens == qpc_tokens).all(), "Tokens don't match for pytorch HF output and QPC output"
+    return
+
+
 def check_intern_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100(
     model_name: str,
     img_url: str,
@@ -427,6 +514,27 @@ def test_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100_qnn(
     )


+@pytest.mark.on_qaic
+@pytest.mark.multimodal
+@pytest.mark.parametrize(
+    "model_name, kv_offload, batch_size, prompt_len, ctx_len, img_url, query, n_layer", molmo_model_config
+)
+def test_image_text_to_text_molmo_pytorch_vs_kv_vs_ort_vs_ai100(
+    model_name, kv_offload, batch_size, prompt_len, ctx_len, img_url, query, n_layer
+):
+    check_molmo_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100(
+        model_name=model_name,
+        prompt_len=prompt_len,
+        ctx_len=ctx_len,
+        max_gen_len=NEW_GENERATION_TOKENS,
+        img_url=img_url,
+        query=query,
+        n_layer=n_layer,
+        batch_size=batch_size,
+        kv_offload=kv_offload,
+    )
+
+
 @pytest.mark.on_qaic
 @pytest.mark.multimodal
 @pytest.mark.parametrize(
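One detail worth calling out from check_molmo_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100 is how valid_idx is derived: image_input_idx maps image patch embeddings to positions in the text sequence, and the test treats entries greater than zero as valid patch slots. A toy, self-contained illustration of that preprocessing step, using a made-up index tensor:

import torch

# Made-up image_input_idx: entries > 0 point at real patch positions, the rest are padding.
image_input_idx = torch.tensor([[3, 4, -100, 5, -100, 6]])

valid = image_input_idx > 0                           # boolean mask over patch slots
valid = valid.reshape(1, -1)                          # flatten to (1, num_slots)
valid_idx = torch.nonzero(valid)[:, 1].unsqueeze(0)   # flattened positions of the valid slots

print(valid_idx)  # tensor([[0, 1, 3, 5]])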
