19 | 19 |     AutoModelForImageTextToText,
20 | 20 |     AutoProcessor,
21 | 21 |     AutoTokenizer,
| 22 | +    GenerationConfig,
22 | 23 |     TextStreamer,
23 | 24 | )
24 | 25 |

27 | 28 | from QEfficient.utils._utils import create_json, get_num_layers_vlm
28 | 29 | from QEfficient.utils.constants import QnnConstants
29 | 30 | from QEfficient.utils.device_utils import get_available_device_id
30 | | -from QEfficient.utils.run_utils import ApiRunnerInternVL, ApiRunnerVlm
| 31 | +from QEfficient.utils.run_utils import ApiRunnerInternVL, ApiRunnerMolmo, ApiRunnerVlm
31 | 32 | from QEfficient.utils.test_utils import InternProcessor
32 | 33 |
33 | 34 | NEW_GENERATION_TOKENS = 10
146 | 147 |     # ), # commented because QNN Converter is not supported for this model yet.
147 | 148 | ]
148 | 149 |
| 150 | +molmo_model_config = [
| 151 | +    (
| 152 | +        "allenai/Molmo-7B-D-0924",
| 153 | +        True,
| 154 | +        1,
| 155 | +        128,
| 156 | +        4096,
| 157 | +        "https://picsum.photos/id/237/536/354",
| 158 | +        "Can you describe the image in detail.",
| 159 | +        2,
| 160 | +    ),
| 161 | +]
| 162 | +
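The fields of the new molmo_model_config entry are positional; they follow the same order as the parametrize string used by the new test further down ("model_name, kv_offload, batch_size, prompt_len, ctx_len, img_url, query, n_layer"). A minimal sketch of that mapping, with the names taken from the parametrize signature rather than from the PR itself:

```python
# Sketch only: unpack the single Molmo entry using the parametrize field order.
(
    model_name,  # "allenai/Molmo-7B-D-0924"
    kv_offload,  # True
    batch_size,  # 1
    prompt_len,  # 128
    ctx_len,     # 4096
    img_url,     # "https://picsum.photos/id/237/536/354"
    query,       # "Can you describe the image in detail."
    n_layer,     # 2 (the test truncates the model to this many hidden layers)
) = molmo_model_config[0]
```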
149 | 163 |
150 | 164 | def load_image_text_to_text_model(model_config):
151 | 165 |     model_path = hf_download(
@@ -185,6 +199,8 @@ def set_num_layers(config, n_layer=1):
185 | 199 |     elif hasattr(config, "llm_config"):
186 | 200 |         config.llm_config.num_hidden_layers = n_layer
187 | 201 |         config.vision_config.num_hidden_layers = n_layer
| 202 | +    else:
| 203 | +        config.num_hidden_layers = n_layer
188 | 204 |     return config
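The new else branch presumably exists because Molmo's remote-code config exposes num_hidden_layers at the top level rather than under a nested text_config or llm_config; any flat config without those attributes now falls through to it. A minimal sketch of that fallback, reusing the set_num_layers helper from the diff above and a generic PretrainedConfig as a stand-in for such a flat config:

```python
from transformers import PretrainedConfig

# Stand-in flat config: no text_config/llm_config attribute, only a top-level layer count.
flat_cfg = PretrainedConfig(num_hidden_layers=32)

flat_cfg = set_num_layers(flat_cfg, n_layer=1)  # falls through to the new else branch
assert flat_cfg.num_hidden_layers == 1
```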
189 | 205 |
190 | 206 |

@@ -276,6 +292,77 @@ def check_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100(
276 | 292 |     return
277 | 293 |
278 | 294 |
| 295 | +def check_molmo_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100(
| 296 | +    model_name: str,
| 297 | +    img_url: str,
| 298 | +    query: str,
| 299 | +    prompt_len: int,
| 300 | +    ctx_len: int,
| 301 | +    max_gen_len: int = 20,
| 302 | +    batch_size: int = 1,
| 303 | +    n_layer: int = 1,
| 304 | +    kv_offload: bool = False,
| 305 | +    num_devices: int = 1,
| 306 | +    enable_qnn: Optional[bool] = False,
| 307 | +    qnn_config: Optional[str] = None,
| 308 | +):
| 309 | +    model_config = {"model_name": model_name}
| 310 | +
| 311 | +    config = AutoConfig.from_pretrained(model_config["model_name"], trust_remote_code=True)
| 312 | +    config._attn_implementation = "eager"
| 313 | +    config = set_num_layers(config, n_layer=n_layer)
| 314 | +    model_hf, _ = load_image_text_to_text_model(config)
| 315 | +    n_layer = (n_layer, n_layer)
| 316 | +
| 317 | +    processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True, padding=True)
| 318 | +    img = requests.get(img_url, stream=True)
| 319 | +    image = Image.open(BytesIO(img.content)).convert("RGB")
| 320 | +    image = image.resize((536, 354))
| 321 | +
| 322 | +    api_runner = ApiRunnerMolmo(
| 323 | +        batch_size,
| 324 | +        processor,
| 325 | +        config,
| 326 | +        image,
| 327 | +        query,
| 328 | +        prompt_len,
| 329 | +        ctx_len,
| 330 | +        max_gen_len,
| 331 | +        n_layer,
| 332 | +    )
| 333 | +
| 334 | +    inputs = processor.process(images=[image], text=query)
| 335 | +    inputs = {k: v.unsqueeze(0) for k, v in inputs.items()}
| 336 | +
| 337 | +    generation_config = GenerationConfig(max_new_tokens=NEW_GENERATION_TOKENS, stop_strings="<|endoftext|>")
| 338 | +    pytorch_hf_tokens = api_runner.run_vlm_hf_model_on_pytorch(model_hf, inputs, generation_config)
| 339 | +
| 340 | +    batch_size, prompt_len = inputs["input_ids"].shape
| 341 | +    inputs["attention_mask"] = torch.ones((inputs["input_ids"].shape), dtype=torch.int64)
| 342 | +    valid = inputs["image_input_idx"] > 0
| 343 | +    valid = valid.reshape(1, -1)
| 344 | +    inputs["valid_idx"] = torch.nonzero(valid)[:, 1].unsqueeze(0)
| 345 | +    inputs["pixel_values"] = inputs.pop("images")
| 346 | +
| 347 | +    qeff_model = QEFFAutoModelForCausalLM.from_pretrained(
| 348 | +        model_config["model_name"],
| 349 | +        kv_offload=kv_offload,
| 350 | +        config=config,
| 351 | +    )
| 352 | +
| 353 | +    streamer = TextStreamer(processor.tokenizer)
| 354 | +    qeff_model.export()
| 355 | +
| 356 | +    if not get_available_device_id():
| 357 | +        pytest.skip("No available devices to run model on Cloud AI 100")
| 358 | +    qeff_model.compile(num_devices=num_devices, prefill_seq_len=prompt_len, ctx_len=ctx_len, mxfp6=False)
| 359 | +    print("QPC Outputs (QAIC):")
| 360 | +    output = qeff_model.generate(inputs=inputs, generation_len=NEW_GENERATION_TOKENS, streamer=streamer)
| 361 | +    qpc_tokens = output.generated_ids[:, :-1]
| 362 | +    assert (pytorch_hf_tokens == qpc_tokens).all(), "Tokens don't match for pytorch HF output and QPC output"
| 363 | +    return
| 364 | +
| 365 | +
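The input massaging above is the non-obvious part of the new helper: Molmo's processor returns image_input_idx, which (as I read it) gives, for each image patch, the position in input_ids where its embedding is scattered, with negative entries marking unused slots, and valid_idx collects the flattened positions of the usable patches. A toy illustration of the same masking and indexing with made-up numbers:

```python
import torch

# Made-up (1, crops, patches_per_crop) tensor standing in for the processor's image_input_idx.
image_input_idx = torch.tensor([[[-1, 3, 4], [5, 6, -1]]])

valid = image_input_idx > 0            # mask out negative (unused) patch slots
valid = valid.reshape(1, -1)           # flatten to (batch, crops * patches_per_crop)
valid_idx = torch.nonzero(valid)[:, 1].unsqueeze(0)
print(valid_idx)                       # tensor([[1, 2, 3, 4]])
```

The `[:, :-1]` slice on `generated_ids` at the end of the helper presumably drops a trailing end-of-text id so the QPC tokens line up with the HF reference before the equality assert.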
279 | 366 | def check_intern_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100(
280 | 367 |     model_name: str,
281 | 368 |     img_url: str,

@@ -427,6 +514,27 @@ def test_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100_qnn(
427 | 514 |     )
428 | 515 |
429 | 516 |
| 517 | +@pytest.mark.on_qaic
| 518 | +@pytest.mark.multimodal
| 519 | +@pytest.mark.parametrize(
| 520 | +    "model_name, kv_offload, batch_size, prompt_len, ctx_len, img_url, query, n_layer", molmo_model_config
| 521 | +)
| 522 | +def test_image_text_to_text_molmo_pytorch_vs_kv_vs_ort_vs_ai100(
| 523 | +    model_name, kv_offload, batch_size, prompt_len, ctx_len, img_url, query, n_layer
| 524 | +):
| 525 | +    check_molmo_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100(
| 526 | +        model_name=model_name,
| 527 | +        prompt_len=prompt_len,
| 528 | +        ctx_len=ctx_len,
| 529 | +        max_gen_len=NEW_GENERATION_TOKENS,
| 530 | +        img_url=img_url,
| 531 | +        query=query,
| 532 | +        n_layer=n_layer,
| 533 | +        batch_size=batch_size,
| 534 | +        kv_offload=kv_offload,
| 535 | +    )
| 536 | +
| 537 | +
430 | 538 | @pytest.mark.on_qaic
431 | 539 | @pytest.mark.multimodal
432 | 540 | @pytest.mark.parametrize(