Skip to content

Commit

Permalink
feat: add initial qwen2.5-vl model and test
Browse files Browse the repository at this point in the history
  • Loading branch information
drbh committed Jan 30, 2025
1 parent 065aabb commit 1adfee4
Show file tree
Hide file tree
Showing 8 changed files with 857 additions and 3 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
{
"choices": [
{
"finish_reason": "stop",
"index": 0,
"logprobs": null,
"message": {
"content": "The image depicts a person in a space-themed environment, possibly on another planet given the red sand and harsh landscape in the background. The individual is wearing a detailed, high-tech spacesuit with various gadgets and gadgets. The space suit features a large red button in the center of the chest, and the individual is in a crouched, ready stance, as if in a dramatic or adventurous pose. The background showcases an expansive high gorge or canyon, with walls of red and orange hues, with potential light sources in the distance that create a dramatic and intense atmosphere.",
"name": null,
"role": "assistant",
"tool_calls": null
},
"usage": null
}
],
"created": 1738257213,
"id": "",
"model": "Qwen/Qwen2.5-VL-3B-Instruct",
"object": "chat.completion",
"system_fingerprint": "3.0.2-dev0-native",
"usage": {
"completion_tokens": 114,
"prompt_tokens": 1363,
"total_tokens": 1477
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
{
"choices": [
{
"delta": {
"content": "",
"role": "assistant",
"tool_calls": null
},
"finish_reason": "stop",
"index": 0,
"logprobs": null
}
],
"created": 1738257318,
"id": "",
"model": "Qwen/Qwen2.5-VL-3B-Instruct",
"object": "chat.completion.chunk",
"system_fingerprint": "3.0.2-dev0-native",
"usage": null
}
78 changes: 78 additions & 0 deletions integration-tests/models/test_flash_qwen2_5_vl.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
import pytest


@pytest.fixture(scope="module")
def flash_qwen2_5_vl_handle(launcher):
    # Launch a text-generation-inference server for Qwen2.5-VL once per test
    # module; the context manager tears the server down after the last test.
    with launcher("Qwen/Qwen2.5-VL-3B-Instruct") as handle:
        yield handle


@pytest.fixture(scope="module")
async def flash_qwen2_5(flash_qwen2_5_vl_handle):
    # Block until the launched server reports healthy (up to 300s), then hand
    # the tests its async chat client.
    await flash_qwen2_5_vl_handle.health(300)
    return flash_qwen2_5_vl_handle.client


@pytest.mark.private
async def test_flash_qwen2_5_vl_simple(flash_qwen2_5, response_snapshot):
    # Ask the model to describe a remote image and pin both the exact
    # generated text and the full response object against the snapshot.
    image_chunk = {
        "type": "image_url",
        "image_url": {
            "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png"
        },
    }
    text_chunk = {"type": "text", "text": "Describe the image"}

    response = await flash_qwen2_5.chat(
        seed=1337,
        messages=[{"role": "user", "content": [image_chunk, text_chunk]}],
    )

    expected = "The image depicts a person in a space-themed environment, possibly on another planet given the red sand and harsh landscape in the background. The individual is wearing a detailed, high-tech spacesuit with various gadgets and gadgets. The space suit features a large red button in the center of the chest, and the individual is in a crouched, ready stance, as if in a dramatic or adventurous pose. The background showcases an expansive high gorge or canyon, with walls of red and orange hues, with potential light sources in the distance that create a dramatic and intense atmosphere."
    assert response.choices[0].message.content == expected
    assert response == response_snapshot


@pytest.mark.private
async def test_flash_qwen2_5_vl_simple_streaming(flash_qwen2_5, response_snapshot):
    # Stream the same request, reassemble the delta chunks, and check the
    # concatenated text, the chunk count, and the final chunk snapshot.
    image_chunk = {
        "type": "image_url",
        "image_url": {
            "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png"
        },
    }
    text_chunk = {"type": "text", "text": "Describe the image"}

    stream = await flash_qwen2_5.chat(
        seed=1337,
        messages=[{"role": "user", "content": [image_chunk, text_chunk]}],
        stream=True,
    )

    pieces = []
    last_response = None
    # `last_response` ends up bound to the final chunk emitted by the stream.
    async for last_response in stream:
        pieces.append(last_response.choices[0].delta.content)

    generated = "".join(pieces)
    expected = "The image depicts a person in a space-themed environment, possibly on another planet given the red sand and harsh landscape in the background. The individual is wearing a detailed, high-tech spacesuit with various gadgets and gadgets. The space suit features a large red button in the center of the chest, and the individual is in a crouched, ready stance, as if in a dramatic or adventurous pose. The background showcases an expansive high gorge or canyon, with walls of red and orange hues, with potential light sources in the distance that create a dramatic and intense atmosphere."
    assert generated == expected
    assert len(pieces) == 114
    assert last_response == response_snapshot
33 changes: 33 additions & 0 deletions router/src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -184,10 +184,43 @@ impl Qwen2Vl {
}
}

/// Vision-tower section of a Qwen2.5-VL `config.json`, deserialized from the
/// model repository. Field names mirror the upstream Hugging Face
/// configuration (hence `in_chans` rather than `in_channels`).
#[derive(Clone, Debug, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub struct Qwen2_5VlVisionConfig {
    // Presumably the number of transformer blocks in the vision tower —
    // per the upstream config; not consumed by the router code visible here.
    pub(crate) depth: usize,
    pub(crate) hidden_act: String,
    pub(crate) hidden_size: usize,
    pub(crate) intermediate_size: usize,
    pub(crate) num_heads: usize,
    // Input image channels (likely 3 for RGB — per upstream config naming).
    pub(crate) in_chans: usize,
    pub(crate) out_hidden_size: usize,
    // Side length in pixels of one square image patch. The only field the
    // visible router code reads (see `Qwen2_5Vl::get_number_of_features`).
    pub(crate) patch_size: usize,
    pub(crate) spatial_merge_size: usize,
    pub(crate) spatial_patch_size: usize,
    // NOTE(review): `window_size` / `fullatt_block_indexes` presumably
    // configure windowed vs. full attention per block, following the
    // upstream Qwen2.5-VL implementation — not used by the router here.
    pub(crate) window_size: usize,
    pub(crate) fullatt_block_indexes: Vec<usize>,
    pub(crate) tokens_per_second: usize,
    pub(crate) temporal_patch_size: usize,
}

/// Top-level Qwen2.5-VL model config as seen by the router; only the vision
/// section is retained, which is what the router needs to size the image
/// placeholder-token sequence.
#[derive(Clone, Debug, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub struct Qwen2_5Vl {
    pub(crate) vision_config: Qwen2_5VlVisionConfig,
}

impl Qwen2_5Vl {
    /// Number of image placeholder tokens a `height` x `width` (pixels)
    /// image expands to: one token per `patch_size` x `patch_size` patch,
    /// using integer (floor) division.
    ///
    /// NOTE(review): this appears to mirror `Qwen2Vl::get_number_of_features`
    /// and does not account for `spatial_merge_size`; confirm against the
    /// Python image processor if prompt token counts ever mismatch.
    pub fn get_number_of_features(&self, height: usize, width: usize) -> usize {
        let patch_area = self.vision_config.patch_size * self.vision_config.patch_size;
        let num_pixels = height * width;
        num_pixels / patch_area
    }
}

#[derive(Clone, Debug, Serialize, Deserialize)]
#[serde(tag = "model_type")]
#[serde(rename_all = "snake_case")]
pub enum Config {
Qwen2_5Vl(Qwen2_5Vl),
Qwen2Vl(Qwen2Vl),
LlavaNext(LlavaNext),
ClipVisionModel(ClipVisionModel),
Expand Down
6 changes: 5 additions & 1 deletion router/src/validation.rs
Original file line number Diff line number Diff line change
Expand Up @@ -684,6 +684,10 @@ fn image_tokens(
"<|vision_start|>{:?}<|vision_end|>",
"<|image_pad|>".repeat(config.get_number_of_features(height, width))
),
Qwen2_5Vl(config) => format!(
"<|vision_start|>{:?}<|vision_end|>",
"<|image_pad|>".repeat(config.get_number_of_features(height, width))
),
_ => unimplemented!("Images tokens are not supported for this model configuration"),
}
}
Expand Down Expand Up @@ -712,7 +716,7 @@ fn prepare_input<T: TokenizerTrait>(
let (tokenizer_query, input_chunks) = match config {
Some(
config @ (Idefics | Mllama | Idefics2(_) | Idefics3(_) | Paligemma(_) | LlavaNext(_)
| Qwen2Vl(_)),
| Qwen2Vl(_) | Qwen2_5Vl(_)),
) => {
let mut input_chunks = Vec::new();
let mut tokenizer_query = String::with_capacity(inputs.len());
Expand Down
21 changes: 21 additions & 0 deletions server/text_generation_server/models/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,9 @@
from text_generation_server.models.custom_modeling.qwen2_vl import (
Qwen2VLForConditionalGeneration,
)
from text_generation_server.models.custom_modeling.qwen2_5_vl import (
Qwen2_5VLForConditionalGeneration,
)
from text_generation_server.layers.attention import SUPPORTS_WINDOWING
except ImportError as e:
log_master(logger.warning, f"Could not import Flash Attention enabled models: {e}")
Expand Down Expand Up @@ -317,6 +320,11 @@ class ModelType(enum.Enum):
"name": "Qwen 2 VL",
"url": "https://huggingface.co/collections/Qwen/qwen2-vl-66cee7455501d7126940800d",
}
QWEN2_5_VL = {
"type": "qwen2_5_vl",
"name": "Qwen 2.5 VL",
"url": "https://huggingface.co/collections/Qwen/qwen25-66e81a666513e518adb90d9e",
}
OPT = {
"type": "opt",
"name": "Opt",
Expand Down Expand Up @@ -1367,6 +1375,19 @@ def get_model(
trust_remote_code=trust_remote_code,
lora_adapter_ids=lora_adapter_ids,
)
if model_type == QWEN2_5_VL:
return VlmCausalLM(
model_id=model_id,
model_class=Qwen2_5VLForConditionalGeneration,
revision=revision,
quantize=quantize,
speculator=speculator,
dtype=dtype,
default_dtype=torch.bfloat16,
kv_cache_dtype=kv_cache_dtype,
trust_remote_code=trust_remote_code,
lora_adapter_ids=lora_adapter_ids,
)
if model_type == MLLAMA:
if FLASH_ATTENTION:
return MllamaCausalLM(
Expand Down
Loading

0 comments on commit 1adfee4

Please sign in to comment.