
Commit 322b133

Add vision id for Qwen3-VL (#4183)

* add vision id
* add test case
* format
* fix typo
* add for tm, simplify qwen3 vl proc_msg
* tiny rename req_state

1 parent 51cbd2c · commit 322b133

25 files changed: +160 −55 lines

lmdeploy/messages.py

Lines changed: 1 addition & 1 deletion

@@ -586,7 +586,7 @@ class EngineOutput:
 
 
 @dataclass
 class VisionConfig:
-    """Vison model configs.
+    """Vision model configs.
 
     Args:
         max_batch_size (int): the max image size passed to the model, since

lmdeploy/model.py

Lines changed: 3 additions & 0 deletions

@@ -773,11 +773,13 @@ def messages2prompt(self, messages, sequence_start=True, **kwargs):
             kwargs.pop('enable_thinking')
         if 'reasoning_effort' in kwargs and kwargs.get('reasoning_effort', None) is None:
             kwargs.pop('reasoning_effort')
+        add_vision_id = kwargs.pop('add_vision_id', False)
         add_generation_prompt = messages[-1]['role'] != 'assistant'
         if sequence_start:
             prompt = self.tokenizer.apply_chat_template(messages,
                                                         tokenize=False,
                                                         add_generation_prompt=add_generation_prompt,
+                                                        add_vision_id=add_vision_id,
                                                         **kwargs)
         else:
             # Use a sentinel position to avoid the influence of default system role in the tokenizer's chat template

@@ -788,6 +790,7 @@ def messages2prompt(self, messages, sequence_start=True, **kwargs):
             prompt = self.tokenizer.apply_chat_template(sentinel_messages + messages,
                                                         tokenize=False,
                                                         add_generation_prompt=add_generation_prompt,
+                                                        add_vision_id=add_vision_id,
                                                         **kwargs)
             # remove the sentinel part
             prompt = prompt[len(sentinel_prompt):]
lmdeploy/serve/async_engine.py

Lines changed: 4 additions & 2 deletions

@@ -768,6 +768,7 @@ async def generate(
             rewind_stop_tokens: bool = False,
             input_ids: Optional[List] = None,
             enable_thinking: Optional[bool] = None,
+            add_vision_id: Optional[bool] = False,
             **kwargs):
         """Generate responses.
 

@@ -819,6 +820,7 @@ async def generate(
            tools=tools,
            reasoning_effort=reasoning_effort,
            enable_thinking=enable_thinking,
+           add_vision_id=add_vision_id,
            **kwargs)
        prompt = prompt_input['prompt']
        input_ids = prompt_input['input_ids']

@@ -889,12 +891,12 @@ def is_error(status):
                sequence_end=sequence_end,
                step=history_len) as gen:
            hit_stop_token = 0
-           req_state = RequestStats(prompt_tokens=input_len)  # per-request stats
+           req_stats = RequestStats(prompt_tokens=input_len)  # per-request stats
            async for outputs in gen:
                iteration_stats = IterationStats()  # per-iteration stats
                specdecode_stats = SpeculativeDecodingStats(
                    self.num_spec_token) if self.num_spec_token > 0 else None
-               metrics_processor.queue_update((outputs, req_state, iteration_stats, specdecode_stats))
+               metrics_processor.queue_update((outputs, req_stats, iteration_stats, specdecode_stats))
                # decode res
                if is_error(outputs.status):
                    break
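
The new keyword is accepted directly by the engine's generate coroutine, so callers that drive it themselves can pass the flag per request. A hedged sketch under stated assumptions: the model id is a placeholder, the message layout is illustrative, and streaming/session handling is simplified:

import asyncio

from lmdeploy import GenerationConfig, pipeline


async def main():
    # Placeholder model id; pipeline() returns the (VL)AsyncEngine whose generate() gained the flag.
    pipe = pipeline('Qwen/Qwen3-VL-30B-A3B-Instruct')
    messages = [{
        'role': 'user',
        'content': [
            {'type': 'image_url', 'image_url': {'url': 'https://example.com/a.jpg'}},
            {'type': 'image_url', 'image_url': {'url': 'https://example.com/b.jpg'}},
            {'type': 'text', 'text': 'Compare the two pictures.'},
        ],
    }]
    # add_vision_id is forwarded through _get_prompt_input down to the chat template.
    async for out in pipe.generate(messages,
                                   session_id=0,
                                   gen_config=GenerationConfig(max_new_tokens=256),
                                   add_vision_id=True):
        print(out.response, end='', flush=True)


asyncio.run(main())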

lmdeploy/serve/openai/api_server.py

Lines changed: 1 addition & 0 deletions

@@ -469,6 +469,7 @@ async def chat_completions_v1(request: ChatCompletionRequest, raw_request: Reque
        do_preprocess=do_preprocess,
        adapter_name=adapter_name,
        enable_thinking=request.enable_thinking,
+       add_vision_id=request.add_vision_id,
    )


def create_stream_response_json(index: int,

lmdeploy/serve/openai/protocol.py

Lines changed: 1 addition & 0 deletions

@@ -150,6 +150,7 @@ class ChatCompletionRequest(BaseModel):
    min_new_tokens: Optional[int] = Field(default=None, examples=[None])
    min_p: float = 0.0
    enable_thinking: Optional[bool] = None
+   add_vision_id: Optional[bool] = False
    return_token_ids: Optional[bool] = False
    include_stop_str_in_output: Optional[bool] = False
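
Since add_vision_id is now a field on ChatCompletionRequest and is forwarded by chat_completions_v1, an OpenAI-compatible client can opt in per request. A usage sketch; the endpoint, image URLs, and served model name below are placeholders:

from openai import OpenAI

client = OpenAI(base_url='http://0.0.0.0:23333/v1', api_key='none')  # placeholder endpoint
resp = client.chat.completions.create(
    model='Qwen3-VL',  # placeholder: whatever model name `lmdeploy serve api_server` exposes
    messages=[{
        'role': 'user',
        'content': [
            {'type': 'image_url', 'image_url': {'url': 'https://example.com/a.jpg'}},
            {'type': 'image_url', 'image_url': {'url': 'https://example.com/b.jpg'}},
            {'type': 'text', 'text': 'Describe the difference between the two pictures.'},
        ],
    }],
    extra_body={'add_vision_id': True},  # the field added to ChatCompletionRequest in this commit
)
print(resp.choices[0].message.content)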

lmdeploy/serve/vl_async_engine.py

Lines changed: 7 additions & 2 deletions

@@ -56,6 +56,7 @@ async def _get_prompt_input(self,
                                 adapter_name: str,
                                 tools: Optional[List[object]] = None,
                                 enable_thinking: Optional[bool] = None,
+                                add_vision_id: Optional[bool] = False,
                                 **kwargs):
        """Process messages and return the required data for the inference
        engines.

@@ -70,6 +71,7 @@ async def _get_prompt_input(self,
                adapter_name,
                tools=tools,
                enable_thinking=enable_thinking,
+               add_vision_id=add_vision_id,
                **kwargs)
        elif isinstance(messages, List):
            has_multimodal_input = any(

@@ -82,6 +84,7 @@ async def _get_prompt_input(self,
                adapter_name,
                tools,
                enable_thinking=enable_thinking,
+               add_vision_id=add_vision_id,
                **kwargs)
        else:
            raise RuntimeError(f'unsupported messages {messages}')

@@ -101,7 +104,8 @@ async def _get_prompt_input(self,
                self.tokenizer,
                sequence_start,
                tools=tools,
-               enable_thinking=enable_thinking)
+               enable_thinking=enable_thinking,
+               add_vision_id=add_vision_id)
        elif self.backend == 'pytorch':
            # for pt engine, this module only conduct the image preprocessing
            # It leaves the vision embedding to the pt engine

@@ -110,7 +114,8 @@ async def _get_prompt_input(self,
                self.tokenizer,
                sequence_start,
                tools=tools,
-               enable_thinking=enable_thinking)
+               enable_thinking=enable_thinking,
+               add_vision_id=add_vision_id)
        return results

    @classmethod

lmdeploy/vl/engine.py

Lines changed: 6 additions & 2 deletions

@@ -69,6 +69,7 @@ async def wrap_for_pytorch(
        sequence_start,
        tools: Optional[List[object]] = None,
        enable_thinking: Optional[bool] = None,
+       add_vision_id: Optional[bool] = False,
    ) -> List[Dict]:
        """
        Args:

@@ -93,7 +94,8 @@ async def wrap_for_pytorch(
                tokenizer,
                sequence_start,
                tools=tools,
-               enable_thinking=enable_thinking)
+               enable_thinking=enable_thinking,
+               add_vision_id=add_vision_id)
        else:
            result = self.model.to_pytorch_with_input_ids(messages)
        # clear data

@@ -110,6 +112,7 @@ async def wrap_for_turbomind(
        sequence_start,
        tools: Optional[List[object]] = None,
        enable_thinking: Optional[bool] = None,
+       add_vision_id: Optional[bool] = False,
    ) -> Dict:
        """
        Args:

@@ -130,7 +133,8 @@ async def wrap_for_turbomind(
                tokenizer,
                sequence_start,
                tools=tools,
-               enable_thinking=enable_thinking)
+               enable_thinking=enable_thinking,
+               add_vision_id=add_vision_id)
        # clear data
        for i, message in enumerate(messages):
            if isinstance(message['content'], List):

lmdeploy/vl/model/base.py

Lines changed: 1 addition & 1 deletion

@@ -12,7 +12,7 @@
 VISION_MODELS = Registry('vision_model')
 
 
-class VisonModel(ABC):
+class VisionModel(ABC):
     """Visual model which extract image feature."""
     _arch: Union[str, List[str]] = None
 
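
With the base class renamed from VisonModel to VisionModel, every subclass and import site is updated accordingly (see cogvlm.py and deepseek.py below). A skeleton of how a vision model registers against the corrected name; the class and _arch value here are hypothetical, and the abstract preprocessing/forward hooks real subclasses implement are omitted:

from lmdeploy.vl.model.base import VISION_MODELS, VisionModel


@VISION_MODELS.register_module()
class MyVisionModel(VisionModel):  # hypothetical example, not part of the commit
    """Example vision model registered under the corrected base-class name."""

    _arch = 'MyVLForCausalLM'  # made-up architecture name used to select this class for a checkpoint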

lmdeploy/vl/model/cogvlm.py

Lines changed: 2 additions & 2 deletions

@@ -2,13 +2,13 @@
 from typing import Dict, List
 
 from lmdeploy.utils import get_logger
-from lmdeploy.vl.model.base import VISION_MODELS, VisonModel
+from lmdeploy.vl.model.base import VISION_MODELS, VisionModel
 
 logger = get_logger('lmdeploy')
 
 
 @VISION_MODELS.register_module()
-class CogVLMVisionModel(VisonModel):
+class CogVLMVisionModel(VisionModel):
     """CogVLM vision model."""
 
     _arch = 'CogVLMForCausalLM'
lmdeploy/vl/model/deepseek.py

Lines changed: 2 additions & 2 deletions

@@ -6,7 +6,7 @@
 from transformers import AutoModelForCausalLM
 
 from lmdeploy.utils import get_logger
-from lmdeploy.vl.model.base import VISION_MODELS, VisonModel
+from lmdeploy.vl.model.base import VISION_MODELS, VisionModel
 from lmdeploy.vl.model.utils import disable_logging
 
 logger = get_logger('lmdeploy')

@@ -23,7 +23,7 @@ def check_deepseek_vl_install():
 
 
 @VISION_MODELS.register_module()
-class DeepSeekVisionModel(VisonModel):
+class DeepSeekVisionModel(VisionModel):
     """Qwen vision model."""
 
     _arch = 'MultiModalityCausalLM'