Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions docs/getting_started/installation/gpu/cuda.inc.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ Therefore, it is recommended to install vLLM and vLLM-Omni with a **fresh new**

vLLM-Omni is built on top of vLLM. Please install it with the command below.
```bash
uv pip install vllm==0.11.0 --torch-backend=auto
uv pip install vllm==0.12.0 --torch-backend=auto
```

#### Installation of vLLM-Omni
Expand Down Expand Up @@ -89,7 +89,7 @@ docker run --runtime nvidia --gpus 2 \
--env "HF_TOKEN=$HF_TOKEN" \
-p 8091:8091 \
--ipc=host \
vllm/vllm-omni:v0.11.0rc1 \
vllm/vllm-omni:v0.12.0rc1 \
--model Qwen/Qwen3-Omni-30B-A3B-Instruct --port 8091
```

Expand Down
29 changes: 28 additions & 1 deletion examples/offline_inference/qwen3_omni/end2end.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,12 +142,39 @@ def get_audio_query(question: str = None, audio_path: str | None = None, samplin
limit_mm_per_prompt={"audio": 1},
)

def get_mixed_modalities_query() -> QueryResult:
    """Build a single query that feeds audio, image, and video together.

    Returns a QueryResult whose prompt interleaves one placeholder section
    per modality (audio, image, video) ahead of the combined question, with
    a per-modality limit of 1 item each.
    """
    # One question touching all three modalities at once.
    combined_question = (
        "What is recited in the audio? "
        "What is the content of this image? Why is this video funny?"
    )
    # Placeholder tokens, in the same order the assets appear in the
    # multi_modal_data mapping: audio, then image, then video.
    placeholder_section = (
        "<|audio_start|><|audio_pad|><|audio_end|>"
        "<|vision_start|><|image_pad|><|vision_end|>"
        "<|vision_start|><|video_pad|><|vision_end|>"
    )
    full_prompt = (
        f"<|im_start|>system\n{default_system}<|im_end|>\n"
        "<|im_start|>user\n"
        + placeholder_section
        + f"{combined_question}<|im_end|>\n"
        + "<|im_start|>assistant\n"
    )

    # Resolve each asset up front so the returned structure stays flat.
    audio_input = AudioAsset("mary_had_lamb").audio_and_sample_rate
    image_input = convert_image_mode(ImageAsset("cherry_blossom").pil_image, "RGB")
    video_input = VideoAsset(name="baby_reading", num_frames=16).np_ndarrays

    return QueryResult(
        inputs={
            "prompt": full_prompt,
            "multi_modal_data": {
                "audio": audio_input,
                "image": image_input,
                "video": video_input,
            },
        },
        limit_mm_per_prompt={"audio": 1, "image": 1, "video": 1},
    )

# Dispatch table: maps the --query-type CLI value to the builder that
# constructs the corresponding QueryResult. Insertion order is preserved,
# which keeps the argparse `choices` listing stable.
query_map = dict(
    text=get_text_query,
    use_audio=get_audio_query,
    use_image=get_image_query,
    use_video=get_video_query,
    use_mixed_modalities=get_mixed_modalities_query,
)


Expand Down Expand Up @@ -281,7 +308,7 @@ def parse_args():
"--query-type",
"-q",
type=str,
default="use_video",
default="use_mixed_modalities",
choices=query_map.keys(),
help="Query type.",
)
Expand Down
Loading