diff --git a/docs/getting_started/installation/gpu/cuda.inc.md b/docs/getting_started/installation/gpu/cuda.inc.md
index c88302939..4cbbc3802 100644
--- a/docs/getting_started/installation/gpu/cuda.inc.md
+++ b/docs/getting_started/installation/gpu/cuda.inc.md
@@ -20,7 +20,7 @@ Therefore, it is recommended to install vLLM and vLLM-Omni with a **fresh new**
 vLLM-Omni is built based on vLLM. Please install it with command below.
 
 ```bash
-uv pip install vllm==0.11.0 --torch-backend=auto
+uv pip install vllm==0.12.0 --torch-backend=auto
 ```
 
 #### Installation of vLLM-Omni
@@ -89,7 +89,7 @@ docker run --runtime nvidia --gpus 2 \
     --env "HF_TOKEN=$HF_TOKEN" \
     -p 8091:8091 \
     --ipc=host \
-    vllm/vllm-omni:v0.11.0rc1 \
+    vllm/vllm-omni:v0.12.0rc1 \
     --model Qwen/Qwen3-Omni-30B-A3B-Instruct --port 8091
 ```
 
diff --git a/examples/offline_inference/qwen3_omni/end2end.py b/examples/offline_inference/qwen3_omni/end2end.py
index 753f7cc36..e74ef2c9d 100644
--- a/examples/offline_inference/qwen3_omni/end2end.py
+++ b/examples/offline_inference/qwen3_omni/end2end.py
@@ -142,12 +142,39 @@ def get_audio_query(question: str = None, audio_path: str | None = None, samplin
         limit_mm_per_prompt={"audio": 1},
     )
 
+def get_mixed_modalities_query() -> QueryResult:
+    question = (
+        "What is recited in the audio? "
+        "What is the content of this image? Why is this video funny?"
+    )
+    prompt = (
+        f"<|im_start|>system\n{default_system}<|im_end|>\n"
+        "<|im_start|>user\n<|audio_start|><|audio_pad|><|audio_end|>"
+        "<|vision_start|><|image_pad|><|vision_end|>"
+        "<|vision_start|><|video_pad|><|vision_end|>"
+        f"{question}<|im_end|>\n"
+        f"<|im_start|>assistant\n"
+    )
+    return QueryResult(
+        inputs={
+            "prompt": prompt,
+            "multi_modal_data": {
+                "audio": AudioAsset("mary_had_lamb").audio_and_sample_rate,
+                "image": convert_image_mode(
+                    ImageAsset("cherry_blossom").pil_image, "RGB"
+                ),
+                "video": VideoAsset(name="baby_reading", num_frames=16).np_ndarrays,
+            },
+        },
+        limit_mm_per_prompt={"audio": 1, "image": 1, "video": 1},
+    )
 
 query_map = {
     "text": get_text_query,
     "use_audio": get_audio_query,
     "use_image": get_image_query,
     "use_video": get_video_query,
+    "use_mixed_modalities": get_mixed_modalities_query,
 }
 
 
@@ -281,7 +308,7 @@ def parse_args():
         "--query-type",
         "-q",
         type=str,
-        default="use_video",
+        default="use_mixed_modalities",
         choices=query_map.keys(),
         help="Query type.",
     )
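
Usage sketch (illustrative, not part of the patch): since the hunk in parse_args makes `use_mixed_modalities` the default for `--query-type`, both invocations below should exercise the new `get_mixed_modalities_query` path, assuming the example script's existing CLI:

    python examples/offline_inference/qwen3_omni/end2end.py
    python examples/offline_inference/qwen3_omni/end2end.py --query-type use_mixed_modalities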