From 7acf53158d11e22b78e29b3cd46f5058ef851d50 Mon Sep 17 00:00:00 2001 From: wangyu31577 Date: Thu, 11 Dec 2025 16:22:37 +0800 Subject: [PATCH 01/26] =?UTF-8?q?=E4=B8=8A=E4=BC=A0=E6=96=87=E6=A1=A3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/contributing/tests/test_style.md | 50 +++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) create mode 100644 docs/contributing/tests/test_style.md diff --git a/docs/contributing/tests/test_style.md b/docs/contributing/tests/test_style.md new file mode 100644 index 000000000..cccc4be81 --- /dev/null +++ b/docs/contributing/tests/test_style.md @@ -0,0 +1,50 @@ +###online serving + + +```python +models = ["Qwen/{模型名称}"] #指定对应模型 +stage_configs = [str(Path(__file__).parent / "stage_configs" / {模型yaml})] #指定对应模型配置 +test_params = [(model, stage_config) for model in models for stage_config in stage_configs] + +#OmniServer类,用于拉起OmniServer +class OmniServer: + xxx + + + +#根据指定参数拉起omni_server +@pytest.fixture +def omni_server(request): + model, stage_config_path = request.param + with OmniServer(model, ["--stage-configs-path", stage_config_path]) as server: + yield server + +@pytest.mark.parametrize("omni_server", test_params, indirect=True) +def test_video_to_audio( + client: openai.OpenAI, #指定client类型 + omni_server, + base64_encoded_video: str, +) -> None: + #set message + video_data_url = f"data:video/mp4;base64, {base64_encoded_video}" + messages = dummy_messages_from_video_data(video_data_url) + + #send request + chat_completion = client.chat.completions.create( + model=omni_server.model, + messages=messages, + ) + + #verify + + + +def test_text_to_audio( + +) + +def test_audio_to_audio( + +) +} +``` \ No newline at end of file From 4730e8c017261029b00dacf4bf9feb40e0dd6d7f Mon Sep 17 00:00:00 2001 From: wangyu31577 Date: Thu, 11 Dec 2025 16:24:18 +0800 Subject: [PATCH 02/26] =?UTF-8?q?=E4=B8=8A=E4=BC=A0=E6=96=87=E6=A1=A3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/contributing/tests/test_style.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/docs/contributing/tests/test_style.md b/docs/contributing/tests/test_style.md index cccc4be81..4dc754b3c 100644 --- a/docs/contributing/tests/test_style.md +++ b/docs/contributing/tests/test_style.md @@ -18,7 +18,10 @@ def omni_server(request): model, stage_config_path = request.param with OmniServer(model, ["--stage-configs-path", stage_config_path]) as server: yield server - + + +#output: audio +#input: video @pytest.mark.parametrize("omni_server", test_params, indirect=True) def test_video_to_audio( client: openai.OpenAI, #指定client类型 From 19d5728cbe18b10eb21925b3db9859349b9595f7 Mon Sep 17 00:00:00 2001 From: Alicia Wang <115451386+congw729@users.noreply.github.com> Date: Thu, 11 Dec 2025 17:14:36 +0800 Subject: [PATCH 03/26] Add dir structure & coding style for tests. Signed-off-by: Alicia Wang <115451386+congw729@users.noreply.github.com> --- docs/api/README.md | 2 + docs/contributing/tests/tests_style.md | 198 ++++++++++++++++++ .../examples/offline_inference/qwen_image.md | 19 +- pytest.ini | 3 + 4 files changed, 220 insertions(+), 2 deletions(-) create mode 100644 docs/contributing/tests/tests_style.md create mode 100644 pytest.ini diff --git a/docs/api/README.md b/docs/api/README.md index 6ddc14e00..a6fb2cd2f 100644 --- a/docs/api/README.md +++ b/docs/api/README.md @@ -96,6 +96,8 @@ Model execution components. Configuration classes. 
- [vllm_omni.config.model.OmniModelConfig][] +- [vllm_omni.distributed.omni_connectors.utils.config.ConnectorSpec][] +- [vllm_omni.distributed.omni_connectors.utils.config.OmniTransferConfig][] ## Workers diff --git a/docs/contributing/tests/tests_style.md b/docs/contributing/tests/tests_style.md new file mode 100644 index 000000000..841159720 --- /dev/null +++ b/docs/contributing/tests/tests_style.md @@ -0,0 +1,198 @@ +# Test File Structure and Style Guide + +To ensure project maintainability and sustainable development, we encourage contributors to submit test code (unit tests, system tests, or end-to-end tests) alongside their code changes. This document outlines the guidelines for organizing and naming test files. + +## Test Types + +### Unit Tests and System Tests +For unit tests and system tests, we strongly recommend placing test files in the same directory structure as the source code being tested, using the naming convention `test_*.py`. + +### End-to-End (E2E) Tests for Models +End-to-end tests verify the complete functionality of a system or component. For our project, the E2E tests for different omni models are organized into two subdirectories: +- **`tests/e2e/offline_inference/`**: Tests for offline inference modes (e.g., Qwen3Omni offline inference) +- **`tests/e2e/online_serving/`**: Tests for online serving scenarios (e.g., API server tests) + +**Example:** The test file for `vllm_omni/entrypoints/omni_llm.py` should be located at `tests/entrypoints/test_omni_llm.py`. + +## Test Directory Structure + +The ideal directory structure mirrors the source code organization: + +``` +vllm_omni/ tests/ +├── config/ → ├── config/ +│ └── model.py │ └── test_model.py +│ +├── core/ → ├── core/ +│ ├── dit_cache_manager.py │ ├── test_dit_cache_manager.py +│ └── sched/ │ └── sched/ # Maps to core/sched/ +│ ├── omni_ar_scheduler.py │ ├── test_omni_ar_scheduler.py +│ ├── omni_generation_scheduler.py │ ├── test_omni_generation_scheduler.py +│ └── output.py │ └── test_output.py +│ +├── diffusion/ → ├── diffusion/ +│ ├── diffusion_engine.py │ ├── test_diffusion_engine.py +│ ├── omni_diffusion.py │ ├── test_omni_diffusion.py +│ ├── attention/ │ ├── attention/ # Maps to diffusion/attention/ +│ │ └── backends/ │ │ └── test_*.py +│ ├── models/ │ ├── models/ # Maps to diffusion/models/ +│ │ ├── qwen_image/ │ │ ├── qwen_image/ +│ │ │ └── ... │ │ │ └── test_*.py +│ │ └── z_image/ │ │ └── z_image/ +│ │ └── ... │ │ └── test_*.py +│ └── worker/ │ └── worker/ # Maps to diffusion/worker/ +│ └── ... │ └── test_*.py +│ +├── distributed/ → ├── distributed/ +│ └── ... │ └── test_*.py +│ +├── engine/ → ├── engine/ +│ ├── processor.py │ ├── test_processor.py +│ └── output_processor.py │ └── test_output_processor.py +│ +├── entrypoints/ → ├── entrypoints/ +│ ├── omni_llm.py │ ├── test_omni_llm.py # UT: OmniLLM core logic (mocked) +│ ├── omni_stage.py │ ├── test_omni_stage.py # UT: OmniStage logic +│ ├── omni.py │ ├── test_omni.py # E2E: Omni class (offline inference) +│ ├── async_omni.py │ ├── test_async_omni.py # E2E: AsyncOmni class +│ ├── cli/ │ ├── cli/ # Maps to entrypoints/cli/ +│ │ └── ... 
│ │ └── test_*.py +│ └── openai/ │ └── openai/ # Maps to entrypoints/openai/ +│ ├── api_server.py │ ├── test_api_server.py # E2E: API server (online serving) +│ └── serving_chat.py │ └── test_serving_chat.py +│ +├── inputs/ → ├── inputs/ +│ ├── data.py │ ├── test_data.py +│ ├── parse.py │ ├── test_parse.py +│ └── preprocess.py │ └── test_preprocess.py +│ +├── model_executor/ → ├── model_executor/ +│ ├── layers/ │ ├── layers/ +│ │ └── mrope.py │ │ └── test_mrope.py +│ ├── model_loader/ │ ├── model_loader/ +│ │ └── weight_utils.py │ │ └── test_weight_utils.py +│ ├── models/ │ ├── models/ +│ │ ├── qwen2_5_omni/ │ │ ├── qwen2_5_omni/ +│ │ │ ├── qwen2_5_omni_thinker.py │ │ │ ├── test_qwen2_5_omni_thinker.py # UT +│ │ │ ├── qwen2_5_omni_talker.py │ │ │ ├── test_qwen2_5_omni_talker.py # UT +│ │ │ └── qwen2_5_omni_token2wav.py │ │ │ └── test_qwen2_5_omni_token2wav.py # UT +│ │ └── qwen3_omni/ │ │ └── qwen3_omni/ +│ │ └── ... │ │ └── test_*.py +│ ├── stage_configs/ │ └── stage_configs/ # Configuration tests (if needed) +│ │ └── ... │ └── test_*.py +│ └── stage_input_processors/ │ └── stage_input_processors/ +│ └── ... │ └── test_*.py +│ +├── sample/ → ├── sample/ +│ └── ... │ └── test_*.py +│ +├── utils/ → ├── utils/ +│ └── platform_utils.py │ └── test_platform_utils.py +│ +├── worker/ → ├── worker/ + ├── gpu_ar_worker.py │ ├── test_gpu_ar_worker.py + ├── gpu_generation_worker.py │ ├── test_gpu_generation_worker.py + ├── gpu_model_runner.py │ ├── test_gpu_model_runner.py + └── npu/ │ └── npu/ # Maps to worker/npu/ + └── ... │ └── test_*.py +│ +└── e2e/ → ├── e2e/ # End-to-end scenarios (no 1:1 source mirror) + ├── online_serving/ # Full-stack online serving flows + │ └── (empty for now) + └── offline_inference/ # Full offline inference flows + ├── test_qwen2_5_omni.py # Moved from multi_stages/ + ├── test_qwen3_omni.py # Moved from multi_stages_h100/ + ├── test_diffusion_model.py # Moved from single_stage/ + └── stage_configs/ # Shared stage configs + ├── qwen2_5_omni_ci.yaml + └── qwen3_omni_ci.yaml +``` + + + +### Naming Conventions + +- **Unit/System Tests**: Use `test_.py` format + - Example: `omni_llm.py` → `test_omni_llm.py` + +- **E2E Tests**: Place in `tests/e2e/offline_inference/` or `tests/e2e/online_serving/` with descriptive names + - Example: `tests/e2e/offline_inference/test_qwen3_omni.py`, `tests/e2e/offline_inference/test_diffusion_model.py` + +### Best Practices + +1. **Mirror Source Structure**: Test directories should mirror the source code structure +2. **Test Type Indicators**: Use comments to indicate test types (UT for unit tests, E2E for end-to-end tests) +3. **Shared Resources**: Place shared test configurations (e.g., CI configs) in appropriate subdirectories +4. **Consistent Naming**: Follow the `test_*.py` naming convention consistently across all test files + + +## Test codes requirements + +### Coding style + +1. **File header**: Add SPDX license header to all test files +2. **Imports**: Pls don't use manual `sys.path` modifications, use standard imports instead. +3. **Test type differentiation**: + - Unit tests: Maintain mock style + - Model tests: Consider using OmniRunner uniformly, avoid decorators +4. **Documentation**: Add docstrings to all test functions +5. **Environment variables**: Set uniformly in `conftest.py` or at the top of files +6. **Type annotations**: Add type annotations to all test function parameters +7. **Resources**, Using pytest tag to specify the computation resources the test required. 
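+
+As a concrete illustration of the requirements above, the sketch below shows a plain unit test that carries the SPDX header (item 1), a docstring (item 4), type annotations (item 6), and a resource marker (item 7). It is only a minimal example: the test name and tensor shape are placeholders, and it assumes the `gpu_mem_high` marker registered in `pytest.ini`, so CI can select or exclude such tests with `pytest -m gpu_mem_high` or `pytest -m "not gpu_mem_high"`.
+
+```python
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Minimal unit-test sketch following the coding-style requirements above."""
+
+import pytest
+import torch
+
+
+@pytest.mark.gpu_mem_high  # resource tag registered in pytest.ini
+def test_large_tensor_allocation() -> None:
+    """Allocate a large CUDA tensor; placeholder body for a high-VRAM unit test."""
+    if not torch.cuda.is_available():
+        pytest.skip("CUDA device required")
+    x = torch.empty(4096, 4096, device="cuda")
+    assert x.numel() == 4096 * 4096
+```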
+ +### Template +#### E2E - Online serving + +```python + +``` + +#### E2E - Offline inference +```python +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Offline E2E smoke test for an omni model (video → audio). +""" + +import os +from pathlib import Path + +import pytest +from vllm.assets.video import VideoAsset + +from ..multi_stages.conftest import OmniRunner + +# Optional: set process start method for workers +os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" + +CI_STAGE_CONFIG_PATH = str(Path(__file__).parent / "stage_configs" / "qwen3_omni_ci.yaml") # Edit here to load your model + + +# function name: test_{input_modality}_to_{output_modality} +# modality candidate: text, image, audio, video, mixed_modalities +@pytest.mark.gpu_mem_high # requires high-memory GPU node +@pytest.mark.parametrize("model", ["Qwen/Qwen3-Omni-30B-A3B-Instruct"]) +def test_video_to_audio(omni_runner: type[OmniRunner], model: str) -> None: + """Offline inference: video input, audio output.""" + with omni_runner(model, seed=42, stage_configs_path=CI_STAGE_CONFIG_PATH) as runner: + video = VideoAsset(name="sample", num_frames=4).np_ndarrays + + outputs = runner.generate_multimodal( + prompts="Describe this video briefly.", + videos=video, + ) + + # Minimal assertions: got outputs and at least one audio result + assert outputs + has_audio = any(o.final_output_type == "audio" for o in outputs) + assert has_audio +``` + + + +## Checklist before submit your test files: + +1. The file is saved in a suitable places and the file name is clear. +2. The coding style matches the requirements. +3. For e2e omni model tests, specify the diff --git a/docs/user_guide/examples/offline_inference/qwen_image.md b/docs/user_guide/examples/offline_inference/qwen_image.md index 575da435a..27f846843 100644 --- a/docs/user_guide/examples/offline_inference/qwen_image.md +++ b/docs/user_guide/examples/offline_inference/qwen_image.md @@ -3,11 +3,22 @@ Source . -This folder provides two simple entrypoints for experimenting with `Qwen/Qwen-Image` using vLLM-Omni: +This folder provides several entrypoints for experimenting with `Qwen/Qwen-Image` using vLLM-Omni: -- `text_to_image.py`: command-line script for single image generation. +- `text_to_image.py`: command-line script for single image generation with advanced options. - `web_demo.py`: lightweight Gradio UI for interactive prompt/seed/CFG exploration. +## Basic Usage + +```python +from vllm_omni.entrypoints.omni import Omni + +if __name__ == "__main__": + omni = Omni(model="Qwen/Qwen-Image") + prompt = "a cup of coffee on the table" + images = omni.generate(prompt) + images[0].save("coffee.png") +``` ## Local CLI Usage @@ -51,6 +62,10 @@ Then open `http://localhost:7862/` on your local browser to interact with the we ``````py --8<-- "examples/offline_inference/qwen_image/gradio_demo.py" `````` +??? abstract "image_edit.py" + ``````py + --8<-- "examples/offline_inference/qwen_image/image_edit.py" + `````` ??? 
abstract "text_to_image.py" ``````py --8<-- "examples/offline_inference/qwen_image/text_to_image.py" diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 000000000..8fb4beb97 --- /dev/null +++ b/pytest.ini @@ -0,0 +1,3 @@ +[pytest] +markers = + gpu_mem_high: needs high VRAM From b162a3ac787713569519aa81703b72d20e30ebb4 Mon Sep 17 00:00:00 2001 From: wangyu31577 Date: Thu, 11 Dec 2025 21:22:18 +0800 Subject: [PATCH 04/26] =?UTF-8?q?=E4=BF=AE=E6=94=B9=E6=96=87=E6=A1=A3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/contributing/tests/test_style.md | 53 --------------------- docs/contributing/tests/tests_style.md | 64 +++++++++++++++++++++++++- 2 files changed, 63 insertions(+), 54 deletions(-) delete mode 100644 docs/contributing/tests/test_style.md diff --git a/docs/contributing/tests/test_style.md b/docs/contributing/tests/test_style.md deleted file mode 100644 index 4dc754b3c..000000000 --- a/docs/contributing/tests/test_style.md +++ /dev/null @@ -1,53 +0,0 @@ -###online serving - - -```python -models = ["Qwen/{模型名称}"] #指定对应模型 -stage_configs = [str(Path(__file__).parent / "stage_configs" / {模型yaml})] #指定对应模型配置 -test_params = [(model, stage_config) for model in models for stage_config in stage_configs] - -#OmniServer类,用于拉起OmniServer -class OmniServer: - xxx - - - -#根据指定参数拉起omni_server -@pytest.fixture -def omni_server(request): - model, stage_config_path = request.param - with OmniServer(model, ["--stage-configs-path", stage_config_path]) as server: - yield server - - -#output: audio -#input: video -@pytest.mark.parametrize("omni_server", test_params, indirect=True) -def test_video_to_audio( - client: openai.OpenAI, #指定client类型 - omni_server, - base64_encoded_video: str, -) -> None: - #set message - video_data_url = f"data:video/mp4;base64, {base64_encoded_video}" - messages = dummy_messages_from_video_data(video_data_url) - - #send request - chat_completion = client.chat.completions.create( - model=omni_server.model, - messages=messages, - ) - - #verify - - - -def test_text_to_audio( - -) - -def test_audio_to_audio( - -) -} -``` \ No newline at end of file diff --git a/docs/contributing/tests/tests_style.md b/docs/contributing/tests/tests_style.md index 841159720..b6813ccc2 100644 --- a/docs/contributing/tests/tests_style.md +++ b/docs/contributing/tests/tests_style.md @@ -142,9 +142,71 @@ vllm_omni/ tests/ ### Template #### E2E - Online serving - +""" +Online E2E smoke test for an omni model (video,text,audio → audio). 
+""" ```python +from pathlib import Path + +import pytest +import openai + + +# Optional: set process start method for workers +os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" +models = ["{your model name}"] #Edit here to load your model +stage_configs = [str(Path(__file__).parent / "stage_configs" / {your model yaml})] #Edit here to load your model yaml +test_params = [(model, stage_config) for model in models for stage_config in stage_configs] + +#OmniServer,Used to start the vllm-omni server +class OmniServer: + xxx + + +@pytest.fixture +def omni_server(request): + model, stage_config_path = request.param + with OmniServer(model, ["--stage-configs-path", stage_config_path]) as server: + yield server + + +#handle request message +@pytest.fixture(scope="session") +def base64_encoded_video() -> str: + xxx + +@pytest.fixture(scope="session") +def dummy_messages_from_video_data(video_data_url: str, content_text: str) -> str: + xxx + +@pytest.mark.parametrize("omni_server", test_params, indirect=True) +def test_video_to_audio( + client: openai.OpenAI, + omni_server, + base64_encoded_video: str, +) -> None: + #set message + video_data_url = f"data:video/mp4;base64, {base64_encoded_video}" + messages = dummy_messages_from_video_data(video_data_url) + + #send request + chat_completion = client.chat.completions.create( + model=omni_server.model, + messages=messages, + ) + + #verify text output + text_choice = chat_completion.choices[0] + assert text_choice.finish_reason == "length" + + #verify audio output + audio_choice = chat_completion.choices[1] + audio_message = audio_choice.message + if hasattr(audio_message, "audio") and audio_message.audio: + assert audio_message.audio.data is not None + assert len(audio_message.audio.data) > 0 + ``` #### E2E - Offline inference From 35b7676a8f4cafddeb6651058b6f962129cfa8b0 Mon Sep 17 00:00:00 2001 From: wangyu31577 Date: Mon, 15 Dec 2025 12:11:18 +0800 Subject: [PATCH 05/26] Merge branch 'main' of https://github.com/yenuo26/vllm-omni into main # Conflicts: # docs/api/README.md # docs/user_guide/examples/offline_inference/text_to_image.md --- docs/api/README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/api/README.md b/docs/api/README.md index 197fd1568..d2652f80e 100644 --- a/docs/api/README.md +++ b/docs/api/README.md @@ -6,6 +6,8 @@ Configuration classes. 
- [vllm_omni.config.model.OmniModelConfig][] +- [vllm_omni.distributed.omni_connectors.utils.config.ConnectorSpec][] +- [vllm_omni.distributed.omni_connectors.utils.config.OmniTransferConfig][] ## EntryPoints From 155884cc37aa7068a87e2dfc8750dfcefe05378a Mon Sep 17 00:00:00 2001 From: wangyu31577 Date: Mon, 15 Dec 2025 16:29:03 +0800 Subject: [PATCH 06/26] Merge branch 'main' of https://github.com/yenuo26/vllm-omni into main # Conflicts: # docs/api/README.md # docs/user_guide/examples/offline_inference/text_to_image.md --- tests/conftest.py | 82 +++++++++++++- .../stage_configs/qwen2_5_omni_ci.yaml | 105 ++++++++++++++++++ tests/e2e/online_serving/test_qwen2_5_omni.py | 88 +++++++++++++++ 3 files changed, 274 insertions(+), 1 deletion(-) create mode 100644 tests/e2e/online_serving/stage_configs/qwen2_5_omni_ci.yaml create mode 100644 tests/e2e/online_serving/test_qwen2_5_omni.py diff --git a/tests/conftest.py b/tests/conftest.py index 82c959f07..3e917a6da 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,8 +1,12 @@ import os - +import socket +import subprocess +import sys +import time import pytest import torch from vllm.logger import init_logger +from vllm.utils import get_open_port logger = init_logger(__name__) @@ -34,3 +38,79 @@ def clean_gpu_memory_between_tests(): if torch.cuda.is_available(): torch.cuda.empty_cache() gc.collect() + + + +class OmniServer: + """Omniserver for vLLM-Omni tests.""" + + def __init__( + self, + model: str, + serve_args: list[str], + *, + env_dict: dict[str, str] | None = None, + ) -> None: + self.model = model + self.serve_args = serve_args + self.env_dict = env_dict + self.proc: subprocess.Popen | None = None + self.host = "127.0.0.1" + self.port = get_open_port() + + def _start_server(self) -> None: + """Start the vLLM-Omni server subprocess.""" + env = os.environ.copy() + env["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" + if self.env_dict is not None: + env.update(self.env_dict) + + cmd = [ + sys.executable, + "-m", + "vllm_omni.entrypoints.cli.main", + "serve", + self.model, + "--omni", + "--host", + self.host, + "--port", + str(self.port), + ] + self.serve_args + + print(f"Launching OmniServer with: {' '.join(cmd)}") + self.proc = subprocess.Popen( + cmd, + env=env, + cwd=os.path.dirname(os.path.dirname(os.path.abspath(__file__))), # Set working directory to vllm-omni root + ) + + # Wait for server to be ready + max_wait = 600 # 10 minutes + start_time = time.time() + while time.time() - start_time < max_wait: + try: + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock: + sock.settimeout(1) + result = sock.connect_ex((self.host, self.port)) + if result == 0: + print(f"Server ready on {self.host}:{self.port}") + return + except Exception: + pass + time.sleep(2) + + raise RuntimeError(f"Server failed to start within {max_wait} seconds") + + def __enter__(self): + self._start_server() + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + if self.proc: + self.proc.terminate() + try: + self.proc.wait(timeout=30) + except subprocess.TimeoutExpired: + self.proc.kill() + self.proc.wait() diff --git a/tests/e2e/online_serving/stage_configs/qwen2_5_omni_ci.yaml b/tests/e2e/online_serving/stage_configs/qwen2_5_omni_ci.yaml new file mode 100644 index 000000000..96e9d7fa7 --- /dev/null +++ b/tests/e2e/online_serving/stage_configs/qwen2_5_omni_ci.yaml @@ -0,0 +1,105 @@ +# stage config for running qwen2.5-omni with architecture of OmniLLM. + +# The following config has been verified on 2x 24GB GPU (L4/RTX3090/RTX4090). 
+# This config is optimized for CI e2e tests. +stage_args: + - stage_id: 0 + runtime: + process: true # Run this stage in a separate process + devices: "0" # Visible devices for this stage (CUDA_VISIBLE_DEVICES/torch.cuda.set_device) + max_batch_size: 1 + engine_args: + model_stage: thinker + model_arch: Qwen2_5OmniForConditionalGeneration + worker_cls: vllm_omni.worker.gpu_ar_worker.GPUARWorker + scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler + max_model_len: 896 + max_num_batched_tokens: 896 + max_num_seqs: 1 + gpu_memory_utilization: 0.8 + skip_mm_profiling: true + enforce_eager: true # Now we only support eager mode + trust_remote_code: true + engine_output_type: latent + enable_prefix_caching: false + is_comprehension: true + final_output: true + final_output_type: text + default_sampling_params: + temperature: 0.0 + top_p: 1.0 + top_k: -1 + max_tokens: 128 + seed: 42 + detokenize: True + repetition_penalty: 1.1 + - stage_id: 1 + runtime: + process: true + devices: "1" + max_batch_size: 1 + engine_args: + model_stage: talker + model_arch: Qwen2_5OmniForConditionalGeneration + worker_cls: vllm_omni.worker.gpu_ar_worker.GPUARWorker + scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler + max_model_len: 896 + max_num_batched_tokens: 896 + max_num_seqs: 1 + gpu_memory_utilization: 0.8 + skip_mm_profiling: true + enforce_eager: true + trust_remote_code: true + enable_prefix_caching: false + engine_output_type: latent + engine_input_source: [0] + custom_process_input_func: vllm_omni.model_executor.stage_input_processors.qwen2_5_omni.thinker2talker + default_sampling_params: + temperature: 0.9 + top_p: 0.8 + top_k: 40 + max_tokens: 128 + seed: 42 + detokenize: True + repetition_penalty: 1.05 + stop_token_ids: [8294] + - stage_id: 2 + runtime: + process: true + devices: "0" # Example: use a different GPU than the previous stage; use "0" if single GPU + max_batch_size: 1 + engine_args: + model_stage: code2wav + model_arch: Qwen2_5OmniForConditionalGeneration + worker_cls: vllm_omni.worker.gpu_generation_worker.GPUGenerationWorker + scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler + gpu_memory_utilization: 0.15 + enforce_eager: true + trust_remote_code: true + enable_prefix_caching: false + engine_output_type: audio + engine_input_source: [1] + final_output: true + final_output_type: audio + default_sampling_params: + temperature: 0.0 + top_p: 1.0 + top_k: -1 + max_tokens: 128 + seed: 42 + detokenize: True + repetition_penalty: 1.1 + +# Top-level runtime config (concise): default windows and stage edges +runtime: + enabled: true + defaults: + window_size: -1 # Simplified: trigger downstream only after full upstream completion + max_inflight: 1 # Simplified: process serially within each stage + edges: + - from: 0 # thinker → talker: trigger only after receiving full input (-1) + to: 1 + window_size: -1 + - from: 1 # talker → code2wav: trigger only after receiving full input (-1) + to: 2 + window_size: -1 diff --git a/tests/e2e/online_serving/test_qwen2_5_omni.py b/tests/e2e/online_serving/test_qwen2_5_omni.py new file mode 100644 index 000000000..2ccee9f84 --- /dev/null +++ b/tests/e2e/online_serving/test_qwen2_5_omni.py @@ -0,0 +1,88 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +E2E Online tests for Qwen3-Omni model with video input and audio output. 
+""" + +import os + +from pathlib import Path + +import openai +import pytest +from vllm.assets.video import VideoAsset +from tests.conftest import OmniServer + +os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" + +models = ["Qwen/Qwen2.5-Omni-3B"] + +# CI stage config for 2*H100-80G GPUs +stage_configs = [str(Path(__file__).parent / "stage_configs" / "qwen2_5_omni_ci.yaml")] + +# Create parameter combinations for model and stage config +test_params = [(model, stage_config) for model in models for stage_config in stage_configs] + + +@pytest.fixture +def client(omni_server): + """OpenAI client for the running vLLM-Omni server.""" + return openai.OpenAI( + base_url=f"http://{omni_server.host}:{omni_server.port}/v1", + api_key="EMPTY", + ) + + +@pytest.fixture(scope="session") +def base64_encoded_video() -> str: + """Base64 encoded video for testing.""" + import base64 + + video = VideoAsset(name="baby_reading", num_frames=4) + with open(video.video_path, "rb") as f: + content = f.read() + return base64.b64encode(content).decode("utf-8") + + +def get_system_prompt(): + return { + "role": "system", + "content": [ + { + "type": "text", + "text": ( + "You are Qwen, a virtual human developed by the Qwen Team, " + "Alibaba Group, capable of perceiving auditory and visual inputs, " + "as well as generating text and speech." + ), + } + ], + } + + +def dummy_messages_from_video_data( + video_data_url: str, + content_text: str = "Describe the video briefly.", +): + """Create messages with video data URL for OpenAI API.""" + return [ + get_system_prompt(), + { + "role": "user", + "content": [ + {"type": "video_url", "video_url": {"url": video_data_url}}, + {"type": "text", "text": content_text}, + ], + }, + ] + + +@pytest.mark.parametrize("test_param", test_params) +def test_video_to_audio( + test_param, +) -> None: + """Test processing video, generating audio output via OpenAI API.""" + # Create data URL for the base64 encoded video + model, stage_config_path = test_param + with OmniServer(model, ["--stage-configs-path", stage_config_path]) as server: + pass From 61442b6d6d81eeb5f9508f44ee2c049298d9dc9c Mon Sep 17 00:00:00 2001 From: wangyu31577 Date: Mon, 15 Dec 2025 16:52:39 +0800 Subject: [PATCH 07/26] Merge branch 'main' of https://github.com/yenuo26/vllm-omni into main # Conflicts: # docs/api/README.md # docs/user_guide/examples/offline_inference/text_to_image.md --- benchmarks/README.md | 389 ++++++ benchmarks/auto_tune.sh | 244 ++++ benchmarks/backend_request_func.py | 622 +++++++++ benchmarks/benchmark_dataset.py | 1167 ++++++++++++++++ benchmarks/benchmark_latency.py | 180 +++ .../benchmark_long_document_qa_throughput.py | 196 +++ benchmarks/benchmark_prefix_caching.py | 272 ++++ benchmarks/benchmark_prioritization.py | 216 +++ benchmarks/benchmark_serving.py | 1230 +++++++++++++++++ .../benchmark_serving_structured_output.py | 1038 ++++++++++++++ benchmarks/benchmark_throughput.py | 724 ++++++++++ benchmarks/benchmark_utils.py | 74 + .../cutlass_benchmarks/sparse_benchmarks.py | 516 +++++++ benchmarks/cutlass_benchmarks/utils.py | 100 ++ .../cutlass_benchmarks/w8a8_benchmarks.py | 377 +++++ .../cutlass_benchmarks/weight_shapes.py | 46 + .../disagg_overhead_benchmark.sh | 145 ++ .../disagg_performance_benchmark.sh | 163 +++ .../disagg_prefill_proxy_server.py | 63 + .../disagg_benchmarks/round_robin_proxy.py | 63 + .../visualize_benchmark_results.py | 47 + .../fused_kernels/layernorm_rms_benchmarks.py | 228 +++ benchmarks/kernels/bench_fp8_gemm.py | 223 +++ 
benchmarks/kernels/benchmark_aqlm.py | 345 +++++ benchmarks/kernels/benchmark_bitblas.py | 242 ++++ .../kernels/benchmark_cutlass_fp4_moe.py | 490 +++++++ .../kernels/benchmark_grouped_gemm_cutlass.py | 378 +++++ benchmarks/kernels/benchmark_layernorm.py | 93 ++ benchmarks/kernels/benchmark_lora.py | 1065 ++++++++++++++ benchmarks/kernels/benchmark_machete.py | 730 ++++++++++ benchmarks/kernels/benchmark_marlin.py | 342 +++++ benchmarks/kernels/benchmark_moe.py | 737 ++++++++++ .../benchmark_moe_permute_unpermute.py | 417 ++++++ .../kernels/benchmark_paged_attention.py | 251 ++++ benchmarks/kernels/benchmark_quant.py | 108 ++ benchmarks/kernels/benchmark_rmsnorm.py | 256 ++++ benchmarks/kernels/benchmark_rope.py | 133 ++ benchmarks/kernels/benchmark_shapes.py | 94 ++ .../kernels/benchmark_w8a8_block_fp8.py | 414 ++++++ benchmarks/kernels/deepgemm/README.md | 129 ++ .../benchmark_fp8_block_dense_gemm.py | 467 +++++++ benchmarks/kernels/graph_machete_bench.py | 64 + benchmarks/kernels/requirements.txt | 1 + benchmarks/kernels/utils.py | 214 +++ benchmarks/kernels/weight_shapes.py | 98 ++ benchmarks/overheads/benchmark_hashing.py | 64 + benchmarks/pyproject.toml | 49 + benchmarks/run_structured_output_benchmark.sh | 129 ++ benchmarks/sonnet.txt | 518 +++++++ 49 files changed, 16151 insertions(+) create mode 100644 benchmarks/README.md create mode 100644 benchmarks/auto_tune.sh create mode 100644 benchmarks/backend_request_func.py create mode 100644 benchmarks/benchmark_dataset.py create mode 100644 benchmarks/benchmark_latency.py create mode 100644 benchmarks/benchmark_long_document_qa_throughput.py create mode 100644 benchmarks/benchmark_prefix_caching.py create mode 100644 benchmarks/benchmark_prioritization.py create mode 100644 benchmarks/benchmark_serving.py create mode 100644 benchmarks/benchmark_serving_structured_output.py create mode 100644 benchmarks/benchmark_throughput.py create mode 100644 benchmarks/benchmark_utils.py create mode 100644 benchmarks/cutlass_benchmarks/sparse_benchmarks.py create mode 100644 benchmarks/cutlass_benchmarks/utils.py create mode 100644 benchmarks/cutlass_benchmarks/w8a8_benchmarks.py create mode 100644 benchmarks/cutlass_benchmarks/weight_shapes.py create mode 100644 benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh create mode 100644 benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh create mode 100644 benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py create mode 100644 benchmarks/disagg_benchmarks/round_robin_proxy.py create mode 100644 benchmarks/disagg_benchmarks/visualize_benchmark_results.py create mode 100644 benchmarks/fused_kernels/layernorm_rms_benchmarks.py create mode 100644 benchmarks/kernels/bench_fp8_gemm.py create mode 100644 benchmarks/kernels/benchmark_aqlm.py create mode 100644 benchmarks/kernels/benchmark_bitblas.py create mode 100644 benchmarks/kernels/benchmark_cutlass_fp4_moe.py create mode 100644 benchmarks/kernels/benchmark_grouped_gemm_cutlass.py create mode 100644 benchmarks/kernels/benchmark_layernorm.py create mode 100644 benchmarks/kernels/benchmark_lora.py create mode 100644 benchmarks/kernels/benchmark_machete.py create mode 100644 benchmarks/kernels/benchmark_marlin.py create mode 100644 benchmarks/kernels/benchmark_moe.py create mode 100644 benchmarks/kernels/benchmark_moe_permute_unpermute.py create mode 100644 benchmarks/kernels/benchmark_paged_attention.py create mode 100644 benchmarks/kernels/benchmark_quant.py create mode 100644 benchmarks/kernels/benchmark_rmsnorm.py create mode 100644 
benchmarks/kernels/benchmark_rope.py create mode 100644 benchmarks/kernels/benchmark_shapes.py create mode 100644 benchmarks/kernels/benchmark_w8a8_block_fp8.py create mode 100644 benchmarks/kernels/deepgemm/README.md create mode 100644 benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py create mode 100644 benchmarks/kernels/graph_machete_bench.py create mode 100644 benchmarks/kernels/requirements.txt create mode 100644 benchmarks/kernels/utils.py create mode 100644 benchmarks/kernels/weight_shapes.py create mode 100644 benchmarks/overheads/benchmark_hashing.py create mode 100644 benchmarks/pyproject.toml create mode 100644 benchmarks/run_structured_output_benchmark.sh create mode 100644 benchmarks/sonnet.txt diff --git a/benchmarks/README.md b/benchmarks/README.md new file mode 100644 index 000000000..6f9fbb91c --- /dev/null +++ b/benchmarks/README.md @@ -0,0 +1,389 @@ +# Benchmarking vLLM + +This README guides you through running benchmark tests with the extensive +datasets supported on vLLM. It’s a living document, updated as new features and datasets +become available. + +## Dataset Overview + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+| Dataset | Online | Offline | Data Path |
+|---------|--------|---------|-----------|
+| ShareGPT | | | `wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json` |
+| BurstGPT | | | `wget https://github.com/HPMLL/BurstGPT/releases/download/v1.1/BurstGPT_without_fails_2.csv` |
+| Sonnet | | | Local file: `benchmarks/sonnet.txt` |
+| Random | | | synthetic |
+| HuggingFace-VisionArena | | | `lmarena-ai/VisionArena-Chat` |
+| HuggingFace-InstructCoder | | | `likaixin/InstructCoder` |
+| HuggingFace-AIMO | | | `AI-MO/aimo-validation-aime`, `AI-MO/NuminaMath-1.5`, `AI-MO/NuminaMath-CoT` |
+| HuggingFace-Other | | | `lmms-lab/LLaVA-OneVision-Data`, `Aeala/ShareGPT_Vicuna_unfiltered` |
+| Custom | | | Local file: `data.jsonl` |
+ +✅: supported + +🟡: Partial support + +🚧: to be supported + +**Note**: HuggingFace dataset's `dataset-name` should be set to `hf` + +--- +## Example - Online Benchmark + +First start serving your model + +```bash +vllm serve NousResearch/Hermes-3-Llama-3.1-8B --disable-log-requests +``` + +Then run the benchmarking script + +```bash +# download dataset +# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json +python3 vllm/benchmarks/benchmark_serving.py \ + --backend vllm \ + --model NousResearch/Hermes-3-Llama-3.1-8B \ + --endpoint /v1/completions \ + --dataset-name sharegpt \ + --dataset-path /ShareGPT_V3_unfiltered_cleaned_split.json \ + --num-prompts 10 +``` + +If successful, you will see the following output + +``` +============ Serving Benchmark Result ============ +Successful requests: 10 +Benchmark duration (s): 5.78 +Total input tokens: 1369 +Total generated tokens: 2212 +Request throughput (req/s): 1.73 +Output token throughput (tok/s): 382.89 +Total Token throughput (tok/s): 619.85 +---------------Time to First Token---------------- +Mean TTFT (ms): 71.54 +Median TTFT (ms): 73.88 +P99 TTFT (ms): 79.49 +-----Time per Output Token (excl. 1st token)------ +Mean TPOT (ms): 7.91 +Median TPOT (ms): 7.96 +P99 TPOT (ms): 8.03 +---------------Inter-token Latency---------------- +Mean ITL (ms): 7.74 +Median ITL (ms): 7.70 +P99 ITL (ms): 8.39 +================================================== +``` + +### Custom Dataset +If the dataset you want to benchmark is not supported yet in vLLM, even then you can benchmark on it using `CustomDataset`. Your data needs to be in `.jsonl` format and needs to have "prompt" field per entry, e.g., data.jsonl + +``` +{"prompt": "What is the capital of India?"} +{"prompt": "What is the capital of Iran?"} +{"prompt": "What is the capital of China?"} +``` + +```bash +# start server +VLLM_USE_V1=1 vllm serve meta-llama/Llama-3.1-8B-Instruct --disable-log-requests +``` + +```bash +# run benchmarking script +python3 benchmarks/benchmark_serving.py --port 9001 --save-result --save-detailed \ + --backend vllm \ + --model meta-llama/Llama-3.1-8B-Instruct \ + --endpoint /v1/completions \ + --dataset-name custom \ + --dataset-path \ + --custom-skip-chat-template \ + --num-prompts 80 \ + --max-concurrency 1 \ + --temperature=0.3 \ + --top-p=0.75 \ + --result-dir "./log/" +``` + +You can skip applying chat template if your data already has it by using `--custom-skip-chat-template`. 
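+
+If a `.jsonl` file in this format does not exist yet, a small script along the following lines can generate one. This is only an illustrative sketch: the prompts and the output path are placeholders.
+
+```python
+import json
+
+# Each line is one JSON object with a "prompt" field, which is what the custom dataset expects.
+prompts = [
+    "What is the capital of India?",
+    "What is the capital of Iran?",
+    "What is the capital of China?",
+]
+
+with open("data.jsonl", "w", encoding="utf-8") as f:
+    for prompt in prompts:
+        f.write(json.dumps({"prompt": prompt}) + "\n")
+```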
+ +### VisionArena Benchmark for Vision Language Models + +```bash +# need a model with vision capability here +vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests +``` + +```bash +python3 vllm/benchmarks/benchmark_serving.py \ + --backend openai-chat \ + --model Qwen/Qwen2-VL-7B-Instruct \ + --endpoint /v1/chat/completions \ + --dataset-name hf \ + --dataset-path lmarena-ai/VisionArena-Chat \ + --hf-split train \ + --num-prompts 1000 +``` + +### InstructCoder Benchmark with Speculative Decoding + +``` bash +VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \ + --speculative-config $'{"method": "ngram", + "num_speculative_tokens": 5, "prompt_lookup_max": 5, + "prompt_lookup_min": 2}' +``` + +``` bash +python3 benchmarks/benchmark_serving.py \ + --model meta-llama/Meta-Llama-3-8B-Instruct \ + --dataset-name hf \ + --dataset-path likaixin/InstructCoder \ + --num-prompts 2048 +``` + +### Other HuggingFaceDataset Examples + +```bash +vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests +``` + +**`lmms-lab/LLaVA-OneVision-Data`** + +```bash +python3 vllm/benchmarks/benchmark_serving.py \ + --backend openai-chat \ + --model Qwen/Qwen2-VL-7B-Instruct \ + --endpoint /v1/chat/completions \ + --dataset-name hf \ + --dataset-path lmms-lab/LLaVA-OneVision-Data \ + --hf-split train \ + --hf-subset "chart2text(cauldron)" \ + --num-prompts 10 +``` + +**`Aeala/ShareGPT_Vicuna_unfiltered`** + +```bash +python3 vllm/benchmarks/benchmark_serving.py \ + --backend openai-chat \ + --model Qwen/Qwen2-VL-7B-Instruct \ + --endpoint /v1/chat/completions \ + --dataset-name hf \ + --dataset-path Aeala/ShareGPT_Vicuna_unfiltered \ + --hf-split train \ + --num-prompts 10 +``` + +**`AI-MO/aimo-validation-aime`** + +``` bash +python3 vllm/benchmarks/benchmark_serving.py \ + --model Qwen/QwQ-32B \ + --dataset-name hf \ + --dataset-path AI-MO/aimo-validation-aime \ + --num-prompts 10 \ + --seed 42 +``` + +**`philschmid/mt-bench`** + +``` bash +python3 vllm/benchmarks/benchmark_serving.py \ + --model Qwen/QwQ-32B \ + --dataset-name hf \ + --dataset-path philschmid/mt-bench \ + --num-prompts 80 +``` + +### Running With Sampling Parameters + +When using OpenAI-compatible backends such as `vllm`, optional sampling +parameters can be specified. 
Example client command: + +```bash +python3 vllm/benchmarks/benchmark_serving.py \ + --backend vllm \ + --model NousResearch/Hermes-3-Llama-3.1-8B \ + --endpoint /v1/completions \ + --dataset-name sharegpt \ + --dataset-path /ShareGPT_V3_unfiltered_cleaned_split.json \ + --top-k 10 \ + --top-p 0.9 \ + --temperature 0.5 \ + --num-prompts 10 +``` + +--- +## Example - Offline Throughput Benchmark + +```bash +python3 vllm/benchmarks/benchmark_throughput.py \ + --model NousResearch/Hermes-3-Llama-3.1-8B \ + --dataset-name sonnet \ + --dataset-path vllm/benchmarks/sonnet.txt \ + --num-prompts 10 +``` + +If successful, you will see the following output + +``` +Throughput: 7.15 requests/s, 4656.00 total tokens/s, 1072.15 output tokens/s +Total num prompt tokens: 5014 +Total num output tokens: 1500 +``` + +### VisionArena Benchmark for Vision Language Models + +``` bash +python3 vllm/benchmarks/benchmark_throughput.py \ + --model Qwen/Qwen2-VL-7B-Instruct \ + --backend vllm-chat \ + --dataset-name hf \ + --dataset-path lmarena-ai/VisionArena-Chat \ + --num-prompts 1000 \ + --hf-split train +``` + +The `num prompt tokens` now includes image token counts + +``` +Throughput: 2.55 requests/s, 4036.92 total tokens/s, 326.90 output tokens/s +Total num prompt tokens: 14527 +Total num output tokens: 1280 +``` + +### InstructCoder Benchmark with Speculative Decoding + +``` bash +VLLM_WORKER_MULTIPROC_METHOD=spawn \ +VLLM_USE_V1=1 \ +python3 vllm/benchmarks/benchmark_throughput.py \ + --dataset-name=hf \ + --dataset-path=likaixin/InstructCoder \ + --model=meta-llama/Meta-Llama-3-8B-Instruct \ + --input-len=1000 \ + --output-len=100 \ + --num-prompts=2048 \ + --async-engine \ + --speculative-config $'{"method": "ngram", + "num_speculative_tokens": 5, "prompt_lookup_max": 5, + "prompt_lookup_min": 2}' +``` + +``` +Throughput: 104.77 requests/s, 23836.22 total tokens/s, 10477.10 output tokens/s +Total num prompt tokens: 261136 +Total num output tokens: 204800 +``` + +### Other HuggingFaceDataset Examples + +**`lmms-lab/LLaVA-OneVision-Data`** + +```bash +python3 vllm/benchmarks/benchmark_throughput.py \ + --model Qwen/Qwen2-VL-7B-Instruct \ + --backend vllm-chat \ + --dataset-name hf \ + --dataset-path lmms-lab/LLaVA-OneVision-Data \ + --hf-split train \ + --hf-subset "chart2text(cauldron)" \ + --num-prompts 10 +``` + +**`Aeala/ShareGPT_Vicuna_unfiltered`** + +```bash +python3 vllm/benchmarks/benchmark_throughput.py \ + --model Qwen/Qwen2-VL-7B-Instruct \ + --backend vllm-chat \ + --dataset-name hf \ + --dataset-path Aeala/ShareGPT_Vicuna_unfiltered \ + --hf-split train \ + --num-prompts 10 +``` + +**`AI-MO/aimo-validation-aime`** + +```bash +python3 benchmarks/benchmark_throughput.py \ + --model Qwen/QwQ-32B \ + --backend vllm \ + --dataset-name hf \ + --dataset-path AI-MO/aimo-validation-aime \ + --hf-split train \ + --num-prompts 10 +``` + +### Benchmark with LoRA Adapters + +``` bash +# download dataset +# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json +python3 vllm/benchmarks/benchmark_throughput.py \ + --model meta-llama/Llama-2-7b-hf \ + --backend vllm \ + --dataset_path /ShareGPT_V3_unfiltered_cleaned_split.json \ + --dataset_name sharegpt \ + --num-prompts 10 \ + --max-loras 2 \ + --max-lora-rank 8 \ + --enable-lora \ + --lora-path yard1/llama-2-7b-sql-lora-test + ``` diff --git a/benchmarks/auto_tune.sh b/benchmarks/auto_tune.sh new file mode 100644 index 000000000..1b01bbd61 --- /dev/null +++ 
b/benchmarks/auto_tune.sh @@ -0,0 +1,244 @@ +#!/bin/bash + +# This script aims to tune the best server parameter combinations to maximize throughput for given requirement. +# The current server parameter combination is max_num_seqs and max_num_batched_tokens +# It also supports additional requirement: e2e latency and prefix cache. + +# Pre-requisite: +# 1. Checkout to your branch, install/ update the correct running env. For TPU, activate conda env and install the corresponding torch, xla version. +# 2. If the model is customized, replace the MODEL's config with the customized config. +# 3. Set variables (ALL REQUIRED) +# BASE: your directory for vllm repo +# MODEL: the model served by vllm +# TP: ways of tensor parallelism +# DOWNLOAD_DIR: directory to download and load model weights. +# INPUT_LEN: request input len +# OUTPUT_LEN: request output len +# MIN_CACHE_HIT_PCT: prefix cache rate +# MAX_LATENCY_ALLOWED_MS: (e2e) latency requirement. If there's no latency requirement, set it to a large number like 1000000000 +# NUM_SEQS_LIST: a list of `max-num-seqs` you want to loop with. +# NUM_BATCHED_TOKENS_LIST: a list of `max-num-batched-tokens` you want to loop with. +# Note that the default NUM_SEQS_LIST and NUM_BATCHED_TOKENS_LIST are set for medium size input/output len, for extra short context (such as 20:20), you might need to include larger numbers in NUM_SEQS_LIST. +# 4. Run the script, it might take a long time, you can use tmux to avoid the script stop if disconnection happens. +# 5. The final result will be saved in RESULT file. + + +# Example use cases +# 1. Given input_len=1800, output_len=20, what's the best max_num_seqs and max_num_batched_tokens to get highest throughput? +# Use INPUT_LEN=1800, OUTPUT_LEN=20, MIN_CACHE_HIT_PCT=0, MAX_LATENCY_ALLOWED_MS=100000000000 +# 2. If we have latency requirement to be lower than 500ms, what's the best server parameter? +# Use INPUT_LEN=1800, OUTPUT_LEN=20, MIN_CACHE_HIT_PCT=0, MAX_LATENCY_ALLOWED_MS=500 +# 3. If we want to reach 60% prefix cache, what's the best server parameter? +# Use INPUT_LEN=1800, OUTPUT_LEN=20, MIN_CACHE_HIT_PCT=60, MAX_LATENCY_ALLOWED_MS=500 + +TAG=$(date +"%Y_%m_%d_%H_%M") +BASE="" +MODEL="meta-llama/Llama-3.1-8B-Instruct" +TP=1 +DOWNLOAD_DIR="" +INPUT_LEN=4000 +OUTPUT_LEN=16 +MIN_CACHE_HIT_PCT=0 +MAX_LATENCY_ALLOWED_MS=100000000000 +NUM_SEQS_LIST="128 256" +NUM_BATCHED_TOKENS_LIST="512 1024 2048 4096" + +LOG_FOLDER="$BASE/auto-benchmark/$TAG" +RESULT="$LOG_FOLDER/result.txt" + +echo "result file: $RESULT" +echo "model: $MODEL" + +rm -rf $LOG_FOLDER +mkdir -p $LOG_FOLDER + +cd "$BASE/vllm" + +pip install -q datasets + +current_hash=$(git rev-parse HEAD) +echo "hash:$current_hash" >> "$RESULT" +echo "current_hash: $current_hash" + +best_throughput=0 +best_max_num_seqs=0 +best_num_batched_tokens=0 +best_goodput=0 + +start_server() { + local gpu_memory_utilization=$1 + local max_num_seqs=$2 + local max_num_batched_tokens=$3 + local vllm_log=$4 + + pkill -f vllm + + VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 vllm serve $MODEL \ + --disable-log-requests \ + --port 8004 \ + --gpu-memory-utilization $gpu_memory_utilization \ + --max-num-seqs $max_num_seqs \ + --max-num-batched-tokens $max_num_batched_tokens \ + --tensor-parallel-size $TP \ + --enable-prefix-caching \ + --load-format dummy \ + --download-dir "$DOWNLOAD_DIR" \ + --max-model-len $(( INPUT_LEN+OUTPUT_LEN )) > "$vllm_log" 2>&1 & + + # wait for 10 minutes... 
+ server_started=0 + for i in {1..60}; do + RESPONSE=$(curl -s -X GET "http://0.0.0.0:8004/health" -w "%{http_code}" -o /dev/stdout) + STATUS_CODE=$(echo "$RESPONSE" | tail -n 1) + if [[ "$STATUS_CODE" -eq 200 ]]; then + server_started=1 + break + else + sleep 10 + fi + done + if (( ! server_started )); then + echo "server did not start within 10 minutes. Please check server log at $vllm_log". + return 1 + else + return 0 + fi +} + +run_benchmark() { + local max_num_seqs=$1 + local max_num_batched_tokens=$2 + local gpu_memory_utilization=$3 + echo "max_num_seq: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens" + local vllm_log="$LOG_FOLDER/vllm_log_${max_num_seqs}_${max_num_batched_tokens}.txt" + echo "vllm_log: $vllm_log" + echo + rm -f $vllm_log + pkill -f vllm + + echo "starting server..." + start_server $gpu_memory_utilization $max_num_seqs $max_num_batched_tokens $vllm_log + result=$? + if [[ "$result" -eq 1 ]]; then + echo "server failed to start. gpu_memory_utilization:$gpu_memory_utilization, max_num_seqs:$max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens" + else + echo "server started." + fi + echo + + echo "run benchmark test..." + meet_latency_requirement=0 + # get a basic qps by using request-rate inf + bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_inf.txt" + prefix_len=$(( INPUT_LEN * MIN_CACHE_HIT_PCT / 100 )) + python benchmarks/benchmark_serving.py \ + --backend vllm \ + --model $MODEL \ + --dataset-name random \ + --random-input-len $INPUT_LEN \ + --random-output-len $OUTPUT_LEN \ + --ignore-eos \ + --disable-tqdm \ + --request-rate inf \ + --percentile-metrics ttft,tpot,itl,e2el \ + --goodput e2el:$MAX_LATENCY_ALLOWED_MS \ + --num-prompts 1000 \ + --random-prefix-len $prefix_len \ + --port 8004 &> "$bm_log" + throughput=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g') + e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}') + goodput=$(grep "Request goodput (req/s):" "$bm_log" | sed 's/[^0-9.]//g') + + if (( $(echo "$e2el <= $MAX_LATENCY_ALLOWED_MS" | bc -l) )); then + meet_latency_requirement=1 + request_rate=inf + fi + + if (( ! meet_latency_requirement )); then + # start from request-rate as int(throughput) + 1 + request_rate=$((${throughput%.*} + 1)) + while ((request_rate > 0)); do + # clear prefix cache + curl -X POST http://0.0.0.0:8004/reset_prefix_cache + sleep 5 + bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_${request_rate}.txt" + python benchmarks/benchmark_serving.py \ + --backend vllm \ + --model $MODEL \ + --dataset-name random \ + --random-input-len $INPUT_LEN \ + --random-output-len $OUTPUT_LEN \ + --ignore-eos \ + --disable-tqdm \ + --request-rate $request_rate \ + --percentile-metrics ttft,tpot,itl,e2el \ + --goodput e2el:$MAX_LATENCY_ALLOWED_MS \ + --num-prompts 100 \ + --random-prefix-len $prefix_len \ + --port 8004 &> "$bm_log" + throughput=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g') + e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}') + goodput=$(grep "Request goodput (req/s):" "$bm_log" | sed 's/[^0-9.]//g') + if (( $(echo "$e2el <= $MAX_LATENCY_ALLOWED_MS" | bc -l) )); then + meet_latency_requirement=1 + break + fi + request_rate=$((request_rate-1)) + done + fi + # write the results and update the best result. 
+ if ((meet_latency_requirement)); then + echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens, request_rate: $request_rate, e2el: $e2el, throughput: $throughput, goodput: $goodput" + echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens, request_rate: $request_rate, e2el: $e2el, throughput: $throughput, goodput: $goodput" >> "$RESULT" + if (( $(echo "$throughput > $best_throughput" | bc -l) )); then + best_throughput=$throughput + best_max_num_seqs=$max_num_seqs + best_num_batched_tokens=$max_num_batched_tokens + best_goodput=$goodput + fi + else + echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens does not meet latency requirement ${MAX_LATENCY_ALLOWED_MS}" + echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens does not meet latency requirement ${MAX_LATENCY_ALLOWED_MS}" >> "$RESULT" + fi + + echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput" + + pkill vllm + sleep 10 + printf '=%.0s' $(seq 1 20) + return 0 +} + +read -r -a num_seqs_list <<< "$NUM_SEQS_LIST" +read -r -a num_batched_tokens_list <<< "$NUM_BATCHED_TOKENS_LIST" + +# first find out the max gpu-memory-utilization without HBM OOM. +gpu_memory_utilization=0.98 +find_gpu_memory_utilization=0 +while (( $(echo "$gpu_memory_utilization >= 0.9" | bc -l) )); do + start_server $gpu_memory_utilization "${num_seqs_list[-1]}" "${num_batched_tokens_list[-1]}" "$LOG_FOLDER/vllm_log_gpu_memory_utilization_$gpu_memory_utilization.log" + result=$? + if [[ "$result" -eq 0 ]]; then + find_gpu_memory_utilization=1 + break + else + gpu_memory_utilization=$(echo "$gpu_memory_utilization - 0.01" | bc) + fi +done + +if [[ "$find_gpu_memory_utilization" -eq 1 ]]; then + echo "Using gpu_memory_utilization=$gpu_memory_utilization to serve model." +else + echo "Cannot find a proper gpu_memory_utilization over 0.9 to serve the model, please check logs in $LOG_FOLDER." + exit 1 +fi + +for num_seqs in "${num_seqs_list[@]}"; do + for num_batched_tokens in "${num_batched_tokens_list[@]}"; do + run_benchmark $num_seqs $num_batched_tokens $gpu_memory_utilization + done +done +echo "finish permutations" +echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput" +echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput" >> "$RESULT" + diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py new file mode 100644 index 000000000..ddb38e304 --- /dev/null +++ b/benchmarks/backend_request_func.py @@ -0,0 +1,622 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import io +import json +import os +import sys +import time +import traceback +from dataclasses import dataclass, field +from typing import Optional, Union + +import aiohttp +import huggingface_hub.constants +from tqdm.asyncio import tqdm +from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast + +# NOTE(simon): do not import vLLM here so the benchmark script +# can run without vLLM installed. 
+ +AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60) + + +@dataclass +class RequestFuncInput: + prompt: str + api_url: str + prompt_len: int + output_len: int + model: str + model_name: Optional[str] = None + logprobs: Optional[int] = None + extra_body: Optional[dict] = None + multi_modal_content: Optional[dict] = None + ignore_eos: bool = False + language: Optional[str] = None + + +@dataclass +class RequestFuncOutput: + generated_text: str = "" + success: bool = False + latency: float = 0.0 + output_tokens: int = 0 + ttft: float = 0.0 # Time to first token + itl: list[float] = field(default_factory=list) # list of inter-token latencies + tpot: float = 0.0 # avg next-token latencies + prompt_len: int = 0 + error: str = "" + + +async def async_request_tgi( + request_func_input: RequestFuncInput, + pbar: Optional[tqdm] = None, +) -> RequestFuncOutput: + api_url = request_func_input.api_url + assert api_url.endswith("generate_stream") + + async with aiohttp.ClientSession( + trust_env=True, timeout=AIOHTTP_TIMEOUT + ) as session: + params = { + "max_new_tokens": request_func_input.output_len, + "do_sample": True, + "temperature": 0.01, # TGI does not accept 0.0 temperature. + "top_p": 0.99, # TGI does not accept 1.0 top_p. + "truncate": request_func_input.prompt_len, + "ignore_eos_token": request_func_input.ignore_eos, + } + payload = { + "inputs": request_func_input.prompt, + "parameters": params, + } + output = RequestFuncOutput() + output.prompt_len = request_func_input.prompt_len + if request_func_input.ignore_eos: + output.output_tokens = request_func_input.output_len + else: + output.output_tokens = None + + ttft = 0.0 + st = time.perf_counter() + most_recent_timestamp = st + try: + async with session.post(url=api_url, json=payload) as response: + if response.status == 200: + async for chunk_bytes in response.content: + chunk_bytes = chunk_bytes.strip() + if not chunk_bytes: + continue + chunk_bytes = chunk_bytes.decode("utf-8") + + # NOTE: Sometimes TGI returns a ping response without + # any data, we should skip it. 
+ if chunk_bytes.startswith(":"): + continue + chunk = chunk_bytes.removeprefix("data:") + + data = json.loads(chunk) + timestamp = time.perf_counter() + # First token + if ttft == 0.0: + ttft = time.perf_counter() - st + output.ttft = ttft + + # Decoding phase + else: + output.itl.append(timestamp - most_recent_timestamp) + + most_recent_timestamp = timestamp + + output.latency = most_recent_timestamp - st + output.success = True + output.generated_text = data["generated_text"] + else: + output.error = response.reason or "" + output.success = False + except Exception: + output.success = False + exc_info = sys.exc_info() + output.error = "".join(traceback.format_exception(*exc_info)) + + if pbar: + pbar.update(1) + return output + + +async def async_request_trt_llm( + request_func_input: RequestFuncInput, + pbar: Optional[tqdm] = None, +) -> RequestFuncOutput: + api_url = request_func_input.api_url + assert api_url.endswith("generate_stream") + + async with aiohttp.ClientSession( + trust_env=True, timeout=AIOHTTP_TIMEOUT + ) as session: + payload = { + "accumulate_tokens": True, + "text_input": request_func_input.prompt, + "temperature": 0.0, + "top_p": 1.0, + "max_tokens": request_func_input.output_len, + "stream": True, + } + if request_func_input.ignore_eos: + payload["min_length"] = request_func_input.output_len + output = RequestFuncOutput() + output.prompt_len = request_func_input.prompt_len + + ttft = 0.0 + st = time.perf_counter() + most_recent_timestamp = st + try: + async with session.post(url=api_url, json=payload) as response: + if response.status == 200: + async for chunk_bytes in response.content: + chunk_bytes = chunk_bytes.strip() + if not chunk_bytes: + continue + + chunk = chunk_bytes.decode("utf-8").removeprefix("data:") + + data = json.loads(chunk) + output.generated_text += data["text_output"] + timestamp = time.perf_counter() + # First token + if ttft == 0.0: + ttft = timestamp - st + output.ttft = ttft + + # Decoding phase + else: + output.itl.append(timestamp - most_recent_timestamp) + + most_recent_timestamp = timestamp + + output.latency = most_recent_timestamp - st + output.success = True + + else: + output.error = response.reason or "" + output.success = False + except Exception: + output.success = False + exc_info = sys.exc_info() + output.error = "".join(traceback.format_exception(*exc_info)) + + if pbar: + pbar.update(1) + return output + + +async def async_request_deepspeed_mii( + request_func_input: RequestFuncInput, + pbar: Optional[tqdm] = None, +) -> RequestFuncOutput: + api_url = request_func_input.api_url + assert api_url.endswith(("completions", "profile")), ( + "OpenAI Completions API URL must end with 'completions' or 'profile'." + ) + + async with aiohttp.ClientSession( + trust_env=True, timeout=AIOHTTP_TIMEOUT + ) as session: + payload = { + "model": request_func_input.model, + "prompt": request_func_input.prompt, + "max_tokens": request_func_input.output_len, + "temperature": 0.01, # deepspeed-mii does not accept 0.0 temp. + "top_p": 1.0, + } + headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"} + + output = RequestFuncOutput() + output.prompt_len = request_func_input.prompt_len + + # NOTE: DeepSpeed-MII doesn't support streaming as of Jan 28 2024, + # will use 0 as placeholder. 
+ # See https://github.com/microsoft/DeepSpeed-MII/pull/311 + output.ttft = 0 + + st = time.perf_counter() + try: + async with session.post( + url=api_url, json=payload, headers=headers + ) as response: + if response.status == 200: + parsed_resp = await response.json() + output.latency = time.perf_counter() - st + if "choices" in parsed_resp: + output.generated_text = parsed_resp["choices"][0]["text"] + elif "text" in parsed_resp: + output.generated_text = parsed_resp["text"][0] + else: + output.error = ( + "Unexpected response format: " + "neither 'choices' nor 'text' found" + ) + output.success = False + output.success = True + else: + output.error = response.reason or "" + output.success = False + except Exception: + output.success = False + exc_info = sys.exc_info() + output.error = "".join(traceback.format_exception(*exc_info)) + + if pbar: + pbar.update(1) + return output + + +async def async_request_openai_completions( + request_func_input: RequestFuncInput, + pbar: Optional[tqdm] = None, +) -> RequestFuncOutput: + api_url = request_func_input.api_url + assert api_url.endswith(("completions", "profile")), ( + "OpenAI Completions API URL must end with 'completions' or 'profile'." + ) + + async with aiohttp.ClientSession( + trust_env=True, timeout=AIOHTTP_TIMEOUT + ) as session: + payload = { + "model": request_func_input.model_name + if request_func_input.model_name + else request_func_input.model, + "prompt": request_func_input.prompt, + "temperature": 0.0, + "repetition_penalty": 1.0, + "max_tokens": request_func_input.output_len, + "logprobs": request_func_input.logprobs, + "stream": True, + "stream_options": { + "include_usage": True, + }, + } + if request_func_input.ignore_eos: + payload["ignore_eos"] = request_func_input.ignore_eos + if request_func_input.extra_body: + payload.update(request_func_input.extra_body) + headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"} + + output = RequestFuncOutput() + output.prompt_len = request_func_input.prompt_len + + generated_text = "" + st = time.perf_counter() + most_recent_timestamp = st + try: + async with session.post( + url=api_url, json=payload, headers=headers + ) as response: + if response.status == 200: + first_chunk_received = False + async for chunk_bytes in response.content: + chunk_bytes = chunk_bytes.strip() + if not chunk_bytes: + continue + + chunk = chunk_bytes.decode("utf-8").removeprefix("data: ") + if chunk != "[DONE]": + data = json.loads(chunk) + + # NOTE: Some completion API might have a last + # usage summary response without a token so we + # want to check a token was generated + if choices := data.get("choices"): + # Note that text could be empty here + # e.g. for special tokens + text = choices[0].get("text") + timestamp = time.perf_counter() + # First token + if not first_chunk_received: + first_chunk_received = True + ttft = time.perf_counter() - st + output.ttft = ttft + + # Decoding phase + else: + output.itl.append(timestamp - most_recent_timestamp) + + most_recent_timestamp = timestamp + generated_text += text or "" + if usage := data.get("usage"): + output.output_tokens = usage.get("completion_tokens") + if first_chunk_received: + output.success = True + else: + output.success = False + output.error = ( + "Never received a valid chunk to calculate TTFT." + "This response will be marked as failed!" 
+ ) + output.generated_text = generated_text + output.latency = most_recent_timestamp - st + else: + output.error = response.reason or "" + output.success = False + except Exception: + output.success = False + exc_info = sys.exc_info() + output.error = "".join(traceback.format_exception(*exc_info)) + + if pbar: + pbar.update(1) + return output + + +async def async_request_openai_chat_completions( + request_func_input: RequestFuncInput, + pbar: Optional[tqdm] = None, +) -> RequestFuncOutput: + api_url = request_func_input.api_url + assert api_url.endswith(("chat/completions", "profile")), ( + "OpenAI Chat Completions API URL must end with 'chat/completions'." + ) + + async with aiohttp.ClientSession( + trust_env=True, timeout=AIOHTTP_TIMEOUT + ) as session: + content = [{"type": "text", "text": request_func_input.prompt}] + if request_func_input.multi_modal_content: + content.append(request_func_input.multi_modal_content) + payload = { + "model": request_func_input.model_name + if request_func_input.model_name + else request_func_input.model, + "messages": [ + {"role": "user", "content": content}, + ], + "temperature": 0.0, + "max_completion_tokens": request_func_input.output_len, + "stream": True, + "stream_options": { + "include_usage": True, + }, + } + if request_func_input.ignore_eos: + payload["ignore_eos"] = request_func_input.ignore_eos + if request_func_input.extra_body: + payload.update(request_func_input.extra_body) + headers = { + "Content-Type": "application/json", + "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}", + } + + output = RequestFuncOutput() + output.prompt_len = request_func_input.prompt_len + + generated_text = "" + ttft = 0.0 + st = time.perf_counter() + most_recent_timestamp = st + try: + async with session.post( + url=api_url, json=payload, headers=headers + ) as response: + if response.status == 200: + async for chunk_bytes in response.content: + chunk_bytes = chunk_bytes.strip() + if not chunk_bytes: + continue + + chunk = chunk_bytes.decode("utf-8").removeprefix("data: ") + if chunk != "[DONE]": + timestamp = time.perf_counter() + data = json.loads(chunk) + + if choices := data.get("choices"): + content = choices[0]["delta"].get("content") + # First token + if ttft == 0.0: + ttft = timestamp - st + output.ttft = ttft + + # Decoding phase + else: + output.itl.append(timestamp - most_recent_timestamp) + + generated_text += content or "" + elif usage := data.get("usage"): + output.output_tokens = usage.get("completion_tokens") + + most_recent_timestamp = timestamp + + output.generated_text = generated_text + output.success = True + output.latency = most_recent_timestamp - st + else: + output.error = response.reason or "" + output.success = False + except Exception: + output.success = False + exc_info = sys.exc_info() + output.error = "".join(traceback.format_exception(*exc_info)) + + if pbar: + pbar.update(1) + return output + + +async def async_request_openai_audio( + request_func_input: RequestFuncInput, + pbar: Optional[tqdm] = None, +) -> RequestFuncOutput: + # Lazy import without PlaceholderModule to avoid vllm dep. + import soundfile + + api_url = request_func_input.api_url + assert api_url.endswith(("transcriptions", "translations")), ( + "OpenAI Chat Completions API URL must end with 'transcriptions' " + ) + "or `translations`." 
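+    # NOTE: Unlike the JSON-based backends above, this endpoint submits the
+    # request as multipart/form-data, so the request parameters are flattened
+    # into individual form fields further below.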
+ + async with aiohttp.ClientSession( + trust_env=True, timeout=AIOHTTP_TIMEOUT + ) as session: + content = [{"type": "text", "text": request_func_input.prompt}] + payload = { + "model": request_func_input.model_name + if request_func_input.model_name + else request_func_input.model, + "temperature": 0.0, + "max_completion_tokens": request_func_input.output_len, + "stream": True, + "language": "en", + # Flattened due to multipart/form-data + "stream_include_usage": True, + "stream_continuous_usage_stats": True, + } + if request_func_input.extra_body: + payload.update(request_func_input.extra_body) + headers = { + "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}", + } + + # Send audio file + def to_bytes(y, sr): + buffer = io.BytesIO() + soundfile.write(buffer, y, sr, format="WAV") + buffer.seek(0) + return buffer + + with to_bytes(*request_func_input.multi_modal_content["audio"]) as f: + form = aiohttp.FormData() + form.add_field("file", f, content_type="audio/wav") + for key, value in payload.items(): + form.add_field(key, str(value)) + + output = RequestFuncOutput() + output.prompt_len = request_func_input.prompt_len + + generated_text = "" + ttft = 0.0 + st = time.perf_counter() + most_recent_timestamp = st + try: + async with session.post( + url=api_url, data=form, headers=headers + ) as response: + if response.status == 200: + async for chunk_bytes in response.content: + chunk_bytes = chunk_bytes.strip() + if not chunk_bytes: + continue + + chunk = chunk_bytes.decode("utf-8").removeprefix("data: ") + if chunk != "[DONE]": + timestamp = time.perf_counter() + data = json.loads(chunk) + + if choices := data.get("choices"): + content = choices[0]["delta"].get("content") + # First token + if ttft == 0.0: + ttft = timestamp - st + output.ttft = ttft + + # Decoding phase + else: + output.itl.append( + timestamp - most_recent_timestamp + ) + + generated_text += content or "" + elif usage := data.get("usage"): + output.output_tokens = usage.get( + "completion_tokens" + ) + + most_recent_timestamp = timestamp + + output.generated_text = generated_text + output.success = True + output.latency = most_recent_timestamp - st + else: + output.error = response.reason or "" + output.success = False + except Exception: + output.success = False + exc_info = sys.exc_info() + output.error = "".join(traceback.format_exception(*exc_info)) + + if pbar: + pbar.update(1) + return output + + +def get_model(pretrained_model_name_or_path: str) -> str: + if os.getenv("VLLM_USE_MODELSCOPE", "False").lower() == "true": + from modelscope import snapshot_download + + from vllm.model_executor.model_loader.weight_utils import get_lock + + # Use file lock to prevent multiple processes from + # downloading the same model weights at the same time. 
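+        # Weight files (*.pt, *.safetensors, *.bin) are excluded from the
+        # download below; only tokenizer/config files are needed here.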
+ with get_lock(pretrained_model_name_or_path): + model_path = snapshot_download( + model_id=pretrained_model_name_or_path, + local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE, + ignore_file_pattern=[".*.pt", ".*.safetensors", ".*.bin"], + ) + + return model_path + return pretrained_model_name_or_path + + +def get_tokenizer( + pretrained_model_name_or_path: str, + tokenizer_mode: str = "auto", + trust_remote_code: bool = False, + **kwargs, +) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]: + if pretrained_model_name_or_path is not None and not os.path.exists( + pretrained_model_name_or_path + ): + pretrained_model_name_or_path = get_model(pretrained_model_name_or_path) + if tokenizer_mode == "slow": + if kwargs.get("use_fast", False): + raise ValueError("Cannot use the fast tokenizer in slow tokenizer mode.") + kwargs["use_fast"] = False + if tokenizer_mode == "mistral": + try: + from vllm.transformers_utils.tokenizer import MistralTokenizer + except ImportError as e: + raise ImportError( + "MistralTokenizer requires vllm package.\n" + "Please install it with `pip install vllm` " + "to use mistral tokenizer mode." + ) from e + return MistralTokenizer.from_pretrained(str(pretrained_model_name_or_path)) + else: + return AutoTokenizer.from_pretrained( + pretrained_model_name_or_path, + trust_remote_code=trust_remote_code, + **kwargs, + ) + + +ASYNC_REQUEST_FUNCS = { + "tgi": async_request_tgi, + "vllm": async_request_openai_completions, + "lmdeploy": async_request_openai_completions, + "deepspeed-mii": async_request_deepspeed_mii, + "openai": async_request_openai_completions, + "openai-chat": async_request_openai_chat_completions, + "openai-audio": async_request_openai_audio, + "tensorrt-llm": async_request_trt_llm, + "scalellm": async_request_openai_completions, + "sglang": async_request_openai_completions, + "llama.cpp": async_request_openai_completions, +} + +OPENAI_COMPATIBLE_BACKENDS = [ + k + for k, v in ASYNC_REQUEST_FUNCS.items() + if v in (async_request_openai_completions, async_request_openai_chat_completions) +] diff --git a/benchmarks/benchmark_dataset.py b/benchmarks/benchmark_dataset.py new file mode 100644 index 000000000..5d2a26cd4 --- /dev/null +++ b/benchmarks/benchmark_dataset.py @@ -0,0 +1,1167 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +This module defines a framework for sampling benchmark requests from various +datasets. Each dataset subclass of BenchmarkDataset must implement sample +generation. 
Supported dataset types include: + - ShareGPT + - Random (synthetic) + - Sonnet + - BurstGPT + - HuggingFace + - VisionArena +""" + +import base64 +import io +import json +import logging +import random +from abc import ABC, abstractmethod +from collections.abc import Mapping +from dataclasses import dataclass +from functools import cache +from io import BytesIO +from typing import Any, Callable, Optional, Union + +import numpy as np +import pandas as pd +from datasets import load_dataset +from PIL import Image +from transformers import PreTrainedTokenizerBase + +from vllm.lora.request import LoRARequest +from vllm.lora.utils import get_adapter_absolute_path +from vllm.multimodal import MultiModalDataDict +from vllm.multimodal.image import convert_image_mode +from vllm.transformers_utils.tokenizer import AnyTokenizer, get_lora_tokenizer + +logger = logging.getLogger(__name__) + +# ----------------------------------------------------------------------------- +# Data Classes +# ----------------------------------------------------------------------------- + + +@dataclass +class SampleRequest: + """ + Represents a single inference request for benchmarking. + """ + + prompt: Union[str, Any] + prompt_len: int + expected_output_len: int + multi_modal_data: Optional[Union[MultiModalDataDict, dict]] = None + lora_request: Optional[LoRARequest] = None + + +# ----------------------------------------------------------------------------- +# Benchmark Dataset Base Class +# ----------------------------------------------------------------------------- + + +class BenchmarkDataset(ABC): + DEFAULT_SEED = 0 + IS_MULTIMODAL = False + + def __init__( + self, + dataset_path: Optional[str] = None, + random_seed: int = DEFAULT_SEED, + ) -> None: + """ + Initialize the BenchmarkDataset with an optional dataset path and random + seed. Args: + dataset_path (Optional[str]): Path to the dataset. If None, it + indicates that a default or random dataset might be used. + random_seed (int): Seed value for reproducible shuffling or + sampling. Defaults to DEFAULT_SEED. + """ + self.dataset_path = dataset_path + # Set the random seed, ensuring that a None value is replaced with the + # default seed. + self.random_seed = random_seed if random_seed is not None else self.DEFAULT_SEED + self.data = None + + def apply_multimodal_chat_transformation( + self, prompt: str, mm_content: Optional[MultiModalDataDict] = None + ) -> list[dict]: + """ + Transform a prompt and optional multimodal content into a chat format. + This method is used for chat models that expect a specific conversation + format. + """ + content = [{"text": prompt, "type": "text"}] + if mm_content is not None: + content.append(mm_content) + return [{"role": "user", "content": content}] + + def load_data(self) -> None: + """ + Load data from the dataset path into self.data. + + This method must be overridden by subclasses since the method to load + data will vary depending on the dataset format and source. + + Raises: + NotImplementedError: If a subclass does not implement this method. + """ + # TODO (jenniferzhao): add support for downloading data + raise NotImplementedError("load_data must be implemented in subclasses.") + + def get_random_lora_request( + self, + tokenizer: PreTrainedTokenizerBase, + max_loras: Optional[int] = None, + lora_path: Optional[str] = None, + ) -> tuple[Optional[LoRARequest], AnyTokenizer]: + """ + Optionally select a random LoRA request and return its associated + tokenizer. + + This method is used when LoRA parameters are provided. 
It randomly + selects a LoRA based on max_loras and retrieves a cached tokenizer for + that LoRA if available. Otherwise, it returns the base tokenizer. + + Args: + tokenizer (PreTrainedTokenizerBase): The base tokenizer to use if no + LoRA is selected. max_loras (Optional[int]): The maximum number of + LoRAs available. If None, LoRA is not used. lora_path + (Optional[str]): Path to the LoRA parameters on disk. If None, LoRA + is not used. + + Returns: + tuple[Optional[LoRARequest], AnyTokenizer]: A tuple where the first + element is a LoRARequest (or None if not applicable) and the second + element is the tokenizer associated with the LoRA request (or the + base tokenizer). + """ + if max_loras is None or lora_path is None: + return None, tokenizer + + # Generate a random LoRA ID in the range [1, max_loras]. + lora_id = random.randint(1, max_loras) + lora_request = LoRARequest( + lora_name=str(lora_id), + lora_int_id=lora_id, + lora_path=lora_path_on_disk(lora_path), + ) + if lora_id not in lora_tokenizer_cache: + lora_tokenizer_cache[lora_id] = get_lora_tokenizer(lora_request) + # Return lora_request and the cached tokenizer if available; otherwise, + # return the base tokenizer + return lora_request, lora_tokenizer_cache[lora_id] or tokenizer + + @abstractmethod + def sample( + self, tokenizer: PreTrainedTokenizerBase, num_requests: int + ) -> list[SampleRequest]: + """ + Abstract method to generate sample requests from the dataset. + + Subclasses must override this method to implement dataset-specific logic + for generating a list of SampleRequest objects. + + Args: + tokenizer (PreTrainedTokenizerBase): The tokenizer to be used + for processing the dataset's text. + num_requests (int): The number of sample requests to generate. + + Returns: + list[SampleRequest]: A list of sample requests generated from the + dataset. + """ + raise NotImplementedError("sample must be implemented in subclasses.") + + def maybe_oversample_requests( + self, requests: list[SampleRequest], num_requests: int + ) -> None: + """ + Oversamples the list of requests if its size is less than the desired + number. + + Args: + requests (List[SampleRequest]): The current list of sampled + requests. num_requests (int): The target number of requests. + """ + if len(requests) < num_requests: + random.seed(self.random_seed) + additional = random.choices(requests, k=num_requests - len(requests)) + requests.extend(additional) + logger.info("Oversampled requests to reach %d total samples.", num_requests) + + +# ----------------------------------------------------------------------------- +# Utility Functions and Global Caches +# ----------------------------------------------------------------------------- + + +def is_valid_sequence( + prompt_len: int, + output_len: int, + min_len: int = 4, + max_prompt_len: int = 1024, + max_total_len: int = 2048, + skip_min_output_len_check: bool = False, +) -> bool: + """ + Validate a sequence based on prompt and output lengths. + + Default pruning criteria are copied from the original `sample_hf_requests` + and `sample_sharegpt_requests` functions in benchmark_serving.py, as well as + from `sample_requests` in benchmark_throughput.py. 
+ """ + # Check for invalid conditions + prompt_too_short = prompt_len < min_len + output_too_short = (not skip_min_output_len_check) and (output_len < min_len) + prompt_too_long = prompt_len > max_prompt_len + combined_too_long = (prompt_len + output_len) > max_total_len + + # Return True if none of the invalid conditions are met + return not ( + prompt_too_short or output_too_short or prompt_too_long or combined_too_long + ) + + +@cache +def lora_path_on_disk(lora_path: str) -> str: + return get_adapter_absolute_path(lora_path) + + +# Global cache for LoRA tokenizers. +lora_tokenizer_cache: dict[int, AnyTokenizer] = {} + + +def process_image(image: Any) -> Mapping[str, Any]: + """ + Process a single image input and return a multimedia content dictionary. + + Supports three input types: + + 1. Dictionary with raw image bytes: - Expects a dict with a 'bytes' key + containing raw image data. - Loads the bytes as a PIL.Image.Image. + + 2. PIL.Image.Image input: - Converts the image to RGB. - Saves the image as + a JPEG in memory. - Encodes the JPEG data as a base64 string. - Returns + a dictionary with the image as a base64 data URL. + + 3. String input: - Treats the string as a URL or local file path. - + Prepends "file://" if the string doesn't start with "http://" or + "file://". - Returns a dictionary with the image URL. + + Raises: + ValueError: If the input is not a supported type. + """ + if isinstance(image, dict) and "bytes" in image: + image = Image.open(BytesIO(image["bytes"])) + if isinstance(image, Image.Image): + image = convert_image_mode(image, "RGB") + with io.BytesIO() as image_data: + image.save(image_data, format="JPEG") + image_base64 = base64.b64encode(image_data.getvalue()).decode("utf-8") + return { + "type": "image_url", + "image_url": {"url": f"data:image/jpeg;base64,{image_base64}"}, + } + + if isinstance(image, str): + image_url = ( + image if image.startswith(("http://", "file://")) else f"file://{image}" + ) + return {"type": "image_url", "image_url": {"url": image_url}} + + raise ValueError( + f"Invalid image input {image}. Must be a PIL.Image.Image" + " or str or dictionary with raw image bytes." + ) + + +# ----------------------------------------------------------------------------- +# Random Dataset Implementation (Synthetic Data) +# ----------------------------------------------------------------------------- + + +class RandomDataset(BenchmarkDataset): + # Default values copied from benchmark_serving.py for the random dataset. 
+ DEFAULT_PREFIX_LEN = 0 + DEFAULT_RANGE_RATIO = 0.0 + DEFAULT_INPUT_LEN = 1024 + DEFAULT_OUTPUT_LEN = 128 + + def __init__( + self, + **kwargs, + ) -> None: + super().__init__(**kwargs) + + def sample( + self, + tokenizer: PreTrainedTokenizerBase, + num_requests: int, + prefix_len: int = DEFAULT_PREFIX_LEN, + range_ratio: float = DEFAULT_RANGE_RATIO, + input_len: int = DEFAULT_INPUT_LEN, + output_len: int = DEFAULT_OUTPUT_LEN, + **kwargs, + ) -> list[SampleRequest]: + # Enforce range_ratio < 1 + assert range_ratio < 1.0, ( + "random_range_ratio must be < 1.0 to ensure a valid sampling range" + ) + + vocab_size = tokenizer.vocab_size + num_special_tokens = tokenizer.num_special_tokens_to_add() + real_input_len = input_len - num_special_tokens + + prefix_token_ids = ( + np.random.randint(0, vocab_size, size=prefix_len).tolist() + if prefix_len > 0 + else [] + ) + + # New sampling logic: [X * (1 - b), X * (1 + b)] + input_low = int(real_input_len * (1 - range_ratio)) + input_high = int(real_input_len * (1 + range_ratio)) + output_low = int(output_len * (1 - range_ratio)) + output_high = int(output_len * (1 + range_ratio)) + + # Add logging for debugging + logger.info("Sampling input_len from [%s, %s]", input_low, input_high) + logger.info("Sampling output_len from [%s, %s]", output_low, output_high) + + input_lens = np.random.randint(input_low, input_high + 1, size=num_requests) + output_lens = np.random.randint(output_low, output_high + 1, size=num_requests) + offsets = np.random.randint(0, vocab_size, size=num_requests) + + requests = [] + for i in range(num_requests): + inner_seq = ( + (offsets[i] + i + np.arange(input_lens[i])) % vocab_size + ).tolist() + token_sequence = prefix_token_ids + inner_seq + prompt = tokenizer.decode(token_sequence) + # After decoding the prompt we have to encode and decode it again. + # This is done because in some cases N consecutive tokens + # give a string tokenized into != N number of tokens. + # For example for GPT2Tokenizer: + # [6880, 6881] -> ['Ġcalls', 'here'] -> + # [1650, 939, 486] -> ['Ġcall', 'sh', 'ere'] + # To avoid uncontrolled change of the prompt length, + # the encoded sequence is truncated before being decode again. + re_encoded_sequence = tokenizer.encode(prompt, add_special_tokens=False)[ + : input_lens[i] + ] + prompt = tokenizer.decode(re_encoded_sequence) + total_input_len = prefix_len + int(input_lens[i]) + requests.append( + SampleRequest( + prompt=prompt, + prompt_len=total_input_len, + expected_output_len=int(output_lens[i]), + ) + ) + return requests + + +# ----------------------------------------------------------------------------- +# ShareGPT Dataset Implementation +# ----------------------------------------------------------------------------- + + +class ShareGPTDataset(BenchmarkDataset): + """ + Implements the ShareGPT dataset. Loads data from a JSON file and generates + sample requests based on conversation turns. + """ + + def __init__(self, **kwargs) -> None: + super().__init__(**kwargs) + self.load_data() + + def load_data(self) -> None: + if self.dataset_path is None: + raise ValueError("dataset_path must be provided for loading data.") + + with open(self.dataset_path, encoding="utf-8") as f: + self.data = json.load(f) + # Filter entries with at least two conversation turns. 
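+        # The first turn later serves as the prompt and the second turn as the
+        # reference completion in sample().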
+ self.data = [ + entry + for entry in self.data + if "conversations" in entry and len(entry["conversations"]) >= 2 + ] + random.seed(self.random_seed) + random.shuffle(self.data) + + def sample( + self, + tokenizer: PreTrainedTokenizerBase, + num_requests: int, + lora_path: Optional[str] = None, + max_loras: Optional[int] = None, + output_len: Optional[int] = None, + enable_multimodal_chat: bool = False, + **kwargs, + ) -> list: + samples: list = [] + for entry in self.data: + if len(samples) >= num_requests: + break + prompt, completion = ( + entry["conversations"][0]["value"], + entry["conversations"][1]["value"], + ) + + lora_request, tokenizer = self.get_random_lora_request( + tokenizer=tokenizer, max_loras=max_loras, lora_path=lora_path + ) + prompt_ids = tokenizer(prompt).input_ids + completion_ids = tokenizer(completion).input_ids + prompt_len = len(prompt_ids) + new_output_len = len(completion_ids) if output_len is None else output_len + if not is_valid_sequence( + prompt_len, + new_output_len, + skip_min_output_len_check=output_len is not None, + ): + continue + if enable_multimodal_chat: + prompt = self.apply_multimodal_chat_transformation(prompt, None) + samples.append( + SampleRequest( + prompt=prompt, + prompt_len=prompt_len, + expected_output_len=new_output_len, + lora_request=lora_request, + ) + ) + self.maybe_oversample_requests(samples, num_requests) + return samples + + +# ----------------------------------------------------------------------------- +# Custom Dataset Implementation +# ----------------------------------------------------------------------------- + + +class CustomDataset(BenchmarkDataset): + """ + Implements the Custom dataset. Loads data from a JSONL file and generates + sample requests based on conversation turns. E.g., + ``` + {"prompt": "What is the capital of India?"} + {"prompt": "What is the capital of Iran?"} + {"prompt": "What is the capital of China?"} + ``` + """ + + def __init__(self, **kwargs) -> None: + super().__init__(**kwargs) + self.load_data() + + def load_data(self) -> None: + if self.dataset_path is None: + raise ValueError("dataset_path must be provided for loading data.") + + # self.data will be a list of dictionaries + # e.g., [{"prompt": "What is the capital of India?"}, ...] + # This will be the standardized format which load_data() + # has to convert into depending on the filetype of dataset_path. + # sample() will assume this standardized format of self.data + self.data = [] + + # Load the JSONL file + if self.dataset_path.endswith(".jsonl"): + jsonl_data = pd.read_json(path_or_buf=self.dataset_path, lines=True) + + # check if the JSONL file has a 'prompt' column + if "prompt" not in jsonl_data.columns: + raise ValueError("JSONL file must contain a 'prompt' column.") + + # Convert each row to a dictionary and append to self.data + # This will convert the DataFrame to a list of dictionaries + # where each dictionary corresponds to a row in the DataFrame. + # This is the standardized format we want for self.data + for _, row in jsonl_data.iterrows(): + self.data.append(row.to_dict()) + else: + raise NotImplementedError( + "Only JSONL format is supported for CustomDataset." 
+ ) + + random.seed(self.random_seed) + random.shuffle(self.data) + + def sample( + self, + tokenizer: PreTrainedTokenizerBase, + num_requests: int, + lora_path: Optional[str] = None, + max_loras: Optional[int] = None, + output_len: Optional[int] = None, + enable_multimodal_chat: bool = False, + skip_chat_template: bool = False, + **kwargs, + ) -> list: + sampled_requests = [] + for item in self.data: + if len(sampled_requests) >= num_requests: + break + prompt = item["prompt"] + + # apply template + if not skip_chat_template: + prompt = tokenizer.apply_chat_template( + [{"role": "user", "content": prompt}], + add_generation_prompt=True, + tokenize=False, + ) + + prompt_len = len(tokenizer(prompt).input_ids) + sampled_requests.append( + SampleRequest( + prompt=prompt, + prompt_len=prompt_len, + expected_output_len=output_len, + ) + ) + self.maybe_oversample_requests(sampled_requests, num_requests) + + return sampled_requests + + +# ----------------------------------------------------------------------------- +# Sonnet Dataset Implementation +# ----------------------------------------------------------------------------- + + +class SonnetDataset(BenchmarkDataset): + """ + Simplified implementation of the Sonnet dataset. Loads poem lines from a + text file and generates sample requests. Default values here copied from + `benchmark_serving.py` for the sonnet dataset. + """ + + DEFAULT_PREFIX_LEN = 200 + DEFAULT_INPUT_LEN = 550 + DEFAULT_OUTPUT_LEN = 150 + + def __init__( + self, + **kwargs, + ) -> None: + super().__init__(**kwargs) + self.load_data() + + def load_data(self) -> None: + if not self.dataset_path: + raise ValueError("dataset_path must be provided.") + with open(self.dataset_path, encoding="utf-8") as f: + self.data = f.readlines() + + def sample( + self, + tokenizer, + num_requests: int, + prefix_len: int = DEFAULT_PREFIX_LEN, + input_len: int = DEFAULT_INPUT_LEN, + output_len: int = DEFAULT_OUTPUT_LEN, + return_prompt_formatted: bool = False, + **kwargs, + ) -> list: + # Calculate average token length for a poem line. + tokenized_lines = [tokenizer(line).input_ids for line in self.data] + avg_len = sum(len(tokens) for tokens in tokenized_lines) / len(tokenized_lines) + + # Build the base prompt. + base_prompt = "Pick as many lines as you can from these poem lines:\n" + base_msg = [{"role": "user", "content": base_prompt}] + base_fmt = tokenizer.apply_chat_template( + base_msg, add_generation_prompt=True, tokenize=False + ) + base_offset = len(tokenizer(base_fmt).input_ids) + if input_len <= base_offset: + raise ValueError( + f"'input_len' must be higher than the base prompt length " + f"({base_offset})." + ) + + # Determine how many poem lines to use. 
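+        # Both counts are token budgets (net of the chat-template overhead in
+        # base_offset) divided by the average tokens per poem line.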
+ num_input_lines = round((input_len - base_offset) / avg_len) + num_prefix_lines = max(round((prefix_len - base_offset) / avg_len), 0) + prefix_lines = self.data[:num_prefix_lines] + + samples = [] + while len(samples) < num_requests: + extra_lines = random.choices( + self.data, k=num_input_lines - num_prefix_lines + ) + prompt = f"{base_prompt}{''.join(prefix_lines + extra_lines)}" + msg = [{"role": "user", "content": prompt}] + prompt_formatted = tokenizer.apply_chat_template( + msg, add_generation_prompt=True, tokenize=False + ) + prompt_len = len(tokenizer(prompt_formatted).input_ids) + if prompt_len <= input_len: + samples.append( + SampleRequest( + prompt=prompt_formatted if return_prompt_formatted else prompt, + prompt_len=prompt_len, + expected_output_len=output_len, + ) + ) + return samples + + +# ----------------------------------------------------------------------------- +# BurstGPT Dataset Implementation +# ----------------------------------------------------------------------------- + + +class BurstGPTDataset(BenchmarkDataset): + """ + Implements the BurstGPT dataset. Loads data from a CSV file and generates + sample requests based on synthetic prompt generation. Only rows with Model + "GPT-4" and positive response tokens are used. + """ + + def __init__(self, **kwargs) -> None: + super().__init__(**kwargs) + self.load_data() + + def load_data( + self, + ): + if self.dataset_path is None: + raise ValueError("dataset_path must be provided for loading data.") + + df = pd.read_csv(self.dataset_path) + # Filter to keep only GPT-4 rows. + gpt4_df = df[df["Model"] == "GPT-4"] + # Remove failed requests (where Response tokens is 0 or less). + gpt4_df = gpt4_df[gpt4_df["Response tokens"] > 0] + # Sample the desired number of rows. + self.data = gpt4_df + + def _sample_loaded_data(self, num_requests: int) -> list: + if num_requests <= len(self.data): + data = self.data.sample(n=num_requests, random_state=self.random_seed) + else: + data = self.data.sample( + n=num_requests, + random_state=self.random_seed, + replace=True, + ) + # Convert the dataframe to a list of lists. + return data.values.tolist() + + def sample( + self, + tokenizer: PreTrainedTokenizerBase, + num_requests: int, + max_loras: Optional[int] = None, + lora_path: Optional[str] = None, + **kwargs, + ) -> list[SampleRequest]: + samples = [] + data = self._sample_loaded_data(num_requests=num_requests) + for i in range(num_requests): + input_len = int(data[i][2]) + output_len = int(data[i][3]) + lora_req, tokenizer = self.get_random_lora_request( + tokenizer=tokenizer, max_loras=max_loras, lora_path=lora_path + ) + vocab_size = tokenizer.vocab_size + # Generate a synthetic prompt: a list of token IDs computed as (i + + # j) modulo vocab_size. 
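+            # The BurstGPT trace only provides token counts (no prompt text),
+            # so the prompt is synthesized deterministically from the request
+            # index.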
+ token_ids = [(i + j) % vocab_size for j in range(input_len)] + prompt = tokenizer.decode(token_ids) + samples.append( + SampleRequest( + prompt=prompt, + prompt_len=input_len, + expected_output_len=output_len, + lora_request=lora_req, + ) + ) + return samples + + +# ----------------------------------------------------------------------------- +# HuggingFace Dataset Base Implementation +# ----------------------------------------------------------------------------- +class HuggingFaceDataset(BenchmarkDataset): + """Base class for datasets hosted on HuggingFace.""" + + SUPPORTED_DATASET_PATHS: Union[set[str], dict[str, Callable]] = set() + + def __init__( + self, + dataset_path: str, + dataset_split: str, + dataset_subset: Optional[str] = None, + **kwargs, + ) -> None: + super().__init__(dataset_path=dataset_path, **kwargs) + + self.dataset_split = dataset_split + self.dataset_subset = dataset_subset + self.load_data() + + def load_data(self) -> None: + """Load data from HuggingFace datasets.""" + self.data = load_dataset( + self.dataset_path, + name=self.dataset_subset, + split=self.dataset_split, + streaming=True, + ) + self.data = self.data.shuffle(seed=self.random_seed) + + +# ----------------------------------------------------------------------------- +# Conversation Dataset Implementation +# ----------------------------------------------------------------------------- + + +class ConversationDataset(HuggingFaceDataset): + """Dataset for conversation data with multimodal support.""" + + SUPPORTED_DATASET_PATHS = { + "lmms-lab/LLaVA-OneVision-Data", + "Aeala/ShareGPT_Vicuna_unfiltered", + } + IS_MULTIMODAL = True + + def sample( + self, + tokenizer: PreTrainedTokenizerBase, + num_requests: int, + output_len: Optional[int] = None, + enable_multimodal_chat: bool = False, + **kwargs, + ) -> list: + # Filter examples with at least 2 conversations + filtered_data = self.data.filter(lambda x: len(x["conversations"]) >= 2) + sampled_requests = [] + dynamic_output = output_len is None + + for item in filtered_data: + if len(sampled_requests) >= num_requests: + break + conv = item["conversations"] + prompt, completion = conv[0]["value"], conv[1]["value"] + + prompt_ids = tokenizer(prompt).input_ids + completion_ids = tokenizer(completion).input_ids + prompt_len = len(prompt_ids) + completion_len = len(completion_ids) + output_len = completion_len if dynamic_output else output_len + assert isinstance(output_len, int) and output_len > 0 + if dynamic_output and not is_valid_sequence(prompt_len, completion_len): + continue + mm_content = process_image(item["image"]) if "image" in item else None + if enable_multimodal_chat: + # Note: when chat is enabled the request prompt_len is no longer + # accurate and we will be using request output to count the + # actual prompt len and output len + prompt = self.apply_multimodal_chat_transformation(prompt, mm_content) + sampled_requests.append( + SampleRequest( + prompt=prompt, + prompt_len=prompt_len, + expected_output_len=output_len, + multi_modal_data=mm_content, + ) + ) + self.maybe_oversample_requests(sampled_requests, num_requests) + return sampled_requests + + +# ----------------------------------------------------------------------------- +# Vision Arena Dataset Implementation +# ----------------------------------------------------------------------------- + + +class VisionArenaDataset(HuggingFaceDataset): + """ + Vision Arena Dataset. 
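+    https://huggingface.co/datasets/lmarena-ai/VisionArena-Chat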
+ """ + + DEFAULT_OUTPUT_LEN = 128 + SUPPORTED_DATASET_PATHS = { + "lmarena-ai/VisionArena-Chat": lambda x: x["conversation"][0][0]["content"], + "lmarena-ai/vision-arena-bench-v0.1": lambda x: x["turns"][0][0]["content"], + } + IS_MULTIMODAL = True + + def sample( + self, + tokenizer: PreTrainedTokenizerBase, + num_requests: int, + output_len: Optional[int] = None, + enable_multimodal_chat: bool = False, + **kwargs, + ) -> list: + output_len = output_len if output_len is not None else self.DEFAULT_OUTPUT_LEN + sampled_requests = [] + for item in self.data: + if len(sampled_requests) >= num_requests: + break + parser_fn = self.SUPPORTED_DATASET_PATHS.get(self.dataset_path) + if parser_fn is None: + raise ValueError(f"Unsupported dataset path: {self.dataset_path}") + prompt = parser_fn(item) + mm_content = process_image(item["images"][0]) + prompt_len = len(tokenizer(prompt).input_ids) + if enable_multimodal_chat: + # Note: when chat is enabled the request prompt_len is no longer + # accurate and we will be using request output to count the + # actual prompt len + prompt = self.apply_multimodal_chat_transformation(prompt, mm_content) + sampled_requests.append( + SampleRequest( + prompt=prompt, + prompt_len=prompt_len, + expected_output_len=output_len, + multi_modal_data=mm_content, + ) + ) + self.maybe_oversample_requests(sampled_requests, num_requests) + return sampled_requests + + +# ----------------------------------------------------------------------------- +# Instruct Coder Dataset Implementation +# ----------------------------------------------------------------------------- + + +class InstructCoderDataset(HuggingFaceDataset): + """ + InstructCoder Dataset. + https://huggingface.co/datasets/likaixin/InstructCoder + + InstructCoder is the dataset designed for general code editing. It consists + of 114,239 instruction-input-output triplets, and covers multiple distinct + code editing scenario. + """ + + DEFAULT_OUTPUT_LEN = 200 # this is the average default output length + SUPPORTED_DATASET_PATHS = { + "likaixin/InstructCoder", + } + + def sample( + self, + tokenizer: PreTrainedTokenizerBase, + num_requests: int, + output_len: Optional[int] = None, + enable_multimodal_chat: bool = False, + **kwargs, + ) -> list: + output_len = output_len if output_len is not None else self.DEFAULT_OUTPUT_LEN + sampled_requests = [] + for item in self.data: + if len(sampled_requests) >= num_requests: + break + prompt = f"{item['input']}\n\n{item['instruction']} Just output \ + the code, do not include any explanation." + + # apply template + prompt = tokenizer.apply_chat_template( + [{"role": "user", "content": prompt}], + add_generation_prompt=True, + tokenize=False, + ) + prompt_len = len(tokenizer(prompt).input_ids) + sampled_requests.append( + SampleRequest( + prompt=prompt, + prompt_len=prompt_len, + expected_output_len=output_len, + ) + ) + self.maybe_oversample_requests(sampled_requests, num_requests) + return sampled_requests + + +# ----------------------------------------------------------------------------- +# MT-Bench Dataset Implementation +# ----------------------------------------------------------------------------- + + +class MTBenchDataset(HuggingFaceDataset): + """ + MT-Bench Dataset. + https://huggingface.co/datasets/philschmid/mt-bench + + We create a single turn dataset for MT-Bench. 
+ This is similar to Spec decoding benchmark setup in vLLM + https://github.com/vllm-project/vllm/blob/9d98ab5ec/examples/offline_inference/eagle.py#L14-L18 + """ # noqa: E501 + + DEFAULT_OUTPUT_LEN = 256 # avg len used in SD bench in vLLM + SUPPORTED_DATASET_PATHS = { + "philschmid/mt-bench", + } + + def sample( + self, + tokenizer: PreTrainedTokenizerBase, + num_requests: int, + output_len: Optional[int] = None, + enable_multimodal_chat: bool = False, + **kwargs, + ) -> list: + output_len = output_len if output_len is not None else self.DEFAULT_OUTPUT_LEN + sampled_requests = [] + + for item in self.data: + if len(sampled_requests) >= num_requests: + break + prompt = item["turns"][0] + + # apply template + prompt = tokenizer.apply_chat_template( + [{"role": "user", "content": prompt}], + add_generation_prompt=True, + tokenize=False, + ) + + prompt_len = len(tokenizer(prompt).input_ids) + sampled_requests.append( + SampleRequest( + prompt=prompt, + prompt_len=prompt_len, + expected_output_len=output_len, + ) + ) + self.maybe_oversample_requests(sampled_requests, num_requests) + return sampled_requests + + +# ----------------------------------------------------------------------------- +# AIMO Dataset Implementation +# ----------------------------------------------------------------------------- + + +class AIMODataset(HuggingFaceDataset): + """ + Dataset class for processing a AIMO dataset with reasoning questions. + """ + + SUPPORTED_DATASET_PATHS = { + "AI-MO/aimo-validation-aime", + "AI-MO/NuminaMath-1.5", + "AI-MO/NuminaMath-CoT", + } + + def sample( + self, + tokenizer: PreTrainedTokenizerBase, + num_requests: int, + output_len: Optional[int] = None, + **kwargs, + ) -> list: + sampled_requests = [] + dynamic_output = output_len is None + + for item in self.data: + if len(sampled_requests) >= num_requests: + break + prompt, completion = item["problem"], item["solution"] + + prompt_ids = tokenizer(prompt).input_ids + completion_ids = tokenizer(completion).input_ids + prompt_len = len(prompt_ids) + completion_len = len(completion_ids) + output_len = completion_len if dynamic_output else output_len + assert isinstance(output_len, int) and output_len > 0 + if dynamic_output and not is_valid_sequence( + prompt_len, completion_len, max_prompt_len=2048, max_total_len=32000 + ): + continue + sampled_requests.append( + SampleRequest( + prompt=prompt, + prompt_len=prompt_len, + expected_output_len=output_len, + multi_modal_data=None, + ) + ) + self.maybe_oversample_requests(sampled_requests, num_requests) + return sampled_requests + + +# ----------------------------------------------------------------------------- +# Next Edit Prediction Dataset Implementation +# ----------------------------------------------------------------------------- + + +zeta_prompt = """### Instruction: +You are a code completion assistant and your task is to analyze user edits and then rewrite an excerpt that the user provides, suggesting the appropriate edits within the excerpt, taking into account the cursor location. + +### User Edits: + +{} + +### User Excerpt: + +{} + +### Response: + +""" # noqa: E501 + + +def _format_zeta_prompt( + sample: dict, original_start_marker: str = "<|editable_region_start|>" +) -> dict: + """Format the zeta prompt for the Next Edit Prediction (NEP) dataset. + + This function formats examples from the NEP dataset + into prompts and expected outputs. It could be + further extended to support more NEP datasets. 
+ + Args: + sample: The dataset sample containing events, + inputs, and outputs. + original_start_marker: The marker indicating the + start of the editable region. Defaults to + "<|editable_region_start|>". + + Returns: + A dictionary with the formatted prompts and expected outputs. + """ + events = sample["events"] + input = sample["input"] + output = sample["output"] + prompt = zeta_prompt.format(events, input) + + # following the original implementation, extract the focused region + # from the raw output + output_start_index = output.find(original_start_marker) + output_focused_region = output[output_start_index:] + expected_output = output_focused_region + + return {"prompt": prompt, "expected_output": expected_output} + + +class NextEditPredictionDataset(HuggingFaceDataset): + """ + Dataset class for processing a Next Edit Prediction dataset. + """ + + SUPPORTED_DATASET_PATHS = { + "zed-industries/zeta", + } + MAPPING_PROMPT_FUNCS = { + "zed-industries/zeta": _format_zeta_prompt, + } + + def sample(self, tokenizer: PreTrainedTokenizerBase, num_requests: int, **kwargs): + formatting_prompt_func = self.MAPPING_PROMPT_FUNCS.get(self.dataset_path) + if formatting_prompt_func is None: + raise ValueError(f"Unsupported dataset path: {self.dataset_path}") + samples = [] + for sample in self.data: + sample = formatting_prompt_func(sample) + samples.append( + SampleRequest( + prompt=sample["prompt"], + prompt_len=len(tokenizer(sample["prompt"]).input_ids), + expected_output_len=len( + tokenizer(sample["expected_output"]).input_ids + ), + ) + ) + if len(samples) >= num_requests: + break + self.maybe_oversample_requests(samples, num_requests) + return samples + + +# ----------------------------------------------------------------------------- +# ASR Dataset Implementation +# ----------------------------------------------------------------------------- + + +class ASRDataset(HuggingFaceDataset): + """ + Dataset class for processing a ASR dataset for transcription. + Tested on the following set: + + +----------------+----------------------------------------+--------------------------+-----------------------------+ + | Dataset | Domain | Speaking Style | hf-subset | + +----------------+----------------------------------------+--------------------------+-----------------------------+ + | TED-LIUM | TED talks | Oratory | release1, release2, release3| + | | | | release3-speaker-adaptation | + | VoxPopuli | European Parliament | Oratory | en, de, it, fr, ... | + | LibriSpeech | Audiobook | Narrated | "LIUM/tedlium" | + | GigaSpeech | Audiobook, podcast, YouTube | Narrated, spontaneous | xs, s, m, l, xl, dev, test | + | SPGISpeech | Financial meetings | Oratory, spontaneous | S, M, L, dev, test | + | AMI | Meetings | Spontaneous | ihm, sdm | + +----------------+----------------------------------------+--------------------------+-----------------------------+ + + """ # noqa: E501 + + SUPPORTED_DATASET_PATHS = { + "openslr/librispeech_asr", + "facebook/voxpopuli", + "LIUM/tedlium", + "edinburghcstr/ami", + "speechcolab/gigaspeech", + "kensho/spgispeech", + } + + DEFAULT_OUTPUT_LEN = 128 + IS_MULTIMODAL = True + + # TODO Whisper-specific. Abstract interface when more models are supported. 
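+    # Whisper is primed with these control tokens: English transcription,
+    # no timestamps.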
+ TRANSCRIPTION_PREAMBLE = "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>" + skip_long_audios: bool = True + + def sample( + self, + tokenizer: PreTrainedTokenizerBase, + num_requests: int, + output_len: Optional[int] = None, + **kwargs, + ) -> list: + import librosa + + output_len = output_len if output_len is not None else self.DEFAULT_OUTPUT_LEN + prompt = ASRDataset.TRANSCRIPTION_PREAMBLE + prompt_len = len(tokenizer(prompt).input_ids) + sampled_requests = [] + skipped = 0 + for item in self.data: + if len(sampled_requests) >= num_requests: + break + audio = item["audio"] + y, sr = audio["array"], audio["sampling_rate"] + duration_s = librosa.get_duration(y=y, sr=sr) + # Whisper max supported duration + if self.skip_long_audios and duration_s > 30: + skipped += 1 + continue + + mm_content = {"audio": (y, sr)} + sampled_requests.append( + SampleRequest( + prompt=prompt, + prompt_len=prompt_len, + expected_output_len=output_len, + multi_modal_data=mm_content, + ) + ) + if skipped: + logger.warning( + "%d samples discarded from dataset due to" + " their length being greater than" + " what Whisper supports.", + skipped, + ) + self.maybe_oversample_requests(sampled_requests, num_requests) + return sampled_requests diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py new file mode 100644 index 000000000..c06857247 --- /dev/null +++ b/benchmarks/benchmark_latency.py @@ -0,0 +1,180 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Benchmark the latency of processing a single batch of requests.""" + +import argparse +import dataclasses +import json +import os +import time +from typing import Any, Optional + +import numpy as np +from tqdm import tqdm + +import vllm.envs as envs +from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json +from vllm import LLM, SamplingParams +from vllm.engine.arg_utils import EngineArgs +from vllm.inputs import PromptType +from vllm.sampling_params import BeamSearchParams +from vllm.utils import FlexibleArgumentParser + + +def save_to_pytorch_benchmark_format( + args: argparse.Namespace, results: dict[str, Any] +) -> None: + pt_records = convert_to_pytorch_benchmark_format( + args=args, + metrics={"latency": results["latencies"]}, + extra_info={k: results[k] for k in ["avg_latency", "percentiles"]}, + ) + if pt_records: + pt_file = f"{os.path.splitext(args.output_json)[0]}.pytorch.json" + write_to_json(pt_file, pt_records) + + +def main(args: argparse.Namespace): + print(args) + + engine_args = EngineArgs.from_cli_args(args) + + # NOTE(woosuk): If the request cannot be processed in a single batch, + # the engine will automatically process the request in multiple batches. + llm = LLM(**dataclasses.asdict(engine_args)) + assert llm.llm_engine.model_config.max_model_len >= ( + args.input_len + args.output_len + ), ( + "Please ensure that max_model_len is greater than" + " the sum of input_len and output_len." 
+ ) + + sampling_params = SamplingParams( + n=args.n, + temperature=1.0, + top_p=1.0, + ignore_eos=True, + max_tokens=args.output_len, + detokenize=not args.disable_detokenize, + ) + print(sampling_params) + dummy_prompt_token_ids = np.random.randint( + 10000, size=(args.batch_size, args.input_len) + ) + dummy_prompts: list[PromptType] = [ + {"prompt_token_ids": batch} for batch in dummy_prompt_token_ids.tolist() + ] + + def llm_generate(): + if not args.use_beam_search: + llm.generate(dummy_prompts, sampling_params=sampling_params, use_tqdm=False) + else: + llm.beam_search( + dummy_prompts, + BeamSearchParams( + beam_width=args.n, + max_tokens=args.output_len, + ignore_eos=True, + ), + ) + + def run_to_completion(profile_dir: Optional[str] = None): + if profile_dir: + llm.start_profile() + llm_generate() + llm.stop_profile() + else: + start_time = time.perf_counter() + llm_generate() + end_time = time.perf_counter() + latency = end_time - start_time + return latency + + print("Warming up...") + for _ in tqdm(range(args.num_iters_warmup), desc="Warmup iterations"): + run_to_completion(profile_dir=None) + + if args.profile: + profile_dir = envs.VLLM_TORCH_PROFILER_DIR + print(f"Profiling (results will be saved to '{profile_dir}')...") + run_to_completion(profile_dir=profile_dir) + return + + # Benchmark. + latencies = [] + for _ in tqdm(range(args.num_iters), desc="Profiling iterations"): + latencies.append(run_to_completion(profile_dir=None)) + latencies = np.array(latencies) + percentages = [10, 25, 50, 75, 90, 99] + percentiles = np.percentile(latencies, percentages) + print(f"Avg latency: {np.mean(latencies)} seconds") + for percentage, percentile in zip(percentages, percentiles): + print(f"{percentage}% percentile latency: {percentile} seconds") + + # Output JSON results if specified + if args.output_json: + results = { + "avg_latency": np.mean(latencies), + "latencies": latencies.tolist(), + "percentiles": dict(zip(percentages, percentiles.tolist())), + } + with open(args.output_json, "w") as f: + json.dump(results, f, indent=4) + save_to_pytorch_benchmark_format(args, results) + + +if __name__ == "__main__": + parser = FlexibleArgumentParser( + description="Benchmark the latency of processing a single batch of " + "requests till completion." + ) + parser.add_argument("--input-len", type=int, default=32) + parser.add_argument("--output-len", type=int, default=128) + parser.add_argument("--batch-size", type=int, default=8) + parser.add_argument( + "--n", + type=int, + default=1, + help="Number of generated sequences per prompt.", + ) + parser.add_argument("--use-beam-search", action="store_true") + parser.add_argument( + "--num-iters-warmup", + type=int, + default=10, + help="Number of iterations to run for warmup.", + ) + parser.add_argument( + "--num-iters", type=int, default=30, help="Number of iterations to run." + ) + parser.add_argument( + "--profile", + action="store_true", + help="profile the generation process of a single batch", + ) + parser.add_argument( + "--output-json", + type=str, + default=None, + help="Path to save the latency results in JSON format.", + ) + parser.add_argument( + "--disable-detokenize", + action="store_true", + help=( + "Do not detokenize responses (i.e. do not include " + "detokenization time in the latency measurement)" + ), + ) + + parser = EngineArgs.add_cli_args(parser) + # V1 enables prefix caching by default which skews the latency + # numbers. We need to disable prefix caching by default. 
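+    # This only changes the default; prefix caching can still be re-enabled
+    # explicitly via --enable-prefix-caching on the command line.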
+ parser.set_defaults(enable_prefix_caching=False) + args = parser.parse_args() + if args.profile and not envs.VLLM_TORCH_PROFILER_DIR: + raise OSError( + "The environment variable 'VLLM_TORCH_PROFILER_DIR' is not set. " + "Please set it to a valid path to use torch profiler." + ) + main(args) diff --git a/benchmarks/benchmark_long_document_qa_throughput.py b/benchmarks/benchmark_long_document_qa_throughput.py new file mode 100644 index 000000000..00869fa94 --- /dev/null +++ b/benchmarks/benchmark_long_document_qa_throughput.py @@ -0,0 +1,196 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Offline benchmark to test the long document QA throughput. + +Example usage: + # This workload samples 8 different prompts with a default input + # length of 20000 tokens, then replicates each prompt 2 times + # in random order. + python benchmark_long_document_qa_throughput.py \ + --model meta-llama/Llama-2-7b-chat-hf \ + --enable-prefix-caching \ + --num-documents 8 \ + --repeat-count 2 + +Commandline arguments: + --num-documents: The number of documents to sample prompts from. + + --document-length: The length of each document in tokens. + (Optional, default: 20000) + + --output-len: The number of tokens to generate for each prompt. + (Optional, default: 10) + + --repeat-count: The number of times to repeat each prompt. + (Optional, default: 2) + + --repeat-mode: The mode to repeat prompts. The supported modes are: + - 'random': shuffle the prompts randomly. (Default) + - 'tile': the entire prompt list is repeated in sequence. (Potentially + lowest cache hit) + - 'interleave': each prompt is repeated consecutively before + moving to the next element. (Highest cache hit) + + --shuffle-seed: Random seed when the repeat mode is "random". + (Optional, default: 0) + +In the meantime, it also supports all the vLLM engine args to initialize the +LLM engine. You can refer to the `vllm.engine.arg_utils.EngineArgs` for more +details. +""" + +import dataclasses +import random +import time + +from vllm import LLM, SamplingParams +from vllm.engine.arg_utils import EngineArgs +from vllm.utils import FlexibleArgumentParser + + +def test_long_document_qa(llm=None, sampling_params=None, prompts=None): + """ + Test long document QA with the given prompts and sampling parameters. + Print the time spent in processing all the prompts. + + Args: + llm: The language model used for generating responses. + sampling_params: Sampling parameter used to generate the response. + prompts: A list of prompt strings to be processed by the LLM. + """ + start_time = time.time() + llm.generate(prompts, sampling_params=sampling_params) + end_time = time.time() + print(f"Time to execute all requests: {end_time - start_time:.4f} secs") + + +def repeat_prompts(prompts, repeat_count, mode: str): + """ + Repeat each prompt in the list for a specified number of times. + The order of prompts in the output list depends on the mode. + + Args: + prompts: A list of prompts to be repeated. + repeat_count: The number of times each prompt is repeated. + mode: The mode of repetition. Supported modes are: + - 'random': Shuffle the prompts randomly after repetition. + - 'tile': Repeat the entire prompt list in sequence. + Example: [1, 2, 3] -> [1, 2, 3, 1, 2, 3]. + - 'interleave': Repeat each prompt consecutively before moving to + the next. Example: [1, 2, 3] -> [1, 1, 2, 2, 3, 3]. + + Returns: + A list of repeated prompts in the specified order. 
+ + Raises: + ValueError: If an invalid mode is provided. + """ + print("Repeat mode: ", mode) + if mode == "random": + repeated_prompts = prompts * repeat_count + random.shuffle(repeated_prompts) + return repeated_prompts + elif mode == "tile": + return prompts * repeat_count + elif mode == "interleave": + repeated_prompts = [] + for prompt in prompts: + repeated_prompts.extend([prompt] * repeat_count) + return repeated_prompts + else: + raise ValueError( + f"Invalid mode: {mode}, only support 'random', 'tile', 'interleave'" + ) + + +def main(args): + random.seed(args.shuffle_seed) + + # Prepare the prompts: + # we append the document id at the beginning to avoid any of the document + # being the prefix of other documents + prompts = [ + str(i) + " ".join(["hi"] * args.document_length) + for i in range(args.num_documents) + ] + + prompts = repeat_prompts(prompts, args.repeat_count, mode=args.repeat_mode) + + warmup_prompts = [ + "This is warm up request " + str(i) + " ".join(["hi"] * args.document_length) + for i in range(args.num_documents) + ] + + # Create the LLM engine + engine_args = EngineArgs.from_cli_args(args) + llm = LLM(**dataclasses.asdict(engine_args)) + sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len) + + print("------warm up------") + test_long_document_qa( + llm=llm, + prompts=warmup_prompts, + sampling_params=sampling_params, + ) + + print("------start generating------") + test_long_document_qa( + llm=llm, + prompts=prompts, + sampling_params=sampling_params, + ) + + +if __name__ == "__main__": + parser = FlexibleArgumentParser( + description="Benchmark the performance with or " + "without automatic prefix caching." + ) + + parser.add_argument( + "--document-length", + type=int, + # Roughly the number of tokens for a system paper, + # excluding images + default=20000, + help="Range of input lengths for sampling prompts, " + 'specified as "min:max" (e.g., "128:256").', + ) + + parser.add_argument( + "--num-documents", + type=int, + default=8, + help="Range of input lengths for sampling prompts, " + 'specified as "min:max" (e.g., "128:256").', + ) + + parser.add_argument("--output-len", type=int, default=10) + + parser.add_argument( + "--repeat-count", + type=int, + default=2, + help="Number of times to repeat each prompt", + ) + + parser.add_argument( + "--repeat-mode", + type=str, + default="random", + help="The mode to repeat prompts. The supported " + 'modes are "random", "tile", and "interleave". ' + "See repeat_prompts() in the source code for details.", + ) + + parser.add_argument( + "--shuffle-seed", + type=int, + default=0, + help='Random seed when the repeat mode is "random"', + ) + + parser = EngineArgs.add_cli_args(parser) + args = parser.parse_args() + main(args) diff --git a/benchmarks/benchmark_prefix_caching.py b/benchmarks/benchmark_prefix_caching.py new file mode 100644 index 000000000..3e4704f0b --- /dev/null +++ b/benchmarks/benchmark_prefix_caching.py @@ -0,0 +1,272 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Benchmark the efficiency of prefix caching. + +This script allows you to benchmark the performance of +a model with and without prefix caching using either fixed prompts +or prompts sampled from the ShareGPT dataset. 
+ +Fixed example usage: + python benchmark_prefix_caching.py \ + --model meta-llama/Llama-2-7b-chat-hf \ + --enable-prefix-caching \ + --num-prompts 1 \ + --repeat-count 100 \ + --input-length-range 128:256 + +ShareGPT example usage: + # This command samples 20 prompts with input lengths + # between 128 and 256 tokens from the ShareGPT dataset, + # then replicates each prompt 5 times. + python benchmark_prefix_caching.py \ + --model meta-llama/Llama-2-7b-chat-hf \ + --dataset-path /path/to/ShareGPT_V3_unfiltered_cleaned_split.json \ + --enable-prefix-caching \ + --num-prompts 20 \ + --repeat-count 5 \ + --input-length-range 128:256 +""" + +import dataclasses +import json +import random +import time +from typing import Optional + +from transformers import PreTrainedTokenizerBase + +from vllm import LLM, SamplingParams +from vllm.engine.arg_utils import EngineArgs +from vllm.utils import FlexibleArgumentParser + +try: + from vllm.transformers_utils.tokenizer import get_tokenizer +except ImportError: + from backend_request_func import get_tokenizer + +PROMPT = "You are a helpful assistant in recognizes the content of tables in markdown format. Here is a table as fellows. You need to answer my question about the table.\n# Table\n|Opening|Opening|Sl. No.|Film|Cast|Director|Music Director|Notes|\n|----|----|----|----|----|----|----|----|\n|J A N|9|1|Agni Pushpam|Jayabharathi, Kamalahasan|Jeassy|M. K. Arjunan||\n|J A N|16|2|Priyamvada|Mohan Sharma, Lakshmi, KPAC Lalitha|K. S. Sethumadhavan|V. Dakshinamoorthy||\n|J A N|23|3|Yakshagaanam|Madhu, Sheela|Sheela|M. S. Viswanathan||\n|J A N|30|4|Paalkkadal|Sheela, Sharada|T. K. Prasad|A. T. Ummer||\n|F E B|5|5|Amma|Madhu, Srividya|M. Krishnan Nair|M. K. Arjunan||\n|F E B|13|6|Appooppan|Thikkurissi Sukumaran Nair, Kamal Haasan|P. Bhaskaran|M. S. Baburaj||\n|F E B|20|7|Srishti|Chowalloor Krishnankutty, Ravi Alummoodu|K. T. Muhammad|M. S. Baburaj||\n|F E B|20|8|Vanadevatha|Prem Nazir, Madhubala|Yusufali Kechery|G. Devarajan||\n|F E B|27|9|Samasya|Madhu, Kamalahaasan|K. Thankappan|Shyam||\n|F E B|27|10|Yudhabhoomi|K. P. Ummer, Vidhubala|Crossbelt Mani|R. K. Shekhar||\n|M A R|5|11|Seemantha Puthran|Prem Nazir, Jayabharathi|A. B. Raj|M. K. Arjunan||\n|M A R|12|12|Swapnadanam|Rani Chandra, Dr. Mohandas|K. G. George|Bhaskar Chandavarkar||\n|M A R|19|13|Thulavarsham|Prem Nazir, sreedevi, Sudheer|N. Sankaran Nair|V. Dakshinamoorthy||\n|M A R|20|14|Aruthu|Kaviyoor Ponnamma, Kamalahasan|Ravi|G. Devarajan||\n|M A R|26|15|Swimming Pool|Kamal Haasan, M. G. Soman|J. Sasikumar|M. K. Arjunan||\n\n# Question\nWhat' s the content in the (1,1) cells\n" # noqa: E501 + + +def test_prefix(llm=None, sampling_params=None, prompts=None): + start_time = time.time() + + llm.generate(prompts, sampling_params=sampling_params) + + end_time = time.time() + print(f"cost time {end_time - start_time}") + + +@dataclasses.dataclass +class Request: + prompt: str + prompt_len: int + output_len: int + + +def sample_tokens(tokenizer: PreTrainedTokenizerBase, length: int) -> list[int]: + vocab = tokenizer.get_vocab() + all_special_ids = set(tokenizer.all_special_ids) + + # Remove the special tokens. 
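+    # random.choices samples with replacement, so the synthetic prompt may
+    # repeat token ids; only non-special vocabulary entries are eligible.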
+    return random.choices(
+        [v for k, v in vocab.items() if k not in all_special_ids],
+        k=length,
+    )
+
+
+def sample_requests_from_dataset(
+    dataset_path: str,
+    num_requests: int,
+    tokenizer: PreTrainedTokenizerBase,
+    input_length_range: tuple[int, int],
+    fixed_output_len: Optional[int],
+) -> list[Request]:
+    if fixed_output_len is not None and fixed_output_len < 4:
+        raise ValueError("output_len too small")
+
+    # Load the dataset.
+    with open(dataset_path) as f:
+        dataset = json.load(f)
+    # Filter out the conversations with less than 2 turns.
+    dataset = [data for data in dataset if len(data["conversations"]) >= 2]
+    # Only keep the first two turns of each conversation.
+    dataset = [
+        (data["conversations"][0]["value"], data["conversations"][1]["value"])
+        for data in dataset
+    ]
+
+    # Shuffle the dataset.
+    random.shuffle(dataset)
+
+    min_len, max_len = input_length_range
+    assert min_len >= 0 and max_len >= min_len, "input_length_range too small"
+
+    # Filter out sequences that are too long or too short
+    filtered_requests: list[Request] = []
+
+    for i in range(len(dataset)):
+        if len(filtered_requests) == num_requests:
+            break
+
+        # Tokenize the prompts and completions.
+        prompt_token_ids = tokenizer(dataset[i][0]).input_ids
+        prompt = tokenizer.decode(prompt_token_ids)
+        completion = dataset[i][1]
+        completion_token_ids = tokenizer(completion).input_ids
+        prompt_len = len(prompt_token_ids)
+        output_len = (
+            len(completion_token_ids) if fixed_output_len is None else fixed_output_len
+        )
+        if min_len <= prompt_len <= max_len:
+            filtered_requests.append(Request(prompt, prompt_len, output_len))
+
+    return filtered_requests
+
+
+def sample_requests_from_random(
+    num_requests: int,
+    tokenizer: PreTrainedTokenizerBase,
+    input_length_range: tuple[int, int],
+    fixed_output_len: Optional[int],
+    prefix_len: int,
+) -> list[Request]:
+    requests = []
+    prefix_token_ids = sample_tokens(tokenizer, prefix_len)
+    min_len, max_len = input_length_range
+
+    for i in range(num_requests):
+        unique_part_token_ids = sample_tokens(
+            tokenizer, random.randint(min_len - prefix_len, max_len - prefix_len)
+        )
+        prompt_token_ids = prefix_token_ids + unique_part_token_ids
+        prompt = tokenizer.decode(prompt_token_ids)
+        prompt_len = len(prompt_token_ids)
+        assert min_len <= prompt_len <= max_len, (
+            f"prompt_len {prompt_len} out of range {min_len}:{max_len}"
+        )
+        requests.append(Request(prompt, prompt_len, fixed_output_len))
+    return requests
+
+
+def repeat_and_sort_requests(
+    requests: list[Request], repeat_count: int, sort: bool = False
+) -> list[str]:
+    repeated_requests = requests * repeat_count
+    if sort:
+        repeated_requests.sort(key=lambda x: x.prompt_len)
+    else:
+        random.shuffle(repeated_requests)
+    return [req.prompt for req in repeated_requests]
+
+
+def main(args):
+    tokenizer = get_tokenizer(args.model, trust_remote_code=True)
+    input_length_range = tuple(map(int, args.input_length_range.split(":")))
+    random.seed(args.seed)
+    if args.dataset_path is not None:
+        if args.prefix_len > 0:
+            raise ValueError(
+                "prefix-len is not supported when dataset-path is provided."
+ ) + print(f"Start to sample {args.num_prompts} prompts from {args.dataset_path}") + filtered_requests = sample_requests_from_dataset( + dataset_path=args.dataset_path, + num_requests=args.num_prompts, + tokenizer=tokenizer, + input_length_range=input_length_range, + fixed_output_len=args.output_len, + ) + else: + print(f"Start to sample {args.num_prompts} prompts from random") + filtered_requests = sample_requests_from_random( + num_requests=args.num_prompts, + tokenizer=tokenizer, + input_length_range=input_length_range, + fixed_output_len=args.output_len, + prefix_len=args.prefix_len, + ) + + # Print some helpful stats of the requests. + print(f"Sampled {len(filtered_requests)} requests.") + prompt_lens = [req.prompt_len for req in filtered_requests] + print(f"Average input length: {sum(prompt_lens) / len(prompt_lens)}") + print(f"P50 input length: {sorted(prompt_lens)[len(prompt_lens) // 2]}") + print(f"Min Prompt Length: {min(prompt_lens)}") + print(f"Max Prompt Length: {max(prompt_lens)}") + + engine_args = EngineArgs.from_cli_args(args) + + llm = LLM(**dataclasses.asdict(engine_args)) + + sampling_params = SamplingParams( + temperature=0, + max_tokens=args.output_len, + detokenize=not args.disable_detokenize, + ) + + print("Testing filtered requests") + prompts = repeat_and_sort_requests( + filtered_requests, repeat_count=args.repeat_count, sort=args.sort + ) + + print("------start generating------") + test_prefix( + llm=llm, + prompts=prompts, + sampling_params=sampling_params, + ) + + +if __name__ == "__main__": + parser = FlexibleArgumentParser( + description="Benchmark the performance with or without " + "automatic prefix caching." + ) + parser.add_argument( + "--dataset-path", type=str, default=None, help="Path to the dataset." + ) + parser.add_argument("--output-len", type=int, default=10) + parser.add_argument( + "--num-prompts", + type=int, + required=True, + help="Number of the prompts sampled from dataset", + ) + parser.add_argument( + "--repeat-count", + type=int, + default=1, + help="Number of times to repeat each prompt", + ) + parser.add_argument( + "--sort", action="store_true", help="Sort prompts by input length" + ) + parser.add_argument( + "--input-length-range", + type=str, + required=True, + help="Range of input lengths for sampling prompts," + 'specified as "min:max" (e.g., "128:256").', + ) + parser.add_argument( + "--prefix-len", + type=int, + default=0, + help="Specifies the length of a common prefix to be " + "added to the input prompt. The input-length-range will " + "subtract this length when filtering prompts. Only used " + "when dataset-path is not provided.", + ) + parser.add_argument( + "--disable-detokenize", + action="store_true", + help=( + "Do not detokenize responses (i.e. 
do not include " + "detokenization time in the latency measurement)" + ), + ) + + parser = EngineArgs.add_cli_args(parser) + args = parser.parse_args() + main(args) diff --git a/benchmarks/benchmark_prioritization.py b/benchmarks/benchmark_prioritization.py new file mode 100644 index 000000000..5496703f2 --- /dev/null +++ b/benchmarks/benchmark_prioritization.py @@ -0,0 +1,216 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Benchmark offline prioritization.""" + +import argparse +import dataclasses +import json +import random +import time +from typing import Optional + +from transformers import AutoTokenizer, PreTrainedTokenizerBase + +from vllm.engine.arg_utils import EngineArgs +from vllm.utils import FlexibleArgumentParser + + +# Select a equi-probable random priority +def get_random_flag(): + return 0 if random.random() < 0.5 else 1 + + +def sample_requests( + dataset_path: str, + num_requests: int, + tokenizer: PreTrainedTokenizerBase, + fixed_output_len: Optional[int], +) -> list[tuple[str, int, int, int]]: + if fixed_output_len is not None and fixed_output_len < 4: + raise ValueError("output_len too small") + + # Load the dataset. + with open(dataset_path) as f: + dataset = json.load(f) + # Filter out the conversations with less than 2 turns. + dataset = [data for data in dataset if len(data["conversations"]) >= 2] + # Only keep the first two turns of each conversation. + dataset = [ + (data["conversations"][0]["value"], data["conversations"][1]["value"]) + for data in dataset + ] + + # Shuffle the dataset. + random.shuffle(dataset) + + # Filter out sequences that are too long or too short + filtered_dataset: list[tuple[str, int, int]] = [] + for i in range(len(dataset)): + if len(filtered_dataset) == num_requests: + break + + # Tokenize the prompts and completions. + prompt = dataset[i][0] + prompt_token_ids = tokenizer(prompt).input_ids + completion = dataset[i][1] + completion_token_ids = tokenizer(completion).input_ids + prompt_len = len(prompt_token_ids) + output_len = ( + len(completion_token_ids) if fixed_output_len is None else fixed_output_len + ) + if prompt_len < 4 or output_len < 4: + # Prune too short sequences. + continue + if prompt_len > 1024 or prompt_len + output_len > 2048: + # Prune too long sequences. + continue + + priority = get_random_flag() + + filtered_dataset.append((prompt, prompt_len, output_len, priority)) + + return filtered_dataset + + +def run_vllm( + requests: list[tuple[str, int, int]], + n: int, + engine_args: EngineArgs, + disable_detokenize: bool = False, +) -> float: + from vllm import LLM, SamplingParams + + llm = LLM(**dataclasses.asdict(engine_args)) + + assert all( + llm.llm_engine.model_config.max_model_len >= (request[1] + request[2]) + for request in requests + ), ( + "Please ensure that max_model_len is greater than the sum of" + " input_len and output_len for all requests." + ) + + # Add the requests to the engine. 
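+    # Build one SamplingParams per request so each keeps its own output_len
+    # (ignore_eos=True pins the generated length), and collect the per-request
+    # priority flags in a parallel list that is passed to llm.generate().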
+ prompts = [] + sampling_params = [] + priority = [] + for prompt, _, output_len, _priority in requests: + prompts.append(prompt) + priority.append(_priority) + sampling_params.append( + SamplingParams( + n=n, + temperature=1.0, + top_p=1.0, + ignore_eos=True, + max_tokens=output_len, + detokenize=not disable_detokenize, + ) + ) + + start = time.perf_counter() + llm.generate(prompts, sampling_params, priority=priority, use_tqdm=True) + end = time.perf_counter() + return end - start + + +def main(args: argparse.Namespace): + print(args) + random.seed(args.seed) + + # Sample the requests. + tokenizer = AutoTokenizer.from_pretrained( + args.tokenizer, trust_remote_code=args.trust_remote_code + ) + if args.dataset is None: + # Synthesize a prompt with the given input length. + prompt = "hi" * (args.input_len - 1) + requests = [ + (prompt, args.input_len, args.output_len, get_random_flag()) + for _ in range(args.num_prompts) + ] + else: + requests = sample_requests( + args.dataset, args.num_prompts, tokenizer, args.output_len + ) + + if args.backend == "vllm": + elapsed_time = run_vllm( + requests, args.n, EngineArgs.from_cli_args(args), args.disable_detokenize + ) + else: + raise ValueError(f"Unknown backend: {args.backend}") + total_num_tokens = sum( + prompt_len + output_len for _, prompt_len, output_len, priority in requests + ) + print( + f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, " + f"{total_num_tokens / elapsed_time:.2f} tokens/s" + ) + + # Output JSON results if specified + if args.output_json: + results = { + "elapsed_time": elapsed_time, + "num_requests": len(requests), + "total_num_tokens": total_num_tokens, + "requests_per_second": len(requests) / elapsed_time, + "tokens_per_second": total_num_tokens / elapsed_time, + } + with open(args.output_json, "w") as f: + json.dump(results, f, indent=4) + + +if __name__ == "__main__": + parser = FlexibleArgumentParser(description="Benchmark the throughput.") + parser.add_argument( + "--backend", type=str, choices=["vllm", "hf", "mii"], default="vllm" + ) + parser.add_argument( + "--dataset", type=str, default=None, help="Path to the dataset." + ) + parser.add_argument( + "--input-len", + type=int, + default=None, + help="Input prompt length for each request", + ) + parser.add_argument( + "--output-len", + type=int, + default=None, + help="Output length for each request. Overrides the " + "output length from the dataset.", + ) + parser.add_argument( + "--n", type=int, default=1, help="Number of generated sequences per prompt." + ) + parser.add_argument( + "--num-prompts", type=int, default=200, help="Number of prompts to process." + ) + parser.add_argument( + "--output-json", + type=str, + default=None, + help="Path to save the throughput results in JSON format.", + ) + parser.add_argument( + "--disable-detokenize", + action="store_true", + help=( + "Do not detokenize responses (i.e. 
do not include " + "detokenization time in the latency measurement)" + ), + ) + + parser = EngineArgs.add_cli_args(parser) + args = parser.parse_args() + if args.tokenizer is None: + args.tokenizer = args.model + if args.dataset is None: + assert args.input_len is not None + assert args.output_len is not None + else: + assert args.input_len is None + + main(args) diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py new file mode 100644 index 000000000..81428fb7d --- /dev/null +++ b/benchmarks/benchmark_serving.py @@ -0,0 +1,1230 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +r"""Benchmark online serving throughput. + +On the server side, run one of the following commands: + vLLM OpenAI API server + vllm serve \ + --swap-space 16 \ + --disable-log-requests + +On the client side, run: + python benchmarks/benchmark_serving.py \ + --backend \ + --model \ + --dataset-name sharegpt \ + --dataset-path \ + --request-rate \ # By default is inf + --num-prompts # By default is 1000 + + when using tgi backend, add + --endpoint /generate_stream + to the end of the command above. +""" + +import argparse +import asyncio +import gc +import json +import os +import random +import time +import warnings +from collections.abc import AsyncGenerator, Iterable +from dataclasses import dataclass +from datetime import datetime +from typing import Any, Optional + +import numpy as np +from tqdm.asyncio import tqdm +from transformers import PreTrainedTokenizerBase + +from backend_request_func import ( + ASYNC_REQUEST_FUNCS, + OPENAI_COMPATIBLE_BACKENDS, + RequestFuncInput, + RequestFuncOutput, +) + +try: + from vllm.transformers_utils.tokenizer import get_tokenizer +except ImportError: + from backend_request_func import get_tokenizer + +try: + from vllm.utils import FlexibleArgumentParser +except ImportError: + from argparse import ArgumentParser as FlexibleArgumentParser + +from benchmark_dataset import ( + AIMODataset, + ASRDataset, + BurstGPTDataset, + ConversationDataset, + CustomDataset, + HuggingFaceDataset, + InstructCoderDataset, + MTBenchDataset, + NextEditPredictionDataset, + RandomDataset, + SampleRequest, + ShareGPTDataset, + SonnetDataset, + VisionArenaDataset, +) +from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json + +MILLISECONDS_TO_SECONDS_CONVERSION = 1000 + + +@dataclass +class BenchmarkMetrics: + completed: int + total_input: int + total_output: int + request_throughput: float + request_goodput: float + output_throughput: float + total_token_throughput: float + mean_ttft_ms: float + median_ttft_ms: float + std_ttft_ms: float + percentiles_ttft_ms: list[tuple[float, float]] + mean_tpot_ms: float + median_tpot_ms: float + std_tpot_ms: float + percentiles_tpot_ms: list[tuple[float, float]] + mean_itl_ms: float + median_itl_ms: float + std_itl_ms: float + percentiles_itl_ms: list[tuple[float, float]] + # E2EL stands for end-to-end latency per request. + # It is the time taken on the client side from sending + # a request to receiving a complete response. + mean_e2el_ms: float + median_e2el_ms: float + std_e2el_ms: float + percentiles_e2el_ms: list[tuple[float, float]] + + +async def get_request( + input_requests: list[SampleRequest], + request_rate: float, + burstiness: float = 1.0, +) -> AsyncGenerator[SampleRequest, None]: + """ + Asynchronously generates requests at a specified rate + with OPTIONAL burstiness. 
+ + Args: + input_requests: + A list of input requests, each represented as a SampleRequest. + request_rate: + The rate at which requests are generated (requests/s). + burstiness (optional): + The burstiness factor of the request generation. + Only takes effect when request_rate is not inf. + Default value is 1, which follows a Poisson process. + Otherwise, the request intervals follow a gamma distribution. + A lower burstiness value (0 < burstiness < 1) results + in more bursty requests, while a higher burstiness value + (burstiness > 1) results in a more uniform arrival of requests. + """ + input_requests: Iterable[SampleRequest] = iter(input_requests) + + # Calculate scale parameter theta to maintain the desired request_rate. + assert burstiness > 0, ( + f"A positive burstiness factor is expected, but given {burstiness}." + ) + theta = 1.0 / (request_rate * burstiness) + + for request in input_requests: + yield request + + if request_rate == float("inf"): + # If the request rate is infinity, then we don't need to wait. + continue + + # Sample the request interval from the gamma distribution. + # If burstiness is 1, it follows exponential distribution. + interval = np.random.gamma(shape=burstiness, scale=theta) + # The next request will be sent after the interval. + await asyncio.sleep(interval) + + +def calculate_metrics( + input_requests: list[SampleRequest], + outputs: list[RequestFuncOutput], + dur_s: float, + tokenizer: PreTrainedTokenizerBase, + selected_percentile_metrics: list[str], + selected_percentiles: list[float], + goodput_config_dict: dict[str, float], +) -> tuple[BenchmarkMetrics, list[int]]: + actual_output_lens: list[int] = [] + total_input = 0 + completed = 0 + good_completed = 0 + itls: list[float] = [] + tpots: list[float] = [] + all_tpots: list[float] = [] + ttfts: list[float] = [] + e2els: list[float] = [] + for i in range(len(outputs)): + if outputs[i].success: + output_len = outputs[i].output_tokens + + if not output_len: + # We use the tokenizer to count the number of output tokens + # for some serving backends instead of looking at + # len(outputs[i].itl) since multiple output tokens may be + # bundled together + # Note : this may inflate the output token count slightly + output_len = len( + tokenizer( + outputs[i].generated_text, add_special_tokens=False + ).input_ids + ) + actual_output_lens.append(output_len) + total_input += input_requests[i].prompt_len + tpot = 0 + if output_len > 1: + latency_minus_ttft = outputs[i].latency - outputs[i].ttft + tpot = latency_minus_ttft / (output_len - 1) + tpots.append(tpot) + # Note: if output_len <= 1, we regard tpot as 0 for goodput + all_tpots.append(tpot) + itls += outputs[i].itl + ttfts.append(outputs[i].ttft) + e2els.append(outputs[i].latency) + completed += 1 + else: + actual_output_lens.append(0) + + if goodput_config_dict: + valid_metrics = [] + slo_values = [] + + if "ttft" in goodput_config_dict: + valid_metrics.append(ttfts) + slo_values.append( + goodput_config_dict["ttft"] / MILLISECONDS_TO_SECONDS_CONVERSION + ) + if "tpot" in goodput_config_dict: + valid_metrics.append(all_tpots) + slo_values.append( + goodput_config_dict["tpot"] / MILLISECONDS_TO_SECONDS_CONVERSION + ) + if "e2el" in goodput_config_dict: + valid_metrics.append(e2els) + slo_values.append( + goodput_config_dict["e2el"] / MILLISECONDS_TO_SECONDS_CONVERSION + ) + + for req_metric in zip(*valid_metrics): + is_good_req = all([s >= r for s, r in zip(slo_values, req_metric)]) + if is_good_req: + good_completed += 1 + + if completed == 0: + 
warnings.warn( + "All requests failed. This is likely due to a misconfiguration " + "on the benchmark arguments.", + stacklevel=2, + ) + metrics = BenchmarkMetrics( + completed=completed, + total_input=total_input, + total_output=sum(actual_output_lens), + request_throughput=completed / dur_s, + request_goodput=good_completed / dur_s, + output_throughput=sum(actual_output_lens) / dur_s, + total_token_throughput=(total_input + sum(actual_output_lens)) / dur_s, + mean_ttft_ms=np.mean(ttfts or 0) + * 1000, # ttfts is empty if streaming is not supported by backend + std_ttft_ms=np.std(ttfts or 0) * 1000, + median_ttft_ms=np.median(ttfts or 0) * 1000, + percentiles_ttft_ms=[ + (p, np.percentile(ttfts or 0, p) * 1000) for p in selected_percentiles + ], + mean_tpot_ms=np.mean(tpots or 0) * 1000, + std_tpot_ms=np.std(tpots or 0) * 1000, + median_tpot_ms=np.median(tpots or 0) * 1000, + percentiles_tpot_ms=[ + (p, np.percentile(tpots or 0, p) * 1000) for p in selected_percentiles + ], + mean_itl_ms=np.mean(itls or 0) * 1000, + std_itl_ms=np.std(itls or 0) * 1000, + median_itl_ms=np.median(itls or 0) * 1000, + percentiles_itl_ms=[ + (p, np.percentile(itls or 0, p) * 1000) for p in selected_percentiles + ], + mean_e2el_ms=np.mean(e2els or 0) * 1000, + std_e2el_ms=np.std(e2els or 0) * 1000, + median_e2el_ms=np.median(e2els or 0) * 1000, + percentiles_e2el_ms=[ + (p, np.percentile(e2els or 0, p) * 1000) for p in selected_percentiles + ], + ) + + return metrics, actual_output_lens + + +async def benchmark( + backend: str, + api_url: str, + base_url: str, + model_id: str, + model_name: str, + tokenizer: PreTrainedTokenizerBase, + input_requests: list[SampleRequest], + logprobs: Optional[int], + request_rate: float, + burstiness: float, + disable_tqdm: bool, + profile: bool, + selected_percentile_metrics: list[str], + selected_percentiles: list[float], + ignore_eos: bool, + goodput_config_dict: dict[str, float], + max_concurrency: Optional[int], + lora_modules: Optional[Iterable[str]], + extra_body: Optional[dict], +): + if backend in ASYNC_REQUEST_FUNCS: + request_func = ASYNC_REQUEST_FUNCS[backend] + else: + raise ValueError(f"Unknown backend: {backend}") + + print("Starting initial single prompt test run...") + test_prompt, test_prompt_len, test_output_len, test_mm_content = ( + input_requests[0].prompt, + input_requests[0].prompt_len, + input_requests[0].expected_output_len, + input_requests[0].multi_modal_data, + ) + + assert test_mm_content is None or isinstance(test_mm_content, dict) + test_input = RequestFuncInput( + model=model_id, + model_name=model_name, + prompt=test_prompt, + api_url=api_url, + prompt_len=test_prompt_len, + output_len=test_output_len, + logprobs=logprobs, + multi_modal_content=test_mm_content, + ignore_eos=ignore_eos, + extra_body=extra_body, + ) + + test_output = await request_func(request_func_input=test_input) + if not test_output.success: + raise ValueError( + "Initial test run failed - Please make sure benchmark arguments " + f"are correctly specified. Error: {test_output.error}" + ) + else: + print("Initial test run completed. Starting main benchmark run...") + + if lora_modules: + # For each input request, choose a LoRA module at random. 
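+        # Pre-draw one module name per request and expose them as an iterator
+        # that the request loop below consumes with next().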
+ lora_modules = iter( + [random.choice(lora_modules) for _ in range(len(input_requests))] + ) + + if profile: + print("Starting profiler...") + profile_input = RequestFuncInput( + model=model_id, + model_name=model_name, + prompt=test_prompt, + api_url=base_url + "/start_profile", + prompt_len=test_prompt_len, + output_len=test_output_len, + logprobs=logprobs, + multi_modal_content=test_mm_content, + ignore_eos=ignore_eos, + extra_body=extra_body, + ) + profile_output = await request_func(request_func_input=profile_input) + if profile_output.success: + print("Profiler started") + + distribution = "Poisson process" if burstiness == 1.0 else "Gamma distribution" + + print(f"Traffic request rate: {request_rate}") + print(f"Burstiness factor: {burstiness} ({distribution})") + print(f"Maximum request concurrency: {max_concurrency}") + + pbar = None if disable_tqdm else tqdm(total=len(input_requests)) + + # This can be used once the minimum Python version is 3.10 or higher, + # and it will simplify the code in limited_request_func. + # semaphore = (asyncio.Semaphore(max_concurrency) + # if max_concurrency else contextlib.nullcontext()) + semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else None + + async def limited_request_func(request_func_input, pbar): + if semaphore is None: + return await request_func(request_func_input=request_func_input, pbar=pbar) + async with semaphore: + return await request_func(request_func_input=request_func_input, pbar=pbar) + + benchmark_start_time = time.perf_counter() + tasks: list[asyncio.Task] = [] + async for request in get_request(input_requests, request_rate, burstiness): + prompt, prompt_len, output_len, mm_content = ( + request.prompt, + request.prompt_len, + request.expected_output_len, + request.multi_modal_data, + ) + req_model_id, req_model_name = model_id, model_name + if lora_modules: + req_lora_module = next(lora_modules) + req_model_id, req_model_name = req_lora_module, req_lora_module + + request_func_input = RequestFuncInput( + model=req_model_id, + model_name=req_model_name, + prompt=prompt, + api_url=api_url, + prompt_len=prompt_len, + output_len=output_len, + logprobs=logprobs, + multi_modal_content=mm_content, + ignore_eos=ignore_eos, + extra_body=extra_body, + ) + tasks.append( + asyncio.create_task( + limited_request_func(request_func_input=request_func_input, pbar=pbar) + ) + ) + outputs: list[RequestFuncOutput] = await asyncio.gather(*tasks) + + if profile: + print("Stopping profiler...") + profile_input = RequestFuncInput( + model=model_id, + prompt=test_prompt, + api_url=base_url + "/stop_profile", + prompt_len=test_prompt_len, + output_len=test_output_len, + logprobs=logprobs, + ) + profile_output = await request_func(request_func_input=profile_input) + if profile_output.success: + print("Profiler stopped") + + if pbar is not None: + pbar.close() + + benchmark_duration = time.perf_counter() - benchmark_start_time + + metrics, actual_output_lens = calculate_metrics( + input_requests=input_requests, + outputs=outputs, + dur_s=benchmark_duration, + tokenizer=tokenizer, + selected_percentile_metrics=selected_percentile_metrics, + selected_percentiles=selected_percentiles, + goodput_config_dict=goodput_config_dict, + ) + + print("{s:{c}^{n}}".format(s=" Serving Benchmark Result ", n=50, c="=")) + print("{:<40} {:<10}".format("Successful requests:", metrics.completed)) + print("{:<40} {:<10.2f}".format("Benchmark duration (s):", benchmark_duration)) + print("{:<40} {:<10}".format("Total input tokens:", 
metrics.total_input)) + print("{:<40} {:<10}".format("Total generated tokens:", metrics.total_output)) + print( + "{:<40} {:<10.2f}".format( + "Request throughput (req/s):", metrics.request_throughput + ) + ) + if goodput_config_dict: + print( + "{:<40} {:<10.2f}".format( + "Request goodput (req/s):", metrics.request_goodput + ) + ) + print( + "{:<40} {:<10.2f}".format( + "Output token throughput (tok/s):", metrics.output_throughput + ) + ) + print( + "{:<40} {:<10.2f}".format( + "Total Token throughput (tok/s):", metrics.total_token_throughput + ) + ) + + result = { + "duration": benchmark_duration, + "completed": metrics.completed, + "total_input_tokens": metrics.total_input, + "total_output_tokens": metrics.total_output, + "request_throughput": metrics.request_throughput, + "request_goodput:": metrics.request_goodput if goodput_config_dict else None, + "output_throughput": metrics.output_throughput, + "total_token_throughput": metrics.total_token_throughput, + "input_lens": [output.prompt_len for output in outputs], + "output_lens": actual_output_lens, + "ttfts": [output.ttft for output in outputs], + "itls": [output.itl for output in outputs], + "generated_texts": [output.generated_text for output in outputs], + "errors": [output.error for output in outputs], + } + + def process_one_metric( + # E.g., "ttft" + metric_attribute_name: str, + # E.g., "TTFT" + metric_name: str, + # E.g., "Time to First Token" + metric_header: str, + ): + # This function prints and adds statistics of the specified + # metric. + if metric_attribute_name not in selected_percentile_metrics: + return + print("{s:{c}^{n}}".format(s=metric_header, n=50, c="-")) + print( + "{:<40} {:<10.2f}".format( + f"Mean {metric_name} (ms):", + getattr(metrics, f"mean_{metric_attribute_name}_ms"), + ) + ) + print( + "{:<40} {:<10.2f}".format( + f"Median {metric_name} (ms):", + getattr(metrics, f"median_{metric_attribute_name}_ms"), + ) + ) + result[f"mean_{metric_attribute_name}_ms"] = getattr( + metrics, f"mean_{metric_attribute_name}_ms" + ) + result[f"median_{metric_attribute_name}_ms"] = getattr( + metrics, f"median_{metric_attribute_name}_ms" + ) + result[f"std_{metric_attribute_name}_ms"] = getattr( + metrics, f"std_{metric_attribute_name}_ms" + ) + for p, value in getattr(metrics, f"percentiles_{metric_attribute_name}_ms"): + p_word = str(int(p)) if int(p) == p else str(p) + print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name} (ms):", value)) + result[f"p{p_word}_{metric_attribute_name}_ms"] = value + + process_one_metric("ttft", "TTFT", "Time to First Token") + process_one_metric("tpot", "TPOT", "Time per Output Token (excl. 1st token)") + process_one_metric("itl", "ITL", "Inter-token Latency") + process_one_metric("e2el", "E2EL", "End-to-end Latency") + + print("=" * 50) + + return result + + +def check_goodput_args(args): + # Check and parse goodput arguments + goodput_config_dict = {} + VALID_NAMES = ["ttft", "tpot", "e2el"] + if args.goodput: + goodput_config_dict = parse_goodput(args.goodput) + for slo_name, slo_val in goodput_config_dict.items(): + if slo_name not in VALID_NAMES: + raise ValueError( + f"Invalid metric name found, {slo_name}: {slo_val}. " + "The service level objective name should be one of " + f"{str(VALID_NAMES)}. " + ) + if slo_val < 0: + raise ValueError( + f"Invalid value found, {slo_name}: {slo_val}. " + "The service level objective value should be " + "non-negative." 
+ ) + return goodput_config_dict + + +def parse_goodput(slo_pairs): + goodput_config_dict = {} + try: + for slo_pair in slo_pairs: + slo_name, slo_val = slo_pair.split(":") + goodput_config_dict[slo_name] = float(slo_val) + except ValueError as err: + raise argparse.ArgumentTypeError( + "Invalid format found for service level objectives. " + 'Specify service level objectives for goodput as "KEY:VALUE" ' + "pairs, where the key is a metric name, and the value is a " + "number in milliseconds." + ) from err + return goodput_config_dict + + +def save_to_pytorch_benchmark_format( + args: argparse.Namespace, results: dict[str, Any], file_name: str +) -> None: + metrics = [ + "median_ttft_ms", + "mean_ttft_ms", + "std_ttft_ms", + "p99_ttft_ms", + "mean_tpot_ms", + "median_tpot_ms", + "std_tpot_ms", + "p99_tpot_ms", + "median_itl_ms", + "mean_itl_ms", + "std_itl_ms", + "p99_itl_ms", + ] + # These raw data might be useful, but they are rather big. They can be added + # later if needed + ignored_metrics = ["ttfts", "itls", "generated_texts", "errors"] + pt_records = convert_to_pytorch_benchmark_format( + args=args, + metrics={k: [results[k]] for k in metrics}, + extra_info={ + k: results[k] + for k in results + if k not in metrics and k not in ignored_metrics + }, + ) + if pt_records: + # Don't use json suffix here as we don't want CI to pick it up + pt_file = f"{os.path.splitext(file_name)[0]}.pytorch.json" + write_to_json(pt_file, pt_records) + + +def main(args: argparse.Namespace): + print(args) + random.seed(args.seed) + np.random.seed(args.seed) + + backend = args.backend + model_id = args.model + model_name = args.served_model_name + tokenizer_id = args.tokenizer if args.tokenizer is not None else args.model + tokenizer_mode = args.tokenizer_mode + + if args.base_url is not None: + api_url = f"{args.base_url}{args.endpoint}" + base_url = f"{args.base_url}" + else: + api_url = f"http://{args.host}:{args.port}{args.endpoint}" + base_url = f"http://{args.host}:{args.port}" + + tokenizer = get_tokenizer( + tokenizer_id, + tokenizer_mode=tokenizer_mode, + trust_remote_code=args.trust_remote_code, + ) + + if args.dataset_name is None: + raise ValueError( + "Please specify '--dataset-name' and the corresponding " + "'--dataset-path' if required." + ) + + if args.dataset_name == "custom": + dataset = CustomDataset(dataset_path=args.dataset_path) + input_requests = dataset.sample( + num_requests=args.num_prompts, + tokenizer=tokenizer, + output_len=args.custom_output_len, + skip_chat_template=args.custom_skip_chat_template, + ) + + elif args.dataset_name == "sonnet": + dataset = SonnetDataset(dataset_path=args.dataset_path) + # For the "sonnet" dataset, formatting depends on the backend. + if args.backend == "openai-chat": + input_requests = dataset.sample( + num_requests=args.num_prompts, + input_len=args.sonnet_input_len, + output_len=args.sonnet_output_len, + prefix_len=args.sonnet_prefix_len, + tokenizer=tokenizer, + return_prompt_formatted=False, + ) + else: + assert tokenizer.chat_template or tokenizer.default_chat_template, ( + "Tokenizer/model must have chat template for sonnet dataset." 
+ ) + input_requests = dataset.sample( + num_requests=args.num_prompts, + input_len=args.sonnet_input_len, + output_len=args.sonnet_output_len, + prefix_len=args.sonnet_prefix_len, + tokenizer=tokenizer, + return_prompt_formatted=True, + ) + + elif args.dataset_name == "hf": + # all following datasets are implemented from the + # HuggingFaceDataset base class + if args.dataset_path in VisionArenaDataset.SUPPORTED_DATASET_PATHS: + dataset_class = VisionArenaDataset + args.hf_split = "train" + args.hf_subset = None + elif args.dataset_path in InstructCoderDataset.SUPPORTED_DATASET_PATHS: + dataset_class = InstructCoderDataset + args.hf_split = "train" + elif args.dataset_path in MTBenchDataset.SUPPORTED_DATASET_PATHS: + dataset_class = MTBenchDataset + args.hf_split = "train" + elif args.dataset_path in ConversationDataset.SUPPORTED_DATASET_PATHS: + dataset_class = ConversationDataset + elif args.dataset_path in AIMODataset.SUPPORTED_DATASET_PATHS: + dataset_class = AIMODataset + args.hf_split = "train" + elif args.dataset_path in NextEditPredictionDataset.SUPPORTED_DATASET_PATHS: # noqa: E501 + dataset_class = NextEditPredictionDataset + args.hf_split = "train" + elif args.dataset_path in ASRDataset.SUPPORTED_DATASET_PATHS: + dataset_class = ASRDataset + args.hf_split = "train" + else: + supported_datasets = set( + [ + dataset_name + for cls in HuggingFaceDataset.__subclasses__() + for dataset_name in cls.SUPPORTED_DATASET_PATHS + ] + ) + raise ValueError( + f"Unsupported dataset path: {args.dataset_path}. " + "Huggingface dataset only supports dataset_path" + f" from one of following: {supported_datasets}. " + "Please consider contributing if you would " + "like to add support for additional dataset formats." + ) + + if dataset_class.IS_MULTIMODAL and backend not in [ + "openai-chat", + "openai-audio", + ]: + # multi-modal benchmark is only available on OpenAI Chat backend. + raise ValueError( + "Multi-modal content is only supported on 'openai-chat' and " + "'openai-audio' backend." + ) + input_requests = dataset_class( + dataset_path=args.dataset_path, + dataset_subset=args.hf_subset, + dataset_split=args.hf_split, + random_seed=args.seed, + ).sample( + num_requests=args.num_prompts, + tokenizer=tokenizer, + output_len=args.hf_output_len, + ) + + else: + # For datasets that follow a similar structure, use a mapping. + dataset_mapping = { + "sharegpt": lambda: ShareGPTDataset( + random_seed=args.seed, dataset_path=args.dataset_path + ).sample( + tokenizer=tokenizer, + num_requests=args.num_prompts, + output_len=args.sharegpt_output_len, + ), + "burstgpt": lambda: BurstGPTDataset( + random_seed=args.seed, dataset_path=args.dataset_path + ).sample(tokenizer=tokenizer, num_requests=args.num_prompts), + "random": lambda: RandomDataset(dataset_path=args.dataset_path).sample( + tokenizer=tokenizer, + num_requests=args.num_prompts, + prefix_len=args.random_prefix_len, + input_len=args.random_input_len, + output_len=args.random_output_len, + range_ratio=args.random_range_ratio, + ), + } + + try: + input_requests = dataset_mapping[args.dataset_name]() + except KeyError as err: + raise ValueError(f"Unknown dataset: {args.dataset_name}") from err + goodput_config_dict = check_goodput_args(args) + + # Collect the sampling parameters. + sampling_params = { + k: v + for k, v in { + "top_p": args.top_p, + "top_k": args.top_k, + "min_p": args.min_p, + "temperature": args.temperature, + }.items() + if v is not None + } + + # Sampling parameters are only supported by openai-compatible backend. 
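+    # Validate this up front so the benchmark fails fast instead of sending
+    # requests with parameters the selected backend cannot honor.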
+ if sampling_params and args.backend not in OPENAI_COMPATIBLE_BACKENDS: + raise ValueError( + "Sampling parameters are only supported by openai-compatible backends." + ) + + if "temperature" not in sampling_params: + sampling_params["temperature"] = 0.0 # Default to greedy decoding. + + if args.backend == "llama.cpp": + # Disable prompt caching in llama.cpp backend + sampling_params["cache_prompt"] = False + + # Avoid GC processing "static" data - reduce pause times. + gc.collect() + gc.freeze() + + benchmark_result = asyncio.run( + benchmark( + backend=backend, + api_url=api_url, + base_url=base_url, + model_id=model_id, + model_name=model_name, + tokenizer=tokenizer, + input_requests=input_requests, + logprobs=args.logprobs, + request_rate=args.request_rate, + burstiness=args.burstiness, + disable_tqdm=args.disable_tqdm, + profile=args.profile, + selected_percentile_metrics=args.percentile_metrics.split(","), + selected_percentiles=[float(p) for p in args.metric_percentiles.split(",")], + ignore_eos=args.ignore_eos, + goodput_config_dict=goodput_config_dict, + max_concurrency=args.max_concurrency, + lora_modules=args.lora_modules, + extra_body=sampling_params, + ) + ) + + # Save config and results to json + if args.save_result or args.append_result: + result_json: dict[str, Any] = {} + + # Setup + current_dt = datetime.now().strftime("%Y%m%d-%H%M%S") + result_json["date"] = current_dt + result_json["backend"] = backend + result_json["model_id"] = model_id + result_json["tokenizer_id"] = tokenizer_id + result_json["num_prompts"] = args.num_prompts + + # Metadata + if args.metadata: + for item in args.metadata: + if "=" in item: + kvstring = item.split("=") + result_json[kvstring[0].strip()] = kvstring[1].strip() + else: + raise ValueError( + "Invalid metadata format. Please use KEY=VALUE format." + ) + # Traffic + result_json["request_rate"] = ( + args.request_rate if args.request_rate < float("inf") else "inf" + ) + result_json["burstiness"] = args.burstiness + result_json["max_concurrency"] = args.max_concurrency + + # Merge with benchmark result + result_json = {**result_json, **benchmark_result} + + if not args.save_detailed: + # Remove fields with too many data points + for field in [ + "input_lens", + "output_lens", + "ttfts", + "itls", + "generated_texts", + "errors", + ]: + if field in result_json: + del result_json[field] + if field in benchmark_result: + del benchmark_result[field] + + # Save to file + base_model_id = model_id.split("/")[-1] + max_concurrency_str = ( + f"-concurrency{args.max_concurrency}" + if args.max_concurrency is not None + else "" + ) + file_name = f"{backend}-{args.request_rate}qps{max_concurrency_str}-{base_model_id}-{current_dt}.json" # noqa + if args.result_filename: + file_name = args.result_filename + if args.result_dir: + os.makedirs(args.result_dir, exist_ok=True) + file_name = os.path.join(args.result_dir, file_name) + with open( + file_name, mode="a+" if args.append_result else "w", encoding="utf-8" + ) as outfile: + # Append a newline. + if args.append_result and outfile.tell() != 0: + outfile.write("\n") + json.dump(result_json, outfile) + save_to_pytorch_benchmark_format(args, result_json, file_name) + + +if __name__ == "__main__": + parser = FlexibleArgumentParser( + description="Benchmark the online serving throughput." 
+ ) + parser.add_argument( + "--backend", + type=str, + default="vllm", + choices=list(ASYNC_REQUEST_FUNCS.keys()), + ) + parser.add_argument( + "--base-url", + type=str, + default=None, + help="Server or API base url if not using http host and port.", + ) + # Use 127.0.0.1 here instead of localhost to force the use of ipv4 + parser.add_argument("--host", type=str, default="127.0.0.1") + parser.add_argument("--port", type=int, default=8000) + parser.add_argument( + "--endpoint", + type=str, + default="/v1/completions", + help="API endpoint.", + ) + parser.add_argument( + "--dataset-name", + type=str, + default="sharegpt", + choices=["sharegpt", "burstgpt", "sonnet", "random", "hf", "custom"], + help="Name of the dataset to benchmark on.", + ) + parser.add_argument( + "--dataset-path", + type=str, + default=None, + help="Path to the sharegpt/sonnet dataset. " + "Or the huggingface dataset ID if using HF dataset.", + ) + parser.add_argument( + "--max-concurrency", + type=int, + default=None, + help="Maximum number of concurrent requests. This can be used " + "to help simulate an environment where a higher level component " + "is enforcing a maximum number of concurrent requests. While the " + "--request-rate argument controls the rate at which requests are " + "initiated, this argument will control how many are actually allowed " + "to execute at a time. This means that when used in combination, the " + "actual request rate may be lower than specified with --request-rate, " + "if the server is not processing requests fast enough to keep up.", + ) + + parser.add_argument( + "--model", + type=str, + required=True, + help="Name of the model.", + ) + parser.add_argument( + "--tokenizer", + type=str, + help="Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501 + ) + parser.add_argument("--use-beam-search", action="store_true") + parser.add_argument( + "--num-prompts", + type=int, + default=1000, + help="Number of prompts to process.", + ) + parser.add_argument( + "--logprobs", + type=int, + default=None, + help=( + "Number of logprobs-per-token to compute & return as part of " + "the request. If unspecified, then either (1) if beam search " + "is disabled, no logprobs are computed & a single dummy " + "logprob is returned for each token; or (2) if beam search " + "is enabled 1 logprob per token is computed" + ), + ) + parser.add_argument( + "--request-rate", + type=float, + default=float("inf"), + help="Number of requests per second. If this is inf, " + "then all the requests are sent at time 0. " + "Otherwise, we use Poisson process or gamma distribution " + "to synthesize the request arrival times.", + ) + parser.add_argument( + "--burstiness", + type=float, + default=1.0, + help="Burstiness factor of the request generation. " + "Only take effect when request_rate is not inf. " + "Default value is 1, which follows Poisson process. " + "Otherwise, the request intervals follow a gamma distribution. " + "A lower burstiness value (0 < burstiness < 1) results in more " + "bursty requests. A higher burstiness value (burstiness > 1) " + "results in a more uniform arrival of requests.", + ) + parser.add_argument("--seed", type=int, default=0) + parser.add_argument( + "--trust-remote-code", + action="store_true", + help="Trust remote code from huggingface", + ) + parser.add_argument( + "--disable-tqdm", + action="store_true", + help="Specify to disable tqdm progress bar.", + ) + parser.add_argument( + "--profile", + action="store_true", + help="Use Torch Profiler. 
The endpoint must be launched with " + "VLLM_TORCH_PROFILER_DIR to enable profiler.", + ) + parser.add_argument( + "--save-result", + action="store_true", + help="Specify to save benchmark results to a json file", + ) + parser.add_argument( + "--save-detailed", + action="store_true", + help="When saving the results, whether to include per request " + "information such as response, error, ttfs, tpots, etc.", + ) + parser.add_argument( + "--append-result", + action="store_true", + help="Append the benchmark result to the existing json file.", + ) + parser.add_argument( + "--metadata", + metavar="KEY=VALUE", + nargs="*", + help="Key-value pairs (e.g, --metadata version=0.3.3 tp=1) " + "for metadata of this run to be saved in the result JSON file " + "for record keeping purposes.", + ) + parser.add_argument( + "--result-dir", + type=str, + default=None, + help="Specify directory to save benchmark json results." + "If not specified, results are saved in the current directory.", + ) + parser.add_argument( + "--result-filename", + type=str, + default=None, + help="Specify the filename to save benchmark json results." + "If not specified, results will be saved in " + "{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json" + " format.", + ) + parser.add_argument( + "--ignore-eos", + action="store_true", + help="Set ignore_eos flag when sending the benchmark request." + "Warning: ignore_eos is not supported in deepspeed_mii and tgi.", + ) + parser.add_argument( + "--percentile-metrics", + type=str, + default="ttft,tpot,itl", + help="Comma-separated list of selected metrics to report percentils. " + "This argument specifies the metrics to report percentiles. " + 'Allowed metric names are "ttft", "tpot", "itl", "e2el". ' + 'Default value is "ttft,tpot,itl".', + ) + parser.add_argument( + "--metric-percentiles", + type=str, + default="99", + help="Comma-separated list of percentiles for selected metrics. " + 'To report 25-th, 50-th, and 75-th percentiles, use "25,50,75". ' + 'Default value is "99". ' + 'Use "--percentile-metrics" to select metrics.', + ) + parser.add_argument( + "--goodput", + nargs="+", + required=False, + help='Specify service level objectives for goodput as "KEY:VALUE" ' + "pairs, where the key is a metric name, and the value is in " + 'milliseconds. Multiple "KEY:VALUE" pairs can be provided, ' + "separated by spaces. Allowed request level metric names are " + '"ttft", "tpot", "e2el". 
For more context on the definition of ' + "goodput, refer to DistServe paper: https://arxiv.org/pdf/2401.09670 " + "and the blog: https://hao-ai-lab.github.io/blogs/distserve", + ) + + # group for dataset specific arguments + custom_group = parser.add_argument_group("custom dataset options") + custom_group.add_argument( + "--custom-output-len", + type=int, + default=256, + help="Number of output tokens per request, used only for custom dataset.", + ) + custom_group.add_argument( + "--custom-skip-chat-template", + action="store_true", + help="Skip applying chat template to prompt, used only for custom dataset.", + ) + + sonnet_group = parser.add_argument_group("sonnet dataset options") + sonnet_group.add_argument( + "--sonnet-input-len", + type=int, + default=550, + help="Number of input tokens per request, used only for sonnet dataset.", + ) + sonnet_group.add_argument( + "--sonnet-output-len", + type=int, + default=150, + help="Number of output tokens per request, used only for sonnet dataset.", + ) + sonnet_group.add_argument( + "--sonnet-prefix-len", + type=int, + default=200, + help="Number of prefix tokens per request, used only for sonnet dataset.", + ) + + sharegpt_group = parser.add_argument_group("sharegpt dataset options") + sharegpt_group.add_argument( + "--sharegpt-output-len", + type=int, + default=None, + help="Output length for each request. Overrides the output length " + "from the ShareGPT dataset.", + ) + + random_group = parser.add_argument_group("random dataset options") + random_group.add_argument( + "--random-input-len", + type=int, + default=1024, + help="Number of input tokens per request, used only for random sampling.", + ) + random_group.add_argument( + "--random-output-len", + type=int, + default=128, + help="Number of output tokens per request, used only for random sampling.", + ) + random_group.add_argument( + "--random-range-ratio", + type=float, + default=0.0, + help="Range ratio for sampling input/output length, " + "used only for random sampling. Must be in the range [0, 1) to define " + "a symmetric sampling range" + "[length * (1 - range_ratio), length * (1 + range_ratio)].", + ) + random_group.add_argument( + "--random-prefix-len", + type=int, + default=0, + help=( + "Number of fixed prefix tokens before the random context " + "in a request. " + "The total input length is the sum of `random-prefix-len` and " + "a random " + "context length sampled from [input_len * (1 - range_ratio), " + "input_len * (1 + range_ratio)]." + ), + ) + + hf_group = parser.add_argument_group("hf dataset options") + hf_group.add_argument( + "--hf-subset", type=str, default=None, help="Subset of the HF dataset." + ) + hf_group.add_argument( + "--hf-split", type=str, default=None, help="Split of the HF dataset." + ) + hf_group.add_argument( + "--hf-output-len", + type=int, + default=None, + help="Output length for each request. Overrides the output lengths " + "from the sampled HF dataset.", + ) + + sampling_group = parser.add_argument_group("sampling parameters") + sampling_group.add_argument( + "--top-p", + type=float, + default=None, + help="Top-p sampling parameter. Only has effect on openai-compatible backends.", + ) + sampling_group.add_argument( + "--top-k", + type=int, + default=None, + help="Top-k sampling parameter. Only has effect on openai-compatible backends.", + ) + sampling_group.add_argument( + "--min-p", + type=float, + default=None, + help="Min-p sampling parameter. 
Only has effect on openai-compatible backends.", + ) + sampling_group.add_argument( + "--temperature", + type=float, + default=None, + help="Temperature sampling parameter. Only has effect on " + "openai-compatible backends. If not specified, default to greedy " + "decoding (i.e. temperature==0.0).", + ) + + parser.add_argument( + "--tokenizer-mode", + type=str, + default="auto", + choices=["auto", "slow", "mistral", "custom"], + help='The tokenizer mode.\n\n* "auto" will use the ' + 'fast tokenizer if available.\n* "slow" will ' + "always use the slow tokenizer. \n* " + '"mistral" will always use the `mistral_common` tokenizer. \n*' + '"custom" will use --tokenizer to select the preregistered tokenizer.', + ) + + parser.add_argument( + "--served-model-name", + type=str, + default=None, + help="The model name used in the API. " + "If not specified, the model name will be the " + "same as the ``--model`` argument. ", + ) + + parser.add_argument( + "--lora-modules", + nargs="+", + default=None, + help="A subset of LoRA module names passed in when " + "launching the server. For each request, the " + "script chooses a LoRA module at random.", + ) + + args = parser.parse_args() + + main(args) diff --git a/benchmarks/benchmark_serving_structured_output.py b/benchmarks/benchmark_serving_structured_output.py new file mode 100644 index 000000000..c1501ad52 --- /dev/null +++ b/benchmarks/benchmark_serving_structured_output.py @@ -0,0 +1,1038 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +r"""Benchmark online serving throughput with structured outputs. + +On the server side, run one of the following commands: + (vLLM OpenAI API server) + vllm serve --disable-log-requests + +On the client side, run: + python benchmarks/benchmark_serving_structured_output.py \ + --backend \ + --model \ + --dataset json \ + --structured-output-ratio 1.0 \ + --request-rate 10 \ + --num-prompts 1000 + + when using tgi backend, add + --endpoint /generate_stream + to the end of the command above. 
+""" + +import argparse +import asyncio +import copy +import dataclasses +import json +import os +import random +import time +import uuid +import warnings +from collections.abc import AsyncGenerator +from dataclasses import dataclass +from typing import Optional + +import datasets +import numpy as np +import pandas as pd +from tqdm.asyncio import tqdm +from transformers import PreTrainedTokenizerBase + +from backend_request_func import ( + ASYNC_REQUEST_FUNCS, + RequestFuncInput, + RequestFuncOutput, +) + +try: + from vllm.transformers_utils.tokenizer import get_tokenizer +except ImportError: + from backend_request_func import get_tokenizer + +try: + from vllm.utils import FlexibleArgumentParser +except ImportError: + from argparse import ArgumentParser as FlexibleArgumentParser + +from vllm.v1.structured_output.backend_xgrammar import ( + has_xgrammar_unsupported_json_features, +) + +MILLISECONDS_TO_SECONDS_CONVERSION = 1000 + + +@dataclass +class BenchmarkMetrics: + completed: int + total_input: int + total_output: int + request_throughput: float + request_goodput: float + output_throughput: float + total_token_throughput: float + mean_ttft_ms: float + median_ttft_ms: float + std_ttft_ms: float + percentiles_ttft_ms: list[tuple[float, float]] + mean_tpot_ms: float + median_tpot_ms: float + std_tpot_ms: float + percentiles_tpot_ms: list[tuple[float, float]] + mean_itl_ms: float + median_itl_ms: float + std_itl_ms: float + percentiles_itl_ms: list[tuple[float, float]] + # E2EL stands for end-to-end latency per request. + # It is the time taken on the client side from sending + # a request to receiving a complete response. + mean_e2el_ms: float + median_e2el_ms: float + std_e2el_ms: float + percentiles_e2el_ms: list[tuple[float, float]] + + +@dataclasses.dataclass +class SampleRequest: + """A class representing a single inference request for benchmarking. + + Attributes: + prompt: The input text prompt for the model. + multi_modal_data: Optional dictionary containing multi-modal data (e.g. + images). + prompt_len: The length of the prompt in tokens. + expected_output_len: The expected length of the output in tokens. 
+ """ + + prompt: str + prompt_len: int + expected_output_len: int + schema: dict + structure_type: str + completion: str = None + + +def sample_requests( + tokenizer: PreTrainedTokenizerBase, args: argparse.Namespace +) -> list[SampleRequest]: + if args.dataset == "json" or args.dataset == "json-unique": + if args.json_schema_path is None: + dir_path = os.path.dirname(os.path.realpath(__file__)) + args.json_schema_path = os.path.join( + dir_path, "structured_schemas", "structured_schema_1.json" + ) + json_schemas = [] + with open(args.json_schema_path) as f: + schema = json.load(f) + + if args.dataset == "json-unique": + json_schemas = [copy.deepcopy(schema) for _ in range(args.num_prompts)] + for i in range(len(json_schemas)): + if "properties" not in json_schemas[i]: + json_schemas[i]["properties"] = {} + json_schemas[i]["properties"][f"__optional_field_{uuid.uuid4()}"] = { + "type": "string", + "description": "An unique optional field to avoid cached schemas", + } + else: + json_schemas = [schema] * args.num_prompts + + def gen_prompt(index: int): + return f"Generate an example of a brief user profile given the following schema: {json.dumps(get_schema(index))}" # noqa: E501 + + def get_schema(index: int): + return json_schemas[index % len(json_schemas)] + + requests = [ + SampleRequest( + prompt=gen_prompt(i), + prompt_len=len(tokenizer(gen_prompt(i)).input_ids), + expected_output_len=args.output_len, + schema=get_schema(i), + structure_type=args.structure_type, + ) + for i in range(args.num_prompts) + ] + + elif args.dataset == "grammar": + schema = """ + root ::= select_statement + + select_statement ::= "SELECT " column " from " table " where " condition + + column ::= "col_1 " | "col_2 " + + table ::= "table_1 " | "table_2 " + + condition ::= column "= " number + + number ::= "1 " | "2 " + """ + prompt = "Generate an SQL query to show the 'username' \ + and 'email' from the 'users' table." + + input_len = len(tokenizer(prompt).input_ids) + print(f"Input length of the prompt: {input_len} tokens") + requests = [ + SampleRequest( + prompt=prompt, + prompt_len=input_len, + expected_output_len=args.output_len, + schema=schema, + structure_type=args.structure_type, + ) + for _ in range(args.num_prompts) + ] + + elif args.dataset == "regex": + regex = r"\w+@\w+\.com\n" + args.regex = regex + prompt = "Generate an email address for Alan Turing, \ + who works in Enigma. End in .com and new line. \ + Example result: alan.turing@enigma.com\n" + + input_len = len(tokenizer(prompt).input_ids) + print(f"Input length of the prompt: {input_len} tokens") + requests = [ + SampleRequest( + prompt=prompt, + prompt_len=input_len, + expected_output_len=args.output_len, + schema=regex, + structure_type=args.structure_type, + ) + for _ in range(args.num_prompts) + ] + + elif args.dataset == "choice": + choice = ["Positive", "Negative"] + args.choice = choice + prompt = "Classify this sentiment: vLLM is wonderful!" 
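+        # The same fixed prompt is reused for every request; only the schema
+        # (the list of allowed choices) constrains the generated output.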
+ input_len = len(tokenizer(prompt).input_ids) + print(f"Input length of the prompt: {input_len} tokens") + requests = [ + SampleRequest( + prompt=prompt, + prompt_len=input_len, + expected_output_len=args.output_len, + schema=choice, + structure_type=args.structure_type, + ) + for _ in range(args.num_prompts) + ] + + elif args.dataset == "xgrammar_bench": + requests: list[SampleRequest] = [] + dataset = datasets.load_dataset("NousResearch/json-mode-eval", split="train") + full_dataset_len = len(dataset) + + def _filter_func(item): + import json + + schema = json.loads(item["schema"]) + return not has_xgrammar_unsupported_json_features(schema) + + dataset = dataset.filter(_filter_func) + num_filtered_out = full_dataset_len - len(dataset) + print( + f"dataset has {len(dataset)} entries after filtering " + f"out {num_filtered_out} entries with unsupported features" + ) + len_dataset = len(dataset) + for data_point_idx in range(args.num_prompts): + idx = data_point_idx + while idx >= len_dataset: + idx -= len_dataset + schema = dataset["schema"][idx] + prompt = tokenizer.apply_chat_template( + dataset["prompt"][idx], tokenize=False, add_generation_prompt=True + ) + input_len = len(tokenizer(prompt).input_ids) + completion = dataset["completion"][idx] + + requests.append( + SampleRequest( + prompt=prompt, + prompt_len=input_len, + expected_output_len=args.output_len, + schema=schema, + structure_type=args.structure_type, + completion=completion, + ) + ) + + return requests + + +async def get_request( + input_requests: list[SampleRequest], + request_rate: float, + burstiness: float = 1.0, +) -> AsyncGenerator[tuple[int, SampleRequest], None]: + """ + Asynchronously generates requests at a specified rate + with OPTIONAL burstiness. + + Args: + input_requests: + A list of input requests, each represented as a tuple. + request_rate: + The rate at which requests are generated (requests/s). + burstiness (optional): + The burstiness factor of the request generation. + Only takes effect when request_rate is not inf. + Default value is 1, which follows a Poisson process. + Otherwise, the request intervals follow a gamma distribution. + A lower burstiness value (0 < burstiness < 1) results + in more bursty requests, while a higher burstiness value + (burstiness > 1) results in a more uniform arrival of requests. + """ + input_requests = iter(input_requests) + + # Calculate scale parameter theta to maintain the desired request_rate. + assert burstiness > 0, ( + f"A positive burstiness factor is expected, but given {burstiness}." + ) + theta = 1.0 / (request_rate * burstiness) + + for i, request in enumerate(input_requests): + yield i, request + + if request_rate == float("inf"): + # If the request rate is infinity, then we don't need to wait. + continue + + # Sample the request interval from the gamma distribution. + # If burstiness is 1, it follows exponential distribution. + interval = np.random.gamma(shape=burstiness, scale=theta) + # The next request will be sent after the interval. 
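+        # E[interval] = shape * scale = burstiness * theta = 1 / request_rate,
+        # so the configured request rate is preserved for any burstiness value.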
+ await asyncio.sleep(interval) + + +def calculate_metrics( + input_requests: list[tuple[str, int, int]], + outputs: list[RequestFuncOutput], + dur_s: float, + tokenizer: PreTrainedTokenizerBase, + selected_percentile_metrics: list[str], + selected_percentiles: list[float], + goodput_config_dict: Optional[dict[str, float]] = None, +) -> tuple[BenchmarkMetrics, list[int]]: + actual_output_lens: list[int] = [] + total_input = 0 + completed = 0 + good_completed = 0 + itls: list[float] = [] + tpots: list[float] = [] + all_tpots: list[float] = [] + ttfts: list[float] = [] + e2els: list[float] = [] + for i in range(len(outputs)): + if outputs[i].success: + # We use the tokenizer to count the number of output tokens for all + # serving backends instead of looking at len(outputs[i].itl) since + # multiple output tokens may be bundled together + # Note : this may inflate the output token count slightly + output_len = len( + tokenizer(outputs[i].generated_text, add_special_tokens=False).input_ids + ) + actual_output_lens.append(output_len) + total_input += input_requests[i].prompt_len + tpot = 0 + if output_len > 1: + latency_minus_ttft = outputs[i].latency - outputs[i].ttft + tpot = latency_minus_ttft / (output_len - 1) + tpots.append(tpot) + outputs[i].tpot = tpot + # Note: if output_len <= 1, we regard tpot as 0 for goodput + all_tpots.append(tpot) + itls += outputs[i].itl + ttfts.append(outputs[i].ttft) + e2els.append(outputs[i].latency) + completed += 1 + else: + actual_output_lens.append(0) + + if goodput_config_dict: + valid_metrics = [] + slo_values = [] + + if "ttft" in goodput_config_dict: + valid_metrics.append(ttfts) + slo_values.append( + goodput_config_dict["ttft"] / MILLISECONDS_TO_SECONDS_CONVERSION + ) + if "tpot" in goodput_config_dict: + valid_metrics.append(all_tpots) + slo_values.append( + goodput_config_dict["tpot"] / MILLISECONDS_TO_SECONDS_CONVERSION + ) + if "e2el" in goodput_config_dict: + valid_metrics.append(e2els) + slo_values.append( + goodput_config_dict["e2el"] / MILLISECONDS_TO_SECONDS_CONVERSION + ) + + for req_metric in zip(*valid_metrics): + is_good_req = all([s >= r for s, r in zip(slo_values, req_metric)]) + if is_good_req: + good_completed += 1 + + if completed == 0: + warnings.warn( + "All requests failed. 
This is likely due to a misconfiguration " + "on the benchmark arguments.", + stacklevel=2, + ) + metrics = BenchmarkMetrics( + completed=completed, + total_input=total_input, + total_output=sum(actual_output_lens), + request_throughput=completed / dur_s, + request_goodput=good_completed / dur_s, + output_throughput=sum(actual_output_lens) / dur_s, + total_token_throughput=(total_input + sum(actual_output_lens)) / dur_s, + mean_ttft_ms=np.mean(ttfts or 0) + * 1000, # ttfts is empty if streaming is not supported by backend + std_ttft_ms=np.std(ttfts or 0) * 1000, + median_ttft_ms=np.median(ttfts or 0) * 1000, + percentiles_ttft_ms=[ + (p, np.percentile(ttfts or 0, p) * 1000) for p in selected_percentiles + ], + mean_tpot_ms=np.mean(tpots or 0) * 1000, + std_tpot_ms=np.std(tpots or 0) * 1000, + median_tpot_ms=np.median(tpots or 0) * 1000, + percentiles_tpot_ms=[ + (p, np.percentile(tpots or 0, p) * 1000) for p in selected_percentiles + ], + mean_itl_ms=np.mean(itls or 0) * 1000, + std_itl_ms=np.std(itls or 0) * 1000, + median_itl_ms=np.median(itls or 0) * 1000, + percentiles_itl_ms=[ + (p, np.percentile(itls or 0, p) * 1000) for p in selected_percentiles + ], + mean_e2el_ms=np.mean(e2els or 0) * 1000, + std_e2el_ms=np.std(e2els or 0) * 1000, + median_e2el_ms=np.median(e2els or 0) * 1000, + percentiles_e2el_ms=[ + (p, np.percentile(e2els or 0, p) * 1000) for p in selected_percentiles + ], + ) + + return metrics, actual_output_lens + + +async def benchmark( + backend: str, + api_url: str, + base_url: str, + model_id: str, + tokenizer: PreTrainedTokenizerBase, + input_requests: list[SampleRequest], + request_rate: float, + burstiness: float, + disable_tqdm: bool, + profile: bool, + selected_percentile_metrics: list[str], + selected_percentiles: list[str], + ignore_eos: bool, + max_concurrency: Optional[int], + structured_output_ratio: float, + goodput_config_dict: Optional[dict[str, float]] = None, +): + if backend in ASYNC_REQUEST_FUNCS: + request_func = ASYNC_REQUEST_FUNCS[backend] + else: + raise ValueError(f"Unknown backend: {backend}") + + def prepare_extra_body(request) -> dict: + extra_body = {} + # Add the schema to the extra_body + extra_body[request.structure_type] = request.schema + return extra_body + + print("Starting initial single prompt test run...") + structured_output_req_idx = random.sample( + range(len(input_requests)), int(len(input_requests) * structured_output_ratio) + ) + + test_request = input_requests[0] + test_req_extra_body = ( + prepare_extra_body(test_request) if 0 in structured_output_req_idx else None + ) + test_input = RequestFuncInput( + model=model_id, + prompt=test_request.prompt, + api_url=api_url, + prompt_len=test_request.prompt_len, + output_len=test_request.expected_output_len, + ignore_eos=ignore_eos, + extra_body=test_req_extra_body, + ) + test_output = await request_func(request_func_input=test_input) + if not test_output.success: + raise ValueError( + "Initial test run failed - Please make sure benchmark arguments " + f"are correctly specified. Error: {test_output.error}" + ) + else: + print("Initial test run completed. 
Starting main benchmark run...") + + if profile: + print("Starting profiler...") + profile_input = RequestFuncInput( + model=model_id, + prompt=test_request.prompt, + api_url=base_url + "/start_profile", + prompt_len=test_request.prompt_len, + output_len=test_request.expected_output_len, + ignore_eos=ignore_eos, + extra_body=test_req_extra_body, + ) + profile_output = await request_func(request_func_input=profile_input) + if profile_output.success: + print("Profiler started") + + distribution = "Poisson process" if burstiness == 1.0 else "Gamma distribution" + + print(f"Traffic request rate: {request_rate}") + print(f"Burstiness factor: {burstiness} ({distribution})") + print(f"Maximum request concurrency: {max_concurrency}") + + pbar = None if disable_tqdm else tqdm(total=len(input_requests)) + + # This can be used once the minimum Python version is 3.10 or higher, + # and it will simplify the code in limited_request_func. + # semaphore = (asyncio.Semaphore(max_concurrency) + # if max_concurrency else contextlib.nullcontext()) + semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else None + + async def limited_request_func(request_func_input, pbar): + if semaphore is None: + return await request_func(request_func_input=request_func_input, pbar=pbar) + async with semaphore: + return await request_func(request_func_input=request_func_input, pbar=pbar) + + benchmark_start_time = time.perf_counter() + tasks: list[asyncio.Task] = [] + expected: list[str] = [] + async for i, request in get_request(input_requests, request_rate, burstiness): + extra_body = ( + prepare_extra_body(request) if i in structured_output_req_idx else None + ) + request_func_input = RequestFuncInput( + model=model_id, + prompt=request.prompt, + api_url=api_url, + prompt_len=request.prompt_len, + output_len=request.expected_output_len, + ignore_eos=ignore_eos, + extra_body=extra_body, + ) + expected.append(request.completion) + tasks.append( + asyncio.create_task( + limited_request_func(request_func_input=request_func_input, pbar=pbar) + ) + ) + outputs: list[RequestFuncOutput] = await asyncio.gather(*tasks) + + if profile: + print("Stopping profiler...") + profile_input = RequestFuncInput( + model=model_id, + prompt=test_request.prompt, + api_url=base_url + "/stop_profile", + prompt_len=test_request.prompt_len, + output_len=test_request.expected_output_len, + extra_body={test_request.structure_type: test_request.schema}, + ) + profile_output = await request_func(request_func_input=profile_input) + if profile_output.success: + print("Profiler stopped") + + if pbar is not None: + pbar.close() + + benchmark_duration = time.perf_counter() - benchmark_start_time + + metrics, actual_output_lens = calculate_metrics( + input_requests=input_requests, + outputs=outputs, + dur_s=benchmark_duration, + tokenizer=tokenizer, + selected_percentile_metrics=selected_percentile_metrics, + selected_percentiles=selected_percentiles, + goodput_config_dict=goodput_config_dict, + ) + + print("{s:{c}^{n}}".format(s=" Serving Benchmark Result ", n=50, c="=")) + print("{:<40} {:<10}".format("Successful requests:", metrics.completed)) + print("{:<40} {:<10.2f}".format("Benchmark duration (s):", benchmark_duration)) + print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input)) + print("{:<40} {:<10}".format("Total generated tokens:", metrics.total_output)) + print( + "{:<40} {:<10.2f}".format( + "Request throughput (req/s):", metrics.request_throughput + ) + ) + if goodput_config_dict: + print( + "{:<40} 
{:<10.2f}".format( + "Request goodput (req/s):", metrics.request_goodput + ) + ) + print( + "{:<40} {:<10.2f}".format( + "Output token throughput (tok/s):", metrics.output_throughput + ) + ) + print( + "{:<40} {:<10.2f}".format( + "Total Token throughput (tok/s):", metrics.total_token_throughput + ) + ) + + result = { + "duration": benchmark_duration, + "completed": metrics.completed, + "total_input_tokens": metrics.total_input, + "total_output_tokens": metrics.total_output, + "request_throughput": metrics.request_throughput, + "output_throughput": metrics.output_throughput, + "total_token_throughput": metrics.total_token_throughput, + "ttft_description": pd.Series([output.ttft for output in outputs]) + .describe() + .to_dict(), + "tpot_description": pd.Series([output.tpot for output in outputs]) + .describe() + .to_dict(), + "input_lens": [output.prompt_len for output in outputs], + "output_lens": actual_output_lens, + "ttfts": [output.ttft for output in outputs], + "itls": [output.itl for output in outputs], + "errors": [output.error for output in outputs], + } + + ret = [ + {"generated": output.generated_text, "expected": gt} + for output, gt in zip(outputs, expected) + ] + + def process_one_metric( + # E.g., "ttft" + metric_attribute_name: str, + # E.g., "TTFT" + metric_name: str, + # E.g., "Time to First Token" + metric_header: str, + ): + # This function prints and adds statistics of the specified + # metric. + if metric_attribute_name not in selected_percentile_metrics: + return + print("{s:{c}^{n}}".format(s=metric_header, n=50, c="-")) + print( + "{:<40} {:<10.2f}".format( + f"Mean {metric_name} (ms):", + getattr(metrics, f"mean_{metric_attribute_name}_ms"), + ) + ) + print( + "{:<40} {:<10.2f}".format( + f"Median {metric_name} (ms):", + getattr(metrics, f"median_{metric_attribute_name}_ms"), + ) + ) + result[f"mean_{metric_attribute_name}_ms"] = getattr( + metrics, f"mean_{metric_attribute_name}_ms" + ) + result[f"median_{metric_attribute_name}_ms"] = getattr( + metrics, f"median_{metric_attribute_name}_ms" + ) + result[f"std_{metric_attribute_name}_ms"] = getattr( + metrics, f"std_{metric_attribute_name}_ms" + ) + for p, value in getattr(metrics, f"percentiles_{metric_attribute_name}_ms"): + p_word = str(int(p)) if int(p) == p else str(p) + print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name} (ms):", value)) + result[f"p{p_word}_{metric_attribute_name}_ms"] = value + + process_one_metric("ttft", "TTFT", "Time to First Token") + process_one_metric("tpot", "TPOT", "Time per Output Token (excl. 
1st token)") + process_one_metric("itl", "ITL", "Inter-token Latency") + process_one_metric("e2el", "E2EL", "End-to-end Latency") + + print("=" * 50) + + return result, ret + + +def evaluate(ret, args): + def _eval_correctness_json(expected, actual): + # extract json string from string using regex + import regex as re + + actual = actual.replace("\n", "").replace(" ", "").strip() + try: + actual = re.search(r"\{.*\}", actual).group() + actual = json.loads(actual) + except Exception: + return False + + return True + + def _eval_correctness_choice(expected, actual): + return actual in args.choice + + def _eval_correctness_regex(expected, actual): + import regex as re + + return re.match(args.regex, actual) is not None + + def _eval_correctness(expected, actual): + if args.structure_type == "guided_json": + return _eval_correctness_json(expected, actual) + elif args.structure_type == "guided_regex": + return _eval_correctness_regex(expected, actual) + elif args.structure_type == "guided_choice": + return _eval_correctness_choice(expected, actual) + else: + return None + + scores = [] + for res in ret: + score = _eval_correctness(res["expected"], res["generated"]) + res["correctness"] = score + scores.append(score) + + not_none_scores = [score for score in scores if score is not None] + + return ( + (sum(not_none_scores) / len(not_none_scores) * 100) + if len(not_none_scores) > 0 + else None + ) + + +def parse_goodput(slo_pairs): + goodput_config_dict = {} + try: + for slo_pair in slo_pairs: + slo_name, slo_val = slo_pair.split(":") + goodput_config_dict[slo_name] = float(slo_val) + except ValueError as err: + raise argparse.ArgumentTypeError( + "Invalid format found for service level objectives. " + 'Specify service level objectives for goodput as "KEY:VALUE" ' + "pairs, where the key is a metric name, and the value is a " + "number in milliseconds." + ) from err + return goodput_config_dict + + +def check_goodput_args(args): + goodput_config_dict = {} + VALID_NAMES = ["ttft", "tpot", "e2el"] + if args.goodput: + goodput_config_dict = parse_goodput(args.goodput) + for slo_name, slo_val in goodput_config_dict.items(): + if slo_name not in VALID_NAMES: + raise ValueError( + f"Invalid metric name found, {slo_name}: {slo_val}. " + "The service level objective name should be one of " + f"{str(VALID_NAMES)}. " + ) + if slo_val < 0: + raise ValueError( + f"Invalid value found, {slo_name}: {slo_val}. " + "The service level objective value should be " + "non-negative." 
+ ) + return goodput_config_dict + + +def main(args: argparse.Namespace): + print(args) + random.seed(args.seed) + np.random.seed(args.seed) + + backend = args.backend + model_id = args.model + tokenizer_id = args.tokenizer if args.tokenizer is not None else args.model + + if args.base_url is not None: + api_url = f"{args.base_url}{args.endpoint}" + base_url = f"{args.base_url}" + else: + api_url = f"http://{args.host}:{args.port}{args.endpoint}" + base_url = f"http://{args.host}:{args.port}" + + tokenizer = get_tokenizer( + tokenizer_id, + trust_remote_code=args.trust_remote_code, + tokenizer_mode=args.tokenizer_mode, + ) + + if args.dataset == "grammar": + args.structure_type = "guided_grammar" + elif args.dataset == "regex": + args.structure_type = "guided_regex" + elif args.dataset == "choice": + args.structure_type = "guided_choice" + else: + args.structure_type = "guided_json" + + if args.no_structured_output: + args.structured_output_ratio = 0 + if args.save_results: + result_file_name = f"{args.structured_output_ratio}guided" + result_file_name += f"_{backend}" + result_file_name += f"_{args.request_rate}qps" + result_file_name += f"_{args.model.split('/')[-1]}" + result_file_name += f"_{args.dataset}" + result_file_name += f"_{args.num_prompts}" + result_file_name += f"_out{args.output_len}" + result_file_name += ".txt" + else: + result_file_name = None + + input_requests = sample_requests(tokenizer, args) + + goodput_config_dict = check_goodput_args(args) + + benchmark_result, ret = asyncio.run( + benchmark( + backend=backend, + api_url=api_url, + base_url=base_url, + model_id=model_id, + tokenizer=tokenizer, + input_requests=input_requests, + request_rate=args.request_rate, + burstiness=args.burstiness, + disable_tqdm=args.disable_tqdm, + profile=args.profile, + selected_percentile_metrics=args.percentile_metrics.split(","), + selected_percentiles=[float(p) for p in args.metric_percentiles.split(",")], + ignore_eos=args.ignore_eos, + max_concurrency=args.max_concurrency, + structured_output_ratio=args.structured_output_ratio, + goodput_config_dict=goodput_config_dict, + ) + ) + + # Save config and results to json + score = evaluate(ret, args) + print("correct_rate(%)", score, "\n") + if args.save_results: + results = { + "backend": backend, + "model_id": model_id, + "tokenizer_id": tokenizer_id, + "num_prompts": args.num_prompts, + "request_rate": args.request_rate + if args.request_rate < float("inf") + else "inf", + "burstiness": args.burstiness, + "max_concurrency": args.max_concurrency, + "correct_rate(%)": score, + } + results = {"outputs": ret, **results, **benchmark_result} + + # Save to file + if args.result_filename: + result_file_name = args.result_filename + if args.result_dir: + result_file_name = os.path.join(args.result_dir, result_file_name) + with open(result_file_name, "w", encoding="utf-8") as outfile: + json.dump(results, outfile, indent=4) + + +if __name__ == "__main__": + parser = FlexibleArgumentParser( + description="Benchmark the online serving throughput." 
+ ) + parser.add_argument( + "--backend", + type=str, + default="vllm", + choices=list(ASYNC_REQUEST_FUNCS.keys()), + ) + parser.add_argument( + "--base-url", + type=str, + default=None, + help="Server or API base url if not using http host and port.", + ) + # Use 127.0.0.1 here instead of localhost to force the use of ipv4 + parser.add_argument("--host", type=str, default="127.0.0.1") + parser.add_argument("--port", type=int, default=8000) + parser.add_argument( + "--endpoint", + type=str, + default="/v1/completions", + help="API endpoint.", + ) + parser.add_argument( + "--dataset", + default="json", + choices=["json", "json-unique", "grammar", "regex", "choice", "xgrammar_bench"], + ) + parser.add_argument( + "--json-schema-path", type=str, default=None, help="Path to json schema." + ) + parser.add_argument( + "--max-concurrency", + type=int, + default=None, + help="Maximum number of concurrent requests. This can be used " + "to help simulate an environment where a higher level component " + "is enforcing a maximum number of concurrent requests. While the " + "--request-rate argument controls the rate at which requests are " + "initiated, this argument will control how many are actually allowed " + "to execute at a time. This means that when used in combination, the " + "actual request rate may be lower than specified with --request-rate, " + "if the server is not processing requests fast enough to keep up.", + ) + parser.add_argument( + "--model", + type=str, + required=True, + help="Name of the model.", + ) + parser.add_argument( + "--tokenizer", + type=str, + help="Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501 + ) + parser.add_argument( + "--tokenizer-mode", + type=str, + default="auto", + help="Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501 + ) + parser.add_argument( + "--num-prompts", + type=int, + default=1000, + help="Number of prompts to process.", + ) + parser.add_argument( + "--output-len", + type=int, + default=128, + help="Number of output tokens.", + ) + parser.add_argument( + "--request-rate", + type=float, + default=float("inf"), + help="Number of requests per second. If this is inf, " + "then all the requests are sent at time 0. " + "Otherwise, we use Poisson process or gamma distribution " + "to synthesize the request arrival times.", + ) + parser.add_argument( + "--burstiness", + type=float, + default=1.0, + help="Burstiness factor of the request generation. " + "Only take effect when request_rate is not inf. " + "Default value is 1, which follows Poisson process. " + "Otherwise, the request intervals follow a gamma distribution. " + "A lower burstiness value (0 < burstiness < 1) results in more " + "bursty requests. A higher burstiness value (burstiness > 1) " + "results in a more uniform arrival of requests.", + ) + parser.add_argument("--seed", type=int, default=0) + parser.add_argument( + "--trust-remote-code", + action="store_true", + help="Trust remote code from huggingface", + ) + parser.add_argument( + "--disable-tqdm", + action="store_true", + help="Specify to disable tqdm progress bar.", + ) + parser.add_argument( + "--save-results", + action="store_true", + help="Specify to save benchmark results to a json file", + ) + parser.add_argument( + "--profile", + action="store_true", + help="Use Torch Profiler. 
The endpoint must be launched with " + "VLLM_TORCH_PROFILER_DIR to enable profiler.", + ) + parser.add_argument( + "--result-dir", + type=str, + default=None, + help="Specify directory to save benchmark json results." + "If not specified, results are saved in the current directory.", + ) + parser.add_argument( + "--result-filename", + type=str, + default=None, + help="Specify the filename to save benchmark json results." + "If not specified, results will be saved in " + "{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json" + " format.", + ) + parser.add_argument( + "--ignore-eos", + action="store_true", + help="Set ignore_eos flag when sending the benchmark request." + "Warning: ignore_eos is not supported in deepspeed_mii and tgi.", + ) + parser.add_argument( + "--percentile-metrics", + type=str, + default="ttft,tpot,itl", + help="Comma-separated list of selected metrics to report percentils. " + "This argument specifies the metrics to report percentiles. " + 'Allowed metric names are "ttft", "tpot", "itl", "e2el". ' + 'Default value is "ttft,tpot,itl".', + ) + parser.add_argument( + "--metric-percentiles", + type=str, + default="99", + help="Comma-separated list of percentiles for selected metrics. " + 'To report 25-th, 50-th, and 75-th percentiles, use "25,50,75". ' + 'Default value is "99". ' + 'Use "--percentile-metrics" to select metrics.', + ) + parser.add_argument( + "--goodput", + nargs="+", + required=False, + help='Specify service level objectives for goodput as "KEY:VALUE" ' + "pairs, where the key is a metric name, and the value is in " + 'milliseconds. Multiple "KEY:VALUE" pairs can be provided, ' + "separated by spaces. Allowed request level metric names are " + '"ttft", "tpot", "e2el". For more context on the definition of ' + "goodput, refer to DistServe paper: https://arxiv.org/pdf/2401.09670 " + "and the blog: https://hao-ai-lab.github.io/blogs/distserve", + ) + + parser.add_argument( + "--no-structured-output", + action="store_true", + default=False, + help="Whether to disable JSON decoding or not.", + ) + parser.add_argument( + "--structured-output-ratio", + type=float, + default=1.0, + help="Ratio of Structured Outputs requests", + ) + + args = parser.parse_args() + main(args) diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py new file mode 100644 index 000000000..d19753d40 --- /dev/null +++ b/benchmarks/benchmark_throughput.py @@ -0,0 +1,724 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Benchmark offline inference throughput.""" + +import argparse +import dataclasses +import json +import os +import random +import time +import warnings +from typing import Any, Optional, Union + +import torch +import uvloop +from tqdm import tqdm +from transformers import AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizerBase + +from benchmark_dataset import ( + AIMODataset, + BurstGPTDataset, + ConversationDataset, + InstructCoderDataset, + RandomDataset, + SampleRequest, + ShareGPTDataset, + SonnetDataset, + VisionArenaDataset, +) +from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json +from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs +from vllm.entrypoints.openai.api_server import ( + build_async_engine_client_from_engine_args, +) +from vllm.inputs import TextPrompt, TokensPrompt +from vllm.lora.request import LoRARequest +from vllm.outputs import RequestOutput +from vllm.sampling_params import BeamSearchParams +from 
vllm.utils import FlexibleArgumentParser, merge_async_iterators + + +def run_vllm( + requests: list[SampleRequest], + n: int, + engine_args: EngineArgs, + disable_detokenize: bool = False, +) -> tuple[float, Optional[list[RequestOutput]]]: + from vllm import LLM, SamplingParams + + llm = LLM(**dataclasses.asdict(engine_args)) + assert all( + llm.llm_engine.model_config.max_model_len + >= (request.prompt_len + request.expected_output_len) + for request in requests + ), ( + "Please ensure that max_model_len is greater than the sum of" + " prompt_len and expected_output_len for all requests." + ) + # Add the requests to the engine. + prompts: list[Union[TextPrompt, TokensPrompt]] = [] + sampling_params: list[SamplingParams] = [] + for request in requests: + prompts.append( + TokensPrompt( + prompt_token_ids=request.prompt["prompt_token_ids"], + multi_modal_data=request.multi_modal_data, + ) + if "prompt_token_ids" in request.prompt + else TextPrompt( + prompt=request.prompt, multi_modal_data=request.multi_modal_data + ) + ) + sampling_params.append( + SamplingParams( + n=n, + temperature=1.0, + top_p=1.0, + ignore_eos=True, + max_tokens=request.expected_output_len, + detokenize=not disable_detokenize, + ) + ) + lora_requests: Optional[list[LoRARequest]] = None + if engine_args.enable_lora: + lora_requests = [request.lora_request for request in requests] + + use_beam_search = False + + outputs = None + if not use_beam_search: + start = time.perf_counter() + outputs = llm.generate( + prompts, sampling_params, lora_request=lora_requests, use_tqdm=True + ) + end = time.perf_counter() + else: + assert lora_requests is None, "BeamSearch API does not support LoRA" + prompts = [request.prompt for request in requests] + # output_len should be the same for all requests. + output_len = requests[0][2] + for request in requests: + assert request.expected_output_len == output_len + start = time.perf_counter() + llm.beam_search( + prompts, + BeamSearchParams( + beam_width=n, + max_tokens=output_len, + ignore_eos=True, + ), + ) + end = time.perf_counter() + return end - start, outputs + + +def run_vllm_chat( + requests: list[SampleRequest], + n: int, + engine_args: EngineArgs, + disable_detokenize: bool = False, +) -> tuple[float, list[RequestOutput]]: + """ + Run vLLM chat benchmark. This function is recommended ONLY for benchmarking + multimodal models as it properly handles multimodal inputs and chat + formatting. For non-multimodal models, use run_vllm() instead. + """ + from vllm import LLM, SamplingParams + + llm = LLM(**dataclasses.asdict(engine_args)) + + assert all( + llm.llm_engine.model_config.max_model_len + >= (request.prompt_len + request.expected_output_len) + for request in requests + ), ( + "Please ensure that max_model_len is greater than the sum of " + "prompt_len and expected_output_len for all requests." 
+ ) + + prompts = [] + sampling_params: list[SamplingParams] = [] + for request in requests: + prompts.append(request.prompt) + sampling_params.append( + SamplingParams( + n=n, + temperature=1.0, + top_p=1.0, + ignore_eos=True, + max_tokens=request.expected_output_len, + detokenize=not disable_detokenize, + ) + ) + start = time.perf_counter() + outputs = llm.chat(prompts, sampling_params, use_tqdm=True) + end = time.perf_counter() + return end - start, outputs + + +async def run_vllm_async( + requests: list[SampleRequest], + n: int, + engine_args: AsyncEngineArgs, + disable_frontend_multiprocessing: bool = False, + disable_detokenize: bool = False, +) -> float: + from vllm import SamplingParams + + async with build_async_engine_client_from_engine_args( + engine_args, disable_frontend_multiprocessing + ) as llm: + model_config = await llm.get_model_config() + assert all( + model_config.max_model_len + >= (request.prompt_len + request.expected_output_len) + for request in requests + ), ( + "Please ensure that max_model_len is greater than the sum of" + " prompt_len and expected_output_len for all requests." + ) + + # Add the requests to the engine. + prompts: list[Union[TextPrompt, TokensPrompt]] = [] + sampling_params: list[SamplingParams] = [] + lora_requests: list[Optional[LoRARequest]] = [] + for request in requests: + prompts.append( + TokensPrompt( + prompt_token_ids=request.prompt["prompt_token_ids"], + multi_modal_data=request.multi_modal_data, + ) + if "prompt_token_ids" in request.prompt + else TextPrompt( + prompt=request.prompt, multi_modal_data=request.multi_modal_data + ) + ) + sampling_params.append( + SamplingParams( + n=n, + temperature=1.0, + top_p=1.0, + ignore_eos=True, + max_tokens=request.expected_output_len, + detokenize=not disable_detokenize, + ) + ) + lora_requests.append(request.lora_request) + + generators = [] + start = time.perf_counter() + for i, (prompt, sp, lr) in enumerate( + zip(prompts, sampling_params, lora_requests) + ): + generator = llm.generate(prompt, sp, lora_request=lr, request_id=f"test{i}") + generators.append(generator) + all_gens = merge_async_iterators(*generators) + async for i, res in all_gens: + pass + end = time.perf_counter() + return end - start + + +def run_hf( + requests: list[SampleRequest], + model: str, + tokenizer: PreTrainedTokenizerBase, + n: int, + max_batch_size: int, + trust_remote_code: bool, + disable_detokenize: bool = False, +) -> float: + llm = AutoModelForCausalLM.from_pretrained( + model, torch_dtype=torch.float16, trust_remote_code=trust_remote_code + ) + if llm.config.model_type == "llama": + # To enable padding in the HF backend. + tokenizer.pad_token = tokenizer.eos_token + llm = llm.cuda() + + pbar = tqdm(total=len(requests)) + start = time.perf_counter() + batch: list[str] = [] + max_prompt_len = 0 + max_output_len = 0 + for i in range(len(requests)): + prompt = requests[i].prompt + prompt_len = requests[i].prompt_len + output_len = requests[i].expected_output_len + # Add the prompt to the batch. + batch.append(prompt) + max_prompt_len = max(max_prompt_len, prompt_len) + max_output_len = max(max_output_len, output_len) + if len(batch) < max_batch_size and i != len(requests) - 1: + # Check if we can add more requests to the batch. + next_prompt_len = requests[i + 1].prompt_len + next_output_len = requests[i + 1].expected_output_len + if ( + max(max_prompt_len, next_prompt_len) + + max(max_output_len, next_output_len) + ) <= 2048: + # We can add more requests to the batch. + continue + + # Generate the sequences. 
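+        # The accumulated batch is flushed here: prompts are padded to the
+        # longest one and up to max_output_len new tokens are generated for
+        # every sequence in the batch.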
+ input_ids = tokenizer(batch, return_tensors="pt", padding=True).input_ids + llm_outputs = llm.generate( + input_ids=input_ids.cuda(), + do_sample=True, + num_return_sequences=n, + temperature=1.0, + top_p=1.0, + use_cache=True, + max_new_tokens=max_output_len, + ) + if not disable_detokenize: + # Include the decoding time. + tokenizer.batch_decode(llm_outputs, skip_special_tokens=True) + pbar.update(len(batch)) + + # Clear the batch. + batch = [] + max_prompt_len = 0 + max_output_len = 0 + end = time.perf_counter() + return end - start + + +def run_mii( + requests: list[SampleRequest], + model: str, + tensor_parallel_size: int, + output_len: int, +) -> float: + from mii import client, serve + + llm = serve(model, tensor_parallel=tensor_parallel_size) + prompts = [request.prompt for request in requests] + + start = time.perf_counter() + llm.generate(prompts, max_new_tokens=output_len) + end = time.perf_counter() + client = client(model) + client.terminate_server() + return end - start + + +def save_to_pytorch_benchmark_format( + args: argparse.Namespace, results: dict[str, Any] +) -> None: + pt_records = convert_to_pytorch_benchmark_format( + args=args, + metrics={ + "requests_per_second": [results["requests_per_second"]], + "tokens_per_second": [results["tokens_per_second"]], + }, + extra_info={ + k: results[k] for k in ["elapsed_time", "num_requests", "total_num_tokens"] + }, + ) + if pt_records: + # Don't use json suffix here as we don't want CI to pick it up + pt_file = f"{os.path.splitext(args.output_json)[0]}.pytorch.json" + write_to_json(pt_file, pt_records) + + +def get_requests(args, tokenizer): + # Common parameters for all dataset types. + common_kwargs = { + "dataset_path": args.dataset_path, + "random_seed": args.seed, + } + sample_kwargs = { + "tokenizer": tokenizer, + "lora_path": args.lora_path, + "max_loras": args.max_loras, + "num_requests": args.num_prompts, + "input_len": args.input_len, + "output_len": args.output_len, + } + + if args.dataset_path is None or args.dataset_name == "random": + sample_kwargs["range_ratio"] = args.random_range_ratio + sample_kwargs["prefix_len"] = args.prefix_len + dataset_cls = RandomDataset + elif args.dataset_name == "sharegpt": + dataset_cls = ShareGPTDataset + if args.backend == "vllm-chat": + sample_kwargs["enable_multimodal_chat"] = True + elif args.dataset_name == "sonnet": + assert tokenizer.chat_template or tokenizer.default_chat_template, ( + "Tokenizer/model must have chat template for sonnet dataset." 
+ ) + dataset_cls = SonnetDataset + sample_kwargs["prefix_len"] = args.prefix_len + sample_kwargs["return_prompt_formatted"] = True + elif args.dataset_name == "burstgpt": + dataset_cls = BurstGPTDataset + elif args.dataset_name == "hf": + if args.dataset_path in VisionArenaDataset.SUPPORTED_DATASET_PATHS: + dataset_cls = VisionArenaDataset + common_kwargs["dataset_subset"] = None + common_kwargs["dataset_split"] = "train" + sample_kwargs["enable_multimodal_chat"] = True + elif args.dataset_path in InstructCoderDataset.SUPPORTED_DATASET_PATHS: + dataset_cls = InstructCoderDataset + common_kwargs["dataset_split"] = "train" + elif args.dataset_path in ConversationDataset.SUPPORTED_DATASET_PATHS: + dataset_cls = ConversationDataset + common_kwargs["dataset_subset"] = args.hf_subset + common_kwargs["dataset_split"] = args.hf_split + sample_kwargs["enable_multimodal_chat"] = True + elif args.dataset_path in AIMODataset.SUPPORTED_DATASET_PATHS: + dataset_cls = AIMODataset + common_kwargs["dataset_subset"] = None + common_kwargs["dataset_split"] = "train" + else: + raise ValueError(f"Unknown dataset name: {args.dataset_name}") + # Remove None values + sample_kwargs = {k: v for k, v in sample_kwargs.items() if v is not None} + return dataset_cls(**common_kwargs).sample(**sample_kwargs) + + +def main(args: argparse.Namespace): + if args.seed is None: + args.seed = 0 + print(args) + random.seed(args.seed) + # Sample the requests. + tokenizer = AutoTokenizer.from_pretrained( + args.tokenizer, trust_remote_code=args.trust_remote_code + ) + requests = get_requests(args, tokenizer) + is_multi_modal = any(request.multi_modal_data is not None for request in requests) + request_outputs: Optional[list[RequestOutput]] = None + if args.backend == "vllm": + if args.async_engine: + elapsed_time = uvloop.run( + run_vllm_async( + requests, + args.n, + AsyncEngineArgs.from_cli_args(args), + args.disable_frontend_multiprocessing, + args.disable_detokenize, + ) + ) + else: + elapsed_time, request_outputs = run_vllm( + requests, + args.n, + EngineArgs.from_cli_args(args), + args.disable_detokenize, + ) + elif args.backend == "hf": + assert args.tensor_parallel_size == 1 + elapsed_time = run_hf( + requests, + args.model, + tokenizer, + args.n, + args.hf_max_batch_size, + args.trust_remote_code, + args.disable_detokenize, + ) + elif args.backend == "mii": + elapsed_time = run_mii( + requests, args.model, args.tensor_parallel_size, args.output_len + ) + elif args.backend == "vllm-chat": + elapsed_time, request_outputs = run_vllm_chat( + requests, args.n, EngineArgs.from_cli_args(args), args.disable_detokenize + ) + else: + raise ValueError(f"Unknown backend: {args.backend}") + + if request_outputs: + # Note: with the vllm and vllm-chat backends, + # we have request_outputs, which we use to count tokens. + total_prompt_tokens = 0 + total_output_tokens = 0 + for ro in request_outputs: + if not isinstance(ro, RequestOutput): + continue + total_prompt_tokens += ( + len(ro.prompt_token_ids) if ro.prompt_token_ids else 0 + ) + total_output_tokens += sum(len(o.token_ids) for o in ro.outputs if o) + total_num_tokens = total_prompt_tokens + total_output_tokens + else: + total_num_tokens = sum(r.prompt_len + r.expected_output_len for r in requests) + total_output_tokens = sum(r.expected_output_len for r in requests) + total_prompt_tokens = total_num_tokens - total_output_tokens + + if is_multi_modal and args.backend != "vllm-chat": + print( + "\033[91mWARNING\033[0m: Multi-modal request with " + f"{args.backend} backend detected. 
The " + "following metrics are not accurate because image tokens are not" + " counted. See vllm-project/vllm/issues/9778 for details." + ) + # TODO(vllm-project/vllm/issues/9778): Count multi-modal token length. + # vllm-chat backend counts the image tokens now + + print( + f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, " + f"{total_num_tokens / elapsed_time:.2f} total tokens/s, " + f"{total_output_tokens / elapsed_time:.2f} output tokens/s" + ) + print(f"Total num prompt tokens: {total_prompt_tokens}") + print(f"Total num output tokens: {total_output_tokens}") + + # Output JSON results if specified + if args.output_json: + results = { + "elapsed_time": elapsed_time, + "num_requests": len(requests), + "total_num_tokens": total_num_tokens, + "requests_per_second": len(requests) / elapsed_time, + "tokens_per_second": total_num_tokens / elapsed_time, + } + with open(args.output_json, "w") as f: + json.dump(results, f, indent=4) + save_to_pytorch_benchmark_format(args, results) + + +def validate_args(args): + """ + Validate command-line arguments. + """ + + # === Deprecation and Defaulting === + if args.dataset is not None: + warnings.warn( + "The '--dataset' argument will be deprecated in the next release. " + "Please use '--dataset-name' and '--dataset-path' instead.", + stacklevel=2, + ) + args.dataset_path = args.dataset + + if not getattr(args, "tokenizer", None): + args.tokenizer = args.model + + # === Backend Validation === + valid_backends = {"vllm", "hf", "mii", "vllm-chat"} + if args.backend not in valid_backends: + raise ValueError(f"Unsupported backend: {args.backend}") + + # === Dataset Configuration === + if not args.dataset and not args.dataset_path: + print("When dataset path is not set, it will default to random dataset") + args.dataset_name = "random" + if args.input_len is None: + raise ValueError("input_len must be provided for a random dataset") + + # === Dataset Name Specific Checks === + # --hf-subset and --hf-split: only used + # when dataset_name is 'hf' + if args.dataset_name != "hf" and ( + getattr(args, "hf_subset", None) is not None + or getattr(args, "hf_split", None) is not None + ): + warnings.warn( + "--hf-subset and --hf-split will be ignored \ + since --dataset-name is not 'hf'.", + stacklevel=2, + ) + elif args.dataset_name == "hf": + if args.dataset_path in ( + VisionArenaDataset.SUPPORTED_DATASET_PATHS.keys() + | ConversationDataset.SUPPORTED_DATASET_PATHS + ): + assert args.backend == "vllm-chat", ( + f"{args.dataset_path} needs to use vllm-chat as the backend." + ) # noqa: E501 + elif args.dataset_path in ( + InstructCoderDataset.SUPPORTED_DATASET_PATHS + | AIMODataset.SUPPORTED_DATASET_PATHS + ): + assert args.backend == "vllm", ( + f"{args.dataset_path} needs to use vllm as the backend." + ) # noqa: E501 + else: + raise ValueError(f"{args.dataset_path} is not supported by hf dataset.") + + # --random-range-ratio: only used when dataset_name is 'random' + if args.dataset_name != "random" and args.random_range_ratio is not None: + warnings.warn( + "--random-range-ratio will be ignored since \ + --dataset-name is not 'random'.", + stacklevel=2, + ) + + # --prefix-len: only used when dataset_name is 'random', 'sonnet', or not + # set. 
+ if ( + args.dataset_name not in {"random", "sonnet", None} + and args.prefix_len is not None + ): + warnings.warn( + "--prefix-len will be ignored since --dataset-name\ + is not 'random', 'sonnet', or not set.", + stacklevel=2, + ) + + # === LoRA Settings === + if getattr(args, "enable_lora", False) and args.backend != "vllm": + raise ValueError("LoRA benchmarking is only supported for vLLM backend") + if getattr(args, "enable_lora", False) and args.lora_path is None: + raise ValueError("LoRA path must be provided when enable_lora is True") + + # === Backend-specific Validations === + if args.backend == "hf" and args.hf_max_batch_size is None: + raise ValueError("HF max batch size is required for HF backend") + if args.backend != "hf" and args.hf_max_batch_size is not None: + raise ValueError("HF max batch size is only for HF backend.") + + if ( + args.backend in {"hf", "mii"} + and getattr(args, "quantization", None) is not None + ): + raise ValueError("Quantization is only for vLLM backend.") + + if args.backend == "mii" and args.dtype != "auto": + raise ValueError("dtype must be auto for MII backend.") + if args.backend == "mii" and args.n != 1: + raise ValueError("n must be 1 for MII backend.") + if args.backend == "mii" and args.tokenizer != args.model: + raise ValueError("Tokenizer must be the same as the model for MII backend.") + + # --data-parallel is not supported currently. + # https://github.com/vllm-project/vllm/issues/16222 + if args.data_parallel_size > 1: + raise ValueError( + "Data parallel is not supported in offline benchmark, \ + please use benchmark serving instead" + ) + + +if __name__ == "__main__": + parser = FlexibleArgumentParser(description="Benchmark the throughput.") + parser.add_argument( + "--backend", + type=str, + choices=["vllm", "hf", "mii", "vllm-chat"], + default="vllm", + ) + parser.add_argument( + "--dataset-name", + type=str, + choices=["sharegpt", "random", "sonnet", "burstgpt", "hf"], + help="Name of the dataset to benchmark on.", + default="sharegpt", + ) + parser.add_argument( + "--dataset", + type=str, + default=None, + help="Path to the ShareGPT dataset, will be deprecated in\ + the next release. The dataset is expected to " + "be a json in form of list[dict[..., conversations: " + "list[dict[..., value: ]]]]", + ) + parser.add_argument( + "--dataset-path", type=str, default=None, help="Path to the dataset" + ) + parser.add_argument( + "--input-len", + type=int, + default=None, + help="Input prompt length for each request", + ) + parser.add_argument( + "--output-len", + type=int, + default=None, + help="Output length for each request. Overrides the " + "output length from the dataset.", + ) + parser.add_argument( + "--n", type=int, default=1, help="Number of generated sequences per prompt." + ) + parser.add_argument( + "--num-prompts", type=int, default=1000, help="Number of prompts to process." 
+ ) + parser.add_argument( + "--hf-max-batch-size", + type=int, + default=None, + help="Maximum batch size for HF backend.", + ) + parser.add_argument( + "--output-json", + type=str, + default=None, + help="Path to save the throughput results in JSON format.", + ) + parser.add_argument( + "--async-engine", + action="store_true", + default=False, + help="Use vLLM async engine rather than LLM class.", + ) + parser.add_argument( + "--disable-frontend-multiprocessing", + action="store_true", + default=False, + help="Disable decoupled async engine frontend.", + ) + parser.add_argument( + "--disable-detokenize", + action="store_true", + help=( + "Do not detokenize the response (i.e. do not include " + "detokenization time in the measurement)" + ), + ) + # LoRA + parser.add_argument( + "--lora-path", + type=str, + default=None, + help="Path to the LoRA adapters to use. This can be an absolute path, " + "a relative path, or a Hugging Face model identifier.", + ) + parser.add_argument( + "--prefix-len", + type=int, + default=None, + help=f"Number of prefix tokens to be used in RandomDataset " + "and SonnetDataset. For RandomDataset, the total input " + "length is the sum of prefix-len (default: " + f"{RandomDataset.DEFAULT_PREFIX_LEN}) and a random context length " + "sampled from [input_len * (1 - range_ratio), " + "input_len * (1 + range_ratio)]. For SonnetDataset, " + f"prefix_len (default: {SonnetDataset.DEFAULT_PREFIX_LEN}) " + "controls how much of the input is fixed lines versus " + "random lines, but the total input length remains approximately " + "input_len tokens.", + ) + # random dataset + parser.add_argument( + "--random-range-ratio", + type=float, + default=None, + help=f"Range ratio (default : {RandomDataset.DEFAULT_RANGE_RATIO}) " + "for sampling input/output length, " + "used only for RandomDataset. Must be in the range [0, 1) to " + "define a symmetric sampling range " + "[length * (1 - range_ratio), length * (1 + range_ratio)].", + ) + + # hf dtaset + parser.add_argument( + "--hf-subset", type=str, default=None, help="Subset of the HF dataset." + ) + parser.add_argument( + "--hf-split", type=str, default=None, help="Split of the HF dataset." 
+ ) + + parser = AsyncEngineArgs.add_cli_args(parser) + args = parser.parse_args() + if args.tokenizer is None: + args.tokenizer = args.model + validate_args(args) + main(args) diff --git a/benchmarks/benchmark_utils.py b/benchmarks/benchmark_utils.py new file mode 100644 index 000000000..283f938df --- /dev/null +++ b/benchmarks/benchmark_utils.py @@ -0,0 +1,74 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import argparse +import json +import math +import os +from typing import Any + + +def convert_to_pytorch_benchmark_format( + args: argparse.Namespace, metrics: dict[str, list], extra_info: dict[str, Any] +) -> list: + """ + Save the benchmark results in the format used by PyTorch OSS benchmark with + on metric per record + https://github.com/pytorch/pytorch/wiki/How-to-integrate-with-PyTorch-OSS-benchmark-database + """ + records = [] + if not os.environ.get("SAVE_TO_PYTORCH_BENCHMARK_FORMAT", False): + return records + + for name, benchmark_values in metrics.items(): + record = { + "benchmark": { + "name": "vLLM benchmark", + "extra_info": { + "args": vars(args), + }, + }, + "model": { + "name": args.model, + }, + "metric": { + "name": name, + "benchmark_values": benchmark_values, + "extra_info": extra_info, + }, + } + + tp = record["benchmark"]["extra_info"]["args"].get("tensor_parallel_size") + # Save tensor_parallel_size parameter if it's part of the metadata + if not tp and "tensor_parallel_size" in extra_info: + record["benchmark"]["extra_info"]["args"]["tensor_parallel_size"] = ( + extra_info["tensor_parallel_size"] + ) + + records.append(record) + + return records + + +class InfEncoder(json.JSONEncoder): + def clear_inf(self, o: Any): + if isinstance(o, dict): + return {k: self.clear_inf(v) for k, v in o.items()} + elif isinstance(o, list): + return [self.clear_inf(v) for v in o] + elif isinstance(o, float) and math.isinf(o): + return "inf" + return o + + def iterencode(self, o: Any, *args, **kwargs) -> Any: + return super().iterencode(self.clear_inf(o), *args, **kwargs) + + +def write_to_json(filename: str, records: list) -> None: + with open(filename, "w") as f: + json.dump( + records, + f, + cls=InfEncoder, + default=lambda o: f"<{type(o).__name__} object is not JSON serializable>", + ) diff --git a/benchmarks/cutlass_benchmarks/sparse_benchmarks.py b/benchmarks/cutlass_benchmarks/sparse_benchmarks.py new file mode 100644 index 000000000..9ec270bbd --- /dev/null +++ b/benchmarks/cutlass_benchmarks/sparse_benchmarks.py @@ -0,0 +1,516 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import argparse +import copy +import itertools +import pickle as pkl +import time +from collections.abc import Iterable +from typing import Callable + +import torch +import torch.utils.benchmark as TBenchmark +from torch.utils.benchmark import Measurement as TMeasurement +from utils import make_rand_sparse_tensors +from weight_shapes import WEIGHT_SHAPES + +from vllm import _custom_ops as ops +from vllm.utils import FlexibleArgumentParser + +DEFAULT_MODELS = list(WEIGHT_SHAPES.keys()) +DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512] +DEFAULT_TP_SIZES = [1] + + +# bench +def bench_fn( + label: str, sub_label: str, description: str, fn: Callable, *args, **kwargs +) -> TMeasurement: + min_run_time = 1 + + globals = { + "args": args, + "kwargs": kwargs, + "fn": fn, + } + return TBenchmark.Timer( + stmt="fn(*args, **kwargs)", + globals=globals, + label=label, + 
sub_label=sub_label, + description=description, + ).blocked_autorange(min_run_time=min_run_time) + + +def bench_int8( + dtype: torch.dtype, m: int, k: int, n: int, label: str, sub_label: str +) -> Iterable[TMeasurement]: + assert dtype == torch.int8 + b_compressed, e, a, b = make_rand_sparse_tensors(torch.int8, m, n, k) + scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32) + scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32) + bias = torch.zeros((n,), device="cuda", dtype=torch.bfloat16) + + out = ops.cutlass_scaled_sparse_mm( + a, b_compressed, e, scale_a, scale_b, torch.bfloat16 + ) + out_ref = ops.cutlass_scaled_mm(a, b, scale_a, scale_b, torch.bfloat16) + + if not torch.allclose(out, out_ref): + print("Incorrect results") + print(out) + print(out_ref) + else: + print("Correct results") + + timers = [] + # pytorch impl - bfloat16 + timers.append( + bench_fn( + label, + sub_label, + "pytorch_bf16_bf16_bf16_matmul-no-scales", + torch.mm, + a.to(dtype=torch.bfloat16), + b.to(dtype=torch.bfloat16), + ) + ) + + # pytorch impl - float16 + timers.append( + bench_fn( + label, + sub_label, + "pytorch_fp16_fp16_fp16_matmul-no-scales", + torch.mm, + a.to(dtype=torch.float16), + b.to(dtype=torch.float16), + ) + ) + + # cutlass impl + timers.append( + bench_fn( + label, + sub_label, + "cutlass_i8_i8_bf16_scaled_mm", + ops.cutlass_scaled_mm, + a, + b, + scale_a, + scale_b, + torch.bfloat16, + ) + ) + + # cutlass with bias + timers.append( + bench_fn( + label, + sub_label, + "cutlass_i8_i8_bf16_scaled_mm_bias", + ops.cutlass_scaled_mm, + a, + b, + scale_a, + scale_b, + torch.bfloat16, + bias, + ) + ) + + # cutlass sparse impl + timers.append( + bench_fn( + label, + sub_label, + "cutlass_i8_i8_bf16_scaled_sparse_mm", + ops.cutlass_scaled_sparse_mm, + a, + b_compressed, + e, + scale_a, + scale_b, + torch.bfloat16, + ) + ) + + # cutlass sparse with bias + timers.append( + bench_fn( + label, + sub_label, + "cutlass_i8_i8_bf16_scaled_sparse_mm_bias", + ops.cutlass_scaled_sparse_mm, + a, + b_compressed, + e, + scale_a, + scale_b, + torch.bfloat16, + bias, + ) + ) + + return timers + + +def bench_fp8( + dtype: torch.dtype, m: int, k: int, n: int, label: str, sub_label: str +) -> Iterable[TMeasurement]: + assert dtype == torch.float8_e4m3fn + b_compressed, e, a, b = make_rand_sparse_tensors(torch.float8_e4m3fn, m, n, k) + scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32) + scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32) + bias = torch.zeros((n,), device="cuda", dtype=torch.bfloat16) + + out = ops.cutlass_scaled_sparse_mm( + a, b_compressed, e, scale_a, scale_b, torch.bfloat16 + ) + out_ref = ops.cutlass_scaled_mm(a, b, scale_a, scale_b, torch.bfloat16) + + if not torch.allclose(out, out_ref): + print("Incorrect results") + print(out) + print(out_ref) + else: + print("Correct results") + + timers = [] + + # pytorch impl w. 
bf16 + timers.append( + bench_fn( + label, + sub_label, + "pytorch_bf16_bf16_bf16_matmul-no-scales", + torch.mm, + a.to(dtype=torch.bfloat16, device="cuda"), + b.to(dtype=torch.bfloat16, device="cuda"), + ) + ) + + # pytorch impl: bf16 output, without fp8 fast accum + timers.append( + bench_fn( + label, + sub_label, + "pytorch_fp8_fp8_bf16_scaled_mm", + torch._scaled_mm, + a, + b, + scale_a=scale_a, + scale_b=scale_b, + out_dtype=torch.bfloat16, + ) + ) + + # pytorch impl: bf16 output, with fp8 fast accum + timers.append( + bench_fn( + label, + sub_label, + "pytorch_fp8_fp8_bf16_scaled_mm_fast_accum", + torch._scaled_mm, + a, + b, + scale_a=scale_a, + scale_b=scale_b, + out_dtype=torch.bfloat16, + use_fast_accum=True, + ) + ) + + # pytorch impl: fp16 output, without fp8 fast accum + timers.append( + bench_fn( + label, + sub_label, + "pytorch_fp8_fp8_fp16_scaled_mm", + torch._scaled_mm, + a, + b, + scale_a=scale_a, + scale_b=scale_b, + out_dtype=torch.float16, + ) + ) + + # pytorch impl: fp16 output, with fp8 fast accum + timers.append( + bench_fn( + label, + sub_label, + "pytorch_fp8_fp8_fp16_scaled_mm_fast_accum", + torch._scaled_mm, + a, + b, + scale_a=scale_a, + scale_b=scale_b, + out_dtype=torch.float16, + use_fast_accum=True, + ) + ) + + # cutlass impl: bf16 output + timers.append( + bench_fn( + label, + sub_label, + "cutlass_fp8_fp8_bf16_scaled_mm", + ops.cutlass_scaled_mm, + a, + b, + scale_a, + scale_b, + torch.bfloat16, + ) + ) + + # cutlass impl: bf16 output + timers.append( + bench_fn( + label, + sub_label, + "cutlass_fp8_fp8_bf16_scaled_sparse_mm", + ops.cutlass_scaled_sparse_mm, + a, + b_compressed, + e, + scale_a, + scale_b, + torch.bfloat16, + ) + ) + + # cutlass impl: fp16 output + timers.append( + bench_fn( + label, + sub_label, + "cutlass_fp8_fp8_fp16_scaled_sparse_mm", + ops.cutlass_scaled_sparse_mm, + a, + b_compressed, + e, + scale_a, + scale_b, + torch.float16, + ) + ) + + # cutlass impl: bf16 output, with bias + timers.append( + bench_fn( + label, + sub_label, + "cutlass_fp8_fp8_bf16_scaled_sparse_mm_bias", + ops.cutlass_scaled_sparse_mm, + a, + b_compressed, + e, + scale_a, + scale_b, + torch.bfloat16, + bias, + ) + ) + + # cutlass impl: fp16 output, with bias + timers.append( + bench_fn( + label, + sub_label, + "cutlass_fp8_fp8_fp16_scaled_sparse_mm_bias", + ops.cutlass_scaled_sparse_mm, + a, + b_compressed, + e, + scale_a, + scale_b, + torch.float16, + bias.to(dtype=torch.float16), + ) + ) + + return timers + + +def bench( + dtype: torch.dtype, m: int, k: int, n: int, label: str, sub_label: str +) -> Iterable[TMeasurement]: + if dtype == torch.int8: + return bench_int8(dtype, m, k, n, label, sub_label) + if dtype == torch.float8_e4m3fn: + return bench_fp8(dtype, m, k, n, label, sub_label) + raise ValueError("unsupported type") + + +# runner +def print_timers(timers: Iterable[TMeasurement]): + compare = TBenchmark.Compare(timers) + compare.print() + + +def run( + dtype: torch.dtype, MKNs: Iterable[tuple[int, int, int]] +) -> Iterable[TMeasurement]: + results = [] + for m, k, n in MKNs: + timers = bench(dtype, m, k, n, f"scaled-{dtype}-gemm", f"MKN=({m}x{k}x{n})") + print_timers(timers) + results.extend(timers) + + return results + + +# output makers +def make_output( + data: Iterable[TMeasurement], + MKNs: Iterable[tuple[int, int, int]], + base_description: str, + timestamp=None, +): + print(f"== All Results {base_description} ====") + print_timers(data) + + # pickle all the results + timestamp = int(time.time()) if timestamp is None else timestamp + with 
open(f"{base_description}-{timestamp}.pkl", "wb") as f: + pkl.dump(data, f) + + +# argparse runners + + +def run_square_bench(args): + dim_sizes = list(range(args.dim_start, args.dim_end + 1, args.dim_increment)) + MKNs = list(zip(dim_sizes, dim_sizes, dim_sizes)) + data = run(args.dtype, MKNs) + + make_output(data, MKNs, f"square_bench-{args.dtype}") + + +def run_range_bench(args): + dim_sizes = list(range(args.dim_start, args.dim_end, args.dim_increment)) + n = len(dim_sizes) + Ms = [args.m_constant] * n if args.m_constant is not None else dim_sizes + Ks = [args.k_constant] * n if args.k_constant is not None else dim_sizes + Ns = [args.n_constant] * n if args.n_constant is not None else dim_sizes + MKNs = list(zip(Ms, Ks, Ns)) + data = run(args.dtype, MKNs) + + make_output(data, MKNs, f"range_bench-{args.dtype}") + + +def run_model_bench(args): + print("Benchmarking models:") + for i, model in enumerate(args.models): + print(f"[{i}] {model}") + + def model_shapes(model_name: str, tp_size: int) -> list[tuple[int, int]]: + KNs = [] + for KN, tp_split_dim in copy.deepcopy(WEIGHT_SHAPES[model_name]): + KN[tp_split_dim] = KN[tp_split_dim] // tp_size + KNs.append(KN) + return KNs + + model_bench_data = [] + models_tps = list(itertools.product(args.models, args.tp_sizes)) + for model, tp_size in models_tps: + Ms = args.batch_sizes + KNs = model_shapes(model, tp_size) + MKNs = [] + for m in Ms: + for k, n in KNs: + MKNs.append((m, k, n)) + + data = run(args.dtype, MKNs) + model_bench_data.append(data) + + # Print all results + for data, model_tp in zip(model_bench_data, models_tps): + model, tp_size = model_tp + print(f"== Results {args.dtype} {model}-TP{tp_size} ====") + print_timers(data) + + timestamp = int(time.time()) + + all_data = [] + for d in model_bench_data: + all_data.extend(d) + # pickle all data + with open(f"model_bench-{args.dtype}-{timestamp}.pkl", "wb") as f: + pkl.dump(all_data, f) + + +if __name__ == "__main__": + + def to_torch_dtype(dt): + if dt == "int8": + return torch.int8 + if dt == "fp8": + return torch.float8_e4m3fn + raise ValueError("unsupported dtype") + + parser = FlexibleArgumentParser( + description=""" +Benchmark Cutlass GEMM. + + To run square GEMMs: + python3 ./benchmarks/cutlass_benchmarks/sparse_benchmarks.py --dtype fp8 square_bench --dim-start 128 --dim-end 512 --dim-increment 64 + + To run constant N and K and sweep M: + python3 ./benchmarks/cutlass_benchmarks/sparse_benchmarks.py --dtype fp8 range_bench --dim-start 128 --dim-end 512 --dim-increment 64 --n-constant 16384 --k-constant 16384 + + To run dimensions from a model: + python3 ./benchmarks/cutlass_benchmarks/sparse_benchmarks.py --dtype fp8 model_bench --models meta-llama/Llama-2-7b-hf --batch-sizes 16 --tp-sizes 1 + + Output: + - a .pkl file, that is a list of raw torch.benchmark.utils.Measurements for the pytorch and cutlass implementations for the various GEMMs. 
+ """, # noqa: E501 + formatter_class=argparse.RawTextHelpFormatter, + ) + + parser.add_argument( + "--dtype", + type=to_torch_dtype, + required=True, + help="Available options are ['int8', 'fp8']", + ) + subparsers = parser.add_subparsers(dest="cmd") + + square_parser = subparsers.add_parser("square_bench") + square_parser.add_argument("--dim-start", type=int, required=True) + square_parser.add_argument("--dim-end", type=int, required=True) + square_parser.add_argument("--dim-increment", type=int, required=True) + square_parser.set_defaults(func=run_square_bench) + + range_parser = subparsers.add_parser("range_bench") + range_parser.add_argument("--dim-start", type=int, required=True) + range_parser.add_argument("--dim-end", type=int, required=True) + range_parser.add_argument("--dim-increment", type=int, required=True) + range_parser.add_argument("--m-constant", type=int, default=None) + range_parser.add_argument("--n-constant", type=int, default=None) + range_parser.add_argument("--k-constant", type=int, default=None) + range_parser.set_defaults(func=run_range_bench) + + model_parser = subparsers.add_parser("model_bench") + model_parser.add_argument( + "--models", + nargs="+", + type=str, + default=DEFAULT_MODELS, + choices=WEIGHT_SHAPES.keys(), + ) + model_parser.add_argument( + "--tp-sizes", nargs="+", type=int, default=DEFAULT_TP_SIZES + ) + model_parser.add_argument( + "--batch-sizes", nargs="+", type=int, default=DEFAULT_BATCH_SIZES + ) + model_parser.set_defaults(func=run_model_bench) + + args = parser.parse_args() + args.func(args) diff --git a/benchmarks/cutlass_benchmarks/utils.py b/benchmarks/cutlass_benchmarks/utils.py new file mode 100644 index 000000000..b4f3c6bf9 --- /dev/null +++ b/benchmarks/cutlass_benchmarks/utils.py @@ -0,0 +1,100 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +# Cutlass bench utils +from collections.abc import Iterable + +import torch + +import vllm._custom_ops as ops + + +def to_fp8(tensor: torch.Tensor) -> torch.Tensor: + finfo = torch.finfo(torch.float8_e4m3fn) + return torch.round(tensor.clamp(min=finfo.min, max=finfo.max)).to( + dtype=torch.float8_e4m3fn + ) + + +def to_int8(tensor: torch.Tensor) -> torch.Tensor: + return torch.round(tensor.clamp(min=-128, max=127)).to(dtype=torch.int8) + + +def to_bf16(tensor: torch.Tensor) -> torch.Tensor: + return tensor.to(dtype=torch.bfloat16) + + +def to_fp16(tensor: torch.Tensor) -> torch.Tensor: + return tensor.to(dtype=torch.float16) + + +def make_rand_tensors( + dtype: torch.dtype, m: int, n: int, k: int +) -> tuple[torch.Tensor, torch.Tensor]: + a = torch.randn((m, k), device="cuda") * 5 + b = torch.randn((n, k), device="cuda").t() * 5 + + if dtype == torch.int8: + return to_int8(a), to_int8(b) + if dtype == torch.float8_e4m3fn: + return to_fp8(a), to_fp8(b) + + raise ValueError("unsupported dtype") + + +def prune_to_2_4(tensor): + # Reshape tensor to [N, 4] where N is number of groups of 4 + original_shape = tensor.shape + reshaped = tensor.reshape(-1, 4) + + # Get indices of top 2 absolute values in each group of 4 + _, indices = torch.topk(torch.abs(reshaped), k=2, dim=1) + + # Create binary mask + mask = torch.zeros_like(reshaped) + mask.scatter_(dim=1, index=indices, src=torch.ones_like(indices, dtype=mask.dtype)) + + # Apply mask and reshape back + pruned = reshaped * mask + + # Turn all -0.0 to 0.0 + pruned[pruned == -0.0] = 0.0 + + return pruned.reshape(original_shape) + + +def make_rand_sparse_tensors( + dtype: torch.dtype, m: 
int, n: int, k: int +) -> tuple[torch.Tensor, torch.Tensor]: + a = torch.randn((m, k), device="cuda") * 5 + b = torch.randn((n, k), device="cuda").t() * 5 + + b = prune_to_2_4(b.t()).t() + + if dtype == torch.int8: + a, b = to_int8(a), to_int8(b) + elif dtype == torch.float8_e4m3fn: + a, b = to_fp8(a), to_fp8(b) + elif dtype == torch.float16: + a, b = to_fp16(a), to_fp16(b) + elif dtype == torch.bfloat16: + a, b = to_bf16(a), to_bf16(b) + else: + raise ValueError("unsupported dtype") + + b_compressed, e = ops.cutlass_sparse_compress(b.t()) + + # Compressed B, Metadata, Original A, B + return b_compressed, e, a, b + + +def make_n_rand_sparse_tensors( + num_tensors: int, dtype: torch.dtype, m: int, n: int, k: int +) -> tuple[Iterable[torch.Tensor], Iterable[torch.Tensor]]: + ABs = [] + for _ in range(num_tensors): + b_comp, e, a, b = make_rand_sparse_tensors(dtype, m, n, k) + if b_comp is not None: + ABs.append(make_rand_sparse_tensors(dtype, m, n, k)) + BComps, Es, As, Bs = zip(*ABs) + return list(BComps), list(Es), list(As), list(Bs) diff --git a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py new file mode 100644 index 000000000..cec422e8d --- /dev/null +++ b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py @@ -0,0 +1,377 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import argparse +import copy +import itertools +import pickle as pkl +import time +from collections.abc import Iterable +from typing import Callable, Optional + +import torch +import torch.utils.benchmark as TBenchmark +from torch.utils.benchmark import Measurement as TMeasurement +from utils import make_rand_tensors +from weight_shapes import WEIGHT_SHAPES + +from vllm import _custom_ops as ops +from vllm.model_executor.layers.quantization.utils.fp8_utils import ( + w8a8_block_fp8_matmul, +) +from vllm.utils import FlexibleArgumentParser + +DEFAULT_MODELS = list(WEIGHT_SHAPES.keys()) +DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512] +DEFAULT_TP_SIZES = [1] + + +# bench +def bench_fn( + label: str, sub_label: str, description: str, fn: Callable, *args, **kwargs +) -> TMeasurement: + min_run_time = 1 + + globals = { + "args": args, + "kwargs": kwargs, + "fn": fn, + } + return TBenchmark.Timer( + stmt="fn(*args, **kwargs)", + globals=globals, + label=label, + sub_label=sub_label, + description=description, + ).blocked_autorange(min_run_time=min_run_time) + + +def bench_int8( + dtype: torch.dtype, + m: int, + k: int, + n: int, + label: str, + sub_label: str, + bench_kernels: Optional[list[str]] = None, +) -> Iterable[TMeasurement]: + """Benchmark INT8-based kernels.""" + assert dtype == torch.int8 + a, b = make_rand_tensors(torch.int8, m, n, k) + scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32) + scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32) + bias = torch.zeros((n,), device="cuda", dtype=torch.bfloat16) + azp = torch.zeros((m,), device="cuda", dtype=torch.int32) + azp_adj = torch.zeros((n,), device="cuda", dtype=torch.int32) + + bench_fns = { + "pytorch_bf16_bf16_bf16_matmul-no-scales": lambda: torch.mm( + a.to(dtype=torch.bfloat16), b.to(dtype=torch.bfloat16) + ), + "pytorch_fp16_fp16_fp16_matmul-no-scales": lambda: torch.mm( + a.to(dtype=torch.float16), b.to(dtype=torch.float16) + ), + "cutlass_i8_i8_bf16_scaled_mm": lambda: ops.cutlass_scaled_mm( + a, b, scale_a, scale_b, torch.bfloat16 + ), + "cutlass_i8_i8_bf16_scaled_mm_bias": lambda: ops.cutlass_scaled_mm( + a, b, 
scale_a, scale_b, torch.bfloat16, bias + ), + "cutlass_i8_i8_bf16_scaled_mm_azp": lambda: ops.cutlass_scaled_mm_azp( + a, b, scale_a, scale_b, torch.bfloat16, azp_adj + ), + "cutlass_i8_i8_bf16_scaled_mm_azp_bias": lambda: ops.cutlass_scaled_mm_azp( + a, b, scale_a, scale_b, torch.bfloat16, azp_adj, None, bias + ), + "cutlass_i8_i8_bf16_scaled_mm_azp_pt": lambda: ops.cutlass_scaled_mm_azp( + a, b, scale_a, scale_b, torch.bfloat16, azp_adj, azp + ), + "cutlass_i8_i8_bf16_scaled_mm_azp_pt_bias": lambda: ops.cutlass_scaled_mm_azp( + a, b, scale_a, scale_b, torch.bfloat16, azp_adj, azp, bias + ), + } + + timers = [] + for name, fn in bench_fns.items(): + # If bench_kernels is None, run all. Otherwise, run only exact matches. + if bench_kernels is None or name in bench_kernels: + print(f"Running {name}") + timers.append(bench_fn(label, sub_label, name, fn)) + + return timers + + +def bench_fp8( + dtype: torch.dtype, + m: int, + k: int, + n: int, + label: str, + sub_label: str, + bench_kernels: Optional[list[str]] = None, +) -> Iterable[TMeasurement]: + """Benchmark FP8-based kernels.""" + assert dtype == torch.float8_e4m3fn + a, b = make_rand_tensors(torch.float8_e4m3fn, m, n, k) + a_cont = a.contiguous() + scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32) + scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32) + + def ceil_div(x: int, y: int) -> int: + return (x + y - 1) // y + + block_scale_a = torch.rand( + (m, ceil_div(k, 128)), device="cuda", dtype=torch.float32 + ) + block_scale_b = torch.rand( + ceil_div(k, 128), ceil_div(n, 128), device="cuda", dtype=torch.float32 + ) + block_scale_a_M_major = block_scale_a.t().contiguous().t() + block_scale_b_K_major = block_scale_b.t().contiguous().t() + bias = torch.zeros((n,), device="cuda", dtype=torch.bfloat16) + + print(m, k, n) + + bench_fns = { + "pytorch_bf16_bf16_bf16_matmul-no-scales": lambda: torch.mm( + a.to(dtype=torch.bfloat16), b.to(dtype=torch.bfloat16) + ), + "pytorch_fp16_fp16_fp16_matmul-no-scales": lambda: torch.mm( + a.to(dtype=torch.float16), b.to(dtype=torch.float16) + ), + "pytorch_fp8_fp8_fp16_scaled_mm": lambda: torch._scaled_mm( + a, b, scale_a, scale_b, out_dtype=torch.float16 + ), + "pytorch_fp8_fp8_fp16_scaled_mm_fast_accum": lambda: torch._scaled_mm( + a, b, scale_a, scale_b, out_dtype=torch.float16, use_fast_accum=True + ), + "pytorch_fp8_fp8_bf16_scaled_mm": lambda: torch._scaled_mm( + a, b, scale_a, scale_b, out_dtype=torch.bfloat16 + ), + "pytorch_fp8_fp8_bf16_scaled_mm_fast_accum": lambda: torch._scaled_mm( + a, b, scale_a, scale_b, out_dtype=torch.bfloat16, use_fast_accum=True + ), + "cutlass_fp8_fp8_bf16_scaled_mm": lambda: ops.cutlass_scaled_mm( + a, b, scale_a, scale_b, torch.bfloat16 + ), + "cutlass_fp8_fp8_fp16_scaled_mm": lambda: ops.cutlass_scaled_mm( + a, b, scale_a, scale_b, torch.float16 + ), + "cutlass_fp8_fp8_bf16_scaled_mm_bias": lambda: ops.cutlass_scaled_mm( + a, b, scale_a, scale_b, torch.bfloat16, bias + ), + "cutlass_fp8_fp8_fp16_scaled_mm_bias": lambda: ops.cutlass_scaled_mm( + a, b, scale_a, scale_b, torch.float16, bias.to(dtype=torch.float16) + ), + "triton_fp8_fp8_fp16_scaled_mm_blockwise": lambda: w8a8_block_fp8_matmul( + a_cont, b.t(), block_scale_a, block_scale_b.t(), (128, 128) + ), + "cutlass_fp8_fp8_fp16_scaled_mm_blockwise": lambda: ops.cutlass_scaled_mm( + a, b, block_scale_a_M_major, block_scale_b_K_major, torch.float16 + ), + } + + timers = [] + for name, fn in bench_fns.items(): + # If bench_kernels is None, run all. Otherwise, run only exact matches. 
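+        # e.g. passing --kernels cutlass_fp8_fp8_bf16_scaled_mm benchmarks only
+        # that bench_fns entry; names must match the dict keys exactly.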
+ if bench_kernels is None or name in bench_kernels: + print(f"Running {name}") + timers.append(bench_fn(label, sub_label, name, fn)) + + return timers + + +def bench( + dtype: torch.dtype, + m: int, + k: int, + n: int, + label: str, + sub_label: str, + bench_kernels: Optional[list[str]] = None, +) -> Iterable[TMeasurement]: + if dtype == torch.int8: + return bench_int8(dtype, m, k, n, label, sub_label, bench_kernels) + if dtype == torch.float8_e4m3fn: + return bench_fp8(dtype, m, k, n, label, sub_label, bench_kernels) + raise ValueError("unsupported type") + + +# runner +def print_timers(timers: Iterable[TMeasurement]): + compare = TBenchmark.Compare(timers) + compare.print() + + +def run( + dtype: torch.dtype, + MKNs: Iterable[tuple[int, int, int]], + bench_kernels: Optional[list[str]] = None, +) -> Iterable[TMeasurement]: + results = [] + for m, k, n in MKNs: + timers = bench( + dtype, + m, + k, + n, + f"scaled-{dtype}-gemm", + f"MKN=({m}x{k}x{n})", + bench_kernels=bench_kernels, + ) + print_timers(timers) + results.extend(timers) + return results + + +def make_output( + data: Iterable[TMeasurement], + MKNs: Iterable[tuple[int, int, int]], + base_description: str, + timestamp=None, +): + print(f"== All Results {base_description} ====") + print_timers(data) + + # pickle all the results + timestamp = int(time.time()) if timestamp is None else timestamp + with open(f"{base_description}-{timestamp}.pkl", "wb") as f: + pkl.dump(data, f) + + +def run_square_bench(args): + dim_sizes = list(range(args.dim_start, args.dim_end + 1, args.dim_increment)) + MKNs = list(zip(dim_sizes, dim_sizes, dim_sizes)) + data = run(args.dtype, MKNs, bench_kernels=args.kernels) + make_output(data, MKNs, f"square_bench-{args.dtype}") + + +def run_range_bench(args): + dim_sizes = list(range(args.dim_start, args.dim_end, args.dim_increment)) + n = len(dim_sizes) + Ms = [args.m_constant] * n if args.m_constant is not None else dim_sizes + Ks = [args.k_constant] * n if args.k_constant is not None else dim_sizes + Ns = [args.n_constant] * n if args.n_constant is not None else dim_sizes + MKNs = list(zip(Ms, Ks, Ns)) + data = run(args.dtype, MKNs, bench_kernels=args.kernels) + make_output(data, MKNs, f"range_bench-{args.dtype}") + + +def run_model_bench(args): + print("Benchmarking models:") + for i, model in enumerate(args.models): + print(f"[{i}] {model}") + + def model_shapes(model_name: str, tp_size: int) -> list[tuple[int, int]]: + KNs = [] + for KN, tp_split_dim in copy.deepcopy(WEIGHT_SHAPES[model_name]): + KN[tp_split_dim] = KN[tp_split_dim] // tp_size + KNs.append(KN) + return KNs + + model_bench_data = [] + models_tps = list(itertools.product(args.models, args.tp_sizes)) + for model, tp_size in models_tps: + Ms = args.batch_sizes + KNs = model_shapes(model, tp_size) + MKNs = [] + for m in Ms: + for k, n in KNs: + MKNs.append((m, k, n)) + + data = run(args.dtype, MKNs, bench_kernels=args.kernels) + model_bench_data.append(data) + + # Print all results + for data, model_tp in zip(model_bench_data, models_tps): + model, tp_size = model_tp + print(f"== Results {args.dtype} {model}-TP{tp_size} ====") + print_timers(data) + + timestamp = int(time.time()) + + all_data = [] + for d in model_bench_data: + all_data.extend(d) + # pickle all data + with open(f"model_bench-{args.dtype}-{timestamp}.pkl", "wb") as f: + pkl.dump(all_data, f) + + +if __name__ == "__main__": + + def to_torch_dtype(dt): + if dt == "int8": + return torch.int8 + if dt == "fp8": + return torch.float8_e4m3fn + raise ValueError("unsupported dtype") + 
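+    # Hypothetical example: combine --kernels with a subcommand to benchmark only
+    # selected kernels (list the kernel names before --dtype so the greedy
+    # nargs="+" option does not swallow the subcommand name):
+    #   python3 ./benchmarks/cutlass_benchmarks/w8a8_benchmarks.py \
+    #       --kernels cutlass_fp8_fp8_bf16_scaled_mm pytorch_fp8_fp8_bf16_scaled_mm \
+    #       --dtype fp8 square_bench --dim-start 128 --dim-end 512 --dim-increment 64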
+ parser = FlexibleArgumentParser( + description=""" +Benchmark Cutlass GEMM. + + To run square GEMMs: + python3 ./benchmarks/cutlass_benchmarks/w8a8_benchmarks.py --dtype fp8 square_bench --dim-start 128 --dim-end 512 --dim-increment 64 + + To run constant N and K and sweep M: + python3 ./benchmarks/cutlass_benchmarks/w8a8_benchmarks.py --dtype fp8 range_bench --dim-start 128 --dim-end 512 --dim-increment 64 --n-constant 16384 --k-constant 16384 + + To run dimensions from a model: + python3 ./benchmarks/cutlass_benchmarks/w8a8_benchmarks.py --dtype fp8 model_bench --models meta-llama/Llama-2-7b-hf --batch-sizes 16 --tp-sizes 1 + + Output: + - a .pkl file, that is a list of raw torch.benchmark.utils.Measurements for the pytorch and cutlass implementations for the various GEMMs. + """, # noqa: E501 + formatter_class=argparse.RawTextHelpFormatter, + ) + + parser.add_argument( + "--dtype", + type=to_torch_dtype, + required=True, + help="Available options are ['int8', 'fp8']", + ) + parser.add_argument( + "--kernels", + nargs="+", + type=str, + default=None, + help="Exact names of the kernels to benchmark. If not set, runs all kernels.", + ) + + subparsers = parser.add_subparsers(dest="cmd") + + square_parser = subparsers.add_parser("square_bench") + square_parser.add_argument("--dim-start", type=int, required=True) + square_parser.add_argument("--dim-end", type=int, required=True) + square_parser.add_argument("--dim-increment", type=int, required=True) + square_parser.set_defaults(func=run_square_bench) + + range_parser = subparsers.add_parser("range_bench") + range_parser.add_argument("--dim-start", type=int, required=True) + range_parser.add_argument("--dim-end", type=int, required=True) + range_parser.add_argument("--dim-increment", type=int, required=True) + range_parser.add_argument("--m-constant", type=int, default=None) + range_parser.add_argument("--n-constant", type=int, default=None) + range_parser.add_argument("--k-constant", type=int, default=None) + range_parser.set_defaults(func=run_range_bench) + + model_parser = subparsers.add_parser("model_bench") + model_parser.add_argument( + "--models", + nargs="+", + type=str, + default=DEFAULT_MODELS, + choices=WEIGHT_SHAPES.keys(), + ) + model_parser.add_argument( + "--tp-sizes", nargs="+", type=int, default=DEFAULT_TP_SIZES + ) + model_parser.add_argument( + "--batch-sizes", nargs="+", type=int, default=DEFAULT_BATCH_SIZES + ) + model_parser.set_defaults(func=run_model_bench) + + args = parser.parse_args() + args.func(args) diff --git a/benchmarks/cutlass_benchmarks/weight_shapes.py b/benchmarks/cutlass_benchmarks/weight_shapes.py new file mode 100644 index 000000000..25b96ef56 --- /dev/null +++ b/benchmarks/cutlass_benchmarks/weight_shapes.py @@ -0,0 +1,46 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +# Weight Shapes are in the format +# ([K, N], TP_SPLIT_DIM) +# Example: +# A shape of ([14336, 4096], 0) indicates the following GEMM shape, +# - TP1 : K = 14336, N = 4096 +# - TP2 : K = 7168, N = 4096 +# A shape of ([4096, 6144], 1) indicates the following GEMM shape, +# - TP1 : K = 4096, N = 6144 +# - TP4 : K = 4096, N = 1536 + +# TP1 shapes +WEIGHT_SHAPES = { + "mistralai/Mistral-7B-v0.1": [ + ([4096, 6144], 1), + ([4096, 4096], 0), + ([4096, 28672], 1), + ([14336, 4096], 0), + ], + "meta-llama/Llama-2-7b-hf": [ + ([4096, 12288], 1), + ([4096, 4096], 0), + ([4096, 22016], 1), + ([11008, 4096], 0), + ], + "meta-llama/Llama-3-8b": [ + ([4096, 6144], 1), + ([4096, 
4096], 0), + ([4096, 28672], 1), + ([14336, 4096], 0), + ], + "meta-llama/Llama-2-13b-hf": [ + ([5120, 15360], 1), + ([5120, 5120], 0), + ([5120, 27648], 1), + ([13824, 5120], 0), + ], + "meta-llama/Llama-2-70b-hf": [ + ([8192, 10240], 1), + ([8192, 8192], 0), + ([8192, 57344], 1), + ([28672, 8192], 0), + ], +} diff --git a/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh b/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh new file mode 100644 index 000000000..94999630b --- /dev/null +++ b/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh @@ -0,0 +1,145 @@ +#!/bin/bash + +# benchmark the overhead of disaggregated prefill. +# methodology: +# - send all request to prefill vLLM instance. It will buffer KV cache. +# - then send all request to decode instance. +# - The TTFT of decode instance is the overhead. + +set -ex + +kill_gpu_processes() { + # kill all processes on GPU. + pgrep pt_main_thread | xargs -r kill -9 + pgrep python3 | xargs -r kill -9 + sleep 10 + + # remove vllm config file + rm -rf ~/.config/vllm + + # Print the GPU memory usage + # so that we know if all GPU processes are killed. + gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0) + # The memory usage should be 0 MB. + echo "GPU 0 Memory Usage: $gpu_memory_usage MB" +} + +wait_for_server() { + # wait for vllm server to start + # return 1 if vllm server crashes + local port=$1 + timeout 1200 bash -c " + until curl -s localhost:${port}/v1/completions > /dev/null; do + sleep 1 + done" && return 0 || return 1 +} + + +benchmark() { + + export VLLM_LOGGING_LEVEL=DEBUG + export VLLM_HOST_IP=$(hostname -I | awk '{print $1}') + + # compare chunked prefill with disaggregated prefill + + results_folder="./results" + model="meta-llama/Meta-Llama-3.1-8B-Instruct" + dataset_name="sonnet" + dataset_path="../sonnet_4x.txt" + num_prompts=10 + qps=$1 + prefix_len=50 + input_len=2048 + output_len=$2 + + + CUDA_VISIBLE_DEVICES=0 python3 \ + -m vllm.entrypoints.openai.api_server \ + --model $model \ + --port 8100 \ + --max-model-len 10000 \ + --gpu-memory-utilization 0.6 \ + --kv-transfer-config \ + '{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' & + + + CUDA_VISIBLE_DEVICES=1 python3 \ + -m vllm.entrypoints.openai.api_server \ + --model $model \ + --port 8200 \ + --max-model-len 10000 \ + --gpu-memory-utilization 0.6 \ + --kv-transfer-config \ + '{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' & + + wait_for_server 8100 + wait_for_server 8200 + + # let the prefill instance finish prefill + python3 ../benchmark_serving.py \ + --backend vllm \ + --model $model \ + --dataset-name $dataset_name \ + --dataset-path $dataset_path \ + --sonnet-input-len $input_len \ + --sonnet-output-len "$output_len" \ + --sonnet-prefix-len $prefix_len \ + --num-prompts $num_prompts \ + --port 8100 \ + --save-result \ + --result-dir $results_folder \ + --result-filename disagg_prefill_tp1.json \ + --request-rate "inf" + + + # send the request to decode. + # The TTFT of this command will be the overhead of disagg prefill impl. 
+ python3 ../benchmark_serving.py \ + --backend vllm \ + --model $model \ + --dataset-name $dataset_name \ + --dataset-path $dataset_path \ + --sonnet-input-len $input_len \ + --sonnet-output-len "$output_len" \ + --sonnet-prefix-len $prefix_len \ + --num-prompts $num_prompts \ + --port 8200 \ + --save-result \ + --result-dir $results_folder \ + --result-filename disagg_prefill_tp1_overhead.json \ + --request-rate "$qps" + kill_gpu_processes + +} + + +main() { + + (which wget && which curl) || (apt-get update && apt-get install -y wget curl) + (which jq) || (apt-get -y install jq) + (which socat) || (apt-get -y install socat) + + pip install quart httpx datasets + + cd "$(dirname "$0")" + + cd .. + # create sonnet-4x.txt + echo "" > sonnet_4x.txt + for _ in {1..4} + do + cat sonnet.txt >> sonnet_4x.txt + done + cd disagg_benchmarks + + rm -rf results + mkdir results + + default_qps=1 + default_output_len=1 + benchmark $default_qps $default_output_len + +} + + +main "$@" diff --git a/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh b/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh new file mode 100644 index 000000000..eb5d891d0 --- /dev/null +++ b/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh @@ -0,0 +1,163 @@ +#!/bin/bash + +# Requirement: 2x GPUs. + + +# Model: meta-llama/Meta-Llama-3.1-8B-Instruct +# Query: 1024 input tokens, 6 output tokens, QPS 2/4/6/8, 100 requests +# Resource: 2x GPU +# Approaches: +# 2. Chunked prefill: 2 vllm instance with tp=4, equivalent to 1 tp=4 instance with QPS 4 +# 3. Disaggregated prefill: 1 prefilling instance and 1 decoding instance +# Prefilling instance: max_output_token=1 +# Decoding instance: force the input tokens be the same across requests to bypass prefilling + +set -ex + +kill_gpu_processes() { + # kill all processes on GPU. 
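+  # This also covers the two vLLM API servers and the proxy, freeing ports
+  # 8000/8100/8200 before the next configuration is launched.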
+ pgrep pt_main_thread | xargs -r kill -9 + pgrep python3 | xargs -r kill -9 + for port in 8000 8100 8200; do lsof -t -i:$port | xargs -r kill -9; done + sleep 1 +} + +wait_for_server() { + # wait for vllm server to start + # return 1 if vllm server crashes + local port=$1 + timeout 1200 bash -c " + until curl -s localhost:${port}/v1/completions > /dev/null; do + sleep 1 + done" && return 0 || return 1 +} + + +launch_chunked_prefill() { + model="meta-llama/Meta-Llama-3.1-8B-Instruct" + # disagg prefill + CUDA_VISIBLE_DEVICES=0 python3 \ + -m vllm.entrypoints.openai.api_server \ + --model $model \ + --port 8100 \ + --max-model-len 10000 \ + --enable-chunked-prefill \ + --gpu-memory-utilization 0.6 & + CUDA_VISIBLE_DEVICES=1 python3 \ + -m vllm.entrypoints.openai.api_server \ + --model $model \ + --port 8200 \ + --max-model-len 10000 \ + --enable-chunked-prefill \ + --gpu-memory-utilization 0.6 & + wait_for_server 8100 + wait_for_server 8200 + python3 round_robin_proxy.py & + sleep 1 +} + + +launch_disagg_prefill() { + model="meta-llama/Meta-Llama-3.1-8B-Instruct" + # disagg prefill + CUDA_VISIBLE_DEVICES=0 python3 \ + -m vllm.entrypoints.openai.api_server \ + --model $model \ + --port 8100 \ + --max-model-len 10000 \ + --gpu-memory-utilization 0.6 \ + --kv-transfer-config \ + '{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' & + + CUDA_VISIBLE_DEVICES=1 python3 \ + -m vllm.entrypoints.openai.api_server \ + --model $model \ + --port 8200 \ + --max-model-len 10000 \ + --gpu-memory-utilization 0.6 \ + --kv-transfer-config \ + '{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' & + + wait_for_server 8100 + wait_for_server 8200 + python3 disagg_prefill_proxy_server.py & + sleep 1 +} + + +benchmark() { + results_folder="./results" + model="meta-llama/Meta-Llama-3.1-8B-Instruct" + dataset_name="sonnet" + dataset_path="../sonnet_4x.txt" + num_prompts=100 + qps=$1 + prefix_len=50 + input_len=1024 + output_len=$2 + tag=$3 + + python3 ../benchmark_serving.py \ + --backend vllm \ + --model $model \ + --dataset-name $dataset_name \ + --dataset-path $dataset_path \ + --sonnet-input-len $input_len \ + --sonnet-output-len "$output_len" \ + --sonnet-prefix-len $prefix_len \ + --num-prompts $num_prompts \ + --port 8000 \ + --save-result \ + --result-dir $results_folder \ + --result-filename "$tag"-qps-"$qps".json \ + --request-rate "$qps" + + sleep 2 +} + + +main() { + + (which wget && which curl) || (apt-get update && apt-get install -y wget curl) + (which jq) || (apt-get -y install jq) + (which socat) || (apt-get -y install socat) + (which lsof) || (apt-get -y install lsof) + + pip install quart httpx matplotlib aiohttp datasets + + cd "$(dirname "$0")" + + cd .. 
+ # create sonnet-4x.txt so that we can sample 2048 tokens for input + echo "" > sonnet_4x.txt + for _ in {1..4} + do + cat sonnet.txt >> sonnet_4x.txt + done + cd disagg_benchmarks + + rm -rf results + mkdir results + + default_output_len=6 + + export VLLM_HOST_IP=$(hostname -I | awk '{print $1}') + + launch_chunked_prefill + for qps in 2 4 6 8; do + benchmark $qps $default_output_len chunked_prefill + done + kill_gpu_processes + + launch_disagg_prefill + for qps in 2 4 6 8; do + benchmark $qps $default_output_len disagg_prefill + done + kill_gpu_processes + + python3 visualize_benchmark_results.py + +} + + +main "$@" diff --git a/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py b/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py new file mode 100644 index 000000000..f62d8102e --- /dev/null +++ b/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py @@ -0,0 +1,63 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import os + +import aiohttp +from quart import Quart, make_response, request + +AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60) + +app = Quart(__name__) + + +async def forward_request(url, data): + async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session: + headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"} + async with session.post(url=url, json=data, headers=headers) as response: + if response.status == 200: + # if response.headers.get('Transfer-Encoding') == 'chunked': + if True: + async for chunk_bytes in response.content.iter_chunked(1024): + yield chunk_bytes + else: + content = await response.read() + yield content + + +@app.route("/v1/completions", methods=["POST"]) +async def handle_request(): + try: + original_request_data = await request.get_json() + + prefill_request = original_request_data.copy() + # change max_tokens = 1 to let it only do prefill + prefill_request["max_tokens"] = 1 + + # finish prefill + async for _ in forward_request( + "http://localhost:8100/v1/completions", prefill_request + ): + continue + + # return decode + generator = forward_request( + "http://localhost:8200/v1/completions", original_request_data + ) + response = await make_response(generator) + response.timeout = None + + return response + + except Exception as e: + import sys + import traceback + + exc_info = sys.exc_info() + print("Error occurred in disagg prefill proxy server") + print(e) + print("".join(traceback.format_exception(*exc_info))) + + +if __name__ == "__main__": + app.run(port=8000) diff --git a/benchmarks/disagg_benchmarks/round_robin_proxy.py b/benchmarks/disagg_benchmarks/round_robin_proxy.py new file mode 100644 index 000000000..b1df2f255 --- /dev/null +++ b/benchmarks/disagg_benchmarks/round_robin_proxy.py @@ -0,0 +1,63 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import asyncio +import itertools + +import aiohttp +from aiohttp import web + + +class RoundRobinProxy: + def __init__(self, target_ports): + self.target_ports = target_ports + self.port_cycle = itertools.cycle(self.target_ports) + + async def handle_request(self, request): + target_port = next(self.port_cycle) + target_url = f"http://localhost:{target_port}{request.path_qs}" + + async with aiohttp.ClientSession() as session: + try: + # Forward the request + async with session.request( + method=request.method, + url=target_url, + headers=request.headers, + data=request.content, + ) as response: + # Start 
sending the response + resp = web.StreamResponse( + status=response.status, headers=response.headers + ) + await resp.prepare(request) + + # Stream the response content + async for chunk in response.content.iter_any(): + await resp.write(chunk) + + await resp.write_eof() + return resp + + except Exception as e: + return web.Response(text=f"Error: {str(e)}", status=500) + + +async def main(): + proxy = RoundRobinProxy([8100, 8200]) + app = web.Application() + app.router.add_route("*", "/{path:.*}", proxy.handle_request) + + runner = web.AppRunner(app) + await runner.setup() + site = web.TCPSite(runner, "localhost", 8000) + await site.start() + + print("Proxy server started on http://localhost:8000") + + # Keep the server running + await asyncio.Event().wait() + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/benchmarks/disagg_benchmarks/visualize_benchmark_results.py b/benchmarks/disagg_benchmarks/visualize_benchmark_results.py new file mode 100644 index 000000000..74fa56d07 --- /dev/null +++ b/benchmarks/disagg_benchmarks/visualize_benchmark_results.py @@ -0,0 +1,47 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import json + +import matplotlib.pyplot as plt +import pandas as pd + +if __name__ == "__main__": + data = [] + for name in ["disagg_prefill", "chunked_prefill"]: + for qps in [2, 4, 6, 8]: + with open(f"results/{name}-qps-{qps}.json") as f: + x = json.load(f) + x["name"] = name + x["qps"] = qps + data.append(x) + + df = pd.DataFrame.from_dict(data) + dis_df = df[df["name"] == "disagg_prefill"] + chu_df = df[df["name"] == "chunked_prefill"] + + plt.style.use("bmh") + plt.rcParams["font.size"] = 20 + + for key in [ + "mean_ttft_ms", + "median_ttft_ms", + "p99_ttft_ms", + "mean_itl_ms", + "median_itl_ms", + "p99_itl_ms", + ]: + fig, ax = plt.subplots(figsize=(11, 7)) + plt.plot( + dis_df["qps"], dis_df[key], label="disagg_prefill", marker="o", linewidth=4 + ) + plt.plot( + chu_df["qps"], chu_df[key], label="chunked_prefill", marker="o", linewidth=4 + ) + ax.legend() + + ax.set_xlabel("QPS") + ax.set_ylabel(key) + ax.set_ylim(bottom=0) + fig.savefig(f"results/{key}.png") + plt.close(fig) diff --git a/benchmarks/fused_kernels/layernorm_rms_benchmarks.py b/benchmarks/fused_kernels/layernorm_rms_benchmarks.py new file mode 100644 index 000000000..901524214 --- /dev/null +++ b/benchmarks/fused_kernels/layernorm_rms_benchmarks.py @@ -0,0 +1,228 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import pickle as pkl +import time +from collections.abc import Iterable +from dataclasses import dataclass +from itertools import product +from typing import Callable, Optional + +import torch +import torch.utils.benchmark as TBenchmark +from torch.utils.benchmark import Measurement as TMeasurement +from tqdm import tqdm + +import vllm._custom_ops as ops +from vllm.model_executor.layers.layernorm import RMSNorm + + +@dataclass +class bench_params_t: + num_tokens: int + hidden_size: int + add_residual: bool + dtype: torch.dtype + + def description(self): + return ( + f"N {self.num_tokens} " + f"x D {self.hidden_size} " + f"x R {self.add_residual} " + f"x DT {self.dtype}" + ) + + +def get_bench_params() -> list[bench_params_t]: + ## Test Fixtures + NUM_TOKENS = [2**x for x in range(11)] + HIDDEN_SIZES = list(range(1024, 8129, 1024)) + ADD_RESIDUAL = [True, False] + DTYPES = [torch.bfloat16, torch.float] + + combinations = product(NUM_TOKENS, HIDDEN_SIZES, 
ADD_RESIDUAL, DTYPES) + bench_params = list( + map(lambda x: bench_params_t(x[0], x[1], x[2], x[3]), combinations) + ) + return bench_params + + +# Reference impls +def unfused_int8_impl( + rms_norm_layer: RMSNorm, + x: torch.Tensor, + residual: Optional[torch.Tensor], + quant_dtype: torch.dtype, +): + # Norm + torch_out = None + if residual is None: + torch_out = rms_norm_layer.forward_cuda(x, residual) + else: + torch_out, _ = rms_norm_layer.forward_cuda(x, residual) + + # Quant + torch_out, _, _ = ops.scaled_int8_quant(torch_out) + + +def unfused_fp8_impl( + rms_norm_layer: RMSNorm, + x: torch.Tensor, + residual: Optional[torch.Tensor], + quant_dtype: torch.dtype, +): + # Norm + torch_out = None + if residual is None: + torch_out = rms_norm_layer.forward_cuda(x, residual) + else: + torch_out, _ = rms_norm_layer.forward_cuda(x, residual) + + # Quant + torch_out, _ = ops.scaled_fp8_quant(torch_out) + + +def fused_impl( + rms_norm_layer: RMSNorm, # this stores the weights + x: torch.Tensor, + residual: Optional[torch.Tensor], + quant_dtype: torch.dtype, +): + out, _ = ops.rms_norm_dynamic_per_token_quant( + x, rms_norm_layer.weight, 1e-6, quant_dtype, residual=residual + ) + + +# Bench functions +def bench_fn( + rms_norm_layer: RMSNorm, + x: torch.Tensor, + residual: torch.Tensor, + quant_dtype: torch.dtype, + label: str, + sub_label: str, + fn: Callable, + description: str, +) -> TMeasurement: + min_run_time = 1 + + globals = { + "rms_norm_layer": rms_norm_layer, + "x": x, + "residual": residual, + "quant_dtype": quant_dtype, + "fn": fn, + } + return TBenchmark.Timer( + stmt="fn(rms_norm_layer, x, residual, quant_dtype)", + globals=globals, + label=label, + sub_label=sub_label, + description=description, + ).blocked_autorange(min_run_time=min_run_time) + + +def bench(params: bench_params_t, label: str, sub_label: str) -> Iterable[TMeasurement]: + # Make inputs + layer = RMSNorm(params.hidden_size, 1e-6).to(dtype=params.dtype) + # Make weights + layer.weight.data.normal_(mean=1.0, std=0.1) + # Make inputs + scale = 1 / params.hidden_size + x = ( + torch.randn( + params.num_tokens, params.hidden_size, dtype=params.dtype, device="cuda" + ) + * scale + ) + residual = ( + (torch.randn_like(x) * scale).to(device="cuda") if params.add_residual else None + ) + + timers = [] + + # unfused int8 impl. + timers.append( + bench_fn( + layer, + x, + residual, + torch.int8, + label, + sub_label, + unfused_int8_impl, + "unfused_int8_impl", + ) + ) + + # unfused fp8 impl. + timers.append( + bench_fn( + layer, + x, + residual, + torch.float8_e4m3fn, + label, + sub_label, + unfused_fp8_impl, + "unfused_fp8_impl", + ) + ) + + # fused int8 impl. + timers.append( + bench_fn( + layer, + x, + residual, + torch.int8, + label, + sub_label, + fused_impl, + "fused_int8_impl", + ) + ) + + # fused fp8 impl. 
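+    # (fused_impl issues a single rms_norm_dynamic_per_token_quant kernel, while the
+    # unfused paths above run the RMSNorm forward followed by a separate quant kernel)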
+ timers.append( + bench_fn( + layer, + x, + residual, + torch.float8_e4m3fn, + label, + sub_label, + fused_impl, + "fused_fp8_impl", + ) + ) + + print_timers(timers) + + return timers + + +# launch bench +# runner +def print_timers(timers: Iterable[TMeasurement]): + compare = TBenchmark.Compare(timers) + compare.print() + + +def main(): + torch.set_default_device("cuda") + bench_params = get_bench_params() + + timers = [] + for bp in tqdm(bench_params): + timers.extend(bench(bp, "rms-norm-dynamic-per-token-quant", bp.description())) + print_timers(timers) + + # pickle all the results + timestamp = int(time.time()) + with open(f"rms_norm_dpt_quant-{timestamp}.pkl", "wb") as f: + pkl.dump(timers, f) + + +if __name__ == "__main__": + main() diff --git a/benchmarks/kernels/bench_fp8_gemm.py b/benchmarks/kernels/bench_fp8_gemm.py new file mode 100644 index 000000000..b964ed242 --- /dev/null +++ b/benchmarks/kernels/bench_fp8_gemm.py @@ -0,0 +1,223 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import argparse +import copy +import itertools + +import torch +from weight_shapes import WEIGHT_SHAPES + +from vllm._custom_ops import cutlass_scaled_mm as vllm_scaled_mm +from vllm._custom_ops import scaled_fp8_quant as vllm_scaled_fp8_quant +from vllm.triton_utils import triton + + +@triton.testing.perf_report( + triton.testing.Benchmark( + x_names=["batch_size"], + x_vals=[1, 16, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384], + x_log=False, + line_arg="provider", + line_vals=[ + "torch-bf16", + # "fp8-tensor-w-token-a", + "fp8-tensor-w-tensor-a", + "fp8-channel-w-token-a", + # "fp8-channel-w-tensor-a", + # "fp8-tensor-w-token-a-noquant", + "fp8-tensor-w-tensor-a-noquant", + "fp8-channel-w-token-a-noquant", + # "fp8-channel-w-tensor-a-noquant", + ], + line_names=[ + "torch-bf16", + # "fp8-tensor-w-token-a", + "fp8-tensor-w-tensor-a", + "fp8-channel-w-token-a", + # "fp8-channel-w-tensor-a", + # "fp8-tensor-w-token-a-noquant", + "fp8-tensor-w-tensor-a-noquant", + "fp8-channel-w-token-a-noquant", + # "fp8-channel-w-tensor-a-noquant", + ], + ylabel="TFLOP/s (larger is better)", + plot_name="BF16 vs FP8 GEMMs", + args={}, + ) +) +def benchmark(batch_size, provider, N, K): + M = batch_size + device = "cuda" + dtype = torch.bfloat16 + + # Create input tensors + a = torch.randn((M, K), device=device, dtype=dtype) + b = torch.randn((N, K), device=device, dtype=dtype) + + quantiles = [0.5, 0.2, 0.8] + + if "torch-bf16" in provider: + ms, min_ms, max_ms = triton.testing.do_bench_cudagraph( + lambda: torch.nn.functional.linear(a, b), quantiles=quantiles + ) + + elif "fp8" in provider: + # Weights are always quantized ahead of time + if "noquant" in provider: + # For no quantization, we just measure the GEMM + if "tensor-w-token-a" in provider: + # Dynamic per-token quant for A, per-tensor quant for B + b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b) + assert scale_b_fp8.numel() == 1 + a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant( + a, use_per_token_if_dynamic=True + ) + + def run_quant(): + return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype) + + elif "tensor-w-tensor-a" in provider: + # Static per-tensor quantization with fixed scales + # for both A and B + scale_a = torch.tensor([1.0], device=device, dtype=torch.float32) + scale_b = torch.tensor([1.0], device=device, dtype=torch.float32) + b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b) + assert scale_b_fp8.numel() == 1 + a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, 
scale_a) + + def run_quant(): + return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype) + + elif "channel-w-token-a" in provider: + # Static per-channel quantization for weights, per-token + # quant for A + scale_b = torch.tensor((N,), device=device, dtype=torch.float32) + b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b) + scale_b_fp8 = scale_b_fp8.expand(N).contiguous() + assert scale_b_fp8.numel() == N + a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant( + a, use_per_token_if_dynamic=True + ) + + def run_quant(): + return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype) + + elif "channel-w-tensor-a" in provider: + # Static per-channel quantization for weights, per-tensor + # quant for A + scale_a = torch.tensor([1.0], device=device, dtype=torch.float32) + scale_b = torch.tensor((N,), device=device, dtype=torch.float32) + b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b) + scale_b_fp8 = scale_b_fp8.expand(N).contiguous() + assert scale_b_fp8.numel() == N + a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, scale_a) + + def run_quant(): + return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype) + + else: + # In these cases, we quantize the activations during the GEMM call + if "tensor-w-token-a" in provider: + # Dynamic per-token quant for A, per-tensor quant for B + b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b) + assert scale_b_fp8.numel() == 1 + + def run_quant(): + a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant( + a, use_per_token_if_dynamic=True + ) + return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype) + + elif "tensor-w-tensor-a" in provider: + # Static per-tensor quantization with fixed scales + # for both A and B + scale_a = torch.tensor([1.0], device=device, dtype=torch.float32) + scale_b = torch.tensor([1.0], device=device, dtype=torch.float32) + b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b) + assert scale_b_fp8.numel() == 1 + + def run_quant(): + a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, scale_a) + return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype) + + elif "channel-w-token-a" in provider: + # Static per-channel quantization for weights, per-token + # quant for A + scale_b = torch.tensor((N,), device=device, dtype=torch.float32) + b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b) + scale_b_fp8 = scale_b_fp8.expand(N).contiguous() + assert scale_b_fp8.numel() == N + + def run_quant(): + a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant( + a, use_per_token_if_dynamic=True + ) + return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype) + + elif "channel-w-tensor-a" in provider: + # Static per-channel quantization for weights, per-tensor + # quant for A + scale_a = torch.tensor([1.0], device=device, dtype=torch.float32) + scale_b = torch.tensor((N,), device=device, dtype=torch.float32) + b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b) + scale_b_fp8 = scale_b_fp8.expand(N).contiguous() + assert scale_b_fp8.numel() == N + + def run_quant(): + a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, scale_a) + return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype) + + b_fp8 = b_fp8.t() + + ms, min_ms, max_ms = triton.testing.do_bench_cudagraph( + lambda: run_quant(), quantiles=quantiles + ) + + # Calculate TFLOP/s, two flops per multiply-add + tflops = lambda ms: (2 * M * N * K) * 1e-12 / (ms * 1e-3) + return tflops(ms), tflops(max_ms), tflops(min_ms) + + +def prepare_shapes(args): + KN_model_names = [] + models_tps = list(itertools.product(args.models, args.tp_sizes)) + for model, 
tp_size in models_tps: + assert model in WEIGHT_SHAPES + for KN, tp_split_dim in copy.deepcopy(WEIGHT_SHAPES[model]): + KN[tp_split_dim] = KN[tp_split_dim] // tp_size + KN.append(model) + KN_model_names.append(KN) + return KN_model_names + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--models", + nargs="+", + type=str, + default=["meta-llama/Llama-3.1-8B-Instruct"], + choices=[*WEIGHT_SHAPES.keys()], + help="List of models to benchmark", + ) + parser.add_argument( + "--tp-sizes", + nargs="+", + type=int, + default=[1], + help="List of tensor parallel sizes", + ) + args = parser.parse_args() + + KN_model_names = prepare_shapes(args) + for K, N, model_name in KN_model_names: + print(f"{model_name}, N={N} K={K}, BF16 vs FP8 GEMMs TFLOP/s:") + benchmark.run( + print_data=True, + show_plots=True, + save_path=f"bench_fp8_res_n{N}_k{K}", + N=N, + K=K, + ) + + print("Benchmark finished!") diff --git a/benchmarks/kernels/benchmark_aqlm.py b/benchmarks/kernels/benchmark_aqlm.py new file mode 100644 index 000000000..42de062b0 --- /dev/null +++ b/benchmarks/kernels/benchmark_aqlm.py @@ -0,0 +1,345 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import os +import sys +from typing import Optional + +import torch +import torch.nn.functional as F + +from vllm import _custom_ops as ops +from vllm.model_executor.layers.quantization.aqlm import ( + dequantize_weight, + generic_dequantize_gemm, + get_int_dtype, + optimized_dequantize_gemm, +) +from vllm.utils import FlexibleArgumentParser + +os.environ["CUDA_VISIBLE_DEVICES"] = "0" + + +def torch_mult( + # [..., in_features] + input: torch.Tensor, + weights: torch.Tensor, + # [num_out_groups, 1, 1, 1] + scales: torch.Tensor, +) -> torch.Tensor: + output = F.linear(input, weights) + return output + + +def dequant_out_scale( + # [..., in_features] + input: torch.Tensor, + # [num_out_groups, num_in_groups, num_codebooks] + codes: torch.IntTensor, + # [num_codebooks, codebook_size, out_group_size, in_group_size] + codebooks: torch.Tensor, + # [num_out_groups, 1, 1, 1] + scales: torch.Tensor, + output_partition_sizes: torch.IntTensor, + bias: Optional[torch.Tensor], +) -> torch.Tensor: + weights = ops.aqlm_dequant(codes, codebooks, output_partition_sizes) + + if bias is None: + output = F.linear(input, weights, bias) + orig_shape = output.shape + flattened_output = output.view(-1, output.size(-1)) + f_scales = scales.view(-1, scales.shape[0]) + b_scales = f_scales.expand(flattened_output.shape[0], -1) + flattened_output *= b_scales + return flattened_output.view(orig_shape) + else: + b_scales = scales.view(scales.shape[:-3] + (-1,)).expand(-1, weights.shape[1]) + weights *= b_scales + return F.linear(input, weights, bias) + + +def dequant_weight_scale( + # [..., in_features] + input: torch.Tensor, + # [num_out_groups, num_in_groups, num_codebooks] + codes: torch.IntTensor, + # [num_codebooks, codebook_size, out_group_size, in_group_size] + codebooks: torch.Tensor, + # [num_out_groups, 1, 1, 1] + scales: torch.Tensor, + output_partition_sizes: torch.IntTensor, + bias: Optional[torch.Tensor], +) -> torch.Tensor: + weights = ops.aqlm_dequant(codes, codebooks, output_partition_sizes) + + b_scales = scales.view(scales.shape[:-3] + (-1,)).expand(-1, weights.shape[1]) + weights *= b_scales + return F.linear(input, weights, bias) + + +def dequant_no_scale( + # [..., in_features] + input: torch.Tensor, + # [num_out_groups, num_in_groups, num_codebooks] + 
codes: torch.IntTensor, + # [num_codebooks, codebook_size, out_group_size, in_group_size] + codebooks: torch.Tensor, + # [num_out_groups, 1, 1, 1] + scales: torch.Tensor, + output_partition_sizes: torch.IntTensor, + bias: Optional[torch.Tensor], +) -> torch.Tensor: + weights = ops.aqlm_dequant(codes, codebooks, output_partition_sizes) + + return F.linear(input, weights, bias) + + +# Compare the optimized 1x16 and 2x8 cuda decompression/dequant kernels against +# the generic pytorch version. +# Just visual comparison. +def dequant_test(k: int, parts: torch.Tensor, nbooks: int, bits: int) -> None: + n = int(parts.sum().item()) + + device = torch.device("cuda:0") + + code_range = (1 << bits) // 2 + ingroups = 8 + + codes = torch.randint( + -code_range, + code_range, + size=(n, k // ingroups, nbooks), + dtype=get_int_dtype(bits), + device=device, + ) + + codebooks = torch.randn( + size=(parts.shape[0] * nbooks, 1 << bits, 1, 8), + dtype=torch.float16, + device=device, + ) + + count = 0 + for index in range(16): + for i in range(8): + for book in range(nbooks): + codebooks[book, index, 0, i] = count * (10**book) + count += 1 + + print("codes shape", codes.shape) + + for i in range(16): + for book in range(nbooks): + codes[0, i, book] = i + codes[0, -i, book] = i + + weights = dequantize_weight(codes, codebooks, None) + weights2 = ops.aqlm_dequant(codes, codebooks, parts) + + print("weights shape:", weights.shape) + print("weights2 shape:", weights2.shape) + + print("weights are:", weights) + print("weights2 are:", weights2) + + print("first 128 weights are", weights[0, 0:128].to(torch.int32)) + print("first 128 weights2 are:", weights2[0, 0:128].to(torch.int32)) + + print("last 128 weights are", weights[0, -128:]) + print("last 128 weights2 are:", weights2[0, -128:]) + + +def main(): + parser = FlexibleArgumentParser(description="Benchmark aqlm performance.") + + # Add arguments + parser.add_argument( + "--nbooks", type=int, default=1, help="Number of codebooks (default: 1)" + ) + parser.add_argument( + "--bits", + type=int, + default=16, + help="Number of bits per code element (default: 16)", + ) + parser.add_argument( + "--test", + type=bool, + default=False, + help="Run the decompression/dequant tester rather than benchmarking " + "(default: False)", + ) + + # Parse the arguments + args = parser.parse_args() + + # Extract values + nbooks = args.nbooks + bits = args.bits + + if args.test: + dequant_test(4096, torch.tensor((4096,)), nbooks, bits) + return + + # Otherwise, benchmark. + methods = [ + ops.aqlm_gemm, + dequant_out_scale, + generic_dequantize_gemm, + optimized_dequantize_gemm, + dequant_weight_scale, + torch_mult, + dequant_no_scale, + ] + + filename = f"./aqlm_benchmark_{nbooks}x{bits}.csv" + print(f"writing benchmarks to file {filename}") + with open(filename, "w") as f: + sys.stdout = f + + print("m | k | n | n parts", end="") + for method in methods: + print(f" | {method.__name__.replace('_', ' ')} (µs)", end="") + print("") + + # These are reasonable prefill sizes. + ksandpartions = ( + (4096, (4096, 4096, 4096)), + (4096, (4096,)), + (4096, (11008, 11008)), + (11008, (4096,)), + ) + + # reasonable ranges for m. 
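+    # (m is the number of input tokens per call; every m below is swept against
+    # each (k, partitions) pair above)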
+ for m in [ + 1, + 2, + 4, + 8, + 10, + 12, + 14, + 16, + 24, + 32, + 48, + 52, + 56, + 64, + 96, + 112, + 128, + 256, + 512, + 1024, + 1536, + 2048, + 3072, + 4096, + ]: + print(f"{m}", file=sys.__stdout__) + for ksp in ksandpartions: + run_grid(m, ksp[0], torch.tensor(ksp[1]), nbooks, bits, methods) + + sys.stdout = sys.__stdout__ + + +def run_grid(m: int, k: int, parts: torch.Tensor, nbooks: int, bits: int, methods): + # I didn't see visible improvements from increasing these, but feel free :) + num_warmup_trials = 1 + num_trials = 1 + + num_calls = 100 + + # warmup. + for method in methods: + for _ in range(num_warmup_trials): + run_timing( + num_calls=num_calls, + m=m, + k=k, + parts=parts, + nbooks=nbooks, + bits=bits, + method=method, + ) + + n = parts.sum().item() + print(f"{m} | {k} | {n} | {parts.tolist()}", end="") + + for method in methods: + best_time_us = 1e20 + for _ in range(num_trials): + kernel_dur_ms = run_timing( + num_calls=num_calls, + m=m, + k=k, + parts=parts, + nbooks=nbooks, + bits=bits, + method=method, + ) + + kernel_dur_us = 1000 * kernel_dur_ms + + if kernel_dur_us < best_time_us: + best_time_us = kernel_dur_us + + print(f" | {kernel_dur_us:.0f}", end="") + + print("") + + +def run_timing( + num_calls: int, m: int, k: int, parts: torch.Tensor, nbooks: int, bits: int, method +) -> float: + n = int(parts.sum().item()) + + device = torch.device("cuda:0") + + input = torch.randn((1, m, k), dtype=torch.float16, device=device) + + code_range = (1 << bits) // 2 + ingroups = 8 + + codes = torch.randint( + -code_range, + code_range, + size=(n, k // ingroups, nbooks), + dtype=get_int_dtype(bits), + device=device, + ) + + codebooks = torch.randn( + size=(parts.shape[0] * nbooks, 1 << bits, 1, 8), + dtype=torch.float16, + device=device, + ) + + scales = torch.randn(size=(n, 1, 1, 1), dtype=torch.float16, device=device) + + # for comparison to just a pytorch mult. + weights = torch.randn((n, k), dtype=torch.float16, device=device) + + start_event = torch.cuda.Event(enable_timing=True) + end_event = torch.cuda.Event(enable_timing=True) + + start_event.record() + + if method is torch_mult: + for i in range(num_calls): + torch_mult(input, weights, scales) + else: + for i in range(num_calls): + method(input, codes, codebooks, scales, parts, None) + + end_event.record() + end_event.synchronize() + + dur_ms = start_event.elapsed_time(end_event) / num_calls + return dur_ms + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/benchmarks/kernels/benchmark_bitblas.py b/benchmarks/kernels/benchmark_bitblas.py new file mode 100644 index 000000000..97ee06034 --- /dev/null +++ b/benchmarks/kernels/benchmark_bitblas.py @@ -0,0 +1,242 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +from vllm.model_executor.layers.quantization.utils.bitblas_utils import ( + MINIMUM_BITBLAS_VERSION, +) + +try: + import bitblas + + if bitblas.__version__ < MINIMUM_BITBLAS_VERSION: + raise ImportError( + "bitblas version is wrong. Please " + f"install bitblas>={MINIMUM_BITBLAS_VERSION}" + ) +except ImportError as e: + bitblas_import_exception = e + raise ValueError( + "Trying to use the bitblas backend, but could not import" + f"with the following error: {bitblas_import_exception}. 
" + "Please install bitblas through the following command: " + f"`pip install bitblas>={MINIMUM_BITBLAS_VERSION}`" + ) from bitblas_import_exception + +from bitblas import Matmul, MatmulConfig, auto_detect_nvidia_target + +from vllm.utils import FlexibleArgumentParser + +parser = FlexibleArgumentParser( + description="Benchmark BitBLAS int4 on a specific target." +) + +# Add arguments to the parser +parser.add_argument( + "--target", + type=str, + default=auto_detect_nvidia_target(), + help="Specify the target device for benchmarking.", +) +parser.add_argument( + "--group_size", type=int, default=None, help="Group size for grouped quantization." +) +parser.add_argument( + "--A_dtype", + type=str, + default="float16", + choices=["float16", "float32", "float64", "int32", "int8"], + help="Data type of activation A.", +) +parser.add_argument( + "--W_dtype", + type=str, + default="int4", + choices=[ + "float16", + "float32", + "float64", + "int32", + "int8", + "int4", + "int2", + "int1", + "nf4", + "fp4_e2m1", + ], + help="Data type of weight W.", +) +parser.add_argument( + "--accum_dtype", + type=str, + default="float16", + choices=["float16", "int32"], + help="Data type for accumulation.", +) +parser.add_argument( + "--out_dtype", + type=str, + default="float16", + choices=["float16", "float32", "int32", "int8"], + help="Data type for output.", +) +parser.add_argument( + "--layout", + type=str, + default="nt", + choices=["nt", "nn"], + help="Matrix layout, 'nt' for non-transpose A and transpose W.", +) +parser.add_argument( + "--with_bias", action="store_true", help="Include bias in the benchmark." +) +parser.add_argument( + "--with_scaling", + action="store_true", + help="Include scaling factor in the quantization.", +) +parser.add_argument( + "--with_zeros", action="store_true", help="Include zeros in the quantization." 
+) +parser.add_argument( + "--zeros_mode", + type=str, + default=None, + choices=["original", "rescale", "quantized"], + help="Specify the mode for calculating zeros.", +) + +# Parse the arguments +args = parser.parse_args() + +# Assign arguments to variables +target = args.target +A_dtype = args.A_dtype +W_dtype = args.W_dtype +accum_dtype = args.accum_dtype +out_dtype = args.out_dtype +layout = args.layout +with_bias = args.with_bias +group_size = args.group_size +with_scaling = args.with_scaling +with_zeros = args.with_zeros +zeros_mode = args.zeros_mode + +# Define a list of shared arguments that repeat in every config +shared_args = [ + A_dtype, + W_dtype, + out_dtype, + accum_dtype, + layout, + with_bias, + group_size, + with_scaling, + with_zeros, + zeros_mode, +] + +# Define just the (M, K, N) shapes in a more compact list +shapes = [ + # square test + (1, 16384, 16384), + # BLOOM-176B + (1, 43008, 14336), + (1, 14336, 14336), + (1, 57344, 14336), + (1, 14336, 57344), + # OPT-65B + (1, 9216, 9216), + (1, 36864, 9216), + (1, 9216, 36864), + (1, 22016, 8192), + # LLAMA-70B/65B + (1, 8192, 22016), + (1, 8192, 8192), + (1, 28672, 8192), + (1, 8192, 28672), + # square test + (16384, 16384, 16384), + # BLOOM-176B + (8192, 43008, 14336), + (8192, 14336, 14336), + (8192, 57344, 14336), + (8192, 14336, 57344), + # OPT-65B + (8192, 9216, 9216), + (8192, 36864, 9216), + (8192, 9216, 36864), + (8192, 22016, 8192), + # LLAMA-70B/65B + (8192, 8192, 22016), + (8192, 8192, 8192), + (8192, 28672, 8192), + (8192, 8192, 28672), +] + +# Build test shapes with all the shared arguments +test_shapes = [(MatmulConfig, Matmul, (*shape, *shared_args)) for shape in shapes] + +benchmark_sets = [] +benchmark_sets.extend(test_shapes) + +benchmark_results = {} +for config_class, operator, input_args in benchmark_sets: + config = config_class(*input_args) + matmul = operator(config, target=target, enable_tuning=True) + kernel_latency = matmul.profile_latency() + + print("Time cost is: {:.3f} ms".format(kernel_latency)) + + profile_config = { + f"{operator.__name__}-{'-'.join([str(i) for i in input_args])}": { + "BitBLAS_top20_latency": kernel_latency, + } + } + + benchmark_results.update(profile_config) + +# Define headers for the table +headers = [ + "PrimFunc", + "Input Arguments", + "BitBLAS Top20 Latency", +] + +# Calculate column widths for pretty printing +col_widths = [0, 0, 0] +for config_key, values in benchmark_results.items(): + args_split = config_key.split("-") + func_name = args_split[0] + input_args_str = "-".join(args_split[1:]) + col_widths[0] = max(col_widths[0], len(func_name) + 2, len(headers[0]) + 2) + col_widths[1] = max(col_widths[1], len(input_args_str) + 2, len(headers[1]) + 2) + col_widths[2] = max( + col_widths[2], + len(f"{values['BitBLAS_top20_latency']:.3f} ms") + 2, + len(headers[2]) + 2, + ) + # break only if you want to measure widths from a single example; + # otherwise, let it loop over all items. 
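+# Column widths are taken from the widest entry recorded in the loop above, so
+# the header and every result row printed below stay aligned when each cell is
+# left-justified with ljust().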
+ +# Print header +for i, header in enumerate(headers): + headers[i] = header.ljust(col_widths[i]) +print("".join(headers)) +print("-" * sum(col_widths)) + +# Print rows +for config_key, values in benchmark_results.items(): + args_split = config_key.split("-") + func_name = args_split[0] + input_args_str = "-".join(args_split[1:]) + row = [ + func_name, + input_args_str, + f"{values['BitBLAS_top20_latency']:.3f} ms", + ] + row_str = "".join( + [str(cell).ljust(col_widths[idx]) for idx, cell in enumerate(row)] + ) + print(row_str) diff --git a/benchmarks/kernels/benchmark_cutlass_fp4_moe.py b/benchmarks/kernels/benchmark_cutlass_fp4_moe.py new file mode 100644 index 000000000..35c20ee41 --- /dev/null +++ b/benchmarks/kernels/benchmark_cutlass_fp4_moe.py @@ -0,0 +1,490 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Benchmark the performance of the cutlass_moe_fp4 kernel vs the triton_moe +kernel. The cutlass_moe_fp4 kernel takes in fp4 quantized weights and 16-bit +activations. The triton_moe kernel takes in fp8 weights(tensor scaled to fp8) +and 16-bit activations. +""" + +import nvtx +import torch +import torch.utils.benchmark as benchmark + +from vllm import _custom_ops as ops +from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config +from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp4 +from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk +from vllm.scalar_type import scalar_types +from vllm.utils import FlexibleArgumentParser + +WEIGHT_SHAPES_MOE = { + "nvidia/DeepSeek-R1-FP4": [ + [256, 8, 2048, 7168], + ], +} + +DEFAULT_MODELS = [ + "nvidia/DeepSeek-R1-FP4", +] + +DEFAULT_BATCH_SIZES = [4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048] +DEFAULT_TP_SIZES = [1] + +PER_ACT_TOKEN_OPTS = [False] +PER_OUT_CH_OPTS = [False] +FLOAT4_E2M1_MAX = scalar_types.float4_e2m1f.max() +FLOAT8_E4M3_MAX = torch.finfo(torch.float8_e4m3fn).max + + +def to_fp8(tensor: torch.Tensor): + finfo = torch.finfo(torch.float8_e4m3fn) + return torch.round(tensor.clamp(min=finfo.min, max=finfo.max)).to( + dtype=torch.float8_e4m3fn + ) + + +def bench_run( + results: list[benchmark.Measurement], + model: str, + num_experts: int, + topk: int, + per_act_token: bool, + per_out_ch: bool, + mkn: tuple[int, int, int], +): + label = "NVFP4 Blockscaled CUTLASS MOE vs FP8 Tensor Scaled Triton" + + sub_label = ( + "{}, num_experts={}, topk={}, per_act_token={} per_out_ch={}, MKN=({})".format( + model, num_experts, topk, per_act_token, per_out_ch, mkn + ) + ) + + print(f"Testing: {sub_label}") + + (m, k, n) = mkn + + dtype = torch.half + device = "cuda" + a = torch.randn((m, k), device=device, dtype=dtype) / 10 + w1 = torch.randn((num_experts, 2 * n, k), device=device, dtype=dtype) / 10 + w2 = torch.randn((num_experts, k, n), device=device, dtype=dtype) / 10 + + _, a_fp8_scale = ops.scaled_fp8_quant(a) + + w1_fp8q = torch.empty( + (num_experts, 2 * n, k), device=device, dtype=torch.float8_e4m3fn + ) + w2_fp8q = torch.empty((num_experts, k, n), device=device, dtype=torch.float8_e4m3fn) + w1_fp8scale = torch.empty((num_experts, 1, 1), device=device, dtype=torch.float32) + w2_fp8scale = torch.empty((num_experts, 1, 1), device=device, dtype=torch.float32) + + for expert in range(num_experts): + w1_fp8q[expert], w1_fp8scale[expert] = ops.scaled_fp8_quant(w1[expert]) + w2_fp8q[expert], w2_fp8scale[expert] = ops.scaled_fp8_quant(w2[expert]) + + w1_fp8q_notransp = w1_fp8q.clone() + w2_fp8q_notransp = 
w2_fp8q.clone() + w1_fp8q = w1_fp8q.transpose(1, 2) + w2_fp8q = w2_fp8q.transpose(1, 2) + + score = torch.randn((m, num_experts), device=device, dtype=dtype) + + topk_weights, topk_ids, _ = fused_topk(a, score, topk, renormalize=False) + + quant_blocksize = 16 + w1_blockscale = torch.empty( + (num_experts, 2 * n, k // quant_blocksize), + device=device, + dtype=torch.float8_e4m3fn, + ) + w2_blockscale = torch.empty( + (num_experts, k, n // quant_blocksize), device=device, dtype=torch.float8_e4m3fn + ) + + # n_b_scales = 2 * n if per_out_ch else 1 + # k_b_scales = k if per_out_ch else 1 + w1_fp4 = torch.empty((num_experts, 2 * n, k // 2), device=device, dtype=torch.uint8) + w2_fp4 = torch.empty((num_experts, k, n // 2), device=device, dtype=torch.uint8) + + w1_gs = torch.empty((num_experts,), device=device, dtype=torch.float32) + w2_gs = torch.empty((num_experts,), device=device, dtype=torch.float32) + a1_gs = torch.ones((num_experts,), device=device, dtype=torch.float32) + a2_gs = torch.ones((num_experts,), device=device, dtype=torch.float32) + + for expert in range(num_experts): + w1_e = w1[expert] + w2_e = w2[expert] + w1_amax = torch.abs(w1_e).max().to(torch.float32) + w2_amax = torch.abs(w2_e).max().to(torch.float32) + w1_gs[expert] = FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / w1_amax + w2_gs[expert] = FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / w2_amax + + w1_fp4[expert], w1_blockscale[expert] = ops.scaled_fp4_quant( + w1_e, w1_gs[expert] + ) + + w2_fp4[expert], w2_blockscale[expert] = ops.scaled_fp4_quant( + w2_e, w2_gs[expert] + ) + + def run_triton_moe( + a: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + w1_scale: torch.Tensor, + w2_scale: torch.Tensor, + a_fp8_scale: torch.Tensor, + num_repeats: int, + ): + for _ in range(num_repeats): + fused_experts( + a, + w1, + w2, + topk_weights, + topk_ids, + use_fp8_w8a8=True, + w1_scale=w1_scale, + w2_scale=w2_scale, + a1_scale=a_fp8_scale, + ) + + def run_cutlass_moe_fp4( + a: torch.Tensor, + w1_fp4: torch.Tensor, + w2_fp4: torch.Tensor, + w1_blockscale: torch.Tensor, + w2_blockscale: torch.Tensor, + w1_gs: torch.Tensor, + w2_gs: torch.Tensor, + a1_gs: torch.Tensor, + a2_gs: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + m: int, + n: int, + k: int, + e: int, + device: torch.device, + num_repeats: int, + ): + for _ in range(num_repeats): + with nvtx.annotate("cutlass_moe_fp4", color="green"): + cutlass_moe_fp4( + a=a, + a1_gscale=a1_gs, + a2_gscale=a2_gs, + w1_fp4=w1_fp4, + w1_blockscale=w1_blockscale, + w1_alphas=w1_gs, + w2_fp4=w2_fp4, + w2_blockscale=w2_blockscale, + w2_alphas=w2_gs, + topk_weights=topk_weights, + topk_ids=topk_ids, + m=m, + n=n, + k=k, + e=num_experts, + device=device, + ) + + def run_cutlass_from_graph( + a: torch.Tensor, + a1_gscale: torch.Tensor, + w1_fp4: torch.Tensor, + w1_blockscale: torch.Tensor, + w1_alphas: torch.Tensor, + a2_gscale: torch.Tensor, + w2_fp4: torch.Tensor, + w2_blockscale: torch.Tensor, + w2_alphas: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + m: int, + n: int, + k: int, + e: int, + device: torch.device, + ): + with set_current_vllm_config( + VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1)) + ): + return cutlass_moe_fp4( + a=a, + a1_gscale=a1_gs, + w1_fp4=w1_fp4, + w1_blockscale=w1_blockscale, + w1_alphas=w1_alphas, + a2_gscale=a2_gs, + w2_fp4=w2_fp4, + w2_blockscale=w2_blockscale, + w2_alphas=w2_alphas, + topk_weights=topk_weights, + topk_ids=topk_ids, + m=m, + n=n, + k=k, + 
e=num_experts, + device=device, + ) + + def run_triton_from_graph( + a: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + w1_scale: torch.Tensor, + w2_scale: torch.Tensor, + a_fp8_scale: torch.Tensor, + ): + with set_current_vllm_config( + VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1)) + ): + return fused_experts( + a, + w1, + w2, + topk_weights, + topk_ids, + use_fp8_w8a8=True, + w1_scale=w1_scale, + w2_scale=w2_scale, + a1_scale=a_fp8_scale, + ) + + def replay_graph(graph, num_repeats): + for _ in range(num_repeats): + graph.replay() + torch.cuda.synchronize() + + cutlass_stream = torch.cuda.Stream() + cutlass_graph = torch.cuda.CUDAGraph() + with torch.cuda.graph(cutlass_graph, stream=cutlass_stream): + run_cutlass_from_graph( + a=a, + a1_gscale=a1_gs, + w1_fp4=w1_fp4, + w1_blockscale=w1_blockscale, + w1_alphas=w1_gs, + a2_gscale=a2_gs, + w2_fp4=w2_fp4, + w2_blockscale=w2_blockscale, + w2_alphas=w2_gs, + topk_weights=topk_weights, + topk_ids=topk_ids, + m=m, + n=n, + k=k, + e=num_experts, + device=device, + ) + torch.cuda.synchronize() + + triton_stream = torch.cuda.Stream() + triton_graph = torch.cuda.CUDAGraph() + with torch.cuda.graph(triton_graph, stream=triton_stream): + run_triton_from_graph( + a, + w1_fp8q_notransp, + w2_fp8q_notransp, + topk_weights, + topk_ids, + w1_fp8scale, + w2_fp8scale, + a_fp8_scale, + ) + torch.cuda.synchronize() + + min_run_time = 5 + num_warmup = 5 + num_runs = 25 + + globals = { + # Baseline params + "w1": w1, + "w2": w2, + "score": score, + "topk": topk, + "w1_fp8q_notransp": w1_fp8q_notransp, + "w2_fp8q_notransp": w2_fp8q_notransp, + "w1_fp8scale": w1_fp8scale, + "w2_fp8scale": w2_fp8scale, + "a_fp8_scale": a_fp8_scale, + # Cutlass params + "a": a, + "a1_gscale": a1_gs, + "w1_fp4": w1_fp4, + "w1_blockscale": w1_blockscale, + "w1_alphas": w1_gs, + "a2_gscale": a2_gs, + "w2_fp4": w2_fp4, + "w2_blockscale": w2_blockscale, + "w2_alphas": w2_gs, + "topk_weights": topk_weights, + "topk_ids": topk_ids, + "m": m, + "n": n, + "k": k, + "e": num_experts, + "device": device, + # cuda graph params + "cutlass_graph": cutlass_graph, + "triton_graph": triton_graph, + # Gen params + "num_runs": num_runs, + # Kernels + "run_triton_moe": run_triton_moe, + "run_cutlass_moe_fp4": run_cutlass_moe_fp4, + "replay_graph": replay_graph, + } + + # Warmup + run_triton_moe( + a, + w1_fp8q_notransp, + w2_fp8q_notransp, + topk_weights, + topk_ids, + w1_fp8scale, + w2_fp8scale, + a_fp8_scale, + num_warmup, + ) + + results.append( + benchmark.Timer( + stmt="run_triton_moe(a, w1_fp8q_notransp, w2_fp8q_notransp, topk_weights, topk_ids, w1_fp8scale, w2_fp8scale, a_fp8_scale, num_runs)", # noqa: E501 + globals=globals, + label=label, + sub_label=sub_label, + description="triton_moe", + ).blocked_autorange(min_run_time=min_run_time) + ) + + # Warmup + replay_graph(triton_graph, num_warmup) + + results.append( + benchmark.Timer( + stmt="replay_graph(triton_graph, num_runs)", + globals=globals, + label=label, + sub_label=sub_label, + description="triton_moe_cuda_graphs", + ).blocked_autorange(min_run_time=min_run_time) + ) + + # Warmup + + run_cutlass_moe_fp4( + a, + w1_fp4, + w2_fp4, + w1_blockscale, + w2_blockscale, + w1_gs, + w2_gs, + a1_gs, + a2_gs, + topk_weights, + topk_ids, + m, + n, + k, + num_experts, + device, + num_warmup, + ) + + results.append( + benchmark.Timer( + stmt="run_cutlass_moe_fp4(a, w1_fp4, w2_fp4, w1_blockscale, w2_blockscale, w1_alphas, w2_alphas, a1_gscale, a2_gscale, 
topk_weights, topk_ids, m, n, k, e, device, num_runs)", # noqa: E501 + globals=globals, + label=label, + sub_label=sub_label, + description="cutlass_moe_fp4", + ).blocked_autorange(min_run_time=min_run_time) + ) + + # Warmup + replay_graph(cutlass_graph, num_warmup) + + results.append( + benchmark.Timer( + stmt="replay_graph(cutlass_graph, num_runs)", + globals=globals, + label=label, + sub_label=sub_label, + description="cutlass_moe_fp4_cuda_graphs", + ).blocked_autorange(min_run_time=min_run_time) + ) + + +def main(args): + print("Benchmarking models:") + for i, model in enumerate(args.models): + print(f"[{i}] {model}") + + results: list[benchmark.Measurement] = [] + + for model in args.models: + for tp in args.tp_sizes: + for layer in WEIGHT_SHAPES_MOE[model]: + num_experts = layer[0] + topk = layer[1] + size_k = layer[2] + size_n = layer[3] // tp + + if len(args.limit_k) > 0 and size_k not in args.limit_k: + continue + + if len(args.limit_n) > 0 and size_n not in args.limit_n: + continue + + for per_act_token in PER_ACT_TOKEN_OPTS: + for per_out_ch in PER_OUT_CH_OPTS: + for size_m in args.batch_sizes: + mkn = (size_m, size_k, size_n) + bench_run( + results, + model, + num_experts, + topk, + per_act_token, + per_out_ch, + mkn, + ) + + compare = benchmark.Compare(results) + compare.print() + + +if __name__ == "__main__": + parser = FlexibleArgumentParser( + description="Benchmark NVFP4 CUTLASS MOE across specified models/shapes/batches" + ) + parser.add_argument( + "--models", + nargs="+", + type=str, + default=DEFAULT_MODELS, + choices=WEIGHT_SHAPES_MOE.keys(), + ) + parser.add_argument("--tp-sizes", nargs="+", type=int, default=DEFAULT_TP_SIZES) + parser.add_argument( + "--batch-sizes", nargs="+", type=int, default=DEFAULT_BATCH_SIZES + ) + parser.add_argument("--limit-k", nargs="+", type=int, default=[]) + parser.add_argument("--limit-n", nargs="+", type=int, default=[]) + parser.add_argument("--limit-num-groups", nargs="+", type=int, default=[]) + parser.add_argument("--limit-per-act-token", nargs="+", type=int, default=[]) + parser.add_argument("--limit-per-out-ch", nargs="+", type=int, default=[]) + + args = parser.parse_args() + main(args) diff --git a/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py b/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py new file mode 100644 index 000000000..acabe6c1d --- /dev/null +++ b/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py @@ -0,0 +1,378 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import torch +import torch.utils.benchmark as benchmark +from benchmark_shapes import WEIGHT_SHAPES_MOE + +from vllm import _custom_ops as ops +from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config +from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp8 +from vllm.model_executor.layers.fused_moe.fused_moe import ( + fused_experts, + fused_topk, +) +from vllm.utils import FlexibleArgumentParser + +DEFAULT_MODELS = [ + "nm-testing/Mixtral-8x7B-Instruct-v0.1", + "nm-testing/deepseekv2-lite", + "ibm-granite/granite-3.0-1b-a400m", + "ibm-granite/granite-3.0-3b-a800m", +] +DEFAULT_BATCH_SIZES = [1, 4, 8, 16, 32, 64, 128, 256, 512] +DEFAULT_TP_SIZES = [1] + +PER_ACT_TOKEN_OPTS = [False] +PER_OUT_CH_OPTS = [False] + + +def to_fp8(tensor: torch.Tensor): + finfo = torch.finfo(torch.float8_e4m3fn) + return torch.round(tensor.clamp(min=finfo.min, max=finfo.max)).to( + dtype=torch.float8_e4m3fn + ) + + +def bench_run( + results: 
list[benchmark.Measurement], + model: str, + num_experts: int, + topk: int, + per_act_token: bool, + per_out_ch: bool, + mkn: tuple[int, int, int], +): + label = "Quant Matmul" + + sub_label = ( + "{}, num_experts={}, topk={}, per_act_token={} per_out_ch={}, MKN=({})".format( + model, num_experts, topk, per_act_token, per_out_ch, mkn + ) + ) + + print(f"Testing: {sub_label}") + + (m, k, n) = mkn + + dtype = torch.half + + a = torch.randn((m, k), device="cuda", dtype=dtype) / 10 + w1 = torch.randn((num_experts, 2 * n, k), device="cuda", dtype=dtype) / 10 + w2 = torch.randn((num_experts, k, n), device="cuda", dtype=dtype) / 10 + + _, a_scale = ops.scaled_fp8_quant(a) + + w1_q = torch.empty( + (num_experts, 2 * n, k), device="cuda", dtype=torch.float8_e4m3fn + ) + w2_q = torch.empty((num_experts, k, n), device="cuda", dtype=torch.float8_e4m3fn) + w1_scale = torch.empty((num_experts, 1, 1), device="cuda", dtype=torch.float32) + w2_scale = torch.empty((num_experts, 1, 1), device="cuda", dtype=torch.float32) + + for expert in range(num_experts): + w1_q[expert], w1_scale[expert] = ops.scaled_fp8_quant(w1[expert]) + w2_q[expert], w2_scale[expert] = ops.scaled_fp8_quant(w2[expert]) + + score = torch.randn((m, num_experts), device="cuda", dtype=dtype) + + topk_weights, topk_ids, token_expert_indices = fused_topk( + a, score, topk, renormalize=False + ) + + def run_triton_moe( + a: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + w1_scale: torch.Tensor, + w2_scale: torch.Tensor, + a_scale: torch.Tensor, + num_repeats: int, + ): + for _ in range(num_repeats): + fused_experts( + a, + w1, + w2, + topk_weights, + topk_ids, + use_fp8_w8a8=True, + w1_scale=w1_scale, + w2_scale=w2_scale, + a1_scale=a_scale, + ) + + def run_cutlass_moe( + a: torch.Tensor, + a_scale: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + w1_scale: torch.Tensor, + w2_scale: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + num_repeats: int, + ): + for _ in range(num_repeats): + cutlass_moe_fp8( + a, + w1, + w2, + topk_weights, + topk_ids, + w1_scale, + w2_scale, + a1_scale=a_scale, + ) + + def run_cutlass_from_graph( + a: torch.Tensor, + a_scale: torch.Tensor, + w1_q: torch.Tensor, + w2_q: torch.Tensor, + w1_scale: torch.Tensor, + w2_scale: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + ): + with set_current_vllm_config( + VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1)) + ): + return cutlass_moe_fp8( + a, + w1_q, + w2_q, + topk_weights, + topk_ids, + w1_scale, + w2_scale, + a1_scale=a_scale, + ) + + def run_triton_from_graph( + a: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + w1_scale: torch.Tensor, + w2_scale: torch.Tensor, + a_scale: torch.Tensor, + ): + with set_current_vllm_config( + VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1)) + ): + return fused_experts( + a, + w1, + w2, + topk_weights, + topk_ids, + use_fp8_w8a8=True, + w1_scale=w1_scale, + w2_scale=w2_scale, + a1_scale=a_scale, + ) + + def replay_graph(graph, num_repeats): + for _ in range(num_repeats): + graph.replay() + torch.cuda.synchronize() + + cutlass_stream = torch.cuda.Stream() + cutlass_graph = torch.cuda.CUDAGraph() + with torch.cuda.graph(cutlass_graph, stream=cutlass_stream): + run_cutlass_from_graph( + a, + a_scale, + w1_q, + w2_q, + w1_scale, + w2_scale, + topk_weights, + topk_ids, + ) + torch.cuda.synchronize() + + triton_stream = 
torch.cuda.Stream() + triton_graph = torch.cuda.CUDAGraph() + with torch.cuda.graph(triton_graph, stream=triton_stream): + run_triton_from_graph( + a, + w1_q, + w2_q, + topk_weights, + topk_ids, + w1_scale, + w2_scale, + a_scale, + ) + torch.cuda.synchronize() + + min_run_time = 5 + num_warmup = 5 + num_runs = 25 + + globals = { + # Baseline params + "w1": w1, + "w2": w2, + "score": score, + "topk": topk, + # Cutlass params + "a_scale": a_scale, + "w1_q": w1_q, + "w2_q": w2_q, + "w1_scale": w1_scale, + "w2_scale": w2_scale, + # cuda graph params + "cutlass_graph": cutlass_graph, + "triton_graph": triton_graph, + # Gen params + "a": a, + "topk_weights": topk_weights, + "topk_ids": topk_ids, + "num_runs": num_runs, + # Kernels + "run_triton_moe": run_triton_moe, + "run_cutlass_moe": run_cutlass_moe, + "replay_graph": replay_graph, + } + + # Warmup + run_triton_moe( + a, + w1_q, + w2_q, + topk_weights, + topk_ids, + w1_scale, + w2_scale, + a_scale, + num_warmup, + ) + + results.append( + benchmark.Timer( + stmt="run_triton_moe(a, w1_q, w2_q, topk_weights, topk_ids, w1_scale, w2_scale, a_scale, num_runs)", # noqa: E501 + globals=globals, + label=label, + sub_label=sub_label, + description="triton_moe", + ).blocked_autorange(min_run_time=min_run_time) + ) + + # Warmup + replay_graph(triton_graph, num_warmup) + + results.append( + benchmark.Timer( + stmt="replay_graph(triton_graph, num_runs)", + globals=globals, + label=label, + sub_label=sub_label, + description="triton_moe_cuda_graphs", + ).blocked_autorange(min_run_time=min_run_time) + ) + + # Warmup + run_cutlass_moe( + a, + a_scale, + w1_q, + w2_q, + w1_scale, + w2_scale, + topk_weights, + topk_ids, + num_warmup, + ) + + results.append( + benchmark.Timer( + stmt="run_cutlass_moe(a, a_scale, w1_q, w2_q, w1_scale, w2_scale, topk_weights, topk_ids, num_runs)", # noqa: E501 + globals=globals, + label=label, + sub_label=sub_label, + description="grouped_gemm_moe", + ).blocked_autorange(min_run_time=min_run_time) + ) + + # Warmup + replay_graph(cutlass_graph, num_warmup) + + results.append( + benchmark.Timer( + stmt="replay_graph(cutlass_graph, num_runs)", + globals=globals, + label=label, + sub_label=sub_label, + description="grouped_gemm_moe_cuda_graphs", + ).blocked_autorange(min_run_time=min_run_time) + ) + + +def main(args): + print("Benchmarking models:") + for i, model in enumerate(args.models): + print(f"[{i}] {model}") + + results: list[benchmark.Measurement] = [] + + for model in args.models: + for tp in args.tp_sizes: + for layer in WEIGHT_SHAPES_MOE[model]: + num_experts = layer[0] + topk = layer[1] + size_k = layer[2] + size_n = layer[3] // tp + + if len(args.limit_k) > 0 and size_k not in args.limit_k: + continue + + if len(args.limit_n) > 0 and size_n not in args.limit_n: + continue + + for per_act_token in PER_ACT_TOKEN_OPTS: + for per_out_ch in PER_OUT_CH_OPTS: + for size_m in DEFAULT_BATCH_SIZES: + mkn = (size_m, size_k, size_n) + bench_run( + results, + model, + num_experts, + topk, + per_act_token, + per_out_ch, + mkn, + ) + + compare = benchmark.Compare(results) + compare.print() + + +if __name__ == "__main__": + parser = FlexibleArgumentParser( + description="Benchmark Marlin across specified models/shapes/batches" + ) + parser.add_argument( + "--models", + nargs="+", + type=str, + default=DEFAULT_MODELS, + choices=WEIGHT_SHAPES_MOE.keys(), + ) + parser.add_argument("--tp-sizes", nargs="+", type=int, default=DEFAULT_TP_SIZES) + parser.add_argument( + "--batch-sizes", nargs="+", type=int, default=DEFAULT_BATCH_SIZES + ) + 
parser.add_argument("--limit-k", nargs="+", type=int, default=[]) + parser.add_argument("--limit-n", nargs="+", type=int, default=[]) + parser.add_argument("--limit-num-groups", nargs="+", type=int, default=[]) + parser.add_argument("--limit-per-act-token", nargs="+", type=int, default=[]) + parser.add_argument("--limit-per-out-ch", nargs="+", type=int, default=[]) + + args = parser.parse_args() + main(args) diff --git a/benchmarks/kernels/benchmark_layernorm.py b/benchmarks/kernels/benchmark_layernorm.py new file mode 100644 index 000000000..69978ec6b --- /dev/null +++ b/benchmarks/kernels/benchmark_layernorm.py @@ -0,0 +1,93 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import time + +import torch + +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.platforms import current_platform +from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser + + +@torch.inference_mode() +def main( + num_tokens: int, + hidden_size: int, + add_residual: bool, + dtype: torch.dtype, + seed: int = 0, + do_profile: bool = False, + num_warmup_iters: int = 5, + num_iters: int = 100, +) -> None: + current_platform.seed_everything(seed) + torch.set_default_device("cuda") + + layer = RMSNorm(hidden_size).to(dtype=dtype) + layer.weight.data.normal_(mean=1.0, std=0.1) + scale = 1 / (2 * hidden_size) + x = torch.randn(num_tokens, hidden_size, dtype=dtype) + x *= scale + residual = torch.randn_like(x) * scale if add_residual else None + + def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float: + torch.cuda.synchronize() + if profile: + torch.cuda.cudart().cudaProfilerStart() + start_time = time.perf_counter() + + for _ in range(num_iters): + layer(x, residual) + torch.cuda.synchronize() + + end_time = time.perf_counter() + if profile: + torch.cuda.cudart().cudaProfilerStop() + return (end_time - start_time) / num_iters + + # Warmup. + print("Warming up...") + run_benchmark = run_cuda_benchmark + run_benchmark(num_iters=num_warmup_iters, profile=False) + + # Benchmark. + if do_profile: + latency = run_benchmark(num_iters=1, profile=True) + else: + latency = run_benchmark(num_iters=num_iters, profile=False) + print(f"Kernel running time: {latency * 1000000:.3f} us") + + +if __name__ == "__main__": + parser = FlexibleArgumentParser(description="Benchmark the layernorm kernel.") + parser.add_argument("--num-tokens", type=int, default=4096) + parser.add_argument("--hidden-size", type=int, default=8192) + parser.add_argument("--add-residual", action="store_true") + parser.add_argument( + "--dtype", type=str, choices=["half", "bfloat16", "float"], default="half" + ) + parser.add_argument("--seed", type=int, default=0) + parser.add_argument("--profile", action="store_true") + parser.add_argument("--num-warmup-iters", type=int, default=5) + parser.add_argument( + "--num-iters", + type=int, + default=100, + help="Number of benchmark iterations. 
" + "If --profile is set, this number is ignored", + ) + + args = parser.parse_args() + print(args) + + main( + num_tokens=args.num_tokens, + hidden_size=args.hidden_size, + add_residual=args.add_residual, + dtype=STR_DTYPE_TO_TORCH_DTYPE[args.dtype], + seed=args.seed, + do_profile=args.profile, + num_warmup_iters=args.num_warmup_iters, + num_iters=args.num_iters, + ) diff --git a/benchmarks/kernels/benchmark_lora.py b/benchmarks/kernels/benchmark_lora.py new file mode 100644 index 000000000..3d38d4b35 --- /dev/null +++ b/benchmarks/kernels/benchmark_lora.py @@ -0,0 +1,1065 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import argparse +import copy +import json +import pickle +import time +from dataclasses import dataclass +from enum import Enum, auto +from itertools import product +from pathlib import Path +from typing import Any, Callable, Optional + +import torch +import torch.utils.benchmark as TBenchmark +from torch.utils.benchmark import Measurement as TMeasurement +from utils import ArgPool, Bench, CudaGraphBenchParams +from weight_shapes import WEIGHT_SHAPES + +from vllm.triton_utils import HAS_TRITON + +if HAS_TRITON: + from vllm.lora.ops.triton_ops import LoRAKernelMeta, lora_expand, lora_shrink + from vllm.lora.ops.triton_ops.utils import _LORA_A_PTR_DICT, _LORA_B_PTR_DICT + +from vllm.utils import FlexibleArgumentParser + +DEFAULT_MODELS = list(WEIGHT_SHAPES.keys()) +DEFAULT_TP_SIZES = [1] +DEFAULT_BATCH_SIZES = [ + 1, + 16, + 32, + 64, + 128, + 192, + 256, + 320, + 384, + 448, + 512, + 640, + 768, + 896, + 1024, + 2048, + 3072, + 4096, + 5120, + 6144, + 7168, + 8192, +] +DEFAULT_HIDDEN_SIZES = [1024, 2048, 4096, 8192, 16384] +DEFAULT_LORA_RANKS = [16] +DEFAULT_NUM_LORAS = [1, 2, 3, 4] +DEFAULT_SORT_BY_LORA_IDS = [False, True] +DEFAULT_SEQ_LENGTHS = [1] +DEFAULT_EXPAND_FN_ADD_INPUTS = [True, False] + + +# Utilities +def dtype_to_str(dtype: torch.dtype): + if dtype == torch.float16: + return "f16" + if dtype == torch.bfloat16: + return "bf16" + if dtype == torch.float32: + return "f32" + raise ValueError(f"Unsupported dtype {dtype}") + + +def make_rand_lora_weight_tensor( + k: int, n: int, num_loras: int, dtype: torch.dtype, device: str = "cuda" +) -> torch.Tensor: + # LoRA weights column major + return torch.rand((num_loras, n, k), dtype=dtype).to(device) + + +def make_rand_tensors( + a_shape: tuple[int], + b_shape: tuple[int], + c_shape: tuple[int], + a_dtype: torch.dtype, + b_dtype: torch.dtype, + c_dtype: torch.dtype, + num_slices: int, + device: str = "cuda", +) -> tuple[torch.Tensor, list[torch.Tensor], torch.Tensor]: + """ + Make LoRA input/output matrices. + """ + A = torch.rand(a_shape, dtype=a_dtype).to(device) + + # LoRA weights column major + Bs = [torch.rand(b_shape, dtype=b_dtype).to(device) for _ in range(num_slices)] + + C = torch.zeros(c_shape, dtype=c_dtype).to(device) + return A, Bs, C + + +def make_prompt_lora_mapping( + num_prompts: int, num_active_loras: int, sort_by_lora_id: bool, device: str +) -> torch.Tensor: + """ + All prompts are mapped to a LoRA ID in range [0, num_active_loras). + where 0 refers to first lora, 1 refers to second lora and so on. + """ + assert num_active_loras > 0 + + if not sort_by_lora_id: + return torch.randint(0, num_active_loras, (num_prompts,), dtype=torch.long) + + # Divide LoRAs equally and in order. 
+ part_size = num_prompts // num_active_loras + part_size = max(part_size, 1) + + lora_id = 0 + prompt_lora_mapping = [] + while len(prompt_lora_mapping) < num_prompts: + prompt_lora_mapping.extend([lora_id] * part_size) + lora_id = lora_id + 1 if lora_id + 1 < num_active_loras else lora_id + return torch.tensor( + prompt_lora_mapping[:num_prompts], dtype=torch.long, device=device + ) + + +def make_token_lora_mapping( + num_tokens: int, + num_prompts: int, + prompt_lora_mapping: torch.Tensor, + seq_len_tensor: torch.Tensor, + device: str, +): + """ + Make token_lora_mapping from prompt_lora_mapping and seq_lens_tensor + """ + assert prompt_lora_mapping.shape[0] == num_prompts + + # token to lora index mapping + token_lora_mapping = [0] * num_tokens + current_offset = 0 + for b_id in range(num_prompts): + lora_index = prompt_lora_mapping[b_id].item() + s = current_offset + e = s + seq_len_tensor[b_id].item() + token_lora_mapping[s:e] = [lora_index] * (e - s) + current_offset += seq_len_tensor[b_id].item() + + return torch.tensor(token_lora_mapping, dtype=torch.long, device=device) + + +def ref_group_gemm( + ref_out: torch.Tensor, + input: torch.Tensor, + lora_weights: list[torch.Tensor], + seq_lens_cpu: torch.Tensor, + prompt_lora_mapping_cpu: torch.Tensor, + scaling: float, + add_inputs: Optional[bool], +): + """ + Torch group gemm reference implementation to test correctness of + benchmarking operations. + """ + batches = seq_lens_cpu.size(0) + out_list = [] + current_offset = 0 + for lora_index, b_length in zip(range(batches), seq_lens_cpu): + x = input[current_offset : b_length + current_offset, :] + current_offset += b_length + w = lora_weights[prompt_lora_mapping_cpu[lora_index]] + result = torch.nn.functional.linear(x, w) + result *= scaling + out_list.append(result) + + cat_result = torch.cat(out_list, dim=0) + + if add_inputs: + ref_out += cat_result + else: + ref_out.copy_(cat_result) + + +class OpType(Enum): + """ + LoRA Ops to benchmark and its properties. 
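+    LORA_SHRINK maps activations from hidden_size down to lora_rank (one
+    output plane per slice); LORA_EXPAND maps them back from lora_rank up
+    to hidden_size for each slice.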
+ """ + + LORA_SHRINK = auto() + LORA_EXPAND = auto() + + @staticmethod + def from_str(s: str) -> "OpType": + if s.lower() == "lora_shrink": + return OpType.LORA_SHRINK + if s.lower() == "lora_expand": + return OpType.LORA_EXPAND + raise ValueError(f"Unrecognized str {s} to convert to OpType") + + def is_shrink_fn(self) -> bool: + return self in [OpType.LORA_SHRINK] + + def is_expand_fn(self) -> bool: + return self in [OpType.LORA_EXPAND] + + def num_slices(self) -> list[int]: + return [1, 2, 3] + + def mkn( + self, batch_size: int, seq_length: int, hidden_size: int, lora_rank: int + ) -> tuple[int, int, int]: + num_tokens = batch_size * seq_length + if self.is_shrink_fn(): + m = num_tokens + k = hidden_size + n = lora_rank + else: + assert self.is_expand_fn() + m = num_tokens + k = lora_rank + n = hidden_size + return m, k, n + + def matmul_dtypes( + self, op_dtype: torch.dtype + ) -> tuple[torch.dtype, torch.dtype, torch.dtype]: + """ + return a type, b type and c type for A x B = C + """ + if self.is_shrink_fn(): + return op_dtype, op_dtype, torch.float32 + else: + assert self.is_expand_fn() + return torch.float32, op_dtype, op_dtype + + def matmul_shapes( + self, + batch_size: int, + seq_length: int, + hidden_size: int, + lora_rank: int, + num_loras: int, + num_slices: int, + ) -> tuple[tuple[int], tuple[int], tuple[int]]: + """ + Given num_slices, return the shapes of the A, B, and C matrices + in A x B = C, for the op_type + """ + m, k, n = self.mkn(batch_size, seq_length, hidden_size, lora_rank) + + b_shape = (num_loras, n, k) # col-major + if self in [OpType.LORA_SHRINK]: + # LoRA shrink kernels support num_slices inherently in the kernel. + return ((m, k), b_shape, (num_slices, m, n)) + if self in [OpType.LORA_EXPAND]: + # LoRA expand kernels support num_slices inherently in the kernel + return ((num_slices, m, k), b_shape, (m, n * num_slices)) + raise ValueError(f"Unrecognized op_type {self}") + + def bench_fn(self) -> Callable: + if self == OpType.LORA_SHRINK: + return lora_shrink + if self == OpType.LORA_EXPAND: + return lora_expand + + raise ValueError(f"Unrecognized optype {self}") + + def run_ref_group_gemm( + self, + output: torch.Tensor, + input: torch.Tensor, + lora_weights: list[torch.Tensor], + **kwargs, + ) -> Callable: + """Each benchmark operation expects the input, lora_weights and outputs + in a slightly different format. Refer to self.matmul_shapes(). + run_ref_group_gemm accounts for those differences in executing a + reference group gemm for correctness testing. 
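+        For LORA_SHRINK each slice writes its own [num_tokens, lora_rank]
+        plane of the output; for LORA_EXPAND each slice writes its own
+        hidden_size-wide column block.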
+ """ + w_dtype = lora_weights[0].dtype + num_slices = len(lora_weights) + if self in [OpType.LORA_SHRINK]: + for slice_idx in range(num_slices): + ref_group_gemm( + ref_out=output[slice_idx, :], + input=input, + lora_weights=lora_weights[slice_idx], + **kwargs, + ) + elif self in [OpType.LORA_EXPAND]: + hidden_size = lora_weights[0].shape[1] + for slice_idx in range(num_slices): + slice_offset = slice_idx * hidden_size + ref_group_gemm( + ref_out=output[:, slice_offset : slice_offset + hidden_size], + input=input[slice_idx].clone().to(dtype=w_dtype), + lora_weights=lora_weights[slice_idx], + **kwargs, + ) + else: + raise ValueError(f"Unrecognized optype {self}") + + +@dataclass +class BenchmarkContext: + """ + LoRA benchmark context + """ + + batch_size: int + hidden_size: int + num_loras: int + num_active_loras: int + lora_rank: int + sort_by_lora_id: bool + dtype: torch.dtype + seq_length: Optional[int] = None + num_slices: Optional[int] = None # num_slices for slice based ops + + def with_seq_length(self, seq_length: int) -> "BenchmarkContext": + ctx = copy.copy(self) + ctx.seq_length = seq_length + return ctx + + def with_num_slices(self, num_slices: int) -> "BenchmarkContext": + ctx = copy.copy(self) + ctx.num_slices = num_slices + return ctx + + def bench_label(self) -> str: + return f"lora-{self.dtype}" + + def bench_sublabel(self, op_type: OpType) -> str: + m, k, n = op_type.mkn( + self.batch_size, self.seq_length, self.hidden_size, self.lora_rank + ) + desc = { + "bs": self.batch_size, + "sl": self.seq_length, + "m": m, + "k": k, + "n": n, + "num_loras": self.num_loras, + "sort_by_lora": self.sort_by_lora_id, + "num_slices": self.num_slices, + } + return json.dumps(desc) + + +@dataclass +class BenchmarkTensors: + """ + Input/Output tensors used for benchmarks + """ + + # matmul tensors + input: torch.Tensor + lora_weights_lst: list[torch.Tensor] + output: torch.Tensor + # LoRA kernel metadata + lora_kernel_meta: LoRAKernelMeta + # Metadata tensors used in testing correctness + seq_lens: torch.Tensor + prompt_lora_mapping: torch.Tensor + + def io_types(self) -> str: + return ( + f"{dtype_to_str(self.input.dtype)}x" + f"{dtype_to_str(self.lora_weights_lst[0].dtype)}=>" + f"{dtype_to_str(self.output.dtype)}" + ) + + @staticmethod + def make( + ctx: BenchmarkContext, op_type: OpType, device: str = "cuda" + ) -> "BenchmarkTensors": + # Make input / output matmul tensors. + a_shape, b_shape, c_shape = op_type.matmul_shapes( + ctx.batch_size, + ctx.seq_length, + ctx.hidden_size, + ctx.lora_rank, + ctx.num_loras, + ctx.num_slices, + ) + a_type, b_type, c_type = op_type.matmul_dtypes(ctx.dtype) + input_tensor, lora_weights, output_tensor = make_rand_tensors( + a_shape, b_shape, c_shape, a_type, b_type, c_type, num_slices=ctx.num_slices + ) + + # Make metadata tensors. + # Keep the metadata tensors in the CPU for further processing if needed. + # The tensors get moved to the GPU before benchmarking. + assert ctx.num_active_loras <= ctx.num_loras + total_tokens = ctx.batch_size * ctx.seq_length + + # Make metadata tensors involved in correctness testing. 
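+        # Every prompt below gets exactly seq_length tokens (the randint range
+        # contains a single value), so total_tokens == seq_len_tensor.sum().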
+ # Prepare seq lens tensor + seq_len_tensor = torch.randint( + ctx.seq_length, ctx.seq_length + 1, (ctx.batch_size,) + ) + assert total_tokens == seq_len_tensor.sum() + # Prepare prompt lora indices tensor + prompt_lora_indices_tensor = make_prompt_lora_mapping( + ctx.batch_size, ctx.num_active_loras, ctx.sort_by_lora_id, "cpu" + ) + + # Make LoRAKernelMeta + token_lora_indices_tensor = make_token_lora_mapping( + total_tokens, + ctx.batch_size, + prompt_lora_indices_tensor, + seq_len_tensor, + "cpu", + ) + lora_kernel_meta = LoRAKernelMeta.make( + max_loras=ctx.num_loras, + max_num_tokens=token_lora_indices_tensor.size(0), + device="cpu", + ) + lora_kernel_meta.prepare_tensors(token_lora_mapping=token_lora_indices_tensor) + + return BenchmarkTensors( + input_tensor, + lora_weights, + output_tensor, + lora_kernel_meta, + seq_len_tensor, + prompt_lora_indices_tensor, + ) + + def sanity_check(self) -> None: + """ + Fails asserts when non-conformality is detected. + """ + num_tokens = self.input.shape[-2] + # check metadata tensors + assert torch.sum(self.seq_lens) == num_tokens + num_seqs = self.seq_lens.shape[0] + # assert self.seq_start_loc.shape[0] == num_seqs + assert self.prompt_lora_mapping.shape[0] == num_seqs + assert self.lora_kernel_meta.token_lora_mapping.shape[0] == num_tokens + + def to_device(self, device: str): + """ + Transfer tensors to device if the tensors aren't already on the device + """ + + def to_device(tensor: torch.Tensor): + if tensor.device != device: + tensor = tensor.to(device=device) + return tensor + + self.input = to_device(self.input) + self.output = to_device(self.output) + self.seq_lens = to_device(self.seq_lens) + self.prompt_lora_mapping = to_device(self.prompt_lora_mapping) + for i in range(len(self.lora_weights_lst)): + self.lora_weights_lst[i] = to_device(self.lora_weights_lst[i]) + + # LoRA meta + for field_name in LoRAKernelMeta.__dataclass_fields__: + field = getattr(self.lora_kernel_meta, field_name) + assert isinstance(field, torch.Tensor) + setattr(self.lora_kernel_meta, field_name, to_device(field)) + + def metadata(self) -> tuple[int, int, int]: + """ + Return num_seqs, num_tokens and max_seq_len + """ + num_seqs = self.seq_lens.shape[0] + num_tokens = self.lora_kernel_meta.token_lora_mapping.shape[0] + max_seq_len = torch.max(self.seq_lens).item() + num_slices = len(self.lora_weights_lst) + return num_seqs, num_tokens, max_seq_len, num_slices + + def as_lora_shrink_kwargs(self) -> dict[str, Any]: + self.sanity_check() + self.to_device(self.input.device) + + _, num_tokens, _, num_slices = self.metadata() + + # Sanity check matrix shapes. 
+ i_shape, lw_shape, o_shape = ( + self.input.shape, + self.lora_weights_lst[0].shape, + self.output.shape, + ) + # Expected input shape [num_tokens, hidden_size] + assert len(i_shape) == 2 + assert i_shape[0] == num_tokens + hidden_size = i_shape[1] + # Expected lora weight shape [num_loras, lora_rank, hidden_size] + assert len(lw_shape) == 3 + assert lw_shape[2] == hidden_size + lora_rank = lw_shape[1] + # Expected output shape [num_slices, num_tokens, lora_rank] + assert len(o_shape) == 3 + assert o_shape == (num_slices, num_tokens, lora_rank) + + return { + "inputs": self.input, + "lora_a_weights": self.lora_weights_lst, + "output_tensor": self.output, + "token_lora_mapping": self.lora_kernel_meta.token_lora_mapping, + "token_indices_sorted_by_lora_ids": ( + self.lora_kernel_meta.token_indices_sorted_by_lora_ids + ), + "num_tokens_per_lora": self.lora_kernel_meta.num_tokens_per_lora, + "lora_token_start_loc": self.lora_kernel_meta.lora_token_start_loc, + "lora_ids": self.lora_kernel_meta.active_lora_ids, + "scaling": 1.0, + } + + def as_lora_expand_kwargs(self, add_inputs: bool) -> dict[str, Any]: + self.sanity_check() + self.to_device(self.input.device) + + _, num_tokens, _, num_slices = self.metadata() + + # Sanity check matrix shapes. + i_shape, lw_shape, o_shape = ( + self.input.shape, + self.lora_weights_lst[0].shape, + self.output.shape, + ) + # Expected input shape : [num_slices, num_tokens, lora_rank] + assert len(i_shape) == 3 + assert i_shape[0] == num_slices + assert i_shape[1] == num_tokens + lora_rank = i_shape[2] + # Expected lora weight shape : [num_lora, hidden_size, lora_rank] + assert len(lw_shape) == 3 + assert lw_shape[2] == lora_rank + hidden_size = lw_shape[1] + # Expected output shape : [num_tokens, hidden_size * num_slices] + assert len(o_shape) == 2 + assert o_shape == (num_tokens, hidden_size * num_slices) + + return { + "inputs": self.input, + "lora_b_weights": self.lora_weights_lst, + "output_tensor": self.output, + "token_lora_mapping": self.lora_kernel_meta.token_lora_mapping, + "token_indices_sorted_by_lora_ids": ( + self.lora_kernel_meta.token_indices_sorted_by_lora_ids + ), + "num_tokens_per_lora": self.lora_kernel_meta.num_tokens_per_lora, + "lora_token_start_loc": self.lora_kernel_meta.lora_token_start_loc, + "lora_ids": self.lora_kernel_meta.active_lora_ids, + "offset_start": 0, + "add_inputs": add_inputs, + } + + def bench_fn_kwargs( + self, op_type: OpType, add_inputs: Optional[bool] = None + ) -> dict[str, Any]: + if op_type.is_shrink_fn(): + assert add_inputs is None + else: + assert add_inputs is not None + + if op_type == OpType.LORA_SHRINK: + return self.as_lora_shrink_kwargs() + if op_type == OpType.LORA_EXPAND: + return self.as_lora_expand_kwargs(add_inputs) + raise ValueError(f"Unrecognized optype {self}") + + def test_correctness( + self, op_type: OpType, expand_fn_add_inputs: Optional[bool] + ) -> bool: + """ + Test correctness of op_type implementation against a grouped gemm + reference implementation. 
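+        Returns True when the kernel output matches the reference within
+        dtype-dependent rtol/atol tolerances.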
+ """ + seq_lens_cpu = self.seq_lens.to(device="cpu") + prompt_lora_mapping_cpu = self.prompt_lora_mapping.to(device="cpu") + ref_output = self.output.clone() + + self.output.zero_() + op_type.bench_fn()(**self.bench_fn_kwargs(op_type, expand_fn_add_inputs)) + + op_type.run_ref_group_gemm( + ref_output, + self.input, + self.lora_weights_lst, + seq_lens_cpu=seq_lens_cpu, + prompt_lora_mapping_cpu=prompt_lora_mapping_cpu, + scaling=1.0, + add_inputs=expand_fn_add_inputs, + ) + + rtol, atol = { + torch.float16: (6e-2, 6e-2), + torch.bfloat16: (6e-2, 6e-2), + torch.float32: (1e-2, 1e-2), + }[self.output.dtype] + + return torch.allclose(ref_output, self.output, rtol=rtol, atol=atol) + + +def bench_optype( + ctx: BenchmarkContext, + arg_pool_size: int, + op_type: OpType, + cuda_graph_nops: Optional[int] = None, + expand_fn_add_inputs: Optional[bool] = None, + test_correctness: bool = False, +) -> TMeasurement: + assert arg_pool_size >= 1 + if op_type.is_shrink_fn(): + assert expand_fn_add_inputs is None + else: + assert expand_fn_add_inputs is not None + + # BenchmarkContext -> BenchmarkTensors + bench_tensors: list[BenchmarkTensors] = [ + BenchmarkTensors.make(ctx, op_type) for _ in range(arg_pool_size) + ] + for bt in bench_tensors: + bt.sanity_check() + + # Test correctness of our implementation. + if test_correctness: + assert all( + [bt.test_correctness(op_type, expand_fn_add_inputs) for bt in bench_tensors] + ) + + # BenchmarkTensors -> dict (kwargs) + kwargs_list = [ + bt.bench_fn_kwargs(op_type, add_inputs=expand_fn_add_inputs) + for bt in bench_tensors + ] + + # Clear LoRA optimization hash-maps. + _LORA_A_PTR_DICT.clear() + _LORA_B_PTR_DICT.clear() + # Run bench function so that _LORA_A_PTR_DICT and _LORA_B_PTR_DICT are setup + for kwargs in kwargs_list: + op_type.bench_fn()(**kwargs) + torch.cuda.synchronize() + + # Merge into a single kwargs and qualify arguments as ArgPool + kwargs = {k: ArgPool([]) for k in kwargs_list[0]} + for _kwargs in kwargs_list: + for k, v in _kwargs.items(): + kwargs[k].values.append(v) + + describe_args = ( + f"add_inputs={expand_fn_add_inputs}" if expand_fn_add_inputs is not None else "" + ) + description = f"{op_type.name}({describe_args}) ({bench_tensors[0].io_types()})" + + cuda_graph_params = None + if cuda_graph_nops: + cuda_graph_params = CudaGraphBenchParams(cuda_graph_nops) + timer = None + with Bench( + cuda_graph_params, + ctx.bench_label(), + ctx.bench_sublabel(op_type), + description, + op_type.bench_fn(), + **kwargs, + ) as bench: + timer = bench.run() + return timer + + +def bench_torch_mm( + ctx: BenchmarkContext, + arg_pool_size: int, + op_type: OpType, + cuda_graph_nops: Optional[int] = None, +) -> TMeasurement: + """ + Benchmark basic torch.mm as a roofline. + + When all the input tokens have the same LoRA ID, the LoRA kernels are just + a matmul. This torch.mm benchmark serves as a roofline for that case. + + input op_type is used in determining the m, k, n dimensions for the matmul. + """ + + batch_size, hidden_size, lora_rank, seq_length, dtype = ( + ctx.batch_size, + ctx.hidden_size, + ctx.lora_rank, + ctx.seq_length, + ctx.dtype, + ) + + m, k, n = op_type.mkn(batch_size, seq_length, hidden_size, lora_rank) + # For a fairer comparison. 
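+    # The LoRA kernels process num_slices weight slices per call, so scale the
+    # roofline matmul's N dimension to cover the same total work.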
+ n = n * ctx.num_slices + + # Get matmul input and output tensors for A x B = C + As, Bs, Cs = [], [], [] + for _ in range(arg_pool_size): + As.append(torch.rand((m, k), dtype=dtype).to("cuda")) + Bs.append(torch.rand((n, k), dtype=dtype).to("cuda").t()) + Cs.append(torch.rand((m, n), dtype=dtype).to("cuda")) + + # Make torch.mm kwargs + mm_kwargs = {"input": ArgPool(As), "mat2": ArgPool(Bs), "out": ArgPool(Cs)} + + description = ( + f"single-lora roofline using torch.mm ({dtype_to_str(dtype)}" + f"x{dtype_to_str(dtype)}" + f"=>{dtype_to_str(dtype)})" + ) + cuda_graph_params = None + if cuda_graph_nops: + cuda_graph_params = CudaGraphBenchParams(cuda_graph_nops) + with Bench( + cuda_graph_params, + ctx.bench_label(), + ctx.bench_sublabel(op_type), + description, + torch.mm, + **mm_kwargs, + ) as bench: + return bench.run() + + +# runner +def use_cuda_graph_recommendation() -> str: + return """ + Triton kernels have a significant launch overhead with + launched directly via python. This overhead is more noticeable + for small the problem sizes. For these cases, it is recommended + to use the script with `--cuda-graph-nops N` to benchmark N + consecutive invocations of the benchmarking operations from + inside a CUDA Graph. Note that the returned measurement is for N + invocations of the operation. + """ + + +def print_timers(timers: list[TMeasurement], args: Optional[argparse.Namespace] = None): + compare = TBenchmark.Compare(timers) + compare.print() + + if args and args.cuda_graph_nops: + print( + f"Note : The timings reported above is for {args.cuda_graph_nops} " + "consecutive invocations of the benchmarking functions. " + f"Please divide by {args.cuda_graph_nops} for single invocation " + "timings." + ) + + print( + "Note on Comparison with torch.mm : The torch.mm numbers are " + "benchmark numbers of a simple matmul emulating the single lora " + "case. It is provided as a roofline for comparing our LoRA Kernel " + "implementations. It is expected that the LoRA kernels will be " + "slower than torch.mm in cases where num_loras is big. But for " + "small num_loras the goal should be to match the torch.mm numbers." 
+ ) + + +def run(args: argparse.Namespace, bench_ctxs: list[BenchmarkContext]): + if args.cuda_graph_nops is not None: + assert args.cuda_graph_nops > 0 + print(f"Benchmarking {args.cuda_graph_nops} invocations inside a CUDA Graph") + else: + print(f"CUDA Graphs not enabled.\n{use_cuda_graph_recommendation()}") + + timers = [] + for bench_ctx in bench_ctxs: + for seq_len in args.seq_lengths: + bench_ops: list[OpType] = args.op_types + seq_len_timers = [] + for bench_op in bench_ops: + for num_slices in bench_op.num_slices(): + _ctx = bench_ctx.with_seq_length(seq_len).with_num_slices( + num_slices + ) + # Benchmark torch.mm as a roofline + seq_len_timers.append( + bench_torch_mm( + _ctx, args.arg_pool_size, bench_op, args.cuda_graph_nops + ) + ) + + # Benchmark bench_op + expand_fn_add_inputs = ( + [None] if bench_op.is_shrink_fn() else args.expand_fn_add_inputs + ) + for add_input_arg in expand_fn_add_inputs: + seq_len_timers.append( + bench_optype( + _ctx, + args.arg_pool_size, + bench_op, + args.cuda_graph_nops, + add_input_arg, + args.test_correctness, + ) + ) + + print_timers(seq_len_timers) + timers.extend(seq_len_timers) + + # Result stdout dump + print("== All Results ====") + print_timers(timers, args) + + if args.output_directory: + # Result file dump + od = Path(args.output_directory) + if not od.exists(): + od.mkdir() + + timestamp = int(time.time()) + pkl_file = od / f"lora_bench-{timestamp}.pkl" + print(f"Writing benchmarks to {pkl_file}") + with open(pkl_file, "wb") as f: + pickle.dump(timers, f) + + +def as_benchmark_contexts( + hidden_sizes: list[int], lora_ranks: list[int], args: argparse.Namespace +) -> list[BenchmarkContext]: + ctxs: list[BenchmarkContext] = [] + for batch_size, hidden_size, lora_rank, num_loras, sort_by_lora_id in product( # noqa + args.batch_sizes, + list(hidden_sizes), + lora_ranks, + args.num_loras, + args.sort_by_lora_id, + ): + ctxs.append( + BenchmarkContext( + batch_size=batch_size, + hidden_size=hidden_size, + lora_rank=lora_rank, + num_loras=num_loras, + num_active_loras=args.num_active_loras + if args.num_active_loras + else num_loras, + # To be filled based on the OpType to benchmark + seq_length=None, + sort_by_lora_id=sort_by_lora_id, + dtype=args.dtype, + # To be filled based on the OpType to benchmark + num_slices=None, + ) + ) + + return ctxs + + +def run_list_bench(args: argparse.Namespace): + print(args) + + print( + "List bench :\n" + f" Hidden Sizes {args.hidden_sizes}" + f" LoRA Ranks {args.lora_ranks}" + ) + + # Get all benchmarking contexts + bench_contexts: list[BenchmarkContext] = as_benchmark_contexts( + hidden_sizes=args.hidden_sizes, lora_ranks=args.lora_ranks, args=args + ) + + run(args, bench_contexts) + + +def run_range_bench(args: argparse.Namespace): + print(args) + + hidden_sizes = list( + range( + args.hidden_sizes_start, + args.hidden_sizes_end + 1, + args.hidden_sizes_increment, + ) + ) + lora_ranks = list( + range(args.lora_ranks_start, args.lora_ranks_end + 1, args.lora_ranks_increment) + ) + + print(f"Range bench :\n Hidden Sizes {hidden_sizes} LoRA Ranks {lora_ranks}") + + # Get all benchmarking contexts + bench_contexts: list[BenchmarkContext] = as_benchmark_contexts( + hidden_sizes=hidden_sizes, lora_ranks=lora_ranks, args=args + ) + + run(args, bench_contexts) + + +def run_model_bench(args: argparse.Namespace): + print(args) + + def hidden_sizes_from_model(model: str, tp_size: int) -> set[int]: + hidden_sizes = set() + for KN, tp_split_dim in WEIGHT_SHAPES[model]: + KN[tp_split_dim] = KN[tp_split_dim] // 
tp_size + hidden_sizes.add(KN[1]) + return hidden_sizes + + # Get all hidden sizes + hidden_sizes: set[int] = set() + for model_name, tp_size in product(args.models, args.tp_sizes): + hidden_sizes = hidden_sizes.union(hidden_sizes_from_model(model_name, tp_size)) + + print(f"Model bench :\n Hidden Sizes {hidden_sizes} LoRA Ranks {args.lora_ranks}") + + # Get all benchmarking contexts + bench_contexts: list[BenchmarkContext] = as_benchmark_contexts( + hidden_sizes=hidden_sizes, lora_ranks=args.lora_ranks, args=args + ) + + run(args, bench_contexts) + + +if __name__ == "__main__": + + def to_torch_dtype(dt): + if dt == "torch.float16": + return torch.float16 + if dt == "torch.bfloat16": + return torch.bfloat16 + raise ValueError("unsupported dtype") + + def get_bool(s: str) -> bool: + return s.lower() in ["true", "1"] + + def add_common_command_args(p: argparse.ArgumentParser): + p.add_argument( + "--dtype", + type=to_torch_dtype, + required=True, + help="Available options are ['torch.float16', 'torch.bfloat16']", + ) + + p.add_argument( + "--arg-pool-size", + type=int, + default=32, + help="Run profiles with a pool of input/output/meta tensors instead" + "of simply reusing the same tensors for all runs. A bigger arg-pool" + "mitigates hardware caching effects during benchmarking.", + ) + + p.add_argument( + "--cuda-graph-nops", + type=int, + help=( + "when set profiling is done using cudagraph, " + "with the given number of operations in a graph." + "Note that the measurement returned is the time " + "taken for N consecutive executions of the benchmarking " + "functions, where N is the value of this argument." + ), + ) + p.add_argument("--num-loras", nargs="+", type=int, default=DEFAULT_NUM_LORAS) + p.add_argument( + "--num-active-loras", + type=int, + default=None, + help="Active LoRAs. 
When None, all LoRAs are active", + ) + p.add_argument( + "--sort-by-lora-id", + nargs="+", + type=get_bool, + default=DEFAULT_SORT_BY_LORA_IDS, + ) + p.add_argument( + "--op-types", nargs="+", type=OpType.from_str, default=list(OpType) + ) + p.add_argument( + "--seq-lengths", nargs="+", type=int, default=DEFAULT_SEQ_LENGTHS + ) + p.add_argument( + "--batch-sizes", nargs="+", type=int, default=DEFAULT_BATCH_SIZES + ) + p.add_argument( + "--expand-fn-add-inputs", + nargs="+", + type=get_bool, + default=DEFAULT_EXPAND_FN_ADD_INPUTS, + ) + p.add_argument( + "-o", + "--output-directory", + type=str, + help=( + "Output directory to store a the list of benchmarking" + "TMeasurement objects as a pickle file" + ), + ) + + p.add_argument( + "--test-correctness", + action="store_true", + help=( + "When enabled, the benchmarking functions are tested" + "for correctness before the actual benchmarking" + ), + ) + + parser = FlexibleArgumentParser( + description=f""" +Benchmark LoRA kernels: + {use_cuda_graph_recommendation()} + + list_bench example: + python3 benchmarks/kernels/benchmark_lora.py list_bench --arg-pool-size 32 --batch-sizes 1 16 32 --dtype torch.float16 --hidden-sizes 2048 --lora-ranks 16 --num-loras 1 4 --op-types lora_shrink lora_expand --seq-lengths 1 16 --sort-by-lora-id 1 --cuda-graph-nops 32 + + model_bench example: + python3 benchmarks/kernels/benchmark_lora.py model_bench --models meta-llama/Llama-3-8b --arg-pool-size 32 --batch-sizes 1 16 32 --dtype torch.float16 --lora-ranks 16 --num-loras 1 4 --op-types lora_shrink lora_expand --seq-lengths 1 16 --sort-by-lora-id 1 --cuda-graph-nops 32 + + range_bench example: + python3 benchmarks/kernels/benchmark_lora.py range_bench --arg-pool-size 32 --batch-sizes 1 16 32 --dtype torch.float16 --num-loras 1 4 --op-types lora_shrink lora_expand --seq-lengths 1 16 --sort-by-lora-id 1 --cuda-graph-nops 32 --hidden-sizes-start 1024 --hidden-sizes-end 4096 --hidden-sizes-increment 1024 --lora-ranks-start 8 --lora-ranks-end 24 --lora-ranks-increment 8 + """, # noqa: E501 + formatter_class=argparse.RawTextHelpFormatter, + ) + + subparsers = parser.add_subparsers(dest="cmd", required=True) + + list_parser = subparsers.add_parser("list_bench") + list_parser.add_argument( + "--hidden-sizes", nargs="+", type=int, default=DEFAULT_HIDDEN_SIZES + ) + list_parser.add_argument( + "--lora-ranks", nargs="+", type=int, default=DEFAULT_LORA_RANKS + ) + add_common_command_args(list_parser) + list_parser.set_defaults(func=run_list_bench) + + range_parser = subparsers.add_parser("range_bench") + range_parser.add_argument("--hidden-sizes-start", type=int, required=True) + range_parser.add_argument("--hidden-sizes-end", type=int, required=True) + range_parser.add_argument("--hidden-sizes-increment", type=int, required=True) + range_parser.add_argument("--lora-ranks-start", type=int, required=True) + range_parser.add_argument("--lora-ranks-end", type=int, required=True) + range_parser.add_argument("--lora-ranks-increment", type=int, required=True) + add_common_command_args(range_parser) + range_parser.set_defaults(func=run_range_bench) + + model_parser = subparsers.add_parser("model_bench") + model_parser.add_argument( + "--models", + nargs="+", + type=str, + default=DEFAULT_MODELS, + choices=WEIGHT_SHAPES.keys(), + ) + model_parser.add_argument( + "--tp-sizes", nargs="+", type=int, default=DEFAULT_TP_SIZES + ) + model_parser.add_argument( + "--lora-ranks", nargs="+", type=int, default=DEFAULT_LORA_RANKS + ) + add_common_command_args(model_parser) + 
model_parser.set_defaults(func=run_model_bench) + + args = parser.parse_args() + args.func(args) diff --git a/benchmarks/kernels/benchmark_machete.py b/benchmarks/kernels/benchmark_machete.py new file mode 100644 index 000000000..0f896f187 --- /dev/null +++ b/benchmarks/kernels/benchmark_machete.py @@ -0,0 +1,730 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import argparse +import copy +import itertools +import math +import os +import pickle as pkl +import time +from collections.abc import Iterable +from dataclasses import dataclass +from itertools import product +from typing import Callable, Optional + +import pandas as pd +import torch +import torch.utils.benchmark as TBenchmark +from torch.utils.benchmark import Measurement as TMeasurement +from weight_shapes import WEIGHT_SHAPES + +from vllm import _custom_ops as ops +from vllm.model_executor.layers.quantization.utils.marlin_utils import ( + GPTQ_MARLIN_MAX_PARALLEL, + GPTQ_MARLIN_MIN_THREAD_N, + marlin_permute_scales, + marlin_zero_points, +) +from vllm.model_executor.layers.quantization.utils.marlin_utils_test import ( + MarlinWorkspace, +) +from vllm.model_executor.layers.quantization.utils.quant_utils import ( + pack_rows, + quantize_weights, +) +from vllm.scalar_type import ScalarType, scalar_types +from vllm.utils import FlexibleArgumentParser + +DEFAULT_MODELS = ["meta-llama/Llama-3-8b", "meta-llama/Llama-2-70b-hf"] +DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512, 1024] +DEFAULT_TP_SIZES = [1] + +NVTX_PROFILE = os.environ.get("NVTX_PROFILE", False) + +if NVTX_PROFILE: + import nvtx + + +def terse_type_name(dt): + return { + torch.bfloat16: "bf16", + torch.float16: "fp16", + torch.int8: "int8", + torch.float8_e4m3fn: "fp8", + torch.float: "float", + torch.int: "int", + }[dt] + + +@dataclass +class BenchmarkTensors: + w_ref: torch.Tensor + a: torch.Tensor + + w_q: torch.Tensor + group_size: Optional[int] + wtype: ScalarType + w_g_s: torch.Tensor + w_g_zp: Optional[torch.Tensor] + w_ch_s: Optional[torch.Tensor] + w_tok_s: Optional[torch.Tensor] + + +@dataclass +class TypeConfig: + act_type: torch.dtype + weight_type: ScalarType + output_type: Optional[torch.dtype] + group_scale_type: Optional[torch.dtype] + group_zero_type: Optional[torch.dtype] + channel_scale_type: Optional[torch.dtype] + token_scale_type: Optional[torch.dtype] + + +def rand_data(shape, dtype=torch.float16, scale=1): + if dtype.is_floating_point: + return (scale * torch.rand(shape, device="cuda") - 0.3).to(dtype) + else: + return torch.randint(-15, 15, shape, dtype=dtype, device="cuda") + + +def quantize_and_pack( + atype: torch.dtype, + w: torch.Tensor, + wtype: ScalarType, + stype: Optional[torch.dtype], + group_size: Optional[int], + zero_points: bool = False, +): + assert wtype.is_integer(), "TODO: support floating point weights" + + w_ref, w_q, w_s, w_zp = quantize_weights( + w, + wtype, + group_size=group_size, + zero_points=zero_points, + # to match how the kernel applies zps + ref_zero_points_after_scales=True, + ) + + w_q = pack_rows(w_q, wtype.size_bits, *w_q.shape) + return w_ref, w_q, w_s, w_zp + + +def create_bench_tensors( + shape: tuple[int, int, int], types: TypeConfig, group_size: Optional[int] +) -> list[BenchmarkTensors]: + m, n, k = shape + + # we want to make sure that weights don't fit into L2 cache between runs so + # we construct enough weights to exceed L2 cache, which is 50mb on a H100 + # so we target total weight size > 2*50mb + num_weights = math.ceil( + 2 * 50 
* 1024**2 * 8 / (k * n * types.weight_type.size_bits) + ) + + a = rand_data((m, k), types.act_type, scale=5) + + benchmark_tensors: list[BenchmarkTensors] = [] + for _ in range(num_weights): + w = rand_data((k, n), types.act_type, scale=5) + + if types.group_scale_type is not None: + w = w.to(types.group_scale_type) + if w.dtype.itemsize == 1: + w = w.to(torch.float16) + + w_ref, w_q_packed, w_s, w_zp = quantize_and_pack( + a.dtype, + w, + types.weight_type, + types.group_scale_type, + group_size, + types.group_zero_type is not None, + ) + + if not a.dtype.is_floating_point: + aiinfo = torch.iinfo(a.dtype) + w_ref = w_ref.round().clamp(aiinfo.min, aiinfo.max) + + w_ref = w_ref.to(torch.float32) + + w_ch_s = ( + None + if types.channel_scale_type is None + else rand_data((n,), types.channel_scale_type) + ) + w_tok_s = ( + None + if types.token_scale_type is None + else rand_data((m,), types.token_scale_type) + ) + + benchmark_tensors.append( + BenchmarkTensors( + w_ref=w_ref, + a=a, + w_q=w_q_packed, + wtype=types.weight_type, + w_g_s=w_s, + w_g_zp=w_zp, + group_size=group_size, + w_ch_s=w_ch_s, + w_tok_s=w_tok_s, + ) + ) + + return benchmark_tensors + + +def torch_matmul_f16_create_bench_fn(bt: BenchmarkTensors) -> Callable: + a = bt.a + w = bt.w_ref.to(bt.a.dtype) # use float reference tensor + if a.dtype not in [torch.float16, torch.bfloat16]: + a = a.to(torch.float16) + w = w.to(torch.float16) + return lambda: torch.matmul(a, w) + + +def cutlass_scaled_mm_create_bench_fn(bt: BenchmarkTensors) -> Callable: + if bt.w_ch_s is not None and bt.w_tok_s is not None: + scale_a = bt.w_tok_s.to(torch.float32) + scale_b = bt.w_ch_s.to(torch.float32) + else: + scale_a = torch.tensor(1.0, dtype=torch.float32, device=bt.a.device) + scale_b = torch.tensor(1.0, dtype=torch.float32, device=bt.a.device) + w_col_major = bt.w_ref.to(bt.a.dtype).t().contiguous().t() + return lambda: ops.cutlass_scaled_mm( + bt.a, w_col_major, scale_a, scale_b, out_dtype=torch.float16 + ) + + +def marlin_create_bench_fn(bt: BenchmarkTensors) -> Callable: + device = bt.a.device + + workspace = MarlinWorkspace( + bt.w_ref.shape[1], GPTQ_MARLIN_MIN_THREAD_N, GPTQ_MARLIN_MAX_PARALLEL + ) + + if bt.w_g_zp is None: + w_zp = torch.empty(0, dtype=torch.int, device=device) + else: + w_zp = marlin_zero_points( + bt.w_g_zp, bt.w_ref.shape[0], bt.w_ref.shape[1], bt.wtype.size_bits + ) + + if bt.group_size is None: + w_s = torch.tensor([], device="cuda", dtype=torch.half) + else: + w_s = marlin_permute_scales( + bt.w_g_s, bt.w_ref.shape[0], bt.w_ref.shape[1], bt.group_size + ) + + sort_indices = torch.empty(0, dtype=torch.int, device=device) + g_idx = torch.empty(0, dtype=torch.int, device=device) + w_q = ops.gptq_marlin_repack( + bt.w_q, sort_indices, bt.w_ref.shape[0], bt.w_ref.shape[1], bt.wtype.size_bits + ) + + if bt.a.dtype.is_floating_point: + assert bt.w_ch_s is None + assert bt.w_tok_s is None + assert bt.group_size is not None + + fn = lambda: ops.gptq_marlin_gemm( + a=bt.a, + b_q_weight=w_q, + b_scales=w_s, + b_zeros=w_zp, + g_idx=g_idx, + perm=sort_indices, + workspace=workspace.scratch, + b_q_type=bt.wtype, + size_m=bt.a.shape[0], + size_n=bt.w_ref.shape[1], + size_k=bt.w_ref.shape[0], + is_k_full=True, + is_zp_float=False, + ) + else: + assert bt.a.dtype == torch.int8 + assert bt.wtype == scalar_types.uint4b8 + + if bt.w_ch_s is not None: + s_ch = bt.w_ch_s.to(torch.float32) + else: + s_ch = torch.ones(bt.w_ref.shape[1], dtype=torch.float32, device=device) + + if bt.w_tok_s is not None: + s_tok = 
bt.w_tok_s.to(torch.float32) + else: + s_tok = torch.ones(bt.a.shape[0], dtype=torch.float32, device=device) + + fn = lambda: ops.marlin_qqq_gemm( + a=bt.a, + b_q_weight=w_q, + s_group=w_s, + s_tok=s_tok, + s_ch=s_ch, + workspace=workspace.scratch, + size_m=bt.a.shape[0], + size_n=bt.w_ref.shape[1], + size_k=bt.w_ref.shape[0], + ) + + return fn + + +def machete_create_bench_fn( + bt: BenchmarkTensors, out_type=torch.dtype, schedule=None +) -> Callable: + w_q = bt.w_q.t().contiguous().t() # make col major + w_q = ops.machete_prepack_B( + w_q, bt.a.dtype, bt.wtype, None if bt.w_g_s is None else bt.w_g_s.dtype + ) + + w_g_zp = bt.w_g_zp + if w_g_zp is not None: + w_g_zp = -1 * bt.w_g_s * (w_g_zp.to(bt.w_g_s.dtype)) + + return lambda: ops.machete_mm( + a=bt.a, + b_q=w_q, + b_type=bt.wtype, + b_group_scales=bt.w_g_s, + b_group_zeros=w_g_zp, + b_group_size=bt.group_size, + b_channel_scales=bt.w_ch_s, + a_token_scales=bt.w_tok_s, + out_type=out_type, + schedule=schedule, + ) + + +# impl + +# bench + + +def bench_fns(label: str, sub_label: str, description: str, fns: list[Callable]): + min_run_time = 1 if not NVTX_PROFILE else 0.1 + res = TBenchmark.Timer( + stmt=""" + for fn in fns: + fn() + """, + globals={"fns": fns}, + label=label, + sub_label=sub_label, + description=description, + ).blocked_autorange(min_run_time=min_run_time) + + if NVTX_PROFILE: + with ( + nvtx.annotate("mm-bench"), + nvtx.annotate(f"{label}|{sub_label}|{description}"), + ): + fns[0]() + + return res + + +_SWEEP_SCHEDULES_RESULTS: Optional[pd.DataFrame] = None +_SWEEP_SCHEDULES_RESULTS_CSV: Optional[str] = None + + +def bench( + types: TypeConfig, + group_size: int, + m: int, + k: int, + n: int, + label: str, + sub_label: str, + sweep_schedules: bool = True, +) -> list[TMeasurement]: + benchmark_tensors = create_bench_tensors((m, n, k), types, group_size) + sub_label += f", L={len(benchmark_tensors)}" + + name_type_string = f"W{types.weight_type}" + f"-A{terse_type_name(types.act_type)}" + if types.group_scale_type is not None: + name_type_string += f"-GS{terse_type_name(types.group_scale_type)}" + if types.group_zero_type is not None: + name_type_string += f"-GZ{terse_type_name(types.group_zero_type)}" + if group_size is not None: + name_type_string += f"-G{group_size}" + if types.channel_scale_type is not None: + name_type_string += f"-CS{terse_type_name(types.channel_scale_type)}" + if types.token_scale_type is not None: + name_type_string += f"-TS{terse_type_name(types.token_scale_type)}" + + timers = [] + # pytorch impl + timers.append( + bench_fns( + label, + sub_label, + "torch.matmul (fp16)", + [torch_matmul_f16_create_bench_fn(bt) for bt in benchmark_tensors], + ) + ) + + if types.act_type == torch.int8 or types.act_type == torch.float8_e4m3fn: + timers.append( + bench_fns( + label, + sub_label, + f"cutlass_scaled_mm ({terse_type_name(types.act_type)})", + [cutlass_scaled_mm_create_bench_fn(bt) for bt in benchmark_tensors], + ) + ) + + if types.act_type != torch.float8_e4m3fn: + timers.append( + bench_fns( + label, + sub_label, + f"marlin ({name_type_string})", + [marlin_create_bench_fn(bt) for bt in benchmark_tensors], + ) + ) + + # machete + timers.append( + bench_fns( + label, + sub_label, + f"machete ({name_type_string})", + [ + machete_create_bench_fn(bt, out_type=types.output_type) + for bt in benchmark_tensors + ], + ) + ) + + if sweep_schedules: + global _SWEEP_SCHEDULES_RESULTS + + print("Finding best schedule for machete") + best = None + best_schedule = None + schedules = 
ops.machete_supported_schedules( + a_type=types.act_type, + b_type=types.weight_type, + group_scales_type=types.group_scale_type, + group_zeros_type=types.group_zero_type, + token_scales_type=types.token_scale_type, + channel_scales_type=types.channel_scale_type, + out_type=types.output_type, + ) + + if schedules is None or len(schedules) == 0: + raise ValueError("No schedules found to sweep") + + for schedule in reversed(schedules): + schedule_M = int(schedule.split("_")[0].split("x")[1]) + + # Prune known bad schedules + if schedule_M >= 2 * max(m, 16) or schedule_M < m // 4: + continue + + res = bench_fns( + label, + sub_label, + "machete_best", + [ + machete_create_bench_fn( + bt, out_type=types.output_type, schedule=schedule + ) + for bt in benchmark_tensors + ], + ) + + results_row = { + "M": m, + "K": k, + "N": n, + "group_size": group_size, + "schedule": schedule, + "median": res.median, + } + if _SWEEP_SCHEDULES_RESULTS is None: + _SWEEP_SCHEDULES_RESULTS = pd.DataFrame(columns=results_row.keys()) + _SWEEP_SCHEDULES_RESULTS.loc[len(_SWEEP_SCHEDULES_RESULTS)] = results_row + + print(f" {res.median:5.5} ", schedule) + if not best or res.median < best.median: + best = res + best_schedule = schedule + print("Best schedule:", best_schedule) + timers.append(best) + + return timers + + +# runner +def print_timers(timers: list[TMeasurement]): + compare = TBenchmark.Compare(timers) + compare.print() + + +def run(args, MKNs: Iterable[tuple[int, int, int]]) -> Iterable[TMeasurement]: + types = TypeConfig( + act_type=args.act_type, + weight_type=scalar_types.uint4b8 + if args.group_zero_type is None + else scalar_types.uint4, + output_type=args.out_type, + group_scale_type=args.group_scale_type, + group_zero_type=args.group_zero_type, + channel_scale_type=args.channel_scale_type, + token_scale_type=args.token_scale_type, + ) + + results: list[TMeasurement] = [] + for m, k, n in MKNs: + timers = bench( + types, + args.group_size, + m, + k, + n, + f"{args.act_type}-gemm", + f"MKN=({m}x{k}x{n})", + sweep_schedules=args.sweep_schedules, + ) + print_timers(timers) + results.extend(timers) + + return results + + +# output makers +def make_output( + data: list[TMeasurement], + MKNs: Iterable[tuple[int, int, int]], + base_description: str, + timestamp=None, +): + print(f"== All Results {base_description} ====") + print_timers(data) + + # pickle all the results + timestamp = int(time.time()) if timestamp is None else timestamp + with open(f"{base_description}-{timestamp}.pkl", "wb") as f: + pkl.dump(data, f) + + +# argparse runners + + +def run_square_bench(args): + dim_sizes = list(range(args.dim_start, args.dim_end + 1, args.dim_increment)) + MKNs = list(zip(dim_sizes, dim_sizes, dim_sizes)) + data = run(args.dtype, args.sweep_schedules, MKNs) + + make_output(data, MKNs, f"square_bench-{args.dtype}") + + +def run_range_bench(args): + m_start, k_start, n_start = (int(x) for x in args.dim_start.split(",")) + m_end, k_end, n_end = (int(x) for x in args.dim_end.split(",")) + m_increment, k_increment, n_increment = ( + int(x) for x in args.dim_increment.split(",") + ) + Ms = list(range(m_start, m_end + 1, m_increment)) + Ks = list(range(k_start, k_end + 1, k_increment)) + Ns = list(range(n_start, n_end + 1, n_increment)) + MKNs = list(product(Ms, Ks, Ns)) + + data = run(args.dtype, args.sweep_schedules, MKNs) + + make_output(data, MKNs, f"range_bench-{args.dtype}") + + +def run_model_bench(args): + print("Benchmarking models:") + for i, model in enumerate(args.models): + print(f"[{i}] {model}") + + def 
model_shapes(model_name: str, tp_size: int) -> list[tuple[int, int]]: + KNs = [] + for KN, tp_split_dim in copy.deepcopy(WEIGHT_SHAPES[model_name]): + KN[tp_split_dim] = KN[tp_split_dim] // tp_size + KNs.append(KN) + return KNs + + model_bench_data = [] + models_tps = list(itertools.product(args.models, args.tp_sizes)) + for model, tp_size in models_tps: + Ms = args.batch_sizes + KNs = model_shapes(model, tp_size) + MKNs = [] + for m in Ms: + for k, n in KNs: + MKNs.append((m, k, n)) + + data = run(args, MKNs) + model_bench_data.append(data) + + type_string = f"{args.act_type}" + + # Print all results + for data, model_tp in zip(model_bench_data, models_tps): + model, tp_size = model_tp + print(f"== Results {type_string} {model}-TP{tp_size} ====") + print_timers(data) + + timestr = time.strftime("%Y%m%d-%H%M%S") + + all_results = [] + for d in model_bench_data: + all_results.extend(d) + + # pickle all data + with open(f"model_bench-{type_string}-{timestr}.pkl", "wb") as f: + args_dict = vars(args) + args_dict.pop("func") + pkl.dump( + { + "args": args_dict, + "results": all_results, + }, + f, + ) + + +if __name__ == "__main__": + + def to_torch_dtype(dt): + return { + "bfloat16": torch.bfloat16, + "float16": torch.float16, + "int8": torch.int8, + "float8_e4m3fn": torch.float8_e4m3fn, + "int": torch.int, + "float": torch.float, + }[dt] + + class ToTorchDtype(argparse.Action): + def __call__(self, parser, namespace, values, option_string=None): + setattr(namespace, self.dest, to_torch_dtype(values)) + + parser = FlexibleArgumentParser( + description=""" +Benchmark Machete GEMM. + + To run square GEMMs: + python3 ./benchmarks/kernels/benchmark_machete.py --dtype float16 square_bench --dim-start 128 --dim-end 512 --dim-increment 64 + + To run constant N and K and sweep M: + python3 ./benchmarks/kernels/benchmark_machete.py --dtype float16 range_bench --dim-start 128 --dim-end 512 --dim-increment 64 --n-constant 16384 --k-constant 16384 + + To run dimensions from a model: + python3 ./benchmarks/kernels/benchmark_machete.py --dtype float16 model_bench --models meta-llama/Llama-2-7b-hf --batch-sizes 16 --tp-sizes 1 + + Output: + - a .pkl file, that is a list of raw torch.benchmark.utils.Measurements for the pytorch and cutlass implementations for the various GEMMs. 
+ """, # noqa: E501 + formatter_class=argparse.RawTextHelpFormatter, + ) + parser.add_argument( + "--act-type", + action=ToTorchDtype, + required=True, + choices=["bfloat16", "float16", "int8", "float8_e4m3fn"], + ) + parser.add_argument( + "--group-scale-type", + action=ToTorchDtype, + choices=["bfloat16", "float16"], + ) + parser.add_argument( + "--group-zero-type", + type=to_torch_dtype, + choices=["bfloat16", "float16"], + ) + parser.add_argument( + "--channel-scale-type", + action=ToTorchDtype, + choices=["float"], + ) + parser.add_argument( + "--token-scale-type", + action=ToTorchDtype, + choices=["float"], + ) + parser.add_argument( + "--out-type", + action=ToTorchDtype, + choices=["bfloat16", "float16"], + ) + parser.add_argument( + "--group-size", + type=int, + help="Available options are ['None', '-1', '128'], default=128", + default=128, + ) + parser.add_argument( + "--sweep-schedules", + action="store_true", + help="Run a sweep over all supported schedules", + ) + parser.add_argument( + "--sweep-csv-out", + help="CSV to store sweep results", + default="sch_sweep_results.csv", + ) + subparsers = parser.add_subparsers(dest="cmd", required=True) + + square_parser = subparsers.add_parser("square_bench") + square_parser.add_argument("--dim-start", type=int, required=True) + square_parser.add_argument("--dim-end", type=int, required=True) + square_parser.add_argument("--dim-increment", type=int, required=True) + square_parser.set_defaults(func=run_square_bench) + + range_parser = subparsers.add_parser("range_bench") + range_parser.add_argument( + "--dim-start", + type=str, + required=True, + help="Start value for M,K,N as common separated list", + ) + range_parser.add_argument( + "--dim-end", + type=str, + required=True, + help="End value (inclusive) for M,K,N as common separated list", + ) + range_parser.add_argument( + "--dim-increment", + type=str, + required=True, + help="Increment value for M,K,N as common separated list", + ) + range_parser.set_defaults(func=run_range_bench) + + model_parser = subparsers.add_parser("model_bench") + model_parser.add_argument( + "--models", + nargs="+", + type=str, + default=DEFAULT_MODELS, + choices=WEIGHT_SHAPES.keys(), + ) + model_parser.add_argument( + "--tp-sizes", nargs="+", type=int, default=DEFAULT_TP_SIZES + ) + model_parser.add_argument( + "--batch-sizes", nargs="+", type=int, default=DEFAULT_BATCH_SIZES + ) + model_parser.set_defaults(func=run_model_bench) + + args = parser.parse_args() + + _SWEEP_SCHEDULES_RESULTS_CSV = args.sweep_csv_out + args.func(args) + + if _SWEEP_SCHEDULES_RESULTS is not None: + _SWEEP_SCHEDULES_RESULTS.to_csv(_SWEEP_SCHEDULES_RESULTS_CSV) diff --git a/benchmarks/kernels/benchmark_marlin.py b/benchmarks/kernels/benchmark_marlin.py new file mode 100644 index 000000000..9ea1fddae --- /dev/null +++ b/benchmarks/kernels/benchmark_marlin.py @@ -0,0 +1,342 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import torch +import torch.utils.benchmark as benchmark +from benchmark_shapes import WEIGHT_SHAPES + +from vllm import _custom_ops as ops +from vllm.model_executor.layers.quantization.gptq_marlin_24 import ( + GPTQ_MARLIN_24_MAX_PARALLEL, + GPTQ_MARLIN_24_MIN_THREAD_N, + GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES, + GPTQ_MARLIN_24_SUPPORTED_QUANT_TYPES, +) +from vllm.model_executor.layers.quantization.utils.allspark_utils import ( + ALLSPARK_AMPERE_M_CUBLAS_THRESHOLD, + ALLSPARK_SUPPORTED_QUANT_TYPES, +) +from 
vllm.model_executor.layers.quantization.utils.marlin_utils import ( + GPTQ_MARLIN_MAX_PARALLEL, + GPTQ_MARLIN_MIN_THREAD_N, + MARLIN_SUPPORTED_GROUP_SIZES, + query_marlin_supported_quant_types, +) +from vllm.model_executor.layers.quantization.utils.marlin_utils_test import ( + MarlinWorkspace, + marlin_quantize, +) +from vllm.model_executor.layers.quantization.utils.marlin_utils_test_24 import ( + marlin_24_quantize, +) +from vllm.model_executor.layers.quantization.utils.quant_utils import ( + gptq_pack, + gptq_quantize_weights, + quantize_weights, + sort_weights, +) +from vllm.scalar_type import ScalarType +from vllm.utils import FlexibleArgumentParser + +DEFAULT_MODELS = ["meta-llama/Llama-2-7b-hf/TP1"] +DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192] + +ACT_ORDER_OPTS = [False, True] +K_FULL_OPTS = [False, True] + + +def bench_run( + results: list[benchmark.Measurement], + model: str, + act_order: bool, + is_k_full: bool, + quant_type: ScalarType, + group_size: int, + size_m: int, + size_k: int, + size_n: int, +): + label = "Quant Matmul" + + sub_label = "{}, act={} k_full={}, q={}, g={}, MKN=({}x{}x{})".format( + model, act_order, is_k_full, str(quant_type), group_size, size_m, size_k, size_n + ) + + print(f"Testing: {sub_label}") + + a = torch.randn(size_m, size_k).to(torch.half).cuda() + b = torch.rand(size_k, size_n).to(torch.half).cuda() + + a_tmp = torch.zeros(size_m, size_k).to(torch.half).cuda() + + # Marlin quant + ( + marlin_w_ref, + marlin_q_w, + marlin_s, + marlin_g_idx, + marlin_sort_indices, + marlin_rand_perm, + ) = marlin_quantize(b, quant_type, group_size, act_order) + + # Marlin_24 quant + (marlin_24_w_ref, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s) = ( + marlin_24_quantize(b, quant_type, group_size) + ) + + marlin_zp = torch.empty(0, dtype=torch.int, device=b.device) + + # GPTQ quant + (w_ref, q_w, s, g_idx, rand_perm) = gptq_quantize_weights( + b, quant_type, group_size, act_order + ) + q_w_gptq = gptq_pack(q_w, quant_type.size_bits, size_k, size_n) + + # For act_order, sort the "weights" and "g_idx" + # so that group ids are increasing + repack_sort_indices = torch.empty(0, dtype=torch.int, device=b.device) + if act_order: + (q_w, g_idx, repack_sort_indices) = sort_weights(q_w, g_idx) + + # Prepare + marlin_workspace = MarlinWorkspace( + size_n, GPTQ_MARLIN_MIN_THREAD_N, GPTQ_MARLIN_MAX_PARALLEL + ) + + marlin_24_workspace = MarlinWorkspace( + size_n, GPTQ_MARLIN_24_MIN_THREAD_N, GPTQ_MARLIN_24_MAX_PARALLEL + ) + marlin_zp = torch.zeros_like(marlin_s, dtype=torch.int) + + # AllSpark W8A16 quant + as_supported_case = ( + quant_type in ALLSPARK_SUPPORTED_QUANT_TYPES + and group_size == -1 + and not act_order + and is_k_full + ) + if as_supported_case: + properties = torch.cuda.get_device_properties(b.device.index) + sm_count = properties.multi_processor_count + sm_version = properties.major * 10 + properties.minor + + supported_arch = sm_version >= 80 and sm_version < 90 + as_supported_case = as_supported_case and supported_arch + if supported_arch: + has_zp = False + w_ref, qw, s, zp = quantize_weights(b, quant_type, group_size, has_zp) + qw = qw.to(torch.uint8) + + qw_reorder, s_reorder, zp_reorder = ops.allspark_repack_weight( + qw, s, zp, has_zp + ) + CUBLAS_M_THRESHOLD = ALLSPARK_AMPERE_M_CUBLAS_THRESHOLD + + globals = { + # Gen params + "quant_type": quant_type, + "group_size": group_size, + "size_m": size_m, + "size_n": size_n, + "size_k": size_k, + "a": a, + "a_tmp": a_tmp, + # Marlin params + "marlin_w_ref": marlin_w_ref, + 
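+        # NOTE: benchmark.Timer evaluates its `stmt` strings inside this `globals`
+        # namespace, so every tensor, flag and kernel referenced by the timed
+        # statements below must be registered in this dict.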
"marlin_q_w": marlin_q_w, + "marlin_s": marlin_s, + "marlin_zp": marlin_zp, + "marlin_g_idx": marlin_g_idx, + "marlin_sort_indices": marlin_sort_indices, + "marlin_rand_perm": marlin_rand_perm, + "marlin_workspace": marlin_workspace, + "is_k_full": is_k_full, + # Marlin_24 params + "marlin_24_w_ref": marlin_24_w_ref, + "marlin_24_q_w_comp": marlin_24_q_w_comp, + "marlin_24_meta": marlin_24_meta, + "marlin_24_s": marlin_24_s, + "marlin_24_workspace": marlin_24_workspace, + # GPTQ params + "q_w_gptq": q_w_gptq, + "repack_sort_indices": repack_sort_indices, + # AllSpark W8A16 params + "qw_reorder": qw_reorder if as_supported_case else None, + "s_reorder": s_reorder if as_supported_case else None, + "zp_reorder": zp_reorder if as_supported_case else None, + "sm_count": sm_count if as_supported_case else None, + "sm_version": sm_version if as_supported_case else None, + "CUBLAS_M_THRESHOLD": CUBLAS_M_THRESHOLD if as_supported_case else None, + # Kernels + "gptq_marlin_gemm": ops.gptq_marlin_gemm, + "gptq_marlin_24_gemm": ops.gptq_marlin_24_gemm, + "gptq_marlin_repack": ops.gptq_marlin_repack, + "allspark_w8a16_gemm": ops.allspark_w8a16_gemm, + } + + min_run_time = 1 + + # Warmup pytorch + for i in range(5): + torch.matmul(a, marlin_w_ref) + + results.append( + benchmark.Timer( + stmt="torch.matmul(a, marlin_w_ref)", + globals=globals, + label=label, + sub_label=sub_label, + description="pytorch_gemm", + ).blocked_autorange(min_run_time=min_run_time) + ) + + results.append( + benchmark.Timer( + stmt="output = gptq_marlin_gemm(a, marlin_q_w, marlin_s, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, False, False)", # noqa: E501 + globals=globals, + label=label, + sub_label=sub_label, + description="gptq_marlin_gemm_fp16", + ).blocked_autorange(min_run_time=min_run_time) + ) + + results.append( + benchmark.Timer( + stmt="output = gptq_marlin_gemm(a, marlin_q_w, marlin_s, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, True, False)", # noqa: E501 + globals=globals, + label=label, + sub_label=sub_label, + description="gptq_marlin_gemm_fp32", + ).blocked_autorange(min_run_time=min_run_time) + ) + + if ( + quant_type in GPTQ_MARLIN_24_SUPPORTED_QUANT_TYPES + and group_size in GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES + ): + results.append( + benchmark.Timer( + stmt="output = gptq_marlin_24_gemm(a, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s, marlin_24_workspace.scratch, quant_type, size_m, size_n, size_k)", # noqa: E501 + globals=globals, + label=label, + sub_label=sub_label, + description="gptq_marlin_24_gemm", + ).blocked_autorange(min_run_time=min_run_time) + ) + + results.append( + benchmark.Timer( + stmt="q_res = gptq_marlin_repack(q_w_gptq, repack_sort_indices, size_k, size_n, quant_type.size_bits)", # noqa: E501 + globals=globals, + label=label, + sub_label=sub_label, + description="gptq_marlin_repack", + ).blocked_autorange(min_run_time=min_run_time) + ) + + if as_supported_case: + results.append( + benchmark.Timer( + stmt="output = allspark_w8a16_gemm(a, qw_reorder, s_reorder, zp_reorder, size_n, group_size, sm_count, sm_version, CUBLAS_M_THRESHOLD, False, True)", # noqa: E501 + globals=globals, + label=label, + sub_label=sub_label, + description="allspark_w8a16_gemm_fp32", + ).blocked_autorange(min_run_time=min_run_time) + ) + + +def main(args): + print("Benchmarking models:") + for i, model in enumerate(args.models): + print(f"[{i}] 
{model}") + + results: list[benchmark.Measurement] = [] + + for model in args.models: + for layer in WEIGHT_SHAPES[model]: + size_k = layer[0] + size_n = layer[1] + + if len(args.limit_k) > 0 and size_k not in args.limit_k: + continue + + if len(args.limit_n) > 0 and size_n not in args.limit_n: + continue + + for act_order in ACT_ORDER_OPTS: + if ( + len(args.limit_act_order) > 0 + and act_order not in args.limit_act_order + ): + continue + + for is_k_full in K_FULL_OPTS: + if ( + len(args.limit_k_full) > 0 + and is_k_full not in args.limit_k_full + ): + continue + + for quant_type in query_marlin_supported_quant_types(False): + if ( + len(args.limit_num_bits) > 0 + and quant_type.size_bits not in args.limit_num_bits + ): + continue + + for group_size in MARLIN_SUPPORTED_GROUP_SIZES: + if ( + len(args.limit_group_size) > 0 + and group_size not in args.limit_group_size + ): + continue + + # For act_order, the group_size must be less than + # size_k + if act_order and (group_size == size_k or group_size == -1): + continue + + for size_m in args.batch_sizes: + bench_run( + results, + model, + act_order, + is_k_full, + quant_type, + group_size, + size_m, + size_k, + size_n, + ) + + compare = benchmark.Compare(results) + compare.print() + + +# For quick benchmarking use: +# python benchmark_marlin.py --batch-sizes 1 16 32 --limit-k 4096 --limit-n 4096 --limit-group-size 128 --limit-num-bits 4 --limit-act-order 0 --limit-k-full 1 # noqa E501 +# +if __name__ == "__main__": + parser = FlexibleArgumentParser( + description="Benchmark Marlin across specified models/shapes/batches" + ) + parser.add_argument( + "--models", + nargs="+", + type=str, + default=DEFAULT_MODELS, + choices=WEIGHT_SHAPES.keys(), + ) + parser.add_argument( + "--batch-sizes", nargs="+", type=int, default=DEFAULT_BATCH_SIZES + ) + parser.add_argument("--limit-k", nargs="+", type=int, default=[]) + parser.add_argument("--limit-n", nargs="+", type=int, default=[]) + parser.add_argument("--limit-group-size", nargs="+", type=int, default=[]) + parser.add_argument("--limit-num-bits", nargs="+", type=int, default=[]) + parser.add_argument("--limit-act-order", nargs="+", type=int, default=[]) + parser.add_argument("--limit-k-full", nargs="+", type=int, default=[]) + + args = parser.parse_args() + main(args) diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py new file mode 100644 index 000000000..cef53b183 --- /dev/null +++ b/benchmarks/kernels/benchmark_moe.py @@ -0,0 +1,737 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import argparse +import json +import time +from contextlib import nullcontext +from datetime import datetime +from itertools import product +from typing import Any, TypedDict + +import ray +import torch +from ray.experimental.tqdm_ray import tqdm + +from vllm.model_executor.layers.fused_moe.fused_moe import * +from vllm.platforms import current_platform +from vllm.transformers_utils.config import get_config +from vllm.triton_utils import triton +from vllm.utils import FlexibleArgumentParser + +FP8_DTYPE = current_platform.fp8_dtype() + + +class BenchmarkConfig(TypedDict): + BLOCK_SIZE_M: int + BLOCK_SIZE_N: int + BLOCK_SIZE_K: int + GROUP_SIZE_M: int + num_warps: int + num_stages: int + + +def benchmark_config( + config: BenchmarkConfig, + num_tokens: int, + num_experts: int, + shard_intermediate_size: int, + hidden_size: int, + topk: int, + dtype: torch.dtype, + use_fp8_w8a8: bool, + use_int8_w8a16: bool, + 
num_iters: int = 100, + block_quant_shape: list[int] = None, + use_deep_gemm: bool = False, +) -> float: + init_dtype = torch.float16 if use_fp8_w8a8 else dtype + x = torch.randn(num_tokens, hidden_size, dtype=dtype) + if use_int8_w8a16: + w1 = torch.randint( + -127, + 127, + ( + num_experts, + shard_intermediate_size, + hidden_size, + ), + dtype=torch.int8, + ) + w2 = torch.randint( + -127, + 127, + ( + num_experts, + hidden_size, + shard_intermediate_size // 2, + ), + dtype=torch.int8, + ) + else: + w1 = torch.randn( + num_experts, shard_intermediate_size, hidden_size, dtype=init_dtype + ) + w2 = torch.randn( + num_experts, hidden_size, shard_intermediate_size // 2, dtype=init_dtype + ) + gating_output = torch.randn(num_iters, num_tokens, num_experts, dtype=torch.float32) + + w1_scale = None + w2_scale = None + a1_scale = None + a2_scale = None + if use_int8_w8a16: + w1_scale = torch.randn( + (num_experts, 2 * shard_intermediate_size), dtype=torch.float32 + ) + w2_scale = torch.randn((hidden_size, num_experts), dtype=torch.float32) + if use_fp8_w8a8: + if block_quant_shape: + block_n, block_k = block_quant_shape[0], block_quant_shape[1] + E = num_experts + N = shard_intermediate_size // 2 + K = hidden_size + factor_for_scale = 1e-2 + n_tiles_w1 = (2 * N + block_n - 1) // block_n + n_tiles_w2 = (K + block_n - 1) // block_n + k_tiles_w1 = (K + block_k - 1) // block_k + k_tiles_w2 = (N + block_k - 1) // block_k + w1_scale = ( + torch.rand((E, n_tiles_w1, k_tiles_w1), dtype=torch.float32) + * factor_for_scale + ) + w2_scale = ( + torch.rand((E, n_tiles_w2, k_tiles_w2), dtype=torch.float32) + * factor_for_scale + ) + else: + w1_scale = torch.randn(num_experts, dtype=torch.float32) + w2_scale = torch.randn(num_experts, dtype=torch.float32) + + a1_scale = torch.randn(1, dtype=torch.float32) + a2_scale = torch.randn(1, dtype=torch.float32) + + w1 = w1.to(FP8_DTYPE) + w2 = w2.to(FP8_DTYPE) + + input_gating = torch.empty(num_tokens, num_experts, dtype=torch.float32) + + def prepare(i: int): + input_gating.copy_(gating_output[i]) + + def run(): + from vllm.model_executor.layers.fused_moe import override_config + + with override_config(config): + if use_deep_gemm: + topk_weights, topk_ids, token_expert_indices = fused_topk( + x, input_gating, topk, False + ) + return fused_experts( + x, + w1, + w2, + topk_weights, + topk_ids, + inplace=True, + use_fp8_w8a8=use_fp8_w8a8, + w1_scale=w1_scale, + w2_scale=w2_scale, + a1_scale=a1_scale, + a2_scale=a2_scale, + block_shape=block_quant_shape, + allow_deep_gemm=True, + ) + else: + fused_moe( + x, + w1, + w2, + input_gating, + topk, + renormalize=True, + inplace=True, + use_fp8_w8a8=use_fp8_w8a8, + use_int8_w8a16=use_int8_w8a16, + w1_scale=w1_scale, + w2_scale=w2_scale, + a1_scale=a1_scale, + a2_scale=a2_scale, + block_shape=block_quant_shape, + ) + + # JIT compilation & warmup + run() + torch.cuda.synchronize() + + # Capture 10 invocations with CUDA graph + graph = torch.cuda.CUDAGraph() + with torch.cuda.graph(graph): + for _ in range(10): + run() + torch.cuda.synchronize() + + # Warmup + for _ in range(5): + graph.replay() + torch.cuda.synchronize() + + start_event = torch.cuda.Event(enable_timing=True) + end_event = torch.cuda.Event(enable_timing=True) + + latencies: list[float] = [] + for i in range(num_iters): + prepare(i) + torch.cuda.synchronize() + + start_event.record() + graph.replay() + end_event.record() + end_event.synchronize() + latencies.append(start_event.elapsed_time(end_event)) + avg = sum(latencies) / (num_iters * 10) * 1000 # us + 
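+    # Note on the arithmetic above: cuda Event.elapsed_time() reports milliseconds and
+    # each graph replay executes the 10 captured invocations, so dividing by
+    # (num_iters * 10) gives milliseconds per call and the final * 1000 converts to
+    # microseconds. Illustrative numbers: 100 iterations averaging 2.5 ms per replay
+    # work out to roughly 250 us per fused-MoE invocation.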
graph.reset() + return avg + + +def get_rocm_tuning_space(use_fp16): + block_mn_range = [16, 32, 64, 128, 256] + block_k_range = [16, 32, 64, 128, 256] + if not use_fp16: + block_k_range.remove(16) # BLOCK_K=16 not supported for fp8 + num_warps_range = [1, 2, 4, 8] + group_m_range = [1, 4, 8, 16, 32] + num_stage_range = [2] + waves_per_eu_range = [0] + matrix_instr_nonkdim_range = [16, 32] if use_fp16 else [] + kpack_range = [1, 2] if use_fp16 else [] + + param_ranges = { + "BLOCK_SIZE_M": block_mn_range, + "BLOCK_SIZE_N": block_mn_range, + "BLOCK_SIZE_K": block_k_range, + "GROUP_SIZE_M": group_m_range, + "num_warps": num_warps_range, + "num_stages": num_stage_range, + "waves_per_eu": waves_per_eu_range, + } + if use_fp16: + param_ranges["matrix_instr_nonkdim"] = matrix_instr_nonkdim_range + param_ranges["kpack"] = kpack_range + + return param_ranges + + +def get_configs_compute_bound(use_fp16, block_quant_shape) -> list[dict[str, int]]: + configs: list[BenchmarkConfig] = [] + + if current_platform.is_rocm(): + param_ranges = get_rocm_tuning_space(use_fp16) + else: + # Reduced search space for faster tuning. + # TODO(woosuk): Increase the search space and use a performance model to + # prune the search space. + block_m_range = [16, 32, 64, 128, 256] + block_n_range = [32, 64, 128, 256] + block_k_range = [64, 128, 256] + num_warps_range = [4, 8] + group_m_range = [1, 16, 32, 64] + num_stage_range = [2, 3, 4, 5] + + param_ranges = { + "BLOCK_SIZE_M": block_m_range, + "BLOCK_SIZE_N": block_n_range, + "BLOCK_SIZE_K": block_k_range, + "GROUP_SIZE_M": group_m_range, + "num_warps": num_warps_range, + "num_stages": num_stage_range, + } + + keys, values = zip(*param_ranges.items()) + for config_values in product(*values): + config = dict(zip(keys, config_values)) + configs.append(config) + + # Remove configs that are not compatible with fp8 block quantization + # BLOCK_SIZE_K must be a multiple of block_k + # BLOCK_SIZE_N must be a multiple of block_n + if block_quant_shape is not None and not use_fp16: + block_n, block_k = block_quant_shape[0], block_quant_shape[1] + for config in configs[:]: + if ( + config["BLOCK_SIZE_K"] % block_k != 0 + or config["BLOCK_SIZE_N"] % block_n != 0 + ): + configs.remove(config) + return configs + + +def prune_rocm_search_space( + num_tokens, shard_intermediate_size, hidden_size, search_space, is_fp16, topk +): + N1, K1 = shard_intermediate_size, hidden_size + N2, K2 = hidden_size, shard_intermediate_size // 2 + pruned_space_1 = prune_rocm_configs( + num_tokens * topk, N1, K1, search_space, is_fp16 + ) + pruned_space_2 = prune_rocm_configs( + num_tokens * topk, N2, K2, search_space, is_fp16 + ) + search_space = merge_unique_dicts(pruned_space_1, pruned_space_2) + return search_space + + +# The following code is inspired by ROCm/Triton GEMM tuning script: +# https://github.com/ROCm/triton/blob/triton-mlir/scripts/amd/gemm/tune_gemm.py#L89 +def prune_rocm_configs(M, N, K, configs, is_fp16=True): + pruned_configs = [] + elemBytes_a = 2 if is_fp16 else 1 + elemBytes_b = 2 if is_fp16 else 1 + + mfma = 16 if M < 32 or N < 32 else 32 + + # TODO (zhanglx): figure out the boundary between large and small gemms + large_gemm = False + if M >= 2048 and N >= 2048: + large_gemm = True + + for config in configs: + BLOCK_SIZE_M = config.get("BLOCK_SIZE_M") + BLOCK_SIZE_N = config.get("BLOCK_SIZE_N") + BLOCK_SIZE_K = config.get("BLOCK_SIZE_K") + num_warps = config.get("num_warps") + + if is_fp16: + matrix_instr_nonkdim = config.get("matrix_instr_nonkdim") + if matrix_instr_nonkdim > 
mfma: + continue + if mfma == 4 and BLOCK_SIZE_K < 64: + continue + # some layouts could not work properly in case + # number elements per thread is less 1 + if BLOCK_SIZE_M * BLOCK_SIZE_N < 64: + continue + SPLIT_K = config.get("SPLIT_K", 1) + GROUP_M = config.get("GROUP_SIZE_M") + if is_fp16: + if ( + matrix_instr_nonkdim > BLOCK_SIZE_M + or matrix_instr_nonkdim > BLOCK_SIZE_N + ): + continue + if matrix_instr_nonkdim >= M and matrix_instr_nonkdim != BLOCK_SIZE_M: + continue + if matrix_instr_nonkdim >= N and matrix_instr_nonkdim != BLOCK_SIZE_N: + continue + # Skip BLOCK_SIZE that is too large compare to M/N + # unless BLOCK_SIZE is already small enough + if M * 2 < BLOCK_SIZE_M and BLOCK_SIZE_M != 16: + continue + if N * 2 < BLOCK_SIZE_N and BLOCK_SIZE_N != 16: + continue + # skip large split_k when not necessary + if SPLIT_K != 1 and not need_split_k(M, N, K): + continue + # skip split_k that leads to EVEN_K = false + leap = SPLIT_K * BLOCK_SIZE_K + modv = K % leap + if modv != 0: + continue + # skip large GROUP_M + if GROUP_M * BLOCK_SIZE_M > M and GROUP_M != 1: + continue + # out of shared memory resource + # TODO (zhanglx): This does not consider the LDS usage in the epilogue + LDS = ( + BLOCK_SIZE_K * BLOCK_SIZE_M * elemBytes_a + + BLOCK_SIZE_K * BLOCK_SIZE_N * elemBytes_b + ) + if LDS > 65536: + continue + # Skip small block sizes and num_warps for large gemm + # For fp16 and f8, we want to only use BLOCK_SIZE >= 64 + if large_gemm: + if BLOCK_SIZE_M < 64 or BLOCK_SIZE_N < 64: + continue + if BLOCK_SIZE_K < 64: + continue + if num_warps < 4: + continue + + pruned_configs.append(config) + + return pruned_configs + + +def need_split_k(SIZE_M, SIZE_N, SIZE_K): + return (SIZE_M < 64 or SIZE_N < 64) and SIZE_K > 1024 + + +def merge_unique_dicts(list1, list2): + result = [] + combined_list = list1.copy() + combined_list.extend(list2) + for dictionary in combined_list: + if dictionary not in result: + result.append(dictionary) + return result + + +@ray.remote(num_gpus=1) +class BenchmarkWorker: + def __init__(self, seed: int) -> None: + torch.set_default_device("cuda") + current_platform.seed_everything(seed) + self.seed = seed + # Get the device ID to allocate tensors and kernels + # on the respective GPU. This is required for Ray to work + # correctly with multi-GPU tuning on the ROCm platform. + self.device_id = int(ray.get_gpu_ids()[0]) + + def benchmark( + self, + num_tokens: int, + num_experts: int, + shard_intermediate_size: int, + hidden_size: int, + topk: int, + dtype: torch.dtype, + use_fp8_w8a8: bool, + use_int8_w8a16: bool, + block_quant_shape: list[int] = None, + use_deep_gemm: bool = False, + ) -> tuple[dict[str, int], float]: + current_platform.seed_everything(self.seed) + dtype_str = get_config_dtype_str( + dtype, use_int8_w8a16=use_int8_w8a16, use_fp8_w8a8=use_fp8_w8a8 + ) + # NOTE(woosuk): The current naming convention uses w2.shape[2], which + # is the intermediate size after silu_and_mul. 
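+        # The lookup below prefers previously tuned configs (keyed by token count) and
+        # falls back to a heuristic default; when several keys exist, the one closest
+        # to num_tokens is used. An illustrative entry shaped like BenchmarkConfig:
+        #   {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128,
+        #    "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}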
+ op_config = get_moe_configs( + num_experts, shard_intermediate_size // 2, dtype_str + ) + if op_config is None: + config = get_default_config( + num_tokens, + num_experts, + shard_intermediate_size, + hidden_size, + topk, + dtype_str, + is_marlin=False, + ) + else: + config = op_config[min(op_config.keys(), key=lambda x: abs(x - num_tokens))] + kernel_time = benchmark_config( + config, + num_tokens, + num_experts, + shard_intermediate_size, + hidden_size, + topk, + dtype, + use_fp8_w8a8, + use_int8_w8a16, + num_iters=100, + block_quant_shape=block_quant_shape, + use_deep_gemm=use_deep_gemm, + ) + return config, kernel_time + + def tune( + self, + num_tokens: int, + num_experts: int, + shard_intermediate_size: int, + hidden_size: int, + topk: int, + dtype: torch.dtype, + use_fp8_w8a8: bool, + use_int8_w8a16: bool, + search_space: list[dict[str, int]], + block_quant_shape: list[int], + use_deep_gemm: bool, + ) -> dict[str, int]: + best_config = None + best_time = float("inf") + if current_platform.is_rocm(): + is_fp16 = not (use_fp8_w8a8 or use_int8_w8a16) + search_space = prune_rocm_search_space( + num_tokens, + shard_intermediate_size, + hidden_size, + search_space, + is_fp16, + topk, + ) + + need_device_guard = False + if current_platform.is_rocm(): + visible_device = os.environ.get("ROCR_VISIBLE_DEVICES", None) + if visible_device != f"{self.device_id}": + need_device_guard = True + + with torch.cuda.device(self.device_id) if need_device_guard else nullcontext(): + for config in tqdm(search_space): + try: + kernel_time = benchmark_config( + config, + num_tokens, + num_experts, + shard_intermediate_size, + hidden_size, + topk, + dtype, + use_fp8_w8a8, + use_int8_w8a16, + num_iters=20, + block_quant_shape=block_quant_shape, + use_deep_gemm=use_deep_gemm, + ) + except triton.runtime.autotuner.OutOfResources: + # Some configurations may be invalid and fail to compile. + continue + + if kernel_time < best_time: + best_time = kernel_time + best_config = config + now = datetime.now() + print(f"{now.ctime()}] Completed tuning for batch_size={num_tokens}") + assert best_config is not None + return best_config + + +def sort_config(config: BenchmarkConfig) -> BenchmarkConfig: + return { + "BLOCK_SIZE_M": config["BLOCK_SIZE_M"], + "BLOCK_SIZE_N": config["BLOCK_SIZE_N"], + "BLOCK_SIZE_K": config["BLOCK_SIZE_K"], + "GROUP_SIZE_M": config["GROUP_SIZE_M"], + "num_warps": config["num_warps"], + "num_stages": config["num_stages"], + **( + {"waves_per_eu": config["waves_per_eu"]} if "waves_per_eu" in config else {} + ), + **( + {"matrix_instr_nonkdim": config["matrix_instr_nonkdim"]} + if "matrix_instr_nonkdim" in config + else {} + ), + **({"kpack": config["kpack"]} if "kpack" in config else {}), + } + + +def save_configs( + configs: dict[int, BenchmarkConfig], + num_experts: int, + shard_intermediate_size: int, + hidden_size: int, + topk: int, + dtype: torch.dtype, + use_fp8_w8a8: bool, + use_int8_w8a16: bool, + block_quant_shape: list[int], +) -> None: + dtype_str = get_config_dtype_str( + dtype, use_int8_w8a16=use_int8_w8a16, use_fp8_w8a8=use_fp8_w8a8 + ) + + # NOTE(woosuk): The current naming convention uses w2.shape[2], which + # is the intermediate size after silu_and_mul. 
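+    # The generated filename encodes the tuning key, typically something like
+    # "E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json"; the exact
+    # format comes from get_config_file_name and is shown here only as an illustration.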
+ filename = get_config_file_name( + num_experts, shard_intermediate_size // 2, dtype_str, block_quant_shape + ) + + print(f"Writing best config to {filename}...") + with open(filename, "w") as f: + json.dump(configs, f, indent=4) + f.write("\n") + + +def get_weight_block_size_safety(config, default_value=None): + quantization_config = getattr(config, "quantization_config", {}) + if isinstance(quantization_config, dict): + return quantization_config.get("weight_block_size", default_value) + return default_value + + +def main(args: argparse.Namespace): + print(args) + + config = get_config(model=args.model, trust_remote_code=args.trust_remote_code) + if args.model_prefix: + config = getattr(config, args.model_prefix) + + if config.architectures[0] == "DbrxForCausalLM": + E = config.ffn_config.moe_num_experts + topk = config.ffn_config.moe_top_k + intermediate_size = config.ffn_config.ffn_hidden_size + shard_intermediate_size = 2 * intermediate_size // args.tp_size + elif config.architectures[0] == "JambaForCausalLM": + E = config.num_experts + topk = config.num_experts_per_tok + intermediate_size = config.intermediate_size + shard_intermediate_size = 2 * intermediate_size // args.tp_size + elif config.architectures[0] in ("DeepseekV3ForCausalLM", "DeepseekV2ForCausalLM"): + E = config.n_routed_experts + topk = config.num_experts_per_tok + intermediate_size = config.moe_intermediate_size + shard_intermediate_size = 2 * intermediate_size // args.tp_size + elif config.architectures[0] in ("Qwen2MoeForCausalLM", "Qwen3MoeForCausalLM"): + E = config.num_experts + topk = config.num_experts_per_tok + intermediate_size = config.moe_intermediate_size + shard_intermediate_size = 2 * intermediate_size // args.tp_size + else: + # Support for llama4 + config = config.get_text_config() + # Default: Mixtral. + E = config.num_local_experts + topk = config.num_experts_per_tok + intermediate_size = config.intermediate_size + shard_intermediate_size = 2 * intermediate_size // args.tp_size + + hidden_size = config.hidden_size + dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype + use_fp8_w8a8 = args.dtype == "fp8_w8a8" + use_int8_w8a16 = args.dtype == "int8_w8a16" + block_quant_shape = get_weight_block_size_safety(config) + + if args.batch_size is None: + batch_sizes = [ + 1, + 2, + 4, + 8, + 16, + 24, + 32, + 48, + 64, + 96, + 128, + 256, + 512, + 1024, + 1536, + 2048, + 3072, + 4096, + ] + else: + batch_sizes = [args.batch_size] + + use_deep_gemm = bool(args.use_deep_gemm) + + if current_platform.is_rocm() and "HIP_VISIBLE_DEVICES" in os.environ: + # Ray will set ROCR_VISIBLE_DEVICES for device visibility + logger.warning( + "Ray uses ROCR_VISIBLE_DEVICES to control device accessibility." + "Replacing HIP_VISIBLE_DEVICES with ROCR_VISIBLE_DEVICES." 
+ ) + val = os.environ["HIP_VISIBLE_DEVICES"] + os.environ["ROCR_VISIBLE_DEVICES"] = val + del os.environ["HIP_VISIBLE_DEVICES"] + + ray.init() + num_gpus = int(ray.available_resources()["GPU"]) + workers = [BenchmarkWorker.remote(args.seed) for _ in range(num_gpus)] + + def _distribute(method: str, inputs: list[Any]) -> list[Any]: + outputs = [] + worker_idx = 0 + for input_args in inputs: + worker = workers[worker_idx] + worker_method = getattr(worker, method) + output = worker_method.remote(*input_args) + outputs.append(output) + worker_idx = (worker_idx + 1) % num_gpus + return ray.get(outputs) + + if args.tune: + is_fp16 = not (use_fp8_w8a8 or use_int8_w8a16) + search_space = get_configs_compute_bound(is_fp16, block_quant_shape) + print(f"Start tuning over {len(search_space)} configurations...") + + start = time.time() + configs = _distribute( + "tune", + [ + ( + batch_size, + E, + shard_intermediate_size, + hidden_size, + topk, + dtype, + use_fp8_w8a8, + use_int8_w8a16, + search_space, + block_quant_shape, + use_deep_gemm, + ) + for batch_size in batch_sizes + ], + ) + best_configs = { + M: sort_config(config) for M, config in zip(batch_sizes, configs) + } + save_configs( + best_configs, + E, + shard_intermediate_size, + hidden_size, + topk, + dtype, + use_fp8_w8a8, + use_int8_w8a16, + block_quant_shape, + ) + end = time.time() + print(f"Tuning took {end - start:.2f} seconds") + else: + outputs = _distribute( + "benchmark", + [ + ( + batch_size, + E, + shard_intermediate_size, + hidden_size, + topk, + dtype, + use_fp8_w8a8, + use_int8_w8a16, + block_quant_shape, + use_deep_gemm, + ) + for batch_size in batch_sizes + ], + ) + + for batch_size, (config, kernel_time) in zip(batch_sizes, outputs): + print(f"Batch size: {batch_size}, config: {config}") + print(f"Kernel time: {kernel_time:.2f} us") + + +if __name__ == "__main__": + parser = FlexibleArgumentParser() + parser.add_argument( + "--model", type=str, default="mistralai/Mixtral-8x7B-Instruct-v0.1" + ) + parser.add_argument( + "--tp-size", "-tp", "--tensor-parallel-size", type=int, default=2 + ) + parser.add_argument( + "--dtype", type=str, choices=["auto", "fp8_w8a8", "int8_w8a16"], default="auto" + ) + parser.add_argument("--use-deep-gemm", action="store_true") + parser.add_argument("--seed", type=int, default=0) + parser.add_argument("--batch-size", type=int, required=False) + parser.add_argument("--tune", action="store_true") + parser.add_argument("--trust-remote-code", action="store_true") + parser.add_argument("--model-prefix", type=str, required=False) + args = parser.parse_args() + + main(args) diff --git a/benchmarks/kernels/benchmark_moe_permute_unpermute.py b/benchmarks/kernels/benchmark_moe_permute_unpermute.py new file mode 100644 index 000000000..dba1f3943 --- /dev/null +++ b/benchmarks/kernels/benchmark_moe_permute_unpermute.py @@ -0,0 +1,417 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import argparse +from typing import Any, TypedDict + +import ray +import torch +from transformers import AutoConfig + +from vllm.model_executor.layers.fused_moe.deep_gemm_moe import ( + _moe_permute, + _moe_unpermute_and_reduce, +) +from vllm.model_executor.layers.fused_moe.fused_moe import * +from vllm.model_executor.layers.fused_moe.moe_permute_unpermute import * +from vllm.model_executor.layers.fused_moe.utils import _fp8_quantize +from vllm.platforms import current_platform +from vllm.utils import FlexibleArgumentParser + +FP8_DTYPE = 
current_platform.fp8_dtype() + + +class BenchmarkConfig(TypedDict): + BLOCK_SIZE_M: int + BLOCK_SIZE_N: int + BLOCK_SIZE_K: int + GROUP_SIZE_M: int + num_warps: int + num_stages: int + + +def benchmark_permute( + num_tokens: int, + num_experts: int, + hidden_size: int, + topk: int, + dtype: torch.dtype, + use_fp8_w8a8: bool, + use_int8_w8a16: bool, + num_iters: int = 100, + use_customized_permute: bool = False, +) -> float: + # init_dtype = torch.float16 if use_fp8_w8a8 else dtype + hidden_states = torch.randn(num_tokens, hidden_size, dtype=dtype) + # output_hidden_states = torch.empty_like(hidden_states) + if use_fp8_w8a8: + align_block_size = 128 # deepgemm needs 128 m aligned block + qhidden_states, scale = _fp8_quantize(hidden_states, None, None) + else: + align_block_size = None + qhidden_states = hidden_states + + gating_output = torch.randn(num_iters, num_tokens, num_experts, dtype=torch.float32) + + input_gating = torch.randn(num_tokens, num_experts, dtype=torch.float32) + topk_weights, topk_ids, token_expert_indices = fused_topk( + qhidden_states, input_gating, topk, False + ) + + def prepare(i: int): + input_gating.copy_(gating_output[i]) + + def run(): + if use_customized_permute: + (permuted_hidden_states, first_token_off, inv_perm_idx, m_indices) = ( + moe_permute( + qhidden_states, + topk_weights=topk_weights, + topk_ids=topk_ids, + token_expert_indices=token_expert_indices, + topk=topk, + n_expert=num_experts, + n_local_expert=num_experts, + expert_map=None, + align_block_size=align_block_size, + ) + ) + else: + ( + permuted_hidden_states, + a1q_scale, + sorted_token_ids, + expert_ids, + inv_perm, + ) = _moe_permute( + qhidden_states, None, topk_ids, num_experts, None, align_block_size + ) + + # JIT compilation & warmup + run() + torch.cuda.synchronize() + + # Capture 10 invocations with CUDA graph + graph = torch.cuda.CUDAGraph() + with torch.cuda.graph(graph): + for _ in range(10): + run() + torch.cuda.synchronize() + + # Warmup + for _ in range(5): + graph.replay() + torch.cuda.synchronize() + + start_event = torch.cuda.Event(enable_timing=True) + end_event = torch.cuda.Event(enable_timing=True) + + latencies: list[float] = [] + for i in range(num_iters): + prepare(i) + torch.cuda.synchronize() + + start_event.record() + graph.replay() + end_event.record() + end_event.synchronize() + latencies.append(start_event.elapsed_time(end_event)) + avg = sum(latencies) / (num_iters * 10) * 1000 # us + graph.reset() + return avg + + +def benchmark_unpermute( + num_tokens: int, + num_experts: int, + hidden_size: int, + topk: int, + dtype: torch.dtype, + use_fp8_w8a8: bool, + use_int8_w8a16: bool, + num_iters: int = 100, + use_customized_permute: bool = False, +) -> float: + # init_dtype = torch.float16 if use_fp8_w8a8 else dtype + hidden_states = torch.randn(num_tokens, hidden_size, dtype=dtype) + output_hidden_states = torch.empty_like(hidden_states) + if use_fp8_w8a8: + align_block_size = 128 # deepgemm needs 128 m aligned block + qhidden_states, scale = _fp8_quantize(hidden_states, None, None) + else: + align_block_size = None + qhidden_states = hidden_states + + input_gating = torch.randn(num_tokens, num_experts, dtype=torch.float32) + + topk_weights, topk_ids, token_expert_indices = fused_topk( + qhidden_states, input_gating, topk, False + ) + + def prepare(): + if use_customized_permute: + (permuted_hidden_states, first_token_off, inv_perm_idx, m_indices) = ( + moe_permute( + qhidden_states, + topk_weights=topk_weights, + topk_ids=topk_ids, + 
token_expert_indices=token_expert_indices, + topk=topk, + n_expert=num_experts, + n_local_expert=num_experts, + expert_map=None, + align_block_size=align_block_size, + ) + ) + # convert to fp16/bf16 as gemm output + return ( + permuted_hidden_states.to(dtype), + first_token_off, + inv_perm_idx, + m_indices, + ) + else: + ( + permuted_qhidden_states, + a1q_scale, + sorted_token_ids, + expert_ids, + inv_perm, + ) = _moe_permute( + qhidden_states, None, topk_ids, num_experts, None, align_block_size + ) + # convert to fp16/bf16 as gemm output + return ( + permuted_qhidden_states.to(dtype), + a1q_scale, + sorted_token_ids, + expert_ids, + inv_perm, + ) + + def run(input: tuple): + if use_customized_permute: + (permuted_hidden_states, first_token_off, inv_perm_idx, m_indices) = input + moe_unpermute( + permuted_hidden_states, + topk_weights, + topk_ids, + inv_perm_idx, + first_token_off, + topk, + num_experts, + num_experts, + ) + else: + ( + permuted_hidden_states, + a1q_scale, + sorted_token_ids, + expert_ids, + inv_perm, + ) = input + _moe_unpermute_and_reduce( + output_hidden_states, permuted_hidden_states, inv_perm, topk_weights + ) + + # JIT compilation & warmup + input = prepare() + run(input) + torch.cuda.synchronize() + + # Capture 10 invocations with CUDA graph + graph = torch.cuda.CUDAGraph() + with torch.cuda.graph(graph): + for _ in range(10): + run(input) + torch.cuda.synchronize() + + # Warmup + for _ in range(5): + graph.replay() + torch.cuda.synchronize() + + start_event = torch.cuda.Event(enable_timing=True) + end_event = torch.cuda.Event(enable_timing=True) + + latencies: list[float] = [] + for i in range(num_iters): + torch.cuda.synchronize() + start_event.record() + graph.replay() + end_event.record() + end_event.synchronize() + latencies.append(start_event.elapsed_time(end_event)) + avg = sum(latencies) / (num_iters * 10) * 1000 # us + graph.reset() + return avg + + +@ray.remote(num_gpus=1) +class BenchmarkWorker: + def __init__(self, seed: int) -> None: + torch.set_default_device("cuda") + current_platform.seed_everything(seed) + self.seed = seed + # Get the device ID to allocate tensors and kernels + # on the respective GPU. This is required for Ray to work + # correctly with multi-GPU tuning on the ROCm platform. 
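+        # ray.get_gpu_ids() returns the GPU ids Ray assigned to this actor; with
+        # num_gpus=1 it contains a single entry, kept here so tensors can be pinned
+        # to that device explicitly when visibility env vars differ (ROCm).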
+ self.device_id = int(ray.get_gpu_ids()[0]) + + def benchmark( + self, + num_tokens: int, + num_experts: int, + hidden_size: int, + topk: int, + dtype: torch.dtype, + use_fp8_w8a8: bool, + use_int8_w8a16: bool, + use_customized_permute: bool = False, + ) -> tuple[dict[str, int], float]: + current_platform.seed_everything(self.seed) + + permute_time = benchmark_permute( + num_tokens, + num_experts, + hidden_size, + topk, + dtype, + use_fp8_w8a8, + use_int8_w8a16, + num_iters=100, + use_customized_permute=use_customized_permute, + ) + unpermute_time = benchmark_unpermute( + num_tokens, + num_experts, + hidden_size, + topk, + dtype, + use_fp8_w8a8, + use_int8_w8a16, + num_iters=100, + use_customized_permute=use_customized_permute, + ) + return permute_time, unpermute_time + + +def get_weight_block_size_safety(config, default_value=None): + quantization_config = getattr(config, "quantization_config", {}) + if isinstance(quantization_config, dict): + return quantization_config.get("weight_block_size", default_value) + return default_value + + +def main(args: argparse.Namespace): + print(args) + + config = AutoConfig.from_pretrained( + args.model, trust_remote_code=args.trust_remote_code + ) + if config.architectures[0] == "DbrxForCausalLM": + E = config.ffn_config.moe_num_experts + topk = config.ffn_config.moe_top_k + elif config.architectures[0] == "JambaForCausalLM": + E = config.num_experts + topk = config.num_experts_per_tok + elif ( + config.architectures[0] == "DeepseekV3ForCausalLM" + or config.architectures[0] == "DeepseekV2ForCausalLM" + ): + E = config.n_routed_experts + topk = config.num_experts_per_tok + elif config.architectures[0] in ["Qwen2MoeForCausalLM", "Qwen3MoeForCausalLM"]: + E = config.num_experts + topk = config.num_experts_per_tok + + else: + # Support for llama4 + config = config.get_text_config() + # Default: Mixtral. 
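+        # Illustrative values only: for the default mistralai/Mixtral-8x7B-Instruct-v0.1
+        # this branch resolves to E=8 local experts, topk=2 and hidden_size=4096, all
+        # read from the Hugging Face config at runtime.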
+ E = config.num_local_experts + topk = config.num_experts_per_tok + + hidden_size = config.hidden_size + dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype + use_fp8_w8a8 = args.dtype == "fp8_w8a8" + use_int8_w8a16 = args.dtype == "int8_w8a16" + use_customized_permute = args.use_customized_permute + + if args.batch_size is None: + batch_sizes = [ + 1, + 2, + 4, + 8, + 16, + 24, + 32, + 48, + 64, + 96, + 128, + 256, + 512, + 1024, + 1536, + 2048, + 3072, + 4096, + ] + else: + batch_sizes = [args.batch_size] + + ray.init() + num_gpus = int(ray.available_resources()["GPU"]) + workers = [BenchmarkWorker.remote(args.seed) for _ in range(num_gpus)] + + def _distribute(method: str, inputs: list[Any]) -> list[Any]: + outputs = [] + worker_idx = 0 + for input_args in inputs: + worker = workers[worker_idx] + worker_method = getattr(worker, method) + output = worker_method.remote(*input_args) + outputs.append(output) + worker_idx = (worker_idx + 1) % num_gpus + return ray.get(outputs) + + outputs = _distribute( + "benchmark", + [ + ( + batch_size, + E, + hidden_size, + topk, + dtype, + use_fp8_w8a8, + use_int8_w8a16, + use_customized_permute, + ) + for batch_size in batch_sizes + ], + ) + + for batch_size, (permute, unpermute) in zip(batch_sizes, outputs): + print(f"Batch size: {batch_size}") + print(f"Permute time: {permute:.2f} us") + print(f"Unpermute time: {unpermute:.2f} us") + + +if __name__ == "__main__": + parser = FlexibleArgumentParser() + parser.add_argument( + "--model", type=str, default="mistralai/Mixtral-8x7B-Instruct-v0.1" + ) + parser.add_argument( + "--dtype", type=str, choices=["auto", "fp8_w8a8", "int8_w8a16"], default="auto" + ) + parser.add_argument("--use-customized-permute", action="store_true") + parser.add_argument("--seed", type=int, default=0) + parser.add_argument("--batch-size", type=int, required=False) + parser.add_argument("--trust-remote-code", action="store_true") + args = parser.parse_args() + + main(args) diff --git a/benchmarks/kernels/benchmark_paged_attention.py b/benchmarks/kernels/benchmark_paged_attention.py new file mode 100644 index 000000000..7e0376c18 --- /dev/null +++ b/benchmarks/kernels/benchmark_paged_attention.py @@ -0,0 +1,251 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import random +import time +from typing import Optional + +import torch + +from vllm import _custom_ops as ops +from vllm.logger import init_logger +from vllm.platforms import current_platform +from vllm.utils import ( + STR_DTYPE_TO_TORCH_DTYPE, + FlexibleArgumentParser, + create_kv_caches_with_random, +) + +logger = init_logger(__name__) + +NUM_BLOCKS = 128 * 1024 +PARTITION_SIZE = 512 +PARTITION_SIZE_ROCM = 256 + + +@torch.inference_mode() +def main( + version: str, + num_seqs: int, + seq_len: int, + num_query_heads: int, + num_kv_heads: int, + head_size: int, + use_alibi: bool, + block_size: int, + dtype: torch.dtype, + seed: int, + do_profile: bool, + device: str = "cuda", + kv_cache_dtype: Optional[str] = None, +) -> None: + current_platform.seed_everything(seed) + + scale = float(1.0 / (head_size**0.5)) + query = torch.empty( + num_seqs, num_query_heads, head_size, dtype=dtype, device=device + ) + query.uniform_(-scale, scale) + + assert num_query_heads % num_kv_heads == 0 + alibi_slopes = None + if use_alibi: + alibi_slopes = torch.randn(num_query_heads, dtype=torch.float, device=device) + + seq_lens = [seq_len for _ in range(num_seqs)] + max_seq_len = max(seq_lens) + seq_lens = 
torch.tensor(seq_lens, dtype=torch.int, device=device) + + # Create the block tables. + max_num_blocks_per_seq = (max_seq_len + block_size - 1) // block_size + block_tables_lst: list[list[int]] = [] + for _ in range(num_seqs): + block_table = [ + random.randint(0, NUM_BLOCKS - 1) for _ in range(max_num_blocks_per_seq) + ] + block_tables_lst.append(block_table) + + block_tables = torch.tensor(block_tables_lst, dtype=torch.int, device=device) + + # Create the KV cache. + key_caches, value_caches = create_kv_caches_with_random( + NUM_BLOCKS, + block_size, + 1, + num_kv_heads, + head_size, + kv_cache_dtype, + dtype, + device=device, + ) + key_cache, value_cache = key_caches[0], value_caches[0] + + # Prepare for the paged attention kernel. + output = torch.empty_like(query) + if version == "v2": + if current_platform.is_rocm(): + global PARTITION_SIZE + if not args.custom_paged_attn and not current_platform.is_navi(): + PARTITION_SIZE = 1024 + else: + PARTITION_SIZE = PARTITION_SIZE_ROCM + num_partitions = (max_seq_len + PARTITION_SIZE - 1) // PARTITION_SIZE + tmp_output = torch.empty( + size=(num_seqs, num_query_heads, num_partitions, head_size), + dtype=output.dtype, + device=output.device, + ) + exp_sums = torch.empty( + size=(num_seqs, num_query_heads, num_partitions), + dtype=torch.float32, + device=output.device, + ) + max_logits = torch.empty_like(exp_sums) + + def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float: + torch.cuda.synchronize() + if profile: + torch.cuda.cudart().cudaProfilerStart() + start_time = time.perf_counter() + + # Using default kv_scale + k_scale = v_scale = torch.tensor(1.0, dtype=torch.float32, device=device) + + for _ in range(num_iters): + if version == "v1": + ops.paged_attention_v1( + output, + query, + key_cache, + value_cache, + num_kv_heads, + scale, + block_tables, + seq_lens, + block_size, + max_seq_len, + alibi_slopes, + kv_cache_dtype, + k_scale, + v_scale, + ) + elif version == "v2": + if not args.custom_paged_attn: + ops.paged_attention_v2( + output, + exp_sums, + max_logits, + tmp_output, + query, + key_cache, + value_cache, + num_kv_heads, + scale, + block_tables, + seq_lens, + block_size, + max_seq_len, + alibi_slopes, + kv_cache_dtype, + k_scale, + v_scale, + ) + else: + ops.paged_attention_rocm( + output, + exp_sums, + max_logits, + tmp_output, + query, + key_cache, + value_cache, + num_kv_heads, + scale, + block_tables, + seq_lens, + None, + block_size, + max_seq_len, + alibi_slopes, + kv_cache_dtype, + k_scale, + v_scale, + ) + else: + raise ValueError(f"Invalid version: {version}") + torch.cuda.synchronize() + + end_time = time.perf_counter() + if profile: + torch.cuda.cudart().cudaProfilerStop() + return (end_time - start_time) / num_iters + + # Warmup. + print("Warming up...") + run_benchmark = run_cuda_benchmark + run_benchmark(num_iters=3, profile=False) + + # Benchmark. + if do_profile: + latency = run_benchmark(num_iters=1, profile=True) + else: + latency = run_benchmark(num_iters=100, profile=False) + print(f"Kernel running time: {latency * 1000000:.3f} us") + + +if __name__ == "__main__": + logger.warning( + "This script benchmarks the paged attention kernel. " + "By default this is no longer used in vLLM inference." 
+ ) + + parser = FlexibleArgumentParser(description="Benchmark the paged attention kernel.") + parser.add_argument("--version", type=str, choices=["v1", "v2"], default="v2") + parser.add_argument("--batch-size", type=int, default=8) + parser.add_argument("--seq-len", type=int, default=4096) + parser.add_argument("--num-query-heads", type=int, default=64) + parser.add_argument("--num-kv-heads", type=int, default=8) + parser.add_argument( + "--head-size", + type=int, + choices=[64, 80, 96, 112, 120, 128, 192, 256], + default=128, + ) + parser.add_argument("--block-size", type=int, choices=[16, 32], default=16) + parser.add_argument("--use-alibi", action="store_true") + parser.add_argument( + "--dtype", type=str, choices=["half", "bfloat16", "float"], default="half" + ) + parser.add_argument("--seed", type=int, default=0) + parser.add_argument("--profile", action="store_true") + parser.add_argument( + "--kv-cache-dtype", + type=str, + choices=["auto", "fp8", "fp8_e5m2", "fp8_e4m3"], + default="auto", + help="Data type for kv cache storage. If 'auto', will use model " + "data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. " + "ROCm (AMD GPU) supports fp8 (=fp8_e4m3)", + ) + parser.add_argument( + "--custom-paged-attn", action="store_true", help="Use custom paged attention" + ) + args = parser.parse_args() + print(args) + + if args.num_query_heads % args.num_kv_heads != 0: + raise ValueError("num_query_heads must be divisible by num_kv_heads") + main( + version=args.version, + num_seqs=args.batch_size, + seq_len=args.seq_len, + num_query_heads=args.num_query_heads, + num_kv_heads=args.num_kv_heads, + head_size=args.head_size, + block_size=args.block_size, + use_alibi=args.use_alibi, + dtype=STR_DTYPE_TO_TORCH_DTYPE[args.dtype], + seed=args.seed, + do_profile=args.profile, + kv_cache_dtype=args.kv_cache_dtype, + ) diff --git a/benchmarks/kernels/benchmark_quant.py b/benchmarks/kernels/benchmark_quant.py new file mode 100644 index 000000000..6ab26f5f1 --- /dev/null +++ b/benchmarks/kernels/benchmark_quant.py @@ -0,0 +1,108 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import time + +import torch + +from vllm import _custom_ops as ops +from vllm.platforms import current_platform +from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser + + +@torch.inference_mode() +def main( + num_tokens: int, + hidden_size: int, + static_scale: bool, + quant_dtype: torch.dtype, + dtype: torch.dtype, + seed: int = 0, + do_profile: bool = False, + num_warmup_iters: int = 5, + num_iters: int = 100, +) -> None: + current_platform.seed_everything(seed) + torch.set_default_device("cuda") + + x = torch.randn(num_tokens, hidden_size, dtype=dtype) + scale = torch.randn(1, 1, dtype=torch.float32) if static_scale else None + + def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float: + torch.cuda.synchronize() + if profile: + torch.cuda.cudart().cudaProfilerStart() + start_time = time.perf_counter() + + for _ in range(num_iters): + if quant_dtype == torch.int8: + ops.scaled_int8_quant(x, scale) + else: + ops.scaled_fp8_quant(x, scale) + torch.cuda.synchronize() + + end_time = time.perf_counter() + if profile: + torch.cuda.cudart().cudaProfilerStop() + return (end_time - start_time) / num_iters + + # Warmup. + print("Warming up...") + run_benchmark = run_cuda_benchmark + run_benchmark(num_iters=num_warmup_iters, profile=False) + + # Benchmark. 
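+    # With --profile, a single iteration is wrapped in cudaProfilerStart/Stop so
+    # an external profiler (e.g. Nsight Systems) can capture it; otherwise the
+    # reported latency is the mean over num_iters iterations.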
+ if do_profile: + latency = run_benchmark(num_iters=1, profile=True) + else: + latency = run_benchmark(num_iters=num_iters, profile=False) + print(f"Kernel running time: {latency * 1000000:.3f} us") + + +if __name__ == "__main__": + + def to_torch_dtype(dt): + if dt == "int8": + return torch.int8 + if dt == "fp8": + return torch.float8_e4m3fn + raise ValueError(f"Unsupported dtype: {dt}") + + parser = FlexibleArgumentParser( + description="Benchmark the quantization (fp8 or int8) kernel." + ) + parser.add_argument("--num-tokens", type=int, default=4096) + parser.add_argument("--hidden-size", type=int, default=8192) + parser.add_argument("--static-scale", action="store_true") + parser.add_argument( + "--quant-dtype", type=str, choices=["fp8", "int8"], default="int8" + ) + parser.add_argument( + "--dtype", type=str, choices=["half", "bfloat16", "float"], default="half" + ) + + parser.add_argument("--seed", type=int, default=0) + parser.add_argument("--profile", action="store_true") + parser.add_argument("--num-warmup-iters", type=int, default=5) + parser.add_argument( + "--num-iters", + type=int, + default=100, + help="Number of benchmark iterations. " + "If --profile is set, this number is ignored", + ) + + args = parser.parse_args() + print(args) + + main( + num_tokens=args.num_tokens, + hidden_size=args.hidden_size, + static_scale=args.static_scale, + quant_dtype=to_torch_dtype(args.quant_dtype), + dtype=STR_DTYPE_TO_TORCH_DTYPE[args.dtype], + seed=args.seed, + do_profile=args.profile, + num_warmup_iters=args.num_warmup_iters, + num_iters=args.num_iters, + ) diff --git a/benchmarks/kernels/benchmark_rmsnorm.py b/benchmarks/kernels/benchmark_rmsnorm.py new file mode 100644 index 000000000..4cf633a81 --- /dev/null +++ b/benchmarks/kernels/benchmark_rmsnorm.py @@ -0,0 +1,256 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import itertools +from typing import Optional, Union + +import torch +from flashinfer.norm import fused_add_rmsnorm, rmsnorm +from torch import nn + +from vllm import _custom_ops as vllm_ops +from vllm.triton_utils import triton + + +class HuggingFaceRMSNorm(nn.Module): + def __init__(self, hidden_size: int, eps: float = 1e-6) -> None: + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + + def forward( + self, + x: torch.Tensor, + residual: Optional[torch.Tensor] = None, + ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]: + orig_dtype = x.dtype + x = x.to(torch.float32) + if residual is not None: + x = x + residual.to(torch.float32) + residual = x.to(orig_dtype) + + variance = x.pow(2).mean(dim=-1, keepdim=True) + x = x * torch.rsqrt(variance + self.variance_epsilon) + x = x.to(orig_dtype) * self.weight + if residual is None: + return x + else: + return x, residual + + +def rmsnorm_naive( + x: torch.Tensor, + weight: torch.Tensor, + residual: Optional[torch.Tensor] = None, + eps: float = 1e-6, +): + naive_norm = HuggingFaceRMSNorm(x.shape[-1], eps=eps) + naive_norm.weight = nn.Parameter(weight) + naive_norm = naive_norm.to(x.device) + + orig_shape = x.shape + x = x.view(-1, x.shape[-1]) + if residual is not None: + residual = residual.view(-1, residual.shape[-1]) + + output = naive_norm(x, residual) + + if isinstance(output, tuple): + output = (output[0].view(orig_shape), output[1].view(orig_shape)) + else: + output = output.view(orig_shape) + return output + + +def rmsnorm_flashinfer( + x: torch.Tensor, + weight: torch.Tensor, + residual: 
Optional[torch.Tensor] = None, + eps: float = 1e-6, +): + orig_shape = x.shape + x = x.view(-1, x.shape[-1]) + if residual is not None: + residual = residual.view(-1, residual.shape[-1]) + + if residual is not None: + fused_add_rmsnorm(x, residual, weight, eps) + output = (x, residual) + else: + output = rmsnorm(x, weight, eps) + + if isinstance(output, tuple): + output = (output[0].view(orig_shape), output[1].view(orig_shape)) + else: + output = output.view(orig_shape) + return output + + +def rmsnorm_vllm( + x: torch.Tensor, + weight: torch.Tensor, + residual: Optional[torch.Tensor] = None, + eps: float = 1e-6, +): + orig_shape = x.shape + x = x.view(-1, x.shape[-1]) + if residual is not None: + residual = residual.view(-1, residual.shape[-1]) + + if residual is not None: + vllm_ops.fused_add_rms_norm(x, residual, weight, eps) + output = (x, residual) + else: + out = torch.empty_like(x) + vllm_ops.rms_norm(out, x, weight, eps) + output = out + + if isinstance(output, tuple): + output = (output[0].view(orig_shape), output[1].view(orig_shape)) + else: + output = output.view(orig_shape) + return output + + +def calculate_diff(batch_size, seq_len, hidden_size, use_residual=True): + dtype = torch.bfloat16 + x = torch.randn(batch_size, seq_len, hidden_size, dtype=dtype, device="cuda") + weight = torch.ones(hidden_size, dtype=dtype, device="cuda") + residual = torch.randn_like(x) if use_residual else None + + output_naive = rmsnorm_naive( + x.clone(), weight, residual.clone() if residual is not None else None + ) + output_flashinfer = rmsnorm_flashinfer( + x.clone(), weight, residual.clone() if residual is not None else None + ) + output_vllm = rmsnorm_vllm( + x.clone(), weight, residual.clone() if residual is not None else None + ) + + if use_residual: + output_naive = output_naive[0] + output_flashinfer = output_flashinfer[0] + output_vllm = output_vllm[0] + + print(f"Naive output={output_naive}") + print(f"FlashInfer output={output_flashinfer}") + print(f"vLLM output={output_vllm}") + + if torch.allclose( + output_naive, output_flashinfer, atol=1e-2, rtol=1e-2 + ) and torch.allclose(output_naive, output_vllm, atol=1e-2, rtol=1e-2): + print("✅ All implementations match") + else: + print("❌ Implementations differ") + + +batch_size_range = [2**i for i in range(0, 7, 2)] +seq_length_range = [2**i for i in range(6, 11, 1)] +head_num_range = [32, 48] +configs = list(itertools.product(head_num_range, batch_size_range, seq_length_range)) + + +def get_benchmark(use_residual): + @triton.testing.perf_report( + triton.testing.Benchmark( + x_names=["head_num", "batch_size", "seq_len"], + x_vals=[list(_) for _ in configs], + line_arg="provider", + line_vals=["huggingface", "flashinfer", "vllm"], + line_names=["HuggingFace", "FlashInfer", "vLLM"], + styles=[("blue", "-"), ("green", "-"), ("red", "-")], + ylabel="us", + plot_name=f"rmsnorm-perf-{'with' if use_residual else 'without'}-residual", + args={}, + ) + ) + def benchmark(head_num, batch_size, seq_len, provider): + dtype = torch.bfloat16 + hidden_size = head_num * 128 # assuming head_dim = 128 + + x = torch.randn(batch_size, seq_len, hidden_size, dtype=dtype, device="cuda") + weight = torch.ones(hidden_size, dtype=dtype, device="cuda") + residual = torch.randn_like(x) if use_residual else None + + quantiles = [0.5, 0.2, 0.8] + + if provider == "huggingface": + ms, min_ms, max_ms = triton.testing.do_bench( + lambda: rmsnorm_naive( + x.clone(), + weight, + residual.clone() if residual is not None else None, + ), + quantiles=quantiles, + ) + elif 
provider == "flashinfer": + ms, min_ms, max_ms = triton.testing.do_bench( + lambda: rmsnorm_flashinfer( + x.clone(), + weight, + residual.clone() if residual is not None else None, + ), + quantiles=quantiles, + ) + else: + ms, min_ms, max_ms = triton.testing.do_bench( + lambda: rmsnorm_vllm( + x.clone(), + weight, + residual.clone() if residual is not None else None, + ), + quantiles=quantiles, + ) + + return 1000 * ms, 1000 * max_ms, 1000 * min_ms + + return benchmark + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser() + parser.add_argument( + "--batch-size", + type=int, + default=4, + help="Batch size", + ) + parser.add_argument( + "--seq-len", + type=int, + default=128, + help="Sequence length", + ) + parser.add_argument( + "--hidden-size", + type=int, + default=4096, + help="Hidden size (2nd dimension) of the sequence", + ) + parser.add_argument( + "--use-residual", action="store_true", help="Whether to use residual connection" + ) + parser.add_argument( + "--save-path", + type=str, + default="./configs/rmsnorm/", + help="Path to save rmsnorm benchmark results", + ) + + args = parser.parse_args() + + # Run correctness test + calculate_diff( + batch_size=args.batch_size, + seq_len=args.seq_len, + hidden_size=args.hidden_size, + use_residual=args.use_residual, + ) + + # Get the benchmark function with proper use_residual setting + benchmark = get_benchmark(args.use_residual) + # Run performance benchmark + benchmark.run(print_data=True, save_path=args.save_path) diff --git a/benchmarks/kernels/benchmark_rope.py b/benchmarks/kernels/benchmark_rope.py new file mode 100644 index 000000000..b81baf17a --- /dev/null +++ b/benchmarks/kernels/benchmark_rope.py @@ -0,0 +1,133 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from itertools import accumulate +from typing import Optional + +import nvtx +import torch + +from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding, get_rope +from vllm.platforms import current_platform +from vllm.utils import FlexibleArgumentParser + + +def benchmark_rope_kernels_multi_lora( + is_neox_style: bool, + batch_size: int, + seq_len: int, + num_heads: int, + head_size: int, + rotary_dim: Optional[int], + dtype: torch.dtype, + seed: int, + device: str, + max_position: int = 8192, + base: float = 10000, +) -> None: + current_platform.seed_everything(seed) + torch.set_default_device(device) + if rotary_dim is None: + rotary_dim = head_size + # silulating serving 4 LoRAs + scaling_factors = [1, 2, 4, 8] + # batched RoPE can take multiple scaling factors + batched_rope = get_rope( + head_size, + rotary_dim, + max_position, + base, + is_neox_style, + {"rope_type": "linear", "factor": tuple(scaling_factors)}, + ) + # non-batched RoPE takes only one scaling factor, we create multiple + # instances to simulate the same behavior + non_batched_ropes: list[RotaryEmbedding] = [] + for scaling_factor in scaling_factors: + non_batched_ropes.append( + get_rope( + head_size, + rotary_dim, + max_position, + base, + is_neox_style, + {"rope_type": "linear", "factor": (scaling_factor,)}, + ) + ) + + positions = torch.randint(0, max_position, (batch_size, seq_len)) + query = torch.randn(batch_size, seq_len, num_heads * head_size, dtype=dtype) + key = torch.randn_like(query) + + # create query offsets for batched RoPE, we concat multiple kv cache + # together and each query needs to find the right kv cache of its type + offset_map = torch.tensor( + list( + accumulate( + 
[0] + + [ + max_position * scaling_factor * 2 + for scaling_factor in scaling_factors[:-1] + ] + ) + ) + ) + query_types = torch.randint( + 0, len(scaling_factors), (batch_size, seq_len), device=device + ) + # map query types to offsets + query_offsets = offset_map[query_types] + # the kernel takes flattened offsets + flatten_offsets = query_offsets.flatten() + + # batched queries of the same type together for non-batched RoPE + queries = [query[query_types == i] for i in range(len(scaling_factors))] + keys = [key[query_types == i] for i in range(len(scaling_factors))] + packed_qkr = zip(queries, keys, non_batched_ropes) + # synchronize before start timing + torch.cuda.synchronize() + with nvtx.annotate("non-batched", color="yellow"): + for q, k, r in packed_qkr: + r.forward(positions, q, k) + torch.cuda.synchronize() + with nvtx.annotate("batched", color="green"): + batched_rope.forward(positions, query, key, flatten_offsets) + torch.cuda.synchronize() + + +if __name__ == "__main__": + parser = FlexibleArgumentParser( + description="Benchmark the rotary embedding kernels." + ) + parser.add_argument("--is-neox-style", type=bool, default=True) + parser.add_argument("--batch-size", type=int, default=16) + parser.add_argument("--seq-len", type=int, default=512) + parser.add_argument("--num-heads", type=int, default=8) + parser.add_argument( + "--head-size", + type=int, + choices=[64, 80, 96, 112, 120, 128, 192, 256], + default=128, + ) + parser.add_argument("--rotary-dim", type=int, choices=[16, 32], default=32) + parser.add_argument( + "--dtype", type=str, choices=["bfloat16", "float"], default="float" + ) + parser.add_argument("--seed", type=int, default=0) + parser.add_argument( + "--device", type=str, choices=["cuda:0", "cuda:1"], default="cuda:0" + ) + args = parser.parse_args() + print(args) + + benchmark_rope_kernels_multi_lora( + is_neox_style=args.is_neox_style, + batch_size=args.batch_size, + seq_len=args.seq_len, + num_heads=args.num_heads, + head_size=args.head_size, + rotary_dim=args.rotary_dim, + dtype=getattr(torch, args.dtype), + seed=args.seed, + device=args.device, + ) diff --git a/benchmarks/kernels/benchmark_shapes.py b/benchmarks/kernels/benchmark_shapes.py new file mode 100644 index 000000000..18c459c31 --- /dev/null +++ b/benchmarks/kernels/benchmark_shapes.py @@ -0,0 +1,94 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +WEIGHT_SHAPES = { + "ideal": [[4 * 256 * 32, 256 * 32]], + "mistralai/Mistral-7B-v0.1/TP1": [ + [4096, 6144], + [4096, 4096], + [4096, 28672], + [14336, 4096], + ], + "mistralai/Mistral-7B-v0.1/TP2": [ + [4096, 3072], + [2048, 4096], + [4096, 14336], + [7168, 4096], + ], + "mistralai/Mistral-7B-v0.1/TP4": [ + [4096, 1536], + [1024, 4096], + [4096, 7168], + [3584, 4096], + ], + "meta-llama/Llama-2-7b-hf/TP1": [ + [4096, 12288], + [4096, 4096], + [4096, 22016], + [11008, 4096], + ], + "meta-llama/Llama-2-7b-hf/TP2": [ + [4096, 6144], + [2048, 4096], + [4096, 11008], + [5504, 4096], + ], + "meta-llama/Llama-2-7b-hf/TP4": [ + [4096, 3072], + [1024, 4096], + [4096, 5504], + [2752, 4096], + ], + "meta-llama/Llama-2-13b-hf/TP1": [ + [5120, 15360], + [5120, 5120], + [5120, 27648], + [13824, 5120], + ], + "meta-llama/Llama-2-13b-hf/TP2": [ + [5120, 7680], + [2560, 5120], + [5120, 13824], + [6912, 5120], + ], + "meta-llama/Llama-2-13b-hf/TP4": [ + [5120, 3840], + [1280, 5120], + [5120, 6912], + [3456, 5120], + ], + "meta-llama/Llama-2-70b-hf/TP1": [ + [8192, 10240], + [8192, 8192], + [8192, 
57344], + [28672, 8192], + ], + "meta-llama/Llama-2-70b-hf/TP2": [ + [8192, 5120], + [4096, 8192], + [8192, 28672], + [14336, 8192], + ], + "meta-llama/Llama-2-70b-hf/TP4": [ + [8192, 2560], + [2048, 8192], + [8192, 14336], + [7168, 8192], + ], +} + +WEIGHT_SHAPES_MOE = { + "nm-testing/Mixtral-8x7B-Instruct-v0.1": [ + [8, 2, 4096, 28672], + [8, 2, 14336, 4096], + ], + "nm-testing/deepseekv2-lite": [ + [64, 6, 2048, 1408], + ], + "ibm-granite/granite-3.0-1b-a400m": [ + [32, 8, 1024, 1024], + ], + "ibm-granite/granite-3.0-3b-a800m": [ + [40, 8, 1024, 1536], + ], +} diff --git a/benchmarks/kernels/benchmark_w8a8_block_fp8.py b/benchmarks/kernels/benchmark_w8a8_block_fp8.py new file mode 100644 index 000000000..4fcdbadd6 --- /dev/null +++ b/benchmarks/kernels/benchmark_w8a8_block_fp8.py @@ -0,0 +1,414 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# Adapted from sglang quantization/tuning_block_wise_kernel.py + +import argparse +import json +import multiprocessing as mp +import os +import time +from datetime import datetime +from typing import Any + +import torch +import tqdm +import triton + +from vllm.model_executor.layers.quantization.utils.fp8_utils import ( + _w8a8_block_fp8_matmul, +) +from vllm.platforms import current_platform +from vllm.utils import FlexibleArgumentParser + +mp.set_start_method("spawn", force=True) + +assert current_platform.is_cuda(), ( + "Only support tune w8a8 block fp8 kernel on CUDA device." +) + +DTYPE_MAP = { + "float32": torch.float32, + "float16": torch.float16, + "half": torch.half, + "bfloat16": torch.bfloat16, +} + + +def w8a8_block_matmul( + A: torch.Tensor, + B: torch.Tensor, + As: torch.Tensor, + Bs: torch.Tensor, + block_size: list[int], + config: dict[str, Any], + output_dtype: torch.dtype = torch.float16, +) -> torch.Tensor: + """This function performs matrix multiplication with + block-wise quantization. + + It takes two input tensors `A` and `B` with scales `As` and `Bs`. + The output is returned in the specified `output_dtype`. + + Args: + A: The input tensor, e.g., activation. + B: The input tensor, e.g., weight. + As: The per-token-group quantization scale for `A`. + Bs: The per-block quantization scale for `B`. + block_size: The block size for per-block quantization. + It should be 2-dim, e.g., [128, 128]. + output_dytpe: The dtype of the returned tensor. + + Returns: + torch.Tensor: The result of matmul. 
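+
+    Example (illustrative sketch only; assumes a CUDA device with FP8 support
+    and uses a hand-picked config instead of a tuned one):
+
+        M, N, K, blk = 64, 512, 512, [128, 128]
+        A = torch.randn(M, K, device="cuda").to(torch.float8_e4m3fn)
+        B = torch.randn(N, K, device="cuda").to(torch.float8_e4m3fn)
+        As = torch.ones(M, K // blk[1], dtype=torch.float32, device="cuda")
+        Bs = torch.ones(N // blk[0], K // blk[1], dtype=torch.float32, device="cuda")
+        cfg = {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128,
+               "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}
+        C = w8a8_block_matmul(A, B, As, Bs, blk, cfg)  # (M, N) in float16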
+ """ + assert len(block_size) == 2 + block_n, block_k = block_size[0], block_size[1] + + assert A.shape[-1] == B.shape[-1] + assert A.shape[:-1] == As.shape[:-1] and A.is_contiguous() + assert triton.cdiv(A.shape[-1], block_k) == As.shape[-1] + M = A.numel() // A.shape[-1] + + assert B.ndim == 2 and B.is_contiguous() and Bs.ndim == 2 + N, K = B.shape + assert triton.cdiv(N, block_n) == Bs.shape[0] + assert triton.cdiv(K, block_k) == Bs.shape[1] + + C_shape = A.shape[:-1] + (N,) + C = A.new_empty(C_shape, dtype=output_dtype) + + def grid(META): + return ( + triton.cdiv(M, META["BLOCK_SIZE_M"]) * triton.cdiv(N, META["BLOCK_SIZE_N"]), + ) + + if A.dtype == torch.float8_e4m3fn: + kernel = _w8a8_block_fp8_matmul + else: + raise RuntimeError("Currently, only support tune w8a8 block fp8 kernel.") + + kernel[grid]( + A, + B, + C, + As, + Bs, + M, + N, + K, + block_n, + block_k, + A.stride(-2), + A.stride(-1), + B.stride(1), + B.stride(0), + C.stride(-2), + C.stride(-1), + As.stride(-2), + As.stride(-1), + Bs.stride(1), + Bs.stride(0), + **config, + ) + + return C + + +def get_configs_compute_bound(): + configs = [] + for num_stages in [2, 3, 4, 5]: + for block_m in [16, 32, 64, 128, 256]: + for block_k in [64, 128]: + for block_n in [32, 64, 128, 256]: + for num_warps in [4, 8]: + for group_size in [1, 16, 32, 64]: + configs.append( + { + "BLOCK_SIZE_M": block_m, + "BLOCK_SIZE_N": block_n, + "BLOCK_SIZE_K": block_k, + "GROUP_SIZE_M": group_size, + "num_warps": num_warps, + "num_stages": num_stages, + } + ) + return configs + + +def get_weight_shapes(tp_size): + # NOTE(HandH1998): The weight shapes only works for DeepSeek-V3. + # Modify them, if you tune for another different model. + # cannot TP + total = [ + (512 + 64, 7168), + ((128 + 64) * 128, 7168), + (128 * (128 + 128), 512), + (7168, 16384), + (7168, 18432), + ] + # N can TP + n_tp = [ + (18432 * 2, 7168), + ((128 + 64) * 128, 7168), + (128 * (128 + 128), 512), + (24576, 1536), + (12288, 7168), + (4096, 7168), + ] + # K can TP + k_tp = [(7168, 18432), (7168, 16384), (7168, 2048)] + + weight_shapes = [] + for t in total: + weight_shapes.append(t) + for n_t in n_tp: + new_t = (n_t[0] // tp_size, n_t[1]) + weight_shapes.append(new_t) + for k_t in k_tp: + new_t = (k_t[0], k_t[1] // tp_size) + weight_shapes.append(new_t) + return weight_shapes + + +def benchmark_config( + A, B, As, Bs, block_size, config, out_dtype=torch.float16, num_iters=10 +): + def run(): + w8a8_block_matmul(A, B, As, Bs, block_size, config, out_dtype) + + torch.cuda.synchronize() + # JIT complication & warmup + for _ in range(5): + run() + torch.cuda.synchronize() + + start_event = torch.cuda.Event(enable_timing=True) + end_event = torch.cuda.Event(enable_timing=True) + + latencies: list[float] = [] + for i in range(num_iters): + torch.cuda.synchronize() + start_event.record() + run() + end_event.record() + end_event.synchronize() + latencies.append(start_event.elapsed_time(end_event)) + avg = sum(latencies) / (num_iters * 10) * 1000 # us + return avg + + +def tune(M, N, K, block_size, out_dtype, search_space, input_type): + factor_for_scale = 1e-2 + + if input_type == "fp8": + fp8_info = torch.finfo(torch.float8_e4m3fn) + fp8_max, fp8_min = fp8_info.max, fp8_info.min + + A_fp32 = ( + (torch.rand(M, K, dtype=torch.float32, device="cuda") - 0.5) * 2 * fp8_max + ) + A = A_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn) + + B_fp32 = ( + (torch.rand(N, K, dtype=torch.float32, device="cuda") - 0.5) * 2 * fp8_max + ) + B = B_fp32.clamp(min=fp8_min, 
max=fp8_max).to(torch.float8_e4m3fn) + else: + raise RuntimeError("Currently, only support tune w8a8 block fp8 kernel.") + + block_n, block_k = block_size[0], block_size[1] + n_tiles = (N + block_n - 1) // block_n + k_tiles = (K + block_k - 1) // block_k + + As = torch.rand(M, k_tiles, dtype=torch.float32, device="cuda") * factor_for_scale + Bs = ( + torch.rand(n_tiles, k_tiles, dtype=torch.float32, device="cuda") + * factor_for_scale + ) + + best_config = None + best_time = float("inf") + for config in tqdm(search_space): + try: + kernel_time = benchmark_config( + A, + B, + As, + Bs, + block_size, + config, + out_dtype, + num_iters=10, + ) + except triton.runtime.autotuner.OutOfResources: + # Some configurations may be invalid and fail to compile. + continue + + if kernel_time < best_time: + best_time = kernel_time + best_config = config + now = datetime.now() + print(f"{now.ctime()}] Completed tuning for batch_size={M}") + assert best_config is not None + return best_config + + +def save_configs( + N, + K, + block_n, + block_k, + configs, + save_path, + input_type="fp8", +) -> None: + os.makedirs(save_path, exist_ok=True) + device_name = current_platform.get_device_name().replace(" ", "_") + json_file_name = ( + f"N={N},K={K},device_name={device_name},dtype={input_type}_w8a8," + f"block_shape=[{block_n},{block_k}].json" + ) + + config_file_path = os.path.join(save_path, json_file_name) + print(f"Writing best config to {config_file_path}...") + + with open(config_file_path, "w") as f: + json.dump(configs, f, indent=4) + f.write("\n") + + +def tune_on_gpu(args_dict): + """Run tuning on a specific GPU.""" + gpu_id = args_dict["gpu_id"] + batch_sizes = args_dict["batch_sizes"] + weight_shapes = args_dict["weight_shapes"] + args = args_dict["args"] + + torch.cuda.set_device(gpu_id) + print(f"Starting tuning on GPU {gpu_id} with batch sizes {batch_sizes}") + + block_n = args.block_n + block_k = args.block_k + out_dtype = DTYPE_MAP[args.out_dtype] + save_path = args.save_path + input_type = args.input_type + + search_space = get_configs_compute_bound() + search_space = [ + config for config in search_space if block_k % config["BLOCK_SIZE_K"] == 0 + ] + + start = time.time() + for shape in tqdm(weight_shapes, desc=f"GPU {gpu_id} - Shapes"): + N, K = shape[0], shape[1] + print(f"[GPU {gpu_id}] Tune for weight shape of `N: {N}, K: {K}`") + benchmark_results = [ + tune( + batch_size, + N, + K, + [block_n, block_k], + out_dtype, + search_space, + input_type, + ) + for batch_size in tqdm(batch_sizes, desc=f"GPU {gpu_id} - Batch sizes") + ] + best_configs = {M: config for M, config in zip(batch_sizes, benchmark_results)} + save_configs(N, K, block_n, block_k, best_configs, save_path, input_type) + + end = time.time() + print(f"Tuning on GPU {gpu_id} took {end - start:.2f} seconds") + + +def distribute_batch_sizes(batch_sizes, num_gpus): + """Distribute batch sizes across available GPUs.""" + batches_per_gpu = [] + for i in range(num_gpus): + start_idx = i * len(batch_sizes) // num_gpus + end_idx = (i + 1) * len(batch_sizes) // num_gpus + batches_per_gpu.append(batch_sizes[start_idx:end_idx]) + return batches_per_gpu + + +def main(args): + print(args) + num_gpus = torch.cuda.device_count() + if num_gpus == 0: + raise RuntimeError("No GPU available for tuning") + print(f"Found {num_gpus} GPUs for parallel tuning") + + torch.cuda.init() + + if args.batch_size is None: + batch_sizes = [ + 1, + 2, + 4, + 8, + 16, + 24, + 32, + 48, + 64, + 96, + 128, + 256, + 512, + 1024, + 1536, + 2048, + 3072, + 4096, + ] 
+ else: + batch_sizes = [args.batch_size] + num_gpus = 1 # If only one batch size, use only one GPU + + weight_shapes = get_weight_shapes(args.tp_size) + + batches_per_gpu = distribute_batch_sizes(batch_sizes, num_gpus) + + process_args = [] + for gpu_id in range(num_gpus): + process_args.append( + { + "gpu_id": gpu_id, + "batch_sizes": batches_per_gpu[gpu_id], + "weight_shapes": weight_shapes, # Each GPU processes all weight shapes + "args": args, + } + ) + + ctx = mp.get_context("spawn") + with ctx.Pool(num_gpus) as pool: + pool.map(tune_on_gpu, process_args) + + print("Multi-GPU tuning completed") + + +if __name__ == "__main__": + parser = FlexibleArgumentParser( + description=""" +Tune triton w8a8 block fp8 for DeepSeek-V3/DeepSeek-R1: + python3 benchmark_w8a8_block_fp8.py --tp-size 8 --input-type fp8 +Then copy to model_executor/layers/quantization/utils/configs + """, + formatter_class=argparse.RawTextHelpFormatter, + ) + + parser.add_argument("--tp-size", "-tp", type=int, default=8) + parser.add_argument("--input-type", type=str, choices=["fp8"], default="fp8") + parser.add_argument( + "--out-dtype", + type=str, + choices=["float32", "float16", "bfloat16", "half"], + default="float16", + ) + parser.add_argument("--block-n", type=int, default=128) + parser.add_argument("--block-k", type=int, default=128) + parser.add_argument("--batch-size", type=int, required=False) + parser.add_argument("--save-path", type=str, default="./") + args = parser.parse_args() + + main(args) diff --git a/benchmarks/kernels/deepgemm/README.md b/benchmarks/kernels/deepgemm/README.md new file mode 100644 index 000000000..917e81401 --- /dev/null +++ b/benchmarks/kernels/deepgemm/README.md @@ -0,0 +1,129 @@ +# DeepSeek DeepGEMM Kernels Benchmark + +This directory includes benchmarks between DeepSeek's DeepGEMM block fp8 kernels against vLLM's existing triton and CUTLASS-based kernels. + +Currently this just includes dense GEMMs and only works on Hopper GPUs. + +## Setup + +You need to install vLLM in your usual fashion, then install DeepGEMM from source in its own directory: + +``` +git clone --recursive https://github.com/deepseek-ai/DeepGEMM +cd DeepGEMM +python setup.py install +uv pip install -e . +``` + +## Usage + +``` +python benchmark_fp8_block_dense_gemm.py +INFO 02-26 21:55:13 [__init__.py:207] Automatically detected platform cuda. +===== STARTING FP8 GEMM BENCHMARK ===== +PyTorch version: 2.5.1+cu124 +CUDA version: 12.4 +Triton version: 3.1.0 +Using device: NVIDIA H100 80GB HBM3 +WARNING 02-26 21:55:15 [fp8_utils.py:458] Using default W8A8 Block FP8 kernel config. Performance might be sub-optimal! Config file not found at /home/mgoin/code/vllm/vllm/model_executor/layers/quantization/utils/configs/N=4096,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +INFO 02-26 21:55:15 [fp8_utils.py:449] Using configuration from /home/mgoin/code/vllm/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json for W8A8 Block FP8 kernel. +WARNING 02-26 21:55:16 [fp8_utils.py:458] Using default W8A8 Block FP8 kernel config. Performance might be sub-optimal! Config file not found at /home/mgoin/code/vllm/vllm/model_executor/layers/quantization/utils/configs/N=18432,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +WARNING 02-26 21:55:17 [fp8_utils.py:458] Using default W8A8 Block FP8 kernel config. Performance might be sub-optimal! 
Config file not found at /home/mgoin/code/vllm/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +INFO 02-26 21:55:17 [fp8_utils.py:449] Using configuration from /home/mgoin/code/vllm/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json for W8A8 Block FP8 kernel. +INFO 02-26 21:55:17 [fp8_utils.py:449] Using configuration from /home/mgoin/code/vllm/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json for W8A8 Block FP8 kernel. + +===== PERFORMANCE COMPARISON ===== + +DeepGEMM Implementation: ++------+-------+-------+-----------+--------+--------+ +| m | n | k | Time (μs) | TFLOPS | GB/s | ++------+-------+-------+-----------+--------+--------+ +| 8 | 4096 | 7168 | 102.9 | 4.6 | 286.4 | +| 8 | 7168 | 18432 | 70.8 | 29.8 | 1868.8 | +| 8 | 18432 | 7168 | 69.3 | 30.5 | 1911.8 | +| 64 | 4096 | 7168 | 69.1 | 54.4 | 439.0 | +| 64 | 7168 | 18432 | 69.4 | 243.6 | 1933.6 | +| 64 | 18432 | 7168 | 70.4 | 240.3 | 1917.2 | +| 64 | 24576 | 1536 | 70.1 | 68.9 | 584.6 | +| 64 | 32768 | 512 | 68.4 | 31.4 | 307.1 | +| 64 | 7168 | 16384 | 69.5 | 216.3 | 1718.5 | +| 128 | 4096 | 7168 | 141.1 | 53.3 | 222.1 | +| 128 | 7168 | 18432 | 71.9 | 470.5 | 1896.1 | +| 128 | 18432 | 7168 | 69.3 | 488.2 | 1988.2 | +| 1024 | 4096 | 7168 | 89.7 | 670.1 | 502.5 | +| 1024 | 18432 | 7168 | 279.0 | 969.8 | 635.2 | +| 2048 | 4096 | 7168 | 175.1 | 687.0 | 347.4 | +| 4096 | 4096 | 7168 | 335.4 | 717.0 | 275.1 | ++------+-------+-------+-----------+--------+--------+ + +vLLM Triton Implementation: ++------+-------+-------+-----------+--------+--------+--------------+ +| m | n | k | Time (μs) | TFLOPS | GB/s | vs DeepGEMM | ++------+-------+-------+-----------+--------+--------+--------------+ +| 8 | 4096 | 7168 | 74.0 | 6.3 | 398.2 | 1.39x faster | +| 8 | 7168 | 18432 | 89.6 | 23.6 | 1478.1 | 0.79x slower | +| 8 | 18432 | 7168 | 113.2 | 18.7 | 1170.4 | 0.61x slower | +| 64 | 4096 | 7168 | 79.4 | 47.3 | 382.2 | 0.87x slower | +| 64 | 7168 | 18432 | 98.5 | 171.7 | 1363.0 | 0.70x slower | +| 64 | 18432 | 7168 | 119.5 | 141.5 | 1129.4 | 0.59x slower | +| 64 | 24576 | 1536 | 37.6 | 128.4 | 1089.7 | 1.86x faster | +| 64 | 32768 | 512 | 38.7 | 55.5 | 542.6 | 1.77x faster | +| 64 | 7168 | 16384 | 86.1 | 174.5 | 1386.4 | 0.81x slower | +| 128 | 4096 | 7168 | 90.7 | 82.9 | 345.4 | 1.56x faster | +| 128 | 7168 | 18432 | 144.0 | 234.9 | 946.9 | 0.50x slower | +| 128 | 18432 | 7168 | 229.5 | 147.4 | 600.1 | 0.30x slower | +| 1024 | 4096 | 7168 | 242.3 | 248.2 | 186.1 | 0.37x slower | +| 1024 | 18432 | 7168 | 897.8 | 301.4 | 197.4 | 0.31x slower | +| 2048 | 4096 | 7168 | 463.0 | 259.7 | 131.4 | 0.38x slower | +| 4096 | 4096 | 7168 | 901.8 | 266.7 | 102.3 | 0.37x slower | ++------+-------+-------+-----------+--------+--------+--------------+ + +vLLM CUTLASS Implementation: ++------+-------+-------+-----------+--------+--------+--------------+--------------+ +| m | n | k | Time (μs) | TFLOPS | GB/s | vs DeepGEMM | vs Triton | ++------+-------+-------+-----------+--------+--------+--------------+--------------+ +| 8 | 4096 | 7168 | 34.6 | 13.6 | 852.3 | 2.98x faster | 2.14x faster | +| 8 | 7168 | 18432 | 78.9 | 26.8 | 1677.3 | 0.90x slower | 1.13x faster | +| 8 | 18432 | 7168 | 81.2 | 26.0 | 1631.1 | 0.85x slower | 1.39x faster | +| 64 | 4096 | 7168 | 36.9 | 101.9 | 822.9 | 
1.87x faster | 2.15x faster | +| 64 | 7168 | 18432 | 87.4 | 193.4 | 1535.2 | 0.79x slower | 1.13x faster | +| 64 | 18432 | 7168 | 85.0 | 199.0 | 1587.6 | 0.83x slower | 1.41x faster | +| 64 | 24576 | 1536 | 28.0 | 172.8 | 1465.8 | 2.51x faster | 1.35x faster | +| 64 | 32768 | 512 | 28.8 | 74.5 | 728.5 | 2.37x faster | 1.34x faster | +| 64 | 7168 | 16384 | 77.9 | 193.0 | 1532.8 | 0.89x slower | 1.11x faster | +| 128 | 4096 | 7168 | 39.1 | 192.4 | 802.0 | 3.61x faster | 2.32x faster | +| 128 | 7168 | 18432 | 93.7 | 360.8 | 1454.2 | 0.77x slower | 1.54x faster | +| 128 | 18432 | 7168 | 85.7 | 394.8 | 1608.0 | 0.81x slower | 2.68x faster | +| 1024 | 4096 | 7168 | 99.7 | 603.1 | 452.2 | 0.90x slower | 2.43x faster | +| 1024 | 18432 | 7168 | 331.3 | 816.7 | 534.9 | 0.84x slower | 2.71x faster | +| 2048 | 4096 | 7168 | 198.3 | 606.6 | 306.7 | 0.88x slower | 2.34x faster | +| 4096 | 4096 | 7168 | 392.2 | 613.2 | 235.3 | 0.86x slower | 2.30x faster | ++------+-------+-------+-----------+--------+--------+--------------+--------------+ + +===== AVERAGE PERFORMANCE ===== ++----------------+------------+----------+---------------+ +| Implementation | Avg TFLOPS | Avg GB/s | Avg Time (ms) | ++----------------+------------+----------+---------------+ +| DeepGEMM | 310.98 | 1052.10 | 0.11 | +| vLLM Triton | 144.30 | 715.60 | 0.23 | +| vLLM CUTLASS | 286.78 | 1076.67 | 0.11 | ++----------------+------------+----------+---------------+ + +===== AVERAGE SPEEDUPS ===== ++-----------------------------+--------------+ +| Comparison | Speedup | ++-----------------------------+--------------+ +| DeepGEMM vs vLLM Triton | 1.71x faster | +| DeepGEMM vs vLLM CUTLASS | 0.94x slower | +| vLLM CUTLASS vs vLLM Triton | 1.84x faster | ++-----------------------------+--------------+ + +===== ACCURACY COMPARISON ===== ++----------------+-----------------------+ +| Implementation | Avg Diff vs Reference | ++----------------+-----------------------+ +| DeepGEMM | 0.000684 | +| vLLM Triton | 0.000684 | +| vLLM CUTLASS | 0.000684 | ++----------------+-----------------------+ +``` diff --git a/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py b/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py new file mode 100644 index 000000000..e67ce0545 --- /dev/null +++ b/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py @@ -0,0 +1,467 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# fmt: off +# ruff: noqa: E501 +import time + +# Import DeepGEMM functions +import deep_gemm +import torch +from deep_gemm import calc_diff, ceil_div, get_col_major_tma_aligned_tensor + +# Import vLLM functions +from vllm import _custom_ops as ops +from vllm.model_executor.layers.quantization.utils.fp8_utils import ( + per_token_group_quant_fp8, + w8a8_block_fp8_matmul, +) +from vllm.triton_utils import triton + + +# Copied from +# https://github.com/deepseek-ai/DeepGEMM/blob/78cacf70d41d15d688bd493ebc85845f7f2a3d5d/tests/test_core.py#L9 +def per_token_cast_to_fp8( + x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: + """Convert tensor to FP8 format with per-token scaling.""" + assert x.dim() == 2 and x.size(1) % 128 == 0 + m, n = x.shape + x_view = x.view(m, -1, 128) + x_amax = x_view.abs().float().amax(dim=2).view(m, -1).clamp(1e-4) + return (x_view * (448.0 / x_amax.unsqueeze(2))).to( + torch.float8_e4m3fn).view(m, n), (x_amax / 448.0).view(m, -1) + + +# Copied from +# 
https://github.com/deepseek-ai/DeepGEMM/blob/78cacf70d41d15d688bd493ebc85845f7f2a3d5d/tests/test_core.py#L17 +def per_block_cast_to_fp8( + x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: + """Convert tensor to FP8 format with per-block scaling.""" + assert x.dim() == 2 + m, n = x.shape + x_padded = torch.zeros((ceil_div(m, 128) * 128, ceil_div(n, 128) * 128), + dtype=x.dtype, + device=x.device) + x_padded[:m, :n] = x + x_view = x_padded.view(-1, 128, x_padded.size(1) // 128, 128) + x_amax = x_view.abs().float().amax(dim=(1, 3), keepdim=True).clamp(1e-4) + x_scaled = (x_view * (448.0 / x_amax)).to(torch.float8_e4m3fn) + return x_scaled.view_as(x_padded)[:m, :n].contiguous(), ( + x_amax / 448.0).view(x_view.size(0), x_view.size(2)) + + +def benchmark_shape(m: int, + n: int, + k: int, + warmup: int = 100, + repeat: int = 10000, + verbose: bool = False) -> dict: + """Benchmark all implementations for a specific (m, n, k) shape.""" + if verbose: + print(f"\n=== Benchmarking shape: m={m}, n={n}, k={k} ===") + + # Create test tensors + A = torch.randn((m, k), device='cuda', dtype=torch.bfloat16) + B = torch.randn((n, k), device='cuda', dtype=torch.bfloat16) + + # Reference result in BF16 + torch.cuda.synchronize() + C_ref = A @ B.t() + + # Pre-quantize B for all implementations + # (weights can be pre-quantized offline) + B_deepgemm, B_scale_deepgemm = per_block_cast_to_fp8(B) + B_vllm, B_scale_vllm = per_block_cast_to_fp8(B) + + # Block size configuration + block_size = [128, 128] + + # Pre-quantize A for all implementations + A_deepgemm, A_scale_deepgemm = per_token_cast_to_fp8(A) + A_scale_deepgemm = get_col_major_tma_aligned_tensor(A_scale_deepgemm) + C_deepgemm = torch.empty((m, n), device='cuda', dtype=torch.bfloat16) + A_vllm, A_scale_vllm = per_token_group_quant_fp8(A, block_size[1]) + A_vllm_cutlass, A_scale_vllm_cutlass = per_token_group_quant_fp8( + A, block_size[1], column_major_scales=True) + + # === DeepGEMM Implementation === + def deepgemm_gemm(): + # A quantization is inside the loop as it depends on activations + # A_deepgemm, A_scale_deepgemm = per_token_cast_to_fp8(A) + # A_deepgemm, A_scale_deepgemm = per_token_group_quant_fp8( + # A, block_size[1]) + # A_scale_aligned = get_col_major_tma_aligned_tensor(A_scale_deepgemm) + # C_deepgemm = torch.empty((m, n), device='cuda', dtype=torch.bfloat16) + deep_gemm.gemm_fp8_fp8_bf16_nt((A_deepgemm, A_scale_deepgemm), + (B_deepgemm, B_scale_deepgemm), + C_deepgemm) + return C_deepgemm + + # === vLLM Triton Implementation === + def vllm_triton_gemm(): + # A quantization is inside the loop as it depends on activations + # A_vllm, A_scale_vllm = per_token_group_quant_fp8(A, block_size[1]) + return w8a8_block_fp8_matmul(A_vllm, + B_vllm, + A_scale_vllm, + B_scale_vllm, + block_size, + output_dtype=torch.bfloat16) + + # === vLLM CUTLASS Implementation === + def vllm_cutlass_gemm(): + # A quantization is inside the loop as it depends on activations + # A_vllm_cutlass, A_scale_vllm_cutlass = per_token_group_quant_fp8( + # A, block_size[1], column_major_scales=True) + return ops.cutlass_scaled_mm(A_vllm_cutlass, + B_vllm.T, + scale_a=A_scale_vllm_cutlass, + scale_b=B_scale_vllm.T, + out_dtype=torch.bfloat16) + + # Run correctness check first + if verbose: + print("Running correctness check...") + C_deepgemm = deepgemm_gemm() + C_vllm_triton = vllm_triton_gemm() + C_vllm_cutlass = vllm_cutlass_gemm() + + deepgemm_diff = calc_diff(C_deepgemm, C_ref) + vllm_triton_diff = calc_diff(C_vllm_triton, C_ref) + vllm_cutlass_diff = 
calc_diff(C_vllm_cutlass, C_ref) + + if verbose: + print(f"DeepGEMM vs Reference difference: {deepgemm_diff:.6f}") + print(f"vLLM Triton vs Reference difference: {vllm_triton_diff:.6f}") + print(f"vLLM CUTLASS vs Reference difference: {vllm_cutlass_diff:.6f}") + print("vLLM Triton vs DeepGEMM difference: " + f"{calc_diff(C_vllm_triton, C_deepgemm):.6f}") + print("vLLM CUTLASS vs DeepGEMM difference: " + f"{calc_diff(C_vllm_cutlass, C_deepgemm):.6f}") + + # Benchmark implementations + implementations = { + "DeepGEMM": deepgemm_gemm, + "vLLM Triton": vllm_triton_gemm, + "vLLM CUTLASS": vllm_cutlass_gemm + } + + benchmark_results = { + "shape": { + "m": m, + "n": n, + "k": k + }, + "implementations": {} + } + + for name, func in implementations.items(): + # Warmup + for _ in range(warmup): + func() + torch.cuda.synchronize() + + # Timing loop + torch.cuda.synchronize() + start = time.time() + for _ in range(repeat): + func() + torch.cuda.synchronize() + end = time.time() + + # Calculate timing and TFLOPS + avg_time_ms = (end - start) / repeat * 1000 + avg_time_us = avg_time_ms * 1000 + tflops = 2 * m * n * k / (avg_time_ms * 1e-3) / 1e12 + gb_s = (m * k + k * n + m * n * 2) / 1e9 / (avg_time_ms * 1e-3) + + benchmark_results["implementations"][name] = { + "time_ms": avg_time_ms, + "time_us": avg_time_us, + "tflops": tflops, + "gb_s": gb_s, + "diff": { + "DeepGEMM": + 0.0 if name == "DeepGEMM" else calc_diff(func(), C_deepgemm), + "Reference": + deepgemm_diff if name == "DeepGEMM" else + (vllm_triton_diff + if name == "vLLM Triton" else vllm_cutlass_diff) + } + } + + if verbose: + print( + f"{name}: {avg_time_ms:.3f} ms, {tflops:.2f} TFLOPS, {gb_s:.2f} GB/s" + ) + + # Calculate speedups + baseline = benchmark_results["implementations"]["DeepGEMM"]["time_ms"] + for name, data in benchmark_results["implementations"].items(): + if name != "DeepGEMM": + speedup = baseline / data["time_ms"] + benchmark_results["implementations"][name][ + "speedup_vs_deepgemm"] = speedup + if verbose: + print(f"DeepGEMM is {1/speedup:.2f}x " + f"{'faster' if 1/speedup > 1 else 'slower'} than {name}") + + vllm_triton_time = benchmark_results["implementations"]["vLLM Triton"][ + "time_ms"] + vllm_cutlass_time = benchmark_results["implementations"]["vLLM CUTLASS"][ + "time_ms"] + cutlass_vs_triton = vllm_triton_time / vllm_cutlass_time + benchmark_results["implementations"]["vLLM CUTLASS"][ + "speedup_vs_triton"] = cutlass_vs_triton + if verbose: + print( + f"vLLM CUTLASS is {cutlass_vs_triton:.2f}x " + f"{'faster' if cutlass_vs_triton > 1 else 'slower'} than vLLM Triton" + ) + + return benchmark_results + + +def format_table_row(values, widths): + """Format a row with specified column widths.""" + return "| " + " | ".join(f"{val:{w}}" + for val, w in zip(values, widths)) + " |" + + +def print_table(headers, rows, title=None): + """Print a table with headers and rows.""" + if title: + print(f"\n{title}") + + # Calculate column widths based on headers and data + widths = [ + max(len(str(h)), max(len(str(row[i])) for row in rows)) + for i, h in enumerate(headers) + ] + + # Create separator line + separator = "+-" + "-+-".join("-" * w for w in widths) + "-+" + + # Print table + print(separator) + print(format_table_row(headers, widths)) + print(separator) + for row in rows: + print(format_table_row(row, widths)) + print(separator) + + +def format_speedup(value): + """Format speedup value with indicator if it's faster or slower.""" + return f"{value:.2f}x {'faster' if value > 1.0 else 'slower'}" + + +def 
run_benchmarks(verbose: bool = False): + """Run benchmarks for a set of common shapes.""" + print("===== STARTING FP8 GEMM BENCHMARK =====") + + # Make sure we're using the GPU + if not torch.cuda.is_available(): + print("CUDA not available! Tests require GPU.") + return + + # Print system information + print(f"PyTorch version: {torch.__version__}") + print(f"CUDA version: {torch.version.cuda}") + print(f"Triton version: {triton.__version__}") + print(f"Using device: {torch.cuda.get_device_name()}") + + # Enable TF32 for better performance + torch.backends.cuda.matmul.allow_tf32 = True + torch.backends.cudnn.allow_tf32 = True + + # Set seeds for reproducibility + torch.manual_seed(42) + torch.cuda.manual_seed(42) + + # Define benchmark shapes (m, n, k) + shapes = [ + (8, 4096, 7168), + (8, 7168, 18432), + (8, 18432, 7168), + (64, 4096, 7168), + (64, 7168, 18432), + (64, 18432, 7168), + (64, 24576, 1536), + (64, 32768, 512), + (64, 7168, 16384), + (128, 4096, 7168), + (128, 7168, 18432), + (128, 18432, 7168), + (1024, 4096, 7168), + (1024, 18432, 7168), + (2048, 4096, 7168), + (4096, 4096, 7168), + ] + shapes = [ + # (64, 2112, 7168), + (64, 24576, 1536), + (64, 32768, 512), + (64, 7168, 16384), + (64, 4096, 7168), + (64, 7168, 2048), + # (128, 2112, 7168), + (128, 24576, 1536), + (128, 32768, 512), + (128, 7168, 16384), + (128, 4096, 7168), + (128, 7168, 2048), + # (4096, 2112, 7168), + (4096, 24576, 1536), + (4096, 32768, 512), + (4096, 7168, 16384), + (4096, 4096, 7168), + (4096, 7168, 2048), + ] + + all_results = [] + for m, n, k in shapes: + result = benchmark_shape(m, n, k, verbose=verbose) + all_results.append(result) + + # Print results in a nicely formatted table + print("\n===== PERFORMANCE COMPARISON =====") + + # Print DeepGEMM table + deepgemm_headers = ["m", "n", "k", "Time (μs)", "TFLOPS", "GB/s"] + deepgemm_rows = [] + for result in all_results: + shape = result["shape"] + impl_data = result["implementations"]["DeepGEMM"] + deepgemm_rows.append([ + shape["m"], shape["n"], shape["k"], f"{impl_data['time_us']:.1f}", + f"{impl_data['tflops']:.1f}", f"{impl_data['gb_s']:.1f}" + ]) + + print_table(deepgemm_headers, + deepgemm_rows, + title="DeepGEMM Implementation:") + + # Print vLLM Triton table + triton_headers = [ + "m", "n", "k", "Time (μs)", "TFLOPS", "GB/s", "vs DeepGEMM" + ] + triton_rows = [] + for result in all_results: + shape = result["shape"] + impl_data = result["implementations"]["vLLM Triton"] + speedup = impl_data.get("speedup_vs_deepgemm", 1.0) + triton_rows.append([ + shape["m"], shape["n"], shape["k"], f"{impl_data['time_us']:.1f}", + f"{impl_data['tflops']:.1f}", f"{impl_data['gb_s']:.1f}", + format_speedup(speedup) + ]) + + print_table(triton_headers, + triton_rows, + title="vLLM Triton Implementation:") + + # Print vLLM CUTLASS table + cutlass_headers = [ + "m", "n", "k", "Time (μs)", "TFLOPS", "GB/s", "vs DeepGEMM", + "vs Triton" + ] + cutlass_rows = [] + for result in all_results: + shape = result["shape"] + impl_data = result["implementations"]["vLLM CUTLASS"] + vs_deepgemm = impl_data.get("speedup_vs_deepgemm", 1.0) + vs_triton = impl_data.get("speedup_vs_triton", 1.0) + cutlass_rows.append([ + shape["m"], shape["n"], shape["k"], f"{impl_data['time_us']:.1f}", + f"{impl_data['tflops']:.1f}", f"{impl_data['gb_s']:.1f}", + format_speedup(vs_deepgemm), + format_speedup(vs_triton) + ]) + + print_table(cutlass_headers, + cutlass_rows, + title="vLLM CUTLASS Implementation:") + + # Calculate and print averages + print("\n===== AVERAGE PERFORMANCE =====") + + 
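+    # The averages below are unweighted arithmetic means over all benchmarked
+    # shapes, so large-M shapes are not given extra weight.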
implementations = ["DeepGEMM", "vLLM Triton", "vLLM CUTLASS"] + avg_metrics = { + impl: { + "tflops": 0, + "gb_s": 0, + "time_ms": 0 + } + for impl in implementations + } + + for result in all_results: + for impl in implementations: + impl_data = result["implementations"][impl] + avg_metrics[impl]["tflops"] += impl_data["tflops"] + avg_metrics[impl]["gb_s"] += impl_data["gb_s"] + avg_metrics[impl]["time_ms"] += impl_data["time_ms"] + + num_shapes = len(all_results) + avg_headers = ["Implementation", "Avg TFLOPS", "Avg GB/s", "Avg Time (ms)"] + avg_rows = [] + + for impl in implementations: + avg_tflops = avg_metrics[impl]["tflops"] / num_shapes + avg_mem_bw = avg_metrics[impl]["gb_s"] / num_shapes + avg_time = avg_metrics[impl]["time_ms"] / num_shapes + avg_rows.append([ + impl, f"{avg_tflops:.2f}", f"{avg_mem_bw:.2f}", f"{avg_time:.2f}" + ]) + + print_table(avg_headers, avg_rows) + + # Calculate average speedups + avg_speedups = { + "DeepGEMM vs vLLM Triton": 0, + "DeepGEMM vs vLLM CUTLASS": 0, + "vLLM CUTLASS vs vLLM Triton": 0 + } + + for result in all_results: + deepgemm_time = result["implementations"]["DeepGEMM"]["time_ms"] + vllm_triton_time = result["implementations"]["vLLM Triton"]["time_ms"] + vllm_cutlass_time = result["implementations"]["vLLM CUTLASS"][ + "time_ms"] + + avg_speedups[ + "DeepGEMM vs vLLM Triton"] += vllm_triton_time / deepgemm_time + avg_speedups[ + "DeepGEMM vs vLLM CUTLASS"] += vllm_cutlass_time / deepgemm_time + avg_speedups[ + "vLLM CUTLASS vs vLLM Triton"] += vllm_triton_time / vllm_cutlass_time + + print("\n===== AVERAGE SPEEDUPS =====") + speedup_headers = ["Comparison", "Speedup"] + speedup_rows = [] + for comparison, total in avg_speedups.items(): + avg_speedup = total / num_shapes + status = "faster" if avg_speedup > 1 else "slower" + speedup_rows.append([comparison, f"{avg_speedup:.2f}x {status}"]) + + print_table(speedup_headers, speedup_rows) + + # Average accuracy comparison + print("\n===== ACCURACY COMPARISON =====") + avg_diff = {impl: 0 for impl in implementations} + + for result in all_results: + for impl in implementations: + avg_diff[impl] += result["implementations"][impl]["diff"][ + "Reference"] + + diff_headers = ["Implementation", "Avg Diff vs Reference"] + diff_rows = [] + for impl in implementations: + diff_rows.append([impl, f"{avg_diff[impl] / num_shapes:.6f}"]) + + print_table(diff_headers, diff_rows) + + +if __name__ == "__main__": + run_benchmarks(verbose=False) diff --git a/benchmarks/kernels/graph_machete_bench.py b/benchmarks/kernels/graph_machete_bench.py new file mode 100644 index 000000000..9a4da0ef5 --- /dev/null +++ b/benchmarks/kernels/graph_machete_bench.py @@ -0,0 +1,64 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import math +import pickle +from collections import defaultdict + +import matplotlib.pyplot as plt +import pandas as pd +import regex as re +import seaborn as sns +from torch.utils.benchmark import Measurement as TMeasurement + +from vllm.utils import FlexibleArgumentParser + +if __name__ == "__main__": + parser = FlexibleArgumentParser( + description="Benchmark the latency of processing a single batch of " + "requests till completion." 
+ ) + parser.add_argument("filename", type=str) + + args = parser.parse_args() + + with open(args.filename, "rb") as f: + data = pickle.load(f) + raw_results: list[TMeasurement] = data["results"] + + results = defaultdict(lambda: list()) + for v in raw_results: + result = re.search(r"MKN=\(\d+x(\d+x\d+)\)", v.task_spec.sub_label) + if result is not None: + KN = result.group(1) + else: + raise Exception("MKN not found") + result = re.search(r"MKN=\((\d+)x\d+x\d+\)", v.task_spec.sub_label) + if result is not None: + M = result.group(1) + else: + raise Exception("MKN not found") + + kernel = v.task_spec.description + results[KN].append({"kernel": kernel, "batch_size": M, "median": v.median}) + + rows = int(math.ceil(len(results) / 2)) + fig, axs = plt.subplots(rows, 2, figsize=(12, 5 * rows)) + axs = axs.flatten() + for axs_idx, (shape, data) in enumerate(results.items()): + plt.sca(axs[axs_idx]) + df = pd.DataFrame(data) + sns.lineplot( + data=df, + x="batch_size", + y="median", + hue="kernel", + style="kernel", + markers=True, + dashes=False, + palette="Dark2", + ) + plt.title(f"Shape: {shape}") + plt.ylabel("time (median, s)") + plt.tight_layout() + plt.savefig("graph_machete_bench.pdf") diff --git a/benchmarks/kernels/requirements.txt b/benchmarks/kernels/requirements.txt new file mode 100644 index 000000000..1411a4a0b --- /dev/null +++ b/benchmarks/kernels/requirements.txt @@ -0,0 +1 @@ +pandas \ No newline at end of file diff --git a/benchmarks/kernels/utils.py b/benchmarks/kernels/utils.py new file mode 100644 index 000000000..4bbb36bb4 --- /dev/null +++ b/benchmarks/kernels/utils.py @@ -0,0 +1,214 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import dataclasses +from collections.abc import Iterable +from typing import Any, Callable, Optional + +import torch +import torch.utils.benchmark as TBenchmark +from torch.utils.benchmark import Measurement as TMeasurement + + +@dataclasses.dataclass +class CudaGraphBenchParams: + num_ops_in_cuda_graph: int + + +@dataclasses.dataclass +class ArgPool: + """ + When some argument of the benchmarking function is annotated with this type, + the benchmarking class (BenchMM) will collapse the argument to a pick a + single value from the given list of values, during function invocation. + For every invocation during a benchmarking run, it will choose a + different value from the list. 
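+
+    Example: Bench(None, "gemm", "bf16", "cutlass", fn, ArgPool([a0, a1]), w=w)
+    cycles through a0, a1, a0, ... on successive invocations of fn during the
+    benchmark run, while the plain keyword argument w stays fixed (a0, a1, w
+    and fn are illustrative placeholders).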
+ """ + + values: Iterable[Any] + + def __getitem__(self, index): + return self.values[index] + + +class Bench: + class ArgsIterator: + def __init__(self, args_list, kwargs_list): + assert len(args_list) == len(kwargs_list) + self.args_list = args_list + self.kwargs_list = kwargs_list + self.n = len(self.args_list) + self.idx = 0 + + def __next__(self): + while True: + yield (self.args_list[self.idx], self.kwargs_list[self.idx]) + self.idx += 1 + self.idx = self.idx % self.n + + def reset(self): + self.idx = 0 + + @property + def n_args(self): + return self.n + + def __init__( + self, + cuda_graph_params: Optional[CudaGraphBenchParams], + label: str, + sub_label: str, + description: str, + fn: Callable, + *args, + **kwargs, + ): + self.cuda_graph_params = cuda_graph_params + self.use_cuda_graph = self.cuda_graph_params is not None + self.label = label + self.sub_label = sub_label + self.description = description + self.fn = fn + + # Process args + self._args = args + self._kwargs = kwargs + self.args_list, self.kwargs_list = self.collapse_argpool(*args, **kwargs) + self.args_iterator = self.ArgsIterator(self.args_list, self.kwargs_list) + + # Cudagraph runner + self.g = None + if self.use_cuda_graph: + self.g = self.get_cuda_graph_runner() + + # benchmark run params + self.min_run_time = 1 + + def collapse_argpool(self, *args, **kwargs): + argpool_args = [arg for arg in args if isinstance(arg, ArgPool)] + [ + arg for arg in kwargs.values() if isinstance(arg, ArgPool) + ] + if len(argpool_args) == 0: + return [args], [kwargs] + + # Make sure all argpools are of the same size + argpool_size = len(argpool_args[0].values) + assert all([argpool_size == len(arg.values) for arg in argpool_args]) + + # create copies of the args + args_list = [] + kwargs_list = [] + for _ in range(argpool_size): + args_list.append(args) + kwargs_list.append(kwargs.copy()) + + for i in range(argpool_size): + # collapse args; Just pick the ith value + args_list[i] = tuple( + [arg[i] if isinstance(arg, ArgPool) else arg for arg in args_list[i]] + ) + + # collapse kwargs + kwargs_i = kwargs_list[i] + arg_pool_keys = [k for k, v in kwargs_i.items() if isinstance(v, ArgPool)] + for k in arg_pool_keys: + # again just pick the ith value + kwargs_i[k] = kwargs_i[k][i] + kwargs_list[i] = kwargs_i + + return args_list, kwargs_list + + def get_cuda_graph_runner(self): + assert self.use_cuda_graph + assert self.args_iterator is not None + + num_graph_ops = self.cuda_graph_params.num_ops_in_cuda_graph + + # warmup + args_it = self.args_iterator.__next__() + for _ in range(2): + args, kwargs = next(args_it) + self.fn(*args, **kwargs) + + self.args_iterator.reset() + args_it = self.args_iterator.__next__() + stream = torch.cuda.Stream() + with torch.cuda.stream(stream): + g = torch.cuda.CUDAGraph() + with torch.cuda.graph(g): + for _ in range(num_graph_ops): + args, kwargs = next(args_it) + self.fn(*args, **kwargs) + return g + + def run_cudagrah(self) -> TMeasurement: + assert self.use_cuda_graph + globals = {"g": self.g} + + return TBenchmark.Timer( + stmt="g.replay()", + globals=globals, + label=( + f"{self.label}" + f" | cugraph {self.cuda_graph_params.num_ops_in_cuda_graph} ops" + ), + sub_label=self.sub_label, + description=self.description, + ).blocked_autorange(min_run_time=self.min_run_time) + + def run_eager(self) -> TMeasurement: + setup = None + stmt = None + globals = None + + has_arg_pool = self.args_iterator.n_args > 1 + if has_arg_pool: + setup = """ + args_iterator.reset() + args_it = args_iterator.__next__() + 
""" + stmt = """ + args, kwargs = next(args_it) + fn(*args, **kwargs) + """ + globals = {"fn": self.fn, "args_iterator": self.args_iterator} + else: + # no arg pool. Just use the args and kwargs directly + self.args_iterator.reset() + args_it = self.args_iterator.__next__() + args, kwargs = next(args_it) + + setup = "" + stmt = """ + fn(*args, **kwargs) + """ + globals = {"fn": self.fn, "args": args, "kwargs": kwargs} + + return TBenchmark.Timer( + stmt=stmt, + setup=setup, + globals=globals, + label=self.label, + sub_label=self.sub_label, + description=self.description, + ).blocked_autorange(min_run_time=self.min_run_time) + + def run(self) -> TMeasurement: + timer = None + if self.use_cuda_graph: # noqa SIM108 + timer = self.run_cudagrah() + else: + timer = self.run_eager() + if not timer.meets_confidence() or timer.has_warnings: + print("Doesn't meet confidence - re-running bench ...") + return self.run() + return timer + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_value, traceback): + if exc_type: + print(f"exc type {exc_type}") + print(f"exc value {exc_value}") + print(f"exc traceback {traceback}") diff --git a/benchmarks/kernels/weight_shapes.py b/benchmarks/kernels/weight_shapes.py new file mode 100644 index 000000000..a27f02394 --- /dev/null +++ b/benchmarks/kernels/weight_shapes.py @@ -0,0 +1,98 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +# Weight Shapes are in the format +# ([K, N], TP_SPLIT_DIM) +# Example: +# A shape of ([14336, 4096], 0) indicates the following GEMM shape, +# - TP1 : K = 14336, N = 4096 +# - TP2 : K = 7168, N = 4096 +# A shape of ([4096, 6144], 1) indicates the following GEMM shape, +# - TP1 : K = 4096, N = 6144 +# - TP4 : K = 4096, N = 1536 + +# TP1 shapes +WEIGHT_SHAPES = { + "mistralai/Mistral-7B-v0.1": [ + ([4096, 6144], 1), + ([4096, 4096], 0), + ([4096, 28672], 1), + ([14336, 4096], 0), + ], + "meta-llama/Llama-2-7b-hf": [ + ([4096, 12288], 1), + ([4096, 4096], 0), + ([4096, 22016], 1), + ([11008, 4096], 0), + ], + "meta-llama/Llama-3-8b": [ + ([4096, 6144], 1), + ([4096, 4096], 0), + ([4096, 28672], 1), + ([14336, 4096], 0), + ], + "meta-llama/Llama-2-13b-hf": [ + ([5120, 15360], 1), + ([5120, 5120], 0), + ([5120, 27648], 1), + ([13824, 5120], 0), + ], + "meta-llama/Llama-2-70b-hf": [ + ([8192, 10240], 1), + ([8192, 8192], 0), + ([8192, 57344], 1), + ([28672, 8192], 0), + ], + "meta-llama/Llama-3.1-405b-hf": [ + ([16384, 18432], 1), + ([16384, 16384], 0), + ([16384, 106496], 1), + ([53248, 16384], 0), + ], + "meta-llama/Llama-3.1-8B-Instruct": [ + ([4096, 6144], 1), + ([4096, 4096], 0), + ([4096, 28672], 1), + ([14336, 4096], 0), + ], + "meta-llama/Llama-3.3-70B-Instruct": [ + ([8192, 10240], 1), + ([8192, 8192], 0), + ([8192, 57344], 1), + ([28672, 8192], 0), + ], + "mistralai/Mistral-Large-Instruct-2407": [ + ([12288, 14336], 1), + ([12288, 12288], 0), + ([12288, 57344], 1), + ([28672, 12288], 0), + ], + "Qwen/Qwen2.5-7B-Instruct": [ + ([3584, 4608], 1), + ([3584, 3584], 0), + ([3584, 37888], 1), + ([18944, 3584], 0), + ], + "Qwen/Qwen2.5-32B-Instruct": [ + ([5120, 7168], 1), + ([5120, 5120], 0), + ([5120, 55296], 1), + ([27648, 5120], 0), + ], + "Qwen/Qwen2.5-72B-Instruct": [ + ([8192, 10240], 1), + ([8192, 8192], 0), + ([8192, 59136], 1), + ([29568, 8192], 0), + ], + "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct": [ + ([2048, 3072], 1), + ([2048, 4096], 1), + ([2048, 2048], 0), + ([2048, 576], 0), + ([2048, 21888], 1), + ([10944, 2048], 0), + ([2048, 
2816], 1), + ([1408, 2048], 0), + ], +} diff --git a/benchmarks/overheads/benchmark_hashing.py b/benchmarks/overheads/benchmark_hashing.py new file mode 100644 index 000000000..0957a9c65 --- /dev/null +++ b/benchmarks/overheads/benchmark_hashing.py @@ -0,0 +1,64 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import cProfile +import pstats + +from vllm import LLM, SamplingParams +from vllm.utils import FlexibleArgumentParser + +# A very long prompt, total number of tokens is about 15k. +LONG_PROMPT = ["You are an expert in large language models, aren't you?"] * 1000 +LONG_PROMPT = " ".join(LONG_PROMPT) + + +def main(args): + llm = LLM( + model=args.model, + enforce_eager=True, + enable_prefix_caching=True, + tensor_parallel_size=args.tensor_parallel_size, + ) + + sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len) + profiler = cProfile.Profile() + + print("------warm up------") + for i in range(3): + output = llm.generate(LONG_PROMPT, sampling_params) + print(output[0].outputs[0].text) + + print("------start generating------") + for i in range(3): + profiler.runctx( + "llm.generate(LONG_PROMPT, sampling_params)", globals(), locals() + ) + + # analyze the runtime of hashing function + stats = pstats.Stats(profiler) + stats.sort_stats("cumulative") + total_time = 0 + total_calls = 0 + for func in stats.stats: + if "hash_of_block" in func[2]: + total_time = stats.stats[func][3] + total_calls = stats.stats[func][0] + percentage = (total_time / stats.total_tt) * 100 + print( + f"Hashing took {total_time:.2f} seconds,{percentage:.2f}% of the total runtime." + ) + + +if __name__ == "__main__": + parser = FlexibleArgumentParser( + description="Benchmark the performance of hashing function in" + "automatic prefix caching." + ) + parser.add_argument("--model", type=str, default="lmsys/longchat-7b-16k") + parser.add_argument("--tensor-parallel-size", "-tp", type=int, default=1) + parser.add_argument("--output-len", type=int, default=10) + parser.add_argument( + "--enable-prefix-caching", action="store_true", help="enable prefix caching" + ) + args = parser.parse_args() + main(args) diff --git a/benchmarks/pyproject.toml b/benchmarks/pyproject.toml new file mode 100644 index 000000000..65b1e09a2 --- /dev/null +++ b/benchmarks/pyproject.toml @@ -0,0 +1,49 @@ +# This local pyproject file is part of the migration from yapf to ruff format. 
+# It uses the same core rules as the main pyproject.toml file, but with the +# following differences: +# - ruff line length is overridden to 88 +# - deprecated typing ignores (UP006, UP035) have been removed + +[tool.ruff] +line-length = 88 + +[tool.ruff.lint.per-file-ignores] +"vllm/third_party/**" = ["ALL"] +"vllm/version.py" = ["F401"] +"vllm/_version.py" = ["ALL"] + +[tool.ruff.lint] +select = [ + # pycodestyle + "E", + # Pyflakes + "F", + # pyupgrade + "UP", + # flake8-bugbear + "B", + # flake8-simplify + "SIM", + # isort + "I", + # flake8-logging-format + "G", +] +ignore = [ + # star imports + "F405", "F403", + # lambda expression assignment + "E731", + # Loop control variable not used within loop body + "B007", + # f-string format + "UP032", + # Can remove once 3.10+ is the minimum Python version + "UP007", +] + +[tool.ruff.lint.isort] +known-first-party = ["vllm"] + +[tool.ruff.format] +docstring-code-format = true \ No newline at end of file diff --git a/benchmarks/run_structured_output_benchmark.sh b/benchmarks/run_structured_output_benchmark.sh new file mode 100644 index 000000000..b043ab83e --- /dev/null +++ b/benchmarks/run_structured_output_benchmark.sh @@ -0,0 +1,129 @@ +#!/bin/bash + +# default values +MODEL=${MODEL:-"Qwen/Qwen2.5-7B-Instruct"} +BACKEND=${BACKEND:-"vllm"} +DATASET=${DATASET:-"xgrammar_bench"} +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +OUTPUT_DIR=${OUTPUT_DIR:-"$SCRIPT_DIR/structured_output_benchmark_results"} +PORT=${PORT:-8000} +STRUCTURED_OUTPUT_RATIO=${STRUCTURED_OUTPUT_RATIO:-1} +TOTAL_SECONDS=${TOTAL_SECONDS:-90} +MAX_NEW_TOKENS=${MAX_NEW_TOKENS:-300} +TOKENIZER_MODE=${TOKENIZER_MODE:-"auto"} + +usage() { + echo "Usage: $0 [options]" + echo "Options:" + echo " --model MODEL Model to benchmark (default: $MODEL)" + echo " --backend BACKEND Backend to use (default: $BACKEND)" + echo " --dataset DATASET Dataset to use (default: $DATASET)" + echo " --max-new-tokens N Maximum number of tokens to generate (default: $MAX_NEW_TOKENS)" + echo " --output-dir DIR Output directory for results (default: $OUTPUT_DIR)" + echo " --port PORT Port to use (default: $PORT)" + echo " --structured-output-ratio N Ratio of structured outputs (default: $STRUCTURED_OUTPUT_RATIO)" + echo " --tokenizer-mode MODE Tokenizer mode to use (default: $TOKENIZER_MODE)" + echo " --total-seconds N Total seconds to run the benchmark (default: $TOTAL_SECONDS)" + echo " -h, --help Show this help message and exit" + exit 0 +} + +# parse command line arguments +while [[ $# -gt 0 ]]; do + case $1 in + --model) + MODEL="$2" + shift 2 + ;; + --backend) + BACKEND="$2" + shift 2 + ;; + --dataset) + DATASET="$2" + shift 2 + ;; + --max-new-tokens) + MAX_NEW_TOKENS="$2" + shift 2 + ;; + --output-dir) + OUTPUT_DIR="$2" + shift 2 + ;; + --port) + PORT="$2" + shift 2 + ;; + --structured-output-ratio) + STRUCTURED_OUTPUT_RATIO="$2" + shift 2 + ;; + --tokenizer-mode) + TOKENIZER_MODE="$2" + shift 2 + ;; + --total-seconds) + TOTAL_SECONDS="$2" + shift 2 + ;; + -h|--help) + usage + ;; + *) + echo "Unknown argument: $1\n" + usage + ;; + esac +done + +# Create output directory if it doesn't exist +mkdir -p "$OUTPUT_DIR" + +# Define QPS values to test +QPS_VALUES=(25 20 15 10 5 1) + +# Common parameters +COMMON_PARAMS="--backend $BACKEND \ + --model $MODEL \ + --dataset $DATASET \ + --structured-output-ratio $STRUCTURED_OUTPUT_RATIO \ + --save-results \ + --result-dir $OUTPUT_DIR \ + --output-len $MAX_NEW_TOKENS \ + --port $PORT \ + --tokenizer-mode $TOKENIZER_MODE" + +echo "Starting structured 
output benchmark with model: $MODEL" +echo "Backend: $BACKEND" +echo "Dataset: $DATASET" +echo "Results will be saved to: $OUTPUT_DIR" +echo "----------------------------------------" + +# Run benchmarks with different QPS values +for qps in "${QPS_VALUES[@]}"; do + echo "Running benchmark with QPS: $qps" + + # Get git hash and branch for the filename + GIT_HASH=$(git rev-parse --short HEAD 2>/dev/null || echo "unknown") + GIT_BRANCH=$(git rev-parse --abbrev-ref HEAD 2>/dev/null || echo "unknown") + + # Construct filename for this run + FILENAME="${BACKEND}_${qps}qps_$(basename $MODEL)_${DATASET}_${GIT_HASH}.json" + + NUM_PROMPTS=$(echo "$TOTAL_SECONDS * $qps" | bc) + NUM_PROMPTS=${NUM_PROMPTS%.*} # Remove fractional part + echo "Running benchmark with $NUM_PROMPTS prompts" + + # Run the benchmark + python "$SCRIPT_DIR/benchmark_serving_structured_output.py" $COMMON_PARAMS \ + --request-rate $qps \ + --result-filename "$FILENAME" \ + --num-prompts $NUM_PROMPTS + + echo "Completed benchmark with QPS: $qps" + echo "----------------------------------------" +done + +echo "All benchmarks completed!" +echo "Results saved to: $OUTPUT_DIR" diff --git a/benchmarks/sonnet.txt b/benchmarks/sonnet.txt new file mode 100644 index 000000000..34c444e8c --- /dev/null +++ b/benchmarks/sonnet.txt @@ -0,0 +1,518 @@ +FROM fairest creatures we desire increase, +That thereby beauty's rose might never die, +But as the riper should by time decease, +His tender heir might bear his memory: +But thou, contracted to thine own bright eyes, +Feed'st thy light'st flame with self-substantial fuel, +Making a famine where abundance lies, +Thyself thy foe, to thy sweet self too cruel. +Thou that art now the world's fresh ornament +And only herald to the gaudy spring, +Within thine own bud buriest thy content +And, tender churl, makest waste in niggarding. +Pity the world, or else this glutton be, +To eat the world's due, by the grave and thee. +When forty winters shall beseige thy brow, +And dig deep trenches in thy beauty's field, +Thy youth's proud livery, so gazed on now, +Will be a tatter'd weed, of small worth held: +Then being ask'd where all thy beauty lies, +Where all the treasure of thy lusty days, +To say, within thine own deep-sunken eyes, +Were an all-eating shame and thriftless praise. +How much more praise deserved thy beauty's use, +If thou couldst answer 'This fair child of mine +Shall sum my count and make my old excuse,' +Proving his beauty by succession thine! +This were to be new made when thou art old, +And see thy blood warm when thou feel'st it cold. +Look in thy glass, and tell the face thou viewest +Now is the time that face should form another; +Whose fresh repair if now thou not renewest, +Thou dost beguile the world, unbless some mother. +For where is she so fair whose unear'd womb +Disdains the tillage of thy husbandry? +Or who is he so fond will be the tomb +Of his self-love, to stop posterity? +Thou art thy mother's glass, and she in thee +Calls back the lovely April of her prime: +So thou through windows of thine age shall see +Despite of wrinkles this thy golden time. +But if thou live, remember'd not to be, +Die single, and thine image dies with thee. +Unthrifty loveliness, why dost thou spend +Upon thyself thy beauty's legacy? +Nature's bequest gives nothing but doth lend, +And being frank she lends to those are free. +Then, beauteous niggard, why dost thou abuse +The bounteous largess given thee to give? +Profitless usurer, why dost thou use +So great a sum of sums, yet canst not live? 
+For having traffic with thyself alone, +Thou of thyself thy sweet self dost deceive. +Then how, when nature calls thee to be gone, +What acceptable audit canst thou leave? +Thy unused beauty must be tomb'd with thee, +Which, used, lives th' executor to be. +Those hours, that with gentle work did frame +The lovely gaze where every eye doth dwell, +Will play the tyrants to the very same +And that unfair which fairly doth excel: +For never-resting time leads summer on +To hideous winter and confounds him there; +Sap cheque'd with frost and lusty leaves quite gone, +Beauty o'ersnow'd and bareness every where: +Then, were not summer's distillation left, +A liquid prisoner pent in walls of glass, +Beauty's effect with beauty were bereft, +Nor it nor no remembrance what it was: +But flowers distill'd though they with winter meet, +Leese but their show; their substance still lives sweet. +Then let not winter's ragged hand deface +In thee thy summer, ere thou be distill'd: +Make sweet some vial; treasure thou some place +With beauty's treasure, ere it be self-kill'd. +That use is not forbidden usury, +Which happies those that pay the willing loan; +That's for thyself to breed another thee, +Or ten times happier, be it ten for one; +Ten times thyself were happier than thou art, +If ten of thine ten times refigured thee: +Then what could death do, if thou shouldst depart, +Leaving thee living in posterity? +Be not self-will'd, for thou art much too fair +To be death's conquest and make worms thine heir. +Lo! in the orient when the gracious light +Lifts up his burning head, each under eye +Doth homage to his new-appearing sight, +Serving with looks his sacred majesty; +And having climb'd the steep-up heavenly hill, +Resembling strong youth in his middle age, +yet mortal looks adore his beauty still, +Attending on his golden pilgrimage; +But when from highmost pitch, with weary car, +Like feeble age, he reeleth from the day, +The eyes, 'fore duteous, now converted are +From his low tract and look another way: +So thou, thyself out-going in thy noon, +Unlook'd on diest, unless thou get a son. +Music to hear, why hear'st thou music sadly? +Sweets with sweets war not, joy delights in joy. +Why lovest thou that which thou receivest not gladly, +Or else receivest with pleasure thine annoy? +If the true concord of well-tuned sounds, +By unions married, do offend thine ear, +They do but sweetly chide thee, who confounds +In singleness the parts that thou shouldst bear. +Mark how one string, sweet husband to another, +Strikes each in each by mutual ordering, +Resembling sire and child and happy mother +Who all in one, one pleasing note do sing: +Whose speechless song, being many, seeming one, +Sings this to thee: 'thou single wilt prove none.' +Is it for fear to wet a widow's eye +That thou consumest thyself in single life? +Ah! if thou issueless shalt hap to die. +The world will wail thee, like a makeless wife; +The world will be thy widow and still weep +That thou no form of thee hast left behind, +When every private widow well may keep +By children's eyes her husband's shape in mind. +Look, what an unthrift in the world doth spend +Shifts but his place, for still the world enjoys it; +But beauty's waste hath in the world an end, +And kept unused, the user so destroys it. +No love toward others in that bosom sits +That on himself such murderous shame commits. +For shame! deny that thou bear'st love to any, +Who for thyself art so unprovident. 
+Grant, if thou wilt, thou art beloved of many, +But that thou none lovest is most evident; +For thou art so possess'd with murderous hate +That 'gainst thyself thou stick'st not to conspire. +Seeking that beauteous roof to ruinate +Which to repair should be thy chief desire. +O, change thy thought, that I may change my mind! +Shall hate be fairer lodged than gentle love? +Be, as thy presence is, gracious and kind, +Or to thyself at least kind-hearted prove: +Make thee another self, for love of me, +That beauty still may live in thine or thee. +As fast as thou shalt wane, so fast thou growest +In one of thine, from that which thou departest; +And that fresh blood which youngly thou bestowest +Thou mayst call thine when thou from youth convertest. +Herein lives wisdom, beauty and increase: +Without this, folly, age and cold decay: +If all were minded so, the times should cease +And threescore year would make the world away. +Let those whom Nature hath not made for store, +Harsh featureless and rude, barrenly perish: +Look, whom she best endow'd she gave the more; +Which bounteous gift thou shouldst in bounty cherish: +She carved thee for her seal, and meant thereby +Thou shouldst print more, not let that copy die. +When I do count the clock that tells the time, +And see the brave day sunk in hideous night; +When I behold the violet past prime, +And sable curls all silver'd o'er with white; +When lofty trees I see barren of leaves +Which erst from heat did canopy the herd, +And summer's green all girded up in sheaves +Borne on the bier with white and bristly beard, +Then of thy beauty do I question make, +That thou among the wastes of time must go, +Since sweets and beauties do themselves forsake +And die as fast as they see others grow; +And nothing 'gainst Time's scythe can make defence +Save breed, to brave him when he takes thee hence. +O, that you were yourself! but, love, you are +No longer yours than you yourself here live: +Against this coming end you should prepare, +And your sweet semblance to some other give. +So should that beauty which you hold in lease +Find no determination: then you were +Yourself again after yourself's decease, +When your sweet issue your sweet form should bear. +Who lets so fair a house fall to decay, +Which husbandry in honour might uphold +Against the stormy gusts of winter's day +And barren rage of death's eternal cold? +O, none but unthrifts! Dear my love, you know +You had a father: let your son say so. +Not from the stars do I my judgment pluck; +And yet methinks I have astronomy, +But not to tell of good or evil luck, +Of plagues, of dearths, or seasons' quality; +Nor can I fortune to brief minutes tell, +Pointing to each his thunder, rain and wind, +Or say with princes if it shall go well, +By oft predict that I in heaven find: +But from thine eyes my knowledge I derive, +And, constant stars, in them I read such art +As truth and beauty shall together thrive, +If from thyself to store thou wouldst convert; +Or else of thee this I prognosticate: +Thy end is truth's and beauty's doom and date. 
+When I consider every thing that grows +Holds in perfection but a little moment, +That this huge stage presenteth nought but shows +Whereon the stars in secret influence comment; +When I perceive that men as plants increase, +Cheered and cheque'd even by the self-same sky, +Vaunt in their youthful sap, at height decrease, +And wear their brave state out of memory; +Then the conceit of this inconstant stay +Sets you most rich in youth before my sight, +Where wasteful Time debateth with Decay, +To change your day of youth to sullied night; +And all in war with Time for love of you, +As he takes from you, I engraft you new. +But wherefore do not you a mightier way +Make war upon this bloody tyrant, Time? +And fortify yourself in your decay +With means more blessed than my barren rhyme? +Now stand you on the top of happy hours, +And many maiden gardens yet unset +With virtuous wish would bear your living flowers, +Much liker than your painted counterfeit: +So should the lines of life that life repair, +Which this, Time's pencil, or my pupil pen, +Neither in inward worth nor outward fair, +Can make you live yourself in eyes of men. +To give away yourself keeps yourself still, +And you must live, drawn by your own sweet skill. +Who will believe my verse in time to come, +If it were fill'd with your most high deserts? +Though yet, heaven knows, it is but as a tomb +Which hides your life and shows not half your parts. +If I could write the beauty of your eyes +And in fresh numbers number all your graces, +The age to come would say 'This poet lies: +Such heavenly touches ne'er touch'd earthly faces.' +So should my papers yellow'd with their age +Be scorn'd like old men of less truth than tongue, +And your true rights be term'd a poet's rage +And stretched metre of an antique song: +But were some child of yours alive that time, +You should live twice; in it and in my rhyme. +Shall I compare thee to a summer's day? +Thou art more lovely and more temperate: +Rough winds do shake the darling buds of May, +And summer's lease hath all too short a date: +Sometime too hot the eye of heaven shines, +And often is his gold complexion dimm'd; +And every fair from fair sometime declines, +By chance or nature's changing course untrimm'd; +But thy eternal summer shall not fade +Nor lose possession of that fair thou owest; +Nor shall Death brag thou wander'st in his shade, +When in eternal lines to time thou growest: +So long as men can breathe or eyes can see, +So long lives this and this gives life to thee. +Devouring Time, blunt thou the lion's paws, +And make the earth devour her own sweet brood; +Pluck the keen teeth from the fierce tiger's jaws, +And burn the long-lived phoenix in her blood; +Make glad and sorry seasons as thou fleets, +And do whate'er thou wilt, swift-footed Time, +To the wide world and all her fading sweets; +But I forbid thee one most heinous crime: +O, carve not with thy hours my love's fair brow, +Nor draw no lines there with thine antique pen; +Him in thy course untainted do allow +For beauty's pattern to succeeding men. +Yet, do thy worst, old Time: despite thy wrong, +My love shall in my verse ever live young. 
+A woman's face with Nature's own hand painted +Hast thou, the master-mistress of my passion; +A woman's gentle heart, but not acquainted +With shifting change, as is false women's fashion; +An eye more bright than theirs, less false in rolling, +Gilding the object whereupon it gazeth; +A man in hue, all 'hues' in his controlling, +Much steals men's eyes and women's souls amazeth. +And for a woman wert thou first created; +Till Nature, as she wrought thee, fell a-doting, +And by addition me of thee defeated, +By adding one thing to my purpose nothing. +But since she prick'd thee out for women's pleasure, +Mine be thy love and thy love's use their treasure. +So is it not with me as with that Muse +Stirr'd by a painted beauty to his verse, +Who heaven itself for ornament doth use +And every fair with his fair doth rehearse +Making a couplement of proud compare, +With sun and moon, with earth and sea's rich gems, +With April's first-born flowers, and all things rare +That heaven's air in this huge rondure hems. +O' let me, true in love, but truly write, +And then believe me, my love is as fair +As any mother's child, though not so bright +As those gold candles fix'd in heaven's air: +Let them say more than like of hearsay well; +I will not praise that purpose not to sell. +My glass shall not persuade me I am old, +So long as youth and thou are of one date; +But when in thee time's furrows I behold, +Then look I death my days should expiate. +For all that beauty that doth cover thee +Is but the seemly raiment of my heart, +Which in thy breast doth live, as thine in me: +How can I then be elder than thou art? +O, therefore, love, be of thyself so wary +As I, not for myself, but for thee will; +Bearing thy heart, which I will keep so chary +As tender nurse her babe from faring ill. +Presume not on thy heart when mine is slain; +Thou gavest me thine, not to give back again. +As an unperfect actor on the stage +Who with his fear is put besides his part, +Or some fierce thing replete with too much rage, +Whose strength's abundance weakens his own heart. +So I, for fear of trust, forget to say +The perfect ceremony of love's rite, +And in mine own love's strength seem to decay, +O'ercharged with burden of mine own love's might. +O, let my books be then the eloquence +And dumb presagers of my speaking breast, +Who plead for love and look for recompense +More than that tongue that more hath more express'd. +O, learn to read what silent love hath writ: +To hear with eyes belongs to love's fine wit. +Mine eye hath play'd the painter and hath stell'd +Thy beauty's form in table of my heart; +My body is the frame wherein 'tis held, +And perspective it is the painter's art. +For through the painter must you see his skill, +To find where your true image pictured lies; +Which in my bosom's shop is hanging still, +That hath his windows glazed with thine eyes. +Now see what good turns eyes for eyes have done: +Mine eyes have drawn thy shape, and thine for me +Are windows to my breast, where-through the sun +Delights to peep, to gaze therein on thee; +Yet eyes this cunning want to grace their art; +They draw but what they see, know not the heart. +Let those who are in favour with their stars +Of public honour and proud titles boast, +Whilst I, whom fortune of such triumph bars, +Unlook'd for joy in that I honour most. +Great princes' favourites their fair leaves spread +But as the marigold at the sun's eye, +And in themselves their pride lies buried, +For at a frown they in their glory die. 
+The painful warrior famoused for fight, +After a thousand victories once foil'd, +Is from the book of honour razed quite, +And all the rest forgot for which he toil'd: +Then happy I, that love and am beloved +Where I may not remove nor be removed. +Lord of my love, to whom in vassalage +Thy merit hath my duty strongly knit, +To thee I send this written embassage, +To witness duty, not to show my wit: +Duty so great, which wit so poor as mine +May make seem bare, in wanting words to show it, +But that I hope some good conceit of thine +In thy soul's thought, all naked, will bestow it; +Till whatsoever star that guides my moving +Points on me graciously with fair aspect +And puts apparel on my tatter'd loving, +To show me worthy of thy sweet respect: +Then may I dare to boast how I do love thee; +Till then not show my head where thou mayst prove me. +Weary with toil, I haste me to my bed, +The dear repose for limbs with travel tired; +But then begins a journey in my head, +To work my mind, when body's work's expired: +For then my thoughts, from far where I abide, +Intend a zealous pilgrimage to thee, +And keep my drooping eyelids open wide, +Looking on darkness which the blind do see +Save that my soul's imaginary sight +Presents thy shadow to my sightless view, +Which, like a jewel hung in ghastly night, +Makes black night beauteous and her old face new. +Lo! thus, by day my limbs, by night my mind, +For thee and for myself no quiet find. +How can I then return in happy plight, +That am debarr'd the benefit of rest? +When day's oppression is not eased by night, +But day by night, and night by day, oppress'd? +And each, though enemies to either's reign, +Do in consent shake hands to torture me; +The one by toil, the other to complain +How far I toil, still farther off from thee. +I tell the day, to please them thou art bright +And dost him grace when clouds do blot the heaven: +So flatter I the swart-complexion'd night, +When sparkling stars twire not thou gild'st the even. +But day doth daily draw my sorrows longer +And night doth nightly make grief's strength seem stronger. +When, in disgrace with fortune and men's eyes, +I all alone beweep my outcast state +And trouble deal heaven with my bootless cries +And look upon myself and curse my fate, +Wishing me like to one more rich in hope, +Featured like him, like him with friends possess'd, +Desiring this man's art and that man's scope, +With what I most enjoy contented least; +Yet in these thoughts myself almost despising, +Haply I think on thee, and then my state, +Like to the lark at break of day arising +From sullen earth, sings hymns at heaven's gate; +For thy sweet love remember'd such wealth brings +That then I scorn to change my state with kings. +When to the sessions of sweet silent thought +I summon up remembrance of things past, +I sigh the lack of many a thing I sought, +And with old woes new wail my dear time's waste: +Then can I drown an eye, unused to flow, +For precious friends hid in death's dateless night, +And weep afresh love's long since cancell'd woe, +And moan the expense of many a vanish'd sight: +Then can I grieve at grievances foregone, +And heavily from woe to woe tell o'er +The sad account of fore-bemoaned moan, +Which I new pay as if not paid before. +But if the while I think on thee, dear friend, +All losses are restored and sorrows end. 
+Thy bosom is endeared with all hearts, +Which I by lacking have supposed dead, +And there reigns love and all love's loving parts, +And all those friends which I thought buried. +How many a holy and obsequious tear +Hath dear religious love stol'n from mine eye +As interest of the dead, which now appear +But things removed that hidden in thee lie! +Thou art the grave where buried love doth live, +Hung with the trophies of my lovers gone, +Who all their parts of me to thee did give; +That due of many now is thine alone: +Their images I loved I view in thee, +And thou, all they, hast all the all of me. +If thou survive my well-contented day, +When that churl Death my bones with dust shall cover, +And shalt by fortune once more re-survey +These poor rude lines of thy deceased lover, +Compare them with the bettering of the time, +And though they be outstripp'd by every pen, +Reserve them for my love, not for their rhyme, +Exceeded by the height of happier men. +O, then vouchsafe me but this loving thought: +'Had my friend's Muse grown with this growing age, +A dearer birth than this his love had brought, +To march in ranks of better equipage: +But since he died and poets better prove, +Theirs for their style I'll read, his for his love.' +Full many a glorious morning have I seen +Flatter the mountain-tops with sovereign eye, +Kissing with golden face the meadows green, +Gilding pale streams with heavenly alchemy; +Anon permit the basest clouds to ride +With ugly rack on his celestial face, +And from the forlorn world his visage hide, +Stealing unseen to west with this disgrace: +Even so my sun one early morn did shine +With all triumphant splendor on my brow; +But out, alack! he was but one hour mine; +The region cloud hath mask'd him from me now. +Yet him for this my love no whit disdaineth; +Suns of the world may stain when heaven's sun staineth. +Why didst thou promise such a beauteous day, +And make me travel forth without my cloak, +To let base clouds o'ertake me in my way, +Hiding thy bravery in their rotten smoke? +'Tis not enough that through the cloud thou break, +To dry the rain on my storm-beaten face, +For no man well of such a salve can speak +That heals the wound and cures not the disgrace: +Nor can thy shame give physic to my grief; +Though thou repent, yet I have still the loss: +The offender's sorrow lends but weak relief +To him that bears the strong offence's cross. +Ah! but those tears are pearl which thy love sheds, +And they are rich and ransom all ill deeds. +No more be grieved at that which thou hast done: +Roses have thorns, and silver fountains mud; +Clouds and eclipses stain both moon and sun, +And loathsome canker lives in sweetest bud. +All men make faults, and even I in this, +Authorizing thy trespass with compare, +Myself corrupting, salving thy amiss, +Excusing thy sins more than thy sins are; +For to thy sensual fault I bring in sense-- +Thy adverse party is thy advocate-- +And 'gainst myself a lawful plea commence: +Such civil war is in my love and hate +That I an accessary needs must be +To that sweet thief which sourly robs from me. +Let me confess that we two must be twain, +Although our undivided loves are one: +So shall those blots that do with me remain +Without thy help by me be borne alone. +In our two loves there is but one respect, +Though in our lives a separable spite, +Which though it alter not love's sole effect, +Yet doth it steal sweet hours from love's delight. 
+I may not evermore acknowledge thee, +Lest my bewailed guilt should do thee shame, +Nor thou with public kindness honour me, +Unless thou take that honour from thy name: +But do not so; I love thee in such sort +As, thou being mine, mine is thy good report. +As a decrepit father takes delight +To see his active child do deeds of youth, +So I, made lame by fortune's dearest spite, +Take all my comfort of thy worth and truth. +For whether beauty, birth, or wealth, or wit, +Or any of these all, or all, or more, +Entitled in thy parts do crowned sit, +I make my love engrafted to this store: +So then I am not lame, poor, nor despised, +Whilst that this shadow doth such substance give +That I in thy abundance am sufficed +And by a part of all thy glory live. +Look, what is best, that best I wish in thee: +This wish I have; then ten times happy me! \ No newline at end of file From f2c67c9c06cfc7287298f85c1df6cf5a4352e4e7 Mon Sep 17 00:00:00 2001 From: wangyu31577 Date: Mon, 15 Dec 2025 19:23:28 +0800 Subject: [PATCH 08/26] Merge branch 'main' of https://github.com/yenuo26/vllm-omni into main # Conflicts: # docs/api/README.md # docs/user_guide/examples/offline_inference/text_to_image.md --- tests/e2e/online_serving/test_qwen2_5_omni.py | 51 ++----------------- 1 file changed, 4 insertions(+), 47 deletions(-) diff --git a/tests/e2e/online_serving/test_qwen2_5_omni.py b/tests/e2e/online_serving/test_qwen2_5_omni.py index 2ccee9f84..7521da469 100644 --- a/tests/e2e/online_serving/test_qwen2_5_omni.py +++ b/tests/e2e/online_serving/test_qwen2_5_omni.py @@ -10,16 +10,16 @@ import openai import pytest +import time from vllm.assets.video import VideoAsset from tests.conftest import OmniServer os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" -models = ["Qwen/Qwen2.5-Omni-3B"] +models = ["Qwen/Qwen2.5-Omni-7B"] # CI stage config for 2*H100-80G GPUs stage_configs = [str(Path(__file__).parent / "stage_configs" / "qwen2_5_omni_ci.yaml")] - # Create parameter combinations for model and stage config test_params = [(model, stage_config) for model in models for stage_config in stage_configs] @@ -33,50 +33,6 @@ def client(omni_server): ) -@pytest.fixture(scope="session") -def base64_encoded_video() -> str: - """Base64 encoded video for testing.""" - import base64 - - video = VideoAsset(name="baby_reading", num_frames=4) - with open(video.video_path, "rb") as f: - content = f.read() - return base64.b64encode(content).decode("utf-8") - - -def get_system_prompt(): - return { - "role": "system", - "content": [ - { - "type": "text", - "text": ( - "You are Qwen, a virtual human developed by the Qwen Team, " - "Alibaba Group, capable of perceiving auditory and visual inputs, " - "as well as generating text and speech." 
- ), - } - ], - } - - -def dummy_messages_from_video_data( - video_data_url: str, - content_text: str = "Describe the video briefly.", -): - """Create messages with video data URL for OpenAI API.""" - return [ - get_system_prompt(), - { - "role": "user", - "content": [ - {"type": "video_url", "video_url": {"url": video_data_url}}, - {"type": "text", "text": content_text}, - ], - }, - ] - - @pytest.mark.parametrize("test_param", test_params) def test_video_to_audio( test_param, @@ -84,5 +40,6 @@ def test_video_to_audio( """Test processing video, generating audio output via OpenAI API.""" # Create data URL for the base64 encoded video model, stage_config_path = test_param - with OmniServer(model, ["--stage-configs-path", stage_config_path]) as server: + with OmniServer(model, []) as server: + time.sleep(1000000) pass From 94875c11772023689cbe08ae0f32668db2db0d0b Mon Sep 17 00:00:00 2001 From: wangyu31577 Date: Mon, 15 Dec 2025 19:56:24 +0800 Subject: [PATCH 09/26] =?UTF-8?q?=E6=8F=90=E4=BA=A4benchmark?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- benchmarks/README.md | 395 +---- benchmarks/auto_tune/README.md | 218 +++ benchmarks/{ => auto_tune}/auto_tune.sh | 209 ++- benchmarks/auto_tune/batch_auto_tune.sh | 128 ++ benchmarks/backend_request_func.py | 48 +- benchmarks/benchmark_block_pool.py | 74 + benchmarks/benchmark_dataset.py | 1167 ------------ benchmarks/benchmark_latency.py | 185 +- .../benchmark_long_document_qa_throughput.py | 8 +- benchmarks/benchmark_ngram_proposer.py | 213 +++ benchmarks/benchmark_prefix_caching.py | 8 +- benchmarks/benchmark_prioritization.py | 8 +- benchmarks/benchmark_serving.py | 1233 +------------ .../benchmark_serving_structured_output.py | 62 +- benchmarks/benchmark_throughput.py | 729 +------- benchmarks/benchmark_utils.py | 55 +- .../cutlass_benchmarks/w8a8_benchmarks.py | 11 +- .../disagg_overhead_benchmark.sh | 66 +- .../disagg_performance_benchmark.sh | 36 +- .../disagg_prefill_proxy_server.py | 240 ++- benchmarks/disagg_benchmarks/rate_limiter.py | 45 + benchmarks/disagg_benchmarks/request_queue.py | 39 + benchmarks/kernels/bench_block_fp8_gemm.py | 145 ++ benchmarks/kernels/bench_fp8_gemm.py | 248 +-- benchmarks/kernels/bench_int8_gemm.py | 169 ++ benchmarks/kernels/bench_nvfp4_gemm.py | 198 +++ .../kernels/bench_per_token_quant_fp8.py | 269 +++ benchmarks/kernels/benchmark_activation.py | 104 ++ benchmarks/kernels/benchmark_aqlm.py | 345 ---- benchmarks/kernels/benchmark_bitblas.py | 4 +- .../kernels/benchmark_cutlass_fp4_moe.py | 58 +- .../kernels/benchmark_cutlass_moe_fp8.py | 406 +++++ .../kernels/benchmark_device_communicators.py | 508 ++++++ .../kernels/benchmark_grouped_gemm_cutlass.py | 79 +- benchmarks/kernels/benchmark_lora.py | 18 +- benchmarks/kernels/benchmark_machete.py | 59 +- benchmarks/kernels/benchmark_marlin.py | 229 ++- benchmarks/kernels/benchmark_moe.py | 140 +- .../kernels/benchmark_moe_align_block_size.py | 74 + .../benchmark_moe_permute_unpermute.py | 77 +- benchmarks/kernels/benchmark_mrope.py | 328 ++++ .../benchmark_per_token_group_quant.py | 159 ++ benchmarks/kernels/benchmark_polynorm.py | 155 ++ .../benchmark_reshape_and_cache_flash.py | 212 +++ .../kernels/benchmark_silu_mul_fp8_quant.py | 675 +++++++ .../benchmark_trtllm_decode_attention.py | 293 +++ .../benchmark_trtllm_prefill_attention.py | 308 ++++ .../kernels/benchmark_w8a8_block_fp8.py | 7 +- benchmarks/kernels/deepgemm/README.md | 4 +- .../benchmark_fp8_block_dense_gemm.py | 60 +- 
benchmarks/kernels/weight_shapes.py | 6 + benchmarks/multi_turn/README.md | 174 ++ benchmarks/multi_turn/bench_dataset.py | 588 ++++++ benchmarks/multi_turn/bench_utils.py | 28 + .../benchmark_serving_multi_turn.py | 1569 +++++++++++++++++ .../multi_turn/convert_sharegpt_to_openai.py | 354 ++++ benchmarks/multi_turn/requirements.txt | 5 + 57 files changed, 8566 insertions(+), 4666 deletions(-) create mode 100644 benchmarks/auto_tune/README.md rename benchmarks/{ => auto_tune}/auto_tune.sh (54%) create mode 100644 benchmarks/auto_tune/batch_auto_tune.sh create mode 100644 benchmarks/benchmark_block_pool.py delete mode 100644 benchmarks/benchmark_dataset.py create mode 100644 benchmarks/benchmark_ngram_proposer.py create mode 100644 benchmarks/disagg_benchmarks/rate_limiter.py create mode 100644 benchmarks/disagg_benchmarks/request_queue.py create mode 100644 benchmarks/kernels/bench_block_fp8_gemm.py create mode 100644 benchmarks/kernels/bench_int8_gemm.py create mode 100644 benchmarks/kernels/bench_nvfp4_gemm.py create mode 100644 benchmarks/kernels/bench_per_token_quant_fp8.py create mode 100644 benchmarks/kernels/benchmark_activation.py delete mode 100644 benchmarks/kernels/benchmark_aqlm.py create mode 100644 benchmarks/kernels/benchmark_cutlass_moe_fp8.py create mode 100644 benchmarks/kernels/benchmark_device_communicators.py create mode 100644 benchmarks/kernels/benchmark_moe_align_block_size.py create mode 100644 benchmarks/kernels/benchmark_mrope.py create mode 100644 benchmarks/kernels/benchmark_per_token_group_quant.py create mode 100644 benchmarks/kernels/benchmark_polynorm.py create mode 100644 benchmarks/kernels/benchmark_reshape_and_cache_flash.py create mode 100644 benchmarks/kernels/benchmark_silu_mul_fp8_quant.py create mode 100644 benchmarks/kernels/benchmark_trtllm_decode_attention.py create mode 100644 benchmarks/kernels/benchmark_trtllm_prefill_attention.py create mode 100644 benchmarks/multi_turn/README.md create mode 100644 benchmarks/multi_turn/bench_dataset.py create mode 100644 benchmarks/multi_turn/bench_utils.py create mode 100644 benchmarks/multi_turn/benchmark_serving_multi_turn.py create mode 100644 benchmarks/multi_turn/convert_sharegpt_to_openai.py create mode 100644 benchmarks/multi_turn/requirements.txt diff --git a/benchmarks/README.md b/benchmarks/README.md index 6f9fbb91c..269a4d51e 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -1,389 +1,20 @@ -# Benchmarking vLLM +# Benchmarks -This README guides you through running benchmark tests with the extensive -datasets supported on vLLM. It’s a living document, updated as new features and datasets -become available. +This directory used to contain vLLM's benchmark scripts and utilities for performance testing and evaluation. -## Dataset Overview +## Contents - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
| Dataset | Online | Offline | Data Path |
|---------|--------|---------|-----------|
| ShareGPT | ✅ | ✅ | `wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json` |
| BurstGPT | ✅ | ✅ | `wget https://github.com/HPMLL/BurstGPT/releases/download/v1.1/BurstGPT_without_fails_2.csv` |
| Sonnet | ✅ | ✅ | Local file: `benchmarks/sonnet.txt` |
| Random | ✅ | ✅ | `synthetic` |
| HuggingFace-VisionArena | ✅ | ✅ | `lmarena-ai/VisionArena-Chat` |
| HuggingFace-InstructCoder | ✅ | ✅ | `likaixin/InstructCoder` |
| HuggingFace-AIMO | ✅ | ✅ | `AI-MO/aimo-validation-aime`, `AI-MO/NuminaMath-1.5`, `AI-MO/NuminaMath-CoT` |
| HuggingFace-Other | ✅ | ✅ | `lmms-lab/LLaVA-OneVision-Data`, `Aeala/ShareGPT_Vicuna_unfiltered` |
| Custom | ✅ | ✅ | Local file: `data.jsonl` |
+- **Serving benchmarks**: Scripts for testing online inference performance (latency, throughput) +- **Throughput benchmarks**: Scripts for testing offline batch inference performance +- **Specialized benchmarks**: Tools for testing specific features like structured output, prefix caching, long document QA, request prioritization, and multi-modal inference +- **Dataset utilities**: Framework for loading and sampling from various benchmark datasets (ShareGPT, HuggingFace datasets, synthetic data, etc.) -✅: supported +## Usage -🟡: Partial support +For detailed usage instructions, examples, and dataset information, see the [Benchmark CLI documentation](https://docs.vllm.ai/en/latest/contributing/benchmarks.html#benchmark-cli). -🚧: to be supported +For full CLI reference see: -**Note**: HuggingFace dataset's `dataset-name` should be set to `hf` - ---- -## Example - Online Benchmark - -First start serving your model - -```bash -vllm serve NousResearch/Hermes-3-Llama-3.1-8B --disable-log-requests -``` - -Then run the benchmarking script - -```bash -# download dataset -# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json -python3 vllm/benchmarks/benchmark_serving.py \ - --backend vllm \ - --model NousResearch/Hermes-3-Llama-3.1-8B \ - --endpoint /v1/completions \ - --dataset-name sharegpt \ - --dataset-path /ShareGPT_V3_unfiltered_cleaned_split.json \ - --num-prompts 10 -``` - -If successful, you will see the following output - -``` -============ Serving Benchmark Result ============ -Successful requests: 10 -Benchmark duration (s): 5.78 -Total input tokens: 1369 -Total generated tokens: 2212 -Request throughput (req/s): 1.73 -Output token throughput (tok/s): 382.89 -Total Token throughput (tok/s): 619.85 ----------------Time to First Token---------------- -Mean TTFT (ms): 71.54 -Median TTFT (ms): 73.88 -P99 TTFT (ms): 79.49 ------Time per Output Token (excl. 1st token)------ -Mean TPOT (ms): 7.91 -Median TPOT (ms): 7.96 -P99 TPOT (ms): 8.03 ----------------Inter-token Latency---------------- -Mean ITL (ms): 7.74 -Median ITL (ms): 7.70 -P99 ITL (ms): 8.39 -================================================== -``` - -### Custom Dataset -If the dataset you want to benchmark is not supported yet in vLLM, even then you can benchmark on it using `CustomDataset`. Your data needs to be in `.jsonl` format and needs to have "prompt" field per entry, e.g., data.jsonl - -``` -{"prompt": "What is the capital of India?"} -{"prompt": "What is the capital of Iran?"} -{"prompt": "What is the capital of China?"} -``` - -```bash -# start server -VLLM_USE_V1=1 vllm serve meta-llama/Llama-3.1-8B-Instruct --disable-log-requests -``` - -```bash -# run benchmarking script -python3 benchmarks/benchmark_serving.py --port 9001 --save-result --save-detailed \ - --backend vllm \ - --model meta-llama/Llama-3.1-8B-Instruct \ - --endpoint /v1/completions \ - --dataset-name custom \ - --dataset-path \ - --custom-skip-chat-template \ - --num-prompts 80 \ - --max-concurrency 1 \ - --temperature=0.3 \ - --top-p=0.75 \ - --result-dir "./log/" -``` - -You can skip applying chat template if your data already has it by using `--custom-skip-chat-template`. 
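As a quick illustration of the expected file format, a `data.jsonl` like the one above can be produced with a short script. This is a minimal sketch: the output path `data.jsonl` and the prompt strings are placeholders, not part of the benchmark suite itself.

```python
# Minimal sketch: write a CustomDataset-compatible JSONL file.
# "data.jsonl" and the prompts below are illustrative placeholders.
import json

prompts = [
    "What is the capital of India?",
    "What is the capital of Iran?",
    "What is the capital of China?",
]

with open("data.jsonl", "w", encoding="utf-8") as f:
    for prompt in prompts:
        # One JSON object per line, each carrying a "prompt" field.
        f.write(json.dumps({"prompt": prompt}) + "\n")
```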
- -### VisionArena Benchmark for Vision Language Models - -```bash -# need a model with vision capability here -vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests -``` - -```bash -python3 vllm/benchmarks/benchmark_serving.py \ - --backend openai-chat \ - --model Qwen/Qwen2-VL-7B-Instruct \ - --endpoint /v1/chat/completions \ - --dataset-name hf \ - --dataset-path lmarena-ai/VisionArena-Chat \ - --hf-split train \ - --num-prompts 1000 -``` - -### InstructCoder Benchmark with Speculative Decoding - -``` bash -VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \ - --speculative-config $'{"method": "ngram", - "num_speculative_tokens": 5, "prompt_lookup_max": 5, - "prompt_lookup_min": 2}' -``` - -``` bash -python3 benchmarks/benchmark_serving.py \ - --model meta-llama/Meta-Llama-3-8B-Instruct \ - --dataset-name hf \ - --dataset-path likaixin/InstructCoder \ - --num-prompts 2048 -``` - -### Other HuggingFaceDataset Examples - -```bash -vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests -``` - -**`lmms-lab/LLaVA-OneVision-Data`** - -```bash -python3 vllm/benchmarks/benchmark_serving.py \ - --backend openai-chat \ - --model Qwen/Qwen2-VL-7B-Instruct \ - --endpoint /v1/chat/completions \ - --dataset-name hf \ - --dataset-path lmms-lab/LLaVA-OneVision-Data \ - --hf-split train \ - --hf-subset "chart2text(cauldron)" \ - --num-prompts 10 -``` - -**`Aeala/ShareGPT_Vicuna_unfiltered`** - -```bash -python3 vllm/benchmarks/benchmark_serving.py \ - --backend openai-chat \ - --model Qwen/Qwen2-VL-7B-Instruct \ - --endpoint /v1/chat/completions \ - --dataset-name hf \ - --dataset-path Aeala/ShareGPT_Vicuna_unfiltered \ - --hf-split train \ - --num-prompts 10 -``` - -**`AI-MO/aimo-validation-aime`** - -``` bash -python3 vllm/benchmarks/benchmark_serving.py \ - --model Qwen/QwQ-32B \ - --dataset-name hf \ - --dataset-path AI-MO/aimo-validation-aime \ - --num-prompts 10 \ - --seed 42 -``` - -**`philschmid/mt-bench`** - -``` bash -python3 vllm/benchmarks/benchmark_serving.py \ - --model Qwen/QwQ-32B \ - --dataset-name hf \ - --dataset-path philschmid/mt-bench \ - --num-prompts 80 -``` - -### Running With Sampling Parameters - -When using OpenAI-compatible backends such as `vllm`, optional sampling -parameters can be specified. 
Example client command: - -```bash -python3 vllm/benchmarks/benchmark_serving.py \ - --backend vllm \ - --model NousResearch/Hermes-3-Llama-3.1-8B \ - --endpoint /v1/completions \ - --dataset-name sharegpt \ - --dataset-path /ShareGPT_V3_unfiltered_cleaned_split.json \ - --top-k 10 \ - --top-p 0.9 \ - --temperature 0.5 \ - --num-prompts 10 -``` - ---- -## Example - Offline Throughput Benchmark - -```bash -python3 vllm/benchmarks/benchmark_throughput.py \ - --model NousResearch/Hermes-3-Llama-3.1-8B \ - --dataset-name sonnet \ - --dataset-path vllm/benchmarks/sonnet.txt \ - --num-prompts 10 -``` - -If successful, you will see the following output - -``` -Throughput: 7.15 requests/s, 4656.00 total tokens/s, 1072.15 output tokens/s -Total num prompt tokens: 5014 -Total num output tokens: 1500 -``` - -### VisionArena Benchmark for Vision Language Models - -``` bash -python3 vllm/benchmarks/benchmark_throughput.py \ - --model Qwen/Qwen2-VL-7B-Instruct \ - --backend vllm-chat \ - --dataset-name hf \ - --dataset-path lmarena-ai/VisionArena-Chat \ - --num-prompts 1000 \ - --hf-split train -``` - -The `num prompt tokens` now includes image token counts - -``` -Throughput: 2.55 requests/s, 4036.92 total tokens/s, 326.90 output tokens/s -Total num prompt tokens: 14527 -Total num output tokens: 1280 -``` - -### InstructCoder Benchmark with Speculative Decoding - -``` bash -VLLM_WORKER_MULTIPROC_METHOD=spawn \ -VLLM_USE_V1=1 \ -python3 vllm/benchmarks/benchmark_throughput.py \ - --dataset-name=hf \ - --dataset-path=likaixin/InstructCoder \ - --model=meta-llama/Meta-Llama-3-8B-Instruct \ - --input-len=1000 \ - --output-len=100 \ - --num-prompts=2048 \ - --async-engine \ - --speculative-config $'{"method": "ngram", - "num_speculative_tokens": 5, "prompt_lookup_max": 5, - "prompt_lookup_min": 2}' -``` - -``` -Throughput: 104.77 requests/s, 23836.22 total tokens/s, 10477.10 output tokens/s -Total num prompt tokens: 261136 -Total num output tokens: 204800 -``` - -### Other HuggingFaceDataset Examples - -**`lmms-lab/LLaVA-OneVision-Data`** - -```bash -python3 vllm/benchmarks/benchmark_throughput.py \ - --model Qwen/Qwen2-VL-7B-Instruct \ - --backend vllm-chat \ - --dataset-name hf \ - --dataset-path lmms-lab/LLaVA-OneVision-Data \ - --hf-split train \ - --hf-subset "chart2text(cauldron)" \ - --num-prompts 10 -``` - -**`Aeala/ShareGPT_Vicuna_unfiltered`** - -```bash -python3 vllm/benchmarks/benchmark_throughput.py \ - --model Qwen/Qwen2-VL-7B-Instruct \ - --backend vllm-chat \ - --dataset-name hf \ - --dataset-path Aeala/ShareGPT_Vicuna_unfiltered \ - --hf-split train \ - --num-prompts 10 -``` - -**`AI-MO/aimo-validation-aime`** - -```bash -python3 benchmarks/benchmark_throughput.py \ - --model Qwen/QwQ-32B \ - --backend vllm \ - --dataset-name hf \ - --dataset-path AI-MO/aimo-validation-aime \ - --hf-split train \ - --num-prompts 10 -``` - -### Benchmark with LoRA Adapters - -``` bash -# download dataset -# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json -python3 vllm/benchmarks/benchmark_throughput.py \ - --model meta-llama/Llama-2-7b-hf \ - --backend vllm \ - --dataset_path /ShareGPT_V3_unfiltered_cleaned_split.json \ - --dataset_name sharegpt \ - --num-prompts 10 \ - --max-loras 2 \ - --max-lora-rank 8 \ - --enable-lora \ - --lora-path yard1/llama-2-7b-sql-lora-test - ``` +- +- +- diff --git a/benchmarks/auto_tune/README.md b/benchmarks/auto_tune/README.md new file mode 100644 index 000000000..d1bdb4c43 --- /dev/null +++ 
b/benchmarks/auto_tune/README.md @@ -0,0 +1,218 @@ +# Automated vLLM Server Parameter Tuning + +This script automates the process of finding the optimal server parameter combination (`max-num-seqs` and `max-num-batched-tokens`) to maximize throughput for a vLLM server. It also supports additional constraints such as E2E latency and prefix cache hit rate. + +## Table of Contents + +- [Prerequisites](#prerequisites) +- [Configuration](#configuration) +- [How to Run](#how-to-run) +- [Example Use Cases](#example-use-cases) +- [Output](#output) +- [How It Works](#how-it-works) + +## Prerequisites + +Before running the script, please ensure the following steps are completed: + +1. **Clone vLLM & Set Up Branch**: Clone the vLLM repository and check out to your desired branch. + +```bash +git clone https://github.com/vllm-project/vllm.git +cd vllm +# git checkout +``` + +1. **Install Environment**: Install or update the correct running environment. For TPU usage, activate your `conda` environment and install the corresponding `torch` and `torch_xla` versions. + +2. **Model Configuration**: If you are using a customized model, ensure its configuration files are correctly placed and accessible. + +## Configuration + +You must set the following variables at the top of the script before execution. + + Note: You can also override the default values below via environment variables when running the script. + +```bash +MODEL=meta-llama/Llama-3.3-70B-Instruct SYSTEM=TPU TP=8 DOWNLOAD_DIR='' INPUT_LEN=128 OUTPUT_LEN=2048 MAX_MODEL_LEN=2300 MIN_CACHE_HIT_PCT=0 MAX_LATENCY_ALLOWED_MS=100000000000 NUM_SEQS_LIST="128 256" NUM_BATCHED_TOKENS_LIST="1024 2048 4096" VLLM_LOGGING_LEVEL=DEBUG bash auto_tune.sh +``` + +| Variable | Description | Example Value | +| --- | --- | --- | +| `BASE` | **Required.** The absolute path to the parent directory of your vLLM repository directory. | `"$HOME"` | +| `MODEL` | **Required.** The Hugging Face model identifier to be served by vllm. | `"meta-llama/Llama-3.1-8B-Instruct"` | +| `SYSTEM`| **Required.** The hardware you are running on. Choices: `TPU` or `GPU`. (For other systems, it might not support saving profiles) | `"TPU"` | +| `TP` | **Required.** The tensor-parallelism size. | `1` | +| `DOWNLOAD_DIR` | **Required.** Directory to download and load model weights from. | `""` (default download path) | +| `INPUT_LEN` | **Required.** Request input length. | `4000` | +| `OUTPUT_LEN` | **Required.** Request output length. | `16` | +| `MAX_MODEL_LEN` | **Required.** Max model length. | `4096` | +| `MIN_CACHE_HIT_PCT` | Prefix cache hit rate in percentage (0-100). Set to `0` to disable. | `60` | +| `MAX_LATENCY_ALLOWED_MS` | The maximum allowed P99 end-to-end latency in milliseconds. Set to a very large number (e.g., `100000000000`) to effectively ignore the latency constraint. | `500` | +| `NUM_SEQS_LIST` | A space-separated string of `max-num-seqs` values to test. | `"128 256"` | +| `NUM_BATCHED_TOKENS_LIST` | A space-separated string of `max-num-batched-tokens` values to test. | `"1024 2048 4096"` | + +**Note**: The default `NUM_SEQS_LIST` and `NUM_BATCHED_TOKENS_LIST` are set for medium-sized inputs/outputs. For very short contexts (e.g., 20 input, 20 output tokens), you may need to test larger values for `max-num-seqs`. + +## How to Run + +1. **Configure**: Edit the script and set the variables in the [Configuration](#configuration) section. +2. **Execute**: Run the script. 
Since the process can take a long time, it is highly recommended to use a terminal multiplexer like `tmux` or `screen` to prevent the script from stopping if your connection is lost. + +```bash +cd +bash auto_tune.sh +``` + + Please note that the `bash auto_tune.sh` command cannot contain full or partial path with keyword `vllm`, otherwise `pkill -f vllm` command will also kill this script itself. + +## Example Use Cases + +Here are a few examples of how to configure the script for different goals: + +### 1. Maximize Throughput (No Latency Constraint) + +- **Goal**: Find the best `max-num-seqs` and `max-num-batched-tokens` to get the highest possible throughput for 1800 input tokens and 20 output tokens. +- **Configuration**: + +```bash +INPUT_LEN=1800 +OUTPUT_LEN=20 +MAX_MODEL_LEN=2048 +MIN_CACHE_HIT_PCT=0 +MAX_LATENCY_ALLOWED_MS=100000000000 # A very large number +``` + +#### 2. Maximize Throughput with a Latency Requirement + +- **Goal**: Find the best server parameters when P99 end-to-end latency must be below 500ms. +- **Configuration**: + +```bash +INPUT_LEN=1800 +OUTPUT_LEN=20 +MAX_MODEL_LEN=2048 +MIN_CACHE_HIT_PCT=0 +MAX_LATENCY_ALLOWED_MS=500 +``` + +#### 3. Maximize Throughput with Prefix Caching and Latency Requirements + +- **Goal**: Find the best server parameters assuming a 60% prefix cache hit rate and a latency requirement of 500ms. +- **Configuration**: + +```bash +INPUT_LEN=1800 +OUTPUT_LEN=20 +MAX_MODEL_LEN=2048 +MIN_CACHE_HIT_PCT=60 +MAX_LATENCY_ALLOWED_MS=500 +``` + +## Output + +After the script finishes, you will find the results in a new, timestamped directory created inside `$BASE/auto-benchmark/`. + +- **Log Files**: The directory (`$BASE/auto-benchmark/YYYY_MM_DD_HH_MM/`) contains detailed logs for each run: + - `vllm_log_...txt`: The log output from the vLLM server for each parameter combination. + - `bm_log_...txt`: The log output from the `vllm bench serve` command for each benchmark run. + +- **Final Result Summary**: A file named `result.txt` is created in the log directory. It contains a summary of each tested combination and concludes with the overall best parameters found. + +```text +# Example result.txt content +hash:a1b2c3d4... +max_num_seqs: 128, max_num_batched_tokens: 2048, request_rate: 10.0, e2el: 450.5, throughput: 9.8, goodput: 9.8 +max_num_seqs: 128, max_num_batched_tokens: 4096 does not meet latency requirement 500 +... +best_max_num_seqs: 256, best_num_batched_tokens: 2048, best_throughput: 12.5, profile saved in: /home/user/vllm/auto-benchmark/2024_08_01_10_30/profile +``` + + If it cannot find the best parameters, the final row will be `best_max_num_seqs: 0, best_num_batched_tokens: 0, best_throughput: 0`. This can be due to either the server not starting properly, or the latency requirement being too strict. + +- **Profiler Trace**: A directory named `profile` is created inside the log directory. It contains the profiler trace file (e.g., `.xplane.pb` for TPU or a `.json` trace for GPU) from the single best-performing run. + +## How It Works + +The script follows a systematic process to find the optimal parameters: + +1. **Find Max GPU Memory Utilization**: The script first determines the highest safe `gpu-memory-utilization` (starting from 0.98 and decreasing) that does not cause an Out-Of-Memory (OOM) error when launching the server. This ensures the benchmark runs use the maximum available memory without crashing. + +2. 
**Iterate and Benchmark**: It then enters a nested loop, iterating through every combination of `max-num-seqs` and `max-num-batched-tokens` provided in the configuration lists. + +3. **Latency-Aware Throughput Search**: For each parameter combination: + - The vLLM server is started. + - A benchmark is first run with an infinite request rate (`--request-rate inf`). + - If the resulting P99 E2E latency is within the `MAX_LATENCY_ALLOWED_MS` limit, this throughput is considered the maximum for this configuration. + - If the latency is too high, the script performs a search by iteratively decreasing the request rate until the latency constraint is met. This finds the highest sustainable throughput for the given parameters and latency requirement. + +4. **Track Best Result**: Throughout the process, the script tracks the parameter combination that has yielded the highest valid throughput so far. + +5. **Profile Collection**: For the best-performing run, the script saves the vLLM profiler output, which can be used for deep-dive performance analysis with tools like TensorBoard. + +## Batched `auto_tune` + +The `batch_auto_tune.sh` script allows you to run multiple `auto_tune.sh` experiments sequentially from a single configuration file. It iterates through a list of parameter sets, executes `auto_tune.sh` for each, and records the results back into the input file. + +### Prerequisites + +- **jq**: This script requires `jq` to parse the JSON configuration file. +- **gcloud**: If you plan to upload results to Google Cloud Storage, the `gcloud` CLI must be installed and authenticated. + +### How to Run + +1. **Create a JSON configuration file**: Create a file (e.g., `runs_config.json`) containing an array of JSON objects. Each object defines the parameters for a single `auto_tune.sh` run. + +2. **Execute the script**: + + ```bash + bash batch_auto_tune.sh [gcs_upload_path] + ``` + + - ``: **Required.** Path to your JSON configuration file. + - `[gcs_upload_path]`: **Optional.** A GCS path (e.g., `gs://my-bucket/benchmark-results`) where the detailed results and profiles for each run will be uploaded. If this is empty, the results will be available on the local filesystem (see the log for `RESULT_FILE=/path/to/results/file.txt`). + +### Configuration File + +The JSON configuration file should contain an array of objects. Each object's keys correspond to the configuration variables for `auto_tune.sh` (see the [Configuration table above](#configuration)). These keys will be converted to uppercase environment variables for each run. + +Here is an example `runs_config.json` with two benchmark configurations: + +```json +[ + { + "base": "/home/user", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "system": "TPU", # OR GPU + "tp": 8, + "input_len": 128, + "output_len": 2048, + "max_model_len": 2300, + "num_seqs_list": "128 256", + "num_batched_tokens_list": "8192 16384" + }, + { + "base": "/home/user", + "model": "meta-llama/Llama-3.1-70B-Instruct", + "system": "TPU", # OR GPU + "tp": 8, + "input_len": 4000, + "output_len": 16, + "max_model_len": 4096, + "num_seqs_list": "64 128", + "num_batched_tokens_list": "4096 8192", + "max_latency_allowed_ms": 500 + } +] +``` + +### Output + +The script modifies the input JSON file in place, adding the results of each run to the corresponding object. The following fields are added: + +- `run_id`: A unique identifier for the run, derived from the timestamp. +- `status`: The outcome of the run (`SUCCESS`, `FAILURE`, or `WARNING_NO_RESULT_FILE`). 
+- `results`: The content of the `result.txt` file from the `auto_tune.sh` run. +- `gcs_results`: The GCS URL where the run's artifacts are stored (if a GCS path was provided). + +A summary of successful and failed runs is also printed to the console upon completion. diff --git a/benchmarks/auto_tune.sh b/benchmarks/auto_tune/auto_tune.sh similarity index 54% rename from benchmarks/auto_tune.sh rename to benchmarks/auto_tune/auto_tune.sh index 1b01bbd61..b333ba9cd 100644 --- a/benchmarks/auto_tune.sh +++ b/benchmarks/auto_tune/auto_tune.sh @@ -1,56 +1,50 @@ #!/bin/bash -# This script aims to tune the best server parameter combinations to maximize throughput for given requirement. -# The current server parameter combination is max_num_seqs and max_num_batched_tokens -# It also supports additional requirement: e2e latency and prefix cache. - -# Pre-requisite: -# 1. Checkout to your branch, install/ update the correct running env. For TPU, activate conda env and install the corresponding torch, xla version. -# 2. If the model is customized, replace the MODEL's config with the customized config. -# 3. Set variables (ALL REQUIRED) -# BASE: your directory for vllm repo -# MODEL: the model served by vllm -# TP: ways of tensor parallelism -# DOWNLOAD_DIR: directory to download and load model weights. -# INPUT_LEN: request input len -# OUTPUT_LEN: request output len -# MIN_CACHE_HIT_PCT: prefix cache rate -# MAX_LATENCY_ALLOWED_MS: (e2e) latency requirement. If there's no latency requirement, set it to a large number like 1000000000 -# NUM_SEQS_LIST: a list of `max-num-seqs` you want to loop with. -# NUM_BATCHED_TOKENS_LIST: a list of `max-num-batched-tokens` you want to loop with. -# Note that the default NUM_SEQS_LIST and NUM_BATCHED_TOKENS_LIST are set for medium size input/output len, for extra short context (such as 20:20), you might need to include larger numbers in NUM_SEQS_LIST. -# 4. Run the script, it might take a long time, you can use tmux to avoid the script stop if disconnection happens. -# 5. The final result will be saved in RESULT file. - - -# Example use cases -# 1. Given input_len=1800, output_len=20, what's the best max_num_seqs and max_num_batched_tokens to get highest throughput? -# Use INPUT_LEN=1800, OUTPUT_LEN=20, MIN_CACHE_HIT_PCT=0, MAX_LATENCY_ALLOWED_MS=100000000000 -# 2. If we have latency requirement to be lower than 500ms, what's the best server parameter? -# Use INPUT_LEN=1800, OUTPUT_LEN=20, MIN_CACHE_HIT_PCT=0, MAX_LATENCY_ALLOWED_MS=500 -# 3. If we want to reach 60% prefix cache, what's the best server parameter? -# Use INPUT_LEN=1800, OUTPUT_LEN=20, MIN_CACHE_HIT_PCT=60, MAX_LATENCY_ALLOWED_MS=500 +# This script aims to tune the best server parameter combinations to maximize throughput for given requirement. +# See details in README (benchmarks/auto_tune/README.md). 
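+# All parameters below have defaults but can be overridden via environment
+# variables (see benchmarks/auto_tune/README.md). Illustrative example only:
+#   MODEL=meta-llama/Llama-3.1-8B-Instruct SYSTEM=GPU TP=1 \
+#   INPUT_LEN=1800 OUTPUT_LEN=20 MAX_MODEL_LEN=2048 bash auto_tune.sh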
TAG=$(date +"%Y_%m_%d_%H_%M") -BASE="" -MODEL="meta-llama/Llama-3.1-8B-Instruct" -TP=1 -DOWNLOAD_DIR="" -INPUT_LEN=4000 -OUTPUT_LEN=16 -MIN_CACHE_HIT_PCT=0 -MAX_LATENCY_ALLOWED_MS=100000000000 -NUM_SEQS_LIST="128 256" -NUM_BATCHED_TOKENS_LIST="512 1024 2048 4096" +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +VLLM_LOGGING_LEVEL=${VLLM_LOGGING_LEVEL:-INFO} +BASE=${BASE:-"$SCRIPT_DIR/../../.."} +MODEL=${MODEL:-"meta-llama/Llama-3.1-8B-Instruct"} +SYSTEM=${SYSTEM:-"TPU"} +TP=${TP:-1} +DOWNLOAD_DIR=${DOWNLOAD_DIR:-""} +INPUT_LEN=${INPUT_LEN:-4000} +OUTPUT_LEN=${OUTPUT_LEN:-16} +MAX_MODEL_LEN=${MAX_MODEL_LEN:-4096} +MIN_CACHE_HIT_PCT=${MIN_CACHE_HIT_PCT:-0} +MAX_LATENCY_ALLOWED_MS=${MAX_LATENCY_ALLOWED_MS:-100000000000} +NUM_SEQS_LIST=${NUM_SEQS_LIST:-"128 256"} +NUM_BATCHED_TOKENS_LIST=${NUM_BATCHED_TOKENS_LIST:-"512 1024 2048 4096"} LOG_FOLDER="$BASE/auto-benchmark/$TAG" RESULT="$LOG_FOLDER/result.txt" +PROFILE_PATH="$LOG_FOLDER/profile" -echo "result file: $RESULT" -echo "model: $MODEL" +echo "====================== AUTO TUNE PARAMETERS ====================" +echo "SCRIPT_DIR=$SCRIPT_DIR" +echo "BASE=$BASE" +echo "MODEL=$MODEL" +echo "SYSTEM=$SYSTEM" +echo "TP=$TP" +echo "DOWNLOAD_DIR=$DOWNLOAD_DIR" +echo "INPUT_LEN=$INPUT_LEN" +echo "OUTPUT_LEN=$OUTPUT_LEN" +echo "MAX_MODEL_LEN=$MAX_MODEL_LEN" +echo "MIN_CACHE_HIT_PCT=$MIN_CACHE_HIT_PCT" +echo "MAX_LATENCY_ALLOWED_MS=$MAX_LATENCY_ALLOWED_MS" +echo "NUM_SEQS_LIST=$NUM_SEQS_LIST" +echo "NUM_BATCHED_TOKENS_LIST=$NUM_BATCHED_TOKENS_LIST" +echo "VLLM_LOGGING_LEVEL=$VLLM_LOGGING_LEVEL" +echo "RESULT_FILE=$RESULT" +echo "====================== AUTO TUNEPARAMETERS ====================" rm -rf $LOG_FOLDER +rm -rf $PROFILE_PATH mkdir -p $LOG_FOLDER +mkdir -p $PROFILE_PATH cd "$BASE/vllm" @@ -60,36 +54,66 @@ current_hash=$(git rev-parse HEAD) echo "hash:$current_hash" >> "$RESULT" echo "current_hash: $current_hash" +TOTAL_LEN=$((INPUT_LEN + OUTPUT_LEN)) +RED='\033[0;31m' +if (( TOTAL_LEN > MAX_MODEL_LEN )); then + echo -e "${RED}FAILED: INPUT_LEN($INPUT_LEN) + OUTPUT_LEN($OUTPUT_LEN) = $TOTAL_LEN, which is > MAX_MODEL_LEN = $MAX_MODEL_LEN.\033[0m" >&2 + exit 1 +fi + best_throughput=0 best_max_num_seqs=0 best_num_batched_tokens=0 best_goodput=0 +best_request_rate=0 start_server() { local gpu_memory_utilization=$1 local max_num_seqs=$2 local max_num_batched_tokens=$3 local vllm_log=$4 - - pkill -f vllm + local profile_dir=$5 - VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 vllm serve $MODEL \ - --disable-log-requests \ - --port 8004 \ - --gpu-memory-utilization $gpu_memory_utilization \ - --max-num-seqs $max_num_seqs \ - --max-num-batched-tokens $max_num_batched_tokens \ - --tensor-parallel-size $TP \ - --enable-prefix-caching \ - --load-format dummy \ - --download-dir "$DOWNLOAD_DIR" \ - --max-model-len $(( INPUT_LEN+OUTPUT_LEN )) > "$vllm_log" 2>&1 & + pkill -if vllm + + # Define the common arguments as a bash array. + # Each argument and its value are separate elements. + local common_args_array=( + "$MODEL" + "--disable-log-requests" + "--port" "8004" + "--gpu-memory-utilization" "$gpu_memory_utilization" + "--max-num-seqs" "$max_num_seqs" + "--max-num-batched-tokens" "$max_num_batched_tokens" + "--tensor-parallel-size" "$TP" + "--enable-prefix-caching" + "--load-format" "dummy" + "--download-dir" "$DOWNLOAD_DIR" + "--max-model-len" "$MAX_MODEL_LEN" + ) + + # Use the array expansion "${common_args_array[@]}" + # This correctly passes each element as a separate argument. 
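+    # When a profile_dir is supplied, VLLM_TORCH_PROFILER_DIR turns on the
+    # torch profiler in the server; traces land in that directory once the
+    # benchmark is run with --profile (only done for the final best-config run).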
+ if [[ -n "$profile_dir" ]]; then + # Start server with profiling enabled + VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir \ + vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 & + else + # Start server without profiling + VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 \ + vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 & + fi + local server_pid=$! # wait for 10 minutes... server_started=0 - for i in {1..60}; do + for i in {1..60}; do + # This line checks whether the server is still alive or not, + # since that we should always have permission to send signal to the server process. + kill -0 $server_pid 2> /dev/null || break + RESPONSE=$(curl -s -X GET "http://0.0.0.0:8004/health" -w "%{http_code}" -o /dev/stdout) - STATUS_CODE=$(echo "$RESPONSE" | tail -n 1) + STATUS_CODE=$(echo "$RESPONSE" | tail -n 1) if [[ "$STATUS_CODE" -eq 200 ]]; then server_started=1 break @@ -97,8 +121,9 @@ start_server() { sleep 10 fi done + if (( ! server_started )); then - echo "server did not start within 10 minutes. Please check server log at $vllm_log". + echo "server did not start within 10 minutes or crashed. Please check server log at $vllm_log". return 1 else return 0 @@ -114,10 +139,11 @@ run_benchmark() { echo "vllm_log: $vllm_log" echo rm -f $vllm_log - pkill -f vllm + pkill -if vllm echo "starting server..." - start_server $gpu_memory_utilization $max_num_seqs $max_num_batched_tokens $vllm_log + # Call start_server without a profile_dir to avoid profiling overhead + start_server $gpu_memory_utilization $max_num_seqs $max_num_batched_tokens $vllm_log "" result=$? if [[ "$result" -eq 1 ]]; then echo "server failed to start. gpu_memory_utilization:$gpu_memory_utilization, max_num_seqs:$max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens" @@ -125,17 +151,19 @@ run_benchmark() { echo "server started." fi echo - + echo "run benchmark test..." 
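+    # Latency-aware throughput search (see "How It Works" in the README):
+    #   1) benchmark once with --request-rate inf;
+    #   2) if the measured P99 E2E latency exceeds MAX_LATENCY_ALLOWED_MS,
+    #      retry at progressively lower request rates until it fits.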
meet_latency_requirement=0 # get a basic qps by using request-rate inf bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_inf.txt" prefix_len=$(( INPUT_LEN * MIN_CACHE_HIT_PCT / 100 )) - python benchmarks/benchmark_serving.py \ + adjusted_input_len=$(( INPUT_LEN - prefix_len )) + # --profile flag is removed from this call + vllm bench serve \ --backend vllm \ --model $MODEL \ --dataset-name random \ - --random-input-len $INPUT_LEN \ + --random-input-len $adjusted_input_len \ --random-output-len $OUTPUT_LEN \ --ignore-eos \ --disable-tqdm \ @@ -162,11 +190,11 @@ run_benchmark() { curl -X POST http://0.0.0.0:8004/reset_prefix_cache sleep 5 bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_${request_rate}.txt" - python benchmarks/benchmark_serving.py \ + vllm bench serve \ --backend vllm \ --model $MODEL \ --dataset-name random \ - --random-input-len $INPUT_LEN \ + --random-input-len $adjusted_input_len \ --random-output-len $OUTPUT_LEN \ --ignore-eos \ --disable-tqdm \ @@ -195,6 +223,7 @@ run_benchmark() { best_max_num_seqs=$max_num_seqs best_num_batched_tokens=$max_num_batched_tokens best_goodput=$goodput + best_request_rate=$request_rate fi else echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens does not meet latency requirement ${MAX_LATENCY_ALLOWED_MS}" @@ -203,9 +232,9 @@ run_benchmark() { echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput" - pkill vllm + pkill -if vllm sleep 10 - printf '=%.0s' $(seq 1 20) + echo "====================" return 0 } @@ -216,7 +245,8 @@ read -r -a num_batched_tokens_list <<< "$NUM_BATCHED_TOKENS_LIST" gpu_memory_utilization=0.98 find_gpu_memory_utilization=0 while (( $(echo "$gpu_memory_utilization >= 0.9" | bc -l) )); do - start_server $gpu_memory_utilization "${num_seqs_list[-1]}" "${num_batched_tokens_list[-1]}" "$LOG_FOLDER/vllm_log_gpu_memory_utilization_$gpu_memory_utilization.log" + # Pass empty string for profile_dir argument + start_server $gpu_memory_utilization "${num_seqs_list[-1]}" "${num_batched_tokens_list[-1]}" "$LOG_FOLDER/vllm_log_gpu_memory_utilization_$gpu_memory_utilization.log" "" result=$? if [[ "$result" -eq 0 ]]; then find_gpu_memory_utilization=1 @@ -239,6 +269,45 @@ for num_seqs in "${num_seqs_list[@]}"; do done done echo "finish permutations" -echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput" -echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput" >> "$RESULT" +# ================================================================================= +# FINAL PROFILING RUN FOR THE BEST CONFIGURATION +# ================================================================================= +if (( $(echo "$best_throughput > 0" | bc -l) )); then + echo + echo "Benchmark tuning finished. Now running profiling on the best configuration found..." + echo "Best config: max_num_seqs: $best_max_num_seqs, max_num_batched_tokens: $best_num_batched_tokens, throughput: $best_throughput" + echo + + vllm_log="$LOG_FOLDER/vllm_log_BEST_PROFILE.txt" + bm_log="$LOG_FOLDER/bm_log_BEST_PROFILE.txt" + + # Start server with the best params and profiling ENABLED + echo "Starting server for profiling..." 
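+    # Restart the server with profiling enabled, then repeat the benchmark at
+    # the best request rate found above; --profile makes the server write a
+    # trace for this configuration into $PROFILE_PATH.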
+ start_server $gpu_memory_utilization $best_max_num_seqs $best_num_batched_tokens "$vllm_log" "$PROFILE_PATH" + + # Run benchmark with the best params and the --profile flag + echo "Running benchmark with profiling..." + prefix_len=$(( INPUT_LEN * MIN_CACHE_HIT_PCT / 100 )) + adjusted_input_len=$(( INPUT_LEN - prefix_len )) + vllm bench serve \ + --backend vllm \ + --model $MODEL \ + --dataset-name random \ + --random-input-len $adjusted_input_len \ + --random-output-len $OUTPUT_LEN \ + --ignore-eos \ + --disable-tqdm \ + --request-rate $best_request_rate \ + --percentile-metrics ttft,tpot,itl,e2el \ + --goodput e2el:$MAX_LATENCY_ALLOWED_MS \ + --num-prompts 100 \ + --random-prefix-len $prefix_len \ + --port 8004 \ + --profile &> "$bm_log" +else + echo "No configuration met the latency requirements. Skipping final profiling run." +fi +pkill -if vllm +echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH" +echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH" >> "$RESULT" diff --git a/benchmarks/auto_tune/batch_auto_tune.sh b/benchmarks/auto_tune/batch_auto_tune.sh new file mode 100644 index 000000000..57ef20daf --- /dev/null +++ b/benchmarks/auto_tune/batch_auto_tune.sh @@ -0,0 +1,128 @@ +#!/bin/bash + +INPUT_JSON="$1" +GCS_PATH="$2" # Optional GCS path for uploading results for each run + +SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd) +AUTOTUNE_SCRIPT="$SCRIPT_DIR/auto_tune.sh" + +if [[ -z "$INPUT_JSON" ]]; then + echo "Error: Input JSON file not provided." + echo "Usage: $0 [gcs_upload_path]" + exit 1 +fi + +if [[ ! -f "$INPUT_JSON" ]]; then + echo "Error: File not found at '$INPUT_JSON'" + exit 1 +fi + +if ! command -v jq &> /dev/null; then + echo "Error: 'jq' command not found. Please install jq to process the JSON input." + exit 1 +fi + +if [[ -n "$GCS_PATH" ]] && ! command -v gcloud &> /dev/null; then + echo "Error: 'gcloud' command not found, but a GCS_PATH was provided." + exit 1 +fi + +SUCCESS_COUNT=0 +FAILURE_COUNT=0 +FAILED_RUNS=() +SCRIPT_START_TIME=$(date +%s) + +json_content=$(cat "$INPUT_JSON") +if ! num_runs=$(echo "$json_content" | jq 'length'); then + echo "Error: Invalid JSON in $INPUT_JSON. 'jq' failed to get array length." >&2 + exit 1 +fi + +echo "Found $num_runs benchmark configurations in $INPUT_JSON." +echo "Starting benchmark runs..." 
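+# Each run's JSON object is updated in place with run_id, status, results and
+# gcs_results after it finishes, so partial progress is preserved if the batch
+# is interrupted.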
+echo "--------------------------------------------------" + +for i in $(seq 0 $(($num_runs - 1))); do + run_object=$(echo "$json_content" | jq ".[$i]") + + RUN_START_TIME=$(date +%s) + ENV_VARS_ARRAY=() + # Dynamically create env vars from the JSON object's keys + for key in $(echo "$run_object" | jq -r 'keys_unsorted[]'); do + value=$(echo "$run_object" | jq -r ".$key") + var_name=$(echo "$key" | tr '[:lower:]' '[:upper:]' | tr -cd 'A-Z0-9_') + ENV_VARS_ARRAY+=("${var_name}=${value}") + done + + echo "Executing run #$((i+1))/$num_runs with parameters: ${ENV_VARS_ARRAY[*]}" + + # Execute auto_tune.sh and capture output + RUN_OUTPUT_FILE=$(mktemp) + if env "${ENV_VARS_ARRAY[@]}" bash "$AUTOTUNE_SCRIPT" > >(tee -a "$RUN_OUTPUT_FILE") 2>&1; then + STATUS="SUCCESS" + ((SUCCESS_COUNT++)) + else + STATUS="FAILURE" + ((FAILURE_COUNT++)) + FAILED_RUNS+=("Run #$((i+1)): $(echo $run_object | jq -c .)") + fi + + RUN_OUTPUT=$(<"$RUN_OUTPUT_FILE") + rm "$RUN_OUTPUT_FILE" + + # Parse results and optionally upload them to GCS + RUN_ID="" + RESULTS="" + GCS_RESULTS_URL="" + if [[ "$STATUS" == "SUCCESS" ]]; then + RESULT_FILE_PATH=$(echo "$RUN_OUTPUT" | grep 'RESULT_FILE=' | tail -n 1 | cut -d'=' -f2 | tr -s '/' || true) + + if [[ -n "$RESULT_FILE_PATH" && -f "$RESULT_FILE_PATH" ]]; then + RUN_ID=$(basename "$(dirname "$RESULT_FILE_PATH")") + RESULT_DIR=$(dirname "$RESULT_FILE_PATH") + RESULTS=$(cat "$RESULT_FILE_PATH") + + if [[ -n "$GCS_PATH" ]]; then + GCS_RESULTS_URL="${GCS_PATH}/${RUN_ID}" + echo "Uploading results to GCS..." + if gcloud storage rsync --recursive "$RESULT_DIR/" "$GCS_RESULTS_URL"; then + echo "GCS upload successful." + else + echo "Warning: GCS upload failed for RUN_ID $RUN_ID." + fi + fi + else + echo "Warning: Could not find result file for a successful run." + STATUS="WARNING_NO_RESULT_FILE" + fi + fi + + # Add the results back into the JSON object for this run + json_content=$(echo "$json_content" | jq --argjson i "$i" --arg run_id "$RUN_ID" --arg status "$STATUS" --arg results "$RESULTS" --arg gcs_results "$GCS_RESULTS_URL" \ + '.[$i] += {run_id: $run_id, status: $status, results: $results, gcs_results: $gcs_results}') + + RUN_END_TIME=$(date +%s) + echo "Run finished in $((RUN_END_TIME - RUN_START_TIME)) seconds. Status: $STATUS" + echo "--------------------------------------------------" + + # Save intermediate progress back to the file + echo "$json_content" > "$INPUT_JSON.tmp" && mv "$INPUT_JSON.tmp" "$INPUT_JSON" + +done + +SCRIPT_END_TIME=$(date +%s) +echo "All benchmark runs completed in $((SCRIPT_END_TIME - SCRIPT_START_TIME)) seconds." +echo +echo "====================== SUMMARY ======================" +echo "Successful runs: $SUCCESS_COUNT" +echo "Failed runs: $FAILURE_COUNT" +echo "===================================================" + +if [[ $FAILURE_COUNT -gt 0 ]]; then + echo "Details of failed runs (see JSON file for full parameters):" + for failed in "${FAILED_RUNS[@]}"; do + echo " - $failed" + done +fi + +echo "Updated results have been saved to '$INPUT_JSON'." 
diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py index ddb38e304..ba7c733be 100644 --- a/benchmarks/backend_request_func.py +++ b/benchmarks/backend_request_func.py @@ -31,9 +31,10 @@ class RequestFuncInput: model_name: Optional[str] = None logprobs: Optional[int] = None extra_body: Optional[dict] = None - multi_modal_content: Optional[dict] = None + multi_modal_content: Optional[dict | list[dict]] = None ignore_eos: bool = False language: Optional[str] = None + request_id: Optional[str] = None @dataclass @@ -71,6 +72,9 @@ async def async_request_tgi( "inputs": request_func_input.prompt, "parameters": params, } + headers = None + if request_func_input.request_id: + headers = {"x-request-id": request_func_input.request_id} output = RequestFuncOutput() output.prompt_len = request_func_input.prompt_len if request_func_input.ignore_eos: @@ -82,7 +86,9 @@ async def async_request_tgi( st = time.perf_counter() most_recent_timestamp = st try: - async with session.post(url=api_url, json=payload) as response: + async with session.post( + url=api_url, json=payload, headers=headers + ) as response: if response.status == 200: async for chunk_bytes in response.content: chunk_bytes = chunk_bytes.strip() @@ -145,6 +151,9 @@ async def async_request_trt_llm( } if request_func_input.ignore_eos: payload["min_length"] = request_func_input.output_len + headers = None + if request_func_input.request_id: + headers = {"x-request-id": request_func_input.request_id} output = RequestFuncOutput() output.prompt_len = request_func_input.prompt_len @@ -152,7 +161,9 @@ async def async_request_trt_llm( st = time.perf_counter() most_recent_timestamp = st try: - async with session.post(url=api_url, json=payload) as response: + async with session.post( + url=api_url, json=payload, headers=headers + ) as response: if response.status == 200: async for chunk_bytes in response.content: chunk_bytes = chunk_bytes.strip() @@ -211,6 +222,8 @@ async def async_request_deepspeed_mii( "top_p": 1.0, } headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"} + if request_func_input.request_id: + headers["x-request-id"] = request_func_input.request_id output = RequestFuncOutput() output.prompt_len = request_func_input.prompt_len @@ -283,6 +296,8 @@ async def async_request_openai_completions( if request_func_input.extra_body: payload.update(request_func_input.extra_body) headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"} + if request_func_input.request_id: + headers["x-request-id"] = request_func_input.request_id output = RequestFuncOutput() output.prompt_len = request_func_input.prompt_len @@ -364,7 +379,15 @@ async def async_request_openai_chat_completions( ) as session: content = [{"type": "text", "text": request_func_input.prompt}] if request_func_input.multi_modal_content: - content.append(request_func_input.multi_modal_content) + mm_content = request_func_input.multi_modal_content + if isinstance(mm_content, list): + content.extend(mm_content) + elif isinstance(mm_content, dict): + content.append(mm_content) + else: + raise TypeError( + "multi_modal_content must be a dict or list[dict] for openai-chat" + ) payload = { "model": request_func_input.model_name if request_func_input.model_name @@ -387,6 +410,8 @@ async def async_request_openai_chat_completions( "Content-Type": "application/json", "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}", } + if request_func_input.request_id: + headers["x-request-id"] = request_func_input.request_id output = 
RequestFuncOutput() output.prompt_len = request_func_input.prompt_len @@ -404,8 +429,14 @@ async def async_request_openai_chat_completions( chunk_bytes = chunk_bytes.strip() if not chunk_bytes: continue + chunk_bytes = chunk_bytes.decode("utf-8") + # NOTE: SSE comments (often used as pings) start with a colon. + # These are not JSON data payload and should be skipped. + if chunk_bytes.startswith(":"): + continue + + chunk = chunk_bytes.removeprefix("data: ") - chunk = chunk_bytes.decode("utf-8").removeprefix("data: ") if chunk != "[DONE]": timestamp = time.perf_counter() data = json.loads(chunk) @@ -477,6 +508,8 @@ async def async_request_openai_audio( headers = { "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}", } + if request_func_input.request_id: + headers["x-request-id"] = request_func_input.request_id # Send audio file def to_bytes(y, sr): @@ -485,7 +518,10 @@ def to_bytes(y, sr): buffer.seek(0) return buffer - with to_bytes(*request_func_input.multi_modal_content["audio"]) as f: + mm_audio = request_func_input.multi_modal_content + if not isinstance(mm_audio, dict) or "audio" not in mm_audio: + raise TypeError("multi_modal_content must be a dict containing 'audio'") + with to_bytes(*mm_audio["audio"]) as f: form = aiohttp.FormData() form.add_field("file", f, content_type="audio/wav") for key, value in payload.items(): diff --git a/benchmarks/benchmark_block_pool.py b/benchmarks/benchmark_block_pool.py new file mode 100644 index 000000000..eae8d9927 --- /dev/null +++ b/benchmarks/benchmark_block_pool.py @@ -0,0 +1,74 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import gc + +from tabulate import tabulate + +from benchmark_utils import TimeCollector +from vllm.utils import FlexibleArgumentParser +from vllm.v1.core.block_pool import BlockPool + + +def main(args): + rows = [] + for allocate_block in args.allocate_blocks: + # Enforce a GC collect ahead to minimize the impact among runs + gc.collect() + block_pool = BlockPool(num_gpu_blocks=args.num_gpu_blocks, enable_caching=True) + + get_blocks_times = TimeCollector(TimeCollector.US) + free_blocks_times = TimeCollector(TimeCollector.US) + for _ in range(args.num_iteration): + with get_blocks_times: + blocks = block_pool.get_new_blocks(allocate_block) + with free_blocks_times: + block_pool.free_blocks(blocks) + + rows.append( + [get_blocks_times.cnt, args.num_gpu_blocks, allocate_block] + + get_blocks_times.dump_avg_max() + + free_blocks_times.dump_avg_max() + ) + + print( + tabulate( + rows, + headers=[ + "Iterations", + "Total\nBlocks", + "Allocated\nBlocks", + "Get Blocks\nAvg (us)", + "Get Blocks\nMax (us)", + "Free Blocks\nAvg (us)", + "Free Blocks\nMax (us)", + ], + tablefmt="grid", + floatfmt=".3f", + ) + ) + + +def invoke_main() -> None: + parser = FlexibleArgumentParser( + description="Benchmark the performance of BlockPool for KV Cache." 
+ ) + parser.add_argument("--num-gpu-blocks", type=int, default=100000) + parser.add_argument( + "--num-iteration", + type=int, + default=1000, + help="Number of iterations to run to stabilize final data readings", + ) + parser.add_argument( + "--allocate-blocks", + type=int, + nargs="*", + default=[10, 50, 100, 500, 1000], + help="Number of blocks to allocate", + ) + args = parser.parse_args() + main(args) + + +if __name__ == "__main__": + invoke_main() # pragma: no cover diff --git a/benchmarks/benchmark_dataset.py b/benchmarks/benchmark_dataset.py deleted file mode 100644 index 5d2a26cd4..000000000 --- a/benchmarks/benchmark_dataset.py +++ /dev/null @@ -1,1167 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -""" -This module defines a framework for sampling benchmark requests from various -datasets. Each dataset subclass of BenchmarkDataset must implement sample -generation. Supported dataset types include: - - ShareGPT - - Random (synthetic) - - Sonnet - - BurstGPT - - HuggingFace - - VisionArena -""" - -import base64 -import io -import json -import logging -import random -from abc import ABC, abstractmethod -from collections.abc import Mapping -from dataclasses import dataclass -from functools import cache -from io import BytesIO -from typing import Any, Callable, Optional, Union - -import numpy as np -import pandas as pd -from datasets import load_dataset -from PIL import Image -from transformers import PreTrainedTokenizerBase - -from vllm.lora.request import LoRARequest -from vllm.lora.utils import get_adapter_absolute_path -from vllm.multimodal import MultiModalDataDict -from vllm.multimodal.image import convert_image_mode -from vllm.transformers_utils.tokenizer import AnyTokenizer, get_lora_tokenizer - -logger = logging.getLogger(__name__) - -# ----------------------------------------------------------------------------- -# Data Classes -# ----------------------------------------------------------------------------- - - -@dataclass -class SampleRequest: - """ - Represents a single inference request for benchmarking. - """ - - prompt: Union[str, Any] - prompt_len: int - expected_output_len: int - multi_modal_data: Optional[Union[MultiModalDataDict, dict]] = None - lora_request: Optional[LoRARequest] = None - - -# ----------------------------------------------------------------------------- -# Benchmark Dataset Base Class -# ----------------------------------------------------------------------------- - - -class BenchmarkDataset(ABC): - DEFAULT_SEED = 0 - IS_MULTIMODAL = False - - def __init__( - self, - dataset_path: Optional[str] = None, - random_seed: int = DEFAULT_SEED, - ) -> None: - """ - Initialize the BenchmarkDataset with an optional dataset path and random - seed. Args: - dataset_path (Optional[str]): Path to the dataset. If None, it - indicates that a default or random dataset might be used. - random_seed (int): Seed value for reproducible shuffling or - sampling. Defaults to DEFAULT_SEED. - """ - self.dataset_path = dataset_path - # Set the random seed, ensuring that a None value is replaced with the - # default seed. - self.random_seed = random_seed if random_seed is not None else self.DEFAULT_SEED - self.data = None - - def apply_multimodal_chat_transformation( - self, prompt: str, mm_content: Optional[MultiModalDataDict] = None - ) -> list[dict]: - """ - Transform a prompt and optional multimodal content into a chat format. 
- This method is used for chat models that expect a specific conversation - format. - """ - content = [{"text": prompt, "type": "text"}] - if mm_content is not None: - content.append(mm_content) - return [{"role": "user", "content": content}] - - def load_data(self) -> None: - """ - Load data from the dataset path into self.data. - - This method must be overridden by subclasses since the method to load - data will vary depending on the dataset format and source. - - Raises: - NotImplementedError: If a subclass does not implement this method. - """ - # TODO (jenniferzhao): add support for downloading data - raise NotImplementedError("load_data must be implemented in subclasses.") - - def get_random_lora_request( - self, - tokenizer: PreTrainedTokenizerBase, - max_loras: Optional[int] = None, - lora_path: Optional[str] = None, - ) -> tuple[Optional[LoRARequest], AnyTokenizer]: - """ - Optionally select a random LoRA request and return its associated - tokenizer. - - This method is used when LoRA parameters are provided. It randomly - selects a LoRA based on max_loras and retrieves a cached tokenizer for - that LoRA if available. Otherwise, it returns the base tokenizer. - - Args: - tokenizer (PreTrainedTokenizerBase): The base tokenizer to use if no - LoRA is selected. max_loras (Optional[int]): The maximum number of - LoRAs available. If None, LoRA is not used. lora_path - (Optional[str]): Path to the LoRA parameters on disk. If None, LoRA - is not used. - - Returns: - tuple[Optional[LoRARequest], AnyTokenizer]: A tuple where the first - element is a LoRARequest (or None if not applicable) and the second - element is the tokenizer associated with the LoRA request (or the - base tokenizer). - """ - if max_loras is None or lora_path is None: - return None, tokenizer - - # Generate a random LoRA ID in the range [1, max_loras]. - lora_id = random.randint(1, max_loras) - lora_request = LoRARequest( - lora_name=str(lora_id), - lora_int_id=lora_id, - lora_path=lora_path_on_disk(lora_path), - ) - if lora_id not in lora_tokenizer_cache: - lora_tokenizer_cache[lora_id] = get_lora_tokenizer(lora_request) - # Return lora_request and the cached tokenizer if available; otherwise, - # return the base tokenizer - return lora_request, lora_tokenizer_cache[lora_id] or tokenizer - - @abstractmethod - def sample( - self, tokenizer: PreTrainedTokenizerBase, num_requests: int - ) -> list[SampleRequest]: - """ - Abstract method to generate sample requests from the dataset. - - Subclasses must override this method to implement dataset-specific logic - for generating a list of SampleRequest objects. - - Args: - tokenizer (PreTrainedTokenizerBase): The tokenizer to be used - for processing the dataset's text. - num_requests (int): The number of sample requests to generate. - - Returns: - list[SampleRequest]: A list of sample requests generated from the - dataset. - """ - raise NotImplementedError("sample must be implemented in subclasses.") - - def maybe_oversample_requests( - self, requests: list[SampleRequest], num_requests: int - ) -> None: - """ - Oversamples the list of requests if its size is less than the desired - number. - - Args: - requests (List[SampleRequest]): The current list of sampled - requests. num_requests (int): The target number of requests. 
- """ - if len(requests) < num_requests: - random.seed(self.random_seed) - additional = random.choices(requests, k=num_requests - len(requests)) - requests.extend(additional) - logger.info("Oversampled requests to reach %d total samples.", num_requests) - - -# ----------------------------------------------------------------------------- -# Utility Functions and Global Caches -# ----------------------------------------------------------------------------- - - -def is_valid_sequence( - prompt_len: int, - output_len: int, - min_len: int = 4, - max_prompt_len: int = 1024, - max_total_len: int = 2048, - skip_min_output_len_check: bool = False, -) -> bool: - """ - Validate a sequence based on prompt and output lengths. - - Default pruning criteria are copied from the original `sample_hf_requests` - and `sample_sharegpt_requests` functions in benchmark_serving.py, as well as - from `sample_requests` in benchmark_throughput.py. - """ - # Check for invalid conditions - prompt_too_short = prompt_len < min_len - output_too_short = (not skip_min_output_len_check) and (output_len < min_len) - prompt_too_long = prompt_len > max_prompt_len - combined_too_long = (prompt_len + output_len) > max_total_len - - # Return True if none of the invalid conditions are met - return not ( - prompt_too_short or output_too_short or prompt_too_long or combined_too_long - ) - - -@cache -def lora_path_on_disk(lora_path: str) -> str: - return get_adapter_absolute_path(lora_path) - - -# Global cache for LoRA tokenizers. -lora_tokenizer_cache: dict[int, AnyTokenizer] = {} - - -def process_image(image: Any) -> Mapping[str, Any]: - """ - Process a single image input and return a multimedia content dictionary. - - Supports three input types: - - 1. Dictionary with raw image bytes: - Expects a dict with a 'bytes' key - containing raw image data. - Loads the bytes as a PIL.Image.Image. - - 2. PIL.Image.Image input: - Converts the image to RGB. - Saves the image as - a JPEG in memory. - Encodes the JPEG data as a base64 string. - Returns - a dictionary with the image as a base64 data URL. - - 3. String input: - Treats the string as a URL or local file path. - - Prepends "file://" if the string doesn't start with "http://" or - "file://". - Returns a dictionary with the image URL. - - Raises: - ValueError: If the input is not a supported type. - """ - if isinstance(image, dict) and "bytes" in image: - image = Image.open(BytesIO(image["bytes"])) - if isinstance(image, Image.Image): - image = convert_image_mode(image, "RGB") - with io.BytesIO() as image_data: - image.save(image_data, format="JPEG") - image_base64 = base64.b64encode(image_data.getvalue()).decode("utf-8") - return { - "type": "image_url", - "image_url": {"url": f"data:image/jpeg;base64,{image_base64}"}, - } - - if isinstance(image, str): - image_url = ( - image if image.startswith(("http://", "file://")) else f"file://{image}" - ) - return {"type": "image_url", "image_url": {"url": image_url}} - - raise ValueError( - f"Invalid image input {image}. Must be a PIL.Image.Image" - " or str or dictionary with raw image bytes." - ) - - -# ----------------------------------------------------------------------------- -# Random Dataset Implementation (Synthetic Data) -# ----------------------------------------------------------------------------- - - -class RandomDataset(BenchmarkDataset): - # Default values copied from benchmark_serving.py for the random dataset. 
- DEFAULT_PREFIX_LEN = 0 - DEFAULT_RANGE_RATIO = 0.0 - DEFAULT_INPUT_LEN = 1024 - DEFAULT_OUTPUT_LEN = 128 - - def __init__( - self, - **kwargs, - ) -> None: - super().__init__(**kwargs) - - def sample( - self, - tokenizer: PreTrainedTokenizerBase, - num_requests: int, - prefix_len: int = DEFAULT_PREFIX_LEN, - range_ratio: float = DEFAULT_RANGE_RATIO, - input_len: int = DEFAULT_INPUT_LEN, - output_len: int = DEFAULT_OUTPUT_LEN, - **kwargs, - ) -> list[SampleRequest]: - # Enforce range_ratio < 1 - assert range_ratio < 1.0, ( - "random_range_ratio must be < 1.0 to ensure a valid sampling range" - ) - - vocab_size = tokenizer.vocab_size - num_special_tokens = tokenizer.num_special_tokens_to_add() - real_input_len = input_len - num_special_tokens - - prefix_token_ids = ( - np.random.randint(0, vocab_size, size=prefix_len).tolist() - if prefix_len > 0 - else [] - ) - - # New sampling logic: [X * (1 - b), X * (1 + b)] - input_low = int(real_input_len * (1 - range_ratio)) - input_high = int(real_input_len * (1 + range_ratio)) - output_low = int(output_len * (1 - range_ratio)) - output_high = int(output_len * (1 + range_ratio)) - - # Add logging for debugging - logger.info("Sampling input_len from [%s, %s]", input_low, input_high) - logger.info("Sampling output_len from [%s, %s]", output_low, output_high) - - input_lens = np.random.randint(input_low, input_high + 1, size=num_requests) - output_lens = np.random.randint(output_low, output_high + 1, size=num_requests) - offsets = np.random.randint(0, vocab_size, size=num_requests) - - requests = [] - for i in range(num_requests): - inner_seq = ( - (offsets[i] + i + np.arange(input_lens[i])) % vocab_size - ).tolist() - token_sequence = prefix_token_ids + inner_seq - prompt = tokenizer.decode(token_sequence) - # After decoding the prompt we have to encode and decode it again. - # This is done because in some cases N consecutive tokens - # give a string tokenized into != N number of tokens. - # For example for GPT2Tokenizer: - # [6880, 6881] -> ['Ġcalls', 'here'] -> - # [1650, 939, 486] -> ['Ġcall', 'sh', 'ere'] - # To avoid uncontrolled change of the prompt length, - # the encoded sequence is truncated before being decode again. - re_encoded_sequence = tokenizer.encode(prompt, add_special_tokens=False)[ - : input_lens[i] - ] - prompt = tokenizer.decode(re_encoded_sequence) - total_input_len = prefix_len + int(input_lens[i]) - requests.append( - SampleRequest( - prompt=prompt, - prompt_len=total_input_len, - expected_output_len=int(output_lens[i]), - ) - ) - return requests - - -# ----------------------------------------------------------------------------- -# ShareGPT Dataset Implementation -# ----------------------------------------------------------------------------- - - -class ShareGPTDataset(BenchmarkDataset): - """ - Implements the ShareGPT dataset. Loads data from a JSON file and generates - sample requests based on conversation turns. - """ - - def __init__(self, **kwargs) -> None: - super().__init__(**kwargs) - self.load_data() - - def load_data(self) -> None: - if self.dataset_path is None: - raise ValueError("dataset_path must be provided for loading data.") - - with open(self.dataset_path, encoding="utf-8") as f: - self.data = json.load(f) - # Filter entries with at least two conversation turns. 
- self.data = [ - entry - for entry in self.data - if "conversations" in entry and len(entry["conversations"]) >= 2 - ] - random.seed(self.random_seed) - random.shuffle(self.data) - - def sample( - self, - tokenizer: PreTrainedTokenizerBase, - num_requests: int, - lora_path: Optional[str] = None, - max_loras: Optional[int] = None, - output_len: Optional[int] = None, - enable_multimodal_chat: bool = False, - **kwargs, - ) -> list: - samples: list = [] - for entry in self.data: - if len(samples) >= num_requests: - break - prompt, completion = ( - entry["conversations"][0]["value"], - entry["conversations"][1]["value"], - ) - - lora_request, tokenizer = self.get_random_lora_request( - tokenizer=tokenizer, max_loras=max_loras, lora_path=lora_path - ) - prompt_ids = tokenizer(prompt).input_ids - completion_ids = tokenizer(completion).input_ids - prompt_len = len(prompt_ids) - new_output_len = len(completion_ids) if output_len is None else output_len - if not is_valid_sequence( - prompt_len, - new_output_len, - skip_min_output_len_check=output_len is not None, - ): - continue - if enable_multimodal_chat: - prompt = self.apply_multimodal_chat_transformation(prompt, None) - samples.append( - SampleRequest( - prompt=prompt, - prompt_len=prompt_len, - expected_output_len=new_output_len, - lora_request=lora_request, - ) - ) - self.maybe_oversample_requests(samples, num_requests) - return samples - - -# ----------------------------------------------------------------------------- -# Custom Dataset Implementation -# ----------------------------------------------------------------------------- - - -class CustomDataset(BenchmarkDataset): - """ - Implements the Custom dataset. Loads data from a JSONL file and generates - sample requests based on conversation turns. E.g., - ``` - {"prompt": "What is the capital of India?"} - {"prompt": "What is the capital of Iran?"} - {"prompt": "What is the capital of China?"} - ``` - """ - - def __init__(self, **kwargs) -> None: - super().__init__(**kwargs) - self.load_data() - - def load_data(self) -> None: - if self.dataset_path is None: - raise ValueError("dataset_path must be provided for loading data.") - - # self.data will be a list of dictionaries - # e.g., [{"prompt": "What is the capital of India?"}, ...] - # This will be the standardized format which load_data() - # has to convert into depending on the filetype of dataset_path. - # sample() will assume this standardized format of self.data - self.data = [] - - # Load the JSONL file - if self.dataset_path.endswith(".jsonl"): - jsonl_data = pd.read_json(path_or_buf=self.dataset_path, lines=True) - - # check if the JSONL file has a 'prompt' column - if "prompt" not in jsonl_data.columns: - raise ValueError("JSONL file must contain a 'prompt' column.") - - # Convert each row to a dictionary and append to self.data - # This will convert the DataFrame to a list of dictionaries - # where each dictionary corresponds to a row in the DataFrame. - # This is the standardized format we want for self.data - for _, row in jsonl_data.iterrows(): - self.data.append(row.to_dict()) - else: - raise NotImplementedError( - "Only JSONL format is supported for CustomDataset." 
- ) - - random.seed(self.random_seed) - random.shuffle(self.data) - - def sample( - self, - tokenizer: PreTrainedTokenizerBase, - num_requests: int, - lora_path: Optional[str] = None, - max_loras: Optional[int] = None, - output_len: Optional[int] = None, - enable_multimodal_chat: bool = False, - skip_chat_template: bool = False, - **kwargs, - ) -> list: - sampled_requests = [] - for item in self.data: - if len(sampled_requests) >= num_requests: - break - prompt = item["prompt"] - - # apply template - if not skip_chat_template: - prompt = tokenizer.apply_chat_template( - [{"role": "user", "content": prompt}], - add_generation_prompt=True, - tokenize=False, - ) - - prompt_len = len(tokenizer(prompt).input_ids) - sampled_requests.append( - SampleRequest( - prompt=prompt, - prompt_len=prompt_len, - expected_output_len=output_len, - ) - ) - self.maybe_oversample_requests(sampled_requests, num_requests) - - return sampled_requests - - -# ----------------------------------------------------------------------------- -# Sonnet Dataset Implementation -# ----------------------------------------------------------------------------- - - -class SonnetDataset(BenchmarkDataset): - """ - Simplified implementation of the Sonnet dataset. Loads poem lines from a - text file and generates sample requests. Default values here copied from - `benchmark_serving.py` for the sonnet dataset. - """ - - DEFAULT_PREFIX_LEN = 200 - DEFAULT_INPUT_LEN = 550 - DEFAULT_OUTPUT_LEN = 150 - - def __init__( - self, - **kwargs, - ) -> None: - super().__init__(**kwargs) - self.load_data() - - def load_data(self) -> None: - if not self.dataset_path: - raise ValueError("dataset_path must be provided.") - with open(self.dataset_path, encoding="utf-8") as f: - self.data = f.readlines() - - def sample( - self, - tokenizer, - num_requests: int, - prefix_len: int = DEFAULT_PREFIX_LEN, - input_len: int = DEFAULT_INPUT_LEN, - output_len: int = DEFAULT_OUTPUT_LEN, - return_prompt_formatted: bool = False, - **kwargs, - ) -> list: - # Calculate average token length for a poem line. - tokenized_lines = [tokenizer(line).input_ids for line in self.data] - avg_len = sum(len(tokens) for tokens in tokenized_lines) / len(tokenized_lines) - - # Build the base prompt. - base_prompt = "Pick as many lines as you can from these poem lines:\n" - base_msg = [{"role": "user", "content": base_prompt}] - base_fmt = tokenizer.apply_chat_template( - base_msg, add_generation_prompt=True, tokenize=False - ) - base_offset = len(tokenizer(base_fmt).input_ids) - if input_len <= base_offset: - raise ValueError( - f"'input_len' must be higher than the base prompt length " - f"({base_offset})." - ) - - # Determine how many poem lines to use. 
- num_input_lines = round((input_len - base_offset) / avg_len) - num_prefix_lines = max(round((prefix_len - base_offset) / avg_len), 0) - prefix_lines = self.data[:num_prefix_lines] - - samples = [] - while len(samples) < num_requests: - extra_lines = random.choices( - self.data, k=num_input_lines - num_prefix_lines - ) - prompt = f"{base_prompt}{''.join(prefix_lines + extra_lines)}" - msg = [{"role": "user", "content": prompt}] - prompt_formatted = tokenizer.apply_chat_template( - msg, add_generation_prompt=True, tokenize=False - ) - prompt_len = len(tokenizer(prompt_formatted).input_ids) - if prompt_len <= input_len: - samples.append( - SampleRequest( - prompt=prompt_formatted if return_prompt_formatted else prompt, - prompt_len=prompt_len, - expected_output_len=output_len, - ) - ) - return samples - - -# ----------------------------------------------------------------------------- -# BurstGPT Dataset Implementation -# ----------------------------------------------------------------------------- - - -class BurstGPTDataset(BenchmarkDataset): - """ - Implements the BurstGPT dataset. Loads data from a CSV file and generates - sample requests based on synthetic prompt generation. Only rows with Model - "GPT-4" and positive response tokens are used. - """ - - def __init__(self, **kwargs) -> None: - super().__init__(**kwargs) - self.load_data() - - def load_data( - self, - ): - if self.dataset_path is None: - raise ValueError("dataset_path must be provided for loading data.") - - df = pd.read_csv(self.dataset_path) - # Filter to keep only GPT-4 rows. - gpt4_df = df[df["Model"] == "GPT-4"] - # Remove failed requests (where Response tokens is 0 or less). - gpt4_df = gpt4_df[gpt4_df["Response tokens"] > 0] - # Sample the desired number of rows. - self.data = gpt4_df - - def _sample_loaded_data(self, num_requests: int) -> list: - if num_requests <= len(self.data): - data = self.data.sample(n=num_requests, random_state=self.random_seed) - else: - data = self.data.sample( - n=num_requests, - random_state=self.random_seed, - replace=True, - ) - # Convert the dataframe to a list of lists. - return data.values.tolist() - - def sample( - self, - tokenizer: PreTrainedTokenizerBase, - num_requests: int, - max_loras: Optional[int] = None, - lora_path: Optional[str] = None, - **kwargs, - ) -> list[SampleRequest]: - samples = [] - data = self._sample_loaded_data(num_requests=num_requests) - for i in range(num_requests): - input_len = int(data[i][2]) - output_len = int(data[i][3]) - lora_req, tokenizer = self.get_random_lora_request( - tokenizer=tokenizer, max_loras=max_loras, lora_path=lora_path - ) - vocab_size = tokenizer.vocab_size - # Generate a synthetic prompt: a list of token IDs computed as (i + - # j) modulo vocab_size. 
- token_ids = [(i + j) % vocab_size for j in range(input_len)] - prompt = tokenizer.decode(token_ids) - samples.append( - SampleRequest( - prompt=prompt, - prompt_len=input_len, - expected_output_len=output_len, - lora_request=lora_req, - ) - ) - return samples - - -# ----------------------------------------------------------------------------- -# HuggingFace Dataset Base Implementation -# ----------------------------------------------------------------------------- -class HuggingFaceDataset(BenchmarkDataset): - """Base class for datasets hosted on HuggingFace.""" - - SUPPORTED_DATASET_PATHS: Union[set[str], dict[str, Callable]] = set() - - def __init__( - self, - dataset_path: str, - dataset_split: str, - dataset_subset: Optional[str] = None, - **kwargs, - ) -> None: - super().__init__(dataset_path=dataset_path, **kwargs) - - self.dataset_split = dataset_split - self.dataset_subset = dataset_subset - self.load_data() - - def load_data(self) -> None: - """Load data from HuggingFace datasets.""" - self.data = load_dataset( - self.dataset_path, - name=self.dataset_subset, - split=self.dataset_split, - streaming=True, - ) - self.data = self.data.shuffle(seed=self.random_seed) - - -# ----------------------------------------------------------------------------- -# Conversation Dataset Implementation -# ----------------------------------------------------------------------------- - - -class ConversationDataset(HuggingFaceDataset): - """Dataset for conversation data with multimodal support.""" - - SUPPORTED_DATASET_PATHS = { - "lmms-lab/LLaVA-OneVision-Data", - "Aeala/ShareGPT_Vicuna_unfiltered", - } - IS_MULTIMODAL = True - - def sample( - self, - tokenizer: PreTrainedTokenizerBase, - num_requests: int, - output_len: Optional[int] = None, - enable_multimodal_chat: bool = False, - **kwargs, - ) -> list: - # Filter examples with at least 2 conversations - filtered_data = self.data.filter(lambda x: len(x["conversations"]) >= 2) - sampled_requests = [] - dynamic_output = output_len is None - - for item in filtered_data: - if len(sampled_requests) >= num_requests: - break - conv = item["conversations"] - prompt, completion = conv[0]["value"], conv[1]["value"] - - prompt_ids = tokenizer(prompt).input_ids - completion_ids = tokenizer(completion).input_ids - prompt_len = len(prompt_ids) - completion_len = len(completion_ids) - output_len = completion_len if dynamic_output else output_len - assert isinstance(output_len, int) and output_len > 0 - if dynamic_output and not is_valid_sequence(prompt_len, completion_len): - continue - mm_content = process_image(item["image"]) if "image" in item else None - if enable_multimodal_chat: - # Note: when chat is enabled the request prompt_len is no longer - # accurate and we will be using request output to count the - # actual prompt len and output len - prompt = self.apply_multimodal_chat_transformation(prompt, mm_content) - sampled_requests.append( - SampleRequest( - prompt=prompt, - prompt_len=prompt_len, - expected_output_len=output_len, - multi_modal_data=mm_content, - ) - ) - self.maybe_oversample_requests(sampled_requests, num_requests) - return sampled_requests - - -# ----------------------------------------------------------------------------- -# Vision Arena Dataset Implementation -# ----------------------------------------------------------------------------- - - -class VisionArenaDataset(HuggingFaceDataset): - """ - Vision Arena Dataset. 
- """ - - DEFAULT_OUTPUT_LEN = 128 - SUPPORTED_DATASET_PATHS = { - "lmarena-ai/VisionArena-Chat": lambda x: x["conversation"][0][0]["content"], - "lmarena-ai/vision-arena-bench-v0.1": lambda x: x["turns"][0][0]["content"], - } - IS_MULTIMODAL = True - - def sample( - self, - tokenizer: PreTrainedTokenizerBase, - num_requests: int, - output_len: Optional[int] = None, - enable_multimodal_chat: bool = False, - **kwargs, - ) -> list: - output_len = output_len if output_len is not None else self.DEFAULT_OUTPUT_LEN - sampled_requests = [] - for item in self.data: - if len(sampled_requests) >= num_requests: - break - parser_fn = self.SUPPORTED_DATASET_PATHS.get(self.dataset_path) - if parser_fn is None: - raise ValueError(f"Unsupported dataset path: {self.dataset_path}") - prompt = parser_fn(item) - mm_content = process_image(item["images"][0]) - prompt_len = len(tokenizer(prompt).input_ids) - if enable_multimodal_chat: - # Note: when chat is enabled the request prompt_len is no longer - # accurate and we will be using request output to count the - # actual prompt len - prompt = self.apply_multimodal_chat_transformation(prompt, mm_content) - sampled_requests.append( - SampleRequest( - prompt=prompt, - prompt_len=prompt_len, - expected_output_len=output_len, - multi_modal_data=mm_content, - ) - ) - self.maybe_oversample_requests(sampled_requests, num_requests) - return sampled_requests - - -# ----------------------------------------------------------------------------- -# Instruct Coder Dataset Implementation -# ----------------------------------------------------------------------------- - - -class InstructCoderDataset(HuggingFaceDataset): - """ - InstructCoder Dataset. - https://huggingface.co/datasets/likaixin/InstructCoder - - InstructCoder is the dataset designed for general code editing. It consists - of 114,239 instruction-input-output triplets, and covers multiple distinct - code editing scenario. - """ - - DEFAULT_OUTPUT_LEN = 200 # this is the average default output length - SUPPORTED_DATASET_PATHS = { - "likaixin/InstructCoder", - } - - def sample( - self, - tokenizer: PreTrainedTokenizerBase, - num_requests: int, - output_len: Optional[int] = None, - enable_multimodal_chat: bool = False, - **kwargs, - ) -> list: - output_len = output_len if output_len is not None else self.DEFAULT_OUTPUT_LEN - sampled_requests = [] - for item in self.data: - if len(sampled_requests) >= num_requests: - break - prompt = f"{item['input']}\n\n{item['instruction']} Just output \ - the code, do not include any explanation." - - # apply template - prompt = tokenizer.apply_chat_template( - [{"role": "user", "content": prompt}], - add_generation_prompt=True, - tokenize=False, - ) - prompt_len = len(tokenizer(prompt).input_ids) - sampled_requests.append( - SampleRequest( - prompt=prompt, - prompt_len=prompt_len, - expected_output_len=output_len, - ) - ) - self.maybe_oversample_requests(sampled_requests, num_requests) - return sampled_requests - - -# ----------------------------------------------------------------------------- -# MT-Bench Dataset Implementation -# ----------------------------------------------------------------------------- - - -class MTBenchDataset(HuggingFaceDataset): - """ - MT-Bench Dataset. - https://huggingface.co/datasets/philschmid/mt-bench - - We create a single turn dataset for MT-Bench. 
- This is similar to Spec decoding benchmark setup in vLLM - https://github.com/vllm-project/vllm/blob/9d98ab5ec/examples/offline_inference/eagle.py#L14-L18 - """ # noqa: E501 - - DEFAULT_OUTPUT_LEN = 256 # avg len used in SD bench in vLLM - SUPPORTED_DATASET_PATHS = { - "philschmid/mt-bench", - } - - def sample( - self, - tokenizer: PreTrainedTokenizerBase, - num_requests: int, - output_len: Optional[int] = None, - enable_multimodal_chat: bool = False, - **kwargs, - ) -> list: - output_len = output_len if output_len is not None else self.DEFAULT_OUTPUT_LEN - sampled_requests = [] - - for item in self.data: - if len(sampled_requests) >= num_requests: - break - prompt = item["turns"][0] - - # apply template - prompt = tokenizer.apply_chat_template( - [{"role": "user", "content": prompt}], - add_generation_prompt=True, - tokenize=False, - ) - - prompt_len = len(tokenizer(prompt).input_ids) - sampled_requests.append( - SampleRequest( - prompt=prompt, - prompt_len=prompt_len, - expected_output_len=output_len, - ) - ) - self.maybe_oversample_requests(sampled_requests, num_requests) - return sampled_requests - - -# ----------------------------------------------------------------------------- -# AIMO Dataset Implementation -# ----------------------------------------------------------------------------- - - -class AIMODataset(HuggingFaceDataset): - """ - Dataset class for processing a AIMO dataset with reasoning questions. - """ - - SUPPORTED_DATASET_PATHS = { - "AI-MO/aimo-validation-aime", - "AI-MO/NuminaMath-1.5", - "AI-MO/NuminaMath-CoT", - } - - def sample( - self, - tokenizer: PreTrainedTokenizerBase, - num_requests: int, - output_len: Optional[int] = None, - **kwargs, - ) -> list: - sampled_requests = [] - dynamic_output = output_len is None - - for item in self.data: - if len(sampled_requests) >= num_requests: - break - prompt, completion = item["problem"], item["solution"] - - prompt_ids = tokenizer(prompt).input_ids - completion_ids = tokenizer(completion).input_ids - prompt_len = len(prompt_ids) - completion_len = len(completion_ids) - output_len = completion_len if dynamic_output else output_len - assert isinstance(output_len, int) and output_len > 0 - if dynamic_output and not is_valid_sequence( - prompt_len, completion_len, max_prompt_len=2048, max_total_len=32000 - ): - continue - sampled_requests.append( - SampleRequest( - prompt=prompt, - prompt_len=prompt_len, - expected_output_len=output_len, - multi_modal_data=None, - ) - ) - self.maybe_oversample_requests(sampled_requests, num_requests) - return sampled_requests - - -# ----------------------------------------------------------------------------- -# Next Edit Prediction Dataset Implementation -# ----------------------------------------------------------------------------- - - -zeta_prompt = """### Instruction: -You are a code completion assistant and your task is to analyze user edits and then rewrite an excerpt that the user provides, suggesting the appropriate edits within the excerpt, taking into account the cursor location. - -### User Edits: - -{} - -### User Excerpt: - -{} - -### Response: - -""" # noqa: E501 - - -def _format_zeta_prompt( - sample: dict, original_start_marker: str = "<|editable_region_start|>" -) -> dict: - """Format the zeta prompt for the Next Edit Prediction (NEP) dataset. - - This function formats examples from the NEP dataset - into prompts and expected outputs. It could be - further extended to support more NEP datasets. 
- - Args: - sample: The dataset sample containing events, - inputs, and outputs. - original_start_marker: The marker indicating the - start of the editable region. Defaults to - "<|editable_region_start|>". - - Returns: - A dictionary with the formatted prompts and expected outputs. - """ - events = sample["events"] - input = sample["input"] - output = sample["output"] - prompt = zeta_prompt.format(events, input) - - # following the original implementation, extract the focused region - # from the raw output - output_start_index = output.find(original_start_marker) - output_focused_region = output[output_start_index:] - expected_output = output_focused_region - - return {"prompt": prompt, "expected_output": expected_output} - - -class NextEditPredictionDataset(HuggingFaceDataset): - """ - Dataset class for processing a Next Edit Prediction dataset. - """ - - SUPPORTED_DATASET_PATHS = { - "zed-industries/zeta", - } - MAPPING_PROMPT_FUNCS = { - "zed-industries/zeta": _format_zeta_prompt, - } - - def sample(self, tokenizer: PreTrainedTokenizerBase, num_requests: int, **kwargs): - formatting_prompt_func = self.MAPPING_PROMPT_FUNCS.get(self.dataset_path) - if formatting_prompt_func is None: - raise ValueError(f"Unsupported dataset path: {self.dataset_path}") - samples = [] - for sample in self.data: - sample = formatting_prompt_func(sample) - samples.append( - SampleRequest( - prompt=sample["prompt"], - prompt_len=len(tokenizer(sample["prompt"]).input_ids), - expected_output_len=len( - tokenizer(sample["expected_output"]).input_ids - ), - ) - ) - if len(samples) >= num_requests: - break - self.maybe_oversample_requests(samples, num_requests) - return samples - - -# ----------------------------------------------------------------------------- -# ASR Dataset Implementation -# ----------------------------------------------------------------------------- - - -class ASRDataset(HuggingFaceDataset): - """ - Dataset class for processing a ASR dataset for transcription. - Tested on the following set: - - +----------------+----------------------------------------+--------------------------+-----------------------------+ - | Dataset | Domain | Speaking Style | hf-subset | - +----------------+----------------------------------------+--------------------------+-----------------------------+ - | TED-LIUM | TED talks | Oratory | release1, release2, release3| - | | | | release3-speaker-adaptation | - | VoxPopuli | European Parliament | Oratory | en, de, it, fr, ... | - | LibriSpeech | Audiobook | Narrated | "LIUM/tedlium" | - | GigaSpeech | Audiobook, podcast, YouTube | Narrated, spontaneous | xs, s, m, l, xl, dev, test | - | SPGISpeech | Financial meetings | Oratory, spontaneous | S, M, L, dev, test | - | AMI | Meetings | Spontaneous | ihm, sdm | - +----------------+----------------------------------------+--------------------------+-----------------------------+ - - """ # noqa: E501 - - SUPPORTED_DATASET_PATHS = { - "openslr/librispeech_asr", - "facebook/voxpopuli", - "LIUM/tedlium", - "edinburghcstr/ami", - "speechcolab/gigaspeech", - "kensho/spgispeech", - } - - DEFAULT_OUTPUT_LEN = 128 - IS_MULTIMODAL = True - - # TODO Whisper-specific. Abstract interface when more models are supported. 
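    # The preamble below is Whisper's decoder prompt: start-of-transcript,
    # the language token (<|en|>), the task token (<|transcribe|>), and a
    # token that disables timestamp prediction.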
- TRANSCRIPTION_PREAMBLE = "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>" - skip_long_audios: bool = True - - def sample( - self, - tokenizer: PreTrainedTokenizerBase, - num_requests: int, - output_len: Optional[int] = None, - **kwargs, - ) -> list: - import librosa - - output_len = output_len if output_len is not None else self.DEFAULT_OUTPUT_LEN - prompt = ASRDataset.TRANSCRIPTION_PREAMBLE - prompt_len = len(tokenizer(prompt).input_ids) - sampled_requests = [] - skipped = 0 - for item in self.data: - if len(sampled_requests) >= num_requests: - break - audio = item["audio"] - y, sr = audio["array"], audio["sampling_rate"] - duration_s = librosa.get_duration(y=y, sr=sr) - # Whisper max supported duration - if self.skip_long_audios and duration_s > 30: - skipped += 1 - continue - - mm_content = {"audio": (y, sr)} - sampled_requests.append( - SampleRequest( - prompt=prompt, - prompt_len=prompt_len, - expected_output_len=output_len, - multi_modal_data=mm_content, - ) - ) - if skipped: - logger.warning( - "%d samples discarded from dataset due to" - " their length being greater than" - " what Whisper supports.", - skipped, - ) - self.maybe_oversample_requests(sampled_requests, num_requests) - return sampled_requests diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py index c06857247..a7892f3f7 100644 --- a/benchmarks/benchmark_latency.py +++ b/benchmarks/benchmark_latency.py @@ -1,180 +1,17 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Benchmark the latency of processing a single batch of requests.""" +import sys -import argparse -import dataclasses -import json -import os -import time -from typing import Any, Optional - -import numpy as np -from tqdm import tqdm - -import vllm.envs as envs -from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json -from vllm import LLM, SamplingParams -from vllm.engine.arg_utils import EngineArgs -from vllm.inputs import PromptType -from vllm.sampling_params import BeamSearchParams -from vllm.utils import FlexibleArgumentParser - - -def save_to_pytorch_benchmark_format( - args: argparse.Namespace, results: dict[str, Any] -) -> None: - pt_records = convert_to_pytorch_benchmark_format( - args=args, - metrics={"latency": results["latencies"]}, - extra_info={k: results[k] for k in ["avg_latency", "percentiles"]}, - ) - if pt_records: - pt_file = f"{os.path.splitext(args.output_json)[0]}.pytorch.json" - write_to_json(pt_file, pt_records) - - -def main(args: argparse.Namespace): - print(args) - - engine_args = EngineArgs.from_cli_args(args) - - # NOTE(woosuk): If the request cannot be processed in a single batch, - # the engine will automatically process the request in multiple batches. - llm = LLM(**dataclasses.asdict(engine_args)) - assert llm.llm_engine.model_config.max_model_len >= ( - args.input_len + args.output_len - ), ( - "Please ensure that max_model_len is greater than" - " the sum of input_len and output_len." 
- ) - - sampling_params = SamplingParams( - n=args.n, - temperature=1.0, - top_p=1.0, - ignore_eos=True, - max_tokens=args.output_len, - detokenize=not args.disable_detokenize, - ) - print(sampling_params) - dummy_prompt_token_ids = np.random.randint( - 10000, size=(args.batch_size, args.input_len) - ) - dummy_prompts: list[PromptType] = [ - {"prompt_token_ids": batch} for batch in dummy_prompt_token_ids.tolist() - ] - - def llm_generate(): - if not args.use_beam_search: - llm.generate(dummy_prompts, sampling_params=sampling_params, use_tqdm=False) - else: - llm.beam_search( - dummy_prompts, - BeamSearchParams( - beam_width=args.n, - max_tokens=args.output_len, - ignore_eos=True, - ), - ) - - def run_to_completion(profile_dir: Optional[str] = None): - if profile_dir: - llm.start_profile() - llm_generate() - llm.stop_profile() - else: - start_time = time.perf_counter() - llm_generate() - end_time = time.perf_counter() - latency = end_time - start_time - return latency - - print("Warming up...") - for _ in tqdm(range(args.num_iters_warmup), desc="Warmup iterations"): - run_to_completion(profile_dir=None) - - if args.profile: - profile_dir = envs.VLLM_TORCH_PROFILER_DIR - print(f"Profiling (results will be saved to '{profile_dir}')...") - run_to_completion(profile_dir=profile_dir) - return - - # Benchmark. - latencies = [] - for _ in tqdm(range(args.num_iters), desc="Profiling iterations"): - latencies.append(run_to_completion(profile_dir=None)) - latencies = np.array(latencies) - percentages = [10, 25, 50, 75, 90, 99] - percentiles = np.percentile(latencies, percentages) - print(f"Avg latency: {np.mean(latencies)} seconds") - for percentage, percentile in zip(percentages, percentiles): - print(f"{percentage}% percentile latency: {percentile} seconds") - - # Output JSON results if specified - if args.output_json: - results = { - "avg_latency": np.mean(latencies), - "latencies": latencies.tolist(), - "percentiles": dict(zip(percentages, percentiles.tolist())), - } - with open(args.output_json, "w") as f: - json.dump(results, f, indent=4) - save_to_pytorch_benchmark_format(args, results) +if __name__ == "__main__": + print("""DEPRECATED: This script has been moved to the vLLM CLI. +Please use the following command instead: + vllm bench latency -if __name__ == "__main__": - parser = FlexibleArgumentParser( - description="Benchmark the latency of processing a single batch of " - "requests till completion." - ) - parser.add_argument("--input-len", type=int, default=32) - parser.add_argument("--output-len", type=int, default=128) - parser.add_argument("--batch-size", type=int, default=8) - parser.add_argument( - "--n", - type=int, - default=1, - help="Number of generated sequences per prompt.", - ) - parser.add_argument("--use-beam-search", action="store_true") - parser.add_argument( - "--num-iters-warmup", - type=int, - default=10, - help="Number of iterations to run for warmup.", - ) - parser.add_argument( - "--num-iters", type=int, default=30, help="Number of iterations to run." - ) - parser.add_argument( - "--profile", - action="store_true", - help="profile the generation process of a single batch", - ) - parser.add_argument( - "--output-json", - type=str, - default=None, - help="Path to save the latency results in JSON format.", - ) - parser.add_argument( - "--disable-detokenize", - action="store_true", - help=( - "Do not detokenize responses (i.e. 
do not include " - "detokenization time in the latency measurement)" - ), - ) +For help with the new command, run: + vllm bench latency --help - parser = EngineArgs.add_cli_args(parser) - # V1 enables prefix caching by default which skews the latency - # numbers. We need to disable prefix caching by default. - parser.set_defaults(enable_prefix_caching=False) - args = parser.parse_args() - if args.profile and not envs.VLLM_TORCH_PROFILER_DIR: - raise OSError( - "The environment variable 'VLLM_TORCH_PROFILER_DIR' is not set. " - "Please set it to a valid path to use torch profiler." - ) - main(args) +Alternatively, you can run the new command directly with: + python -m vllm.entrypoints.cli.main bench latency --help +""") + sys.exit(1) diff --git a/benchmarks/benchmark_long_document_qa_throughput.py b/benchmarks/benchmark_long_document_qa_throughput.py index 00869fa94..6e0f3b51c 100644 --- a/benchmarks/benchmark_long_document_qa_throughput.py +++ b/benchmarks/benchmark_long_document_qa_throughput.py @@ -142,7 +142,7 @@ def main(args): ) -if __name__ == "__main__": +def create_argument_parser(): parser = FlexibleArgumentParser( description="Benchmark the performance with or " "without automatic prefix caching." @@ -192,5 +192,11 @@ def main(args): ) parser = EngineArgs.add_cli_args(parser) + + return parser + + +if __name__ == "__main__": + parser = create_argument_parser() args = parser.parse_args() main(args) diff --git a/benchmarks/benchmark_ngram_proposer.py b/benchmarks/benchmark_ngram_proposer.py new file mode 100644 index 000000000..d4b83edbd --- /dev/null +++ b/benchmarks/benchmark_ngram_proposer.py @@ -0,0 +1,213 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import gc +import time +from unittest import mock + +import numpy as np +from tabulate import tabulate + +from benchmark_utils import TimeCollector +from vllm.config import ( + CacheConfig, + DeviceConfig, + LoadConfig, + ModelConfig, + ParallelConfig, + SchedulerConfig, + SpeculativeConfig, + VllmConfig, +) +from vllm.platforms import current_platform +from vllm.utils import FlexibleArgumentParser +from vllm.v1.spec_decode.ngram_proposer import NgramProposer +from vllm.v1.worker.gpu_input_batch import InputBatch +from vllm.v1.worker.gpu_model_runner import GPUModelRunner + + +def benchmark_propose(args): + rows = [] + for max_ngram in args.max_ngram: + collector = TimeCollector(TimeCollector.US) + + model_config = ModelConfig( + model="facebook/opt-125m", + task="generate", + max_model_len=args.num_token + args.num_spec_token, + tokenizer="facebook/opt-125m", + tokenizer_mode="auto", + dtype="auto", + seed=None, + trust_remote_code=False, + ) + proposer = NgramProposer( + vllm_config=VllmConfig( + model_config=model_config, + speculative_config=SpeculativeConfig( + prompt_lookup_min=args.min_ngram, + prompt_lookup_max=max_ngram, + num_speculative_tokens=args.num_spec_token, + method="ngram", + ), + ) + ) + + # Warm up + proposer.propose(np.random.randint(0, 20, (args.num_token,))) + + gc.collect() + for _ in range(args.num_iteration): + tokens = np.random.randint(0, 20, (args.num_req, args.num_token)) + with collector: + for i in range(args.num_req): + proposer.propose(tokens[i, :]) + rows.append( + [args.num_req, args.num_token, args.min_ngram, max_ngram] + + collector.dump_avg_max() + ) + + print( + tabulate( + rows, + headers=[ + "# Request", + "# Token", + "Min Ngram", + "Max Ngram", + "Avg (us)", + "Max (us)", + ], + tablefmt="grid", + floatfmt=".3f", + ) + ) 
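For context on what `proposer.propose` is being timed on above, the sketch below illustrates prompt-lookup n-gram drafting in its simplest form. It is only an illustrative stand-in, not vLLM's `NgramProposer` implementation: match the trailing `min_ngram`-to-`max_ngram` tokens of the history against earlier positions and, if a match is found, propose the tokens that followed it as draft tokens.

```python
# Minimal, illustrative prompt-lookup drafting. Assumes nothing about vLLM's
# internals; it only mirrors the knobs the benchmark sweeps (min/max n-gram
# size and the number of speculative tokens).
import numpy as np


def ngram_propose(
    token_ids: np.ndarray,
    min_ngram: int = 3,
    max_ngram: int = 5,
    num_spec_tokens: int = 3,
) -> np.ndarray:
    """Return up to num_spec_tokens draft tokens, or an empty array."""
    n = len(token_ids)
    # Prefer the longest matching n-gram; longer matches are more reliable.
    for k in range(min(max_ngram, n - 1), min_ngram - 1, -1):
        suffix = token_ids[n - k:]
        # Scan from the most recent candidate position backwards.
        for start in range(n - k - 1, -1, -1):
            if np.array_equal(token_ids[start:start + k], suffix):
                draft = token_ids[start + k:start + k + num_spec_tokens]
                if len(draft) > 0:
                    return draft
    return np.empty(0, dtype=token_ids.dtype)


if __name__ == "__main__":
    history = np.array([1, 2, 3, 4, 5, 1, 2, 3])
    # The trailing [1, 2, 3] also appears at the start, so [4, 5, 1] is drafted.
    print(ngram_propose(history, min_ngram=2, max_ngram=3))
```

In the benchmark above this operation is repeated for every request in the batch on each iteration, which is why the measured cost grows with both the request count and the per-request token count.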
+ + +def benchmark_batched_propose(args): + NUM_SPECULATIVE_TOKENS_NGRAM = 10 + PROMPT_LOOKUP_MIN = 5 + PROMPT_LOOKUP_MAX = 15 + MAX_MODEL_LEN = int(1e7) + DEVICE = current_platform.device_type + + model_config = ModelConfig(model="facebook/opt-125m", runner="generate") + + speculative_config = SpeculativeConfig( + target_model_config=model_config, + target_parallel_config=ParallelConfig(), + method="ngram", + num_speculative_tokens=NUM_SPECULATIVE_TOKENS_NGRAM, + prompt_lookup_max=PROMPT_LOOKUP_MAX, + prompt_lookup_min=PROMPT_LOOKUP_MIN, + ) + + vllm_config = VllmConfig( + model_config=model_config, + cache_config=CacheConfig(), + speculative_config=speculative_config, + device_config=DeviceConfig(device=current_platform.device_type), + parallel_config=ParallelConfig(), + load_config=LoadConfig(), + scheduler_config=SchedulerConfig(), + ) + + # monkey patch vllm.v1.worker.gpu_model_runner.get_pp_group + mock_pp_group = mock.MagicMock() + mock_pp_group.world_size = 1 + with mock.patch( + "vllm.v1.worker.gpu_model_runner.get_pp_group", return_value=mock_pp_group + ): + runner = GPUModelRunner(vllm_config, DEVICE) + + # hack max model len + runner.max_model_len = MAX_MODEL_LEN + runner.drafter.max_model_len = MAX_MODEL_LEN + + dummy_input_batch = InputBatch( + max_num_reqs=args.num_req, + max_model_len=MAX_MODEL_LEN, + max_num_batched_tokens=args.num_req * args.num_token, + device=DEVICE, + pin_memory=False, + vocab_size=256000, + block_sizes=[16], + ) + dummy_input_batch._req_ids = list(str(id) for id in range(args.num_req)) + dummy_input_batch.spec_decode_unsupported_reqs = () + dummy_input_batch.num_tokens_no_spec = [args.num_token] * args.num_req + dummy_input_batch.token_ids_cpu = np.random.randint( + 0, 20, (args.num_req, args.num_token) + ) + + runner.input_batch = dummy_input_batch + + sampled_token_ids = [[0]] * args.num_req + + print("Starting benchmark") + # first run is warmup so ignore it + for _ in range(args.num_iteration): + start = time.time() + runner.drafter.propose( + sampled_token_ids, + dummy_input_batch.req_ids, + dummy_input_batch.num_tokens_no_spec, + dummy_input_batch.token_ids_cpu, + dummy_input_batch.spec_decode_unsupported_reqs, + ) + end = time.time() + print(f"Iteration time (s): {end - start}") + + +def invoke_main() -> None: + parser = FlexibleArgumentParser( + description="Benchmark the performance of N-gram speculative decode drafting" + ) + parser.add_argument( + "--batched", action="store_true", help="consider time to prepare batch" + ) # noqa: E501 + parser.add_argument( + "--num-iteration", + type=int, + default=100, + help="Number of iterations to run to stabilize final data readings", + ) + parser.add_argument( + "--num-req", type=int, default=128, help="Number of requests in the batch" + ) + parser.add_argument( + "--num-token", type=int, default=1500, help="Number of tokens for each request" + ) + parser.add_argument( + "--min-ngram", + type=int, + default=3, + help="Minimum n-gram to match", + ) + parser.add_argument( + "--max-ngram", + type=int, + nargs="*", + default=[5, 7, 10, 15, 20], + help="Maximum n-gram to match", + ) + parser.add_argument( + "--num-spec-token", + type=int, + default=3, + help="Number of speculative tokens to generate", + ) + args = parser.parse_args() + + if not args.batched: + benchmark_propose(args) + else: + benchmark_batched_propose(args) + + +""" +# Example command lines: +# time python3 benchmarks/benchmark_ngram_proposer.py +# time python3 benchmarks/benchmark_ngram_proposer.py --batched --num-iteration 4 
--num-token 1000000 --num-req 128 +""" # noqa: E501 +if __name__ == "__main__": + invoke_main() # pragma: no cover diff --git a/benchmarks/benchmark_prefix_caching.py b/benchmarks/benchmark_prefix_caching.py index 3e4704f0b..b5e2613de 100644 --- a/benchmarks/benchmark_prefix_caching.py +++ b/benchmarks/benchmark_prefix_caching.py @@ -218,7 +218,7 @@ def main(args): ) -if __name__ == "__main__": +def create_argument_parser(): parser = FlexibleArgumentParser( description="Benchmark the performance with or without " "automatic prefix caching." @@ -268,5 +268,11 @@ def main(args): ) parser = EngineArgs.add_cli_args(parser) + + return parser + + +if __name__ == "__main__": + parser = create_argument_parser() args = parser.parse_args() main(args) diff --git a/benchmarks/benchmark_prioritization.py b/benchmarks/benchmark_prioritization.py index 5496703f2..bb453791c 100644 --- a/benchmarks/benchmark_prioritization.py +++ b/benchmarks/benchmark_prioritization.py @@ -161,7 +161,7 @@ def main(args: argparse.Namespace): json.dump(results, f, indent=4) -if __name__ == "__main__": +def create_argument_parser(): parser = FlexibleArgumentParser(description="Benchmark the throughput.") parser.add_argument( "--backend", type=str, choices=["vllm", "hf", "mii"], default="vllm" @@ -204,6 +204,12 @@ def main(args: argparse.Namespace): ) parser = EngineArgs.add_cli_args(parser) + + return parser + + +if __name__ == "__main__": + parser = create_argument_parser() args = parser.parse_args() if args.tokenizer is None: args.tokenizer = args.model diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index 81428fb7d..76cf51498 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -1,1230 +1,17 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -r"""Benchmark online serving throughput. - -On the server side, run one of the following commands: - vLLM OpenAI API server - vllm serve \ - --swap-space 16 \ - --disable-log-requests - -On the client side, run: - python benchmarks/benchmark_serving.py \ - --backend \ - --model \ - --dataset-name sharegpt \ - --dataset-path \ - --request-rate \ # By default is inf - --num-prompts # By default is 1000 - - when using tgi backend, add - --endpoint /generate_stream - to the end of the command above. 
-""" - -import argparse -import asyncio -import gc -import json -import os -import random -import time -import warnings -from collections.abc import AsyncGenerator, Iterable -from dataclasses import dataclass -from datetime import datetime -from typing import Any, Optional - -import numpy as np -from tqdm.asyncio import tqdm -from transformers import PreTrainedTokenizerBase - -from backend_request_func import ( - ASYNC_REQUEST_FUNCS, - OPENAI_COMPATIBLE_BACKENDS, - RequestFuncInput, - RequestFuncOutput, -) - -try: - from vllm.transformers_utils.tokenizer import get_tokenizer -except ImportError: - from backend_request_func import get_tokenizer - -try: - from vllm.utils import FlexibleArgumentParser -except ImportError: - from argparse import ArgumentParser as FlexibleArgumentParser - -from benchmark_dataset import ( - AIMODataset, - ASRDataset, - BurstGPTDataset, - ConversationDataset, - CustomDataset, - HuggingFaceDataset, - InstructCoderDataset, - MTBenchDataset, - NextEditPredictionDataset, - RandomDataset, - SampleRequest, - ShareGPTDataset, - SonnetDataset, - VisionArenaDataset, -) -from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json - -MILLISECONDS_TO_SECONDS_CONVERSION = 1000 - - -@dataclass -class BenchmarkMetrics: - completed: int - total_input: int - total_output: int - request_throughput: float - request_goodput: float - output_throughput: float - total_token_throughput: float - mean_ttft_ms: float - median_ttft_ms: float - std_ttft_ms: float - percentiles_ttft_ms: list[tuple[float, float]] - mean_tpot_ms: float - median_tpot_ms: float - std_tpot_ms: float - percentiles_tpot_ms: list[tuple[float, float]] - mean_itl_ms: float - median_itl_ms: float - std_itl_ms: float - percentiles_itl_ms: list[tuple[float, float]] - # E2EL stands for end-to-end latency per request. - # It is the time taken on the client side from sending - # a request to receiving a complete response. - mean_e2el_ms: float - median_e2el_ms: float - std_e2el_ms: float - percentiles_e2el_ms: list[tuple[float, float]] - - -async def get_request( - input_requests: list[SampleRequest], - request_rate: float, - burstiness: float = 1.0, -) -> AsyncGenerator[SampleRequest, None]: - """ - Asynchronously generates requests at a specified rate - with OPTIONAL burstiness. - - Args: - input_requests: - A list of input requests, each represented as a SampleRequest. - request_rate: - The rate at which requests are generated (requests/s). - burstiness (optional): - The burstiness factor of the request generation. - Only takes effect when request_rate is not inf. - Default value is 1, which follows a Poisson process. - Otherwise, the request intervals follow a gamma distribution. - A lower burstiness value (0 < burstiness < 1) results - in more bursty requests, while a higher burstiness value - (burstiness > 1) results in a more uniform arrival of requests. - """ - input_requests: Iterable[SampleRequest] = iter(input_requests) - - # Calculate scale parameter theta to maintain the desired request_rate. - assert burstiness > 0, ( - f"A positive burstiness factor is expected, but given {burstiness}." - ) - theta = 1.0 / (request_rate * burstiness) - - for request in input_requests: - yield request - - if request_rate == float("inf"): - # If the request rate is infinity, then we don't need to wait. - continue - - # Sample the request interval from the gamma distribution. - # If burstiness is 1, it follows exponential distribution. 
- interval = np.random.gamma(shape=burstiness, scale=theta) - # The next request will be sent after the interval. - await asyncio.sleep(interval) - - -def calculate_metrics( - input_requests: list[SampleRequest], - outputs: list[RequestFuncOutput], - dur_s: float, - tokenizer: PreTrainedTokenizerBase, - selected_percentile_metrics: list[str], - selected_percentiles: list[float], - goodput_config_dict: dict[str, float], -) -> tuple[BenchmarkMetrics, list[int]]: - actual_output_lens: list[int] = [] - total_input = 0 - completed = 0 - good_completed = 0 - itls: list[float] = [] - tpots: list[float] = [] - all_tpots: list[float] = [] - ttfts: list[float] = [] - e2els: list[float] = [] - for i in range(len(outputs)): - if outputs[i].success: - output_len = outputs[i].output_tokens - - if not output_len: - # We use the tokenizer to count the number of output tokens - # for some serving backends instead of looking at - # len(outputs[i].itl) since multiple output tokens may be - # bundled together - # Note : this may inflate the output token count slightly - output_len = len( - tokenizer( - outputs[i].generated_text, add_special_tokens=False - ).input_ids - ) - actual_output_lens.append(output_len) - total_input += input_requests[i].prompt_len - tpot = 0 - if output_len > 1: - latency_minus_ttft = outputs[i].latency - outputs[i].ttft - tpot = latency_minus_ttft / (output_len - 1) - tpots.append(tpot) - # Note: if output_len <= 1, we regard tpot as 0 for goodput - all_tpots.append(tpot) - itls += outputs[i].itl - ttfts.append(outputs[i].ttft) - e2els.append(outputs[i].latency) - completed += 1 - else: - actual_output_lens.append(0) - - if goodput_config_dict: - valid_metrics = [] - slo_values = [] - - if "ttft" in goodput_config_dict: - valid_metrics.append(ttfts) - slo_values.append( - goodput_config_dict["ttft"] / MILLISECONDS_TO_SECONDS_CONVERSION - ) - if "tpot" in goodput_config_dict: - valid_metrics.append(all_tpots) - slo_values.append( - goodput_config_dict["tpot"] / MILLISECONDS_TO_SECONDS_CONVERSION - ) - if "e2el" in goodput_config_dict: - valid_metrics.append(e2els) - slo_values.append( - goodput_config_dict["e2el"] / MILLISECONDS_TO_SECONDS_CONVERSION - ) - - for req_metric in zip(*valid_metrics): - is_good_req = all([s >= r for s, r in zip(slo_values, req_metric)]) - if is_good_req: - good_completed += 1 - - if completed == 0: - warnings.warn( - "All requests failed. 
This is likely due to a misconfiguration " - "on the benchmark arguments.", - stacklevel=2, - ) - metrics = BenchmarkMetrics( - completed=completed, - total_input=total_input, - total_output=sum(actual_output_lens), - request_throughput=completed / dur_s, - request_goodput=good_completed / dur_s, - output_throughput=sum(actual_output_lens) / dur_s, - total_token_throughput=(total_input + sum(actual_output_lens)) / dur_s, - mean_ttft_ms=np.mean(ttfts or 0) - * 1000, # ttfts is empty if streaming is not supported by backend - std_ttft_ms=np.std(ttfts or 0) * 1000, - median_ttft_ms=np.median(ttfts or 0) * 1000, - percentiles_ttft_ms=[ - (p, np.percentile(ttfts or 0, p) * 1000) for p in selected_percentiles - ], - mean_tpot_ms=np.mean(tpots or 0) * 1000, - std_tpot_ms=np.std(tpots or 0) * 1000, - median_tpot_ms=np.median(tpots or 0) * 1000, - percentiles_tpot_ms=[ - (p, np.percentile(tpots or 0, p) * 1000) for p in selected_percentiles - ], - mean_itl_ms=np.mean(itls or 0) * 1000, - std_itl_ms=np.std(itls or 0) * 1000, - median_itl_ms=np.median(itls or 0) * 1000, - percentiles_itl_ms=[ - (p, np.percentile(itls or 0, p) * 1000) for p in selected_percentiles - ], - mean_e2el_ms=np.mean(e2els or 0) * 1000, - std_e2el_ms=np.std(e2els or 0) * 1000, - median_e2el_ms=np.median(e2els or 0) * 1000, - percentiles_e2el_ms=[ - (p, np.percentile(e2els or 0, p) * 1000) for p in selected_percentiles - ], - ) - - return metrics, actual_output_lens - - -async def benchmark( - backend: str, - api_url: str, - base_url: str, - model_id: str, - model_name: str, - tokenizer: PreTrainedTokenizerBase, - input_requests: list[SampleRequest], - logprobs: Optional[int], - request_rate: float, - burstiness: float, - disable_tqdm: bool, - profile: bool, - selected_percentile_metrics: list[str], - selected_percentiles: list[float], - ignore_eos: bool, - goodput_config_dict: dict[str, float], - max_concurrency: Optional[int], - lora_modules: Optional[Iterable[str]], - extra_body: Optional[dict], -): - if backend in ASYNC_REQUEST_FUNCS: - request_func = ASYNC_REQUEST_FUNCS[backend] - else: - raise ValueError(f"Unknown backend: {backend}") - - print("Starting initial single prompt test run...") - test_prompt, test_prompt_len, test_output_len, test_mm_content = ( - input_requests[0].prompt, - input_requests[0].prompt_len, - input_requests[0].expected_output_len, - input_requests[0].multi_modal_data, - ) - - assert test_mm_content is None or isinstance(test_mm_content, dict) - test_input = RequestFuncInput( - model=model_id, - model_name=model_name, - prompt=test_prompt, - api_url=api_url, - prompt_len=test_prompt_len, - output_len=test_output_len, - logprobs=logprobs, - multi_modal_content=test_mm_content, - ignore_eos=ignore_eos, - extra_body=extra_body, - ) - - test_output = await request_func(request_func_input=test_input) - if not test_output.success: - raise ValueError( - "Initial test run failed - Please make sure benchmark arguments " - f"are correctly specified. Error: {test_output.error}" - ) - else: - print("Initial test run completed. Starting main benchmark run...") - - if lora_modules: - # For each input request, choose a LoRA module at random. 
- lora_modules = iter( - [random.choice(lora_modules) for _ in range(len(input_requests))] - ) - - if profile: - print("Starting profiler...") - profile_input = RequestFuncInput( - model=model_id, - model_name=model_name, - prompt=test_prompt, - api_url=base_url + "/start_profile", - prompt_len=test_prompt_len, - output_len=test_output_len, - logprobs=logprobs, - multi_modal_content=test_mm_content, - ignore_eos=ignore_eos, - extra_body=extra_body, - ) - profile_output = await request_func(request_func_input=profile_input) - if profile_output.success: - print("Profiler started") - - distribution = "Poisson process" if burstiness == 1.0 else "Gamma distribution" - - print(f"Traffic request rate: {request_rate}") - print(f"Burstiness factor: {burstiness} ({distribution})") - print(f"Maximum request concurrency: {max_concurrency}") - - pbar = None if disable_tqdm else tqdm(total=len(input_requests)) - - # This can be used once the minimum Python version is 3.10 or higher, - # and it will simplify the code in limited_request_func. - # semaphore = (asyncio.Semaphore(max_concurrency) - # if max_concurrency else contextlib.nullcontext()) - semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else None - - async def limited_request_func(request_func_input, pbar): - if semaphore is None: - return await request_func(request_func_input=request_func_input, pbar=pbar) - async with semaphore: - return await request_func(request_func_input=request_func_input, pbar=pbar) - - benchmark_start_time = time.perf_counter() - tasks: list[asyncio.Task] = [] - async for request in get_request(input_requests, request_rate, burstiness): - prompt, prompt_len, output_len, mm_content = ( - request.prompt, - request.prompt_len, - request.expected_output_len, - request.multi_modal_data, - ) - req_model_id, req_model_name = model_id, model_name - if lora_modules: - req_lora_module = next(lora_modules) - req_model_id, req_model_name = req_lora_module, req_lora_module - - request_func_input = RequestFuncInput( - model=req_model_id, - model_name=req_model_name, - prompt=prompt, - api_url=api_url, - prompt_len=prompt_len, - output_len=output_len, - logprobs=logprobs, - multi_modal_content=mm_content, - ignore_eos=ignore_eos, - extra_body=extra_body, - ) - tasks.append( - asyncio.create_task( - limited_request_func(request_func_input=request_func_input, pbar=pbar) - ) - ) - outputs: list[RequestFuncOutput] = await asyncio.gather(*tasks) - - if profile: - print("Stopping profiler...") - profile_input = RequestFuncInput( - model=model_id, - prompt=test_prompt, - api_url=base_url + "/stop_profile", - prompt_len=test_prompt_len, - output_len=test_output_len, - logprobs=logprobs, - ) - profile_output = await request_func(request_func_input=profile_input) - if profile_output.success: - print("Profiler stopped") - - if pbar is not None: - pbar.close() - - benchmark_duration = time.perf_counter() - benchmark_start_time - - metrics, actual_output_lens = calculate_metrics( - input_requests=input_requests, - outputs=outputs, - dur_s=benchmark_duration, - tokenizer=tokenizer, - selected_percentile_metrics=selected_percentile_metrics, - selected_percentiles=selected_percentiles, - goodput_config_dict=goodput_config_dict, - ) - - print("{s:{c}^{n}}".format(s=" Serving Benchmark Result ", n=50, c="=")) - print("{:<40} {:<10}".format("Successful requests:", metrics.completed)) - print("{:<40} {:<10.2f}".format("Benchmark duration (s):", benchmark_duration)) - print("{:<40} {:<10}".format("Total input tokens:", 
metrics.total_input)) - print("{:<40} {:<10}".format("Total generated tokens:", metrics.total_output)) - print( - "{:<40} {:<10.2f}".format( - "Request throughput (req/s):", metrics.request_throughput - ) - ) - if goodput_config_dict: - print( - "{:<40} {:<10.2f}".format( - "Request goodput (req/s):", metrics.request_goodput - ) - ) - print( - "{:<40} {:<10.2f}".format( - "Output token throughput (tok/s):", metrics.output_throughput - ) - ) - print( - "{:<40} {:<10.2f}".format( - "Total Token throughput (tok/s):", metrics.total_token_throughput - ) - ) - - result = { - "duration": benchmark_duration, - "completed": metrics.completed, - "total_input_tokens": metrics.total_input, - "total_output_tokens": metrics.total_output, - "request_throughput": metrics.request_throughput, - "request_goodput:": metrics.request_goodput if goodput_config_dict else None, - "output_throughput": metrics.output_throughput, - "total_token_throughput": metrics.total_token_throughput, - "input_lens": [output.prompt_len for output in outputs], - "output_lens": actual_output_lens, - "ttfts": [output.ttft for output in outputs], - "itls": [output.itl for output in outputs], - "generated_texts": [output.generated_text for output in outputs], - "errors": [output.error for output in outputs], - } - - def process_one_metric( - # E.g., "ttft" - metric_attribute_name: str, - # E.g., "TTFT" - metric_name: str, - # E.g., "Time to First Token" - metric_header: str, - ): - # This function prints and adds statistics of the specified - # metric. - if metric_attribute_name not in selected_percentile_metrics: - return - print("{s:{c}^{n}}".format(s=metric_header, n=50, c="-")) - print( - "{:<40} {:<10.2f}".format( - f"Mean {metric_name} (ms):", - getattr(metrics, f"mean_{metric_attribute_name}_ms"), - ) - ) - print( - "{:<40} {:<10.2f}".format( - f"Median {metric_name} (ms):", - getattr(metrics, f"median_{metric_attribute_name}_ms"), - ) - ) - result[f"mean_{metric_attribute_name}_ms"] = getattr( - metrics, f"mean_{metric_attribute_name}_ms" - ) - result[f"median_{metric_attribute_name}_ms"] = getattr( - metrics, f"median_{metric_attribute_name}_ms" - ) - result[f"std_{metric_attribute_name}_ms"] = getattr( - metrics, f"std_{metric_attribute_name}_ms" - ) - for p, value in getattr(metrics, f"percentiles_{metric_attribute_name}_ms"): - p_word = str(int(p)) if int(p) == p else str(p) - print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name} (ms):", value)) - result[f"p{p_word}_{metric_attribute_name}_ms"] = value - - process_one_metric("ttft", "TTFT", "Time to First Token") - process_one_metric("tpot", "TPOT", "Time per Output Token (excl. 1st token)") - process_one_metric("itl", "ITL", "Inter-token Latency") - process_one_metric("e2el", "E2EL", "End-to-end Latency") - - print("=" * 50) - - return result - - -def check_goodput_args(args): - # Check and parse goodput arguments - goodput_config_dict = {} - VALID_NAMES = ["ttft", "tpot", "e2el"] - if args.goodput: - goodput_config_dict = parse_goodput(args.goodput) - for slo_name, slo_val in goodput_config_dict.items(): - if slo_name not in VALID_NAMES: - raise ValueError( - f"Invalid metric name found, {slo_name}: {slo_val}. " - "The service level objective name should be one of " - f"{str(VALID_NAMES)}. " - ) - if slo_val < 0: - raise ValueError( - f"Invalid value found, {slo_name}: {slo_val}. " - "The service level objective value should be " - "non-negative." 
- ) - return goodput_config_dict - - -def parse_goodput(slo_pairs): - goodput_config_dict = {} - try: - for slo_pair in slo_pairs: - slo_name, slo_val = slo_pair.split(":") - goodput_config_dict[slo_name] = float(slo_val) - except ValueError as err: - raise argparse.ArgumentTypeError( - "Invalid format found for service level objectives. " - 'Specify service level objectives for goodput as "KEY:VALUE" ' - "pairs, where the key is a metric name, and the value is a " - "number in milliseconds." - ) from err - return goodput_config_dict - - -def save_to_pytorch_benchmark_format( - args: argparse.Namespace, results: dict[str, Any], file_name: str -) -> None: - metrics = [ - "median_ttft_ms", - "mean_ttft_ms", - "std_ttft_ms", - "p99_ttft_ms", - "mean_tpot_ms", - "median_tpot_ms", - "std_tpot_ms", - "p99_tpot_ms", - "median_itl_ms", - "mean_itl_ms", - "std_itl_ms", - "p99_itl_ms", - ] - # These raw data might be useful, but they are rather big. They can be added - # later if needed - ignored_metrics = ["ttfts", "itls", "generated_texts", "errors"] - pt_records = convert_to_pytorch_benchmark_format( - args=args, - metrics={k: [results[k]] for k in metrics}, - extra_info={ - k: results[k] - for k in results - if k not in metrics and k not in ignored_metrics - }, - ) - if pt_records: - # Don't use json suffix here as we don't want CI to pick it up - pt_file = f"{os.path.splitext(file_name)[0]}.pytorch.json" - write_to_json(pt_file, pt_records) - - -def main(args: argparse.Namespace): - print(args) - random.seed(args.seed) - np.random.seed(args.seed) - - backend = args.backend - model_id = args.model - model_name = args.served_model_name - tokenizer_id = args.tokenizer if args.tokenizer is not None else args.model - tokenizer_mode = args.tokenizer_mode - - if args.base_url is not None: - api_url = f"{args.base_url}{args.endpoint}" - base_url = f"{args.base_url}" - else: - api_url = f"http://{args.host}:{args.port}{args.endpoint}" - base_url = f"http://{args.host}:{args.port}" - - tokenizer = get_tokenizer( - tokenizer_id, - tokenizer_mode=tokenizer_mode, - trust_remote_code=args.trust_remote_code, - ) - - if args.dataset_name is None: - raise ValueError( - "Please specify '--dataset-name' and the corresponding " - "'--dataset-path' if required." - ) - - if args.dataset_name == "custom": - dataset = CustomDataset(dataset_path=args.dataset_path) - input_requests = dataset.sample( - num_requests=args.num_prompts, - tokenizer=tokenizer, - output_len=args.custom_output_len, - skip_chat_template=args.custom_skip_chat_template, - ) - - elif args.dataset_name == "sonnet": - dataset = SonnetDataset(dataset_path=args.dataset_path) - # For the "sonnet" dataset, formatting depends on the backend. - if args.backend == "openai-chat": - input_requests = dataset.sample( - num_requests=args.num_prompts, - input_len=args.sonnet_input_len, - output_len=args.sonnet_output_len, - prefix_len=args.sonnet_prefix_len, - tokenizer=tokenizer, - return_prompt_formatted=False, - ) - else: - assert tokenizer.chat_template or tokenizer.default_chat_template, ( - "Tokenizer/model must have chat template for sonnet dataset." 
- ) - input_requests = dataset.sample( - num_requests=args.num_prompts, - input_len=args.sonnet_input_len, - output_len=args.sonnet_output_len, - prefix_len=args.sonnet_prefix_len, - tokenizer=tokenizer, - return_prompt_formatted=True, - ) - - elif args.dataset_name == "hf": - # all following datasets are implemented from the - # HuggingFaceDataset base class - if args.dataset_path in VisionArenaDataset.SUPPORTED_DATASET_PATHS: - dataset_class = VisionArenaDataset - args.hf_split = "train" - args.hf_subset = None - elif args.dataset_path in InstructCoderDataset.SUPPORTED_DATASET_PATHS: - dataset_class = InstructCoderDataset - args.hf_split = "train" - elif args.dataset_path in MTBenchDataset.SUPPORTED_DATASET_PATHS: - dataset_class = MTBenchDataset - args.hf_split = "train" - elif args.dataset_path in ConversationDataset.SUPPORTED_DATASET_PATHS: - dataset_class = ConversationDataset - elif args.dataset_path in AIMODataset.SUPPORTED_DATASET_PATHS: - dataset_class = AIMODataset - args.hf_split = "train" - elif args.dataset_path in NextEditPredictionDataset.SUPPORTED_DATASET_PATHS: # noqa: E501 - dataset_class = NextEditPredictionDataset - args.hf_split = "train" - elif args.dataset_path in ASRDataset.SUPPORTED_DATASET_PATHS: - dataset_class = ASRDataset - args.hf_split = "train" - else: - supported_datasets = set( - [ - dataset_name - for cls in HuggingFaceDataset.__subclasses__() - for dataset_name in cls.SUPPORTED_DATASET_PATHS - ] - ) - raise ValueError( - f"Unsupported dataset path: {args.dataset_path}. " - "Huggingface dataset only supports dataset_path" - f" from one of following: {supported_datasets}. " - "Please consider contributing if you would " - "like to add support for additional dataset formats." - ) - - if dataset_class.IS_MULTIMODAL and backend not in [ - "openai-chat", - "openai-audio", - ]: - # multi-modal benchmark is only available on OpenAI Chat backend. - raise ValueError( - "Multi-modal content is only supported on 'openai-chat' and " - "'openai-audio' backend." - ) - input_requests = dataset_class( - dataset_path=args.dataset_path, - dataset_subset=args.hf_subset, - dataset_split=args.hf_split, - random_seed=args.seed, - ).sample( - num_requests=args.num_prompts, - tokenizer=tokenizer, - output_len=args.hf_output_len, - ) - - else: - # For datasets that follow a similar structure, use a mapping. - dataset_mapping = { - "sharegpt": lambda: ShareGPTDataset( - random_seed=args.seed, dataset_path=args.dataset_path - ).sample( - tokenizer=tokenizer, - num_requests=args.num_prompts, - output_len=args.sharegpt_output_len, - ), - "burstgpt": lambda: BurstGPTDataset( - random_seed=args.seed, dataset_path=args.dataset_path - ).sample(tokenizer=tokenizer, num_requests=args.num_prompts), - "random": lambda: RandomDataset(dataset_path=args.dataset_path).sample( - tokenizer=tokenizer, - num_requests=args.num_prompts, - prefix_len=args.random_prefix_len, - input_len=args.random_input_len, - output_len=args.random_output_len, - range_ratio=args.random_range_ratio, - ), - } - - try: - input_requests = dataset_mapping[args.dataset_name]() - except KeyError as err: - raise ValueError(f"Unknown dataset: {args.dataset_name}") from err - goodput_config_dict = check_goodput_args(args) - - # Collect the sampling parameters. - sampling_params = { - k: v - for k, v in { - "top_p": args.top_p, - "top_k": args.top_k, - "min_p": args.min_p, - "temperature": args.temperature, - }.items() - if v is not None - } - - # Sampling parameters are only supported by openai-compatible backend. 
- if sampling_params and args.backend not in OPENAI_COMPATIBLE_BACKENDS: - raise ValueError( - "Sampling parameters are only supported by openai-compatible backends." - ) - - if "temperature" not in sampling_params: - sampling_params["temperature"] = 0.0 # Default to greedy decoding. - - if args.backend == "llama.cpp": - # Disable prompt caching in llama.cpp backend - sampling_params["cache_prompt"] = False - - # Avoid GC processing "static" data - reduce pause times. - gc.collect() - gc.freeze() - - benchmark_result = asyncio.run( - benchmark( - backend=backend, - api_url=api_url, - base_url=base_url, - model_id=model_id, - model_name=model_name, - tokenizer=tokenizer, - input_requests=input_requests, - logprobs=args.logprobs, - request_rate=args.request_rate, - burstiness=args.burstiness, - disable_tqdm=args.disable_tqdm, - profile=args.profile, - selected_percentile_metrics=args.percentile_metrics.split(","), - selected_percentiles=[float(p) for p in args.metric_percentiles.split(",")], - ignore_eos=args.ignore_eos, - goodput_config_dict=goodput_config_dict, - max_concurrency=args.max_concurrency, - lora_modules=args.lora_modules, - extra_body=sampling_params, - ) - ) - - # Save config and results to json - if args.save_result or args.append_result: - result_json: dict[str, Any] = {} - - # Setup - current_dt = datetime.now().strftime("%Y%m%d-%H%M%S") - result_json["date"] = current_dt - result_json["backend"] = backend - result_json["model_id"] = model_id - result_json["tokenizer_id"] = tokenizer_id - result_json["num_prompts"] = args.num_prompts - - # Metadata - if args.metadata: - for item in args.metadata: - if "=" in item: - kvstring = item.split("=") - result_json[kvstring[0].strip()] = kvstring[1].strip() - else: - raise ValueError( - "Invalid metadata format. Please use KEY=VALUE format." - ) - # Traffic - result_json["request_rate"] = ( - args.request_rate if args.request_rate < float("inf") else "inf" - ) - result_json["burstiness"] = args.burstiness - result_json["max_concurrency"] = args.max_concurrency - - # Merge with benchmark result - result_json = {**result_json, **benchmark_result} - - if not args.save_detailed: - # Remove fields with too many data points - for field in [ - "input_lens", - "output_lens", - "ttfts", - "itls", - "generated_texts", - "errors", - ]: - if field in result_json: - del result_json[field] - if field in benchmark_result: - del benchmark_result[field] - - # Save to file - base_model_id = model_id.split("/")[-1] - max_concurrency_str = ( - f"-concurrency{args.max_concurrency}" - if args.max_concurrency is not None - else "" - ) - file_name = f"{backend}-{args.request_rate}qps{max_concurrency_str}-{base_model_id}-{current_dt}.json" # noqa - if args.result_filename: - file_name = args.result_filename - if args.result_dir: - os.makedirs(args.result_dir, exist_ok=True) - file_name = os.path.join(args.result_dir, file_name) - with open( - file_name, mode="a+" if args.append_result else "w", encoding="utf-8" - ) as outfile: - # Append a newline. - if args.append_result and outfile.tell() != 0: - outfile.write("\n") - json.dump(result_json, outfile) - save_to_pytorch_benchmark_format(args, result_json, file_name) - +import sys if __name__ == "__main__": - parser = FlexibleArgumentParser( - description="Benchmark the online serving throughput." 
- ) - parser.add_argument( - "--backend", - type=str, - default="vllm", - choices=list(ASYNC_REQUEST_FUNCS.keys()), - ) - parser.add_argument( - "--base-url", - type=str, - default=None, - help="Server or API base url if not using http host and port.", - ) - # Use 127.0.0.1 here instead of localhost to force the use of ipv4 - parser.add_argument("--host", type=str, default="127.0.0.1") - parser.add_argument("--port", type=int, default=8000) - parser.add_argument( - "--endpoint", - type=str, - default="/v1/completions", - help="API endpoint.", - ) - parser.add_argument( - "--dataset-name", - type=str, - default="sharegpt", - choices=["sharegpt", "burstgpt", "sonnet", "random", "hf", "custom"], - help="Name of the dataset to benchmark on.", - ) - parser.add_argument( - "--dataset-path", - type=str, - default=None, - help="Path to the sharegpt/sonnet dataset. " - "Or the huggingface dataset ID if using HF dataset.", - ) - parser.add_argument( - "--max-concurrency", - type=int, - default=None, - help="Maximum number of concurrent requests. This can be used " - "to help simulate an environment where a higher level component " - "is enforcing a maximum number of concurrent requests. While the " - "--request-rate argument controls the rate at which requests are " - "initiated, this argument will control how many are actually allowed " - "to execute at a time. This means that when used in combination, the " - "actual request rate may be lower than specified with --request-rate, " - "if the server is not processing requests fast enough to keep up.", - ) - - parser.add_argument( - "--model", - type=str, - required=True, - help="Name of the model.", - ) - parser.add_argument( - "--tokenizer", - type=str, - help="Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501 - ) - parser.add_argument("--use-beam-search", action="store_true") - parser.add_argument( - "--num-prompts", - type=int, - default=1000, - help="Number of prompts to process.", - ) - parser.add_argument( - "--logprobs", - type=int, - default=None, - help=( - "Number of logprobs-per-token to compute & return as part of " - "the request. If unspecified, then either (1) if beam search " - "is disabled, no logprobs are computed & a single dummy " - "logprob is returned for each token; or (2) if beam search " - "is enabled 1 logprob per token is computed" - ), - ) - parser.add_argument( - "--request-rate", - type=float, - default=float("inf"), - help="Number of requests per second. If this is inf, " - "then all the requests are sent at time 0. " - "Otherwise, we use Poisson process or gamma distribution " - "to synthesize the request arrival times.", - ) - parser.add_argument( - "--burstiness", - type=float, - default=1.0, - help="Burstiness factor of the request generation. " - "Only take effect when request_rate is not inf. " - "Default value is 1, which follows Poisson process. " - "Otherwise, the request intervals follow a gamma distribution. " - "A lower burstiness value (0 < burstiness < 1) results in more " - "bursty requests. A higher burstiness value (burstiness > 1) " - "results in a more uniform arrival of requests.", - ) - parser.add_argument("--seed", type=int, default=0) - parser.add_argument( - "--trust-remote-code", - action="store_true", - help="Trust remote code from huggingface", - ) - parser.add_argument( - "--disable-tqdm", - action="store_true", - help="Specify to disable tqdm progress bar.", - ) - parser.add_argument( - "--profile", - action="store_true", - help="Use Torch Profiler. 
The endpoint must be launched with " - "VLLM_TORCH_PROFILER_DIR to enable profiler.", - ) - parser.add_argument( - "--save-result", - action="store_true", - help="Specify to save benchmark results to a json file", - ) - parser.add_argument( - "--save-detailed", - action="store_true", - help="When saving the results, whether to include per request " - "information such as response, error, ttfs, tpots, etc.", - ) - parser.add_argument( - "--append-result", - action="store_true", - help="Append the benchmark result to the existing json file.", - ) - parser.add_argument( - "--metadata", - metavar="KEY=VALUE", - nargs="*", - help="Key-value pairs (e.g, --metadata version=0.3.3 tp=1) " - "for metadata of this run to be saved in the result JSON file " - "for record keeping purposes.", - ) - parser.add_argument( - "--result-dir", - type=str, - default=None, - help="Specify directory to save benchmark json results." - "If not specified, results are saved in the current directory.", - ) - parser.add_argument( - "--result-filename", - type=str, - default=None, - help="Specify the filename to save benchmark json results." - "If not specified, results will be saved in " - "{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json" - " format.", - ) - parser.add_argument( - "--ignore-eos", - action="store_true", - help="Set ignore_eos flag when sending the benchmark request." - "Warning: ignore_eos is not supported in deepspeed_mii and tgi.", - ) - parser.add_argument( - "--percentile-metrics", - type=str, - default="ttft,tpot,itl", - help="Comma-separated list of selected metrics to report percentils. " - "This argument specifies the metrics to report percentiles. " - 'Allowed metric names are "ttft", "tpot", "itl", "e2el". ' - 'Default value is "ttft,tpot,itl".', - ) - parser.add_argument( - "--metric-percentiles", - type=str, - default="99", - help="Comma-separated list of percentiles for selected metrics. " - 'To report 25-th, 50-th, and 75-th percentiles, use "25,50,75". ' - 'Default value is "99". ' - 'Use "--percentile-metrics" to select metrics.', - ) - parser.add_argument( - "--goodput", - nargs="+", - required=False, - help='Specify service level objectives for goodput as "KEY:VALUE" ' - "pairs, where the key is a metric name, and the value is in " - 'milliseconds. Multiple "KEY:VALUE" pairs can be provided, ' - "separated by spaces. Allowed request level metric names are " - '"ttft", "tpot", "e2el". 
For more context on the definition of ' - "goodput, refer to DistServe paper: https://arxiv.org/pdf/2401.09670 " - "and the blog: https://hao-ai-lab.github.io/blogs/distserve", - ) - - # group for dataset specific arguments - custom_group = parser.add_argument_group("custom dataset options") - custom_group.add_argument( - "--custom-output-len", - type=int, - default=256, - help="Number of output tokens per request, used only for custom dataset.", - ) - custom_group.add_argument( - "--custom-skip-chat-template", - action="store_true", - help="Skip applying chat template to prompt, used only for custom dataset.", - ) - - sonnet_group = parser.add_argument_group("sonnet dataset options") - sonnet_group.add_argument( - "--sonnet-input-len", - type=int, - default=550, - help="Number of input tokens per request, used only for sonnet dataset.", - ) - sonnet_group.add_argument( - "--sonnet-output-len", - type=int, - default=150, - help="Number of output tokens per request, used only for sonnet dataset.", - ) - sonnet_group.add_argument( - "--sonnet-prefix-len", - type=int, - default=200, - help="Number of prefix tokens per request, used only for sonnet dataset.", - ) - - sharegpt_group = parser.add_argument_group("sharegpt dataset options") - sharegpt_group.add_argument( - "--sharegpt-output-len", - type=int, - default=None, - help="Output length for each request. Overrides the output length " - "from the ShareGPT dataset.", - ) - - random_group = parser.add_argument_group("random dataset options") - random_group.add_argument( - "--random-input-len", - type=int, - default=1024, - help="Number of input tokens per request, used only for random sampling.", - ) - random_group.add_argument( - "--random-output-len", - type=int, - default=128, - help="Number of output tokens per request, used only for random sampling.", - ) - random_group.add_argument( - "--random-range-ratio", - type=float, - default=0.0, - help="Range ratio for sampling input/output length, " - "used only for random sampling. Must be in the range [0, 1) to define " - "a symmetric sampling range" - "[length * (1 - range_ratio), length * (1 + range_ratio)].", - ) - random_group.add_argument( - "--random-prefix-len", - type=int, - default=0, - help=( - "Number of fixed prefix tokens before the random context " - "in a request. " - "The total input length is the sum of `random-prefix-len` and " - "a random " - "context length sampled from [input_len * (1 - range_ratio), " - "input_len * (1 + range_ratio)]." - ), - ) - - hf_group = parser.add_argument_group("hf dataset options") - hf_group.add_argument( - "--hf-subset", type=str, default=None, help="Subset of the HF dataset." - ) - hf_group.add_argument( - "--hf-split", type=str, default=None, help="Split of the HF dataset." - ) - hf_group.add_argument( - "--hf-output-len", - type=int, - default=None, - help="Output length for each request. Overrides the output lengths " - "from the sampled HF dataset.", - ) - - sampling_group = parser.add_argument_group("sampling parameters") - sampling_group.add_argument( - "--top-p", - type=float, - default=None, - help="Top-p sampling parameter. Only has effect on openai-compatible backends.", - ) - sampling_group.add_argument( - "--top-k", - type=int, - default=None, - help="Top-k sampling parameter. Only has effect on openai-compatible backends.", - ) - sampling_group.add_argument( - "--min-p", - type=float, - default=None, - help="Min-p sampling parameter. 
Only has effect on openai-compatible backends.", - ) - sampling_group.add_argument( - "--temperature", - type=float, - default=None, - help="Temperature sampling parameter. Only has effect on " - "openai-compatible backends. If not specified, default to greedy " - "decoding (i.e. temperature==0.0).", - ) - - parser.add_argument( - "--tokenizer-mode", - type=str, - default="auto", - choices=["auto", "slow", "mistral", "custom"], - help='The tokenizer mode.\n\n* "auto" will use the ' - 'fast tokenizer if available.\n* "slow" will ' - "always use the slow tokenizer. \n* " - '"mistral" will always use the `mistral_common` tokenizer. \n*' - '"custom" will use --tokenizer to select the preregistered tokenizer.', - ) - - parser.add_argument( - "--served-model-name", - type=str, - default=None, - help="The model name used in the API. " - "If not specified, the model name will be the " - "same as the ``--model`` argument. ", - ) + print("""DEPRECATED: This script has been moved to the vLLM CLI. - parser.add_argument( - "--lora-modules", - nargs="+", - default=None, - help="A subset of LoRA module names passed in when " - "launching the server. For each request, the " - "script chooses a LoRA module at random.", - ) +Please use the following command instead: + vllm bench serve - args = parser.parse_args() +For help with the new command, run: + vllm bench serve --help - main(args) +Alternatively, you can run the new command directly with: + python -m vllm.entrypoints.cli.main bench serve --help +""") + sys.exit(1) diff --git a/benchmarks/benchmark_serving_structured_output.py b/benchmarks/benchmark_serving_structured_output.py index c1501ad52..a03506254 100644 --- a/benchmarks/benchmark_serving_structured_output.py +++ b/benchmarks/benchmark_serving_structured_output.py @@ -4,7 +4,7 @@ On the server side, run one of the following commands: (vLLM OpenAI API server) - vllm serve --disable-log-requests + vllm serve On the client side, run: python benchmarks/benchmark_serving_structured_output.py \ @@ -449,7 +449,8 @@ async def benchmark( def prepare_extra_body(request) -> dict: extra_body = {} # Add the schema to the extra_body - extra_body[request.structure_type] = request.schema + extra_body["structured_outputs"] = {} + extra_body["structured_outputs"][request.structure_type] = request.schema return extra_body print("Starting initial single prompt test run...") @@ -538,20 +539,6 @@ async def limited_request_func(request_func_input, pbar): ) outputs: list[RequestFuncOutput] = await asyncio.gather(*tasks) - if profile: - print("Stopping profiler...") - profile_input = RequestFuncInput( - model=model_id, - prompt=test_request.prompt, - api_url=base_url + "/stop_profile", - prompt_len=test_request.prompt_len, - output_len=test_request.expected_output_len, - extra_body={test_request.structure_type: test_request.schema}, - ) - profile_output = await request_func(request_func_input=profile_input) - if profile_output.success: - print("Profiler stopped") - if pbar is not None: pbar.close() @@ -569,6 +556,10 @@ async def limited_request_func(request_func_input, pbar): print("{s:{c}^{n}}".format(s=" Serving Benchmark Result ", n=50, c="=")) print("{:<40} {:<10}".format("Successful requests:", metrics.completed)) + if max_concurrency is not None: + print("{:<40} {:<10}".format("Maximum request concurrency:", max_concurrency)) + if request_rate != float("inf"): + print("{:<40} {:<10.2f}".format("Request rate configured (RPS):", request_rate)) print("{:<40} {:<10.2f}".format("Benchmark duration (s):", 
benchmark_duration)) print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input)) print("{:<40} {:<10}".format("Total generated tokens:", metrics.total_output)) @@ -666,6 +657,20 @@ def process_one_metric( print("=" * 50) + if profile: + print("Stopping profiler...") + profile_input = RequestFuncInput( + model=model_id, + prompt=test_request.prompt, + api_url=base_url + "/stop_profile", + prompt_len=test_request.prompt_len, + output_len=test_request.expected_output_len, + extra_body={test_request.structure_type: test_request.schema}, + ) + profile_output = await request_func(request_func_input=profile_input) + if profile_output.success: + print("Profiler stopped") + return result, ret @@ -692,11 +697,11 @@ def _eval_correctness_regex(expected, actual): return re.match(args.regex, actual) is not None def _eval_correctness(expected, actual): - if args.structure_type == "guided_json": + if args.structure_type == "json": return _eval_correctness_json(expected, actual) - elif args.structure_type == "guided_regex": + elif args.structure_type == "regex": return _eval_correctness_regex(expected, actual) - elif args.structure_type == "guided_choice": + elif args.structure_type == "choice": return _eval_correctness_choice(expected, actual) else: return None @@ -776,18 +781,18 @@ def main(args: argparse.Namespace): ) if args.dataset == "grammar": - args.structure_type = "guided_grammar" + args.structure_type = "grammar" elif args.dataset == "regex": - args.structure_type = "guided_regex" + args.structure_type = "regex" elif args.dataset == "choice": - args.structure_type = "guided_choice" + args.structure_type = "choice" else: - args.structure_type = "guided_json" + args.structure_type = "json" if args.no_structured_output: args.structured_output_ratio = 0 if args.save_results: - result_file_name = f"{args.structured_output_ratio}guided" + result_file_name = f"{args.structured_output_ratio}so" result_file_name += f"_{backend}" result_file_name += f"_{args.request_rate}qps" result_file_name += f"_{args.model.split('/')[-1]}" @@ -850,7 +855,7 @@ def main(args: argparse.Namespace): json.dump(results, outfile, indent=4) -if __name__ == "__main__": +def create_argument_parser(): parser = FlexibleArgumentParser( description="Benchmark the online serving throughput." ) @@ -994,7 +999,7 @@ def main(args: argparse.Namespace): "--percentile-metrics", type=str, default="ttft,tpot,itl", - help="Comma-separated list of selected metrics to report percentils. " + help="Comma-separated list of selected metrics to report percentiles. " "This argument specifies the metrics to report percentiles. " 'Allowed metric names are "ttft", "tpot", "itl", "e2el". 
' 'Default value is "ttft,tpot,itl".', @@ -1034,5 +1039,10 @@ def main(args: argparse.Namespace): help="Ratio of Structured Outputs requests", ) + return parser + + +if __name__ == "__main__": + parser = create_argument_parser() args = parser.parse_args() main(args) diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index d19753d40..b6dc0918f 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -1,724 +1,17 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Benchmark offline inference throughput.""" - -import argparse -import dataclasses -import json -import os -import random -import time -import warnings -from typing import Any, Optional, Union - -import torch -import uvloop -from tqdm import tqdm -from transformers import AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizerBase - -from benchmark_dataset import ( - AIMODataset, - BurstGPTDataset, - ConversationDataset, - InstructCoderDataset, - RandomDataset, - SampleRequest, - ShareGPTDataset, - SonnetDataset, - VisionArenaDataset, -) -from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json -from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs -from vllm.entrypoints.openai.api_server import ( - build_async_engine_client_from_engine_args, -) -from vllm.inputs import TextPrompt, TokensPrompt -from vllm.lora.request import LoRARequest -from vllm.outputs import RequestOutput -from vllm.sampling_params import BeamSearchParams -from vllm.utils import FlexibleArgumentParser, merge_async_iterators - - -def run_vllm( - requests: list[SampleRequest], - n: int, - engine_args: EngineArgs, - disable_detokenize: bool = False, -) -> tuple[float, Optional[list[RequestOutput]]]: - from vllm import LLM, SamplingParams - - llm = LLM(**dataclasses.asdict(engine_args)) - assert all( - llm.llm_engine.model_config.max_model_len - >= (request.prompt_len + request.expected_output_len) - for request in requests - ), ( - "Please ensure that max_model_len is greater than the sum of" - " prompt_len and expected_output_len for all requests." - ) - # Add the requests to the engine. - prompts: list[Union[TextPrompt, TokensPrompt]] = [] - sampling_params: list[SamplingParams] = [] - for request in requests: - prompts.append( - TokensPrompt( - prompt_token_ids=request.prompt["prompt_token_ids"], - multi_modal_data=request.multi_modal_data, - ) - if "prompt_token_ids" in request.prompt - else TextPrompt( - prompt=request.prompt, multi_modal_data=request.multi_modal_data - ) - ) - sampling_params.append( - SamplingParams( - n=n, - temperature=1.0, - top_p=1.0, - ignore_eos=True, - max_tokens=request.expected_output_len, - detokenize=not disable_detokenize, - ) - ) - lora_requests: Optional[list[LoRARequest]] = None - if engine_args.enable_lora: - lora_requests = [request.lora_request for request in requests] - - use_beam_search = False - - outputs = None - if not use_beam_search: - start = time.perf_counter() - outputs = llm.generate( - prompts, sampling_params, lora_request=lora_requests, use_tqdm=True - ) - end = time.perf_counter() - else: - assert lora_requests is None, "BeamSearch API does not support LoRA" - prompts = [request.prompt for request in requests] - # output_len should be the same for all requests. 
- output_len = requests[0][2] - for request in requests: - assert request.expected_output_len == output_len - start = time.perf_counter() - llm.beam_search( - prompts, - BeamSearchParams( - beam_width=n, - max_tokens=output_len, - ignore_eos=True, - ), - ) - end = time.perf_counter() - return end - start, outputs - - -def run_vllm_chat( - requests: list[SampleRequest], - n: int, - engine_args: EngineArgs, - disable_detokenize: bool = False, -) -> tuple[float, list[RequestOutput]]: - """ - Run vLLM chat benchmark. This function is recommended ONLY for benchmarking - multimodal models as it properly handles multimodal inputs and chat - formatting. For non-multimodal models, use run_vllm() instead. - """ - from vllm import LLM, SamplingParams - - llm = LLM(**dataclasses.asdict(engine_args)) - - assert all( - llm.llm_engine.model_config.max_model_len - >= (request.prompt_len + request.expected_output_len) - for request in requests - ), ( - "Please ensure that max_model_len is greater than the sum of " - "prompt_len and expected_output_len for all requests." - ) - - prompts = [] - sampling_params: list[SamplingParams] = [] - for request in requests: - prompts.append(request.prompt) - sampling_params.append( - SamplingParams( - n=n, - temperature=1.0, - top_p=1.0, - ignore_eos=True, - max_tokens=request.expected_output_len, - detokenize=not disable_detokenize, - ) - ) - start = time.perf_counter() - outputs = llm.chat(prompts, sampling_params, use_tqdm=True) - end = time.perf_counter() - return end - start, outputs - - -async def run_vllm_async( - requests: list[SampleRequest], - n: int, - engine_args: AsyncEngineArgs, - disable_frontend_multiprocessing: bool = False, - disable_detokenize: bool = False, -) -> float: - from vllm import SamplingParams - - async with build_async_engine_client_from_engine_args( - engine_args, disable_frontend_multiprocessing - ) as llm: - model_config = await llm.get_model_config() - assert all( - model_config.max_model_len - >= (request.prompt_len + request.expected_output_len) - for request in requests - ), ( - "Please ensure that max_model_len is greater than the sum of" - " prompt_len and expected_output_len for all requests." - ) - - # Add the requests to the engine. 
- prompts: list[Union[TextPrompt, TokensPrompt]] = [] - sampling_params: list[SamplingParams] = [] - lora_requests: list[Optional[LoRARequest]] = [] - for request in requests: - prompts.append( - TokensPrompt( - prompt_token_ids=request.prompt["prompt_token_ids"], - multi_modal_data=request.multi_modal_data, - ) - if "prompt_token_ids" in request.prompt - else TextPrompt( - prompt=request.prompt, multi_modal_data=request.multi_modal_data - ) - ) - sampling_params.append( - SamplingParams( - n=n, - temperature=1.0, - top_p=1.0, - ignore_eos=True, - max_tokens=request.expected_output_len, - detokenize=not disable_detokenize, - ) - ) - lora_requests.append(request.lora_request) - - generators = [] - start = time.perf_counter() - for i, (prompt, sp, lr) in enumerate( - zip(prompts, sampling_params, lora_requests) - ): - generator = llm.generate(prompt, sp, lora_request=lr, request_id=f"test{i}") - generators.append(generator) - all_gens = merge_async_iterators(*generators) - async for i, res in all_gens: - pass - end = time.perf_counter() - return end - start - - -def run_hf( - requests: list[SampleRequest], - model: str, - tokenizer: PreTrainedTokenizerBase, - n: int, - max_batch_size: int, - trust_remote_code: bool, - disable_detokenize: bool = False, -) -> float: - llm = AutoModelForCausalLM.from_pretrained( - model, torch_dtype=torch.float16, trust_remote_code=trust_remote_code - ) - if llm.config.model_type == "llama": - # To enable padding in the HF backend. - tokenizer.pad_token = tokenizer.eos_token - llm = llm.cuda() - - pbar = tqdm(total=len(requests)) - start = time.perf_counter() - batch: list[str] = [] - max_prompt_len = 0 - max_output_len = 0 - for i in range(len(requests)): - prompt = requests[i].prompt - prompt_len = requests[i].prompt_len - output_len = requests[i].expected_output_len - # Add the prompt to the batch. - batch.append(prompt) - max_prompt_len = max(max_prompt_len, prompt_len) - max_output_len = max(max_output_len, output_len) - if len(batch) < max_batch_size and i != len(requests) - 1: - # Check if we can add more requests to the batch. - next_prompt_len = requests[i + 1].prompt_len - next_output_len = requests[i + 1].expected_output_len - if ( - max(max_prompt_len, next_prompt_len) - + max(max_output_len, next_output_len) - ) <= 2048: - # We can add more requests to the batch. - continue - - # Generate the sequences. - input_ids = tokenizer(batch, return_tensors="pt", padding=True).input_ids - llm_outputs = llm.generate( - input_ids=input_ids.cuda(), - do_sample=True, - num_return_sequences=n, - temperature=1.0, - top_p=1.0, - use_cache=True, - max_new_tokens=max_output_len, - ) - if not disable_detokenize: - # Include the decoding time. - tokenizer.batch_decode(llm_outputs, skip_special_tokens=True) - pbar.update(len(batch)) - - # Clear the batch. 
- batch = [] - max_prompt_len = 0 - max_output_len = 0 - end = time.perf_counter() - return end - start - - -def run_mii( - requests: list[SampleRequest], - model: str, - tensor_parallel_size: int, - output_len: int, -) -> float: - from mii import client, serve - - llm = serve(model, tensor_parallel=tensor_parallel_size) - prompts = [request.prompt for request in requests] - - start = time.perf_counter() - llm.generate(prompts, max_new_tokens=output_len) - end = time.perf_counter() - client = client(model) - client.terminate_server() - return end - start - - -def save_to_pytorch_benchmark_format( - args: argparse.Namespace, results: dict[str, Any] -) -> None: - pt_records = convert_to_pytorch_benchmark_format( - args=args, - metrics={ - "requests_per_second": [results["requests_per_second"]], - "tokens_per_second": [results["tokens_per_second"]], - }, - extra_info={ - k: results[k] for k in ["elapsed_time", "num_requests", "total_num_tokens"] - }, - ) - if pt_records: - # Don't use json suffix here as we don't want CI to pick it up - pt_file = f"{os.path.splitext(args.output_json)[0]}.pytorch.json" - write_to_json(pt_file, pt_records) - - -def get_requests(args, tokenizer): - # Common parameters for all dataset types. - common_kwargs = { - "dataset_path": args.dataset_path, - "random_seed": args.seed, - } - sample_kwargs = { - "tokenizer": tokenizer, - "lora_path": args.lora_path, - "max_loras": args.max_loras, - "num_requests": args.num_prompts, - "input_len": args.input_len, - "output_len": args.output_len, - } - - if args.dataset_path is None or args.dataset_name == "random": - sample_kwargs["range_ratio"] = args.random_range_ratio - sample_kwargs["prefix_len"] = args.prefix_len - dataset_cls = RandomDataset - elif args.dataset_name == "sharegpt": - dataset_cls = ShareGPTDataset - if args.backend == "vllm-chat": - sample_kwargs["enable_multimodal_chat"] = True - elif args.dataset_name == "sonnet": - assert tokenizer.chat_template or tokenizer.default_chat_template, ( - "Tokenizer/model must have chat template for sonnet dataset." - ) - dataset_cls = SonnetDataset - sample_kwargs["prefix_len"] = args.prefix_len - sample_kwargs["return_prompt_formatted"] = True - elif args.dataset_name == "burstgpt": - dataset_cls = BurstGPTDataset - elif args.dataset_name == "hf": - if args.dataset_path in VisionArenaDataset.SUPPORTED_DATASET_PATHS: - dataset_cls = VisionArenaDataset - common_kwargs["dataset_subset"] = None - common_kwargs["dataset_split"] = "train" - sample_kwargs["enable_multimodal_chat"] = True - elif args.dataset_path in InstructCoderDataset.SUPPORTED_DATASET_PATHS: - dataset_cls = InstructCoderDataset - common_kwargs["dataset_split"] = "train" - elif args.dataset_path in ConversationDataset.SUPPORTED_DATASET_PATHS: - dataset_cls = ConversationDataset - common_kwargs["dataset_subset"] = args.hf_subset - common_kwargs["dataset_split"] = args.hf_split - sample_kwargs["enable_multimodal_chat"] = True - elif args.dataset_path in AIMODataset.SUPPORTED_DATASET_PATHS: - dataset_cls = AIMODataset - common_kwargs["dataset_subset"] = None - common_kwargs["dataset_split"] = "train" - else: - raise ValueError(f"Unknown dataset name: {args.dataset_name}") - # Remove None values - sample_kwargs = {k: v for k, v in sample_kwargs.items() if v is not None} - return dataset_cls(**common_kwargs).sample(**sample_kwargs) - - -def main(args: argparse.Namespace): - if args.seed is None: - args.seed = 0 - print(args) - random.seed(args.seed) - # Sample the requests. 
- tokenizer = AutoTokenizer.from_pretrained( - args.tokenizer, trust_remote_code=args.trust_remote_code - ) - requests = get_requests(args, tokenizer) - is_multi_modal = any(request.multi_modal_data is not None for request in requests) - request_outputs: Optional[list[RequestOutput]] = None - if args.backend == "vllm": - if args.async_engine: - elapsed_time = uvloop.run( - run_vllm_async( - requests, - args.n, - AsyncEngineArgs.from_cli_args(args), - args.disable_frontend_multiprocessing, - args.disable_detokenize, - ) - ) - else: - elapsed_time, request_outputs = run_vllm( - requests, - args.n, - EngineArgs.from_cli_args(args), - args.disable_detokenize, - ) - elif args.backend == "hf": - assert args.tensor_parallel_size == 1 - elapsed_time = run_hf( - requests, - args.model, - tokenizer, - args.n, - args.hf_max_batch_size, - args.trust_remote_code, - args.disable_detokenize, - ) - elif args.backend == "mii": - elapsed_time = run_mii( - requests, args.model, args.tensor_parallel_size, args.output_len - ) - elif args.backend == "vllm-chat": - elapsed_time, request_outputs = run_vllm_chat( - requests, args.n, EngineArgs.from_cli_args(args), args.disable_detokenize - ) - else: - raise ValueError(f"Unknown backend: {args.backend}") - - if request_outputs: - # Note: with the vllm and vllm-chat backends, - # we have request_outputs, which we use to count tokens. - total_prompt_tokens = 0 - total_output_tokens = 0 - for ro in request_outputs: - if not isinstance(ro, RequestOutput): - continue - total_prompt_tokens += ( - len(ro.prompt_token_ids) if ro.prompt_token_ids else 0 - ) - total_output_tokens += sum(len(o.token_ids) for o in ro.outputs if o) - total_num_tokens = total_prompt_tokens + total_output_tokens - else: - total_num_tokens = sum(r.prompt_len + r.expected_output_len for r in requests) - total_output_tokens = sum(r.expected_output_len for r in requests) - total_prompt_tokens = total_num_tokens - total_output_tokens - - if is_multi_modal and args.backend != "vllm-chat": - print( - "\033[91mWARNING\033[0m: Multi-modal request with " - f"{args.backend} backend detected. The " - "following metrics are not accurate because image tokens are not" - " counted. See vllm-project/vllm/issues/9778 for details." - ) - # TODO(vllm-project/vllm/issues/9778): Count multi-modal token length. - # vllm-chat backend counts the image tokens now - - print( - f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, " - f"{total_num_tokens / elapsed_time:.2f} total tokens/s, " - f"{total_output_tokens / elapsed_time:.2f} output tokens/s" - ) - print(f"Total num prompt tokens: {total_prompt_tokens}") - print(f"Total num output tokens: {total_output_tokens}") - - # Output JSON results if specified - if args.output_json: - results = { - "elapsed_time": elapsed_time, - "num_requests": len(requests), - "total_num_tokens": total_num_tokens, - "requests_per_second": len(requests) / elapsed_time, - "tokens_per_second": total_num_tokens / elapsed_time, - } - with open(args.output_json, "w") as f: - json.dump(results, f, indent=4) - save_to_pytorch_benchmark_format(args, results) - - -def validate_args(args): - """ - Validate command-line arguments. - """ - - # === Deprecation and Defaulting === - if args.dataset is not None: - warnings.warn( - "The '--dataset' argument will be deprecated in the next release. 
" - "Please use '--dataset-name' and '--dataset-path' instead.", - stacklevel=2, - ) - args.dataset_path = args.dataset - - if not getattr(args, "tokenizer", None): - args.tokenizer = args.model - - # === Backend Validation === - valid_backends = {"vllm", "hf", "mii", "vllm-chat"} - if args.backend not in valid_backends: - raise ValueError(f"Unsupported backend: {args.backend}") - - # === Dataset Configuration === - if not args.dataset and not args.dataset_path: - print("When dataset path is not set, it will default to random dataset") - args.dataset_name = "random" - if args.input_len is None: - raise ValueError("input_len must be provided for a random dataset") - - # === Dataset Name Specific Checks === - # --hf-subset and --hf-split: only used - # when dataset_name is 'hf' - if args.dataset_name != "hf" and ( - getattr(args, "hf_subset", None) is not None - or getattr(args, "hf_split", None) is not None - ): - warnings.warn( - "--hf-subset and --hf-split will be ignored \ - since --dataset-name is not 'hf'.", - stacklevel=2, - ) - elif args.dataset_name == "hf": - if args.dataset_path in ( - VisionArenaDataset.SUPPORTED_DATASET_PATHS.keys() - | ConversationDataset.SUPPORTED_DATASET_PATHS - ): - assert args.backend == "vllm-chat", ( - f"{args.dataset_path} needs to use vllm-chat as the backend." - ) # noqa: E501 - elif args.dataset_path in ( - InstructCoderDataset.SUPPORTED_DATASET_PATHS - | AIMODataset.SUPPORTED_DATASET_PATHS - ): - assert args.backend == "vllm", ( - f"{args.dataset_path} needs to use vllm as the backend." - ) # noqa: E501 - else: - raise ValueError(f"{args.dataset_path} is not supported by hf dataset.") - - # --random-range-ratio: only used when dataset_name is 'random' - if args.dataset_name != "random" and args.random_range_ratio is not None: - warnings.warn( - "--random-range-ratio will be ignored since \ - --dataset-name is not 'random'.", - stacklevel=2, - ) - - # --prefix-len: only used when dataset_name is 'random', 'sonnet', or not - # set. - if ( - args.dataset_name not in {"random", "sonnet", None} - and args.prefix_len is not None - ): - warnings.warn( - "--prefix-len will be ignored since --dataset-name\ - is not 'random', 'sonnet', or not set.", - stacklevel=2, - ) - - # === LoRA Settings === - if getattr(args, "enable_lora", False) and args.backend != "vllm": - raise ValueError("LoRA benchmarking is only supported for vLLM backend") - if getattr(args, "enable_lora", False) and args.lora_path is None: - raise ValueError("LoRA path must be provided when enable_lora is True") - - # === Backend-specific Validations === - if args.backend == "hf" and args.hf_max_batch_size is None: - raise ValueError("HF max batch size is required for HF backend") - if args.backend != "hf" and args.hf_max_batch_size is not None: - raise ValueError("HF max batch size is only for HF backend.") - - if ( - args.backend in {"hf", "mii"} - and getattr(args, "quantization", None) is not None - ): - raise ValueError("Quantization is only for vLLM backend.") - - if args.backend == "mii" and args.dtype != "auto": - raise ValueError("dtype must be auto for MII backend.") - if args.backend == "mii" and args.n != 1: - raise ValueError("n must be 1 for MII backend.") - if args.backend == "mii" and args.tokenizer != args.model: - raise ValueError("Tokenizer must be the same as the model for MII backend.") - - # --data-parallel is not supported currently. 
- # https://github.com/vllm-project/vllm/issues/16222 - if args.data_parallel_size > 1: - raise ValueError( - "Data parallel is not supported in offline benchmark, \ - please use benchmark serving instead" - ) - +import sys if __name__ == "__main__": - parser = FlexibleArgumentParser(description="Benchmark the throughput.") - parser.add_argument( - "--backend", - type=str, - choices=["vllm", "hf", "mii", "vllm-chat"], - default="vllm", - ) - parser.add_argument( - "--dataset-name", - type=str, - choices=["sharegpt", "random", "sonnet", "burstgpt", "hf"], - help="Name of the dataset to benchmark on.", - default="sharegpt", - ) - parser.add_argument( - "--dataset", - type=str, - default=None, - help="Path to the ShareGPT dataset, will be deprecated in\ - the next release. The dataset is expected to " - "be a json in form of list[dict[..., conversations: " - "list[dict[..., value: ]]]]", - ) - parser.add_argument( - "--dataset-path", type=str, default=None, help="Path to the dataset" - ) - parser.add_argument( - "--input-len", - type=int, - default=None, - help="Input prompt length for each request", - ) - parser.add_argument( - "--output-len", - type=int, - default=None, - help="Output length for each request. Overrides the " - "output length from the dataset.", - ) - parser.add_argument( - "--n", type=int, default=1, help="Number of generated sequences per prompt." - ) - parser.add_argument( - "--num-prompts", type=int, default=1000, help="Number of prompts to process." - ) - parser.add_argument( - "--hf-max-batch-size", - type=int, - default=None, - help="Maximum batch size for HF backend.", - ) - parser.add_argument( - "--output-json", - type=str, - default=None, - help="Path to save the throughput results in JSON format.", - ) - parser.add_argument( - "--async-engine", - action="store_true", - default=False, - help="Use vLLM async engine rather than LLM class.", - ) - parser.add_argument( - "--disable-frontend-multiprocessing", - action="store_true", - default=False, - help="Disable decoupled async engine frontend.", - ) - parser.add_argument( - "--disable-detokenize", - action="store_true", - help=( - "Do not detokenize the response (i.e. do not include " - "detokenization time in the measurement)" - ), - ) - # LoRA - parser.add_argument( - "--lora-path", - type=str, - default=None, - help="Path to the LoRA adapters to use. This can be an absolute path, " - "a relative path, or a Hugging Face model identifier.", - ) - parser.add_argument( - "--prefix-len", - type=int, - default=None, - help=f"Number of prefix tokens to be used in RandomDataset " - "and SonnetDataset. For RandomDataset, the total input " - "length is the sum of prefix-len (default: " - f"{RandomDataset.DEFAULT_PREFIX_LEN}) and a random context length " - "sampled from [input_len * (1 - range_ratio), " - "input_len * (1 + range_ratio)]. For SonnetDataset, " - f"prefix_len (default: {SonnetDataset.DEFAULT_PREFIX_LEN}) " - "controls how much of the input is fixed lines versus " - "random lines, but the total input length remains approximately " - "input_len tokens.", - ) - # random dataset - parser.add_argument( - "--random-range-ratio", - type=float, - default=None, - help=f"Range ratio (default : {RandomDataset.DEFAULT_RANGE_RATIO}) " - "for sampling input/output length, " - "used only for RandomDataset. Must be in the range [0, 1) to " - "define a symmetric sampling range " - "[length * (1 - range_ratio), length * (1 + range_ratio)].", - ) + print("""DEPRECATED: This script has been moved to the vLLM CLI. 
+ +Please use the following command instead: + vllm bench throughput - # hf dtaset - parser.add_argument( - "--hf-subset", type=str, default=None, help="Subset of the HF dataset." - ) - parser.add_argument( - "--hf-split", type=str, default=None, help="Split of the HF dataset." - ) +For help with the new command, run: + vllm bench throughput --help - parser = AsyncEngineArgs.add_cli_args(parser) - args = parser.parse_args() - if args.tokenizer is None: - args.tokenizer = args.model - validate_args(args) - main(args) +Alternatively, you can run the new command directly with: + python -m vllm.entrypoints.cli.main bench throughput --help +""") + sys.exit(1) diff --git a/benchmarks/benchmark_utils.py b/benchmarks/benchmark_utils.py index 283f938df..98624abdf 100644 --- a/benchmarks/benchmark_utils.py +++ b/benchmarks/benchmark_utils.py @@ -1,11 +1,12 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project - import argparse import json import math import os -from typing import Any +import time +from types import TracebackType +from typing import Any, Optional, Union def convert_to_pytorch_benchmark_format( @@ -72,3 +73,53 @@ def write_to_json(filename: str, records: list) -> None: cls=InfEncoder, default=lambda o: f"<{type(o).__name__} object is not JSON serializable>", ) + + +# Collect time and generate time metrics +# +# Example Usage: +# collector = TimeCollector(TimeCollector.US) +# for _ in range(total_iteration): +# with collector: +# ... +# collector.dump_avg_max() +class TimeCollector: + NS: int = 1 + US: int = NS * 1000 + MS: int = US * 1000 + S: int = MS * 1000 + + def __init__(self, scale: int) -> None: + self.cnt: int = 0 + self._sum: int = 0 + self._max: Optional[int] = None + self.scale = scale + self.start_time: int = time.monotonic_ns() + + def collect(self, v: int) -> None: + self.cnt += 1 + self._sum += v + if self._max is None: + self._max = v + else: + self._max = max(self._max, v) + + def avg(self) -> Union[float, str]: + return self._sum * 1.0 / self.cnt / self.scale if self.cnt > 0 else "N/A" + + def max(self) -> Union[float, str]: + return self._max / self.scale if self._max else "N/A" + + def dump_avg_max(self) -> list[Union[float, str]]: + return [self.avg(), self.max()] + + def __enter__(self) -> None: + self.start_time = time.monotonic_ns() + + def __exit__( + self, + exc_type: Optional[type[BaseException]], + exc_value: Optional[BaseException], + exc_traceback: Optional[TracebackType], + ) -> None: + self.collect(time.monotonic_ns() - self.start_time) diff --git a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py index cec422e8d..a5a5b52f6 100644 --- a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py +++ b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py @@ -19,7 +19,7 @@ from vllm.model_executor.layers.quantization.utils.fp8_utils import ( w8a8_block_fp8_matmul, ) -from vllm.utils import FlexibleArgumentParser +from vllm.utils import FlexibleArgumentParser, cdiv DEFAULT_MODELS = list(WEIGHT_SHAPES.keys()) DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512] @@ -117,14 +117,9 @@ def bench_fp8( scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32) scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32) - def ceil_div(x: int, y: int) -> int: - return (x + y - 1) // y - - block_scale_a = torch.rand( - (m, ceil_div(k, 128)), device="cuda", dtype=torch.float32 - ) + block_scale_a = torch.rand((m, cdiv(k, 128)), device="cuda", dtype=torch.float32) 
block_scale_b = torch.rand( - ceil_div(k, 128), ceil_div(n, 128), device="cuda", dtype=torch.float32 + cdiv(k, 128), cdiv(n, 128), device="cuda", dtype=torch.float32 ) block_scale_a_M_major = block_scale_a.t().contiguous().t() block_scale_b_K_major = block_scale_b.t().contiguous().t() diff --git a/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh b/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh index 94999630b..2c72941cf 100644 --- a/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh +++ b/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh @@ -3,7 +3,7 @@ # benchmark the overhead of disaggregated prefill. # methodology: # - send all request to prefill vLLM instance. It will buffer KV cache. -# - then send all request to decode instance. +# - then send all request to decode instance. # - The TTFT of decode instance is the overhead. set -ex @@ -12,6 +12,8 @@ kill_gpu_processes() { # kill all processes on GPU. pgrep pt_main_thread | xargs -r kill -9 pgrep python3 | xargs -r kill -9 + # vLLM now names the process with VLLM prefix after https://github.com/vllm-project/vllm/pull/21445 + pgrep VLLM | xargs -r kill -9 sleep 10 # remove vllm config file @@ -60,8 +62,8 @@ benchmark() { --max-model-len 10000 \ --gpu-memory-utilization 0.6 \ --kv-transfer-config \ - '{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' & - + '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' & + CUDA_VISIBLE_DEVICES=1 python3 \ -m vllm.entrypoints.openai.api_server \ @@ -70,44 +72,44 @@ benchmark() { --max-model-len 10000 \ --gpu-memory-utilization 0.6 \ --kv-transfer-config \ - '{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' & + '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' & wait_for_server 8100 wait_for_server 8200 # let the prefill instance finish prefill - python3 ../benchmark_serving.py \ - --backend vllm \ - --model $model \ - --dataset-name $dataset_name \ - --dataset-path $dataset_path \ - --sonnet-input-len $input_len \ - --sonnet-output-len "$output_len" \ - --sonnet-prefix-len $prefix_len \ - --num-prompts $num_prompts \ - --port 8100 \ - --save-result \ - --result-dir $results_folder \ - --result-filename disagg_prefill_tp1.json \ - --request-rate "inf" + vllm bench serve \ + --backend vllm \ + --model $model \ + --dataset-name $dataset_name \ + --dataset-path $dataset_path \ + --sonnet-input-len $input_len \ + --sonnet-output-len "$output_len" \ + --sonnet-prefix-len $prefix_len \ + --num-prompts $num_prompts \ + --port 8100 \ + --save-result \ + --result-dir $results_folder \ + --result-filename disagg_prefill_tp1.json \ + --request-rate "inf" # send the request to decode. # The TTFT of this command will be the overhead of disagg prefill impl. 
- python3 ../benchmark_serving.py \ - --backend vllm \ - --model $model \ - --dataset-name $dataset_name \ - --dataset-path $dataset_path \ - --sonnet-input-len $input_len \ - --sonnet-output-len "$output_len" \ - --sonnet-prefix-len $prefix_len \ - --num-prompts $num_prompts \ - --port 8200 \ - --save-result \ - --result-dir $results_folder \ - --result-filename disagg_prefill_tp1_overhead.json \ - --request-rate "$qps" + vllm bench serve \ + --backend vllm \ + --model $model \ + --dataset-name $dataset_name \ + --dataset-path $dataset_path \ + --sonnet-input-len $input_len \ + --sonnet-output-len "$output_len" \ + --sonnet-prefix-len $prefix_len \ + --num-prompts $num_prompts \ + --port 8200 \ + --save-result \ + --result-dir $results_folder \ + --result-filename disagg_prefill_tp1_overhead.json \ + --request-rate "$qps" kill_gpu_processes } diff --git a/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh b/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh index eb5d891d0..0bbf7cd2b 100644 --- a/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh +++ b/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh @@ -18,6 +18,8 @@ kill_gpu_processes() { # kill all processes on GPU. pgrep pt_main_thread | xargs -r kill -9 pgrep python3 | xargs -r kill -9 + # vLLM now names the process with VLLM prefix after https://github.com/vllm-project/vllm/pull/21445 + pgrep VLLM | xargs -r kill -9 for port in 8000 8100 8200; do lsof -t -i:$port | xargs -r kill -9; done sleep 1 } @@ -58,7 +60,7 @@ launch_chunked_prefill() { launch_disagg_prefill() { - model="meta-llama/Meta-Llama-3.1-8B-Instruct" + model="meta-llama/Meta-Llama-3.1-8B-Instruct" # disagg prefill CUDA_VISIBLE_DEVICES=0 python3 \ -m vllm.entrypoints.openai.api_server \ @@ -67,7 +69,7 @@ launch_disagg_prefill() { --max-model-len 10000 \ --gpu-memory-utilization 0.6 \ --kv-transfer-config \ - '{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' & + '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' & CUDA_VISIBLE_DEVICES=1 python3 \ -m vllm.entrypoints.openai.api_server \ @@ -76,7 +78,7 @@ launch_disagg_prefill() { --max-model-len 10000 \ --gpu-memory-utilization 0.6 \ --kv-transfer-config \ - '{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' & + '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' & wait_for_server 8100 wait_for_server 8200 @@ -97,20 +99,20 @@ benchmark() { output_len=$2 tag=$3 - python3 ../benchmark_serving.py \ - --backend vllm \ - --model $model \ - --dataset-name $dataset_name \ - --dataset-path $dataset_path \ - --sonnet-input-len $input_len \ - --sonnet-output-len "$output_len" \ - --sonnet-prefix-len $prefix_len \ - --num-prompts $num_prompts \ - --port 8000 \ - --save-result \ - --result-dir $results_folder \ - --result-filename "$tag"-qps-"$qps".json \ - --request-rate "$qps" + vllm bench serve \ + --backend vllm \ + --model $model \ + --dataset-name $dataset_name \ + --dataset-path $dataset_path \ + --sonnet-input-len $input_len \ + --sonnet-output-len "$output_len" \ + --sonnet-prefix-len $prefix_len \ + --num-prompts $num_prompts \ + --port 8000 \ + --save-result \ + --result-dir $results_folder \ + --result-filename "$tag"-qps-"$qps".json \ + --request-rate "$qps" sleep 2 } diff --git 
a/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py b/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py index f62d8102e..904f80534 100644 --- a/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py +++ b/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py @@ -1,63 +1,199 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import argparse +import asyncio +import logging import os import aiohttp -from quart import Quart, make_response, request - -AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60) - -app = Quart(__name__) - - -async def forward_request(url, data): - async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session: +from quart import Quart, Response, make_response, request +from rate_limiter import RateLimiter +from request_queue import RequestQueue + +# Configure logging +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +def parse_args(): + """parse command line arguments""" + parser = argparse.ArgumentParser(description="vLLM P/D disaggregation proxy server") + + # Add args + parser.add_argument( + "--timeout", + type=float, + default=300, + help="Timeout for backend service requests in seconds (default: 300)", + ) + parser.add_argument( + "--max-concurrent", + type=int, + default=100, + help="Maximum concurrent requests to backend services (default: 100)", + ) + parser.add_argument( + "--queue-size", + type=int, + default=500, + help="Maximum number of requests in the queue (default: 500)", + ) + parser.add_argument( + "--rate-limit", + type=int, + default=40, + help="Maximum requests per second (default: 40)", + ) + parser.add_argument( + "--port", + type=int, + default=8000, + help="Port to run the server on (default: 8000)", + ) + parser.add_argument( + "--prefill-url", + type=str, + default="http://localhost:8100/v1/completions", + help="Prefill service endpoint URL", + ) + parser.add_argument( + "--decode-url", + type=str, + default="http://localhost:8200/v1/completions", + help="Decode service endpoint URL", + ) + + return parser.parse_args() + + +def main(): + """parse command line arguments""" + args = parse_args() + + # Initialize configuration using command line parameters + AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=args.timeout) + MAX_CONCURRENT_REQUESTS = args.max_concurrent + REQUEST_QUEUE_SIZE = args.queue_size + RATE_LIMIT = args.rate_limit + PREFILL_SERVICE_URL = args.prefill_url + DECODE_SERVICE_URL = args.decode_url + PORT = args.port + + app = Quart(__name__) + + # Initialize the rate limiter and request queue + rate_limiter = RateLimiter(RATE_LIMIT) + request_queue = RequestQueue(MAX_CONCURRENT_REQUESTS, REQUEST_QUEUE_SIZE) + + # Attach the configuration object to the application instance + app.config.update( + { + "AIOHTTP_TIMEOUT": AIOHTTP_TIMEOUT, + "rate_limiter": rate_limiter, + "request_queue": request_queue, + "PREFILL_SERVICE_URL": PREFILL_SERVICE_URL, + "DECODE_SERVICE_URL": DECODE_SERVICE_URL, + } + ) + + # Start queue processing on app startup + @app.before_serving + async def startup(): + """Start request processing task when app starts serving""" + asyncio.create_task(request_queue.process()) + + async def forward_request(url, data): + """Forward request to backend service with rate limiting and error handling""" headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"} - async with session.post(url=url, json=data, headers=headers) as response: - if response.status == 200: - # if 
response.headers.get('Transfer-Encoding') == 'chunked': - if True: - async for chunk_bytes in response.content.iter_chunked(1024): - yield chunk_bytes - else: - content = await response.read() - yield content - - -@app.route("/v1/completions", methods=["POST"]) -async def handle_request(): - try: - original_request_data = await request.get_json() - - prefill_request = original_request_data.copy() - # change max_tokens = 1 to let it only do prefill - prefill_request["max_tokens"] = 1 - - # finish prefill - async for _ in forward_request( - "http://localhost:8100/v1/completions", prefill_request - ): - continue - # return decode - generator = forward_request( - "http://localhost:8200/v1/completions", original_request_data - ) - response = await make_response(generator) - response.timeout = None - - return response - - except Exception as e: - import sys - import traceback - - exc_info = sys.exc_info() - print("Error occurred in disagg prefill proxy server") - print(e) - print("".join(traceback.format_exception(*exc_info))) + # Use rate limiter as context manager + async with ( + rate_limiter, + aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session, + ): + try: + async with session.post( + url=url, json=data, headers=headers + ) as response: + if response.status == 200: + # Stream response chunks + async for chunk_bytes in response.content.iter_chunked(1024): + yield chunk_bytes + else: + # Handle backend service errors + error_text = await response.text() + logger.error( + "Backend service error: %s - %s", + response.status, + error_text, + ) + yield b'{"error": "Backend service error"}' + except aiohttp.ClientError as e: + # Handle connection errors + logger.error("Connection error to %s: %s", url, str(e)) + yield b'{"error": "Service unavailable"}' + except asyncio.TimeoutError: + # Handle timeout errors + logger.error("Timeout connecting to %s", url) + yield b'{"error": "Service timeout"}' + + async def process_request(): + """Process a single request through prefill and decode stages""" + try: + original_request_data = await request.get_json() + + # Create prefill request (max_tokens=1) + prefill_request = original_request_data.copy() + prefill_request["max_tokens"] = 1 + + # Execute prefill stage + async for _ in forward_request(PREFILL_SERVICE_URL, prefill_request): + continue + + # Execute decode stage and stream response + generator = forward_request(DECODE_SERVICE_URL, original_request_data) + response = await make_response(generator) + response.timeout = None # Disable timeout for streaming response + return response + + except Exception: + logger.exception("Error processing request") + return Response( + response=b'{"error": "Internal server error"}', + status=500, + content_type="application/json", + ) + + @app.route("/v1/completions", methods=["POST"]) + async def handle_request(): + """Handle incoming API requests with concurrency and rate limiting""" + # Create task for request processing + task = asyncio.create_task(process_request()) + + # Enqueue request or reject if queue is full + if not await request_queue.enqueue(task): + return Response( + response=b'{"error": "Server busy, try again later"}', + status=503, + content_type="application/json", + ) + + try: + # Return the response from the processing task + return await task + except asyncio.CancelledError: + # Handle task cancellation (timeout or queue full) + logger.warning("Request cancelled due to timeout or queue full") + return Response( + response=b'{"error": "Request cancelled"}', + status=503, + 
content_type="application/json", + ) + + # Start the Quart server with host can be set to 0.0.0.0 + app.run(port=PORT) if __name__ == "__main__": - app.run(port=8000) + main() diff --git a/benchmarks/disagg_benchmarks/rate_limiter.py b/benchmarks/disagg_benchmarks/rate_limiter.py new file mode 100644 index 000000000..87ac8cb6a --- /dev/null +++ b/benchmarks/disagg_benchmarks/rate_limiter.py @@ -0,0 +1,45 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import asyncio +import time + + +class RateLimiter: + """Token bucket rate limiter implementation""" + + def __init__(self, rate_limit): + self.rate_limit = rate_limit # Requests per second + self.num_available_tokens = rate_limit # Available tokens + self.last_refill = time.monotonic() # Last token refill time + self.lock = asyncio.Lock() # Synchronization lock + + async def acquire(self): + """Acquire a token from the rate limiter""" + while True: + async with self.lock: + current_time = time.monotonic() + elapsed = current_time - self.last_refill + + # Refill num_available_tokens if more than 1 second has passed + if elapsed > 1.0: + self.num_available_tokens = self.rate_limit + self.last_refill = current_time + + # Check if num_available_tokens are available + if self.num_available_tokens > 0: + self.num_available_tokens -= 1 + return True + + # Calculate wait time if no num_available_tokens available + wait_time = 1.0 - elapsed + await asyncio.sleep(wait_time) + + async def __aenter__(self): + """Enter async context manager - acquire token""" + await self.acquire() + return self + + async def __aexit__(self, exc_type, exc_value, traceback): + """Exit async context manager - no cleanup needed""" + pass diff --git a/benchmarks/disagg_benchmarks/request_queue.py b/benchmarks/disagg_benchmarks/request_queue.py new file mode 100644 index 000000000..410bcb956 --- /dev/null +++ b/benchmarks/disagg_benchmarks/request_queue.py @@ -0,0 +1,39 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import asyncio +from collections import deque + + +class RequestQueue: + """Request queue manager with concurrency control""" + + def __init__(self, max_concurrent, max_queue_size): + # Maximum concurrent requests + self.max_concurrent = max_concurrent + self.max_queue_size = max_queue_size # Maximum queue size + # Concurrency control + self.semaphore = asyncio.Semaphore(max_concurrent) + self.queue = deque() # Request queue + self.queue_size = 0 # Current queue size + self.lock = asyncio.Lock() # Sync queue Lock + + async def enqueue(self, task): + """Add a request task to the queue""" + async with self.lock: + if self.queue_size >= self.max_queue_size: + return False + + self.queue.append(task) + self.queue_size += 1 + return True + + async def process(self): + """Process queued requests using semaphore for concurrency control""" + while True: + if self.queue: + async with self.semaphore, self.lock: + task = self.queue.popleft() + self.queue_size -= 1 + await task + await asyncio.sleep(0.01) # Yield control to event loop diff --git a/benchmarks/kernels/bench_block_fp8_gemm.py b/benchmarks/kernels/bench_block_fp8_gemm.py new file mode 100644 index 000000000..f1e504499 --- /dev/null +++ b/benchmarks/kernels/bench_block_fp8_gemm.py @@ -0,0 +1,145 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import torch + +from 
vllm.model_executor.layers.quantization.utils.fp8_utils import ( + apply_w8a8_block_fp8_linear, +) +from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( + CUTLASS_BLOCK_FP8_SUPPORTED, +) +from vllm.platforms import current_platform +from vllm.triton_utils import triton as vllm_triton + +assert current_platform.is_cuda(), ( + "Only support benchmarking w8a8 block fp8 kernel on CUDA device." +) + +# DeepSeek-V3 weight shapes +DEEPSEEK_V3_SHAPES = [ + (512 + 64, 7168), + (2112, 7168), + ((128 + 64) * 128, 7168), + (128 * (128 + 128), 512), + (7168, 16384), + (7168, 18432), + (18432 * 2, 7168), + (24576, 1536), + (12288, 7168), + (4096, 7168), + (7168, 2048), +] + + +def build_w8a8_block_fp8_runner(M, N, K, block_size, device, use_cutlass): + """Build runner function for w8a8 block fp8 matmul.""" + factor_for_scale = 1e-2 + + fp8_info = torch.finfo(torch.float8_e4m3fn) + fp8_max, fp8_min = fp8_info.max, fp8_info.min + + # Create random FP8 tensors + A_ref = (torch.rand(M, K, dtype=torch.bfloat16, device=device) - 0.5) * 2 * fp8_max + + B_ref = (torch.rand(N, K, dtype=torch.bfloat16, device=device) - 0.5) * 2 * fp8_max + B = B_ref.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn) + + # Create scales + block_n, block_k = block_size[0], block_size[1] + n_tiles = (N + block_n - 1) // block_n + k_tiles = (K + block_k - 1) // block_k + + Bs = ( + torch.rand(n_tiles, k_tiles, dtype=torch.float32, device=device) + * factor_for_scale + ) + + # SM90 CUTLASS requires row-major format for scales + if use_cutlass and current_platform.is_device_capability(90): + Bs = Bs.T.contiguous() + + def run(): + if use_cutlass: + return apply_w8a8_block_fp8_linear( + A_ref, B, block_size, Bs, cutlass_block_fp8_supported=True + ) + else: + return apply_w8a8_block_fp8_linear( + A_ref, B, block_size, Bs, cutlass_block_fp8_supported=False + ) + + return run + + +# Determine available providers +available_providers = ["torch-bf16", "w8a8-block-fp8-triton"] +plot_title = "BF16 vs W8A8 Block FP8 GEMMs" + +if CUTLASS_BLOCK_FP8_SUPPORTED: + available_providers.append("w8a8-block-fp8-cutlass") + + +@vllm_triton.testing.perf_report( + vllm_triton.testing.Benchmark( + x_names=["batch_size"], + x_vals=[1, 16, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384], + x_log=False, + line_arg="provider", + line_vals=available_providers, + line_names=available_providers, + ylabel="TFLOP/s (larger is better)", + plot_name="BF16 vs W8A8 Block FP8 GEMMs", + args={}, + ) +) +def benchmark_tflops(batch_size, provider, N, K, block_size=(128, 128)): + M = batch_size + device = "cuda" + + quantiles = [0.5, 0.2, 0.8] + + if provider == "torch-bf16": + a = torch.randn((M, K), device=device, dtype=torch.bfloat16) + b = torch.randn((N, K), device=device, dtype=torch.bfloat16) + ms, min_ms, max_ms = vllm_triton.testing.do_bench_cudagraph( + lambda: torch.nn.functional.linear(a, b), quantiles=quantiles + ) + elif provider == "w8a8-block-fp8-triton": + run_w8a8_triton = build_w8a8_block_fp8_runner( + M, N, K, block_size, device, use_cutlass=False + ) + ms, min_ms, max_ms = vllm_triton.testing.do_bench_cudagraph( + lambda: run_w8a8_triton(), quantiles=quantiles + ) + elif provider == "w8a8-block-fp8-cutlass": + run_w8a8_cutlass = build_w8a8_block_fp8_runner( + M, N, K, block_size, device, use_cutlass=True + ) + ms, min_ms, max_ms = vllm_triton.testing.do_bench_cudagraph( + lambda: run_w8a8_cutlass(), quantiles=quantiles + ) + else: + raise ValueError(f"Unknown provider: {provider}") + + to_tflops = lambda t_ms: (2 * M * N * K) * 
1e-12 / (t_ms * 1e-3) + return to_tflops(ms), to_tflops(max_ms), to_tflops(min_ms) + + +if __name__ == "__main__": + block_size = (128, 128) + + for N, K in DEEPSEEK_V3_SHAPES: + print(f"\nBenchmarking DeepSeek-V3, N={N} K={K}") + + print(f"TFLOP/s comparison (block_size={block_size}):") + benchmark_tflops.run( + print_data=True, + # show_plots=False, + # save_path=f"bench_w8a8_block_fp8_tflops_n{N}_k{K}", + N=N, + K=K, + block_size=block_size, + ) + + print("\nBenchmark finished!") diff --git a/benchmarks/kernels/bench_fp8_gemm.py b/benchmarks/kernels/bench_fp8_gemm.py index b964ed242..920961899 100644 --- a/benchmarks/kernels/bench_fp8_gemm.py +++ b/benchmarks/kernels/bench_fp8_gemm.py @@ -11,6 +11,80 @@ from vllm._custom_ops import scaled_fp8_quant as vllm_scaled_fp8_quant from vllm.triton_utils import triton +PROVIDER_CFGS = { + "torch-bf16": dict(enabled=True), + "fp8-tensor-w-token-a": dict( + w="tensor", a="token", no_a_quant=False, enabled=False + ), + "fp8-tensor-w-tensor-a": dict( + w="tensor", a="tensor", no_a_quant=False, enabled=True + ), + "fp8-channel-w-token-a": dict( + w="channel", a="token", no_a_quant=False, enabled=True + ), + "fp8-channel-w-tensor-a": dict( + w="channel", a="tensor", no_a_quant=False, enabled=False + ), + "fp8-tensor-w-token-a-noquant": dict( + w="tensor", a="token", no_a_quant=True, enabled=False + ), + "fp8-tensor-w-tensor-a-noquant": dict( + w="tensor", a="tensor", no_a_quant=True, enabled=True + ), + "fp8-channel-w-token-a-noquant": dict( + w="channel", a="token", no_a_quant=True, enabled=True + ), + "fp8-channel-w-tensor-a-noquant": dict( + w="channel", a="tensor", no_a_quant=True, enabled=False + ), +} + +_enabled = [k for k, v in PROVIDER_CFGS.items() if v["enabled"]] + + +def _quant_weight_fp8(b: torch.Tensor, w_type: str, device: str): + if w_type == "tensor": + scale_b = torch.ones(1, device=device, dtype=torch.float32) + b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b) + else: + b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, use_per_token_if_dynamic=True) + return b_fp8.t(), scale_b_fp8 + + +def build_fp8_runner(cfg, a, b, dtype, device): + b_fp8, scale_b_fp8 = _quant_weight_fp8(b, cfg["w"], device) + + scale_a_const = ( + torch.ones(1, device=device, dtype=torch.float32) + if cfg["a"] == "tensor" + else None + ) + + if cfg["no_a_quant"]: + if cfg["a"] == "tensor": + a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, scale_a_const) + else: + a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, use_per_token_if_dynamic=True) + + def run(): + return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype) + + return run + + if cfg["a"] == "tensor": + + def run(): + a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, scale_a_const) + return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype) + + else: + + def run(): + a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, use_per_token_if_dynamic=True) + return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype) + + return run + @triton.testing.perf_report( triton.testing.Benchmark( @@ -18,28 +92,8 @@ x_vals=[1, 16, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384], x_log=False, line_arg="provider", - line_vals=[ - "torch-bf16", - # "fp8-tensor-w-token-a", - "fp8-tensor-w-tensor-a", - "fp8-channel-w-token-a", - # "fp8-channel-w-tensor-a", - # "fp8-tensor-w-token-a-noquant", - "fp8-tensor-w-tensor-a-noquant", - "fp8-channel-w-token-a-noquant", - # "fp8-channel-w-tensor-a-noquant", - ], - line_names=[ - "torch-bf16", - # "fp8-tensor-w-token-a", - "fp8-tensor-w-tensor-a", - 
"fp8-channel-w-token-a", - # "fp8-channel-w-tensor-a", - # "fp8-tensor-w-token-a-noquant", - "fp8-tensor-w-tensor-a-noquant", - "fp8-channel-w-token-a-noquant", - # "fp8-channel-w-tensor-a-noquant", - ], + line_vals=_enabled, + line_names=_enabled, ylabel="TFLOP/s (larger is better)", plot_name="BF16 vs FP8 GEMMs", args={}, @@ -50,144 +104,34 @@ def benchmark(batch_size, provider, N, K): device = "cuda" dtype = torch.bfloat16 - # Create input tensors a = torch.randn((M, K), device=device, dtype=dtype) b = torch.randn((N, K), device=device, dtype=dtype) quantiles = [0.5, 0.2, 0.8] - if "torch-bf16" in provider: + if provider == "torch-bf16": ms, min_ms, max_ms = triton.testing.do_bench_cudagraph( lambda: torch.nn.functional.linear(a, b), quantiles=quantiles ) - - elif "fp8" in provider: - # Weights are always quantized ahead of time - if "noquant" in provider: - # For no quantization, we just measure the GEMM - if "tensor-w-token-a" in provider: - # Dynamic per-token quant for A, per-tensor quant for B - b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b) - assert scale_b_fp8.numel() == 1 - a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant( - a, use_per_token_if_dynamic=True - ) - - def run_quant(): - return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype) - - elif "tensor-w-tensor-a" in provider: - # Static per-tensor quantization with fixed scales - # for both A and B - scale_a = torch.tensor([1.0], device=device, dtype=torch.float32) - scale_b = torch.tensor([1.0], device=device, dtype=torch.float32) - b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b) - assert scale_b_fp8.numel() == 1 - a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, scale_a) - - def run_quant(): - return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype) - - elif "channel-w-token-a" in provider: - # Static per-channel quantization for weights, per-token - # quant for A - scale_b = torch.tensor((N,), device=device, dtype=torch.float32) - b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b) - scale_b_fp8 = scale_b_fp8.expand(N).contiguous() - assert scale_b_fp8.numel() == N - a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant( - a, use_per_token_if_dynamic=True - ) - - def run_quant(): - return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype) - - elif "channel-w-tensor-a" in provider: - # Static per-channel quantization for weights, per-tensor - # quant for A - scale_a = torch.tensor([1.0], device=device, dtype=torch.float32) - scale_b = torch.tensor((N,), device=device, dtype=torch.float32) - b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b) - scale_b_fp8 = scale_b_fp8.expand(N).contiguous() - assert scale_b_fp8.numel() == N - a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, scale_a) - - def run_quant(): - return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype) - - else: - # In these cases, we quantize the activations during the GEMM call - if "tensor-w-token-a" in provider: - # Dynamic per-token quant for A, per-tensor quant for B - b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b) - assert scale_b_fp8.numel() == 1 - - def run_quant(): - a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant( - a, use_per_token_if_dynamic=True - ) - return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype) - - elif "tensor-w-tensor-a" in provider: - # Static per-tensor quantization with fixed scales - # for both A and B - scale_a = torch.tensor([1.0], device=device, dtype=torch.float32) - scale_b = torch.tensor([1.0], device=device, dtype=torch.float32) - b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, 
scale_b) - assert scale_b_fp8.numel() == 1 - - def run_quant(): - a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, scale_a) - return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype) - - elif "channel-w-token-a" in provider: - # Static per-channel quantization for weights, per-token - # quant for A - scale_b = torch.tensor((N,), device=device, dtype=torch.float32) - b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b) - scale_b_fp8 = scale_b_fp8.expand(N).contiguous() - assert scale_b_fp8.numel() == N - - def run_quant(): - a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant( - a, use_per_token_if_dynamic=True - ) - return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype) - - elif "channel-w-tensor-a" in provider: - # Static per-channel quantization for weights, per-tensor - # quant for A - scale_a = torch.tensor([1.0], device=device, dtype=torch.float32) - scale_b = torch.tensor((N,), device=device, dtype=torch.float32) - b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b) - scale_b_fp8 = scale_b_fp8.expand(N).contiguous() - assert scale_b_fp8.numel() == N - - def run_quant(): - a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, scale_a) - return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype) - - b_fp8 = b_fp8.t() - + else: + cfg = PROVIDER_CFGS[provider] + run_quant = build_fp8_runner(cfg, a, b, dtype, device) ms, min_ms, max_ms = triton.testing.do_bench_cudagraph( lambda: run_quant(), quantiles=quantiles ) - # Calculate TFLOP/s, two flops per multiply-add - tflops = lambda ms: (2 * M * N * K) * 1e-12 / (ms * 1e-3) - return tflops(ms), tflops(max_ms), tflops(min_ms) + to_tflops = lambda t_ms: (2 * M * N * K) * 1e-12 / (t_ms * 1e-3) + return to_tflops(ms), to_tflops(max_ms), to_tflops(min_ms) def prepare_shapes(args): - KN_model_names = [] - models_tps = list(itertools.product(args.models, args.tp_sizes)) - for model, tp_size in models_tps: - assert model in WEIGHT_SHAPES - for KN, tp_split_dim in copy.deepcopy(WEIGHT_SHAPES[model]): - KN[tp_split_dim] = KN[tp_split_dim] // tp_size + out = [] + for model, tp_size in itertools.product(args.models, args.tp_sizes): + for KN, tp_dim in copy.deepcopy(WEIGHT_SHAPES[model]): + KN[tp_dim] //= tp_size KN.append(model) - KN_model_names.append(KN) - return KN_model_names + out.append(KN) + return out if __name__ == "__main__": @@ -197,21 +141,13 @@ def prepare_shapes(args): nargs="+", type=str, default=["meta-llama/Llama-3.1-8B-Instruct"], - choices=[*WEIGHT_SHAPES.keys()], - help="List of models to benchmark", - ) - parser.add_argument( - "--tp-sizes", - nargs="+", - type=int, - default=[1], - help="List of tensor parallel sizes", + choices=list(WEIGHT_SHAPES.keys()), ) + parser.add_argument("--tp-sizes", nargs="+", type=int, default=[1]) args = parser.parse_args() - KN_model_names = prepare_shapes(args) - for K, N, model_name in KN_model_names: - print(f"{model_name}, N={N} K={K}, BF16 vs FP8 GEMMs TFLOP/s:") + for K, N, model in prepare_shapes(args): + print(f"{model}, N={N} K={K}, BF16 vs FP8 GEMMs TFLOP/s:") benchmark.run( print_data=True, show_plots=True, diff --git a/benchmarks/kernels/bench_int8_gemm.py b/benchmarks/kernels/bench_int8_gemm.py new file mode 100644 index 000000000..e9c6d6440 --- /dev/null +++ b/benchmarks/kernels/bench_int8_gemm.py @@ -0,0 +1,169 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import argparse +import copy +import itertools + +import torch +from weight_shapes import WEIGHT_SHAPES + +from vllm._custom_ops import 
cutlass_scaled_mm as vllm_scaled_mm +from vllm._custom_ops import scaled_int8_quant as vllm_scaled_int8_quant +from vllm.triton_utils import triton + +PROVIDER_CFGS = { + "torch-bf16": dict(enabled=True), + "int8-tensor-w-token-a": dict( + w="tensor", a="token", no_a_quant=False, enabled=False + ), + "int8-tensor-w-tensor-a": dict( + w="tensor", a="tensor", no_a_quant=False, enabled=True + ), + "int8-channel-w-token-a": dict( + w="channel", a="token", no_a_quant=False, enabled=True + ), + "int8-channel-w-tensor-a": dict( + w="channel", a="tensor", no_a_quant=False, enabled=False + ), + "int8-tensor-w-token-a-noquant": dict( + w="tensor", a="token", no_a_quant=True, enabled=False + ), + "int8-tensor-w-tensor-a-noquant": dict( + w="tensor", a="tensor", no_a_quant=True, enabled=True + ), + "int8-channel-w-token-a-noquant": dict( + w="channel", a="token", no_a_quant=True, enabled=True + ), + "int8-channel-w-tensor-a-noquant": dict( + w="channel", a="tensor", no_a_quant=True, enabled=False + ), +} + + +def _quant_weight(b, w_type, device): + if w_type == "tensor": + scale_b = torch.ones(1, device=device, dtype=torch.float32) + b_int8, scale_b_int8, _ = vllm_scaled_int8_quant(b, scale_b) + assert scale_b_int8.numel() == 1 + else: # channel + b_int8, scale_b_int8, _ = vllm_scaled_int8_quant(b) + assert scale_b_int8.numel() == b.shape[0] + return b_int8.t(), scale_b_int8 + + +def build_int8_runner(cfg, a, b, dtype, device): + # quant before running the kernel + b_int8, scale_b_int8 = _quant_weight(b, cfg["w"], device) + + scale_a_const = None + if cfg["a"] == "tensor": + scale_a_const = torch.ones(1, device=device, dtype=torch.float32) + + # no quant, create activation ahead + if cfg["no_a_quant"]: + if cfg["a"] == "tensor": + a_int8, scale_a_int8, _ = vllm_scaled_int8_quant(a, scale_a_const) + else: # token + a_int8, scale_a_int8, _ = vllm_scaled_int8_quant(a) + + def run_quant(): + return vllm_scaled_mm(a_int8, b_int8, scale_a_int8, scale_b_int8, dtype) + + return run_quant + + # dynamic quant, create activation inside + if cfg["a"] == "tensor": + + def run_quant(): + a_int8, scale_a_int8, _ = vllm_scaled_int8_quant(a, scale_a_const) + return vllm_scaled_mm(a_int8, b_int8, scale_a_int8, scale_b_int8, dtype) + + else: # token + + def run_quant(): + a_int8, scale_a_int8, _ = vllm_scaled_int8_quant(a) + return vllm_scaled_mm(a_int8, b_int8, scale_a_int8, scale_b_int8, dtype) + + return run_quant + + +_enabled = [k for k, v in PROVIDER_CFGS.items() if v.get("enabled")] + + +@triton.testing.perf_report( + triton.testing.Benchmark( + x_names=["batch_size"], + x_vals=[1, 16, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384], + x_log=False, + line_arg="provider", + line_vals=_enabled, + line_names=[k for k in _enabled], + ylabel="TFLOP/s (larger is better)", + plot_name="BF16 vs INT8 GEMMs", + args={}, + ) +) +def benchmark(batch_size, provider, N, K): + M = batch_size + device = "cuda" + dtype = torch.bfloat16 + a = torch.randn((M, K), device=device, dtype=dtype) + b = torch.randn((N, K), device=device, dtype=dtype) + + quantiles = [0.5, 0.2, 0.8] + + if provider == "torch-bf16": + ms, min_ms, max_ms = triton.testing.do_bench_cudagraph( + lambda: torch.nn.functional.linear(a, b), quantiles=quantiles + ) + else: + cfg = PROVIDER_CFGS[provider] + run_quant = build_int8_runner(cfg, a, b, dtype, device) + ms, min_ms, max_ms = triton.testing.do_bench_cudagraph( + lambda: run_quant(), quantiles=quantiles + ) + + to_tflops = lambda t_ms: (2 * M * N * K) * 1e-12 / (t_ms * 1e-3) + return to_tflops(ms), 
to_tflops(max_ms), to_tflops(min_ms) + + +def prepare_shapes(args): + KN_model_names = [] + for model, tp_size in itertools.product(args.models, args.tp_sizes): + for KN, tp_dim in copy.deepcopy(WEIGHT_SHAPES[model]): + KN[tp_dim] //= tp_size + KN.append(model) + KN_model_names.append(KN) + return KN_model_names + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--models", + nargs="+", + type=str, + default=["meta-llama/Llama-3.1-8B-Instruct"], + choices=list(WEIGHT_SHAPES.keys()), + help="List of models to benchmark", + ) + parser.add_argument( + "--tp-sizes", + nargs="+", + type=int, + default=[1], + help="List of tensor parallel sizes", + ) + args = parser.parse_args() + + for K, N, model in prepare_shapes(args): + print(f"{model}, N={N} K={K}, BF16 vs INT8 GEMMs TFLOP/s:") + benchmark.run( + print_data=True, + show_plots=True, + save_path=f"bench_int8_res_n{N}_k{K}", + N=N, + K=K, + ) + + print("Benchmark finished!") diff --git a/benchmarks/kernels/bench_nvfp4_gemm.py b/benchmarks/kernels/bench_nvfp4_gemm.py new file mode 100644 index 000000000..6b19eb113 --- /dev/null +++ b/benchmarks/kernels/bench_nvfp4_gemm.py @@ -0,0 +1,198 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import argparse +import copy +import itertools +import os + +import torch +from weight_shapes import WEIGHT_SHAPES + +from vllm import _custom_ops as ops +from vllm.platforms import current_platform +from vllm.scalar_type import scalar_types +from vllm.triton_utils import triton + +if not current_platform.has_device_capability(100): + raise RuntimeError("NVFP4 requires compute capability of 10.0 (Blackwell)") + + +FLOAT4_E2M1_MAX = scalar_types.float4_e2m1f.max() +FLOAT8_E4M3_MAX = torch.finfo(torch.float8_e4m3fn).max + +PROVIDER_CFGS = { + "torch-bf16": dict(enabled=True), + "nvfp4": dict(no_a_quant=False, enabled=True), + "nvfp4-noquant": dict(no_a_quant=True, enabled=True), + "fbgemm-nvfp4": dict(fbgemm=True, no_a_quant=False, enabled=True), + "fbgemm-nvfp4-noquant": dict(fbgemm=True, no_a_quant=True, enabled=True), +} + +_needs_fbgemm = any( + v.get("fbgemm", False) for v in PROVIDER_CFGS.values() if v.get("enabled", False) +) +if _needs_fbgemm: + try: + from fbgemm_gpu.experimental.gemm.triton_gemm.fp4_quantize import ( + triton_scale_nvfp4_quant, + ) + except ImportError: + print( + "WARNING: FBGEMM providers are enabled but fbgemm_gpu is not installed. " + "These providers will be skipped. Please install fbgemm_gpu with: " + "'pip install fbgemm-gpu-genai' to run them." + ) + # Disable FBGEMM providers so the benchmark can run. + for cfg in PROVIDER_CFGS.values(): + if cfg.get("fbgemm"): + cfg["enabled"] = False + +_enabled = [k for k, v in PROVIDER_CFGS.items() if v["enabled"]] + + +def _quant_weight_nvfp4(b: torch.Tensor, device: str, cfg): + # Compute global scale for weight + b_amax = torch.abs(b).max().to(torch.float32) + b_global_scale = FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / b_amax + if "fbgemm" in cfg and cfg["fbgemm"]: + b_fp4, scale_b_fp4 = triton_scale_nvfp4_quant(b, b_global_scale) + else: + b_fp4, scale_b_fp4 = ops.scaled_fp4_quant(b, b_global_scale) + return b_fp4, scale_b_fp4, b_global_scale + + +def build_nvfp4_runner(cfg, a, b, dtype, device): + b_fp4, scale_b_fp4, b_global_scale = _quant_weight_nvfp4(b, device, cfg) + + # Compute global scale for activation + # NOTE: This is generally provided ahead-of-time by the model checkpoint. 
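+    # Here the benchmark derives it from the runtime abs-max instead:
+    # FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / amax maps the tensor onto the
+    # combined FP8 block-scale x FP4 value range, and the product of both
+    # global scales is undone via `alpha` after the low-precision GEMM.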
+ a_amax = torch.abs(a).max().to(torch.float32) + a_global_scale = FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / a_amax + + # Alpha for the GEMM operation + alpha = 1.0 / (a_global_scale * b_global_scale) + if "fbgemm" in cfg and cfg["fbgemm"]: + if cfg["no_a_quant"]: + a_fp4, scale_a_fp4 = triton_scale_nvfp4_quant(a, a_global_scale) + + def run(): + return torch.ops.fbgemm.f4f4bf16( + a_fp4, + b_fp4, + scale_a_fp4, + scale_b_fp4, + global_scale=alpha, + use_mx=False, + ) + + return run + else: + + def run(): + a_fp4, scale_a_fp4 = triton_scale_nvfp4_quant(a, a_global_scale) + return torch.ops.fbgemm.f4f4bf16( + a_fp4, + b_fp4, + scale_a_fp4, + scale_b_fp4, + global_scale=alpha, + use_mx=False, + ) + + return run + + if cfg["no_a_quant"]: + # Pre-quantize activation + a_fp4, scale_a_fp4 = ops.scaled_fp4_quant(a, a_global_scale) + + def run(): + return ops.cutlass_scaled_fp4_mm( + a_fp4, b_fp4, scale_a_fp4, scale_b_fp4, alpha, dtype + ) + + return run + + # Quantize activation on-the-fly + def run(): + a_fp4, scale_a_fp4 = ops.scaled_fp4_quant(a, a_global_scale) + return ops.cutlass_scaled_fp4_mm( + a_fp4, b_fp4, scale_a_fp4, scale_b_fp4, alpha, dtype + ) + + return run + + +@triton.testing.perf_report( + triton.testing.Benchmark( + x_names=["batch_size"], + x_vals=[1, 16, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384], + x_log=False, + line_arg="provider", + line_vals=_enabled, + line_names=_enabled, + ylabel="TFLOP/s (larger is better)", + plot_name="BF16 vs NVFP4 GEMMs", + args={}, + ) +) +def benchmark(batch_size, provider, N, K): + M = batch_size + device = "cuda" + dtype = torch.bfloat16 + + a = torch.randn((M, K), device=device, dtype=dtype) + b = torch.randn((N, K), device=device, dtype=dtype) + + quantiles = [0.5, 0.2, 0.8] + + if provider == "torch-bf16": + ms, min_ms, max_ms = triton.testing.do_bench_cudagraph( + lambda: torch.nn.functional.linear(a, b), quantiles=quantiles + ) + else: + cfg = PROVIDER_CFGS[provider] + run_quant = build_nvfp4_runner(cfg, a, b, dtype, device) + ms, min_ms, max_ms = triton.testing.do_bench_cudagraph( + lambda: run_quant(), quantiles=quantiles + ) + + to_tflops = lambda t_ms: (2 * M * N * K) * 1e-12 / (t_ms * 1e-3) + return to_tflops(ms), to_tflops(max_ms), to_tflops(min_ms) + + +def prepare_shapes(args): + out = [] + for model, tp_size in itertools.product(args.models, args.tp_sizes): + for KN, tp_dim in copy.deepcopy(WEIGHT_SHAPES[model]): + KN[tp_dim] //= tp_size + KN.append(model) + out.append(KN) + return out + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--models", + nargs="+", + type=str, + default=["meta-llama/Llama-3.1-8B-Instruct"], + choices=list(WEIGHT_SHAPES.keys()), + ) + parser.add_argument("--tp-sizes", nargs="+", type=int, default=[1]) + args = parser.parse_args() + + for K, N, model in prepare_shapes(args): + print(f"{model}, N={N} K={K}, BF16 vs NVFP4 GEMMs TFLOP/s:") + save_dir = f"bench_nvfp4_res_n{N}_k{K}" + os.makedirs(save_dir, exist_ok=True) + + benchmark.run( + print_data=True, + show_plots=True, + save_path=save_dir, + N=N, + K=K, + ) + + print("Benchmark finished!") diff --git a/benchmarks/kernels/bench_per_token_quant_fp8.py b/benchmarks/kernels/bench_per_token_quant_fp8.py new file mode 100644 index 000000000..e08e5680c --- /dev/null +++ b/benchmarks/kernels/bench_per_token_quant_fp8.py @@ -0,0 +1,269 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import itertools +from typing import Callable +from unittest.mock 
import patch + +import pandas as pd +import torch + +from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8 +from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape +from vllm.triton_utils import triton +from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser + + +def with_triton_mode(fn): + """Temporarily force the Triton fallback path""" + + def wrapped(*args, **kwargs): + with patch("vllm.platforms.current_platform.is_cuda", return_value=False): + return fn(*args, **kwargs) + + return wrapped + + +# TODO(luka): use standalone_compile utility +def with_dyn_arg(fn: Callable, arg_index: int, dim_index: int): + def inner(*args): + torch._dynamo.mark_dynamic(args[arg_index], dim_index) + return fn(*args) + + return inner + + +def bench_compile(fn: Callable): + # recompile for different shapes + fwd = torch.compile(fn, fullgraph=True, dynamic=False) + + # First dim is explicitly dynamic to simulate vLLM usage + return with_dyn_arg(fwd, 0, 0) + + +torch._dynamo.config.recompile_limit = 8888 + + +def calculate_diff( + batch_size: int, + hidden_size: int, + group_shape: GroupShape, + dtype: torch.dtype, +): + """Calculate the difference between Inductor and CUDA implementations.""" + device = torch.device("cuda") + x = torch.randn((batch_size, hidden_size), dtype=dtype, device=device) + + quant_fp8 = QuantFP8(False, group_shape, column_major_scales=False) + + torch_out, torch_scale = bench_compile(quant_fp8.forward_native)(x) + torch_eager_out, torch_eager_scale = quant_fp8.forward_native(x) + cuda_out, cuda_scale = quant_fp8.forward_cuda(x) + + try: + torch.testing.assert_close( + cuda_out.to(torch.float32), + torch_out.to(torch.float32), + rtol=1e-3, + atol=1e-5, + ) + torch.testing.assert_close(cuda_scale, torch_scale, rtol=1e-3, atol=1e-5) + torch.testing.assert_close( + cuda_out.to(torch.float32), + torch_eager_out.to(torch.float32), + rtol=1e-3, + atol=1e-5, + ) + torch.testing.assert_close(cuda_scale, torch_eager_scale, rtol=1e-3, atol=1e-5) + print("✅ All implementations match") + except AssertionError as e: + print("❌ Implementations differ") + print(e) + + +configs = [] + + +def benchmark_quantization( + batch_size, + hidden_size, + provider, + group_shape: GroupShape, + col_major: bool, + dtype: torch.dtype, +): + device = torch.device("cuda") + + x = torch.randn(batch_size, hidden_size, device=device, dtype=dtype) + + quantiles = [0.5, 0.2, 0.8] + quant_fp8 = QuantFP8(False, group_shape, column_major_scales=col_major) + + if provider == "torch": + fn = lambda: bench_compile(quant_fp8.forward_native)(x.clone()) + elif provider == "cuda": + fn = lambda: quant_fp8.forward_cuda(x.clone()) + elif provider == "triton": + if not group_shape.is_per_group(): + # Triton only supported for per-group + return 0, 0, 0 + + fn = lambda: with_triton_mode(quant_fp8.forward_cuda)(x.clone()) + + ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(fn, quantiles=quantiles) + + return 1000 * ms, 1000 * max_ms, 1000 * min_ms + + +# TODO(luka) extract to utils +def compute_geomean_speedups( + df: pd.DataFrame, + baseline_col: str, + speedup_cols: list[str], + groupby_cols: list[str] | None = None, +) -> pd.DataFrame: + """ + Compute geometric mean speedups over a baseline column. + + Args: + df: Input dataframe + baseline_col: Column to use as baseline + speedup_cols: Columns to compute speedups for + groupby_cols: Columns to group by. If None, compute over entire df. 
+ + Returns: + pd.DataFrame with geometric mean speedups + """ + from scipy.stats import gmean + + def geo_speedup(group: pd.DataFrame) -> pd.Series: + ratios = { + col: (group[baseline_col] / group[col]).values for col in speedup_cols + } + return pd.Series({col: gmean(vals) for col, vals in ratios.items()}) + + if groupby_cols is None: + result = geo_speedup(df).to_frame().T + else: + result = ( + df.groupby(groupby_cols) + .apply(geo_speedup, include_groups=False) + .reset_index() + ) + + return result + + +if __name__ == "__main__": + parser = FlexibleArgumentParser( + description="Benchmark the various implementations of QuantFP8 (dynamic-only)" + ) + parser.add_argument("-c", "--check", action="store_true") + parser.add_argument( + "--dtype", type=str, choices=["half", "bfloat16", "float"], default="bfloat16" + ) + parser.add_argument( + "--hidden-sizes", + type=int, + nargs="+", + default=[896, 1024, 2048, 4096, 7168], + help="Hidden sizes to benchmark", + ) + parser.add_argument( + "--batch-sizes", + type=int, + nargs="+", + default=[1, 16, 128, 512, 1024], + help="Batch sizes to benchmark", + ) + parser.add_argument( + "--group-sizes", + type=int, + nargs="+", + default=None, + help="Group sizes for GroupShape(1,N) to benchmark. " + "Use 0 for PER_TENSOR, -1 for PER_TOKEN (default: 0,-1,64,128)", + ) + parser.add_argument( + "--no-column-major", + action="store_true", + help="Disable column-major scales testing", + ) + + args = parser.parse_args() + assert args + + dtype = STR_DTYPE_TO_TORCH_DTYPE[args.dtype] + + hidden_sizes = args.hidden_sizes + batch_sizes = args.batch_sizes + + if args.group_sizes is not None: + group_shapes = [] + for size in args.group_sizes: + if size == 0: + group_shapes.append(GroupShape.PER_TENSOR) + elif size == -1: + group_shapes.append(GroupShape.PER_TOKEN) + else: + group_shapes.append(GroupShape(1, size)) + else: + group_shapes = [ + GroupShape.PER_TENSOR, + GroupShape.PER_TOKEN, + GroupShape(1, 64), + GroupShape(1, 128), + ] + + column_major_scales = [False] if args.no_column_major else [True, False] + + config_gen = itertools.product( + group_shapes, + column_major_scales, + batch_sizes, + hidden_sizes, + ) + + # filter out column-major scales for non-group, reverse order + configs.extend(c[::-1] for c in config_gen if (c[0].is_per_group() or not c[1])) + + print(f"Running {len(configs)} configurations:") + print(f" Hidden sizes: {hidden_sizes}") + print(f" Batch sizes: {batch_sizes}") + print(f" Group shapes: {[str(g) for g in group_shapes]}") + print(f" Column major scales: {column_major_scales}") + print() + + if args.check: + for group_shape in group_shapes: + group_size = group_shape[1] + print(f"{group_size=}") + calculate_diff( + batch_size=4, hidden_size=4096, group_shape=group_shape, dtype=dtype + ) + + benchmark = triton.testing.perf_report( + triton.testing.Benchmark( + x_names=["hidden_size", "batch_size", "col_major", "group_shape"], + x_vals=configs, + line_arg="provider", + line_vals=["torch", "cuda", "triton"], + line_names=["Torch (Compiled)", "CUDA", "Triton"], + styles=[("blue", "-"), ("green", "-"), ("black", "-")], + ylabel="us", + plot_name="QuantFP8 performance", + args={}, + ) + )(benchmark_quantization) + + df = benchmark.run(print_data=True, dtype=dtype, return_df=True) + + # Print geomean speedups + geo_table_grouped = compute_geomean_speedups( + df, + baseline_col="Torch (Compiled)", + speedup_cols=["CUDA", "Triton"], + groupby_cols=["col_major", "group_shape"], + ) + + print("Speedup over Torch (Compiled)") + 
print(geo_table_grouped.to_string(index=False)) diff --git a/benchmarks/kernels/benchmark_activation.py b/benchmarks/kernels/benchmark_activation.py new file mode 100644 index 000000000..93edbcc93 --- /dev/null +++ b/benchmarks/kernels/benchmark_activation.py @@ -0,0 +1,104 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +# benchmark custom activation op performance +import itertools + +import torch + +import vllm.model_executor.layers.activation # noqa F401 +from vllm.model_executor.custom_op import CustomOp +from vllm.platforms import current_platform +from vllm.triton_utils import triton +from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser + +batch_size_range = [1, 16, 32, 64, 128] +seq_len_range = [1, 16, 64, 128, 256, 512, 1024, 2048, 4096] +intermediate_size = [3072, 9728, 12288] +configs = list(itertools.product(batch_size_range, seq_len_range, intermediate_size)) + + +def benchmark_activation( + batch_size: int, + seq_len: int, + intermediate_size: int, + provider: str, + func_name: str, + dtype: torch.dtype, +): + device = "cuda" + num_tokens = batch_size * seq_len + dim = intermediate_size + current_platform.seed_everything(42) + torch.set_default_device(device) + + if func_name == "gelu_and_mul": + layer = CustomOp.op_registry[func_name](approximate="none") + elif func_name == "gelu_and_mul_tanh": + layer = CustomOp.op_registry["gelu_and_mul"](approximate="tanh") + elif func_name == "fatrelu_and_mul": + threshold = 0.5 + layer = CustomOp.op_registry[func_name](threshold) + else: + layer = CustomOp.op_registry[func_name]() + + x = torch.randn(num_tokens, dim, dtype=dtype, device=device) + compiled_layer = torch.compile(layer.forward_native) + + if provider == "custom": + fn = lambda: layer(x) + elif provider == "compiled": + fn = lambda: compiled_layer(x) + + ms, min_ms, max_ms = triton.testing.do_bench_cudagraph( + fn, quantiles=[0.5, 0.2, 0.8] + ) + return ms, max_ms, min_ms + + +if __name__ == "__main__": + parser = FlexibleArgumentParser(description="Benchmark the custom activation op.") + parser.add_argument( + "--func-name", + type=str, + choices=[ + "mul_and_silu", + "silu_and_mul", + "gelu_and_mul", + "gelu_and_mul_tanh", + "fatrelu_and_mul", + "swigluoai_and_mul", + "gelu_new", + "gelu_fast", + "quick_gelu", + ], + default="silu_and_mul", + ) + parser.add_argument( + "--dtype", type=str, choices=["half", "bfloat16", "float"], default="bfloat16" + ) + args = parser.parse_args() + assert args + + func_name = args.func_name + dtype = STR_DTYPE_TO_TORCH_DTYPE[args.dtype] + + perf_report = triton.testing.perf_report( + triton.testing.Benchmark( + x_names=["batch_size", "seq_len", "intermediate_size"], + x_vals=configs, + line_arg="provider", + line_vals=["custom", "compiled"], + line_names=["Custom OP", "Compiled"], + styles=[("blue", "-"), ("green", "-")], + ylabel="ms", + plot_name=f"{func_name}-op-performance", + args={}, + ) + ) + + perf_report( + lambda batch_size, seq_len, intermediate_size, provider: benchmark_activation( + batch_size, seq_len, intermediate_size, provider, func_name, dtype + ) + ).run(print_data=True) diff --git a/benchmarks/kernels/benchmark_aqlm.py b/benchmarks/kernels/benchmark_aqlm.py deleted file mode 100644 index 42de062b0..000000000 --- a/benchmarks/kernels/benchmark_aqlm.py +++ /dev/null @@ -1,345 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import os -import sys -from typing import 
Optional - -import torch -import torch.nn.functional as F - -from vllm import _custom_ops as ops -from vllm.model_executor.layers.quantization.aqlm import ( - dequantize_weight, - generic_dequantize_gemm, - get_int_dtype, - optimized_dequantize_gemm, -) -from vllm.utils import FlexibleArgumentParser - -os.environ["CUDA_VISIBLE_DEVICES"] = "0" - - -def torch_mult( - # [..., in_features] - input: torch.Tensor, - weights: torch.Tensor, - # [num_out_groups, 1, 1, 1] - scales: torch.Tensor, -) -> torch.Tensor: - output = F.linear(input, weights) - return output - - -def dequant_out_scale( - # [..., in_features] - input: torch.Tensor, - # [num_out_groups, num_in_groups, num_codebooks] - codes: torch.IntTensor, - # [num_codebooks, codebook_size, out_group_size, in_group_size] - codebooks: torch.Tensor, - # [num_out_groups, 1, 1, 1] - scales: torch.Tensor, - output_partition_sizes: torch.IntTensor, - bias: Optional[torch.Tensor], -) -> torch.Tensor: - weights = ops.aqlm_dequant(codes, codebooks, output_partition_sizes) - - if bias is None: - output = F.linear(input, weights, bias) - orig_shape = output.shape - flattened_output = output.view(-1, output.size(-1)) - f_scales = scales.view(-1, scales.shape[0]) - b_scales = f_scales.expand(flattened_output.shape[0], -1) - flattened_output *= b_scales - return flattened_output.view(orig_shape) - else: - b_scales = scales.view(scales.shape[:-3] + (-1,)).expand(-1, weights.shape[1]) - weights *= b_scales - return F.linear(input, weights, bias) - - -def dequant_weight_scale( - # [..., in_features] - input: torch.Tensor, - # [num_out_groups, num_in_groups, num_codebooks] - codes: torch.IntTensor, - # [num_codebooks, codebook_size, out_group_size, in_group_size] - codebooks: torch.Tensor, - # [num_out_groups, 1, 1, 1] - scales: torch.Tensor, - output_partition_sizes: torch.IntTensor, - bias: Optional[torch.Tensor], -) -> torch.Tensor: - weights = ops.aqlm_dequant(codes, codebooks, output_partition_sizes) - - b_scales = scales.view(scales.shape[:-3] + (-1,)).expand(-1, weights.shape[1]) - weights *= b_scales - return F.linear(input, weights, bias) - - -def dequant_no_scale( - # [..., in_features] - input: torch.Tensor, - # [num_out_groups, num_in_groups, num_codebooks] - codes: torch.IntTensor, - # [num_codebooks, codebook_size, out_group_size, in_group_size] - codebooks: torch.Tensor, - # [num_out_groups, 1, 1, 1] - scales: torch.Tensor, - output_partition_sizes: torch.IntTensor, - bias: Optional[torch.Tensor], -) -> torch.Tensor: - weights = ops.aqlm_dequant(codes, codebooks, output_partition_sizes) - - return F.linear(input, weights, bias) - - -# Compare the optimized 1x16 and 2x8 cuda decompression/dequant kernels against -# the generic pytorch version. -# Just visual comparison. 
-def dequant_test(k: int, parts: torch.Tensor, nbooks: int, bits: int) -> None: - n = int(parts.sum().item()) - - device = torch.device("cuda:0") - - code_range = (1 << bits) // 2 - ingroups = 8 - - codes = torch.randint( - -code_range, - code_range, - size=(n, k // ingroups, nbooks), - dtype=get_int_dtype(bits), - device=device, - ) - - codebooks = torch.randn( - size=(parts.shape[0] * nbooks, 1 << bits, 1, 8), - dtype=torch.float16, - device=device, - ) - - count = 0 - for index in range(16): - for i in range(8): - for book in range(nbooks): - codebooks[book, index, 0, i] = count * (10**book) - count += 1 - - print("codes shape", codes.shape) - - for i in range(16): - for book in range(nbooks): - codes[0, i, book] = i - codes[0, -i, book] = i - - weights = dequantize_weight(codes, codebooks, None) - weights2 = ops.aqlm_dequant(codes, codebooks, parts) - - print("weights shape:", weights.shape) - print("weights2 shape:", weights2.shape) - - print("weights are:", weights) - print("weights2 are:", weights2) - - print("first 128 weights are", weights[0, 0:128].to(torch.int32)) - print("first 128 weights2 are:", weights2[0, 0:128].to(torch.int32)) - - print("last 128 weights are", weights[0, -128:]) - print("last 128 weights2 are:", weights2[0, -128:]) - - -def main(): - parser = FlexibleArgumentParser(description="Benchmark aqlm performance.") - - # Add arguments - parser.add_argument( - "--nbooks", type=int, default=1, help="Number of codebooks (default: 1)" - ) - parser.add_argument( - "--bits", - type=int, - default=16, - help="Number of bits per code element (default: 16)", - ) - parser.add_argument( - "--test", - type=bool, - default=False, - help="Run the decompression/dequant tester rather than benchmarking " - "(default: False)", - ) - - # Parse the arguments - args = parser.parse_args() - - # Extract values - nbooks = args.nbooks - bits = args.bits - - if args.test: - dequant_test(4096, torch.tensor((4096,)), nbooks, bits) - return - - # Otherwise, benchmark. - methods = [ - ops.aqlm_gemm, - dequant_out_scale, - generic_dequantize_gemm, - optimized_dequantize_gemm, - dequant_weight_scale, - torch_mult, - dequant_no_scale, - ] - - filename = f"./aqlm_benchmark_{nbooks}x{bits}.csv" - print(f"writing benchmarks to file {filename}") - with open(filename, "w") as f: - sys.stdout = f - - print("m | k | n | n parts", end="") - for method in methods: - print(f" | {method.__name__.replace('_', ' ')} (µs)", end="") - print("") - - # These are reasonable prefill sizes. - ksandpartions = ( - (4096, (4096, 4096, 4096)), - (4096, (4096,)), - (4096, (11008, 11008)), - (11008, (4096,)), - ) - - # reasonable ranges for m. - for m in [ - 1, - 2, - 4, - 8, - 10, - 12, - 14, - 16, - 24, - 32, - 48, - 52, - 56, - 64, - 96, - 112, - 128, - 256, - 512, - 1024, - 1536, - 2048, - 3072, - 4096, - ]: - print(f"{m}", file=sys.__stdout__) - for ksp in ksandpartions: - run_grid(m, ksp[0], torch.tensor(ksp[1]), nbooks, bits, methods) - - sys.stdout = sys.__stdout__ - - -def run_grid(m: int, k: int, parts: torch.Tensor, nbooks: int, bits: int, methods): - # I didn't see visible improvements from increasing these, but feel free :) - num_warmup_trials = 1 - num_trials = 1 - - num_calls = 100 - - # warmup. 
- for method in methods: - for _ in range(num_warmup_trials): - run_timing( - num_calls=num_calls, - m=m, - k=k, - parts=parts, - nbooks=nbooks, - bits=bits, - method=method, - ) - - n = parts.sum().item() - print(f"{m} | {k} | {n} | {parts.tolist()}", end="") - - for method in methods: - best_time_us = 1e20 - for _ in range(num_trials): - kernel_dur_ms = run_timing( - num_calls=num_calls, - m=m, - k=k, - parts=parts, - nbooks=nbooks, - bits=bits, - method=method, - ) - - kernel_dur_us = 1000 * kernel_dur_ms - - if kernel_dur_us < best_time_us: - best_time_us = kernel_dur_us - - print(f" | {kernel_dur_us:.0f}", end="") - - print("") - - -def run_timing( - num_calls: int, m: int, k: int, parts: torch.Tensor, nbooks: int, bits: int, method -) -> float: - n = int(parts.sum().item()) - - device = torch.device("cuda:0") - - input = torch.randn((1, m, k), dtype=torch.float16, device=device) - - code_range = (1 << bits) // 2 - ingroups = 8 - - codes = torch.randint( - -code_range, - code_range, - size=(n, k // ingroups, nbooks), - dtype=get_int_dtype(bits), - device=device, - ) - - codebooks = torch.randn( - size=(parts.shape[0] * nbooks, 1 << bits, 1, 8), - dtype=torch.float16, - device=device, - ) - - scales = torch.randn(size=(n, 1, 1, 1), dtype=torch.float16, device=device) - - # for comparison to just a pytorch mult. - weights = torch.randn((n, k), dtype=torch.float16, device=device) - - start_event = torch.cuda.Event(enable_timing=True) - end_event = torch.cuda.Event(enable_timing=True) - - start_event.record() - - if method is torch_mult: - for i in range(num_calls): - torch_mult(input, weights, scales) - else: - for i in range(num_calls): - method(input, codes, codebooks, scales, parts, None) - - end_event.record() - end_event.synchronize() - - dur_ms = start_event.elapsed_time(end_event) / num_calls - return dur_ms - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/benchmarks/kernels/benchmark_bitblas.py b/benchmarks/kernels/benchmark_bitblas.py index 97ee06034..66b44c27d 100644 --- a/benchmarks/kernels/benchmark_bitblas.py +++ b/benchmarks/kernels/benchmark_bitblas.py @@ -3,6 +3,8 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. +from packaging import version + from vllm.model_executor.layers.quantization.utils.bitblas_utils import ( MINIMUM_BITBLAS_VERSION, ) @@ -10,7 +12,7 @@ try: import bitblas - if bitblas.__version__ < MINIMUM_BITBLAS_VERSION: + if version.parse(bitblas.__version__) < version.parse(MINIMUM_BITBLAS_VERSION): raise ImportError( "bitblas version is wrong. 
Please " f"install bitblas>={MINIMUM_BITBLAS_VERSION}" diff --git a/benchmarks/kernels/benchmark_cutlass_fp4_moe.py b/benchmarks/kernels/benchmark_cutlass_fp4_moe.py index 35c20ee41..726a2a371 100644 --- a/benchmarks/kernels/benchmark_cutlass_fp4_moe.py +++ b/benchmarks/kernels/benchmark_cutlass_fp4_moe.py @@ -13,6 +13,10 @@ from vllm import _custom_ops as ops from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config +from vllm.model_executor.layers.fused_moe.config import ( + fp8_w8a8_moe_quant_config, + nvfp4_moe_quant_config, +) from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp4 from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk from vllm.scalar_type import scalar_types @@ -140,6 +144,12 @@ def run_triton_moe( a_fp8_scale: torch.Tensor, num_repeats: int, ): + quant_config = fp8_w8a8_moe_quant_config( + w1_scale=w1_scale, + w2_scale=w2_scale, + a1_scale=a_fp8_scale, + ) + for _ in range(num_repeats): fused_experts( a, @@ -147,10 +157,7 @@ def run_triton_moe( w2, topk_weights, topk_ids, - use_fp8_w8a8=True, - w1_scale=w1_scale, - w2_scale=w2_scale, - a1_scale=a_fp8_scale, + quant_config=quant_config, ) def run_cutlass_moe_fp4( @@ -172,25 +179,27 @@ def run_cutlass_moe_fp4( device: torch.device, num_repeats: int, ): + quant_config = nvfp4_moe_quant_config( + a1_gscale=a1_gs, + a2_gscale=a2_gs, + w1_scale=w1_blockscale, + w2_scale=w2_blockscale, + g1_alphas=w1_gs, + g2_alphas=w2_gs, + ) for _ in range(num_repeats): with nvtx.annotate("cutlass_moe_fp4", color="green"): cutlass_moe_fp4( a=a, - a1_gscale=a1_gs, - a2_gscale=a2_gs, w1_fp4=w1_fp4, - w1_blockscale=w1_blockscale, - w1_alphas=w1_gs, w2_fp4=w2_fp4, - w2_blockscale=w2_blockscale, - w2_alphas=w2_gs, topk_weights=topk_weights, topk_ids=topk_ids, m=m, n=n, k=k, e=num_experts, - device=device, + quant_config=quant_config, ) def run_cutlass_from_graph( @@ -211,26 +220,29 @@ def run_cutlass_from_graph( e: int, device: torch.device, ): + quant_config = nvfp4_moe_quant_config( + a1_gscale=a1_gs, + a2_gscale=a2_gs, + w1_scale=w1_blockscale, + w2_scale=w2_blockscale, + g1_alphas=w1_gs, + g2_alphas=w2_gs, + ) + with set_current_vllm_config( VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1)) ): return cutlass_moe_fp4( a=a, - a1_gscale=a1_gs, w1_fp4=w1_fp4, - w1_blockscale=w1_blockscale, - w1_alphas=w1_alphas, - a2_gscale=a2_gs, w2_fp4=w2_fp4, - w2_blockscale=w2_blockscale, - w2_alphas=w2_alphas, topk_weights=topk_weights, topk_ids=topk_ids, m=m, n=n, k=k, e=num_experts, - device=device, + quant_config=quant_config, ) def run_triton_from_graph( @@ -246,16 +258,18 @@ def run_triton_from_graph( with set_current_vllm_config( VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1)) ): + quant_config = fp8_w8a8_moe_quant_config( + w1_scale=w1_scale, + w2_scale=w2_scale, + a1_scale=a_fp8_scale, + ) return fused_experts( a, w1, w2, topk_weights, topk_ids, - use_fp8_w8a8=True, - w1_scale=w1_scale, - w2_scale=w2_scale, - a1_scale=a_fp8_scale, + quant_config=quant_config, ) def replay_graph(graph, num_repeats): diff --git a/benchmarks/kernels/benchmark_cutlass_moe_fp8.py b/benchmarks/kernels/benchmark_cutlass_moe_fp8.py new file mode 100644 index 000000000..b419b2fa0 --- /dev/null +++ b/benchmarks/kernels/benchmark_cutlass_moe_fp8.py @@ -0,0 +1,406 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Benchmark the performance of the cutlass_moe_fp8 kernel vs the triton_moe +kernel. 
Both kernels take in fp8 quantized weights and 16-bit activations, +but use different quantization strategies and backends. +""" + +import nvtx +import torch + +from vllm import _custom_ops as ops +from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_config +from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp8 +from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk +from vllm.platforms import current_platform +from vllm.utils import FlexibleArgumentParser + +# Weight shapes for different models: [num_experts, topk, hidden_size, +# intermediate_size] +WEIGHT_SHAPES_MOE = { + "mixtral-8x7b": [ + [8, 2, 4096, 14336], + ], + "deepseek-v2": [ + [160, 6, 5120, 12288], + ], + "custom-small": [ + [8, 2, 2048, 7168], + ], + "glm45-fp8": [ + [128, 8, 4096, 1408], + ], + "Llama-4-Maverick-17B-128E-Instruct-FP8": [ + [128, 1, 5120, 8192], + ], +} + +DEFAULT_MODELS = [ + "mixtral-8x7b", +] + +DEFAULT_BATCH_SIZES = [4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048] +DEFAULT_TP_SIZES = [1] + +PER_ACT_TOKEN_OPTS = [False, True] +PER_OUT_CH_OPTS = [False, True] + +FP8_DTYPE = current_platform.fp8_dtype() + + +def bench_run( + results: list, + model: str, + num_experts: int, + topk: int, + per_act_token: bool, + per_out_ch: bool, + mkn: tuple[int, int, int], +): + (m, k, n) = mkn + + dtype = torch.half + device = "cuda" + + # Create input activations + a = torch.randn((m, k), device=device, dtype=dtype) / 10 + + # Create weights + w1 = torch.randn((num_experts, 2 * n, k), device=device, dtype=dtype) / 10 + w2 = torch.randn((num_experts, k, n), device=device, dtype=dtype) / 10 + + # Create FP8 quantized weights and scales for both kernels + w1_fp8q = torch.empty((num_experts, 2 * n, k), device=device, dtype=FP8_DTYPE) + w2_fp8q = torch.empty((num_experts, k, n), device=device, dtype=FP8_DTYPE) + + # Create scales based on quantization strategy + if per_out_ch: + # Per-channel quantization + w1_scale = torch.empty( + (num_experts, 2 * n, 1), device=device, dtype=torch.float32 + ) + w2_scale = torch.empty((num_experts, k, 1), device=device, dtype=torch.float32) + else: + # Per-tensor quantization + w1_scale = torch.empty((num_experts, 1, 1), device=device, dtype=torch.float32) + w2_scale = torch.empty((num_experts, 1, 1), device=device, dtype=torch.float32) + + # Quantize weights + for expert in range(num_experts): + if per_out_ch: + # Per-channel quantization - not yet implemented properly + # For now, fall back to per-tensor quantization + w1_fp8q[expert], w1_scale_temp = ops.scaled_fp8_quant(w1[expert]) + w2_fp8q[expert], w2_scale_temp = ops.scaled_fp8_quant(w2[expert]) + # Expand scalar scales to the expected per-channel shape + w1_scale[expert] = w1_scale_temp.expand(2 * n, 1) + w2_scale[expert] = w2_scale_temp.expand(k, 1) + else: + # Per-tensor quantization + w1_fp8q[expert], w1_scale_temp = ops.scaled_fp8_quant(w1[expert]) + w2_fp8q[expert], w2_scale_temp = ops.scaled_fp8_quant(w2[expert]) + # Store scalar scales in [1, 1] tensors + w1_scale[expert, 0, 0] = w1_scale_temp + w2_scale[expert, 0, 0] = w2_scale_temp + + # Prepare weights for CUTLASS (no transpose needed) + w1_fp8q_cutlass = w1_fp8q # Keep original [E, 2N, K] + w2_fp8q_cutlass = w2_fp8q # Keep original [E, K, N] + + # Create router scores and get topk + score = torch.randn((m, num_experts), device=device, dtype=dtype) + topk_weights, topk_ids, _ = fused_topk(a, score, topk, renormalize=False) + + # WORKAROUND: CUTLASS MoE FP8 has issues with per-token quantization + # 
Force per-tensor quantization for all cases to match working e2e setup + a1_scale = torch.full((), 1e-2, device=device, dtype=torch.float32) + a2_scale = torch.full((), 1e-2, device=device, dtype=torch.float32) + + # Force per-tensor quantization for all cases + per_act_token = False + + # Create stride tensors for CUTLASS + ab_strides1 = torch.full((num_experts,), k, dtype=torch.int64, device=device) + ab_strides2 = torch.full((num_experts,), n, dtype=torch.int64, device=device) + c_strides1 = torch.full((num_experts,), 2 * n, dtype=torch.int64, device=device) + c_strides2 = torch.full((num_experts,), k, dtype=torch.int64, device=device) + + def run_triton_moe( + a: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + w1_scale: torch.Tensor, + w2_scale: torch.Tensor, + a1_scale: torch.Tensor, + a2_scale: torch.Tensor, + num_repeats: int, + ): + quant_config = fp8_w8a8_moe_quant_config( + w1_scale=w1_scale, + w2_scale=w2_scale, + a1_scale=a1_scale, + a2_scale=a2_scale, + per_act_token_quant=per_act_token, + per_out_ch_quant=per_out_ch, + ) + + for _ in range(num_repeats): + fused_experts( + a, + w1, + w2, + topk_weights, + topk_ids, + quant_config=quant_config, + ) + + def run_cutlass_moe_fp8( + a: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + ab_strides1: torch.Tensor, + ab_strides2: torch.Tensor, + c_strides1: torch.Tensor, + c_strides2: torch.Tensor, + w1_scale: torch.Tensor, + w2_scale: torch.Tensor, + a1_scale: torch.Tensor, + a2_scale: torch.Tensor, + num_repeats: int, + ): + quant_config = fp8_w8a8_moe_quant_config( + w1_scale=w1_scale, + w2_scale=w2_scale, + a1_scale=a1_scale, + a2_scale=a2_scale, + per_act_token_quant=per_act_token, + per_out_ch_quant=per_out_ch, + ) + + for _ in range(num_repeats): + with nvtx.annotate("cutlass_moe_fp8", color="blue"): + cutlass_moe_fp8( + a=a, + w1_q=w1, + w2_q=w2, + topk_weights=topk_weights, + topk_ids=topk_ids, + ab_strides1=ab_strides1, + ab_strides2=ab_strides2, + c_strides1=c_strides1, + c_strides2=c_strides2, + quant_config=quant_config, + activation="silu", + global_num_experts=num_experts, + ) + + # Pre-create quantization config to avoid creating it inside CUDA graph + quant_config = fp8_w8a8_moe_quant_config( + w1_scale=w1_scale, + w2_scale=w2_scale, + a1_scale=a1_scale, + a2_scale=a2_scale, + per_act_token_quant=per_act_token, + per_out_ch_quant=per_out_ch, + ) + + # Create CUDA graphs for CUTLASS (match benchmark_moe.py pattern exactly) + cutlass_stream = torch.cuda.Stream() + cutlass_graph = torch.cuda.CUDAGraph() + with torch.cuda.graph(cutlass_graph, stream=cutlass_stream): + # Capture 10 invocations like benchmark_moe.py + for _ in range(10): + cutlass_moe_fp8( + a=a, + w1_q=w1_fp8q_cutlass, + w2_q=w2_fp8q_cutlass, + topk_weights=topk_weights, + topk_ids=topk_ids, + ab_strides1=ab_strides1, + ab_strides2=ab_strides2, + c_strides1=c_strides1, + c_strides2=c_strides2, + quant_config=quant_config, + activation="silu", + global_num_experts=num_experts, + ) + torch.cuda.synchronize() + + # Create CUDA graphs for Triton (match benchmark_moe.py pattern exactly) + triton_stream = torch.cuda.Stream() + triton_graph = torch.cuda.CUDAGraph() + with torch.cuda.graph(triton_graph, stream=triton_stream): + # Capture 10 invocations like benchmark_moe.py + for _ in range(10): + fused_experts( + a, + w1_fp8q, + w2_fp8q, + topk_weights, + topk_ids, + quant_config=quant_config, + ) + torch.cuda.synchronize() + + def 
bench_cuda_graph(graph, num_warmup=5, num_iters=100): + """Benchmark CUDA graph using events like benchmark_moe.py""" + # Warmup + for _ in range(num_warmup): + graph.replay() + torch.cuda.synchronize() + + # Timing + start_event = torch.cuda.Event(enable_timing=True) + end_event = torch.cuda.Event(enable_timing=True) + + latencies = [] + for _ in range(num_iters): + torch.cuda.synchronize() + start_event.record() + graph.replay() + end_event.record() + end_event.synchronize() + latencies.append(start_event.elapsed_time(end_event)) + + # Divide by 10 since graph contains 10 calls + return sum(latencies) / (num_iters * 10) + + # Benchmark parameters + num_warmup = 5 + num_iters = 100 + + # Benchmark only CUDA graphs (more reliable and faster) + # Benchmark Triton MoE with CUDA graphs + triton_graph_time = bench_cuda_graph( + triton_graph, num_warmup=num_warmup, num_iters=num_iters + ) + + # Benchmark CUTLASS MoE with CUDA graphs + cutlass_graph_time = bench_cuda_graph( + cutlass_graph, num_warmup=num_warmup, num_iters=num_iters + ) + + # Convert ms to us and return results + triton_time_us = triton_graph_time * 1000 + cutlass_time_us = cutlass_graph_time * 1000 + + return { + "batch_size": m, + "triton_time_us": triton_time_us, + "cutlass_time_us": cutlass_time_us, + } + + +def main(args): + print("Benchmarking models:") + for i, model in enumerate(args.models): + print(f"[{i}] {model}") + + all_results = [] + + for model in args.models: + for tp in args.tp_sizes: + for layer in WEIGHT_SHAPES_MOE[model]: + num_experts = layer[0] + topk = layer[1] + size_k = layer[2] + size_n = layer[3] // tp + + if len(args.limit_k) > 0 and size_k not in args.limit_k: + continue + + if len(args.limit_n) > 0 and size_n not in args.limit_n: + continue + + for per_act_token in args.per_act_token_opts: + for per_out_ch in args.per_out_ch_opts: + print( + f"\n=== {model}, experts={num_experts}, topk={topk}," + f"per_act={per_act_token}, per_out_ch={per_out_ch} ===" + ) + + config_results = [] + for size_m in args.batch_sizes: + mkn = (size_m, size_k, size_n) + result = bench_run( + [], # Not used anymore + model, + num_experts, + topk, + per_act_token, + per_out_ch, + mkn, + ) + if result: + config_results.append(result) + + # Print results table for this configuration + if config_results: + print( + f"\n{'Batch Size':<12}" + f"{'Triton (us)':<15}" + f"{'CUTLASS (us)':<15}" + ) + print("-" * 45) + for result in config_results: + print( + f"{result['batch_size']:<12}" + f"{result['triton_time_us']:<15.2f}" + f"{result['cutlass_time_us']:<15.2f}" + ) + + all_results.extend(config_results) + + print(f"\nTotal benchmarks completed: {len(all_results)}") + + +if __name__ == "__main__": + parser = FlexibleArgumentParser( + description="""Benchmark CUTLASS FP8 MOE vs Triton FP8 FUSED MOE + across specified models/shapes/batches + + Example usage: + python benchmark_cutlass_moe_fp8.py \ + --model "Llama-4-Maverick-17B-128E-Instruct-FP8" \ + --tp-sizes 8 \ + --batch-size 2 4 8 \ + --per-act-token-opts false \ + --per-out-ch-opts false + + """ + ) + parser.add_argument( + "--models", + nargs="+", + type=str, + default=DEFAULT_MODELS, + choices=WEIGHT_SHAPES_MOE.keys(), + ) + parser.add_argument("--tp-sizes", nargs="+", type=int, default=DEFAULT_TP_SIZES) + parser.add_argument( + "--batch-sizes", nargs="+", type=int, default=DEFAULT_BATCH_SIZES + ) + parser.add_argument("--limit-k", nargs="+", type=int, default=[]) + parser.add_argument("--limit-n", nargs="+", type=int, default=[]) + parser.add_argument( + 
"--per-act-token-opts", + nargs="+", + type=lambda x: x.lower() == "true", + default=[False, True], + help="Per-activation token quantization options (true/false)", + ) + parser.add_argument( + "--per-out-ch-opts", + nargs="+", + type=lambda x: x.lower() == "true", + default=[False, True], + help="Per-output channel quantization options (true/false)", + ) + + args = parser.parse_args() + main(args) diff --git a/benchmarks/kernels/benchmark_device_communicators.py b/benchmarks/kernels/benchmark_device_communicators.py new file mode 100644 index 000000000..4cbdde5a5 --- /dev/null +++ b/benchmarks/kernels/benchmark_device_communicators.py @@ -0,0 +1,508 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +""" +Benchmark script for device communicators: +CustomAllreduce (oneshot, twoshot), PyNcclCommunicator, +and SymmMemCommunicator (multimem, two-shot). + +for NCCL symmetric memory you need to set the environment variables +NCCL_NVLS_ENABLE=1 NCCL_CUMEM_ENABLE=1 VLLM_USE_NCCL_SYMM_MEM=1, otherwise NCCL does +not use fast NVLS implementation for all reduce. + +Usage: + torchrun --nproc_per_node= benchmark_device_communicators.py [options] + +Example: + torchrun --nproc_per_node=2 benchmark_device_communicators.py + --sequence-lengths 512 1024 2048 --num-warmup 10 --num-trials 100 +""" + +import json +import os +import time +from contextlib import nullcontext +from typing import Callable, Optional + +import torch +import torch.distributed as dist +from torch.distributed import ProcessGroup + +from vllm.distributed.device_communicators.custom_all_reduce import CustomAllreduce +from vllm.distributed.device_communicators.pynccl import ( + PyNcclCommunicator, + register_nccl_symmetric_ops, +) +from vllm.distributed.device_communicators.pynccl_allocator import ( + set_graph_pool_id, +) +from vllm.distributed.device_communicators.symm_mem import SymmMemCommunicator +from vllm.logger import init_logger +from vllm.utils import FlexibleArgumentParser + +logger = init_logger(__name__) + +# Default sequence lengths to benchmark +DEFAULT_SEQUENCE_LENGTHS = [128, 512, 1024, 2048, 4096, 8192] + +# Fixed hidden size and dtype for all benchmarks +HIDDEN_SIZE = 8192 +BENCHMARK_DTYPE = torch.bfloat16 + +# CUDA graph settings +CUDA_GRAPH_CAPTURE_CYCLES = 10 + + +class CommunicatorBenchmark: + """Benchmark class for testing device communicators.""" + + def __init__( + self, + rank: int, + world_size: int, + device: torch.device, + cpu_group: ProcessGroup, + sequence_lengths: list[int], + ): + self.rank = rank + self.world_size = world_size + self.device = device + self.cpu_group = cpu_group + + # Calculate max_size_override based on largest sequence length + max_seq_len = max(sequence_lengths) + max_tensor_elements = max_seq_len * HIDDEN_SIZE + self.max_size_override = max_tensor_elements * BENCHMARK_DTYPE.itemsize + 1 + + # Initialize communicators + self.custom_allreduce = None + self.pynccl_comm = None + self.symm_mem_comm = None + self.symm_mem_comm_multimem = None + self.symm_mem_comm_two_shot = None + + self._init_communicators() + + def _init_communicators(self): + """Initialize all available communicators.""" + try: + self.custom_allreduce = CustomAllreduce( + group=self.cpu_group, + device=self.device, + max_size=self.max_size_override, + ) + if not self.custom_allreduce.disabled: + logger.info("Rank %s: CustomAllreduce initialized", self.rank) + else: + logger.info("Rank %s: CustomAllreduce disabled", self.rank) + 
except Exception as e: + logger.warning( + "Rank %s: Failed to initialize CustomAllreduce: %s", self.rank, e + ) + self.custom_allreduce = None + + try: + self.pynccl_comm = PyNcclCommunicator( + group=self.cpu_group, device=self.device + ) + if not self.pynccl_comm.disabled: + logger.info("Rank %s: PyNcclCommunicator initialized", self.rank) + register_nccl_symmetric_ops(self.pynccl_comm) + else: + logger.info("Rank %s: PyNcclCommunicator disabled", self.rank) + self.pynccl_comm = None + except Exception as e: + logger.warning( + "Rank %s: Failed to initialize PyNcclCommunicator: %s", self.rank, e + ) + self.pynccl_comm = None + + # Initialize variants for SymmMemCommunicator + try: + self.symm_mem_comm_multimem = SymmMemCommunicator( + group=self.cpu_group, + device=self.device, + force_multimem=True, + max_size_override=self.max_size_override, + ) + if not self.symm_mem_comm_multimem.disabled: + logger.info( + "Rank %s: SymmMemCommunicator (multimem) initialized", self.rank + ) + else: + self.symm_mem_comm_multimem = None + except Exception as e: + logger.warning( + "Rank %s: Failed to initialize SymmMemCommunicator (multimem): %s", + self.rank, + e, + ) + self.symm_mem_comm_multimem = None + + try: + self.symm_mem_comm_two_shot = SymmMemCommunicator( + group=self.cpu_group, + device=self.device, + force_multimem=False, + max_size_override=self.max_size_override, + ) + if not self.symm_mem_comm_two_shot.disabled: + logger.info( + "Rank %s: SymmMemCommunicator (two_shot) initialized", self.rank + ) + else: + self.symm_mem_comm_two_shot = None + except Exception as e: + logger.warning( + "Rank %s: Failed to initialize SymmMemCommunicator (two_shot): %s", + self.rank, + e, + ) + self.symm_mem_comm_two_shot = None + + def benchmark_allreduce( + self, sequence_length: int, num_warmup: int, num_trials: int + ) -> dict[str, float]: + """Benchmark allreduce operations for all available communicators.""" + + results = {} + + # Define communicators with their benchmark functions + communicators = [] + + if self.custom_allreduce is not None: + comm = self.custom_allreduce + # CustomAllreduce one-shot + communicators.append( + ( + "ca_1stage", + lambda t, c=comm: c.custom_all_reduce(t), + lambda t, c=comm: c.should_custom_ar(t), + comm.capture(), + "1stage", # env variable value + ) + ) + # CustomAllreduce two-shot + communicators.append( + ( + "ca_2stage", + lambda t, c=comm: c.custom_all_reduce(t), + lambda t, c=comm: c.should_custom_ar(t), + comm.capture(), + "2stage", # env variable value + ) + ) + + if self.pynccl_comm is not None: + comm = self.pynccl_comm + communicators.append( + ( + "pynccl", + lambda t, c=comm: c.all_reduce(t), + lambda t: True, # Always available if initialized + nullcontext(), + None, # no env variable needed + ) + ) + communicators.append( + ( + "pynccl-symm", + lambda t: torch.ops.vllm.all_reduce_symmetric_with_copy(t), + lambda t: True, # Always available if initialized + nullcontext(), + None, # no env variable needed + ) + ) + + if self.symm_mem_comm_multimem is not None: + comm = self.symm_mem_comm_multimem + communicators.append( + ( + "symm_mem_multimem", + lambda t, c=comm: c.all_reduce(t), + lambda t, c=comm: c.should_use_symm_mem(t), + nullcontext(), + None, # no env variable needed + ) + ) + + if self.symm_mem_comm_two_shot is not None: + comm = self.symm_mem_comm_two_shot + communicators.append( + ( + "symm_mem_two_shot", + lambda t, c=comm: c.all_reduce(t), + lambda t, c=comm: c.should_use_symm_mem(t), + nullcontext(), + None, # no env variable needed + ) 
+ ) + + # Benchmark each communicator + for name, allreduce_fn, should_use_fn, context, env_var in communicators: + # Set environment variable if needed + if env_var is not None: + os.environ["VLLM_CUSTOM_ALLREDUCE_ALGO"] = env_var + else: + # Clear the environment variable to avoid interference + os.environ.pop("VLLM_CUSTOM_ALLREDUCE_ALGO", None) + + latency = self.benchmark_allreduce_single( + sequence_length, + allreduce_fn, + should_use_fn, + context, + num_warmup, + num_trials, + ) + if latency is not None: + results[name] = latency + + return results + + def benchmark_allreduce_single( + self, + sequence_length: int, + allreduce_fn: Callable[[torch.Tensor], Optional[torch.Tensor]], + should_use_fn: Callable[[torch.Tensor], bool], + context, + num_warmup: int, + num_trials: int, + ) -> Optional[float]: + """Benchmark method with CUDA graph optimization.""" + try: + # Create test tensor (2D: sequence_length x hidden_size) + tensor = torch.randn( + sequence_length, HIDDEN_SIZE, dtype=BENCHMARK_DTYPE, device=self.device + ) + if not should_use_fn(tensor): + return None + + torch.cuda.synchronize() + stream = torch.cuda.Stream() + with torch.cuda.stream(stream): + graph_input = tensor.clone() + + # Warmup before capture + for _ in range(3): + allreduce_fn(graph_input) + + # Capture the graph using context manager + with context: + graph = torch.cuda.CUDAGraph() + graph_pool = torch.cuda.graph_pool_handle() + set_graph_pool_id(graph_pool) + with torch.cuda.graph(graph, pool=graph_pool): + for _ in range(CUDA_GRAPH_CAPTURE_CYCLES): + allreduce_fn(graph_input) + + torch.cuda.synchronize() + for _ in range(num_warmup): + graph.replay() + torch.cuda.synchronize() + + torch.cuda.synchronize() + start_time = time.perf_counter() + + for _ in range(num_trials): + graph.replay() + torch.cuda.synchronize() + + end_time = time.perf_counter() + + # Convert to ms and divide by CUDA_GRAPH_CAPTURE_CYCLES + return ( + (end_time - start_time) / num_trials / CUDA_GRAPH_CAPTURE_CYCLES * 1000 + ) + + except Exception as e: + logger.error("CUDA graph benchmark failed: %s", e) + raise RuntimeError( + f"CUDA graph benchmark failed for communicator: {e}" + ) from e + + +def _calculate_speedup_info(comm_results: dict[str, float]) -> str: + """Calculate speedup information for a single tensor size.""" + if not comm_results: + return "N/A" + + # Find the fastest communicator + fastest_comm = min(comm_results.keys(), key=lambda k: comm_results[k]) + fastest_time = comm_results[fastest_comm] + + # Calculate speedup vs PyNccl if available + if "pynccl" in comm_results: + pynccl_time = comm_results["pynccl"] + speedup = pynccl_time / fastest_time + return f"{fastest_comm} ({speedup:.2f}x)" + else: + return f"{fastest_comm} (N/A)" + + +def print_results( + results: dict[str, dict[str, float]], sequence_lengths: list[int], world_size: int +): + """Print benchmark results in a formatted table.""" + + print(f"\n{'=' * 130}") + print("Device Communicator Benchmark Results") + print( + f"World Size: {world_size}, Data Type: {BENCHMARK_DTYPE}, " + f"Hidden Size: {HIDDEN_SIZE}" + ) + print(f"{'=' * 130}") + + # Get all communicator names + all_comms = set() + for size_results in results.values(): + all_comms.update(size_results.keys()) + + all_comms = sorted(list(all_comms)) + + # Print header + header = f"{'Tensor Shape':<20}{'Tensor Size':<15}" + for comm in all_comms: + header += f"{comm:<20}" + header += f"{'Best (Speedup vs PyNccl)':<30}" + print(header) + print("-" * len(header)) + + # Print results for each sequence 
length + for seq_len in sequence_lengths: + if seq_len in results: + # Calculate tensor size in elements and bytes + tensor_elements = seq_len * HIDDEN_SIZE + tensor_bytes = tensor_elements * BENCHMARK_DTYPE.itemsize + + # Format tensor size (MB) + tensor_size_mb = tensor_bytes / (1024 * 1024) + tensor_size_str = f"{tensor_size_mb:.2f} MB" + + # Format tensor shape + tensor_shape = f"({seq_len}, {HIDDEN_SIZE})" + + row = f"{tensor_shape:<20}{tensor_size_str:<15}" + for comm in all_comms: + if comm in results[seq_len]: + row += f"{results[seq_len][comm]:<20.3f}" + else: + row += f"{'N/A':<20}" + + # Calculate speedup information + speedup_info = _calculate_speedup_info(results[seq_len]) + row += f"{speedup_info:<30}" + + print(row) + + print(f"{'=' * 130}") + print("All times are in milliseconds (ms) per allreduce operation") + print("Speedup column shows: fastest_algorithm (speedup_vs_pynccl)") + + +def main(): + parser = FlexibleArgumentParser(description="Benchmark device communicators") + + parser.add_argument( + "--sequence-lengths", + type=int, + nargs="+", + default=DEFAULT_SEQUENCE_LENGTHS, + help="Sequence lengths to benchmark (tensor shape: seq_len x hidden_size)", + ) + + parser.add_argument( + "--num-warmup", type=int, default=5, help="Number of warmup iterations" + ) + + parser.add_argument( + "--num-trials", type=int, default=50, help="Number of benchmark trials" + ) + + parser.add_argument("--output-json", type=str, help="Output results to JSON file") + + args = parser.parse_args() + + # Initialize distributed + if not dist.is_initialized(): + dist.init_process_group(backend="gloo") + rank = dist.get_rank() + world_size = dist.get_world_size() + + # Set device + device = torch.device(f"cuda:{rank}") + torch.cuda.set_device(device) + + # Get CPU process group + cpu_group = dist.new_group(backend="gloo") + + # Disable USE_SYMM_MEM to avoid affecting the max_sizes + # in symm_mem and custom_all_reduce for benchmark + os.environ["VLLM_ALLREDUCE_USE_SYMM_MEM"] = "0" + + # Initialize benchmark + benchmark = CommunicatorBenchmark( + rank, world_size, device, cpu_group, args.sequence_lengths + ) + + # Run benchmarks + all_results = {} + + for seq_len in args.sequence_lengths: + if rank == 0: + logger.info( + "Benchmarking sequence length: %s (tensor shape: %s x %s)", + seq_len, + seq_len, + HIDDEN_SIZE, + ) + + results = benchmark.benchmark_allreduce( + sequence_length=seq_len, + num_warmup=args.num_warmup, + num_trials=args.num_trials, + ) + + all_results[seq_len] = results + + # Synchronize between ranks + dist.barrier() + + # Print results (only rank 0) + if rank == 0: + print_results(all_results, args.sequence_lengths, world_size) + + # Save to JSON if requested + if args.output_json: + # Add speedup information to results + enhanced_results = {} + for seq_len, comm_results in all_results.items(): + enhanced_results[seq_len] = { + "timings": comm_results, + "speedup_info": _calculate_speedup_info(comm_results), + } + + output_data = { + "world_size": world_size, + "dtype": str(BENCHMARK_DTYPE), + "hidden_size": HIDDEN_SIZE, + "sequence_lengths": args.sequence_lengths, + "num_warmup": args.num_warmup, + "num_trials": args.num_trials, + "cuda_graph_capture_cycles": CUDA_GRAPH_CAPTURE_CYCLES, + "results": enhanced_results, + } + + with open(args.output_json, "w") as f: + json.dump(output_data, f, indent=2) + + logger.info("Results saved to %s", args.output_json) + + # Cleanup + if cpu_group != dist.group.WORLD: + dist.destroy_process_group(cpu_group) + + +if __name__ == "__main__": 
+ main() diff --git a/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py b/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py index acabe6c1d..14330ae6f 100644 --- a/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py +++ b/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py @@ -7,6 +7,7 @@ from vllm import _custom_ops as ops from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config +from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_config from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp8 from vllm.model_executor.layers.fused_moe.fused_moe import ( fused_experts, @@ -80,6 +81,11 @@ def bench_run( a, score, topk, renormalize=False ) + ab_strides1 = torch.full((num_experts,), k, device="cuda", dtype=torch.int64) + ab_strides2 = torch.full((num_experts,), n, device="cuda", dtype=torch.int64) + c_strides1 = torch.full((num_experts,), 2 * n, device="cuda", dtype=torch.int64) + c_strides2 = torch.full((num_experts,), k, device="cuda", dtype=torch.int64) + def run_triton_moe( a: torch.Tensor, w1: torch.Tensor, @@ -91,6 +97,11 @@ def run_triton_moe( a_scale: torch.Tensor, num_repeats: int, ): + quant_config = fp8_w8a8_moe_quant_config( + w1_scale=w1_scale, + w2_scale=w2_scale, + a1_scale=a_scale, + ) for _ in range(num_repeats): fused_experts( a, @@ -98,10 +109,7 @@ def run_triton_moe( w2, topk_weights, topk_ids, - use_fp8_w8a8=True, - w1_scale=w1_scale, - w2_scale=w2_scale, - a1_scale=a_scale, + quant_config=quant_config, ) def run_cutlass_moe( @@ -111,10 +119,21 @@ def run_cutlass_moe( w2: torch.Tensor, w1_scale: torch.Tensor, w2_scale: torch.Tensor, + ab_strides1: torch.Tensor, + ab_strides2: torch.Tensor, + c_strides1: torch.Tensor, + c_strides2: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, + per_act_token: bool, num_repeats: int, ): + quant_config = fp8_w8a8_moe_quant_config( + w1_scale=w1_scale, + w2_scale=w2_scale, + per_act_token_quant=per_act_token, + ) + for _ in range(num_repeats): cutlass_moe_fp8( a, @@ -122,9 +141,11 @@ def run_cutlass_moe( w2, topk_weights, topk_ids, - w1_scale, - w2_scale, - a1_scale=a_scale, + ab_strides1, + ab_strides2, + c_strides1, + c_strides2, + quant_config=quant_config, ) def run_cutlass_from_graph( @@ -134,9 +155,19 @@ def run_cutlass_from_graph( w2_q: torch.Tensor, w1_scale: torch.Tensor, w2_scale: torch.Tensor, + ab_strides1: torch.Tensor, + ab_strides2: torch.Tensor, + c_strides1: torch.Tensor, + c_strides2: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, ): + quant_config = fp8_w8a8_moe_quant_config( + w1_scale=w1_scale, + w2_scale=w2_scale, + per_act_token_quant=per_act_token, + ) + with set_current_vllm_config( VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1)) ): @@ -146,9 +177,11 @@ def run_cutlass_from_graph( w2_q, topk_weights, topk_ids, - w1_scale, - w2_scale, - a1_scale=a_scale, + ab_strides1, + ab_strides2, + c_strides1, + c_strides2, + quant_config=quant_config, ) def run_triton_from_graph( @@ -161,6 +194,11 @@ def run_triton_from_graph( w2_scale: torch.Tensor, a_scale: torch.Tensor, ): + quant_config = fp8_w8a8_moe_quant_config( + w1_scale=w1_scale, + w2_scale=w2_scale, + a1_scale=a_scale, + ) with set_current_vllm_config( VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1)) ): @@ -170,10 +208,7 @@ def run_triton_from_graph( w2, topk_weights, topk_ids, - use_fp8_w8a8=True, - w1_scale=w1_scale, - w2_scale=w2_scale, - a1_scale=a_scale, + quant_config=quant_config, ) def replay_graph(graph, 
num_repeats): @@ -191,6 +226,10 @@ def replay_graph(graph, num_repeats): w2_q, w1_scale, w2_scale, + ab_strides1, + ab_strides2, + c_strides1, + c_strides2, topk_weights, topk_ids, ) @@ -227,6 +266,11 @@ def replay_graph(graph, num_repeats): "w2_q": w2_q, "w1_scale": w1_scale, "w2_scale": w2_scale, + "per_act_token": per_act_token, + "ab_strides1": ab_strides1, + "ab_strides2": ab_strides2, + "c_strides1": c_strides1, + "c_strides2": c_strides2, # cuda graph params "cutlass_graph": cutlass_graph, "triton_graph": triton_graph, @@ -285,14 +329,19 @@ def replay_graph(graph, num_repeats): w2_q, w1_scale, w2_scale, + ab_strides1, + ab_strides2, + c_strides1, + c_strides2, topk_weights, topk_ids, + per_act_token, num_warmup, ) results.append( benchmark.Timer( - stmt="run_cutlass_moe(a, a_scale, w1_q, w2_q, w1_scale, w2_scale, topk_weights, topk_ids, num_runs)", # noqa: E501 + stmt="run_cutlass_moe(a, a_scale, w1_q, w2_q, w1_scale, w2_scale, ab_strides1, ab_strides2, c_strides1, c_strides2, topk_weights, topk_ids, per_act_token, num_runs)", # noqa: E501 globals=globals, label=label, sub_label=sub_label, diff --git a/benchmarks/kernels/benchmark_lora.py b/benchmarks/kernels/benchmark_lora.py index 3d38d4b35..799b16999 100644 --- a/benchmarks/kernels/benchmark_lora.py +++ b/benchmarks/kernels/benchmark_lora.py @@ -79,9 +79,9 @@ def make_rand_lora_weight_tensor( def make_rand_tensors( - a_shape: tuple[int], - b_shape: tuple[int], - c_shape: tuple[int], + a_shape: tuple[int, ...], + b_shape: tuple[int, ...], + c_shape: tuple[int, ...], a_dtype: torch.dtype, b_dtype: torch.dtype, c_dtype: torch.dtype, @@ -243,7 +243,7 @@ def matmul_shapes( lora_rank: int, num_loras: int, num_slices: int, - ) -> tuple[tuple[int], tuple[int], tuple[int]]: + ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]: """ Given num_slices, return the shapes of the A, B, and C matrices in A x B = C, for the op_type @@ -464,7 +464,11 @@ def to_device(tensor: torch.Tensor): for field_name in LoRAKernelMeta.__dataclass_fields__: field = getattr(self.lora_kernel_meta, field_name) assert isinstance(field, torch.Tensor) - setattr(self.lora_kernel_meta, field_name, to_device(field)) + setattr( + self.lora_kernel_meta, + field_name, + to_device(field) if field_name != "no_lora_flag_cpu" else field, + ) def metadata(self) -> tuple[int, int, int]: """ @@ -512,6 +516,7 @@ def as_lora_shrink_kwargs(self) -> dict[str, Any]: "lora_token_start_loc": self.lora_kernel_meta.lora_token_start_loc, "lora_ids": self.lora_kernel_meta.active_lora_ids, "scaling": 1.0, + "no_lora_flag_cpu": self.lora_kernel_meta.no_lora_flag_cpu, } def as_lora_expand_kwargs(self, add_inputs: bool) -> dict[str, Any]: @@ -552,6 +557,7 @@ def as_lora_expand_kwargs(self, add_inputs: bool) -> dict[str, Any]: "lora_ids": self.lora_kernel_meta.active_lora_ids, "offset_start": 0, "add_inputs": add_inputs, + "no_lora_flag_cpu": self.lora_kernel_meta.no_lora_flag_cpu, } def bench_fn_kwargs( @@ -637,7 +643,7 @@ def bench_optype( # Clear LoRA optimization hash-maps. 
_LORA_A_PTR_DICT.clear() _LORA_B_PTR_DICT.clear() - # Run bench function so that _LORA_A_PTR_DICT and _LORA_B_PTR_DICT are setup + # Run bench function so that _LORA_A_PTR_DICT and _LORA_B_PTR_DICT are set up for kwargs in kwargs_list: op_type.bench_fn()(**kwargs) torch.cuda.synchronize() diff --git a/benchmarks/kernels/benchmark_machete.py b/benchmarks/kernels/benchmark_machete.py index 0f896f187..1b1c3b321 100644 --- a/benchmarks/kernels/benchmark_machete.py +++ b/benchmarks/kernels/benchmark_machete.py @@ -234,8 +234,11 @@ def marlin_create_bench_fn(bt: BenchmarkTensors) -> Callable: fn = lambda: ops.gptq_marlin_gemm( a=bt.a, + c=None, b_q_weight=w_q, + b_bias=None, b_scales=w_s, + global_scale=None, b_zeros=w_zp, g_idx=g_idx, perm=sort_indices, @@ -250,28 +253,7 @@ def marlin_create_bench_fn(bt: BenchmarkTensors) -> Callable: else: assert bt.a.dtype == torch.int8 assert bt.wtype == scalar_types.uint4b8 - - if bt.w_ch_s is not None: - s_ch = bt.w_ch_s.to(torch.float32) - else: - s_ch = torch.ones(bt.w_ref.shape[1], dtype=torch.float32, device=device) - - if bt.w_tok_s is not None: - s_tok = bt.w_tok_s.to(torch.float32) - else: - s_tok = torch.ones(bt.a.shape[0], dtype=torch.float32, device=device) - - fn = lambda: ops.marlin_qqq_gemm( - a=bt.a, - b_q_weight=w_q, - s_group=w_s, - s_tok=s_tok, - s_ch=s_ch, - workspace=workspace.scratch, - size_m=bt.a.shape[0], - size_n=bt.w_ref.shape[1], - size_k=bt.w_ref.shape[0], - ) + raise NotImplementedError("QQQ is not supported anymore") return fn @@ -302,6 +284,25 @@ def machete_create_bench_fn( ) +def cutlass_w4a8_create_bench_fn( + bt: BenchmarkTensors, out_type=torch.dtype, schedule=None +) -> Callable: + w_q = bt.w_q.t().contiguous().t() # make col major + w_q = ops.cutlass_encode_and_reorder_int4b(w_q) + # expects fp8 scales + w_s = ops.cutlass_pack_scale_fp8(bt.w_g_s.to(torch.float8_e4m3fn)) + + return lambda: ops.cutlass_w4a8_mm( + a=bt.a, + b_q=w_q, + b_group_scales=w_s, + b_group_size=bt.group_size, + b_channel_scales=bt.w_ch_s, + a_token_scales=bt.w_tok_s, + maybe_schedule=schedule, + ) + + # impl # bench @@ -403,6 +404,20 @@ def bench( ) ) + # cutlass w4a8 + if types.act_type == torch.float8_e4m3fn and group_size == 128: + timers.append( + bench_fns( + label, + sub_label, + f"cutlass w4a8 ({name_type_string})", + [ + cutlass_w4a8_create_bench_fn(bt, out_type=types.output_type) + for bt in benchmark_tensors + ], + ) + ) + if sweep_schedules: global _SWEEP_SCHEDULES_RESULTS diff --git a/benchmarks/kernels/benchmark_marlin.py b/benchmarks/kernels/benchmark_marlin.py index 9ea1fddae..34cc45e94 100644 --- a/benchmarks/kernels/benchmark_marlin.py +++ b/benchmarks/kernels/benchmark_marlin.py @@ -22,8 +22,16 @@ MARLIN_SUPPORTED_GROUP_SIZES, query_marlin_supported_quant_types, ) +from vllm.model_executor.layers.quantization.utils.marlin_utils_fp4 import ( + FP4_MARLIN_SUPPORTED_GROUP_SIZES, + rand_marlin_weight_fp4_like, +) +from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import ( + marlin_quant_fp8_torch, +) from vllm.model_executor.layers.quantization.utils.marlin_utils_test import ( MarlinWorkspace, + awq_marlin_quantize, marlin_quantize, ) from vllm.model_executor.layers.quantization.utils.marlin_utils_test_24 import ( @@ -35,7 +43,7 @@ quantize_weights, sort_weights, ) -from vllm.scalar_type import ScalarType +from vllm.scalar_type import ScalarType, scalar_types from vllm.utils import FlexibleArgumentParser DEFAULT_MODELS = ["meta-llama/Llama-2-7b-hf/TP1"] @@ -57,80 +65,144 @@ def bench_run( size_n: int, ): label = 
"Quant Matmul" - sub_label = "{}, act={} k_full={}, q={}, g={}, MKN=({}x{}x{})".format( model, act_order, is_k_full, str(quant_type), group_size, size_m, size_k, size_n ) - print(f"Testing: {sub_label}") a = torch.randn(size_m, size_k).to(torch.half).cuda() b = torch.rand(size_k, size_n).to(torch.half).cuda() + has_zp = quant_type in [scalar_types.uint4, scalar_types.uint8] + if act_order and (group_size == -1 or group_size == size_k or has_zp): + return + if size_k % group_size != 0: + return - a_tmp = torch.zeros(size_m, size_k).to(torch.half).cuda() + marlin_24_supported = ( + quant_type in GPTQ_MARLIN_24_SUPPORTED_QUANT_TYPES + and group_size in GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES + ) + repack_supported = ( + quant_type in GPTQ_MARLIN_24_SUPPORTED_QUANT_TYPES + and group_size in MARLIN_SUPPORTED_GROUP_SIZES + ) + allspark_supported = ( + quant_type in ALLSPARK_SUPPORTED_QUANT_TYPES + and group_size == -1 + and not act_order + and is_k_full + ) + + def gen_marlin_params(): + # Marlin quant + marlin_g_idx = marlin_sort_indices = marlin_zp = marlin_s2 = None + if quant_type == scalar_types.float4_e2m1f: + if group_size != 16 or act_order: + return + marlin_w_ref, marlin_q_w, marlin_s, marlin_s2 = rand_marlin_weight_fp4_like( + b.T, group_size + ) + elif quant_type == scalar_types.float8_e4m3fn: + if group_size not in [-1, 128] or act_order: + return + marlin_w_ref, marlin_q_w, marlin_s = marlin_quant_fp8_torch(b.T, group_size) + elif group_size == 16: + return + elif has_zp: + marlin_w_ref, marlin_q_w, marlin_s, marlin_zp = awq_marlin_quantize( + b, quant_type, group_size + ) + else: + marlin_w_ref, marlin_q_w, marlin_s, marlin_g_idx, marlin_sort_indices, _ = ( + marlin_quantize(b, quant_type, group_size, act_order) + ) + return ( + marlin_w_ref, + marlin_q_w, + marlin_s, + marlin_s2, + marlin_zp, + marlin_g_idx, + marlin_sort_indices, + ) + + def gen_marlin_24_params(): + marlin_24_w_ref = marlin_24_q_w_comp = marlin_24_meta = marlin_24_s = None + if marlin_24_supported: + (marlin_24_w_ref, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s) = ( + marlin_24_quantize(b, quant_type, group_size) + ) + return (marlin_24_w_ref, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s) + + def gen_repack_params(): + q_w_gptq = None + repack_sort_indices = None + if repack_supported: + (w_ref, q_w, s, g_idx, rand_perm) = gptq_quantize_weights( + b, quant_type, group_size, act_order + ) + q_w_gptq = gptq_pack(q_w, quant_type.size_bits, size_k, size_n) + + # For act_order, sort the "weights" and "g_idx" + # so that group ids are increasing + repack_sort_indices = torch.empty(0, dtype=torch.int, device=b.device) + if act_order: + (q_w, g_idx, repack_sort_indices) = sort_weights(q_w, g_idx) + return q_w_gptq, repack_sort_indices + + def gen_allspark_params(): + qw_reorder = s_reorder = zp_reorder = sm_count = sm_version = ( + CUBLAS_M_THRESHOLD + ) = None + nonlocal allspark_supported + if allspark_supported: + properties = torch.cuda.get_device_properties(b.device.index) + sm_count = properties.multi_processor_count + sm_version = properties.major * 10 + properties.minor + + supported_arch = sm_version >= 80 and sm_version < 90 + allspark_supported = allspark_supported and supported_arch + if supported_arch: + w_ref, qw, s, zp = quantize_weights(b, quant_type, group_size, has_zp) + qw = qw.to(torch.uint8) + + qw_reorder, s_reorder, zp_reorder = ops.allspark_repack_weight( + qw, s, zp, has_zp + ) + CUBLAS_M_THRESHOLD = ALLSPARK_AMPERE_M_CUBLAS_THRESHOLD + return ( + qw_reorder, + s_reorder, + zp_reorder, + 
sm_count, + sm_version, + CUBLAS_M_THRESHOLD, + ) - # Marlin quant ( marlin_w_ref, marlin_q_w, marlin_s, + marlin_s2, + marlin_zp, marlin_g_idx, marlin_sort_indices, - marlin_rand_perm, - ) = marlin_quantize(b, quant_type, group_size, act_order) - - # Marlin_24 quant - (marlin_24_w_ref, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s) = ( - marlin_24_quantize(b, quant_type, group_size) + ) = gen_marlin_params() + marlin_24_w_ref, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s = ( + gen_marlin_24_params() ) - - marlin_zp = torch.empty(0, dtype=torch.int, device=b.device) - - # GPTQ quant - (w_ref, q_w, s, g_idx, rand_perm) = gptq_quantize_weights( - b, quant_type, group_size, act_order + q_w_gptq, repack_sort_indices = gen_repack_params() + qw_reorder, s_reorder, zp_reorder, sm_count, sm_version, CUBLAS_M_THRESHOLD = ( + gen_allspark_params() ) - q_w_gptq = gptq_pack(q_w, quant_type.size_bits, size_k, size_n) - - # For act_order, sort the "weights" and "g_idx" - # so that group ids are increasing - repack_sort_indices = torch.empty(0, dtype=torch.int, device=b.device) - if act_order: - (q_w, g_idx, repack_sort_indices) = sort_weights(q_w, g_idx) # Prepare marlin_workspace = MarlinWorkspace( size_n, GPTQ_MARLIN_MIN_THREAD_N, GPTQ_MARLIN_MAX_PARALLEL ) - marlin_24_workspace = MarlinWorkspace( size_n, GPTQ_MARLIN_24_MIN_THREAD_N, GPTQ_MARLIN_24_MAX_PARALLEL ) - marlin_zp = torch.zeros_like(marlin_s, dtype=torch.int) - - # AllSpark W8A16 quant - as_supported_case = ( - quant_type in ALLSPARK_SUPPORTED_QUANT_TYPES - and group_size == -1 - and not act_order - and is_k_full - ) - if as_supported_case: - properties = torch.cuda.get_device_properties(b.device.index) - sm_count = properties.multi_processor_count - sm_version = properties.major * 10 + properties.minor - - supported_arch = sm_version >= 80 and sm_version < 90 - as_supported_case = as_supported_case and supported_arch - if supported_arch: - has_zp = False - w_ref, qw, s, zp = quantize_weights(b, quant_type, group_size, has_zp) - qw = qw.to(torch.uint8) - - qw_reorder, s_reorder, zp_reorder = ops.allspark_repack_weight( - qw, s, zp, has_zp - ) - CUBLAS_M_THRESHOLD = ALLSPARK_AMPERE_M_CUBLAS_THRESHOLD globals = { # Gen params @@ -140,15 +212,14 @@ def bench_run( "size_n": size_n, "size_k": size_k, "a": a, - "a_tmp": a_tmp, # Marlin params "marlin_w_ref": marlin_w_ref, "marlin_q_w": marlin_q_w, "marlin_s": marlin_s, + "marlin_s2": marlin_s2, "marlin_zp": marlin_zp, "marlin_g_idx": marlin_g_idx, "marlin_sort_indices": marlin_sort_indices, - "marlin_rand_perm": marlin_rand_perm, "marlin_workspace": marlin_workspace, "is_k_full": is_k_full, # Marlin_24 params @@ -161,12 +232,12 @@ def bench_run( "q_w_gptq": q_w_gptq, "repack_sort_indices": repack_sort_indices, # AllSpark W8A16 params - "qw_reorder": qw_reorder if as_supported_case else None, - "s_reorder": s_reorder if as_supported_case else None, - "zp_reorder": zp_reorder if as_supported_case else None, - "sm_count": sm_count if as_supported_case else None, - "sm_version": sm_version if as_supported_case else None, - "CUBLAS_M_THRESHOLD": CUBLAS_M_THRESHOLD if as_supported_case else None, + "qw_reorder": qw_reorder, + "s_reorder": s_reorder, + "zp_reorder": zp_reorder, + "sm_count": sm_count, + "sm_version": sm_version, + "CUBLAS_M_THRESHOLD": CUBLAS_M_THRESHOLD, # Kernels "gptq_marlin_gemm": ops.gptq_marlin_gemm, "gptq_marlin_24_gemm": ops.gptq_marlin_24_gemm, @@ -177,7 +248,7 @@ def bench_run( min_run_time = 1 # Warmup pytorch - for i in range(5): + for _ in range(5): torch.matmul(a, 
marlin_w_ref) results.append( @@ -192,17 +263,17 @@ def bench_run( results.append( benchmark.Timer( - stmt="output = gptq_marlin_gemm(a, marlin_q_w, marlin_s, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, False, False)", # noqa: E501 + stmt="output = gptq_marlin_gemm(a, None, marlin_q_w, marlin_s, marlin_s2, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, False, False)", # noqa: E501 globals=globals, label=label, sub_label=sub_label, - description="gptq_marlin_gemm_fp16", + description="gptq_marlin_gemm", ).blocked_autorange(min_run_time=min_run_time) ) results.append( benchmark.Timer( - stmt="output = gptq_marlin_gemm(a, marlin_q_w, marlin_s, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, True, False)", # noqa: E501 + stmt="output = gptq_marlin_gemm(a, None, marlin_q_w, marlin_s, marlin_s2, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, True, False)", # noqa: E501 globals=globals, label=label, sub_label=sub_label, @@ -210,10 +281,7 @@ def bench_run( ).blocked_autorange(min_run_time=min_run_time) ) - if ( - quant_type in GPTQ_MARLIN_24_SUPPORTED_QUANT_TYPES - and group_size in GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES - ): + if marlin_24_supported: results.append( benchmark.Timer( stmt="output = gptq_marlin_24_gemm(a, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s, marlin_24_workspace.scratch, quant_type, size_m, size_n, size_k)", # noqa: E501 @@ -224,17 +292,18 @@ def bench_run( ).blocked_autorange(min_run_time=min_run_time) ) - results.append( - benchmark.Timer( - stmt="q_res = gptq_marlin_repack(q_w_gptq, repack_sort_indices, size_k, size_n, quant_type.size_bits)", # noqa: E501 - globals=globals, - label=label, - sub_label=sub_label, - description="gptq_marlin_repack", - ).blocked_autorange(min_run_time=min_run_time) - ) + if repack_supported: + results.append( + benchmark.Timer( + stmt="q_res = gptq_marlin_repack(q_w_gptq, repack_sort_indices, size_k, size_n, quant_type.size_bits)", # noqa: E501 + globals=globals, + label=label, + sub_label=sub_label, + description="gptq_marlin_repack", + ).blocked_autorange(min_run_time=min_run_time) + ) - if as_supported_case: + if allspark_supported: results.append( benchmark.Timer( stmt="output = allspark_w8a16_gemm(a, qw_reorder, s_reorder, zp_reorder, size_n, group_size, sm_count, sm_version, CUBLAS_M_THRESHOLD, False, True)", # noqa: E501 @@ -250,7 +319,6 @@ def main(args): print("Benchmarking models:") for i, model in enumerate(args.models): print(f"[{i}] {model}") - results: list[benchmark.Measurement] = [] for model in args.models: @@ -278,14 +346,17 @@ def main(args): ): continue - for quant_type in query_marlin_supported_quant_types(False): + for quant_type in query_marlin_supported_quant_types(): if ( len(args.limit_num_bits) > 0 and quant_type.size_bits not in args.limit_num_bits ): continue - for group_size in MARLIN_SUPPORTED_GROUP_SIZES: + for group_size in ( + MARLIN_SUPPORTED_GROUP_SIZES + + FP4_MARLIN_SUPPORTED_GROUP_SIZES + ): if ( len(args.limit_group_size) > 0 and group_size not in args.limit_group_size diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py index cef53b183..02c2db674 100644 --- a/benchmarks/kernels/benchmark_moe.py +++ b/benchmarks/kernels/benchmark_moe.py @@ -3,6 +3,7 @@ 
import argparse import json +import os import time from contextlib import nullcontext from datetime import datetime @@ -13,6 +14,10 @@ import torch from ray.experimental.tqdm_ray import tqdm +from vllm.model_executor.layers.fused_moe.config import ( + FusedMoEQuantConfig, + _get_config_dtype_str, +) from vllm.model_executor.layers.fused_moe.fused_moe import * from vllm.platforms import current_platform from vllm.transformers_utils.config import get_config @@ -22,6 +27,13 @@ FP8_DTYPE = current_platform.fp8_dtype() +def ensure_divisibility(numerator, denominator, text): + """Ensure that numerator is divisible by the denominator.""" + assert numerator % denominator == 0, "{} {} is not divisible by tp {}.".format( + text, numerator, denominator + ) + + class BenchmarkConfig(TypedDict): BLOCK_SIZE_M: int BLOCK_SIZE_N: int @@ -86,6 +98,9 @@ def benchmark_config( (num_experts, 2 * shard_intermediate_size), dtype=torch.float32 ) w2_scale = torch.randn((hidden_size, num_experts), dtype=torch.float32) + if use_deep_gemm: + # we use the default block shape for deepgemm + block_quant_shape = [128, 128] if use_fp8_w8a8: if block_quant_shape: block_n, block_k = block_quant_shape[0], block_quant_shape[1] @@ -123,43 +138,36 @@ def prepare(i: int): def run(): from vllm.model_executor.layers.fused_moe import override_config + if use_fp8_w8a8: + quant_dtype = torch.float8_e4m3fn + elif use_int8_w8a16: + quant_dtype = torch.int8 + else: + quant_dtype = None + + quant_config = FusedMoEQuantConfig.make( + quant_dtype=quant_dtype, + w1_scale=w1_scale, + w2_scale=w2_scale, + a1_scale=a1_scale, + a2_scale=a2_scale, + block_shape=block_quant_shape, + ) + with override_config(config): - if use_deep_gemm: - topk_weights, topk_ids, token_expert_indices = fused_topk( - x, input_gating, topk, False - ) - return fused_experts( - x, - w1, - w2, - topk_weights, - topk_ids, - inplace=True, - use_fp8_w8a8=use_fp8_w8a8, - w1_scale=w1_scale, - w2_scale=w2_scale, - a1_scale=a1_scale, - a2_scale=a2_scale, - block_shape=block_quant_shape, - allow_deep_gemm=True, - ) - else: - fused_moe( - x, - w1, - w2, - input_gating, - topk, - renormalize=True, - inplace=True, - use_fp8_w8a8=use_fp8_w8a8, - use_int8_w8a16=use_int8_w8a16, - w1_scale=w1_scale, - w2_scale=w2_scale, - a1_scale=a1_scale, - a2_scale=a2_scale, - block_shape=block_quant_shape, - ) + topk_weights, topk_ids, token_expert_indices = fused_topk( + x, input_gating, topk, renormalize=not use_deep_gemm + ) + return fused_experts( + x, + w1, + w2, + topk_weights, + topk_ids, + inplace=True, + quant_config=quant_config, + allow_deep_gemm=use_deep_gemm, + ) # JIT compilation & warmup run() @@ -403,13 +411,15 @@ def benchmark( use_deep_gemm: bool = False, ) -> tuple[dict[str, int], float]: current_platform.seed_everything(self.seed) - dtype_str = get_config_dtype_str( + dtype_str = _get_config_dtype_str( dtype, use_int8_w8a16=use_int8_w8a16, use_fp8_w8a8=use_fp8_w8a8 ) # NOTE(woosuk): The current naming convention uses w2.shape[2], which # is the intermediate size after silu_and_mul. 
+ block_n = block_quant_shape[0] if block_quant_shape else None + block_k = block_quant_shape[1] if block_quant_shape else None op_config = get_moe_configs( - num_experts, shard_intermediate_size // 2, dtype_str + num_experts, shard_intermediate_size // 2, dtype_str, block_n, block_k ) if op_config is None: config = get_default_config( @@ -419,7 +429,7 @@ def benchmark( hidden_size, topk, dtype_str, - is_marlin=False, + block_quant_shape, ) else: config = op_config[min(op_config.keys(), key=lambda x: abs(x - num_tokens))] @@ -532,8 +542,9 @@ def save_configs( use_fp8_w8a8: bool, use_int8_w8a16: bool, block_quant_shape: list[int], + save_dir: str, ) -> None: - dtype_str = get_config_dtype_str( + dtype_str = _get_config_dtype_str( dtype, use_int8_w8a16=use_int8_w8a16, use_fp8_w8a8=use_fp8_w8a8 ) @@ -542,10 +553,11 @@ def save_configs( filename = get_config_file_name( num_experts, shard_intermediate_size // 2, dtype_str, block_quant_shape ) - + os.makedirs(save_dir, exist_ok=True) + filename = os.path.join(save_dir, filename) print(f"Writing best config to {filename}...") with open(filename, "w") as f: - json.dump(configs, f, indent=4) + json.dump({"triton_version": triton.__version__, **configs}, f, indent=4) f.write("\n") @@ -567,22 +579,31 @@ def main(args: argparse.Namespace): E = config.ffn_config.moe_num_experts topk = config.ffn_config.moe_top_k intermediate_size = config.ffn_config.ffn_hidden_size - shard_intermediate_size = 2 * intermediate_size // args.tp_size elif config.architectures[0] == "JambaForCausalLM": E = config.num_experts topk = config.num_experts_per_tok intermediate_size = config.intermediate_size - shard_intermediate_size = 2 * intermediate_size // args.tp_size - elif config.architectures[0] in ("DeepseekV3ForCausalLM", "DeepseekV2ForCausalLM"): + elif config.architectures[0] in ( + "DeepseekV2ForCausalLM", + "DeepseekV3ForCausalLM", + "DeepseekV32ForCausalLM", + "Glm4MoeForCausalLM", + ): E = config.n_routed_experts topk = config.num_experts_per_tok intermediate_size = config.moe_intermediate_size - shard_intermediate_size = 2 * intermediate_size // args.tp_size - elif config.architectures[0] in ("Qwen2MoeForCausalLM", "Qwen3MoeForCausalLM"): + elif config.architectures[0] in ( + "Qwen2MoeForCausalLM", + "Qwen3MoeForCausalLM", + "Qwen3NextForCausalLM", + ): E = config.num_experts topk = config.num_experts_per_tok intermediate_size = config.moe_intermediate_size - shard_intermediate_size = 2 * intermediate_size // args.tp_size + elif config.architectures[0] in ("HunYuanMoEV1ForCausalLM"): + E = config.num_experts + topk = config.moe_topk[0] + intermediate_size = config.moe_intermediate_size[0] else: # Support for llama4 config = config.get_text_config() @@ -590,8 +611,14 @@ def main(args: argparse.Namespace): E = config.num_local_experts topk = config.num_experts_per_tok intermediate_size = config.intermediate_size + enable_ep = bool(args.enable_expert_parallel) + if enable_ep: + ensure_divisibility(E, args.tp_size, "Number of experts") + E = E // args.tp_size + shard_intermediate_size = 2 * intermediate_size + else: + ensure_divisibility(intermediate_size, args.tp_size, "intermediate_size") shard_intermediate_size = 2 * intermediate_size // args.tp_size - hidden_size = config.hidden_size dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype use_fp8_w8a8 = args.dtype == "fp8_w8a8" @@ -620,7 +647,7 @@ def main(args: argparse.Namespace): 4096, ] else: - batch_sizes = [args.batch_size] + batch_sizes = args.batch_size use_deep_gemm = 
bool(args.use_deep_gemm) @@ -653,7 +680,11 @@ def _distribute(method: str, inputs: list[Any]) -> list[Any]: is_fp16 = not (use_fp8_w8a8 or use_int8_w8a16) search_space = get_configs_compute_bound(is_fp16, block_quant_shape) print(f"Start tuning over {len(search_space)} configurations...") - + if use_deep_gemm: + raise ValueError( + "Tuning with --use-deep-gemm is not supported as it only tunes Triton " + "kernels. Please remove the flag." + ) start = time.time() configs = _distribute( "tune", @@ -687,6 +718,7 @@ def _distribute(method: str, inputs: list[Any]) -> list[Any]: use_fp8_w8a8, use_int8_w8a16, block_quant_shape, + args.save_dir, ) end = time.time() print(f"Tuning took {end - start:.2f} seconds") @@ -723,12 +755,16 @@ def _distribute(method: str, inputs: list[Any]) -> list[Any]: parser.add_argument( "--tp-size", "-tp", "--tensor-parallel-size", type=int, default=2 ) + parser.add_argument("--enable-expert-parallel", "-enable-ep", action="store_true") parser.add_argument( "--dtype", type=str, choices=["auto", "fp8_w8a8", "int8_w8a16"], default="auto" ) parser.add_argument("--use-deep-gemm", action="store_true") + parser.add_argument( + "--save-dir", type=str, default="./", help="Directory to save tuned results" + ) parser.add_argument("--seed", type=int, default=0) - parser.add_argument("--batch-size", type=int, required=False) + parser.add_argument("--batch-size", type=int, nargs="+", required=False) parser.add_argument("--tune", action="store_true") parser.add_argument("--trust-remote-code", action="store_true") parser.add_argument("--model-prefix", type=str, required=False) diff --git a/benchmarks/kernels/benchmark_moe_align_block_size.py b/benchmarks/kernels/benchmark_moe_align_block_size.py new file mode 100644 index 000000000..f540cff62 --- /dev/null +++ b/benchmarks/kernels/benchmark_moe_align_block_size.py @@ -0,0 +1,74 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import argparse +import itertools + +import torch + +from vllm.model_executor.layers.fused_moe.moe_align_block_size import ( + moe_align_block_size, +) +from vllm.triton_utils import triton + + +def get_topk_ids(num_tokens: int, num_experts: int, topk: int) -> torch.Tensor: + return torch.stack( + [ + torch.randperm(num_experts, dtype=torch.int32, device="cuda")[:topk] + for _ in range(num_tokens) + ] + ) + + +# test configurations +num_tokens_range = [1, 16, 256, 4096] +num_experts_range = [16, 64, 224, 256, 280, 512] +topk_range = [1, 2, 8] +configs = list(itertools.product(num_tokens_range, num_experts_range, topk_range)) + + +@triton.testing.perf_report( + triton.testing.Benchmark( + x_names=["num_tokens", "num_experts", "topk"], + x_vals=configs, + line_arg="provider", + line_vals=["vllm"], + line_names=["vLLM"], + plot_name="moe-align-block-size-performance", + args={}, + ) +) +def benchmark(num_tokens, num_experts, topk, provider): + """Benchmark function for Triton.""" + block_size = 256 + topk_ids = get_topk_ids(num_tokens, num_experts, topk) + + quantiles = [0.5, 0.2, 0.8] + + if provider == "vllm": + ms, min_ms, max_ms = triton.testing.do_bench( + lambda: moe_align_block_size(topk_ids, block_size, num_experts), + quantiles=quantiles, + ) + + return 1000 * ms, 1000 * max_ms, 1000 * min_ms + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--num_experts", + type=int, + default=64, + choices=[8, 16, 32, 64, 128, 256], + ) + parser.add_argument( + "--topk", + type=int, + default=8, + choices=[2, 4, 
8], + help="Top-k value for correctness check.", + ) + args = parser.parse_args() + + benchmark.run(print_data=True, show_plots=True) diff --git a/benchmarks/kernels/benchmark_moe_permute_unpermute.py b/benchmarks/kernels/benchmark_moe_permute_unpermute.py index dba1f3943..04d2205aa 100644 --- a/benchmarks/kernels/benchmark_moe_permute_unpermute.py +++ b/benchmarks/kernels/benchmark_moe_permute_unpermute.py @@ -8,12 +8,13 @@ import torch from transformers import AutoConfig -from vllm.model_executor.layers.fused_moe.deep_gemm_moe import ( +from vllm.model_executor.layers.fused_moe.fused_moe import * +from vllm.model_executor.layers.fused_moe.moe_permute_unpermute import ( _moe_permute, _moe_unpermute_and_reduce, + moe_permute, + moe_unpermute, ) -from vllm.model_executor.layers.fused_moe.fused_moe import * -from vllm.model_executor.layers.fused_moe.moe_permute_unpermute import * from vllm.model_executor.layers.fused_moe.utils import _fp8_quantize from vllm.platforms import current_platform from vllm.utils import FlexibleArgumentParser @@ -63,18 +64,19 @@ def prepare(i: int): def run(): if use_customized_permute: - (permuted_hidden_states, first_token_off, inv_perm_idx, m_indices) = ( - moe_permute( - qhidden_states, - topk_weights=topk_weights, - topk_ids=topk_ids, - token_expert_indices=token_expert_indices, - topk=topk, - n_expert=num_experts, - n_local_expert=num_experts, - expert_map=None, - align_block_size=align_block_size, - ) + ( + permuted_hidden_states, + a1q_scale, + first_token_off, + inv_perm_idx, + m_indices, + ) = moe_permute( + qhidden_states, + a1q_scale=None, + topk_ids=topk_ids, + n_expert=num_experts, + expert_map=None, + align_block_size=align_block_size, ) else: ( @@ -150,18 +152,19 @@ def benchmark_unpermute( def prepare(): if use_customized_permute: - (permuted_hidden_states, first_token_off, inv_perm_idx, m_indices) = ( - moe_permute( - qhidden_states, - topk_weights=topk_weights, - topk_ids=topk_ids, - token_expert_indices=token_expert_indices, - topk=topk, - n_expert=num_experts, - n_local_expert=num_experts, - expert_map=None, - align_block_size=align_block_size, - ) + ( + permuted_hidden_states, + a1q_scale, + first_token_off, + inv_perm_idx, + m_indices, + ) = moe_permute( + qhidden_states, + a1q_scale=None, + topk_ids=topk_ids, + n_expert=num_experts, + expert_map=None, + align_block_size=align_block_size, ) # convert to fp16/bf16 as gemm output return ( @@ -191,16 +194,19 @@ def prepare(): def run(input: tuple): if use_customized_permute: - (permuted_hidden_states, first_token_off, inv_perm_idx, m_indices) = input + ( + permuted_hidden_states, + first_token_off, + inv_perm_idx, + m_indices, + ) = input + output = torch.empty_like(hidden_states) moe_unpermute( + output, permuted_hidden_states, topk_weights, - topk_ids, inv_perm_idx, first_token_off, - topk, - num_experts, - num_experts, ) else: ( @@ -211,7 +217,11 @@ def run(input: tuple): inv_perm, ) = input _moe_unpermute_and_reduce( - output_hidden_states, permuted_hidden_states, inv_perm, topk_weights + output_hidden_states, + permuted_hidden_states, + inv_perm, + topk_weights, + True, ) # JIT compilation & warmup @@ -318,6 +328,7 @@ def main(args: argparse.Namespace): elif ( config.architectures[0] == "DeepseekV3ForCausalLM" or config.architectures[0] == "DeepseekV2ForCausalLM" + or config.architectures[0] == "Glm4MoeForCausalLM" ): E = config.n_routed_experts topk = config.num_experts_per_tok diff --git a/benchmarks/kernels/benchmark_mrope.py b/benchmarks/kernels/benchmark_mrope.py new file mode 100644 
index 000000000..b91473617 --- /dev/null +++ b/benchmarks/kernels/benchmark_mrope.py @@ -0,0 +1,328 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +# This script benchmarks the mrope kernel (mainly for Qwen2VL and Qwen2.5VL models). +# It generates test data, runs benchmarks, and saves results to a CSV file. +# +# The CSV file (named with current date/time) contains these columns: +# model_name, tp_size, num_tokens, num_heads, num_kv_heads, head_dim, max_position, +# rope_theta, is_neox_style, rope_scaling, dtype, torch_mean, torch_median, torch_p99, +# torch_min, torch_max, triton_mean, triton_median, triton_p99, triton_min, triton_max, +# speedup +# +# == Usage Examples == +# +# Single model benchmark: +# python3 benchmark_mrope.py --model-name Qwen/Qwen2-VL-7B-Instruct --tp-size 1 \ +# --warmup-iter 10 --benchmark-iter 100 --dtype bfloat16 --seed 0 --num-tokens 1024 +# +# All models benchmark: +# python3 benchmark_mrope.py --model-name "" --tp-size 1 --warmup-iter 10 \ +# --benchmark-iter 100 --dtype bfloat16 --seed 0 --num-tokens 1024 +# +# All models with different TP sizes: +# python3 benchmark_mrope.py --model-name "" --tp-size 1 2 4 8 --warmup-iter 10 \ +# --benchmark-iter 100 --dtype bfloat16 --seed 0 --num-tokens 1024 +# +# All models with different token counts: +# python3 benchmark_mrope.py --model-name "" --tp-size 1 --warmup-iter 10 \ +# --benchmark-iter 100 --dtype bfloat16 --seed 0 --num-tokens 1024 4096 16384 +import csv +import os +import time +from datetime import datetime +from typing import Any + +import numpy as np +import torch + +from vllm.model_executor.layers.rotary_embedding import get_rope +from vllm.platforms import current_platform +from vllm.transformers_utils.config import get_config +from vllm.utils import FlexibleArgumentParser + +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + +def generate_test_data( + num_tokens: int, + num_q_heads: int, + num_kv_heads: int, + head_size: int, + max_position_embeddings: int, + dtype: torch.dtype, + device: torch.device, +): + """Generate test data for given configuration.""" + # Create 2D positions (3, num_tokens) for multimodal case + positions = torch.randint( + 0, max_position_embeddings // 4, (3, num_tokens), device=device + ) + + # Create query and key tensors + query = torch.randn(num_tokens, num_q_heads * head_size, dtype=dtype, device=device) + key = torch.randn(num_tokens, num_kv_heads * head_size, dtype=dtype, device=device) + + return positions, query, key + + +def calculate_stats(times: list[float]) -> dict[str, float]: + """Calculate statistics from a list of times.""" + times_array = np.array(times) + return { + "mean": np.mean(times_array), + "median": np.median(times_array), + "p99": np.percentile(times_array, 99), + "min": np.min(times_array), + "max": np.max(times_array), + } + + +def benchmark_mrope( + model_name: str, + num_tokens: int, + head_dim: int, + tp_size: int, + num_heads: int, + num_kv_heads: int, + max_position: int = 8192, + rope_theta: float = 10000, + is_neox_style: bool = True, + rope_scaling: dict[str, Any] = None, + dtype: torch.dtype = torch.bfloat16, + seed: int = 0, + warmup_iter: int = 10, + benchmark_iter: int = 100, + csv_writer=None, +): + current_platform.seed_everything(seed) + torch.set_default_device(device) + # the parameters to compute the q k v size based on tp_size + mrope_helper_class = get_rope( + head_size=head_dim, + rotary_dim=head_dim, + max_position=max_position, + 
base=rope_theta, + is_neox_style=is_neox_style, + rope_scaling=rope_scaling, + dtype=dtype, + ).to(device=device) + + print(80 * "=") + print( + f"Evaluating model: {model_name} " + f"with tp_size: {tp_size} " + f"and num_tokens: {num_tokens}, " + f"dtype: {dtype}" + ) + + # create q k v input tensors + # create rotary pos emb input tensors + positions, query, key = generate_test_data( + num_tokens, num_heads, num_kv_heads, head_dim, max_position, dtype, device + ) + + # Warm up + for _ in range(warmup_iter): + mrope_helper_class.forward_native( + positions, + query.clone(), + key.clone(), + ) + + mrope_helper_class.forward_cuda( + positions, + query.clone(), + key.clone(), + ) + + torch.cuda.synchronize() + + # Time reference implementation + torch_times = [] + for _ in range(benchmark_iter): + query_clone = query.clone() + key_clone = key.clone() + torch.cuda.synchronize() + start_time = time.time() + + mrope_helper_class.forward_native( + positions, + query_clone, + key_clone, + ) + + torch.cuda.synchronize() + torch_times.append(time.time() - start_time) + + # Time triton kernel implementation + triton_times = [] + for _ in range(benchmark_iter): + query_clone = query.clone() + key_clone = key.clone() + torch.cuda.synchronize() + start_time = time.time() + mrope_helper_class.forward_cuda( + positions, + query_clone, + key_clone, + ) + torch.cuda.synchronize() + triton_times.append(time.time() - start_time) + + # Calculate statistics + torch_stats = calculate_stats(torch_times) + triton_stats = calculate_stats(triton_times) + print(f"\nPerformance for config ({num_tokens}, {num_heads}, {num_kv_heads}):") + + print( + f"Torch implementation: " + f"mean={torch_stats['mean']:.8f}s, " + f"median={torch_stats['median']:.8f}s, " + f"p99={torch_stats['p99']:.8f}s" + ) + + print( + f"Triton implementation: " + f"mean={triton_stats['mean']:.8f}s, " + f"median={triton_stats['median']:.8f}s, " + f"p99={triton_stats['p99']:.8f}s" + ) + + print( + f"Triton Speedup over Torch: {torch_stats['mean'] / triton_stats['mean']:.8f}x" + ) + + # Write to CSV + if csv_writer: + row = [ + model_name, + tp_size, + num_tokens, + num_heads, + num_kv_heads, + head_dim, + max_position, + rope_theta, + is_neox_style, + str(rope_scaling), + str(dtype).split(".")[-1], + torch_stats["mean"], + torch_stats["median"], + torch_stats["p99"], + torch_stats["min"], + torch_stats["max"], + triton_stats["mean"], + triton_stats["median"], + triton_stats["p99"], + triton_stats["min"], + triton_stats["max"], + torch_stats["mean"] / triton_stats["mean"], # speedup + ] + csv_writer.writerow(row) + + return torch_stats, triton_stats + + +if __name__ == "__main__": + parser = FlexibleArgumentParser( + description="Benchmark the rotary embedding kernels." 
+ ) + parser.add_argument("--model-name", type=str, default="") + parser.add_argument("--tp-size", type=int, default=1) + parser.add_argument("--warmup-iter", type=int, default=10) + parser.add_argument("--benchmark-iter", type=int, default=100) + parser.add_argument("--dtype", type=str, choices=["bfloat16"], default="bfloat16") + parser.add_argument("--seed", type=int, default=0) + parser.add_argument("--num-tokens", type=int, nargs="+", required=False) + parser.add_argument("--trust-remote-code", action="store_true") + parser.add_argument("--output-csv", type=str, default="mrope_benchmark_results.csv") + args = parser.parse_args() + print(args) + + # Create CSV file for results + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + csv_filename = f"{os.path.splitext(args.output_csv)[0]}_{timestamp}.csv" + + with open(csv_filename, "w", newline="") as csvfile: + csv_writer = csv.writer(csvfile) + # Write header + header = [ + "model_name", + "tp_size", + "num_tokens", + "num_heads", + "num_kv_heads", + "head_dim", + "max_position", + "rope_theta", + "is_neox_style", + "rope_scaling", + "dtype", + "torch_mean", + "torch_median", + "torch_p99", + "torch_min", + "torch_max", + "triton_mean", + "triton_median", + "triton_p99", + "triton_min", + "triton_max", + "speedup", + ] + csv_writer.writerow(header) + + model_tp_dict = {} + if args.model_name == "": + model_tp_dict = { + "Qwen/Qwen2-VL-2B-Instruct": [1], + "Qwen/Qwen2-VL-7B-Instruct": [1], + "Qwen/Qwen2-VL-72B-Instruct": [2, 4, 8], + "Qwen/Qwen2.5-VL-3B-Instruct": [1, 2, 4, 8], + "Qwen/Qwen2.5-VL-7B-Instruct": [1, 2, 4, 8], + "Qwen/Qwen2.5-VL-72B-Instruct": [2, 4, 8], + } + else: + model_tp_dict[args.model_name] = [args.tp_size] + + if args.num_tokens is None: + num_tokens_list = [2**i for i in range(0, 18)] + else: + num_tokens_list = args.num_tokens + + for model_name, tp_list in model_tp_dict.items(): + config = get_config(model_name, trust_remote_code=args.trust_remote_code) + for tp_size in tp_list: + # get the model config + total_num_kv_heads = config.num_key_value_heads + total_num_heads = config.num_attention_heads + num_heads = total_num_heads // tp_size + num_kv_heads = max(1, total_num_kv_heads // tp_size) + head_dim = config.hidden_size // total_num_heads + q_size = num_heads * head_dim + kv_size = num_kv_heads * head_dim + is_neox_style = True + rope_theta = config.rope_theta + max_position = config.max_position_embeddings + + for num_tokens in num_tokens_list: + benchmark_mrope( + model_name=model_name, + num_tokens=num_tokens, + head_dim=head_dim, + tp_size=tp_size, + num_heads=num_heads, + num_kv_heads=num_kv_heads, + max_position=max_position, + rope_theta=rope_theta, + is_neox_style=is_neox_style, + rope_scaling=config.rope_scaling, + dtype=getattr(torch, args.dtype), + seed=args.seed, + warmup_iter=args.warmup_iter, + benchmark_iter=args.benchmark_iter, + csv_writer=csv_writer, + ) + + print(f"Benchmark results saved to {csv_filename}") diff --git a/benchmarks/kernels/benchmark_per_token_group_quant.py b/benchmarks/kernels/benchmark_per_token_group_quant.py new file mode 100644 index 000000000..1ccb5e08b --- /dev/null +++ b/benchmarks/kernels/benchmark_per_token_group_quant.py @@ -0,0 +1,159 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import argparse +import math +from contextlib import contextmanager +from typing import Callable +from unittest.mock import patch + +import torch + +from vllm.model_executor.layers.quantization.utils import 
fp8_utils, int8_utils +from vllm.platforms import current_platform + + +@contextmanager +def _triton_mode(): + """Temporarily force the Triton fallback path""" + with patch("vllm.platforms.current_platform.is_cuda", return_value=False): + yield + + +def _time_cuda( + fn: Callable[[], tuple[torch.Tensor, torch.Tensor]], + warmup_iters: int, + bench_iters: int, +) -> float: + # warmup + for _ in range(warmup_iters): + fn() + torch.cuda.synchronize() + + start = torch.cuda.Event(enable_timing=True) + end = torch.cuda.Event(enable_timing=True) + + start.record() + for _ in range(bench_iters): + fn() + end.record() + torch.cuda.synchronize() + + return start.elapsed_time(end) / bench_iters # ms/iter + + +def _run_single( + shape: tuple[int, int], + group_size: int, + dtype: str, + *, + column_major: bool = False, + scale_ue8m0: bool = False, + warmup_iters: int, + bench_iters: int, +) -> None: + num_tokens, hidden_dim = shape + + device = torch.device("cuda") + torch.manual_seed(42) + x = torch.randn(num_tokens, hidden_dim, device=device, dtype=torch.bfloat16) * 8 + + if dtype == "fp8": + + def cuda_impl(): + return fp8_utils.per_token_group_quant_fp8( + x, + group_size, + column_major_scales=column_major, + use_ue8m0=scale_ue8m0, + ) + + def triton_impl(): + with _triton_mode(): + return fp8_utils.per_token_group_quant_fp8( + x, + group_size, + column_major_scales=column_major, + use_ue8m0=scale_ue8m0, + ) + elif dtype == "int8": + + def cuda_impl(): + return int8_utils.per_token_group_quant_int8(x, group_size) + + def triton_impl(): + with _triton_mode(): + return int8_utils.per_token_group_quant_int8(x, group_size) + else: + raise ValueError("dtype must be 'fp8' or 'int8'") + + cuda_ms = _time_cuda(cuda_impl, warmup_iters, bench_iters) + triton_ms = _time_cuda(triton_impl, warmup_iters, bench_iters) + + speedup = triton_ms / cuda_ms if cuda_ms else math.inf + + cfg_desc = ( + f"shape={shape} gs={group_size:<3} col_major={column_major:<5} " + f"ue8m0={scale_ue8m0:<5} dtype={dtype}" + ) + print( + f"{cfg_desc:55} | CUDA {cuda_ms:7.3f} ms | Triton {triton_ms:7.3f} ms | " + f"speed-up ×{speedup:5.2f}" + ) + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--warmup-iters", type=int, default=10) + parser.add_argument("--bench-iters", type=int, default=100) + parser.add_argument("--dtype", choices=["fp8", "int8", "both"], default="both") + return parser.parse_args() + + +if __name__ == "__main__": + if not current_platform.is_cuda(): + raise RuntimeError("CUDA device is required to run this benchmark.") + + args = parse_args() + warmup_iters, bench_iters = args.warmup_iters, args.bench_iters + + shapes = [(32, 128), (64, 256), (16, 512)] + group_sizes = [64, 128] + + dtypes = ["fp8", "int8"] if args.dtype == "both" else [args.dtype] + + header = ( + "Configuration".ljust(55) + + " | " + + "CUDA (ms)".center(12) + + " | " + + "Triton (ms)".center(13) + + " | " + + "Speed-up" + ) + print(header) + print("-" * len(header)) + + for dtype in dtypes: + for shape in shapes: + for gs in group_sizes: + if dtype == "fp8": + for col_major in (False, True): + for ue8m0 in (False, True): + _run_single( + shape, + gs, + dtype, + column_major=col_major, + scale_ue8m0=ue8m0, + warmup_iters=warmup_iters, + bench_iters=bench_iters, + ) + else: # INT8 has no col-major / ue8m0 switches + _run_single( + shape, + gs, + dtype, + warmup_iters=warmup_iters, + bench_iters=bench_iters, + ) diff --git a/benchmarks/kernels/benchmark_polynorm.py b/benchmarks/kernels/benchmark_polynorm.py new file 
mode 100644 index 000000000..9ac8f5e65 --- /dev/null +++ b/benchmarks/kernels/benchmark_polynorm.py @@ -0,0 +1,155 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import itertools + +import torch + +from vllm import _custom_ops as vllm_ops +from vllm.triton_utils import triton + + +def polynorm_naive( + x: torch.Tensor, + weight: torch.Tensor, + bias: torch.Tensor, + eps: float = 1e-6, +): + orig_shape = x.shape + x = x.view(-1, x.shape[-1]) + + def norm(x, eps: float): + return x / torch.sqrt(x.pow(2).mean(-1, keepdim=True) + eps) + + x = x.float() + return ( + ( + weight[0] * norm(x**3, eps) + + weight[1] * norm(x**2, eps) + + weight[2] * norm(x, eps) + + bias + ) + .to(weight.dtype) + .view(orig_shape) + ) + + +def polynorm_vllm( + x: torch.Tensor, + weight: torch.Tensor, + bias: torch.Tensor, + eps: float = 1e-6, +): + orig_shape = x.shape + x = x.view(-1, x.shape[-1]) + + out = torch.empty_like(x) + vllm_ops.poly_norm(out, x, weight, bias, eps) + output = out + + output = output.view(orig_shape) + return output + + +def calculate_diff(batch_size, seq_len, hidden_dim): + dtype = torch.bfloat16 + x = torch.randn(batch_size, seq_len, hidden_dim, dtype=dtype, device="cuda") + weight = torch.ones(3, dtype=dtype, device="cuda") + bias = torch.ones(1, dtype=dtype, device="cuda") + + output_naive = polynorm_naive(x, weight, bias) + output_vllm = polynorm_vllm(x, weight, bias) + + if torch.allclose(output_naive, output_vllm, atol=1e-2, rtol=1e-2): + print("✅ All implementations match") + else: + print("❌ Implementations differ") + + +batch_size_range = [2**i for i in range(0, 7, 2)] +seq_length_range = [2**i for i in range(6, 11, 1)] +dim_range = [2048, 4096] +configs = list(itertools.product(dim_range, batch_size_range, seq_length_range)) + + +def get_benchmark(): + @triton.testing.perf_report( + triton.testing.Benchmark( + x_names=["dim", "batch_size", "seq_len"], + x_vals=[list(_) for _ in configs], + line_arg="provider", + line_vals=["naive", "vllm"], + line_names=["Naive", "vLLM"], + styles=[("blue", "-"), ("red", "-")], + ylabel="us", + plot_name="polynorm-perf", + args={}, + ) + ) + def benchmark(dim, batch_size, seq_len, provider): + dtype = torch.bfloat16 + hidden_dim = dim * 4 + + x = torch.randn(batch_size, seq_len, hidden_dim, dtype=dtype, device="cuda") + weight = torch.ones(3, dtype=dtype, device="cuda") + bias = torch.ones(1, dtype=dtype, device="cuda") + + quantiles = [0.5, 0.2, 0.8] + + if provider == "naive": + ms, min_ms, max_ms = triton.testing.do_bench( + lambda: polynorm_naive(x, weight, bias), + quantiles=quantiles, + ) + else: + ms, min_ms, max_ms = triton.testing.do_bench( + lambda: polynorm_vllm(x, weight, bias), + quantiles=quantiles, + ) + + return 1000 * ms, 1000 * max_ms, 1000 * min_ms + + return benchmark + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser() + parser.add_argument( + "--batch-size", + type=int, + default=4, + help="Batch size", + ) + parser.add_argument( + "--seq-len", + type=int, + default=128, + help="Sequence length", + ) + parser.add_argument( + "--hidden-dim", + type=int, + default=8192, + help="Intermediate size of MLP", + ) + parser.add_argument( + "--save-path", + type=str, + default="./configs/polnorm/", + help="Path to save polnorm benchmark results", + ) + + args = parser.parse_args() + + # Run correctness test + calculate_diff( + batch_size=args.batch_size, + seq_len=args.seq_len, + hidden_dim=args.hidden_dim, + ) + + benchmark = 
get_benchmark() + # Run performance benchmark + benchmark.run(print_data=True, save_path=args.save_path) diff --git a/benchmarks/kernels/benchmark_reshape_and_cache_flash.py b/benchmarks/kernels/benchmark_reshape_and_cache_flash.py new file mode 100644 index 000000000..0aace5710 --- /dev/null +++ b/benchmarks/kernels/benchmark_reshape_and_cache_flash.py @@ -0,0 +1,212 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from __future__ import annotations + +import random +import time + +import torch +from tabulate import tabulate + +from vllm import _custom_ops as ops +from vllm.attention.ops.triton_reshape_and_cache_flash import ( + triton_reshape_and_cache_flash, +) +from vllm.logger import init_logger +from vllm.platforms import current_platform +from vllm.utils import ( + STR_DTYPE_TO_TORCH_DTYPE, + FlexibleArgumentParser, + create_kv_caches_with_random_flash, +) + +logger = init_logger(__name__) + + +@torch.inference_mode() +def run_benchmark( + num_tokens: int, + num_heads: int, + head_size: int, + block_size: int, + num_blocks: int, + dtype: torch.dtype, + kv_cache_dtype: str, + kv_cache_layout: str, + num_iters: int, + implementation: str, + benchmark_mode: str, + device: str = "cuda", +) -> float: + """Return latency (seconds) for given num_tokens.""" + + if kv_cache_dtype == "fp8" and head_size % 16: + raise ValueError("fp8 kv-cache requires head_size to be a multiple of 16.") + + if implementation not in ("cuda", "triton"): + raise ValueError( + f"Unsupported implementation: {implementation}. " + "Only 'cuda' and 'triton' are supported." + ) + if implementation == "triton" and kv_cache_layout == "HND": + return float("nan") # Triton does not support HND layout yet. + + current_platform.seed_everything(42) + torch.set_default_device(device) + + # create random key / value tensors [T, H, D]. + key = torch.randn(num_tokens, num_heads, head_size, dtype=dtype, device=device) + value = torch.randn_like(key) + + # prepare the slot mapping. + # each token is assigned a unique slot in the KV-cache. + num_slots = block_size * num_blocks + if num_tokens > num_slots: + raise ValueError("num_tokens cannot exceed the total number of cache slots") + slot_mapping_lst = random.sample(range(num_slots), num_tokens) + slot_mapping = torch.tensor(slot_mapping_lst, dtype=torch.long, device=device) + + key_caches, value_caches = create_kv_caches_with_random_flash( + num_blocks, + block_size, + 1, # num_layers + num_heads, + head_size, + kv_cache_dtype, + dtype, + device=device, + cache_layout=kv_cache_layout, + ) + key_cache, value_cache = key_caches[0], value_caches[0] + # to free unused memory + del key_caches, value_caches + + # compute per-kernel scaling factors for fp8 conversion (if used). 
+ k_scale = (key.amax() / 64.0).to(torch.float32) + v_scale = (value.amax() / 64.0).to(torch.float32) + + if implementation == "cuda": + function_under_test = lambda: ops.reshape_and_cache_flash( + key, # noqa: F821 + value, # noqa: F821 + key_cache, # noqa: F821 + value_cache, # noqa: F821 + slot_mapping, # noqa: F821 + kv_cache_dtype, + k_scale, + v_scale, + ) + else: + function_under_test = lambda: triton_reshape_and_cache_flash( + key, # noqa: F821 + value, # noqa: F821 + key_cache, # noqa: F821 + value_cache, # noqa: F821 + slot_mapping, # noqa: F821 + kv_cache_dtype, + k_scale, + v_scale, + ) + if benchmark_mode == "cudagraph": + g = torch.cuda.CUDAGraph() + with torch.cuda.graph(g): + function_under_test() + torch.cuda.synchronize() + function_under_test = lambda: g.replay() + + def run_cuda_benchmark(n_iters: int) -> float: + nonlocal key, value, key_cache, value_cache, slot_mapping + torch.cuda.synchronize() + start = time.perf_counter() + for _ in range(n_iters): + function_under_test() + torch.cuda.synchronize() + end = time.perf_counter() + return (end - start) / n_iters + + # warm-up + run_cuda_benchmark(3) + + lat = run_cuda_benchmark(num_iters) + + # free tensors to mitigate OOM when sweeping + del key, value, key_cache, value_cache, slot_mapping + torch.cuda.empty_cache() + + return lat + + +def main(args): + rows = [] + for layout in ["NHD", "HND"]: + for exp in range(1, 17): + n_tok = 2**exp + lat = run_benchmark( + num_tokens=n_tok, + num_heads=args.num_heads, + head_size=args.head_size, + block_size=args.block_size, + num_blocks=args.num_blocks, + dtype=STR_DTYPE_TO_TORCH_DTYPE[args.dtype], + kv_cache_dtype=args.kv_cache_dtype, + kv_cache_layout=layout, + num_iters=args.iters, + implementation=args.implementation, + benchmark_mode=args.mode, + device="cuda", + ) + rows.append([n_tok, layout, f"{lat * 1e6:.3f}"]) + + print( + f"Benchmark results for implementation {args.implementation}" + f" (measuring with {args.mode}):" + ) + print(tabulate(rows, headers=["num_tokens", "layout", "latency (µs)"])) + + +if __name__ == "__main__": + parser = FlexibleArgumentParser() + + parser.add_argument("--num-heads", type=int, default=128) + parser.add_argument( + "--head-size", + type=int, + choices=[64, 80, 96, 112, 120, 128, 192, 256], + default=128, + ) + parser.add_argument("--block-size", type=int, choices=[16, 32], default=16) + parser.add_argument("--num-blocks", type=int, default=128 * 512) + + parser.add_argument( + "--dtype", + type=str, + choices=["half", "bfloat16", "float"], + default="bfloat16", + ) + + parser.add_argument( + "--kv-cache-dtype", + type=str, + choices=["auto", "fp8"], + default="auto", + ) + + parser.add_argument("--iters", type=int, default=100) + + parser.add_argument( + "--implementation", + type=str, + choices=["cuda", "triton"], + default="cuda", + ) + + parser.add_argument( + "--mode", + type=str, + choices=["cudagraph", "no_graph"], + default="cudagraph", + ) + + args = parser.parse_args() + + main(args) diff --git a/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py b/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py new file mode 100644 index 000000000..c7a4066b3 --- /dev/null +++ b/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py @@ -0,0 +1,675 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from collections.abc import Callable + +import matplotlib.pyplot as plt +import numpy as np +import torch + +from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import ( + 
silu_mul_fp8_quant_deep_gemm_cuda, +) +from vllm.platforms import current_platform +from vllm.triton_utils import tl, triton +from vllm.utils.deep_gemm import is_deep_gemm_e8m0_used + + +@triton.jit +def _silu_mul_fp8_quant_deep_gemm( + # Pointers ------------------------------------------------------------ + input_ptr, # 16-bit activations (E, T, 2*H) + y_q_ptr, # fp8 quantized activations (E, T, H) + y_s_ptr, # 16-bit scales (E, T, G) + counts_ptr, # int32 num tokens per expert (E) + # Sizes --------------------------------------------------------------- + H: tl.constexpr, # hidden dimension (per output) + GROUP_SIZE: tl.constexpr, # elements per group (usually 128) + # Strides for input (elements) --------------------------------------- + stride_i_e, + stride_i_t, + stride_i_h, + # Strides for y_q (elements) ----------------------------------------- + stride_yq_e, + stride_yq_t, + stride_yq_h, + # Strides for y_s (elements) ----------------------------------------- + stride_ys_e, + stride_ys_t, + stride_ys_g, + # Stride for counts (elements) + stride_counts_e, + # Numeric params ------------------------------------------------------ + eps: tl.constexpr, + fp8_min: tl.constexpr, + fp8_max: tl.constexpr, + use_ue8m0: tl.constexpr, + # Meta --------------------------------------------------------------- + BLOCK: tl.constexpr, + NUM_STAGES: tl.constexpr, +): + G = H // GROUP_SIZE + + # map program id -> (e, g) + pid = tl.program_id(0) + e = pid // G + g = pid % G + + e = e.to(tl.int64) + g = g.to(tl.int64) + + # number of valid tokens for this expert + n_tokens = tl.load(counts_ptr + e * stride_counts_e).to(tl.int64) + + cols = tl.arange(0, BLOCK).to(tl.int64) + mask = cols < BLOCK + + base_input_offset = e * stride_i_e + g * GROUP_SIZE * stride_i_h + base_gate_offset = base_input_offset + cols * stride_i_h + base_up_offset = base_input_offset + H * stride_i_h + cols * stride_i_h + base_yq_offset = e * stride_yq_e + g * GROUP_SIZE * stride_yq_h + cols * stride_yq_h + base_ys_offset = e * stride_ys_e + g * stride_ys_g + + for t in tl.range(0, n_tokens, num_stages=NUM_STAGES): + gate = tl.load( + input_ptr + base_gate_offset + t * stride_i_t, mask=mask, other=0.0 + ).to(tl.float32) + up = tl.load(input_ptr + base_up_offset + t * stride_i_t, mask=mask, other=0.0) + + gate = gate * (1.0 / (1.0 + tl.exp(-gate))) + y = gate * up + + y_s = tl.maximum(tl.max(tl.abs(y)), eps) / fp8_max + if use_ue8m0: + y_s = tl.exp2(tl.ceil(tl.log2(y_s))) + + y_q = tl.clamp(y / y_s, fp8_min, fp8_max).to(y_q_ptr.dtype.element_ty) + + tl.store(y_q_ptr + base_yq_offset + t * stride_yq_t, y_q, mask=mask) + tl.store(y_s_ptr + base_ys_offset + t * stride_ys_t, y_s) + + +def silu_mul_fp8_quant_deep_gemm_triton( + y: torch.Tensor, # (E, T, 2*H) + tokens_per_expert: torch.Tensor, # (E,) number of valid tokens per expert + num_parallel_tokens, + group_size: int = 128, + eps: float = 1e-10, +) -> tuple[torch.Tensor, torch.Tensor]: + """Quantize silu(y[..., :H]) * y[..., H:] to FP8 with group per-token scales + + y has shape (E, T, 2*H). The first half of the last dimension is + silu-activated, multiplied by the second half, then quantized into FP8. 
+ + Returns `(y_q, y_s)` where + * `y_q`: FP8 tensor, shape (E, T, H), same layout as y[..., :H] + * `y_s`: FP32 tensor, shape (E, T, H // group_size), strides (T*G, 1, T) + """ + assert y.ndim == 3, "y must be (E, T, 2*H)" + E, T, H2 = y.shape + assert H2 % 2 == 0, "last dim of y must be even (2*H)" + H = H2 // 2 + G = (H + group_size - 1) // group_size + assert H % group_size == 0, "H must be divisible by group_size" + assert tokens_per_expert.ndim == 1 and tokens_per_expert.shape[0] == E, ( + "tokens_per_expert must be shape (E,)" + ) + tokens_per_expert = tokens_per_expert.to(device=y.device, dtype=torch.int32) + + # allocate outputs + fp8_dtype = torch.float8_e4m3fn + y_q = torch.empty((E, T, H), dtype=fp8_dtype, device=y.device) + + # strides (elements) + stride_i_e, stride_i_t, stride_i_h = y.stride() + stride_yq_e, stride_yq_t, stride_yq_h = y_q.stride() + + # desired scale strides (elements): (T*G, 1, T) + stride_ys_e = T * G + stride_ys_t = 1 + stride_ys_g = T + y_s = torch.empty_strided( + (E, T, G), + (stride_ys_e, stride_ys_t, stride_ys_g), + dtype=torch.float32, + device=y.device, + ) + + stride_cnt_e = tokens_per_expert.stride()[0] + + # Static grid over experts and H-groups. + # A loop inside the kernel handles the token dim + grid = (E * G,) + + f_info = torch.finfo(fp8_dtype) + fp8_max = f_info.max + fp8_min = f_info.min + + _silu_mul_fp8_quant_deep_gemm[grid]( + y, + y_q, + y_s, + tokens_per_expert, + H, + group_size, + stride_i_e, + stride_i_t, + stride_i_h, + stride_yq_e, + stride_yq_t, + stride_yq_h, + stride_ys_e, + stride_ys_t, + stride_ys_g, + stride_cnt_e, + eps, + fp8_min, + fp8_max, + is_deep_gemm_e8m0_used(), + BLOCK=group_size, + NUM_STAGES=4, + num_warps=1, + ) + + return y_q, y_s + + +# Parse generation strategies +strategies = ["uniform", "max_t", "first_t"] + + +def benchmark( + kernel: Callable, + E: int, + T: int, + H: int, + total_tokens: int, + num_parallel_tokens: int = 64, + G: int = 128, + runs: int = 200, + num_warmups: int = 20, + gen_strategy: str = "default", + iterations_per_run: int = 20, +): + def generate_data(seed_offset=0): + """Generate input data with given seed offset""" + current_platform.seed_everything(42 + seed_offset) + y = torch.rand((E, T, 2 * H), dtype=torch.bfloat16, device="cuda").contiguous() + + if gen_strategy == "uniform": + r = torch.rand(size=(E,), device="cuda") + r /= r.sum() + r *= total_tokens + tokens_per_expert = r.int() + tokens_per_expert = torch.minimum( + tokens_per_expert, + torch.ones((E,), device=r.device, dtype=torch.int) * T, + ) + elif gen_strategy == "max_t": + tokens_per_expert = torch.empty(size=(E,), dtype=torch.int32, device="cuda") + tokens_per_expert.fill_(total_tokens / E) + elif gen_strategy == "first_t": + tokens_per_expert = torch.zeros(size=(E,), dtype=torch.int32, device="cuda") + tokens_per_expert[0] = min(T, total_tokens) + else: + raise ValueError(f"Unknown generation strategy: {gen_strategy}") + return y, tokens_per_expert + + dataset_count = 4 + # Pre-generate different input matrices for each iteration to avoid cache effects + data_sets = [generate_data(i) for i in range(dataset_count)] + + # Warmup + y, tokens_per_expert = data_sets[0] + for _ in range(num_warmups): + kernel( + y, tokens_per_expert, num_parallel_tokens=num_parallel_tokens, group_size=G + ) + torch.cuda.synchronize() + + start_event = torch.cuda.Event(enable_timing=True) + end_event = torch.cuda.Event(enable_timing=True) + + # Benchmark + latencies: list[float] = [] + for _ in range(runs): + torch.cuda.synchronize() + + 
start_event.record() + for i in range(iterations_per_run): + y, tokens_per_expert = data_sets[i % dataset_count] + kernel( + y, + tokens_per_expert, + num_parallel_tokens=num_parallel_tokens, + group_size=G, + ) + end_event.record() + end_event.synchronize() + + total_time_ms = start_event.elapsed_time(end_event) + per_iter_time_ms = total_time_ms / iterations_per_run + latencies.append(per_iter_time_ms) + + # Use median instead of average for better outlier handling + median_time_ms = np.median(latencies) + median_time_s = median_time_ms / 1000 + + # Calculate actual work done (using first dataset for consistency) + _, tokens_per_expert = data_sets[0] + actual_tokens = tokens_per_expert.sum().item() + actual_elements = actual_tokens * H + + # GFLOPS: operations per element = exp + 3 muls + 1 div + quantization ops ≈ 8 ops + ops_per_element = 8 + total_ops = actual_elements * ops_per_element + gflops = total_ops / median_time_s / 1e9 + + # Memory bandwidth: bfloat16 inputs (2 bytes), fp8 output (1 byte), scales (4 bytes) + input_bytes = actual_tokens * 2 * H * 2 # 2*H bfloat16 inputs + output_bytes = actual_tokens * H * 1 # H fp8 outputs + scale_bytes = actual_tokens * (H // G) * 4 # scales in float32 + total_bytes = input_bytes + output_bytes + scale_bytes + memory_bw = total_bytes / median_time_s / 1e9 + + HOPPER_BANDWIDTH_TBPS = 3.35 + return ( + median_time_ms, + gflops, + memory_bw, + (memory_bw / (HOPPER_BANDWIDTH_TBPS * 1024)) * 100, + ) + + +def create_comparison_plot( + ratio, cuda_times, baseline_times, config_labels, strategy_name, id +): + """Create a comparison plot for a specific generation strategy""" + fig, ax = plt.subplots(1, 1, figsize=(16, 6)) + + # Configure x-axis positions + x = np.arange(len(config_labels)) + width = 0.35 + + # Execution Time plot (lower is better) + ax.bar( + x - width / 2, cuda_times, width, label="CUDA Kernel", alpha=0.8, color="blue" + ) + ax.bar( + x + width / 2, + baseline_times, + width, + label="Baseline", + alpha=0.8, + color="orange", + ) + + # Add speedup labels over each bar pair + for i in range(len(x)): + speedup = ratio[i] + max_height = max(cuda_times[i], baseline_times[i]) + ax.text( + x[i], + max_height + max_height * 0.02, + f"{speedup:.2f}x", + ha="center", + va="bottom", + fontweight="bold", + fontsize=9, + ) + + ax.set_xlabel("Configuration") + ax.set_ylabel("% Utilization") + ax.set_title( + f"Memory Bandwidth Utilization (%) - {strategy_name}\n(Higher is Better)" + ) + ax.set_xticks(x) + ax.set_xticklabels(config_labels, rotation=45, ha="right") + ax.legend() + ax.grid(True, alpha=0.3) + + plt.tight_layout() + return fig, ax + + +def create_combined_plot(all_results): + """Create a combined plot with all strategies in one PNG""" + num_strategies = len(all_results) + fig, axes = plt.subplots(num_strategies, 1, figsize=(20, 6 * num_strategies)) + + if num_strategies == 1: + axes = [axes] + + for idx, ( + strategy_name, + ratio, + cuda_times, + baseline_times, + config_labels, + ) in enumerate(all_results): + ax = axes[idx] + + # Configure x-axis positions + x = np.arange(len(config_labels)) + width = 0.35 + + # Execution Time plot (lower is better) + ax.bar( + x - width / 2, + cuda_times, + width, + label="CUDA Kernel", + alpha=0.8, + color="blue", + ) + ax.bar( + x + width / 2, + baseline_times, + width, + label="Baseline", + alpha=0.8, + color="orange", + ) + + # Add speedup labels over each bar pair + for i in range(len(x)): + speedup = ratio[i] + max_height = max(cuda_times[i], baseline_times[i]) + ax.text( + x[i], + 
max_height + max_height * 0.02, + f"{speedup:.2f}x", + ha="center", + va="bottom", + fontweight="bold", + fontsize=9, + ) + + ax.set_xlabel("Configuration") + ax.set_ylabel("% Utilization") + ax.set_title( + f"Memory Bandwidth Utilization (%) - {strategy_name}\n(Higher is Better)" + ) + ax.set_xticks(x) + ax.set_xticklabels(config_labels, rotation=45, ha="right") + ax.legend() + ax.grid(True, alpha=0.3) + + plt.tight_layout() + filename = "../../silu_bench/silu_benchmark_combined.png" + plt.savefig(filename, dpi=300, bbox_inches="tight") + plt.show() + + return filename + + +outer_dim = 7168 +configs = [ + # DeepSeekV3 Configs + (8, 1024, 7168), + # DeepSeekV3 Configs + (32, 1024, 7168), + # DeepSeekV3 Configs + (256, 1024, 7168), +] + +runs = 100 +num_warmups = 20 + +strategy_descriptions = { + "uniform": "Uniform Random", + "max_t": "Even Assignment", + "first_t": "experts[0] = T, experts[1:] = 0", +} + +print(f"GPU: {torch.cuda.get_device_name()}") +print(f"Testing strategies: {', '.join(strategies)}") +print(f"Configurations: {len(configs)} configs") + +all_results = [] + +# Run benchmarks for each strategy +for id, strategy in enumerate(strategies): + print(f"\n{'=' * 60}") + print(f"Testing strategy: {strategy_descriptions[strategy]}") + print(f"{'=' * 60}") + + # Collect benchmark data for both algorithms + config_labels = [] + config_x_axis = [] + all_cuda_results = [] + all_baseline_results = [] + all_ratios = [] + + for E, T, H in configs: + total_tokens_config = [8 * E, 16 * E, 32 * E, 64 * E, 128 * E, 256 * E] + config_x_axis.append(total_tokens_config) + + cuda_results = [] + baseline_results = [] + ratios = [] + + for total_tokens in total_tokens_config: + config_label = f"E={E},T={T},H={H},TT={total_tokens}" + config_labels.append(config_label) + + # CUDA kernel results + time_ms_cuda, gflops, gbps, perc = benchmark( + silu_mul_fp8_quant_deep_gemm_cuda, + E, + T, + H, + total_tokens, + runs=runs, + num_warmups=num_warmups, + gen_strategy=strategy, + ) + cuda_results.append((time_ms_cuda, gflops, gbps, perc)) + + # Baseline results + time_ms_triton, gflops, gbps, perc = benchmark( + silu_mul_fp8_quant_deep_gemm_triton, + E, + T, + H, + total_tokens, + runs=runs, + num_warmups=num_warmups, + gen_strategy=strategy, + ) + baseline_results.append((time_ms_triton, gflops, gbps, perc)) + ratios.append(time_ms_triton / time_ms_cuda) + + print(f"Completed: {config_label}") + all_cuda_results.append(cuda_results) + all_baseline_results.append(baseline_results) + all_ratios.append(ratios) + + # Store results for combined plotting + all_results.append( + ( + strategy_descriptions[strategy], + all_ratios, + all_cuda_results, + all_baseline_results, + config_labels, + config_x_axis, + ) + ) + + # Print summary table for this strategy + print(f"\nSummary Table - {strategy_descriptions[strategy]}:") + print(f"{'Config':<20} {'CUDA Time(ms)':<12} {'Base Time(ms)':<12} {'Speedup':<8}") + print("-" * 60) + + for i, (E, T, H) in enumerate(configs): + speedup = baseline_results[i][0] / cuda_results[i][0] + config_label = f"E={E:3d},T={T:4d},H={H:4d}" + print( + f"{config_label:<20} {cuda_results[i][0]:8.5f} " + f"{baseline_results[i][0]:8.5f} {speedup:6.2f}x" + ) + + +def create_total_tokens_plot(all_results): + num_strategies = len(all_results) + num_configs = len(configs) + + # Create side-by-side subplots: 2 columns for speedup and bandwidth percentage + fig, axs = plt.subplots( + num_strategies, num_configs * 2, figsize=(28, 6 * num_strategies) + ) + + # Add main title to the entire figure + 
fig.suptitle( + "Performance Analysis: Speedup vs Bandwidth Utilization (Triton & CUDA)", + fontsize=16, + fontweight="bold", + y=0.98, + ) + + # Handle single strategy case + if num_strategies == 1: + axs = axs.reshape(1, -1) + + # Handle single config case + if num_configs == 1: + axs = axs.reshape(-1, 2) + + for strategy_idx, result in enumerate(all_results): + ( + strategy_name, + all_ratios, + all_cuda_results, + all_baseline_results, + config_labels, + config_x_axis, + ) = result + + for config_idx in range(num_configs): + # Speedup plot (left column) + ax_speedup = axs[strategy_idx, config_idx * 2] + # Bandwidth plot (right column) + ax_bandwidth = axs[strategy_idx, config_idx * 2 + 1] + + E, T, H = configs[config_idx] + ratios = all_ratios[config_idx] + total_tokens_values = config_x_axis[config_idx] + + # Extract CUDA and Triton bandwidth percentages + cuda_bandwidth_percentages = [ + result[3] for result in all_cuda_results[config_idx] + ] + triton_bandwidth_percentages = [ + result[3] for result in all_baseline_results[config_idx] + ] + + # Plot speedup ratios vs total tokens (left plot) + ax_speedup.plot( + total_tokens_values, ratios, "bo-", linewidth=3, markersize=8 + ) + ax_speedup.set_title( + f"{strategy_name}\nSpeedup (CUDA/Triton)\nE={E}, T={T}, H={H}", + fontsize=12, + fontweight="bold", + ) + ax_speedup.set_xlabel("Total Tokens", fontweight="bold", fontsize=11) + ax_speedup.set_ylabel("Speedup Ratio", fontweight="bold", fontsize=11) + ax_speedup.grid(True, alpha=0.3) + + ax_bandwidth.plot( + total_tokens_values, + cuda_bandwidth_percentages, + "ro-", + linewidth=3, + markersize=8, + label="CUDA", + ) + ax_bandwidth.plot( + total_tokens_values, + triton_bandwidth_percentages, + "go-", + linewidth=3, + markersize=8, + label="Triton", + ) + ax_bandwidth.set_title( + f"{strategy_name}\nBandwidth Utilization (Hopper)\nE={E}, T={T}, H={H}", + fontsize=12, + fontweight="bold", + ) + ax_bandwidth.set_xlabel("Total Tokens", fontweight="bold", fontsize=11) + ax_bandwidth.set_ylabel( + "% of Peak Bandwidth", fontweight="bold", fontsize=11 + ) + ax_bandwidth.legend(prop={"weight": "bold"}) + ax_bandwidth.grid(True, alpha=0.3) + + # Format x-axis labels for both plots + for ax in [ax_speedup, ax_bandwidth]: + ax.set_xticks(total_tokens_values) + ax.set_xticklabels( + [ + f"{tt // 1000}K" if tt >= 1000 else str(tt) + for tt in total_tokens_values + ], + fontweight="bold", + ) + # Make tick labels bold + for label in ax.get_xticklabels() + ax.get_yticklabels(): + label.set_fontweight("bold") + + # Add value labels on speedup points + for x, y in zip(total_tokens_values, ratios): + ax_speedup.annotate( + f"{y:.2f}x", + (x, y), + textcoords="offset points", + xytext=(0, 12), + ha="center", + fontsize=10, + fontweight="bold", + bbox=dict(boxstyle="round,pad=0.3", facecolor="white", alpha=0.7), + ) + + # Add value labels on CUDA bandwidth points + for x, y in zip(total_tokens_values, cuda_bandwidth_percentages): + ax_bandwidth.annotate( + f"{y:.1f}%", + (x, y), + textcoords="offset points", + xytext=(0, 12), + ha="center", + fontsize=9, + fontweight="bold", + bbox=dict(boxstyle="round,pad=0.2", facecolor="red", alpha=0.3), + ) + + # Add value labels on Triton bandwidth points + for x, y in zip(total_tokens_values, triton_bandwidth_percentages): + ax_bandwidth.annotate( + f"{y:.1f}%", + (x, y), + textcoords="offset points", + xytext=(0, -15), + ha="center", + fontsize=9, + fontweight="bold", + bbox=dict(boxstyle="round,pad=0.2", facecolor="green", alpha=0.3), + ) + + plt.tight_layout() + 
plt.subplots_adjust(top=0.93) # Make room for main title + filename = "silu_benchmark_total_tokens.png" + plt.savefig(filename, dpi=300, bbox_inches="tight") + plt.show() + + return filename + + +# Create combined plot with all strategies +combined_plot_filename = create_total_tokens_plot(all_results) + +print(f"\n{'=' * 60}") +print("Benchmark Complete!") +print(f"Generated combined plot: {combined_plot_filename}") +print(f"{'=' * 60}") diff --git a/benchmarks/kernels/benchmark_trtllm_decode_attention.py b/benchmarks/kernels/benchmark_trtllm_decode_attention.py new file mode 100644 index 000000000..6ddab4621 --- /dev/null +++ b/benchmarks/kernels/benchmark_trtllm_decode_attention.py @@ -0,0 +1,293 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import csv +import os +from datetime import datetime +from typing import Optional + +import flashinfer +import torch + +from vllm.utils import round_up + +FLOAT32_BYTES = torch.finfo(torch.float).bits // 8 +FP8_DTYPE = torch.float8_e4m3fn +FP4_DTYPE = torch.uint8 + + +def to_float8(x, dtype=torch.float8_e4m3fn): + finfo = torch.finfo(dtype) + min_val, max_val = x.aminmax() + amax = torch.maximum(min_val.abs(), max_val.abs()).clamp(min=1e-12) + scale = finfo.max / amax * 0.1 + x_scl_sat = (x * scale).clamp(min=finfo.min, max=finfo.max) + return x_scl_sat.to(dtype), scale.float().reciprocal() + + +@torch.no_grad() +def benchmark_decode( + dtype: torch.dtype, + quant_dtypes: tuple[ + Optional[torch.dtype], Optional[torch.dtype], Optional[torch.dtype] + ], + batch_size: int, + max_seq_len: int, + num_heads: tuple[int, int] = (64, 8), + head_size: int = 128, + kv_layout: str = "HND", + block_size: int = 16, + warmup: int = 10, + trials: int = 20, +): + torch.set_default_device("cuda") + torch.manual_seed(0) + + q_quant_dtype, kv_quant_dtype, o_quant_dtype = quant_dtypes + q_quant_dtype = q_quant_dtype or dtype + kv_quant_dtype = kv_quant_dtype or dtype + o_quant_dtype = o_quant_dtype or dtype + + num_qo_heads, num_kv_heads = num_heads + assert num_qo_heads % num_kv_heads == 0 + + sm_scale = float(1.0 / (head_size**0.5)) + + # large number to reduce kv_cache reuse + NUM_BLOCKS = int(256000 / block_size) + + kv_cache_shape = None + if kv_layout == "NHD": + kv_cache_shape = (NUM_BLOCKS, 2, block_size, num_kv_heads, head_size) + elif kv_layout == "HND": + kv_cache_shape = (NUM_BLOCKS, 2, num_kv_heads, block_size, head_size) + else: + raise ValueError(f"Invalid kv_layout: {kv_layout}") + + # Always using 1.0 scale to reflect the real perf in benchmarking + q_scale = 1.0 + ref_query = torch.randn(batch_size, num_qo_heads, head_size, dtype=dtype) + if q_quant_dtype == FP8_DTYPE: + query, _ = to_float8(ref_query) + else: + query = ref_query + + kv_lens = torch.randint(1, max_seq_len, (batch_size,), dtype=torch.int32) + kv_lens[-1] = max_seq_len + + seq_lens = kv_lens + max_seq_len = torch.max(seq_lens).item() + + # Always using 1.0 scale to reflect the real perf in benchmarking + k_scale = v_scale = 1.0 + ref_kv_cache = torch.randn(kv_cache_shape, dtype=dtype) + if kv_quant_dtype == FP8_DTYPE: + kv_cache, _ = to_float8(ref_kv_cache) + else: + kv_cache = ref_kv_cache + + max_num_blocks_per_seq = (max_seq_len + block_size - 1) // block_size + block_tables = torch.randint( + 0, NUM_BLOCKS, (batch_size, max_num_blocks_per_seq), dtype=torch.int32 + ) + kv_indptr = [0] + kv_indices = [] + kv_last_page_lens = [] + for i in range(batch_size): + seq_len = seq_lens[i] + assert seq_len > 0 + num_blocks = 
(seq_len + block_size - 1) // block_size + kv_indices.extend(block_tables[i, :num_blocks]) + kv_indptr.append(kv_indptr[-1] + num_blocks) + kv_last_page_len = seq_len % block_size + if kv_last_page_len == 0: + kv_last_page_len = block_size + kv_last_page_lens.append(kv_last_page_len) + + kv_indptr = torch.tensor(kv_indptr, dtype=torch.int32) + kv_indices = torch.tensor(kv_indices, dtype=torch.int32) + kv_last_page_lens = torch.tensor(kv_last_page_lens, dtype=torch.int32) + workspace_buffer = torch.zeros(1024 * 1024 * 1024, dtype=torch.int8) + + wrapper = flashinfer.BatchDecodeWithPagedKVCacheWrapper( + workspace_buffer, + kv_layout, + use_tensor_cores=True, + ) + wrapper.plan( + kv_indptr, + kv_indices, + kv_last_page_lens, + num_qo_heads, + num_kv_heads, + head_size, + block_size, + "NONE", + sm_scale=sm_scale, + q_data_type=dtype, + kv_data_type=dtype, + ) + + def time_fn(fn, warmup=10, trials=20): + torch.cuda.synchronize() + start = torch.cuda.Event(enable_timing=True) + end = torch.cuda.Event(enable_timing=True) + times = [] + for i in range(warmup): + fn() + for i in range(trials): + start.record() + fn() + end.record() + torch.cuda.synchronize() + times.append(start.elapsed_time(end)) # ms + return sum(times) / len(times), torch.std(torch.tensor(times)) + + o_scale = 1.0 + o_sf_scale = None + output_baseline = torch.empty(ref_query.shape, dtype=dtype) + if o_quant_dtype == FP4_DTYPE: + o_sf_scale = 500.0 + output_trtllm = flashinfer.utils.FP4Tensor( + torch.empty(query.shape[:-1] + (query.shape[-1] // 2,), dtype=torch.uint8), + torch.empty( + ( + round_up(query.shape[0], 128), + round_up(query.shape[1] * query.shape[2] // 16, 4), + ), + dtype=torch.float8_e4m3fn, + ), + ) + else: + output_trtllm = torch.empty(query.shape, dtype=o_quant_dtype) + + def baseline_decode(): + return wrapper.run( + ref_query, + ref_kv_cache, + k_scale=k_scale, + v_scale=v_scale, + out=output_baseline, + ) + + def trtllm_decode(): + return flashinfer.decode.trtllm_batch_decode_with_kv_cache( + query=query, + kv_cache=kv_cache, + workspace_buffer=workspace_buffer, + block_tables=block_tables, + seq_lens=seq_lens, + max_seq_len=max_seq_len, + bmm1_scale=q_scale * k_scale * sm_scale, + bmm2_scale=v_scale / o_scale, + o_sf_scale=o_sf_scale, + out=output_trtllm, + ) + + baseline_mean, baseline_std = time_fn(baseline_decode) + trtllm_mean, trtllm_std = time_fn(trtllm_decode) + + # Calculate percentage speedup (positive means TRT is faster) + speedup_percent = (baseline_mean - trtllm_mean) / baseline_mean + + print( + f"\t{batch_size}\t{max_seq_len}\t{trtllm_mean:.3f}\t{trtllm_std.item():.3f}" + f"\t{baseline_mean:.3f}\t{baseline_std.item():.3f}\t{speedup_percent:.3f}" + ) + + # Return results for CSV writing + return { + "batch_size": batch_size, + "trtllm_mean": trtllm_mean, + "trtllm_std": trtllm_std.item(), + "baseline_mean": baseline_mean, + "baseline_std": baseline_std.item(), + "speedup_percent": speedup_percent, + "q_dtype": str(q_quant_dtype), + "kv_cache_dtype": str(kv_quant_dtype), + "output_dtype": str(o_quant_dtype), + "block_size": block_size, + "num_kv_heads": num_kv_heads, + "head_size": head_size, + "max_seq_len": max_seq_len, + } + + +def write_results_to_csv(results, filename=None): + """Write benchmark results to CSV file.""" + if filename is None: + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + filename = f"flashinfer_trtllm_benchmark_{timestamp}.csv" + + fieldnames = [ + "batch_size", + "trtllm_mean", + "trtllm_std", + "baseline_mean", + "baseline_std", + "speedup_percent", + 
"q_dtype", + "kv_cache_dtype", + "output_dtype", + "block_size", + "num_kv_heads", + "head_size", + "max_seq_len", + ] + + file_exists = os.path.exists(filename) + + with open(filename, "a", newline="") as csvfile: + writer = csv.DictWriter(csvfile, fieldnames=fieldnames) + + if not file_exists: + writer.writeheader() + + for result in results: + writer.writerow(result) + + print(f"Results written to {filename}") + + +if __name__ == "__main__": + batch_sizes = [1, 4, 8, 16, 32, 64, 128, 256] + max_seq_lens = [1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072] + all_results = [] + + dtype = torch.bfloat16 + quant_dtypes = [ + # (q_quant_dtype, kv_quant_dtype, o_quant_dtype) + (None, None, None), + (None, FP8_DTYPE, None), + (FP8_DTYPE, FP8_DTYPE, None), + (FP8_DTYPE, FP8_DTYPE, FP8_DTYPE), + (FP8_DTYPE, FP8_DTYPE, FP4_DTYPE), + ] + + for quant_dtype in quant_dtypes: + q_quant_dtype, kv_quant_dtype, o_quant_dtype = quant_dtype + q_quant_dtype = q_quant_dtype or dtype + kv_quant_dtype = kv_quant_dtype or dtype + o_quant_dtype = o_quant_dtype or dtype + + print( + f"Running benchmark for q_dtype = {q_quant_dtype}, " + f"kv_cache_dtype: {kv_quant_dtype}, " + f"output_dtype: {o_quant_dtype}" + ) + print( + "\tbatch_size\tmax_seq_len\ttrtllm_mean\ttrtllm_std\tbaseline_mean\t" + "baseline_std\tspeedup_percent" + ) + for max_seq_len in max_seq_lens: + for bs in batch_sizes: + result = benchmark_decode( + dtype=dtype, + quant_dtypes=quant_dtype, + batch_size=bs, + max_seq_len=max_seq_len, + ) + all_results.append(result) + + # Write all results to CSV + write_results_to_csv(all_results) diff --git a/benchmarks/kernels/benchmark_trtllm_prefill_attention.py b/benchmarks/kernels/benchmark_trtllm_prefill_attention.py new file mode 100644 index 000000000..131df74c7 --- /dev/null +++ b/benchmarks/kernels/benchmark_trtllm_prefill_attention.py @@ -0,0 +1,308 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import csv +import os +from datetime import datetime +from typing import Optional + +import flashinfer +import torch + +from vllm.utils import round_up + +FLOAT32_BYTES = torch.finfo(torch.float).bits // 8 +FP8_DTYPE = torch.float8_e4m3fn +FP4_DTYPE = torch.uint8 + + +def to_float8(x, dtype=torch.float8_e4m3fn): + finfo = torch.finfo(dtype) + min_val, max_val = x.aminmax() + amax = torch.maximum(min_val.abs(), max_val.abs()).clamp(min=1e-12) + scale = finfo.max / amax * 0.1 + x_scl_sat = (x * scale).clamp(min=finfo.min, max=finfo.max) + return x_scl_sat.to(dtype), scale.float().reciprocal() + + +@torch.no_grad() +def benchmark_prefill( + dtype: torch.dtype, + quant_dtypes: tuple[ + Optional[torch.dtype], Optional[torch.dtype], Optional[torch.dtype] + ], + batch_size: int, + max_seq_len: int, + num_heads: tuple[int, int] = (64, 8), + head_size: int = 128, + kv_layout: str = "HND", + block_size: int = 16, + warmup: int = 10, + trials: int = 20, +): + torch.set_default_device("cuda") + torch.manual_seed(0) + + q_quant_dtype, kv_quant_dtype, o_quant_dtype = quant_dtypes + q_quant_dtype = q_quant_dtype or dtype + kv_quant_dtype = kv_quant_dtype or dtype + o_quant_dtype = o_quant_dtype or dtype + + max_q_len = max_kv_len = max_seq_len + + num_qo_heads, num_kv_heads = num_heads + assert num_qo_heads % num_kv_heads == 0 + + sm_scale = float(1.0 / (head_size**0.5)) + + # large number to reduce kv_cache reuse + NUM_BLOCKS = int(256000 / block_size) + + kv_cache_shape = None + if kv_layout == "NHD": + kv_cache_shape = (NUM_BLOCKS, 2, block_size, 
num_kv_heads, head_size) + elif kv_layout == "HND": + kv_cache_shape = (NUM_BLOCKS, 2, num_kv_heads, block_size, head_size) + else: + raise ValueError(f"Invalid kv_layout: {kv_layout}") + + q_lens = torch.randint(1, max_q_len, (batch_size,), dtype=torch.int32) + q_lens[-1] = max_q_len + q_indptr = torch.cat( + [ + torch.tensor([0], dtype=torch.int32), + torch.cumsum(q_lens, dim=0, dtype=torch.int32), + ] + ) + + # Always using 1.0 scale to reflect the real perf in benchmarking + q_scale = 1.0 + ref_query = torch.randn( + torch.sum(q_lens).item(), num_qo_heads, head_size, dtype=dtype + ) + if q_quant_dtype == FP8_DTYPE: + query, _ = to_float8(ref_query) + else: + query = ref_query + + kv_lens = torch.randint(0, max_kv_len, (batch_size,), dtype=torch.int32) + kv_lens[-1] = max_kv_len + + seq_lens = kv_lens + q_lens + max_seq_len = torch.max(seq_lens).item() + + # Always using 1.0 scale to reflect the real perf in benchmarking + k_scale = v_scale = 1.0 + ref_kv_cache = torch.randn(kv_cache_shape, dtype=dtype) + if kv_quant_dtype == FP8_DTYPE: + kv_cache, _ = to_float8(ref_kv_cache) + else: + kv_cache = ref_kv_cache + + max_num_blocks_per_seq = (max_seq_len + block_size - 1) // block_size + block_tables = torch.randint( + 0, NUM_BLOCKS, (batch_size, max_num_blocks_per_seq), dtype=torch.int32 + ) + kv_indptr = [0] + kv_indices = [] + kv_last_page_lens = [] + for i in range(batch_size): + seq_len = seq_lens[i] + assert seq_len > 0 + num_blocks = (seq_len + block_size - 1) // block_size + kv_indices.extend(block_tables[i, :num_blocks]) + kv_indptr.append(kv_indptr[-1] + num_blocks) + kv_last_page_len = seq_len % block_size + if kv_last_page_len == 0: + kv_last_page_len = block_size + kv_last_page_lens.append(kv_last_page_len) + + kv_indptr = torch.tensor(kv_indptr, dtype=torch.int32) + kv_indices = torch.tensor(kv_indices, dtype=torch.int32) + kv_last_page_lens = torch.tensor(kv_last_page_lens, dtype=torch.int32) + workspace_buffer = torch.zeros(1024 * 1024 * 1024, dtype=torch.int8) + + wrapper = flashinfer.BatchPrefillWithPagedKVCacheWrapper( + workspace_buffer, kv_layout + ) + wrapper.plan( + q_indptr, + kv_indptr, + kv_indices, + kv_last_page_lens, + num_qo_heads, + num_kv_heads, + head_size, + block_size, + causal=True, + sm_scale=sm_scale, + q_data_type=dtype, + kv_data_type=dtype, + ) + + def time_fn(fn, warmup=10, trials=20): + torch.cuda.synchronize() + start = torch.cuda.Event(enable_timing=True) + end = torch.cuda.Event(enable_timing=True) + times = [] + for i in range(warmup): + fn() + for i in range(trials): + start.record() + fn() + end.record() + torch.cuda.synchronize() + times.append(start.elapsed_time(end)) # ms + return sum(times) / len(times), torch.std(torch.tensor(times)) + + o_scale = 1.0 + o_sf_scale = None + output_baseline = torch.empty(ref_query.shape, dtype=dtype) + if o_quant_dtype == FP4_DTYPE: + o_sf_scale = 500.0 + output_trtllm = flashinfer.utils.FP4Tensor( + torch.empty(query.shape[:-1] + (query.shape[-1] // 2,), dtype=torch.uint8), + torch.empty( + ( + round_up(query.shape[0], 128), + round_up(query.shape[1] * query.shape[2] // 16, 4), + ), + dtype=torch.float8_e4m3fn, + ), + ) + else: + output_trtllm = torch.empty(query.shape, dtype=o_quant_dtype) + + def baseline_prefill(): + return wrapper.run( + ref_query, + ref_kv_cache, + k_scale=k_scale, + v_scale=v_scale, + out=output_baseline, + ) + + def trtllm_prefill(): + return flashinfer.prefill.trtllm_batch_context_with_kv_cache( + query=query, + kv_cache=kv_cache, + workspace_buffer=workspace_buffer, + 
block_tables=block_tables, + seq_lens=seq_lens, + max_q_len=max_q_len, + max_kv_len=max_seq_len, + bmm1_scale=q_scale * k_scale * sm_scale, + bmm2_scale=v_scale / o_scale, + batch_size=batch_size, + cum_seq_lens_q=q_indptr, + cum_seq_lens_kv=kv_indptr, + o_sf_scale=o_sf_scale, + out=output_trtllm, + ) + + baseline_mean, baseline_std = time_fn(baseline_prefill) + trtllm_mean, trtllm_std = time_fn(trtllm_prefill) + + # Calculate percentage speedup (positive means TRT is faster) + speedup_percent = (baseline_mean - trtllm_mean) / baseline_mean + + print( + f"\t{batch_size}\t{max_seq_len}\t{trtllm_mean:8.3f}\t{trtllm_std.item():8.3f}" + f"\t{baseline_mean:8.3f}\t{baseline_std.item():8.3f}\t{speedup_percent:8.3f}" + ) + + # Return results for CSV writing + return { + "batch_size": batch_size, + "trtllm_mean": trtllm_mean, + "trtllm_std": trtllm_std.item(), + "baseline_mean": baseline_mean, + "baseline_std": baseline_std.item(), + "speedup_percent": speedup_percent, + "q_dtype": str(q_quant_dtype), + "kv_cache_dtype": str(kv_quant_dtype), + "output_dtype": str(o_quant_dtype), + "block_size": block_size, + "num_kv_heads": num_kv_heads, + "head_size": head_size, + "max_seq_len": max_seq_len, + } + + +def write_results_to_csv(results, filename=None): + """Write benchmark results to CSV file.""" + if filename is None: + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + filename = f"flashinfer_trtllm_benchmark_{timestamp}.csv" + + fieldnames = [ + "batch_size", + "trtllm_mean", + "trtllm_std", + "baseline_mean", + "baseline_std", + "speedup_percent", + "q_dtype", + "kv_cache_dtype", + "output_dtype", + "block_size", + "num_kv_heads", + "head_size", + "max_seq_len", + ] + + file_exists = os.path.exists(filename) + + with open(filename, "a", newline="") as csvfile: + writer = csv.DictWriter(csvfile, fieldnames=fieldnames) + + if not file_exists: + writer.writeheader() + + for result in results: + writer.writerow(result) + + print(f"Results written to {filename}") + + +if __name__ == "__main__": + batch_sizes = [1, 4, 8, 16, 32, 64, 128, 256] + max_seq_lens = [1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072] + all_results = [] + + dtype = torch.bfloat16 + quant_dtypes = [ + # (q_quant_dtype, kv_quant_dtype, o_quant_dtype) + (None, None, None), + (FP8_DTYPE, FP8_DTYPE, None), + (FP8_DTYPE, FP8_DTYPE, FP8_DTYPE), + (FP8_DTYPE, FP8_DTYPE, FP4_DTYPE), + ] + + for quant_dtype in quant_dtypes: + q_quant_dtype, kv_quant_dtype, o_quant_dtype = quant_dtype + q_quant_dtype = q_quant_dtype or dtype + kv_quant_dtype = kv_quant_dtype or dtype + o_quant_dtype = o_quant_dtype or dtype + + print( + f"Running benchmark for q_dtype = {q_quant_dtype}, " + f"kv_cache_dtype: {kv_quant_dtype}, " + f"output_dtype: {o_quant_dtype}" + ) + print( + "\tbatch_size\tmax_seq_len\ttrtllm_mean\ttrtllm_std\tbaseline_mean\t" + "baseline_std\tspeedup_percent" + ) + for max_seq_len in max_seq_lens: + for bs in batch_sizes: + result = benchmark_prefill( + dtype=dtype, + quant_dtypes=quant_dtype, + batch_size=bs, + max_seq_len=max_seq_len, + ) + all_results.append(result) + + # Write all results to CSV + write_results_to_csv(all_results) diff --git a/benchmarks/kernels/benchmark_w8a8_block_fp8.py b/benchmarks/kernels/benchmark_w8a8_block_fp8.py index 4fcdbadd6..c6c8e0b0b 100644 --- a/benchmarks/kernels/benchmark_w8a8_block_fp8.py +++ b/benchmarks/kernels/benchmark_w8a8_block_fp8.py @@ -11,13 +11,13 @@ from typing import Any import torch -import tqdm -import triton +from tqdm import tqdm from 
vllm.model_executor.layers.quantization.utils.fp8_utils import ( _w8a8_block_fp8_matmul, ) from vllm.platforms import current_platform +from vllm.triton_utils import triton from vllm.utils import FlexibleArgumentParser mp.set_start_method("spawn", force=True) @@ -56,7 +56,7 @@ def w8a8_block_matmul( Bs: The per-block quantization scale for `B`. block_size: The block size for per-block quantization. It should be 2-dim, e.g., [128, 128]. - output_dytpe: The dtype of the returned tensor. + output_dtype: The dtype of the returned tensor. Returns: torch.Tensor: The result of matmul. @@ -141,6 +141,7 @@ def get_weight_shapes(tp_size): # cannot TP total = [ (512 + 64, 7168), + (2112, 7168), ((128 + 64) * 128, 7168), (128 * (128 + 128), 512), (7168, 16384), diff --git a/benchmarks/kernels/deepgemm/README.md b/benchmarks/kernels/deepgemm/README.md index 917e81401..41e68e047 100644 --- a/benchmarks/kernels/deepgemm/README.md +++ b/benchmarks/kernels/deepgemm/README.md @@ -8,7 +8,7 @@ Currently this just includes dense GEMMs and only works on Hopper GPUs. You need to install vLLM in your usual fashion, then install DeepGEMM from source in its own directory: -``` +```bash git clone --recursive https://github.com/deepseek-ai/DeepGEMM cd DeepGEMM python setup.py install @@ -17,7 +17,7 @@ uv pip install -e . ## Usage -``` +```console python benchmark_fp8_block_dense_gemm.py INFO 02-26 21:55:13 [__init__.py:207] Automatically detected platform cuda. ===== STARTING FP8 GEMM BENCHMARK ===== diff --git a/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py b/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py index e67ce0545..db2398fc4 100644 --- a/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py +++ b/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py @@ -4,49 +4,20 @@ # ruff: noqa: E501 import time -# Import DeepGEMM functions -import deep_gemm import torch -from deep_gemm import calc_diff, ceil_div, get_col_major_tma_aligned_tensor -# Import vLLM functions from vllm import _custom_ops as ops from vllm.model_executor.layers.quantization.utils.fp8_utils import ( per_token_group_quant_fp8, w8a8_block_fp8_matmul, ) from vllm.triton_utils import triton - - -# Copied from -# https://github.com/deepseek-ai/DeepGEMM/blob/78cacf70d41d15d688bd493ebc85845f7f2a3d5d/tests/test_core.py#L9 -def per_token_cast_to_fp8( - x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: - """Convert tensor to FP8 format with per-token scaling.""" - assert x.dim() == 2 and x.size(1) % 128 == 0 - m, n = x.shape - x_view = x.view(m, -1, 128) - x_amax = x_view.abs().float().amax(dim=2).view(m, -1).clamp(1e-4) - return (x_view * (448.0 / x_amax.unsqueeze(2))).to( - torch.float8_e4m3fn).view(m, n), (x_amax / 448.0).view(m, -1) - - -# Copied from -# https://github.com/deepseek-ai/DeepGEMM/blob/78cacf70d41d15d688bd493ebc85845f7f2a3d5d/tests/test_core.py#L17 -def per_block_cast_to_fp8( - x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: - """Convert tensor to FP8 format with per-block scaling.""" - assert x.dim() == 2 - m, n = x.shape - x_padded = torch.zeros((ceil_div(m, 128) * 128, ceil_div(n, 128) * 128), - dtype=x.dtype, - device=x.device) - x_padded[:m, :n] = x - x_view = x_padded.view(-1, 128, x_padded.size(1) // 128, 128) - x_amax = x_view.abs().float().amax(dim=(1, 3), keepdim=True).clamp(1e-4) - x_scaled = (x_view * (448.0 / x_amax)).to(torch.float8_e4m3fn) - return x_scaled.view_as(x_padded)[:m, :n].contiguous(), ( - x_amax / 448.0).view(x_view.size(0), x_view.size(2)) +from 
vllm.utils.deep_gemm import ( + calc_diff, + fp8_gemm_nt, + get_col_major_tma_aligned_tensor, + per_block_cast_to_fp8, +) def benchmark_shape(m: int, @@ -69,14 +40,14 @@ def benchmark_shape(m: int, # Pre-quantize B for all implementations # (weights can be pre-quantized offline) - B_deepgemm, B_scale_deepgemm = per_block_cast_to_fp8(B) - B_vllm, B_scale_vllm = per_block_cast_to_fp8(B) + B_deepgemm, B_scale_deepgemm = per_block_cast_to_fp8(B, [128, 128], use_ue8m0=True) + B_vllm, B_scale_vllm = per_block_cast_to_fp8(B, [128, 128], use_ue8m0=True) # Block size configuration block_size = [128, 128] # Pre-quantize A for all implementations - A_deepgemm, A_scale_deepgemm = per_token_cast_to_fp8(A) + A_deepgemm, A_scale_deepgemm = per_token_group_quant_fp8(A, block_size[1]) A_scale_deepgemm = get_col_major_tma_aligned_tensor(A_scale_deepgemm) C_deepgemm = torch.empty((m, n), device='cuda', dtype=torch.bfloat16) A_vllm, A_scale_vllm = per_token_group_quant_fp8(A, block_size[1]) @@ -85,21 +56,13 @@ def benchmark_shape(m: int, # === DeepGEMM Implementation === def deepgemm_gemm(): - # A quantization is inside the loop as it depends on activations - # A_deepgemm, A_scale_deepgemm = per_token_cast_to_fp8(A) - # A_deepgemm, A_scale_deepgemm = per_token_group_quant_fp8( - # A, block_size[1]) - # A_scale_aligned = get_col_major_tma_aligned_tensor(A_scale_deepgemm) - # C_deepgemm = torch.empty((m, n), device='cuda', dtype=torch.bfloat16) - deep_gemm.gemm_fp8_fp8_bf16_nt((A_deepgemm, A_scale_deepgemm), + fp8_gemm_nt((A_deepgemm, A_scale_deepgemm), (B_deepgemm, B_scale_deepgemm), C_deepgemm) return C_deepgemm # === vLLM Triton Implementation === def vllm_triton_gemm(): - # A quantization is inside the loop as it depends on activations - # A_vllm, A_scale_vllm = per_token_group_quant_fp8(A, block_size[1]) return w8a8_block_fp8_matmul(A_vllm, B_vllm, A_scale_vllm, @@ -109,9 +72,6 @@ def vllm_triton_gemm(): # === vLLM CUTLASS Implementation === def vllm_cutlass_gemm(): - # A quantization is inside the loop as it depends on activations - # A_vllm_cutlass, A_scale_vllm_cutlass = per_token_group_quant_fp8( - # A, block_size[1], column_major_scales=True) return ops.cutlass_scaled_mm(A_vllm_cutlass, B_vllm.T, scale_a=A_scale_vllm_cutlass, diff --git a/benchmarks/kernels/weight_shapes.py b/benchmarks/kernels/weight_shapes.py index a27f02394..9a057990b 100644 --- a/benchmarks/kernels/weight_shapes.py +++ b/benchmarks/kernels/weight_shapes.py @@ -95,4 +95,10 @@ ([2048, 2816], 1), ([1408, 2048], 0), ], + "CohereLabs/c4ai-command-a-03-2025": [ + ([12288, 14336], 1), + ([12288, 12288], 0), + ([12288, 73728], 1), + ([36864, 12288], 0), + ], } diff --git a/benchmarks/multi_turn/README.md b/benchmarks/multi_turn/README.md new file mode 100644 index 000000000..f5b5c6c97 --- /dev/null +++ b/benchmarks/multi_turn/README.md @@ -0,0 +1,174 @@ +# Benchmark KV Cache Offloading with Multi-Turn Conversations + +The requirements (pip) for `benchmark_serving_multi_turn.py` can be found in `requirements.txt` + +First start serving your model + +```bash +export MODEL_PATH=/models/meta-llama/Meta-Llama-3.1-8B-Instruct/ + +vllm serve $MODEL_PATH --served-model-name Llama --disable-log-requests +``` + +The variable `MODEL_PATH` should be a path to the model files (e.g. downloaded from huggingface). 
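If the weights are not on disk yet, one possible way to fetch them (a sketch only, assuming `huggingface_hub` is installed and that you have access to the repository) is:

```python
# Illustrative download of the model files into the directory used as MODEL_PATH.
from huggingface_hub import snapshot_download

snapshot_download(
    repo_id="meta-llama/Meta-Llama-3.1-8B-Instruct",
    local_dir="/models/meta-llama/Meta-Llama-3.1-8B-Instruct",
)
```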
+ +## Synthetic Multi-Turn Conversations + +Download the following text file (used for generation of synthetic conversations) + +```bash +wget https://www.gutenberg.org/ebooks/1184.txt.utf-8 +mv 1184.txt.utf-8 pg1184.txt +``` + +The filename `pg1184.txt` is used in `generate_multi_turn.json` (see `"text_files"`). + +But you may use other text files if you prefer (using this specific file is not required). + +Then run the benchmarking script + +```bash +export MODEL_PATH=/models/meta-llama/Meta-Llama-3.1-8B-Instruct/ + +python benchmark_serving_multi_turn.py --model $MODEL_PATH --served-model-name Llama \ +--input-file generate_multi_turn.json --num-clients 2 --max-active-conversations 6 +``` + +You can edit the file `generate_multi_turn.json` to change the conversation parameters (number of turns, etc.). + +If successful, you will see the following output + +```bash +---------------------------------------------------------------------------------------------------- +Statistics summary: +runtime_sec = 215.810 +requests_per_sec = 0.769 +---------------------------------------------------------------------------------------------------- + count mean std min 25% 50% 75% 90% 99% max +ttft_ms 166.0 78.22 67.63 45.91 59.94 62.26 64.43 69.66 353.18 567.54 +tpot_ms 166.0 25.37 0.57 24.40 25.07 25.31 25.50 25.84 27.50 28.05 +latency_ms 166.0 2591.07 326.90 1998.53 2341.62 2573.01 2860.10 3003.50 3268.46 3862.94 +input_num_turns 166.0 7.43 4.57 1.00 3.00 7.00 11.00 13.00 17.00 17.00 +input_num_tokens 166.0 2006.20 893.56 522.00 1247.75 2019.00 2718.00 3233.00 3736.45 3899.00 +output_num_tokens 166.0 100.01 11.80 80.00 91.00 99.00 109.75 116.00 120.00 120.00 +output_num_chunks 166.0 99.01 11.80 79.00 90.00 98.00 108.75 115.00 119.00 119.00 +---------------------------------------------------------------------------------------------------- +``` + +### JSON configuration file for synthetic conversations generation + +The input flag `--input-file` is used to determine the input conversations for the benchmark.
+When the input is a JSON file with the field `"filetype": "generate_conversations"` the tool will generate synthetic multi-turn (questions and answers) conversations. + +The file `generate_multi_turn.json` is an example file. + +The file must contain the sections `prompt_input` and `prompt_output`. + +The `prompt_input` section must contain `num_turns`, `prefix_num_tokens` and `num_tokens`: + +* `num_turns` - Number of total turns in the conversation (both user & assistant).
+The final value will always be rounded to an even number so each user turn has a reply. +* `prefix_num_tokens` - Tokens added at the start of only the **first user turn** in a conversation (unique per conversation). +* `num_tokens` - Total token length of each **user** message (one turn). + +The `prompt_output` section must contain `num_tokens`: + +* `num_tokens` - Total token length of each **assistant** message (one turn). + +### Random distributions for synthetic conversations generation + +When creating an input JSON file (such as `generate_multi_turn.json`),
+every numeric field (such as `num_turns` or `num_tokens`) must be specified as a distribution object.
+The distribution determines how to randomly sample values for the field. + +The available distributions are listed below. + +**Note:** The optional `max` field (for lognormal, zipf, and poisson) can be used to cap sampled values at an upper bound.
+Can be used to make sure that the total number of tokens in every request does not exceed `--max-model-len`. + +#### constant + +```json +{ + "distribution": "constant", + "value": 500 +} +``` + +* `value` - the fixed integer value (always returns the same number). + +#### uniform + +```json +{ + "distribution": "uniform", + "min": 12, + "max": 18 +} +``` + +* `min` - minimum value (inclusive). +* `max` - maximum value (inclusive), should be equal or larger than min. + +#### lognormal + +```json +{ + "distribution": "lognormal", + "average": 1000, + "max": 5000 +} +``` + +You can parameterize the lognormal distribution in one of two ways: + +Using the average and optional median ratio: + +* `average` - target average value of the distribution. +* `median_ratio` - the ratio of the median to the average; controls the skewness. Must be in the range (0, 1). + +Using the parameters of the underlying normal distribution: + +* `mean` - mean of the underlying normal distribution. +* `sigma` - standard deviation of the underlying normal distribution. + +#### zipf + +```json +{ + "distribution": "zipf", + "alpha": 1.2, + "max": 100 +} +``` + +* `alpha` - skew parameter (> 1). Larger values produce stronger skew toward smaller integers. + +#### poisson + +```json +{ + "distribution": "poisson", + "alpha": 10, + "max": 50 +} +``` + +* `alpha` - expected value (λ). Also the variance of the distribution. + +## ShareGPT Conversations + +To run with the ShareGPT data, download the following ShareGPT dataset: +`https://huggingface.co/datasets/philschmid/sharegpt-raw/blob/main/sharegpt_20230401_clean_lang_split.json` + +Use the `convert_sharegpt_to_openai.py` script to convert the dataset to a format supported by `benchmark_serving_multi_turn.py` + +```bash +python convert_sharegpt_to_openai.py sharegpt_20230401_clean_lang_split.json sharegpt_conv_128.json --seed=99 --max-items=128 +``` + +The script will convert the ShareGPT dataset to a dataset with the standard user/assistant roles. + +The flag `--max-items=128` is used to sample 128 conversations from the original dataset (change as needed). + +Use the output JSON file `sharegpt_conv_128.json` as the `--input-file` for `benchmark_serving_multi_turn.py`. 
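As a reference for the random-distribution section above, the sketch below shows how the lognormal `average`/`median_ratio` options can be mapped onto the `mean`/`sigma` of `numpy.random.lognormal`, and how the optional `max` field caps the samples. It is only an illustration and assumes it mirrors the `LognormalDistribution` logic in `bench_dataset.py` (added below); the benchmark does this internally.

```python
# Sketch: turn {"distribution": "lognormal", "average": 1000, "max": 5000}
# into samples, using sigma^2 = 2 * ln(mean / median) and mu = ln(median).
from typing import Optional

import numpy as np


def sample_lognormal(
    average: int,
    median_ratio: float = 0.85,
    max_val: Optional[int] = None,
    size: int = 1,
) -> np.ndarray:
    median = average * median_ratio
    sigma = np.sqrt(2 * np.log(average / median))
    mu = np.log(median)
    samples = np.random.lognormal(mean=mu, sigma=sigma, size=size)
    samples *= average / samples.mean()  # rescale so the sample mean hits `average`
    if max_val is not None:
        samples = np.minimum(samples, max_val)  # apply the optional "max" cap
    return np.round(samples).astype(int)


print(sample_lognormal(average=1000, max_val=5000, size=8))
```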
diff --git a/benchmarks/multi_turn/bench_dataset.py b/benchmarks/multi_turn/bench_dataset.py new file mode 100644 index 000000000..67b937930 --- /dev/null +++ b/benchmarks/multi_turn/bench_dataset.py @@ -0,0 +1,588 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from abc import ABC, abstractmethod +from statistics import mean +from typing import Any, NamedTuple, Optional, Union + +import numpy as np # type: ignore +import pandas as pd # type: ignore +from bench_utils import ( + TEXT_SEPARATOR, + Color, + logger, +) +from transformers import AutoTokenizer # type: ignore + +# Conversation ID is a string (e.g: "UzTK34D") +ConvId = str + +# A list of dicts (dicts with keys "id" and "messages") +ShareGptConversations = list[dict[str, Any]] + +# A list of dicts (dicts with keys "role" and "content") +MessagesList = list[dict[str, str]] + +# Map conversation ID to conversation messages +ConversationsMap = list[ConvId, MessagesList] + + +class Distribution(ABC): + @abstractmethod + def sample(self, size: int = 1) -> np.ndarray: + pass + + +class UniformDistribution(Distribution): + def __init__( + self, + min_val: Union[int, float], + max_val: Union[int, float], + is_integer: bool = True, + ) -> None: + self.min_val = min_val + self.max_val = max_val + self.is_integer = is_integer + + def sample(self, size: int = 1) -> np.ndarray: + if self.is_integer: + return np.random.randint( + int(self.min_val), int(self.max_val + 1), size=size + ) + else: + return np.random.uniform(self.min_val, self.max_val, size=size) + + def __repr__(self) -> str: + return f"UniformDistribution[{self.min_val}, {self.max_val}]" + + +class ConstantDistribution(Distribution): + def __init__(self, value: Union[int, float]) -> None: + self.value = value + self.max_val = value + + def sample(self, size: int = 1) -> np.ndarray: + return np.full(shape=size, fill_value=self.value) + + def __repr__(self) -> str: + return f"Constant[{self.value}]" + + +class ZipfDistribution(Distribution): + def __init__(self, alpha: float, max_val: Optional[int] = None) -> None: + self.alpha = alpha + self.max_val = max_val + + def sample(self, size: int = 1) -> np.ndarray: + samples = np.random.zipf(self.alpha, size=size) + if self.max_val: + samples = np.minimum(samples, self.max_val) + return samples + + def __repr__(self) -> str: + return f"ZipfDistribution[{self.alpha}]" + + +class PoissonDistribution(Distribution): + def __init__(self, alpha: float, max_val: Optional[int] = None) -> None: + self.alpha = alpha + self.max_val = max_val + + def sample(self, size: int = 1) -> np.ndarray: + samples = np.random.poisson(self.alpha, size=size) + if self.max_val: + samples = np.minimum(samples, self.max_val) + return samples + + def __repr__(self) -> str: + return f"PoissonDistribution[{self.alpha}]" + + +class LognormalDistribution(Distribution): + def __init__( + self, + mean: Optional[float] = None, + sigma: Optional[float] = None, + average: Optional[int] = None, + median_ratio: Optional[float] = None, + max_val: Optional[int] = None, + ) -> None: + self.average = average + self.median_ratio = median_ratio + self.max_val = max_val + + if average is not None: + if average < 1: + raise ValueError("Lognormal average must be positive") + + if mean or sigma: + raise ValueError( + "When using lognormal average, you can't provide mean/sigma" + ) + + if self.median_ratio is None: + # Default value that provides relatively wide range of values + self.median_ratio = 0.85 + + # Calculate 
mean/sigma of np.random.lognormal based on the average + mean, sigma = self._generate_lognormal_by_median( + target_average=self.average, median_ratio=self.median_ratio + ) + else: + if mean is None or sigma is None: + raise ValueError( + "Must provide both mean and sigma if average is not used" + ) + + if mean <= 0 or sigma < 0: + raise ValueError( + "Lognormal mean must be positive and sigma must be non-negative" + ) + + # Mean and standard deviation of the underlying normal distribution + # Based on numpy.random.lognormal + self.mean = mean + self.sigma = sigma + + @staticmethod + def _generate_lognormal_by_median( + target_average: int, median_ratio: float + ) -> tuple[float, float]: + """ + Compute (mu, sigma) for a lognormal distribution given: + - a target average (mean of the distribution) + - a ratio of median / mean (controls skewness), assume mean > median + + Background: + If Z ~ Normal(mu, sigma^2), then X = exp(Z) ~ LogNormal(mu, sigma). + * mean(X) = exp(mu + sigma^2 / 2) + * median(X) = exp(mu) + + So: + median / mean = exp(mu) / exp(mu + sigma^2 / 2) + = exp(-sigma^2 / 2) + + Rearranging: + sigma^2 = 2 * ln(mean / median) + mu = ln(median) + + This gives a unique (mu, sigma) for any valid mean and median. + """ + # Check input validity: median must be smaller than mean + if median_ratio <= 0 or median_ratio >= 1: + raise ValueError("median_ratio must be in range (0, 1)") + + target_median = target_average * median_ratio + + # Solve sigma^2 = 2 * ln(mean / median) + sigma = np.sqrt(2 * np.log(target_average / target_median)) + mu = np.log(target_median) + + return mu, sigma + + def sample(self, size: int = 1) -> np.ndarray: + samples = np.random.lognormal(mean=self.mean, sigma=self.sigma, size=size) + + if self.average is not None: + # Scale to average + samples *= self.average / samples.mean() + + if self.max_val: + samples = np.minimum(samples, self.max_val) + + return np.round(samples).astype(int) + + def __repr__(self) -> str: + if self.average: + return ( + f"LognormalDistribution[{self.average}, " + f"{self.median_ratio}, {self.max_val}]" + ) + return f"LognormalDistribution[{self.mean}, {self.sigma}, {self.max_val}]" + + +class GenConvArgs(NamedTuple): + num_conversations: int + text_files: list[str] + input_num_turns: Distribution + input_common_prefix_num_tokens: Distribution + input_prefix_num_tokens: Distribution + input_num_tokens: Distribution + output_num_tokens: Distribution + print_stats: bool + + +def verify_field_exists( + conf: dict, field_name: str, section: str, subsection: str +) -> None: + if field_name not in conf: + raise ValueError( + f"Missing field '{field_name}' in {section=} and {subsection=}" + ) + + +def get_random_distribution( + conf: dict, section: str, subsection: str, optional: bool = False +) -> Distribution: + # section can be "prompt_input" or "prompt_output" (both required) + conf = conf[section] + + if optional and subsection not in conf: + # Optional subsection, if not found assume the value is always 0 + return ConstantDistribution(0) + + # subsection can be "num_turns", "num_tokens" or "prefix_num_tokens" + if subsection not in conf: + raise ValueError(f"Missing subsection {subsection} in section {section}") + + conf = conf[subsection] + + distribution = conf.get("distribution") + if distribution is None: + raise ValueError( + f"Missing field 'distribution' in {section=} and {subsection=}" + ) + + if distribution == "constant": + verify_field_exists(conf, "value", section, subsection) + return ConstantDistribution(conf["value"]) 
+ + elif distribution == "zipf": + verify_field_exists(conf, "alpha", section, subsection) + max_val = conf.get("max", None) + return ZipfDistribution(conf["alpha"], max_val=max_val) + + elif distribution == "poisson": + verify_field_exists(conf, "alpha", section, subsection) + max_val = conf.get("max", None) + return PoissonDistribution(conf["alpha"], max_val=max_val) + + elif distribution == "lognormal": + max_val = conf.get("max", None) + + if "average" in conf: + # Infer lognormal mean/sigma (numpy) from input average + median_ratio = conf.get("median_ratio", None) + return LognormalDistribution( + average=conf["average"], median_ratio=median_ratio, max_val=max_val + ) + + # Use mean/sigma directly (for full control over the distribution) + verify_field_exists(conf, "mean", section, subsection) + verify_field_exists(conf, "sigma", section, subsection) + return LognormalDistribution( + mean=conf["mean"], sigma=conf["sigma"], max_val=max_val + ) + + elif distribution == "uniform": + verify_field_exists(conf, "min", section, subsection) + verify_field_exists(conf, "max", section, subsection) + + min_value = conf["min"] + max_value = conf["max"] + + assert min_value > 0 + assert min_value <= max_value + + is_integer = isinstance(min_value, int) and isinstance(max_value, int) + return UniformDistribution(min_value, max_value, is_integer) + else: + raise ValueError(f"Unknown distribution: {distribution}") + + +def parse_input_json_file(conf: dict) -> GenConvArgs: + # Validate the input file + assert isinstance(conf, dict) + required_fields = [ + "filetype", + "num_conversations", + "text_files", + "prompt_input", + "prompt_output", + ] + for field in required_fields: + assert field in conf, f"Missing field {field} in input {conf}" + + assert conf["filetype"] == "generate_conversations" + + assert conf["num_conversations"] > 0, "num_conversations should be larger than zero" + + text_files = conf["text_files"] + + assert isinstance(text_files, list), "Field 'text_files' should be a list" + assert len(text_files) > 0, ( + "Field 'text_files' should be a list with at least one file" + ) + + # Parse the parameters for the prompt input/output workload + input_num_turns = get_random_distribution(conf, "prompt_input", "num_turns") + input_num_tokens = get_random_distribution(conf, "prompt_input", "num_tokens") + input_common_prefix_num_tokens = get_random_distribution( + conf, "prompt_input", "common_prefix_num_tokens", optional=True + ) + input_prefix_num_tokens = get_random_distribution( + conf, "prompt_input", "prefix_num_tokens" + ) + output_num_tokens = get_random_distribution(conf, "prompt_output", "num_tokens") + + print_stats: bool = conf.get("print_stats", False) + assert isinstance(print_stats, bool), ( + "Field 'print_stats' should be either 'true' or 'false'" + ) + + args = GenConvArgs( + num_conversations=conf["num_conversations"], + text_files=text_files, + input_num_turns=input_num_turns, + input_common_prefix_num_tokens=input_common_prefix_num_tokens, + input_prefix_num_tokens=input_prefix_num_tokens, + input_num_tokens=input_num_tokens, + output_num_tokens=output_num_tokens, + print_stats=print_stats, + ) + return args + + +def print_conv_stats(conversations: ConversationsMap, tokenizer: AutoTokenizer) -> None: + # Collect statistics + conv_stats: list[dict[Any, Any]] = [] + req_stats: list[int] = [] + + print("\nCollecting statistics...") + for messages in conversations.values(): + # messages is a list of dicts + user_tokens: list[int] = [] + assistant_tokens: list[int] = [] + 
request_tokens: list[int] = [] + + req_tokens = 0 + for m in messages: + content = m["content"] + num_tokens = len(tokenizer(content).input_ids) + + if m["role"] == "user": + user_tokens.append(num_tokens) + # New user prompt including all chat history + req_tokens += num_tokens + request_tokens.append(req_tokens) + + elif m["role"] == "assistant": + assistant_tokens.append(num_tokens) + # Update assistant answer + # (will be part of chat history for the next user prompt) + req_tokens += num_tokens + + item_stats = { + "conversation_turns": len(messages), + "user_tokens": mean(user_tokens), + "assistant_tokens": mean(assistant_tokens), + } + + conv_stats.append(item_stats) + req_stats.extend(request_tokens) + + # Print statistics + percentiles = [0.25, 0.5, 0.75, 0.9, 0.99] + + print(TEXT_SEPARATOR) + print(f"{Color.YELLOW}Conversations statistics:{Color.RESET}") + print(TEXT_SEPARATOR) + df = pd.DataFrame(conv_stats) + print(df.describe(percentiles=percentiles).transpose()) + print(TEXT_SEPARATOR) + print(f"{Color.YELLOW}Request statistics:{Color.RESET}") + print(TEXT_SEPARATOR) + df = pd.DataFrame(req_stats, columns=["request_tokens"]) + print(df.describe(percentiles=percentiles).transpose()) + print(TEXT_SEPARATOR) + + +def generate_conversations( + args: GenConvArgs, tokenizer: AutoTokenizer +) -> ConversationsMap: + # Text for all user prompts + # (text from the input text files will be appended to this line) + base_prompt_text = "Please rewrite the following text and add more content: " + base_prompt_token_count = len( + tokenizer.encode(base_prompt_text, add_special_tokens=False) + ) + + logger.info(f"{Color.PURPLE}Generating conversations...{Color.RESET}") + logger.info(args) + + list_of_tokens = [] + + for filename in args.text_files: + # Load text file that will be used to generate prompts + with open(filename) as file: + data = file.read() + tokens_in_file = tokenizer.encode(data, add_special_tokens=False) + list_of_tokens.extend(tokens_in_file) + + conversations: ConversationsMap = {} + conv_id = 0 + + # Generate number of turns for every conversation + turn_count: np.ndarray = args.input_num_turns.sample(args.num_conversations) + + # Turn count should be at least 2 (one user prompt and one assistant answer) + turn_count = np.maximum(turn_count, 2) + + # Round up to an even number (every user prompt should have an answer) + turn_count = turn_count + (turn_count % 2) + + # Generate number of prefix tokens for every conversation + conv_prefix_tokens: np.ndarray = args.input_prefix_num_tokens.sample( + args.num_conversations + ) + + # Used to reduce shared text between conversations + # (jump/skip over text sections between conversations) + base_offset = 0 + + # Common prefix size for all conversations (only 1 sample required) + common_prefix_text = "" + common_prefix_tokens: int = args.input_common_prefix_num_tokens.sample(1)[0] + if common_prefix_tokens > 0: + # Using "." at the end to separate sentences + common_prefix_text = ( + tokenizer.decode(list_of_tokens[: common_prefix_tokens - 2]) + "." 
+ ) + base_offset += common_prefix_tokens + + for conv_id in range(args.num_conversations): + # Generate a single conversation + messages: MessagesList = [] + + nturns = turn_count[conv_id] + + # User prompt token count per turn (with lower limit) + input_token_count: np.ndarray = args.input_num_tokens.sample(nturns) + input_token_count = np.maximum(input_token_count, base_prompt_token_count) + + # Assistant answer token count per turn (with lower limit) + output_token_count: np.ndarray = args.output_num_tokens.sample(nturns) + output_token_count = np.maximum(output_token_count, 1) + + user_turn = True + for turn_id in range(nturns): + if user_turn: + role = "user" + num_tokens = input_token_count[turn_id] + + # Generate the user prompt, + # use a unique prefix (the conv_id) for each conversation + # (to avoid shared prefix between conversations) + content = f"{conv_id} is a nice number... " + + if len(common_prefix_text) > 0 and turn_id == 0: + content = common_prefix_text + content + + # Update the number of tokens left for the content + num_tokens -= len(tokenizer.encode(content, add_special_tokens=False)) + + if turn_id == 0: + prefix_num_tokens = conv_prefix_tokens[conv_id] + if prefix_num_tokens > 0: + # Add prefix text (context) to the first turn + start_offset = base_offset + end_offset = start_offset + prefix_num_tokens + assert len(list_of_tokens) > end_offset, ( + "Not enough input text to generate " + f"{prefix_num_tokens} tokens for the " + f"prefix text ({start_offset=}, {end_offset=})" + ) + + content += f"{conv_id}, " + tokenizer.decode( + list_of_tokens[start_offset:end_offset] + ) + base_offset += prefix_num_tokens + + # Add the actual user prompt/question after the prefix text + content += base_prompt_text + num_tokens -= base_prompt_token_count + + if num_tokens > 0: + # Add text from the input file (to reach the desired token count) + start_offset = base_offset + turn_id * input_token_count.max() + end_offset = start_offset + num_tokens + assert len(list_of_tokens) > end_offset, ( + f"Not enough input text to generate {num_tokens} tokens " + f"for the prompt ({start_offset=}, {end_offset=})" + ) + + # Convert tokens back to text + content += tokenizer.decode(list_of_tokens[start_offset:end_offset]) + else: + role = "assistant" + # This content will not be used as input to the LLM server + # (actual answers will be used instead). + # Content is only required to determine the min_tokens/max_tokens + # (inputs to the LLM server). 
+ num_tokens = output_token_count[turn_id] + assert len(list_of_tokens) > num_tokens, ( + f"Not enough input text to generate {num_tokens} " + "tokens for assistant content" + ) + content = tokenizer.decode(list_of_tokens[:num_tokens]) + + # Append the user/assistant message to the list of messages + messages.append({"role": role, "content": content}) + user_turn = not user_turn + + # Add the new conversation + conversations[f"CONV_ID_{conv_id}"] = messages + + # Increase base offset for the next conversation + base_offset += nturns + + if args.print_stats: + print_conv_stats(conversations, tokenizer) + + return conversations + + +def conversations_list_to_dict(input_list: ShareGptConversations) -> ConversationsMap: + conversations: ConversationsMap = {} + + for item in input_list: + conv_id: str = item["id"] + assert isinstance(conv_id, str) + + assert conv_id not in conversations, ( + f"Conversation ID {conv_id} found more than once in the input" + ) + + messages: MessagesList = item["messages"] + assert isinstance(messages, list), ( + f"Conversation messages should be a list (ID: {conv_id})" + ) + assert len(messages) > 0, f"Conversation with no messages (ID: {conv_id})" + + conversations[conv_id] = messages + + logger.info(f"Using {len(conversations)} unique conversations (IDs)") + assert len(conversations) == len(input_list) + + # Print statistics about the selected conversations + stats: list[dict[str, Any]] = [] + for conv_data in conversations.values(): + stats.append({"num_turns": len(conv_data)}) + + print(TEXT_SEPARATOR) + print(f"{Color.YELLOW}Conversations statistics:{Color.RESET}") + print(TEXT_SEPARATOR) + percentiles = [0.25, 0.5, 0.75, 0.9, 0.99, 0.999, 0.9999] + conv_stats = pd.DataFrame(stats).describe(percentiles=percentiles) + print(conv_stats.transpose()) + print(TEXT_SEPARATOR) + + return conversations + + +def conversations_dict_to_list(input_dict: ConversationsMap) -> ShareGptConversations: + output: ShareGptConversations = [] + for conv_id, conv_data in input_dict.items(): + new_item = {"id": conv_id, "messages": conv_data} + output.append(new_item) + + return output diff --git a/benchmarks/multi_turn/bench_utils.py b/benchmarks/multi_turn/bench_utils.py new file mode 100644 index 000000000..e959a4be7 --- /dev/null +++ b/benchmarks/multi_turn/bench_utils.py @@ -0,0 +1,28 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import logging +from enum import Enum + + +class Color(Enum): + RED = "\033[91m" + GREEN = "\033[92m" + BLUE = "\033[94m" + PURPLE = "\033[95m" + CYAN = "\033[96m" + YELLOW = "\033[93m" + RESET = "\033[0m" + + def __str__(self): + return self.value + + +TEXT_SEPARATOR = "-" * 100 + +# Configure the logger +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s [%(levelname)s] - %(message)s", + datefmt="%d-%m-%Y %H:%M:%S", +) +logger = logging.getLogger(__name__) diff --git a/benchmarks/multi_turn/benchmark_serving_multi_turn.py b/benchmarks/multi_turn/benchmark_serving_multi_turn.py new file mode 100644 index 000000000..66d85eaf5 --- /dev/null +++ b/benchmarks/multi_turn/benchmark_serving_multi_turn.py @@ -0,0 +1,1569 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import argparse +import asyncio +import json +import logging +import multiprocessing as mp +import os +import random +import time +from collections import Counter, deque +from datetime import datetime +from enum import Enum +from http import HTTPStatus 
+from statistics import mean +from typing import NamedTuple, Optional, Union + +import aiohttp # type: ignore +import numpy as np # type: ignore +import pandas as pd # type: ignore +from bench_dataset import ( + ConversationsMap, + ConvId, + GenConvArgs, + MessagesList, + ShareGptConversations, + conversations_dict_to_list, + conversations_list_to_dict, + generate_conversations, + parse_input_json_file, +) +from bench_utils import TEXT_SEPARATOR, Color, logger +from transformers import AutoTokenizer # type: ignore + +NUM_TOKENS_FROM_DATASET = 0 +TERM_SIGNAL = None + + +class ConversationSampling(str, Enum): + ROUND_ROBIN = "round_robin" + RANDOM = "random" + + def __str__(self): + return self.value + + +class ClientArgs(NamedTuple): + seed: int + max_num_requests: Optional[int] + skip_first_turn: bool + max_turns: Optional[int] + max_active_conversations: int + verbose: bool + print_content: bool + verify_output: bool + conversation_sampling: ConversationSampling + request_rate: float + + +class RequestArgs(NamedTuple): + chat_url: str + model: str + stream: bool + limit_min_tokens: int # Use negative value for no limit + limit_max_tokens: int # Use negative value for no limit + + +class BenchmarkArgs(NamedTuple): + url: str + num_clients: int + early_stop: bool + + +class ServerResponse(NamedTuple): + valid: bool + ttft_ms: float # time to first chunk + tpot_ms: float # time per output chunk (one or more tokens) + latency_ms: float + start_time_ms: float + first_chunk: str # first chunk of the content + content: str # includes the first_chunk + num_chunks: int + + def __str__(self) -> str: + return f"ttft_ms {self.ttft_ms:.2f}, tpot_ms {self.tpot_ms:.2f}, latency_ms {self.latency_ms:.2f}" # noqa: E501 + + +class RequestStats(NamedTuple): + ttft_ms: float + tpot_ms: float + latency_ms: float + start_time_ms: float + input_num_turns: int + input_num_tokens: int + output_num_tokens: int + output_num_chunks: int + output_num_first_chunk_tokens: int + approx_cached_percent: float + conversation_id: str + client_id: int + + def __str__(self) -> str: + return ( + f"ttft_ms {self.ttft_ms:.2f}, tpot_ms {self.tpot_ms:.2f}, latency_ms {self.latency_ms:.2f}, input_num_tokens {self.input_num_tokens}, " # noqa: E501 + f"output_num_tokens {self.output_num_tokens} ({self.output_num_chunks} chunks, {self.output_num_first_chunk_tokens} tokens in first chunk), " # noqa: E501 + f"approx_cached_percent {self.approx_cached_percent:.2f}%" + ) + + +class MetricStats: + def __init__(self) -> None: + self.min: Optional[float] = None + self.max: Optional[float] = None + self.avg: Optional[float] = None + self.sum = 0.0 + self.count = 0 + + def update(self, value: float) -> None: + if self.min is None: + self.min = value + else: + self.min = min(self.min, value) + + if self.max is None: + self.max = value + else: + self.max = max(self.max, value) + + self.sum += value + self.count += 1 + self.avg = self.sum / self.count + + def __repr__(self) -> str: + if self.count == 0: + return "no data" + return f"avg: {self.avg:>10.3f}, min: {self.min:>10.3f}, max: {self.max:>10.3f}" + + +class MovingAverage: + def __init__(self, window_size: int) -> None: + self.window_size = window_size + self.window = np.zeros(window_size) + self.index = 0 + self.sum = 0.0 + self.count = 0 + self.avg: Optional[float] = None + + def update(self, new_value: float) -> None: + if self.count < self.window_size: + # Filling up the window + self.sum += new_value + self.window[self.count] = new_value + self.count += 1 + else: + # Window is full, 
start replacing old values + old_value = self.window[self.index] + self.sum = self.sum - old_value + new_value + self.window[self.index] = new_value + self.index = (self.index + 1) % self.window_size + + self.avg = self.sum / self.count + + def __repr__(self) -> str: + if self.count == 0: + return "no data" + return f"avg: {self.avg:>10.3f} ({self.count} samples)" + + +class DebugStats: + def __init__(self, logger: logging.Logger, window_size: int) -> None: + self.logger = logger + self.metrics: dict[str, Union[MovingAverage, MetricStats]] = { + "moving_avg_ttft_ms": MovingAverage(window_size), + "moving_avg_tpot_ms": MovingAverage(window_size), + "ttft_ms": MetricStats(), + "tpot_ms": MetricStats(), + "latency_ms": MetricStats(), + "input_num_turns": MetricStats(), + "input_num_tokens": MetricStats(), + "output_num_tokens": MetricStats(), + } + + def update(self, data: RequestStats) -> None: + self.metrics["ttft_ms"].update(data.ttft_ms) + self.metrics["moving_avg_ttft_ms"].update(data.ttft_ms) + self.metrics["tpot_ms"].update(data.tpot_ms) + self.metrics["moving_avg_tpot_ms"].update(data.tpot_ms) + self.metrics["latency_ms"].update(data.latency_ms) + self.metrics["input_num_turns"].update(data.input_num_turns) + self.metrics["input_num_tokens"].update(data.input_num_tokens) + self.metrics["output_num_tokens"].update(data.output_num_tokens) + + def print(self) -> None: + self.logger.info("-" * 50) + for k, v in self.metrics.items(): + kv_info = f"[{k:25}] {v}" + self.logger.info(kv_info) + self.logger.info("-" * 50) + + +# Must support Python 3.8, we can't use str.removeprefix(prefix) +# introduced in Python 3.9 +def remove_prefix(text: str, prefix: str) -> str: + if text.startswith(prefix): + return text[len(prefix) :] + return text + + +def nanosec_to_millisec(value: float) -> float: + return value / 1000000.0 + + +def nanosec_to_sec(value: float) -> float: + return value / 1000000000.0 + + +async def send_request( + session: aiohttp.ClientSession, + messages: list[dict[str, str]], + chat_url: str, + model: str, + stream: bool = True, + min_tokens: Optional[int] = None, + max_tokens: Optional[int] = None, +) -> ServerResponse: + payload = { + "model": model, + "messages": messages, + "seed": 0, + "temperature": 0.0, + } + + if stream: + payload["stream"] = True + payload["stream_options"] = {"include_usage": False} + + if min_tokens is not None: + payload["min_tokens"] = min_tokens + + if max_tokens is not None: + payload["max_tokens"] = max_tokens + + headers = {"Content-Type": "application/json"} + + # Calculate the timeout for the request + timeout_sec = 120 + if max_tokens is not None: + # Assume TPOT of 200ms and use max_tokens to determine timeout + timeout_sec = max(timeout_sec, int(max_tokens * 0.2)) + timeout = aiohttp.ClientTimeout(total=timeout_sec) + + valid_response = True + ttft: Optional[float] = None + chunk_delay: list[int] = [] + latency: Optional[float] = None + first_chunk = "" + generated_text = "" + + start_time: int = time.perf_counter_ns() + most_recent_timestamp: int = start_time + + async with session.post( + url=chat_url, json=payload, headers=headers, timeout=timeout + ) as response: + http_status = HTTPStatus(response.status) + if http_status == HTTPStatus.OK: + async for chunk_bytes in response.content: + chunk_bytes = chunk_bytes.strip() + if not chunk_bytes: + continue + + chunk = remove_prefix(chunk_bytes.decode("utf-8"), "data: ") + if chunk == "[DONE]": + # End of stream + latency = time.perf_counter_ns() - start_time + elif stream is False: + data = 
json.loads(chunk) + message = data["choices"][0]["message"] + assert message["role"] == "assistant" + generated_text += message["content"] + else: + timestamp: int = time.perf_counter_ns() + data = json.loads(chunk) + + # Delta is the new content/text/data + delta = data["choices"][0]["delta"] + if delta.get("content", None): + if ttft is None: + # First token + first_token_time = time.perf_counter_ns() + ttft = first_token_time - start_time + first_chunk = delta["content"] + else: + # Decoding phase + chunk_delay.append(timestamp - most_recent_timestamp) + + generated_text += delta["content"] + + most_recent_timestamp = timestamp + else: + valid_response = False + content = await response.text() + logger.warning( + f"{Color.YELLOW}Received HTTP status {http_status.value} " + f"({http_status.phrase}): {content}{Color.RESET}" + ) + + if latency is None: + latency = -1.0 + if valid_response: + # Streaming is disabled, latency was not set + latency = time.perf_counter_ns() - start_time + + if ttft is None: + # The response was a single chunk + ttft = latency + + # Each chunk may include more than one token + tpot: float = mean(chunk_delay) if len(chunk_delay) > 0 else 0.0 + num_chunks: int = len(chunk_delay) + + sr = ServerResponse( + valid=valid_response, + ttft_ms=nanosec_to_millisec(ttft) if ttft > 0.0 else -1.0, + tpot_ms=nanosec_to_millisec(tpot), + latency_ms=nanosec_to_millisec(latency), + start_time_ms=nanosec_to_millisec(start_time), + first_chunk=first_chunk, + content=generated_text, + num_chunks=num_chunks, + ) + return sr + + +def get_short_string(input: str) -> str: + n = 20 + if len(input) < 400: + return input + + return f"{input[:n]}...{input[-n:]}" + + +def get_token_count(tokenizer: AutoTokenizer, text: str) -> int: + return len(tokenizer(text, add_special_tokens=False).input_ids) + + +def get_messages_token_count( + tokenizer: AutoTokenizer, messages: list[dict[str, str]] +) -> int: + token_count = 0 + for m in messages: + token_count += get_token_count(tokenizer, m["content"]) + + return token_count + + +async def send_turn( + session: aiohttp.ClientSession, + client_id: int, + conv_id: str, + conversation_messages: MessagesList, + messages_to_use: int, + tokenizer: AutoTokenizer, + req_args: RequestArgs, + verbose: bool, + verify_output: bool, +) -> Optional[RequestStats]: + assert messages_to_use > 0 + assert messages_to_use <= len(conversation_messages) + + messages = conversation_messages[:messages_to_use] + + # Index of the next message (the role should be "user") + index = messages_to_use - 1 + + # Verify that the message has only two keys, "role" and "content" + assert len(messages[index].keys()) == 2 + assert "role" in messages[index] and "content" in messages[index] + assert messages[index]["role"] == "user", ( + f"Failed on conversation ID {conv_id}, message role should be user" + ) + + if verbose: + print( + f"{Color.CYAN}Messages (conversation ID {conv_id}," + f" {len(messages)} turns):{Color.RESET}", + messages, + ) + + # None means that there is no upper/lower limit for the output token count + min_tokens = None if req_args.limit_min_tokens < 0 else req_args.limit_min_tokens + max_tokens = None if req_args.limit_max_tokens < 0 else req_args.limit_max_tokens + + if len(conversation_messages) > messages_to_use: + # The conversation contains an assistant answer for the next user prompt + if ( + min_tokens == NUM_TOKENS_FROM_DATASET + or max_tokens == NUM_TOKENS_FROM_DATASET + ): + # Compute number of tokens in the answer (from the input conversation) + 
assistant_answer = conversation_messages[messages_to_use] + answer_num_tokens = get_token_count(tokenizer, assistant_answer["content"]) + assert assistant_answer["role"] == "assistant" + + if min_tokens == NUM_TOKENS_FROM_DATASET: + min_tokens = max(1, answer_num_tokens) + + if max_tokens == NUM_TOKENS_FROM_DATASET: + max_tokens = max(1, answer_num_tokens) + + # Send the current conversation to LLM and get a response + response: ServerResponse = await send_request( + session, + messages, + req_args.chat_url, + req_args.model, + req_args.stream, + min_tokens, + max_tokens, + ) + + if response.valid is False: + # Request failed + return None + + # Compute number of tokens in input / output + input_num_tokens = get_messages_token_count(tokenizer, messages) + + # Num tokens in the user's last question + question_num_tokens = get_token_count(tokenizer, messages[index]["content"]) + + # Num tokens in the history/context of the question + assert input_num_tokens >= question_num_tokens + history_num_tokens = input_num_tokens - question_num_tokens + + # Num tokens in the LLM's answer (first chunk and full answer) + first_chunk_tokens = get_token_count(tokenizer, response.first_chunk) + + output_content = response.content + output_num_tokens = get_token_count(tokenizer, output_content) + + # Prefix caching approximated cached percent + approx_cached_percent = ( + 100.0 * (history_num_tokens / input_num_tokens) if input_num_tokens > 0 else 0.0 + ) + + # Compute the correct TTFT and TPOT (based on tokens and not chunks). + # Required because multiple output tokens may be bundled in a single chunk. + if output_num_tokens > 1 and output_num_tokens > first_chunk_tokens: + # More than one token and more than one chunk in the output + decode_ms = response.latency_ms - response.ttft_ms + decode_num_tokens = output_num_tokens - first_chunk_tokens + tpot_ms = decode_ms / decode_num_tokens + else: + # In this case: output_num_tokens == first_chunk_tokens + # Output was a single chunk (output_num_tokens > 1) + # or even a single token (output_num_tokens == 1) + tpot_ms = 0.0 + + if first_chunk_tokens > 1: + # First chunk had multiple tokens, adjust TTFT for a single token + delta_ms = (first_chunk_tokens - 1) * tpot_ms + ttft_ms = max(0.1, response.ttft_ms - delta_ms) + else: + # First chunk had only one token + ttft_ms = response.ttft_ms + + rs = RequestStats( + ttft_ms=ttft_ms, + tpot_ms=tpot_ms, + latency_ms=response.latency_ms, + start_time_ms=response.start_time_ms, + input_num_turns=len(messages), + input_num_tokens=input_num_tokens, + output_num_tokens=output_num_tokens, + output_num_chunks=response.num_chunks, + output_num_first_chunk_tokens=first_chunk_tokens, + approx_cached_percent=approx_cached_percent, + conversation_id=conv_id, + client_id=client_id, + ) + + if verbose: + print( + f"\n{Color.YELLOW}Response ({output_num_tokens} tokens):{Color.RESET}", + output_content, + ) + print(f"{Color.YELLOW}Response metrics: {rs}{Color.RESET}") + print("-" * 70) + + # Save the LLM's answer (will be used as part of the context for the next user turn) + answer_index = messages_to_use + if len(conversation_messages) > answer_index: + assert conversation_messages[answer_index]["role"] == "assistant", ( + f"Failed on conversation ID {conv_id}, message role should be assistant" + ) + + orig_content = conversation_messages[answer_index]["content"] + if verify_output: + # Compare the new answer to the answer from the input file + debug_info = ( + f"LLM/dataset answers do not match ({conv_id}):" + 
f"\n'{get_short_string(output_content)}' (len: {len(output_content)})," + f"\n'{get_short_string(orig_content)}' (len: {len(orig_content)})" + ) + if orig_content != output_content: + raise ValueError(debug_info) + + # Update the answer + conversation_messages[answer_index]["content"] = output_content + else: + # A user prompt that has no answer, add the answer as a new message + new_answer = {"role": "assistant", "content": output_content} + conversation_messages.append(new_answer) + + return rs + + +async def poisson_sleep(request_rate: float, verbose: bool = False) -> None: + # Generate a random time interval from the Poisson distribution + assert request_rate > 0 + + interval = np.random.exponential(1.0 / request_rate) + if verbose: + logger.info(f"Sleeping for {interval:.3f} seconds...") + await asyncio.sleep(interval) + + +async def client_main( + args: ClientArgs, + req_args: RequestArgs, + client_id: int, + tokenizer: AutoTokenizer, + stop_event: mp.Event, # type: ignore + task_queue: mp.Queue, + result_queue: mp.Queue, + conv_queue: mp.Queue, +) -> None: + logger.info( + f"{Color.CYAN}Started client {client_id}: max_num_requests={args.max_num_requests}, max_active_conversations={args.max_active_conversations}{Color.RESET}" # noqa: E501 + ) + + random.seed(args.seed) + np.random.seed(args.seed) + + # Active conversations + active_convs: ConversationsMap = {} + conv_id_queue: deque = deque(maxlen=args.max_active_conversations) + + # Keep track of how many messages have been used for each conversation + turns_count: Counter = Counter() + num_successes = 0 + num_failures = 0 + + # Track the timestamp (time.perf_counter()) + # of the last turn per conversation (only for debug) + time_of_last_turn: dict[ConvId, float] = {} + + # Flag that indicates that there are no new tasks (conversations) for the client + task_queue_empty = False + + async with aiohttp.ClientSession() as session: + # Print progress + + while task_queue_empty is False: + result = None + + if ( + args.max_num_requests + and num_successes + num_failures == args.max_num_requests + ): + logger.info( + f"{Color.YELLOW}Client {client_id} reached " + f"request limit{Color.RESET}" + ) + break + + if stop_event.is_set(): # type: ignore + logger.info( + f"{Color.YELLOW}Client {client_id} received " + f"a termination signal{Color.RESET}" + ) + break + + while ( + len(active_convs) < args.max_active_conversations + and task_queue_empty is False + ): + # Get a new conversation from the task queue + conv_id, messages = task_queue.get() + + if conv_id is TERM_SIGNAL: + task_queue_empty = True + break + + if args.skip_first_turn: + # Skip the first turn (both user and assistant), + # relevant if warmup was enabled. + # Default turns_count[conv_id] will be zero if conv_id + # was never inserted/updated in turns_count. 
+ turns_count[conv_id] += 2 + + if turns_count[conv_id] < len(messages): + # Add new conversation + active_convs[conv_id] = messages + conv_id_queue.append(conv_id) + + if args.verbose: + logger.info( + f"{Color.GREEN}Client {client_id} will use conversation ID {conv_id} (active conversations {len(active_convs)}){Color.RESET}" # noqa: E501 + ) + + elif args.verbose: + # No more messages (conversation finished during the warmup) + logger.info( + f"{Color.YELLOW}Client {client_id} will not use conversation ID {conv_id} (all {len(messages)} messages already sent){Color.RESET}" # noqa: E501 + ) + + if len(active_convs) == 0 or task_queue_empty: + logger.info( + f"{Color.YELLOW}Client {client_id} has no more work{Color.RESET}" + ) + break + + # Pick an active conversation for the next request + if args.conversation_sampling == ConversationSampling.ROUND_ROBIN: + conv_id = conv_id_queue.pop() + else: + # ConversationSampling.RANDOM + active_ids = list(active_convs.keys()) + conv_id = random.choice(active_ids) + + messages = active_convs[conv_id] + assert isinstance(messages, list) and len(messages) > 0 + + # Update the amount of messages to use + turns_count[conv_id] += 1 + current_turn = turns_count[conv_id] + + assert current_turn < len(messages), ( + f"Turn number {current_turn} is invalid for conversation ID {conv_id}" + f" that has only {len(messages)} messages" + ) + + if args.verbose: + curr_time_sec: float = time.perf_counter() + time_since_last_turn: Union[str, float] = "N/A" + if conv_id in time_of_last_turn: + time_since_last_turn = round( + curr_time_sec - time_of_last_turn[conv_id], 3 + ) + logger.info( + f"Client {client_id} using conversation ID {conv_id} (turn: {current_turn}, time since last turn [sec]: {time_since_last_turn})" # noqa: E501 + ) + time_of_last_turn[conv_id] = curr_time_sec + + success = True + try: + result = await send_turn( + session, + client_id, + conv_id, + messages, + current_turn, + tokenizer, + req_args, + args.print_content, + args.verify_output, + ) + if result is not None: + result_queue.put(result) + else: + # None means that the request failed, + # and should not be added to the statistics. 
+ success = False + num_failures += 1 + + logger.warning( + f"{Color.YELLOW}Client {client_id} - Request rejected during conversation ID {conv_id} (turn: {current_turn}){Color.RESET}" # noqa: E501 + ) + + # Remove the conversation (should not be used again) + active_convs.pop(conv_id) + + except asyncio.exceptions.TimeoutError: + num_failures += 1 + logger.exception( + f"{Color.RED}Client {client_id} - Timeout during conversation ID {conv_id} (turn: {current_turn}){Color.RESET}" # noqa: E501 + ) + break # Exit gracefully instead of raising an error + + except Exception: + num_failures += 1 + logger.exception( + f"{Color.RED}Client {client_id} - Exception during conversation ID {conv_id} (turn: {current_turn}){Color.RESET}" # noqa: E501 + ) + break # Exit gracefully instead of raising an error + + if success: + num_successes += 1 + + # Update the turns counter to include the LLM response + # The LLM response will be used as context for the next user turn + turns_count[conv_id] += 1 + + max_turns = len(messages) + if args.max_turns is not None: + # Limit the number of turns in the conversation + max_turns = min(args.max_turns, max_turns) + + if turns_count[conv_id] >= max_turns: + # Conversation has no more turns (no longer active) + # save the updated conversation (with the LLM server's answer) + conv_queue.put((conv_id, active_convs.pop(conv_id))) + if args.verbose: + logger.info( + f"{Color.GREEN}Client {client_id} finished " + f"conversation ID {conv_id}{Color.RESET}" + ) + else: + # Conversation is not finished, insert it at the back of the queue + conv_id_queue.appendleft(conv_id) + + # Sleep between requests (if lambda is positive) + if args.request_rate > 0: + await poisson_sleep(args.request_rate, args.verbose) + + # Send indication that the client is done + conv_queue.put((TERM_SIGNAL, TERM_SIGNAL)) + + logger.info( + f"{Color.CYAN}Client {client_id} is done " + f"({num_successes=}, {num_failures=}){Color.RESET}" + ) + + +def worker_function( + client_id: int, + tokenizer: AutoTokenizer, + client_args: ClientArgs, + req_args: RequestArgs, + stop_event: mp.Event, # type: ignore + task_queue: mp.Queue, + result_queue: mp.Queue, + conv_queue: mp.Queue, +) -> None: + asyncio.run( + client_main( + client_args, + req_args, + client_id, + tokenizer, + stop_event, + task_queue, + result_queue, + conv_queue, + ) + ) + + +def get_client_config( + args: argparse.Namespace, input_conv: ConversationsMap +) -> tuple[ClientArgs, RequestArgs]: + if args.num_clients < 1: + raise ValueError("Number of clients must be a positive number") + + if len(input_conv) < args.num_clients: + raise ValueError( + "Number of conversations must be equal or larger than the number of clients" + ) + + max_req_per_client: Optional[int] = None + if args.max_num_requests is not None: + # Max number of requests per client + req_per_client = args.max_num_requests // args.num_clients + if req_per_client < 1: + raise ValueError("Number of requests should be at least one per client") + max_req_per_client = req_per_client + + max_active_conversations = args.max_active_conversations + if max_active_conversations is None: + # Each client will have only one active conversation at a time + max_active_conversations = args.num_clients + + if max_active_conversations > len(input_conv): + raise ValueError( + f"Max active conversations {max_active_conversations} " + "must be equal or less than the total number of conversations" + ) + + # Max number of active conversations per client + max_active_conv_per_client = 
max_active_conversations // args.num_clients + if max_active_conv_per_client < 1: + raise ValueError( + f"Max active conversations {max_active_conversations} " + "must be equal or greater than the number of clients" + ) + + # Skip the first user turn (as part of the warmup) + skip_first_turn = args.warmup_step + + # Common arguments for all clients + client_args = ClientArgs( + seed=args.seed, + max_num_requests=max_req_per_client, + skip_first_turn=skip_first_turn, + max_turns=args.max_turns, + max_active_conversations=max_active_conv_per_client, + verbose=args.verbose, + print_content=args.print_content, + verify_output=args.verify_output, + conversation_sampling=args.conversation_sampling, + request_rate=args.request_rate, + ) + + if args.limit_min_tokens > 0 or args.limit_max_tokens > 0: + if args.limit_min_tokens < 1 or args.limit_max_tokens < 1: + raise ValueError( + "Invalid min/max tokens limits (both limits should be provided)" + ) + if args.limit_min_tokens > args.limit_max_tokens: + raise ValueError( + "Invalid min/max tokens limits (min should not be larger than max)" + ) + + # Arguments for API requests + chat_url = f"{args.url}/v1/chat/completions" + model_name = args.served_model_name if args.served_model_name else args.model + + req_args = RequestArgs( + chat_url=chat_url, + model=model_name, + stream=not args.no_stream, + limit_min_tokens=args.limit_min_tokens, + limit_max_tokens=args.limit_max_tokens, + ) + + return client_args, req_args + + +async def main_mp( + client_args: ClientArgs, + req_args: RequestArgs, + bench_args: BenchmarkArgs, + tokenizer: AutoTokenizer, + input_conv: ConversationsMap, +) -> tuple[ConversationsMap, list[RequestStats]]: + # An event that will trigger graceful termination of all the clients + stop_event = mp.Event() + + # Queue for input conversations (from the input file/dataset) + task_queue: mp.Queue = mp.Queue() + + # Queue for client measurements (TTFT, TPOT, etc. 
for each request) + result_queue: mp.Queue = mp.Queue() + + # Queue for output conversations (with the LLM answers, sent by the server) + conv_queue: mp.Queue = mp.Queue() + output_conv: ConversationsMap = {} + client_metrics: list[RequestStats] = [] + + # Start all clients + start_time = time.perf_counter_ns() + logger.info(f"{Color.GREEN}Starting {bench_args.num_clients} clients{Color.RESET}") + + clients = [] + for client_id in range(bench_args.num_clients): + client = mp.Process( + name=f"client_{client_id}", + target=worker_function, + args=( + client_id, + tokenizer, + client_args, + req_args, + stop_event, + task_queue, + result_queue, + conv_queue, + ), + ) + clients.append(client) + client.start() + + # Submit all the input conversations as tasks for the clients + for conv_id, messages in input_conv.items(): + task_queue.put((conv_id, messages)) + + # Add termination signals for clients + for _ in range(bench_args.num_clients): + task_queue.put((TERM_SIGNAL, TERM_SIGNAL)) + + # Collect the updated conversations from all clients + num_clients_finished = 0 + total_convs = len(input_conv) + + debug_stats = DebugStats(logger, min(15 * bench_args.num_clients, 500)) + + while num_clients_finished < bench_args.num_clients: + # Collect updated conversation + conv_id, messages = conv_queue.get() + + # Collect results (measurements) + while not result_queue.empty(): + new_data = result_queue.get() + client_metrics.append(new_data) + debug_stats.update(new_data) + + if conv_id is TERM_SIGNAL: + num_clients_finished += 1 + logger.info( + f"{Color.CYAN}{num_clients_finished} out of " + f"{bench_args.num_clients} clients finished{Color.RESET}" + ) + + if bench_args.early_stop and not stop_event.is_set(): + # Once one client finished, stop all other clients. + # there is no reason to continue the benchmark with fewer clients. + logger.info( + f"{Color.YELLOW}Sending termination signal to clients{Color.RESET}" + ) + stop_event.set() + else: + output_conv[conv_id] = messages + + finished_convs = len(output_conv) + percent = finished_convs / total_convs + + # Tuned to control the print rate (can be changed if required) + print_cycle = max(3, int(bench_args.num_clients / 4)) + + if finished_convs % print_cycle == 0: + runtime_sec = nanosec_to_sec(time.perf_counter_ns() - start_time) + logger.info( + f"{Color.CYAN}Finished {finished_convs} out of {total_convs} conversations ({percent:.0%}), " # noqa: E501 + f"{num_clients_finished} out of {bench_args.num_clients} clients finished, collected {len(client_metrics)} measurements, runtime {runtime_sec:.3f} sec{Color.RESET}" # noqa: E501 + ) + + rps: Union[str, float] = round(len(client_metrics) / runtime_sec, 3) + if len(client_metrics) < (5 * bench_args.num_clients): + # Do not estimate the RPS if the number of samples is very low + # (threshold can be tuned if needed) + rps = "N/A" + + runtime_left_sec: Union[str, float] = round( + (runtime_sec / finished_convs) * (total_convs - finished_convs), 3 + ) + if percent < 0.05: + # If less than 5% of the conversations were not finished, + # the estimation will probably be very inaccurate + # (threshold can be tuned if needed). + runtime_left_sec = "N/A" + + logger.info( + f"{Color.CYAN}Estimated req/sec {rps}, estimated runtime left {runtime_left_sec} sec{Color.RESET}" # noqa: E501 + ) + debug_stats.print() + + logger.info( + f"{Color.CYAN}All {bench_args.num_clients} clients finished{Color.RESET}" + ) + + # At this point all the clients finished, + # collect results (TTFT, TPOT, etc.) from all the clients. 
+ # This needs to happen before calling join on the clients + # (result_queue should be emptied). + while not result_queue.empty(): + client_metrics.append(result_queue.get()) + + logger.info(f"Collected {len(client_metrics)} samples from all the clients") + + # Wait for all clients to finish + for client in clients: + logger.info( + f"{Color.CYAN}Waiting for client {client.name} " + f"(is alive: {client.is_alive()}){Color.RESET}" + ) + + client.join(timeout=120) + + if client.is_alive(): + logger.warning( + f"{Color.YELLOW}Client {client.name} will be terminated{Color.RESET}" + ) + client.terminate() + + exitcode = client.exitcode + if exitcode != 0: + logger.error( + f"{Color.RED}Client {client.name} exited " + f"with exit code {exitcode}{Color.RESET}" + ) + + logger.info( + f"All {bench_args.num_clients} clients exited (successfully " + f"finished {len(output_conv)} out of {total_convs} conversations)" + ) + + # Queues should be closed, required to avoid hang at interpreter shutdown + unfinished_tasks = 0 + while not task_queue.empty(): + task_queue.get() + unfinished_tasks += 1 + + if unfinished_tasks > 0: + # Can happen if not all tasks (conversations) have finished. + # May happen if --max-num-requests was used, + # or if an error occurred in one of the clients. + logger.debug(f"Discarding {unfinished_tasks} unfinished tasks") + + task_queue.close() + task_queue.join_thread() + + result_queue.close() + result_queue.join_thread() + + conv_queue.close() + conv_queue.join_thread() + + return output_conv, client_metrics + + +def get_filename_with_timestamp(label: str, extension: str) -> str: + time_now = datetime.now() + timestamp = time_now.strftime("%d-%m-%Y_%H-%M-%S") + filename = f"{label}__{timestamp}.{extension}" + return filename + + +def process_statistics( + client_metrics: list[RequestStats], + warmup_percentages: list[float], + test_params: dict, + verbose: bool, + gen_conv_args: Optional[GenConvArgs] = None, + excel_output: bool = False, +) -> None: + if len(client_metrics) == 0: + logger.info("No samples to process") + return + + logger.info(f"Processing {len(client_metrics)} samples...") + + raw_data = pd.DataFrame(client_metrics) + + if verbose: + # Calculate the time between user turns in each conversation (in a new column) + raw_data = raw_data.sort_values(by=["conversation_id", "start_time_ms"]) + raw_data["time_between_user_turns_sec"] = raw_data.groupby("conversation_id")[ + "start_time_ms" + ].diff() + + # Convert milliseconds to seconds + raw_data["time_between_user_turns_sec"] = ( + raw_data["time_between_user_turns_sec"] / 1000.0 + ) + + # Final raw data should be sorted by time + raw_data = raw_data.sort_values(by=["start_time_ms"]) + raw_data["end_time_ms"] = raw_data["start_time_ms"] + raw_data["latency_ms"] + + percentiles = [0.25, 0.5, 0.75, 0.9] + + # Add more percentiles if there are enough samples + if len(raw_data) >= 100: + percentiles.append(0.99) + + if len(raw_data) >= 1000: + percentiles.append(0.999) + + if len(raw_data) >= 10000: + percentiles.append(0.9999) + + # Set precision for numbers in the output text (the dataframes) + pd.set_option("display.precision", 2) + + # Exclude parameters from RequestStats + exclude = [ + "start_time_ms", + "end_time_ms", + "output_num_first_chunk_tokens", + "approx_cached_percent", + "conversation_id", + "client_id", + ] + + print(TEXT_SEPARATOR) + print(f"{Color.YELLOW}Parameters:{Color.RESET}") + for k, v in test_params.items(): + print(f"{k}={v}") + + # conversations generation parameters + if gen_conv_args is 
not None: + gen_params = { + "text_files": ", ".join(gen_conv_args.text_files), + "input_num_turns": str(gen_conv_args.input_num_turns), + "input_common_prefix_num_tokens": str( + gen_conv_args.input_common_prefix_num_tokens + ), + "input_prefix_num_tokens": str(gen_conv_args.input_prefix_num_tokens), + "input_num_tokens": str(gen_conv_args.input_num_tokens), + "output_num_tokens": str(gen_conv_args.output_num_tokens), + } + + print(f"{Color.YELLOW}Conversations Generation Parameters:{Color.RESET}") + for k, v in gen_params.items(): + print(f"{k}={v}") + + print(TEXT_SEPARATOR) + + params_list = [] + df_list = [] + for percent in warmup_percentages: + # Select samples from the end (tail) of the dataframe + warmup_count = int(percent * len(raw_data)) + tail_count = len(raw_data) - warmup_count + if tail_count == 0: + # No reason to process if the count of samples is zero + break + + df = raw_data.tail(tail_count) + + # Runtime is the diff between the end of the last request + # and the start of the first request + runtime_sec = df["end_time_ms"].iloc[-1] - df["start_time_ms"].iloc[0] + + # Convert milliseconds to seconds + runtime_sec = runtime_sec / 1000.0 + requests_per_sec = float(len(df)) / runtime_sec + + params = {"runtime_sec": runtime_sec, "requests_per_sec": requests_per_sec} + + # Generate a summary of relevant metrics (and drop irrelevant data) + df = df.drop(columns=exclude).describe(percentiles=percentiles).transpose() + + # List for Excel file + params_list.append(params) + df_list.append(df) + + # Print the statistics summary + if percent > 0 or len(warmup_percentages) > 1: + print( + f"{Color.YELLOW}Statistics summary " + f"(assuming {percent:.0%} warmup samples):{Color.RESET}" + ) + else: + print(f"{Color.YELLOW}Statistics summary:{Color.RESET}") + + for k, v in params.items(): + if isinstance(v, float): + print(f"{k} = {v:.3f}") + else: + print(f"{k} = {v}") + print(TEXT_SEPARATOR) + print(df) + print(TEXT_SEPARATOR) + + if excel_output: + prefix = f"statistics_{test_params['num_clients']}_clients" + filename = get_filename_with_timestamp(prefix, "xlsx") + + with pd.ExcelWriter(filename, engine="xlsxwriter") as writer: + startrow = 0 + test_params_df = pd.DataFrame([test_params]) + test_params_df.to_excel( + writer, sheet_name="Summary", index=False, startrow=startrow + ) + startrow += len(test_params_df) + 3 + + if gen_conv_args is not None: + gen_params_df = pd.DataFrame([gen_params]) + gen_params_df.to_excel( + writer, sheet_name="Summary", index=False, startrow=(startrow - 1) + ) + startrow += len(gen_params_df) + 3 + + for params, df_stats in zip(params_list, df_list): + df_params = pd.DataFrame([params]) + df_params.to_excel( + writer, sheet_name="Summary", index=False, startrow=startrow + ) + startrow += len(df_params) + 2 + df_stats.to_excel( + writer, sheet_name="Summary", index=True, startrow=startrow + ) + startrow += len(df_stats) + 3 + + raw_data.to_excel(writer, sheet_name="Raw data", index=False, startrow=0) + + logger.info( + f"{Color.GREEN}Client metrics exported to file: {filename}{Color.RESET}" + ) + + +async def get_server_info(url: str) -> None: + logger.info(f"{Color.BLUE}Collecting information from server: {url}{Color.RESET}") + async with aiohttp.ClientSession() as session: + # Get server version (not mandatory, "version" endpoint may not exist) + url_version = f"{url}/version" + async with session.get(url_version) as response: + if HTTPStatus(response.status) == HTTPStatus.OK: + text = await response.text() + logger.info(f"{Color.BLUE}Server 
version: {text}{Color.RESET}") + + # Get available models + url_models = f"{url}/v1/models" + async with session.get(url_models) as response: + if HTTPStatus(response.status) == HTTPStatus.OK: + text = await response.text() + logger.info(f"{Color.BLUE}Models:{Color.RESET}") + models_data = json.loads(text) + models_list = models_data["data"] + for model in models_list: + model_id = model["id"] + max_model_len = model.get("max_model_len", "N/A") + logger.info( + f"{Color.BLUE}\t{model_id=}, {max_model_len=}{Color.RESET}" + ) + else: + logger.info(f"{Color.RED}Failed to get models{Color.RESET}") + + +async def main() -> None: + parser = argparse.ArgumentParser( + prog="Benchmark serving with multi-turn conversations", + description="Benchmark online inference using REST API", + ) + parser.add_argument("--version", action="version", version="%(prog)s 1.0") + + parser.add_argument( + "-i", + "--input-file", + type=str, + required=True, + help="Input JSON file with ShareGPT conversations or " + "configuration file for generation of synthetic conversations", + ) + parser.add_argument( + "-o", + "--output-file", + type=str, + default=None, + help="Output JSON file containing conversations with updated assistant answers", + ) + + parser.add_argument( + "--seed", + type=int, + default=0, + help="Seed for random number generators (default: 0)", + ) + + parser.add_argument( + "-m", "--model", type=str, required=True, help="Path of the LLM model" + ) + parser.add_argument( + "--served-model-name", + type=str, + default=None, + help="The model name used in the API. " + "If not specified, the model name will be the " + "same as the ``--model`` argument. ", + ) + + parser.add_argument( + "-u", + "--url", + type=str, + default="http://localhost:8000", + help="Base URL for the LLM API server", + ) + + parser.add_argument( + "-p", + "--num-clients", + type=int, + default=1, + help="Number of clients that will send requests in parallel", + ) + parser.add_argument( + "-k", + "--max-active-conversations", + type=int, + default=None, + help="Max number of active conversations at a time (for all clients)", + ) + parser.add_argument( + "-n", + "--max-num-requests", + type=int, + default=None, + help="Max number of requests to send (total for all clients)", + ) + + parser.add_argument( + "--warmup-step", + default=False, + action="store_true", + help="Run a warmup step (using only the first turn of every conversation), " + "measurements will not be included in the final benchmark results", + ) + + parser.add_argument( + "--max-turns", + type=int, + default=None, + help="Maximum number of turns/messages per conversation, " + "includes both user and assistant messages " + "(a positive number, e.g: 2, 4, 6, etc.), disabled by default", + ) + parser.add_argument( + "--no-early-stop", + default=False, + action="store_true", + help="By default, the benchmark will stop if at least one client exits." + " Use this flag to disable this behavior", + ) + + parser.add_argument( + "--limit-max-tokens", + type=int, + default=NUM_TOKENS_FROM_DATASET, + help="Set max_tokens for the output token count of each request " + "(must also set --limit-min-tokens). " + "Overrides output token count from the input dataset. " + "Use a negative value to disable this limit.", + ) + parser.add_argument( + "--limit-min-tokens", + type=int, + default=NUM_TOKENS_FROM_DATASET, + help="Set min_tokens for the output token count of each request " + "(must also set --limit-max-tokens). " + "Overrides output token count from the input dataset. 
" + "Use a negative value to disable this limit.", + ) + + parser.add_argument( + "--request-rate", + type=float, + default=0, + help="Expected request rate (Poisson process) per client in requests/sec." + "Set to 0 for no delay between requests.", + ) + parser.add_argument( + "--conversation-sampling", + type=ConversationSampling, + choices=list(ConversationSampling), + default=ConversationSampling.ROUND_ROBIN, + help=( + "Strategy for selecting which conversation to use for the next request. " + "Options: 'round_robin' (cycle through conversations), " + "'random' (pick randomly)." + ), + ) + parser.add_argument( + "--verify-output", + default=False, + action="store_true", + help="Verify the LLM output (compare to the answers in the input JSON file)", + ) + + parser.add_argument( + "--no-stream", + default=False, + action="store_true", + help="Disable stream/streaming mode (set 'stream' to False in the API request)", + ) + + parser.add_argument( + "-e", + "--excel-output", + default=False, + action="store_true", + help="Export summary to Excel file (optional)", + ) + parser.add_argument( + "-v", + "--verbose", + default=False, + action="store_true", + help="Enable verbose output", + ) + parser.add_argument( + "--print-content", + default=False, + action="store_true", + help="Print the user prompts and the server's answers", + ) + + parser.add_argument( + "--warmup-percentages", + type=str, + default="0%", + help="Ignore the first X samples as warmup (X is a percentage)." + " A comma separated list of percentages can be used " + "(for example: --warmup-percentages=0%%,50%%)", + ) + + args = parser.parse_args() + + logger.info(args) + + logger.info(f"{Color.GREEN}Input parameters:{Color.RESET}") + logger.info(f"url={args.url}") + logger.info(f"model={args.model}") + logger.info(f"num_clients={args.num_clients}") + + if args.verify_output: + logger.info(f"{Color.PURPLE}Verify is enabled{Color.RESET}") + + # Calculate the amount of samples to filter (as warmup samples/measurements). 
+ try: + warmup_percentages: list[float] = [0.0] + if not args.warmup_step: + # Warmup percentage can be used only if the warmup step was used + warmup_strings: list[str] = args.warmup_percentages.split(",") + warmup_strings = [x.replace("%", "") for x in warmup_strings] + warmup_percentages = [float(x) / 100 for x in warmup_strings] + + # Check for valid range (0 to 1) + for p in warmup_percentages: + assert p >= 0.0 and p < 1.0 + + # Sort from high to low warmup percentage + warmup_percentages.sort() + + logger.info( + f"Warmup percentages (percentage of samples): {warmup_percentages}" + ) + + except Exception: + raise ValueError( + f"Invalid --warmup-percentage={args.warmup_percentage}" + ) from None + + random.seed(args.seed) + np.random.seed(args.seed) + + if not os.path.exists(args.model): + raise OSError(f"Path does not exist: {args.model}") + logger.info("Loading tokenizer") + tokenizer = AutoTokenizer.from_pretrained(args.model) + + await get_server_info(args.url) + + # Load the input file (either conversations of configuration file) + logger.info(f"Reading input file: {args.input_file}") + with open(args.input_file) as f: + input_data = json.load(f) + + gen_conv_args = None + if isinstance(input_data, list): + # The conversations are stored as a list of dicts + logger.info(f"Found {len(input_data)} items in the input file") + + # Convert the list to a ConversationsMap + conversations = conversations_list_to_dict(input_data) + + elif isinstance(input_data, dict): + # The input file is a configuration file + # (type is determined by the field 'filetype') + if "filetype" not in input_data: + raise Exception( + f"Input file {args.input_file} is invalid (missing 'filetype')" + ) + + logger.info(f"Using input file with filetype: {input_data['filetype']}") + + gen_conv_args = parse_input_json_file(input_data) + + # Disable warning from "huggingface/tokenizers" + # (when using python multiprocessing and tokenizers) + os.environ["TOKENIZERS_PARALLELISM"] = "true" + + # Generate synthetic conversations + conversations = generate_conversations(gen_conv_args, tokenizer) + + else: + raise Exception(f"Input file {args.input_file} is invalid") + + if args.max_turns is not None: + if args.max_turns < 1: + raise ValueError("Max turns must be a positive number") + logger.info( + f"{Color.PURPLE}Max turns per conversation " + f"is limited to {args.max_turns}{Color.RESET}" + ) + + # Create benchmark configurations + client_args, req_args = get_client_config(args, conversations) + + bench_args = BenchmarkArgs( + url=args.url, num_clients=args.num_clients, early_stop=not args.no_early_stop + ) + + # Warm-up step + if args.warmup_step: + # Only send a single user prompt from every conversation. + # max_active_conversations must be 1, + # otherwise the clients may exit after sending a single request + # (because the task queue is empty). 
+ warmup_client_args = client_args._replace( + skip_first_turn=False, max_turns=1, max_active_conversations=1 + ) + + # Early stop should be disabled, + # all clients should finish their work before exiting + warmup_bench_args = bench_args._replace(early_stop=False) + + logger.info(f"{Color.PURPLE}Warmup start{Color.RESET}") + conversations, _ = await main_mp( + warmup_client_args, req_args, warmup_bench_args, tokenizer, conversations + ) + logger.info(f"{Color.PURPLE}Warmup done{Color.RESET}") + + # Run the benchmark + start_time = time.perf_counter_ns() + client_convs, client_metrics = await main_mp( + client_args, req_args, bench_args, tokenizer, conversations + ) + total_runtime_ms = nanosec_to_millisec(time.perf_counter_ns() - start_time) + + # Calculate requests per second + total_runtime_sec = total_runtime_ms / 1000.0 + rps = len(client_metrics) / total_runtime_sec + logger.info( + f"{Color.GREEN}All clients finished, total runtime: {total_runtime_sec:.3f} sec" + f" ({total_runtime_ms:.3f} ms), requests per second: {rps:.3f}{Color.RESET}" + ) + + # Benchmark parameters + params = { + "model": args.model, + "num_clients": args.num_clients, + "num_conversations": len(conversations), + "active_conversations": args.max_active_conversations, + "seed": args.seed, + } + + if args.limit_min_tokens > 0: + params["min_tokens"] = args.limit_min_tokens + + if args.limit_max_tokens > 0: + params["max_tokens"] = args.limit_max_tokens + + # Process and print statistics (and save excel file with the statistics) + process_statistics( + client_metrics, + test_params=params, + warmup_percentages=warmup_percentages, + verbose=args.verbose, + gen_conv_args=gen_conv_args, + excel_output=args.excel_output, + ) + + if args.output_file is not None: + # Write a JSON file with the updated conversations + # The "assistant" content will contain the answers from the tested LLM + output_data: ShareGptConversations = conversations_dict_to_list(client_convs) + logger.info( + f"{Color.GREEN}Writing conversations file: {args.output_file}{Color.RESET}" + ) + with open(args.output_file, "w") as f: + json.dump(output_data, f, indent=4) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/benchmarks/multi_turn/convert_sharegpt_to_openai.py b/benchmarks/multi_turn/convert_sharegpt_to_openai.py new file mode 100644 index 000000000..c3622c99a --- /dev/null +++ b/benchmarks/multi_turn/convert_sharegpt_to_openai.py @@ -0,0 +1,354 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Download dataset from: +https://huggingface.co/datasets/philschmid/sharegpt-raw/blob/main/sharegpt_20230401_clean_lang_split.json + +Convert to OpenAI API: +export INPUT_FILE=sharegpt_20230401_clean_lang_split.json +python convert_sharegpt_to_openai.py $INPUT_FILE sharegpt_conv_128.json --max-items=128 +""" + +import argparse +import json +import random +from statistics import mean +from typing import Any, Optional + +import pandas as pd # type: ignore +import tqdm # type: ignore +from transformers import AutoTokenizer # type: ignore + + +def has_non_english_chars(text: str) -> bool: + return not text.isascii() + + +def content_is_valid( + content: str, min_content_len: Optional[int], max_content_len: Optional[int] +) -> bool: + if min_content_len and len(content) < min_content_len: + return False + + if max_content_len and len(content) > max_content_len: + return False + + return has_non_english_chars(content) + + +def print_stats( + conversations: "list[dict[Any, 
Any]]", tokenizer: Optional[AutoTokenizer] = None +) -> None: + # Collect statistics + stats = [] + + print("\nCollecting statistics...") + for item in tqdm.tqdm(conversations): + # item has "id" and "messages" + messages = item["messages"] + + user_turns = 0 + assistant_turns = 0 + user_words = 0 + assistant_words = 0 + conv_chars = 0 + + user_tokens: list[int] = [] + assistant_tokens: list[int] = [] + + for m in messages: + content = m["content"] + conv_chars += len(content) + content_num_words = content.count(" ") + 1 + + num_tokens = 0 + if tokenizer: + num_tokens = len(tokenizer(m["content"]).input_ids) + + if m["role"] == "user": + user_turns += 1 + user_words += content_num_words + if tokenizer: + user_tokens.append(num_tokens) + + elif m["role"] == "assistant": + assistant_turns += 1 + assistant_words += content_num_words + if tokenizer: + assistant_tokens.append(num_tokens) + + # assert user_turns == assistant_turns, \ + # f"Invalid conversation ID {item['id']}" + + conv_words = user_words + assistant_words + item_stats = { + "user_turns": user_turns, + "assistant_turns": assistant_turns, + "user_words": user_words, + "assistant_words": assistant_words, + "conv_turns": len(messages), + "conv_words": conv_words, + "conv_characters": conv_chars, + } + + if len(user_tokens) > 0: + item_stats["user_tokens"] = int(mean(user_tokens)) + + if len(assistant_tokens) > 0: + item_stats["assistant_tokens"] = int(mean(assistant_tokens)) + + stats.append(item_stats) + + print("\nStatistics:") + percentiles = [0.25, 0.5, 0.75, 0.9, 0.99, 0.999, 0.9999] + df = pd.DataFrame(stats) + print(df.describe(percentiles=percentiles).transpose()) + + +def convert_sharegpt_to_openai( + seed: int, + input_file: str, + output_file: str, + max_items: Optional[int], + min_content_len: Optional[int] = None, + max_content_len: Optional[int] = None, + min_turns: Optional[int] = None, + max_turns: Optional[int] = None, + model: Optional[str] = None, +) -> None: + if min_turns and max_turns: + assert min_turns <= max_turns + + if min_content_len and max_content_len: + # Verify that min is not larger than max if both were given + assert min_content_len <= max_content_len + + print( + f"Input parameters:\n{seed=}, {max_items=}, {min_content_len=}," + f" {max_content_len=}, {min_turns=}, {max_turns=}\n" + ) + + random.seed(seed) + + tokenizer = None + if model is not None: + print(f"Loading tokenizer from: {model}") + tokenizer = AutoTokenizer.from_pretrained(model) + + # Read the ShareGPT JSON file + print(f"Reading file: {input_file}") + with open(input_file, encoding="utf-8") as f: + # Should be a list of dicts + # Each dict should have "id" (string) and "conversations" (list of dicts) + sharegpt_data = json.load(f) + + assert isinstance(sharegpt_data, list), "Input file should contain a list of dicts" + + print(f"Total items in input file: {len(sharegpt_data):,}") + + print(f"Shuffling dataset with seed {seed}") + random.shuffle(sharegpt_data) + + # Map conversation ID to the all the messages + conversation_parts: dict[str, list[Any]] = {} + + for item in tqdm.tqdm(sharegpt_data): + assert "id" in item, "Missing key 'id'" + assert "conversations" in item, "Missing key 'conversations'" + + # Conversation ID (e.g: "hiWPlMD") and part/session (0, 1, 2, etc.) 
+ conv_id, _ = item["id"].split("_") + new_turns = item["conversations"] + + if conv_id not in conversation_parts: + # Start new conversation + conversation_parts[conv_id] = [] + elif len(conversation_parts[conv_id]) > 0 and len(new_turns) > 0: + prev_turns = conversation_parts[conv_id][-1] + if prev_turns[-1]["from"] == new_turns[0]["from"]: + new_turns = new_turns[1:] + + if len(new_turns) > 0: + # We assume that parts are in order in the ShareGPT dataset + conversation_parts[conv_id].append(new_turns) + + dataset: list[dict[str, Any]] = [] + for conv_id, conv_parts in conversation_parts.items(): + new_item = {"id": conv_id} + + conversations: list[dict[str, str]] = [] + + # Merge all parts + for conv_part in conv_parts: + conversations.extend(conv_part) + + if len(conversations) > 0: + new_item["conversations"] = conversations + dataset.append(new_item) + + print(f"Total unique conversations (IDs) in input file: {len(dataset):,}") + + # Final output data + final_openai_dataset: list[dict] = [] + + # Filter conversations from the ShareGPT dataset and convert to OpenAI format + for item in tqdm.tqdm(dataset): + messages: list[dict] = [] + + assert "id" in item, "Missing key 'id'" + assert "conversations" in item, "Missing key 'conversations'" + + conv_id = item["id"] + conversations = item["conversations"] + + if min_turns is not None and len(conversations) < min_turns: + # Skip short conversations + continue + + # Convert each message in the conversation, up to max_turns if specified + for i, turn in enumerate(conversations): + assert "from" in turn and "value" in turn, ( + f"Invalid conversation ID {conv_id} - missing 'from' or 'value'" + ) + + role = None + turn_from = turn["from"] + + if turn_from in {"human", "user"}: + role = "user" + elif turn_from in {"gpt", "bing", "chatgpt", "bard"}: + role = "assistant" + elif turn_from == "system": + role = "system" + + assert role is not None, ( + f"Invalid conversation ID {conv_id} - 'from'='{turn_from}' is invalid" + ) + + if i == 0 and role != "user": + # If the first message is from assistant (gpt), skip it. + # this happens when the conversation is a follow-up + # to a previous conversation (from the same user). 
+ continue + + if max_turns is not None and i >= max_turns: + break + + # Convert message to OpenAI format (with "role" and "content") + content = turn["value"] + messages.append({"role": role, "content": content}) + + # Add the converted conversation to the OpenAI format + if len(messages) > 0: + valid_messages = True + + # First turn should always be from the user + user_turn = True + + for m in messages: + # Make sure that turns alternate between user and assistant + if (user_turn and m["role"] != "user") or ( + not user_turn and m["role"] != "assistant" + ): + valid_messages = False + break + + user_turn = not user_turn + + content = m["content"] + valid_messages = content_is_valid( + content, min_content_len, max_content_len + ) + if not valid_messages: + break + + if valid_messages is True: + final_openai_dataset.append({"id": conv_id, "messages": messages}) + + assert len(final_openai_dataset) > 0, "Final number of conversations is zero" + + print_stats(final_openai_dataset) + + print_stats_again = False + if max_items is not None and len(final_openai_dataset) > max_items: + print(f"\n\nSampling {max_items} items from the dataset...") + print_stats_again = True + final_openai_dataset = random.sample(final_openai_dataset, max_items) + + if print_stats_again: + # Print stats after the dataset changed + print_stats(final_openai_dataset, tokenizer) + + # Write the converted data to a new JSON file + final_size = len(final_openai_dataset) + print(f"\nTotal conversations converted (after filtering): {final_size:,}") + print(f"\nWriting file: {output_file}") + with open(output_file, "w", encoding="utf-8") as f: + json.dump(final_openai_dataset, f, ensure_ascii=False, indent=2) + + +def main() -> None: + parser = argparse.ArgumentParser( + description="Convert ShareGPT dataset to OpenAI API format" + ) + parser.add_argument("input_file", help="Path to the input ShareGPT JSON file") + parser.add_argument( + "output_file", help="Path to the output OpenAI format JSON file" + ) + parser.add_argument( + "--seed", type=int, default=0, help="Seed for random number generators" + ) + parser.add_argument( + "--max-items", + type=int, + default=None, + help="Maximum number of items in the output file", + ) + parser.add_argument( + "--min-turns", + type=int, + default=None, + help="Minimum number of turns per conversation", + ) + parser.add_argument( + "--max-turns", + type=int, + default=None, + help="Maximum number of turns per conversation", + ) + parser.add_argument( + "--min-content-len", + type=int, + default=None, + help="Min number of characters in the messages' content", + ) + parser.add_argument( + "--max-content-len", + type=int, + default=None, + help="Max number of characters in the messages' content", + ) + parser.add_argument( + "--model", + type=str, + default=None, + help="LLM model, only the tokenizer will be used", + ) + + args = parser.parse_args() + + convert_sharegpt_to_openai( + args.seed, + args.input_file, + args.output_file, + args.max_items, + args.min_content_len, + args.max_content_len, + args.min_turns, + args.max_turns, + args.model, + ) + + +if __name__ == "__main__": + main() diff --git a/benchmarks/multi_turn/requirements.txt b/benchmarks/multi_turn/requirements.txt new file mode 100644 index 000000000..f0e193591 --- /dev/null +++ b/benchmarks/multi_turn/requirements.txt @@ -0,0 +1,5 @@ +numpy>=1.24 +pandas>=2.0.0 +aiohttp>=3.10 +transformers>=4.46 +xlsxwriter>=3.2.1 \ No newline at end of file From da30505f99b0a29a400b52ce8c435c381fd2d42f Mon Sep 17 00:00:00 2001 
From: wangyu31577 Date: Tue, 16 Dec 2025 15:04:48 +0800 Subject: [PATCH 10/26] =?UTF-8?q?=E6=8F=90=E4=BA=A4benchmark?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- benchmarks/README.md | 20 - benchmarks/auto_tune/README.md | 218 -- benchmarks/auto_tune/auto_tune.sh | 313 -- benchmarks/auto_tune/batch_auto_tune.sh | 128 - benchmarks/backend_request_func.py | 658 ---- benchmarks/benchmark_block_pool.py | 74 - benchmarks/benchmark_latency.py | 17 - .../benchmark_long_document_qa_throughput.py | 202 -- benchmarks/benchmark_ngram_proposer.py | 213 -- benchmarks/benchmark_prefix_caching.py | 278 -- benchmarks/benchmark_prioritization.py | 222 -- benchmarks/benchmark_serving.py | 17 - .../benchmark_serving_structured_output.py | 1048 ------ benchmarks/benchmark_throughput.py | 17 - benchmarks/benchmark_utils.py | 125 - .../cutlass_benchmarks/sparse_benchmarks.py | 516 --- benchmarks/cutlass_benchmarks/utils.py | 100 - .../cutlass_benchmarks/w8a8_benchmarks.py | 372 --- .../cutlass_benchmarks/weight_shapes.py | 46 - .../disagg_overhead_benchmark.sh | 147 - .../disagg_performance_benchmark.sh | 165 - .../disagg_prefill_proxy_server.py | 199 -- benchmarks/disagg_benchmarks/rate_limiter.py | 45 - benchmarks/disagg_benchmarks/request_queue.py | 39 - .../disagg_benchmarks/round_robin_proxy.py | 63 - .../visualize_benchmark_results.py | 47 - .../fused_kernels/layernorm_rms_benchmarks.py | 228 -- benchmarks/kernels/bench_block_fp8_gemm.py | 145 - benchmarks/kernels/bench_fp8_gemm.py | 159 - benchmarks/kernels/bench_int8_gemm.py | 169 - benchmarks/kernels/bench_nvfp4_gemm.py | 198 -- .../kernels/bench_per_token_quant_fp8.py | 269 -- benchmarks/kernels/benchmark_activation.py | 104 - benchmarks/kernels/benchmark_bitblas.py | 244 -- .../kernels/benchmark_cutlass_fp4_moe.py | 504 --- .../kernels/benchmark_cutlass_moe_fp8.py | 406 --- .../kernels/benchmark_device_communicators.py | 508 --- .../kernels/benchmark_grouped_gemm_cutlass.py | 427 --- benchmarks/kernels/benchmark_layernorm.py | 93 - benchmarks/kernels/benchmark_lora.py | 1071 ------- benchmarks/kernels/benchmark_machete.py | 745 ----- benchmarks/kernels/benchmark_marlin.py | 413 --- benchmarks/kernels/benchmark_moe.py | 773 ----- .../kernels/benchmark_moe_align_block_size.py | 74 - .../benchmark_moe_permute_unpermute.py | 428 --- benchmarks/kernels/benchmark_mrope.py | 328 -- .../kernels/benchmark_paged_attention.py | 251 -- .../benchmark_per_token_group_quant.py | 159 - benchmarks/kernels/benchmark_polynorm.py | 155 - benchmarks/kernels/benchmark_quant.py | 108 - .../benchmark_reshape_and_cache_flash.py | 212 -- benchmarks/kernels/benchmark_rmsnorm.py | 256 -- benchmarks/kernels/benchmark_rope.py | 133 - benchmarks/kernels/benchmark_shapes.py | 94 - .../kernels/benchmark_silu_mul_fp8_quant.py | 675 ---- .../benchmark_trtllm_decode_attention.py | 293 -- .../benchmark_trtllm_prefill_attention.py | 308 -- .../kernels/benchmark_w8a8_block_fp8.py | 415 --- benchmarks/kernels/deepgemm/README.md | 129 - .../benchmark_fp8_block_dense_gemm.py | 427 --- benchmarks/kernels/graph_machete_bench.py | 64 - benchmarks/kernels/requirements.txt | 1 - benchmarks/kernels/utils.py | 214 -- benchmarks/kernels/weight_shapes.py | 104 - benchmarks/multi_turn/README.md | 174 - benchmarks/multi_turn/bench_dataset.py | 588 ---- benchmarks/multi_turn/bench_utils.py | 28 - .../benchmark_serving_multi_turn.py | 1569 --------- .../multi_turn/convert_sharegpt_to_openai.py | 354 --- benchmarks/multi_turn/requirements.txt | 5 - 
benchmarks/overheads/benchmark_hashing.py | 64 - benchmarks/pyproject.toml | 49 - benchmarks/run_structured_output_benchmark.sh | 129 - benchmarks/sonnet.txt | 518 --- vllm_omni/benchmarks/__init__.py | 0 vllm_omni/benchmarks/datasets.py | 2814 +++++++++++++++++ vllm_omni/benchmarks/latency.py | 170 + vllm_omni/benchmarks/serve.py | 1358 ++++++++ vllm_omni/benchmarks/throughput.py | 696 ++++ 79 files changed, 5038 insertions(+), 20051 deletions(-) delete mode 100644 benchmarks/README.md delete mode 100644 benchmarks/auto_tune/README.md delete mode 100644 benchmarks/auto_tune/auto_tune.sh delete mode 100644 benchmarks/auto_tune/batch_auto_tune.sh delete mode 100644 benchmarks/backend_request_func.py delete mode 100644 benchmarks/benchmark_block_pool.py delete mode 100644 benchmarks/benchmark_latency.py delete mode 100644 benchmarks/benchmark_long_document_qa_throughput.py delete mode 100644 benchmarks/benchmark_ngram_proposer.py delete mode 100644 benchmarks/benchmark_prefix_caching.py delete mode 100644 benchmarks/benchmark_prioritization.py delete mode 100644 benchmarks/benchmark_serving.py delete mode 100644 benchmarks/benchmark_serving_structured_output.py delete mode 100644 benchmarks/benchmark_throughput.py delete mode 100644 benchmarks/benchmark_utils.py delete mode 100644 benchmarks/cutlass_benchmarks/sparse_benchmarks.py delete mode 100644 benchmarks/cutlass_benchmarks/utils.py delete mode 100644 benchmarks/cutlass_benchmarks/w8a8_benchmarks.py delete mode 100644 benchmarks/cutlass_benchmarks/weight_shapes.py delete mode 100644 benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh delete mode 100644 benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh delete mode 100644 benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py delete mode 100644 benchmarks/disagg_benchmarks/rate_limiter.py delete mode 100644 benchmarks/disagg_benchmarks/request_queue.py delete mode 100644 benchmarks/disagg_benchmarks/round_robin_proxy.py delete mode 100644 benchmarks/disagg_benchmarks/visualize_benchmark_results.py delete mode 100644 benchmarks/fused_kernels/layernorm_rms_benchmarks.py delete mode 100644 benchmarks/kernels/bench_block_fp8_gemm.py delete mode 100644 benchmarks/kernels/bench_fp8_gemm.py delete mode 100644 benchmarks/kernels/bench_int8_gemm.py delete mode 100644 benchmarks/kernels/bench_nvfp4_gemm.py delete mode 100644 benchmarks/kernels/bench_per_token_quant_fp8.py delete mode 100644 benchmarks/kernels/benchmark_activation.py delete mode 100644 benchmarks/kernels/benchmark_bitblas.py delete mode 100644 benchmarks/kernels/benchmark_cutlass_fp4_moe.py delete mode 100644 benchmarks/kernels/benchmark_cutlass_moe_fp8.py delete mode 100644 benchmarks/kernels/benchmark_device_communicators.py delete mode 100644 benchmarks/kernels/benchmark_grouped_gemm_cutlass.py delete mode 100644 benchmarks/kernels/benchmark_layernorm.py delete mode 100644 benchmarks/kernels/benchmark_lora.py delete mode 100644 benchmarks/kernels/benchmark_machete.py delete mode 100644 benchmarks/kernels/benchmark_marlin.py delete mode 100644 benchmarks/kernels/benchmark_moe.py delete mode 100644 benchmarks/kernels/benchmark_moe_align_block_size.py delete mode 100644 benchmarks/kernels/benchmark_moe_permute_unpermute.py delete mode 100644 benchmarks/kernels/benchmark_mrope.py delete mode 100644 benchmarks/kernels/benchmark_paged_attention.py delete mode 100644 benchmarks/kernels/benchmark_per_token_group_quant.py delete mode 100644 benchmarks/kernels/benchmark_polynorm.py delete mode 100644 
benchmarks/kernels/benchmark_quant.py delete mode 100644 benchmarks/kernels/benchmark_reshape_and_cache_flash.py delete mode 100644 benchmarks/kernels/benchmark_rmsnorm.py delete mode 100644 benchmarks/kernels/benchmark_rope.py delete mode 100644 benchmarks/kernels/benchmark_shapes.py delete mode 100644 benchmarks/kernels/benchmark_silu_mul_fp8_quant.py delete mode 100644 benchmarks/kernels/benchmark_trtllm_decode_attention.py delete mode 100644 benchmarks/kernels/benchmark_trtllm_prefill_attention.py delete mode 100644 benchmarks/kernels/benchmark_w8a8_block_fp8.py delete mode 100644 benchmarks/kernels/deepgemm/README.md delete mode 100644 benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py delete mode 100644 benchmarks/kernels/graph_machete_bench.py delete mode 100644 benchmarks/kernels/requirements.txt delete mode 100644 benchmarks/kernels/utils.py delete mode 100644 benchmarks/kernels/weight_shapes.py delete mode 100644 benchmarks/multi_turn/README.md delete mode 100644 benchmarks/multi_turn/bench_dataset.py delete mode 100644 benchmarks/multi_turn/bench_utils.py delete mode 100644 benchmarks/multi_turn/benchmark_serving_multi_turn.py delete mode 100644 benchmarks/multi_turn/convert_sharegpt_to_openai.py delete mode 100644 benchmarks/multi_turn/requirements.txt delete mode 100644 benchmarks/overheads/benchmark_hashing.py delete mode 100644 benchmarks/pyproject.toml delete mode 100644 benchmarks/run_structured_output_benchmark.sh delete mode 100644 benchmarks/sonnet.txt create mode 100644 vllm_omni/benchmarks/__init__.py create mode 100644 vllm_omni/benchmarks/datasets.py create mode 100644 vllm_omni/benchmarks/latency.py create mode 100644 vllm_omni/benchmarks/serve.py create mode 100644 vllm_omni/benchmarks/throughput.py diff --git a/benchmarks/README.md b/benchmarks/README.md deleted file mode 100644 index 269a4d51e..000000000 --- a/benchmarks/README.md +++ /dev/null @@ -1,20 +0,0 @@ -# Benchmarks - -This directory used to contain vLLM's benchmark scripts and utilities for performance testing and evaluation. - -## Contents - -- **Serving benchmarks**: Scripts for testing online inference performance (latency, throughput) -- **Throughput benchmarks**: Scripts for testing offline batch inference performance -- **Specialized benchmarks**: Tools for testing specific features like structured output, prefix caching, long document QA, request prioritization, and multi-modal inference -- **Dataset utilities**: Framework for loading and sampling from various benchmark datasets (ShareGPT, HuggingFace datasets, synthetic data, etc.) - -## Usage - -For detailed usage instructions, examples, and dataset information, see the [Benchmark CLI documentation](https://docs.vllm.ai/en/latest/contributing/benchmarks.html#benchmark-cli). - -For full CLI reference see: - -- -- -- diff --git a/benchmarks/auto_tune/README.md b/benchmarks/auto_tune/README.md deleted file mode 100644 index d1bdb4c43..000000000 --- a/benchmarks/auto_tune/README.md +++ /dev/null @@ -1,218 +0,0 @@ -# Automated vLLM Server Parameter Tuning - -This script automates the process of finding the optimal server parameter combination (`max-num-seqs` and `max-num-batched-tokens`) to maximize throughput for a vLLM server. It also supports additional constraints such as E2E latency and prefix cache hit rate. 
- -## Table of Contents - -- [Prerequisites](#prerequisites) -- [Configuration](#configuration) -- [How to Run](#how-to-run) -- [Example Use Cases](#example-use-cases) -- [Output](#output) -- [How It Works](#how-it-works) - -## Prerequisites - -Before running the script, please ensure the following steps are completed: - -1. **Clone vLLM & Set Up Branch**: Clone the vLLM repository and check out to your desired branch. - -```bash -git clone https://github.com/vllm-project/vllm.git -cd vllm -# git checkout -``` - -1. **Install Environment**: Install or update the correct running environment. For TPU usage, activate your `conda` environment and install the corresponding `torch` and `torch_xla` versions. - -2. **Model Configuration**: If you are using a customized model, ensure its configuration files are correctly placed and accessible. - -## Configuration - -You must set the following variables at the top of the script before execution. - - Note: You can also override the default values below via environment variables when running the script. - -```bash -MODEL=meta-llama/Llama-3.3-70B-Instruct SYSTEM=TPU TP=8 DOWNLOAD_DIR='' INPUT_LEN=128 OUTPUT_LEN=2048 MAX_MODEL_LEN=2300 MIN_CACHE_HIT_PCT=0 MAX_LATENCY_ALLOWED_MS=100000000000 NUM_SEQS_LIST="128 256" NUM_BATCHED_TOKENS_LIST="1024 2048 4096" VLLM_LOGGING_LEVEL=DEBUG bash auto_tune.sh -``` - -| Variable | Description | Example Value | -| --- | --- | --- | -| `BASE` | **Required.** The absolute path to the parent directory of your vLLM repository directory. | `"$HOME"` | -| `MODEL` | **Required.** The Hugging Face model identifier to be served by vllm. | `"meta-llama/Llama-3.1-8B-Instruct"` | -| `SYSTEM`| **Required.** The hardware you are running on. Choices: `TPU` or `GPU`. (For other systems, it might not support saving profiles) | `"TPU"` | -| `TP` | **Required.** The tensor-parallelism size. | `1` | -| `DOWNLOAD_DIR` | **Required.** Directory to download and load model weights from. | `""` (default download path) | -| `INPUT_LEN` | **Required.** Request input length. | `4000` | -| `OUTPUT_LEN` | **Required.** Request output length. | `16` | -| `MAX_MODEL_LEN` | **Required.** Max model length. | `4096` | -| `MIN_CACHE_HIT_PCT` | Prefix cache hit rate in percentage (0-100). Set to `0` to disable. | `60` | -| `MAX_LATENCY_ALLOWED_MS` | The maximum allowed P99 end-to-end latency in milliseconds. Set to a very large number (e.g., `100000000000`) to effectively ignore the latency constraint. | `500` | -| `NUM_SEQS_LIST` | A space-separated string of `max-num-seqs` values to test. | `"128 256"` | -| `NUM_BATCHED_TOKENS_LIST` | A space-separated string of `max-num-batched-tokens` values to test. | `"1024 2048 4096"` | - -**Note**: The default `NUM_SEQS_LIST` and `NUM_BATCHED_TOKENS_LIST` are set for medium-sized inputs/outputs. For very short contexts (e.g., 20 input, 20 output tokens), you may need to test larger values for `max-num-seqs`. - -## How to Run - -1. **Configure**: Edit the script and set the variables in the [Configuration](#configuration) section. -2. **Execute**: Run the script. Since the process can take a long time, it is highly recommended to use a terminal multiplexer like `tmux` or `screen` to prevent the script from stopping if your connection is lost. - -```bash -cd -bash auto_tune.sh -``` - - Please note that the `bash auto_tune.sh` command cannot contain full or partial path with keyword `vllm`, otherwise `pkill -f vllm` command will also kill this script itself. 
- -## Example Use Cases - -Here are a few examples of how to configure the script for different goals: - -### 1. Maximize Throughput (No Latency Constraint) - -- **Goal**: Find the best `max-num-seqs` and `max-num-batched-tokens` to get the highest possible throughput for 1800 input tokens and 20 output tokens. -- **Configuration**: - -```bash -INPUT_LEN=1800 -OUTPUT_LEN=20 -MAX_MODEL_LEN=2048 -MIN_CACHE_HIT_PCT=0 -MAX_LATENCY_ALLOWED_MS=100000000000 # A very large number -``` - -#### 2. Maximize Throughput with a Latency Requirement - -- **Goal**: Find the best server parameters when P99 end-to-end latency must be below 500ms. -- **Configuration**: - -```bash -INPUT_LEN=1800 -OUTPUT_LEN=20 -MAX_MODEL_LEN=2048 -MIN_CACHE_HIT_PCT=0 -MAX_LATENCY_ALLOWED_MS=500 -``` - -#### 3. Maximize Throughput with Prefix Caching and Latency Requirements - -- **Goal**: Find the best server parameters assuming a 60% prefix cache hit rate and a latency requirement of 500ms. -- **Configuration**: - -```bash -INPUT_LEN=1800 -OUTPUT_LEN=20 -MAX_MODEL_LEN=2048 -MIN_CACHE_HIT_PCT=60 -MAX_LATENCY_ALLOWED_MS=500 -``` - -## Output - -After the script finishes, you will find the results in a new, timestamped directory created inside `$BASE/auto-benchmark/`. - -- **Log Files**: The directory (`$BASE/auto-benchmark/YYYY_MM_DD_HH_MM/`) contains detailed logs for each run: - - `vllm_log_...txt`: The log output from the vLLM server for each parameter combination. - - `bm_log_...txt`: The log output from the `vllm bench serve` command for each benchmark run. - -- **Final Result Summary**: A file named `result.txt` is created in the log directory. It contains a summary of each tested combination and concludes with the overall best parameters found. - -```text -# Example result.txt content -hash:a1b2c3d4... -max_num_seqs: 128, max_num_batched_tokens: 2048, request_rate: 10.0, e2el: 450.5, throughput: 9.8, goodput: 9.8 -max_num_seqs: 128, max_num_batched_tokens: 4096 does not meet latency requirement 500 -... -best_max_num_seqs: 256, best_num_batched_tokens: 2048, best_throughput: 12.5, profile saved in: /home/user/vllm/auto-benchmark/2024_08_01_10_30/profile -``` - - If it cannot find the best parameters, the final row will be `best_max_num_seqs: 0, best_num_batched_tokens: 0, best_throughput: 0`. This can be due to either the server not starting properly, or the latency requirement being too strict. - -- **Profiler Trace**: A directory named `profile` is created inside the log directory. It contains the profiler trace file (e.g., `.xplane.pb` for TPU or a `.json` trace for GPU) from the single best-performing run. - -## How It Works - -The script follows a systematic process to find the optimal parameters: - -1. **Find Max GPU Memory Utilization**: The script first determines the highest safe `gpu-memory-utilization` (starting from 0.98 and decreasing) that does not cause an Out-Of-Memory (OOM) error when launching the server. This ensures the benchmark runs use the maximum available memory without crashing. - -2. **Iterate and Benchmark**: It then enters a nested loop, iterating through every combination of `max-num-seqs` and `max-num-batched-tokens` provided in the configuration lists. - -3. **Latency-Aware Throughput Search**: For each parameter combination: - - The vLLM server is started. - - A benchmark is first run with an infinite request rate (`--request-rate inf`). - - If the resulting P99 E2E latency is within the `MAX_LATENCY_ALLOWED_MS` limit, this throughput is considered the maximum for this configuration. 
- - If the latency is too high, the script performs a search by iteratively decreasing the request rate until the latency constraint is met. This finds the highest sustainable throughput for the given parameters and latency requirement. - -4. **Track Best Result**: Throughout the process, the script tracks the parameter combination that has yielded the highest valid throughput so far. - -5. **Profile Collection**: For the best-performing run, the script saves the vLLM profiler output, which can be used for deep-dive performance analysis with tools like TensorBoard. - -## Batched `auto_tune` - -The `batch_auto_tune.sh` script allows you to run multiple `auto_tune.sh` experiments sequentially from a single configuration file. It iterates through a list of parameter sets, executes `auto_tune.sh` for each, and records the results back into the input file. - -### Prerequisites - -- **jq**: This script requires `jq` to parse the JSON configuration file. -- **gcloud**: If you plan to upload results to Google Cloud Storage, the `gcloud` CLI must be installed and authenticated. - -### How to Run - -1. **Create a JSON configuration file**: Create a file (e.g., `runs_config.json`) containing an array of JSON objects. Each object defines the parameters for a single `auto_tune.sh` run. - -2. **Execute the script**: - - ```bash - bash batch_auto_tune.sh [gcs_upload_path] - ``` - - - ``: **Required.** Path to your JSON configuration file. - - `[gcs_upload_path]`: **Optional.** A GCS path (e.g., `gs://my-bucket/benchmark-results`) where the detailed results and profiles for each run will be uploaded. If this is empty, the results will be available on the local filesystem (see the log for `RESULT_FILE=/path/to/results/file.txt`). - -### Configuration File - -The JSON configuration file should contain an array of objects. Each object's keys correspond to the configuration variables for `auto_tune.sh` (see the [Configuration table above](#configuration)). These keys will be converted to uppercase environment variables for each run. - -Here is an example `runs_config.json` with two benchmark configurations: - -```json -[ - { - "base": "/home/user", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "system": "TPU", # OR GPU - "tp": 8, - "input_len": 128, - "output_len": 2048, - "max_model_len": 2300, - "num_seqs_list": "128 256", - "num_batched_tokens_list": "8192 16384" - }, - { - "base": "/home/user", - "model": "meta-llama/Llama-3.1-70B-Instruct", - "system": "TPU", # OR GPU - "tp": 8, - "input_len": 4000, - "output_len": 16, - "max_model_len": 4096, - "num_seqs_list": "64 128", - "num_batched_tokens_list": "4096 8192", - "max_latency_allowed_ms": 500 - } -] -``` - -### Output - -The script modifies the input JSON file in place, adding the results of each run to the corresponding object. The following fields are added: - -- `run_id`: A unique identifier for the run, derived from the timestamp. -- `status`: The outcome of the run (`SUCCESS`, `FAILURE`, or `WARNING_NO_RESULT_FILE`). -- `results`: The content of the `result.txt` file from the `auto_tune.sh` run. -- `gcs_results`: The GCS URL where the run's artifacts are stored (if a GCS path was provided). - -A summary of successful and failed runs is also printed to the console upon completion. 
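For reference, here is a minimal sketch of how the updated configuration file could be post-processed once `batch_auto_tune.sh` finishes, assuming only the `run_id`, `status`, `results`, and `gcs_results` fields described above; the `summarize_batch_results` helper and the `runs_config.json` filename are illustrative, not part of the scripts:

```python
import json


def summarize_batch_results(config_path: str) -> None:
    """Print a one-line summary per run from the JSON file updated by batch_auto_tune.sh."""
    with open(config_path) as f:
        runs = json.load(f)

    for i, run in enumerate(runs, start=1):
        status = run.get("status", "NOT_RUN")
        results = run.get("results", "")
        # The final row of result.txt holds the best parameters, e.g.
        # "best_max_num_seqs: 256, best_num_batched_tokens: 2048, best_throughput: 12.5, ..."
        best_line = results.strip().splitlines()[-1] if results else "no results recorded"
        print(f"Run #{i} [{run.get('run_id', '-')}] {status}: {best_line}")
        if run.get("gcs_results"):
            print(f"  artifacts: {run['gcs_results']}")


summarize_batch_results("runs_config.json")
```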
diff --git a/benchmarks/auto_tune/auto_tune.sh b/benchmarks/auto_tune/auto_tune.sh deleted file mode 100644 index b333ba9cd..000000000 --- a/benchmarks/auto_tune/auto_tune.sh +++ /dev/null @@ -1,313 +0,0 @@ -#!/bin/bash - -# This script aims to tune the best server parameter combinations to maximize throughput for given requirement. -# See details in README (benchmarks/auto_tune/README.md). - -TAG=$(date +"%Y_%m_%d_%H_%M") -SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) -VLLM_LOGGING_LEVEL=${VLLM_LOGGING_LEVEL:-INFO} -BASE=${BASE:-"$SCRIPT_DIR/../../.."} -MODEL=${MODEL:-"meta-llama/Llama-3.1-8B-Instruct"} -SYSTEM=${SYSTEM:-"TPU"} -TP=${TP:-1} -DOWNLOAD_DIR=${DOWNLOAD_DIR:-""} -INPUT_LEN=${INPUT_LEN:-4000} -OUTPUT_LEN=${OUTPUT_LEN:-16} -MAX_MODEL_LEN=${MAX_MODEL_LEN:-4096} -MIN_CACHE_HIT_PCT=${MIN_CACHE_HIT_PCT:-0} -MAX_LATENCY_ALLOWED_MS=${MAX_LATENCY_ALLOWED_MS:-100000000000} -NUM_SEQS_LIST=${NUM_SEQS_LIST:-"128 256"} -NUM_BATCHED_TOKENS_LIST=${NUM_BATCHED_TOKENS_LIST:-"512 1024 2048 4096"} - -LOG_FOLDER="$BASE/auto-benchmark/$TAG" -RESULT="$LOG_FOLDER/result.txt" -PROFILE_PATH="$LOG_FOLDER/profile" - -echo "====================== AUTO TUNE PARAMETERS ====================" -echo "SCRIPT_DIR=$SCRIPT_DIR" -echo "BASE=$BASE" -echo "MODEL=$MODEL" -echo "SYSTEM=$SYSTEM" -echo "TP=$TP" -echo "DOWNLOAD_DIR=$DOWNLOAD_DIR" -echo "INPUT_LEN=$INPUT_LEN" -echo "OUTPUT_LEN=$OUTPUT_LEN" -echo "MAX_MODEL_LEN=$MAX_MODEL_LEN" -echo "MIN_CACHE_HIT_PCT=$MIN_CACHE_HIT_PCT" -echo "MAX_LATENCY_ALLOWED_MS=$MAX_LATENCY_ALLOWED_MS" -echo "NUM_SEQS_LIST=$NUM_SEQS_LIST" -echo "NUM_BATCHED_TOKENS_LIST=$NUM_BATCHED_TOKENS_LIST" -echo "VLLM_LOGGING_LEVEL=$VLLM_LOGGING_LEVEL" -echo "RESULT_FILE=$RESULT" -echo "====================== AUTO TUNEPARAMETERS ====================" - -rm -rf $LOG_FOLDER -rm -rf $PROFILE_PATH -mkdir -p $LOG_FOLDER -mkdir -p $PROFILE_PATH - -cd "$BASE/vllm" - -pip install -q datasets - -current_hash=$(git rev-parse HEAD) -echo "hash:$current_hash" >> "$RESULT" -echo "current_hash: $current_hash" - -TOTAL_LEN=$((INPUT_LEN + OUTPUT_LEN)) -RED='\033[0;31m' -if (( TOTAL_LEN > MAX_MODEL_LEN )); then - echo -e "${RED}FAILED: INPUT_LEN($INPUT_LEN) + OUTPUT_LEN($OUTPUT_LEN) = $TOTAL_LEN, which is > MAX_MODEL_LEN = $MAX_MODEL_LEN.\033[0m" >&2 - exit 1 -fi - -best_throughput=0 -best_max_num_seqs=0 -best_num_batched_tokens=0 -best_goodput=0 -best_request_rate=0 - -start_server() { - local gpu_memory_utilization=$1 - local max_num_seqs=$2 - local max_num_batched_tokens=$3 - local vllm_log=$4 - local profile_dir=$5 - - pkill -if vllm - - # Define the common arguments as a bash array. - # Each argument and its value are separate elements. - local common_args_array=( - "$MODEL" - "--disable-log-requests" - "--port" "8004" - "--gpu-memory-utilization" "$gpu_memory_utilization" - "--max-num-seqs" "$max_num_seqs" - "--max-num-batched-tokens" "$max_num_batched_tokens" - "--tensor-parallel-size" "$TP" - "--enable-prefix-caching" - "--load-format" "dummy" - "--download-dir" "$DOWNLOAD_DIR" - "--max-model-len" "$MAX_MODEL_LEN" - ) - - # Use the array expansion "${common_args_array[@]}" - # This correctly passes each element as a separate argument. 
- if [[ -n "$profile_dir" ]]; then - # Start server with profiling enabled - VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir \ - vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 & - else - # Start server without profiling - VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 \ - vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 & - fi - local server_pid=$! - - # wait for 10 minutes... - server_started=0 - for i in {1..60}; do - # This line checks whether the server is still alive or not, - # since that we should always have permission to send signal to the server process. - kill -0 $server_pid 2> /dev/null || break - - RESPONSE=$(curl -s -X GET "http://0.0.0.0:8004/health" -w "%{http_code}" -o /dev/stdout) - STATUS_CODE=$(echo "$RESPONSE" | tail -n 1) - if [[ "$STATUS_CODE" -eq 200 ]]; then - server_started=1 - break - else - sleep 10 - fi - done - - if (( ! server_started )); then - echo "server did not start within 10 minutes or crashed. Please check server log at $vllm_log". - return 1 - else - return 0 - fi -} - -run_benchmark() { - local max_num_seqs=$1 - local max_num_batched_tokens=$2 - local gpu_memory_utilization=$3 - echo "max_num_seq: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens" - local vllm_log="$LOG_FOLDER/vllm_log_${max_num_seqs}_${max_num_batched_tokens}.txt" - echo "vllm_log: $vllm_log" - echo - rm -f $vllm_log - pkill -if vllm - - echo "starting server..." - # Call start_server without a profile_dir to avoid profiling overhead - start_server $gpu_memory_utilization $max_num_seqs $max_num_batched_tokens $vllm_log "" - result=$? - if [[ "$result" -eq 1 ]]; then - echo "server failed to start. gpu_memory_utilization:$gpu_memory_utilization, max_num_seqs:$max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens" - else - echo "server started." - fi - echo - - echo "run benchmark test..." - meet_latency_requirement=0 - # get a basic qps by using request-rate inf - bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_inf.txt" - prefix_len=$(( INPUT_LEN * MIN_CACHE_HIT_PCT / 100 )) - adjusted_input_len=$(( INPUT_LEN - prefix_len )) - # --profile flag is removed from this call - vllm bench serve \ - --backend vllm \ - --model $MODEL \ - --dataset-name random \ - --random-input-len $adjusted_input_len \ - --random-output-len $OUTPUT_LEN \ - --ignore-eos \ - --disable-tqdm \ - --request-rate inf \ - --percentile-metrics ttft,tpot,itl,e2el \ - --goodput e2el:$MAX_LATENCY_ALLOWED_MS \ - --num-prompts 1000 \ - --random-prefix-len $prefix_len \ - --port 8004 &> "$bm_log" - throughput=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g') - e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}') - goodput=$(grep "Request goodput (req/s):" "$bm_log" | sed 's/[^0-9.]//g') - - if (( $(echo "$e2el <= $MAX_LATENCY_ALLOWED_MS" | bc -l) )); then - meet_latency_requirement=1 - request_rate=inf - fi - - if (( ! 
meet_latency_requirement )); then - # start from request-rate as int(throughput) + 1 - request_rate=$((${throughput%.*} + 1)) - while ((request_rate > 0)); do - # clear prefix cache - curl -X POST http://0.0.0.0:8004/reset_prefix_cache - sleep 5 - bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_${request_rate}.txt" - vllm bench serve \ - --backend vllm \ - --model $MODEL \ - --dataset-name random \ - --random-input-len $adjusted_input_len \ - --random-output-len $OUTPUT_LEN \ - --ignore-eos \ - --disable-tqdm \ - --request-rate $request_rate \ - --percentile-metrics ttft,tpot,itl,e2el \ - --goodput e2el:$MAX_LATENCY_ALLOWED_MS \ - --num-prompts 100 \ - --random-prefix-len $prefix_len \ - --port 8004 &> "$bm_log" - throughput=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g') - e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}') - goodput=$(grep "Request goodput (req/s):" "$bm_log" | sed 's/[^0-9.]//g') - if (( $(echo "$e2el <= $MAX_LATENCY_ALLOWED_MS" | bc -l) )); then - meet_latency_requirement=1 - break - fi - request_rate=$((request_rate-1)) - done - fi - # write the results and update the best result. - if ((meet_latency_requirement)); then - echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens, request_rate: $request_rate, e2el: $e2el, throughput: $throughput, goodput: $goodput" - echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens, request_rate: $request_rate, e2el: $e2el, throughput: $throughput, goodput: $goodput" >> "$RESULT" - if (( $(echo "$throughput > $best_throughput" | bc -l) )); then - best_throughput=$throughput - best_max_num_seqs=$max_num_seqs - best_num_batched_tokens=$max_num_batched_tokens - best_goodput=$goodput - best_request_rate=$request_rate - fi - else - echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens does not meet latency requirement ${MAX_LATENCY_ALLOWED_MS}" - echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens does not meet latency requirement ${MAX_LATENCY_ALLOWED_MS}" >> "$RESULT" - fi - - echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput" - - pkill -if vllm - sleep 10 - echo "====================" - return 0 -} - -read -r -a num_seqs_list <<< "$NUM_SEQS_LIST" -read -r -a num_batched_tokens_list <<< "$NUM_BATCHED_TOKENS_LIST" - -# first find out the max gpu-memory-utilization without HBM OOM. -gpu_memory_utilization=0.98 -find_gpu_memory_utilization=0 -while (( $(echo "$gpu_memory_utilization >= 0.9" | bc -l) )); do - # Pass empty string for profile_dir argument - start_server $gpu_memory_utilization "${num_seqs_list[-1]}" "${num_batched_tokens_list[-1]}" "$LOG_FOLDER/vllm_log_gpu_memory_utilization_$gpu_memory_utilization.log" "" - result=$? - if [[ "$result" -eq 0 ]]; then - find_gpu_memory_utilization=1 - break - else - gpu_memory_utilization=$(echo "$gpu_memory_utilization - 0.01" | bc) - fi -done - -if [[ "$find_gpu_memory_utilization" -eq 1 ]]; then - echo "Using gpu_memory_utilization=$gpu_memory_utilization to serve model." -else - echo "Cannot find a proper gpu_memory_utilization over 0.9 to serve the model, please check logs in $LOG_FOLDER." 
- exit 1 -fi - -for num_seqs in "${num_seqs_list[@]}"; do - for num_batched_tokens in "${num_batched_tokens_list[@]}"; do - run_benchmark $num_seqs $num_batched_tokens $gpu_memory_utilization - done -done -echo "finish permutations" - -# ================================================================================= -# FINAL PROFILING RUN FOR THE BEST CONFIGURATION -# ================================================================================= -if (( $(echo "$best_throughput > 0" | bc -l) )); then - echo - echo "Benchmark tuning finished. Now running profiling on the best configuration found..." - echo "Best config: max_num_seqs: $best_max_num_seqs, max_num_batched_tokens: $best_num_batched_tokens, throughput: $best_throughput" - echo - - vllm_log="$LOG_FOLDER/vllm_log_BEST_PROFILE.txt" - bm_log="$LOG_FOLDER/bm_log_BEST_PROFILE.txt" - - # Start server with the best params and profiling ENABLED - echo "Starting server for profiling..." - start_server $gpu_memory_utilization $best_max_num_seqs $best_num_batched_tokens "$vllm_log" "$PROFILE_PATH" - - # Run benchmark with the best params and the --profile flag - echo "Running benchmark with profiling..." - prefix_len=$(( INPUT_LEN * MIN_CACHE_HIT_PCT / 100 )) - adjusted_input_len=$(( INPUT_LEN - prefix_len )) - vllm bench serve \ - --backend vllm \ - --model $MODEL \ - --dataset-name random \ - --random-input-len $adjusted_input_len \ - --random-output-len $OUTPUT_LEN \ - --ignore-eos \ - --disable-tqdm \ - --request-rate $best_request_rate \ - --percentile-metrics ttft,tpot,itl,e2el \ - --goodput e2el:$MAX_LATENCY_ALLOWED_MS \ - --num-prompts 100 \ - --random-prefix-len $prefix_len \ - --port 8004 \ - --profile &> "$bm_log" -else - echo "No configuration met the latency requirements. Skipping final profiling run." -fi -pkill -if vllm -echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH" -echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH" >> "$RESULT" diff --git a/benchmarks/auto_tune/batch_auto_tune.sh b/benchmarks/auto_tune/batch_auto_tune.sh deleted file mode 100644 index 57ef20daf..000000000 --- a/benchmarks/auto_tune/batch_auto_tune.sh +++ /dev/null @@ -1,128 +0,0 @@ -#!/bin/bash - -INPUT_JSON="$1" -GCS_PATH="$2" # Optional GCS path for uploading results for each run - -SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd) -AUTOTUNE_SCRIPT="$SCRIPT_DIR/auto_tune.sh" - -if [[ -z "$INPUT_JSON" ]]; then - echo "Error: Input JSON file not provided." - echo "Usage: $0 [gcs_upload_path]" - exit 1 -fi - -if [[ ! -f "$INPUT_JSON" ]]; then - echo "Error: File not found at '$INPUT_JSON'" - exit 1 -fi - -if ! command -v jq &> /dev/null; then - echo "Error: 'jq' command not found. Please install jq to process the JSON input." - exit 1 -fi - -if [[ -n "$GCS_PATH" ]] && ! command -v gcloud &> /dev/null; then - echo "Error: 'gcloud' command not found, but a GCS_PATH was provided." - exit 1 -fi - -SUCCESS_COUNT=0 -FAILURE_COUNT=0 -FAILED_RUNS=() -SCRIPT_START_TIME=$(date +%s) - -json_content=$(cat "$INPUT_JSON") -if ! num_runs=$(echo "$json_content" | jq 'length'); then - echo "Error: Invalid JSON in $INPUT_JSON. 'jq' failed to get array length." >&2 - exit 1 -fi - -echo "Found $num_runs benchmark configurations in $INPUT_JSON." -echo "Starting benchmark runs..." 
-echo "--------------------------------------------------" - -for i in $(seq 0 $(($num_runs - 1))); do - run_object=$(echo "$json_content" | jq ".[$i]") - - RUN_START_TIME=$(date +%s) - ENV_VARS_ARRAY=() - # Dynamically create env vars from the JSON object's keys - for key in $(echo "$run_object" | jq -r 'keys_unsorted[]'); do - value=$(echo "$run_object" | jq -r ".$key") - var_name=$(echo "$key" | tr '[:lower:]' '[:upper:]' | tr -cd 'A-Z0-9_') - ENV_VARS_ARRAY+=("${var_name}=${value}") - done - - echo "Executing run #$((i+1))/$num_runs with parameters: ${ENV_VARS_ARRAY[*]}" - - # Execute auto_tune.sh and capture output - RUN_OUTPUT_FILE=$(mktemp) - if env "${ENV_VARS_ARRAY[@]}" bash "$AUTOTUNE_SCRIPT" > >(tee -a "$RUN_OUTPUT_FILE") 2>&1; then - STATUS="SUCCESS" - ((SUCCESS_COUNT++)) - else - STATUS="FAILURE" - ((FAILURE_COUNT++)) - FAILED_RUNS+=("Run #$((i+1)): $(echo $run_object | jq -c .)") - fi - - RUN_OUTPUT=$(<"$RUN_OUTPUT_FILE") - rm "$RUN_OUTPUT_FILE" - - # Parse results and optionally upload them to GCS - RUN_ID="" - RESULTS="" - GCS_RESULTS_URL="" - if [[ "$STATUS" == "SUCCESS" ]]; then - RESULT_FILE_PATH=$(echo "$RUN_OUTPUT" | grep 'RESULT_FILE=' | tail -n 1 | cut -d'=' -f2 | tr -s '/' || true) - - if [[ -n "$RESULT_FILE_PATH" && -f "$RESULT_FILE_PATH" ]]; then - RUN_ID=$(basename "$(dirname "$RESULT_FILE_PATH")") - RESULT_DIR=$(dirname "$RESULT_FILE_PATH") - RESULTS=$(cat "$RESULT_FILE_PATH") - - if [[ -n "$GCS_PATH" ]]; then - GCS_RESULTS_URL="${GCS_PATH}/${RUN_ID}" - echo "Uploading results to GCS..." - if gcloud storage rsync --recursive "$RESULT_DIR/" "$GCS_RESULTS_URL"; then - echo "GCS upload successful." - else - echo "Warning: GCS upload failed for RUN_ID $RUN_ID." - fi - fi - else - echo "Warning: Could not find result file for a successful run." - STATUS="WARNING_NO_RESULT_FILE" - fi - fi - - # Add the results back into the JSON object for this run - json_content=$(echo "$json_content" | jq --argjson i "$i" --arg run_id "$RUN_ID" --arg status "$STATUS" --arg results "$RESULTS" --arg gcs_results "$GCS_RESULTS_URL" \ - '.[$i] += {run_id: $run_id, status: $status, results: $results, gcs_results: $gcs_results}') - - RUN_END_TIME=$(date +%s) - echo "Run finished in $((RUN_END_TIME - RUN_START_TIME)) seconds. Status: $STATUS" - echo "--------------------------------------------------" - - # Save intermediate progress back to the file - echo "$json_content" > "$INPUT_JSON.tmp" && mv "$INPUT_JSON.tmp" "$INPUT_JSON" - -done - -SCRIPT_END_TIME=$(date +%s) -echo "All benchmark runs completed in $((SCRIPT_END_TIME - SCRIPT_START_TIME)) seconds." -echo -echo "====================== SUMMARY ======================" -echo "Successful runs: $SUCCESS_COUNT" -echo "Failed runs: $FAILURE_COUNT" -echo "===================================================" - -if [[ $FAILURE_COUNT -gt 0 ]]; then - echo "Details of failed runs (see JSON file for full parameters):" - for failed in "${FAILED_RUNS[@]}"; do - echo " - $failed" - done -fi - -echo "Updated results have been saved to '$INPUT_JSON'." 
diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py deleted file mode 100644 index ba7c733be..000000000 --- a/benchmarks/backend_request_func.py +++ /dev/null @@ -1,658 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import io -import json -import os -import sys -import time -import traceback -from dataclasses import dataclass, field -from typing import Optional, Union - -import aiohttp -import huggingface_hub.constants -from tqdm.asyncio import tqdm -from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast - -# NOTE(simon): do not import vLLM here so the benchmark script -# can run without vLLM installed. - -AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60) - - -@dataclass -class RequestFuncInput: - prompt: str - api_url: str - prompt_len: int - output_len: int - model: str - model_name: Optional[str] = None - logprobs: Optional[int] = None - extra_body: Optional[dict] = None - multi_modal_content: Optional[dict | list[dict]] = None - ignore_eos: bool = False - language: Optional[str] = None - request_id: Optional[str] = None - - -@dataclass -class RequestFuncOutput: - generated_text: str = "" - success: bool = False - latency: float = 0.0 - output_tokens: int = 0 - ttft: float = 0.0 # Time to first token - itl: list[float] = field(default_factory=list) # list of inter-token latencies - tpot: float = 0.0 # avg next-token latencies - prompt_len: int = 0 - error: str = "" - - -async def async_request_tgi( - request_func_input: RequestFuncInput, - pbar: Optional[tqdm] = None, -) -> RequestFuncOutput: - api_url = request_func_input.api_url - assert api_url.endswith("generate_stream") - - async with aiohttp.ClientSession( - trust_env=True, timeout=AIOHTTP_TIMEOUT - ) as session: - params = { - "max_new_tokens": request_func_input.output_len, - "do_sample": True, - "temperature": 0.01, # TGI does not accept 0.0 temperature. - "top_p": 0.99, # TGI does not accept 1.0 top_p. - "truncate": request_func_input.prompt_len, - "ignore_eos_token": request_func_input.ignore_eos, - } - payload = { - "inputs": request_func_input.prompt, - "parameters": params, - } - headers = None - if request_func_input.request_id: - headers = {"x-request-id": request_func_input.request_id} - output = RequestFuncOutput() - output.prompt_len = request_func_input.prompt_len - if request_func_input.ignore_eos: - output.output_tokens = request_func_input.output_len - else: - output.output_tokens = None - - ttft = 0.0 - st = time.perf_counter() - most_recent_timestamp = st - try: - async with session.post( - url=api_url, json=payload, headers=headers - ) as response: - if response.status == 200: - async for chunk_bytes in response.content: - chunk_bytes = chunk_bytes.strip() - if not chunk_bytes: - continue - chunk_bytes = chunk_bytes.decode("utf-8") - - # NOTE: Sometimes TGI returns a ping response without - # any data, we should skip it. 
- if chunk_bytes.startswith(":"): - continue - chunk = chunk_bytes.removeprefix("data:") - - data = json.loads(chunk) - timestamp = time.perf_counter() - # First token - if ttft == 0.0: - ttft = time.perf_counter() - st - output.ttft = ttft - - # Decoding phase - else: - output.itl.append(timestamp - most_recent_timestamp) - - most_recent_timestamp = timestamp - - output.latency = most_recent_timestamp - st - output.success = True - output.generated_text = data["generated_text"] - else: - output.error = response.reason or "" - output.success = False - except Exception: - output.success = False - exc_info = sys.exc_info() - output.error = "".join(traceback.format_exception(*exc_info)) - - if pbar: - pbar.update(1) - return output - - -async def async_request_trt_llm( - request_func_input: RequestFuncInput, - pbar: Optional[tqdm] = None, -) -> RequestFuncOutput: - api_url = request_func_input.api_url - assert api_url.endswith("generate_stream") - - async with aiohttp.ClientSession( - trust_env=True, timeout=AIOHTTP_TIMEOUT - ) as session: - payload = { - "accumulate_tokens": True, - "text_input": request_func_input.prompt, - "temperature": 0.0, - "top_p": 1.0, - "max_tokens": request_func_input.output_len, - "stream": True, - } - if request_func_input.ignore_eos: - payload["min_length"] = request_func_input.output_len - headers = None - if request_func_input.request_id: - headers = {"x-request-id": request_func_input.request_id} - output = RequestFuncOutput() - output.prompt_len = request_func_input.prompt_len - - ttft = 0.0 - st = time.perf_counter() - most_recent_timestamp = st - try: - async with session.post( - url=api_url, json=payload, headers=headers - ) as response: - if response.status == 200: - async for chunk_bytes in response.content: - chunk_bytes = chunk_bytes.strip() - if not chunk_bytes: - continue - - chunk = chunk_bytes.decode("utf-8").removeprefix("data:") - - data = json.loads(chunk) - output.generated_text += data["text_output"] - timestamp = time.perf_counter() - # First token - if ttft == 0.0: - ttft = timestamp - st - output.ttft = ttft - - # Decoding phase - else: - output.itl.append(timestamp - most_recent_timestamp) - - most_recent_timestamp = timestamp - - output.latency = most_recent_timestamp - st - output.success = True - - else: - output.error = response.reason or "" - output.success = False - except Exception: - output.success = False - exc_info = sys.exc_info() - output.error = "".join(traceback.format_exception(*exc_info)) - - if pbar: - pbar.update(1) - return output - - -async def async_request_deepspeed_mii( - request_func_input: RequestFuncInput, - pbar: Optional[tqdm] = None, -) -> RequestFuncOutput: - api_url = request_func_input.api_url - assert api_url.endswith(("completions", "profile")), ( - "OpenAI Completions API URL must end with 'completions' or 'profile'." - ) - - async with aiohttp.ClientSession( - trust_env=True, timeout=AIOHTTP_TIMEOUT - ) as session: - payload = { - "model": request_func_input.model, - "prompt": request_func_input.prompt, - "max_tokens": request_func_input.output_len, - "temperature": 0.01, # deepspeed-mii does not accept 0.0 temp. - "top_p": 1.0, - } - headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"} - if request_func_input.request_id: - headers["x-request-id"] = request_func_input.request_id - - output = RequestFuncOutput() - output.prompt_len = request_func_input.prompt_len - - # NOTE: DeepSpeed-MII doesn't support streaming as of Jan 28 2024, - # will use 0 as placeholder. 
- # See https://github.com/microsoft/DeepSpeed-MII/pull/311 - output.ttft = 0 - - st = time.perf_counter() - try: - async with session.post( - url=api_url, json=payload, headers=headers - ) as response: - if response.status == 200: - parsed_resp = await response.json() - output.latency = time.perf_counter() - st - if "choices" in parsed_resp: - output.generated_text = parsed_resp["choices"][0]["text"] - elif "text" in parsed_resp: - output.generated_text = parsed_resp["text"][0] - else: - output.error = ( - "Unexpected response format: " - "neither 'choices' nor 'text' found" - ) - output.success = False - output.success = True - else: - output.error = response.reason or "" - output.success = False - except Exception: - output.success = False - exc_info = sys.exc_info() - output.error = "".join(traceback.format_exception(*exc_info)) - - if pbar: - pbar.update(1) - return output - - -async def async_request_openai_completions( - request_func_input: RequestFuncInput, - pbar: Optional[tqdm] = None, -) -> RequestFuncOutput: - api_url = request_func_input.api_url - assert api_url.endswith(("completions", "profile")), ( - "OpenAI Completions API URL must end with 'completions' or 'profile'." - ) - - async with aiohttp.ClientSession( - trust_env=True, timeout=AIOHTTP_TIMEOUT - ) as session: - payload = { - "model": request_func_input.model_name - if request_func_input.model_name - else request_func_input.model, - "prompt": request_func_input.prompt, - "temperature": 0.0, - "repetition_penalty": 1.0, - "max_tokens": request_func_input.output_len, - "logprobs": request_func_input.logprobs, - "stream": True, - "stream_options": { - "include_usage": True, - }, - } - if request_func_input.ignore_eos: - payload["ignore_eos"] = request_func_input.ignore_eos - if request_func_input.extra_body: - payload.update(request_func_input.extra_body) - headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"} - if request_func_input.request_id: - headers["x-request-id"] = request_func_input.request_id - - output = RequestFuncOutput() - output.prompt_len = request_func_input.prompt_len - - generated_text = "" - st = time.perf_counter() - most_recent_timestamp = st - try: - async with session.post( - url=api_url, json=payload, headers=headers - ) as response: - if response.status == 200: - first_chunk_received = False - async for chunk_bytes in response.content: - chunk_bytes = chunk_bytes.strip() - if not chunk_bytes: - continue - - chunk = chunk_bytes.decode("utf-8").removeprefix("data: ") - if chunk != "[DONE]": - data = json.loads(chunk) - - # NOTE: Some completion API might have a last - # usage summary response without a token so we - # want to check a token was generated - if choices := data.get("choices"): - # Note that text could be empty here - # e.g. for special tokens - text = choices[0].get("text") - timestamp = time.perf_counter() - # First token - if not first_chunk_received: - first_chunk_received = True - ttft = time.perf_counter() - st - output.ttft = ttft - - # Decoding phase - else: - output.itl.append(timestamp - most_recent_timestamp) - - most_recent_timestamp = timestamp - generated_text += text or "" - if usage := data.get("usage"): - output.output_tokens = usage.get("completion_tokens") - if first_chunk_received: - output.success = True - else: - output.success = False - output.error = ( - "Never received a valid chunk to calculate TTFT." - "This response will be marked as failed!" 
- ) - output.generated_text = generated_text - output.latency = most_recent_timestamp - st - else: - output.error = response.reason or "" - output.success = False - except Exception: - output.success = False - exc_info = sys.exc_info() - output.error = "".join(traceback.format_exception(*exc_info)) - - if pbar: - pbar.update(1) - return output - - -async def async_request_openai_chat_completions( - request_func_input: RequestFuncInput, - pbar: Optional[tqdm] = None, -) -> RequestFuncOutput: - api_url = request_func_input.api_url - assert api_url.endswith(("chat/completions", "profile")), ( - "OpenAI Chat Completions API URL must end with 'chat/completions'." - ) - - async with aiohttp.ClientSession( - trust_env=True, timeout=AIOHTTP_TIMEOUT - ) as session: - content = [{"type": "text", "text": request_func_input.prompt}] - if request_func_input.multi_modal_content: - mm_content = request_func_input.multi_modal_content - if isinstance(mm_content, list): - content.extend(mm_content) - elif isinstance(mm_content, dict): - content.append(mm_content) - else: - raise TypeError( - "multi_modal_content must be a dict or list[dict] for openai-chat" - ) - payload = { - "model": request_func_input.model_name - if request_func_input.model_name - else request_func_input.model, - "messages": [ - {"role": "user", "content": content}, - ], - "temperature": 0.0, - "max_completion_tokens": request_func_input.output_len, - "stream": True, - "stream_options": { - "include_usage": True, - }, - } - if request_func_input.ignore_eos: - payload["ignore_eos"] = request_func_input.ignore_eos - if request_func_input.extra_body: - payload.update(request_func_input.extra_body) - headers = { - "Content-Type": "application/json", - "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}", - } - if request_func_input.request_id: - headers["x-request-id"] = request_func_input.request_id - - output = RequestFuncOutput() - output.prompt_len = request_func_input.prompt_len - - generated_text = "" - ttft = 0.0 - st = time.perf_counter() - most_recent_timestamp = st - try: - async with session.post( - url=api_url, json=payload, headers=headers - ) as response: - if response.status == 200: - async for chunk_bytes in response.content: - chunk_bytes = chunk_bytes.strip() - if not chunk_bytes: - continue - chunk_bytes = chunk_bytes.decode("utf-8") - # NOTE: SSE comments (often used as pings) start with a colon. - # These are not JSON data payload and should be skipped. 
- if chunk_bytes.startswith(":"): - continue - - chunk = chunk_bytes.removeprefix("data: ") - - if chunk != "[DONE]": - timestamp = time.perf_counter() - data = json.loads(chunk) - - if choices := data.get("choices"): - content = choices[0]["delta"].get("content") - # First token - if ttft == 0.0: - ttft = timestamp - st - output.ttft = ttft - - # Decoding phase - else: - output.itl.append(timestamp - most_recent_timestamp) - - generated_text += content or "" - elif usage := data.get("usage"): - output.output_tokens = usage.get("completion_tokens") - - most_recent_timestamp = timestamp - - output.generated_text = generated_text - output.success = True - output.latency = most_recent_timestamp - st - else: - output.error = response.reason or "" - output.success = False - except Exception: - output.success = False - exc_info = sys.exc_info() - output.error = "".join(traceback.format_exception(*exc_info)) - - if pbar: - pbar.update(1) - return output - - -async def async_request_openai_audio( - request_func_input: RequestFuncInput, - pbar: Optional[tqdm] = None, -) -> RequestFuncOutput: - # Lazy import without PlaceholderModule to avoid vllm dep. - import soundfile - - api_url = request_func_input.api_url - assert api_url.endswith(("transcriptions", "translations")), ( - "OpenAI Chat Completions API URL must end with 'transcriptions' " - ) - "or `translations`." - - async with aiohttp.ClientSession( - trust_env=True, timeout=AIOHTTP_TIMEOUT - ) as session: - content = [{"type": "text", "text": request_func_input.prompt}] - payload = { - "model": request_func_input.model_name - if request_func_input.model_name - else request_func_input.model, - "temperature": 0.0, - "max_completion_tokens": request_func_input.output_len, - "stream": True, - "language": "en", - # Flattened due to multipart/form-data - "stream_include_usage": True, - "stream_continuous_usage_stats": True, - } - if request_func_input.extra_body: - payload.update(request_func_input.extra_body) - headers = { - "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}", - } - if request_func_input.request_id: - headers["x-request-id"] = request_func_input.request_id - - # Send audio file - def to_bytes(y, sr): - buffer = io.BytesIO() - soundfile.write(buffer, y, sr, format="WAV") - buffer.seek(0) - return buffer - - mm_audio = request_func_input.multi_modal_content - if not isinstance(mm_audio, dict) or "audio" not in mm_audio: - raise TypeError("multi_modal_content must be a dict containing 'audio'") - with to_bytes(*mm_audio["audio"]) as f: - form = aiohttp.FormData() - form.add_field("file", f, content_type="audio/wav") - for key, value in payload.items(): - form.add_field(key, str(value)) - - output = RequestFuncOutput() - output.prompt_len = request_func_input.prompt_len - - generated_text = "" - ttft = 0.0 - st = time.perf_counter() - most_recent_timestamp = st - try: - async with session.post( - url=api_url, data=form, headers=headers - ) as response: - if response.status == 200: - async for chunk_bytes in response.content: - chunk_bytes = chunk_bytes.strip() - if not chunk_bytes: - continue - - chunk = chunk_bytes.decode("utf-8").removeprefix("data: ") - if chunk != "[DONE]": - timestamp = time.perf_counter() - data = json.loads(chunk) - - if choices := data.get("choices"): - content = choices[0]["delta"].get("content") - # First token - if ttft == 0.0: - ttft = timestamp - st - output.ttft = ttft - - # Decoding phase - else: - output.itl.append( - timestamp - most_recent_timestamp - ) - - generated_text += content or "" - 
elif usage := data.get("usage"): - output.output_tokens = usage.get( - "completion_tokens" - ) - - most_recent_timestamp = timestamp - - output.generated_text = generated_text - output.success = True - output.latency = most_recent_timestamp - st - else: - output.error = response.reason or "" - output.success = False - except Exception: - output.success = False - exc_info = sys.exc_info() - output.error = "".join(traceback.format_exception(*exc_info)) - - if pbar: - pbar.update(1) - return output - - -def get_model(pretrained_model_name_or_path: str) -> str: - if os.getenv("VLLM_USE_MODELSCOPE", "False").lower() == "true": - from modelscope import snapshot_download - - from vllm.model_executor.model_loader.weight_utils import get_lock - - # Use file lock to prevent multiple processes from - # downloading the same model weights at the same time. - with get_lock(pretrained_model_name_or_path): - model_path = snapshot_download( - model_id=pretrained_model_name_or_path, - local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE, - ignore_file_pattern=[".*.pt", ".*.safetensors", ".*.bin"], - ) - - return model_path - return pretrained_model_name_or_path - - -def get_tokenizer( - pretrained_model_name_or_path: str, - tokenizer_mode: str = "auto", - trust_remote_code: bool = False, - **kwargs, -) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]: - if pretrained_model_name_or_path is not None and not os.path.exists( - pretrained_model_name_or_path - ): - pretrained_model_name_or_path = get_model(pretrained_model_name_or_path) - if tokenizer_mode == "slow": - if kwargs.get("use_fast", False): - raise ValueError("Cannot use the fast tokenizer in slow tokenizer mode.") - kwargs["use_fast"] = False - if tokenizer_mode == "mistral": - try: - from vllm.transformers_utils.tokenizer import MistralTokenizer - except ImportError as e: - raise ImportError( - "MistralTokenizer requires vllm package.\n" - "Please install it with `pip install vllm` " - "to use mistral tokenizer mode." 
- ) from e - return MistralTokenizer.from_pretrained(str(pretrained_model_name_or_path)) - else: - return AutoTokenizer.from_pretrained( - pretrained_model_name_or_path, - trust_remote_code=trust_remote_code, - **kwargs, - ) - - -ASYNC_REQUEST_FUNCS = { - "tgi": async_request_tgi, - "vllm": async_request_openai_completions, - "lmdeploy": async_request_openai_completions, - "deepspeed-mii": async_request_deepspeed_mii, - "openai": async_request_openai_completions, - "openai-chat": async_request_openai_chat_completions, - "openai-audio": async_request_openai_audio, - "tensorrt-llm": async_request_trt_llm, - "scalellm": async_request_openai_completions, - "sglang": async_request_openai_completions, - "llama.cpp": async_request_openai_completions, -} - -OPENAI_COMPATIBLE_BACKENDS = [ - k - for k, v in ASYNC_REQUEST_FUNCS.items() - if v in (async_request_openai_completions, async_request_openai_chat_completions) -] diff --git a/benchmarks/benchmark_block_pool.py b/benchmarks/benchmark_block_pool.py deleted file mode 100644 index eae8d9927..000000000 --- a/benchmarks/benchmark_block_pool.py +++ /dev/null @@ -1,74 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import gc - -from tabulate import tabulate - -from benchmark_utils import TimeCollector -from vllm.utils import FlexibleArgumentParser -from vllm.v1.core.block_pool import BlockPool - - -def main(args): - rows = [] - for allocate_block in args.allocate_blocks: - # Enforce a GC collect ahead to minimize the impact among runs - gc.collect() - block_pool = BlockPool(num_gpu_blocks=args.num_gpu_blocks, enable_caching=True) - - get_blocks_times = TimeCollector(TimeCollector.US) - free_blocks_times = TimeCollector(TimeCollector.US) - for _ in range(args.num_iteration): - with get_blocks_times: - blocks = block_pool.get_new_blocks(allocate_block) - with free_blocks_times: - block_pool.free_blocks(blocks) - - rows.append( - [get_blocks_times.cnt, args.num_gpu_blocks, allocate_block] - + get_blocks_times.dump_avg_max() - + free_blocks_times.dump_avg_max() - ) - - print( - tabulate( - rows, - headers=[ - "Iterations", - "Total\nBlocks", - "Allocated\nBlocks", - "Get Blocks\nAvg (us)", - "Get Blocks\nMax (us)", - "Free Blocks\nAvg (us)", - "Free Blocks\nMax (us)", - ], - tablefmt="grid", - floatfmt=".3f", - ) - ) - - -def invoke_main() -> None: - parser = FlexibleArgumentParser( - description="Benchmark the performance of BlockPool for KV Cache." - ) - parser.add_argument("--num-gpu-blocks", type=int, default=100000) - parser.add_argument( - "--num-iteration", - type=int, - default=1000, - help="Number of iterations to run to stabilize final data readings", - ) - parser.add_argument( - "--allocate-blocks", - type=int, - nargs="*", - default=[10, 50, 100, 500, 1000], - help="Number of blocks to allocate", - ) - args = parser.parse_args() - main(args) - - -if __name__ == "__main__": - invoke_main() # pragma: no cover diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py deleted file mode 100644 index a7892f3f7..000000000 --- a/benchmarks/benchmark_latency.py +++ /dev/null @@ -1,17 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import sys - -if __name__ == "__main__": - print("""DEPRECATED: This script has been moved to the vLLM CLI. 
- -Please use the following command instead: - vllm bench latency - -For help with the new command, run: - vllm bench latency --help - -Alternatively, you can run the new command directly with: - python -m vllm.entrypoints.cli.main bench latency --help -""") - sys.exit(1) diff --git a/benchmarks/benchmark_long_document_qa_throughput.py b/benchmarks/benchmark_long_document_qa_throughput.py deleted file mode 100644 index 6e0f3b51c..000000000 --- a/benchmarks/benchmark_long_document_qa_throughput.py +++ /dev/null @@ -1,202 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -""" -Offline benchmark to test the long document QA throughput. - -Example usage: - # This workload samples 8 different prompts with a default input - # length of 20000 tokens, then replicates each prompt 2 times - # in random order. - python benchmark_long_document_qa_throughput.py \ - --model meta-llama/Llama-2-7b-chat-hf \ - --enable-prefix-caching \ - --num-documents 8 \ - --repeat-count 2 - -Commandline arguments: - --num-documents: The number of documents to sample prompts from. - - --document-length: The length of each document in tokens. - (Optional, default: 20000) - - --output-len: The number of tokens to generate for each prompt. - (Optional, default: 10) - - --repeat-count: The number of times to repeat each prompt. - (Optional, default: 2) - - --repeat-mode: The mode to repeat prompts. The supported modes are: - - 'random': shuffle the prompts randomly. (Default) - - 'tile': the entire prompt list is repeated in sequence. (Potentially - lowest cache hit) - - 'interleave': each prompt is repeated consecutively before - moving to the next element. (Highest cache hit) - - --shuffle-seed: Random seed when the repeat mode is "random". - (Optional, default: 0) - -In the meantime, it also supports all the vLLM engine args to initialize the -LLM engine. You can refer to the `vllm.engine.arg_utils.EngineArgs` for more -details. -""" - -import dataclasses -import random -import time - -from vllm import LLM, SamplingParams -from vllm.engine.arg_utils import EngineArgs -from vllm.utils import FlexibleArgumentParser - - -def test_long_document_qa(llm=None, sampling_params=None, prompts=None): - """ - Test long document QA with the given prompts and sampling parameters. - Print the time spent in processing all the prompts. - - Args: - llm: The language model used for generating responses. - sampling_params: Sampling parameter used to generate the response. - prompts: A list of prompt strings to be processed by the LLM. - """ - start_time = time.time() - llm.generate(prompts, sampling_params=sampling_params) - end_time = time.time() - print(f"Time to execute all requests: {end_time - start_time:.4f} secs") - - -def repeat_prompts(prompts, repeat_count, mode: str): - """ - Repeat each prompt in the list for a specified number of times. - The order of prompts in the output list depends on the mode. - - Args: - prompts: A list of prompts to be repeated. - repeat_count: The number of times each prompt is repeated. - mode: The mode of repetition. Supported modes are: - - 'random': Shuffle the prompts randomly after repetition. - - 'tile': Repeat the entire prompt list in sequence. - Example: [1, 2, 3] -> [1, 2, 3, 1, 2, 3]. - - 'interleave': Repeat each prompt consecutively before moving to - the next. Example: [1, 2, 3] -> [1, 1, 2, 2, 3, 3]. - - Returns: - A list of repeated prompts in the specified order. - - Raises: - ValueError: If an invalid mode is provided. 
- """ - print("Repeat mode: ", mode) - if mode == "random": - repeated_prompts = prompts * repeat_count - random.shuffle(repeated_prompts) - return repeated_prompts - elif mode == "tile": - return prompts * repeat_count - elif mode == "interleave": - repeated_prompts = [] - for prompt in prompts: - repeated_prompts.extend([prompt] * repeat_count) - return repeated_prompts - else: - raise ValueError( - f"Invalid mode: {mode}, only support 'random', 'tile', 'interleave'" - ) - - -def main(args): - random.seed(args.shuffle_seed) - - # Prepare the prompts: - # we append the document id at the beginning to avoid any of the document - # being the prefix of other documents - prompts = [ - str(i) + " ".join(["hi"] * args.document_length) - for i in range(args.num_documents) - ] - - prompts = repeat_prompts(prompts, args.repeat_count, mode=args.repeat_mode) - - warmup_prompts = [ - "This is warm up request " + str(i) + " ".join(["hi"] * args.document_length) - for i in range(args.num_documents) - ] - - # Create the LLM engine - engine_args = EngineArgs.from_cli_args(args) - llm = LLM(**dataclasses.asdict(engine_args)) - sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len) - - print("------warm up------") - test_long_document_qa( - llm=llm, - prompts=warmup_prompts, - sampling_params=sampling_params, - ) - - print("------start generating------") - test_long_document_qa( - llm=llm, - prompts=prompts, - sampling_params=sampling_params, - ) - - -def create_argument_parser(): - parser = FlexibleArgumentParser( - description="Benchmark the performance with or " - "without automatic prefix caching." - ) - - parser.add_argument( - "--document-length", - type=int, - # Roughly the number of tokens for a system paper, - # excluding images - default=20000, - help="Range of input lengths for sampling prompts, " - 'specified as "min:max" (e.g., "128:256").', - ) - - parser.add_argument( - "--num-documents", - type=int, - default=8, - help="Range of input lengths for sampling prompts, " - 'specified as "min:max" (e.g., "128:256").', - ) - - parser.add_argument("--output-len", type=int, default=10) - - parser.add_argument( - "--repeat-count", - type=int, - default=2, - help="Number of times to repeat each prompt", - ) - - parser.add_argument( - "--repeat-mode", - type=str, - default="random", - help="The mode to repeat prompts. The supported " - 'modes are "random", "tile", and "interleave". 
' - "See repeat_prompts() in the source code for details.", - ) - - parser.add_argument( - "--shuffle-seed", - type=int, - default=0, - help='Random seed when the repeat mode is "random"', - ) - - parser = EngineArgs.add_cli_args(parser) - - return parser - - -if __name__ == "__main__": - parser = create_argument_parser() - args = parser.parse_args() - main(args) diff --git a/benchmarks/benchmark_ngram_proposer.py b/benchmarks/benchmark_ngram_proposer.py deleted file mode 100644 index d4b83edbd..000000000 --- a/benchmarks/benchmark_ngram_proposer.py +++ /dev/null @@ -1,213 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import gc -import time -from unittest import mock - -import numpy as np -from tabulate import tabulate - -from benchmark_utils import TimeCollector -from vllm.config import ( - CacheConfig, - DeviceConfig, - LoadConfig, - ModelConfig, - ParallelConfig, - SchedulerConfig, - SpeculativeConfig, - VllmConfig, -) -from vllm.platforms import current_platform -from vllm.utils import FlexibleArgumentParser -from vllm.v1.spec_decode.ngram_proposer import NgramProposer -from vllm.v1.worker.gpu_input_batch import InputBatch -from vllm.v1.worker.gpu_model_runner import GPUModelRunner - - -def benchmark_propose(args): - rows = [] - for max_ngram in args.max_ngram: - collector = TimeCollector(TimeCollector.US) - - model_config = ModelConfig( - model="facebook/opt-125m", - task="generate", - max_model_len=args.num_token + args.num_spec_token, - tokenizer="facebook/opt-125m", - tokenizer_mode="auto", - dtype="auto", - seed=None, - trust_remote_code=False, - ) - proposer = NgramProposer( - vllm_config=VllmConfig( - model_config=model_config, - speculative_config=SpeculativeConfig( - prompt_lookup_min=args.min_ngram, - prompt_lookup_max=max_ngram, - num_speculative_tokens=args.num_spec_token, - method="ngram", - ), - ) - ) - - # Warm up - proposer.propose(np.random.randint(0, 20, (args.num_token,))) - - gc.collect() - for _ in range(args.num_iteration): - tokens = np.random.randint(0, 20, (args.num_req, args.num_token)) - with collector: - for i in range(args.num_req): - proposer.propose(tokens[i, :]) - rows.append( - [args.num_req, args.num_token, args.min_ngram, max_ngram] - + collector.dump_avg_max() - ) - - print( - tabulate( - rows, - headers=[ - "# Request", - "# Token", - "Min Ngram", - "Max Ngram", - "Avg (us)", - "Max (us)", - ], - tablefmt="grid", - floatfmt=".3f", - ) - ) - - -def benchmark_batched_propose(args): - NUM_SPECULATIVE_TOKENS_NGRAM = 10 - PROMPT_LOOKUP_MIN = 5 - PROMPT_LOOKUP_MAX = 15 - MAX_MODEL_LEN = int(1e7) - DEVICE = current_platform.device_type - - model_config = ModelConfig(model="facebook/opt-125m", runner="generate") - - speculative_config = SpeculativeConfig( - target_model_config=model_config, - target_parallel_config=ParallelConfig(), - method="ngram", - num_speculative_tokens=NUM_SPECULATIVE_TOKENS_NGRAM, - prompt_lookup_max=PROMPT_LOOKUP_MAX, - prompt_lookup_min=PROMPT_LOOKUP_MIN, - ) - - vllm_config = VllmConfig( - model_config=model_config, - cache_config=CacheConfig(), - speculative_config=speculative_config, - device_config=DeviceConfig(device=current_platform.device_type), - parallel_config=ParallelConfig(), - load_config=LoadConfig(), - scheduler_config=SchedulerConfig(), - ) - - # monkey patch vllm.v1.worker.gpu_model_runner.get_pp_group - mock_pp_group = mock.MagicMock() - mock_pp_group.world_size = 1 - with mock.patch( - "vllm.v1.worker.gpu_model_runner.get_pp_group", 
return_value=mock_pp_group - ): - runner = GPUModelRunner(vllm_config, DEVICE) - - # hack max model len - runner.max_model_len = MAX_MODEL_LEN - runner.drafter.max_model_len = MAX_MODEL_LEN - - dummy_input_batch = InputBatch( - max_num_reqs=args.num_req, - max_model_len=MAX_MODEL_LEN, - max_num_batched_tokens=args.num_req * args.num_token, - device=DEVICE, - pin_memory=False, - vocab_size=256000, - block_sizes=[16], - ) - dummy_input_batch._req_ids = list(str(id) for id in range(args.num_req)) - dummy_input_batch.spec_decode_unsupported_reqs = () - dummy_input_batch.num_tokens_no_spec = [args.num_token] * args.num_req - dummy_input_batch.token_ids_cpu = np.random.randint( - 0, 20, (args.num_req, args.num_token) - ) - - runner.input_batch = dummy_input_batch - - sampled_token_ids = [[0]] * args.num_req - - print("Starting benchmark") - # first run is warmup so ignore it - for _ in range(args.num_iteration): - start = time.time() - runner.drafter.propose( - sampled_token_ids, - dummy_input_batch.req_ids, - dummy_input_batch.num_tokens_no_spec, - dummy_input_batch.token_ids_cpu, - dummy_input_batch.spec_decode_unsupported_reqs, - ) - end = time.time() - print(f"Iteration time (s): {end - start}") - - -def invoke_main() -> None: - parser = FlexibleArgumentParser( - description="Benchmark the performance of N-gram speculative decode drafting" - ) - parser.add_argument( - "--batched", action="store_true", help="consider time to prepare batch" - ) # noqa: E501 - parser.add_argument( - "--num-iteration", - type=int, - default=100, - help="Number of iterations to run to stabilize final data readings", - ) - parser.add_argument( - "--num-req", type=int, default=128, help="Number of requests in the batch" - ) - parser.add_argument( - "--num-token", type=int, default=1500, help="Number of tokens for each request" - ) - parser.add_argument( - "--min-ngram", - type=int, - default=3, - help="Minimum n-gram to match", - ) - parser.add_argument( - "--max-ngram", - type=int, - nargs="*", - default=[5, 7, 10, 15, 20], - help="Maximum n-gram to match", - ) - parser.add_argument( - "--num-spec-token", - type=int, - default=3, - help="Number of speculative tokens to generate", - ) - args = parser.parse_args() - - if not args.batched: - benchmark_propose(args) - else: - benchmark_batched_propose(args) - - -""" -# Example command lines: -# time python3 benchmarks/benchmark_ngram_proposer.py -# time python3 benchmarks/benchmark_ngram_proposer.py --batched --num-iteration 4 --num-token 1000000 --num-req 128 -""" # noqa: E501 -if __name__ == "__main__": - invoke_main() # pragma: no cover diff --git a/benchmarks/benchmark_prefix_caching.py b/benchmarks/benchmark_prefix_caching.py deleted file mode 100644 index b5e2613de..000000000 --- a/benchmarks/benchmark_prefix_caching.py +++ /dev/null @@ -1,278 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -""" -Benchmark the efficiency of prefix caching. - -This script allows you to benchmark the performance of -a model with and without prefix caching using either fixed prompts -or prompts sampled from the ShareGPT dataset. - -Fixed example usage: - python benchmark_prefix_caching.py \ - --model meta-llama/Llama-2-7b-chat-hf \ - --enable-prefix-caching \ - --num-prompts 1 \ - --repeat-count 100 \ - --input-length-range 128:256 - -ShareGPT example usage: - # This command samples 20 prompts with input lengths - # between 128 and 256 tokens from the ShareGPT dataset, - # then replicates each prompt 5 times. 
- python benchmark_prefix_caching.py \ - --model meta-llama/Llama-2-7b-chat-hf \ - --dataset-path /path/to/ShareGPT_V3_unfiltered_cleaned_split.json \ - --enable-prefix-caching \ - --num-prompts 20 \ - --repeat-count 5 \ - --input-length-range 128:256 -""" - -import dataclasses -import json -import random -import time -from typing import Optional - -from transformers import PreTrainedTokenizerBase - -from vllm import LLM, SamplingParams -from vllm.engine.arg_utils import EngineArgs -from vllm.utils import FlexibleArgumentParser - -try: - from vllm.transformers_utils.tokenizer import get_tokenizer -except ImportError: - from backend_request_func import get_tokenizer - -PROMPT = "You are a helpful assistant in recognizes the content of tables in markdown format. Here is a table as fellows. You need to answer my question about the table.\n# Table\n|Opening|Opening|Sl. No.|Film|Cast|Director|Music Director|Notes|\n|----|----|----|----|----|----|----|----|\n|J A N|9|1|Agni Pushpam|Jayabharathi, Kamalahasan|Jeassy|M. K. Arjunan||\n|J A N|16|2|Priyamvada|Mohan Sharma, Lakshmi, KPAC Lalitha|K. S. Sethumadhavan|V. Dakshinamoorthy||\n|J A N|23|3|Yakshagaanam|Madhu, Sheela|Sheela|M. S. Viswanathan||\n|J A N|30|4|Paalkkadal|Sheela, Sharada|T. K. Prasad|A. T. Ummer||\n|F E B|5|5|Amma|Madhu, Srividya|M. Krishnan Nair|M. K. Arjunan||\n|F E B|13|6|Appooppan|Thikkurissi Sukumaran Nair, Kamal Haasan|P. Bhaskaran|M. S. Baburaj||\n|F E B|20|7|Srishti|Chowalloor Krishnankutty, Ravi Alummoodu|K. T. Muhammad|M. S. Baburaj||\n|F E B|20|8|Vanadevatha|Prem Nazir, Madhubala|Yusufali Kechery|G. Devarajan||\n|F E B|27|9|Samasya|Madhu, Kamalahaasan|K. Thankappan|Shyam||\n|F E B|27|10|Yudhabhoomi|K. P. Ummer, Vidhubala|Crossbelt Mani|R. K. Shekhar||\n|M A R|5|11|Seemantha Puthran|Prem Nazir, Jayabharathi|A. B. Raj|M. K. Arjunan||\n|M A R|12|12|Swapnadanam|Rani Chandra, Dr. Mohandas|K. G. George|Bhaskar Chandavarkar||\n|M A R|19|13|Thulavarsham|Prem Nazir, sreedevi, Sudheer|N. Sankaran Nair|V. Dakshinamoorthy||\n|M A R|20|14|Aruthu|Kaviyoor Ponnamma, Kamalahasan|Ravi|G. Devarajan||\n|M A R|26|15|Swimming Pool|Kamal Haasan, M. G. Soman|J. Sasikumar|M. K. Arjunan||\n\n# Question\nWhat' s the content in the (1,1) cells\n" # noqa: E501 - - -def test_prefix(llm=None, sampling_params=None, prompts=None): - start_time = time.time() - - llm.generate(prompts, sampling_params=sampling_params) - - end_time = time.time() - print(f"cost time {end_time - start_time}") - - -@dataclasses.dataclass -class Request: - prompt: str - prompt_len: int - output_len: int - - -def sample_tokens(tokenizer: PreTrainedTokenizerBase, length: int) -> list[int]: - vocab = tokenizer.get_vocab() - all_special_ids = set(tokenizer.all_special_ids) - - # Remove the special tokens. - return random.choices( - [v for k, v in vocab.items() if k not in all_special_ids], - k=length, - ) - - -def sample_requests_from_dataset( - dataset_path: str, - num_requests: int, - tokenizer: PreTrainedTokenizerBase, - input_length_range: tuple[int, int], - fixed_output_len: Optional[int], -) -> list[Request]: - if fixed_output_len is not None and fixed_output_len < 4: - raise ValueError("output_len too small") - - # Load the dataset. - with open(dataset_path) as f: - dataset = json.load(f) - # Filter out the conversations with less than 2 turns. - dataset = [data for data in dataset if len(data["conversations"]) >= 2] - # Only keep the first two turns of each conversation. 
-    dataset = [
-        (data["conversations"][0]["value"], data["conversations"][1]["value"])
-        for data in dataset
-    ]
-
-    # Shuffle the dataset.
-    random.shuffle(dataset)
-
-    min_len, max_len = input_length_range
-    assert min_len >= 0 and max_len >= min_len, "input_length_range too small"
-
-    # Filter out sequences that are too long or too short
-    filtered_requests: list[Request] = []
-
-    for i in range(len(dataset)):
-        if len(filtered_requests) == num_requests:
-            break
-
-        # Tokenize the prompts and completions.
-        prompt_token_ids = tokenizer(dataset[i][0]).input_ids
-        prompt = tokenizer.decode(prompt_token_ids)
-        completion = dataset[i][1]
-        completion_token_ids = tokenizer(completion).input_ids
-        prompt_len = len(prompt_token_ids)
-        output_len = (
-            len(completion_token_ids) if fixed_output_len is None else fixed_output_len
-        )
-        if min_len <= prompt_len <= max_len:
-            filtered_requests.append(Request(prompt, prompt_len, output_len))
-
-    return filtered_requests
-
-
-def sample_requests_from_random(
-    num_requests: int,
-    tokenizer: PreTrainedTokenizerBase,
-    input_length_range: tuple[int, int],
-    fixed_output_len: Optional[int],
-    prefix_len: int,
-) -> list[Request]:
-    requests = []
-    prefix_token_ids = sample_tokens(tokenizer, prefix_len)
-    min_len, max_len = input_length_range
-
-    for i in range(num_requests):
-        unique_part_token_ids = sample_tokens(
-            tokenizer, random.randint(min_len - prefix_len, max_len - prefix_len)
-        )
-        prompt_token_ids = prefix_token_ids + unique_part_token_ids
-        prompt = tokenizer.decode(prompt_token_ids)
-        prompt_len = len(prompt_token_ids)
-        assert min_len <= prompt_len <= max_len, (
-            f"prompt_len {prompt_len} out of range {min_len}:{max_len}"
-        )
-        requests.append(Request(prompt, prompt_len, fixed_output_len))
-    return requests
-
-
-def repeat_and_sort_requests(
-    requests: list[Request], repeat_count: int, sort: bool = False
-) -> list[str]:
-    repeated_requests = requests * repeat_count
-    if sort:
-        repeated_requests.sort(key=lambda x: x[1])
-    else:
-        random.shuffle(repeated_requests)
-    return [req.prompt for req in repeated_requests]
-
-
-def main(args):
-    tokenizer = get_tokenizer(args.model, trust_remote_code=True)
-    input_length_range = tuple(map(int, args.input_length_range.split(":")))
-    random.seed(args.seed)
-    if args.dataset_path is not None:
-        if args.prefix_len > 0:
-            raise ValueError(
-                "prefix-len is not supported when dataset-path is provided."
-            )
-        print(f"Start to sample {args.num_prompts} prompts from {args.dataset_path}")
-        filtered_requests = sample_requests_from_dataset(
-            dataset_path=args.dataset_path,
-            num_requests=args.num_prompts,
-            tokenizer=tokenizer,
-            input_length_range=input_length_range,
-            fixed_output_len=args.output_len,
-        )
-    else:
-        print(f"Start to sample {args.num_prompts} prompts from random")
-        filtered_requests = sample_requests_from_random(
-            num_requests=args.num_prompts,
-            tokenizer=tokenizer,
-            input_length_range=input_length_range,
-            fixed_output_len=args.output_len,
-            prefix_len=args.prefix_len,
-        )
-
-    # Print some helpful stats of the requests.
- print(f"Sampled {len(filtered_requests)} requests.") - prompt_lens = [req.prompt_len for req in filtered_requests] - print(f"Average input length: {sum(prompt_lens) / len(prompt_lens)}") - print(f"P50 input length: {sorted(prompt_lens)[len(prompt_lens) // 2]}") - print(f"Min Prompt Length: {min(prompt_lens)}") - print(f"Max Prompt Length: {max(prompt_lens)}") - - engine_args = EngineArgs.from_cli_args(args) - - llm = LLM(**dataclasses.asdict(engine_args)) - - sampling_params = SamplingParams( - temperature=0, - max_tokens=args.output_len, - detokenize=not args.disable_detokenize, - ) - - print("Testing filtered requests") - prompts = repeat_and_sort_requests( - filtered_requests, repeat_count=args.repeat_count, sort=args.sort - ) - - print("------start generating------") - test_prefix( - llm=llm, - prompts=prompts, - sampling_params=sampling_params, - ) - - -def create_argument_parser(): - parser = FlexibleArgumentParser( - description="Benchmark the performance with or without " - "automatic prefix caching." - ) - parser.add_argument( - "--dataset-path", type=str, default=None, help="Path to the dataset." - ) - parser.add_argument("--output-len", type=int, default=10) - parser.add_argument( - "--num-prompts", - type=int, - required=True, - help="Number of the prompts sampled from dataset", - ) - parser.add_argument( - "--repeat-count", - type=int, - default=1, - help="Number of times to repeat each prompt", - ) - parser.add_argument( - "--sort", action="store_true", help="Sort prompts by input length" - ) - parser.add_argument( - "--input-length-range", - type=str, - required=True, - help="Range of input lengths for sampling prompts," - 'specified as "min:max" (e.g., "128:256").', - ) - parser.add_argument( - "--prefix-len", - type=int, - default=0, - help="Specifies the length of a common prefix to be " - "added to the input prompt. The input-length-range will " - "subtract this length when filtering prompts. Only used " - "when dataset-path is not provided.", - ) - parser.add_argument( - "--disable-detokenize", - action="store_true", - help=( - "Do not detokenize responses (i.e. do not include " - "detokenization time in the latency measurement)" - ), - ) - - parser = EngineArgs.add_cli_args(parser) - - return parser - - -if __name__ == "__main__": - parser = create_argument_parser() - args = parser.parse_args() - main(args) diff --git a/benchmarks/benchmark_prioritization.py b/benchmarks/benchmark_prioritization.py deleted file mode 100644 index bb453791c..000000000 --- a/benchmarks/benchmark_prioritization.py +++ /dev/null @@ -1,222 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Benchmark offline prioritization.""" - -import argparse -import dataclasses -import json -import random -import time -from typing import Optional - -from transformers import AutoTokenizer, PreTrainedTokenizerBase - -from vllm.engine.arg_utils import EngineArgs -from vllm.utils import FlexibleArgumentParser - - -# Select a equi-probable random priority -def get_random_flag(): - return 0 if random.random() < 0.5 else 1 - - -def sample_requests( - dataset_path: str, - num_requests: int, - tokenizer: PreTrainedTokenizerBase, - fixed_output_len: Optional[int], -) -> list[tuple[str, int, int, int]]: - if fixed_output_len is not None and fixed_output_len < 4: - raise ValueError("output_len too small") - - # Load the dataset. - with open(dataset_path) as f: - dataset = json.load(f) - # Filter out the conversations with less than 2 turns. 
- dataset = [data for data in dataset if len(data["conversations"]) >= 2] - # Only keep the first two turns of each conversation. - dataset = [ - (data["conversations"][0]["value"], data["conversations"][1]["value"]) - for data in dataset - ] - - # Shuffle the dataset. - random.shuffle(dataset) - - # Filter out sequences that are too long or too short - filtered_dataset: list[tuple[str, int, int]] = [] - for i in range(len(dataset)): - if len(filtered_dataset) == num_requests: - break - - # Tokenize the prompts and completions. - prompt = dataset[i][0] - prompt_token_ids = tokenizer(prompt).input_ids - completion = dataset[i][1] - completion_token_ids = tokenizer(completion).input_ids - prompt_len = len(prompt_token_ids) - output_len = ( - len(completion_token_ids) if fixed_output_len is None else fixed_output_len - ) - if prompt_len < 4 or output_len < 4: - # Prune too short sequences. - continue - if prompt_len > 1024 or prompt_len + output_len > 2048: - # Prune too long sequences. - continue - - priority = get_random_flag() - - filtered_dataset.append((prompt, prompt_len, output_len, priority)) - - return filtered_dataset - - -def run_vllm( - requests: list[tuple[str, int, int]], - n: int, - engine_args: EngineArgs, - disable_detokenize: bool = False, -) -> float: - from vllm import LLM, SamplingParams - - llm = LLM(**dataclasses.asdict(engine_args)) - - assert all( - llm.llm_engine.model_config.max_model_len >= (request[1] + request[2]) - for request in requests - ), ( - "Please ensure that max_model_len is greater than the sum of" - " input_len and output_len for all requests." - ) - - # Add the requests to the engine. - prompts = [] - sampling_params = [] - priority = [] - for prompt, _, output_len, _priority in requests: - prompts.append(prompt) - priority.append(_priority) - sampling_params.append( - SamplingParams( - n=n, - temperature=1.0, - top_p=1.0, - ignore_eos=True, - max_tokens=output_len, - detokenize=not disable_detokenize, - ) - ) - - start = time.perf_counter() - llm.generate(prompts, sampling_params, priority=priority, use_tqdm=True) - end = time.perf_counter() - return end - start - - -def main(args: argparse.Namespace): - print(args) - random.seed(args.seed) - - # Sample the requests. - tokenizer = AutoTokenizer.from_pretrained( - args.tokenizer, trust_remote_code=args.trust_remote_code - ) - if args.dataset is None: - # Synthesize a prompt with the given input length. 
- prompt = "hi" * (args.input_len - 1) - requests = [ - (prompt, args.input_len, args.output_len, get_random_flag()) - for _ in range(args.num_prompts) - ] - else: - requests = sample_requests( - args.dataset, args.num_prompts, tokenizer, args.output_len - ) - - if args.backend == "vllm": - elapsed_time = run_vllm( - requests, args.n, EngineArgs.from_cli_args(args), args.disable_detokenize - ) - else: - raise ValueError(f"Unknown backend: {args.backend}") - total_num_tokens = sum( - prompt_len + output_len for _, prompt_len, output_len, priority in requests - ) - print( - f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, " - f"{total_num_tokens / elapsed_time:.2f} tokens/s" - ) - - # Output JSON results if specified - if args.output_json: - results = { - "elapsed_time": elapsed_time, - "num_requests": len(requests), - "total_num_tokens": total_num_tokens, - "requests_per_second": len(requests) / elapsed_time, - "tokens_per_second": total_num_tokens / elapsed_time, - } - with open(args.output_json, "w") as f: - json.dump(results, f, indent=4) - - -def create_argument_parser(): - parser = FlexibleArgumentParser(description="Benchmark the throughput.") - parser.add_argument( - "--backend", type=str, choices=["vllm", "hf", "mii"], default="vllm" - ) - parser.add_argument( - "--dataset", type=str, default=None, help="Path to the dataset." - ) - parser.add_argument( - "--input-len", - type=int, - default=None, - help="Input prompt length for each request", - ) - parser.add_argument( - "--output-len", - type=int, - default=None, - help="Output length for each request. Overrides the " - "output length from the dataset.", - ) - parser.add_argument( - "--n", type=int, default=1, help="Number of generated sequences per prompt." - ) - parser.add_argument( - "--num-prompts", type=int, default=200, help="Number of prompts to process." - ) - parser.add_argument( - "--output-json", - type=str, - default=None, - help="Path to save the throughput results in JSON format.", - ) - parser.add_argument( - "--disable-detokenize", - action="store_true", - help=( - "Do not detokenize responses (i.e. do not include " - "detokenization time in the latency measurement)" - ), - ) - - parser = EngineArgs.add_cli_args(parser) - - return parser - - -if __name__ == "__main__": - parser = create_argument_parser() - args = parser.parse_args() - if args.tokenizer is None: - args.tokenizer = args.model - if args.dataset is None: - assert args.input_len is not None - assert args.output_len is not None - else: - assert args.input_len is None - - main(args) diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py deleted file mode 100644 index 76cf51498..000000000 --- a/benchmarks/benchmark_serving.py +++ /dev/null @@ -1,17 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import sys - -if __name__ == "__main__": - print("""DEPRECATED: This script has been moved to the vLLM CLI. 
- -Please use the following command instead: - vllm bench serve - -For help with the new command, run: - vllm bench serve --help - -Alternatively, you can run the new command directly with: - python -m vllm.entrypoints.cli.main bench serve --help -""") - sys.exit(1) diff --git a/benchmarks/benchmark_serving_structured_output.py b/benchmarks/benchmark_serving_structured_output.py deleted file mode 100644 index a03506254..000000000 --- a/benchmarks/benchmark_serving_structured_output.py +++ /dev/null @@ -1,1048 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -r"""Benchmark online serving throughput with structured outputs. - -On the server side, run one of the following commands: - (vLLM OpenAI API server) - vllm serve - -On the client side, run: - python benchmarks/benchmark_serving_structured_output.py \ - --backend \ - --model \ - --dataset json \ - --structured-output-ratio 1.0 \ - --request-rate 10 \ - --num-prompts 1000 - - when using tgi backend, add - --endpoint /generate_stream - to the end of the command above. -""" - -import argparse -import asyncio -import copy -import dataclasses -import json -import os -import random -import time -import uuid -import warnings -from collections.abc import AsyncGenerator -from dataclasses import dataclass -from typing import Optional - -import datasets -import numpy as np -import pandas as pd -from tqdm.asyncio import tqdm -from transformers import PreTrainedTokenizerBase - -from backend_request_func import ( - ASYNC_REQUEST_FUNCS, - RequestFuncInput, - RequestFuncOutput, -) - -try: - from vllm.transformers_utils.tokenizer import get_tokenizer -except ImportError: - from backend_request_func import get_tokenizer - -try: - from vllm.utils import FlexibleArgumentParser -except ImportError: - from argparse import ArgumentParser as FlexibleArgumentParser - -from vllm.v1.structured_output.backend_xgrammar import ( - has_xgrammar_unsupported_json_features, -) - -MILLISECONDS_TO_SECONDS_CONVERSION = 1000 - - -@dataclass -class BenchmarkMetrics: - completed: int - total_input: int - total_output: int - request_throughput: float - request_goodput: float - output_throughput: float - total_token_throughput: float - mean_ttft_ms: float - median_ttft_ms: float - std_ttft_ms: float - percentiles_ttft_ms: list[tuple[float, float]] - mean_tpot_ms: float - median_tpot_ms: float - std_tpot_ms: float - percentiles_tpot_ms: list[tuple[float, float]] - mean_itl_ms: float - median_itl_ms: float - std_itl_ms: float - percentiles_itl_ms: list[tuple[float, float]] - # E2EL stands for end-to-end latency per request. - # It is the time taken on the client side from sending - # a request to receiving a complete response. - mean_e2el_ms: float - median_e2el_ms: float - std_e2el_ms: float - percentiles_e2el_ms: list[tuple[float, float]] - - -@dataclasses.dataclass -class SampleRequest: - """A class representing a single inference request for benchmarking. - - Attributes: - prompt: The input text prompt for the model. - multi_modal_data: Optional dictionary containing multi-modal data (e.g. - images). - prompt_len: The length of the prompt in tokens. - expected_output_len: The expected length of the output in tokens. 
- """ - - prompt: str - prompt_len: int - expected_output_len: int - schema: dict - structure_type: str - completion: str = None - - -def sample_requests( - tokenizer: PreTrainedTokenizerBase, args: argparse.Namespace -) -> list[SampleRequest]: - if args.dataset == "json" or args.dataset == "json-unique": - if args.json_schema_path is None: - dir_path = os.path.dirname(os.path.realpath(__file__)) - args.json_schema_path = os.path.join( - dir_path, "structured_schemas", "structured_schema_1.json" - ) - json_schemas = [] - with open(args.json_schema_path) as f: - schema = json.load(f) - - if args.dataset == "json-unique": - json_schemas = [copy.deepcopy(schema) for _ in range(args.num_prompts)] - for i in range(len(json_schemas)): - if "properties" not in json_schemas[i]: - json_schemas[i]["properties"] = {} - json_schemas[i]["properties"][f"__optional_field_{uuid.uuid4()}"] = { - "type": "string", - "description": "An unique optional field to avoid cached schemas", - } - else: - json_schemas = [schema] * args.num_prompts - - def gen_prompt(index: int): - return f"Generate an example of a brief user profile given the following schema: {json.dumps(get_schema(index))}" # noqa: E501 - - def get_schema(index: int): - return json_schemas[index % len(json_schemas)] - - requests = [ - SampleRequest( - prompt=gen_prompt(i), - prompt_len=len(tokenizer(gen_prompt(i)).input_ids), - expected_output_len=args.output_len, - schema=get_schema(i), - structure_type=args.structure_type, - ) - for i in range(args.num_prompts) - ] - - elif args.dataset == "grammar": - schema = """ - root ::= select_statement - - select_statement ::= "SELECT " column " from " table " where " condition - - column ::= "col_1 " | "col_2 " - - table ::= "table_1 " | "table_2 " - - condition ::= column "= " number - - number ::= "1 " | "2 " - """ - prompt = "Generate an SQL query to show the 'username' \ - and 'email' from the 'users' table." - - input_len = len(tokenizer(prompt).input_ids) - print(f"Input length of the prompt: {input_len} tokens") - requests = [ - SampleRequest( - prompt=prompt, - prompt_len=input_len, - expected_output_len=args.output_len, - schema=schema, - structure_type=args.structure_type, - ) - for _ in range(args.num_prompts) - ] - - elif args.dataset == "regex": - regex = r"\w+@\w+\.com\n" - args.regex = regex - prompt = "Generate an email address for Alan Turing, \ - who works in Enigma. End in .com and new line. \ - Example result: alan.turing@enigma.com\n" - - input_len = len(tokenizer(prompt).input_ids) - print(f"Input length of the prompt: {input_len} tokens") - requests = [ - SampleRequest( - prompt=prompt, - prompt_len=input_len, - expected_output_len=args.output_len, - schema=regex, - structure_type=args.structure_type, - ) - for _ in range(args.num_prompts) - ] - - elif args.dataset == "choice": - choice = ["Positive", "Negative"] - args.choice = choice - prompt = "Classify this sentiment: vLLM is wonderful!" 
- input_len = len(tokenizer(prompt).input_ids) - print(f"Input length of the prompt: {input_len} tokens") - requests = [ - SampleRequest( - prompt=prompt, - prompt_len=input_len, - expected_output_len=args.output_len, - schema=choice, - structure_type=args.structure_type, - ) - for _ in range(args.num_prompts) - ] - - elif args.dataset == "xgrammar_bench": - requests: list[SampleRequest] = [] - dataset = datasets.load_dataset("NousResearch/json-mode-eval", split="train") - full_dataset_len = len(dataset) - - def _filter_func(item): - import json - - schema = json.loads(item["schema"]) - return not has_xgrammar_unsupported_json_features(schema) - - dataset = dataset.filter(_filter_func) - num_filtered_out = full_dataset_len - len(dataset) - print( - f"dataset has {len(dataset)} entries after filtering " - f"out {num_filtered_out} entries with unsupported features" - ) - len_dataset = len(dataset) - for data_point_idx in range(args.num_prompts): - idx = data_point_idx - while idx >= len_dataset: - idx -= len_dataset - schema = dataset["schema"][idx] - prompt = tokenizer.apply_chat_template( - dataset["prompt"][idx], tokenize=False, add_generation_prompt=True - ) - input_len = len(tokenizer(prompt).input_ids) - completion = dataset["completion"][idx] - - requests.append( - SampleRequest( - prompt=prompt, - prompt_len=input_len, - expected_output_len=args.output_len, - schema=schema, - structure_type=args.structure_type, - completion=completion, - ) - ) - - return requests - - -async def get_request( - input_requests: list[SampleRequest], - request_rate: float, - burstiness: float = 1.0, -) -> AsyncGenerator[tuple[int, SampleRequest], None]: - """ - Asynchronously generates requests at a specified rate - with OPTIONAL burstiness. - - Args: - input_requests: - A list of input requests, each represented as a tuple. - request_rate: - The rate at which requests are generated (requests/s). - burstiness (optional): - The burstiness factor of the request generation. - Only takes effect when request_rate is not inf. - Default value is 1, which follows a Poisson process. - Otherwise, the request intervals follow a gamma distribution. - A lower burstiness value (0 < burstiness < 1) results - in more bursty requests, while a higher burstiness value - (burstiness > 1) results in a more uniform arrival of requests. - """ - input_requests = iter(input_requests) - - # Calculate scale parameter theta to maintain the desired request_rate. - assert burstiness > 0, ( - f"A positive burstiness factor is expected, but given {burstiness}." - ) - theta = 1.0 / (request_rate * burstiness) - - for i, request in enumerate(input_requests): - yield i, request - - if request_rate == float("inf"): - # If the request rate is infinity, then we don't need to wait. - continue - - # Sample the request interval from the gamma distribution. - # If burstiness is 1, it follows exponential distribution. - interval = np.random.gamma(shape=burstiness, scale=theta) - # The next request will be sent after the interval. 
- await asyncio.sleep(interval) - - -def calculate_metrics( - input_requests: list[tuple[str, int, int]], - outputs: list[RequestFuncOutput], - dur_s: float, - tokenizer: PreTrainedTokenizerBase, - selected_percentile_metrics: list[str], - selected_percentiles: list[float], - goodput_config_dict: Optional[dict[str, float]] = None, -) -> tuple[BenchmarkMetrics, list[int]]: - actual_output_lens: list[int] = [] - total_input = 0 - completed = 0 - good_completed = 0 - itls: list[float] = [] - tpots: list[float] = [] - all_tpots: list[float] = [] - ttfts: list[float] = [] - e2els: list[float] = [] - for i in range(len(outputs)): - if outputs[i].success: - # We use the tokenizer to count the number of output tokens for all - # serving backends instead of looking at len(outputs[i].itl) since - # multiple output tokens may be bundled together - # Note : this may inflate the output token count slightly - output_len = len( - tokenizer(outputs[i].generated_text, add_special_tokens=False).input_ids - ) - actual_output_lens.append(output_len) - total_input += input_requests[i].prompt_len - tpot = 0 - if output_len > 1: - latency_minus_ttft = outputs[i].latency - outputs[i].ttft - tpot = latency_minus_ttft / (output_len - 1) - tpots.append(tpot) - outputs[i].tpot = tpot - # Note: if output_len <= 1, we regard tpot as 0 for goodput - all_tpots.append(tpot) - itls += outputs[i].itl - ttfts.append(outputs[i].ttft) - e2els.append(outputs[i].latency) - completed += 1 - else: - actual_output_lens.append(0) - - if goodput_config_dict: - valid_metrics = [] - slo_values = [] - - if "ttft" in goodput_config_dict: - valid_metrics.append(ttfts) - slo_values.append( - goodput_config_dict["ttft"] / MILLISECONDS_TO_SECONDS_CONVERSION - ) - if "tpot" in goodput_config_dict: - valid_metrics.append(all_tpots) - slo_values.append( - goodput_config_dict["tpot"] / MILLISECONDS_TO_SECONDS_CONVERSION - ) - if "e2el" in goodput_config_dict: - valid_metrics.append(e2els) - slo_values.append( - goodput_config_dict["e2el"] / MILLISECONDS_TO_SECONDS_CONVERSION - ) - - for req_metric in zip(*valid_metrics): - is_good_req = all([s >= r for s, r in zip(slo_values, req_metric)]) - if is_good_req: - good_completed += 1 - - if completed == 0: - warnings.warn( - "All requests failed. 
This is likely due to a misconfiguration " - "on the benchmark arguments.", - stacklevel=2, - ) - metrics = BenchmarkMetrics( - completed=completed, - total_input=total_input, - total_output=sum(actual_output_lens), - request_throughput=completed / dur_s, - request_goodput=good_completed / dur_s, - output_throughput=sum(actual_output_lens) / dur_s, - total_token_throughput=(total_input + sum(actual_output_lens)) / dur_s, - mean_ttft_ms=np.mean(ttfts or 0) - * 1000, # ttfts is empty if streaming is not supported by backend - std_ttft_ms=np.std(ttfts or 0) * 1000, - median_ttft_ms=np.median(ttfts or 0) * 1000, - percentiles_ttft_ms=[ - (p, np.percentile(ttfts or 0, p) * 1000) for p in selected_percentiles - ], - mean_tpot_ms=np.mean(tpots or 0) * 1000, - std_tpot_ms=np.std(tpots or 0) * 1000, - median_tpot_ms=np.median(tpots or 0) * 1000, - percentiles_tpot_ms=[ - (p, np.percentile(tpots or 0, p) * 1000) for p in selected_percentiles - ], - mean_itl_ms=np.mean(itls or 0) * 1000, - std_itl_ms=np.std(itls or 0) * 1000, - median_itl_ms=np.median(itls or 0) * 1000, - percentiles_itl_ms=[ - (p, np.percentile(itls or 0, p) * 1000) for p in selected_percentiles - ], - mean_e2el_ms=np.mean(e2els or 0) * 1000, - std_e2el_ms=np.std(e2els or 0) * 1000, - median_e2el_ms=np.median(e2els or 0) * 1000, - percentiles_e2el_ms=[ - (p, np.percentile(e2els or 0, p) * 1000) for p in selected_percentiles - ], - ) - - return metrics, actual_output_lens - - -async def benchmark( - backend: str, - api_url: str, - base_url: str, - model_id: str, - tokenizer: PreTrainedTokenizerBase, - input_requests: list[SampleRequest], - request_rate: float, - burstiness: float, - disable_tqdm: bool, - profile: bool, - selected_percentile_metrics: list[str], - selected_percentiles: list[str], - ignore_eos: bool, - max_concurrency: Optional[int], - structured_output_ratio: float, - goodput_config_dict: Optional[dict[str, float]] = None, -): - if backend in ASYNC_REQUEST_FUNCS: - request_func = ASYNC_REQUEST_FUNCS[backend] - else: - raise ValueError(f"Unknown backend: {backend}") - - def prepare_extra_body(request) -> dict: - extra_body = {} - # Add the schema to the extra_body - extra_body["structured_outputs"] = {} - extra_body["structured_outputs"][request.structure_type] = request.schema - return extra_body - - print("Starting initial single prompt test run...") - structured_output_req_idx = random.sample( - range(len(input_requests)), int(len(input_requests) * structured_output_ratio) - ) - - test_request = input_requests[0] - test_req_extra_body = ( - prepare_extra_body(test_request) if 0 in structured_output_req_idx else None - ) - test_input = RequestFuncInput( - model=model_id, - prompt=test_request.prompt, - api_url=api_url, - prompt_len=test_request.prompt_len, - output_len=test_request.expected_output_len, - ignore_eos=ignore_eos, - extra_body=test_req_extra_body, - ) - test_output = await request_func(request_func_input=test_input) - if not test_output.success: - raise ValueError( - "Initial test run failed - Please make sure benchmark arguments " - f"are correctly specified. Error: {test_output.error}" - ) - else: - print("Initial test run completed. 
Starting main benchmark run...") - - if profile: - print("Starting profiler...") - profile_input = RequestFuncInput( - model=model_id, - prompt=test_request.prompt, - api_url=base_url + "/start_profile", - prompt_len=test_request.prompt_len, - output_len=test_request.expected_output_len, - ignore_eos=ignore_eos, - extra_body=test_req_extra_body, - ) - profile_output = await request_func(request_func_input=profile_input) - if profile_output.success: - print("Profiler started") - - distribution = "Poisson process" if burstiness == 1.0 else "Gamma distribution" - - print(f"Traffic request rate: {request_rate}") - print(f"Burstiness factor: {burstiness} ({distribution})") - print(f"Maximum request concurrency: {max_concurrency}") - - pbar = None if disable_tqdm else tqdm(total=len(input_requests)) - - # This can be used once the minimum Python version is 3.10 or higher, - # and it will simplify the code in limited_request_func. - # semaphore = (asyncio.Semaphore(max_concurrency) - # if max_concurrency else contextlib.nullcontext()) - semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else None - - async def limited_request_func(request_func_input, pbar): - if semaphore is None: - return await request_func(request_func_input=request_func_input, pbar=pbar) - async with semaphore: - return await request_func(request_func_input=request_func_input, pbar=pbar) - - benchmark_start_time = time.perf_counter() - tasks: list[asyncio.Task] = [] - expected: list[str] = [] - async for i, request in get_request(input_requests, request_rate, burstiness): - extra_body = ( - prepare_extra_body(request) if i in structured_output_req_idx else None - ) - request_func_input = RequestFuncInput( - model=model_id, - prompt=request.prompt, - api_url=api_url, - prompt_len=request.prompt_len, - output_len=request.expected_output_len, - ignore_eos=ignore_eos, - extra_body=extra_body, - ) - expected.append(request.completion) - tasks.append( - asyncio.create_task( - limited_request_func(request_func_input=request_func_input, pbar=pbar) - ) - ) - outputs: list[RequestFuncOutput] = await asyncio.gather(*tasks) - - if pbar is not None: - pbar.close() - - benchmark_duration = time.perf_counter() - benchmark_start_time - - metrics, actual_output_lens = calculate_metrics( - input_requests=input_requests, - outputs=outputs, - dur_s=benchmark_duration, - tokenizer=tokenizer, - selected_percentile_metrics=selected_percentile_metrics, - selected_percentiles=selected_percentiles, - goodput_config_dict=goodput_config_dict, - ) - - print("{s:{c}^{n}}".format(s=" Serving Benchmark Result ", n=50, c="=")) - print("{:<40} {:<10}".format("Successful requests:", metrics.completed)) - if max_concurrency is not None: - print("{:<40} {:<10}".format("Maximum request concurrency:", max_concurrency)) - if request_rate != float("inf"): - print("{:<40} {:<10.2f}".format("Request rate configured (RPS):", request_rate)) - print("{:<40} {:<10.2f}".format("Benchmark duration (s):", benchmark_duration)) - print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input)) - print("{:<40} {:<10}".format("Total generated tokens:", metrics.total_output)) - print( - "{:<40} {:<10.2f}".format( - "Request throughput (req/s):", metrics.request_throughput - ) - ) - if goodput_config_dict: - print( - "{:<40} {:<10.2f}".format( - "Request goodput (req/s):", metrics.request_goodput - ) - ) - print( - "{:<40} {:<10.2f}".format( - "Output token throughput (tok/s):", metrics.output_throughput - ) - ) - print( - "{:<40} {:<10.2f}".format( - "Total 
Token throughput (tok/s):", metrics.total_token_throughput - ) - ) - - result = { - "duration": benchmark_duration, - "completed": metrics.completed, - "total_input_tokens": metrics.total_input, - "total_output_tokens": metrics.total_output, - "request_throughput": metrics.request_throughput, - "output_throughput": metrics.output_throughput, - "total_token_throughput": metrics.total_token_throughput, - "ttft_description": pd.Series([output.ttft for output in outputs]) - .describe() - .to_dict(), - "tpot_description": pd.Series([output.tpot for output in outputs]) - .describe() - .to_dict(), - "input_lens": [output.prompt_len for output in outputs], - "output_lens": actual_output_lens, - "ttfts": [output.ttft for output in outputs], - "itls": [output.itl for output in outputs], - "errors": [output.error for output in outputs], - } - - ret = [ - {"generated": output.generated_text, "expected": gt} - for output, gt in zip(outputs, expected) - ] - - def process_one_metric( - # E.g., "ttft" - metric_attribute_name: str, - # E.g., "TTFT" - metric_name: str, - # E.g., "Time to First Token" - metric_header: str, - ): - # This function prints and adds statistics of the specified - # metric. - if metric_attribute_name not in selected_percentile_metrics: - return - print("{s:{c}^{n}}".format(s=metric_header, n=50, c="-")) - print( - "{:<40} {:<10.2f}".format( - f"Mean {metric_name} (ms):", - getattr(metrics, f"mean_{metric_attribute_name}_ms"), - ) - ) - print( - "{:<40} {:<10.2f}".format( - f"Median {metric_name} (ms):", - getattr(metrics, f"median_{metric_attribute_name}_ms"), - ) - ) - result[f"mean_{metric_attribute_name}_ms"] = getattr( - metrics, f"mean_{metric_attribute_name}_ms" - ) - result[f"median_{metric_attribute_name}_ms"] = getattr( - metrics, f"median_{metric_attribute_name}_ms" - ) - result[f"std_{metric_attribute_name}_ms"] = getattr( - metrics, f"std_{metric_attribute_name}_ms" - ) - for p, value in getattr(metrics, f"percentiles_{metric_attribute_name}_ms"): - p_word = str(int(p)) if int(p) == p else str(p) - print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name} (ms):", value)) - result[f"p{p_word}_{metric_attribute_name}_ms"] = value - - process_one_metric("ttft", "TTFT", "Time to First Token") - process_one_metric("tpot", "TPOT", "Time per Output Token (excl. 
1st token)") - process_one_metric("itl", "ITL", "Inter-token Latency") - process_one_metric("e2el", "E2EL", "End-to-end Latency") - - print("=" * 50) - - if profile: - print("Stopping profiler...") - profile_input = RequestFuncInput( - model=model_id, - prompt=test_request.prompt, - api_url=base_url + "/stop_profile", - prompt_len=test_request.prompt_len, - output_len=test_request.expected_output_len, - extra_body={test_request.structure_type: test_request.schema}, - ) - profile_output = await request_func(request_func_input=profile_input) - if profile_output.success: - print("Profiler stopped") - - return result, ret - - -def evaluate(ret, args): - def _eval_correctness_json(expected, actual): - # extract json string from string using regex - import regex as re - - actual = actual.replace("\n", "").replace(" ", "").strip() - try: - actual = re.search(r"\{.*\}", actual).group() - actual = json.loads(actual) - except Exception: - return False - - return True - - def _eval_correctness_choice(expected, actual): - return actual in args.choice - - def _eval_correctness_regex(expected, actual): - import regex as re - - return re.match(args.regex, actual) is not None - - def _eval_correctness(expected, actual): - if args.structure_type == "json": - return _eval_correctness_json(expected, actual) - elif args.structure_type == "regex": - return _eval_correctness_regex(expected, actual) - elif args.structure_type == "choice": - return _eval_correctness_choice(expected, actual) - else: - return None - - scores = [] - for res in ret: - score = _eval_correctness(res["expected"], res["generated"]) - res["correctness"] = score - scores.append(score) - - not_none_scores = [score for score in scores if score is not None] - - return ( - (sum(not_none_scores) / len(not_none_scores) * 100) - if len(not_none_scores) > 0 - else None - ) - - -def parse_goodput(slo_pairs): - goodput_config_dict = {} - try: - for slo_pair in slo_pairs: - slo_name, slo_val = slo_pair.split(":") - goodput_config_dict[slo_name] = float(slo_val) - except ValueError as err: - raise argparse.ArgumentTypeError( - "Invalid format found for service level objectives. " - 'Specify service level objectives for goodput as "KEY:VALUE" ' - "pairs, where the key is a metric name, and the value is a " - "number in milliseconds." - ) from err - return goodput_config_dict - - -def check_goodput_args(args): - goodput_config_dict = {} - VALID_NAMES = ["ttft", "tpot", "e2el"] - if args.goodput: - goodput_config_dict = parse_goodput(args.goodput) - for slo_name, slo_val in goodput_config_dict.items(): - if slo_name not in VALID_NAMES: - raise ValueError( - f"Invalid metric name found, {slo_name}: {slo_val}. " - "The service level objective name should be one of " - f"{str(VALID_NAMES)}. " - ) - if slo_val < 0: - raise ValueError( - f"Invalid value found, {slo_name}: {slo_val}. " - "The service level objective value should be " - "non-negative." 
- ) - return goodput_config_dict - - -def main(args: argparse.Namespace): - print(args) - random.seed(args.seed) - np.random.seed(args.seed) - - backend = args.backend - model_id = args.model - tokenizer_id = args.tokenizer if args.tokenizer is not None else args.model - - if args.base_url is not None: - api_url = f"{args.base_url}{args.endpoint}" - base_url = f"{args.base_url}" - else: - api_url = f"http://{args.host}:{args.port}{args.endpoint}" - base_url = f"http://{args.host}:{args.port}" - - tokenizer = get_tokenizer( - tokenizer_id, - trust_remote_code=args.trust_remote_code, - tokenizer_mode=args.tokenizer_mode, - ) - - if args.dataset == "grammar": - args.structure_type = "grammar" - elif args.dataset == "regex": - args.structure_type = "regex" - elif args.dataset == "choice": - args.structure_type = "choice" - else: - args.structure_type = "json" - - if args.no_structured_output: - args.structured_output_ratio = 0 - if args.save_results: - result_file_name = f"{args.structured_output_ratio}so" - result_file_name += f"_{backend}" - result_file_name += f"_{args.request_rate}qps" - result_file_name += f"_{args.model.split('/')[-1]}" - result_file_name += f"_{args.dataset}" - result_file_name += f"_{args.num_prompts}" - result_file_name += f"_out{args.output_len}" - result_file_name += ".txt" - else: - result_file_name = None - - input_requests = sample_requests(tokenizer, args) - - goodput_config_dict = check_goodput_args(args) - - benchmark_result, ret = asyncio.run( - benchmark( - backend=backend, - api_url=api_url, - base_url=base_url, - model_id=model_id, - tokenizer=tokenizer, - input_requests=input_requests, - request_rate=args.request_rate, - burstiness=args.burstiness, - disable_tqdm=args.disable_tqdm, - profile=args.profile, - selected_percentile_metrics=args.percentile_metrics.split(","), - selected_percentiles=[float(p) for p in args.metric_percentiles.split(",")], - ignore_eos=args.ignore_eos, - max_concurrency=args.max_concurrency, - structured_output_ratio=args.structured_output_ratio, - goodput_config_dict=goodput_config_dict, - ) - ) - - # Save config and results to json - score = evaluate(ret, args) - print("correct_rate(%)", score, "\n") - if args.save_results: - results = { - "backend": backend, - "model_id": model_id, - "tokenizer_id": tokenizer_id, - "num_prompts": args.num_prompts, - "request_rate": args.request_rate - if args.request_rate < float("inf") - else "inf", - "burstiness": args.burstiness, - "max_concurrency": args.max_concurrency, - "correct_rate(%)": score, - } - results = {"outputs": ret, **results, **benchmark_result} - - # Save to file - if args.result_filename: - result_file_name = args.result_filename - if args.result_dir: - result_file_name = os.path.join(args.result_dir, result_file_name) - with open(result_file_name, "w", encoding="utf-8") as outfile: - json.dump(results, outfile, indent=4) - - -def create_argument_parser(): - parser = FlexibleArgumentParser( - description="Benchmark the online serving throughput." 
- ) - parser.add_argument( - "--backend", - type=str, - default="vllm", - choices=list(ASYNC_REQUEST_FUNCS.keys()), - ) - parser.add_argument( - "--base-url", - type=str, - default=None, - help="Server or API base url if not using http host and port.", - ) - # Use 127.0.0.1 here instead of localhost to force the use of ipv4 - parser.add_argument("--host", type=str, default="127.0.0.1") - parser.add_argument("--port", type=int, default=8000) - parser.add_argument( - "--endpoint", - type=str, - default="/v1/completions", - help="API endpoint.", - ) - parser.add_argument( - "--dataset", - default="json", - choices=["json", "json-unique", "grammar", "regex", "choice", "xgrammar_bench"], - ) - parser.add_argument( - "--json-schema-path", type=str, default=None, help="Path to json schema." - ) - parser.add_argument( - "--max-concurrency", - type=int, - default=None, - help="Maximum number of concurrent requests. This can be used " - "to help simulate an environment where a higher level component " - "is enforcing a maximum number of concurrent requests. While the " - "--request-rate argument controls the rate at which requests are " - "initiated, this argument will control how many are actually allowed " - "to execute at a time. This means that when used in combination, the " - "actual request rate may be lower than specified with --request-rate, " - "if the server is not processing requests fast enough to keep up.", - ) - parser.add_argument( - "--model", - type=str, - required=True, - help="Name of the model.", - ) - parser.add_argument( - "--tokenizer", - type=str, - help="Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501 - ) - parser.add_argument( - "--tokenizer-mode", - type=str, - default="auto", - help="Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501 - ) - parser.add_argument( - "--num-prompts", - type=int, - default=1000, - help="Number of prompts to process.", - ) - parser.add_argument( - "--output-len", - type=int, - default=128, - help="Number of output tokens.", - ) - parser.add_argument( - "--request-rate", - type=float, - default=float("inf"), - help="Number of requests per second. If this is inf, " - "then all the requests are sent at time 0. " - "Otherwise, we use Poisson process or gamma distribution " - "to synthesize the request arrival times.", - ) - parser.add_argument( - "--burstiness", - type=float, - default=1.0, - help="Burstiness factor of the request generation. " - "Only take effect when request_rate is not inf. " - "Default value is 1, which follows Poisson process. " - "Otherwise, the request intervals follow a gamma distribution. " - "A lower burstiness value (0 < burstiness < 1) results in more " - "bursty requests. A higher burstiness value (burstiness > 1) " - "results in a more uniform arrival of requests.", - ) - parser.add_argument("--seed", type=int, default=0) - parser.add_argument( - "--trust-remote-code", - action="store_true", - help="Trust remote code from huggingface", - ) - parser.add_argument( - "--disable-tqdm", - action="store_true", - help="Specify to disable tqdm progress bar.", - ) - parser.add_argument( - "--save-results", - action="store_true", - help="Specify to save benchmark results to a json file", - ) - parser.add_argument( - "--profile", - action="store_true", - help="Use Torch Profiler. 
The endpoint must be launched with " - "VLLM_TORCH_PROFILER_DIR to enable profiler.", - ) - parser.add_argument( - "--result-dir", - type=str, - default=None, - help="Specify directory to save benchmark json results." - "If not specified, results are saved in the current directory.", - ) - parser.add_argument( - "--result-filename", - type=str, - default=None, - help="Specify the filename to save benchmark json results." - "If not specified, results will be saved in " - "{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json" - " format.", - ) - parser.add_argument( - "--ignore-eos", - action="store_true", - help="Set ignore_eos flag when sending the benchmark request." - "Warning: ignore_eos is not supported in deepspeed_mii and tgi.", - ) - parser.add_argument( - "--percentile-metrics", - type=str, - default="ttft,tpot,itl", - help="Comma-separated list of selected metrics to report percentiles. " - "This argument specifies the metrics to report percentiles. " - 'Allowed metric names are "ttft", "tpot", "itl", "e2el". ' - 'Default value is "ttft,tpot,itl".', - ) - parser.add_argument( - "--metric-percentiles", - type=str, - default="99", - help="Comma-separated list of percentiles for selected metrics. " - 'To report 25-th, 50-th, and 75-th percentiles, use "25,50,75". ' - 'Default value is "99". ' - 'Use "--percentile-metrics" to select metrics.', - ) - parser.add_argument( - "--goodput", - nargs="+", - required=False, - help='Specify service level objectives for goodput as "KEY:VALUE" ' - "pairs, where the key is a metric name, and the value is in " - 'milliseconds. Multiple "KEY:VALUE" pairs can be provided, ' - "separated by spaces. Allowed request level metric names are " - '"ttft", "tpot", "e2el". For more context on the definition of ' - "goodput, refer to DistServe paper: https://arxiv.org/pdf/2401.09670 " - "and the blog: https://hao-ai-lab.github.io/blogs/distserve", - ) - - parser.add_argument( - "--no-structured-output", - action="store_true", - default=False, - help="Whether to disable JSON decoding or not.", - ) - parser.add_argument( - "--structured-output-ratio", - type=float, - default=1.0, - help="Ratio of Structured Outputs requests", - ) - - return parser - - -if __name__ == "__main__": - parser = create_argument_parser() - args = parser.parse_args() - main(args) diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py deleted file mode 100644 index b6dc0918f..000000000 --- a/benchmarks/benchmark_throughput.py +++ /dev/null @@ -1,17 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import sys - -if __name__ == "__main__": - print("""DEPRECATED: This script has been moved to the vLLM CLI. 
- -Please use the following command instead: - vllm bench throughput - -For help with the new command, run: - vllm bench throughput --help - -Alternatively, you can run the new command directly with: - python -m vllm.entrypoints.cli.main bench throughput --help -""") - sys.exit(1) diff --git a/benchmarks/benchmark_utils.py b/benchmarks/benchmark_utils.py deleted file mode 100644 index 98624abdf..000000000 --- a/benchmarks/benchmark_utils.py +++ /dev/null @@ -1,125 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import argparse -import json -import math -import os -import time -from types import TracebackType -from typing import Any, Optional, Union - - -def convert_to_pytorch_benchmark_format( - args: argparse.Namespace, metrics: dict[str, list], extra_info: dict[str, Any] -) -> list: - """ - Save the benchmark results in the format used by PyTorch OSS benchmark with - on metric per record - https://github.com/pytorch/pytorch/wiki/How-to-integrate-with-PyTorch-OSS-benchmark-database - """ - records = [] - if not os.environ.get("SAVE_TO_PYTORCH_BENCHMARK_FORMAT", False): - return records - - for name, benchmark_values in metrics.items(): - record = { - "benchmark": { - "name": "vLLM benchmark", - "extra_info": { - "args": vars(args), - }, - }, - "model": { - "name": args.model, - }, - "metric": { - "name": name, - "benchmark_values": benchmark_values, - "extra_info": extra_info, - }, - } - - tp = record["benchmark"]["extra_info"]["args"].get("tensor_parallel_size") - # Save tensor_parallel_size parameter if it's part of the metadata - if not tp and "tensor_parallel_size" in extra_info: - record["benchmark"]["extra_info"]["args"]["tensor_parallel_size"] = ( - extra_info["tensor_parallel_size"] - ) - - records.append(record) - - return records - - -class InfEncoder(json.JSONEncoder): - def clear_inf(self, o: Any): - if isinstance(o, dict): - return {k: self.clear_inf(v) for k, v in o.items()} - elif isinstance(o, list): - return [self.clear_inf(v) for v in o] - elif isinstance(o, float) and math.isinf(o): - return "inf" - return o - - def iterencode(self, o: Any, *args, **kwargs) -> Any: - return super().iterencode(self.clear_inf(o), *args, **kwargs) - - -def write_to_json(filename: str, records: list) -> None: - with open(filename, "w") as f: - json.dump( - records, - f, - cls=InfEncoder, - default=lambda o: f"<{type(o).__name__} object is not JSON serializable>", - ) - - -# Collect time and generate time metrics -# -# Example Usage: -# collector = TimeCollector(TimeCollector.US) -# for _ in range(total_iteration): -# with collector: -# ... 
-# collector.dump_avg_max() -class TimeCollector: - NS: int = 1 - US: int = NS * 1000 - MS: int = US * 1000 - S: int = MS * 1000 - - def __init__(self, scale: int) -> None: - self.cnt: int = 0 - self._sum: int = 0 - self._max: Optional[int] = None - self.scale = scale - self.start_time: int = time.monotonic_ns() - - def collect(self, v: int) -> None: - self.cnt += 1 - self._sum += v - if self._max is None: - self._max = v - else: - self._max = max(self._max, v) - - def avg(self) -> Union[float, str]: - return self._sum * 1.0 / self.cnt / self.scale if self.cnt > 0 else "N/A" - - def max(self) -> Union[float, str]: - return self._max / self.scale if self._max else "N/A" - - def dump_avg_max(self) -> list[Union[float, str]]: - return [self.avg(), self.max()] - - def __enter__(self) -> None: - self.start_time = time.monotonic_ns() - - def __exit__( - self, - exc_type: Optional[type[BaseException]], - exc_value: Optional[BaseException], - exc_traceback: Optional[TracebackType], - ) -> None: - self.collect(time.monotonic_ns() - self.start_time) diff --git a/benchmarks/cutlass_benchmarks/sparse_benchmarks.py b/benchmarks/cutlass_benchmarks/sparse_benchmarks.py deleted file mode 100644 index 9ec270bbd..000000000 --- a/benchmarks/cutlass_benchmarks/sparse_benchmarks.py +++ /dev/null @@ -1,516 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import argparse -import copy -import itertools -import pickle as pkl -import time -from collections.abc import Iterable -from typing import Callable - -import torch -import torch.utils.benchmark as TBenchmark -from torch.utils.benchmark import Measurement as TMeasurement -from utils import make_rand_sparse_tensors -from weight_shapes import WEIGHT_SHAPES - -from vllm import _custom_ops as ops -from vllm.utils import FlexibleArgumentParser - -DEFAULT_MODELS = list(WEIGHT_SHAPES.keys()) -DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512] -DEFAULT_TP_SIZES = [1] - - -# bench -def bench_fn( - label: str, sub_label: str, description: str, fn: Callable, *args, **kwargs -) -> TMeasurement: - min_run_time = 1 - - globals = { - "args": args, - "kwargs": kwargs, - "fn": fn, - } - return TBenchmark.Timer( - stmt="fn(*args, **kwargs)", - globals=globals, - label=label, - sub_label=sub_label, - description=description, - ).blocked_autorange(min_run_time=min_run_time) - - -def bench_int8( - dtype: torch.dtype, m: int, k: int, n: int, label: str, sub_label: str -) -> Iterable[TMeasurement]: - assert dtype == torch.int8 - b_compressed, e, a, b = make_rand_sparse_tensors(torch.int8, m, n, k) - scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32) - scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32) - bias = torch.zeros((n,), device="cuda", dtype=torch.bfloat16) - - out = ops.cutlass_scaled_sparse_mm( - a, b_compressed, e, scale_a, scale_b, torch.bfloat16 - ) - out_ref = ops.cutlass_scaled_mm(a, b, scale_a, scale_b, torch.bfloat16) - - if not torch.allclose(out, out_ref): - print("Incorrect results") - print(out) - print(out_ref) - else: - print("Correct results") - - timers = [] - # pytorch impl - bfloat16 - timers.append( - bench_fn( - label, - sub_label, - "pytorch_bf16_bf16_bf16_matmul-no-scales", - torch.mm, - a.to(dtype=torch.bfloat16), - b.to(dtype=torch.bfloat16), - ) - ) - - # pytorch impl - float16 - timers.append( - bench_fn( - label, - sub_label, - "pytorch_fp16_fp16_fp16_matmul-no-scales", - torch.mm, - a.to(dtype=torch.float16), - b.to(dtype=torch.float16), - ) - ) - - # 
cutlass impl - timers.append( - bench_fn( - label, - sub_label, - "cutlass_i8_i8_bf16_scaled_mm", - ops.cutlass_scaled_mm, - a, - b, - scale_a, - scale_b, - torch.bfloat16, - ) - ) - - # cutlass with bias - timers.append( - bench_fn( - label, - sub_label, - "cutlass_i8_i8_bf16_scaled_mm_bias", - ops.cutlass_scaled_mm, - a, - b, - scale_a, - scale_b, - torch.bfloat16, - bias, - ) - ) - - # cutlass sparse impl - timers.append( - bench_fn( - label, - sub_label, - "cutlass_i8_i8_bf16_scaled_sparse_mm", - ops.cutlass_scaled_sparse_mm, - a, - b_compressed, - e, - scale_a, - scale_b, - torch.bfloat16, - ) - ) - - # cutlass sparse with bias - timers.append( - bench_fn( - label, - sub_label, - "cutlass_i8_i8_bf16_scaled_sparse_mm_bias", - ops.cutlass_scaled_sparse_mm, - a, - b_compressed, - e, - scale_a, - scale_b, - torch.bfloat16, - bias, - ) - ) - - return timers - - -def bench_fp8( - dtype: torch.dtype, m: int, k: int, n: int, label: str, sub_label: str -) -> Iterable[TMeasurement]: - assert dtype == torch.float8_e4m3fn - b_compressed, e, a, b = make_rand_sparse_tensors(torch.float8_e4m3fn, m, n, k) - scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32) - scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32) - bias = torch.zeros((n,), device="cuda", dtype=torch.bfloat16) - - out = ops.cutlass_scaled_sparse_mm( - a, b_compressed, e, scale_a, scale_b, torch.bfloat16 - ) - out_ref = ops.cutlass_scaled_mm(a, b, scale_a, scale_b, torch.bfloat16) - - if not torch.allclose(out, out_ref): - print("Incorrect results") - print(out) - print(out_ref) - else: - print("Correct results") - - timers = [] - - # pytorch impl w. bf16 - timers.append( - bench_fn( - label, - sub_label, - "pytorch_bf16_bf16_bf16_matmul-no-scales", - torch.mm, - a.to(dtype=torch.bfloat16, device="cuda"), - b.to(dtype=torch.bfloat16, device="cuda"), - ) - ) - - # pytorch impl: bf16 output, without fp8 fast accum - timers.append( - bench_fn( - label, - sub_label, - "pytorch_fp8_fp8_bf16_scaled_mm", - torch._scaled_mm, - a, - b, - scale_a=scale_a, - scale_b=scale_b, - out_dtype=torch.bfloat16, - ) - ) - - # pytorch impl: bf16 output, with fp8 fast accum - timers.append( - bench_fn( - label, - sub_label, - "pytorch_fp8_fp8_bf16_scaled_mm_fast_accum", - torch._scaled_mm, - a, - b, - scale_a=scale_a, - scale_b=scale_b, - out_dtype=torch.bfloat16, - use_fast_accum=True, - ) - ) - - # pytorch impl: fp16 output, without fp8 fast accum - timers.append( - bench_fn( - label, - sub_label, - "pytorch_fp8_fp8_fp16_scaled_mm", - torch._scaled_mm, - a, - b, - scale_a=scale_a, - scale_b=scale_b, - out_dtype=torch.float16, - ) - ) - - # pytorch impl: fp16 output, with fp8 fast accum - timers.append( - bench_fn( - label, - sub_label, - "pytorch_fp8_fp8_fp16_scaled_mm_fast_accum", - torch._scaled_mm, - a, - b, - scale_a=scale_a, - scale_b=scale_b, - out_dtype=torch.float16, - use_fast_accum=True, - ) - ) - - # cutlass impl: bf16 output - timers.append( - bench_fn( - label, - sub_label, - "cutlass_fp8_fp8_bf16_scaled_mm", - ops.cutlass_scaled_mm, - a, - b, - scale_a, - scale_b, - torch.bfloat16, - ) - ) - - # cutlass impl: bf16 output - timers.append( - bench_fn( - label, - sub_label, - "cutlass_fp8_fp8_bf16_scaled_sparse_mm", - ops.cutlass_scaled_sparse_mm, - a, - b_compressed, - e, - scale_a, - scale_b, - torch.bfloat16, - ) - ) - - # cutlass impl: fp16 output - timers.append( - bench_fn( - label, - sub_label, - "cutlass_fp8_fp8_fp16_scaled_sparse_mm", - ops.cutlass_scaled_sparse_mm, - a, - b_compressed, - e, - scale_a, - scale_b, - 
torch.float16, - ) - ) - - # cutlass impl: bf16 output, with bias - timers.append( - bench_fn( - label, - sub_label, - "cutlass_fp8_fp8_bf16_scaled_sparse_mm_bias", - ops.cutlass_scaled_sparse_mm, - a, - b_compressed, - e, - scale_a, - scale_b, - torch.bfloat16, - bias, - ) - ) - - # cutlass impl: fp16 output, with bias - timers.append( - bench_fn( - label, - sub_label, - "cutlass_fp8_fp8_fp16_scaled_sparse_mm_bias", - ops.cutlass_scaled_sparse_mm, - a, - b_compressed, - e, - scale_a, - scale_b, - torch.float16, - bias.to(dtype=torch.float16), - ) - ) - - return timers - - -def bench( - dtype: torch.dtype, m: int, k: int, n: int, label: str, sub_label: str -) -> Iterable[TMeasurement]: - if dtype == torch.int8: - return bench_int8(dtype, m, k, n, label, sub_label) - if dtype == torch.float8_e4m3fn: - return bench_fp8(dtype, m, k, n, label, sub_label) - raise ValueError("unsupported type") - - -# runner -def print_timers(timers: Iterable[TMeasurement]): - compare = TBenchmark.Compare(timers) - compare.print() - - -def run( - dtype: torch.dtype, MKNs: Iterable[tuple[int, int, int]] -) -> Iterable[TMeasurement]: - results = [] - for m, k, n in MKNs: - timers = bench(dtype, m, k, n, f"scaled-{dtype}-gemm", f"MKN=({m}x{k}x{n})") - print_timers(timers) - results.extend(timers) - - return results - - -# output makers -def make_output( - data: Iterable[TMeasurement], - MKNs: Iterable[tuple[int, int, int]], - base_description: str, - timestamp=None, -): - print(f"== All Results {base_description} ====") - print_timers(data) - - # pickle all the results - timestamp = int(time.time()) if timestamp is None else timestamp - with open(f"{base_description}-{timestamp}.pkl", "wb") as f: - pkl.dump(data, f) - - -# argparse runners - - -def run_square_bench(args): - dim_sizes = list(range(args.dim_start, args.dim_end + 1, args.dim_increment)) - MKNs = list(zip(dim_sizes, dim_sizes, dim_sizes)) - data = run(args.dtype, MKNs) - - make_output(data, MKNs, f"square_bench-{args.dtype}") - - -def run_range_bench(args): - dim_sizes = list(range(args.dim_start, args.dim_end, args.dim_increment)) - n = len(dim_sizes) - Ms = [args.m_constant] * n if args.m_constant is not None else dim_sizes - Ks = [args.k_constant] * n if args.k_constant is not None else dim_sizes - Ns = [args.n_constant] * n if args.n_constant is not None else dim_sizes - MKNs = list(zip(Ms, Ks, Ns)) - data = run(args.dtype, MKNs) - - make_output(data, MKNs, f"range_bench-{args.dtype}") - - -def run_model_bench(args): - print("Benchmarking models:") - for i, model in enumerate(args.models): - print(f"[{i}] {model}") - - def model_shapes(model_name: str, tp_size: int) -> list[tuple[int, int]]: - KNs = [] - for KN, tp_split_dim in copy.deepcopy(WEIGHT_SHAPES[model_name]): - KN[tp_split_dim] = KN[tp_split_dim] // tp_size - KNs.append(KN) - return KNs - - model_bench_data = [] - models_tps = list(itertools.product(args.models, args.tp_sizes)) - for model, tp_size in models_tps: - Ms = args.batch_sizes - KNs = model_shapes(model, tp_size) - MKNs = [] - for m in Ms: - for k, n in KNs: - MKNs.append((m, k, n)) - - data = run(args.dtype, MKNs) - model_bench_data.append(data) - - # Print all results - for data, model_tp in zip(model_bench_data, models_tps): - model, tp_size = model_tp - print(f"== Results {args.dtype} {model}-TP{tp_size} ====") - print_timers(data) - - timestamp = int(time.time()) - - all_data = [] - for d in model_bench_data: - all_data.extend(d) - # pickle all data - with open(f"model_bench-{args.dtype}-{timestamp}.pkl", "wb") as f: - 
pkl.dump(all_data, f) - - -if __name__ == "__main__": - - def to_torch_dtype(dt): - if dt == "int8": - return torch.int8 - if dt == "fp8": - return torch.float8_e4m3fn - raise ValueError("unsupported dtype") - - parser = FlexibleArgumentParser( - description=""" -Benchmark Cutlass GEMM. - - To run square GEMMs: - python3 ./benchmarks/cutlass_benchmarks/sparse_benchmarks.py --dtype fp8 square_bench --dim-start 128 --dim-end 512 --dim-increment 64 - - To run constant N and K and sweep M: - python3 ./benchmarks/cutlass_benchmarks/sparse_benchmarks.py --dtype fp8 range_bench --dim-start 128 --dim-end 512 --dim-increment 64 --n-constant 16384 --k-constant 16384 - - To run dimensions from a model: - python3 ./benchmarks/cutlass_benchmarks/sparse_benchmarks.py --dtype fp8 model_bench --models meta-llama/Llama-2-7b-hf --batch-sizes 16 --tp-sizes 1 - - Output: - - a .pkl file, that is a list of raw torch.benchmark.utils.Measurements for the pytorch and cutlass implementations for the various GEMMs. - """, # noqa: E501 - formatter_class=argparse.RawTextHelpFormatter, - ) - - parser.add_argument( - "--dtype", - type=to_torch_dtype, - required=True, - help="Available options are ['int8', 'fp8']", - ) - subparsers = parser.add_subparsers(dest="cmd") - - square_parser = subparsers.add_parser("square_bench") - square_parser.add_argument("--dim-start", type=int, required=True) - square_parser.add_argument("--dim-end", type=int, required=True) - square_parser.add_argument("--dim-increment", type=int, required=True) - square_parser.set_defaults(func=run_square_bench) - - range_parser = subparsers.add_parser("range_bench") - range_parser.add_argument("--dim-start", type=int, required=True) - range_parser.add_argument("--dim-end", type=int, required=True) - range_parser.add_argument("--dim-increment", type=int, required=True) - range_parser.add_argument("--m-constant", type=int, default=None) - range_parser.add_argument("--n-constant", type=int, default=None) - range_parser.add_argument("--k-constant", type=int, default=None) - range_parser.set_defaults(func=run_range_bench) - - model_parser = subparsers.add_parser("model_bench") - model_parser.add_argument( - "--models", - nargs="+", - type=str, - default=DEFAULT_MODELS, - choices=WEIGHT_SHAPES.keys(), - ) - model_parser.add_argument( - "--tp-sizes", nargs="+", type=int, default=DEFAULT_TP_SIZES - ) - model_parser.add_argument( - "--batch-sizes", nargs="+", type=int, default=DEFAULT_BATCH_SIZES - ) - model_parser.set_defaults(func=run_model_bench) - - args = parser.parse_args() - args.func(args) diff --git a/benchmarks/cutlass_benchmarks/utils.py b/benchmarks/cutlass_benchmarks/utils.py deleted file mode 100644 index b4f3c6bf9..000000000 --- a/benchmarks/cutlass_benchmarks/utils.py +++ /dev/null @@ -1,100 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -# Cutlass bench utils -from collections.abc import Iterable - -import torch - -import vllm._custom_ops as ops - - -def to_fp8(tensor: torch.Tensor) -> torch.Tensor: - finfo = torch.finfo(torch.float8_e4m3fn) - return torch.round(tensor.clamp(min=finfo.min, max=finfo.max)).to( - dtype=torch.float8_e4m3fn - ) - - -def to_int8(tensor: torch.Tensor) -> torch.Tensor: - return torch.round(tensor.clamp(min=-128, max=127)).to(dtype=torch.int8) - - -def to_bf16(tensor: torch.Tensor) -> torch.Tensor: - return tensor.to(dtype=torch.bfloat16) - - -def to_fp16(tensor: torch.Tensor) -> torch.Tensor: - return tensor.to(dtype=torch.float16) - - -def 
make_rand_tensors( - dtype: torch.dtype, m: int, n: int, k: int -) -> tuple[torch.Tensor, torch.Tensor]: - a = torch.randn((m, k), device="cuda") * 5 - b = torch.randn((n, k), device="cuda").t() * 5 - - if dtype == torch.int8: - return to_int8(a), to_int8(b) - if dtype == torch.float8_e4m3fn: - return to_fp8(a), to_fp8(b) - - raise ValueError("unsupported dtype") - - -def prune_to_2_4(tensor): - # Reshape tensor to [N, 4] where N is number of groups of 4 - original_shape = tensor.shape - reshaped = tensor.reshape(-1, 4) - - # Get indices of top 2 absolute values in each group of 4 - _, indices = torch.topk(torch.abs(reshaped), k=2, dim=1) - - # Create binary mask - mask = torch.zeros_like(reshaped) - mask.scatter_(dim=1, index=indices, src=torch.ones_like(indices, dtype=mask.dtype)) - - # Apply mask and reshape back - pruned = reshaped * mask - - # Turn all -0.0 to 0.0 - pruned[pruned == -0.0] = 0.0 - - return pruned.reshape(original_shape) - - -def make_rand_sparse_tensors( - dtype: torch.dtype, m: int, n: int, k: int -) -> tuple[torch.Tensor, torch.Tensor]: - a = torch.randn((m, k), device="cuda") * 5 - b = torch.randn((n, k), device="cuda").t() * 5 - - b = prune_to_2_4(b.t()).t() - - if dtype == torch.int8: - a, b = to_int8(a), to_int8(b) - elif dtype == torch.float8_e4m3fn: - a, b = to_fp8(a), to_fp8(b) - elif dtype == torch.float16: - a, b = to_fp16(a), to_fp16(b) - elif dtype == torch.bfloat16: - a, b = to_bf16(a), to_bf16(b) - else: - raise ValueError("unsupported dtype") - - b_compressed, e = ops.cutlass_sparse_compress(b.t()) - - # Compressed B, Metadata, Original A, B - return b_compressed, e, a, b - - -def make_n_rand_sparse_tensors( - num_tensors: int, dtype: torch.dtype, m: int, n: int, k: int -) -> tuple[Iterable[torch.Tensor], Iterable[torch.Tensor]]: - ABs = [] - for _ in range(num_tensors): - b_comp, e, a, b = make_rand_sparse_tensors(dtype, m, n, k) - if b_comp is not None: - ABs.append(make_rand_sparse_tensors(dtype, m, n, k)) - BComps, Es, As, Bs = zip(*ABs) - return list(BComps), list(Es), list(As), list(Bs) diff --git a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py deleted file mode 100644 index a5a5b52f6..000000000 --- a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py +++ /dev/null @@ -1,372 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import argparse -import copy -import itertools -import pickle as pkl -import time -from collections.abc import Iterable -from typing import Callable, Optional - -import torch -import torch.utils.benchmark as TBenchmark -from torch.utils.benchmark import Measurement as TMeasurement -from utils import make_rand_tensors -from weight_shapes import WEIGHT_SHAPES - -from vllm import _custom_ops as ops -from vllm.model_executor.layers.quantization.utils.fp8_utils import ( - w8a8_block_fp8_matmul, -) -from vllm.utils import FlexibleArgumentParser, cdiv - -DEFAULT_MODELS = list(WEIGHT_SHAPES.keys()) -DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512] -DEFAULT_TP_SIZES = [1] - - -# bench -def bench_fn( - label: str, sub_label: str, description: str, fn: Callable, *args, **kwargs -) -> TMeasurement: - min_run_time = 1 - - globals = { - "args": args, - "kwargs": kwargs, - "fn": fn, - } - return TBenchmark.Timer( - stmt="fn(*args, **kwargs)", - globals=globals, - label=label, - sub_label=sub_label, - description=description, - ).blocked_autorange(min_run_time=min_run_time) - - -def bench_int8( - dtype: torch.dtype, - 
m: int, - k: int, - n: int, - label: str, - sub_label: str, - bench_kernels: Optional[list[str]] = None, -) -> Iterable[TMeasurement]: - """Benchmark INT8-based kernels.""" - assert dtype == torch.int8 - a, b = make_rand_tensors(torch.int8, m, n, k) - scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32) - scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32) - bias = torch.zeros((n,), device="cuda", dtype=torch.bfloat16) - azp = torch.zeros((m,), device="cuda", dtype=torch.int32) - azp_adj = torch.zeros((n,), device="cuda", dtype=torch.int32) - - bench_fns = { - "pytorch_bf16_bf16_bf16_matmul-no-scales": lambda: torch.mm( - a.to(dtype=torch.bfloat16), b.to(dtype=torch.bfloat16) - ), - "pytorch_fp16_fp16_fp16_matmul-no-scales": lambda: torch.mm( - a.to(dtype=torch.float16), b.to(dtype=torch.float16) - ), - "cutlass_i8_i8_bf16_scaled_mm": lambda: ops.cutlass_scaled_mm( - a, b, scale_a, scale_b, torch.bfloat16 - ), - "cutlass_i8_i8_bf16_scaled_mm_bias": lambda: ops.cutlass_scaled_mm( - a, b, scale_a, scale_b, torch.bfloat16, bias - ), - "cutlass_i8_i8_bf16_scaled_mm_azp": lambda: ops.cutlass_scaled_mm_azp( - a, b, scale_a, scale_b, torch.bfloat16, azp_adj - ), - "cutlass_i8_i8_bf16_scaled_mm_azp_bias": lambda: ops.cutlass_scaled_mm_azp( - a, b, scale_a, scale_b, torch.bfloat16, azp_adj, None, bias - ), - "cutlass_i8_i8_bf16_scaled_mm_azp_pt": lambda: ops.cutlass_scaled_mm_azp( - a, b, scale_a, scale_b, torch.bfloat16, azp_adj, azp - ), - "cutlass_i8_i8_bf16_scaled_mm_azp_pt_bias": lambda: ops.cutlass_scaled_mm_azp( - a, b, scale_a, scale_b, torch.bfloat16, azp_adj, azp, bias - ), - } - - timers = [] - for name, fn in bench_fns.items(): - # If bench_kernels is None, run all. Otherwise, run only exact matches. - if bench_kernels is None or name in bench_kernels: - print(f"Running {name}") - timers.append(bench_fn(label, sub_label, name, fn)) - - return timers - - -def bench_fp8( - dtype: torch.dtype, - m: int, - k: int, - n: int, - label: str, - sub_label: str, - bench_kernels: Optional[list[str]] = None, -) -> Iterable[TMeasurement]: - """Benchmark FP8-based kernels.""" - assert dtype == torch.float8_e4m3fn - a, b = make_rand_tensors(torch.float8_e4m3fn, m, n, k) - a_cont = a.contiguous() - scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32) - scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32) - - block_scale_a = torch.rand((m, cdiv(k, 128)), device="cuda", dtype=torch.float32) - block_scale_b = torch.rand( - cdiv(k, 128), cdiv(n, 128), device="cuda", dtype=torch.float32 - ) - block_scale_a_M_major = block_scale_a.t().contiguous().t() - block_scale_b_K_major = block_scale_b.t().contiguous().t() - bias = torch.zeros((n,), device="cuda", dtype=torch.bfloat16) - - print(m, k, n) - - bench_fns = { - "pytorch_bf16_bf16_bf16_matmul-no-scales": lambda: torch.mm( - a.to(dtype=torch.bfloat16), b.to(dtype=torch.bfloat16) - ), - "pytorch_fp16_fp16_fp16_matmul-no-scales": lambda: torch.mm( - a.to(dtype=torch.float16), b.to(dtype=torch.float16) - ), - "pytorch_fp8_fp8_fp16_scaled_mm": lambda: torch._scaled_mm( - a, b, scale_a, scale_b, out_dtype=torch.float16 - ), - "pytorch_fp8_fp8_fp16_scaled_mm_fast_accum": lambda: torch._scaled_mm( - a, b, scale_a, scale_b, out_dtype=torch.float16, use_fast_accum=True - ), - "pytorch_fp8_fp8_bf16_scaled_mm": lambda: torch._scaled_mm( - a, b, scale_a, scale_b, out_dtype=torch.bfloat16 - ), - "pytorch_fp8_fp8_bf16_scaled_mm_fast_accum": lambda: torch._scaled_mm( - a, b, scale_a, scale_b, out_dtype=torch.bfloat16, 
use_fast_accum=True - ), - "cutlass_fp8_fp8_bf16_scaled_mm": lambda: ops.cutlass_scaled_mm( - a, b, scale_a, scale_b, torch.bfloat16 - ), - "cutlass_fp8_fp8_fp16_scaled_mm": lambda: ops.cutlass_scaled_mm( - a, b, scale_a, scale_b, torch.float16 - ), - "cutlass_fp8_fp8_bf16_scaled_mm_bias": lambda: ops.cutlass_scaled_mm( - a, b, scale_a, scale_b, torch.bfloat16, bias - ), - "cutlass_fp8_fp8_fp16_scaled_mm_bias": lambda: ops.cutlass_scaled_mm( - a, b, scale_a, scale_b, torch.float16, bias.to(dtype=torch.float16) - ), - "triton_fp8_fp8_fp16_scaled_mm_blockwise": lambda: w8a8_block_fp8_matmul( - a_cont, b.t(), block_scale_a, block_scale_b.t(), (128, 128) - ), - "cutlass_fp8_fp8_fp16_scaled_mm_blockwise": lambda: ops.cutlass_scaled_mm( - a, b, block_scale_a_M_major, block_scale_b_K_major, torch.float16 - ), - } - - timers = [] - for name, fn in bench_fns.items(): - # If bench_kernels is None, run all. Otherwise, run only exact matches. - if bench_kernels is None or name in bench_kernels: - print(f"Running {name}") - timers.append(bench_fn(label, sub_label, name, fn)) - - return timers - - -def bench( - dtype: torch.dtype, - m: int, - k: int, - n: int, - label: str, - sub_label: str, - bench_kernels: Optional[list[str]] = None, -) -> Iterable[TMeasurement]: - if dtype == torch.int8: - return bench_int8(dtype, m, k, n, label, sub_label, bench_kernels) - if dtype == torch.float8_e4m3fn: - return bench_fp8(dtype, m, k, n, label, sub_label, bench_kernels) - raise ValueError("unsupported type") - - -# runner -def print_timers(timers: Iterable[TMeasurement]): - compare = TBenchmark.Compare(timers) - compare.print() - - -def run( - dtype: torch.dtype, - MKNs: Iterable[tuple[int, int, int]], - bench_kernels: Optional[list[str]] = None, -) -> Iterable[TMeasurement]: - results = [] - for m, k, n in MKNs: - timers = bench( - dtype, - m, - k, - n, - f"scaled-{dtype}-gemm", - f"MKN=({m}x{k}x{n})", - bench_kernels=bench_kernels, - ) - print_timers(timers) - results.extend(timers) - return results - - -def make_output( - data: Iterable[TMeasurement], - MKNs: Iterable[tuple[int, int, int]], - base_description: str, - timestamp=None, -): - print(f"== All Results {base_description} ====") - print_timers(data) - - # pickle all the results - timestamp = int(time.time()) if timestamp is None else timestamp - with open(f"{base_description}-{timestamp}.pkl", "wb") as f: - pkl.dump(data, f) - - -def run_square_bench(args): - dim_sizes = list(range(args.dim_start, args.dim_end + 1, args.dim_increment)) - MKNs = list(zip(dim_sizes, dim_sizes, dim_sizes)) - data = run(args.dtype, MKNs, bench_kernels=args.kernels) - make_output(data, MKNs, f"square_bench-{args.dtype}") - - -def run_range_bench(args): - dim_sizes = list(range(args.dim_start, args.dim_end, args.dim_increment)) - n = len(dim_sizes) - Ms = [args.m_constant] * n if args.m_constant is not None else dim_sizes - Ks = [args.k_constant] * n if args.k_constant is not None else dim_sizes - Ns = [args.n_constant] * n if args.n_constant is not None else dim_sizes - MKNs = list(zip(Ms, Ks, Ns)) - data = run(args.dtype, MKNs, bench_kernels=args.kernels) - make_output(data, MKNs, f"range_bench-{args.dtype}") - - -def run_model_bench(args): - print("Benchmarking models:") - for i, model in enumerate(args.models): - print(f"[{i}] {model}") - - def model_shapes(model_name: str, tp_size: int) -> list[tuple[int, int]]: - KNs = [] - for KN, tp_split_dim in copy.deepcopy(WEIGHT_SHAPES[model_name]): - KN[tp_split_dim] = KN[tp_split_dim] // tp_size - KNs.append(KN) - return KNs - - 
model_bench_data = [] - models_tps = list(itertools.product(args.models, args.tp_sizes)) - for model, tp_size in models_tps: - Ms = args.batch_sizes - KNs = model_shapes(model, tp_size) - MKNs = [] - for m in Ms: - for k, n in KNs: - MKNs.append((m, k, n)) - - data = run(args.dtype, MKNs, bench_kernels=args.kernels) - model_bench_data.append(data) - - # Print all results - for data, model_tp in zip(model_bench_data, models_tps): - model, tp_size = model_tp - print(f"== Results {args.dtype} {model}-TP{tp_size} ====") - print_timers(data) - - timestamp = int(time.time()) - - all_data = [] - for d in model_bench_data: - all_data.extend(d) - # pickle all data - with open(f"model_bench-{args.dtype}-{timestamp}.pkl", "wb") as f: - pkl.dump(all_data, f) - - -if __name__ == "__main__": - - def to_torch_dtype(dt): - if dt == "int8": - return torch.int8 - if dt == "fp8": - return torch.float8_e4m3fn - raise ValueError("unsupported dtype") - - parser = FlexibleArgumentParser( - description=""" -Benchmark Cutlass GEMM. - - To run square GEMMs: - python3 ./benchmarks/cutlass_benchmarks/w8a8_benchmarks.py --dtype fp8 square_bench --dim-start 128 --dim-end 512 --dim-increment 64 - - To run constant N and K and sweep M: - python3 ./benchmarks/cutlass_benchmarks/w8a8_benchmarks.py --dtype fp8 range_bench --dim-start 128 --dim-end 512 --dim-increment 64 --n-constant 16384 --k-constant 16384 - - To run dimensions from a model: - python3 ./benchmarks/cutlass_benchmarks/w8a8_benchmarks.py --dtype fp8 model_bench --models meta-llama/Llama-2-7b-hf --batch-sizes 16 --tp-sizes 1 - - Output: - - a .pkl file, that is a list of raw torch.benchmark.utils.Measurements for the pytorch and cutlass implementations for the various GEMMs. - """, # noqa: E501 - formatter_class=argparse.RawTextHelpFormatter, - ) - - parser.add_argument( - "--dtype", - type=to_torch_dtype, - required=True, - help="Available options are ['int8', 'fp8']", - ) - parser.add_argument( - "--kernels", - nargs="+", - type=str, - default=None, - help="Exact names of the kernels to benchmark. 
If not set, runs all kernels.", - ) - - subparsers = parser.add_subparsers(dest="cmd") - - square_parser = subparsers.add_parser("square_bench") - square_parser.add_argument("--dim-start", type=int, required=True) - square_parser.add_argument("--dim-end", type=int, required=True) - square_parser.add_argument("--dim-increment", type=int, required=True) - square_parser.set_defaults(func=run_square_bench) - - range_parser = subparsers.add_parser("range_bench") - range_parser.add_argument("--dim-start", type=int, required=True) - range_parser.add_argument("--dim-end", type=int, required=True) - range_parser.add_argument("--dim-increment", type=int, required=True) - range_parser.add_argument("--m-constant", type=int, default=None) - range_parser.add_argument("--n-constant", type=int, default=None) - range_parser.add_argument("--k-constant", type=int, default=None) - range_parser.set_defaults(func=run_range_bench) - - model_parser = subparsers.add_parser("model_bench") - model_parser.add_argument( - "--models", - nargs="+", - type=str, - default=DEFAULT_MODELS, - choices=WEIGHT_SHAPES.keys(), - ) - model_parser.add_argument( - "--tp-sizes", nargs="+", type=int, default=DEFAULT_TP_SIZES - ) - model_parser.add_argument( - "--batch-sizes", nargs="+", type=int, default=DEFAULT_BATCH_SIZES - ) - model_parser.set_defaults(func=run_model_bench) - - args = parser.parse_args() - args.func(args) diff --git a/benchmarks/cutlass_benchmarks/weight_shapes.py b/benchmarks/cutlass_benchmarks/weight_shapes.py deleted file mode 100644 index 25b96ef56..000000000 --- a/benchmarks/cutlass_benchmarks/weight_shapes.py +++ /dev/null @@ -1,46 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -# Weight Shapes are in the format -# ([K, N], TP_SPLIT_DIM) -# Example: -# A shape of ([14336, 4096], 0) indicates the following GEMM shape, -# - TP1 : K = 14336, N = 4096 -# - TP2 : K = 7168, N = 4096 -# A shape of ([4096, 6144], 1) indicates the following GEMM shape, -# - TP1 : K = 4096, N = 6144 -# - TP4 : K = 4096, N = 1536 - -# TP1 shapes -WEIGHT_SHAPES = { - "mistralai/Mistral-7B-v0.1": [ - ([4096, 6144], 1), - ([4096, 4096], 0), - ([4096, 28672], 1), - ([14336, 4096], 0), - ], - "meta-llama/Llama-2-7b-hf": [ - ([4096, 12288], 1), - ([4096, 4096], 0), - ([4096, 22016], 1), - ([11008, 4096], 0), - ], - "meta-llama/Llama-3-8b": [ - ([4096, 6144], 1), - ([4096, 4096], 0), - ([4096, 28672], 1), - ([14336, 4096], 0), - ], - "meta-llama/Llama-2-13b-hf": [ - ([5120, 15360], 1), - ([5120, 5120], 0), - ([5120, 27648], 1), - ([13824, 5120], 0), - ], - "meta-llama/Llama-2-70b-hf": [ - ([8192, 10240], 1), - ([8192, 8192], 0), - ([8192, 57344], 1), - ([28672, 8192], 0), - ], -} diff --git a/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh b/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh deleted file mode 100644 index 2c72941cf..000000000 --- a/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh +++ /dev/null @@ -1,147 +0,0 @@ -#!/bin/bash - -# benchmark the overhead of disaggregated prefill. -# methodology: -# - send all request to prefill vLLM instance. It will buffer KV cache. -# - then send all request to decode instance. -# - The TTFT of decode instance is the overhead. - -set -ex - -kill_gpu_processes() { - # kill all processes on GPU. 
-  pgrep pt_main_thread | xargs -r kill -9
-  pgrep python3 | xargs -r kill -9
-  # vLLM now names the process with VLLM prefix after https://github.com/vllm-project/vllm/pull/21445
-  pgrep VLLM | xargs -r kill -9
-  sleep 10
-
-  # remove vllm config file
-  rm -rf ~/.config/vllm
-
-  # Print the GPU memory usage
-  # so that we know if all GPU processes are killed.
-  gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
-  # The memory usage should be 0 MB.
-  echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
-}
-
-wait_for_server() {
-  # wait for vllm server to start
-  # return 1 if vllm server crashes
-  local port=$1
-  timeout 1200 bash -c "
-    until curl -s localhost:${port}/v1/completions > /dev/null; do
-      sleep 1
-    done" && return 0 || return 1
-}
-
-
-benchmark() {
-
-  export VLLM_LOGGING_LEVEL=DEBUG
-  export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
-
-  # compare chunked prefill with disaggregated prefill
-
-  results_folder="./results"
-  model="meta-llama/Meta-Llama-3.1-8B-Instruct"
-  dataset_name="sonnet"
-  dataset_path="../sonnet_4x.txt"
-  num_prompts=10
-  qps=$1
-  prefix_len=50
-  input_len=2048
-  output_len=$2
-
-
-  CUDA_VISIBLE_DEVICES=0 python3 \
-    -m vllm.entrypoints.openai.api_server \
-    --model $model \
-    --port 8100 \
-    --max-model-len 10000 \
-    --gpu-memory-utilization 0.6 \
-    --kv-transfer-config \
-    '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
-
-
-  CUDA_VISIBLE_DEVICES=1 python3 \
-    -m vllm.entrypoints.openai.api_server \
-    --model $model \
-    --port 8200 \
-    --max-model-len 10000 \
-    --gpu-memory-utilization 0.6 \
-    --kv-transfer-config \
-    '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
-
-  wait_for_server 8100
-  wait_for_server 8200
-
-  # let the prefill instance finish prefill
-  vllm bench serve \
-    --backend vllm \
-    --model $model \
-    --dataset-name $dataset_name \
-    --dataset-path $dataset_path \
-    --sonnet-input-len $input_len \
-    --sonnet-output-len "$output_len" \
-    --sonnet-prefix-len $prefix_len \
-    --num-prompts $num_prompts \
-    --port 8100 \
-    --save-result \
-    --result-dir $results_folder \
-    --result-filename disagg_prefill_tp1.json \
-    --request-rate "inf"
-
-
-  # send the request to decode.
-  # The TTFT of this command will be the overhead of disagg prefill impl.
-  vllm bench serve \
-    --backend vllm \
-    --model $model \
-    --dataset-name $dataset_name \
-    --dataset-path $dataset_path \
-    --sonnet-input-len $input_len \
-    --sonnet-output-len "$output_len" \
-    --sonnet-prefix-len $prefix_len \
-    --num-prompts $num_prompts \
-    --port 8200 \
-    --save-result \
-    --result-dir $results_folder \
-    --result-filename disagg_prefill_tp1_overhead.json \
-    --request-rate "$qps"
-  kill_gpu_processes
-
-}
-
-
-main() {
-
-  (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
-  (which jq) || (apt-get -y install jq)
-  (which socat) || (apt-get -y install socat)
-
-  pip install quart httpx datasets
-
-  cd "$(dirname "$0")"
-
-  cd ..
-  # create sonnet-4x.txt
-  echo "" > sonnet_4x.txt
-  for _ in {1..4}
-  do
-    cat sonnet.txt >> sonnet_4x.txt
-  done
-  cd disagg_benchmarks
-
-  rm -rf results
-  mkdir results
-
-  default_qps=1
-  default_output_len=1
-  benchmark $default_qps $default_output_len
-
-}
-
-
-main "$@"
diff --git a/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh b/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh
deleted file mode 100644
index 0bbf7cd2b..000000000
--- a/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh
+++ /dev/null
@@ -1,165 +0,0 @@
-#!/bin/bash
-
-# Requirement: 2x GPUs.
-
-
-# Model: meta-llama/Meta-Llama-3.1-8B-Instruct
-# Query: 1024 input tokens, 6 output tokens, QPS 2/4/6/8, 100 requests
-# Resource: 2x GPU
-# Approaches:
-# 2. Chunked prefill: 2 vllm instance with tp=4, equivalent to 1 tp=4 instance with QPS 4
-# 3. Disaggregated prefill: 1 prefilling instance and 1 decoding instance
-#    Prefilling instance: max_output_token=1
-#    Decoding instance: force the input tokens be the same across requests to bypass prefilling
-
-set -ex
-
-kill_gpu_processes() {
-  # kill all processes on GPU.
-  pgrep pt_main_thread | xargs -r kill -9
-  pgrep python3 | xargs -r kill -9
-  # vLLM now names the process with VLLM prefix after https://github.com/vllm-project/vllm/pull/21445
-  pgrep VLLM | xargs -r kill -9
-  for port in 8000 8100 8200; do lsof -t -i:$port | xargs -r kill -9; done
-  sleep 1
-}
-
-wait_for_server() {
-  # wait for vllm server to start
-  # return 1 if vllm server crashes
-  local port=$1
-  timeout 1200 bash -c "
-    until curl -s localhost:${port}/v1/completions > /dev/null; do
-      sleep 1
-    done" && return 0 || return 1
-}
-
-
-launch_chunked_prefill() {
-  model="meta-llama/Meta-Llama-3.1-8B-Instruct"
-  # disagg prefill
-  CUDA_VISIBLE_DEVICES=0 python3 \
-    -m vllm.entrypoints.openai.api_server \
-    --model $model \
-    --port 8100 \
-    --max-model-len 10000 \
-    --enable-chunked-prefill \
-    --gpu-memory-utilization 0.6 &
-  CUDA_VISIBLE_DEVICES=1 python3 \
-    -m vllm.entrypoints.openai.api_server \
-    --model $model \
-    --port 8200 \
-    --max-model-len 10000 \
-    --enable-chunked-prefill \
-    --gpu-memory-utilization 0.6 &
-  wait_for_server 8100
-  wait_for_server 8200
-  python3 round_robin_proxy.py &
-  sleep 1
-}
-
-
-launch_disagg_prefill() {
-  model="meta-llama/Meta-Llama-3.1-8B-Instruct"
-  # disagg prefill
-  CUDA_VISIBLE_DEVICES=0 python3 \
-    -m vllm.entrypoints.openai.api_server \
-    --model $model \
-    --port 8100 \
-    --max-model-len 10000 \
-    --gpu-memory-utilization 0.6 \
-    --kv-transfer-config \
-    '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
-
-  CUDA_VISIBLE_DEVICES=1 python3 \
-    -m vllm.entrypoints.openai.api_server \
-    --model $model \
-    --port 8200 \
-    --max-model-len 10000 \
-    --gpu-memory-utilization 0.6 \
-    --kv-transfer-config \
-    '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
-
-  wait_for_server 8100
-  wait_for_server 8200
-  python3 disagg_prefill_proxy_server.py &
-  sleep 1
-}
-
-
-benchmark() {
-  results_folder="./results"
-  model="meta-llama/Meta-Llama-3.1-8B-Instruct"
-  dataset_name="sonnet"
-  dataset_path="../sonnet_4x.txt"
-  num_prompts=100
-  qps=$1
-  prefix_len=50
-  input_len=1024
-  output_len=$2
-  tag=$3
-
-  vllm bench serve \
-    --backend vllm \
-    --model $model \
-    --dataset-name $dataset_name \
-    --dataset-path $dataset_path \
-    --sonnet-input-len $input_len \
-    --sonnet-output-len "$output_len" \
-    --sonnet-prefix-len $prefix_len \
-    --num-prompts $num_prompts \
-    --port 8000 \
-    --save-result \
-    --result-dir $results_folder \
-    --result-filename "$tag"-qps-"$qps".json \
-    --request-rate "$qps"
-
-  sleep 2
-}
-
-
-main() {
-
-  (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
-  (which jq) || (apt-get -y install jq)
-  (which socat) || (apt-get -y install socat)
-  (which lsof) || (apt-get -y install lsof)
-
-  pip install quart httpx matplotlib aiohttp datasets
-
-  cd "$(dirname "$0")"
-
-  cd ..
-  # create sonnet-4x.txt so that we can sample 2048 tokens for input
-  echo "" > sonnet_4x.txt
-  for _ in {1..4}
-  do
-    cat sonnet.txt >> sonnet_4x.txt
-  done
-  cd disagg_benchmarks
-
-  rm -rf results
-  mkdir results
-
-  default_output_len=6
-
-  export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
-
-  launch_chunked_prefill
-  for qps in 2 4 6 8; do
-    benchmark $qps $default_output_len chunked_prefill
-  done
-  kill_gpu_processes
-
-  launch_disagg_prefill
-  for qps in 2 4 6 8; do
-    benchmark $qps $default_output_len disagg_prefill
-  done
-  kill_gpu_processes
-
-  python3 visualize_benchmark_results.py
-
-}
-
-
-main "$@"
diff --git a/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py b/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py
deleted file mode 100644
index 904f80534..000000000
--- a/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py
+++ /dev/null
@@ -1,199 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import argparse
-import asyncio
-import logging
-import os
-
-import aiohttp
-from quart import Quart, Response, make_response, request
-from rate_limiter import RateLimiter
-from request_queue import RequestQueue
-
-# Configure logging
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger(__name__)
-
-
-def parse_args():
-    """parse command line arguments"""
-    parser = argparse.ArgumentParser(description="vLLM P/D disaggregation proxy server")
-
-    # Add args
-    parser.add_argument(
-        "--timeout",
-        type=float,
-        default=300,
-        help="Timeout for backend service requests in seconds (default: 300)",
-    )
-    parser.add_argument(
-        "--max-concurrent",
-        type=int,
-        default=100,
-        help="Maximum concurrent requests to backend services (default: 100)",
-    )
-    parser.add_argument(
-        "--queue-size",
-        type=int,
-        default=500,
-        help="Maximum number of requests in the queue (default: 500)",
-    )
-    parser.add_argument(
-        "--rate-limit",
-        type=int,
-        default=40,
-        help="Maximum requests per second (default: 40)",
-    )
-    parser.add_argument(
-        "--port",
-        type=int,
-        default=8000,
-        help="Port to run the server on (default: 8000)",
-    )
-    parser.add_argument(
-        "--prefill-url",
-        type=str,
-        default="http://localhost:8100/v1/completions",
-        help="Prefill service endpoint URL",
-    )
-    parser.add_argument(
-        "--decode-url",
-        type=str,
-        default="http://localhost:8200/v1/completions",
-        help="Decode service endpoint URL",
-    )
-
-    return parser.parse_args()
-
-
-def main():
-    """parse command line arguments"""
-    args = parse_args()
-
-    # Initialize configuration using command line parameters
-    AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=args.timeout)
-    MAX_CONCURRENT_REQUESTS = args.max_concurrent
-    REQUEST_QUEUE_SIZE = args.queue_size
-    RATE_LIMIT = args.rate_limit
-    PREFILL_SERVICE_URL = args.prefill_url
-    DECODE_SERVICE_URL = args.decode_url
-    PORT = args.port
-
-    app = Quart(__name__)
-
-    # Initialize the rate limiter and request queue
-    rate_limiter =
RateLimiter(RATE_LIMIT) - request_queue = RequestQueue(MAX_CONCURRENT_REQUESTS, REQUEST_QUEUE_SIZE) - - # Attach the configuration object to the application instance - app.config.update( - { - "AIOHTTP_TIMEOUT": AIOHTTP_TIMEOUT, - "rate_limiter": rate_limiter, - "request_queue": request_queue, - "PREFILL_SERVICE_URL": PREFILL_SERVICE_URL, - "DECODE_SERVICE_URL": DECODE_SERVICE_URL, - } - ) - - # Start queue processing on app startup - @app.before_serving - async def startup(): - """Start request processing task when app starts serving""" - asyncio.create_task(request_queue.process()) - - async def forward_request(url, data): - """Forward request to backend service with rate limiting and error handling""" - headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"} - - # Use rate limiter as context manager - async with ( - rate_limiter, - aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session, - ): - try: - async with session.post( - url=url, json=data, headers=headers - ) as response: - if response.status == 200: - # Stream response chunks - async for chunk_bytes in response.content.iter_chunked(1024): - yield chunk_bytes - else: - # Handle backend service errors - error_text = await response.text() - logger.error( - "Backend service error: %s - %s", - response.status, - error_text, - ) - yield b'{"error": "Backend service error"}' - except aiohttp.ClientError as e: - # Handle connection errors - logger.error("Connection error to %s: %s", url, str(e)) - yield b'{"error": "Service unavailable"}' - except asyncio.TimeoutError: - # Handle timeout errors - logger.error("Timeout connecting to %s", url) - yield b'{"error": "Service timeout"}' - - async def process_request(): - """Process a single request through prefill and decode stages""" - try: - original_request_data = await request.get_json() - - # Create prefill request (max_tokens=1) - prefill_request = original_request_data.copy() - prefill_request["max_tokens"] = 1 - - # Execute prefill stage - async for _ in forward_request(PREFILL_SERVICE_URL, prefill_request): - continue - - # Execute decode stage and stream response - generator = forward_request(DECODE_SERVICE_URL, original_request_data) - response = await make_response(generator) - response.timeout = None # Disable timeout for streaming response - return response - - except Exception: - logger.exception("Error processing request") - return Response( - response=b'{"error": "Internal server error"}', - status=500, - content_type="application/json", - ) - - @app.route("/v1/completions", methods=["POST"]) - async def handle_request(): - """Handle incoming API requests with concurrency and rate limiting""" - # Create task for request processing - task = asyncio.create_task(process_request()) - - # Enqueue request or reject if queue is full - if not await request_queue.enqueue(task): - return Response( - response=b'{"error": "Server busy, try again later"}', - status=503, - content_type="application/json", - ) - - try: - # Return the response from the processing task - return await task - except asyncio.CancelledError: - # Handle task cancellation (timeout or queue full) - logger.warning("Request cancelled due to timeout or queue full") - return Response( - response=b'{"error": "Request cancelled"}', - status=503, - content_type="application/json", - ) - - # Start the Quart server with host can be set to 0.0.0.0 - app.run(port=PORT) - - -if __name__ == "__main__": - main() diff --git a/benchmarks/disagg_benchmarks/rate_limiter.py 
b/benchmarks/disagg_benchmarks/rate_limiter.py deleted file mode 100644 index 87ac8cb6a..000000000 --- a/benchmarks/disagg_benchmarks/rate_limiter.py +++ /dev/null @@ -1,45 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import asyncio -import time - - -class RateLimiter: - """Token bucket rate limiter implementation""" - - def __init__(self, rate_limit): - self.rate_limit = rate_limit # Requests per second - self.num_available_tokens = rate_limit # Available tokens - self.last_refill = time.monotonic() # Last token refill time - self.lock = asyncio.Lock() # Synchronization lock - - async def acquire(self): - """Acquire a token from the rate limiter""" - while True: - async with self.lock: - current_time = time.monotonic() - elapsed = current_time - self.last_refill - - # Refill num_available_tokens if more than 1 second has passed - if elapsed > 1.0: - self.num_available_tokens = self.rate_limit - self.last_refill = current_time - - # Check if num_available_tokens are available - if self.num_available_tokens > 0: - self.num_available_tokens -= 1 - return True - - # Calculate wait time if no num_available_tokens available - wait_time = 1.0 - elapsed - await asyncio.sleep(wait_time) - - async def __aenter__(self): - """Enter async context manager - acquire token""" - await self.acquire() - return self - - async def __aexit__(self, exc_type, exc_value, traceback): - """Exit async context manager - no cleanup needed""" - pass diff --git a/benchmarks/disagg_benchmarks/request_queue.py b/benchmarks/disagg_benchmarks/request_queue.py deleted file mode 100644 index 410bcb956..000000000 --- a/benchmarks/disagg_benchmarks/request_queue.py +++ /dev/null @@ -1,39 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import asyncio -from collections import deque - - -class RequestQueue: - """Request queue manager with concurrency control""" - - def __init__(self, max_concurrent, max_queue_size): - # Maximum concurrent requests - self.max_concurrent = max_concurrent - self.max_queue_size = max_queue_size # Maximum queue size - # Concurrency control - self.semaphore = asyncio.Semaphore(max_concurrent) - self.queue = deque() # Request queue - self.queue_size = 0 # Current queue size - self.lock = asyncio.Lock() # Sync queue Lock - - async def enqueue(self, task): - """Add a request task to the queue""" - async with self.lock: - if self.queue_size >= self.max_queue_size: - return False - - self.queue.append(task) - self.queue_size += 1 - return True - - async def process(self): - """Process queued requests using semaphore for concurrency control""" - while True: - if self.queue: - async with self.semaphore, self.lock: - task = self.queue.popleft() - self.queue_size -= 1 - await task - await asyncio.sleep(0.01) # Yield control to event loop diff --git a/benchmarks/disagg_benchmarks/round_robin_proxy.py b/benchmarks/disagg_benchmarks/round_robin_proxy.py deleted file mode 100644 index b1df2f255..000000000 --- a/benchmarks/disagg_benchmarks/round_robin_proxy.py +++ /dev/null @@ -1,63 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import asyncio -import itertools - -import aiohttp -from aiohttp import web - - -class RoundRobinProxy: - def __init__(self, target_ports): - self.target_ports = target_ports - self.port_cycle = itertools.cycle(self.target_ports) - - async def handle_request(self, request): - 
target_port = next(self.port_cycle) - target_url = f"http://localhost:{target_port}{request.path_qs}" - - async with aiohttp.ClientSession() as session: - try: - # Forward the request - async with session.request( - method=request.method, - url=target_url, - headers=request.headers, - data=request.content, - ) as response: - # Start sending the response - resp = web.StreamResponse( - status=response.status, headers=response.headers - ) - await resp.prepare(request) - - # Stream the response content - async for chunk in response.content.iter_any(): - await resp.write(chunk) - - await resp.write_eof() - return resp - - except Exception as e: - return web.Response(text=f"Error: {str(e)}", status=500) - - -async def main(): - proxy = RoundRobinProxy([8100, 8200]) - app = web.Application() - app.router.add_route("*", "/{path:.*}", proxy.handle_request) - - runner = web.AppRunner(app) - await runner.setup() - site = web.TCPSite(runner, "localhost", 8000) - await site.start() - - print("Proxy server started on http://localhost:8000") - - # Keep the server running - await asyncio.Event().wait() - - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/benchmarks/disagg_benchmarks/visualize_benchmark_results.py b/benchmarks/disagg_benchmarks/visualize_benchmark_results.py deleted file mode 100644 index 74fa56d07..000000000 --- a/benchmarks/disagg_benchmarks/visualize_benchmark_results.py +++ /dev/null @@ -1,47 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import json - -import matplotlib.pyplot as plt -import pandas as pd - -if __name__ == "__main__": - data = [] - for name in ["disagg_prefill", "chunked_prefill"]: - for qps in [2, 4, 6, 8]: - with open(f"results/{name}-qps-{qps}.json") as f: - x = json.load(f) - x["name"] = name - x["qps"] = qps - data.append(x) - - df = pd.DataFrame.from_dict(data) - dis_df = df[df["name"] == "disagg_prefill"] - chu_df = df[df["name"] == "chunked_prefill"] - - plt.style.use("bmh") - plt.rcParams["font.size"] = 20 - - for key in [ - "mean_ttft_ms", - "median_ttft_ms", - "p99_ttft_ms", - "mean_itl_ms", - "median_itl_ms", - "p99_itl_ms", - ]: - fig, ax = plt.subplots(figsize=(11, 7)) - plt.plot( - dis_df["qps"], dis_df[key], label="disagg_prefill", marker="o", linewidth=4 - ) - plt.plot( - chu_df["qps"], chu_df[key], label="chunked_prefill", marker="o", linewidth=4 - ) - ax.legend() - - ax.set_xlabel("QPS") - ax.set_ylabel(key) - ax.set_ylim(bottom=0) - fig.savefig(f"results/{key}.png") - plt.close(fig) diff --git a/benchmarks/fused_kernels/layernorm_rms_benchmarks.py b/benchmarks/fused_kernels/layernorm_rms_benchmarks.py deleted file mode 100644 index 901524214..000000000 --- a/benchmarks/fused_kernels/layernorm_rms_benchmarks.py +++ /dev/null @@ -1,228 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import pickle as pkl -import time -from collections.abc import Iterable -from dataclasses import dataclass -from itertools import product -from typing import Callable, Optional - -import torch -import torch.utils.benchmark as TBenchmark -from torch.utils.benchmark import Measurement as TMeasurement -from tqdm import tqdm - -import vllm._custom_ops as ops -from vllm.model_executor.layers.layernorm import RMSNorm - - -@dataclass -class bench_params_t: - num_tokens: int - hidden_size: int - add_residual: bool - dtype: torch.dtype - - def description(self): - return ( - f"N {self.num_tokens} " - f"x D {self.hidden_size} 
" - f"x R {self.add_residual} " - f"x DT {self.dtype}" - ) - - -def get_bench_params() -> list[bench_params_t]: - ## Test Fixtures - NUM_TOKENS = [2**x for x in range(11)] - HIDDEN_SIZES = list(range(1024, 8129, 1024)) - ADD_RESIDUAL = [True, False] - DTYPES = [torch.bfloat16, torch.float] - - combinations = product(NUM_TOKENS, HIDDEN_SIZES, ADD_RESIDUAL, DTYPES) - bench_params = list( - map(lambda x: bench_params_t(x[0], x[1], x[2], x[3]), combinations) - ) - return bench_params - - -# Reference impls -def unfused_int8_impl( - rms_norm_layer: RMSNorm, - x: torch.Tensor, - residual: Optional[torch.Tensor], - quant_dtype: torch.dtype, -): - # Norm - torch_out = None - if residual is None: - torch_out = rms_norm_layer.forward_cuda(x, residual) - else: - torch_out, _ = rms_norm_layer.forward_cuda(x, residual) - - # Quant - torch_out, _, _ = ops.scaled_int8_quant(torch_out) - - -def unfused_fp8_impl( - rms_norm_layer: RMSNorm, - x: torch.Tensor, - residual: Optional[torch.Tensor], - quant_dtype: torch.dtype, -): - # Norm - torch_out = None - if residual is None: - torch_out = rms_norm_layer.forward_cuda(x, residual) - else: - torch_out, _ = rms_norm_layer.forward_cuda(x, residual) - - # Quant - torch_out, _ = ops.scaled_fp8_quant(torch_out) - - -def fused_impl( - rms_norm_layer: RMSNorm, # this stores the weights - x: torch.Tensor, - residual: Optional[torch.Tensor], - quant_dtype: torch.dtype, -): - out, _ = ops.rms_norm_dynamic_per_token_quant( - x, rms_norm_layer.weight, 1e-6, quant_dtype, residual=residual - ) - - -# Bench functions -def bench_fn( - rms_norm_layer: RMSNorm, - x: torch.Tensor, - residual: torch.Tensor, - quant_dtype: torch.dtype, - label: str, - sub_label: str, - fn: Callable, - description: str, -) -> TMeasurement: - min_run_time = 1 - - globals = { - "rms_norm_layer": rms_norm_layer, - "x": x, - "residual": residual, - "quant_dtype": quant_dtype, - "fn": fn, - } - return TBenchmark.Timer( - stmt="fn(rms_norm_layer, x, residual, quant_dtype)", - globals=globals, - label=label, - sub_label=sub_label, - description=description, - ).blocked_autorange(min_run_time=min_run_time) - - -def bench(params: bench_params_t, label: str, sub_label: str) -> Iterable[TMeasurement]: - # Make inputs - layer = RMSNorm(params.hidden_size, 1e-6).to(dtype=params.dtype) - # Make weights - layer.weight.data.normal_(mean=1.0, std=0.1) - # Make inputs - scale = 1 / params.hidden_size - x = ( - torch.randn( - params.num_tokens, params.hidden_size, dtype=params.dtype, device="cuda" - ) - * scale - ) - residual = ( - (torch.randn_like(x) * scale).to(device="cuda") if params.add_residual else None - ) - - timers = [] - - # unfused int8 impl. - timers.append( - bench_fn( - layer, - x, - residual, - torch.int8, - label, - sub_label, - unfused_int8_impl, - "unfused_int8_impl", - ) - ) - - # unfused fp8 impl. - timers.append( - bench_fn( - layer, - x, - residual, - torch.float8_e4m3fn, - label, - sub_label, - unfused_fp8_impl, - "unfused_fp8_impl", - ) - ) - - # fused int8 impl. - timers.append( - bench_fn( - layer, - x, - residual, - torch.int8, - label, - sub_label, - fused_impl, - "fused_int8_impl", - ) - ) - - # fused fp8 impl. 
- timers.append( - bench_fn( - layer, - x, - residual, - torch.float8_e4m3fn, - label, - sub_label, - fused_impl, - "fused_fp8_impl", - ) - ) - - print_timers(timers) - - return timers - - -# launch bench -# runner -def print_timers(timers: Iterable[TMeasurement]): - compare = TBenchmark.Compare(timers) - compare.print() - - -def main(): - torch.set_default_device("cuda") - bench_params = get_bench_params() - - timers = [] - for bp in tqdm(bench_params): - timers.extend(bench(bp, "rms-norm-dynamic-per-token-quant", bp.description())) - print_timers(timers) - - # pickle all the results - timestamp = int(time.time()) - with open(f"rms_norm_dpt_quant-{timestamp}.pkl", "wb") as f: - pkl.dump(timers, f) - - -if __name__ == "__main__": - main() diff --git a/benchmarks/kernels/bench_block_fp8_gemm.py b/benchmarks/kernels/bench_block_fp8_gemm.py deleted file mode 100644 index f1e504499..000000000 --- a/benchmarks/kernels/bench_block_fp8_gemm.py +++ /dev/null @@ -1,145 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import torch - -from vllm.model_executor.layers.quantization.utils.fp8_utils import ( - apply_w8a8_block_fp8_linear, -) -from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( - CUTLASS_BLOCK_FP8_SUPPORTED, -) -from vllm.platforms import current_platform -from vllm.triton_utils import triton as vllm_triton - -assert current_platform.is_cuda(), ( - "Only support benchmarking w8a8 block fp8 kernel on CUDA device." -) - -# DeepSeek-V3 weight shapes -DEEPSEEK_V3_SHAPES = [ - (512 + 64, 7168), - (2112, 7168), - ((128 + 64) * 128, 7168), - (128 * (128 + 128), 512), - (7168, 16384), - (7168, 18432), - (18432 * 2, 7168), - (24576, 1536), - (12288, 7168), - (4096, 7168), - (7168, 2048), -] - - -def build_w8a8_block_fp8_runner(M, N, K, block_size, device, use_cutlass): - """Build runner function for w8a8 block fp8 matmul.""" - factor_for_scale = 1e-2 - - fp8_info = torch.finfo(torch.float8_e4m3fn) - fp8_max, fp8_min = fp8_info.max, fp8_info.min - - # Create random FP8 tensors - A_ref = (torch.rand(M, K, dtype=torch.bfloat16, device=device) - 0.5) * 2 * fp8_max - - B_ref = (torch.rand(N, K, dtype=torch.bfloat16, device=device) - 0.5) * 2 * fp8_max - B = B_ref.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn) - - # Create scales - block_n, block_k = block_size[0], block_size[1] - n_tiles = (N + block_n - 1) // block_n - k_tiles = (K + block_k - 1) // block_k - - Bs = ( - torch.rand(n_tiles, k_tiles, dtype=torch.float32, device=device) - * factor_for_scale - ) - - # SM90 CUTLASS requires row-major format for scales - if use_cutlass and current_platform.is_device_capability(90): - Bs = Bs.T.contiguous() - - def run(): - if use_cutlass: - return apply_w8a8_block_fp8_linear( - A_ref, B, block_size, Bs, cutlass_block_fp8_supported=True - ) - else: - return apply_w8a8_block_fp8_linear( - A_ref, B, block_size, Bs, cutlass_block_fp8_supported=False - ) - - return run - - -# Determine available providers -available_providers = ["torch-bf16", "w8a8-block-fp8-triton"] -plot_title = "BF16 vs W8A8 Block FP8 GEMMs" - -if CUTLASS_BLOCK_FP8_SUPPORTED: - available_providers.append("w8a8-block-fp8-cutlass") - - -@vllm_triton.testing.perf_report( - vllm_triton.testing.Benchmark( - x_names=["batch_size"], - x_vals=[1, 16, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384], - x_log=False, - line_arg="provider", - line_vals=available_providers, - line_names=available_providers, - ylabel="TFLOP/s (larger is better)", - 
plot_name="BF16 vs W8A8 Block FP8 GEMMs", - args={}, - ) -) -def benchmark_tflops(batch_size, provider, N, K, block_size=(128, 128)): - M = batch_size - device = "cuda" - - quantiles = [0.5, 0.2, 0.8] - - if provider == "torch-bf16": - a = torch.randn((M, K), device=device, dtype=torch.bfloat16) - b = torch.randn((N, K), device=device, dtype=torch.bfloat16) - ms, min_ms, max_ms = vllm_triton.testing.do_bench_cudagraph( - lambda: torch.nn.functional.linear(a, b), quantiles=quantiles - ) - elif provider == "w8a8-block-fp8-triton": - run_w8a8_triton = build_w8a8_block_fp8_runner( - M, N, K, block_size, device, use_cutlass=False - ) - ms, min_ms, max_ms = vllm_triton.testing.do_bench_cudagraph( - lambda: run_w8a8_triton(), quantiles=quantiles - ) - elif provider == "w8a8-block-fp8-cutlass": - run_w8a8_cutlass = build_w8a8_block_fp8_runner( - M, N, K, block_size, device, use_cutlass=True - ) - ms, min_ms, max_ms = vllm_triton.testing.do_bench_cudagraph( - lambda: run_w8a8_cutlass(), quantiles=quantiles - ) - else: - raise ValueError(f"Unknown provider: {provider}") - - to_tflops = lambda t_ms: (2 * M * N * K) * 1e-12 / (t_ms * 1e-3) - return to_tflops(ms), to_tflops(max_ms), to_tflops(min_ms) - - -if __name__ == "__main__": - block_size = (128, 128) - - for N, K in DEEPSEEK_V3_SHAPES: - print(f"\nBenchmarking DeepSeek-V3, N={N} K={K}") - - print(f"TFLOP/s comparison (block_size={block_size}):") - benchmark_tflops.run( - print_data=True, - # show_plots=False, - # save_path=f"bench_w8a8_block_fp8_tflops_n{N}_k{K}", - N=N, - K=K, - block_size=block_size, - ) - - print("\nBenchmark finished!") diff --git a/benchmarks/kernels/bench_fp8_gemm.py b/benchmarks/kernels/bench_fp8_gemm.py deleted file mode 100644 index 920961899..000000000 --- a/benchmarks/kernels/bench_fp8_gemm.py +++ /dev/null @@ -1,159 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import argparse -import copy -import itertools - -import torch -from weight_shapes import WEIGHT_SHAPES - -from vllm._custom_ops import cutlass_scaled_mm as vllm_scaled_mm -from vllm._custom_ops import scaled_fp8_quant as vllm_scaled_fp8_quant -from vllm.triton_utils import triton - -PROVIDER_CFGS = { - "torch-bf16": dict(enabled=True), - "fp8-tensor-w-token-a": dict( - w="tensor", a="token", no_a_quant=False, enabled=False - ), - "fp8-tensor-w-tensor-a": dict( - w="tensor", a="tensor", no_a_quant=False, enabled=True - ), - "fp8-channel-w-token-a": dict( - w="channel", a="token", no_a_quant=False, enabled=True - ), - "fp8-channel-w-tensor-a": dict( - w="channel", a="tensor", no_a_quant=False, enabled=False - ), - "fp8-tensor-w-token-a-noquant": dict( - w="tensor", a="token", no_a_quant=True, enabled=False - ), - "fp8-tensor-w-tensor-a-noquant": dict( - w="tensor", a="tensor", no_a_quant=True, enabled=True - ), - "fp8-channel-w-token-a-noquant": dict( - w="channel", a="token", no_a_quant=True, enabled=True - ), - "fp8-channel-w-tensor-a-noquant": dict( - w="channel", a="tensor", no_a_quant=True, enabled=False - ), -} - -_enabled = [k for k, v in PROVIDER_CFGS.items() if v["enabled"]] - - -def _quant_weight_fp8(b: torch.Tensor, w_type: str, device: str): - if w_type == "tensor": - scale_b = torch.ones(1, device=device, dtype=torch.float32) - b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b) - else: - b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, use_per_token_if_dynamic=True) - return b_fp8.t(), scale_b_fp8 - - -def build_fp8_runner(cfg, a, b, dtype, device): - b_fp8, scale_b_fp8 = 
_quant_weight_fp8(b, cfg["w"], device) - - scale_a_const = ( - torch.ones(1, device=device, dtype=torch.float32) - if cfg["a"] == "tensor" - else None - ) - - if cfg["no_a_quant"]: - if cfg["a"] == "tensor": - a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, scale_a_const) - else: - a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, use_per_token_if_dynamic=True) - - def run(): - return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype) - - return run - - if cfg["a"] == "tensor": - - def run(): - a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, scale_a_const) - return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype) - - else: - - def run(): - a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, use_per_token_if_dynamic=True) - return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype) - - return run - - -@triton.testing.perf_report( - triton.testing.Benchmark( - x_names=["batch_size"], - x_vals=[1, 16, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384], - x_log=False, - line_arg="provider", - line_vals=_enabled, - line_names=_enabled, - ylabel="TFLOP/s (larger is better)", - plot_name="BF16 vs FP8 GEMMs", - args={}, - ) -) -def benchmark(batch_size, provider, N, K): - M = batch_size - device = "cuda" - dtype = torch.bfloat16 - - a = torch.randn((M, K), device=device, dtype=dtype) - b = torch.randn((N, K), device=device, dtype=dtype) - - quantiles = [0.5, 0.2, 0.8] - - if provider == "torch-bf16": - ms, min_ms, max_ms = triton.testing.do_bench_cudagraph( - lambda: torch.nn.functional.linear(a, b), quantiles=quantiles - ) - else: - cfg = PROVIDER_CFGS[provider] - run_quant = build_fp8_runner(cfg, a, b, dtype, device) - ms, min_ms, max_ms = triton.testing.do_bench_cudagraph( - lambda: run_quant(), quantiles=quantiles - ) - - to_tflops = lambda t_ms: (2 * M * N * K) * 1e-12 / (t_ms * 1e-3) - return to_tflops(ms), to_tflops(max_ms), to_tflops(min_ms) - - -def prepare_shapes(args): - out = [] - for model, tp_size in itertools.product(args.models, args.tp_sizes): - for KN, tp_dim in copy.deepcopy(WEIGHT_SHAPES[model]): - KN[tp_dim] //= tp_size - KN.append(model) - out.append(KN) - return out - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--models", - nargs="+", - type=str, - default=["meta-llama/Llama-3.1-8B-Instruct"], - choices=list(WEIGHT_SHAPES.keys()), - ) - parser.add_argument("--tp-sizes", nargs="+", type=int, default=[1]) - args = parser.parse_args() - - for K, N, model in prepare_shapes(args): - print(f"{model}, N={N} K={K}, BF16 vs FP8 GEMMs TFLOP/s:") - benchmark.run( - print_data=True, - show_plots=True, - save_path=f"bench_fp8_res_n{N}_k{K}", - N=N, - K=K, - ) - - print("Benchmark finished!") diff --git a/benchmarks/kernels/bench_int8_gemm.py b/benchmarks/kernels/bench_int8_gemm.py deleted file mode 100644 index e9c6d6440..000000000 --- a/benchmarks/kernels/bench_int8_gemm.py +++ /dev/null @@ -1,169 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import argparse -import copy -import itertools - -import torch -from weight_shapes import WEIGHT_SHAPES - -from vllm._custom_ops import cutlass_scaled_mm as vllm_scaled_mm -from vllm._custom_ops import scaled_int8_quant as vllm_scaled_int8_quant -from vllm.triton_utils import triton - -PROVIDER_CFGS = { - "torch-bf16": dict(enabled=True), - "int8-tensor-w-token-a": dict( - w="tensor", a="token", no_a_quant=False, enabled=False - ), - "int8-tensor-w-tensor-a": dict( - w="tensor", a="tensor", no_a_quant=False, 
enabled=True - ), - "int8-channel-w-token-a": dict( - w="channel", a="token", no_a_quant=False, enabled=True - ), - "int8-channel-w-tensor-a": dict( - w="channel", a="tensor", no_a_quant=False, enabled=False - ), - "int8-tensor-w-token-a-noquant": dict( - w="tensor", a="token", no_a_quant=True, enabled=False - ), - "int8-tensor-w-tensor-a-noquant": dict( - w="tensor", a="tensor", no_a_quant=True, enabled=True - ), - "int8-channel-w-token-a-noquant": dict( - w="channel", a="token", no_a_quant=True, enabled=True - ), - "int8-channel-w-tensor-a-noquant": dict( - w="channel", a="tensor", no_a_quant=True, enabled=False - ), -} - - -def _quant_weight(b, w_type, device): - if w_type == "tensor": - scale_b = torch.ones(1, device=device, dtype=torch.float32) - b_int8, scale_b_int8, _ = vllm_scaled_int8_quant(b, scale_b) - assert scale_b_int8.numel() == 1 - else: # channel - b_int8, scale_b_int8, _ = vllm_scaled_int8_quant(b) - assert scale_b_int8.numel() == b.shape[0] - return b_int8.t(), scale_b_int8 - - -def build_int8_runner(cfg, a, b, dtype, device): - # quant before running the kernel - b_int8, scale_b_int8 = _quant_weight(b, cfg["w"], device) - - scale_a_const = None - if cfg["a"] == "tensor": - scale_a_const = torch.ones(1, device=device, dtype=torch.float32) - - # no quant, create activation ahead - if cfg["no_a_quant"]: - if cfg["a"] == "tensor": - a_int8, scale_a_int8, _ = vllm_scaled_int8_quant(a, scale_a_const) - else: # token - a_int8, scale_a_int8, _ = vllm_scaled_int8_quant(a) - - def run_quant(): - return vllm_scaled_mm(a_int8, b_int8, scale_a_int8, scale_b_int8, dtype) - - return run_quant - - # dynamic quant, create activation inside - if cfg["a"] == "tensor": - - def run_quant(): - a_int8, scale_a_int8, _ = vllm_scaled_int8_quant(a, scale_a_const) - return vllm_scaled_mm(a_int8, b_int8, scale_a_int8, scale_b_int8, dtype) - - else: # token - - def run_quant(): - a_int8, scale_a_int8, _ = vllm_scaled_int8_quant(a) - return vllm_scaled_mm(a_int8, b_int8, scale_a_int8, scale_b_int8, dtype) - - return run_quant - - -_enabled = [k for k, v in PROVIDER_CFGS.items() if v.get("enabled")] - - -@triton.testing.perf_report( - triton.testing.Benchmark( - x_names=["batch_size"], - x_vals=[1, 16, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384], - x_log=False, - line_arg="provider", - line_vals=_enabled, - line_names=[k for k in _enabled], - ylabel="TFLOP/s (larger is better)", - plot_name="BF16 vs INT8 GEMMs", - args={}, - ) -) -def benchmark(batch_size, provider, N, K): - M = batch_size - device = "cuda" - dtype = torch.bfloat16 - a = torch.randn((M, K), device=device, dtype=dtype) - b = torch.randn((N, K), device=device, dtype=dtype) - - quantiles = [0.5, 0.2, 0.8] - - if provider == "torch-bf16": - ms, min_ms, max_ms = triton.testing.do_bench_cudagraph( - lambda: torch.nn.functional.linear(a, b), quantiles=quantiles - ) - else: - cfg = PROVIDER_CFGS[provider] - run_quant = build_int8_runner(cfg, a, b, dtype, device) - ms, min_ms, max_ms = triton.testing.do_bench_cudagraph( - lambda: run_quant(), quantiles=quantiles - ) - - to_tflops = lambda t_ms: (2 * M * N * K) * 1e-12 / (t_ms * 1e-3) - return to_tflops(ms), to_tflops(max_ms), to_tflops(min_ms) - - -def prepare_shapes(args): - KN_model_names = [] - for model, tp_size in itertools.product(args.models, args.tp_sizes): - for KN, tp_dim in copy.deepcopy(WEIGHT_SHAPES[model]): - KN[tp_dim] //= tp_size - KN.append(model) - KN_model_names.append(KN) - return KN_model_names - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - 
parser.add_argument( - "--models", - nargs="+", - type=str, - default=["meta-llama/Llama-3.1-8B-Instruct"], - choices=list(WEIGHT_SHAPES.keys()), - help="List of models to benchmark", - ) - parser.add_argument( - "--tp-sizes", - nargs="+", - type=int, - default=[1], - help="List of tensor parallel sizes", - ) - args = parser.parse_args() - - for K, N, model in prepare_shapes(args): - print(f"{model}, N={N} K={K}, BF16 vs INT8 GEMMs TFLOP/s:") - benchmark.run( - print_data=True, - show_plots=True, - save_path=f"bench_int8_res_n{N}_k{K}", - N=N, - K=K, - ) - - print("Benchmark finished!") diff --git a/benchmarks/kernels/bench_nvfp4_gemm.py b/benchmarks/kernels/bench_nvfp4_gemm.py deleted file mode 100644 index 6b19eb113..000000000 --- a/benchmarks/kernels/bench_nvfp4_gemm.py +++ /dev/null @@ -1,198 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import argparse -import copy -import itertools -import os - -import torch -from weight_shapes import WEIGHT_SHAPES - -from vllm import _custom_ops as ops -from vllm.platforms import current_platform -from vllm.scalar_type import scalar_types -from vllm.triton_utils import triton - -if not current_platform.has_device_capability(100): - raise RuntimeError("NVFP4 requires compute capability of 10.0 (Blackwell)") - - -FLOAT4_E2M1_MAX = scalar_types.float4_e2m1f.max() -FLOAT8_E4M3_MAX = torch.finfo(torch.float8_e4m3fn).max - -PROVIDER_CFGS = { - "torch-bf16": dict(enabled=True), - "nvfp4": dict(no_a_quant=False, enabled=True), - "nvfp4-noquant": dict(no_a_quant=True, enabled=True), - "fbgemm-nvfp4": dict(fbgemm=True, no_a_quant=False, enabled=True), - "fbgemm-nvfp4-noquant": dict(fbgemm=True, no_a_quant=True, enabled=True), -} - -_needs_fbgemm = any( - v.get("fbgemm", False) for v in PROVIDER_CFGS.values() if v.get("enabled", False) -) -if _needs_fbgemm: - try: - from fbgemm_gpu.experimental.gemm.triton_gemm.fp4_quantize import ( - triton_scale_nvfp4_quant, - ) - except ImportError: - print( - "WARNING: FBGEMM providers are enabled but fbgemm_gpu is not installed. " - "These providers will be skipped. Please install fbgemm_gpu with: " - "'pip install fbgemm-gpu-genai' to run them." - ) - # Disable FBGEMM providers so the benchmark can run. - for cfg in PROVIDER_CFGS.values(): - if cfg.get("fbgemm"): - cfg["enabled"] = False - -_enabled = [k for k, v in PROVIDER_CFGS.items() if v["enabled"]] - - -def _quant_weight_nvfp4(b: torch.Tensor, device: str, cfg): - # Compute global scale for weight - b_amax = torch.abs(b).max().to(torch.float32) - b_global_scale = FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / b_amax - if "fbgemm" in cfg and cfg["fbgemm"]: - b_fp4, scale_b_fp4 = triton_scale_nvfp4_quant(b, b_global_scale) - else: - b_fp4, scale_b_fp4 = ops.scaled_fp4_quant(b, b_global_scale) - return b_fp4, scale_b_fp4, b_global_scale - - -def build_nvfp4_runner(cfg, a, b, dtype, device): - b_fp4, scale_b_fp4, b_global_scale = _quant_weight_nvfp4(b, device, cfg) - - # Compute global scale for activation - # NOTE: This is generally provided ahead-of-time by the model checkpoint. 
- a_amax = torch.abs(a).max().to(torch.float32) - a_global_scale = FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / a_amax - - # Alpha for the GEMM operation - alpha = 1.0 / (a_global_scale * b_global_scale) - if "fbgemm" in cfg and cfg["fbgemm"]: - if cfg["no_a_quant"]: - a_fp4, scale_a_fp4 = triton_scale_nvfp4_quant(a, a_global_scale) - - def run(): - return torch.ops.fbgemm.f4f4bf16( - a_fp4, - b_fp4, - scale_a_fp4, - scale_b_fp4, - global_scale=alpha, - use_mx=False, - ) - - return run - else: - - def run(): - a_fp4, scale_a_fp4 = triton_scale_nvfp4_quant(a, a_global_scale) - return torch.ops.fbgemm.f4f4bf16( - a_fp4, - b_fp4, - scale_a_fp4, - scale_b_fp4, - global_scale=alpha, - use_mx=False, - ) - - return run - - if cfg["no_a_quant"]: - # Pre-quantize activation - a_fp4, scale_a_fp4 = ops.scaled_fp4_quant(a, a_global_scale) - - def run(): - return ops.cutlass_scaled_fp4_mm( - a_fp4, b_fp4, scale_a_fp4, scale_b_fp4, alpha, dtype - ) - - return run - - # Quantize activation on-the-fly - def run(): - a_fp4, scale_a_fp4 = ops.scaled_fp4_quant(a, a_global_scale) - return ops.cutlass_scaled_fp4_mm( - a_fp4, b_fp4, scale_a_fp4, scale_b_fp4, alpha, dtype - ) - - return run - - -@triton.testing.perf_report( - triton.testing.Benchmark( - x_names=["batch_size"], - x_vals=[1, 16, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384], - x_log=False, - line_arg="provider", - line_vals=_enabled, - line_names=_enabled, - ylabel="TFLOP/s (larger is better)", - plot_name="BF16 vs NVFP4 GEMMs", - args={}, - ) -) -def benchmark(batch_size, provider, N, K): - M = batch_size - device = "cuda" - dtype = torch.bfloat16 - - a = torch.randn((M, K), device=device, dtype=dtype) - b = torch.randn((N, K), device=device, dtype=dtype) - - quantiles = [0.5, 0.2, 0.8] - - if provider == "torch-bf16": - ms, min_ms, max_ms = triton.testing.do_bench_cudagraph( - lambda: torch.nn.functional.linear(a, b), quantiles=quantiles - ) - else: - cfg = PROVIDER_CFGS[provider] - run_quant = build_nvfp4_runner(cfg, a, b, dtype, device) - ms, min_ms, max_ms = triton.testing.do_bench_cudagraph( - lambda: run_quant(), quantiles=quantiles - ) - - to_tflops = lambda t_ms: (2 * M * N * K) * 1e-12 / (t_ms * 1e-3) - return to_tflops(ms), to_tflops(max_ms), to_tflops(min_ms) - - -def prepare_shapes(args): - out = [] - for model, tp_size in itertools.product(args.models, args.tp_sizes): - for KN, tp_dim in copy.deepcopy(WEIGHT_SHAPES[model]): - KN[tp_dim] //= tp_size - KN.append(model) - out.append(KN) - return out - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--models", - nargs="+", - type=str, - default=["meta-llama/Llama-3.1-8B-Instruct"], - choices=list(WEIGHT_SHAPES.keys()), - ) - parser.add_argument("--tp-sizes", nargs="+", type=int, default=[1]) - args = parser.parse_args() - - for K, N, model in prepare_shapes(args): - print(f"{model}, N={N} K={K}, BF16 vs NVFP4 GEMMs TFLOP/s:") - save_dir = f"bench_nvfp4_res_n{N}_k{K}" - os.makedirs(save_dir, exist_ok=True) - - benchmark.run( - print_data=True, - show_plots=True, - save_path=save_dir, - N=N, - K=K, - ) - - print("Benchmark finished!") diff --git a/benchmarks/kernels/bench_per_token_quant_fp8.py b/benchmarks/kernels/bench_per_token_quant_fp8.py deleted file mode 100644 index e08e5680c..000000000 --- a/benchmarks/kernels/bench_per_token_quant_fp8.py +++ /dev/null @@ -1,269 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import itertools -from typing import Callable -from 
unittest.mock import patch - -import pandas as pd -import torch - -from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8 -from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape -from vllm.triton_utils import triton -from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser - - -def with_triton_mode(fn): - """Temporarily force the Triton fallback path""" - - def wrapped(*args, **kwargs): - with patch("vllm.platforms.current_platform.is_cuda", return_value=False): - return fn(*args, **kwargs) - - return wrapped - - -# TODO(luka): use standalone_compile utility -def with_dyn_arg(fn: Callable, arg_index: int, dim_index: int): - def inner(*args): - torch._dynamo.mark_dynamic(args[arg_index], dim_index) - return fn(*args) - - return inner - - -def bench_compile(fn: Callable): - # recompile for different shapes - fwd = torch.compile(fn, fullgraph=True, dynamic=False) - - # First dim is explicitly dynamic to simulate vLLM usage - return with_dyn_arg(fwd, 0, 0) - - -torch._dynamo.config.recompile_limit = 8888 - - -def calculate_diff( - batch_size: int, - hidden_size: int, - group_shape: GroupShape, - dtype: torch.dtype, -): - """Calculate the difference between Inductor and CUDA implementations.""" - device = torch.device("cuda") - x = torch.randn((batch_size, hidden_size), dtype=dtype, device=device) - - quant_fp8 = QuantFP8(False, group_shape, column_major_scales=False) - - torch_out, torch_scale = bench_compile(quant_fp8.forward_native)(x) - torch_eager_out, torch_eager_scale = quant_fp8.forward_native(x) - cuda_out, cuda_scale = quant_fp8.forward_cuda(x) - - try: - torch.testing.assert_close( - cuda_out.to(torch.float32), - torch_out.to(torch.float32), - rtol=1e-3, - atol=1e-5, - ) - torch.testing.assert_close(cuda_scale, torch_scale, rtol=1e-3, atol=1e-5) - torch.testing.assert_close( - cuda_out.to(torch.float32), - torch_eager_out.to(torch.float32), - rtol=1e-3, - atol=1e-5, - ) - torch.testing.assert_close(cuda_scale, torch_eager_scale, rtol=1e-3, atol=1e-5) - print("✅ All implementations match") - except AssertionError as e: - print("❌ Implementations differ") - print(e) - - -configs = [] - - -def benchmark_quantization( - batch_size, - hidden_size, - provider, - group_shape: GroupShape, - col_major: bool, - dtype: torch.dtype, -): - device = torch.device("cuda") - - x = torch.randn(batch_size, hidden_size, device=device, dtype=dtype) - - quantiles = [0.5, 0.2, 0.8] - quant_fp8 = QuantFP8(False, group_shape, column_major_scales=col_major) - - if provider == "torch": - fn = lambda: bench_compile(quant_fp8.forward_native)(x.clone()) - elif provider == "cuda": - fn = lambda: quant_fp8.forward_cuda(x.clone()) - elif provider == "triton": - if not group_shape.is_per_group(): - # Triton only supported for per-group - return 0, 0, 0 - - fn = lambda: with_triton_mode(quant_fp8.forward_cuda)(x.clone()) - - ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(fn, quantiles=quantiles) - - return 1000 * ms, 1000 * max_ms, 1000 * min_ms - - -# TODO(luka) extract to utils -def compute_geomean_speedups( - df: pd.DataFrame, - baseline_col: str, - speedup_cols: list[str], - groupby_cols: list[str] | None = None, -) -> pd.DataFrame: - """ - Compute geometric mean speedups over a baseline column. - - Args: - df: Input dataframe - baseline_col: Column to use as baseline - speedup_cols: Columns to compute speedups for - groupby_cols: Columns to group by. If None, compute over entire df. 
- - Returns: - pd.DataFrame with geometric mean speedups - """ - from scipy.stats import gmean - - def geo_speedup(group: pd.DataFrame) -> pd.Series: - ratios = { - col: (group[baseline_col] / group[col]).values for col in speedup_cols - } - return pd.Series({col: gmean(vals) for col, vals in ratios.items()}) - - if groupby_cols is None: - result = geo_speedup(df).to_frame().T - else: - result = ( - df.groupby(groupby_cols) - .apply(geo_speedup, include_groups=False) - .reset_index() - ) - - return result - - -if __name__ == "__main__": - parser = FlexibleArgumentParser( - description="Benchmark the various implementations of QuantFP8 (dynamic-only)" - ) - parser.add_argument("-c", "--check", action="store_true") - parser.add_argument( - "--dtype", type=str, choices=["half", "bfloat16", "float"], default="bfloat16" - ) - parser.add_argument( - "--hidden-sizes", - type=int, - nargs="+", - default=[896, 1024, 2048, 4096, 7168], - help="Hidden sizes to benchmark", - ) - parser.add_argument( - "--batch-sizes", - type=int, - nargs="+", - default=[1, 16, 128, 512, 1024], - help="Batch sizes to benchmark", - ) - parser.add_argument( - "--group-sizes", - type=int, - nargs="+", - default=None, - help="Group sizes for GroupShape(1,N) to benchmark. " - "Use 0 for PER_TENSOR, -1 for PER_TOKEN (default: 0,-1,64,128)", - ) - parser.add_argument( - "--no-column-major", - action="store_true", - help="Disable column-major scales testing", - ) - - args = parser.parse_args() - assert args - - dtype = STR_DTYPE_TO_TORCH_DTYPE[args.dtype] - - hidden_sizes = args.hidden_sizes - batch_sizes = args.batch_sizes - - if args.group_sizes is not None: - group_shapes = [] - for size in args.group_sizes: - if size == 0: - group_shapes.append(GroupShape.PER_TENSOR) - elif size == -1: - group_shapes.append(GroupShape.PER_TOKEN) - else: - group_shapes.append(GroupShape(1, size)) - else: - group_shapes = [ - GroupShape.PER_TENSOR, - GroupShape.PER_TOKEN, - GroupShape(1, 64), - GroupShape(1, 128), - ] - - column_major_scales = [False] if args.no_column_major else [True, False] - - config_gen = itertools.product( - group_shapes, - column_major_scales, - batch_sizes, - hidden_sizes, - ) - - # filter out column-major scales for non-group, reverse order - configs.extend(c[::-1] for c in config_gen if (c[0].is_per_group() or not c[1])) - - print(f"Running {len(configs)} configurations:") - print(f" Hidden sizes: {hidden_sizes}") - print(f" Batch sizes: {batch_sizes}") - print(f" Group shapes: {[str(g) for g in group_shapes]}") - print(f" Column major scales: {column_major_scales}") - print() - - if args.check: - for group_shape in group_shapes: - group_size = group_shape[1] - print(f"{group_size=}") - calculate_diff( - batch_size=4, hidden_size=4096, group_shape=group_shape, dtype=dtype - ) - - benchmark = triton.testing.perf_report( - triton.testing.Benchmark( - x_names=["hidden_size", "batch_size", "col_major", "group_shape"], - x_vals=configs, - line_arg="provider", - line_vals=["torch", "cuda", "triton"], - line_names=["Torch (Compiled)", "CUDA", "Triton"], - styles=[("blue", "-"), ("green", "-"), ("black", "-")], - ylabel="us", - plot_name="QuantFP8 performance", - args={}, - ) - )(benchmark_quantization) - - df = benchmark.run(print_data=True, dtype=dtype, return_df=True) - - # Print geomean speedups - geo_table_grouped = compute_geomean_speedups( - df, - baseline_col="Torch (Compiled)", - speedup_cols=["CUDA", "Triton"], - groupby_cols=["col_major", "group_shape"], - ) - - print("Speedup over Torch (Compiled)") - 
print(geo_table_grouped.to_string(index=False)) diff --git a/benchmarks/kernels/benchmark_activation.py b/benchmarks/kernels/benchmark_activation.py deleted file mode 100644 index 93edbcc93..000000000 --- a/benchmarks/kernels/benchmark_activation.py +++ /dev/null @@ -1,104 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -# benchmark custom activation op performance -import itertools - -import torch - -import vllm.model_executor.layers.activation # noqa F401 -from vllm.model_executor.custom_op import CustomOp -from vllm.platforms import current_platform -from vllm.triton_utils import triton -from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser - -batch_size_range = [1, 16, 32, 64, 128] -seq_len_range = [1, 16, 64, 128, 256, 512, 1024, 2048, 4096] -intermediate_size = [3072, 9728, 12288] -configs = list(itertools.product(batch_size_range, seq_len_range, intermediate_size)) - - -def benchmark_activation( - batch_size: int, - seq_len: int, - intermediate_size: int, - provider: str, - func_name: str, - dtype: torch.dtype, -): - device = "cuda" - num_tokens = batch_size * seq_len - dim = intermediate_size - current_platform.seed_everything(42) - torch.set_default_device(device) - - if func_name == "gelu_and_mul": - layer = CustomOp.op_registry[func_name](approximate="none") - elif func_name == "gelu_and_mul_tanh": - layer = CustomOp.op_registry["gelu_and_mul"](approximate="tanh") - elif func_name == "fatrelu_and_mul": - threshold = 0.5 - layer = CustomOp.op_registry[func_name](threshold) - else: - layer = CustomOp.op_registry[func_name]() - - x = torch.randn(num_tokens, dim, dtype=dtype, device=device) - compiled_layer = torch.compile(layer.forward_native) - - if provider == "custom": - fn = lambda: layer(x) - elif provider == "compiled": - fn = lambda: compiled_layer(x) - - ms, min_ms, max_ms = triton.testing.do_bench_cudagraph( - fn, quantiles=[0.5, 0.2, 0.8] - ) - return ms, max_ms, min_ms - - -if __name__ == "__main__": - parser = FlexibleArgumentParser(description="Benchmark the custom activation op.") - parser.add_argument( - "--func-name", - type=str, - choices=[ - "mul_and_silu", - "silu_and_mul", - "gelu_and_mul", - "gelu_and_mul_tanh", - "fatrelu_and_mul", - "swigluoai_and_mul", - "gelu_new", - "gelu_fast", - "quick_gelu", - ], - default="silu_and_mul", - ) - parser.add_argument( - "--dtype", type=str, choices=["half", "bfloat16", "float"], default="bfloat16" - ) - args = parser.parse_args() - assert args - - func_name = args.func_name - dtype = STR_DTYPE_TO_TORCH_DTYPE[args.dtype] - - perf_report = triton.testing.perf_report( - triton.testing.Benchmark( - x_names=["batch_size", "seq_len", "intermediate_size"], - x_vals=configs, - line_arg="provider", - line_vals=["custom", "compiled"], - line_names=["Custom OP", "Compiled"], - styles=[("blue", "-"), ("green", "-")], - ylabel="ms", - plot_name=f"{func_name}-op-performance", - args={}, - ) - ) - - perf_report( - lambda batch_size, seq_len, intermediate_size, provider: benchmark_activation( - batch_size, seq_len, intermediate_size, provider, func_name, dtype - ) - ).run(print_data=True) diff --git a/benchmarks/kernels/benchmark_bitblas.py b/benchmarks/kernels/benchmark_bitblas.py deleted file mode 100644 index 66b44c27d..000000000 --- a/benchmarks/kernels/benchmark_bitblas.py +++ /dev/null @@ -1,244 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -# Copyright (c) Microsoft 
Corporation. -# Licensed under the MIT License. - -from packaging import version - -from vllm.model_executor.layers.quantization.utils.bitblas_utils import ( - MINIMUM_BITBLAS_VERSION, -) - -try: - import bitblas - - if version.parse(bitblas.__version__) < version.parse(MINIMUM_BITBLAS_VERSION): - raise ImportError( - "bitblas version is wrong. Please " - f"install bitblas>={MINIMUM_BITBLAS_VERSION}" - ) -except ImportError as e: - bitblas_import_exception = e - raise ValueError( - "Trying to use the bitblas backend, but could not import" - f"with the following error: {bitblas_import_exception}. " - "Please install bitblas through the following command: " - f"`pip install bitblas>={MINIMUM_BITBLAS_VERSION}`" - ) from bitblas_import_exception - -from bitblas import Matmul, MatmulConfig, auto_detect_nvidia_target - -from vllm.utils import FlexibleArgumentParser - -parser = FlexibleArgumentParser( - description="Benchmark BitBLAS int4 on a specific target." -) - -# Add arguments to the parser -parser.add_argument( - "--target", - type=str, - default=auto_detect_nvidia_target(), - help="Specify the target device for benchmarking.", -) -parser.add_argument( - "--group_size", type=int, default=None, help="Group size for grouped quantization." -) -parser.add_argument( - "--A_dtype", - type=str, - default="float16", - choices=["float16", "float32", "float64", "int32", "int8"], - help="Data type of activation A.", -) -parser.add_argument( - "--W_dtype", - type=str, - default="int4", - choices=[ - "float16", - "float32", - "float64", - "int32", - "int8", - "int4", - "int2", - "int1", - "nf4", - "fp4_e2m1", - ], - help="Data type of weight W.", -) -parser.add_argument( - "--accum_dtype", - type=str, - default="float16", - choices=["float16", "int32"], - help="Data type for accumulation.", -) -parser.add_argument( - "--out_dtype", - type=str, - default="float16", - choices=["float16", "float32", "int32", "int8"], - help="Data type for output.", -) -parser.add_argument( - "--layout", - type=str, - default="nt", - choices=["nt", "nn"], - help="Matrix layout, 'nt' for non-transpose A and transpose W.", -) -parser.add_argument( - "--with_bias", action="store_true", help="Include bias in the benchmark." -) -parser.add_argument( - "--with_scaling", - action="store_true", - help="Include scaling factor in the quantization.", -) -parser.add_argument( - "--with_zeros", action="store_true", help="Include zeros in the quantization." 
-) -parser.add_argument( - "--zeros_mode", - type=str, - default=None, - choices=["original", "rescale", "quantized"], - help="Specify the mode for calculating zeros.", -) - -# Parse the arguments -args = parser.parse_args() - -# Assign arguments to variables -target = args.target -A_dtype = args.A_dtype -W_dtype = args.W_dtype -accum_dtype = args.accum_dtype -out_dtype = args.out_dtype -layout = args.layout -with_bias = args.with_bias -group_size = args.group_size -with_scaling = args.with_scaling -with_zeros = args.with_zeros -zeros_mode = args.zeros_mode - -# Define a list of shared arguments that repeat in every config -shared_args = [ - A_dtype, - W_dtype, - out_dtype, - accum_dtype, - layout, - with_bias, - group_size, - with_scaling, - with_zeros, - zeros_mode, -] - -# Define just the (M, K, N) shapes in a more compact list -shapes = [ - # square test - (1, 16384, 16384), - # BLOOM-176B - (1, 43008, 14336), - (1, 14336, 14336), - (1, 57344, 14336), - (1, 14336, 57344), - # OPT-65B - (1, 9216, 9216), - (1, 36864, 9216), - (1, 9216, 36864), - (1, 22016, 8192), - # LLAMA-70B/65B - (1, 8192, 22016), - (1, 8192, 8192), - (1, 28672, 8192), - (1, 8192, 28672), - # square test - (16384, 16384, 16384), - # BLOOM-176B - (8192, 43008, 14336), - (8192, 14336, 14336), - (8192, 57344, 14336), - (8192, 14336, 57344), - # OPT-65B - (8192, 9216, 9216), - (8192, 36864, 9216), - (8192, 9216, 36864), - (8192, 22016, 8192), - # LLAMA-70B/65B - (8192, 8192, 22016), - (8192, 8192, 8192), - (8192, 28672, 8192), - (8192, 8192, 28672), -] - -# Build test shapes with all the shared arguments -test_shapes = [(MatmulConfig, Matmul, (*shape, *shared_args)) for shape in shapes] - -benchmark_sets = [] -benchmark_sets.extend(test_shapes) - -benchmark_results = {} -for config_class, operator, input_args in benchmark_sets: - config = config_class(*input_args) - matmul = operator(config, target=target, enable_tuning=True) - kernel_latency = matmul.profile_latency() - - print("Time cost is: {:.3f} ms".format(kernel_latency)) - - profile_config = { - f"{operator.__name__}-{'-'.join([str(i) for i in input_args])}": { - "BitBLAS_top20_latency": kernel_latency, - } - } - - benchmark_results.update(profile_config) - -# Define headers for the table -headers = [ - "PrimFunc", - "Input Arguments", - "BitBLAS Top20 Latency", -] - -# Calculate column widths for pretty printing -col_widths = [0, 0, 0] -for config_key, values in benchmark_results.items(): - args_split = config_key.split("-") - func_name = args_split[0] - input_args_str = "-".join(args_split[1:]) - col_widths[0] = max(col_widths[0], len(func_name) + 2, len(headers[0]) + 2) - col_widths[1] = max(col_widths[1], len(input_args_str) + 2, len(headers[1]) + 2) - col_widths[2] = max( - col_widths[2], - len(f"{values['BitBLAS_top20_latency']:.3f} ms") + 2, - len(headers[2]) + 2, - ) - # break only if you want to measure widths from a single example; - # otherwise, let it loop over all items. 
- -# Print header -for i, header in enumerate(headers): - headers[i] = header.ljust(col_widths[i]) -print("".join(headers)) -print("-" * sum(col_widths)) - -# Print rows -for config_key, values in benchmark_results.items(): - args_split = config_key.split("-") - func_name = args_split[0] - input_args_str = "-".join(args_split[1:]) - row = [ - func_name, - input_args_str, - f"{values['BitBLAS_top20_latency']:.3f} ms", - ] - row_str = "".join( - [str(cell).ljust(col_widths[idx]) for idx, cell in enumerate(row)] - ) - print(row_str) diff --git a/benchmarks/kernels/benchmark_cutlass_fp4_moe.py b/benchmarks/kernels/benchmark_cutlass_fp4_moe.py deleted file mode 100644 index 726a2a371..000000000 --- a/benchmarks/kernels/benchmark_cutlass_fp4_moe.py +++ /dev/null @@ -1,504 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -""" -Benchmark the performance of the cutlass_moe_fp4 kernel vs the triton_moe -kernel. The cutlass_moe_fp4 kernel takes in fp4 quantized weights and 16-bit -activations. The triton_moe kernel takes in fp8 weights(tensor scaled to fp8) -and 16-bit activations. -""" - -import nvtx -import torch -import torch.utils.benchmark as benchmark - -from vllm import _custom_ops as ops -from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config -from vllm.model_executor.layers.fused_moe.config import ( - fp8_w8a8_moe_quant_config, - nvfp4_moe_quant_config, -) -from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp4 -from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk -from vllm.scalar_type import scalar_types -from vllm.utils import FlexibleArgumentParser - -WEIGHT_SHAPES_MOE = { - "nvidia/DeepSeek-R1-FP4": [ - [256, 8, 2048, 7168], - ], -} - -DEFAULT_MODELS = [ - "nvidia/DeepSeek-R1-FP4", -] - -DEFAULT_BATCH_SIZES = [4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048] -DEFAULT_TP_SIZES = [1] - -PER_ACT_TOKEN_OPTS = [False] -PER_OUT_CH_OPTS = [False] -FLOAT4_E2M1_MAX = scalar_types.float4_e2m1f.max() -FLOAT8_E4M3_MAX = torch.finfo(torch.float8_e4m3fn).max - - -def to_fp8(tensor: torch.Tensor): - finfo = torch.finfo(torch.float8_e4m3fn) - return torch.round(tensor.clamp(min=finfo.min, max=finfo.max)).to( - dtype=torch.float8_e4m3fn - ) - - -def bench_run( - results: list[benchmark.Measurement], - model: str, - num_experts: int, - topk: int, - per_act_token: bool, - per_out_ch: bool, - mkn: tuple[int, int, int], -): - label = "NVFP4 Blockscaled CUTLASS MOE vs FP8 Tensor Scaled Triton" - - sub_label = ( - "{}, num_experts={}, topk={}, per_act_token={} per_out_ch={}, MKN=({})".format( - model, num_experts, topk, per_act_token, per_out_ch, mkn - ) - ) - - print(f"Testing: {sub_label}") - - (m, k, n) = mkn - - dtype = torch.half - device = "cuda" - a = torch.randn((m, k), device=device, dtype=dtype) / 10 - w1 = torch.randn((num_experts, 2 * n, k), device=device, dtype=dtype) / 10 - w2 = torch.randn((num_experts, k, n), device=device, dtype=dtype) / 10 - - _, a_fp8_scale = ops.scaled_fp8_quant(a) - - w1_fp8q = torch.empty( - (num_experts, 2 * n, k), device=device, dtype=torch.float8_e4m3fn - ) - w2_fp8q = torch.empty((num_experts, k, n), device=device, dtype=torch.float8_e4m3fn) - w1_fp8scale = torch.empty((num_experts, 1, 1), device=device, dtype=torch.float32) - w2_fp8scale = torch.empty((num_experts, 1, 1), device=device, dtype=torch.float32) - - for expert in range(num_experts): - w1_fp8q[expert], w1_fp8scale[expert] = ops.scaled_fp8_quant(w1[expert]) - 
w2_fp8q[expert], w2_fp8scale[expert] = ops.scaled_fp8_quant(w2[expert]) - - w1_fp8q_notransp = w1_fp8q.clone() - w2_fp8q_notransp = w2_fp8q.clone() - w1_fp8q = w1_fp8q.transpose(1, 2) - w2_fp8q = w2_fp8q.transpose(1, 2) - - score = torch.randn((m, num_experts), device=device, dtype=dtype) - - topk_weights, topk_ids, _ = fused_topk(a, score, topk, renormalize=False) - - quant_blocksize = 16 - w1_blockscale = torch.empty( - (num_experts, 2 * n, k // quant_blocksize), - device=device, - dtype=torch.float8_e4m3fn, - ) - w2_blockscale = torch.empty( - (num_experts, k, n // quant_blocksize), device=device, dtype=torch.float8_e4m3fn - ) - - # n_b_scales = 2 * n if per_out_ch else 1 - # k_b_scales = k if per_out_ch else 1 - w1_fp4 = torch.empty((num_experts, 2 * n, k // 2), device=device, dtype=torch.uint8) - w2_fp4 = torch.empty((num_experts, k, n // 2), device=device, dtype=torch.uint8) - - w1_gs = torch.empty((num_experts,), device=device, dtype=torch.float32) - w2_gs = torch.empty((num_experts,), device=device, dtype=torch.float32) - a1_gs = torch.ones((num_experts,), device=device, dtype=torch.float32) - a2_gs = torch.ones((num_experts,), device=device, dtype=torch.float32) - - for expert in range(num_experts): - w1_e = w1[expert] - w2_e = w2[expert] - w1_amax = torch.abs(w1_e).max().to(torch.float32) - w2_amax = torch.abs(w2_e).max().to(torch.float32) - w1_gs[expert] = FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / w1_amax - w2_gs[expert] = FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / w2_amax - - w1_fp4[expert], w1_blockscale[expert] = ops.scaled_fp4_quant( - w1_e, w1_gs[expert] - ) - - w2_fp4[expert], w2_blockscale[expert] = ops.scaled_fp4_quant( - w2_e, w2_gs[expert] - ) - - def run_triton_moe( - a: torch.Tensor, - w1: torch.Tensor, - w2: torch.Tensor, - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - w1_scale: torch.Tensor, - w2_scale: torch.Tensor, - a_fp8_scale: torch.Tensor, - num_repeats: int, - ): - quant_config = fp8_w8a8_moe_quant_config( - w1_scale=w1_scale, - w2_scale=w2_scale, - a1_scale=a_fp8_scale, - ) - - for _ in range(num_repeats): - fused_experts( - a, - w1, - w2, - topk_weights, - topk_ids, - quant_config=quant_config, - ) - - def run_cutlass_moe_fp4( - a: torch.Tensor, - w1_fp4: torch.Tensor, - w2_fp4: torch.Tensor, - w1_blockscale: torch.Tensor, - w2_blockscale: torch.Tensor, - w1_gs: torch.Tensor, - w2_gs: torch.Tensor, - a1_gs: torch.Tensor, - a2_gs: torch.Tensor, - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - m: int, - n: int, - k: int, - e: int, - device: torch.device, - num_repeats: int, - ): - quant_config = nvfp4_moe_quant_config( - a1_gscale=a1_gs, - a2_gscale=a2_gs, - w1_scale=w1_blockscale, - w2_scale=w2_blockscale, - g1_alphas=w1_gs, - g2_alphas=w2_gs, - ) - for _ in range(num_repeats): - with nvtx.annotate("cutlass_moe_fp4", color="green"): - cutlass_moe_fp4( - a=a, - w1_fp4=w1_fp4, - w2_fp4=w2_fp4, - topk_weights=topk_weights, - topk_ids=topk_ids, - m=m, - n=n, - k=k, - e=num_experts, - quant_config=quant_config, - ) - - def run_cutlass_from_graph( - a: torch.Tensor, - a1_gscale: torch.Tensor, - w1_fp4: torch.Tensor, - w1_blockscale: torch.Tensor, - w1_alphas: torch.Tensor, - a2_gscale: torch.Tensor, - w2_fp4: torch.Tensor, - w2_blockscale: torch.Tensor, - w2_alphas: torch.Tensor, - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - m: int, - n: int, - k: int, - e: int, - device: torch.device, - ): - quant_config = nvfp4_moe_quant_config( - a1_gscale=a1_gs, - a2_gscale=a2_gs, - w1_scale=w1_blockscale, - w2_scale=w2_blockscale, - g1_alphas=w1_gs, - 
g2_alphas=w2_gs, - ) - - with set_current_vllm_config( - VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1)) - ): - return cutlass_moe_fp4( - a=a, - w1_fp4=w1_fp4, - w2_fp4=w2_fp4, - topk_weights=topk_weights, - topk_ids=topk_ids, - m=m, - n=n, - k=k, - e=num_experts, - quant_config=quant_config, - ) - - def run_triton_from_graph( - a: torch.Tensor, - w1: torch.Tensor, - w2: torch.Tensor, - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - w1_scale: torch.Tensor, - w2_scale: torch.Tensor, - a_fp8_scale: torch.Tensor, - ): - with set_current_vllm_config( - VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1)) - ): - quant_config = fp8_w8a8_moe_quant_config( - w1_scale=w1_scale, - w2_scale=w2_scale, - a1_scale=a_fp8_scale, - ) - return fused_experts( - a, - w1, - w2, - topk_weights, - topk_ids, - quant_config=quant_config, - ) - - def replay_graph(graph, num_repeats): - for _ in range(num_repeats): - graph.replay() - torch.cuda.synchronize() - - cutlass_stream = torch.cuda.Stream() - cutlass_graph = torch.cuda.CUDAGraph() - with torch.cuda.graph(cutlass_graph, stream=cutlass_stream): - run_cutlass_from_graph( - a=a, - a1_gscale=a1_gs, - w1_fp4=w1_fp4, - w1_blockscale=w1_blockscale, - w1_alphas=w1_gs, - a2_gscale=a2_gs, - w2_fp4=w2_fp4, - w2_blockscale=w2_blockscale, - w2_alphas=w2_gs, - topk_weights=topk_weights, - topk_ids=topk_ids, - m=m, - n=n, - k=k, - e=num_experts, - device=device, - ) - torch.cuda.synchronize() - - triton_stream = torch.cuda.Stream() - triton_graph = torch.cuda.CUDAGraph() - with torch.cuda.graph(triton_graph, stream=triton_stream): - run_triton_from_graph( - a, - w1_fp8q_notransp, - w2_fp8q_notransp, - topk_weights, - topk_ids, - w1_fp8scale, - w2_fp8scale, - a_fp8_scale, - ) - torch.cuda.synchronize() - - min_run_time = 5 - num_warmup = 5 - num_runs = 25 - - globals = { - # Baseline params - "w1": w1, - "w2": w2, - "score": score, - "topk": topk, - "w1_fp8q_notransp": w1_fp8q_notransp, - "w2_fp8q_notransp": w2_fp8q_notransp, - "w1_fp8scale": w1_fp8scale, - "w2_fp8scale": w2_fp8scale, - "a_fp8_scale": a_fp8_scale, - # Cutlass params - "a": a, - "a1_gscale": a1_gs, - "w1_fp4": w1_fp4, - "w1_blockscale": w1_blockscale, - "w1_alphas": w1_gs, - "a2_gscale": a2_gs, - "w2_fp4": w2_fp4, - "w2_blockscale": w2_blockscale, - "w2_alphas": w2_gs, - "topk_weights": topk_weights, - "topk_ids": topk_ids, - "m": m, - "n": n, - "k": k, - "e": num_experts, - "device": device, - # cuda graph params - "cutlass_graph": cutlass_graph, - "triton_graph": triton_graph, - # Gen params - "num_runs": num_runs, - # Kernels - "run_triton_moe": run_triton_moe, - "run_cutlass_moe_fp4": run_cutlass_moe_fp4, - "replay_graph": replay_graph, - } - - # Warmup - run_triton_moe( - a, - w1_fp8q_notransp, - w2_fp8q_notransp, - topk_weights, - topk_ids, - w1_fp8scale, - w2_fp8scale, - a_fp8_scale, - num_warmup, - ) - - results.append( - benchmark.Timer( - stmt="run_triton_moe(a, w1_fp8q_notransp, w2_fp8q_notransp, topk_weights, topk_ids, w1_fp8scale, w2_fp8scale, a_fp8_scale, num_runs)", # noqa: E501 - globals=globals, - label=label, - sub_label=sub_label, - description="triton_moe", - ).blocked_autorange(min_run_time=min_run_time) - ) - - # Warmup - replay_graph(triton_graph, num_warmup) - - results.append( - benchmark.Timer( - stmt="replay_graph(triton_graph, num_runs)", - globals=globals, - label=label, - sub_label=sub_label, - description="triton_moe_cuda_graphs", - ).blocked_autorange(min_run_time=min_run_time) - ) - - # Warmup - - run_cutlass_moe_fp4( - a, - 
w1_fp4, - w2_fp4, - w1_blockscale, - w2_blockscale, - w1_gs, - w2_gs, - a1_gs, - a2_gs, - topk_weights, - topk_ids, - m, - n, - k, - num_experts, - device, - num_warmup, - ) - - results.append( - benchmark.Timer( - stmt="run_cutlass_moe_fp4(a, w1_fp4, w2_fp4, w1_blockscale, w2_blockscale, w1_alphas, w2_alphas, a1_gscale, a2_gscale, topk_weights, topk_ids, m, n, k, e, device, num_runs)", # noqa: E501 - globals=globals, - label=label, - sub_label=sub_label, - description="cutlass_moe_fp4", - ).blocked_autorange(min_run_time=min_run_time) - ) - - # Warmup - replay_graph(cutlass_graph, num_warmup) - - results.append( - benchmark.Timer( - stmt="replay_graph(cutlass_graph, num_runs)", - globals=globals, - label=label, - sub_label=sub_label, - description="cutlass_moe_fp4_cuda_graphs", - ).blocked_autorange(min_run_time=min_run_time) - ) - - -def main(args): - print("Benchmarking models:") - for i, model in enumerate(args.models): - print(f"[{i}] {model}") - - results: list[benchmark.Measurement] = [] - - for model in args.models: - for tp in args.tp_sizes: - for layer in WEIGHT_SHAPES_MOE[model]: - num_experts = layer[0] - topk = layer[1] - size_k = layer[2] - size_n = layer[3] // tp - - if len(args.limit_k) > 0 and size_k not in args.limit_k: - continue - - if len(args.limit_n) > 0 and size_n not in args.limit_n: - continue - - for per_act_token in PER_ACT_TOKEN_OPTS: - for per_out_ch in PER_OUT_CH_OPTS: - for size_m in args.batch_sizes: - mkn = (size_m, size_k, size_n) - bench_run( - results, - model, - num_experts, - topk, - per_act_token, - per_out_ch, - mkn, - ) - - compare = benchmark.Compare(results) - compare.print() - - -if __name__ == "__main__": - parser = FlexibleArgumentParser( - description="Benchmark NVFP4 CUTLASS MOE across specified models/shapes/batches" - ) - parser.add_argument( - "--models", - nargs="+", - type=str, - default=DEFAULT_MODELS, - choices=WEIGHT_SHAPES_MOE.keys(), - ) - parser.add_argument("--tp-sizes", nargs="+", type=int, default=DEFAULT_TP_SIZES) - parser.add_argument( - "--batch-sizes", nargs="+", type=int, default=DEFAULT_BATCH_SIZES - ) - parser.add_argument("--limit-k", nargs="+", type=int, default=[]) - parser.add_argument("--limit-n", nargs="+", type=int, default=[]) - parser.add_argument("--limit-num-groups", nargs="+", type=int, default=[]) - parser.add_argument("--limit-per-act-token", nargs="+", type=int, default=[]) - parser.add_argument("--limit-per-out-ch", nargs="+", type=int, default=[]) - - args = parser.parse_args() - main(args) diff --git a/benchmarks/kernels/benchmark_cutlass_moe_fp8.py b/benchmarks/kernels/benchmark_cutlass_moe_fp8.py deleted file mode 100644 index b419b2fa0..000000000 --- a/benchmarks/kernels/benchmark_cutlass_moe_fp8.py +++ /dev/null @@ -1,406 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -""" -Benchmark the performance of the cutlass_moe_fp8 kernel vs the triton_moe -kernel. Both kernels take in fp8 quantized weights and 16-bit activations, -but use different quantization strategies and backends. 
-""" - -import nvtx -import torch - -from vllm import _custom_ops as ops -from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_config -from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp8 -from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk -from vllm.platforms import current_platform -from vllm.utils import FlexibleArgumentParser - -# Weight shapes for different models: [num_experts, topk, hidden_size, -# intermediate_size] -WEIGHT_SHAPES_MOE = { - "mixtral-8x7b": [ - [8, 2, 4096, 14336], - ], - "deepseek-v2": [ - [160, 6, 5120, 12288], - ], - "custom-small": [ - [8, 2, 2048, 7168], - ], - "glm45-fp8": [ - [128, 8, 4096, 1408], - ], - "Llama-4-Maverick-17B-128E-Instruct-FP8": [ - [128, 1, 5120, 8192], - ], -} - -DEFAULT_MODELS = [ - "mixtral-8x7b", -] - -DEFAULT_BATCH_SIZES = [4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048] -DEFAULT_TP_SIZES = [1] - -PER_ACT_TOKEN_OPTS = [False, True] -PER_OUT_CH_OPTS = [False, True] - -FP8_DTYPE = current_platform.fp8_dtype() - - -def bench_run( - results: list, - model: str, - num_experts: int, - topk: int, - per_act_token: bool, - per_out_ch: bool, - mkn: tuple[int, int, int], -): - (m, k, n) = mkn - - dtype = torch.half - device = "cuda" - - # Create input activations - a = torch.randn((m, k), device=device, dtype=dtype) / 10 - - # Create weights - w1 = torch.randn((num_experts, 2 * n, k), device=device, dtype=dtype) / 10 - w2 = torch.randn((num_experts, k, n), device=device, dtype=dtype) / 10 - - # Create FP8 quantized weights and scales for both kernels - w1_fp8q = torch.empty((num_experts, 2 * n, k), device=device, dtype=FP8_DTYPE) - w2_fp8q = torch.empty((num_experts, k, n), device=device, dtype=FP8_DTYPE) - - # Create scales based on quantization strategy - if per_out_ch: - # Per-channel quantization - w1_scale = torch.empty( - (num_experts, 2 * n, 1), device=device, dtype=torch.float32 - ) - w2_scale = torch.empty((num_experts, k, 1), device=device, dtype=torch.float32) - else: - # Per-tensor quantization - w1_scale = torch.empty((num_experts, 1, 1), device=device, dtype=torch.float32) - w2_scale = torch.empty((num_experts, 1, 1), device=device, dtype=torch.float32) - - # Quantize weights - for expert in range(num_experts): - if per_out_ch: - # Per-channel quantization - not yet implemented properly - # For now, fall back to per-tensor quantization - w1_fp8q[expert], w1_scale_temp = ops.scaled_fp8_quant(w1[expert]) - w2_fp8q[expert], w2_scale_temp = ops.scaled_fp8_quant(w2[expert]) - # Expand scalar scales to the expected per-channel shape - w1_scale[expert] = w1_scale_temp.expand(2 * n, 1) - w2_scale[expert] = w2_scale_temp.expand(k, 1) - else: - # Per-tensor quantization - w1_fp8q[expert], w1_scale_temp = ops.scaled_fp8_quant(w1[expert]) - w2_fp8q[expert], w2_scale_temp = ops.scaled_fp8_quant(w2[expert]) - # Store scalar scales in [1, 1] tensors - w1_scale[expert, 0, 0] = w1_scale_temp - w2_scale[expert, 0, 0] = w2_scale_temp - - # Prepare weights for CUTLASS (no transpose needed) - w1_fp8q_cutlass = w1_fp8q # Keep original [E, 2N, K] - w2_fp8q_cutlass = w2_fp8q # Keep original [E, K, N] - - # Create router scores and get topk - score = torch.randn((m, num_experts), device=device, dtype=dtype) - topk_weights, topk_ids, _ = fused_topk(a, score, topk, renormalize=False) - - # WORKAROUND: CUTLASS MoE FP8 has issues with per-token quantization - # Force per-tensor quantization for all cases to match working e2e setup - a1_scale = torch.full((), 1e-2, device=device, 
dtype=torch.float32) - a2_scale = torch.full((), 1e-2, device=device, dtype=torch.float32) - - # Force per-tensor quantization for all cases - per_act_token = False - - # Create stride tensors for CUTLASS - ab_strides1 = torch.full((num_experts,), k, dtype=torch.int64, device=device) - ab_strides2 = torch.full((num_experts,), n, dtype=torch.int64, device=device) - c_strides1 = torch.full((num_experts,), 2 * n, dtype=torch.int64, device=device) - c_strides2 = torch.full((num_experts,), k, dtype=torch.int64, device=device) - - def run_triton_moe( - a: torch.Tensor, - w1: torch.Tensor, - w2: torch.Tensor, - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - w1_scale: torch.Tensor, - w2_scale: torch.Tensor, - a1_scale: torch.Tensor, - a2_scale: torch.Tensor, - num_repeats: int, - ): - quant_config = fp8_w8a8_moe_quant_config( - w1_scale=w1_scale, - w2_scale=w2_scale, - a1_scale=a1_scale, - a2_scale=a2_scale, - per_act_token_quant=per_act_token, - per_out_ch_quant=per_out_ch, - ) - - for _ in range(num_repeats): - fused_experts( - a, - w1, - w2, - topk_weights, - topk_ids, - quant_config=quant_config, - ) - - def run_cutlass_moe_fp8( - a: torch.Tensor, - w1: torch.Tensor, - w2: torch.Tensor, - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - ab_strides1: torch.Tensor, - ab_strides2: torch.Tensor, - c_strides1: torch.Tensor, - c_strides2: torch.Tensor, - w1_scale: torch.Tensor, - w2_scale: torch.Tensor, - a1_scale: torch.Tensor, - a2_scale: torch.Tensor, - num_repeats: int, - ): - quant_config = fp8_w8a8_moe_quant_config( - w1_scale=w1_scale, - w2_scale=w2_scale, - a1_scale=a1_scale, - a2_scale=a2_scale, - per_act_token_quant=per_act_token, - per_out_ch_quant=per_out_ch, - ) - - for _ in range(num_repeats): - with nvtx.annotate("cutlass_moe_fp8", color="blue"): - cutlass_moe_fp8( - a=a, - w1_q=w1, - w2_q=w2, - topk_weights=topk_weights, - topk_ids=topk_ids, - ab_strides1=ab_strides1, - ab_strides2=ab_strides2, - c_strides1=c_strides1, - c_strides2=c_strides2, - quant_config=quant_config, - activation="silu", - global_num_experts=num_experts, - ) - - # Pre-create quantization config to avoid creating it inside CUDA graph - quant_config = fp8_w8a8_moe_quant_config( - w1_scale=w1_scale, - w2_scale=w2_scale, - a1_scale=a1_scale, - a2_scale=a2_scale, - per_act_token_quant=per_act_token, - per_out_ch_quant=per_out_ch, - ) - - # Create CUDA graphs for CUTLASS (match benchmark_moe.py pattern exactly) - cutlass_stream = torch.cuda.Stream() - cutlass_graph = torch.cuda.CUDAGraph() - with torch.cuda.graph(cutlass_graph, stream=cutlass_stream): - # Capture 10 invocations like benchmark_moe.py - for _ in range(10): - cutlass_moe_fp8( - a=a, - w1_q=w1_fp8q_cutlass, - w2_q=w2_fp8q_cutlass, - topk_weights=topk_weights, - topk_ids=topk_ids, - ab_strides1=ab_strides1, - ab_strides2=ab_strides2, - c_strides1=c_strides1, - c_strides2=c_strides2, - quant_config=quant_config, - activation="silu", - global_num_experts=num_experts, - ) - torch.cuda.synchronize() - - # Create CUDA graphs for Triton (match benchmark_moe.py pattern exactly) - triton_stream = torch.cuda.Stream() - triton_graph = torch.cuda.CUDAGraph() - with torch.cuda.graph(triton_graph, stream=triton_stream): - # Capture 10 invocations like benchmark_moe.py - for _ in range(10): - fused_experts( - a, - w1_fp8q, - w2_fp8q, - topk_weights, - topk_ids, - quant_config=quant_config, - ) - torch.cuda.synchronize() - - def bench_cuda_graph(graph, num_warmup=5, num_iters=100): - """Benchmark CUDA graph using events like benchmark_moe.py""" - # Warmup 
- for _ in range(num_warmup): - graph.replay() - torch.cuda.synchronize() - - # Timing - start_event = torch.cuda.Event(enable_timing=True) - end_event = torch.cuda.Event(enable_timing=True) - - latencies = [] - for _ in range(num_iters): - torch.cuda.synchronize() - start_event.record() - graph.replay() - end_event.record() - end_event.synchronize() - latencies.append(start_event.elapsed_time(end_event)) - - # Divide by 10 since graph contains 10 calls - return sum(latencies) / (num_iters * 10) - - # Benchmark parameters - num_warmup = 5 - num_iters = 100 - - # Benchmark only CUDA graphs (more reliable and faster) - # Benchmark Triton MoE with CUDA graphs - triton_graph_time = bench_cuda_graph( - triton_graph, num_warmup=num_warmup, num_iters=num_iters - ) - - # Benchmark CUTLASS MoE with CUDA graphs - cutlass_graph_time = bench_cuda_graph( - cutlass_graph, num_warmup=num_warmup, num_iters=num_iters - ) - - # Convert ms to us and return results - triton_time_us = triton_graph_time * 1000 - cutlass_time_us = cutlass_graph_time * 1000 - - return { - "batch_size": m, - "triton_time_us": triton_time_us, - "cutlass_time_us": cutlass_time_us, - } - - -def main(args): - print("Benchmarking models:") - for i, model in enumerate(args.models): - print(f"[{i}] {model}") - - all_results = [] - - for model in args.models: - for tp in args.tp_sizes: - for layer in WEIGHT_SHAPES_MOE[model]: - num_experts = layer[0] - topk = layer[1] - size_k = layer[2] - size_n = layer[3] // tp - - if len(args.limit_k) > 0 and size_k not in args.limit_k: - continue - - if len(args.limit_n) > 0 and size_n not in args.limit_n: - continue - - for per_act_token in args.per_act_token_opts: - for per_out_ch in args.per_out_ch_opts: - print( - f"\n=== {model}, experts={num_experts}, topk={topk}," - f"per_act={per_act_token}, per_out_ch={per_out_ch} ===" - ) - - config_results = [] - for size_m in args.batch_sizes: - mkn = (size_m, size_k, size_n) - result = bench_run( - [], # Not used anymore - model, - num_experts, - topk, - per_act_token, - per_out_ch, - mkn, - ) - if result: - config_results.append(result) - - # Print results table for this configuration - if config_results: - print( - f"\n{'Batch Size':<12}" - f"{'Triton (us)':<15}" - f"{'CUTLASS (us)':<15}" - ) - print("-" * 45) - for result in config_results: - print( - f"{result['batch_size']:<12}" - f"{result['triton_time_us']:<15.2f}" - f"{result['cutlass_time_us']:<15.2f}" - ) - - all_results.extend(config_results) - - print(f"\nTotal benchmarks completed: {len(all_results)}") - - -if __name__ == "__main__": - parser = FlexibleArgumentParser( - description="""Benchmark CUTLASS FP8 MOE vs Triton FP8 FUSED MOE - across specified models/shapes/batches - - Example usage: - python benchmark_cutlass_moe_fp8.py \ - --model "Llama-4-Maverick-17B-128E-Instruct-FP8" \ - --tp-sizes 8 \ - --batch-size 2 4 8 \ - --per-act-token-opts false \ - --per-out-ch-opts false - - """ - ) - parser.add_argument( - "--models", - nargs="+", - type=str, - default=DEFAULT_MODELS, - choices=WEIGHT_SHAPES_MOE.keys(), - ) - parser.add_argument("--tp-sizes", nargs="+", type=int, default=DEFAULT_TP_SIZES) - parser.add_argument( - "--batch-sizes", nargs="+", type=int, default=DEFAULT_BATCH_SIZES - ) - parser.add_argument("--limit-k", nargs="+", type=int, default=[]) - parser.add_argument("--limit-n", nargs="+", type=int, default=[]) - parser.add_argument( - "--per-act-token-opts", - nargs="+", - type=lambda x: x.lower() == "true", - default=[False, True], - help="Per-activation token quantization 
options (true/false)", - ) - parser.add_argument( - "--per-out-ch-opts", - nargs="+", - type=lambda x: x.lower() == "true", - default=[False, True], - help="Per-output channel quantization options (true/false)", - ) - - args = parser.parse_args() - main(args) diff --git a/benchmarks/kernels/benchmark_device_communicators.py b/benchmarks/kernels/benchmark_device_communicators.py deleted file mode 100644 index 4cbdde5a5..000000000 --- a/benchmarks/kernels/benchmark_device_communicators.py +++ /dev/null @@ -1,508 +0,0 @@ -#!/usr/bin/env python3 -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -""" -Benchmark script for device communicators: -CustomAllreduce (oneshot, twoshot), PyNcclCommunicator, -and SymmMemCommunicator (multimem, two-shot). - -for NCCL symmetric memory you need to set the environment variables -NCCL_NVLS_ENABLE=1 NCCL_CUMEM_ENABLE=1 VLLM_USE_NCCL_SYMM_MEM=1, otherwise NCCL does -not use fast NVLS implementation for all reduce. - -Usage: - torchrun --nproc_per_node= benchmark_device_communicators.py [options] - -Example: - torchrun --nproc_per_node=2 benchmark_device_communicators.py - --sequence-lengths 512 1024 2048 --num-warmup 10 --num-trials 100 -""" - -import json -import os -import time -from contextlib import nullcontext -from typing import Callable, Optional - -import torch -import torch.distributed as dist -from torch.distributed import ProcessGroup - -from vllm.distributed.device_communicators.custom_all_reduce import CustomAllreduce -from vllm.distributed.device_communicators.pynccl import ( - PyNcclCommunicator, - register_nccl_symmetric_ops, -) -from vllm.distributed.device_communicators.pynccl_allocator import ( - set_graph_pool_id, -) -from vllm.distributed.device_communicators.symm_mem import SymmMemCommunicator -from vllm.logger import init_logger -from vllm.utils import FlexibleArgumentParser - -logger = init_logger(__name__) - -# Default sequence lengths to benchmark -DEFAULT_SEQUENCE_LENGTHS = [128, 512, 1024, 2048, 4096, 8192] - -# Fixed hidden size and dtype for all benchmarks -HIDDEN_SIZE = 8192 -BENCHMARK_DTYPE = torch.bfloat16 - -# CUDA graph settings -CUDA_GRAPH_CAPTURE_CYCLES = 10 - - -class CommunicatorBenchmark: - """Benchmark class for testing device communicators.""" - - def __init__( - self, - rank: int, - world_size: int, - device: torch.device, - cpu_group: ProcessGroup, - sequence_lengths: list[int], - ): - self.rank = rank - self.world_size = world_size - self.device = device - self.cpu_group = cpu_group - - # Calculate max_size_override based on largest sequence length - max_seq_len = max(sequence_lengths) - max_tensor_elements = max_seq_len * HIDDEN_SIZE - self.max_size_override = max_tensor_elements * BENCHMARK_DTYPE.itemsize + 1 - - # Initialize communicators - self.custom_allreduce = None - self.pynccl_comm = None - self.symm_mem_comm = None - self.symm_mem_comm_multimem = None - self.symm_mem_comm_two_shot = None - - self._init_communicators() - - def _init_communicators(self): - """Initialize all available communicators.""" - try: - self.custom_allreduce = CustomAllreduce( - group=self.cpu_group, - device=self.device, - max_size=self.max_size_override, - ) - if not self.custom_allreduce.disabled: - logger.info("Rank %s: CustomAllreduce initialized", self.rank) - else: - logger.info("Rank %s: CustomAllreduce disabled", self.rank) - except Exception as e: - logger.warning( - "Rank %s: Failed to initialize CustomAllreduce: %s", self.rank, e - ) - self.custom_allreduce = 
None - - try: - self.pynccl_comm = PyNcclCommunicator( - group=self.cpu_group, device=self.device - ) - if not self.pynccl_comm.disabled: - logger.info("Rank %s: PyNcclCommunicator initialized", self.rank) - register_nccl_symmetric_ops(self.pynccl_comm) - else: - logger.info("Rank %s: PyNcclCommunicator disabled", self.rank) - self.pynccl_comm = None - except Exception as e: - logger.warning( - "Rank %s: Failed to initialize PyNcclCommunicator: %s", self.rank, e - ) - self.pynccl_comm = None - - # Initialize variants for SymmMemCommunicator - try: - self.symm_mem_comm_multimem = SymmMemCommunicator( - group=self.cpu_group, - device=self.device, - force_multimem=True, - max_size_override=self.max_size_override, - ) - if not self.symm_mem_comm_multimem.disabled: - logger.info( - "Rank %s: SymmMemCommunicator (multimem) initialized", self.rank - ) - else: - self.symm_mem_comm_multimem = None - except Exception as e: - logger.warning( - "Rank %s: Failed to initialize SymmMemCommunicator (multimem): %s", - self.rank, - e, - ) - self.symm_mem_comm_multimem = None - - try: - self.symm_mem_comm_two_shot = SymmMemCommunicator( - group=self.cpu_group, - device=self.device, - force_multimem=False, - max_size_override=self.max_size_override, - ) - if not self.symm_mem_comm_two_shot.disabled: - logger.info( - "Rank %s: SymmMemCommunicator (two_shot) initialized", self.rank - ) - else: - self.symm_mem_comm_two_shot = None - except Exception as e: - logger.warning( - "Rank %s: Failed to initialize SymmMemCommunicator (two_shot): %s", - self.rank, - e, - ) - self.symm_mem_comm_two_shot = None - - def benchmark_allreduce( - self, sequence_length: int, num_warmup: int, num_trials: int - ) -> dict[str, float]: - """Benchmark allreduce operations for all available communicators.""" - - results = {} - - # Define communicators with their benchmark functions - communicators = [] - - if self.custom_allreduce is not None: - comm = self.custom_allreduce - # CustomAllreduce one-shot - communicators.append( - ( - "ca_1stage", - lambda t, c=comm: c.custom_all_reduce(t), - lambda t, c=comm: c.should_custom_ar(t), - comm.capture(), - "1stage", # env variable value - ) - ) - # CustomAllreduce two-shot - communicators.append( - ( - "ca_2stage", - lambda t, c=comm: c.custom_all_reduce(t), - lambda t, c=comm: c.should_custom_ar(t), - comm.capture(), - "2stage", # env variable value - ) - ) - - if self.pynccl_comm is not None: - comm = self.pynccl_comm - communicators.append( - ( - "pynccl", - lambda t, c=comm: c.all_reduce(t), - lambda t: True, # Always available if initialized - nullcontext(), - None, # no env variable needed - ) - ) - communicators.append( - ( - "pynccl-symm", - lambda t: torch.ops.vllm.all_reduce_symmetric_with_copy(t), - lambda t: True, # Always available if initialized - nullcontext(), - None, # no env variable needed - ) - ) - - if self.symm_mem_comm_multimem is not None: - comm = self.symm_mem_comm_multimem - communicators.append( - ( - "symm_mem_multimem", - lambda t, c=comm: c.all_reduce(t), - lambda t, c=comm: c.should_use_symm_mem(t), - nullcontext(), - None, # no env variable needed - ) - ) - - if self.symm_mem_comm_two_shot is not None: - comm = self.symm_mem_comm_two_shot - communicators.append( - ( - "symm_mem_two_shot", - lambda t, c=comm: c.all_reduce(t), - lambda t, c=comm: c.should_use_symm_mem(t), - nullcontext(), - None, # no env variable needed - ) - ) - - # Benchmark each communicator - for name, allreduce_fn, should_use_fn, context, env_var in communicators: - # Set environment 
variable if needed - if env_var is not None: - os.environ["VLLM_CUSTOM_ALLREDUCE_ALGO"] = env_var - else: - # Clear the environment variable to avoid interference - os.environ.pop("VLLM_CUSTOM_ALLREDUCE_ALGO", None) - - latency = self.benchmark_allreduce_single( - sequence_length, - allreduce_fn, - should_use_fn, - context, - num_warmup, - num_trials, - ) - if latency is not None: - results[name] = latency - - return results - - def benchmark_allreduce_single( - self, - sequence_length: int, - allreduce_fn: Callable[[torch.Tensor], Optional[torch.Tensor]], - should_use_fn: Callable[[torch.Tensor], bool], - context, - num_warmup: int, - num_trials: int, - ) -> Optional[float]: - """Benchmark method with CUDA graph optimization.""" - try: - # Create test tensor (2D: sequence_length x hidden_size) - tensor = torch.randn( - sequence_length, HIDDEN_SIZE, dtype=BENCHMARK_DTYPE, device=self.device - ) - if not should_use_fn(tensor): - return None - - torch.cuda.synchronize() - stream = torch.cuda.Stream() - with torch.cuda.stream(stream): - graph_input = tensor.clone() - - # Warmup before capture - for _ in range(3): - allreduce_fn(graph_input) - - # Capture the graph using context manager - with context: - graph = torch.cuda.CUDAGraph() - graph_pool = torch.cuda.graph_pool_handle() - set_graph_pool_id(graph_pool) - with torch.cuda.graph(graph, pool=graph_pool): - for _ in range(CUDA_GRAPH_CAPTURE_CYCLES): - allreduce_fn(graph_input) - - torch.cuda.synchronize() - for _ in range(num_warmup): - graph.replay() - torch.cuda.synchronize() - - torch.cuda.synchronize() - start_time = time.perf_counter() - - for _ in range(num_trials): - graph.replay() - torch.cuda.synchronize() - - end_time = time.perf_counter() - - # Convert to ms and divide by CUDA_GRAPH_CAPTURE_CYCLES - return ( - (end_time - start_time) / num_trials / CUDA_GRAPH_CAPTURE_CYCLES * 1000 - ) - - except Exception as e: - logger.error("CUDA graph benchmark failed: %s", e) - raise RuntimeError( - f"CUDA graph benchmark failed for communicator: {e}" - ) from e - - -def _calculate_speedup_info(comm_results: dict[str, float]) -> str: - """Calculate speedup information for a single tensor size.""" - if not comm_results: - return "N/A" - - # Find the fastest communicator - fastest_comm = min(comm_results.keys(), key=lambda k: comm_results[k]) - fastest_time = comm_results[fastest_comm] - - # Calculate speedup vs PyNccl if available - if "pynccl" in comm_results: - pynccl_time = comm_results["pynccl"] - speedup = pynccl_time / fastest_time - return f"{fastest_comm} ({speedup:.2f}x)" - else: - return f"{fastest_comm} (N/A)" - - -def print_results( - results: dict[str, dict[str, float]], sequence_lengths: list[int], world_size: int -): - """Print benchmark results in a formatted table.""" - - print(f"\n{'=' * 130}") - print("Device Communicator Benchmark Results") - print( - f"World Size: {world_size}, Data Type: {BENCHMARK_DTYPE}, " - f"Hidden Size: {HIDDEN_SIZE}" - ) - print(f"{'=' * 130}") - - # Get all communicator names - all_comms = set() - for size_results in results.values(): - all_comms.update(size_results.keys()) - - all_comms = sorted(list(all_comms)) - - # Print header - header = f"{'Tensor Shape':<20}{'Tensor Size':<15}" - for comm in all_comms: - header += f"{comm:<20}" - header += f"{'Best (Speedup vs PyNccl)':<30}" - print(header) - print("-" * len(header)) - - # Print results for each sequence length - for seq_len in sequence_lengths: - if seq_len in results: - # Calculate tensor size in elements and bytes - tensor_elements = 
seq_len * HIDDEN_SIZE - tensor_bytes = tensor_elements * BENCHMARK_DTYPE.itemsize - - # Format tensor size (MB) - tensor_size_mb = tensor_bytes / (1024 * 1024) - tensor_size_str = f"{tensor_size_mb:.2f} MB" - - # Format tensor shape - tensor_shape = f"({seq_len}, {HIDDEN_SIZE})" - - row = f"{tensor_shape:<20}{tensor_size_str:<15}" - for comm in all_comms: - if comm in results[seq_len]: - row += f"{results[seq_len][comm]:<20.3f}" - else: - row += f"{'N/A':<20}" - - # Calculate speedup information - speedup_info = _calculate_speedup_info(results[seq_len]) - row += f"{speedup_info:<30}" - - print(row) - - print(f"{'=' * 130}") - print("All times are in milliseconds (ms) per allreduce operation") - print("Speedup column shows: fastest_algorithm (speedup_vs_pynccl)") - - -def main(): - parser = FlexibleArgumentParser(description="Benchmark device communicators") - - parser.add_argument( - "--sequence-lengths", - type=int, - nargs="+", - default=DEFAULT_SEQUENCE_LENGTHS, - help="Sequence lengths to benchmark (tensor shape: seq_len x hidden_size)", - ) - - parser.add_argument( - "--num-warmup", type=int, default=5, help="Number of warmup iterations" - ) - - parser.add_argument( - "--num-trials", type=int, default=50, help="Number of benchmark trials" - ) - - parser.add_argument("--output-json", type=str, help="Output results to JSON file") - - args = parser.parse_args() - - # Initialize distributed - if not dist.is_initialized(): - dist.init_process_group(backend="gloo") - rank = dist.get_rank() - world_size = dist.get_world_size() - - # Set device - device = torch.device(f"cuda:{rank}") - torch.cuda.set_device(device) - - # Get CPU process group - cpu_group = dist.new_group(backend="gloo") - - # Disable USE_SYMM_MEM to avoid affecting the max_sizes - # in symm_mem and custom_all_reduce for benchmark - os.environ["VLLM_ALLREDUCE_USE_SYMM_MEM"] = "0" - - # Initialize benchmark - benchmark = CommunicatorBenchmark( - rank, world_size, device, cpu_group, args.sequence_lengths - ) - - # Run benchmarks - all_results = {} - - for seq_len in args.sequence_lengths: - if rank == 0: - logger.info( - "Benchmarking sequence length: %s (tensor shape: %s x %s)", - seq_len, - seq_len, - HIDDEN_SIZE, - ) - - results = benchmark.benchmark_allreduce( - sequence_length=seq_len, - num_warmup=args.num_warmup, - num_trials=args.num_trials, - ) - - all_results[seq_len] = results - - # Synchronize between ranks - dist.barrier() - - # Print results (only rank 0) - if rank == 0: - print_results(all_results, args.sequence_lengths, world_size) - - # Save to JSON if requested - if args.output_json: - # Add speedup information to results - enhanced_results = {} - for seq_len, comm_results in all_results.items(): - enhanced_results[seq_len] = { - "timings": comm_results, - "speedup_info": _calculate_speedup_info(comm_results), - } - - output_data = { - "world_size": world_size, - "dtype": str(BENCHMARK_DTYPE), - "hidden_size": HIDDEN_SIZE, - "sequence_lengths": args.sequence_lengths, - "num_warmup": args.num_warmup, - "num_trials": args.num_trials, - "cuda_graph_capture_cycles": CUDA_GRAPH_CAPTURE_CYCLES, - "results": enhanced_results, - } - - with open(args.output_json, "w") as f: - json.dump(output_data, f, indent=2) - - logger.info("Results saved to %s", args.output_json) - - # Cleanup - if cpu_group != dist.group.WORLD: - dist.destroy_process_group(cpu_group) - - -if __name__ == "__main__": - main() diff --git a/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py b/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py 
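The removed benchmarks above share one timing pattern: capture several kernel invocations inside a CUDA graph, replay the graph repeatedly, and normalize the elapsed time by the number of calls captured inside the graph. A self-contained sketch of that pattern (it assumes a CUDA device, uses CUDA events for timing, and substitutes a plain matmul for the real kernels) is:

```python
import torch

CAPTURE_CYCLES = 10  # calls captured inside the graph, as in the removed scripts


def time_graph_replay(fn, num_warmup=5, num_iters=50):
    # Warm up on a side stream so lazy kernel/workspace init is not captured.
    side = torch.cuda.Stream()
    side.wait_stream(torch.cuda.current_stream())
    with torch.cuda.stream(side):
        for _ in range(3):
            fn()
    torch.cuda.current_stream().wait_stream(side)

    graph = torch.cuda.CUDAGraph()
    with torch.cuda.graph(graph):
        for _ in range(CAPTURE_CYCLES):
            fn()

    for _ in range(num_warmup):
        graph.replay()
    torch.cuda.synchronize()

    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    start.record()
    for _ in range(num_iters):
        graph.replay()
    end.record()
    end.synchronize()
    # elapsed_time() is in milliseconds; normalize to a single captured call.
    return start.elapsed_time(end) / (num_iters * CAPTURE_CYCLES)


x = torch.randn(1024, 1024, device="cuda")
print(f"{time_graph_replay(lambda: torch.matmul(x, x)):.4f} ms per call")
```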
deleted file mode 100644 index 14330ae6f..000000000 --- a/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py +++ /dev/null @@ -1,427 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import torch -import torch.utils.benchmark as benchmark -from benchmark_shapes import WEIGHT_SHAPES_MOE - -from vllm import _custom_ops as ops -from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config -from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_config -from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp8 -from vllm.model_executor.layers.fused_moe.fused_moe import ( - fused_experts, - fused_topk, -) -from vllm.utils import FlexibleArgumentParser - -DEFAULT_MODELS = [ - "nm-testing/Mixtral-8x7B-Instruct-v0.1", - "nm-testing/deepseekv2-lite", - "ibm-granite/granite-3.0-1b-a400m", - "ibm-granite/granite-3.0-3b-a800m", -] -DEFAULT_BATCH_SIZES = [1, 4, 8, 16, 32, 64, 128, 256, 512] -DEFAULT_TP_SIZES = [1] - -PER_ACT_TOKEN_OPTS = [False] -PER_OUT_CH_OPTS = [False] - - -def to_fp8(tensor: torch.Tensor): - finfo = torch.finfo(torch.float8_e4m3fn) - return torch.round(tensor.clamp(min=finfo.min, max=finfo.max)).to( - dtype=torch.float8_e4m3fn - ) - - -def bench_run( - results: list[benchmark.Measurement], - model: str, - num_experts: int, - topk: int, - per_act_token: bool, - per_out_ch: bool, - mkn: tuple[int, int, int], -): - label = "Quant Matmul" - - sub_label = ( - "{}, num_experts={}, topk={}, per_act_token={} per_out_ch={}, MKN=({})".format( - model, num_experts, topk, per_act_token, per_out_ch, mkn - ) - ) - - print(f"Testing: {sub_label}") - - (m, k, n) = mkn - - dtype = torch.half - - a = torch.randn((m, k), device="cuda", dtype=dtype) / 10 - w1 = torch.randn((num_experts, 2 * n, k), device="cuda", dtype=dtype) / 10 - w2 = torch.randn((num_experts, k, n), device="cuda", dtype=dtype) / 10 - - _, a_scale = ops.scaled_fp8_quant(a) - - w1_q = torch.empty( - (num_experts, 2 * n, k), device="cuda", dtype=torch.float8_e4m3fn - ) - w2_q = torch.empty((num_experts, k, n), device="cuda", dtype=torch.float8_e4m3fn) - w1_scale = torch.empty((num_experts, 1, 1), device="cuda", dtype=torch.float32) - w2_scale = torch.empty((num_experts, 1, 1), device="cuda", dtype=torch.float32) - - for expert in range(num_experts): - w1_q[expert], w1_scale[expert] = ops.scaled_fp8_quant(w1[expert]) - w2_q[expert], w2_scale[expert] = ops.scaled_fp8_quant(w2[expert]) - - score = torch.randn((m, num_experts), device="cuda", dtype=dtype) - - topk_weights, topk_ids, token_expert_indices = fused_topk( - a, score, topk, renormalize=False - ) - - ab_strides1 = torch.full((num_experts,), k, device="cuda", dtype=torch.int64) - ab_strides2 = torch.full((num_experts,), n, device="cuda", dtype=torch.int64) - c_strides1 = torch.full((num_experts,), 2 * n, device="cuda", dtype=torch.int64) - c_strides2 = torch.full((num_experts,), k, device="cuda", dtype=torch.int64) - - def run_triton_moe( - a: torch.Tensor, - w1: torch.Tensor, - w2: torch.Tensor, - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - w1_scale: torch.Tensor, - w2_scale: torch.Tensor, - a_scale: torch.Tensor, - num_repeats: int, - ): - quant_config = fp8_w8a8_moe_quant_config( - w1_scale=w1_scale, - w2_scale=w2_scale, - a1_scale=a_scale, - ) - for _ in range(num_repeats): - fused_experts( - a, - w1, - w2, - topk_weights, - topk_ids, - quant_config=quant_config, - ) - - def run_cutlass_moe( - a: torch.Tensor, - a_scale: torch.Tensor, - w1: 
torch.Tensor, - w2: torch.Tensor, - w1_scale: torch.Tensor, - w2_scale: torch.Tensor, - ab_strides1: torch.Tensor, - ab_strides2: torch.Tensor, - c_strides1: torch.Tensor, - c_strides2: torch.Tensor, - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - per_act_token: bool, - num_repeats: int, - ): - quant_config = fp8_w8a8_moe_quant_config( - w1_scale=w1_scale, - w2_scale=w2_scale, - per_act_token_quant=per_act_token, - ) - - for _ in range(num_repeats): - cutlass_moe_fp8( - a, - w1, - w2, - topk_weights, - topk_ids, - ab_strides1, - ab_strides2, - c_strides1, - c_strides2, - quant_config=quant_config, - ) - - def run_cutlass_from_graph( - a: torch.Tensor, - a_scale: torch.Tensor, - w1_q: torch.Tensor, - w2_q: torch.Tensor, - w1_scale: torch.Tensor, - w2_scale: torch.Tensor, - ab_strides1: torch.Tensor, - ab_strides2: torch.Tensor, - c_strides1: torch.Tensor, - c_strides2: torch.Tensor, - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - ): - quant_config = fp8_w8a8_moe_quant_config( - w1_scale=w1_scale, - w2_scale=w2_scale, - per_act_token_quant=per_act_token, - ) - - with set_current_vllm_config( - VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1)) - ): - return cutlass_moe_fp8( - a, - w1_q, - w2_q, - topk_weights, - topk_ids, - ab_strides1, - ab_strides2, - c_strides1, - c_strides2, - quant_config=quant_config, - ) - - def run_triton_from_graph( - a: torch.Tensor, - w1: torch.Tensor, - w2: torch.Tensor, - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - w1_scale: torch.Tensor, - w2_scale: torch.Tensor, - a_scale: torch.Tensor, - ): - quant_config = fp8_w8a8_moe_quant_config( - w1_scale=w1_scale, - w2_scale=w2_scale, - a1_scale=a_scale, - ) - with set_current_vllm_config( - VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1)) - ): - return fused_experts( - a, - w1, - w2, - topk_weights, - topk_ids, - quant_config=quant_config, - ) - - def replay_graph(graph, num_repeats): - for _ in range(num_repeats): - graph.replay() - torch.cuda.synchronize() - - cutlass_stream = torch.cuda.Stream() - cutlass_graph = torch.cuda.CUDAGraph() - with torch.cuda.graph(cutlass_graph, stream=cutlass_stream): - run_cutlass_from_graph( - a, - a_scale, - w1_q, - w2_q, - w1_scale, - w2_scale, - ab_strides1, - ab_strides2, - c_strides1, - c_strides2, - topk_weights, - topk_ids, - ) - torch.cuda.synchronize() - - triton_stream = torch.cuda.Stream() - triton_graph = torch.cuda.CUDAGraph() - with torch.cuda.graph(triton_graph, stream=triton_stream): - run_triton_from_graph( - a, - w1_q, - w2_q, - topk_weights, - topk_ids, - w1_scale, - w2_scale, - a_scale, - ) - torch.cuda.synchronize() - - min_run_time = 5 - num_warmup = 5 - num_runs = 25 - - globals = { - # Baseline params - "w1": w1, - "w2": w2, - "score": score, - "topk": topk, - # Cutlass params - "a_scale": a_scale, - "w1_q": w1_q, - "w2_q": w2_q, - "w1_scale": w1_scale, - "w2_scale": w2_scale, - "per_act_token": per_act_token, - "ab_strides1": ab_strides1, - "ab_strides2": ab_strides2, - "c_strides1": c_strides1, - "c_strides2": c_strides2, - # cuda graph params - "cutlass_graph": cutlass_graph, - "triton_graph": triton_graph, - # Gen params - "a": a, - "topk_weights": topk_weights, - "topk_ids": topk_ids, - "num_runs": num_runs, - # Kernels - "run_triton_moe": run_triton_moe, - "run_cutlass_moe": run_cutlass_moe, - "replay_graph": replay_graph, - } - - # Warmup - run_triton_moe( - a, - w1_q, - w2_q, - topk_weights, - topk_ids, - w1_scale, - w2_scale, - a_scale, - num_warmup, - ) - - results.append( - 
benchmark.Timer( - stmt="run_triton_moe(a, w1_q, w2_q, topk_weights, topk_ids, w1_scale, w2_scale, a_scale, num_runs)", # noqa: E501 - globals=globals, - label=label, - sub_label=sub_label, - description="triton_moe", - ).blocked_autorange(min_run_time=min_run_time) - ) - - # Warmup - replay_graph(triton_graph, num_warmup) - - results.append( - benchmark.Timer( - stmt="replay_graph(triton_graph, num_runs)", - globals=globals, - label=label, - sub_label=sub_label, - description="triton_moe_cuda_graphs", - ).blocked_autorange(min_run_time=min_run_time) - ) - - # Warmup - run_cutlass_moe( - a, - a_scale, - w1_q, - w2_q, - w1_scale, - w2_scale, - ab_strides1, - ab_strides2, - c_strides1, - c_strides2, - topk_weights, - topk_ids, - per_act_token, - num_warmup, - ) - - results.append( - benchmark.Timer( - stmt="run_cutlass_moe(a, a_scale, w1_q, w2_q, w1_scale, w2_scale, ab_strides1, ab_strides2, c_strides1, c_strides2, topk_weights, topk_ids, per_act_token, num_runs)", # noqa: E501 - globals=globals, - label=label, - sub_label=sub_label, - description="grouped_gemm_moe", - ).blocked_autorange(min_run_time=min_run_time) - ) - - # Warmup - replay_graph(cutlass_graph, num_warmup) - - results.append( - benchmark.Timer( - stmt="replay_graph(cutlass_graph, num_runs)", - globals=globals, - label=label, - sub_label=sub_label, - description="grouped_gemm_moe_cuda_graphs", - ).blocked_autorange(min_run_time=min_run_time) - ) - - -def main(args): - print("Benchmarking models:") - for i, model in enumerate(args.models): - print(f"[{i}] {model}") - - results: list[benchmark.Measurement] = [] - - for model in args.models: - for tp in args.tp_sizes: - for layer in WEIGHT_SHAPES_MOE[model]: - num_experts = layer[0] - topk = layer[1] - size_k = layer[2] - size_n = layer[3] // tp - - if len(args.limit_k) > 0 and size_k not in args.limit_k: - continue - - if len(args.limit_n) > 0 and size_n not in args.limit_n: - continue - - for per_act_token in PER_ACT_TOKEN_OPTS: - for per_out_ch in PER_OUT_CH_OPTS: - for size_m in DEFAULT_BATCH_SIZES: - mkn = (size_m, size_k, size_n) - bench_run( - results, - model, - num_experts, - topk, - per_act_token, - per_out_ch, - mkn, - ) - - compare = benchmark.Compare(results) - compare.print() - - -if __name__ == "__main__": - parser = FlexibleArgumentParser( - description="Benchmark Marlin across specified models/shapes/batches" - ) - parser.add_argument( - "--models", - nargs="+", - type=str, - default=DEFAULT_MODELS, - choices=WEIGHT_SHAPES_MOE.keys(), - ) - parser.add_argument("--tp-sizes", nargs="+", type=int, default=DEFAULT_TP_SIZES) - parser.add_argument( - "--batch-sizes", nargs="+", type=int, default=DEFAULT_BATCH_SIZES - ) - parser.add_argument("--limit-k", nargs="+", type=int, default=[]) - parser.add_argument("--limit-n", nargs="+", type=int, default=[]) - parser.add_argument("--limit-num-groups", nargs="+", type=int, default=[]) - parser.add_argument("--limit-per-act-token", nargs="+", type=int, default=[]) - parser.add_argument("--limit-per-out-ch", nargs="+", type=int, default=[]) - - args = parser.parse_args() - main(args) diff --git a/benchmarks/kernels/benchmark_layernorm.py b/benchmarks/kernels/benchmark_layernorm.py deleted file mode 100644 index 69978ec6b..000000000 --- a/benchmarks/kernels/benchmark_layernorm.py +++ /dev/null @@ -1,93 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import time - -import torch - -from vllm.model_executor.layers.layernorm import RMSNorm -from vllm.platforms 
import current_platform -from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser - - -@torch.inference_mode() -def main( - num_tokens: int, - hidden_size: int, - add_residual: bool, - dtype: torch.dtype, - seed: int = 0, - do_profile: bool = False, - num_warmup_iters: int = 5, - num_iters: int = 100, -) -> None: - current_platform.seed_everything(seed) - torch.set_default_device("cuda") - - layer = RMSNorm(hidden_size).to(dtype=dtype) - layer.weight.data.normal_(mean=1.0, std=0.1) - scale = 1 / (2 * hidden_size) - x = torch.randn(num_tokens, hidden_size, dtype=dtype) - x *= scale - residual = torch.randn_like(x) * scale if add_residual else None - - def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float: - torch.cuda.synchronize() - if profile: - torch.cuda.cudart().cudaProfilerStart() - start_time = time.perf_counter() - - for _ in range(num_iters): - layer(x, residual) - torch.cuda.synchronize() - - end_time = time.perf_counter() - if profile: - torch.cuda.cudart().cudaProfilerStop() - return (end_time - start_time) / num_iters - - # Warmup. - print("Warming up...") - run_benchmark = run_cuda_benchmark - run_benchmark(num_iters=num_warmup_iters, profile=False) - - # Benchmark. - if do_profile: - latency = run_benchmark(num_iters=1, profile=True) - else: - latency = run_benchmark(num_iters=num_iters, profile=False) - print(f"Kernel running time: {latency * 1000000:.3f} us") - - -if __name__ == "__main__": - parser = FlexibleArgumentParser(description="Benchmark the layernorm kernel.") - parser.add_argument("--num-tokens", type=int, default=4096) - parser.add_argument("--hidden-size", type=int, default=8192) - parser.add_argument("--add-residual", action="store_true") - parser.add_argument( - "--dtype", type=str, choices=["half", "bfloat16", "float"], default="half" - ) - parser.add_argument("--seed", type=int, default=0) - parser.add_argument("--profile", action="store_true") - parser.add_argument("--num-warmup-iters", type=int, default=5) - parser.add_argument( - "--num-iters", - type=int, - default=100, - help="Number of benchmark iterations. 
" - "If --profile is set, this number is ignored", - ) - - args = parser.parse_args() - print(args) - - main( - num_tokens=args.num_tokens, - hidden_size=args.hidden_size, - add_residual=args.add_residual, - dtype=STR_DTYPE_TO_TORCH_DTYPE[args.dtype], - seed=args.seed, - do_profile=args.profile, - num_warmup_iters=args.num_warmup_iters, - num_iters=args.num_iters, - ) diff --git a/benchmarks/kernels/benchmark_lora.py b/benchmarks/kernels/benchmark_lora.py deleted file mode 100644 index 799b16999..000000000 --- a/benchmarks/kernels/benchmark_lora.py +++ /dev/null @@ -1,1071 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import argparse -import copy -import json -import pickle -import time -from dataclasses import dataclass -from enum import Enum, auto -from itertools import product -from pathlib import Path -from typing import Any, Callable, Optional - -import torch -import torch.utils.benchmark as TBenchmark -from torch.utils.benchmark import Measurement as TMeasurement -from utils import ArgPool, Bench, CudaGraphBenchParams -from weight_shapes import WEIGHT_SHAPES - -from vllm.triton_utils import HAS_TRITON - -if HAS_TRITON: - from vllm.lora.ops.triton_ops import LoRAKernelMeta, lora_expand, lora_shrink - from vllm.lora.ops.triton_ops.utils import _LORA_A_PTR_DICT, _LORA_B_PTR_DICT - -from vllm.utils import FlexibleArgumentParser - -DEFAULT_MODELS = list(WEIGHT_SHAPES.keys()) -DEFAULT_TP_SIZES = [1] -DEFAULT_BATCH_SIZES = [ - 1, - 16, - 32, - 64, - 128, - 192, - 256, - 320, - 384, - 448, - 512, - 640, - 768, - 896, - 1024, - 2048, - 3072, - 4096, - 5120, - 6144, - 7168, - 8192, -] -DEFAULT_HIDDEN_SIZES = [1024, 2048, 4096, 8192, 16384] -DEFAULT_LORA_RANKS = [16] -DEFAULT_NUM_LORAS = [1, 2, 3, 4] -DEFAULT_SORT_BY_LORA_IDS = [False, True] -DEFAULT_SEQ_LENGTHS = [1] -DEFAULT_EXPAND_FN_ADD_INPUTS = [True, False] - - -# Utilities -def dtype_to_str(dtype: torch.dtype): - if dtype == torch.float16: - return "f16" - if dtype == torch.bfloat16: - return "bf16" - if dtype == torch.float32: - return "f32" - raise ValueError(f"Unsupported dtype {dtype}") - - -def make_rand_lora_weight_tensor( - k: int, n: int, num_loras: int, dtype: torch.dtype, device: str = "cuda" -) -> torch.Tensor: - # LoRA weights column major - return torch.rand((num_loras, n, k), dtype=dtype).to(device) - - -def make_rand_tensors( - a_shape: tuple[int, ...], - b_shape: tuple[int, ...], - c_shape: tuple[int, ...], - a_dtype: torch.dtype, - b_dtype: torch.dtype, - c_dtype: torch.dtype, - num_slices: int, - device: str = "cuda", -) -> tuple[torch.Tensor, list[torch.Tensor], torch.Tensor]: - """ - Make LoRA input/output matrices. - """ - A = torch.rand(a_shape, dtype=a_dtype).to(device) - - # LoRA weights column major - Bs = [torch.rand(b_shape, dtype=b_dtype).to(device) for _ in range(num_slices)] - - C = torch.zeros(c_shape, dtype=c_dtype).to(device) - return A, Bs, C - - -def make_prompt_lora_mapping( - num_prompts: int, num_active_loras: int, sort_by_lora_id: bool, device: str -) -> torch.Tensor: - """ - All prompts are mapped to a LoRA ID in range [0, num_active_loras). - where 0 refers to first lora, 1 refers to second lora and so on. - """ - assert num_active_loras > 0 - - if not sort_by_lora_id: - return torch.randint(0, num_active_loras, (num_prompts,), dtype=torch.long) - - # Divide LoRAs equally and in order. 
- part_size = num_prompts // num_active_loras - part_size = max(part_size, 1) - - lora_id = 0 - prompt_lora_mapping = [] - while len(prompt_lora_mapping) < num_prompts: - prompt_lora_mapping.extend([lora_id] * part_size) - lora_id = lora_id + 1 if lora_id + 1 < num_active_loras else lora_id - return torch.tensor( - prompt_lora_mapping[:num_prompts], dtype=torch.long, device=device - ) - - -def make_token_lora_mapping( - num_tokens: int, - num_prompts: int, - prompt_lora_mapping: torch.Tensor, - seq_len_tensor: torch.Tensor, - device: str, -): - """ - Make token_lora_mapping from prompt_lora_mapping and seq_lens_tensor - """ - assert prompt_lora_mapping.shape[0] == num_prompts - - # token to lora index mapping - token_lora_mapping = [0] * num_tokens - current_offset = 0 - for b_id in range(num_prompts): - lora_index = prompt_lora_mapping[b_id].item() - s = current_offset - e = s + seq_len_tensor[b_id].item() - token_lora_mapping[s:e] = [lora_index] * (e - s) - current_offset += seq_len_tensor[b_id].item() - - return torch.tensor(token_lora_mapping, dtype=torch.long, device=device) - - -def ref_group_gemm( - ref_out: torch.Tensor, - input: torch.Tensor, - lora_weights: list[torch.Tensor], - seq_lens_cpu: torch.Tensor, - prompt_lora_mapping_cpu: torch.Tensor, - scaling: float, - add_inputs: Optional[bool], -): - """ - Torch group gemm reference implementation to test correctness of - benchmarking operations. - """ - batches = seq_lens_cpu.size(0) - out_list = [] - current_offset = 0 - for lora_index, b_length in zip(range(batches), seq_lens_cpu): - x = input[current_offset : b_length + current_offset, :] - current_offset += b_length - w = lora_weights[prompt_lora_mapping_cpu[lora_index]] - result = torch.nn.functional.linear(x, w) - result *= scaling - out_list.append(result) - - cat_result = torch.cat(out_list, dim=0) - - if add_inputs: - ref_out += cat_result - else: - ref_out.copy_(cat_result) - - -class OpType(Enum): - """ - LoRA Ops to benchmark and its properties. 
- """ - - LORA_SHRINK = auto() - LORA_EXPAND = auto() - - @staticmethod - def from_str(s: str) -> "OpType": - if s.lower() == "lora_shrink": - return OpType.LORA_SHRINK - if s.lower() == "lora_expand": - return OpType.LORA_EXPAND - raise ValueError(f"Unrecognized str {s} to convert to OpType") - - def is_shrink_fn(self) -> bool: - return self in [OpType.LORA_SHRINK] - - def is_expand_fn(self) -> bool: - return self in [OpType.LORA_EXPAND] - - def num_slices(self) -> list[int]: - return [1, 2, 3] - - def mkn( - self, batch_size: int, seq_length: int, hidden_size: int, lora_rank: int - ) -> tuple[int, int, int]: - num_tokens = batch_size * seq_length - if self.is_shrink_fn(): - m = num_tokens - k = hidden_size - n = lora_rank - else: - assert self.is_expand_fn() - m = num_tokens - k = lora_rank - n = hidden_size - return m, k, n - - def matmul_dtypes( - self, op_dtype: torch.dtype - ) -> tuple[torch.dtype, torch.dtype, torch.dtype]: - """ - return a type, b type and c type for A x B = C - """ - if self.is_shrink_fn(): - return op_dtype, op_dtype, torch.float32 - else: - assert self.is_expand_fn() - return torch.float32, op_dtype, op_dtype - - def matmul_shapes( - self, - batch_size: int, - seq_length: int, - hidden_size: int, - lora_rank: int, - num_loras: int, - num_slices: int, - ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]: - """ - Given num_slices, return the shapes of the A, B, and C matrices - in A x B = C, for the op_type - """ - m, k, n = self.mkn(batch_size, seq_length, hidden_size, lora_rank) - - b_shape = (num_loras, n, k) # col-major - if self in [OpType.LORA_SHRINK]: - # LoRA shrink kernels support num_slices inherently in the kernel. - return ((m, k), b_shape, (num_slices, m, n)) - if self in [OpType.LORA_EXPAND]: - # LoRA expand kernels support num_slices inherently in the kernel - return ((num_slices, m, k), b_shape, (m, n * num_slices)) - raise ValueError(f"Unrecognized op_type {self}") - - def bench_fn(self) -> Callable: - if self == OpType.LORA_SHRINK: - return lora_shrink - if self == OpType.LORA_EXPAND: - return lora_expand - - raise ValueError(f"Unrecognized optype {self}") - - def run_ref_group_gemm( - self, - output: torch.Tensor, - input: torch.Tensor, - lora_weights: list[torch.Tensor], - **kwargs, - ) -> Callable: - """Each benchmark operation expects the input, lora_weights and outputs - in a slightly different format. Refer to self.matmul_shapes(). - run_ref_group_gemm accounts for those differences in executing a - reference group gemm for correctness testing. 
- """ - w_dtype = lora_weights[0].dtype - num_slices = len(lora_weights) - if self in [OpType.LORA_SHRINK]: - for slice_idx in range(num_slices): - ref_group_gemm( - ref_out=output[slice_idx, :], - input=input, - lora_weights=lora_weights[slice_idx], - **kwargs, - ) - elif self in [OpType.LORA_EXPAND]: - hidden_size = lora_weights[0].shape[1] - for slice_idx in range(num_slices): - slice_offset = slice_idx * hidden_size - ref_group_gemm( - ref_out=output[:, slice_offset : slice_offset + hidden_size], - input=input[slice_idx].clone().to(dtype=w_dtype), - lora_weights=lora_weights[slice_idx], - **kwargs, - ) - else: - raise ValueError(f"Unrecognized optype {self}") - - -@dataclass -class BenchmarkContext: - """ - LoRA benchmark context - """ - - batch_size: int - hidden_size: int - num_loras: int - num_active_loras: int - lora_rank: int - sort_by_lora_id: bool - dtype: torch.dtype - seq_length: Optional[int] = None - num_slices: Optional[int] = None # num_slices for slice based ops - - def with_seq_length(self, seq_length: int) -> "BenchmarkContext": - ctx = copy.copy(self) - ctx.seq_length = seq_length - return ctx - - def with_num_slices(self, num_slices: int) -> "BenchmarkContext": - ctx = copy.copy(self) - ctx.num_slices = num_slices - return ctx - - def bench_label(self) -> str: - return f"lora-{self.dtype}" - - def bench_sublabel(self, op_type: OpType) -> str: - m, k, n = op_type.mkn( - self.batch_size, self.seq_length, self.hidden_size, self.lora_rank - ) - desc = { - "bs": self.batch_size, - "sl": self.seq_length, - "m": m, - "k": k, - "n": n, - "num_loras": self.num_loras, - "sort_by_lora": self.sort_by_lora_id, - "num_slices": self.num_slices, - } - return json.dumps(desc) - - -@dataclass -class BenchmarkTensors: - """ - Input/Output tensors used for benchmarks - """ - - # matmul tensors - input: torch.Tensor - lora_weights_lst: list[torch.Tensor] - output: torch.Tensor - # LoRA kernel metadata - lora_kernel_meta: LoRAKernelMeta - # Metadata tensors used in testing correctness - seq_lens: torch.Tensor - prompt_lora_mapping: torch.Tensor - - def io_types(self) -> str: - return ( - f"{dtype_to_str(self.input.dtype)}x" - f"{dtype_to_str(self.lora_weights_lst[0].dtype)}=>" - f"{dtype_to_str(self.output.dtype)}" - ) - - @staticmethod - def make( - ctx: BenchmarkContext, op_type: OpType, device: str = "cuda" - ) -> "BenchmarkTensors": - # Make input / output matmul tensors. - a_shape, b_shape, c_shape = op_type.matmul_shapes( - ctx.batch_size, - ctx.seq_length, - ctx.hidden_size, - ctx.lora_rank, - ctx.num_loras, - ctx.num_slices, - ) - a_type, b_type, c_type = op_type.matmul_dtypes(ctx.dtype) - input_tensor, lora_weights, output_tensor = make_rand_tensors( - a_shape, b_shape, c_shape, a_type, b_type, c_type, num_slices=ctx.num_slices - ) - - # Make metadata tensors. - # Keep the metadata tensors in the CPU for further processing if needed. - # The tensors get moved to the GPU before benchmarking. - assert ctx.num_active_loras <= ctx.num_loras - total_tokens = ctx.batch_size * ctx.seq_length - - # Make metadata tensors involved in correctness testing. 
- # Prepare seq lens tensor - seq_len_tensor = torch.randint( - ctx.seq_length, ctx.seq_length + 1, (ctx.batch_size,) - ) - assert total_tokens == seq_len_tensor.sum() - # Prepare prompt lora indices tensor - prompt_lora_indices_tensor = make_prompt_lora_mapping( - ctx.batch_size, ctx.num_active_loras, ctx.sort_by_lora_id, "cpu" - ) - - # Make LoRAKernelMeta - token_lora_indices_tensor = make_token_lora_mapping( - total_tokens, - ctx.batch_size, - prompt_lora_indices_tensor, - seq_len_tensor, - "cpu", - ) - lora_kernel_meta = LoRAKernelMeta.make( - max_loras=ctx.num_loras, - max_num_tokens=token_lora_indices_tensor.size(0), - device="cpu", - ) - lora_kernel_meta.prepare_tensors(token_lora_mapping=token_lora_indices_tensor) - - return BenchmarkTensors( - input_tensor, - lora_weights, - output_tensor, - lora_kernel_meta, - seq_len_tensor, - prompt_lora_indices_tensor, - ) - - def sanity_check(self) -> None: - """ - Fails asserts when non-conformality is detected. - """ - num_tokens = self.input.shape[-2] - # check metadata tensors - assert torch.sum(self.seq_lens) == num_tokens - num_seqs = self.seq_lens.shape[0] - # assert self.seq_start_loc.shape[0] == num_seqs - assert self.prompt_lora_mapping.shape[0] == num_seqs - assert self.lora_kernel_meta.token_lora_mapping.shape[0] == num_tokens - - def to_device(self, device: str): - """ - Transfer tensors to device if the tensors aren't already on the device - """ - - def to_device(tensor: torch.Tensor): - if tensor.device != device: - tensor = tensor.to(device=device) - return tensor - - self.input = to_device(self.input) - self.output = to_device(self.output) - self.seq_lens = to_device(self.seq_lens) - self.prompt_lora_mapping = to_device(self.prompt_lora_mapping) - for i in range(len(self.lora_weights_lst)): - self.lora_weights_lst[i] = to_device(self.lora_weights_lst[i]) - - # LoRA meta - for field_name in LoRAKernelMeta.__dataclass_fields__: - field = getattr(self.lora_kernel_meta, field_name) - assert isinstance(field, torch.Tensor) - setattr( - self.lora_kernel_meta, - field_name, - to_device(field) if field_name != "no_lora_flag_cpu" else field, - ) - - def metadata(self) -> tuple[int, int, int]: - """ - Return num_seqs, num_tokens and max_seq_len - """ - num_seqs = self.seq_lens.shape[0] - num_tokens = self.lora_kernel_meta.token_lora_mapping.shape[0] - max_seq_len = torch.max(self.seq_lens).item() - num_slices = len(self.lora_weights_lst) - return num_seqs, num_tokens, max_seq_len, num_slices - - def as_lora_shrink_kwargs(self) -> dict[str, Any]: - self.sanity_check() - self.to_device(self.input.device) - - _, num_tokens, _, num_slices = self.metadata() - - # Sanity check matrix shapes. 
- i_shape, lw_shape, o_shape = ( - self.input.shape, - self.lora_weights_lst[0].shape, - self.output.shape, - ) - # Expected input shape [num_tokens, hidden_size] - assert len(i_shape) == 2 - assert i_shape[0] == num_tokens - hidden_size = i_shape[1] - # Expected lora weight shape [num_loras, lora_rank, hidden_size] - assert len(lw_shape) == 3 - assert lw_shape[2] == hidden_size - lora_rank = lw_shape[1] - # Expected output shape [num_slices, num_tokens, lora_rank] - assert len(o_shape) == 3 - assert o_shape == (num_slices, num_tokens, lora_rank) - - return { - "inputs": self.input, - "lora_a_weights": self.lora_weights_lst, - "output_tensor": self.output, - "token_lora_mapping": self.lora_kernel_meta.token_lora_mapping, - "token_indices_sorted_by_lora_ids": ( - self.lora_kernel_meta.token_indices_sorted_by_lora_ids - ), - "num_tokens_per_lora": self.lora_kernel_meta.num_tokens_per_lora, - "lora_token_start_loc": self.lora_kernel_meta.lora_token_start_loc, - "lora_ids": self.lora_kernel_meta.active_lora_ids, - "scaling": 1.0, - "no_lora_flag_cpu": self.lora_kernel_meta.no_lora_flag_cpu, - } - - def as_lora_expand_kwargs(self, add_inputs: bool) -> dict[str, Any]: - self.sanity_check() - self.to_device(self.input.device) - - _, num_tokens, _, num_slices = self.metadata() - - # Sanity check matrix shapes. - i_shape, lw_shape, o_shape = ( - self.input.shape, - self.lora_weights_lst[0].shape, - self.output.shape, - ) - # Expected input shape : [num_slices, num_tokens, lora_rank] - assert len(i_shape) == 3 - assert i_shape[0] == num_slices - assert i_shape[1] == num_tokens - lora_rank = i_shape[2] - # Expected lora weight shape : [num_lora, hidden_size, lora_rank] - assert len(lw_shape) == 3 - assert lw_shape[2] == lora_rank - hidden_size = lw_shape[1] - # Expected output shape : [num_tokens, hidden_size * num_slices] - assert len(o_shape) == 2 - assert o_shape == (num_tokens, hidden_size * num_slices) - - return { - "inputs": self.input, - "lora_b_weights": self.lora_weights_lst, - "output_tensor": self.output, - "token_lora_mapping": self.lora_kernel_meta.token_lora_mapping, - "token_indices_sorted_by_lora_ids": ( - self.lora_kernel_meta.token_indices_sorted_by_lora_ids - ), - "num_tokens_per_lora": self.lora_kernel_meta.num_tokens_per_lora, - "lora_token_start_loc": self.lora_kernel_meta.lora_token_start_loc, - "lora_ids": self.lora_kernel_meta.active_lora_ids, - "offset_start": 0, - "add_inputs": add_inputs, - "no_lora_flag_cpu": self.lora_kernel_meta.no_lora_flag_cpu, - } - - def bench_fn_kwargs( - self, op_type: OpType, add_inputs: Optional[bool] = None - ) -> dict[str, Any]: - if op_type.is_shrink_fn(): - assert add_inputs is None - else: - assert add_inputs is not None - - if op_type == OpType.LORA_SHRINK: - return self.as_lora_shrink_kwargs() - if op_type == OpType.LORA_EXPAND: - return self.as_lora_expand_kwargs(add_inputs) - raise ValueError(f"Unrecognized optype {self}") - - def test_correctness( - self, op_type: OpType, expand_fn_add_inputs: Optional[bool] - ) -> bool: - """ - Test correctness of op_type implementation against a grouped gemm - reference implementation. 
- """ - seq_lens_cpu = self.seq_lens.to(device="cpu") - prompt_lora_mapping_cpu = self.prompt_lora_mapping.to(device="cpu") - ref_output = self.output.clone() - - self.output.zero_() - op_type.bench_fn()(**self.bench_fn_kwargs(op_type, expand_fn_add_inputs)) - - op_type.run_ref_group_gemm( - ref_output, - self.input, - self.lora_weights_lst, - seq_lens_cpu=seq_lens_cpu, - prompt_lora_mapping_cpu=prompt_lora_mapping_cpu, - scaling=1.0, - add_inputs=expand_fn_add_inputs, - ) - - rtol, atol = { - torch.float16: (6e-2, 6e-2), - torch.bfloat16: (6e-2, 6e-2), - torch.float32: (1e-2, 1e-2), - }[self.output.dtype] - - return torch.allclose(ref_output, self.output, rtol=rtol, atol=atol) - - -def bench_optype( - ctx: BenchmarkContext, - arg_pool_size: int, - op_type: OpType, - cuda_graph_nops: Optional[int] = None, - expand_fn_add_inputs: Optional[bool] = None, - test_correctness: bool = False, -) -> TMeasurement: - assert arg_pool_size >= 1 - if op_type.is_shrink_fn(): - assert expand_fn_add_inputs is None - else: - assert expand_fn_add_inputs is not None - - # BenchmarkContext -> BenchmarkTensors - bench_tensors: list[BenchmarkTensors] = [ - BenchmarkTensors.make(ctx, op_type) for _ in range(arg_pool_size) - ] - for bt in bench_tensors: - bt.sanity_check() - - # Test correctness of our implementation. - if test_correctness: - assert all( - [bt.test_correctness(op_type, expand_fn_add_inputs) for bt in bench_tensors] - ) - - # BenchmarkTensors -> dict (kwargs) - kwargs_list = [ - bt.bench_fn_kwargs(op_type, add_inputs=expand_fn_add_inputs) - for bt in bench_tensors - ] - - # Clear LoRA optimization hash-maps. - _LORA_A_PTR_DICT.clear() - _LORA_B_PTR_DICT.clear() - # Run bench function so that _LORA_A_PTR_DICT and _LORA_B_PTR_DICT are set up - for kwargs in kwargs_list: - op_type.bench_fn()(**kwargs) - torch.cuda.synchronize() - - # Merge into a single kwargs and qualify arguments as ArgPool - kwargs = {k: ArgPool([]) for k in kwargs_list[0]} - for _kwargs in kwargs_list: - for k, v in _kwargs.items(): - kwargs[k].values.append(v) - - describe_args = ( - f"add_inputs={expand_fn_add_inputs}" if expand_fn_add_inputs is not None else "" - ) - description = f"{op_type.name}({describe_args}) ({bench_tensors[0].io_types()})" - - cuda_graph_params = None - if cuda_graph_nops: - cuda_graph_params = CudaGraphBenchParams(cuda_graph_nops) - timer = None - with Bench( - cuda_graph_params, - ctx.bench_label(), - ctx.bench_sublabel(op_type), - description, - op_type.bench_fn(), - **kwargs, - ) as bench: - timer = bench.run() - return timer - - -def bench_torch_mm( - ctx: BenchmarkContext, - arg_pool_size: int, - op_type: OpType, - cuda_graph_nops: Optional[int] = None, -) -> TMeasurement: - """ - Benchmark basic torch.mm as a roofline. - - When all the input tokens have the same LoRA ID, the LoRA kernels are just - a matmul. This torch.mm benchmark serves as a roofline for that case. - - input op_type is used in determining the m, k, n dimensions for the matmul. - """ - - batch_size, hidden_size, lora_rank, seq_length, dtype = ( - ctx.batch_size, - ctx.hidden_size, - ctx.lora_rank, - ctx.seq_length, - ctx.dtype, - ) - - m, k, n = op_type.mkn(batch_size, seq_length, hidden_size, lora_rank) - # For a fairer comparison. 
- n = n * ctx.num_slices - - # Get matmul input and output tensors for A x B = C - As, Bs, Cs = [], [], [] - for _ in range(arg_pool_size): - As.append(torch.rand((m, k), dtype=dtype).to("cuda")) - Bs.append(torch.rand((n, k), dtype=dtype).to("cuda").t()) - Cs.append(torch.rand((m, n), dtype=dtype).to("cuda")) - - # Make torch.mm kwargs - mm_kwargs = {"input": ArgPool(As), "mat2": ArgPool(Bs), "out": ArgPool(Cs)} - - description = ( - f"single-lora roofline using torch.mm ({dtype_to_str(dtype)}" - f"x{dtype_to_str(dtype)}" - f"=>{dtype_to_str(dtype)})" - ) - cuda_graph_params = None - if cuda_graph_nops: - cuda_graph_params = CudaGraphBenchParams(cuda_graph_nops) - with Bench( - cuda_graph_params, - ctx.bench_label(), - ctx.bench_sublabel(op_type), - description, - torch.mm, - **mm_kwargs, - ) as bench: - return bench.run() - - -# runner -def use_cuda_graph_recommendation() -> str: - return """ - Triton kernels have a significant launch overhead with - launched directly via python. This overhead is more noticeable - for small the problem sizes. For these cases, it is recommended - to use the script with `--cuda-graph-nops N` to benchmark N - consecutive invocations of the benchmarking operations from - inside a CUDA Graph. Note that the returned measurement is for N - invocations of the operation. - """ - - -def print_timers(timers: list[TMeasurement], args: Optional[argparse.Namespace] = None): - compare = TBenchmark.Compare(timers) - compare.print() - - if args and args.cuda_graph_nops: - print( - f"Note : The timings reported above is for {args.cuda_graph_nops} " - "consecutive invocations of the benchmarking functions. " - f"Please divide by {args.cuda_graph_nops} for single invocation " - "timings." - ) - - print( - "Note on Comparison with torch.mm : The torch.mm numbers are " - "benchmark numbers of a simple matmul emulating the single lora " - "case. It is provided as a roofline for comparing our LoRA Kernel " - "implementations. It is expected that the LoRA kernels will be " - "slower than torch.mm in cases where num_loras is big. But for " - "small num_loras the goal should be to match the torch.mm numbers." 
- ) - - -def run(args: argparse.Namespace, bench_ctxs: list[BenchmarkContext]): - if args.cuda_graph_nops is not None: - assert args.cuda_graph_nops > 0 - print(f"Benchmarking {args.cuda_graph_nops} invocations inside a CUDA Graph") - else: - print(f"CUDA Graphs not enabled.\n{use_cuda_graph_recommendation()}") - - timers = [] - for bench_ctx in bench_ctxs: - for seq_len in args.seq_lengths: - bench_ops: list[OpType] = args.op_types - seq_len_timers = [] - for bench_op in bench_ops: - for num_slices in bench_op.num_slices(): - _ctx = bench_ctx.with_seq_length(seq_len).with_num_slices( - num_slices - ) - # Benchmark torch.mm as a roofline - seq_len_timers.append( - bench_torch_mm( - _ctx, args.arg_pool_size, bench_op, args.cuda_graph_nops - ) - ) - - # Benchmark bench_op - expand_fn_add_inputs = ( - [None] if bench_op.is_shrink_fn() else args.expand_fn_add_inputs - ) - for add_input_arg in expand_fn_add_inputs: - seq_len_timers.append( - bench_optype( - _ctx, - args.arg_pool_size, - bench_op, - args.cuda_graph_nops, - add_input_arg, - args.test_correctness, - ) - ) - - print_timers(seq_len_timers) - timers.extend(seq_len_timers) - - # Result stdout dump - print("== All Results ====") - print_timers(timers, args) - - if args.output_directory: - # Result file dump - od = Path(args.output_directory) - if not od.exists(): - od.mkdir() - - timestamp = int(time.time()) - pkl_file = od / f"lora_bench-{timestamp}.pkl" - print(f"Writing benchmarks to {pkl_file}") - with open(pkl_file, "wb") as f: - pickle.dump(timers, f) - - -def as_benchmark_contexts( - hidden_sizes: list[int], lora_ranks: list[int], args: argparse.Namespace -) -> list[BenchmarkContext]: - ctxs: list[BenchmarkContext] = [] - for batch_size, hidden_size, lora_rank, num_loras, sort_by_lora_id in product( # noqa - args.batch_sizes, - list(hidden_sizes), - lora_ranks, - args.num_loras, - args.sort_by_lora_id, - ): - ctxs.append( - BenchmarkContext( - batch_size=batch_size, - hidden_size=hidden_size, - lora_rank=lora_rank, - num_loras=num_loras, - num_active_loras=args.num_active_loras - if args.num_active_loras - else num_loras, - # To be filled based on the OpType to benchmark - seq_length=None, - sort_by_lora_id=sort_by_lora_id, - dtype=args.dtype, - # To be filled based on the OpType to benchmark - num_slices=None, - ) - ) - - return ctxs - - -def run_list_bench(args: argparse.Namespace): - print(args) - - print( - "List bench :\n" - f" Hidden Sizes {args.hidden_sizes}" - f" LoRA Ranks {args.lora_ranks}" - ) - - # Get all benchmarking contexts - bench_contexts: list[BenchmarkContext] = as_benchmark_contexts( - hidden_sizes=args.hidden_sizes, lora_ranks=args.lora_ranks, args=args - ) - - run(args, bench_contexts) - - -def run_range_bench(args: argparse.Namespace): - print(args) - - hidden_sizes = list( - range( - args.hidden_sizes_start, - args.hidden_sizes_end + 1, - args.hidden_sizes_increment, - ) - ) - lora_ranks = list( - range(args.lora_ranks_start, args.lora_ranks_end + 1, args.lora_ranks_increment) - ) - - print(f"Range bench :\n Hidden Sizes {hidden_sizes} LoRA Ranks {lora_ranks}") - - # Get all benchmarking contexts - bench_contexts: list[BenchmarkContext] = as_benchmark_contexts( - hidden_sizes=hidden_sizes, lora_ranks=lora_ranks, args=args - ) - - run(args, bench_contexts) - - -def run_model_bench(args: argparse.Namespace): - print(args) - - def hidden_sizes_from_model(model: str, tp_size: int) -> set[int]: - hidden_sizes = set() - for KN, tp_split_dim in WEIGHT_SHAPES[model]: - KN[tp_split_dim] = KN[tp_split_dim] // 
tp_size - hidden_sizes.add(KN[1]) - return hidden_sizes - - # Get all hidden sizes - hidden_sizes: set[int] = set() - for model_name, tp_size in product(args.models, args.tp_sizes): - hidden_sizes = hidden_sizes.union(hidden_sizes_from_model(model_name, tp_size)) - - print(f"Model bench :\n Hidden Sizes {hidden_sizes} LoRA Ranks {args.lora_ranks}") - - # Get all benchmarking contexts - bench_contexts: list[BenchmarkContext] = as_benchmark_contexts( - hidden_sizes=hidden_sizes, lora_ranks=args.lora_ranks, args=args - ) - - run(args, bench_contexts) - - -if __name__ == "__main__": - - def to_torch_dtype(dt): - if dt == "torch.float16": - return torch.float16 - if dt == "torch.bfloat16": - return torch.bfloat16 - raise ValueError("unsupported dtype") - - def get_bool(s: str) -> bool: - return s.lower() in ["true", "1"] - - def add_common_command_args(p: argparse.ArgumentParser): - p.add_argument( - "--dtype", - type=to_torch_dtype, - required=True, - help="Available options are ['torch.float16', 'torch.bfloat16']", - ) - - p.add_argument( - "--arg-pool-size", - type=int, - default=32, - help="Run profiles with a pool of input/output/meta tensors instead" - "of simply reusing the same tensors for all runs. A bigger arg-pool" - "mitigates hardware caching effects during benchmarking.", - ) - - p.add_argument( - "--cuda-graph-nops", - type=int, - help=( - "when set profiling is done using cudagraph, " - "with the given number of operations in a graph." - "Note that the measurement returned is the time " - "taken for N consecutive executions of the benchmarking " - "functions, where N is the value of this argument." - ), - ) - p.add_argument("--num-loras", nargs="+", type=int, default=DEFAULT_NUM_LORAS) - p.add_argument( - "--num-active-loras", - type=int, - default=None, - help="Active LoRAs. 
When None, all LoRAs are active", - ) - p.add_argument( - "--sort-by-lora-id", - nargs="+", - type=get_bool, - default=DEFAULT_SORT_BY_LORA_IDS, - ) - p.add_argument( - "--op-types", nargs="+", type=OpType.from_str, default=list(OpType) - ) - p.add_argument( - "--seq-lengths", nargs="+", type=int, default=DEFAULT_SEQ_LENGTHS - ) - p.add_argument( - "--batch-sizes", nargs="+", type=int, default=DEFAULT_BATCH_SIZES - ) - p.add_argument( - "--expand-fn-add-inputs", - nargs="+", - type=get_bool, - default=DEFAULT_EXPAND_FN_ADD_INPUTS, - ) - p.add_argument( - "-o", - "--output-directory", - type=str, - help=( - "Output directory to store a the list of benchmarking" - "TMeasurement objects as a pickle file" - ), - ) - - p.add_argument( - "--test-correctness", - action="store_true", - help=( - "When enabled, the benchmarking functions are tested" - "for correctness before the actual benchmarking" - ), - ) - - parser = FlexibleArgumentParser( - description=f""" -Benchmark LoRA kernels: - {use_cuda_graph_recommendation()} - - list_bench example: - python3 benchmarks/kernels/benchmark_lora.py list_bench --arg-pool-size 32 --batch-sizes 1 16 32 --dtype torch.float16 --hidden-sizes 2048 --lora-ranks 16 --num-loras 1 4 --op-types lora_shrink lora_expand --seq-lengths 1 16 --sort-by-lora-id 1 --cuda-graph-nops 32 - - model_bench example: - python3 benchmarks/kernels/benchmark_lora.py model_bench --models meta-llama/Llama-3-8b --arg-pool-size 32 --batch-sizes 1 16 32 --dtype torch.float16 --lora-ranks 16 --num-loras 1 4 --op-types lora_shrink lora_expand --seq-lengths 1 16 --sort-by-lora-id 1 --cuda-graph-nops 32 - - range_bench example: - python3 benchmarks/kernels/benchmark_lora.py range_bench --arg-pool-size 32 --batch-sizes 1 16 32 --dtype torch.float16 --num-loras 1 4 --op-types lora_shrink lora_expand --seq-lengths 1 16 --sort-by-lora-id 1 --cuda-graph-nops 32 --hidden-sizes-start 1024 --hidden-sizes-end 4096 --hidden-sizes-increment 1024 --lora-ranks-start 8 --lora-ranks-end 24 --lora-ranks-increment 8 - """, # noqa: E501 - formatter_class=argparse.RawTextHelpFormatter, - ) - - subparsers = parser.add_subparsers(dest="cmd", required=True) - - list_parser = subparsers.add_parser("list_bench") - list_parser.add_argument( - "--hidden-sizes", nargs="+", type=int, default=DEFAULT_HIDDEN_SIZES - ) - list_parser.add_argument( - "--lora-ranks", nargs="+", type=int, default=DEFAULT_LORA_RANKS - ) - add_common_command_args(list_parser) - list_parser.set_defaults(func=run_list_bench) - - range_parser = subparsers.add_parser("range_bench") - range_parser.add_argument("--hidden-sizes-start", type=int, required=True) - range_parser.add_argument("--hidden-sizes-end", type=int, required=True) - range_parser.add_argument("--hidden-sizes-increment", type=int, required=True) - range_parser.add_argument("--lora-ranks-start", type=int, required=True) - range_parser.add_argument("--lora-ranks-end", type=int, required=True) - range_parser.add_argument("--lora-ranks-increment", type=int, required=True) - add_common_command_args(range_parser) - range_parser.set_defaults(func=run_range_bench) - - model_parser = subparsers.add_parser("model_bench") - model_parser.add_argument( - "--models", - nargs="+", - type=str, - default=DEFAULT_MODELS, - choices=WEIGHT_SHAPES.keys(), - ) - model_parser.add_argument( - "--tp-sizes", nargs="+", type=int, default=DEFAULT_TP_SIZES - ) - model_parser.add_argument( - "--lora-ranks", nargs="+", type=int, default=DEFAULT_LORA_RANKS - ) - add_common_command_args(model_parser) - 
model_parser.set_defaults(func=run_model_bench) - - args = parser.parse_args() - args.func(args) diff --git a/benchmarks/kernels/benchmark_machete.py b/benchmarks/kernels/benchmark_machete.py deleted file mode 100644 index 1b1c3b321..000000000 --- a/benchmarks/kernels/benchmark_machete.py +++ /dev/null @@ -1,745 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import argparse -import copy -import itertools -import math -import os -import pickle as pkl -import time -from collections.abc import Iterable -from dataclasses import dataclass -from itertools import product -from typing import Callable, Optional - -import pandas as pd -import torch -import torch.utils.benchmark as TBenchmark -from torch.utils.benchmark import Measurement as TMeasurement -from weight_shapes import WEIGHT_SHAPES - -from vllm import _custom_ops as ops -from vllm.model_executor.layers.quantization.utils.marlin_utils import ( - GPTQ_MARLIN_MAX_PARALLEL, - GPTQ_MARLIN_MIN_THREAD_N, - marlin_permute_scales, - marlin_zero_points, -) -from vllm.model_executor.layers.quantization.utils.marlin_utils_test import ( - MarlinWorkspace, -) -from vllm.model_executor.layers.quantization.utils.quant_utils import ( - pack_rows, - quantize_weights, -) -from vllm.scalar_type import ScalarType, scalar_types -from vllm.utils import FlexibleArgumentParser - -DEFAULT_MODELS = ["meta-llama/Llama-3-8b", "meta-llama/Llama-2-70b-hf"] -DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512, 1024] -DEFAULT_TP_SIZES = [1] - -NVTX_PROFILE = os.environ.get("NVTX_PROFILE", False) - -if NVTX_PROFILE: - import nvtx - - -def terse_type_name(dt): - return { - torch.bfloat16: "bf16", - torch.float16: "fp16", - torch.int8: "int8", - torch.float8_e4m3fn: "fp8", - torch.float: "float", - torch.int: "int", - }[dt] - - -@dataclass -class BenchmarkTensors: - w_ref: torch.Tensor - a: torch.Tensor - - w_q: torch.Tensor - group_size: Optional[int] - wtype: ScalarType - w_g_s: torch.Tensor - w_g_zp: Optional[torch.Tensor] - w_ch_s: Optional[torch.Tensor] - w_tok_s: Optional[torch.Tensor] - - -@dataclass -class TypeConfig: - act_type: torch.dtype - weight_type: ScalarType - output_type: Optional[torch.dtype] - group_scale_type: Optional[torch.dtype] - group_zero_type: Optional[torch.dtype] - channel_scale_type: Optional[torch.dtype] - token_scale_type: Optional[torch.dtype] - - -def rand_data(shape, dtype=torch.float16, scale=1): - if dtype.is_floating_point: - return (scale * torch.rand(shape, device="cuda") - 0.3).to(dtype) - else: - return torch.randint(-15, 15, shape, dtype=dtype, device="cuda") - - -def quantize_and_pack( - atype: torch.dtype, - w: torch.Tensor, - wtype: ScalarType, - stype: Optional[torch.dtype], - group_size: Optional[int], - zero_points: bool = False, -): - assert wtype.is_integer(), "TODO: support floating point weights" - - w_ref, w_q, w_s, w_zp = quantize_weights( - w, - wtype, - group_size=group_size, - zero_points=zero_points, - # to match how the kernel applies zps - ref_zero_points_after_scales=True, - ) - - w_q = pack_rows(w_q, wtype.size_bits, *w_q.shape) - return w_ref, w_q, w_s, w_zp - - -def create_bench_tensors( - shape: tuple[int, int, int], types: TypeConfig, group_size: Optional[int] -) -> list[BenchmarkTensors]: - m, n, k = shape - - # we want to make sure that weights don't fit into L2 cache between runs so - # we construct enough weights to exceed L2 cache, which is 50mb on a H100 - # so we target total weight size > 2*50mb - num_weights = math.ceil( - 2 
* 50 * 1024**2 * 8 / (k * n * types.weight_type.size_bits) - ) - - a = rand_data((m, k), types.act_type, scale=5) - - benchmark_tensors: list[BenchmarkTensors] = [] - for _ in range(num_weights): - w = rand_data((k, n), types.act_type, scale=5) - - if types.group_scale_type is not None: - w = w.to(types.group_scale_type) - if w.dtype.itemsize == 1: - w = w.to(torch.float16) - - w_ref, w_q_packed, w_s, w_zp = quantize_and_pack( - a.dtype, - w, - types.weight_type, - types.group_scale_type, - group_size, - types.group_zero_type is not None, - ) - - if not a.dtype.is_floating_point: - aiinfo = torch.iinfo(a.dtype) - w_ref = w_ref.round().clamp(aiinfo.min, aiinfo.max) - - w_ref = w_ref.to(torch.float32) - - w_ch_s = ( - None - if types.channel_scale_type is None - else rand_data((n,), types.channel_scale_type) - ) - w_tok_s = ( - None - if types.token_scale_type is None - else rand_data((m,), types.token_scale_type) - ) - - benchmark_tensors.append( - BenchmarkTensors( - w_ref=w_ref, - a=a, - w_q=w_q_packed, - wtype=types.weight_type, - w_g_s=w_s, - w_g_zp=w_zp, - group_size=group_size, - w_ch_s=w_ch_s, - w_tok_s=w_tok_s, - ) - ) - - return benchmark_tensors - - -def torch_matmul_f16_create_bench_fn(bt: BenchmarkTensors) -> Callable: - a = bt.a - w = bt.w_ref.to(bt.a.dtype) # use float reference tensor - if a.dtype not in [torch.float16, torch.bfloat16]: - a = a.to(torch.float16) - w = w.to(torch.float16) - return lambda: torch.matmul(a, w) - - -def cutlass_scaled_mm_create_bench_fn(bt: BenchmarkTensors) -> Callable: - if bt.w_ch_s is not None and bt.w_tok_s is not None: - scale_a = bt.w_tok_s.to(torch.float32) - scale_b = bt.w_ch_s.to(torch.float32) - else: - scale_a = torch.tensor(1.0, dtype=torch.float32, device=bt.a.device) - scale_b = torch.tensor(1.0, dtype=torch.float32, device=bt.a.device) - w_col_major = bt.w_ref.to(bt.a.dtype).t().contiguous().t() - return lambda: ops.cutlass_scaled_mm( - bt.a, w_col_major, scale_a, scale_b, out_dtype=torch.float16 - ) - - -def marlin_create_bench_fn(bt: BenchmarkTensors) -> Callable: - device = bt.a.device - - workspace = MarlinWorkspace( - bt.w_ref.shape[1], GPTQ_MARLIN_MIN_THREAD_N, GPTQ_MARLIN_MAX_PARALLEL - ) - - if bt.w_g_zp is None: - w_zp = torch.empty(0, dtype=torch.int, device=device) - else: - w_zp = marlin_zero_points( - bt.w_g_zp, bt.w_ref.shape[0], bt.w_ref.shape[1], bt.wtype.size_bits - ) - - if bt.group_size is None: - w_s = torch.tensor([], device="cuda", dtype=torch.half) - else: - w_s = marlin_permute_scales( - bt.w_g_s, bt.w_ref.shape[0], bt.w_ref.shape[1], bt.group_size - ) - - sort_indices = torch.empty(0, dtype=torch.int, device=device) - g_idx = torch.empty(0, dtype=torch.int, device=device) - w_q = ops.gptq_marlin_repack( - bt.w_q, sort_indices, bt.w_ref.shape[0], bt.w_ref.shape[1], bt.wtype.size_bits - ) - - if bt.a.dtype.is_floating_point: - assert bt.w_ch_s is None - assert bt.w_tok_s is None - assert bt.group_size is not None - - fn = lambda: ops.gptq_marlin_gemm( - a=bt.a, - c=None, - b_q_weight=w_q, - b_bias=None, - b_scales=w_s, - global_scale=None, - b_zeros=w_zp, - g_idx=g_idx, - perm=sort_indices, - workspace=workspace.scratch, - b_q_type=bt.wtype, - size_m=bt.a.shape[0], - size_n=bt.w_ref.shape[1], - size_k=bt.w_ref.shape[0], - is_k_full=True, - is_zp_float=False, - ) - else: - assert bt.a.dtype == torch.int8 - assert bt.wtype == scalar_types.uint4b8 - raise NotImplementedError("QQQ is not supported anymore") - - return fn - - -def machete_create_bench_fn( - bt: BenchmarkTensors, out_type=torch.dtype, schedule=None 
-) -> Callable: - w_q = bt.w_q.t().contiguous().t() # make col major - w_q = ops.machete_prepack_B( - w_q, bt.a.dtype, bt.wtype, None if bt.w_g_s is None else bt.w_g_s.dtype - ) - - w_g_zp = bt.w_g_zp - if w_g_zp is not None: - w_g_zp = -1 * bt.w_g_s * (w_g_zp.to(bt.w_g_s.dtype)) - - return lambda: ops.machete_mm( - a=bt.a, - b_q=w_q, - b_type=bt.wtype, - b_group_scales=bt.w_g_s, - b_group_zeros=w_g_zp, - b_group_size=bt.group_size, - b_channel_scales=bt.w_ch_s, - a_token_scales=bt.w_tok_s, - out_type=out_type, - schedule=schedule, - ) - - -def cutlass_w4a8_create_bench_fn( - bt: BenchmarkTensors, out_type=torch.dtype, schedule=None -) -> Callable: - w_q = bt.w_q.t().contiguous().t() # make col major - w_q = ops.cutlass_encode_and_reorder_int4b(w_q) - # expects fp8 scales - w_s = ops.cutlass_pack_scale_fp8(bt.w_g_s.to(torch.float8_e4m3fn)) - - return lambda: ops.cutlass_w4a8_mm( - a=bt.a, - b_q=w_q, - b_group_scales=w_s, - b_group_size=bt.group_size, - b_channel_scales=bt.w_ch_s, - a_token_scales=bt.w_tok_s, - maybe_schedule=schedule, - ) - - -# impl - -# bench - - -def bench_fns(label: str, sub_label: str, description: str, fns: list[Callable]): - min_run_time = 1 if not NVTX_PROFILE else 0.1 - res = TBenchmark.Timer( - stmt=""" - for fn in fns: - fn() - """, - globals={"fns": fns}, - label=label, - sub_label=sub_label, - description=description, - ).blocked_autorange(min_run_time=min_run_time) - - if NVTX_PROFILE: - with ( - nvtx.annotate("mm-bench"), - nvtx.annotate(f"{label}|{sub_label}|{description}"), - ): - fns[0]() - - return res - - -_SWEEP_SCHEDULES_RESULTS: Optional[pd.DataFrame] = None -_SWEEP_SCHEDULES_RESULTS_CSV: Optional[str] = None - - -def bench( - types: TypeConfig, - group_size: int, - m: int, - k: int, - n: int, - label: str, - sub_label: str, - sweep_schedules: bool = True, -) -> list[TMeasurement]: - benchmark_tensors = create_bench_tensors((m, n, k), types, group_size) - sub_label += f", L={len(benchmark_tensors)}" - - name_type_string = f"W{types.weight_type}" + f"-A{terse_type_name(types.act_type)}" - if types.group_scale_type is not None: - name_type_string += f"-GS{terse_type_name(types.group_scale_type)}" - if types.group_zero_type is not None: - name_type_string += f"-GZ{terse_type_name(types.group_zero_type)}" - if group_size is not None: - name_type_string += f"-G{group_size}" - if types.channel_scale_type is not None: - name_type_string += f"-CS{terse_type_name(types.channel_scale_type)}" - if types.token_scale_type is not None: - name_type_string += f"-TS{terse_type_name(types.token_scale_type)}" - - timers = [] - # pytorch impl - timers.append( - bench_fns( - label, - sub_label, - "torch.matmul (fp16)", - [torch_matmul_f16_create_bench_fn(bt) for bt in benchmark_tensors], - ) - ) - - if types.act_type == torch.int8 or types.act_type == torch.float8_e4m3fn: - timers.append( - bench_fns( - label, - sub_label, - f"cutlass_scaled_mm ({terse_type_name(types.act_type)})", - [cutlass_scaled_mm_create_bench_fn(bt) for bt in benchmark_tensors], - ) - ) - - if types.act_type != torch.float8_e4m3fn: - timers.append( - bench_fns( - label, - sub_label, - f"marlin ({name_type_string})", - [marlin_create_bench_fn(bt) for bt in benchmark_tensors], - ) - ) - - # machete - timers.append( - bench_fns( - label, - sub_label, - f"machete ({name_type_string})", - [ - machete_create_bench_fn(bt, out_type=types.output_type) - for bt in benchmark_tensors - ], - ) - ) - - # cutlass w4a8 - if types.act_type == torch.float8_e4m3fn and group_size == 128: - timers.append( - bench_fns( 
- label, - sub_label, - f"cutlass w4a8 ({name_type_string})", - [ - cutlass_w4a8_create_bench_fn(bt, out_type=types.output_type) - for bt in benchmark_tensors - ], - ) - ) - - if sweep_schedules: - global _SWEEP_SCHEDULES_RESULTS - - print("Finding best schedule for machete") - best = None - best_schedule = None - schedules = ops.machete_supported_schedules( - a_type=types.act_type, - b_type=types.weight_type, - group_scales_type=types.group_scale_type, - group_zeros_type=types.group_zero_type, - token_scales_type=types.token_scale_type, - channel_scales_type=types.channel_scale_type, - out_type=types.output_type, - ) - - if schedules is None or len(schedules) == 0: - raise ValueError("No schedules found to sweep") - - for schedule in reversed(schedules): - schedule_M = int(schedule.split("_")[0].split("x")[1]) - - # Prune known bad schedules - if schedule_M >= 2 * max(m, 16) or schedule_M < m // 4: - continue - - res = bench_fns( - label, - sub_label, - "machete_best", - [ - machete_create_bench_fn( - bt, out_type=types.output_type, schedule=schedule - ) - for bt in benchmark_tensors - ], - ) - - results_row = { - "M": m, - "K": k, - "N": n, - "group_size": group_size, - "schedule": schedule, - "median": res.median, - } - if _SWEEP_SCHEDULES_RESULTS is None: - _SWEEP_SCHEDULES_RESULTS = pd.DataFrame(columns=results_row.keys()) - _SWEEP_SCHEDULES_RESULTS.loc[len(_SWEEP_SCHEDULES_RESULTS)] = results_row - - print(f" {res.median:5.5} ", schedule) - if not best or res.median < best.median: - best = res - best_schedule = schedule - print("Best schedule:", best_schedule) - timers.append(best) - - return timers - - -# runner -def print_timers(timers: list[TMeasurement]): - compare = TBenchmark.Compare(timers) - compare.print() - - -def run(args, MKNs: Iterable[tuple[int, int, int]]) -> Iterable[TMeasurement]: - types = TypeConfig( - act_type=args.act_type, - weight_type=scalar_types.uint4b8 - if args.group_zero_type is None - else scalar_types.uint4, - output_type=args.out_type, - group_scale_type=args.group_scale_type, - group_zero_type=args.group_zero_type, - channel_scale_type=args.channel_scale_type, - token_scale_type=args.token_scale_type, - ) - - results: list[TMeasurement] = [] - for m, k, n in MKNs: - timers = bench( - types, - args.group_size, - m, - k, - n, - f"{args.act_type}-gemm", - f"MKN=({m}x{k}x{n})", - sweep_schedules=args.sweep_schedules, - ) - print_timers(timers) - results.extend(timers) - - return results - - -# output makers -def make_output( - data: list[TMeasurement], - MKNs: Iterable[tuple[int, int, int]], - base_description: str, - timestamp=None, -): - print(f"== All Results {base_description} ====") - print_timers(data) - - # pickle all the results - timestamp = int(time.time()) if timestamp is None else timestamp - with open(f"{base_description}-{timestamp}.pkl", "wb") as f: - pkl.dump(data, f) - - -# argparse runners - - -def run_square_bench(args): - dim_sizes = list(range(args.dim_start, args.dim_end + 1, args.dim_increment)) - MKNs = list(zip(dim_sizes, dim_sizes, dim_sizes)) - data = run(args.dtype, args.sweep_schedules, MKNs) - - make_output(data, MKNs, f"square_bench-{args.dtype}") - - -def run_range_bench(args): - m_start, k_start, n_start = (int(x) for x in args.dim_start.split(",")) - m_end, k_end, n_end = (int(x) for x in args.dim_end.split(",")) - m_increment, k_increment, n_increment = ( - int(x) for x in args.dim_increment.split(",") - ) - Ms = list(range(m_start, m_end + 1, m_increment)) - Ks = list(range(k_start, k_end + 1, k_increment)) - Ns = 
list(range(n_start, n_end + 1, n_increment)) - MKNs = list(product(Ms, Ks, Ns)) - - data = run(args.dtype, args.sweep_schedules, MKNs) - - make_output(data, MKNs, f"range_bench-{args.dtype}") - - -def run_model_bench(args): - print("Benchmarking models:") - for i, model in enumerate(args.models): - print(f"[{i}] {model}") - - def model_shapes(model_name: str, tp_size: int) -> list[tuple[int, int]]: - KNs = [] - for KN, tp_split_dim in copy.deepcopy(WEIGHT_SHAPES[model_name]): - KN[tp_split_dim] = KN[tp_split_dim] // tp_size - KNs.append(KN) - return KNs - - model_bench_data = [] - models_tps = list(itertools.product(args.models, args.tp_sizes)) - for model, tp_size in models_tps: - Ms = args.batch_sizes - KNs = model_shapes(model, tp_size) - MKNs = [] - for m in Ms: - for k, n in KNs: - MKNs.append((m, k, n)) - - data = run(args, MKNs) - model_bench_data.append(data) - - type_string = f"{args.act_type}" - - # Print all results - for data, model_tp in zip(model_bench_data, models_tps): - model, tp_size = model_tp - print(f"== Results {type_string} {model}-TP{tp_size} ====") - print_timers(data) - - timestr = time.strftime("%Y%m%d-%H%M%S") - - all_results = [] - for d in model_bench_data: - all_results.extend(d) - - # pickle all data - with open(f"model_bench-{type_string}-{timestr}.pkl", "wb") as f: - args_dict = vars(args) - args_dict.pop("func") - pkl.dump( - { - "args": args_dict, - "results": all_results, - }, - f, - ) - - -if __name__ == "__main__": - - def to_torch_dtype(dt): - return { - "bfloat16": torch.bfloat16, - "float16": torch.float16, - "int8": torch.int8, - "float8_e4m3fn": torch.float8_e4m3fn, - "int": torch.int, - "float": torch.float, - }[dt] - - class ToTorchDtype(argparse.Action): - def __call__(self, parser, namespace, values, option_string=None): - setattr(namespace, self.dest, to_torch_dtype(values)) - - parser = FlexibleArgumentParser( - description=""" -Benchmark Machete GEMM. - - To run square GEMMs: - python3 ./benchmarks/kernels/benchmark_machete.py --dtype float16 square_bench --dim-start 128 --dim-end 512 --dim-increment 64 - - To run constant N and K and sweep M: - python3 ./benchmarks/kernels/benchmark_machete.py --dtype float16 range_bench --dim-start 128 --dim-end 512 --dim-increment 64 --n-constant 16384 --k-constant 16384 - - To run dimensions from a model: - python3 ./benchmarks/kernels/benchmark_machete.py --dtype float16 model_bench --models meta-llama/Llama-2-7b-hf --batch-sizes 16 --tp-sizes 1 - - Output: - - a .pkl file, that is a list of raw torch.benchmark.utils.Measurements for the pytorch and cutlass implementations for the various GEMMs. 
- """, # noqa: E501 - formatter_class=argparse.RawTextHelpFormatter, - ) - parser.add_argument( - "--act-type", - action=ToTorchDtype, - required=True, - choices=["bfloat16", "float16", "int8", "float8_e4m3fn"], - ) - parser.add_argument( - "--group-scale-type", - action=ToTorchDtype, - choices=["bfloat16", "float16"], - ) - parser.add_argument( - "--group-zero-type", - type=to_torch_dtype, - choices=["bfloat16", "float16"], - ) - parser.add_argument( - "--channel-scale-type", - action=ToTorchDtype, - choices=["float"], - ) - parser.add_argument( - "--token-scale-type", - action=ToTorchDtype, - choices=["float"], - ) - parser.add_argument( - "--out-type", - action=ToTorchDtype, - choices=["bfloat16", "float16"], - ) - parser.add_argument( - "--group-size", - type=int, - help="Available options are ['None', '-1', '128'], default=128", - default=128, - ) - parser.add_argument( - "--sweep-schedules", - action="store_true", - help="Run a sweep over all supported schedules", - ) - parser.add_argument( - "--sweep-csv-out", - help="CSV to store sweep results", - default="sch_sweep_results.csv", - ) - subparsers = parser.add_subparsers(dest="cmd", required=True) - - square_parser = subparsers.add_parser("square_bench") - square_parser.add_argument("--dim-start", type=int, required=True) - square_parser.add_argument("--dim-end", type=int, required=True) - square_parser.add_argument("--dim-increment", type=int, required=True) - square_parser.set_defaults(func=run_square_bench) - - range_parser = subparsers.add_parser("range_bench") - range_parser.add_argument( - "--dim-start", - type=str, - required=True, - help="Start value for M,K,N as common separated list", - ) - range_parser.add_argument( - "--dim-end", - type=str, - required=True, - help="End value (inclusive) for M,K,N as common separated list", - ) - range_parser.add_argument( - "--dim-increment", - type=str, - required=True, - help="Increment value for M,K,N as common separated list", - ) - range_parser.set_defaults(func=run_range_bench) - - model_parser = subparsers.add_parser("model_bench") - model_parser.add_argument( - "--models", - nargs="+", - type=str, - default=DEFAULT_MODELS, - choices=WEIGHT_SHAPES.keys(), - ) - model_parser.add_argument( - "--tp-sizes", nargs="+", type=int, default=DEFAULT_TP_SIZES - ) - model_parser.add_argument( - "--batch-sizes", nargs="+", type=int, default=DEFAULT_BATCH_SIZES - ) - model_parser.set_defaults(func=run_model_bench) - - args = parser.parse_args() - - _SWEEP_SCHEDULES_RESULTS_CSV = args.sweep_csv_out - args.func(args) - - if _SWEEP_SCHEDULES_RESULTS is not None: - _SWEEP_SCHEDULES_RESULTS.to_csv(_SWEEP_SCHEDULES_RESULTS_CSV) diff --git a/benchmarks/kernels/benchmark_marlin.py b/benchmarks/kernels/benchmark_marlin.py deleted file mode 100644 index 34cc45e94..000000000 --- a/benchmarks/kernels/benchmark_marlin.py +++ /dev/null @@ -1,413 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import torch -import torch.utils.benchmark as benchmark -from benchmark_shapes import WEIGHT_SHAPES - -from vllm import _custom_ops as ops -from vllm.model_executor.layers.quantization.gptq_marlin_24 import ( - GPTQ_MARLIN_24_MAX_PARALLEL, - GPTQ_MARLIN_24_MIN_THREAD_N, - GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES, - GPTQ_MARLIN_24_SUPPORTED_QUANT_TYPES, -) -from vllm.model_executor.layers.quantization.utils.allspark_utils import ( - ALLSPARK_AMPERE_M_CUBLAS_THRESHOLD, - ALLSPARK_SUPPORTED_QUANT_TYPES, -) -from 
vllm.model_executor.layers.quantization.utils.marlin_utils import ( - GPTQ_MARLIN_MAX_PARALLEL, - GPTQ_MARLIN_MIN_THREAD_N, - MARLIN_SUPPORTED_GROUP_SIZES, - query_marlin_supported_quant_types, -) -from vllm.model_executor.layers.quantization.utils.marlin_utils_fp4 import ( - FP4_MARLIN_SUPPORTED_GROUP_SIZES, - rand_marlin_weight_fp4_like, -) -from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import ( - marlin_quant_fp8_torch, -) -from vllm.model_executor.layers.quantization.utils.marlin_utils_test import ( - MarlinWorkspace, - awq_marlin_quantize, - marlin_quantize, -) -from vllm.model_executor.layers.quantization.utils.marlin_utils_test_24 import ( - marlin_24_quantize, -) -from vllm.model_executor.layers.quantization.utils.quant_utils import ( - gptq_pack, - gptq_quantize_weights, - quantize_weights, - sort_weights, -) -from vllm.scalar_type import ScalarType, scalar_types -from vllm.utils import FlexibleArgumentParser - -DEFAULT_MODELS = ["meta-llama/Llama-2-7b-hf/TP1"] -DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192] - -ACT_ORDER_OPTS = [False, True] -K_FULL_OPTS = [False, True] - - -def bench_run( - results: list[benchmark.Measurement], - model: str, - act_order: bool, - is_k_full: bool, - quant_type: ScalarType, - group_size: int, - size_m: int, - size_k: int, - size_n: int, -): - label = "Quant Matmul" - sub_label = "{}, act={} k_full={}, q={}, g={}, MKN=({}x{}x{})".format( - model, act_order, is_k_full, str(quant_type), group_size, size_m, size_k, size_n - ) - print(f"Testing: {sub_label}") - - a = torch.randn(size_m, size_k).to(torch.half).cuda() - b = torch.rand(size_k, size_n).to(torch.half).cuda() - has_zp = quant_type in [scalar_types.uint4, scalar_types.uint8] - if act_order and (group_size == -1 or group_size == size_k or has_zp): - return - if size_k % group_size != 0: - return - - marlin_24_supported = ( - quant_type in GPTQ_MARLIN_24_SUPPORTED_QUANT_TYPES - and group_size in GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES - ) - repack_supported = ( - quant_type in GPTQ_MARLIN_24_SUPPORTED_QUANT_TYPES - and group_size in MARLIN_SUPPORTED_GROUP_SIZES - ) - allspark_supported = ( - quant_type in ALLSPARK_SUPPORTED_QUANT_TYPES - and group_size == -1 - and not act_order - and is_k_full - ) - - def gen_marlin_params(): - # Marlin quant - marlin_g_idx = marlin_sort_indices = marlin_zp = marlin_s2 = None - if quant_type == scalar_types.float4_e2m1f: - if group_size != 16 or act_order: - return - marlin_w_ref, marlin_q_w, marlin_s, marlin_s2 = rand_marlin_weight_fp4_like( - b.T, group_size - ) - elif quant_type == scalar_types.float8_e4m3fn: - if group_size not in [-1, 128] or act_order: - return - marlin_w_ref, marlin_q_w, marlin_s = marlin_quant_fp8_torch(b.T, group_size) - elif group_size == 16: - return - elif has_zp: - marlin_w_ref, marlin_q_w, marlin_s, marlin_zp = awq_marlin_quantize( - b, quant_type, group_size - ) - else: - marlin_w_ref, marlin_q_w, marlin_s, marlin_g_idx, marlin_sort_indices, _ = ( - marlin_quantize(b, quant_type, group_size, act_order) - ) - return ( - marlin_w_ref, - marlin_q_w, - marlin_s, - marlin_s2, - marlin_zp, - marlin_g_idx, - marlin_sort_indices, - ) - - def gen_marlin_24_params(): - marlin_24_w_ref = marlin_24_q_w_comp = marlin_24_meta = marlin_24_s = None - if marlin_24_supported: - (marlin_24_w_ref, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s) = ( - marlin_24_quantize(b, quant_type, group_size) - ) - return (marlin_24_w_ref, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s) - - def gen_repack_params(): 
- q_w_gptq = None - repack_sort_indices = None - if repack_supported: - (w_ref, q_w, s, g_idx, rand_perm) = gptq_quantize_weights( - b, quant_type, group_size, act_order - ) - q_w_gptq = gptq_pack(q_w, quant_type.size_bits, size_k, size_n) - - # For act_order, sort the "weights" and "g_idx" - # so that group ids are increasing - repack_sort_indices = torch.empty(0, dtype=torch.int, device=b.device) - if act_order: - (q_w, g_idx, repack_sort_indices) = sort_weights(q_w, g_idx) - return q_w_gptq, repack_sort_indices - - def gen_allspark_params(): - qw_reorder = s_reorder = zp_reorder = sm_count = sm_version = ( - CUBLAS_M_THRESHOLD - ) = None - nonlocal allspark_supported - if allspark_supported: - properties = torch.cuda.get_device_properties(b.device.index) - sm_count = properties.multi_processor_count - sm_version = properties.major * 10 + properties.minor - - supported_arch = sm_version >= 80 and sm_version < 90 - allspark_supported = allspark_supported and supported_arch - if supported_arch: - w_ref, qw, s, zp = quantize_weights(b, quant_type, group_size, has_zp) - qw = qw.to(torch.uint8) - - qw_reorder, s_reorder, zp_reorder = ops.allspark_repack_weight( - qw, s, zp, has_zp - ) - CUBLAS_M_THRESHOLD = ALLSPARK_AMPERE_M_CUBLAS_THRESHOLD - return ( - qw_reorder, - s_reorder, - zp_reorder, - sm_count, - sm_version, - CUBLAS_M_THRESHOLD, - ) - - ( - marlin_w_ref, - marlin_q_w, - marlin_s, - marlin_s2, - marlin_zp, - marlin_g_idx, - marlin_sort_indices, - ) = gen_marlin_params() - marlin_24_w_ref, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s = ( - gen_marlin_24_params() - ) - q_w_gptq, repack_sort_indices = gen_repack_params() - qw_reorder, s_reorder, zp_reorder, sm_count, sm_version, CUBLAS_M_THRESHOLD = ( - gen_allspark_params() - ) - - # Prepare - marlin_workspace = MarlinWorkspace( - size_n, GPTQ_MARLIN_MIN_THREAD_N, GPTQ_MARLIN_MAX_PARALLEL - ) - marlin_24_workspace = MarlinWorkspace( - size_n, GPTQ_MARLIN_24_MIN_THREAD_N, GPTQ_MARLIN_24_MAX_PARALLEL - ) - - globals = { - # Gen params - "quant_type": quant_type, - "group_size": group_size, - "size_m": size_m, - "size_n": size_n, - "size_k": size_k, - "a": a, - # Marlin params - "marlin_w_ref": marlin_w_ref, - "marlin_q_w": marlin_q_w, - "marlin_s": marlin_s, - "marlin_s2": marlin_s2, - "marlin_zp": marlin_zp, - "marlin_g_idx": marlin_g_idx, - "marlin_sort_indices": marlin_sort_indices, - "marlin_workspace": marlin_workspace, - "is_k_full": is_k_full, - # Marlin_24 params - "marlin_24_w_ref": marlin_24_w_ref, - "marlin_24_q_w_comp": marlin_24_q_w_comp, - "marlin_24_meta": marlin_24_meta, - "marlin_24_s": marlin_24_s, - "marlin_24_workspace": marlin_24_workspace, - # GPTQ params - "q_w_gptq": q_w_gptq, - "repack_sort_indices": repack_sort_indices, - # AllSpark W8A16 params - "qw_reorder": qw_reorder, - "s_reorder": s_reorder, - "zp_reorder": zp_reorder, - "sm_count": sm_count, - "sm_version": sm_version, - "CUBLAS_M_THRESHOLD": CUBLAS_M_THRESHOLD, - # Kernels - "gptq_marlin_gemm": ops.gptq_marlin_gemm, - "gptq_marlin_24_gemm": ops.gptq_marlin_24_gemm, - "gptq_marlin_repack": ops.gptq_marlin_repack, - "allspark_w8a16_gemm": ops.allspark_w8a16_gemm, - } - - min_run_time = 1 - - # Warmup pytorch - for _ in range(5): - torch.matmul(a, marlin_w_ref) - - results.append( - benchmark.Timer( - stmt="torch.matmul(a, marlin_w_ref)", - globals=globals, - label=label, - sub_label=sub_label, - description="pytorch_gemm", - ).blocked_autorange(min_run_time=min_run_time) - ) - - results.append( - benchmark.Timer( - stmt="output = gptq_marlin_gemm(a, 
None, marlin_q_w, marlin_s, marlin_s2, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, False, False)", # noqa: E501 - globals=globals, - label=label, - sub_label=sub_label, - description="gptq_marlin_gemm", - ).blocked_autorange(min_run_time=min_run_time) - ) - - results.append( - benchmark.Timer( - stmt="output = gptq_marlin_gemm(a, None, marlin_q_w, marlin_s, marlin_s2, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, True, False)", # noqa: E501 - globals=globals, - label=label, - sub_label=sub_label, - description="gptq_marlin_gemm_fp32", - ).blocked_autorange(min_run_time=min_run_time) - ) - - if marlin_24_supported: - results.append( - benchmark.Timer( - stmt="output = gptq_marlin_24_gemm(a, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s, marlin_24_workspace.scratch, quant_type, size_m, size_n, size_k)", # noqa: E501 - globals=globals, - label=label, - sub_label=sub_label, - description="gptq_marlin_24_gemm", - ).blocked_autorange(min_run_time=min_run_time) - ) - - if repack_supported: - results.append( - benchmark.Timer( - stmt="q_res = gptq_marlin_repack(q_w_gptq, repack_sort_indices, size_k, size_n, quant_type.size_bits)", # noqa: E501 - globals=globals, - label=label, - sub_label=sub_label, - description="gptq_marlin_repack", - ).blocked_autorange(min_run_time=min_run_time) - ) - - if allspark_supported: - results.append( - benchmark.Timer( - stmt="output = allspark_w8a16_gemm(a, qw_reorder, s_reorder, zp_reorder, size_n, group_size, sm_count, sm_version, CUBLAS_M_THRESHOLD, False, True)", # noqa: E501 - globals=globals, - label=label, - sub_label=sub_label, - description="allspark_w8a16_gemm_fp32", - ).blocked_autorange(min_run_time=min_run_time) - ) - - -def main(args): - print("Benchmarking models:") - for i, model in enumerate(args.models): - print(f"[{i}] {model}") - results: list[benchmark.Measurement] = [] - - for model in args.models: - for layer in WEIGHT_SHAPES[model]: - size_k = layer[0] - size_n = layer[1] - - if len(args.limit_k) > 0 and size_k not in args.limit_k: - continue - - if len(args.limit_n) > 0 and size_n not in args.limit_n: - continue - - for act_order in ACT_ORDER_OPTS: - if ( - len(args.limit_act_order) > 0 - and act_order not in args.limit_act_order - ): - continue - - for is_k_full in K_FULL_OPTS: - if ( - len(args.limit_k_full) > 0 - and is_k_full not in args.limit_k_full - ): - continue - - for quant_type in query_marlin_supported_quant_types(): - if ( - len(args.limit_num_bits) > 0 - and quant_type.size_bits not in args.limit_num_bits - ): - continue - - for group_size in ( - MARLIN_SUPPORTED_GROUP_SIZES - + FP4_MARLIN_SUPPORTED_GROUP_SIZES - ): - if ( - len(args.limit_group_size) > 0 - and group_size not in args.limit_group_size - ): - continue - - # For act_order, the group_size must be less than - # size_k - if act_order and (group_size == size_k or group_size == -1): - continue - - for size_m in args.batch_sizes: - bench_run( - results, - model, - act_order, - is_k_full, - quant_type, - group_size, - size_m, - size_k, - size_n, - ) - - compare = benchmark.Compare(results) - compare.print() - - -# For quick benchmarking use: -# python benchmark_marlin.py --batch-sizes 1 16 32 --limit-k 4096 --limit-n 4096 --limit-group-size 128 --limit-num-bits 4 --limit-act-order 0 --limit-k-full 1 # noqa E501 -# -if __name__ == "__main__": - parser = FlexibleArgumentParser( - description="Benchmark Marlin 
across specified models/shapes/batches" - ) - parser.add_argument( - "--models", - nargs="+", - type=str, - default=DEFAULT_MODELS, - choices=WEIGHT_SHAPES.keys(), - ) - parser.add_argument( - "--batch-sizes", nargs="+", type=int, default=DEFAULT_BATCH_SIZES - ) - parser.add_argument("--limit-k", nargs="+", type=int, default=[]) - parser.add_argument("--limit-n", nargs="+", type=int, default=[]) - parser.add_argument("--limit-group-size", nargs="+", type=int, default=[]) - parser.add_argument("--limit-num-bits", nargs="+", type=int, default=[]) - parser.add_argument("--limit-act-order", nargs="+", type=int, default=[]) - parser.add_argument("--limit-k-full", nargs="+", type=int, default=[]) - - args = parser.parse_args() - main(args) diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py deleted file mode 100644 index 02c2db674..000000000 --- a/benchmarks/kernels/benchmark_moe.py +++ /dev/null @@ -1,773 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import argparse -import json -import os -import time -from contextlib import nullcontext -from datetime import datetime -from itertools import product -from typing import Any, TypedDict - -import ray -import torch -from ray.experimental.tqdm_ray import tqdm - -from vllm.model_executor.layers.fused_moe.config import ( - FusedMoEQuantConfig, - _get_config_dtype_str, -) -from vllm.model_executor.layers.fused_moe.fused_moe import * -from vllm.platforms import current_platform -from vllm.transformers_utils.config import get_config -from vllm.triton_utils import triton -from vllm.utils import FlexibleArgumentParser - -FP8_DTYPE = current_platform.fp8_dtype() - - -def ensure_divisibility(numerator, denominator, text): - """Ensure that numerator is divisible by the denominator.""" - assert numerator % denominator == 0, "{} {} is not divisible by tp {}.".format( - text, numerator, denominator - ) - - -class BenchmarkConfig(TypedDict): - BLOCK_SIZE_M: int - BLOCK_SIZE_N: int - BLOCK_SIZE_K: int - GROUP_SIZE_M: int - num_warps: int - num_stages: int - - -def benchmark_config( - config: BenchmarkConfig, - num_tokens: int, - num_experts: int, - shard_intermediate_size: int, - hidden_size: int, - topk: int, - dtype: torch.dtype, - use_fp8_w8a8: bool, - use_int8_w8a16: bool, - num_iters: int = 100, - block_quant_shape: list[int] = None, - use_deep_gemm: bool = False, -) -> float: - init_dtype = torch.float16 if use_fp8_w8a8 else dtype - x = torch.randn(num_tokens, hidden_size, dtype=dtype) - if use_int8_w8a16: - w1 = torch.randint( - -127, - 127, - ( - num_experts, - shard_intermediate_size, - hidden_size, - ), - dtype=torch.int8, - ) - w2 = torch.randint( - -127, - 127, - ( - num_experts, - hidden_size, - shard_intermediate_size // 2, - ), - dtype=torch.int8, - ) - else: - w1 = torch.randn( - num_experts, shard_intermediate_size, hidden_size, dtype=init_dtype - ) - w2 = torch.randn( - num_experts, hidden_size, shard_intermediate_size // 2, dtype=init_dtype - ) - gating_output = torch.randn(num_iters, num_tokens, num_experts, dtype=torch.float32) - - w1_scale = None - w2_scale = None - a1_scale = None - a2_scale = None - if use_int8_w8a16: - w1_scale = torch.randn( - (num_experts, 2 * shard_intermediate_size), dtype=torch.float32 - ) - w2_scale = torch.randn((hidden_size, num_experts), dtype=torch.float32) - if use_deep_gemm: - # we use the default block shape for deepgemm - block_quant_shape = [128, 128] - if use_fp8_w8a8: - if block_quant_shape: - 
block_n, block_k = block_quant_shape[0], block_quant_shape[1] - E = num_experts - N = shard_intermediate_size // 2 - K = hidden_size - factor_for_scale = 1e-2 - n_tiles_w1 = (2 * N + block_n - 1) // block_n - n_tiles_w2 = (K + block_n - 1) // block_n - k_tiles_w1 = (K + block_k - 1) // block_k - k_tiles_w2 = (N + block_k - 1) // block_k - w1_scale = ( - torch.rand((E, n_tiles_w1, k_tiles_w1), dtype=torch.float32) - * factor_for_scale - ) - w2_scale = ( - torch.rand((E, n_tiles_w2, k_tiles_w2), dtype=torch.float32) - * factor_for_scale - ) - else: - w1_scale = torch.randn(num_experts, dtype=torch.float32) - w2_scale = torch.randn(num_experts, dtype=torch.float32) - - a1_scale = torch.randn(1, dtype=torch.float32) - a2_scale = torch.randn(1, dtype=torch.float32) - - w1 = w1.to(FP8_DTYPE) - w2 = w2.to(FP8_DTYPE) - - input_gating = torch.empty(num_tokens, num_experts, dtype=torch.float32) - - def prepare(i: int): - input_gating.copy_(gating_output[i]) - - def run(): - from vllm.model_executor.layers.fused_moe import override_config - - if use_fp8_w8a8: - quant_dtype = torch.float8_e4m3fn - elif use_int8_w8a16: - quant_dtype = torch.int8 - else: - quant_dtype = None - - quant_config = FusedMoEQuantConfig.make( - quant_dtype=quant_dtype, - w1_scale=w1_scale, - w2_scale=w2_scale, - a1_scale=a1_scale, - a2_scale=a2_scale, - block_shape=block_quant_shape, - ) - - with override_config(config): - topk_weights, topk_ids, token_expert_indices = fused_topk( - x, input_gating, topk, renormalize=not use_deep_gemm - ) - return fused_experts( - x, - w1, - w2, - topk_weights, - topk_ids, - inplace=True, - quant_config=quant_config, - allow_deep_gemm=use_deep_gemm, - ) - - # JIT compilation & warmup - run() - torch.cuda.synchronize() - - # Capture 10 invocations with CUDA graph - graph = torch.cuda.CUDAGraph() - with torch.cuda.graph(graph): - for _ in range(10): - run() - torch.cuda.synchronize() - - # Warmup - for _ in range(5): - graph.replay() - torch.cuda.synchronize() - - start_event = torch.cuda.Event(enable_timing=True) - end_event = torch.cuda.Event(enable_timing=True) - - latencies: list[float] = [] - for i in range(num_iters): - prepare(i) - torch.cuda.synchronize() - - start_event.record() - graph.replay() - end_event.record() - end_event.synchronize() - latencies.append(start_event.elapsed_time(end_event)) - avg = sum(latencies) / (num_iters * 10) * 1000 # us - graph.reset() - return avg - - -def get_rocm_tuning_space(use_fp16): - block_mn_range = [16, 32, 64, 128, 256] - block_k_range = [16, 32, 64, 128, 256] - if not use_fp16: - block_k_range.remove(16) # BLOCK_K=16 not supported for fp8 - num_warps_range = [1, 2, 4, 8] - group_m_range = [1, 4, 8, 16, 32] - num_stage_range = [2] - waves_per_eu_range = [0] - matrix_instr_nonkdim_range = [16, 32] if use_fp16 else [] - kpack_range = [1, 2] if use_fp16 else [] - - param_ranges = { - "BLOCK_SIZE_M": block_mn_range, - "BLOCK_SIZE_N": block_mn_range, - "BLOCK_SIZE_K": block_k_range, - "GROUP_SIZE_M": group_m_range, - "num_warps": num_warps_range, - "num_stages": num_stage_range, - "waves_per_eu": waves_per_eu_range, - } - if use_fp16: - param_ranges["matrix_instr_nonkdim"] = matrix_instr_nonkdim_range - param_ranges["kpack"] = kpack_range - - return param_ranges - - -def get_configs_compute_bound(use_fp16, block_quant_shape) -> list[dict[str, int]]: - configs: list[BenchmarkConfig] = [] - - if current_platform.is_rocm(): - param_ranges = get_rocm_tuning_space(use_fp16) - else: - # Reduced search space for faster tuning. 
- # TODO(woosuk): Increase the search space and use a performance model to - # prune the search space. - block_m_range = [16, 32, 64, 128, 256] - block_n_range = [32, 64, 128, 256] - block_k_range = [64, 128, 256] - num_warps_range = [4, 8] - group_m_range = [1, 16, 32, 64] - num_stage_range = [2, 3, 4, 5] - - param_ranges = { - "BLOCK_SIZE_M": block_m_range, - "BLOCK_SIZE_N": block_n_range, - "BLOCK_SIZE_K": block_k_range, - "GROUP_SIZE_M": group_m_range, - "num_warps": num_warps_range, - "num_stages": num_stage_range, - } - - keys, values = zip(*param_ranges.items()) - for config_values in product(*values): - config = dict(zip(keys, config_values)) - configs.append(config) - - # Remove configs that are not compatible with fp8 block quantization - # BLOCK_SIZE_K must be a multiple of block_k - # BLOCK_SIZE_N must be a multiple of block_n - if block_quant_shape is not None and not use_fp16: - block_n, block_k = block_quant_shape[0], block_quant_shape[1] - for config in configs[:]: - if ( - config["BLOCK_SIZE_K"] % block_k != 0 - or config["BLOCK_SIZE_N"] % block_n != 0 - ): - configs.remove(config) - return configs - - -def prune_rocm_search_space( - num_tokens, shard_intermediate_size, hidden_size, search_space, is_fp16, topk -): - N1, K1 = shard_intermediate_size, hidden_size - N2, K2 = hidden_size, shard_intermediate_size // 2 - pruned_space_1 = prune_rocm_configs( - num_tokens * topk, N1, K1, search_space, is_fp16 - ) - pruned_space_2 = prune_rocm_configs( - num_tokens * topk, N2, K2, search_space, is_fp16 - ) - search_space = merge_unique_dicts(pruned_space_1, pruned_space_2) - return search_space - - -# The following code is inspired by ROCm/Triton GEMM tuning script: -# https://github.com/ROCm/triton/blob/triton-mlir/scripts/amd/gemm/tune_gemm.py#L89 -def prune_rocm_configs(M, N, K, configs, is_fp16=True): - pruned_configs = [] - elemBytes_a = 2 if is_fp16 else 1 - elemBytes_b = 2 if is_fp16 else 1 - - mfma = 16 if M < 32 or N < 32 else 32 - - # TODO (zhanglx): figure out the boundary between large and small gemms - large_gemm = False - if M >= 2048 and N >= 2048: - large_gemm = True - - for config in configs: - BLOCK_SIZE_M = config.get("BLOCK_SIZE_M") - BLOCK_SIZE_N = config.get("BLOCK_SIZE_N") - BLOCK_SIZE_K = config.get("BLOCK_SIZE_K") - num_warps = config.get("num_warps") - - if is_fp16: - matrix_instr_nonkdim = config.get("matrix_instr_nonkdim") - if matrix_instr_nonkdim > mfma: - continue - if mfma == 4 and BLOCK_SIZE_K < 64: - continue - # some layouts could not work properly in case - # number elements per thread is less 1 - if BLOCK_SIZE_M * BLOCK_SIZE_N < 64: - continue - SPLIT_K = config.get("SPLIT_K", 1) - GROUP_M = config.get("GROUP_SIZE_M") - if is_fp16: - if ( - matrix_instr_nonkdim > BLOCK_SIZE_M - or matrix_instr_nonkdim > BLOCK_SIZE_N - ): - continue - if matrix_instr_nonkdim >= M and matrix_instr_nonkdim != BLOCK_SIZE_M: - continue - if matrix_instr_nonkdim >= N and matrix_instr_nonkdim != BLOCK_SIZE_N: - continue - # Skip BLOCK_SIZE that is too large compare to M/N - # unless BLOCK_SIZE is already small enough - if M * 2 < BLOCK_SIZE_M and BLOCK_SIZE_M != 16: - continue - if N * 2 < BLOCK_SIZE_N and BLOCK_SIZE_N != 16: - continue - # skip large split_k when not necessary - if SPLIT_K != 1 and not need_split_k(M, N, K): - continue - # skip split_k that leads to EVEN_K = false - leap = SPLIT_K * BLOCK_SIZE_K - modv = K % leap - if modv != 0: - continue - # skip large GROUP_M - if GROUP_M * BLOCK_SIZE_M > M and GROUP_M != 1: - continue - # out of shared memory 
resource - # TODO (zhanglx): This does not consider the LDS usage in the epilogue - LDS = ( - BLOCK_SIZE_K * BLOCK_SIZE_M * elemBytes_a - + BLOCK_SIZE_K * BLOCK_SIZE_N * elemBytes_b - ) - if LDS > 65536: - continue - # Skip small block sizes and num_warps for large gemm - # For fp16 and f8, we want to only use BLOCK_SIZE >= 64 - if large_gemm: - if BLOCK_SIZE_M < 64 or BLOCK_SIZE_N < 64: - continue - if BLOCK_SIZE_K < 64: - continue - if num_warps < 4: - continue - - pruned_configs.append(config) - - return pruned_configs - - -def need_split_k(SIZE_M, SIZE_N, SIZE_K): - return (SIZE_M < 64 or SIZE_N < 64) and SIZE_K > 1024 - - -def merge_unique_dicts(list1, list2): - result = [] - combined_list = list1.copy() - combined_list.extend(list2) - for dictionary in combined_list: - if dictionary not in result: - result.append(dictionary) - return result - - -@ray.remote(num_gpus=1) -class BenchmarkWorker: - def __init__(self, seed: int) -> None: - torch.set_default_device("cuda") - current_platform.seed_everything(seed) - self.seed = seed - # Get the device ID to allocate tensors and kernels - # on the respective GPU. This is required for Ray to work - # correctly with multi-GPU tuning on the ROCm platform. - self.device_id = int(ray.get_gpu_ids()[0]) - - def benchmark( - self, - num_tokens: int, - num_experts: int, - shard_intermediate_size: int, - hidden_size: int, - topk: int, - dtype: torch.dtype, - use_fp8_w8a8: bool, - use_int8_w8a16: bool, - block_quant_shape: list[int] = None, - use_deep_gemm: bool = False, - ) -> tuple[dict[str, int], float]: - current_platform.seed_everything(self.seed) - dtype_str = _get_config_dtype_str( - dtype, use_int8_w8a16=use_int8_w8a16, use_fp8_w8a8=use_fp8_w8a8 - ) - # NOTE(woosuk): The current naming convention uses w2.shape[2], which - # is the intermediate size after silu_and_mul. 
- block_n = block_quant_shape[0] if block_quant_shape else None - block_k = block_quant_shape[1] if block_quant_shape else None - op_config = get_moe_configs( - num_experts, shard_intermediate_size // 2, dtype_str, block_n, block_k - ) - if op_config is None: - config = get_default_config( - num_tokens, - num_experts, - shard_intermediate_size, - hidden_size, - topk, - dtype_str, - block_quant_shape, - ) - else: - config = op_config[min(op_config.keys(), key=lambda x: abs(x - num_tokens))] - kernel_time = benchmark_config( - config, - num_tokens, - num_experts, - shard_intermediate_size, - hidden_size, - topk, - dtype, - use_fp8_w8a8, - use_int8_w8a16, - num_iters=100, - block_quant_shape=block_quant_shape, - use_deep_gemm=use_deep_gemm, - ) - return config, kernel_time - - def tune( - self, - num_tokens: int, - num_experts: int, - shard_intermediate_size: int, - hidden_size: int, - topk: int, - dtype: torch.dtype, - use_fp8_w8a8: bool, - use_int8_w8a16: bool, - search_space: list[dict[str, int]], - block_quant_shape: list[int], - use_deep_gemm: bool, - ) -> dict[str, int]: - best_config = None - best_time = float("inf") - if current_platform.is_rocm(): - is_fp16 = not (use_fp8_w8a8 or use_int8_w8a16) - search_space = prune_rocm_search_space( - num_tokens, - shard_intermediate_size, - hidden_size, - search_space, - is_fp16, - topk, - ) - - need_device_guard = False - if current_platform.is_rocm(): - visible_device = os.environ.get("ROCR_VISIBLE_DEVICES", None) - if visible_device != f"{self.device_id}": - need_device_guard = True - - with torch.cuda.device(self.device_id) if need_device_guard else nullcontext(): - for config in tqdm(search_space): - try: - kernel_time = benchmark_config( - config, - num_tokens, - num_experts, - shard_intermediate_size, - hidden_size, - topk, - dtype, - use_fp8_w8a8, - use_int8_w8a16, - num_iters=20, - block_quant_shape=block_quant_shape, - use_deep_gemm=use_deep_gemm, - ) - except triton.runtime.autotuner.OutOfResources: - # Some configurations may be invalid and fail to compile. - continue - - if kernel_time < best_time: - best_time = kernel_time - best_config = config - now = datetime.now() - print(f"{now.ctime()}] Completed tuning for batch_size={num_tokens}") - assert best_config is not None - return best_config - - -def sort_config(config: BenchmarkConfig) -> BenchmarkConfig: - return { - "BLOCK_SIZE_M": config["BLOCK_SIZE_M"], - "BLOCK_SIZE_N": config["BLOCK_SIZE_N"], - "BLOCK_SIZE_K": config["BLOCK_SIZE_K"], - "GROUP_SIZE_M": config["GROUP_SIZE_M"], - "num_warps": config["num_warps"], - "num_stages": config["num_stages"], - **( - {"waves_per_eu": config["waves_per_eu"]} if "waves_per_eu" in config else {} - ), - **( - {"matrix_instr_nonkdim": config["matrix_instr_nonkdim"]} - if "matrix_instr_nonkdim" in config - else {} - ), - **({"kpack": config["kpack"]} if "kpack" in config else {}), - } - - -def save_configs( - configs: dict[int, BenchmarkConfig], - num_experts: int, - shard_intermediate_size: int, - hidden_size: int, - topk: int, - dtype: torch.dtype, - use_fp8_w8a8: bool, - use_int8_w8a16: bool, - block_quant_shape: list[int], - save_dir: str, -) -> None: - dtype_str = _get_config_dtype_str( - dtype, use_int8_w8a16=use_int8_w8a16, use_fp8_w8a8=use_fp8_w8a8 - ) - - # NOTE(woosuk): The current naming convention uses w2.shape[2], which - # is the intermediate size after silu_and_mul. 
- filename = get_config_file_name( - num_experts, shard_intermediate_size // 2, dtype_str, block_quant_shape - ) - os.makedirs(save_dir, exist_ok=True) - filename = os.path.join(save_dir, filename) - print(f"Writing best config to {filename}...") - with open(filename, "w") as f: - json.dump({"triton_version": triton.__version__, **configs}, f, indent=4) - f.write("\n") - - -def get_weight_block_size_safety(config, default_value=None): - quantization_config = getattr(config, "quantization_config", {}) - if isinstance(quantization_config, dict): - return quantization_config.get("weight_block_size", default_value) - return default_value - - -def main(args: argparse.Namespace): - print(args) - - config = get_config(model=args.model, trust_remote_code=args.trust_remote_code) - if args.model_prefix: - config = getattr(config, args.model_prefix) - - if config.architectures[0] == "DbrxForCausalLM": - E = config.ffn_config.moe_num_experts - topk = config.ffn_config.moe_top_k - intermediate_size = config.ffn_config.ffn_hidden_size - elif config.architectures[0] == "JambaForCausalLM": - E = config.num_experts - topk = config.num_experts_per_tok - intermediate_size = config.intermediate_size - elif config.architectures[0] in ( - "DeepseekV2ForCausalLM", - "DeepseekV3ForCausalLM", - "DeepseekV32ForCausalLM", - "Glm4MoeForCausalLM", - ): - E = config.n_routed_experts - topk = config.num_experts_per_tok - intermediate_size = config.moe_intermediate_size - elif config.architectures[0] in ( - "Qwen2MoeForCausalLM", - "Qwen3MoeForCausalLM", - "Qwen3NextForCausalLM", - ): - E = config.num_experts - topk = config.num_experts_per_tok - intermediate_size = config.moe_intermediate_size - elif config.architectures[0] in ("HunYuanMoEV1ForCausalLM"): - E = config.num_experts - topk = config.moe_topk[0] - intermediate_size = config.moe_intermediate_size[0] - else: - # Support for llama4 - config = config.get_text_config() - # Default: Mixtral. - E = config.num_local_experts - topk = config.num_experts_per_tok - intermediate_size = config.intermediate_size - enable_ep = bool(args.enable_expert_parallel) - if enable_ep: - ensure_divisibility(E, args.tp_size, "Number of experts") - E = E // args.tp_size - shard_intermediate_size = 2 * intermediate_size - else: - ensure_divisibility(intermediate_size, args.tp_size, "intermediate_size") - shard_intermediate_size = 2 * intermediate_size // args.tp_size - hidden_size = config.hidden_size - dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype - use_fp8_w8a8 = args.dtype == "fp8_w8a8" - use_int8_w8a16 = args.dtype == "int8_w8a16" - block_quant_shape = get_weight_block_size_safety(config) - - if args.batch_size is None: - batch_sizes = [ - 1, - 2, - 4, - 8, - 16, - 24, - 32, - 48, - 64, - 96, - 128, - 256, - 512, - 1024, - 1536, - 2048, - 3072, - 4096, - ] - else: - batch_sizes = args.batch_size - - use_deep_gemm = bool(args.use_deep_gemm) - - if current_platform.is_rocm() and "HIP_VISIBLE_DEVICES" in os.environ: - # Ray will set ROCR_VISIBLE_DEVICES for device visibility - logger.warning( - "Ray uses ROCR_VISIBLE_DEVICES to control device accessibility." - "Replacing HIP_VISIBLE_DEVICES with ROCR_VISIBLE_DEVICES." 
- ) - val = os.environ["HIP_VISIBLE_DEVICES"] - os.environ["ROCR_VISIBLE_DEVICES"] = val - del os.environ["HIP_VISIBLE_DEVICES"] - - ray.init() - num_gpus = int(ray.available_resources()["GPU"]) - workers = [BenchmarkWorker.remote(args.seed) for _ in range(num_gpus)] - - def _distribute(method: str, inputs: list[Any]) -> list[Any]: - outputs = [] - worker_idx = 0 - for input_args in inputs: - worker = workers[worker_idx] - worker_method = getattr(worker, method) - output = worker_method.remote(*input_args) - outputs.append(output) - worker_idx = (worker_idx + 1) % num_gpus - return ray.get(outputs) - - if args.tune: - is_fp16 = not (use_fp8_w8a8 or use_int8_w8a16) - search_space = get_configs_compute_bound(is_fp16, block_quant_shape) - print(f"Start tuning over {len(search_space)} configurations...") - if use_deep_gemm: - raise ValueError( - "Tuning with --use-deep-gemm is not supported as it only tunes Triton " - "kernels. Please remove the flag." - ) - start = time.time() - configs = _distribute( - "tune", - [ - ( - batch_size, - E, - shard_intermediate_size, - hidden_size, - topk, - dtype, - use_fp8_w8a8, - use_int8_w8a16, - search_space, - block_quant_shape, - use_deep_gemm, - ) - for batch_size in batch_sizes - ], - ) - best_configs = { - M: sort_config(config) for M, config in zip(batch_sizes, configs) - } - save_configs( - best_configs, - E, - shard_intermediate_size, - hidden_size, - topk, - dtype, - use_fp8_w8a8, - use_int8_w8a16, - block_quant_shape, - args.save_dir, - ) - end = time.time() - print(f"Tuning took {end - start:.2f} seconds") - else: - outputs = _distribute( - "benchmark", - [ - ( - batch_size, - E, - shard_intermediate_size, - hidden_size, - topk, - dtype, - use_fp8_w8a8, - use_int8_w8a16, - block_quant_shape, - use_deep_gemm, - ) - for batch_size in batch_sizes - ], - ) - - for batch_size, (config, kernel_time) in zip(batch_sizes, outputs): - print(f"Batch size: {batch_size}, config: {config}") - print(f"Kernel time: {kernel_time:.2f} us") - - -if __name__ == "__main__": - parser = FlexibleArgumentParser() - parser.add_argument( - "--model", type=str, default="mistralai/Mixtral-8x7B-Instruct-v0.1" - ) - parser.add_argument( - "--tp-size", "-tp", "--tensor-parallel-size", type=int, default=2 - ) - parser.add_argument("--enable-expert-parallel", "-enable-ep", action="store_true") - parser.add_argument( - "--dtype", type=str, choices=["auto", "fp8_w8a8", "int8_w8a16"], default="auto" - ) - parser.add_argument("--use-deep-gemm", action="store_true") - parser.add_argument( - "--save-dir", type=str, default="./", help="Directory to save tuned results" - ) - parser.add_argument("--seed", type=int, default=0) - parser.add_argument("--batch-size", type=int, nargs="+", required=False) - parser.add_argument("--tune", action="store_true") - parser.add_argument("--trust-remote-code", action="store_true") - parser.add_argument("--model-prefix", type=str, required=False) - args = parser.parse_args() - - main(args) diff --git a/benchmarks/kernels/benchmark_moe_align_block_size.py b/benchmarks/kernels/benchmark_moe_align_block_size.py deleted file mode 100644 index f540cff62..000000000 --- a/benchmarks/kernels/benchmark_moe_align_block_size.py +++ /dev/null @@ -1,74 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import argparse -import itertools - -import torch - -from vllm.model_executor.layers.fused_moe.moe_align_block_size import ( - moe_align_block_size, -) -from vllm.triton_utils import triton - - -def 
get_topk_ids(num_tokens: int, num_experts: int, topk: int) -> torch.Tensor: - return torch.stack( - [ - torch.randperm(num_experts, dtype=torch.int32, device="cuda")[:topk] - for _ in range(num_tokens) - ] - ) - - -# test configurations -num_tokens_range = [1, 16, 256, 4096] -num_experts_range = [16, 64, 224, 256, 280, 512] -topk_range = [1, 2, 8] -configs = list(itertools.product(num_tokens_range, num_experts_range, topk_range)) - - -@triton.testing.perf_report( - triton.testing.Benchmark( - x_names=["num_tokens", "num_experts", "topk"], - x_vals=configs, - line_arg="provider", - line_vals=["vllm"], - line_names=["vLLM"], - plot_name="moe-align-block-size-performance", - args={}, - ) -) -def benchmark(num_tokens, num_experts, topk, provider): - """Benchmark function for Triton.""" - block_size = 256 - topk_ids = get_topk_ids(num_tokens, num_experts, topk) - - quantiles = [0.5, 0.2, 0.8] - - if provider == "vllm": - ms, min_ms, max_ms = triton.testing.do_bench( - lambda: moe_align_block_size(topk_ids, block_size, num_experts), - quantiles=quantiles, - ) - - return 1000 * ms, 1000 * max_ms, 1000 * min_ms - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--num_experts", - type=int, - default=64, - choices=[8, 16, 32, 64, 128, 256], - ) - parser.add_argument( - "--topk", - type=int, - default=8, - choices=[2, 4, 8], - help="Top-k value for correctness check.", - ) - args = parser.parse_args() - - benchmark.run(print_data=True, show_plots=True) diff --git a/benchmarks/kernels/benchmark_moe_permute_unpermute.py b/benchmarks/kernels/benchmark_moe_permute_unpermute.py deleted file mode 100644 index 04d2205aa..000000000 --- a/benchmarks/kernels/benchmark_moe_permute_unpermute.py +++ /dev/null @@ -1,428 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import argparse -from typing import Any, TypedDict - -import ray -import torch -from transformers import AutoConfig - -from vllm.model_executor.layers.fused_moe.fused_moe import * -from vllm.model_executor.layers.fused_moe.moe_permute_unpermute import ( - _moe_permute, - _moe_unpermute_and_reduce, - moe_permute, - moe_unpermute, -) -from vllm.model_executor.layers.fused_moe.utils import _fp8_quantize -from vllm.platforms import current_platform -from vllm.utils import FlexibleArgumentParser - -FP8_DTYPE = current_platform.fp8_dtype() - - -class BenchmarkConfig(TypedDict): - BLOCK_SIZE_M: int - BLOCK_SIZE_N: int - BLOCK_SIZE_K: int - GROUP_SIZE_M: int - num_warps: int - num_stages: int - - -def benchmark_permute( - num_tokens: int, - num_experts: int, - hidden_size: int, - topk: int, - dtype: torch.dtype, - use_fp8_w8a8: bool, - use_int8_w8a16: bool, - num_iters: int = 100, - use_customized_permute: bool = False, -) -> float: - # init_dtype = torch.float16 if use_fp8_w8a8 else dtype - hidden_states = torch.randn(num_tokens, hidden_size, dtype=dtype) - # output_hidden_states = torch.empty_like(hidden_states) - if use_fp8_w8a8: - align_block_size = 128 # deepgemm needs 128 m aligned block - qhidden_states, scale = _fp8_quantize(hidden_states, None, None) - else: - align_block_size = None - qhidden_states = hidden_states - - gating_output = torch.randn(num_iters, num_tokens, num_experts, dtype=torch.float32) - - input_gating = torch.randn(num_tokens, num_experts, dtype=torch.float32) - topk_weights, topk_ids, token_expert_indices = fused_topk( - qhidden_states, input_gating, topk, False - ) - - def prepare(i: int): - 
input_gating.copy_(gating_output[i]) - - def run(): - if use_customized_permute: - ( - permuted_hidden_states, - a1q_scale, - first_token_off, - inv_perm_idx, - m_indices, - ) = moe_permute( - qhidden_states, - a1q_scale=None, - topk_ids=topk_ids, - n_expert=num_experts, - expert_map=None, - align_block_size=align_block_size, - ) - else: - ( - permuted_hidden_states, - a1q_scale, - sorted_token_ids, - expert_ids, - inv_perm, - ) = _moe_permute( - qhidden_states, None, topk_ids, num_experts, None, align_block_size - ) - - # JIT compilation & warmup - run() - torch.cuda.synchronize() - - # Capture 10 invocations with CUDA graph - graph = torch.cuda.CUDAGraph() - with torch.cuda.graph(graph): - for _ in range(10): - run() - torch.cuda.synchronize() - - # Warmup - for _ in range(5): - graph.replay() - torch.cuda.synchronize() - - start_event = torch.cuda.Event(enable_timing=True) - end_event = torch.cuda.Event(enable_timing=True) - - latencies: list[float] = [] - for i in range(num_iters): - prepare(i) - torch.cuda.synchronize() - - start_event.record() - graph.replay() - end_event.record() - end_event.synchronize() - latencies.append(start_event.elapsed_time(end_event)) - avg = sum(latencies) / (num_iters * 10) * 1000 # us - graph.reset() - return avg - - -def benchmark_unpermute( - num_tokens: int, - num_experts: int, - hidden_size: int, - topk: int, - dtype: torch.dtype, - use_fp8_w8a8: bool, - use_int8_w8a16: bool, - num_iters: int = 100, - use_customized_permute: bool = False, -) -> float: - # init_dtype = torch.float16 if use_fp8_w8a8 else dtype - hidden_states = torch.randn(num_tokens, hidden_size, dtype=dtype) - output_hidden_states = torch.empty_like(hidden_states) - if use_fp8_w8a8: - align_block_size = 128 # deepgemm needs 128 m aligned block - qhidden_states, scale = _fp8_quantize(hidden_states, None, None) - else: - align_block_size = None - qhidden_states = hidden_states - - input_gating = torch.randn(num_tokens, num_experts, dtype=torch.float32) - - topk_weights, topk_ids, token_expert_indices = fused_topk( - qhidden_states, input_gating, topk, False - ) - - def prepare(): - if use_customized_permute: - ( - permuted_hidden_states, - a1q_scale, - first_token_off, - inv_perm_idx, - m_indices, - ) = moe_permute( - qhidden_states, - a1q_scale=None, - topk_ids=topk_ids, - n_expert=num_experts, - expert_map=None, - align_block_size=align_block_size, - ) - # convert to fp16/bf16 as gemm output - return ( - permuted_hidden_states.to(dtype), - first_token_off, - inv_perm_idx, - m_indices, - ) - else: - ( - permuted_qhidden_states, - a1q_scale, - sorted_token_ids, - expert_ids, - inv_perm, - ) = _moe_permute( - qhidden_states, None, topk_ids, num_experts, None, align_block_size - ) - # convert to fp16/bf16 as gemm output - return ( - permuted_qhidden_states.to(dtype), - a1q_scale, - sorted_token_ids, - expert_ids, - inv_perm, - ) - - def run(input: tuple): - if use_customized_permute: - ( - permuted_hidden_states, - first_token_off, - inv_perm_idx, - m_indices, - ) = input - output = torch.empty_like(hidden_states) - moe_unpermute( - output, - permuted_hidden_states, - topk_weights, - inv_perm_idx, - first_token_off, - ) - else: - ( - permuted_hidden_states, - a1q_scale, - sorted_token_ids, - expert_ids, - inv_perm, - ) = input - _moe_unpermute_and_reduce( - output_hidden_states, - permuted_hidden_states, - inv_perm, - topk_weights, - True, - ) - - # JIT compilation & warmup - input = prepare() - run(input) - torch.cuda.synchronize() - - # Capture 10 invocations with CUDA graph - graph = 
torch.cuda.CUDAGraph() - with torch.cuda.graph(graph): - for _ in range(10): - run(input) - torch.cuda.synchronize() - - # Warmup - for _ in range(5): - graph.replay() - torch.cuda.synchronize() - - start_event = torch.cuda.Event(enable_timing=True) - end_event = torch.cuda.Event(enable_timing=True) - - latencies: list[float] = [] - for i in range(num_iters): - torch.cuda.synchronize() - start_event.record() - graph.replay() - end_event.record() - end_event.synchronize() - latencies.append(start_event.elapsed_time(end_event)) - avg = sum(latencies) / (num_iters * 10) * 1000 # us - graph.reset() - return avg - - -@ray.remote(num_gpus=1) -class BenchmarkWorker: - def __init__(self, seed: int) -> None: - torch.set_default_device("cuda") - current_platform.seed_everything(seed) - self.seed = seed - # Get the device ID to allocate tensors and kernels - # on the respective GPU. This is required for Ray to work - # correctly with multi-GPU tuning on the ROCm platform. - self.device_id = int(ray.get_gpu_ids()[0]) - - def benchmark( - self, - num_tokens: int, - num_experts: int, - hidden_size: int, - topk: int, - dtype: torch.dtype, - use_fp8_w8a8: bool, - use_int8_w8a16: bool, - use_customized_permute: bool = False, - ) -> tuple[dict[str, int], float]: - current_platform.seed_everything(self.seed) - - permute_time = benchmark_permute( - num_tokens, - num_experts, - hidden_size, - topk, - dtype, - use_fp8_w8a8, - use_int8_w8a16, - num_iters=100, - use_customized_permute=use_customized_permute, - ) - unpermute_time = benchmark_unpermute( - num_tokens, - num_experts, - hidden_size, - topk, - dtype, - use_fp8_w8a8, - use_int8_w8a16, - num_iters=100, - use_customized_permute=use_customized_permute, - ) - return permute_time, unpermute_time - - -def get_weight_block_size_safety(config, default_value=None): - quantization_config = getattr(config, "quantization_config", {}) - if isinstance(quantization_config, dict): - return quantization_config.get("weight_block_size", default_value) - return default_value - - -def main(args: argparse.Namespace): - print(args) - - config = AutoConfig.from_pretrained( - args.model, trust_remote_code=args.trust_remote_code - ) - if config.architectures[0] == "DbrxForCausalLM": - E = config.ffn_config.moe_num_experts - topk = config.ffn_config.moe_top_k - elif config.architectures[0] == "JambaForCausalLM": - E = config.num_experts - topk = config.num_experts_per_tok - elif ( - config.architectures[0] == "DeepseekV3ForCausalLM" - or config.architectures[0] == "DeepseekV2ForCausalLM" - or config.architectures[0] == "Glm4MoeForCausalLM" - ): - E = config.n_routed_experts - topk = config.num_experts_per_tok - elif config.architectures[0] in ["Qwen2MoeForCausalLM", "Qwen3MoeForCausalLM"]: - E = config.num_experts - topk = config.num_experts_per_tok - - else: - # Support for llama4 - config = config.get_text_config() - # Default: Mixtral. 
- E = config.num_local_experts - topk = config.num_experts_per_tok - - hidden_size = config.hidden_size - dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype - use_fp8_w8a8 = args.dtype == "fp8_w8a8" - use_int8_w8a16 = args.dtype == "int8_w8a16" - use_customized_permute = args.use_customized_permute - - if args.batch_size is None: - batch_sizes = [ - 1, - 2, - 4, - 8, - 16, - 24, - 32, - 48, - 64, - 96, - 128, - 256, - 512, - 1024, - 1536, - 2048, - 3072, - 4096, - ] - else: - batch_sizes = [args.batch_size] - - ray.init() - num_gpus = int(ray.available_resources()["GPU"]) - workers = [BenchmarkWorker.remote(args.seed) for _ in range(num_gpus)] - - def _distribute(method: str, inputs: list[Any]) -> list[Any]: - outputs = [] - worker_idx = 0 - for input_args in inputs: - worker = workers[worker_idx] - worker_method = getattr(worker, method) - output = worker_method.remote(*input_args) - outputs.append(output) - worker_idx = (worker_idx + 1) % num_gpus - return ray.get(outputs) - - outputs = _distribute( - "benchmark", - [ - ( - batch_size, - E, - hidden_size, - topk, - dtype, - use_fp8_w8a8, - use_int8_w8a16, - use_customized_permute, - ) - for batch_size in batch_sizes - ], - ) - - for batch_size, (permute, unpermute) in zip(batch_sizes, outputs): - print(f"Batch size: {batch_size}") - print(f"Permute time: {permute:.2f} us") - print(f"Unpermute time: {unpermute:.2f} us") - - -if __name__ == "__main__": - parser = FlexibleArgumentParser() - parser.add_argument( - "--model", type=str, default="mistralai/Mixtral-8x7B-Instruct-v0.1" - ) - parser.add_argument( - "--dtype", type=str, choices=["auto", "fp8_w8a8", "int8_w8a16"], default="auto" - ) - parser.add_argument("--use-customized-permute", action="store_true") - parser.add_argument("--seed", type=int, default=0) - parser.add_argument("--batch-size", type=int, required=False) - parser.add_argument("--trust-remote-code", action="store_true") - args = parser.parse_args() - - main(args) diff --git a/benchmarks/kernels/benchmark_mrope.py b/benchmarks/kernels/benchmark_mrope.py deleted file mode 100644 index b91473617..000000000 --- a/benchmarks/kernels/benchmark_mrope.py +++ /dev/null @@ -1,328 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -# This script benchmarks the mrope kernel (mainly for Qwen2VL and Qwen2.5VL models). -# It generates test data, runs benchmarks, and saves results to a CSV file. 
-# -# The CSV file (named with current date/time) contains these columns: -# model_name, tp_size, num_tokens, num_heads, num_kv_heads, head_dim, max_position, -# rope_theta, is_neox_style, rope_scaling, dtype, torch_mean, torch_median, torch_p99, -# torch_min, torch_max, triton_mean, triton_median, triton_p99, triton_min, triton_max, -# speedup -# -# == Usage Examples == -# -# Single model benchmark: -# python3 benchmark_mrope.py --model-name Qwen/Qwen2-VL-7B-Instruct --tp-size 1 \ -# --warmup-iter 10 --benchmark-iter 100 --dtype bfloat16 --seed 0 --num-tokens 1024 -# -# All models benchmark: -# python3 benchmark_mrope.py --model-name "" --tp-size 1 --warmup-iter 10 \ -# --benchmark-iter 100 --dtype bfloat16 --seed 0 --num-tokens 1024 -# -# All models with different TP sizes: -# python3 benchmark_mrope.py --model-name "" --tp-size 1 2 4 8 --warmup-iter 10 \ -# --benchmark-iter 100 --dtype bfloat16 --seed 0 --num-tokens 1024 -# -# All models with different token counts: -# python3 benchmark_mrope.py --model-name "" --tp-size 1 --warmup-iter 10 \ -# --benchmark-iter 100 --dtype bfloat16 --seed 0 --num-tokens 1024 4096 16384 -import csv -import os -import time -from datetime import datetime -from typing import Any - -import numpy as np -import torch - -from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.platforms import current_platform -from vllm.transformers_utils.config import get_config -from vllm.utils import FlexibleArgumentParser - -device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - - -def generate_test_data( - num_tokens: int, - num_q_heads: int, - num_kv_heads: int, - head_size: int, - max_position_embeddings: int, - dtype: torch.dtype, - device: torch.device, -): - """Generate test data for given configuration.""" - # Create 2D positions (3, num_tokens) for multimodal case - positions = torch.randint( - 0, max_position_embeddings // 4, (3, num_tokens), device=device - ) - - # Create query and key tensors - query = torch.randn(num_tokens, num_q_heads * head_size, dtype=dtype, device=device) - key = torch.randn(num_tokens, num_kv_heads * head_size, dtype=dtype, device=device) - - return positions, query, key - - -def calculate_stats(times: list[float]) -> dict[str, float]: - """Calculate statistics from a list of times.""" - times_array = np.array(times) - return { - "mean": np.mean(times_array), - "median": np.median(times_array), - "p99": np.percentile(times_array, 99), - "min": np.min(times_array), - "max": np.max(times_array), - } - - -def benchmark_mrope( - model_name: str, - num_tokens: int, - head_dim: int, - tp_size: int, - num_heads: int, - num_kv_heads: int, - max_position: int = 8192, - rope_theta: float = 10000, - is_neox_style: bool = True, - rope_scaling: dict[str, Any] = None, - dtype: torch.dtype = torch.bfloat16, - seed: int = 0, - warmup_iter: int = 10, - benchmark_iter: int = 100, - csv_writer=None, -): - current_platform.seed_everything(seed) - torch.set_default_device(device) - # the parameters to compute the q k v size based on tp_size - mrope_helper_class = get_rope( - head_size=head_dim, - rotary_dim=head_dim, - max_position=max_position, - base=rope_theta, - is_neox_style=is_neox_style, - rope_scaling=rope_scaling, - dtype=dtype, - ).to(device=device) - - print(80 * "=") - print( - f"Evaluating model: {model_name} " - f"with tp_size: {tp_size} " - f"and num_tokens: {num_tokens}, " - f"dtype: {dtype}" - ) - - # create q k v input tensors - # create rotary pos emb input tensors - positions, query, key = 
generate_test_data( - num_tokens, num_heads, num_kv_heads, head_dim, max_position, dtype, device - ) - - # Warm up - for _ in range(warmup_iter): - mrope_helper_class.forward_native( - positions, - query.clone(), - key.clone(), - ) - - mrope_helper_class.forward_cuda( - positions, - query.clone(), - key.clone(), - ) - - torch.cuda.synchronize() - - # Time reference implementation - torch_times = [] - for _ in range(benchmark_iter): - query_clone = query.clone() - key_clone = key.clone() - torch.cuda.synchronize() - start_time = time.time() - - mrope_helper_class.forward_native( - positions, - query_clone, - key_clone, - ) - - torch.cuda.synchronize() - torch_times.append(time.time() - start_time) - - # Time triton kernel implementation - triton_times = [] - for _ in range(benchmark_iter): - query_clone = query.clone() - key_clone = key.clone() - torch.cuda.synchronize() - start_time = time.time() - mrope_helper_class.forward_cuda( - positions, - query_clone, - key_clone, - ) - torch.cuda.synchronize() - triton_times.append(time.time() - start_time) - - # Calculate statistics - torch_stats = calculate_stats(torch_times) - triton_stats = calculate_stats(triton_times) - print(f"\nPerformance for config ({num_tokens}, {num_heads}, {num_kv_heads}):") - - print( - f"Torch implementation: " - f"mean={torch_stats['mean']:.8f}s, " - f"median={torch_stats['median']:.8f}s, " - f"p99={torch_stats['p99']:.8f}s" - ) - - print( - f"Triton implementation: " - f"mean={triton_stats['mean']:.8f}s, " - f"median={triton_stats['median']:.8f}s, " - f"p99={triton_stats['p99']:.8f}s" - ) - - print( - f"Triton Speedup over Torch: {torch_stats['mean'] / triton_stats['mean']:.8f}x" - ) - - # Write to CSV - if csv_writer: - row = [ - model_name, - tp_size, - num_tokens, - num_heads, - num_kv_heads, - head_dim, - max_position, - rope_theta, - is_neox_style, - str(rope_scaling), - str(dtype).split(".")[-1], - torch_stats["mean"], - torch_stats["median"], - torch_stats["p99"], - torch_stats["min"], - torch_stats["max"], - triton_stats["mean"], - triton_stats["median"], - triton_stats["p99"], - triton_stats["min"], - triton_stats["max"], - torch_stats["mean"] / triton_stats["mean"], # speedup - ] - csv_writer.writerow(row) - - return torch_stats, triton_stats - - -if __name__ == "__main__": - parser = FlexibleArgumentParser( - description="Benchmark the rotary embedding kernels." 
- ) - parser.add_argument("--model-name", type=str, default="") - parser.add_argument("--tp-size", type=int, default=1) - parser.add_argument("--warmup-iter", type=int, default=10) - parser.add_argument("--benchmark-iter", type=int, default=100) - parser.add_argument("--dtype", type=str, choices=["bfloat16"], default="bfloat16") - parser.add_argument("--seed", type=int, default=0) - parser.add_argument("--num-tokens", type=int, nargs="+", required=False) - parser.add_argument("--trust-remote-code", action="store_true") - parser.add_argument("--output-csv", type=str, default="mrope_benchmark_results.csv") - args = parser.parse_args() - print(args) - - # Create CSV file for results - timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") - csv_filename = f"{os.path.splitext(args.output_csv)[0]}_{timestamp}.csv" - - with open(csv_filename, "w", newline="") as csvfile: - csv_writer = csv.writer(csvfile) - # Write header - header = [ - "model_name", - "tp_size", - "num_tokens", - "num_heads", - "num_kv_heads", - "head_dim", - "max_position", - "rope_theta", - "is_neox_style", - "rope_scaling", - "dtype", - "torch_mean", - "torch_median", - "torch_p99", - "torch_min", - "torch_max", - "triton_mean", - "triton_median", - "triton_p99", - "triton_min", - "triton_max", - "speedup", - ] - csv_writer.writerow(header) - - model_tp_dict = {} - if args.model_name == "": - model_tp_dict = { - "Qwen/Qwen2-VL-2B-Instruct": [1], - "Qwen/Qwen2-VL-7B-Instruct": [1], - "Qwen/Qwen2-VL-72B-Instruct": [2, 4, 8], - "Qwen/Qwen2.5-VL-3B-Instruct": [1, 2, 4, 8], - "Qwen/Qwen2.5-VL-7B-Instruct": [1, 2, 4, 8], - "Qwen/Qwen2.5-VL-72B-Instruct": [2, 4, 8], - } - else: - model_tp_dict[args.model_name] = [args.tp_size] - - if args.num_tokens is None: - num_tokens_list = [2**i for i in range(0, 18)] - else: - num_tokens_list = args.num_tokens - - for model_name, tp_list in model_tp_dict.items(): - config = get_config(model_name, trust_remote_code=args.trust_remote_code) - for tp_size in tp_list: - # get the model config - total_num_kv_heads = config.num_key_value_heads - total_num_heads = config.num_attention_heads - num_heads = total_num_heads // tp_size - num_kv_heads = max(1, total_num_kv_heads // tp_size) - head_dim = config.hidden_size // total_num_heads - q_size = num_heads * head_dim - kv_size = num_kv_heads * head_dim - is_neox_style = True - rope_theta = config.rope_theta - max_position = config.max_position_embeddings - - for num_tokens in num_tokens_list: - benchmark_mrope( - model_name=model_name, - num_tokens=num_tokens, - head_dim=head_dim, - tp_size=tp_size, - num_heads=num_heads, - num_kv_heads=num_kv_heads, - max_position=max_position, - rope_theta=rope_theta, - is_neox_style=is_neox_style, - rope_scaling=config.rope_scaling, - dtype=getattr(torch, args.dtype), - seed=args.seed, - warmup_iter=args.warmup_iter, - benchmark_iter=args.benchmark_iter, - csv_writer=csv_writer, - ) - - print(f"Benchmark results saved to {csv_filename}") diff --git a/benchmarks/kernels/benchmark_paged_attention.py b/benchmarks/kernels/benchmark_paged_attention.py deleted file mode 100644 index 7e0376c18..000000000 --- a/benchmarks/kernels/benchmark_paged_attention.py +++ /dev/null @@ -1,251 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import random -import time -from typing import Optional - -import torch - -from vllm import _custom_ops as ops -from vllm.logger import init_logger -from vllm.platforms import current_platform -from vllm.utils import ( - 
STR_DTYPE_TO_TORCH_DTYPE, - FlexibleArgumentParser, - create_kv_caches_with_random, -) - -logger = init_logger(__name__) - -NUM_BLOCKS = 128 * 1024 -PARTITION_SIZE = 512 -PARTITION_SIZE_ROCM = 256 - - -@torch.inference_mode() -def main( - version: str, - num_seqs: int, - seq_len: int, - num_query_heads: int, - num_kv_heads: int, - head_size: int, - use_alibi: bool, - block_size: int, - dtype: torch.dtype, - seed: int, - do_profile: bool, - device: str = "cuda", - kv_cache_dtype: Optional[str] = None, -) -> None: - current_platform.seed_everything(seed) - - scale = float(1.0 / (head_size**0.5)) - query = torch.empty( - num_seqs, num_query_heads, head_size, dtype=dtype, device=device - ) - query.uniform_(-scale, scale) - - assert num_query_heads % num_kv_heads == 0 - alibi_slopes = None - if use_alibi: - alibi_slopes = torch.randn(num_query_heads, dtype=torch.float, device=device) - - seq_lens = [seq_len for _ in range(num_seqs)] - max_seq_len = max(seq_lens) - seq_lens = torch.tensor(seq_lens, dtype=torch.int, device=device) - - # Create the block tables. - max_num_blocks_per_seq = (max_seq_len + block_size - 1) // block_size - block_tables_lst: list[list[int]] = [] - for _ in range(num_seqs): - block_table = [ - random.randint(0, NUM_BLOCKS - 1) for _ in range(max_num_blocks_per_seq) - ] - block_tables_lst.append(block_table) - - block_tables = torch.tensor(block_tables_lst, dtype=torch.int, device=device) - - # Create the KV cache. - key_caches, value_caches = create_kv_caches_with_random( - NUM_BLOCKS, - block_size, - 1, - num_kv_heads, - head_size, - kv_cache_dtype, - dtype, - device=device, - ) - key_cache, value_cache = key_caches[0], value_caches[0] - - # Prepare for the paged attention kernel. - output = torch.empty_like(query) - if version == "v2": - if current_platform.is_rocm(): - global PARTITION_SIZE - if not args.custom_paged_attn and not current_platform.is_navi(): - PARTITION_SIZE = 1024 - else: - PARTITION_SIZE = PARTITION_SIZE_ROCM - num_partitions = (max_seq_len + PARTITION_SIZE - 1) // PARTITION_SIZE - tmp_output = torch.empty( - size=(num_seqs, num_query_heads, num_partitions, head_size), - dtype=output.dtype, - device=output.device, - ) - exp_sums = torch.empty( - size=(num_seqs, num_query_heads, num_partitions), - dtype=torch.float32, - device=output.device, - ) - max_logits = torch.empty_like(exp_sums) - - def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float: - torch.cuda.synchronize() - if profile: - torch.cuda.cudart().cudaProfilerStart() - start_time = time.perf_counter() - - # Using default kv_scale - k_scale = v_scale = torch.tensor(1.0, dtype=torch.float32, device=device) - - for _ in range(num_iters): - if version == "v1": - ops.paged_attention_v1( - output, - query, - key_cache, - value_cache, - num_kv_heads, - scale, - block_tables, - seq_lens, - block_size, - max_seq_len, - alibi_slopes, - kv_cache_dtype, - k_scale, - v_scale, - ) - elif version == "v2": - if not args.custom_paged_attn: - ops.paged_attention_v2( - output, - exp_sums, - max_logits, - tmp_output, - query, - key_cache, - value_cache, - num_kv_heads, - scale, - block_tables, - seq_lens, - block_size, - max_seq_len, - alibi_slopes, - kv_cache_dtype, - k_scale, - v_scale, - ) - else: - ops.paged_attention_rocm( - output, - exp_sums, - max_logits, - tmp_output, - query, - key_cache, - value_cache, - num_kv_heads, - scale, - block_tables, - seq_lens, - None, - block_size, - max_seq_len, - alibi_slopes, - kv_cache_dtype, - k_scale, - v_scale, - ) - else: - raise 
ValueError(f"Invalid version: {version}") - torch.cuda.synchronize() - - end_time = time.perf_counter() - if profile: - torch.cuda.cudart().cudaProfilerStop() - return (end_time - start_time) / num_iters - - # Warmup. - print("Warming up...") - run_benchmark = run_cuda_benchmark - run_benchmark(num_iters=3, profile=False) - - # Benchmark. - if do_profile: - latency = run_benchmark(num_iters=1, profile=True) - else: - latency = run_benchmark(num_iters=100, profile=False) - print(f"Kernel running time: {latency * 1000000:.3f} us") - - -if __name__ == "__main__": - logger.warning( - "This script benchmarks the paged attention kernel. " - "By default this is no longer used in vLLM inference." - ) - - parser = FlexibleArgumentParser(description="Benchmark the paged attention kernel.") - parser.add_argument("--version", type=str, choices=["v1", "v2"], default="v2") - parser.add_argument("--batch-size", type=int, default=8) - parser.add_argument("--seq-len", type=int, default=4096) - parser.add_argument("--num-query-heads", type=int, default=64) - parser.add_argument("--num-kv-heads", type=int, default=8) - parser.add_argument( - "--head-size", - type=int, - choices=[64, 80, 96, 112, 120, 128, 192, 256], - default=128, - ) - parser.add_argument("--block-size", type=int, choices=[16, 32], default=16) - parser.add_argument("--use-alibi", action="store_true") - parser.add_argument( - "--dtype", type=str, choices=["half", "bfloat16", "float"], default="half" - ) - parser.add_argument("--seed", type=int, default=0) - parser.add_argument("--profile", action="store_true") - parser.add_argument( - "--kv-cache-dtype", - type=str, - choices=["auto", "fp8", "fp8_e5m2", "fp8_e4m3"], - default="auto", - help="Data type for kv cache storage. If 'auto', will use model " - "data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. 
" - "ROCm (AMD GPU) supports fp8 (=fp8_e4m3)", - ) - parser.add_argument( - "--custom-paged-attn", action="store_true", help="Use custom paged attention" - ) - args = parser.parse_args() - print(args) - - if args.num_query_heads % args.num_kv_heads != 0: - raise ValueError("num_query_heads must be divisible by num_kv_heads") - main( - version=args.version, - num_seqs=args.batch_size, - seq_len=args.seq_len, - num_query_heads=args.num_query_heads, - num_kv_heads=args.num_kv_heads, - head_size=args.head_size, - block_size=args.block_size, - use_alibi=args.use_alibi, - dtype=STR_DTYPE_TO_TORCH_DTYPE[args.dtype], - seed=args.seed, - do_profile=args.profile, - kv_cache_dtype=args.kv_cache_dtype, - ) diff --git a/benchmarks/kernels/benchmark_per_token_group_quant.py b/benchmarks/kernels/benchmark_per_token_group_quant.py deleted file mode 100644 index 1ccb5e08b..000000000 --- a/benchmarks/kernels/benchmark_per_token_group_quant.py +++ /dev/null @@ -1,159 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import argparse -import math -from contextlib import contextmanager -from typing import Callable -from unittest.mock import patch - -import torch - -from vllm.model_executor.layers.quantization.utils import fp8_utils, int8_utils -from vllm.platforms import current_platform - - -@contextmanager -def _triton_mode(): - """Temporarily force the Triton fallback path""" - with patch("vllm.platforms.current_platform.is_cuda", return_value=False): - yield - - -def _time_cuda( - fn: Callable[[], tuple[torch.Tensor, torch.Tensor]], - warmup_iters: int, - bench_iters: int, -) -> float: - # warmup - for _ in range(warmup_iters): - fn() - torch.cuda.synchronize() - - start = torch.cuda.Event(enable_timing=True) - end = torch.cuda.Event(enable_timing=True) - - start.record() - for _ in range(bench_iters): - fn() - end.record() - torch.cuda.synchronize() - - return start.elapsed_time(end) / bench_iters # ms/iter - - -def _run_single( - shape: tuple[int, int], - group_size: int, - dtype: str, - *, - column_major: bool = False, - scale_ue8m0: bool = False, - warmup_iters: int, - bench_iters: int, -) -> None: - num_tokens, hidden_dim = shape - - device = torch.device("cuda") - torch.manual_seed(42) - x = torch.randn(num_tokens, hidden_dim, device=device, dtype=torch.bfloat16) * 8 - - if dtype == "fp8": - - def cuda_impl(): - return fp8_utils.per_token_group_quant_fp8( - x, - group_size, - column_major_scales=column_major, - use_ue8m0=scale_ue8m0, - ) - - def triton_impl(): - with _triton_mode(): - return fp8_utils.per_token_group_quant_fp8( - x, - group_size, - column_major_scales=column_major, - use_ue8m0=scale_ue8m0, - ) - elif dtype == "int8": - - def cuda_impl(): - return int8_utils.per_token_group_quant_int8(x, group_size) - - def triton_impl(): - with _triton_mode(): - return int8_utils.per_token_group_quant_int8(x, group_size) - else: - raise ValueError("dtype must be 'fp8' or 'int8'") - - cuda_ms = _time_cuda(cuda_impl, warmup_iters, bench_iters) - triton_ms = _time_cuda(triton_impl, warmup_iters, bench_iters) - - speedup = triton_ms / cuda_ms if cuda_ms else math.inf - - cfg_desc = ( - f"shape={shape} gs={group_size:<3} col_major={column_major:<5} " - f"ue8m0={scale_ue8m0:<5} dtype={dtype}" - ) - print( - f"{cfg_desc:55} | CUDA {cuda_ms:7.3f} ms | Triton {triton_ms:7.3f} ms | " - f"speed-up ×{speedup:5.2f}" - ) - - -def parse_args(): - parser = argparse.ArgumentParser() - parser.add_argument("--warmup-iters", type=int, default=10) - 
parser.add_argument("--bench-iters", type=int, default=100) - parser.add_argument("--dtype", choices=["fp8", "int8", "both"], default="both") - return parser.parse_args() - - -if __name__ == "__main__": - if not current_platform.is_cuda(): - raise RuntimeError("CUDA device is required to run this benchmark.") - - args = parse_args() - warmup_iters, bench_iters = args.warmup_iters, args.bench_iters - - shapes = [(32, 128), (64, 256), (16, 512)] - group_sizes = [64, 128] - - dtypes = ["fp8", "int8"] if args.dtype == "both" else [args.dtype] - - header = ( - "Configuration".ljust(55) - + " | " - + "CUDA (ms)".center(12) - + " | " - + "Triton (ms)".center(13) - + " | " - + "Speed-up" - ) - print(header) - print("-" * len(header)) - - for dtype in dtypes: - for shape in shapes: - for gs in group_sizes: - if dtype == "fp8": - for col_major in (False, True): - for ue8m0 in (False, True): - _run_single( - shape, - gs, - dtype, - column_major=col_major, - scale_ue8m0=ue8m0, - warmup_iters=warmup_iters, - bench_iters=bench_iters, - ) - else: # INT8 has no col-major / ue8m0 switches - _run_single( - shape, - gs, - dtype, - warmup_iters=warmup_iters, - bench_iters=bench_iters, - ) diff --git a/benchmarks/kernels/benchmark_polynorm.py b/benchmarks/kernels/benchmark_polynorm.py deleted file mode 100644 index 9ac8f5e65..000000000 --- a/benchmarks/kernels/benchmark_polynorm.py +++ /dev/null @@ -1,155 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import itertools - -import torch - -from vllm import _custom_ops as vllm_ops -from vllm.triton_utils import triton - - -def polynorm_naive( - x: torch.Tensor, - weight: torch.Tensor, - bias: torch.Tensor, - eps: float = 1e-6, -): - orig_shape = x.shape - x = x.view(-1, x.shape[-1]) - - def norm(x, eps: float): - return x / torch.sqrt(x.pow(2).mean(-1, keepdim=True) + eps) - - x = x.float() - return ( - ( - weight[0] * norm(x**3, eps) - + weight[1] * norm(x**2, eps) - + weight[2] * norm(x, eps) - + bias - ) - .to(weight.dtype) - .view(orig_shape) - ) - - -def polynorm_vllm( - x: torch.Tensor, - weight: torch.Tensor, - bias: torch.Tensor, - eps: float = 1e-6, -): - orig_shape = x.shape - x = x.view(-1, x.shape[-1]) - - out = torch.empty_like(x) - vllm_ops.poly_norm(out, x, weight, bias, eps) - output = out - - output = output.view(orig_shape) - return output - - -def calculate_diff(batch_size, seq_len, hidden_dim): - dtype = torch.bfloat16 - x = torch.randn(batch_size, seq_len, hidden_dim, dtype=dtype, device="cuda") - weight = torch.ones(3, dtype=dtype, device="cuda") - bias = torch.ones(1, dtype=dtype, device="cuda") - - output_naive = polynorm_naive(x, weight, bias) - output_vllm = polynorm_vllm(x, weight, bias) - - if torch.allclose(output_naive, output_vllm, atol=1e-2, rtol=1e-2): - print("✅ All implementations match") - else: - print("❌ Implementations differ") - - -batch_size_range = [2**i for i in range(0, 7, 2)] -seq_length_range = [2**i for i in range(6, 11, 1)] -dim_range = [2048, 4096] -configs = list(itertools.product(dim_range, batch_size_range, seq_length_range)) - - -def get_benchmark(): - @triton.testing.perf_report( - triton.testing.Benchmark( - x_names=["dim", "batch_size", "seq_len"], - x_vals=[list(_) for _ in configs], - line_arg="provider", - line_vals=["naive", "vllm"], - line_names=["Naive", "vLLM"], - styles=[("blue", "-"), ("red", "-")], - ylabel="us", - plot_name="polynorm-perf", - args={}, - ) - ) - def benchmark(dim, batch_size, seq_len, provider): - dtype = 
torch.bfloat16 - hidden_dim = dim * 4 - - x = torch.randn(batch_size, seq_len, hidden_dim, dtype=dtype, device="cuda") - weight = torch.ones(3, dtype=dtype, device="cuda") - bias = torch.ones(1, dtype=dtype, device="cuda") - - quantiles = [0.5, 0.2, 0.8] - - if provider == "naive": - ms, min_ms, max_ms = triton.testing.do_bench( - lambda: polynorm_naive(x, weight, bias), - quantiles=quantiles, - ) - else: - ms, min_ms, max_ms = triton.testing.do_bench( - lambda: polynorm_vllm(x, weight, bias), - quantiles=quantiles, - ) - - return 1000 * ms, 1000 * max_ms, 1000 * min_ms - - return benchmark - - -if __name__ == "__main__": - import argparse - - parser = argparse.ArgumentParser() - parser.add_argument( - "--batch-size", - type=int, - default=4, - help="Batch size", - ) - parser.add_argument( - "--seq-len", - type=int, - default=128, - help="Sequence length", - ) - parser.add_argument( - "--hidden-dim", - type=int, - default=8192, - help="Intermediate size of MLP", - ) - parser.add_argument( - "--save-path", - type=str, - default="./configs/polnorm/", - help="Path to save polnorm benchmark results", - ) - - args = parser.parse_args() - - # Run correctness test - calculate_diff( - batch_size=args.batch_size, - seq_len=args.seq_len, - hidden_dim=args.hidden_dim, - ) - - benchmark = get_benchmark() - # Run performance benchmark - benchmark.run(print_data=True, save_path=args.save_path) diff --git a/benchmarks/kernels/benchmark_quant.py b/benchmarks/kernels/benchmark_quant.py deleted file mode 100644 index 6ab26f5f1..000000000 --- a/benchmarks/kernels/benchmark_quant.py +++ /dev/null @@ -1,108 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import time - -import torch - -from vllm import _custom_ops as ops -from vllm.platforms import current_platform -from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser - - -@torch.inference_mode() -def main( - num_tokens: int, - hidden_size: int, - static_scale: bool, - quant_dtype: torch.dtype, - dtype: torch.dtype, - seed: int = 0, - do_profile: bool = False, - num_warmup_iters: int = 5, - num_iters: int = 100, -) -> None: - current_platform.seed_everything(seed) - torch.set_default_device("cuda") - - x = torch.randn(num_tokens, hidden_size, dtype=dtype) - scale = torch.randn(1, 1, dtype=torch.float32) if static_scale else None - - def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float: - torch.cuda.synchronize() - if profile: - torch.cuda.cudart().cudaProfilerStart() - start_time = time.perf_counter() - - for _ in range(num_iters): - if quant_dtype == torch.int8: - ops.scaled_int8_quant(x, scale) - else: - ops.scaled_fp8_quant(x, scale) - torch.cuda.synchronize() - - end_time = time.perf_counter() - if profile: - torch.cuda.cudart().cudaProfilerStop() - return (end_time - start_time) / num_iters - - # Warmup. - print("Warming up...") - run_benchmark = run_cuda_benchmark - run_benchmark(num_iters=num_warmup_iters, profile=False) - - # Benchmark. - if do_profile: - latency = run_benchmark(num_iters=1, profile=True) - else: - latency = run_benchmark(num_iters=num_iters, profile=False) - print(f"Kernel running time: {latency * 1000000:.3f} us") - - -if __name__ == "__main__": - - def to_torch_dtype(dt): - if dt == "int8": - return torch.int8 - if dt == "fp8": - return torch.float8_e4m3fn - raise ValueError(f"Unsupported dtype: {dt}") - - parser = FlexibleArgumentParser( - description="Benchmark the quantization (fp8 or int8) kernel." 
- ) - parser.add_argument("--num-tokens", type=int, default=4096) - parser.add_argument("--hidden-size", type=int, default=8192) - parser.add_argument("--static-scale", action="store_true") - parser.add_argument( - "--quant-dtype", type=str, choices=["fp8", "int8"], default="int8" - ) - parser.add_argument( - "--dtype", type=str, choices=["half", "bfloat16", "float"], default="half" - ) - - parser.add_argument("--seed", type=int, default=0) - parser.add_argument("--profile", action="store_true") - parser.add_argument("--num-warmup-iters", type=int, default=5) - parser.add_argument( - "--num-iters", - type=int, - default=100, - help="Number of benchmark iterations. " - "If --profile is set, this number is ignored", - ) - - args = parser.parse_args() - print(args) - - main( - num_tokens=args.num_tokens, - hidden_size=args.hidden_size, - static_scale=args.static_scale, - quant_dtype=to_torch_dtype(args.quant_dtype), - dtype=STR_DTYPE_TO_TORCH_DTYPE[args.dtype], - seed=args.seed, - do_profile=args.profile, - num_warmup_iters=args.num_warmup_iters, - num_iters=args.num_iters, - ) diff --git a/benchmarks/kernels/benchmark_reshape_and_cache_flash.py b/benchmarks/kernels/benchmark_reshape_and_cache_flash.py deleted file mode 100644 index 0aace5710..000000000 --- a/benchmarks/kernels/benchmark_reshape_and_cache_flash.py +++ /dev/null @@ -1,212 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from __future__ import annotations - -import random -import time - -import torch -from tabulate import tabulate - -from vllm import _custom_ops as ops -from vllm.attention.ops.triton_reshape_and_cache_flash import ( - triton_reshape_and_cache_flash, -) -from vllm.logger import init_logger -from vllm.platforms import current_platform -from vllm.utils import ( - STR_DTYPE_TO_TORCH_DTYPE, - FlexibleArgumentParser, - create_kv_caches_with_random_flash, -) - -logger = init_logger(__name__) - - -@torch.inference_mode() -def run_benchmark( - num_tokens: int, - num_heads: int, - head_size: int, - block_size: int, - num_blocks: int, - dtype: torch.dtype, - kv_cache_dtype: str, - kv_cache_layout: str, - num_iters: int, - implementation: str, - benchmark_mode: str, - device: str = "cuda", -) -> float: - """Return latency (seconds) for given num_tokens.""" - - if kv_cache_dtype == "fp8" and head_size % 16: - raise ValueError("fp8 kv-cache requires head_size to be a multiple of 16.") - - if implementation not in ("cuda", "triton"): - raise ValueError( - f"Unsupported implementation: {implementation}. " - "Only 'cuda' and 'triton' are supported." - ) - if implementation == "triton" and kv_cache_layout == "HND": - return float("nan") # Triton does not support HND layout yet. - - current_platform.seed_everything(42) - torch.set_default_device(device) - - # create random key / value tensors [T, H, D]. - key = torch.randn(num_tokens, num_heads, head_size, dtype=dtype, device=device) - value = torch.randn_like(key) - - # prepare the slot mapping. - # each token is assigned a unique slot in the KV-cache. 
- num_slots = block_size * num_blocks - if num_tokens > num_slots: - raise ValueError("num_tokens cannot exceed the total number of cache slots") - slot_mapping_lst = random.sample(range(num_slots), num_tokens) - slot_mapping = torch.tensor(slot_mapping_lst, dtype=torch.long, device=device) - - key_caches, value_caches = create_kv_caches_with_random_flash( - num_blocks, - block_size, - 1, # num_layers - num_heads, - head_size, - kv_cache_dtype, - dtype, - device=device, - cache_layout=kv_cache_layout, - ) - key_cache, value_cache = key_caches[0], value_caches[0] - # to free unused memory - del key_caches, value_caches - - # compute per-kernel scaling factors for fp8 conversion (if used). - k_scale = (key.amax() / 64.0).to(torch.float32) - v_scale = (value.amax() / 64.0).to(torch.float32) - - if implementation == "cuda": - function_under_test = lambda: ops.reshape_and_cache_flash( - key, # noqa: F821 - value, # noqa: F821 - key_cache, # noqa: F821 - value_cache, # noqa: F821 - slot_mapping, # noqa: F821 - kv_cache_dtype, - k_scale, - v_scale, - ) - else: - function_under_test = lambda: triton_reshape_and_cache_flash( - key, # noqa: F821 - value, # noqa: F821 - key_cache, # noqa: F821 - value_cache, # noqa: F821 - slot_mapping, # noqa: F821 - kv_cache_dtype, - k_scale, - v_scale, - ) - if benchmark_mode == "cudagraph": - g = torch.cuda.CUDAGraph() - with torch.cuda.graph(g): - function_under_test() - torch.cuda.synchronize() - function_under_test = lambda: g.replay() - - def run_cuda_benchmark(n_iters: int) -> float: - nonlocal key, value, key_cache, value_cache, slot_mapping - torch.cuda.synchronize() - start = time.perf_counter() - for _ in range(n_iters): - function_under_test() - torch.cuda.synchronize() - end = time.perf_counter() - return (end - start) / n_iters - - # warm-up - run_cuda_benchmark(3) - - lat = run_cuda_benchmark(num_iters) - - # free tensors to mitigate OOM when sweeping - del key, value, key_cache, value_cache, slot_mapping - torch.cuda.empty_cache() - - return lat - - -def main(args): - rows = [] - for layout in ["NHD", "HND"]: - for exp in range(1, 17): - n_tok = 2**exp - lat = run_benchmark( - num_tokens=n_tok, - num_heads=args.num_heads, - head_size=args.head_size, - block_size=args.block_size, - num_blocks=args.num_blocks, - dtype=STR_DTYPE_TO_TORCH_DTYPE[args.dtype], - kv_cache_dtype=args.kv_cache_dtype, - kv_cache_layout=layout, - num_iters=args.iters, - implementation=args.implementation, - benchmark_mode=args.mode, - device="cuda", - ) - rows.append([n_tok, layout, f"{lat * 1e6:.3f}"]) - - print( - f"Benchmark results for implementation {args.implementation}" - f" (measuring with {args.mode}):" - ) - print(tabulate(rows, headers=["num_tokens", "layout", "latency (µs)"])) - - -if __name__ == "__main__": - parser = FlexibleArgumentParser() - - parser.add_argument("--num-heads", type=int, default=128) - parser.add_argument( - "--head-size", - type=int, - choices=[64, 80, 96, 112, 120, 128, 192, 256], - default=128, - ) - parser.add_argument("--block-size", type=int, choices=[16, 32], default=16) - parser.add_argument("--num-blocks", type=int, default=128 * 512) - - parser.add_argument( - "--dtype", - type=str, - choices=["half", "bfloat16", "float"], - default="bfloat16", - ) - - parser.add_argument( - "--kv-cache-dtype", - type=str, - choices=["auto", "fp8"], - default="auto", - ) - - parser.add_argument("--iters", type=int, default=100) - - parser.add_argument( - "--implementation", - type=str, - choices=["cuda", "triton"], - default="cuda", - ) - - 
parser.add_argument( - "--mode", - type=str, - choices=["cudagraph", "no_graph"], - default="cudagraph", - ) - - args = parser.parse_args() - - main(args) diff --git a/benchmarks/kernels/benchmark_rmsnorm.py b/benchmarks/kernels/benchmark_rmsnorm.py deleted file mode 100644 index 4cf633a81..000000000 --- a/benchmarks/kernels/benchmark_rmsnorm.py +++ /dev/null @@ -1,256 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import itertools -from typing import Optional, Union - -import torch -from flashinfer.norm import fused_add_rmsnorm, rmsnorm -from torch import nn - -from vllm import _custom_ops as vllm_ops -from vllm.triton_utils import triton - - -class HuggingFaceRMSNorm(nn.Module): - def __init__(self, hidden_size: int, eps: float = 1e-6) -> None: - super().__init__() - self.weight = nn.Parameter(torch.ones(hidden_size)) - self.variance_epsilon = eps - - def forward( - self, - x: torch.Tensor, - residual: Optional[torch.Tensor] = None, - ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]: - orig_dtype = x.dtype - x = x.to(torch.float32) - if residual is not None: - x = x + residual.to(torch.float32) - residual = x.to(orig_dtype) - - variance = x.pow(2).mean(dim=-1, keepdim=True) - x = x * torch.rsqrt(variance + self.variance_epsilon) - x = x.to(orig_dtype) * self.weight - if residual is None: - return x - else: - return x, residual - - -def rmsnorm_naive( - x: torch.Tensor, - weight: torch.Tensor, - residual: Optional[torch.Tensor] = None, - eps: float = 1e-6, -): - naive_norm = HuggingFaceRMSNorm(x.shape[-1], eps=eps) - naive_norm.weight = nn.Parameter(weight) - naive_norm = naive_norm.to(x.device) - - orig_shape = x.shape - x = x.view(-1, x.shape[-1]) - if residual is not None: - residual = residual.view(-1, residual.shape[-1]) - - output = naive_norm(x, residual) - - if isinstance(output, tuple): - output = (output[0].view(orig_shape), output[1].view(orig_shape)) - else: - output = output.view(orig_shape) - return output - - -def rmsnorm_flashinfer( - x: torch.Tensor, - weight: torch.Tensor, - residual: Optional[torch.Tensor] = None, - eps: float = 1e-6, -): - orig_shape = x.shape - x = x.view(-1, x.shape[-1]) - if residual is not None: - residual = residual.view(-1, residual.shape[-1]) - - if residual is not None: - fused_add_rmsnorm(x, residual, weight, eps) - output = (x, residual) - else: - output = rmsnorm(x, weight, eps) - - if isinstance(output, tuple): - output = (output[0].view(orig_shape), output[1].view(orig_shape)) - else: - output = output.view(orig_shape) - return output - - -def rmsnorm_vllm( - x: torch.Tensor, - weight: torch.Tensor, - residual: Optional[torch.Tensor] = None, - eps: float = 1e-6, -): - orig_shape = x.shape - x = x.view(-1, x.shape[-1]) - if residual is not None: - residual = residual.view(-1, residual.shape[-1]) - - if residual is not None: - vllm_ops.fused_add_rms_norm(x, residual, weight, eps) - output = (x, residual) - else: - out = torch.empty_like(x) - vllm_ops.rms_norm(out, x, weight, eps) - output = out - - if isinstance(output, tuple): - output = (output[0].view(orig_shape), output[1].view(orig_shape)) - else: - output = output.view(orig_shape) - return output - - -def calculate_diff(batch_size, seq_len, hidden_size, use_residual=True): - dtype = torch.bfloat16 - x = torch.randn(batch_size, seq_len, hidden_size, dtype=dtype, device="cuda") - weight = torch.ones(hidden_size, dtype=dtype, device="cuda") - residual = torch.randn_like(x) if use_residual else None - - 
output_naive = rmsnorm_naive( - x.clone(), weight, residual.clone() if residual is not None else None - ) - output_flashinfer = rmsnorm_flashinfer( - x.clone(), weight, residual.clone() if residual is not None else None - ) - output_vllm = rmsnorm_vllm( - x.clone(), weight, residual.clone() if residual is not None else None - ) - - if use_residual: - output_naive = output_naive[0] - output_flashinfer = output_flashinfer[0] - output_vllm = output_vllm[0] - - print(f"Naive output={output_naive}") - print(f"FlashInfer output={output_flashinfer}") - print(f"vLLM output={output_vllm}") - - if torch.allclose( - output_naive, output_flashinfer, atol=1e-2, rtol=1e-2 - ) and torch.allclose(output_naive, output_vllm, atol=1e-2, rtol=1e-2): - print("✅ All implementations match") - else: - print("❌ Implementations differ") - - -batch_size_range = [2**i for i in range(0, 7, 2)] -seq_length_range = [2**i for i in range(6, 11, 1)] -head_num_range = [32, 48] -configs = list(itertools.product(head_num_range, batch_size_range, seq_length_range)) - - -def get_benchmark(use_residual): - @triton.testing.perf_report( - triton.testing.Benchmark( - x_names=["head_num", "batch_size", "seq_len"], - x_vals=[list(_) for _ in configs], - line_arg="provider", - line_vals=["huggingface", "flashinfer", "vllm"], - line_names=["HuggingFace", "FlashInfer", "vLLM"], - styles=[("blue", "-"), ("green", "-"), ("red", "-")], - ylabel="us", - plot_name=f"rmsnorm-perf-{'with' if use_residual else 'without'}-residual", - args={}, - ) - ) - def benchmark(head_num, batch_size, seq_len, provider): - dtype = torch.bfloat16 - hidden_size = head_num * 128 # assuming head_dim = 128 - - x = torch.randn(batch_size, seq_len, hidden_size, dtype=dtype, device="cuda") - weight = torch.ones(hidden_size, dtype=dtype, device="cuda") - residual = torch.randn_like(x) if use_residual else None - - quantiles = [0.5, 0.2, 0.8] - - if provider == "huggingface": - ms, min_ms, max_ms = triton.testing.do_bench( - lambda: rmsnorm_naive( - x.clone(), - weight, - residual.clone() if residual is not None else None, - ), - quantiles=quantiles, - ) - elif provider == "flashinfer": - ms, min_ms, max_ms = triton.testing.do_bench( - lambda: rmsnorm_flashinfer( - x.clone(), - weight, - residual.clone() if residual is not None else None, - ), - quantiles=quantiles, - ) - else: - ms, min_ms, max_ms = triton.testing.do_bench( - lambda: rmsnorm_vllm( - x.clone(), - weight, - residual.clone() if residual is not None else None, - ), - quantiles=quantiles, - ) - - return 1000 * ms, 1000 * max_ms, 1000 * min_ms - - return benchmark - - -if __name__ == "__main__": - import argparse - - parser = argparse.ArgumentParser() - parser.add_argument( - "--batch-size", - type=int, - default=4, - help="Batch size", - ) - parser.add_argument( - "--seq-len", - type=int, - default=128, - help="Sequence length", - ) - parser.add_argument( - "--hidden-size", - type=int, - default=4096, - help="Hidden size (2nd dimension) of the sequence", - ) - parser.add_argument( - "--use-residual", action="store_true", help="Whether to use residual connection" - ) - parser.add_argument( - "--save-path", - type=str, - default="./configs/rmsnorm/", - help="Path to save rmsnorm benchmark results", - ) - - args = parser.parse_args() - - # Run correctness test - calculate_diff( - batch_size=args.batch_size, - seq_len=args.seq_len, - hidden_size=args.hidden_size, - use_residual=args.use_residual, - ) - - # Get the benchmark function with proper use_residual setting - benchmark = 
get_benchmark(args.use_residual) - # Run performance benchmark - benchmark.run(print_data=True, save_path=args.save_path) diff --git a/benchmarks/kernels/benchmark_rope.py b/benchmarks/kernels/benchmark_rope.py deleted file mode 100644 index b81baf17a..000000000 --- a/benchmarks/kernels/benchmark_rope.py +++ /dev/null @@ -1,133 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from itertools import accumulate -from typing import Optional - -import nvtx -import torch - -from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding, get_rope -from vllm.platforms import current_platform -from vllm.utils import FlexibleArgumentParser - - -def benchmark_rope_kernels_multi_lora( - is_neox_style: bool, - batch_size: int, - seq_len: int, - num_heads: int, - head_size: int, - rotary_dim: Optional[int], - dtype: torch.dtype, - seed: int, - device: str, - max_position: int = 8192, - base: float = 10000, -) -> None: - current_platform.seed_everything(seed) - torch.set_default_device(device) - if rotary_dim is None: - rotary_dim = head_size - # silulating serving 4 LoRAs - scaling_factors = [1, 2, 4, 8] - # batched RoPE can take multiple scaling factors - batched_rope = get_rope( - head_size, - rotary_dim, - max_position, - base, - is_neox_style, - {"rope_type": "linear", "factor": tuple(scaling_factors)}, - ) - # non-batched RoPE takes only one scaling factor, we create multiple - # instances to simulate the same behavior - non_batched_ropes: list[RotaryEmbedding] = [] - for scaling_factor in scaling_factors: - non_batched_ropes.append( - get_rope( - head_size, - rotary_dim, - max_position, - base, - is_neox_style, - {"rope_type": "linear", "factor": (scaling_factor,)}, - ) - ) - - positions = torch.randint(0, max_position, (batch_size, seq_len)) - query = torch.randn(batch_size, seq_len, num_heads * head_size, dtype=dtype) - key = torch.randn_like(query) - - # create query offsets for batched RoPE, we concat multiple kv cache - # together and each query needs to find the right kv cache of its type - offset_map = torch.tensor( - list( - accumulate( - [0] - + [ - max_position * scaling_factor * 2 - for scaling_factor in scaling_factors[:-1] - ] - ) - ) - ) - query_types = torch.randint( - 0, len(scaling_factors), (batch_size, seq_len), device=device - ) - # map query types to offsets - query_offsets = offset_map[query_types] - # the kernel takes flattened offsets - flatten_offsets = query_offsets.flatten() - - # batched queries of the same type together for non-batched RoPE - queries = [query[query_types == i] for i in range(len(scaling_factors))] - keys = [key[query_types == i] for i in range(len(scaling_factors))] - packed_qkr = zip(queries, keys, non_batched_ropes) - # synchronize before start timing - torch.cuda.synchronize() - with nvtx.annotate("non-batched", color="yellow"): - for q, k, r in packed_qkr: - r.forward(positions, q, k) - torch.cuda.synchronize() - with nvtx.annotate("batched", color="green"): - batched_rope.forward(positions, query, key, flatten_offsets) - torch.cuda.synchronize() - - -if __name__ == "__main__": - parser = FlexibleArgumentParser( - description="Benchmark the rotary embedding kernels." 
- ) - parser.add_argument("--is-neox-style", type=bool, default=True) - parser.add_argument("--batch-size", type=int, default=16) - parser.add_argument("--seq-len", type=int, default=512) - parser.add_argument("--num-heads", type=int, default=8) - parser.add_argument( - "--head-size", - type=int, - choices=[64, 80, 96, 112, 120, 128, 192, 256], - default=128, - ) - parser.add_argument("--rotary-dim", type=int, choices=[16, 32], default=32) - parser.add_argument( - "--dtype", type=str, choices=["bfloat16", "float"], default="float" - ) - parser.add_argument("--seed", type=int, default=0) - parser.add_argument( - "--device", type=str, choices=["cuda:0", "cuda:1"], default="cuda:0" - ) - args = parser.parse_args() - print(args) - - benchmark_rope_kernels_multi_lora( - is_neox_style=args.is_neox_style, - batch_size=args.batch_size, - seq_len=args.seq_len, - num_heads=args.num_heads, - head_size=args.head_size, - rotary_dim=args.rotary_dim, - dtype=getattr(torch, args.dtype), - seed=args.seed, - device=args.device, - ) diff --git a/benchmarks/kernels/benchmark_shapes.py b/benchmarks/kernels/benchmark_shapes.py deleted file mode 100644 index 18c459c31..000000000 --- a/benchmarks/kernels/benchmark_shapes.py +++ /dev/null @@ -1,94 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -WEIGHT_SHAPES = { - "ideal": [[4 * 256 * 32, 256 * 32]], - "mistralai/Mistral-7B-v0.1/TP1": [ - [4096, 6144], - [4096, 4096], - [4096, 28672], - [14336, 4096], - ], - "mistralai/Mistral-7B-v0.1/TP2": [ - [4096, 3072], - [2048, 4096], - [4096, 14336], - [7168, 4096], - ], - "mistralai/Mistral-7B-v0.1/TP4": [ - [4096, 1536], - [1024, 4096], - [4096, 7168], - [3584, 4096], - ], - "meta-llama/Llama-2-7b-hf/TP1": [ - [4096, 12288], - [4096, 4096], - [4096, 22016], - [11008, 4096], - ], - "meta-llama/Llama-2-7b-hf/TP2": [ - [4096, 6144], - [2048, 4096], - [4096, 11008], - [5504, 4096], - ], - "meta-llama/Llama-2-7b-hf/TP4": [ - [4096, 3072], - [1024, 4096], - [4096, 5504], - [2752, 4096], - ], - "meta-llama/Llama-2-13b-hf/TP1": [ - [5120, 15360], - [5120, 5120], - [5120, 27648], - [13824, 5120], - ], - "meta-llama/Llama-2-13b-hf/TP2": [ - [5120, 7680], - [2560, 5120], - [5120, 13824], - [6912, 5120], - ], - "meta-llama/Llama-2-13b-hf/TP4": [ - [5120, 3840], - [1280, 5120], - [5120, 6912], - [3456, 5120], - ], - "meta-llama/Llama-2-70b-hf/TP1": [ - [8192, 10240], - [8192, 8192], - [8192, 57344], - [28672, 8192], - ], - "meta-llama/Llama-2-70b-hf/TP2": [ - [8192, 5120], - [4096, 8192], - [8192, 28672], - [14336, 8192], - ], - "meta-llama/Llama-2-70b-hf/TP4": [ - [8192, 2560], - [2048, 8192], - [8192, 14336], - [7168, 8192], - ], -} - -WEIGHT_SHAPES_MOE = { - "nm-testing/Mixtral-8x7B-Instruct-v0.1": [ - [8, 2, 4096, 28672], - [8, 2, 14336, 4096], - ], - "nm-testing/deepseekv2-lite": [ - [64, 6, 2048, 1408], - ], - "ibm-granite/granite-3.0-1b-a400m": [ - [32, 8, 1024, 1024], - ], - "ibm-granite/granite-3.0-3b-a800m": [ - [40, 8, 1024, 1536], - ], -} diff --git a/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py b/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py deleted file mode 100644 index c7a4066b3..000000000 --- a/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py +++ /dev/null @@ -1,675 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from collections.abc import Callable - -import matplotlib.pyplot as plt -import numpy as np -import torch - -from 
vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import ( - silu_mul_fp8_quant_deep_gemm_cuda, -) -from vllm.platforms import current_platform -from vllm.triton_utils import tl, triton -from vllm.utils.deep_gemm import is_deep_gemm_e8m0_used - - -@triton.jit -def _silu_mul_fp8_quant_deep_gemm( - # Pointers ------------------------------------------------------------ - input_ptr, # 16-bit activations (E, T, 2*H) - y_q_ptr, # fp8 quantized activations (E, T, H) - y_s_ptr, # 16-bit scales (E, T, G) - counts_ptr, # int32 num tokens per expert (E) - # Sizes --------------------------------------------------------------- - H: tl.constexpr, # hidden dimension (per output) - GROUP_SIZE: tl.constexpr, # elements per group (usually 128) - # Strides for input (elements) --------------------------------------- - stride_i_e, - stride_i_t, - stride_i_h, - # Strides for y_q (elements) ----------------------------------------- - stride_yq_e, - stride_yq_t, - stride_yq_h, - # Strides for y_s (elements) ----------------------------------------- - stride_ys_e, - stride_ys_t, - stride_ys_g, - # Stride for counts (elements) - stride_counts_e, - # Numeric params ------------------------------------------------------ - eps: tl.constexpr, - fp8_min: tl.constexpr, - fp8_max: tl.constexpr, - use_ue8m0: tl.constexpr, - # Meta --------------------------------------------------------------- - BLOCK: tl.constexpr, - NUM_STAGES: tl.constexpr, -): - G = H // GROUP_SIZE - - # map program id -> (e, g) - pid = tl.program_id(0) - e = pid // G - g = pid % G - - e = e.to(tl.int64) - g = g.to(tl.int64) - - # number of valid tokens for this expert - n_tokens = tl.load(counts_ptr + e * stride_counts_e).to(tl.int64) - - cols = tl.arange(0, BLOCK).to(tl.int64) - mask = cols < BLOCK - - base_input_offset = e * stride_i_e + g * GROUP_SIZE * stride_i_h - base_gate_offset = base_input_offset + cols * stride_i_h - base_up_offset = base_input_offset + H * stride_i_h + cols * stride_i_h - base_yq_offset = e * stride_yq_e + g * GROUP_SIZE * stride_yq_h + cols * stride_yq_h - base_ys_offset = e * stride_ys_e + g * stride_ys_g - - for t in tl.range(0, n_tokens, num_stages=NUM_STAGES): - gate = tl.load( - input_ptr + base_gate_offset + t * stride_i_t, mask=mask, other=0.0 - ).to(tl.float32) - up = tl.load(input_ptr + base_up_offset + t * stride_i_t, mask=mask, other=0.0) - - gate = gate * (1.0 / (1.0 + tl.exp(-gate))) - y = gate * up - - y_s = tl.maximum(tl.max(tl.abs(y)), eps) / fp8_max - if use_ue8m0: - y_s = tl.exp2(tl.ceil(tl.log2(y_s))) - - y_q = tl.clamp(y / y_s, fp8_min, fp8_max).to(y_q_ptr.dtype.element_ty) - - tl.store(y_q_ptr + base_yq_offset + t * stride_yq_t, y_q, mask=mask) - tl.store(y_s_ptr + base_ys_offset + t * stride_ys_t, y_s) - - -def silu_mul_fp8_quant_deep_gemm_triton( - y: torch.Tensor, # (E, T, 2*H) - tokens_per_expert: torch.Tensor, # (E,) number of valid tokens per expert - num_parallel_tokens, - group_size: int = 128, - eps: float = 1e-10, -) -> tuple[torch.Tensor, torch.Tensor]: - """Quantize silu(y[..., :H]) * y[..., H:] to FP8 with group per-token scales - - y has shape (E, T, 2*H). The first half of the last dimension is - silu-activated, multiplied by the second half, then quantized into FP8. 
- - Returns `(y_q, y_s)` where - * `y_q`: FP8 tensor, shape (E, T, H), same layout as y[..., :H] - * `y_s`: FP32 tensor, shape (E, T, H // group_size), strides (T*G, 1, T) - """ - assert y.ndim == 3, "y must be (E, T, 2*H)" - E, T, H2 = y.shape - assert H2 % 2 == 0, "last dim of y must be even (2*H)" - H = H2 // 2 - G = (H + group_size - 1) // group_size - assert H % group_size == 0, "H must be divisible by group_size" - assert tokens_per_expert.ndim == 1 and tokens_per_expert.shape[0] == E, ( - "tokens_per_expert must be shape (E,)" - ) - tokens_per_expert = tokens_per_expert.to(device=y.device, dtype=torch.int32) - - # allocate outputs - fp8_dtype = torch.float8_e4m3fn - y_q = torch.empty((E, T, H), dtype=fp8_dtype, device=y.device) - - # strides (elements) - stride_i_e, stride_i_t, stride_i_h = y.stride() - stride_yq_e, stride_yq_t, stride_yq_h = y_q.stride() - - # desired scale strides (elements): (T*G, 1, T) - stride_ys_e = T * G - stride_ys_t = 1 - stride_ys_g = T - y_s = torch.empty_strided( - (E, T, G), - (stride_ys_e, stride_ys_t, stride_ys_g), - dtype=torch.float32, - device=y.device, - ) - - stride_cnt_e = tokens_per_expert.stride()[0] - - # Static grid over experts and H-groups. - # A loop inside the kernel handles the token dim - grid = (E * G,) - - f_info = torch.finfo(fp8_dtype) - fp8_max = f_info.max - fp8_min = f_info.min - - _silu_mul_fp8_quant_deep_gemm[grid]( - y, - y_q, - y_s, - tokens_per_expert, - H, - group_size, - stride_i_e, - stride_i_t, - stride_i_h, - stride_yq_e, - stride_yq_t, - stride_yq_h, - stride_ys_e, - stride_ys_t, - stride_ys_g, - stride_cnt_e, - eps, - fp8_min, - fp8_max, - is_deep_gemm_e8m0_used(), - BLOCK=group_size, - NUM_STAGES=4, - num_warps=1, - ) - - return y_q, y_s - - -# Parse generation strategies -strategies = ["uniform", "max_t", "first_t"] - - -def benchmark( - kernel: Callable, - E: int, - T: int, - H: int, - total_tokens: int, - num_parallel_tokens: int = 64, - G: int = 128, - runs: int = 200, - num_warmups: int = 20, - gen_strategy: str = "default", - iterations_per_run: int = 20, -): - def generate_data(seed_offset=0): - """Generate input data with given seed offset""" - current_platform.seed_everything(42 + seed_offset) - y = torch.rand((E, T, 2 * H), dtype=torch.bfloat16, device="cuda").contiguous() - - if gen_strategy == "uniform": - r = torch.rand(size=(E,), device="cuda") - r /= r.sum() - r *= total_tokens - tokens_per_expert = r.int() - tokens_per_expert = torch.minimum( - tokens_per_expert, - torch.ones((E,), device=r.device, dtype=torch.int) * T, - ) - elif gen_strategy == "max_t": - tokens_per_expert = torch.empty(size=(E,), dtype=torch.int32, device="cuda") - tokens_per_expert.fill_(total_tokens / E) - elif gen_strategy == "first_t": - tokens_per_expert = torch.zeros(size=(E,), dtype=torch.int32, device="cuda") - tokens_per_expert[0] = min(T, total_tokens) - else: - raise ValueError(f"Unknown generation strategy: {gen_strategy}") - return y, tokens_per_expert - - dataset_count = 4 - # Pre-generate different input matrices for each iteration to avoid cache effects - data_sets = [generate_data(i) for i in range(dataset_count)] - - # Warmup - y, tokens_per_expert = data_sets[0] - for _ in range(num_warmups): - kernel( - y, tokens_per_expert, num_parallel_tokens=num_parallel_tokens, group_size=G - ) - torch.cuda.synchronize() - - start_event = torch.cuda.Event(enable_timing=True) - end_event = torch.cuda.Event(enable_timing=True) - - # Benchmark - latencies: list[float] = [] - for _ in range(runs): - torch.cuda.synchronize() - - 
start_event.record() - for i in range(iterations_per_run): - y, tokens_per_expert = data_sets[i % dataset_count] - kernel( - y, - tokens_per_expert, - num_parallel_tokens=num_parallel_tokens, - group_size=G, - ) - end_event.record() - end_event.synchronize() - - total_time_ms = start_event.elapsed_time(end_event) - per_iter_time_ms = total_time_ms / iterations_per_run - latencies.append(per_iter_time_ms) - - # Use median instead of average for better outlier handling - median_time_ms = np.median(latencies) - median_time_s = median_time_ms / 1000 - - # Calculate actual work done (using first dataset for consistency) - _, tokens_per_expert = data_sets[0] - actual_tokens = tokens_per_expert.sum().item() - actual_elements = actual_tokens * H - - # GFLOPS: operations per element = exp + 3 muls + 1 div + quantization ops ≈ 8 ops - ops_per_element = 8 - total_ops = actual_elements * ops_per_element - gflops = total_ops / median_time_s / 1e9 - - # Memory bandwidth: bfloat16 inputs (2 bytes), fp8 output (1 byte), scales (4 bytes) - input_bytes = actual_tokens * 2 * H * 2 # 2*H bfloat16 inputs - output_bytes = actual_tokens * H * 1 # H fp8 outputs - scale_bytes = actual_tokens * (H // G) * 4 # scales in float32 - total_bytes = input_bytes + output_bytes + scale_bytes - memory_bw = total_bytes / median_time_s / 1e9 - - HOPPER_BANDWIDTH_TBPS = 3.35 - return ( - median_time_ms, - gflops, - memory_bw, - (memory_bw / (HOPPER_BANDWIDTH_TBPS * 1024)) * 100, - ) - - -def create_comparison_plot( - ratio, cuda_times, baseline_times, config_labels, strategy_name, id -): - """Create a comparison plot for a specific generation strategy""" - fig, ax = plt.subplots(1, 1, figsize=(16, 6)) - - # Configure x-axis positions - x = np.arange(len(config_labels)) - width = 0.35 - - # Execution Time plot (lower is better) - ax.bar( - x - width / 2, cuda_times, width, label="CUDA Kernel", alpha=0.8, color="blue" - ) - ax.bar( - x + width / 2, - baseline_times, - width, - label="Baseline", - alpha=0.8, - color="orange", - ) - - # Add speedup labels over each bar pair - for i in range(len(x)): - speedup = ratio[i] - max_height = max(cuda_times[i], baseline_times[i]) - ax.text( - x[i], - max_height + max_height * 0.02, - f"{speedup:.2f}x", - ha="center", - va="bottom", - fontweight="bold", - fontsize=9, - ) - - ax.set_xlabel("Configuration") - ax.set_ylabel("% Utilization") - ax.set_title( - f"Memory Bandwidth Utilization (%) - {strategy_name}\n(Higher is Better)" - ) - ax.set_xticks(x) - ax.set_xticklabels(config_labels, rotation=45, ha="right") - ax.legend() - ax.grid(True, alpha=0.3) - - plt.tight_layout() - return fig, ax - - -def create_combined_plot(all_results): - """Create a combined plot with all strategies in one PNG""" - num_strategies = len(all_results) - fig, axes = plt.subplots(num_strategies, 1, figsize=(20, 6 * num_strategies)) - - if num_strategies == 1: - axes = [axes] - - for idx, ( - strategy_name, - ratio, - cuda_times, - baseline_times, - config_labels, - ) in enumerate(all_results): - ax = axes[idx] - - # Configure x-axis positions - x = np.arange(len(config_labels)) - width = 0.35 - - # Execution Time plot (lower is better) - ax.bar( - x - width / 2, - cuda_times, - width, - label="CUDA Kernel", - alpha=0.8, - color="blue", - ) - ax.bar( - x + width / 2, - baseline_times, - width, - label="Baseline", - alpha=0.8, - color="orange", - ) - - # Add speedup labels over each bar pair - for i in range(len(x)): - speedup = ratio[i] - max_height = max(cuda_times[i], baseline_times[i]) - ax.text( - x[i], - 
max_height + max_height * 0.02, - f"{speedup:.2f}x", - ha="center", - va="bottom", - fontweight="bold", - fontsize=9, - ) - - ax.set_xlabel("Configuration") - ax.set_ylabel("% Utilization") - ax.set_title( - f"Memory Bandwidth Utilization (%) - {strategy_name}\n(Higher is Better)" - ) - ax.set_xticks(x) - ax.set_xticklabels(config_labels, rotation=45, ha="right") - ax.legend() - ax.grid(True, alpha=0.3) - - plt.tight_layout() - filename = "../../silu_bench/silu_benchmark_combined.png" - plt.savefig(filename, dpi=300, bbox_inches="tight") - plt.show() - - return filename - - -outer_dim = 7168 -configs = [ - # DeepSeekV3 Configs - (8, 1024, 7168), - # DeepSeekV3 Configs - (32, 1024, 7168), - # DeepSeekV3 Configs - (256, 1024, 7168), -] - -runs = 100 -num_warmups = 20 - -strategy_descriptions = { - "uniform": "Uniform Random", - "max_t": "Even Assignment", - "first_t": "experts[0] = T, experts[1:] = 0", -} - -print(f"GPU: {torch.cuda.get_device_name()}") -print(f"Testing strategies: {', '.join(strategies)}") -print(f"Configurations: {len(configs)} configs") - -all_results = [] - -# Run benchmarks for each strategy -for id, strategy in enumerate(strategies): - print(f"\n{'=' * 60}") - print(f"Testing strategy: {strategy_descriptions[strategy]}") - print(f"{'=' * 60}") - - # Collect benchmark data for both algorithms - config_labels = [] - config_x_axis = [] - all_cuda_results = [] - all_baseline_results = [] - all_ratios = [] - - for E, T, H in configs: - total_tokens_config = [8 * E, 16 * E, 32 * E, 64 * E, 128 * E, 256 * E] - config_x_axis.append(total_tokens_config) - - cuda_results = [] - baseline_results = [] - ratios = [] - - for total_tokens in total_tokens_config: - config_label = f"E={E},T={T},H={H},TT={total_tokens}" - config_labels.append(config_label) - - # CUDA kernel results - time_ms_cuda, gflops, gbps, perc = benchmark( - silu_mul_fp8_quant_deep_gemm_cuda, - E, - T, - H, - total_tokens, - runs=runs, - num_warmups=num_warmups, - gen_strategy=strategy, - ) - cuda_results.append((time_ms_cuda, gflops, gbps, perc)) - - # Baseline results - time_ms_triton, gflops, gbps, perc = benchmark( - silu_mul_fp8_quant_deep_gemm_triton, - E, - T, - H, - total_tokens, - runs=runs, - num_warmups=num_warmups, - gen_strategy=strategy, - ) - baseline_results.append((time_ms_triton, gflops, gbps, perc)) - ratios.append(time_ms_triton / time_ms_cuda) - - print(f"Completed: {config_label}") - all_cuda_results.append(cuda_results) - all_baseline_results.append(baseline_results) - all_ratios.append(ratios) - - # Store results for combined plotting - all_results.append( - ( - strategy_descriptions[strategy], - all_ratios, - all_cuda_results, - all_baseline_results, - config_labels, - config_x_axis, - ) - ) - - # Print summary table for this strategy - print(f"\nSummary Table - {strategy_descriptions[strategy]}:") - print(f"{'Config':<20} {'CUDA Time(ms)':<12} {'Base Time(ms)':<12} {'Speedup':<8}") - print("-" * 60) - - for i, (E, T, H) in enumerate(configs): - speedup = baseline_results[i][0] / cuda_results[i][0] - config_label = f"E={E:3d},T={T:4d},H={H:4d}" - print( - f"{config_label:<20} {cuda_results[i][0]:8.5f} " - f"{baseline_results[i][0]:8.5f} {speedup:6.2f}x" - ) - - -def create_total_tokens_plot(all_results): - num_strategies = len(all_results) - num_configs = len(configs) - - # Create side-by-side subplots: 2 columns for speedup and bandwidth percentage - fig, axs = plt.subplots( - num_strategies, num_configs * 2, figsize=(28, 6 * num_strategies) - ) - - # Add main title to the entire figure - 
fig.suptitle( - "Performance Analysis: Speedup vs Bandwidth Utilization (Triton & CUDA)", - fontsize=16, - fontweight="bold", - y=0.98, - ) - - # Handle single strategy case - if num_strategies == 1: - axs = axs.reshape(1, -1) - - # Handle single config case - if num_configs == 1: - axs = axs.reshape(-1, 2) - - for strategy_idx, result in enumerate(all_results): - ( - strategy_name, - all_ratios, - all_cuda_results, - all_baseline_results, - config_labels, - config_x_axis, - ) = result - - for config_idx in range(num_configs): - # Speedup plot (left column) - ax_speedup = axs[strategy_idx, config_idx * 2] - # Bandwidth plot (right column) - ax_bandwidth = axs[strategy_idx, config_idx * 2 + 1] - - E, T, H = configs[config_idx] - ratios = all_ratios[config_idx] - total_tokens_values = config_x_axis[config_idx] - - # Extract CUDA and Triton bandwidth percentages - cuda_bandwidth_percentages = [ - result[3] for result in all_cuda_results[config_idx] - ] - triton_bandwidth_percentages = [ - result[3] for result in all_baseline_results[config_idx] - ] - - # Plot speedup ratios vs total tokens (left plot) - ax_speedup.plot( - total_tokens_values, ratios, "bo-", linewidth=3, markersize=8 - ) - ax_speedup.set_title( - f"{strategy_name}\nSpeedup (CUDA/Triton)\nE={E}, T={T}, H={H}", - fontsize=12, - fontweight="bold", - ) - ax_speedup.set_xlabel("Total Tokens", fontweight="bold", fontsize=11) - ax_speedup.set_ylabel("Speedup Ratio", fontweight="bold", fontsize=11) - ax_speedup.grid(True, alpha=0.3) - - ax_bandwidth.plot( - total_tokens_values, - cuda_bandwidth_percentages, - "ro-", - linewidth=3, - markersize=8, - label="CUDA", - ) - ax_bandwidth.plot( - total_tokens_values, - triton_bandwidth_percentages, - "go-", - linewidth=3, - markersize=8, - label="Triton", - ) - ax_bandwidth.set_title( - f"{strategy_name}\nBandwidth Utilization (Hopper)\nE={E}, T={T}, H={H}", - fontsize=12, - fontweight="bold", - ) - ax_bandwidth.set_xlabel("Total Tokens", fontweight="bold", fontsize=11) - ax_bandwidth.set_ylabel( - "% of Peak Bandwidth", fontweight="bold", fontsize=11 - ) - ax_bandwidth.legend(prop={"weight": "bold"}) - ax_bandwidth.grid(True, alpha=0.3) - - # Format x-axis labels for both plots - for ax in [ax_speedup, ax_bandwidth]: - ax.set_xticks(total_tokens_values) - ax.set_xticklabels( - [ - f"{tt // 1000}K" if tt >= 1000 else str(tt) - for tt in total_tokens_values - ], - fontweight="bold", - ) - # Make tick labels bold - for label in ax.get_xticklabels() + ax.get_yticklabels(): - label.set_fontweight("bold") - - # Add value labels on speedup points - for x, y in zip(total_tokens_values, ratios): - ax_speedup.annotate( - f"{y:.2f}x", - (x, y), - textcoords="offset points", - xytext=(0, 12), - ha="center", - fontsize=10, - fontweight="bold", - bbox=dict(boxstyle="round,pad=0.3", facecolor="white", alpha=0.7), - ) - - # Add value labels on CUDA bandwidth points - for x, y in zip(total_tokens_values, cuda_bandwidth_percentages): - ax_bandwidth.annotate( - f"{y:.1f}%", - (x, y), - textcoords="offset points", - xytext=(0, 12), - ha="center", - fontsize=9, - fontweight="bold", - bbox=dict(boxstyle="round,pad=0.2", facecolor="red", alpha=0.3), - ) - - # Add value labels on Triton bandwidth points - for x, y in zip(total_tokens_values, triton_bandwidth_percentages): - ax_bandwidth.annotate( - f"{y:.1f}%", - (x, y), - textcoords="offset points", - xytext=(0, -15), - ha="center", - fontsize=9, - fontweight="bold", - bbox=dict(boxstyle="round,pad=0.2", facecolor="green", alpha=0.3), - ) - - plt.tight_layout() - 
plt.subplots_adjust(top=0.93) # Make room for main title - filename = "silu_benchmark_total_tokens.png" - plt.savefig(filename, dpi=300, bbox_inches="tight") - plt.show() - - return filename - - -# Create combined plot with all strategies -combined_plot_filename = create_total_tokens_plot(all_results) - -print(f"\n{'=' * 60}") -print("Benchmark Complete!") -print(f"Generated combined plot: {combined_plot_filename}") -print(f"{'=' * 60}") diff --git a/benchmarks/kernels/benchmark_trtllm_decode_attention.py b/benchmarks/kernels/benchmark_trtllm_decode_attention.py deleted file mode 100644 index 6ddab4621..000000000 --- a/benchmarks/kernels/benchmark_trtllm_decode_attention.py +++ /dev/null @@ -1,293 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import csv -import os -from datetime import datetime -from typing import Optional - -import flashinfer -import torch - -from vllm.utils import round_up - -FLOAT32_BYTES = torch.finfo(torch.float).bits // 8 -FP8_DTYPE = torch.float8_e4m3fn -FP4_DTYPE = torch.uint8 - - -def to_float8(x, dtype=torch.float8_e4m3fn): - finfo = torch.finfo(dtype) - min_val, max_val = x.aminmax() - amax = torch.maximum(min_val.abs(), max_val.abs()).clamp(min=1e-12) - scale = finfo.max / amax * 0.1 - x_scl_sat = (x * scale).clamp(min=finfo.min, max=finfo.max) - return x_scl_sat.to(dtype), scale.float().reciprocal() - - -@torch.no_grad() -def benchmark_decode( - dtype: torch.dtype, - quant_dtypes: tuple[ - Optional[torch.dtype], Optional[torch.dtype], Optional[torch.dtype] - ], - batch_size: int, - max_seq_len: int, - num_heads: tuple[int, int] = (64, 8), - head_size: int = 128, - kv_layout: str = "HND", - block_size: int = 16, - warmup: int = 10, - trials: int = 20, -): - torch.set_default_device("cuda") - torch.manual_seed(0) - - q_quant_dtype, kv_quant_dtype, o_quant_dtype = quant_dtypes - q_quant_dtype = q_quant_dtype or dtype - kv_quant_dtype = kv_quant_dtype or dtype - o_quant_dtype = o_quant_dtype or dtype - - num_qo_heads, num_kv_heads = num_heads - assert num_qo_heads % num_kv_heads == 0 - - sm_scale = float(1.0 / (head_size**0.5)) - - # large number to reduce kv_cache reuse - NUM_BLOCKS = int(256000 / block_size) - - kv_cache_shape = None - if kv_layout == "NHD": - kv_cache_shape = (NUM_BLOCKS, 2, block_size, num_kv_heads, head_size) - elif kv_layout == "HND": - kv_cache_shape = (NUM_BLOCKS, 2, num_kv_heads, block_size, head_size) - else: - raise ValueError(f"Invalid kv_layout: {kv_layout}") - - # Always using 1.0 scale to reflect the real perf in benchmarking - q_scale = 1.0 - ref_query = torch.randn(batch_size, num_qo_heads, head_size, dtype=dtype) - if q_quant_dtype == FP8_DTYPE: - query, _ = to_float8(ref_query) - else: - query = ref_query - - kv_lens = torch.randint(1, max_seq_len, (batch_size,), dtype=torch.int32) - kv_lens[-1] = max_seq_len - - seq_lens = kv_lens - max_seq_len = torch.max(seq_lens).item() - - # Always using 1.0 scale to reflect the real perf in benchmarking - k_scale = v_scale = 1.0 - ref_kv_cache = torch.randn(kv_cache_shape, dtype=dtype) - if kv_quant_dtype == FP8_DTYPE: - kv_cache, _ = to_float8(ref_kv_cache) - else: - kv_cache = ref_kv_cache - - max_num_blocks_per_seq = (max_seq_len + block_size - 1) // block_size - block_tables = torch.randint( - 0, NUM_BLOCKS, (batch_size, max_num_blocks_per_seq), dtype=torch.int32 - ) - kv_indptr = [0] - kv_indices = [] - kv_last_page_lens = [] - for i in range(batch_size): - seq_len = seq_lens[i] - assert seq_len > 0 - num_blocks 
= (seq_len + block_size - 1) // block_size - kv_indices.extend(block_tables[i, :num_blocks]) - kv_indptr.append(kv_indptr[-1] + num_blocks) - kv_last_page_len = seq_len % block_size - if kv_last_page_len == 0: - kv_last_page_len = block_size - kv_last_page_lens.append(kv_last_page_len) - - kv_indptr = torch.tensor(kv_indptr, dtype=torch.int32) - kv_indices = torch.tensor(kv_indices, dtype=torch.int32) - kv_last_page_lens = torch.tensor(kv_last_page_lens, dtype=torch.int32) - workspace_buffer = torch.zeros(1024 * 1024 * 1024, dtype=torch.int8) - - wrapper = flashinfer.BatchDecodeWithPagedKVCacheWrapper( - workspace_buffer, - kv_layout, - use_tensor_cores=True, - ) - wrapper.plan( - kv_indptr, - kv_indices, - kv_last_page_lens, - num_qo_heads, - num_kv_heads, - head_size, - block_size, - "NONE", - sm_scale=sm_scale, - q_data_type=dtype, - kv_data_type=dtype, - ) - - def time_fn(fn, warmup=10, trials=20): - torch.cuda.synchronize() - start = torch.cuda.Event(enable_timing=True) - end = torch.cuda.Event(enable_timing=True) - times = [] - for i in range(warmup): - fn() - for i in range(trials): - start.record() - fn() - end.record() - torch.cuda.synchronize() - times.append(start.elapsed_time(end)) # ms - return sum(times) / len(times), torch.std(torch.tensor(times)) - - o_scale = 1.0 - o_sf_scale = None - output_baseline = torch.empty(ref_query.shape, dtype=dtype) - if o_quant_dtype == FP4_DTYPE: - o_sf_scale = 500.0 - output_trtllm = flashinfer.utils.FP4Tensor( - torch.empty(query.shape[:-1] + (query.shape[-1] // 2,), dtype=torch.uint8), - torch.empty( - ( - round_up(query.shape[0], 128), - round_up(query.shape[1] * query.shape[2] // 16, 4), - ), - dtype=torch.float8_e4m3fn, - ), - ) - else: - output_trtllm = torch.empty(query.shape, dtype=o_quant_dtype) - - def baseline_decode(): - return wrapper.run( - ref_query, - ref_kv_cache, - k_scale=k_scale, - v_scale=v_scale, - out=output_baseline, - ) - - def trtllm_decode(): - return flashinfer.decode.trtllm_batch_decode_with_kv_cache( - query=query, - kv_cache=kv_cache, - workspace_buffer=workspace_buffer, - block_tables=block_tables, - seq_lens=seq_lens, - max_seq_len=max_seq_len, - bmm1_scale=q_scale * k_scale * sm_scale, - bmm2_scale=v_scale / o_scale, - o_sf_scale=o_sf_scale, - out=output_trtllm, - ) - - baseline_mean, baseline_std = time_fn(baseline_decode) - trtllm_mean, trtllm_std = time_fn(trtllm_decode) - - # Calculate percentage speedup (positive means TRT is faster) - speedup_percent = (baseline_mean - trtllm_mean) / baseline_mean - - print( - f"\t{batch_size}\t{max_seq_len}\t{trtllm_mean:.3f}\t{trtllm_std.item():.3f}" - f"\t{baseline_mean:.3f}\t{baseline_std.item():.3f}\t{speedup_percent:.3f}" - ) - - # Return results for CSV writing - return { - "batch_size": batch_size, - "trtllm_mean": trtllm_mean, - "trtllm_std": trtllm_std.item(), - "baseline_mean": baseline_mean, - "baseline_std": baseline_std.item(), - "speedup_percent": speedup_percent, - "q_dtype": str(q_quant_dtype), - "kv_cache_dtype": str(kv_quant_dtype), - "output_dtype": str(o_quant_dtype), - "block_size": block_size, - "num_kv_heads": num_kv_heads, - "head_size": head_size, - "max_seq_len": max_seq_len, - } - - -def write_results_to_csv(results, filename=None): - """Write benchmark results to CSV file.""" - if filename is None: - timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") - filename = f"flashinfer_trtllm_benchmark_{timestamp}.csv" - - fieldnames = [ - "batch_size", - "trtllm_mean", - "trtllm_std", - "baseline_mean", - "baseline_std", - "speedup_percent", - 
"q_dtype", - "kv_cache_dtype", - "output_dtype", - "block_size", - "num_kv_heads", - "head_size", - "max_seq_len", - ] - - file_exists = os.path.exists(filename) - - with open(filename, "a", newline="") as csvfile: - writer = csv.DictWriter(csvfile, fieldnames=fieldnames) - - if not file_exists: - writer.writeheader() - - for result in results: - writer.writerow(result) - - print(f"Results written to {filename}") - - -if __name__ == "__main__": - batch_sizes = [1, 4, 8, 16, 32, 64, 128, 256] - max_seq_lens = [1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072] - all_results = [] - - dtype = torch.bfloat16 - quant_dtypes = [ - # (q_quant_dtype, kv_quant_dtype, o_quant_dtype) - (None, None, None), - (None, FP8_DTYPE, None), - (FP8_DTYPE, FP8_DTYPE, None), - (FP8_DTYPE, FP8_DTYPE, FP8_DTYPE), - (FP8_DTYPE, FP8_DTYPE, FP4_DTYPE), - ] - - for quant_dtype in quant_dtypes: - q_quant_dtype, kv_quant_dtype, o_quant_dtype = quant_dtype - q_quant_dtype = q_quant_dtype or dtype - kv_quant_dtype = kv_quant_dtype or dtype - o_quant_dtype = o_quant_dtype or dtype - - print( - f"Running benchmark for q_dtype = {q_quant_dtype}, " - f"kv_cache_dtype: {kv_quant_dtype}, " - f"output_dtype: {o_quant_dtype}" - ) - print( - "\tbatch_size\tmax_seq_len\ttrtllm_mean\ttrtllm_std\tbaseline_mean\t" - "baseline_std\tspeedup_percent" - ) - for max_seq_len in max_seq_lens: - for bs in batch_sizes: - result = benchmark_decode( - dtype=dtype, - quant_dtypes=quant_dtype, - batch_size=bs, - max_seq_len=max_seq_len, - ) - all_results.append(result) - - # Write all results to CSV - write_results_to_csv(all_results) diff --git a/benchmarks/kernels/benchmark_trtllm_prefill_attention.py b/benchmarks/kernels/benchmark_trtllm_prefill_attention.py deleted file mode 100644 index 131df74c7..000000000 --- a/benchmarks/kernels/benchmark_trtllm_prefill_attention.py +++ /dev/null @@ -1,308 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import csv -import os -from datetime import datetime -from typing import Optional - -import flashinfer -import torch - -from vllm.utils import round_up - -FLOAT32_BYTES = torch.finfo(torch.float).bits // 8 -FP8_DTYPE = torch.float8_e4m3fn -FP4_DTYPE = torch.uint8 - - -def to_float8(x, dtype=torch.float8_e4m3fn): - finfo = torch.finfo(dtype) - min_val, max_val = x.aminmax() - amax = torch.maximum(min_val.abs(), max_val.abs()).clamp(min=1e-12) - scale = finfo.max / amax * 0.1 - x_scl_sat = (x * scale).clamp(min=finfo.min, max=finfo.max) - return x_scl_sat.to(dtype), scale.float().reciprocal() - - -@torch.no_grad() -def benchmark_prefill( - dtype: torch.dtype, - quant_dtypes: tuple[ - Optional[torch.dtype], Optional[torch.dtype], Optional[torch.dtype] - ], - batch_size: int, - max_seq_len: int, - num_heads: tuple[int, int] = (64, 8), - head_size: int = 128, - kv_layout: str = "HND", - block_size: int = 16, - warmup: int = 10, - trials: int = 20, -): - torch.set_default_device("cuda") - torch.manual_seed(0) - - q_quant_dtype, kv_quant_dtype, o_quant_dtype = quant_dtypes - q_quant_dtype = q_quant_dtype or dtype - kv_quant_dtype = kv_quant_dtype or dtype - o_quant_dtype = o_quant_dtype or dtype - - max_q_len = max_kv_len = max_seq_len - - num_qo_heads, num_kv_heads = num_heads - assert num_qo_heads % num_kv_heads == 0 - - sm_scale = float(1.0 / (head_size**0.5)) - - # large number to reduce kv_cache reuse - NUM_BLOCKS = int(256000 / block_size) - - kv_cache_shape = None - if kv_layout == "NHD": - kv_cache_shape = (NUM_BLOCKS, 2, block_size, 
num_kv_heads, head_size) - elif kv_layout == "HND": - kv_cache_shape = (NUM_BLOCKS, 2, num_kv_heads, block_size, head_size) - else: - raise ValueError(f"Invalid kv_layout: {kv_layout}") - - q_lens = torch.randint(1, max_q_len, (batch_size,), dtype=torch.int32) - q_lens[-1] = max_q_len - q_indptr = torch.cat( - [ - torch.tensor([0], dtype=torch.int32), - torch.cumsum(q_lens, dim=0, dtype=torch.int32), - ] - ) - - # Always using 1.0 scale to reflect the real perf in benchmarking - q_scale = 1.0 - ref_query = torch.randn( - torch.sum(q_lens).item(), num_qo_heads, head_size, dtype=dtype - ) - if q_quant_dtype == FP8_DTYPE: - query, _ = to_float8(ref_query) - else: - query = ref_query - - kv_lens = torch.randint(0, max_kv_len, (batch_size,), dtype=torch.int32) - kv_lens[-1] = max_kv_len - - seq_lens = kv_lens + q_lens - max_seq_len = torch.max(seq_lens).item() - - # Always using 1.0 scale to reflect the real perf in benchmarking - k_scale = v_scale = 1.0 - ref_kv_cache = torch.randn(kv_cache_shape, dtype=dtype) - if kv_quant_dtype == FP8_DTYPE: - kv_cache, _ = to_float8(ref_kv_cache) - else: - kv_cache = ref_kv_cache - - max_num_blocks_per_seq = (max_seq_len + block_size - 1) // block_size - block_tables = torch.randint( - 0, NUM_BLOCKS, (batch_size, max_num_blocks_per_seq), dtype=torch.int32 - ) - kv_indptr = [0] - kv_indices = [] - kv_last_page_lens = [] - for i in range(batch_size): - seq_len = seq_lens[i] - assert seq_len > 0 - num_blocks = (seq_len + block_size - 1) // block_size - kv_indices.extend(block_tables[i, :num_blocks]) - kv_indptr.append(kv_indptr[-1] + num_blocks) - kv_last_page_len = seq_len % block_size - if kv_last_page_len == 0: - kv_last_page_len = block_size - kv_last_page_lens.append(kv_last_page_len) - - kv_indptr = torch.tensor(kv_indptr, dtype=torch.int32) - kv_indices = torch.tensor(kv_indices, dtype=torch.int32) - kv_last_page_lens = torch.tensor(kv_last_page_lens, dtype=torch.int32) - workspace_buffer = torch.zeros(1024 * 1024 * 1024, dtype=torch.int8) - - wrapper = flashinfer.BatchPrefillWithPagedKVCacheWrapper( - workspace_buffer, kv_layout - ) - wrapper.plan( - q_indptr, - kv_indptr, - kv_indices, - kv_last_page_lens, - num_qo_heads, - num_kv_heads, - head_size, - block_size, - causal=True, - sm_scale=sm_scale, - q_data_type=dtype, - kv_data_type=dtype, - ) - - def time_fn(fn, warmup=10, trials=20): - torch.cuda.synchronize() - start = torch.cuda.Event(enable_timing=True) - end = torch.cuda.Event(enable_timing=True) - times = [] - for i in range(warmup): - fn() - for i in range(trials): - start.record() - fn() - end.record() - torch.cuda.synchronize() - times.append(start.elapsed_time(end)) # ms - return sum(times) / len(times), torch.std(torch.tensor(times)) - - o_scale = 1.0 - o_sf_scale = None - output_baseline = torch.empty(ref_query.shape, dtype=dtype) - if o_quant_dtype == FP4_DTYPE: - o_sf_scale = 500.0 - output_trtllm = flashinfer.utils.FP4Tensor( - torch.empty(query.shape[:-1] + (query.shape[-1] // 2,), dtype=torch.uint8), - torch.empty( - ( - round_up(query.shape[0], 128), - round_up(query.shape[1] * query.shape[2] // 16, 4), - ), - dtype=torch.float8_e4m3fn, - ), - ) - else: - output_trtllm = torch.empty(query.shape, dtype=o_quant_dtype) - - def baseline_prefill(): - return wrapper.run( - ref_query, - ref_kv_cache, - k_scale=k_scale, - v_scale=v_scale, - out=output_baseline, - ) - - def trtllm_prefill(): - return flashinfer.prefill.trtllm_batch_context_with_kv_cache( - query=query, - kv_cache=kv_cache, - workspace_buffer=workspace_buffer, - 
block_tables=block_tables, - seq_lens=seq_lens, - max_q_len=max_q_len, - max_kv_len=max_seq_len, - bmm1_scale=q_scale * k_scale * sm_scale, - bmm2_scale=v_scale / o_scale, - batch_size=batch_size, - cum_seq_lens_q=q_indptr, - cum_seq_lens_kv=kv_indptr, - o_sf_scale=o_sf_scale, - out=output_trtllm, - ) - - baseline_mean, baseline_std = time_fn(baseline_prefill) - trtllm_mean, trtllm_std = time_fn(trtllm_prefill) - - # Calculate percentage speedup (positive means TRT is faster) - speedup_percent = (baseline_mean - trtllm_mean) / baseline_mean - - print( - f"\t{batch_size}\t{max_seq_len}\t{trtllm_mean:8.3f}\t{trtllm_std.item():8.3f}" - f"\t{baseline_mean:8.3f}\t{baseline_std.item():8.3f}\t{speedup_percent:8.3f}" - ) - - # Return results for CSV writing - return { - "batch_size": batch_size, - "trtllm_mean": trtllm_mean, - "trtllm_std": trtllm_std.item(), - "baseline_mean": baseline_mean, - "baseline_std": baseline_std.item(), - "speedup_percent": speedup_percent, - "q_dtype": str(q_quant_dtype), - "kv_cache_dtype": str(kv_quant_dtype), - "output_dtype": str(o_quant_dtype), - "block_size": block_size, - "num_kv_heads": num_kv_heads, - "head_size": head_size, - "max_seq_len": max_seq_len, - } - - -def write_results_to_csv(results, filename=None): - """Write benchmark results to CSV file.""" - if filename is None: - timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") - filename = f"flashinfer_trtllm_benchmark_{timestamp}.csv" - - fieldnames = [ - "batch_size", - "trtllm_mean", - "trtllm_std", - "baseline_mean", - "baseline_std", - "speedup_percent", - "q_dtype", - "kv_cache_dtype", - "output_dtype", - "block_size", - "num_kv_heads", - "head_size", - "max_seq_len", - ] - - file_exists = os.path.exists(filename) - - with open(filename, "a", newline="") as csvfile: - writer = csv.DictWriter(csvfile, fieldnames=fieldnames) - - if not file_exists: - writer.writeheader() - - for result in results: - writer.writerow(result) - - print(f"Results written to {filename}") - - -if __name__ == "__main__": - batch_sizes = [1, 4, 8, 16, 32, 64, 128, 256] - max_seq_lens = [1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072] - all_results = [] - - dtype = torch.bfloat16 - quant_dtypes = [ - # (q_quant_dtype, kv_quant_dtype, o_quant_dtype) - (None, None, None), - (FP8_DTYPE, FP8_DTYPE, None), - (FP8_DTYPE, FP8_DTYPE, FP8_DTYPE), - (FP8_DTYPE, FP8_DTYPE, FP4_DTYPE), - ] - - for quant_dtype in quant_dtypes: - q_quant_dtype, kv_quant_dtype, o_quant_dtype = quant_dtype - q_quant_dtype = q_quant_dtype or dtype - kv_quant_dtype = kv_quant_dtype or dtype - o_quant_dtype = o_quant_dtype or dtype - - print( - f"Running benchmark for q_dtype = {q_quant_dtype}, " - f"kv_cache_dtype: {kv_quant_dtype}, " - f"output_dtype: {o_quant_dtype}" - ) - print( - "\tbatch_size\tmax_seq_len\ttrtllm_mean\ttrtllm_std\tbaseline_mean\t" - "baseline_std\tspeedup_percent" - ) - for max_seq_len in max_seq_lens: - for bs in batch_sizes: - result = benchmark_prefill( - dtype=dtype, - quant_dtypes=quant_dtype, - batch_size=bs, - max_seq_len=max_seq_len, - ) - all_results.append(result) - - # Write all results to CSV - write_results_to_csv(all_results) diff --git a/benchmarks/kernels/benchmark_w8a8_block_fp8.py b/benchmarks/kernels/benchmark_w8a8_block_fp8.py deleted file mode 100644 index c6c8e0b0b..000000000 --- a/benchmarks/kernels/benchmark_w8a8_block_fp8.py +++ /dev/null @@ -1,415 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -# Adapted from sglang 
quantization/tuning_block_wise_kernel.py - -import argparse -import json -import multiprocessing as mp -import os -import time -from datetime import datetime -from typing import Any - -import torch -from tqdm import tqdm - -from vllm.model_executor.layers.quantization.utils.fp8_utils import ( - _w8a8_block_fp8_matmul, -) -from vllm.platforms import current_platform -from vllm.triton_utils import triton -from vllm.utils import FlexibleArgumentParser - -mp.set_start_method("spawn", force=True) - -assert current_platform.is_cuda(), ( - "Only support tune w8a8 block fp8 kernel on CUDA device." -) - -DTYPE_MAP = { - "float32": torch.float32, - "float16": torch.float16, - "half": torch.half, - "bfloat16": torch.bfloat16, -} - - -def w8a8_block_matmul( - A: torch.Tensor, - B: torch.Tensor, - As: torch.Tensor, - Bs: torch.Tensor, - block_size: list[int], - config: dict[str, Any], - output_dtype: torch.dtype = torch.float16, -) -> torch.Tensor: - """This function performs matrix multiplication with - block-wise quantization. - - It takes two input tensors `A` and `B` with scales `As` and `Bs`. - The output is returned in the specified `output_dtype`. - - Args: - A: The input tensor, e.g., activation. - B: The input tensor, e.g., weight. - As: The per-token-group quantization scale for `A`. - Bs: The per-block quantization scale for `B`. - block_size: The block size for per-block quantization. - It should be 2-dim, e.g., [128, 128]. - output_dtype: The dtype of the returned tensor. - - Returns: - torch.Tensor: The result of matmul. - """ - assert len(block_size) == 2 - block_n, block_k = block_size[0], block_size[1] - - assert A.shape[-1] == B.shape[-1] - assert A.shape[:-1] == As.shape[:-1] and A.is_contiguous() - assert triton.cdiv(A.shape[-1], block_k) == As.shape[-1] - M = A.numel() // A.shape[-1] - - assert B.ndim == 2 and B.is_contiguous() and Bs.ndim == 2 - N, K = B.shape - assert triton.cdiv(N, block_n) == Bs.shape[0] - assert triton.cdiv(K, block_k) == Bs.shape[1] - - C_shape = A.shape[:-1] + (N,) - C = A.new_empty(C_shape, dtype=output_dtype) - - def grid(META): - return ( - triton.cdiv(M, META["BLOCK_SIZE_M"]) * triton.cdiv(N, META["BLOCK_SIZE_N"]), - ) - - if A.dtype == torch.float8_e4m3fn: - kernel = _w8a8_block_fp8_matmul - else: - raise RuntimeError("Currently, only support tune w8a8 block fp8 kernel.") - - kernel[grid]( - A, - B, - C, - As, - Bs, - M, - N, - K, - block_n, - block_k, - A.stride(-2), - A.stride(-1), - B.stride(1), - B.stride(0), - C.stride(-2), - C.stride(-1), - As.stride(-2), - As.stride(-1), - Bs.stride(1), - Bs.stride(0), - **config, - ) - - return C - - -def get_configs_compute_bound(): - configs = [] - for num_stages in [2, 3, 4, 5]: - for block_m in [16, 32, 64, 128, 256]: - for block_k in [64, 128]: - for block_n in [32, 64, 128, 256]: - for num_warps in [4, 8]: - for group_size in [1, 16, 32, 64]: - configs.append( - { - "BLOCK_SIZE_M": block_m, - "BLOCK_SIZE_N": block_n, - "BLOCK_SIZE_K": block_k, - "GROUP_SIZE_M": group_size, - "num_warps": num_warps, - "num_stages": num_stages, - } - ) - return configs - - -def get_weight_shapes(tp_size): - # NOTE(HandH1998): The weight shapes only works for DeepSeek-V3. - # Modify them, if you tune for another different model. 
- # cannot TP - total = [ - (512 + 64, 7168), - (2112, 7168), - ((128 + 64) * 128, 7168), - (128 * (128 + 128), 512), - (7168, 16384), - (7168, 18432), - ] - # N can TP - n_tp = [ - (18432 * 2, 7168), - ((128 + 64) * 128, 7168), - (128 * (128 + 128), 512), - (24576, 1536), - (12288, 7168), - (4096, 7168), - ] - # K can TP - k_tp = [(7168, 18432), (7168, 16384), (7168, 2048)] - - weight_shapes = [] - for t in total: - weight_shapes.append(t) - for n_t in n_tp: - new_t = (n_t[0] // tp_size, n_t[1]) - weight_shapes.append(new_t) - for k_t in k_tp: - new_t = (k_t[0], k_t[1] // tp_size) - weight_shapes.append(new_t) - return weight_shapes - - -def benchmark_config( - A, B, As, Bs, block_size, config, out_dtype=torch.float16, num_iters=10 -): - def run(): - w8a8_block_matmul(A, B, As, Bs, block_size, config, out_dtype) - - torch.cuda.synchronize() - # JIT complication & warmup - for _ in range(5): - run() - torch.cuda.synchronize() - - start_event = torch.cuda.Event(enable_timing=True) - end_event = torch.cuda.Event(enable_timing=True) - - latencies: list[float] = [] - for i in range(num_iters): - torch.cuda.synchronize() - start_event.record() - run() - end_event.record() - end_event.synchronize() - latencies.append(start_event.elapsed_time(end_event)) - avg = sum(latencies) / (num_iters * 10) * 1000 # us - return avg - - -def tune(M, N, K, block_size, out_dtype, search_space, input_type): - factor_for_scale = 1e-2 - - if input_type == "fp8": - fp8_info = torch.finfo(torch.float8_e4m3fn) - fp8_max, fp8_min = fp8_info.max, fp8_info.min - - A_fp32 = ( - (torch.rand(M, K, dtype=torch.float32, device="cuda") - 0.5) * 2 * fp8_max - ) - A = A_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn) - - B_fp32 = ( - (torch.rand(N, K, dtype=torch.float32, device="cuda") - 0.5) * 2 * fp8_max - ) - B = B_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn) - else: - raise RuntimeError("Currently, only support tune w8a8 block fp8 kernel.") - - block_n, block_k = block_size[0], block_size[1] - n_tiles = (N + block_n - 1) // block_n - k_tiles = (K + block_k - 1) // block_k - - As = torch.rand(M, k_tiles, dtype=torch.float32, device="cuda") * factor_for_scale - Bs = ( - torch.rand(n_tiles, k_tiles, dtype=torch.float32, device="cuda") - * factor_for_scale - ) - - best_config = None - best_time = float("inf") - for config in tqdm(search_space): - try: - kernel_time = benchmark_config( - A, - B, - As, - Bs, - block_size, - config, - out_dtype, - num_iters=10, - ) - except triton.runtime.autotuner.OutOfResources: - # Some configurations may be invalid and fail to compile. 
- continue - - if kernel_time < best_time: - best_time = kernel_time - best_config = config - now = datetime.now() - print(f"{now.ctime()}] Completed tuning for batch_size={M}") - assert best_config is not None - return best_config - - -def save_configs( - N, - K, - block_n, - block_k, - configs, - save_path, - input_type="fp8", -) -> None: - os.makedirs(save_path, exist_ok=True) - device_name = current_platform.get_device_name().replace(" ", "_") - json_file_name = ( - f"N={N},K={K},device_name={device_name},dtype={input_type}_w8a8," - f"block_shape=[{block_n},{block_k}].json" - ) - - config_file_path = os.path.join(save_path, json_file_name) - print(f"Writing best config to {config_file_path}...") - - with open(config_file_path, "w") as f: - json.dump(configs, f, indent=4) - f.write("\n") - - -def tune_on_gpu(args_dict): - """Run tuning on a specific GPU.""" - gpu_id = args_dict["gpu_id"] - batch_sizes = args_dict["batch_sizes"] - weight_shapes = args_dict["weight_shapes"] - args = args_dict["args"] - - torch.cuda.set_device(gpu_id) - print(f"Starting tuning on GPU {gpu_id} with batch sizes {batch_sizes}") - - block_n = args.block_n - block_k = args.block_k - out_dtype = DTYPE_MAP[args.out_dtype] - save_path = args.save_path - input_type = args.input_type - - search_space = get_configs_compute_bound() - search_space = [ - config for config in search_space if block_k % config["BLOCK_SIZE_K"] == 0 - ] - - start = time.time() - for shape in tqdm(weight_shapes, desc=f"GPU {gpu_id} - Shapes"): - N, K = shape[0], shape[1] - print(f"[GPU {gpu_id}] Tune for weight shape of `N: {N}, K: {K}`") - benchmark_results = [ - tune( - batch_size, - N, - K, - [block_n, block_k], - out_dtype, - search_space, - input_type, - ) - for batch_size in tqdm(batch_sizes, desc=f"GPU {gpu_id} - Batch sizes") - ] - best_configs = {M: config for M, config in zip(batch_sizes, benchmark_results)} - save_configs(N, K, block_n, block_k, best_configs, save_path, input_type) - - end = time.time() - print(f"Tuning on GPU {gpu_id} took {end - start:.2f} seconds") - - -def distribute_batch_sizes(batch_sizes, num_gpus): - """Distribute batch sizes across available GPUs.""" - batches_per_gpu = [] - for i in range(num_gpus): - start_idx = i * len(batch_sizes) // num_gpus - end_idx = (i + 1) * len(batch_sizes) // num_gpus - batches_per_gpu.append(batch_sizes[start_idx:end_idx]) - return batches_per_gpu - - -def main(args): - print(args) - num_gpus = torch.cuda.device_count() - if num_gpus == 0: - raise RuntimeError("No GPU available for tuning") - print(f"Found {num_gpus} GPUs for parallel tuning") - - torch.cuda.init() - - if args.batch_size is None: - batch_sizes = [ - 1, - 2, - 4, - 8, - 16, - 24, - 32, - 48, - 64, - 96, - 128, - 256, - 512, - 1024, - 1536, - 2048, - 3072, - 4096, - ] - else: - batch_sizes = [args.batch_size] - num_gpus = 1 # If only one batch size, use only one GPU - - weight_shapes = get_weight_shapes(args.tp_size) - - batches_per_gpu = distribute_batch_sizes(batch_sizes, num_gpus) - - process_args = [] - for gpu_id in range(num_gpus): - process_args.append( - { - "gpu_id": gpu_id, - "batch_sizes": batches_per_gpu[gpu_id], - "weight_shapes": weight_shapes, # Each GPU processes all weight shapes - "args": args, - } - ) - - ctx = mp.get_context("spawn") - with ctx.Pool(num_gpus) as pool: - pool.map(tune_on_gpu, process_args) - - print("Multi-GPU tuning completed") - - -if __name__ == "__main__": - parser = FlexibleArgumentParser( - description=""" -Tune triton w8a8 block fp8 for DeepSeek-V3/DeepSeek-R1: - 
python3 benchmark_w8a8_block_fp8.py --tp-size 8 --input-type fp8 -Then copy to model_executor/layers/quantization/utils/configs - """, - formatter_class=argparse.RawTextHelpFormatter, - ) - - parser.add_argument("--tp-size", "-tp", type=int, default=8) - parser.add_argument("--input-type", type=str, choices=["fp8"], default="fp8") - parser.add_argument( - "--out-dtype", - type=str, - choices=["float32", "float16", "bfloat16", "half"], - default="float16", - ) - parser.add_argument("--block-n", type=int, default=128) - parser.add_argument("--block-k", type=int, default=128) - parser.add_argument("--batch-size", type=int, required=False) - parser.add_argument("--save-path", type=str, default="./") - args = parser.parse_args() - - main(args) diff --git a/benchmarks/kernels/deepgemm/README.md b/benchmarks/kernels/deepgemm/README.md deleted file mode 100644 index 41e68e047..000000000 --- a/benchmarks/kernels/deepgemm/README.md +++ /dev/null @@ -1,129 +0,0 @@ -# DeepSeek DeepGEMM Kernels Benchmark - -This directory includes benchmarks between DeepSeek's DeepGEMM block fp8 kernels against vLLM's existing triton and CUTLASS-based kernels. - -Currently this just includes dense GEMMs and only works on Hopper GPUs. - -## Setup - -You need to install vLLM in your usual fashion, then install DeepGEMM from source in its own directory: - -```bash -git clone --recursive https://github.com/deepseek-ai/DeepGEMM -cd DeepGEMM -python setup.py install -uv pip install -e . -``` - -## Usage - -```console -python benchmark_fp8_block_dense_gemm.py -INFO 02-26 21:55:13 [__init__.py:207] Automatically detected platform cuda. -===== STARTING FP8 GEMM BENCHMARK ===== -PyTorch version: 2.5.1+cu124 -CUDA version: 12.4 -Triton version: 3.1.0 -Using device: NVIDIA H100 80GB HBM3 -WARNING 02-26 21:55:15 [fp8_utils.py:458] Using default W8A8 Block FP8 kernel config. Performance might be sub-optimal! Config file not found at /home/mgoin/code/vllm/vllm/model_executor/layers/quantization/utils/configs/N=4096,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json -INFO 02-26 21:55:15 [fp8_utils.py:449] Using configuration from /home/mgoin/code/vllm/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json for W8A8 Block FP8 kernel. -WARNING 02-26 21:55:16 [fp8_utils.py:458] Using default W8A8 Block FP8 kernel config. Performance might be sub-optimal! Config file not found at /home/mgoin/code/vllm/vllm/model_executor/layers/quantization/utils/configs/N=18432,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json -WARNING 02-26 21:55:17 [fp8_utils.py:458] Using default W8A8 Block FP8 kernel config. Performance might be sub-optimal! Config file not found at /home/mgoin/code/vllm/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json -INFO 02-26 21:55:17 [fp8_utils.py:449] Using configuration from /home/mgoin/code/vllm/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json for W8A8 Block FP8 kernel. -INFO 02-26 21:55:17 [fp8_utils.py:449] Using configuration from /home/mgoin/code/vllm/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json for W8A8 Block FP8 kernel. 
- -===== PERFORMANCE COMPARISON ===== - -DeepGEMM Implementation: -+------+-------+-------+-----------+--------+--------+ -| m | n | k | Time (μs) | TFLOPS | GB/s | -+------+-------+-------+-----------+--------+--------+ -| 8 | 4096 | 7168 | 102.9 | 4.6 | 286.4 | -| 8 | 7168 | 18432 | 70.8 | 29.8 | 1868.8 | -| 8 | 18432 | 7168 | 69.3 | 30.5 | 1911.8 | -| 64 | 4096 | 7168 | 69.1 | 54.4 | 439.0 | -| 64 | 7168 | 18432 | 69.4 | 243.6 | 1933.6 | -| 64 | 18432 | 7168 | 70.4 | 240.3 | 1917.2 | -| 64 | 24576 | 1536 | 70.1 | 68.9 | 584.6 | -| 64 | 32768 | 512 | 68.4 | 31.4 | 307.1 | -| 64 | 7168 | 16384 | 69.5 | 216.3 | 1718.5 | -| 128 | 4096 | 7168 | 141.1 | 53.3 | 222.1 | -| 128 | 7168 | 18432 | 71.9 | 470.5 | 1896.1 | -| 128 | 18432 | 7168 | 69.3 | 488.2 | 1988.2 | -| 1024 | 4096 | 7168 | 89.7 | 670.1 | 502.5 | -| 1024 | 18432 | 7168 | 279.0 | 969.8 | 635.2 | -| 2048 | 4096 | 7168 | 175.1 | 687.0 | 347.4 | -| 4096 | 4096 | 7168 | 335.4 | 717.0 | 275.1 | -+------+-------+-------+-----------+--------+--------+ - -vLLM Triton Implementation: -+------+-------+-------+-----------+--------+--------+--------------+ -| m | n | k | Time (μs) | TFLOPS | GB/s | vs DeepGEMM | -+------+-------+-------+-----------+--------+--------+--------------+ -| 8 | 4096 | 7168 | 74.0 | 6.3 | 398.2 | 1.39x faster | -| 8 | 7168 | 18432 | 89.6 | 23.6 | 1478.1 | 0.79x slower | -| 8 | 18432 | 7168 | 113.2 | 18.7 | 1170.4 | 0.61x slower | -| 64 | 4096 | 7168 | 79.4 | 47.3 | 382.2 | 0.87x slower | -| 64 | 7168 | 18432 | 98.5 | 171.7 | 1363.0 | 0.70x slower | -| 64 | 18432 | 7168 | 119.5 | 141.5 | 1129.4 | 0.59x slower | -| 64 | 24576 | 1536 | 37.6 | 128.4 | 1089.7 | 1.86x faster | -| 64 | 32768 | 512 | 38.7 | 55.5 | 542.6 | 1.77x faster | -| 64 | 7168 | 16384 | 86.1 | 174.5 | 1386.4 | 0.81x slower | -| 128 | 4096 | 7168 | 90.7 | 82.9 | 345.4 | 1.56x faster | -| 128 | 7168 | 18432 | 144.0 | 234.9 | 946.9 | 0.50x slower | -| 128 | 18432 | 7168 | 229.5 | 147.4 | 600.1 | 0.30x slower | -| 1024 | 4096 | 7168 | 242.3 | 248.2 | 186.1 | 0.37x slower | -| 1024 | 18432 | 7168 | 897.8 | 301.4 | 197.4 | 0.31x slower | -| 2048 | 4096 | 7168 | 463.0 | 259.7 | 131.4 | 0.38x slower | -| 4096 | 4096 | 7168 | 901.8 | 266.7 | 102.3 | 0.37x slower | -+------+-------+-------+-----------+--------+--------+--------------+ - -vLLM CUTLASS Implementation: -+------+-------+-------+-----------+--------+--------+--------------+--------------+ -| m | n | k | Time (μs) | TFLOPS | GB/s | vs DeepGEMM | vs Triton | -+------+-------+-------+-----------+--------+--------+--------------+--------------+ -| 8 | 4096 | 7168 | 34.6 | 13.6 | 852.3 | 2.98x faster | 2.14x faster | -| 8 | 7168 | 18432 | 78.9 | 26.8 | 1677.3 | 0.90x slower | 1.13x faster | -| 8 | 18432 | 7168 | 81.2 | 26.0 | 1631.1 | 0.85x slower | 1.39x faster | -| 64 | 4096 | 7168 | 36.9 | 101.9 | 822.9 | 1.87x faster | 2.15x faster | -| 64 | 7168 | 18432 | 87.4 | 193.4 | 1535.2 | 0.79x slower | 1.13x faster | -| 64 | 18432 | 7168 | 85.0 | 199.0 | 1587.6 | 0.83x slower | 1.41x faster | -| 64 | 24576 | 1536 | 28.0 | 172.8 | 1465.8 | 2.51x faster | 1.35x faster | -| 64 | 32768 | 512 | 28.8 | 74.5 | 728.5 | 2.37x faster | 1.34x faster | -| 64 | 7168 | 16384 | 77.9 | 193.0 | 1532.8 | 0.89x slower | 1.11x faster | -| 128 | 4096 | 7168 | 39.1 | 192.4 | 802.0 | 3.61x faster | 2.32x faster | -| 128 | 7168 | 18432 | 93.7 | 360.8 | 1454.2 | 0.77x slower | 1.54x faster | -| 128 | 18432 | 7168 | 85.7 | 394.8 | 1608.0 | 0.81x slower | 2.68x faster | -| 1024 | 4096 | 7168 | 99.7 | 603.1 | 452.2 | 0.90x slower | 
2.43x faster | -| 1024 | 18432 | 7168 | 331.3 | 816.7 | 534.9 | 0.84x slower | 2.71x faster | -| 2048 | 4096 | 7168 | 198.3 | 606.6 | 306.7 | 0.88x slower | 2.34x faster | -| 4096 | 4096 | 7168 | 392.2 | 613.2 | 235.3 | 0.86x slower | 2.30x faster | -+------+-------+-------+-----------+--------+--------+--------------+--------------+ - -===== AVERAGE PERFORMANCE ===== -+----------------+------------+----------+---------------+ -| Implementation | Avg TFLOPS | Avg GB/s | Avg Time (ms) | -+----------------+------------+----------+---------------+ -| DeepGEMM | 310.98 | 1052.10 | 0.11 | -| vLLM Triton | 144.30 | 715.60 | 0.23 | -| vLLM CUTLASS | 286.78 | 1076.67 | 0.11 | -+----------------+------------+----------+---------------+ - -===== AVERAGE SPEEDUPS ===== -+-----------------------------+--------------+ -| Comparison | Speedup | -+-----------------------------+--------------+ -| DeepGEMM vs vLLM Triton | 1.71x faster | -| DeepGEMM vs vLLM CUTLASS | 0.94x slower | -| vLLM CUTLASS vs vLLM Triton | 1.84x faster | -+-----------------------------+--------------+ - -===== ACCURACY COMPARISON ===== -+----------------+-----------------------+ -| Implementation | Avg Diff vs Reference | -+----------------+-----------------------+ -| DeepGEMM | 0.000684 | -| vLLM Triton | 0.000684 | -| vLLM CUTLASS | 0.000684 | -+----------------+-----------------------+ -``` diff --git a/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py b/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py deleted file mode 100644 index db2398fc4..000000000 --- a/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py +++ /dev/null @@ -1,427 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -# fmt: off -# ruff: noqa: E501 -import time - -import torch - -from vllm import _custom_ops as ops -from vllm.model_executor.layers.quantization.utils.fp8_utils import ( - per_token_group_quant_fp8, - w8a8_block_fp8_matmul, -) -from vllm.triton_utils import triton -from vllm.utils.deep_gemm import ( - calc_diff, - fp8_gemm_nt, - get_col_major_tma_aligned_tensor, - per_block_cast_to_fp8, -) - - -def benchmark_shape(m: int, - n: int, - k: int, - warmup: int = 100, - repeat: int = 10000, - verbose: bool = False) -> dict: - """Benchmark all implementations for a specific (m, n, k) shape.""" - if verbose: - print(f"\n=== Benchmarking shape: m={m}, n={n}, k={k} ===") - - # Create test tensors - A = torch.randn((m, k), device='cuda', dtype=torch.bfloat16) - B = torch.randn((n, k), device='cuda', dtype=torch.bfloat16) - - # Reference result in BF16 - torch.cuda.synchronize() - C_ref = A @ B.t() - - # Pre-quantize B for all implementations - # (weights can be pre-quantized offline) - B_deepgemm, B_scale_deepgemm = per_block_cast_to_fp8(B, [128, 128], use_ue8m0=True) - B_vllm, B_scale_vllm = per_block_cast_to_fp8(B, [128, 128], use_ue8m0=True) - - # Block size configuration - block_size = [128, 128] - - # Pre-quantize A for all implementations - A_deepgemm, A_scale_deepgemm = per_token_group_quant_fp8(A, block_size[1]) - A_scale_deepgemm = get_col_major_tma_aligned_tensor(A_scale_deepgemm) - C_deepgemm = torch.empty((m, n), device='cuda', dtype=torch.bfloat16) - A_vllm, A_scale_vllm = per_token_group_quant_fp8(A, block_size[1]) - A_vllm_cutlass, A_scale_vllm_cutlass = per_token_group_quant_fp8( - A, block_size[1], column_major_scales=True) - - # === DeepGEMM Implementation === - def deepgemm_gemm(): - fp8_gemm_nt((A_deepgemm, A_scale_deepgemm), - (B_deepgemm, 
B_scale_deepgemm), - C_deepgemm) - return C_deepgemm - - # === vLLM Triton Implementation === - def vllm_triton_gemm(): - return w8a8_block_fp8_matmul(A_vllm, - B_vllm, - A_scale_vllm, - B_scale_vllm, - block_size, - output_dtype=torch.bfloat16) - - # === vLLM CUTLASS Implementation === - def vllm_cutlass_gemm(): - return ops.cutlass_scaled_mm(A_vllm_cutlass, - B_vllm.T, - scale_a=A_scale_vllm_cutlass, - scale_b=B_scale_vllm.T, - out_dtype=torch.bfloat16) - - # Run correctness check first - if verbose: - print("Running correctness check...") - C_deepgemm = deepgemm_gemm() - C_vllm_triton = vllm_triton_gemm() - C_vllm_cutlass = vllm_cutlass_gemm() - - deepgemm_diff = calc_diff(C_deepgemm, C_ref) - vllm_triton_diff = calc_diff(C_vllm_triton, C_ref) - vllm_cutlass_diff = calc_diff(C_vllm_cutlass, C_ref) - - if verbose: - print(f"DeepGEMM vs Reference difference: {deepgemm_diff:.6f}") - print(f"vLLM Triton vs Reference difference: {vllm_triton_diff:.6f}") - print(f"vLLM CUTLASS vs Reference difference: {vllm_cutlass_diff:.6f}") - print("vLLM Triton vs DeepGEMM difference: " - f"{calc_diff(C_vllm_triton, C_deepgemm):.6f}") - print("vLLM CUTLASS vs DeepGEMM difference: " - f"{calc_diff(C_vllm_cutlass, C_deepgemm):.6f}") - - # Benchmark implementations - implementations = { - "DeepGEMM": deepgemm_gemm, - "vLLM Triton": vllm_triton_gemm, - "vLLM CUTLASS": vllm_cutlass_gemm - } - - benchmark_results = { - "shape": { - "m": m, - "n": n, - "k": k - }, - "implementations": {} - } - - for name, func in implementations.items(): - # Warmup - for _ in range(warmup): - func() - torch.cuda.synchronize() - - # Timing loop - torch.cuda.synchronize() - start = time.time() - for _ in range(repeat): - func() - torch.cuda.synchronize() - end = time.time() - - # Calculate timing and TFLOPS - avg_time_ms = (end - start) / repeat * 1000 - avg_time_us = avg_time_ms * 1000 - tflops = 2 * m * n * k / (avg_time_ms * 1e-3) / 1e12 - gb_s = (m * k + k * n + m * n * 2) / 1e9 / (avg_time_ms * 1e-3) - - benchmark_results["implementations"][name] = { - "time_ms": avg_time_ms, - "time_us": avg_time_us, - "tflops": tflops, - "gb_s": gb_s, - "diff": { - "DeepGEMM": - 0.0 if name == "DeepGEMM" else calc_diff(func(), C_deepgemm), - "Reference": - deepgemm_diff if name == "DeepGEMM" else - (vllm_triton_diff - if name == "vLLM Triton" else vllm_cutlass_diff) - } - } - - if verbose: - print( - f"{name}: {avg_time_ms:.3f} ms, {tflops:.2f} TFLOPS, {gb_s:.2f} GB/s" - ) - - # Calculate speedups - baseline = benchmark_results["implementations"]["DeepGEMM"]["time_ms"] - for name, data in benchmark_results["implementations"].items(): - if name != "DeepGEMM": - speedup = baseline / data["time_ms"] - benchmark_results["implementations"][name][ - "speedup_vs_deepgemm"] = speedup - if verbose: - print(f"DeepGEMM is {1/speedup:.2f}x " - f"{'faster' if 1/speedup > 1 else 'slower'} than {name}") - - vllm_triton_time = benchmark_results["implementations"]["vLLM Triton"][ - "time_ms"] - vllm_cutlass_time = benchmark_results["implementations"]["vLLM CUTLASS"][ - "time_ms"] - cutlass_vs_triton = vllm_triton_time / vllm_cutlass_time - benchmark_results["implementations"]["vLLM CUTLASS"][ - "speedup_vs_triton"] = cutlass_vs_triton - if verbose: - print( - f"vLLM CUTLASS is {cutlass_vs_triton:.2f}x " - f"{'faster' if cutlass_vs_triton > 1 else 'slower'} than vLLM Triton" - ) - - return benchmark_results - - -def format_table_row(values, widths): - """Format a row with specified column widths.""" - return "| " + " | ".join(f"{val:{w}}" - for val, w in 
zip(values, widths)) + " |" - - -def print_table(headers, rows, title=None): - """Print a table with headers and rows.""" - if title: - print(f"\n{title}") - - # Calculate column widths based on headers and data - widths = [ - max(len(str(h)), max(len(str(row[i])) for row in rows)) - for i, h in enumerate(headers) - ] - - # Create separator line - separator = "+-" + "-+-".join("-" * w for w in widths) + "-+" - - # Print table - print(separator) - print(format_table_row(headers, widths)) - print(separator) - for row in rows: - print(format_table_row(row, widths)) - print(separator) - - -def format_speedup(value): - """Format speedup value with indicator if it's faster or slower.""" - return f"{value:.2f}x {'faster' if value > 1.0 else 'slower'}" - - -def run_benchmarks(verbose: bool = False): - """Run benchmarks for a set of common shapes.""" - print("===== STARTING FP8 GEMM BENCHMARK =====") - - # Make sure we're using the GPU - if not torch.cuda.is_available(): - print("CUDA not available! Tests require GPU.") - return - - # Print system information - print(f"PyTorch version: {torch.__version__}") - print(f"CUDA version: {torch.version.cuda}") - print(f"Triton version: {triton.__version__}") - print(f"Using device: {torch.cuda.get_device_name()}") - - # Enable TF32 for better performance - torch.backends.cuda.matmul.allow_tf32 = True - torch.backends.cudnn.allow_tf32 = True - - # Set seeds for reproducibility - torch.manual_seed(42) - torch.cuda.manual_seed(42) - - # Define benchmark shapes (m, n, k) - shapes = [ - (8, 4096, 7168), - (8, 7168, 18432), - (8, 18432, 7168), - (64, 4096, 7168), - (64, 7168, 18432), - (64, 18432, 7168), - (64, 24576, 1536), - (64, 32768, 512), - (64, 7168, 16384), - (128, 4096, 7168), - (128, 7168, 18432), - (128, 18432, 7168), - (1024, 4096, 7168), - (1024, 18432, 7168), - (2048, 4096, 7168), - (4096, 4096, 7168), - ] - shapes = [ - # (64, 2112, 7168), - (64, 24576, 1536), - (64, 32768, 512), - (64, 7168, 16384), - (64, 4096, 7168), - (64, 7168, 2048), - # (128, 2112, 7168), - (128, 24576, 1536), - (128, 32768, 512), - (128, 7168, 16384), - (128, 4096, 7168), - (128, 7168, 2048), - # (4096, 2112, 7168), - (4096, 24576, 1536), - (4096, 32768, 512), - (4096, 7168, 16384), - (4096, 4096, 7168), - (4096, 7168, 2048), - ] - - all_results = [] - for m, n, k in shapes: - result = benchmark_shape(m, n, k, verbose=verbose) - all_results.append(result) - - # Print results in a nicely formatted table - print("\n===== PERFORMANCE COMPARISON =====") - - # Print DeepGEMM table - deepgemm_headers = ["m", "n", "k", "Time (μs)", "TFLOPS", "GB/s"] - deepgemm_rows = [] - for result in all_results: - shape = result["shape"] - impl_data = result["implementations"]["DeepGEMM"] - deepgemm_rows.append([ - shape["m"], shape["n"], shape["k"], f"{impl_data['time_us']:.1f}", - f"{impl_data['tflops']:.1f}", f"{impl_data['gb_s']:.1f}" - ]) - - print_table(deepgemm_headers, - deepgemm_rows, - title="DeepGEMM Implementation:") - - # Print vLLM Triton table - triton_headers = [ - "m", "n", "k", "Time (μs)", "TFLOPS", "GB/s", "vs DeepGEMM" - ] - triton_rows = [] - for result in all_results: - shape = result["shape"] - impl_data = result["implementations"]["vLLM Triton"] - speedup = impl_data.get("speedup_vs_deepgemm", 1.0) - triton_rows.append([ - shape["m"], shape["n"], shape["k"], f"{impl_data['time_us']:.1f}", - f"{impl_data['tflops']:.1f}", f"{impl_data['gb_s']:.1f}", - format_speedup(speedup) - ]) - - print_table(triton_headers, - triton_rows, - title="vLLM Triton Implementation:") - - # 
Print vLLM CUTLASS table - cutlass_headers = [ - "m", "n", "k", "Time (μs)", "TFLOPS", "GB/s", "vs DeepGEMM", - "vs Triton" - ] - cutlass_rows = [] - for result in all_results: - shape = result["shape"] - impl_data = result["implementations"]["vLLM CUTLASS"] - vs_deepgemm = impl_data.get("speedup_vs_deepgemm", 1.0) - vs_triton = impl_data.get("speedup_vs_triton", 1.0) - cutlass_rows.append([ - shape["m"], shape["n"], shape["k"], f"{impl_data['time_us']:.1f}", - f"{impl_data['tflops']:.1f}", f"{impl_data['gb_s']:.1f}", - format_speedup(vs_deepgemm), - format_speedup(vs_triton) - ]) - - print_table(cutlass_headers, - cutlass_rows, - title="vLLM CUTLASS Implementation:") - - # Calculate and print averages - print("\n===== AVERAGE PERFORMANCE =====") - - implementations = ["DeepGEMM", "vLLM Triton", "vLLM CUTLASS"] - avg_metrics = { - impl: { - "tflops": 0, - "gb_s": 0, - "time_ms": 0 - } - for impl in implementations - } - - for result in all_results: - for impl in implementations: - impl_data = result["implementations"][impl] - avg_metrics[impl]["tflops"] += impl_data["tflops"] - avg_metrics[impl]["gb_s"] += impl_data["gb_s"] - avg_metrics[impl]["time_ms"] += impl_data["time_ms"] - - num_shapes = len(all_results) - avg_headers = ["Implementation", "Avg TFLOPS", "Avg GB/s", "Avg Time (ms)"] - avg_rows = [] - - for impl in implementations: - avg_tflops = avg_metrics[impl]["tflops"] / num_shapes - avg_mem_bw = avg_metrics[impl]["gb_s"] / num_shapes - avg_time = avg_metrics[impl]["time_ms"] / num_shapes - avg_rows.append([ - impl, f"{avg_tflops:.2f}", f"{avg_mem_bw:.2f}", f"{avg_time:.2f}" - ]) - - print_table(avg_headers, avg_rows) - - # Calculate average speedups - avg_speedups = { - "DeepGEMM vs vLLM Triton": 0, - "DeepGEMM vs vLLM CUTLASS": 0, - "vLLM CUTLASS vs vLLM Triton": 0 - } - - for result in all_results: - deepgemm_time = result["implementations"]["DeepGEMM"]["time_ms"] - vllm_triton_time = result["implementations"]["vLLM Triton"]["time_ms"] - vllm_cutlass_time = result["implementations"]["vLLM CUTLASS"][ - "time_ms"] - - avg_speedups[ - "DeepGEMM vs vLLM Triton"] += vllm_triton_time / deepgemm_time - avg_speedups[ - "DeepGEMM vs vLLM CUTLASS"] += vllm_cutlass_time / deepgemm_time - avg_speedups[ - "vLLM CUTLASS vs vLLM Triton"] += vllm_triton_time / vllm_cutlass_time - - print("\n===== AVERAGE SPEEDUPS =====") - speedup_headers = ["Comparison", "Speedup"] - speedup_rows = [] - for comparison, total in avg_speedups.items(): - avg_speedup = total / num_shapes - status = "faster" if avg_speedup > 1 else "slower" - speedup_rows.append([comparison, f"{avg_speedup:.2f}x {status}"]) - - print_table(speedup_headers, speedup_rows) - - # Average accuracy comparison - print("\n===== ACCURACY COMPARISON =====") - avg_diff = {impl: 0 for impl in implementations} - - for result in all_results: - for impl in implementations: - avg_diff[impl] += result["implementations"][impl]["diff"][ - "Reference"] - - diff_headers = ["Implementation", "Avg Diff vs Reference"] - diff_rows = [] - for impl in implementations: - diff_rows.append([impl, f"{avg_diff[impl] / num_shapes:.6f}"]) - - print_table(diff_headers, diff_rows) - - -if __name__ == "__main__": - run_benchmarks(verbose=False) diff --git a/benchmarks/kernels/graph_machete_bench.py b/benchmarks/kernels/graph_machete_bench.py deleted file mode 100644 index 9a4da0ef5..000000000 --- a/benchmarks/kernels/graph_machete_bench.py +++ /dev/null @@ -1,64 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the 
vLLM project - -import math -import pickle -from collections import defaultdict - -import matplotlib.pyplot as plt -import pandas as pd -import regex as re -import seaborn as sns -from torch.utils.benchmark import Measurement as TMeasurement - -from vllm.utils import FlexibleArgumentParser - -if __name__ == "__main__": - parser = FlexibleArgumentParser( - description="Benchmark the latency of processing a single batch of " - "requests till completion." - ) - parser.add_argument("filename", type=str) - - args = parser.parse_args() - - with open(args.filename, "rb") as f: - data = pickle.load(f) - raw_results: list[TMeasurement] = data["results"] - - results = defaultdict(lambda: list()) - for v in raw_results: - result = re.search(r"MKN=\(\d+x(\d+x\d+)\)", v.task_spec.sub_label) - if result is not None: - KN = result.group(1) - else: - raise Exception("MKN not found") - result = re.search(r"MKN=\((\d+)x\d+x\d+\)", v.task_spec.sub_label) - if result is not None: - M = result.group(1) - else: - raise Exception("MKN not found") - - kernel = v.task_spec.description - results[KN].append({"kernel": kernel, "batch_size": M, "median": v.median}) - - rows = int(math.ceil(len(results) / 2)) - fig, axs = plt.subplots(rows, 2, figsize=(12, 5 * rows)) - axs = axs.flatten() - for axs_idx, (shape, data) in enumerate(results.items()): - plt.sca(axs[axs_idx]) - df = pd.DataFrame(data) - sns.lineplot( - data=df, - x="batch_size", - y="median", - hue="kernel", - style="kernel", - markers=True, - dashes=False, - palette="Dark2", - ) - plt.title(f"Shape: {shape}") - plt.ylabel("time (median, s)") - plt.tight_layout() - plt.savefig("graph_machete_bench.pdf") diff --git a/benchmarks/kernels/requirements.txt b/benchmarks/kernels/requirements.txt deleted file mode 100644 index 1411a4a0b..000000000 --- a/benchmarks/kernels/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -pandas \ No newline at end of file diff --git a/benchmarks/kernels/utils.py b/benchmarks/kernels/utils.py deleted file mode 100644 index 4bbb36bb4..000000000 --- a/benchmarks/kernels/utils.py +++ /dev/null @@ -1,214 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import dataclasses -from collections.abc import Iterable -from typing import Any, Callable, Optional - -import torch -import torch.utils.benchmark as TBenchmark -from torch.utils.benchmark import Measurement as TMeasurement - - -@dataclasses.dataclass -class CudaGraphBenchParams: - num_ops_in_cuda_graph: int - - -@dataclasses.dataclass -class ArgPool: - """ - When some argument of the benchmarking function is annotated with this type, - the benchmarking class (BenchMM) will collapse the argument to a pick a - single value from the given list of values, during function invocation. - For every invocation during a benchmarking run, it will choose a - different value from the list. 
- """ - - values: Iterable[Any] - - def __getitem__(self, index): - return self.values[index] - - -class Bench: - class ArgsIterator: - def __init__(self, args_list, kwargs_list): - assert len(args_list) == len(kwargs_list) - self.args_list = args_list - self.kwargs_list = kwargs_list - self.n = len(self.args_list) - self.idx = 0 - - def __next__(self): - while True: - yield (self.args_list[self.idx], self.kwargs_list[self.idx]) - self.idx += 1 - self.idx = self.idx % self.n - - def reset(self): - self.idx = 0 - - @property - def n_args(self): - return self.n - - def __init__( - self, - cuda_graph_params: Optional[CudaGraphBenchParams], - label: str, - sub_label: str, - description: str, - fn: Callable, - *args, - **kwargs, - ): - self.cuda_graph_params = cuda_graph_params - self.use_cuda_graph = self.cuda_graph_params is not None - self.label = label - self.sub_label = sub_label - self.description = description - self.fn = fn - - # Process args - self._args = args - self._kwargs = kwargs - self.args_list, self.kwargs_list = self.collapse_argpool(*args, **kwargs) - self.args_iterator = self.ArgsIterator(self.args_list, self.kwargs_list) - - # Cudagraph runner - self.g = None - if self.use_cuda_graph: - self.g = self.get_cuda_graph_runner() - - # benchmark run params - self.min_run_time = 1 - - def collapse_argpool(self, *args, **kwargs): - argpool_args = [arg for arg in args if isinstance(arg, ArgPool)] + [ - arg for arg in kwargs.values() if isinstance(arg, ArgPool) - ] - if len(argpool_args) == 0: - return [args], [kwargs] - - # Make sure all argpools are of the same size - argpool_size = len(argpool_args[0].values) - assert all([argpool_size == len(arg.values) for arg in argpool_args]) - - # create copies of the args - args_list = [] - kwargs_list = [] - for _ in range(argpool_size): - args_list.append(args) - kwargs_list.append(kwargs.copy()) - - for i in range(argpool_size): - # collapse args; Just pick the ith value - args_list[i] = tuple( - [arg[i] if isinstance(arg, ArgPool) else arg for arg in args_list[i]] - ) - - # collapse kwargs - kwargs_i = kwargs_list[i] - arg_pool_keys = [k for k, v in kwargs_i.items() if isinstance(v, ArgPool)] - for k in arg_pool_keys: - # again just pick the ith value - kwargs_i[k] = kwargs_i[k][i] - kwargs_list[i] = kwargs_i - - return args_list, kwargs_list - - def get_cuda_graph_runner(self): - assert self.use_cuda_graph - assert self.args_iterator is not None - - num_graph_ops = self.cuda_graph_params.num_ops_in_cuda_graph - - # warmup - args_it = self.args_iterator.__next__() - for _ in range(2): - args, kwargs = next(args_it) - self.fn(*args, **kwargs) - - self.args_iterator.reset() - args_it = self.args_iterator.__next__() - stream = torch.cuda.Stream() - with torch.cuda.stream(stream): - g = torch.cuda.CUDAGraph() - with torch.cuda.graph(g): - for _ in range(num_graph_ops): - args, kwargs = next(args_it) - self.fn(*args, **kwargs) - return g - - def run_cudagrah(self) -> TMeasurement: - assert self.use_cuda_graph - globals = {"g": self.g} - - return TBenchmark.Timer( - stmt="g.replay()", - globals=globals, - label=( - f"{self.label}" - f" | cugraph {self.cuda_graph_params.num_ops_in_cuda_graph} ops" - ), - sub_label=self.sub_label, - description=self.description, - ).blocked_autorange(min_run_time=self.min_run_time) - - def run_eager(self) -> TMeasurement: - setup = None - stmt = None - globals = None - - has_arg_pool = self.args_iterator.n_args > 1 - if has_arg_pool: - setup = """ - args_iterator.reset() - args_it = args_iterator.__next__() - 
""" - stmt = """ - args, kwargs = next(args_it) - fn(*args, **kwargs) - """ - globals = {"fn": self.fn, "args_iterator": self.args_iterator} - else: - # no arg pool. Just use the args and kwargs directly - self.args_iterator.reset() - args_it = self.args_iterator.__next__() - args, kwargs = next(args_it) - - setup = "" - stmt = """ - fn(*args, **kwargs) - """ - globals = {"fn": self.fn, "args": args, "kwargs": kwargs} - - return TBenchmark.Timer( - stmt=stmt, - setup=setup, - globals=globals, - label=self.label, - sub_label=self.sub_label, - description=self.description, - ).blocked_autorange(min_run_time=self.min_run_time) - - def run(self) -> TMeasurement: - timer = None - if self.use_cuda_graph: # noqa SIM108 - timer = self.run_cudagrah() - else: - timer = self.run_eager() - if not timer.meets_confidence() or timer.has_warnings: - print("Doesn't meet confidence - re-running bench ...") - return self.run() - return timer - - def __enter__(self): - return self - - def __exit__(self, exc_type, exc_value, traceback): - if exc_type: - print(f"exc type {exc_type}") - print(f"exc value {exc_value}") - print(f"exc traceback {traceback}") diff --git a/benchmarks/kernels/weight_shapes.py b/benchmarks/kernels/weight_shapes.py deleted file mode 100644 index 9a057990b..000000000 --- a/benchmarks/kernels/weight_shapes.py +++ /dev/null @@ -1,104 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -# Weight Shapes are in the format -# ([K, N], TP_SPLIT_DIM) -# Example: -# A shape of ([14336, 4096], 0) indicates the following GEMM shape, -# - TP1 : K = 14336, N = 4096 -# - TP2 : K = 7168, N = 4096 -# A shape of ([4096, 6144], 1) indicates the following GEMM shape, -# - TP1 : K = 4096, N = 6144 -# - TP4 : K = 4096, N = 1536 - -# TP1 shapes -WEIGHT_SHAPES = { - "mistralai/Mistral-7B-v0.1": [ - ([4096, 6144], 1), - ([4096, 4096], 0), - ([4096, 28672], 1), - ([14336, 4096], 0), - ], - "meta-llama/Llama-2-7b-hf": [ - ([4096, 12288], 1), - ([4096, 4096], 0), - ([4096, 22016], 1), - ([11008, 4096], 0), - ], - "meta-llama/Llama-3-8b": [ - ([4096, 6144], 1), - ([4096, 4096], 0), - ([4096, 28672], 1), - ([14336, 4096], 0), - ], - "meta-llama/Llama-2-13b-hf": [ - ([5120, 15360], 1), - ([5120, 5120], 0), - ([5120, 27648], 1), - ([13824, 5120], 0), - ], - "meta-llama/Llama-2-70b-hf": [ - ([8192, 10240], 1), - ([8192, 8192], 0), - ([8192, 57344], 1), - ([28672, 8192], 0), - ], - "meta-llama/Llama-3.1-405b-hf": [ - ([16384, 18432], 1), - ([16384, 16384], 0), - ([16384, 106496], 1), - ([53248, 16384], 0), - ], - "meta-llama/Llama-3.1-8B-Instruct": [ - ([4096, 6144], 1), - ([4096, 4096], 0), - ([4096, 28672], 1), - ([14336, 4096], 0), - ], - "meta-llama/Llama-3.3-70B-Instruct": [ - ([8192, 10240], 1), - ([8192, 8192], 0), - ([8192, 57344], 1), - ([28672, 8192], 0), - ], - "mistralai/Mistral-Large-Instruct-2407": [ - ([12288, 14336], 1), - ([12288, 12288], 0), - ([12288, 57344], 1), - ([28672, 12288], 0), - ], - "Qwen/Qwen2.5-7B-Instruct": [ - ([3584, 4608], 1), - ([3584, 3584], 0), - ([3584, 37888], 1), - ([18944, 3584], 0), - ], - "Qwen/Qwen2.5-32B-Instruct": [ - ([5120, 7168], 1), - ([5120, 5120], 0), - ([5120, 55296], 1), - ([27648, 5120], 0), - ], - "Qwen/Qwen2.5-72B-Instruct": [ - ([8192, 10240], 1), - ([8192, 8192], 0), - ([8192, 59136], 1), - ([29568, 8192], 0), - ], - "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct": [ - ([2048, 3072], 1), - ([2048, 4096], 1), - ([2048, 2048], 0), - ([2048, 576], 0), - ([2048, 21888], 1), - ([10944, 2048], 0), - 
([2048, 2816], 1), - ([1408, 2048], 0), - ], - "CohereLabs/c4ai-command-a-03-2025": [ - ([12288, 14336], 1), - ([12288, 12288], 0), - ([12288, 73728], 1), - ([36864, 12288], 0), - ], -} diff --git a/benchmarks/multi_turn/README.md b/benchmarks/multi_turn/README.md deleted file mode 100644 index f5b5c6c97..000000000 --- a/benchmarks/multi_turn/README.md +++ /dev/null @@ -1,174 +0,0 @@ -# Benchmark KV Cache Offloading with Multi-Turn Conversations - -The requirements (pip) for `benchmark_serving_multi_turn.py` can be found in `requirements.txt` - -First start serving your model - -```bash -export MODEL_PATH=/models/meta-llama/Meta-Llama-3.1-8B-Instruct/ - -vllm serve $MODEL_PATH --served-model-name Llama --disable-log-requests -``` - -The variable `MODEL_PATH` should be a path to the model files (e.g. downloaded from huggingface). - -## Synthetic Multi-Turn Conversations - -Download the following text file (used for generation of synthetic conversations) - -```bash -wget https://www.gutenberg.org/ebooks/1184.txt.utf-8 -mv 1184.txt.utf-8 pg1184.txt -``` - -The filename `pg1184.txt` is used in `generate_multi_turn.json` (see `"text_files"`). - -But you may use other text files if you prefer (using this specific file is not required). - -Then run the benchmarking script - -```bash -export MODEL_PATH=/models/meta-llama/Meta-Llama-3.1-8B-Instruct/ - -python benchmark_serving_multi_turn.py --model $MODEL_PATH --served-model-name Llama \ ---input-file generate_multi_turn.json --num-clients 2 --max-active-conversations 6 -``` - -You can edit the file `generate_multi_turn.json` to change the conversation parameters (number of turns, etc.). - -If successful, you will see the following output - -```bash ----------------------------------------------------------------------------------------------------- -Statistics summary: -runtime_sec = 215.810 -requests_per_sec = 0.769 ----------------------------------------------------------------------------------------------------- - count mean std min 25% 50% 75% 90% 99% max -ttft_ms 166.0 78.22 67.63 45.91 59.94 62.26 64.43 69.66 353.18 567.54 -tpot_ms 166.0 25.37 0.57 24.40 25.07 25.31 25.50 25.84 27.50 28.05 -latency_ms 166.0 2591.07 326.90 1998.53 2341.62 2573.01 2860.10 3003.50 3268.46 3862.94 -input_num_turns 166.0 7.43 4.57 1.00 3.00 7.00 11.00 13.00 17.00 17.00 -input_num_tokens 166.0 2006.20 893.56 522.00 1247.75 2019.00 2718.00 3233.00 3736.45 3899.00 -output_num_tokens 166.0 100.01 11.80 80.00 91.00 99.00 109.75 116.00 120.00 120.00 -output_num_chunks 166.0 99.01 11.80 79.00 90.00 98.00 108.75 115.00 119.00 119.00 ----------------------------------------------------------------------------------------------------- -``` - -### JSON configuration file for synthetic conversations generation - -The input flag `--input-file` is used to determine the input conversations for the benchmark.
-When the input is a JSON file with the field `"filetype": "generate_conversations"` the tool will generate synthetic multi-turn (questions and answers) conversations. - -The file `generate_multi_turn.json` is an example file. - -The file must contain the sections `prompt_input` and `prompt_output`. - -The `prompt_input` section must contain `num_turns`, `prefix_num_tokens` and `num_tokens`: - -* `num_turns` - Number of total turns in the conversation (both user & assistant).
-The final value will always be rounded to an even number so each user turn has a reply. -* `prefix_num_tokens` - Tokens added at the start of only the **first user turn** in a conversation (unique per conversation). -* `num_tokens` - Total token length of each **user** message (one turn). - -The `prompt_output` section must contain `num_tokens`: - -* `num_tokens` - Total token length of each **assistant** message (one turn). - -### Random distributions for synthetic conversations generation - -When creating an input JSON file (such as `generate_multi_turn.json`),
-every numeric field (such as `num_turns` or `num_tokens`) requires a distribution.
-The distribution determines how to randomly sample values for the field. - -The available distributions are listed below. - -**Note:** The optional `max` field (for lognormal, zipf, and poisson) can be used to cap sampled values at an upper bound.
-Can be used to make sure that the total number of tokens in every request does not exceed `--max-model-len`. - -#### constant - -```json -{ - "distribution": "constant", - "value": 500 -} -``` - -* `value` - the fixed integer value (always returns the same number). - -#### uniform - -```json -{ - "distribution": "uniform", - "min": 12, - "max": 18 -} -``` - -* `min` - minimum value (inclusive). -* `max` - maximum value (inclusive), should be equal or larger than min. - -#### lognormal - -```json -{ - "distribution": "lognormal", - "average": 1000, - "max": 5000 -} -``` - -You can parameterize the lognormal distribution in one of two ways: - -Using the average and optional median ratio: - -* `average` - target average value of the distribution. -* `median_ratio` - the ratio of the median to the average; controls the skewness. Must be in the range (0, 1). - -Using the parameters of the underlying normal distribution: - -* `mean` - mean of the underlying normal distribution. -* `sigma` - standard deviation of the underlying normal distribution. - -#### zipf - -```json -{ - "distribution": "zipf", - "alpha": 1.2, - "max": 100 -} -``` - -* `alpha` - skew parameter (> 1). Larger values produce stronger skew toward smaller integers. - -#### poisson - -```json -{ - "distribution": "poisson", - "alpha": 10, - "max": 50 -} -``` - -* `alpha` - expected value (λ). Also the variance of the distribution. - -## ShareGPT Conversations - -To run with the ShareGPT data, download the following ShareGPT dataset: -`https://huggingface.co/datasets/philschmid/sharegpt-raw/blob/main/sharegpt_20230401_clean_lang_split.json` - -Use the `convert_sharegpt_to_openai.py` script to convert the dataset to a format supported by `benchmark_serving_multi_turn.py` - -```bash -python convert_sharegpt_to_openai.py sharegpt_20230401_clean_lang_split.json sharegpt_conv_128.json --seed=99 --max-items=128 -``` - -The script will convert the ShareGPT dataset to a dataset with the standard user/assistant roles. - -The flag `--max-items=128` is used to sample 128 conversations from the original dataset (change as needed). - -Use the output JSON file `sharegpt_conv_128.json` as the `--input-file` for `benchmark_serving_multi_turn.py`. 
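As a worked illustration of the lognormal `average`/`median_ratio` parametrization described above: for X ~ LogNormal(mu, sigma), mean(X) = exp(mu + sigma^2/2) and median(X) = exp(mu), so median/mean = exp(-sigma^2/2), which fixes both parameters once the target average and the median/average ratio are chosen. The sketch below is a minimal, self-contained reproduction of that derivation and of the optional `max` cap; the helper names are illustrative only and are not part of the benchmark code.

```python
from typing import Optional

import numpy as np


def lognormal_params(average: float, median_ratio: float = 0.85) -> tuple[float, float]:
    """Derive (mu, sigma) of the underlying normal distribution from a target
    average and a median/average ratio in (0, 1)."""
    if not 0 < median_ratio < 1:
        raise ValueError("median_ratio must be in range (0, 1)")
    median = average * median_ratio
    # median / mean = exp(-sigma^2 / 2)  =>  sigma^2 = 2 * ln(mean / median)
    sigma = float(np.sqrt(2 * np.log(average / median)))
    mu = float(np.log(median))
    return mu, sigma


def sample_lognormal(
    average: int,
    median_ratio: float = 0.85,
    max_val: Optional[int] = None,
    size: int = 1,
) -> np.ndarray:
    """Sample integer values (e.g. token counts) with the given average,
    optionally capped like the JSON "max" field."""
    mu, sigma = lognormal_params(average, median_ratio)
    samples = np.random.lognormal(mean=mu, sigma=sigma, size=size)
    samples *= average / samples.mean()  # rescale so the sample mean matches the target
    if max_val is not None:
        samples = np.minimum(samples, max_val)  # cap at the configured upper bound
    return np.round(samples).astype(int)


if __name__ == "__main__":
    # Roughly matches the README example: average 1000 tokens, capped at 5000.
    print(sample_lognormal(1000, max_val=5000, size=8))
```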
diff --git a/benchmarks/multi_turn/bench_dataset.py b/benchmarks/multi_turn/bench_dataset.py deleted file mode 100644 index 67b937930..000000000 --- a/benchmarks/multi_turn/bench_dataset.py +++ /dev/null @@ -1,588 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from abc import ABC, abstractmethod -from statistics import mean -from typing import Any, NamedTuple, Optional, Union - -import numpy as np # type: ignore -import pandas as pd # type: ignore -from bench_utils import ( - TEXT_SEPARATOR, - Color, - logger, -) -from transformers import AutoTokenizer # type: ignore - -# Conversation ID is a string (e.g: "UzTK34D") -ConvId = str - -# A list of dicts (dicts with keys "id" and "messages") -ShareGptConversations = list[dict[str, Any]] - -# A list of dicts (dicts with keys "role" and "content") -MessagesList = list[dict[str, str]] - -# Map conversation ID to conversation messages -ConversationsMap = list[ConvId, MessagesList] - - -class Distribution(ABC): - @abstractmethod - def sample(self, size: int = 1) -> np.ndarray: - pass - - -class UniformDistribution(Distribution): - def __init__( - self, - min_val: Union[int, float], - max_val: Union[int, float], - is_integer: bool = True, - ) -> None: - self.min_val = min_val - self.max_val = max_val - self.is_integer = is_integer - - def sample(self, size: int = 1) -> np.ndarray: - if self.is_integer: - return np.random.randint( - int(self.min_val), int(self.max_val + 1), size=size - ) - else: - return np.random.uniform(self.min_val, self.max_val, size=size) - - def __repr__(self) -> str: - return f"UniformDistribution[{self.min_val}, {self.max_val}]" - - -class ConstantDistribution(Distribution): - def __init__(self, value: Union[int, float]) -> None: - self.value = value - self.max_val = value - - def sample(self, size: int = 1) -> np.ndarray: - return np.full(shape=size, fill_value=self.value) - - def __repr__(self) -> str: - return f"Constant[{self.value}]" - - -class ZipfDistribution(Distribution): - def __init__(self, alpha: float, max_val: Optional[int] = None) -> None: - self.alpha = alpha - self.max_val = max_val - - def sample(self, size: int = 1) -> np.ndarray: - samples = np.random.zipf(self.alpha, size=size) - if self.max_val: - samples = np.minimum(samples, self.max_val) - return samples - - def __repr__(self) -> str: - return f"ZipfDistribution[{self.alpha}]" - - -class PoissonDistribution(Distribution): - def __init__(self, alpha: float, max_val: Optional[int] = None) -> None: - self.alpha = alpha - self.max_val = max_val - - def sample(self, size: int = 1) -> np.ndarray: - samples = np.random.poisson(self.alpha, size=size) - if self.max_val: - samples = np.minimum(samples, self.max_val) - return samples - - def __repr__(self) -> str: - return f"PoissonDistribution[{self.alpha}]" - - -class LognormalDistribution(Distribution): - def __init__( - self, - mean: Optional[float] = None, - sigma: Optional[float] = None, - average: Optional[int] = None, - median_ratio: Optional[float] = None, - max_val: Optional[int] = None, - ) -> None: - self.average = average - self.median_ratio = median_ratio - self.max_val = max_val - - if average is not None: - if average < 1: - raise ValueError("Lognormal average must be positive") - - if mean or sigma: - raise ValueError( - "When using lognormal average, you can't provide mean/sigma" - ) - - if self.median_ratio is None: - # Default value that provides relatively wide range of values - self.median_ratio = 0.85 - - # Calculate 
mean/sigma of np.random.lognormal based on the average - mean, sigma = self._generate_lognormal_by_median( - target_average=self.average, median_ratio=self.median_ratio - ) - else: - if mean is None or sigma is None: - raise ValueError( - "Must provide both mean and sigma if average is not used" - ) - - if mean <= 0 or sigma < 0: - raise ValueError( - "Lognormal mean must be positive and sigma must be non-negative" - ) - - # Mean and standard deviation of the underlying normal distribution - # Based on numpy.random.lognormal - self.mean = mean - self.sigma = sigma - - @staticmethod - def _generate_lognormal_by_median( - target_average: int, median_ratio: float - ) -> tuple[float, float]: - """ - Compute (mu, sigma) for a lognormal distribution given: - - a target average (mean of the distribution) - - a ratio of median / mean (controls skewness), assume mean > median - - Background: - If Z ~ Normal(mu, sigma^2), then X = exp(Z) ~ LogNormal(mu, sigma). - * mean(X) = exp(mu + sigma^2 / 2) - * median(X) = exp(mu) - - So: - median / mean = exp(mu) / exp(mu + sigma^2 / 2) - = exp(-sigma^2 / 2) - - Rearranging: - sigma^2 = 2 * ln(mean / median) - mu = ln(median) - - This gives a unique (mu, sigma) for any valid mean and median. - """ - # Check input validity: median must be smaller than mean - if median_ratio <= 0 or median_ratio >= 1: - raise ValueError("median_ratio must be in range (0, 1)") - - target_median = target_average * median_ratio - - # Solve sigma^2 = 2 * ln(mean / median) - sigma = np.sqrt(2 * np.log(target_average / target_median)) - mu = np.log(target_median) - - return mu, sigma - - def sample(self, size: int = 1) -> np.ndarray: - samples = np.random.lognormal(mean=self.mean, sigma=self.sigma, size=size) - - if self.average is not None: - # Scale to average - samples *= self.average / samples.mean() - - if self.max_val: - samples = np.minimum(samples, self.max_val) - - return np.round(samples).astype(int) - - def __repr__(self) -> str: - if self.average: - return ( - f"LognormalDistribution[{self.average}, " - f"{self.median_ratio}, {self.max_val}]" - ) - return f"LognormalDistribution[{self.mean}, {self.sigma}, {self.max_val}]" - - -class GenConvArgs(NamedTuple): - num_conversations: int - text_files: list[str] - input_num_turns: Distribution - input_common_prefix_num_tokens: Distribution - input_prefix_num_tokens: Distribution - input_num_tokens: Distribution - output_num_tokens: Distribution - print_stats: bool - - -def verify_field_exists( - conf: dict, field_name: str, section: str, subsection: str -) -> None: - if field_name not in conf: - raise ValueError( - f"Missing field '{field_name}' in {section=} and {subsection=}" - ) - - -def get_random_distribution( - conf: dict, section: str, subsection: str, optional: bool = False -) -> Distribution: - # section can be "prompt_input" or "prompt_output" (both required) - conf = conf[section] - - if optional and subsection not in conf: - # Optional subsection, if not found assume the value is always 0 - return ConstantDistribution(0) - - # subsection can be "num_turns", "num_tokens" or "prefix_num_tokens" - if subsection not in conf: - raise ValueError(f"Missing subsection {subsection} in section {section}") - - conf = conf[subsection] - - distribution = conf.get("distribution") - if distribution is None: - raise ValueError( - f"Missing field 'distribution' in {section=} and {subsection=}" - ) - - if distribution == "constant": - verify_field_exists(conf, "value", section, subsection) - return ConstantDistribution(conf["value"]) 
- - elif distribution == "zipf": - verify_field_exists(conf, "alpha", section, subsection) - max_val = conf.get("max", None) - return ZipfDistribution(conf["alpha"], max_val=max_val) - - elif distribution == "poisson": - verify_field_exists(conf, "alpha", section, subsection) - max_val = conf.get("max", None) - return PoissonDistribution(conf["alpha"], max_val=max_val) - - elif distribution == "lognormal": - max_val = conf.get("max", None) - - if "average" in conf: - # Infer lognormal mean/sigma (numpy) from input average - median_ratio = conf.get("median_ratio", None) - return LognormalDistribution( - average=conf["average"], median_ratio=median_ratio, max_val=max_val - ) - - # Use mean/sigma directly (for full control over the distribution) - verify_field_exists(conf, "mean", section, subsection) - verify_field_exists(conf, "sigma", section, subsection) - return LognormalDistribution( - mean=conf["mean"], sigma=conf["sigma"], max_val=max_val - ) - - elif distribution == "uniform": - verify_field_exists(conf, "min", section, subsection) - verify_field_exists(conf, "max", section, subsection) - - min_value = conf["min"] - max_value = conf["max"] - - assert min_value > 0 - assert min_value <= max_value - - is_integer = isinstance(min_value, int) and isinstance(max_value, int) - return UniformDistribution(min_value, max_value, is_integer) - else: - raise ValueError(f"Unknown distribution: {distribution}") - - -def parse_input_json_file(conf: dict) -> GenConvArgs: - # Validate the input file - assert isinstance(conf, dict) - required_fields = [ - "filetype", - "num_conversations", - "text_files", - "prompt_input", - "prompt_output", - ] - for field in required_fields: - assert field in conf, f"Missing field {field} in input {conf}" - - assert conf["filetype"] == "generate_conversations" - - assert conf["num_conversations"] > 0, "num_conversations should be larger than zero" - - text_files = conf["text_files"] - - assert isinstance(text_files, list), "Field 'text_files' should be a list" - assert len(text_files) > 0, ( - "Field 'text_files' should be a list with at least one file" - ) - - # Parse the parameters for the prompt input/output workload - input_num_turns = get_random_distribution(conf, "prompt_input", "num_turns") - input_num_tokens = get_random_distribution(conf, "prompt_input", "num_tokens") - input_common_prefix_num_tokens = get_random_distribution( - conf, "prompt_input", "common_prefix_num_tokens", optional=True - ) - input_prefix_num_tokens = get_random_distribution( - conf, "prompt_input", "prefix_num_tokens" - ) - output_num_tokens = get_random_distribution(conf, "prompt_output", "num_tokens") - - print_stats: bool = conf.get("print_stats", False) - assert isinstance(print_stats, bool), ( - "Field 'print_stats' should be either 'true' or 'false'" - ) - - args = GenConvArgs( - num_conversations=conf["num_conversations"], - text_files=text_files, - input_num_turns=input_num_turns, - input_common_prefix_num_tokens=input_common_prefix_num_tokens, - input_prefix_num_tokens=input_prefix_num_tokens, - input_num_tokens=input_num_tokens, - output_num_tokens=output_num_tokens, - print_stats=print_stats, - ) - return args - - -def print_conv_stats(conversations: ConversationsMap, tokenizer: AutoTokenizer) -> None: - # Collect statistics - conv_stats: list[dict[Any, Any]] = [] - req_stats: list[int] = [] - - print("\nCollecting statistics...") - for messages in conversations.values(): - # messages is a list of dicts - user_tokens: list[int] = [] - assistant_tokens: list[int] = [] - 
request_tokens: list[int] = [] - - req_tokens = 0 - for m in messages: - content = m["content"] - num_tokens = len(tokenizer(content).input_ids) - - if m["role"] == "user": - user_tokens.append(num_tokens) - # New user prompt including all chat history - req_tokens += num_tokens - request_tokens.append(req_tokens) - - elif m["role"] == "assistant": - assistant_tokens.append(num_tokens) - # Update assistant answer - # (will be part of chat history for the next user prompt) - req_tokens += num_tokens - - item_stats = { - "conversation_turns": len(messages), - "user_tokens": mean(user_tokens), - "assistant_tokens": mean(assistant_tokens), - } - - conv_stats.append(item_stats) - req_stats.extend(request_tokens) - - # Print statistics - percentiles = [0.25, 0.5, 0.75, 0.9, 0.99] - - print(TEXT_SEPARATOR) - print(f"{Color.YELLOW}Conversations statistics:{Color.RESET}") - print(TEXT_SEPARATOR) - df = pd.DataFrame(conv_stats) - print(df.describe(percentiles=percentiles).transpose()) - print(TEXT_SEPARATOR) - print(f"{Color.YELLOW}Request statistics:{Color.RESET}") - print(TEXT_SEPARATOR) - df = pd.DataFrame(req_stats, columns=["request_tokens"]) - print(df.describe(percentiles=percentiles).transpose()) - print(TEXT_SEPARATOR) - - -def generate_conversations( - args: GenConvArgs, tokenizer: AutoTokenizer -) -> ConversationsMap: - # Text for all user prompts - # (text from the input text files will be appended to this line) - base_prompt_text = "Please rewrite the following text and add more content: " - base_prompt_token_count = len( - tokenizer.encode(base_prompt_text, add_special_tokens=False) - ) - - logger.info(f"{Color.PURPLE}Generating conversations...{Color.RESET}") - logger.info(args) - - list_of_tokens = [] - - for filename in args.text_files: - # Load text file that will be used to generate prompts - with open(filename) as file: - data = file.read() - tokens_in_file = tokenizer.encode(data, add_special_tokens=False) - list_of_tokens.extend(tokens_in_file) - - conversations: ConversationsMap = {} - conv_id = 0 - - # Generate number of turns for every conversation - turn_count: np.ndarray = args.input_num_turns.sample(args.num_conversations) - - # Turn count should be at least 2 (one user prompt and one assistant answer) - turn_count = np.maximum(turn_count, 2) - - # Round up to an even number (every user prompt should have an answer) - turn_count = turn_count + (turn_count % 2) - - # Generate number of prefix tokens for every conversation - conv_prefix_tokens: np.ndarray = args.input_prefix_num_tokens.sample( - args.num_conversations - ) - - # Used to reduce shared text between conversations - # (jump/skip over text sections between conversations) - base_offset = 0 - - # Common prefix size for all conversations (only 1 sample required) - common_prefix_text = "" - common_prefix_tokens: int = args.input_common_prefix_num_tokens.sample(1)[0] - if common_prefix_tokens > 0: - # Using "." at the end to separate sentences - common_prefix_text = ( - tokenizer.decode(list_of_tokens[: common_prefix_tokens - 2]) + "." 
- ) - base_offset += common_prefix_tokens - - for conv_id in range(args.num_conversations): - # Generate a single conversation - messages: MessagesList = [] - - nturns = turn_count[conv_id] - - # User prompt token count per turn (with lower limit) - input_token_count: np.ndarray = args.input_num_tokens.sample(nturns) - input_token_count = np.maximum(input_token_count, base_prompt_token_count) - - # Assistant answer token count per turn (with lower limit) - output_token_count: np.ndarray = args.output_num_tokens.sample(nturns) - output_token_count = np.maximum(output_token_count, 1) - - user_turn = True - for turn_id in range(nturns): - if user_turn: - role = "user" - num_tokens = input_token_count[turn_id] - - # Generate the user prompt, - # use a unique prefix (the conv_id) for each conversation - # (to avoid shared prefix between conversations) - content = f"{conv_id} is a nice number... " - - if len(common_prefix_text) > 0 and turn_id == 0: - content = common_prefix_text + content - - # Update the number of tokens left for the content - num_tokens -= len(tokenizer.encode(content, add_special_tokens=False)) - - if turn_id == 0: - prefix_num_tokens = conv_prefix_tokens[conv_id] - if prefix_num_tokens > 0: - # Add prefix text (context) to the first turn - start_offset = base_offset - end_offset = start_offset + prefix_num_tokens - assert len(list_of_tokens) > end_offset, ( - "Not enough input text to generate " - f"{prefix_num_tokens} tokens for the " - f"prefix text ({start_offset=}, {end_offset=})" - ) - - content += f"{conv_id}, " + tokenizer.decode( - list_of_tokens[start_offset:end_offset] - ) - base_offset += prefix_num_tokens - - # Add the actual user prompt/question after the prefix text - content += base_prompt_text - num_tokens -= base_prompt_token_count - - if num_tokens > 0: - # Add text from the input file (to reach the desired token count) - start_offset = base_offset + turn_id * input_token_count.max() - end_offset = start_offset + num_tokens - assert len(list_of_tokens) > end_offset, ( - f"Not enough input text to generate {num_tokens} tokens " - f"for the prompt ({start_offset=}, {end_offset=})" - ) - - # Convert tokens back to text - content += tokenizer.decode(list_of_tokens[start_offset:end_offset]) - else: - role = "assistant" - # This content will not be used as input to the LLM server - # (actual answers will be used instead). - # Content is only required to determine the min_tokens/max_tokens - # (inputs to the LLM server). 
- num_tokens = output_token_count[turn_id] - assert len(list_of_tokens) > num_tokens, ( - f"Not enough input text to generate {num_tokens} " - "tokens for assistant content" - ) - content = tokenizer.decode(list_of_tokens[:num_tokens]) - - # Append the user/assistant message to the list of messages - messages.append({"role": role, "content": content}) - user_turn = not user_turn - - # Add the new conversation - conversations[f"CONV_ID_{conv_id}"] = messages - - # Increase base offset for the next conversation - base_offset += nturns - - if args.print_stats: - print_conv_stats(conversations, tokenizer) - - return conversations - - -def conversations_list_to_dict(input_list: ShareGptConversations) -> ConversationsMap: - conversations: ConversationsMap = {} - - for item in input_list: - conv_id: str = item["id"] - assert isinstance(conv_id, str) - - assert conv_id not in conversations, ( - f"Conversation ID {conv_id} found more than once in the input" - ) - - messages: MessagesList = item["messages"] - assert isinstance(messages, list), ( - f"Conversation messages should be a list (ID: {conv_id})" - ) - assert len(messages) > 0, f"Conversation with no messages (ID: {conv_id})" - - conversations[conv_id] = messages - - logger.info(f"Using {len(conversations)} unique conversations (IDs)") - assert len(conversations) == len(input_list) - - # Print statistics about the selected conversations - stats: list[dict[str, Any]] = [] - for conv_data in conversations.values(): - stats.append({"num_turns": len(conv_data)}) - - print(TEXT_SEPARATOR) - print(f"{Color.YELLOW}Conversations statistics:{Color.RESET}") - print(TEXT_SEPARATOR) - percentiles = [0.25, 0.5, 0.75, 0.9, 0.99, 0.999, 0.9999] - conv_stats = pd.DataFrame(stats).describe(percentiles=percentiles) - print(conv_stats.transpose()) - print(TEXT_SEPARATOR) - - return conversations - - -def conversations_dict_to_list(input_dict: ConversationsMap) -> ShareGptConversations: - output: ShareGptConversations = [] - for conv_id, conv_data in input_dict.items(): - new_item = {"id": conv_id, "messages": conv_data} - output.append(new_item) - - return output diff --git a/benchmarks/multi_turn/bench_utils.py b/benchmarks/multi_turn/bench_utils.py deleted file mode 100644 index e959a4be7..000000000 --- a/benchmarks/multi_turn/bench_utils.py +++ /dev/null @@ -1,28 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import logging -from enum import Enum - - -class Color(Enum): - RED = "\033[91m" - GREEN = "\033[92m" - BLUE = "\033[94m" - PURPLE = "\033[95m" - CYAN = "\033[96m" - YELLOW = "\033[93m" - RESET = "\033[0m" - - def __str__(self): - return self.value - - -TEXT_SEPARATOR = "-" * 100 - -# Configure the logger -logging.basicConfig( - level=logging.INFO, - format="%(asctime)s [%(levelname)s] - %(message)s", - datefmt="%d-%m-%Y %H:%M:%S", -) -logger = logging.getLogger(__name__) diff --git a/benchmarks/multi_turn/benchmark_serving_multi_turn.py b/benchmarks/multi_turn/benchmark_serving_multi_turn.py deleted file mode 100644 index 66d85eaf5..000000000 --- a/benchmarks/multi_turn/benchmark_serving_multi_turn.py +++ /dev/null @@ -1,1569 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import argparse -import asyncio -import json -import logging -import multiprocessing as mp -import os -import random -import time -from collections import Counter, deque -from datetime import datetime -from enum import Enum -from http import 
HTTPStatus -from statistics import mean -from typing import NamedTuple, Optional, Union - -import aiohttp # type: ignore -import numpy as np # type: ignore -import pandas as pd # type: ignore -from bench_dataset import ( - ConversationsMap, - ConvId, - GenConvArgs, - MessagesList, - ShareGptConversations, - conversations_dict_to_list, - conversations_list_to_dict, - generate_conversations, - parse_input_json_file, -) -from bench_utils import TEXT_SEPARATOR, Color, logger -from transformers import AutoTokenizer # type: ignore - -NUM_TOKENS_FROM_DATASET = 0 -TERM_SIGNAL = None - - -class ConversationSampling(str, Enum): - ROUND_ROBIN = "round_robin" - RANDOM = "random" - - def __str__(self): - return self.value - - -class ClientArgs(NamedTuple): - seed: int - max_num_requests: Optional[int] - skip_first_turn: bool - max_turns: Optional[int] - max_active_conversations: int - verbose: bool - print_content: bool - verify_output: bool - conversation_sampling: ConversationSampling - request_rate: float - - -class RequestArgs(NamedTuple): - chat_url: str - model: str - stream: bool - limit_min_tokens: int # Use negative value for no limit - limit_max_tokens: int # Use negative value for no limit - - -class BenchmarkArgs(NamedTuple): - url: str - num_clients: int - early_stop: bool - - -class ServerResponse(NamedTuple): - valid: bool - ttft_ms: float # time to first chunk - tpot_ms: float # time per output chunk (one or more tokens) - latency_ms: float - start_time_ms: float - first_chunk: str # first chunk of the content - content: str # includes the first_chunk - num_chunks: int - - def __str__(self) -> str: - return f"ttft_ms {self.ttft_ms:.2f}, tpot_ms {self.tpot_ms:.2f}, latency_ms {self.latency_ms:.2f}" # noqa: E501 - - -class RequestStats(NamedTuple): - ttft_ms: float - tpot_ms: float - latency_ms: float - start_time_ms: float - input_num_turns: int - input_num_tokens: int - output_num_tokens: int - output_num_chunks: int - output_num_first_chunk_tokens: int - approx_cached_percent: float - conversation_id: str - client_id: int - - def __str__(self) -> str: - return ( - f"ttft_ms {self.ttft_ms:.2f}, tpot_ms {self.tpot_ms:.2f}, latency_ms {self.latency_ms:.2f}, input_num_tokens {self.input_num_tokens}, " # noqa: E501 - f"output_num_tokens {self.output_num_tokens} ({self.output_num_chunks} chunks, {self.output_num_first_chunk_tokens} tokens in first chunk), " # noqa: E501 - f"approx_cached_percent {self.approx_cached_percent:.2f}%" - ) - - -class MetricStats: - def __init__(self) -> None: - self.min: Optional[float] = None - self.max: Optional[float] = None - self.avg: Optional[float] = None - self.sum = 0.0 - self.count = 0 - - def update(self, value: float) -> None: - if self.min is None: - self.min = value - else: - self.min = min(self.min, value) - - if self.max is None: - self.max = value - else: - self.max = max(self.max, value) - - self.sum += value - self.count += 1 - self.avg = self.sum / self.count - - def __repr__(self) -> str: - if self.count == 0: - return "no data" - return f"avg: {self.avg:>10.3f}, min: {self.min:>10.3f}, max: {self.max:>10.3f}" - - -class MovingAverage: - def __init__(self, window_size: int) -> None: - self.window_size = window_size - self.window = np.zeros(window_size) - self.index = 0 - self.sum = 0.0 - self.count = 0 - self.avg: Optional[float] = None - - def update(self, new_value: float) -> None: - if self.count < self.window_size: - # Filling up the window - self.sum += new_value - self.window[self.count] = new_value - self.count += 1 - else: - # Window 
is full, start replacing old values - old_value = self.window[self.index] - self.sum = self.sum - old_value + new_value - self.window[self.index] = new_value - self.index = (self.index + 1) % self.window_size - - self.avg = self.sum / self.count - - def __repr__(self) -> str: - if self.count == 0: - return "no data" - return f"avg: {self.avg:>10.3f} ({self.count} samples)" - - -class DebugStats: - def __init__(self, logger: logging.Logger, window_size: int) -> None: - self.logger = logger - self.metrics: dict[str, Union[MovingAverage, MetricStats]] = { - "moving_avg_ttft_ms": MovingAverage(window_size), - "moving_avg_tpot_ms": MovingAverage(window_size), - "ttft_ms": MetricStats(), - "tpot_ms": MetricStats(), - "latency_ms": MetricStats(), - "input_num_turns": MetricStats(), - "input_num_tokens": MetricStats(), - "output_num_tokens": MetricStats(), - } - - def update(self, data: RequestStats) -> None: - self.metrics["ttft_ms"].update(data.ttft_ms) - self.metrics["moving_avg_ttft_ms"].update(data.ttft_ms) - self.metrics["tpot_ms"].update(data.tpot_ms) - self.metrics["moving_avg_tpot_ms"].update(data.tpot_ms) - self.metrics["latency_ms"].update(data.latency_ms) - self.metrics["input_num_turns"].update(data.input_num_turns) - self.metrics["input_num_tokens"].update(data.input_num_tokens) - self.metrics["output_num_tokens"].update(data.output_num_tokens) - - def print(self) -> None: - self.logger.info("-" * 50) - for k, v in self.metrics.items(): - kv_info = f"[{k:25}] {v}" - self.logger.info(kv_info) - self.logger.info("-" * 50) - - -# Must support Python 3.8, we can't use str.removeprefix(prefix) -# introduced in Python 3.9 -def remove_prefix(text: str, prefix: str) -> str: - if text.startswith(prefix): - return text[len(prefix) :] - return text - - -def nanosec_to_millisec(value: float) -> float: - return value / 1000000.0 - - -def nanosec_to_sec(value: float) -> float: - return value / 1000000000.0 - - -async def send_request( - session: aiohttp.ClientSession, - messages: list[dict[str, str]], - chat_url: str, - model: str, - stream: bool = True, - min_tokens: Optional[int] = None, - max_tokens: Optional[int] = None, -) -> ServerResponse: - payload = { - "model": model, - "messages": messages, - "seed": 0, - "temperature": 0.0, - } - - if stream: - payload["stream"] = True - payload["stream_options"] = {"include_usage": False} - - if min_tokens is not None: - payload["min_tokens"] = min_tokens - - if max_tokens is not None: - payload["max_tokens"] = max_tokens - - headers = {"Content-Type": "application/json"} - - # Calculate the timeout for the request - timeout_sec = 120 - if max_tokens is not None: - # Assume TPOT of 200ms and use max_tokens to determine timeout - timeout_sec = max(timeout_sec, int(max_tokens * 0.2)) - timeout = aiohttp.ClientTimeout(total=timeout_sec) - - valid_response = True - ttft: Optional[float] = None - chunk_delay: list[int] = [] - latency: Optional[float] = None - first_chunk = "" - generated_text = "" - - start_time: int = time.perf_counter_ns() - most_recent_timestamp: int = start_time - - async with session.post( - url=chat_url, json=payload, headers=headers, timeout=timeout - ) as response: - http_status = HTTPStatus(response.status) - if http_status == HTTPStatus.OK: - async for chunk_bytes in response.content: - chunk_bytes = chunk_bytes.strip() - if not chunk_bytes: - continue - - chunk = remove_prefix(chunk_bytes.decode("utf-8"), "data: ") - if chunk == "[DONE]": - # End of stream - latency = time.perf_counter_ns() - start_time - elif stream is False: - 
data = json.loads(chunk) - message = data["choices"][0]["message"] - assert message["role"] == "assistant" - generated_text += message["content"] - else: - timestamp: int = time.perf_counter_ns() - data = json.loads(chunk) - - # Delta is the new content/text/data - delta = data["choices"][0]["delta"] - if delta.get("content", None): - if ttft is None: - # First token - first_token_time = time.perf_counter_ns() - ttft = first_token_time - start_time - first_chunk = delta["content"] - else: - # Decoding phase - chunk_delay.append(timestamp - most_recent_timestamp) - - generated_text += delta["content"] - - most_recent_timestamp = timestamp - else: - valid_response = False - content = await response.text() - logger.warning( - f"{Color.YELLOW}Received HTTP status {http_status.value} " - f"({http_status.phrase}): {content}{Color.RESET}" - ) - - if latency is None: - latency = -1.0 - if valid_response: - # Streaming is disabled, latency was not set - latency = time.perf_counter_ns() - start_time - - if ttft is None: - # The response was a single chunk - ttft = latency - - # Each chunk may include more than one token - tpot: float = mean(chunk_delay) if len(chunk_delay) > 0 else 0.0 - num_chunks: int = len(chunk_delay) - - sr = ServerResponse( - valid=valid_response, - ttft_ms=nanosec_to_millisec(ttft) if ttft > 0.0 else -1.0, - tpot_ms=nanosec_to_millisec(tpot), - latency_ms=nanosec_to_millisec(latency), - start_time_ms=nanosec_to_millisec(start_time), - first_chunk=first_chunk, - content=generated_text, - num_chunks=num_chunks, - ) - return sr - - -def get_short_string(input: str) -> str: - n = 20 - if len(input) < 400: - return input - - return f"{input[:n]}...{input[-n:]}" - - -def get_token_count(tokenizer: AutoTokenizer, text: str) -> int: - return len(tokenizer(text, add_special_tokens=False).input_ids) - - -def get_messages_token_count( - tokenizer: AutoTokenizer, messages: list[dict[str, str]] -) -> int: - token_count = 0 - for m in messages: - token_count += get_token_count(tokenizer, m["content"]) - - return token_count - - -async def send_turn( - session: aiohttp.ClientSession, - client_id: int, - conv_id: str, - conversation_messages: MessagesList, - messages_to_use: int, - tokenizer: AutoTokenizer, - req_args: RequestArgs, - verbose: bool, - verify_output: bool, -) -> Optional[RequestStats]: - assert messages_to_use > 0 - assert messages_to_use <= len(conversation_messages) - - messages = conversation_messages[:messages_to_use] - - # Index of the next message (the role should be "user") - index = messages_to_use - 1 - - # Verify that the message has only two keys, "role" and "content" - assert len(messages[index].keys()) == 2 - assert "role" in messages[index] and "content" in messages[index] - assert messages[index]["role"] == "user", ( - f"Failed on conversation ID {conv_id}, message role should be user" - ) - - if verbose: - print( - f"{Color.CYAN}Messages (conversation ID {conv_id}," - f" {len(messages)} turns):{Color.RESET}", - messages, - ) - - # None means that there is no upper/lower limit for the output token count - min_tokens = None if req_args.limit_min_tokens < 0 else req_args.limit_min_tokens - max_tokens = None if req_args.limit_max_tokens < 0 else req_args.limit_max_tokens - - if len(conversation_messages) > messages_to_use: - # The conversation contains an assistant answer for the next user prompt - if ( - min_tokens == NUM_TOKENS_FROM_DATASET - or max_tokens == NUM_TOKENS_FROM_DATASET - ): - # Compute number of tokens in the answer (from the input conversation) - 
assistant_answer = conversation_messages[messages_to_use] - answer_num_tokens = get_token_count(tokenizer, assistant_answer["content"]) - assert assistant_answer["role"] == "assistant" - - if min_tokens == NUM_TOKENS_FROM_DATASET: - min_tokens = max(1, answer_num_tokens) - - if max_tokens == NUM_TOKENS_FROM_DATASET: - max_tokens = max(1, answer_num_tokens) - - # Send the current conversation to LLM and get a response - response: ServerResponse = await send_request( - session, - messages, - req_args.chat_url, - req_args.model, - req_args.stream, - min_tokens, - max_tokens, - ) - - if response.valid is False: - # Request failed - return None - - # Compute number of tokens in input / output - input_num_tokens = get_messages_token_count(tokenizer, messages) - - # Num tokens in the user's last question - question_num_tokens = get_token_count(tokenizer, messages[index]["content"]) - - # Num tokens in the history/context of the question - assert input_num_tokens >= question_num_tokens - history_num_tokens = input_num_tokens - question_num_tokens - - # Num tokens in the LLM's answer (first chunk and full answer) - first_chunk_tokens = get_token_count(tokenizer, response.first_chunk) - - output_content = response.content - output_num_tokens = get_token_count(tokenizer, output_content) - - # Prefix caching approximated cached percent - approx_cached_percent = ( - 100.0 * (history_num_tokens / input_num_tokens) if input_num_tokens > 0 else 0.0 - ) - - # Compute the correct TTFT and TPOT (based on tokens and not chunks). - # Required because multiple output tokens may be bundled in a single chunk. - if output_num_tokens > 1 and output_num_tokens > first_chunk_tokens: - # More than one token and more than one chunk in the output - decode_ms = response.latency_ms - response.ttft_ms - decode_num_tokens = output_num_tokens - first_chunk_tokens - tpot_ms = decode_ms / decode_num_tokens - else: - # In this case: output_num_tokens == first_chunk_tokens - # Output was a single chunk (output_num_tokens > 1) - # or even a single token (output_num_tokens == 1) - tpot_ms = 0.0 - - if first_chunk_tokens > 1: - # First chunk had multiple tokens, adjust TTFT for a single token - delta_ms = (first_chunk_tokens - 1) * tpot_ms - ttft_ms = max(0.1, response.ttft_ms - delta_ms) - else: - # First chunk had only one token - ttft_ms = response.ttft_ms - - rs = RequestStats( - ttft_ms=ttft_ms, - tpot_ms=tpot_ms, - latency_ms=response.latency_ms, - start_time_ms=response.start_time_ms, - input_num_turns=len(messages), - input_num_tokens=input_num_tokens, - output_num_tokens=output_num_tokens, - output_num_chunks=response.num_chunks, - output_num_first_chunk_tokens=first_chunk_tokens, - approx_cached_percent=approx_cached_percent, - conversation_id=conv_id, - client_id=client_id, - ) - - if verbose: - print( - f"\n{Color.YELLOW}Response ({output_num_tokens} tokens):{Color.RESET}", - output_content, - ) - print(f"{Color.YELLOW}Response metrics: {rs}{Color.RESET}") - print("-" * 70) - - # Save the LLM's answer (will be used as part of the context for the next user turn) - answer_index = messages_to_use - if len(conversation_messages) > answer_index: - assert conversation_messages[answer_index]["role"] == "assistant", ( - f"Failed on conversation ID {conv_id}, message role should be assistant" - ) - - orig_content = conversation_messages[answer_index]["content"] - if verify_output: - # Compare the new answer to the answer from the input file - debug_info = ( - f"LLM/dataset answers do not match ({conv_id}):" - 
f"\n'{get_short_string(output_content)}' (len: {len(output_content)})," - f"\n'{get_short_string(orig_content)}' (len: {len(orig_content)})" - ) - if orig_content != output_content: - raise ValueError(debug_info) - - # Update the answer - conversation_messages[answer_index]["content"] = output_content - else: - # A user prompt that has no answer, add the answer as a new message - new_answer = {"role": "assistant", "content": output_content} - conversation_messages.append(new_answer) - - return rs - - -async def poisson_sleep(request_rate: float, verbose: bool = False) -> None: - # Generate a random time interval from the Poisson distribution - assert request_rate > 0 - - interval = np.random.exponential(1.0 / request_rate) - if verbose: - logger.info(f"Sleeping for {interval:.3f} seconds...") - await asyncio.sleep(interval) - - -async def client_main( - args: ClientArgs, - req_args: RequestArgs, - client_id: int, - tokenizer: AutoTokenizer, - stop_event: mp.Event, # type: ignore - task_queue: mp.Queue, - result_queue: mp.Queue, - conv_queue: mp.Queue, -) -> None: - logger.info( - f"{Color.CYAN}Started client {client_id}: max_num_requests={args.max_num_requests}, max_active_conversations={args.max_active_conversations}{Color.RESET}" # noqa: E501 - ) - - random.seed(args.seed) - np.random.seed(args.seed) - - # Active conversations - active_convs: ConversationsMap = {} - conv_id_queue: deque = deque(maxlen=args.max_active_conversations) - - # Keep track of how many messages have been used for each conversation - turns_count: Counter = Counter() - num_successes = 0 - num_failures = 0 - - # Track the timestamp (time.perf_counter()) - # of the last turn per conversation (only for debug) - time_of_last_turn: dict[ConvId, float] = {} - - # Flag that indicates that there are no new tasks (conversations) for the client - task_queue_empty = False - - async with aiohttp.ClientSession() as session: - # Print progress - - while task_queue_empty is False: - result = None - - if ( - args.max_num_requests - and num_successes + num_failures == args.max_num_requests - ): - logger.info( - f"{Color.YELLOW}Client {client_id} reached " - f"request limit{Color.RESET}" - ) - break - - if stop_event.is_set(): # type: ignore - logger.info( - f"{Color.YELLOW}Client {client_id} received " - f"a termination signal{Color.RESET}" - ) - break - - while ( - len(active_convs) < args.max_active_conversations - and task_queue_empty is False - ): - # Get a new conversation from the task queue - conv_id, messages = task_queue.get() - - if conv_id is TERM_SIGNAL: - task_queue_empty = True - break - - if args.skip_first_turn: - # Skip the first turn (both user and assistant), - # relevant if warmup was enabled. - # Default turns_count[conv_id] will be zero if conv_id - # was never inserted/updated in turns_count. 
- turns_count[conv_id] += 2 - - if turns_count[conv_id] < len(messages): - # Add new conversation - active_convs[conv_id] = messages - conv_id_queue.append(conv_id) - - if args.verbose: - logger.info( - f"{Color.GREEN}Client {client_id} will use conversation ID {conv_id} (active conversations {len(active_convs)}){Color.RESET}" # noqa: E501 - ) - - elif args.verbose: - # No more messages (conversation finished during the warmup) - logger.info( - f"{Color.YELLOW}Client {client_id} will not use conversation ID {conv_id} (all {len(messages)} messages already sent){Color.RESET}" # noqa: E501 - ) - - if len(active_convs) == 0 or task_queue_empty: - logger.info( - f"{Color.YELLOW}Client {client_id} has no more work{Color.RESET}" - ) - break - - # Pick an active conversation for the next request - if args.conversation_sampling == ConversationSampling.ROUND_ROBIN: - conv_id = conv_id_queue.pop() - else: - # ConversationSampling.RANDOM - active_ids = list(active_convs.keys()) - conv_id = random.choice(active_ids) - - messages = active_convs[conv_id] - assert isinstance(messages, list) and len(messages) > 0 - - # Update the amount of messages to use - turns_count[conv_id] += 1 - current_turn = turns_count[conv_id] - - assert current_turn < len(messages), ( - f"Turn number {current_turn} is invalid for conversation ID {conv_id}" - f" that has only {len(messages)} messages" - ) - - if args.verbose: - curr_time_sec: float = time.perf_counter() - time_since_last_turn: Union[str, float] = "N/A" - if conv_id in time_of_last_turn: - time_since_last_turn = round( - curr_time_sec - time_of_last_turn[conv_id], 3 - ) - logger.info( - f"Client {client_id} using conversation ID {conv_id} (turn: {current_turn}, time since last turn [sec]: {time_since_last_turn})" # noqa: E501 - ) - time_of_last_turn[conv_id] = curr_time_sec - - success = True - try: - result = await send_turn( - session, - client_id, - conv_id, - messages, - current_turn, - tokenizer, - req_args, - args.print_content, - args.verify_output, - ) - if result is not None: - result_queue.put(result) - else: - # None means that the request failed, - # and should not be added to the statistics. 
- success = False - num_failures += 1 - - logger.warning( - f"{Color.YELLOW}Client {client_id} - Request rejected during conversation ID {conv_id} (turn: {current_turn}){Color.RESET}" # noqa: E501 - ) - - # Remove the conversation (should not be used again) - active_convs.pop(conv_id) - - except asyncio.exceptions.TimeoutError: - num_failures += 1 - logger.exception( - f"{Color.RED}Client {client_id} - Timeout during conversation ID {conv_id} (turn: {current_turn}){Color.RESET}" # noqa: E501 - ) - break # Exit gracefully instead of raising an error - - except Exception: - num_failures += 1 - logger.exception( - f"{Color.RED}Client {client_id} - Exception during conversation ID {conv_id} (turn: {current_turn}){Color.RESET}" # noqa: E501 - ) - break # Exit gracefully instead of raising an error - - if success: - num_successes += 1 - - # Update the turns counter to include the LLM response - # The LLM response will be used as context for the next user turn - turns_count[conv_id] += 1 - - max_turns = len(messages) - if args.max_turns is not None: - # Limit the number of turns in the conversation - max_turns = min(args.max_turns, max_turns) - - if turns_count[conv_id] >= max_turns: - # Conversation has no more turns (no longer active) - # save the updated conversation (with the LLM server's answer) - conv_queue.put((conv_id, active_convs.pop(conv_id))) - if args.verbose: - logger.info( - f"{Color.GREEN}Client {client_id} finished " - f"conversation ID {conv_id}{Color.RESET}" - ) - else: - # Conversation is not finished, insert it at the back of the queue - conv_id_queue.appendleft(conv_id) - - # Sleep between requests (if lambda is positive) - if args.request_rate > 0: - await poisson_sleep(args.request_rate, args.verbose) - - # Send indication that the client is done - conv_queue.put((TERM_SIGNAL, TERM_SIGNAL)) - - logger.info( - f"{Color.CYAN}Client {client_id} is done " - f"({num_successes=}, {num_failures=}){Color.RESET}" - ) - - -def worker_function( - client_id: int, - tokenizer: AutoTokenizer, - client_args: ClientArgs, - req_args: RequestArgs, - stop_event: mp.Event, # type: ignore - task_queue: mp.Queue, - result_queue: mp.Queue, - conv_queue: mp.Queue, -) -> None: - asyncio.run( - client_main( - client_args, - req_args, - client_id, - tokenizer, - stop_event, - task_queue, - result_queue, - conv_queue, - ) - ) - - -def get_client_config( - args: argparse.Namespace, input_conv: ConversationsMap -) -> tuple[ClientArgs, RequestArgs]: - if args.num_clients < 1: - raise ValueError("Number of clients must be a positive number") - - if len(input_conv) < args.num_clients: - raise ValueError( - "Number of conversations must be equal or larger than the number of clients" - ) - - max_req_per_client: Optional[int] = None - if args.max_num_requests is not None: - # Max number of requests per client - req_per_client = args.max_num_requests // args.num_clients - if req_per_client < 1: - raise ValueError("Number of requests should be at least one per client") - max_req_per_client = req_per_client - - max_active_conversations = args.max_active_conversations - if max_active_conversations is None: - # Each client will have only one active conversation at a time - max_active_conversations = args.num_clients - - if max_active_conversations > len(input_conv): - raise ValueError( - f"Max active conversations {max_active_conversations} " - "must be equal or less than the total number of conversations" - ) - - # Max number of active conversations per client - max_active_conv_per_client = 
max_active_conversations // args.num_clients - if max_active_conv_per_client < 1: - raise ValueError( - f"Max active conversations {max_active_conversations} " - "must be equal or greater than the number of clients" - ) - - # Skip the first user turn (as part of the warmup) - skip_first_turn = args.warmup_step - - # Common arguments for all clients - client_args = ClientArgs( - seed=args.seed, - max_num_requests=max_req_per_client, - skip_first_turn=skip_first_turn, - max_turns=args.max_turns, - max_active_conversations=max_active_conv_per_client, - verbose=args.verbose, - print_content=args.print_content, - verify_output=args.verify_output, - conversation_sampling=args.conversation_sampling, - request_rate=args.request_rate, - ) - - if args.limit_min_tokens > 0 or args.limit_max_tokens > 0: - if args.limit_min_tokens < 1 or args.limit_max_tokens < 1: - raise ValueError( - "Invalid min/max tokens limits (both limits should be provided)" - ) - if args.limit_min_tokens > args.limit_max_tokens: - raise ValueError( - "Invalid min/max tokens limits (min should not be larger than max)" - ) - - # Arguments for API requests - chat_url = f"{args.url}/v1/chat/completions" - model_name = args.served_model_name if args.served_model_name else args.model - - req_args = RequestArgs( - chat_url=chat_url, - model=model_name, - stream=not args.no_stream, - limit_min_tokens=args.limit_min_tokens, - limit_max_tokens=args.limit_max_tokens, - ) - - return client_args, req_args - - -async def main_mp( - client_args: ClientArgs, - req_args: RequestArgs, - bench_args: BenchmarkArgs, - tokenizer: AutoTokenizer, - input_conv: ConversationsMap, -) -> tuple[ConversationsMap, list[RequestStats]]: - # An event that will trigger graceful termination of all the clients - stop_event = mp.Event() - - # Queue for input conversations (from the input file/dataset) - task_queue: mp.Queue = mp.Queue() - - # Queue for client measurements (TTFT, TPOT, etc. 
for each request) - result_queue: mp.Queue = mp.Queue() - - # Queue for output conversations (with the LLM answers, sent by the server) - conv_queue: mp.Queue = mp.Queue() - output_conv: ConversationsMap = {} - client_metrics: list[RequestStats] = [] - - # Start all clients - start_time = time.perf_counter_ns() - logger.info(f"{Color.GREEN}Starting {bench_args.num_clients} clients{Color.RESET}") - - clients = [] - for client_id in range(bench_args.num_clients): - client = mp.Process( - name=f"client_{client_id}", - target=worker_function, - args=( - client_id, - tokenizer, - client_args, - req_args, - stop_event, - task_queue, - result_queue, - conv_queue, - ), - ) - clients.append(client) - client.start() - - # Submit all the input conversations as tasks for the clients - for conv_id, messages in input_conv.items(): - task_queue.put((conv_id, messages)) - - # Add termination signals for clients - for _ in range(bench_args.num_clients): - task_queue.put((TERM_SIGNAL, TERM_SIGNAL)) - - # Collect the updated conversations from all clients - num_clients_finished = 0 - total_convs = len(input_conv) - - debug_stats = DebugStats(logger, min(15 * bench_args.num_clients, 500)) - - while num_clients_finished < bench_args.num_clients: - # Collect updated conversation - conv_id, messages = conv_queue.get() - - # Collect results (measurements) - while not result_queue.empty(): - new_data = result_queue.get() - client_metrics.append(new_data) - debug_stats.update(new_data) - - if conv_id is TERM_SIGNAL: - num_clients_finished += 1 - logger.info( - f"{Color.CYAN}{num_clients_finished} out of " - f"{bench_args.num_clients} clients finished{Color.RESET}" - ) - - if bench_args.early_stop and not stop_event.is_set(): - # Once one client finished, stop all other clients. - # there is no reason to continue the benchmark with fewer clients. - logger.info( - f"{Color.YELLOW}Sending termination signal to clients{Color.RESET}" - ) - stop_event.set() - else: - output_conv[conv_id] = messages - - finished_convs = len(output_conv) - percent = finished_convs / total_convs - - # Tuned to control the print rate (can be changed if required) - print_cycle = max(3, int(bench_args.num_clients / 4)) - - if finished_convs % print_cycle == 0: - runtime_sec = nanosec_to_sec(time.perf_counter_ns() - start_time) - logger.info( - f"{Color.CYAN}Finished {finished_convs} out of {total_convs} conversations ({percent:.0%}), " # noqa: E501 - f"{num_clients_finished} out of {bench_args.num_clients} clients finished, collected {len(client_metrics)} measurements, runtime {runtime_sec:.3f} sec{Color.RESET}" # noqa: E501 - ) - - rps: Union[str, float] = round(len(client_metrics) / runtime_sec, 3) - if len(client_metrics) < (5 * bench_args.num_clients): - # Do not estimate the RPS if the number of samples is very low - # (threshold can be tuned if needed) - rps = "N/A" - - runtime_left_sec: Union[str, float] = round( - (runtime_sec / finished_convs) * (total_convs - finished_convs), 3 - ) - if percent < 0.05: - # If less than 5% of the conversations were not finished, - # the estimation will probably be very inaccurate - # (threshold can be tuned if needed). - runtime_left_sec = "N/A" - - logger.info( - f"{Color.CYAN}Estimated req/sec {rps}, estimated runtime left {runtime_left_sec} sec{Color.RESET}" # noqa: E501 - ) - debug_stats.print() - - logger.info( - f"{Color.CYAN}All {bench_args.num_clients} clients finished{Color.RESET}" - ) - - # At this point all the clients finished, - # collect results (TTFT, TPOT, etc.) from all the clients. 
- # This needs to happen before calling join on the clients - # (result_queue should be emptied). - while not result_queue.empty(): - client_metrics.append(result_queue.get()) - - logger.info(f"Collected {len(client_metrics)} samples from all the clients") - - # Wait for all clients to finish - for client in clients: - logger.info( - f"{Color.CYAN}Waiting for client {client.name} " - f"(is alive: {client.is_alive()}){Color.RESET}" - ) - - client.join(timeout=120) - - if client.is_alive(): - logger.warning( - f"{Color.YELLOW}Client {client.name} will be terminated{Color.RESET}" - ) - client.terminate() - - exitcode = client.exitcode - if exitcode != 0: - logger.error( - f"{Color.RED}Client {client.name} exited " - f"with exit code {exitcode}{Color.RESET}" - ) - - logger.info( - f"All {bench_args.num_clients} clients exited (successfully " - f"finished {len(output_conv)} out of {total_convs} conversations)" - ) - - # Queues should be closed, required to avoid hang at interpreter shutdown - unfinished_tasks = 0 - while not task_queue.empty(): - task_queue.get() - unfinished_tasks += 1 - - if unfinished_tasks > 0: - # Can happen if not all tasks (conversations) have finished. - # May happen if --max-num-requests was used, - # or if an error occurred in one of the clients. - logger.debug(f"Discarding {unfinished_tasks} unfinished tasks") - - task_queue.close() - task_queue.join_thread() - - result_queue.close() - result_queue.join_thread() - - conv_queue.close() - conv_queue.join_thread() - - return output_conv, client_metrics - - -def get_filename_with_timestamp(label: str, extension: str) -> str: - time_now = datetime.now() - timestamp = time_now.strftime("%d-%m-%Y_%H-%M-%S") - filename = f"{label}__{timestamp}.{extension}" - return filename - - -def process_statistics( - client_metrics: list[RequestStats], - warmup_percentages: list[float], - test_params: dict, - verbose: bool, - gen_conv_args: Optional[GenConvArgs] = None, - excel_output: bool = False, -) -> None: - if len(client_metrics) == 0: - logger.info("No samples to process") - return - - logger.info(f"Processing {len(client_metrics)} samples...") - - raw_data = pd.DataFrame(client_metrics) - - if verbose: - # Calculate the time between user turns in each conversation (in a new column) - raw_data = raw_data.sort_values(by=["conversation_id", "start_time_ms"]) - raw_data["time_between_user_turns_sec"] = raw_data.groupby("conversation_id")[ - "start_time_ms" - ].diff() - - # Convert milliseconds to seconds - raw_data["time_between_user_turns_sec"] = ( - raw_data["time_between_user_turns_sec"] / 1000.0 - ) - - # Final raw data should be sorted by time - raw_data = raw_data.sort_values(by=["start_time_ms"]) - raw_data["end_time_ms"] = raw_data["start_time_ms"] + raw_data["latency_ms"] - - percentiles = [0.25, 0.5, 0.75, 0.9] - - # Add more percentiles if there are enough samples - if len(raw_data) >= 100: - percentiles.append(0.99) - - if len(raw_data) >= 1000: - percentiles.append(0.999) - - if len(raw_data) >= 10000: - percentiles.append(0.9999) - - # Set precision for numbers in the output text (the dataframes) - pd.set_option("display.precision", 2) - - # Exclude parameters from RequestStats - exclude = [ - "start_time_ms", - "end_time_ms", - "output_num_first_chunk_tokens", - "approx_cached_percent", - "conversation_id", - "client_id", - ] - - print(TEXT_SEPARATOR) - print(f"{Color.YELLOW}Parameters:{Color.RESET}") - for k, v in test_params.items(): - print(f"{k}={v}") - - # conversations generation parameters - if gen_conv_args is 
not None: - gen_params = { - "text_files": ", ".join(gen_conv_args.text_files), - "input_num_turns": str(gen_conv_args.input_num_turns), - "input_common_prefix_num_tokens": str( - gen_conv_args.input_common_prefix_num_tokens - ), - "input_prefix_num_tokens": str(gen_conv_args.input_prefix_num_tokens), - "input_num_tokens": str(gen_conv_args.input_num_tokens), - "output_num_tokens": str(gen_conv_args.output_num_tokens), - } - - print(f"{Color.YELLOW}Conversations Generation Parameters:{Color.RESET}") - for k, v in gen_params.items(): - print(f"{k}={v}") - - print(TEXT_SEPARATOR) - - params_list = [] - df_list = [] - for percent in warmup_percentages: - # Select samples from the end (tail) of the dataframe - warmup_count = int(percent * len(raw_data)) - tail_count = len(raw_data) - warmup_count - if tail_count == 0: - # No reason to process if the count of samples is zero - break - - df = raw_data.tail(tail_count) - - # Runtime is the diff between the end of the last request - # and the start of the first request - runtime_sec = df["end_time_ms"].iloc[-1] - df["start_time_ms"].iloc[0] - - # Convert milliseconds to seconds - runtime_sec = runtime_sec / 1000.0 - requests_per_sec = float(len(df)) / runtime_sec - - params = {"runtime_sec": runtime_sec, "requests_per_sec": requests_per_sec} - - # Generate a summary of relevant metrics (and drop irrelevant data) - df = df.drop(columns=exclude).describe(percentiles=percentiles).transpose() - - # List for Excel file - params_list.append(params) - df_list.append(df) - - # Print the statistics summary - if percent > 0 or len(warmup_percentages) > 1: - print( - f"{Color.YELLOW}Statistics summary " - f"(assuming {percent:.0%} warmup samples):{Color.RESET}" - ) - else: - print(f"{Color.YELLOW}Statistics summary:{Color.RESET}") - - for k, v in params.items(): - if isinstance(v, float): - print(f"{k} = {v:.3f}") - else: - print(f"{k} = {v}") - print(TEXT_SEPARATOR) - print(df) - print(TEXT_SEPARATOR) - - if excel_output: - prefix = f"statistics_{test_params['num_clients']}_clients" - filename = get_filename_with_timestamp(prefix, "xlsx") - - with pd.ExcelWriter(filename, engine="xlsxwriter") as writer: - startrow = 0 - test_params_df = pd.DataFrame([test_params]) - test_params_df.to_excel( - writer, sheet_name="Summary", index=False, startrow=startrow - ) - startrow += len(test_params_df) + 3 - - if gen_conv_args is not None: - gen_params_df = pd.DataFrame([gen_params]) - gen_params_df.to_excel( - writer, sheet_name="Summary", index=False, startrow=(startrow - 1) - ) - startrow += len(gen_params_df) + 3 - - for params, df_stats in zip(params_list, df_list): - df_params = pd.DataFrame([params]) - df_params.to_excel( - writer, sheet_name="Summary", index=False, startrow=startrow - ) - startrow += len(df_params) + 2 - df_stats.to_excel( - writer, sheet_name="Summary", index=True, startrow=startrow - ) - startrow += len(df_stats) + 3 - - raw_data.to_excel(writer, sheet_name="Raw data", index=False, startrow=0) - - logger.info( - f"{Color.GREEN}Client metrics exported to file: {filename}{Color.RESET}" - ) - - -async def get_server_info(url: str) -> None: - logger.info(f"{Color.BLUE}Collecting information from server: {url}{Color.RESET}") - async with aiohttp.ClientSession() as session: - # Get server version (not mandatory, "version" endpoint may not exist) - url_version = f"{url}/version" - async with session.get(url_version) as response: - if HTTPStatus(response.status) == HTTPStatus.OK: - text = await response.text() - logger.info(f"{Color.BLUE}Server 
version: {text}{Color.RESET}") - - # Get available models - url_models = f"{url}/v1/models" - async with session.get(url_models) as response: - if HTTPStatus(response.status) == HTTPStatus.OK: - text = await response.text() - logger.info(f"{Color.BLUE}Models:{Color.RESET}") - models_data = json.loads(text) - models_list = models_data["data"] - for model in models_list: - model_id = model["id"] - max_model_len = model.get("max_model_len", "N/A") - logger.info( - f"{Color.BLUE}\t{model_id=}, {max_model_len=}{Color.RESET}" - ) - else: - logger.info(f"{Color.RED}Failed to get models{Color.RESET}") - - -async def main() -> None: - parser = argparse.ArgumentParser( - prog="Benchmark serving with multi-turn conversations", - description="Benchmark online inference using REST API", - ) - parser.add_argument("--version", action="version", version="%(prog)s 1.0") - - parser.add_argument( - "-i", - "--input-file", - type=str, - required=True, - help="Input JSON file with ShareGPT conversations or " - "configuration file for generation of synthetic conversations", - ) - parser.add_argument( - "-o", - "--output-file", - type=str, - default=None, - help="Output JSON file containing conversations with updated assistant answers", - ) - - parser.add_argument( - "--seed", - type=int, - default=0, - help="Seed for random number generators (default: 0)", - ) - - parser.add_argument( - "-m", "--model", type=str, required=True, help="Path of the LLM model" - ) - parser.add_argument( - "--served-model-name", - type=str, - default=None, - help="The model name used in the API. " - "If not specified, the model name will be the " - "same as the ``--model`` argument. ", - ) - - parser.add_argument( - "-u", - "--url", - type=str, - default="http://localhost:8000", - help="Base URL for the LLM API server", - ) - - parser.add_argument( - "-p", - "--num-clients", - type=int, - default=1, - help="Number of clients that will send requests in parallel", - ) - parser.add_argument( - "-k", - "--max-active-conversations", - type=int, - default=None, - help="Max number of active conversations at a time (for all clients)", - ) - parser.add_argument( - "-n", - "--max-num-requests", - type=int, - default=None, - help="Max number of requests to send (total for all clients)", - ) - - parser.add_argument( - "--warmup-step", - default=False, - action="store_true", - help="Run a warmup step (using only the first turn of every conversation), " - "measurements will not be included in the final benchmark results", - ) - - parser.add_argument( - "--max-turns", - type=int, - default=None, - help="Maximum number of turns/messages per conversation, " - "includes both user and assistant messages " - "(a positive number, e.g: 2, 4, 6, etc.), disabled by default", - ) - parser.add_argument( - "--no-early-stop", - default=False, - action="store_true", - help="By default, the benchmark will stop if at least one client exits." - " Use this flag to disable this behavior", - ) - - parser.add_argument( - "--limit-max-tokens", - type=int, - default=NUM_TOKENS_FROM_DATASET, - help="Set max_tokens for the output token count of each request " - "(must also set --limit-min-tokens). " - "Overrides output token count from the input dataset. " - "Use a negative value to disable this limit.", - ) - parser.add_argument( - "--limit-min-tokens", - type=int, - default=NUM_TOKENS_FROM_DATASET, - help="Set min_tokens for the output token count of each request " - "(must also set --limit-max-tokens). " - "Overrides output token count from the input dataset. 
" - "Use a negative value to disable this limit.", - ) - - parser.add_argument( - "--request-rate", - type=float, - default=0, - help="Expected request rate (Poisson process) per client in requests/sec." - "Set to 0 for no delay between requests.", - ) - parser.add_argument( - "--conversation-sampling", - type=ConversationSampling, - choices=list(ConversationSampling), - default=ConversationSampling.ROUND_ROBIN, - help=( - "Strategy for selecting which conversation to use for the next request. " - "Options: 'round_robin' (cycle through conversations), " - "'random' (pick randomly)." - ), - ) - parser.add_argument( - "--verify-output", - default=False, - action="store_true", - help="Verify the LLM output (compare to the answers in the input JSON file)", - ) - - parser.add_argument( - "--no-stream", - default=False, - action="store_true", - help="Disable stream/streaming mode (set 'stream' to False in the API request)", - ) - - parser.add_argument( - "-e", - "--excel-output", - default=False, - action="store_true", - help="Export summary to Excel file (optional)", - ) - parser.add_argument( - "-v", - "--verbose", - default=False, - action="store_true", - help="Enable verbose output", - ) - parser.add_argument( - "--print-content", - default=False, - action="store_true", - help="Print the user prompts and the server's answers", - ) - - parser.add_argument( - "--warmup-percentages", - type=str, - default="0%", - help="Ignore the first X samples as warmup (X is a percentage)." - " A comma separated list of percentages can be used " - "(for example: --warmup-percentages=0%%,50%%)", - ) - - args = parser.parse_args() - - logger.info(args) - - logger.info(f"{Color.GREEN}Input parameters:{Color.RESET}") - logger.info(f"url={args.url}") - logger.info(f"model={args.model}") - logger.info(f"num_clients={args.num_clients}") - - if args.verify_output: - logger.info(f"{Color.PURPLE}Verify is enabled{Color.RESET}") - - # Calculate the amount of samples to filter (as warmup samples/measurements). 
- try: - warmup_percentages: list[float] = [0.0] - if not args.warmup_step: - # Warmup percentage can be used only if the warmup step was used - warmup_strings: list[str] = args.warmup_percentages.split(",") - warmup_strings = [x.replace("%", "") for x in warmup_strings] - warmup_percentages = [float(x) / 100 for x in warmup_strings] - - # Check for valid range (0 to 1) - for p in warmup_percentages: - assert p >= 0.0 and p < 1.0 - - # Sort from high to low warmup percentage - warmup_percentages.sort() - - logger.info( - f"Warmup percentages (percentage of samples): {warmup_percentages}" - ) - - except Exception: - raise ValueError( - f"Invalid --warmup-percentage={args.warmup_percentage}" - ) from None - - random.seed(args.seed) - np.random.seed(args.seed) - - if not os.path.exists(args.model): - raise OSError(f"Path does not exist: {args.model}") - logger.info("Loading tokenizer") - tokenizer = AutoTokenizer.from_pretrained(args.model) - - await get_server_info(args.url) - - # Load the input file (either conversations of configuration file) - logger.info(f"Reading input file: {args.input_file}") - with open(args.input_file) as f: - input_data = json.load(f) - - gen_conv_args = None - if isinstance(input_data, list): - # The conversations are stored as a list of dicts - logger.info(f"Found {len(input_data)} items in the input file") - - # Convert the list to a ConversationsMap - conversations = conversations_list_to_dict(input_data) - - elif isinstance(input_data, dict): - # The input file is a configuration file - # (type is determined by the field 'filetype') - if "filetype" not in input_data: - raise Exception( - f"Input file {args.input_file} is invalid (missing 'filetype')" - ) - - logger.info(f"Using input file with filetype: {input_data['filetype']}") - - gen_conv_args = parse_input_json_file(input_data) - - # Disable warning from "huggingface/tokenizers" - # (when using python multiprocessing and tokenizers) - os.environ["TOKENIZERS_PARALLELISM"] = "true" - - # Generate synthetic conversations - conversations = generate_conversations(gen_conv_args, tokenizer) - - else: - raise Exception(f"Input file {args.input_file} is invalid") - - if args.max_turns is not None: - if args.max_turns < 1: - raise ValueError("Max turns must be a positive number") - logger.info( - f"{Color.PURPLE}Max turns per conversation " - f"is limited to {args.max_turns}{Color.RESET}" - ) - - # Create benchmark configurations - client_args, req_args = get_client_config(args, conversations) - - bench_args = BenchmarkArgs( - url=args.url, num_clients=args.num_clients, early_stop=not args.no_early_stop - ) - - # Warm-up step - if args.warmup_step: - # Only send a single user prompt from every conversation. - # max_active_conversations must be 1, - # otherwise the clients may exit after sending a single request - # (because the task queue is empty). 
- warmup_client_args = client_args._replace( - skip_first_turn=False, max_turns=1, max_active_conversations=1 - ) - - # Early stop should be disabled, - # all clients should finish their work before exiting - warmup_bench_args = bench_args._replace(early_stop=False) - - logger.info(f"{Color.PURPLE}Warmup start{Color.RESET}") - conversations, _ = await main_mp( - warmup_client_args, req_args, warmup_bench_args, tokenizer, conversations - ) - logger.info(f"{Color.PURPLE}Warmup done{Color.RESET}") - - # Run the benchmark - start_time = time.perf_counter_ns() - client_convs, client_metrics = await main_mp( - client_args, req_args, bench_args, tokenizer, conversations - ) - total_runtime_ms = nanosec_to_millisec(time.perf_counter_ns() - start_time) - - # Calculate requests per second - total_runtime_sec = total_runtime_ms / 1000.0 - rps = len(client_metrics) / total_runtime_sec - logger.info( - f"{Color.GREEN}All clients finished, total runtime: {total_runtime_sec:.3f} sec" - f" ({total_runtime_ms:.3f} ms), requests per second: {rps:.3f}{Color.RESET}" - ) - - # Benchmark parameters - params = { - "model": args.model, - "num_clients": args.num_clients, - "num_conversations": len(conversations), - "active_conversations": args.max_active_conversations, - "seed": args.seed, - } - - if args.limit_min_tokens > 0: - params["min_tokens"] = args.limit_min_tokens - - if args.limit_max_tokens > 0: - params["max_tokens"] = args.limit_max_tokens - - # Process and print statistics (and save excel file with the statistics) - process_statistics( - client_metrics, - test_params=params, - warmup_percentages=warmup_percentages, - verbose=args.verbose, - gen_conv_args=gen_conv_args, - excel_output=args.excel_output, - ) - - if args.output_file is not None: - # Write a JSON file with the updated conversations - # The "assistant" content will contain the answers from the tested LLM - output_data: ShareGptConversations = conversations_dict_to_list(client_convs) - logger.info( - f"{Color.GREEN}Writing conversations file: {args.output_file}{Color.RESET}" - ) - with open(args.output_file, "w") as f: - json.dump(output_data, f, indent=4) - - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/benchmarks/multi_turn/convert_sharegpt_to_openai.py b/benchmarks/multi_turn/convert_sharegpt_to_openai.py deleted file mode 100644 index c3622c99a..000000000 --- a/benchmarks/multi_turn/convert_sharegpt_to_openai.py +++ /dev/null @@ -1,354 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -""" -Download dataset from: -https://huggingface.co/datasets/philschmid/sharegpt-raw/blob/main/sharegpt_20230401_clean_lang_split.json - -Convert to OpenAI API: -export INPUT_FILE=sharegpt_20230401_clean_lang_split.json -python convert_sharegpt_to_openai.py $INPUT_FILE sharegpt_conv_128.json --max-items=128 -""" - -import argparse -import json -import random -from statistics import mean -from typing import Any, Optional - -import pandas as pd # type: ignore -import tqdm # type: ignore -from transformers import AutoTokenizer # type: ignore - - -def has_non_english_chars(text: str) -> bool: - return not text.isascii() - - -def content_is_valid( - content: str, min_content_len: Optional[int], max_content_len: Optional[int] -) -> bool: - if min_content_len and len(content) < min_content_len: - return False - - if max_content_len and len(content) > max_content_len: - return False - - return has_non_english_chars(content) - - -def print_stats( - conversations: "list[dict[Any, 
Any]]", tokenizer: Optional[AutoTokenizer] = None -) -> None: - # Collect statistics - stats = [] - - print("\nCollecting statistics...") - for item in tqdm.tqdm(conversations): - # item has "id" and "messages" - messages = item["messages"] - - user_turns = 0 - assistant_turns = 0 - user_words = 0 - assistant_words = 0 - conv_chars = 0 - - user_tokens: list[int] = [] - assistant_tokens: list[int] = [] - - for m in messages: - content = m["content"] - conv_chars += len(content) - content_num_words = content.count(" ") + 1 - - num_tokens = 0 - if tokenizer: - num_tokens = len(tokenizer(m["content"]).input_ids) - - if m["role"] == "user": - user_turns += 1 - user_words += content_num_words - if tokenizer: - user_tokens.append(num_tokens) - - elif m["role"] == "assistant": - assistant_turns += 1 - assistant_words += content_num_words - if tokenizer: - assistant_tokens.append(num_tokens) - - # assert user_turns == assistant_turns, \ - # f"Invalid conversation ID {item['id']}" - - conv_words = user_words + assistant_words - item_stats = { - "user_turns": user_turns, - "assistant_turns": assistant_turns, - "user_words": user_words, - "assistant_words": assistant_words, - "conv_turns": len(messages), - "conv_words": conv_words, - "conv_characters": conv_chars, - } - - if len(user_tokens) > 0: - item_stats["user_tokens"] = int(mean(user_tokens)) - - if len(assistant_tokens) > 0: - item_stats["assistant_tokens"] = int(mean(assistant_tokens)) - - stats.append(item_stats) - - print("\nStatistics:") - percentiles = [0.25, 0.5, 0.75, 0.9, 0.99, 0.999, 0.9999] - df = pd.DataFrame(stats) - print(df.describe(percentiles=percentiles).transpose()) - - -def convert_sharegpt_to_openai( - seed: int, - input_file: str, - output_file: str, - max_items: Optional[int], - min_content_len: Optional[int] = None, - max_content_len: Optional[int] = None, - min_turns: Optional[int] = None, - max_turns: Optional[int] = None, - model: Optional[str] = None, -) -> None: - if min_turns and max_turns: - assert min_turns <= max_turns - - if min_content_len and max_content_len: - # Verify that min is not larger than max if both were given - assert min_content_len <= max_content_len - - print( - f"Input parameters:\n{seed=}, {max_items=}, {min_content_len=}," - f" {max_content_len=}, {min_turns=}, {max_turns=}\n" - ) - - random.seed(seed) - - tokenizer = None - if model is not None: - print(f"Loading tokenizer from: {model}") - tokenizer = AutoTokenizer.from_pretrained(model) - - # Read the ShareGPT JSON file - print(f"Reading file: {input_file}") - with open(input_file, encoding="utf-8") as f: - # Should be a list of dicts - # Each dict should have "id" (string) and "conversations" (list of dicts) - sharegpt_data = json.load(f) - - assert isinstance(sharegpt_data, list), "Input file should contain a list of dicts" - - print(f"Total items in input file: {len(sharegpt_data):,}") - - print(f"Shuffling dataset with seed {seed}") - random.shuffle(sharegpt_data) - - # Map conversation ID to the all the messages - conversation_parts: dict[str, list[Any]] = {} - - for item in tqdm.tqdm(sharegpt_data): - assert "id" in item, "Missing key 'id'" - assert "conversations" in item, "Missing key 'conversations'" - - # Conversation ID (e.g: "hiWPlMD") and part/session (0, 1, 2, etc.) 
- conv_id, _ = item["id"].split("_") - new_turns = item["conversations"] - - if conv_id not in conversation_parts: - # Start new conversation - conversation_parts[conv_id] = [] - elif len(conversation_parts[conv_id]) > 0 and len(new_turns) > 0: - prev_turns = conversation_parts[conv_id][-1] - if prev_turns[-1]["from"] == new_turns[0]["from"]: - new_turns = new_turns[1:] - - if len(new_turns) > 0: - # We assume that parts are in order in the ShareGPT dataset - conversation_parts[conv_id].append(new_turns) - - dataset: list[dict[str, Any]] = [] - for conv_id, conv_parts in conversation_parts.items(): - new_item = {"id": conv_id} - - conversations: list[dict[str, str]] = [] - - # Merge all parts - for conv_part in conv_parts: - conversations.extend(conv_part) - - if len(conversations) > 0: - new_item["conversations"] = conversations - dataset.append(new_item) - - print(f"Total unique conversations (IDs) in input file: {len(dataset):,}") - - # Final output data - final_openai_dataset: list[dict] = [] - - # Filter conversations from the ShareGPT dataset and convert to OpenAI format - for item in tqdm.tqdm(dataset): - messages: list[dict] = [] - - assert "id" in item, "Missing key 'id'" - assert "conversations" in item, "Missing key 'conversations'" - - conv_id = item["id"] - conversations = item["conversations"] - - if min_turns is not None and len(conversations) < min_turns: - # Skip short conversations - continue - - # Convert each message in the conversation, up to max_turns if specified - for i, turn in enumerate(conversations): - assert "from" in turn and "value" in turn, ( - f"Invalid conversation ID {conv_id} - missing 'from' or 'value'" - ) - - role = None - turn_from = turn["from"] - - if turn_from in {"human", "user"}: - role = "user" - elif turn_from in {"gpt", "bing", "chatgpt", "bard"}: - role = "assistant" - elif turn_from == "system": - role = "system" - - assert role is not None, ( - f"Invalid conversation ID {conv_id} - 'from'='{turn_from}' is invalid" - ) - - if i == 0 and role != "user": - # If the first message is from assistant (gpt), skip it. - # this happens when the conversation is a follow-up - # to a previous conversation (from the same user). 
- continue - - if max_turns is not None and i >= max_turns: - break - - # Convert message to OpenAI format (with "role" and "content") - content = turn["value"] - messages.append({"role": role, "content": content}) - - # Add the converted conversation to the OpenAI format - if len(messages) > 0: - valid_messages = True - - # First turn should always be from the user - user_turn = True - - for m in messages: - # Make sure that turns alternate between user and assistant - if (user_turn and m["role"] != "user") or ( - not user_turn and m["role"] != "assistant" - ): - valid_messages = False - break - - user_turn = not user_turn - - content = m["content"] - valid_messages = content_is_valid( - content, min_content_len, max_content_len - ) - if not valid_messages: - break - - if valid_messages is True: - final_openai_dataset.append({"id": conv_id, "messages": messages}) - - assert len(final_openai_dataset) > 0, "Final number of conversations is zero" - - print_stats(final_openai_dataset) - - print_stats_again = False - if max_items is not None and len(final_openai_dataset) > max_items: - print(f"\n\nSampling {max_items} items from the dataset...") - print_stats_again = True - final_openai_dataset = random.sample(final_openai_dataset, max_items) - - if print_stats_again: - # Print stats after the dataset changed - print_stats(final_openai_dataset, tokenizer) - - # Write the converted data to a new JSON file - final_size = len(final_openai_dataset) - print(f"\nTotal conversations converted (after filtering): {final_size:,}") - print(f"\nWriting file: {output_file}") - with open(output_file, "w", encoding="utf-8") as f: - json.dump(final_openai_dataset, f, ensure_ascii=False, indent=2) - - -def main() -> None: - parser = argparse.ArgumentParser( - description="Convert ShareGPT dataset to OpenAI API format" - ) - parser.add_argument("input_file", help="Path to the input ShareGPT JSON file") - parser.add_argument( - "output_file", help="Path to the output OpenAI format JSON file" - ) - parser.add_argument( - "--seed", type=int, default=0, help="Seed for random number generators" - ) - parser.add_argument( - "--max-items", - type=int, - default=None, - help="Maximum number of items in the output file", - ) - parser.add_argument( - "--min-turns", - type=int, - default=None, - help="Minimum number of turns per conversation", - ) - parser.add_argument( - "--max-turns", - type=int, - default=None, - help="Maximum number of turns per conversation", - ) - parser.add_argument( - "--min-content-len", - type=int, - default=None, - help="Min number of characters in the messages' content", - ) - parser.add_argument( - "--max-content-len", - type=int, - default=None, - help="Max number of characters in the messages' content", - ) - parser.add_argument( - "--model", - type=str, - default=None, - help="LLM model, only the tokenizer will be used", - ) - - args = parser.parse_args() - - convert_sharegpt_to_openai( - args.seed, - args.input_file, - args.output_file, - args.max_items, - args.min_content_len, - args.max_content_len, - args.min_turns, - args.max_turns, - args.model, - ) - - -if __name__ == "__main__": - main() diff --git a/benchmarks/multi_turn/requirements.txt b/benchmarks/multi_turn/requirements.txt deleted file mode 100644 index f0e193591..000000000 --- a/benchmarks/multi_turn/requirements.txt +++ /dev/null @@ -1,5 +0,0 @@ -numpy>=1.24 -pandas>=2.0.0 -aiohttp>=3.10 -transformers>=4.46 -xlsxwriter>=3.2.1 \ No newline at end of file diff --git a/benchmarks/overheads/benchmark_hashing.py 
b/benchmarks/overheads/benchmark_hashing.py deleted file mode 100644 index 0957a9c65..000000000 --- a/benchmarks/overheads/benchmark_hashing.py +++ /dev/null @@ -1,64 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import cProfile -import pstats - -from vllm import LLM, SamplingParams -from vllm.utils import FlexibleArgumentParser - -# A very long prompt, total number of tokens is about 15k. -LONG_PROMPT = ["You are an expert in large language models, aren't you?"] * 1000 -LONG_PROMPT = " ".join(LONG_PROMPT) - - -def main(args): - llm = LLM( - model=args.model, - enforce_eager=True, - enable_prefix_caching=True, - tensor_parallel_size=args.tensor_parallel_size, - ) - - sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len) - profiler = cProfile.Profile() - - print("------warm up------") - for i in range(3): - output = llm.generate(LONG_PROMPT, sampling_params) - print(output[0].outputs[0].text) - - print("------start generating------") - for i in range(3): - profiler.runctx( - "llm.generate(LONG_PROMPT, sampling_params)", globals(), locals() - ) - - # analyze the runtime of hashing function - stats = pstats.Stats(profiler) - stats.sort_stats("cumulative") - total_time = 0 - total_calls = 0 - for func in stats.stats: - if "hash_of_block" in func[2]: - total_time = stats.stats[func][3] - total_calls = stats.stats[func][0] - percentage = (total_time / stats.total_tt) * 100 - print( - f"Hashing took {total_time:.2f} seconds,{percentage:.2f}% of the total runtime." - ) - - -if __name__ == "__main__": - parser = FlexibleArgumentParser( - description="Benchmark the performance of hashing function in" - "automatic prefix caching." - ) - parser.add_argument("--model", type=str, default="lmsys/longchat-7b-16k") - parser.add_argument("--tensor-parallel-size", "-tp", type=int, default=1) - parser.add_argument("--output-len", type=int, default=10) - parser.add_argument( - "--enable-prefix-caching", action="store_true", help="enable prefix caching" - ) - args = parser.parse_args() - main(args) diff --git a/benchmarks/pyproject.toml b/benchmarks/pyproject.toml deleted file mode 100644 index 65b1e09a2..000000000 --- a/benchmarks/pyproject.toml +++ /dev/null @@ -1,49 +0,0 @@ -# This local pyproject file is part of the migration from yapf to ruff format. 
-# It uses the same core rules as the main pyproject.toml file, but with the -# following differences: -# - ruff line length is overridden to 88 -# - deprecated typing ignores (UP006, UP035) have been removed - -[tool.ruff] -line-length = 88 - -[tool.ruff.lint.per-file-ignores] -"vllm/third_party/**" = ["ALL"] -"vllm/version.py" = ["F401"] -"vllm/_version.py" = ["ALL"] - -[tool.ruff.lint] -select = [ - # pycodestyle - "E", - # Pyflakes - "F", - # pyupgrade - "UP", - # flake8-bugbear - "B", - # flake8-simplify - "SIM", - # isort - "I", - # flake8-logging-format - "G", -] -ignore = [ - # star imports - "F405", "F403", - # lambda expression assignment - "E731", - # Loop control variable not used within loop body - "B007", - # f-string format - "UP032", - # Can remove once 3.10+ is the minimum Python version - "UP007", -] - -[tool.ruff.lint.isort] -known-first-party = ["vllm"] - -[tool.ruff.format] -docstring-code-format = true \ No newline at end of file diff --git a/benchmarks/run_structured_output_benchmark.sh b/benchmarks/run_structured_output_benchmark.sh deleted file mode 100644 index b043ab83e..000000000 --- a/benchmarks/run_structured_output_benchmark.sh +++ /dev/null @@ -1,129 +0,0 @@ -#!/bin/bash - -# default values -MODEL=${MODEL:-"Qwen/Qwen2.5-7B-Instruct"} -BACKEND=${BACKEND:-"vllm"} -DATASET=${DATASET:-"xgrammar_bench"} -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -OUTPUT_DIR=${OUTPUT_DIR:-"$SCRIPT_DIR/structured_output_benchmark_results"} -PORT=${PORT:-8000} -STRUCTURED_OUTPUT_RATIO=${STRUCTURED_OUTPUT_RATIO:-1} -TOTAL_SECONDS=${TOTAL_SECONDS:-90} -MAX_NEW_TOKENS=${MAX_NEW_TOKENS:-300} -TOKENIZER_MODE=${TOKENIZER_MODE:-"auto"} - -usage() { - echo "Usage: $0 [options]" - echo "Options:" - echo " --model MODEL Model to benchmark (default: $MODEL)" - echo " --backend BACKEND Backend to use (default: $BACKEND)" - echo " --dataset DATASET Dataset to use (default: $DATASET)" - echo " --max-new-tokens N Maximum number of tokens to generate (default: $MAX_NEW_TOKENS)" - echo " --output-dir DIR Output directory for results (default: $OUTPUT_DIR)" - echo " --port PORT Port to use (default: $PORT)" - echo " --structured-output-ratio N Ratio of structured outputs (default: $STRUCTURED_OUTPUT_RATIO)" - echo " --tokenizer-mode MODE Tokenizer mode to use (default: $TOKENIZER_MODE)" - echo " --total-seconds N Total seconds to run the benchmark (default: $TOTAL_SECONDS)" - echo " -h, --help Show this help message and exit" - exit 0 -} - -# parse command line arguments -while [[ $# -gt 0 ]]; do - case $1 in - --model) - MODEL="$2" - shift 2 - ;; - --backend) - BACKEND="$2" - shift 2 - ;; - --dataset) - DATASET="$2" - shift 2 - ;; - --max-new-tokens) - MAX_NEW_TOKENS="$2" - shift 2 - ;; - --output-dir) - OUTPUT_DIR="$2" - shift 2 - ;; - --port) - PORT="$2" - shift 2 - ;; - --structured-output-ratio) - STRUCTURED_OUTPUT_RATIO="$2" - shift 2 - ;; - --tokenizer-mode) - TOKENIZER_MODE="$2" - shift 2 - ;; - --total-seconds) - TOTAL_SECONDS="$2" - shift 2 - ;; - -h|--help) - usage - ;; - *) - echo "Unknown argument: $1\n" - usage - ;; - esac -done - -# Create output directory if it doesn't exist -mkdir -p "$OUTPUT_DIR" - -# Define QPS values to test -QPS_VALUES=(25 20 15 10 5 1) - -# Common parameters -COMMON_PARAMS="--backend $BACKEND \ - --model $MODEL \ - --dataset $DATASET \ - --structured-output-ratio $STRUCTURED_OUTPUT_RATIO \ - --save-results \ - --result-dir $OUTPUT_DIR \ - --output-len $MAX_NEW_TOKENS \ - --port $PORT \ - --tokenizer-mode $TOKENIZER_MODE" - -echo "Starting 
structured output benchmark with model: $MODEL" -echo "Backend: $BACKEND" -echo "Dataset: $DATASET" -echo "Results will be saved to: $OUTPUT_DIR" -echo "----------------------------------------" - -# Run benchmarks with different QPS values -for qps in "${QPS_VALUES[@]}"; do - echo "Running benchmark with QPS: $qps" - - # Get git hash and branch for the filename - GIT_HASH=$(git rev-parse --short HEAD 2>/dev/null || echo "unknown") - GIT_BRANCH=$(git rev-parse --abbrev-ref HEAD 2>/dev/null || echo "unknown") - - # Construct filename for this run - FILENAME="${BACKEND}_${qps}qps_$(basename $MODEL)_${DATASET}_${GIT_HASH}.json" - - NUM_PROMPTS=$(echo "$TOTAL_SECONDS * $qps" | bc) - NUM_PROMPTS=${NUM_PROMPTS%.*} # Remove fractional part - echo "Running benchmark with $NUM_PROMPTS prompts" - - # Run the benchmark - python "$SCRIPT_DIR/benchmark_serving_structured_output.py" $COMMON_PARAMS \ - --request-rate $qps \ - --result-filename "$FILENAME" \ - --num-prompts $NUM_PROMPTS - - echo "Completed benchmark with QPS: $qps" - echo "----------------------------------------" -done - -echo "All benchmarks completed!" -echo "Results saved to: $OUTPUT_DIR" diff --git a/benchmarks/sonnet.txt b/benchmarks/sonnet.txt deleted file mode 100644 index 34c444e8c..000000000 --- a/benchmarks/sonnet.txt +++ /dev/null @@ -1,518 +0,0 @@ -FROM fairest creatures we desire increase, -That thereby beauty's rose might never die, -But as the riper should by time decease, -His tender heir might bear his memory: -But thou, contracted to thine own bright eyes, -Feed'st thy light'st flame with self-substantial fuel, -Making a famine where abundance lies, -Thyself thy foe, to thy sweet self too cruel. -Thou that art now the world's fresh ornament -And only herald to the gaudy spring, -Within thine own bud buriest thy content -And, tender churl, makest waste in niggarding. -Pity the world, or else this glutton be, -To eat the world's due, by the grave and thee. -When forty winters shall beseige thy brow, -And dig deep trenches in thy beauty's field, -Thy youth's proud livery, so gazed on now, -Will be a tatter'd weed, of small worth held: -Then being ask'd where all thy beauty lies, -Where all the treasure of thy lusty days, -To say, within thine own deep-sunken eyes, -Were an all-eating shame and thriftless praise. -How much more praise deserved thy beauty's use, -If thou couldst answer 'This fair child of mine -Shall sum my count and make my old excuse,' -Proving his beauty by succession thine! -This were to be new made when thou art old, -And see thy blood warm when thou feel'st it cold. -Look in thy glass, and tell the face thou viewest -Now is the time that face should form another; -Whose fresh repair if now thou not renewest, -Thou dost beguile the world, unbless some mother. -For where is she so fair whose unear'd womb -Disdains the tillage of thy husbandry? -Or who is he so fond will be the tomb -Of his self-love, to stop posterity? -Thou art thy mother's glass, and she in thee -Calls back the lovely April of her prime: -So thou through windows of thine age shall see -Despite of wrinkles this thy golden time. -But if thou live, remember'd not to be, -Die single, and thine image dies with thee. -Unthrifty loveliness, why dost thou spend -Upon thyself thy beauty's legacy? -Nature's bequest gives nothing but doth lend, -And being frank she lends to those are free. -Then, beauteous niggard, why dost thou abuse -The bounteous largess given thee to give? 
-Profitless usurer, why dost thou use -So great a sum of sums, yet canst not live? -For having traffic with thyself alone, -Thou of thyself thy sweet self dost deceive. -Then how, when nature calls thee to be gone, -What acceptable audit canst thou leave? -Thy unused beauty must be tomb'd with thee, -Which, used, lives th' executor to be. -Those hours, that with gentle work did frame -The lovely gaze where every eye doth dwell, -Will play the tyrants to the very same -And that unfair which fairly doth excel: -For never-resting time leads summer on -To hideous winter and confounds him there; -Sap cheque'd with frost and lusty leaves quite gone, -Beauty o'ersnow'd and bareness every where: -Then, were not summer's distillation left, -A liquid prisoner pent in walls of glass, -Beauty's effect with beauty were bereft, -Nor it nor no remembrance what it was: -But flowers distill'd though they with winter meet, -Leese but their show; their substance still lives sweet. -Then let not winter's ragged hand deface -In thee thy summer, ere thou be distill'd: -Make sweet some vial; treasure thou some place -With beauty's treasure, ere it be self-kill'd. -That use is not forbidden usury, -Which happies those that pay the willing loan; -That's for thyself to breed another thee, -Or ten times happier, be it ten for one; -Ten times thyself were happier than thou art, -If ten of thine ten times refigured thee: -Then what could death do, if thou shouldst depart, -Leaving thee living in posterity? -Be not self-will'd, for thou art much too fair -To be death's conquest and make worms thine heir. -Lo! in the orient when the gracious light -Lifts up his burning head, each under eye -Doth homage to his new-appearing sight, -Serving with looks his sacred majesty; -And having climb'd the steep-up heavenly hill, -Resembling strong youth in his middle age, -yet mortal looks adore his beauty still, -Attending on his golden pilgrimage; -But when from highmost pitch, with weary car, -Like feeble age, he reeleth from the day, -The eyes, 'fore duteous, now converted are -From his low tract and look another way: -So thou, thyself out-going in thy noon, -Unlook'd on diest, unless thou get a son. -Music to hear, why hear'st thou music sadly? -Sweets with sweets war not, joy delights in joy. -Why lovest thou that which thou receivest not gladly, -Or else receivest with pleasure thine annoy? -If the true concord of well-tuned sounds, -By unions married, do offend thine ear, -They do but sweetly chide thee, who confounds -In singleness the parts that thou shouldst bear. -Mark how one string, sweet husband to another, -Strikes each in each by mutual ordering, -Resembling sire and child and happy mother -Who all in one, one pleasing note do sing: -Whose speechless song, being many, seeming one, -Sings this to thee: 'thou single wilt prove none.' -Is it for fear to wet a widow's eye -That thou consumest thyself in single life? -Ah! if thou issueless shalt hap to die. -The world will wail thee, like a makeless wife; -The world will be thy widow and still weep -That thou no form of thee hast left behind, -When every private widow well may keep -By children's eyes her husband's shape in mind. -Look, what an unthrift in the world doth spend -Shifts but his place, for still the world enjoys it; -But beauty's waste hath in the world an end, -And kept unused, the user so destroys it. -No love toward others in that bosom sits -That on himself such murderous shame commits. -For shame! 
deny that thou bear'st love to any, -Who for thyself art so unprovident. -Grant, if thou wilt, thou art beloved of many, -But that thou none lovest is most evident; -For thou art so possess'd with murderous hate -That 'gainst thyself thou stick'st not to conspire. -Seeking that beauteous roof to ruinate -Which to repair should be thy chief desire. -O, change thy thought, that I may change my mind! -Shall hate be fairer lodged than gentle love? -Be, as thy presence is, gracious and kind, -Or to thyself at least kind-hearted prove: -Make thee another self, for love of me, -That beauty still may live in thine or thee. -As fast as thou shalt wane, so fast thou growest -In one of thine, from that which thou departest; -And that fresh blood which youngly thou bestowest -Thou mayst call thine when thou from youth convertest. -Herein lives wisdom, beauty and increase: -Without this, folly, age and cold decay: -If all were minded so, the times should cease -And threescore year would make the world away. -Let those whom Nature hath not made for store, -Harsh featureless and rude, barrenly perish: -Look, whom she best endow'd she gave the more; -Which bounteous gift thou shouldst in bounty cherish: -She carved thee for her seal, and meant thereby -Thou shouldst print more, not let that copy die. -When I do count the clock that tells the time, -And see the brave day sunk in hideous night; -When I behold the violet past prime, -And sable curls all silver'd o'er with white; -When lofty trees I see barren of leaves -Which erst from heat did canopy the herd, -And summer's green all girded up in sheaves -Borne on the bier with white and bristly beard, -Then of thy beauty do I question make, -That thou among the wastes of time must go, -Since sweets and beauties do themselves forsake -And die as fast as they see others grow; -And nothing 'gainst Time's scythe can make defence -Save breed, to brave him when he takes thee hence. -O, that you were yourself! but, love, you are -No longer yours than you yourself here live: -Against this coming end you should prepare, -And your sweet semblance to some other give. -So should that beauty which you hold in lease -Find no determination: then you were -Yourself again after yourself's decease, -When your sweet issue your sweet form should bear. -Who lets so fair a house fall to decay, -Which husbandry in honour might uphold -Against the stormy gusts of winter's day -And barren rage of death's eternal cold? -O, none but unthrifts! Dear my love, you know -You had a father: let your son say so. -Not from the stars do I my judgment pluck; -And yet methinks I have astronomy, -But not to tell of good or evil luck, -Of plagues, of dearths, or seasons' quality; -Nor can I fortune to brief minutes tell, -Pointing to each his thunder, rain and wind, -Or say with princes if it shall go well, -By oft predict that I in heaven find: -But from thine eyes my knowledge I derive, -And, constant stars, in them I read such art -As truth and beauty shall together thrive, -If from thyself to store thou wouldst convert; -Or else of thee this I prognosticate: -Thy end is truth's and beauty's doom and date. 
-When I consider every thing that grows -Holds in perfection but a little moment, -That this huge stage presenteth nought but shows -Whereon the stars in secret influence comment; -When I perceive that men as plants increase, -Cheered and cheque'd even by the self-same sky, -Vaunt in their youthful sap, at height decrease, -And wear their brave state out of memory; -Then the conceit of this inconstant stay -Sets you most rich in youth before my sight, -Where wasteful Time debateth with Decay, -To change your day of youth to sullied night; -And all in war with Time for love of you, -As he takes from you, I engraft you new. -But wherefore do not you a mightier way -Make war upon this bloody tyrant, Time? -And fortify yourself in your decay -With means more blessed than my barren rhyme? -Now stand you on the top of happy hours, -And many maiden gardens yet unset -With virtuous wish would bear your living flowers, -Much liker than your painted counterfeit: -So should the lines of life that life repair, -Which this, Time's pencil, or my pupil pen, -Neither in inward worth nor outward fair, -Can make you live yourself in eyes of men. -To give away yourself keeps yourself still, -And you must live, drawn by your own sweet skill. -Who will believe my verse in time to come, -If it were fill'd with your most high deserts? -Though yet, heaven knows, it is but as a tomb -Which hides your life and shows not half your parts. -If I could write the beauty of your eyes -And in fresh numbers number all your graces, -The age to come would say 'This poet lies: -Such heavenly touches ne'er touch'd earthly faces.' -So should my papers yellow'd with their age -Be scorn'd like old men of less truth than tongue, -And your true rights be term'd a poet's rage -And stretched metre of an antique song: -But were some child of yours alive that time, -You should live twice; in it and in my rhyme. -Shall I compare thee to a summer's day? -Thou art more lovely and more temperate: -Rough winds do shake the darling buds of May, -And summer's lease hath all too short a date: -Sometime too hot the eye of heaven shines, -And often is his gold complexion dimm'd; -And every fair from fair sometime declines, -By chance or nature's changing course untrimm'd; -But thy eternal summer shall not fade -Nor lose possession of that fair thou owest; -Nor shall Death brag thou wander'st in his shade, -When in eternal lines to time thou growest: -So long as men can breathe or eyes can see, -So long lives this and this gives life to thee. -Devouring Time, blunt thou the lion's paws, -And make the earth devour her own sweet brood; -Pluck the keen teeth from the fierce tiger's jaws, -And burn the long-lived phoenix in her blood; -Make glad and sorry seasons as thou fleets, -And do whate'er thou wilt, swift-footed Time, -To the wide world and all her fading sweets; -But I forbid thee one most heinous crime: -O, carve not with thy hours my love's fair brow, -Nor draw no lines there with thine antique pen; -Him in thy course untainted do allow -For beauty's pattern to succeeding men. -Yet, do thy worst, old Time: despite thy wrong, -My love shall in my verse ever live young. 
-A woman's face with Nature's own hand painted -Hast thou, the master-mistress of my passion; -A woman's gentle heart, but not acquainted -With shifting change, as is false women's fashion; -An eye more bright than theirs, less false in rolling, -Gilding the object whereupon it gazeth; -A man in hue, all 'hues' in his controlling, -Much steals men's eyes and women's souls amazeth. -And for a woman wert thou first created; -Till Nature, as she wrought thee, fell a-doting, -And by addition me of thee defeated, -By adding one thing to my purpose nothing. -But since she prick'd thee out for women's pleasure, -Mine be thy love and thy love's use their treasure. -So is it not with me as with that Muse -Stirr'd by a painted beauty to his verse, -Who heaven itself for ornament doth use -And every fair with his fair doth rehearse -Making a couplement of proud compare, -With sun and moon, with earth and sea's rich gems, -With April's first-born flowers, and all things rare -That heaven's air in this huge rondure hems. -O' let me, true in love, but truly write, -And then believe me, my love is as fair -As any mother's child, though not so bright -As those gold candles fix'd in heaven's air: -Let them say more than like of hearsay well; -I will not praise that purpose not to sell. -My glass shall not persuade me I am old, -So long as youth and thou are of one date; -But when in thee time's furrows I behold, -Then look I death my days should expiate. -For all that beauty that doth cover thee -Is but the seemly raiment of my heart, -Which in thy breast doth live, as thine in me: -How can I then be elder than thou art? -O, therefore, love, be of thyself so wary -As I, not for myself, but for thee will; -Bearing thy heart, which I will keep so chary -As tender nurse her babe from faring ill. -Presume not on thy heart when mine is slain; -Thou gavest me thine, not to give back again. -As an unperfect actor on the stage -Who with his fear is put besides his part, -Or some fierce thing replete with too much rage, -Whose strength's abundance weakens his own heart. -So I, for fear of trust, forget to say -The perfect ceremony of love's rite, -And in mine own love's strength seem to decay, -O'ercharged with burden of mine own love's might. -O, let my books be then the eloquence -And dumb presagers of my speaking breast, -Who plead for love and look for recompense -More than that tongue that more hath more express'd. -O, learn to read what silent love hath writ: -To hear with eyes belongs to love's fine wit. -Mine eye hath play'd the painter and hath stell'd -Thy beauty's form in table of my heart; -My body is the frame wherein 'tis held, -And perspective it is the painter's art. -For through the painter must you see his skill, -To find where your true image pictured lies; -Which in my bosom's shop is hanging still, -That hath his windows glazed with thine eyes. -Now see what good turns eyes for eyes have done: -Mine eyes have drawn thy shape, and thine for me -Are windows to my breast, where-through the sun -Delights to peep, to gaze therein on thee; -Yet eyes this cunning want to grace their art; -They draw but what they see, know not the heart. -Let those who are in favour with their stars -Of public honour and proud titles boast, -Whilst I, whom fortune of such triumph bars, -Unlook'd for joy in that I honour most. -Great princes' favourites their fair leaves spread -But as the marigold at the sun's eye, -And in themselves their pride lies buried, -For at a frown they in their glory die. 
-The painful warrior famoused for fight, -After a thousand victories once foil'd, -Is from the book of honour razed quite, -And all the rest forgot for which he toil'd: -Then happy I, that love and am beloved -Where I may not remove nor be removed. -Lord of my love, to whom in vassalage -Thy merit hath my duty strongly knit, -To thee I send this written embassage, -To witness duty, not to show my wit: -Duty so great, which wit so poor as mine -May make seem bare, in wanting words to show it, -But that I hope some good conceit of thine -In thy soul's thought, all naked, will bestow it; -Till whatsoever star that guides my moving -Points on me graciously with fair aspect -And puts apparel on my tatter'd loving, -To show me worthy of thy sweet respect: -Then may I dare to boast how I do love thee; -Till then not show my head where thou mayst prove me. -Weary with toil, I haste me to my bed, -The dear repose for limbs with travel tired; -But then begins a journey in my head, -To work my mind, when body's work's expired: -For then my thoughts, from far where I abide, -Intend a zealous pilgrimage to thee, -And keep my drooping eyelids open wide, -Looking on darkness which the blind do see -Save that my soul's imaginary sight -Presents thy shadow to my sightless view, -Which, like a jewel hung in ghastly night, -Makes black night beauteous and her old face new. -Lo! thus, by day my limbs, by night my mind, -For thee and for myself no quiet find. -How can I then return in happy plight, -That am debarr'd the benefit of rest? -When day's oppression is not eased by night, -But day by night, and night by day, oppress'd? -And each, though enemies to either's reign, -Do in consent shake hands to torture me; -The one by toil, the other to complain -How far I toil, still farther off from thee. -I tell the day, to please them thou art bright -And dost him grace when clouds do blot the heaven: -So flatter I the swart-complexion'd night, -When sparkling stars twire not thou gild'st the even. -But day doth daily draw my sorrows longer -And night doth nightly make grief's strength seem stronger. -When, in disgrace with fortune and men's eyes, -I all alone beweep my outcast state -And trouble deal heaven with my bootless cries -And look upon myself and curse my fate, -Wishing me like to one more rich in hope, -Featured like him, like him with friends possess'd, -Desiring this man's art and that man's scope, -With what I most enjoy contented least; -Yet in these thoughts myself almost despising, -Haply I think on thee, and then my state, -Like to the lark at break of day arising -From sullen earth, sings hymns at heaven's gate; -For thy sweet love remember'd such wealth brings -That then I scorn to change my state with kings. -When to the sessions of sweet silent thought -I summon up remembrance of things past, -I sigh the lack of many a thing I sought, -And with old woes new wail my dear time's waste: -Then can I drown an eye, unused to flow, -For precious friends hid in death's dateless night, -And weep afresh love's long since cancell'd woe, -And moan the expense of many a vanish'd sight: -Then can I grieve at grievances foregone, -And heavily from woe to woe tell o'er -The sad account of fore-bemoaned moan, -Which I new pay as if not paid before. -But if the while I think on thee, dear friend, -All losses are restored and sorrows end. 
-Thy bosom is endeared with all hearts, -Which I by lacking have supposed dead, -And there reigns love and all love's loving parts, -And all those friends which I thought buried. -How many a holy and obsequious tear -Hath dear religious love stol'n from mine eye -As interest of the dead, which now appear -But things removed that hidden in thee lie! -Thou art the grave where buried love doth live, -Hung with the trophies of my lovers gone, -Who all their parts of me to thee did give; -That due of many now is thine alone: -Their images I loved I view in thee, -And thou, all they, hast all the all of me. -If thou survive my well-contented day, -When that churl Death my bones with dust shall cover, -And shalt by fortune once more re-survey -These poor rude lines of thy deceased lover, -Compare them with the bettering of the time, -And though they be outstripp'd by every pen, -Reserve them for my love, not for their rhyme, -Exceeded by the height of happier men. -O, then vouchsafe me but this loving thought: -'Had my friend's Muse grown with this growing age, -A dearer birth than this his love had brought, -To march in ranks of better equipage: -But since he died and poets better prove, -Theirs for their style I'll read, his for his love.' -Full many a glorious morning have I seen -Flatter the mountain-tops with sovereign eye, -Kissing with golden face the meadows green, -Gilding pale streams with heavenly alchemy; -Anon permit the basest clouds to ride -With ugly rack on his celestial face, -And from the forlorn world his visage hide, -Stealing unseen to west with this disgrace: -Even so my sun one early morn did shine -With all triumphant splendor on my brow; -But out, alack! he was but one hour mine; -The region cloud hath mask'd him from me now. -Yet him for this my love no whit disdaineth; -Suns of the world may stain when heaven's sun staineth. -Why didst thou promise such a beauteous day, -And make me travel forth without my cloak, -To let base clouds o'ertake me in my way, -Hiding thy bravery in their rotten smoke? -'Tis not enough that through the cloud thou break, -To dry the rain on my storm-beaten face, -For no man well of such a salve can speak -That heals the wound and cures not the disgrace: -Nor can thy shame give physic to my grief; -Though thou repent, yet I have still the loss: -The offender's sorrow lends but weak relief -To him that bears the strong offence's cross. -Ah! but those tears are pearl which thy love sheds, -And they are rich and ransom all ill deeds. -No more be grieved at that which thou hast done: -Roses have thorns, and silver fountains mud; -Clouds and eclipses stain both moon and sun, -And loathsome canker lives in sweetest bud. -All men make faults, and even I in this, -Authorizing thy trespass with compare, -Myself corrupting, salving thy amiss, -Excusing thy sins more than thy sins are; -For to thy sensual fault I bring in sense-- -Thy adverse party is thy advocate-- -And 'gainst myself a lawful plea commence: -Such civil war is in my love and hate -That I an accessary needs must be -To that sweet thief which sourly robs from me. -Let me confess that we two must be twain, -Although our undivided loves are one: -So shall those blots that do with me remain -Without thy help by me be borne alone. -In our two loves there is but one respect, -Though in our lives a separable spite, -Which though it alter not love's sole effect, -Yet doth it steal sweet hours from love's delight. 
-I may not evermore acknowledge thee, -Lest my bewailed guilt should do thee shame, -Nor thou with public kindness honour me, -Unless thou take that honour from thy name: -But do not so; I love thee in such sort -As, thou being mine, mine is thy good report. -As a decrepit father takes delight -To see his active child do deeds of youth, -So I, made lame by fortune's dearest spite, -Take all my comfort of thy worth and truth. -For whether beauty, birth, or wealth, or wit, -Or any of these all, or all, or more, -Entitled in thy parts do crowned sit, -I make my love engrafted to this store: -So then I am not lame, poor, nor despised, -Whilst that this shadow doth such substance give -That I in thy abundance am sufficed -And by a part of all thy glory live. -Look, what is best, that best I wish in thee: -This wish I have; then ten times happy me! \ No newline at end of file diff --git a/vllm_omni/benchmarks/__init__.py b/vllm_omni/benchmarks/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/vllm_omni/benchmarks/datasets.py b/vllm_omni/benchmarks/datasets.py new file mode 100644 index 000000000..c019dd35a --- /dev/null +++ b/vllm_omni/benchmarks/datasets.py @@ -0,0 +1,2814 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +This module defines a framework for sampling benchmark requests from various +datasets. Each dataset subclass of BenchmarkDataset must implement sample +generation. Supported dataset types include: + - ShareGPT + - Random (synthetic) + - Sonnet + - BurstGPT + - HuggingFace + - VisionArena +""" +import argparse +import ast +import base64 +import io +import json +import logging +import math +import random +import torchaudio +import torch +import cv2 +import tempfile +import os +from abc import ABC, abstractmethod +from collections.abc import Iterator, Mapping +from contextlib import suppress +from copy import deepcopy +from dataclasses import dataclass +from functools import cache +from io import BytesIO +from typing import Any, Callable, Optional, Union, cast, Dict + +import numpy as np +from PIL import Image +from transformers import PreTrainedTokenizerBase +from typing_extensions import deprecated + +from vllm.lora.request import LoRARequest +from vllm.lora.utils import get_adapter_absolute_path +from vllm.multimodal import MultiModalDataDict +from vllm.multimodal.image import convert_image_mode +from vllm.transformers_utils.tokenizer import AnyTokenizer +from vllm.utils import PlaceholderModule + +try: + from datasets import load_dataset +except ImportError: + datasets = PlaceholderModule("datasets") + load_dataset = datasets.placeholder_attr("load_dataset") + +try: + import pandas as pd +except ImportError: + pd = PlaceholderModule("pandas") + +try: + import librosa +except ImportError: + librosa = PlaceholderModule("librosa") + +try: + from vllm.utils import FlexibleArgumentParser +except ImportError: + from argparse import ArgumentParser as FlexibleArgumentParser + +logger = logging.getLogger(__name__) + +# ----------------------------------------------------------------------------- +# Data Classes +# ----------------------------------------------------------------------------- + + +@dataclass +class SampleRequest: + """ + Represents a single inference request for benchmarking. 
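+
+    Attributes (brief descriptions of the fields declared below):
+        prompt: The text prompt, or a list of prompts for a batched request.
+        prompt_len: Number of tokens in the prompt.
+        expected_output_len: Target number of output tokens.
+        multi_modal_data: Optional multimodal payload(s) attached to the request.
+        lora_request: Optional LoRA adapter to apply to this request.
+        request_id: Optional identifier used to track the request.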
+ """ + + prompt: Union[str, list[str]] + prompt_len: int + expected_output_len: int + multi_modal_data: Optional[ + Union[MultiModalDataDict, dict, list[dict]] + ] = None + lora_request: Optional[LoRARequest] = None + request_id: Optional[str] = None + + +# ----------------------------------------------------------------------------- +# Benchmark Dataset Base Class +# ----------------------------------------------------------------------------- + + +class BenchmarkDataset(ABC): + DEFAULT_SEED = 0 + IS_MULTIMODAL = False + + def __init__( + self, + dataset_path: Optional[str] = None, + random_seed: int = DEFAULT_SEED, + ) -> None: + """ + Initialize the BenchmarkDataset with an optional dataset path and random + seed. + + Args: + dataset_path (Optional[str]): Path to the dataset. If None, it + indicates that a default or random dataset might be used. + random_seed (int): Seed value for reproducible shuffling or + sampling. Defaults to DEFAULT_SEED. + """ + self.dataset_path = dataset_path + # Set the random seed, ensuring that a None value is replaced with the + # default seed. + self.random_seed = (random_seed + if random_seed is not None else self.DEFAULT_SEED) + self.data = None + + def apply_multimodal_chat_transformation( + self, + prompt: str, + mm_content: Optional[ + Union[MultiModalDataDict, dict, list[dict]] + ] = None) -> list[dict]: + """ + Transform a prompt and optional multimodal content into a chat format. + This method is used for chat models that expect a specific conversation + format. + """ + content = [{"text": prompt, "type": "text"}] + if mm_content is not None: + if isinstance(mm_content, list): + content.extend(cast(list[dict[str, Any]], mm_content)) + elif isinstance(mm_content, dict): + content.append(mm_content) + else: + raise TypeError( + "Could not process multimodal content of type: " + + f"{type(mm_content)}" + ) + return [{"role": "user", "content": content}] + + def load_data(self) -> None: + """ + Load data from the dataset path into self.data. + + This method must be overridden by subclasses since the method to load + data will vary depending on the dataset format and source. + + Raises: + NotImplementedError: If a subclass does not implement this method. + """ + # TODO (jenniferzhao): add support for downloading data + raise NotImplementedError( + "load_data must be implemented in subclasses.") + + def get_random_lora_request( + self, + max_loras: Optional[int] = None, + lora_path: Optional[str] = None, + ) -> Optional[LoRARequest]: + """ + Optionally select a random LoRA request. + + This method is used when LoRA parameters are provided. It randomly + selects a LoRA based on max_loras. + + Args: + max_loras (Optional[int]): The maximum number of LoRAs available. + If `None`, LoRA is not used. + lora_path (Optional[str]): Path to the LoRA parameters on disk. + If `None`, LoRA is not used. + + Returns: + A new [`LoRARequest`][vllm.lora.request.LoRARequest] + (or `None` if not applicable). + """ + if max_loras is None or lora_path is None: + return None + + # Generate a random LoRA ID in the range [1, max_loras]. + lora_id = random.randint(1, max_loras) + lora_request = LoRARequest( + lora_name=str(lora_id), + lora_int_id=lora_id, + lora_path=lora_path_on_disk(lora_path), + ) + return lora_request + + @abstractmethod + def sample(self, tokenizer: PreTrainedTokenizerBase, + num_requests: int, + request_id_prefix: str = "", + no_oversample: bool = False) -> list[SampleRequest]: + """ + Abstract method to generate sample requests from the dataset. 
+ + Subclasses must override this method to implement dataset-specific logic + for generating a list of SampleRequest objects. + + Args: + tokenizer (PreTrainedTokenizerBase): The tokenizer to be used + for processing the dataset's text. + num_requests (int): The number of sample requests to generate. + request_id_prefix (str): The prefix of request_id. + + Returns: + list[SampleRequest]: A list of sample requests generated from the + dataset. + """ + raise NotImplementedError("sample must be implemented in subclasses.") + + def maybe_oversample_requests( + self, + requests: list[SampleRequest], + num_requests: int, + request_id_prefix: str = "", + no_oversample: bool = False, + ) -> None: + """ + Oversamples the list of requests if its size is less than the desired + number. + + Args: + requests (List[SampleRequest]): The current list of sampled + requests. + num_requests (int): The target number of requests. + request_id_prefix (str): The prefix applied to generated request + identifiers. + + """ + if no_oversample: + logger.info("Skipping oversampling. " \ + "Total samples: %d.", len(requests)) + return + + if len(requests) < num_requests: + random.seed(self.random_seed) + additional = deepcopy( + random.choices(requests, k=num_requests - len(requests)) + ) + for i in range(len(additional)): + req = additional[i] + req.request_id = request_id_prefix + str(len(requests) + i) + requests.extend(additional) + logger.info("Oversampled requests to reach %d total samples.", + num_requests) + + +# ----------------------------------------------------------------------------- +# Utility Functions and Global Caches +# ----------------------------------------------------------------------------- + + +def is_valid_sequence( + prompt_len: int, + output_len: int, + min_len: int = 4, + max_prompt_len: int = 1024, + max_total_len: int = 2048, + skip_min_output_len_check: bool = False, +) -> bool: + """ + Validate a sequence based on prompt and output lengths. + + Default pruning criteria are copied from the original `sample_hf_requests` + and `sample_sharegpt_requests` functions in benchmark_serving.py, as well as + from `sample_requests` in benchmark_throughput.py. + """ + # Check for invalid conditions + prompt_too_short = prompt_len < min_len + output_too_short = (not skip_min_output_len_check) and (output_len + < min_len) + prompt_too_long = prompt_len > max_prompt_len + combined_too_long = (prompt_len + output_len) > max_total_len + + # Return True if none of the invalid conditions are met + return not (prompt_too_short or output_too_short or prompt_too_long + or combined_too_long) + + +@cache +def lora_path_on_disk(lora_path: str) -> str: + return get_adapter_absolute_path(lora_path) + + +# Global cache for LoRA tokenizers. +lora_tokenizer_cache: dict[int, AnyTokenizer] = {} + + +def process_image(image: Any) -> Mapping[str, Any]: + """ + Process a single image input and return a multimedia content dictionary. + + Supports the following input types: + + 1. Dictionary with raw image bytes: - Expects a dict with a 'bytes' key + containing raw image data. - Loads the bytes as a PIL.Image.Image. + + 2. PIL.Image.Image input: - Converts the image to RGB. - Saves the image as + a JPEG in memory. - Encodes the JPEG data as a base64 string. - Returns + a dictionary with the image as a base64 data URL. + + 3. String input: - Treats the string as a URL or local file path. - + Prepends "file://" if the string doesn't start with "http://" or + "file://". - Returns a dictionary with the image URL. 
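+
+    Illustrative example (hypothetical local path; the return shape follows
+    the string-input branch below):
+
+        process_image("/tmp/example.jpg")
+        # -> {"type": "image_url",
+        #     "image_url": {"url": "file:///tmp/example.jpg"}}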
+ + Raises: + ValueError: If the input is not a supported type. + """ + if isinstance(image, dict) and 'bytes' in image: + image = Image.open(BytesIO(image['bytes'])) + if isinstance(image, Image.Image): + image = convert_image_mode(image, "RGB") + with io.BytesIO() as image_data: + image.save(image_data, format="JPEG") + image_base64 = base64.b64encode( + image_data.getvalue()).decode("utf-8") + return { + "type": "image_url", + "image_url": { + "url": f"data:image/jpeg;base64,{image_base64}" + }, + } + + if isinstance(image, str): + image_url = (image if image.startswith( + ("http://", "https://", "file://")) else f"file://{image}") + return {"type": "image_url", "image_url": {"url": image_url}} + + raise ValueError(f"Invalid image input {image}. Must be a PIL.Image.Image" + " or str or dictionary with raw image bytes.") + + +def process_video(video: Any) -> Mapping[str, Any]: + """ + Process a single video input and return a multimedia content dictionary. + + Supports the following input types: + + 1. Dictionary with raw video bytes: - Expects a dict with a 'bytes' key + containing raw video data. + + 2. String input: - Treats the string as a URL or local file path. - + Prepends "file://" if the string doesn't start with "http://" or + "file://". - Returns a dictionary with the image URL. + + Raises: + ValueError: If the input is not a supported type. + """ + if isinstance(video, dict) and 'bytes' in video: + video_bytes = video['bytes'] + video_base64 = base64.b64encode(video_bytes).decode("utf-8") + return { + "type": "video_url", + "video_url": { + "url": f"data:video/mp4;base64,{video_base64}" + }, + } + + if isinstance(video, str): + video_url = (video if video.startswith( + ("http://", "https://", "file://")) else f"file://{video}") + return {"type": "video_url", "video_url": {"url": video_url}} + + raise ValueError( + f"Invalid video input {video}. Must be a string of local path/remote url, or a dictionary with raw video bytes in the form of `{{'bytes': raw_video_bytes}}`." # noqa: E501 + ) + + +def process_audio(audio: Any) -> Mapping[str, Any]: + """ + Process a single audio input and return a multimedia content dictionary. + + Supports the following input types: + + 1. Dictionary with raw audio bytes: - Expects a dict with a 'bytes' key + containing raw audio data. + + 2. String input: - Treats the string as a URL or local file path. - + Prepends "file://" if the string doesn't start with "http://" or + "file://". - Returns a dictionary with the audio URL. + + Raises: + ValueError: If the input is not a supported type. + """ + if isinstance(audio, dict) and 'bytes' in audio: + audio_bytes = audio['bytes'] + audio_base64 = base64.b64encode(audio_bytes).decode('utf-8') + return { + "type": "audio_url", + "audio_url": { + "url": f"data:audio/mpeg;base64,{audio_base64}" + }, + } + if isinstance(audio, str): + audio_url = (audio if audio.startswith( + ("http://", "https://", "file://")) else f"file://{audio}") + return {"type": "audio_url", "audio_url": {"url": audio_url}} + + raise ValueError(f"Invalid audio input {audio}. Must be a string of local path/remote url, or a dictionary with raw audio bytes in the form of `{{'bytes': raw_audio_bytes}}`." + ) +# ----------------------------------------------------------------------------- +# Random Dataset Implementation (Synthetic Data) +# ----------------------------------------------------------------------------- + + +class RandomDataset(BenchmarkDataset): + """ + Synthetic text-only dataset for serving/throughput benchmarks. 
+ + Strategy: + - Sample input/output token lengths per request from integer-uniform ranges + around configured means (controlled by range_ratio). + - Prepend a fixed random prefix of length prefix_len. + - Generate the remaining tokens as a reproducible sequence: + (offset + index + arange(input_len)) % vocab_size. + - Decode then re-encode/truncate to ensure prompt token counts match. + - Uses numpy.default_rng seeded with random_seed for reproducible sampling. + """ + # Default values copied from benchmark_serving.py for the random dataset. + DEFAULT_PREFIX_LEN = 0 + DEFAULT_RANGE_RATIO = 0.0 + DEFAULT_INPUT_LEN = 1024 + DEFAULT_OUTPUT_LEN = 128 + + def __init__(self, **kwargs) -> None: + super().__init__(**kwargs) + # Use numpy's default_rng for deterministic sampling + # Do not use random.seed() or np.random.seed() elsewhere in this class. + # This ensures that the RNG is isolated from global RNG state. + self._rng = np.random.default_rng(self.random_seed) + + + def sample( + self, + tokenizer: PreTrainedTokenizerBase, + num_requests: int, + request_id_prefix: str = "", + no_oversample: bool = False, + prefix_len: int = DEFAULT_PREFIX_LEN, + range_ratio: float = DEFAULT_RANGE_RATIO, + input_len: int = DEFAULT_INPUT_LEN, + output_len: int = DEFAULT_OUTPUT_LEN, + batchsize: int = 1, + **kwargs, + ) -> list[SampleRequest]: + + input_lens, output_lens, offsets = self.get_sampling_params( + num_requests, range_ratio, input_len, output_len, tokenizer + ) + + # Generate prefix once + prefix_token_ids = self.get_prefix(tokenizer, prefix_len) + vocab_size = tokenizer.vocab_size + + requests = [] + for i in range(num_requests): + prompt, total_input_len = self.generate_token_sequence( + tokenizer=tokenizer, + prefix_token_ids=prefix_token_ids, + prefix_len=prefix_len, + vocab_size=vocab_size, + input_len=int(input_lens[i]), + offset=int(offsets[i]), + index=i, + ) + requests.append( + SampleRequest( + prompt=prompt, + prompt_len=total_input_len, + expected_output_len=int(output_lens[i]), + request_id=request_id_prefix + str(i), + ) + ) + # only used for embeddings benchmark. + if batchsize > 1: + batch_requests = [] + # Create batched requests + for i in range(0, num_requests, batchsize): + batch = requests[i : i + batchsize] + batch_requests.append( + SampleRequest( + prompt=[req.prompt for req in batch], + prompt_len=sum(req.prompt_len for req in batch), + expected_output_len=0, + request_id=request_id_prefix + str(i // batchsize), + ) + ) + requests = batch_requests + return requests + + def get_prefix( + self, tokenizer: PreTrainedTokenizerBase, prefix_len: int + ) -> list[int]: + """ + Get the prefix for the dataset. + """ + return ( + self._rng.integers( + 0, tokenizer.vocab_size, size=prefix_len).tolist() + if prefix_len > 0 + else [] + ) + + def get_sampling_params( + self, + num_requests: int, + range_ratio: float, + input_len: int, + output_len: int, + tokenizer: PreTrainedTokenizerBase, + ) -> tuple[np.ndarray, np.ndarray, np.ndarray]: + """ + Get the sampling parameters for the dataset. 
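+
+        Returns (per the implementation below) a tuple of three NumPy integer
+        arrays of length ``num_requests``: sampled input lengths, sampled
+        output lengths, and random vocabulary offsets used when building each
+        prompt's token sequence.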
+ """ + # Enforce range_ratio < 1 + if not (0.0 <= range_ratio < 1.0): + raise ValueError("range_ratio must be in [0, 1).") + num_special_tokens = int(tokenizer.num_special_tokens_to_add()) + real_input_len = max(0, int(input_len) - num_special_tokens) + # Bounds use floor for low and ceil for high + input_low = math.floor(real_input_len * (1 - range_ratio)) + input_high = math.ceil(real_input_len * (1 + range_ratio)) + output_low = math.floor(output_len * (1 - range_ratio)) + output_high = math.ceil(output_len * (1 + range_ratio)) + # Ensure the lower bound for output length is at least 1 to + # prevent sampling 0 tokens. + output_low = max(output_low, 1) + + if input_low > input_high: + raise ValueError( + "Invalid input sampling interval: " + f"low={input_low} > high={input_high}" + ) + if output_low > output_high: + raise ValueError( + "Invalid output sampling interval: " + f"low={output_low} > high={output_high}" + ) + + logger.info( + "Sampling input_len from [%s, %s] and output_len from [%s, %s]", + input_low, + input_high, + output_low, + output_high, + ) + + input_lens = self._rng.integers(input_low, input_high + 1, + size=num_requests) + output_lens = self._rng.integers(output_low, output_high + 1, + size=num_requests) + offsets = self._rng.integers(0, tokenizer.vocab_size, + size=num_requests) + return input_lens, output_lens, offsets + + def generate_token_sequence( + self, + *, + tokenizer: PreTrainedTokenizerBase, + prefix_token_ids: list[int], + prefix_len: int, + vocab_size: int, + input_len: int, + offset: int, + index: int, + ) -> tuple[str, int]: + """ + Returns (prompt, total_input_len). + + NOTE: After decoding the prompt we have to encode and decode it again. + This is done because in some cases N consecutive tokens + give a string tokenized into != N number of tokens. + For example for GPT2Tokenizer: + [6880, 6881] -> ['Ġcalls', 'here'] -> + [1650, 939, 486] -> ['Ġcall', 'sh', 'ere'] + To avoid uncontrolled change of the prompt length, + the encoded sequence is truncated before being decoded again. + """ + # Build the inner sequence by sampling sequentially from the vocab + inner_seq = ((offset + index + np.arange(input_len)) + % vocab_size).tolist() + token_sequence = prefix_token_ids + inner_seq + + # Decode, then re-encode and truncate to preserve token count invariants + prompt = tokenizer.decode(token_sequence) + total_input_len = prefix_len + int(input_len) + + re_encoded_sequence = tokenizer.encode( + prompt, add_special_tokens=False)[:total_input_len] + prompt = tokenizer.decode(re_encoded_sequence) + total_input_len = len(re_encoded_sequence) + + return prompt, total_input_len + + +# ----------------------------------------------------------------------------- +# MultiModalDataset Implementation +# ----------------------------------------------------------------------------- + +class RandomMultiModalDataset(RandomDataset): + """ + Synthetic multimodal dataset (text + images) that extends RandomDataset. + + Status: + - Images: supported via synthetic RGB data. + - Video: not yet supported (TODO: implement video generation method). + - Audio: not yet supported. + + Sampling overview: + 1) Number of items per request is sampled uniformly from the integer range + [floor(n·(1−r)), ceil(n·(1+r))], where n is the base count and r is + `num_mm_items_range_ratio` in [0, 1]. r=0 keeps it fixed; r=1 allows 0. + The maximum is further clamped to the sum of per-modality limits. 
+ 2) Each item’s modality and shape is sampled from `bucket_config`, a dict + mapping (height, width, num_frames) → probability. We treat + `num_frames`=1 as image and and `num_frames` > 1 as video. + Entries with zero probability are removed and the rest are renormalized + to sum to 1. + 3) Per-modality hard caps are enforced via `limit_mm_per_prompt`. + When a modality reaches its cap, all of its buckets are excluded and the + remaining probabilities are renormalized. + + Example bucket configuration: + {(256, 256, 1): 0.5, (720, 1280, 1): 0.4, (720, 1280, 16): 0.1} + - Two image buckets (`num_frames`=1) and one video bucket + (`num_frames`=16). + OBS.: Only image sampling is supported for now. + """ + + IS_MULTIMODAL = True + # NOTE: video sampling is WIP. Setting it to 0. + DEFAULT_LIMIT_MM_PER_PROMPT = {"image": 255, "video": 0} + + DEFAULT_BASE_ITEMS_PER_REQUEST = 1 + DEFAULT_NUM_MM_ITEMS_RANGE_RATIO = 0.0 + DEFAULT_MM_ITEM_BUCKET_CONFIG = { + (256, 256, 1): 0.5, + (720, 1280, 1): 0.5, + (720, 1280, 16): 0.0, + } + DEFAULT_ENABLE_MULTIMODAL_CHAT = False + + def __init__(self, **kwargs) -> None: + super().__init__(**kwargs) + + + def generate_synthetic_image(self, width: int, height: int) -> Image.Image: + """Generate synthetic PIL image with random RGB values. + + NOTE: iid pixel sampling results in worst-case compression + (good for stressing I/O), but very unlike real photos. + We could consider a “low-freq” mode (e.g., noise blur) + to emulate network realism instead of max stress. + """ + random_pixels = self._rng.integers( + 0, + 256, + (height, width, 3), + dtype=np.uint8, + ) + return Image.fromarray(random_pixels) + + def generate_synthetic_video(self, width: int, + height: int, + num_frames: int) -> Any: + """Generate synthetic video with random values. + """ + video_data = self._rng.integers( + 0, 256, + (num_frames, height, width, 3), + dtype=np.uint8, + ) + video_tensor = torch.from_numpy(video_data) + with tempfile.NamedTemporaryFile(suffix=f".mp4", delete=False) as tmp: + temp_path = tmp.name + frames, height, width, channels = video_tensor.shape + fourcc = cv2.VideoWriter_fourcc(*'mp4v') + out = cv2.VideoWriter(temp_path, fourcc, 30, (width, height)) + + for i in range(frames): + frame = video_tensor[i].numpy() + frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR) + out.write(frame) + out.release() + + with open(temp_path, 'rb') as f: + video_bytes = f.read() + + os.unlink(temp_path) + + return { + 'bytes': video_bytes, + } + + + def generate_synthetic_audio( + self, + duration: int, # seconds + num_channels: int #1:Mono,2:Stereo 5:5.1 surround sound + ) -> Dict[str, Any]: + """Generate synthetic audio with random values. + Default use 48000Hz. 
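+
+        The samples are uniform random noise encoded as MP3 via torchaudio,
+        and the result is returned as ``{'bytes': raw_mp3_bytes}``, the same
+        dict-with-raw-bytes form accepted by ``process_audio`` above.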
+ """ + sample_rate = 48000 + num_samples = int(sample_rate * duration) + audio_data = self._rng.uniform( + -0.5, 0.5, + (num_samples, num_channels) + ) + audio_data = np.clip(audio_data, -1.0, 1.0) + audio_tensor = torch.FloatTensor(audio_data.T) + buffer = io.BytesIO() + torchaudio.save( + buffer, + audio_tensor, + sample_rate, + format="mp3" + ) + buffer.seek(0) + audio_bytes = buffer.read() + return { + 'bytes': audio_bytes, + } + + def map_config_to_modality(self, config: tuple[int, int, int]) -> str: + """Map the configuration to the modality.""" + if config[0] == 0: + return "audio" + elif config[-1] == 1: + return "image" + elif config[-1] > 1: + return "video" + else: + raise ValueError(f"Invalid multimodal item configuration: {config}") + + def normalize_bucket_config(self, bucket_config: dict[tuple[int, int, int], + float]) -> dict[tuple[int, int, int], float]: + """ + Remove zero probability entries + and normalize the bucket config to sum to 1. + """ + # Raise error if value is negative + if any(v < 0 for v in bucket_config.values()): + raise ValueError("Bucket config values must be non-negative.") + # Remove zero probability entries + bucket_config = {k: v for k, v in bucket_config.items() if v > 0} + # if bucket config is empty, raise error + if not bucket_config: + raise ValueError("Got invalid bucket config. " + "Bucket config values must be non-zero.") + # Normalize the remaining bucket config to sum to 1 + total = sum(bucket_config.values()) + return {k: v / total for k, v in bucket_config.items()} + + + def generate_mm_item(self, + mm_item_config: tuple[int, int, int], + ) -> Mapping[str, Any]: + """ + Create synthetic images and videos and + apply process_image/process_video respectively. + This follows the OpenAI API chat completions + https://github.com/openai/openai-python + """ + + if self.map_config_to_modality(mm_item_config) == "image": + return process_image(self.generate_synthetic_image( + mm_item_config[1], + mm_item_config[0])) + elif self.map_config_to_modality(mm_item_config) == "video": + return process_video(self.generate_synthetic_video( + mm_item_config[1], + mm_item_config[0], + mm_item_config[2])) + elif self.map_config_to_modality(mm_item_config) == "audio": + return process_audio(self.generate_synthetic_audio( + mm_item_config[1], + mm_item_config[2])) + else: + raise ValueError(f"Invalid multimodal item configuration: " + f"{mm_item_config}") + + + def get_mm_item_sampling_params( + self, + base_items_per_request: int, + num_mm_items_range_ratio: float, + limit_mm_per_prompt: dict[str, int], + bucket_config: dict[tuple[int, int, int], float], + ) -> tuple[int, int, dict[str, int], dict[tuple[int, int, int], float]]: + """ + Get the sampling parameters for the multimodal items. 
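+
+        Returns (per the implementation below) the tuple
+        ``(min_num_mm_items, max_num_mm_items, limit_mm_per_prompt,
+        bucket_config)``, with the bucket probabilities normalized to sum
+        to 1 and the maximum item count clamped to the sum of the
+        per-modality limits.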
+ """ + # Enforce num_mm_items_range_ratio <= 1 + if not (0.0 <= num_mm_items_range_ratio <= 1.0): + raise ValueError("num_mm_items_range_ratio must be in [0, 1].") + + # Ensure modalities to sample are in limit_mm_per_prompt + for k, v in bucket_config.items(): + # get modality from bucket config + modality = self.map_config_to_modality(k) + if modality not in limit_mm_per_prompt: + raise ValueError(f"Modality {modality} is not in " + f"limit_mm_per_prompt: " + f"{limit_mm_per_prompt.keys()}") + + # Remove zero probability entries + # and normalize bucket config to sum to 1 + bucket_config = self.normalize_bucket_config(bucket_config) + logger.info( + "Normalized bucket config: %s", bucket_config, + ) + # Only consider limit per prompt for modalities in bucket config + allowed_modalities = {self.map_config_to_modality(cfg) + for cfg in bucket_config} + limit_mm_per_prompt = { + k: v for k, v in limit_mm_per_prompt.items() + if k in allowed_modalities} + if not limit_mm_per_prompt: + raise ValueError("No valid limits for modalities present in " + "bucket_config.") + + logger.info( + "Updated mm-limit-per-prompt: %s", limit_mm_per_prompt, + ) + + # Get max and min num mm items and ensure + # it is at most the sum of limit_mm_per_prompt for all modalities + max_num_mm_items = min( + sum(limit_mm_per_prompt.values()), + math.ceil(base_items_per_request * (1 + num_mm_items_range_ratio)) + ) + # Ensure min num mm items is at least 0 + min_num_mm_items = max( + 0, + math.floor(base_items_per_request * (1 - num_mm_items_range_ratio)) + ) + # Raise error if min num mm items is greater than max num mm items + if min_num_mm_items > max_num_mm_items: + raise ValueError(f"Min num mm items is greater than max mm items: " + f"{min_num_mm_items} > {max_num_mm_items}") + + logger.info( + "Sampling number of multimodal items from [%s, %s]", + min_num_mm_items, max_num_mm_items, + ) + + return ( + min_num_mm_items, + max_num_mm_items, + limit_mm_per_prompt, + bucket_config, + ) + + def get_mm_item_iterator( + self, + min_num_mm_items: int, + max_num_mm_items: int, + bucket_config: dict[tuple[int, int, int], float], + limit_mm_per_prompt: dict[str, int], + ) -> Iterator[tuple[int,int, int]]: + """ + Iterator over the multimodal items for each request + whose size is between min_num_mm_items and max_num_mm_items. + + Loop over the bucket config and sample a multimodal item. + Loop until the number of multimodal items sampled is equal to + request_num_mm_items or limit of multimodal items per prompt + for all modalities is reached. + + Note: + - This function operates on a per-request shallow copy of + `bucket_config` (tuple->float). The original dict passed to + `sample` is not mutated. If this ever changes, a test + is implemented and will fail. 
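+
+        Illustrative sketch (hypothetical arguments; the iterator yields
+        ``(height, width, num_frames)`` bucket configurations as sampled in
+        the implementation below):
+
+            items = list(self.get_mm_item_iterator(
+                min_num_mm_items=1,
+                max_num_mm_items=2,
+                bucket_config={(256, 256, 1): 1.0},
+                limit_mm_per_prompt={"image": 2},
+            ))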
+ """ + # Get the number of multimodal items to sample + request_num_mm_items = int( + self._rng.integers(min_num_mm_items, max_num_mm_items + 1) + ) + # If request_num_mm_items is 0, yield an empty iterator + if request_num_mm_items == 0: + return + # Initialize modality counters + modality_counter = {self.map_config_to_modality(k): 0 + for k in bucket_config} + # Copy the bucket config to avoid modifying the original + bucket_config_copy = bucket_config.copy() + # Loop over the number of multimodal items to sample + while sum(modality_counter.values()) < request_num_mm_items: + # Sample a multimodal item config + mm_item_config = self._rng.choice(list(bucket_config_copy.keys()), + p=list(bucket_config_copy.values())) + modality = self.map_config_to_modality(mm_item_config) + # Check that modality count is less than limit per prompt + if modality_counter[modality] < limit_mm_per_prompt[modality]: + modality_counter[modality] += 1 + yield ( + mm_item_config + ) + else: + # If the counter is greater than the limit per prompt + # set all multimodal items of this modality to 0 + for k, v in bucket_config_copy.items(): + if self.map_config_to_modality(k) == modality: + bucket_config_copy[k] = 0 + # If all configs are 0, break the loop + # This should not happen as request_num_mm_items is at most + # the sum of limit_mm_per_prompt for all modalities + if all(v == 0 for v in bucket_config_copy.values()): + logger.warning("Exhausted all multimodal items " + "of modality %s", + modality) + break + # Renormalize the bucket config + bucket_config_copy = self.normalize_bucket_config( + bucket_config_copy) + + + def sample( + self, + tokenizer: PreTrainedTokenizerBase, + num_requests: int, + request_id_prefix: str = "", + no_oversample: bool = False, + prefix_len: int = RandomDataset.DEFAULT_PREFIX_LEN, + range_ratio: float = RandomDataset.DEFAULT_RANGE_RATIO, + input_len: int = RandomDataset.DEFAULT_INPUT_LEN, + output_len: int = RandomDataset.DEFAULT_OUTPUT_LEN, + limit_mm_per_prompt: dict[str, int] = DEFAULT_LIMIT_MM_PER_PROMPT, + base_items_per_request: int = DEFAULT_BASE_ITEMS_PER_REQUEST, + num_mm_items_range_ratio: float = DEFAULT_NUM_MM_ITEMS_RANGE_RATIO, + bucket_config: dict[tuple[int, int, int], float] = + DEFAULT_MM_ITEM_BUCKET_CONFIG, + enable_multimodal_chat: bool = DEFAULT_ENABLE_MULTIMODAL_CHAT, + **kwargs, + ) -> list[SampleRequest]: + + # Get the sampling parameters for the dataset + input_lens, output_lens, offsets = self.get_sampling_params( + num_requests, range_ratio, input_len, output_len, tokenizer + ) + + ( + min_num_mm_items, + max_num_mm_items, + limit_mm_per_prompt, + bucket_config, + ) = self.get_mm_item_sampling_params( + base_items_per_request, + num_mm_items_range_ratio, + limit_mm_per_prompt, + bucket_config, + ) + + # Generate prefix once + prefix_token_ids = self.get_prefix(tokenizer, prefix_len) + vocab_size = tokenizer.vocab_size + # Add synthetic multimodal items to each request + mm_requests = [] + for i in range(num_requests): + prompt, total_input_len = self.generate_token_sequence( + tokenizer=tokenizer, + prefix_token_ids=prefix_token_ids, + prefix_len=prefix_len, + vocab_size=vocab_size, + input_len=int(input_lens[i]), + offset=int(offsets[i]), + index=i, + ) + # Get multimodal item iterator for a given request + mm_item_iterator = self.get_mm_item_iterator( + min_num_mm_items, + max_num_mm_items, + bucket_config, + limit_mm_per_prompt, + ) + + mm_content = cast(list[dict[str, Any]], [ + self.generate_mm_item(mm_item_config) + for mm_item_config in 
mm_item_iterator + ]) + + if enable_multimodal_chat: + # NOTE: For now this option is only provided for completeness + # given that the serve.py benchmark currently does not use it. + mm_chat_prompt: Any = prompt + mm_chat_prompt = self.apply_multimodal_chat_transformation( + prompt, mm_content) + sample_request = SampleRequest( + prompt=mm_chat_prompt, + prompt_len=total_input_len, + expected_output_len=int(output_lens[i]), + multi_modal_data=None, + request_id=request_id_prefix + str(i), + ) + else: + sample_request = SampleRequest( + prompt=prompt, + prompt_len=total_input_len, + expected_output_len=int(output_lens[i]), + multi_modal_data=mm_content, + request_id=request_id_prefix + str(i), + ) + mm_requests.append(sample_request) + return mm_requests + +# ----------------------------------------------------------------------------- +# ShareGPT Dataset Implementation +# ----------------------------------------------------------------------------- + + +class ShareGPTDataset(BenchmarkDataset): + """ + Implements the ShareGPT dataset. Loads data from a JSON file and generates + sample requests based on conversation turns. + """ + + def __init__(self, **kwargs) -> None: + super().__init__(**kwargs) + self.load_data() + + def load_data(self) -> None: + if self.dataset_path is None: + raise ValueError("dataset_path must be provided for loading data.") + + with open(self.dataset_path, encoding="utf-8") as f: + self.data = json.load(f) + # Filter entries with at least two conversation turns. + self.data = [ + entry for entry in self.data + if "conversations" in entry and len(entry["conversations"]) >= 2 + ] + random.seed(self.random_seed) + random.shuffle(self.data) + + def sample( + self, + tokenizer: PreTrainedTokenizerBase, + num_requests: int, + lora_path: Optional[str] = None, + max_loras: Optional[int] = None, + output_len: Optional[int] = None, + enable_multimodal_chat: bool = False, + request_id_prefix: str = "", + no_oversample: bool = False, + **kwargs, + ) -> list: + samples: list = [] + ind = 0 + for entry in self.data: + if len(samples) >= num_requests: + break + prompt, completion = ( + entry["conversations"][0]["value"], + entry["conversations"][1]["value"], + ) + + lora_request = self.get_random_lora_request( + max_loras=max_loras, lora_path=lora_path) + prompt_ids = tokenizer(prompt).input_ids + completion_ids = tokenizer(completion).input_ids + prompt_len = len(prompt_ids) + new_output_len = (len(completion_ids) + if output_len is None else output_len) + if not is_valid_sequence(prompt_len, + new_output_len, + skip_min_output_len_check=output_len + is not None): + continue + if image_path := entry.get("image"): + mm_content = process_image(image_path) + elif video_path := entry.get("video"): + mm_content = process_video(video_path) + else: + mm_content = None + if enable_multimodal_chat: + prompt = self.apply_multimodal_chat_transformation( + prompt, mm_content) + samples.append( + SampleRequest( + prompt=prompt, + prompt_len=prompt_len, + expected_output_len=new_output_len, + lora_request=lora_request, + multi_modal_data=mm_content, + request_id=request_id_prefix + str(ind), + )) + ind += 1 + self.maybe_oversample_requests(samples, + num_requests, + request_id_prefix, + no_oversample) + return samples + + +class _ValidateDatasetArgs(argparse.Action): + """Argparse action to validate dataset name and path compatibility.""" + def __call__(self, parser, namespace, values, option_string=None): + setattr(namespace, self.dest, values) + + # Get current values of both dataset_name 
and dataset_path + dataset_name = getattr(namespace, 'dataset_name', 'random') + dataset_path = getattr(namespace, 'dataset_path', None) + + # Validate the combination + if dataset_name == "random" and dataset_path is not None: + parser.error( + "Cannot use 'random' dataset with --dataset-path. " + "Please specify the appropriate --dataset-name (e.g., " + "'sharegpt', 'custom', 'sonnet') for your dataset file: " + f"{dataset_path}" + ) + + +def add_dataset_parser(parser: FlexibleArgumentParser): + parser.add_argument("--seed", type=int, default=0) + parser.add_argument( + "--num-prompts", + type=int, + default=1000, + help="Number of prompts to process.", + ) + parser.add_argument( + "--dataset-name", + type=str, + default="random", + action=_ValidateDatasetArgs, + choices=[ + "sharegpt", "burstgpt", "sonnet", "random", "random-mm", "hf", + "custom", "prefix_repetition", "spec_bench" + ], + help="Name of the dataset to benchmark on.", + ) + parser.add_argument( + "--no-stream", + action="store_true", + help="Do not load the dataset in streaming mode.", + ) + parser.add_argument( + "--dataset-path", + type=str, + default=None, + action=_ValidateDatasetArgs, + help="Path to the sharegpt/sonnet dataset. " + "Or the huggingface dataset ID if using HF dataset.", + ) + parser.add_argument( + "--no-oversample", + action="store_true", + help="Do not oversample if the dataset has " \ + "fewer samples than num-prompts.", + ) + + # group for dataset specific arguments + custom_group = parser.add_argument_group("custom dataset options") + custom_group.add_argument( + "--custom-output-len", + type=int, + default=256, + help= + "Number of output tokens per request, used only for custom dataset.", + ) + custom_group.add_argument( + "--custom-skip-chat-template", + action="store_true", + help= + "Skip applying chat template to prompt, used only for custom dataset.", + ) + + spec_bench_group = parser.add_argument_group("spec bench dataset options") + spec_bench_group.add_argument( + "--spec-bench-output-len", + type=int, + default=256, + help= + "Num of output tokens per request, used only for spec bench dataset.", + ) + spec_bench_group.add_argument( + "--spec-bench-category", + type=str, + default=None, + help= + "Category for spec bench dataset. If None, use all categories.", + ) + + sonnet_group = parser.add_argument_group("sonnet dataset options") + sonnet_group.add_argument( + "--sonnet-input-len", + type=int, + default=550, + help= + "Number of input tokens per request, used only for sonnet dataset.", + ) + sonnet_group.add_argument( + "--sonnet-output-len", + type=int, + default=150, + help= + "Number of output tokens per request, used only for sonnet dataset.", + ) + sonnet_group.add_argument( + "--sonnet-prefix-len", + type=int, + default=200, + help= + "Number of prefix tokens per request, used only for sonnet dataset.", + ) + + sharegpt_group = parser.add_argument_group("sharegpt dataset options") + sharegpt_group.add_argument( + "--sharegpt-output-len", + type=int, + default=None, + help="Output length for each request. Overrides the output length " + "from the ShareGPT dataset.", + ) + + blazedit_group = parser.add_argument_group("blazedit dataset options") + blazedit_group.add_argument( + "--blazedit-min-distance", + type=float, + default=0.0, + help= + "Minimum distance for blazedit dataset. Min: 0, Max: 1.0", + ) + blazedit_group.add_argument( + "--blazedit-max-distance", + type=float, + default=1.0, + help= + "Maximum distance for blazedit dataset. 
Min: 0, Max: 1.0", + ) + + random_group = parser.add_argument_group("random dataset options") + random_group.add_argument( + "--random-input-len", + type=int, + default=1024, + help= + "Number of input tokens per request, used only for random sampling.", + ) + random_group.add_argument( + "--random-output-len", + type=int, + default=128, + help= + "Number of output tokens per request, used only for random sampling.", + ) + random_group.add_argument( + "--random-range-ratio", + type=float, + default=0.0, + help="Range ratio for sampling input/output length, " + "used only for random sampling. Must be in the range [0, 1) to define " + "a symmetric sampling range" + "[length * (1 - range_ratio), length * (1 + range_ratio)].", + ) + random_group.add_argument( + "--random-prefix-len", + type=int, + default=0, + help=("Number of fixed prefix tokens before the random context " + "in a request. " + "The total input length is the sum of `random-prefix-len` and " + "a random " + "context length sampled from [input_len * (1 - range_ratio), " + "input_len * (1 + range_ratio)]."), + ) + random_group.add_argument( + "--random-batch-size", + type=int, + default=1, + help=("Batch size for random sampling. " + "Only used for embeddings benchmark."), + ) + + # random multimodal dataset options + random_mm_group = parser.add_argument_group( + "random multimodal dataset options extended from random dataset") + random_mm_group.add_argument( + "--random-mm-base-items-per-request", + type=int, + default=RandomMultiModalDataset.DEFAULT_BASE_ITEMS_PER_REQUEST, + help=( + "Base number of multimodal items per request for random-mm. " + "Actual per-request count is sampled around this base using " + "--random-mm-num-mm-items-range-ratio." + ), + ) + random_mm_group.add_argument( + "--random-mm-num-mm-items-range-ratio", + type=float, + default=RandomMultiModalDataset.DEFAULT_NUM_MM_ITEMS_RANGE_RATIO, + help=( + "Range ratio r in [0, 1] for sampling items per request. " + "We sample uniformly from the closed integer range " + "[floor(n*(1-r)), ceil(n*(1+r))] " + "where n is the base items per request. " + "r=0 keeps it fixed; r=1 allows 0 items. The maximum is clamped " + "to the sum of per-modality limits from " + "--random-mm-limit-mm-per-prompt. " + "An error is raised if the computed min exceeds the max." + ), + ) + random_mm_group.add_argument( + "--random-mm-limit-mm-per-prompt", + type=json.loads, + default=RandomMultiModalDataset.DEFAULT_LIMIT_MM_PER_PROMPT, + help=( + "Per-modality hard caps for items attached per request, e.g. " + "'{\"image\": 3, \"video\": 0}'. The sampled per-request item " + "count is clamped to the sum of these limits. When a modality " + "reaches its cap, its buckets are excluded and probabilities are " + "renormalized." + "OBS.: Only image sampling is supported for now." + ), + ) + + def _parse_mm_bucket_config(v: object) -> dict[tuple[int, int, int], float]: + # If already a dict (e.g., programmatic call), normalize keys + def normalize(d: dict) -> dict[tuple[int, int, int], float]: + out: dict[tuple[int, int, int], float] = {} + for k, val in d.items(): + key = k + if isinstance(key, str): + with suppress(Exception): + key = ast.literal_eval(key) + if not (isinstance(key, tuple) and len(key) == 3 + and all(isinstance(x, int) for x in key)): + raise ValueError( + f"Invalid bucket key {k!r}. Expected tuple (H, W, T)." 
+ ) + out[(int(key[0]), int(key[1]), int(key[2]))] = float(val) + return out + + if isinstance(v, dict): + return normalize(v) + if isinstance(v, str): + # Python literal (supports tuple keys) + parsed = ast.literal_eval(v) + if not isinstance(parsed, dict): + raise ValueError("Bucket config must parse to a dict.") + return normalize(parsed) + raise ValueError("Unsupported value for --random-mm-bucket-config.") + + random_mm_group.add_argument( + "--random-mm-bucket-config", + type=_parse_mm_bucket_config, + default=RandomMultiModalDataset.DEFAULT_MM_ITEM_BUCKET_CONFIG, + help=( + "The bucket config is a dictionary mapping a multimodal item" + "sampling configuration to a probability." + "Currently allows for 2 modalities: images and videos. " + "An bucket key is a tuple of (height, width, num_frames)" + "The value is the probability of sampling that specific item. " + "Example: " + "--random-mm-bucket-config " + "{(256, 256, 1): 0.5, (720, 1280, 1): 0.4, (720, 1280, 16): 0.10} " + "First item: images with resolution 256x256 w.p. 0.5" + "Second item: images with resolution 720x1280 w.p. 0.4 " + "Third item: videos with resolution 720x1280 and 16 frames w.p. 0.1" + "OBS.: If the probabilities do not sum to 1, they are normalized." + "OBS bis.: Only image sampling is supported for now." + ), + ) + + hf_group = parser.add_argument_group("hf dataset options") + hf_group.add_argument("--hf-subset", + type=str, + default=None, + help="Subset of the HF dataset.") + hf_group.add_argument("--hf-split", + type=str, + default=None, + help="Split of the HF dataset.") + hf_group.add_argument( + "--hf-name", + type=str, + default=None, + help=( + "Name of the dataset on HuggingFace " + "(e.g., 'lmarena-ai/VisionArena-Chat'). " + "Specify this if your dataset-path is a local path." + ), + ) + hf_group.add_argument( + "--hf-output-len", + type=int, + default=None, + help="Output length for each request. Overrides the output lengths " + "from the sampled HF dataset.", + ) + + prefix_repetition_group = parser.add_argument_group( + "prefix repetition dataset options") + prefix_repetition_group.add_argument( + "--prefix-repetition-prefix-len", + type=int, + default=256, + help="Number of prefix tokens per request, used only for prefix " + "repetition dataset.", + ) + prefix_repetition_group.add_argument( + "--prefix-repetition-suffix-len", + type=int, + default=256, + help="Number of suffix tokens per request, used only for prefix " + "repetition dataset. Total input length is prefix_len + suffix_len.", + ) + prefix_repetition_group.add_argument( + "--prefix-repetition-num-prefixes", + type=int, + default=10, + help="Number of prefixes to generate, used only for prefix repetition " + "dataset. 
Prompts per prefix is num_requests // num_prefixes.", + ) + prefix_repetition_group.add_argument( + "--prefix-repetition-output-len", + type=int, + default=128, + help="Number of output tokens per request, used only for prefix " + "repetition dataset.", + ) + + +def get_samples(args, tokenizer) -> list[SampleRequest]: + + if not hasattr(args, "request_id_prefix"): + args.request_id_prefix = "" + + if args.dataset_name == "custom": + dataset = CustomDataset(dataset_path=args.dataset_path) + input_requests = dataset.sample( + num_requests=args.num_prompts, + tokenizer=tokenizer, + output_len=args.custom_output_len, + skip_chat_template=args.custom_skip_chat_template, + request_id_prefix=args.request_id_prefix, + no_oversample=args.no_oversample, + ) + + elif args.dataset_name == "sonnet": + dataset = SonnetDataset(dataset_path=args.dataset_path) + # For the "sonnet" dataset, formatting depends on the backend. + if args.backend == "openai-chat": + input_requests = dataset.sample( + num_requests=args.num_prompts, + input_len=args.sonnet_input_len, + output_len=args.sonnet_output_len, + prefix_len=args.sonnet_prefix_len, + tokenizer=tokenizer, + return_prompt_formatted=False, + request_id_prefix=args.request_id_prefix, + no_oversample=args.no_oversample, + ) + else: + assert tokenizer.chat_template or tokenizer.default_chat_template, ( + "Tokenizer/model must have chat template for sonnet dataset.") + input_requests = dataset.sample( + num_requests=args.num_prompts, + input_len=args.sonnet_input_len, + output_len=args.sonnet_output_len, + prefix_len=args.sonnet_prefix_len, + tokenizer=tokenizer, + return_prompt_formatted=True, + request_id_prefix=args.request_id_prefix, + no_oversample=args.no_oversample, + ) + + elif args.dataset_name == "hf": + # all following datasets are implemented from the + # HuggingFaceDataset base class + hf_kwargs = {} + if ( + args.dataset_path in VisionArenaDataset.SUPPORTED_DATASET_PATHS + or args.hf_name in VisionArenaDataset.SUPPORTED_DATASET_PATHS + ): + dataset_class = VisionArenaDataset + args.hf_split = "train" + args.hf_subset = None + elif ( + args.dataset_path in MMVUDataset.SUPPORTED_DATASET_PATHS + or args.hf_name in MMVUDataset.SUPPORTED_DATASET_PATHS + ): + dataset_class = MMVUDataset + args.hf_split = "validation" + args.hf_subset = None + elif ( + args.dataset_path in InstructCoderDataset.SUPPORTED_DATASET_PATHS + or args.hf_name in InstructCoderDataset.SUPPORTED_DATASET_PATHS + ): + dataset_class = InstructCoderDataset + args.hf_split = "train" + elif ( + args.dataset_path in MTBenchDataset.SUPPORTED_DATASET_PATHS + or args.hf_name in MTBenchDataset.SUPPORTED_DATASET_PATHS + ): + dataset_class = MTBenchDataset + args.hf_split = "train" + elif ( + args.dataset_path in ConversationDataset.SUPPORTED_DATASET_PATHS + or args.hf_name in ConversationDataset.SUPPORTED_DATASET_PATHS + ): + dataset_class = ConversationDataset + elif ( + args.dataset_path in AIMODataset.SUPPORTED_DATASET_PATHS + or args.hf_name in AIMODataset.SUPPORTED_DATASET_PATHS + ): + dataset_class = AIMODataset + args.hf_split = "train" + elif ( + args.dataset_path + in NextEditPredictionDataset.SUPPORTED_DATASET_PATHS # noqa: E501 + or args.hf_name in NextEditPredictionDataset.SUPPORTED_DATASET_PATHS + ): + dataset_class = NextEditPredictionDataset + args.hf_split = "train" + elif ( + args.dataset_path in ASRDataset.SUPPORTED_DATASET_PATHS + or args.hf_name in ASRDataset.SUPPORTED_DATASET_PATHS + ): + dataset_class = ASRDataset + args.hf_split = "train" + elif args.dataset_path in 
BlazeditDataset.SUPPORTED_DATASET_PATHS: + dataset_class = BlazeditDataset + args.hf_split = "train" + hf_kwargs = { + "min_distance": args.blazedit_min_distance, + "max_distance": args.blazedit_max_distance, + } + elif ( + args.dataset_path in MLPerfDataset.SUPPORTED_DATASET_PATHS + or args.hf_name in MLPerfDataset.SUPPORTED_DATASET_PATHS + ): + dataset_class = MLPerfDataset + args.hf_split = "train" + else: + supported_datasets = set([ + dataset_name for cls in HuggingFaceDataset.__subclasses__() + for dataset_name in cls.SUPPORTED_DATASET_PATHS + ]) + raise ValueError( + f"Unsupported dataset path: {args.dataset_path}. " + "Huggingface dataset only supports dataset_path" + f" from one of following: {supported_datasets}. " + "Please consider contributing if you would " + "like to add support for additional dataset formats.") + + if dataset_class.IS_MULTIMODAL and args.backend not in [ + "openai-chat", + "openai-audio", + ]: + # multi-modal benchmark is only available on OpenAI Chat + # endpoint-type. + raise ValueError( + "Multi-modal content is only supported on 'openai-chat' and " + "'openai-audio' backends.") + input_requests = dataset_class( + dataset_path=args.dataset_path, + dataset_subset=args.hf_subset, + dataset_split=args.hf_split, + random_seed=args.seed, + no_stream=args.no_stream, + hf_name=args.hf_name, + ).sample( + num_requests=args.num_prompts, + tokenizer=tokenizer, + output_len=args.hf_output_len, + request_id_prefix=args.request_id_prefix, + no_oversample=args.no_oversample, + **hf_kwargs + ) + + else: + # For datasets that follow a similar structure, use a mapping. + dataset_mapping = { + "spec_bench": + lambda: SpecBench(dataset_path=args.dataset_path, + category=args.spec_bench_category).sample( + num_requests=args.num_prompts, + tokenizer=tokenizer, + output_len=args.spec_bench_output_len, + request_id_prefix=args.request_id_prefix, + no_oversample=args.no_oversample, + ), + "sharegpt": lambda: ShareGPTDataset( + random_seed=args.seed, dataset_path=args.dataset_path + ).sample( + tokenizer=tokenizer, + num_requests=args.num_prompts, + output_len=args.sharegpt_output_len, + request_id_prefix=args.request_id_prefix, + no_oversample=args.no_oversample, + ), + "burstgpt": lambda: BurstGPTDataset( + random_seed=args.seed, dataset_path=args.dataset_path + ).sample( + tokenizer=tokenizer, + num_requests=args.num_prompts, + request_id_prefix=args.request_id_prefix, + no_oversample=args.no_oversample, + ), + "random": lambda: RandomDataset( + random_seed=args.seed, dataset_path=args.dataset_path + ).sample( + tokenizer=tokenizer, + num_requests=args.num_prompts, + prefix_len=args.random_prefix_len, + input_len=args.random_input_len, + output_len=args.random_output_len, + range_ratio=args.random_range_ratio, + request_id_prefix=args.request_id_prefix, + batchsize=args.random_batch_size, + no_oversample=args.no_oversample, + ), + "random-mm": + lambda: RandomMultiModalDataset( + random_seed=args.seed, dataset_path=args.dataset_path + ).sample( + tokenizer=tokenizer, + num_requests=args.num_prompts, + prefix_len=args.random_prefix_len, + range_ratio=args.random_range_ratio, + input_len=args.random_input_len, + output_len=args.random_output_len, + base_items_per_request=args.random_mm_base_items_per_request, + limit_mm_per_prompt=args.random_mm_limit_mm_per_prompt, + num_mm_items_range_ratio=args.random_mm_num_mm_items_range_ratio, + bucket_config=args.random_mm_bucket_config, + request_id_prefix=args.request_id_prefix, + no_oversample=args.no_oversample, + ), + 
"prefix_repetition": + lambda: PrefixRepetitionRandomDataset( + random_seed=args.seed, dataset_path=args.dataset_path + ).sample( + tokenizer=tokenizer, + num_requests=args.num_prompts, + prefix_len=args.prefix_repetition_prefix_len, + suffix_len=args.prefix_repetition_suffix_len, + num_prefixes=args.prefix_repetition_num_prefixes, + output_len=args.prefix_repetition_output_len, + request_id_prefix=args.request_id_prefix, + no_oversample=args.no_oversample, + ), + } + + try: + # Enforce endpoint compatibility for multimodal datasets. + if args.dataset_name == "random-mm" and args.backend not in [ + "openai-chat"]: + raise ValueError( + "Multi-modal content (images) is only supported on " + "'openai-chat' backend." + ) + input_requests = dataset_mapping[args.dataset_name]() + except KeyError as err: + raise ValueError(f"Unknown dataset: {args.dataset_name}") from err + + return input_requests + + +# ----------------------------------------------------------------------------- +# Custom Dataset Implementation +# ----------------------------------------------------------------------------- + + +class CustomDataset(BenchmarkDataset): + """ + Implements the Custom dataset. Loads data from a JSONL file and generates + sample requests based on conversation turns. E.g., + ``` + {"prompt": "What is the capital of India?"} + {"prompt": "What is the capital of Iran?"} + {"prompt": "What is the capital of China?"} + ``` + """ + + def __init__(self, **kwargs) -> None: + super().__init__(**kwargs) + self.load_data() + + def load_data(self) -> None: + if self.dataset_path is None: + raise ValueError("dataset_path must be provided for loading data.") + + # self.data will be a list of dictionaries + # e.g., [{"prompt": "What is the capital of India?"}, ...] + # This will be the standardized format which load_data() + # has to convert into depending on the filetype of dataset_path. + # sample() will assume this standardized format of self.data + self.data = [] + + # Load the JSONL file + if self.dataset_path.endswith(".jsonl"): + jsonl_data = pd.read_json(path_or_buf=self.dataset_path, + lines=True) + + # check if the JSONL file has a 'prompt' column + if "prompt" not in jsonl_data.columns: + raise ValueError("JSONL file must contain a 'prompt' column.") + + # Convert each row to a dictionary and append to self.data + # This will convert the DataFrame to a list of dictionaries + # where each dictionary corresponds to a row in the DataFrame. 
+ # This is the standardized format we want for self.data + for _, row in jsonl_data.iterrows(): + self.data.append(row.to_dict()) + else: + raise NotImplementedError( + "Only JSONL format is supported for CustomDataset.") + + random.seed(self.random_seed) + random.shuffle(self.data) + + def sample( + self, + tokenizer: PreTrainedTokenizerBase, + num_requests: int, + lora_path: Optional[str] = None, + max_loras: Optional[int] = None, + output_len: Optional[int] = None, + enable_multimodal_chat: bool = False, + skip_chat_template: bool = False, + request_id_prefix: str = "", + no_oversample: bool = False, + **kwargs, + ) -> list: + # load all data if needed + self.num_available_samples = len(self.data) + if num_requests <= 0: + num_requests = self.num_available_samples + logger.info("num_requests is set to 0 or negative, " + "so using all available samples: %d", + num_requests) + + sampled_requests = [] + for i, item in enumerate(self.data): + if len(sampled_requests) >= num_requests: + break + prompt = item["prompt"] + + # apply template + if not skip_chat_template: + prompt = tokenizer.apply_chat_template( + [{ + "role": "user", + "content": prompt + }], + add_generation_prompt=True, + tokenize=False, + ) + + prompt_len = len(tokenizer(prompt).input_ids) + sampled_requests.append( + SampleRequest( + prompt=prompt, + prompt_len=prompt_len, + expected_output_len=output_len, + request_id=request_id_prefix + str(i), + )) + self.maybe_oversample_requests(sampled_requests, num_requests, + request_id_prefix, no_oversample) + + return sampled_requests + + +# ----------------------------------------------------------------------------- +# Spec Bench Dataset Implementation +# ----------------------------------------------------------------------------- + + +class SpecBench(CustomDataset): + """ + Implements the SpecBench dataset: https://github.com/hemingkx/Spec-Bench + Download the dataset using: + wget https://raw.githubusercontent.com/hemingkx/Spec-Bench/refs/heads/main/data/spec_bench/question.jsonl + """ # noqa: E501 + + def __init__(self, **kwargs) -> None: + self.category = kwargs.pop("category", None) + super().__init__(**kwargs) + self.load_data() + + def load_data(self) -> None: + if self.dataset_path is None: + raise ValueError("dataset_path must be provided for loading data.") + + self.data = [] + + # Load the JSONL file + jsonl_data = pd.read_json(path_or_buf=self.dataset_path, + lines=True) + + # check if the JSONL file has a 'turns' column + if "turns" not in jsonl_data.columns: + raise ValueError("JSONL file must contain a 'turns' column.") + + for _, row in jsonl_data.iterrows(): + # sample only from a specific category if specified + if (not self.category) or (self.category == row['category']): + prompt = row["turns"][0] + self.data.append({"prompt": prompt}) + + random.seed(self.random_seed) + random.shuffle(self.data) + + def sample(self, **kwargs) -> list: + # leverage CustomDataset sample + kwargs["skip_chat_template"] = False + return super().sample(**kwargs) + + +# ----------------------------------------------------------------------------- +# Sonnet Dataset Implementation +# ----------------------------------------------------------------------------- + +@deprecated( + "SonnetDataset is deprecated and will be removed in a future version.", +) +class SonnetDataset(BenchmarkDataset): + """ + Simplified implementation of the Sonnet dataset. Loads poem lines from a + text file and generates sample requests. 
Default values here copied from + `benchmark_serving.py` for the sonnet dataset. + """ + + DEFAULT_PREFIX_LEN = 200 + DEFAULT_INPUT_LEN = 550 + DEFAULT_OUTPUT_LEN = 150 + + def __init__( + self, + **kwargs, + ) -> None: + super().__init__(**kwargs) + self.load_data() + + def load_data(self) -> None: + if not self.dataset_path: + raise ValueError("dataset_path must be provided.") + with open(self.dataset_path, encoding="utf-8") as f: + self.data = f.readlines() + + def sample( + self, + tokenizer, + num_requests: int, + prefix_len: int = DEFAULT_PREFIX_LEN, + input_len: int = DEFAULT_INPUT_LEN, + output_len: int = DEFAULT_OUTPUT_LEN, + return_prompt_formatted: bool = False, + request_id_prefix: str = "", + no_oversample: bool = False, + **kwargs, + ) -> list: + # Calculate average token length for a poem line. + tokenized_lines = [tokenizer(line).input_ids for line in self.data] + avg_len = sum(len(tokens) + for tokens in tokenized_lines) / len(tokenized_lines) + + # Build the base prompt. + base_prompt = "Pick as many lines as you can from these poem lines:\n" + base_msg = [{"role": "user", "content": base_prompt}] + base_fmt = tokenizer.apply_chat_template(base_msg, + add_generation_prompt=True, + tokenize=False) + base_offset = len(tokenizer(base_fmt).input_ids) + if input_len <= base_offset: + raise ValueError( + f"'input_len' must be higher than the base prompt length " + f"({base_offset}).") + + # Determine how many poem lines to use. + num_input_lines = round((input_len - base_offset) / avg_len) + num_prefix_lines = max(round((prefix_len - base_offset) / avg_len), 0) + prefix_lines = self.data[:num_prefix_lines] + + samples = [] + ind = 0 + while len(samples) < num_requests: + extra_lines = random.choices(self.data, + k=num_input_lines - num_prefix_lines) + prompt = f"{base_prompt}{''.join(prefix_lines + extra_lines)}" + msg = [{"role": "user", "content": prompt}] + prompt_formatted = tokenizer.apply_chat_template( + msg, add_generation_prompt=True, tokenize=False) + prompt_len = len(tokenizer(prompt_formatted).input_ids) + if prompt_len <= input_len: + samples.append( + SampleRequest( + prompt=prompt_formatted + if return_prompt_formatted else prompt, + prompt_len=prompt_len, + expected_output_len=output_len, + request_id=request_id_prefix + str(ind), + )) + ind += 1 + return samples + + +# ----------------------------------------------------------------------------- +# BurstGPT Dataset Implementation +# ----------------------------------------------------------------------------- + + +class BurstGPTDataset(BenchmarkDataset): + """ + Implements the BurstGPT dataset. Loads data from a CSV file and generates + sample requests based on synthetic prompt generation. Only rows with Model + "GPT-4" and positive response tokens are used. + """ + + def __init__(self, **kwargs) -> None: + super().__init__(**kwargs) + self.load_data() + + def load_data(self, ): + if self.dataset_path is None: + raise ValueError("dataset_path must be provided for loading data.") + + df = pd.read_csv(self.dataset_path) + # Filter to keep only GPT-4 rows. + gpt4_df = df[df["Model"] == "GPT-4"] + # Remove failed requests (where Response tokens is 0 or less). + gpt4_df = gpt4_df[gpt4_df["Response tokens"] > 0] + # Sample the desired number of rows. 
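+        # (Down-sampling to num_requests happens later in
+        # _sample_loaded_data; only the filtered frame is stored here.)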
+ self.data = gpt4_df + + def _sample_loaded_data(self, num_requests: int) -> list: + if num_requests <= len(self.data): + data = self.data.sample(n=num_requests, + random_state=self.random_seed) + else: + data = self.data.sample( + n=num_requests, + random_state=self.random_seed, + replace=True, + ) + # Convert the dataframe to a list of lists. + return data.values.tolist() + + def sample( + self, + tokenizer: PreTrainedTokenizerBase, + num_requests: int, + max_loras: Optional[int] = None, + lora_path: Optional[str] = None, + request_id_prefix: str = "", + no_oversample: bool = False, + **kwargs, + ) -> list[SampleRequest]: + samples = [] + data = self._sample_loaded_data(num_requests=num_requests) + for i in range(num_requests): + input_len = int(data[i][2]) + output_len = int(data[i][3]) + lora_req = self.get_random_lora_request( + max_loras=max_loras, lora_path=lora_path) + vocab_size = tokenizer.vocab_size + # Generate a synthetic prompt: a list of token IDs computed as (i + + # j) modulo vocab_size. + token_ids = [(i + j) % vocab_size for j in range(input_len)] + prompt = tokenizer.decode(token_ids) + samples.append( + SampleRequest( + prompt=prompt, + prompt_len=input_len, + expected_output_len=output_len, + lora_request=lora_req, + request_id=request_id_prefix + str(i), + )) + return samples + + +# ----------------------------------------------------------------------------- +# HuggingFace Dataset Base Implementation +# ----------------------------------------------------------------------------- +class HuggingFaceDataset(BenchmarkDataset): + """Base class for datasets hosted on HuggingFace.""" + + SUPPORTED_DATASET_PATHS: Union[set[str], dict[str, Callable]] = set() + + def __init__( + self, + dataset_path: str, + dataset_split: str, + no_stream: bool = False, + dataset_subset: Optional[str] = None, + hf_name: Optional[str] = None, + **kwargs, + ) -> None: + super().__init__(dataset_path=dataset_path, **kwargs) + + self.dataset_split = dataset_split + self.dataset_subset = dataset_subset + self.load_stream = not no_stream + self.hf_name = hf_name or dataset_path + self.load_data() + + def load_data(self) -> None: + """Load data from HuggingFace datasets.""" + self.data = load_dataset( + self.dataset_path, + name=self.dataset_subset, + split=self.dataset_split, + streaming=self.load_stream, + ) + self.data = self.data.shuffle(seed=self.random_seed) + + +# ----------------------------------------------------------------------------- +# Conversation Dataset Implementation +# ----------------------------------------------------------------------------- + + +class ConversationDataset(HuggingFaceDataset): + """Dataset for conversation data with multimodal support.""" + SUPPORTED_DATASET_PATHS = { + 'lmms-lab/LLaVA-OneVision-Data', 'Aeala/ShareGPT_Vicuna_unfiltered' + } + IS_MULTIMODAL = True + + def sample(self, + tokenizer: PreTrainedTokenizerBase, + num_requests: int, + output_len: Optional[int] = None, + enable_multimodal_chat: bool = False, + request_id_prefix: str = "", + no_oversample: bool = False, + **kwargs) -> list: + # Filter examples with at least 2 conversations + filtered_data = self.data.filter( + lambda x: len(x["conversations"]) >= 2) + sampled_requests = [] + ind = 0 + dynamic_output = output_len is None + + for item in filtered_data: + if len(sampled_requests) >= num_requests: + break + conv = item["conversations"] + prompt, completion = conv[0]["value"], conv[1]["value"] + + prompt_ids = tokenizer(prompt).input_ids + completion_ids = tokenizer(completion).input_ids + 
prompt_len = len(prompt_ids) + completion_len = len(completion_ids) + output_len = completion_len if dynamic_output else output_len + assert isinstance(output_len, int) and output_len > 0 + if dynamic_output and not is_valid_sequence( + prompt_len, completion_len): + continue + mm_content = process_image( + item["image"]) if "image" in item else None + if enable_multimodal_chat: + # Note: when chat is enabled the request prompt_len is no longer + # accurate and we will be using request output to count the + # actual prompt len and output len + prompt = self.apply_multimodal_chat_transformation( + prompt, mm_content) + sampled_requests.append( + SampleRequest( + prompt=prompt, + prompt_len=prompt_len, + expected_output_len=output_len, + multi_modal_data=mm_content, + request_id=request_id_prefix + str(ind), + )) + ind += 1 + self.maybe_oversample_requests(sampled_requests, num_requests, + request_id_prefix, no_oversample) + return sampled_requests + + +# ----------------------------------------------------------------------------- +# Vision Arena Dataset Implementation +# ----------------------------------------------------------------------------- + + +class VisionArenaDataset(HuggingFaceDataset): + """ + Vision Arena Dataset. + """ + + DEFAULT_OUTPUT_LEN = 128 + SUPPORTED_DATASET_PATHS = { + "lmarena-ai/VisionArena-Chat": + lambda x: x["conversation"][0][0]["content"], + "lmarena-ai/vision-arena-bench-v0.1": + lambda x: x["turns"][0][0]["content"] + } + IS_MULTIMODAL = True + + def sample( + self, + tokenizer: PreTrainedTokenizerBase, + num_requests: int, + output_len: Optional[int] = None, + enable_multimodal_chat: bool = False, + request_id_prefix: str = "", + no_oversample: bool = False, + **kwargs, + ) -> list: + output_len = (output_len + if output_len is not None else self.DEFAULT_OUTPUT_LEN) + sampled_requests = [] + for i, item in enumerate(self.data): + if len(sampled_requests) >= num_requests: + break + parser_fn = self.SUPPORTED_DATASET_PATHS.get(self.hf_name) + if parser_fn is None: + raise ValueError(f"Unsupported dataset path: {self.hf_name}") + prompt = parser_fn(item) + mm_content = process_image(item["images"][0]) + prompt_len = len(tokenizer(prompt).input_ids) + if enable_multimodal_chat: + # Note: when chat is enabled the request prompt_len is no longer + # accurate and we will be using request output to count the + # actual prompt len + prompt = self.apply_multimodal_chat_transformation( + prompt, mm_content) + sampled_requests.append( + SampleRequest( + prompt=prompt, + prompt_len=prompt_len, + expected_output_len=output_len, + multi_modal_data=mm_content, + request_id=request_id_prefix + str(i), + )) + self.maybe_oversample_requests(sampled_requests, num_requests, + request_id_prefix, no_oversample) + return sampled_requests + + +class MMVUDataset(HuggingFaceDataset): + """ + MMVU Dataset. 
+ https://huggingface.co/datasets/yale-nlp/MMVU + """ + + DEFAULT_OUTPUT_LEN = 128 + SUPPORTED_DATASET_PATHS = { + "yale-nlp/MMVU": + lambda x: x["question"] + " " + ( + " ".join(f"{k}.{v}" for k, v in x["choices"].items()) + ), + } + + def sample( + self, + tokenizer: PreTrainedTokenizerBase, + num_requests: int, + output_len: Optional[int] = None, + enable_multimodal_chat: bool = False, + request_id_prefix: str = "", + no_oversample: bool = False, + **kwargs, + ) -> list: + output_len = (output_len + if output_len is not None else self.DEFAULT_OUTPUT_LEN) + sampled_requests = [] + for i, item in enumerate(self.data): + if len(sampled_requests) >= num_requests: + break + parser_fn = self.SUPPORTED_DATASET_PATHS.get(self.hf_name) + if parser_fn is None: + raise ValueError(f"Unsupported dataset path: {self.hf_name}") + prompt = parser_fn(item) + mm_content = process_video(item["video"]) + prompt_len = len(tokenizer(prompt).input_ids) + if enable_multimodal_chat: + # Note: when chat is enabled the request prompt_len is no longer + # accurate and we will be using request output to count the + # actual prompt len + prompt = self.apply_multimodal_chat_transformation( + prompt, mm_content) + sampled_requests.append( + SampleRequest( + prompt=prompt, + prompt_len=prompt_len, + expected_output_len=output_len, + multi_modal_data=mm_content, + request_id=request_id_prefix + str(i), + )) + self.maybe_oversample_requests(sampled_requests, num_requests, + request_id_prefix, no_oversample) + return sampled_requests + + +# ----------------------------------------------------------------------------- +# Instruct Coder Dataset Implementation +# ----------------------------------------------------------------------------- + + +class InstructCoderDataset(HuggingFaceDataset): + """ + InstructCoder Dataset. + https://huggingface.co/datasets/likaixin/InstructCoder + + InstructCoder is the dataset designed for general code editing. It consists + of 114,239 instruction-input-output triplets, and covers multiple distinct + code editing scenario. + """ + + DEFAULT_OUTPUT_LEN = 200 # this is the average default output length + SUPPORTED_DATASET_PATHS = { + "likaixin/InstructCoder", + } + + def sample(self, + tokenizer: PreTrainedTokenizerBase, + num_requests: int, + output_len: Optional[int] = None, + enable_multimodal_chat: bool = False, + request_id_prefix: str = "", + no_oversample: bool = False, + **kwargs) -> list: + output_len = (output_len + if output_len is not None else self.DEFAULT_OUTPUT_LEN) + sampled_requests = [] + for i, item in enumerate(self.data): + if len(sampled_requests) >= num_requests: + break + prompt = ( + f"{item['input']}\n\n{item['instruction']} Just output " + "the code, do not include any explanation." + ) + + # apply template + prompt = tokenizer.apply_chat_template( + [{ + "role": "user", + "content": prompt + }], + add_generation_prompt=True, + tokenize=False, + ) + + prompt_len = len(tokenizer(prompt).input_ids) + sampled_requests.append( + SampleRequest( + prompt=prompt, + prompt_len=prompt_len, + expected_output_len=output_len, + request_id=request_id_prefix + str(i), + )) + self.maybe_oversample_requests(sampled_requests, num_requests, + request_id_prefix, no_oversample) + return sampled_requests + + +# ----------------------------------------------------------------------------- +# MT-Bench Dataset Implementation +# ----------------------------------------------------------------------------- + + +class MTBenchDataset(HuggingFaceDataset): + """ + MT-Bench Dataset. 
+ https://huggingface.co/datasets/philschmid/mt-bench + + We create a single turn dataset for MT-Bench. + This is similar to Spec decoding benchmark setup in vLLM + https://github.com/vllm-project/vllm/blob/9d98ab5ec/examples/offline_inference/eagle.py#L14-L18 + """ # noqa: E501 + + DEFAULT_OUTPUT_LEN = 256 # avg len used in SD bench in vLLM + SUPPORTED_DATASET_PATHS = { + "philschmid/mt-bench", + } + + def sample( + self, + tokenizer: PreTrainedTokenizerBase, + num_requests: int, + output_len: Optional[int] = None, + enable_multimodal_chat: bool = False, + request_id_prefix: str = "", + no_oversample: bool = False, + **kwargs, + ) -> list: + output_len = (output_len + if output_len is not None else self.DEFAULT_OUTPUT_LEN) + sampled_requests = [] + + for i, item in enumerate(self.data): + if len(sampled_requests) >= num_requests: + break + prompt = item["turns"][0] + + # apply template + prompt = tokenizer.apply_chat_template( + [{ + "role": "user", + "content": prompt + }], + add_generation_prompt=True, + tokenize=False, + ) + + prompt_len = len(tokenizer(prompt).input_ids) + sampled_requests.append( + SampleRequest( + prompt=prompt, + prompt_len=prompt_len, + expected_output_len=output_len, + request_id=request_id_prefix + str(i), + )) + self.maybe_oversample_requests(sampled_requests, num_requests, + request_id_prefix, no_oversample) + return sampled_requests + + +# ----------------------------------------------------------------------------- +# Blazedit Dataset Implementation +# ----------------------------------------------------------------------------- + + +class BlazeditDataset(HuggingFaceDataset): + """ + Blazedit Dataset. + https://github.com/ise-uiuc/blazedit + + 5k char version: vdaita/edit_5k_char + 10k char version: vdaita/edit_10k_char + """ # noqa: E501 + + # 5k char version will have output as ~5k chars + # 10k char version will have output as ~10k chars + # Assuming 3 char per token, 10k chars will be 3333 tokens + # We set default to 4000 to be safe + DEFAULT_OUTPUT_LEN = 4000 + SUPPORTED_DATASET_PATHS = { + "vdaita/edit_5k_char", + "vdaita/edit_10k_char", + } + + def sample( + self, + tokenizer: PreTrainedTokenizerBase, + num_requests: int, + output_len: Optional[int] = None, + request_id_prefix: str = "", + no_oversample: bool = False, + min_distance: float = 0.0, + max_distance: float = 1.0, + **kwargs, + ) -> list: + output_len = (output_len + if output_len is not None else self.DEFAULT_OUTPUT_LEN) + sampled_requests = [] + + for i, item in enumerate(self.data): + if len(sampled_requests) >= num_requests: + break + code = item["code"] + change_request = item["change_request"] + norm_distance = item["norm_distance"] + + # compare the levenshtein distance normalized by code length + if norm_distance < min_distance or norm_distance > max_distance: + continue + + # template copied from + # https://github.com/ise-uiuc/blazedit/blob/7765137e656fd62de877422d2e4cf8de51228054/dataset/create_refined_dataset.py#L94-L105 # noqa: E501 + instruction = f"""Given a code file, please apply the change requests and generate the new file. 
+ +Original file: +```python +{code} +``` + +Change request: +{change_request} + +Please generate the new code file in the "New file" section below.""" # noqa: E501 + + # apply template + prompt = tokenizer.apply_chat_template( + [{ + "role": "user", + "content": instruction + }], + add_generation_prompt=True, + tokenize=False, + ) + + prompt_len = len(tokenizer(prompt).input_ids) + + sampled_requests.append( + SampleRequest( + prompt=prompt, + prompt_len=prompt_len, + expected_output_len=output_len, + request_id=request_id_prefix + str(i), + )) + self.maybe_oversample_requests(sampled_requests, num_requests, + request_id_prefix, no_oversample) + + return sampled_requests + + +# ----------------------------------------------------------------------------- +# AIMO Dataset Implementation +# ----------------------------------------------------------------------------- + + +class AIMODataset(HuggingFaceDataset): + """ + Dataset class for processing a AIMO dataset with reasoning questions. + """ + SUPPORTED_DATASET_PATHS = { + "AI-MO/aimo-validation-aime", "AI-MO/NuminaMath-1.5", + "AI-MO/NuminaMath-CoT" + } + + def sample(self, + tokenizer: PreTrainedTokenizerBase, + num_requests: int, + output_len: Optional[int] = None, + request_id_prefix: str = "", + no_oversample: bool = False, + **kwargs) -> list: + sampled_requests = [] + ind = 0 + dynamic_output = output_len is None + + for item in self.data: + if len(sampled_requests) >= num_requests: + break + prompt, completion = item['problem'], item["solution"] + + prompt_ids = tokenizer(prompt).input_ids + completion_ids = tokenizer(completion).input_ids + prompt_len = len(prompt_ids) + completion_len = len(completion_ids) + output_len = completion_len if dynamic_output else output_len + assert isinstance(output_len, int) and output_len > 0 + if dynamic_output and not is_valid_sequence(prompt_len, + completion_len, + max_prompt_len=2048, + max_total_len=32000): + continue + sampled_requests.append( + SampleRequest( + prompt=prompt, + prompt_len=prompt_len, + expected_output_len=output_len, + multi_modal_data=None, + request_id=request_id_prefix + str(ind), + )) + ind += 1 + self.maybe_oversample_requests(sampled_requests, num_requests, + request_id_prefix, no_oversample) + return sampled_requests + + +# ----------------------------------------------------------------------------- +# Next Edit Prediction Dataset Implementation +# ----------------------------------------------------------------------------- + + +zeta_prompt = """### Instruction: +You are a code completion assistant and your task is to analyze user edits and then rewrite an excerpt that the user provides, suggesting the appropriate edits within the excerpt, taking into account the cursor location. + +### User Edits: + +{} + +### User Excerpt: + +{} + +### Response: + +""" # noqa: E501 + + +def _format_zeta_prompt( + sample: dict, + original_start_marker: str = "<|editable_region_start|>") -> dict: + """Format the zeta prompt for the Next Edit Prediction (NEP) dataset. + + This function formats examples from the NEP dataset + into prompts and expected outputs. It could be + further extended to support more NEP datasets. + + Args: + sample: The dataset sample containing events, + inputs, and outputs. + original_start_marker: The marker indicating the + start of the editable region. Defaults to + "<|editable_region_start|>". + + Returns: + A dictionary with the formatted prompts and expected outputs. 
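+
+    Example (illustrative):
+        If `sample["output"]` is "header<|editable_region_start|>body",
+        the returned `expected_output` is "<|editable_region_start|>body".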
+ """ + events = sample["events"] + input = sample["input"] + output = sample["output"] + prompt = zeta_prompt.format(events, input) + + # following the original implementation, extract the focused region + # from the raw output + output_start_index = output.find(original_start_marker) + output_focused_region = output[output_start_index:] + expected_output = output_focused_region + + return {"prompt": prompt, "expected_output": expected_output} + + +class NextEditPredictionDataset(HuggingFaceDataset): + """ + Dataset class for processing a Next Edit Prediction dataset. + """ + + SUPPORTED_DATASET_PATHS = { + "zed-industries/zeta", + } + MAPPING_PROMPT_FUNCS = { + "zed-industries/zeta": _format_zeta_prompt, + } + + def sample(self, tokenizer: PreTrainedTokenizerBase, num_requests: int, + request_id_prefix: str = "", + no_oversample: bool = False, + **kwargs): + formatting_prompt_func = self.MAPPING_PROMPT_FUNCS.get(self.hf_name) + if formatting_prompt_func is None: + raise ValueError(f"Unsupported dataset path: {self.hf_name}") + samples = [] + for i, sample in enumerate(self.data): + sample = formatting_prompt_func(sample) + samples.append( + SampleRequest( + prompt=sample["prompt"], + prompt_len=len(tokenizer(sample["prompt"]).input_ids), + expected_output_len=len( + tokenizer(sample["expected_output"]).input_ids), + request_id=request_id_prefix + str(i), + )) + if len(samples) >= num_requests: + break + self.maybe_oversample_requests(samples, + num_requests, + request_id_prefix, + no_oversample) + return samples + + +# ----------------------------------------------------------------------------- +# ASR Dataset Implementation +# ----------------------------------------------------------------------------- + + +class ASRDataset(HuggingFaceDataset): + """ + Dataset class for processing a ASR dataset for transcription. + Tested on the following set: + + +----------------+----------------------------------------+--------------------------+-----------------------------+ + | Dataset | Domain | Speaking Style | hf-subset | + +----------------+----------------------------------------+--------------------------+-----------------------------+ + | TED-LIUM | TED talks | Oratory | release1, release2, release3| + | | | | release3-speaker-adaptation | + | VoxPopuli | European Parliament | Oratory | en, de, it, fr, ... | + | LibriSpeech | Audiobook | Narrated | "LIUM/tedlium" | + | GigaSpeech | Audiobook, podcast, YouTube | Narrated, spontaneous | xs, s, m, l, xl, dev, test | + | SPGISpeech | Financial meetings | Oratory, spontaneous | S, M, L, dev, test | + | AMI | Meetings | Spontaneous | ihm, sdm | + +----------------+----------------------------------------+--------------------------+-----------------------------+ + + """ # noqa: E501 + + SUPPORTED_DATASET_PATHS = { + "openslr/librispeech_asr", + "facebook/voxpopuli", + "LIUM/tedlium", + "edinburghcstr/ami", + "speechcolab/gigaspeech", + "kensho/spgispeech", + } + + DEFAULT_OUTPUT_LEN = 128 + IS_MULTIMODAL = True + + # TODO Whisper-specific. Abstract interface when more models are supported. 
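+    # Each sampled request pairs this fixed transcription prompt with one
+    # audio clip passed as multi_modal_data={"audio": (waveform, sample_rate)}.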
+ TRANSCRIPTION_PREAMBLE = ( + "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>") + skip_long_audios: bool = True + + def sample( + self, + tokenizer: PreTrainedTokenizerBase, + num_requests: int, + output_len: Optional[int] = None, + request_id_prefix: str = "", + no_oversample: bool = False, + **kwargs, + ) -> list: + output_len = (output_len + if output_len is not None else self.DEFAULT_OUTPUT_LEN) + prompt = ASRDataset.TRANSCRIPTION_PREAMBLE + prompt_len = len(tokenizer(prompt).input_ids) + sampled_requests = [] + ind = 0 + skipped = 0 + for item in self.data: + if len(sampled_requests) >= num_requests: + break + audio = item["audio"] + y, sr = audio["array"], audio["sampling_rate"] + duration_s = librosa.get_duration(y=y, sr=sr) + # Whisper max supported duration + if self.skip_long_audios and duration_s > 30: + skipped += 1 + continue + + mm_content = {"audio": (y, sr)} + sampled_requests.append( + SampleRequest( + prompt=prompt, + prompt_len=prompt_len, + expected_output_len=output_len, + multi_modal_data=mm_content, + request_id=request_id_prefix + str(ind), + )) + ind += 1 + if skipped: + logger.warning( + "%d samples discarded from dataset due to" + " their length being greater than" + " what Whisper supports.", + skipped, + ) + self.maybe_oversample_requests(sampled_requests, num_requests, + request_id_prefix, no_oversample) + return sampled_requests + + +# ----------------------------------------------------------------------------- +# MLPerf Dataset Implementation +# ----------------------------------------------------------------------------- + + +class MLPerfDataset(HuggingFaceDataset): + """ + MLPerf Inference Dataset. + + Dataset on HF: + https://huggingface.co/datasets/mgoin/mlperf-inference-llama2-data + https://huggingface.co/datasets/mgoin/mlperf-inference-llama3.1-data + + Each record contains: + - "system_prompt": system role instruction. + - "question": user question. + - "output": reference answer. + + We combine the system prompt and question into a chat-formatted prompt + (using the tokenizer's chat template) and set the expected output length to + the tokenized length of the provided reference answer. + """ + + SUPPORTED_DATASET_PATHS = { + "mgoin/mlperf-inference-llama2-data", + "mgoin/mlperf-inference-llama3.1-data", + } + + def sample( + self, + tokenizer: PreTrainedTokenizerBase, + num_requests: int, + output_len: Optional[int] = None, + request_id_prefix: str = "", + no_oversample: bool = False, + **kwargs, + ) -> list[SampleRequest]: + # Force dynamic output length based on reference completion. + dynamic_output = output_len is None + sampled_requests: list[SampleRequest] = [] + ind = 0 + + for item in self.data: + if len(sampled_requests) >= num_requests: + break + + system_prompt = item["system_prompt"] + question = item["question"] + reference_answer = item["output"] + + # Build chat-style prompt using tokenizer template, if available. + messages = [ + {"role": "system", "content": system_prompt}, + {"role": "user", "content": question}, + ] + prompt_formatted = tokenizer.apply_chat_template( + messages, add_generation_prompt=True, tokenize=False + ) + prompt_len = len(tokenizer(prompt_formatted).input_ids) + + # Determine output length from reference answer tokens. + ref_out_len = len( + tokenizer(reference_answer, add_special_tokens=False).input_ids + ) + expected_output_len = ref_out_len if dynamic_output else output_len + + # Validate sequence lengths. 
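+            # (Pairs whose prompt or reference output length falls outside
+            # the default bounds checked by is_valid_sequence are skipped.)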
+ if not is_valid_sequence(prompt_len, expected_output_len): + continue + + sampled_requests.append( + SampleRequest( + prompt=prompt_formatted, + prompt_len=prompt_len, + expected_output_len=expected_output_len, + request_id=request_id_prefix + str(ind), + ) + ) + ind += 1 + + self.maybe_oversample_requests(sampled_requests, num_requests, + request_id_prefix, no_oversample) + return sampled_requests + + +# ----------------------------------------------------------------------------- +# Prefix Repetition Dataset Implementation +# ----------------------------------------------------------------------------- + + +class PrefixRepetitionRandomDataset(BenchmarkDataset): + # Default values copied from benchmark_serving.py for the repeated prefix + # dataset. + DEFAULT_PREFIX_LEN = 256 + DEFAULT_SUFFIX_LEN = 256 + DEFAULT_NUM_PREFIXES = 10 + DEFAULT_OUTPUT_LEN = 128 + + def __init__( + self, + **kwargs, + ) -> None: + super().__init__(**kwargs) + random.seed(self.random_seed) + np.random.seed(self.random_seed) + + def sample( + self, + tokenizer: PreTrainedTokenizerBase, + num_requests: int, + prefix_len: int = DEFAULT_PREFIX_LEN, + suffix_len: int = DEFAULT_SUFFIX_LEN, + num_prefixes: int = DEFAULT_NUM_PREFIXES, + output_len: int = DEFAULT_OUTPUT_LEN, + request_id_prefix: str = "", + no_oversample: bool = False, + **kwargs, + ) -> list[SampleRequest]: + vocab_size = tokenizer.vocab_size + prompts_per_prefix = num_requests // num_prefixes + if prompts_per_prefix == 0: + raise ValueError( + f"num_requests ({num_requests}) must be greater than or equal " + f"to num_prefixes ({num_prefixes})" + ) + + def _generate_exact_length_tokens(target_length: int) -> list[int]: + """Generate tokens that decode and re-encode to exactly + target_length.""" + # Generate random tokens + tokens = np.random.randint( + 0, vocab_size, size=target_length).tolist() + text = tokenizer.decode(tokens) + re_encoded = tokenizer.encode(text, add_special_tokens=False) + + if len(re_encoded) == target_length: + return re_encoded + elif len(re_encoded) < target_length: + # Recursively generate additional consistent tokens + needed = target_length - len(re_encoded) + extra_tokens = _generate_exact_length_tokens(needed) + return re_encoded + extra_tokens + else: + # Truncate to target length + return re_encoded[:target_length] + + requests = [] + for _ in range(num_prefixes): + prefix_tokens = _generate_exact_length_tokens(prefix_len) + + for _ in range(prompts_per_prefix): + suffix_tokens = _generate_exact_length_tokens(suffix_len) + + combined_tokens = prefix_tokens + suffix_tokens + prompt = tokenizer.decode(combined_tokens) + prompt_len = len(combined_tokens) + requests.append( + SampleRequest( + prompt=prompt, + prompt_len=prompt_len, + expected_output_len=output_len, + ) + ) + + random.shuffle(requests) + return requests diff --git a/vllm_omni/benchmarks/latency.py b/vllm_omni/benchmarks/latency.py new file mode 100644 index 000000000..05378ec74 --- /dev/null +++ b/vllm_omni/benchmarks/latency.py @@ -0,0 +1,170 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Benchmark the latency of processing a single batch of requests.""" + +import argparse +import dataclasses +import json +import os +import time +from typing import Any, Optional + +import numpy as np +from tqdm import tqdm + +import vllm.envs as envs +from vllm.benchmarks.lib.utils import (convert_to_pytorch_benchmark_format, + write_to_json) +from vllm.engine.arg_utils import EngineArgs +from vllm.inputs 
import PromptType +from vllm.sampling_params import BeamSearchParams + + +def save_to_pytorch_benchmark_format(args: argparse.Namespace, + results: dict[str, Any]) -> None: + pt_records = convert_to_pytorch_benchmark_format( + args=args, + metrics={"latency": results["latencies"]}, + extra_info={k: results[k] + for k in ["avg_latency", "percentiles"]}) + if pt_records: + pt_file = f"{os.path.splitext(args.output_json)[0]}.pytorch.json" + write_to_json(pt_file, pt_records) + + +def add_cli_args(parser: argparse.ArgumentParser): + parser.add_argument("--input-len", type=int, default=32) + parser.add_argument("--output-len", type=int, default=128) + parser.add_argument("--batch-size", type=int, default=8) + parser.add_argument( + "--n", + type=int, + default=1, + help="Number of generated sequences per prompt.", + ) + parser.add_argument("--use-beam-search", action="store_true") + parser.add_argument( + "--num-iters-warmup", + type=int, + default=10, + help="Number of iterations to run for warmup.", + ) + parser.add_argument("--num-iters", + type=int, + default=30, + help="Number of iterations to run.") + parser.add_argument( + "--profile", + action="store_true", + help="profile the generation process of a single batch", + ) + parser.add_argument( + "--output-json", + type=str, + default=None, + help="Path to save the latency results in JSON format.", + ) + parser.add_argument( + "--disable-detokenize", + action="store_true", + help=("Do not detokenize responses (i.e. do not include " + "detokenization time in the latency measurement)"), + ) + + parser = EngineArgs.add_cli_args(parser) + # V1 enables prefix caching by default which skews the latency + # numbers. We need to disable prefix caching by default. + parser.set_defaults(enable_prefix_caching=False) + + +def main(args: argparse.Namespace): + if args.profile and not envs.VLLM_TORCH_PROFILER_DIR: + raise OSError( + "The environment variable 'VLLM_TORCH_PROFILER_DIR' is not set. " + "Please set it to a valid path to use torch profiler.") + engine_args = EngineArgs.from_cli_args(args) + + # Lazy import to avoid importing LLM when the bench command is not selected. + from vllm import LLM, SamplingParams + + # NOTE(woosuk): If the request cannot be processed in a single batch, + # the engine will automatically process the request in multiple batches. 
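+    # All parsed EngineArgs fields (e.g. model, dtype, tensor_parallel_size)
+    # are forwarded unchanged to the LLM constructor.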
+ llm = LLM(**dataclasses.asdict(engine_args)) + assert llm.llm_engine.model_config.max_model_len >= ( + args.input_len + + args.output_len), ("Please ensure that max_model_len is greater than" + " the sum of input_len and output_len.") + + sampling_params = SamplingParams( + n=args.n, + temperature=1.0, + top_p=1.0, + ignore_eos=True, + max_tokens=args.output_len, + detokenize=not args.disable_detokenize, + ) + dummy_prompt_token_ids = np.random.randint(10000, + size=(args.batch_size, + args.input_len)) + dummy_prompts: list[PromptType] = [{ + "prompt_token_ids": batch + } for batch in dummy_prompt_token_ids.tolist()] + + def llm_generate(): + if not args.use_beam_search: + llm.generate(dummy_prompts, + sampling_params=sampling_params, + use_tqdm=False) + else: + llm.beam_search( + dummy_prompts, + BeamSearchParams( + beam_width=args.n, + max_tokens=args.output_len, + ignore_eos=True, + ), + ) + + def run_to_completion(profile_dir: Optional[str] = None): + if profile_dir: + llm.start_profile() + llm_generate() + llm.stop_profile() + else: + start_time = time.perf_counter() + llm_generate() + end_time = time.perf_counter() + latency = end_time - start_time + return latency + + print("Warming up...") + for _ in tqdm(range(args.num_iters_warmup), desc="Warmup iterations"): + run_to_completion(profile_dir=None) + + if args.profile: + profile_dir = envs.VLLM_TORCH_PROFILER_DIR + print(f"Profiling (results will be saved to '{profile_dir}')...") + run_to_completion(profile_dir=profile_dir) + return + + # Benchmark. + latencies = [] + for _ in tqdm(range(args.num_iters), desc="Profiling iterations"): + latencies.append(run_to_completion(profile_dir=None)) + latencies = np.array(latencies) + percentages = [10, 25, 50, 75, 90, 99] + percentiles = np.percentile(latencies, percentages) + print(f"Avg latency: {np.mean(latencies)} seconds") + for percentage, percentile in zip(percentages, percentiles): + print(f"{percentage}% percentile latency: {percentile} seconds") + + # Output JSON results if specified + if args.output_json: + results = { + "avg_latency": np.mean(latencies), + "latencies": latencies.tolist(), + "percentiles": dict(zip(percentages, percentiles.tolist())), + } + with open(args.output_json, "w") as f: + json.dump(results, f, indent=4) + save_to_pytorch_benchmark_format(args, results) diff --git a/vllm_omni/benchmarks/serve.py b/vllm_omni/benchmarks/serve.py new file mode 100644 index 000000000..2a042802d --- /dev/null +++ b/vllm_omni/benchmarks/serve.py @@ -0,0 +1,1358 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +r"""Benchmark online serving throughput. 
+ +On the server side, run one of the following commands +to launch the vLLM OpenAI API server: + vllm serve + +On the client side, run: + vllm bench serve \ + --backend \ + --label \ + --model \ + --dataset-name \ + --request-rate \ + --num-prompts +""" +import argparse +import asyncio +import gc +import importlib.util +import json +import os +import random +import shutil +import time +import warnings +from collections.abc import AsyncGenerator, Iterable +from dataclasses import dataclass +from datetime import datetime +from enum import Enum +from typing import Any, Literal, Optional + +import aiohttp +import numpy as np +from tqdm.asyncio import tqdm +from transformers import PreTrainedTokenizerBase + +from vllm.benchmarks.datasets import (SampleRequest, add_dataset_parser, + get_samples) +from vllm.benchmarks.lib.endpoint_request_func import ( + ASYNC_REQUEST_FUNCS, OPENAI_COMPATIBLE_BACKENDS, RequestFuncInput, + RequestFuncOutput) +from vllm.benchmarks.lib.ready_checker import wait_for_endpoint +from vllm.benchmarks.lib.utils import (convert_to_pytorch_benchmark_format, + write_to_json) +from vllm.transformers_utils.tokenizer import get_tokenizer + +MILLISECONDS_TO_SECONDS_CONVERSION = 1000 + +TERM_PLOTLIB_AVAILABLE = ((importlib.util.find_spec("termplotlib") is not None) + and (shutil.which("gnuplot") is not None)) + + +# TODO: Remove this in v0.11.0 +class DeprecatedEndpointTypeAction(argparse.Action): + """Argparse action for the deprecated --endpoint-type flag. + """ + + def __call__(self, _, namespace, values, option_string=None): + warnings.warn( + "'--endpoint-type' is deprecated and will be removed in v0.11.0. " + "Please use '--backend' instead or remove this argument if you " + "have already set it.", + stacklevel=1, + ) + setattr(namespace, self.dest, values) + + +class TaskType(Enum): + GENERATION = "generation" + EMBEDDING = "embedding" + + +@dataclass +class BenchmarkMetrics: + completed: int + total_input: int + total_output: int + request_throughput: float + request_goodput: float + output_throughput: float + total_token_throughput: float + mean_ttft_ms: float + median_ttft_ms: float + std_ttft_ms: float + percentiles_ttft_ms: list[tuple[float, float]] + mean_tpot_ms: float + median_tpot_ms: float + std_tpot_ms: float + percentiles_tpot_ms: list[tuple[float, float]] + mean_itl_ms: float + median_itl_ms: float + std_itl_ms: float + percentiles_itl_ms: list[tuple[float, float]] + # E2EL stands for end-to-end latency per request. + # It is the time taken on the client side from sending + # a request to receiving a complete response. 
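+    # For streaming backends this is approximately TTFT plus the sum of that
+    # request's inter-token latencies (ITL).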
+ mean_e2el_ms: float + median_e2el_ms: float + std_e2el_ms: float + percentiles_e2el_ms: list[tuple[float, float]] + # Max output tokens per second and concurrent requests at that peak + max_output_tokens_per_s: float + max_concurrent_requests: int + + +@dataclass +class EmbedBenchmarkMetrics: + completed: int + total_input: int + request_throughput: float + total_token_throughput: float + mean_e2el_ms: float + std_e2el_ms: float + median_e2el_ms: float + percentiles_e2el_ms: float + + +def _get_current_request_rate( + ramp_up_strategy: Optional[Literal["linear", "exponential"]], + ramp_up_start_rps: Optional[int], + ramp_up_end_rps: Optional[int], + request_index: int, + total_requests: int, + request_rate: float, +) -> float: + if (ramp_up_strategy and ramp_up_start_rps is not None + and ramp_up_end_rps is not None): + progress = request_index / max(total_requests - 1, 1) + if ramp_up_strategy == "linear": + increase = (ramp_up_end_rps - ramp_up_start_rps) * progress + return ramp_up_start_rps + increase + elif ramp_up_strategy == "exponential": + ratio = ramp_up_end_rps / ramp_up_start_rps + return ramp_up_start_rps * (ratio**progress) + else: + raise ValueError(f"Unknown ramp-up strategy: {ramp_up_strategy}") + return request_rate + + +async def get_request( + input_requests: list[SampleRequest], + request_rate: float, + burstiness: float = 1.0, + ramp_up_strategy: Optional[Literal["linear", "exponential"]] = None, + ramp_up_start_rps: Optional[int] = None, + ramp_up_end_rps: Optional[int] = None, +) -> AsyncGenerator[tuple[SampleRequest, float], None]: + """ + Asynchronously generates requests at a specified rate + with OPTIONAL burstiness and OPTIONAL ramp-up strategy. + + Args: + input_requests: + A list of input requests, each represented as a SampleRequest. + request_rate: + The rate at which requests are generated (requests/s). + burstiness (optional): + The burstiness factor of the request generation. + Only takes effect when request_rate is not inf. + Default value is 1, which follows a Poisson process. + Otherwise, the request intervals follow a gamma distribution. + A lower burstiness value (0 < burstiness < 1) results + in more bursty requests, while a higher burstiness value + (burstiness > 1) results in a more uniform arrival of requests. + ramp_up_strategy (optional): + The ramp-up strategy. Can be "linear" or "exponential". + If None, uses constant request rate (specified by request_rate). + ramp_up_start_rps (optional): + The starting request rate for ramp-up. + ramp_up_end_rps (optional): + The ending request rate for ramp-up. + """ + assert burstiness > 0, ( + f"A positive burstiness factor is expected, but given {burstiness}.") + # Convert to list to get length for ramp-up calculations + if isinstance(input_requests, + Iterable) and not isinstance(input_requests, list): + input_requests = list(input_requests) + + total_requests = len(input_requests) + assert total_requests > 0, "No requests provided." + + # Precompute delays among requests to minimize request send laggings + request_rates = [] + delay_ts = [] + for request_index, request in enumerate(input_requests): + current_request_rate = _get_current_request_rate( + ramp_up_strategy, ramp_up_start_rps, ramp_up_end_rps, + request_index, total_requests, request_rate) + request_rates.append(current_request_rate) + if current_request_rate == float("inf"): + delay_ts.append(0) + else: + theta = 1.0 / (current_request_rate * burstiness) + + # Sample the request interval from the gamma distribution. 
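+                # With shape=burstiness and scale=theta, the mean interval is
+                # burstiness * theta = 1 / current_request_rate, so the average
+                # request rate is preserved for any burstiness value.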
+ # If burstiness is 1, it follows exponential distribution. + delay_ts.append(np.random.gamma(shape=burstiness, scale=theta)) + + # Calculate the cumulative delay time from the first sent out requests. + for i in range(1, len(delay_ts)): + delay_ts[i] += delay_ts[i - 1] + if ramp_up_strategy is None and delay_ts[-1] != 0: + # When ramp_up_strategy is not set, we assume the request rate is fixed + # and all requests should be sent in target_total_delay_s, the following + # logic would re-scale delay time to ensure the final delay_ts + # align with target_total_delay_s. + # + # NOTE: If we simply accumulate the random delta values + # from the gamma distribution, their sum would have 1-2% gap + # from target_total_delay_s. The purpose of the following logic is to + # close the gap for stabilizing the throughput data + # from different random seeds. + target_total_delay_s = total_requests / request_rate + normalize_factor = target_total_delay_s / delay_ts[-1] + delay_ts = [delay * normalize_factor for delay in delay_ts] + + start_ts = time.time() + for request_index, request in enumerate(input_requests): + if delay_ts[request_index] > 0: + current_ts = time.time() + sleep_interval_s = start_ts + delay_ts[request_index] - current_ts + if sleep_interval_s > 0: + await asyncio.sleep(sleep_interval_s) + yield request, request_rates[request_index] + + +def calculate_metrics_for_embeddings( + outputs: list[RequestFuncOutput], dur_s: float, + selected_percentiles: list[float]) -> EmbedBenchmarkMetrics: + """Calculate the metrics for the embedding requests. + + Args: + outputs: The outputs of the requests. + dur_s: The duration of the benchmark. + selected_percentiles: The percentiles to select. + + Returns: + The calculated benchmark metrics. + """ + total_input = 0 + completed = 0 + e2els: list[float] = [] + for i in range(len(outputs)): + if outputs[i].success: + e2els.append(outputs[i].latency) + completed += 1 + total_input += outputs[i].prompt_len + + if completed == 0: + warnings.warn( + "All requests failed. This is likely due to a misconfiguration " + "on the benchmark arguments.", + stacklevel=2) + metrics = EmbedBenchmarkMetrics( + completed=completed, + total_input=total_input, + request_throughput=completed / dur_s, + total_token_throughput=total_input / dur_s, + mean_e2el_ms=np.mean(e2els or 0) * 1000, + std_e2el_ms=np.std(e2els or 0) * 1000, + median_e2el_ms=np.median(e2els or 0) * 1000, + percentiles_e2el_ms=[(p, np.percentile(e2els or 0, p) * 1000) + for p in selected_percentiles], + ) + return metrics + + +def calculate_metrics( + input_requests: list[SampleRequest], + outputs: list[RequestFuncOutput], + dur_s: float, + tokenizer: PreTrainedTokenizerBase, + selected_percentiles: list[float], + goodput_config_dict: dict[str, float], +) -> tuple[BenchmarkMetrics, list[int]]: + """Calculate the metrics for the benchmark. + + Args: + input_requests: The input requests. + outputs: The outputs of the requests. + dur_s: The duration of the benchmark. + tokenizer: The tokenizer to use. + selected_percentiles: The percentiles to select. + goodput_config_dict: The goodput configuration. + + Returns: + A tuple of the benchmark metrics and the actual output lengths. 
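+
+    Note:
+        Per-request TPOT is (latency - TTFT) / (output_len - 1); requests
+        that produce at most one output token count as 0 toward goodput and
+        are excluded from the reported TPOT statistics.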
+ """ + actual_output_lens: list[int] = [] + total_input = 0 + completed = 0 + good_completed = 0 + itls: list[float] = [] + tpots: list[float] = [] + all_tpots: list[float] = [] + ttfts: list[float] = [] + e2els: list[float] = [] + for i in range(len(outputs)): + if outputs[i].success: + output_len = outputs[i].output_tokens + + if not output_len: + # We use the tokenizer to count the number of output tokens + # for some serving backends instead of looking at + # len(outputs[i].itl) since multiple output tokens may be + # bundled together + # Note : this may inflate the output token count slightly + output_len = len( + tokenizer(outputs[i].generated_text, + add_special_tokens=False).input_ids) + actual_output_lens.append(output_len) + total_input += input_requests[i].prompt_len + tpot = 0 + if output_len > 1: + latency_minus_ttft = outputs[i].latency - outputs[i].ttft + tpot = latency_minus_ttft / (output_len - 1) + tpots.append(tpot) + # Note: if output_len <= 1, we regard tpot as 0 for goodput + all_tpots.append(tpot) + itls += outputs[i].itl + ttfts.append(outputs[i].ttft) + e2els.append(outputs[i].latency) + completed += 1 + else: + actual_output_lens.append(0) + + if goodput_config_dict: + valid_metrics = [] + slo_values = [] + + if "ttft" in goodput_config_dict: + valid_metrics.append(ttfts) + slo_values.append(goodput_config_dict["ttft"] / + MILLISECONDS_TO_SECONDS_CONVERSION) + if "tpot" in goodput_config_dict: + valid_metrics.append(all_tpots) + slo_values.append(goodput_config_dict["tpot"] / + MILLISECONDS_TO_SECONDS_CONVERSION) + if "e2el" in goodput_config_dict: + valid_metrics.append(e2els) + slo_values.append(goodput_config_dict["e2el"] / + MILLISECONDS_TO_SECONDS_CONVERSION) + + for req_metric in zip(*valid_metrics): + is_good_req = all([s >= r for s, r in zip(slo_values, req_metric)]) + if is_good_req: + good_completed += 1 + + if completed == 0: + warnings.warn( + "All requests failed. 
This is likely due to a misconfiguration " + "on the benchmark arguments.", + stacklevel=2) + + # Calculate max output tokens per second metric + max_output_tokens_per_s = 0.0 + max_concurrent_requests = 0 + + # Find the time range across all successful requests + successful_outputs = [output for output in outputs if output.success] + if successful_outputs: + min_start_time = min(output.start_time + for output in successful_outputs) + max_end_time = max(output.start_time + output.latency + for output in successful_outputs) + + # Create second buckets (ceiling to ensure we capture all time) + duration_seconds = int(np.ceil(max_end_time - min_start_time)) + 1 + tokens_per_second = np.zeros(duration_seconds) + concurrent_requests_per_second = np.zeros(duration_seconds) + + for i, output in enumerate(successful_outputs): + # Calculate token generation timestamp using + # start_time, ttft, and itl + token_times = [output.start_time + output.ttft] + current_time = token_times[0] + for itl_value in output.itl: + current_time += itl_value + token_times.append(current_time) + + # Add tokens to second buckets + for token_time in token_times: + second_bucket = int(token_time - min_start_time) + if 0 <= second_bucket < duration_seconds: + tokens_per_second[second_bucket] += 1 + + # Track concurrent requests for each second this request was active + request_start_second = int(output.start_time - min_start_time) + request_end_second = int((output.start_time + output.latency) - + min_start_time) + + for second in range(request_start_second, request_end_second + 1): + concurrent_requests_per_second[second] += 1 + + # Find the maximum tokens per second and corresponding + # concurrent requests + if len(tokens_per_second) > 0: + max_output_tokens_per_s = float(np.max(tokens_per_second)) + max_concurrent_requests = int( + np.max(concurrent_requests_per_second)) + + if TERM_PLOTLIB_AVAILABLE: + import termplotlib as tpl + fig = tpl.figure() + fig.plot(np.arange(len(tokens_per_second)), + tokens_per_second, + title="Output tokens per second") + fig.plot(np.arange(len(concurrent_requests_per_second)), + concurrent_requests_per_second, + title="Concurrent requests per second") + fig.show() + else: + print("tip: install termplotlib and gnuplot to plot the metrics") + + metrics = BenchmarkMetrics( + completed=completed, + total_input=total_input, + total_output=sum(actual_output_lens), + request_throughput=completed / dur_s, + request_goodput=good_completed / dur_s, + output_throughput=sum(actual_output_lens) / dur_s, + total_token_throughput=(total_input + sum(actual_output_lens)) / dur_s, + mean_ttft_ms=np.mean(ttfts or 0) * + 1000, # ttfts is empty if streaming is not supported by the endpoint + std_ttft_ms=np.std(ttfts or 0) * 1000, + median_ttft_ms=np.median(ttfts or 0) * 1000, + percentiles_ttft_ms=[(p, np.percentile(ttfts or 0, p) * 1000) + for p in selected_percentiles], + mean_tpot_ms=np.mean(tpots or 0) * 1000, + std_tpot_ms=np.std(tpots or 0) * 1000, + median_tpot_ms=np.median(tpots or 0) * 1000, + percentiles_tpot_ms=[(p, np.percentile(tpots or 0, p) * 1000) + for p in selected_percentiles], + mean_itl_ms=np.mean(itls or 0) * 1000, + std_itl_ms=np.std(itls or 0) * 1000, + median_itl_ms=np.median(itls or 0) * 1000, + percentiles_itl_ms=[(p, np.percentile(itls or 0, p) * 1000) + for p in selected_percentiles], + mean_e2el_ms=np.mean(e2els or 0) * 1000, + std_e2el_ms=np.std(e2els or 0) * 1000, + median_e2el_ms=np.median(e2els or 0) * 1000, + percentiles_e2el_ms=[(p, np.percentile(e2els or 0, p) * 1000) + 
for p in selected_percentiles], + max_output_tokens_per_s=max_output_tokens_per_s, + max_concurrent_requests=max_concurrent_requests, + ) + + return metrics, actual_output_lens + + +async def benchmark( + endpoint_type: str, + api_url: str, + base_url: str, + model_id: str, + model_name: str, + tokenizer: PreTrainedTokenizerBase, + input_requests: list[SampleRequest], + logprobs: Optional[int], + request_rate: float, + burstiness: float, + disable_tqdm: bool, + profile: bool, + selected_percentile_metrics: list[str], + selected_percentiles: list[float], + ignore_eos: bool, + goodput_config_dict: dict[str, float], + max_concurrency: Optional[int], + lora_modules: Optional[Iterable[str]], + extra_headers: Optional[dict], + extra_body: Optional[dict], + ramp_up_strategy: Optional[Literal["linear", "exponential"]] = None, + ramp_up_start_rps: Optional[int] = None, + ramp_up_end_rps: Optional[int] = None, + ready_check_timeout_sec: int = 600, +): + task_type = (TaskType.EMBEDDING if api_url.endswith("/v1/embeddings") else + TaskType.GENERATION) + if endpoint_type in ASYNC_REQUEST_FUNCS: + if task_type == TaskType.EMBEDDING: + request_func = ASYNC_REQUEST_FUNCS["openai-embeddings"] + else: + request_func = ASYNC_REQUEST_FUNCS[endpoint_type] + else: + raise ValueError(f"Unknown backend: {endpoint_type}") + + # Reuses connections across requests to reduce TLS handshake overhead. + connector = aiohttp.TCPConnector( + limit=max_concurrency or 0, + limit_per_host=max_concurrency or 0, + ttl_dns_cache=300, + use_dns_cache=True, + keepalive_timeout=60, + enable_cleanup_closed=True, + force_close=False, + ssl=("https://" in api_url), + ) + + session = aiohttp.ClientSession( + connector=connector, + trust_env=True, + timeout=aiohttp.ClientTimeout(total=6 * 60 * 60), + ) + + print("Starting initial single prompt test run...") + test_prompt, test_prompt_len, test_output_len, test_mm_content = ( + input_requests[0].prompt, + input_requests[0].prompt_len, + input_requests[0].expected_output_len, + input_requests[0].multi_modal_data, + ) + + assert (test_mm_content is None or isinstance(test_mm_content, dict) + or (isinstance(test_mm_content, list) + and all(isinstance(item, dict) for item in test_mm_content)) + ), "multi_modal_data must be a dict or list[dict]" + test_input = RequestFuncInput( + model=model_id, + model_name=model_name, + prompt=test_prompt, + api_url=api_url, + prompt_len=test_prompt_len, + output_len=test_output_len, + logprobs=logprobs, + multi_modal_content=test_mm_content, + ignore_eos=ignore_eos, + extra_headers=extra_headers, + extra_body=extra_body, + ) + + if ready_check_timeout_sec > 0: + test_output = await wait_for_endpoint( + request_func, + test_input, + session, + timeout_seconds=ready_check_timeout_sec, + ) + if not test_output.success: + raise ValueError( + "Initial test run failed - Please make sure benchmark " + "arguments are correctly specified. " + f"Error: {test_output.error}") + else: + print("Initial test run completed. Starting main benchmark run...") + else: + print("Skipping endpoint ready check.") + + if lora_modules: + # For each input request, choose a LoRA module at random. 
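+        # The random choices are drawn up front and consumed one per request
+        # via next() in the request loop below.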
+ lora_modules = iter( + [random.choice(lora_modules) for _ in range(len(input_requests))]) + + if profile: + print("Starting profiler...") + profile_input = RequestFuncInput(model=model_id, + model_name=model_name, + prompt=test_prompt, + api_url=base_url + "/start_profile", + prompt_len=test_prompt_len, + output_len=test_output_len, + logprobs=logprobs, + multi_modal_content=test_mm_content, + ignore_eos=ignore_eos, + extra_headers=extra_headers, + extra_body=extra_body) + profile_output = await request_func(request_func_input=profile_input, + session=session) + if profile_output.success: + print("Profiler started") + + distribution = ("Poisson process" + if burstiness == 1.0 else "Gamma distribution") + + if ramp_up_strategy is not None: + print(f"Traffic ramp-up strategy: {ramp_up_strategy}.") + print(f"Will increase RPS from {ramp_up_start_rps} to " + f"{ramp_up_end_rps} RPS over the duration of the benchmark.") + else: + print(f"Traffic request rate: {request_rate}") + + print(f"Burstiness factor: {burstiness} ({distribution})") + print(f"Maximum request concurrency: {max_concurrency}") + + pbar = None if disable_tqdm else tqdm(total=len(input_requests)) + + # This can be used once the minimum Python version is 3.10 or higher, + # and it will simplify the code in limited_request_func. + # semaphore = (asyncio.Semaphore(max_concurrency) + # if max_concurrency else contextlib.nullcontext()) + semaphore = (asyncio.Semaphore(max_concurrency) + if max_concurrency else None) + + async def limited_request_func(request_func_input, session, pbar): + if semaphore is None: + return await request_func(request_func_input=request_func_input, + session=session, + pbar=pbar) + async with semaphore: + return await request_func(request_func_input=request_func_input, + session=session, + pbar=pbar) + + benchmark_start_time = time.perf_counter() + tasks: list[asyncio.Task] = [] + + rps_change_events = [] + last_int_rps = -1 + if ramp_up_strategy is not None and ramp_up_start_rps is not None: + last_int_rps = ramp_up_start_rps + rps_change_events.append({ + "rps": last_int_rps, + "timestamp": datetime.now().isoformat(), + }) + + async for request, current_request_rate in get_request( + input_requests, request_rate, burstiness, ramp_up_strategy, + ramp_up_start_rps, ramp_up_end_rps): + if ramp_up_strategy is not None: + current_int_rps = int(current_request_rate) + if current_int_rps > last_int_rps: + timestamp = datetime.now().isoformat() + for rps_val in range(last_int_rps + 1, current_int_rps + 1): + rps_change_events.append({ + "rps": rps_val, + "timestamp": timestamp + }) + last_int_rps = current_int_rps + prompt, prompt_len, output_len, mm_content, request_id = ( + request.prompt, + request.prompt_len, + request.expected_output_len, + request.multi_modal_data, + request.request_id, + ) + req_model_id, req_model_name = model_id, model_name + if lora_modules: + req_lora_module = next(lora_modules) + req_model_id, req_model_name = req_lora_module, req_lora_module + + request_func_input = RequestFuncInput( + model=req_model_id, + model_name=req_model_name, + prompt=prompt, + api_url=api_url, + prompt_len=prompt_len, + output_len=output_len, + logprobs=logprobs, + multi_modal_content=mm_content, + ignore_eos=ignore_eos, + extra_headers=extra_headers, + extra_body=extra_body, + request_id=request_id, + ) + tasks.append( + asyncio.create_task( + limited_request_func(request_func_input=request_func_input, + session=session, + pbar=pbar))) + outputs: list[RequestFuncOutput] = await asyncio.gather(*tasks) + + 
if pbar is not None: + pbar.close() + + benchmark_duration = time.perf_counter() - benchmark_start_time + + if task_type == TaskType.GENERATION: + metrics, actual_output_lens = calculate_metrics( + input_requests=input_requests, + outputs=outputs, + dur_s=benchmark_duration, + tokenizer=tokenizer, + selected_percentiles=selected_percentiles, + goodput_config_dict=goodput_config_dict, + ) + else: + metrics = calculate_metrics_for_embeddings( + outputs=outputs, + dur_s=benchmark_duration, + selected_percentiles=selected_percentiles, + ) + actual_output_lens = 0 + + print("{s:{c}^{n}}".format(s=' Serving Benchmark Result ', n=50, c='=')) + print("{:<40} {:<10}".format("Successful requests:", metrics.completed)) + if max_concurrency is not None: + print("{:<40} {:<10}".format("Maximum request concurrency:", + max_concurrency)) + if request_rate != float('inf'): + print("{:<40} {:<10.2f}".format("Request rate configured (RPS):", + request_rate)) + print("{:<40} {:<10.2f}".format("Benchmark duration (s):", + benchmark_duration)) + print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input)) + if isinstance(metrics, BenchmarkMetrics): + print("{:<40} {:<10}".format("Total generated tokens:", + metrics.total_output)) + print("{:<40} {:<10.2f}".format("Request throughput (req/s):", + metrics.request_throughput)) + if goodput_config_dict: + print("{:<40} {:<10.2f}".format("Request goodput (req/s):", + metrics.request_goodput)) + if isinstance(metrics, BenchmarkMetrics): + print("{:<40} {:<10.2f}".format("Output token throughput (tok/s):", + metrics.output_throughput)) + print("{:<40} {:<10.2f}".format( + "Peak output token throughput (tok/s):", + metrics.max_output_tokens_per_s)) + print("{:<40} {:<10.2f}".format("Peak concurrent requests:", + metrics.max_concurrent_requests)) + print("{:<40} {:<10.2f}".format("Total Token throughput (tok/s):", + metrics.total_token_throughput)) + + if isinstance(metrics, BenchmarkMetrics): + result = { + "duration": benchmark_duration, + "completed": metrics.completed, + "total_input_tokens": metrics.total_input, + "total_output_tokens": metrics.total_output, + "request_throughput": metrics.request_throughput, + "request_goodput": + metrics.request_goodput if goodput_config_dict else None, + "output_throughput": metrics.output_throughput, + "total_token_throughput": metrics.total_token_throughput, + "input_lens": [output.prompt_len for output in outputs], + "output_lens": actual_output_lens, + "ttfts": [output.ttft for output in outputs], + "itls": [output.itl for output in outputs], + "generated_texts": [output.generated_text for output in outputs], + "errors": [output.error for output in outputs], + "max_output_tokens_per_s": metrics.max_output_tokens_per_s, + "max_concurrent_requests": metrics.max_concurrent_requests, + } + else: + result = { + "duration": benchmark_duration, + "completed": metrics.completed, + "total_input_tokens": metrics.total_input, + "request_throughput": metrics.request_throughput, + "total_token_throughput": metrics.total_token_throughput, + "input_lens": [output.prompt_len for output in outputs], + "errors": [output.error for output in outputs], + } + + if rps_change_events: + result["rps_change_events"] = rps_change_events + + def process_one_metric( + # E.g., "ttft" + metric_attribute_name: str, + # E.g., "TTFT" + metric_name: str, + # E.g., "Time to First Token" + metric_header: str, + ): + # This function prints and adds statistics of the specified + # metric. 
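+        # Keys are added to `result` as mean_/median_/std_/p{percentile}_
+        # followed by "{metric_attribute_name}_ms", mirroring the printed rows.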
+ if metric_attribute_name not in selected_percentile_metrics: + return + print("{s:{c}^{n}}".format(s=metric_header, n=50, c='-')) + print("{:<40} {:<10.2f}".format( + f"Mean {metric_name} (ms):", + getattr(metrics, f"mean_{metric_attribute_name}_ms"))) + print("{:<40} {:<10.2f}".format( + f"Median {metric_name} (ms):", + getattr(metrics, f"median_{metric_attribute_name}_ms"))) + result[f"mean_{metric_attribute_name}_ms"] = getattr( + metrics, f"mean_{metric_attribute_name}_ms") + result[f"median_{metric_attribute_name}_ms"] = getattr( + metrics, f"median_{metric_attribute_name}_ms") + result[f"std_{metric_attribute_name}_ms"] = getattr( + metrics, f"std_{metric_attribute_name}_ms") + for p, value in getattr(metrics, + f"percentiles_{metric_attribute_name}_ms"): + p_word = str(int(p)) if int(p) == p else str(p) + print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name} (ms):", + value)) + result[f"p{p_word}_{metric_attribute_name}_ms"] = value + + if task_type == TaskType.GENERATION: + process_one_metric("ttft", "TTFT", "Time to First Token") + process_one_metric("tpot", "TPOT", + "Time per Output Token (excl. 1st token)") + process_one_metric("itl", "ITL", "Inter-token Latency") + process_one_metric("e2el", "E2EL", "End-to-end Latency") + + print("=" * 50) + + if profile: + print("Stopping profiler...") + profile_input = RequestFuncInput( + model=model_id, + prompt=test_prompt, + api_url=base_url + "/stop_profile", + prompt_len=test_prompt_len, + output_len=test_output_len, + logprobs=logprobs, + ) + profile_output = await request_func(request_func_input=profile_input, + session=session) + if profile_output.success: + print("Profiler stopped") + + await session.close() + return result + + +def check_goodput_args(args): + # Check and parse goodput arguments + goodput_config_dict = {} + VALID_NAMES = ["ttft", "tpot", "e2el"] + if args.goodput: + goodput_config_dict = parse_goodput(args.goodput) + for slo_name, slo_val in goodput_config_dict.items(): + if slo_name not in VALID_NAMES: + raise ValueError( + f"Invalid metric name found, {slo_name}: {slo_val}. " + "The service level objective name should be one of " + f"{str(VALID_NAMES)}. ") + if slo_val < 0: + raise ValueError( + f"Invalid value found, {slo_name}: {slo_val}. " + "The service level objective value should be " + "non-negative.") + return goodput_config_dict + + +def parse_goodput(slo_pairs): + goodput_config_dict = {} + try: + for slo_pair in slo_pairs: + slo_name, slo_val = slo_pair.split(":") + goodput_config_dict[slo_name] = float(slo_val) + except ValueError as err: + raise argparse.ArgumentTypeError( + "Invalid format found for service level objectives. " + "Specify service level objectives for goodput as \"KEY:VALUE\" " + "pairs, where the key is a metric name, and the value is a " + "number in milliseconds.") from err + return goodput_config_dict + + +def save_to_pytorch_benchmark_format(args: argparse.Namespace, + results: dict[str, Any], + file_name: str) -> None: + metrics = [ + "median_ttft_ms", "mean_ttft_ms", "std_ttft_ms", "p99_ttft_ms", + "mean_tpot_ms", "median_tpot_ms", "std_tpot_ms", "p99_tpot_ms", + "median_itl_ms", "mean_itl_ms", "std_itl_ms", "p99_itl_ms" + ] + # These raw data might be useful, but they are rather big. 
They can be added + # later if needed + ignored_metrics = ["ttfts", "itls", "generated_texts", "errors"] + pt_records = convert_to_pytorch_benchmark_format( + args=args, + metrics={k: [results[k]] + for k in metrics if k in results}, + extra_info={ + k: results[k] + for k in results if k not in metrics and k not in ignored_metrics + }) + if pt_records: + # Don't use json suffix here as we don't want CI to pick it up + pt_file = f"{os.path.splitext(file_name)[0]}.pytorch.json" + write_to_json(pt_file, pt_records) + + +def add_cli_args(parser: argparse.ArgumentParser): + add_dataset_parser(parser) + parser.add_argument( + "--label", + type=str, + default=None, + help="The label (prefix) of the benchmark results. If not specified, " + "the value of '--backend' will be used as the label.", + ) + parser.add_argument( + "--backend", + type=str, + default="openai", + choices=list(ASYNC_REQUEST_FUNCS.keys()), + help="The type of backend or endpoint to use for the benchmark." + ) + parser.add_argument( + "--endpoint-type", + type=str, + default=None, + choices=list(ASYNC_REQUEST_FUNCS.keys()), + action=DeprecatedEndpointTypeAction, + help="'--endpoint-type' is deprecated and will be removed in v0.11.0. " + "Please use '--backend' instead.", + ) + parser.add_argument( + "--base-url", + type=str, + default=None, + help="Server or API base url if not using http host and port.", + ) + # Use 127.0.0.1 here instead of localhost to force the use of ipv4 + parser.add_argument("--host", type=str, default="127.0.0.1") + parser.add_argument("--port", type=int, default=8000) + parser.add_argument( + "--endpoint", + type=str, + default="/v1/completions", + help="API endpoint.", + ) + parser.add_argument( + "--header", + metavar="KEY=VALUE", + nargs="*", + help="Key-value pairs (e.g, --header x-additional-info=0.3.3) " + "for headers to be passed with each request. These headers override " \ + "per backend constants and values set via environment variable, and " \ + "will be overriden by other arguments (such as request ids)." + ) + parser.add_argument( + "--max-concurrency", + type=int, + default=None, + help="Maximum number of concurrent requests. This can be used " + "to help simulate an environment where a higher level component " + "is enforcing a maximum number of concurrent requests. While the " + "--request-rate argument controls the rate at which requests are " + "initiated, this argument will control how many are actually allowed " + "to execute at a time. This means that when used in combination, the " + "actual request rate may be lower than specified with --request-rate, " + "if the server is not processing requests fast enough to keep up.", + ) + + parser.add_argument( + "--model", + type=str, + required=True, + help="Name of the model.", + ) + parser.add_argument( + "--tokenizer", + type=str, + help= + "Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501 + ) + parser.add_argument("--use-beam-search", action="store_true") + parser.add_argument( + "--logprobs", + type=int, + default=None, + help=("Number of logprobs-per-token to compute & return as part of " + "the request. If unspecified, then either (1) if beam search " + "is disabled, no logprobs are computed & a single dummy " + "logprob is returned for each token; or (2) if beam search " + "is enabled 1 logprob per token is computed"), + ) + parser.add_argument( + "--request-rate", + type=float, + default=float("inf"), + help="Number of requests per second. 
If this is inf, " + "then all the requests are sent at time 0. " + "Otherwise, we use Poisson process or gamma distribution " + "to synthesize the request arrival times.", + ) + parser.add_argument( + "--burstiness", + type=float, + default=1.0, + help="Burstiness factor of the request generation. " + "Only take effect when request_rate is not inf. " + "Default value is 1, which follows Poisson process. " + "Otherwise, the request intervals follow a gamma distribution. " + "A lower burstiness value (0 < burstiness < 1) results in more " + "bursty requests. A higher burstiness value (burstiness > 1) " + "results in a more uniform arrival of requests.", + ) + parser.add_argument( + "--trust-remote-code", + action="store_true", + help="Trust remote code from huggingface", + ) + parser.add_argument( + "--disable-tqdm", + action="store_true", + help="Specify to disable tqdm progress bar.", + ) + parser.add_argument( + "--profile", + action="store_true", + help="Use Torch Profiler. The endpoint must be launched with " + "VLLM_TORCH_PROFILER_DIR to enable profiler.", + ) + parser.add_argument( + "--save-result", + action="store_true", + help="Specify to save benchmark results to a json file", + ) + parser.add_argument( + "--save-detailed", + action="store_true", + help="When saving the results, whether to include per request " + "information such as response, error, ttfs, tpots, etc.", + ) + parser.add_argument( + "--append-result", + action="store_true", + help="Append the benchmark result to the existing json file.", + ) + parser.add_argument( + "--metadata", + metavar="KEY=VALUE", + nargs="*", + help="Key-value pairs (e.g, --metadata version=0.3.3 tp=1) " + "for metadata of this run to be saved in the result JSON file " + "for record keeping purposes.", + ) + parser.add_argument( + "--result-dir", + type=str, + default=None, + help="Specify directory to save benchmark json results." + "If not specified, results are saved in the current directory.", + ) + parser.add_argument( + "--result-filename", + type=str, + default=None, + help="Specify the filename to save benchmark json results." + "If not specified, results will be saved in " + "{label}-{args.request_rate}qps-{base_model_id}-{current_dt}.json" # noqa + " format.", + ) + parser.add_argument( + "--ignore-eos", + action="store_true", + help="Set ignore_eos flag when sending the benchmark request." + "Warning: ignore_eos is not supported in deepspeed_mii and tgi.") + parser.add_argument( + "--percentile-metrics", + type=str, + default="ttft,tpot,itl", + help="Comma-separated list of selected metrics to report percentils. " + "This argument specifies the metrics to report percentiles. " + "Allowed metric names are \"ttft\", \"tpot\", \"itl\", \"e2el\". ") + parser.add_argument( + "--metric-percentiles", + type=str, + default="99", + help="Comma-separated list of percentiles for selected metrics. " + "To report 25-th, 50-th, and 75-th percentiles, use \"25,50,75\". " + "Default value is \"99\"." + "Use \"--percentile-metrics\" to select metrics.", + ) + parser.add_argument( + "--goodput", + nargs="+", + required=False, + help="Specify service level objectives for goodput as \"KEY:VALUE\" " + "pairs, where the key is a metric name, and the value is in " + "milliseconds. Multiple \"KEY:VALUE\" pairs can be provided, " + "separated by spaces. Allowed request level metric names are " + "\"ttft\", \"tpot\", \"e2el\". 
For more context on the definition of " + "goodput, refer to DistServe paper: https://arxiv.org/pdf/2401.09670 " + "and the blog: https://hao-ai-lab.github.io/blogs/distserve", + ) + parser.add_argument( + "--request-id-prefix", + type=str, + required=False, + default="benchmark-serving", + help="Specify the prefix of request id.", + ) + + sampling_group = parser.add_argument_group("sampling parameters") + sampling_group.add_argument( + "--top-p", + type=float, + default=None, + help="Top-p sampling parameter. Only has effect on " + "openai-compatible backends.", + ) + sampling_group.add_argument( + "--top-k", + type=int, + default=None, + help="Top-k sampling parameter. Only has effect on " + "openai-compatible backends.", + ) + sampling_group.add_argument( + "--min-p", + type=float, + default=None, + help="Min-p sampling parameter. Only has effect on " + "openai-compatible backends.", + ) + sampling_group.add_argument( + "--temperature", + type=float, + default=None, + help="Temperature sampling parameter. Only has effect on " + "openai-compatible backends. If not specified, default to greedy " + "decoding (i.e. temperature==0.0).", + ) + + parser.add_argument( + '--tokenizer-mode', + type=str, + default="auto", + choices=['auto', 'slow', 'mistral', 'custom'], + help='The tokenizer mode.\n\n* "auto" will use the ' + 'fast tokenizer if available.\n* "slow" will ' + 'always use the slow tokenizer. \n* ' + '"mistral" will always use the `mistral_common` tokenizer. \n*' + '"custom" will use --tokenizer to select the preregistered tokenizer.') + + parser.add_argument("--served-model-name", + type=str, + default=None, + help="The model name used in the API. " + "If not specified, the model name will be the " + "same as the ``--model`` argument. ") + + parser.add_argument("--lora-modules", + nargs='+', + default=None, + help="A subset of LoRA module names passed in when " + "launching the server. For each request, the " + "script chooses a LoRA module at random.") + + parser.add_argument( + "--ramp-up-strategy", + type=str, + default=None, + choices=["linear", "exponential"], + help="The ramp-up strategy. This would be used to " + "ramp up the request rate from initial RPS to final " + "RPS rate (specified by --ramp-up-start-rps and " + "--ramp-up-end-rps.) over the duration of the benchmark.") + parser.add_argument( + "--ramp-up-start-rps", + type=int, + default=None, + help="The starting request rate for ramp-up (RPS). " + "Needs to be specified when --ramp-up-strategy is used.", + ) + parser.add_argument( + "--ramp-up-end-rps", + type=int, + default=None, + help="The ending request rate for ramp-up (RPS). " + "Needs to be specified when --ramp-up-strategy is used.", + ) + parser.add_argument( + "--ready-check-timeout-sec", + type=int, + default=600, + help="Maximum time to wait for the endpoint to become ready " + "in seconds (default: 600 seconds / 10 minutes). If set to 0, " + "the ready check will be skipped." + ) + + +def main(args: argparse.Namespace) -> dict[str, Any]: + return asyncio.run(main_async(args)) + + +async def main_async(args: argparse.Namespace) -> dict[str, Any]: + print(args) + random.seed(args.seed) + np.random.seed(args.seed) + + # Validate ramp-up arguments + if args.ramp_up_strategy is not None: + if args.request_rate != float("inf"): + raise ValueError( + "When using ramp-up, do not specify --request-rate. " + "The request rate will be controlled by ramp-up parameters. 
" + "Please remove the --request-rate argument.") + if args.ramp_up_start_rps is None or args.ramp_up_end_rps is None: + raise ValueError( + "When using --ramp-up-strategy, both --ramp-up-start-rps and " + "--ramp-up-end-rps must be specified") + if args.ramp_up_start_rps < 0 or args.ramp_up_end_rps < 0: + raise ValueError("Ramp-up start and end RPS must be non-negative") + if args.ramp_up_start_rps > args.ramp_up_end_rps: + raise ValueError("Ramp-up start RPS must be less than end RPS") + if (args.ramp_up_strategy == "exponential" + and args.ramp_up_start_rps == 0): + raise ValueError( + "For exponential ramp-up, the start RPS cannot be 0.") + + label = args.label + model_id = args.model + model_name = args.served_model_name + tokenizer_id = args.tokenizer if args.tokenizer is not None else args.model + tokenizer_mode = args.tokenizer_mode + + if args.base_url is not None: + api_url = f"{args.base_url}{args.endpoint}" + base_url = f"{args.base_url}" + else: + api_url = f"http://{args.host}:{args.port}{args.endpoint}" + base_url = f"http://{args.host}:{args.port}" + + # Headers + headers = None + if args.header: + headers = {} + for item in args.header: + if "=" in item: + kvstring = item.split("=", 1) + headers[kvstring[0].strip()] = kvstring[1].strip() + else: + raise ValueError( + "Invalid header format. Please use KEY=VALUE format.") + + tokenizer = get_tokenizer(tokenizer_id, + tokenizer_mode=tokenizer_mode, + trust_remote_code=args.trust_remote_code) + + if args.dataset_name is None: + raise ValueError( + "Please specify '--dataset-name' and the corresponding " + "'--dataset-path' if required.") + + # Load the dataset. + input_requests = get_samples(args, tokenizer) + goodput_config_dict = check_goodput_args(args) + + # Collect the sampling parameters. + sampling_params = { + k: v + for k, v in { + "top_p": args.top_p, + "top_k": args.top_k, + "min_p": args.min_p, + "temperature": args.temperature, + }.items() if v is not None + } + + # Sampling parameters are only supported by openai-compatible backend. + if sampling_params and args.backend not in OPENAI_COMPATIBLE_BACKENDS: + raise ValueError("Sampling parameters are only supported by " + "openai-compatible backends.") + + if "temperature" not in sampling_params: + sampling_params["temperature"] = 0.0 # Default to greedy decoding. + + # Avoid GC processing "static" data - reduce pause times. 
+ gc.collect() + gc.freeze() + + benchmark_result = await benchmark( + endpoint_type=args.backend, + api_url=api_url, + base_url=base_url, + model_id=model_id, + model_name=model_name, + tokenizer=tokenizer, + input_requests=input_requests, + logprobs=args.logprobs, + request_rate=args.request_rate, + burstiness=args.burstiness, + disable_tqdm=args.disable_tqdm, + profile=args.profile, + selected_percentile_metrics=args.percentile_metrics.split(","), + selected_percentiles=[ + float(p) for p in args.metric_percentiles.split(",") + ], + ignore_eos=args.ignore_eos, + goodput_config_dict=goodput_config_dict, + max_concurrency=args.max_concurrency, + lora_modules=args.lora_modules, + extra_headers=headers, + extra_body=sampling_params, + ramp_up_strategy=args.ramp_up_strategy, + ramp_up_start_rps=args.ramp_up_start_rps, + ramp_up_end_rps=args.ramp_up_end_rps, + ready_check_timeout_sec=args.ready_check_timeout_sec, + ) + + # Save config and results to json + result_json: dict[str, Any] = {} + + # Setup + current_dt = datetime.now().strftime("%Y%m%d-%H%M%S") + result_json["date"] = current_dt + result_json["endpoint_type"] = args.backend # for backward compatibility + result_json["backend"] = args.backend + result_json["label"] = label + result_json["model_id"] = model_id + result_json["tokenizer_id"] = tokenizer_id + result_json["num_prompts"] = args.num_prompts + + # Metadata + if args.metadata: + for item in args.metadata: + if "=" in item: + kvstring = item.split("=", 1) + result_json[kvstring[0].strip()] = kvstring[1].strip() + else: + raise ValueError( + "Invalid metadata format. Please use KEY=VALUE format.") + + # Traffic + result_json["request_rate"] = (args.request_rate if args.request_rate + < float("inf") else "inf") + result_json["burstiness"] = args.burstiness + result_json["max_concurrency"] = args.max_concurrency + + if args.ramp_up_strategy is not None: + result_json["ramp_up_strategy"] = args.ramp_up_strategy + result_json["ramp_up_start_rps"] = args.ramp_up_start_rps + result_json["ramp_up_end_rps"] = args.ramp_up_end_rps + + # Merge with benchmark result + result_json = {**result_json, **benchmark_result} + + if not args.save_detailed: + # Remove fields with too many data points + for field in [ + "input_lens", + "output_lens", + "ttfts", + "itls", + "generated_texts", + "errors", + ]: + if field in result_json: + del result_json[field] + if field in benchmark_result: + del benchmark_result[field] + + # Save to file + if args.save_result or args.append_result: + base_model_id = model_id.split("/")[-1] + max_concurrency_str = (f"-concurrency{args.max_concurrency}" + if args.max_concurrency is not None else "") + label = label or args.backend + if args.ramp_up_strategy is not None: + file_name = f"{label}-ramp-up-{args.ramp_up_strategy}-{args.ramp_up_start_rps}qps-{args.ramp_up_end_rps}qps{max_concurrency_str}-{base_model_id}-{current_dt}.json" # noqa + else: + file_name = f"{label}-{args.request_rate}qps{max_concurrency_str}-{base_model_id}-{current_dt}.json" # noqa + if args.result_filename: + file_name = args.result_filename + if args.result_dir: + os.makedirs(args.result_dir, exist_ok=True) + file_name = os.path.join(args.result_dir, file_name) + with open(file_name, + mode="a+" if args.append_result else "w", + encoding="utf-8") as outfile: + # Append a newline. 
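+            # When appending, each run is written as one JSON object per line
+            # (effectively JSON Lines), so a separating newline is only needed
+            # if the file already has content.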
+ if args.append_result and outfile.tell() != 0: + outfile.write("\n") + json.dump(result_json, outfile) + save_to_pytorch_benchmark_format(args, result_json, file_name) + + return result_json diff --git a/vllm_omni/benchmarks/throughput.py b/vllm_omni/benchmarks/throughput.py new file mode 100644 index 000000000..96e39fd92 --- /dev/null +++ b/vllm_omni/benchmarks/throughput.py @@ -0,0 +1,696 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Benchmark offline inference throughput.""" +import argparse +import dataclasses +import json +import os +import random +import time +import warnings +from typing import Any, Optional, Union + +import torch +import uvloop +from tqdm import tqdm +from transformers import (AutoModelForCausalLM, AutoTokenizer, + PreTrainedTokenizerBase) + +from vllm.benchmarks.datasets import (AIMODataset, BurstGPTDataset, + ConversationDataset, + InstructCoderDataset, + PrefixRepetitionRandomDataset, + RandomDataset, SampleRequest, + ShareGPTDataset, SonnetDataset, + VisionArenaDataset) +from vllm.benchmarks.lib.utils import (convert_to_pytorch_benchmark_format, + write_to_json) +from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs +from vllm.inputs import TextPrompt, TokensPrompt +from vllm.lora.request import LoRARequest +from vllm.outputs import RequestOutput +from vllm.sampling_params import BeamSearchParams +from vllm.utils import merge_async_iterators + + +def run_vllm( + requests: list[SampleRequest], + n: int, + engine_args: EngineArgs, + do_profile: bool, + disable_detokenize: bool = False, +) -> tuple[float, Optional[list[RequestOutput]]]: + from vllm import LLM, SamplingParams + llm = LLM(**dataclasses.asdict(engine_args)) + assert all( + llm.llm_engine.model_config.max_model_len >= ( + request.prompt_len + request.expected_output_len) + for request in requests), ( + "Please ensure that max_model_len is greater than the sum of" + " prompt_len and expected_output_len for all requests.") + # Add the requests to the engine. + prompts: list[Union[TextPrompt, TokensPrompt]] = [] + sampling_params: list[SamplingParams] = [] + for request in requests: + prompts.append( + TokensPrompt(prompt_token_ids=request.prompt["prompt_token_ids"], + multi_modal_data=request.multi_modal_data) + if "prompt_token_ids" in request.prompt else \ + TextPrompt(prompt=request.prompt, + multi_modal_data=request.multi_modal_data)) + sampling_params.append( + SamplingParams( + n=n, + temperature=1.0, + top_p=1.0, + ignore_eos=True, + max_tokens=request.expected_output_len, + detokenize=not disable_detokenize, + )) + lora_requests: Optional[list[LoRARequest]] = None + if engine_args.enable_lora: + lora_requests = [request.lora_request for request in requests] + + use_beam_search = False + + outputs = None + if not use_beam_search: + start = time.perf_counter() + if do_profile: + llm.start_profile() + outputs = llm.generate(prompts, + sampling_params, + lora_request=lora_requests, + use_tqdm=True) + if do_profile: + llm.stop_profile() + end = time.perf_counter() + else: + assert lora_requests is None, "BeamSearch API does not support LoRA" + prompts = [request.prompt for request in requests] + # output_len should be the same for all requests. 
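+        # The single BeamSearchParams built below carries one max_tokens for
+        # the whole batch, so per-request output lengths cannot differ here.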
+ output_len = requests[0].expected_output_len + for request in requests: + assert request.expected_output_len == output_len + start = time.perf_counter() + if do_profile: + llm.start_profile() + llm.beam_search( + prompts, + BeamSearchParams( + beam_width=n, + max_tokens=output_len, + ignore_eos=True, + )) + if do_profile: + llm.stop_profile() + end = time.perf_counter() + return end - start, outputs + + +def run_vllm_chat( + requests: list[SampleRequest], + n: int, + engine_args: EngineArgs, + do_profile: bool, + disable_detokenize: bool = False) -> tuple[float, list[RequestOutput]]: + """ + Run vLLM chat benchmark. This function is recommended ONLY for benchmarking + multimodal models as it properly handles multimodal inputs and chat + formatting. For non-multimodal models, use run_vllm() instead. + """ + from vllm import LLM, SamplingParams + llm = LLM(**dataclasses.asdict(engine_args)) + + assert all( + llm.llm_engine.model_config.max_model_len >= ( + request.prompt_len + request.expected_output_len) + for request in requests), ( + "Please ensure that max_model_len is greater than the sum of " + "prompt_len and expected_output_len for all requests.") + + prompts = [] + sampling_params: list[SamplingParams] = [] + for request in requests: + prompts.append(request.prompt) + sampling_params.append( + SamplingParams( + n=n, + temperature=1.0, + top_p=1.0, + ignore_eos=True, + max_tokens=request.expected_output_len, + detokenize=not disable_detokenize, + )) + start = time.perf_counter() + if do_profile: + llm.start_profile() + outputs = llm.chat(prompts, sampling_params, use_tqdm=True) + if do_profile: + llm.stop_profile() + end = time.perf_counter() + return end - start, outputs + + +async def run_vllm_async( + requests: list[SampleRequest], + n: int, + engine_args: AsyncEngineArgs, + do_profile: bool, + disable_frontend_multiprocessing: bool = False, + disable_detokenize: bool = False, +) -> float: + from vllm import SamplingParams + from vllm.entrypoints.openai.api_server import ( + build_async_engine_client_from_engine_args) + + async with build_async_engine_client_from_engine_args( + engine_args, + disable_frontend_multiprocessing=disable_frontend_multiprocessing, + ) as llm: + model_config = await llm.get_model_config() + assert all( + model_config.max_model_len >= (request.prompt_len + + request.expected_output_len) + for request in requests), ( + "Please ensure that max_model_len is greater than the sum of" + " prompt_len and expected_output_len for all requests.") + + # Add the requests to the engine. 
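+        # Prompts that already carry token ids are wrapped in TokensPrompt so
+        # they are not re-tokenized; plain strings go through TextPrompt.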
+ prompts: list[Union[TextPrompt, TokensPrompt]] = [] + sampling_params: list[SamplingParams] = [] + lora_requests: list[Optional[LoRARequest]] = [] + for request in requests: + prompts.append( + TokensPrompt(prompt_token_ids=request.prompt["prompt_token_ids"], + multi_modal_data=request.multi_modal_data) + if "prompt_token_ids" in request.prompt else \ + TextPrompt(prompt=request.prompt, + multi_modal_data=request.multi_modal_data)) + sampling_params.append( + SamplingParams( + n=n, + temperature=1.0, + top_p=1.0, + ignore_eos=True, + max_tokens=request.expected_output_len, + detokenize=not disable_detokenize, + )) + lora_requests.append(request.lora_request) + + generators = [] + start = time.perf_counter() + if do_profile: + await llm.start_profile() + for i, (prompt, sp, + lr) in enumerate(zip(prompts, sampling_params, lora_requests)): + generator = llm.generate(prompt, + sp, + lora_request=lr, + request_id=f"test{i}") + generators.append(generator) + all_gens = merge_async_iterators(*generators) + async for i, res in all_gens: + pass + if do_profile: + await llm.stop_profile() + end = time.perf_counter() + return end - start + + +def run_hf( + requests: list[SampleRequest], + model: str, + tokenizer: PreTrainedTokenizerBase, + n: int, + max_batch_size: int, + trust_remote_code: bool, + disable_detokenize: bool = False, +) -> float: + llm = AutoModelForCausalLM.from_pretrained( + model, torch_dtype=torch.float16, trust_remote_code=trust_remote_code) + if llm.config.model_type == "llama": + # To enable padding in the HF backend. + tokenizer.pad_token = tokenizer.eos_token + llm = llm.cuda() + + pbar = tqdm(total=len(requests)) + start = time.perf_counter() + batch: list[str] = [] + max_prompt_len = 0 + max_output_len = 0 + for i in range(len(requests)): + prompt = requests[i].prompt + prompt_len = requests[i].prompt_len + output_len = requests[i].expected_output_len + # Add the prompt to the batch. + batch.append(prompt) + max_prompt_len = max(max_prompt_len, prompt_len) + max_output_len = max(max_output_len, output_len) + if len(batch) < max_batch_size and i != len(requests) - 1: + # Check if we can add more requests to the batch. + next_prompt_len = requests[i + 1].prompt_len + next_output_len = requests[i + 1].expected_output_len + if (max(max_prompt_len, next_prompt_len) + + max(max_output_len, next_output_len)) <= 2048: + # We can add more requests to the batch. + continue + + # Generate the sequences. + input_ids = tokenizer(batch, return_tensors="pt", + padding=True).input_ids + llm_outputs = llm.generate( + input_ids=input_ids.cuda(), + do_sample=True, + num_return_sequences=n, + temperature=1.0, + top_p=1.0, + use_cache=True, + max_new_tokens=max_output_len, + ) + if not disable_detokenize: + # Include the decoding time. + tokenizer.batch_decode(llm_outputs, skip_special_tokens=True) + pbar.update(len(batch)) + + # Clear the batch. 
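+        # The running max prompt/output lengths are reset together with the
+        # batch so the next batch starts from a clean length budget.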
+ batch = [] + max_prompt_len = 0 + max_output_len = 0 + end = time.perf_counter() + return end - start + + +def save_to_pytorch_benchmark_format(args: argparse.Namespace, + results: dict[str, Any]) -> None: + pt_records = convert_to_pytorch_benchmark_format( + args=args, + metrics={ + "requests_per_second": [results["requests_per_second"]], + "tokens_per_second": [results["tokens_per_second"]], + }, + extra_info={ + k: results[k] + for k in ["elapsed_time", "num_requests", "total_num_tokens"] + }) + if pt_records: + # Don't use json suffix here as we don't want CI to pick it up + pt_file = f"{os.path.splitext(args.output_json)[0]}.pytorch.json" + write_to_json(pt_file, pt_records) + + +def get_requests(args, tokenizer): + # Common parameters for all dataset types. + common_kwargs = { + "dataset_path": args.dataset_path, + "random_seed": args.seed, + } + sample_kwargs = { + "tokenizer": tokenizer, + "lora_path": args.lora_path, + "max_loras": args.max_loras, + "num_requests": args.num_prompts, + "input_len": args.input_len, + "output_len": args.output_len, + } + + if args.dataset_path is None or args.dataset_name == "random": + sample_kwargs["range_ratio"] = args.random_range_ratio + sample_kwargs["prefix_len"] = args.prefix_len + dataset_cls = RandomDataset + elif args.dataset_name == "sharegpt": + dataset_cls = ShareGPTDataset + if args.backend == "vllm-chat": + sample_kwargs["enable_multimodal_chat"] = True + elif args.dataset_name == "sonnet": + assert tokenizer.chat_template or tokenizer.default_chat_template, ( + "Tokenizer/model must have chat template for sonnet dataset.") + dataset_cls = SonnetDataset + sample_kwargs["prefix_len"] = args.prefix_len + sample_kwargs["return_prompt_formatted"] = True + elif args.dataset_name == "burstgpt": + dataset_cls = BurstGPTDataset + elif args.dataset_name == "hf": + if args.dataset_path in VisionArenaDataset.SUPPORTED_DATASET_PATHS: + dataset_cls = VisionArenaDataset + common_kwargs['dataset_subset'] = None + common_kwargs['dataset_split'] = "train" + sample_kwargs["enable_multimodal_chat"] = True + elif args.dataset_path in InstructCoderDataset.SUPPORTED_DATASET_PATHS: + dataset_cls = InstructCoderDataset + common_kwargs['dataset_split'] = "train" + elif args.dataset_path in ConversationDataset.SUPPORTED_DATASET_PATHS: + dataset_cls = ConversationDataset + common_kwargs['dataset_subset'] = args.hf_subset + common_kwargs['dataset_split'] = args.hf_split + sample_kwargs["enable_multimodal_chat"] = True + elif args.dataset_path in AIMODataset.SUPPORTED_DATASET_PATHS: + dataset_cls = AIMODataset + common_kwargs['dataset_subset'] = None + common_kwargs['dataset_split'] = "train" + elif args.dataset_name == "prefix_repetition": + dataset_cls = PrefixRepetitionRandomDataset + sample_kwargs["prefix_len"] = args.prefix_repetition_prefix_len + sample_kwargs["suffix_len"] = args.prefix_repetition_suffix_len + sample_kwargs["num_prefixes"] = args.prefix_repetition_num_prefixes + sample_kwargs["output_len"] = args.prefix_repetition_output_len + else: + raise ValueError(f"Unknown dataset name: {args.dataset_name}") + # Remove None values + sample_kwargs = {k: v for k, v in sample_kwargs.items() if v is not None} + return dataset_cls(**common_kwargs).sample(**sample_kwargs) + + +def validate_args(args): + """ + Validate command-line arguments. + """ + + # === Deprecation and Defaulting === + if args.dataset is not None: + warnings.warn( + "The '--dataset' argument will be deprecated in the next release. 
" + "Please use '--dataset-name' and '--dataset-path' instead.", + stacklevel=2) + args.dataset_path = args.dataset + + if not getattr(args, "tokenizer", None): + args.tokenizer = args.model + + # === Backend Validation === + valid_backends = {"vllm", "hf", "mii", "vllm-chat"} + if args.backend not in valid_backends: + raise ValueError(f"Unsupported backend: {args.backend}") + + # === Dataset Configuration === + if ( + not args.dataset + and not args.dataset_path + and args.dataset_name not in {"prefix_repetition"} + ): + print( + "When dataset path is not set, it will default to random dataset") + args.dataset_name = 'random' + if args.input_len is None: + raise ValueError("input_len must be provided for a random dataset") + + # === Dataset Name Specific Checks === + # --hf-subset and --hf-split: only used + # when dataset_name is 'hf' + if args.dataset_name != "hf" and ( + getattr(args, "hf_subset", None) is not None + or getattr(args, "hf_split", None) is not None): + warnings.warn("--hf-subset and --hf-split will be ignored \ + since --dataset-name is not 'hf'.", + stacklevel=2) + elif args.dataset_name == "hf": + if args.dataset_path in ( + VisionArenaDataset.SUPPORTED_DATASET_PATHS.keys() + | ConversationDataset.SUPPORTED_DATASET_PATHS): + assert args.backend == "vllm-chat", f"{args.dataset_path} needs to use vllm-chat as the backend." #noqa: E501 + elif args.dataset_path in (InstructCoderDataset.SUPPORTED_DATASET_PATHS + | AIMODataset.SUPPORTED_DATASET_PATHS): + assert args.backend == "vllm", f"{args.dataset_path} needs to use vllm as the backend." #noqa: E501 + else: + raise ValueError( + f"{args.dataset_path} is not supported by hf dataset.") + + # --random-range-ratio: only used when dataset_name is 'random' + if args.dataset_name != 'random' and args.random_range_ratio is not None: + warnings.warn("--random-range-ratio will be ignored since \ + --dataset-name is not 'random'.", + stacklevel=2) + + # --prefix-len: only used when dataset_name is 'random', 'sonnet', or not + # set. + if args.dataset_name not in {"random", "sonnet", None + } and args.prefix_len is not None: + warnings.warn("--prefix-len will be ignored since --dataset-name\ + is not 'random', 'sonnet', or not set.", + stacklevel=2) + + # === LoRA Settings === + if getattr(args, "enable_lora", False) and args.backend != "vllm": + raise ValueError( + "LoRA benchmarking is only supported for vLLM backend") + if getattr(args, "enable_lora", False) and args.lora_path is None: + raise ValueError("LoRA path must be provided when enable_lora is True") + + # === Backend-specific Validations === + if args.backend == "hf" and args.hf_max_batch_size is None: + raise ValueError("HF max batch size is required for HF backend") + if args.backend != "hf" and args.hf_max_batch_size is not None: + raise ValueError("HF max batch size is only for HF backend.") + + if args.backend in {"hf", "mii"} and getattr(args, "quantization", + None) is not None: + raise ValueError("Quantization is only for vLLM backend.") + + if args.backend == "mii" and args.dtype != "auto": + raise ValueError("dtype must be auto for MII backend.") + if args.backend == "mii" and args.n != 1: + raise ValueError("n must be 1 for MII backend.") + if args.backend == "mii" and args.tokenizer != args.model: + raise ValueError( + "Tokenizer must be the same as the model for MII backend.") + + # --data-parallel is not supported currently. 
+    # https://github.com/vllm-project/vllm/issues/16222
+    if args.data_parallel_size > 1:
+        raise ValueError(
+            "Data parallel is not supported in offline benchmark, "
+            "please use benchmark serving instead"
+        )
+
+
+def add_cli_args(parser: argparse.ArgumentParser):
+    parser.add_argument("--backend",
+                        type=str,
+                        choices=["vllm", "hf", "mii", "vllm-chat"],
+                        default="vllm")
+    parser.add_argument(
+        "--dataset-name",
+        type=str,
+        choices=[
+            "sharegpt", "random", "sonnet", "burstgpt", "hf",
+            "prefix_repetition"
+        ],
+        help="Name of the dataset to benchmark on.",
+        default="sharegpt")
+    parser.add_argument(
+        "--dataset",
+        type=str,
+        default=None,
+        help="Path to the ShareGPT dataset, will be deprecated in "
+        "the next release. The dataset is expected to "
+        "be a json in form of list[dict[..., conversations: "
+        "list[dict[..., value: ]]]]")
+    parser.add_argument("--dataset-path",
+                        type=str,
+                        default=None,
+                        help="Path to the dataset")
+    parser.add_argument("--input-len",
+                        type=int,
+                        default=None,
+                        help="Input prompt length for each request")
+    parser.add_argument("--output-len",
+                        type=int,
+                        default=None,
+                        help="Output length for each request. Overrides the "
+                        "output length from the dataset.")
+    parser.add_argument("--n",
+                        type=int,
+                        default=1,
+                        help="Number of generated sequences per prompt.")
+    parser.add_argument("--num-prompts",
+                        type=int,
+                        default=1000,
+                        help="Number of prompts to process.")
+    parser.add_argument("--hf-max-batch-size",
+                        type=int,
+                        default=None,
+                        help="Maximum batch size for HF backend.")
+    parser.add_argument(
+        '--output-json',
+        type=str,
+        default=None,
+        help='Path to save the throughput results in JSON format.')
+    parser.add_argument("--async-engine",
+                        action='store_true',
+                        default=False,
+                        help="Use vLLM async engine rather than LLM class.")
+    parser.add_argument("--disable-frontend-multiprocessing",
+                        action='store_true',
+                        default=False,
+                        help="Disable decoupled async engine frontend.")
+    parser.add_argument(
+        "--disable-detokenize",
+        action="store_true",
+        help=("Do not detokenize the response (i.e. do not include "
+              "detokenization time in the measurement)"))
+    # LoRA
+    parser.add_argument(
+        "--lora-path",
+        type=str,
+        default=None,
+        help="Path to the lora adapters to use. This can be an absolute path, "
+        "a relative path, or a Hugging Face model identifier.")
+    parser.add_argument(
+        "--prefix-len",
+        type=int,
+        default=0,
+        help="Number of fixed prefix tokens before the random "
+        "context in a request (default: 0).",
+    )
+    # random dataset
+    parser.add_argument(
+        "--random-range-ratio",
+        type=float,
+        default=0.0,
+        help="Range ratio for sampling input/output length, "
+        "used only for RandomDataset. Must be in the range [0, 1) to define "
+        "a symmetric sampling range "
+        "[length * (1 - range_ratio), length * (1 + range_ratio)].",
+    )
+
+    # hf dataset
+    parser.add_argument("--hf-subset",
+                        type=str,
+                        default=None,
+                        help="Subset of the HF dataset.")
+    parser.add_argument("--hf-split",
+                        type=str,
+                        default=None,
+                        help="Split of the HF dataset.")
+    parser.add_argument(
+        "--profile",
+        action="store_true",
+        default=False,
+        help="Use Torch Profiler. 
The env variable " + "VLLM_TORCH_PROFILER_DIR must be set to enable profiler.") + + # prefix repetition dataset + prefix_repetition_group = parser.add_argument_group( + "prefix repetition dataset options") + prefix_repetition_group.add_argument( + "--prefix-repetition-prefix-len", + type=int, + default=None, + help="Number of prefix tokens per request, used only for prefix " + "repetition dataset.", + ) + prefix_repetition_group.add_argument( + "--prefix-repetition-suffix-len", + type=int, + default=None, + help="Number of suffix tokens per request, used only for prefix " + "repetition dataset. Total input length is prefix_len + suffix_len.", + ) + prefix_repetition_group.add_argument( + "--prefix-repetition-num-prefixes", + type=int, + default=None, + help="Number of prefixes to generate, used only for prefix repetition " + "dataset. Prompts per prefix is num_requests // num_prefixes.", + ) + prefix_repetition_group.add_argument( + "--prefix-repetition-output-len", + type=int, + default=None, + help="Number of output tokens per request, used only for prefix " + "repetition dataset.", + ) + + parser = AsyncEngineArgs.add_cli_args(parser) + + +def main(args: argparse.Namespace): + if args.tokenizer is None: + args.tokenizer = args.model + validate_args(args) + if args.seed is None: + args.seed = 0 + random.seed(args.seed) + # Sample the requests. + tokenizer = AutoTokenizer.from_pretrained( + args.tokenizer, trust_remote_code=args.trust_remote_code) + requests = get_requests(args, tokenizer) + is_multi_modal = any(request.multi_modal_data is not None + for request in requests) + request_outputs: Optional[list[RequestOutput]] = None + if args.backend == "vllm": + if args.async_engine: + elapsed_time = uvloop.run( + run_vllm_async( + requests, + args.n, + AsyncEngineArgs.from_cli_args(args), + disable_frontend_multiprocessing=args.disable_frontend_multiprocessing, + disable_detokenize=args.disable_detokenize, + do_profile=args.profile, + )) + else: + elapsed_time, request_outputs = run_vllm( + requests, args.n, EngineArgs.from_cli_args(args), + disable_detokenize=args.disable_detokenize, + do_profile=args.profile) + elif args.backend == "hf": + assert args.tensor_parallel_size == 1 + if args.profile: + raise NotImplementedError( + "Profiling not implemented yet for backend='hf'.") + elapsed_time = run_hf(requests, args.model, tokenizer, args.n, + args.hf_max_batch_size, args.trust_remote_code, + args.disable_detokenize) + elif args.backend == "vllm-chat": + elapsed_time, request_outputs = run_vllm_chat( + requests, args.n, EngineArgs.from_cli_args(args), + disable_detokenize=args.disable_detokenize, do_profile=args.profile) + else: + raise ValueError(f"Unknown backend: {args.backend}") + + if request_outputs: + # Note: with the vllm and vllm-chat backends, + # we have request_outputs, which we use to count tokens. 
+ total_prompt_tokens = 0 + total_output_tokens = 0 + for ro in request_outputs: + if not isinstance(ro, RequestOutput): + continue + total_prompt_tokens += len( + ro.prompt_token_ids) if ro.prompt_token_ids else 0 + total_output_tokens += sum( + len(o.token_ids) for o in ro.outputs if o) + total_num_tokens = total_prompt_tokens + total_output_tokens + else: + total_num_tokens = sum(r.prompt_len + r.expected_output_len + for r in requests) + total_output_tokens = sum(r.expected_output_len for r in requests) + total_prompt_tokens = total_num_tokens - total_output_tokens + + if is_multi_modal and args.backend != "vllm-chat": + print("\033[91mWARNING\033[0m: Multi-modal request with " + f"{args.backend} backend detected. The " + "following metrics are not accurate because image tokens are not" + " counted. See vllm-project/vllm/issues/9778 for details.") + # TODO(vllm-project/vllm/issues/9778): Count multi-modal token length. + # vllm-chat backend counts the image tokens now + + print(f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, " + f"{total_num_tokens / elapsed_time:.2f} total tokens/s, " + f"{total_output_tokens / elapsed_time:.2f} output tokens/s") + print(f"Total num prompt tokens: {total_prompt_tokens}") + print(f"Total num output tokens: {total_output_tokens}") + + # Output JSON results if specified + if args.output_json: + results = { + "elapsed_time": elapsed_time, + "num_requests": len(requests), + "total_num_tokens": total_num_tokens, + "requests_per_second": len(requests) / elapsed_time, + "tokens_per_second": total_num_tokens / elapsed_time, + } + with open(args.output_json, "w") as f: + json.dump(results, f, indent=4) + save_to_pytorch_benchmark_format(args, results) From 1e5842e69a2c7170cace5af70cc06101692b39c6 Mon Sep 17 00:00:00 2001 From: wangyu31577 Date: Tue, 16 Dec 2025 15:34:44 +0800 Subject: [PATCH 11/26] =?UTF-8?q?=E6=8F=90=E4=BA=A4benchmark?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/e2e/online_serving/test_qwen2_5_omni.py | 79 +++++++++++++------ vllm_omni/benchmarks/serve.py | 4 +- 2 files changed, 59 insertions(+), 24 deletions(-) diff --git a/tests/e2e/online_serving/test_qwen2_5_omni.py b/tests/e2e/online_serving/test_qwen2_5_omni.py index 7521da469..dd0398d89 100644 --- a/tests/e2e/online_serving/test_qwen2_5_omni.py +++ b/tests/e2e/online_serving/test_qwen2_5_omni.py @@ -1,17 +1,14 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ -E2E Online tests for Qwen3-Omni model with video input and audio output. +E2E Online tests for Qwen2_5-Omni model with video input and audio output. """ import os from pathlib import Path - -import openai import pytest -import time -from vllm.assets.video import VideoAsset +import subprocess from tests.conftest import OmniServer os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" @@ -23,23 +20,61 @@ # Create parameter combinations for model and stage config test_params = [(model, stage_config) for model in models for stage_config in stage_configs] +@pytest.fixture(scope="module") +def omni_server(request): + """Start vLLM-Omni server as a subprocess with actual model weights. + Uses module scope so the server starts only once for all tests. + Multi-stage initialization can take 10-20+ minutes. 
+ """ + model, stage_config_path = request.param + #with OmniServer(model, ["--stage-configs-path", stage_config_path]) as server: + with OmniServer(model, []) as server: + yield server -@pytest.fixture -def client(omni_server): - """OpenAI client for the running vLLM-Omni server.""" - return openai.OpenAI( - base_url=f"http://{omni_server.host}:{omni_server.port}/v1", - api_key="EMPTY", - ) - - -@pytest.mark.parametrize("test_param", test_params) -def test_video_to_audio( - test_param, +@pytest.mark.parametrize("omni_server", test_params, indirect=True) +def test_mix_to_audio( + omni_server, ) -> None: - """Test processing video, generating audio output via OpenAI API.""" + """Test processing video+audio+image, generating audio output via OpenAI API.""" # Create data URL for the base64 encoded video - model, stage_config_path = test_param - with OmniServer(model, []) as server: - time.sleep(1000000) - pass + command = [ + "vllm-omni", + "bench", + "serve", + "--model", + omni_server.model, + "--host", + omni_server.host, + "--port", + str(omni_server.port), + "--dataset-name", + "random-mm", + "--request_rate", + "1", + "--random-input-len", + "32", + "--random-range-ratio", + "0.0", + "--random-mm-base-items-per-request", + "2", + "--random-mm-num-mm-items-range-ratio", + "0", + "--random-mm-limit-mm-per-prompt", + '{"image":10, "video": 1, "audio": 1}', + "--random-mm-bucket-config", + '{"(640,640,1)":0.5, "(0,1,1)": 0.1, "(256, 256, 2)": 0.4}', + "--ignore-eos", + "--random-output-len", + "4", + "--num-prompts", + "5", + "--endpoint", + "/v1/chat/completions", + "--backend", + "openai-chat", + ] + result = subprocess.run(command, capture_output=True, text=True) + print(result.stdout) + print(result.stderr) + + assert result.returncode == 0, f"Benchmark failed: {result.stderr}" diff --git a/vllm_omni/benchmarks/serve.py b/vllm_omni/benchmarks/serve.py index 2a042802d..6de8dc4c2 100644 --- a/vllm_omni/benchmarks/serve.py +++ b/vllm_omni/benchmarks/serve.py @@ -4,10 +4,10 @@ On the server side, run one of the following commands to launch the vLLM OpenAI API server: - vllm serve + vllm-omni serve On the client side, run: - vllm bench serve \ + vllm-omni bench serve \ --backend \ --label \ --model \ From d80d7095212600ff435e3799a373149d4f2f898c Mon Sep 17 00:00:00 2001 From: wangyu31577 Date: Wed, 17 Dec 2025 16:35:27 +0800 Subject: [PATCH 12/26] =?UTF-8?q?=E6=8F=90=E4=BA=A4benchmark?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- vllm_omni/benchmarks/latency.py | 2 +- vllm_omni/benchmarks/lib/__init__.py | 3 + .../benchmarks/lib/endpoint_request_func.py | 531 ++++++++++++++++++ vllm_omni/benchmarks/lib/ready_checker.py | 71 +++ vllm_omni/benchmarks/lib/utils.py | 78 +++ vllm_omni/benchmarks/serve.py | 10 +- vllm_omni/benchmarks/throughput.py | 4 +- 7 files changed, 691 insertions(+), 8 deletions(-) create mode 100644 vllm_omni/benchmarks/lib/__init__.py create mode 100644 vllm_omni/benchmarks/lib/endpoint_request_func.py create mode 100644 vllm_omni/benchmarks/lib/ready_checker.py create mode 100644 vllm_omni/benchmarks/lib/utils.py diff --git a/vllm_omni/benchmarks/latency.py b/vllm_omni/benchmarks/latency.py index 05378ec74..e9fd9d5da 100644 --- a/vllm_omni/benchmarks/latency.py +++ b/vllm_omni/benchmarks/latency.py @@ -13,7 +13,7 @@ from tqdm import tqdm import vllm.envs as envs -from vllm.benchmarks.lib.utils import (convert_to_pytorch_benchmark_format, +from vllm_omni.benchmarks.lib.utils import (convert_to_pytorch_benchmark_format, 
write_to_json) from vllm.engine.arg_utils import EngineArgs from vllm.inputs import PromptType diff --git a/vllm_omni/benchmarks/lib/__init__.py b/vllm_omni/benchmarks/lib/__init__.py new file mode 100644 index 000000000..005e87af6 --- /dev/null +++ b/vllm_omni/benchmarks/lib/__init__.py @@ -0,0 +1,3 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Benchmark library utilities.""" diff --git a/vllm_omni/benchmarks/lib/endpoint_request_func.py b/vllm_omni/benchmarks/lib/endpoint_request_func.py new file mode 100644 index 000000000..661a1b4f4 --- /dev/null +++ b/vllm_omni/benchmarks/lib/endpoint_request_func.py @@ -0,0 +1,531 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""The request function for API endpoints.""" + +import io +import json +import os +import sys +import time +import traceback +from collections.abc import Awaitable +from dataclasses import dataclass, field +from typing import Optional, Protocol, Union +import aiohttp +from tqdm.asyncio import tqdm + +AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60) + + +class StreamedResponseHandler: + """Handles streaming HTTP responses by accumulating chunks until complete + messages are available.""" + + def __init__(self): + self.buffer = "" + + def add_chunk(self, chunk_bytes: bytes) -> list[str]: + """Add a chunk of bytes to the buffer and return any complete + messages.""" + chunk_str = chunk_bytes.decode("utf-8") + self.buffer += chunk_str + + messages = [] + + # Split by double newlines (SSE message separator) + while "\n\n" in self.buffer: + message, self.buffer = self.buffer.split("\n\n", 1) + message = message.strip() + if message: + messages.append(message) + + # if self.buffer is not empty, check if it is a complete message + # by removing data: prefix and check if it is a valid JSON + if self.buffer.startswith("data: "): + message_content = self.buffer.removeprefix("data: ").strip() + if message_content == "[DONE]": + messages.append(self.buffer.strip()) + self.buffer = "" + elif message_content: + try: + json.loads(message_content) + messages.append(self.buffer.strip()) + self.buffer = "" + except json.JSONDecodeError: + # Incomplete JSON, wait for more chunks. + pass + + return messages + + +@dataclass +class RequestFuncInput: + """The input for the request function.""" + prompt: str + api_url: str + prompt_len: int + output_len: int + model: str + model_name: Optional[str] = None + logprobs: Optional[int] = None + extra_headers: Optional[dict] = None + extra_body: Optional[dict] = None + multi_modal_content: Optional[Union[dict, list[dict]]] = None + ignore_eos: bool = False + language: Optional[str] = None + request_id: Optional[str] = None + + +@dataclass +class RequestFuncOutput: + """The output of the request function including metrics.""" + generated_text: str = "" + success: bool = False + latency: float = 0.0 + output_tokens: int = 0 + ttft: float = 0.0 # Time to first token + itl: list[float] = field( + default_factory=list) # list of inter-token latencies + tpot: float = 0.0 # avg next-token latencies + prompt_len: int = 0 + error: str = "" + start_time: float = 0.0 + + +class RequestFunc(Protocol): + def __call__( + self, + request_func_input: RequestFuncInput, + session: aiohttp.ClientSession, + pbar: Optional[tqdm] = None, + ) -> Awaitable[RequestFuncOutput]: + ... 
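# --- Editorial sketch (not part of the patch) -------------------------------
# A minimal illustration of how StreamedResponseHandler, defined above,
# reassembles SSE messages that arrive split across TCP chunks. The byte
# strings are hypothetical example payloads, not fixtures from this repo.
handler = StreamedResponseHandler()
assert handler.add_chunk(b'data: {"choices": [{"text": "Hel') == []
assert handler.add_chunk(b'lo"}]}\n\n') == [
    'data: {"choices": [{"text": "Hello"}]}'
]
assert handler.add_chunk(b"data: [DONE]") == ["data: [DONE]"]
# The request functions below strip the "data: " prefix from each returned
# message before calling json.loads() on the payload.
# -----------------------------------------------------------------------------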
+ +async def async_request_openai_completions( + request_func_input: RequestFuncInput, + session: aiohttp.ClientSession, + pbar: Optional[tqdm] = None, +) -> RequestFuncOutput: + """The async request function for the OpenAI Completions API. + + Args: + request_func_input: The input for the request function. + pbar: The progress bar to display the progress. + + Returns: + The output of the request function. + """ + api_url = request_func_input.api_url + assert api_url.endswith( + ("completions", "profile") + ), "OpenAI Completions API URL must end with 'completions' or 'profile'." + + payload = { + "model": request_func_input.model_name + if request_func_input.model_name else request_func_input.model, + "prompt": request_func_input.prompt, + "temperature": 0.0, + "repetition_penalty": 1.0, + "max_tokens": request_func_input.output_len, + "logprobs": request_func_input.logprobs, + "stream": True, + "stream_options": { + "include_usage": True, + }, + } + if request_func_input.ignore_eos: + payload["ignore_eos"] = request_func_input.ignore_eos + if request_func_input.extra_body: + payload.update(request_func_input.extra_body) + headers = { + "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}" + } + if request_func_input.extra_headers: + headers |= request_func_input.extra_headers + if request_func_input.request_id: + headers["x-request-id"] = request_func_input.request_id + + output = RequestFuncOutput() + output.prompt_len = request_func_input.prompt_len + + generated_text = "" + st = time.perf_counter() + output.start_time = st + most_recent_timestamp = st + try: + async with session.post(url=api_url, json=payload, + headers=headers) as response: + if response.status == 200: + first_chunk_received = False + handler = StreamedResponseHandler() + + async for chunk_bytes in response.content.iter_any(): + chunk_bytes = chunk_bytes.strip() + if not chunk_bytes: + continue + + messages = handler.add_chunk(chunk_bytes) + for message in messages: + # NOTE: SSE comments (often used as pings) start with + # a colon. These are not JSON data payload and should + # be skipped. + if message.startswith(":"): + continue + + chunk = message.removeprefix("data: ") + + if chunk != "[DONE]": + data = json.loads(chunk) + + # NOTE: Some completion API might have a last + # usage summary response without a token so we + # want to check a token was generated + if choices := data.get("choices"): + # Note that text could be empty here + # e.g. for special tokens + text = choices[0].get("text") + timestamp = time.perf_counter() + # First token + if not first_chunk_received: + first_chunk_received = True + ttft = time.perf_counter() - st + output.ttft = ttft + + # Decoding phase + else: + output.itl.append(timestamp - + most_recent_timestamp) + + most_recent_timestamp = timestamp + generated_text += text or "" + elif usage := data.get("usage"): + output.output_tokens = usage.get( + "completion_tokens") + if first_chunk_received: + output.success = True + else: + output.success = False + output.error = ( + "Never received a valid chunk to calculate TTFT." 
+ "This response will be marked as failed!") + output.generated_text = generated_text + output.latency = most_recent_timestamp - st + else: + output.error = response.reason or "" + output.success = False + except Exception: + output.success = False + exc_info = sys.exc_info() + output.error = "".join(traceback.format_exception(*exc_info)) + + if pbar: + pbar.update(1) + return output + + +async def async_request_openai_chat_completions( + request_func_input: RequestFuncInput, + session: aiohttp.ClientSession, + pbar: Optional[tqdm] = None, +) -> RequestFuncOutput: + api_url = request_func_input.api_url + assert api_url.endswith(("chat/completions", "profile")), ( + "OpenAI Chat Completions API URL must end with 'chat/completions'.") + + content = [{"type": "text", "text": request_func_input.prompt}] + if request_func_input.multi_modal_content: + mm_content = request_func_input.multi_modal_content + if isinstance(mm_content, list): + content.extend(mm_content) + elif isinstance(mm_content, dict): + content.append(mm_content) + else: + raise TypeError( + "multi_modal_content must be a dict or list[dict] " + "for openai-chat" + ) + payload = { + "model": + request_func_input.model_name + if request_func_input.model_name else request_func_input.model, + "messages": [ + { + "role": "user", + "content": content + }, + ], + "temperature": + 0.0, + "max_completion_tokens": + request_func_input.output_len, + "stream": + True, + "stream_options": { + "include_usage": True, + }, + } + if request_func_input.ignore_eos: + payload["ignore_eos"] = request_func_input.ignore_eos + if request_func_input.extra_body: + payload.update(request_func_input.extra_body) + headers = { + "Content-Type": "application/json", + "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}", + } + if request_func_input.extra_headers: + headers |= request_func_input.extra_headers + if request_func_input.request_id: + headers["x-request-id"] = request_func_input.request_id + + output = RequestFuncOutput() + output.prompt_len = request_func_input.prompt_len + + generated_text = "" + ttft = 0.0 + st = time.perf_counter() + output.start_time = st + most_recent_timestamp = st + try: + async with session.post(url=api_url, json=payload, + headers=headers) as response: + if response.status == 200: + handler = StreamedResponseHandler() + async for chunk_bytes in response.content.iter_any(): + chunk_bytes = chunk_bytes.strip() + if not chunk_bytes: + continue + + messages = handler.add_chunk(chunk_bytes) + for message in messages: + # NOTE: SSE comments (often used as pings) start with + # a colon. These are not JSON data payload and should + # be skipped. 
+                            if message.startswith(":"):
+                                continue
+
+                            chunk = message.removeprefix("data: ")
+
+                            if chunk != "[DONE]":
+                                timestamp = time.perf_counter()
+                                data = json.loads(chunk)
+
+                                if choices := data.get("choices"):
+                                    content = choices[0]["delta"].get("content")
+                                    # First token
+                                    if ttft == 0.0:
+                                        ttft = timestamp - st
+                                        output.ttft = ttft
+
+                                    # Decoding phase
+                                    else:
+                                        output.itl.append(timestamp -
+                                                          most_recent_timestamp)
+
+                                    generated_text += content or ""
+                                elif usage := data.get("usage"):
+                                    output.output_tokens = usage.get(
+                                        "completion_tokens")
+
+                                most_recent_timestamp = timestamp
+
+                output.generated_text = generated_text
+                output.success = True
+                output.latency = most_recent_timestamp - st
+            else:
+                output.error = response.reason or ""
+                output.success = False
+    except Exception:
+        output.success = False
+        exc_info = sys.exc_info()
+        output.error = "".join(traceback.format_exception(*exc_info))
+
+    if pbar:
+        pbar.update(1)
+    return output
+
+
+async def async_request_openai_audio(
+    request_func_input: RequestFuncInput,
+    session: aiohttp.ClientSession,
+    pbar: Optional[tqdm] = None,
+) -> RequestFuncOutput:
+    # Lazy import without PlaceholderModule to avoid vllm dep.
+    import soundfile
+
+    api_url = request_func_input.api_url
+    assert api_url.endswith(("transcriptions", "translations")), (
+        "OpenAI Audio API URL must end with 'transcriptions' "
+        "or 'translations'.")
+
+    content = [{"type": "text", "text": request_func_input.prompt}]
+    payload = {
+        "model":
+        request_func_input.model_name
+        if request_func_input.model_name else request_func_input.model,
+        "temperature":
+        0.0,
+        "max_completion_tokens":
+        request_func_input.output_len,
+        "stream":
+        True,
+        "language":
+        "en",
+        # Flattened due to multipart/form-data
+        "stream_include_usage":
+        True,
+        "stream_continuous_usage_stats":
+        True,
+    }
+    if request_func_input.extra_body:
+        payload.update(request_func_input.extra_body)
+    headers = {
+        "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
+    }
+    if request_func_input.extra_headers:
+        headers |= request_func_input.extra_headers
+    if request_func_input.request_id:
+        headers["x-request-id"] = request_func_input.request_id
+
+    # Send audio file
+    def to_bytes(y, sr):
+        buffer = io.BytesIO()
+        soundfile.write(buffer, y, sr, format="WAV")
+        buffer.seek(0)
+        return buffer
+
+    mm_audio = request_func_input.multi_modal_content
+    if not isinstance(mm_audio, dict) or "audio" not in mm_audio:
+        raise TypeError("multi_modal_content must be a dict containing 'audio'")
+    with to_bytes(*mm_audio["audio"]) as f:
+        form = aiohttp.FormData()
+        form.add_field("file", f, content_type="audio/wav")
+        for key, value in payload.items():
+            form.add_field(key, str(value))
+
+    output = RequestFuncOutput()
+    output.prompt_len = request_func_input.prompt_len
+
+    generated_text = ""
+    ttft = 0.0
+    st = time.perf_counter()
+    output.start_time = st
+    most_recent_timestamp = st
+    try:
+        async with session.post(url=api_url,
+                                data=form,
+                                headers=headers) as response:
+            if response.status == 200:
+                handler = StreamedResponseHandler()
+
+                async for chunk_bytes in response.content.iter_any():
+                    chunk_bytes = chunk_bytes.strip()
+                    if not chunk_bytes:
+                        continue
+
+                    messages = handler.add_chunk(chunk_bytes)
+                    for message in messages:
+                        # Messages from the handler are already decoded
+                        # strings, so only the SSE prefix is stripped here.
+                        chunk = message.removeprefix("data: ")
+                        if chunk != "[DONE]":
+                            timestamp = time.perf_counter()
+                            data = json.loads(chunk)
+
+                            if choices := data.get("choices"):
+                                content = choices[0]["delta"].get(
+                                    "content")
+                                # First token
+                                if ttft == 
0.0: + ttft = timestamp - st + output.ttft = ttft + + # Decoding phase + else: + output.itl.append( + timestamp - most_recent_timestamp) + + generated_text += content or "" + elif usage := data.get("usage"): + output.output_tokens = usage.get( + "completion_tokens") + + most_recent_timestamp = timestamp + + output.generated_text = generated_text + output.success = True + output.latency = most_recent_timestamp - st + else: + output.error = response.reason or "" + output.success = False + except Exception: + output.success = False + exc_info = sys.exc_info() + output.error = "".join(traceback.format_exception(*exc_info)) + + if pbar: + pbar.update(1) + return output + + +async def async_request_openai_embeddings( + request_func_input: RequestFuncInput, + session: aiohttp.ClientSession, + pbar: Optional[tqdm] = None, +): + api_url = request_func_input.api_url + assert api_url.endswith( + "embeddings" + ), "OpenAI Embeddings API URL must end with 'embeddings'." + + headers = { + "Content-Type": "application/json", + "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}", + } + + payload = { + "model": request_func_input.model, + "input": request_func_input.prompt, + } + + output = RequestFuncOutput() + st = time.perf_counter() + output.start_time = st + try: + async with session.post( + url=api_url, + headers=headers, + json=payload + ) as response: + if response.status == 200: + output.latency = time.perf_counter() - st + data = await response.json() + output.success = True + output.generated_text = "" + output.prompt_len = data.get( + "usage", {}).get( + "prompt_tokens", 0) + else: + output.success = False + output.error = response.reason or "" + except Exception as e: + output.success = False + output.error = str(e) + + if pbar: + pbar.update(1) + return output + + +# TODO: Add more request functions for different API protocols. +ASYNC_REQUEST_FUNCS: dict[str, RequestFunc] = { + "vllm": async_request_openai_completions, + "openai": async_request_openai_completions, + "openai-chat": async_request_openai_chat_completions, + "openai-audio": async_request_openai_audio, + "openai-embeddings": async_request_openai_embeddings, +} + +OPENAI_COMPATIBLE_BACKENDS = [ + k for k, v in ASYNC_REQUEST_FUNCS.items() + if v in (async_request_openai_completions, + async_request_openai_chat_completions) +] diff --git a/vllm_omni/benchmarks/lib/ready_checker.py b/vllm_omni/benchmarks/lib/ready_checker.py new file mode 100644 index 000000000..40898864f --- /dev/null +++ b/vllm_omni/benchmarks/lib/ready_checker.py @@ -0,0 +1,71 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Utilities for checking endpoint readiness.""" + +import asyncio +import time +import aiohttp +from tqdm.asyncio import tqdm + +from .endpoint_request_func import (RequestFunc, RequestFuncInput, + RequestFuncOutput) + +async def wait_for_endpoint( + request_func: RequestFunc, + test_input: RequestFuncInput, + session: aiohttp.ClientSession, + timeout_seconds: int = 600, + retry_interval: int = 5, +) -> RequestFuncOutput: + """ + Wait for an endpoint to become available before starting benchmarks. 
+ + Args: + request_func: The async request function to call + test_input: The RequestFuncInput to test with + timeout_seconds: Maximum time to wait in seconds (default: 10 minutes) + retry_interval: Time between retries in seconds (default: 5 seconds) + + Returns: + RequestFuncOutput: The successful response + + Raises: + ValueError: If the endpoint doesn't become available within the timeout + """ + deadline = time.perf_counter() + timeout_seconds + output = RequestFuncOutput(success=False) + print(f"Waiting for endpoint to become up in {timeout_seconds} seconds") + + with tqdm( + total=timeout_seconds, + bar_format="{desc} |{bar}| {elapsed} elapsed, {remaining} remaining", + unit="s", + ) as pbar: + + while True: + # update progress bar + remaining = deadline - time.perf_counter() + elapsed = timeout_seconds - remaining + update_amount = min(elapsed - pbar.n, timeout_seconds - pbar.n) + pbar.update(update_amount) + pbar.refresh() + if remaining <= 0: + pbar.close() + break + + # ping the endpoint using request_func + try: + output = await request_func( + request_func_input=test_input, session=session) + if output.success: + pbar.close() + return output + except aiohttp.ClientConnectorError: + pass + + # retry after a delay + sleep_duration = min(retry_interval, remaining) + if sleep_duration > 0: + await asyncio.sleep(sleep_duration) + + return output diff --git a/vllm_omni/benchmarks/lib/utils.py b/vllm_omni/benchmarks/lib/utils.py new file mode 100644 index 000000000..41a24ae64 --- /dev/null +++ b/vllm_omni/benchmarks/lib/utils.py @@ -0,0 +1,78 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import argparse +import json +import math +import os +from typing import Any + +def convert_to_pytorch_benchmark_format(args: argparse.Namespace, + metrics: dict[str, list], + extra_info: dict[str, Any]) -> list: + """ + Save the benchmark results in the format used by PyTorch OSS benchmark with + on metric per record + https://github.com/pytorch/pytorch/wiki/How-to-integrate-with-PyTorch-OSS-benchmark-database + """ + records = [] + if not os.environ.get("SAVE_TO_PYTORCH_BENCHMARK_FORMAT", False): + return records + + for name, benchmark_values in metrics.items(): + record = { + "benchmark": { + "name": "vLLM benchmark", + "extra_info": { + "args": vars(args), + }, + }, + "model": { + "name": args.model, + }, + "metric": { + "name": name, + "benchmark_values": benchmark_values, + "extra_info": extra_info, + }, + } + + tp = record["benchmark"]["extra_info"]["args"].get( + "tensor_parallel_size") + # Save tensor_parallel_size parameter if it's part of the metadata + if not tp and "tensor_parallel_size" in extra_info: + record["benchmark"]["extra_info"]["args"][ + "tensor_parallel_size"] = extra_info["tensor_parallel_size"] + + records.append(record) + + return records + + +class InfEncoder(json.JSONEncoder): + + def clear_inf(self, o: Any): + if isinstance(o, dict): + return { + str(k) + if not isinstance(k, (str, int, float, bool, type(None))) + else k: self.clear_inf(v) + for k, v in o.items() + } + elif isinstance(o, list): + return [self.clear_inf(v) for v in o] + elif isinstance(o, float) and math.isinf(o): + return "inf" + return o + + def iterencode(self, o: Any, *args, **kwargs) -> Any: + return super().iterencode(self.clear_inf(o), *args, **kwargs) + + +def write_to_json(filename: str, records: list) -> None: + with open(filename, "w") as f: + json.dump( + records, + f, + cls=InfEncoder, + default=lambda o: 
f"<{type(o).__name__} is not JSON serializable>", + ) diff --git a/vllm_omni/benchmarks/serve.py b/vllm_omni/benchmarks/serve.py index 6de8dc4c2..1175d2d28 100644 --- a/vllm_omni/benchmarks/serve.py +++ b/vllm_omni/benchmarks/serve.py @@ -36,15 +36,15 @@ from tqdm.asyncio import tqdm from transformers import PreTrainedTokenizerBase -from vllm.benchmarks.datasets import (SampleRequest, add_dataset_parser, +from vllm_omni.benchmarks.datasets import (SampleRequest, add_dataset_parser, get_samples) -from vllm.benchmarks.lib.endpoint_request_func import ( +from vllm_omni.benchmarks.lib.endpoint_request_func import ( ASYNC_REQUEST_FUNCS, OPENAI_COMPATIBLE_BACKENDS, RequestFuncInput, RequestFuncOutput) -from vllm.benchmarks.lib.ready_checker import wait_for_endpoint -from vllm.benchmarks.lib.utils import (convert_to_pytorch_benchmark_format, +from vllm_omni.benchmarks.lib.ready_checker import wait_for_endpoint +from vllm_omni.benchmarks.lib.utils import (convert_to_pytorch_benchmark_format, write_to_json) -from vllm.transformers_utils.tokenizer import get_tokenizer +from vllm_omni.transformers_utils.tokenizer import get_tokenizer MILLISECONDS_TO_SECONDS_CONVERSION = 1000 diff --git a/vllm_omni/benchmarks/throughput.py b/vllm_omni/benchmarks/throughput.py index 96e39fd92..5056db3db 100644 --- a/vllm_omni/benchmarks/throughput.py +++ b/vllm_omni/benchmarks/throughput.py @@ -16,14 +16,14 @@ from transformers import (AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizerBase) -from vllm.benchmarks.datasets import (AIMODataset, BurstGPTDataset, +from vllm_omni.benchmarks.datasets import (AIMODataset, BurstGPTDataset, ConversationDataset, InstructCoderDataset, PrefixRepetitionRandomDataset, RandomDataset, SampleRequest, ShareGPTDataset, SonnetDataset, VisionArenaDataset) -from vllm.benchmarks.lib.utils import (convert_to_pytorch_benchmark_format, +from vllm_omni.benchmarks.lib.utils import (convert_to_pytorch_benchmark_format, write_to_json) from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs from vllm.inputs import TextPrompt, TokensPrompt From 8d57415cc56070bbe5888ef2d759579f410e3e00 Mon Sep 17 00:00:00 2001 From: wangyu31577 Date: Wed, 17 Dec 2025 22:39:33 +0800 Subject: [PATCH 13/26] =?UTF-8?q?=E6=96=B0=E5=A2=9Ebenchmark?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- vllm_omni/benchmarks/datasets.py | 1714 +---------------- vllm_omni/benchmarks/latency.py | 170 -- .../benchmarks/lib/endpoint_request_func.py | 387 +--- vllm_omni/benchmarks/lib/ready_checker.py | 71 - vllm_omni/benchmarks/lib/utils.py | 78 - vllm_omni/benchmarks/serve.py | 16 +- vllm_omni/benchmarks/throughput.py | 696 ------- vllm_omni/entrypoints/cli/__init__.py | 3 +- .../entrypoints/cli/benchmark/__init__.py | 0 vllm_omni/entrypoints/cli/benchmark/serve.py | 25 + 10 files changed, 54 insertions(+), 3106 deletions(-) delete mode 100644 vllm_omni/benchmarks/latency.py delete mode 100644 vllm_omni/benchmarks/lib/ready_checker.py delete mode 100644 vllm_omni/benchmarks/lib/utils.py delete mode 100644 vllm_omni/benchmarks/throughput.py create mode 100644 vllm_omni/entrypoints/cli/benchmark/__init__.py create mode 100644 vllm_omni/entrypoints/cli/benchmark/serve.py diff --git a/vllm_omni/benchmarks/datasets.py b/vllm_omni/benchmarks/datasets.py index c019dd35a..86a3a0715 100644 --- a/vllm_omni/benchmarks/datasets.py +++ b/vllm_omni/benchmarks/datasets.py @@ -11,38 +11,30 @@ - HuggingFace - VisionArena """ -import argparse import ast import base64 import io import json import 
logging import math -import random -import torchaudio -import torch -import cv2 -import tempfile import os -from abc import ABC, abstractmethod +import tempfile from collections.abc import Iterator, Mapping from contextlib import suppress -from copy import deepcopy -from dataclasses import dataclass -from functools import cache -from io import BytesIO -from typing import Any, Callable, Optional, Union, cast, Dict +from typing import Any, cast, Dict +import cv2 import numpy as np +import torch +import torchaudio from PIL import Image from transformers import PreTrainedTokenizerBase -from typing_extensions import deprecated - -from vllm.lora.request import LoRARequest -from vllm.lora.utils import get_adapter_absolute_path -from vllm.multimodal import MultiModalDataDict -from vllm.multimodal.image import convert_image_mode -from vllm.transformers_utils.tokenizer import AnyTokenizer +from vllm.benchmark.datasets import (RandomDataset, ShareGPTDataset, SpecBench, + SonnetDataset, BurstGPTDataset, ConversationDataset, + VisionArenaDataset, MMVUDataset, InstructCoderDataset, MTBenchDataset, + BlazeditDataset, AIMODataset, NextEditPredictionDataset, ASRDataset, MLPerfDataset, + PrefixRepetitionRandomDataset, CustomDataset, SampleRequest, _ValidateDatasetArgs, + process_image) from vllm.utils import PlaceholderModule try: @@ -68,273 +60,6 @@ logger = logging.getLogger(__name__) -# ----------------------------------------------------------------------------- -# Data Classes -# ----------------------------------------------------------------------------- - - -@dataclass -class SampleRequest: - """ - Represents a single inference request for benchmarking. - """ - - prompt: Union[str, list[str]] - prompt_len: int - expected_output_len: int - multi_modal_data: Optional[ - Union[MultiModalDataDict, dict, list[dict]] - ] = None - lora_request: Optional[LoRARequest] = None - request_id: Optional[str] = None - - -# ----------------------------------------------------------------------------- -# Benchmark Dataset Base Class -# ----------------------------------------------------------------------------- - - -class BenchmarkDataset(ABC): - DEFAULT_SEED = 0 - IS_MULTIMODAL = False - - def __init__( - self, - dataset_path: Optional[str] = None, - random_seed: int = DEFAULT_SEED, - ) -> None: - """ - Initialize the BenchmarkDataset with an optional dataset path and random - seed. - - Args: - dataset_path (Optional[str]): Path to the dataset. If None, it - indicates that a default or random dataset might be used. - random_seed (int): Seed value for reproducible shuffling or - sampling. Defaults to DEFAULT_SEED. - """ - self.dataset_path = dataset_path - # Set the random seed, ensuring that a None value is replaced with the - # default seed. - self.random_seed = (random_seed - if random_seed is not None else self.DEFAULT_SEED) - self.data = None - - def apply_multimodal_chat_transformation( - self, - prompt: str, - mm_content: Optional[ - Union[MultiModalDataDict, dict, list[dict]] - ] = None) -> list[dict]: - """ - Transform a prompt and optional multimodal content into a chat format. - This method is used for chat models that expect a specific conversation - format. 
- """ - content = [{"text": prompt, "type": "text"}] - if mm_content is not None: - if isinstance(mm_content, list): - content.extend(cast(list[dict[str, Any]], mm_content)) - elif isinstance(mm_content, dict): - content.append(mm_content) - else: - raise TypeError( - "Could not process multimodal content of type: " + - f"{type(mm_content)}" - ) - return [{"role": "user", "content": content}] - - def load_data(self) -> None: - """ - Load data from the dataset path into self.data. - - This method must be overridden by subclasses since the method to load - data will vary depending on the dataset format and source. - - Raises: - NotImplementedError: If a subclass does not implement this method. - """ - # TODO (jenniferzhao): add support for downloading data - raise NotImplementedError( - "load_data must be implemented in subclasses.") - - def get_random_lora_request( - self, - max_loras: Optional[int] = None, - lora_path: Optional[str] = None, - ) -> Optional[LoRARequest]: - """ - Optionally select a random LoRA request. - - This method is used when LoRA parameters are provided. It randomly - selects a LoRA based on max_loras. - - Args: - max_loras (Optional[int]): The maximum number of LoRAs available. - If `None`, LoRA is not used. - lora_path (Optional[str]): Path to the LoRA parameters on disk. - If `None`, LoRA is not used. - - Returns: - A new [`LoRARequest`][vllm.lora.request.LoRARequest] - (or `None` if not applicable). - """ - if max_loras is None or lora_path is None: - return None - - # Generate a random LoRA ID in the range [1, max_loras]. - lora_id = random.randint(1, max_loras) - lora_request = LoRARequest( - lora_name=str(lora_id), - lora_int_id=lora_id, - lora_path=lora_path_on_disk(lora_path), - ) - return lora_request - - @abstractmethod - def sample(self, tokenizer: PreTrainedTokenizerBase, - num_requests: int, - request_id_prefix: str = "", - no_oversample: bool = False) -> list[SampleRequest]: - """ - Abstract method to generate sample requests from the dataset. - - Subclasses must override this method to implement dataset-specific logic - for generating a list of SampleRequest objects. - - Args: - tokenizer (PreTrainedTokenizerBase): The tokenizer to be used - for processing the dataset's text. - num_requests (int): The number of sample requests to generate. - request_id_prefix (str): The prefix of request_id. - - Returns: - list[SampleRequest]: A list of sample requests generated from the - dataset. - """ - raise NotImplementedError("sample must be implemented in subclasses.") - - def maybe_oversample_requests( - self, - requests: list[SampleRequest], - num_requests: int, - request_id_prefix: str = "", - no_oversample: bool = False, - ) -> None: - """ - Oversamples the list of requests if its size is less than the desired - number. - - Args: - requests (List[SampleRequest]): The current list of sampled - requests. - num_requests (int): The target number of requests. - request_id_prefix (str): The prefix applied to generated request - identifiers. - - """ - if no_oversample: - logger.info("Skipping oversampling. 
" \ - "Total samples: %d.", len(requests)) - return - - if len(requests) < num_requests: - random.seed(self.random_seed) - additional = deepcopy( - random.choices(requests, k=num_requests - len(requests)) - ) - for i in range(len(additional)): - req = additional[i] - req.request_id = request_id_prefix + str(len(requests) + i) - requests.extend(additional) - logger.info("Oversampled requests to reach %d total samples.", - num_requests) - - -# ----------------------------------------------------------------------------- -# Utility Functions and Global Caches -# ----------------------------------------------------------------------------- - - -def is_valid_sequence( - prompt_len: int, - output_len: int, - min_len: int = 4, - max_prompt_len: int = 1024, - max_total_len: int = 2048, - skip_min_output_len_check: bool = False, -) -> bool: - """ - Validate a sequence based on prompt and output lengths. - - Default pruning criteria are copied from the original `sample_hf_requests` - and `sample_sharegpt_requests` functions in benchmark_serving.py, as well as - from `sample_requests` in benchmark_throughput.py. - """ - # Check for invalid conditions - prompt_too_short = prompt_len < min_len - output_too_short = (not skip_min_output_len_check) and (output_len - < min_len) - prompt_too_long = prompt_len > max_prompt_len - combined_too_long = (prompt_len + output_len) > max_total_len - - # Return True if none of the invalid conditions are met - return not (prompt_too_short or output_too_short or prompt_too_long - or combined_too_long) - - -@cache -def lora_path_on_disk(lora_path: str) -> str: - return get_adapter_absolute_path(lora_path) - - -# Global cache for LoRA tokenizers. -lora_tokenizer_cache: dict[int, AnyTokenizer] = {} - - -def process_image(image: Any) -> Mapping[str, Any]: - """ - Process a single image input and return a multimedia content dictionary. - - Supports the following input types: - - 1. Dictionary with raw image bytes: - Expects a dict with a 'bytes' key - containing raw image data. - Loads the bytes as a PIL.Image.Image. - - 2. PIL.Image.Image input: - Converts the image to RGB. - Saves the image as - a JPEG in memory. - Encodes the JPEG data as a base64 string. - Returns - a dictionary with the image as a base64 data URL. - - 3. String input: - Treats the string as a URL or local file path. - - Prepends "file://" if the string doesn't start with "http://" or - "file://". - Returns a dictionary with the image URL. - - Raises: - ValueError: If the input is not a supported type. - """ - if isinstance(image, dict) and 'bytes' in image: - image = Image.open(BytesIO(image['bytes'])) - if isinstance(image, Image.Image): - image = convert_image_mode(image, "RGB") - with io.BytesIO() as image_data: - image.save(image_data, format="JPEG") - image_base64 = base64.b64encode( - image_data.getvalue()).decode("utf-8") - return { - "type": "image_url", - "image_url": { - "url": f"data:image/jpeg;base64,{image_base64}" - }, - } - - if isinstance(image, str): - image_url = (image if image.startswith( - ("http://", "https://", "file://")) else f"file://{image}") - return {"type": "image_url", "image_url": {"url": image_url}} - - raise ValueError(f"Invalid image input {image}. Must be a PIL.Image.Image" - " or str or dictionary with raw image bytes.") - def process_video(video: Any) -> Mapping[str, Any]: """ @@ -404,199 +129,7 @@ def process_audio(audio: Any) -> Mapping[str, Any]: raise ValueError(f"Invalid audio input {audio}. 
Must be a string of local path/remote url, or a dictionary with raw audio bytes in the form of `{{'bytes': raw_audio_bytes}}`." ) -# ----------------------------------------------------------------------------- -# Random Dataset Implementation (Synthetic Data) -# ----------------------------------------------------------------------------- - - -class RandomDataset(BenchmarkDataset): - """ - Synthetic text-only dataset for serving/throughput benchmarks. - - Strategy: - - Sample input/output token lengths per request from integer-uniform ranges - around configured means (controlled by range_ratio). - - Prepend a fixed random prefix of length prefix_len. - - Generate the remaining tokens as a reproducible sequence: - (offset + index + arange(input_len)) % vocab_size. - - Decode then re-encode/truncate to ensure prompt token counts match. - - Uses numpy.default_rng seeded with random_seed for reproducible sampling. - """ - # Default values copied from benchmark_serving.py for the random dataset. - DEFAULT_PREFIX_LEN = 0 - DEFAULT_RANGE_RATIO = 0.0 - DEFAULT_INPUT_LEN = 1024 - DEFAULT_OUTPUT_LEN = 128 - - def __init__(self, **kwargs) -> None: - super().__init__(**kwargs) - # Use numpy's default_rng for deterministic sampling - # Do not use random.seed() or np.random.seed() elsewhere in this class. - # This ensures that the RNG is isolated from global RNG state. - self._rng = np.random.default_rng(self.random_seed) - - - def sample( - self, - tokenizer: PreTrainedTokenizerBase, - num_requests: int, - request_id_prefix: str = "", - no_oversample: bool = False, - prefix_len: int = DEFAULT_PREFIX_LEN, - range_ratio: float = DEFAULT_RANGE_RATIO, - input_len: int = DEFAULT_INPUT_LEN, - output_len: int = DEFAULT_OUTPUT_LEN, - batchsize: int = 1, - **kwargs, - ) -> list[SampleRequest]: - - input_lens, output_lens, offsets = self.get_sampling_params( - num_requests, range_ratio, input_len, output_len, tokenizer - ) - - # Generate prefix once - prefix_token_ids = self.get_prefix(tokenizer, prefix_len) - vocab_size = tokenizer.vocab_size - - requests = [] - for i in range(num_requests): - prompt, total_input_len = self.generate_token_sequence( - tokenizer=tokenizer, - prefix_token_ids=prefix_token_ids, - prefix_len=prefix_len, - vocab_size=vocab_size, - input_len=int(input_lens[i]), - offset=int(offsets[i]), - index=i, - ) - requests.append( - SampleRequest( - prompt=prompt, - prompt_len=total_input_len, - expected_output_len=int(output_lens[i]), - request_id=request_id_prefix + str(i), - ) - ) - # only used for embeddings benchmark. - if batchsize > 1: - batch_requests = [] - # Create batched requests - for i in range(0, num_requests, batchsize): - batch = requests[i : i + batchsize] - batch_requests.append( - SampleRequest( - prompt=[req.prompt for req in batch], - prompt_len=sum(req.prompt_len for req in batch), - expected_output_len=0, - request_id=request_id_prefix + str(i // batchsize), - ) - ) - requests = batch_requests - return requests - - def get_prefix( - self, tokenizer: PreTrainedTokenizerBase, prefix_len: int - ) -> list[int]: - """ - Get the prefix for the dataset. - """ - return ( - self._rng.integers( - 0, tokenizer.vocab_size, size=prefix_len).tolist() - if prefix_len > 0 - else [] - ) - - def get_sampling_params( - self, - num_requests: int, - range_ratio: float, - input_len: int, - output_len: int, - tokenizer: PreTrainedTokenizerBase, - ) -> tuple[np.ndarray, np.ndarray, np.ndarray]: - """ - Get the sampling parameters for the dataset. 
- """ - # Enforce range_ratio < 1 - if not (0.0 <= range_ratio < 1.0): - raise ValueError("range_ratio must be in [0, 1).") - num_special_tokens = int(tokenizer.num_special_tokens_to_add()) - real_input_len = max(0, int(input_len) - num_special_tokens) - # Bounds use floor for low and ceil for high - input_low = math.floor(real_input_len * (1 - range_ratio)) - input_high = math.ceil(real_input_len * (1 + range_ratio)) - output_low = math.floor(output_len * (1 - range_ratio)) - output_high = math.ceil(output_len * (1 + range_ratio)) - # Ensure the lower bound for output length is at least 1 to - # prevent sampling 0 tokens. - output_low = max(output_low, 1) - - if input_low > input_high: - raise ValueError( - "Invalid input sampling interval: " - f"low={input_low} > high={input_high}" - ) - if output_low > output_high: - raise ValueError( - "Invalid output sampling interval: " - f"low={output_low} > high={output_high}" - ) - - logger.info( - "Sampling input_len from [%s, %s] and output_len from [%s, %s]", - input_low, - input_high, - output_low, - output_high, - ) - - input_lens = self._rng.integers(input_low, input_high + 1, - size=num_requests) - output_lens = self._rng.integers(output_low, output_high + 1, - size=num_requests) - offsets = self._rng.integers(0, tokenizer.vocab_size, - size=num_requests) - return input_lens, output_lens, offsets - - def generate_token_sequence( - self, - *, - tokenizer: PreTrainedTokenizerBase, - prefix_token_ids: list[int], - prefix_len: int, - vocab_size: int, - input_len: int, - offset: int, - index: int, - ) -> tuple[str, int]: - """ - Returns (prompt, total_input_len). - - NOTE: After decoding the prompt we have to encode and decode it again. - This is done because in some cases N consecutive tokens - give a string tokenized into != N number of tokens. - For example for GPT2Tokenizer: - [6880, 6881] -> ['Ġcalls', 'here'] -> - [1650, 939, 486] -> ['Ġcall', 'sh', 'ere'] - To avoid uncontrolled change of the prompt length, - the encoded sequence is truncated before being decoded again. - """ - # Build the inner sequence by sampling sequentially from the vocab - inner_seq = ((offset + index + np.arange(input_len)) - % vocab_size).tolist() - token_sequence = prefix_token_ids + inner_seq - - # Decode, then re-encode and truncate to preserve token count invariants - prompt = tokenizer.decode(token_sequence) - total_input_len = prefix_len + int(input_len) - - re_encoded_sequence = tokenizer.encode( - prompt, add_special_tokens=False)[:total_input_len] - prompt = tokenizer.decode(re_encoded_sequence) - total_input_len = len(re_encoded_sequence) - return prompt, total_input_len # ----------------------------------------------------------------------------- @@ -609,8 +142,8 @@ class RandomMultiModalDataset(RandomDataset): Status: - Images: supported via synthetic RGB data. - - Video: not yet supported (TODO: implement video generation method). - - Audio: not yet supported. + - Video: supported via synthetic bytes data. + - Audio: supported via synthetic bytes data. Sampling overview: 1) Number of items per request is sampled uniformly from the integer range @@ -1011,112 +544,6 @@ def sample( mm_requests.append(sample_request) return mm_requests -# ----------------------------------------------------------------------------- -# ShareGPT Dataset Implementation -# ----------------------------------------------------------------------------- - - -class ShareGPTDataset(BenchmarkDataset): - """ - Implements the ShareGPT dataset. 
Loads data from a JSON file and generates - sample requests based on conversation turns. - """ - - def __init__(self, **kwargs) -> None: - super().__init__(**kwargs) - self.load_data() - - def load_data(self) -> None: - if self.dataset_path is None: - raise ValueError("dataset_path must be provided for loading data.") - - with open(self.dataset_path, encoding="utf-8") as f: - self.data = json.load(f) - # Filter entries with at least two conversation turns. - self.data = [ - entry for entry in self.data - if "conversations" in entry and len(entry["conversations"]) >= 2 - ] - random.seed(self.random_seed) - random.shuffle(self.data) - - def sample( - self, - tokenizer: PreTrainedTokenizerBase, - num_requests: int, - lora_path: Optional[str] = None, - max_loras: Optional[int] = None, - output_len: Optional[int] = None, - enable_multimodal_chat: bool = False, - request_id_prefix: str = "", - no_oversample: bool = False, - **kwargs, - ) -> list: - samples: list = [] - ind = 0 - for entry in self.data: - if len(samples) >= num_requests: - break - prompt, completion = ( - entry["conversations"][0]["value"], - entry["conversations"][1]["value"], - ) - - lora_request = self.get_random_lora_request( - max_loras=max_loras, lora_path=lora_path) - prompt_ids = tokenizer(prompt).input_ids - completion_ids = tokenizer(completion).input_ids - prompt_len = len(prompt_ids) - new_output_len = (len(completion_ids) - if output_len is None else output_len) - if not is_valid_sequence(prompt_len, - new_output_len, - skip_min_output_len_check=output_len - is not None): - continue - if image_path := entry.get("image"): - mm_content = process_image(image_path) - elif video_path := entry.get("video"): - mm_content = process_video(video_path) - else: - mm_content = None - if enable_multimodal_chat: - prompt = self.apply_multimodal_chat_transformation( - prompt, mm_content) - samples.append( - SampleRequest( - prompt=prompt, - prompt_len=prompt_len, - expected_output_len=new_output_len, - lora_request=lora_request, - multi_modal_data=mm_content, - request_id=request_id_prefix + str(ind), - )) - ind += 1 - self.maybe_oversample_requests(samples, - num_requests, - request_id_prefix, - no_oversample) - return samples - - -class _ValidateDatasetArgs(argparse.Action): - """Argparse action to validate dataset name and path compatibility.""" - def __call__(self, parser, namespace, values, option_string=None): - setattr(namespace, self.dest, values) - - # Get current values of both dataset_name and dataset_path - dataset_name = getattr(namespace, 'dataset_name', 'random') - dataset_path = getattr(namespace, 'dataset_path', None) - - # Validate the combination - if dataset_name == "random" and dataset_path is not None: - parser.error( - "Cannot use 'random' dataset with --dataset-path. " - "Please specify the appropriate --dataset-name (e.g., " - "'sharegpt', 'custom', 'sonnet') for your dataset file: " - f"{dataset_path}" - ) def add_dataset_parser(parser: FlexibleArgumentParser): @@ -1667,1148 +1094,33 @@ def get_samples(args, tokenizer) -> list[SampleRequest]: return input_requests -# ----------------------------------------------------------------------------- -# Custom Dataset Implementation -# ----------------------------------------------------------------------------- -class CustomDataset(BenchmarkDataset): - """ - Implements the Custom dataset. Loads data from a JSONL file and generates - sample requests based on conversation turns. 
E.g., - ``` - {"prompt": "What is the capital of India?"} - {"prompt": "What is the capital of Iran?"} - {"prompt": "What is the capital of China?"} - ``` - """ - def __init__(self, **kwargs) -> None: - super().__init__(**kwargs) - self.load_data() - - def load_data(self) -> None: - if self.dataset_path is None: - raise ValueError("dataset_path must be provided for loading data.") - - # self.data will be a list of dictionaries - # e.g., [{"prompt": "What is the capital of India?"}, ...] - # This will be the standardized format which load_data() - # has to convert into depending on the filetype of dataset_path. - # sample() will assume this standardized format of self.data - self.data = [] - - # Load the JSONL file - if self.dataset_path.endswith(".jsonl"): - jsonl_data = pd.read_json(path_or_buf=self.dataset_path, - lines=True) - - # check if the JSONL file has a 'prompt' column - if "prompt" not in jsonl_data.columns: - raise ValueError("JSONL file must contain a 'prompt' column.") - - # Convert each row to a dictionary and append to self.data - # This will convert the DataFrame to a list of dictionaries - # where each dictionary corresponds to a row in the DataFrame. - # This is the standardized format we want for self.data - for _, row in jsonl_data.iterrows(): - self.data.append(row.to_dict()) - else: - raise NotImplementedError( - "Only JSONL format is supported for CustomDataset.") - random.seed(self.random_seed) - random.shuffle(self.data) - def sample( - self, - tokenizer: PreTrainedTokenizerBase, - num_requests: int, - lora_path: Optional[str] = None, - max_loras: Optional[int] = None, - output_len: Optional[int] = None, - enable_multimodal_chat: bool = False, - skip_chat_template: bool = False, - request_id_prefix: str = "", - no_oversample: bool = False, - **kwargs, - ) -> list: - # load all data if needed - self.num_available_samples = len(self.data) - if num_requests <= 0: - num_requests = self.num_available_samples - logger.info("num_requests is set to 0 or negative, " - "so using all available samples: %d", - num_requests) - - sampled_requests = [] - for i, item in enumerate(self.data): - if len(sampled_requests) >= num_requests: - break - prompt = item["prompt"] - - # apply template - if not skip_chat_template: - prompt = tokenizer.apply_chat_template( - [{ - "role": "user", - "content": prompt - }], - add_generation_prompt=True, - tokenize=False, - ) - prompt_len = len(tokenizer(prompt).input_ids) - sampled_requests.append( - SampleRequest( - prompt=prompt, - prompt_len=prompt_len, - expected_output_len=output_len, - request_id=request_id_prefix + str(i), - )) - self.maybe_oversample_requests(sampled_requests, num_requests, - request_id_prefix, no_oversample) - return sampled_requests -# ----------------------------------------------------------------------------- -# Spec Bench Dataset Implementation -# ----------------------------------------------------------------------------- - - -class SpecBench(CustomDataset): - """ - Implements the SpecBench dataset: https://github.com/hemingkx/Spec-Bench - Download the dataset using: - wget https://raw.githubusercontent.com/hemingkx/Spec-Bench/refs/heads/main/data/spec_bench/question.jsonl - """ # noqa: E501 - - def __init__(self, **kwargs) -> None: - self.category = kwargs.pop("category", None) - super().__init__(**kwargs) - self.load_data() - def load_data(self) -> None: - if self.dataset_path is None: - raise ValueError("dataset_path must be provided for loading data.") - self.data = [] - # Load the JSONL file - jsonl_data = 
pd.read_json(path_or_buf=self.dataset_path, - lines=True) - # check if the JSONL file has a 'turns' column - if "turns" not in jsonl_data.columns: - raise ValueError("JSONL file must contain a 'turns' column.") - for _, row in jsonl_data.iterrows(): - # sample only from a specific category if specified - if (not self.category) or (self.category == row['category']): - prompt = row["turns"][0] - self.data.append({"prompt": prompt}) - random.seed(self.random_seed) - random.shuffle(self.data) - def sample(self, **kwargs) -> list: - # leverage CustomDataset sample - kwargs["skip_chat_template"] = False - return super().sample(**kwargs) -# ----------------------------------------------------------------------------- -# Sonnet Dataset Implementation -# ----------------------------------------------------------------------------- -@deprecated( - "SonnetDataset is deprecated and will be removed in a future version.", -) -class SonnetDataset(BenchmarkDataset): - """ - Simplified implementation of the Sonnet dataset. Loads poem lines from a - text file and generates sample requests. Default values here copied from - `benchmark_serving.py` for the sonnet dataset. - """ - DEFAULT_PREFIX_LEN = 200 - DEFAULT_INPUT_LEN = 550 - DEFAULT_OUTPUT_LEN = 150 - def __init__( - self, - **kwargs, - ) -> None: - super().__init__(**kwargs) - self.load_data() - def load_data(self) -> None: - if not self.dataset_path: - raise ValueError("dataset_path must be provided.") - with open(self.dataset_path, encoding="utf-8") as f: - self.data = f.readlines() - def sample( - self, - tokenizer, - num_requests: int, - prefix_len: int = DEFAULT_PREFIX_LEN, - input_len: int = DEFAULT_INPUT_LEN, - output_len: int = DEFAULT_OUTPUT_LEN, - return_prompt_formatted: bool = False, - request_id_prefix: str = "", - no_oversample: bool = False, - **kwargs, - ) -> list: - # Calculate average token length for a poem line. - tokenized_lines = [tokenizer(line).input_ids for line in self.data] - avg_len = sum(len(tokens) - for tokens in tokenized_lines) / len(tokenized_lines) - - # Build the base prompt. - base_prompt = "Pick as many lines as you can from these poem lines:\n" - base_msg = [{"role": "user", "content": base_prompt}] - base_fmt = tokenizer.apply_chat_template(base_msg, - add_generation_prompt=True, - tokenize=False) - base_offset = len(tokenizer(base_fmt).input_ids) - if input_len <= base_offset: - raise ValueError( - f"'input_len' must be higher than the base prompt length " - f"({base_offset}).") - - # Determine how many poem lines to use. 
- num_input_lines = round((input_len - base_offset) / avg_len) - num_prefix_lines = max(round((prefix_len - base_offset) / avg_len), 0) - prefix_lines = self.data[:num_prefix_lines] - - samples = [] - ind = 0 - while len(samples) < num_requests: - extra_lines = random.choices(self.data, - k=num_input_lines - num_prefix_lines) - prompt = f"{base_prompt}{''.join(prefix_lines + extra_lines)}" - msg = [{"role": "user", "content": prompt}] - prompt_formatted = tokenizer.apply_chat_template( - msg, add_generation_prompt=True, tokenize=False) - prompt_len = len(tokenizer(prompt_formatted).input_ids) - if prompt_len <= input_len: - samples.append( - SampleRequest( - prompt=prompt_formatted - if return_prompt_formatted else prompt, - prompt_len=prompt_len, - expected_output_len=output_len, - request_id=request_id_prefix + str(ind), - )) - ind += 1 - return samples -# ----------------------------------------------------------------------------- -# BurstGPT Dataset Implementation -# ----------------------------------------------------------------------------- -class BurstGPTDataset(BenchmarkDataset): - """ - Implements the BurstGPT dataset. Loads data from a CSV file and generates - sample requests based on synthetic prompt generation. Only rows with Model - "GPT-4" and positive response tokens are used. - """ - def __init__(self, **kwargs) -> None: - super().__init__(**kwargs) - self.load_data() - - def load_data(self, ): - if self.dataset_path is None: - raise ValueError("dataset_path must be provided for loading data.") - - df = pd.read_csv(self.dataset_path) - # Filter to keep only GPT-4 rows. - gpt4_df = df[df["Model"] == "GPT-4"] - # Remove failed requests (where Response tokens is 0 or less). - gpt4_df = gpt4_df[gpt4_df["Response tokens"] > 0] - # Sample the desired number of rows. - self.data = gpt4_df - - def _sample_loaded_data(self, num_requests: int) -> list: - if num_requests <= len(self.data): - data = self.data.sample(n=num_requests, - random_state=self.random_seed) - else: - data = self.data.sample( - n=num_requests, - random_state=self.random_seed, - replace=True, - ) - # Convert the dataframe to a list of lists. - return data.values.tolist() - def sample( - self, - tokenizer: PreTrainedTokenizerBase, - num_requests: int, - max_loras: Optional[int] = None, - lora_path: Optional[str] = None, - request_id_prefix: str = "", - no_oversample: bool = False, - **kwargs, - ) -> list[SampleRequest]: - samples = [] - data = self._sample_loaded_data(num_requests=num_requests) - for i in range(num_requests): - input_len = int(data[i][2]) - output_len = int(data[i][3]) - lora_req = self.get_random_lora_request( - max_loras=max_loras, lora_path=lora_path) - vocab_size = tokenizer.vocab_size - # Generate a synthetic prompt: a list of token IDs computed as (i + - # j) modulo vocab_size. 
- token_ids = [(i + j) % vocab_size for j in range(input_len)] - prompt = tokenizer.decode(token_ids) - samples.append( - SampleRequest( - prompt=prompt, - prompt_len=input_len, - expected_output_len=output_len, - lora_request=lora_req, - request_id=request_id_prefix + str(i), - )) - return samples - - -# ----------------------------------------------------------------------------- -# HuggingFace Dataset Base Implementation -# ----------------------------------------------------------------------------- -class HuggingFaceDataset(BenchmarkDataset): - """Base class for datasets hosted on HuggingFace.""" - - SUPPORTED_DATASET_PATHS: Union[set[str], dict[str, Callable]] = set() - - def __init__( - self, - dataset_path: str, - dataset_split: str, - no_stream: bool = False, - dataset_subset: Optional[str] = None, - hf_name: Optional[str] = None, - **kwargs, - ) -> None: - super().__init__(dataset_path=dataset_path, **kwargs) - - self.dataset_split = dataset_split - self.dataset_subset = dataset_subset - self.load_stream = not no_stream - self.hf_name = hf_name or dataset_path - self.load_data() - - def load_data(self) -> None: - """Load data from HuggingFace datasets.""" - self.data = load_dataset( - self.dataset_path, - name=self.dataset_subset, - split=self.dataset_split, - streaming=self.load_stream, - ) - self.data = self.data.shuffle(seed=self.random_seed) - - -# ----------------------------------------------------------------------------- -# Conversation Dataset Implementation -# ----------------------------------------------------------------------------- - - -class ConversationDataset(HuggingFaceDataset): - """Dataset for conversation data with multimodal support.""" - SUPPORTED_DATASET_PATHS = { - 'lmms-lab/LLaVA-OneVision-Data', 'Aeala/ShareGPT_Vicuna_unfiltered' - } - IS_MULTIMODAL = True - - def sample(self, - tokenizer: PreTrainedTokenizerBase, - num_requests: int, - output_len: Optional[int] = None, - enable_multimodal_chat: bool = False, - request_id_prefix: str = "", - no_oversample: bool = False, - **kwargs) -> list: - # Filter examples with at least 2 conversations - filtered_data = self.data.filter( - lambda x: len(x["conversations"]) >= 2) - sampled_requests = [] - ind = 0 - dynamic_output = output_len is None - - for item in filtered_data: - if len(sampled_requests) >= num_requests: - break - conv = item["conversations"] - prompt, completion = conv[0]["value"], conv[1]["value"] - - prompt_ids = tokenizer(prompt).input_ids - completion_ids = tokenizer(completion).input_ids - prompt_len = len(prompt_ids) - completion_len = len(completion_ids) - output_len = completion_len if dynamic_output else output_len - assert isinstance(output_len, int) and output_len > 0 - if dynamic_output and not is_valid_sequence( - prompt_len, completion_len): - continue - mm_content = process_image( - item["image"]) if "image" in item else None - if enable_multimodal_chat: - # Note: when chat is enabled the request prompt_len is no longer - # accurate and we will be using request output to count the - # actual prompt len and output len - prompt = self.apply_multimodal_chat_transformation( - prompt, mm_content) - sampled_requests.append( - SampleRequest( - prompt=prompt, - prompt_len=prompt_len, - expected_output_len=output_len, - multi_modal_data=mm_content, - request_id=request_id_prefix + str(ind), - )) - ind += 1 - self.maybe_oversample_requests(sampled_requests, num_requests, - request_id_prefix, no_oversample) - return sampled_requests - - -# 
----------------------------------------------------------------------------- -# Vision Arena Dataset Implementation -# ----------------------------------------------------------------------------- - - -class VisionArenaDataset(HuggingFaceDataset): - """ - Vision Arena Dataset. - """ - - DEFAULT_OUTPUT_LEN = 128 - SUPPORTED_DATASET_PATHS = { - "lmarena-ai/VisionArena-Chat": - lambda x: x["conversation"][0][0]["content"], - "lmarena-ai/vision-arena-bench-v0.1": - lambda x: x["turns"][0][0]["content"] - } - IS_MULTIMODAL = True - - def sample( - self, - tokenizer: PreTrainedTokenizerBase, - num_requests: int, - output_len: Optional[int] = None, - enable_multimodal_chat: bool = False, - request_id_prefix: str = "", - no_oversample: bool = False, - **kwargs, - ) -> list: - output_len = (output_len - if output_len is not None else self.DEFAULT_OUTPUT_LEN) - sampled_requests = [] - for i, item in enumerate(self.data): - if len(sampled_requests) >= num_requests: - break - parser_fn = self.SUPPORTED_DATASET_PATHS.get(self.hf_name) - if parser_fn is None: - raise ValueError(f"Unsupported dataset path: {self.hf_name}") - prompt = parser_fn(item) - mm_content = process_image(item["images"][0]) - prompt_len = len(tokenizer(prompt).input_ids) - if enable_multimodal_chat: - # Note: when chat is enabled the request prompt_len is no longer - # accurate and we will be using request output to count the - # actual prompt len - prompt = self.apply_multimodal_chat_transformation( - prompt, mm_content) - sampled_requests.append( - SampleRequest( - prompt=prompt, - prompt_len=prompt_len, - expected_output_len=output_len, - multi_modal_data=mm_content, - request_id=request_id_prefix + str(i), - )) - self.maybe_oversample_requests(sampled_requests, num_requests, - request_id_prefix, no_oversample) - return sampled_requests - - -class MMVUDataset(HuggingFaceDataset): - """ - MMVU Dataset. 
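The removed VisionArena and MMVU classes look up a per-dataset parser function to pull the prompt text out of each record. A tiny standalone sketch of that dispatch pattern (the record below is made up; the two lambdas are copied from the removed classes):

```python
# Illustration only: map each supported dataset name to a small function that
# knows that dataset's record shape.
PROMPT_PARSERS = {
    "lmarena-ai/VisionArena-Chat": lambda x: x["conversation"][0][0]["content"],
    "yale-nlp/MMVU": lambda x: x["question"]
    + " "
    + " ".join(f"{k}.{v}" for k, v in x["choices"].items()),
}

record = {  # made-up MMVU-style record
    "question": "Which panel shows the correct waveform?",
    "choices": {"A": "panel 1", "B": "panel 2"},
}

parser_fn = PROMPT_PARSERS.get("yale-nlp/MMVU")
if parser_fn is None:
    raise ValueError("Unsupported dataset path")
print(parser_fn(record))
```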
- https://huggingface.co/datasets/yale-nlp/MMVU - """ - - DEFAULT_OUTPUT_LEN = 128 - SUPPORTED_DATASET_PATHS = { - "yale-nlp/MMVU": - lambda x: x["question"] + " " + ( - " ".join(f"{k}.{v}" for k, v in x["choices"].items()) - ), - } - - def sample( - self, - tokenizer: PreTrainedTokenizerBase, - num_requests: int, - output_len: Optional[int] = None, - enable_multimodal_chat: bool = False, - request_id_prefix: str = "", - no_oversample: bool = False, - **kwargs, - ) -> list: - output_len = (output_len - if output_len is not None else self.DEFAULT_OUTPUT_LEN) - sampled_requests = [] - for i, item in enumerate(self.data): - if len(sampled_requests) >= num_requests: - break - parser_fn = self.SUPPORTED_DATASET_PATHS.get(self.hf_name) - if parser_fn is None: - raise ValueError(f"Unsupported dataset path: {self.hf_name}") - prompt = parser_fn(item) - mm_content = process_video(item["video"]) - prompt_len = len(tokenizer(prompt).input_ids) - if enable_multimodal_chat: - # Note: when chat is enabled the request prompt_len is no longer - # accurate and we will be using request output to count the - # actual prompt len - prompt = self.apply_multimodal_chat_transformation( - prompt, mm_content) - sampled_requests.append( - SampleRequest( - prompt=prompt, - prompt_len=prompt_len, - expected_output_len=output_len, - multi_modal_data=mm_content, - request_id=request_id_prefix + str(i), - )) - self.maybe_oversample_requests(sampled_requests, num_requests, - request_id_prefix, no_oversample) - return sampled_requests - - -# ----------------------------------------------------------------------------- -# Instruct Coder Dataset Implementation -# ----------------------------------------------------------------------------- - - -class InstructCoderDataset(HuggingFaceDataset): - """ - InstructCoder Dataset. - https://huggingface.co/datasets/likaixin/InstructCoder - - InstructCoder is the dataset designed for general code editing. It consists - of 114,239 instruction-input-output triplets, and covers multiple distinct - code editing scenario. - """ - - DEFAULT_OUTPUT_LEN = 200 # this is the average default output length - SUPPORTED_DATASET_PATHS = { - "likaixin/InstructCoder", - } - - def sample(self, - tokenizer: PreTrainedTokenizerBase, - num_requests: int, - output_len: Optional[int] = None, - enable_multimodal_chat: bool = False, - request_id_prefix: str = "", - no_oversample: bool = False, - **kwargs) -> list: - output_len = (output_len - if output_len is not None else self.DEFAULT_OUTPUT_LEN) - sampled_requests = [] - for i, item in enumerate(self.data): - if len(sampled_requests) >= num_requests: - break - prompt = ( - f"{item['input']}\n\n{item['instruction']} Just output " - "the code, do not include any explanation." - ) - - # apply template - prompt = tokenizer.apply_chat_template( - [{ - "role": "user", - "content": prompt - }], - add_generation_prompt=True, - tokenize=False, - ) - - prompt_len = len(tokenizer(prompt).input_ids) - sampled_requests.append( - SampleRequest( - prompt=prompt, - prompt_len=prompt_len, - expected_output_len=output_len, - request_id=request_id_prefix + str(i), - )) - self.maybe_oversample_requests(sampled_requests, num_requests, - request_id_prefix, no_oversample) - return sampled_requests - - -# ----------------------------------------------------------------------------- -# MT-Bench Dataset Implementation -# ----------------------------------------------------------------------------- - - -class MTBenchDataset(HuggingFaceDataset): - """ - MT-Bench Dataset. 
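Several of the removed datasets (InstructCoder, MT-Bench, Custom) wrap each raw prompt with the tokenizer's chat template before counting tokens. A sketch of that step; the model name is an arbitrary assumption and the tokenizer files must be available locally or downloadable:

```python
from transformers import AutoTokenizer

# Assumed model for illustration only; any chat model with a chat template works.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")

raw_prompt = "Refactor this function to use a list comprehension."
prompt = tokenizer.apply_chat_template(
    [{"role": "user", "content": raw_prompt}],
    add_generation_prompt=True,
    tokenize=False,
)
# The benchmark datasets measure prompt_len on the templated text, not the raw prompt.
prompt_len = len(tokenizer(prompt).input_ids)
print(prompt_len)
```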
- https://huggingface.co/datasets/philschmid/mt-bench - - We create a single turn dataset for MT-Bench. - This is similar to Spec decoding benchmark setup in vLLM - https://github.com/vllm-project/vllm/blob/9d98ab5ec/examples/offline_inference/eagle.py#L14-L18 - """ # noqa: E501 - - DEFAULT_OUTPUT_LEN = 256 # avg len used in SD bench in vLLM - SUPPORTED_DATASET_PATHS = { - "philschmid/mt-bench", - } - - def sample( - self, - tokenizer: PreTrainedTokenizerBase, - num_requests: int, - output_len: Optional[int] = None, - enable_multimodal_chat: bool = False, - request_id_prefix: str = "", - no_oversample: bool = False, - **kwargs, - ) -> list: - output_len = (output_len - if output_len is not None else self.DEFAULT_OUTPUT_LEN) - sampled_requests = [] - - for i, item in enumerate(self.data): - if len(sampled_requests) >= num_requests: - break - prompt = item["turns"][0] - - # apply template - prompt = tokenizer.apply_chat_template( - [{ - "role": "user", - "content": prompt - }], - add_generation_prompt=True, - tokenize=False, - ) - - prompt_len = len(tokenizer(prompt).input_ids) - sampled_requests.append( - SampleRequest( - prompt=prompt, - prompt_len=prompt_len, - expected_output_len=output_len, - request_id=request_id_prefix + str(i), - )) - self.maybe_oversample_requests(sampled_requests, num_requests, - request_id_prefix, no_oversample) - return sampled_requests - - -# ----------------------------------------------------------------------------- -# Blazedit Dataset Implementation -# ----------------------------------------------------------------------------- - - -class BlazeditDataset(HuggingFaceDataset): - """ - Blazedit Dataset. - https://github.com/ise-uiuc/blazedit - - 5k char version: vdaita/edit_5k_char - 10k char version: vdaita/edit_10k_char - """ # noqa: E501 - - # 5k char version will have output as ~5k chars - # 10k char version will have output as ~10k chars - # Assuming 3 char per token, 10k chars will be 3333 tokens - # We set default to 4000 to be safe - DEFAULT_OUTPUT_LEN = 4000 - SUPPORTED_DATASET_PATHS = { - "vdaita/edit_5k_char", - "vdaita/edit_10k_char", - } - - def sample( - self, - tokenizer: PreTrainedTokenizerBase, - num_requests: int, - output_len: Optional[int] = None, - request_id_prefix: str = "", - no_oversample: bool = False, - min_distance: float = 0.0, - max_distance: float = 1.0, - **kwargs, - ) -> list: - output_len = (output_len - if output_len is not None else self.DEFAULT_OUTPUT_LEN) - sampled_requests = [] - - for i, item in enumerate(self.data): - if len(sampled_requests) >= num_requests: - break - code = item["code"] - change_request = item["change_request"] - norm_distance = item["norm_distance"] - - # compare the levenshtein distance normalized by code length - if norm_distance < min_distance or norm_distance > max_distance: - continue - - # template copied from - # https://github.com/ise-uiuc/blazedit/blob/7765137e656fd62de877422d2e4cf8de51228054/dataset/create_refined_dataset.py#L94-L105 # noqa: E501 - instruction = f"""Given a code file, please apply the change requests and generate the new file. 
- -Original file: -```python -{code} -``` - -Change request: -{change_request} - -Please generate the new code file in the "New file" section below.""" # noqa: E501 - - # apply template - prompt = tokenizer.apply_chat_template( - [{ - "role": "user", - "content": instruction - }], - add_generation_prompt=True, - tokenize=False, - ) - - prompt_len = len(tokenizer(prompt).input_ids) - - sampled_requests.append( - SampleRequest( - prompt=prompt, - prompt_len=prompt_len, - expected_output_len=output_len, - request_id=request_id_prefix + str(i), - )) - self.maybe_oversample_requests(sampled_requests, num_requests, - request_id_prefix, no_oversample) - - return sampled_requests - - -# ----------------------------------------------------------------------------- -# AIMO Dataset Implementation -# ----------------------------------------------------------------------------- - - -class AIMODataset(HuggingFaceDataset): - """ - Dataset class for processing a AIMO dataset with reasoning questions. - """ - SUPPORTED_DATASET_PATHS = { - "AI-MO/aimo-validation-aime", "AI-MO/NuminaMath-1.5", - "AI-MO/NuminaMath-CoT" - } - - def sample(self, - tokenizer: PreTrainedTokenizerBase, - num_requests: int, - output_len: Optional[int] = None, - request_id_prefix: str = "", - no_oversample: bool = False, - **kwargs) -> list: - sampled_requests = [] - ind = 0 - dynamic_output = output_len is None - - for item in self.data: - if len(sampled_requests) >= num_requests: - break - prompt, completion = item['problem'], item["solution"] - - prompt_ids = tokenizer(prompt).input_ids - completion_ids = tokenizer(completion).input_ids - prompt_len = len(prompt_ids) - completion_len = len(completion_ids) - output_len = completion_len if dynamic_output else output_len - assert isinstance(output_len, int) and output_len > 0 - if dynamic_output and not is_valid_sequence(prompt_len, - completion_len, - max_prompt_len=2048, - max_total_len=32000): - continue - sampled_requests.append( - SampleRequest( - prompt=prompt, - prompt_len=prompt_len, - expected_output_len=output_len, - multi_modal_data=None, - request_id=request_id_prefix + str(ind), - )) - ind += 1 - self.maybe_oversample_requests(sampled_requests, num_requests, - request_id_prefix, no_oversample) - return sampled_requests - - -# ----------------------------------------------------------------------------- -# Next Edit Prediction Dataset Implementation -# ----------------------------------------------------------------------------- - - -zeta_prompt = """### Instruction: -You are a code completion assistant and your task is to analyze user edits and then rewrite an excerpt that the user provides, suggesting the appropriate edits within the excerpt, taking into account the cursor location. - -### User Edits: - -{} - -### User Excerpt: - -{} - -### Response: - -""" # noqa: E501 - - -def _format_zeta_prompt( - sample: dict, - original_start_marker: str = "<|editable_region_start|>") -> dict: - """Format the zeta prompt for the Next Edit Prediction (NEP) dataset. - - This function formats examples from the NEP dataset - into prompts and expected outputs. It could be - further extended to support more NEP datasets. - - Args: - sample: The dataset sample containing events, - inputs, and outputs. - original_start_marker: The marker indicating the - start of the editable region. Defaults to - "<|editable_region_start|>". - - Returns: - A dictionary with the formatted prompts and expected outputs. 
- """ - events = sample["events"] - input = sample["input"] - output = sample["output"] - prompt = zeta_prompt.format(events, input) - - # following the original implementation, extract the focused region - # from the raw output - output_start_index = output.find(original_start_marker) - output_focused_region = output[output_start_index:] - expected_output = output_focused_region - - return {"prompt": prompt, "expected_output": expected_output} - - -class NextEditPredictionDataset(HuggingFaceDataset): - """ - Dataset class for processing a Next Edit Prediction dataset. - """ - - SUPPORTED_DATASET_PATHS = { - "zed-industries/zeta", - } - MAPPING_PROMPT_FUNCS = { - "zed-industries/zeta": _format_zeta_prompt, - } - - def sample(self, tokenizer: PreTrainedTokenizerBase, num_requests: int, - request_id_prefix: str = "", - no_oversample: bool = False, - **kwargs): - formatting_prompt_func = self.MAPPING_PROMPT_FUNCS.get(self.hf_name) - if formatting_prompt_func is None: - raise ValueError(f"Unsupported dataset path: {self.hf_name}") - samples = [] - for i, sample in enumerate(self.data): - sample = formatting_prompt_func(sample) - samples.append( - SampleRequest( - prompt=sample["prompt"], - prompt_len=len(tokenizer(sample["prompt"]).input_ids), - expected_output_len=len( - tokenizer(sample["expected_output"]).input_ids), - request_id=request_id_prefix + str(i), - )) - if len(samples) >= num_requests: - break - self.maybe_oversample_requests(samples, - num_requests, - request_id_prefix, - no_oversample) - return samples - - -# ----------------------------------------------------------------------------- -# ASR Dataset Implementation -# ----------------------------------------------------------------------------- - - -class ASRDataset(HuggingFaceDataset): - """ - Dataset class for processing a ASR dataset for transcription. - Tested on the following set: - - +----------------+----------------------------------------+--------------------------+-----------------------------+ - | Dataset | Domain | Speaking Style | hf-subset | - +----------------+----------------------------------------+--------------------------+-----------------------------+ - | TED-LIUM | TED talks | Oratory | release1, release2, release3| - | | | | release3-speaker-adaptation | - | VoxPopuli | European Parliament | Oratory | en, de, it, fr, ... | - | LibriSpeech | Audiobook | Narrated | "LIUM/tedlium" | - | GigaSpeech | Audiobook, podcast, YouTube | Narrated, spontaneous | xs, s, m, l, xl, dev, test | - | SPGISpeech | Financial meetings | Oratory, spontaneous | S, M, L, dev, test | - | AMI | Meetings | Spontaneous | ihm, sdm | - +----------------+----------------------------------------+--------------------------+-----------------------------+ - - """ # noqa: E501 - - SUPPORTED_DATASET_PATHS = { - "openslr/librispeech_asr", - "facebook/voxpopuli", - "LIUM/tedlium", - "edinburghcstr/ami", - "speechcolab/gigaspeech", - "kensho/spgispeech", - } - - DEFAULT_OUTPUT_LEN = 128 - IS_MULTIMODAL = True - - # TODO Whisper-specific. Abstract interface when more models are supported. 
- TRANSCRIPTION_PREAMBLE = ( - "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>") - skip_long_audios: bool = True - - def sample( - self, - tokenizer: PreTrainedTokenizerBase, - num_requests: int, - output_len: Optional[int] = None, - request_id_prefix: str = "", - no_oversample: bool = False, - **kwargs, - ) -> list: - output_len = (output_len - if output_len is not None else self.DEFAULT_OUTPUT_LEN) - prompt = ASRDataset.TRANSCRIPTION_PREAMBLE - prompt_len = len(tokenizer(prompt).input_ids) - sampled_requests = [] - ind = 0 - skipped = 0 - for item in self.data: - if len(sampled_requests) >= num_requests: - break - audio = item["audio"] - y, sr = audio["array"], audio["sampling_rate"] - duration_s = librosa.get_duration(y=y, sr=sr) - # Whisper max supported duration - if self.skip_long_audios and duration_s > 30: - skipped += 1 - continue - - mm_content = {"audio": (y, sr)} - sampled_requests.append( - SampleRequest( - prompt=prompt, - prompt_len=prompt_len, - expected_output_len=output_len, - multi_modal_data=mm_content, - request_id=request_id_prefix + str(ind), - )) - ind += 1 - if skipped: - logger.warning( - "%d samples discarded from dataset due to" - " their length being greater than" - " what Whisper supports.", - skipped, - ) - self.maybe_oversample_requests(sampled_requests, num_requests, - request_id_prefix, no_oversample) - return sampled_requests - - -# ----------------------------------------------------------------------------- -# MLPerf Dataset Implementation -# ----------------------------------------------------------------------------- - - -class MLPerfDataset(HuggingFaceDataset): - """ - MLPerf Inference Dataset. - - Dataset on HF: - https://huggingface.co/datasets/mgoin/mlperf-inference-llama2-data - https://huggingface.co/datasets/mgoin/mlperf-inference-llama3.1-data - - Each record contains: - - "system_prompt": system role instruction. - - "question": user question. - - "output": reference answer. - - We combine the system prompt and question into a chat-formatted prompt - (using the tokenizer's chat template) and set the expected output length to - the tokenized length of the provided reference answer. - """ - - SUPPORTED_DATASET_PATHS = { - "mgoin/mlperf-inference-llama2-data", - "mgoin/mlperf-inference-llama3.1-data", - } - - def sample( - self, - tokenizer: PreTrainedTokenizerBase, - num_requests: int, - output_len: Optional[int] = None, - request_id_prefix: str = "", - no_oversample: bool = False, - **kwargs, - ) -> list[SampleRequest]: - # Force dynamic output length based on reference completion. - dynamic_output = output_len is None - sampled_requests: list[SampleRequest] = [] - ind = 0 - - for item in self.data: - if len(sampled_requests) >= num_requests: - break - - system_prompt = item["system_prompt"] - question = item["question"] - reference_answer = item["output"] - - # Build chat-style prompt using tokenizer template, if available. - messages = [ - {"role": "system", "content": system_prompt}, - {"role": "user", "content": question}, - ] - prompt_formatted = tokenizer.apply_chat_template( - messages, add_generation_prompt=True, tokenize=False - ) - prompt_len = len(tokenizer(prompt_formatted).input_ids) - - # Determine output length from reference answer tokens. - ref_out_len = len( - tokenizer(reference_answer, add_special_tokens=False).input_ids - ) - expected_output_len = ref_out_len if dynamic_output else output_len - - # Validate sequence lengths. 
- if not is_valid_sequence(prompt_len, expected_output_len): - continue - - sampled_requests.append( - SampleRequest( - prompt=prompt_formatted, - prompt_len=prompt_len, - expected_output_len=expected_output_len, - request_id=request_id_prefix + str(ind), - ) - ) - ind += 1 - - self.maybe_oversample_requests(sampled_requests, num_requests, - request_id_prefix, no_oversample) - return sampled_requests - - -# ----------------------------------------------------------------------------- -# Prefix Repetition Dataset Implementation -# ----------------------------------------------------------------------------- - - -class PrefixRepetitionRandomDataset(BenchmarkDataset): - # Default values copied from benchmark_serving.py for the repeated prefix - # dataset. - DEFAULT_PREFIX_LEN = 256 - DEFAULT_SUFFIX_LEN = 256 - DEFAULT_NUM_PREFIXES = 10 - DEFAULT_OUTPUT_LEN = 128 - - def __init__( - self, - **kwargs, - ) -> None: - super().__init__(**kwargs) - random.seed(self.random_seed) - np.random.seed(self.random_seed) - - def sample( - self, - tokenizer: PreTrainedTokenizerBase, - num_requests: int, - prefix_len: int = DEFAULT_PREFIX_LEN, - suffix_len: int = DEFAULT_SUFFIX_LEN, - num_prefixes: int = DEFAULT_NUM_PREFIXES, - output_len: int = DEFAULT_OUTPUT_LEN, - request_id_prefix: str = "", - no_oversample: bool = False, - **kwargs, - ) -> list[SampleRequest]: - vocab_size = tokenizer.vocab_size - prompts_per_prefix = num_requests // num_prefixes - if prompts_per_prefix == 0: - raise ValueError( - f"num_requests ({num_requests}) must be greater than or equal " - f"to num_prefixes ({num_prefixes})" - ) - - def _generate_exact_length_tokens(target_length: int) -> list[int]: - """Generate tokens that decode and re-encode to exactly - target_length.""" - # Generate random tokens - tokens = np.random.randint( - 0, vocab_size, size=target_length).tolist() - text = tokenizer.decode(tokens) - re_encoded = tokenizer.encode(text, add_special_tokens=False) - - if len(re_encoded) == target_length: - return re_encoded - elif len(re_encoded) < target_length: - # Recursively generate additional consistent tokens - needed = target_length - len(re_encoded) - extra_tokens = _generate_exact_length_tokens(needed) - return re_encoded + extra_tokens - else: - # Truncate to target length - return re_encoded[:target_length] - - requests = [] - for _ in range(num_prefixes): - prefix_tokens = _generate_exact_length_tokens(prefix_len) - - for _ in range(prompts_per_prefix): - suffix_tokens = _generate_exact_length_tokens(suffix_len) - - combined_tokens = prefix_tokens + suffix_tokens - prompt = tokenizer.decode(combined_tokens) - prompt_len = len(combined_tokens) - requests.append( - SampleRequest( - prompt=prompt, - prompt_len=prompt_len, - expected_output_len=output_len, - ) - ) - random.shuffle(requests) - return requests diff --git a/vllm_omni/benchmarks/latency.py b/vllm_omni/benchmarks/latency.py deleted file mode 100644 index e9fd9d5da..000000000 --- a/vllm_omni/benchmarks/latency.py +++ /dev/null @@ -1,170 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Benchmark the latency of processing a single batch of requests.""" - -import argparse -import dataclasses -import json -import os -import time -from typing import Any, Optional - -import numpy as np -from tqdm import tqdm - -import vllm.envs as envs -from vllm_omni.benchmarks.lib.utils import (convert_to_pytorch_benchmark_format, - write_to_json) -from vllm.engine.arg_utils import EngineArgs -from 
vllm.inputs import PromptType -from vllm.sampling_params import BeamSearchParams - - -def save_to_pytorch_benchmark_format(args: argparse.Namespace, - results: dict[str, Any]) -> None: - pt_records = convert_to_pytorch_benchmark_format( - args=args, - metrics={"latency": results["latencies"]}, - extra_info={k: results[k] - for k in ["avg_latency", "percentiles"]}) - if pt_records: - pt_file = f"{os.path.splitext(args.output_json)[0]}.pytorch.json" - write_to_json(pt_file, pt_records) - - -def add_cli_args(parser: argparse.ArgumentParser): - parser.add_argument("--input-len", type=int, default=32) - parser.add_argument("--output-len", type=int, default=128) - parser.add_argument("--batch-size", type=int, default=8) - parser.add_argument( - "--n", - type=int, - default=1, - help="Number of generated sequences per prompt.", - ) - parser.add_argument("--use-beam-search", action="store_true") - parser.add_argument( - "--num-iters-warmup", - type=int, - default=10, - help="Number of iterations to run for warmup.", - ) - parser.add_argument("--num-iters", - type=int, - default=30, - help="Number of iterations to run.") - parser.add_argument( - "--profile", - action="store_true", - help="profile the generation process of a single batch", - ) - parser.add_argument( - "--output-json", - type=str, - default=None, - help="Path to save the latency results in JSON format.", - ) - parser.add_argument( - "--disable-detokenize", - action="store_true", - help=("Do not detokenize responses (i.e. do not include " - "detokenization time in the latency measurement)"), - ) - - parser = EngineArgs.add_cli_args(parser) - # V1 enables prefix caching by default which skews the latency - # numbers. We need to disable prefix caching by default. - parser.set_defaults(enable_prefix_caching=False) - - -def main(args: argparse.Namespace): - if args.profile and not envs.VLLM_TORCH_PROFILER_DIR: - raise OSError( - "The environment variable 'VLLM_TORCH_PROFILER_DIR' is not set. " - "Please set it to a valid path to use torch profiler.") - engine_args = EngineArgs.from_cli_args(args) - - # Lazy import to avoid importing LLM when the bench command is not selected. - from vllm import LLM, SamplingParams - - # NOTE(woosuk): If the request cannot be processed in a single batch, - # the engine will automatically process the request in multiple batches. 
- llm = LLM(**dataclasses.asdict(engine_args)) - assert llm.llm_engine.model_config.max_model_len >= ( - args.input_len + - args.output_len), ("Please ensure that max_model_len is greater than" - " the sum of input_len and output_len.") - - sampling_params = SamplingParams( - n=args.n, - temperature=1.0, - top_p=1.0, - ignore_eos=True, - max_tokens=args.output_len, - detokenize=not args.disable_detokenize, - ) - dummy_prompt_token_ids = np.random.randint(10000, - size=(args.batch_size, - args.input_len)) - dummy_prompts: list[PromptType] = [{ - "prompt_token_ids": batch - } for batch in dummy_prompt_token_ids.tolist()] - - def llm_generate(): - if not args.use_beam_search: - llm.generate(dummy_prompts, - sampling_params=sampling_params, - use_tqdm=False) - else: - llm.beam_search( - dummy_prompts, - BeamSearchParams( - beam_width=args.n, - max_tokens=args.output_len, - ignore_eos=True, - ), - ) - - def run_to_completion(profile_dir: Optional[str] = None): - if profile_dir: - llm.start_profile() - llm_generate() - llm.stop_profile() - else: - start_time = time.perf_counter() - llm_generate() - end_time = time.perf_counter() - latency = end_time - start_time - return latency - - print("Warming up...") - for _ in tqdm(range(args.num_iters_warmup), desc="Warmup iterations"): - run_to_completion(profile_dir=None) - - if args.profile: - profile_dir = envs.VLLM_TORCH_PROFILER_DIR - print(f"Profiling (results will be saved to '{profile_dir}')...") - run_to_completion(profile_dir=profile_dir) - return - - # Benchmark. - latencies = [] - for _ in tqdm(range(args.num_iters), desc="Profiling iterations"): - latencies.append(run_to_completion(profile_dir=None)) - latencies = np.array(latencies) - percentages = [10, 25, 50, 75, 90, 99] - percentiles = np.percentile(latencies, percentages) - print(f"Avg latency: {np.mean(latencies)} seconds") - for percentage, percentile in zip(percentages, percentiles): - print(f"{percentage}% percentile latency: {percentile} seconds") - - # Output JSON results if specified - if args.output_json: - results = { - "avg_latency": np.mean(latencies), - "latencies": latencies.tolist(), - "percentiles": dict(zip(percentages, percentiles.tolist())), - } - with open(args.output_json, "w") as f: - json.dump(results, f, indent=4) - save_to_pytorch_benchmark_format(args, results) diff --git a/vllm_omni/benchmarks/lib/endpoint_request_func.py b/vllm_omni/benchmarks/lib/endpoint_request_func.py index 661a1b4f4..1e6d5a7d7 100644 --- a/vllm_omni/benchmarks/lib/endpoint_request_func.py +++ b/vllm_omni/benchmarks/lib/endpoint_request_func.py @@ -13,216 +13,14 @@ from typing import Optional, Protocol, Union import aiohttp from tqdm.asyncio import tqdm +from vllm.benchmarks.lib.endpoint_request_func import (async_request_openai_completions,async_request_openai_audio, + async_request_openai_embeddings, RequestFunc, + RequestFuncInput, + RequestFuncOutput,StreamedResponseHandler) AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60) -class StreamedResponseHandler: - """Handles streaming HTTP responses by accumulating chunks until complete - messages are available.""" - - def __init__(self): - self.buffer = "" - - def add_chunk(self, chunk_bytes: bytes) -> list[str]: - """Add a chunk of bytes to the buffer and return any complete - messages.""" - chunk_str = chunk_bytes.decode("utf-8") - self.buffer += chunk_str - - messages = [] - - # Split by double newlines (SSE message separator) - while "\n\n" in self.buffer: - message, self.buffer = self.buffer.split("\n\n", 1) - message = 
message.strip() - if message: - messages.append(message) - - # if self.buffer is not empty, check if it is a complete message - # by removing data: prefix and check if it is a valid JSON - if self.buffer.startswith("data: "): - message_content = self.buffer.removeprefix("data: ").strip() - if message_content == "[DONE]": - messages.append(self.buffer.strip()) - self.buffer = "" - elif message_content: - try: - json.loads(message_content) - messages.append(self.buffer.strip()) - self.buffer = "" - except json.JSONDecodeError: - # Incomplete JSON, wait for more chunks. - pass - - return messages - - -@dataclass -class RequestFuncInput: - """The input for the request function.""" - prompt: str - api_url: str - prompt_len: int - output_len: int - model: str - model_name: Optional[str] = None - logprobs: Optional[int] = None - extra_headers: Optional[dict] = None - extra_body: Optional[dict] = None - multi_modal_content: Optional[Union[dict, list[dict]]] = None - ignore_eos: bool = False - language: Optional[str] = None - request_id: Optional[str] = None - - -@dataclass -class RequestFuncOutput: - """The output of the request function including metrics.""" - generated_text: str = "" - success: bool = False - latency: float = 0.0 - output_tokens: int = 0 - ttft: float = 0.0 # Time to first token - itl: list[float] = field( - default_factory=list) # list of inter-token latencies - tpot: float = 0.0 # avg next-token latencies - prompt_len: int = 0 - error: str = "" - start_time: float = 0.0 - - -class RequestFunc(Protocol): - def __call__( - self, - request_func_input: RequestFuncInput, - session: aiohttp.ClientSession, - pbar: Optional[tqdm] = None, - ) -> Awaitable[RequestFuncOutput]: - ... - -async def async_request_openai_completions( - request_func_input: RequestFuncInput, - session: aiohttp.ClientSession, - pbar: Optional[tqdm] = None, -) -> RequestFuncOutput: - """The async request function for the OpenAI Completions API. - - Args: - request_func_input: The input for the request function. - pbar: The progress bar to display the progress. - - Returns: - The output of the request function. - """ - api_url = request_func_input.api_url - assert api_url.endswith( - ("completions", "profile") - ), "OpenAI Completions API URL must end with 'completions' or 'profile'." 
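The removed `StreamedResponseHandler` buffers raw HTTP chunks and only emits complete SSE events (split on blank lines). A minimal self-contained sketch of that buffering pattern (not the vLLM implementation itself; the streamed payloads are made up):

```python
import json


class SSEBuffer:
    """Accumulate raw chunks and emit only complete "\n\n"-terminated events."""

    def __init__(self) -> None:
        self.buffer = ""

    def add_chunk(self, chunk_bytes: bytes) -> list[str]:
        self.buffer += chunk_bytes.decode("utf-8")
        messages = []
        while "\n\n" in self.buffer:
            message, self.buffer = self.buffer.split("\n\n", 1)
            if message.strip():
                messages.append(message.strip())
        return messages


handler = SSEBuffer()
# The first chunk ends mid-JSON, so nothing is emitted yet.
print(handler.add_chunk(b'data: {"choices": [{"text": "Hel'))  # []
# The second chunk completes both events.
for msg in handler.add_chunk(b'lo"}]}\n\ndata: [DONE]\n\n'):
    payload = msg.removeprefix("data: ")
    if payload != "[DONE]":
        print(json.loads(payload)["choices"][0]["text"])
```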
- - payload = { - "model": request_func_input.model_name - if request_func_input.model_name else request_func_input.model, - "prompt": request_func_input.prompt, - "temperature": 0.0, - "repetition_penalty": 1.0, - "max_tokens": request_func_input.output_len, - "logprobs": request_func_input.logprobs, - "stream": True, - "stream_options": { - "include_usage": True, - }, - } - if request_func_input.ignore_eos: - payload["ignore_eos"] = request_func_input.ignore_eos - if request_func_input.extra_body: - payload.update(request_func_input.extra_body) - headers = { - "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}" - } - if request_func_input.extra_headers: - headers |= request_func_input.extra_headers - if request_func_input.request_id: - headers["x-request-id"] = request_func_input.request_id - - output = RequestFuncOutput() - output.prompt_len = request_func_input.prompt_len - - generated_text = "" - st = time.perf_counter() - output.start_time = st - most_recent_timestamp = st - try: - async with session.post(url=api_url, json=payload, - headers=headers) as response: - if response.status == 200: - first_chunk_received = False - handler = StreamedResponseHandler() - - async for chunk_bytes in response.content.iter_any(): - chunk_bytes = chunk_bytes.strip() - if not chunk_bytes: - continue - - messages = handler.add_chunk(chunk_bytes) - for message in messages: - # NOTE: SSE comments (often used as pings) start with - # a colon. These are not JSON data payload and should - # be skipped. - if message.startswith(":"): - continue - - chunk = message.removeprefix("data: ") - - if chunk != "[DONE]": - data = json.loads(chunk) - - # NOTE: Some completion API might have a last - # usage summary response without a token so we - # want to check a token was generated - if choices := data.get("choices"): - # Note that text could be empty here - # e.g. for special tokens - text = choices[0].get("text") - timestamp = time.perf_counter() - # First token - if not first_chunk_received: - first_chunk_received = True - ttft = time.perf_counter() - st - output.ttft = ttft - - # Decoding phase - else: - output.itl.append(timestamp - - most_recent_timestamp) - - most_recent_timestamp = timestamp - generated_text += text or "" - elif usage := data.get("usage"): - output.output_tokens = usage.get( - "completion_tokens") - if first_chunk_received: - output.success = True - else: - output.success = False - output.error = ( - "Never received a valid chunk to calculate TTFT." 
- "This response will be marked as failed!") - output.generated_text = generated_text - output.latency = most_recent_timestamp - st - else: - output.error = response.reason or "" - output.success = False - except Exception: - output.success = False - exc_info = sys.exc_info() - output.error = "".join(traceback.format_exception(*exc_info)) - - if pbar: - pbar.update(1) - return output - - async def async_request_openai_chat_completions( request_func_input: RequestFuncInput, session: aiohttp.ClientSession, @@ -259,10 +57,7 @@ async def async_request_openai_chat_completions( "max_completion_tokens": request_func_input.output_len, "stream": - True, - "stream_options": { - "include_usage": True, - }, + False } if request_func_input.ignore_eos: payload["ignore_eos"] = request_func_input.ignore_eos @@ -343,178 +138,6 @@ async def async_request_openai_chat_completions( pbar.update(1) return output - -async def async_request_openai_audio( - request_func_input: RequestFuncInput, - session: aiohttp.ClientSession, - pbar: Optional[tqdm] = None, -) -> RequestFuncOutput: - # Lazy import without PlaceholderModule to avoid vllm dep. - import soundfile - - api_url = request_func_input.api_url - assert api_url.endswith(("transcriptions", "translations")), ( - "OpenAI Chat Completions API URL must end with 'transcriptions' ") - "or `translations`." - - content = [{"type": "text", "text": request_func_input.prompt}] - payload = { - "model": - request_func_input.model_name - if request_func_input.model_name else request_func_input.model, - "temperature": - 0.0, - "max_completion_tokens": - request_func_input.output_len, - "stream": - True, - "language": - "en", - # Flattened due to multipart/form-data - "stream_include_usage": - True, - "stream_continuous_usage_stats": - True, - } - if request_func_input.extra_body: - payload.update(request_func_input.extra_body) - headers = { - "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}", - } - if request_func_input.extra_headers: - headers |= request_func_input.extra_headers - if request_func_input.request_id: - headers["x-request-id"] = request_func_input.request_id - - # Send audio file - def to_bytes(y, sr): - buffer = io.BytesIO() - soundfile.write(buffer, y, sr, format="WAV") - buffer.seek(0) - return buffer - - mm_audio = request_func_input.multi_modal_content - if not isinstance(mm_audio, dict) or "audio" not in mm_audio: - raise TypeError("multi_modal_content must be a dict containing 'audio'") - with to_bytes(*mm_audio["audio"]) as f: - form = aiohttp.FormData() - form.add_field("file", f, content_type="audio/wav") - for key, value in payload.items(): - form.add_field(key, str(value)) - - output = RequestFuncOutput() - output.prompt_len = request_func_input.prompt_len - - generated_text = "" - ttft = 0.0 - st = time.perf_counter() - output.start_time = st - most_recent_timestamp = st - try: - async with session.post(url=api_url, - data=form, - headers=headers) as response: - if response.status == 200: - handler = StreamedResponseHandler() - - async for chunk_bytes in response.content.iter_any(): - chunk_bytes = chunk_bytes.strip() - if not chunk_bytes: - continue - - messages = handler.add_chunk(chunk_bytes) - for message in messages: - chunk = message.decode("utf-8").removeprefix( - "data: ") - if chunk != "[DONE]": - timestamp = time.perf_counter() - data = json.loads(chunk) - - if choices := data.get("choices"): - content = choices[0]["delta"].get( - "content") - # First token - if ttft == 0.0: - ttft = timestamp - st - output.ttft = ttft - - # 
Decoding phase - else: - output.itl.append( - timestamp - most_recent_timestamp) - - generated_text += content or "" - elif usage := data.get("usage"): - output.output_tokens = usage.get( - "completion_tokens") - - most_recent_timestamp = timestamp - - output.generated_text = generated_text - output.success = True - output.latency = most_recent_timestamp - st - else: - output.error = response.reason or "" - output.success = False - except Exception: - output.success = False - exc_info = sys.exc_info() - output.error = "".join(traceback.format_exception(*exc_info)) - - if pbar: - pbar.update(1) - return output - - -async def async_request_openai_embeddings( - request_func_input: RequestFuncInput, - session: aiohttp.ClientSession, - pbar: Optional[tqdm] = None, -): - api_url = request_func_input.api_url - assert api_url.endswith( - "embeddings" - ), "OpenAI Embeddings API URL must end with 'embeddings'." - - headers = { - "Content-Type": "application/json", - "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}", - } - - payload = { - "model": request_func_input.model, - "input": request_func_input.prompt, - } - - output = RequestFuncOutput() - st = time.perf_counter() - output.start_time = st - try: - async with session.post( - url=api_url, - headers=headers, - json=payload - ) as response: - if response.status == 200: - output.latency = time.perf_counter() - st - data = await response.json() - output.success = True - output.generated_text = "" - output.prompt_len = data.get( - "usage", {}).get( - "prompt_tokens", 0) - else: - output.success = False - output.error = response.reason or "" - except Exception as e: - output.success = False - output.error = str(e) - - if pbar: - pbar.update(1) - return output - - # TODO: Add more request functions for different API protocols. ASYNC_REQUEST_FUNCS: dict[str, RequestFunc] = { "vllm": async_request_openai_completions, diff --git a/vllm_omni/benchmarks/lib/ready_checker.py b/vllm_omni/benchmarks/lib/ready_checker.py deleted file mode 100644 index 40898864f..000000000 --- a/vllm_omni/benchmarks/lib/ready_checker.py +++ /dev/null @@ -1,71 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Utilities for checking endpoint readiness.""" - -import asyncio -import time -import aiohttp -from tqdm.asyncio import tqdm - -from .endpoint_request_func import (RequestFunc, RequestFuncInput, - RequestFuncOutput) - -async def wait_for_endpoint( - request_func: RequestFunc, - test_input: RequestFuncInput, - session: aiohttp.ClientSession, - timeout_seconds: int = 600, - retry_interval: int = 5, -) -> RequestFuncOutput: - """ - Wait for an endpoint to become available before starting benchmarks. 
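The removed embeddings helper simply POSTs a `model`/`input` payload to an OpenAI-compatible embeddings endpoint and reads the usage block. A minimal sketch of that request shape; the URL and model name are placeholders and an OpenAI-compatible server must be running for the call to succeed:

```python
import asyncio

import aiohttp

# Placeholders for illustration only.
API_URL = "http://localhost:8000/v1/embeddings"
PAYLOAD = {"model": "my-embedding-model", "input": "hello world"}


async def main() -> None:
    async with aiohttp.ClientSession() as session:
        async with session.post(API_URL, json=PAYLOAD) as response:
            if response.status == 200:
                data = await response.json()
                # The benchmark only needs the usage block, not the vectors.
                print(data.get("usage", {}).get("prompt_tokens", 0))
            else:
                print("request failed:", response.reason)


asyncio.run(main())
```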
- - Args: - request_func: The async request function to call - test_input: The RequestFuncInput to test with - timeout_seconds: Maximum time to wait in seconds (default: 10 minutes) - retry_interval: Time between retries in seconds (default: 5 seconds) - - Returns: - RequestFuncOutput: The successful response - - Raises: - ValueError: If the endpoint doesn't become available within the timeout - """ - deadline = time.perf_counter() + timeout_seconds - output = RequestFuncOutput(success=False) - print(f"Waiting for endpoint to become up in {timeout_seconds} seconds") - - with tqdm( - total=timeout_seconds, - bar_format="{desc} |{bar}| {elapsed} elapsed, {remaining} remaining", - unit="s", - ) as pbar: - - while True: - # update progress bar - remaining = deadline - time.perf_counter() - elapsed = timeout_seconds - remaining - update_amount = min(elapsed - pbar.n, timeout_seconds - pbar.n) - pbar.update(update_amount) - pbar.refresh() - if remaining <= 0: - pbar.close() - break - - # ping the endpoint using request_func - try: - output = await request_func( - request_func_input=test_input, session=session) - if output.success: - pbar.close() - return output - except aiohttp.ClientConnectorError: - pass - - # retry after a delay - sleep_duration = min(retry_interval, remaining) - if sleep_duration > 0: - await asyncio.sleep(sleep_duration) - - return output diff --git a/vllm_omni/benchmarks/lib/utils.py b/vllm_omni/benchmarks/lib/utils.py deleted file mode 100644 index 41a24ae64..000000000 --- a/vllm_omni/benchmarks/lib/utils.py +++ /dev/null @@ -1,78 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import argparse -import json -import math -import os -from typing import Any - -def convert_to_pytorch_benchmark_format(args: argparse.Namespace, - metrics: dict[str, list], - extra_info: dict[str, Any]) -> list: - """ - Save the benchmark results in the format used by PyTorch OSS benchmark with - on metric per record - https://github.com/pytorch/pytorch/wiki/How-to-integrate-with-PyTorch-OSS-benchmark-database - """ - records = [] - if not os.environ.get("SAVE_TO_PYTORCH_BENCHMARK_FORMAT", False): - return records - - for name, benchmark_values in metrics.items(): - record = { - "benchmark": { - "name": "vLLM benchmark", - "extra_info": { - "args": vars(args), - }, - }, - "model": { - "name": args.model, - }, - "metric": { - "name": name, - "benchmark_values": benchmark_values, - "extra_info": extra_info, - }, - } - - tp = record["benchmark"]["extra_info"]["args"].get( - "tensor_parallel_size") - # Save tensor_parallel_size parameter if it's part of the metadata - if not tp and "tensor_parallel_size" in extra_info: - record["benchmark"]["extra_info"]["args"][ - "tensor_parallel_size"] = extra_info["tensor_parallel_size"] - - records.append(record) - - return records - - -class InfEncoder(json.JSONEncoder): - - def clear_inf(self, o: Any): - if isinstance(o, dict): - return { - str(k) - if not isinstance(k, (str, int, float, bool, type(None))) - else k: self.clear_inf(v) - for k, v in o.items() - } - elif isinstance(o, list): - return [self.clear_inf(v) for v in o] - elif isinstance(o, float) and math.isinf(o): - return "inf" - return o - - def iterencode(self, o: Any, *args, **kwargs) -> Any: - return super().iterencode(self.clear_inf(o), *args, **kwargs) - - -def write_to_json(filename: str, records: list) -> None: - with open(filename, "w") as f: - json.dump( - records, - f, - cls=InfEncoder, - default=lambda o: 
f"<{type(o).__name__} is not JSON serializable>", - ) diff --git a/vllm_omni/benchmarks/serve.py b/vllm_omni/benchmarks/serve.py index 1175d2d28..8ba16378f 100644 --- a/vllm_omni/benchmarks/serve.py +++ b/vllm_omni/benchmarks/serve.py @@ -36,15 +36,17 @@ from tqdm.asyncio import tqdm from transformers import PreTrainedTokenizerBase -from vllm_omni.benchmarks.datasets import (SampleRequest, add_dataset_parser, - get_samples) +from vllm_omni.benchmarks.datasets import get_samples,add_dataset_parser + from vllm_omni.benchmarks.lib.endpoint_request_func import ( - ASYNC_REQUEST_FUNCS, OPENAI_COMPATIBLE_BACKENDS, RequestFuncInput, - RequestFuncOutput) -from vllm_omni.benchmarks.lib.ready_checker import wait_for_endpoint -from vllm_omni.benchmarks.lib.utils import (convert_to_pytorch_benchmark_format, + ASYNC_REQUEST_FUNCS, OPENAI_COMPATIBLE_BACKENDS) + +from vllm.benchmarks.datasets import SampleRequest +from vllm.lib.endpoint_request_func import RequestFuncInput,RequestFuncOutput +from vllm.benchmarks.lib.ready_checker import wait_for_endpoint +from vllm.benchmarks.lib.utils import (convert_to_pytorch_benchmark_format, write_to_json) -from vllm_omni.transformers_utils.tokenizer import get_tokenizer +from vllm.transformers_utils.tokenizer import get_tokenizer MILLISECONDS_TO_SECONDS_CONVERSION = 1000 diff --git a/vllm_omni/benchmarks/throughput.py b/vllm_omni/benchmarks/throughput.py deleted file mode 100644 index 5056db3db..000000000 --- a/vllm_omni/benchmarks/throughput.py +++ /dev/null @@ -1,696 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Benchmark offline inference throughput.""" -import argparse -import dataclasses -import json -import os -import random -import time -import warnings -from typing import Any, Optional, Union - -import torch -import uvloop -from tqdm import tqdm -from transformers import (AutoModelForCausalLM, AutoTokenizer, - PreTrainedTokenizerBase) - -from vllm_omni.benchmarks.datasets import (AIMODataset, BurstGPTDataset, - ConversationDataset, - InstructCoderDataset, - PrefixRepetitionRandomDataset, - RandomDataset, SampleRequest, - ShareGPTDataset, SonnetDataset, - VisionArenaDataset) -from vllm_omni.benchmarks.lib.utils import (convert_to_pytorch_benchmark_format, - write_to_json) -from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs -from vllm.inputs import TextPrompt, TokensPrompt -from vllm.lora.request import LoRARequest -from vllm.outputs import RequestOutput -from vllm.sampling_params import BeamSearchParams -from vllm.utils import merge_async_iterators - - -def run_vllm( - requests: list[SampleRequest], - n: int, - engine_args: EngineArgs, - do_profile: bool, - disable_detokenize: bool = False, -) -> tuple[float, Optional[list[RequestOutput]]]: - from vllm import LLM, SamplingParams - llm = LLM(**dataclasses.asdict(engine_args)) - assert all( - llm.llm_engine.model_config.max_model_len >= ( - request.prompt_len + request.expected_output_len) - for request in requests), ( - "Please ensure that max_model_len is greater than the sum of" - " prompt_len and expected_output_len for all requests.") - # Add the requests to the engine. 
- prompts: list[Union[TextPrompt, TokensPrompt]] = [] - sampling_params: list[SamplingParams] = [] - for request in requests: - prompts.append( - TokensPrompt(prompt_token_ids=request.prompt["prompt_token_ids"], - multi_modal_data=request.multi_modal_data) - if "prompt_token_ids" in request.prompt else \ - TextPrompt(prompt=request.prompt, - multi_modal_data=request.multi_modal_data)) - sampling_params.append( - SamplingParams( - n=n, - temperature=1.0, - top_p=1.0, - ignore_eos=True, - max_tokens=request.expected_output_len, - detokenize=not disable_detokenize, - )) - lora_requests: Optional[list[LoRARequest]] = None - if engine_args.enable_lora: - lora_requests = [request.lora_request for request in requests] - - use_beam_search = False - - outputs = None - if not use_beam_search: - start = time.perf_counter() - if do_profile: - llm.start_profile() - outputs = llm.generate(prompts, - sampling_params, - lora_request=lora_requests, - use_tqdm=True) - if do_profile: - llm.stop_profile() - end = time.perf_counter() - else: - assert lora_requests is None, "BeamSearch API does not support LoRA" - prompts = [request.prompt for request in requests] - # output_len should be the same for all requests. - output_len = requests[0].expected_output_len - for request in requests: - assert request.expected_output_len == output_len - start = time.perf_counter() - if do_profile: - llm.start_profile() - llm.beam_search( - prompts, - BeamSearchParams( - beam_width=n, - max_tokens=output_len, - ignore_eos=True, - )) - if do_profile: - llm.stop_profile() - end = time.perf_counter() - return end - start, outputs - - -def run_vllm_chat( - requests: list[SampleRequest], - n: int, - engine_args: EngineArgs, - do_profile: bool, - disable_detokenize: bool = False) -> tuple[float, list[RequestOutput]]: - """ - Run vLLM chat benchmark. This function is recommended ONLY for benchmarking - multimodal models as it properly handles multimodal inputs and chat - formatting. For non-multimodal models, use run_vllm() instead. 
- """ - from vllm import LLM, SamplingParams - llm = LLM(**dataclasses.asdict(engine_args)) - - assert all( - llm.llm_engine.model_config.max_model_len >= ( - request.prompt_len + request.expected_output_len) - for request in requests), ( - "Please ensure that max_model_len is greater than the sum of " - "prompt_len and expected_output_len for all requests.") - - prompts = [] - sampling_params: list[SamplingParams] = [] - for request in requests: - prompts.append(request.prompt) - sampling_params.append( - SamplingParams( - n=n, - temperature=1.0, - top_p=1.0, - ignore_eos=True, - max_tokens=request.expected_output_len, - detokenize=not disable_detokenize, - )) - start = time.perf_counter() - if do_profile: - llm.start_profile() - outputs = llm.chat(prompts, sampling_params, use_tqdm=True) - if do_profile: - llm.stop_profile() - end = time.perf_counter() - return end - start, outputs - - -async def run_vllm_async( - requests: list[SampleRequest], - n: int, - engine_args: AsyncEngineArgs, - do_profile: bool, - disable_frontend_multiprocessing: bool = False, - disable_detokenize: bool = False, -) -> float: - from vllm import SamplingParams - from vllm.entrypoints.openai.api_server import ( - build_async_engine_client_from_engine_args) - - async with build_async_engine_client_from_engine_args( - engine_args, - disable_frontend_multiprocessing=disable_frontend_multiprocessing, - ) as llm: - model_config = await llm.get_model_config() - assert all( - model_config.max_model_len >= (request.prompt_len + - request.expected_output_len) - for request in requests), ( - "Please ensure that max_model_len is greater than the sum of" - " prompt_len and expected_output_len for all requests.") - - # Add the requests to the engine. - prompts: list[Union[TextPrompt, TokensPrompt]] = [] - sampling_params: list[SamplingParams] = [] - lora_requests: list[Optional[LoRARequest]] = [] - for request in requests: - prompts.append( - TokensPrompt(prompt_token_ids=request.prompt["prompt_token_ids"], - multi_modal_data=request.multi_modal_data) - if "prompt_token_ids" in request.prompt else \ - TextPrompt(prompt=request.prompt, - multi_modal_data=request.multi_modal_data)) - sampling_params.append( - SamplingParams( - n=n, - temperature=1.0, - top_p=1.0, - ignore_eos=True, - max_tokens=request.expected_output_len, - detokenize=not disable_detokenize, - )) - lora_requests.append(request.lora_request) - - generators = [] - start = time.perf_counter() - if do_profile: - await llm.start_profile() - for i, (prompt, sp, - lr) in enumerate(zip(prompts, sampling_params, lora_requests)): - generator = llm.generate(prompt, - sp, - lora_request=lr, - request_id=f"test{i}") - generators.append(generator) - all_gens = merge_async_iterators(*generators) - async for i, res in all_gens: - pass - if do_profile: - await llm.stop_profile() - end = time.perf_counter() - return end - start - - -def run_hf( - requests: list[SampleRequest], - model: str, - tokenizer: PreTrainedTokenizerBase, - n: int, - max_batch_size: int, - trust_remote_code: bool, - disable_detokenize: bool = False, -) -> float: - llm = AutoModelForCausalLM.from_pretrained( - model, torch_dtype=torch.float16, trust_remote_code=trust_remote_code) - if llm.config.model_type == "llama": - # To enable padding in the HF backend. 
- tokenizer.pad_token = tokenizer.eos_token - llm = llm.cuda() - - pbar = tqdm(total=len(requests)) - start = time.perf_counter() - batch: list[str] = [] - max_prompt_len = 0 - max_output_len = 0 - for i in range(len(requests)): - prompt = requests[i].prompt - prompt_len = requests[i].prompt_len - output_len = requests[i].expected_output_len - # Add the prompt to the batch. - batch.append(prompt) - max_prompt_len = max(max_prompt_len, prompt_len) - max_output_len = max(max_output_len, output_len) - if len(batch) < max_batch_size and i != len(requests) - 1: - # Check if we can add more requests to the batch. - next_prompt_len = requests[i + 1].prompt_len - next_output_len = requests[i + 1].expected_output_len - if (max(max_prompt_len, next_prompt_len) + - max(max_output_len, next_output_len)) <= 2048: - # We can add more requests to the batch. - continue - - # Generate the sequences. - input_ids = tokenizer(batch, return_tensors="pt", - padding=True).input_ids - llm_outputs = llm.generate( - input_ids=input_ids.cuda(), - do_sample=True, - num_return_sequences=n, - temperature=1.0, - top_p=1.0, - use_cache=True, - max_new_tokens=max_output_len, - ) - if not disable_detokenize: - # Include the decoding time. - tokenizer.batch_decode(llm_outputs, skip_special_tokens=True) - pbar.update(len(batch)) - - # Clear the batch. - batch = [] - max_prompt_len = 0 - max_output_len = 0 - end = time.perf_counter() - return end - start - - -def save_to_pytorch_benchmark_format(args: argparse.Namespace, - results: dict[str, Any]) -> None: - pt_records = convert_to_pytorch_benchmark_format( - args=args, - metrics={ - "requests_per_second": [results["requests_per_second"]], - "tokens_per_second": [results["tokens_per_second"]], - }, - extra_info={ - k: results[k] - for k in ["elapsed_time", "num_requests", "total_num_tokens"] - }) - if pt_records: - # Don't use json suffix here as we don't want CI to pick it up - pt_file = f"{os.path.splitext(args.output_json)[0]}.pytorch.json" - write_to_json(pt_file, pt_records) - - -def get_requests(args, tokenizer): - # Common parameters for all dataset types. 
- common_kwargs = { - "dataset_path": args.dataset_path, - "random_seed": args.seed, - } - sample_kwargs = { - "tokenizer": tokenizer, - "lora_path": args.lora_path, - "max_loras": args.max_loras, - "num_requests": args.num_prompts, - "input_len": args.input_len, - "output_len": args.output_len, - } - - if args.dataset_path is None or args.dataset_name == "random": - sample_kwargs["range_ratio"] = args.random_range_ratio - sample_kwargs["prefix_len"] = args.prefix_len - dataset_cls = RandomDataset - elif args.dataset_name == "sharegpt": - dataset_cls = ShareGPTDataset - if args.backend == "vllm-chat": - sample_kwargs["enable_multimodal_chat"] = True - elif args.dataset_name == "sonnet": - assert tokenizer.chat_template or tokenizer.default_chat_template, ( - "Tokenizer/model must have chat template for sonnet dataset.") - dataset_cls = SonnetDataset - sample_kwargs["prefix_len"] = args.prefix_len - sample_kwargs["return_prompt_formatted"] = True - elif args.dataset_name == "burstgpt": - dataset_cls = BurstGPTDataset - elif args.dataset_name == "hf": - if args.dataset_path in VisionArenaDataset.SUPPORTED_DATASET_PATHS: - dataset_cls = VisionArenaDataset - common_kwargs['dataset_subset'] = None - common_kwargs['dataset_split'] = "train" - sample_kwargs["enable_multimodal_chat"] = True - elif args.dataset_path in InstructCoderDataset.SUPPORTED_DATASET_PATHS: - dataset_cls = InstructCoderDataset - common_kwargs['dataset_split'] = "train" - elif args.dataset_path in ConversationDataset.SUPPORTED_DATASET_PATHS: - dataset_cls = ConversationDataset - common_kwargs['dataset_subset'] = args.hf_subset - common_kwargs['dataset_split'] = args.hf_split - sample_kwargs["enable_multimodal_chat"] = True - elif args.dataset_path in AIMODataset.SUPPORTED_DATASET_PATHS: - dataset_cls = AIMODataset - common_kwargs['dataset_subset'] = None - common_kwargs['dataset_split'] = "train" - elif args.dataset_name == "prefix_repetition": - dataset_cls = PrefixRepetitionRandomDataset - sample_kwargs["prefix_len"] = args.prefix_repetition_prefix_len - sample_kwargs["suffix_len"] = args.prefix_repetition_suffix_len - sample_kwargs["num_prefixes"] = args.prefix_repetition_num_prefixes - sample_kwargs["output_len"] = args.prefix_repetition_output_len - else: - raise ValueError(f"Unknown dataset name: {args.dataset_name}") - # Remove None values - sample_kwargs = {k: v for k, v in sample_kwargs.items() if v is not None} - return dataset_cls(**common_kwargs).sample(**sample_kwargs) - - -def validate_args(args): - """ - Validate command-line arguments. - """ - - # === Deprecation and Defaulting === - if args.dataset is not None: - warnings.warn( - "The '--dataset' argument will be deprecated in the next release. 
" - "Please use '--dataset-name' and '--dataset-path' instead.", - stacklevel=2) - args.dataset_path = args.dataset - - if not getattr(args, "tokenizer", None): - args.tokenizer = args.model - - # === Backend Validation === - valid_backends = {"vllm", "hf", "mii", "vllm-chat"} - if args.backend not in valid_backends: - raise ValueError(f"Unsupported backend: {args.backend}") - - # === Dataset Configuration === - if ( - not args.dataset - and not args.dataset_path - and args.dataset_name not in {"prefix_repetition"} - ): - print( - "When dataset path is not set, it will default to random dataset") - args.dataset_name = 'random' - if args.input_len is None: - raise ValueError("input_len must be provided for a random dataset") - - # === Dataset Name Specific Checks === - # --hf-subset and --hf-split: only used - # when dataset_name is 'hf' - if args.dataset_name != "hf" and ( - getattr(args, "hf_subset", None) is not None - or getattr(args, "hf_split", None) is not None): - warnings.warn("--hf-subset and --hf-split will be ignored \ - since --dataset-name is not 'hf'.", - stacklevel=2) - elif args.dataset_name == "hf": - if args.dataset_path in ( - VisionArenaDataset.SUPPORTED_DATASET_PATHS.keys() - | ConversationDataset.SUPPORTED_DATASET_PATHS): - assert args.backend == "vllm-chat", f"{args.dataset_path} needs to use vllm-chat as the backend." #noqa: E501 - elif args.dataset_path in (InstructCoderDataset.SUPPORTED_DATASET_PATHS - | AIMODataset.SUPPORTED_DATASET_PATHS): - assert args.backend == "vllm", f"{args.dataset_path} needs to use vllm as the backend." #noqa: E501 - else: - raise ValueError( - f"{args.dataset_path} is not supported by hf dataset.") - - # --random-range-ratio: only used when dataset_name is 'random' - if args.dataset_name != 'random' and args.random_range_ratio is not None: - warnings.warn("--random-range-ratio will be ignored since \ - --dataset-name is not 'random'.", - stacklevel=2) - - # --prefix-len: only used when dataset_name is 'random', 'sonnet', or not - # set. - if args.dataset_name not in {"random", "sonnet", None - } and args.prefix_len is not None: - warnings.warn("--prefix-len will be ignored since --dataset-name\ - is not 'random', 'sonnet', or not set.", - stacklevel=2) - - # === LoRA Settings === - if getattr(args, "enable_lora", False) and args.backend != "vllm": - raise ValueError( - "LoRA benchmarking is only supported for vLLM backend") - if getattr(args, "enable_lora", False) and args.lora_path is None: - raise ValueError("LoRA path must be provided when enable_lora is True") - - # === Backend-specific Validations === - if args.backend == "hf" and args.hf_max_batch_size is None: - raise ValueError("HF max batch size is required for HF backend") - if args.backend != "hf" and args.hf_max_batch_size is not None: - raise ValueError("HF max batch size is only for HF backend.") - - if args.backend in {"hf", "mii"} and getattr(args, "quantization", - None) is not None: - raise ValueError("Quantization is only for vLLM backend.") - - if args.backend == "mii" and args.dtype != "auto": - raise ValueError("dtype must be auto for MII backend.") - if args.backend == "mii" and args.n != 1: - raise ValueError("n must be 1 for MII backend.") - if args.backend == "mii" and args.tokenizer != args.model: - raise ValueError( - "Tokenizer must be the same as the model for MII backend.") - - # --data-parallel is not supported currently. 
- # https://github.com/vllm-project/vllm/issues/16222 - if args.data_parallel_size > 1: - raise ValueError( - "Data parallel is not supported in offline benchmark, " - "please use benchmark serving instead" - ) - - -def add_cli_args(parser: argparse.ArgumentParser): - parser.add_argument("--backend", - type=str, - choices=["vllm", "hf", "mii", "vllm-chat"], - default="vllm") - parser.add_argument( - "--dataset-name", - type=str, - choices=[ - "sharegpt", "random", "sonnet", "burstgpt", "hf", - "prefix_repetition" - ], - help="Name of the dataset to benchmark on.", - default="sharegpt") - parser.add_argument( - "--dataset", - type=str, - default=None, - help="Path to the ShareGPT dataset, will be deprecated in\ - the next release. The dataset is expected to " - "be a json in form of list[dict[..., conversations: " - "list[dict[..., value: ]]]]") - parser.add_argument("--dataset-path", - type=str, - default=None, - help="Path to the dataset") - parser.add_argument("--input-len", - type=int, - default=None, - help="Input prompt length for each request") - parser.add_argument("--output-len", - type=int, - default=None, - help="Output length for each request. Overrides the " - "output length from the dataset.") - parser.add_argument("--n", - type=int, - default=1, - help="Number of generated sequences per prompt.") - parser.add_argument("--num-prompts", - type=int, - default=1000, - help="Number of prompts to process.") - parser.add_argument("--hf-max-batch-size", - type=int, - default=None, - help="Maximum batch size for HF backend.") - parser.add_argument( - '--output-json', - type=str, - default=None, - help='Path to save the throughput results in JSON format.') - parser.add_argument("--async-engine", - action='store_true', - default=False, - help="Use vLLM async engine rather than LLM class.") - parser.add_argument("--disable-frontend-multiprocessing", - action='store_true', - default=False, - help="Disable decoupled async engine frontend.") - parser.add_argument( - "--disable-detokenize", - action="store_true", - help=("Do not detokenize the response (i.e. do not include " - "detokenization time in the measurement)")) - # LoRA - parser.add_argument( - "--lora-path", - type=str, - default=None, - help="Path to the lora adapters to use. This can be an absolute path, " - "a relative path, or a Hugging Face model identifier.") - parser.add_argument( - "--prefix-len", - type=int, - default=0, - help="Number of fixed prefix tokens before the random " - "context in a request (default: 0).", - ) - # random dataset - parser.add_argument( - "--random-range-ratio", - type=float, - default=0.0, - help="Range ratio for sampling input/output length, " - "used only for RandomDataset. Must be in the range [0, 1) to define " - "a symmetric sampling range " - "[length * (1 - range_ratio), length * (1 + range_ratio)].", - ) - - # hf dtaset - parser.add_argument("--hf-subset", - type=str, - default=None, - help="Subset of the HF dataset.") - parser.add_argument("--hf-split", - type=str, - default=None, - help="Split of the HF dataset.") - parser.add_argument( - "--profile", - action="store_true", - default=False, - help="Use Torch Profiler. 
The env variable " - "VLLM_TORCH_PROFILER_DIR must be set to enable profiler.") - - # prefix repetition dataset - prefix_repetition_group = parser.add_argument_group( - "prefix repetition dataset options") - prefix_repetition_group.add_argument( - "--prefix-repetition-prefix-len", - type=int, - default=None, - help="Number of prefix tokens per request, used only for prefix " - "repetition dataset.", - ) - prefix_repetition_group.add_argument( - "--prefix-repetition-suffix-len", - type=int, - default=None, - help="Number of suffix tokens per request, used only for prefix " - "repetition dataset. Total input length is prefix_len + suffix_len.", - ) - prefix_repetition_group.add_argument( - "--prefix-repetition-num-prefixes", - type=int, - default=None, - help="Number of prefixes to generate, used only for prefix repetition " - "dataset. Prompts per prefix is num_requests // num_prefixes.", - ) - prefix_repetition_group.add_argument( - "--prefix-repetition-output-len", - type=int, - default=None, - help="Number of output tokens per request, used only for prefix " - "repetition dataset.", - ) - - parser = AsyncEngineArgs.add_cli_args(parser) - - -def main(args: argparse.Namespace): - if args.tokenizer is None: - args.tokenizer = args.model - validate_args(args) - if args.seed is None: - args.seed = 0 - random.seed(args.seed) - # Sample the requests. - tokenizer = AutoTokenizer.from_pretrained( - args.tokenizer, trust_remote_code=args.trust_remote_code) - requests = get_requests(args, tokenizer) - is_multi_modal = any(request.multi_modal_data is not None - for request in requests) - request_outputs: Optional[list[RequestOutput]] = None - if args.backend == "vllm": - if args.async_engine: - elapsed_time = uvloop.run( - run_vllm_async( - requests, - args.n, - AsyncEngineArgs.from_cli_args(args), - disable_frontend_multiprocessing=args.disable_frontend_multiprocessing, - disable_detokenize=args.disable_detokenize, - do_profile=args.profile, - )) - else: - elapsed_time, request_outputs = run_vllm( - requests, args.n, EngineArgs.from_cli_args(args), - disable_detokenize=args.disable_detokenize, - do_profile=args.profile) - elif args.backend == "hf": - assert args.tensor_parallel_size == 1 - if args.profile: - raise NotImplementedError( - "Profiling not implemented yet for backend='hf'.") - elapsed_time = run_hf(requests, args.model, tokenizer, args.n, - args.hf_max_batch_size, args.trust_remote_code, - args.disable_detokenize) - elif args.backend == "vllm-chat": - elapsed_time, request_outputs = run_vllm_chat( - requests, args.n, EngineArgs.from_cli_args(args), - disable_detokenize=args.disable_detokenize, do_profile=args.profile) - else: - raise ValueError(f"Unknown backend: {args.backend}") - - if request_outputs: - # Note: with the vllm and vllm-chat backends, - # we have request_outputs, which we use to count tokens. 
- total_prompt_tokens = 0 - total_output_tokens = 0 - for ro in request_outputs: - if not isinstance(ro, RequestOutput): - continue - total_prompt_tokens += len( - ro.prompt_token_ids) if ro.prompt_token_ids else 0 - total_output_tokens += sum( - len(o.token_ids) for o in ro.outputs if o) - total_num_tokens = total_prompt_tokens + total_output_tokens - else: - total_num_tokens = sum(r.prompt_len + r.expected_output_len - for r in requests) - total_output_tokens = sum(r.expected_output_len for r in requests) - total_prompt_tokens = total_num_tokens - total_output_tokens - - if is_multi_modal and args.backend != "vllm-chat": - print("\033[91mWARNING\033[0m: Multi-modal request with " - f"{args.backend} backend detected. The " - "following metrics are not accurate because image tokens are not" - " counted. See vllm-project/vllm/issues/9778 for details.") - # TODO(vllm-project/vllm/issues/9778): Count multi-modal token length. - # vllm-chat backend counts the image tokens now - - print(f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, " - f"{total_num_tokens / elapsed_time:.2f} total tokens/s, " - f"{total_output_tokens / elapsed_time:.2f} output tokens/s") - print(f"Total num prompt tokens: {total_prompt_tokens}") - print(f"Total num output tokens: {total_output_tokens}") - - # Output JSON results if specified - if args.output_json: - results = { - "elapsed_time": elapsed_time, - "num_requests": len(requests), - "total_num_tokens": total_num_tokens, - "requests_per_second": len(requests) / elapsed_time, - "tokens_per_second": total_num_tokens / elapsed_time, - } - with open(args.output_json, "w") as f: - json.dump(results, f, indent=4) - save_to_pytorch_benchmark_format(args, results) diff --git a/vllm_omni/entrypoints/cli/__init__.py b/vllm_omni/entrypoints/cli/__init__.py index b233a71e6..605b9cc7f 100644 --- a/vllm_omni/entrypoints/cli/__init__.py +++ b/vllm_omni/entrypoints/cli/__init__.py @@ -1,5 +1,6 @@ """CLI helpers for vLLM-Omni entrypoints.""" from .serve import OmniServeCommand +from vllm_omni.entrypoints.cli.benchmark.serve import OmniBenchmarkServingSubcommand -__all__ = ["OmniServeCommand"] +__all__ = ["OmniServeCommand", "OmniBenchmarkServingSubcommand"] diff --git a/vllm_omni/entrypoints/cli/benchmark/__init__.py b/vllm_omni/entrypoints/cli/benchmark/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/vllm_omni/entrypoints/cli/benchmark/serve.py b/vllm_omni/entrypoints/cli/benchmark/serve.py new file mode 100644 index 000000000..856923077 --- /dev/null +++ b/vllm_omni/entrypoints/cli/benchmark/serve.py @@ -0,0 +1,25 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import argparse + +from vllm_omni.benchmarks.serve import add_cli_args, main +from vllm.entrypoints.cli.benchmark.base import BenchmarkSubcommandBase + + +class OmniBenchmarkServingSubcommand(BenchmarkSubcommandBase): + """ The `serve` subcommand for vllm bench. """ + name = "serve" + help = "Benchmark the online serving throughput." 
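# --- Editorial usage sketch (not part of this patch) -------------------------
# Once OmniBenchmarkServingSubcommand is registered with the vLLM-Omni CLI,
# this benchmark is expected to be driven as `vllm-omni bench serve [options]`,
# mirroring the invocation used by the e2e test later in this series. The model
# id and dataset flags below are illustrative assumptions following the
# upstream `vllm bench serve` interface; they are not taken from the patch.
import subprocess

subprocess.run(
    [
        "vllm-omni", "bench", "serve",
        "--model", "Qwen/Qwen2.5-Omni-7B",  # assumed model id
        "--host", "127.0.0.1",
        "--port", "8000",
        "--dataset-name", "random",  # assumed dataset flags
        "--num-prompts", "8",
    ],
    check=True,
)
# -----------------------------------------------------------------------------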
+ + @classmethod + def add_cli_args(cls, parser: argparse.ArgumentParser) -> None: + add_cli_args(parser) + parser.add_argument( + "--omni", + action="store_true", + help="Enable vLLM-benchmark-Omni mode" + ) + + @staticmethod + def cmd(args: argparse.Namespace) -> None: + main(args) From 4ac263a69e27dd1efe57508e6f2c049c6d4f4329 Mon Sep 17 00:00:00 2001 From: wangyu31577 Date: Wed, 17 Dec 2025 23:03:49 +0800 Subject: [PATCH 14/26] =?UTF-8?q?=E6=96=B0=E5=A2=9Ebenchmark?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- vllm_omni/entrypoints/cli/benchmark/main.py | 55 ++++++++++++++++++++ vllm_omni/entrypoints/cli/benchmark/serve.py | 5 -- vllm_omni/entrypoints/cli/main.py | 2 + 3 files changed, 57 insertions(+), 5 deletions(-) create mode 100644 vllm_omni/entrypoints/cli/benchmark/main.py diff --git a/vllm_omni/entrypoints/cli/benchmark/main.py b/vllm_omni/entrypoints/cli/benchmark/main.py new file mode 100644 index 000000000..ffe99ed98 --- /dev/null +++ b/vllm_omni/entrypoints/cli/benchmark/main.py @@ -0,0 +1,55 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from __future__ import annotations + +import argparse +import typing + +from vllm.entrypoints.cli.benchmark.base import BenchmarkSubcommandBase +from vllm.entrypoints.cli.types import CLISubcommand +from vllm.entrypoints.utils import VLLM_SUBCMD_PARSER_EPILOG + +if typing.TYPE_CHECKING: + from vllm.utils import FlexibleArgumentParser + + +class OmniBenchmarkSubcommand(CLISubcommand): + """ The `bench` subcommand for the vLLM CLI. """ + + name = "bench" + help = "vLLM bench subcommand." + + @staticmethod + def cmd(args: argparse.Namespace) -> None: + args.dispatch_function(args) + + def validate(self, args: argparse.Namespace) -> None: + pass + + def subparser_init( + self, + subparsers: argparse._SubParsersAction) -> FlexibleArgumentParser: + bench_parser = subparsers.add_parser( + self.name, + description=self.help, + usage=f"vllm {self.name} [options]") + bench_subparsers = bench_parser.add_subparsers(required=True, + dest="bench_type") + + for cmd_cls in BenchmarkSubcommandBase.__subclasses__(): + cmd_subparser = bench_subparsers.add_parser( + cmd_cls.name, + help=cmd_cls.help, + description=cmd_cls.help, + usage=f"vllm {self.name} {cmd_cls.name} [options]", + ) + cmd_subparser.set_defaults(dispatch_function=cmd_cls.cmd) + cmd_cls.add_cli_args(cmd_subparser) + cmd_subparser.epilog = VLLM_SUBCMD_PARSER_EPILOG.format( + subcmd=f"{self.name} {cmd_cls.name}") + return bench_parser + + +def cmd_init() -> list[CLISubcommand]: + return [OmniBenchmarkSubcommand()] diff --git a/vllm_omni/entrypoints/cli/benchmark/serve.py b/vllm_omni/entrypoints/cli/benchmark/serve.py index 856923077..61a366e1c 100644 --- a/vllm_omni/entrypoints/cli/benchmark/serve.py +++ b/vllm_omni/entrypoints/cli/benchmark/serve.py @@ -14,11 +14,6 @@ class OmniBenchmarkServingSubcommand(BenchmarkSubcommandBase): @classmethod def add_cli_args(cls, parser: argparse.ArgumentParser) -> None: add_cli_args(parser) - parser.add_argument( - "--omni", - action="store_true", - help="Enable vLLM-benchmark-Omni mode" - ) @staticmethod def cmd(args: argparse.Namespace) -> None: diff --git a/vllm_omni/entrypoints/cli/main.py b/vllm_omni/entrypoints/cli/main.py index 9647484f8..857fa4aaa 100644 --- a/vllm_omni/entrypoints/cli/main.py +++ b/vllm_omni/entrypoints/cli/main.py @@ -19,9 +19,11 @@ def main(): from vllm.utils import FlexibleArgumentParser import 
vllm_omni.entrypoints.cli.serve + import vllm_omni.entrypoints.cli.benchmark.main CMD_MODULES = [ vllm_omni.entrypoints.cli.serve, + vllm_omni.entrypoints.cli.benchmark.main, ] cli_env_setup() From 1bb540783947372e1ff6e06f6a5854bc1120ddc1 Mon Sep 17 00:00:00 2001 From: wangyu31577 Date: Thu, 18 Dec 2025 09:34:43 +0800 Subject: [PATCH 15/26] =?UTF-8?q?=E6=96=B0=E5=A2=9Ebenchmark?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- vllm_omni/benchmarks/datasets.py | 2 +- vllm_omni/benchmarks/serve.py | 2 +- vllm_omni/entrypoints/cli/benchmark/base.py | 25 ++++++++++++++++++++ vllm_omni/entrypoints/cli/benchmark/main.py | 4 ++-- vllm_omni/entrypoints/cli/benchmark/serve.py | 4 ++-- 5 files changed, 31 insertions(+), 6 deletions(-) create mode 100644 vllm_omni/entrypoints/cli/benchmark/base.py diff --git a/vllm_omni/benchmarks/datasets.py b/vllm_omni/benchmarks/datasets.py index 86a3a0715..a698ae9d5 100644 --- a/vllm_omni/benchmarks/datasets.py +++ b/vllm_omni/benchmarks/datasets.py @@ -29,7 +29,7 @@ import torchaudio from PIL import Image from transformers import PreTrainedTokenizerBase -from vllm.benchmark.datasets import (RandomDataset, ShareGPTDataset, SpecBench, +from vllm.benchmarks.datasets import (RandomDataset, ShareGPTDataset, SpecBench, SonnetDataset, BurstGPTDataset, ConversationDataset, VisionArenaDataset, MMVUDataset, InstructCoderDataset, MTBenchDataset, BlazeditDataset, AIMODataset, NextEditPredictionDataset, ASRDataset, MLPerfDataset, diff --git a/vllm_omni/benchmarks/serve.py b/vllm_omni/benchmarks/serve.py index 8ba16378f..38397de60 100644 --- a/vllm_omni/benchmarks/serve.py +++ b/vllm_omni/benchmarks/serve.py @@ -42,7 +42,7 @@ ASYNC_REQUEST_FUNCS, OPENAI_COMPATIBLE_BACKENDS) from vllm.benchmarks.datasets import SampleRequest -from vllm.lib.endpoint_request_func import RequestFuncInput,RequestFuncOutput +from vllm.benchmarks.lib.endpoint_request_func import RequestFuncInput,RequestFuncOutput from vllm.benchmarks.lib.ready_checker import wait_for_endpoint from vllm.benchmarks.lib.utils import (convert_to_pytorch_benchmark_format, write_to_json) diff --git a/vllm_omni/entrypoints/cli/benchmark/base.py b/vllm_omni/entrypoints/cli/benchmark/base.py new file mode 100644 index 000000000..b12e0fe02 --- /dev/null +++ b/vllm_omni/entrypoints/cli/benchmark/base.py @@ -0,0 +1,25 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import argparse + +from vllm.entrypoints.cli.types import CLISubcommand + + +class OmniBenchmarkSubcommandBase(CLISubcommand): + """ The base class of subcommands for vllm bench. """ + + help: str + + @classmethod + def add_cli_args(cls, parser: argparse.ArgumentParser) -> None: + """Add the CLI arguments to the parser.""" + raise NotImplementedError + + @staticmethod + def cmd(args: argparse.Namespace) -> None: + """Run the benchmark. + + Args: + args: The arguments to the command. 
+ """ + raise NotImplementedError diff --git a/vllm_omni/entrypoints/cli/benchmark/main.py b/vllm_omni/entrypoints/cli/benchmark/main.py index ffe99ed98..3392473ff 100644 --- a/vllm_omni/entrypoints/cli/benchmark/main.py +++ b/vllm_omni/entrypoints/cli/benchmark/main.py @@ -6,7 +6,7 @@ import argparse import typing -from vllm.entrypoints.cli.benchmark.base import BenchmarkSubcommandBase +from vllm_omni.entrypoints.cli.benchmark.base import OmniBenchmarkSubcommandBase from vllm.entrypoints.cli.types import CLISubcommand from vllm.entrypoints.utils import VLLM_SUBCMD_PARSER_EPILOG @@ -37,7 +37,7 @@ def subparser_init( bench_subparsers = bench_parser.add_subparsers(required=True, dest="bench_type") - for cmd_cls in BenchmarkSubcommandBase.__subclasses__(): + for cmd_cls in OmniBenchmarkSubcommandBase.__subclasses__(): cmd_subparser = bench_subparsers.add_parser( cmd_cls.name, help=cmd_cls.help, diff --git a/vllm_omni/entrypoints/cli/benchmark/serve.py b/vllm_omni/entrypoints/cli/benchmark/serve.py index 61a366e1c..e69c9f77a 100644 --- a/vllm_omni/entrypoints/cli/benchmark/serve.py +++ b/vllm_omni/entrypoints/cli/benchmark/serve.py @@ -3,10 +3,10 @@ import argparse from vllm_omni.benchmarks.serve import add_cli_args, main -from vllm.entrypoints.cli.benchmark.base import BenchmarkSubcommandBase +from vllm_omni.entrypoints.cli.benchmark.base import OmniBenchmarkSubcommandBase -class OmniBenchmarkServingSubcommand(BenchmarkSubcommandBase): +class OmniBenchmarkServingSubcommand(OmniBenchmarkSubcommandBase): """ The `serve` subcommand for vllm bench. """ name = "serve" help = "Benchmark the online serving throughput." From e8d26fc2513270768fc36ab90057c55c7f831892 Mon Sep 17 00:00:00 2001 From: wangyu31577 Date: Thu, 18 Dec 2025 14:07:24 +0800 Subject: [PATCH 16/26] =?UTF-8?q?=E6=96=B0=E5=A2=9Ebenchmark?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../benchmarks/lib/endpoint_request_func.py | 60 ++--- vllm_omni/benchmarks/serve.py | 234 ++---------------- 2 files changed, 34 insertions(+), 260 deletions(-) diff --git a/vllm_omni/benchmarks/lib/endpoint_request_func.py b/vllm_omni/benchmarks/lib/endpoint_request_func.py index 1e6d5a7d7..7603dbdc8 100644 --- a/vllm_omni/benchmarks/lib/endpoint_request_func.py +++ b/vllm_omni/benchmarks/lib/endpoint_request_func.py @@ -20,6 +20,9 @@ AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60) +@dataclass +class MixRequestFuncOutput(RequestFuncOutput): + output_audio_num: int = None async def async_request_openai_chat_completions( request_func_input: RequestFuncInput, @@ -72,60 +75,27 @@ async def async_request_openai_chat_completions( if request_func_input.request_id: headers["x-request-id"] = request_func_input.request_id - output = RequestFuncOutput() + output = MixRequestFuncOutput() output.prompt_len = request_func_input.prompt_len - generated_text = "" - ttft = 0.0 + output.ttft = 0.0 st = time.perf_counter() output.start_time = st - most_recent_timestamp = st + output.output_audio_num = 0 try: async with session.post(url=api_url, json=payload, headers=headers) as response: if response.status == 200: - handler = StreamedResponseHandler() - async for chunk_bytes in response.content.iter_any(): - chunk_bytes = chunk_bytes.strip() - if not chunk_bytes: - continue - - messages = handler.add_chunk(chunk_bytes) - for message in messages: - # NOTE: SSE comments (often used as pings) start with - # a colon. These are not JSON data payload and should - # be skipped. 
- if message.startswith(":"): - continue - - chunk = message.removeprefix("data: ") - - if chunk != "[DONE]": - timestamp = time.perf_counter() - data = json.loads(chunk) - - if choices := data.get("choices"): - content = choices[0]["delta"].get("content") - # First token - if ttft == 0.0: - ttft = timestamp - st - output.ttft = ttft - - # Decoding phase - else: - output.itl.append(timestamp - - most_recent_timestamp) - - generated_text += content or "" - elif usage := data.get("usage"): - output.output_tokens = usage.get( - "completion_tokens") - - most_recent_timestamp = timestamp - - output.generated_text = generated_text + data = await response.json() + choices = data.get("choices") + for choice in choices: + content = choice["message"].get("content") + output.generated_text += content or "" + if choice["message"].get("audio"): + output.output_audio_num += 1 + output.output_tokens = 0 output.success = True - output.latency = most_recent_timestamp - st + output.latency = time.perf_counter() - st else: output.error = response.reason or "" output.success = False diff --git a/vllm_omni/benchmarks/serve.py b/vllm_omni/benchmarks/serve.py index 38397de60..12db540bb 100644 --- a/vllm_omni/benchmarks/serve.py +++ b/vllm_omni/benchmarks/serve.py @@ -40,9 +40,13 @@ from vllm_omni.benchmarks.lib.endpoint_request_func import ( ASYNC_REQUEST_FUNCS, OPENAI_COMPATIBLE_BACKENDS) +from vllm_omni.benchmarks.lib.endpoint_request_func import MixRequestFuncOutput from vllm.benchmarks.datasets import SampleRequest +from vllm.benchmarks.serve import (BenchmarkMetrics,EmbedBenchmarkMetrics,DeprecatedEndpointTypeAction, + TaskType,get_request,check_goodput_args,save_to_pytorch_benchmark_format) from vllm.benchmarks.lib.endpoint_request_func import RequestFuncInput,RequestFuncOutput + from vllm.benchmarks.lib.ready_checker import wait_for_endpoint from vllm.benchmarks.lib.utils import (convert_to_pytorch_benchmark_format, write_to_json) @@ -54,69 +58,11 @@ and (shutil.which("gnuplot") is not None)) -# TODO: Remove this in v0.11.0 -class DeprecatedEndpointTypeAction(argparse.Action): - """Argparse action for the deprecated --endpoint-type flag. - """ - - def __call__(self, _, namespace, values, option_string=None): - warnings.warn( - "'--endpoint-type' is deprecated and will be removed in v0.11.0. " - "Please use '--backend' instead or remove this argument if you " - "have already set it.", - stacklevel=1, - ) - setattr(namespace, self.dest, values) - - -class TaskType(Enum): - GENERATION = "generation" - EMBEDDING = "embedding" - @dataclass -class BenchmarkMetrics: - completed: int - total_input: int - total_output: int - request_throughput: float - request_goodput: float - output_throughput: float - total_token_throughput: float - mean_ttft_ms: float - median_ttft_ms: float - std_ttft_ms: float - percentiles_ttft_ms: list[tuple[float, float]] - mean_tpot_ms: float - median_tpot_ms: float - std_tpot_ms: float - percentiles_tpot_ms: list[tuple[float, float]] - mean_itl_ms: float - median_itl_ms: float - std_itl_ms: float - percentiles_itl_ms: list[tuple[float, float]] - # E2EL stands for end-to-end latency per request. - # It is the time taken on the client side from sending - # a request to receiving a complete response. 
- mean_e2el_ms: float - median_e2el_ms: float - std_e2el_ms: float - percentiles_e2el_ms: list[tuple[float, float]] - # Max output tokens per second and concurrent requests at that peak - max_output_tokens_per_s: float - max_concurrent_requests: int - +class MixBenchmarkMetrics(BenchmarkMetrics): + audio_throughput: float -@dataclass -class EmbedBenchmarkMetrics: - completed: int - total_input: int - request_throughput: float - total_token_throughput: float - mean_e2el_ms: float - std_e2el_ms: float - median_e2el_ms: float - percentiles_e2el_ms: float def _get_current_request_rate( @@ -141,96 +87,8 @@ def _get_current_request_rate( return request_rate -async def get_request( - input_requests: list[SampleRequest], - request_rate: float, - burstiness: float = 1.0, - ramp_up_strategy: Optional[Literal["linear", "exponential"]] = None, - ramp_up_start_rps: Optional[int] = None, - ramp_up_end_rps: Optional[int] = None, -) -> AsyncGenerator[tuple[SampleRequest, float], None]: - """ - Asynchronously generates requests at a specified rate - with OPTIONAL burstiness and OPTIONAL ramp-up strategy. - - Args: - input_requests: - A list of input requests, each represented as a SampleRequest. - request_rate: - The rate at which requests are generated (requests/s). - burstiness (optional): - The burstiness factor of the request generation. - Only takes effect when request_rate is not inf. - Default value is 1, which follows a Poisson process. - Otherwise, the request intervals follow a gamma distribution. - A lower burstiness value (0 < burstiness < 1) results - in more bursty requests, while a higher burstiness value - (burstiness > 1) results in a more uniform arrival of requests. - ramp_up_strategy (optional): - The ramp-up strategy. Can be "linear" or "exponential". - If None, uses constant request rate (specified by request_rate). - ramp_up_start_rps (optional): - The starting request rate for ramp-up. - ramp_up_end_rps (optional): - The ending request rate for ramp-up. - """ - assert burstiness > 0, ( - f"A positive burstiness factor is expected, but given {burstiness}.") - # Convert to list to get length for ramp-up calculations - if isinstance(input_requests, - Iterable) and not isinstance(input_requests, list): - input_requests = list(input_requests) - - total_requests = len(input_requests) - assert total_requests > 0, "No requests provided." - - # Precompute delays among requests to minimize request send laggings - request_rates = [] - delay_ts = [] - for request_index, request in enumerate(input_requests): - current_request_rate = _get_current_request_rate( - ramp_up_strategy, ramp_up_start_rps, ramp_up_end_rps, - request_index, total_requests, request_rate) - request_rates.append(current_request_rate) - if current_request_rate == float("inf"): - delay_ts.append(0) - else: - theta = 1.0 / (current_request_rate * burstiness) - - # Sample the request interval from the gamma distribution. - # If burstiness is 1, it follows exponential distribution. - delay_ts.append(np.random.gamma(shape=burstiness, scale=theta)) - - # Calculate the cumulative delay time from the first sent out requests. - for i in range(1, len(delay_ts)): - delay_ts[i] += delay_ts[i - 1] - if ramp_up_strategy is None and delay_ts[-1] != 0: - # When ramp_up_strategy is not set, we assume the request rate is fixed - # and all requests should be sent in target_total_delay_s, the following - # logic would re-scale delay time to ensure the final delay_ts - # align with target_total_delay_s. 
- # - # NOTE: If we simply accumulate the random delta values - # from the gamma distribution, their sum would have 1-2% gap - # from target_total_delay_s. The purpose of the following logic is to - # close the gap for stabilizing the throughput data - # from different random seeds. - target_total_delay_s = total_requests / request_rate - normalize_factor = target_total_delay_s / delay_ts[-1] - delay_ts = [delay * normalize_factor for delay in delay_ts] - - start_ts = time.time() - for request_index, request in enumerate(input_requests): - if delay_ts[request_index] > 0: - current_ts = time.time() - sleep_interval_s = start_ts + delay_ts[request_index] - current_ts - if sleep_interval_s > 0: - await asyncio.sleep(sleep_interval_s) - yield request, request_rates[request_index] - - def calculate_metrics_for_embeddings( - outputs: list[RequestFuncOutput], dur_s: float, + outputs: list[MixRequestFuncOutput], dur_s: float, selected_percentiles: list[float]) -> EmbedBenchmarkMetrics: """Calculate the metrics for the embedding requests. @@ -277,7 +135,7 @@ def calculate_metrics( tokenizer: PreTrainedTokenizerBase, selected_percentiles: list[float], goodput_config_dict: dict[str, float], -) -> tuple[BenchmarkMetrics, list[int]]: +) -> tuple[MixBenchmarkMetrics, list[int]]: """Calculate the metrics for the benchmark. Args: @@ -294,6 +152,7 @@ def calculate_metrics( actual_output_lens: list[int] = [] total_input = 0 completed = 0 + audio_completed = 0 good_completed = 0 itls: list[float] = [] tpots: list[float] = [] @@ -326,6 +185,7 @@ def calculate_metrics( ttfts.append(outputs[i].ttft) e2els.append(outputs[i].latency) completed += 1 + audio_completed += outputs[i].output_audio_num else: actual_output_lens.append(0) @@ -417,13 +277,14 @@ def calculate_metrics( else: print("tip: install termplotlib and gnuplot to plot the metrics") - metrics = BenchmarkMetrics( + metrics = MixBenchmarkMetrics( completed=completed, total_input=total_input, total_output=sum(actual_output_lens), request_throughput=completed / dur_s, request_goodput=good_completed / dur_s, output_throughput=sum(actual_output_lens) / dur_s, + audio_throughput=audio_completed / dur_s, total_token_throughput=(total_input + sum(actual_output_lens)) / dur_s, mean_ttft_ms=np.mean(ttfts or 0) * 1000, # ttfts is empty if streaming is not supported by the endpoint @@ -661,7 +522,7 @@ async def limited_request_func(request_func_input, session, pbar): limited_request_func(request_func_input=request_func_input, session=session, pbar=pbar))) - outputs: list[RequestFuncOutput] = await asyncio.gather(*tasks) + outputs: list[MixRequestFuncOutput] = await asyncio.gather(*tasks) if pbar is not None: pbar.close() @@ -696,15 +557,17 @@ async def limited_request_func(request_func_input, session, pbar): print("{:<40} {:<10.2f}".format("Benchmark duration (s):", benchmark_duration)) print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input)) - if isinstance(metrics, BenchmarkMetrics): + if isinstance(metrics, MixBenchmarkMetrics): print("{:<40} {:<10}".format("Total generated tokens:", metrics.total_output)) print("{:<40} {:<10.2f}".format("Request throughput (req/s):", metrics.request_throughput)) + print("{:<40} {:<10.2f}".format("Audio throughput (num/s):", + metrics.audio_throughput)) if goodput_config_dict: print("{:<40} {:<10.2f}".format("Request goodput (req/s):", metrics.request_goodput)) - if isinstance(metrics, BenchmarkMetrics): + if isinstance(metrics, MixBenchmarkMetrics): print("{:<40} {:<10.2f}".format("Output token throughput 
(tok/s):", metrics.output_throughput)) print("{:<40} {:<10.2f}".format( @@ -715,7 +578,7 @@ async def limited_request_func(request_func_input, session, pbar): print("{:<40} {:<10.2f}".format("Total Token throughput (tok/s):", metrics.total_token_throughput)) - if isinstance(metrics, BenchmarkMetrics): + if isinstance(metrics, MixBenchmarkMetrics): result = { "duration": benchmark_duration, "completed": metrics.completed, @@ -725,6 +588,7 @@ async def limited_request_func(request_func_input, session, pbar): "request_goodput": metrics.request_goodput if goodput_config_dict else None, "output_throughput": metrics.output_throughput, + "audio_throughput": metrics.audio_throughput, "total_token_throughput": metrics.total_token_throughput, "input_lens": [output.prompt_len for output in outputs], "output_lens": actual_output_lens, @@ -809,66 +673,6 @@ def process_one_metric( return result -def check_goodput_args(args): - # Check and parse goodput arguments - goodput_config_dict = {} - VALID_NAMES = ["ttft", "tpot", "e2el"] - if args.goodput: - goodput_config_dict = parse_goodput(args.goodput) - for slo_name, slo_val in goodput_config_dict.items(): - if slo_name not in VALID_NAMES: - raise ValueError( - f"Invalid metric name found, {slo_name}: {slo_val}. " - "The service level objective name should be one of " - f"{str(VALID_NAMES)}. ") - if slo_val < 0: - raise ValueError( - f"Invalid value found, {slo_name}: {slo_val}. " - "The service level objective value should be " - "non-negative.") - return goodput_config_dict - - -def parse_goodput(slo_pairs): - goodput_config_dict = {} - try: - for slo_pair in slo_pairs: - slo_name, slo_val = slo_pair.split(":") - goodput_config_dict[slo_name] = float(slo_val) - except ValueError as err: - raise argparse.ArgumentTypeError( - "Invalid format found for service level objectives. " - "Specify service level objectives for goodput as \"KEY:VALUE\" " - "pairs, where the key is a metric name, and the value is a " - "number in milliseconds.") from err - return goodput_config_dict - - -def save_to_pytorch_benchmark_format(args: argparse.Namespace, - results: dict[str, Any], - file_name: str) -> None: - metrics = [ - "median_ttft_ms", "mean_ttft_ms", "std_ttft_ms", "p99_ttft_ms", - "mean_tpot_ms", "median_tpot_ms", "std_tpot_ms", "p99_tpot_ms", - "median_itl_ms", "mean_itl_ms", "std_itl_ms", "p99_itl_ms" - ] - # These raw data might be useful, but they are rather big. 
They can be added - # later if needed - ignored_metrics = ["ttfts", "itls", "generated_texts", "errors"] - pt_records = convert_to_pytorch_benchmark_format( - args=args, - metrics={k: [results[k]] - for k in metrics if k in results}, - extra_info={ - k: results[k] - for k in results if k not in metrics and k not in ignored_metrics - }) - if pt_records: - # Don't use json suffix here as we don't want CI to pick it up - pt_file = f"{os.path.splitext(file_name)[0]}.pytorch.json" - write_to_json(pt_file, pt_records) - - def add_cli_args(parser: argparse.ArgumentParser): add_dataset_parser(parser) parser.add_argument( From 9cb9abce2bc36b4af03edae2e994daf3602a2c3f Mon Sep 17 00:00:00 2001 From: wangyu31577 Date: Thu, 18 Dec 2025 14:12:19 +0800 Subject: [PATCH 17/26] =?UTF-8?q?=E6=96=B0=E5=A2=9Ebenchmark?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- vllm_omni/benchmarks/lib/endpoint_request_func.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm_omni/benchmarks/lib/endpoint_request_func.py b/vllm_omni/benchmarks/lib/endpoint_request_func.py index 7603dbdc8..d31dadd58 100644 --- a/vllm_omni/benchmarks/lib/endpoint_request_func.py +++ b/vllm_omni/benchmarks/lib/endpoint_request_func.py @@ -93,7 +93,8 @@ async def async_request_openai_chat_completions( output.generated_text += content or "" if choice["message"].get("audio"): output.output_audio_num += 1 - output.output_tokens = 0 + usage = data.get("usage") + output.output_tokens = usage.get("completion_tokens") output.success = True output.latency = time.perf_counter() - st else: From 555d8aebc455b60f08a995e96177be27fde50a55 Mon Sep 17 00:00:00 2001 From: wangyu31577 Date: Thu, 18 Dec 2025 14:18:00 +0800 Subject: [PATCH 18/26] =?UTF-8?q?=E6=96=B0=E5=A2=9Ebenchmark?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- vllm_omni/entrypoints/cli/benchmark/main.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/vllm_omni/entrypoints/cli/benchmark/main.py b/vllm_omni/entrypoints/cli/benchmark/main.py index 3392473ff..0fc5373ef 100644 --- a/vllm_omni/entrypoints/cli/benchmark/main.py +++ b/vllm_omni/entrypoints/cli/benchmark/main.py @@ -42,12 +42,18 @@ def subparser_init( cmd_cls.name, help=cmd_cls.help, description=cmd_cls.help, - usage=f"vllm {self.name} {cmd_cls.name} [options]", + usage=f"vllm {self.name} {cmd_cls.name} --omni [options]", ) cmd_subparser.set_defaults(dispatch_function=cmd_cls.cmd) cmd_cls.add_cli_args(cmd_subparser) + cmd_subparser.epilog = VLLM_SUBCMD_PARSER_EPILOG.format( subcmd=f"{self.name} {cmd_cls.name}") + cmd_subparser.add_argument( + "--omni", + action="store_true", + help="Enable benchmark-Omni mode", + ) return bench_parser From 0d94a5adf1cb711ef25fa37ed922c4ec67b8f647 Mon Sep 17 00:00:00 2001 From: wangyu31577 Date: Thu, 18 Dec 2025 14:19:41 +0800 Subject: [PATCH 19/26] =?UTF-8?q?=E6=96=B0=E5=A2=9Ebenchmark?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- vllm_omni/entrypoints/cli/benchmark/main.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/vllm_omni/entrypoints/cli/benchmark/main.py b/vllm_omni/entrypoints/cli/benchmark/main.py index 0fc5373ef..9efbef1de 100644 --- a/vllm_omni/entrypoints/cli/benchmark/main.py +++ b/vllm_omni/entrypoints/cli/benchmark/main.py @@ -33,27 +33,28 @@ def subparser_init( bench_parser = subparsers.add_parser( self.name, description=self.help, - usage=f"vllm {self.name} 
[options]") + usage=f"vllm {self.name} --omni [options]") bench_subparsers = bench_parser.add_subparsers(required=True, dest="bench_type") + bench_subparsers.add_argument( + "--omni", + action="store_true", + help="Enable benchmark-Omni mode", + ) for cmd_cls in OmniBenchmarkSubcommandBase.__subclasses__(): cmd_subparser = bench_subparsers.add_parser( cmd_cls.name, help=cmd_cls.help, description=cmd_cls.help, - usage=f"vllm {self.name} {cmd_cls.name} --omni [options]", + usage=f"vllm {self.name} {cmd_cls.name} [options]", ) cmd_subparser.set_defaults(dispatch_function=cmd_cls.cmd) cmd_cls.add_cli_args(cmd_subparser) cmd_subparser.epilog = VLLM_SUBCMD_PARSER_EPILOG.format( subcmd=f"{self.name} {cmd_cls.name}") - cmd_subparser.add_argument( - "--omni", - action="store_true", - help="Enable benchmark-Omni mode", - ) + return bench_parser From e07c41e2316b252176490c22a21a3102f7b14d8a Mon Sep 17 00:00:00 2001 From: wangyu31577 Date: Thu, 18 Dec 2025 14:23:10 +0800 Subject: [PATCH 20/26] =?UTF-8?q?=E6=96=B0=E5=A2=9Ebenchmark?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- vllm_omni/entrypoints/cli/benchmark/main.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/vllm_omni/entrypoints/cli/benchmark/main.py b/vllm_omni/entrypoints/cli/benchmark/main.py index 9efbef1de..b9148035a 100644 --- a/vllm_omni/entrypoints/cli/benchmark/main.py +++ b/vllm_omni/entrypoints/cli/benchmark/main.py @@ -33,21 +33,22 @@ def subparser_init( bench_parser = subparsers.add_parser( self.name, description=self.help, - usage=f"vllm {self.name} --omni [options]") - bench_subparsers = bench_parser.add_subparsers(required=True, - dest="bench_type") - bench_subparsers.add_argument( + usage=f"vllm {self.name} [--omni] [options]") + bench_parser.add_argument( "--omni", action="store_true", help="Enable benchmark-Omni mode", ) + bench_subparsers = bench_parser.add_subparsers(required=True, + dest="bench_type") + for cmd_cls in OmniBenchmarkSubcommandBase.__subclasses__(): cmd_subparser = bench_subparsers.add_parser( cmd_cls.name, help=cmd_cls.help, description=cmd_cls.help, - usage=f"vllm {self.name} {cmd_cls.name} [options]", + usage=f"vllm {self.name} {cmd_cls.name} [--omni] [options]", ) cmd_subparser.set_defaults(dispatch_function=cmd_cls.cmd) cmd_cls.add_cli_args(cmd_subparser) From 556436fe26eba9dc2f68311a5b6f8625e07f8110 Mon Sep 17 00:00:00 2001 From: wangyu31577 Date: Thu, 18 Dec 2025 14:28:56 +0800 Subject: [PATCH 21/26] =?UTF-8?q?=E6=96=B0=E5=A2=9Ebenchmark?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- vllm_omni/entrypoints/cli/benchmark/main.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/vllm_omni/entrypoints/cli/benchmark/main.py b/vllm_omni/entrypoints/cli/benchmark/main.py index b9148035a..262551f34 100644 --- a/vllm_omni/entrypoints/cli/benchmark/main.py +++ b/vllm_omni/entrypoints/cli/benchmark/main.py @@ -55,6 +55,12 @@ def subparser_init( cmd_subparser.epilog = VLLM_SUBCMD_PARSER_EPILOG.format( subcmd=f"{self.name} {cmd_cls.name}") + cmd_subparser.add_argument( + "--omni", + action="store_true", + default=True, + help="Enable benchmark-Omni mode (always enabled for omni commands)", + ) return bench_parser From f5c4591d7a106ffd141950eb7ca6133527b324e0 Mon Sep 17 00:00:00 2001 From: wangyu31577 Date: Thu, 18 Dec 2025 14:30:39 +0800 Subject: [PATCH 22/26] =?UTF-8?q?=E6=96=B0=E5=A2=9Ebenchmark?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 
Content-Transfer-Encoding: 8bit --- vllm_omni/entrypoints/cli/benchmark/main.py | 19 +++++++------- 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/vllm_omni/entrypoints/cli/benchmark/main.py b/vllm_omni/entrypoints/cli/benchmark/main.py index 262551f34..6f6a07fee 100644 --- a/vllm_omni/entrypoints/cli/benchmark/main.py +++ b/vllm_omni/entrypoints/cli/benchmark/main.py @@ -33,12 +33,7 @@ def subparser_init( bench_parser = subparsers.add_parser( self.name, description=self.help, - usage=f"vllm {self.name} [--omni] [options]") - bench_parser.add_argument( - "--omni", - action="store_true", - help="Enable benchmark-Omni mode", - ) + usage=f"vllm {self.name} [options]") bench_subparsers = bench_parser.add_subparsers(required=True, dest="bench_type") @@ -50,17 +45,17 @@ def subparser_init( description=cmd_cls.help, usage=f"vllm {self.name} {cmd_cls.name} [--omni] [options]", ) - cmd_subparser.set_defaults(dispatch_function=cmd_cls.cmd) - cmd_cls.add_cli_args(cmd_subparser) - - cmd_subparser.epilog = VLLM_SUBCMD_PARSER_EPILOG.format( - subcmd=f"{self.name} {cmd_cls.name}") cmd_subparser.add_argument( "--omni", action="store_true", - default=True, + default=True, # Enabled by default for Omni subcommands help="Enable benchmark-Omni mode (always enabled for omni commands)", ) + cmd_subparser.set_defaults(dispatch_function=cmd_cls.cmd) + cmd_cls.add_cli_args(cmd_subparser) + + cmd_subparser.epilog = VLLM_SUBCMD_PARSER_EPILOG.format( + subcmd=f"{self.name} {cmd_cls.name}") From bcd0dd6c775d649a859f0c07b448781e4abdbbfe Mon Sep 17 00:00:00 2001 From: wangyu31577 Date: Thu, 18 Dec 2025 15:06:22 +0800 Subject: [PATCH 23/26] =?UTF-8?q?=E6=96=B0=E5=A2=9Ebenchmark?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/e2e/online_serving/test_qwen2_5_omni.py | 1 + vllm_omni/benchmarks/lib/endpoint_request_func.py | 4 +++- vllm_omni/benchmarks/serve.py | 10 ++++++++-- 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/tests/e2e/online_serving/test_qwen2_5_omni.py b/tests/e2e/online_serving/test_qwen2_5_omni.py index dd0398d89..f194ffc87 100644 --- a/tests/e2e/online_serving/test_qwen2_5_omni.py +++ b/tests/e2e/online_serving/test_qwen2_5_omni.py @@ -41,6 +41,7 @@ def test_mix_to_audio( "vllm-omni", "bench", "serve", + "--omni", "--model", omni_server.model, "--host", diff --git a/vllm_omni/benchmarks/lib/endpoint_request_func.py b/vllm_omni/benchmarks/lib/endpoint_request_func.py index d31dadd58..d1b1d5053 100644 --- a/vllm_omni/benchmarks/lib/endpoint_request_func.py +++ b/vllm_omni/benchmarks/lib/endpoint_request_func.py @@ -23,6 +23,7 @@ @dataclass class MixRequestFuncOutput(RequestFuncOutput): output_audio_num: int = None + prompt_tokens: int = None async def async_request_openai_chat_completions( request_func_input: RequestFuncInput, @@ -77,7 +78,6 @@ async def async_request_openai_chat_completions( output = MixRequestFuncOutput() output.prompt_len = request_func_input.prompt_len - output.ttft = 0.0 st = time.perf_counter() output.start_time = st @@ -95,8 +95,10 @@ async def async_request_openai_chat_completions( output.output_audio_num += 1 usage = data.get("usage") output.output_tokens = usage.get("completion_tokens") + output.prompt_tokens = usage.get("prompt_tokens") output.success = True output.latency = time.perf_counter() - st + output.ttft = output.latency else: output.error = response.reason or "" output.success = False diff --git a/vllm_omni/benchmarks/serve.py b/vllm_omni/benchmarks/serve.py index 12db540bb..2c29e2c70
100644 --- a/vllm_omni/benchmarks/serve.py +++ b/vllm_omni/benchmarks/serve.py @@ -62,6 +62,7 @@ @dataclass class MixBenchmarkMetrics(BenchmarkMetrics): audio_throughput: float + total_text_input: int @@ -151,6 +152,7 @@ def calculate_metrics( """ actual_output_lens: list[int] = [] total_input = 0 + total_text_input = 0 completed = 0 audio_completed = 0 good_completed = 0 @@ -173,7 +175,8 @@ def calculate_metrics( tokenizer(outputs[i].generated_text, add_special_tokens=False).input_ids) actual_output_lens.append(output_len) - total_input += input_requests[i].prompt_len + total_text_input += input_requests[i].prompt_len + total_input += outputs[i].prompt_tokens tpot = 0 if output_len > 1: latency_minus_ttft = outputs[i].latency - outputs[i].ttft @@ -280,6 +283,7 @@ def calculate_metrics( metrics = MixBenchmarkMetrics( completed=completed, total_input=total_input, + total_text_input=total_text_input, total_output=sum(actual_output_lens), request_throughput=completed / dur_s, request_goodput=good_completed / dur_s, @@ -557,6 +561,7 @@ async def limited_request_func(request_func_input, session, pbar): print("{:<40} {:<10.2f}".format("Benchmark duration (s):", benchmark_duration)) print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input)) + print("{:<40} {:<10}".format("Total text input tokens:", metrics.total_text_input)) if isinstance(metrics, MixBenchmarkMetrics): print("{:<40} {:<10}".format("Total generated tokens:", metrics.total_output)) @@ -583,6 +588,7 @@ async def limited_request_func(request_func_input, session, pbar): "duration": benchmark_duration, "completed": metrics.completed, "total_input_tokens": metrics.total_input, + "total_text_input_tokens": metrics.total_text_input, "total_output_tokens": metrics.total_output, "request_throughput": metrics.request_throughput, "request_goodput": @@ -844,7 +850,7 @@ def add_cli_args(parser: argparse.ArgumentParser): parser.add_argument( "--percentile-metrics", type=str, - default="ttft,tpot,itl", + default="ttft,tpot,itl,e2el", help="Comma-separated list of selected metrics to report percentils. " "This argument specifies the metrics to report percentiles. " "Allowed metric names are \"ttft\", \"tpot\", \"itl\", \"e2el\". ") From e49d385aa0126299a088a42d8f8b6f29c6c68f37 Mon Sep 17 00:00:00 2001 From: wangyu31577 Date: Thu, 18 Dec 2025 16:22:31 +0800 Subject: [PATCH 24/26] =?UTF-8?q?=E6=96=B0=E5=A2=9Ebenchmark?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/api/README.md | 31 ++++-- docs/contributing/tests/tests_style.md | 88 ++++++++------- pytest.ini | 2 +- tests/conftest.py | 81 -------------- .../stage_configs/qwen2_5_omni_ci.yaml | 105 ------------------ tests/e2e/online_serving/test_qwen2_5_omni.py | 81 -------------- 6 files changed, 69 insertions(+), 319 deletions(-) delete mode 100644 tests/e2e/online_serving/stage_configs/qwen2_5_omni_ci.yaml delete mode 100644 tests/e2e/online_serving/test_qwen2_5_omni.py diff --git a/docs/api/README.md b/docs/api/README.md index d2652f80e..ffb838b40 100644 --- a/docs/api/README.md +++ b/docs/api/README.md @@ -1,17 +1,8 @@ # Summary +## Entry Points -## Configuration - -Configuration classes. - -- [vllm_omni.config.model.OmniModelConfig][] -- [vllm_omni.distributed.omni_connectors.utils.config.ConnectorSpec][] -- [vllm_omni.distributed.omni_connectors.utils.config.OmniTransferConfig][] - -## EntryPoints - -Main entrypoints for vLLM-Omni inference and serving. +Main entry points for vLLM-Omni inference and serving. 
- [vllm_omni.entrypoints.async_omni.AsyncOmni][] - [vllm_omni.entrypoints.async_omni.AsyncOmniStageLLM][] @@ -19,8 +10,10 @@ Main entrypoints for vLLM-Omni inference and serving. - [vllm_omni.entrypoints.chat_utils.OmniAsyncMultiModalItemTracker][] - [vllm_omni.entrypoints.chat_utils.parse_chat_messages_futures][] - [vllm_omni.entrypoints.cli.serve.OmniServeCommand][] +- [vllm_omni.entrypoints.client_request_state.ClientRequestState][] - [vllm_omni.entrypoints.log_utils.OrchestratorMetrics][] - [vllm_omni.entrypoints.omni.Omni][] +- [vllm_omni.entrypoints.omni_diffusion.OmniDiffusion][] - [vllm_omni.entrypoints.omni_llm.OmniLLM][] - [vllm_omni.entrypoints.omni_llm.OmniStageLLM][] - [vllm_omni.entrypoints.omni_stage.OmniStage][] @@ -99,14 +92,30 @@ Model execution components. - [vllm_omni.model_executor.models.qwen3_omni.qwen3_omni_moe_thinker.Qwen3_VisionPatchEmbed][] - [vllm_omni.model_executor.models.qwen3_omni.qwen3_omni_moe_thinker.Qwen3_VisionPatchMerger][] +## Configuration + +Configuration classes. + +- [vllm_omni.config.model.OmniModelConfig][] +- [vllm_omni.diffusion.cache.teacache.config.TeaCacheConfig][] +- [vllm_omni.distributed.omni_connectors.utils.config.ConnectorSpec][] +- [vllm_omni.distributed.omni_connectors.utils.config.OmniTransferConfig][] + ## Workers Worker classes and model runners for distributed inference. - [vllm_omni.diffusion.worker.gpu_worker.GPUWorker][] - [vllm_omni.diffusion.worker.gpu_worker.WorkerProc][] +- [vllm_omni.diffusion.worker.npu.npu_worker.NPUWorker][] +- [vllm_omni.diffusion.worker.npu.npu_worker.NPUWorkerProc][] - [vllm_omni.worker.gpu_ar_model_runner.GPUARModelRunner][] - [vllm_omni.worker.gpu_ar_worker.GPUARWorker][] - [vllm_omni.worker.gpu_generation_model_runner.GPUGenerationModelRunner][] - [vllm_omni.worker.gpu_generation_worker.GPUGenerationWorker][] - [vllm_omni.worker.gpu_model_runner.OmniGPUModelRunner][] +- [vllm_omni.worker.npu.npu_ar_model_runner.NPUARModelRunner][] +- [vllm_omni.worker.npu.npu_ar_worker.NPUARWorker][] +- [vllm_omni.worker.npu.npu_generation_model_runner.NPUGenerationModelRunner][] +- [vllm_omni.worker.npu.npu_generation_worker.NPUGenerationWorker][] +- [vllm_omni.worker.npu.npu_model_runner.OmniNPUModelRunner][] \ No newline at end of file diff --git a/docs/contributing/tests/tests_style.md b/docs/contributing/tests/tests_style.md index b6813ccc2..0ffaf60ac 100644 --- a/docs/contributing/tests/tests_style.md +++ b/docs/contributing/tests/tests_style.md @@ -1,4 +1,4 @@ -# Test File Structure and Style Guide +# vLLM-Omni Test File Structure and Style Guide To ensure project maintainability and sustainable development, we encourage contributors to submit test code (unit tests, system tests, or end-to-end tests) alongside their code changes. This document outlines the guidelines for organizing and naming test files. @@ -9,7 +9,9 @@ For unit tests and system tests, we strongly recommend placing test files in the ### End-to-End (E2E) Tests for Models End-to-end tests verify the complete functionality of a system or component. For our project, the E2E tests for different omni models are organized into two subdirectories: + - **`tests/e2e/offline_inference/`**: Tests for offline inference modes (e.g., Qwen3Omni offline inference) + - **`tests/e2e/online_serving/`**: Tests for online serving scenarios (e.g., API server tests) **Example:** The test file for `vllm_omni/entrypoints/omni_llm.py` should be located at `tests/entrypoints/test_omni_llm.py`. 
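As a concrete illustration of that mapping, a minimal mirrored unit test could look like the sketch below. It is only a sketch of the mock-based style this guide asks for: the fixture, the `generate` method name, and the asserted return values are illustrative placeholders, not the real `OmniLLM` API (the class is patched out entirely, so no model weights or GPU are needed).

```python
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Illustrative unit test for vllm_omni/entrypoints/omni_llm.py (mock style)."""

from unittest import mock

import pytest


@pytest.fixture
def fake_prompts() -> list[str]:
    """Small, GPU-free inputs for exercising pure-Python logic."""
    return ["hello", "world"]


def test_generate_called_once_per_batch(fake_prompts: list[str]) -> None:
    """OmniLLM is patched out, so no model weights are loaded.

    The `generate` call and its return value are illustrative; only the
    mock-based structure (patch, call, assert) is the point of this sketch.
    """
    with mock.patch("vllm_omni.entrypoints.omni_llm.OmniLLM") as mock_omni_llm:
        llm = mock_omni_llm.return_value
        llm.generate.return_value = ["out-0", "out-1"]

        outputs = llm.generate(fake_prompts)

        assert outputs == ["out-0", "out-1"]
        llm.generate.assert_called_once_with(fake_prompts)
```

Tests like this belong next to their source module in the mirrored tree and should never start an engine or touch hardware.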
@@ -24,10 +26,9 @@ vllm_omni/ tests/ │ └── model.py │ └── test_model.py │ ├── core/ → ├── core/ -│ ├── dit_cache_manager.py │ ├── test_dit_cache_manager.py │ └── sched/ │ └── sched/ # Maps to core/sched/ -│ ├── omni_ar_scheduler.py │ ├── test_omni_ar_scheduler.py -│ ├── omni_generation_scheduler.py │ ├── test_omni_generation_scheduler.py +│ ├── omni_ar_scheduler.py │ ├── test_omni_ar_scheduler.py +│ ├── omni_generation_scheduler.py │ ├── test_omni_generation_scheduler.py │ └── output.py │ └── test_output.py │ ├── diffusion/ → ├── diffusion/ @@ -37,14 +38,14 @@ vllm_omni/ tests/ │ │ └── backends/ │ │ └── test_*.py │ ├── models/ │ ├── models/ # Maps to diffusion/models/ │ │ ├── qwen_image/ │ │ ├── qwen_image/ -│ │ │ └── ... │ │ │ └── test_*.py -│ │ └── z_image/ │ │ └── z_image/ -│ │ └── ... │ │ └── test_*.py -│ └── worker/ │ └── worker/ # Maps to diffusion/worker/ -│ └── ... │ └── test_*.py +│ │ │ └── ... │ │ │ └── test_*.py +│ │ └── z_image/ │ │ └── z_image/ +│ │ └── ... │ │ └── test_*.py +│ └── worker/ │ └── worker/ # Maps to diffusion/worker/ +│ └── ... │ └── test_*.py │ ├── distributed/ → ├── distributed/ -│ └── ... │ └── test_*.py +│ └── ... │ └── test_*.py │ ├── engine/ → ├── engine/ │ ├── processor.py │ ├── test_processor.py @@ -80,7 +81,7 @@ vllm_omni/ tests/ │ │ └── ... │ │ └── test_*.py │ ├── stage_configs/ │ └── stage_configs/ # Configuration tests (if needed) │ │ └── ... │ └── test_*.py -│ └── stage_input_processors/ │ └── stage_input_processors/ +│ └── stage_input_processors/ │ └── stage_input_processors/ │ └── ... │ └── test_*.py │ ├── sample/ → ├── sample/ @@ -90,22 +91,22 @@ vllm_omni/ tests/ │ └── platform_utils.py │ └── test_platform_utils.py │ ├── worker/ → ├── worker/ - ├── gpu_ar_worker.py │ ├── test_gpu_ar_worker.py - ├── gpu_generation_worker.py │ ├── test_gpu_generation_worker.py - ├── gpu_model_runner.py │ ├── test_gpu_model_runner.py - └── npu/ │ └── npu/ # Maps to worker/npu/ - └── ... │ └── test_*.py + ├── gpu_ar_worker.py │ ├── test_gpu_ar_worker.py + ├── gpu_generation_worker.py │ ├── test_gpu_generation_worker.py + ├── gpu_model_runner.py │ ├── test_gpu_model_runner.py + └── npu/ │ └── npu/ # Maps to worker/npu/ + └── ... │ └── test_*.py │ -└── e2e/ → ├── e2e/ # End-to-end scenarios (no 1:1 source mirror) - ├── online_serving/ # Full-stack online serving flows - │ └── (empty for now) - └── offline_inference/ # Full offline inference flows - ├── test_qwen2_5_omni.py # Moved from multi_stages/ - ├── test_qwen3_omni.py # Moved from multi_stages_h100/ - ├── test_diffusion_model.py # Moved from single_stage/ - └── stage_configs/ # Shared stage configs - ├── qwen2_5_omni_ci.yaml - └── qwen3_omni_ci.yaml +└── e2e/ → ├── e2e/ # End-to-end scenarios (no 1:1 source mirror) + ├── online_serving/ # Full-stack online serving flows + │ └── (empty for now) + └── offline_inference/ # Full offline inference flows + ├── test_qwen2_5_omni.py # Moved from multi_stages/ + ├── test_qwen3_omni.py # Moved from multi_stages_h100/ + ├── test_t2i_model.py # Moved from single_stage/ + └── stage_configs/ # Shared stage configs + ├── qwen2_5_omni_ci.yaml + └── qwen3_omni_ci.yaml ``` @@ -133,8 +134,10 @@ vllm_omni/ tests/ 1. **File header**: Add SPDX license header to all test files 2. **Imports**: Pls don't use manual `sys.path` modifications, use standard imports instead. 3. 
**Test type differentiation**: - - Unit tests: Maintain mock style - - Model tests: Consider using OmniRunner uniformly, avoid decorators + + - Unit tests: Maintain mock style + - E2E tests for models: Consider using OmniRunner uniformly, avoid decorators + 4. **Documentation**: Add docstrings to all test functions 5. **Environment variables**: Set uniformly in `conftest.py` or at the top of files 6. **Type annotations**: Add type annotations to all test function parameters @@ -142,10 +145,13 @@ vllm_omni/ tests/ ### Template #### E2E - Online serving + +```python +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ Online E2E smoke test for an omni model (video,text,audio → audio). """ -```python from pathlib import Path import pytest @@ -179,7 +185,7 @@ def base64_encoded_video() -> str: @pytest.fixture(scope="session") def dummy_messages_from_video_data(video_data_url: str, content_text: str) -> str: xxx - + @pytest.mark.parametrize("omni_server", test_params, indirect=True) def test_video_to_audio( client: openai.OpenAI, @@ -189,24 +195,23 @@ def test_video_to_audio( #set message video_data_url = f"data:video/mp4;base64, {base64_encoded_video}" messages = dummy_messages_from_video_data(video_data_url) - + #send request chat_completion = client.chat.completions.create( model=omni_server.model, messages=messages, ) - + #verify text output text_choice = chat_completion.choices[0] assert text_choice.finish_reason == "length" - + #verify audio output audio_choice = chat_completion.choices[1] audio_message = audio_choice.message if hasattr(audio_message, "audio") and audio_message.audio: assert audio_message.audio.data is not None assert len(audio_message.audio.data) > 0 - ``` #### E2E - Offline inference @@ -228,16 +233,21 @@ from ..multi_stages.conftest import OmniRunner # Optional: set process start method for workers os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" -CI_STAGE_CONFIG_PATH = str(Path(__file__).parent / "stage_configs" / "qwen3_omni_ci.yaml") # Edit here to load your model +models = ["{your model name}"] #Edit here to load your model +stage_configs = [str(Path(__file__).parent / "stage_configs" / {your model yaml})] #Edit here to load your model yaml +# Create parameter combinations for model and stage config +test_params = [(model, stage_config) for model in models for stage_config in stage_configs] # function name: test_{input_modality}_to_{output_modality} # modality candidate: text, image, audio, video, mixed_modalities @pytest.mark.gpu_mem_high # requires high-memory GPU node -@pytest.mark.parametrize("model", ["Qwen/Qwen3-Omni-30B-A3B-Instruct"]) +@pytest.mark.parametrize("test_config", test_params) def test_video_to_audio(omni_runner: type[OmniRunner], model: str) -> None: """Offline inference: video input, audio output.""" - with omni_runner(model, seed=42, stage_configs_path=CI_STAGE_CONFIG_PATH) as runner: + model, stage_config_path = test_config + with omni_runner(model, seed=42, stage_configs_path=stage_config_path) as runner: + # Prepare inputs video = VideoAsset(name="sample", num_frames=4).np_ndarrays outputs = runner.generate_multimodal( @@ -251,10 +261,8 @@ def test_video_to_audio(omni_runner: type[OmniRunner], model: str) -> None: assert has_audio ``` - - ## Checklist before submit your test files: 1. The file is saved in a suitable places and the file name is clear. 2. The coding style matches the requirements. -3. For e2e omni model tests, specify the +3. 
For e2e omni model tests, specify the \ No newline at end of file diff --git a/pytest.ini b/pytest.ini index 8fb4beb97..9a32794cf 100644 --- a/pytest.ini +++ b/pytest.ini @@ -1,3 +1,3 @@ [pytest] markers = - gpu_mem_high: needs high VRAM + gpu_mem_high: needs high VRAM \ No newline at end of file diff --git a/tests/conftest.py b/tests/conftest.py index 3e917a6da..aa1ff4e69 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,12 +1,7 @@ import os -import socket -import subprocess -import sys -import time import pytest import torch from vllm.logger import init_logger -from vllm.utils import get_open_port logger = init_logger(__name__) @@ -38,79 +33,3 @@ def clean_gpu_memory_between_tests(): if torch.cuda.is_available(): torch.cuda.empty_cache() gc.collect() - - - -class OmniServer: - """Omniserver for vLLM-Omni tests.""" - - def __init__( - self, - model: str, - serve_args: list[str], - *, - env_dict: dict[str, str] | None = None, - ) -> None: - self.model = model - self.serve_args = serve_args - self.env_dict = env_dict - self.proc: subprocess.Popen | None = None - self.host = "127.0.0.1" - self.port = get_open_port() - - def _start_server(self) -> None: - """Start the vLLM-Omni server subprocess.""" - env = os.environ.copy() - env["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" - if self.env_dict is not None: - env.update(self.env_dict) - - cmd = [ - sys.executable, - "-m", - "vllm_omni.entrypoints.cli.main", - "serve", - self.model, - "--omni", - "--host", - self.host, - "--port", - str(self.port), - ] + self.serve_args - - print(f"Launching OmniServer with: {' '.join(cmd)}") - self.proc = subprocess.Popen( - cmd, - env=env, - cwd=os.path.dirname(os.path.dirname(os.path.abspath(__file__))), # Set working directory to vllm-omni root - ) - - # Wait for server to be ready - max_wait = 600 # 10 minutes - start_time = time.time() - while time.time() - start_time < max_wait: - try: - with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock: - sock.settimeout(1) - result = sock.connect_ex((self.host, self.port)) - if result == 0: - print(f"Server ready on {self.host}:{self.port}") - return - except Exception: - pass - time.sleep(2) - - raise RuntimeError(f"Server failed to start within {max_wait} seconds") - - def __enter__(self): - self._start_server() - return self - - def __exit__(self, exc_type, exc_val, exc_tb): - if self.proc: - self.proc.terminate() - try: - self.proc.wait(timeout=30) - except subprocess.TimeoutExpired: - self.proc.kill() - self.proc.wait() diff --git a/tests/e2e/online_serving/stage_configs/qwen2_5_omni_ci.yaml b/tests/e2e/online_serving/stage_configs/qwen2_5_omni_ci.yaml deleted file mode 100644 index 96e9d7fa7..000000000 --- a/tests/e2e/online_serving/stage_configs/qwen2_5_omni_ci.yaml +++ /dev/null @@ -1,105 +0,0 @@ -# stage config for running qwen2.5-omni with architecture of OmniLLM. - -# The following config has been verified on 2x 24GB GPU (L4/RTX3090/RTX4090). -# This config is optimized for CI e2e tests. 
-stage_args: - - stage_id: 0 - runtime: - process: true # Run this stage in a separate process - devices: "0" # Visible devices for this stage (CUDA_VISIBLE_DEVICES/torch.cuda.set_device) - max_batch_size: 1 - engine_args: - model_stage: thinker - model_arch: Qwen2_5OmniForConditionalGeneration - worker_cls: vllm_omni.worker.gpu_ar_worker.GPUARWorker - scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - max_model_len: 896 - max_num_batched_tokens: 896 - max_num_seqs: 1 - gpu_memory_utilization: 0.8 - skip_mm_profiling: true - enforce_eager: true # Now we only support eager mode - trust_remote_code: true - engine_output_type: latent - enable_prefix_caching: false - is_comprehension: true - final_output: true - final_output_type: text - default_sampling_params: - temperature: 0.0 - top_p: 1.0 - top_k: -1 - max_tokens: 128 - seed: 42 - detokenize: True - repetition_penalty: 1.1 - - stage_id: 1 - runtime: - process: true - devices: "1" - max_batch_size: 1 - engine_args: - model_stage: talker - model_arch: Qwen2_5OmniForConditionalGeneration - worker_cls: vllm_omni.worker.gpu_ar_worker.GPUARWorker - scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - max_model_len: 896 - max_num_batched_tokens: 896 - max_num_seqs: 1 - gpu_memory_utilization: 0.8 - skip_mm_profiling: true - enforce_eager: true - trust_remote_code: true - enable_prefix_caching: false - engine_output_type: latent - engine_input_source: [0] - custom_process_input_func: vllm_omni.model_executor.stage_input_processors.qwen2_5_omni.thinker2talker - default_sampling_params: - temperature: 0.9 - top_p: 0.8 - top_k: 40 - max_tokens: 128 - seed: 42 - detokenize: True - repetition_penalty: 1.05 - stop_token_ids: [8294] - - stage_id: 2 - runtime: - process: true - devices: "0" # Example: use a different GPU than the previous stage; use "0" if single GPU - max_batch_size: 1 - engine_args: - model_stage: code2wav - model_arch: Qwen2_5OmniForConditionalGeneration - worker_cls: vllm_omni.worker.gpu_generation_worker.GPUGenerationWorker - scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler - gpu_memory_utilization: 0.15 - enforce_eager: true - trust_remote_code: true - enable_prefix_caching: false - engine_output_type: audio - engine_input_source: [1] - final_output: true - final_output_type: audio - default_sampling_params: - temperature: 0.0 - top_p: 1.0 - top_k: -1 - max_tokens: 128 - seed: 42 - detokenize: True - repetition_penalty: 1.1 - -# Top-level runtime config (concise): default windows and stage edges -runtime: - enabled: true - defaults: - window_size: -1 # Simplified: trigger downstream only after full upstream completion - max_inflight: 1 # Simplified: process serially within each stage - edges: - - from: 0 # thinker → talker: trigger only after receiving full input (-1) - to: 1 - window_size: -1 - - from: 1 # talker → code2wav: trigger only after receiving full input (-1) - to: 2 - window_size: -1 diff --git a/tests/e2e/online_serving/test_qwen2_5_omni.py b/tests/e2e/online_serving/test_qwen2_5_omni.py deleted file mode 100644 index f194ffc87..000000000 --- a/tests/e2e/online_serving/test_qwen2_5_omni.py +++ /dev/null @@ -1,81 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -""" -E2E Online tests for Qwen2_5-Omni model with video input and audio output. 
-""" - -import os - -from pathlib import Path -import pytest -import subprocess -from tests.conftest import OmniServer - -os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" - -models = ["Qwen/Qwen2.5-Omni-7B"] - -# CI stage config for 2*H100-80G GPUs -stage_configs = [str(Path(__file__).parent / "stage_configs" / "qwen2_5_omni_ci.yaml")] -# Create parameter combinations for model and stage config -test_params = [(model, stage_config) for model in models for stage_config in stage_configs] - -@pytest.fixture(scope="module") -def omni_server(request): - """Start vLLM-Omni server as a subprocess with actual model weights. - Uses module scope so the server starts only once for all tests. - Multi-stage initialization can take 10-20+ minutes. - """ - model, stage_config_path = request.param - #with OmniServer(model, ["--stage-configs-path", stage_config_path]) as server: - with OmniServer(model, []) as server: - yield server - -@pytest.mark.parametrize("omni_server", test_params, indirect=True) -def test_mix_to_audio( - omni_server, -) -> None: - """Test processing video+audio+image, generating audio output via OpenAI API.""" - # Create data URL for the base64 encoded video - command = [ - "vllm-omni", - "bench", - "serve", - "--omni", - "--model", - omni_server.model, - "--host", - omni_server.host, - "--port", - str(omni_server.port), - "--dataset-name", - "random-mm", - "--request_rate", - "1", - "--random-input-len", - "32", - "--random-range-ratio", - "0.0", - "--random-mm-base-items-per-request", - "2", - "--random-mm-num-mm-items-range-ratio", - "0", - "--random-mm-limit-mm-per-prompt", - '{"image":10, "video": 1, "audio": 1}', - "--random-mm-bucket-config", - '{"(640,640,1)":0.5, "(0,1,1)": 0.1, "(256, 256, 2)": 0.4}', - "--ignore-eos", - "--random-output-len", - "4", - "--num-prompts", - "5", - "--endpoint", - "/v1/chat/completions", - "--backend", - "openai-chat", - ] - result = subprocess.run(command, capture_output=True, text=True) - print(result.stdout) - print(result.stderr) - - assert result.returncode == 0, f"Benchmark failed: {result.stderr}" From 76db1968783f18fe41d1c48ebb0837632f51013a Mon Sep 17 00:00:00 2001 From: wangyu31577 Date: Thu, 18 Dec 2025 16:29:48 +0800 Subject: [PATCH 25/26] =?UTF-8?q?=E6=96=B0=E5=A2=9Ebenchmark?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/api/README.md | 2 +- docs/contributing/tests/tests_style.md | 2 +- tests/conftest.py | 1 + 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/api/README.md b/docs/api/README.md index ffb838b40..914d0b67d 100644 --- a/docs/api/README.md +++ b/docs/api/README.md @@ -118,4 +118,4 @@ Worker classes and model runners for distributed inference. - [vllm_omni.worker.npu.npu_ar_worker.NPUARWorker][] - [vllm_omni.worker.npu.npu_generation_model_runner.NPUGenerationModelRunner][] - [vllm_omni.worker.npu.npu_generation_worker.NPUGenerationWorker][] -- [vllm_omni.worker.npu.npu_model_runner.OmniNPUModelRunner][] \ No newline at end of file +- [vllm_omni.worker.npu.npu_model_runner.OmniNPUModelRunner][] diff --git a/docs/contributing/tests/tests_style.md b/docs/contributing/tests/tests_style.md index 0ffaf60ac..7f065c1af 100644 --- a/docs/contributing/tests/tests_style.md +++ b/docs/contributing/tests/tests_style.md @@ -265,4 +265,4 @@ def test_video_to_audio(omni_runner: type[OmniRunner], model: str) -> None: 1. The file is saved in a suitable places and the file name is clear. 2. The coding style matches the requirements. -3. 
For e2e omni model tests, specify the \ No newline at end of file +3. For e2e omni model tests, specify the diff --git a/tests/conftest.py b/tests/conftest.py index aa1ff4e69..82c959f07 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,4 +1,5 @@ import os + import pytest import torch from vllm.logger import init_logger From b673b5604e6880dd2590bb77586db070c8838719 Mon Sep 17 00:00:00 2001 From: wangyu31577 Date: Thu, 18 Dec 2025 19:42:38 +0800 Subject: [PATCH 26/26] modify blank lines and chinese comments --- vllm_omni/benchmarks/datasets.py | 34 +-------------------- vllm_omni/entrypoints/cli/benchmark/main.py | 2 +- 2 files changed, 2 insertions(+), 34 deletions(-) diff --git a/vllm_omni/benchmarks/datasets.py b/vllm_omni/benchmarks/datasets.py index a698ae9d5..e67eaa61e 100644 --- a/vllm_omni/benchmarks/datasets.py +++ b/vllm_omni/benchmarks/datasets.py @@ -1091,36 +1091,4 @@ def get_samples(args, tokenizer) -> list[SampleRequest]: except KeyError as err: raise ValueError(f"Unknown dataset: {args.dataset_name}") from err - return input_requests - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + return input_requests \ No newline at end of file diff --git a/vllm_omni/entrypoints/cli/benchmark/main.py b/vllm_omni/entrypoints/cli/benchmark/main.py index 6f6a07fee..3b4574b43 100644 --- a/vllm_omni/entrypoints/cli/benchmark/main.py +++ b/vllm_omni/entrypoints/cli/benchmark/main.py @@ -48,7 +48,7 @@ def subparser_init( cmd_subparser.add_argument( "--omni", action="store_true", - default=True, # 对于 Omni 子命令,默认启用 + default=True, help="Enable benchmark-Omni mode (always enabled for omni commands)", ) cmd_subparser.set_defaults(dispatch_function=cmd_cls.cmd)
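The last hunk above pairs `action="store_true"` with `default=True`, which makes `--omni` effectively always on for the bench subcommands: passing the flag is a no-op, and there is no way to disable it from the CLI. A standalone sketch of that argparse pattern follows; the parser and the single `serve` subcommand here are simplified stand-ins, not the project's real CLI wiring.

```python
import argparse

# Minimal reproduction of the pattern in the hunk above: a bench-style
# subcommand whose --omni flag is a store_true option that already
# defaults to True, plus a dispatch function attached via set_defaults.
parser = argparse.ArgumentParser(prog="vllm-omni")
subparsers = parser.add_subparsers(required=True, dest="bench_type")

serve_parser = subparsers.add_parser("serve", help="benchmark online serving")
serve_parser.add_argument(
    "--omni",
    action="store_true",
    default=True,  # always enabled; passing --omni changes nothing
    help="Enable benchmark-Omni mode (always enabled for omni commands)",
)
serve_parser.set_defaults(dispatch_function=lambda args: print("omni =", args.omni))

for argv in (["serve"], ["serve", "--omni"]):
    args = parser.parse_args(argv)
    assert args.omni is True  # the flag cannot be switched off from the CLI
    args.dispatch_function(args)
```

If the flag ever needs to be genuinely optional again, something like `argparse.BooleanOptionalAction` (Python 3.9+) would provide an explicit `--omni/--no-omni` pair instead of an always-true switch.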