diff --git a/docs/source/_ext/trtllm_config_selector.py b/docs/source/_ext/trtllm_config_selector.py
new file mode 100644
index 00000000000..78edcce7970
--- /dev/null
+++ b/docs/source/_ext/trtllm_config_selector.py
@@ -0,0 +1,37 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+from __future__ import annotations
+
+from docutils import nodes
+from docutils.parsers.rst import Directive, directives
+
+
+class TRTLLMConfigSelector(Directive):
+    """Embed the interactive config selector widget."""
+
+    has_content = False
+    option_spec = {
+        "models": directives.unchanged,
+        "config_db": directives.unchanged,
+    }
+
+    def run(self):
+        models = (self.options.get("models") or "").strip()
+        config_db = (self.options.get("config_db") or "").strip()
+
+        attrs = ['data-trtllm-config-selector="1"']
+        if models:
+            attrs.append(f'data-models="{models}"')
+        if config_db:
+            attrs.append(f'data-config-db="{config_db}"')
+
+        html = f"<div {' '.join(attrs)}></div>"
+        return [nodes.raw("", html, format="html")]
+
+
+def setup(app):
+    app.add_css_file("config_selector.css")
+    app.add_js_file("config_selector.js")
+    app.add_directive("trtllm_config_selector", TRTLLMConfigSelector)
+    return {"version": "0.1", "parallel_read_safe": True, "parallel_write_safe": True}
diff --git a/docs/source/_static/config_db.json b/docs/source/_static/config_db.json
new file mode 100644
index 00000000000..df16335e7de
--- /dev/null
+++ b/docs/source/_static/config_db.json
@@ -0,0 +1,2875 @@
+{
+  "entries": [
+    {
+      "command": "trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc4.yaml",
+      "concurrency": 4,
+      "config_filename": "1k1k_tp8_conc4.yaml",
+      "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc4.yaml",
+      "config_path": "examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc4.yaml",
+      "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc4.yaml",
+      "gpu": "B200_NVL",
+      "gpu_display": "8xB200_NVL",
+      "isl": 1024,
+      "model": "deepseek-ai/DeepSeek-R1-0528",
+      "model_display_name": "DeepSeek-R1",
+      "model_url": "https://huggingface.co/deepseek-ai/DeepSeek-R1-0528",
+      "num_gpus": 8,
+      "osl": 1024,
+      "performance_profile": "Min Latency"
+    },
+    {
+      "command": "trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc8.yaml",
+      "concurrency": 8,
+      "config_filename": "1k1k_tp8_conc8.yaml",
+      "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc8.yaml",
+      "config_path": "examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc8.yaml",
+      "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc8.yaml",
+      "gpu": "B200_NVL",
+      "gpu_display": "8xB200_NVL",
+      "isl": 1024,
+      "model": "deepseek-ai/DeepSeek-R1-0528",
+      "model_display_name": "DeepSeek-R1",
+      "model_url": "https://huggingface.co/deepseek-ai/DeepSeek-R1-0528",
+      "num_gpus": 8,
+      "osl": 1024,
+      "performance_profile": "Low Latency"
+    },
+    {
+      "command": "trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc16.yaml",
+      "concurrency": 16,
+      "config_filename": "1k1k_tp8_conc16.yaml",
+      "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc16.yaml",
+      "config_path": "examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc16.yaml",
+      "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc16.yaml",
+      "gpu": "B200_NVL",
+      "gpu_display": "8xB200_NVL",
+      "isl": 1024,
+      "model": "deepseek-ai/DeepSeek-R1-0528",
+      "model_display_name": "DeepSeek-R1",
+      "model_url": "https://huggingface.co/deepseek-ai/DeepSeek-R1-0528",
+      "num_gpus": 8,
+      "osl": 1024,
+      "performance_profile": "Balanced"
+    },
+    {
+      "command": "trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc32.yaml",
+      "concurrency": 32,
+      "config_filename": "1k1k_tp8_conc32.yaml",
+      "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc32.yaml",
+      "config_path": "examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc32.yaml",
+      "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc32.yaml",
+      "gpu": "B200_NVL",
+      "gpu_display": "8xB200_NVL",
+      "isl": 1024,
+      "model": "deepseek-ai/DeepSeek-R1-0528",
+      "model_display_name": "DeepSeek-R1",
+      "model_url": "https://huggingface.co/deepseek-ai/DeepSeek-R1-0528",
+      "num_gpus": 8,
+      "osl": 1024,
+      "performance_profile": "High Throughput"
+    },
+    {
+      "command": "trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc64.yaml",
+      "concurrency": 64,
+      "config_filename": "1k1k_tp8_conc64.yaml",
+      "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc64.yaml",
+      "config_path": "examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc64.yaml",
+      "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc64.yaml",
+      "gpu": "B200_NVL",
+      "gpu_display": "8xB200_NVL",
+      "isl": 1024,
+      "model": "deepseek-ai/DeepSeek-R1-0528",
+      "model_display_name": "DeepSeek-R1",
+      "model_url": "https://huggingface.co/deepseek-ai/DeepSeek-R1-0528",
+      "num_gpus": 8,
+      "osl": 1024,
+      "performance_profile": "Max Throughput"
+    },
+    {
+      "command": "trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc4.yaml",
+      "concurrency": 4,
+      "config_filename": "8k1k_tp8_conc4.yaml",
+      "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc4.yaml",
+      "config_path": "examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc4.yaml",
+      "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc4.yaml",
+      "gpu": "B200_NVL",
+      "gpu_display": "8xB200_NVL",
+      "isl": 8192,
+      "model": "deepseek-ai/DeepSeek-R1-0528",
+      "model_display_name": "DeepSeek-R1",
+      "model_url": "https://huggingface.co/deepseek-ai/DeepSeek-R1-0528",
+      "num_gpus": 8,
+      "osl": 1024,
+      "performance_profile": "Min Latency"
+    },
+    {
+      "command": "trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc8.yaml",
+      "concurrency": 8,
+      "config_filename": "8k1k_tp8_conc8.yaml",
+      "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc8.yaml",
+      "config_path": "examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc8.yaml",
+      "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc8.yaml",
+      "gpu": "B200_NVL",
+      "gpu_display": "8xB200_NVL",
+      "isl": 8192,
+      "model": "deepseek-ai/DeepSeek-R1-0528",
+      "model_display_name": "DeepSeek-R1",
+      "model_url": "https://huggingface.co/deepseek-ai/DeepSeek-R1-0528",
+      "num_gpus": 8,
+      "osl": 1024,
+      "performance_profile": "Low Latency"
+    },
+    {
+      "command": "trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc16.yaml",
+      "concurrency": 16,
+      "config_filename": "8k1k_tp8_conc16.yaml",
+      "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc16.yaml",
+      "config_path": "examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc16.yaml",
+      "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc16.yaml",
+      "gpu": "B200_NVL",
+      "gpu_display": "8xB200_NVL",
+      "isl": 8192,
+      "model": "deepseek-ai/DeepSeek-R1-0528",
+      "model_display_name": "DeepSeek-R1",
+      "model_url": "https://huggingface.co/deepseek-ai/DeepSeek-R1-0528",
+      "num_gpus": 8,
+      "osl": 1024,
+      "performance_profile": "Balanced"
+    },
+    {
+      "command": "trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc32.yaml",
+      "concurrency": 32,
+      "config_filename": "8k1k_tp8_conc32.yaml",
+      "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc32.yaml",
+      "config_path": "examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc32.yaml",
+      "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc32.yaml",
+      "gpu": "B200_NVL",
+      "gpu_display": "8xB200_NVL",
+      "isl": 8192,
+      "model": "deepseek-ai/DeepSeek-R1-0528",
+      "model_display_name": "DeepSeek-R1",
+      "model_url": "https://huggingface.co/deepseek-ai/DeepSeek-R1-0528",
+      "num_gpus": 8,
+      "osl": 1024,
+      "performance_profile": "High Throughput"
+    },
+    {
+      "command": "trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc64.yaml",
+      "concurrency": 64,
+      "config_filename": "8k1k_tp8_conc64.yaml",
+      "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc64.yaml",
+      "config_path": "examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc64.yaml",
+      "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc64.yaml",
+      "gpu": "B200_NVL",
+      "gpu_display": "8xB200_NVL",
+      "isl": 8192,
+      "model": "deepseek-ai/DeepSeek-R1-0528",
+      "model_display_name": "DeepSeek-R1",
+      "model_url": "https://huggingface.co/deepseek-ai/DeepSeek-R1-0528",
+      "num_gpus": 8,
+      "osl": 1024,
+      "performance_profile": "Max Throughput"
+    },
"examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc4.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc4.yaml", + "gpu": "H200_SXM", + "gpu_display": "8xH200_SXM", + "isl": 1024, + "model": "deepseek-ai/DeepSeek-R1-0528", + "model_display_name": "DeepSeek-R1", + "model_url": "https://huggingface.co/deepseek-ai/DeepSeek-R1-0528", + "num_gpus": 8, + "osl": 1024, + "performance_profile": "Min Latency" + }, + { + "command": "trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc8.yaml", + "concurrency": 8, + "config_filename": "1k1k_tp8_conc8.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc8.yaml", + "config_path": "examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc8.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc8.yaml", + "gpu": "H200_SXM", + "gpu_display": "8xH200_SXM", + "isl": 1024, + "model": "deepseek-ai/DeepSeek-R1-0528", + "model_display_name": "DeepSeek-R1", + "model_url": "https://huggingface.co/deepseek-ai/DeepSeek-R1-0528", + "num_gpus": 8, + "osl": 1024, + "performance_profile": "Low Latency" + }, + { + "command": "trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc16.yaml", + "concurrency": 16, + "config_filename": "1k1k_tp8_conc16.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc16.yaml", + "config_path": "examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc16.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc16.yaml", + "gpu": "H200_SXM", + "gpu_display": "8xH200_SXM", + "isl": 1024, + "model": "deepseek-ai/DeepSeek-R1-0528", + "model_display_name": "DeepSeek-R1", + "model_url": "https://huggingface.co/deepseek-ai/DeepSeek-R1-0528", + "num_gpus": 8, + "osl": 1024, + "performance_profile": "Balanced" + }, + { + "command": "trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc32.yaml", + "concurrency": 32, + "config_filename": "1k1k_tp8_conc32.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc32.yaml", + "config_path": "examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc32.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc32.yaml", + "gpu": "H200_SXM", + "gpu_display": "8xH200_SXM", + "isl": 1024, + "model": "deepseek-ai/DeepSeek-R1-0528", + "model_display_name": "DeepSeek-R1", + "model_url": "https://huggingface.co/deepseek-ai/DeepSeek-R1-0528", + "num_gpus": 8, + "osl": 1024, + "performance_profile": "High Throughput" + }, + { + "command": "trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options 
${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc64.yaml", + "concurrency": 64, + "config_filename": "1k1k_tp8_conc64.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc64.yaml", + "config_path": "examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc64.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc64.yaml", + "gpu": "H200_SXM", + "gpu_display": "8xH200_SXM", + "isl": 1024, + "model": "deepseek-ai/DeepSeek-R1-0528", + "model_display_name": "DeepSeek-R1", + "model_url": "https://huggingface.co/deepseek-ai/DeepSeek-R1-0528", + "num_gpus": 8, + "osl": 1024, + "performance_profile": "Max Throughput" + }, + { + "command": "trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc4.yaml", + "concurrency": 4, + "config_filename": "8k1k_tp8_conc4.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc4.yaml", + "config_path": "examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc4.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc4.yaml", + "gpu": "H200_SXM", + "gpu_display": "8xH200_SXM", + "isl": 8192, + "model": "deepseek-ai/DeepSeek-R1-0528", + "model_display_name": "DeepSeek-R1", + "model_url": "https://huggingface.co/deepseek-ai/DeepSeek-R1-0528", + "num_gpus": 8, + "osl": 1024, + "performance_profile": "Min Latency" + }, + { + "command": "trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc8.yaml", + "concurrency": 8, + "config_filename": "8k1k_tp8_conc8.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc8.yaml", + "config_path": "examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc8.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc8.yaml", + "gpu": "H200_SXM", + "gpu_display": "8xH200_SXM", + "isl": 8192, + "model": "deepseek-ai/DeepSeek-R1-0528", + "model_display_name": "DeepSeek-R1", + "model_url": "https://huggingface.co/deepseek-ai/DeepSeek-R1-0528", + "num_gpus": 8, + "osl": 1024, + "performance_profile": "Low Latency" + }, + { + "command": "trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc16.yaml", + "concurrency": 16, + "config_filename": "8k1k_tp8_conc16.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc16.yaml", + "config_path": "examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc16.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc16.yaml", + "gpu": "H200_SXM", + "gpu_display": "8xH200_SXM", + "isl": 8192, + "model": 
"deepseek-ai/DeepSeek-R1-0528", + "model_display_name": "DeepSeek-R1", + "model_url": "https://huggingface.co/deepseek-ai/DeepSeek-R1-0528", + "num_gpus": 8, + "osl": 1024, + "performance_profile": "Balanced" + }, + { + "command": "trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc32.yaml", + "concurrency": 32, + "config_filename": "8k1k_tp8_conc32.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc32.yaml", + "config_path": "examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc32.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc32.yaml", + "gpu": "H200_SXM", + "gpu_display": "8xH200_SXM", + "isl": 8192, + "model": "deepseek-ai/DeepSeek-R1-0528", + "model_display_name": "DeepSeek-R1", + "model_url": "https://huggingface.co/deepseek-ai/DeepSeek-R1-0528", + "num_gpus": 8, + "osl": 1024, + "performance_profile": "High Throughput" + }, + { + "command": "trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc64.yaml", + "concurrency": 64, + "config_filename": "8k1k_tp8_conc64.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc64.yaml", + "config_path": "examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc64.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc64.yaml", + "gpu": "H200_SXM", + "gpu_display": "8xH200_SXM", + "isl": 8192, + "model": "deepseek-ai/DeepSeek-R1-0528", + "model_display_name": "DeepSeek-R1", + "model_url": "https://huggingface.co/deepseek-ai/DeepSeek-R1-0528", + "num_gpus": 8, + "osl": 1024, + "performance_profile": "Max Throughput" + }, + { + "command": "trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc4.yaml", + "concurrency": 4, + "config_filename": "1k1k_tp4_conc4.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc4.yaml", + "config_path": "examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc4.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc4.yaml", + "gpu": "B200_NVL", + "gpu_display": "4xB200_NVL", + "isl": 1024, + "model": "nvidia/DeepSeek-R1-0528-FP4-v2", + "model_display_name": "DeepSeek-R1 (NVFP4)", + "model_url": "https://huggingface.co/nvidia/DeepSeek-R1-0528-FP4-v2", + "num_gpus": 4, + "osl": 1024, + "performance_profile": "Min Latency" + }, + { + "command": "trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc8.yaml", + "concurrency": 8, + "config_filename": "1k1k_tp4_conc8.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc8.yaml", + "config_path": 
"examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc8.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc8.yaml", + "gpu": "B200_NVL", + "gpu_display": "4xB200_NVL", + "isl": 1024, + "model": "nvidia/DeepSeek-R1-0528-FP4-v2", + "model_display_name": "DeepSeek-R1 (NVFP4)", + "model_url": "https://huggingface.co/nvidia/DeepSeek-R1-0528-FP4-v2", + "num_gpus": 4, + "osl": 1024, + "performance_profile": "Low Latency" + }, + { + "command": "trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc16.yaml", + "concurrency": 16, + "config_filename": "1k1k_tp4_conc16.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc16.yaml", + "config_path": "examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc16.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc16.yaml", + "gpu": "B200_NVL", + "gpu_display": "4xB200_NVL", + "isl": 1024, + "model": "nvidia/DeepSeek-R1-0528-FP4-v2", + "model_display_name": "DeepSeek-R1 (NVFP4)", + "model_url": "https://huggingface.co/nvidia/DeepSeek-R1-0528-FP4-v2", + "num_gpus": 4, + "osl": 1024, + "performance_profile": "Low Latency" + }, + { + "command": "trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc32.yaml", + "concurrency": 32, + "config_filename": "1k1k_tp4_conc32.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc32.yaml", + "config_path": "examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc32.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc32.yaml", + "gpu": "B200_NVL", + "gpu_display": "4xB200_NVL", + "isl": 1024, + "model": "nvidia/DeepSeek-R1-0528-FP4-v2", + "model_display_name": "DeepSeek-R1 (NVFP4)", + "model_url": "https://huggingface.co/nvidia/DeepSeek-R1-0528-FP4-v2", + "num_gpus": 4, + "osl": 1024, + "performance_profile": "Balanced" + }, + { + "command": "trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc64.yaml", + "concurrency": 64, + "config_filename": "1k1k_tp4_conc64.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc64.yaml", + "config_path": "examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc64.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc64.yaml", + "gpu": "B200_NVL", + "gpu_display": "4xB200_NVL", + "isl": 1024, + "model": "nvidia/DeepSeek-R1-0528-FP4-v2", + "model_display_name": "DeepSeek-R1 (NVFP4)", + "model_url": "https://huggingface.co/nvidia/DeepSeek-R1-0528-FP4-v2", + "num_gpus": 4, + "osl": 1024, + "performance_profile": "High Throughput" + }, + { + "command": "trtllm-serve 
+    {
+      "command": "trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc128.yaml",
+      "concurrency": 128,
+      "config_filename": "1k1k_tp4_conc128.yaml",
+      "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc128.yaml",
+      "config_path": "examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc128.yaml",
+      "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc128.yaml",
+      "gpu": "B200_NVL",
+      "gpu_display": "4xB200_NVL",
+      "isl": 1024,
+      "model": "nvidia/DeepSeek-R1-0528-FP4-v2",
+      "model_display_name": "DeepSeek-R1 (NVFP4)",
+      "model_url": "https://huggingface.co/nvidia/DeepSeek-R1-0528-FP4-v2",
+      "num_gpus": 4,
+      "osl": 1024,
+      "performance_profile": "High Throughput"
+    },
+    {
+      "command": "trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc256.yaml",
+      "concurrency": 256,
+      "config_filename": "1k1k_tp4_conc256.yaml",
+      "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc256.yaml",
+      "config_path": "examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc256.yaml",
+      "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc256.yaml",
+      "gpu": "B200_NVL",
+      "gpu_display": "4xB200_NVL",
+      "isl": 1024,
+      "model": "nvidia/DeepSeek-R1-0528-FP4-v2",
+      "model_display_name": "DeepSeek-R1 (NVFP4)",
+      "model_url": "https://huggingface.co/nvidia/DeepSeek-R1-0528-FP4-v2",
+      "num_gpus": 4,
+      "osl": 1024,
+      "performance_profile": "Max Throughput"
+    },
+    {
+      "command": "trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc4.yaml",
+      "concurrency": 4,
+      "config_filename": "8k1k_tp4_conc4.yaml",
+      "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc4.yaml",
+      "config_path": "examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc4.yaml",
+      "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc4.yaml",
+      "gpu": "B200_NVL",
+      "gpu_display": "4xB200_NVL",
+      "isl": 8192,
+      "model": "nvidia/DeepSeek-R1-0528-FP4-v2",
+      "model_display_name": "DeepSeek-R1 (NVFP4)",
+      "model_url": "https://huggingface.co/nvidia/DeepSeek-R1-0528-FP4-v2",
+      "num_gpus": 4,
+      "osl": 1024,
+      "performance_profile": "Min Latency"
+    },
+    {
+      "command": "trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc8.yaml",
+      "concurrency": 8,
+      "config_filename": "8k1k_tp4_conc8.yaml",
+      "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc8.yaml",
+      "config_path": "examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc8.yaml",
+      "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc8.yaml",
+      "gpu": "B200_NVL",
+      "gpu_display": "4xB200_NVL",
+      "isl": 8192,
+      "model": "nvidia/DeepSeek-R1-0528-FP4-v2",
+      "model_display_name": "DeepSeek-R1 (NVFP4)",
+      "model_url": "https://huggingface.co/nvidia/DeepSeek-R1-0528-FP4-v2",
+      "num_gpus": 4,
+      "osl": 1024,
+      "performance_profile": "Low Latency"
+    },
+    {
+      "command": "trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc16.yaml",
+      "concurrency": 16,
+      "config_filename": "8k1k_tp4_conc16.yaml",
+      "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc16.yaml",
+      "config_path": "examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc16.yaml",
+      "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc16.yaml",
+      "gpu": "B200_NVL",
+      "gpu_display": "4xB200_NVL",
+      "isl": 8192,
+      "model": "nvidia/DeepSeek-R1-0528-FP4-v2",
+      "model_display_name": "DeepSeek-R1 (NVFP4)",
+      "model_url": "https://huggingface.co/nvidia/DeepSeek-R1-0528-FP4-v2",
+      "num_gpus": 4,
+      "osl": 1024,
+      "performance_profile": "Low Latency"
+    },
+    {
+      "command": "trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc32.yaml",
+      "concurrency": 32,
+      "config_filename": "8k1k_tp4_conc32.yaml",
+      "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc32.yaml",
+      "config_path": "examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc32.yaml",
+      "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc32.yaml",
+      "gpu": "B200_NVL",
+      "gpu_display": "4xB200_NVL",
+      "isl": 8192,
+      "model": "nvidia/DeepSeek-R1-0528-FP4-v2",
+      "model_display_name": "DeepSeek-R1 (NVFP4)",
+      "model_url": "https://huggingface.co/nvidia/DeepSeek-R1-0528-FP4-v2",
+      "num_gpus": 4,
+      "osl": 1024,
+      "performance_profile": "Balanced"
+    },
+    {
+      "command": "trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc64.yaml",
+      "concurrency": 64,
+      "config_filename": "8k1k_tp4_conc64.yaml",
+      "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc64.yaml",
+      "config_path": "examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc64.yaml",
+      "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc64.yaml",
+      "gpu": "B200_NVL",
+      "gpu_display": "4xB200_NVL",
+      "isl": 8192,
+      "model": "nvidia/DeepSeek-R1-0528-FP4-v2",
+      "model_display_name": "DeepSeek-R1 (NVFP4)",
+      "model_url": "https://huggingface.co/nvidia/DeepSeek-R1-0528-FP4-v2",
+      "num_gpus": 4,
+      "osl": 1024,
+      "performance_profile": "High Throughput"
+    },
+    {
+      "command": "trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc128.yaml",
+      "concurrency": 128,
+      "config_filename": "8k1k_tp4_conc128.yaml",
+      "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc128.yaml",
+      "config_path": "examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc128.yaml",
+      "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc128.yaml",
+      "gpu": "B200_NVL",
+      "gpu_display": "4xB200_NVL",
+      "isl": 8192,
+      "model": "nvidia/DeepSeek-R1-0528-FP4-v2",
+      "model_display_name": "DeepSeek-R1 (NVFP4)",
+      "model_url": "https://huggingface.co/nvidia/DeepSeek-R1-0528-FP4-v2",
+      "num_gpus": 4,
+      "osl": 1024,
+      "performance_profile": "High Throughput"
+    },
+    {
+      "command": "trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc256.yaml",
+      "concurrency": 256,
+      "config_filename": "8k1k_tp4_conc256.yaml",
+      "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc256.yaml",
+      "config_path": "examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc256.yaml",
+      "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc256.yaml",
+      "gpu": "B200_NVL",
+      "gpu_display": "4xB200_NVL",
+      "isl": 8192,
+      "model": "nvidia/DeepSeek-R1-0528-FP4-v2",
+      "model_display_name": "DeepSeek-R1 (NVFP4)",
+      "model_url": "https://huggingface.co/nvidia/DeepSeek-R1-0528-FP4-v2",
+      "num_gpus": 4,
+      "osl": 1024,
+      "performance_profile": "Max Throughput"
+    },
+    {
+      "command": "trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc4.yaml",
+      "concurrency": 4,
+      "config_filename": "1k1k_tp8_conc4.yaml",
+      "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc4.yaml",
+      "config_path": "examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc4.yaml",
+      "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc4.yaml",
+      "gpu": "B200_NVL",
+      "gpu_display": "8xB200_NVL",
+      "isl": 1024,
+      "model": "nvidia/DeepSeek-R1-0528-FP4-v2",
+      "model_display_name": "DeepSeek-R1 (NVFP4)",
+      "model_url": "https://huggingface.co/nvidia/DeepSeek-R1-0528-FP4-v2",
+      "num_gpus": 8,
+      "osl": 1024,
+      "performance_profile": "Min Latency"
+    },
+    {
+      "command": "trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc8.yaml",
+      "concurrency": 8,
+      "config_filename": "1k1k_tp8_conc8.yaml",
+      "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc8.yaml",
+      "config_path": "examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc8.yaml",
+      "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc8.yaml",
+      "gpu": "B200_NVL",
+      "gpu_display": "8xB200_NVL",
+      "isl": 1024,
+      "model": "nvidia/DeepSeek-R1-0528-FP4-v2",
+      "model_display_name": "DeepSeek-R1 (NVFP4)",
+      "model_url": "https://huggingface.co/nvidia/DeepSeek-R1-0528-FP4-v2",
+      "num_gpus": 8,
+      "osl": 1024,
+      "performance_profile": "Low Latency"
+    },
+    {
+      "command": "trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc16.yaml",
+      "concurrency": 16,
+      "config_filename": "1k1k_tp8_conc16.yaml",
+      "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc16.yaml",
+      "config_path": "examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc16.yaml",
+      "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc16.yaml",
+      "gpu": "B200_NVL",
+      "gpu_display": "8xB200_NVL",
+      "isl": 1024,
+      "model": "nvidia/DeepSeek-R1-0528-FP4-v2",
+      "model_display_name": "DeepSeek-R1 (NVFP4)",
+      "model_url": "https://huggingface.co/nvidia/DeepSeek-R1-0528-FP4-v2",
+      "num_gpus": 8,
+      "osl": 1024,
+      "performance_profile": "Low Latency"
+    },
+    {
+      "command": "trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc32.yaml",
+      "concurrency": 32,
+      "config_filename": "1k1k_tp8_conc32.yaml",
+      "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc32.yaml",
+      "config_path": "examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc32.yaml",
+      "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc32.yaml",
+      "gpu": "B200_NVL",
+      "gpu_display": "8xB200_NVL",
+      "isl": 1024,
+      "model": "nvidia/DeepSeek-R1-0528-FP4-v2",
+      "model_display_name": "DeepSeek-R1 (NVFP4)",
+      "model_url": "https://huggingface.co/nvidia/DeepSeek-R1-0528-FP4-v2",
+      "num_gpus": 8,
+      "osl": 1024,
+      "performance_profile": "Balanced"
+    },
+    {
+      "command": "trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc64.yaml",
+      "concurrency": 64,
+      "config_filename": "1k1k_tp8_conc64.yaml",
+      "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc64.yaml",
+      "config_path": "examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc64.yaml",
+      "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc64.yaml",
+      "gpu": "B200_NVL",
+      "gpu_display": "8xB200_NVL",
+      "isl": 1024,
+      "model": "nvidia/DeepSeek-R1-0528-FP4-v2",
+      "model_display_name": "DeepSeek-R1 (NVFP4)",
+      "model_url": "https://huggingface.co/nvidia/DeepSeek-R1-0528-FP4-v2",
+      "num_gpus": 8,
+      "osl": 1024,
+      "performance_profile": "High Throughput"
+    },
"https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc128.yaml", + "config_path": "examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc128.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc128.yaml", + "gpu": "B200_NVL", + "gpu_display": "8xB200_NVL", + "isl": 1024, + "model": "nvidia/DeepSeek-R1-0528-FP4-v2", + "model_display_name": "DeepSeek-R1 (NVFP4)", + "model_url": "https://huggingface.co/nvidia/DeepSeek-R1-0528-FP4-v2", + "num_gpus": 8, + "osl": 1024, + "performance_profile": "High Throughput" + }, + { + "command": "trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc256.yaml", + "concurrency": 256, + "config_filename": "1k1k_tp8_conc256.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc256.yaml", + "config_path": "examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc256.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc256.yaml", + "gpu": "B200_NVL", + "gpu_display": "8xB200_NVL", + "isl": 1024, + "model": "nvidia/DeepSeek-R1-0528-FP4-v2", + "model_display_name": "DeepSeek-R1 (NVFP4)", + "model_url": "https://huggingface.co/nvidia/DeepSeek-R1-0528-FP4-v2", + "num_gpus": 8, + "osl": 1024, + "performance_profile": "Max Throughput" + }, + { + "command": "trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc4.yaml", + "concurrency": 4, + "config_filename": "8k1k_tp8_conc4.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc4.yaml", + "config_path": "examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc4.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc4.yaml", + "gpu": "B200_NVL", + "gpu_display": "8xB200_NVL", + "isl": 8192, + "model": "nvidia/DeepSeek-R1-0528-FP4-v2", + "model_display_name": "DeepSeek-R1 (NVFP4)", + "model_url": "https://huggingface.co/nvidia/DeepSeek-R1-0528-FP4-v2", + "num_gpus": 8, + "osl": 1024, + "performance_profile": "Min Latency" + }, + { + "command": "trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc8.yaml", + "concurrency": 8, + "config_filename": "8k1k_tp8_conc8.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc8.yaml", + "config_path": "examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc8.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc8.yaml", + "gpu": "B200_NVL", + "gpu_display": "8xB200_NVL", + "isl": 8192, + "model": "nvidia/DeepSeek-R1-0528-FP4-v2", + "model_display_name": "DeepSeek-R1 (NVFP4)", + "model_url": 
"https://huggingface.co/nvidia/DeepSeek-R1-0528-FP4-v2", + "num_gpus": 8, + "osl": 1024, + "performance_profile": "Low Latency" + }, + { + "command": "trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc16.yaml", + "concurrency": 16, + "config_filename": "8k1k_tp8_conc16.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc16.yaml", + "config_path": "examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc16.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc16.yaml", + "gpu": "B200_NVL", + "gpu_display": "8xB200_NVL", + "isl": 8192, + "model": "nvidia/DeepSeek-R1-0528-FP4-v2", + "model_display_name": "DeepSeek-R1 (NVFP4)", + "model_url": "https://huggingface.co/nvidia/DeepSeek-R1-0528-FP4-v2", + "num_gpus": 8, + "osl": 1024, + "performance_profile": "Low Latency" + }, + { + "command": "trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc32.yaml", + "concurrency": 32, + "config_filename": "8k1k_tp8_conc32.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc32.yaml", + "config_path": "examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc32.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc32.yaml", + "gpu": "B200_NVL", + "gpu_display": "8xB200_NVL", + "isl": 8192, + "model": "nvidia/DeepSeek-R1-0528-FP4-v2", + "model_display_name": "DeepSeek-R1 (NVFP4)", + "model_url": "https://huggingface.co/nvidia/DeepSeek-R1-0528-FP4-v2", + "num_gpus": 8, + "osl": 1024, + "performance_profile": "Balanced" + }, + { + "command": "trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc64.yaml", + "concurrency": 64, + "config_filename": "8k1k_tp8_conc64.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc64.yaml", + "config_path": "examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc64.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc64.yaml", + "gpu": "B200_NVL", + "gpu_display": "8xB200_NVL", + "isl": 8192, + "model": "nvidia/DeepSeek-R1-0528-FP4-v2", + "model_display_name": "DeepSeek-R1 (NVFP4)", + "model_url": "https://huggingface.co/nvidia/DeepSeek-R1-0528-FP4-v2", + "num_gpus": 8, + "osl": 1024, + "performance_profile": "High Throughput" + }, + { + "command": "trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc128.yaml", + "concurrency": 128, + "config_filename": "8k1k_tp8_conc128.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc128.yaml", + "config_path": 
"examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc128.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc128.yaml", + "gpu": "B200_NVL", + "gpu_display": "8xB200_NVL", + "isl": 8192, + "model": "nvidia/DeepSeek-R1-0528-FP4-v2", + "model_display_name": "DeepSeek-R1 (NVFP4)", + "model_url": "https://huggingface.co/nvidia/DeepSeek-R1-0528-FP4-v2", + "num_gpus": 8, + "osl": 1024, + "performance_profile": "High Throughput" + }, + { + "command": "trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc256.yaml", + "concurrency": 256, + "config_filename": "8k1k_tp8_conc256.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc256.yaml", + "config_path": "examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc256.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc256.yaml", + "gpu": "B200_NVL", + "gpu_display": "8xB200_NVL", + "isl": 8192, + "model": "nvidia/DeepSeek-R1-0528-FP4-v2", + "model_display_name": "DeepSeek-R1 (NVFP4)", + "model_url": "https://huggingface.co/nvidia/DeepSeek-R1-0528-FP4-v2", + "num_gpus": 8, + "osl": 1024, + "performance_profile": "Max Throughput" + }, + { + "command": "trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc4.yaml", + "concurrency": 4, + "config_filename": "1k1k_tp1_conc4.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc4.yaml", + "config_path": "examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc4.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc4.yaml", + "gpu": "B200_NVL", + "gpu_display": "B200_NVL", + "isl": 1024, + "model": "openai/gpt-oss-120b", + "model_display_name": "gpt-oss-120b", + "model_url": "https://huggingface.co/openai/gpt-oss-120b", + "num_gpus": 1, + "osl": 1024, + "performance_profile": "Min Latency" + }, + { + "command": "trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc8.yaml", + "concurrency": 8, + "config_filename": "1k1k_tp1_conc8.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc8.yaml", + "config_path": "examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc8.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc8.yaml", + "gpu": "B200_NVL", + "gpu_display": "B200_NVL", + "isl": 1024, + "model": "openai/gpt-oss-120b", + "model_display_name": "gpt-oss-120b", + "model_url": "https://huggingface.co/openai/gpt-oss-120b", + "num_gpus": 1, + "osl": 1024, + "performance_profile": "Low Latency" + }, + { + "command": "trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc16.yaml", + "concurrency": 16, + "config_filename": 
"1k1k_tp1_conc16.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc16.yaml", + "config_path": "examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc16.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc16.yaml", + "gpu": "B200_NVL", + "gpu_display": "B200_NVL", + "isl": 1024, + "model": "openai/gpt-oss-120b", + "model_display_name": "gpt-oss-120b", + "model_url": "https://huggingface.co/openai/gpt-oss-120b", + "num_gpus": 1, + "osl": 1024, + "performance_profile": "Balanced" + }, + { + "command": "trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc32.yaml", + "concurrency": 32, + "config_filename": "1k1k_tp1_conc32.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc32.yaml", + "config_path": "examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc32.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc32.yaml", + "gpu": "B200_NVL", + "gpu_display": "B200_NVL", + "isl": 1024, + "model": "openai/gpt-oss-120b", + "model_display_name": "gpt-oss-120b", + "model_url": "https://huggingface.co/openai/gpt-oss-120b", + "num_gpus": 1, + "osl": 1024, + "performance_profile": "High Throughput" + }, + { + "command": "trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc64.yaml", + "concurrency": 64, + "config_filename": "1k1k_tp1_conc64.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc64.yaml", + "config_path": "examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc64.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc64.yaml", + "gpu": "B200_NVL", + "gpu_display": "B200_NVL", + "isl": 1024, + "model": "openai/gpt-oss-120b", + "model_display_name": "gpt-oss-120b", + "model_url": "https://huggingface.co/openai/gpt-oss-120b", + "num_gpus": 1, + "osl": 1024, + "performance_profile": "Max Throughput" + }, + { + "command": "trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc4.yaml", + "concurrency": 4, + "config_filename": "1k8k_tp1_conc4.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc4.yaml", + "config_path": "examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc4.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc4.yaml", + "gpu": "B200_NVL", + "gpu_display": "B200_NVL", + "isl": 1024, + "model": "openai/gpt-oss-120b", + "model_display_name": "gpt-oss-120b", + "model_url": "https://huggingface.co/openai/gpt-oss-120b", + "num_gpus": 1, + "osl": 8192, + "performance_profile": "Min Latency" + }, + { + "command": "trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc8.yaml", + 
"concurrency": 8, + "config_filename": "1k8k_tp1_conc8.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc8.yaml", + "config_path": "examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc8.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc8.yaml", + "gpu": "B200_NVL", + "gpu_display": "B200_NVL", + "isl": 1024, + "model": "openai/gpt-oss-120b", + "model_display_name": "gpt-oss-120b", + "model_url": "https://huggingface.co/openai/gpt-oss-120b", + "num_gpus": 1, + "osl": 8192, + "performance_profile": "Low Latency" + }, + { + "command": "trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc16.yaml", + "concurrency": 16, + "config_filename": "1k8k_tp1_conc16.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc16.yaml", + "config_path": "examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc16.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc16.yaml", + "gpu": "B200_NVL", + "gpu_display": "B200_NVL", + "isl": 1024, + "model": "openai/gpt-oss-120b", + "model_display_name": "gpt-oss-120b", + "model_url": "https://huggingface.co/openai/gpt-oss-120b", + "num_gpus": 1, + "osl": 8192, + "performance_profile": "Balanced" + }, + { + "command": "trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc32.yaml", + "concurrency": 32, + "config_filename": "1k8k_tp1_conc32.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc32.yaml", + "config_path": "examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc32.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc32.yaml", + "gpu": "B200_NVL", + "gpu_display": "B200_NVL", + "isl": 1024, + "model": "openai/gpt-oss-120b", + "model_display_name": "gpt-oss-120b", + "model_url": "https://huggingface.co/openai/gpt-oss-120b", + "num_gpus": 1, + "osl": 8192, + "performance_profile": "High Throughput" + }, + { + "command": "trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc64.yaml", + "concurrency": 64, + "config_filename": "1k8k_tp1_conc64.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc64.yaml", + "config_path": "examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc64.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc64.yaml", + "gpu": "B200_NVL", + "gpu_display": "B200_NVL", + "isl": 1024, + "model": "openai/gpt-oss-120b", + "model_display_name": "gpt-oss-120b", + "model_url": "https://huggingface.co/openai/gpt-oss-120b", + "num_gpus": 1, + "osl": 8192, + "performance_profile": "Max Throughput" + }, + { + "command": "trtllm-serve openai/gpt-oss-120b --extra_llm_api_options 
${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc4.yaml", + "concurrency": 4, + "config_filename": "8k1k_tp1_conc4.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc4.yaml", + "config_path": "examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc4.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc4.yaml", + "gpu": "B200_NVL", + "gpu_display": "B200_NVL", + "isl": 8192, + "model": "openai/gpt-oss-120b", + "model_display_name": "gpt-oss-120b", + "model_url": "https://huggingface.co/openai/gpt-oss-120b", + "num_gpus": 1, + "osl": 1024, + "performance_profile": "Min Latency" + }, + { + "command": "trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc8.yaml", + "concurrency": 8, + "config_filename": "8k1k_tp1_conc8.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc8.yaml", + "config_path": "examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc8.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc8.yaml", + "gpu": "B200_NVL", + "gpu_display": "B200_NVL", + "isl": 8192, + "model": "openai/gpt-oss-120b", + "model_display_name": "gpt-oss-120b", + "model_url": "https://huggingface.co/openai/gpt-oss-120b", + "num_gpus": 1, + "osl": 1024, + "performance_profile": "Low Latency" + }, + { + "command": "trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc16.yaml", + "concurrency": 16, + "config_filename": "8k1k_tp1_conc16.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc16.yaml", + "config_path": "examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc16.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc16.yaml", + "gpu": "B200_NVL", + "gpu_display": "B200_NVL", + "isl": 8192, + "model": "openai/gpt-oss-120b", + "model_display_name": "gpt-oss-120b", + "model_url": "https://huggingface.co/openai/gpt-oss-120b", + "num_gpus": 1, + "osl": 1024, + "performance_profile": "Balanced" + }, + { + "command": "trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc32.yaml", + "concurrency": 32, + "config_filename": "8k1k_tp1_conc32.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc32.yaml", + "config_path": "examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc32.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc32.yaml", + "gpu": "B200_NVL", + "gpu_display": "B200_NVL", + "isl": 8192, + "model": "openai/gpt-oss-120b", + "model_display_name": "gpt-oss-120b", + "model_url": "https://huggingface.co/openai/gpt-oss-120b", + "num_gpus": 1, + "osl": 1024, + "performance_profile": "High Throughput" + }, + { + "command": "trtllm-serve 
openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc64.yaml", + "concurrency": 64, + "config_filename": "8k1k_tp1_conc64.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc64.yaml", + "config_path": "examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc64.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc64.yaml", + "gpu": "B200_NVL", + "gpu_display": "B200_NVL", + "isl": 8192, + "model": "openai/gpt-oss-120b", + "model_display_name": "gpt-oss-120b", + "model_url": "https://huggingface.co/openai/gpt-oss-120b", + "num_gpus": 1, + "osl": 1024, + "performance_profile": "Max Throughput" + }, + { + "command": "trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc4.yaml", + "concurrency": 4, + "config_filename": "1k1k_tp2_conc4.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc4.yaml", + "config_path": "examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc4.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc4.yaml", + "gpu": "B200_NVL", + "gpu_display": "2xB200_NVL", + "isl": 1024, + "model": "openai/gpt-oss-120b", + "model_display_name": "gpt-oss-120b", + "model_url": "https://huggingface.co/openai/gpt-oss-120b", + "num_gpus": 2, + "osl": 1024, + "performance_profile": "Min Latency" + }, + { + "command": "trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc8.yaml", + "concurrency": 8, + "config_filename": "1k1k_tp2_conc8.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc8.yaml", + "config_path": "examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc8.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc8.yaml", + "gpu": "B200_NVL", + "gpu_display": "2xB200_NVL", + "isl": 1024, + "model": "openai/gpt-oss-120b", + "model_display_name": "gpt-oss-120b", + "model_url": "https://huggingface.co/openai/gpt-oss-120b", + "num_gpus": 2, + "osl": 1024, + "performance_profile": "Low Latency" + }, + { + "command": "trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc16.yaml", + "concurrency": 16, + "config_filename": "1k1k_tp2_conc16.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc16.yaml", + "config_path": "examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc16.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc16.yaml", + "gpu": "B200_NVL", + "gpu_display": "2xB200_NVL", + "isl": 1024, + "model": "openai/gpt-oss-120b", + "model_display_name": "gpt-oss-120b", + "model_url": "https://huggingface.co/openai/gpt-oss-120b", + "num_gpus": 2, + "osl": 1024, + "performance_profile": "Balanced" + }, 
+ { + "command": "trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc32.yaml", + "concurrency": 32, + "config_filename": "1k1k_tp2_conc32.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc32.yaml", + "config_path": "examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc32.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc32.yaml", + "gpu": "B200_NVL", + "gpu_display": "2xB200_NVL", + "isl": 1024, + "model": "openai/gpt-oss-120b", + "model_display_name": "gpt-oss-120b", + "model_url": "https://huggingface.co/openai/gpt-oss-120b", + "num_gpus": 2, + "osl": 1024, + "performance_profile": "High Throughput" + }, + { + "command": "trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc64.yaml", + "concurrency": 64, + "config_filename": "1k1k_tp2_conc64.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc64.yaml", + "config_path": "examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc64.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc64.yaml", + "gpu": "B200_NVL", + "gpu_display": "2xB200_NVL", + "isl": 1024, + "model": "openai/gpt-oss-120b", + "model_display_name": "gpt-oss-120b", + "model_url": "https://huggingface.co/openai/gpt-oss-120b", + "num_gpus": 2, + "osl": 1024, + "performance_profile": "Max Throughput" + }, + { + "command": "trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc4.yaml", + "concurrency": 4, + "config_filename": "1k8k_tp2_conc4.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc4.yaml", + "config_path": "examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc4.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc4.yaml", + "gpu": "B200_NVL", + "gpu_display": "2xB200_NVL", + "isl": 1024, + "model": "openai/gpt-oss-120b", + "model_display_name": "gpt-oss-120b", + "model_url": "https://huggingface.co/openai/gpt-oss-120b", + "num_gpus": 2, + "osl": 8192, + "performance_profile": "Min Latency" + }, + { + "command": "trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc8.yaml", + "concurrency": 8, + "config_filename": "1k8k_tp2_conc8.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc8.yaml", + "config_path": "examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc8.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc8.yaml", + "gpu": "B200_NVL", + "gpu_display": "2xB200_NVL", + "isl": 1024, + "model": "openai/gpt-oss-120b", + "model_display_name": "gpt-oss-120b", + "model_url": "https://huggingface.co/openai/gpt-oss-120b", + "num_gpus": 2, + "osl": 8192, + 
"performance_profile": "Low Latency" + }, + { + "command": "trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc16.yaml", + "concurrency": 16, + "config_filename": "1k8k_tp2_conc16.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc16.yaml", + "config_path": "examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc16.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc16.yaml", + "gpu": "B200_NVL", + "gpu_display": "2xB200_NVL", + "isl": 1024, + "model": "openai/gpt-oss-120b", + "model_display_name": "gpt-oss-120b", + "model_url": "https://huggingface.co/openai/gpt-oss-120b", + "num_gpus": 2, + "osl": 8192, + "performance_profile": "Balanced" + }, + { + "command": "trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc32.yaml", + "concurrency": 32, + "config_filename": "1k8k_tp2_conc32.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc32.yaml", + "config_path": "examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc32.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc32.yaml", + "gpu": "B200_NVL", + "gpu_display": "2xB200_NVL", + "isl": 1024, + "model": "openai/gpt-oss-120b", + "model_display_name": "gpt-oss-120b", + "model_url": "https://huggingface.co/openai/gpt-oss-120b", + "num_gpus": 2, + "osl": 8192, + "performance_profile": "High Throughput" + }, + { + "command": "trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc64.yaml", + "concurrency": 64, + "config_filename": "1k8k_tp2_conc64.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc64.yaml", + "config_path": "examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc64.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc64.yaml", + "gpu": "B200_NVL", + "gpu_display": "2xB200_NVL", + "isl": 1024, + "model": "openai/gpt-oss-120b", + "model_display_name": "gpt-oss-120b", + "model_url": "https://huggingface.co/openai/gpt-oss-120b", + "num_gpus": 2, + "osl": 8192, + "performance_profile": "Max Throughput" + }, + { + "command": "trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc4.yaml", + "concurrency": 4, + "config_filename": "8k1k_tp2_conc4.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc4.yaml", + "config_path": "examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc4.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc4.yaml", + "gpu": "B200_NVL", + "gpu_display": "2xB200_NVL", + "isl": 8192, + "model": "openai/gpt-oss-120b", + "model_display_name": "gpt-oss-120b", + "model_url": 
"https://huggingface.co/openai/gpt-oss-120b", + "num_gpus": 2, + "osl": 1024, + "performance_profile": "Min Latency" + }, + { + "command": "trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc8.yaml", + "concurrency": 8, + "config_filename": "8k1k_tp2_conc8.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc8.yaml", + "config_path": "examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc8.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc8.yaml", + "gpu": "B200_NVL", + "gpu_display": "2xB200_NVL", + "isl": 8192, + "model": "openai/gpt-oss-120b", + "model_display_name": "gpt-oss-120b", + "model_url": "https://huggingface.co/openai/gpt-oss-120b", + "num_gpus": 2, + "osl": 1024, + "performance_profile": "Low Latency" + }, + { + "command": "trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc16.yaml", + "concurrency": 16, + "config_filename": "8k1k_tp2_conc16.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc16.yaml", + "config_path": "examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc16.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc16.yaml", + "gpu": "B200_NVL", + "gpu_display": "2xB200_NVL", + "isl": 8192, + "model": "openai/gpt-oss-120b", + "model_display_name": "gpt-oss-120b", + "model_url": "https://huggingface.co/openai/gpt-oss-120b", + "num_gpus": 2, + "osl": 1024, + "performance_profile": "Balanced" + }, + { + "command": "trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc32.yaml", + "concurrency": 32, + "config_filename": "8k1k_tp2_conc32.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc32.yaml", + "config_path": "examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc32.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc32.yaml", + "gpu": "B200_NVL", + "gpu_display": "2xB200_NVL", + "isl": 8192, + "model": "openai/gpt-oss-120b", + "model_display_name": "gpt-oss-120b", + "model_url": "https://huggingface.co/openai/gpt-oss-120b", + "num_gpus": 2, + "osl": 1024, + "performance_profile": "High Throughput" + }, + { + "command": "trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc64.yaml", + "concurrency": 64, + "config_filename": "8k1k_tp2_conc64.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc64.yaml", + "config_path": "examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc64.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc64.yaml", + "gpu": "B200_NVL", + "gpu_display": "2xB200_NVL", + "isl": 8192, + "model": "openai/gpt-oss-120b", + 
"model_display_name": "gpt-oss-120b", + "model_url": "https://huggingface.co/openai/gpt-oss-120b", + "num_gpus": 2, + "osl": 1024, + "performance_profile": "Max Throughput" + }, + { + "command": "trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc4.yaml", + "concurrency": 4, + "config_filename": "1k1k_tp4_conc4.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc4.yaml", + "config_path": "examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc4.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc4.yaml", + "gpu": "B200_NVL", + "gpu_display": "4xB200_NVL", + "isl": 1024, + "model": "openai/gpt-oss-120b", + "model_display_name": "gpt-oss-120b", + "model_url": "https://huggingface.co/openai/gpt-oss-120b", + "num_gpus": 4, + "osl": 1024, + "performance_profile": "Min Latency" + }, + { + "command": "trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc8.yaml", + "concurrency": 8, + "config_filename": "1k1k_tp4_conc8.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc8.yaml", + "config_path": "examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc8.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc8.yaml", + "gpu": "B200_NVL", + "gpu_display": "4xB200_NVL", + "isl": 1024, + "model": "openai/gpt-oss-120b", + "model_display_name": "gpt-oss-120b", + "model_url": "https://huggingface.co/openai/gpt-oss-120b", + "num_gpus": 4, + "osl": 1024, + "performance_profile": "Low Latency" + }, + { + "command": "trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc16.yaml", + "concurrency": 16, + "config_filename": "1k1k_tp4_conc16.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc16.yaml", + "config_path": "examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc16.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc16.yaml", + "gpu": "B200_NVL", + "gpu_display": "4xB200_NVL", + "isl": 1024, + "model": "openai/gpt-oss-120b", + "model_display_name": "gpt-oss-120b", + "model_url": "https://huggingface.co/openai/gpt-oss-120b", + "num_gpus": 4, + "osl": 1024, + "performance_profile": "Balanced" + }, + { + "command": "trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc32.yaml", + "concurrency": 32, + "config_filename": "1k1k_tp4_conc32.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc32.yaml", + "config_path": "examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc32.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc32.yaml", + "gpu": "B200_NVL", + "gpu_display": "4xB200_NVL", + "isl": 1024, + 
"model": "openai/gpt-oss-120b", + "model_display_name": "gpt-oss-120b", + "model_url": "https://huggingface.co/openai/gpt-oss-120b", + "num_gpus": 4, + "osl": 1024, + "performance_profile": "High Throughput" + }, + { + "command": "trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc64.yaml", + "concurrency": 64, + "config_filename": "1k1k_tp4_conc64.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc64.yaml", + "config_path": "examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc64.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc64.yaml", + "gpu": "B200_NVL", + "gpu_display": "4xB200_NVL", + "isl": 1024, + "model": "openai/gpt-oss-120b", + "model_display_name": "gpt-oss-120b", + "model_url": "https://huggingface.co/openai/gpt-oss-120b", + "num_gpus": 4, + "osl": 1024, + "performance_profile": "Max Throughput" + }, + { + "command": "trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc4.yaml", + "concurrency": 4, + "config_filename": "1k8k_tp4_conc4.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc4.yaml", + "config_path": "examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc4.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc4.yaml", + "gpu": "B200_NVL", + "gpu_display": "4xB200_NVL", + "isl": 1024, + "model": "openai/gpt-oss-120b", + "model_display_name": "gpt-oss-120b", + "model_url": "https://huggingface.co/openai/gpt-oss-120b", + "num_gpus": 4, + "osl": 8192, + "performance_profile": "Min Latency" + }, + { + "command": "trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc8.yaml", + "concurrency": 8, + "config_filename": "1k8k_tp4_conc8.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc8.yaml", + "config_path": "examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc8.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc8.yaml", + "gpu": "B200_NVL", + "gpu_display": "4xB200_NVL", + "isl": 1024, + "model": "openai/gpt-oss-120b", + "model_display_name": "gpt-oss-120b", + "model_url": "https://huggingface.co/openai/gpt-oss-120b", + "num_gpus": 4, + "osl": 8192, + "performance_profile": "Low Latency" + }, + { + "command": "trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc16.yaml", + "concurrency": 16, + "config_filename": "1k8k_tp4_conc16.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc16.yaml", + "config_path": "examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc16.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc16.yaml", + "gpu": "B200_NVL", + 
"gpu_display": "4xB200_NVL", + "isl": 1024, + "model": "openai/gpt-oss-120b", + "model_display_name": "gpt-oss-120b", + "model_url": "https://huggingface.co/openai/gpt-oss-120b", + "num_gpus": 4, + "osl": 8192, + "performance_profile": "Balanced" + }, + { + "command": "trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc32.yaml", + "concurrency": 32, + "config_filename": "1k8k_tp4_conc32.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc32.yaml", + "config_path": "examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc32.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc32.yaml", + "gpu": "B200_NVL", + "gpu_display": "4xB200_NVL", + "isl": 1024, + "model": "openai/gpt-oss-120b", + "model_display_name": "gpt-oss-120b", + "model_url": "https://huggingface.co/openai/gpt-oss-120b", + "num_gpus": 4, + "osl": 8192, + "performance_profile": "High Throughput" + }, + { + "command": "trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc64.yaml", + "concurrency": 64, + "config_filename": "1k8k_tp4_conc64.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc64.yaml", + "config_path": "examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc64.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc64.yaml", + "gpu": "B200_NVL", + "gpu_display": "4xB200_NVL", + "isl": 1024, + "model": "openai/gpt-oss-120b", + "model_display_name": "gpt-oss-120b", + "model_url": "https://huggingface.co/openai/gpt-oss-120b", + "num_gpus": 4, + "osl": 8192, + "performance_profile": "Max Throughput" + }, + { + "command": "trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc4.yaml", + "concurrency": 4, + "config_filename": "8k1k_tp4_conc4.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc4.yaml", + "config_path": "examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc4.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc4.yaml", + "gpu": "B200_NVL", + "gpu_display": "4xB200_NVL", + "isl": 8192, + "model": "openai/gpt-oss-120b", + "model_display_name": "gpt-oss-120b", + "model_url": "https://huggingface.co/openai/gpt-oss-120b", + "num_gpus": 4, + "osl": 1024, + "performance_profile": "Min Latency" + }, + { + "command": "trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc8.yaml", + "concurrency": 8, + "config_filename": "8k1k_tp4_conc8.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc8.yaml", + "config_path": "examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc8.yaml", + "config_raw_url": 
"https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc8.yaml", + "gpu": "B200_NVL", + "gpu_display": "4xB200_NVL", + "isl": 8192, + "model": "openai/gpt-oss-120b", + "model_display_name": "gpt-oss-120b", + "model_url": "https://huggingface.co/openai/gpt-oss-120b", + "num_gpus": 4, + "osl": 1024, + "performance_profile": "Low Latency" + }, + { + "command": "trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc16.yaml", + "concurrency": 16, + "config_filename": "8k1k_tp4_conc16.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc16.yaml", + "config_path": "examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc16.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc16.yaml", + "gpu": "B200_NVL", + "gpu_display": "4xB200_NVL", + "isl": 8192, + "model": "openai/gpt-oss-120b", + "model_display_name": "gpt-oss-120b", + "model_url": "https://huggingface.co/openai/gpt-oss-120b", + "num_gpus": 4, + "osl": 1024, + "performance_profile": "Balanced" + }, + { + "command": "trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc32.yaml", + "concurrency": 32, + "config_filename": "8k1k_tp4_conc32.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc32.yaml", + "config_path": "examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc32.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc32.yaml", + "gpu": "B200_NVL", + "gpu_display": "4xB200_NVL", + "isl": 8192, + "model": "openai/gpt-oss-120b", + "model_display_name": "gpt-oss-120b", + "model_url": "https://huggingface.co/openai/gpt-oss-120b", + "num_gpus": 4, + "osl": 1024, + "performance_profile": "High Throughput" + }, + { + "command": "trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc64.yaml", + "concurrency": 64, + "config_filename": "8k1k_tp4_conc64.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc64.yaml", + "config_path": "examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc64.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc64.yaml", + "gpu": "B200_NVL", + "gpu_display": "4xB200_NVL", + "isl": 8192, + "model": "openai/gpt-oss-120b", + "model_display_name": "gpt-oss-120b", + "model_url": "https://huggingface.co/openai/gpt-oss-120b", + "num_gpus": 4, + "osl": 1024, + "performance_profile": "Max Throughput" + }, + { + "command": "trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc4.yaml", + "concurrency": 4, + "config_filename": "1k1k_tp8_conc4.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc4.yaml", + "config_path": 
"examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc4.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc4.yaml", + "gpu": "B200_NVL", + "gpu_display": "8xB200_NVL", + "isl": 1024, + "model": "openai/gpt-oss-120b", + "model_display_name": "gpt-oss-120b", + "model_url": "https://huggingface.co/openai/gpt-oss-120b", + "num_gpus": 8, + "osl": 1024, + "performance_profile": "Min Latency" + }, + { + "command": "trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc8.yaml", + "concurrency": 8, + "config_filename": "1k1k_tp8_conc8.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc8.yaml", + "config_path": "examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc8.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc8.yaml", + "gpu": "B200_NVL", + "gpu_display": "8xB200_NVL", + "isl": 1024, + "model": "openai/gpt-oss-120b", + "model_display_name": "gpt-oss-120b", + "model_url": "https://huggingface.co/openai/gpt-oss-120b", + "num_gpus": 8, + "osl": 1024, + "performance_profile": "Low Latency" + }, + { + "command": "trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc16.yaml", + "concurrency": 16, + "config_filename": "1k1k_tp8_conc16.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc16.yaml", + "config_path": "examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc16.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc16.yaml", + "gpu": "B200_NVL", + "gpu_display": "8xB200_NVL", + "isl": 1024, + "model": "openai/gpt-oss-120b", + "model_display_name": "gpt-oss-120b", + "model_url": "https://huggingface.co/openai/gpt-oss-120b", + "num_gpus": 8, + "osl": 1024, + "performance_profile": "Balanced" + }, + { + "command": "trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc32.yaml", + "concurrency": 32, + "config_filename": "1k1k_tp8_conc32.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc32.yaml", + "config_path": "examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc32.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc32.yaml", + "gpu": "B200_NVL", + "gpu_display": "8xB200_NVL", + "isl": 1024, + "model": "openai/gpt-oss-120b", + "model_display_name": "gpt-oss-120b", + "model_url": "https://huggingface.co/openai/gpt-oss-120b", + "num_gpus": 8, + "osl": 1024, + "performance_profile": "High Throughput" + }, + { + "command": "trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc64.yaml", + "concurrency": 64, + "config_filename": "1k1k_tp8_conc64.yaml", + "config_github_url": 
"https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc64.yaml", + "config_path": "examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc64.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc64.yaml", + "gpu": "B200_NVL", + "gpu_display": "8xB200_NVL", + "isl": 1024, + "model": "openai/gpt-oss-120b", + "model_display_name": "gpt-oss-120b", + "model_url": "https://huggingface.co/openai/gpt-oss-120b", + "num_gpus": 8, + "osl": 1024, + "performance_profile": "Max Throughput" + }, + { + "command": "trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc4.yaml", + "concurrency": 4, + "config_filename": "1k8k_tp8_conc4.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc4.yaml", + "config_path": "examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc4.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc4.yaml", + "gpu": "B200_NVL", + "gpu_display": "8xB200_NVL", + "isl": 1024, + "model": "openai/gpt-oss-120b", + "model_display_name": "gpt-oss-120b", + "model_url": "https://huggingface.co/openai/gpt-oss-120b", + "num_gpus": 8, + "osl": 8192, + "performance_profile": "Min Latency" + }, + { + "command": "trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc8.yaml", + "concurrency": 8, + "config_filename": "1k8k_tp8_conc8.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc8.yaml", + "config_path": "examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc8.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc8.yaml", + "gpu": "B200_NVL", + "gpu_display": "8xB200_NVL", + "isl": 1024, + "model": "openai/gpt-oss-120b", + "model_display_name": "gpt-oss-120b", + "model_url": "https://huggingface.co/openai/gpt-oss-120b", + "num_gpus": 8, + "osl": 8192, + "performance_profile": "Low Latency" + }, + { + "command": "trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc16.yaml", + "concurrency": 16, + "config_filename": "1k8k_tp8_conc16.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc16.yaml", + "config_path": "examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc16.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc16.yaml", + "gpu": "B200_NVL", + "gpu_display": "8xB200_NVL", + "isl": 1024, + "model": "openai/gpt-oss-120b", + "model_display_name": "gpt-oss-120b", + "model_url": "https://huggingface.co/openai/gpt-oss-120b", + "num_gpus": 8, + "osl": 8192, + "performance_profile": "Balanced" + }, + { + "command": "trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc32.yaml", + "concurrency": 32, + "config_filename": 
"1k8k_tp8_conc32.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc32.yaml", + "config_path": "examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc32.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc32.yaml", + "gpu": "B200_NVL", + "gpu_display": "8xB200_NVL", + "isl": 1024, + "model": "openai/gpt-oss-120b", + "model_display_name": "gpt-oss-120b", + "model_url": "https://huggingface.co/openai/gpt-oss-120b", + "num_gpus": 8, + "osl": 8192, + "performance_profile": "High Throughput" + }, + { + "command": "trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc64.yaml", + "concurrency": 64, + "config_filename": "1k8k_tp8_conc64.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc64.yaml", + "config_path": "examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc64.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc64.yaml", + "gpu": "B200_NVL", + "gpu_display": "8xB200_NVL", + "isl": 1024, + "model": "openai/gpt-oss-120b", + "model_display_name": "gpt-oss-120b", + "model_url": "https://huggingface.co/openai/gpt-oss-120b", + "num_gpus": 8, + "osl": 8192, + "performance_profile": "Max Throughput" + }, + { + "command": "trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc4.yaml", + "concurrency": 4, + "config_filename": "8k1k_tp8_conc4.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc4.yaml", + "config_path": "examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc4.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc4.yaml", + "gpu": "B200_NVL", + "gpu_display": "8xB200_NVL", + "isl": 8192, + "model": "openai/gpt-oss-120b", + "model_display_name": "gpt-oss-120b", + "model_url": "https://huggingface.co/openai/gpt-oss-120b", + "num_gpus": 8, + "osl": 1024, + "performance_profile": "Min Latency" + }, + { + "command": "trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc8.yaml", + "concurrency": 8, + "config_filename": "8k1k_tp8_conc8.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc8.yaml", + "config_path": "examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc8.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc8.yaml", + "gpu": "B200_NVL", + "gpu_display": "8xB200_NVL", + "isl": 8192, + "model": "openai/gpt-oss-120b", + "model_display_name": "gpt-oss-120b", + "model_url": "https://huggingface.co/openai/gpt-oss-120b", + "num_gpus": 8, + "osl": 1024, + "performance_profile": "Low Latency" + }, + { + "command": "trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc16.yaml", + 
"concurrency": 16, + "config_filename": "8k1k_tp8_conc16.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc16.yaml", + "config_path": "examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc16.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc16.yaml", + "gpu": "B200_NVL", + "gpu_display": "8xB200_NVL", + "isl": 8192, + "model": "openai/gpt-oss-120b", + "model_display_name": "gpt-oss-120b", + "model_url": "https://huggingface.co/openai/gpt-oss-120b", + "num_gpus": 8, + "osl": 1024, + "performance_profile": "Balanced" + }, + { + "command": "trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc32.yaml", + "concurrency": 32, + "config_filename": "8k1k_tp8_conc32.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc32.yaml", + "config_path": "examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc32.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc32.yaml", + "gpu": "B200_NVL", + "gpu_display": "8xB200_NVL", + "isl": 8192, + "model": "openai/gpt-oss-120b", + "model_display_name": "gpt-oss-120b", + "model_url": "https://huggingface.co/openai/gpt-oss-120b", + "num_gpus": 8, + "osl": 1024, + "performance_profile": "High Throughput" + }, + { + "command": "trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc64.yaml", + "concurrency": 64, + "config_filename": "8k1k_tp8_conc64.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc64.yaml", + "config_path": "examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc64.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc64.yaml", + "gpu": "B200_NVL", + "gpu_display": "8xB200_NVL", + "isl": 8192, + "model": "openai/gpt-oss-120b", + "model_display_name": "gpt-oss-120b", + "model_url": "https://huggingface.co/openai/gpt-oss-120b", + "num_gpus": 8, + "osl": 1024, + "performance_profile": "Max Throughput" + }, + { + "command": "trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc4.yaml", + "concurrency": 4, + "config_filename": "1k1k_tp1_conc4.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc4.yaml", + "config_path": "examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc4.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc4.yaml", + "gpu": "H200_SXM", + "gpu_display": "H200_SXM", + "isl": 1024, + "model": "openai/gpt-oss-120b", + "model_display_name": "gpt-oss-120b", + "model_url": "https://huggingface.co/openai/gpt-oss-120b", + "num_gpus": 1, + "osl": 1024, + "performance_profile": "Min Latency" + }, + { + "command": "trtllm-serve openai/gpt-oss-120b --extra_llm_api_options 
${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc8.yaml", + "concurrency": 8, + "config_filename": "1k1k_tp1_conc8.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc8.yaml", + "config_path": "examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc8.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc8.yaml", + "gpu": "H200_SXM", + "gpu_display": "H200_SXM", + "isl": 1024, + "model": "openai/gpt-oss-120b", + "model_display_name": "gpt-oss-120b", + "model_url": "https://huggingface.co/openai/gpt-oss-120b", + "num_gpus": 1, + "osl": 1024, + "performance_profile": "Low Latency" + }, + { + "command": "trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc16.yaml", + "concurrency": 16, + "config_filename": "1k1k_tp1_conc16.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc16.yaml", + "config_path": "examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc16.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc16.yaml", + "gpu": "H200_SXM", + "gpu_display": "H200_SXM", + "isl": 1024, + "model": "openai/gpt-oss-120b", + "model_display_name": "gpt-oss-120b", + "model_url": "https://huggingface.co/openai/gpt-oss-120b", + "num_gpus": 1, + "osl": 1024, + "performance_profile": "Balanced" + }, + { + "command": "trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc32.yaml", + "concurrency": 32, + "config_filename": "1k1k_tp1_conc32.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc32.yaml", + "config_path": "examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc32.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc32.yaml", + "gpu": "H200_SXM", + "gpu_display": "H200_SXM", + "isl": 1024, + "model": "openai/gpt-oss-120b", + "model_display_name": "gpt-oss-120b", + "model_url": "https://huggingface.co/openai/gpt-oss-120b", + "num_gpus": 1, + "osl": 1024, + "performance_profile": "High Throughput" + }, + { + "command": "trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc64.yaml", + "concurrency": 64, + "config_filename": "1k1k_tp1_conc64.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc64.yaml", + "config_path": "examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc64.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc64.yaml", + "gpu": "H200_SXM", + "gpu_display": "H200_SXM", + "isl": 1024, + "model": "openai/gpt-oss-120b", + "model_display_name": "gpt-oss-120b", + "model_url": "https://huggingface.co/openai/gpt-oss-120b", + "num_gpus": 1, + "osl": 1024, + "performance_profile": "Max Throughput" + }, + { + "command": "trtllm-serve 
openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc4.yaml", + "concurrency": 4, + "config_filename": "1k8k_tp1_conc4.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc4.yaml", + "config_path": "examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc4.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc4.yaml", + "gpu": "H200_SXM", + "gpu_display": "H200_SXM", + "isl": 1024, + "model": "openai/gpt-oss-120b", + "model_display_name": "gpt-oss-120b", + "model_url": "https://huggingface.co/openai/gpt-oss-120b", + "num_gpus": 1, + "osl": 8192, + "performance_profile": "Min Latency" + }, + { + "command": "trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc8.yaml", + "concurrency": 8, + "config_filename": "1k8k_tp1_conc8.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc8.yaml", + "config_path": "examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc8.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc8.yaml", + "gpu": "H200_SXM", + "gpu_display": "H200_SXM", + "isl": 1024, + "model": "openai/gpt-oss-120b", + "model_display_name": "gpt-oss-120b", + "model_url": "https://huggingface.co/openai/gpt-oss-120b", + "num_gpus": 1, + "osl": 8192, + "performance_profile": "Low Latency" + }, + { + "command": "trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc16.yaml", + "concurrency": 16, + "config_filename": "1k8k_tp1_conc16.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc16.yaml", + "config_path": "examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc16.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc16.yaml", + "gpu": "H200_SXM", + "gpu_display": "H200_SXM", + "isl": 1024, + "model": "openai/gpt-oss-120b", + "model_display_name": "gpt-oss-120b", + "model_url": "https://huggingface.co/openai/gpt-oss-120b", + "num_gpus": 1, + "osl": 8192, + "performance_profile": "Balanced" + }, + { + "command": "trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc32.yaml", + "concurrency": 32, + "config_filename": "1k8k_tp1_conc32.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc32.yaml", + "config_path": "examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc32.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc32.yaml", + "gpu": "H200_SXM", + "gpu_display": "H200_SXM", + "isl": 1024, + "model": "openai/gpt-oss-120b", + "model_display_name": "gpt-oss-120b", + "model_url": "https://huggingface.co/openai/gpt-oss-120b", + "num_gpus": 1, + "osl": 8192, + "performance_profile": "High Throughput" + }, + { 
+ "command": "trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc64.yaml", + "concurrency": 64, + "config_filename": "1k8k_tp1_conc64.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc64.yaml", + "config_path": "examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc64.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc64.yaml", + "gpu": "H200_SXM", + "gpu_display": "H200_SXM", + "isl": 1024, + "model": "openai/gpt-oss-120b", + "model_display_name": "gpt-oss-120b", + "model_url": "https://huggingface.co/openai/gpt-oss-120b", + "num_gpus": 1, + "osl": 8192, + "performance_profile": "Max Throughput" + }, + { + "command": "trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc4.yaml", + "concurrency": 4, + "config_filename": "8k1k_tp1_conc4.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc4.yaml", + "config_path": "examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc4.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc4.yaml", + "gpu": "H200_SXM", + "gpu_display": "H200_SXM", + "isl": 8192, + "model": "openai/gpt-oss-120b", + "model_display_name": "gpt-oss-120b", + "model_url": "https://huggingface.co/openai/gpt-oss-120b", + "num_gpus": 1, + "osl": 1024, + "performance_profile": "Min Latency" + }, + { + "command": "trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc8.yaml", + "concurrency": 8, + "config_filename": "8k1k_tp1_conc8.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc8.yaml", + "config_path": "examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc8.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc8.yaml", + "gpu": "H200_SXM", + "gpu_display": "H200_SXM", + "isl": 8192, + "model": "openai/gpt-oss-120b", + "model_display_name": "gpt-oss-120b", + "model_url": "https://huggingface.co/openai/gpt-oss-120b", + "num_gpus": 1, + "osl": 1024, + "performance_profile": "Low Latency" + }, + { + "command": "trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc16.yaml", + "concurrency": 16, + "config_filename": "8k1k_tp1_conc16.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc16.yaml", + "config_path": "examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc16.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc16.yaml", + "gpu": "H200_SXM", + "gpu_display": "H200_SXM", + "isl": 8192, + "model": "openai/gpt-oss-120b", + "model_display_name": "gpt-oss-120b", + "model_url": "https://huggingface.co/openai/gpt-oss-120b", + "num_gpus": 1, + "osl": 1024, + 
"performance_profile": "Balanced" + }, + { + "command": "trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc32.yaml", + "concurrency": 32, + "config_filename": "8k1k_tp1_conc32.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc32.yaml", + "config_path": "examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc32.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc32.yaml", + "gpu": "H200_SXM", + "gpu_display": "H200_SXM", + "isl": 8192, + "model": "openai/gpt-oss-120b", + "model_display_name": "gpt-oss-120b", + "model_url": "https://huggingface.co/openai/gpt-oss-120b", + "num_gpus": 1, + "osl": 1024, + "performance_profile": "High Throughput" + }, + { + "command": "trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc64.yaml", + "concurrency": 64, + "config_filename": "8k1k_tp1_conc64.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc64.yaml", + "config_path": "examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc64.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc64.yaml", + "gpu": "H200_SXM", + "gpu_display": "H200_SXM", + "isl": 8192, + "model": "openai/gpt-oss-120b", + "model_display_name": "gpt-oss-120b", + "model_url": "https://huggingface.co/openai/gpt-oss-120b", + "num_gpus": 1, + "osl": 1024, + "performance_profile": "Max Throughput" + }, + { + "command": "trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc4.yaml", + "concurrency": 4, + "config_filename": "1k1k_tp2_conc4.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc4.yaml", + "config_path": "examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc4.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc4.yaml", + "gpu": "H200_SXM", + "gpu_display": "2xH200_SXM", + "isl": 1024, + "model": "openai/gpt-oss-120b", + "model_display_name": "gpt-oss-120b", + "model_url": "https://huggingface.co/openai/gpt-oss-120b", + "num_gpus": 2, + "osl": 1024, + "performance_profile": "Min Latency" + }, + { + "command": "trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc8.yaml", + "concurrency": 8, + "config_filename": "1k1k_tp2_conc8.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc8.yaml", + "config_path": "examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc8.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc8.yaml", + "gpu": "H200_SXM", + "gpu_display": "2xH200_SXM", + "isl": 1024, + "model": "openai/gpt-oss-120b", + "model_display_name": "gpt-oss-120b", + "model_url": "https://huggingface.co/openai/gpt-oss-120b", 
+ "num_gpus": 2, + "osl": 1024, + "performance_profile": "Low Latency" + }, + { + "command": "trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc16.yaml", + "concurrency": 16, + "config_filename": "1k1k_tp2_conc16.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc16.yaml", + "config_path": "examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc16.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc16.yaml", + "gpu": "H200_SXM", + "gpu_display": "2xH200_SXM", + "isl": 1024, + "model": "openai/gpt-oss-120b", + "model_display_name": "gpt-oss-120b", + "model_url": "https://huggingface.co/openai/gpt-oss-120b", + "num_gpus": 2, + "osl": 1024, + "performance_profile": "Balanced" + }, + { + "command": "trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc32.yaml", + "concurrency": 32, + "config_filename": "1k1k_tp2_conc32.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc32.yaml", + "config_path": "examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc32.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc32.yaml", + "gpu": "H200_SXM", + "gpu_display": "2xH200_SXM", + "isl": 1024, + "model": "openai/gpt-oss-120b", + "model_display_name": "gpt-oss-120b", + "model_url": "https://huggingface.co/openai/gpt-oss-120b", + "num_gpus": 2, + "osl": 1024, + "performance_profile": "High Throughput" + }, + { + "command": "trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc64.yaml", + "concurrency": 64, + "config_filename": "1k1k_tp2_conc64.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc64.yaml", + "config_path": "examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc64.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc64.yaml", + "gpu": "H200_SXM", + "gpu_display": "2xH200_SXM", + "isl": 1024, + "model": "openai/gpt-oss-120b", + "model_display_name": "gpt-oss-120b", + "model_url": "https://huggingface.co/openai/gpt-oss-120b", + "num_gpus": 2, + "osl": 1024, + "performance_profile": "Max Throughput" + }, + { + "command": "trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc4.yaml", + "concurrency": 4, + "config_filename": "1k8k_tp2_conc4.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc4.yaml", + "config_path": "examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc4.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc4.yaml", + "gpu": "H200_SXM", + "gpu_display": "2xH200_SXM", + "isl": 1024, + "model": "openai/gpt-oss-120b", + "model_display_name": "gpt-oss-120b", + "model_url": 
"https://huggingface.co/openai/gpt-oss-120b", + "num_gpus": 2, + "osl": 8192, + "performance_profile": "Min Latency" + }, + { + "command": "trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc8.yaml", + "concurrency": 8, + "config_filename": "1k8k_tp2_conc8.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc8.yaml", + "config_path": "examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc8.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc8.yaml", + "gpu": "H200_SXM", + "gpu_display": "2xH200_SXM", + "isl": 1024, + "model": "openai/gpt-oss-120b", + "model_display_name": "gpt-oss-120b", + "model_url": "https://huggingface.co/openai/gpt-oss-120b", + "num_gpus": 2, + "osl": 8192, + "performance_profile": "Low Latency" + }, + { + "command": "trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc16.yaml", + "concurrency": 16, + "config_filename": "1k8k_tp2_conc16.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc16.yaml", + "config_path": "examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc16.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc16.yaml", + "gpu": "H200_SXM", + "gpu_display": "2xH200_SXM", + "isl": 1024, + "model": "openai/gpt-oss-120b", + "model_display_name": "gpt-oss-120b", + "model_url": "https://huggingface.co/openai/gpt-oss-120b", + "num_gpus": 2, + "osl": 8192, + "performance_profile": "Balanced" + }, + { + "command": "trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc32.yaml", + "concurrency": 32, + "config_filename": "1k8k_tp2_conc32.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc32.yaml", + "config_path": "examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc32.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc32.yaml", + "gpu": "H200_SXM", + "gpu_display": "2xH200_SXM", + "isl": 1024, + "model": "openai/gpt-oss-120b", + "model_display_name": "gpt-oss-120b", + "model_url": "https://huggingface.co/openai/gpt-oss-120b", + "num_gpus": 2, + "osl": 8192, + "performance_profile": "High Throughput" + }, + { + "command": "trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc64.yaml", + "concurrency": 64, + "config_filename": "1k8k_tp2_conc64.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc64.yaml", + "config_path": "examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc64.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc64.yaml", + "gpu": "H200_SXM", + "gpu_display": "2xH200_SXM", + "isl": 1024, + "model": "openai/gpt-oss-120b", + 
"model_display_name": "gpt-oss-120b", + "model_url": "https://huggingface.co/openai/gpt-oss-120b", + "num_gpus": 2, + "osl": 8192, + "performance_profile": "Max Throughput" + }, + { + "command": "trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc4.yaml", + "concurrency": 4, + "config_filename": "8k1k_tp2_conc4.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc4.yaml", + "config_path": "examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc4.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc4.yaml", + "gpu": "H200_SXM", + "gpu_display": "2xH200_SXM", + "isl": 8192, + "model": "openai/gpt-oss-120b", + "model_display_name": "gpt-oss-120b", + "model_url": "https://huggingface.co/openai/gpt-oss-120b", + "num_gpus": 2, + "osl": 1024, + "performance_profile": "Min Latency" + }, + { + "command": "trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc8.yaml", + "concurrency": 8, + "config_filename": "8k1k_tp2_conc8.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc8.yaml", + "config_path": "examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc8.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc8.yaml", + "gpu": "H200_SXM", + "gpu_display": "2xH200_SXM", + "isl": 8192, + "model": "openai/gpt-oss-120b", + "model_display_name": "gpt-oss-120b", + "model_url": "https://huggingface.co/openai/gpt-oss-120b", + "num_gpus": 2, + "osl": 1024, + "performance_profile": "Low Latency" + }, + { + "command": "trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc16.yaml", + "concurrency": 16, + "config_filename": "8k1k_tp2_conc16.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc16.yaml", + "config_path": "examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc16.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc16.yaml", + "gpu": "H200_SXM", + "gpu_display": "2xH200_SXM", + "isl": 8192, + "model": "openai/gpt-oss-120b", + "model_display_name": "gpt-oss-120b", + "model_url": "https://huggingface.co/openai/gpt-oss-120b", + "num_gpus": 2, + "osl": 1024, + "performance_profile": "Balanced" + }, + { + "command": "trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc32.yaml", + "concurrency": 32, + "config_filename": "8k1k_tp2_conc32.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc32.yaml", + "config_path": "examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc32.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc32.yaml", + "gpu": "H200_SXM", + "gpu_display": "2xH200_SXM", + "isl": 8192, + 
"model": "openai/gpt-oss-120b", + "model_display_name": "gpt-oss-120b", + "model_url": "https://huggingface.co/openai/gpt-oss-120b", + "num_gpus": 2, + "osl": 1024, + "performance_profile": "High Throughput" + }, + { + "command": "trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc64.yaml", + "concurrency": 64, + "config_filename": "8k1k_tp2_conc64.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc64.yaml", + "config_path": "examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc64.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc64.yaml", + "gpu": "H200_SXM", + "gpu_display": "2xH200_SXM", + "isl": 8192, + "model": "openai/gpt-oss-120b", + "model_display_name": "gpt-oss-120b", + "model_url": "https://huggingface.co/openai/gpt-oss-120b", + "num_gpus": 2, + "osl": 1024, + "performance_profile": "Max Throughput" + }, + { + "command": "trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc4.yaml", + "concurrency": 4, + "config_filename": "1k1k_tp4_conc4.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc4.yaml", + "config_path": "examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc4.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc4.yaml", + "gpu": "H200_SXM", + "gpu_display": "4xH200_SXM", + "isl": 1024, + "model": "openai/gpt-oss-120b", + "model_display_name": "gpt-oss-120b", + "model_url": "https://huggingface.co/openai/gpt-oss-120b", + "num_gpus": 4, + "osl": 1024, + "performance_profile": "Min Latency" + }, + { + "command": "trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc8.yaml", + "concurrency": 8, + "config_filename": "1k1k_tp4_conc8.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc8.yaml", + "config_path": "examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc8.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc8.yaml", + "gpu": "H200_SXM", + "gpu_display": "4xH200_SXM", + "isl": 1024, + "model": "openai/gpt-oss-120b", + "model_display_name": "gpt-oss-120b", + "model_url": "https://huggingface.co/openai/gpt-oss-120b", + "num_gpus": 4, + "osl": 1024, + "performance_profile": "Low Latency" + }, + { + "command": "trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc16.yaml", + "concurrency": 16, + "config_filename": "1k1k_tp4_conc16.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc16.yaml", + "config_path": "examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc16.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc16.yaml", + "gpu": "H200_SXM", + 
"gpu_display": "4xH200_SXM", + "isl": 1024, + "model": "openai/gpt-oss-120b", + "model_display_name": "gpt-oss-120b", + "model_url": "https://huggingface.co/openai/gpt-oss-120b", + "num_gpus": 4, + "osl": 1024, + "performance_profile": "Balanced" + }, + { + "command": "trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc32.yaml", + "concurrency": 32, + "config_filename": "1k1k_tp4_conc32.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc32.yaml", + "config_path": "examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc32.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc32.yaml", + "gpu": "H200_SXM", + "gpu_display": "4xH200_SXM", + "isl": 1024, + "model": "openai/gpt-oss-120b", + "model_display_name": "gpt-oss-120b", + "model_url": "https://huggingface.co/openai/gpt-oss-120b", + "num_gpus": 4, + "osl": 1024, + "performance_profile": "High Throughput" + }, + { + "command": "trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc64.yaml", + "concurrency": 64, + "config_filename": "1k1k_tp4_conc64.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc64.yaml", + "config_path": "examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc64.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc64.yaml", + "gpu": "H200_SXM", + "gpu_display": "4xH200_SXM", + "isl": 1024, + "model": "openai/gpt-oss-120b", + "model_display_name": "gpt-oss-120b", + "model_url": "https://huggingface.co/openai/gpt-oss-120b", + "num_gpus": 4, + "osl": 1024, + "performance_profile": "Max Throughput" + }, + { + "command": "trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc4.yaml", + "concurrency": 4, + "config_filename": "1k8k_tp4_conc4.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc4.yaml", + "config_path": "examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc4.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc4.yaml", + "gpu": "H200_SXM", + "gpu_display": "4xH200_SXM", + "isl": 1024, + "model": "openai/gpt-oss-120b", + "model_display_name": "gpt-oss-120b", + "model_url": "https://huggingface.co/openai/gpt-oss-120b", + "num_gpus": 4, + "osl": 8192, + "performance_profile": "Min Latency" + }, + { + "command": "trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc8.yaml", + "concurrency": 8, + "config_filename": "1k8k_tp4_conc8.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc8.yaml", + "config_path": "examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc8.yaml", + "config_raw_url": 
"https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc8.yaml", + "gpu": "H200_SXM", + "gpu_display": "4xH200_SXM", + "isl": 1024, + "model": "openai/gpt-oss-120b", + "model_display_name": "gpt-oss-120b", + "model_url": "https://huggingface.co/openai/gpt-oss-120b", + "num_gpus": 4, + "osl": 8192, + "performance_profile": "Low Latency" + }, + { + "command": "trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc16.yaml", + "concurrency": 16, + "config_filename": "1k8k_tp4_conc16.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc16.yaml", + "config_path": "examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc16.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc16.yaml", + "gpu": "H200_SXM", + "gpu_display": "4xH200_SXM", + "isl": 1024, + "model": "openai/gpt-oss-120b", + "model_display_name": "gpt-oss-120b", + "model_url": "https://huggingface.co/openai/gpt-oss-120b", + "num_gpus": 4, + "osl": 8192, + "performance_profile": "Balanced" + }, + { + "command": "trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc32.yaml", + "concurrency": 32, + "config_filename": "1k8k_tp4_conc32.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc32.yaml", + "config_path": "examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc32.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc32.yaml", + "gpu": "H200_SXM", + "gpu_display": "4xH200_SXM", + "isl": 1024, + "model": "openai/gpt-oss-120b", + "model_display_name": "gpt-oss-120b", + "model_url": "https://huggingface.co/openai/gpt-oss-120b", + "num_gpus": 4, + "osl": 8192, + "performance_profile": "High Throughput" + }, + { + "command": "trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc64.yaml", + "concurrency": 64, + "config_filename": "1k8k_tp4_conc64.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc64.yaml", + "config_path": "examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc64.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc64.yaml", + "gpu": "H200_SXM", + "gpu_display": "4xH200_SXM", + "isl": 1024, + "model": "openai/gpt-oss-120b", + "model_display_name": "gpt-oss-120b", + "model_url": "https://huggingface.co/openai/gpt-oss-120b", + "num_gpus": 4, + "osl": 8192, + "performance_profile": "Max Throughput" + }, + { + "command": "trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc4.yaml", + "concurrency": 4, + "config_filename": "8k1k_tp4_conc4.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc4.yaml", + "config_path": 
"examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc4.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc4.yaml", + "gpu": "H200_SXM", + "gpu_display": "4xH200_SXM", + "isl": 8192, + "model": "openai/gpt-oss-120b", + "model_display_name": "gpt-oss-120b", + "model_url": "https://huggingface.co/openai/gpt-oss-120b", + "num_gpus": 4, + "osl": 1024, + "performance_profile": "Min Latency" + }, + { + "command": "trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc8.yaml", + "concurrency": 8, + "config_filename": "8k1k_tp4_conc8.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc8.yaml", + "config_path": "examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc8.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc8.yaml", + "gpu": "H200_SXM", + "gpu_display": "4xH200_SXM", + "isl": 8192, + "model": "openai/gpt-oss-120b", + "model_display_name": "gpt-oss-120b", + "model_url": "https://huggingface.co/openai/gpt-oss-120b", + "num_gpus": 4, + "osl": 1024, + "performance_profile": "Low Latency" + }, + { + "command": "trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc16.yaml", + "concurrency": 16, + "config_filename": "8k1k_tp4_conc16.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc16.yaml", + "config_path": "examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc16.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc16.yaml", + "gpu": "H200_SXM", + "gpu_display": "4xH200_SXM", + "isl": 8192, + "model": "openai/gpt-oss-120b", + "model_display_name": "gpt-oss-120b", + "model_url": "https://huggingface.co/openai/gpt-oss-120b", + "num_gpus": 4, + "osl": 1024, + "performance_profile": "Balanced" + }, + { + "command": "trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc32.yaml", + "concurrency": 32, + "config_filename": "8k1k_tp4_conc32.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc32.yaml", + "config_path": "examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc32.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc32.yaml", + "gpu": "H200_SXM", + "gpu_display": "4xH200_SXM", + "isl": 8192, + "model": "openai/gpt-oss-120b", + "model_display_name": "gpt-oss-120b", + "model_url": "https://huggingface.co/openai/gpt-oss-120b", + "num_gpus": 4, + "osl": 1024, + "performance_profile": "High Throughput" + }, + { + "command": "trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc64.yaml", + "concurrency": 64, + "config_filename": "8k1k_tp4_conc64.yaml", + "config_github_url": 
"https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc64.yaml", + "config_path": "examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc64.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc64.yaml", + "gpu": "H200_SXM", + "gpu_display": "4xH200_SXM", + "isl": 8192, + "model": "openai/gpt-oss-120b", + "model_display_name": "gpt-oss-120b", + "model_url": "https://huggingface.co/openai/gpt-oss-120b", + "num_gpus": 4, + "osl": 1024, + "performance_profile": "Max Throughput" + }, + { + "command": "trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc4.yaml", + "concurrency": 4, + "config_filename": "1k1k_tp8_conc4.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc4.yaml", + "config_path": "examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc4.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc4.yaml", + "gpu": "H200_SXM", + "gpu_display": "8xH200_SXM", + "isl": 1024, + "model": "openai/gpt-oss-120b", + "model_display_name": "gpt-oss-120b", + "model_url": "https://huggingface.co/openai/gpt-oss-120b", + "num_gpus": 8, + "osl": 1024, + "performance_profile": "Min Latency" + }, + { + "command": "trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc8.yaml", + "concurrency": 8, + "config_filename": "1k1k_tp8_conc8.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc8.yaml", + "config_path": "examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc8.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc8.yaml", + "gpu": "H200_SXM", + "gpu_display": "8xH200_SXM", + "isl": 1024, + "model": "openai/gpt-oss-120b", + "model_display_name": "gpt-oss-120b", + "model_url": "https://huggingface.co/openai/gpt-oss-120b", + "num_gpus": 8, + "osl": 1024, + "performance_profile": "Low Latency" + }, + { + "command": "trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc16.yaml", + "concurrency": 16, + "config_filename": "1k1k_tp8_conc16.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc16.yaml", + "config_path": "examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc16.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc16.yaml", + "gpu": "H200_SXM", + "gpu_display": "8xH200_SXM", + "isl": 1024, + "model": "openai/gpt-oss-120b", + "model_display_name": "gpt-oss-120b", + "model_url": "https://huggingface.co/openai/gpt-oss-120b", + "num_gpus": 8, + "osl": 1024, + "performance_profile": "Balanced" + }, + { + "command": "trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc32.yaml", + "concurrency": 32, + "config_filename": 
"1k1k_tp8_conc32.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc32.yaml", + "config_path": "examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc32.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc32.yaml", + "gpu": "H200_SXM", + "gpu_display": "8xH200_SXM", + "isl": 1024, + "model": "openai/gpt-oss-120b", + "model_display_name": "gpt-oss-120b", + "model_url": "https://huggingface.co/openai/gpt-oss-120b", + "num_gpus": 8, + "osl": 1024, + "performance_profile": "High Throughput" + }, + { + "command": "trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc64.yaml", + "concurrency": 64, + "config_filename": "1k1k_tp8_conc64.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc64.yaml", + "config_path": "examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc64.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc64.yaml", + "gpu": "H200_SXM", + "gpu_display": "8xH200_SXM", + "isl": 1024, + "model": "openai/gpt-oss-120b", + "model_display_name": "gpt-oss-120b", + "model_url": "https://huggingface.co/openai/gpt-oss-120b", + "num_gpus": 8, + "osl": 1024, + "performance_profile": "Max Throughput" + }, + { + "command": "trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc4.yaml", + "concurrency": 4, + "config_filename": "1k8k_tp8_conc4.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc4.yaml", + "config_path": "examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc4.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc4.yaml", + "gpu": "H200_SXM", + "gpu_display": "8xH200_SXM", + "isl": 1024, + "model": "openai/gpt-oss-120b", + "model_display_name": "gpt-oss-120b", + "model_url": "https://huggingface.co/openai/gpt-oss-120b", + "num_gpus": 8, + "osl": 8192, + "performance_profile": "Min Latency" + }, + { + "command": "trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc8.yaml", + "concurrency": 8, + "config_filename": "1k8k_tp8_conc8.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc8.yaml", + "config_path": "examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc8.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc8.yaml", + "gpu": "H200_SXM", + "gpu_display": "8xH200_SXM", + "isl": 1024, + "model": "openai/gpt-oss-120b", + "model_display_name": "gpt-oss-120b", + "model_url": "https://huggingface.co/openai/gpt-oss-120b", + "num_gpus": 8, + "osl": 8192, + "performance_profile": "Low Latency" + }, + { + "command": "trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc16.yaml", + 
"concurrency": 16, + "config_filename": "1k8k_tp8_conc16.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc16.yaml", + "config_path": "examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc16.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc16.yaml", + "gpu": "H200_SXM", + "gpu_display": "8xH200_SXM", + "isl": 1024, + "model": "openai/gpt-oss-120b", + "model_display_name": "gpt-oss-120b", + "model_url": "https://huggingface.co/openai/gpt-oss-120b", + "num_gpus": 8, + "osl": 8192, + "performance_profile": "Balanced" + }, + { + "command": "trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc32.yaml", + "concurrency": 32, + "config_filename": "1k8k_tp8_conc32.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc32.yaml", + "config_path": "examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc32.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc32.yaml", + "gpu": "H200_SXM", + "gpu_display": "8xH200_SXM", + "isl": 1024, + "model": "openai/gpt-oss-120b", + "model_display_name": "gpt-oss-120b", + "model_url": "https://huggingface.co/openai/gpt-oss-120b", + "num_gpus": 8, + "osl": 8192, + "performance_profile": "High Throughput" + }, + { + "command": "trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc64.yaml", + "concurrency": 64, + "config_filename": "1k8k_tp8_conc64.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc64.yaml", + "config_path": "examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc64.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc64.yaml", + "gpu": "H200_SXM", + "gpu_display": "8xH200_SXM", + "isl": 1024, + "model": "openai/gpt-oss-120b", + "model_display_name": "gpt-oss-120b", + "model_url": "https://huggingface.co/openai/gpt-oss-120b", + "num_gpus": 8, + "osl": 8192, + "performance_profile": "Max Throughput" + }, + { + "command": "trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc4.yaml", + "concurrency": 4, + "config_filename": "8k1k_tp8_conc4.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc4.yaml", + "config_path": "examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc4.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc4.yaml", + "gpu": "H200_SXM", + "gpu_display": "8xH200_SXM", + "isl": 8192, + "model": "openai/gpt-oss-120b", + "model_display_name": "gpt-oss-120b", + "model_url": "https://huggingface.co/openai/gpt-oss-120b", + "num_gpus": 8, + "osl": 1024, + "performance_profile": "Min Latency" + }, + { + "command": "trtllm-serve openai/gpt-oss-120b --extra_llm_api_options 
${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc8.yaml", + "concurrency": 8, + "config_filename": "8k1k_tp8_conc8.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc8.yaml", + "config_path": "examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc8.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc8.yaml", + "gpu": "H200_SXM", + "gpu_display": "8xH200_SXM", + "isl": 8192, + "model": "openai/gpt-oss-120b", + "model_display_name": "gpt-oss-120b", + "model_url": "https://huggingface.co/openai/gpt-oss-120b", + "num_gpus": 8, + "osl": 1024, + "performance_profile": "Low Latency" + }, + { + "command": "trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc16.yaml", + "concurrency": 16, + "config_filename": "8k1k_tp8_conc16.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc16.yaml", + "config_path": "examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc16.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc16.yaml", + "gpu": "H200_SXM", + "gpu_display": "8xH200_SXM", + "isl": 8192, + "model": "openai/gpt-oss-120b", + "model_display_name": "gpt-oss-120b", + "model_url": "https://huggingface.co/openai/gpt-oss-120b", + "num_gpus": 8, + "osl": 1024, + "performance_profile": "Balanced" + }, + { + "command": "trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc32.yaml", + "concurrency": 32, + "config_filename": "8k1k_tp8_conc32.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc32.yaml", + "config_path": "examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc32.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc32.yaml", + "gpu": "H200_SXM", + "gpu_display": "8xH200_SXM", + "isl": 8192, + "model": "openai/gpt-oss-120b", + "model_display_name": "gpt-oss-120b", + "model_url": "https://huggingface.co/openai/gpt-oss-120b", + "num_gpus": 8, + "osl": 1024, + "performance_profile": "High Throughput" + }, + { + "command": "trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc64.yaml", + "concurrency": 64, + "config_filename": "8k1k_tp8_conc64.yaml", + "config_github_url": "https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc64.yaml", + "config_path": "examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc64.yaml", + "config_raw_url": "https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc64.yaml", + "gpu": "H200_SXM", + "gpu_display": "8xH200_SXM", + "isl": 8192, + "model": "openai/gpt-oss-120b", + "model_display_name": "gpt-oss-120b", + "model_url": "https://huggingface.co/openai/gpt-oss-120b", + "num_gpus": 8, + "osl": 1024, + "performance_profile": "Max Throughput" + } + ], + "models": { + 
"deepseek-ai/DeepSeek-R1-0528": { + "display_name": "DeepSeek-R1", + "url": "https://huggingface.co/deepseek-ai/DeepSeek-R1-0528" + }, + "nvidia/DeepSeek-R1-0528-FP4-v2": { + "display_name": "DeepSeek-R1 (NVFP4)", + "url": "https://huggingface.co/nvidia/DeepSeek-R1-0528-FP4-v2" + }, + "openai/gpt-oss-120b": { + "display_name": "gpt-oss-120b", + "url": "https://huggingface.co/openai/gpt-oss-120b" + } + }, + "source": "examples/configs/database/lookup.yaml" +} diff --git a/docs/source/_static/config_selector.css b/docs/source/_static/config_selector.css new file mode 100644 index 00000000000..6ff95978414 --- /dev/null +++ b/docs/source/_static/config_selector.css @@ -0,0 +1,130 @@ +.trtllm-config-selector { + border: 1px solid rgba(0, 0, 0, 0.08); + border-radius: 10px; + padding: 16px; + margin: 16px 0; +} + +.trtllm-config-selector__header { + margin-bottom: 12px; +} + +.trtllm-config-selector__subtitle { + font-size: 0.95rem; + opacity: 0.8; + margin-top: 4px; +} + +.trtllm-config-selector__form { + display: grid; + grid-template-columns: repeat(auto-fit, minmax(220px, 1fr)); + gap: 12px; + margin-top: 12px; +} + +.trtllm-config-selector__label { + display: block; + font-size: 0.85rem; + margin-bottom: 6px; + opacity: 0.9; +} + +.trtllm-config-selector__select { + width: 100%; + padding: 8px 10px; + border-radius: 8px; + border: 1px solid rgba(0, 0, 0, 0.18); + background: transparent; +} + +.trtllm-config-selector__output { + margin-top: 14px; +} + +.trtllm-config-selector__cmd { + margin: 0; + padding: 10px 12px; + border-radius: 10px; + border: 1px solid rgba(0, 0, 0, 0.12); + overflow-x: auto; + white-space: pre-wrap; + overflow-wrap: anywhere; + position: relative; + padding-right: 54px; /* room for inline copy button */ +} + +.trtllm-config-selector__meta { + margin-top: 8px; + font-size: 0.9rem; + opacity: 0.85; +} + +.trtllm-config-selector__yamlDetails { + margin-top: 12px; +} + +.trtllm-config-selector__yamlSummary { + cursor: pointer; + font-weight: 600; +} + +.trtllm-config-selector__yamlBox { + margin-top: 10px; +} + +.trtllm-config-selector__yamlPre { + margin: 0; + padding: 10px 12px; + border-radius: 10px; + border: 1px solid rgba(0, 0, 0, 0.12); + overflow-x: auto; + max-height: 520px; + position: relative; + padding-right: 54px; /* room for inline copy button */ +} + +.trtllm-config-selector__copyInline { + position: absolute; + top: 8px; + right: 8px; + font-size: 0.85rem; + padding: 6px 10px; + border-radius: 10px; + border: 1px solid rgba(0, 0, 0, 0.12); + background: rgba(255, 255, 255, 0.9); + cursor: pointer; +} + +.trtllm-config-selector__copyInline:disabled { + opacity: 0.5; + cursor: not-allowed; +} + +.trtllm-config-selector__copyInline:hover:not(:disabled) { + background: rgba(255, 255, 255, 1); +} + +.trtllm-config-selector__configLink { + text-decoration: underline; +} + +.yaml-key { + font-weight: 600; +} + +.yaml-comment { + opacity: 0.7; +} + +.yaml-punct, +.yaml-bool, +.yaml-num, +.yaml-str { + opacity: 0.9; +} + +.trtllm-config-selector__error { + margin-top: 10px; + font-size: 0.9rem; + opacity: 0.85; +} diff --git a/docs/source/_static/config_selector.js b/docs/source/_static/config_selector.js new file mode 100644 index 00000000000..85cb90a9099 --- /dev/null +++ b/docs/source/_static/config_selector.js @@ -0,0 +1,579 @@ +(function () { + "use strict"; + + let dbPromise = null; + let widgetId = 0; + + function $(root, sel) { + return root.querySelector(sel); + } + + function el(tag, attrs = {}, children = []) { + const node = 
document.createElement(tag); + for (const [k, v] of Object.entries(attrs)) { + if (k === "class") node.className = String(v); + else if (k === "text") node.textContent = String(v); + else if (k.startsWith("data-")) node.setAttribute(k, String(v)); + else if (k === "for") node.htmlFor = String(v); + else node.setAttribute(k, String(v)); + } + for (const c of children) node.appendChild(c); + return node; + } + + function uniqBy(arr, keyFn) { + const seen = new Set(); + const out = []; + for (const x of arr) { + const k = keyFn(x); + if (!seen.has(k)) { + seen.add(k); + out.push(x); + } + } + return out; + } + + function sortStrings(a, b) { + return String(a).localeCompare(String(b)); + } + + function sortNums(a, b) { + return Number(a) - Number(b); + } + + async function loadDb(dbUrl) { + if (!dbPromise) { + dbPromise = fetch(dbUrl, { credentials: "same-origin" }).then((r) => { + if (!r.ok) { + throw new Error(`Failed to load config DB (${r.status}): ${dbUrl}`); + } + return r.json(); + }); + } + return dbPromise; + } + + function defaultDbUrl() { + const scriptEl = document.querySelector('script[src*="config_selector.js"]'); + if (scriptEl && scriptEl.src) { + const u = new URL(scriptEl.src, document.baseURI); + u.pathname = u.pathname.replace(/config_selector\.js$/, "config_db.json"); + u.search = ""; + u.hash = ""; + return u.toString(); + } + return new URL("_static/config_db.json", document.baseURI).toString(); + } + + async function copyText(text) { + if (navigator.clipboard && navigator.clipboard.writeText) { + await navigator.clipboard.writeText(text); + return; + } + const ta = el("textarea", { "aria-hidden": "true" }); + ta.value = text; + ta.style.position = "fixed"; + ta.style.left = "-9999px"; + document.body.appendChild(ta); + ta.select(); + document.execCommand("copy"); + document.body.removeChild(ta); + } + + function escapeHtml(s) { + return String(s) + .replaceAll("&", "&amp;") + .replaceAll("<", "&lt;") + .replaceAll(">", "&gt;") + .replaceAll('"', "&quot;") + .replaceAll("'", "&#39;"); + } + + function highlightYaml(yamlText) { + const lines = String(yamlText).split("\n"); + const out = []; + + function highlightScalar(raw) { + const m = String(raw).match(/^(\s*)(.*?)(\s*)$/); + const lead = m ? m[1] : ""; + const core = m ? m[2] : String(raw); + const trail = m ? m[3] : ""; + const t = core.trim(); + if (!t) return escapeHtml(raw); + + const boolNull = /^(true|false|null|~)$/; + const num = /^-?\d+(\.\d+)?$/; + const dq = t.length >= 2 && t.startsWith('"') && t.endsWith('"'); + const sq = t.length >= 2 && t.startsWith("'") && t.endsWith("'"); + + if (boolNull.test(t)) { + return `${escapeHtml(lead)}<span class="yaml-bool">${escapeHtml(core)}</span>${escapeHtml(trail)}`; + } + if (num.test(t)) { + return `${escapeHtml(lead)}<span class="yaml-num">${escapeHtml(core)}</span>${escapeHtml(trail)}`; + } + if (dq || sq) { + return `${escapeHtml(lead)}<span class="yaml-str">${escapeHtml(core)}</span>${escapeHtml(trail)}`; + } + return escapeHtml(raw); + } + + for (const line of lines) { + const hashIdx = line.indexOf("#"); + const hasComment = hashIdx >= 0; + const codePart = hasComment ? line.slice(0, hashIdx) : line; + const commentPart = hasComment ? line.slice(hashIdx) : ""; + + const mList = codePart.match(/^(\s*)(-\s+)?(.*)$/); + const indent = mList ? mList[1] : ""; + const dash = mList && mList[2] ? mList[2] : ""; + const rest = mList ? 
mList[3] : codePart; + + const idx = rest.indexOf(":"); + let html = ""; + if (idx >= 0) { + const keyRaw = rest.slice(0, idx); + const after = rest.slice(idx + 1); + html += escapeHtml(indent); + if (dash) html += `<span class="yaml-punct">-</span>${escapeHtml(dash.slice(1))}`; + html += `<span class="yaml-key">${escapeHtml(keyRaw.trimEnd())}</span>`; + html += `<span class="yaml-punct">:</span>`; + html += highlightScalar(after); + } else { + html += escapeHtml(indent); + if (dash) html += `<span class="yaml-punct">-</span>${escapeHtml(dash.slice(1))}`; + html += highlightScalar(rest); + } + + if (commentPart) { + html += `<span class="yaml-comment">${escapeHtml(commentPart)}</span>`; + } + out.push(html); + } + return out.join("\n"); + } + + function formatCommand(entry) { + const model = entry.model || ""; + const configPath = entry.config_path || ""; + if (!model || !configPath) return entry.command || ""; + return [ + `trtllm-serve ${model} \\`, + ` --extra_llm_api_options \${TRTLLM_DIR}/${configPath}`, + ].join("\n"); + } + + function parseCsvModels(s) { + if (!s) return null; + const parts = String(s) + .split(",") + .map((x) => x.trim()) + .filter(Boolean); + return parts.length ? parts : null; + } + + function initOne(container, payload) { + const allowedModels = parseCsvModels(container.getAttribute("data-models")); + + const allEntries = Array.isArray(payload.entries) ? payload.entries : []; + const entries = allowedModels + ? allEntries.filter((e) => allowedModels.includes(e.model)) + : allEntries.slice(); + + const modelsInfo = payload.models || {}; + + const state = { + model: "", + topology: "", + islOsl: "", + profile: "", + concurrency: "", + }; + + container.innerHTML = ""; + container.classList.add("trtllm-config-selector"); + + const header = el("div", { class: "trtllm-config-selector__header" }, [ + el("div", { + class: "trtllm-config-selector__subtitle", + text: "Select a model + deployment shape to generate a trtllm-serve command.", + }), + ]); + + const form = el("div", { class: "trtllm-config-selector__form" }); + + function mkSelect(labelText, id) { + const label = el("label", { + class: "trtllm-config-selector__label", + for: id, + text: labelText, + }); + const select = el("select", { class: "trtllm-config-selector__select", id }); + const wrap = el("div", { class: "trtllm-config-selector__field" }, [label, select]); + return { wrap, select }; + } + + const id = ++widgetId; + const selModel = mkSelect("Model", `trtllm-model-${id}`); + const selTopo = mkSelect("Topology", `trtllm-topo-${id}`); + const selSeq = mkSelect("ISL / OSL", `trtllm-seq-${id}`); + const selProf = mkSelect("Performance profile", `trtllm-prof-${id}`); + const selConc = mkSelect("Concurrency", `trtllm-conc-${id}`); + + form.appendChild(selModel.wrap); + form.appendChild(selTopo.wrap); + form.appendChild(selSeq.wrap); + form.appendChild(selProf.wrap); + form.appendChild(selConc.wrap); + + const output = el("div", { class: "trtllm-config-selector__output" }); + const cmdPre = el("pre", { class: "trtllm-config-selector__cmd" }, [ + el("code", { class: "trtllm-config-selector__cmdcode", text: "" }), + ]); + const cmdCopyBtn = el("button", { + class: "trtllm-config-selector__copyInline", + type: "button", + title: "Copy command", + "aria-label": "Copy command", + text: "Copy", + }); + const meta = el("div", { class: "trtllm-config-selector__meta", text: "" }); + + output.appendChild(cmdPre); + output.appendChild(meta); + cmdPre.appendChild(cmdCopyBtn); + + const yamlDetails = el("details", { class: "trtllm-config-selector__yamlDetails" }, [ + el("summary", { class: "trtllm-config-selector__yamlSummary", text: "Show config YAML" }), + ]); + const yamlBox 
= el("div", { class: "trtllm-config-selector__yamlBox" }); + const yamlPre = el("pre", { class: "trtllm-config-selector__yamlPre" }, [ + el("code", { class: "trtllm-config-selector__yamlCode", text: "" }), + ]); + const yamlCopyBtn = el("button", { + class: "trtllm-config-selector__copyInline", + type: "button", + title: "Copy YAML", + "aria-label": "Copy YAML", + text: "Copy", + }); + yamlBox.appendChild(yamlPre); + yamlDetails.appendChild(yamlBox); + output.appendChild(yamlDetails); + yamlPre.appendChild(yamlCopyBtn); + + const errorBox = el("div", { class: "trtllm-config-selector__error", text: "" }); + + container.appendChild(header); + container.appendChild(form); + container.appendChild(output); + container.appendChild(errorBox); + + const yamlCache = new Map(); + let currentEntry = null; + let currentYamlText = ""; + const yamlCodeEl = $(yamlPre, "code"); + + async function fetchYamlFor(entry) { + const url = entry.config_raw_url || ""; + if (!url) return null; + if (yamlCache.has(url)) return yamlCache.get(url) || ""; + const r = await fetch(url, { credentials: "omit" }); + if (!r.ok) throw new Error(`Failed to fetch YAML (${r.status}): ${url}`); + const txt = await r.text(); + yamlCache.set(url, txt); + return txt; + } + + function resetYamlPanel() { + yamlDetails.open = false; + yamlDetails.dataset.state = "idle"; + yamlCodeEl.textContent = ""; + yamlCopyBtn.disabled = true; + currentYamlText = ""; + } + + resetYamlPanel(); + + yamlDetails.addEventListener("toggle", async () => { + if (!yamlDetails.open) return; + if (!currentEntry) { + yamlDetails.dataset.state = "idle"; + yamlCodeEl.textContent = "Select a configuration above to view its YAML."; + return; + } + if (yamlDetails.dataset.state === "loaded") return; + if (yamlDetails.dataset.state === "loading") return; + + const e = currentEntry; + if (!e.config_raw_url) { + yamlDetails.dataset.state = "error"; + yamlCodeEl.textContent = "No raw URL available for this config."; + return; + } + + yamlDetails.dataset.state = "loading"; + yamlCodeEl.textContent = `Loading YAML from ${e.config_raw_url} …`; + try { + const txt = await fetchYamlFor(e); + currentYamlText = txt || ""; + yamlDetails.dataset.state = "loaded"; + yamlCodeEl.innerHTML = highlightYaml(currentYamlText); + yamlCopyBtn.disabled = !currentYamlText; + } catch (err) { + yamlDetails.dataset.state = "error"; + yamlCopyBtn.disabled = true; + yamlCodeEl.textContent = `Failed to load YAML.\n\n${String(err)}`; + } + }); + + yamlCopyBtn.addEventListener("click", async () => { + const txt = currentYamlText || yamlCodeEl.textContent || ""; + if (!txt) return; + try { + await copyText(txt); + yamlCopyBtn.textContent = "Copied"; + setTimeout(() => (yamlCopyBtn.textContent = "Copy"), 1200); + } catch (_) { + yamlCopyBtn.textContent = "Copy failed"; + setTimeout(() => (yamlCopyBtn.textContent = "Copy"), 1500); + } + }); + + function setSelectOptions(select, options, value, placeholder) { + select.innerHTML = ""; + select.appendChild(el("option", { value: "", text: placeholder || "Select…" })); + for (const opt of options) { + select.appendChild(el("option", { value: opt.value, text: opt.label })); + } + select.value = value || ""; + select.disabled = options.length === 0; + } + + function filteredByState(prefixOnly = false) { + return entries.filter((e) => { + if (state.model && e.model !== state.model) return false; + if (state.topology) { + const [ng, gpu] = state.topology.split("|"); + if (String(e.num_gpus) !== ng || e.gpu !== gpu) return false; + } + if (state.islOsl) { + 
const [isl, osl] = state.islOsl.split("|"); + if (String(e.isl) !== isl || String(e.osl) !== osl) return false; + } + if (!prefixOnly && state.profile && e.performance_profile !== state.profile) return false; + if (!prefixOnly && state.concurrency && String(e.concurrency) !== state.concurrency) return false; + return true; + }); + } + + function render() { + errorBox.textContent = ""; + + // Model options + const modelOpts = uniqBy( + entries.map((e) => e.model), + (m) => m + ) + .sort(sortStrings) + .map((m) => { + const info = modelsInfo[m]; + const label = info && info.display_name ? `${info.display_name} (${m})` : m; + return { value: m, label }; + }); + if (state.model && !modelOpts.some((o) => o.value === state.model)) state.model = ""; + if (!state.model && modelOpts.length === 1) state.model = modelOpts[0].value; + setSelectOptions(selModel.select, modelOpts, state.model, "Select a model…"); + + // Topology options + const topoEntries = entries.filter((e) => !state.model || e.model === state.model); + const topoOpts = uniqBy( + topoEntries.map((e) => ({ + value: `${e.num_gpus}|${e.gpu}`, + label: e.gpu_display || `${e.num_gpus}x${e.gpu}`, + num_gpus: e.num_gpus, + gpu: e.gpu, + })), + (o) => o.value + ) + .sort((a, b) => sortNums(a.num_gpus, b.num_gpus) || sortStrings(a.gpu, b.gpu)); + if (state.topology && !topoOpts.some((o) => o.value === state.topology)) state.topology = ""; + if (!state.topology && topoOpts.length === 1) state.topology = topoOpts[0].value; + setSelectOptions(selTopo.select, topoOpts, state.topology, "Select a topology…"); + + // ISL/OSL options + const seqEntries = entries.filter((e) => { + if (state.model && e.model !== state.model) return false; + if (state.topology) { + const [ng, gpu] = state.topology.split("|"); + if (String(e.num_gpus) !== ng || e.gpu !== gpu) return false; + } + return true; + }); + const seqOpts = uniqBy( + seqEntries.map((e) => ({ + value: `${e.isl}|${e.osl}`, + label: `${e.isl} / ${e.osl}`, + isl: e.isl, + osl: e.osl, + })), + (o) => o.value + ).sort((a, b) => sortNums(a.isl, b.isl) || sortNums(a.osl, b.osl)); + if (state.islOsl && !seqOpts.some((o) => o.value === state.islOsl)) state.islOsl = ""; + if (!state.islOsl && seqOpts.length === 1) state.islOsl = seqOpts[0].value; + setSelectOptions(selSeq.select, seqOpts, state.islOsl, "Select ISL/OSL…"); + + // Profile options + const prefEntries = filteredByState(true); + const profOpts = uniqBy( + prefEntries.map((e) => e.performance_profile), + (p) => p + ) + .sort(sortStrings) + .map((p) => ({ value: p, label: p })); + if (state.profile && !profOpts.some((o) => o.value === state.profile)) state.profile = ""; + if (!state.profile && profOpts.length === 1) state.profile = profOpts[0].value; + // Prefer Balanced if present (nicer default). 
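+ // Applies only when no profile has been chosen yet; an explicit user selection is never overridden.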
+ if (!state.profile && profOpts.some((o) => o.value === "Balanced")) state.profile = "Balanced"; + setSelectOptions(selProf.select, profOpts, state.profile, "Select a profile…"); + + // Concurrency options (filtered by profile if chosen) + const profEntries2 = filteredByState(true).filter((e) => !state.profile || e.performance_profile === state.profile); + const concOpts = uniqBy( + profEntries2.map((e) => ({ value: String(e.concurrency), label: String(e.concurrency), conc: e.concurrency })), + (o) => o.value + ).sort((a, b) => sortNums(a.conc, b.conc)); + if (state.concurrency && !concOpts.some((o) => o.value === state.concurrency)) state.concurrency = ""; + if (!state.concurrency && concOpts.length === 1) state.concurrency = concOpts[0].value; + setSelectOptions(selConc.select, concOpts, state.concurrency, "Select concurrency…"); + + // Resolve final selection + const finalEntries = filteredByState(false).filter((e) => { + if (state.profile && e.performance_profile !== state.profile) return false; + if (state.concurrency && String(e.concurrency) !== state.concurrency) return false; + return true; + }); + + const code = /** @type {HTMLElement} */ ($(cmdPre, "code")); + if (finalEntries.length === 1) { + const e = finalEntries[0]; + code.textContent = formatCommand(e); + cmdCopyBtn.disabled = !e.command; + meta.textContent = ""; + meta.appendChild(el("span", { text: "Config: " })); + const cfgHref = e.config_github_url || e.config_raw_url || ""; + if (cfgHref) { + meta.appendChild( + el("a", { + class: "trtllm-config-selector__configLink", + href: cfgHref, + target: "_blank", + rel: "noopener", + text: e.config_path || cfgHref, + }) + ); + } else { + meta.appendChild(el("span", { text: e.config_path || "" })); + } + + currentEntry = e; + resetYamlPanel(); + } else { + code.textContent = ""; + cmdCopyBtn.disabled = true; + meta.textContent = ""; + currentEntry = null; + resetYamlPanel(); + if (entries.length === 0) { + errorBox.textContent = "No configuration entries available for this page."; + } else if (state.model && topoOpts.length === 0) { + errorBox.textContent = "No matching topologies for this model."; + } else if (state.topology && seqOpts.length === 0) { + errorBox.textContent = "No matching ISL/OSL options for this selection."; + } else if (state.islOsl && profOpts.length === 0) { + errorBox.textContent = "No matching performance profiles for this selection."; + } else if (state.profile && concOpts.length === 0) { + errorBox.textContent = "No matching concurrencies for this profile."; + } else if (state.model && state.topology && state.islOsl && state.profile && state.concurrency) { + errorBox.textContent = "Selection did not resolve to a single configuration."; + } else { + errorBox.textContent = "Select options above to generate a command."; + } + } + } + + selModel.select.addEventListener("change", () => { + state.model = selModel.select.value; + state.topology = ""; + state.islOsl = ""; + state.profile = ""; + state.concurrency = ""; + render(); + }); + selTopo.select.addEventListener("change", () => { + state.topology = selTopo.select.value; + state.islOsl = ""; + state.profile = ""; + state.concurrency = ""; + render(); + }); + selSeq.select.addEventListener("change", () => { + state.islOsl = selSeq.select.value; + state.profile = ""; + state.concurrency = ""; + render(); + }); + selProf.select.addEventListener("change", () => { + state.profile = selProf.select.value; + state.concurrency = ""; + render(); + }); + selConc.select.addEventListener("change", () => { + 
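// Concurrency is the last, most specific filter, so no downstream state needs resetting here. +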
state.concurrency = selConc.select.value; + render(); + }); + + cmdCopyBtn.addEventListener("click", async () => { + const code = $(cmdPre, "code"); + const txt = (code && code.textContent) || ""; + if (!txt) return; + try { + await copyText(txt); + cmdCopyBtn.textContent = "Copied"; + setTimeout(() => (cmdCopyBtn.textContent = "Copy"), 1200); + } catch (e) { + cmdCopyBtn.textContent = "Copy failed"; + setTimeout(() => (cmdCopyBtn.textContent = "Copy"), 1500); + } + }); + + render(); + } + + async function main() { + const containers = Array.from(document.querySelectorAll("[data-trtllm-config-selector]")); + if (!containers.length) return; + + const first = containers[0]; + const dbPath = first.getAttribute("data-config-db"); + const dbUrl = dbPath + ? new URL(dbPath, document.baseURI).toString() + : defaultDbUrl(); + + try { + const payload = await loadDb(dbUrl); + for (const c of containers) initOne(c, payload); + } catch (err) { + for (const c of containers) { + c.textContent = `Failed to load configuration database: ${String(err)}`; + } + } + } + + if (document.readyState === "loading") { + document.addEventListener("DOMContentLoaded", main); + } else { + main(); + } +})(); diff --git a/docs/source/commands/trtllm-serve/run-benchmark-with-trtllm-serve.md b/docs/source/commands/trtllm-serve/run-benchmark-with-trtllm-serve.md index 34a509f5a4f..9e031475455 100644 --- a/docs/source/commands/trtllm-serve/run-benchmark-with-trtllm-serve.md +++ b/docs/source/commands/trtllm-serve/run-benchmark-with-trtllm-serve.md @@ -44,7 +44,7 @@ TensorRT LLM distributes the pre-built container on [NGC Catalog](https://catalo You can launch the container using the following command: ```bash -docker run --rm -it --ipc host -p 8000:8000 --gpus all --ulimit memlock=-1 --ulimit stack=67108864 nvcr.io/nvidia/tensorrt-llm/release:x.y.z +docker run --rm -it --ipc host -p 8000:8000 --gpus all --ulimit memlock=-1 --ulimit stack=67108864 nvcr.io/nvidia/tensorrt-llm/release:1.2.0rc6 ``` diff --git a/docs/source/conf.py b/docs/source/conf.py index fdabe15e17e..3705eafc643 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -15,6 +15,7 @@ from docutils import nodes sys.path.insert(0, os.path.abspath('.')) +sys.path.insert(0, os.path.abspath('_ext')) project = 'TensorRT LLM' copyright = '2025, NVidia' @@ -43,6 +44,13 @@ templates_path = ['_templates'] exclude_patterns = ['performance/performance-tuning-guide/introduction.md'] +SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__)) +CPP_XML_INDEX = os.path.abspath( + os.path.join(SCRIPT_DIR, "..", "cpp_docs", "xml", "index.xml")) +HAS_CPP_XML = os.path.exists(CPP_XML_INDEX) +if not HAS_CPP_XML: + exclude_patterns.append('_cpp_gen/**') + extensions = [ 'sphinx.ext.duration', 'sphinx.ext.autodoc', @@ -51,7 +59,6 @@ 'sphinx.ext.napoleon', 'sphinx.ext.mathjax', 'myst_parser', # for markdown support - "breathe", 'sphinx.ext.todo', 'sphinx.ext.autosectionlabel', 'sphinxarg.ext', @@ -59,8 +66,12 @@ 'sphinx_copybutton', 'sphinxcontrib.autodoc_pydantic', 'sphinx_togglebutton', + 'trtllm_config_selector', ] +if HAS_CPP_XML: + extensions.append("breathe") + autodoc_member_order = 'bysource' autodoc_pydantic_model_show_json = True autodoc_pydantic_model_show_config_summary = True @@ -140,12 +151,11 @@ ] } -# ------------------------ C++ Doc related -------------------------- -# Breathe configuration -breathe_default_project = "TensorRT-LLM" -breathe_projects = {"TensorRT-LLM": "../cpp_docs/xml"} - -SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__)) +if HAS_CPP_XML: 
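+    # HAS_CPP_XML checks for the Doxygen XML index above, so Breathe is configured only when that output actually exists.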
+ breathe_default_project = "TensorRT-LLM" + breathe_projects = {"TensorRT-LLM": "../cpp_docs/xml"} +else: + breathe_projects = {} CPP_INCLUDE_DIR = os.path.join(SCRIPT_DIR, '../../cpp/include/tensorrt_llm') CPP_GEN_DIR = os.path.join(SCRIPT_DIR, '_cpp_gen') @@ -206,10 +216,11 @@ def gen_cpp_doc(ofile_name: str, header_dir: str, summary: str): .. It is also doable to automatically generate this file and list all the modules in the conf.py """.strip() -# compile cpp doc -subprocess.run(['mkdir', '-p', CPP_GEN_DIR]) -gen_cpp_doc(CPP_GEN_DIR + '/runtime.rst', CPP_INCLUDE_DIR + '/runtime', - runtime_summary) +if HAS_CPP_XML: + # compile cpp doc + subprocess.run(['mkdir', '-p', CPP_GEN_DIR]) + gen_cpp_doc(CPP_GEN_DIR + '/runtime.rst', CPP_INCLUDE_DIR + '/runtime', + runtime_summary) executor_summary = f""" Executor @@ -220,6 +231,7 @@ def gen_cpp_doc(ofile_name: str, header_dir: str, summary: str): .. It is also doable to automatically generate this file and list all the modules in the conf.py """.strip() -subprocess.run(['mkdir', '-p', CPP_GEN_DIR]) -gen_cpp_doc(CPP_GEN_DIR + '/executor.rst', CPP_INCLUDE_DIR + '/executor', - executor_summary) +if HAS_CPP_XML: + subprocess.run(['mkdir', '-p', CPP_GEN_DIR]) + gen_cpp_doc(CPP_GEN_DIR + '/executor.rst', CPP_INCLUDE_DIR + '/executor', + executor_summary) diff --git a/docs/source/deployment-guide/config_table.rst b/docs/source/deployment-guide/config_table.rst index d28fed25a8e..3747ed7ab55 100644 --- a/docs/source/deployment-guide/config_table.rst +++ b/docs/source/deployment-guide/config_table.rst @@ -1,13 +1,15 @@ +.. start-config-table-note .. include:: note_sections.rst :start-after: .. start-note-traffic-patterns :end-before: .. end-note-traffic-patterns +.. end-config-table-note .. start-deepseek-ai/DeepSeek-R1-0528 .. _deepseek-ai/DeepSeek-R1-0528: `DeepSeek-R1 `_ -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. list-table:: :width: 100% @@ -148,7 +150,7 @@ .. _nvidia/DeepSeek-R1-0528-FP4-v2: `DeepSeek-R1 (NVFP4) `_ -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. 
list-table:: :width: 100% @@ -167,162 +169,162 @@ - 4 - `1k1k_tp4_conc4.yaml `_ - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc4.yaml`` - * - 8xB200_NVL - - Low Latency - - 1024 / 1024 - - 4 - - `1k1k_tp8_conc4.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc4.yaml`` * - 4xB200_NVL - Low Latency - 1024 / 1024 - 8 - `1k1k_tp4_conc8.yaml `_ - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc8.yaml`` - * - 8xB200_NVL - - Low Latency - - 1024 / 1024 - - 8 - - `1k1k_tp8_conc8.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc8.yaml`` * - 4xB200_NVL - Low Latency - 1024 / 1024 - 16 - `1k1k_tp4_conc16.yaml `_ - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc16.yaml`` - * - 8xB200_NVL - - Low Latency - - 1024 / 1024 - - 16 - - `1k1k_tp8_conc16.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc16.yaml`` * - 4xB200_NVL - - Low Latency + - Balanced - 1024 / 1024 - 32 - `1k1k_tp4_conc32.yaml `_ - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc32.yaml`` - * - 8xB200_NVL - - High Throughput - - 1024 / 1024 - - 32 - - `1k1k_tp8_conc32.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc32.yaml`` * - 4xB200_NVL - High Throughput - 1024 / 1024 - 64 - `1k1k_tp4_conc64.yaml `_ - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc64.yaml`` - * - 8xB200_NVL - - High Throughput - - 1024 / 1024 - - 64 - - `1k1k_tp8_conc64.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc64.yaml`` * - 4xB200_NVL - High Throughput - 1024 / 1024 - 128 - `1k1k_tp4_conc128.yaml `_ - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc128.yaml`` - * - 8xB200_NVL - - High Throughput - - 1024 / 1024 - - 128 - - `1k1k_tp8_conc128.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc128.yaml`` * - 4xB200_NVL - - High Throughput + - Max Throughput - 1024 / 1024 - 256 - `1k1k_tp4_conc256.yaml `_ - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc256.yaml`` - * - 8xB200_NVL - - Max Throughput - - 1024 / 1024 - - 256 - - `1k1k_tp8_conc256.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options 
${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc256.yaml`` * - 4xB200_NVL - Min Latency - 8192 / 1024 - 4 - `8k1k_tp4_conc4.yaml `_ - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc4.yaml`` - * - 8xB200_NVL - - Low Latency - - 8192 / 1024 - - 4 - - `8k1k_tp8_conc4.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc4.yaml`` * - 4xB200_NVL - Low Latency - 8192 / 1024 - 8 - `8k1k_tp4_conc8.yaml `_ - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc8.yaml`` - * - 8xB200_NVL - - Low Latency - - 8192 / 1024 - - 8 - - `8k1k_tp8_conc8.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc8.yaml`` * - 4xB200_NVL - Low Latency - 8192 / 1024 - 16 - `8k1k_tp4_conc16.yaml `_ - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc16.yaml`` - * - 8xB200_NVL - - Low Latency - - 8192 / 1024 - - 16 - - `8k1k_tp8_conc16.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc16.yaml`` * - 4xB200_NVL - - Low Latency + - Balanced - 8192 / 1024 - 32 - `8k1k_tp4_conc32.yaml `_ - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc32.yaml`` - * - 8xB200_NVL - - High Throughput - - 8192 / 1024 - - 32 - - `8k1k_tp8_conc32.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc32.yaml`` * - 4xB200_NVL - High Throughput - 8192 / 1024 - 64 - `8k1k_tp4_conc64.yaml `_ - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc64.yaml`` - * - 8xB200_NVL - - High Throughput - - 8192 / 1024 - - 64 - - `8k1k_tp8_conc64.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc64.yaml`` * - 4xB200_NVL - High Throughput - 8192 / 1024 - 128 - `8k1k_tp4_conc128.yaml `_ - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc128.yaml`` + * - 4xB200_NVL + - Max Throughput + - 8192 / 1024 + - 256 + - `8k1k_tp4_conc256.yaml `_ + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc256.yaml`` + * - 8xB200_NVL + - Min Latency + - 1024 / 1024 + - 4 + - `1k1k_tp8_conc4.yaml `_ + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc4.yaml`` + * - 8xB200_NVL + - Low Latency + - 1024 / 1024 + - 8 + - `1k1k_tp8_conc8.yaml `_ + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 
--extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc8.yaml`` + * - 8xB200_NVL + - Low Latency + - 1024 / 1024 + - 16 + - `1k1k_tp8_conc16.yaml `_ + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc16.yaml`` + * - 8xB200_NVL + - Balanced + - 1024 / 1024 + - 32 + - `1k1k_tp8_conc32.yaml `_ + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc32.yaml`` + * - 8xB200_NVL + - High Throughput + - 1024 / 1024 + - 64 + - `1k1k_tp8_conc64.yaml `_ + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc64.yaml`` + * - 8xB200_NVL + - High Throughput + - 1024 / 1024 + - 128 + - `1k1k_tp8_conc128.yaml `_ + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc128.yaml`` + * - 8xB200_NVL + - Max Throughput + - 1024 / 1024 + - 256 + - `1k1k_tp8_conc256.yaml `_ + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc256.yaml`` + * - 8xB200_NVL + - Min Latency + - 8192 / 1024 + - 4 + - `8k1k_tp8_conc4.yaml `_ + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc4.yaml`` + * - 8xB200_NVL + - Low Latency + - 8192 / 1024 + - 8 + - `8k1k_tp8_conc8.yaml `_ + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc8.yaml`` + * - 8xB200_NVL + - Low Latency + - 8192 / 1024 + - 16 + - `8k1k_tp8_conc16.yaml `_ + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc16.yaml`` + * - 8xB200_NVL + - Balanced + - 8192 / 1024 + - 32 + - `8k1k_tp8_conc32.yaml `_ + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc32.yaml`` + * - 8xB200_NVL + - High Throughput + - 8192 / 1024 + - 64 + - `8k1k_tp8_conc64.yaml `_ + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc64.yaml`` * - 8xB200_NVL - High Throughput - 8192 / 1024 - 128 - `8k1k_tp8_conc128.yaml `_ - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc128.yaml`` - * - 4xB200_NVL - - High Throughput - - 8192 / 1024 - - 256 - - `8k1k_tp4_conc256.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc256.yaml`` * - 8xB200_NVL - Max Throughput - 8192 / 1024 @@ -337,7 +339,7 @@ .. _openai/gpt-oss-120b: `gpt-oss-120b `_ -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. 
list-table:: :width: 100% @@ -356,714 +358,714 @@ - 4 - `1k1k_tp1_conc4.yaml `_ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc4.yaml`` - * - 2xB200_NVL - - Low Latency - - 1024 / 1024 - - 4 - - `1k1k_tp2_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc4.yaml`` - * - 4xB200_NVL - - Low Latency - - 1024 / 1024 - - 4 - - `1k1k_tp4_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc4.yaml`` - * - 8xB200_NVL - - Low Latency - - 1024 / 1024 - - 4 - - `1k1k_tp8_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc4.yaml`` * - B200_NVL - Low Latency - 1024 / 1024 - 8 - `1k1k_tp1_conc8.yaml `_ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc8.yaml`` - * - 2xB200_NVL - - Low Latency - - 1024 / 1024 - - 8 - - `1k1k_tp2_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc8.yaml`` - * - 4xB200_NVL - - Low Latency - - 1024 / 1024 - - 8 - - `1k1k_tp4_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc8.yaml`` - * - 8xB200_NVL - - Low Latency - - 1024 / 1024 - - 8 - - `1k1k_tp8_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc8.yaml`` * - B200_NVL - - Low Latency + - Balanced - 1024 / 1024 - 16 - `1k1k_tp1_conc16.yaml `_ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc16.yaml`` - * - 2xB200_NVL - - Low Latency - - 1024 / 1024 - - 16 - - `1k1k_tp2_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc16.yaml`` - * - 4xB200_NVL - - High Throughput - - 1024 / 1024 - - 16 - - `1k1k_tp4_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc16.yaml`` - * - 8xB200_NVL - - High Throughput - - 1024 / 1024 - - 16 - - `1k1k_tp8_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc16.yaml`` * - B200_NVL - High Throughput - 1024 / 1024 - 32 - `1k1k_tp1_conc32.yaml `_ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc32.yaml`` - * - 2xB200_NVL - - High Throughput - - 1024 / 1024 - - 32 - - `1k1k_tp2_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc32.yaml`` - * - 4xB200_NVL - - High Throughput - - 1024 / 1024 - - 32 - - `1k1k_tp4_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc32.yaml`` - * - 8xB200_NVL - - High Throughput - - 1024 / 
1024 - - 32 - - `1k1k_tp8_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc32.yaml`` * - B200_NVL - - High Throughput + - Max Throughput - 1024 / 1024 - 64 - `1k1k_tp1_conc64.yaml `_ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc64.yaml`` - * - 2xB200_NVL - - High Throughput - - 1024 / 1024 - - 64 - - `1k1k_tp2_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc64.yaml`` - * - 4xB200_NVL - - High Throughput - - 1024 / 1024 - - 64 - - `1k1k_tp4_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc64.yaml`` - * - 8xB200_NVL - - Max Throughput - - 1024 / 1024 - - 64 - - `1k1k_tp8_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc64.yaml`` * - B200_NVL - Min Latency - 1024 / 8192 - 4 - `1k8k_tp1_conc4.yaml `_ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc4.yaml`` - * - 2xB200_NVL - - Low Latency - - 1024 / 8192 - - 4 - - `1k8k_tp2_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc4.yaml`` - * - 4xB200_NVL - - Low Latency - - 1024 / 8192 - - 4 - - `1k8k_tp4_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc4.yaml`` - * - 8xB200_NVL - - Low Latency - - 1024 / 8192 - - 4 - - `1k8k_tp8_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc4.yaml`` * - B200_NVL - Low Latency - 1024 / 8192 - 8 - `1k8k_tp1_conc8.yaml `_ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc8.yaml`` - * - 2xB200_NVL - - Low Latency - - 1024 / 8192 - - 8 - - `1k8k_tp2_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc8.yaml`` - * - 4xB200_NVL - - Low Latency - - 1024 / 8192 - - 8 - - `1k8k_tp4_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc8.yaml`` - * - 8xB200_NVL - - Low Latency - - 1024 / 8192 - - 8 - - `1k8k_tp8_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc8.yaml`` * - B200_NVL - - Low Latency + - Balanced - 1024 / 8192 - 16 - `1k8k_tp1_conc16.yaml `_ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc16.yaml`` - * - 2xB200_NVL - - Low Latency - - 1024 / 8192 - - 16 - - `1k8k_tp2_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc16.yaml`` - * - 4xB200_NVL - - High Throughput - - 1024 / 8192 - - 16 - - `1k8k_tp4_conc16.yaml 
`_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc16.yaml`` - * - 8xB200_NVL - - High Throughput - - 1024 / 8192 - - 16 - - `1k8k_tp8_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc16.yaml`` * - B200_NVL - High Throughput - 1024 / 8192 - 32 - `1k8k_tp1_conc32.yaml `_ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc32.yaml`` - * - 2xB200_NVL - - High Throughput - - 1024 / 8192 - - 32 - - `1k8k_tp2_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc32.yaml`` - * - 4xB200_NVL - - High Throughput - - 1024 / 8192 - - 32 - - `1k8k_tp4_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc32.yaml`` - * - 8xB200_NVL - - High Throughput - - 1024 / 8192 - - 32 - - `1k8k_tp8_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc32.yaml`` * - B200_NVL - - High Throughput + - Max Throughput - 1024 / 8192 - 64 - `1k8k_tp1_conc64.yaml `_ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc64.yaml`` - * - 2xB200_NVL - - High Throughput - - 1024 / 8192 - - 64 - - `1k8k_tp2_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc64.yaml`` - * - 4xB200_NVL - - High Throughput - - 1024 / 8192 - - 64 - - `1k8k_tp4_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc64.yaml`` - * - 8xB200_NVL - - Max Throughput - - 1024 / 8192 - - 64 - - `1k8k_tp8_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc64.yaml`` * - B200_NVL - Min Latency - 8192 / 1024 - 4 - `8k1k_tp1_conc4.yaml `_ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc4.yaml`` - * - 2xB200_NVL - - Low Latency - - 8192 / 1024 - - 4 - - `8k1k_tp2_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc4.yaml`` - * - 4xB200_NVL - - Low Latency - - 8192 / 1024 - - 4 - - `8k1k_tp4_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc4.yaml`` - * - 8xB200_NVL - - Low Latency - - 8192 / 1024 - - 4 - - `8k1k_tp8_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc4.yaml`` * - B200_NVL - Low Latency - 8192 / 1024 - 8 - `8k1k_tp1_conc8.yaml `_ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc8.yaml`` - * - 2xB200_NVL - - Low Latency - - 8192 / 1024 - - 8 - - `8k1k_tp2_conc8.yaml `_ - - ``trtllm-serve 
openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc8.yaml`` - * - 4xB200_NVL - - Low Latency - - 8192 / 1024 - - 8 - - `8k1k_tp4_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc8.yaml`` - * - 8xB200_NVL - - Low Latency - - 8192 / 1024 - - 8 - - `8k1k_tp8_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc8.yaml`` * - B200_NVL - - Low Latency + - Balanced - 8192 / 1024 - 16 - `8k1k_tp1_conc16.yaml `_ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc16.yaml`` - * - 2xB200_NVL - - Low Latency - - 8192 / 1024 - - 16 - - `8k1k_tp2_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc16.yaml`` - * - 4xB200_NVL - - High Throughput - - 8192 / 1024 - - 16 - - `8k1k_tp4_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc16.yaml`` - * - 8xB200_NVL - - High Throughput - - 8192 / 1024 - - 16 - - `8k1k_tp8_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc16.yaml`` * - B200_NVL - High Throughput - 8192 / 1024 - 32 - `8k1k_tp1_conc32.yaml `_ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc32.yaml`` - * - 2xB200_NVL - - High Throughput - - 8192 / 1024 - - 32 - - `8k1k_tp2_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc32.yaml`` - * - 4xB200_NVL - - High Throughput - - 8192 / 1024 - - 32 - - `8k1k_tp4_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc32.yaml`` - * - 8xB200_NVL - - High Throughput - - 8192 / 1024 - - 32 - - `8k1k_tp8_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc32.yaml`` * - B200_NVL - - High Throughput + - Max Throughput - 8192 / 1024 - 64 - `8k1k_tp1_conc64.yaml `_ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc64.yaml`` * - 2xB200_NVL - - High Throughput - - 8192 / 1024 - - 64 - - `8k1k_tp2_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc64.yaml`` - * - 4xB200_NVL - - High Throughput - - 8192 / 1024 - - 64 - - `8k1k_tp4_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc64.yaml`` - * - 8xB200_NVL - - Max Throughput - - 8192 / 1024 - - 64 - - `8k1k_tp8_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc64.yaml`` - * - H200_SXM - Min Latency - 1024 / 1024 - 4 - - `1k1k_tp1_conc4.yaml `_ - - ``trtllm-serve 
openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc4.yaml`` - * - 2xH200_SXM + - `1k1k_tp2_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc4.yaml`` + * - 2xB200_NVL - Low Latency - 1024 / 1024 - - 4 - - `1k1k_tp2_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc4.yaml`` - * - 4xH200_SXM - - Low Latency + - 8 + - `1k1k_tp2_conc8.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc8.yaml`` + * - 2xB200_NVL + - Balanced - 1024 / 1024 - - 4 - - `1k1k_tp4_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc4.yaml`` - * - 8xH200_SXM - - Low Latency + - 16 + - `1k1k_tp2_conc16.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc16.yaml`` + * - 2xB200_NVL + - High Throughput - 1024 / 1024 - - 4 - - `1k1k_tp8_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc4.yaml`` - * - H200_SXM - - Low Latency + - 32 + - `1k1k_tp2_conc32.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc32.yaml`` + * - 2xB200_NVL + - Max Throughput - 1024 / 1024 - - 8 - - `1k1k_tp1_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc8.yaml`` - * - 2xH200_SXM + - 64 + - `1k1k_tp2_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc64.yaml`` + * - 2xB200_NVL + - Min Latency + - 1024 / 8192 + - 4 + - `1k8k_tp2_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc4.yaml`` + * - 2xB200_NVL - Low Latency - - 1024 / 1024 + - 1024 / 8192 - 8 - - `1k1k_tp2_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc8.yaml`` - * - 4xH200_SXM - - Low Latency + - `1k8k_tp2_conc8.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc8.yaml`` + * - 2xB200_NVL + - Balanced + - 1024 / 8192 + - 16 + - `1k8k_tp2_conc16.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc16.yaml`` + * - 2xB200_NVL + - High Throughput + - 1024 / 8192 + - 32 + - `1k8k_tp2_conc32.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc32.yaml`` + * - 2xB200_NVL + - Max Throughput + - 1024 / 8192 + - 64 + - `1k8k_tp2_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc64.yaml`` + * - 2xB200_NVL + - Min Latency + - 8192 
/ 1024 + - 4 + - `8k1k_tp2_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc4.yaml`` + * - 2xB200_NVL + - Low Latency + - 8192 / 1024 + - 8 + - `8k1k_tp2_conc8.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc8.yaml`` + * - 2xB200_NVL + - Balanced + - 8192 / 1024 + - 16 + - `8k1k_tp2_conc16.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc16.yaml`` + * - 2xB200_NVL + - High Throughput + - 8192 / 1024 + - 32 + - `8k1k_tp2_conc32.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc32.yaml`` + * - 2xB200_NVL + - Max Throughput + - 8192 / 1024 + - 64 + - `8k1k_tp2_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc64.yaml`` + * - 4xB200_NVL + - Min Latency + - 1024 / 1024 + - 4 + - `1k1k_tp4_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc4.yaml`` + * - 4xB200_NVL + - Low Latency + - 1024 / 1024 + - 8 + - `1k1k_tp4_conc8.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc8.yaml`` + * - 4xB200_NVL + - Balanced + - 1024 / 1024 + - 16 + - `1k1k_tp4_conc16.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc16.yaml`` + * - 4xB200_NVL + - High Throughput + - 1024 / 1024 + - 32 + - `1k1k_tp4_conc32.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc32.yaml`` + * - 4xB200_NVL + - Max Throughput + - 1024 / 1024 + - 64 + - `1k1k_tp4_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc64.yaml`` + * - 4xB200_NVL + - Min Latency + - 1024 / 8192 + - 4 + - `1k8k_tp4_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc4.yaml`` + * - 4xB200_NVL + - Low Latency + - 1024 / 8192 + - 8 + - `1k8k_tp4_conc8.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc8.yaml`` + * - 4xB200_NVL + - Balanced + - 1024 / 8192 + - 16 + - `1k8k_tp4_conc16.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc16.yaml`` + * - 4xB200_NVL + - High Throughput + - 1024 / 8192 + - 32 + - `1k8k_tp4_conc32.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc32.yaml`` + * - 4xB200_NVL + - Max Throughput + - 1024 / 8192 + - 64 + - `1k8k_tp4_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc64.yaml`` + * - 4xB200_NVL + - Min Latency + - 8192 / 1024 + - 4 + - 
`8k1k_tp4_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc4.yaml`` + * - 4xB200_NVL + - Low Latency + - 8192 / 1024 + - 8 + - `8k1k_tp4_conc8.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc8.yaml`` + * - 4xB200_NVL + - Balanced + - 8192 / 1024 + - 16 + - `8k1k_tp4_conc16.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc16.yaml`` + * - 4xB200_NVL + - High Throughput + - 8192 / 1024 + - 32 + - `8k1k_tp4_conc32.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc32.yaml`` + * - 4xB200_NVL + - Max Throughput + - 8192 / 1024 + - 64 + - `8k1k_tp4_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc64.yaml`` + * - 8xB200_NVL + - Min Latency + - 1024 / 1024 + - 4 + - `1k1k_tp8_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc4.yaml`` + * - 8xB200_NVL + - Low Latency + - 1024 / 1024 + - 8 + - `1k1k_tp8_conc8.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc8.yaml`` + * - 8xB200_NVL + - Balanced + - 1024 / 1024 + - 16 + - `1k1k_tp8_conc16.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc16.yaml`` + * - 8xB200_NVL + - High Throughput + - 1024 / 1024 + - 32 + - `1k1k_tp8_conc32.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc32.yaml`` + * - 8xB200_NVL + - Max Throughput + - 1024 / 1024 + - 64 + - `1k1k_tp8_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc64.yaml`` + * - 8xB200_NVL + - Min Latency + - 1024 / 8192 + - 4 + - `1k8k_tp8_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc4.yaml`` + * - 8xB200_NVL + - Low Latency + - 1024 / 8192 + - 8 + - `1k8k_tp8_conc8.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc8.yaml`` + * - 8xB200_NVL + - Balanced + - 1024 / 8192 + - 16 + - `1k8k_tp8_conc16.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc16.yaml`` + * - 8xB200_NVL + - High Throughput + - 1024 / 8192 + - 32 + - `1k8k_tp8_conc32.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc32.yaml`` + * - 8xB200_NVL + - Max Throughput + - 1024 / 8192 + - 64 + - `1k8k_tp8_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc64.yaml`` + * - 8xB200_NVL + - Min Latency + - 8192 / 1024 + - 4 + - `8k1k_tp8_conc4.yaml 
`_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc4.yaml`` + * - 8xB200_NVL + - Low Latency + - 8192 / 1024 + - 8 + - `8k1k_tp8_conc8.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc8.yaml`` + * - 8xB200_NVL + - Balanced + - 8192 / 1024 + - 16 + - `8k1k_tp8_conc16.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc16.yaml`` + * - 8xB200_NVL + - High Throughput + - 8192 / 1024 + - 32 + - `8k1k_tp8_conc32.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc32.yaml`` + * - 8xB200_NVL + - Max Throughput + - 8192 / 1024 + - 64 + - `8k1k_tp8_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc64.yaml`` + * - H200_SXM + - Min Latency + - 1024 / 1024 + - 4 + - `1k1k_tp1_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc4.yaml`` + * - H200_SXM + - Low Latency + - 1024 / 1024 + - 8 + - `1k1k_tp1_conc8.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc8.yaml`` + * - H200_SXM + - Balanced + - 1024 / 1024 + - 16 + - `1k1k_tp1_conc16.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc16.yaml`` + * - H200_SXM + - High Throughput + - 1024 / 1024 + - 32 + - `1k1k_tp1_conc32.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc32.yaml`` + * - H200_SXM + - Max Throughput + - 1024 / 1024 + - 64 + - `1k1k_tp1_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc64.yaml`` + * - H200_SXM + - Min Latency + - 1024 / 8192 + - 4 + - `1k8k_tp1_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc4.yaml`` + * - H200_SXM + - Low Latency + - 1024 / 8192 + - 8 + - `1k8k_tp1_conc8.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc8.yaml`` + * - H200_SXM + - Balanced + - 1024 / 8192 + - 16 + - `1k8k_tp1_conc16.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc16.yaml`` + * - H200_SXM + - High Throughput + - 1024 / 8192 + - 32 + - `1k8k_tp1_conc32.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc32.yaml`` + * - H200_SXM + - Max Throughput + - 1024 / 8192 + - 64 + - `1k8k_tp1_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc64.yaml`` + * - H200_SXM + - Min Latency + - 8192 / 1024 + - 4 + - `8k1k_tp1_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b 
--extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc4.yaml`` + * - H200_SXM + - Low Latency + - 8192 / 1024 + - 8 + - `8k1k_tp1_conc8.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc8.yaml`` + * - H200_SXM + - Balanced + - 8192 / 1024 + - 16 + - `8k1k_tp1_conc16.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc16.yaml`` + * - H200_SXM + - High Throughput + - 8192 / 1024 + - 32 + - `8k1k_tp1_conc32.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc32.yaml`` + * - H200_SXM + - Max Throughput + - 8192 / 1024 + - 64 + - `8k1k_tp1_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc64.yaml`` + * - 2xH200_SXM + - Min Latency + - 1024 / 1024 + - 4 + - `1k1k_tp2_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc4.yaml`` + * - 2xH200_SXM + - Low Latency + - 1024 / 1024 + - 8 + - `1k1k_tp2_conc8.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc8.yaml`` + * - 2xH200_SXM + - Balanced + - 1024 / 1024 + - 16 + - `1k1k_tp2_conc16.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc16.yaml`` + * - 2xH200_SXM + - High Throughput + - 1024 / 1024 + - 32 + - `1k1k_tp2_conc32.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc32.yaml`` + * - 2xH200_SXM + - Max Throughput + - 1024 / 1024 + - 64 + - `1k1k_tp2_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc64.yaml`` + * - 2xH200_SXM + - Min Latency + - 1024 / 8192 + - 4 + - `1k8k_tp2_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc4.yaml`` + * - 2xH200_SXM + - Low Latency + - 1024 / 8192 + - 8 + - `1k8k_tp2_conc8.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc8.yaml`` + * - 2xH200_SXM + - Balanced + - 1024 / 8192 + - 16 + - `1k8k_tp2_conc16.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc16.yaml`` + * - 2xH200_SXM + - High Throughput + - 1024 / 8192 + - 32 + - `1k8k_tp2_conc32.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc32.yaml`` + * - 2xH200_SXM + - Max Throughput + - 1024 / 8192 + - 64 + - `1k8k_tp2_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc64.yaml`` + * - 2xH200_SXM + - Min Latency + - 8192 / 1024 + - 4 + - `8k1k_tp2_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options 
${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc4.yaml`` + * - 2xH200_SXM + - Low Latency + - 8192 / 1024 + - 8 + - `8k1k_tp2_conc8.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc8.yaml`` + * - 2xH200_SXM + - Balanced + - 8192 / 1024 + - 16 + - `8k1k_tp2_conc16.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc16.yaml`` + * - 2xH200_SXM + - High Throughput + - 8192 / 1024 + - 32 + - `8k1k_tp2_conc32.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc32.yaml`` + * - 2xH200_SXM + - Max Throughput + - 8192 / 1024 + - 64 + - `8k1k_tp2_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc64.yaml`` + * - 4xH200_SXM + - Min Latency + - 1024 / 1024 + - 4 + - `1k1k_tp4_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc4.yaml`` + * - 4xH200_SXM + - Low Latency + - 1024 / 1024 + - 8 + - `1k1k_tp4_conc8.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc8.yaml`` + * - 4xH200_SXM + - Balanced + - 1024 / 1024 + - 16 + - `1k1k_tp4_conc16.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc16.yaml`` + * - 4xH200_SXM + - High Throughput + - 1024 / 1024 + - 32 + - `1k1k_tp4_conc32.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc32.yaml`` + * - 4xH200_SXM + - Max Throughput + - 1024 / 1024 + - 64 + - `1k1k_tp4_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc64.yaml`` + * - 4xH200_SXM + - Min Latency + - 1024 / 8192 + - 4 + - `1k8k_tp4_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc4.yaml`` + * - 4xH200_SXM + - Low Latency + - 1024 / 8192 + - 8 + - `1k8k_tp4_conc8.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc8.yaml`` + * - 4xH200_SXM + - Balanced + - 1024 / 8192 + - 16 + - `1k8k_tp4_conc16.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc16.yaml`` + * - 4xH200_SXM + - High Throughput + - 1024 / 8192 + - 32 + - `1k8k_tp4_conc32.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc32.yaml`` + * - 4xH200_SXM + - Max Throughput + - 1024 / 8192 + - 64 + - `1k8k_tp4_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc64.yaml`` + * - 4xH200_SXM + - Min Latency + - 8192 / 1024 + - 4 + - `8k1k_tp4_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options 
${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc4.yaml``
+   * - 4xH200_SXM
+     - Low Latency
+     - 8192 / 1024
+     - 8
+     - `8k1k_tp4_conc8.yaml <https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc8.yaml>`_
+     - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc8.yaml``
+   * - 4xH200_SXM
+     - Balanced
+     - 8192 / 1024
+     - 16
+     - `8k1k_tp4_conc16.yaml <https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc16.yaml>`_
+     - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc16.yaml``
+   * - 4xH200_SXM
+     - High Throughput
+     - 8192 / 1024
+     - 32
+     - `8k1k_tp4_conc32.yaml <https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc32.yaml>`_
+     - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc32.yaml``
+   * - 4xH200_SXM
+     - Max Throughput
+     - 8192 / 1024
+     - 64
+     - `8k1k_tp4_conc64.yaml <https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc64.yaml>`_
+     - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc64.yaml``
+   * - 8xH200_SXM
+     - Min Latency
      - 1024 / 1024
-     - 8
-     - `1k1k_tp4_conc8.yaml <https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc8.yaml>`_
-     - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc8.yaml``
+     - 4
+     - `1k1k_tp8_conc4.yaml <https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc4.yaml>`_
+     - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc4.yaml``
    * - 8xH200_SXM
      - Low Latency
      - 1024 / 1024
      - 8
      - `1k1k_tp8_conc8.yaml <https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc8.yaml>`_
      - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc8.yaml``
-   * - H200_SXM
-     - Low Latency
-     - 1024 / 1024
-     - 16
-     - `1k1k_tp1_conc16.yaml <https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc16.yaml>`_
-     - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc16.yaml``
-   * - 2xH200_SXM
-     - Low Latency
-     - 1024 / 1024
-     - 16
-     - `1k1k_tp2_conc16.yaml <https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc16.yaml>`_
-     - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc16.yaml``
-   * - 4xH200_SXM
-     - High Throughput
-     - 1024 / 1024
-     - 16
-     - `1k1k_tp4_conc16.yaml <https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc16.yaml>`_
-     - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc16.yaml``
    * - 8xH200_SXM
-     - High Throughput
+     - Balanced
      - 1024 / 1024
      - 16
      - `1k1k_tp8_conc16.yaml <https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc16.yaml>`_
      - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc16.yaml``
-   * - H200_SXM
-     - High Throughput
-     - 1024 / 1024
-     - 32
-     - `1k1k_tp1_conc32.yaml <https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc32.yaml>`_
-     - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc32.yaml``
-   * - 2xH200_SXM
-     - High Throughput
-     - 1024 / 1024
-     - 32
-     - `1k1k_tp2_conc32.yaml <https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc32.yaml>`_
-     - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc32.yaml``
-   * - 4xH200_SXM
-     - High Throughput
-     - 1024 / 1024
-     - 32
-     - `1k1k_tp4_conc32.yaml <https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc32.yaml>`_
-     - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc32.yaml``
    * - 8xH200_SXM
      - High Throughput
      - 1024 / 1024
      - 32
      - `1k1k_tp8_conc32.yaml <https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc32.yaml>`_
      - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc32.yaml``
-   * - H200_SXM
-     - High Throughput
-     - 1024 / 1024
-     - 64
-     - `1k1k_tp1_conc64.yaml <https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc64.yaml>`_
-     - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc64.yaml``
-   * - 2xH200_SXM
-     - High Throughput
-     - 1024 / 1024
-     - 64
-     - `1k1k_tp2_conc64.yaml <https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc64.yaml>`_
-     - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc64.yaml``
-   * - 4xH200_SXM
-     - High Throughput
-     - 1024 / 1024
-     - 64
-     - `1k1k_tp4_conc64.yaml <https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc64.yaml>`_
-     - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc64.yaml``
    * - 8xH200_SXM
      - Max Throughput
      - 1024 / 1024
      - 64
      - `1k1k_tp8_conc64.yaml <https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc64.yaml>`_
      - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc64.yaml``
-   * - H200_SXM
-     - Min Latency
-     - 1024 / 8192
-     - 4
-     - `1k8k_tp1_conc4.yaml <https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc4.yaml>`_
-     - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc4.yaml``
-   * - 2xH200_SXM
-     - Low Latency
-     - 1024 / 8192
-     - 4
-     - `1k8k_tp2_conc4.yaml <https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc4.yaml>`_
-     - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc4.yaml``
-   * - 4xH200_SXM
-     - Low Latency
-     - 1024 / 8192
-     - 4
-     - `1k8k_tp4_conc4.yaml <https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc4.yaml>`_
-     - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc4.yaml``
    * - 8xH200_SXM
-     - Low Latency
+     - Min Latency
      - 1024 / 8192
      - 4
      - `1k8k_tp8_conc4.yaml <https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc4.yaml>`_
      - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc4.yaml``
-   * - H200_SXM
-     - Low Latency
-     - 1024 / 8192
-     - 8
-     - `1k8k_tp1_conc8.yaml <https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc8.yaml>`_
-     - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc8.yaml``
-   * - 2xH200_SXM
-     - Low Latency
-     - 1024 / 8192
-     - 8
-     - `1k8k_tp2_conc8.yaml <https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc8.yaml>`_
-     - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc8.yaml``
-   * - 4xH200_SXM
-     - Low Latency
-     - 1024 / 8192
-     - 8
-     - `1k8k_tp4_conc8.yaml <https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc8.yaml>`_
-     - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc8.yaml``
    * - 8xH200_SXM
      - Low Latency
      - 1024 / 8192
      - 8
      - `1k8k_tp8_conc8.yaml <https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc8.yaml>`_
      - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc8.yaml``
-   * - H200_SXM
-     - Low Latency
-     - 1024 / 8192
-     - 16
-     - `1k8k_tp1_conc16.yaml <https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc16.yaml>`_
-     - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc16.yaml``
-   * - 2xH200_SXM
-     - Low Latency
-     - 1024 / 8192
-     - 16
-     - `1k8k_tp2_conc16.yaml <https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc16.yaml>`_
-     - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc16.yaml``
-   * - 4xH200_SXM
-     - High Throughput
-     - 1024 / 8192
-     - 16
-     - `1k8k_tp4_conc16.yaml <https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc16.yaml>`_
-     - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc16.yaml``
    * - 8xH200_SXM
-     - High Throughput
+     - Balanced
      - 1024 / 8192
      - 16
      - `1k8k_tp8_conc16.yaml <https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc16.yaml>`_
      - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc16.yaml``
-   * - H200_SXM
-     - High Throughput
-     - 1024 / 8192
-     - 32
-     - `1k8k_tp1_conc32.yaml <https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc32.yaml>`_
-     - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc32.yaml``
-   * - 2xH200_SXM
-     - High Throughput
-     - 1024 / 8192
-     - 32
-     - `1k8k_tp2_conc32.yaml <https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc32.yaml>`_
-     - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc32.yaml``
-   * - 4xH200_SXM
-     - High Throughput
-     - 1024 / 8192
-     - 32
-     - `1k8k_tp4_conc32.yaml <https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc32.yaml>`_
-     - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc32.yaml``
    * - 8xH200_SXM
      - High Throughput
      - 1024 / 8192
      - 32
      - `1k8k_tp8_conc32.yaml <https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc32.yaml>`_
      - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc32.yaml``
-   * - H200_SXM
-     - High Throughput
-     - 1024 / 8192
-     - 64
-     - `1k8k_tp1_conc64.yaml <https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc64.yaml>`_
-     - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc64.yaml``
-   * - 2xH200_SXM
-     - High Throughput
-     - 1024 / 8192
-     - 64
-     - `1k8k_tp2_conc64.yaml <https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc64.yaml>`_
-     - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc64.yaml``
-   * - 4xH200_SXM
-     - High Throughput
-     - 1024 / 8192
-     - 64
-     - `1k8k_tp4_conc64.yaml <https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc64.yaml>`_
-     - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc64.yaml``
    * - 8xH200_SXM
      - Max Throughput
      - 1024 / 8192
      - 64
      - `1k8k_tp8_conc64.yaml <https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc64.yaml>`_
      - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc64.yaml``
-   * - H200_SXM
-     - Min Latency
-     - 8192 / 1024
-     - 4
-     - `8k1k_tp1_conc4.yaml <https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc4.yaml>`_
-     - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc4.yaml``
-   * - 2xH200_SXM
-     - Low Latency
-     - 8192 / 1024
-     - 4
-     - `8k1k_tp2_conc4.yaml <https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc4.yaml>`_
-     - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc4.yaml``
-   * - 4xH200_SXM
-     - Low Latency
-     - 8192 / 1024
-     - 4
-     - `8k1k_tp4_conc4.yaml <https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc4.yaml>`_
-     - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc4.yaml``
    * - 8xH200_SXM
-     - Low Latency
+     - Min Latency
      - 8192 / 1024
      - 4
      - `8k1k_tp8_conc4.yaml <https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc4.yaml>`_
      - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc4.yaml``
-   * - H200_SXM
-     - Low Latency
-     - 8192 / 1024
-     - 8
-     - `8k1k_tp1_conc8.yaml <https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc8.yaml>`_
-     - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc8.yaml``
-   * - 2xH200_SXM
-     - Low Latency
-     - 8192 / 1024
-     - 8
-     - `8k1k_tp2_conc8.yaml <https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc8.yaml>`_
-     - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc8.yaml``
-   * - 4xH200_SXM
-     - Low Latency
-     - 8192 / 1024
-     - 8
-     - `8k1k_tp4_conc8.yaml <https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc8.yaml>`_
-     - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc8.yaml``
    * - 8xH200_SXM
      - Low Latency
      - 8192 / 1024
      - 8
      - `8k1k_tp8_conc8.yaml <https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc8.yaml>`_
      - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc8.yaml``
-   * - H200_SXM
-     - Low Latency
-     - 8192 / 1024
-     - 16
-     - `8k1k_tp1_conc16.yaml <https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc16.yaml>`_
-     - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc16.yaml``
-   * - 2xH200_SXM
-     - Low Latency
-     - 8192 / 1024
-     - 16
-     - `8k1k_tp2_conc16.yaml <https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc16.yaml>`_
-     - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc16.yaml``
-   * - 4xH200_SXM
-     - High Throughput
-     - 8192 / 1024
-     - 16
-     - `8k1k_tp4_conc16.yaml <https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc16.yaml>`_
-     - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc16.yaml``
    * - 8xH200_SXM
-     - High Throughput
+     - Balanced
      - 8192 / 1024
      - 16
      - `8k1k_tp8_conc16.yaml <https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc16.yaml>`_
      - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc16.yaml``
-   * - H200_SXM
-     - High Throughput
-     - 8192 / 1024
-     - 32
-     - `8k1k_tp1_conc32.yaml <https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc32.yaml>`_
-     - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc32.yaml``
-   * - 2xH200_SXM
-     - High Throughput
-     - 8192 / 1024
-     - 32
-     - `8k1k_tp2_conc32.yaml <https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc32.yaml>`_
-     - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc32.yaml``
-   * - 4xH200_SXM
-     - High Throughput
-     - 8192 / 1024
-     - 32
-     - `8k1k_tp4_conc32.yaml <https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc32.yaml>`_
-     - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc32.yaml``
    * - 8xH200_SXM
      - High Throughput
      - 8192 / 1024
      - 32
      - `8k1k_tp8_conc32.yaml <https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc32.yaml>`_
      - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc32.yaml``
-   * - H200_SXM
-     - High Throughput
-     - 8192 / 1024
-     - 64
-     - `8k1k_tp1_conc64.yaml <https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc64.yaml>`_
-     - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc64.yaml``
-   * - 2xH200_SXM
-     - High Throughput
-     - 8192 / 1024
-     - 64
-     - `8k1k_tp2_conc64.yaml <https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc64.yaml>`_
-     - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc64.yaml``
-   * - 4xH200_SXM
-     - High Throughput
-     - 8192 / 1024
-     - 64
-     - `8k1k_tp4_conc64.yaml <https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc64.yaml>`_
-     - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc64.yaml``
    * - 8xH200_SXM
      - Max Throughput
      - 8192 / 1024
diff --git a/docs/source/deployment-guide/deployment-guide-for-deepseek-r1-on-trtllm.md b/docs/source/deployment-guide/deployment-guide-for-deepseek-r1-on-trtllm.md
index e4165eac09c..7e229e4f182 100644
--- a/docs/source/deployment-guide/deployment-guide-for-deepseek-r1-on-trtllm.md
+++ b/docs/source/deployment-guide/deployment-guide-for-deepseek-r1-on-trtllm.md
@@ -47,7 +47,7 @@ docker run --rm -it \
 -p 8000:8000 \
 -v ~/.cache:/root/.cache:rw \
 --name tensorrt_llm \
-nvcr.io/nvidia/tensorrt-llm/release:x.y.z \
+nvcr.io/nvidia/tensorrt-llm/release:1.2.0rc6 \
 /bin/bash
 ```
 
@@ -432,19 +432,28 @@ $$
 
 ## Preconfigured Recipes
 
-The following tables list recommended configurations from the comprehensive database for different performance profiles.
+The following sections help you pick a known-good `trtllm-serve` configuration for your target GPU and traffic pattern.
+
+### Recipe selector
+
+```{eval-rst}
+.. trtllm_config_selector::
+   :models: deepseek-ai/DeepSeek-R1-0528, nvidia/DeepSeek-R1-0528-FP4-v2
+```
 
 ```{eval-rst}
 .. include:: note_sections.rst
    :start-after: .. start-note-traffic-patterns
    :end-before: .. end-note-traffic-patterns
+```
 
+### Recipe database
+
+```{eval-rst}
 .. include:: config_table.rst
    :start-after: .. start-deepseek-ai/DeepSeek-R1-0528
    :end-before: .. end-deepseek-ai/DeepSeek-R1-0528
-```
 
-```{eval-rst}
 .. include:: config_table.rst
    :start-after: .. start-nvidia/DeepSeek-R1-0528-FP4-v2
    :end-before: .. end-nvidia/DeepSeek-R1-0528-FP4-v2
diff --git a/docs/source/deployment-guide/deployment-guide-for-gpt-oss-on-trtllm.md b/docs/source/deployment-guide/deployment-guide-for-gpt-oss-on-trtllm.md
index 5a9f9f4c726..3eca2fab14c 100644
--- a/docs/source/deployment-guide/deployment-guide-for-gpt-oss-on-trtllm.md
+++ b/docs/source/deployment-guide/deployment-guide-for-gpt-oss-on-trtllm.md
@@ -43,7 +43,7 @@ docker run --rm -it \
 -p 8000:8000 \
 -v ~/.cache:/root/.cache:rw \
 --name tensorrt_llm \
-nvcr.io/nvidia/tensorrt-llm/release:x.y.z \
+nvcr.io/nvidia/tensorrt-llm/release:1.2.0rc6 \
 /bin/bash
 ```
 
@@ -380,13 +380,24 @@ $$
 
 ## Preconfigured Recipes
 
-The following table lists recommended configurations from the comprehensive database for different performance profiles.
+The following sections help you pick a known-good `trtllm-serve` configuration for your target GPU and traffic pattern.
+
+### Recipe selector
+
+```{eval-rst}
+.. trtllm_config_selector::
+   :models: openai/gpt-oss-120b
+```
 
 ```{eval-rst}
 .. include:: note_sections.rst
    :start-after: .. start-note-traffic-patterns
   :end-before: .. end-note-traffic-patterns
+```
+
+### Recipe database
+```{eval-rst}
 .. include:: config_table.rst
    :start-after: .. start-openai/gpt-oss-120b
    :end-before: .. end-openai/gpt-oss-120b
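(Reviewer note: the "Recipe selector" sections added above are driven by the static database this patch generates at `docs/source/_static/config_db.json`. The sketch below is illustrative only — it is not the widget's actual JavaScript — but it shows the data contract a consumer can rely on, assuming the schema produced by `scripts/generate_config_table.py` further down in this diff and a checkout at the repo root:)

```python
import json
from pathlib import Path

# Mimics the kind of lookup the "Recipe selector" widget performs
# client-side. Field names follow the RecipeRow dataclass that
# scripts/generate_config_table.py serializes with asdict().
db = json.loads(Path("docs/source/_static/config_db.json").read_text())

matches = [
    e for e in db["entries"]
    if e["model"] == "openai/gpt-oss-120b"
    and e["gpu_display"] == "8xH200_SXM"
    and (e["isl"], e["osl"]) == (1024, 1024)
]
for e in sorted(matches, key=lambda e: e["concurrency"]):
    # Each entry carries a ready-to-run trtllm-serve command.
    print(e["performance_profile"], "->", e["command"])
```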
diff --git a/docs/source/deployment-guide/deployment-guide-for-llama3.3-70b-on-trtllm.md b/docs/source/deployment-guide/deployment-guide-for-llama3.3-70b-on-trtllm.md
index d3e328d810d..b45b7d2ffab 100644
--- a/docs/source/deployment-guide/deployment-guide-for-llama3.3-70b-on-trtllm.md
+++ b/docs/source/deployment-guide/deployment-guide-for-llama3.3-70b-on-trtllm.md
@@ -39,7 +39,7 @@ docker run --rm -it \
 -p 8000:8000 \
 -v ~/.cache:/root/.cache:rw \
 --name tensorrt_llm \
-nvcr.io/nvidia/tensorrt-llm/release:x.y.z \
+nvcr.io/nvidia/tensorrt-llm/release:1.2.0rc6 \
 /bin/bash
 ```
 
diff --git a/docs/source/deployment-guide/deployment-guide-for-llama4-scout-on-trtllm.md b/docs/source/deployment-guide/deployment-guide-for-llama4-scout-on-trtllm.md
index 7d69b7a8be7..3e70209b212 100644
--- a/docs/source/deployment-guide/deployment-guide-for-llama4-scout-on-trtllm.md
+++ b/docs/source/deployment-guide/deployment-guide-for-llama4-scout-on-trtllm.md
@@ -38,7 +38,7 @@ docker run --rm -it \
 -p 8000:8000 \
 -v ~/.cache:/root/.cache:rw \
 --name tensorrt_llm \
-nvcr.io/nvidia/tensorrt-llm/release:x.y.z \
+nvcr.io/nvidia/tensorrt-llm/release:1.2.0rc6 \
 /bin/bash
 ```
 
diff --git a/docs/source/deployment-guide/index.rst b/docs/source/deployment-guide/index.rst
index 644a9d9ae95..e0f508745c3 100644
--- a/docs/source/deployment-guide/index.rst
+++ b/docs/source/deployment-guide/index.rst
@@ -100,9 +100,26 @@ The deployment guides below provide more detailed instructions for serving speci
    deployment-guide-for-qwen3-next-on-trtllm.md
    deployment-guide-for-kimi-k2-thinking-on-trtllm.md
 
-Comprehensive Configuration Database
-------------------------------------
+Preconfigured Recipes
+---------------------
+
+.. _recipe-selector:
+
+Recipe selector
+^^^^^^^^^^^^^^^
+
+.. trtllm_config_selector::
+
+.. include:: note_sections.rst
+   :start-after: .. start-note-traffic-patterns
+   :end-before: .. end-note-traffic-patterns
+
+.. _recipe-database:
+
+Recipe database
+^^^^^^^^^^^^^^^
 
 The table below lists all available pre-configured model scenarios in the TensorRT LLM configuration database. Each row represents a specific model, GPU, and performance profile combination with recommended request settings.
 
 .. include:: config_table.rst
+   :start-after: .. end-config-table-note
diff --git a/docs/source/deployment-guide/note_sections.rst b/docs/source/deployment-guide/note_sections.rst
index 4cd0d1c41dd..7b3fe3e563d 100644
--- a/docs/source/deployment-guide/note_sections.rst
+++ b/docs/source/deployment-guide/note_sections.rst
@@ -31,6 +31,6 @@
 
 .. note::
 
-   The configs here are specifically optimized for a target ISL/OSL (Input/Output Sequence Length) of 1024/1024. If your traffic pattern is different, refer to the :ref:`Comprehensive Configuration Database` section below which covers a larger set of traffic patterns and performance profiles.
+   The configs here are specifically optimized for a target ISL/OSL (Input/Output Sequence Length) of 1024/1024. If your traffic pattern is different, refer to the :ref:`Preconfigured Recipes` section below which covers a larger set of traffic patterns and performance profiles.
 
 .. end-note-quick-start-isl-osl
diff --git a/docs/source/quick-start-guide.md b/docs/source/quick-start-guide.md
index 03458cb08fd..6eff451feb6 100644
--- a/docs/source/quick-start-guide.md
+++ b/docs/source/quick-start-guide.md
@@ -10,7 +10,7 @@ This is the starting point to try out TensorRT LLM. Specifically, this Quick Sta
 The [TensorRT LLM container](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/tensorrt-llm/containers/release/tags) maintained by NVIDIA contains all of the required dependencies pre-installed. You can start the container on a machine with NVIDIA GPUs via:
 
 ```bash
-docker run --rm -it --ipc host --gpus all --ulimit memlock=-1 --ulimit stack=67108864 -p 8000:8000 nvcr.io/nvidia/tensorrt-llm/release:x.y.z
+docker run --rm -it --ipc host --gpus all --ulimit memlock=-1 --ulimit stack=67108864 -p 8000:8000 nvcr.io/nvidia/tensorrt-llm/release:1.2.0rc6
 ```
 
diff --git a/scripts/generate_config_table.py b/scripts/generate_config_table.py
index 2d423c0811f..24e30668b1d 100644
--- a/scripts/generate_config_table.py
+++ b/scripts/generate_config_table.py
@@ -14,15 +14,36 @@
 # limitations under the License.
 
+import json
 import os
 import sys
 from collections import defaultdict
+from dataclasses import asdict, dataclass
 from pathlib import Path
 
-from examples.configs.database.database import DATABASE_LIST_PATH, RecipeList
-
 SCRIPT_DIR = Path(__file__).parent.resolve()
 REPO_ROOT = SCRIPT_DIR.parent
+
+
+def _ensure_repo_root_on_syspath() -> None:
+    if str(REPO_ROOT) not in sys.path:
+        sys.path.insert(0, str(REPO_ROOT))
+
+
+def _load_recipe_list(yaml_path: Path):
+    _ensure_repo_root_on_syspath()
+    from examples.configs.database.database import RecipeList
+
+    return RecipeList.from_yaml(yaml_path)
+
+
+def _default_database_list_path() -> Path:
+    _ensure_repo_root_on_syspath()
+    from examples.configs.database.database import DATABASE_LIST_PATH
+
+    return Path(DATABASE_LIST_PATH)
+
+
 MODEL_INFO = {
     "deepseek-ai/DeepSeek-R1-0528": {
         "display_name": "DeepSeek-R1",
@@ -42,27 +63,139 @@
 HIGH_THROUGHPUT_CONCURRENCY_THRESHOLD = 32
 
-def generate_rst(yaml_path, output_file=None):
-    """Generate RST table from YAML config database.
+@dataclass(frozen=True)
+class RecipeRow:
+    model: str
+    model_display_name: str
+    model_url: str
+    gpu: str
+    num_gpus: int
+    isl: int
+    osl: int
+    concurrency: int
+    config_path: str
+    gpu_display: str
+    performance_profile: str
+    command: str
+    config_filename: str
+    config_github_url: str
+    config_raw_url: str
+
+
+def _model_display_and_url(model: str) -> tuple[str, str]:
+    if model in MODEL_INFO:
+        info = MODEL_INFO[model]
+        return info["display_name"], info["url"]
+    return model, ""
+
+
+def _profile_from_sorted_entries(concurrencies: list[int], idx: int) -> str:
+    """Assign a performance profile given entries sorted by concurrency."""
+    n = len(concurrencies)
+    conc = concurrencies[idx]
+
+    if n == 1:
+        if conc <= LOW_LATENCY_CONCURRENCY_THRESHOLD:
+            return "Low Latency"
+        if conc >= HIGH_THROUGHPUT_CONCURRENCY_THRESHOLD:
+            return "High Throughput"
+        return "Balanced"
+
+    if idx == 0:
+        return "Min Latency"
+    if idx == n - 1:
+        return "Max Throughput"
+    if idx in ((n - 1) // 2, n // 2):
+        return "Balanced"
+    if idx < n // 2:
+        return "Low Latency"
+    return "High Throughput"
-
-    Args:
-        yaml_path: Path to lookup.yaml (str or Path)
-        output_file: Optional output file path. If None, prints to stdout.
- """ - recipe_list = RecipeList.from_yaml(Path(yaml_path)) - # Group by model -> (gpu, isl, osl) -> list of recipes +def build_rows(yaml_path) -> list[RecipeRow]: + recipe_list = _load_recipe_list(Path(yaml_path)) + model_groups = defaultdict(lambda: defaultdict(list)) for recipe in recipe_list: - key = (recipe.gpu, recipe.isl, recipe.osl) + key = (recipe.gpu, recipe.num_gpus, recipe.isl, recipe.osl) model_groups[recipe.model][key].append(recipe) + rows: list[RecipeRow] = [] + + sorted_models = sorted(model_groups.keys()) + for model in sorted_models: + subgroups = model_groups[model] + sorted_keys = sorted( + subgroups.keys(), + key=lambda k: (str(k[0]), int(k[1] or 0), int(k[2] or 0), int(k[3] or 0)), + ) + + model_display_name, model_url = _model_display_and_url(model) + + for key in sorted_keys: + entries = subgroups[key] + entries.sort(key=lambda x: x.concurrency) + concurrencies = [e.concurrency for e in entries] + + for idx, entry in enumerate(entries): + gpu = entry.gpu + num_gpus = entry.num_gpus + gpu_display = f"{num_gpus}x{gpu}" if num_gpus and num_gpus > 1 else gpu + isl = entry.isl + osl = entry.osl + conc = entry.concurrency + config_path = entry.config_path + + profile = _profile_from_sorted_entries(concurrencies, idx) + + command = ( + f"trtllm-serve {model} --extra_llm_api_options ${{TRTLLM_DIR}}/{config_path}" + ) + + config_filename = os.path.basename(config_path) + config_github_url = ( + f"https://github.com/NVIDIA/TensorRT-LLM/blob/main/{config_path}" + ) + config_raw_url = ( + f"https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/{config_path}" + ) + + rows.append( + RecipeRow( + model=model, + model_display_name=model_display_name, + model_url=model_url, + gpu=gpu, + num_gpus=num_gpus, + isl=isl, + osl=osl, + concurrency=conc, + config_path=config_path, + gpu_display=gpu_display, + performance_profile=profile, + command=command, + config_filename=config_filename, + config_github_url=config_github_url, + config_raw_url=config_raw_url, + ) + ) + + return rows + + +def generate_rst(yaml_path, output_file=None): + rows = build_rows(yaml_path) + model_groups = defaultdict(list) + for row in rows: + model_groups[row.model].append(row) + lines = [] - # Include note_sections.rst at the top (relative include for Sphinx) + lines.append(".. start-config-table-note") lines.append(".. include:: note_sections.rst") lines.append(" :start-after: .. start-note-traffic-patterns") lines.append(" :end-before: .. end-note-traffic-patterns") + lines.append(".. end-config-table-note") lines.append("") sorted_models = sorted(model_groups.keys()) @@ -71,16 +204,16 @@ def generate_rst(yaml_path, output_file=None): lines.append(f".. start-{model}") lines.append("") - if model in MODEL_INFO: - info = MODEL_INFO[model] - title_text = f"`{info['display_name']} <{info['url']}>`_" + model_display_name, model_url = _model_display_and_url(model) + if model_url: + title_text = f"`{model_display_name} <{model_url}>`_" else: title_text = model lines.append(f".. _{model}:") lines.append("") lines.append(title_text) - lines.append("^" * len(title_text)) + lines.append("~" * len(title_text)) lines.append("") lines.append(".. 
list-table::") @@ -95,57 +228,25 @@ def generate_rst(yaml_path, output_file=None): lines.append(" - Config") lines.append(" - Command") - subgroups = model_groups[model] - sorted_keys = sorted( - subgroups.keys(), key=lambda k: (str(k[0]), int(k[1] or 0), int(k[2] or 0)) + entries = sorted( + model_groups[model], + key=lambda r: ( + str(r.gpu), + int(r.num_gpus or 0), + int(r.isl or 0), + int(r.osl or 0), + int(r.concurrency or 0), + ), ) - for key in sorted_keys: - entries = subgroups[key] - entries.sort(key=lambda x: x.concurrency) - n = len(entries) - - for idx, entry in enumerate(entries): - gpu = entry.gpu - num_gpus = entry.num_gpus - gpu_display = f"{num_gpus}x{gpu}" if num_gpus and num_gpus > 1 else gpu - isl = entry.isl - osl = entry.osl - conc = entry.concurrency - config_path = entry.config_path - - if n == 1: - if conc <= LOW_LATENCY_CONCURRENCY_THRESHOLD: - profile = "Low Latency" - elif conc >= HIGH_THROUGHPUT_CONCURRENCY_THRESHOLD: - profile = "High Throughput" - else: - profile = "Balanced" - elif idx == 0: - profile = "Min Latency" - elif idx == n - 1: - profile = "Max Throughput" - elif idx in ((n - 1) // 2, n // 2): - profile = "Balanced" - elif idx < n // 2: - profile = "Low Latency" - else: - profile = "High Throughput" - - full_config_path = config_path - command = f"trtllm-serve {model} --extra_llm_api_options ${{TRTLLM_DIR}}/{full_config_path}" - - config_filename = os.path.basename(full_config_path) - - github_url = f"https://github.com/NVIDIA/TensorRT-LLM/blob/main/{full_config_path}" - config_link = f"`{config_filename} <{github_url}>`_" - - lines.append(f" * - {gpu_display}") - lines.append(f" - {profile}") - lines.append(f" - {isl} / {osl}") - lines.append(f" - {conc}") - lines.append(f" - {config_link}") - lines.append(f" - ``{command}``") + for row in entries: + config_link = f"`{row.config_filename} <{row.config_github_url}>`_" + lines.append(f" * - {row.gpu_display}") + lines.append(f" - {row.performance_profile}") + lines.append(f" - {row.isl} / {row.osl}") + lines.append(f" - {row.concurrency}") + lines.append(f" - {config_link}") + lines.append(f" - ``{row.command}``") lines.append("") lines.append(f".. 
end-{model}") @@ -155,15 +256,45 @@ def generate_rst(yaml_path, output_file=None): if output_file: with open(output_file, "w") as f: f.write(output_text) - print(f"Generated table written to: {output_file}", file=sys.stderr) else: print(output_text) +def generate_json(yaml_path, output_file): + rows = build_rows(yaml_path) + + source_path = Path(yaml_path) + source = ( + str(source_path.relative_to(REPO_ROOT)) + if source_path.is_relative_to(REPO_ROOT) + else str(source_path) + ) + + models = {} + for row in rows: + if row.model not in models: + models[row.model] = { + "display_name": row.model_display_name, + "url": row.model_url, + } + + payload = { + "source": source, + "models": models, + "entries": [asdict(r) for r in rows], + } + + with open(output_file, "w") as f: + json.dump(payload, f, indent=2, sort_keys=True) + f.write("\n") + + if __name__ == "__main__": - yaml_path = DATABASE_LIST_PATH + yaml_path = _default_database_list_path() if not yaml_path.exists(): print(f"Error: YAML file not found at {yaml_path}", file=sys.stderr) sys.exit(1) output_path = REPO_ROOT / "docs/source/deployment-guide/config_table.rst" + json_output_path = REPO_ROOT / "docs/source/_static/config_db.json" generate_rst(yaml_path, output_file=output_path) + generate_json(yaml_path, output_file=json_output_path) diff --git a/tests/unittest/tools/test_generate_config_table.py b/tests/unittest/tools/test_generate_config_table.py index a2dcf66783f..259fb0747c7 100644 --- a/tests/unittest/tools/test_generate_config_table.py +++ b/tests/unittest/tools/test_generate_config_table.py @@ -23,7 +23,7 @@ SCRIPTS_DIR = os.path.join(REPO_ROOT, "scripts") sys.path.insert(0, SCRIPTS_DIR) -from generate_config_table import generate_rst # noqa: E402 +from generate_config_table import generate_json, generate_rst # noqa: E402 class TestConfigTableSync(unittest.TestCase): @@ -32,21 +32,27 @@ def test_config_table_sync(self): Ensures that the RST file is up-to-date with the YAML database. """ - if generate_rst is None: + if generate_rst is None or generate_json is None: self.skipTest("generate_config_table not available") # Define paths yaml_path = os.path.join(REPO_ROOT, "examples/configs/database/lookup.yaml") rst_path = os.path.join(REPO_ROOT, "docs/source/deployment-guide/config_table.rst") + json_path = os.path.join(REPO_ROOT, "docs/source/_static/config_db.json") # Ensure files exist self.assertTrue(os.path.exists(yaml_path), f"YAML file not found: {yaml_path}") self.assertTrue(os.path.exists(rst_path), f"RST file not found: {rst_path}") + self.assertTrue(os.path.exists(json_path), f"JSON file not found: {json_path}") # Read existing RST content with open(rst_path, "r") as f: existing_content = f.read() + # Read existing JSON content + with open(json_path, "r") as f: + existing_json = f.read() + # Generate new RST content with tempfile.NamedTemporaryFile(mode="w+", delete=True) as tmp: generate_rst(yaml_path, output_file=tmp.name) @@ -61,6 +67,19 @@ def test_config_table_sync(self): "Please run 'python3 scripts/generate_config_table.py' from the repo root to update it.", ) + # Generate new JSON content and compare + with tempfile.NamedTemporaryFile(mode="w+", delete=True) as tmp: + generate_json(yaml_path, output_file=tmp.name) + tmp.seek(0) + generated_json = tmp.read() + + self.assertEqual( + existing_json.strip(), + generated_json.strip(), + "config_db.json is not synchronized with lookup.yaml. 
" + "Please run 'python3 scripts/generate_config_table.py' from the repo root to update it.", + ) + if __name__ == "__main__": unittest.main()