Skip to content
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Commit 1956931

Browse files
authored Mar 27, 2024
[Misc] add the "download-dir" option to the latency/throughput benchmarks (vllm-project#3621)
1 parent e24336b commit 1956931

File tree

2 files changed

+32
-19
lines changed

2 files changed

+32
-19
lines changed
 

‎benchmarks/benchmark_latency.py

+16-12
Original file line number · Diff line number · Diff line change
@@ -16,18 +16,17 @@ def main(args: argparse.Namespace):
1616

1717
# NOTE(woosuk): If the request cannot be processed in a single batch,
1818
# the engine will automatically process the request in multiple batches.
19-
llm = LLM(
20-
model=args.model,
21-
tokenizer=args.tokenizer,
22-
quantization=args.quantization,
23-
tensor_parallel_size=args.tensor_parallel_size,
24-
trust_remote_code=args.trust_remote_code,
25-
dtype=args.dtype,
26-
enforce_eager=args.enforce_eager,
27-
kv_cache_dtype=args.kv_cache_dtype,
28-
device=args.device,
29-
ray_workers_use_nsight=args.ray_workers_use_nsight,
30-
)
19+
llm = LLM(model=args.model,
20+
tokenizer=args.tokenizer,
21+
quantization=args.quantization,
22+
tensor_parallel_size=args.tensor_parallel_size,
23+
trust_remote_code=args.trust_remote_code,
24+
dtype=args.dtype,
25+
enforce_eager=args.enforce_eager,
26+
kv_cache_dtype=args.kv_cache_dtype,
27+
device=args.device,
28+
ray_workers_use_nsight=args.ray_workers_use_nsight,
29+
download_dir=args.download_dir)
3130

3231
sampling_params = SamplingParams(
3332
n=args.n,
@@ -151,5 +150,10 @@ def run_to_completion(profile_dir: Optional[str] = None):
151150
action='store_true',
152151
help="If specified, use nsight to profile ray workers",
153152
)
153+
parser.add_argument('--download-dir',
154+
type=str,
155+
default=None,
156+
help='directory to download and load the weights, '
157+
'default to the default cache dir of huggingface')
154158
args = parser.parse_args()
155159
main(args)

‎benchmarks/benchmark_throughput.py

+16-7
Original file line number · Diff line number · Diff line change
@@ -75,6 +75,7 @@ def run_vllm(
7575
device: str,
7676
enable_prefix_caching: bool,
7777
gpu_memory_utilization: float = 0.9,
78+
download_dir: Optional[str] = None,
7879
) -> float:
7980
from vllm import LLM, SamplingParams
8081
llm = LLM(model=model,
@@ -89,7 +90,8 @@ def run_vllm(
8990
enforce_eager=enforce_eager,
9091
kv_cache_dtype=kv_cache_dtype,
9192
device=device,
92-
enable_prefix_caching=enable_prefix_caching)
93+
enable_prefix_caching=enable_prefix_caching,
94+
download_dir=download_dir)
9395

9496
# Add the requests to the engine.
9597
for prompt, _, output_len in requests:
@@ -208,12 +210,14 @@ def main(args: argparse.Namespace):
208210
args.output_len)
209211

210212
if args.backend == "vllm":
211-
elapsed_time = run_vllm(
212-
requests, args.model, args.tokenizer, args.quantization,
213-
args.tensor_parallel_size, args.seed, args.n, args.use_beam_search,
214-
args.trust_remote_code, args.dtype, args.max_model_len,
215-
args.enforce_eager, args.kv_cache_dtype, args.device,
216-
args.enable_prefix_caching, args.gpu_memory_utilization)
213+
elapsed_time = run_vllm(requests, args.model, args.tokenizer,
214+
args.quantization, args.tensor_parallel_size,
215+
args.seed, args.n, args.use_beam_search,
216+
args.trust_remote_code, args.dtype,
217+
args.max_model_len, args.enforce_eager,
218+
args.kv_cache_dtype, args.device,
219+
args.enable_prefix_caching,
220+
args.gpu_memory_utilization, args.download_dir)
217221
elif args.backend == "hf":
218222
assert args.tensor_parallel_size == 1
219223
elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
@@ -314,6 +318,11 @@ def main(args: argparse.Namespace):
314318
"--enable-prefix-caching",
315319
action='store_true',
316320
help="enable automatic prefix caching for vLLM backend.")
321+
parser.add_argument('--download-dir',
322+
type=str,
323+
default=None,
324+
help='directory to download and load the weights, '
325+
'default to the default cache dir of huggingface')
317326
args = parser.parse_args()
318327
if args.tokenizer is None:
319328
args.tokenizer = args.model

0 commit comments

Comments (0)