feat: Update app with datasources. Update slurm scripts and result parsing.
Hugoch committed Oct 1, 2024
1 parent 7d1612f commit 4ad4904
Showing 5 changed files with 182 additions and 57 deletions.
216 changes: 168 additions & 48 deletions extra/dashboard/app.py
@@ -1,3 +1,5 @@
import os
from cProfile import label
from contextlib import ExitStack
from dataclasses import dataclass

@@ -31,52 +33,132 @@ class PlotConfig:
Each benchmark is run using a constant arrival rate of requests per second (QPS),
independently of the number of requests that are being processed (open loop).
The metrics are:
* Inter token latency: Time to generate a new output token for each user querying the system.
It reflects the “speed” perceived by the end user. We aim for at least 300 words per minute (average reading speed), so ITL < 150 ms
(300 words/min ≈ 5 words/s, i.e. roughly 6–7 tokens/s assuming ~1.3 tokens per word).
* Time to First Token: Time the user has to wait before seeing the first token of the answer.
Lower waiting times are essential for real-time interactions, less so for offline workloads.
* End-to-end latency: The overall time the system took to generate the full response to the user.
* Throughput: The number of tokens per second the system can generate across all requests
* Successful requests: The number of requests the system was able to honor in the benchmark timeframe
* Error rate: The percentage of requests that ended up in error, as the system could not process them in time or failed to process them.
⚠️ TGI has a rate-limiting mechanism that will throttle requests, so a high error rate can be a sign that the rate limit was hit.
'''

df = pd.DataFrame()
df_bench = pd.DataFrame()
df_ci = pd.DataFrame()
summary = pd.DataFrame()
line_plots = []
column_mappings = {'inter_token_latency_ms_p90': 'ITL P90 (ms)', 'time_to_first_token_ms_p90': 'TTFT P90 (ms)',
'e2e_latency_ms_p90': 'E2E P90 (ms)', 'token_throughput_secs': 'Throughput (tokens/s)',
'successful_requests': 'Successful requests', 'error_rate': 'Error rate (%)', 'model': 'Model',
'rate': 'QPS'}


def plot(model, device) -> pd.DataFrame:
d = df[(df['model'] == model) & (df['device'] == device)]
d = df_bench[(df_bench['model'] == model) & (df_bench['device'] == device)]
return d


def update_app(device, model):
def update_app(device_bench, device_ci, model, commit_ref, commit_compare):
res = []
for plot in line_plots:
res.append(df[(df['model'] == model) & (df['device'] == device)])
return res + [summary_table(device)]
res.append(df_bench[(df_bench['model'] == model) & (df_bench['device'] == device_bench)])
return res + [summary_table(device_bench), compare_table(device_ci, commit_ref, commit_compare)]


def summary_table(device) -> pd.DataFrame:
rates = [4., 8., 16.]
data = df[(df['device'] == device) & (df['rate'].isin(rates))]
data = data.groupby(['model', 'rate']).agg(
data = df_bench[(df_bench['device'] == device) & (df_bench['rate'].isin(rates))]
data = data.groupby(['model', 'rate', 'engine']).agg(
{'inter_token_latency_ms_p90': 'mean', 'time_to_first_token_ms_p90': 'mean',
'e2e_latency_ms_p90': 'mean', 'token_throughput_secs': 'mean',
'successful_requests': 'mean', 'error_rate': 'mean'}).reset_index()
data = data[['model', 'rate', 'inter_token_latency_ms_p90', 'time_to_first_token_ms_p90', 'e2e_latency_ms_p90',
'token_throughput_secs']]
data = data[
['model', 'engine', 'rate', 'inter_token_latency_ms_p90', 'time_to_first_token_ms_p90', 'e2e_latency_ms_p90',
'token_throughput_secs']]
for metric in ['inter_token_latency_ms_p90', 'time_to_first_token_ms_p90', 'e2e_latency_ms_p90',
'token_throughput_secs']:
data[metric] = data[metric].apply(lambda x: f"{x:.2f}")
data = data.rename(
columns={'inter_token_latency_ms_p90': 'ITL P90 (ms)', 'time_to_first_token_ms_p90': 'TTFT P90 (ms)',
'e2e_latency_ms_p90': 'E2E P90 (ms)', 'token_throughput_secs': 'Throughput (tokens/s)',
'successful_requests': 'Successful requests', 'error_rate': 'Error rate (%)', 'model': 'Model',
'rate': 'QPS'})
columns=column_mappings)
return data


def load_data() -> pd.DataFrame:
data = pd.read_parquet('results.parquet')
def compare_table(device, commit_ref, commit_compare) -> pd.DataFrame:
rates = [4., 8., 16.]
metrics = ['inter_token_latency_ms_p90', 'time_to_first_token_ms_p90', 'e2e_latency_ms_p90',
'token_throughput_secs']
data = df_ci[(df_ci['device'] == device) & (df_ci['rate'].isin(rates))]
ref = data[(data['device'] == device) & (data['version'] == commit_ref) & (data['engine'] == 'TGI')]
compare = data[(data['device'] == device) & (data['version'] == commit_compare) & (data['engine'] == 'TGI')]
data = ref.merge(compare, on=['model', 'rate'], suffixes=('_ref', '_compare'))
data = data.rename(
columns=column_mappings)
for metric in metrics:
name = column_mappings[metric]
data[f'∆ {name}'] = (data[f'{metric}_compare'] - data[f'{metric}_ref']) / data[f'{metric}_ref'] * 100.0
data[f'∆ {name}'] = data[f'∆ {name}'].apply(lambda x: f"{x:.2f}%")
data = data[['Model', 'QPS'] + [f'∆ {column_mappings[metric]}' for metric in metrics]]

return data


def load_bench_results(source) -> pd.DataFrame:
data = pd.read_parquet(source)
# remove warmup and throughput
data = data[(data['id'] != 'warmup') & (data['id'] != 'throughput')]
# only keep constant rate
data = data[data['executor_type'] == 'ConstantArrivalRate']
# sanity check: we should have only one version per engine
# assert data.groupby(['engine', 'version']).size().reset_index().shape[0] == 2
return data


def load_ci_results(source) -> pd.DataFrame:
data = pd.read_parquet(source)
return data


def select_region(selection: gr.SelectData, device, model):
min_w, max_w = selection.index
data = df_bench[(df_bench['model'] == model) & (df_bench['device'] == device) & (df_bench['rate'] >= min_w) & (
df_bench['rate'] <= max_w)]
res = []
for plot in line_plots:
# find the y values for the selected region
metric = plot["metric"]
y_min = data[metric].min()
y_max = data[metric].max()
res.append(gr.LinePlot(x_lim=[min_w, max_w], y_lim=[y_min, y_max]))
return res


def reset_region():
res = []
for _ in line_plots:
res.append(gr.LinePlot(x_lim=None, y_lim=None))
return res


def load_datasource(datasource, fn):
if datasource.startswith('file://'):
return fn(datasource)
elif datasource.startswith('s3://'):
return fn(datasource)
else:
raise ValueError(f"Unknown datasource: {datasource}")


if __name__ == '__main__':
datasource_bench = os.environ.get('DATASOURCE_BENCH', 'file://benchmarks.parquet')
datasource_ci = os.environ.get('DATASOURCE_CI', 'file://ci.parquet')
df_bench = load_datasource(datasource_bench, load_bench_results)
df_ci = load_datasource(datasource_ci, load_ci_results)

metrics = {
"inter_token_latency_ms_p90": PlotConfig(title="Inter Token Latency P90 (lower is better)", x_title="QPS",
y_title="Time (ms)"),
@@ -88,44 +170,82 @@ def load_data() -> pd.DataFrame:
y_title="Tokens/s"),
"successful_requests": PlotConfig(title="Successful requests (higher is better)", x_title="QPS",
y_title="Count"),
"error_rate": PlotConfig(title="Error rate (lower is better)", x_title="QPS", y_title="%")
"error_rate": PlotConfig(title="Error rate", x_title="QPS", y_title="%")
}
default_df = pd.DataFrame.from_dict(
{"rate": [1, 2], "inter_token_latency_ms_p90": [10, 20], "engine": ["tgi", "vllm"]})
df = load_data()
models = df["model"].unique()
devices = df["device"].unique()
# df_bench = load_bench_results()
# df_ci = load_ci_results()
models = df_bench["model"].unique()
devices_bench = df_bench["device"].unique()
devices_ci = df_ci["device"].unique()
commits = df_ci[df_ci["engine"] == "TGI"]["version"].unique()
with gr.Blocks(css=css, title="TGI benchmarks") as demo:
with gr.Row():
header = gr.Markdown("# TGI benchmarks\nBenchmark results for Hugging Face TGI 🤗")
with gr.Row():
device = gr.Radio(devices, label="Select device", value="H100")
with gr.Row():
summary_desc = gr.Markdown(summary_desc)
with gr.Row():
table = gr.DataFrame(
pd.DataFrame(),
elem_classes=["summary"],
)
with gr.Row():
details_desc = gr.Markdown("## Details")
with gr.Row():
model = gr.Dropdown(list(models), label="Select model", value=models[0])
i = 0
with ExitStack() as stack:
for k, v in metrics.items():
if i % 2 == 0:
stack.close()
gs = stack.enter_context(gr.Row())
line_plots.append(
{"component": gr.LinePlot(default_df, label=f'{v.title}', x="rate", y=k,
color="engine", y_title=v.y_title,
color_map={'vLLM': '#2F5BA1', 'TGI': '#FF9D00'}), "model": model.value,
"device": device})
i += 1

device.change(update_app, [device, model], [item["component"] for item in line_plots] + [table])
model.change(update_app, [device, model], [item["component"] for item in line_plots] + [table])
demo.load(update_app, [device, model], [item["component"] for item in line_plots] + [table])
with gr.Tab(label="TGI benchmarks"):
with gr.Row():
device_bench = gr.Radio(devices_bench, label="Select device", value="H100")
with gr.Row():
summary_desc = gr.Markdown(summary_desc)
versions = df_bench.groupby(['engine', 'version']).size().reset_index()
with gr.Row():
versions_md = "**Versions**\n"
for engine in versions['engine'].unique():
versions_md += f"* **{engine}**: {versions[versions['engine'] == engine]['version'].values[0]}\n"
versions_desc = gr.Markdown(versions_md)
with gr.Row():
table = gr.DataFrame(
pd.DataFrame(),
elem_classes=["summary"],
)
with gr.Row():
details_desc = gr.Markdown("## Details")
with gr.Row():
model = gr.Dropdown(list(models), label="Select model", value=models[0])
colors = ['#640D5F', '#D91656', '#EE66A6', '#FFEB55']
colormap = {}
for idx, engine in enumerate(df_bench['engine'].unique()):
colormap[engine] = colors[idx % len(colors)]
colormap['vLLM'] = '#2F5BA1'
colormap['TGI'] = '#FF9D00'
i = 0
with ExitStack() as stack:
for k, v in metrics.items():
if i % 2 == 0:
stack.close()
gs = stack.enter_context(gr.Row())
line_plots.append(
{"component": gr.LinePlot(default_df, label=f'{v.title}', x="rate", y=k,
color="engine", y_title=v.y_title, x_title=v.x_title,
color_map=colormap),
"model": model.value,
"device": device_bench,
"metric": k
},
)
i += 1
with gr.Tab(label="CI results"):
with gr.Row():
header = gr.Markdown("# CI results\nSummary of the benchmarks")
with gr.Row():
device_ci = gr.Radio(list(devices_ci), label="Select device", value=devices_ci[0])
with gr.Row():
commit_ref = gr.Dropdown(list(commits), label="Reference commit", value=commits[0])
commit_compare = gr.Dropdown(list(commits), label="Commit to compare", value=commits[0])
with gr.Row():
comparison_table = gr.DataFrame(
pd.DataFrame(),
elem_classes=["summary"],
)
for component in [device_bench, device_ci, model, commit_ref, commit_compare]:
component.change(update_app, [device_bench, device_ci, model, commit_ref, commit_compare],
[item["component"] for item in line_plots] + [table, comparison_table])
gr.on([plot["component"].select for plot in line_plots], select_region, [device_bench, model],
outputs=[item["component"] for item in line_plots])
gr.on([plot["component"].double_click for plot in line_plots], reset_region, None,
outputs=[item["component"] for item in line_plots])
demo.load(update_app, [device_bench, device_ci, model, commit_ref, commit_compare],
[item["component"] for item in line_plots] + [table, comparison_table])

demo.launch()
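The new CI tab compares two TGI commits by merging the reference and compare frames and taking per-metric percentage deltas. A minimal standalone sketch of that computation, using made-up numbers and only the e2e_latency_ms_p90 column (column names follow the app; values are illustrative, not from a real run):

import pandas as pd

ref = pd.DataFrame({'model': ['llama-70b'], 'rate': [8.0], 'e2e_latency_ms_p90': [4200.0]})
compare = pd.DataFrame({'model': ['llama-70b'], 'rate': [8.0], 'e2e_latency_ms_p90': [3990.0]})
merged = ref.merge(compare, on=['model', 'rate'], suffixes=('_ref', '_compare'))
# Delta is (compare - ref) / ref * 100; negative values mean the compared commit
# lowered the metric (an improvement for latency metrics).
merged['∆ E2E P90 (ms)'] = (
    (merged['e2e_latency_ms_p90_compare'] - merged['e2e_latency_ms_p90_ref'])
    / merged['e2e_latency_ms_p90_ref'] * 100.0
).apply(lambda x: f"{x:.2f}%")
print(merged[['model', 'rate', '∆ E2E P90 (ms)']])  # -> -5.00%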
1 change: 1 addition & 0 deletions extra/slurm/benchmark.py
@@ -11,6 +11,7 @@ def main():
('meta-llama/Llama-3.1-70B-Instruct', 4),
('mistralai/Mixtral-8x7B-Instruct-v0.1', 2),
('neuralmagic/Meta-Llama-3-70B-Instruct-FP8', 2),
('CohereForAI/c4ai-command-r-plus-08-2024', 4),
]
num_passes = 2
engines = ['tgi', 'vllm']
6 changes: 3 additions & 3 deletions extra/slurm/tgi.slurm
@@ -54,11 +54,11 @@ mkdir -p "${RESULTS_DIR}"
if [[ $exit_code != 124 ]]; then
# run benchmark
echo "Starting benchmark"
VERSION=$(curl -s http://${SLURM_JOB_NODELIST_HET_GROUP_0}:${PORT}/info | jq -r '.version')
VERSION=$(curl -s http://${SLURM_JOB_NODELIST_HET_GROUP_0}:${PORT}/info | jq -r '.sha')
srun --het-group=1 \
-u \
-n 1 \
--container-image="registry.hpc-cluster-hopper.hpc.internal.huggingface.tech#library/text-generation-inference-benchmark:latest" \
--container-image="ghcr.io#huggingface/text-generation-inference-benchmark:latest" \
--container-mounts="${RESULTS_DIR}:/opt/text-generation-inference-benchmark/results" \
--no-container-mount-home \
text-generation-inference-benchmark \
@@ -71,7 +71,7 @@ if [[ $exit_code != 124 ]]; then
--rates 0.8 --rates 1.6 --rates 2.4 --rates 3.2 --rates 4.0 --rates 4.8 --rates 5.6 --rates 6.4 --rates 7.2 --rates 8.0 --rates 8.8 --rates 9.6 --rates 10.4 --rates 11.2 --rates 12.0 --rates 12.8 --rates 13.6 --rates 14.4 --rates 15.2 --rates 16.0 --rates 16.8 --rates 17.6 --rates 18.4 --rates 19.2 --rates 20.0 --rates 20.8 --rates 21.6 --rates 22.4 --rates 23.2 --rates 24.0 \
--prompt-options "num_tokens=200,max_tokens=220,min_tokens=180,variance=10" \
--decode-options "num_tokens=200,max_tokens=220,min_tokens=180,variance=10" \
--extra-meta "version=$VERSION,engine=\"TGI\",tp=$TP" \
--extra-meta "version=$VERSION,engine=TGI,tp=$TP" \
--no-console
fi

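The TGI job now tags results with the server's git sha rather than its release version. A sketch of the same lookup the script performs with curl and jq, written in Python (host and port are placeholders for the values resolved from the slurm environment):

import requests

host, port = "localhost", 8080  # placeholders for ${SLURM_JOB_NODELIST_HET_GROUP_0} and ${PORT}
# TGI's /info endpoint exposes both a release version and the git sha of the build
info = requests.get(f"http://{host}:{port}/info").json()
version = info["sha"]  # used as the version tag from this commit on (previously info["version"])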
4 changes: 2 additions & 2 deletions extra/slurm/vllm.slurm
@@ -57,7 +57,7 @@ if [[ $exit_code != 124 ]]; then
srun --het-group=1 \
-u \
-n 1 \
--container-image="registry.hpc-cluster-hopper.hpc.internal.huggingface.tech#library/text-generation-inference-benchmark:latest" \
--container-image="ghcr.io#huggingface/text-generation-inference-benchmark:latest" \
--container-mounts="${RESULTS_DIR}:/opt/text-generation-inference-benchmark/results" \
--no-container-mount-home \
text-generation-inference-benchmark \
@@ -70,7 +70,7 @@ if [[ $exit_code != 124 ]]; then
--rates 0.8 --rates 1.6 --rates 2.4 --rates 3.2 --rates 4.0 --rates 4.8 --rates 5.6 --rates 6.4 --rates 7.2 --rates 8.0 --rates 8.8 --rates 9.6 --rates 10.4 --rates 11.2 --rates 12.0 --rates 12.8 --rates 13.6 --rates 14.4 --rates 15.2 --rates 16.0 --rates 16.8 --rates 17.6 --rates 18.4 --rates 19.2 --rates 20.0 --rates 20.8 --rates 21.6 --rates 22.4 --rates 23.2 --rates 24.0 \
--prompt-options "num_tokens=200,max_tokens=220,min_tokens=180,variance=10" \
--decode-options "num_tokens=200,max_tokens=220,min_tokens=180,variance=10" \
--extra-meta "version=$VERSION,engine=\"vLLM\",tp=$TP" \
--extra-meta "version=$VERSION,engine=vLLM,tp=$TP" \
--no-console
fi

12 changes: 8 additions & 4 deletions parse_results.py
@@ -1,9 +1,8 @@
import argparse
import json
import os
from operator import index

import pandas as pd
from pandas import DataFrame


def build_df(model: str, data_files: dict[str, str]) -> pd.DataFrame:
@@ -16,7 +15,9 @@ def build_df(model: str, data_files: dict[str, str]) -> pd.DataFrame:
entry = result
[config] = pd.json_normalize(result['config']).to_dict(orient='records')
entry.update(config)
entry['engine'] = key
entry['engine'] = data['config']['meta']['engine']
entry['tp'] = data['config']['meta']['tp']
entry['version'] = data['config']['meta']['version']
entry['model'] = model
del entry['config']
df = pd.concat([df, pd.DataFrame(entry, index=[0])])
@@ -39,7 +40,10 @@ def build_results_df() -> pd.DataFrame:


if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--results-file', type=str, required=True, help='Path to the results file / S3 bucket')
args = parser.parse_args()
df = build_results_df()
df['device'] = df['model'].apply(lambda x: 'H100')
df['error_rate'] = df['failed_requests'] / (df['failed_requests'] + df['successful_requests']) * 100.0
df.to_parquet('results.parquet')
df.to_parquet(args.results_file)
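build_df now derives engine, tp and version from each result file's meta block instead of the data-file key, which is presumably also why the slurm scripts stopped escaping quotes around the engine name (keeping the stored value as a bare TGI/vLLM string). A hedged sketch of the JSON shape build_df appears to expect; the field values are illustrative, not taken from a real run:

import pandas as pd

result_json = {
    "config": {
        "meta": {
            # presumably populated by text-generation-inference-benchmark from
            # --extra-meta "version=$VERSION,engine=TGI,tp=$TP"
            "version": "4ad4904",  # illustrative sha; tgi.slurm now reads it from /info '.sha'
            "engine": "TGI",
            "tp": "4",
        }
    },
}

meta = result_json["config"]["meta"]
entry = {"engine": meta["engine"], "tp": meta["tp"], "version": meta["version"], "model": "llama-70b"}
print(pd.DataFrame(entry, index=[0]))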
