diff --git a/benchmarks/inference/collect_results.py b/benchmarks/inference/collect_results.py
new file mode 100644
index 000000000000..74f60e442470
--- /dev/null
+++ b/benchmarks/inference/collect_results.py
@@ -0,0 +1,135 @@
+import os
+import re
+import argparse
+import pandas as pd
+
+parser = argparse.ArgumentParser()
+parser.add_argument(
+    "--results-dir",
+    "-r",
+    type=str,
+    default="./results",
+    help="directory containing sweep results",
+)
+parser.add_argument("--version",
+                    "-v",
+                    type=int,
+                    default=0,
+                    help="version to be collected")
+parser.add_argument("--gen-text-n",
+                    "-n",
+                    type=int,
+                    default=1,
+                    help="expected number of generated text")
+parser.add_argument("--output",
+                    "-o",
+                    type=str,
+                    default="./results.csv",
+                    help="output file")
+args = parser.parse_args()
+
+
+def get_branch(file_path):
+    match = re.match(r".*\/(.*)\.log", file_path)
+    if match is None:
+        return False
+    else:
+        return match.groups()[0]
+
+
+def get_benchmark_params(root_dir, file_path):
+    match = re.match(
+        rf"{root_dir}\/(.+?)_(fp\d+)_(true|false)_(true|false)_(\d+)gpus_v(\d+)\/",
+        file_path,
+    )
+    if match is None:
+        return False
+    else:
+        model, dtype, graphs, kernel, gpus, version = match.groups()
+        bool_dict = {"true": True, "false": False}
+        return {
+            "model": model,
+            "dtype": dtype,
+            "graphs": bool_dict[graphs.lower()],
+            "kernel": bool_dict[kernel.lower()],
+            "gpus": int(gpus),
+            "version": int(version),
+        }
+
+
+def get_perf_data(file_content):
+    matches = re.findall(r"\s+(.+?)\sLatency:\s+(\d+\.\d+)\sms", file_content)
+    if not matches:
+        return False
+    else:
+        return {f"latency-{key}": float(val) for key, val in matches}
+
+
+def get_generated_text(file_content, gen_text_n):
+    file_content = file_content.replace("\n", " ")
+    file_content = file_content.replace("\t", " ")
+    matches = re.findall(r"RESPONSE\s(\d+):\s+[-]{30}\s+(.+?)\s+[-]{30}", file_content)
+    if len(matches) != gen_text_n:
+        return False
+    else:
+        return {f"generated-text-{key}": val for key, val in matches}
+
+
+if __name__ == "__main__":
+    # List to collect data from all benchmarks
+    benchmarks_data = []
+
+    # Walk through directory of results from sweep.sh
+    for root, dirs, files in os.walk(args.results_dir):
+        # Because of how some models are named, the dir structure for results can vary, e.g.:
+        # "EleutherAI/gpt-neo_*/baseline.log" versus "gpt2_*/baseline.log"
+        if dirs:
+            continue
+
+        # Get data from baseline and each tested branch
+        for name in files:
+            file_path = os.path.join(root, name)
+
+            branch = get_branch(file_path)
+            if not branch:
+                print(f"WARNING: Could not detect branch for file {file_path}, skipping")
+                continue
+
+            params = get_benchmark_params(args.results_dir, file_path)
+            if not params:
+                print(
+                    f"WARNING: Could not detect benchmark settings for file {file_path}, skipping"
+                )
+                continue
+
+            # Verify that the version matches that which we want to collect
+            if params["version"] != args.version:
+                continue
+
+            with open(file_path, "r") as f:
+                file_content = f.read()
+
+            perf_data = get_perf_data(file_content)
+            if not perf_data:
+                print(
+                    f"WARNING: Could not detect benchmark performance data for file {file_path}, skipping"
+                )
+                continue
+
+            generated_text = get_generated_text(file_content, args.gen_text_n)
+            if not generated_text:
+                print(
+                    f"WARNING: Could not detect generated text for file {file_path}, skipping"
+                )
+                continue
+
+            benchmarks_data.append({
+                "branch": branch,
+                **params,
+                **perf_data,
+                **generated_text
+            })
+
+    # Convert to a DataFrame and save
+    benchmarks_df = pd.DataFrame(benchmarks_data)
+    benchmarks_df.to_csv(args.output)
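The directory layout that get_benchmark_params parses is the one produced by run_model.sh below, i.e. results/${model}_${dtype}_${graphs}_${kernel}_${gpus}gpus_v${version}/<branch>.log. As a quick sanity check of that regex, here is a minimal sketch; the path is a hypothetical example, not output from a real run:

    import re

    root_dir = "./results"
    # Hypothetical log path following run_model.sh's naming scheme.
    file_path = "./results/gpt2_fp16_true_false_1gpus_v0/baseline.log"

    match = re.match(
        rf"{root_dir}\/(.+?)_(fp\d+)_(true|false)_(true|false)_(\d+)gpus_v(\d+)\/",
        file_path,
    )
    print(match.groups())  # ('gpt2', 'fp16', 'true', 'false', '1', '0')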
diff --git a/benchmarks/inference/gpt-bench.py b/benchmarks/inference/gpt-bench.py
index fde708f63c4c..af1370abad1f 100644
--- a/benchmarks/inference/gpt-bench.py
+++ b/benchmarks/inference/gpt-bench.py
@@ -1,3 +1,4 @@
+import os
 import torch
 import time
 import deepspeed
@@ -7,9 +8,26 @@
 parser = argparse.ArgumentParser()
 parser.add_argument("--model", "-m", type=str, help="hf model name")
 parser.add_argument("--deepspeed", action="store_true", help="use deepspeed inference")
-parser.add_argument("--dtype", type=str, default="fp16", help="fp16 or fp32")
+parser.add_argument("--dtype",
+                    type=str,
+                    default="fp16",
+                    choices=["fp16",
+                             "fp32",
+                             "int8"],
+                    help="int8, fp16, or fp32")
+parser.add_argument("--graphs", action="store_true", help="CUDA Graphs on")
+parser.add_argument("--kernel-inject", action="store_true", help="inject kernels on")
 parser.add_argument("--max-tokens", type=int, default=50, help="max new tokens")
-parser.add_argument("--local_rank", type=int, default=0, help="local rank")
+parser.add_argument("--local_rank",
+                    type=int,
+                    default=int(os.getenv("LOCAL_RANK",
+                                          "0")),
+                    help="local rank")
+parser.add_argument("--world_size",
+                    type=int,
+                    default=int(os.getenv("WORLD_SIZE",
+                                          "1")),
+                    help="world size")
 parser.add_argument("--trials", type=int, default=30, help="number of trials")
 args = parser.parse_args()
 
@@ -44,9 +62,17 @@ def print_latency(latency_set, title, warmup=3):
 
 deepspeed.init_distributed("nccl")
 
-print(args.model, args.max_tokens, args.dtype)
+if args.local_rank == 0:
+    print("BENCHMARK SETTINGS:")
+    print(f"\tMODEL: {args.model}")
+    print(f"\tMAX_TOKENS: {args.max_tokens}")
+    print(f"\tDTYPE: {args.dtype}")
+    print(f"\tCUDA_GRAPHS: {args.graphs}")
+    print(f"\tKERNEL_INJECT: {args.kernel_inject}")
 
-if args.dtype.lower() == "fp16":
+if args.dtype == "int8":
+    dtype = torch.int8
+elif args.dtype == "fp16":
     dtype = torch.float16
 else:
     dtype = torch.float32
@@ -56,26 +82,33 @@ def print_latency(latency_set, title, warmup=3):
                 framework="pt",
                 device=args.local_rank)
 
-if dtype == torch.half:
+if dtype == torch.float16:
     pipe.model.half()
 
 if args.deepspeed:
-    pipe.model = deepspeed.init_inference(pipe.model,
-                                          dtype=dtype,
-                                          replace_with_kernel_inject=True,
-                                          replace_method='auto')
+    pipe.model = deepspeed.init_inference(
+        pipe.model,
+        dtype=dtype,
+        mp_size=args.world_size,
+        replace_with_kernel_inject=args.kernel_inject,
+        replace_method="auto",
+        enable_cuda_graph=args.graphs,
+    )
 
 responses = []
 times = []
 for i in range(args.trials):
     torch.cuda.synchronize()
     start = time.time()
-    r = pipe("DeepSpeed is", max_new_tokens=args.max_tokens)
+    r = pipe("DeepSpeed is", do_sample=False, max_new_tokens=args.max_tokens)
     torch.cuda.synchronize()
     end = time.time()
     responses.append(r)
     times.append((end - start) / (args.max_tokens - 3))
 
-print_latency(times, "token latency")
-
-print(responses[0:3])
+if args.local_rank == 0:
+    print_latency(times, "token latency")
+    print("RESPONSE 0:")
+    print("-" * 30)
+    print(responses[0][0]["generated_text"])
+    print("-" * 30)
diff --git a/benchmarks/inference/run_model.sh b/benchmarks/inference/run_model.sh
index 5237f5bceddc..7b7c4b6988e1 100644
--- a/benchmarks/inference/run_model.sh
+++ b/benchmarks/inference/run_model.sh
@@ -3,22 +3,34 @@ set -ex
 model=$1
 branch1=$2
 branch2=$3
+dtype=$4
+graphs=$5
+kernel=$6
+gpus=$7
 
 version=0
-log_path=results/${model}_v${version}
+log_path=results/${model}_${dtype}_${graphs}_${kernel}_${gpus}gpus_v${version}
 mkdir -p ${log_path}
 
+params="--dtype $dtype "
+if [[ "$graphs" == "true" ]]; then
+    params+="--graphs "
+fi
+if [[ "$kernel" == "true" ]]; then
+    params+="--kernel-inject "
+fi
+
 echo "baseline $log_path"
-deepspeed --num_gpus 1 gpt-bench.py -m "${model}" &> ${log_path}/baseline.log
+deepspeed --num_gpus 1 gpt-bench.py -m "${model}" $params &> ${log_path}/baseline.log
 
 cd ../../
 git checkout ${branch1}
 cd -
 echo "ds ${branch1} $log_path"
-deepspeed --num_gpus 1 gpt-bench.py --deepspeed -m "${model}" &> ${log_path}/ds-${branch1}.log
+deepspeed --num_gpus $gpus gpt-bench.py --deepspeed -m "${model}" $params &> ${log_path}/ds-${branch1}.log
 
 cd ../../
 git checkout ${branch2}
 cd -
 echo "ds ${branch2} $log_path"
-deepspeed --num_gpus 1 gpt-bench.py --deepspeed -m "${model}" &> ${log_path}/ds-${branch2}.log
+deepspeed --num_gpus $gpus gpt-bench.py --deepspeed -m "${model}" $params &> ${log_path}/ds-${branch2}.log
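run_model.sh now takes seven positional arguments (model, branch1, branch2, dtype, graphs, kernel, gpus), so a single configuration can be exercised with something like bash run_model.sh gpt2 master my-branch fp16 true true 1, where the branch names are placeholders. The RESPONSE block that gpt-bench.py prints on rank 0 is what get_generated_text in collect_results.py recovers from each log; the minimal sketch below shows that round trip, using an illustrative response string rather than real model output:

    import re

    # Shape of the block emitted by gpt-bench.py: "RESPONSE <n>:" framed by 30-dash rules.
    log = "RESPONSE 0:\n" + "-" * 30 + "\nDeepSpeed is a deep learning optimization library.\n" + "-" * 30

    # collect_results.py flattens whitespace before matching.
    flat = log.replace("\n", " ").replace("\t", " ")
    matches = re.findall(r"RESPONSE\s(\d+):\s+[-]{30}\s+(.+?)\s+[-]{30}", flat)
    print(matches)  # [('0', 'DeepSpeed is a deep learning optimization library.')]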
diff --git a/benchmarks/inference/sweep.sh b/benchmarks/inference/sweep.sh
index 8a9bb41b5335..46d90435a8ce 100644
--- a/benchmarks/inference/sweep.sh
+++ b/benchmarks/inference/sweep.sh
@@ -5,22 +5,37 @@ export TRANSFORMERS_CACHE=/tmp/hf-cache
 branch1=$1
 branch2=$2
 
-for m in `echo "EleutherAI/gpt-neo-2.7B EleutherAI/gpt-neo-1.3B EleutherAI/gpt-neo-125M"`; do
-    bash run_model.sh $m $branch1 $branch2
-done
+gptneo_models="EleutherAI/gpt-neo-2.7B EleutherAI/gpt-neo-1.3B EleutherAI/gpt-neo-125M"
+gpt2_models="gpt2 gpt2-large gpt2-xl"
+gptj_models="EleutherAI/gpt-j-6B"
+opt_models="facebook/opt-125m facebook/opt-1.3b facebook/opt-2.7b facebook/opt-6.7b facebook/opt-13b"
+bloom_models="bigscience/bloom-560m bigscience/bloom-1b7 bigscience/bloom-3b bigscience/bloom-7b1"
 
-for m in `echo "gpt2 gpt2-large gpt2-xl"`; do
-    bash run_model.sh $m $branch1 $branch2
-done
+for gpus in `echo "1 2 4 8"`; do
+    for dtype in `echo "fp16 fp32"`; do
+        for graphs in `echo "true false"`; do
+            for kernel in `echo "true false"`; do
+                params="$dtype $graphs $kernel $gpus"
+                for m in `echo "$gptneo_models"`; do
+                    bash run_model.sh $m $branch1 $branch2 $params
+                done
 
-for m in `echo "EleutherAI/gpt-j-6B"`; do
-    bash run_model.sh $m $branch1 $branch2
-done
+                for m in `echo "$gpt2_models"`; do
+                    bash run_model.sh $m $branch1 $branch2 $params
+                done
 
-for m in `echo "facebook/opt-125m facebook/opt-1.3b facebook/opt-2.7b facebook/opt-6.7b facebook/opt-13b"`; do
-    bash run_model.sh $m $branch1 $branch2
-done
+                for m in `echo "$gptj_models"`; do
+                    bash run_model.sh $m $branch1 $branch2 $params
+                done
+
+                for m in `echo "$opt_models"`; do
+                    bash run_model.sh $m $branch1 $branch2 $params
+                done
 
-for m in `echo "bigscience/bloom-560m bigscience/bloom-1b7 bigscience/bloom-3b bigscience/bloom-7b1"`; do
-    bash run_model.sh $m $branch1 $branch2
+                for m in `echo "$bloom_models"`; do
+                    bash run_model.sh $m $branch1 $branch2 $params
+                done
+            done
+        done
+    done
 done
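A typical end-to-end flow is bash sweep.sh <branch1> <branch2> followed by python collect_results.py --results-dir ./results --version 0 --output ./results.csv. The sketch below is one way to compare branches from that CSV with pandas; it assumes the file exists at ./results.csv, and the exact latency-* column names depend on the titles gpt-bench.py passes to print_latency:

    import pandas as pd

    # Load the CSV written by collect_results.py.
    df = pd.read_csv("./results.csv")

    # Average every latency-* column per configuration and branch so the
    # baseline and the two DeepSpeed branches can be compared side by side.
    latency_cols = [c for c in df.columns if c.startswith("latency-")]
    summary = df.groupby(["model", "dtype", "graphs", "kernel", "gpus", "branch"])[latency_cols].mean()
    print(summary)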