diff --git a/src/deepsparse/transformers/eval_downstream.py b/src/deepsparse/transformers/eval_downstream.py
index ffe83aa5d0..8769a6bd88 100644
--- a/src/deepsparse/transformers/eval_downstream.py
+++ b/src/deepsparse/transformers/eval_downstream.py
@@ -73,6 +73,106 @@ from datasets import load_dataset, load_metric  # isort: skip
+import re
+import time
+
+from evaluate import load
+
+
+def truncate(completion):
+    # completion is the text generated by the codegen pipeline; cut it off at
+    # the second top-level print or def so only the first solution is kept
+    prints = list(re.finditer("^print", completion, re.MULTILINE))
+    if len(prints) > 1:
+        completion = completion[: prints[1].start()]
+
+    defs = list(re.finditer("^def", completion, re.MULTILINE))
+    if len(defs) > 1:
+        completion = completion[: defs[1].start()]
+
+    return completion
+
+
+def process_unit_test(sample):
+    # sample is one row from the HumanEval dataset; extract the name of the
+    # function defined in the prompt and append a call to the dataset's
+    # check() helper so the unit test runs against the prediction
+    unit_test = sample["test"]
+    function_start = sample["prompt"].find("def") + 4
+    function_end = sample["prompt"][function_start:].find("(")
+    function_name = sample["prompt"][function_start : function_start + function_end]
+    unit_test = unit_test + f"check({function_name})\n"
+
+    return unit_test
+
+
+def human_eval(args, dataset_name="openai_humaneval"):
+    # n is the number of predictions to generate for each task, where n >= k
+    # (the original paper uses k = [1, 10, 100])
+
+    # temperature is the sampling temperature used for every prediction
+    # (the original paper sweeps temperatures = [0.2, 0.6, 0.8])
+
+    # note: the original paper also uses nucleus sampling, which is not
+    # supported here but would be easy to add
+
+    import os
+
+    os.environ["HF_ALLOW_CODE_EVAL"] = "1"
+    os.environ["TOKENIZERS_PARALLELISM"] = "true"
+    os.environ["CUDA_VISIBLE_DEVICES"] = "0"
+
+    text_generation = Pipeline.create(
+        task="codegen",
+        model_path=args.model_path,
+        engine_type=args.engine,
+        num_cores=args.num_cores,
+        sequence_length=428,
+        prompt_processing_sequence_length=428,
+        max_generated_tokens=256,
+        deterministic=False,
+        sampling_temperature=args.temperature,
+    )
+
+    # load the full dataset in streaming mode to facilitate subset creation
+    full_dataset = load_dataset(dataset_name, split="test", streaming=True)
+
+    # default to the full dataset (164 tasks) if the user hasn't specified a limit
+    if not args.max_samples:
+        args.max_samples = 164
+
+    if args.benchmark_humaneval:
+        # a selection of tasks from the HumanEval dataset for faster evaluation;
+        # the selection keeps a variety of tasks based on their token lengths
+        benchmark_problems_tokenlen = {
+            "HumanEval/83": 46,
+            "HumanEval/35": 76,
+            "HumanEval/22": 95,
+            "HumanEval/146": 112,
+            "HumanEval/77": 121,
+            "HumanEval/33": 143,
+            "HumanEval/41": 160,
+            "HumanEval/113": 178,
+            "HumanEval/72": 236,
+            "HumanEval/115": 334,
+            "HumanEval/129": 428,
+        }
+        print("Creating Benchmark Dataset")
+        dataset_subset = full_dataset.filter(
+            lambda x: x["task_id"] in benchmark_problems_tokenlen
+        )
+        dataset_subset_len = len(benchmark_problems_tokenlen)
+    else:
+        # create a subset of the dataset starting at the "start" index and
+        # taking up to "max_samples" samples
+        print("Creating Subset from Dataset")
+        dataset_subset = full_dataset.skip(args.start).take(args.max_samples)
+        dataset_subset_len = args.max_samples
+
+    references = []
+    predictions = []
+
+    for idx, sample in _enumerate_progress(dataset_subset, dataset_subset_len):
+        sample_prompt = sample["prompt"]
+        sample_test = process_unit_test(sample)
+        sample_task_id = sample["task_id"]
+        print(f"sample_task_id: {sample_task_id}")
+
+        # generate n_solutions completions for the prompt and truncate each one
+        # down to its first generated function
+        sample_predictions = text_generation(
+            sequences=[sample_prompt] * args.n_solutions
+        )
+        sample_predictions = sample_predictions.sequences
+        for i in range(args.n_solutions):
+            sample_predictions[i] = truncate(sample_prompt + sample_predictions[i])
+
+        references.append(sample_test)
+        predictions.append(sample_predictions)
+
+    return references, predictions
 
 
 def perplexity_eval(args, batch_size=16, dataset_name="openai_humaneval"):
@@ -109,6 +209,25 @@ def perplexity_eval(args, batch_size=16, dataset_name="openai_humaneval"):
     return perplexity_metrics
 
 
+def select_openai_humaneval_method(args):
+    if args.humaneval_method == "pass_at_k":
+        start_time = time.time()
+        references, predictions = human_eval(args)
+        code_eval = load("code_eval")
+        # only report pass@k for k values that do not exceed the number of
+        # generated solutions per task
+        k_values = [k for k in (1, 2, 10) if k <= args.n_solutions]
+        pass_at_k, results = code_eval.compute(
+            references=references, predictions=predictions, k=k_values
+        )
+        print(f"\nopenai_humaneval evaluation results: {pass_at_k}")
+        end_time = time.time()
+        eval_time = f"Evaluation time: {end_time - start_time}\n"
+        file_name_detailed_result = (
+            f"result_humaneval_{args.start}_{args.start + args.max_samples - 1}"
+        )
+        with open(file_name_detailed_result, "w") as fp:
+            fp.write(eval_time)
+            json.dump(pass_at_k, fp)
+            json.dump(results, fp)
+    else:
+        perplexity_metrics = perplexity_eval(args)
+        return perplexity_metrics
+
+
 def qa_eval(args, dataset_name="squad"):
     # load validation dataset and eval tool
     dataset = load_dataset(dataset_name)["validation"]
@@ -474,7 +593,9 @@ def _split_train_val(train_dataset, val_ratio, seed=42):
     "imdb": imdb_eval,
     "conll2003": conll2003_eval,
     "go_emotions": go_emotions_eval,
-    "openai_humaneval": perplexity_eval,
+    "openai_humaneval": select_openai_humaneval_method,
 }
@@ -606,6 +727,47 @@ def parse_args():
         default=False,
     )
 
+    parser.add_argument(
+        "--humaneval-method",
+        default="perplexity",
+        choices=["perplexity", "pass_at_k"],
+        help="Whether to run perplexity evaluation or pass@k evaluation on the "
+        "openai_humaneval dataset. Default is perplexity.",
+        type=str,
+    )
+
+    parser.add_argument(
+        "--n-solutions",
+        help="The total number of solutions to generate for each code prompt of the "
+        "openai_humaneval dataset. Default is 1.",
+        type=int,
+        default=1,
+    )
+
+    parser.add_argument(
+        "--temperature",
+        help="Used with the openai_humaneval dataset - the sampling temperature to use "
+        "for generation. Default is 0.8.",
+        type=float,
+        default=0.8,
+    )
+
+    parser.add_argument(
+        "--benchmark-humaneval",
+        help="Set to run the evaluation on a smaller benchmark subset of the "
+        "openai_humaneval dataset. Default is False.",
+        default=False,
+        action="store_true",
+    )
+
+    parser.add_argument(
+        "--start",
+        help="Used only with the openai_humaneval dataset when the evaluation method is "
+        "pass_at_k and parallel processing is needed. Default is 0.",
+        type=int,
+        default=0,
+    )
+
     return parser.parse_args()
 
@@ -629,8 +791,8 @@ def _main(args):
         print(f"\nmnli eval results: {mnli_metrics}")
     else:
         metrics = SUPPORTED_DATASETS[dataset](args)
-
-        print(f"\n{dataset} eval results: {metrics.compute()}")
+        # pass@k results are printed and written to file inside
+        # select_openai_humaneval_method, so only call compute() otherwise
+        if not (dataset == "openai_humaneval" and args.humaneval_method == "pass_at_k"):
+            print(f"\n{dataset} eval results: {metrics.compute()}")
 
 
 def main():