diff --git a/federatedscope/llm/dataloader/dataloader.py b/federatedscope/llm/dataloader/dataloader.py
index d6f468dda..cdc9b02d5 100644
--- a/federatedscope/llm/dataloader/dataloader.py
+++ b/federatedscope/llm/dataloader/dataloader.py
@@ -279,6 +279,30 @@ def load_llm_dataset(config=None, **kwargs):
             list_data_dict[i]['output'] = \
                 list_data_dict[i]['output'].replace('####', 'The answer is')
         dataset = LLMDataset(list_data_dict, tokenizer)
+    elif dataset_name.lower() == "medical_tc":
+        fp = os.path.join(config.data.root, 'medical_tc_train.jsonl')
+        if not os.path.exists(fp):
+            # download_url saves the file under its URL basename, which
+            # already equals fp, so no rename is needed afterwards
+            download_url(
+                'https://federatedscope.oss-cn-beijing.aliyuncs.com/FS-LLM'
+                '/medical_tc_train.jsonl', config.data.root)
+        list_data_dict = load_jsonl(fp,
+                                    instruction='instruction',
+                                    input='input',
+                                    output='output',
+                                    category='output')
+        dataset = LLMDataset(list_data_dict, tokenizer)
+    elif dataset_name.lower() == "finance":
+        fp = os.path.join(config.data.root, 'finance_train_data.jsonl')
+        if not os.path.exists(fp):
+            download_url(
+                'https://federatedscope.oss-cn-beijing.aliyuncs.com/FS-LLM'
+                '/finance_train_data.jsonl', config.data.root)
+        list_data_dict = load_jsonl(fp,
+                                    instruction='instruction',
+                                    input='input',
+                                    output='output',
+                                    category='category')
+        dataset = LLMDataset(list_data_dict, tokenizer)
     elif dataset_name.lower() == 'code_search_net':
         from tqdm import tqdm
         from federatedscope.llm.dataset.code_search_net import \
diff --git a/federatedscope/llm/eval/eval_for_finance/eval.py b/federatedscope/llm/eval/eval_for_finance/eval.py
new file mode 100644
index 000000000..92711697f
--- /dev/null
+++ b/federatedscope/llm/eval/eval_for_finance/eval.py
@@ -0,0 +1,180 @@
+import os
+import torch
+import numpy as np
+import pandas as pd
+import json
+import transformers
+
+from federatedscope.core.configs.config import global_cfg
+from federatedscope.core.cmd_args import parse_args, parse_client_cfg
+from federatedscope.core.auxiliaries.utils import setup_seed
+from federatedscope.core.auxiliaries.logging import update_logger
+from federatedscope.llm.misc.fschat import FSChatBot
+from federatedscope.core.data.utils import download_url
+
+# import torch._dynamo
+# torch._dynamo.config.suppress_errors = True
+
+transformers.logging.set_verbosity(40)
+
+choices = ["A", "B", "C", "D"]
+
+
+def format_subject(subject):
+    ll = subject.split("_")
+    s = ""
+    for entry in ll:
+        s += " " + entry
+    return s
+
+
+def format_example(df, idx, include_answer=True):
+    prompt = df.iloc[idx, 0]
+    k = df.shape[1] - 2
+    for j in range(k):
+        prompt += "\n{}. {}".format(choices[j], df.iloc[idx, j + 1])
+    prompt += "\nAnswer:"
+    if include_answer:
+        prompt += " {}\n\n".format(df.iloc[idx, k + 1])
+    return prompt
+
+
+def gen_prompt(train_df, subject, k=-1):
+    prompt = "The following are multiple choice questions " \
+             "(with answers) about {}.\n\n".format(format_subject(subject))
+    if k == -1:
+        k = train_df.shape[0]
+    for i in range(k):
+        prompt += format_example(train_df, i)
+    return prompt
+
+
+@torch.no_grad()
+def eval(subject, model, tokenizer, test_df, device):
+    cors = []
+    all_probs = []
+
+    for i in range(test_df.shape[0]):
+        # get prompt and make sure it fits
+        prompt = format_example(test_df, i, include_answer=False)
+
+        input_ids = tokenizer(
+            prompt,
+            return_tensors="pt",
+            max_length=tokenizer.model_max_length,
+        ).input_ids.to(device)
+
+        # Keep only the last 1024 tokens of over-long prompts; re-tokenizing
+        # the unchanged prompt in a loop would never shrink it
+        if input_ids.shape[-1] > 1024:
+            input_ids = input_ids[:, -1024:]
+
+        label = test_df.iloc[i, test_df.shape[1] - 1]
+
+        logits = model(input_ids=input_ids).logits[0, -1]
+
+        probs = (torch.nn.functional.softmax(
+            torch.tensor([
+                logits[tokenizer("A").input_ids[-1]],
+                logits[tokenizer("B").input_ids[-1]],
+                logits[tokenizer("C").input_ids[-1]],
+                logits[tokenizer("D").input_ids[-1]],
+            ]).float(),
+            dim=0,
+        ).detach().cpu().numpy())
+        pred = {0: "A", 1: "B", 2: "C", 3: "D"}[np.argmax(probs)]
+
+        cor = pred == label
+        cors.append(cor)
+        all_probs.append(probs)
+
+    acc = np.mean(cors)
+    cors = np.array(cors)
+
+    all_probs = np.array(all_probs)
+    print("Average accuracy {:.3f} - {}".format(acc, subject))
+
+    return cors, acc, all_probs
+
+
+def main():
+    init_cfg = global_cfg.clone()
+    args = parse_args()
+
+    if args.cfg_file:
+        init_cfg.merge_from_file(args.cfg_file)
+    cfg_opt, client_cfg_opt = parse_client_cfg(args.opts)
+    init_cfg.merge_from_list(cfg_opt)
+
+    update_logger(init_cfg, clear_before_add=True)
+    setup_seed(init_cfg.seed)
+
+    # load your finetuned model (saved as xxx.ckpt)
+    # in yaml file federate.save_to
+    fschatbot = FSChatBot(init_cfg)
+    tokenizer = fschatbot.tokenizer
+    model = fschatbot.model
+    device = fschatbot.device
+
+    if not os.path.exists(os.path.join(init_cfg.data.root, "FinEval")):
+        download_url(
+            "https://federatedscope.oss-cn-beijing.aliyuncs.com/FS"
+            "-LLM/FinEval.zip", init_cfg.data.root)
+        print("Please unzip the file and rerun")
+        return
+
+    data_dir = os.path.join(init_cfg.data.root, "FinEval")
+    eval_dir = "finance_eval_result"
+
+    subjects = sorted([
+        f.split("_dev.csv")[0]
+        for f in os.listdir(os.path.join(data_dir, "dev")) if "_dev.csv" in f
+    ])
+
+    if not os.path.exists(eval_dir):
+        os.makedirs(eval_dir)
+    if not os.path.exists(
+            os.path.join(eval_dir, "results_{}".format(
+                init_cfg.federate.save_to))):
+        os.makedirs(
+            os.path.join(eval_dir,
+                         "results_{}".format(init_cfg.federate.save_to)))
+
+    all_cors = []
+
+    for subject in subjects:
+        test_df = pd.read_csv(os.path.join(data_dir, "dev",
+                                           subject + "_dev.csv"),
+                              header=None)
+        test_df = test_df.iloc[:, 1:7]
+
+        cors, acc, probs = eval(subject, model, tokenizer, test_df, device)
+        all_cors.append(cors)
+
+        test_df["{}_correct".format(init_cfg.federate.save_to)] = cors
+        for j in range(probs.shape[1]):
+            choice = choices[j]
+            test_df["{}_choice{}_probs".format(init_cfg.federate.save_to,
+                                               choice)] = probs[:, j]
+        test_df.to_csv(
+            os.path.join(eval_dir,
+                         "results_{}".format(init_cfg.federate.save_to),
+                         "{}.csv".format(subject)),
+            index=None,
+        )
+
+    results = {"subcategories": {}, "categories": {}}
+
+    weighted_acc = np.mean(np.concatenate(all_cors))
+    results["weighted_accuracy"] = weighted_acc
+    print("Average accuracy: {:.3f}".format(weighted_acc))
+
+    results_file = os.path.join(
+        eval_dir, "accuracies_{}.json".format(
+            init_cfg.federate.save_to.replace("/", "_")))
+    with open(results_file, "w") as f:
+        json.dump(results, f)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/federatedscope/llm/eval/eval_for_medical/eval.py b/federatedscope/llm/eval/eval_for_medical/eval.py
new file mode 100644
index 000000000..c0a26ed47
--- /dev/null
+++ b/federatedscope/llm/eval/eval_for_medical/eval.py
@@ -0,0 +1,75 @@
+import os
+
+import numpy as np
+import transformers
+from tqdm import tqdm
+
+from federatedscope.core.configs.config import global_cfg
+from federatedscope.core.cmd_args import parse_args, parse_client_cfg
+from federatedscope.core.auxiliaries.utils import setup_seed
+from federatedscope.core.auxiliaries.logging import update_logger
+from federatedscope.core.data.utils import download_url
+from federatedscope.llm.dataloader.dataloader import load_jsonl
+from federatedscope.llm.misc.fschat import FSChatBot
+
+transformers.logging.set_verbosity(40)
+
+DEBUG = False
+
+
+def is_correct(model_answer, answer):
+    return model_answer == answer
+
+
+def main():
+    init_cfg = global_cfg.clone()
+    args = parse_args()
+
+    if args.cfg_file:
+        init_cfg.merge_from_file(args.cfg_file)
+    cfg_opt, client_cfg_opt = parse_client_cfg(args.opts)
+    init_cfg.merge_from_list(cfg_opt)
+
+    update_logger(init_cfg, clear_before_add=True)
+    setup_seed(init_cfg.seed)
+
+    # load your finetuned model (saved as xxx.ckpt)
+    # in yaml file federate.save_to
+    fschatbot = FSChatBot(init_cfg)
+
+    # Get test file; download_url saves it under its URL basename,
+    # which already equals fp
+    fp = os.path.join(init_cfg.data.root, "medical_tc_test.jsonl")
+    if not os.path.exists(fp):
+        download_url(
+            'https://federatedscope.oss-cn-beijing.aliyuncs.com/FS-LLM'
+            '/medical_tc_test.jsonl', init_cfg.data.root)
+
+    list_data_dict = load_jsonl(fp,
+                                instruction='instruction',
+                                input='input',
+                                output='output',
+                                category='output')
+
+    answers = []
+    for sample in tqdm(list_data_dict):
+        input_text = sample['instruction'] + sample["input"]
+        generate_kwargs = dict(max_new_tokens=256, top_p=0.95, temperature=0.8)
+        model_answer = fschatbot.generate(input_text, generate_kwargs)
+
+        # Compare only the first characters, i.e. the class labels
+        is_cor = is_correct(model_answer[0], sample['output'][0])
+        answers.append(is_cor)
+        if DEBUG:
+            print(f'Full input_text:\n{input_text}\n\n')
+            print(f'Question: {sample["instruction"]}\n\n'
+                  f'Answers: {sample["output"]}\n\n'
+                  f'Model Answers: {model_answer}\n\n'
+                  f'Is correct: {is_cor}\n\n')
+
+    print(f'Num of total questions: {len(answers)}, '
+          f'correct num: {sum(answers)}, '
+          f'correct rate: {float(sum(answers))/len(answers)}.')
+
+
+if __name__ == "__main__":
+    main()
diff --git a/federatedscope/llm/eval/eval_for_mmlu/eval.py b/federatedscope/llm/eval/eval_for_mmlu/eval.py
index 68a59be87..97e897e68 100644
--- a/federatedscope/llm/eval/eval_for_mmlu/eval.py
+++ b/federatedscope/llm/eval/eval_for_mmlu/eval.py
@@ -16,6 +16,9 @@ from federatedscope.core.data.utils import download_url
 
 import tarfile
 
+# import torch._dynamo
+# torch._dynamo.config.suppress_errors = True
+
 transformers.logging.set_verbosity(40)
 
 choices = ["A", "B", "C", "D"]
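
Usage sketch (an illustration under stated assumptions, not part of the patch):
both new datasets are parsed by `load_jsonl`, so every line of
`medical_tc_train.jsonl` / `finance_train_data.jsonl` must be a standalone JSON
object carrying the keys mapped above; "finance" additionally uses a separate
`category` field. A minimal record, with made-up field values, would look like:

    {"instruction": "Choose the correct answer.", "input": "Q: ...", "output": "A", "category": "finance_qa"}

Assuming the new evaluators are launched the same way as the existing MMLU
script, with `federate.save_to` in the YAML pointing at the finetuned
checkpoint, a run would look like:

    python federatedscope/llm/eval/eval_for_finance/eval.py --cfg <sft_config>.yaml
    python federatedscope/llm/eval/eval_for_medical/eval.py --cfg <sft_config>.yaml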