diff --git a/.gitignore b/.gitignore
index 23d398a..4804852 100644
--- a/.gitignore
+++ b/.gitignore
@@ -149,7 +149,10 @@ test/
 # Logs
 logs/*
 !logs/.gitkeep
+log/*
 
 # database
 db/*
-!db/.gitkeep
\ No newline at end of file
+!db/.gitkeep
+
+biji.txt
\ No newline at end of file
diff --git a/green_bit_llm/evaluation/README.md b/green_bit_llm/evaluation/README.md
index 7d10140..079f015 100644
--- a/green_bit_llm/evaluation/README.md
+++ b/green_bit_llm/evaluation/README.md
@@ -44,7 +44,11 @@ We have released over 200 highly precise 2.2/2.5/3/4-bit models across the moder
 ### PPL Evaluation
 
 ```bash
-python -m green_bit_llm.evaluation.evaluate --model GreenBitAI/Qwen-1.5-4B-layer-mix-bpw-3.0 --trust-remote-code --eval-ppl --ppl-tasks wikitext2,c4,ptb
+python -m green_bit_llm.evaluation.evaluate --model GreenBitAI/Qwen-1.5-4B-layer-mix-bpw-3.0 --backend greenbit-engine --trust-remote-code --eval-ppl --ppl-tasks wikitext2,c4,ptb
+```
+or
+```bash
+python -m green_bit_llm.evaluation.evaluate --model models/Qwen2.5-7B-Instruct --trust-remote-code --backend vllm --eval-ppl --ppl-tasks wikitext2,c4,ptb
 ```
 
 | **Repository** | **Method** | **Avg bits** | **wikitext 2 (2048)** | **c4 (2048)** |
diff --git a/green_bit_llm/evaluation/evaluate.py b/green_bit_llm/evaluation/evaluate.py
index f7f9eaa..7b1733e 100644
--- a/green_bit_llm/evaluation/evaluate.py
+++ b/green_bit_llm/evaluation/evaluate.py
@@ -21,16 +21,20 @@
 from pathlib import Path
 
 from lm_eval import evaluator
-
+from vllm.model_executor.layers.logits_processor import _apply_logits_processors
+from vllm import LLM, SamplingParams
 import warnings
 warnings.filterwarnings('ignore')
 
+
+
 # default value for arguments
 DEFAULT_MODEL_PATH = "GreenBitAI/Qwen-1.5-1.8B-layer-mix-bpw-2.2"
 DEFAULT_SEQLEN = 2048
 DEFAULT_RANDOM_SEED = 0
 DTYPE = torch.half
+DEFAULT_MODEL_BACKEND = ["vllm", "greenbit-engine"]
 
 replace_peft_lora_model_with_gba_lora_model()
 
@@ -203,6 +207,18 @@
         help="Specify lora dir for lora merge"
     )
+    parser.add_argument(
+        "--backend",
+        type=str,
+        default="vllm",
+        help="Specify the model inference backend, one of [vllm, greenbit-engine]."
+    )
+    parser.add_argument(
+        "--gpu-memory-utilization",
+        type=float,
+        default=0.8,
+        help="Only used with the vllm backend."
+    )
     return parser
 
 
@@ -212,10 +228,10 @@ def create_device_map(cuda_device_id):
     device_map = {f"cuda:{id}" for id in ids}
     return device_map
 
-def main(args):
+def evaluate_green_bit_engine(args):
     if not os.path.exists(Path(args.save_dir)):
         os.mkdir(Path(args.save_dir))
-
+
     # Building configs
     tokenizer_config = {"trust_remote_code": True if args.trust_remote_code else None}
     pretrain_model_config = {
@@ -225,7 +241,7 @@
     if args.eos_token is not None:
         tokenizer_config["eos_token"] = args.eos_token
-
+
     model, tokenizer, config = load(
         args.model,
         tokenizer_config=tokenizer_config,
@@ -235,7 +251,7 @@
         model_config=pretrain_model_config,
         requires_grad=False
     )
-
+
     if args.lora_dir is not None:
         config = LoraConfig(
             r=64,
@@ -258,7 +274,97 @@
 
     eval_results = {"{}".format(args.model): eval_results}
 
-    add_dict_to_json_file(file_path="{}".format(os.path.join(args.save_dir, "eval_results.json")), new_data=eval_results)
+    add_dict_to_json_file(file_path="{}".format(os.path.join(args.save_dir, "eval_greenbit_engine_results.json")), new_data=eval_results)
+
+
+def evaluate_vllm(args):
+    logits_list = []
+
+    def forward_hook(module, input, output):
+        # Recompute the logits from the hook inputs and keep a copy so that
+        # perplexity can be calculated outside of vLLM's sampling path.
+        lm_head, hidden_states, sampling_metadata, *embedding_bias = input
+        embedding_bias = embedding_bias[0] if embedding_bias else None
+        logits = module._get_logits(hidden_states, lm_head, embedding_bias)
+        if logits is not None:
+            if module.soft_cap is not None:
+                logits = logits / module.soft_cap
+                logits = torch.tanh(logits)
+                logits = logits * module.soft_cap
+            if module.scale != 1.0:
+                logits *= module.scale
+            logits = _apply_logits_processors(logits, sampling_metadata)
+            logits_list.append(logits)
+        return output
+
+    @torch.no_grad()
+    def calculate_ppl(model, testenc, seqlen, device='cuda'):
+        nsamples = testenc.numel() // seqlen
+        nlls = []
+
+        sampling_params = SamplingParams(
+            temperature=1.0,
+            max_tokens=1,
+            logprobs=None
+        )
+
+        for i in tqdm(range(nsamples)):
+            logits_list.clear()
+            batch = testenc[:, (i * seqlen):((i + 1) * seqlen)]
+            outputs = model.generate(prompts=None, prompt_token_ids=batch.tolist(), sampling_params=sampling_params)
+            logits = logits_list[0].to(device)
+            logits = logits.unsqueeze(0)
+            shift_logits = logits[:, :-1, :]
+            shift_labels = testenc[:, (i * seqlen): ((i + 1) * seqlen)][
+                :, 1:
+            ].to(device)
+            loss_fct = nn.CrossEntropyLoss()
+            loss = loss_fct(
+                shift_logits.view(-1, shift_logits.size(-1)),
+                shift_labels.view(-1),
+            )
+            neg_log_likelihood = loss.float() * seqlen
+            nlls.append(neg_log_likelihood)
+        ppl = torch.exp(torch.stack(nlls).sum() / (nsamples * seqlen))
+        return ppl.item()
+
+    print(f"Loading model from {args.model}")
+    model = LLM(
+        model=args.model,
+        trust_remote_code=args.trust_remote_code,
+        gpu_memory_utilization=args.gpu_memory_utilization
+    )
+    model.llm_engine.model_executor.driver_worker.model_runner.model.logits_processor.register_forward_hook(forward_hook)
+
+    results = {}
+    logger = create_logger(Path(args.save_dir))
+    if args.eval_ppl:
+        for dataset in args.ppl_tasks.split(","):
+            # print(f"\nEvaluating {dataset}...")
+            dataloader, testloader = get_loaders(
+                dataset.strip(),
+                seed=args.seed,
+                model=args.model,
+                seqlen=args.seqlen,
+            )
+
+            if "c4" in dataset:
+                testenc = testloader
+            else:
+                testenc = testloader.input_ids
+
+            ppl = calculate_ppl(model, testenc, args.seqlen)
+            logger.info(f'{dataset} : {ppl}')
+            results[dataset] = ppl
+
+    eval_results = {args.model: results}
+
"eval_vllm_results.json")), new_data=eval_results) + +def main(args): + if args.backend not in DEFAULT_MODEL_BCKEND: + print(f"Backend is error, please set the backend from {DEFAULT_MODEL_BCKEND}") + exit(-1) + if args.backend == "vllm": + evaluate_vllm(args) + elif args.backend == "greenbit-engine": + evaluate_green_bit_engine(args) if __name__ == "__main__": if not torch.cuda.is_available(): diff --git a/green_bit_llm/inference/backends/__init__.py b/green_bit_llm/inference/backends/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/green_bit_llm/inference/backends/base.py b/green_bit_llm/inference/backends/base.py new file mode 100644 index 0000000..8954178 --- /dev/null +++ b/green_bit_llm/inference/backends/base.py @@ -0,0 +1,7 @@ +import abc + + +class BaseInferenceBackend: + @abc.abstractmethod + def generate(self, prompt, params): + pass diff --git a/green_bit_llm/inference/backends/green_bit_backend.py b/green_bit_llm/inference/backends/green_bit_backend.py new file mode 100644 index 0000000..d44f5bf --- /dev/null +++ b/green_bit_llm/inference/backends/green_bit_backend.py @@ -0,0 +1,58 @@ +from green_bit_llm.inference.sim_gen import DTYPE +from .base import BaseInferenceBackend +import os + +import torch +import torch.nn as nn + +import warnings +warnings.filterwarnings("ignore", category=UserWarning, module='torch.nn.modules.module') + +from transformers import PreTrainedTokenizer + +from green_bit_llm.common import generate, load +from green_bit_llm.args_parser import setup_shared_arg_parser + +# default value for arguments +DEFAULT_PROMPT = None +DEFAULT_MAX_TOKENS = 100 +DEFAULT_TEMP = 0.8 +DEFAULT_TOP_P = 0.95 +DTYPE = torch.half + +class GBLLMInferenceBackend(BaseInferenceBackend): + def __init__(self, model_path, **kwargs): + # Building configs + tokenizer_config = {"trust_remote_code": True if kwargs.get("trust_remote_code") else None} + pretrain_model_config = { + "trust_remote_code": True if kwargs.get("trust_remote_code") else None, + "attn_implementation": "flash_attention_2" if kwargs.get("use_flash_attention_2") else None + } + if kwargs.get("eos_token") is not None: + tokenizer_config["eos_token"] = kwargs.get("eos_token") + + self.model, self.tokenizer, config = load( + model_path, + tokenizer_config=tokenizer_config, + dtype=kwargs.get("dtype", DTYPE), + device_map=kwargs.get("auto", "auto"), + seqlen=kwargs.get("seqlen", 2048), + model_config=pretrain_model_config, + requires_grad=False + ) + + def generate(self, prompt, params=None): + if params == None: + params = {} + if isinstance(prompt, str): + prompt = [prompt] + for prom in prompt: + generate( + self.model, + self.tokenizer, + prom, + params.get("temperature", DEFAULT_TEMP), + params.get("max_tokens", DEFAULT_MAX_TOKENS), + True, + params.get("top_p", DEFAULT_TOP_P), + ) \ No newline at end of file diff --git a/green_bit_llm/inference/backends/vllm_backend.py b/green_bit_llm/inference/backends/vllm_backend.py new file mode 100644 index 0000000..496c31b --- /dev/null +++ b/green_bit_llm/inference/backends/vllm_backend.py @@ -0,0 +1,18 @@ +from vllm import LLM +from .base import BaseInferenceBackend + +class VLLMInferenceBackend(BaseInferenceBackend): + def __init__(self, model_path, **kwargs): + self.model = LLM(model_path, **kwargs) + + def do_generate(self, prompt, params): + outputs = self.model.generate(prompt, params) + return outputs + + def generate(self, prompt, params=None): + if isinstance(prompt, str): + prompt = [prompt] + outputs = self.do_generate(prompt, params) + for 
+        for i, output in enumerate(outputs):
+            print("Prompt:", prompt[i])
+            print("Generated text:", output.outputs[0].text)
\ No newline at end of file
diff --git a/green_bit_llm/inference/demo.py b/green_bit_llm/inference/demo.py
new file mode 100644
index 0000000..e69de29
diff --git a/green_bit_llm/inference/llm_inference.py b/green_bit_llm/inference/llm_inference.py
new file mode 100644
index 0000000..e69de29
diff --git a/third_party/vllm/requirements-common.txt b/third_party/vllm/requirements-common.txt
new file mode 100644
index 0000000..ef5ed8b
--- /dev/null
+++ b/third_party/vllm/requirements-common.txt
@@ -0,0 +1,34 @@
+psutil
+sentencepiece # Required for LLaMA tokenizer.
+numpy < 2.0.0
+requests >= 2.26.0
+tqdm
+py-cpuinfo
+transformers >= 4.45.2 # Required for Llama 3.2 and Qwen2-VL.
+tokenizers >= 0.19.1 # Required for Llama 3.
+protobuf # Required by LlamaTokenizer.
+fastapi >= 0.107.0, < 0.113.0; python_version < '3.9'
+fastapi >= 0.107.0, != 0.113.*, != 0.114.0; python_version >= '3.9'
+aiohttp
+openai >= 1.45.0 # Ensure modern openai package (ensure types module present and max_completion_tokens field support)
+uvicorn[standard]
+pydantic >= 2.9 # Required for fastapi >= 0.113.0
+pillow # Required for image processing
+prometheus_client >= 0.18.0
+prometheus-fastapi-instrumentator >= 7.0.0
+tiktoken >= 0.6.0 # Required for DBRX tokenizer
+lm-format-enforcer == 0.10.6
+outlines >= 0.0.43, < 0.1
+typing_extensions >= 4.10
+filelock >= 3.10.4 # filelock starts to support `mode` argument from 3.10.4
+partial-json-parser # used for parsing partial JSON outputs
+pyzmq
+msgspec
+gguf == 0.10.0
+importlib_metadata
+mistral_common[opencv] >= 1.4.4
+pyyaml
+six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12
+setuptools>=74.1.1; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12
+einops # Required for Qwen2-VL.
+compressed-tensors == 0.7.1 # required for compressed-tensors
diff --git a/third_party/vllm/requirements-cuda.txt b/third_party/vllm/requirements-cuda.txt
new file mode 100644
index 0000000..058ab7c
--- /dev/null
+++ b/third_party/vllm/requirements-cuda.txt
@@ -0,0 +1,10 @@
+# Common dependencies
+-r requirements-common.txt
+
+# Dependencies for NVIDIA GPUs
+ray >= 2.9
+nvidia-ml-py >= 12.560.30 # for pynvml package
+torch == 2.5.1
+# These must be updated alongside torch
+torchvision == 0.20.1 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
+xformers == 0.0.28.post3; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch 2.5.1
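Note: `demo.py` and `llm_inference.py` are added empty in this diff. Purely as an illustration of how the two new backend classes expose the same `generate` interface (not part of the change itself), a small driver could look like the sketch below. The `build_backend` helper, the prompt, and the sampling values are assumptions; the model path reuses the `DEFAULT_MODEL_PATH` from `evaluate.py`. Note that `VLLMInferenceBackend` forwards `params` straight to `vllm.LLM.generate` and therefore expects a `SamplingParams` object, while `GBLLMInferenceBackend` reads plain dict keys.

```python
# Illustrative sketch only -- not part of this diff. Model path, prompt, and
# sampling values are placeholders; adjust them to what is available locally.
import argparse

from vllm import SamplingParams

from green_bit_llm.inference.backends.green_bit_backend import GBLLMInferenceBackend
from green_bit_llm.inference.backends.vllm_backend import VLLMInferenceBackend


def build_backend(name: str, model_path: str, **kwargs):
    # Mirrors the backend selection added to evaluate.py's main().
    if name == "greenbit-engine":
        return GBLLMInferenceBackend(model_path, **kwargs)
    if name == "vllm":
        return VLLMInferenceBackend(model_path, **kwargs)
    raise ValueError(f"Unknown backend: {name}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--backend", default="vllm", choices=["vllm", "greenbit-engine"])
    parser.add_argument("--model", default="GreenBitAI/Qwen-1.5-1.8B-layer-mix-bpw-2.2")
    args = parser.parse_args()

    backend = build_backend(args.backend, args.model, trust_remote_code=True)

    prompt = "Explain perplexity in one sentence."
    if args.backend == "vllm":
        # Passed through to vllm.LLM.generate as sampling_params.
        params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=100)
    else:
        # Read via params.get(...) inside GBLLMInferenceBackend.generate.
        params = {"temperature": 0.8, "top_p": 0.95, "max_tokens": 100}

    # Both backends print their generations internally.
    backend.generate(prompt, params)
```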