vllm-project
diff --git a/‎src/.lycheeignore‎ renamed to ‎.lycheeignore‎ b/‎src/.lycheeignore‎ renamed to ‎.lycheeignore‎
diff --git a/‎examples/eagle3/apply_eagle3_eagle.sh‎
Lines changed: 1 addition & 0 deletions b/‎examples/eagle3/apply_eagle3_eagle.sh‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎pyproject.toml‎
Lines changed: 2 additions & 0 deletions b/‎pyproject.toml‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎research/hass/README.md‎
Lines changed: 29 additions & 0 deletions b/‎research/hass/README.md‎
Lines changed: 29 additions & 0 deletions
diff --git a/‎research/hass/__init__.py‎ b/‎research/hass/__init__.py‎
diff --git a/‎research/hass/data/mt_bench/question.jsonl‎
Lines changed: 80 additions & 0 deletions b/‎research/hass/data/mt_bench/question.jsonl‎
Lines changed: 80 additions & 0 deletions
diff --git a/‎research/hass/ge_data/__init__.py‎ b/‎research/hass/ge_data/__init__.py‎
diff --git a/‎research/hass/ge_data/allocation.py‎
Lines changed: 99 additions & 0 deletions b/‎research/hass/ge_data/allocation.py‎
Lines changed: 99 additions & 0 deletions
diff --git a/‎research/hass/ge_data/ge_data_all_llama31.py‎
Lines changed: 205 additions & 0 deletions b/‎research/hass/ge_data/ge_data_all_llama31.py‎
Lines changed: 205 additions & 0 deletions
@@ -0,0 +1 @@
+speculators convert --eagle3 yuhuili/EAGLE3-LLaMA3.1-Instruct-8B eagle3-llama-3.1-8b-instruct-converted meta-llama/Meta-Llama-3.1-8B-Instruct --validate
@@ -228,6 +228,8 @@ select = [
     "E722",  # allow bare exception catching
     "C90",  # ignore code complexity errors
     "PGH004",  # allow general style ignores
+    "ERA001", # allow commented-out code
+    "N80",  # allow upper case names
 ]
 
 [tool.ruff.lint.isort]
 
@@ -0,0 +1,29 @@
+# Train Time Test HASS
+#### This code uses the HASS method (https://github.com/HArmonizedSS/HASS) to train models that are a variation on the Eagle 1 architecture.  The original Eagle code can be found here: https://github.com/SafeAILab/EAGLE
+
+## To Run:
+
+The training process is broken up into two steps: first you generate data from the large model, and then you train the drafter model. It works for Llama 3.1 8B Instruct, Llama 3.3 70B Instruct, and Mistral Small 24B 2503.
+
+
+### Data Generation step:
+
+1. Modify the directory names and arguments in `gen_data.sh`
+2. You can get the data for ShareGPT at:  Aeala/ShareGPT_Vicuna_unfiltered.  Ultrachat will be automatically downloaded.
+3. Make sure that the system prompts and chat template demarcation in the desired file (ultrachat.py or sharegpt.py, ultrachatMistral.py or sharegptMistral.py) are correct
+4. Run the script: `./gen_data.sh`
+5. Run for each of: sharegpt, and ultrachat sft and gen splits.
+
+Note: For Llama 3.1 8B this will generate ~4 TB of data on your system. The training script searches your data directory recursively, so the internal structure of that directory does not matter.
+
+### Run training
+1. Modify the directory names and arguments in `train.sh`
+2. Run `./train.sh`
+
+### Serve the model with vllm:
+1. Convert your saved model with: `convert.sh`
+2. Serve the model with: ` VLLM_USE_V1=1 vllm serve meta-llama/Llama-3.1-8B-Instruct --seed 42 -tp 1 --speculative-config '{"model": "llama_eagle", "num_speculative_tokens": 3, "method":"eagle3", "draft_tensor_parallel_size":1}'`
+
+
+
+### TODO:
+1. Throw an error if you attempt to create a model that will not be supported in vllm - with the wrong configuration of heads etc.
@@ -0,0 +1,99 @@
+# This file is adapted from https://github.com/HArmonizedSS/HASS (arxiv: https://arxiv.org/abs/2408.15766)
+# Which is a fork of the Eagle repository: https://github.com/SafeAILab/EAGLE (arxiv: https://arxiv.org/abs/2401.15077)
+
+
+import argparse
+import os
+from concurrent.futures import ThreadPoolExecutor
+
+# Command-line interface of the launcher: where the data and model live, how
+# many samples to process and how the available GPUs are shared out.
+parser = argparse.ArgumentParser(description="sp")
+parser.add_argument("--outdir", type=str, default="0")
+parser.add_argument("--data_path", type=str, default="0")
+parser.add_argument("--model_path", type=str, default="0")
+parser.add_argument("--dataset", type=str, default="ultrachat")
+parser.add_argument("--total_gpus", type=int, default=8)
+parser.add_argument("--gpus_per_model", type=int, default=1)
+parser.add_argument("--samples", type=int, default=68000)
+parser.add_argument("--split", type=str, default="sft")
+parser.add_argument("--chat_template", type=str, default="llama")
+
+
+args = parser.parse_args()
+
+
+# split_range() treats `end` as an inclusive index, so the last sample index is
+# `samples - 1`. Passing `samples` itself made the final worker's exclusive
+# upper bound `samples + 1`, i.e. one row more than requested.
+s = 0
+e = args.samples - 1
+# One GPU group per worker process. The last group is clamped to
+# `total_gpus` so it never lists device ids that were not requested when
+# `total_gpus` is not a multiple of `gpus_per_model`.
+gpus = [
+    list(range(i, min(i + args.gpus_per_model, args.total_gpus)))
+    for i in range(0, args.total_gpus, args.gpus_per_model)
+]
+
+
+# Number of worker processes to launch (one per GPU group).
+num_p = len(gpus)
+outdir = args.outdir
+
+
+def split_range(start, end, n, over=False):
+    """Cut the inclusive range ``[start, end]`` into ``n`` consecutive intervals.
+
+    The ``end - start + 1`` items are distributed as evenly as possible; when
+    the count is not divisible by ``n`` the leading intervals each receive one
+    extra item.
+
+    Args:
+        start: First index of the range.
+        end: Last index of the range (inclusive).
+        n: Number of intervals to produce.
+        over: When true, each tuple's second element is an exclusive upper
+            bound (it equals the next interval's first element). Otherwise it
+            is the inclusive last index of the interval.
+
+    Returns:
+        A list of ``n`` ``(first, last_or_stop)`` tuples.
+    """
+    total = end - start + 1
+    quotient, remainder = divmod(total, n)
+    # Inclusive upper bounds sit one below the exclusive ones.
+    bound_shift = 0 if over else 1
+
+    intervals = []
+    cursor = start
+    for slot in range(n):
+        width = quotient + 1 if slot < remainder else quotient
+        intervals.append((cursor, cursor + width - bound_shift))
+        cursor += width
+
+    return intervals
+
+
+def run_command(cmd):
+    """Run ``cmd`` through the system shell and return its exit status.
+
+    Args:
+        cmd: Full shell command line to execute.
+
+    Returns:
+        The value returned by ``os.system``; ``0`` means the command succeeded
+        and any other value means it failed. Returning it (instead of
+        discarding it) lets the caller notice failed worker processes.
+    """
+    return os.system(cmd)  # noqa: S605
+
+
+# `exist_ok` avoids the check-then-create race of a separate exists() test.
+os.makedirs(outdir, exist_ok=True)
+
+
+# The worker script is `ge_data/<dataset>.py` for the llama chat template and
+# `ge_data/<dataset>Mistral.py` for the mistral one. Validate once, up front.
+script_suffixes = {"llama": "", "mistral": "Mistral"}
+if args.chat_template not in script_suffixes:
+    raise NotImplementedError(
+        "Only llama and mistral chat templates are supported."
+    )
+script = f"ge_data/{args.dataset}{script_suffixes[args.chat_template]}.py"
+
+# One half-open (start, end) sample interval per GPU group.
+data_a = split_range(s, e, num_p, over=True)
+commands = []
+for index, (start, end) in enumerate(data_a):
+    # The worker takes its GPU ids as space-separated values of --gpu_index.
+    gpu_index_str = " ".join(map(str, gpus[index]))
+    commands.append(
+        f"python {script} --start={start} --end={end} "
+        f"--index={index} --gpu_index {gpu_index_str} --outdir {outdir} "
+        f"--data_path {args.data_path} --model_path {args.model_path} "
+        f"--split {args.split}"
+    )
+
+# Run all workers concurrently; each thread just blocks on its subprocess.
+futures = []
+with ThreadPoolExecutor(max_workers=len(commands)) as executor:
+    for command in commands:
+        futures.append(executor.submit(run_command, command))
+        print(command)
+
+# Surface failures instead of dropping them: result() re-raises anything the
+# worker thread raised, and a truthy return value is a non-zero exit status.
+for command, future in zip(commands, futures):
+    status = future.result()
+    if status:
+        print(f"Command failed with status {status}: {command}")
@@ -0,0 +1,205 @@
+# This file is adapted from https://github.com/HArmonizedSS/HASS (arxiv: https://arxiv.org/abs/2408.15766)
+# Which is a fork of the Eagle repository: https://github.com/SafeAILab/EAGLE (arxiv: https://arxiv.org/abs/2401.15077)
+
+import argparse
+import os
+
+import torch
+from datasets import load_dataset
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+# Command-line interface of a single data-generation worker.
+parser = argparse.ArgumentParser(description="sp")
+parser.add_argument("--start", type=int, default=0)
+parser.add_argument("--end", type=int, default=100)
+parser.add_argument("--index", type=int, default=1)
+parser.add_argument("--gpu_index", type=int, nargs="+", default=[0])
+parser.add_argument("--outdir", type=str, default="outdir0")
+parser.add_argument("--data_path", type=str, default="0")
+parser.add_argument("--model_path", type=str, default="0")
+args = parser.parse_args()
+
+# Restrict this process to its assigned GPUs. Build the comma-separated list
+# explicitly ("0,1"); slicing the list repr produced "0, 1" with embedded
+# spaces and depended on how Python formats lists.
+os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(map(str, args.gpu_index))
+
+# Path or hub id of the target ("big") model.
+bigname = args.model_path
+
+
+def longest_common_prefix(list1, list2):
+    """Return the shared leading run of two sequences and its length.
+
+    Args:
+        list1: First sequence; the returned prefix is sliced from it.
+        list2: Second sequence.
+
+    Returns:
+        A ``(prefix, length)`` tuple where ``prefix`` is ``list1[:length]`` and
+        ``length`` counts the leading positions at which both inputs agree.
+    """
+    matched = 0
+    # zip stops at the shorter input, so no explicit length check is needed.
+    for left, right in zip(list1, list2):
+        if left != right:
+            break
+        matched += 1
+
+    return list1[:matched], matched
+
+
+def build_dataset_rank(
+    tokenizer,
+    split="train",  # noqa: ARG001
+    select=None,  # noqa: ARG001
+):
+    """Load, slice and tokenize this worker's share of the conversation data.
+
+    The JSON file at ``args.data_path`` is loaded, shuffled with a fixed seed
+    and cut down to rows ``[args.start, args.end)``. Each conversation is then
+    rendered with the tokenizer's chat template, tokenized, and paired with a
+    loss mask in which the instruction part of each turn is zeroed.
+
+    Args:
+        tokenizer: Tokenizer used for ``apply_chat_template`` and tokenization.
+        split: Unused.
+        select: Unused.
+
+    Returns:
+        The mapped dataset with columns ``conversation``, ``input_ids`` and
+        ``loss_mask``, formatted as torch tensors.
+    """
+    ds = load_dataset("json", data_files=args.data_path)
+    ds = ds["train"]
+    # Fixed seed so every worker sees the same order and the slices line up.
+    ds = ds.shuffle(seed=42)
+    ds1 = ds.select(range(args.start, args.end))
+
+    original_columns1 = ds1.column_names
+    # original_columns2 = ds2.column_names
+    num_proc = 4  # noqa: F841
+
+    def preprocess_function(examples):
+        """Turn a batch of raw conversations into text, token ids and masks."""
+        new_examples = {"conversation": [], "input_ids": [], "loss_mask": []}
+        for i in range(len(examples["id"])):
+            # Every conversation starts with the same fixed system message.
+            messages = [
+                {
+                    "role": "system",
+                    "content": (
+                        "Cutting Knowledge Date: December 2023\nToday Date: 26 Jul 2024"
+                    ),
+                },
+            ]
+            convroles = ["user", "assistant"]
+            # Map the dataset's speaker tags onto chat-template roles. Any
+            # other tag raises KeyError in the lookups below.
+            roles = {"human": "user", "gpt": "assistant"}
+            source = examples["conversations"][i]
+            if roles[source[0]["from"]] != "user":
+                # Skip the first one if it is not from human
+                source = source[1:]
+            for j, sentence in enumerate(source):
+                role = roles[sentence["from"]]
+                # Turns must strictly alternate user / assistant.
+                assert role == convroles[j % 2], f"{i}"  # noqa: S101
+                if sentence["from"] == "gpt":
+                    # Assistant text gets a leading space (mutates the example).
+                    sentence["value"] = " " + sentence["value"]
+                messages.append({"role": role, "content": sentence["value"]})
+            conversation = tokenizer.apply_chat_template(
+                messages,
+                tokenize=False,
+                add_generation_prompt=False,
+            )
+
+            # NOTE(review): this is also true when pad_token_id is 0, not only
+            # when it is unset -- confirm that is intended.
+            if not tokenizer.pad_token_id:
+                tokenizer.pad_token_id = tokenizer.unk_token_id
+
+            input_ids = tokenizer(
+                conversation,
+                return_tensors="pt",
+                max_length=4096,
+                add_special_tokens=False,
+            ).input_ids[0]
+            # Start with every token contributing to the loss, then zero out
+            # the parts that should be ignored.
+            loss_mask = torch.ones_like(input_ids)
+            # print(i)
+
+            # Marker that ends a user message and opens the assistant reply.
+            sep = "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
+
+            total_len = len(input_ids)  # noqa: F841
+
+            # Marker that opens a user message; splitting on it yields one
+            # chunk per user/assistant exchange (plus the leading system part).
+            sep2 = "<|eot_id|><|start_header_id|>user<|end_header_id|>"
+            turns = conversation.split(sep2)
+
+            # Fold the leading system chunk into the first exchange. This
+            # raises IndexError if the conversation contains no `sep2` at all.
+            turns[1] = turns[0] + sep2 + turns[1]
+            turns = turns[1:]
+
+            # The first token is always masked out.
+            cur_len = 1
+            loss_mask[:cur_len] = 0
+            # NOTE: this rebinds `i` from the outer loop; the outer loop is
+            # driven by range(), so its iteration is unaffected.
+            for i, turn in enumerate(turns):  # noqa: PLW2901
+                if turn == "":
+                    break
+                turn_len = len(tokenizer(turn).input_ids)
+
+                parts = turn.split(sep)
+                # Stop at the first chunk that is not exactly
+                # "instruction + one assistant reply".
+                if len(parts) != 2:
+                    break
+                parts[0] += sep
+                # "-2" is hardcoded for the Llama tokenizer to make the offset correct.
+                # NOTE(review): the line below subtracts 1, not 2; a further
+                # "- 2" is applied in the first-turn slice -- confirm the
+                # comment above is still accurate.
+                instruction_len = len(tokenizer(parts[0]).input_ids) - 1
+
+                # Ignore the user instructions
+                # The slice bounds differ for the first turn and later turns;
+                # presumably these offsets are tuned to this tokenizer -- verify
+                # before reusing with another model.
+                if i == 0:
+                    loss_mask[cur_len : cur_len + instruction_len - 2] = 0
+                else:
+                    loss_mask[cur_len - 3 : cur_len + instruction_len + 1] = 0
+                cur_len += turn_len
+                if i != 0:
+                    cur_len += 3
+                # cur_len+=2
+
+                # if i != 0 and not tokenizer.legacy:
+                #     # The legacy and non-legacy modes handle special toks differently
+                #     cur_len -= 1
+
+            # Everything past the last processed turn is excluded from the loss.
+            loss_mask[cur_len:] = 0
+
+            new_examples["conversation"].append(conversation)
+            # Add a leading batch dimension of size 1 to both tensors.
+            new_examples["input_ids"].append(input_ids[None, :])
+            new_examples["loss_mask"].append(loss_mask[None, :])
+
+        return new_examples
+
+    # Replace the raw columns with the processed ones; caching is disabled so
+    # the preprocessing is always recomputed.
+    ds1 = ds1.map(
+        preprocess_function,
+        batched=True,
+        # num_proc=num_proc,
+        remove_columns=original_columns1,
+        load_from_cache_file=False,
+    )
+
+    ds1.set_format(type="torch")
+    return ds1
+
+
+# Tokenizer of the target ("big") model, loaded with use_fast=False.
+bigtokenizer = AutoTokenizer.from_pretrained(bigname, use_fast=False)
+# Build this worker's tokenized dataset slice before the model is loaded.
+ds = build_dataset_rank(bigtokenizer)
+print(ds)
+# Target model in float16; device placement is delegated to device_map="auto".
+bigmodel = AutoModelForCausalLM.from_pretrained(
+    bigname, device_map="auto", torch_dtype=torch.float16
+)
+# The model is only used for forward passes here, so switch it to eval mode.
+bigmodel.eval()
+
+
+@torch.no_grad()
+def ge(data):
+    """Run the target model on one sample and collect its hidden states.
+
+    Args:
+        data: Mapping with ``input_ids`` and ``loss_mask`` tensors; index 0 of
+            each is taken for the output, so a leading batch dimension is
+            expected.
+
+    Returns:
+        A dict of CPU tensors:
+            ``input_ids``: the sample's token ids.
+            ``hidden_state``: three intermediate hidden states (index 3, the
+                middle one and the third from last) concatenated along the
+                last dimension.
+            ``loss_mask``: the sample's loss mask.
+            ``target``: the final entry of the model's hidden states.
+    """
+    input_ids = data["input_ids"]
+    num_layers = len(bigmodel.model.layers)
+    outs_big = bigmodel(input_ids.cuda(), output_hidden_states=True)
+    hidden_states = outs_big.hidden_states
+    # Fuse an early, a middle and a late hidden state into one feature vector.
+    feature_fusion = [
+        hidden_states[3],
+        hidden_states[num_layers // 2 + 1],
+        hidden_states[-3],
+    ]
+    hidden_state_big = torch.cat(feature_fusion, dim=-1)
+    target = hidden_states[-1]
+    # The argmax / softmax over the logits that used to be computed here was
+    # never used, so it has been dropped to save time and GPU memory.
+    return {
+        "input_ids": input_ids.cpu()[0],
+        "hidden_state": hidden_state_big.cpu()[0],
+        "loss_mask": data["loss_mask"].cpu()[0],
+        "target": target.cpu()[0],
+    }
+
+
+# Each worker writes into its own sub-directory, named after its index.
+outdir = f"{args.outdir}/{args.index}"
+# `exist_ok` avoids the check-then-create race of a separate exists() test.
+os.makedirs(outdir, exist_ok=True)
+
+
+def writedata(name, data_point):
+    """Save ``data_point`` into directory ``name`` as ``data_<idx>.ckpt``.
+
+    ``idx`` is the number of entries currently in ``name``, so successive calls
+    produce ``data_0.ckpt``, ``data_1.ckpt``, ... as long as nothing else
+    writes to or removes files from the directory.
+
+    Args:
+        name: Output directory; created (with parents) if it is missing.
+        data_point: Object to serialize with ``torch.save``.
+    """
+    # `exist_ok` avoids the check-then-create race of a separate exists() test.
+    os.makedirs(name, exist_ok=True)
+    idx = len(os.listdir(name))
+    torch.save(data_point, os.path.join(name, f"data_{idx}.ckpt"))
+
+
+# Process every sample of this worker's slice and store the result on disk.
+for id_, data in enumerate(ds):
+    # Progress output: the running index every 100 samples, and a line break
+    # every 1000 (every multiple of 1000 is also a multiple of 100).
+    if id_ % 100 == 0:
+        print(id_, end="\t")
+        if id_ % 1000 == 0:
+            print("")
+    writedata(outdir, ge(data))
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+speculators convert --eagle3 yuhuili/EAGLE3-LLaMA3.1-Instruct-8B eagle3-llama-3.1-8b-instruct-converted meta-llama/Meta-Llama-3.1-8B-Instruct --validate`
Original file line number	Diff line number	Diff line change
`@@ -228,6 +228,8 @@ select = [`
`228`	`228`	`"E722", # allow bare exception catching`
`229`	`229`	`"C90", # ignore code complexity errors`
`230`	`230`	`"PGH004", # allow general style ignores`
	`231`	`+ "ERA001", # allow commented-out code`
	`232`	`+ "N80", # allow upper case names`
`231`	`233`	`]`
`232`	`234`
`233`	`235`	`[tool.ruff.lint.isort]`