main.py
import os
from typing import Optional
import huggingface_hub
import torch
from pydantic import BaseModel
from transformers import AutoModelForCausalLM, LlamaTokenizer, GenerationConfig
try:
    """
    To access your Cerebrium secret:
    1. Create a secret in your project's dashboard.
    2. Take note of the name; for this example, we'll use a secret called `hf_auth_token`.
    3. Add the code in the line below to access your secret.
    """
    hf_auth_token = os.environ.get("hf_auth_token")  # load your secret
    # And that's all! It's that easy.
    if not hf_auth_token:
        raise Exception(
            "hf_auth_token is missing or empty. You need an hf_auth_token secret added to your account to access this model."
        )
except Exception as e:
    print("\n\n")
    print("=" * 60)
    print("Error: ", e)
    print("=" * 60)
    raise e

huggingface_hub.login(token=hf_auth_token)
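
# When testing locally, the same secret can be supplied as an environment variable
# before running this file (illustrative; on Cerebrium the secret comes from your
# project's dashboard):
#   export hf_auth_token=<your Hugging Face access token>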
class Item(BaseModel):
    prompt: str
    cutoff_len: Optional[int] = 256
    temperature: Optional[float] = 0.1
    top_p: Optional[float] = 0.75
    top_k: Optional[int] = 40
    num_beams: Optional[int] = 4
    max_new_tokens: Optional[int] = 256
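
# Illustrative request payload this schema accepts (values shown are the defaults;
# only `prompt` is required, and the prompt text itself is an assumption):
#   {"prompt": "What is the capital of France?", "cutoff_len": 256, "temperature": 0.1,
#    "top_p": 0.75, "top_k": 40, "num_beams": 4, "max_new_tokens": 256}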
base_model_name = "meta-llama/Llama-2-7b-hf"  # Hugging Face Model Id
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    load_in_8bit=True,
    torch_dtype=torch.float16,
    device_map="auto",
)
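
# Note: `load_in_8bit=True` relies on the `bitsandbytes` package and a CUDA-capable
# GPU being available in the deployment environment (an assumption about this setup,
# not something the code above checks for).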
tokenizer = LlamaTokenizer.from_pretrained(base_model_name)
tokenizer.pad_token_id = 0 # unk. we want this to be different from the eos token
tokenizer.padding_side = "left" # Allow batched inference
def tokenize(prompt, cutoff_len, add_eos_token=True):
    print("tokenizing: ", prompt)
    return tokenizer(
        prompt,
        truncation=True,
        max_length=cutoff_len,
        padding=False,
        return_tensors="pt",
    )
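
# Illustrative usage (the prompt string is an assumption, not from the original
# example); `return_tensors="pt"` yields tensors with a batch dimension of 1:
#   tokenize("Hello world", cutoff_len=16)["input_ids"].shape  # -> torch.Size([1, n])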
def generate(params: Item):
    print("Tokenizing prompt")
    prompt_input = tokenize(prompt=params.prompt, cutoff_len=params.cutoff_len)
    input_ids = prompt_input["input_ids"]
    input_ids = input_ids.to(base_model.device)
    print("Setting up generation config")
    generation_config = GenerationConfig(
        temperature=params.temperature,
        top_p=params.top_p,
        top_k=params.top_k,
        num_beams=params.num_beams,
        max_new_tokens=params.max_new_tokens,
    )
    with torch.no_grad():
        outputs = base_model.generate(
            input_ids=input_ids,
            generation_config=generation_config,
            return_dict_in_generate=True,
            output_scores=True,
        )
    # outputs.sequences[0] contains the prompt tokens followed by the generated tokens
    return tokenizer.decode(outputs.sequences[0], skip_special_tokens=True)
def predict(prompt, cutoff_len, temperature, top_p, top_k, num_beams, max_new_tokens):
    item = Item(
        prompt=prompt,
        cutoff_len=cutoff_len,
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        num_beams=num_beams,
        max_new_tokens=max_new_tokens,
    )
    result = generate(params=item)
    return {"Prediction": result}