llm.py
import torch
from torch.cuda.amp import autocast
from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer

torch.cuda.empty_cache()  # Clear any leftover CUDA cache before loading the model

# Load the model and tokenizer without quantization, and move the model to the GPU
model = AutoModelForCausalLM.from_pretrained("allenyang687/cookbuddymodel").to("cuda")
tokenizer = AutoTokenizer.from_pretrained("allenyang687/cookbuddymodel")

# Enable gradient checkpointing (optional; only has an effect during training)
model.gradient_checkpointing_enable()

# Clear cache again before starting inference
torch.cuda.empty_cache()

# Set up the TextStreamer so generated tokens are printed as they are produced (optional)
text_streamer = TextStreamer(tokenizer)
# Generate a response for a given user input using an Alpaca-style prompt template
def generate_response(user_input):
    prompt = """Hold Conversation.
### Instruction:
{instruction}
### Input:
{input}
### Response:
{response}"""

    inputs = tokenizer(
        [prompt.format(
            instruction="Be very specific with instructions and give the time and temperature to cook everything at.",  # You can customize this or make it dynamic
            input=user_input,  # The user input will be used here
            response=""        # Leave blank so the model generates the response
        )],
        return_tensors="pt"
    ).to("cuda")

    # Use no_grad() to save memory and mixed precision (autocast) to speed up inference
    with torch.no_grad(), autocast():
        generated_output = model.generate(
            **inputs,
            streamer=text_streamer,
            max_new_tokens=612,  # Limit token count for output
            do_sample=True,      # Sampling must be enabled for temperature to take effect
            temperature=0.7      # Control randomness (lower value = more deterministic)
        )

    # Decode the generated tokens into a string
    generated_text = tokenizer.decode(generated_output[0], skip_special_tokens=True)
    return generated_text
# Live chat loop
# print("Chat with the model! Type 'exit' to stop.")
# while True:
#     user_input = input("You: ")  # Take input from the user
#     if user_input.lower() == 'exit':
#         print("Exiting chat. Goodbye!")
#         break  # Exit the loop if the user types 'exit'
#     else:
#         response = generate_response(user_input)  # Generate the model response based on user input
#         print(f"Model: {response}")  # Print the generated response