# test_qwen_model.py - from the coding-alt/deepseek-distill-qwen repository (master branch)

"""
===============================================================================
    Script for Generating Responses using Fine-tuned Qwen Model
===============================================================================

Author:
-------
houalex@gmail.com

Date:
-----
Feb 14, 2025

Description:
-------------
This script loads a fine-tuned Qwen model, which was trained on specific datasets,
to generate natural language responses to user prompts. The model is used to generate
text based on a provided question or statement. The generated response is printed
to the console.

Dependencies:
-------------
- transformers (for loading the Qwen model and building the text-generation pipeline)
- torch (for model inference on GPU/CPU)
- accelerate (required by transformers when loading with device_map="auto")

Usage:
------
1. Install the required dependencies:
   $ pip install transformers torch accelerate

2. Run the script to:
   - Load the fine-tuned Qwen model
   - Generate responses based on the provided prompt
   - Print the generated response

License:
--------
Apache License 2.0 (Apache-2.0)

===============================================================================
"""

# -*- coding: utf-8 -*-
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import torch

# Directory holding the fine-tuned checkpoint. The model and the tokenizer are
# both loaded from it, so the path is kept in a single place.
MODEL_PATH = "./model/deepseek-distill-qwen2.5-1.5b"

# Half precision is only a safe default on a CUDA device: on CPU, fp16 can fail
# with unimplemented-kernel errors (or run very slowly) depending on the
# PyTorch build, so fall back to fp32 when no CUDA device is available.
inference_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

# Load the fine-tuned model
model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    torch_dtype=inference_dtype,
    device_map="auto",  # Automatically select the best device (GPU if available)
)

# Load the tokenizer from the same checkpoint directory as the model
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
# Resize token embeddings so the embedding matrix matches the tokenizer size
model.resize_token_embeddings(len(tokenizer))

# Create a text generation pipeline around the already-loaded model/tokenizer
chat_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)

# Prompt written with the <|user|> / <|end|> / <|assistant|> markers; the
# trailing assistant marker leaves the turn open for the model to complete.
prompt = """<|user|>
What do you think about Trump?
<|end|>
<|assistant|>
"""

# Generate the response with specified parameters
output = chat_pipeline(
    prompt,
    max_new_tokens=500,  # Limit the output length to 500 new tokens
    temperature=0.7,  # Controls randomness of the output (higher is more random)
    do_sample=True,  # Use sampling rather than greedy search
    eos_token_id=tokenizer.eos_token_id,  # Stop at the end-of-sequence token
)

# Print the generated text of the first (and only requested) sequence
print(output[0]['generated_text'])