-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.py
More file actions
34 lines (27 loc) · 930 Bytes
/
main.py
File metadata and controls
34 lines (27 loc) · 930 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
import torch
import llm_cpp_engine
from transformers import AutoTokenizer
import os

# Demo driver: tokenize a Chinese prompt with the HF tokenizer, wrap it in the
# Qwen chat-template special tokens, and greedily stream tokens from the C++
# inference engine until it emits the end-of-turn token.

TOKENIZER_PATH = "Qwen3-8B-Tokenizer"
MODEL_PATH = "data/model"
MAX_NEW_TOKENS = 500  # hard cap on generated tokens to avoid runaway decoding

tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_PATH)
engine = llm_cpp_engine.InferenceEngine('cuda')
engine.load(MODEL_PATH)
print("model loaded successfully")

text = "天空是什么颜色?"
text_ids = tokenizer.encode(text)

# Build the chat-formatted prompt:
#   <|im_start|>user\n {text} <|im_end|>\n <|im_start|>assistant\n
# using the special-token ids exposed by the engine.
input_ids = ([engine.im_start_id, engine.role_id_user, engine.nl_id] +
             text_ids +
             [engine.im_end_id, engine.nl_id] +
             [engine.im_start_id, engine.role_id_assistant, engine.nl_id])

# Prefill: feed the whole prompt once to get the first generated token.
token = engine.next_token(input_ids)
output_ids = list(input_ids)

for i in range(MAX_NEW_TOKENS):
    # Check EOS *before* appending so <|im_end|> never leaks into the output.
    # (The original appended the first prefill token unconditionally.)
    if token == engine.im_end_id:
        break
    output_ids.append(token)
    print(i, token)
    # Incremental decode: a single token continues generation — presumably the
    # engine keeps a KV cache internally. TODO(review): confirm next_token()
    # accepts a bare int for continuation.
    token = engine.next_token(token)

output_text = tokenizer.decode(output_ids)
print(output_text)