redundant Checks, audio_prompt to Tensor, and adding accelerator+autocast for evaluation #804

Open

wants to merge 1 commit into main
1 change: 1 addition & 0 deletions requirements.txt
@@ -33,3 +33,4 @@ gradio==4.41.0
 gradio_client
 http://thunlp.oss-cn-qingdao.aliyuncs.com/multi_modal/never_delete/modelscope_studio-0.4.0.9-py3-none-any.whl
 decord
+setuptools==75.8.0
2 changes: 1 addition & 1 deletion requirements_o2.6.txt
@@ -15,7 +15,7 @@ moviepy
 
 # for web demo
 aiofiles==23.2.1
-onnxruntime==1.20.1
+#onnxruntime==1.20.1
 fastapi
 uvicorn
 gradio==4.44.1
45 changes: 24 additions & 21 deletions web_demos/minicpm-o_2.6/model_server.py
@@ -18,6 +18,8 @@
 import uvicorn
 from fastapi import FastAPI, Header, Query, Request, HTTPException, WebSocket, WebSocketDisconnect
 from fastapi.responses import JSONResponse, StreamingResponse
+from accelerate import Accelerator
+
 
 cur_path = os.path.split(os.path.realpath(__file__))[0]
 sys.path.append(os.path.abspath(cur_path))
@@ -92,12 +94,16 @@ def __init__(self):
 
         self.minicpmo_model_path = args.model #"openbmb/MiniCPM-o-2_6"
         self.model_version = "2.6"
 
+        accelerator = Accelerator()
         with torch.no_grad():
-            self.minicpmo_model = AutoModel.from_pretrained(self.minicpmo_model_path, trust_remote_code=True, torch_dtype=self.target_dtype, attn_implementation='sdpa')
-        self.minicpmo_tokenizer = AutoTokenizer.from_pretrained(self.minicpmo_model_path, trust_remote_code=True)
-        self.minicpmo_model.init_tts()
-        # self.minicpmo_model.tts.float()
-        self.minicpmo_model.to(self.device).eval()
+            with torch.autocast(device_type="cuda", dtype=torch.bfloat16):
+                self.minicpmo_model = AutoModel.from_pretrained(self.minicpmo_model_path, trust_remote_code=True, torch_dtype=self.target_dtype, attn_implementation='sdpa', device_map="auto")
+                self.minicpmo_model = accelerator.prepare(self.minicpmo_model)
+                self.minicpmo_tokenizer = AutoTokenizer.from_pretrained(self.minicpmo_model_path, trust_remote_code=True)
+                self.minicpmo_model.init_tts()
+                # self.minicpmo_model.tts.float()
+                self.minicpmo_model.eval()
 
         self.ref_path_video_default = "assets/ref_audios/video_default.wav"
         self.ref_path_default = "assets/ref_audios/default.wav"
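The loading pattern this hunk introduces, pulled out of the class for clarity: load the model under autocast with device_map="auto", then hand it to an accelerate Accelerator. A minimal sketch, assuming a single CUDA GPU with bfloat16 support and the accelerate package installed (model path and kwargs mirror the diff):

import torch
from accelerate import Accelerator
from transformers import AutoModel, AutoTokenizer

accelerator = Accelerator()
model_path = "openbmb/MiniCPM-o-2_6"

with torch.no_grad():
    with torch.autocast(device_type="cuda", dtype=torch.bfloat16):
        # device_map="auto" lets accelerate place the weights on available devices
        model = AutoModel.from_pretrained(model_path, trust_remote_code=True,
                                          torch_dtype=torch.bfloat16,
                                          attn_implementation='sdpa',
                                          device_map="auto")
        # prepare() wraps the model for the current (possibly distributed) setup
        model = accelerator.prepare(model)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model.eval()

Worth noting: device_map="auto" already places the weights, so on a single GPU the extra accelerator.prepare() call is likely redundant; the diff keeps both.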
@@ -204,6 +210,8 @@ def no_active_stream(self):
         return False
 
     def sys_prompt_init(self, msg_type):
+        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
         if self.past_session_id == self.session_id:
             return
         logger.info("### sys_prompt_init ###")
@@ -226,7 +234,8 @@
                 ref_path = self.ref_path_male
 
             audio_prompt, sr = librosa.load(ref_path, sr=16000, mono=True)
-            sys_msg = {'role': 'user', 'content': [audio_voice_clone_prompt + "\n", audio_prompt, "\n" + audio_assistant_prompt]}
+            audio_tensor = torch.tensor(audio_prompt).float().to(device)
+            sys_msg = {'role': 'user', 'content': [audio_voice_clone_prompt + "\n", audio_tensor, "\n" + audio_assistant_prompt]}
         elif msg_type == 2: #video
             voice_clone_prompt="你是一个AI助手。你能接受视频,音频和文本输入并输出语音和文本。模仿输入音频中的声音特征。"
             assistant_prompt="作为助手,你将使用这种声音风格说话。"
@@ -243,7 +252,8 @@
                 ref_path = self.ref_path_male
 
             audio_prompt, sr = librosa.load(ref_path, sr=16000, mono=True)
-            sys_msg = {'role': 'user', 'content': [voice_clone_prompt, audio_prompt, assistant_prompt]}
+            audio_tensor = torch.tensor(audio_prompt).float().to(device)
+            sys_msg = {'role': 'user', 'content': [voice_clone_prompt, audio_tensor, assistant_prompt]}
         # elif msg_type == 3: #user start
         #     assistant_prompt="作为助手,你将使用这种声音风格说话。"
         #     if self.customized_options is not None:
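Both branches above make the same change: the reference audio returned by librosa.load (a float NumPy array) is wrapped in a torch Tensor and moved to the detected device before it is embedded in the system message. (The Chinese string literals are the repo's voice-cloning system prompts, roughly: "You are an AI assistant... imitate the voice characteristics of the input audio" / "As the assistant, you will speak in this voice style.") A minimal standalone sketch, using the default reference WAV path from the diff; the placeholder prompt strings are illustrative:

import librosa
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# librosa returns a mono float numpy array at the requested sample rate
audio_prompt, sr = librosa.load("assets/ref_audios/default.wav", sr=16000, mono=True)
audio_tensor = torch.tensor(audio_prompt).float().to(device)

# the tensor is passed inline as one element of the multimodal message content
sys_msg = {'role': 'user', 'content': ["<voice clone prompt>\n", audio_tensor, "\n<assistant prompt>"]}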
@@ -268,20 +278,13 @@
             )
 
         self.savedir = os.path.join(f"./log_data/{args.port}/", str(time.time()))
-        if not os.path.exists(self.savedir):
-            os.makedirs(self.savedir)
-        if not os.path.exists(self.savedir + "/input_audio_log"):
-            os.makedirs(self.savedir + "/input_audio_log")
-        if not os.path.exists(self.savedir + "/input_audio_vad_log"):
-            os.makedirs(self.savedir + "/input_audio_vad_log")
-        if not os.path.exists(self.savedir + "/input_image_log"):
-            os.makedirs(self.savedir + "/input_image_log")
-        if not os.path.exists(self.savedir + "/output_audio_log"):
-            os.makedirs(self.savedir + "/output_audio_log")
-        if not os.path.exists(self.savedir + "/feedback_log"):
-            os.makedirs(self.savedir + "/feedback_log")
-        if not os.path.exists(self.savedir + "/input_audio"):
-            os.makedirs(self.savedir + "/input_audio")
+        os.makedirs(self.savedir, exist_ok=True)
+        os.makedirs(self.savedir + "/input_audio_log", exist_ok=True)
+        os.makedirs(self.savedir + "/input_audio_vad_log", exist_ok=True)
+        os.makedirs(self.savedir + "/input_image_log", exist_ok=True)
+        os.makedirs(self.savedir + "/output_audio_log", exist_ok=True)
+        os.makedirs(self.savedir + "/feedback_log", exist_ok=True)
+        os.makedirs(self.savedir + "/input_audio", exist_ok=True)
 
         self.past_session_id = self.session_id
         self.audio_prefill = []
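This last hunk collapses seven exists-then-create pairs into os.makedirs(..., exist_ok=True), which does the same thing in the normal case and also avoids the race between the os.path.exists() check and the create. The same setup written as a loop, as a sketch (the port and timestamp values are illustrative stand-ins for args.port and time.time()):

import os

savedir = os.path.join("./log_data/8000/", str(1700000000.0))  # illustrative port/timestamp
for sub in ("", "input_audio_log", "input_audio_vad_log", "input_image_log",
            "output_audio_log", "feedback_log", "input_audio"):
    # exist_ok=True makes the call idempotent, so no prior existence check is needed
    os.makedirs(os.path.join(savedir, sub), exist_ok=True)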