307 changes: 10 additions & 297 deletions README.md

Large diffs are not rendered by default.

49 changes: 49 additions & 0 deletions deploy-hf.py
@@ -0,0 +1,49 @@
import logging
import subprocess
logging.basicConfig(level=logging.INFO)

def run_shell_script(script_path):
    """
    Run the shell script at the given path, streaming its output to the console.

    :param script_path: file path of the shell script
    """
    try:
        # Use subprocess.Popen to run the shell script
        with subprocess.Popen(['bash', script_path], stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True) as proc:
            # Print output line by line as it arrives
            for line in proc.stdout:
                print(line, end='')
            return_code = proc.wait()
            if return_code:
                print(f"Shell script failed with return code {return_code}")
    except Exception as e:
        print(f"Error while running the shell script: {e}")

# Usage example:
# assumes the target script (here deploy.sh) is in the current directory
run_shell_script('deploy.sh')

# Download the model via the ModelScope SDK
from modelscope import snapshot_download
snapshot_download('iic/CosyVoice-300M', local_dir='pretrained_models/CosyVoice-300M')

class Args:
    """Minimal stand-in for the argparse namespace that webui.main expects."""
    def __init__(self):
        self.port = 5000
        self.model_dir = 'pretrained_models/CosyVoice-300M'

from webui import main
from cosyvoice.cli.cosyvoice import CosyVoice
import numpy as np

# Create the args instance
args = Args()

cosyvoice = CosyVoice(args.model_dir)
sft_spk = cosyvoice.list_avaliable_spks()  # method name as spelled in the CosyVoice API
prompt_sr, target_sr = 16000, 22050
default_data = np.zeros(target_sr)

# Pass args and the speaker list when calling main
main(args, sft_spk)
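
Once the script above launches `main(args, sft_spk)`, a quick smoke test is to poll the port configured in `Args`. A minimal sketch, assuming a local run and that the Gradio UI answers on `/` (only the port comes from the file above):

```python
import requests

# Hypothetical check, not part of deploy-hf.py; Args.port above is 5000
resp = requests.get("http://127.0.0.1:5000/", timeout=10)
print(resp.status_code)  # expect 200 once the web UI is serving
```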
34 changes: 34 additions & 0 deletions deploy-hf.sh
@@ -0,0 +1,34 @@
#!/bin/bash
wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
sh Miniconda3-latest-Linux-x86_64.sh -b
source ~/miniconda3/bin/activate

#conda create -n cosyvoice python=3.8
#conda activate cosyvoice
#conda install -y -c conda-forge pynini==2.1.5
# pynini is required by WeTextProcessing; use conda to install it, since it can be installed on all platforms.
pip install -r requirements_.txt

# If you encounter sox compatibility issues
# ubuntu
apt-get -y update && apt-get -y install sox libsox-dev

mkdir -p pretrained_models
#git clone https://www.modelscope.cn/iic/CosyVoice-300M.git pretrained_models/CosyVoice-300M
#git clone https://www.modelscope.cn/iic/CosyVoice-300M-25Hz.git pretrained_models/CosyVoice-300M-25Hz
#git clone https://www.modelscope.cn/iic/CosyVoice-300M-SFT.git pretrained_models/CosyVoice-300M-SFT
#git clone https://www.modelscope.cn/iic/CosyVoice-300M-Instruct.git pretrained_models/CosyVoice-300M-Instruct
#git clone https://www.modelscope.cn/iic/CosyVoice-ttsfrd.git pretrained_models/CosyVoice-ttsfrd
#huggingface-cli download model-scope/CosyVoice-300M --local-dir pretrained_models/CosyVoice-300M --token=$hf_token
#huggingface-cli download model-scope/CosyVoice-300M-SFT --local-dir pretrained_models/CosyVoice-300M-SFT --token=$hf_token
#huggingface-cli download FunAudioLLM/CosyVoice-ttsfrd --local-dir pretrained_models/CosyVoice-ttsfrd --token=$hf_token

ls pretrained_models

cd pretrained_models/CosyVoice-ttsfrd/
unzip resource.zip -d .
pip install ttsfrd-0.3.6-cp38-cp38-linux_x86_64.whl

export PYTHONPATH=third_party/Matcha-TTS

python3 webui.py
9 changes: 5 additions & 4 deletions docker/Dockerfile
@@ -1,4 +1,4 @@
FROM nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04
FROM nvidia/cuda:12.8.0-cudnn-devel-ubuntu22.04

ARG VENV_NAME="cosyvoice"
ENV VENV=$VENV_NAME
@@ -42,10 +42,11 @@ WORKDIR /workspace

ENV PYTHONPATH="${PYTHONPATH}:/workspace/CosyVoice:/workspace/CosyVoice/third_party/Matcha-TTS"

RUN git clone --recursive https://github.com/FunAudioLLM/CosyVoice.git

#RUN git clone --recursive https://github.com/tanbw/CosyVoice.git
COPY requirements.txt CosyVoice/requirements.txt
RUN conda activate ${VENV} && conda install -y -c conda-forge pynini==2.1.5
RUN conda activate ${VENV} && cd CosyVoice && \
pip install --pre torch torchaudio --index-url https://download.pytorch.org/whl/nightly/cu128 && \
pip install -r requirements.txt -i https://mirrors.aliyun.com/pypi/simple/ --trusted-host=mirrors.aliyun.com

RUN rm -rf CosyVoice
WORKDIR /workspace/CosyVoice
35 changes: 35 additions & 0 deletions docker/requirements.txt
@@ -0,0 +1,35 @@
--extra-index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/onnxruntime-cuda-12/pypi/simple/ # microsoft/onnxruntime#21684
conformer==0.3.2
deepspeed==0.14.2; sys_platform == 'linux'
diffusers==0.29.0
gdown==5.1.0
gradio==5.4.0
grpcio==1.57.0
grpcio-tools==1.57.0
hydra-core==1.3.2
HyperPyYAML==1.2.2
inflect==7.3.1
librosa==0.10.2
lightning==2.2.4
matplotlib==3.7.5
modelscope==1.20.0
networkx==3.1
omegaconf==2.3.0
onnx==1.16.0
onnxruntime-gpu==1.21.0; sys_platform == 'linux'
onnxruntime==1.21.0; sys_platform == 'darwin' or sys_platform == 'win32'
openai-whisper==20231117
protobuf==4.25
pyarrow==18.1.0
pydantic==2.7.0
pyworld==0.3.4
rich==13.7.1
soundfile==0.12.1
tensorboard==2.14.0
tensorrt==10.9.0.34; sys_platform == 'linux'
transformers==4.40.1
uvicorn==0.30.0
wget==3.2
fastapi==0.115.6
fastapi-cli==0.0.4
WeTextProcessing==1.0.3
145 changes: 122 additions & 23 deletions runtime/python/fastapi/server.py
@@ -26,6 +26,10 @@
sys.path.append('{}/../../../third_party/Matcha-TTS'.format(ROOT_DIR))
from cosyvoice.cli.cosyvoice import CosyVoice, CosyVoice2
from cosyvoice.utils.file_utils import load_wav
from cosyvoice.utils.common import set_all_random_seed
import librosa
import random
import torch

app = FastAPI()
# set cross region allowance
@@ -36,42 +40,136 @@
    allow_methods=["*"],
    allow_headers=["*"])

def generate_seed():
    seed = random.randint(1, 100000000)
    return {
        "__type__": "update",
        "value": seed
    }

max_val = 0.8

def postprocess(speech, top_db=60, hop_length=220, win_length=440):
    speech, _ = librosa.effects.trim(
        speech, top_db=top_db,
        frame_length=win_length,
        hop_length=hop_length
    )
    if speech.abs().max() > max_val:
        speech = speech / speech.abs().max() * max_val
    speech = torch.concat([speech, torch.zeros(1, int(cosyvoice.sample_rate * 0.2))], dim=1)
    return speech

def generate_data(model_output):
    for i in model_output:
        tts_audio = (i['tts_speech'].numpy() * (2 ** 15)).astype(np.int16).tobytes()
        yield tts_audio


@app.get("/inference_sft")
def generate_stream(model_output):
    for i in model_output:
        tts_audio = i['tts_speech'].numpy().tobytes()
        yield tts_audio

def generate_header():
    headers = {
        "X-Custom-Header-sampleRate": f"{cosyvoice.sample_rate}"
    }
    return headers

@app.post("/inference_sft")
async def inference_sft(tts_text: str = Form(), spk_id: str = Form()):
    model_output = cosyvoice.inference_sft(tts_text, spk_id)
    return StreamingResponse(generate_data(model_output))
    set_all_random_seed(generate_seed()["value"])
    model_output = cosyvoice.inference_sft(tts_text, spk_id, stream=False)
    return StreamingResponse(generate_data(model_output), headers=generate_header())

@app.post("/stream/inference_sft")
async def inference_sft(tts_text: str = Form(), spk_id: str = Form()):
    set_all_random_seed(generate_seed()["value"])
    model_output = cosyvoice.inference_sft(tts_text, spk_id, stream=True)
    return StreamingResponse(generate_stream(model_output), headers=generate_header())

@app.get("/inference_zero_shot")
@app.post("/inference_zero_shot")
async def inference_zero_shot(tts_text: str = Form(), prompt_text: str = Form(), prompt_wav: UploadFile = File()):
    prompt_speech_16k = load_wav(prompt_wav.file, 16000)
    model_output = cosyvoice.inference_zero_shot(tts_text, prompt_text, prompt_speech_16k)
    return StreamingResponse(generate_data(model_output))
    prompt_speech_16k = postprocess(load_wav(prompt_wav.file, 16000))
    set_all_random_seed(generate_seed()["value"])
    model_output = cosyvoice.inference_zero_shot(tts_text, prompt_text, prompt_speech_16k, stream=False)

    return StreamingResponse(generate_data(model_output), headers=generate_header())

@app.post("/stream/inference_zero_shot")
async def inference_zero_shot(tts_text: str = Form(), prompt_text: str = Form(), prompt_wav: UploadFile = File()):
    prompt_speech_16k = postprocess(load_wav(prompt_wav.file, 16000))
    set_all_random_seed(generate_seed()["value"])
    model_output = cosyvoice.inference_zero_shot(tts_text, prompt_text, prompt_speech_16k, stream=True)
    return StreamingResponse(generate_stream(model_output), headers=generate_header())

@app.get("/inference_cross_lingual")
@app.post("/inference_cross_lingual")
async def inference_cross_lingual(tts_text: str = Form(), prompt_wav: UploadFile = File()):
    prompt_speech_16k = load_wav(prompt_wav.file, 16000)
    model_output = cosyvoice.inference_cross_lingual(tts_text, prompt_speech_16k)
    return StreamingResponse(generate_data(model_output))

    prompt_speech_16k = postprocess(load_wav(prompt_wav.file, 16000))
    set_all_random_seed(generate_seed()["value"])
    model_output = cosyvoice.inference_cross_lingual(tts_text, prompt_speech_16k, stream=False)
    return StreamingResponse(generate_data(model_output), headers=generate_header())

@app.post("/stream/inference_cross_lingual")
async def inference_cross_lingual(tts_text: str = Form(), prompt_wav: UploadFile = File()):
    prompt_speech_16k = postprocess(load_wav(prompt_wav.file, 16000))
    set_all_random_seed(generate_seed()["value"])
    model_output = cosyvoice.inference_cross_lingual(tts_text, prompt_speech_16k, stream=True)
    return StreamingResponse(generate_stream(model_output), headers=generate_header())

@app.get("/inference_instruct")
@app.post("/inference_instruct")
async def inference_instruct(tts_text: str = Form(), spk_id: str = Form(), instruct_text: str = Form()):
    model_output = cosyvoice.inference_instruct(tts_text, spk_id, instruct_text)
    return StreamingResponse(generate_data(model_output))
    set_all_random_seed(generate_seed()["value"])
    model_output = cosyvoice.inference_instruct(tts_text, spk_id, instruct_text, stream=False)
    return StreamingResponse(generate_data(model_output), headers=generate_header())

@app.post("/inference_instruct2")
async def inference_instruct2(tts_text: str = Form(), instruct_text: str = Form(), prompt_wav: UploadFile = File()):
    set_all_random_seed(generate_seed()["value"])
    prompt_speech_16k = postprocess(load_wav(prompt_wav.file, 16000))
    model_output = cosyvoice.inference_instruct2(tts_text, instruct_text, prompt_speech_16k, stream=False)
    return StreamingResponse(generate_data(model_output), headers=generate_header())

@app.post("/stream/inference_instruct")
async def inference_instruct(tts_text: str = Form(), spk_id: str = Form(), instruct_text: str = Form()):
    set_all_random_seed(generate_seed()["value"])
    model_output = cosyvoice.inference_instruct(tts_text, spk_id, instruct_text, stream=True)
    return StreamingResponse(generate_stream(model_output), headers=generate_header())

import torch
import threading

# Check the current GPU's memory usage
def check_memory_usage():
    allocated = torch.cuda.memory_allocated() / (1024 ** 2)  # convert to MB
    reserved = torch.cuda.memory_reserved() / (1024 ** 2)  # convert to MB
    total_memory = torch.cuda.get_device_properties(0).total_memory / (1024 ** 2)  # convert to MB
    logging.info(f"Allocated memory: {allocated:.2f} MB")
    logging.info(f"Reserved memory: {reserved:.2f} MB")
    logging.info(f"Total memory: {total_memory:.2f} MB")
    return allocated, reserved, total_memory

# Release cached but unused GPU memory
def release_memory():
    torch.cuda.empty_cache()
    logging.info("Memory has been released.")

# Check memory usage and release it when needed
def monitor_and_release_memory():
    allocated, reserved, total_memory = check_memory_usage()
    if allocated >= total_memory / 2:
        logging.info("Allocated memory exceeds half of the total memory. Releasing memory...")
        release_memory()
    else:
        logging.info("Memory usage is within acceptable limits.")

# Timer helper: run func immediately, then re-schedule it every `interval` seconds
def run_periodically(interval, func):
    def wrapper():
        func()
        threading.Timer(interval, wrapper).start()

    wrapper()

@app.get("/inference_instruct2")
@app.post("/inference_instruct2")
@@ -91,11 +189,12 @@ async def inference_instruct2(tts_text: str = Form(), instruct_text: str = Form(
                        default='iic/CosyVoice-300M',
                        help='local path or modelscope repo id')
    args = parser.parse_args()
    try:
        cosyvoice = CosyVoice(args.model_dir)
    except Exception:
        try:
            cosyvoice = CosyVoice2(args.model_dir)
        except Exception:
            raise TypeError('no valid model_type!')

    # Limit each process to a fraction of the GPU memory (disabled)
    #torch.cuda.set_per_process_memory_fraction(0.8, 0)
    #logging.info('Torch set_per_process_memory_fraction 0.8')
    cosyvoice = CosyVoice2(args.model_dir) if 'CosyVoice2' in args.model_dir else CosyVoice(args.model_dir)
    # Run monitor_and_release_memory every 10 minutes (600 seconds) (disabled)
    #run_periodically(600, monitor_and_release_memory)

    uvicorn.run(app, host="0.0.0.0", port=args.port)
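
For reference, a minimal client sketch for the routes above: the non-stream routes emit 16-bit PCM (`generate_data`), the `/stream/...` routes emit raw float32 samples (`generate_stream`), and both attach the sample rate via the `X-Custom-Header-sampleRate` header. Host, port, speaker id, and the output filename here are assumptions:

```python
import numpy as np
import requests
import soundfile as sf

# Hypothetical local server; port 50000 is assumed, check your --port flag
resp = requests.post(
    "http://127.0.0.1:50000/inference_sft",
    data={"tts_text": "Hello from CosyVoice.", "spk_id": "中文女"},  # speaker id assumed
    stream=True,
)
sample_rate = int(resp.headers["X-Custom-Header-sampleRate"])

# Non-stream routes yield int16 PCM chunks (see generate_data above)
pcm = b"".join(resp.iter_content(chunk_size=16000))
sf.write("sft_output.wav", np.frombuffer(pcm, dtype=np.int16), sample_rate)

# /stream/inference_sft yields raw float32 samples instead:
# np.frombuffer(pcm, dtype=np.float32)
```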