307 changes: 10 additions & 297 deletions README.md

Large diffs are not rendered by default.

49 changes: 49 additions & 0 deletions deploy-hf.py
@@ -0,0 +1,49 @@
import logging
import subprocess
logging.basicConfig(level=logging.INFO)

def run_shell_script(script_path):
    """
    Run the shell script at the given path, streaming its output to the console.

    :param script_path: file path of the shell script
    """
    try:
        # Use subprocess.Popen to run the shell script
        with subprocess.Popen(['bash', script_path], stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True) as proc:
            # Print output line by line as it arrives
            for line in proc.stdout:
                print(line, end='')
            return_code = proc.wait()
            if return_code:
                print(f"Shell script failed with return code {return_code}")
    except Exception as e:
        print(f"Error while running the shell script: {e}")

# Usage example:
# assumes the target script (here deploy.sh) is in the current directory
run_shell_script('deploy.sh')

# Download the model via the ModelScope SDK
from modelscope import snapshot_download
snapshot_download('iic/CosyVoice-300M', local_dir='pretrained_models/CosyVoice-300M')

class Args:
    """Minimal stand-in for the argparse namespace that webui.main expects."""
    def __init__(self):
        self.port = 5000
        self.model_dir = 'pretrained_models/CosyVoice-300M'

from webui import main
from cosyvoice.cli.cosyvoice import CosyVoice
import numpy as np

# Create the args instance
args = Args()

cosyvoice = CosyVoice(args.model_dir)
sft_spk = cosyvoice.list_avaliable_spks()  # method name as spelled in the CosyVoice API
prompt_sr, target_sr = 16000, 22050
default_data = np.zeros(target_sr)

# Pass args and the speaker list when calling main
main(args, sft_spk)
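
Once the script above launches `main(args, sft_spk)`, a quick smoke test is to poll the port configured in `Args`. A minimal sketch, assuming a local run and that the Gradio UI answers on `/` (only the port comes from the file above):

```python
import requests

# Hypothetical check, not part of deploy-hf.py; Args.port above is 5000
resp = requests.get("http://127.0.0.1:5000/", timeout=10)
print(resp.status_code)  # expect 200 once the web UI is serving
```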
34 changes: 34 additions & 0 deletions deploy-hf.sh
@@ -0,0 +1,34 @@
#!/bin/bash
wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
sh Miniconda3-latest-Linux-x86_64.sh -b
source ~/miniconda3/bin/activate

#conda create -n cosyvoice python=3.8
#conda activate cosyvoice
#conda install -y -c conda-forge pynini==2.1.5
# pynini is required by WeTextProcessing; use conda to install it, since it can be installed on all platforms.
pip install -r requirements_.txt

# If you encounter sox compatibility issues
# ubuntu
apt-get -y update && apt-get -y install sox libsox-dev

mkdir -p pretrained_models
#git clone https://www.modelscope.cn/iic/CosyVoice-300M.git pretrained_models/CosyVoice-300M
#git clone https://www.modelscope.cn/iic/CosyVoice-300M-25Hz.git pretrained_models/CosyVoice-300M-25Hz
#git clone https://www.modelscope.cn/iic/CosyVoice-300M-SFT.git pretrained_models/CosyVoice-300M-SFT
#git clone https://www.modelscope.cn/iic/CosyVoice-300M-Instruct.git pretrained_models/CosyVoice-300M-Instruct
#git clone https://www.modelscope.cn/iic/CosyVoice-ttsfrd.git pretrained_models/CosyVoice-ttsfrd
#huggingface-cli download model-scope/CosyVoice-300M --local-dir pretrained_models/CosyVoice-300M --token=$hf_token
#huggingface-cli download model-scope/CosyVoice-300M-SFT --local-dir pretrained_models/CosyVoice-300M-SFT --token=$hf_token
#huggingface-cli download FunAudioLLM/CosyVoice-ttsfrd --local-dir pretrained_models/CosyVoice-ttsfrd --token=$hf_token

ls pretrained_models

cd pretrained_models/CosyVoice-ttsfrd/
unzip resource.zip -d .
pip install ttsfrd-0.3.6-cp38-cp38-linux_x86_64.whl

export PYTHONPATH=third_party/Matcha-TTS

python3 webui.py
9 changes: 5 additions & 4 deletions docker/Dockerfile
@@ -1,4 +1,4 @@
FROM nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04
FROM nvidia/cuda:12.8.0-cudnn-devel-ubuntu22.04

ARG VENV_NAME="cosyvoice"
ENV VENV=$VENV_NAME
@@ -42,10 +42,11 @@ WORKDIR /workspace

ENV PYTHONPATH="${PYTHONPATH}:/workspace/CosyVoice:/workspace/CosyVoice/third_party/Matcha-TTS"

RUN git clone --recursive https://github.com/FunAudioLLM/CosyVoice.git

#RUN git clone --recursive https://github.com/tanbw/CosyVoice.git
COPY requirements.txt CosyVoice/requirements.txt
RUN conda activate ${VENV} && conda install -y -c conda-forge pynini==2.1.5
RUN conda activate ${VENV} && cd CosyVoice && \
pip install --pre torch torchaudio --index-url https://download.pytorch.org/whl/nightly/cu128 && \
pip install -r requirements.txt -i https://mirrors.aliyun.com/pypi/simple/ --trusted-host=mirrors.aliyun.com

RUN rm -rf CosyVoice
WORKDIR /workspace/CosyVoice
35 changes: 35 additions & 0 deletions docker/requirements.txt
@@ -0,0 +1,35 @@
--extra-index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/onnxruntime-cuda-12/pypi/simple/ # microsoft/onnxruntime#21684
conformer==0.3.2
deepspeed==0.14.2; sys_platform == 'linux'
diffusers==0.29.0
gdown==5.1.0
gradio==5.4.0
grpcio==1.57.0
grpcio-tools==1.57.0
hydra-core==1.3.2
HyperPyYAML==1.2.2
inflect==7.3.1
librosa==0.10.2
lightning==2.2.4
matplotlib==3.7.5
modelscope==1.20.0
networkx==3.1
omegaconf==2.3.0
onnx==1.16.0
onnxruntime-gpu==1.21.0; sys_platform == 'linux'
onnxruntime==1.21.0; sys_platform == 'darwin' or sys_platform == 'win32'
openai-whisper==20231117
protobuf==4.25
pyarrow==18.1.0
pydantic==2.7.0
pyworld==0.3.4
rich==13.7.1
soundfile==0.12.1
tensorboard==2.14.0
tensorrt==10.9.0.34; sys_platform == 'linux'
transformers==4.40.1
uvicorn==0.30.0
wget==3.2
fastapi==0.115.6
fastapi-cli==0.0.4
WeTextProcessing==1.0.3
145 changes: 122 additions & 23 deletions runtime/python/fastapi/server.py
@@ -26,6 +26,10 @@
sys.path.append('{}/../../../third_party/Matcha-TTS'.format(ROOT_DIR))
from cosyvoice.cli.cosyvoice import CosyVoice, CosyVoice2
from cosyvoice.utils.file_utils import load_wav
from cosyvoice.utils.common import set_all_random_seed
import librosa
import random
import torch

app = FastAPI()
# set cross region allowance
@@ -36,42 +40,136 @@
    allow_methods=["*"],
    allow_headers=["*"])

def generate_seed():
    seed = random.randint(1, 100000000)
    return {
        "__type__": "update",
        "value": seed
    }

max_val = 0.8

def postprocess(speech, top_db=60, hop_length=220, win_length=440):
    speech, _ = librosa.effects.trim(
        speech, top_db=top_db,
        frame_length=win_length,
        hop_length=hop_length
    )
    if speech.abs().max() > max_val:
        speech = speech / speech.abs().max() * max_val
    speech = torch.concat([speech, torch.zeros(1, int(cosyvoice.sample_rate * 0.2))], dim=1)
    return speech

def generate_data(model_output):
    for i in model_output:
        tts_audio = (i['tts_speech'].numpy() * (2 ** 15)).astype(np.int16).tobytes()
        yield tts_audio


@app.get("/inference_sft")
def generate_stream(model_output):
    for i in model_output:
        tts_audio = i['tts_speech'].numpy().tobytes()
        yield tts_audio

def generate_header():
    headers = {
        "X-Custom-Header-sampleRate": f"{cosyvoice.sample_rate}"
    }
    return headers

@app.post("/inference_sft")
async def inference_sft(tts_text: str = Form(), spk_id: str = Form()):
    model_output = cosyvoice.inference_sft(tts_text, spk_id)
    return StreamingResponse(generate_data(model_output))
    set_all_random_seed(generate_seed()["value"])
    model_output = cosyvoice.inference_sft(tts_text, spk_id, stream=False)
    return StreamingResponse(generate_data(model_output), headers=generate_header())

@app.post("/stream/inference_sft")
async def inference_sft(tts_text: str = Form(), spk_id: str = Form()):
    set_all_random_seed(generate_seed()["value"])
    model_output = cosyvoice.inference_sft(tts_text, spk_id, stream=True)
    return StreamingResponse(generate_stream(model_output), headers=generate_header())

@app.get("/inference_zero_shot")
@app.post("/inference_zero_shot")
async def inference_zero_shot(tts_text: str = Form(), prompt_text: str = Form(), prompt_wav: UploadFile = File()):
    prompt_speech_16k = load_wav(prompt_wav.file, 16000)
    model_output = cosyvoice.inference_zero_shot(tts_text, prompt_text, prompt_speech_16k)
    return StreamingResponse(generate_data(model_output))
    prompt_speech_16k = postprocess(load_wav(prompt_wav.file, 16000))
    set_all_random_seed(generate_seed()["value"])
    model_output = cosyvoice.inference_zero_shot(tts_text, prompt_text, prompt_speech_16k, stream=False)

    return StreamingResponse(generate_data(model_output), headers=generate_header())

@app.post("/stream/inference_zero_shot")
async def inference_zero_shot(tts_text: str = Form(), prompt_text: str = Form(), prompt_wav: UploadFile = File()):
    prompt_speech_16k = postprocess(load_wav(prompt_wav.file, 16000))
    set_all_random_seed(generate_seed()["value"])
    model_output = cosyvoice.inference_zero_shot(tts_text, prompt_text, prompt_speech_16k, stream=True)
    return StreamingResponse(generate_stream(model_output), headers=generate_header())

@app.get("/inference_cross_lingual")
@app.post("/inference_cross_lingual")
async def inference_cross_lingual(tts_text: str = Form(), prompt_wav: UploadFile = File()):
    prompt_speech_16k = load_wav(prompt_wav.file, 16000)
    model_output = cosyvoice.inference_cross_lingual(tts_text, prompt_speech_16k)
    return StreamingResponse(generate_data(model_output))

    prompt_speech_16k = postprocess(load_wav(prompt_wav.file, 16000))
    set_all_random_seed(generate_seed()["value"])
    model_output = cosyvoice.inference_cross_lingual(tts_text, prompt_speech_16k, stream=False)
    return StreamingResponse(generate_data(model_output), headers=generate_header())

@app.post("/stream/inference_cross_lingual")
async def inference_cross_lingual(tts_text: str = Form(), prompt_wav: UploadFile = File()):
    prompt_speech_16k = postprocess(load_wav(prompt_wav.file, 16000))
    set_all_random_seed(generate_seed()["value"])
    model_output = cosyvoice.inference_cross_lingual(tts_text, prompt_speech_16k, stream=True)
    return StreamingResponse(generate_stream(model_output), headers=generate_header())

@app.get("/inference_instruct")
@app.post("/inference_instruct")
async def inference_instruct(tts_text: str = Form(), spk_id: str = Form(), instruct_text: str = Form()):
    model_output = cosyvoice.inference_instruct(tts_text, spk_id, instruct_text)
    return StreamingResponse(generate_data(model_output))
    set_all_random_seed(generate_seed()["value"])
    model_output = cosyvoice.inference_instruct(tts_text, spk_id, instruct_text, stream=False)
    return StreamingResponse(generate_data(model_output), headers=generate_header())

@app.post("/inference_instruct2")
async def inference_instruct2(tts_text: str = Form(), instruct_text: str = Form(), prompt_wav: UploadFile = File()):
    set_all_random_seed(generate_seed()["value"])
    prompt_speech_16k = postprocess(load_wav(prompt_wav.file, 16000))
    model_output = cosyvoice.inference_instruct2(tts_text, instruct_text, prompt_speech_16k, stream=False)
    return StreamingResponse(generate_data(model_output), headers=generate_header())

@app.post("/stream/inference_instruct")
async def inference_instruct(tts_text: str = Form(), spk_id: str = Form(), instruct_text: str = Form()):
    set_all_random_seed(generate_seed()["value"])
    model_output = cosyvoice.inference_instruct(tts_text, spk_id, instruct_text, stream=True)
    return StreamingResponse(generate_stream(model_output), headers=generate_header())

import torch
import threading

# Check the current GPU's memory usage
def check_memory_usage():
    allocated = torch.cuda.memory_allocated() / (1024 ** 2)  # convert to MB
    reserved = torch.cuda.memory_reserved() / (1024 ** 2)  # convert to MB
    total_memory = torch.cuda.get_device_properties(0).total_memory / (1024 ** 2)  # convert to MB
    logging.info(f"Allocated memory: {allocated:.2f} MB")
    logging.info(f"Reserved memory: {reserved:.2f} MB")
    logging.info(f"Total memory: {total_memory:.2f} MB")
    return allocated, reserved, total_memory

# Release cached but unused GPU memory
def release_memory():
    torch.cuda.empty_cache()
    logging.info("Memory has been released.")

# Check memory usage and release it when needed
def monitor_and_release_memory():
    allocated, reserved, total_memory = check_memory_usage()
    if allocated >= total_memory / 2:
        logging.info("Allocated memory exceeds half of the total memory. Releasing memory...")
        release_memory()
    else:
        logging.info("Memory usage is within acceptable limits.")

# Timer helper: run func immediately, then re-schedule it every `interval` seconds
def run_periodically(interval, func):
    def wrapper():
        func()
        threading.Timer(interval, wrapper).start()

    wrapper()

@app.get("/inference_instruct2")
@app.post("/inference_instruct2")
@@ -91,11 +189,12 @@ async def inference_instruct2(tts_text: str = Form(), instruct_text: str = Form(
                        default='iic/CosyVoice-300M',
                        help='local path or modelscope repo id')
    args = parser.parse_args()
    try:
        cosyvoice = CosyVoice(args.model_dir)
    except Exception:
        try:
            cosyvoice = CosyVoice2(args.model_dir)
        except Exception:
            raise TypeError('no valid model_type!')

    # Limit each process to a fraction of the GPU memory (disabled)
    #torch.cuda.set_per_process_memory_fraction(0.8, 0)
    #logging.info('Torch set_per_process_memory_fraction 0.8')
    cosyvoice = CosyVoice2(args.model_dir) if 'CosyVoice2' in args.model_dir else CosyVoice(args.model_dir)
    # Run monitor_and_release_memory every 10 minutes (600 seconds) (disabled)
    #run_periodically(600, monitor_and_release_memory)

    uvicorn.run(app, host="0.0.0.0", port=args.port)
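
For reference, a minimal client sketch for the routes above: the non-stream routes emit 16-bit PCM (`generate_data`), the `/stream/...` routes emit raw float32 samples (`generate_stream`), and both attach the sample rate via the `X-Custom-Header-sampleRate` header. Host, port, speaker id, and the output filename here are assumptions:

```python
import numpy as np
import requests
import soundfile as sf

# Hypothetical local server; port 50000 is assumed, check your --port flag
resp = requests.post(
    "http://127.0.0.1:50000/inference_sft",
    data={"tts_text": "Hello from CosyVoice.", "spk_id": "中文女"},  # speaker id assumed
    stream=True,
)
sample_rate = int(resp.headers["X-Custom-Header-sampleRate"])

# Non-stream routes yield int16 PCM chunks (see generate_data above)
pcm = b"".join(resp.iter_content(chunk_size=16000))
sf.write("sft_output.wav", np.frombuffer(pcm, dtype=np.int16), sample_rate)

# /stream/inference_sft yields raw float32 samples instead:
# np.frombuffer(pcm, dtype=np.float32)
```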