From 75b40a6274e7a98bb6200e89e25f34ffb9b1f36e Mon Sep 17 00:00:00 2001
From: tan_bw
Date: Thu, 10 Oct 2024 21:52:51 +0800
Subject: [PATCH 01/14] Interfaces for different deployment environments
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 deploy-hf.py                     |  49 ++++++
 deploy-hf.sh                     |  34 ++++
 runtime/python/fastapi/server.py |   8 +-
 webui-hf.py                      | 229 +++++++++++++++++++++++++
 webui-vc.py                      | 279 +++++++++++++++++++++++++++++++
 5 files changed, 595 insertions(+), 4 deletions(-)
 create mode 100644 deploy-hf.py
 create mode 100644 deploy-hf.sh
 create mode 100644 webui-hf.py
 create mode 100644 webui-vc.py

diff --git a/deploy-hf.py b/deploy-hf.py
new file mode 100644
index 000000000..d05a22ce8
--- /dev/null
+++ b/deploy-hf.py
@@ -0,0 +1,49 @@
+import logging
+import subprocess
+logging.basicConfig(level=logging.INFO)
+
+def run_shell_script(script_path):
+    """
+    Run the shell script at the given path, echoing its output to the console.
+
+    :param script_path: path of the shell script
+    """
+    try:
+        # run the shell script with subprocess.Popen
+        with subprocess.Popen(['bash', script_path], stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True) as proc:
+            # read the output
+            for line in proc.stdout:
+                print(line, end='')  # print output as it arrives
+            proc.stdout.close()
+            return_code = proc.wait()
+            if return_code:
+                print(f"Shell script failed, return code: {return_code}")
+    except Exception as e:
+        print(f"Error while running the shell script: {e}")
+
+# Run the deployment script (deploy-hf.sh is the script added by this patch;
+# the original called 'deploy.sh', which this patch does not create)
+run_shell_script('deploy-hf.sh')
+
+# Model download via the ModelScope SDK
+from modelscope import snapshot_download
+snapshot_download('iic/CosyVoice-300M', local_dir='pretrained_models/CosyVoice-300M')
+
+class Args:
+    def __init__(self):
+        self.port = 5000
+        self.model_dir = 'pretrained_models/CosyVoice-300M'
+
+from webui import main
+from cosyvoice.cli.cosyvoice import CosyVoice
+import numpy as np
+
+# create the args instance
+args = Args()
+
+cosyvoice = CosyVoice(args.model_dir)
+sft_spk = cosyvoice.list_avaliable_spks()
+prompt_sr, target_sr = 16000, 22050
+default_data = np.zeros(target_sr)
+
+# pass args (and the speaker list) when calling main
+main(args, sft_spk)
\ No newline at end of file
diff --git a/deploy-hf.sh b/deploy-hf.sh
new file mode 100644
index 000000000..7a1bea8ce
--- /dev/null
+++ b/deploy-hf.sh
@@ -0,0 +1,34 @@
+#!/bin/bash
+wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
+sh Miniconda3-latest-Linux-x86_64.sh -b
+source ~/miniconda3/bin/activate
+
+#conda create -n cosyvoice python=3.8
+#conda activate cosyvoice
+#conda install -y -c conda-forge pynini==2.1.5
+# pynini is required by WeTextProcessing, use conda to install it as it can be executed on all platforms.
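+# NOTE: requirements_.txt below is assumed to be a trimmed, HF-Space-specific copy
+# of the repo's requirements.txt; if your checkout only ships requirements.txt,
+# point the next line at that file instead.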
+pip install -r requirements_.txt + +# If you encounter sox compatibility issues +# ubuntu +apt-get -y update && apt-get -y install sox libsox-dev + +mkdir -p pretrained_models +#git clone https://www.modelscope.cn/iic/CosyVoice-300M.git pretrained_models/CosyVoice-300M +#git clone https://www.modelscope.cn/iic/CosyVoice-300M-25Hz.git pretrained_models/CosyVoice-300M-25Hz +#git clone https://www.modelscope.cn/iic/CosyVoice-300M-SFT.git pretrained_models/CosyVoice-300M-SFT +#git clone https://www.modelscope.cn/iic/CosyVoice-300M-Instruct.git pretrained_models/CosyVoice-300M-Instruct +#git clone https://www.modelscope.cn/iic/CosyVoice-ttsfrd.git pretrained_models/CosyVoice-ttsfrd +#huggingface-cli download model-scope/CosyVoice-300M --local-dir pretrained_models/CosyVoice-300M --token=$hf_token +#huggingface-cli download model-scope/CosyVoice-300M-SFT --local-dir pretrained_models/CosyVoice-300M-SFT --token=$hf_token +#huggingface-cli download FunAudioLLM/CosyVoice-ttsfrd --local-dir pretrained_models/CosyVoice-ttsfrd --token=$hf_token + +ls pretrained_models + +cd pretrained_models/CosyVoice-ttsfrd/ +unzip resource.zip -d . +pip install ttsfrd-0.3.6-cp38-cp38-linux_x86_64.whl + +export PYTHONPATH=third_party/Matcha-TTS + +python3 webui.py \ No newline at end of file diff --git a/runtime/python/fastapi/server.py b/runtime/python/fastapi/server.py index bfe4a56b8..bf0e5cb37 100644 --- a/runtime/python/fastapi/server.py +++ b/runtime/python/fastapi/server.py @@ -43,27 +43,27 @@ def generate_data(model_output): yield tts_audio -@app.get("/inference_sft") +@app.post("/inference_sft") async def inference_sft(tts_text: str = Form(), spk_id: str = Form()): model_output = cosyvoice.inference_sft(tts_text, spk_id) return StreamingResponse(generate_data(model_output)) -@app.get("/inference_zero_shot") +@app.post("/inference_zero_shot") async def inference_zero_shot(tts_text: str = Form(), prompt_text: str = Form(), prompt_wav: UploadFile = File()): prompt_speech_16k = load_wav(prompt_wav.file, 16000) model_output = cosyvoice.inference_zero_shot(tts_text, prompt_text, prompt_speech_16k) return StreamingResponse(generate_data(model_output)) -@app.get("/inference_cross_lingual") +@app.post("/inference_cross_lingual") async def inference_cross_lingual(tts_text: str = Form(), prompt_wav: UploadFile = File()): prompt_speech_16k = load_wav(prompt_wav.file, 16000) model_output = cosyvoice.inference_cross_lingual(tts_text, prompt_speech_16k) return StreamingResponse(generate_data(model_output)) -@app.get("/inference_instruct") +@app.post("/inference_instruct") async def inference_instruct(tts_text: str = Form(), spk_id: str = Form(), instruct_text: str = Form()): model_output = cosyvoice.inference_instruct(tts_text, spk_id, instruct_text) return StreamingResponse(generate_data(model_output)) diff --git a/webui-hf.py b/webui-hf.py new file mode 100644 index 000000000..7fb6615cd --- /dev/null +++ b/webui-hf.py @@ -0,0 +1,229 @@ +# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Liu Yue) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +import os +import sys +import argparse +import gradio as gr +import numpy as np +#os.system('pip install torchaudio==2.0.2') +import torch +import torchaudio +import random +import librosa +import spaces + +ROOT_DIR = os.path.dirname(os.path.abspath(__file__)) +sys.path.append('{}/third_party/Matcha-TTS'.format(ROOT_DIR)) +from cosyvoice.cli.cosyvoice import CosyVoice +from cosyvoice.utils.file_utils import load_wav, logging +from cosyvoice.utils.common import set_all_random_seed +import logging +logging.basicConfig(level=logging.INFO) + +inference_mode_list = ['预训练音色', '3s极速复刻', '跨语种复刻', '自然语言控制'] +instruct_dict = {'预训练音色': '1. 选择预训练音色\n2. 点击生成音频按钮', + '3s极速复刻': '1. 选择prompt音频文件,或录入prompt音频,注意不超过30s,若同时提供,优先选择prompt音频文件\n2. 输入prompt文本\n3. 点击生成音频按钮', + '跨语种复刻': '1. 选择prompt音频文件,或录入prompt音频,注意不超过30s,若同时提供,优先选择prompt音频文件\n2. 点击生成音频按钮', + '自然语言控制': '1. 选择预训练音色\n2. 输入instruct文本\n3. 点击生成音频按钮'} +stream_mode_list = [('否', False), ('是', True)] +max_val = 0.8 + +@spaces.GPU +def generate_seed(): + seed = random.randint(1, 100000000) + return { + "__type__": "update", + "value": seed + } + +@spaces.GPU +def postprocess(speech, top_db=60, hop_length=220, win_length=440): + speech, _ = librosa.effects.trim( + speech, top_db=top_db, + frame_length=win_length, + hop_length=hop_length + ) + if speech.abs().max() > max_val: + speech = speech / speech.abs().max() * max_val + speech = torch.concat([speech, torch.zeros(1, int(target_sr * 0.2))], dim=1) + return speech + +@spaces.GPU +def change_instruction(mode_checkbox_group): + return instruct_dict[mode_checkbox_group] + +@spaces.GPU +def generate_audio(tts_text, mode_checkbox_group, sft_dropdown, prompt_text, prompt_wav_upload, prompt_wav_record, instruct_text, + seed, stream, speed): + if prompt_wav_upload is not None: + prompt_wav = prompt_wav_upload + elif prompt_wav_record is not None: + prompt_wav = prompt_wav_record + else: + prompt_wav = None + # if instruct mode, please make sure that model is iic/CosyVoice-300M-Instruct and not cross_lingual mode + if mode_checkbox_group in ['自然语言控制']: + if get_cosyvoice().frontend.instruct is False: + gr.Warning('您正在使用自然语言控制模式, {}模型不支持此模式, 请使用iic/CosyVoice-300M-Instruct模型'.format(args.model_dir)) + yield (target_sr, default_data) + if instruct_text == '': + gr.Warning('您正在使用自然语言控制模式, 请输入instruct文本') + yield (target_sr, default_data) + if prompt_wav is not None or prompt_text != '': + gr.Info('您正在使用自然语言控制模式, prompt音频/prompt文本会被忽略') + # if cross_lingual mode, please make sure that model is iic/CosyVoice-300M and tts_text prompt_text are different language + if mode_checkbox_group in ['跨语种复刻']: + if get_cosyvoice().frontend.instruct is True: + gr.Warning('您正在使用跨语种复刻模式, {}模型不支持此模式, 请使用iic/CosyVoice-300M模型'.format(args.model_dir)) + yield (target_sr, default_data) + if instruct_text != '': + gr.Info('您正在使用跨语种复刻模式, instruct文本会被忽略') + if prompt_wav is None: + gr.Warning('您正在使用跨语种复刻模式, 请提供prompt音频') + yield (target_sr, default_data) + gr.Info('您正在使用跨语种复刻模式, 请确保合成文本和prompt文本为不同语言') + # if in zero_shot cross_lingual, please make sure that prompt_text and prompt_wav meets requirements + if mode_checkbox_group in ['3s极速复刻', '跨语种复刻']: + if prompt_wav is None: + gr.Warning('prompt音频为空,您是否忘记输入prompt音频?') + yield (target_sr, default_data) + if torchaudio.info(prompt_wav).sample_rate < prompt_sr: + gr.Warning('prompt音频采样率{}低于{}'.format(torchaudio.info(prompt_wav).sample_rate, prompt_sr)) + yield (target_sr, default_data) + # 
sft mode only use sft_dropdown + if mode_checkbox_group in ['预训练音色']: + if instruct_text != '' or prompt_wav is not None or prompt_text != '': + gr.Info('您正在使用预训练音色模式,prompt文本/prompt音频/instruct文本会被忽略!') + # zero_shot mode only use prompt_wav prompt text + if mode_checkbox_group in ['3s极速复刻']: + if prompt_text == '': + gr.Warning('prompt文本为空,您是否忘记输入prompt文本?') + yield (target_sr, default_data) + if instruct_text != '': + gr.Info('您正在使用3s极速复刻模式,预训练音色/instruct文本会被忽略!') + + if mode_checkbox_group == '预训练音色': + logging.info('get sft inference request') + set_all_random_seed(seed) + for i in get_cosyvoice().inference_sft(tts_text, sft_dropdown, stream=stream, speed=speed): + yield (target_sr, i['tts_speech'].numpy().flatten()) + elif mode_checkbox_group == '3s极速复刻': + logging.info('get zero_shot inference request') + prompt_speech_16k = postprocess(load_wav(prompt_wav, prompt_sr)) + set_all_random_seed(seed) + for i in get_cosyvoice().inference_zero_shot(tts_text, prompt_text, prompt_speech_16k, stream=stream, speed=speed): + yield (target_sr, i['tts_speech'].numpy().flatten()) + elif mode_checkbox_group == '跨语种复刻': + logging.info('get cross_lingual inference request') + prompt_speech_16k = postprocess(load_wav(prompt_wav, prompt_sr)) + set_all_random_seed(seed) + for i in get_cosyvoice().inference_cross_lingual(tts_text, prompt_speech_16k, stream=stream, speed=speed): + yield (target_sr, i['tts_speech'].numpy().flatten()) + else: + logging.info('get instruct inference request') + set_all_random_seed(seed) + for i in get_cosyvoice().inference_instruct(tts_text, sft_dropdown, instruct_text, stream=stream, speed=speed): + yield (target_sr, i['tts_speech'].numpy().flatten()) + +# SDK模型下载 +import platform +import threading +python_version = platform.python_version() +print("Python version:", python_version) +from huggingface_hub import dump_environment_info +dump_environment_info() + +os.system('mkdir -p pretrained_models') +os.system('git clone https://huggingface.co/FunAudioLLM/CosyVoice-300M pretrained_models/CosyVoice-300M') +os.system('cd pretrained_models/CosyVoice-300M && git checkout 39c4e13d46bd4dfb840d214547623e5fcd2428e2') + +#os.system('git clone https://huggingface.co/FunAudioLLM/CosyVoice-300M-SFT pretrained_models/CosyVoice-300M-SFT') +#os.system('cd pretrained_models/CosyVoice-300M-SFT && git checkout 096a5cff8d497fabb3dec2756a200f3688457a1b') + +#os.system('git clone https://huggingface.co/FunAudioLLM/CosyVoice-300M-Instruct pretrained_models/CosyVoice-300M-Instruct') +#os.system('cd pretrained_models/CosyVoice-300M-Instruct && git checkout ba5265d9a3169c1fedce145122c9dd4bc24e062c') + +os.system('apt-get -y update && apt-get -y install sox libsox-dev') + +parser = argparse.ArgumentParser() +parser.add_argument('--model_dir', + type=str, + default='pretrained_models/CosyVoice-300M', + help='local path or modelscope repo id') +args = parser.parse_args() + +cosyvoice_instance = None +model_dir=args.model_dir +cosyvoice_lock = threading.Lock() + +@spaces.GPU +def get_cosyvoice(): + global cosyvoice_instance, model_dir + with cosyvoice_lock: + if cosyvoice_instance is not None: + return cosyvoice_instance + cosyvoice_instance=CosyVoice(model_dir) + return cosyvoice_instance + +@spaces.GPU +def load_sft_options(): + sound_choices=get_cosyvoice().list_avaliable_spks() + sound_choices_tuples = [(choice, choice) for choice in sound_choices] + return sound_choices_tuples + + +prompt_sr, target_sr = 16000, 22050 +default_data = np.zeros(target_sr) + +with gr.Blocks() as demo: + 
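+    # NOTE: sft_dropdown starts empty on purpose; the "加载预训练音色" button calls
+    # load_sft_options(), which triggers the lazy, lock-guarded model construction
+    # in get_cosyvoice(), so the model is loaded on first use rather than at import time.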
gr.Markdown("### 代码库 [CosyVoice](https://github.com/FunAudioLLM/CosyVoice) \ + 预训练模型 [CosyVoice-300M](https://www.modelscope.cn/models/iic/CosyVoice-300M) \ + [CosyVoice-300M-Instruct](https://www.modelscope.cn/models/iic/CosyVoice-300M-Instruct) \ + [CosyVoice-300M-SFT](https://www.modelscope.cn/models/iic/CosyVoice-300M-SFT)") + gr.Markdown("#### 请输入需要合成的文本,选择推理模式,并按照提示步骤进行操作") + + tts_text = gr.Textbox(label="输入合成文本", lines=1, value="我是通义实验室语音团队全新推出的生成式语音大模型,提供舒适自然的语音合成能力。") + with gr.Row(): + mode_checkbox_group = gr.Radio(choices=inference_mode_list, label='选择推理模式', value=inference_mode_list[0]) + instruction_text = gr.Text(label="操作步骤", value=instruct_dict[inference_mode_list[0]], scale=0.5) + sft_dropdown = gr.Dropdown(choices=[], label='选择预训练音色', scale=0.25,allow_custom_value=True) + load_sft_button = gr.Button("加载预训练音色") + load_sft_button.click(load_sft_options, outputs=sft_dropdown) + stream = gr.Radio(choices=stream_mode_list, label='是否流式推理', value=stream_mode_list[0][1]) + speed = gr.Number(value=1, label="速度调节(仅支持非流式推理)", minimum=0.5, maximum=2.0, step=0.1) + with gr.Column(scale=0.25): + seed_button = gr.Button(value="\U0001F3B2") + seed = gr.Number(value=0, label="随机推理种子") + + with gr.Row(): + prompt_wav_upload = gr.Audio(sources='upload', type='filepath', label='选择prompt音频文件,注意采样率不低于16khz') + prompt_wav_record = gr.Audio(sources='microphone', type='filepath', label='录制prompt音频文件') + prompt_text = gr.Textbox(label="输入prompt文本", lines=1, placeholder="请输入prompt文本,需与prompt音频内容一致,暂时不支持自动识别...", value='') + instruct_text = gr.Textbox(label="输入instruct文本", lines=1, placeholder="请输入instruct文本.", value='') + + generate_button = gr.Button("生成音频") + + audio_output = gr.Audio(label="合成音频", autoplay=True, streaming=True) + + seed_button.click(generate_seed, inputs=[], outputs=seed) + generate_button.click(generate_audio, + inputs=[tts_text, mode_checkbox_group, sft_dropdown, prompt_text, prompt_wav_upload, prompt_wav_record, instruct_text, + seed, stream, speed], + outputs=[audio_output]) + mode_checkbox_group.change(fn=change_instruction, inputs=[mode_checkbox_group], outputs=[instruction_text]) +demo.queue(max_size=4, default_concurrency_limit=2) +demo.launch(share=True) + + + diff --git a/webui-vc.py b/webui-vc.py new file mode 100644 index 000000000..4ab3bd919 --- /dev/null +++ b/webui-vc.py @@ -0,0 +1,279 @@ +# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Liu Yue) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +import sys +import argparse +import gradio as gr +import numpy as np +import torch +import torchaudio +import random +import librosa +ROOT_DIR = os.path.dirname(os.path.abspath(__file__)) +sys.path.append('{}/third_party/Matcha-TTS'.format(ROOT_DIR)) +from cosyvoice.cli.cosyvoice import CosyVoice +from cosyvoice.utils.file_utils import load_wav, logging + +inference_mode_list = ['预训练音色', '3s极速复刻', '跨语种复刻', '自然语言控制'] +instruct_dict = {'预训练音色': '1. 选择预训练音色\n2. 点击生成音频按钮', + '3s极速复刻': '1. 选择prompt音频文件,或录入prompt音频,注意不超过30s,若同时提供,优先选择prompt音频文件\n2. 输入prompt文本\n3. 
点击生成音频按钮', + '跨语种复刻': '1. 选择prompt音频文件,或录入prompt音频,注意不超过30s,若同时提供,优先选择prompt音频文件\n2. 点击生成音频按钮', + '自然语言控制': '1. 选择预训练音色\n2. 输入instruct文本\n3. 点击生成音频按钮'} +stream_mode_list = [('否', False), ('是', True)] +max_val = 0.8 + + +def generate_seed(): + seed = random.randint(1, 100000000) + return { + "__type__": "update", + "value": seed + } + + +def set_all_random_seed(seed): + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + + +def postprocess(speech, top_db=60, hop_length=220, win_length=440): + speech, _ = librosa.effects.trim( + speech, top_db=top_db, + frame_length=win_length, + hop_length=hop_length + ) + if speech.abs().max() > max_val: + speech = speech / speech.abs().max() * max_val + speech = torch.concat([speech, torch.zeros(1, int(target_sr * 0.2))], dim=1) + return speech + + +def change_instruction(mode_checkbox_group): + return instruct_dict[mode_checkbox_group] + + +def generate_audio(tts_text, mode_checkbox_group, sft_dropdown, prompt_text, prompt_wav_upload, prompt_wav_record, instruct_text, + seed, stream, speed_factor): + if prompt_wav_upload is not None: + prompt_wav = prompt_wav_upload + elif prompt_wav_record is not None: + prompt_wav = prompt_wav_record + else: + prompt_wav = None + # if instruct mode, please make sure that model is iic/CosyVoice-300M-Instruct and not cross_lingual mode + if mode_checkbox_group in ['自然语言控制']: + if cosyvoice.frontend.instruct is False: + gr.Warning('您正在使用自然语言控制模式, {}模型不支持此模式, 请使用iic/CosyVoice-300M-Instruct模型'.format(args.model_dir)) + yield (target_sr, default_data) + if instruct_text == '': + gr.Warning('您正在使用自然语言控制模式, 请输入instruct文本') + yield (target_sr, default_data) + if prompt_wav is not None or prompt_text != '': + gr.Info('您正在使用自然语言控制模式, prompt音频/prompt文本会被忽略') + # if cross_lingual mode, please make sure that model is iic/CosyVoice-300M and tts_text prompt_text are different language + if mode_checkbox_group in ['跨语种复刻']: + if cosyvoice.frontend.instruct is True: + gr.Warning('您正在使用跨语种复刻模式, {}模型不支持此模式, 请使用iic/CosyVoice-300M模型'.format(args.model_dir)) + yield (target_sr, default_data) + if instruct_text != '': + gr.Info('您正在使用跨语种复刻模式, instruct文本会被忽略') + if prompt_wav is None: + gr.Warning('您正在使用跨语种复刻模式, 请提供prompt音频') + yield (target_sr, default_data) + gr.Info('您正在使用跨语种复刻模式, 请确保合成文本和prompt文本为不同语言') + # if in zero_shot cross_lingual, please make sure that prompt_text and prompt_wav meets requirements + if mode_checkbox_group in ['3s极速复刻', '跨语种复刻']: + if prompt_wav is None: + gr.Warning('prompt音频为空,您是否忘记输入prompt音频?') + yield (target_sr, default_data) + if torchaudio.info(prompt_wav).sample_rate < prompt_sr: + gr.Warning('prompt音频采样率{}低于{}'.format(torchaudio.info(prompt_wav).sample_rate, prompt_sr)) + yield (target_sr, default_data) + # sft mode only use sft_dropdown + if mode_checkbox_group in ['预训练音色']: + if instruct_text != '' or prompt_wav is not None or prompt_text != '': + gr.Info('您正在使用预训练音色模式,prompt文本/prompt音频/instruct文本会被忽略!') + # zero_shot mode only use prompt_wav prompt text + if mode_checkbox_group in ['3s极速复刻']: + if prompt_text == '': + gr.Warning('prompt文本为空,您是否忘记输入prompt文本?') + yield (target_sr, default_data) + if instruct_text != '': + gr.Info('您正在使用3s极速复刻模式,预训练音色/instruct文本会被忽略!') + + if mode_checkbox_group == '预训练音色': + logging.info('get sft inference request') + set_all_random_seed(seed) + for i in cosyvoice.inference_sft(tts_text, sft_dropdown, stream=stream): + yield (target_sr, i['tts_speech'].numpy().flatten()) + elif mode_checkbox_group == '3s极速复刻': + 
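+        # zero-shot branch: postprocess() trims silence and peak-normalizes the
+        # prompt audio before it is paired with its transcript for cloning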
logging.info('get zero_shot inference request') + prompt_speech_16k = postprocess(load_wav(prompt_wav, prompt_sr)) + set_all_random_seed(seed) + for i in cosyvoice.inference_zero_shot(tts_text, prompt_text, prompt_speech_16k, stream=stream): + yield (target_sr, i['tts_speech'].numpy().flatten()) + elif mode_checkbox_group == '跨语种复刻': + logging.info('get cross_lingual inference request') + prompt_speech_16k = postprocess(load_wav(prompt_wav, prompt_sr)) + set_all_random_seed(seed) + for i in cosyvoice.inference_cross_lingual(tts_text, prompt_speech_16k, stream=stream): + yield (target_sr, i['tts_speech'].numpy().flatten()) + else: + logging.info('get instruct inference request') + set_all_random_seed(seed) + for i in cosyvoice.inference_instruct(tts_text, sft_dropdown, instruct_text, stream=stream): + yield (target_sr, i['tts_speech'].numpy().flatten()) + + +def generate_audiofile(tts_text, mode_checkbox_group, sft_dropdown, prompt_text, prompt_wav_upload, prompt_wav_record, instruct_text, + seed, speed_factor): + stream=False + if prompt_wav_upload is not None: + prompt_wav = prompt_wav_upload + elif prompt_wav_record is not None: + prompt_wav = prompt_wav_record + else: + prompt_wav = None + # if instruct mode, please make sure that model is iic/CosyVoice-300M-Instruct and not cross_lingual mode + if mode_checkbox_group in ['自然语言控制']: + if cosyvoice.frontend.instruct is False: + gr.Warning('您正在使用自然语言控制模式, {}模型不支持此模式, 请使用iic/CosyVoice-300M-Instruct模型'.format(args.model_dir)) + return (target_sr, default_data) + if instruct_text == '': + gr.Warning('您正在使用自然语言控制模式, 请输入instruct文本') + return (target_sr, default_data) + if prompt_wav is not None or prompt_text != '': + gr.Info('您正在使用自然语言控制模式, prompt音频/prompt文本会被忽略') + # if cross_lingual mode, please make sure that model is iic/CosyVoice-300M and tts_text prompt_text are different language + if mode_checkbox_group in ['跨语种复刻']: + if cosyvoice.frontend.instruct is True: + gr.Warning('您正在使用跨语种复刻模式, {}模型不支持此模式, 请使用iic/CosyVoice-300M模型'.format(args.model_dir)) + return (target_sr, default_data) + if instruct_text != '': + gr.Info('您正在使用跨语种复刻模式, instruct文本会被忽略') + if prompt_wav is None: + gr.Warning('您正在使用跨语种复刻模式, 请提供prompt音频') + return (target_sr, default_data) + gr.Info('您正在使用跨语种复刻模式, 请确保合成文本和prompt文本为不同语言') + # if in zero_shot cross_lingual, please make sure that prompt_text and prompt_wav meets requirements + if mode_checkbox_group in ['3s极速复刻', '跨语种复刻']: + if prompt_wav is None: + gr.Warning('prompt音频为空,您是否忘记输入prompt音频?') + return (target_sr, default_data) + if torchaudio.info(prompt_wav).sample_rate < prompt_sr: + gr.Warning('prompt音频采样率{}低于{}'.format(torchaudio.info(prompt_wav).sample_rate, prompt_sr)) + return (target_sr, default_data) + # sft mode only use sft_dropdown + if mode_checkbox_group in ['预训练音色']: + if instruct_text != '' or prompt_wav is not None or prompt_text != '': + gr.Info('您正在使用预训练音色模式,prompt文本/prompt音频/instruct文本会被忽略!') + # zero_shot mode only use prompt_wav prompt text + if mode_checkbox_group in ['3s极速复刻']: + if prompt_text == '': + gr.Warning('prompt文本为空,您是否忘记输入prompt文本?') + return (target_sr, default_data) + if instruct_text != '': + gr.Info('您正在使用3s极速复刻模式,预训练音色/instruct文本会被忽略!') + + if mode_checkbox_group == '预训练音色': + logging.info('get sft inference request') + set_all_random_seed(seed) + i=cosyvoice.inference_sft(tts_text, sft_dropdown, stream=stream) + audio_segments = [] + for item in i: + # 将每个元素的 'tts_speech' 转换为 NumPy 数组并展平,然后添加到列表中 + audio_segments.append(item['tts_speech'].numpy().flatten()) + full_audio = 
np.concatenate(audio_segments, axis=0)
+
+        return (target_sr, full_audio)
+    elif mode_checkbox_group == '3s极速复刻':
+        logging.info('get zero_shot inference request')
+        prompt_speech_16k = postprocess(load_wav(prompt_wav, prompt_sr))
+        set_all_random_seed(seed)
+        i = cosyvoice.inference_zero_shot(tts_text, prompt_text, prompt_speech_16k, stream=stream)
+        # inference_zero_shot returns a generator, so collect and concatenate the
+        # segments instead of subscripting the generator directly
+        audio_segments = [item['tts_speech'].numpy().flatten() for item in i]
+        return (target_sr, np.concatenate(audio_segments, axis=0))
+    elif mode_checkbox_group == '跨语种复刻':
+        logging.info('get cross_lingual inference request')
+        prompt_speech_16k = postprocess(load_wav(prompt_wav, prompt_sr))
+        set_all_random_seed(seed)
+        i = cosyvoice.inference_cross_lingual(tts_text, prompt_speech_16k, stream=stream)
+        audio_segments = [item['tts_speech'].numpy().flatten() for item in i]
+        return (target_sr, np.concatenate(audio_segments, axis=0))
+    else:
+        logging.info('get instruct inference request')
+        set_all_random_seed(seed)
+        i = cosyvoice.inference_instruct(tts_text, sft_dropdown, instruct_text, stream=stream)
+        audio_segments = [item['tts_speech'].numpy().flatten() for item in i]
+        return (target_sr, np.concatenate(audio_segments, axis=0))
+
+
+def main():
+    with gr.Blocks() as demo:
+        gr.Markdown("### 代码库 [CosyVoice](https://github.com/FunAudioLLM/CosyVoice) \
+                    预训练模型 [CosyVoice-300M](https://www.modelscope.cn/models/iic/CosyVoice-300M) \
+                    [CosyVoice-300M-Instruct](https://www.modelscope.cn/models/iic/CosyVoice-300M-Instruct) \
+                    [CosyVoice-300M-SFT](https://www.modelscope.cn/models/iic/CosyVoice-300M-SFT)")
+        gr.Markdown("#### 请输入需要合成的文本,选择推理模式,并按照提示步骤进行操作")
+
+        tts_text = gr.Textbox(label="输入合成文本", lines=1, value="我是通义实验室语音团队全新推出的生成式语音大模型,提供舒适自然的语音合成能力。")
+        speed_factor = gr.Slider(minimum=0.25, maximum=4, step=0.05, label="语速调节", value=1.0, interactive=True)
+        with gr.Row():
+            mode_checkbox_group = gr.Radio(choices=inference_mode_list, label='选择推理模式', value=inference_mode_list[0])
+            instruction_text = gr.Text(label="操作步骤", value=instruct_dict[inference_mode_list[0]], scale=0.5)
+            sft_dropdown = gr.Dropdown(choices=sft_spk, label='选择预训练音色', value=sft_spk[0], scale=0.25)
+            stream = gr.Radio(choices=stream_mode_list, label='是否流式推理', value=stream_mode_list[0][1])
+            with gr.Column(scale=0.25):
+                seed_button = gr.Button(value="\U0001F3B2")
+                seed = gr.Number(value=0, label="随机推理种子")
+
+        with gr.Row():
+            prompt_wav_upload = gr.Audio(sources='upload', type='filepath', label='选择prompt音频文件,注意采样率不低于16khz')
+            prompt_wav_record = gr.Audio(sources='microphone', type='filepath', label='录制prompt音频文件')
+        prompt_text = gr.Textbox(label="输入prompt文本", lines=1, placeholder="请输入prompt文本,需与prompt音频内容一致,暂时不支持自动识别...", value='')
+        instruct_text = gr.Textbox(label="输入instruct文本", lines=1, placeholder="请输入instruct文本.", value='')
+
+        generate_button = gr.Button("生成音频")
+
+        # audio_output = gr.Audio(label="合成音频", autoplay=True, streaming=True)
+        audio_output = gr.Audio(label="合成音频", autoplay=True)
+        seed_button.click(generate_seed, inputs=[], outputs=seed)
+        generate_button.click(generate_audiofile,
+                              inputs=[tts_text, mode_checkbox_group, sft_dropdown, prompt_text, prompt_wav_upload, prompt_wav_record, instruct_text,
+                                      seed, speed_factor],
+                              outputs=audio_output)
+        # generate_button.click(generate_audio,
+        #                       inputs=[tts_text, mode_checkbox_group, sft_dropdown, prompt_text, prompt_wav_upload, prompt_wav_record, instruct_text,
+        #                               seed, stream, speed_factor],
+        #                       outputs=[audio_output])
+        mode_checkbox_group.change(fn=change_instruction, inputs=[mode_checkbox_group], outputs=[instruction_text])
+    demo.queue(max_size=4, default_concurrency_limit=2)
+    demo.launch(server_name='0.0.0.0', server_port=args.port, share=True, allowed_paths=['/'], auth=("admin", "pass1234"))
+
+
+if __name__ == '__main__':
+    parser =
argparse.ArgumentParser() + parser.add_argument('--port', + type=int, + default=8000) + parser.add_argument('--model_dir', + type=str, + default='pretrained_models/CosyVoice-300M', + help='local path or modelscope repo id') + args = parser.parse_args() + cosyvoice = CosyVoice(args.model_dir) + sft_spk = cosyvoice.list_avaliable_spks() + print("可用音色:" + ", ".join(sft_spk)) + prompt_sr, target_sr = 16000, 22050 + default_data = np.zeros(target_sr) + main() From 8d78ca0c6d432a08d5554fe62d9458164db800b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=BD=A0=E8=A2=88=E8=A3=9F=E7=A9=BF=E5=8F=8D=E4=BA=86?= Date: Mon, 30 Dec 2024 04:35:26 +0800 Subject: [PATCH 02/14] Update server.py --- runtime/python/fastapi/server.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/runtime/python/fastapi/server.py b/runtime/python/fastapi/server.py index bf0e5cb37..7a351f664 100644 --- a/runtime/python/fastapi/server.py +++ b/runtime/python/fastapi/server.py @@ -55,6 +55,16 @@ async def inference_zero_shot(tts_text: str = Form(), prompt_text: str = Form(), model_output = cosyvoice.inference_zero_shot(tts_text, prompt_text, prompt_speech_16k) return StreamingResponse(generate_data(model_output)) +@app.post("stream/inference_zero_shot") +async def inference_zero_shot(tts_text: str = Form(), prompt_text: str = Form(), prompt_wav: UploadFile = File()): + prompt_speech_16k = load_wav(prompt_wav.file, 16000) + model_output = cosyvoice.inference_zero_shot(tts_text, prompt_text, prompt_speech_16k) + return StreamingResponse(generate_stream(model_output)) + +def generate_stream(model_output): + for i in model_output: + tts_audio = i['tts_speech'].numpy().tobytes() + yield tts_audio @app.post("/inference_cross_lingual") async def inference_cross_lingual(tts_text: str = Form(), prompt_wav: UploadFile = File()): From b167233d2637aa9156f20645af3b29274d3872b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=BD=A0=E8=A2=88=E8=A3=9F=E7=A9=BF=E5=8F=8D=E4=BA=86?= Date: Mon, 30 Dec 2024 16:24:03 +0800 Subject: [PATCH 03/14] Update Dockerfile --- docker/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 60b101fc1..ac641814e 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -42,7 +42,7 @@ WORKDIR /workspace ENV PYTHONPATH="${PYTHONPATH}:/workspace/CosyVoice:/workspace/CosyVoice/third_party/Matcha-TTS" -RUN git clone --recursive https://github.com/FunAudioLLM/CosyVoice.git +RUN git clone --recursive https://github.com/tanbw/CosyVoice.git RUN conda activate ${VENV} && conda install -y -c conda-forge pynini==2.1.5 RUN conda activate ${VENV} && cd CosyVoice && \ From 8225fd2c7dda9b6f5791232bf6cc501763634d1a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=BD=A0=E8=A2=88=E8=A3=9F=E7=A9=BF=E5=8F=8D=E4=BA=86?= Date: Wed, 1 Jan 2025 05:26:13 +0800 Subject: [PATCH 04/14] Update server.py --- runtime/python/fastapi/server.py | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/runtime/python/fastapi/server.py b/runtime/python/fastapi/server.py index 7a351f664..5a8f06d1f 100644 --- a/runtime/python/fastapi/server.py +++ b/runtime/python/fastapi/server.py @@ -42,12 +42,20 @@ def generate_data(model_output): tts_audio = (i['tts_speech'].numpy() * (2 ** 15)).astype(np.int16).tobytes() yield tts_audio +def generate_stream(model_output): + for i in model_output: + tts_audio = i['tts_speech'].numpy().tobytes() + yield tts_audio @app.post("/inference_sft") async def inference_sft(tts_text: str = Form(), spk_id: str = Form()): 
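    # unchanged non-streaming route: generate_data() rescales the float32 output to 16-bit PCM bytes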
model_output = cosyvoice.inference_sft(tts_text, spk_id) return StreamingResponse(generate_data(model_output)) +@app.post("/stream/inference_sft") +async def inference_sft(tts_text: str = Form(), spk_id: str = Form()): + model_output = cosyvoice.inference_sft(tts_text, spk_id) + return StreamingResponse(generate_stream(model_output)) @app.post("/inference_zero_shot") async def inference_zero_shot(tts_text: str = Form(), prompt_text: str = Form(), prompt_wav: UploadFile = File()): @@ -55,29 +63,33 @@ async def inference_zero_shot(tts_text: str = Form(), prompt_text: str = Form(), model_output = cosyvoice.inference_zero_shot(tts_text, prompt_text, prompt_speech_16k) return StreamingResponse(generate_data(model_output)) -@app.post("stream/inference_zero_shot") +@app.post("/stream/inference_zero_shot") async def inference_zero_shot(tts_text: str = Form(), prompt_text: str = Form(), prompt_wav: UploadFile = File()): prompt_speech_16k = load_wav(prompt_wav.file, 16000) model_output = cosyvoice.inference_zero_shot(tts_text, prompt_text, prompt_speech_16k) return StreamingResponse(generate_stream(model_output)) -def generate_stream(model_output): - for i in model_output: - tts_audio = i['tts_speech'].numpy().tobytes() - yield tts_audio - @app.post("/inference_cross_lingual") async def inference_cross_lingual(tts_text: str = Form(), prompt_wav: UploadFile = File()): prompt_speech_16k = load_wav(prompt_wav.file, 16000) model_output = cosyvoice.inference_cross_lingual(tts_text, prompt_speech_16k) return StreamingResponse(generate_data(model_output)) - + +@app.post("/stream/inference_cross_lingual") +async def inference_cross_lingual(tts_text: str = Form(), prompt_wav: UploadFile = File()): + prompt_speech_16k = load_wav(prompt_wav.file, 16000) + model_output = cosyvoice.inference_cross_lingual(tts_text, prompt_speech_16k) + return StreamingResponse(generate_stream(model_output)) @app.post("/inference_instruct") async def inference_instruct(tts_text: str = Form(), spk_id: str = Form(), instruct_text: str = Form()): model_output = cosyvoice.inference_instruct(tts_text, spk_id, instruct_text) return StreamingResponse(generate_data(model_output)) +@app.post("/stream/inference_instruct") +async def inference_instruct(tts_text: str = Form(), spk_id: str = Form(), instruct_text: str = Form()): + model_output = cosyvoice.inference_instruct(tts_text, spk_id, instruct_text) + return StreamingResponse(generate_stream(model_output)) if __name__ == '__main__': parser = argparse.ArgumentParser() From 3aab63600448c850d694ec63ebd0fe62fdf55d5c Mon Sep 17 00:00:00 2001 From: tan_bw Date: Wed, 1 Jan 2025 20:49:02 +0800 Subject: [PATCH 05/14] no message --- runtime/python/fastapi/server.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/runtime/python/fastapi/server.py b/runtime/python/fastapi/server.py index 5a8f06d1f..4c9e55ff3 100644 --- a/runtime/python/fastapi/server.py +++ b/runtime/python/fastapi/server.py @@ -54,7 +54,7 @@ async def inference_sft(tts_text: str = Form(), spk_id: str = Form()): @app.post("/stream/inference_sft") async def inference_sft(tts_text: str = Form(), spk_id: str = Form()): - model_output = cosyvoice.inference_sft(tts_text, spk_id) + model_output = cosyvoice.inference_sft(tts_text, spk_id, stream = True) return StreamingResponse(generate_stream(model_output)) @app.post("/inference_zero_shot") @@ -66,7 +66,7 @@ async def inference_zero_shot(tts_text: str = Form(), prompt_text: str = Form(), @app.post("/stream/inference_zero_shot") async def 
inference_zero_shot(tts_text: str = Form(), prompt_text: str = Form(), prompt_wav: UploadFile = File()): prompt_speech_16k = load_wav(prompt_wav.file, 16000) - model_output = cosyvoice.inference_zero_shot(tts_text, prompt_text, prompt_speech_16k) + model_output = cosyvoice.inference_zero_shot(tts_text, prompt_text, prompt_speech_16k, stream = True) return StreamingResponse(generate_stream(model_output)) @app.post("/inference_cross_lingual") @@ -78,7 +78,7 @@ async def inference_cross_lingual(tts_text: str = Form(), prompt_wav: UploadFile @app.post("/stream/inference_cross_lingual") async def inference_cross_lingual(tts_text: str = Form(), prompt_wav: UploadFile = File()): prompt_speech_16k = load_wav(prompt_wav.file, 16000) - model_output = cosyvoice.inference_cross_lingual(tts_text, prompt_speech_16k) + model_output = cosyvoice.inference_cross_lingual(tts_text, prompt_speech_16k, stream = True) return StreamingResponse(generate_stream(model_output)) @app.post("/inference_instruct") @@ -88,7 +88,7 @@ async def inference_instruct(tts_text: str = Form(), spk_id: str = Form(), instr @app.post("/stream/inference_instruct") async def inference_instruct(tts_text: str = Form(), spk_id: str = Form(), instruct_text: str = Form()): - model_output = cosyvoice.inference_instruct(tts_text, spk_id, instruct_text) + model_output = cosyvoice.inference_instruct(tts_text, spk_id, instruct_text, stream = True) return StreamingResponse(generate_stream(model_output)) if __name__ == '__main__': @@ -101,5 +101,5 @@ async def inference_instruct(tts_text: str = Form(), spk_id: str = Form(), instr default='iic/CosyVoice-300M', help='local path or modelscope repo id') args = parser.parse_args() - cosyvoice = CosyVoice(args.model_dir) + cosyvoice = CosyVoice2(args.model_dir) if 'CosyVoice2' in args.model_dir else CosyVoice(args.model_dir) uvicorn.run(app, host="0.0.0.0", port=args.port) From deb48bdef0405d3dbd92ed442d6d84e5bcadcca8 Mon Sep 17 00:00:00 2001 From: tan_bw Date: Wed, 1 Jan 2025 23:51:39 +0800 Subject: [PATCH 06/14] no message --- docker/Dockerfile | 6 +++--- docker/requirements.txt | 39 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 42 insertions(+), 3 deletions(-) create mode 100644 docker/requirements.txt diff --git a/docker/Dockerfile b/docker/Dockerfile index ac641814e..2bcddc45e 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -42,10 +42,10 @@ WORKDIR /workspace ENV PYTHONPATH="${PYTHONPATH}:/workspace/CosyVoice:/workspace/CosyVoice/third_party/Matcha-TTS" -RUN git clone --recursive https://github.com/tanbw/CosyVoice.git - +#RUN git clone --recursive https://github.com/tanbw/CosyVoice.git +COPY requirements.txt CosyVoice\requirements.txt RUN conda activate ${VENV} && conda install -y -c conda-forge pynini==2.1.5 RUN conda activate ${VENV} && cd CosyVoice && \ pip install -r requirements.txt -i https://mirrors.aliyun.com/pypi/simple/ --trusted-host=mirrors.aliyun.com - +RUN rm -rf CosyVoice WORKDIR /workspace/CosyVoice diff --git a/docker/requirements.txt b/docker/requirements.txt new file mode 100644 index 000000000..e02452b5c --- /dev/null +++ b/docker/requirements.txt @@ -0,0 +1,39 @@ +--extra-index-url https://download.pytorch.org/whl/cu121 +--extra-index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/onnxruntime-cuda-12/pypi/simple/ # https://github.com/microsoft/onnxruntime/issues/21684 +conformer==0.3.2 +deepspeed==0.14.2; sys_platform == 'linux' +diffusers==0.27.2 +gdown==5.1.0 +gradio==4.32.2 +grpcio==1.57.0 +grpcio-tools==1.57.0 
+huggingface-hub==0.25.2 +hydra-core==1.3.2 +HyperPyYAML==1.2.2 +inflect==7.3.1 +librosa==0.10.2 +lightning==2.2.4 +matplotlib==3.7.5 +modelscope==1.15.0 +networkx==3.1 +omegaconf==2.3.0 +onnx==1.16.0 +onnxruntime-gpu==1.18.0; sys_platform == 'linux' +onnxruntime==1.18.0; sys_platform == 'darwin' or sys_platform == 'windows' +openai-whisper==20231117 +protobuf==4.25 +pydantic==2.7.0 +rich==13.7.1 +soundfile==0.12.1 +tensorboard==2.14.0 +tensorrt-cu12==10.0.1; sys_platform == 'linux' +tensorrt-cu12-bindings==10.0.1; sys_platform == 'linux' +tensorrt-cu12-libs==10.0.1; sys_platform == 'linux' +torch==2.3.1 +torchaudio==2.3.1 +transformers==4.40.1 +uvicorn==0.30.0 +wget==3.2 +fastapi==0.111.0 +fastapi-cli==0.0.4 +WeTextProcessing==1.0.3 From 4cc98aa279864ca1517ba5b1a9746c5a28080b22 Mon Sep 17 00:00:00 2001 From: tan_bw Date: Wed, 1 Jan 2025 23:52:44 +0800 Subject: [PATCH 07/14] no message --- docker/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 2bcddc45e..c21003d0e 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -43,7 +43,7 @@ WORKDIR /workspace ENV PYTHONPATH="${PYTHONPATH}:/workspace/CosyVoice:/workspace/CosyVoice/third_party/Matcha-TTS" #RUN git clone --recursive https://github.com/tanbw/CosyVoice.git -COPY requirements.txt CosyVoice\requirements.txt +COPY requirements.txt CosyVoice/requirements.txt RUN conda activate ${VENV} && conda install -y -c conda-forge pynini==2.1.5 RUN conda activate ${VENV} && cd CosyVoice && \ pip install -r requirements.txt -i https://mirrors.aliyun.com/pypi/simple/ --trusted-host=mirrors.aliyun.com From fb96e7c6843b545aa7ccae9f9ce3a14e27ab36d8 Mon Sep 17 00:00:00 2001 From: tan_bw Date: Thu, 2 Jan 2025 10:16:34 +0800 Subject: [PATCH 08/14] no message --- runtime/python/fastapi/server.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runtime/python/fastapi/server.py b/runtime/python/fastapi/server.py index 4c9e55ff3..f87f39a4a 100644 --- a/runtime/python/fastapi/server.py +++ b/runtime/python/fastapi/server.py @@ -24,7 +24,7 @@ ROOT_DIR = os.path.dirname(os.path.abspath(__file__)) sys.path.append('{}/../../..'.format(ROOT_DIR)) sys.path.append('{}/../../../third_party/Matcha-TTS'.format(ROOT_DIR)) -from cosyvoice.cli.cosyvoice import CosyVoice +from cosyvoice.cli.cosyvoice import CosyVoice, CosyVoice2 from cosyvoice.utils.file_utils import load_wav app = FastAPI() From beea30f68c9061a07f3de82756f3db4374e7b435 Mon Sep 17 00:00:00 2001 From: tan_bw Date: Thu, 2 Jan 2025 18:41:10 +0800 Subject: [PATCH 09/14] no message --- runtime/python/fastapi/server.py | 72 ++++++++++++++++++++++++-------- 1 file changed, 55 insertions(+), 17 deletions(-) diff --git a/runtime/python/fastapi/server.py b/runtime/python/fastapi/server.py index f87f39a4a..74f2c834e 100644 --- a/runtime/python/fastapi/server.py +++ b/runtime/python/fastapi/server.py @@ -26,6 +26,10 @@ sys.path.append('{}/../../../third_party/Matcha-TTS'.format(ROOT_DIR)) from cosyvoice.cli.cosyvoice import CosyVoice, CosyVoice2 from cosyvoice.utils.file_utils import load_wav +from cosyvoice.utils.common import set_all_random_seed +import librosa +import random +import torch app = FastAPI() # set cross region allowance @@ -36,7 +40,26 @@ allow_methods=["*"], allow_headers=["*"]) +def generate_seed(): + seed = random.randint(1, 100000000) + return { + "__type__": "update", + "value": seed + } +max_val = 0.8 + +def postprocess(speech, top_db=60, hop_length=220, win_length=440): + speech, _ = librosa.effects.trim( + 
speech, top_db=top_db, + frame_length=win_length, + hop_length=hop_length + ) + if speech.abs().max() > max_val: + speech = speech / speech.abs().max() * max_val + speech = torch.concat([speech, torch.zeros(1, int(cosyvoice.sample_rate * 0.2))], dim=1) + return speech + def generate_data(model_output): for i in model_output: tts_audio = (i['tts_speech'].numpy() * (2 ** 15)).astype(np.int16).tobytes() @@ -46,50 +69,65 @@ def generate_stream(model_output): for i in model_output: tts_audio = i['tts_speech'].numpy().tobytes() yield tts_audio - + +def generate_header(): + headers = { + "X-Custom-Header-sampleRate": f"{cosyvoice.sample_rate}" + } + return headers + @app.post("/inference_sft") async def inference_sft(tts_text: str = Form(), spk_id: str = Form()): - model_output = cosyvoice.inference_sft(tts_text, spk_id) - return StreamingResponse(generate_data(model_output)) + set_all_random_seed(generate_seed()["value"]) + model_output = cosyvoice.inference_sft(tts_text, spk_id ,stream = False) + return StreamingResponse(generate_data(model_output),headers=generate_header()) @app.post("/stream/inference_sft") async def inference_sft(tts_text: str = Form(), spk_id: str = Form()): + set_all_random_seed(generate_seed()["value"]) model_output = cosyvoice.inference_sft(tts_text, spk_id, stream = True) - return StreamingResponse(generate_stream(model_output)) + return StreamingResponse(generate_stream(model_output),headers=generate_header()) @app.post("/inference_zero_shot") async def inference_zero_shot(tts_text: str = Form(), prompt_text: str = Form(), prompt_wav: UploadFile = File()): - prompt_speech_16k = load_wav(prompt_wav.file, 16000) - model_output = cosyvoice.inference_zero_shot(tts_text, prompt_text, prompt_speech_16k) - return StreamingResponse(generate_data(model_output)) + prompt_speech_16k = postprocess(load_wav(prompt_wav.file, 16000)) + set_all_random_seed(generate_seed()["value"]) + model_output = cosyvoice.inference_zero_shot(tts_text, prompt_text, prompt_speech_16k,stream = False) + + return StreamingResponse(generate_data(model_output),headers=generate_header()) @app.post("/stream/inference_zero_shot") async def inference_zero_shot(tts_text: str = Form(), prompt_text: str = Form(), prompt_wav: UploadFile = File()): - prompt_speech_16k = load_wav(prompt_wav.file, 16000) + prompt_speech_16k = postprocess(load_wav(prompt_wav.file, 16000)) + set_all_random_seed(generate_seed()["value"]) model_output = cosyvoice.inference_zero_shot(tts_text, prompt_text, prompt_speech_16k, stream = True) - return StreamingResponse(generate_stream(model_output)) + return StreamingResponse(generate_stream(model_output),headers=generate_header()) @app.post("/inference_cross_lingual") async def inference_cross_lingual(tts_text: str = Form(), prompt_wav: UploadFile = File()): - prompt_speech_16k = load_wav(prompt_wav.file, 16000) - model_output = cosyvoice.inference_cross_lingual(tts_text, prompt_speech_16k) - return StreamingResponse(generate_data(model_output)) + prompt_speech_16k = postprocess(load_wav(prompt_wav.file, 16000)) + set_all_random_seed(generate_seed()["value"]) + model_output = cosyvoice.inference_cross_lingual(tts_text, prompt_speech_16k, stream = False) + return StreamingResponse(generate_data(model_output),headers=generate_header()) @app.post("/stream/inference_cross_lingual") async def inference_cross_lingual(tts_text: str = Form(), prompt_wav: UploadFile = File()): - prompt_speech_16k = load_wav(prompt_wav.file, 16000) + prompt_speech_16k = postprocess(load_wav(prompt_wav.file, 16000)) 
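+    # reseed on every request so successive calls do not reuse the previous sampling state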
+ set_all_random_seed(generate_seed()["value"]) model_output = cosyvoice.inference_cross_lingual(tts_text, prompt_speech_16k, stream = True) - return StreamingResponse(generate_stream(model_output)) + return StreamingResponse(generate_stream(model_output),headers=generate_header()) @app.post("/inference_instruct") async def inference_instruct(tts_text: str = Form(), spk_id: str = Form(), instruct_text: str = Form()): - model_output = cosyvoice.inference_instruct(tts_text, spk_id, instruct_text) - return StreamingResponse(generate_data(model_output)) + set_all_random_seed(generate_seed()["value"]) + model_output = cosyvoice.inference_instruct(tts_text, spk_id, instruct_text, stream = False) + return StreamingResponse(generate_data(model_output),headers=generate_header()) @app.post("/stream/inference_instruct") async def inference_instruct(tts_text: str = Form(), spk_id: str = Form(), instruct_text: str = Form()): + set_all_random_seed(generate_seed()["value"]) model_output = cosyvoice.inference_instruct(tts_text, spk_id, instruct_text, stream = True) - return StreamingResponse(generate_stream(model_output)) + return StreamingResponse(generate_stream(model_output),headers=generate_header()) if __name__ == '__main__': parser = argparse.ArgumentParser() From 99cedfa189e31e0b2977cd05a3e17d1e849a47cd Mon Sep 17 00:00:00 2001 From: tan_bw Date: Sun, 13 Apr 2025 20:06:06 +0800 Subject: [PATCH 10/14] no message --- runtime/python/fastapi/server.py | 48 ++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/runtime/python/fastapi/server.py b/runtime/python/fastapi/server.py index 74f2c834e..be4432185 100644 --- a/runtime/python/fastapi/server.py +++ b/runtime/python/fastapi/server.py @@ -123,11 +123,54 @@ async def inference_instruct(tts_text: str = Form(), spk_id: str = Form(), instr model_output = cosyvoice.inference_instruct(tts_text, spk_id, instruct_text, stream = False) return StreamingResponse(generate_data(model_output),headers=generate_header()) +@app.post("/inference_instruct2") +async def inference_instruct2(tts_text: str = Form(), instruct_text: str = Form(), prompt_wav: UploadFile = File()): + set_all_random_seed(generate_seed()["value"]) + prompt_speech_16k = postprocess(load_wav(prompt_wav.file, 16000)) + model_output = cosyvoice.inference_instruct2(tts_text, instruct_text,prompt_speech_16k, stream = False) + return StreamingResponse(generate_data(model_output),headers=generate_header()) + @app.post("/stream/inference_instruct") async def inference_instruct(tts_text: str = Form(), spk_id: str = Form(), instruct_text: str = Form()): set_all_random_seed(generate_seed()["value"]) model_output = cosyvoice.inference_instruct(tts_text, spk_id, instruct_text, stream = True) return StreamingResponse(generate_stream(model_output),headers=generate_header()) + +import torch +import threading + +# 检查当前 GPU 的显存使用情况 +def check_memory_usage(): + allocated = torch.cuda.memory_allocated() / (1024 ** 2) # 转换为MB + reserved = torch.cuda.memory_reserved() / (1024 ** 2) # 转换为MB + total_memory = torch.cuda.get_device_properties(0).total_memory / (1024 ** 2) # 转换为MB + logging.info(f"Allocated memory: {allocated:.2f} MB") + logging.info(f"Reserved memory: {reserved:.2f} MB") + logging.info(f"Total memory: {total_memory:.2f} MB") + return allocated, reserved, total_memory + +# 释放未使用的显存 +def release_memory(): + torch.cuda.empty_cache() + logging.info("Memory has been released.") + +# 检查显存使用情况并在需要时释放显存 +def monitor_and_release_memory(): + allocated, reserved, total_memory = 
check_memory_usage() + if allocated >= total_memory / 2: + logging.info("Allocated memory exceeds half of the total memory. Releasing memory...") + release_memory() + else: + logging.info("Memory usage is within acceptable limits.") + +# 定时器函数,每10分钟运行一次 +def run_periodically(interval, func): + def wrapper(): + func() + threading.Timer(interval, wrapper).start() + + wrapper() + if __name__ == '__main__': parser = argparse.ArgumentParser() @@ -139,5 +182,10 @@ async def inference_instruct(tts_text: str = Form(), spk_id: str = Form(), instr default='iic/CosyVoice-300M', help='local path or modelscope repo id') args = parser.parse_args() + # 设置每个进程最多使用 50% 的 GPU 显存 + #torch.cuda.set_per_process_memory_fraction(0.8, 0) + #logging.info('Torch set_per_process_memory_fraction 0.8') cosyvoice = CosyVoice2(args.model_dir) if 'CosyVoice2' in args.model_dir else CosyVoice(args.model_dir) + # 每10分钟(600秒)运行一次 monitor_and_release_memory + #run_periodically(600, monitor_and_release_memory) uvicorn.run(app, host="0.0.0.0", port=args.port) From 2e1b4512dac3f556e988516a60871340fa567f35 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=BD=A0=E8=A2=88=E8=A3=9F=E7=A9=BF=E5=8F=8D=E4=BA=86?= Date: Tue, 22 Apr 2025 18:03:47 +0800 Subject: [PATCH 11/14] Update Dockerfile --- docker/Dockerfile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 0ac5c7e19..2bff92431 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -1,4 +1,4 @@ -FROM nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04 +FROM nvidia/cuda:12.8.0-cudnn-devel-ubuntu22.04 ARG VENV_NAME="cosyvoice" ENV VENV=$VENV_NAME @@ -46,6 +46,7 @@ ENV PYTHONPATH="${PYTHONPATH}:/workspace/CosyVoice:/workspace/CosyVoice/third_pa COPY requirements.txt CosyVoice/requirements.txt RUN conda activate ${VENV} && conda install -y -c conda-forge pynini==2.1.5 RUN conda activate ${VENV} && cd CosyVoice && \ + pip install --pre torch torchaudio --index-url https://download.pytorch.org/whl/nightly/cu128 && \ pip install -r requirements.txt -i https://mirrors.aliyun.com/pypi/simple/ --trusted-host=mirrors.aliyun.com RUN rm -rf CosyVoice WORKDIR /workspace/CosyVoice From 1e958fc1a256af991d7bdc0efd63e51f992b49d8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=BD=A0=E8=A2=88=E8=A3=9F=E7=A9=BF=E5=8F=8D=E4=BA=86?= Date: Tue, 22 Apr 2025 18:04:10 +0800 Subject: [PATCH 12/14] Update requirements.txt --- docker/requirements.txt | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/docker/requirements.txt b/docker/requirements.txt index e02452b5c..f8b4395a1 100644 --- a/docker/requirements.txt +++ b/docker/requirements.txt @@ -1,39 +1,35 @@ ---extra-index-url https://download.pytorch.org/whl/cu121 ---extra-index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/onnxruntime-cuda-12/pypi/simple/ # https://github.com/microsoft/onnxruntime/issues/21684 +--extra-index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/onnxruntime-cuda-12/pypi/simple/ # microsoft/onnxruntime#21684 conformer==0.3.2 deepspeed==0.14.2; sys_platform == 'linux' -diffusers==0.27.2 +diffusers==0.29.0 gdown==5.1.0 -gradio==4.32.2 +gradio==5.4.0 grpcio==1.57.0 grpcio-tools==1.57.0 -huggingface-hub==0.25.2 hydra-core==1.3.2 HyperPyYAML==1.2.2 inflect==7.3.1 librosa==0.10.2 lightning==2.2.4 matplotlib==3.7.5 -modelscope==1.15.0 +modelscope==1.20.0 networkx==3.1 omegaconf==2.3.0 onnx==1.16.0 -onnxruntime-gpu==1.18.0; sys_platform == 'linux' -onnxruntime==1.18.0; sys_platform == 'darwin' or sys_platform 
== 'windows'
+onnxruntime-gpu==1.21.0; sys_platform == 'linux'
+onnxruntime==1.21.0; sys_platform == 'darwin' or sys_platform == 'win32'
 openai-whisper==20231117
 protobuf==4.25
+pyarrow==18.1.0
 pydantic==2.7.0
+pyworld==0.3.4
 rich==13.7.1
 soundfile==0.12.1
 tensorboard==2.14.0
-tensorrt-cu12==10.0.1; sys_platform == 'linux'
-tensorrt-cu12-bindings==10.0.1; sys_platform == 'linux'
-tensorrt-cu12-libs==10.0.1; sys_platform == 'linux'
-torch==2.3.1
-torchaudio==2.3.1
+tensorrt==10.9.0.34; sys_platform == 'linux'
 transformers==4.40.1
 uvicorn==0.30.0
 wget==3.2
-fastapi==0.111.0
+fastapi==0.115.6
 fastapi-cli==0.0.4
 WeTextProcessing==1.0.3

From 3bf48f125a8c25d3f9c386cdb3abf2b614391817 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E4=BD=A0=E8=A2=88=E8=A3=9F=E7=A9=BF=E5=8F=8D=E4=BA=86?=
Date: Thu, 24 Apr 2025 22:55:55 +0800
Subject: [PATCH 13/14] Update README.md

---
 README.md | 246 ++----------------------------------------------------
 1 file changed, 7 insertions(+), 239 deletions(-)

diff --git a/README.md b/README.md
index 4a1dbd30e..f1031dd87 100644
--- a/README.md
+++ b/README.md
@@ -1,241 +1,9 @@
-[![SVG Banners](https://svg-banners.vercel.app/api?type=origin&text1=CosyVoice🤠&text2=Text-to-Speech%20💖%20Large%20Language%20Model&width=800&height=210)](https://github.com/Akshay090/svg-banners)
+## 👉🏻 What's changed 👈🏻

-## 👉🏻 CosyVoice 👈🏻
-**CosyVoice 2.0**: [Demos](https://funaudiollm.github.io/cosyvoice2/); [Paper](https://arxiv.org/abs/2412.10117); [Modelscope](https://www.modelscope.cn/studios/iic/CosyVoice2-0.5B); [HuggingFace](https://huggingface.co/spaces/FunAudioLLM/CosyVoice2-0.5B)
+1. **Upgraded to CUDA 12.8 for RTX 50-series GPUs**
+The Dockerfile and requirements in the docker directory were updated for 50-series cards, with the base image bumped to cuda:12.8.0; tested on Ubuntu 24.
+The source is no longer cloned from git inside the image; instead, map the host source directory to /workspace/CosyVoice at docker run time, which makes modifying and testing the code easier.

-**CosyVoice 1.0**: [Demos](https://fun-audio-llm.github.io); [Paper](https://funaudiollm.github.io/pdf/CosyVoice_v1.pdf); [Modelscope](https://www.modelscope.cn/studios/iic/CosyVoice-300M)
-
-## Highlight🔥
-
-**CosyVoice 2.0** has been released! Compared to version 1.0, the new version offers more accurate, more stable, faster, and better speech generation capabilities.
-### Multilingual
-- **Supported Language**: Chinese, English, Japanese, Korean, Chinese dialects (Cantonese, Sichuanese, Shanghainese, Tianjinese, Wuhanese, etc.)
-- **Crosslingual & Mixlingual**: Support zero-shot voice cloning for cross-lingual and code-switching scenarios.
-### Ultra-Low Latency
-- **Bidirectional Streaming Support**: CosyVoice 2.0 integrates offline and streaming modeling technologies.
-- **Rapid First Packet Synthesis**: Achieves latency as low as 150ms while maintaining high-quality audio output.
-### High Accuracy
-- **Improved Pronunciation**: Reduces pronunciation errors by 30% to 50% compared to CosyVoice 1.0.
-- **Benchmark Achievements**: Attains the lowest character error rate on the hard test set of the Seed-TTS evaluation set.
-### Strong Stability
-- **Consistency in Timbre**: Ensures reliable voice consistency for zero-shot and cross-language speech synthesis.
-- **Cross-language Synthesis**: Marked improvements compared to version 1.0.
-### Natural Experience
-- **Enhanced Prosody and Sound Quality**: Improved alignment of synthesized audio, raising MOS evaluation scores from 5.4 to 5.53.
-- **Emotional and Dialectal Flexibility**: Now supports more granular emotional controls and accent adjustments.
- -## Roadmap - -- [x] 2024/12 - - - [x] 25hz cosyvoice 2.0 released - -- [x] 2024/09 - - - [x] 25hz cosyvoice base model - - [x] 25hz cosyvoice voice conversion model - -- [x] 2024/08 - - - [x] Repetition Aware Sampling(RAS) inference for llm stability - - [x] Streaming inference mode support, including kv cache and sdpa for rtf optimization - -- [x] 2024/07 - - - [x] Flow matching training support - - [x] WeTextProcessing support when ttsfrd is not available - - [x] Fastapi server and client - - -## Install - -**Clone and install** - -- Clone the repo -``` sh -git clone --recursive https://github.com/FunAudioLLM/CosyVoice.git -# If you failed to clone submodule due to network failures, please run following command until success -cd CosyVoice -git submodule update --init --recursive -``` - -- Install Conda: please see https://docs.conda.io/en/latest/miniconda.html -- Create Conda env: - -``` sh -conda create -n cosyvoice -y python=3.10 -conda activate cosyvoice -# pynini is required by WeTextProcessing, use conda to install it as it can be executed on all platform. -conda install -y -c conda-forge pynini==2.1.5 -pip install -r requirements.txt -i https://mirrors.aliyun.com/pypi/simple/ --trusted-host=mirrors.aliyun.com - -# If you encounter sox compatibility issues -# ubuntu -sudo apt-get install sox libsox-dev -# centos -sudo yum install sox sox-devel -``` - -**Model download** - -We strongly recommend that you download our pretrained `CosyVoice2-0.5B` `CosyVoice-300M` `CosyVoice-300M-SFT` `CosyVoice-300M-Instruct` model and `CosyVoice-ttsfrd` resource. - -``` python -# SDK模型下载 -from modelscope import snapshot_download -snapshot_download('iic/CosyVoice2-0.5B', local_dir='pretrained_models/CosyVoice2-0.5B') -snapshot_download('iic/CosyVoice-300M', local_dir='pretrained_models/CosyVoice-300M') -snapshot_download('iic/CosyVoice-300M-SFT', local_dir='pretrained_models/CosyVoice-300M-SFT') -snapshot_download('iic/CosyVoice-300M-Instruct', local_dir='pretrained_models/CosyVoice-300M-Instruct') -snapshot_download('iic/CosyVoice-ttsfrd', local_dir='pretrained_models/CosyVoice-ttsfrd') -``` - -``` sh -# git模型下载,请确保已安装git lfs -mkdir -p pretrained_models -git clone https://www.modelscope.cn/iic/CosyVoice2-0.5B.git pretrained_models/CosyVoice2-0.5B -git clone https://www.modelscope.cn/iic/CosyVoice-300M.git pretrained_models/CosyVoice-300M -git clone https://www.modelscope.cn/iic/CosyVoice-300M-SFT.git pretrained_models/CosyVoice-300M-SFT -git clone https://www.modelscope.cn/iic/CosyVoice-300M-Instruct.git pretrained_models/CosyVoice-300M-Instruct -git clone https://www.modelscope.cn/iic/CosyVoice-ttsfrd.git pretrained_models/CosyVoice-ttsfrd -``` - -Optionally, you can unzip `ttsfrd` resouce and install `ttsfrd` package for better text normalization performance. - -Notice that this step is not necessary. If you do not install `ttsfrd` package, we will use WeTextProcessing by default. - -``` sh -cd pretrained_models/CosyVoice-ttsfrd/ -unzip resource.zip -d . -pip install ttsfrd_dependency-0.1-py3-none-any.whl -pip install ttsfrd-0.4.2-cp310-cp310-linux_x86_64.whl -``` - -**Basic Usage** - -We strongly recommend using `CosyVoice2-0.5B` for better performance. -Follow code below for detailed usage of each model. 
-
-``` python
-import sys
-sys.path.append('third_party/Matcha-TTS')
-from cosyvoice.cli.cosyvoice import CosyVoice, CosyVoice2
-from cosyvoice.utils.file_utils import load_wav
-import torchaudio
-```
-
-**CosyVoice2 Usage**
-```python
-cosyvoice = CosyVoice2('pretrained_models/CosyVoice2-0.5B', load_jit=False, load_trt=False, fp16=False, use_flow_cache=False)
-
-# NOTE if you want to reproduce the results on https://funaudiollm.github.io/cosyvoice2, please add text_frontend=False during inference
-# zero_shot usage
-prompt_speech_16k = load_wav('./asset/zero_shot_prompt.wav', 16000)
-for i, j in enumerate(cosyvoice.inference_zero_shot('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', '希望你以后能够做的比我还好呦。', prompt_speech_16k, stream=False)):
-    torchaudio.save('zero_shot_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
-
-# save the zero_shot spk for future usage
-assert cosyvoice.add_zero_shot_spk('希望你以后能够做的比我还好呦。', prompt_speech_16k, 'my_zero_shot_spk') is True
-for i, j in enumerate(cosyvoice.inference_zero_shot('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', '', '', zero_shot_spk_id='my_zero_shot_spk', stream=False)):
-    torchaudio.save('zero_shot_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
-cosyvoice.save_spkinfo()
-
-# fine-grained control; for the supported controls, check cosyvoice/tokenizer/tokenizer.py#L248
-for i, j in enumerate(cosyvoice.inference_cross_lingual('在他讲述那个荒诞故事的过程中,他突然[laughter]停下来,因为他自己也被逗笑了[laughter]。', prompt_speech_16k, stream=False)):
-    torchaudio.save('fine_grained_control_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
-
-# instruct usage
-for i, j in enumerate(cosyvoice.inference_instruct2('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', '用四川话说这句话', prompt_speech_16k, stream=False)):
-    torchaudio.save('instruct_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
-
-# bistream usage: you can use a generator as input; this is useful when a text LLM produces the input
-# NOTE you should still have some basic sentence-split logic because the llm cannot handle arbitrary sentence lengths
-def text_generator():
-    yield '收到好友从远方寄来的生日礼物,'
-    yield '那份意外的惊喜与深深的祝福'
-    yield '让我心中充满了甜蜜的快乐,'
-    yield '笑容如花儿般绽放。'
-for i, j in enumerate(cosyvoice.inference_zero_shot(text_generator(), '希望你以后能够做的比我还好呦。', prompt_speech_16k, stream=False)):
-    torchaudio.save('zero_shot_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
-```
-
-**CosyVoice Usage**
-```python
-cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M-SFT', load_jit=False, load_trt=False, fp16=False)
-# sft usage
-print(cosyvoice.list_available_spks())
-# change stream=True for chunked streaming inference
-for i, j in enumerate(cosyvoice.inference_sft('你好,我是通义生成式语音大模型,请问有什么可以帮您的吗?', '中文女', stream=False)):
-    torchaudio.save('sft_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
-
-cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M')
-# zero_shot usage, <|zh|><|en|><|jp|><|yue|><|ko|> for Chinese/English/Japanese/Cantonese/Korean
-prompt_speech_16k = load_wav('./asset/zero_shot_prompt.wav', 16000)
-for i, j in enumerate(cosyvoice.inference_zero_shot('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', '希望你以后能够做的比我还好呦。', prompt_speech_16k, stream=False)):
-    torchaudio.save('zero_shot_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
-# cross_lingual usage
-prompt_speech_16k = load_wav('./asset/cross_lingual_prompt.wav', 16000)
-for i, j in enumerate(cosyvoice.inference_cross_lingual('<|en|>And then later on, fully acquiring that company. So keeping management in line, interest in line with the asset that\'s coming into the family is a reason why sometimes we don\'t buy the whole thing.', prompt_speech_16k, stream=False)):
-    torchaudio.save('cross_lingual_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
-# vc usage
-prompt_speech_16k = load_wav('./asset/zero_shot_prompt.wav', 16000)
-source_speech_16k = load_wav('./asset/cross_lingual_prompt.wav', 16000)
-for i, j in enumerate(cosyvoice.inference_vc(source_speech_16k, prompt_speech_16k, stream=False)):
-    torchaudio.save('vc_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
-
-cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M-Instruct')
-# instruct usage, supports [laughter][breath]
-for i, j in enumerate(cosyvoice.inference_instruct('在面对挑战时,他展现了非凡的勇气智慧。', '中文男', 'Theo \'Crimson\', is a fiery, passionate rebel leader. Fights with fervor for justice, but struggles with impulsiveness.', stream=False)):
-    torchaudio.save('instruct_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
-```
-
-**Start web demo**
-
-You can use our web demo page to get familiar with CosyVoice quickly.
-
-Please see the demo website for details.
-
-``` sh
-# change iic/CosyVoice-300M-SFT for sft inference, or iic/CosyVoice-300M-Instruct for instruct inference
-python3 webui.py --port 50000 --model_dir pretrained_models/CosyVoice-300M
-```
-
-**Advanced Usage**
-
-For advanced users, we have provided train and inference scripts in `examples/libritts/cosyvoice/run.sh`.
-
-**Build for deployment**
-
-Optionally, if you want service deployment,
-you can run the following steps.
-
-``` sh
-cd runtime/python
-docker build -t cosyvoice:v1.0 .
-# change iic/CosyVoice-300M to iic/CosyVoice-300M-Instruct if you want to use instruct inference
-# for grpc usage
-docker run -d --runtime=nvidia -p 50000:50000 cosyvoice:v1.0 /bin/bash -c "cd /opt/CosyVoice/CosyVoice/runtime/python/grpc && python3 server.py --port 50000 --max_conc 4 --model_dir iic/CosyVoice-300M && sleep infinity"
-cd grpc && python3 client.py --port 50000 --mode
-# for fastapi usage
-docker run -d --runtime=nvidia -p 50000:50000 cosyvoice:v1.0 /bin/bash -c "cd /opt/CosyVoice/CosyVoice/runtime/python/fastapi && python3 server.py --port 50000 --model_dir iic/CosyVoice-300M && sleep infinity"
-cd fastapi && python3 client.py --port 50000 --mode
-```
-
-## Discussion & Communication
-
-You can directly discuss on [Github Issues](https://github.com/FunAudioLLM/CosyVoice/issues).
-
-You can also scan the QR code to join our official Dingding chat group.
-
-
-
-## Acknowledge
-
-1. We borrowed a lot of code from [FunASR](https://github.com/modelscope/FunASR).
-2. We borrowed a lot of code from [FunCodec](https://github.com/modelscope/FunCodec).
-3. We borrowed a lot of code from [Matcha-TTS](https://github.com/shivammehta25/Matcha-TTS).
-4. We borrowed a lot of code from [AcademiCodec](https://github.com/yangdongchao/AcademiCodec).
-5. We borrowed a lot of code from [WeNet](https://github.com/wenet-e2e/wenet).
-
-## Disclaimer
-The content provided above is for academic purposes only and is intended to demonstrate technical capabilities. Some examples are sourced from the internet. If any content infringes on your rights, please contact us to request its removal.
+2. **API:**
+Reworked the FastAPI endpoints under runtime, splitting them into separate streaming and non-streaming endpoints.
+Each endpoint now returns an HTTP header containing the model's output sample rate.

From cf2008d28b144ce7f44e39157882094fe66e6c50 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E4=BD=A0=E8=A2=88=E8=A3=9F=E7=A9=BF=E5=8F=8D=E4=BA=86?=
Date: Thu, 24 Apr 2025 23:04:04 +0800
Subject: [PATCH 14/14] Update README.md

---
 README.md | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index f1031dd87..c9abf503c 100644
--- a/README.md
+++ b/README.md
@@ -2,8 +2,15 @@
 1. **Upgraded to CUDA 12.8 for 50-series GPUs**
 In the docker directory, the Dockerfile and requirements were updated for 50-series GPUs and the base image was upgraded to cuda12.8.0; tested and verified on Ubuntu 24.
+
+2. **The Dockerfile no longer downloads code from the repository**
 The source code is no longer cloned from git; instead, the host's source directory is mounted to /workspace/CosyVoice at docker run time, which makes modification and testing easier.
-2. **API:**
+3. **API:**
 Reworked the FastAPI endpoints under runtime, splitting them into separate streaming and non-streaming endpoints.
 Each endpoint now returns an HTTP header containing the model's output sample rate.
+
+## Commit log
+This project is synced with the upstream source from time to time; to ensure stability, the commit IDs of the tested models and code are recorded here:
+
+**2025/04/24** Official CosyVoice2 model on ModelScope: 60b054e54afdd0d950e658dede3d2ef73d9d65b6; GitHub code: 3bf48f125a8c25d3f9c386cdb3abf2b614391817
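To make item 3 above concrete, here is a minimal sketch of what a split non-streaming/streaming endpoint pair with a sample-rate header could look like. It is a sketch under assumptions, not this fork's actual code: the route names, the `X-Model-Sample-Rate` header name, and the response types are illustrative; see `runtime/python/fastapi/server.py` in this fork for the real implementation.

```python
# Minimal sketch of a split endpoint pair, each exposing the model's output
# sample rate in a response header. Route names and the X-Model-Sample-Rate
# header name are assumptions for illustration.
import io

import torch
import torchaudio
from fastapi import FastAPI, Form
from fastapi.responses import Response, StreamingResponse

from cosyvoice.cli.cosyvoice import CosyVoice

app = FastAPI()
cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M-SFT')


def pcm_chunks(model_output):
    # Convert each synthesized chunk to 16-bit little-endian PCM bytes.
    for chunk in model_output:
        yield (chunk['tts_speech'].numpy() * (2 ** 15)).astype('int16').tobytes()


@app.post('/inference_sft')
async def inference_sft(tts_text: str = Form(), spk_id: str = Form()):
    # Non-streaming: synthesize to completion, then return a single WAV body.
    speech = torch.cat([chunk['tts_speech']
                        for chunk in cosyvoice.inference_sft(tts_text, spk_id, stream=False)], dim=1)
    buf = io.BytesIO()
    torchaudio.save(buf, speech, cosyvoice.sample_rate, format='wav')
    headers = {'X-Model-Sample-Rate': str(cosyvoice.sample_rate)}
    return Response(content=buf.getvalue(), media_type='audio/wav', headers=headers)


@app.post('/inference_sft_stream')
async def inference_sft_stream(tts_text: str = Form(), spk_id: str = Form()):
    # Streaming: raw PCM chunks go out as soon as the model yields them; the
    # client reads the sample rate from the header before decoding.
    headers = {'X-Model-Sample-Rate': str(cosyvoice.sample_rate)}
    model_output = cosyvoice.inference_sft(tts_text, spk_id, stream=True)
    return StreamingResponse(pcm_chunks(model_output), media_type='application/octet-stream', headers=headers)
```

A hypothetical client would read the header before decoding the audio:

```python
# Hypothetical client: the URL, port, and header name follow the sketch above.
import requests

resp = requests.post('http://127.0.0.1:50000/inference_sft',
                     data={'tts_text': '你好', 'spk_id': '中文女'})
sample_rate = int(resp.headers['X-Model-Sample-Rate'])
```

Carrying the sample rate out-of-band in a header lets a streaming client configure its audio sink before the first PCM chunk arrives, instead of hard-coding a value that differs between CosyVoice (22050 Hz) and CosyVoice2 (24000 Hz).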