3 changes: 3 additions & 0 deletions .gitmodules
@@ -43,3 +43,6 @@
[submodule "deps/Zonos"]
path = deps/Zonos
url = https://github.com/weedge/Zonos.git
[submodule "deps/StepAudio"]
path = deps/StepAudio
url = https://github.com/weedge/Step-Audio.git
1 change: 1 addition & 0 deletions deps/StepAudio
Submodule StepAudio added at 7ce0a8
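Note: after checking out this change, git submodule update --init deps/StepAudio (standard git; the command is not part of this diff) fetches the pinned 7ce0a8 revision.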
128 changes: 77 additions & 51 deletions pyproject.toml
@@ -182,32 +182,14 @@ speech_vad_analyzer = [
rms_recorder = []
vad_recorder = ["achatbot[speech_vad]"]

# asr module tag -> pkgs
whisper_asr = ["openai-whisper==20231117"]
whisper_timestamped_asr = ["whisper-timestamped~=1.14.2"]
whisper_faster_asr = ["faster-whisper~=1.0.2"]
whisper_transformers_asr = ["transformers[torch]>=4.40.2"]
whisper_mlx_asr = [
"mlx_whisper~=0.2.0; sys_platform == 'darwin' and platform_machine == 'arm64'",
]
whisper_groq_asr = ["groq~=0.9.0"]
sense_voice_asr = [
"torch~=2.2.2",
"funasr~=1.1.8",
"onnx",
"onnxconverter-common",
]
speech_asr = [
"achatbot[whisper_asr,whisper_timestamped_asr,whisper_faster_asr,whisper_transformers_asr,whisper_mlx_asr,whisper_groq_asr,sense_voice_asr]",
]

# --------------------------------- llm --------------------------
# llm module tag -> pkgs
# installs the CPU pre-built wheel by default;
# to build against another backend (e.g. CUDA), see: https://github.com/abetlen/llama-cpp-python#installation-configuration
llama_cpp = ["llama-cpp-python~=0.2.82"]
llm_personalai_proxy = ["geocoder~=1.38.1"]

# vision
# vision llm
llm_transformers_manual_vision = [
#"transformers@git+https://github.com/huggingface/transformers",
# https://github.com/huggingface/transformers/releases/tag/v4.45.0
@@ -245,9 +227,65 @@ llm_transformers_manual_vision_deepseekvl2 = [
"timm>=0.9.16",
]

# voice llm
llm_transformers_manual_voice = [
#"transformers@git+https://github.com/huggingface/transformers",
# https://github.com/huggingface/transformers/releases/tag/v4.45.2
"transformers~=4.45.2",
"torch~=2.2.2",
"torchaudio~=2.2.2",
]
llm_transformers_manual_voice_glm = [
"achatbot[llm_transformers_manual_voice,tts_cosy_voice,gdown,matplotlib,conf]",
]
llm_transformers_manual_voice_freeze_omni = [
"achatbot[llm_transformers_manual_voice,librosa,soundfile,yaml]",
]
# speech llm
llm_transformers_manual_speech_llasa = [
"achatbot[llm_transformers_manual_voice]",
]
llm_transformers_manual_speech_step = [
"achatbot[llm_transformers_manual_voice]",
]
# vision voice llm
llm_transformers_manual_vision_voice_minicpmo = [
"achatbot[accelerate,librosa,soundfile]",
"torch~=2.2.2",
"torchaudio~=2.2.2",
"torchvision~=0.17.2",
"transformers==4.44.2",
#"librosa==0.9.0",
#"soundfile==0.12.1",
"vector-quantize-pytorch~=1.18.5",
"vocos~=0.1.0",
"decord",
"moviepy",
]
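Each of the tables above is a pip extra, so the new Step-Audio speech stack would be pulled in with pip install "achatbot[llm_transformers_manual_speech_step]" (standard extras syntax; the command itself is illustrative, not part of this diff).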

# core llms
core_llm = ["achatbot[llama_cpp,llm_personalai_proxy]"]

# ----------------- asr ------------------
# asr module tag -> pkgs
whisper_asr = ["openai-whisper==20231117"]
whisper_timestamped_asr = ["whisper-timestamped~=1.14.2"]
whisper_faster_asr = ["faster-whisper~=1.0.2"]
whisper_transformers_asr = ["transformers[torch]>=4.40.2"]
whisper_mlx_asr = [
"mlx_whisper~=0.2.0; sys_platform == 'darwin' and platform_machine == 'arm64'",
]
whisper_groq_asr = ["groq~=0.9.0"]
sense_voice_asr = [
"torch~=2.2.2",
"funasr~=1.1.8",
"onnx",
"onnxconverter-common",
]
speech_asr = [
"achatbot[whisper_asr,whisper_timestamped_asr,whisper_faster_asr,whisper_transformers_asr,whisper_mlx_asr,whisper_groq_asr,sense_voice_asr]",
]

# -----------------codec------------------
# https://huggingface.co/kyutai/mimi/blob/main/config.json transformers_version
codec_transformers_mimi = ["transformers[torch]~=4.45.1"]
@@ -357,43 +395,31 @@ tts_zonos_hybrid = [
"mamba-ssm>=2.2.4",
"causal-conv1d>=1.5.0.post8",
]
tts_step = [
"torch==2.3.1",
"torchaudio==2.3.1",
"torchvision==0.18.1",
"transformers==4.48.3",
"accelerate==1.3.0",
"openai-whisper==20231117",
"sox==1.5.0",
"modelscope",
"six==1.16.0",
"hyperpyyaml",
"conformer==0.3.2",
"diffusers",
"onnxruntime-gpu==1.20.1", # cuda 12.5
"sentencepiece",
"funasr>=1.1.3",
"protobuf==5.29.3",
"achatbot[conf,librosa]",
]
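Worth noting: tts_step pins torch==2.3.1 and transformers==4.48.3, while the voice-llm extras above pin torch~=2.2.2 and transformers~=4.45.2, so installing tts_step alongside those extras in one environment is likely to hit resolver conflicts; a standalone install (e.g. pip install "achatbot[tts_step]", illustrative command) sidesteps that.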

# multi tts engine modules
speech_tts = [
"achatbot[tts_coqui,tts_edge,tts_g,tts_pyttsx3,tts_cosy_voice,tts_chat,tts_f5,tts_openvoicev2,tts_kokoro]",
]

# voice
llm_transformers_manual_voice = [
#"transformers@git+https://github.com/huggingface/transformers",
# https://github.com/huggingface/transformers/releases/tag/v4.45.2
"transformers~=4.45.2",
"torch~=2.2.2",
"torchaudio~=2.2.2",
]
llm_transformers_manual_voice_glm = [
"achatbot[llm_transformers_manual_voice,tts_cosy_voice,gdown,matplotlib,conf]",
]
llm_transformers_manual_voice_freeze_omni = [
"achatbot[llm_transformers_manual_voice,librosa,soundfile,yaml]",
]
llm_transformers_manual_speech_llasa = [
"achatbot[llm_transformers_manual_voice]",
]
llm_transformers_manual_vision_voice_minicpmo = [
"achatbot[accelerate,librosa,soundfile]",
"torch~=2.2.2",
"torchaudio~=2.2.2",
"torchvision~=0.17.2",
"transformers==4.44.2",
#"librosa==0.9.0",
#"soundfile==0.12.1",
"vector-quantize-pytorch~=1.18.5",
"vocos~=0.1.0",
"decord",
"moviepy",
]


# player module tag -> pkgs
stream_player = []
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,5 @@
{
"TingtingRAP": "(RAP)远远甩开的笑他是陆行龟 他曾跌倒也曾吃过灰 他说有福的人才会多吃亏 他的爸爸让他小心交友可他偏偏钻进个垃圾堆 他说他明白How to play",
"Tingting哼唱": "(哼唱)你从一座叫 我 的小镇经过 刚好屋顶的雪化成雨飘落",
"Tingting": "那等我们到海洋馆之后,给妈妈买个礼物,好不好呀?"
}
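(The three entries are Chinese prompt texts for the Tingting voice, tagged rap ((RAP)), humming ((哼唱)), and plain speech respectively; presumably sample prompts shipped for the Step-Audio TTS demo.)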
28 changes: 26 additions & 2 deletions src/cmd/grpc/speaker/client.py
@@ -62,7 +62,16 @@ def load_model(tts_stub: TTSStub):


def synthesize_us(tts_stub: TTSStub):
request_data = SynthesizeRequest(tts_text="hello,你好,我是机器人")
tag = os.getenv("TTS_TAG", "tts_edge")
if tag not in TTSEnvInit.map_synthesize_config_func:
logging.warning(f"{tag} not in map_synthesize_config_func, use default config")
kwargs = TTSEnvInit.get_tts_synth_args()
else:
kwargs = TTSEnvInit.map_synthesize_config_func[tag]()
request_data = SynthesizeRequest(
tts_text="hello,你好,我是机器人", json_kwargs=json.dumps(kwargs)
)
logging.debug(request_data)
response_iterator = tts_stub.SynthesizeUS(request_data)
for response in response_iterator:
yield response.tts_audio
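The lookup added above is a tag -> config-factory dispatch with a warn-and-fallback default. A minimal standalone sketch of that pattern, with hypothetical registry contents and kwarg names (only the shape of the dispatch mirrors the diff, not achatbot's actual configs):

from typing import Callable, Dict

def default_synth_args() -> dict:
    # generic kwargs returned when no per-engine factory is registered (hypothetical values)
    return {"language": "zh"}

map_synthesize_config_func: Dict[str, Callable[[], dict]] = {
    # per-engine factory; these tts_step kwargs are invented for illustration
    "tts_step": lambda: {"stream_factor": 2, "mode": "tts"},
}

def get_synth_kwargs(tag: str) -> dict:
    factory = map_synthesize_config_func.get(tag)
    if factory is None:
        # unknown tag: fall back to the generic config, as synthesize_us does above
        return default_synth_args()
    return factory()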
Expand Down Expand Up @@ -111,7 +120,7 @@ def set_voice(tts_stub: TTSStub, voice: str):
IS_RELOAD=1 python -m src.cmd.grpc.speaker.client

TTS_TAG=tts_llasa IS_SAVE=1 IS_RELOAD=1 python -m src.cmd.grpc.speaker.client
TTS_TAG=tts_llasa IS_SAVE=1 IS_RELOAD=1 python -m src.cmd.grpc.speaker.client
TTS_TAG=tts_step IS_SAVE=1 IS_RELOAD=1 python -m src.cmd.grpc.speaker.client

# instruct2speech
TTS_TAG=tts_minicpmo \
@@ -134,6 +143,21 @@ def set_voice(tts_stub: TTSStub, voice: str):
SPEAKER_EMBEDDING_MODEL_DIR=./models/Zyphra/Zonos-v0.1-speaker-embedding
ZONOS_REF_AUDIO_PATH=./test/audio_files/asr_example_zh.wav \
IS_SAVE=1 IS_RELOAD=1 python -m src.cmd.grpc.speaker.client

# tts lm gen
TTS_TAG=tts_step IS_SAVE=1 IS_RELOAD=1 \
TTS_WARMUP_STEPS=2 TTS_LM_MODEL_PATH=./models/stepfun-ai/Step-Audio-TTS-3B \
TTS_TOKENIZER_MODEL_PATH=./models/stepfun-ai/Step-Audio-Tokenizer \
python -m src.cmd.grpc.speaker.client
# tts voice clone
TTS_TAG=tts_step IS_SAVE=1 IS_RELOAD=1 \
TTS_WARMUP_STEPS=2 TTS_LM_MODEL_PATH=/content/models/stepfun-ai/Step-Audio-TTS-3B \
TTS_TOKENIZER_MODEL_PATH=/content/models/stepfun-ai/Step-Audio-Tokenizer \
TTS_STREAM_FACTOR=2 \
TTS_MODE=voice_clone \
SRC_AUDIO_PATH=./test/audio_files/asr_example_zh.wav \
python -m src.cmd.grpc.speaker.client

"""
if __name__ == "__main__":
player = None
2 changes: 1 addition & 1 deletion src/common/interface.py
@@ -216,7 +216,7 @@ def get_stream_info(self) -> dict:
raise NotImplementedError("must be implemented in the child class")

@abstractmethod
def set_voice(self, voice: str):
def set_voice(self, voice: str, **kwargs):
"""
Note:
- simple voice selection only; setting a voice per user id is not supported
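The widened signature above lets concrete engines accept engine-specific options alongside the voice name without breaking the interface. A minimal sketch of one such override (the class and option names are hypothetical, not achatbot's actual engine API):

class ExampleTTSEngine:
    def set_voice(self, voice: str, **kwargs):
        # base behavior: remember the selected voice
        self.voice = voice
        # optional engine-specific extra, e.g. a reference clip for voice cloning (hypothetical)
        self.ref_audio_path = kwargs.get("ref_audio_path")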
14 changes: 11 additions & 3 deletions src/common/utils/task.py
@@ -1,5 +1,7 @@
#!/usr/bin/env python
from concurrent.futures import ThreadPoolExecutor
import logging
import traceback
from typing import Callable, Any
import asyncio
import queue
@@ -17,9 +19,15 @@ async def async_task(sync_func: Callable, *args, **kwargs) -> Any:

def fetch_async_items(queue: queue.Queue, asyncFunc, *args) -> None:
async def get_items() -> None:
async for item in asyncFunc(*args):
queue.put(item)
queue.put(None)
try:
async for item in asyncFunc(*args):
queue.put(item)
queue.put(None)
except Exception as e:
error_message = traceback.format_exc()
logging.error(f"error:{e} trace: {error_message}")

queue.put(None)

loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
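The try/except added here guarantees the None sentinel reaches the queue even when the async generator raises, so a blocking consumer cannot hang. A minimal usage sketch, assuming (per the truncated body) that fetch_async_items runs its event loop to completion before returning; the generator is illustrative:

import queue

async def numbers():
    for i in range(3):
        yield i

items: queue.Queue = queue.Queue()
fetch_async_items(items, numbers)
# drain until the None sentinel that fetch_async_items always enqueues
while (item := items.get()) is not None:
    print(item)  # 0, 1, 2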