diff --git a/README.md b/README.md index d6651ef7..1b2fa2af 100644 --- a/README.md +++ b/README.md @@ -3,6 +3,10 @@ # Chatterbox TTS +## For multimodel use tts_multi_gradio.py + +## I forked the og because Chatterbox-TTS-Extended managed to lower the output quality + [![Alt Text](https://img.shields.io/badge/listen-demo_samples-blue)](https://resemble-ai.github.io/chatterbox_demopage/) [![Alt Text](https://huggingface.co/datasets/huggingface/badges/resolve/main/open-in-hf-spaces-sm.svg)](https://huggingface.co/spaces/ResembleAI/Chatterbox) [![Alt Text](https://static-public.podonos.com/badges/insight-on-pdns-sm-dark.svg)](https://podonos.com/resembleai/chatterbox) diff --git a/demo_text.txt b/demo_text.txt new file mode 100644 index 00000000..70ecead4 --- /dev/null +++ b/demo_text.txt @@ -0,0 +1,8 @@ +Text 1 (Cca 300 znaků) +Praha, srdce Evropy, se probouzí do chladného podzimního rána. Listí šustí pod nohama spěchajících chodců na Karlově mostě, kde se první turisté shlukují, aby zachytili magický východ slunce nad Hradčany. Historie dýchá z každého kamene starého města a vůně čerstvě upečeného trdelníku se line ulicemi. Je to den plný možností a objevů pro každého, kdo sem zavítá. Česká metropole má vždy co nabídnout, ať už jde o umění, architekturu nebo kulturu. + +Text 2 (Cca 300 znaků) +Vývoj umělé inteligence (AI) je v současnosti jedním z nejdůležitějších technologických témat. Otevírá dveře k revolučním změnám v mnoha odvětvích, od medicíny po průmysl. Zároveň s sebou přináší i etické a společenské otázky, které je třeba zodpovědět. Jak zajistit spravedlivé využití AI a ochranu osobních dat? Tyto debaty jsou klíčové pro budoucí směřování naší společnosti v digitální éře. Je nutné hledat rovnováhu. + +Text 3 (Cca 300 znaků) +V Krkonoších napadl první sníh a horské chaty se připravují na zimní sezónu. Vzduch je svěží a mrazivý, ideální pro dlouhé túry s výhledy na zasněžené vrcholky. Lyžařská střediska finišují s údržbou a netrpělivě čekají na první nedočkavé sportovce. Pohyb v horách v zimě vyžaduje respekt a dobrou výbavu, ale odměnou je nezapomenutelný zážitek a pocit svobody v tiché, bílé krajině. \ No newline at end of file diff --git a/gradio_tts_app.py b/gradio_tts_app.py index cda7912b..b34a1936 100644 --- a/gradio_tts_app.py +++ b/gradio_tts_app.py @@ -3,9 +3,11 @@ import torch import gradio as gr from chatterbox.tts import ChatterboxTTS +import re DEVICE = "cuda" if torch.cuda.is_available() else "cpu" +MAX_CHARS_PER_CHUNK = 400 # Adjust this based on testing def set_seed(seed: int): @@ -21,35 +23,118 @@ def load_model(): return model +def split_text_smart(text, max_chars=MAX_CHARS_PER_CHUNK): + """Split text into chunks at sentence boundaries""" + if len(text) <= max_chars: + return [text] + + # Split by sentences (., !, ?) + sentences = re.split(r'([.!?]+\s*)', text) + + chunks = [] + current_chunk = "" + + for i in range(0, len(sentences), 2): + sentence = sentences[i] + punctuation = sentences[i + 1] if i + 1 < len(sentences) else "" + full_sentence = sentence + punctuation + + # If adding this sentence would exceed limit + if len(current_chunk) + len(full_sentence) > max_chars: + if current_chunk: + chunks.append(current_chunk.strip()) + current_chunk = full_sentence + else: + # Single sentence is too long, split by words + words = full_sentence.split() + temp_chunk = "" + for word in words: + if len(temp_chunk) + len(word) + 1 <= max_chars: + temp_chunk += word + " " + else: + if temp_chunk: + chunks.append(temp_chunk.strip()) + temp_chunk = word + " " + current_chunk = temp_chunk + else: + current_chunk += full_sentence + + if current_chunk.strip(): + chunks.append(current_chunk.strip()) + + return chunks + + def generate(model, text, audio_prompt_path, exaggeration, temperature, seed_num, cfgw, min_p, top_p, repetition_penalty): if model is None: model = ChatterboxTTS.from_pretrained(DEVICE) + if not text or not text.strip(): + return None + + # Set seed if specified if seed_num != 0: set_seed(int(seed_num)) - wav = model.generate( - text, - audio_prompt_path=audio_prompt_path, - exaggeration=exaggeration, - temperature=temperature, - cfg_weight=cfgw, - min_p=min_p, - top_p=top_p, - repetition_penalty=repetition_penalty, - ) - return (model.sr, wav.squeeze(0).numpy()) + # Split text into chunks + chunks = split_text_smart(text.strip()) + + print(f"Processing {len(chunks)} chunk(s)...") + + try: + audio_chunks = [] + + for i, chunk in enumerate(chunks): + print(f"Generating chunk {i+1}/{len(chunks)}: {chunk[:50]}...") + + wav = model.generate( + chunk, + audio_prompt_path=audio_prompt_path, + exaggeration=exaggeration, + temperature=temperature, + cfg_weight=cfgw, + min_p=min_p, + top_p=top_p, + repetition_penalty=repetition_penalty, + ) + + # Convert to numpy and add to chunks + audio_chunks.append(wav.squeeze(0).numpy()) + + # Concatenate all audio chunks + if len(audio_chunks) > 1: + # Add small silence between chunks (0.1 seconds) + silence = np.zeros(int(model.sr * 0.1)) + final_audio = audio_chunks[0] + for chunk in audio_chunks[1:]: + final_audio = np.concatenate([final_audio, silence, chunk]) + print(f"Successfully generated {len(chunks)} chunks!") + return (model.sr, final_audio) + else: + return (model.sr, audio_chunks[0]) + + except Exception as e: + print(f"Error during generation: {str(e)}") + return None with gr.Blocks() as demo: model_state = gr.State(None) # Loaded once per session/user + gr.Markdown(""" + # Chatterbox TTS + + **Note:** Long texts are automatically split into chunks for processing. + Each chunk is limited to ~400 characters for optimal quality. + """) + with gr.Row(): with gr.Column(): text = gr.Textbox( value="Now let's make my mum's favourite. So three mars bars into the pan. Then we add the tuna and just stir for a bit, just let the chocolate and fish infuse. A sprinkle of olive oil and some tomato ketchup. Now smell that. Oh boy this is going to be incredible.", - label="Text to synthesize (max chars 300)", - max_lines=5 + label="Text to synthesize (automatically chunked if too long)", + lines=8, + max_lines=15 ) ref_wav = gr.Audio(sources=["upload", "microphone"], type="filepath", label="Reference Audio File", value=None) exaggeration = gr.Slider(0.25, 2, step=.05, label="Exaggeration (Neutral = 0.5, extreme values can be unstable)", value=.5) diff --git a/gui.cmd b/gui.cmd new file mode 100644 index 00000000..dce4e948 --- /dev/null +++ b/gui.cmd @@ -0,0 +1,3 @@ +call venv\Scripts\activate +python gui.py +pause \ No newline at end of file diff --git a/gui.py b/gui.py new file mode 100644 index 00000000..788bc0eb --- /dev/null +++ b/gui.py @@ -0,0 +1,391 @@ +import sys +import re +import torch +import torchaudio as ta +from pathlib import Path +from safetensors.torch import load_file +from PySide6.QtWidgets import (QApplication, QMainWindow, QWidget, QVBoxLayout, + QHBoxLayout, QPushButton, QLineEdit, QTextEdit, + QComboBox, QLabel, QGroupBox, QFileDialog, + QMessageBox, QCheckBox, QDoubleSpinBox) +from PySide6.QtCore import QThread, Signal, QSettings +from chatterbox_git.src.chatterbox import mtl_tts + + +def split_into_sentences(text): + """Simple sentence splitter for batching""" + # Split on period, exclamation, question mark followed by space or end + sentences = re.split(r'(? str: def get_supported_languages_display() -> str: """Generate a formatted display of all supported languages.""" + # Combine base supported languages with any custom ones + all_langs = dict(SUPPORTED_LANGUAGES) + all_langs.update({"cs": "Czech"}) # Add custom languages here + language_items = [] - for code, name in sorted(SUPPORTED_LANGUAGES.items()): + for code, name in sorted(all_langs.items()): language_items.append(f"**{name}** (`{code}`)") # Split into 2 lines @@ -126,7 +144,7 @@ def get_supported_languages_display() -> str: line2 = " • ".join(language_items[mid:]) return f""" -### 🌍 Supported Languages ({len(SUPPORTED_LANGUAGES)} total) +### 🌍 Supported Languages ({len(all_langs)} total) {line1} {line2} @@ -134,27 +152,101 @@ def get_supported_languages_display() -> str: def get_or_load_model(): - """Loads the ChatterboxMultilingualTTS model if it hasn't been loaded already, - and ensures it's on the correct device.""" + """Loads the ChatterboxMultilingualTTS model if it hasn't been loaded already.""" global MODEL if MODEL is None: print("Model not loaded, initializing...") try: MODEL = ChatterboxMultilingualTTS.from_pretrained(DEVICE) - if hasattr(MODEL, 'to') and str(MODEL.device) != DEVICE: - MODEL.to(DEVICE) - print(f"Model loaded successfully. Internal device: {getattr(MODEL, 'device', 'N/A')}") + # FIXED: Force model to device explicitly + if hasattr(MODEL, 'to'): + MODEL = MODEL.to(DEVICE) + print(f"✓ Model moved to {DEVICE}") + + # Also move submodules if they exist + if hasattr(MODEL, 't3') and hasattr(MODEL.t3, 'to'): + MODEL.t3 = MODEL.t3.to(DEVICE) + if hasattr(MODEL, 't2') and hasattr(MODEL.t2, 'to'): + MODEL.t2 = MODEL.t2.to(DEVICE) + if hasattr(MODEL, 't1') and hasattr(MODEL.t1, 'to'): + MODEL.t1 = MODEL.t1.to(DEVICE) + + print(f"✓ Model loaded successfully on {DEVICE}") + + # Print actual device locations for debugging + if hasattr(MODEL, 't3'): + try: + t3_device = next(MODEL.t3.parameters()).device + print(f" T3 device: {t3_device}") + except: + pass + except Exception as e: print(f"Error loading model: {e}") + import traceback + traceback.print_exc() raise return MODEL + +def switch_t3_model(model_choice: str): + """Switch the T3 model to a custom version""" + global MODEL + + if MODEL is None: + MODEL = get_or_load_model() + + custom_path = CUSTOM_T3_MODELS.get(model_choice) + + if custom_path: + # FIXED: Better path handling + if not os.path.exists(custom_path): + # Try adding .safetensors if not present + if not custom_path.endswith('.safetensors'): + alt_path = custom_path + '.safetensors' + if os.path.exists(alt_path): + custom_path = alt_path + else: + return f"❌ Error: Model file not found at {custom_path} or {alt_path}" + else: + return f"❌ Error: Model file not found: {custom_path}" + + print(f"Loading custom T3 model from: {custom_path}") + try: + # FIXED: Load directly to target device, not CPU first + t3_state = load_safetensors(custom_path, device=str(DEVICE)) + MODEL.t3.load_state_dict(t3_state, strict=False) # Added strict=False for safety + MODEL.t3.to(DEVICE).eval() + + # Verify device + t3_device = next(MODEL.t3.parameters()).device + print(f"✓ Loaded custom T3 model: {model_choice} on {t3_device}") + return f"✓ Loaded: {model_choice}\n📍 Device: {t3_device}\n📁 From: {custom_path}" + except Exception as e: + import traceback + error_details = traceback.format_exc() + print(f"Error details:\n{error_details}") + return f"❌ Error loading model: {str(e)}\n\nFull traceback in console." + else: + print("Reloading default T3 model...") + try: + # Reload the entire model to get default T3 + MODEL = ChatterboxMultilingualTTS.from_pretrained(DEVICE) + MODEL = MODEL.to(DEVICE) + MODEL.t3.to(DEVICE).eval() + print("✓ Loaded default T3 model") + return f"✓ Loaded: Default T3 model\n📍 Device: {DEVICE}" + except Exception as e: + return f"❌ Error loading model: {str(e)}" + + # Attempt to load the model at startup. try: get_or_load_model() except Exception as e: print(f"CRITICAL: Failed to load model on startup. Application may not function. Error: {e}") + def set_seed(seed: int): """Sets the random seed for reproducibility across torch, numpy, and random.""" torch.manual_seed(seed) @@ -163,16 +255,6 @@ def set_seed(seed: int): torch.cuda.manual_seed_all(seed) random.seed(seed) np.random.seed(seed) - -def resolve_audio_prompt(language_id: str, provided_path: str | None) -> str | None: - """ - Decide which audio prompt to use: - - If user provided a path (upload/mic/url), use it. - - Else, fall back to language-specific default (if any). - """ - if provided_path and str(provided_path).strip(): - return provided_path - return LANGUAGE_CONFIG.get(language_id, {}).get("audio") def generate_tts_audio( @@ -184,26 +266,7 @@ def generate_tts_audio( seed_num_input: int = 0, cfgw_input: float = 0.5 ) -> tuple[int, np.ndarray]: - """ - Generate high-quality speech audio from text using Chatterbox Multilingual model with optional reference audio styling. - Supported languages: English, French, German, Spanish, Italian, Portuguese, and Hindi. - - This tool synthesizes natural-sounding speech from input text. When a reference audio file - is provided, it captures the speaker's voice characteristics and speaking style. The generated audio - maintains the prosody, tone, and vocal qualities of the reference speaker, or uses default voice if no reference is provided. - - Args: - text_input (str): The text to synthesize into speech (maximum 300 characters) - language_id (str): The language code for synthesis (eg. en, fr, de, es, it, pt, hi) - audio_prompt_path_input (str, optional): File path or URL to the reference audio file that defines the target voice style. Defaults to None. - exaggeration_input (float, optional): Controls speech expressiveness (0.25-2.0, neutral=0.5, extreme values may be unstable). Defaults to 0.5. - temperature_input (float, optional): Controls randomness in generation (0.05-5.0, higher=more varied). Defaults to 0.8. - seed_num_input (int, optional): Random seed for reproducible results (0 for random generation). Defaults to 0. - cfgw_input (float, optional): CFG/Pace weight controlling generation guidance (0.2-1.0). Defaults to 0.5, 0 for language transfer. - - Returns: - tuple[int, np.ndarray]: A tuple containing the sample rate (int) and the generated audio waveform (numpy.ndarray) - """ + """Generate TTS audio with custom T3 model support""" current_model = get_or_load_model() if current_model is None: @@ -212,7 +275,14 @@ def generate_tts_audio( if seed_num_input != 0: set_seed(int(seed_num_input)) - print(f"Generating audio for text: '{text_input[:50]}...'") + print(f"Generating audio for text: '{text_input[:50]}...' in language: {language_id}") + + # FIXED: Verify model is on correct device before generation + if hasattr(current_model, 't3'): + t3_device = next(current_model.t3.parameters()).device + print(f" T3 currently on: {t3_device}") + if str(t3_device) != DEVICE and DEVICE == "cuda": + print(f" ⚠️ WARNING: T3 is on {t3_device} but should be on {DEVICE}") # Handle optional audio prompt chosen_prompt = audio_prompt_path_input or default_audio_for_ui(language_id) @@ -234,29 +304,52 @@ def generate_tts_audio( **generate_kwargs ) print("Audio generation complete.") - return (current_model.sr, wav.squeeze(0).numpy()) + return (current_model.sr, wav.squeeze(0).cpu().numpy()) # FIXED: Added .cpu() before .numpy() -with gr.Blocks() as demo: + +with gr.Blocks(theme=gr.themes.Soft()) as demo: gr.Markdown( """ - # Chatterbox Multilingual Demo - Generate high-quality multilingual speech from text with reference audio styling, supporting 23 languages. + # 🎙️ Chatterbox Multilingual Demo with Custom T3 Support + Generate high-quality multilingual speech from text with reference audio styling and custom model support. """ ) # Display supported languages gr.Markdown(get_supported_languages_display()) + with gr.Row(): with gr.Column(): - initial_lang = "fr" + # Model Selection Section + gr.Markdown("### 🔧 Model Configuration") + t3_model_dropdown = gr.Dropdown( + choices=list(CUSTOM_T3_MODELS.keys()), + value="Default", + label="T3 Model", + info="Select which T3 model to use" + ) + model_status = gr.Textbox( + label="Model Status", + value="Default model loaded", + interactive=False, + lines=2 + ) + load_t3_btn = gr.Button("🔄 Load Selected T3 Model", variant="secondary", size="sm") + + gr.Markdown("---") + + # TTS Controls + initial_lang = "cs" # Default to Czech for testing text = gr.Textbox( value=default_text_for_ui(initial_lang), label="Text to synthesize (max chars 300)", max_lines=5 ) + # Get all supported languages including custom ones + all_language_codes = list(SUPPORTED_LANGUAGES.keys()) + ["cs"] language_id = gr.Dropdown( - choices=list(ChatterboxMultilingualTTS.get_supported_languages().keys()), + choices=sorted(set(all_language_codes)), value=initial_lang, label="Language", info="Select the language for text-to-speech synthesis" @@ -270,35 +363,63 @@ def generate_tts_audio( ) gr.Markdown( - "💡 **Note**: Ensure that the reference clip matches the specified language tag. Otherwise, language transfer outputs may inherit the accent of the reference clip's language. To mitigate this, set the CFG weight to 0.", + "💡 **Note**: Ensure that the reference clip matches the specified language tag. For custom languages, set CFG weight to 0 if experiencing accent issues.", elem_classes=["audio-note"] ) exaggeration = gr.Slider( - 0.25, 2, step=.05, label="Exaggeration (Neutral = 0.5, extreme values can be unstable)", value=.5 + 0.25, 2, step=.05, label="Exaggeration (Neutral = 0.5)", value=.5 ) cfg_weight = gr.Slider( - 0.2, 1, step=.05, label="CFG/Pace", value=0.5 + 0.0, 1, step=.05, label="CFG/Pace (0 for language transfer)", value=0.5 ) with gr.Accordion("More options", open=False): seed_num = gr.Number(value=0, label="Random seed (0 for random)") temp = gr.Slider(0.05, 5, step=.05, label="Temperature", value=.8) - run_btn = gr.Button("Generate", variant="primary") + run_btn = gr.Button("🎬 Generate Speech", variant="primary", size="lg") with gr.Column(): - audio_output = gr.Audio(label="Output Audio") + gr.Markdown("### 📊 Output") + audio_output = gr.Audio(label="Generated Audio") + + gr.Markdown(""" + --- + ### 💡 Tips + + **Custom T3 Models:** + - Load your fine-tuned T3 models for new languages + - Use full path with `.safetensors` extension + - Example: `C:/ChatterboxTraining/t3/t3_cs.safetensors` + - Switch between models without restarting + + **Voice Cloning:** + - Upload 5-10 seconds of clear reference audio + - Single speaker, minimal background noise + - Match reference language to target language + + **Parameters:** + - **Exaggeration**: Controls emotion intensity + - **Temperature**: Higher = more variation + - **CFG Weight**: Set to 0 for language transfer without accent + """) - def on_language_change(lang, current_ref, current_text): - return default_audio_for_ui(lang), default_text_for_ui(lang) + def on_language_change(lang, current_ref, current_text): + return default_audio_for_ui(lang), default_text_for_ui(lang) - language_id.change( - fn=on_language_change, - inputs=[language_id, ref_wav, text], - outputs=[ref_wav, text], - show_progress=False - ) + language_id.change( + fn=on_language_change, + inputs=[language_id, ref_wav, text], + outputs=[ref_wav, text], + show_progress=False + ) + + load_t3_btn.click( + fn=switch_t3_model, + inputs=[t3_model_dropdown], + outputs=[model_status] + ) run_btn.click( fn=generate_tts_audio, @@ -314,4 +435,24 @@ def on_language_change(lang, current_ref, current_text): outputs=[audio_output], ) -demo.launch(mcp_server=True) +if __name__ == "__main__": + print("\n" + "="*60) + print("🎙️ CHATTERBOX MULTILINGUAL TTS - CUSTOM T3 EDITION") + print("="*60) + print(f"Device: {DEVICE}") + if DEVICE == "cuda": + print(f"GPU: {torch.cuda.get_device_name(0)}") + print(f"CUDA Version: {torch.version.cuda}") + print(f"Available T3 models: {len(CUSTOM_T3_MODELS)}") + for model_name, path in CUSTOM_T3_MODELS.items(): + if path: + exists = "✓" if os.path.exists(path) or os.path.exists(path + ".safetensors") else "✗" + print(f" {exists} {model_name}: {path}") + else: + print(f" ✓ {model_name} (built-in)") + print("="*60 + "\n") + + demo.queue( + max_size=50, + default_concurrency_limit=1, + ).launch(share=True) \ No newline at end of file diff --git a/reference.cmd b/reference.cmd new file mode 100644 index 00000000..ee94da83 --- /dev/null +++ b/reference.cmd @@ -0,0 +1,3 @@ +call venv\Scripts\activate +python reference.py +pause \ No newline at end of file diff --git a/reference.py b/reference.py new file mode 100644 index 00000000..f32d6870 --- /dev/null +++ b/reference.py @@ -0,0 +1,22 @@ +from chatterbox_git.src.chatterbox import mtl_tts +import torchaudio as ta +from safetensors.torch import load_file as load_safetensors + +device = "cuda" # or mps or cuda + +multilingual_model = mtl_tts.ChatterboxMultilingualTTS.from_pretrained(device=device) + +# ---- +# Then download the file from huggingface and place it in the current directory. +# ---- + + + +t3_state = load_safetensors("t3_cs.safetensors", device="cuda") +multilingual_model.t3.load_state_dict(t3_state) +multilingual_model.t3.to(device).eval() + +czech_text = "Přečtěte si krátký text a odpovězte na několik otázek, které testují porozumění. Můžete se začíst do krátkých úryvků z článků nebo do některého z našich krátkých a vtipných příběhů. Pozor, vybraný text můžete řešit pouze jednou v daný den." +wav_czech = multilingual_model.generate(czech_text, language_id="cs") +ta.save("test-cs.wav", wav_czech, multilingual_model.sr) + diff --git a/tts_multi_gradio.cmd b/tts_multi_gradio.cmd new file mode 100644 index 00000000..e10ccd2d --- /dev/null +++ b/tts_multi_gradio.cmd @@ -0,0 +1,9 @@ +@echo off +echo Starting Chatterbox TTS Gradio App... +echo. + +call .venv\Scripts\activate.bat +start http://127.0.0.1:7860 +python tts_multi_gradio.py + +if errorlevel 1 pause \ No newline at end of file diff --git a/tts_multi_gradio.py b/tts_multi_gradio.py new file mode 100644 index 00000000..2cfd92fa --- /dev/null +++ b/tts_multi_gradio.py @@ -0,0 +1,338 @@ +import random +import numpy as np +import torch +import gradio as gr +from chatterbox.tts import ChatterboxTTS +from transformers import BarkModel +import re +import os +from pathlib import Path + + +DEVICE = "cuda" if torch.cuda.is_available() else "cpu" +MAX_CHARS_PER_CHUNK = 400 # Adjust this based on testing + +# Custom Bark model configuration +CUSTOM_BARK_MODELS = { + "Default (suno/bark-small)": None, + "czt3": "C:/ChatterboxTraining/t3", + # Add more custom models here: + # "My Voice Clone": "C:/path/to/another/model", +} + + +def set_seed(seed: int): + torch.manual_seed(seed) + torch.cuda.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + random.seed(seed) + np.random.seed(seed) + + +def load_model(): + """Load ChatterboxTTS with default Bark model""" + print(f"Loading Chatterbox TTS on {DEVICE}...") + model = ChatterboxTTS.from_pretrained(DEVICE) + print("✓ Chatterbox TTS loaded with default Bark model") + return model + + +def switch_bark_model(model, model_choice): + """Switch the Bark model to a different version""" + if model is None: + model = ChatterboxTTS.from_pretrained(DEVICE) + + custom_path = CUSTOM_BARK_MODELS.get(model_choice) + + if custom_path: + # Check if path exists + if not os.path.exists(custom_path): + return model, f"❌ Error: Model path not found: {custom_path}" + + print(f"Loading custom Bark model from: {custom_path}") + try: + # Replace the Bark model + model.bark_model = BarkModel.from_pretrained(custom_path).to(DEVICE) + print(f"✓ Loaded custom Bark model: {model_choice}") + return model, f"✓ Loaded: {model_choice}" + except Exception as e: + return model, f"❌ Error loading model: {str(e)}" + else: + print("Loading default Bark model...") + try: + # Reload default model + model.bark_model = BarkModel.from_pretrained("suno/bark-small").to(DEVICE) + print("✓ Loaded default Bark model") + return model, "✓ Loaded: Default Bark model" + except Exception as e: + return model, f"❌ Error loading model: {str(e)}" + + +def split_text_smart(text, max_chars=MAX_CHARS_PER_CHUNK): + """Split text into chunks at sentence boundaries""" + if len(text) <= max_chars: + return [text] + + # Split by sentences (., !, ?) + sentences = re.split(r'([.!?]+\s*)', text) + + chunks = [] + current_chunk = "" + + for i in range(0, len(sentences), 2): + sentence = sentences[i] + punctuation = sentences[i + 1] if i + 1 < len(sentences) else "" + full_sentence = sentence + punctuation + + # If adding this sentence would exceed limit + if len(current_chunk) + len(full_sentence) > max_chars: + if current_chunk: + chunks.append(current_chunk.strip()) + current_chunk = full_sentence + else: + # Single sentence is too long, split by words + words = full_sentence.split() + temp_chunk = "" + for word in words: + if len(temp_chunk) + len(word) + 1 <= max_chars: + temp_chunk += word + " " + else: + if temp_chunk: + chunks.append(temp_chunk.strip()) + temp_chunk = word + " " + current_chunk = temp_chunk + else: + current_chunk += full_sentence + + if current_chunk.strip(): + chunks.append(current_chunk.strip()) + + return chunks + + +def generate(model, text, audio_prompt_path, exaggeration, temperature, seed_num, cfgw, min_p, top_p, repetition_penalty): + if model is None: + model = ChatterboxTTS.from_pretrained(DEVICE) + + if not text or not text.strip(): + return None + + # Set seed if specified + if seed_num != 0: + set_seed(int(seed_num)) + + # Split text into chunks + chunks = split_text_smart(text.strip()) + + print(f"Processing {len(chunks)} chunk(s)...") + + try: + audio_chunks = [] + + for i, chunk in enumerate(chunks): + print(f"Generating chunk {i+1}/{len(chunks)}: {chunk[:50]}...") + + wav = model.generate( + chunk, + audio_prompt_path=audio_prompt_path, + exaggeration=exaggeration, + temperature=temperature, + cfg_weight=cfgw, + min_p=min_p, + top_p=top_p, + repetition_penalty=repetition_penalty, + ) + + # Convert to numpy and add to chunks + audio_chunks.append(wav.squeeze(0).numpy()) + + # Concatenate all audio chunks + if len(audio_chunks) > 1: + # Add small silence between chunks (0.1 seconds) + silence = np.zeros(int(model.sr * 0.1)) + final_audio = audio_chunks[0] + for chunk in audio_chunks[1:]: + final_audio = np.concatenate([final_audio, silence, chunk]) + print(f"Successfully generated {len(chunks)} chunks!") + return (model.sr, final_audio) + else: + return (model.sr, audio_chunks[0]) + + except Exception as e: + print(f"Error during generation: {str(e)}") + return None + + +with gr.Blocks(theme=gr.themes.Soft()) as demo: + model_state = gr.State(None) # Loaded once per session/user + + gr.Markdown(""" + # 🎙️ Chatterbox TTS + + Advanced text-to-speech with custom Bark model support. + + **Note:** Long texts are automatically split into chunks for processing. + Each chunk is limited to ~400 characters for optimal quality. + """) + + with gr.Row(): + with gr.Column(scale=1): + # Model Selection + gr.Markdown("### 🔧 Model Configuration") + model_dropdown = gr.Dropdown( + choices=list(CUSTOM_BARK_MODELS.keys()), + value="Default (suno/bark-small)", + label="Bark Model", + info="Select which Bark model to use" + ) + model_status = gr.Textbox( + label="Model Status", + value="Default model loaded", + interactive=False, + lines=1 + ) + load_model_btn = gr.Button("🔄 Load Selected Model", variant="secondary", size="sm") + + gr.Markdown("---") + + # Text Input + gr.Markdown("### 📝 Text Input") + text = gr.Textbox( + value="Now let's make my mum's favourite. So three mars bars into the pan. Then we add the tuna and just stir for a bit, just let the chocolate and fish infuse. A sprinkle of olive oil and some tomato ketchup. Now smell that. Oh boy this is going to be incredible.", + label="Text to synthesize", + lines=6, + max_lines=12, + placeholder="Enter text to convert to speech..." + ) + + # Reference Audio + gr.Markdown("### 🎵 Voice Reference") + ref_wav = gr.Audio( + sources=["upload", "microphone"], + type="filepath", + label="Reference Audio File", + value=None + ) + + # Main Controls + gr.Markdown("### 🎛️ Voice Controls") + exaggeration = gr.Slider( + 0.25, 2, + step=.05, + label="Exaggeration", + info="Neutral = 0.5, extreme values can be unstable", + value=.5 + ) + cfg_weight = gr.Slider( + 0.0, 1, + step=.05, + label="CFG Weight / Pace", + info="Controls generation guidance", + value=0.3 + ) + + # Advanced Options + with gr.Accordion("⚙️ Advanced Options", open=False): + seed_num = gr.Number( + value=0, + label="Random Seed", + info="0 for random, set number for reproducible results" + ) + temp = gr.Slider( + 0.05, 5, + step=.05, + label="Temperature", + info="Higher = more creative/variable", + value=.8 + ) + min_p = gr.Slider( + 0.00, 1.00, + step=0.01, + label="Min P", + info="Newer sampler. 0.02-0.1 recommended. 0.00 disables", + value=0.05 + ) + top_p = gr.Slider( + 0.00, 1.00, + step=0.01, + label="Top P", + info="Original sampler. 1.0 disables (recommended)", + value=1.00 + ) + repetition_penalty = gr.Slider( + 1.00, 2.00, + step=0.1, + label="Repetition Penalty", + info="Reduces repeated phrases", + value=1.2 + ) + + # Generate Button + run_btn = gr.Button("🎬 Generate Speech", variant="primary", size="lg") + + with gr.Column(scale=1): + gr.Markdown("### 🔊 Output") + audio_output = gr.Audio(label="Generated Audio") + + gr.Markdown(""" + --- + ### 💡 Tips + + **Model Selection:** + - **Default**: Original Bark model from Suno AI + - **Fine-tuned**: Your custom trained model + + **Voice Cloning:** + - Upload 5-10 seconds of clear reference audio + - Single speaker, minimal background noise + + **Text Processing:** + - Texts over 400 chars are auto-chunked + - Use proper punctuation for better prosody + + **Parameter Guide:** + - **Exaggeration**: Controls emotion intensity + - **Temperature**: Higher = more variation + - **CFG Weight**: Affects pacing and adherence + """) + + # Event Handlers + demo.load(fn=load_model, inputs=[], outputs=model_state) + + load_model_btn.click( + fn=switch_bark_model, + inputs=[model_state, model_dropdown], + outputs=[model_state, model_status] + ) + + run_btn.click( + fn=generate, + inputs=[ + model_state, + text, + ref_wav, + exaggeration, + temp, + seed_num, + cfg_weight, + min_p, + top_p, + repetition_penalty, + ], + outputs=audio_output, + ) + +if __name__ == "__main__": + print("\n" + "="*60) + print("🎙️ CHATTERBOX TTS - CUSTOM MODEL EDITION") + print("="*60) + print(f"Device: {DEVICE}") + print(f"Available models: {len(CUSTOM_BARK_MODELS)}") + for model_name in CUSTOM_BARK_MODELS.keys(): + print(f" - {model_name}") + print("="*60 + "\n") + + demo.queue( + max_size=50, + default_concurrency_limit=1, + ).launch(share=True) \ No newline at end of file diff --git a/wav_to_mp3.cmd b/wav_to_mp3.cmd new file mode 100644 index 00000000..77e6e126 --- /dev/null +++ b/wav_to_mp3.cmd @@ -0,0 +1,2 @@ +ffmpeg -i %1 %1.mp3 +pause \ No newline at end of file