resemble-ai · hclivess · Oct 31, 2025 · Nov 4, 2025 · Nov 4, 2025 · Nov 4, 2025
diff --git a/README.md b/README.md
@@ -3,6 +3,10 @@
 
 # Chatterbox TTS
 
+## For multi-model use, run `tts_multi_gradio.py`
+
+## I forked the original project because Chatterbox-TTS-Extended lowered the output quality
+
 [![Alt Text](https://img.shields.io/badge/listen-demo_samples-blue)](https://resemble-ai.github.io/chatterbox_demopage/)
 [![Alt Text](https://huggingface.co/datasets/huggingface/badges/resolve/main/open-in-hf-spaces-sm.svg)](https://huggingface.co/spaces/ResembleAI/Chatterbox)
 [![Alt Text](https://static-public.podonos.com/badges/insight-on-pdns-sm-dark.svg)](https://podonos.com/resembleai/chatterbox)

diff --git a/demo_text.txt b/demo_text.txt
@@ -0,0 +1,8 @@
+Text 1 (Cca 300 znaků)
+Praha, srdce Evropy, se probouzí do chladného podzimního rána. Listí šustí pod nohama spěchajících chodců na Karlově mostě, kde se první turisté shlukují, aby zachytili magický východ slunce nad Hradčany. Historie dýchá z každého kamene starého města a vůně čerstvě upečeného trdelníku se line ulicemi. Je to den plný možností a objevů pro každého, kdo sem zavítá. Česká metropole má vždy co nabídnout, ať už jde o umění, architekturu nebo kulturu.
+
+Text 2 (Cca 300 znaků)
+Vývoj umělé inteligence (AI) je v současnosti jedním z nejdůležitějších technologických témat. Otevírá dveře k revolučním změnám v mnoha odvětvích, od medicíny po průmysl. Zároveň s sebou přináší i etické a společenské otázky, které je třeba zodpovědět. Jak zajistit spravedlivé využití AI a ochranu osobních dat? Tyto debaty jsou klíčové pro budoucí směřování naší společnosti v digitální éře. Je nutné hledat rovnováhu.
+
+Text 3 (Cca 300 znaků)
+V Krkonoších napadl první sníh a horské chaty se připravují na zimní sezónu. Vzduch je svěží a mrazivý, ideální pro dlouhé túry s výhledy na zasněžené vrcholky. Lyžařská střediska finišují s údržbou a netrpělivě čekají na první nedočkavé sportovce. Pohyb v horách v zimě vyžaduje respekt a dobrou výbavu, ale odměnou je nezapomenutelný zážitek a pocit svobody v tiché, bílé krajině.
diff --git a/gradio_tts_app.py b/gradio_tts_app.py
@@ -3,9 +3,11 @@
 import torch
 import gradio as gr
 from chatterbox.tts import ChatterboxTTS
+import re
 
 
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+MAX_CHARS_PER_CHUNK = 400  # Default character budget per chunk for split_text_smart; adjust based on testing
 
 
 def set_seed(seed: int):
@@ -21,35 +23,118 @@ def load_model():
     return model
 
 
+def split_text_smart(text, max_chars=MAX_CHARS_PER_CHUNK):
+    """Return text as a list of chunks aiming for <= max_chars each, cut after runs of . ! ? (falling back to word breaks)."""
+    if len(text) <= max_chars:
+        return [text]
+
+    # The capturing group keeps the delimiters, so the list alternates text, punctuation, text, ...
+    sentences = re.split(r'([.!?]+\s*)', text)
+
+    chunks = []
+    current_chunk = ""
+
+    for i in range(0, len(sentences), 2):  # step 2: each text piece is paired with the delimiter after it
+        sentence = sentences[i]
+        punctuation = sentences[i + 1] if i + 1 < len(sentences) else ""
+        full_sentence = sentence + punctuation
+
+        # Appending this sentence would push the current chunk past max_chars
+        if len(current_chunk) + len(full_sentence) > max_chars:
+            if current_chunk:
+                chunks.append(current_chunk.strip())
+                current_chunk = full_sentence  # NOTE(review): not re-checked against max_chars, so an overlong sentence here stays unsplit
+            else:
+                # Nothing is buffered and the sentence alone is too long: pack it word by word
+                words = full_sentence.split()
+                temp_chunk = ""
+                for word in words:
+                    if len(temp_chunk) + len(word) + 1 <= max_chars:  # + 1 for the space appended after the word
+                        temp_chunk += word + " "
+                    else:
+                        if temp_chunk:
+                            chunks.append(temp_chunk.strip())
+                        temp_chunk = word + " "
+                current_chunk = temp_chunk  # leftover words carry over into the next sentence's chunk
+        else:
+            current_chunk += full_sentence
+
+    if current_chunk.strip():
+        chunks.append(current_chunk.strip())
+
+    return chunks
+
+
 def generate(model, text, audio_prompt_path, exaggeration, temperature, seed_num, cfgw, min_p, top_p, repetition_penalty):
     if model is None:
         model = ChatterboxTTS.from_pretrained(DEVICE)
 
+    if not text or not text.strip():
+        return None
+
+    # A seed_num of 0 skips seeding; any other value is cast to int and passed to set_seed
     if seed_num != 0:
         set_seed(int(seed_num))
 
-    wav = model.generate(
-        text,
-        audio_prompt_path=audio_prompt_path,
-        exaggeration=exaggeration,
-        temperature=temperature,
-        cfg_weight=cfgw,
-        min_p=min_p,
-        top_p=top_p,
-        repetition_penalty=repetition_penalty,
-    )
-    return (model.sr, wav.squeeze(0).numpy())
+    # Strip the text, then split it with split_text_smart using its default max_chars
+    chunks = split_text_smart(text.strip())
+
+    print(f"Processing {len(chunks)} chunk(s)...")
+
+    try:
+        audio_chunks = []
+
+        for i, chunk in enumerate(chunks):
+            print(f"Generating chunk {i+1}/{len(chunks)}: {chunk[:50]}...")
+
+            wav = model.generate(
+                chunk,
+                audio_prompt_path=audio_prompt_path,
+                exaggeration=exaggeration,
+                temperature=temperature,
+                cfg_weight=cfgw,
+                min_p=min_p,
+                top_p=top_p,
+                repetition_penalty=repetition_penalty,
+            )
+
+            # squeeze(0) drops the leading dimension; the result is collected as a NumPy array
+            audio_chunks.append(wav.squeeze(0).numpy())
+
+        # Join the pieces, separated by a short gap, when there is more than one
+        if len(audio_chunks) > 1:
+            # 0.1 s of zeros at model.sr; NOTE(review): np.zeros is float64 by default - confirm it matches the chunk dtype
+            silence = np.zeros(int(model.sr * 0.1))
+            final_audio = audio_chunks[0]
+            for chunk in audio_chunks[1:]:
+                final_audio = np.concatenate([final_audio, silence, chunk])
+            print(f"Successfully generated {len(chunks)} chunks!")
+            return (model.sr, final_audio)
+        else:
+            return (model.sr, audio_chunks[0])
+
+    except Exception as e:  # NOTE(review): any failure is only printed and turned into a None result
+        print(f"Error during generation: {str(e)}")
+        return None
 
 
 with gr.Blocks() as demo:
     model_state = gr.State(None)  # Loaded once per session/user
 
+    gr.Markdown("""
+    # Chatterbox TTS
+
+    **Note:** Long texts are automatically split into chunks for processing.
+    Each chunk is limited to ~400 characters for optimal quality.
+    """)
+
     with gr.Row():
         with gr.Column():
             text = gr.Textbox(
                 value="Now let's make my mum's favourite. So three mars bars into the pan. Then we add the tuna and just stir for a bit, just let the chocolate and fish infuse. A sprinkle of olive oil and some tomato ketchup. Now smell that. Oh boy this is going to be incredible.",
-                label="Text to synthesize (max chars 300)",
-                max_lines=5
+                label="Text to synthesize (automatically chunked if too long)",
+                lines=8,
+                max_lines=15
             )
             ref_wav = gr.Audio(sources=["upload", "microphone"], type="filepath", label="Reference Audio File", value=None)
             exaggeration = gr.Slider(0.25, 2, step=.05, label="Exaggeration (Neutral = 0.5, extreme values can be unstable)", value=.5)

diff --git a/gui.cmd b/gui.cmd
@@ -0,0 +1,3 @@
+call venv\Scripts\activate
+python gui.py
+pause