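"""Gradio front-end for Chatterbox TTS: single-utterance synthesis plus
multi-speaker dialog generation driven by a speakers.yaml voice map."""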

import gradio as gr
import yaml
import os
import time
import re
import shutil

from chatterbox.tts import ChatterboxTTS
import torchaudio as ta
import torch
from typing import Dict, Tuple

# Load speaker options from YAML with error handling
try:
    yaml_path = os.path.abspath("speakers.yaml")
    if not os.path.exists(yaml_path):
        raise FileNotFoundError(f"speakers.yaml not found at {yaml_path}")

    with open(yaml_path) as f:
        speakers = yaml.safe_load(f)

    if not speakers or not isinstance(speakers, dict):
        raise ValueError("speakers.yaml must contain a valid dictionary mapping")

except Exception as e:
    raise SystemExit(f"Failed to load speakers.yaml: {str(e)}")
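
# Expected speakers.yaml layout (illustrative names and paths):
#   Alice: speaker_samples/alice.wav
#   Bob: speaker_samples/bob.wav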


def split_text_at_sentence_boundaries(text, max_length=300):
    """Split text at sentence boundaries, ensuring each chunk is <= max_length."""
    # Sentence enders followed by whitespace or end-of-string; wrapping the
    # pattern in a capturing group makes re.split keep the delimiters
    sentence_pattern = r'[.!?](?:\s|$)'
    sentences = re.split(f'({sentence_pattern})', text)

    # Re-attach each captured delimiter to the sentence that precedes it
    actual_sentences = []
    for i in range(0, len(sentences), 2):
        if i + 1 < len(sentences):
            current = sentences[i] + sentences[i + 1]
        else:
            current = sentences[i]
        if current:
            actual_sentences.append(current)

    # Greedily pack sentences into chunks of at most max_length characters
    chunks = []
    current_chunk = ""

    for sentence in actual_sentences:
        if len(current_chunk) + len(sentence) > max_length and current_chunk:
            chunks.append(current_chunk.strip())
            current_chunk = sentence
        else:
            current_chunk += sentence

    if current_chunk:
        chunks.append(current_chunk.strip())

    return chunks
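
# Illustrative behavior (assumed example, not executed at import time):
#   split_text_at_sentence_boundaries("First. Second. Third.", max_length=10)
#   -> ["First.", "Second.", "Third."]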


def parse_dialog_line(line):
    '''Parse a dialog line in the format: Name: "Text"'''
    # Speaker name is everything before the first colon; the double-quoted
    # text must run to the end of the line
    pattern = r'^([^:]+):\s*"([^"]+)"$'
    match = re.match(pattern, line.strip())
    if match:
        speaker = match.group(1).strip()
        text = match.group(2).strip()
        return (speaker, text)
    return None
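
# Illustrative behavior:
#   parse_dialog_line('Alice: "Hello there"')  -> ("Alice", "Hello there")
#   parse_dialog_line('not a dialog line')     -> None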


def save_speakers_config(speakers_dict: Dict) -> str:
    """Save speakers configuration to YAML file."""
    try:
        with open("speakers.yaml", 'w') as f:
            yaml.dump(speakers_dict, f)
        return "Speakers configuration saved successfully!"
    except Exception as e:
        return f"Error saving configuration: {str(e)}"


def add_speaker(speaker_name: str, audio_file: str, current_speakers: Dict) -> Tuple[str, Dict, Dict]:
    """Add a new speaker with an audio sample."""
    if not speaker_name or not audio_file:
        return "Please provide both speaker name and audio file", current_speakers, gr.update()

    if speaker_name in current_speakers:
        return f"Speaker '{speaker_name}' already exists!", current_speakers, gr.update()

    # Save the audio file
    speakers_dir = "speaker_samples"
    os.makedirs(speakers_dir, exist_ok=True)

    # Generate a unique filename
    ext = os.path.splitext(audio_file)[1] or '.wav'
    new_filename = f"{speaker_name.lower().replace(' ', '_')}{ext}"
    new_filepath = os.path.join(speakers_dir, new_filename)

    # Copy the uploaded file
    shutil.copy2(audio_file, new_filepath)

    # Update speakers dictionary
    updated_speakers = dict(current_speakers)
    updated_speakers[speaker_name] = new_filepath

    # Save to YAML
    save_speakers_config(updated_speakers)

    # The third value refreshes the dropdown choices; returning the raw dict
    # (as the old code did) is not a valid dropdown update
    return (f"Speaker '{speaker_name}' added successfully!", updated_speakers,
            gr.update(choices=list(updated_speakers.keys())))


def remove_speaker(speaker_names, current_speakers: Dict) -> Tuple[str, Dict, Dict]:
    """Remove one or more speakers from the configuration."""
    # The dropdown is multiselect, so Gradio passes a list of names here
    if isinstance(speaker_names, str):
        speaker_names = [speaker_names]
    if not speaker_names:
        return "Speaker not found!", current_speakers, gr.update()

    # Don't actually delete the audio files, just remove the config entries
    updated_speakers = dict(current_speakers)
    removed = []
    for name in speaker_names:
        if name in updated_speakers:
            del updated_speakers[name]
            removed.append(name)

    if not removed:
        return "Speaker not found!", current_speakers, gr.update()

    # Save to YAML
    save_speakers_config(updated_speakers)

    return (f"Removed: {', '.join(removed)}", updated_speakers,
            gr.update(choices=list(updated_speakers.keys()), value=[]))


def update_speakers_dropdown():
    """Update the speakers dropdown with current speakers."""
    # gr.Dropdown.update() was removed in Gradio 4; gr.update() works in 3.x and 4.x
    return gr.update(choices=list(speakers.keys()))


def generate_audio(speaker_choice, custom_sample, text, exaggeration, cfg_weight, temperature, max_new_tokens):
    # Get sample path from selection or upload
    sample_path = speakers[speaker_choice] if speaker_choice != "Custom" else custom_sample

    if not sample_path or not os.path.exists(sample_path):
        raise gr.Error("Sample file not found!")

    # Load model on every call; "mps" targets Apple Silicon, change to "cuda"
    # or "cpu" for other hardware
    tts = ChatterboxTTS.from_pretrained(device="mps")

    # Generate audio with advanced controls
    gen_kwargs = dict(
        text=text,
        audio_prompt_path=sample_path,
        exaggeration=exaggeration,
        cfg_weight=cfg_weight,
        temperature=temperature
    )
    # max_new_tokens is not supported by the current TTS library, so we ignore it here
    wav = tts.generate(**gen_kwargs)

    # Save with timestamp
    output_path = f"output_{int(time.time())}.wav"
    ta.save(output_path, wav, tts.sr)

    # The same path feeds both the audio player and the download component
    return output_path, output_path
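
# Reloading the model per request is slow; a minimal caching sketch (an
# assumption about intended usage, not wired into the UI) might look like:
#   _TTS_CACHE = {}
#   def get_tts(device="mps"):
#       if device not in _TTS_CACHE:
#           _TTS_CACHE[device] = ChatterboxTTS.from_pretrained(device=device)
#       return _TTS_CACHE[device]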


def process_dialog(dialog_text, speaker_samples, output_base, reinit_each_line, progress=gr.Progress()):
    """Process dialog text and generate audio files.

    Always returns a (summary, concatenated_file_or_None, zip_path_or_None)
    triple so the UI callback can unpack it uniformly.
    """
    try:
        print("Starting dialog processing...")  # Debug log
        print(f"Speaker samples: {speaker_samples}")  # Debug log

        if not dialog_text or not dialog_text.strip():
            return "Error: No dialog text provided", None, None

        # Parse dialog lines
        dialog_lines = [line.strip() for line in dialog_text.split('\n') if line.strip()]

        if not dialog_lines:
            return "Error: No valid dialog lines found", None, None

        print(f"Processing {len(dialog_lines)} dialog lines")  # Debug log

        # Initialize model only once if not reinitializing per line
        model = None
        if not reinit_each_line:
            progress(0.1, desc="Loading TTS model...")
            try:
                model = ChatterboxTTS.from_pretrained(device="mps")
                print("TTS model loaded successfully")  # Debug log
            except Exception as e:
                return f"Error loading TTS model: {str(e)}", None, None

        # Create output directory
        output_dir = "dialog_output"
        os.makedirs(output_dir, exist_ok=True)

        # Process each dialog line
        file_counter = 1
        output_files = []
        summary = []

        for i, line in enumerate(dialog_lines):
            progress(i / len(dialog_lines), desc=f"Processing line {i+1}/{len(dialog_lines)}")
            print(f"Processing line {i+1}: {line}")  # Debug log

            try:
                parsed = parse_dialog_line(line)
                if not parsed:
                    print(f"Skipping line (invalid format): {line}")  # Debug log
                    continue

                speaker, text = parsed
                print(f"Found speaker: {speaker}, text: {text[:50]}...")  # Debug log

                if speaker not in speaker_samples:
                    msg = f"Skipping unknown speaker: {speaker}"
                    print(msg)  # Debug log
                    summary.append(msg)
                    continue

                sample_path = speaker_samples[speaker]
                if not os.path.exists(sample_path):
                    msg = f"Audio sample not found for speaker '{speaker}': {sample_path}"
                    print(msg)  # Debug log
                    summary.append(msg)
                    continue

                if reinit_each_line or model is None:
                    print("Initializing new TTS model instance")  # Debug log
                    model = ChatterboxTTS.from_pretrained(device="mps")

                if len(text) > 300:
                    # Long lines are synthesized chunk by chunk at sentence boundaries
                    chunks = split_text_at_sentence_boundaries(text)
                    print(f"Splitting long text into {len(chunks)} chunks")  # Debug log
                    for chunk in chunks:
                        output_file = os.path.join(output_dir, f"{file_counter:03d}-{output_base}.wav")
                        print(f"Generating audio for chunk: {chunk[:50]}...")  # Debug log
                        try:
                            wav = model.generate(chunk, audio_prompt_path=sample_path)
                            ta.save(output_file, wav, model.sr)
                            output_files.append(output_file)
                            summary.append(f"{output_file}: {speaker} (chunk) - {chunk[:50]}...")
                            file_counter += 1
                            print(f"Generated audio: {output_file}")  # Debug log
                        except Exception as e:
                            error_msg = f"Error generating audio for chunk: {str(e)}"
                            print(error_msg)  # Debug log
                            summary.append(error_msg)
                            continue
                else:
                    output_file = os.path.join(output_dir, f"{file_counter:03d}-{output_base}.wav")
                    print(f"Generating audio: {text[:50]}...")  # Debug log
                    try:
                        wav = model.generate(text, audio_prompt_path=sample_path)
                        ta.save(output_file, wav, model.sr)
                        output_files.append(output_file)
                        summary.append(f"{output_file}: {speaker} - {text[:50]}...")
                        file_counter += 1
                        print(f"Generated audio: {output_file}")  # Debug log
                    except Exception as e:
                        error_msg = f"Error generating audio: {str(e)}"
                        print(error_msg)  # Debug log
                        summary.append(error_msg)
                        continue

            except Exception as e:
                error_msg = f"Error processing line '{line}': {str(e)}"
                print(error_msg)  # Debug log
                summary.append(error_msg)
                continue

        if not output_files:
            return "Error: No audio files were generated. Check speaker names and audio samples.", None, None

        # Concatenate all audio files with 1-second gaps
        concatenated_file = None
        try:
            if len(output_files) > 1:
                print("Concatenating audio files...")  # Debug log
                # Load all audio files
                waveforms = []
                sample_rates = set()

                for file in output_files:
                    waveform, sample_rate = ta.load(file)
                    waveforms.append(waveform)
                    sample_rates.add(sample_rate)

                if len(sample_rates) != 1:
                    raise ValueError(f"Sample rate mismatch: {sample_rates}")

                sample_rate = sample_rates.pop()
                gap_samples = int(1.0 * sample_rate)  # 1 second gap
                gap = torch.zeros(1, gap_samples)  # Mono channel

                # Concatenate waveforms with gaps
                concatenated = waveforms[0]
                for wav in waveforms[1:]:
                    concatenated = torch.cat([concatenated, gap, wav], dim=1)

                # Save concatenated file
                concatenated_path = os.path.join(output_dir, f"{output_base}_concatenated.wav")
                ta.save(concatenated_path, concatenated, sample_rate)
                output_files.append(concatenated_path)
                summary.append(f"\nConcatenated file: {concatenated_path}")
                concatenated_file = concatenated_path
                print(f"Created concatenated file: {concatenated_path}")

            # Create a zip file of all outputs
            import zipfile
            zip_path = os.path.join(output_dir, f"{output_base}.zip")
            print(f"Creating zip file: {zip_path}")  # Debug log
            with zipfile.ZipFile(zip_path, 'w') as zipf:
                for file in output_files:
                    zipf.write(file, os.path.basename(file))
            print(f"Zip file created successfully with {len(output_files)} files")  # Debug log

            # concatenated_file is None when only one clip was generated
            return "\n".join(summary), concatenated_file, zip_path
        except Exception as e:
            error_msg = f"Error creating zip file: {str(e)}"
            print(error_msg)  # Debug log
            return "\n".join(summary + [error_msg]), None, None

    except Exception as e:
        error_msg = f"Unexpected error: {str(e)}"
        print(error_msg)  # Debug log
        import traceback
        traceback.print_exc()  # Print full traceback
        return error_msg, None, None
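
# Illustrative dialog input for process_dialog (one utterance per line;
# anything that doesn't match the Name: "Text" pattern is skipped):
#   Alice: "Hello, how are you?"
#   Bob: "I'm doing well, thank you!"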


with gr.Blocks() as demo:
    gr.Markdown("# Chatterbox TTS Generator")

    # Store the speakers mapping in Gradio state so UI edits persist per session
    speakers_state = gr.State(speakers)

    with gr.Tabs() as tabs:
        with gr.TabItem("Single Utterance"):
            with gr.Row():
                with gr.Column():
                    speaker_dropdown = gr.Dropdown(
                        choices=["Custom"] + list(speakers.keys()),
                        value="Custom",
                        label="Select Speaker"
                    )
                    custom_upload = gr.Audio(
                        label="Or upload custom speaker sample",
                        type="filepath",
                        visible=True
                    )
                    text_input = gr.Textbox(
                        label="Text to synthesize",
                        placeholder="Enter text here...",
                        lines=3
                    )
                    exaggeration_slider = gr.Slider(
                        minimum=0.0, maximum=2.0, value=0.5, step=0.01,
                        label="Exaggeration (emotion)",
                        info="Controls expressiveness. 0.5 = neutral, higher = more expressive."
                    )
                    cfg_weight_slider = gr.Slider(
                        minimum=0.0, maximum=2.0, value=0.5, step=0.01,
                        label="CFG Weight",
                        info="Higher = more faithful to text, lower = more like reference voice."
                    )
                    temperature_slider = gr.Slider(
                        minimum=0.1, maximum=2.0, value=0.8, step=0.01,
                        label="Temperature",
                        info="Controls randomness. Higher = more variation."
                    )
                    max_new_tokens_box = gr.Number(
                        value=1000,
                        label="Max New Tokens (advanced)",
                        precision=0,
                        info="Maximum audio tokens to generate. Increase for longer texts."
                    )
                    generate_btn = gr.Button("Generate Speech")

                with gr.Column():
                    audio_output = gr.Audio(label="Generated Speech")
                    download = gr.File(label="Download WAV")

            gr.Examples(
                examples=[
                    ["Hello world! This is a demo.", "Tara"],
                    ["Welcome to the future of text-to-speech.", "Zac"]
                ],
                inputs=[text_input, speaker_dropdown]
            )

            generate_btn.click(
                fn=generate_audio,
                inputs=[speaker_dropdown, custom_upload, text_input, exaggeration_slider,
                        cfg_weight_slider, temperature_slider, max_new_tokens_box],
                outputs=[audio_output, download]
            )

        with gr.TabItem("Dialog Generation"):
            with gr.Row():
                with gr.Column():
                    with gr.Row():
                        with gr.Column(scale=2):
                            dialog_text = gr.Textbox(
                                label="Dialog Text",
                                placeholder='''Enter dialog in format:
Speaker1: "Hello, how are you?"
Speaker2: "I'm doing well, thank you!"
Speaker1: "What are your plans for today?"
Speaker2: "I'm working on a new project."''',
                                lines=10
                            )
                        with gr.Column(scale=1):
                            with gr.Group():
                                gr.Markdown("### Speaker Configuration")
                                with gr.Row():
                                    new_speaker_name = gr.Textbox(
                                        label="New Speaker Name",
                                        placeholder="Enter speaker name"
                                    )
                                    new_speaker_audio = gr.Audio(
                                        label="Speaker Sample",
                                        type="filepath"
                                    )
                                with gr.Row():
                                    add_speaker_btn = gr.Button("Add Speaker")
                                    remove_speaker_btn = gr.Button("Remove Selected")

                                speakers_dropdown = gr.Dropdown(
                                    label="Available Speakers",
                                    choices=list(speakers.keys()) if speakers else [],
                                    interactive=True,
                                    multiselect=True
                                )

                    gr.Markdown("### Generation Settings")
                    with gr.Row():
                        output_base = gr.Textbox(
                            label="Output Base Name",
                            value="dialog_output",
                            placeholder="base_name (will generate 001-base_name.wav, etc.)"
                        )
                        reinit_each_line = gr.Checkbox(
                            label="Re-initialize model each line",
                            value=False,
                            info="Reduces memory usage but is slower"
                        )

                    config_status = gr.Textbox(
                        label="Status",
                        interactive=False,
                        visible=True
                    )

                    dialog_generate_btn = gr.Button("Generate Dialog")

                with gr.Column():
                    dialog_output = gr.Textbox(
                        label="Generation Log",
                        interactive=False,
                        lines=15
                    )
                    concatenated_audio = gr.Audio(
                        label="Concatenated Audio",
                        visible=False
                    )
                    dialog_download = gr.File(
                        label="Download All Files",
                        visible=False
                    )

    # Event handlers: add/remove return (status, state, dropdown update), so
    # both the state dict and the dropdown choices stay in sync
    add_speaker_btn.click(
        fn=add_speaker,
        inputs=[new_speaker_name, new_speaker_audio, speakers_state],
        outputs=[config_status, speakers_state, speakers_dropdown]
    )

    remove_speaker_btn.click(
        fn=remove_speaker,
        inputs=[speakers_dropdown, speakers_state],
        outputs=[config_status, speakers_state, speakers_dropdown]
    )

    # Refresh the speakers dropdown from state when the tab changes; reading
    # the module-level `speakers` dict here would miss speakers added in the UI
    def on_tab_change(current_speakers):
        return gr.update(choices=list(current_speakers.keys()))

    tabs.select(
        fn=on_tab_change,
        inputs=[speakers_state],
        outputs=[speakers_dropdown]
    )

    # Normalize process_dialog's (summary, concatenated, zip) triple into
    # component updates, hiding the audio and download widgets when absent
    def update_outputs(*args):
        summary, concat_file, zip_file = process_dialog(*args)
        return [
            summary,                                                 # dialog_output
            gr.Audio(value=concat_file, visible=bool(concat_file)),  # concatenated_audio
            gr.File(value=zip_file, visible=bool(zip_file)),         # dialog_download
        ]

    dialog_generate_btn.click(
        fn=update_outputs,
        inputs=[
            dialog_text,
            speakers_state,  # Pass the current speakers dict
            output_base,
            reinit_each_line
        ],
        outputs=[
            dialog_output,       # Text output
            concatenated_audio,  # Audio component
            dialog_download      # Zip file
        ]
    )
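
# share=True asks Gradio to open a temporary public tunnel (a *.gradio.live
# URL in recent versions) in addition to the local server; drop it to keep
# the app local-only.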

if __name__ == "__main__":
    demo.launch(share=True)