chatterbox-ui/gradio_app.py

import gradio as gr
import yaml
import os
import time
from chatterbox.tts import ChatterboxTTS
import torchaudio as ta
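
# speakers.yaml is expected to map display names to reference-audio file paths
# (an assumption, inferred from how the values are used in generate_audio below), e.g.:
#
#   Tara: samples/tara.wav   # illustrative paths
#   Zac: samples/zac.wav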
# Load speaker options from YAML with error handling
try:
    yaml_path = os.path.abspath("speakers.yaml")
    if not os.path.exists(yaml_path):
        raise FileNotFoundError(f"speakers.yaml not found at {yaml_path}")
    with open(yaml_path) as f:
        speakers = yaml.safe_load(f)
    if not speakers or not isinstance(speakers, dict):
        raise ValueError("speakers.yaml must contain a valid dictionary mapping")
except Exception as e:
    raise SystemExit(f"Failed to load speakers.yaml: {e}")


def generate_audio(speaker_choice, custom_sample, text, exaggeration, cfg_weight, temperature, max_new_tokens):
    # Get sample path from selection or upload
    sample_path = speakers[speaker_choice] if speaker_choice != "Custom" else custom_sample
    if not sample_path or not os.path.exists(sample_path):
        raise gr.Error("Sample file not found!")
    # Load model (note: this reloads the checkpoint on every call; see the optional
    # caching sketch after this function)
    tts = ChatterboxTTS.from_pretrained(device="mps")
    # Generate audio with advanced controls
    gen_kwargs = dict(
        text=text,
        audio_prompt_path=sample_path,
        exaggeration=exaggeration,
        cfg_weight=cfg_weight,
        temperature=temperature,
    )
    # max_new_tokens is not supported by the current TTS library, so we ignore it here
    wav = tts.generate(**gen_kwargs)
    # Save with timestamp
    output_path = f"output_{int(time.time())}.wav"
    ta.save(output_path, wav, tts.sr)
    return output_path, output_path
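

# Note: generate_audio() above reloads the checkpoint on every request. A minimal
# caching sketch (an assumption that a ChatterboxTTS instance can be reused across
# calls; not wired into the handler above):
_tts_cache = None


def get_tts():
    # Lazily load the model once, then reuse it for subsequent requests.
    global _tts_cache
    if _tts_cache is None:
        _tts_cache = ChatterboxTTS.from_pretrained(device="mps")
    return _tts_cache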


with gr.Blocks() as demo:
    gr.Markdown("# Chatterbox TTS Generator")
    with gr.Row():
        with gr.Column():
            speaker_dropdown = gr.Dropdown(
                choices=["Custom"] + list(speakers.keys()),
                value="Custom",
                label="Select Speaker"
            )
            custom_upload = gr.Audio(
                label="Or upload custom speaker sample",
                type="filepath",
                visible=True
            )
            text_input = gr.Textbox(
                label="Text to synthesize",
                placeholder="Enter text here...",
                lines=3
            )
            exaggeration_slider = gr.Slider(
                minimum=0.0, maximum=2.0, value=0.5, step=0.01,
                label="Exaggeration (emotion)",
                info="Controls expressiveness. 0.5 = neutral, higher = more expressive."
            )
            cfg_weight_slider = gr.Slider(
                minimum=0.0, maximum=2.0, value=0.5, step=0.01,
                label="CFG Weight",
                info="Higher = more faithful to text, lower = more like reference voice."
            )
            temperature_slider = gr.Slider(
                minimum=0.1, maximum=2.0, value=0.8, step=0.01,
                label="Temperature",
                info="Controls randomness. Higher = more variation."
            )
            max_new_tokens_box = gr.Number(
                value=1000,
                label="Max New Tokens (advanced)",
                precision=0,
                info="Maximum audio tokens to generate. Increase for longer texts."
            )
            generate_btn = gr.Button("Generate Speech")
        with gr.Column():
            audio_output = gr.Audio(label="Generated Speech")
            download = gr.File(label="Download WAV")

    gr.Examples(
        examples=[
            ["Hello world! This is a demo.", "Tara"],
            ["Welcome to the future of text-to-speech.", "Zac"]
        ],
        inputs=[text_input, speaker_dropdown]
    )

    generate_btn.click(
        fn=generate_audio,
        inputs=[speaker_dropdown, custom_upload, text_input, exaggeration_slider, cfg_weight_slider, temperature_slider, max_new_tokens_box],
        outputs=[audio_output, download]
    )


if __name__ == "__main__":
    demo.launch(share=True)
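
# To run locally (assuming gradio, pyyaml, torchaudio, and the package providing
# chatterbox.tts are installed):
#   python gradio_app.py
# share=True also requests a temporary public Gradio link; remove it to serve only
# on localhost.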