chatterbox-ui/gradio_app.py

import gradio as gr
import yaml
import os
import time
from chatterbox.tts import ChatterboxTTS
import torchaudio as ta
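
# speakers.yaml is expected to map display names to reference-audio file paths
# (an assumption, inferred from how the values are used in generate_audio below), e.g.:
#
#   Tara: samples/tara.wav   # illustrative paths
#   Zac: samples/zac.wav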
# Load speaker options from YAML with error handling
try:
    yaml_path = os.path.abspath("speakers.yaml")
    if not os.path.exists(yaml_path):
        raise FileNotFoundError(f"speakers.yaml not found at {yaml_path}")
    with open(yaml_path) as f:
        speakers = yaml.safe_load(f)
    if not speakers or not isinstance(speakers, dict):
        raise ValueError("speakers.yaml must contain a valid dictionary mapping")
except Exception as e:
    raise SystemExit(f"Failed to load speakers.yaml: {e}")


def generate_audio(speaker_choice, custom_sample, text, exaggeration, cfg_weight, temperature, max_new_tokens):
    # Get sample path from selection or upload
    sample_path = speakers[speaker_choice] if speaker_choice != "Custom" else custom_sample
    if not sample_path or not os.path.exists(sample_path):
        raise gr.Error("Sample file not found!")
    # Load model (note: this reloads the checkpoint on every call; see the optional
    # caching sketch after this function)
    tts = ChatterboxTTS.from_pretrained(device="mps")
    # Generate audio with advanced controls
    gen_kwargs = dict(
        text=text,
        audio_prompt_path=sample_path,
        exaggeration=exaggeration,
        cfg_weight=cfg_weight,
        temperature=temperature,
    )
    # max_new_tokens is not supported by the current TTS library, so we ignore it here
    wav = tts.generate(**gen_kwargs)
    # Save with timestamp
    output_path = f"output_{int(time.time())}.wav"
    ta.save(output_path, wav, tts.sr)
    return output_path, output_path
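

# Note: generate_audio() above reloads the checkpoint on every request. A minimal
# caching sketch (an assumption that a ChatterboxTTS instance can be reused across
# calls; not wired into the handler above):
_tts_cache = None


def get_tts():
    # Lazily load the model once, then reuse it for subsequent requests.
    global _tts_cache
    if _tts_cache is None:
        _tts_cache = ChatterboxTTS.from_pretrained(device="mps")
    return _tts_cache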


with gr.Blocks() as demo:
    gr.Markdown("# Chatterbox TTS Generator")
    with gr.Row():
        with gr.Column():
            speaker_dropdown = gr.Dropdown(
                choices=["Custom"] + list(speakers.keys()),
                value="Custom",
                label="Select Speaker"
            )
            custom_upload = gr.Audio(
                label="Or upload custom speaker sample",
                type="filepath",
                visible=True
            )
            text_input = gr.Textbox(
                label="Text to synthesize",
                placeholder="Enter text here...",
                lines=3
            )
            exaggeration_slider = gr.Slider(
                minimum=0.0, maximum=2.0, value=0.5, step=0.01,
                label="Exaggeration (emotion)",
                info="Controls expressiveness. 0.5 = neutral, higher = more expressive."
            )
            cfg_weight_slider = gr.Slider(
                minimum=0.0, maximum=2.0, value=0.5, step=0.01,
                label="CFG Weight",
                info="Higher = more faithful to text, lower = more like reference voice."
            )
            temperature_slider = gr.Slider(
                minimum=0.1, maximum=2.0, value=0.8, step=0.01,
                label="Temperature",
                info="Controls randomness. Higher = more variation."
            )
            max_new_tokens_box = gr.Number(
                value=1000,
                label="Max New Tokens (advanced)",
                precision=0,
                info="Maximum audio tokens to generate. Increase for longer texts."
            )
            generate_btn = gr.Button("Generate Speech")
        with gr.Column():
            audio_output = gr.Audio(label="Generated Speech")
            download = gr.File(label="Download WAV")

    gr.Examples(
        examples=[
            ["Hello world! This is a demo.", "Tara"],
            ["Welcome to the future of text-to-speech.", "Zac"]
        ],
        inputs=[text_input, speaker_dropdown]
    )

    generate_btn.click(
        fn=generate_audio,
        inputs=[speaker_dropdown, custom_upload, text_input, exaggeration_slider, cfg_weight_slider, temperature_slider, max_new_tokens_box],
        outputs=[audio_output, download]
    )


if __name__ == "__main__":
    demo.launch(share=True)
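
# To run locally (assuming gradio, pyyaml, torchaudio, and the package providing
# chatterbox.tts are installed):
#   python gradio_app.py
# share=True also requests a temporary public Gradio link; remove it to serve only
# on localhost.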