# Source listing header (as extracted): 114 lines, 3.9 KiB, Python.
import os
import time

import gradio as gr
import torch
import torchaudio as ta
import yaml
from chatterbox.tts import ChatterboxTTS

# Load the speaker-name -> reference-sample mapping from speakers.yaml.
# Any expected failure (missing/unreadable file, malformed YAML, wrong
# top-level type) aborts start-up with a single readable message.
try:
    yaml_path = os.path.abspath("speakers.yaml")
    if not os.path.exists(yaml_path):
        raise FileNotFoundError(f"speakers.yaml not found at {yaml_path}")

    # Decode explicitly as UTF-8 so non-ASCII speaker names do not depend on
    # the platform's default locale encoding.
    with open(yaml_path, encoding="utf-8") as f:
        speakers = yaml.safe_load(f)

    # An empty file yields None; lists/scalars are not usable as a mapping.
    if not speakers or not isinstance(speakers, dict):
        raise ValueError("speakers.yaml must contain a valid dictionary mapping")

except (OSError, ValueError, yaml.YAMLError) as e:
    # Only anticipated load/parse errors are converted; genuine programming
    # errors keep their traceback. Chain the cause for debuggability.
    raise SystemExit(f"Failed to load speakers.yaml: {e}") from e


# Lazily-created model instance shared by every request; loading the
# checkpoint is expensive, so it must not happen on each button click.
_TTS_MODEL = None


def _pick_device():
    """Return the name of the best available torch device, preferring MPS."""
    if torch.backends.mps.is_available():
        return "mps"
    if torch.cuda.is_available():
        return "cuda"
    return "cpu"


def _get_tts():
    """Load the Chatterbox model on first use and reuse it afterwards."""
    global _TTS_MODEL
    if _TTS_MODEL is None:
        _TTS_MODEL = ChatterboxTTS.from_pretrained(device=_pick_device())
    return _TTS_MODEL


def generate_audio(speaker_choice, custom_sample, text, exaggeration, cfg_weight, temperature, max_new_tokens):
    """Synthesize ``text`` using a reference speaker sample and save it as WAV.

    Args:
        speaker_choice: Either ``"Custom"`` or a key of the module-level
            ``speakers`` mapping.
        custom_sample: File path of the uploaded reference sample; only used
            when ``speaker_choice`` is ``"Custom"``. May be ``None`` when
            nothing was uploaded.
        text: The text to synthesize.
        exaggeration: Forwarded to ``ChatterboxTTS.generate``.
        cfg_weight: Forwarded to ``ChatterboxTTS.generate``.
        temperature: Forwarded to ``ChatterboxTTS.generate``.
        max_new_tokens: Accepted so the UI wiring stays unchanged, but
            ignored because the TTS call does not take it.

    Returns:
        A ``(path, path)`` tuple with the generated WAV file path repeated
        twice, one entry per output component wired to this callback.

    Raises:
        gr.Error: If no text was entered, or the reference sample is missing
            (no upload, unknown speaker name, or non-existent file).
    """
    if not text or not text.strip():
        raise gr.Error("Please enter some text to synthesize.")

    # Resolve the reference sample from the upload or the configured mapping.
    # .get() keeps an unknown/cleared speaker from surfacing as a KeyError.
    if speaker_choice == "Custom":
        sample_path = custom_sample
    else:
        sample_path = speakers.get(speaker_choice)

    # sample_path is None when "Custom" is selected without an upload;
    # os.path.exists(None) would raise TypeError, so test for emptiness first.
    if not sample_path or not os.path.exists(sample_path):
        raise gr.Error("Sample file not found!")

    tts = _get_tts()

    # max_new_tokens is not supported by the current TTS library, so it is
    # deliberately left out of the generation arguments.
    wav = tts.generate(
        text=text,
        audio_prompt_path=sample_path,
        exaggeration=exaggeration,
        cfg_weight=cfg_weight,
        temperature=temperature,
    )

    # Save with a timestamp so successive generations do not share a name.
    output_path = f"output_{int(time.time())}.wav"
    ta.save(output_path, wav, tts.sr)

    return output_path, output_path


# Build the web UI: controls in the left column, results in the right one,
# clickable examples underneath, and the button wired to generate_audio.
with gr.Blocks() as demo:
    gr.Markdown("# Chatterbox TTS Generator")

    with gr.Row():
        # Left column: speaker selection, text and generation controls.
        with gr.Column():
            speaker_dropdown = gr.Dropdown(
                # "Custom" (use the upload) first, then every configured speaker.
                choices=["Custom", *speakers],
                value="Custom",
                label="Select Speaker",
            )
            custom_upload = gr.Audio(
                label="Or upload custom speaker sample",
                type="filepath",
                visible=True,
            )
            text_input = gr.Textbox(
                label="Text to synthesize",
                placeholder="Enter text here...",
                lines=3,
            )
            exaggeration_slider = gr.Slider(
                minimum=0.0,
                maximum=2.0,
                value=0.5,
                step=0.01,
                label="Exaggeration (emotion)",
                info="Controls expressiveness. 0.5 = neutral, higher = more expressive.",
            )
            cfg_weight_slider = gr.Slider(
                minimum=0.0,
                maximum=2.0,
                value=0.5,
                step=0.01,
                label="CFG Weight",
                info="Higher = more faithful to text, lower = more like reference voice.",
            )
            temperature_slider = gr.Slider(
                minimum=0.1,
                maximum=2.0,
                value=0.8,
                step=0.01,
                label="Temperature",
                info="Controls randomness. Higher = more variation.",
            )
            max_new_tokens_box = gr.Number(
                value=1000,
                label="Max New Tokens (advanced)",
                precision=0,
                info="Maximum audio tokens to generate. Increase for longer texts.",
            )
            generate_btn = gr.Button("Generate Speech")

        # Right column: playback of the result plus a downloadable file.
        with gr.Column():
            audio_output = gr.Audio(label="Generated Speech")
            download = gr.File(label="Download WAV")

    # Each example row fills the text box and the speaker dropdown, in that order.
    example_rows = [
        ["Hello world! This is a demo.", "Tara"],
        ["Welcome to the future of text-to-speech.", "Zac"],
    ]
    gr.Examples(examples=example_rows, inputs=[text_input, speaker_dropdown])

    # Order must match the positional parameters of generate_audio.
    generation_inputs = [
        speaker_dropdown,
        custom_upload,
        text_input,
        exaggeration_slider,
        cfg_weight_slider,
        temperature_slider,
        max_new_tokens_box,
    ]
    generate_btn.click(
        fn=generate_audio,
        inputs=generation_inputs,
        outputs=[audio_output, download],
    )


# Script entry point: start the Gradio app only when run directly,
# not when this module is imported.
if __name__ == "__main__":
    # Launch with the share option enabled, as configured by the author.
    demo.launch(share=True)