"""Gradio front end for speech synthesis with Chatterbox TTS.

The app reads a mapping of speaker names to reference-audio paths from
``speakers.yaml`` (looked up relative to the current working directory),
lets the user pick one of those speakers or upload a custom reference
sample, and writes the generated speech to a WAV file.
"""
import functools
import os
import time

import gradio as gr
import torch
import torchaudio as ta
import yaml
from chatterbox.tts import ChatterboxTTS

# Dropdown entry meaning "use the uploaded sample instead of a preset".
CUSTOM_SPEAKER = "Custom"


def _load_speakers(path="speakers.yaml"):
    """Return the speaker-name -> sample-path mapping stored in *path*.

    Raises:
        SystemExit: if the file is missing, unreadable, not valid YAML,
            or does not contain a non-empty mapping.
    """
    yaml_path = os.path.abspath(path)
    try:
        if not os.path.exists(yaml_path):
            raise FileNotFoundError(f"speakers.yaml not found at {yaml_path}")
        with open(yaml_path, encoding="utf-8") as f:
            loaded = yaml.safe_load(f)
        if not loaded or not isinstance(loaded, dict):
            raise ValueError("speakers.yaml must contain a valid dictionary mapping")
    except (OSError, ValueError, yaml.YAMLError) as e:
        # Exit with a readable message, keeping the cause for debugging.
        raise SystemExit(f"Failed to load speakers.yaml: {e}") from e
    return loaded


# Loaded once at import time; the UI below needs it to build the dropdown.
speakers = _load_speakers()


def _select_device():
    """Pick the torch device to run on: MPS first, then CUDA, then CPU."""
    if torch.backends.mps.is_available():
        return "mps"
    if torch.cuda.is_available():
        return "cuda"
    return "cpu"


@functools.lru_cache(maxsize=1)
def _load_model(device):
    """Load the Chatterbox model once and reuse it for later requests.

    Gradio does not cache objects created inside an event handler, so
    without this the model would be reloaded on every button click.
    """
    return ChatterboxTTS.from_pretrained(device=device)


def generate_audio(speaker_choice, custom_sample, text, exaggeration, cfg_weight, temperature, max_new_tokens):
    """Synthesize *text* in the selected voice and save it as a WAV file.

    Args:
        speaker_choice: A key of ``speakers``, or ``"Custom"`` to use
            *custom_sample*.
        custom_sample: Path of the uploaded reference audio, or ``None``
            when nothing was uploaded.
        text: The text to synthesize.
        exaggeration: Forwarded to ``ChatterboxTTS.generate``.
        cfg_weight: Forwarded to ``ChatterboxTTS.generate``.
        temperature: Forwarded to ``ChatterboxTTS.generate``.
        max_new_tokens: Accepted so the UI wiring stays stable, but not
            forwarded: the TTS library call used here does not take it.

    Returns:
        A ``(path, path)`` tuple: the same output file path twice, once
        for the audio player and once for the download component.

    Raises:
        gr.Error: if the text is empty or no usable reference sample
            exists for the selection.
    """
    if not text or not text.strip():
        raise gr.Error("Please enter some text to synthesize.")

    # Resolve the reference sample from the upload or the preset mapping.
    if speaker_choice == CUSTOM_SPEAKER:
        sample_path = custom_sample
    else:
        sample_path = speakers.get(speaker_choice)

    # Covers: no upload (None), unknown speaker, a non-path YAML value,
    # and a path that does not exist on disk.
    if (
        not sample_path
        or not isinstance(sample_path, (str, os.PathLike))
        or not os.path.exists(sample_path)
    ):
        raise gr.Error("Sample file not found!")

    tts = _load_model(_select_device())

    wav = tts.generate(
        text=text,
        audio_prompt_path=sample_path,
        exaggeration=exaggeration,
        cfg_weight=cfg_weight,
        temperature=temperature,
    )

    # Nanosecond timestamp so two requests in the same second do not
    # overwrite each other's output.
    output_path = f"output_{time.time_ns()}.wav"
    ta.save(output_path, wav, tts.sr)

    return output_path, output_path


with gr.Blocks() as demo:
    gr.Markdown("# Chatterbox TTS Generator")

    with gr.Row():
        with gr.Column():
            speaker_dropdown = gr.Dropdown(
                choices=[CUSTOM_SPEAKER, *speakers],
                value=CUSTOM_SPEAKER,
                label="Select Speaker",
            )
            custom_upload = gr.Audio(
                label="Or upload custom speaker sample",
                type="filepath",
                visible=True,
            )
            text_input = gr.Textbox(
                label="Text to synthesize",
                placeholder="Enter text here...",
                lines=3,
            )
            exaggeration_slider = gr.Slider(
                minimum=0.0,
                maximum=2.0,
                value=0.5,
                step=0.01,
                label="Exaggeration (emotion)",
                info="Controls expressiveness. 0.5 = neutral, higher = more expressive.",
            )
            cfg_weight_slider = gr.Slider(
                minimum=0.0,
                maximum=2.0,
                value=0.5,
                step=0.01,
                label="CFG Weight",
                info="Higher = more faithful to text, lower = more like reference voice.",
            )
            temperature_slider = gr.Slider(
                minimum=0.1,
                maximum=2.0,
                value=0.8,
                step=0.01,
                label="Temperature",
                info="Controls randomness. Higher = more variation.",
            )
            max_new_tokens_box = gr.Number(
                value=1000,
                label="Max New Tokens (advanced)",
                precision=0,
                info="Maximum audio tokens to generate. Increase for longer texts.",
            )
            generate_btn = gr.Button("Generate Speech")

        with gr.Column():
            audio_output = gr.Audio(label="Generated Speech")
            download = gr.File(label="Download WAV")

    # Only offer examples whose speaker is actually defined in
    # speakers.yaml; a value outside the dropdown choices cannot be used.
    example_rows = [
        row
        for row in (
            ["Hello world! This is a demo.", "Tara"],
            ["Welcome to the future of text-to-speech.", "Zac"],
        )
        if row[1] in speakers
    ]
    if example_rows:  # gr.Examples rejects an empty list
        gr.Examples(
            examples=example_rows,
            inputs=[text_input, speaker_dropdown],
        )

    generate_btn.click(
        fn=generate_audio,
        inputs=[
            speaker_dropdown,
            custom_upload,
            text_input,
            exaggeration_slider,
            cfg_weight_slider,
            temperature_slider,
            max_new_tokens_box,
        ],
        outputs=[audio_output, download],
    )

if __name__ == "__main__":
    # NOTE: share=True publishes a public URL for this app.
    demo.launch(share=True)