Gradio app added, cbx-dialog-generate.py added

commit 63efb26910
Steve White 2025-06-04 08:30:07 -05:00
10 changed files with 641 additions and 0 deletions

5 .gitignore vendored Normal file

@@ -0,0 +1,5 @@
.venv
.gradio
output*.wav
*.wav
*.mp3

89 README.md Normal file

@@ -0,0 +1,89 @@
# Chatterbox Dialog Generator
This tool generates audio files for dialog from a markdown file, using the Chatterbox TTS system. It maps speaker names to audio samples using a YAML configuration file.
## Features
- Maps speaker names to audio samples via a YAML config file
- Processes markdown dialog files with lines in the format: `Name: "Text"`
- Generates sequentially numbered audio files (e.g., `001-output.wav`, `002-output.wav`)
- Automatically splits long dialog lines (>300 characters) at sentence boundaries
- Provides a summary of generated files
## Requirements
- Python 3.6+
- PyYAML
- torchaudio
- Chatterbox TTS library
## Usage
```bash
python cbx-dialog-generate.py --config speakers.yaml --dialog sample-dialog.md --output-base output
```
### Arguments
- `--config`: Path to the YAML config file mapping speaker names to audio samples
- `--dialog`: Path to the markdown dialog file
- `--output-base`: Base name for output files (e.g., "output" for "001-output.wav")
- `--reinit-each-line`: Re-initialize the model after each line to reduce memory usage (useful for long dialogs)
## Config File Format (YAML)
The config file maps speaker names (as they appear in the dialog) to audio sample files:
```yaml
Denise: denise.wav
Mark: mark.wav
Mary: mary.wav
```
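The script loads this file with PyYAML's `safe_load`, so the config is just a dictionary mapping names to file paths. A minimal sketch of what that looks like, using the example mapping above:
```python
import yaml

# Loading mirrors what the script does internally.
with open("speakers.yaml") as f:
    speaker_samples = yaml.safe_load(f)

print(speaker_samples["Denise"])  # denise.wav (per the example mapping above)
```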
## Dialog File Format (Markdown)
The dialog file should contain lines in the format:
```
Name: "Text"
```
For example:
```
Denise: "What do you think is wrong with me?"
Mark: "I think you're being overly emotional."
Mary: "Jesus, Mark, can you be any more of an asshole?"
```
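Each line is matched against the same regular expression the script uses internally; lines that don't match are skipped with a warning. A quick illustration:
```python
import re

# The pattern used by the script: a speaker name, a colon, then quoted text.
pattern = r'^([^:]+):\s*"([^"]+)"$'
line = 'Denise: "What do you think is wrong with me?"'
match = re.match(pattern, line.strip())
print(match.groups())  # ('Denise', 'What do you think is wrong with me?')
```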
## Output
The script generates sequentially numbered WAV files:
- `001-output.wav`
- `002-output.wav`
- etc.
If a dialog line exceeds 300 characters, it will be split at sentence boundaries into multiple files, each maintaining the sequential numbering.
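One way to sanity-check the splitting behavior is to load the script as a module and call `split_text_at_sentence_boundaries` directly. A minimal sketch (the `importlib` step is only needed because the script's file name contains hyphens; the example text is illustrative):
```python
import importlib.util

# Load cbx-dialog-generate.py as a module (hyphens prevent a normal import).
spec = importlib.util.spec_from_file_location(
    "cbx_dialog_generate", "cbx-dialog-generate.py")
mod = importlib.util.module_from_spec(spec)
spec.loader.exec_module(mod)

long_line = "This is one sentence. " * 20  # ~440 characters
chunks = mod.split_text_at_sentence_boundaries(long_line, max_length=300)
print(len(chunks))                         # 2
print(all(len(c) <= 300 for c in chunks))  # True
```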
## Example
Given the sample dialog and config files, running:
```bash
python cbx-dialog-generate.py --config speakers.yaml --dialog sample-dialog.md --output-base output
```
For long dialogs where memory usage is a concern, you can use:
```bash
python cbx-dialog-generate.py --config speakers.yaml --dialog sample-dialog.md --output-base output --reinit-each-line
```
Either command would generate:
- `001-output.wav` - Leah's first line
- `002-output.wav` - Zac's first line
- `003-output.wav` - Tara's line
- `004-output.wav` - First part of Leah's long line
- `005-output.wav` - Second part of Leah's long line
- `006-output.wav` - Zac's second line

143 cbx-dialog-generate.py Normal file

@@ -0,0 +1,143 @@
#!/usr/bin/env python3
import argparse
import re
import os
import yaml
import torchaudio as ta
from chatterbox.tts import ChatterboxTTS


def split_text_at_sentence_boundaries(text, max_length=300):
    """
    Split text at sentence boundaries, ensuring each chunk is <= max_length.
    Returns a list of text chunks.
    """
    # Simple regex for sentence boundaries (period, question mark, exclamation mark followed by space or end)
    sentence_pattern = r'[.!?](?:\s|$)'
    sentences = re.split(f'({sentence_pattern})', text)

    # Recombine the split parts (the regex split keeps the delimiters as separate items)
    actual_sentences = []
    current = ""
    for i in range(0, len(sentences), 2):
        if i + 1 < len(sentences):
            current = sentences[i] + sentences[i + 1]
        else:
            current = sentences[i]
        if current:
            actual_sentences.append(current)

    # Group sentences into chunks <= max_length
    chunks = []
    current_chunk = ""
    for sentence in actual_sentences:
        # If adding this sentence would exceed max_length and we already have content,
        # finish the current chunk and start a new one
        if len(current_chunk) + len(sentence) > max_length and current_chunk:
            chunks.append(current_chunk.strip())
            current_chunk = sentence
        else:
            current_chunk += sentence

    # Add the last chunk if it has content
    if current_chunk:
        chunks.append(current_chunk.strip())

    return chunks


def parse_dialog_line(line):
    """
    Parse a dialog line in the format: Name: "Text"
    Returns a tuple of (speaker, text) or None if the line doesn't match the pattern.
    """
    pattern = r'^([^:]+):\s*"([^"]+)"$'
    match = re.match(pattern, line.strip())
    if match:
        speaker = match.group(1).strip()
        text = match.group(2).strip()
        return (speaker, text)
    return None


def main():
    parser = argparse.ArgumentParser(description="Generate dialog audio from markdown file using Chatterbox TTS")
    parser.add_argument('--config', required=True, type=str, help='YAML config file mapping speaker names to audio samples')
    parser.add_argument('--dialog', required=True, type=str, help='Markdown dialog file')
    parser.add_argument('--output-base', required=True, type=str, help='Base name for output files (e.g., "output" for "001-output.wav")')
    parser.add_argument('--reinit-each-line', action='store_true', help='Re-initialize the model after each line to reduce memory usage')
    args = parser.parse_args()

    # Load the YAML config
    with open(args.config, 'r') as f:
        speaker_samples = yaml.safe_load(f)

    # Load the dialog file
    with open(args.dialog, 'r') as f:
        dialog_lines = f.readlines()

    # Initialize model only once if not reinitializing per line
    model = None
    if not args.reinit_each_line:
        print("Loading ChatterboxTTS model once for all lines...")
        model = ChatterboxTTS.from_pretrained(device="mps")

    # Process each dialog line
    file_counter = 1
    summary = []
    for line_num, line in enumerate(dialog_lines, 1):
        parsed = parse_dialog_line(line)
        if not parsed:
            print(f"Skipping line {line_num}: Not in the expected format")
            continue
        speaker, text = parsed

        # Check if the speaker is in the config
        if speaker not in speaker_samples:
            print(f"Warning: Speaker '{speaker}' not found in config, skipping line {line_num}")
            continue
        sample_path = speaker_samples[speaker]

        # Reinitialize model if needed
        if args.reinit_each_line or model is None:
            if args.reinit_each_line:
                print(f"Reinitializing model for line {line_num}...")
            model = ChatterboxTTS.from_pretrained(device="mps")

        # Check if the text needs to be split (> 300 chars)
        if len(text) > 300:
            chunks = split_text_at_sentence_boundaries(text)
            chunk_files = []
            for chunk in chunks:
                output_file = f"{file_counter:03d}-{args.output_base}.wav"
                # Generate audio for this chunk
                wav = model.generate(chunk, audio_prompt_path=sample_path)
                ta.save(output_file, wav, model.sr)
                chunk_files.append(output_file)
                summary.append(f"File {output_file}: {speaker} (chunk) - {chunk[:50]}...")
                file_counter += 1
            print(f"Generated {len(chunks)} files for line {line_num} (speaker: {speaker})")
        else:
            # Generate a single file for this line
            output_file = f"{file_counter:03d}-{args.output_base}.wav"
            # Generate audio
            wav = model.generate(text, audio_prompt_path=sample_path)
            ta.save(output_file, wav, model.sr)
            summary.append(f"File {output_file}: {speaker} - {text[:50]}...")
            file_counter += 1
            print(f"Generated file for line {line_num} (speaker: {speaker})")

    # Print summary
    print("\nSummary of generated files:")
    for entry in summary:
        print(entry)


if __name__ == '__main__':
    main()

22 cbx-generate.py Executable file

@@ -0,0 +1,22 @@
import argparse
import torchaudio as ta
from chatterbox.tts import ChatterboxTTS


def main():
    parser = argparse.ArgumentParser(description="Chatterbox TTS audio generation")
    parser.add_argument('--sample', required=True, type=str, help='Prompt/reference audio file (e.g. .wav, .mp3) for the voice')
    parser.add_argument('--output', required=True, type=str, help='Output audio file path (should end with .wav)')
    parser.add_argument('--text', required=True, type=str, help='Text to synthesize')
    args = parser.parse_args()

    # Load model on MPS (for Apple Silicon)
    model = ChatterboxTTS.from_pretrained(device="mps")

    # Generate the audio
    wav = model.generate(args.text, audio_prompt_path=args.sample)

    # Save to output .wav
    ta.save(args.output, wav, model.sr)
    print(f"Generated audio saved to {args.output}")


if __name__ == '__main__':
    main()

13 chatterbox-test.py Normal file

@@ -0,0 +1,13 @@
import torchaudio as ta
from chatterbox.tts import ChatterboxTTS

model = ChatterboxTTS.from_pretrained(device="mps")

text = "Sometimes you have to wonder just what's going on with this crazy fucking world."
#wav = model.generate(text)
#ta.save("test-1.wav", wav, model.sr)

# If you want to synthesize with a different voice, specify the audio prompt
AUDIO_PROMPT_PATH = "sample.mp3"
wav = model.generate(text, audio_prompt_path=AUDIO_PROMPT_PATH)
ta.save("test-2.wav", wav, model.sr)

244 chatterbox_tts.py.bak Normal file

@@ -0,0 +1,244 @@
from dataclasses import dataclass
from pathlib import Path

import librosa
import torch
import perth
import torch.nn.functional as F
from huggingface_hub import hf_hub_download

from .models.t3 import T3
from .models.s3tokenizer import S3_SR, drop_invalid_tokens
from .models.s3gen import S3GEN_SR, S3Gen
from .models.tokenizers import EnTokenizer
from .models.voice_encoder import VoiceEncoder
from .models.t3.modules.cond_enc import T3Cond


REPO_ID = "ResembleAI/chatterbox"


def punc_norm(text: str) -> str:
    """
    Quick cleanup func for punctuation from LLMs or
    containing chars not seen often in the dataset
    """
    if len(text) == 0:
        return "You need to add some text for me to talk."

    # Capitalise first letter
    if text[0].islower():
        text = text[0].upper() + text[1:]

    # Remove multiple space chars
    text = " ".join(text.split())

    # Replace uncommon/llm punc
    punc_to_replace = [
        ("...", ", "),
        ("…", ", "),
        (":", ","),
        (" - ", ", "),
        (";", ", "),
        ("—", "-"),
        ("–", "-"),
        (" ,", ","),
        ("“", "\""),
        ("”", "\""),
        ("‘", "'"),
        ("’", "'"),
    ]
    for old_char_sequence, new_char in punc_to_replace:
        text = text.replace(old_char_sequence, new_char)

    # Add full stop if no ending punc
    text = text.rstrip(" ")
    sentence_enders = {".", "!", "?", "-", ","}
    if not any(text.endswith(p) for p in sentence_enders):
        text += "."

    return text


@dataclass
class Conditionals:
    """
    Conditionals for T3 and S3Gen
    - T3 conditionals:
        - speaker_emb
        - clap_emb
        - cond_prompt_speech_tokens
        - cond_prompt_speech_emb
        - emotion_adv
    - S3Gen conditionals:
        - prompt_token
        - prompt_token_len
        - prompt_feat
        - prompt_feat_len
        - embedding
    """
    t3: T3Cond
    gen: dict

    def to(self, device):
        self.t3 = self.t3.to(device=device)
        for k, v in self.gen.items():
            if torch.is_tensor(v):
                self.gen[k] = v.to(device=device)
        return self

    def save(self, fpath: Path):
        arg_dict = dict(
            t3=self.t3.__dict__,
            gen=self.gen
        )
        torch.save(arg_dict, fpath)

    @classmethod
    def load(cls, fpath, map_location="cpu"):
        kwargs = torch.load(fpath, map_location=map_location, weights_only=True)
        return cls(T3Cond(**kwargs['t3']), kwargs['gen'])


class ChatterboxTTS:
    ENC_COND_LEN = 6 * S3_SR
    DEC_COND_LEN = 10 * S3GEN_SR

    def __init__(
        self,
        t3: T3,
        s3gen: S3Gen,
        ve: VoiceEncoder,
        tokenizer: EnTokenizer,
        device: str,
        conds: Conditionals = None,
    ):
        self.sr = S3GEN_SR  # sample rate of synthesized audio
        self.t3 = t3
        self.s3gen = s3gen
        self.ve = ve
        self.tokenizer = tokenizer
        self.device = device
        self.conds = conds
        self.watermarker = perth.PerthImplicitWatermarker()

    @classmethod
    def from_local(cls, ckpt_dir, device) -> 'ChatterboxTTS':
        ckpt_dir = Path(ckpt_dir)

        ve = VoiceEncoder()
        ve.load_state_dict(
            torch.load(ckpt_dir / "ve.pt")
        )
        ve.to(device).eval()

        t3 = T3()
        t3_state = torch.load(ckpt_dir / "t3_cfg.pt")
        if "model" in t3_state.keys():
            t3_state = t3_state["model"][0]
        t3.load_state_dict(t3_state)
        t3.to(device).eval()

        s3gen = S3Gen()
        s3gen.load_state_dict(
            torch.load(ckpt_dir / "s3gen.pt")
        )
        s3gen.to(device).eval()

        tokenizer = EnTokenizer(
            str(ckpt_dir / "tokenizer.json")
        )

        conds = None
        if (builtin_voice := ckpt_dir / "conds.pt").exists():
            conds = Conditionals.load(builtin_voice).to(device)

        return cls(t3, s3gen, ve, tokenizer, device, conds=conds)

    @classmethod
    def from_pretrained(cls, device) -> 'ChatterboxTTS':
        for fpath in ["ve.pt", "t3_cfg.pt", "s3gen.pt", "tokenizer.json", "conds.pt"]:
            local_path = hf_hub_download(repo_id=REPO_ID, filename=fpath)

        return cls.from_local(Path(local_path).parent, device)

    def prepare_conditionals(self, wav_fpath, exaggeration=0.5):
        ## Load reference wav
        s3gen_ref_wav, _sr = librosa.load(wav_fpath, sr=S3GEN_SR)

        ref_16k_wav = librosa.resample(s3gen_ref_wav, orig_sr=S3GEN_SR, target_sr=S3_SR)

        s3gen_ref_wav = s3gen_ref_wav[:self.DEC_COND_LEN]
        s3gen_ref_dict = self.s3gen.embed_ref(s3gen_ref_wav, S3GEN_SR, device=self.device)

        # Speech cond prompt tokens
        if plen := self.t3.hp.speech_cond_prompt_len:
            s3_tokzr = self.s3gen.tokenizer
            t3_cond_prompt_tokens, _ = s3_tokzr.forward([ref_16k_wav[:self.ENC_COND_LEN]], max_len=plen)
            t3_cond_prompt_tokens = torch.atleast_2d(t3_cond_prompt_tokens).to(self.device)

        # Voice-encoder speaker embedding
        ve_embed = torch.from_numpy(self.ve.embeds_from_wavs([ref_16k_wav], sample_rate=S3_SR))
        ve_embed = ve_embed.mean(axis=0, keepdim=True).to(self.device)

        t3_cond = T3Cond(
            speaker_emb=ve_embed,
            cond_prompt_speech_tokens=t3_cond_prompt_tokens,
            emotion_adv=exaggeration * torch.ones(1, 1, 1),
        ).to(device=self.device)
        self.conds = Conditionals(t3_cond, s3gen_ref_dict)

    def generate(
        self,
        text,
        audio_prompt_path=None,
        exaggeration=0.5,
        cfg_weight=0.5,
        temperature=0.8,
    ):
        if audio_prompt_path:
            self.prepare_conditionals(audio_prompt_path, exaggeration=exaggeration)
        else:
            assert self.conds is not None, "Please `prepare_conditionals` first or specify `audio_prompt_path`"

        # Update exaggeration if needed
        if exaggeration != self.conds.t3.emotion_adv[0, 0, 0]:
            _cond: T3Cond = self.conds.t3
            self.conds.t3 = T3Cond(
                speaker_emb=_cond.speaker_emb,
                cond_prompt_speech_tokens=_cond.cond_prompt_speech_tokens,
                emotion_adv=exaggeration * torch.ones(1, 1, 1),
            ).to(device=self.device)

        # Norm and tokenize text
        text = punc_norm(text)
        text_tokens = self.tokenizer.text_to_tokens(text).to(self.device)
        text_tokens = torch.cat([text_tokens, text_tokens], dim=0)  # Need two seqs for CFG

        sot = self.t3.hp.start_text_token
        eot = self.t3.hp.stop_text_token
        text_tokens = F.pad(text_tokens, (1, 0), value=sot)
        text_tokens = F.pad(text_tokens, (0, 1), value=eot)

        with torch.inference_mode():
            speech_tokens = self.t3.inference(
                t3_cond=self.conds.t3,
                text_tokens=text_tokens,
                max_new_tokens=1000,  # TODO: use the value in config
                temperature=temperature,
                cfg_weight=cfg_weight,
            )
            # Extract only the conditional batch.
            speech_tokens = speech_tokens[0]

            # TODO: output becomes 1D
            speech_tokens = drop_invalid_tokens(speech_tokens)
            speech_tokens = speech_tokens.to(self.device)

            wav, _ = self.s3gen.inference(
                speech_tokens=speech_tokens,
                ref_dict=self.conds.gen,
            )
            wav = wav.squeeze(0).detach().cpu().numpy()
            watermarked_wav = self.watermarker.apply_watermark(wav, sample_rate=self.sr)
        return torch.from_numpy(watermarked_wav).unsqueeze(0)

113 gradio_app.py Normal file

@@ -0,0 +1,113 @@
import gradio as gr
import yaml
import os
import time
from chatterbox.tts import ChatterboxTTS
import torchaudio as ta

# Load speaker options from YAML with error handling
try:
    yaml_path = os.path.abspath("speakers.yaml")
    if not os.path.exists(yaml_path):
        raise FileNotFoundError(f"speakers.yaml not found at {yaml_path}")
    with open(yaml_path) as f:
        speakers = yaml.safe_load(f)
    if not speakers or not isinstance(speakers, dict):
        raise ValueError("speakers.yaml must contain a valid dictionary mapping")
except Exception as e:
    raise SystemExit(f"Failed to load speakers.yaml: {str(e)}")


def generate_audio(speaker_choice, custom_sample, text, exaggeration, cfg_weight, temperature, max_new_tokens):
    # Get sample path from selection or upload
    sample_path = speakers[speaker_choice] if speaker_choice != "Custom" else custom_sample
    # Guard against a missing upload (custom_sample is None when nothing was uploaded)
    if not sample_path or not os.path.exists(sample_path):
        raise gr.Error("Sample file not found!")

    # Load model (cached automatically by Gradio)
    tts = ChatterboxTTS.from_pretrained(device="mps")

    # Generate audio with advanced controls
    gen_kwargs = dict(
        text=text,
        audio_prompt_path=sample_path,
        exaggeration=exaggeration,
        cfg_weight=cfg_weight,
        temperature=temperature
    )
    # max_new_tokens is not supported by the current TTS library, so we ignore it here
    wav = tts.generate(**gen_kwargs)

    # Save with timestamp
    output_path = f"output_{int(time.time())}.wav"
    ta.save(output_path, wav, tts.sr)
    return output_path, output_path


with gr.Blocks() as demo:
    gr.Markdown("# Chatterbox TTS Generator")
    with gr.Row():
        with gr.Column():
            speaker_dropdown = gr.Dropdown(
                choices=["Custom"] + list(speakers.keys()),
                value="Custom",
                label="Select Speaker"
            )
            custom_upload = gr.Audio(
                label="Or upload custom speaker sample",
                type="filepath",
                visible=True
            )
            text_input = gr.Textbox(
                label="Text to synthesize",
                placeholder="Enter text here...",
                lines=3
            )
            exaggeration_slider = gr.Slider(
                minimum=0.0, maximum=2.0, value=0.5, step=0.01,
                label="Exaggeration (emotion)",
                info="Controls expressiveness. 0.5 = neutral, higher = more expressive."
            )
            cfg_weight_slider = gr.Slider(
                minimum=0.0, maximum=2.0, value=0.5, step=0.01,
                label="CFG Weight",
                info="Higher = more faithful to text, lower = more like reference voice."
            )
            temperature_slider = gr.Slider(
                minimum=0.1, maximum=2.0, value=0.8, step=0.01,
                label="Temperature",
                info="Controls randomness. Higher = more variation."
            )
            max_new_tokens_box = gr.Number(
                value=1000,
                label="Max New Tokens (advanced)",
                precision=0,
                info="Maximum audio tokens to generate. Increase for longer texts."
            )
            generate_btn = gr.Button("Generate Speech")
        with gr.Column():
            audio_output = gr.Audio(label="Generated Speech")
            download = gr.File(label="Download WAV")

    gr.Examples(
        examples=[
            ["Hello world! This is a demo.", "Tara"],
            ["Welcome to the future of text-to-speech.", "Zac"]
        ],
        inputs=[text_input, speaker_dropdown]
    )

    generate_btn.click(
        fn=generate_audio,
        inputs=[speaker_dropdown, custom_upload, text_input, exaggeration_slider, cfg_weight_slider, temperature_slider, max_new_tokens_box],
        outputs=[audio_output, download]
    )

if __name__ == "__main__":
    demo.launch(share=True)

5 sample-dialog.md Normal file

@@ -0,0 +1,5 @@
Leah: "What do you think is wrong with me?"
Zac: "I think you're being overly emotional."
Tara: "Jesus, Zac, can you be any more of an asshole?"
Leah: "This is a longer line that will demonstrate how the script handles text that exceeds the 300 character limit. It will be split at sentence boundaries to ensure that the generated audio files are of a reasonable length. This sentence adds more characters. And this one adds even more to push us over the 300 character limit. The script should create multiple audio files for this single dialog line, while keeping the sentence structure intact."
Zac: "I didn't mean to upset anyone. I was just trying to be honest."

7 speakers.yaml Normal file

@@ -0,0 +1,7 @@
Tara: Tara.mp3
Zac: Zac.mp3
Leah: Leah.mp3
Leo: Leo.mp3
Adam: Adam.mp3
Alice: Alice.mp3
Lewis: Lewis.mp3

BIN test1-wav Normal file

Binary file not shown.