chatterbox-ui/cbx-dialog-generate.py

147 lines
5.3 KiB
Python

#!/usr/bin/env python3
import argparse
import re
import os
import yaml
import torchaudio as ta
from chatterbox.tts import ChatterboxTTS
# Import helper to fix Python path
import import_helper
def split_text_at_sentence_boundaries(text, max_length=300):
    """
    Split text at sentence boundaries, ensuring each chunk is <= max_length.

    Sentences are detected with a simple regex: '.', '!' or '?' followed by
    whitespace or end-of-string. Sentences are packed greedily into chunks.
    A single sentence longer than max_length is additionally split on
    whitespace so the <= max_length guarantee actually holds (previously such
    a sentence was emitted as one oversized chunk). A single word longer than
    max_length is still emitted as-is.

    Args:
        text: The input text to split.
        max_length: Maximum length of each returned chunk.

    Returns:
        A list of non-empty, whitespace-stripped text chunks.
    """
    sentence_pattern = r'[.!?](?:\s|$)'
    # re.split with a capturing group keeps each delimiter as its own item,
    # so parts alternate: [sentence, delimiter, sentence, delimiter, ...].
    parts = re.split(f'({sentence_pattern})', text)
    sentences = []
    for i in range(0, len(parts), 2):
        delimiter = parts[i + 1] if i + 1 < len(parts) else ''
        sentence = parts[i] + delimiter
        if sentence:
            sentences.append(sentence)

    chunks = []
    current_chunk = ""
    for sentence in sentences:
        if len(sentence) > max_length:
            # Oversized sentence: flush the pending chunk, then pack the
            # sentence word-by-word so no emitted chunk exceeds max_length.
            if current_chunk:
                chunks.append(current_chunk.strip())
                current_chunk = ""
            for word in sentence.split():
                extra = len(word) + (1 if current_chunk else 0)
                if current_chunk and len(current_chunk) + extra > max_length:
                    chunks.append(current_chunk.strip())
                    current_chunk = word
                else:
                    current_chunk = f"{current_chunk} {word}" if current_chunk else word
            continue
        # Greedy packing: start a new chunk only when adding this sentence
        # would overflow a non-empty chunk.
        if current_chunk and len(current_chunk) + len(sentence) > max_length:
            chunks.append(current_chunk.strip())
            current_chunk = sentence
        else:
            current_chunk += sentence
    # Flush the final chunk if it has content.
    if current_chunk:
        chunks.append(current_chunk.strip())
    return chunks
def parse_dialog_line(line):
    """
    Parse a dialog line of the form: Name: "Text"

    Args:
        line: A raw line from the dialog file (leading/trailing whitespace
            is ignored).

    Returns:
        A ``(speaker, text)`` tuple with surrounding whitespace stripped,
        or ``None`` when the line does not match the expected pattern.
    """
    match = re.match(r'^([^:]+):\s*"([^"]+)"$', line.strip())
    if match is None:
        return None
    speaker, text = match.groups()
    return speaker.strip(), text.strip()
def main():
    """
    CLI entry point: generate per-line dialog audio with Chatterbox TTS.

    Reads a YAML config mapping speaker names to reference audio samples and
    a dialog file whose lines look like: Name: "Text". Each matching line is
    synthesized with the speaker's voice sample and saved to sequentially
    numbered WAV files (e.g. "001-<output-base>.wav"). Lines longer than 300
    characters are split at sentence boundaries into multiple files.
    """
    parser = argparse.ArgumentParser(description="Generate dialog audio from markdown file using Chatterbox TTS")
    parser.add_argument('--config', required=True, type=str, help='YAML config file mapping speaker names to audio samples')
    parser.add_argument('--dialog', required=True, type=str, help='Markdown dialog file')
    parser.add_argument('--output-base', required=True, type=str, help='Base name for output files (e.g., "output" for "001-output.wav")')
    parser.add_argument('--reinit-each-line', action='store_true', help='Re-initialize the model after each line to reduce memory usage')
    args = parser.parse_args()

    # Speaker name -> path of the reference audio sample used for the voice.
    with open(args.config, 'r', encoding='utf-8') as f:
        speaker_samples = yaml.safe_load(f)

    with open(args.dialog, 'r', encoding='utf-8') as f:
        dialog_lines = f.readlines()

    # Load the model once up front unless the caller asked for per-line
    # re-initialization (a memory-pressure workaround).
    # NOTE(review): device is hard-coded to "mps" (Apple Silicon) — confirm
    # this is intentional for the target deployment.
    model = None
    if not args.reinit_each_line:
        print("Loading ChatterboxTTS model once for all lines...")
        model = ChatterboxTTS.from_pretrained(device="mps")

    file_counter = 1  # global counter so output files sort in dialog order
    summary = []
    for line_num, line in enumerate(dialog_lines, 1):
        parsed = parse_dialog_line(line)
        if not parsed:
            print(f"Skipping line {line_num}: Not in the expected format")
            continue
        speaker, text = parsed

        # Skip speakers with no configured voice sample.
        if speaker not in speaker_samples:
            print(f"Warning: Speaker '{speaker}' not found in config, skipping line {line_num}")
            continue
        sample_path = speaker_samples[speaker]

        if args.reinit_each_line:
            print(f"Reinitializing model for line {line_num}...")
            model = ChatterboxTTS.from_pretrained(device="mps")
        elif model is None:
            # Defensive fallback: never call generate() on a missing model.
            model = ChatterboxTTS.from_pretrained(device="mps")

        # Long lines are split at sentence boundaries; short lines are a
        # single chunk. Both cases share the synthesis loop below.
        if len(text) > 300:
            chunks = split_text_at_sentence_boundaries(text)
        else:
            chunks = [text]

        for chunk in chunks:
            output_file = f"{file_counter:03d}-{args.output_base}.wav"
            wav = model.generate(chunk, audio_prompt_path=sample_path)
            ta.save(output_file, wav, model.sr)
            label = " (chunk)" if len(chunks) > 1 else ""
            summary.append(f"File {output_file}: {speaker}{label} - {chunk[:50]}...")
            file_counter += 1

        if len(chunks) > 1:
            print(f"Generated {len(chunks)} files for line {line_num} (speaker: {speaker})")
        else:
            print(f"Generated file for line {line_num} (speaker: {speaker})")

    # Print summary of everything that was written.
    print("\nSummary of generated files:")
    for entry in summary:
        print(entry)
# Run the CLI entry point only when executed as a script (not on import).
if __name__ == '__main__':
    main()