# chatterbox-ui/cbx-audiobook.py

#!/usr/bin/env python
"""
Chatterbox Audiobook Generator

This script converts a text file into an audiobook using the Chatterbox TTS system.
It parses the text file into manageable chunks, generates audio for each chunk,
and assembles them into a complete audiobook.
"""

import argparse
import asyncio
import gc
import os
import re
import subprocess
import sys
import torch
from pathlib import Path
import uuid

# Import helper to fix Python path
import import_helper

# Import backend services
from backend.app.services.tts_service import TTSService
from backend.app.services.speaker_service import SpeakerManagementService
from backend.app.services.audio_manipulation_service import AudioManipulationService
from backend.app.config import DIALOG_GENERATED_DIR, TTS_TEMP_OUTPUT_DIR

class AudiobookGenerator:
    """
    Convert a plain-text file into an audiobook using the Chatterbox TTS backend.

    The text is split into paragraphs (blank-line separated) and then into
    sentence-based chunks. Each chunk is synthesised to its own clip in
    ``TTS_TEMP_OUTPUT_DIR/<output_base_name>``; the clips are then concatenated
    with configurable pauses and zipped into
    ``DIALOG_GENERATED_DIR/<output_base_name>``.
    """

    # Blank-line separated blocks are treated as paragraphs.
    _PARAGRAPH_BREAK = re.compile(r'\n\s*\n')

    # A sentence ends at terminal punctuation that is followed by whitespace,
    # by a closing quote/bracket, or by the end of the text.
    _SENTENCE_BREAK = re.compile(
        r'(?<=[.!?\u2026])\s+|(?<=[.!?\u2026])(?=[\"\')\]\}\u201d\u2019])|(?<=[.!?\u2026])$'
    )

    def __init__(self, speaker_id, output_base_name, device="mps",
                 exaggeration=0.5, cfg_weight=0.5, temperature=0.8,
                 pause_between_sentences=0.5, pause_between_paragraphs=1.0,
                 keep_model_loaded=False, cleanup_interval=10, use_subprocess=False,
                 max_chunk_length=300, raise_on_oom=False):
        """
        Initialize the audiobook generator.

        Args:
            speaker_id: ID of the speaker to use
            output_base_name: Base name for output files
            device: Device to use for TTS (mps, cuda, cpu)
            exaggeration: Controls expressiveness (0.0-1.0)
            cfg_weight: Controls alignment with speaker characteristics (0.0-1.0)
            temperature: Controls randomness in generation (0.0-1.0)
            pause_between_sentences: Pause duration between sentences in seconds
            pause_between_paragraphs: Pause duration between paragraphs in seconds
            keep_model_loaded: If True, keeps model loaded across chunks (more efficient but uses more memory)
            cleanup_interval: How often to perform deep cleanup when keep_model_loaded=True;
                values below 1 disable the periodic cleanup
            use_subprocess: If True, uses separate processes for each chunk (slower but guarantees memory release)
            max_chunk_length: Maximum number of characters per TTS chunk
            raise_on_oom: If True, an out-of-memory error while generating a chunk aborts
                generate_audiobook() instead of skipping the chunk, so the caller can
                retry on another device

        Raises:
            ValueError: If the speaker is unknown or has no sample path.
        """
        self.speaker_id = speaker_id
        self.output_base_name = output_base_name
        self.device = device
        self.exaggeration = exaggeration
        self.cfg_weight = cfg_weight
        self.temperature = temperature
        self.pause_between_sentences = pause_between_sentences
        self.pause_between_paragraphs = pause_between_paragraphs
        self.keep_model_loaded = keep_model_loaded
        self.cleanup_interval = cleanup_interval
        self.use_subprocess = use_subprocess
        self.max_chunk_length = max_chunk_length
        self.raise_on_oom = raise_on_oom
        self.chunk_counter = 0

        # Validate the speaker first so that a bad ID fails before any
        # output directories are created.
        self.speaker_service = SpeakerManagementService()
        self._validate_speaker()

        # Initialize the remaining services
        self.tts_service = TTSService(device=device)
        self.audio_manipulator = AudioManipulationService()

        # Create output directories
        self.output_dir = DIALOG_GENERATED_DIR / output_base_name
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self.temp_dir = TTS_TEMP_OUTPUT_DIR / output_base_name
        self.temp_dir.mkdir(parents=True, exist_ok=True)

    @staticmethod
    def is_out_of_memory_error(error):
        """Return True if the exception's message reports an out-of-memory condition."""
        return "out of memory" in str(error).lower()

    @staticmethod
    def _best_effort(description, func):
        """Call ``func()``; report a failure instead of raising, since cleanup is optional."""
        try:
            func()
        except Exception as exc:
            print(f"  (skipped {description}: {exc})")

    def _validate_speaker(self):
        """
        Validate that the specified speaker exists and has a voice sample.

        Stores the looked-up record on ``self.speaker_info`` for later use.

        Raises:
            ValueError: If the speaker is unknown or has no sample path.
        """
        speaker_info = self.speaker_service.get_speaker_by_id(self.speaker_id)
        if not speaker_info:
            raise ValueError(f"Speaker ID '{self.speaker_id}' not found.")
        if not speaker_info.sample_path:
            raise ValueError(f"Speaker ID '{self.speaker_id}' has no sample path defined.")

        self.speaker_info = speaker_info

    def _cleanup_memory(self):
        """Force garbage collection and release cached accelerator memory for ``self.device``."""
        print("Performing memory cleanup...")

        # Force garbage collection multiple times for thorough cleanup
        for _ in range(3):
            gc.collect()

        if self.device == "cuda" and torch.cuda.is_available():
            torch.cuda.empty_cache()
            torch.cuda.synchronize()
            self._best_effort("CUDA peak-memory stats reset", torch.cuda.reset_peak_memory_stats)
            # Private torch hook; only attempted when this torch build exposes it.
            clear_workspaces = getattr(getattr(torch, "_C", None), "_cuda_clearCublasWorkspaces", None)
            if clear_workspaces is not None:
                self._best_effort("cuBLAS workspace cleanup", clear_workspaces)
        elif self.device == "mps" and torch.backends.mps.is_available():
            # Older torch releases lack some of these helpers, hence the lookups.
            mps = getattr(torch, "mps", None)
            empty_cache = getattr(mps, "empty_cache", None)
            synchronize = getattr(mps, "synchronize", None)
            if empty_cache is not None:
                empty_cache()
            if synchronize is not None:
                synchronize()
            if empty_cache is not None:
                # Flush once more now that pending work has been synchronised.
                self._best_effort("second MPS cache flush", empty_cache)

        print("Memory cleanup completed.")

    async def _generate_chunk_subprocess(self, chunk, segment_filename_base, speaker_sample_path):
        """
        Generate a single chunk using cbx-generate.py in a subprocess.
        This guarantees memory is released when the process exits.

        The call blocks until the child exits; chunks are generated one at a
        time, so nothing else is waiting on the event loop.

        Args:
            chunk: Text to synthesise
            segment_filename_base: Output file name without extension
            speaker_sample_path: Path to the speaker's reference sample

        Returns:
            Path to the generated WAV file inside ``self.temp_dir``

        Raises:
            RuntimeError: If the subprocess cannot be started, times out, exits
                with a non-zero status, or does not produce the output file.
        """
        output_file = self.temp_dir / f"{segment_filename_base}.wav"

        # NOTE(review): exaggeration / cfg_weight / temperature are not forwarded
        # here; confirm whether cbx-generate.py accepts them before adding flags.
        cmd = [
            sys.executable, "cbx-generate.py",
            "--sample", str(speaker_sample_path),
            "--output", str(output_file),
            "--text", chunk,
            "--device", self.device
        ]

        print(f"Running subprocess: {' '.join(cmd[:4])} ... (text truncated)")

        # Only the launch itself is wrapped, so the result checks below are not
        # re-wrapped into a second RuntimeError.
        try:
            result = subprocess.run(
                cmd,
                capture_output=True,
                text=True,
                timeout=300,  # 5 minute timeout per chunk
                cwd=Path(__file__).parent  # cbx-generate.py is resolved relative to this script
            )
        except subprocess.TimeoutExpired as exc:
            raise RuntimeError("Subprocess timed out after 5 minutes") from exc
        except (OSError, subprocess.SubprocessError) as exc:
            raise RuntimeError(f"Subprocess error: {exc}") from exc

        if result.returncode != 0:
            raise RuntimeError(f"Subprocess failed: {result.stderr}")

        if not output_file.exists():
            raise RuntimeError(f"Output file not created: {output_file}")

        print(f"Subprocess completed successfully: {output_file}")
        return output_file

    @staticmethod
    def _split_long_text(text, max_length):
        """
        Break text that exceeds ``max_length`` into pieces of at most that length.

        Pieces end on whitespace where possible so words are not cut in half;
        only a single word longer than ``max_length`` is sliced by character.
        """
        pieces = []
        current = ""
        for word in text.split():
            while len(word) > max_length:
                if current:
                    pieces.append(current)
                    current = ""
                pieces.append(word[:max_length])
                word = word[max_length:]
            if not current:
                current = word
            elif len(current) + 1 + len(word) <= max_length:
                current += " " + word
            else:
                pieces.append(current)
                current = word
        if current:
            pieces.append(current)
        return pieces

    def split_text_into_chunks(self, text, max_length=300):
        """
        Split text into chunks suitable for TTS processing.

        Paragraphs are separated by blank lines. Within a paragraph, consecutive
        sentences are packed into one chunk while the result stays within
        ``max_length`` characters; a sentence longer than ``max_length`` is
        broken up on word boundaries.

        Args:
            text: The full text to split
            max_length: Maximum number of characters per chunk (must be >= 1)

        Returns:
            A list of ``{"type": "paragraph", "chunks": [str, ...]}`` dicts, one
            per non-empty paragraph, in document order.

        Raises:
            ValueError: If ``max_length`` is smaller than 1.
        """
        if max_length < 1:
            raise ValueError(f"max_length must be a positive integer, got {max_length}")

        all_chunks = []

        for raw_paragraph in self._PARAGRAPH_BREAK.split(text):
            paragraph = raw_paragraph.strip()
            if not paragraph:
                continue

            sentences = [
                s.strip() for s in self._SENTENCE_BREAK.split(paragraph)
                if s and s.strip()
            ]

            paragraph_chunks = []
            current_chunk = ""

            for sentence in sentences:
                if len(sentence) > max_length:
                    # An over-long sentence never shares a chunk with its neighbours.
                    if current_chunk:
                        paragraph_chunks.append(current_chunk)
                        current_chunk = ""
                    paragraph_chunks.extend(self._split_long_text(sentence, max_length))
                elif not current_chunk:
                    current_chunk = sentence
                elif len(current_chunk) + len(sentence) + 1 <= max_length:
                    current_chunk += " " + sentence
                else:
                    paragraph_chunks.append(current_chunk)
                    current_chunk = sentence

            if current_chunk:
                paragraph_chunks.append(current_chunk)

            if paragraph_chunks:
                all_chunks.append({"type": "paragraph", "chunks": paragraph_chunks})

        return all_chunks

    def _resolve_speaker_sample_path(self):
        """Return the speaker sample path, resolving relative paths against SPEAKER_DATA_BASE_DIR."""
        sample_path = Path(self.speaker_info.sample_path)
        if not sample_path.is_absolute():
            from backend.app.config import SPEAKER_DATA_BASE_DIR
            sample_path = SPEAKER_DATA_BASE_DIR / sample_path
        return sample_path

    async def _synthesize_chunk(self, chunk, segment_filename_base, speaker_sample_path):
        """Generate audio for one chunk, in a subprocess or in-process, and return the clip path."""
        if self.use_subprocess:
            # Use subprocess for guaranteed memory release
            return await self._generate_chunk_subprocess(
                chunk=chunk,
                segment_filename_base=segment_filename_base,
                speaker_sample_path=speaker_sample_path
            )

        # Load model for this chunk (if not keeping loaded)
        if not self.keep_model_loaded:
            print("Loading TTS model...")
            self.tts_service.load_model()

        return await self.tts_service.generate_speech(
            text=chunk,
            speaker_id=self.speaker_id,
            speaker_sample_path=str(speaker_sample_path),
            output_filename_base=segment_filename_base,
            output_dir=self.temp_dir,
            exaggeration=self.exaggeration,
            cfg_weight=self.cfg_weight,
            temperature=self.temperature
        )

    def _release_after_chunk(self):
        """Apply the memory-management strategy that matches the model lifecycle."""
        if self.use_subprocess:
            # No memory management needed - subprocess handles it
            return

        if self.keep_model_loaded:
            # Guard against a zero/negative interval, which would otherwise
            # raise ZeroDivisionError after the audio was already generated.
            if self.cleanup_interval > 0 and self.chunk_counter % self.cleanup_interval == 0:
                print(f"Performing periodic deep cleanup (chunk {self.chunk_counter})")
                self._cleanup_memory()
            return

        # Model is loaded per chunk: clean up, unload, then clean up again.
        self._cleanup_memory()
        print("Unloading TTS model...")
        self.tts_service.unload_model()
        self._cleanup_memory()

    def _recover_after_failure(self):
        """Unload a per-chunk model (if any) and free memory after a failed chunk."""
        if self.use_subprocess:
            return
        if not self.keep_model_loaded and self.tts_service.model is not None:
            print("Unloading TTS model after error...")
            self.tts_service.unload_model()
        self._cleanup_memory()

    async def generate_audiobook(self, text_file_path):
        """
        Generate an audiobook from a text file.

        Chunks that fail to generate are reported and skipped, unless
        ``raise_on_oom`` is set and the failure is an out-of-memory error, in
        which case the error is re-raised after cleanup.

        Args:
            text_file_path: Path to the text file to convert

        Returns:
            Path to the generated audiobook file

        Raises:
            FileNotFoundError: If the text file does not exist.
            ValueError: If ``max_chunk_length`` is smaller than 1.
            RuntimeError: If no audio segment could be generated at all.
        """
        # Read the text file
        text_path = Path(text_file_path)
        if not text_path.exists():
            raise FileNotFoundError(f"Text file not found: {text_file_path}")

        text = text_path.read_text(encoding='utf-8')

        print(f"Processing text file: {text_file_path}")
        print(f"Text length: {len(text)} characters")

        # Split text into chunks
        paragraphs = self.split_text_into_chunks(text, max_length=self.max_chunk_length)
        total_chunks = sum(len(p["chunks"]) for p in paragraphs)
        print(f"Split into {len(paragraphs)} paragraphs with {total_chunks} total chunks")

        # The sample path does not change between chunks, so resolve it once.
        speaker_sample_path = self._resolve_speaker_sample_path()

        segment_results = []
        chunk_count = 0
        failed_chunks = 0
        # Pause to insert before the next successful clip. None until the first
        # clip exists, so failed chunks never leave leading, trailing or doubled
        # silences behind.
        pending_pause = None

        # Pre-load model if keeping it loaded (pointless in subprocess mode,
        # where the parent process never runs the model).
        if self.keep_model_loaded and not self.use_subprocess:
            print("Pre-loading TTS model for batch processing...")
            self.tts_service.load_model()

        try:
            for para_idx, paragraph in enumerate(paragraphs):
                print(f"Processing paragraph {para_idx+1}/{len(paragraphs)}")

                for chunk_idx, chunk in enumerate(paragraph["chunks"]):
                    chunk_count += 1
                    self.chunk_counter += 1
                    print(f"  Generating audio for chunk {chunk_count}/{total_chunks}: {chunk[:50]}...")

                    # Generate unique filename for this chunk
                    segment_filename_base = f"{self.output_base_name}_p{para_idx}_c{chunk_idx}_{uuid.uuid4().hex[:8]}"

                    try:
                        segment_output_path = await self._synthesize_chunk(
                            chunk, segment_filename_base, speaker_sample_path
                        )
                        self._release_after_chunk()
                    except Exception as e:
                        failed_chunks += 1
                        print(f"Error generating speech for chunk: {e}")
                        self._recover_after_failure()
                        if self.raise_on_oom and self.is_out_of_memory_error(e):
                            # Let the caller decide (e.g. retry on CPU).
                            raise
                        # Continue with next chunk
                        continue

                    if pending_pause is not None:
                        segment_results.append({
                            "type": "silence",
                            "duration": pending_pause
                        })
                    segment_results.append({
                        "type": "speech",
                        "path": str(segment_output_path)
                    })
                    pending_pause = self.pause_between_sentences

                # Use the longer pause before the first clip of the next paragraph
                if pending_pause is not None:
                    pending_pause = self.pause_between_paragraphs

        finally:
            # Always unload model at the end if it was kept loaded
            if self.keep_model_loaded and self.tts_service.model is not None:
                print("Final cleanup: Unloading TTS model...")
                self.tts_service.unload_model()
                self._cleanup_memory()

        if failed_chunks:
            print(f"Warning: {failed_chunks} of {total_chunks} chunks failed and were skipped.")

        if not segment_results:
            raise RuntimeError("No audio segments were generated; nothing to assemble.")

        # Concatenate all segments
        print("Concatenating audio segments...")
        concatenated_filename = f"{self.output_base_name}_audiobook.wav"
        concatenated_path = self.output_dir / concatenated_filename

        self.audio_manipulator.concatenate_audio_segments(
            segment_results=segment_results,
            output_concatenated_path=concatenated_path
        )

        # Create ZIP archive with all files
        print("Creating ZIP archive...")
        zip_filename = f"{self.output_base_name}_audiobook.zip"
        zip_path = self.output_dir / zip_filename

        # Collect all speech segment files
        speech_segment_paths = [
            Path(s["path"]) for s in segment_results
            if s["type"] == "speech" and Path(s["path"]).exists()
        ]

        self.audio_manipulator.create_zip_archive(
            segment_file_paths=speech_segment_paths,
            concatenated_audio_path=concatenated_path,
            output_zip_path=zip_path
        )

        print("Audiobook generation complete!")
        print(f"Audiobook file: {concatenated_path}")
        print(f"ZIP archive: {zip_path}")

        # Ensure model is unloaded at the end (just in case)
        if self.tts_service.model is not None:
            print("Final check: Unloading TTS model...")
            self.tts_service.unload_model()

        return concatenated_path

async def main():
    """
    Command-line entry point: list speakers or generate an audiobook.

    Returns:
        0 on success, 1 if audiobook generation failed.
    """
    parser = argparse.ArgumentParser(description="Generate an audiobook from a text file using Chatterbox TTS")

    # Create a mutually exclusive group for the main operation vs listing speakers
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument("--list-speakers", action="store_true", help="List available speakers and exit")
    group.add_argument("text_file", nargs="?", help="Path to the text file to convert")

    # Other arguments
    parser.add_argument("--speaker", "-s", help="ID of the speaker to use")
    parser.add_argument("--output", "-o", help="Base name for output files (default: derived from text filename)")
    parser.add_argument("--device", default="mps", choices=["mps", "cuda", "cpu"], help="Device to use for TTS (default: mps)")
    parser.add_argument("--exaggeration", type=float, default=0.5, help="Controls expressiveness (0.0-1.0, default: 0.5)")
    parser.add_argument("--cfg-weight", type=float, default=0.5, help="Controls alignment with speaker (0.0-1.0, default: 0.5)")
    parser.add_argument("--temperature", type=float, default=0.8, help="Controls randomness (0.0-1.0, default: 0.8)")
    parser.add_argument("--sentence-pause", type=float, default=0.5, help="Pause between sentences in seconds (default: 0.5)")
    parser.add_argument("--paragraph-pause", type=float, default=1.0, help="Pause between paragraphs in seconds (default: 1.0)")
    parser.add_argument("--keep-model-loaded", action="store_true", help="Keep model loaded between chunks (faster but uses more memory)")
    parser.add_argument("--cleanup-interval", type=int, default=10, help="How often to perform deep cleanup when keeping model loaded (default: 10)")
    parser.add_argument("--force-cpu-on-oom", action="store_true", help="Automatically switch to CPU if MPS/CUDA runs out of memory")
    parser.add_argument("--max-chunk-length", type=int, default=300, help="Maximum chunk length for text splitting (default: 300)")
    parser.add_argument("--use-subprocess", action="store_true", help="Use separate processes for each chunk (guarantees memory release but slower)")

    args = parser.parse_args()

    # List speakers if requested
    if args.list_speakers:
        speaker_service = SpeakerManagementService()
        print("Available speakers:")
        for speaker in speaker_service.get_speakers():
            print(f"  {speaker.id}: {speaker.name}")
        return 0

    # Validate required arguments for audiobook generation
    if not args.text_file:
        parser.error("text_file is required when not using --list-speakers")

    if not args.speaker:
        parser.error("--speaker/-s is required when not using --list-speakers")

    # Determine output base name if not provided
    output_base_name = args.output or Path(args.text_file).stem

    def build_generator(device, raise_on_oom):
        """Create a generator from the CLI options for the given device."""
        return AudiobookGenerator(
            speaker_id=args.speaker,
            output_base_name=output_base_name,
            device=device,
            exaggeration=args.exaggeration,
            cfg_weight=args.cfg_weight,
            temperature=args.temperature,
            pause_between_sentences=args.sentence_pause,
            pause_between_paragraphs=args.paragraph_pause,
            keep_model_loaded=args.keep_model_loaded,
            cleanup_interval=args.cleanup_interval,
            use_subprocess=args.use_subprocess,
            max_chunk_length=args.max_chunk_length,
            raise_on_oom=raise_on_oom
        )

    # A CPU retry is only possible when requested and not already on CPU.
    cpu_fallback_enabled = args.force_cpu_on_oom and args.device != "cpu"

    try:
        # Out-of-memory errors must propagate for the fallback to be reachable.
        generator = build_generator(args.device, raise_on_oom=cpu_fallback_enabled)

        # Generate audiobook with automatic fallback
        try:
            await generator.generate_audiobook(args.text_file)
        except RuntimeError as e:
            # torch's out-of-memory error is a RuntimeError subclass, so it
            # needs no separate mention here.
            if not (cpu_fallback_enabled and AudiobookGenerator.is_out_of_memory_error(e)):
                raise

            print(f"\n⚠️  {args.device.upper()} out of memory: {e}")
            print("🔄 Automatically switching to CPU and retrying...")

            # Create new generator with CPU; there is no further fallback, so
            # per-chunk failures are skipped as usual.
            generator = build_generator("cpu", raise_on_oom=False)
            await generator.generate_audiobook(args.text_file)
            print("✅ Successfully completed using CPU fallback!")

    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        return 1

    return 0

if __name__ == "__main__":
    # Run the async CLI entry point and hand its return value to the shell
    # as the process exit status.
    exit_status = asyncio.run(main())
    sys.exit(exit_status)