# chatterbox-ui/backend/app/models/tts_models.py

"""
TTS Data Models and Request/Response structures for multi-backend support
"""
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from typing import Dict, Any, Optional
from pathlib import Path

@dataclass
class TTSParameters:
    """Generation settings: one common knob plus a free-form per-backend bag.

    Attributes:
        temperature: Common parameter, 0.8 unless overridden.
        backend_params: Arbitrary string-keyed extras for a specific backend;
            empty unless supplied.
    """
    # Shared across backends.
    temperature: float = 0.8
    # default_factory gives every instance its own dict, so mutating one
    # instance's extras never leaks into another.
    backend_params: Dict[str, Any] = field(default_factory=dict)

@dataclass
class SpeakerConfig:
    """Speaker definition shared by the supported TTS backends.

    Attributes:
        id: Speaker identifier.
        name: Speaker name; quoted in validation error messages.
        sample_path: Location of the speaker's sample, stored exactly as given.
        reference_text: Text tied to the sample (presumably its transcript --
            confirm with the backend). Mandatory when ``tts_backend`` is
            ``"higgs"``.
        tts_backend: Name of the backend this speaker targets; defaults to
            ``"chatterbox"``.
    """
    id: str
    name: str
    sample_path: str
    reference_text: Optional[str] = None
    tts_backend: str = "chatterbox"

    def validate(self) -> None:
        """Check the backend-specific requirements of this speaker.

        Only the Higgs backend has a requirement here: it needs a
        ``reference_text`` that is neither missing, empty, nor made up solely
        of whitespace.

        ``sample_path`` is intentionally not checked: a relative path may be
        relative to the speaker data directory, which this class does not
        know, so existence has to be verified by whoever resolves the path.

        Raises:
            ValueError: If ``tts_backend`` is ``"higgs"`` and
                ``reference_text`` is missing or blank.
        """
        if self.tts_backend != "higgs":
            return

        reference = self.reference_text
        # A whitespace-only string is truthy but carries no usable text, so it
        # is rejected the same way as None / "".
        is_blank = isinstance(reference, str) and not reference.strip()
        if not reference or is_blank:
            raise ValueError(f"reference_text required for Higgs backend speaker: {self.name}")

@dataclass
class OutputConfig:
    """Where and how a generated result should be written.

    Attributes:
        filename_base: Base name for the output file (required).
        output_dir: Target directory, or ``None`` when not specified.
        format: Output format name; ``"wav"`` unless overridden.
    """
    # The only mandatory field; it has no default.
    filename_base: str
    # None means no directory was given; how that is resolved is up to the
    # consumer of this config.
    output_dir: Optional[Path] = None
    format: str = "wav"

@dataclass
class TTSRequest:
    """A single backend-agnostic synthesis request.

    Bundles the input text with the three configuration objects; all four
    fields are required.

    Attributes:
        text: The text of the request.
        speaker_config: Speaker to use.
        parameters: Generation parameters.
        output_config: Output destination and format settings.
    """
    text: str
    # The three configuration objects below are all mandatory (no defaults).
    speaker_config: SpeakerConfig
    parameters: TTSParameters
    output_config: OutputConfig

@dataclass
class TTSResponse:
    """Backend-agnostic result of a TTS generation.

    Only ``output_path`` is required; every other field is optional metadata
    that stays at its default when not supplied.

    Attributes:
        output_path: Path of the produced output.
        generated_text: Text reported with the result, if any.
        audio_duration: Duration of the audio, if known (presumably seconds --
            confirm with the producing backend).
        sampling_rate: Sampling rate of the audio, if known.
        backend_used: Name of the backend that produced the result; empty
            string when not recorded.
    """
    output_path: Path
    # Optional metadata: None means "not reported".
    generated_text: Optional[str] = None
    audio_duration: Optional[float] = None
    sampling_rate: Optional[int] = None
    backend_used: str = ""