56 lines
1.7 KiB
Python
56 lines
1.7 KiB
Python
"""
TTS data models and request/response structures for multi-backend support.
"""
|
|
from abc import ABC, abstractmethod
|
|
from dataclasses import dataclass, field
|
|
from typing import Dict, Any, Optional
|
|
from pathlib import Path
|
|
|
|
@dataclass
class TTSParameters:
    """Common TTS parameters with backend-specific extensions.

    Attributes:
        temperature: Generation temperature; defaults to 0.8.
            NOTE(review): presumably the sampling temperature forwarded to
            the backend -- confirm against the backend implementations.
        backend_params: Free-form mapping holding backend-specific extension
            parameters; defaults to an empty dict.
    """

    temperature: float = 0.8
    # default_factory hands every instance its own dict rather than a shared one.
    backend_params: Dict[str, Any] = field(default_factory=dict)
|
|
|
|
@dataclass
class SpeakerConfig:
    """Enhanced speaker configuration.

    Attributes:
        id: Speaker identifier.
        name: Speaker name; included in validation error messages.
        sample_path: Location of the speaker sample, stored as a string.
            It may be relative (possibly to the speaker data directory);
            its existence is not checked by this class.
        reference_text: Optional reference text. Required (non-blank) when
            ``tts_backend`` is ``"higgs"``.
        tts_backend: Name of the TTS backend; defaults to ``"chatterbox"``.
    """

    id: str
    name: str
    sample_path: str
    reference_text: Optional[str] = None
    tts_backend: str = "chatterbox"

    def validate(self) -> None:
        """Validate speaker configuration based on backend.

        Raises:
            ValueError: If ``tts_backend`` is ``"higgs"`` and
                ``reference_text`` is missing, empty, or whitespace-only.
        """
        if self.tts_backend == "higgs":
            # Blank text is as unusable as missing text, so strip before testing.
            if not (self.reference_text or "").strip():
                raise ValueError(f"reference_text required for Higgs backend speaker: {self.name}")

        # sample_path is intentionally not checked here: a relative path might
        # be relative to the speaker data dir and is validated later.
|
|
|
|
@dataclass
class OutputConfig:
    """Output configuration for TTS generation.

    Attributes:
        filename_base: Base name for the generated output (required).
        output_dir: Optional output directory; ``None`` when not specified.
        format: Output format name; defaults to ``"wav"``.
    """

    filename_base: str
    output_dir: Optional[Path] = None
    # Shadows the builtin ``format`` inside the class body only; the name is
    # part of the constructor interface and is kept as-is.
    format: str = "wav"
|
|
|
|
@dataclass
class TTSRequest:
    """Unified TTS request structure.

    Bundles everything needed for one generation call; all four fields are
    required and have no defaults.

    Attributes:
        text: The text of the request.
        speaker_config: Speaker configuration for the request.
        parameters: Common and backend-specific TTS parameters.
        output_config: Output configuration for the request.
    """

    text: str
    speaker_config: SpeakerConfig
    parameters: TTSParameters
    output_config: OutputConfig
|
|
|
|
@dataclass
class TTSResponse:
    """Unified TTS response structure.

    Attributes:
        output_path: Path of the produced output (required).
        generated_text: Optional generated text; ``None`` when not provided.
        audio_duration: Optional audio duration; ``None`` when not provided.
            NOTE(review): unit is presumably seconds -- confirm with backends.
        sampling_rate: Optional sampling rate; ``None`` when not provided.
            NOTE(review): unit is presumably Hz -- confirm with backends.
        backend_used: Name of the backend that handled the request; defaults
            to an empty string.
    """

    output_path: Path
    generated_text: Optional[str] = None
    audio_duration: Optional[float] = None
    sampling_rate: Optional[int] = None
    backend_used: str = ""