# chatterbox-ui/backend/app/models/dialog_models.py

from pydantic import BaseModel, Field, validator
from typing import List, Union, Literal, Optional

# Common base for dialog items. It declares only the `type` tag; the concrete
# subclasses in this module (SpeechItem, SilenceItem) narrow it to a Literal.
class DialogItemBase(BaseModel):
    type: str  # unconstrained string here; each subclass pins it to one fixed value

# A dialog line spoken by a given speaker: the text to synthesize, the
# generation controls, and an optional reference to already-generated audio.
class SpeechItem(DialogItemBase):
    type: Literal['speech'] = 'speech'  # fixed tag identifying this item kind
    speaker_id: str = Field(..., description="ID of the speaker for this speech segment.")
    text: str = Field(..., description="Text content to be synthesized.")
    # Generation controls. Each is Optional with a numeric default, so omitting
    # the field yields the default while an explicit null yields None.
    # NOTE(review): no value ranges are declared here -- confirm the consumer
    # validates the numbers and copes with None.
    exaggeration: Optional[float] = Field(0.5, description="Controls the expressiveness of the speech. Higher values lead to more exaggerated speech. Default from Gradio.")
    cfg_weight: Optional[float] = Field(0.5, description="Classifier-Free Guidance weight. Higher values make the speech more aligned with the prompt text and speaker characteristics. Default from Gradio.")
    temperature: Optional[float] = Field(0.8, description="Controls randomness in generation. Lower values make speech more deterministic, higher values more varied. Default from Gradio.")
    # Reuse of existing audio. Per the descriptions, both the flag and a URL are
    # needed for reuse; this model does not itself enforce that pairing.
    use_existing_audio: Optional[bool] = Field(False, description="If true and audio_url is provided, use the existing audio file instead of generating new audio for this line.")
    audio_url: Optional[str] = Field(None, description="Path or URL to pre-generated audio for this line (used if use_existing_audio is true).")

# A pause in the dialog, expressed as a strictly positive duration in seconds,
# with an optional reference to already-generated audio.
class SilenceItem(DialogItemBase):
    type: Literal['silence'] = 'silence'  # fixed tag identifying this item kind
    duration: float = Field(..., gt=0, description="Duration of the silence in seconds.")  # gt=0: zero and negative values are rejected
    # Reuse of existing audio. Per the descriptions, both the flag and a URL are
    # needed for reuse; this model does not itself enforce that pairing.
    use_existing_audio: Optional[bool] = Field(False, description="If true and audio_url is provided, use the existing audio file for silence instead of generating a new silent segment.")
    audio_url: Optional[str] = Field(None, description="Path or URL to pre-generated audio for this silence (used if use_existing_audio is true).")

# Request payload for generating a dialog: the list of speech/silence items
# and the base name to use for the output files.
class DialogRequest(BaseModel):
    dialog_items: List[Union[SpeechItem, SilenceItem]] = Field(..., description="A list of speech and silence items.")
    # NOTE(review): this value is not sanitized here. If a consumer joins it
    # into a filesystem path, confirm that path separators and '..' are
    # rejected there.
    output_base_name: str = Field(..., description="Base name for the output files (e.g., 'my_dialog_v1'). Extensions will be added automatically.")

    @validator('dialog_items', pre=True, each_item=True)
    def check_item_type(cls, item):
        """Check the ``type`` tag of one raw dialog item before model parsing.

        Runs once per element of ``dialog_items`` (``each_item=True``) and
        before the element is parsed into a model (``pre=True``), so an
        unknown tag produces a single clear error.

        Args:
            item: One element of ``dialog_items``: a ``dict`` from a decoded
                payload, or an already-built ``SpeechItem``/``SilenceItem``.

        Returns:
            The item unchanged; field-level validation is left to the
            ``SpeechItem``/``SilenceItem`` models.

        Raises:
            ValueError: If the item is neither a dict nor one of the item
                models, or if its ``type`` is not 'speech' or 'silence'.
        """
        # Items constructed programmatically are already validated models;
        # without this guard they would be rejected as "not a dictionary".
        if isinstance(item, (SpeechItem, SilenceItem)):
            return item
        if not isinstance(item, dict):
            raise ValueError("Each dialog item must be a dictionary.")
        item_type = item.get('type')
        if item_type in ('speech', 'silence'):
            # The matching item model performs the remaining validation.
            return item
        raise ValueError(f"Unknown dialog item type: {item_type}. Must be 'speech' or 'silence'.")

# Result returned for a dialog generation request. `log` has no default and
# must be supplied; every other field defaults to None.
class DialogResponse(BaseModel):
    log: str = Field(description="Log of the dialog generation process.")
    # For now, these URLs might be relative paths or placeholders.
    # Actual serving strategy will determine the final URL format.
    concatenated_audio_url: Optional[str] = Field(None, description="URL/path to the concatenated audio file.")
    zip_archive_url: Optional[str] = Field(None, description="URL/path to the ZIP archive of all audio files.")
    # Server-side filesystem location, per its description.
    # NOTE(review): confirm whether it should be exposed to API clients.
    temp_dir_path: Optional[str] = Field(None, description="Path to the temporary directory holding generated files, for server-side reference.")
    error_message: Optional[str] = Field(None, description="Error message if the process failed globally.")