from typing import List, Literal, Optional, Union

from pydantic import BaseModel, Field, validator


class DialogItemBase(BaseModel):
    """Common base for dialog items; ``type`` tags which kind of item it is."""

    type: str


class SpeechItem(DialogItemBase):
    """A dialog item describing one segment of speech.

    Carries the speaker, the text, the generation parameters, and an optional
    reference to pre-generated audio.
    """

    type: Literal['speech'] = 'speech'
    speaker_id: str = Field(
        ...,
        description="ID of the speaker for this speech segment.",
    )
    text: str = Field(
        ...,
        description="Text content to be synthesized.",
    )
    exaggeration: Optional[float] = Field(
        0.5,
        description="Controls the expressiveness of the speech. Higher values lead to more exaggerated speech. Default from Gradio.",
    )
    cfg_weight: Optional[float] = Field(
        0.5,
        description="Classifier-Free Guidance weight. Higher values make the speech more aligned with the prompt text and speaker characteristics. Default from Gradio.",
    )
    temperature: Optional[float] = Field(
        0.8,
        description="Controls randomness in generation. Lower values make speech more deterministic, higher values more varied. Default from Gradio.",
    )
    use_existing_audio: Optional[bool] = Field(
        False,
        description="If true and audio_url is provided, use the existing audio file instead of generating new audio for this line.",
    )
    audio_url: Optional[str] = Field(
        None,
        description="Path or URL to pre-generated audio for this line (used if use_existing_audio is true).",
    )


class SilenceItem(DialogItemBase):
    """A dialog item describing a pause of ``duration`` seconds (must be > 0)."""

    type: Literal['silence'] = 'silence'
    duration: float = Field(
        ...,
        gt=0,
        description="Duration of the silence in seconds.",
    )
    use_existing_audio: Optional[bool] = Field(
        False,
        description="If true and audio_url is provided, use the existing audio file for silence instead of generating a new silent segment.",
    )
    audio_url: Optional[str] = Field(
        None,
        description="Path or URL to pre-generated audio for this silence (used if use_existing_audio is true).",
    )


class DialogRequest(BaseModel):
    """Request payload: an ordered list of dialog items plus an output base name."""

    dialog_items: List[Union[SpeechItem, SilenceItem]] = Field(
        ...,
        description="A list of speech and silence items.",
    )
    output_base_name: str = Field(
        ...,
        description="Base name for the output files (e.g., 'my_dialog_v1'). Extensions will be added automatically.",
    )

    @validator('dialog_items', pre=True, each_item=True)
    def check_item_type(cls, item):
        """Check a raw dialog item's ``type`` tag before field validation.

        Runs once per list element, ahead of the ``Union[SpeechItem,
        SilenceItem]`` validation, so an unknown tag produces one clear
        message.

        Args:
            item: A raw element of ``dialog_items``: a dict, or an
                already-constructed ``SpeechItem`` / ``SilenceItem``.

        Returns:
            The item, unchanged.

        Raises:
            ValueError: If the item is neither a dict nor an item model, or
                if its ``type`` is not ``'speech'`` or ``'silence'``.
        """
        # Already-built item models are accepted as-is so requests can be
        # assembled in code, not only parsed from JSON-like dictionaries.
        if isinstance(item, (SpeechItem, SilenceItem)):
            return item
        if not isinstance(item, dict):
            raise ValueError("Each dialog item must be a dictionary.")
        item_type = item.get('type')
        # Only the tag is checked here; Pydantic will handle further
        # validation based on the SpeechItem / SilenceItem models.
        if item_type in ('speech', 'silence'):
            return item
        raise ValueError(f"Unknown dialog item type: {item_type}. Must be 'speech' or 'silence'.")


class DialogResponse(BaseModel):
    """Response payload: a generation log, output locations, and any global error."""

    log: str = Field(description="Log of the dialog generation process.")
    # For now, these URLs might be relative paths or placeholders.
    # Actual serving strategy will determine the final URL format.
    concatenated_audio_url: Optional[str] = Field(
        None,
        description="URL/path to the concatenated audio file.",
    )
    zip_archive_url: Optional[str] = Field(
        None,
        description="URL/path to the ZIP archive of all audio files.",
    )
    temp_dir_path: Optional[str] = Field(
        None,
        description="Path to the temporary directory holding generated files, for server-side reference.",
    )
    error_message: Optional[str] = Field(
        None,
        description="Error message if the process failed globally.",
    )