chatterbox-ui/backend/app/models/dialog_models.py

48 lines
3.3 KiB
Python

from pydantic import BaseModel, Field, validator
from typing import List, Union, Literal, Optional
# Common base for every entry in a dialog. Subclasses in this module narrow
# `type` to a fixed Literal value ('speech' or 'silence').
class DialogItemBase(BaseModel):
    # Kind of dialog item, as a plain string at this level.
    type: str
# Dialog entry for one spoken line: who speaks, what is said, optional
# generation settings, and an optional reference to pre-generated audio.
class SpeechItem(DialogItemBase):
    # Fixed tag identifying this entry as speech.
    type: Literal['speech'] = 'speech'

    # --- Required content ---
    speaker_id: str = Field(
        default=...,
        description="ID of the speaker for this speech segment.",
    )
    text: str = Field(
        default=...,
        description="Text content to be synthesized.",
    )

    # --- Generation settings (all optional, each with a default) ---
    exaggeration: Optional[float] = Field(
        default=0.5,
        description="Controls the expressiveness of the speech. Higher values lead to more exaggerated speech. Default from Gradio.",
    )
    cfg_weight: Optional[float] = Field(
        default=0.5,
        description="Classifier-Free Guidance weight. Higher values make the speech more aligned with the prompt text and speaker characteristics. Default from Gradio.",
    )
    temperature: Optional[float] = Field(
        default=0.8,
        description="Controls randomness in generation. Lower values make speech more deterministic, higher values more varied. Default from Gradio.",
    )

    # --- Reuse of pre-generated audio ---
    use_existing_audio: Optional[bool] = Field(
        default=False,
        description="If true and audio_url is provided, use the existing audio file instead of generating new audio for this line.",
    )
    audio_url: Optional[str] = Field(
        default=None,
        description="Path or URL to pre-generated audio for this line (used if use_existing_audio is true).",
    )
# Dialog entry for a pause: a strictly positive duration, plus an optional
# reference to pre-generated audio to use in its place.
class SilenceItem(DialogItemBase):
    # Fixed tag identifying this entry as silence.
    type: Literal['silence'] = 'silence'

    # Required; the `gt=0` constraint rejects zero and negative durations.
    duration: float = Field(
        default=...,
        gt=0,
        description="Duration of the silence in seconds.",
    )

    # --- Reuse of pre-generated audio ---
    use_existing_audio: Optional[bool] = Field(
        default=False,
        description="If true and audio_url is provided, use the existing audio file for silence instead of generating a new silent segment.",
    )
    audio_url: Optional[str] = Field(
        default=None,
        description="Path or URL to pre-generated audio for this silence (used if use_existing_audio is true).",
    )
# Request payload for generating a dialog: an ordered list of speech/silence
# items plus the base name used for the output files.
class DialogRequest(BaseModel):
    dialog_items: List[Union[SpeechItem, SilenceItem]] = Field(..., description="A list of speech and silence items.")
    output_base_name: str = Field(..., description="Base name for the output files (e.g., 'my_dialog_v1'). Extensions will be added automatically.")

    @validator('dialog_items', pre=True, each_item=True)
    def check_item_type(cls, item):
        """Pre-validate a single raw dialog item before model parsing.

        Runs once per element of ``dialog_items`` (``each_item=True``) and
        before pydantic's own field validation (``pre=True``), so that an
        unknown ``type`` produces one clear error instead of a list of
        per-union-member failures.

        Args:
            item: One raw element of ``dialog_items``; either a dict or an
                already-constructed ``SpeechItem`` / ``SilenceItem``.

        Returns:
            The item unchanged; field-level validation is left to pydantic.

        Raises:
            ValueError: If the item is neither a dict nor one of the item
                models, or if its ``type`` is not 'speech' or 'silence'.
        """
        # Callers building the request in Python may pass model instances
        # directly; those are already typed, so let them through instead of
        # rejecting them as "not a dictionary".
        if isinstance(item, (SpeechItem, SilenceItem)):
            return item

        if not isinstance(item, dict):
            raise ValueError("Each dialog item must be a dictionary.")

        item_type = item.get('type')
        # Pydantic will handle further validation based on the SpeechItem /
        # SilenceItem model that matches the declared type.
        if item_type in ('speech', 'silence'):
            return item

        raise ValueError(f"Unknown dialog item type: {item_type}. Must be 'speech' or 'silence'.")
# Response payload for a dialog generation request: a process log plus
# optional locations of the produced artifacts and an optional error message.
class DialogResponse(BaseModel):
    log: str = Field(description="Log of the dialog generation process.")

    # The URL fields below may currently hold relative paths or placeholders;
    # their final format depends on how the files end up being served.
    concatenated_audio_url: Optional[str] = Field(
        default=None,
        description="URL/path to the concatenated audio file.",
    )
    zip_archive_url: Optional[str] = Field(
        default=None,
        description="URL/path to the ZIP archive of all audio files.",
    )

    temp_dir_path: Optional[str] = Field(
        default=None,
        description="Path to the temporary directory holding generated files, for server-side reference.",
    )
    error_message: Optional[str] = Field(
        default=None,
        description="Error message if the process failed globally.",
    )