diff --git a/backend/app/config.py b/backend/app/config.py
index 70cd037..5ba1601 100644
--- a/backend/app/config.py
+++ b/backend/app/config.py
@@ -17,3 +17,5 @@ TTS_TEMP_OUTPUT_DIR = PROJECT_ROOT / "tts_temp_outputs"
 # These are stored within the 'backend' directory to be easily servable.
 DIALOG_OUTPUT_PARENT_DIR = PROJECT_ROOT / "backend"
 DIALOG_GENERATED_DIR = DIALOG_OUTPUT_PARENT_DIR / "tts_generated_dialogs"
+# Alias for clarity and backward compatibility
+DIALOG_OUTPUT_DIR = DIALOG_GENERATED_DIR
diff --git a/backend/app/models/dialog_models.py b/backend/app/models/dialog_models.py
index e198adc..a5845ef 100644
--- a/backend/app/models/dialog_models.py
+++ b/backend/app/models/dialog_models.py
@@ -11,10 +11,14 @@ class SpeechItem(DialogItemBase):
     exaggeration: Optional[float] = Field(0.5, description="Controls the expressiveness of the speech. Higher values lead to more exaggerated speech. Default from Gradio.")
     cfg_weight: Optional[float] = Field(0.5, description="Classifier-Free Guidance weight. Higher values make the speech more aligned with the prompt text and speaker characteristics. Default from Gradio.")
     temperature: Optional[float] = Field(0.8, description="Controls randomness in generation. Lower values make speech more deterministic, higher values more varied. Default from Gradio.")
+    use_existing_audio: Optional[bool] = Field(False, description="If true and audio_url is provided, use the existing audio file instead of generating new audio for this line.")
+    audio_url: Optional[str] = Field(None, description="Path or URL to pre-generated audio for this line (used if use_existing_audio is true).")
 
 class SilenceItem(DialogItemBase):
     type: Literal['silence'] = 'silence'
     duration: float = Field(..., gt=0, description="Duration of the silence in seconds.")
+    use_existing_audio: Optional[bool] = Field(False, description="If true and audio_url is provided, use the existing audio file for silence instead of generating a new silent segment.")
+    audio_url: Optional[str] = Field(None, description="Path or URL to pre-generated audio for this silence (used if use_existing_audio is true).")
 
 class DialogRequest(BaseModel):
     dialog_items: List[Union[SpeechItem, SilenceItem]] = Field(..., description="A list of speech and silence items.")
diff --git a/backend/app/services/dialog_processor_service.py b/backend/app/services/dialog_processor_service.py
index 050e5b6..833c199 100644
--- a/backend/app/services/dialog_processor_service.py
+++ b/backend/app/services/dialog_processor_service.py
@@ -86,11 +86,64 @@ class DialogProcessorService:
         dialog_temp_dir.mkdir(parents=True, exist_ok=True)
         processing_log.append(f"Created temporary directory for segments: {dialog_temp_dir}")
 
+        import shutil
         segment_idx = 0
         for i, item in enumerate(dialog_items):
             item_type = item.get("type")
             processing_log.append(f"Processing item {i+1}: type='{item_type}'")
 
+            # --- Universal: Handle reuse of existing audio for both speech and silence ---
+            use_existing_audio = item.get("use_existing_audio", False)
+            audio_url = item.get("audio_url")
+            if use_existing_audio and audio_url:
+                # audio_url is a client-supplied request field, so it is untrusted:
+                # resolve it and accept only files inside the generated-audio root.
+                # This rejects "../" traversal and absolute paths pointing elsewhere.
+                reuse_root = config.DIALOG_OUTPUT_DIR.resolve()
+                if audio_url.startswith("/generated_audio/"):
+                    # Map web URL to actual file location in tts_generated_dialogs
+                    src_audio_path = reuse_root / audio_url[len("/generated_audio/"):]
+                else:
+                    src_audio_path = Path(audio_url)
+                    if not src_audio_path.is_absolute():
+                        # Assume relative to the generated audio root dir
+                        src_audio_path = reuse_root / audio_url.lstrip("/\\")
+                try:
+                    src_audio_path = src_audio_path.resolve()
+                    src_audio_path.relative_to(reuse_root)
+                except (OSError, RuntimeError, ValueError):
+                    # Unresolvable path, or one that escapes reuse_root.
+                    src_is_allowed = False
+                else:
+                    src_is_allowed = True
+                if not src_is_allowed:
+                    error_message = f"Audio file for reuse at {src_audio_path} is outside the generated audio directory for item {i+1}."
+                    processing_log.append(error_message)
+                    segment_results.append({"type": "error", "message": error_message})
+                elif not src_audio_path.is_file():
+                    error_message = f"Audio file for reuse not found at {src_audio_path} for item {i+1}."
+                    processing_log.append(error_message)
+                    segment_results.append({"type": "error", "message": error_message})
+                else:
+                    segment_filename = f"{output_base_name}_seg{segment_idx}_reused.wav"
+                    dest_path = self.temp_audio_dir / output_base_name / segment_filename
+                    try:
+                        processing_log.append(f"[REUSE] Source audio file exists: {src_audio_path}, size={src_audio_path.stat().st_size} bytes")
+                        shutil.copyfile(src_audio_path, dest_path)
+                        processing_log.append(f"[REUSE] Destination audio file created: {dest_path}, size={dest_path.stat().st_size} bytes")
+                    except OSError as e:
+                        # e.g. source vanished/unreadable or destination not writable
+                        error_message = f"Failed to copy reused audio for item {i+1}: {e}"
+                        processing_log.append(error_message)
+                        segment_results.append({"type": "error", "message": error_message})
+                    else:
+                        # Only 'type' and 'path' are emitted for a reused segment (original author's note: so the concatenator always includes it)
+                        segment_results.append({"type": item_type, "path": str(dest_path)})
+                        processing_log.append(f"Reused existing audio for item {i+1}: copied from {src_audio_path} to {dest_path}")
+                # Advance the segment index and skip normal generation on every reuse outcome.
+                segment_idx += 1
+                continue
+
             if item_type == "speech":
                 speaker_id = item.get("speaker_id")
                 text = item.get("text")
@@ -161,6 +214,11 @@ class DialogProcessorService:
                 processing_log.append(f"Unknown item type '{item_type}' at item {i+1}. Skipping.")
                 segment_results.append({"type": "error", "message": f"Unknown item type: {item_type}"})
 
+        # Log the full segment_results list for debugging
+        processing_log.append("[DEBUG] Final segment_results list:")
+        for idx, seg in enumerate(segment_results):
+            processing_log.append(f" [{idx}] {seg}")
+
         return {
             "log": "\n".join(processing_log),
             "segment_files": segment_results,