Now reuses pre-generated audio files when building the concatenated output file.
This commit is contained in:
parent
4a7c1ea6a1
commit
d3ff6e5241
|
@ -17,3 +17,5 @@ TTS_TEMP_OUTPUT_DIR = PROJECT_ROOT / "tts_temp_outputs"
|
||||||
# These are stored within the 'backend' directory to be easily servable.
|
# These are stored within the 'backend' directory to be easily servable.
|
||||||
DIALOG_OUTPUT_PARENT_DIR = PROJECT_ROOT / "backend"
|
DIALOG_OUTPUT_PARENT_DIR = PROJECT_ROOT / "backend"
|
||||||
DIALOG_GENERATED_DIR = DIALOG_OUTPUT_PARENT_DIR / "tts_generated_dialogs"
|
DIALOG_GENERATED_DIR = DIALOG_OUTPUT_PARENT_DIR / "tts_generated_dialogs"
|
||||||
|
# Alias for clarity and backward compatibility
|
||||||
|
DIALOG_OUTPUT_DIR = DIALOG_GENERATED_DIR
|
||||||
|
|
|
@ -11,10 +11,14 @@ class SpeechItem(DialogItemBase):
|
||||||
exaggeration: Optional[float] = Field(0.5, description="Controls the expressiveness of the speech. Higher values lead to more exaggerated speech. Default from Gradio.")
|
exaggeration: Optional[float] = Field(0.5, description="Controls the expressiveness of the speech. Higher values lead to more exaggerated speech. Default from Gradio.")
|
||||||
cfg_weight: Optional[float] = Field(0.5, description="Classifier-Free Guidance weight. Higher values make the speech more aligned with the prompt text and speaker characteristics. Default from Gradio.")
|
cfg_weight: Optional[float] = Field(0.5, description="Classifier-Free Guidance weight. Higher values make the speech more aligned with the prompt text and speaker characteristics. Default from Gradio.")
|
||||||
temperature: Optional[float] = Field(0.8, description="Controls randomness in generation. Lower values make speech more deterministic, higher values more varied. Default from Gradio.")
|
temperature: Optional[float] = Field(0.8, description="Controls randomness in generation. Lower values make speech more deterministic, higher values more varied. Default from Gradio.")
|
||||||
|
use_existing_audio: Optional[bool] = Field(False, description="If true and audio_url is provided, use the existing audio file instead of generating new audio for this line.")
|
||||||
|
audio_url: Optional[str] = Field(None, description="Path or URL to pre-generated audio for this line (used if use_existing_audio is true).")
|
||||||
|
|
||||||
class SilenceItem(DialogItemBase):
|
class SilenceItem(DialogItemBase):
|
||||||
type: Literal['silence'] = 'silence'
|
type: Literal['silence'] = 'silence'
|
||||||
duration: float = Field(..., gt=0, description="Duration of the silence in seconds.")
|
duration: float = Field(..., gt=0, description="Duration of the silence in seconds.")
|
||||||
|
use_existing_audio: Optional[bool] = Field(False, description="If true and audio_url is provided, use the existing audio file for silence instead of generating a new silent segment.")
|
||||||
|
audio_url: Optional[str] = Field(None, description="Path or URL to pre-generated audio for this silence (used if use_existing_audio is true).")
|
||||||
|
|
||||||
class DialogRequest(BaseModel):
|
class DialogRequest(BaseModel):
|
||||||
dialog_items: List[Union[SpeechItem, SilenceItem]] = Field(..., description="A list of speech and silence items.")
|
dialog_items: List[Union[SpeechItem, SilenceItem]] = Field(..., description="A list of speech and silence items.")
|
||||||
|
|
|
@ -86,11 +86,58 @@ class DialogProcessorService:
|
||||||
dialog_temp_dir.mkdir(parents=True, exist_ok=True)
|
dialog_temp_dir.mkdir(parents=True, exist_ok=True)
|
||||||
processing_log.append(f"Created temporary directory for segments: {dialog_temp_dir}")
|
processing_log.append(f"Created temporary directory for segments: {dialog_temp_dir}")
|
||||||
|
|
||||||
|
import shutil
|
||||||
segment_idx = 0
|
segment_idx = 0
|
||||||
for i, item in enumerate(dialog_items):
|
for i, item in enumerate(dialog_items):
|
||||||
item_type = item.get("type")
|
item_type = item.get("type")
|
||||||
processing_log.append(f"Processing item {i+1}: type='{item_type}'")
|
processing_log.append(f"Processing item {i+1}: type='{item_type}'")
|
||||||
|
|
||||||
|
# --- Universal: Handle reuse of existing audio for both speech and silence ---
|
||||||
|
use_existing_audio = item.get("use_existing_audio", False)
|
||||||
|
audio_url = item.get("audio_url")
|
||||||
|
if use_existing_audio and audio_url:
|
||||||
|
# Determine source path (handle both absolute and relative)
|
||||||
|
# Map web URL to actual file location in tts_generated_dialogs
|
||||||
|
if audio_url.startswith("/generated_audio/"):
|
||||||
|
src_audio_path = config.DIALOG_OUTPUT_DIR / audio_url[len("/generated_audio/"):]
|
||||||
|
else:
|
||||||
|
src_audio_path = Path(audio_url)
|
||||||
|
if not src_audio_path.is_absolute():
|
||||||
|
# Assume relative to the generated audio root dir
|
||||||
|
src_audio_path = config.DIALOG_OUTPUT_DIR / audio_url.lstrip("/\\")
|
||||||
|
# src_audio_path is now the candidate file: inside tts_generated_dialogs for "/generated_audio/" URLs and relative paths; an absolute audio_url is used as given
|
||||||
|
if src_audio_path.is_file():
|
||||||
|
segment_filename = f"{output_base_name}_seg{segment_idx}_reused.wav"
|
||||||
|
dest_path = (self.temp_audio_dir / output_base_name / segment_filename)
|
||||||
|
try:
|
||||||
|
if not src_audio_path.exists():
|
||||||
|
processing_log.append(f"[REUSE] Source audio file does not exist: {src_audio_path}")
|
||||||
|
else:
|
||||||
|
processing_log.append(f"[REUSE] Source audio file exists: {src_audio_path}, size={src_audio_path.stat().st_size} bytes")
|
||||||
|
shutil.copyfile(src_audio_path, dest_path)
|
||||||
|
if not dest_path.exists():
|
||||||
|
processing_log.append(f"[REUSE] Destination audio file was not created: {dest_path}")
|
||||||
|
else:
|
||||||
|
processing_log.append(f"[REUSE] Destination audio file created: {dest_path}, size={dest_path.stat().st_size} bytes")
|
||||||
|
# Only include 'type' and 'path' so the concatenator always includes this segment
|
||||||
|
segment_results.append({
|
||||||
|
"type": item_type,
|
||||||
|
"path": str(dest_path)
|
||||||
|
})
|
||||||
|
processing_log.append(f"Reused existing audio for item {i+1}: copied from {src_audio_path} to {dest_path}")
|
||||||
|
except Exception as e:
|
||||||
|
error_message = f"Failed to copy reused audio for item {i+1}: {e}"
|
||||||
|
processing_log.append(error_message)
|
||||||
|
segment_results.append({"type": "error", "message": error_message})
|
||||||
|
segment_idx += 1
|
||||||
|
continue
|
||||||
|
else:
|
||||||
|
error_message = f"Audio file for reuse not found at {src_audio_path} for item {i+1}."
|
||||||
|
processing_log.append(error_message)
|
||||||
|
segment_results.append({"type": "error", "message": error_message})
|
||||||
|
segment_idx += 1
|
||||||
|
continue
|
||||||
|
|
||||||
if item_type == "speech":
|
if item_type == "speech":
|
||||||
speaker_id = item.get("speaker_id")
|
speaker_id = item.get("speaker_id")
|
||||||
text = item.get("text")
|
text = item.get("text")
|
||||||
|
@ -161,6 +208,11 @@ class DialogProcessorService:
|
||||||
processing_log.append(f"Unknown item type '{item_type}' at item {i+1}. Skipping.")
|
processing_log.append(f"Unknown item type '{item_type}' at item {i+1}. Skipping.")
|
||||||
segment_results.append({"type": "error", "message": f"Unknown item type: {item_type}"})
|
segment_results.append({"type": "error", "message": f"Unknown item type: {item_type}"})
|
||||||
|
|
||||||
|
# Log the full segment_results list for debugging
|
||||||
|
processing_log.append("[DEBUG] Final segment_results list:")
|
||||||
|
for idx, seg in enumerate(segment_results):
|
||||||
|
processing_log.append(f" [{idx}] {seg}")
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"log": "\n".join(processing_log),
|
"log": "\n".join(processing_log),
|
||||||
"segment_files": segment_results,
|
"segment_files": segment_results,
|
||||||
|
|
Loading…
Reference in New Issue