Compare commits
No commits in common. "0e522feddfe0599a193fc75911c392a4cfb63d8a" and "2af705ca43bef2f72e311eff64dc565bdd35b773" have entirely different histories.
0e522feddf ... 2af705ca43
File diff suppressed because it is too large
@@ -1,9 +0,0 @@
(command-history file deleted; the leading "+" characters below are part of the removed file's content, not diff markers)

# 2025-06-14 18:21:08.215816
+yes

# 2025-06-14 18:21:29.450580
+/model

# 2025-06-14 18:22:01.292648
+/exit
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
OpenCode.md (36 lines)
@@ -1,36 +0,0 @@
# OpenCode.md

## Build/Test Commands
```bash
# Backend setup and run (from project root)
pip install -r backend/requirements.txt
uvicorn backend.app.main:app --reload --host 0.0.0.0 --port 8000

# Frontend tests
npm test                                     # Run all Jest tests
npm test -- --testNamePattern="getSpeakers"  # Run single test

# Backend API test
python backend/run_api_test.py

# Alternative interface
python gradio_app.py
```

## Code Style Guidelines

### Python (Backend)
- **Imports**: Standard library first, third-party, then local imports with blank lines between groups
- **Types**: Use type hints extensively (`List[Speaker]`, `Optional[str]`, `Dict[str, Any]`)
- **Classes**: PascalCase (`SpeakerManagementService`, `DialogRequest`)
- **Functions/Variables**: snake_case (`get_speakers`, `speaker_id`, `audio_url`)
- **Error Handling**: Use FastAPI `HTTPException` with descriptive messages
- **Models**: Pydantic models with Field descriptions and validators

### JavaScript (Frontend)
- **Modules**: ES6 modules with explicit imports/exports
- **Functions**: camelCase with JSDoc comments (`getSpeakers`, `addSpeaker`)
- **Constants**: UPPER_SNAKE_CASE (`API_BASE_URL`)
- **Error Handling**: Comprehensive try/catch with detailed error messages
- **Async**: Use async/await consistently, handle response.ok checks
- **Testing**: Jest with descriptive test names and comprehensive mocking
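To make the removed style guide concrete, here is a minimal, illustrative sketch of a backend route that follows the stated Python conventions (type hints, a PascalCase Pydantic model with Field descriptions, snake_case functions, and HTTPException with a descriptive message). The `Speaker` model shown here and its in-memory store are invented for the example, not taken from the repository:

```python
from typing import Dict, List, Optional

from fastapi import APIRouter, HTTPException
from pydantic import BaseModel, Field

router = APIRouter()


class Speaker(BaseModel):
    """Illustrative Pydantic model following the documented conventions."""
    id: str = Field(..., description="Unique speaker identifier")
    name: str = Field(..., description="Human-readable speaker name")
    sample_path: Optional[str] = Field(None, description="Path to the reference audio sample")


# Hypothetical in-memory store used only for this sketch.
_SPEAKERS: Dict[str, Speaker] = {}


@router.get("/speakers", response_model=List[Speaker])
def get_speakers() -> List[Speaker]:
    return list(_SPEAKERS.values())


@router.get("/speakers/{speaker_id}", response_model=Speaker)
def get_speaker(speaker_id: str) -> Speaker:
    speaker = _SPEAKERS.get(speaker_id)
    if speaker is None:
        # Descriptive HTTPException, as the guidelines recommend.
        raise HTTPException(status_code=404, detail=f"Speaker '{speaker_id}' not found.")
    return speaker
```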
@@ -4,7 +4,6 @@ from fastapi.middleware.cors import CORSMiddleware
 from pathlib import Path
 from app.routers import speakers, dialog  # Import the routers
 from app import config
-from app.services.tts_service import get_global_tts_service
 
 app = FastAPI(
     title="Chatterbox TTS API",
@@ -38,21 +37,4 @@ config.DIALOG_GENERATED_DIR.mkdir(parents=True, exist_ok=True)
 # Mount StaticFiles to serve generated dialogs
 app.mount("/generated_audio", StaticFiles(directory=config.DIALOG_GENERATED_DIR), name="generated_audio")
 
-# Application lifecycle events for TTS model management
-@app.on_event("startup")
-async def startup_event():
-    """Load TTS model on application startup."""
-    print("🚀 Starting Chatterbox TTS API...")
-    tts_service = get_global_tts_service()
-    tts_service.load_model()
-    print("✅ TTS model loaded and ready!")
-
-@app.on_event("shutdown")
-async def shutdown_event():
-    """Unload TTS model on application shutdown."""
-    print("🔄 Shutting down Chatterbox TTS API...")
-    tts_service = get_global_tts_service()
-    tts_service.unload_model()
-    print("✅ TTS model unloaded. Goodbye!")
-
 # Further endpoints for speakers, dialog generation, etc., will be added here.
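The lines removed in this hunk load the TTS model once when the FastAPI app starts and release it at shutdown. A self-contained sketch of that lifecycle pattern, using a stand-in model object instead of the project's ChatterboxTTS service, might look like this:

```python
from fastapi import FastAPI

app = FastAPI(title="Lifecycle sketch")

# Stand-in for the project's TTS service; a real model handle would go here.
_model = {"loaded": False}


@app.on_event("startup")
async def startup_event():
    # Load the heavyweight model once, so requests never pay the load cost.
    _model["loaded"] = True
    print("Model loaded and ready.")


@app.on_event("shutdown")
async def shutdown_event():
    # Release the model when the server stops.
    _model["loaded"] = False
    print("Model unloaded.")


@app.get("/health")
async def health():
    return {"model_loaded": _model["loaded"]}
```

The right-hand side of the comparison drops these hooks and instead loads and unloads the model around each dialog request, as shown in the router diff below.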
@@ -4,7 +4,7 @@ import shutil
 import os
 
 from app.models.dialog_models import DialogRequest, DialogResponse
-from app.services.tts_service import TTSService, get_global_tts_service
+from app.services.tts_service import TTSService
 from app.services.speaker_service import SpeakerManagementService
 from app.services.dialog_processor_service import DialogProcessorService
 from app.services.audio_manipulation_service import AudioManipulationService
@@ -17,8 +17,8 @@ router = APIRouter()
 # For now, direct instantiation or simple Depends is fine.
 
 def get_tts_service():
-    # Return the global singleton instance
-    return get_global_tts_service(device="mps")
+    # Consider making device configurable
+    return TTSService(device="mps")
 
 def get_speaker_management_service():
     return SpeakerManagementService()
@@ -128,7 +128,19 @@ async def generate_line(
             detail=error_detail
         )
 
-# Note: manage_tts_model_lifecycle function removed - model lifecycle now managed at application startup/shutdown
+async def manage_tts_model_lifecycle(tts_service: TTSService, task_function, *args, **kwargs):
+    """Loads TTS model, executes task, then unloads model."""
+    try:
+        print("API: Loading TTS model...")
+        tts_service.load_model()
+        return await task_function(*args, **kwargs)
+    except Exception as e:
+        # Log or handle specific exceptions if needed before re-raising
+        print(f"API: Error during TTS model lifecycle or task execution: {e}")
+        raise
+    finally:
+        print("API: Unloading TTS model...")
+        tts_service.unload_model()
 
 async def process_dialog_flow(
     request: DialogRequest,
@@ -262,8 +274,10 @@ async def generate_dialog_endpoint(
     - Concatenates all audio segments into a single file.
    - Creates a ZIP archive of all individual segments and the concatenated file.
     """
-    # Model is now loaded at startup and kept loaded - no per-request lifecycle management needed
-    return await process_dialog_flow(
+    # Wrap the core processing logic with model loading/unloading
+    return await manage_tts_model_lifecycle(
+        tts_service,
+        process_dialog_flow,
         request=request,
         dialog_processor=dialog_processor,
         audio_manipulator=audio_manipulator,
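The right-hand side reintroduces a per-request load/run/unload wrapper (manage_tts_model_lifecycle). A stripped-down, runnable sketch of that pattern, with dummy service and task names standing in for the real ones, is:

```python
import asyncio


class DummyTTSService:
    """Stand-in for the project's TTSService; only the lifecycle methods matter here."""

    def load_model(self) -> None:
        print("loading model")

    def unload_model(self) -> None:
        print("unloading model")


async def manage_lifecycle(service, task, *args, **kwargs):
    # Load the model, run the task, and always unload afterwards.
    try:
        service.load_model()
        return await task(*args, **kwargs)
    finally:
        service.unload_model()


async def fake_dialog_task(name: str) -> str:
    return f"processed {name}"


if __name__ == "__main__":
    result = asyncio.run(manage_lifecycle(DummyTTSService(), fake_dialog_task, "demo"))
    print(result)
```

The trade-off captured in this comparison is between paying the model-load cost on every request (the wrapper approach on the right) and keeping a single loaded model resident for the lifetime of the server (the startup/shutdown approach on the left).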
@@ -41,22 +41,10 @@ def safe_load_chatterbox_tts(device):
     return ChatterboxTTS.from_pretrained(device=device)
 
 class TTSService:
-    _instance = None
-    _initialized = False
-
-    def __new__(cls, device: str = "mps"):
-        """Singleton pattern - ensures only one instance exists."""
-        if cls._instance is None:
-            cls._instance = super(TTSService, cls).__new__(cls)
-        return cls._instance
-
     def __init__(self, device: str = "mps"):  # Default to MPS for Macs, can be "cpu" or "cuda"
-        # Only initialize once to prevent resetting the model
-        if not self._initialized:
-            self.device = device
-            self.model = None
-            self._ensure_output_dir_exists()
-            TTSService._initialized = True
+        self.device = device
+        self.model = None
+        self._ensure_output_dir_exists()
 
     def _ensure_output_dir_exists(self):
         """Ensures the TTS output directory exists."""
@@ -74,12 +62,12 @@ class TTSService:
                 # Potentially raise an exception or handle appropriately
                 raise
         else:
-            print("[Singleton] ChatterboxTTS model already loaded.")
+            print("ChatterboxTTS model already loaded.")
 
     def unload_model(self):
         """Unloads the model and clears memory."""
         if self.model is not None:
-            print("[Singleton] Unloading ChatterboxTTS model and clearing cache...")
+            print("Unloading ChatterboxTTS model and clearing cache...")
             del self.model
             self.model = None
             if self.device == "cuda":
@@ -88,9 +76,7 @@ class TTSService:
                 if hasattr(torch.mps, "empty_cache"):  # Check if empty_cache is available for MPS
                     torch.mps.empty_cache()
             gc.collect()  # Explicitly run garbage collection
-            print("[Singleton] Model unloaded and memory cleared.")
-        else:
-            print("[Singleton] Model was not loaded, nothing to unload.")
+            print("Model unloaded and memory cleared.")
 
     async def generate_speech(
         self,
@@ -108,7 +94,10 @@ class TTSService:
         Saves the output to a .wav file.
         """
         if self.model is None:
-            raise RuntimeError("TTS model is not loaded. Model should be loaded at application startup.")
+            self.load_model()
+
+        if self.model is None:  # Check again if loading failed
+            raise RuntimeError("TTS model is not loaded. Cannot generate speech.")
 
         # Ensure speaker_sample_path is valid
         speaker_sample_p = Path(speaker_sample_path)
@@ -141,20 +130,10 @@ class TTSService:
         # For now, we keep it loaded. Memory management might need refinement.
         pass
 
-# Global singleton instance access
-_global_tts_service = None
-
-def get_global_tts_service(device: str = "mps") -> TTSService:
-    """Get the global singleton TTS service instance."""
-    global _global_tts_service
-    if _global_tts_service is None:
-        _global_tts_service = TTSService(device=device)
-    return _global_tts_service
-
 # Example usage (for testing, not part of the service itself)
 if __name__ == "__main__":
     async def main_test():
-        tts_service = get_global_tts_service(device="mps")
+        tts_service = TTSService(device="mps")
         try:
             tts_service.load_model()
 
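The left-hand side of this file implements the service as a singleton: __new__ returns one shared instance, __init__ runs only once, and a module-level get_global_tts_service() accessor hands that instance out. A generic, self-contained sketch of the same pattern (class and function names here are illustrative, not the project's):

```python
class ModelHolder:
    """Generic singleton sketch mirroring the pattern on the left-hand side."""

    _instance = None
    _initialized = False

    def __new__(cls, device: str = "cpu"):
        if cls._instance is None:
            cls._instance = super().__new__(cls)
        return cls._instance

    def __init__(self, device: str = "cpu"):
        # Initialize only once so repeated construction does not reset state.
        if not self._initialized:
            self.device = device
            self.model = None
            ModelHolder._initialized = True


_global_holder = None


def get_global_holder(device: str = "cpu") -> ModelHolder:
    """Module-level accessor, analogous to get_global_tts_service()."""
    global _global_holder
    if _global_holder is None:
        _global_holder = ModelHolder(device=device)
    return _global_holder


if __name__ == "__main__":
    a = get_global_holder("cpu")
    b = ModelHolder("cuda")   # Returns the same instance; device is not reset.
    print(a is b, a.device)   # True cpu
```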
cbx-audiobook.py (404 lines)
@@ -1,404 +0,0 @@
#!/usr/bin/env python
"""
Chatterbox Audiobook Generator

This script converts a text file into an audiobook using the Chatterbox TTS system.
It parses the text file into manageable chunks, generates audio for each chunk,
and assembles them into a complete audiobook.
"""

import argparse
import asyncio
import gc
import os
import re
import subprocess
import sys
import torch
from pathlib import Path
import uuid

# Import helper to fix Python path
import import_helper

# Import backend services
from backend.app.services.tts_service import TTSService
from backend.app.services.speaker_service import SpeakerManagementService
from backend.app.services.audio_manipulation_service import AudioManipulationService
from backend.app.config import DIALOG_GENERATED_DIR, TTS_TEMP_OUTPUT_DIR

class AudiobookGenerator:
    def __init__(self, speaker_id, output_base_name, device="mps",
                 exaggeration=0.5, cfg_weight=0.5, temperature=0.8,
                 pause_between_sentences=0.5, pause_between_paragraphs=1.0,
                 use_subprocess=False):
        """
        Initialize the audiobook generator.

        Args:
            speaker_id: ID of the speaker to use
            output_base_name: Base name for output files
            device: Device to use for TTS (mps, cuda, cpu)
            exaggeration: Controls expressiveness (0.0-1.0)
            cfg_weight: Controls alignment with speaker characteristics (0.0-1.0)
            temperature: Controls randomness in generation (0.0-1.0)
            pause_between_sentences: Pause duration between sentences in seconds
            pause_between_paragraphs: Pause duration between paragraphs in seconds
            use_subprocess: If True, uses separate processes for each chunk (slower but guarantees memory release)
        """
        self.speaker_id = speaker_id
        self.output_base_name = output_base_name
        self.device = device
        self.exaggeration = exaggeration
        self.cfg_weight = cfg_weight
        self.temperature = temperature
        self.pause_between_sentences = pause_between_sentences
        self.pause_between_paragraphs = pause_between_paragraphs
        self.use_subprocess = use_subprocess

        # Initialize services
        self.tts_service = TTSService(device=device)
        self.speaker_service = SpeakerManagementService()
        self.audio_manipulator = AudioManipulationService()

        # Create output directories
        self.output_dir = DIALOG_GENERATED_DIR / output_base_name
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self.temp_dir = TTS_TEMP_OUTPUT_DIR / output_base_name
        self.temp_dir.mkdir(parents=True, exist_ok=True)

        # Validate speaker
        self._validate_speaker()

    def _validate_speaker(self):
        """Validate that the specified speaker exists."""
        speaker_info = self.speaker_service.get_speaker_by_id(self.speaker_id)
        if not speaker_info:
            raise ValueError(f"Speaker ID '{self.speaker_id}' not found.")
        if not speaker_info.sample_path:
            raise ValueError(f"Speaker ID '{self.speaker_id}' has no sample path defined.")

        # Store speaker info for later use
        self.speaker_info = speaker_info


    async def _generate_chunk_subprocess(self, chunk, segment_filename_base, speaker_sample_path):
        """
        Generate a single chunk using cbx-generate.py in a subprocess.
        This guarantees memory is released when the process exits.
        """
        output_file = self.temp_dir / f"{segment_filename_base}.wav"

        # Use cbx-generate.py for single chunk generation
        cmd = [
            sys.executable, "cbx-generate.py",
            "--sample", str(speaker_sample_path),
            "--output", str(output_file),
            "--text", chunk,
            "--device", self.device
        ]

        print(f"Running subprocess: {' '.join(cmd[:4])} ... (text truncated)")

        try:
            result = subprocess.run(
                cmd,
                capture_output=True,
                text=True,
                timeout=300,  # 5 minute timeout per chunk
                cwd=Path(__file__).parent  # Run from project root
            )

            if result.returncode != 0:
                raise RuntimeError(f"Subprocess failed: {result.stderr}")

            if not output_file.exists():
                raise RuntimeError(f"Output file not created: {output_file}")

            print(f"Subprocess completed successfully: {output_file}")
            return output_file

        except subprocess.TimeoutExpired:
            raise RuntimeError(f"Subprocess timed out after 5 minutes")
        except Exception as e:
            raise RuntimeError(f"Subprocess error: {e}")

    def split_text_into_chunks(self, text, max_length=300):
        """
        Split text into chunks suitable for TTS processing.

        This uses the same logic as the DialogProcessorService._split_text method
        but adds additional paragraph handling.
        """
        # Split text into paragraphs first
        paragraphs = re.split(r'\n\s*\n', text)
        paragraphs = [p.strip() for p in paragraphs if p.strip()]

        all_chunks = []

        for paragraph in paragraphs:
            # Split paragraph into sentences
            sentences = re.split(r'(?<=[.!?\u2026])\s+|(?<=[.!?\u2026])(?=[\"\')\]\}\u201d\u2019])|(?<=[.!?\u2026])$', paragraph.strip())
            sentences = [s.strip() for s in sentences if s and s.strip()]

            chunks = []
            current_chunk = ""

            for sentence in sentences:
                if not sentence:
                    continue
                if not current_chunk:  # First sentence for this chunk
                    current_chunk = sentence
                elif len(current_chunk) + len(sentence) + 1 <= max_length:
                    current_chunk += " " + sentence
                else:
                    chunks.append(current_chunk)
                    current_chunk = sentence

            if current_chunk:  # Add the last chunk
                chunks.append(current_chunk)

            # Further split any chunks that are still too long
            paragraph_chunks = []
            for chunk in chunks:
                if len(chunk) > max_length:
                    # Simple split by length if a sentence itself is too long
                    for i in range(0, len(chunk), max_length):
                        paragraph_chunks.append(chunk[i:i+max_length])
                else:
                    paragraph_chunks.append(chunk)

            # Add paragraph marker
            if paragraph_chunks:
                all_chunks.append({"type": "paragraph", "chunks": paragraph_chunks})

        return all_chunks

    async def generate_audiobook(self, text_file_path):
        """
        Generate an audiobook from a text file.

        Args:
            text_file_path: Path to the text file to convert

        Returns:
            Path to the generated audiobook file
        """
        # Read the text file
        text_path = Path(text_file_path)
        if not text_path.exists():
            raise FileNotFoundError(f"Text file not found: {text_file_path}")

        with open(text_path, 'r', encoding='utf-8') as f:
            text = f.read()

        print(f"Processing text file: {text_file_path}")
        print(f"Text length: {len(text)} characters")

        # Split text into chunks
        paragraphs = self.split_text_into_chunks(text)
        total_chunks = sum(len(p["chunks"]) for p in paragraphs)
        print(f"Split into {len(paragraphs)} paragraphs with {total_chunks} total chunks")

        # Generate audio for each chunk
        segment_results = []
        chunk_count = 0

        # Load model once at the start (singleton will handle reuse)
        print("Loading TTS model...")
        self.tts_service.load_model()

        try:
            for para_idx, paragraph in enumerate(paragraphs):
                print(f"Processing paragraph {para_idx+1}/{len(paragraphs)}")

                for chunk_idx, chunk in enumerate(paragraph["chunks"]):
                    chunk_count += 1
                    print(f"  Generating audio for chunk {chunk_count}/{total_chunks}: {chunk[:50]}...")

                    # Generate unique filename for this chunk
                    segment_filename_base = f"{self.output_base_name}_p{para_idx}_c{chunk_idx}_{uuid.uuid4().hex[:8]}"

                    try:
                        # Get absolute speaker sample path
                        speaker_sample_path = Path(self.speaker_info.sample_path)
                        if not speaker_sample_path.is_absolute():
                            from backend.app.config import SPEAKER_DATA_BASE_DIR
                            speaker_sample_path = SPEAKER_DATA_BASE_DIR / speaker_sample_path

                        # Generate speech for this chunk
                        if self.use_subprocess:
                            # Use subprocess for guaranteed memory release
                            segment_output_path = await self._generate_chunk_subprocess(
                                chunk=chunk,
                                segment_filename_base=segment_filename_base,
                                speaker_sample_path=speaker_sample_path
                            )
                        else:
                            # Generate speech using the TTS service (model already loaded)
                            segment_output_path = await self.tts_service.generate_speech(
                                text=chunk,
                                speaker_id=self.speaker_id,
                                speaker_sample_path=str(speaker_sample_path),
                                output_filename_base=segment_filename_base,
                                output_dir=self.temp_dir,
                                exaggeration=self.exaggeration,
                                cfg_weight=self.cfg_weight,
                                temperature=self.temperature
                            )

                        # Add to segment results
                        segment_results.append({
                            "type": "speech",
                            "path": str(segment_output_path)
                        })

                        # Add pause between sentences
                        if chunk_idx < len(paragraph["chunks"]) - 1:
                            segment_results.append({
                                "type": "silence",
                                "duration": self.pause_between_sentences
                            })

                    except Exception as e:
                        print(f"Error generating speech for chunk: {e}")
                        # Continue with next chunk

                # Add longer pause between paragraphs
                if para_idx < len(paragraphs) - 1:
                    segment_results.append({
                        "type": "silence",
                        "duration": self.pause_between_paragraphs
                    })

        finally:
            # Optionally unload model at the end (singleton manages this efficiently)
            if not self.use_subprocess:
                print("Unloading TTS model...")
                self.tts_service.unload_model()

        # Concatenate all segments
        print("Concatenating audio segments...")
        concatenated_filename = f"{self.output_base_name}_audiobook.wav"
        concatenated_path = self.output_dir / concatenated_filename

        self.audio_manipulator.concatenate_audio_segments(
            segment_results=segment_results,
            output_concatenated_path=concatenated_path
        )

        # Create ZIP archive with all files
        print("Creating ZIP archive...")
        zip_filename = f"{self.output_base_name}_audiobook.zip"
        zip_path = self.output_dir / zip_filename

        # Collect all speech segment files
        speech_segment_paths = [
            Path(s["path"]) for s in segment_results
            if s["type"] == "speech" and Path(s["path"]).exists()
        ]

        self.audio_manipulator.create_zip_archive(
            segment_file_paths=speech_segment_paths,
            concatenated_audio_path=concatenated_path,
            output_zip_path=zip_path
        )

        print(f"Audiobook generation complete!")
        print(f"Audiobook file: {concatenated_path}")
        print(f"ZIP archive: {zip_path}")

        return concatenated_path

async def main():
    parser = argparse.ArgumentParser(description="Generate an audiobook from a text file using Chatterbox TTS")

    # Create a mutually exclusive group for the main operation vs listing speakers
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument("--list-speakers", action="store_true", help="List available speakers and exit")
    group.add_argument("text_file", nargs="?", help="Path to the text file to convert")

    # Other arguments
    parser.add_argument("--speaker", "-s", help="ID of the speaker to use")
    parser.add_argument("--output", "-o", help="Base name for output files (default: derived from text filename)")
    parser.add_argument("--device", default="mps", choices=["mps", "cuda", "cpu"], help="Device to use for TTS (default: mps)")
    parser.add_argument("--exaggeration", type=float, default=0.5, help="Controls expressiveness (0.0-1.0, default: 0.5)")
    parser.add_argument("--cfg-weight", type=float, default=0.5, help="Controls alignment with speaker (0.0-1.0, default: 0.5)")
    parser.add_argument("--temperature", type=float, default=0.8, help="Controls randomness (0.0-1.0, default: 0.8)")
    parser.add_argument("--sentence-pause", type=float, default=0.5, help="Pause between sentences in seconds (default: 0.5)")
    parser.add_argument("--paragraph-pause", type=float, default=1.0, help="Pause between paragraphs in seconds (default: 1.0)")
    parser.add_argument("--force-cpu-on-oom", action="store_true", help="Automatically switch to CPU if MPS/CUDA runs out of memory")
    parser.add_argument("--max-chunk-length", type=int, default=300, help="Maximum chunk length for text splitting (default: 300)")
    parser.add_argument("--use-subprocess", action="store_true", help="Use separate processes for each chunk (slower but reduces memory usage)")

    args = parser.parse_args()

    # List speakers if requested
    if args.list_speakers:
        speaker_service = SpeakerManagementService()
        speakers = speaker_service.get_speakers()
        print("Available speakers:")
        for speaker in speakers:
            print(f"  {speaker.id}: {speaker.name}")
        return

    # Validate required arguments for audiobook generation
    if not args.text_file:
        parser.error("text_file is required when not using --list-speakers")

    if not args.speaker:
        parser.error("--speaker/-s is required when not using --list-speakers")

    # Determine output base name if not provided
    if not args.output:
        text_path = Path(args.text_file)
        args.output = text_path.stem

    try:
        # Create audiobook generator
        generator = AudiobookGenerator(
            speaker_id=args.speaker,
            output_base_name=args.output,
            device=args.device,
            exaggeration=args.exaggeration,
            cfg_weight=args.cfg_weight,
            temperature=args.temperature,
            pause_between_sentences=args.sentence_pause,
            pause_between_paragraphs=args.paragraph_pause,
            use_subprocess=args.use_subprocess
        )

        # Generate audiobook with automatic fallback
        try:
            await generator.generate_audiobook(args.text_file)
        except (RuntimeError, torch.OutOfMemoryError) as e:
            if args.force_cpu_on_oom and "out of memory" in str(e).lower() and args.device != "cpu":
                print(f"\n⚠️ {args.device.upper()} out of memory: {e}")
                print("🔄 Automatically switching to CPU and retrying...")

                # Create new generator with CPU
                generator = AudiobookGenerator(
                    speaker_id=args.speaker,
                    output_base_name=args.output,
                    device="cpu",
                    exaggeration=args.exaggeration,
                    cfg_weight=args.cfg_weight,
                    temperature=args.temperature,
                    pause_between_sentences=args.sentence_pause,
                    pause_between_paragraphs=args.paragraph_pause,
                    use_subprocess=args.use_subprocess
                )

                await generator.generate_audiobook(args.text_file)
                print("✅ Successfully completed using CPU fallback!")
            else:
                raise

    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        return 1

    return 0

if __name__ == "__main__":
    sys.exit(asyncio.run(main()))
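The removed script's chunking step splits paragraphs into sentences with a regex before packing them into chunks of at most 300 characters. A tiny standalone check of that same pattern (the sample text here is invented) behaves as follows:

```python
import re

# The same sentence-splitting pattern used by split_text_into_chunks above.
SENTENCE_SPLIT = r'(?<=[.!?\u2026])\s+|(?<=[.!?\u2026])(?=[\"\')\]\}\u201d\u2019])|(?<=[.!?\u2026])$'

sample = "First sentence. Second one! A third, longer sentence follows? Done."
sentences = [s.strip() for s in re.split(SENTENCE_SPLIT, sample) if s and s.strip()]
print(sentences)
# ['First sentence.', 'Second one!', 'A third, longer sentence follows?', 'Done.']
```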
@@ -1,28 +0,0 @@
#!/usr/bin/env python3
"""
Import helper module for CLI scripts that need to import backend services.
This ensures the Python path is set up correctly to import from the backend directory.
"""

import sys
from pathlib import Path

# Add the project root to the Python path
PROJECT_ROOT = Path(__file__).parent.resolve()
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

# Add the backend directory to the Python path for app.* imports
BACKEND_ROOT = PROJECT_ROOT / "backend"
if str(BACKEND_ROOT) not in sys.path:
    sys.path.insert(0, str(BACKEND_ROOT))

# Verify that we can import from backend
try:
    from backend.app.config import PROJECT_ROOT as CONFIG_PROJECT_ROOT
    from app.services.tts_service import TTSService
    from app.services.speaker_service import SpeakerManagementService
except ImportError as e:
    print(f"Warning: Could not import backend services: {e}")
    print(f"Make sure you're running from the project root directory: {PROJECT_ROOT}")
    print(f"Backend directory: {BACKEND_ROOT}")
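A CLI script would use this helper by importing it before any backend imports, as cbx-audiobook.py does above. A minimal, hypothetical example script (not part of the repository):

```python
#!/usr/bin/env python3
"""Hypothetical CLI script that relies on import_helper to set up sys.path."""

import import_helper  # noqa: F401  # must run before backend imports

from backend.app.services.speaker_service import SpeakerManagementService

if __name__ == "__main__":
    # List the configured speakers, mirroring what cbx-audiobook.py --list-speakers does.
    service = SpeakerManagementService()
    for speaker in service.get_speakers():
        print(speaker.id, speaker.name)
```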
@@ -28,15 +28,3 @@ dd3552d9-f4e8-49ed-9892-f9e67afcf23c:
 2cdd6d3d-c533-44bf-a5f6-cc83bd089d32:
   name: Grace
   sample_path: speaker_samples/2cdd6d3d-c533-44bf-a5f6-cc83bd089d32.wav
-fdbfa71b-7647-4574-a1c0-31350348b434:
-  name: Elthea
-  sample_path: speaker_samples/fdbfa71b-7647-4574-a1c0-31350348b434.wav
-44cfc6c1-78ec-4278-920a-8ad067cd1eba:
-  name: Eddie
-  sample_path: speaker_samples/44cfc6c1-78ec-4278-920a-8ad067cd1eba.wav
-a25c52cc-ad56-46d2-9209-62fa7aebb150:
-  name: Charlotte
-  sample_path: speaker_samples/a25c52cc-ad56-46d2-9209-62fa7aebb150.wav
-aeb43113-586c-4ab8-86e6-3b26737b9816:
-  name: Announcer1
-  sample_path: speaker_samples/aeb43113-586c-4ab8-86e6-3b26737b9816.wav
@@ -1,4 +1,4 @@
-#!/Volumes/SAM2/CODE/chatterbox-test/.venv/bin/python
+#!/Users/stwhite/CODE/chatterbox-ui/.venv/bin/python
 """
 Startup script that launches both the backend and frontend servers concurrently.
 """