From b781d8abcf6bc8b775c885d8639c79f3eb920ee6 Mon Sep 17 00:00:00 2001 From: Steve White Date: Thu, 5 Jun 2025 09:20:19 -0500 Subject: [PATCH] Updated note directory- gradio interface working. --- .gitignore | 3 + .note/code_structure.md | 32 +++++++++ .note/current_focus.md | 19 ++++++ .note/decision_log.md | 22 +++++++ .note/detailed_migration_plan.md | 94 ++++++++++++++++++++++++++ .note/development_standards.md | 21 ++++++ .note/interfaces.md | 88 +++++++++++++++++++++++++ .note/project_overview.md | 42 ++++++++++++ .note/session_log.md | 26 ++++++++ storage_service.py | 110 +++++++++++++++++++++++++++++++ 10 files changed, 457 insertions(+) create mode 100644 .note/code_structure.md create mode 100644 .note/current_focus.md create mode 100644 .note/decision_log.md create mode 100644 .note/detailed_migration_plan.md create mode 100644 .note/development_standards.md create mode 100644 .note/interfaces.md create mode 100644 .note/project_overview.md create mode 100644 .note/session_log.md create mode 100644 storage_service.py diff --git a/.gitignore b/.gitignore index c4acb3b..86ac632 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,6 @@ output*.wav *.mp3 dialog_output/ *.zip +.DS_Store +__pycache__ +projects/ diff --git a/.note/code_structure.md b/.note/code_structure.md new file mode 100644 index 0000000..8aaf143 --- /dev/null +++ b/.note/code_structure.md @@ -0,0 +1,32 @@ +# Code Structure + +*(This document will describe the organization of the codebase as it evolves.)* + +## Current (Gradio-based - to be migrated) +- `gradio_app.py`: Main application logic for the Gradio UI. +- `requirements.txt`: Python dependencies. +- `speaker_samples/`: Directory for speaker audio samples. +- `speakers.yaml`: Configuration for speakers. +- `single_output/`: Output directory for single utterance TTS. +- `dialog_output/`: Output directory for dialog TTS. 
+ +## Planned (FastAPI + Vanilla JS) + +### Backend (FastAPI - Python) +- `main.py`: FastAPI application entry point, router setup. +- `api/`: Directory for API endpoint modules (e.g., `tts_routes.py`, `speaker_routes.py`). +- `core/`: Core logic (e.g., TTS processing, dialog assembly, file management). +- `models/`: Pydantic models for request/response validation. +- `services/`: Business logic services (e.g., `TTSService`, `DialogService`). +- `static/` (or served via CDN): For frontend files if not using a separate frontend server during development. + +### Frontend (Vanilla JavaScript) +- `index.html`: Main HTML file. +- `css/`: Stylesheets. + - `style.css` +- `js/`: JavaScript files. + - `app.js`: Main application logic. + - `api.js`: Functions for interacting with the FastAPI backend. + - `uiComponents.js`: Reusable UI components (e.g., DialogLine, AudioPlayer). + - `state.js`: Frontend state management (if needed). +- `assets/`: Static assets like images or icons. diff --git a/.note/current_focus.md b/.note/current_focus.md new file mode 100644 index 0000000..20c3b58 --- /dev/null +++ b/.note/current_focus.md @@ -0,0 +1,19 @@ +# Current Focus + +**Date:** 2025-06-05 + +**Primary Goal:** Initiate the migration of the Chatterbox TTS dialog generator from Gradio to a vanilla JavaScript frontend and FastAPI backend. + +**Recent Accomplishments:** + +- Set up the `.note/` Memory Bank directory and essential files. +- Reviewed `gradio_app.py` to understand existing dialog generation logic. +- Developed a detailed, phased plan for re-implementing the dialog generation functionality with FastAPI and Vanilla JS. This plan has been saved to `.note/detailed_migration_plan.md`. + +**Current Task:** + +- Awaiting your feedback on the detailed migration plan (see `.note/detailed_migration_plan.md`). + +**Next Steps (pending your approval of plan):** +- Begin Phase 1: Backend API Development (FastAPI). 
+ - Task 1.1: Project Setup (FastAPI project structure, `requirements.txt`). diff --git a/.note/decision_log.md b/.note/decision_log.md new file mode 100644 index 0000000..f25002c --- /dev/null +++ b/.note/decision_log.md @@ -0,0 +1,22 @@ +# Decision Log + +This log records key decisions made throughout the project, along with their rationale. + +--- + +**Date:** 2025-06-05 +**Decision ID:** 20250605-001 +**Decision:** Adopt the `.note/` Memory Bank system for project documentation and context management. +**Rationale:** As per user's global development standards (MEMORY[user_global]) to ensure persistent knowledge and effective collaboration, especially given potential agent memory resets. +**Impact:** Creation of standard `.note/` files (`project_overview.md`, `current_focus.md`, etc.). All significant project information, decisions, and progress will be logged here. + +--- + +**Date:** 2025-06-05 +**Decision ID:** 20250605-002 +**Decision:** Created a detailed migration plan for moving from Gradio to FastAPI & Vanilla JS. +**Rationale:** Based on a thorough review of `gradio_app.py` and the user's request, a detailed, phased plan was necessary to guide development. This incorporates key findings about TTS model management, text processing, and output requirements. +**Impact:** The plan is stored in `.note/detailed_migration_plan.md`. `current_focus.md` has been updated to reflect this. Development will follow this plan upon user approval. 
+**Related Memory:** MEMORY[b82cdf38-f0b9-45cd-8097-5b1b47030a40] (System memory of the plan) + +--- diff --git a/.note/detailed_migration_plan.md b/.note/detailed_migration_plan.md new file mode 100644 index 0000000..170639b --- /dev/null +++ b/.note/detailed_migration_plan.md @@ -0,0 +1,94 @@ +# Chatterbox TTS: Gradio to FastAPI & Vanilla JS Migration Plan + +This plan outlines the steps to re-implement the dialog generation features of the Chatterbox TTS application, moving from the current Gradio-based implementation to a FastAPI backend and a vanilla JavaScript frontend. It incorporates findings from `gradio_app.py` and aligns with the existing high-level strategy (MEMORY[c20c2cce-46d4-453f-9bc3-c18e05dbc66f]). + +### 1. Backend (FastAPI) Development + +**Objective:** Create a robust API to handle TTS generation, speaker management, and file delivery. + +**Key Modules/Components:** + +* **API Endpoints:** + * `POST /api/dialog/generate`: + * **Input**: Structured list: `[{type: "speech", speaker_id: "str", text: "str"}, {type: "silence", duration: float}]`, `output_base_name: str`. + * **Output**: JSON with `log: str`, `concatenated_audio_url: str`, `zip_archive_url: str`. + * `GET /api/speakers`: Returns list of available speakers (`[{id: "str", name: "str", sample_path: "str"}]`). + * `POST /api/speakers`: Adds a new speaker. Input: `name: str`, `audio_sample_file: UploadFile`. Output: `{id: "str", name: "str", message: "str"}`. + * `DELETE /api/speakers/{speaker_id}`: Removes a speaker. +* **Core Logic & Services:** + * `TTSService`: + * Manages `ChatterboxTTS` model instance(s) (loading, inference, memory cleanup). + * Handles `ChatterboxTTS.generate()` calls, incorporating parameters like `exaggeration`, `cfg_weight`, `temperature` (decision needed on exposure vs. defaults). + * Implements rigorous memory management (inspired by `generate_audio` and `process_dialog`'s `reinit_each_line` concept). 
+ * `DialogProcessorService`: + * Orchestrates dialog generation using `TTSService`. + * Implements `split_text_at_sentence_boundaries` logic for long text inputs. + * Manages generation of individual audio segments. + * `AudioManipulationService`: + * Concatenates audio segments using `torch` and `torchaudio`, inserting specified silences. + * Creates ZIP archives of all generated audio files using `zipfile`. + * `SpeakerManagementService`: + * Manages `speakers.yaml` (or alternative storage) for speaker metadata. + * Handles storage and retrieval of speaker audio samples (e.g., in `speaker_samples/`). +* **File Handling:** + * Strategy for storing and serving generated `.wav` and `.zip` files (e.g., FastAPI `StaticFiles`, temporary directories, or cloud storage). + +**Implementation Steps (Phase 1):** + +1. **Project Setup:** Initialize FastAPI project, define dependencies (`fastapi`, `uvicorn`, `python-multipart`, `pyyaml`, `torch`, `torchaudio`, `chatterbox-tts`). +2. **Speaker Management:** Implement `SpeakerManagementService` and the `/api/speakers` endpoints. +3. **TTS Core:** Develop `TTSService`, focusing on model loading, inference, and critical memory management. +4. **Dialog Processing:** Implement `DialogProcessorService` including text splitting. +5. **Audio Utilities:** Create `AudioManipulationService` for concatenation and zipping. +6. **Main Endpoint:** Implement `POST /api/dialog/generate` orchestrating the services. +7. **Configuration:** Manage paths (`speakers.yaml`, sample storage, output directories) and TTS settings. +8. **Testing:** Thoroughly test all API endpoints using tools like Postman or `curl`. + +### 2. Frontend (Vanilla JavaScript) Development + +**Objective:** Create an intuitive UI for dialog construction, speaker management, and interaction with the backend. + +**Key Modules/Components:** + +* **HTML (`index.html`):** Structure for dialog editor, speaker controls, results display. 
+* **CSS (`style.css`):** Styling for a clean and usable interface. +* **JavaScript (`app.js`, `api.js`, `ui.js`):** + * `api.js`: Functions for all backend API communications (`fetch`). + * `ui.js`: DOM manipulation for dynamic dialog lines, speaker lists, and results rendering. + * `app.js`: Main application logic, event handling, state management (for dialog lines, speaker data). + +**Implementation Steps (Phase 2):** + +1. **Basic Layout:** Create `index.html` and `style.css`. +2. **API Client:** Develop `api.js` to interface with all backend endpoints. +3. **Speaker UI:** + * Fetch and display speakers using `ui.js` and `api.js`. + * Implement forms and logic for adding (with file upload) and removing speakers. +4. **Dialog Editor UI:** + * Dynamically add/remove/reorder dialog lines (speech/silence). + * Inputs for speaker selection (populated from API), text, and silence duration. + * Input for `output_base_name`. +5. **Interaction & Results:** + * "Generate Dialog" button to submit data via `api.js`. + * Display generation log, audio player for concatenated output, and download link for ZIP file. + +### 3. Integration & Testing (Phase 3) + +1. **Full System Connection:** Ensure seamless frontend-backend communication. +2. **End-to-End Testing:** Test various dialog scenarios, speaker configurations, and error conditions. +3. **Performance & Memory:** Profile backend memory usage during generation; refine `TTSService` memory strategies if needed. +4. **UX Refinement:** Iterate on UI/UX based on testing feedback. + +### 4. Advanced Features & Deployment (Phase 4) + +* (As per MEMORY[c20c2cce-46d4-453f-9bc3-c18e05dbc66f]) +* **Real-time Updates:** Consider WebSockets for live progress during generation. +* **Deployment Strategy:** Plan for deploying the FastAPI application and serving the static frontend assets. + +### Key Considerations from `gradio_app.py` Analysis: + +* **Memory Management for TTS Model:** This is critical. 
The `reinit_each_line` option and explicit cleanup in `generate_audio` highlight this. The FastAPI backend must handle this robustly. +* **Text Chunking:** The `split_text_at_sentence_boundaries` (max 300 chars) logic is essential and must be replicated. +* **Dialog Parsing:** The `Speaker: "Text"` and `Silence: duration` format should be the basis for the frontend data structure sent to the backend. +* **TTS Parameters:** Decide whether to expose advanced TTS parameters (`exaggeration`, `cfg_weight`, `temperature`) for dialog lines in the new API. +* **File Output:** The backend needs to replicate the generation of individual segment files, a concatenated file, and a ZIP archive. diff --git a/.note/development_standards.md b/.note/development_standards.md new file mode 100644 index 0000000..d228ecb --- /dev/null +++ b/.note/development_standards.md @@ -0,0 +1,21 @@ +# Development Standards + +*(To be defined. This document will outline coding conventions, patterns, and best practices for the project.)* + +## General Principles +- **Clarity and Readability:** Code should be easy to understand and maintain. +- **Modularity:** Design components with clear responsibilities and interfaces. +- **Testability:** Write code that is easily testable. + +## Python (FastAPI Backend) +- Follow PEP 8 style guidelines. +- Use type hints. +- Structure API endpoints logically. + +## JavaScript (Vanilla JS Frontend) +- Follow modern JavaScript best practices (ES6+). +- Organize code into modules. +- Prioritize performance and responsiveness. + +## Commit Messages +- Follow conventional commit message format (e.g., `feat: add new TTS feature`, `fix: resolve audio playback bug`). 
diff --git a/.note/interfaces.md b/.note/interfaces.md new file mode 100644 index 0000000..ab5123e --- /dev/null +++ b/.note/interfaces.md @@ -0,0 +1,88 @@ +# Component Interfaces + +*(This document will define the interfaces between different components of the system, especially between the frontend and backend.)* + +## Backend API (FastAPI) + +*(To be detailed. Examples below)* + +### `/api/tts/generate_single` (POST) +- **Request Body:** + ```json + { + "text": "string", + "speaker_id": "string", + "temperature": "float (optional)", + "length_penalty": "float (optional)" + } + ``` +- **Response Body (Success):** + ```json + { + "audio_url": "string (URL to the generated audio file)", + "duration_ms": "integer" + } + ``` +- **Response Body (Error):** + ```json + { + "detail": "string (error message)" + } + ``` + +### `/api/tts/generate_dialog` (POST) +- **Request Body:** + ```json + { + "dialog_lines": [ + { + "type": "speech", // or "silence" + "speaker_id": "string (required if type is speech)", + "text": "string (required if type is speech)", + "duration_s": "float (required if type is silence)" + } + ], + "output_base_name": "string (optional)" + } + ``` +- **Response Body (Success):** + ```json + { + "dialog_audio_url": "string (URL to the concatenated dialog audio file)", + "individual_files_zip_url": "string (URL to zip of individual lines)", + "total_duration_ms": "integer" + } + ``` + +### `/api/speakers` (GET) +- **Response Body (Success):** + ```json + [ + { + "id": "string", + "name": "string", + "sample_url": "string (optional)" + } + ] + ``` + +### `/api/speakers` (POST) +- **Request Body:** (Multipart form-data) + - `name`: "string" + - `audio_sample`: file (WAV) +- **Response Body (Success):** + ```json + { + "id": "string", + "name": "string", + "message": "Speaker added successfully" + } + ``` + +## Frontend Components (Vanilla JS) + +*(To be detailed as frontend development progresses.)* + +- **DialogLine Component:** Manages input for a single 
line of dialog (speaker, text). +- **AudioPlayer Component:** Handles playback of generated audio. +- **ProjectManager Component:** Manages overall project state, dialog lines, and interaction with the backend. diff --git a/.note/project_overview.md b/.note/project_overview.md new file mode 100644 index 0000000..1028773 --- /dev/null +++ b/.note/project_overview.md @@ -0,0 +1,42 @@ +# Project Overview: Chatterbox TTS Application Migration + +## 1. Current System + +The project is currently a Gradio-based application named "Chatterbox TTS Gradio App". +Its primary function is to provide a user interface for text-to-speech (TTS) generation using the Chatterbox TTS model. + +Key features of the current Gradio application include: +- Single utterance TTS generation. +- Multi-speaker dialog generation with configurable silence gaps. +- Speaker management (adding/removing speakers with custom audio samples). +- Automatic memory optimization (model cleanup after generation). +- Organized output file storage (`single_output/` and `dialog_output/`). + +## 2. Project Goal: Migration to Modern Web Stack + +The primary goal of this project is to re-implement the Chatterbox TTS application, specifically its dialog generation capabilities, by migrating from the current Gradio framework to a new architecture. + +The new architecture will consist of: +- **Frontend**: Vanilla JavaScript +- **Backend**: FastAPI (Python) + +This migration aims to address limitations of the Gradio framework, such as audio playback issues, limited UI control, and state management complexity, and to provide a more robust, performant, and professional user experience. + +## 3. High-Level Plan & Existing Documentation + +A comprehensive implementation plan for this migration already exists and should be consulted. This plan (Memory ID c20c2cce-46d4-453f-9bc3-c18e05dbc66f) outlines: +- A 4-phase implementation (Backend API, Frontend Development, Integration & Testing, Production Features). 
+- The complete technical architecture. +- A detailed component system (DialogLine, AudioPlayer, ProjectManager). +- Features like real-time status updates and drag-and-drop functionality. +- Migration strategies. +- Expected benefits (e.g., faster responsiveness, better audio reliability). +- An estimated timeline. + +## 4. Scope of Current Work + +The immediate next step, as requested by the user, is to: +1. Review the existing `gradio_app.py`. +2. Refine or detail the plan for re-implementing the dialog generation functionality with the new stack, leveraging the existing comprehensive plan. + +This document will be updated as the project progresses to reflect new decisions, architectural changes, and milestones. diff --git a/.note/session_log.md b/.note/session_log.md new file mode 100644 index 0000000..1bfd8fa --- /dev/null +++ b/.note/session_log.md @@ -0,0 +1,26 @@ +# Session Log + +--- +**Session Start:** 2025-06-05 + +**Goal:** Initiate migration of Chatterbox TTS dialog generator from Gradio to Vanilla JS + FastAPI. + +**Key Activities & Insights:** +- User requested review of `gradio_app.py` and a plan for re-implementation. +- Checked for `.note/` Memory Bank directory (MEMORY[user_global]). +- Directory not found. +- Read `README.md` to gather project context. +- Created `.note/` directory and populated standard files: + - `project_overview.md` (with initial content based on README and user request). + - `current_focus.md` (outlining immediate tasks). + - `development_standards.md` (template). + - `decision_log.md` (logged decision to use Memory Bank). + - `code_structure.md` (initial thoughts on current and future structure). + - `session_log.md` (this entry). + - `interfaces.md` (template). + +**Next Steps:** +- Confirm Memory Bank setup with the user. +- Proceed to review `gradio_app.py`. 
class ProjectStorage:
    """Persist dialog projects as one JSON file per project.

    Each project lives at ``<storage_dir>/<project id>.json``. Every public
    method is a coroutine and reports failure through its return value
    (``False`` / ``None``) after printing the error, rather than raising.

    NOTE: the file I/O inside these coroutines is synchronous, so awaiting
    them blocks the event loop for the duration of the read or write.

    Args:
        storage_dir: Directory that holds the project files. It is created,
            including any missing parent directories, on construction.
        audio_dir: Directory that :meth:`load_project` checks to decide
            whether a line's generated audio file still exists.
    """

    def __init__(self, storage_dir: str = "projects", audio_dir: str = "dialog_output"):
        self.storage_dir = Path(storage_dir)
        # parents=True so a nested storage location works on first run.
        self.storage_dir.mkdir(parents=True, exist_ok=True)
        self.audio_dir = Path(audio_dir)

    def _project_path(self, project_id: str) -> Path:
        """Return the JSON file path for *project_id*.

        Raises:
            ValueError: If the id is empty or is not a plain file name (it
                contains a path separator, or is ``.`` / ``..``). Such ids
                would let the resulting path escape ``storage_dir``.
        """
        name = str(project_id)
        is_plain_name = (
            bool(name)
            and name not in {".", ".."}
            and "\\" not in name
            and Path(name).name == name
        )
        if not is_plain_name:
            raise ValueError(f"Invalid project id: {project_id!r}")
        return self.storage_dir / f"{name}.json"

    async def save_project(self, project: "DialogProject") -> bool:
        """Write *project* to its JSON file.

        ``last_modified`` is always set to the current time. ``created_at``
        is set to the current time when missing/empty, and converted to an
        ISO string when it is a ``datetime``.

        Returns:
            ``True`` on success, ``False`` if anything went wrong (the error
            is printed).
        """
        try:
            project_file = self._project_path(project.id)

            project_data = project.dict()
            now = datetime.now().isoformat()
            project_data["last_modified"] = now

            created_at = project_data.get("created_at")
            if not created_at:
                project_data["created_at"] = now
            elif isinstance(created_at, datetime):
                # json cannot serialize datetime objects directly.
                project_data["created_at"] = created_at.isoformat()

            # Write to a sibling temp file and swap it in, so a failed dump
            # never truncates a previously saved project. The ".tmp" suffix
            # keeps the file out of list_projects()'s "*.json" glob.
            tmp_file = project_file.with_name(project_file.name + ".tmp")
            try:
                with open(tmp_file, "w", encoding="utf-8") as f:
                    json.dump(project_data, f, indent=2, ensure_ascii=False)
                os.replace(tmp_file, project_file)
            except Exception:
                if tmp_file.exists():
                    tmp_file.unlink()
                raise

            return True
        except Exception as e:  # best-effort API: callers expect a bool
            print(f"Error saving project {project.id}: {e}")
            return False

    async def load_project(self, project_id: str) -> Optional["DialogProject"]:
        """Load a project from its JSON file.

        Lines whose ``audio_url`` points at a file that is no longer present
        in ``audio_dir`` get ``audio_url`` reset to ``None`` and ``status``
        set to ``"pending"`` before the model is built.

        Returns:
            The ``DialogProject``, or ``None`` if the file does not exist or
            could not be loaded (the error is printed).
        """
        try:
            project_file = self._project_path(project_id)

            if not project_file.exists():
                return None

            with open(project_file, "r", encoding="utf-8") as f:
                project_data = json.load(f)

            for line in project_data.get("lines") or []:
                audio_url = line.get("audio_url")
                if not audio_url:
                    continue
                # Only the last URL segment (the file name) is used.
                audio_path = self.audio_dir / audio_url.split("/")[-1]
                # is_file(): a URL ending in "/" resolves to the directory
                # itself, which must not count as existing audio.
                if not audio_path.is_file():
                    line["audio_url"] = None
                    line["status"] = "pending"

            return DialogProject(**project_data)
        except Exception as e:  # best-effort API: callers expect None on failure
            print(f"Error loading project {project_id}: {e}")
            return None

    async def list_projects(self) -> List[dict]:
        """Return metadata for every readable project, newest first.

        Each entry has the keys ``id``, ``name``, ``created_at``,
        ``last_modified``, ``line_count`` and ``has_audio``. Files that
        cannot be read or lack ``id``/``name`` are reported and skipped.
        """
        projects = []

        for project_file in self.storage_dir.glob("*.json"):
            try:
                with open(project_file, "r", encoding="utf-8") as f:
                    project_data = json.load(f)

                lines = project_data.get("lines") or []
                projects.append({
                    "id": project_data["id"],
                    "name": project_data["name"],
                    "created_at": project_data.get("created_at"),
                    "last_modified": project_data.get("last_modified"),
                    "line_count": len(lines),
                    "has_audio": any(line.get("audio_url") for line in lines),
                })
            except Exception as e:  # one bad file must not hide the others
                print(f"Error reading project file {project_file}: {e}")
                continue

        # "last_modified" is always present but may be None; coerce to a
        # string so mixed None/str values cannot raise TypeError in sort.
        projects.sort(key=lambda p: str(p["last_modified"] or ""), reverse=True)
        return projects

    async def delete_project(self, project_id: str) -> bool:
        """Delete a saved project.

        Returns:
            ``True`` if the file was removed, ``False`` if it did not exist
            or could not be removed (errors other than "not found" are
            printed).
        """
        try:
            self._project_path(project_id).unlink()
            return True
        except FileNotFoundError:
            return False
        except (OSError, ValueError) as e:
            print(f"Error deleting project {project_id}: {e}")
            return False

    async def project_exists(self, project_id: str) -> bool:
        """Return whether a project file exists for *project_id*.

        Invalid ids (see :meth:`_project_path`) are reported as not existing.
        """
        try:
            return self._project_path(project_id).exists()
        except ValueError:
            return False