Implement Phase 1 of Report Generation Module: Document Scraping and SQLite Storage
parent a34b92c103
commit 60f78dab9c
@@ -0,0 +1,19 @@
"""
Report generation module for the intelligent research system.

This module provides functionality to generate reports from search results
by scraping documents, storing them in a database, and synthesizing them
into a comprehensive report.
"""

from report.report_generator import get_report_generator, initialize_report_generator
from report.document_scraper import get_document_scraper
from report.database.db_manager import get_db_manager, initialize_database

__all__ = [
    'get_report_generator',
    'initialize_report_generator',
    'get_document_scraper',
    'get_db_manager',
    'initialize_database'
]
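For orientation, this package-level API is meant to be consumed roughly as follows. This is a minimal sketch, not part of the commit; it assumes the project root is on the import path and that the default database location is writable.

# --- Illustrative usage (not part of this commit) ---
import asyncio

from report import get_report_generator, initialize_report_generator

async def main():
    # Create the SQLite schema before any scraping or querying.
    await initialize_report_generator()
    generator = get_report_generator()

    # Query whatever documents have already been stored.
    documents = await generator.search_documents("web scraping", limit=5)
    print(f"Found {len(documents)} stored documents")

if __name__ == "__main__":
    asyncio.run(main())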
@@ -0,0 +1,14 @@
"""
Database module for the report generation module.

This module provides functionality to create, manage, and query the SQLite database
for storing scraped documents and their metadata.
"""

from report.database.db_manager import get_db_manager, initialize_database, DBManager

__all__ = [
    'get_db_manager',
    'initialize_database',
    'DBManager'
]
@@ -0,0 +1,393 @@
"""
SQLite database manager for the report generation module.

This module provides functionality to create, manage, and query the SQLite database
for storing scraped documents and their metadata.
"""

import os
import json
import aiosqlite
import asyncio
import logging
from datetime import datetime
from typing import Dict, List, Any, Optional, Tuple, Union

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

class DBManager:
    """
    Database manager for the report generation module.

    This class provides methods to create, manage, and query the SQLite database
    for storing scraped documents and their metadata.
    """

    def __init__(self, db_path: str = "report/database/documents.db"):
        """
        Initialize the database manager.

        Args:
            db_path: Path to the SQLite database file
        """
        self.db_path = db_path
        self._ensure_dir_exists()

    def _ensure_dir_exists(self):
        """Ensure the directory for the database file exists."""
        db_dir = os.path.dirname(self.db_path)
        # db_dir is empty when db_path is a bare filename; nothing to create then.
        if db_dir and not os.path.exists(db_dir):
            os.makedirs(db_dir)
            logger.info(f"Created directory: {db_dir}")

    async def initialize_db(self):
        """
        Initialize the database by creating necessary tables if they don't exist.

        This method creates the documents and metadata tables.
        """
        async with aiosqlite.connect(self.db_path) as db:
            # Create documents table
            await db.execute('''
                CREATE TABLE IF NOT EXISTS documents (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    url TEXT UNIQUE NOT NULL,
                    title TEXT,
                    content TEXT NOT NULL,
                    scrape_date TIMESTAMP NOT NULL,
                    content_type TEXT,
                    token_count INTEGER,
                    hash TEXT UNIQUE
                )
            ''')

            # Create metadata table
            await db.execute('''
                CREATE TABLE IF NOT EXISTS metadata (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    document_id INTEGER NOT NULL,
                    key TEXT NOT NULL,
                    value TEXT,
                    FOREIGN KEY (document_id) REFERENCES documents (id) ON DELETE CASCADE,
                    UNIQUE (document_id, key)
                )
            ''')

            # Create index on url for faster lookups
            await db.execute('CREATE INDEX IF NOT EXISTS idx_documents_url ON documents (url)')

            # Create index on document_id for faster metadata lookups
            await db.execute('CREATE INDEX IF NOT EXISTS idx_metadata_document_id ON metadata (document_id)')

            await db.commit()
            logger.info("Database initialized successfully")

    async def document_exists(self, url: str) -> bool:
        """
        Check if a document with the given URL already exists in the database.

        Args:
            url: URL of the document to check

        Returns:
            True if the document exists, False otherwise
        """
        async with aiosqlite.connect(self.db_path) as db:
            db.row_factory = aiosqlite.Row
            cursor = await db.execute('SELECT id FROM documents WHERE url = ?', (url,))
            result = await cursor.fetchone()
            return result is not None

    async def get_document_by_url(self, url: str) -> Optional[Dict[str, Any]]:
        """
        Get a document by its URL.

        Args:
            url: URL of the document to retrieve

        Returns:
            Document as a dictionary, or None if not found
        """
        async with aiosqlite.connect(self.db_path) as db:
            db.row_factory = aiosqlite.Row
            cursor = await db.execute('''
                SELECT id, url, title, content, scrape_date, content_type, token_count, hash
                FROM documents
                WHERE url = ?
            ''', (url,))

            document = await cursor.fetchone()
            if not document:
                return None

            # Convert to dictionary
            doc_dict = dict(document)

            # Get metadata
            cursor = await db.execute('''
                SELECT key, value
                FROM metadata
                WHERE document_id = ?
            ''', (doc_dict['id'],))

            metadata = await cursor.fetchall()
            doc_dict['metadata'] = {row['key']: row['value'] for row in metadata}

            return doc_dict

    async def add_document(self, url: str, title: str, content: str,
                           content_type: str, token_count: int,
                           metadata: Dict[str, str], doc_hash: str) -> int:
        """
        Add a document to the database.

        Args:
            url: URL of the document
            title: Title of the document
            content: Content of the document
            content_type: Type of content (e.g., 'markdown', 'html', 'text')
            token_count: Number of tokens in the document
            metadata: Dictionary of metadata key-value pairs
            doc_hash: Hash of the document content for deduplication

        Returns:
            ID of the added document

        Raises:
            aiosqlite.Error: If there's an error adding the document
        """
        async with aiosqlite.connect(self.db_path) as db:
            try:
                # Begin transaction
                await db.execute('BEGIN TRANSACTION')

                # Insert document
                cursor = await db.execute('''
                    INSERT INTO documents (url, title, content, scrape_date, content_type, token_count, hash)
                    VALUES (?, ?, ?, ?, ?, ?, ?)
                ''', (url, title, content, datetime.now().isoformat(), content_type, token_count, doc_hash))

                document_id = cursor.lastrowid

                # Insert metadata
                for key, value in metadata.items():
                    await db.execute('''
                        INSERT INTO metadata (document_id, key, value)
                        VALUES (?, ?, ?)
                    ''', (document_id, key, value))

                # Commit transaction
                await db.commit()
                logger.info(f"Added document: {url} (ID: {document_id})")
                return document_id

            except aiosqlite.Error as e:
                # Rollback transaction on error
                await db.execute('ROLLBACK')
                logger.error(f"Error adding document: {str(e)}")
                raise

    async def update_document(self, document_id: int, content: str = None,
                              title: str = None, token_count: int = None,
                              metadata: Dict[str, str] = None) -> bool:
        """
        Update an existing document in the database.

        Args:
            document_id: ID of the document to update
            content: New content (optional)
            title: New title (optional)
            token_count: New token count (optional)
            metadata: New or updated metadata (optional)

        Returns:
            True if the document was updated, False otherwise

        Raises:
            aiosqlite.Error: If there's an error updating the document
        """
        async with aiosqlite.connect(self.db_path) as db:
            try:
                # Begin transaction
                await db.execute('BEGIN TRANSACTION')

                # Update document fields if provided
                update_parts = []
                params = []

                if content is not None:
                    update_parts.append("content = ?")
                    params.append(content)

                if title is not None:
                    update_parts.append("title = ?")
                    params.append(title)

                if token_count is not None:
                    update_parts.append("token_count = ?")
                    params.append(token_count)

                if update_parts:
                    update_query = f"UPDATE documents SET {', '.join(update_parts)} WHERE id = ?"
                    params.append(document_id)
                    await db.execute(update_query, params)

                # Update metadata if provided
                if metadata:
                    for key, value in metadata.items():
                        # Check if metadata key exists
                        cursor = await db.execute('''
                            SELECT id FROM metadata
                            WHERE document_id = ? AND key = ?
                        ''', (document_id, key))

                        result = await cursor.fetchone()

                        if result:
                            # Update existing metadata
                            await db.execute('''
                                UPDATE metadata SET value = ?
                                WHERE document_id = ? AND key = ?
                            ''', (value, document_id, key))
                        else:
                            # Insert new metadata
                            await db.execute('''
                                INSERT INTO metadata (document_id, key, value)
                                VALUES (?, ?, ?)
                            ''', (document_id, key, value))

                # Commit transaction
                await db.commit()
                logger.info(f"Updated document ID: {document_id}")
                return True

            except aiosqlite.Error as e:
                # Rollback transaction on error
                await db.execute('ROLLBACK')
                logger.error(f"Error updating document: {str(e)}")
                raise

    async def delete_document(self, document_id: int) -> bool:
        """
        Delete a document from the database.

        Args:
            document_id: ID of the document to delete

        Returns:
            True if the document was deleted, False otherwise
        """
        async with aiosqlite.connect(self.db_path) as db:
            try:
                # SQLite disables foreign key enforcement by default; enable it so
                # the ON DELETE CASCADE on metadata actually fires.
                await db.execute('PRAGMA foreign_keys = ON')

                # Begin transaction
                await db.execute('BEGIN TRANSACTION')

                # Delete document (metadata will be deleted via ON DELETE CASCADE)
                cursor = await db.execute('DELETE FROM documents WHERE id = ?', (document_id,))
                deleted = cursor.rowcount > 0

                # Commit transaction
                await db.commit()
                logger.info(f"Deleted document ID: {document_id}")
                return deleted

            except aiosqlite.Error as e:
                # Rollback transaction on error
                await db.execute('ROLLBACK')
                logger.error(f"Error deleting document: {str(e)}")
                return False

    async def search_documents(self, query: str, limit: int = 10, offset: int = 0) -> List[Dict[str, Any]]:
        """
        Search for documents matching the query.

        Args:
            query: Search query (will be matched against title and content)
            limit: Maximum number of results to return
            offset: Number of results to skip

        Returns:
            List of matching documents as dictionaries
        """
        async with aiosqlite.connect(self.db_path) as db:
            db.row_factory = aiosqlite.Row

            # Search documents
            cursor = await db.execute('''
                SELECT id, url, title, content, scrape_date, content_type, token_count
                FROM documents
                WHERE title LIKE ? OR content LIKE ?
                ORDER BY scrape_date DESC
                LIMIT ? OFFSET ?
            ''', (f'%{query}%', f'%{query}%', limit, offset))

            documents = await cursor.fetchall()
            results = []

            # Get metadata for each document
            for doc in documents:
                doc_dict = dict(doc)

                cursor = await db.execute('''
                    SELECT key, value
                    FROM metadata
                    WHERE document_id = ?
                ''', (doc_dict['id'],))

                metadata = await cursor.fetchall()
                doc_dict['metadata'] = {row['key']: row['value'] for row in metadata}

                results.append(doc_dict)

            return results

    async def get_documents_by_urls(self, urls: List[str]) -> List[Dict[str, Any]]:
        """
        Get multiple documents by their URLs.

        Args:
            urls: List of URLs to retrieve

        Returns:
            List of documents as dictionaries
        """
        results = []
        for url in urls:
            doc = await self.get_document_by_url(url)
            if doc:
                results.append(doc)
        return results

    async def count_documents(self) -> int:
        """
        Get the total number of documents in the database.

        Returns:
            Number of documents
        """
        async with aiosqlite.connect(self.db_path) as db:
            cursor = await db.execute('SELECT COUNT(*) as count FROM documents')
            result = await cursor.fetchone()
            return result[0] if result else 0


# Create a singleton instance for global use
db_manager = DBManager()

async def initialize_database():
    """Initialize the database."""
    await db_manager.initialize_db()

def get_db_manager() -> DBManager:
    """
    Get the global database manager instance.

    Returns:
        DBManager instance
    """
    return db_manager

# Run database initialization if this module is executed directly
if __name__ == "__main__":
    asyncio.run(initialize_database())
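A minimal sketch of how the DBManager API above can be exercised directly. This is illustrative only, not part of the commit; the document values and URL are made up, and the default db_path under report/database/ is assumed to be writable.

# --- Illustrative usage (not part of this commit) ---
import asyncio

from report.database.db_manager import get_db_manager, initialize_database

async def main():
    await initialize_database()
    db = get_db_manager()

    # Hypothetical document values, just to show the call shape.
    doc_id = await db.add_document(
        url="https://example.com/article",
        title="Example article",
        content="# Example\n\nSome markdown content.",
        content_type="markdown",
        token_count=7,
        metadata={"source_url": "https://example.com/article"},
        doc_hash="0" * 64,
    )
    print(f"Stored document {doc_id}")

    doc = await db.get_document_by_url("https://example.com/article")
    print(doc["title"], doc["metadata"])

    print("Total documents:", await db.count_documents())

if __name__ == "__main__":
    asyncio.run(main())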
Binary file not shown.
@@ -0,0 +1,400 @@
"""
Document scraper module for the report generation module.

This module provides functionality to scrape web pages and extract clean content
using Jina Reader API or fallback methods.
"""

import os
import re
import json
import hashlib
import logging
import asyncio
import aiohttp
import validators
import tiktoken
from typing import Dict, List, Any, Optional, Tuple, Union
from datetime import datetime
from urllib.parse import urlparse, urljoin
from bs4 import BeautifulSoup
import html2text

from config.config import get_config
from report.database.db_manager import get_db_manager, DBManager

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

class DocumentScraper:
    """
    Document scraper for the report generation module.

    This class provides methods to scrape web pages and extract clean content
    using Jina Reader API or fallback methods.
    """

    def __init__(self):
        """Initialize the document scraper."""
        self.config = get_config()
        self.api_key = self._get_api_key()
        self.endpoint = "https://api.jina.ai/v1/reader"
        self.db_manager = get_db_manager()
        self.tokenizer = tiktoken.get_encoding("cl100k_base")  # Using OpenAI's tokenizer

    def _get_api_key(self) -> str:
        """
        Get the Jina AI API key.

        Returns:
            The API key as a string

        Raises:
            ValueError: If the API key is not found
        """
        try:
            return self.config.get_api_key('jina')
        except ValueError as e:
            logger.warning(f"Jina AI API key not found. Fallback methods will be used. {str(e)}")
            return ""

    def _count_tokens(self, text: str) -> int:
        """
        Count the number of tokens in a text.

        Args:
            text: The text to count tokens for

        Returns:
            Number of tokens in the text
        """
        return len(self.tokenizer.encode(text))

    def _compute_hash(self, content: str) -> str:
        """
        Compute a hash of the document content for deduplication.

        Args:
            content: The document content

        Returns:
            Hash of the content
        """
        return hashlib.sha256(content.encode('utf-8')).hexdigest()

    def _normalize_url(self, url: str) -> str:
        """
        Normalize a URL by removing fragments and unnecessary query parameters.

        Args:
            url: The URL to normalize

        Returns:
            Normalized URL
        """
        parsed = urlparse(url)
        # Remove fragment
        normalized = parsed._replace(fragment="")

        # TODO: Add more normalization rules if needed

        return normalized.geturl()

    def _validate_url(self, url: str) -> bool:
        """
        Validate a URL.

        Args:
            url: The URL to validate

        Returns:
            True if the URL is valid, False otherwise
        """
        return validators.url(url) is True

    async def _extract_metadata_from_html(self, html: str, url: str) -> Dict[str, str]:
        """
        Extract metadata from HTML content.

        Args:
            html: The HTML content
            url: The URL of the page

        Returns:
            Dictionary of metadata
        """
        metadata = {
            "source_url": url,
            "scrape_date": datetime.now().isoformat()
        }

        try:
            soup = BeautifulSoup(html, 'html.parser')

            # Extract title
            if soup.title:
                metadata["title"] = soup.title.string

            # Extract meta tags
            for meta in soup.find_all('meta'):
                # Author
                if meta.get('name') and meta.get('name').lower() == 'author' and meta.get('content'):
                    metadata["author"] = meta.get('content')

                # Description
                if meta.get('name') and meta.get('name').lower() == 'description' and meta.get('content'):
                    metadata["description"] = meta.get('content')

                # Keywords
                if meta.get('name') and meta.get('name').lower() == 'keywords' and meta.get('content'):
                    metadata["keywords"] = meta.get('content')

                # Publication date
                if meta.get('property') and meta.get('property').lower() in ['article:published_time', 'og:published_time'] and meta.get('content'):
                    metadata["publication_date"] = meta.get('content')

                # Open Graph data
                if meta.get('property') and meta.get('property').lower().startswith('og:') and meta.get('content'):
                    og_key = meta.get('property').lower().replace('og:', 'og_')
                    metadata[og_key] = meta.get('content')

            # Extract structured data (JSON-LD)
            for script in soup.find_all('script', type='application/ld+json'):
                try:
                    ld_data = json.loads(script.string)
                    if isinstance(ld_data, dict):
                        # Extract date published
                        if ld_data.get('@type') in ['Article', 'NewsArticle', 'BlogPosting'] and ld_data.get('datePublished'):
                            metadata["publication_date"] = ld_data.get('datePublished')

                        # Extract author
                        if ld_data.get('author'):
                            author = ld_data.get('author')
                            if isinstance(author, dict) and author.get('name'):
                                metadata["author"] = author.get('name')
                            elif isinstance(author, str):
                                metadata["author"] = author
                except (json.JSONDecodeError, AttributeError):
                    pass

        except Exception as e:
            logger.warning(f"Error extracting metadata: {str(e)}")

        return metadata

    async def _html_to_markdown(self, html: str) -> str:
        """
        Convert HTML to Markdown.

        Args:
            html: The HTML content

        Returns:
            Markdown content
        """
        converter = html2text.HTML2Text()
        converter.ignore_links = False
        converter.ignore_images = False
        converter.ignore_tables = False
        converter.body_width = 0  # No wrapping

        return converter.handle(html)

    async def _scrape_with_jina_reader(self, url: str) -> Tuple[Optional[str], Optional[Dict[str, str]]]:
        """
        Scrape a web page using Jina Reader API.

        Args:
            url: The URL to scrape

        Returns:
            Tuple of (content, metadata)
        """
        if not self.api_key:
            logger.warning("Jina API key not available. Using fallback method.")
            return None, None

        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {self.api_key}",
            "Accept": "application/json"
        }

        data = {
            "url": url,
            "format": "markdown"  # Request markdown format
        }

        try:
            async with aiohttp.ClientSession() as session:
                async with session.post(self.endpoint, headers=headers, json=data) as response:
                    if response.status != 200:
                        logger.warning(f"Jina Reader API error: {response.status} - {await response.text()}")
                        return None, None

                    result = await response.json()

                    if "content" not in result:
                        logger.warning(f"Jina Reader API returned no content: {result}")
                        return None, None

                    content = result.get("content", "")
                    metadata = result.get("metadata", {})

                    # Add source URL to metadata
                    metadata["source_url"] = url

                    return content, metadata

        except Exception as e:
            logger.error(f"Error calling Jina Reader API: {str(e)}")
            return None, None

    async def _scrape_with_fallback(self, url: str) -> Tuple[Optional[str], Optional[Dict[str, str]]]:
        """
        Scrape a web page using fallback method (aiohttp + BeautifulSoup).

        Args:
            url: The URL to scrape

        Returns:
            Tuple of (content, metadata)
        """
        try:
            async with aiohttp.ClientSession() as session:
                async with session.get(url, headers={"User-Agent": "Mozilla/5.0"}) as response:
                    if response.status != 200:
                        logger.warning(f"Failed to fetch URL: {url} - Status: {response.status}")
                        return None, None

                    html = await response.text()

                    # Extract metadata
                    metadata = await self._extract_metadata_from_html(html, url)

                    # Convert to markdown
                    content = await self._html_to_markdown(html)

                    return content, metadata

        except Exception as e:
            logger.error(f"Error in fallback scraping: {str(e)}")
            return None, None

    async def scrape_url(self, url: str, force_refresh: bool = False) -> Optional[Dict[str, Any]]:
        """
        Scrape a web page and store the content in the database.

        Args:
            url: The URL to scrape
            force_refresh: If True, scrape the URL even if it's already in the database

        Returns:
            Document dictionary if successful, None otherwise
        """
        # Validate URL
        if not self._validate_url(url):
            logger.warning(f"Invalid URL: {url}")
            return None

        # Normalize URL
        normalized_url = self._normalize_url(url)

        # Check if document already exists in database
        if not force_refresh and await self.db_manager.document_exists(normalized_url):
            logger.info(f"Document already exists in database: {normalized_url}")
            return await self.db_manager.get_document_by_url(normalized_url)

        # Try Jina Reader first
        content, metadata = await self._scrape_with_jina_reader(normalized_url)

        # Fallback to custom scraping if Jina Reader fails
        if content is None:
            logger.info(f"Falling back to custom scraping for URL: {normalized_url}")
            content, metadata = await self._scrape_with_fallback(normalized_url)

        if content is None or not content.strip():
            logger.warning(f"Failed to extract content from URL: {normalized_url}")
            return None

        # Count tokens
        token_count = self._count_tokens(content)

        # Compute hash for deduplication
        doc_hash = self._compute_hash(content)

        # Get title from metadata or use URL as fallback
        title = metadata.get("title", urlparse(normalized_url).netloc)

        # Store in database
        try:
            # If the document already exists (force_refresh case), update it in place
            # instead of inserting, so the UNIQUE constraint on url is not violated.
            existing = await self.db_manager.get_document_by_url(normalized_url)
            if existing:
                await self.db_manager.update_document(
                    document_id=existing['id'],
                    content=content,
                    title=title,
                    token_count=token_count,
                    metadata=metadata
                )
            else:
                await self.db_manager.add_document(
                    url=normalized_url,
                    title=title,
                    content=content,
                    content_type="markdown",
                    token_count=token_count,
                    metadata=metadata,
                    doc_hash=doc_hash
                )

            # Return the document
            return await self.db_manager.get_document_by_url(normalized_url)

        except Exception as e:
            logger.error(f"Error storing document in database: {str(e)}")
            return None

    async def scrape_urls(self, urls: List[str], force_refresh: bool = False) -> List[Dict[str, Any]]:
        """
        Scrape multiple URLs in parallel.

        Args:
            urls: List of URLs to scrape
            force_refresh: If True, scrape URLs even if they're already in the database

        Returns:
            List of document dictionaries
        """
        tasks = [self.scrape_url(url, force_refresh) for url in urls]
        results = await asyncio.gather(*tasks)

        # Filter out None results
        return [doc for doc in results if doc is not None]


# Create a singleton instance for global use
document_scraper = DocumentScraper()

def get_document_scraper() -> DocumentScraper:
    """
    Get the global document scraper instance.

    Returns:
        DocumentScraper instance
    """
    return document_scraper

# Example usage
async def test_scraper():
    """Test the document scraper with a sample URL."""
    from report.database.db_manager import initialize_database

    # Initialize database
    await initialize_database()

    # Scrape a URL
    scraper = get_document_scraper()
    document = await scraper.scrape_url("https://en.wikipedia.org/wiki/Web_scraping")

    if document:
        print(f"Successfully scraped document: {document['title']}")
        print(f"Token count: {document['token_count']}")
        print(f"Content preview: {document['content'][:500]}...")
    else:
        print("Failed to scrape document")

# Run test if this module is executed directly
if __name__ == "__main__":
    asyncio.run(test_scraper())
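The scrape_url flow above deduplicates by normalized URL: a repeat call is served from SQLite unless force_refresh=True, in which case the stored row is refreshed. A small sketch of that behaviour follows; it is illustrative, not part of the commit, and assumes network access plus a working config.config.get_config() (the DocumentScraper singleton is built at import time).

# --- Illustrative usage (not part of this commit) ---
import asyncio

from report.database.db_manager import initialize_database
from report.document_scraper import get_document_scraper

async def main():
    await initialize_database()
    scraper = get_document_scraper()

    url = "https://en.wikipedia.org/wiki/Web_scraping"

    # First call scrapes the page (Jina Reader if a key is configured,
    # otherwise the aiohttp + BeautifulSoup fallback) and stores it.
    first = await scraper.scrape_url(url)

    # Second call is answered from SQLite: the normalized URL already exists.
    cached = await scraper.scrape_url(url)

    # force_refresh=True re-scrapes and refreshes the stored row.
    refreshed = await scraper.scrape_url(url, force_refresh=True)

    for doc in (first, cached, refreshed):
        if doc:
            print(doc["id"], doc["title"], doc["token_count"])

if __name__ == "__main__":
    asyncio.run(main())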
@@ -0,0 +1,130 @@
"""
Report generator module for the intelligent research system.

This module provides functionality to generate reports from search results
by scraping documents, storing them in a database, and synthesizing them
into a comprehensive report.
"""

import os
import asyncio
import logging
from typing import Dict, List, Any, Optional, Tuple, Union

from report.database.db_manager import get_db_manager, initialize_database
from report.document_scraper import get_document_scraper

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

class ReportGenerator:
    """
    Report generator for the intelligent research system.

    This class provides methods to generate reports from search results
    by scraping documents, storing them in a database, and synthesizing them
    into a comprehensive report.
    """

    def __init__(self):
        """Initialize the report generator."""
        self.db_manager = get_db_manager()
        self.document_scraper = get_document_scraper()

    async def initialize(self):
        """Initialize the report generator by setting up the database."""
        await initialize_database()
        logger.info("Report generator initialized")

    async def process_search_results(self, search_results: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """
        Process search results by scraping the URLs and storing them in the database.

        Args:
            search_results: List of search results, each containing at least a 'url' field

        Returns:
            List of processed documents
        """
        # Extract URLs from search results
        urls = [result.get('url') for result in search_results if result.get('url')]

        # Scrape URLs and store in database
        documents = await self.document_scraper.scrape_urls(urls)

        # Log results
        logger.info(f"Processed {len(documents)} documents out of {len(urls)} URLs")

        return documents

    async def get_document_by_url(self, url: str) -> Optional[Dict[str, Any]]:
        """
        Get a document by its URL.

        Args:
            url: URL of the document

        Returns:
            Document as a dictionary, or None if not found
        """
        return await self.db_manager.get_document_by_url(url)

    async def search_documents(self, query: str, limit: int = 10) -> List[Dict[str, Any]]:
        """
        Search for documents in the database.

        Args:
            query: Search query
            limit: Maximum number of results to return

        Returns:
            List of matching documents
        """
        return await self.db_manager.search_documents(query, limit)


# Create a singleton instance for global use
report_generator = ReportGenerator()

async def initialize_report_generator():
    """Initialize the report generator."""
    await report_generator.initialize()

def get_report_generator() -> ReportGenerator:
    """
    Get the global report generator instance.

    Returns:
        ReportGenerator instance
    """
    return report_generator

# Example usage
async def test_report_generator():
    """Test the report generator with sample search results."""
    # Initialize report generator
    await initialize_report_generator()

    # Sample search results
    search_results = [
        {"url": "https://en.wikipedia.org/wiki/Web_scraping", "title": "Web scraping - Wikipedia"},
        {"url": "https://en.wikipedia.org/wiki/Natural_language_processing", "title": "Natural language processing - Wikipedia"}
    ]

    # Process search results
    generator = get_report_generator()
    documents = await generator.process_search_results(search_results)

    # Print results
    print(f"Processed {len(documents)} documents")
    for doc in documents:
        print(f"Title: {doc['title']}")
        print(f"URL: {doc['url']}")
        print(f"Token count: {doc['token_count']}")
        print(f"Content preview: {doc['content'][:200]}...")
        print("-" * 80)

# Run test if this module is executed directly
if __name__ == "__main__":
    asyncio.run(test_report_generator())
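The only contract process_search_results relies on is that each search result is a dict with a 'url' key; entries without one are skipped. The sketch below makes that explicit. It is illustrative, not part of the commit, and the search-result entries are made up.

# --- Illustrative usage (not part of this commit) ---
import asyncio

from report.report_generator import get_report_generator, initialize_report_generator

async def main():
    await initialize_report_generator()
    generator = get_report_generator()

    # Hypothetical upstream search results: only the 'url' key matters here.
    search_results = [
        {"url": "https://en.wikipedia.org/wiki/SQLite", "title": "SQLite - Wikipedia"},
        {"title": "A result without a URL is silently skipped"},
    ]

    documents = await generator.process_search_results(search_results)
    print(f"Stored {len(documents)} of {len(search_results)} results")

    # The stored corpus can then be queried ahead of the synthesis step
    # planned for a later phase.
    for doc in await generator.search_documents("SQLite", limit=5):
        print(doc["url"], doc["token_count"])

if __name__ == "__main__":
    asyncio.run(main())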
@@ -5,3 +5,11 @@ litellm>=1.0.0
gradio>=4.0.0
pyyaml>=6.0
python-dotenv>=1.0.0
beautifulsoup4>=4.12.0
aiosqlite>=0.19.0
asyncio>=3.4.3
aiohttp>=3.9.0
validators>=0.22.0
markdown>=3.5.0
html2text>=2020.1.16
feedparser>=6.0.10
@@ -0,0 +1,3 @@
"""
Test modules for the intelligent research system.
"""
@@ -0,0 +1,82 @@
"""
Test script for the document scraper module.

This script tests the functionality of the document scraper module
by scraping a few sample URLs and storing them in the database.
"""

import os
import sys
import asyncio
import logging
from typing import List, Dict, Any

# Add parent directory to path to allow importing modules
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from report.database.db_manager import initialize_database, get_db_manager
from report.document_scraper import get_document_scraper

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Sample URLs for testing
TEST_URLS = [
    "https://en.wikipedia.org/wiki/Web_scraping",
    "https://en.wikipedia.org/wiki/Natural_language_processing",
    "https://en.wikipedia.org/wiki/SQLite"
]

async def test_document_scraper():
    """Test the document scraper with sample URLs."""
    # Initialize database
    await initialize_database()
    logger.info("Database initialized")

    # Get document scraper
    scraper = get_document_scraper()

    # Scrape URLs
    logger.info(f"Scraping {len(TEST_URLS)} URLs...")
    documents = await scraper.scrape_urls(TEST_URLS)

    # Print results
    logger.info(f"Successfully scraped {len(documents)} documents")
    for doc in documents:
        logger.info(f"Title: {doc['title']}")
        logger.info(f"URL: {doc['url']}")
        logger.info(f"Token count: {doc['token_count']}")
        logger.info(f"Content preview: {doc['content'][:200]}...")
        logger.info("-" * 80)

    # Test database search
    db_manager = get_db_manager()
    search_results = await db_manager.search_documents("scraping")
    logger.info(f"Found {len(search_results)} documents matching 'scraping'")

    # Test document retrieval by URL
    doc = await db_manager.get_document_by_url(TEST_URLS[0])
    if doc:
        logger.info(f"Retrieved document by URL: {doc['title']}")
    else:
        logger.error(f"Failed to retrieve document by URL: {TEST_URLS[0]}")

    # Count documents in database
    count = await db_manager.count_documents()
    logger.info(f"Total documents in database: {count}")

    return True

if __name__ == "__main__":
    try:
        success = asyncio.run(test_document_scraper())
        if success:
            logger.info("All tests passed!")
            sys.exit(0)
        else:
            logger.error("Tests failed!")
            sys.exit(1)
    except Exception as e:
        logger.exception(f"Error running tests: {str(e)}")
        sys.exit(1)
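The script above is a standalone check run via asyncio.run. If the project later adopts pytest, an equivalent test could look roughly like the sketch below; it assumes pytest and pytest-asyncio, neither of which is in requirements.txt as of this commit, and it still needs network access like the original script.

# --- Sketch only (not part of this commit): the same checks as a pytest-asyncio test ---
# Assumes `pytest` and `pytest-asyncio` are installed.
import pytest

from report.database.db_manager import get_db_manager, initialize_database
from report.document_scraper import get_document_scraper

@pytest.mark.asyncio
async def test_scrape_and_store():
    await initialize_database()
    scraper = get_document_scraper()

    docs = await scraper.scrape_urls(["https://en.wikipedia.org/wiki/SQLite"])
    assert docs, "expected at least one scraped document"
    assert docs[0]["token_count"] > 0

    db = get_db_manager()
    assert await db.count_documents() >= 1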