diff --git a/report/__init__.py b/report/__init__.py new file mode 100644 index 0000000..43b1c4e --- /dev/null +++ b/report/__init__.py @@ -0,0 +1,19 @@ +""" +Report generation module for the intelligent research system. + +This module provides functionality to generate reports from search results +by scraping documents, storing them in a database, and synthesizing them +into a comprehensive report. +""" + +from report.report_generator import get_report_generator, initialize_report_generator +from report.document_scraper import get_document_scraper +from report.database.db_manager import get_db_manager, initialize_database + +__all__ = [ + 'get_report_generator', + 'initialize_report_generator', + 'get_document_scraper', + 'get_db_manager', + 'initialize_database' +] diff --git a/report/database/__init__.py b/report/database/__init__.py new file mode 100644 index 0000000..5bc29f5 --- /dev/null +++ b/report/database/__init__.py @@ -0,0 +1,14 @@ +""" +Database module for the report generation module. + +This module provides functionality to create, manage, and query the SQLite database +for storing scraped documents and their metadata. +""" + +from report.database.db_manager import get_db_manager, initialize_database, DBManager + +__all__ = [ + 'get_db_manager', + 'initialize_database', + 'DBManager' +] diff --git a/report/database/db_manager.py b/report/database/db_manager.py new file mode 100644 index 0000000..45d2951 --- /dev/null +++ b/report/database/db_manager.py @@ -0,0 +1,393 @@ +""" +SQLite database manager for the report generation module. + +This module provides functionality to create, manage, and query the SQLite database +for storing scraped documents and their metadata. +""" + +import os +import json +import aiosqlite +import asyncio +import logging +from datetime import datetime +from typing import Dict, List, Any, Optional, Tuple, Union + +# Configure logging +logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') +logger = logging.getLogger(__name__) + +class DBManager: + """ + Database manager for the report generation module. + + This class provides methods to create, manage, and query the SQLite database + for storing scraped documents and their metadata. + """ + + def __init__(self, db_path: str = "report/database/documents.db"): + """ + Initialize the database manager. + + Args: + db_path: Path to the SQLite database file + """ + self.db_path = db_path + self._ensure_dir_exists() + + def _ensure_dir_exists(self): + """Ensure the directory for the database file exists.""" + db_dir = os.path.dirname(self.db_path) + if not os.path.exists(db_dir): + os.makedirs(db_dir) + logger.info(f"Created directory: {db_dir}") + + async def initialize_db(self): + """ + Initialize the database by creating necessary tables if they don't exist. + + This method creates the documents and metadata tables. 
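+ Note: the metadata table declares ON DELETE CASCADE, which SQLite only enforces when PRAGMA foreign_keys = ON has been issued on the connection.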
+ """ + async with aiosqlite.connect(self.db_path) as db: + # Create documents table + await db.execute(''' + CREATE TABLE IF NOT EXISTS documents ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + url TEXT UNIQUE NOT NULL, + title TEXT, + content TEXT NOT NULL, + scrape_date TIMESTAMP NOT NULL, + content_type TEXT, + token_count INTEGER, + hash TEXT UNIQUE + ) + ''') + + # Create metadata table + await db.execute(''' + CREATE TABLE IF NOT EXISTS metadata ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + document_id INTEGER NOT NULL, + key TEXT NOT NULL, + value TEXT, + FOREIGN KEY (document_id) REFERENCES documents (id) ON DELETE CASCADE, + UNIQUE (document_id, key) + ) + ''') + + # Create index on url for faster lookups + await db.execute('CREATE INDEX IF NOT EXISTS idx_documents_url ON documents (url)') + + # Create index on document_id for faster metadata lookups + await db.execute('CREATE INDEX IF NOT EXISTS idx_metadata_document_id ON metadata (document_id)') + + await db.commit() + logger.info("Database initialized successfully") + + async def document_exists(self, url: str) -> bool: + """ + Check if a document with the given URL already exists in the database. + + Args: + url: URL of the document to check + + Returns: + True if the document exists, False otherwise + """ + async with aiosqlite.connect(self.db_path) as db: + db.row_factory = aiosqlite.Row + cursor = await db.execute('SELECT id FROM documents WHERE url = ?', (url,)) + result = await cursor.fetchone() + return result is not None + + async def get_document_by_url(self, url: str) -> Optional[Dict[str, Any]]: + """ + Get a document by its URL. + + Args: + url: URL of the document to retrieve + + Returns: + Document as a dictionary, or None if not found + """ + async with aiosqlite.connect(self.db_path) as db: + db.row_factory = aiosqlite.Row + cursor = await db.execute(''' + SELECT id, url, title, content, scrape_date, content_type, token_count, hash + FROM documents + WHERE url = ? + ''', (url,)) + + document = await cursor.fetchone() + if not document: + return None + + # Convert to dictionary + doc_dict = dict(document) + + # Get metadata + cursor = await db.execute(''' + SELECT key, value + FROM metadata + WHERE document_id = ? + ''', (doc_dict['id'],)) + + metadata = await cursor.fetchall() + doc_dict['metadata'] = {row['key']: row['value'] for row in metadata} + + return doc_dict + + async def add_document(self, url: str, title: str, content: str, + content_type: str, token_count: int, + metadata: Dict[str, str], doc_hash: str) -> int: + """ + Add a document to the database. + + Args: + url: URL of the document + title: Title of the document + content: Content of the document + content_type: Type of content (e.g., 'markdown', 'html', 'text') + token_count: Number of tokens in the document + metadata: Dictionary of metadata key-value pairs + doc_hash: Hash of the document content for deduplication + + Returns: + ID of the added document + + Raises: + aiosqlite.Error: If there's an error adding the document + """ + async with aiosqlite.connect(self.db_path) as db: + try: + # Begin transaction + await db.execute('BEGIN TRANSACTION') + + # Insert document + cursor = await db.execute(''' + INSERT INTO documents (url, title, content, scrape_date, content_type, token_count, hash) + VALUES (?, ?, ?, ?, ?, ?, ?) 
+ ''', (url, title, content, datetime.now().isoformat(), content_type, token_count, doc_hash)) + + document_id = cursor.lastrowid + + # Insert metadata + for key, value in metadata.items(): + await db.execute(''' + INSERT INTO metadata (document_id, key, value) + VALUES (?, ?, ?) + ''', (document_id, key, value)) + + # Commit transaction + await db.commit() + logger.info(f"Added document: {url} (ID: {document_id})") + return document_id + + except aiosqlite.Error as e: + # Rollback transaction on error + await db.execute('ROLLBACK') + logger.error(f"Error adding document: {str(e)}") + raise + + async def update_document(self, document_id: int, content: str = None, + title: str = None, token_count: int = None, + metadata: Dict[str, str] = None) -> bool: + """ + Update an existing document in the database. + + Args: + document_id: ID of the document to update + content: New content (optional) + title: New title (optional) + token_count: New token count (optional) + metadata: New or updated metadata (optional) + + Returns: + True if the document was updated, False otherwise + + Raises: + aiosqlite.Error: If there's an error updating the document + """ + async with aiosqlite.connect(self.db_path) as db: + try: + # Begin transaction + await db.execute('BEGIN TRANSACTION') + + # Update document fields if provided + update_parts = [] + params = [] + + if content is not None: + update_parts.append("content = ?") + params.append(content) + + if title is not None: + update_parts.append("title = ?") + params.append(title) + + if token_count is not None: + update_parts.append("token_count = ?") + params.append(token_count) + + if update_parts: + update_query = f"UPDATE documents SET {', '.join(update_parts)} WHERE id = ?" + params.append(document_id) + await db.execute(update_query, params) + + # Update metadata if provided + if metadata: + for key, value in metadata.items(): + # Check if metadata key exists + cursor = await db.execute(''' + SELECT id FROM metadata + WHERE document_id = ? AND key = ? + ''', (document_id, key)) + + result = await cursor.fetchone() + + if result: + # Update existing metadata + await db.execute(''' + UPDATE metadata SET value = ? + WHERE document_id = ? AND key = ? + ''', (value, document_id, key)) + else: + # Insert new metadata + await db.execute(''' + INSERT INTO metadata (document_id, key, value) + VALUES (?, ?, ?) + ''', (document_id, key, value)) + + # Commit transaction + await db.commit() + logger.info(f"Updated document ID: {document_id}") + return True + + except aiosqlite.Error as e: + # Rollback transaction on error + await db.execute('ROLLBACK') + logger.error(f"Error updating document: {str(e)}") + raise + + async def delete_document(self, document_id: int) -> bool: + """ + Delete a document from the database. 
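+ Note: returns True whenever the DELETE statement executes, even if no row matched the given ID; False is returned only when a database error occurs.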
+ + Args: + document_id: ID of the document to delete + + Returns: + True if the document was deleted, False otherwise + """ + async with aiosqlite.connect(self.db_path) as db: + try: + # Begin transaction + await db.execute('BEGIN TRANSACTION') + + # Delete document (metadata will be deleted via ON DELETE CASCADE) + await db.execute('DELETE FROM documents WHERE id = ?', (document_id,)) + + # Commit transaction + await db.commit() + logger.info(f"Deleted document ID: {document_id}") + return True + + except aiosqlite.Error as e: + # Rollback transaction on error + await db.execute('ROLLBACK') + logger.error(f"Error deleting document: {str(e)}") + return False + + async def search_documents(self, query: str, limit: int = 10, offset: int = 0) -> List[Dict[str, Any]]: + """ + Search for documents matching the query. + + Args: + query: Search query (will be matched against title and content) + limit: Maximum number of results to return + offset: Number of results to skip + + Returns: + List of matching documents as dictionaries + """ + async with aiosqlite.connect(self.db_path) as db: + db.row_factory = aiosqlite.Row + + # Search documents + cursor = await db.execute(''' + SELECT id, url, title, content, scrape_date, content_type, token_count + FROM documents + WHERE title LIKE ? OR content LIKE ? + ORDER BY scrape_date DESC + LIMIT ? OFFSET ? + ''', (f'%{query}%', f'%{query}%', limit, offset)) + + documents = await cursor.fetchall() + results = [] + + # Get metadata for each document + for doc in documents: + doc_dict = dict(doc) + + cursor = await db.execute(''' + SELECT key, value + FROM metadata + WHERE document_id = ? + ''', (doc_dict['id'],)) + + metadata = await cursor.fetchall() + doc_dict['metadata'] = {row['key']: row['value'] for row in metadata} + + results.append(doc_dict) + + return results + + async def get_documents_by_urls(self, urls: List[str]) -> List[Dict[str, Any]]: + """ + Get multiple documents by their URLs. + + Args: + urls: List of URLs to retrieve + + Returns: + List of documents as dictionaries + """ + results = [] + for url in urls: + doc = await self.get_document_by_url(url) + if doc: + results.append(doc) + return results + + async def count_documents(self) -> int: + """ + Get the total number of documents in the database. + + Returns: + Number of documents + """ + async with aiosqlite.connect(self.db_path) as db: + cursor = await db.execute('SELECT COUNT(*) as count FROM documents') + result = await cursor.fetchone() + return result[0] if result else 0 + + +# Create a singleton instance for global use +db_manager = DBManager() + +async def initialize_database(): + """Initialize the database.""" + await db_manager.initialize_db() + +def get_db_manager() -> DBManager: + """ + Get the global database manager instance. + + Returns: + DBManager instance + """ + return db_manager + +# Run database initialization if this module is executed directly +if __name__ == "__main__": + asyncio.run(initialize_database()) diff --git a/report/database/documents.db b/report/database/documents.db new file mode 100644 index 0000000..54edcfa Binary files /dev/null and b/report/database/documents.db differ diff --git a/report/document_scraper.py b/report/document_scraper.py new file mode 100644 index 0000000..d2000f2 --- /dev/null +++ b/report/document_scraper.py @@ -0,0 +1,400 @@ +""" +Document scraper module for the report generation module. + +This module provides functionality to scrape web pages and extract clean content +using Jina Reader API or fallback methods. 
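+ Scraped pages are converted to Markdown, hashed for deduplication, and stored through the SQLite database manager.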
+""" + +import os +import re +import json +import hashlib +import logging +import asyncio +import aiohttp +import validators +import tiktoken +from typing import Dict, List, Any, Optional, Tuple, Union +from datetime import datetime +from urllib.parse import urlparse, urljoin +from bs4 import BeautifulSoup +import html2text + +from config.config import get_config +from report.database.db_manager import get_db_manager, DBManager + +# Configure logging +logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') +logger = logging.getLogger(__name__) + +class DocumentScraper: + """ + Document scraper for the report generation module. + + This class provides methods to scrape web pages and extract clean content + using Jina Reader API or fallback methods. + """ + + def __init__(self): + """Initialize the document scraper.""" + self.config = get_config() + self.api_key = self._get_api_key() + self.endpoint = "https://api.jina.ai/v1/reader" + self.db_manager = get_db_manager() + self.tokenizer = tiktoken.get_encoding("cl100k_base") # Using OpenAI's tokenizer + + def _get_api_key(self) -> str: + """ + Get the Jina AI API key. + + Returns: + The API key as a string + + Raises: + ValueError: If the API key is not found + """ + try: + return self.config.get_api_key('jina') + except ValueError as e: + logger.warning(f"Jina AI API key not found. Fallback methods will be used. {str(e)}") + return "" + + def _count_tokens(self, text: str) -> int: + """ + Count the number of tokens in a text. + + Args: + text: The text to count tokens for + + Returns: + Number of tokens in the text + """ + return len(self.tokenizer.encode(text)) + + def _compute_hash(self, content: str) -> str: + """ + Compute a hash of the document content for deduplication. + + Args: + content: The document content + + Returns: + Hash of the content + """ + return hashlib.sha256(content.encode('utf-8')).hexdigest() + + def _normalize_url(self, url: str) -> str: + """ + Normalize a URL by removing fragments and unnecessary query parameters. + + Args: + url: The URL to normalize + + Returns: + Normalized URL + """ + parsed = urlparse(url) + # Remove fragment + normalized = parsed._replace(fragment="") + + # TODO: Add more normalization rules if needed + + return normalized.geturl() + + def _validate_url(self, url: str) -> bool: + """ + Validate a URL. + + Args: + url: The URL to validate + + Returns: + True if the URL is valid, False otherwise + """ + return validators.url(url) is True + + async def _extract_metadata_from_html(self, html: str, url: str) -> Dict[str, str]: + """ + Extract metadata from HTML content. 
+ + Args: + html: The HTML content + url: The URL of the page + + Returns: + Dictionary of metadata + """ + metadata = { + "source_url": url, + "scrape_date": datetime.now().isoformat() + } + + try: + soup = BeautifulSoup(html, 'html.parser') + + # Extract title + if soup.title: + metadata["title"] = soup.title.string + + # Extract meta tags + for meta in soup.find_all('meta'): + # Author + if meta.get('name') and meta.get('name').lower() == 'author' and meta.get('content'): + metadata["author"] = meta.get('content') + + # Description + if meta.get('name') and meta.get('name').lower() == 'description' and meta.get('content'): + metadata["description"] = meta.get('content') + + # Keywords + if meta.get('name') and meta.get('name').lower() == 'keywords' and meta.get('content'): + metadata["keywords"] = meta.get('content') + + # Publication date + if meta.get('property') and meta.get('property').lower() in ['article:published_time', 'og:published_time'] and meta.get('content'): + metadata["publication_date"] = meta.get('content') + + # Open Graph data + if meta.get('property') and meta.get('property').lower().startswith('og:') and meta.get('content'): + og_key = meta.get('property').lower().replace('og:', 'og_') + metadata[og_key] = meta.get('content') + + # Extract structured data (JSON-LD) + for script in soup.find_all('script', type='application/ld+json'): + try: + ld_data = json.loads(script.string) + if isinstance(ld_data, dict): + # Extract date published + if ld_data.get('@type') in ['Article', 'NewsArticle', 'BlogPosting'] and ld_data.get('datePublished'): + metadata["publication_date"] = ld_data.get('datePublished') + + # Extract author + if ld_data.get('author'): + author = ld_data.get('author') + if isinstance(author, dict) and author.get('name'): + metadata["author"] = author.get('name') + elif isinstance(author, str): + metadata["author"] = author + except (json.JSONDecodeError, AttributeError): + pass + + except Exception as e: + logger.warning(f"Error extracting metadata: {str(e)}") + + return metadata + + async def _html_to_markdown(self, html: str) -> str: + """ + Convert HTML to Markdown. + + Args: + html: The HTML content + + Returns: + Markdown content + """ + converter = html2text.HTML2Text() + converter.ignore_links = False + converter.ignore_images = False + converter.ignore_tables = False + converter.body_width = 0 # No wrapping + + return converter.handle(html) + + async def _scrape_with_jina_reader(self, url: str) -> Tuple[Optional[str], Optional[Dict[str, str]]]: + """ + Scrape a web page using Jina Reader API. + + Args: + url: The URL to scrape + + Returns: + Tuple of (content, metadata) + """ + if not self.api_key: + logger.warning("Jina API key not available. 
Using fallback method.") + return None, None + + headers = { + "Content-Type": "application/json", + "Authorization": f"Bearer {self.api_key}", + "Accept": "application/json" + } + + data = { + "url": url, + "format": "markdown" # Request markdown format + } + + try: + async with aiohttp.ClientSession() as session: + async with session.post(self.endpoint, headers=headers, json=data) as response: + if response.status != 200: + logger.warning(f"Jina Reader API error: {response.status} - {await response.text()}") + return None, None + + result = await response.json() + + if "content" not in result: + logger.warning(f"Jina Reader API returned no content: {result}") + return None, None + + content = result.get("content", "") + metadata = result.get("metadata", {}) + + # Add source URL to metadata + metadata["source_url"] = url + + return content, metadata + + except Exception as e: + logger.error(f"Error calling Jina Reader API: {str(e)}") + return None, None + + async def _scrape_with_fallback(self, url: str) -> Tuple[Optional[str], Optional[Dict[str, str]]]: + """ + Scrape a web page using fallback method (aiohttp + BeautifulSoup). + + Args: + url: The URL to scrape + + Returns: + Tuple of (content, metadata) + """ + try: + async with aiohttp.ClientSession() as session: + async with session.get(url, headers={"User-Agent": "Mozilla/5.0"}) as response: + if response.status != 200: + logger.warning(f"Failed to fetch URL: {url} - Status: {response.status}") + return None, None + + html = await response.text() + + # Extract metadata + metadata = await self._extract_metadata_from_html(html, url) + + # Convert to markdown + content = await self._html_to_markdown(html) + + return content, metadata + + except Exception as e: + logger.error(f"Error in fallback scraping: {str(e)}") + return None, None + + async def scrape_url(self, url: str, force_refresh: bool = False) -> Optional[Dict[str, Any]]: + """ + Scrape a web page and store the content in the database. 
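+ Previously scraped URLs are returned from the database unless force_refresh is True; Jina Reader is tried first, with the aiohttp/BeautifulSoup fallback used when it fails.
+ Note: re-scraping a URL that is already stored currently violates the UNIQUE url constraint in add_document, so the call logs an error and returns None instead of updating the row.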
+ + Args: + url: The URL to scrape + force_refresh: If True, scrape the URL even if it's already in the database + + Returns: + Document dictionary if successful, None otherwise + """ + # Validate URL + if not self._validate_url(url): + logger.warning(f"Invalid URL: {url}") + return None + + # Normalize URL + normalized_url = self._normalize_url(url) + + # Check if document already exists in database + if not force_refresh and await self.db_manager.document_exists(normalized_url): + logger.info(f"Document already exists in database: {normalized_url}") + return await self.db_manager.get_document_by_url(normalized_url) + + # Try Jina Reader first + content, metadata = await self._scrape_with_jina_reader(normalized_url) + + # Fallback to custom scraping if Jina Reader fails + if content is None: + logger.info(f"Falling back to custom scraping for URL: {normalized_url}") + content, metadata = await self._scrape_with_fallback(normalized_url) + + if content is None or not content.strip(): + logger.warning(f"Failed to extract content from URL: {normalized_url}") + return None + + # Count tokens + token_count = self._count_tokens(content) + + # Compute hash for deduplication + doc_hash = self._compute_hash(content) + + # Get title from metadata or use URL as fallback + title = metadata.get("title", urlparse(normalized_url).netloc) + + # Store in database + try: + document_id = await self.db_manager.add_document( + url=normalized_url, + title=title, + content=content, + content_type="markdown", + token_count=token_count, + metadata=metadata, + doc_hash=doc_hash + ) + + # Return the document + return await self.db_manager.get_document_by_url(normalized_url) + + except Exception as e: + logger.error(f"Error storing document in database: {str(e)}") + return None + + async def scrape_urls(self, urls: List[str], force_refresh: bool = False) -> List[Dict[str, Any]]: + """ + Scrape multiple URLs in parallel. + + Args: + urls: List of URLs to scrape + force_refresh: If True, scrape URLs even if they're already in the database + + Returns: + List of document dictionaries + """ + tasks = [self.scrape_url(url, force_refresh) for url in urls] + results = await asyncio.gather(*tasks) + + # Filter out None results + return [doc for doc in results if doc is not None] + + +# Create a singleton instance for global use +document_scraper = DocumentScraper() + +def get_document_scraper() -> DocumentScraper: + """ + Get the global document scraper instance. + + Returns: + DocumentScraper instance + """ + return document_scraper + +# Example usage +async def test_scraper(): + """Test the document scraper with a sample URL.""" + from report.database.db_manager import initialize_database + + # Initialize database + await initialize_database() + + # Scrape a URL + scraper = get_document_scraper() + document = await scraper.scrape_url("https://en.wikipedia.org/wiki/Web_scraping") + + if document: + print(f"Successfully scraped document: {document['title']}") + print(f"Token count: {document['token_count']}") + print(f"Content preview: {document['content'][:500]}...") + else: + print("Failed to scrape document") + +# Run test if this module is executed directly +if __name__ == "__main__": + asyncio.run(test_scraper()) diff --git a/report/report_generator.py b/report/report_generator.py new file mode 100644 index 0000000..af27631 --- /dev/null +++ b/report/report_generator.py @@ -0,0 +1,130 @@ +""" +Report generator module for the intelligent research system. 
+ +This module provides functionality to generate reports from search results +by scraping documents, storing them in a database, and synthesizing them +into a comprehensive report. +""" + +import os +import asyncio +import logging +from typing import Dict, List, Any, Optional, Tuple, Union + +from report.database.db_manager import get_db_manager, initialize_database +from report.document_scraper import get_document_scraper + +# Configure logging +logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') +logger = logging.getLogger(__name__) + +class ReportGenerator: + """ + Report generator for the intelligent research system. + + This class provides methods to generate reports from search results + by scraping documents, storing them in a database, and synthesizing them + into a comprehensive report. + """ + + def __init__(self): + """Initialize the report generator.""" + self.db_manager = get_db_manager() + self.document_scraper = get_document_scraper() + + async def initialize(self): + """Initialize the report generator by setting up the database.""" + await initialize_database() + logger.info("Report generator initialized") + + async def process_search_results(self, search_results: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + """ + Process search results by scraping the URLs and storing them in the database. + + Args: + search_results: List of search results, each containing at least a 'url' field + + Returns: + List of processed documents + """ + # Extract URLs from search results + urls = [result.get('url') for result in search_results if result.get('url')] + + # Scrape URLs and store in database + documents = await self.document_scraper.scrape_urls(urls) + + # Log results + logger.info(f"Processed {len(documents)} documents out of {len(urls)} URLs") + + return documents + + async def get_document_by_url(self, url: str) -> Optional[Dict[str, Any]]: + """ + Get a document by its URL. + + Args: + url: URL of the document + + Returns: + Document as a dictionary, or None if not found + """ + return await self.db_manager.get_document_by_url(url) + + async def search_documents(self, query: str, limit: int = 10) -> List[Dict[str, Any]]: + """ + Search for documents in the database. + + Args: + query: Search query + limit: Maximum number of results to return + + Returns: + List of matching documents + """ + return await self.db_manager.search_documents(query, limit) + + +# Create a singleton instance for global use +report_generator = ReportGenerator() + +async def initialize_report_generator(): + """Initialize the report generator.""" + await report_generator.initialize() + +def get_report_generator() -> ReportGenerator: + """ + Get the global report generator instance. 
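+ The instance is created once at module import time and shared by all callers.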
+ + Returns: + ReportGenerator instance + """ + return report_generator + +# Example usage +async def test_report_generator(): + """Test the report generator with sample search results.""" + # Initialize report generator + await initialize_report_generator() + + # Sample search results + search_results = [ + {"url": "https://en.wikipedia.org/wiki/Web_scraping", "title": "Web scraping - Wikipedia"}, + {"url": "https://en.wikipedia.org/wiki/Natural_language_processing", "title": "Natural language processing - Wikipedia"} + ] + + # Process search results + generator = get_report_generator() + documents = await generator.process_search_results(search_results) + + # Print results + print(f"Processed {len(documents)} documents") + for doc in documents: + print(f"Title: {doc['title']}") + print(f"URL: {doc['url']}") + print(f"Token count: {doc['token_count']}") + print(f"Content preview: {doc['content'][:200]}...") + print("-" * 80) + +# Run test if this module is executed directly +if __name__ == "__main__": + asyncio.run(test_report_generator()) diff --git a/requirements.txt b/requirements.txt index 6cd3c9c..167716a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,3 +5,11 @@ litellm>=1.0.0 gradio>=4.0.0 pyyaml>=6.0 python-dotenv>=1.0.0 +beautifulsoup4>=4.12.0 +aiosqlite>=0.19.0 +asyncio>=3.4.3 +aiohttp>=3.9.0 +validators>=0.22.0 +markdown>=3.5.0 +html2text>=2020.1.16 +feedparser>=6.0.10 diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..92d4a33 --- /dev/null +++ b/tests/__init__.py @@ -0,0 +1,3 @@ +""" +Test modules for the intelligent research system. +""" diff --git a/tests/test_document_scraper.py b/tests/test_document_scraper.py new file mode 100644 index 0000000..4f23602 --- /dev/null +++ b/tests/test_document_scraper.py @@ -0,0 +1,82 @@ +""" +Test script for the document scraper module. + +This script tests the functionality of the document scraper module +by scraping a few sample URLs and storing them in the database. 
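+ Note: the test issues live HTTP requests to the sample URLs and writes to report/database/documents.db.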
+""" + +import os +import sys +import asyncio +import logging +from typing import List, Dict, Any + +# Add parent directory to path to allow importing modules +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from report.database.db_manager import initialize_database, get_db_manager +from report.document_scraper import get_document_scraper + +# Configure logging +logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') +logger = logging.getLogger(__name__) + +# Sample URLs for testing +TEST_URLS = [ + "https://en.wikipedia.org/wiki/Web_scraping", + "https://en.wikipedia.org/wiki/Natural_language_processing", + "https://en.wikipedia.org/wiki/SQLite" +] + +async def test_document_scraper(): + """Test the document scraper with sample URLs.""" + # Initialize database + await initialize_database() + logger.info("Database initialized") + + # Get document scraper + scraper = get_document_scraper() + + # Scrape URLs + logger.info(f"Scraping {len(TEST_URLS)} URLs...") + documents = await scraper.scrape_urls(TEST_URLS) + + # Print results + logger.info(f"Successfully scraped {len(documents)} documents") + for doc in documents: + logger.info(f"Title: {doc['title']}") + logger.info(f"URL: {doc['url']}") + logger.info(f"Token count: {doc['token_count']}") + logger.info(f"Content preview: {doc['content'][:200]}...") + logger.info("-" * 80) + + # Test database search + db_manager = get_db_manager() + search_results = await db_manager.search_documents("scraping") + logger.info(f"Found {len(search_results)} documents matching 'scraping'") + + # Test document retrieval by URL + doc = await db_manager.get_document_by_url(TEST_URLS[0]) + if doc: + logger.info(f"Retrieved document by URL: {doc['title']}") + else: + logger.error(f"Failed to retrieve document by URL: {TEST_URLS[0]}") + + # Count documents in database + count = await db_manager.count_documents() + logger.info(f"Total documents in database: {count}") + + return True + +if __name__ == "__main__": + try: + success = asyncio.run(test_document_scraper()) + if success: + logger.info("All tests passed!") + sys.exit(0) + else: + logger.error("Tests failed!") + sys.exit(1) + except Exception as e: + logger.exception(f"Error running tests: {str(e)}") + sys.exit(1)