Implement Phase 1 of Report Generation Module: Document Scraping and SQLite Storage

Steve White 2025-02-27 17:39:34 -06:00
parent a34b92c103
commit 60f78dab9c
9 changed files with 1049 additions and 0 deletions

19
report/__init__.py Normal file

@@ -0,0 +1,19 @@
"""
Report generation module for the intelligent research system.
This module provides functionality to generate reports from search results
by scraping documents, storing them in a database, and synthesizing them
into a comprehensive report.
"""
from report.report_generator import get_report_generator, initialize_report_generator
from report.document_scraper import get_document_scraper
from report.database.db_manager import get_db_manager, initialize_database
__all__ = [
'get_report_generator',
'initialize_report_generator',
'get_document_scraper',
'get_db_manager',
'initialize_database'
]
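
For orientation, a minimal usage sketch of the API exported above, assuming it is driven from an async entry point; the search-result URLs are placeholders, and the report synthesis step itself is left to a later phase.

import asyncio

from report import initialize_report_generator, get_report_generator

async def main():
    # Set up the SQLite database and the singleton report generator
    await initialize_report_generator()
    generator = get_report_generator()

    # Search results only need a 'url' field; these entries are placeholders
    search_results = [
        {"url": "https://en.wikipedia.org/wiki/Web_scraping"},
        {"url": "https://en.wikipedia.org/wiki/SQLite"},
    ]

    # Scrape the URLs, store the documents, and print what came back
    documents = await generator.process_search_results(search_results)
    for doc in documents:
        print(doc["title"], doc["token_count"])

if __name__ == "__main__":
    asyncio.run(main())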

14
report/database/__init__.py Normal file

@@ -0,0 +1,14 @@
"""
Database module for the report generation module.
This module provides functionality to create, manage, and query the SQLite database
for storing scraped documents and their metadata.
"""
from report.database.db_manager import get_db_manager, initialize_database, DBManager
__all__ = [
'get_db_manager',
'initialize_database',
'DBManager'
]

393
report/database/db_manager.py Normal file

@@ -0,0 +1,393 @@
"""
SQLite database manager for the report generation module.
This module provides functionality to create, manage, and query the SQLite database
for storing scraped documents and their metadata.
"""
import os
import json
import aiosqlite
import asyncio
import logging
from datetime import datetime
from typing import Dict, List, Any, Optional, Tuple, Union
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
class DBManager:
"""
Database manager for the report generation module.
This class provides methods to create, manage, and query the SQLite database
for storing scraped documents and their metadata.
"""
def __init__(self, db_path: str = "report/database/documents.db"):
"""
Initialize the database manager.
Args:
db_path: Path to the SQLite database file
"""
self.db_path = db_path
self._ensure_dir_exists()
def _ensure_dir_exists(self):
"""Ensure the directory for the database file exists."""
db_dir = os.path.dirname(self.db_path)
if not os.path.exists(db_dir):
os.makedirs(db_dir)
logger.info(f"Created directory: {db_dir}")
async def initialize_db(self):
"""
Initialize the database by creating necessary tables if they don't exist.
This method creates the documents and metadata tables.
"""
async with aiosqlite.connect(self.db_path) as db:
# Create documents table
await db.execute('''
CREATE TABLE IF NOT EXISTS documents (
id INTEGER PRIMARY KEY AUTOINCREMENT,
url TEXT UNIQUE NOT NULL,
title TEXT,
content TEXT NOT NULL,
scrape_date TIMESTAMP NOT NULL,
content_type TEXT,
token_count INTEGER,
hash TEXT UNIQUE
)
''')
# Create metadata table
await db.execute('''
CREATE TABLE IF NOT EXISTS metadata (
id INTEGER PRIMARY KEY AUTOINCREMENT,
document_id INTEGER NOT NULL,
key TEXT NOT NULL,
value TEXT,
FOREIGN KEY (document_id) REFERENCES documents (id) ON DELETE CASCADE,
UNIQUE (document_id, key)
)
''')
# Create index on url for faster lookups
await db.execute('CREATE INDEX IF NOT EXISTS idx_documents_url ON documents (url)')
# Create index on document_id for faster metadata lookups
await db.execute('CREATE INDEX IF NOT EXISTS idx_metadata_document_id ON metadata (document_id)')
await db.commit()
logger.info("Database initialized successfully")
async def document_exists(self, url: str) -> bool:
"""
Check if a document with the given URL already exists in the database.
Args:
url: URL of the document to check
Returns:
True if the document exists, False otherwise
"""
async with aiosqlite.connect(self.db_path) as db:
db.row_factory = aiosqlite.Row
cursor = await db.execute('SELECT id FROM documents WHERE url = ?', (url,))
result = await cursor.fetchone()
return result is not None
async def get_document_by_url(self, url: str) -> Optional[Dict[str, Any]]:
"""
Get a document by its URL.
Args:
url: URL of the document to retrieve
Returns:
Document as a dictionary, or None if not found
"""
async with aiosqlite.connect(self.db_path) as db:
db.row_factory = aiosqlite.Row
cursor = await db.execute('''
SELECT id, url, title, content, scrape_date, content_type, token_count, hash
FROM documents
WHERE url = ?
''', (url,))
document = await cursor.fetchone()
if not document:
return None
# Convert to dictionary
doc_dict = dict(document)
# Get metadata
cursor = await db.execute('''
SELECT key, value
FROM metadata
WHERE document_id = ?
''', (doc_dict['id'],))
metadata = await cursor.fetchall()
doc_dict['metadata'] = {row['key']: row['value'] for row in metadata}
return doc_dict
async def add_document(self, url: str, title: str, content: str,
content_type: str, token_count: int,
metadata: Dict[str, str], doc_hash: str) -> int:
"""
Add a document to the database.
Args:
url: URL of the document
title: Title of the document
content: Content of the document
content_type: Type of content (e.g., 'markdown', 'html', 'text')
token_count: Number of tokens in the document
metadata: Dictionary of metadata key-value pairs
doc_hash: Hash of the document content for deduplication
Returns:
ID of the added document
Raises:
aiosqlite.Error: If there's an error adding the document
"""
async with aiosqlite.connect(self.db_path) as db:
try:
# Begin transaction
await db.execute('BEGIN TRANSACTION')
# Insert document
cursor = await db.execute('''
INSERT INTO documents (url, title, content, scrape_date, content_type, token_count, hash)
VALUES (?, ?, ?, ?, ?, ?, ?)
''', (url, title, content, datetime.now().isoformat(), content_type, token_count, doc_hash))
document_id = cursor.lastrowid
# Insert metadata
for key, value in metadata.items():
await db.execute('''
INSERT INTO metadata (document_id, key, value)
VALUES (?, ?, ?)
''', (document_id, key, value))
# Commit transaction
await db.commit()
logger.info(f"Added document: {url} (ID: {document_id})")
return document_id
except aiosqlite.Error as e:
# Rollback transaction on error
await db.execute('ROLLBACK')
logger.error(f"Error adding document: {str(e)}")
raise
async def update_document(self, document_id: int, content: str = None,
title: str = None, token_count: int = None,
metadata: Dict[str, str] = None) -> bool:
"""
Update an existing document in the database.
Args:
document_id: ID of the document to update
content: New content (optional)
title: New title (optional)
token_count: New token count (optional)
metadata: New or updated metadata (optional)
Returns:
True if the document was updated, False otherwise
Raises:
aiosqlite.Error: If there's an error updating the document
"""
async with aiosqlite.connect(self.db_path) as db:
try:
# Begin transaction
await db.execute('BEGIN TRANSACTION')
# Update document fields if provided
update_parts = []
params = []
if content is not None:
update_parts.append("content = ?")
params.append(content)
if title is not None:
update_parts.append("title = ?")
params.append(title)
if token_count is not None:
update_parts.append("token_count = ?")
params.append(token_count)
if update_parts:
update_query = f"UPDATE documents SET {', '.join(update_parts)} WHERE id = ?"
params.append(document_id)
await db.execute(update_query, params)
# Update metadata if provided
if metadata:
for key, value in metadata.items():
# Check if metadata key exists
cursor = await db.execute('''
SELECT id FROM metadata
WHERE document_id = ? AND key = ?
''', (document_id, key))
result = await cursor.fetchone()
if result:
# Update existing metadata
await db.execute('''
UPDATE metadata SET value = ?
WHERE document_id = ? AND key = ?
''', (value, document_id, key))
else:
# Insert new metadata
await db.execute('''
INSERT INTO metadata (document_id, key, value)
VALUES (?, ?, ?)
''', (document_id, key, value))
# Commit transaction
await db.commit()
logger.info(f"Updated document ID: {document_id}")
return True
except aiosqlite.Error as e:
# Rollback transaction on error
await db.execute('ROLLBACK')
logger.error(f"Error updating document: {str(e)}")
raise
async def delete_document(self, document_id: int) -> bool:
"""
Delete a document from the database.
Args:
document_id: ID of the document to delete
Returns:
True if the document was deleted, False otherwise
"""
async with aiosqlite.connect(self.db_path) as db:
try:
# Enable foreign key enforcement so ON DELETE CASCADE removes the metadata rows
# (SQLite leaves foreign keys off by default on each new connection)
await db.execute('PRAGMA foreign_keys = ON')
# Begin transaction
await db.execute('BEGIN TRANSACTION')
# Delete document (metadata will be deleted via ON DELETE CASCADE)
cursor = await db.execute('DELETE FROM documents WHERE id = ?', (document_id,))
deleted = cursor.rowcount > 0
# Commit transaction
await db.commit()
if deleted:
logger.info(f"Deleted document ID: {document_id}")
else:
logger.warning(f"No document found with ID: {document_id}")
return deleted
except aiosqlite.Error as e:
# Rollback transaction on error
await db.execute('ROLLBACK')
logger.error(f"Error deleting document: {str(e)}")
return False
async def search_documents(self, query: str, limit: int = 10, offset: int = 0) -> List[Dict[str, Any]]:
"""
Search for documents matching the query.
Args:
query: Search query (will be matched against title and content)
limit: Maximum number of results to return
offset: Number of results to skip
Returns:
List of matching documents as dictionaries
"""
async with aiosqlite.connect(self.db_path) as db:
db.row_factory = aiosqlite.Row
# Search documents
cursor = await db.execute('''
SELECT id, url, title, content, scrape_date, content_type, token_count
FROM documents
WHERE title LIKE ? OR content LIKE ?
ORDER BY scrape_date DESC
LIMIT ? OFFSET ?
''', (f'%{query}%', f'%{query}%', limit, offset))
documents = await cursor.fetchall()
results = []
# Get metadata for each document
for doc in documents:
doc_dict = dict(doc)
cursor = await db.execute('''
SELECT key, value
FROM metadata
WHERE document_id = ?
''', (doc_dict['id'],))
metadata = await cursor.fetchall()
doc_dict['metadata'] = {row['key']: row['value'] for row in metadata}
results.append(doc_dict)
return results
async def get_documents_by_urls(self, urls: List[str]) -> List[Dict[str, Any]]:
"""
Get multiple documents by their URLs.
Args:
urls: List of URLs to retrieve
Returns:
List of documents as dictionaries
"""
results = []
for url in urls:
doc = await self.get_document_by_url(url)
if doc:
results.append(doc)
return results
async def count_documents(self) -> int:
"""
Get the total number of documents in the database.
Returns:
Number of documents
"""
async with aiosqlite.connect(self.db_path) as db:
cursor = await db.execute('SELECT COUNT(*) as count FROM documents')
result = await cursor.fetchone()
return result[0] if result else 0
# Create a singleton instance for global use
db_manager = DBManager()
async def initialize_database():
"""Initialize the database."""
await db_manager.initialize_db()
def get_db_manager() -> DBManager:
"""
Get the global database manager instance.
Returns:
DBManager instance
"""
return db_manager
# Run database initialization if this module is executed directly
if __name__ == "__main__":
asyncio.run(initialize_database())
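
A small sketch of exercising DBManager on its own, useful when testing the storage layer without the scraper; the database path, field values, and token count below are illustrative.

import asyncio
import hashlib

from report.database.db_manager import DBManager

async def demo():
    # Use a separate database file so the main documents database is untouched
    db = DBManager(db_path="report/database/demo.db")
    await db.initialize_db()

    content = "# Example\nSome markdown content."
    doc_id = await db.add_document(
        url="https://example.com/article",
        title="Example article",
        content=content,
        content_type="markdown",
        token_count=8,  # illustrative; the scraper computes this with tiktoken
        metadata={"author": "Jane Doe"},
        doc_hash=hashlib.sha256(content.encode("utf-8")).hexdigest(),
    )
    print(f"Stored document {doc_id}")

    # Search is a simple LIKE match over title and content
    matches = await db.search_documents("markdown", limit=5)
    print(f"Found {len(matches)} matching document(s)")

if __name__ == "__main__":
    asyncio.run(demo())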

Binary file not shown.

400
report/document_scraper.py Normal file

@@ -0,0 +1,400 @@
"""
Document scraper module for the report generation module.
This module provides functionality to scrape web pages and extract clean content
using Jina Reader API or fallback methods.
"""
import os
import re
import json
import hashlib
import logging
import asyncio
import aiohttp
import validators
import tiktoken
from typing import Dict, List, Any, Optional, Tuple, Union
from datetime import datetime
from urllib.parse import urlparse, urljoin
from bs4 import BeautifulSoup
import html2text
from config.config import get_config
from report.database.db_manager import get_db_manager, DBManager
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
class DocumentScraper:
"""
Document scraper for the report generation module.
This class provides methods to scrape web pages and extract clean content
using Jina Reader API or fallback methods.
"""
def __init__(self):
"""Initialize the document scraper."""
self.config = get_config()
self.api_key = self._get_api_key()
self.endpoint = "https://api.jina.ai/v1/reader"
self.db_manager = get_db_manager()
self.tokenizer = tiktoken.get_encoding("cl100k_base") # Using OpenAI's tokenizer
def _get_api_key(self) -> str:
"""
Get the Jina AI API key.
Returns:
The API key as a string
Raises:
ValueError: If the API key is not found
"""
try:
return self.config.get_api_key('jina')
except ValueError as e:
logger.warning(f"Jina AI API key not found. Fallback methods will be used. {str(e)}")
return ""
def _count_tokens(self, text: str) -> int:
"""
Count the number of tokens in a text.
Args:
text: The text to count tokens for
Returns:
Number of tokens in the text
"""
return len(self.tokenizer.encode(text))
def _compute_hash(self, content: str) -> str:
"""
Compute a hash of the document content for deduplication.
Args:
content: The document content
Returns:
Hash of the content
"""
return hashlib.sha256(content.encode('utf-8')).hexdigest()
def _normalize_url(self, url: str) -> str:
"""
Normalize a URL by removing fragments and unnecessary query parameters.
Args:
url: The URL to normalize
Returns:
Normalized URL
"""
parsed = urlparse(url)
# Remove fragment
normalized = parsed._replace(fragment="")
# TODO: Add more normalization rules if needed
return normalized.geturl()
def _validate_url(self, url: str) -> bool:
"""
Validate a URL.
Args:
url: The URL to validate
Returns:
True if the URL is valid, False otherwise
"""
return validators.url(url) is True
async def _extract_metadata_from_html(self, html: str, url: str) -> Dict[str, str]:
"""
Extract metadata from HTML content.
Args:
html: The HTML content
url: The URL of the page
Returns:
Dictionary of metadata
"""
metadata = {
"source_url": url,
"scrape_date": datetime.now().isoformat()
}
try:
soup = BeautifulSoup(html, 'html.parser')
# Extract title
if soup.title:
metadata["title"] = soup.title.string
# Extract meta tags
for meta in soup.find_all('meta'):
# Author
if meta.get('name') and meta.get('name').lower() == 'author' and meta.get('content'):
metadata["author"] = meta.get('content')
# Description
if meta.get('name') and meta.get('name').lower() == 'description' and meta.get('content'):
metadata["description"] = meta.get('content')
# Keywords
if meta.get('name') and meta.get('name').lower() == 'keywords' and meta.get('content'):
metadata["keywords"] = meta.get('content')
# Publication date
if meta.get('property') and meta.get('property').lower() in ['article:published_time', 'og:published_time'] and meta.get('content'):
metadata["publication_date"] = meta.get('content')
# Open Graph data
if meta.get('property') and meta.get('property').lower().startswith('og:') and meta.get('content'):
og_key = meta.get('property').lower().replace('og:', 'og_')
metadata[og_key] = meta.get('content')
# Extract structured data (JSON-LD)
for script in soup.find_all('script', type='application/ld+json'):
try:
ld_data = json.loads(script.string)
if isinstance(ld_data, dict):
# Extract date published
if ld_data.get('@type') in ['Article', 'NewsArticle', 'BlogPosting'] and ld_data.get('datePublished'):
metadata["publication_date"] = ld_data.get('datePublished')
# Extract author
if ld_data.get('author'):
author = ld_data.get('author')
if isinstance(author, dict) and author.get('name'):
metadata["author"] = author.get('name')
elif isinstance(author, str):
metadata["author"] = author
except (json.JSONDecodeError, AttributeError):
pass
except Exception as e:
logger.warning(f"Error extracting metadata: {str(e)}")
return metadata
async def _html_to_markdown(self, html: str) -> str:
"""
Convert HTML to Markdown.
Args:
html: The HTML content
Returns:
Markdown content
"""
converter = html2text.HTML2Text()
converter.ignore_links = False
converter.ignore_images = False
converter.ignore_tables = False
converter.body_width = 0 # No wrapping
return converter.handle(html)
async def _scrape_with_jina_reader(self, url: str) -> Tuple[Optional[str], Optional[Dict[str, str]]]:
"""
Scrape a web page using Jina Reader API.
Args:
url: The URL to scrape
Returns:
Tuple of (content, metadata)
"""
if not self.api_key:
logger.warning("Jina API key not available. Using fallback method.")
return None, None
headers = {
"Content-Type": "application/json",
"Authorization": f"Bearer {self.api_key}",
"Accept": "application/json"
}
data = {
"url": url,
"format": "markdown" # Request markdown format
}
try:
async with aiohttp.ClientSession() as session:
async with session.post(self.endpoint, headers=headers, json=data) as response:
if response.status != 200:
logger.warning(f"Jina Reader API error: {response.status} - {await response.text()}")
return None, None
result = await response.json()
if "content" not in result:
logger.warning(f"Jina Reader API returned no content: {result}")
return None, None
content = result.get("content", "")
metadata = result.get("metadata", {})
# Add source URL to metadata
metadata["source_url"] = url
return content, metadata
except Exception as e:
logger.error(f"Error calling Jina Reader API: {str(e)}")
return None, None
async def _scrape_with_fallback(self, url: str) -> Tuple[Optional[str], Optional[Dict[str, str]]]:
"""
Scrape a web page using fallback method (aiohttp + BeautifulSoup).
Args:
url: The URL to scrape
Returns:
Tuple of (content, metadata)
"""
try:
async with aiohttp.ClientSession() as session:
async with session.get(url, headers={"User-Agent": "Mozilla/5.0"}) as response:
if response.status != 200:
logger.warning(f"Failed to fetch URL: {url} - Status: {response.status}")
return None, None
html = await response.text()
# Extract metadata
metadata = await self._extract_metadata_from_html(html, url)
# Convert to markdown
content = await self._html_to_markdown(html)
return content, metadata
except Exception as e:
logger.error(f"Error in fallback scraping: {str(e)}")
return None, None
async def scrape_url(self, url: str, force_refresh: bool = False) -> Optional[Dict[str, Any]]:
"""
Scrape a web page and store the content in the database.
Args:
url: The URL to scrape
force_refresh: If True, scrape the URL even if it's already in the database
Returns:
Document dictionary if successful, None otherwise
"""
# Validate URL
if not self._validate_url(url):
logger.warning(f"Invalid URL: {url}")
return None
# Normalize URL
normalized_url = self._normalize_url(url)
# Check if document already exists in database
if not force_refresh and await self.db_manager.document_exists(normalized_url):
logger.info(f"Document already exists in database: {normalized_url}")
return await self.db_manager.get_document_by_url(normalized_url)
# Try Jina Reader first
content, metadata = await self._scrape_with_jina_reader(normalized_url)
# Fallback to custom scraping if Jina Reader fails
if content is None:
logger.info(f"Falling back to custom scraping for URL: {normalized_url}")
content, metadata = await self._scrape_with_fallback(normalized_url)
if content is None or not content.strip():
logger.warning(f"Failed to extract content from URL: {normalized_url}")
return None
# Count tokens
token_count = self._count_tokens(content)
# Compute hash for deduplication
doc_hash = self._compute_hash(content)
# Get title from metadata or use URL as fallback
title = metadata.get("title", urlparse(normalized_url).netloc)
# Store in database
try:
document_id = await self.db_manager.add_document(
url=normalized_url,
title=title,
content=content,
content_type="markdown",
token_count=token_count,
metadata=metadata,
doc_hash=doc_hash
)
# Return the document
return await self.db_manager.get_document_by_url(normalized_url)
except Exception as e:
logger.error(f"Error storing document in database: {str(e)}")
return None
async def scrape_urls(self, urls: List[str], force_refresh: bool = False) -> List[Dict[str, Any]]:
"""
Scrape multiple URLs in parallel.
Args:
urls: List of URLs to scrape
force_refresh: If True, scrape URLs even if they're already in the database
Returns:
List of document dictionaries
"""
tasks = [self.scrape_url(url, force_refresh) for url in urls]
results = await asyncio.gather(*tasks)
# Filter out None results
return [doc for doc in results if doc is not None]
# Create a singleton instance for global use
document_scraper = DocumentScraper()
def get_document_scraper() -> DocumentScraper:
"""
Get the global document scraper instance.
Returns:
DocumentScraper instance
"""
return document_scraper
# Example usage
async def test_scraper():
"""Test the document scraper with a sample URL."""
from report.database.db_manager import initialize_database
# Initialize database
await initialize_database()
# Scrape a URL
scraper = get_document_scraper()
document = await scraper.scrape_url("https://en.wikipedia.org/wiki/Web_scraping")
if document:
print(f"Successfully scraped document: {document['title']}")
print(f"Token count: {document['token_count']}")
print(f"Content preview: {document['content'][:500]}...")
else:
print("Failed to scrape document")
# Run test if this module is executed directly
if __name__ == "__main__":
asyncio.run(test_scraper())
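
One behaviour worth highlighting from scrape_url above: the URL is normalized and checked against the database before any network call, so repeated scrapes of the same page are served from SQLite rather than re-fetched. A minimal sketch, assuming network access and using an illustrative URL:

import asyncio

from report.database.db_manager import initialize_database
from report.document_scraper import get_document_scraper

async def demo():
    await initialize_database()
    scraper = get_document_scraper()

    url = "https://en.wikipedia.org/wiki/SQLite"  # illustrative URL

    first = await scraper.scrape_url(url)    # fetched via Jina Reader or the fallback path
    second = await scraper.scrape_url(url)   # served from the database by document_exists()

    if first and second:
        print(first["id"] == second["id"])   # True: both calls return the same stored row

if __name__ == "__main__":
    asyncio.run(demo())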

130
report/report_generator.py Normal file

@@ -0,0 +1,130 @@
"""
Report generator module for the intelligent research system.
This module provides functionality to generate reports from search results
by scraping documents, storing them in a database, and synthesizing them
into a comprehensive report.
"""
import os
import asyncio
import logging
from typing import Dict, List, Any, Optional, Tuple, Union
from report.database.db_manager import get_db_manager, initialize_database
from report.document_scraper import get_document_scraper
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
class ReportGenerator:
"""
Report generator for the intelligent research system.
This class provides methods to generate reports from search results
by scraping documents, storing them in a database, and synthesizing them
into a comprehensive report.
"""
def __init__(self):
"""Initialize the report generator."""
self.db_manager = get_db_manager()
self.document_scraper = get_document_scraper()
async def initialize(self):
"""Initialize the report generator by setting up the database."""
await initialize_database()
logger.info("Report generator initialized")
async def process_search_results(self, search_results: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
"""
Process search results by scraping the URLs and storing them in the database.
Args:
search_results: List of search results, each containing at least a 'url' field
Returns:
List of processed documents
"""
# Extract URLs from search results
urls = [result.get('url') for result in search_results if result.get('url')]
# Scrape URLs and store in database
documents = await self.document_scraper.scrape_urls(urls)
# Log results
logger.info(f"Processed {len(documents)} documents out of {len(urls)} URLs")
return documents
async def get_document_by_url(self, url: str) -> Optional[Dict[str, Any]]:
"""
Get a document by its URL.
Args:
url: URL of the document
Returns:
Document as a dictionary, or None if not found
"""
return await self.db_manager.get_document_by_url(url)
async def search_documents(self, query: str, limit: int = 10) -> List[Dict[str, Any]]:
"""
Search for documents in the database.
Args:
query: Search query
limit: Maximum number of results to return
Returns:
List of matching documents
"""
return await self.db_manager.search_documents(query, limit)
# Create a singleton instance for global use
report_generator = ReportGenerator()
async def initialize_report_generator():
"""Initialize the report generator."""
await report_generator.initialize()
def get_report_generator() -> ReportGenerator:
"""
Get the global report generator instance.
Returns:
ReportGenerator instance
"""
return report_generator
# Example usage
async def test_report_generator():
"""Test the report generator with sample search results."""
# Initialize report generator
await initialize_report_generator()
# Sample search results
search_results = [
{"url": "https://en.wikipedia.org/wiki/Web_scraping", "title": "Web scraping - Wikipedia"},
{"url": "https://en.wikipedia.org/wiki/Natural_language_processing", "title": "Natural language processing - Wikipedia"}
]
# Process search results
generator = get_report_generator()
documents = await generator.process_search_results(search_results)
# Print results
print(f"Processed {len(documents)} documents")
for doc in documents:
print(f"Title: {doc['title']}")
print(f"URL: {doc['url']}")
print(f"Token count: {doc['token_count']}")
print(f"Content preview: {doc['content'][:200]}...")
print("-" * 80)
# Run test if this module is executed directly
if __name__ == "__main__":
asyncio.run(test_report_generator())
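
ReportGenerator also exposes the stored corpus through search_documents and get_document_by_url; a short sketch of reading documents back out after they have been processed (the query string and URL are illustrative):

import asyncio

from report.report_generator import initialize_report_generator, get_report_generator

async def demo():
    await initialize_report_generator()
    generator = get_report_generator()

    # Look up previously scraped material by keyword and by exact URL
    hits = await generator.search_documents("web scraping", limit=5)
    for doc in hits:
        print(doc["url"], doc["token_count"])

    doc = await generator.get_document_by_url("https://en.wikipedia.org/wiki/Web_scraping")
    if doc:
        print(f"Cached copy found: {doc['title']}")

if __name__ == "__main__":
    asyncio.run(demo())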

8
requirements.txt

@@ -5,3 +5,11 @@ litellm>=1.0.0
gradio>=4.0.0
pyyaml>=6.0
python-dotenv>=1.0.0
beautifulsoup4>=4.12.0
aiosqlite>=0.19.0
tiktoken>=0.5.0
aiohttp>=3.9.0
validators>=0.22.0
markdown>=3.5.0
html2text>=2020.1.16
feedparser>=6.0.10

3
tests/__init__.py Normal file

@@ -0,0 +1,3 @@
"""
Test modules for the intelligent research system.
"""

82
tests/test_document_scraper.py Normal file

@@ -0,0 +1,82 @@
"""
Test script for the document scraper module.
This script tests the functionality of the document scraper module
by scraping a few sample URLs and storing them in the database.
"""
import os
import sys
import asyncio
import logging
from typing import List, Dict, Any
# Add parent directory to path to allow importing modules
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from report.database.db_manager import initialize_database, get_db_manager
from report.document_scraper import get_document_scraper
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
# Sample URLs for testing
TEST_URLS = [
"https://en.wikipedia.org/wiki/Web_scraping",
"https://en.wikipedia.org/wiki/Natural_language_processing",
"https://en.wikipedia.org/wiki/SQLite"
]
async def test_document_scraper():
"""Test the document scraper with sample URLs."""
# Initialize database
await initialize_database()
logger.info("Database initialized")
# Get document scraper
scraper = get_document_scraper()
# Scrape URLs
logger.info(f"Scraping {len(TEST_URLS)} URLs...")
documents = await scraper.scrape_urls(TEST_URLS)
# Print results
logger.info(f"Successfully scraped {len(documents)} documents")
for doc in documents:
logger.info(f"Title: {doc['title']}")
logger.info(f"URL: {doc['url']}")
logger.info(f"Token count: {doc['token_count']}")
logger.info(f"Content preview: {doc['content'][:200]}...")
logger.info("-" * 80)
# Test database search
db_manager = get_db_manager()
search_results = await db_manager.search_documents("scraping")
logger.info(f"Found {len(search_results)} documents matching 'scraping'")
# Test document retrieval by URL
doc = await db_manager.get_document_by_url(TEST_URLS[0])
if doc:
logger.info(f"Retrieved document by URL: {doc['title']}")
else:
logger.error(f"Failed to retrieve document by URL: {TEST_URLS[0]}")
# Count documents in database
count = await db_manager.count_documents()
logger.info(f"Total documents in database: {count}")
return True
if __name__ == "__main__":
try:
success = asyncio.run(test_document_scraper())
if success:
logger.info("All tests passed!")
sys.exit(0)
else:
logger.error("Tests failed!")
sys.exit(1)
except Exception as e:
logger.exception(f"Error running tests: {str(e)}")
sys.exit(1)