Implement Phase 1 of Report Generation Module: Document Scraping and SQLite Storage
parent a34b92c103
commit 60f78dab9c
@@ -0,0 +1,19 @@
"""
Report generation module for the intelligent research system.

This module provides functionality to generate reports from search results
by scraping documents, storing them in a database, and synthesizing them
into a comprehensive report.
"""

from report.report_generator import get_report_generator, initialize_report_generator
from report.document_scraper import get_document_scraper
from report.database.db_manager import get_db_manager, initialize_database

__all__ = [
    'get_report_generator',
    'initialize_report_generator',
    'get_document_scraper',
    'get_db_manager',
    'initialize_database'
]
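For orientation, this package-level API is meant to be consumed roughly as follows. This is a minimal sketch, not part of the commit; it assumes the project root is on the import path and that the default database location is writable.

# --- Illustrative usage (not part of this commit) ---
import asyncio

from report import get_report_generator, initialize_report_generator

async def main():
    # Create the SQLite schema before any scraping or querying.
    await initialize_report_generator()
    generator = get_report_generator()

    # Query whatever documents have already been stored.
    documents = await generator.search_documents("web scraping", limit=5)
    print(f"Found {len(documents)} stored documents")

if __name__ == "__main__":
    asyncio.run(main())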
@@ -0,0 +1,14 @@
"""
Database module for the report generation module.

This module provides functionality to create, manage, and query the SQLite database
for storing scraped documents and their metadata.
"""

from report.database.db_manager import get_db_manager, initialize_database, DBManager

__all__ = [
    'get_db_manager',
    'initialize_database',
    'DBManager'
]
@@ -0,0 +1,393 @@
"""
SQLite database manager for the report generation module.

This module provides functionality to create, manage, and query the SQLite database
for storing scraped documents and their metadata.
"""

import os
import json
import aiosqlite
import asyncio
import logging
from datetime import datetime
from typing import Dict, List, Any, Optional, Tuple, Union

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

class DBManager:
    """
    Database manager for the report generation module.

    This class provides methods to create, manage, and query the SQLite database
    for storing scraped documents and their metadata.
    """

    def __init__(self, db_path: str = "report/database/documents.db"):
        """
        Initialize the database manager.

        Args:
            db_path: Path to the SQLite database file
        """
        self.db_path = db_path
        self._ensure_dir_exists()

    def _ensure_dir_exists(self):
        """Ensure the directory for the database file exists."""
        db_dir = os.path.dirname(self.db_path)
        # db_dir is empty when db_path is a bare filename; nothing to create then.
        if db_dir and not os.path.exists(db_dir):
            os.makedirs(db_dir)
            logger.info(f"Created directory: {db_dir}")

    async def initialize_db(self):
        """
        Initialize the database by creating necessary tables if they don't exist.

        This method creates the documents and metadata tables.
        """
        async with aiosqlite.connect(self.db_path) as db:
            # Create documents table
            await db.execute('''
                CREATE TABLE IF NOT EXISTS documents (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    url TEXT UNIQUE NOT NULL,
                    title TEXT,
                    content TEXT NOT NULL,
                    scrape_date TIMESTAMP NOT NULL,
                    content_type TEXT,
                    token_count INTEGER,
                    hash TEXT UNIQUE
                )
            ''')

            # Create metadata table
            await db.execute('''
                CREATE TABLE IF NOT EXISTS metadata (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    document_id INTEGER NOT NULL,
                    key TEXT NOT NULL,
                    value TEXT,
                    FOREIGN KEY (document_id) REFERENCES documents (id) ON DELETE CASCADE,
                    UNIQUE (document_id, key)
                )
            ''')

            # Create index on url for faster lookups
            await db.execute('CREATE INDEX IF NOT EXISTS idx_documents_url ON documents (url)')

            # Create index on document_id for faster metadata lookups
            await db.execute('CREATE INDEX IF NOT EXISTS idx_metadata_document_id ON metadata (document_id)')

            await db.commit()
            logger.info("Database initialized successfully")

    async def document_exists(self, url: str) -> bool:
        """
        Check if a document with the given URL already exists in the database.

        Args:
            url: URL of the document to check

        Returns:
            True if the document exists, False otherwise
        """
        async with aiosqlite.connect(self.db_path) as db:
            db.row_factory = aiosqlite.Row
            cursor = await db.execute('SELECT id FROM documents WHERE url = ?', (url,))
            result = await cursor.fetchone()
            return result is not None

    async def get_document_by_url(self, url: str) -> Optional[Dict[str, Any]]:
        """
        Get a document by its URL.

        Args:
            url: URL of the document to retrieve

        Returns:
            Document as a dictionary, or None if not found
        """
        async with aiosqlite.connect(self.db_path) as db:
            db.row_factory = aiosqlite.Row
            cursor = await db.execute('''
                SELECT id, url, title, content, scrape_date, content_type, token_count, hash
                FROM documents
                WHERE url = ?
            ''', (url,))

            document = await cursor.fetchone()
            if not document:
                return None

            # Convert to dictionary
            doc_dict = dict(document)

            # Get metadata
            cursor = await db.execute('''
                SELECT key, value
                FROM metadata
                WHERE document_id = ?
            ''', (doc_dict['id'],))

            metadata = await cursor.fetchall()
            doc_dict['metadata'] = {row['key']: row['value'] for row in metadata}

            return doc_dict

    async def add_document(self, url: str, title: str, content: str,
                           content_type: str, token_count: int,
                           metadata: Dict[str, str], doc_hash: str) -> int:
        """
        Add a document to the database.

        Args:
            url: URL of the document
            title: Title of the document
            content: Content of the document
            content_type: Type of content (e.g., 'markdown', 'html', 'text')
            token_count: Number of tokens in the document
            metadata: Dictionary of metadata key-value pairs
            doc_hash: Hash of the document content for deduplication

        Returns:
            ID of the added document

        Raises:
            aiosqlite.Error: If there's an error adding the document
        """
        async with aiosqlite.connect(self.db_path) as db:
            try:
                # Begin transaction
                await db.execute('BEGIN TRANSACTION')

                # Insert document
                cursor = await db.execute('''
                    INSERT INTO documents (url, title, content, scrape_date, content_type, token_count, hash)
                    VALUES (?, ?, ?, ?, ?, ?, ?)
                ''', (url, title, content, datetime.now().isoformat(), content_type, token_count, doc_hash))

                document_id = cursor.lastrowid

                # Insert metadata
                for key, value in metadata.items():
                    await db.execute('''
                        INSERT INTO metadata (document_id, key, value)
                        VALUES (?, ?, ?)
                    ''', (document_id, key, value))

                # Commit transaction
                await db.commit()
                logger.info(f"Added document: {url} (ID: {document_id})")
                return document_id

            except aiosqlite.Error as e:
                # Rollback transaction on error
                await db.execute('ROLLBACK')
                logger.error(f"Error adding document: {str(e)}")
                raise

    async def update_document(self, document_id: int, content: str = None,
                              title: str = None, token_count: int = None,
                              metadata: Dict[str, str] = None) -> bool:
        """
        Update an existing document in the database.

        Args:
            document_id: ID of the document to update
            content: New content (optional)
            title: New title (optional)
            token_count: New token count (optional)
            metadata: New or updated metadata (optional)

        Returns:
            True if the document was updated, False otherwise

        Raises:
            aiosqlite.Error: If there's an error updating the document
        """
        async with aiosqlite.connect(self.db_path) as db:
            try:
                # Begin transaction
                await db.execute('BEGIN TRANSACTION')

                # Update document fields if provided
                update_parts = []
                params = []

                if content is not None:
                    update_parts.append("content = ?")
                    params.append(content)

                if title is not None:
                    update_parts.append("title = ?")
                    params.append(title)

                if token_count is not None:
                    update_parts.append("token_count = ?")
                    params.append(token_count)

                if update_parts:
                    update_query = f"UPDATE documents SET {', '.join(update_parts)} WHERE id = ?"
                    params.append(document_id)
                    await db.execute(update_query, params)

                # Update metadata if provided
                if metadata:
                    for key, value in metadata.items():
                        # Check if metadata key exists
                        cursor = await db.execute('''
                            SELECT id FROM metadata
                            WHERE document_id = ? AND key = ?
                        ''', (document_id, key))

                        result = await cursor.fetchone()

                        if result:
                            # Update existing metadata
                            await db.execute('''
                                UPDATE metadata SET value = ?
                                WHERE document_id = ? AND key = ?
                            ''', (value, document_id, key))
                        else:
                            # Insert new metadata
                            await db.execute('''
                                INSERT INTO metadata (document_id, key, value)
                                VALUES (?, ?, ?)
                            ''', (document_id, key, value))

                # Commit transaction
                await db.commit()
                logger.info(f"Updated document ID: {document_id}")
                return True

            except aiosqlite.Error as e:
                # Rollback transaction on error
                await db.execute('ROLLBACK')
                logger.error(f"Error updating document: {str(e)}")
                raise

    async def delete_document(self, document_id: int) -> bool:
        """
        Delete a document from the database.

        Args:
            document_id: ID of the document to delete

        Returns:
            True if the document was deleted, False otherwise
        """
        async with aiosqlite.connect(self.db_path) as db:
            try:
                # SQLite disables foreign key enforcement by default; enable it so
                # the ON DELETE CASCADE on metadata actually fires.
                await db.execute('PRAGMA foreign_keys = ON')

                # Begin transaction
                await db.execute('BEGIN TRANSACTION')

                # Delete document (metadata will be deleted via ON DELETE CASCADE)
                cursor = await db.execute('DELETE FROM documents WHERE id = ?', (document_id,))
                deleted = cursor.rowcount > 0

                # Commit transaction
                await db.commit()
                logger.info(f"Deleted document ID: {document_id}")
                return deleted

            except aiosqlite.Error as e:
                # Rollback transaction on error
                await db.execute('ROLLBACK')
                logger.error(f"Error deleting document: {str(e)}")
                return False

    async def search_documents(self, query: str, limit: int = 10, offset: int = 0) -> List[Dict[str, Any]]:
        """
        Search for documents matching the query.

        Args:
            query: Search query (will be matched against title and content)
            limit: Maximum number of results to return
            offset: Number of results to skip

        Returns:
            List of matching documents as dictionaries
        """
        async with aiosqlite.connect(self.db_path) as db:
            db.row_factory = aiosqlite.Row

            # Search documents
            cursor = await db.execute('''
                SELECT id, url, title, content, scrape_date, content_type, token_count
                FROM documents
                WHERE title LIKE ? OR content LIKE ?
                ORDER BY scrape_date DESC
                LIMIT ? OFFSET ?
            ''', (f'%{query}%', f'%{query}%', limit, offset))

            documents = await cursor.fetchall()
            results = []

            # Get metadata for each document
            for doc in documents:
                doc_dict = dict(doc)

                cursor = await db.execute('''
                    SELECT key, value
                    FROM metadata
                    WHERE document_id = ?
                ''', (doc_dict['id'],))

                metadata = await cursor.fetchall()
                doc_dict['metadata'] = {row['key']: row['value'] for row in metadata}

                results.append(doc_dict)

            return results

    async def get_documents_by_urls(self, urls: List[str]) -> List[Dict[str, Any]]:
        """
        Get multiple documents by their URLs.

        Args:
            urls: List of URLs to retrieve

        Returns:
            List of documents as dictionaries
        """
        results = []
        for url in urls:
            doc = await self.get_document_by_url(url)
            if doc:
                results.append(doc)
        return results

    async def count_documents(self) -> int:
        """
        Get the total number of documents in the database.

        Returns:
            Number of documents
        """
        async with aiosqlite.connect(self.db_path) as db:
            cursor = await db.execute('SELECT COUNT(*) as count FROM documents')
            result = await cursor.fetchone()
            return result[0] if result else 0


# Create a singleton instance for global use
db_manager = DBManager()

async def initialize_database():
    """Initialize the database."""
    await db_manager.initialize_db()

def get_db_manager() -> DBManager:
    """
    Get the global database manager instance.

    Returns:
        DBManager instance
    """
    return db_manager

# Run database initialization if this module is executed directly
if __name__ == "__main__":
    asyncio.run(initialize_database())
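A minimal sketch of how the DBManager API above can be exercised directly. This is illustrative only, not part of the commit; the document values and URL are made up, and the default db_path under report/database/ is assumed to be writable.

# --- Illustrative usage (not part of this commit) ---
import asyncio

from report.database.db_manager import get_db_manager, initialize_database

async def main():
    await initialize_database()
    db = get_db_manager()

    # Hypothetical document values, just to show the call shape.
    doc_id = await db.add_document(
        url="https://example.com/article",
        title="Example article",
        content="# Example\n\nSome markdown content.",
        content_type="markdown",
        token_count=7,
        metadata={"source_url": "https://example.com/article"},
        doc_hash="0" * 64,
    )
    print(f"Stored document {doc_id}")

    doc = await db.get_document_by_url("https://example.com/article")
    print(doc["title"], doc["metadata"])

    print("Total documents:", await db.count_documents())

if __name__ == "__main__":
    asyncio.run(main())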
Binary file not shown.
@@ -0,0 +1,400 @@
"""
Document scraper module for the report generation module.

This module provides functionality to scrape web pages and extract clean content
using Jina Reader API or fallback methods.
"""

import os
import re
import json
import hashlib
import logging
import asyncio
import aiohttp
import validators
import tiktoken
from typing import Dict, List, Any, Optional, Tuple, Union
from datetime import datetime
from urllib.parse import urlparse, urljoin
from bs4 import BeautifulSoup
import html2text

from config.config import get_config
from report.database.db_manager import get_db_manager, DBManager

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

class DocumentScraper:
    """
    Document scraper for the report generation module.

    This class provides methods to scrape web pages and extract clean content
    using Jina Reader API or fallback methods.
    """

    def __init__(self):
        """Initialize the document scraper."""
        self.config = get_config()
        self.api_key = self._get_api_key()
        self.endpoint = "https://api.jina.ai/v1/reader"
        self.db_manager = get_db_manager()
        self.tokenizer = tiktoken.get_encoding("cl100k_base")  # Using OpenAI's tokenizer

    def _get_api_key(self) -> str:
        """
        Get the Jina AI API key.

        Returns:
            The API key as a string

        Raises:
            ValueError: If the API key is not found
        """
        try:
            return self.config.get_api_key('jina')
        except ValueError as e:
            logger.warning(f"Jina AI API key not found. Fallback methods will be used. {str(e)}")
            return ""

    def _count_tokens(self, text: str) -> int:
        """
        Count the number of tokens in a text.

        Args:
            text: The text to count tokens for

        Returns:
            Number of tokens in the text
        """
        return len(self.tokenizer.encode(text))

    def _compute_hash(self, content: str) -> str:
        """
        Compute a hash of the document content for deduplication.

        Args:
            content: The document content

        Returns:
            Hash of the content
        """
        return hashlib.sha256(content.encode('utf-8')).hexdigest()

    def _normalize_url(self, url: str) -> str:
        """
        Normalize a URL by removing fragments and unnecessary query parameters.

        Args:
            url: The URL to normalize

        Returns:
            Normalized URL
        """
        parsed = urlparse(url)
        # Remove fragment
        normalized = parsed._replace(fragment="")

        # TODO: Add more normalization rules if needed

        return normalized.geturl()

    def _validate_url(self, url: str) -> bool:
        """
        Validate a URL.

        Args:
            url: The URL to validate

        Returns:
            True if the URL is valid, False otherwise
        """
        return validators.url(url) is True

    async def _extract_metadata_from_html(self, html: str, url: str) -> Dict[str, str]:
        """
        Extract metadata from HTML content.

        Args:
            html: The HTML content
            url: The URL of the page

        Returns:
            Dictionary of metadata
        """
        metadata = {
            "source_url": url,
            "scrape_date": datetime.now().isoformat()
        }

        try:
            soup = BeautifulSoup(html, 'html.parser')

            # Extract title
            if soup.title:
                metadata["title"] = soup.title.string

            # Extract meta tags
            for meta in soup.find_all('meta'):
                # Author
                if meta.get('name') and meta.get('name').lower() == 'author' and meta.get('content'):
                    metadata["author"] = meta.get('content')

                # Description
                if meta.get('name') and meta.get('name').lower() == 'description' and meta.get('content'):
                    metadata["description"] = meta.get('content')

                # Keywords
                if meta.get('name') and meta.get('name').lower() == 'keywords' and meta.get('content'):
                    metadata["keywords"] = meta.get('content')

                # Publication date
                if meta.get('property') and meta.get('property').lower() in ['article:published_time', 'og:published_time'] and meta.get('content'):
                    metadata["publication_date"] = meta.get('content')

                # Open Graph data
                if meta.get('property') and meta.get('property').lower().startswith('og:') and meta.get('content'):
                    og_key = meta.get('property').lower().replace('og:', 'og_')
                    metadata[og_key] = meta.get('content')

            # Extract structured data (JSON-LD)
            for script in soup.find_all('script', type='application/ld+json'):
                try:
                    ld_data = json.loads(script.string)
                    if isinstance(ld_data, dict):
                        # Extract date published
                        if ld_data.get('@type') in ['Article', 'NewsArticle', 'BlogPosting'] and ld_data.get('datePublished'):
                            metadata["publication_date"] = ld_data.get('datePublished')

                        # Extract author
                        if ld_data.get('author'):
                            author = ld_data.get('author')
                            if isinstance(author, dict) and author.get('name'):
                                metadata["author"] = author.get('name')
                            elif isinstance(author, str):
                                metadata["author"] = author
                except (json.JSONDecodeError, AttributeError):
                    pass

        except Exception as e:
            logger.warning(f"Error extracting metadata: {str(e)}")

        return metadata

    async def _html_to_markdown(self, html: str) -> str:
        """
        Convert HTML to Markdown.

        Args:
            html: The HTML content

        Returns:
            Markdown content
        """
        converter = html2text.HTML2Text()
        converter.ignore_links = False
        converter.ignore_images = False
        converter.ignore_tables = False
        converter.body_width = 0  # No wrapping

        return converter.handle(html)

    async def _scrape_with_jina_reader(self, url: str) -> Tuple[Optional[str], Optional[Dict[str, str]]]:
        """
        Scrape a web page using Jina Reader API.

        Args:
            url: The URL to scrape

        Returns:
            Tuple of (content, metadata)
        """
        if not self.api_key:
            logger.warning("Jina API key not available. Using fallback method.")
            return None, None

        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {self.api_key}",
            "Accept": "application/json"
        }

        data = {
            "url": url,
            "format": "markdown"  # Request markdown format
        }

        try:
            async with aiohttp.ClientSession() as session:
                async with session.post(self.endpoint, headers=headers, json=data) as response:
                    if response.status != 200:
                        logger.warning(f"Jina Reader API error: {response.status} - {await response.text()}")
                        return None, None

                    result = await response.json()

                    if "content" not in result:
                        logger.warning(f"Jina Reader API returned no content: {result}")
                        return None, None

                    content = result.get("content", "")
                    metadata = result.get("metadata", {})

                    # Add source URL to metadata
                    metadata["source_url"] = url

                    return content, metadata

        except Exception as e:
            logger.error(f"Error calling Jina Reader API: {str(e)}")
            return None, None

    async def _scrape_with_fallback(self, url: str) -> Tuple[Optional[str], Optional[Dict[str, str]]]:
        """
        Scrape a web page using fallback method (aiohttp + BeautifulSoup).

        Args:
            url: The URL to scrape

        Returns:
            Tuple of (content, metadata)
        """
        try:
            async with aiohttp.ClientSession() as session:
                async with session.get(url, headers={"User-Agent": "Mozilla/5.0"}) as response:
                    if response.status != 200:
                        logger.warning(f"Failed to fetch URL: {url} - Status: {response.status}")
                        return None, None

                    html = await response.text()

                    # Extract metadata
                    metadata = await self._extract_metadata_from_html(html, url)

                    # Convert to markdown
                    content = await self._html_to_markdown(html)

                    return content, metadata

        except Exception as e:
            logger.error(f"Error in fallback scraping: {str(e)}")
            return None, None

    async def scrape_url(self, url: str, force_refresh: bool = False) -> Optional[Dict[str, Any]]:
        """
        Scrape a web page and store the content in the database.

        Args:
            url: The URL to scrape
            force_refresh: If True, scrape the URL even if it's already in the database

        Returns:
            Document dictionary if successful, None otherwise
        """
        # Validate URL
        if not self._validate_url(url):
            logger.warning(f"Invalid URL: {url}")
            return None

        # Normalize URL
        normalized_url = self._normalize_url(url)

        # Check if document already exists in database
        if not force_refresh and await self.db_manager.document_exists(normalized_url):
            logger.info(f"Document already exists in database: {normalized_url}")
            return await self.db_manager.get_document_by_url(normalized_url)

        # Try Jina Reader first
        content, metadata = await self._scrape_with_jina_reader(normalized_url)

        # Fallback to custom scraping if Jina Reader fails
        if content is None:
            logger.info(f"Falling back to custom scraping for URL: {normalized_url}")
            content, metadata = await self._scrape_with_fallback(normalized_url)

        if content is None or not content.strip():
            logger.warning(f"Failed to extract content from URL: {normalized_url}")
            return None

        # Count tokens
        token_count = self._count_tokens(content)

        # Compute hash for deduplication
        doc_hash = self._compute_hash(content)

        # Get title from metadata or use URL as fallback
        title = metadata.get("title", urlparse(normalized_url).netloc)

        # Store in database
        try:
            # If the document already exists (force_refresh case), update it in place
            # instead of inserting, so the UNIQUE constraint on url is not violated.
            existing = await self.db_manager.get_document_by_url(normalized_url)
            if existing:
                await self.db_manager.update_document(
                    document_id=existing['id'],
                    content=content,
                    title=title,
                    token_count=token_count,
                    metadata=metadata
                )
            else:
                await self.db_manager.add_document(
                    url=normalized_url,
                    title=title,
                    content=content,
                    content_type="markdown",
                    token_count=token_count,
                    metadata=metadata,
                    doc_hash=doc_hash
                )

            # Return the document
            return await self.db_manager.get_document_by_url(normalized_url)

        except Exception as e:
            logger.error(f"Error storing document in database: {str(e)}")
            return None

    async def scrape_urls(self, urls: List[str], force_refresh: bool = False) -> List[Dict[str, Any]]:
        """
        Scrape multiple URLs in parallel.

        Args:
            urls: List of URLs to scrape
            force_refresh: If True, scrape URLs even if they're already in the database

        Returns:
            List of document dictionaries
        """
        tasks = [self.scrape_url(url, force_refresh) for url in urls]
        results = await asyncio.gather(*tasks)

        # Filter out None results
        return [doc for doc in results if doc is not None]


# Create a singleton instance for global use
document_scraper = DocumentScraper()

def get_document_scraper() -> DocumentScraper:
    """
    Get the global document scraper instance.

    Returns:
        DocumentScraper instance
    """
    return document_scraper

# Example usage
async def test_scraper():
    """Test the document scraper with a sample URL."""
    from report.database.db_manager import initialize_database

    # Initialize database
    await initialize_database()

    # Scrape a URL
    scraper = get_document_scraper()
    document = await scraper.scrape_url("https://en.wikipedia.org/wiki/Web_scraping")

    if document:
        print(f"Successfully scraped document: {document['title']}")
        print(f"Token count: {document['token_count']}")
        print(f"Content preview: {document['content'][:500]}...")
    else:
        print("Failed to scrape document")

# Run test if this module is executed directly
if __name__ == "__main__":
    asyncio.run(test_scraper())
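The scrape_url flow above deduplicates by normalized URL: a repeat call is served from SQLite unless force_refresh=True, in which case the stored row is refreshed. A small sketch of that behaviour follows; it is illustrative, not part of the commit, and assumes network access plus a working config.config.get_config() (the DocumentScraper singleton is built at import time).

# --- Illustrative usage (not part of this commit) ---
import asyncio

from report.database.db_manager import initialize_database
from report.document_scraper import get_document_scraper

async def main():
    await initialize_database()
    scraper = get_document_scraper()

    url = "https://en.wikipedia.org/wiki/Web_scraping"

    # First call scrapes the page (Jina Reader if a key is configured,
    # otherwise the aiohttp + BeautifulSoup fallback) and stores it.
    first = await scraper.scrape_url(url)

    # Second call is answered from SQLite: the normalized URL already exists.
    cached = await scraper.scrape_url(url)

    # force_refresh=True re-scrapes and refreshes the stored row.
    refreshed = await scraper.scrape_url(url, force_refresh=True)

    for doc in (first, cached, refreshed):
        if doc:
            print(doc["id"], doc["title"], doc["token_count"])

if __name__ == "__main__":
    asyncio.run(main())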
@@ -0,0 +1,130 @@
"""
Report generator module for the intelligent research system.

This module provides functionality to generate reports from search results
by scraping documents, storing them in a database, and synthesizing them
into a comprehensive report.
"""

import os
import asyncio
import logging
from typing import Dict, List, Any, Optional, Tuple, Union

from report.database.db_manager import get_db_manager, initialize_database
from report.document_scraper import get_document_scraper

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

class ReportGenerator:
    """
    Report generator for the intelligent research system.

    This class provides methods to generate reports from search results
    by scraping documents, storing them in a database, and synthesizing them
    into a comprehensive report.
    """

    def __init__(self):
        """Initialize the report generator."""
        self.db_manager = get_db_manager()
        self.document_scraper = get_document_scraper()

    async def initialize(self):
        """Initialize the report generator by setting up the database."""
        await initialize_database()
        logger.info("Report generator initialized")

    async def process_search_results(self, search_results: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """
        Process search results by scraping the URLs and storing them in the database.

        Args:
            search_results: List of search results, each containing at least a 'url' field

        Returns:
            List of processed documents
        """
        # Extract URLs from search results
        urls = [result.get('url') for result in search_results if result.get('url')]

        # Scrape URLs and store in database
        documents = await self.document_scraper.scrape_urls(urls)

        # Log results
        logger.info(f"Processed {len(documents)} documents out of {len(urls)} URLs")

        return documents

    async def get_document_by_url(self, url: str) -> Optional[Dict[str, Any]]:
        """
        Get a document by its URL.

        Args:
            url: URL of the document

        Returns:
            Document as a dictionary, or None if not found
        """
        return await self.db_manager.get_document_by_url(url)

    async def search_documents(self, query: str, limit: int = 10) -> List[Dict[str, Any]]:
        """
        Search for documents in the database.

        Args:
            query: Search query
            limit: Maximum number of results to return

        Returns:
            List of matching documents
        """
        return await self.db_manager.search_documents(query, limit)


# Create a singleton instance for global use
report_generator = ReportGenerator()

async def initialize_report_generator():
    """Initialize the report generator."""
    await report_generator.initialize()

def get_report_generator() -> ReportGenerator:
    """
    Get the global report generator instance.

    Returns:
        ReportGenerator instance
    """
    return report_generator

# Example usage
async def test_report_generator():
    """Test the report generator with sample search results."""
    # Initialize report generator
    await initialize_report_generator()

    # Sample search results
    search_results = [
        {"url": "https://en.wikipedia.org/wiki/Web_scraping", "title": "Web scraping - Wikipedia"},
        {"url": "https://en.wikipedia.org/wiki/Natural_language_processing", "title": "Natural language processing - Wikipedia"}
    ]

    # Process search results
    generator = get_report_generator()
    documents = await generator.process_search_results(search_results)

    # Print results
    print(f"Processed {len(documents)} documents")
    for doc in documents:
        print(f"Title: {doc['title']}")
        print(f"URL: {doc['url']}")
        print(f"Token count: {doc['token_count']}")
        print(f"Content preview: {doc['content'][:200]}...")
        print("-" * 80)

# Run test if this module is executed directly
if __name__ == "__main__":
    asyncio.run(test_report_generator())
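The only contract process_search_results relies on is that each search result is a dict with a 'url' key; entries without one are skipped. The sketch below makes that explicit. It is illustrative, not part of the commit, and the search-result entries are made up.

# --- Illustrative usage (not part of this commit) ---
import asyncio

from report.report_generator import get_report_generator, initialize_report_generator

async def main():
    await initialize_report_generator()
    generator = get_report_generator()

    # Hypothetical upstream search results: only the 'url' key matters here.
    search_results = [
        {"url": "https://en.wikipedia.org/wiki/SQLite", "title": "SQLite - Wikipedia"},
        {"title": "A result without a URL is silently skipped"},
    ]

    documents = await generator.process_search_results(search_results)
    print(f"Stored {len(documents)} of {len(search_results)} results")

    # The stored corpus can then be queried ahead of the synthesis step
    # planned for a later phase.
    for doc in await generator.search_documents("SQLite", limit=5):
        print(doc["url"], doc["token_count"])

if __name__ == "__main__":
    asyncio.run(main())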
@@ -5,3 +5,11 @@ litellm>=1.0.0
gradio>=4.0.0
pyyaml>=6.0
python-dotenv>=1.0.0
beautifulsoup4>=4.12.0
aiosqlite>=0.19.0
asyncio>=3.4.3
aiohttp>=3.9.0
validators>=0.22.0
markdown>=3.5.0
html2text>=2020.1.16
feedparser>=6.0.10
@@ -0,0 +1,3 @@
"""
Test modules for the intelligent research system.
"""
@@ -0,0 +1,82 @@
"""
Test script for the document scraper module.

This script tests the functionality of the document scraper module
by scraping a few sample URLs and storing them in the database.
"""

import os
import sys
import asyncio
import logging
from typing import List, Dict, Any

# Add parent directory to path to allow importing modules
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from report.database.db_manager import initialize_database, get_db_manager
from report.document_scraper import get_document_scraper

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Sample URLs for testing
TEST_URLS = [
    "https://en.wikipedia.org/wiki/Web_scraping",
    "https://en.wikipedia.org/wiki/Natural_language_processing",
    "https://en.wikipedia.org/wiki/SQLite"
]

async def test_document_scraper():
    """Test the document scraper with sample URLs."""
    # Initialize database
    await initialize_database()
    logger.info("Database initialized")

    # Get document scraper
    scraper = get_document_scraper()

    # Scrape URLs
    logger.info(f"Scraping {len(TEST_URLS)} URLs...")
    documents = await scraper.scrape_urls(TEST_URLS)

    # Print results
    logger.info(f"Successfully scraped {len(documents)} documents")
    for doc in documents:
        logger.info(f"Title: {doc['title']}")
        logger.info(f"URL: {doc['url']}")
        logger.info(f"Token count: {doc['token_count']}")
        logger.info(f"Content preview: {doc['content'][:200]}...")
        logger.info("-" * 80)

    # Test database search
    db_manager = get_db_manager()
    search_results = await db_manager.search_documents("scraping")
    logger.info(f"Found {len(search_results)} documents matching 'scraping'")

    # Test document retrieval by URL
    doc = await db_manager.get_document_by_url(TEST_URLS[0])
    if doc:
        logger.info(f"Retrieved document by URL: {doc['title']}")
    else:
        logger.error(f"Failed to retrieve document by URL: {TEST_URLS[0]}")

    # Count documents in database
    count = await db_manager.count_documents()
    logger.info(f"Total documents in database: {count}")

    return True

if __name__ == "__main__":
    try:
        success = asyncio.run(test_document_scraper())
        if success:
            logger.info("All tests passed!")
            sys.exit(0)
        else:
            logger.error("Tests failed!")
            sys.exit(1)
    except Exception as e:
        logger.exception(f"Error running tests: {str(e)}")
        sys.exit(1)
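The script above is a standalone check run via asyncio.run. If the project later adopts pytest, an equivalent test could look roughly like the sketch below; it assumes pytest and pytest-asyncio, neither of which is in requirements.txt as of this commit, and it still needs network access like the original script.

# --- Sketch only (not part of this commit): the same checks as a pytest-asyncio test ---
# Assumes `pytest` and `pytest-asyncio` are installed.
import pytest

from report.database.db_manager import get_db_manager, initialize_database
from report.document_scraper import get_document_scraper

@pytest.mark.asyncio
async def test_scrape_and_store():
    await initialize_database()
    scraper = get_document_scraper()

    docs = await scraper.scrape_urls(["https://en.wikipedia.org/wiki/SQLite"])
    assert docs, "expected at least one scraped document"
    assert docs[0]["token_count"] > 0

    db = get_db_manager()
    assert await db.count_documents() >= 1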