131 lines
4.2 KiB
Python
131 lines
4.2 KiB
Python
"""
|
|
Report generator module for the intelligent research system.
|
|
|
|
This module provides functionality to generate reports from search results
|
|
by scraping documents, storing them in a database, and synthesizing them
|
|
into a comprehensive report.
|
|
"""
|
|
|
|
import os
|
|
import asyncio
|
|
import logging
|
|
from typing import Dict, List, Any, Optional, Tuple, Union
|
|
|
|
from report.database.db_manager import get_db_manager, initialize_database
|
|
from report.document_scraper import get_document_scraper
|
|
|
|
# Configure logging.
# basicConfig() acts on the root logger and runs when this module is imported.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Logger named after this module; used by the code below.
logger = logging.getLogger(__name__)
|
|
|
|
class ReportGenerator:
    """
    Report generator for the intelligent research system.

    The methods defined here delegate to two collaborators: a document
    scraper (turning search-result URLs into documents) and a database
    manager (looking up and searching stored documents).

    Attributes:
        db_manager: Object returned by ``get_db_manager()``.
        document_scraper: Object returned by ``get_document_scraper()``.
    """

    def __init__(self) -> None:
        """Initialize the report generator with its database manager and scraper."""
        self.db_manager = get_db_manager()
        self.document_scraper = get_document_scraper()

    async def initialize(self) -> None:
        """Initialize the report generator by setting up the database."""
        await initialize_database()
        logger.info("Report generator initialized")

    async def process_search_results(self, search_results: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """
        Process search results by scraping their URLs.

        Results without a truthy 'url' value are ignored. A URL that occurs
        in several results is scraped only once, in first-seen order. When
        there is nothing to scrape (empty or ``None`` input, or no usable
        URLs) the scraper is not called and an empty list is returned.

        Args:
            search_results: List of search results, each containing at least a 'url' field

        Returns:
            List of processed documents, as returned by the scraper's
            ``scrape_urls``. Whether the scraper also stores them in the
            database is not visible from this class.
        """
        # Look each URL up once, drop missing/empty ones, and de-duplicate.
        # dict.fromkeys keeps insertion order, so the scrape order still
        # follows the order of the search results.
        candidate_urls = (result.get('url') for result in (search_results or []))
        urls = list(dict.fromkeys(url for url in candidate_urls if url))

        if not urls:
            # Nothing to do: avoid a pointless round trip to the scraper.
            logger.info("No URLs to process in search results")
            return []

        # Scrape URLs
        documents = await self.document_scraper.scrape_urls(urls)

        # Lazy %-style arguments: the message is only formatted if emitted.
        logger.info("Processed %d documents out of %d URLs", len(documents), len(urls))

        return documents

    async def get_document_by_url(self, url: str) -> Optional[Dict[str, Any]]:
        """
        Get a document by its URL.

        Args:
            url: URL of the document

        Returns:
            Whatever ``db_manager.get_document_by_url`` returns; annotated as
            a document dictionary, or None if not found
        """
        return await self.db_manager.get_document_by_url(url)

    async def search_documents(self, query: str, limit: int = 10) -> List[Dict[str, Any]]:
        """
        Search for documents in the database.

        Args:
            query: Search query
            limit: Maximum number of results to return (passed through to
                the database manager)

        Returns:
            List of matching documents, as returned by
            ``db_manager.search_documents``
        """
        return await self.db_manager.search_documents(query, limit)
|
|
|
|
|
|
# Create a singleton instance for global use
|
|
# Built when this module is imported, so importing it runs
# ReportGenerator.__init__ (which calls get_db_manager() and
# get_document_scraper()).
report_generator = ReportGenerator()
|
|
|
|
async def initialize_report_generator():
    """Await ``initialize()`` on the module-level report generator instance."""
    shared_generator = report_generator
    await shared_generator.initialize()
|
|
|
|
def get_report_generator() -> ReportGenerator:
    """
    Return the module-level ``report_generator`` object.

    Every call hands back the same shared instance; nothing is created here.

    Returns:
        The global ReportGenerator instance
    """
    shared_generator = report_generator
    return shared_generator
|
|
|
|
# Example usage
|
|
async def test_report_generator():
    """
    Manual demo: initialize the generator, process two sample search
    results, and print a summary of each returned document.
    """
    # Set up the shared generator before using it
    await initialize_report_generator()

    # Sample (url, title) pairs turned into search-result dictionaries
    sample_pages = (
        ("https://en.wikipedia.org/wiki/Web_scraping", "Web scraping - Wikipedia"),
        ("https://en.wikipedia.org/wiki/Natural_language_processing", "Natural language processing - Wikipedia"),
    )
    search_results = [{"url": page_url, "title": page_title} for page_url, page_title in sample_pages]

    # Run the sample results through the shared generator
    scraped = await get_report_generator().process_search_results(search_results)

    # Report what came back, one section per document
    separator = "-" * 80
    print(f"Processed {len(scraped)} documents")
    for document in scraped:
        print(f"Title: {document['title']}")
        print(f"URL: {document['url']}")
        print(f"Token count: {document['token_count']}")
        print(f"Content preview: {document['content'][:200]}...")
        print(separator)
|
|
|
|
# Run test if this module is executed directly
|
|
if __name__ == "__main__":
|
|
asyncio.run(test_report_generator())
|