ira/report/report_generator.py

"""
Report generator module for the intelligent research system.
This module provides functionality to generate reports from search results
by scraping documents, storing them in a database, and synthesizing them
into a comprehensive report.
"""
import asyncio
import logging
from typing import Dict, List, Any, Optional, Tuple

from report.database.db_manager import get_db_manager, initialize_database
from report.document_scraper import get_document_scraper
from report.document_processor import get_document_processor

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)


class ReportGenerator:
    """
    Report generator for the intelligent research system.

    This class provides methods to generate reports from search results
    by scraping documents, storing them in a database, and synthesizing them
    into a comprehensive report.
    """

    def __init__(self):
        """Initialize the report generator."""
        self.db_manager = get_db_manager()
        self.document_scraper = get_document_scraper()
        self.document_processor = get_document_processor()

    async def initialize(self):
        """Initialize the report generator by setting up the database."""
        await initialize_database()
        logger.info("Report generator initialized")

    async def process_search_results(self, search_results: List[Dict[str, Any]]) -> Tuple[List[Dict[str, Any]], Dict[str, float]]:
        """
        Process search results by scraping the URLs and storing them in the database.

        Args:
            search_results: List of search results, each containing at least a 'url' field

        Returns:
            Tuple of (processed documents, mapping of URL to relevance score)
        """
        # Extract URLs from search results
        urls = [result.get('url') for result in search_results if result.get('url')]

        # Extract relevance scores if available
        relevance_scores = {}
        for result in search_results:
            if result.get('url') and result.get('score') is not None:
                relevance_scores[result.get('url')] = result.get('score')

        # Scrape URLs and store in database
        documents = await self.document_scraper.scrape_urls(urls)

        # Log results
        logger.info(f"Processed {len(documents)} documents out of {len(urls)} URLs")

        return documents, relevance_scores

    async def get_document_by_url(self, url: str) -> Optional[Dict[str, Any]]:
        """
        Get a document by its URL.

        Args:
            url: URL of the document

        Returns:
            Document as a dictionary, or None if not found
        """
        return await self.db_manager.get_document_by_url(url)

    async def search_documents(self, query: str, limit: int = 10) -> List[Dict[str, Any]]:
        """
        Search for documents in the database.

        Args:
            query: Search query
            limit: Maximum number of results to return

        Returns:
            List of matching documents
        """
        return await self.db_manager.search_documents(query, limit)

    async def prepare_documents_for_report(self,
                                           search_results: List[Dict[str, Any]],
                                           token_budget: Optional[int] = None,
                                           chunk_size: int = 1000,
                                           overlap_size: int = 100) -> List[Dict[str, Any]]:
        """
        Prepare documents for report generation by processing search results,
        prioritizing documents, and chunking them to fit within token budget.

        Args:
            search_results: List of search results
            token_budget: Maximum number of tokens to use
            chunk_size: Maximum number of tokens per chunk
            overlap_size: Number of tokens to overlap between chunks

        Returns:
            List of selected document chunks
        """
        # Process search results to get documents and relevance scores
        documents, relevance_scores = await self.process_search_results(search_results)

        # Prioritize and chunk documents
        selected_chunks = self.document_processor.process_documents_for_report(
            documents,
            relevance_scores,
            token_budget,
            chunk_size,
            overlap_size
        )

        return selected_chunks
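
    # Note: each selected chunk is assumed to be a dict with at least a
    # 'content' field, plus 'title', 'url', 'chunk_type', and 'priority_score'
    # metadata. This reflects the fields generate_report reads below; the
    # document processor remains the authoritative source for the actual schema.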

    async def generate_report(self,
                              search_results: List[Dict[str, Any]],
                              query: str,
                              token_budget: Optional[int] = None,
                              chunk_size: int = 1000,
                              overlap_size: int = 100) -> str:
        """
        Generate a report from search results.

        Args:
            search_results: List of search results
            query: Original search query
            token_budget: Maximum number of tokens to use
            chunk_size: Maximum number of tokens per chunk
            overlap_size: Number of tokens to overlap between chunks

        Returns:
            Generated report as a string
        """
        # Prepare documents for report
        selected_chunks = await self.prepare_documents_for_report(
            search_results,
            token_budget,
            chunk_size,
            overlap_size
        )

        # TODO: Implement report synthesis using LLM
        # For now, just return a placeholder report
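        #
        # A minimal sketch of what that synthesis step might look like, assuming
        # a hypothetical async `llm_client.complete()` helper (not part of this
        # codebase); a real implementation would also need prompt-size limits
        # and citation handling:
        #
        #     context = "\n\n".join(chunk.get('content', '') for chunk in selected_chunks)
        #     prompt = (
        #         f"Write a report answering the query: {query}\n\n"
        #         f"Use only the following source material:\n{context}"
        #     )
        #     return await llm_client.complete(prompt)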
report = f"# Report for: {query}\n\n"
report += f"Based on {len(selected_chunks)} document chunks\n\n"
# Add document summaries
for i, chunk in enumerate(selected_chunks[:5]): # Show first 5 chunks
report += f"## Document {i+1}: {chunk.get('title', 'Untitled')}\n"
report += f"Source: {chunk.get('url', 'Unknown')}\n"
report += f"Chunk type: {chunk.get('chunk_type', 'Unknown')}\n"
report += f"Priority score: {chunk.get('priority_score', 0.0):.2f}\n\n"
# Add a snippet of the content
content = chunk.get('content', '')
snippet = content[:200] + "..." if len(content) > 200 else content
report += f"{snippet}\n\n"
return report


# Create a singleton instance for global use
report_generator = ReportGenerator()


async def initialize_report_generator():
    """Initialize the report generator."""
    await report_generator.initialize()


def get_report_generator() -> ReportGenerator:
    """
    Get the global report generator instance.

    Returns:
        ReportGenerator instance
    """
    return report_generator
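
# Example usage (a sketch; assumes an async entry point and that the scraper
# can reach the given URLs):
#
#     await initialize_report_generator()
#     generator = get_report_generator()
#     report = await generator.generate_report(search_results, "my query")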


async def test_report_generator():
    """Test the report generator with sample search results."""
    # Initialize the report generator
    await initialize_report_generator()

    # Sample search results
    search_results = [
        {
            'title': 'Example Document 1',
            'url': 'https://example.com/doc1',
            'snippet': 'This is an example document.',
            'score': 0.95
        },
        {
            'title': 'Example Document 2',
            'url': 'https://example.com/doc2',
            'snippet': 'This is another example document.',
            'score': 0.85
        },
        {
            'title': 'Python Documentation',
            'url': 'https://docs.python.org/3/',
            'snippet': 'Official Python documentation.',
            'score': 0.75
        }
    ]

    # Process search results
    documents, relevance_scores = await report_generator.process_search_results(search_results)

    # Print documents
    print(f"Processed {len(documents)} documents")
    for doc in documents:
        print(f"Document: {doc.get('title')} ({doc.get('url')})")
        print(f"Token count: {doc.get('token_count')}")
        # Guard against missing content so the slice below cannot fail on None
        print(f"Content snippet: {doc.get('content', '')[:100]}...")
        print()

    # Generate report
    report = await report_generator.generate_report(search_results, "Python programming")

    # Print report
    print("Generated Report:")
    print(report)


# Run test if this module is executed directly
if __name__ == "__main__":
    asyncio.run(test_report_generator())