# ira/report/report_generator.py

"""
Report generator module for the intelligent research system.

This module provides functionality to generate reports from search results
by scraping documents, storing them in a database, and synthesizing them
into a comprehensive report.
"""

import os
import asyncio
import logging
from typing import Dict, List, Any, Optional, Tuple, Union

from report.database.db_manager import get_db_manager, initialize_database
from report.document_scraper import get_document_scraper
from report.document_processor import get_document_processor
from report.report_synthesis import get_report_synthesizer
from report.progressive_report_synthesis import get_progressive_report_synthesizer
from report.report_detail_levels import get_report_detail_level_manager, DetailLevel

# Module-wide logging setup: INFO threshold, records rendered as
# "timestamp - logger name - level - message".
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
# Logger named after this module; used by everything defined below.
logger = logging.getLogger(__name__)

class ReportGenerator:
    """
    Report generator for the intelligent research system.

    Orchestrates the pipeline from search results to a finished report:
    scraping the result URLs, selecting and chunking the scraped documents
    within a token budget, and handing the selected chunks to a synthesizer.
    The standard synthesizer is used for every detail level except
    "comprehensive", which is routed to the progressive synthesizer.
    """

    def __init__(self):
        """Initialize the report generator with its collaborators and defaults."""
        self.db_manager = get_db_manager()
        self.document_scraper = get_document_scraper()
        self.document_processor = get_document_processor()
        self.report_synthesizer = get_report_synthesizer()
        self.progressive_report_synthesizer = get_progressive_report_synthesizer()
        self.detail_level_manager = get_report_detail_level_manager()
        self.detail_level = "standard"  # Default detail level
        self.model_name = None  # Will use default model based on detail level

    async def initialize(self):
        """Initialize the report generator by setting up the database."""
        await initialize_database()
        logger.info("Report generator initialized")

    def set_detail_level(self, detail_level: str) -> None:
        """
        Set the detail level for report generation.

        The level is validated by looking up its configuration.  If that
        configuration names a model different from the one currently in use,
        both synthesizers are re-created for the new model.

        Args:
            detail_level: Detail level (brief, standard, detailed, comprehensive)

        Raises:
            ValueError: Propagated (after being logged) when the configuration
                lookup for the detail level raises it.
        """
        try:
            # Looking up the configuration doubles as validation of the level
            config = self.detail_level_manager.get_detail_level_config(detail_level)
            self.detail_level = detail_level

            # Only rebuild the synthesizers when the configured model changed
            model = config.get("model")
            if model and model != self.model_name:
                self.model_name = model
                self.report_synthesizer = get_report_synthesizer(model)
                self.progressive_report_synthesizer = get_progressive_report_synthesizer(model)

            logger.info(f"Detail level set to {detail_level} with model {model}")
        except ValueError as e:
            logger.error(f"Error setting detail level: {e}")
            raise

    def get_detail_level_config(self) -> Dict[str, Any]:
        """
        Get the current detail level configuration.

        Returns:
            Dictionary of configuration parameters for the current detail level
        """
        return self.detail_level_manager.get_detail_level_config(self.detail_level)

    def get_available_detail_levels(self) -> List[Tuple[str, str]]:
        """
        Get a list of available detail levels with descriptions.

        Returns:
            List of tuples containing detail level and description
        """
        return self.detail_level_manager.get_available_detail_levels()

    async def process_search_results(
        self, search_results: List[Dict[str, Any]]
    ) -> Tuple[List[Dict[str, Any]], Dict[str, Any]]:
        """
        Process search results by scraping their URLs.

        Results without a (truthy) 'url' are ignored.  Each distinct URL is
        scraped once, in first-seen order, even if it appears in several
        search results.

        Args:
            search_results: List of search results, each containing at least a 'url' field

        Returns:
            Tuple ``(documents, relevance_scores)`` where ``documents`` is
            whatever the document scraper returned for the URLs and
            ``relevance_scores`` maps URL to the result's 'score' (only for
            results that carry a non-None score; for a repeated URL the last
            such score wins).
        """
        # A dict is used as an ordered set so duplicates are dropped while the
        # original result order is kept.
        unique_urls: Dict[str, None] = {}
        relevance_scores: Dict[str, Any] = {}
        for result in search_results:
            url = result.get('url')
            if not url:
                continue
            unique_urls.setdefault(url, None)

            score = result.get('score')
            if score is not None:
                relevance_scores[url] = score

        urls = list(unique_urls)

        # Scrape URLs (the scraper is expected to handle persistence)
        documents = await self.document_scraper.scrape_urls(urls)

        # Log results
        logger.info(f"Processed {len(documents)} documents out of {len(urls)} URLs")

        return documents, relevance_scores

    async def get_document_by_url(self, url: str) -> Optional[Dict[str, Any]]:
        """
        Get a document by its URL.

        Args:
            url: URL of the document

        Returns:
            Document as a dictionary, or None if not found
        """
        return await self.db_manager.get_document_by_url(url)

    async def search_documents(self, query: str, limit: int = 10) -> List[Dict[str, Any]]:
        """
        Search for documents in the database.

        Args:
            query: Search query
            limit: Maximum number of results to return

        Returns:
            List of matching documents
        """
        return await self.db_manager.search_documents(query, limit)

    async def prepare_documents_for_report(self,
                                         search_results: List[Dict[str, Any]],
                                         token_budget: Optional[int] = None,
                                         chunk_size: Optional[int] = None,
                                         overlap_size: Optional[int] = None) -> List[Dict[str, Any]]:
        """
        Prepare documents for report generation by processing search results,
        prioritizing documents, and chunking them to fit within token budget.

        Any of the sizing arguments left as None is taken from the current
        detail level configuration (chunk_size falls back to 1000 and
        overlap_size to 100 when the configuration does not define them).

        Args:
            search_results: List of search results
            token_budget: Maximum number of tokens to use
            chunk_size: Maximum number of tokens per chunk
            overlap_size: Number of tokens to overlap between chunks

        Returns:
            List of selected document chunks
        """
        # Get configuration from detail level if not specified
        config = self.get_detail_level_config()

        if token_budget is None:
            token_budget = config.get("token_budget")

        if chunk_size is None:
            chunk_size = config.get("chunk_size", 1000)

        if overlap_size is None:
            overlap_size = config.get("overlap_size", 100)

        logger.info(f"Preparing documents with token_budget={token_budget}, chunk_size={chunk_size}, overlap_size={overlap_size}")

        # Process search results to get documents and relevance scores
        documents, relevance_scores = await self.process_search_results(search_results)

        # Prioritize and chunk documents
        selected_chunks = self.document_processor.process_documents_for_report(
            documents,
            relevance_scores,
            token_budget,
            chunk_size,
            overlap_size
        )

        return selected_chunks

    async def generate_report(self,
                             search_results: List[Dict[str, Any]],
                             query: str,
                             token_budget: Optional[int] = None,
                             chunk_size: Optional[int] = None,
                             overlap_size: Optional[int] = None,
                             detail_level: Optional[str] = None) -> str:
        """
        Generate a report from search results.

        Note that passing ``detail_level`` changes the generator's detail
        level for this and all subsequent calls on the same instance.

        Args:
            search_results: List of search results
            query: Original search query
            token_budget: Maximum number of tokens to use
            chunk_size: Maximum number of tokens per chunk
            overlap_size: Number of tokens to overlap between chunks
            detail_level: Level of detail for the report (brief, standard, detailed, comprehensive)

        Returns:
            Generated report as a string

        Raises:
            ValueError: If ``detail_level`` is given and rejected by
                ``set_detail_level``.
        """
        # Set detail level if specified
        if detail_level:
            self.set_detail_level(detail_level)

        # Prepare documents for report
        selected_chunks = await self.prepare_documents_for_report(
            search_results,
            token_budget,
            chunk_size,
            overlap_size
        )

        # Choose the appropriate synthesizer based on detail level
        if self.detail_level.lower() == "comprehensive":
            # Use progressive report synthesizer for comprehensive detail level
            logger.info(f"Using progressive report synthesizer for {self.detail_level} detail level")
            synthesizer = self.progressive_report_synthesizer
        else:
            # Use standard report synthesizer for other detail levels
            logger.info(f"Using standard report synthesizer for {self.detail_level} detail level")
            synthesizer = self.report_synthesizer

        report = await synthesizer.synthesize_report(
            selected_chunks,
            query,
            detail_level=self.detail_level
        )

        return report


# Create a singleton instance for global use.
# NOTE: this is constructed at import time, so importing this module runs
# ReportGenerator.__init__ and every get_* factory call it makes.
report_generator = ReportGenerator()

async def initialize_report_generator():
    """
    Prepare the shared report generator for use.

    Awaits ``initialize()`` on the module-level ``report_generator`` instance.
    """
    shared_generator = report_generator
    await shared_generator.initialize()

def get_report_generator() -> ReportGenerator:
    """
    Return the shared, module-level report generator.

    Returns:
        The ReportGenerator singleton created when this module was imported
    """
    shared_generator = report_generator
    return shared_generator

async def test_report_generator(use_mock: bool = False):
    """
    Test the report generator with sample search results.

    Runs the shared generator end to end: scrapes three sample URLs, prints a
    short summary of each scraped document, then generates and prints a
    report for the query "Python programming".  Any exception is logged with
    its traceback rather than propagated.

    Args:
        use_mock: If True, use mock data instead of making actual API calls
    """
    # Initialize the report generator
    await initialize_report_generator()

    # Install the (optionally mocked) scraper on the shared generator so the
    # use_mock flag actually governs the scraping done below.
    report_generator.document_scraper = get_document_scraper(use_mock=use_mock)

    # Sample search results with real, accessible URLs
    search_results = [
        {
            'title': 'Python Documentation',
            'url': 'https://docs.python.org/3/tutorial/index.html',
            'snippet': 'The Python Tutorial.',
            'score': 0.95
        },
        {
            'title': 'Python Requests Library',
            'url': 'https://requests.readthedocs.io/en/latest/',
            'snippet': 'Requests is an elegant and simple HTTP library for Python.',
            'score': 0.85
        },
        {
            'title': 'Real Python',
            'url': 'https://realpython.com/',
            'snippet': 'Python tutorials for developers of all skill levels.',
            'score': 0.75
        }
    ]

    try:
        # Process search results (scores are not needed for the printout)
        documents, _relevance_scores = await report_generator.process_search_results(search_results)

        # Print documents
        print(f"Processed {len(documents)} documents")
        for doc in documents:
            print(f"Document: {doc.get('title')} ({doc.get('url')})")
            print(f"Token count: {doc.get('token_count')}")
            content = doc.get('content')
            # Show at most the first 100 characters of the content
            content_preview = content[:100] + '...' if content else 'No content'
            print(f"Content snippet: {content_preview}")
            print()

        # Generate report
        report = await report_generator.generate_report(search_results, "Python programming")

        # Print report
        print("Generated Report:")
        print(report)
    except Exception as e:
        # Top-level boundary of a manual smoke test: log with traceback, don't crash
        logger.exception(f"Error during report generation test: {str(e)}")

# Script entry point: run the smoke test above, optionally with mock data
if __name__ == "__main__":
    import argparse

    cli_parser = argparse.ArgumentParser(description='Test the report generator')
    cli_parser.add_argument(
        '--mock',
        action='store_true',
        help='Use mock data instead of making actual API calls',
    )
    args = cli_parser.parse_args()

    print(f"Running test with {'mock data' if args.mock else 'real data'}")
    asyncio.run(test_report_generator(use_mock=args.mock))