# ira/report/report_generator.py
"""
Report generator module for the intelligent research system.
This module provides functionality to generate reports from search results
by scraping documents, storing them in a database, and synthesizing them
into a comprehensive report.
"""
import os
import asyncio
import logging
from typing import Dict, List, Any, Optional, Tuple, Union
from report.database.db_manager import get_db_manager, initialize_database
from report.document_scraper import get_document_scraper
# Configure logging
# NOTE(review): calling basicConfig at import time configures the *root* logger
# for every module in the process; consider moving this to the application
# entry point so library consumers keep control of logging setup.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
class ReportGenerator:
    """
    Generates research reports from search results.

    Orchestrates a pipeline of scraping result URLs, persisting the
    scraped documents through the database manager, and retrieving
    stored documents for later synthesis.
    """

    def __init__(self):
        """Set up collaborators: the database manager and the document scraper."""
        self.db_manager = get_db_manager()
        self.document_scraper = get_document_scraper()

    async def initialize(self):
        """Prepare the backing database; must be awaited before first use."""
        await initialize_database()
        logger.info("Report generator initialized")

    async def process_search_results(self, search_results: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """
        Scrape the URLs found in *search_results* and persist the documents.

        Args:
            search_results: Search hits; each dict should carry a 'url' key.
                Entries with a missing or empty 'url' are skipped.

        Returns:
            The documents that were successfully scraped and stored.
        """
        # Keep only the results that actually provide a usable URL.
        target_urls = [hit.get('url') for hit in search_results if hit.get('url')]

        # Scraping also stores the documents via the scraper's own db access.
        scraped_docs = await self.document_scraper.scrape_urls(target_urls)

        logger.info(f"Processed {len(scraped_docs)} documents out of {len(target_urls)} URLs")
        return scraped_docs

    async def get_document_by_url(self, url: str) -> Optional[Dict[str, Any]]:
        """
        Look up a previously stored document.

        Args:
            url: Source URL of the document.

        Returns:
            The stored document as a dict, or None when no match exists.
        """
        return await self.db_manager.get_document_by_url(url)

    async def search_documents(self, query: str, limit: int = 10) -> List[Dict[str, Any]]:
        """
        Search the stored documents.

        Args:
            query: Text to search for.
            limit: Cap on the number of returned documents.

        Returns:
            Matching documents, at most *limit* of them.
        """
        return await self.db_manager.search_documents(query, limit)
# Create a singleton instance for global use.
# Constructed eagerly at import time; callers obtain it via get_report_generator()
# and must await initialize_report_generator() before first use.
report_generator: ReportGenerator = ReportGenerator()
async def initialize_report_generator():
    """Prepare the shared report generator for use (sets up its database)."""
    await get_report_generator().initialize()
def get_report_generator() -> ReportGenerator:
    """
    Accessor for the module-level singleton.

    Returns:
        The shared ReportGenerator instance created at import time.
    """
    return report_generator
# Example usage
async def test_report_generator():
    """Smoke-test the report generator against two sample Wikipedia pages."""
    await initialize_report_generator()

    # Two well-known pages stand in for real search-engine output.
    sample_results = [
        {"url": "https://en.wikipedia.org/wiki/Web_scraping", "title": "Web scraping - Wikipedia"},
        {"url": "https://en.wikipedia.org/wiki/Natural_language_processing", "title": "Natural language processing - Wikipedia"},
    ]

    generator = get_report_generator()
    processed = await generator.process_search_results(sample_results)

    # Show what was scraped and stored.
    print(f"Processed {len(processed)} documents")
    for document in processed:
        print(f"Title: {document['title']}")
        print(f"URL: {document['url']}")
        print(f"Token count: {document['token_count']}")
        print(f"Content preview: {document['content'][:200]}...")
        print("-" * 80)
# Run test if this module is executed directly (performs live network scraping).
if __name__ == "__main__":
    asyncio.run(test_report_generator())