"""
Document processor for the report generation module.

This module provides functionality to prioritize documents based on relevance scores,
chunk long documents into manageable pieces, and select the most relevant chunks
to stay within token budget limits.
"""

import re
import math
import logging
import tiktoken
from typing import Dict, List, Any, Optional
from datetime import datetime

from report.database.db_manager import get_db_manager

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)


class DocumentProcessor:
    """
    Document processor for the report generation module.

    This class provides methods to prioritize documents based on relevance scores,
    chunk long documents into manageable pieces, and select the most relevant chunks
    to stay within token budget limits.
    """

    def __init__(self, default_token_limit: int = 120000):
        """
        Initialize the document processor.

        Args:
            default_token_limit: Default token limit for the context window
        """
        self.db_manager = get_db_manager()
        self.default_token_limit = default_token_limit
        self.tokenizer = tiktoken.get_encoding("cl100k_base")  # OpenAI's cl100k_base tokenizer

    def _count_tokens(self, text: str) -> int:
        """
        Count the number of tokens in a text.

        Args:
            text: The text to count tokens for

        Returns:
            Number of tokens in the text
        """
        return len(self.tokenizer.encode(text))

    def prioritize_documents(self, documents: List[Dict[str, Any]],
                             relevance_scores: Optional[Dict[str, float]] = None,
                             recency_weight: float = 0.3,
                             token_count_weight: float = 0.2) -> List[Dict[str, Any]]:
        """
        Prioritize documents based on relevance scores, recency, and token count.

        Args:
            documents: List of documents to prioritize
            relevance_scores: Dictionary mapping document URLs to relevance scores
            recency_weight: Weight for recency in the prioritization score
            token_count_weight: Weight for token count in the prioritization score

        Returns:
            List of documents sorted by priority score
        """
        # If no relevance scores are provided, use equal scores for all documents
        if relevance_scores is None:
            relevance_scores = {doc['url']: 1.0 for doc in documents}

        # Get current time for recency calculation
        current_time = datetime.now()

        # Relevance gets whatever weight is left after recency and token count
        relevance_weight = 1.0 - recency_weight - token_count_weight

        # Calculate priority scores
        for doc in documents:
            # Relevance score (normalized to 0-1)
            relevance_score = relevance_scores.get(doc['url'], 0.0)

            # Recency score (normalized to 0-1)
            try:
                doc_time = datetime.fromisoformat(doc['scrape_date'])
                time_diff = (current_time - doc_time).total_seconds() / 86400  # Convert to days
                recency_score = 1.0 / (1.0 + time_diff)  # Newer documents get higher scores
            except (KeyError, TypeError, ValueError):
                recency_score = 0.5  # Default if scrape_date is missing or invalid

            # Token count score (normalized to 0-1)
            # Prefer documents with more tokens, capping the benefit at 5000 tokens
            token_count = doc.get('token_count', 0)
            token_count_score = min(token_count / 5000, 1.0)

            # Calculate final priority score
            priority_score = (
                relevance_weight * relevance_score +
                recency_weight * recency_score +
                token_count_weight * token_count_score
            )
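            # Worked example (illustrative values, not from real data): with the
            # default weights, relevance_weight = 1.0 - 0.3 - 0.2 = 0.5. A document
            # with relevance 0.8, recency score 0.5, and 3000 tokens (token score 0.6)
            # gets 0.5 * 0.8 + 0.3 * 0.5 + 0.2 * 0.6 = 0.67.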

            # Add priority score to document
            doc['priority_score'] = priority_score

        # Sort documents by priority score (descending)
        return sorted(documents, key=lambda x: x.get('priority_score', 0.0), reverse=True)

    def chunk_document_by_sections(self, document: Dict[str, Any],
                                   max_chunk_tokens: int = 1000,
                                   overlap_tokens: int = 100) -> List[Dict[str, Any]]:
        """
        Chunk a document by sections based on Markdown headers.

        Args:
            document: Document to chunk
            max_chunk_tokens: Maximum number of tokens per chunk
            overlap_tokens: Number of tokens to overlap between chunks

        Returns:
            List of document chunks
        """
        content = document.get('content', '')

        # If content is empty, return an empty list
        if not content.strip():
            return []

        # Ensure the document has a title
        document_title = document.get('title')
        if document_title is None:
            document_title = 'Untitled'

        # Find all Markdown headers in the content
        header_pattern = re.compile(r'^(#{1,6})\s+(.+)$', re.MULTILINE)
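        # For example, a line like "## Results" matches with group(1) == "##"
        # (section level 2) and group(2) == "Results".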
        headers = list(header_pattern.finditer(content))

        # If no headers are found, use fixed-size chunking
        if not headers:
            return self.chunk_document_fixed_size(document, max_chunk_tokens, overlap_tokens)

        chunks = []

        # Process each section (from one header to the next);
        # note that any content before the first header is not included
        for i in range(len(headers)):
            start_pos = headers[i].start()

            # Determine end position (next header or end of content)
            if i < len(headers) - 1:
                end_pos = headers[i + 1].start()
            else:
                end_pos = len(content)

            section_content = content[start_pos:end_pos]
            section_tokens = self._count_tokens(section_content)

            # If the section is small enough, add it as a single chunk
            if section_tokens <= max_chunk_tokens:
                chunks.append({
                    'document_id': document.get('id'),
                    'url': document.get('url'),
                    'title': document_title,
                    'content': section_content,
                    'token_count': section_tokens,
                    'chunk_type': 'section',
                    'section_title': headers[i].group(2),
                    'section_level': len(headers[i].group(1)),
                    'priority_score': document.get('priority_score', 0.0)
                })
            else:
                # If the section is too large, split it into fixed-size chunks
                section_chunks = self._split_text_fixed_size(
                    section_content,
                    max_chunk_tokens,
                    overlap_tokens
                )

                for j, chunk_content in enumerate(section_chunks):
                    chunk_tokens = self._count_tokens(chunk_content)
                    chunks.append({
                        'document_id': document.get('id'),
                        'url': document.get('url'),
                        'title': document_title,
                        'content': chunk_content,
                        'token_count': chunk_tokens,
                        'chunk_type': 'section_part',
                        'section_title': headers[i].group(2),
                        'section_level': len(headers[i].group(1)),
                        'section_part': j + 1,
                        'total_parts': len(section_chunks),
                        'priority_score': document.get('priority_score', 0.0)
                    })

        return chunks

    def chunk_document_fixed_size(self, document: Dict[str, Any],
                                  max_chunk_tokens: int = 1000,
                                  overlap_tokens: int = 100) -> List[Dict[str, Any]]:
        """
        Chunk a document into fixed-size chunks with overlap.

        Args:
            document: Document to chunk
            max_chunk_tokens: Maximum number of tokens per chunk
            overlap_tokens: Number of tokens to overlap between chunks

        Returns:
            List of document chunks
        """
        content = document.get('content', '')

        # If content is empty, return an empty list
        if not content.strip():
            return []

        # Ensure the document has a title
        document_title = document.get('title')
        if document_title is None:
            document_title = 'Untitled'

        # Split the content into fixed-size chunks
        chunk_contents = self._split_text_fixed_size(content, max_chunk_tokens, overlap_tokens)

        # Create chunk objects
        chunks = []
        for i, chunk_content in enumerate(chunk_contents):
            chunk_tokens = self._count_tokens(chunk_content)
            chunks.append({
                'document_id': document.get('id'),
                'url': document.get('url'),
                'title': document_title,
                'content': chunk_content,
                'token_count': chunk_tokens,
                'chunk_type': 'fixed',
                'chunk_index': i,
                'total_chunks': len(chunk_contents),
                # Slightly reduce priority for later chunks
                'priority_score': document.get('priority_score', 0.0) * (1.0 - (i * 0.05))
            })

        return chunks

    def chunk_document_hierarchical(self, document: Dict[str, Any],
                                    max_chunk_tokens: int = 1000,
                                    overlap_tokens: int = 100) -> List[Dict[str, Any]]:
        """
        Chunk a very large document using a hierarchical approach.

        This method first chunks the document by sections. If the result is still
        large, it adds a summary chunk built from the document's opening paragraph
        and orders the section chunks by importance.

        Args:
            document: Document to chunk
            max_chunk_tokens: Maximum number of tokens per chunk
            overlap_tokens: Number of tokens to overlap between chunks

        Returns:
            List of document chunks
        """
        # First, chunk by sections
        section_chunks = self.chunk_document_by_sections(document, max_chunk_tokens, overlap_tokens)

        # If the document is small enough, return the section chunks as-is
        if sum(chunk.get('token_count', 0) for chunk in section_chunks) <= max_chunk_tokens * 3:
            return section_chunks

        # Otherwise, create a summary chunk and rank the sections by importance
        content = document.get('content', '')
        title = document.get('title', 'Untitled')

        # Extract the first paragraph as a summary
        first_para_match = re.search(r'^(.*?)\n\n', content, re.DOTALL)
        summary = first_para_match.group(1) if first_para_match else content[:500]

        # Create the summary chunk
        summary_content = f"# {title}\n\n{summary}\n\n(This is a summary of a large document)"
        summary_chunk = {
            'document_id': document.get('id'),
            'url': document.get('url'),
            'title': title,
            'content': summary_content,
            'token_count': self._count_tokens(summary_content),
            'chunk_type': 'summary',
            'priority_score': document.get('priority_score', 0.0) * 1.2  # Boost summary priority
        }

        # Sort section chunks by priority (section level and position)
        def section_priority(chunk):
            # Prioritize by section level (lower level numbers are more important)
            level_score = 6 - chunk.get('section_level', 3)
            # Prioritize earlier sections and earlier parts within a section
            position_score = 1.0 / (1.0 + chunk.get('chunk_index', 0) + chunk.get('section_part', 0))
            return level_score * position_score
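        # Worked example (illustrative numbers): a level-2 section chunk with no part
        # index scores (6 - 2) * 1.0 / (1.0 + 0) = 4.0, while the third part of a
        # level-3 section scores (6 - 3) * 1.0 / (1.0 + 3) = 0.75.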

        sorted_sections = sorted(section_chunks, key=section_priority, reverse=True)

        # Return the summary chunk followed by the sections, most important first
        return [summary_chunk] + sorted_sections

    def _split_text_fixed_size(self, text: str,
                               max_chunk_tokens: int = 1000,
                               overlap_tokens: int = 100) -> List[str]:
        """
        Split text into fixed-size chunks with overlap.

        Args:
            text: Text to split
            max_chunk_tokens: Maximum number of tokens per chunk
            overlap_tokens: Number of tokens to overlap between chunks

        Returns:
            List of text chunks
        """
        # Encode text into tokens
        tokens = self.tokenizer.encode(text)

        # If the text is small enough, return it as a single chunk
        if len(tokens) <= max_chunk_tokens:
            return [text]

        # Calculate the number of chunks needed
        num_chunks = math.ceil((len(tokens) - overlap_tokens) / (max_chunk_tokens - overlap_tokens))
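        # Worked example (illustrative numbers): 2,500 tokens with max_chunk_tokens=1000
        # and overlap_tokens=100 gives ceil((2500 - 100) / (1000 - 100)) = ceil(2.67) = 3 chunks.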

        chunks = []

        # Split tokens into chunks
        for i in range(num_chunks):
            # Calculate start and end positions
            start_pos = i * (max_chunk_tokens - overlap_tokens)
            end_pos = min(start_pos + max_chunk_tokens, len(tokens))

            # Extract chunk tokens
            chunk_tokens = tokens[start_pos:end_pos]

            # Decode chunk tokens back to text
            chunk_text = self.tokenizer.decode(chunk_tokens)

            chunks.append(chunk_text)

        return chunks

    def select_chunks_for_context(self, chunks: List[Dict[str, Any]],
                                  token_budget: int,
                                  min_chunks_per_doc: int = 1) -> List[Dict[str, Any]]:
        """
        Select chunks to include in the context window based on the token budget.

        Args:
            chunks: List of document chunks
            token_budget: Maximum number of tokens to use
            min_chunks_per_doc: Minimum number of chunks to include per document

        Returns:
            List of selected chunks
        """
        # Group chunks by document
        doc_chunks = {}
        for chunk in chunks:
            doc_id = chunk.get('document_id')
            if doc_id not in doc_chunks:
                doc_chunks[doc_id] = []
            doc_chunks[doc_id].append(chunk)

        # Sort chunks within each document by priority
        for doc_id in doc_chunks:
            doc_chunks[doc_id] = sorted(
                doc_chunks[doc_id],
                key=lambda x: x.get('priority_score', 0.0),
                reverse=True
            )

        # Select at least min_chunks_per_doc from each document
        selected_chunks = []
        remaining_budget = token_budget

        # First pass: select the minimum number of chunks from each document
        for doc_id, doc_chunk_list in doc_chunks.items():
            for i in range(min(min_chunks_per_doc, len(doc_chunk_list))):
                chunk = doc_chunk_list[i]
                selected_chunks.append(chunk)
                remaining_budget -= chunk.get('token_count', 0)

        # If we've already exceeded the budget, sort the selected chunks by priority and trim
        if remaining_budget <= 0:
            selected_chunks = sorted(
                selected_chunks,
                key=lambda x: x.get('priority_score', 0.0),
                reverse=True
            )

            # Keep adding chunks until we exceed the budget
            current_budget = 0
            for i, chunk in enumerate(selected_chunks):
                current_budget += chunk.get('token_count', 0)
                if current_budget > token_budget:
                    selected_chunks = selected_chunks[:i]
                    break

            return selected_chunks

        # Second pass: add more chunks based on priority until the budget is exhausted
        # Flatten the remaining chunks from all documents
        remaining_chunks = []
        for doc_id, doc_chunk_list in doc_chunks.items():
            if len(doc_chunk_list) > min_chunks_per_doc:
                remaining_chunks.extend(doc_chunk_list[min_chunks_per_doc:])

        # Sort remaining chunks by priority
        remaining_chunks = sorted(
            remaining_chunks,
            key=lambda x: x.get('priority_score', 0.0),
            reverse=True
        )

        # Add chunks until the budget is exhausted
        for chunk in remaining_chunks:
            if chunk.get('token_count', 0) <= remaining_budget:
                selected_chunks.append(chunk)
                remaining_budget -= chunk.get('token_count', 0)

            if remaining_budget <= 0:
                break

        return selected_chunks

    def process_documents_for_report(self, documents: List[Dict[str, Any]],
                                     relevance_scores: Optional[Dict[str, float]] = None,
                                     token_budget: Optional[int] = None,
                                     chunk_size: int = 1000,
                                     overlap_size: int = 100) -> List[Dict[str, Any]]:
        """
        Process documents for report generation.

        This method prioritizes documents, chunks them, and selects the most
        relevant chunks to stay within the token budget.

        Args:
            documents: List of documents to process
            relevance_scores: Dictionary mapping document URLs to relevance scores
            token_budget: Maximum number of tokens to use (default: self.default_token_limit)
            chunk_size: Maximum number of tokens per chunk
            overlap_size: Number of tokens to overlap between chunks

        Returns:
            List of selected document chunks
        """
        if token_budget is None:
            token_budget = self.default_token_limit

        # Prioritize documents
        prioritized_docs = self.prioritize_documents(documents, relevance_scores)

        # Chunk documents
        all_chunks = []
        for doc in prioritized_docs:
            # Ensure the document has a title
            if doc.get('title') is None:
                doc['title'] = 'Untitled'

            # Choose a chunking strategy based on document size; fall back to
            # counting tokens if the document does not carry a token_count
            token_count = doc.get('token_count') or self._count_tokens(doc.get('content', ''))
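            # With the default chunk_size of 1000 this means: documents over
            # 10,000 tokens are chunked hierarchically, documents over 1,000
            # tokens are chunked by section, and smaller documents stay whole.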

            if token_count > chunk_size * 10:
                # Very large document: use hierarchical chunking
                chunks = self.chunk_document_hierarchical(doc, chunk_size, overlap_size)
            elif token_count > chunk_size:
                # Medium document: use section-based chunking
                chunks = self.chunk_document_by_sections(doc, chunk_size, overlap_size)
            else:
                # Small document: keep as a single chunk
                chunks = [{
                    'document_id': doc.get('id'),
                    'url': doc.get('url'),
                    'title': doc.get('title', 'Untitled'),
                    'content': doc.get('content', ''),
                    'token_count': token_count,
                    'chunk_type': 'full',
                    'priority_score': doc.get('priority_score', 0.0)
                }]

            # Ensure all chunks have a title
            for chunk in chunks:
                if chunk.get('title') is None:
                    chunk['title'] = 'Untitled'

            all_chunks.extend(chunks)

        # Select chunks based on the token budget
        selected_chunks = self.select_chunks_for_context(all_chunks, token_budget)

        # Log statistics
        total_docs = len(documents)
        total_chunks = len(all_chunks)
        selected_chunk_count = len(selected_chunks)
        selected_token_count = sum(chunk.get('token_count', 0) for chunk in selected_chunks)

        logger.info(f"Processed {total_docs} documents into {total_chunks} chunks")
        logger.info(f"Selected {selected_chunk_count} chunks with {selected_token_count} tokens")

        return selected_chunks


# Create a singleton instance for global use
document_processor = DocumentProcessor()


def get_document_processor() -> DocumentProcessor:
    """
    Get the global document processor instance.

    Returns:
        DocumentProcessor instance
    """
    return document_processor
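

if __name__ == "__main__":
    # Minimal usage sketch with made-up sample documents; the field names mirror
    # the keys this module reads (url, title, content, token_count, scrape_date),
    # but the values are illustrative only.
    processor = get_document_processor()

    sample_documents = [
        {
            'id': 1,
            'url': 'https://example.com/doc-1',
            'title': 'Sample Document',
            'content': '# Introduction\n\nSome text.\n\n## Details\n\nMore text.',
            'token_count': 20,
            'scrape_date': datetime.now().isoformat(),
        },
    ]
    sample_scores = {'https://example.com/doc-1': 0.9}

    selected = processor.process_documents_for_report(
        sample_documents,
        relevance_scores=sample_scores,
        token_budget=2000,
    )
    for chunk in selected:
        logger.info("Selected chunk: %s (%s tokens)", chunk['title'], chunk['token_count'])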