# ira/report/document_processor.py
"""
Document processor module for the report generation module.
This module provides functionality to prioritize documents based on relevance scores,
chunk long documents into manageable pieces, and select the most relevant chunks
to stay within token budget limits.
"""
import re
import math
import logging
import tiktoken
from typing import Dict, List, Any, Optional
from datetime import datetime
from report.database.db_manager import get_db_manager
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
class DocumentProcessor:
"""
Document processor for the report generation module.
This class provides methods to prioritize documents based on relevance scores,
chunk long documents into manageable pieces, and select the most relevant chunks
to stay within token budget limits.
"""
def __init__(self, default_token_limit: int = 120000):
"""
Initialize the document processor.
Args:
default_token_limit: Default token limit for the context window
"""
self.db_manager = get_db_manager()
self.default_token_limit = default_token_limit
self.tokenizer = tiktoken.get_encoding("cl100k_base") # Using OpenAI's tokenizer
def _count_tokens(self, text: str) -> int:
"""
Count the number of tokens in a text.
Args:
text: The text to count tokens for
Returns:
Number of tokens in the text
"""
return len(self.tokenizer.encode(text))
def prioritize_documents(self, documents: List[Dict[str, Any]],
relevance_scores: Optional[Dict[str, float]] = None,
recency_weight: float = 0.3,
token_count_weight: float = 0.2) -> List[Dict[str, Any]]:
"""
Prioritize documents based on relevance scores, recency, and token count.
Args:
documents: List of documents to prioritize
relevance_scores: Dictionary mapping document URLs to relevance scores
recency_weight: Weight for recency in the prioritization score
token_count_weight: Weight for token count in the prioritization score
Returns:
List of documents sorted by priority score
"""
# If no relevance scores provided, use equal scores for all documents
if relevance_scores is None:
relevance_scores = {doc['url']: 1.0 for doc in documents}
# Get current time for recency calculation
current_time = datetime.now()
# Calculate priority scores
for doc in documents:
# Relevance score (normalized to 0-1)
relevance_score = relevance_scores.get(doc['url'], 0.0)
# Recency score (normalized to 0-1)
try:
doc_time = datetime.fromisoformat(doc['scrape_date'])
time_diff = (current_time - doc_time).total_seconds() / 86400 # Convert to days
recency_score = 1.0 / (1.0 + time_diff) # Newer documents get higher scores
except (KeyError, ValueError):
recency_score = 0.5 # Default if scrape_date is missing or invalid
            # Token count score (normalized to 0-1)
            # Favor documents with more content, capping the score once they reach 5000 tokens
            token_count = doc.get('token_count', 0)
            token_count_score = min(token_count / 5000, 1.0)
# Calculate final priority score
relevance_weight = 1.0 - recency_weight - token_count_weight
priority_score = (
relevance_weight * relevance_score +
recency_weight * recency_score +
token_count_weight * token_count_score
)
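            # Worked example with the default weights (relevance 0.5, recency 0.3, token count 0.2):
            # relevance 0.8, recency 0.5, and token score 0.6 give
            # 0.5 * 0.8 + 0.3 * 0.5 + 0.2 * 0.6 = 0.67.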
# Add priority score to document
doc['priority_score'] = priority_score
# Sort documents by priority score (descending)
return sorted(documents, key=lambda x: x.get('priority_score', 0.0), reverse=True)
def chunk_document_by_sections(self, document: Dict[str, Any],
max_chunk_tokens: int = 1000,
overlap_tokens: int = 100) -> List[Dict[str, Any]]:
"""
Chunk a document by sections based on Markdown headers.
Args:
document: Document to chunk
max_chunk_tokens: Maximum number of tokens per chunk
overlap_tokens: Number of tokens to overlap between chunks
Returns:
List of document chunks
"""
content = document.get('content', '')
# If content is empty, return empty list
if not content.strip():
return []
# Ensure document has a title
document_title = document.get('title')
if document_title is None:
document_title = 'Untitled'
# Find all headers in the content
header_pattern = re.compile(r'^(#{1,6})\s+(.+)$', re.MULTILINE)
headers = list(header_pattern.finditer(content))
# If no headers found, use fixed-size chunking
if not headers:
return self.chunk_document_fixed_size(document, max_chunk_tokens, overlap_tokens)
chunks = []
# Process each section (from one header to the next)
for i in range(len(headers)):
start_pos = headers[i].start()
# Determine end position (next header or end of content)
if i < len(headers) - 1:
end_pos = headers[i + 1].start()
else:
end_pos = len(content)
section_content = content[start_pos:end_pos]
section_tokens = self._count_tokens(section_content)
# If section is small enough, add it as a single chunk
if section_tokens <= max_chunk_tokens:
chunks.append({
'document_id': document.get('id'),
'url': document.get('url'),
'title': document_title,
'content': section_content,
'token_count': section_tokens,
'chunk_type': 'section',
'section_title': headers[i].group(2),
'section_level': len(headers[i].group(1)),
'priority_score': document.get('priority_score', 0.0)
})
else:
# If section is too large, split it into fixed-size chunks
section_chunks = self._split_text_fixed_size(
section_content,
max_chunk_tokens,
overlap_tokens
)
for j, chunk_content in enumerate(section_chunks):
chunk_tokens = self._count_tokens(chunk_content)
chunks.append({
'document_id': document.get('id'),
'url': document.get('url'),
'title': document_title,
'content': chunk_content,
'token_count': chunk_tokens,
'chunk_type': 'section_part',
'section_title': headers[i].group(2),
'section_level': len(headers[i].group(1)),
'section_part': j + 1,
'total_parts': len(section_chunks),
'priority_score': document.get('priority_score', 0.0)
})
return chunks
def chunk_document_fixed_size(self, document: Dict[str, Any],
max_chunk_tokens: int = 1000,
overlap_tokens: int = 100) -> List[Dict[str, Any]]:
"""
Chunk a document into fixed-size chunks with overlap.
Args:
document: Document to chunk
max_chunk_tokens: Maximum number of tokens per chunk
overlap_tokens: Number of tokens to overlap between chunks
Returns:
List of document chunks
"""
content = document.get('content', '')
# If content is empty, return empty list
if not content.strip():
return []
# Ensure document has a title
document_title = document.get('title')
if document_title is None:
document_title = 'Untitled'
# Split the content into fixed-size chunks
chunk_contents = self._split_text_fixed_size(content, max_chunk_tokens, overlap_tokens)
# Create chunk objects
chunks = []
for i, chunk_content in enumerate(chunk_contents):
chunk_tokens = self._count_tokens(chunk_content)
chunks.append({
'document_id': document.get('id'),
'url': document.get('url'),
'title': document_title,
'content': chunk_content,
'token_count': chunk_tokens,
'chunk_type': 'fixed',
'chunk_index': i,
'total_chunks': len(chunk_contents),
                'priority_score': document.get('priority_score', 0.0) * max(0.0, 1.0 - i * 0.05)  # Slightly reduce priority for later chunks, never below zero
})
return chunks
def chunk_document_hierarchical(self, document: Dict[str, Any],
max_chunk_tokens: int = 1000,
overlap_tokens: int = 100) -> List[Dict[str, Any]]:
"""
Chunk a very large document using a hierarchical approach.
This method first chunks the document by sections, then further chunks
large sections into smaller pieces.
Args:
document: Document to chunk
max_chunk_tokens: Maximum number of tokens per chunk
overlap_tokens: Number of tokens to overlap between chunks
Returns:
List of document chunks
"""
# First, chunk by sections
section_chunks = self.chunk_document_by_sections(document, max_chunk_tokens, overlap_tokens)
        # If the combined section chunks already fit within roughly three chunks, keep the section split as-is
if sum(chunk.get('token_count', 0) for chunk in section_chunks) <= max_chunk_tokens * 3:
return section_chunks
        # Otherwise, create a summary chunk and reorder the sections so the most important come first
content = document.get('content', '')
title = document.get('title', 'Untitled')
# Extract first paragraph as summary
first_para_match = re.search(r'^(.*?)\n\n', content, re.DOTALL)
summary = first_para_match.group(1) if first_para_match else content[:500]
# Create summary chunk
summary_chunk = {
'document_id': document.get('id'),
'url': document.get('url'),
'title': title,
'content': f"# {title}\n\n{summary}\n\n(This is a summary of a large document)",
'token_count': self._count_tokens(f"# {title}\n\n{summary}\n\n(This is a summary of a large document)"),
'chunk_type': 'summary',
'priority_score': document.get('priority_score', 0.0) * 1.2 # Boost summary priority
}
# Sort section chunks by priority (section level and position)
def section_priority(chunk):
# Prioritize by section level (lower is more important)
level_score = 6 - chunk.get('section_level', 3)
# Prioritize earlier sections
position_score = 1.0 / (1.0 + chunk.get('chunk_index', 0) + chunk.get('section_part', 0))
return level_score * position_score
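        # Example: a whole level-2 section gives level_score 4 and position_score 1.0
        # (no chunk_index or section_part), so priority 4.0; part 2 of a split
        # level-2 section gives 4 * (1.0 / 3.0) ≈ 1.33.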
sorted_sections = sorted(section_chunks, key=section_priority, reverse=True)
        # Return the summary chunk followed by the sections sorted by importance
return [summary_chunk] + sorted_sections
def _split_text_fixed_size(self, text: str,
max_chunk_tokens: int = 1000,
overlap_tokens: int = 100) -> List[str]:
"""
Split text into fixed-size chunks with overlap.
Args:
text: Text to split
max_chunk_tokens: Maximum number of tokens per chunk
overlap_tokens: Number of tokens to overlap between chunks
Returns:
List of text chunks
"""
# Encode text into tokens
tokens = self.tokenizer.encode(text)
# If text is small enough, return as a single chunk
if len(tokens) <= max_chunk_tokens:
return [text]
# Calculate number of chunks needed
num_chunks = math.ceil((len(tokens) - overlap_tokens) / (max_chunk_tokens - overlap_tokens))
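        # Example: 2500 tokens with max_chunk_tokens=1000 and overlap_tokens=100
        # gives ceil((2500 - 100) / 900) = 3 chunks starting at token offsets 0, 900, 1800.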
chunks = []
# Split tokens into chunks
for i in range(num_chunks):
# Calculate start and end positions
start_pos = i * (max_chunk_tokens - overlap_tokens)
end_pos = min(start_pos + max_chunk_tokens, len(tokens))
# Extract chunk tokens
chunk_tokens = tokens[start_pos:end_pos]
# Decode chunk tokens back to text
chunk_text = self.tokenizer.decode(chunk_tokens)
chunks.append(chunk_text)
return chunks
def select_chunks_for_context(self, chunks: List[Dict[str, Any]],
token_budget: int,
min_chunks_per_doc: int = 1) -> List[Dict[str, Any]]:
"""
Select chunks to include in the context window based on token budget.
Args:
chunks: List of document chunks
token_budget: Maximum number of tokens to use
min_chunks_per_doc: Minimum number of chunks to include per document
Returns:
List of selected chunks
"""
# Group chunks by document
doc_chunks = {}
for chunk in chunks:
doc_id = chunk.get('document_id')
if doc_id not in doc_chunks:
doc_chunks[doc_id] = []
doc_chunks[doc_id].append(chunk)
# Sort chunks within each document by priority
for doc_id in doc_chunks:
doc_chunks[doc_id] = sorted(
doc_chunks[doc_id],
key=lambda x: x.get('priority_score', 0.0),
reverse=True
)
# Select at least min_chunks_per_doc from each document
selected_chunks = []
remaining_budget = token_budget
# First pass: select minimum chunks from each document
        for doc_id, doc_chunk_list in doc_chunks.items():
            for i in range(min(min_chunks_per_doc, len(doc_chunk_list))):
                chunk = doc_chunk_list[i]
                selected_chunks.append(chunk)
                remaining_budget -= chunk.get('token_count', 0)
# If we've exceeded the budget, sort selected chunks and trim
if remaining_budget <= 0:
selected_chunks = sorted(
selected_chunks,
key=lambda x: x.get('priority_score', 0.0),
reverse=True
)
            # Re-accumulate token counts and cut off at the point the budget is exceeded
current_budget = 0
for i, chunk in enumerate(selected_chunks):
current_budget += chunk.get('token_count', 0)
if current_budget > token_budget:
selected_chunks = selected_chunks[:i]
break
return selected_chunks
# Second pass: add more chunks based on priority until budget is exhausted
# Flatten remaining chunks from all documents
remaining_chunks = []
        for doc_id, doc_chunk_list in doc_chunks.items():
            if len(doc_chunk_list) > min_chunks_per_doc:
                remaining_chunks.extend(doc_chunk_list[min_chunks_per_doc:])
# Sort remaining chunks by priority
remaining_chunks = sorted(
remaining_chunks,
key=lambda x: x.get('priority_score', 0.0),
reverse=True
)
# Add chunks until budget is exhausted
for chunk in remaining_chunks:
if chunk.get('token_count', 0) <= remaining_budget:
selected_chunks.append(chunk)
remaining_budget -= chunk.get('token_count', 0)
if remaining_budget <= 0:
break
return selected_chunks
def process_documents_for_report(self, documents: List[Dict[str, Any]],
relevance_scores: Optional[Dict[str, float]] = None,
token_budget: Optional[int] = None,
chunk_size: int = 1000,
overlap_size: int = 100) -> List[Dict[str, Any]]:
"""
Process documents for report generation.
This method prioritizes documents, chunks them, and selects the most
relevant chunks to stay within the token budget.
Args:
documents: List of documents to process
relevance_scores: Dictionary mapping document URLs to relevance scores
token_budget: Maximum number of tokens to use (default: self.default_token_limit)
chunk_size: Maximum number of tokens per chunk
overlap_size: Number of tokens to overlap between chunks
Returns:
List of selected document chunks
"""
if token_budget is None:
token_budget = self.default_token_limit
# Prioritize documents
prioritized_docs = self.prioritize_documents(documents, relevance_scores)
# Chunk documents
all_chunks = []
for doc in prioritized_docs:
# Ensure document has a title
if doc.get('title') is None:
doc['title'] = 'Untitled'
            # Choose chunking strategy based on document size
            # Fall back to counting tokens directly if the document has no token count
            token_count = doc.get('token_count') or self._count_tokens(doc.get('content', ''))
if token_count > chunk_size * 10:
# Very large document: use hierarchical chunking
chunks = self.chunk_document_hierarchical(doc, chunk_size, overlap_size)
elif token_count > chunk_size:
# Medium document: use section-based chunking
chunks = self.chunk_document_by_sections(doc, chunk_size, overlap_size)
else:
# Small document: keep as a single chunk
chunks = [{
'document_id': doc.get('id'),
'url': doc.get('url'),
'title': doc.get('title', 'Untitled'),
'content': doc.get('content', ''),
'token_count': token_count,
'chunk_type': 'full',
'priority_score': doc.get('priority_score', 0.0)
}]
# Ensure all chunks have a title
for chunk in chunks:
if chunk.get('title') is None:
chunk['title'] = 'Untitled'
all_chunks.extend(chunks)
# Select chunks based on token budget
selected_chunks = self.select_chunks_for_context(all_chunks, token_budget)
# Log statistics
total_docs = len(documents)
total_chunks = len(all_chunks)
selected_chunk_count = len(selected_chunks)
selected_token_count = sum(chunk.get('token_count', 0) for chunk in selected_chunks)
logger.info(f"Processed {total_docs} documents into {total_chunks} chunks")
logger.info(f"Selected {selected_chunk_count} chunks with {selected_token_count} tokens")
return selected_chunks
# Create a singleton instance for global use
document_processor = DocumentProcessor()
def get_document_processor() -> DocumentProcessor:
"""
Get the global document processor instance.
Returns:
DocumentProcessor instance
"""
return document_processor
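

# Illustrative usage sketch. The document fields and values below are hypothetical
# examples, and running this requires the report database used by get_db_manager()
# to be available.
if __name__ == "__main__":
    processor = get_document_processor()
    sample_documents = [
        {
            'id': 1,
            'url': 'https://example.com/article',
            'title': 'Example Article',
            'content': '# Example Article\n\nIntroduction paragraph.\n\n## Details\n\nMore text.',
            'token_count': 1200,
            'scrape_date': '2024-01-01T00:00:00',
        },
    ]
    selected = processor.process_documents_for_report(
        sample_documents,
        relevance_scores={'https://example.com/article': 0.9},
        token_budget=8000,
    )
    for chunk in selected:
        print(chunk['title'], chunk.get('chunk_type'), chunk.get('token_count'))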