# ira/tests/test_document_processor.py
"""
Test script for the document processor module.
This script tests the document prioritization and chunking functionality
of the document processor module.
"""
import os
import sys
import asyncio

# Add the project root directory to the Python path
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from report.document_processor import get_document_processor
from report.database.db_manager import initialize_database
from report.document_scraper import get_document_scraper


async def test_document_processor(use_mock: bool = False):
"""Test the document processor with sample documents."""
# Initialize database
await initialize_database()
# Create document processor
document_processor = get_document_processor()
# Create document scraper with mock option
document_scraper = get_document_scraper(use_mock=use_mock)
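    # NOTE: with use_mock=True the scraper is expected to return canned
    # content instead of fetching the URLs (see the --mock flag in
    # __main__ below).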

    # Sample search results with real, accessible URLs
    search_results = [
        {
            'title': 'Python Documentation',
            'url': 'https://docs.python.org/3/',
            'snippet': 'Official Python documentation.',
            'score': 0.95
        },
        {
            'title': 'Python.org',
            'url': 'https://www.python.org/',
            'snippet': 'The official home of the Python Programming Language.',
            'score': 0.85
        },
        {
            'title': 'Wikipedia - Python',
            'url': 'https://en.wikipedia.org/wiki/Python_(programming_language)',
            'snippet': 'Python is a high-level, general-purpose programming language.',
            'score': 0.75
        }
    ]

    # Process search results
    documents = []
    relevance_scores = {}
    for result in search_results:
        # Scrape document
        document = await document_scraper.scrape_url(result['url'])
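        # scrape_url is assumed to return None (or another falsy value) on a
        # failed fetch, so only successful scrapes are kept.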
        if document:
            documents.append(document)
            relevance_scores[document['url']] = result['score']

    print(f"Scraped {len(documents)} documents")

    # Test document prioritization
    prioritized_docs = document_processor.prioritize_documents(documents, relevance_scores)
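    # prioritize_documents is assumed to annotate each document with a
    # 'priority_score' field, which is read back when printing below.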
print("\nPrioritized documents:")
for i, doc in enumerate(prioritized_docs):
print(f"{i+1}. {doc['title']} (Score: {doc.get('priority_score', 'N/A')})")

    # Test document chunking
    if documents:
        print("\nChunking document:", documents[0]['title'])
        chunks = document_processor.chunk_document_by_sections(documents[0])
        print(f"Created {len(chunks)} chunks")
        for i, chunk in enumerate(chunks[:3]):  # Show first 3 chunks
            print(f"Chunk {i+1}: {chunk['title']} ({chunk['token_count']} tokens)")
            content_preview = chunk['content'][:100] + '...' if len(chunk['content']) > 100 else chunk['content']
            print(f"Content: {content_preview}")

    # Test token budget management
    token_budget = 4000
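    # The budget is assumed to be an upper bound on the combined token_count
    # of the chunks that select_chunks_for_context returns.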
print(f"\nSelecting chunks with token budget: {token_budget}")
# Create chunks for each document
all_chunks = []
for doc in prioritized_docs:
doc_chunks = document_processor.chunk_document_by_sections(doc)
all_chunks.extend(doc_chunks)
# Select chunks based on token budget
selected_chunks = document_processor.select_chunks_for_context(all_chunks, token_budget)
print(f"Selected {len(selected_chunks)} chunks with total tokens: {sum(c['token_count'] for c in selected_chunks)}")

    # Test end-to-end processing
    print("\nTesting end-to-end processing")
    processed_chunks = document_processor.process_documents_for_report(documents, relevance_scores)
    print(f"Processed {len(processed_chunks)} chunks for report")

    return processed_chunks


# Run test if this module is executed directly
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description='Test the document processor')
    parser.add_argument('--mock', action='store_true', help='Use mock data instead of making actual API calls')
    args = parser.parse_args()

    print(f"Running test with {'mock data' if args.mock else 'real data'}")
    asyncio.run(test_document_processor(use_mock=args.mock))