""" Test script for the document processor module. This script tests the document prioritization and chunking functionality of the document processor module. """ import os import sys import asyncio import json from datetime import datetime from typing import Dict, List, Any, Optional # Add the project root directory to the Python path sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from report.document_processor import get_document_processor from report.database.db_manager import get_db_manager, initialize_database from report.document_scraper import get_document_scraper async def test_document_processor(use_mock: bool = False): """Test the document processor with sample documents.""" # Initialize database await initialize_database() # Create document processor document_processor = get_document_processor() # Create document scraper with mock option document_scraper = get_document_scraper(use_mock=use_mock) # Sample search results with real, accessible URLs search_results = [ { 'title': 'Python Documentation', 'url': 'https://docs.python.org/3/', 'snippet': 'Official Python documentation.', 'score': 0.95 }, { 'title': 'Python.org', 'url': 'https://www.python.org/', 'snippet': 'The official home of the Python Programming Language.', 'score': 0.85 }, { 'title': 'Wikipedia - Python', 'url': 'https://en.wikipedia.org/wiki/Python_(programming_language)', 'snippet': 'Python is a high-level, general-purpose programming language.', 'score': 0.75 } ] # Process search results documents = [] relevance_scores = {} for result in search_results: # Scrape document document = await document_scraper.scrape_url(result['url']) if document: documents.append(document) relevance_scores[document['url']] = result['score'] print(f"Scraped {len(documents)} documents") # Test document prioritization prioritized_docs = document_processor.prioritize_documents(documents, relevance_scores) print("\nPrioritized documents:") for i, doc in enumerate(prioritized_docs): print(f"{i+1}. {doc['title']} (Score: {doc.get('priority_score', 'N/A')})") # Test document chunking if documents: print("\nChunking document:", documents[0]['title']) chunks = document_processor.chunk_document_by_sections(documents[0]) print(f"Created {len(chunks)} chunks") for i, chunk in enumerate(chunks[:3]): # Show first 3 chunks print(f"Chunk {i+1}: {chunk['title']} ({chunk['token_count']} tokens)") content_preview = chunk['content'][:100] + '...' 
    # Test document prioritization
    prioritized_docs = document_processor.prioritize_documents(documents, relevance_scores)

    print("\nPrioritized documents:")
    for i, doc in enumerate(prioritized_docs):
        print(f"{i + 1}. {doc['title']} (Score: {doc.get('priority_score', 'N/A')})")

    # Test document chunking
    if documents:
        print("\nChunking document:", documents[0]['title'])
        chunks = document_processor.chunk_document_by_sections(documents[0])
        print(f"Created {len(chunks)} chunks")

        # Show the first 3 chunks
        for i, chunk in enumerate(chunks[:3]):
            print(f"Chunk {i + 1}: {chunk['title']} ({chunk['token_count']} tokens)")
            content_preview = (
                chunk['content'][:100] + '...'
                if len(chunk['content']) > 100
                else chunk['content']
            )
            print(f"Content: {content_preview}")

    # Test token budget management
    token_budget = 4000
    print(f"\nSelecting chunks with token budget: {token_budget}")

    # Create chunks for each document
    all_chunks = []
    for doc in prioritized_docs:
        doc_chunks = document_processor.chunk_document_by_sections(doc)
        all_chunks.extend(doc_chunks)

    # Select chunks based on the token budget
    selected_chunks = document_processor.select_chunks_for_context(all_chunks, token_budget)
    total_tokens = sum(c['token_count'] for c in selected_chunks)
    print(f"Selected {len(selected_chunks)} chunks with total tokens: {total_tokens}")

    # Test end-to-end processing
    print("\nTesting end-to-end processing")
    processed_chunks = document_processor.process_documents_for_report(documents, relevance_scores)
    print(f"Processed {len(processed_chunks)} chunks for report")

    return processed_chunks


# Run the test if this module is executed directly
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description='Test the document processor')
    parser.add_argument('--mock', action='store_true',
                        help='Use mock data instead of making actual API calls')
    args = parser.parse_args()

    print(f"Running test with {'mock data' if args.mock else 'real data'}")
    asyncio.run(test_document_processor(use_mock=args.mock))
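
# Example invocations (the filename and path are assumptions; the script is
# expected to live one directory below the project root so that the sys.path
# line above can resolve the `report` package):
#
#   python tests/test_document_processor.py          # scrape the live URLs
#   python tests/test_document_processor.py --mock   # use mock documents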