""" Test script for the document processor module. This script tests the document prioritization and chunking functionality of the document processor module. """ import os import sys import asyncio import json from datetime import datetime from typing import Dict, List, Any, Optional # Add the project root directory to the Python path sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from report.document_processor import get_document_processor from report.database.db_manager import get_db_manager, initialize_database from report.document_scraper import get_document_scraper async def test_document_processor(): """Test the document processor with sample documents.""" # Initialize the database await initialize_database() # Get the document processor and scraper document_processor = get_document_processor() document_scraper = get_document_scraper() db_manager = get_db_manager() # Sample URLs to test with test_urls = [ "https://en.wikipedia.org/wiki/Python_(programming_language)", "https://en.wikipedia.org/wiki/Natural_language_processing", "https://docs.python.org/3/tutorial/index.html", "https://en.wikipedia.org/wiki/Machine_learning" ] # Scrape the URLs print(f"Scraping {len(test_urls)} URLs...") documents = await document_scraper.scrape_urls(test_urls) print(f"Scraped {len(documents)} documents") # Sample relevance scores relevance_scores = { "https://en.wikipedia.org/wiki/Python_(programming_language)": 0.95, "https://en.wikipedia.org/wiki/Natural_language_processing": 0.85, "https://docs.python.org/3/tutorial/index.html": 0.75, "https://en.wikipedia.org/wiki/Machine_learning": 0.65 } # Test document prioritization print("\nTesting document prioritization...") prioritized_docs = document_processor.prioritize_documents(documents, relevance_scores) print("Prioritized documents:") for i, doc in enumerate(prioritized_docs): print(f"{i+1}. 

    # Test document chunking
    print("\nTesting document chunking...")

    # Test section-based chunking
    print("\nSection-based chunking:")
    if documents:
        section_chunks = document_processor.chunk_document_by_sections(documents[0], 1000, 100)
        print(f"Created {len(section_chunks)} section-based chunks")
        for i, chunk in enumerate(section_chunks[:3]):  # Show first 3 chunks
            print(f"Chunk {i+1}:")
            print(f"  Type: {chunk.get('chunk_type')}")
            print(f"  Section: {chunk.get('section_title', 'N/A')}")
            print(f"  Tokens: {chunk.get('token_count')}")
            content = chunk.get('content', '')
            print(f"  Content preview: {content[:100]}...")

    # Test fixed-size chunking
    print("\nFixed-size chunking:")
    if documents:
        fixed_chunks = document_processor.chunk_document_fixed_size(documents[0], 1000, 100)
        print(f"Created {len(fixed_chunks)} fixed-size chunks")
        for i, chunk in enumerate(fixed_chunks[:3]):  # Show first 3 chunks
            print(f"Chunk {i+1}:")
            print(f"  Type: {chunk.get('chunk_type')}")
            print(f"  Index: {chunk.get('chunk_index')}/{chunk.get('total_chunks')}")
            print(f"  Tokens: {chunk.get('token_count')}")
            content = chunk.get('content', '')
            print(f"  Content preview: {content[:100]}...")

    # Test hierarchical chunking
    print("\nHierarchical chunking:")
    if documents:
        hierarchical_chunks = document_processor.chunk_document_hierarchical(documents[0], 1000, 100)
        print(f"Created {len(hierarchical_chunks)} hierarchical chunks")
        for i, chunk in enumerate(hierarchical_chunks[:3]):  # Show first 3 chunks
            print(f"Chunk {i+1}:")
            print(f"  Type: {chunk.get('chunk_type')}")
            if chunk.get('chunk_type') == 'summary':
                print("  Summary chunk")
            else:
                print(f"  Section: {chunk.get('section_title', 'N/A')}")
            print(f"  Tokens: {chunk.get('token_count')}")
            content = chunk.get('content', '')
            print(f"  Content preview: {content[:100]}...")

    # Test chunk selection
    print("\nTesting chunk selection...")

    # Create a mix of chunks from all documents
    all_chunks = []
    for doc in documents:
        chunks = document_processor.chunk_document_by_sections(doc, 1000, 100)
        all_chunks.extend(chunks)

    print(f"Total chunks: {len(all_chunks)}")

    # Select chunks based on token budget
    token_budget = 10000
    selected_chunks = document_processor.select_chunks_for_context(all_chunks, token_budget)

    total_tokens = sum(chunk.get('token_count', 0) for chunk in selected_chunks)
    print(f"Selected {len(selected_chunks)} chunks with {total_tokens} tokens (budget: {token_budget})")

    # Test full document processing
    print("\nTesting full document processing...")
    processed_chunks = document_processor.process_documents_for_report(
        documents,
        relevance_scores,
        token_budget=20000,
        chunk_size=1000,
        overlap_size=100
    )

    total_processed_tokens = sum(chunk.get('token_count', 0) for chunk in processed_chunks)
    print(f"Processed {len(processed_chunks)} chunks with {total_processed_tokens} tokens")

    # Show the top 5 chunks
    print("\nTop 5 chunks:")
    for i, chunk in enumerate(processed_chunks[:5]):
        print(f"Chunk {i+1}:")
        print(f"  Document: {chunk.get('title')}")
        print(f"  Type: {chunk.get('chunk_type')}")
        print(f"  Priority: {chunk.get('priority_score', 0.0):.2f}")
        print(f"  Tokens: {chunk.get('token_count')}")
        content = chunk.get('content', '')
        print(f"  Content preview: {content[:100]}...")


async def main():
    """Main function to run the tests."""
    await test_document_processor()


if __name__ == "__main__":
    asyncio.run(main())