# ira/tests/test_document_processor.py

"""
Test script for the document processor module.
This script tests the document prioritization and chunking functionality
of the document processor module.
"""
import asyncio
import os
import sys

# Add the project root directory to the Python path
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
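# (needed so the `report` package imports below resolve when this script is
# run directly rather than through a test runner)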

from report.document_processor import get_document_processor
from report.database.db_manager import initialize_database
from report.document_scraper import get_document_scraper


async def test_document_processor():
    """Test the document processor with sample documents."""
    # Initialize the database
    await initialize_database()

    # Get the document processor and scraper
    document_processor = get_document_processor()
    document_scraper = get_document_scraper()

    # Sample URLs to test with
    test_urls = [
        "https://en.wikipedia.org/wiki/Python_(programming_language)",
        "https://en.wikipedia.org/wiki/Natural_language_processing",
        "https://docs.python.org/3/tutorial/index.html",
        "https://en.wikipedia.org/wiki/Machine_learning",
    ]

    # Scrape the URLs
    print(f"Scraping {len(test_urls)} URLs...")
    documents = await document_scraper.scrape_urls(test_urls)
    print(f"Scraped {len(documents)} documents")

    # Sample relevance scores
    relevance_scores = {
        "https://en.wikipedia.org/wiki/Python_(programming_language)": 0.95,
        "https://en.wikipedia.org/wiki/Natural_language_processing": 0.85,
        "https://docs.python.org/3/tutorial/index.html": 0.75,
        "https://en.wikipedia.org/wiki/Machine_learning": 0.65,
    }
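    # These hardcoded scores (keyed by URL, higher meaning more relevant) stand
    # in for whatever relevance ranking the real pipeline would produce.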

    # Test document prioritization
    print("\nTesting document prioritization...")
    prioritized_docs = document_processor.prioritize_documents(documents, relevance_scores)
    print("Prioritized documents:")
    for i, doc in enumerate(prioritized_docs):
        print(f"{i+1}. {doc.get('title')} - Score: {doc.get('priority_score', 0.0):.2f}")

    # Test document chunking
    print("\nTesting document chunking...")

    # Test section-based chunking
    print("\nSection-based chunking:")
    if documents:
        section_chunks = document_processor.chunk_document_by_sections(documents[0], 1000, 100)
        print(f"Created {len(section_chunks)} section-based chunks")
        for i, chunk in enumerate(section_chunks[:3]):  # Show first 3 chunks
            print(f"Chunk {i+1}:")
            print(f"  Type: {chunk.get('chunk_type')}")
            print(f"  Section: {chunk.get('section_title', 'N/A')}")
            print(f"  Tokens: {chunk.get('token_count')}")
            content = chunk.get('content', '')
            print(f"  Content preview: {content[:100]}...")

    # Test fixed-size chunking
    print("\nFixed-size chunking:")
    if documents:
        fixed_chunks = document_processor.chunk_document_fixed_size(documents[0], 1000, 100)
        print(f"Created {len(fixed_chunks)} fixed-size chunks")
        for i, chunk in enumerate(fixed_chunks[:3]):  # Show first 3 chunks
            print(f"Chunk {i+1}:")
            print(f"  Type: {chunk.get('chunk_type')}")
            print(f"  Index: {chunk.get('chunk_index')}/{chunk.get('total_chunks')}")
            print(f"  Tokens: {chunk.get('token_count')}")
            content = chunk.get('content', '')
            print(f"  Content preview: {content[:100]}...")

    # Test hierarchical chunking
    print("\nHierarchical chunking:")
    if documents:
        hierarchical_chunks = document_processor.chunk_document_hierarchical(documents[0], 1000, 100)
        print(f"Created {len(hierarchical_chunks)} hierarchical chunks")
        for i, chunk in enumerate(hierarchical_chunks[:3]):  # Show first 3 chunks
            print(f"Chunk {i+1}:")
            print(f"  Type: {chunk.get('chunk_type')}")
            if chunk.get('chunk_type') == 'summary':
                print("  Summary chunk")
            else:
                print(f"  Section: {chunk.get('section_title', 'N/A')}")
            print(f"  Tokens: {chunk.get('token_count')}")
            content = chunk.get('content', '')
            print(f"  Content preview: {content[:100]}...")

    # Test chunk selection
    print("\nTesting chunk selection...")

    # Create a mix of chunks from all documents
    all_chunks = []
    for doc in documents:
        chunks = document_processor.chunk_document_by_sections(doc, 1000, 100)
        all_chunks.extend(chunks)
    print(f"Total chunks: {len(all_chunks)}")

    # Select chunks based on the token budget
    token_budget = 10000
    selected_chunks = document_processor.select_chunks_for_context(all_chunks, token_budget)
    total_tokens = sum(chunk.get('token_count', 0) for chunk in selected_chunks)
    print(f"Selected {len(selected_chunks)} chunks with {total_tokens} tokens (budget: {token_budget})")

    # Test full document processing
    print("\nTesting full document processing...")
    processed_chunks = document_processor.process_documents_for_report(
        documents,
        relevance_scores,
        token_budget=20000,
        chunk_size=1000,
        overlap_size=100,
    )
    total_processed_tokens = sum(chunk.get('token_count', 0) for chunk in processed_chunks)
    print(f"Processed {len(processed_chunks)} chunks with {total_processed_tokens} tokens")

    # Show the top 5 chunks
    print("\nTop 5 chunks:")
    for i, chunk in enumerate(processed_chunks[:5]):
        print(f"Chunk {i+1}:")
        print(f"  Document: {chunk.get('title')}")
        print(f"  Type: {chunk.get('chunk_type')}")
        print(f"  Priority: {chunk.get('priority_score', 0.0):.2f}")
        print(f"  Tokens: {chunk.get('token_count')}")
        content = chunk.get('content', '')
        print(f"  Content preview: {content[:100]}...")


async def main():
    """Main function to run the tests."""
    await test_document_processor()


if __name__ == "__main__":
    asyncio.run(main())
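
# NOTE: this script scrapes live web pages, so it needs network access and
# exercises the real scraper and database rather than mocks.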