"""
|
|
Test script for the document processor module.
|
|
|
|
This script tests the document prioritization and chunking functionality
|
|
of the document processor module.
|
|
"""
|
|
|
|
import os
|
|
import sys
|
|
import asyncio
|
|
import json
|
|
from datetime import datetime
|
|
from typing import Dict, List, Any, Optional
|
|
|
|
# Add the project root directory to the Python path
|
|
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
|
|
|
from report.document_processor import get_document_processor
|
|
from report.database.db_manager import get_db_manager, initialize_database
|
|
from report.document_scraper import get_document_scraper
|
|
|
|


async def test_document_processor():
    """Test the document processor with sample documents."""
    # Initialize the database
    await initialize_database()

    # Get the document processor and scraper
    document_processor = get_document_processor()
    document_scraper = get_document_scraper()
    db_manager = get_db_manager()

    # Sample URLs to test with
    test_urls = [
        "https://en.wikipedia.org/wiki/Python_(programming_language)",
        "https://en.wikipedia.org/wiki/Natural_language_processing",
        "https://docs.python.org/3/tutorial/index.html",
        "https://en.wikipedia.org/wiki/Machine_learning"
    ]

    # Scrape the URLs
    print(f"Scraping {len(test_urls)} URLs...")
    documents = await document_scraper.scrape_urls(test_urls)
    print(f"Scraped {len(documents)} documents")

    # Sample relevance scores
    relevance_scores = {
        "https://en.wikipedia.org/wiki/Python_(programming_language)": 0.95,
        "https://en.wikipedia.org/wiki/Natural_language_processing": 0.85,
        "https://docs.python.org/3/tutorial/index.html": 0.75,
        "https://en.wikipedia.org/wiki/Machine_learning": 0.65
    }

    # Test document prioritization
    print("\nTesting document prioritization...")
    prioritized_docs = document_processor.prioritize_documents(documents, relevance_scores)

    print("Prioritized documents:")
    for i, doc in enumerate(prioritized_docs):
        print(f"{i+1}. {doc.get('title')} - Score: {doc.get('priority_score', 0.0):.2f}")

    # Test document chunking
    print("\nTesting document chunking...")

    # Test section-based chunking
    print("\nSection-based chunking:")
    if documents:
        section_chunks = document_processor.chunk_document_by_sections(documents[0], 1000, 100)
        print(f"Created {len(section_chunks)} section-based chunks")

        for i, chunk in enumerate(section_chunks[:3]):  # Show first 3 chunks
            print(f"Chunk {i+1}:")
            print(f" Type: {chunk.get('chunk_type')}")
            print(f" Section: {chunk.get('section_title', 'N/A')}")
            print(f" Tokens: {chunk.get('token_count')}")
            content = chunk.get('content', '')
            print(f" Content preview: {content[:100]}...")

    # Test fixed-size chunking
    print("\nFixed-size chunking:")
    if documents:
        fixed_chunks = document_processor.chunk_document_fixed_size(documents[0], 1000, 100)
        print(f"Created {len(fixed_chunks)} fixed-size chunks")

        for i, chunk in enumerate(fixed_chunks[:3]):  # Show first 3 chunks
            print(f"Chunk {i+1}:")
            print(f" Type: {chunk.get('chunk_type')}")
            print(f" Index: {chunk.get('chunk_index')}/{chunk.get('total_chunks')}")
            print(f" Tokens: {chunk.get('token_count')}")
            content = chunk.get('content', '')
            print(f" Content preview: {content[:100]}...")

    # Test hierarchical chunking
    print("\nHierarchical chunking:")
    if documents:
        hierarchical_chunks = document_processor.chunk_document_hierarchical(documents[0], 1000, 100)
        print(f"Created {len(hierarchical_chunks)} hierarchical chunks")

        for i, chunk in enumerate(hierarchical_chunks[:3]):  # Show first 3 chunks
            print(f"Chunk {i+1}:")
            print(f" Type: {chunk.get('chunk_type')}")
            if chunk.get('chunk_type') == 'summary':
                print(" Summary chunk")
            else:
                print(f" Section: {chunk.get('section_title', 'N/A')}")
            print(f" Tokens: {chunk.get('token_count')}")
            content = chunk.get('content', '')
            print(f" Content preview: {content[:100]}...")

    # Test chunk selection
    print("\nTesting chunk selection...")

    # Create a mix of chunks from all documents
    all_chunks = []
    for doc in documents:
        chunks = document_processor.chunk_document_by_sections(doc, 1000, 100)
        all_chunks.extend(chunks)

    print(f"Total chunks: {len(all_chunks)}")

    # Select chunks based on token budget
    token_budget = 10000
    selected_chunks = document_processor.select_chunks_for_context(all_chunks, token_budget)

    total_tokens = sum(chunk.get('token_count', 0) for chunk in selected_chunks)
    print(f"Selected {len(selected_chunks)} chunks with {total_tokens} tokens (budget: {token_budget})")

    # Test full document processing
    print("\nTesting full document processing...")
    processed_chunks = document_processor.process_documents_for_report(
        documents,
        relevance_scores,
        token_budget=20000,
        chunk_size=1000,
        overlap_size=100
    )

    total_processed_tokens = sum(chunk.get('token_count', 0) for chunk in processed_chunks)
    print(f"Processed {len(processed_chunks)} chunks with {total_processed_tokens} tokens")

    # Show the top 5 chunks
    print("\nTop 5 chunks:")
    for i, chunk in enumerate(processed_chunks[:5]):
        print(f"Chunk {i+1}:")
        print(f" Document: {chunk.get('title')}")
        print(f" Type: {chunk.get('chunk_type')}")
        print(f" Priority: {chunk.get('priority_score', 0.0):.2f}")
        print(f" Tokens: {chunk.get('token_count')}")
        content = chunk.get('content', '')
        print(f" Content preview: {content[:100]}...")


async def main():
    """Main function to run the tests."""
    await test_document_processor()


if __name__ == "__main__":
    asyncio.run(main())