115 lines
4.2 KiB
Python
115 lines
4.2 KiB
Python
"""
Test script for the document processor module.

This script tests the document prioritization and chunking functionality
of the document processor module.
"""
|
|
|
|
import os
|
|
import sys
|
|
import asyncio
|
|
import json
|
|
from datetime import datetime
|
|
from typing import Dict, List, Any, Optional
|
|
|
|
# Add the project root directory to the Python path
|
|
# Appends the directory two levels above this file to sys.path; presumably this
# is what lets the `report.*` imports below resolve when the script is run
# directly -- confirm against the repository layout.
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
|
|
|
from report.document_processor import get_document_processor
|
|
from report.database.db_manager import get_db_manager, initialize_database
|
|
from report.document_scraper import get_document_scraper
|
|
|
|
async def test_document_processor(use_mock: bool = False):
    """Exercise the document processor on a small set of sample pages.

    The database is initialized, three sample URLs are scraped, and the
    scraped documents are then run through ``prioritize_documents``,
    ``chunk_document_by_sections``, ``select_chunks_for_context`` and
    ``process_documents_for_report``. A short summary of every step is
    printed to stdout.

    Args:
        use_mock: Forwarded unchanged to ``get_document_scraper``.

    Returns:
        Whatever ``process_documents_for_report`` returns for the scraped
        documents and their relevance scores.
    """
    # Set up the database, the processor under test and the scraper.
    await initialize_database()
    document_processor = get_document_processor()
    document_scraper = get_document_scraper(use_mock=use_mock)

    # Sample search results with real, accessible URLs. Only 'url' and
    # 'score' are read below.
    search_results = [
        {
            'title': 'Python Documentation',
            'url': 'https://docs.python.org/3/',
            'snippet': 'Official Python documentation.',
            'score': 0.95
        },
        {
            'title': 'Python.org',
            'url': 'https://www.python.org/',
            'snippet': 'The official home of the Python Programming Language.',
            'score': 0.85
        },
        {
            'title': 'Wikipedia - Python',
            'url': 'https://en.wikipedia.org/wiki/Python_(programming_language)',
            'snippet': 'Python is a high-level, general-purpose programming language.',
            'score': 0.75
        }
    ]

    # Scrape each result in order; keep only truthy scrape results and key
    # the relevance score by the URL reported on the scraped document.
    documents = []
    relevance_scores = {}
    for hit in search_results:
        scraped = await document_scraper.scrape_url(hit['url'])
        if not scraped:
            continue
        documents.append(scraped)
        relevance_scores[scraped['url']] = hit['score']

    print(f"Scraped {len(documents)} documents")

    # --- Prioritization ---------------------------------------------------
    prioritized_docs = document_processor.prioritize_documents(documents, relevance_scores)
    print("\nPrioritized documents:")
    for rank, doc in enumerate(prioritized_docs, start=1):
        print(f"{rank}. {doc['title']} (Score: {doc.get('priority_score', 'N/A')})")

    # --- Chunking of the first scraped document ---------------------------
    if documents:
        first_doc = documents[0]
        print("\nChunking document:", first_doc['title'])
        chunks = document_processor.chunk_document_by_sections(first_doc)
        print(f"Created {len(chunks)} chunks")
        # Only the first three chunks are shown.
        for number, chunk in enumerate(chunks[:3], start=1):
            print(f"Chunk {number}: {chunk['title']} ({chunk['token_count']} tokens)")
            text = chunk['content']
            # Content longer than 100 characters is cut and marked with '...'.
            if len(text) > 100:
                content_preview = text[:100] + '...'
            else:
                content_preview = text
            print(f"Content: {content_preview}")

    # --- Token budget management ------------------------------------------
    token_budget = 4000
    print(f"\nSelecting chunks with token budget: {token_budget}")

    # Chunk every prioritized document and flatten the results in order.
    all_chunks = [
        piece
        for doc in prioritized_docs
        for piece in document_processor.chunk_document_by_sections(doc)
    ]

    selected_chunks = document_processor.select_chunks_for_context(all_chunks, token_budget)
    selected_count = len(selected_chunks)
    selected_tokens = sum(piece['token_count'] for piece in selected_chunks)
    print(f"Selected {selected_count} chunks with total tokens: {selected_tokens}")

    # --- End-to-end processing --------------------------------------------
    print("\nTesting end-to-end processing")
    processed_chunks = document_processor.process_documents_for_report(documents, relevance_scores)
    print(f"Processed {len(processed_chunks)} chunks for report")

    return processed_chunks
|
|
|
|
# Run test if this module is executed directly
|
|
if __name__ == "__main__":
    # Script entry point: parse the command line and run the async test once.
    import argparse

    cli = argparse.ArgumentParser(description='Test the document processor')
    cli.add_argument(
        '--mock',
        action='store_true',
        help='Use mock data instead of making actual API calls',
    )
    options = cli.parse_args()

    # Report which data source the run will use before starting it.
    source_label = 'mock data' if options.mock else 'real data'
    print(f"Running test with {source_label}")
    asyncio.run(test_document_processor(use_mock=options.mock))
|