# ira/tests/test_document_processor.py
"""
Test script for the document processor module.
This script tests the document prioritization and chunking functionality
of the document processor module.
"""
import os
import sys
import asyncio

# Add the project root directory to the Python path
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from report.document_processor import get_document_processor
from report.database.db_manager import initialize_database
from report.document_scraper import get_document_scraper


async def test_document_processor(use_mock: bool = False):
"""Test the document processor with sample documents."""
# Initialize database
await initialize_database()
# Create document processor
document_processor = get_document_processor()
# Create document scraper with mock option
document_scraper = get_document_scraper(use_mock=use_mock)
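    # NOTE: with use_mock=True the scraper is expected to return canned
    # content instead of fetching the URLs (see the --mock flag in
    # __main__ below).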

    # Sample search results with real, accessible URLs
    search_results = [
        {
            'title': 'Python Documentation',
            'url': 'https://docs.python.org/3/',
            'snippet': 'Official Python documentation.',
            'score': 0.95
        },
        {
            'title': 'Python.org',
            'url': 'https://www.python.org/',
            'snippet': 'The official home of the Python Programming Language.',
            'score': 0.85
        },
        {
            'title': 'Wikipedia - Python',
            'url': 'https://en.wikipedia.org/wiki/Python_(programming_language)',
            'snippet': 'Python is a high-level, general-purpose programming language.',
            'score': 0.75
        }
    ]

    # Process search results
    documents = []
    relevance_scores = {}
    for result in search_results:
        # Scrape document
        document = await document_scraper.scrape_url(result['url'])
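        # scrape_url is assumed to return None (or another falsy value) on a
        # failed fetch, so only successful scrapes are kept.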
        if document:
            documents.append(document)
            relevance_scores[document['url']] = result['score']

    print(f"Scraped {len(documents)} documents")

    # Test document prioritization
    prioritized_docs = document_processor.prioritize_documents(documents, relevance_scores)
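    # prioritize_documents is assumed to annotate each document with a
    # 'priority_score' field, which is read back when printing below.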
print("\nPrioritized documents:")
for i, doc in enumerate(prioritized_docs):
print(f"{i+1}. {doc['title']} (Score: {doc.get('priority_score', 'N/A')})")

    # Test document chunking
    if documents:
        print("\nChunking document:", documents[0]['title'])
        chunks = document_processor.chunk_document_by_sections(documents[0])
        print(f"Created {len(chunks)} chunks")
        for i, chunk in enumerate(chunks[:3]):  # Show first 3 chunks
            print(f"Chunk {i+1}: {chunk['title']} ({chunk['token_count']} tokens)")
            content_preview = chunk['content'][:100] + '...' if len(chunk['content']) > 100 else chunk['content']
            print(f"Content: {content_preview}")

    # Test token budget management
    token_budget = 4000
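    # The budget is assumed to be an upper bound on the combined token_count
    # of the chunks that select_chunks_for_context returns.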
print(f"\nSelecting chunks with token budget: {token_budget}")
# Create chunks for each document
all_chunks = []
for doc in prioritized_docs:
doc_chunks = document_processor.chunk_document_by_sections(doc)
all_chunks.extend(doc_chunks)
# Select chunks based on token budget
selected_chunks = document_processor.select_chunks_for_context(all_chunks, token_budget)
print(f"Selected {len(selected_chunks)} chunks with total tokens: {sum(c['token_count'] for c in selected_chunks)}")

    # Test end-to-end processing
    print("\nTesting end-to-end processing")
    processed_chunks = document_processor.process_documents_for_report(documents, relevance_scores)
    print(f"Processed {len(processed_chunks)} chunks for report")

    return processed_chunks


# Run test if this module is executed directly
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description='Test the document processor')
    parser.add_argument('--mock', action='store_true', help='Use mock data instead of making actual API calls')
    args = parser.parse_args()

    print(f"Running test with {'mock data' if args.mock else 'real data'}")
    asyncio.run(test_document_processor(use_mock=args.mock))