"""
|
|
Test script for the document processor module.
|
|
|
|
This script tests the document prioritization and chunking functionality
|
|
of the document processor module.
|
|
"""
|
|
|
|
import os
|
|
import sys
|
|
import asyncio
|
|
import json
|
|
from datetime import datetime
|
|
from typing import Dict, List, Any, Optional
|
|
|
|
# Add the project root directory to the Python path
|
|
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
|
|
|
from report.document_processor import get_document_processor
|
|
from report.database.db_manager import get_db_manager, initialize_database
|
|
from report.document_scraper import get_document_scraper
|
|
|
|


async def test_document_processor():
    """Test the document processor with sample documents."""
    # Initialize the database
    await initialize_database()

    # Get the document processor and scraper
    document_processor = get_document_processor()
    document_scraper = get_document_scraper()
    db_manager = get_db_manager()

    # Sample URLs to test with
    test_urls = [
        "https://en.wikipedia.org/wiki/Python_(programming_language)",
        "https://en.wikipedia.org/wiki/Natural_language_processing",
        "https://docs.python.org/3/tutorial/index.html",
        "https://en.wikipedia.org/wiki/Machine_learning"
    ]

    # Scrape the URLs
    print(f"Scraping {len(test_urls)} URLs...")
    documents = await document_scraper.scrape_urls(test_urls)
    print(f"Scraped {len(documents)} documents")

    # Sample relevance scores
    relevance_scores = {
        "https://en.wikipedia.org/wiki/Python_(programming_language)": 0.95,
        "https://en.wikipedia.org/wiki/Natural_language_processing": 0.85,
        "https://docs.python.org/3/tutorial/index.html": 0.75,
        "https://en.wikipedia.org/wiki/Machine_learning": 0.65
    }

    # Test document prioritization
    print("\nTesting document prioritization...")
    prioritized_docs = document_processor.prioritize_documents(documents, relevance_scores)

    print("Prioritized documents:")
    for i, doc in enumerate(prioritized_docs):
        print(f"{i+1}. {doc.get('title')} - Score: {doc.get('priority_score', 0.0):.2f}")

    # Test document chunking
    print("\nTesting document chunking...")

    # Test section-based chunking
    print("\nSection-based chunking:")
    if documents:
        section_chunks = document_processor.chunk_document_by_sections(documents[0], 1000, 100)
        print(f"Created {len(section_chunks)} section-based chunks")

        for i, chunk in enumerate(section_chunks[:3]):  # Show first 3 chunks
            print(f"Chunk {i+1}:")
            print(f" Type: {chunk.get('chunk_type')}")
            print(f" Section: {chunk.get('section_title', 'N/A')}")
            print(f" Tokens: {chunk.get('token_count')}")
            content = chunk.get('content', '')
            print(f" Content preview: {content[:100]}...")

    # Test fixed-size chunking
    print("\nFixed-size chunking:")
    if documents:
        fixed_chunks = document_processor.chunk_document_fixed_size(documents[0], 1000, 100)
        print(f"Created {len(fixed_chunks)} fixed-size chunks")

        for i, chunk in enumerate(fixed_chunks[:3]):  # Show first 3 chunks
            print(f"Chunk {i+1}:")
            print(f" Type: {chunk.get('chunk_type')}")
            print(f" Index: {chunk.get('chunk_index')}/{chunk.get('total_chunks')}")
            print(f" Tokens: {chunk.get('token_count')}")
            content = chunk.get('content', '')
            print(f" Content preview: {content[:100]}...")

    # Test hierarchical chunking
    print("\nHierarchical chunking:")
    if documents:
        hierarchical_chunks = document_processor.chunk_document_hierarchical(documents[0], 1000, 100)
        print(f"Created {len(hierarchical_chunks)} hierarchical chunks")

        for i, chunk in enumerate(hierarchical_chunks[:3]):  # Show first 3 chunks
            print(f"Chunk {i+1}:")
            print(f" Type: {chunk.get('chunk_type')}")
            if chunk.get('chunk_type') == 'summary':
                print(" Summary chunk")
            else:
                print(f" Section: {chunk.get('section_title', 'N/A')}")
            print(f" Tokens: {chunk.get('token_count')}")
            content = chunk.get('content', '')
            print(f" Content preview: {content[:100]}...")

    # Test chunk selection
    print("\nTesting chunk selection...")

    # Create a mix of chunks from all documents
    all_chunks = []
    for doc in documents:
        chunks = document_processor.chunk_document_by_sections(doc, 1000, 100)
        all_chunks.extend(chunks)

    print(f"Total chunks: {len(all_chunks)}")

    # Select chunks based on token budget
    token_budget = 10000
    selected_chunks = document_processor.select_chunks_for_context(all_chunks, token_budget)

    total_tokens = sum(chunk.get('token_count', 0) for chunk in selected_chunks)
    print(f"Selected {len(selected_chunks)} chunks with {total_tokens} tokens (budget: {token_budget})")

    # Test full document processing
    print("\nTesting full document processing...")
    processed_chunks = document_processor.process_documents_for_report(
        documents,
        relevance_scores,
        token_budget=20000,
        chunk_size=1000,
        overlap_size=100
    )

    total_processed_tokens = sum(chunk.get('token_count', 0) for chunk in processed_chunks)
    print(f"Processed {len(processed_chunks)} chunks with {total_processed_tokens} tokens")

    # Show the top 5 chunks
    print("\nTop 5 chunks:")
    for i, chunk in enumerate(processed_chunks[:5]):
        print(f"Chunk {i+1}:")
        print(f" Document: {chunk.get('title')}")
        print(f" Type: {chunk.get('chunk_type')}")
        print(f" Priority: {chunk.get('priority_score', 0.0):.2f}")
        print(f" Tokens: {chunk.get('token_count')}")
        content = chunk.get('content', '')
        print(f" Content preview: {content[:100]}...")


async def main():
    """Main function to run the tests."""
    await test_document_processor()


if __name__ == "__main__":
    asyncio.run(main())