ira/tests/test_report_synthesis.py

154 lines
6.1 KiB
Python
Executable File

#!/usr/bin/env python3
"""
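Test script for the report synthesis functionality.

Generates a report either from built-in sample document chunks (--sample)
or from documents scraped from URLs (--urls, or a default set of
Python-related pages), prints it, and in the URL mode also saves it to a
Markdown file.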
"""
import os
import sys
import asyncio
import json
import argparse
from typing import List, Dict, Any, Optional
# Add the parent directory to the path so we can import the modules
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from report.report_synthesis import get_report_synthesizer
from report.document_processor import get_document_processor
from report.document_scraper import get_document_scraper
from report.database.db_manager import get_db_manager, initialize_database
async def test_with_sample_chunks():
    """Synthesize a report from three hard-coded Python-related chunks and print it."""
    # The question the synthesized report is meant to answer.
    query = "What are the key features and applications of Python programming language?"

    # Hand-written stand-ins for processed document chunks.
    sample_chunks = [
        {
            "title": "Introduction to Python",
            "url": "https://docs.python.org/3/tutorial/index.html",
            "content": "Python is an easy to learn, powerful programming language. It has efficient high-level data structures and a simple but effective approach to object-oriented programming. Python's elegant syntax and dynamic typing, together with its interpreted nature, make it an ideal language for scripting and rapid application development in many areas on most platforms.",
            "chunk_type": "introduction",
            "priority_score": 0.95,
        },
        {
            "title": "Python Features",
            "url": "https://www.python.org/about/",
            "content": "Python is a programming language that lets you work quickly and integrate systems more effectively. Python is an interpreted, object-oriented, high-level programming language with dynamic semantics. Its high-level built in data structures, combined with dynamic typing and dynamic binding, make it very attractive for Rapid Application Development, as well as for use as a scripting or glue language to connect existing components together.",
            "chunk_type": "features",
            "priority_score": 0.90,
        },
        {
            "title": "Python Applications",
            "url": "https://www.python.org/about/apps/",
            "content": "Python is used in many application domains. Here's a sampling: Web and Internet Development, Scientific and Numeric Computing, Education, Desktop GUIs, Software Development, and Business Applications. Python is also used as a scripting language for web applications, e.g. via mod_wsgi for the Apache webserver. With Web Server Gateway Interface support, it has become the language of choice for many web developers.",
            "chunk_type": "applications",
            "priority_score": 0.85,
        },
    ]

    engine = get_report_synthesizer()

    print(f"Generating report for query: '{query}'")
    print("-" * 50)
    report = await engine.synthesize_report(sample_chunks, query)

    # Frame the report between two rules of '=' characters.
    frame = "=" * 50
    print("\nGenerated Report:")
    print(frame, report, frame, sep="\n")
async def test_with_real_urls(urls: List[str], query: str, use_mock: bool = False) -> None:
    """
    Test report synthesis with real URLs.

    Initializes the database, scrapes the given URLs, assigns each scraped
    document a mock relevance score, asks the document processor to select
    chunks for the report, synthesizes the report, prints it, and writes it
    to ``report_<unix-timestamp>.md`` relative to the current working
    directory.

    Args:
        urls: List of URLs to scrape
        query: Query to use for the report
        use_mock: Whether to use mock data for document scraping
    """
    # Local import: the file-level import block does not provide `time`.
    import time

    # Initialize the database
    await initialize_database()

    # Collaborators: scraper (optionally mocked), processor and synthesizer.
    document_scraper = get_document_scraper(use_mock=use_mock)
    document_processor = get_document_processor()
    report_synthesizer = get_report_synthesizer()

    # Scrape URLs
    print(f"Scraping {len(urls)} URLs...")
    documents = await document_scraper.scrape_urls(urls)
    print(f"Scraped {len(documents)} documents")

    # Mock relevance scores: 1.0 for the first document, decreasing by 0.1
    # per position. Documents sharing a URL share one entry (last one wins).
    # NOTE(review): the score becomes negative from the 12th document onward;
    # confirm the document processor tolerates that.
    relevance_scores = {
        doc.get('url'): 1.0 - (i * 0.1) for i, doc in enumerate(documents)
    }

    # Process documents for report
    print("Processing documents for report...")
    selected_chunks = document_processor.process_documents_for_report(
        documents,
        relevance_scores,
        token_budget=4000,
        chunk_size=1000,
        overlap_size=100,
    )
    print(f"Selected {len(selected_chunks)} chunks for report")

    # Generate report
    print(f"Generating report for query: '{query}'")
    print("-" * 50)
    report = await report_synthesizer.synthesize_report(selected_chunks, query)
    print("\nGenerated Report:")
    print("=" * 50)
    print(report)
    print("=" * 50)

    # Save the report to a file. The event loop's time() is a monotonic clock
    # with an arbitrary origin, so it is not a usable timestamp; wall-clock
    # time is used so file names are meaningful and do not repeat across runs.
    output_file = f"report_{int(time.time())}.md"
    # Explicit UTF-8 so non-ASCII report text does not depend on the locale.
    with open(output_file, "w", encoding="utf-8") as f:
        f.write(report)
    print(f"Report saved to {output_file}")
async def main():
    """Parse the command-line options and run the selected synthesis test."""
    cli = argparse.ArgumentParser(description="Test report synthesis functionality")
    cli.add_argument("--sample", action="store_true", help="Use sample document chunks")
    cli.add_argument("--urls", nargs="+", help="URLs to scrape")
    cli.add_argument(
        "--query",
        type=str,
        default="What are the key features and applications of Python programming language?",
        help="Query to use for the report",
    )
    cli.add_argument("--mock", action="store_true", help="Use mock data for document scraping")
    options = cli.parse_args()

    # --sample takes precedence over any URL-based run.
    if options.sample:
        await test_with_sample_chunks()
        return

    # Without --urls, fall back to a few Python-related pages.
    target_urls = options.urls or [
        "https://docs.python.org/3/tutorial/index.html",
        "https://www.python.org/about/",
        "https://www.python.org/about/apps/",
        "https://realpython.com/python-introduction/",
    ]
    await test_with_real_urls(target_urls, options.query, options.mock)
# Run the async entry point only when executed as a script, not on import.
if __name__ == "__main__":
    asyncio.run(main())