154 lines
6.1 KiB
Python
Executable File
154 lines
6.1 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""
|
|
Test script for the report synthesis functionality.
|
|
|
|
This script tests the report synthesis functionality by generating a report
|
|
from sample document chunks.
|
|
"""
|
|
|
|
import os
|
|
import sys
|
|
import asyncio
|
|
import json
|
|
import argparse
|
|
from typing import List, Dict, Any, Optional
|
|
|
|
# Add the parent directory to the path so we can import the modules
|
|
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
|
|
|
from report.report_synthesis import get_report_synthesizer
|
|
from report.document_processor import get_document_processor
|
|
from report.document_scraper import get_document_scraper
|
|
from report.database.db_manager import get_db_manager, initialize_database
|
|
|
|
async def test_with_sample_chunks():
    """Synthesize a report from three hard-coded Python-themed chunks and print it."""
    query = "What are the key features and applications of Python programming language?"
    banner = "=" * 50

    # Column order of every row in `sample_rows`; it is also the key order of
    # the chunk dictionaries handed to the synthesizer.
    chunk_fields = ("title", "url", "content", "chunk_type", "priority_score")
    sample_rows = (
        (
            "Introduction to Python",
            "https://docs.python.org/3/tutorial/index.html",
            "Python is an easy to learn, powerful programming language. It has efficient high-level data structures and a simple but effective approach to object-oriented programming. Python's elegant syntax and dynamic typing, together with its interpreted nature, make it an ideal language for scripting and rapid application development in many areas on most platforms.",
            "introduction",
            0.95,
        ),
        (
            "Python Features",
            "https://www.python.org/about/",
            "Python is a programming language that lets you work quickly and integrate systems more effectively. Python is an interpreted, object-oriented, high-level programming language with dynamic semantics. Its high-level built in data structures, combined with dynamic typing and dynamic binding, make it very attractive for Rapid Application Development, as well as for use as a scripting or glue language to connect existing components together.",
            "features",
            0.90,
        ),
        (
            "Python Applications",
            "https://www.python.org/about/apps/",
            "Python is used in many application domains. Here's a sampling: Web and Internet Development, Scientific and Numeric Computing, Education, Desktop GUIs, Software Development, and Business Applications. Python is also used as a scripting language for web applications, e.g. via mod_wsgi for the Apache webserver. With Web Server Gateway Interface support, it has become the language of choice for many web developers.",
            "applications",
            0.85,
        ),
    )
    chunks = [dict(zip(chunk_fields, row)) for row in sample_rows]

    synthesizer = get_report_synthesizer()

    # Announce the run before handing off to the synthesizer.
    print(f"Generating report for query: '{query}'")
    print("-" * 50)

    report = await synthesizer.synthesize_report(chunks, query)

    # Heading, then the report framed by two separator lines.
    for line in ("\nGenerated Report:", banner, report, banner):
        print(line)
|
|
|
|
async def test_with_real_urls(urls: List[str], query: str, use_mock: bool = False):
    """
    Test report synthesis with real URLs.

    Initializes the database, scrapes the given URLs, assigns placeholder
    relevance scores to the scraped documents, lets the document processor
    select chunks, synthesizes a report for ``query``, prints it and writes it
    to ``report_<unix-timestamp>.md`` in the current working directory.

    Args:
        urls: List of URLs to scrape
        query: Query to use for the report
        use_mock: Whether to use mock data for document scraping
    """
    # Imported locally because it is only needed to name the output file.
    import time

    # Initialize the database
    await initialize_database()

    # Get the pipeline components (the scraper honours the mock option)
    document_scraper = get_document_scraper(use_mock=use_mock)
    document_processor = get_document_processor()
    report_synthesizer = get_report_synthesizer()

    # Scrape URLs
    print(f"Scraping {len(urls)} URLs...")
    documents = await document_scraper.scrape_urls(urls)
    print(f"Scraped {len(documents)} documents")

    # Create relevance scores (mock scores for this test): the first document
    # gets 1.0 and each following one 0.1 less, keyed by the document's URL.
    # NOTE(review): scores become negative beyond ten documents - confirm the
    # document processor tolerates that.
    relevance_scores = {
        doc.get('url'): 1.0 - (i * 0.1)
        for i, doc in enumerate(documents)
    }

    # Process documents for report
    print("Processing documents for report...")
    selected_chunks = document_processor.process_documents_for_report(
        documents,
        relevance_scores,
        token_budget=4000,
        chunk_size=1000,
        overlap_size=100,
    )
    print(f"Selected {len(selected_chunks)} chunks for report")

    # Generate report
    print(f"Generating report for query: '{query}'")
    print("-" * 50)

    report = await report_synthesizer.synthesize_report(selected_chunks, query)

    print("\nGenerated Report:")
    print("=" * 50)
    print(report)
    print("=" * 50)

    # Save the report to a file. The name uses wall-clock time: the event
    # loop's clock is monotonic (not tied to the calendar), so it could repeat
    # between runs and overwrite an earlier report.
    output_file = f"report_{int(time.time())}.md"
    # Explicit UTF-8 so non-ASCII characters in the report are written the
    # same way regardless of the platform's default encoding.
    with open(output_file, "w", encoding="utf-8") as f:
        f.write(report)

    print(f"Report saved to {output_file}")
|
|
|
|
async def main():
    """Parse the command-line options and run the selected test scenario."""
    cli = argparse.ArgumentParser(description="Test report synthesis functionality")
    cli.add_argument("--sample", action="store_true", help="Use sample document chunks")
    cli.add_argument("--urls", nargs="+", help="URLs to scrape")
    cli.add_argument(
        "--query",
        type=str,
        default="What are the key features and applications of Python programming language?",
        help="Query to use for the report",
    )
    cli.add_argument("--mock", action="store_true", help="Use mock data for document scraping")

    options = cli.parse_args()

    # --sample takes precedence over every URL-based mode.
    if options.sample:
        await test_with_sample_chunks()
        return

    # With nargs="+", --urls is either absent (None) or a non-empty list, so a
    # plain truthiness fallback picks the default Python-related URLs exactly
    # when no URLs were supplied.
    target_urls = options.urls or [
        "https://docs.python.org/3/tutorial/index.html",
        "https://www.python.org/about/",
        "https://www.python.org/about/apps/",
        "https://realpython.com/python-introduction/",
    ]
    await test_with_real_urls(target_urls, options.query, options.mock)
|
|
|
|
if __name__ == "__main__":
    # Script entry point: run the async main() on a fresh event loop.
    asyncio.run(main())
|