# ira/tests/test_document_scraper.py
#
# 83 lines
# 2.6 KiB
# Python

"""
Test script for the document scraper module.
This script tests the functionality of the document scraper module
by scraping a few sample URLs and storing them in the database.
"""
import os
import sys
import asyncio
import logging
from typing import List, Dict, Any
# Add parent directory to path to allow importing modules
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from report.database.db_manager import initialize_database, get_db_manager
from report.document_scraper import get_document_scraper
# Set up INFO-level logging with a timestamp / logger name / level / message
# layout, then create the logger used throughout this script.
logging.basicConfig(
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)
# Pages passed to the scraper by the test below; the first entry is also
# the one looked up again by URL.
TEST_URLS = [
    "https://en.wikipedia.org/wiki/Web_scraping",
    "https://en.wikipedia.org/wiki/Natural_language_processing",
    "https://en.wikipedia.org/wiki/SQLite",
]
async def test_document_scraper():
    """Test the document scraper with sample URLs.

    Initializes the database, scrapes every URL in ``TEST_URLS``, logs a
    short summary of each returned document, and then calls the database
    manager's search, lookup-by-URL and count operations, logging the
    outcome of each.

    Returns:
        bool: ``True`` if the document for ``TEST_URLS[0]`` could be
        retrieved from the database, ``False`` otherwise. Exceptions raised
        by the scraper or the database manager are not caught here and
        propagate to the caller.
    """
    # Flipped to False by any failed check so the caller can report failure
    # instead of unconditionally seeing success.
    success = True

    # Initialize database
    await initialize_database()
    logger.info("Database initialized")

    # Get document scraper
    scraper = get_document_scraper()

    # Scrape URLs
    logger.info(f"Scraping {len(TEST_URLS)} URLs...")
    documents = await scraper.scrape_urls(TEST_URLS)

    # Print results
    logger.info(f"Successfully scraped {len(documents)} documents")
    for doc in documents:
        logger.info(f"Title: {doc['title']}")
        logger.info(f"URL: {doc['url']}")
        logger.info(f"Token count: {doc['token_count']}")
        logger.info(f"Content preview: {doc['content'][:200]}...")
        logger.info("-" * 80)

    # Test database search
    db_manager = get_db_manager()
    search_results = await db_manager.search_documents("scraping")
    logger.info(f"Found {len(search_results)} documents matching 'scraping'")

    # Test document retrieval by URL (separate name: do not reuse the loop
    # variable from the summary loop above)
    retrieved = await db_manager.get_document_by_url(TEST_URLS[0])
    if retrieved:
        logger.info(f"Retrieved document by URL: {retrieved['title']}")
    else:
        logger.error(f"Failed to retrieve document by URL: {TEST_URLS[0]}")
        success = False

    # Count documents in database (still reported when a check has failed)
    count = await db_manager.count_documents()
    logger.info(f"Total documents in database: {count}")

    return success
if __name__ == "__main__":
    # Run the async test to completion and turn its outcome into a process
    # exit status: 0 when it reports success, 1 when it reports failure or
    # raises.
    try:
        success = asyncio.run(test_document_scraper())
    except Exception as e:
        logger.exception(f"Error running tests: {str(e)}")
        exit_code = 1
    else:
        if success:
            logger.info("All tests passed!")
            exit_code = 0
        else:
            logger.error("Tests failed!")
            exit_code = 1
    sys.exit(exit_code)