""" Test script for the document scraper module. This script tests the functionality of the document scraper module by scraping a few sample URLs and storing them in the database. """ import os import sys import asyncio import logging from typing import List, Dict, Any # Add parent directory to path to allow importing modules sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from report.database.db_manager import initialize_database, get_db_manager from report.document_scraper import get_document_scraper # Configure logging logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') logger = logging.getLogger(__name__) # Sample URLs for testing TEST_URLS = [ "https://en.wikipedia.org/wiki/Web_scraping", "https://en.wikipedia.org/wiki/Natural_language_processing", "https://en.wikipedia.org/wiki/SQLite" ] async def test_document_scraper(): """Test the document scraper with sample URLs.""" # Initialize database await initialize_database() logger.info("Database initialized") # Get document scraper scraper = get_document_scraper() # Scrape URLs logger.info(f"Scraping {len(TEST_URLS)} URLs...") documents = await scraper.scrape_urls(TEST_URLS) # Print results logger.info(f"Successfully scraped {len(documents)} documents") for doc in documents: logger.info(f"Title: {doc['title']}") logger.info(f"URL: {doc['url']}") logger.info(f"Token count: {doc['token_count']}") logger.info(f"Content preview: {doc['content'][:200]}...") logger.info("-" * 80) # Test database search db_manager = get_db_manager() search_results = await db_manager.search_documents("scraping") logger.info(f"Found {len(search_results)} documents matching 'scraping'") # Test document retrieval by URL doc = await db_manager.get_document_by_url(TEST_URLS[0]) if doc: logger.info(f"Retrieved document by URL: {doc['title']}") else: logger.error(f"Failed to retrieve document by URL: {TEST_URLS[0]}") # Count documents in database count = await db_manager.count_documents() logger.info(f"Total documents in database: {count}") return True if __name__ == "__main__": try: success = asyncio.run(test_document_scraper()) if success: logger.info("All tests passed!") sys.exit(0) else: logger.error("Tests failed!") sys.exit(1) except Exception as e: logger.exception(f"Error running tests: {str(e)}") sys.exit(1)