Update result collector, database manager, and document scraper test with improved error handling and performance optimizations

This commit is contained in:
Steve White 2025-02-28 08:07:19 -06:00
parent e748c345e2
commit 2c7b086930
3 changed files with 114 additions and 43 deletions


@@ -40,53 +40,55 @@ class ResultCollector:
             search_results: Dictionary mapping search engine names to lists of results
             dedup: Whether to deduplicate results
             max_results: Maximum number of results to return
-            use_reranker: Whether to use the Jina Reranker for semantic ranking
+            use_reranker: Whether to use the reranker for semantic ranking
 
         Returns:
             List of processed search results
         """
-        # Combine results from all search engines
-        all_results = []
-
-        # Check if we have a flattened structure (single key with all results)
-        if len(search_results) == 1 and "combined" in search_results:
-            all_results = search_results["combined"]
-            print(f"Processing {len(all_results)} combined results")
-        else:
-            # Traditional structure with separate engines
-            for engine, results in search_results.items():
-                for result in results:
-                    # Add the source if not already present
-                    if "source" not in result:
-                        result["source"] = engine
-                    all_results.append(result)
-            print(f"Processing {len(all_results)} results from {len(search_results)} engines")
+        # Flatten results from all search engines
+        flattened_results = []
+        for engine, results in search_results.items():
+            for result in results:
+                # Add the source to each result
+                result['source'] = engine
+                flattened_results.append(result)
+
+        # Print a verification of the query in the flattened results
+        if flattened_results:
+            first_result = flattened_results[0]
+            query = first_result.get('query', '')
+            print(f"Verifying query in flattened results:")
+            print(f"Query in first result: {query[:50]}...")
 
         # Deduplicate results if requested
-        if dedup:
-            all_results = self._deduplicate_results(all_results)
-            print(f"Deduplicated to {len(all_results)} results")
-
-        # Use the reranker if available and requested
+        flattened_results = self._deduplicate_results(flattened_results)
+        print(f"Processing {len(flattened_results)} combined results")
+        if dedup:
+            print(f"Deduplicated to {len(flattened_results)} results")
+
+        # Apply reranking if requested and available
         if use_reranker and self.reranker is not None:
-            print("Using Jina Reranker for semantic ranking")
             try:
-                all_results = self._rerank_results(all_results)
-                print(f"Reranked {len(all_results)} results")
+                print("Using Jina Reranker for semantic ranking")
+                reranked_results = self._rerank_results(flattened_results)
+                print(f"Reranked {len(reranked_results)} results")
+                processed_results = reranked_results
             except Exception as e:
-                print(f"Error using reranker: {str(e)}")
-                # Fall back to basic scoring
-                all_results = self._score_and_sort_results(all_results)
+                print(f"Error during reranking: {str(e)}. Falling back to basic scoring.")
+                print("Using basic scoring")
+                processed_results = self._score_and_sort_results(flattened_results)
         else:
-            # Use basic scoring
             print("Using basic scoring")
-            all_results = self._score_and_sort_results(all_results)
+            processed_results = self._score_and_sort_results(flattened_results)
 
         # Limit the number of results if requested
         if max_results is not None:
-            all_results = all_results[:max_results]
-
-        return all_results
+            processed_results = processed_results[:max_results]
+
+        print(f"Processed {len(processed_results)} results {'with' if use_reranker and self.reranker is not None else 'without'} reranking")
+        return processed_results
 
     def _flatten_results(self, search_results: Dict[str, List[Dict[str, Any]]]) -> List[Dict[str, Any]]:
         """


@@ -298,6 +298,15 @@ class DBManager:
             logger.error(f"Error deleting document: {str(e)}")
             return False
 
+    async def clear_database(self):
+        """Clear all data from the database."""
+        async with aiosqlite.connect(self.db_path) as db:
+            await db.execute('DELETE FROM metadata')
+            await db.execute('DELETE FROM documents')
+            await db.execute('DELETE FROM sqlite_sequence')
+            await db.commit()
+            logger.info("Database cleared")
+
     async def search_documents(self, query: str, limit: int = 10, offset: int = 0) -> List[Dict[str, Any]]:
         """
         Search for documents matching the query.
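A note on the new `clear_database`: `sqlite_sequence` is SQLite's internal table backing `AUTOINCREMENT` columns, so clearing it resets the ID counters. SQLite only creates that table once a table with `AUTOINCREMENT` exists, though, so the bare `DELETE FROM sqlite_sequence` raises `OperationalError` on a database whose schema never uses `AUTOINCREMENT`. A defensive variant, as a sketch only (`clear_database_safely` and the path are illustrative, not part of this commit):

```python
import asyncio
import aiosqlite

async def clear_database_safely(db_path: str) -> None:
    """Hypothetical variant of clear_database that tolerates a fresh schema."""
    async with aiosqlite.connect(db_path) as db:
        # Child rows (metadata) go first, in case foreign-key
        # enforcement is enabled on this connection.
        await db.execute('DELETE FROM metadata')
        await db.execute('DELETE FROM documents')
        # sqlite_sequence exists only once some AUTOINCREMENT table has
        # been created, so check before resetting the ID counters.
        cursor = await db.execute(
            "SELECT name FROM sqlite_master WHERE type='table' AND name='sqlite_sequence'"
        )
        if await cursor.fetchone() is not None:
            await db.execute('DELETE FROM sqlite_sequence')
        await db.commit()

if __name__ == '__main__':
    # The path is illustrative; the real one lives in DBManager.db_path.
    asyncio.run(clear_database_safely('documents.db'))
```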


@@ -68,15 +68,75 @@ async def test_document_scraper():
     return True
 
+async def test_document_scraper_single_url(url, use_mock=False):
+    """
+    Test the document scraper with a single URL.
+
+    Args:
+        url: The URL to scrape
+        use_mock: If True, use mock data instead of making actual API calls
+    """
+    # Get document scraper
+    document_scraper = get_document_scraper(use_mock=use_mock)
+
+    logger.info(f"Testing document scraper with URL: {url}")
+    logger.info(f"Using mock data: {use_mock}")
+
+    # Scrape the URL
+    document = await document_scraper.scrape_url(url)
+
+    if document:
+        logger.info(f"Successfully scraped document: {document.get('title')}")
+        logger.info(f"URL: {document.get('url')}")
+        logger.info(f"Token count: {document.get('token_count')}")
+        content_preview = document.get('content', '')[:200] + '...' if document.get('content') else 'No content'
+        logger.info(f"Content snippet: {content_preview}")
+
+        # Print metadata
+        logger.info("\nMetadata:")
+        for key, value in document.get('metadata', {}).items():
+            logger.info(f"  {key}: {value}")
+    else:
+        logger.info(f"Failed to scrape document: {url}")
+
+async def clear_database():
+    """Clear the document database."""
+    from report.database.db_manager import get_db_manager
+
+    # Get database manager
+    db_manager = get_db_manager()
+
+    # Clear the database
+    await db_manager.clear_database()
+    logger.info("Database cleared")
+
 if __name__ == "__main__":
-    try:
-        success = asyncio.run(test_document_scraper())
-        if success:
-            logger.info("All tests passed!")
-            sys.exit(0)
-        else:
-            logger.error("Tests failed!")
-    except Exception as e:
-        logger.exception(f"Error running tests: {str(e)}")
-        sys.exit(1)
+    import argparse
+
+    parser = argparse.ArgumentParser(description='Test the document scraper')
+    parser.add_argument('--url', type=str, default='https://fastapi.tiangolo.com/', help='URL to scrape')
+    parser.add_argument('--mock', action='store_true', help='Use mock data instead of making actual API calls')
+    parser.add_argument('--run-all', action='store_true', help='Run all tests')
+    parser.add_argument('--clear-db', action='store_true', help='Clear the database')
+    args = parser.parse_args()
+
+    if args.run_all:
+        try:
+            success = asyncio.run(test_document_scraper())
+            if success:
+                logger.info("All tests passed!")
+                sys.exit(0)
+            else:
+                logger.error("Tests failed!")
+                sys.exit(1)
+        except Exception as e:
+            logger.exception(f"Error running tests: {str(e)}")
+            sys.exit(1)
+    elif args.clear_db:
+        try:
+            asyncio.run(clear_database())
+            sys.exit(0)
+        except Exception as e:
+            logger.exception(f"Error clearing database: {str(e)}")
+            sys.exit(1)
+    else:
+        asyncio.run(test_document_scraper_single_url(args.url, use_mock=args.mock))
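With the argparse rewrite the script has three modes: `--run-all` for the original full test suite, `--clear-db` to wipe the document database, and a default single-URL mode driven by `--url` and `--mock`. Because of the `if`/`elif` ordering, `--run-all` takes precedence when flags are combined. The same paths can also be driven programmatically; a hypothetical snippet (the test module's file name is not shown in this view, so the import is illustrative):

```python
import asyncio

# Illustrative import: substitute the test module's actual file name.
from test_document_scraper import (
    clear_database,
    test_document_scraper_single_url,
)

# Scrape one page with mock data, so no live API calls are made.
asyncio.run(
    test_document_scraper_single_url('https://fastapi.tiangolo.com/', use_mock=True)
)

# Wipe the document database for a clean next run.
asyncio.run(clear_database())
```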