Update result collector, database manager, and document scraper test with improved error handling and performance optimizations
commit 2c7b086930
parent e748c345e2
@@ -40,53 +40,55 @@ class ResultCollector:
             search_results: Dictionary mapping search engine names to lists of results
             dedup: Whether to deduplicate results
             max_results: Maximum number of results to return
-            use_reranker: Whether to use the Jina Reranker for semantic ranking
+            use_reranker: Whether to use the reranker for semantic ranking
 
         Returns:
             List of processed search results
         """
-        # Combine results from all search engines
-        all_results = []
-
-        # Check if we have a flattened structure (single key with all results)
-        if len(search_results) == 1 and "combined" in search_results:
-            all_results = search_results["combined"]
-            print(f"Processing {len(all_results)} combined results")
-        else:
-            # Traditional structure with separate engines
-            for engine, results in search_results.items():
-                for result in results:
-                    # Add the source if not already present
-                    if "source" not in result:
-                        result["source"] = engine
-                    all_results.append(result)
-            print(f"Processing {len(all_results)} results from {len(search_results)} engines")
+        # Flatten results from all search engines
+        flattened_results = []
+        for engine, results in search_results.items():
+            for result in results:
+                # Add the source to each result
+                result['source'] = engine
+                flattened_results.append(result)
+
+        # Print a verification of the query in the flattened results
+        if flattened_results:
+            first_result = flattened_results[0]
+            query = first_result.get('query', '')
+            print(f"Verifying query in flattened results:")
+            print(f"Query in first result: {query[:50]}...")
 
-        # Deduplicate results if requested
         if dedup:
-            all_results = self._deduplicate_results(all_results)
-            print(f"Deduplicated to {len(all_results)} results")
-
-        # Use the reranker if available and requested
+            flattened_results = self._deduplicate_results(flattened_results)
+
+        print(f"Processing {len(flattened_results)} combined results")
+        if dedup:
+            print(f"Deduplicated to {len(flattened_results)} results")
+
+        # Apply reranking if requested and available
         if use_reranker and self.reranker is not None:
+            print("Using Jina Reranker for semantic ranking")
             try:
-                print("Using Jina Reranker for semantic ranking")
-                all_results = self._rerank_results(all_results)
-                print(f"Reranked {len(all_results)} results")
+                reranked_results = self._rerank_results(flattened_results)
+                print(f"Reranked {len(reranked_results)} results")
+                processed_results = reranked_results
             except Exception as e:
-                print(f"Error using reranker: {str(e)}")
-                # Fall back to basic scoring
-                all_results = self._score_and_sort_results(all_results)
+                print(f"Error during reranking: {str(e)}. Falling back to basic scoring.")
+                print("Using basic scoring")
+                processed_results = self._score_and_sort_results(flattened_results)
         else:
-            # Use basic scoring
             print("Using basic scoring")
-            all_results = self._score_and_sort_results(all_results)
+
+            processed_results = self._score_and_sort_results(flattened_results)
 
         # Limit the number of results if requested
         if max_results is not None:
-            all_results = all_results[:max_results]
-
-        return all_results
+            processed_results = processed_results[:max_results]
+
+        print(f"Processed {len(processed_results)} results {'with' if use_reranker and self.reranker is not None else 'without'} reranking")
+        return processed_results
 
     def _flatten_results(self, search_results: Dict[str, List[Dict[str, Any]]]) -> List[Dict[str, Any]]:
         """
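For context, a minimal sketch of how the updated collector appears to be driven, based only on what this hunk shows: a dict mapping engine names to result lists plus the dedup/max_results/use_reranker flags. The constructor arguments and the method name are not visible in the hunk, so both are assumptions here; treat the call as illustrative only.

```python
# Hypothetical usage sketch -- the constructor and the method name below are
# assumptions; only the class name and parameters appear in the hunk.
search_results = {
    "google": [{"title": "FastAPI", "url": "https://fastapi.tiangolo.com/", "query": "fastapi docs"}],
    "bing":   [{"title": "FastAPI Docs", "url": "https://fastapi.tiangolo.com/", "query": "fastapi docs"}],
}

collector = ResultCollector()          # assumed constructor
results = collector.process_results(   # assumed method name
    search_results,
    dedup=True,          # deduplicate via _deduplicate_results
    max_results=10,      # truncate after scoring/reranking
    use_reranker=True,   # falls back to _score_and_sort_results if reranking fails
)
```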
@@ -298,6 +298,15 @@ class DBManager:
             logger.error(f"Error deleting document: {str(e)}")
             return False
 
+    async def clear_database(self):
+        """Clear all data from the database."""
+        async with aiosqlite.connect(self.db_path) as db:
+            await db.execute('DELETE FROM metadata')
+            await db.execute('DELETE FROM documents')
+            await db.execute('DELETE FROM sqlite_sequence')
+            await db.commit()
+        logger.info("Database cleared")
+
     async def search_documents(self, query: str, limit: int = 10, offset: int = 0) -> List[Dict[str, Any]]:
         """
         Search for documents matching the query.
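The new clear_database coroutine is exercised by the test script in the next hunk via get_db_manager. A minimal standalone sketch of the same call, assuming the module path used in that test:

```python
import asyncio

from report.database.db_manager import get_db_manager  # import path as used in the test script


async def reset_db():
    # Get the shared DBManager instance and wipe documents, metadata,
    # and the sqlite_sequence counters, committed on one connection.
    db_manager = get_db_manager()
    await db_manager.clear_database()


if __name__ == "__main__":
    asyncio.run(reset_db())
```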
@@ -68,15 +68,75 @@ async def test_document_scraper():
 
     return True
 
+async def test_document_scraper_single_url(url, use_mock=False):
+    """
+    Test the document scraper with a single URL.
+
+    Args:
+        url: The URL to scrape
+        use_mock: If True, use mock data instead of making actual API calls
+    """
+    # Get document scraper
+    document_scraper = get_document_scraper(use_mock=use_mock)
+
+    logger.info(f"Testing document scraper with URL: {url}")
+    logger.info(f"Using mock data: {use_mock}")
+
+    # Scrape the URL
+    document = await document_scraper.scrape_url(url)
+
+    if document:
+        logger.info(f"Successfully scraped document: {document.get('title')}")
+        logger.info(f"URL: {document.get('url')}")
+        logger.info(f"Token count: {document.get('token_count')}")
+        content_preview = document.get('content', '')[:200] + '...' if document.get('content') else 'No content'
+        logger.info(f"Content snippet: {content_preview}")
+
+        # Print metadata
+        logger.info("\nMetadata:")
+        for key, value in document.get('metadata', {}).items():
+            logger.info(f"  {key}: {value}")
+    else:
+        logger.info(f"Failed to scrape document: {url}")
+
+async def clear_database():
+    """Clear the document database."""
+    from report.database.db_manager import get_db_manager
+
+    # Get database manager
+    db_manager = get_db_manager()
+
+    # Clear the database
+    await db_manager.clear_database()
+    logger.info("Database cleared")
+
 if __name__ == "__main__":
-    try:
-        success = asyncio.run(test_document_scraper())
-        if success:
-            logger.info("All tests passed!")
-            sys.exit(0)
-        else:
-            logger.error("Tests failed!")
+    import argparse
+    parser = argparse.ArgumentParser(description='Test the document scraper')
+    parser.add_argument('--url', type=str, default='https://fastapi.tiangolo.com/', help='URL to scrape')
+    parser.add_argument('--mock', action='store_true', help='Use mock data instead of making actual API calls')
+    parser.add_argument('--run-all', action='store_true', help='Run all tests')
+    parser.add_argument('--clear-db', action='store_true', help='Clear the database')
+    args = parser.parse_args()
+
+    if args.run_all:
+        try:
+            success = asyncio.run(test_document_scraper())
+            if success:
+                logger.info("All tests passed!")
+                sys.exit(0)
+            else:
+                logger.error("Tests failed!")
+                sys.exit(1)
+        except Exception as e:
+            logger.exception(f"Error running tests: {str(e)}")
+            sys.exit(1)
-    except Exception as e:
-        logger.exception(f"Error running tests: {str(e)}")
-        sys.exit(1)
+    elif args.clear_db:
+        try:
+            asyncio.run(clear_database())
+            sys.exit(0)
+        except Exception as e:
+            logger.exception(f"Error clearing database: {str(e)}")
+            sys.exit(1)
+    else:
+        asyncio.run(test_document_scraper_single_url(args.url, use_mock=args.mock))
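The new single-URL entry point can also be driven directly from Python, mirroring what the default `--url`/`--mock` branch of the CLI above does; a small illustrative sketch, assuming the test module's imports are already available:

```python
import asyncio

# Direct call mirroring the script's --url/--mock branch;
# test_document_scraper_single_url is the coroutine added in this commit.
asyncio.run(
    test_document_scraper_single_url("https://fastapi.tiangolo.com/", use_mock=True)
)
```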