"""
Test script for the Jina Reranker integration.

This script tests the reranker functionality by comparing results with and without reranking.
"""
|
|
|
|
import json
|
|
import time
|
|
from pathlib import Path
|
|
|
|
from query.query_processor import QueryProcessor
|
|
from execution.search_executor import SearchExecutor
|
|
from execution.result_collector import ResultCollector
|
|
from ranking.jina_reranker import get_jina_reranker
|
|
|
|
|
|
def test_reranker():
    """Run one search end to end and compare basic scoring with reranking.

    Steps, in order:

    1. Build the query processor, search executor and result collector.
    2. Probe for the Jina reranker; a ``ValueError`` from
       ``get_jina_reranker()`` is treated as "reranker not available".
    3. Process a fixed sample query and execute the search, defaulting
       ``search_engines`` to every available engine when the processed
       query does not name any.
    4. Tag every result with the query text and its engine, then run the
       result collector without reranking and save the output as JSON in
       a ``results`` directory next to this file.
    5. If the reranker is available, repeat with reranking enabled, save
       that output too, and print the top five of each list side by side.

    Returns:
        None. Progress is printed to stdout and results are written to
        timestamped JSON files.
    """
    # Initialize components
    query_processor = QueryProcessor()
    search_executor = SearchExecutor()
    result_collector = ResultCollector()

    # Check if reranker is available. Only the outcome of the call matters
    # here, so the returned object is deliberately not kept.
    try:
        get_jina_reranker()
    except ValueError:
        reranker_available = False
        print("Jina Reranker is not available. Will only test basic scoring.")
    else:
        reranker_available = True
        print("Jina Reranker is available.")

    # Process a test query
    query = "What are the latest advancements in quantum computing?"
    print(f"Processing query: {query}")

    processed_query = query_processor.process_query(query)
    print(f"Processed query: {processed_query}")

    # Fall back to every available engine when the processor chose none
    available_engines = search_executor.get_available_search_engines()
    print(f"Available search engines: {available_engines}")

    if 'search_engines' not in processed_query:
        processed_query['search_engines'] = available_engines

    # Execute the search
    search_results = search_executor.execute_search(
        structured_query=processed_query,
        num_results=10,
    )

    # Print which engines returned results
    for engine, results in search_results.items():
        print(f"Engine {engine} returned {len(results)} results")

    # Query text attached to each result: prefer the enhanced query, then
    # the processor's original query, then the raw input string.
    enhanced_query = processed_query.get(
        "enhanced_query", processed_query.get("original_query", query)
    )
    print(f"Enhanced query for reranking: {enhanced_query}")

    # Print the structure of the first result from each engine
    print("\nResult structure examples:")
    for engine, results in search_results.items():
        if results:
            print(f"\n{engine} result example:")
            print(json.dumps(results[0], indent=2, default=str))

    # Flatten results for easier manipulation. Results are tagged in place,
    # so the dicts inside search_results gain the same two keys.
    flattened_results = []
    for engine, results in search_results.items():
        for result in results:
            result["query"] = enhanced_query
            result["engine"] = engine
            flattened_results.append(result)

    # Verify that the query is in the flattened results
    if flattened_results:
        print("\nVerifying query in flattened results:")
        print(f"Query in first result: {flattened_results[0].get('query', 'NOT FOUND')[:50]}...")

    # Process results without reranking
    print("\nProcessing results without reranking...")
    basic_results = result_collector.process_results(
        {"combined": flattened_results}, dedup=True, max_results=None, use_reranker=False
    )
    print(f"Processed {len(basic_results)} results with basic scoring")

    # Save basic results
    results_dir = Path(__file__).parent / "results"
    results_dir.mkdir(exist_ok=True)

    # One timestamp for both files so a basic/reranked pair is easy to match
    timestamp = int(time.time())
    basic_file = results_dir / f"basic_results_{timestamp}.json"
    _save_results_json(basic_results, basic_file)
    print(f"Basic results saved to {basic_file}")

    if not reranker_available:
        return

    # Process results with reranking
    print("\nProcessing results with reranking...")
    reranked_results = result_collector.process_results(
        {"combined": flattened_results}, dedup=True, max_results=None, use_reranker=True
    )
    print(f"Processed {len(reranked_results)} results with reranking")

    # Save reranked results
    reranked_file = results_dir / f"reranked_results_{timestamp}.json"
    _save_results_json(reranked_results, reranked_file)
    print(f"Reranked results saved to {reranked_file}")

    # Compare top 5 results
    print("\nComparing top 5 results:")
    _print_top_results("\nTop 5 results with basic scoring:", basic_results)
    _print_top_results("\nTop 5 results with reranking:", reranked_results)


def _save_results_json(results, destination):
    """Write ``results`` to ``destination`` as indented JSON."""
    # Explicit UTF-8 keeps the output independent of the platform locale.
    with open(destination, "w", encoding="utf-8") as handle:
        # default=str mirrors the json.dumps preview in test_reranker.
        # NOTE(review): presumably processed results can carry the same
        # non-JSON values as raw results - confirm against ResultCollector.
        json.dump(results, handle, indent=2, default=str)


def _print_top_results(heading, results, limit=5):
    """Print ``heading`` followed by the first ``limit`` results, numbered from 1."""
    print(heading)
    for rank, result in enumerate(results[:limit], start=1):
        print(f"{rank}. {result.get('title')} (Score: {result.get('relevance_score')}, Source: {result.get('source')})")
|
|
|
|
|
|
# Script entry point: run the reranker comparison only when this file is
# executed directly, not when it is imported as a module.
if __name__ == "__main__":
    test_reranker()
|