# ira/test_reranker.py
#
# 127 lines
# 4.7 KiB
# Python

"""
Test script for the Jina Reranker integration.
This script tests the reranker functionality by comparing results with and without reranking.
"""
import json
import time
from pathlib import Path
from query.query_processor import QueryProcessor
from execution.search_executor import SearchExecutor
from execution.result_collector import ResultCollector
from ranking.jina_reranker import get_jina_reranker
def _flatten_results(search_results, query):
    """Merge the per-engine result lists into one list, tagging each result.

    Every result dict is annotated *in place* with two extra keys:
    ``"query"`` (the query string passed in) and ``"engine"`` (the key of
    ``search_results`` it came from). The same dict objects are returned,
    in engine order and then in per-engine result order.

    Args:
        search_results: Mapping of engine name to a list of result dicts.
        query: Query string to attach to every result.

    Returns:
        A single list containing every annotated result dict.
    """
    flattened = []
    for engine, results in search_results.items():
        for result in results:
            result["query"] = query
            result["engine"] = engine
            flattened.append(result)
    return flattened


def _save_results(results, path):
    """Write ``results`` to ``path`` as indented JSON.

    Args:
        results: JSON-serialisable object (here: the processed result list).
        path: Destination file; it is created or overwritten.
    """
    with open(path, "w", encoding="utf-8") as f:
        # default=str matches the example dump printed earlier in the script,
        # so a value json cannot encode is stringified instead of aborting
        # the run after the search has already been executed.
        json.dump(results, f, indent=2, default=str)


def _print_top_results(heading, results, limit=5):
    """Print ``heading`` followed by the first ``limit`` results, numbered from 1.

    Args:
        heading: Line printed before the listing.
        results: List of result dicts; ``title``, ``relevance_score`` and
            ``source`` are read with ``dict.get`` so missing keys print as None.
        limit: Maximum number of results to list.
    """
    print(heading)
    for rank, result in enumerate(results[:limit], start=1):
        print(f"{rank}. {result.get('title')} (Score: {result.get('relevance_score')}, Source: {result.get('source')})")


def test_reranker():
    """Test the reranker functionality.

    Runs one hard-coded query through query processing and search, then has
    the result collector process the combined results without reranking and,
    if the Jina reranker is available, with reranking. Each processed list is
    written as JSON to a ``results`` directory next to this file, and the top
    five entries of both lists are printed for comparison.

    Nothing is asserted; the outcome is inspected via the printed output and
    the saved files.
    """
    # Initialize components
    query_processor = QueryProcessor()
    search_executor = SearchExecutor()
    result_collector = ResultCollector()

    # Check if reranker is available. Only the outcome of the call matters
    # here: a ValueError is treated as "reranker not available".
    try:
        get_jina_reranker()
    except ValueError:
        reranker_available = False
        print("Jina Reranker is not available. Will only test basic scoring.")
    else:
        reranker_available = True
        print("Jina Reranker is available.")

    # Process a test query
    query = "What are the latest advancements in quantum computing?"
    print(f"Processing query: {query}")
    processed_query = query_processor.process_query(query)
    print(f"Processed query: {processed_query}")

    # Fall back to every available engine when the processed query names none
    available_engines = search_executor.get_available_search_engines()
    print(f"Available search engines: {available_engines}")
    if 'search_engines' not in processed_query:
        processed_query['search_engines'] = available_engines

    # Execute the search
    search_results = search_executor.execute_search(
        structured_query=processed_query,
        num_results=10
    )

    # Print which engines returned results
    for engine, results in search_results.items():
        print(f"Engine {engine} returned {len(results)} results")

    # Query attached to each result: prefer the enhanced query, then the
    # original query recorded by the processor, then the raw query string.
    enhanced_query = processed_query.get("enhanced_query", processed_query.get("original_query", query))
    print(f"Enhanced query for reranking: {enhanced_query}")

    # Print the structure of the first result from each engine
    print("\nResult structure examples:")
    for engine, results in search_results.items():
        if results:
            print(f"\n{engine} result example:")
            print(json.dumps(results[0], indent=2, default=str))

    # Flatten results for easier manipulation
    flattened_results = _flatten_results(search_results, enhanced_query)

    # Verify that the query is in the flattened results
    if flattened_results:
        print("\nVerifying query in flattened results:")
        print(f"Query in first result: {flattened_results[0].get('query', 'NOT FOUND')[:50]}...")

    # Process results without reranking
    print("\nProcessing results without reranking...")
    basic_results = result_collector.process_results(
        {"combined": flattened_results}, dedup=True, max_results=None, use_reranker=False
    )
    print(f"Processed {len(basic_results)} results with basic scoring")

    # Save basic results; the shared timestamp pairs the basic and reranked
    # files written by the same run.
    results_dir = Path(__file__).parent / "results"
    results_dir.mkdir(exist_ok=True)
    timestamp = int(time.time())
    basic_file = results_dir / f"basic_results_{timestamp}.json"
    _save_results(basic_results, basic_file)
    print(f"Basic results saved to {basic_file}")

    # Everything below needs reranked results, which only exist when the
    # reranker is available.
    if not reranker_available:
        return

    # Process results with reranking
    print("\nProcessing results with reranking...")
    reranked_results = result_collector.process_results(
        {"combined": flattened_results}, dedup=True, max_results=None, use_reranker=True
    )
    print(f"Processed {len(reranked_results)} results with reranking")

    # Save reranked results
    reranked_file = results_dir / f"reranked_results_{timestamp}.json"
    _save_results(reranked_results, reranked_file)
    print(f"Reranked results saved to {reranked_file}")

    # Compare top 5 results
    print("\nComparing top 5 results:")
    _print_top_results("\nTop 5 results with basic scoring:", basic_results)
    _print_top_results("\nTop 5 results with reranking:", reranked_results)
# Run the comparison only when this file is executed as a script, not when
# it is imported.
if __name__ == "__main__":
    test_reranker()