# ira/test_reranker.py

"""
Test script for the Jina Reranker integration.
This script tests the reranker functionality by comparing results with and without reranking.
"""

import json
import time
from pathlib import Path

from query.query_processor import QueryProcessor
from execution.search_executor import SearchExecutor
from execution.result_collector import ResultCollector
from ranking.jina_reranker import get_jina_reranker


def _flatten_results(search_results, query):
    """Merge per-engine result lists into one list tagged for reranking.

    Args:
        search_results: Mapping of engine name to a list of result dicts.
        query: Query text stored on every result under the ``"query"`` key.

    Returns:
        One list holding every result in engine iteration order. Each result
        dict is tagged in place with ``"query"`` and ``"engine"`` keys.
    """
    flattened = []
    for engine, results in search_results.items():
        for result in results:
            result["query"] = query
            result["engine"] = engine
            flattened.append(result)
    return flattened


def _save_results(results, path):
    """Write *results* to *path* as indented JSON.

    ``default=str`` matches how the example results are printed by
    ``test_reranker``: a value that is not natively JSON-serializable is
    stringified instead of aborting the whole run with a ``TypeError``.
    """
    with open(path, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2, default=str)


def _print_top_results(heading, results, limit=5):
    """Print *heading* followed by the first *limit* results, numbered from 1."""
    print(heading)
    for rank, result in enumerate(results[:limit], start=1):
        print(f"{rank}. {result.get('title')} (Score: {result.get('relevance_score')}, Source: {result.get('source')})")


def test_reranker():
    """Compare result processing with and without the Jina reranker.

    Runs one fixed query through the query processor and search executor,
    flattens the per-engine results, and processes them with basic scoring.
    When the reranker is available the same results are processed again with
    reranking, and the top five of each ordering are printed side by side.

    Side effects:
        Prints progress to stdout and writes ``basic_results_<ts>.json`` (and
        ``reranked_results_<ts>.json`` when the reranker is available) into a
        ``results`` directory next to this file.
    """
    # Initialize components
    query_processor = QueryProcessor()
    search_executor = SearchExecutor()
    result_collector = ResultCollector()

    # Only availability matters here; the instance itself is never used in
    # this function. A ValueError is treated as "reranker not available".
    try:
        get_jina_reranker()
    except ValueError:
        reranker_available = False
        print("Jina Reranker is not available. Will only test basic scoring.")
    else:
        reranker_available = True
        print("Jina Reranker is available.")

    # Process a test query
    query = "What are the latest advancements in quantum computing?"
    print(f"Processing query: {query}")

    processed_query = query_processor.process_query(query)
    print(f"Processed query: {processed_query}")

    # Fall back to every available engine when the processor picked none
    available_engines = search_executor.get_available_search_engines()
    print(f"Available search engines: {available_engines}")

    if 'search_engines' not in processed_query:
        processed_query['search_engines'] = available_engines

    # Execute the search
    search_results = search_executor.execute_search(
        structured_query=processed_query,
        num_results=10
    )

    # Print which engines returned results
    for engine, results in search_results.items():
        print(f"Engine {engine} returned {len(results)} results")

    # Prefer the enhanced query, then the original query, then the raw text
    enhanced_query = processed_query.get("enhanced_query", processed_query.get("original_query", query))
    print(f"Enhanced query for reranking: {enhanced_query}")

    # Print the structure of the first result from each engine
    print("\nResult structure examples:")
    for engine, results in search_results.items():
        if results:
            print(f"\n{engine} result example:")
            print(json.dumps(results[0], indent=2, default=str))

    # Flatten results for easier manipulation
    flattened_results = _flatten_results(search_results, enhanced_query)

    # Verify that the query is in the flattened results
    if flattened_results:
        print("\nVerifying query in flattened results:")
        print(f"Query in first result: {flattened_results[0].get('query', 'NOT FOUND')[:50]}...")

    # Process results without reranking.
    # NOTE(review): each pass gets its own shallow copies so that, if
    # process_results writes scores onto the dicts it is given, the two
    # passes cannot overwrite each other's values before the comparison is
    # printed. Confirm against ResultCollector whether it mutates its input.
    print("\nProcessing results without reranking...")
    basic_results = result_collector.process_results(
        {"combined": [dict(result) for result in flattened_results]},
        dedup=True, max_results=None, use_reranker=False
    )
    print(f"Processed {len(basic_results)} results with basic scoring")

    # Save basic results
    results_dir = Path(__file__).parent / "results"
    results_dir.mkdir(exist_ok=True)

    # One timestamp is shared by both files so a run's outputs pair up
    timestamp = int(time.time())
    basic_file = results_dir / f"basic_results_{timestamp}.json"

    _save_results(basic_results, basic_file)
    print(f"Basic results saved to {basic_file}")

    # Process results with reranking (if available)
    if reranker_available:
        print("\nProcessing results with reranking...")
        reranked_results = result_collector.process_results(
            {"combined": [dict(result) for result in flattened_results]},
            dedup=True, max_results=None, use_reranker=True
        )
        print(f"Processed {len(reranked_results)} results with reranking")

        # Save reranked results
        reranked_file = results_dir / f"reranked_results_{timestamp}.json"

        _save_results(reranked_results, reranked_file)
        print(f"Reranked results saved to {reranked_file}")

        # Compare top 5 results
        print("\nComparing top 5 results:")
        _print_top_results("\nTop 5 results with basic scoring:", basic_results)
        _print_top_results("\nTop 5 results with reranking:", reranked_results)


# Run the comparison only when this file is executed directly as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    test_reranker()