Fix Jina Reranker API integration with proper request and response handling

2025-02-27 17:16:52 -06:00 · 2025-02-27 17:16:52 -06:00 · a34b92c103
parent 16c7dca2c7
commit a34b92c103
3 changed files with 218 additions and 129 deletions
--- a/ranking/jina_reranker.py
+++ b/ranking/jina_reranker.py
@ -78,14 +78,16 @@ class JinaReranker:
            "Accept": "application/json"
        }
        # The correct format is an array of plain strings, not objects with a "text" field
        data = {
            "model": self.model,
            "query": query,
-            "documents": documents,
+            "documents": documents,  # Plain array of strings
            "top_n": top_n
        }
        print(f"Making reranker API call with query: {query[:50]}... and {len(documents)} documents")
        print(f"Request payload structure: model, query, documents (array of {len(documents)} strings), top_n={top_n}")
        try:
            response = requests.post(self.endpoint, headers=headers, json=data)
@ -98,12 +100,30 @@ class JinaReranker:
            response.raise_for_status()  # Raise exception for HTTP errors
            result = response.json()
            print(f"Reranker API response structure: {list(result.keys())}")
            # Process and return the reranked results
            reranked_results = []
-            # Check for the specific response structure we observed
+            # Check for the specific response structure from the API
-            if "data" in result and isinstance(result["data"], list):
+            if "results" in result and isinstance(result["results"], list):
                results_list = result["results"]
                for item in results_list:
                    if isinstance(item, dict) and "index" in item and "relevance_score" in item:
                        reranked_results.append({
                            'index': item.get('index'),
                            'score': item.get('relevance_score'),
                            'document': documents[item.get('index')] if item.get('index') < len(documents) else None
                        })
                    # Handle newer Jina API format with document.text
                    elif isinstance(item, dict) and "index" in item and "document" in item and "relevance_score" in item:
                        reranked_results.append({
                            'index': item.get('index'),
                            'score': item.get('relevance_score'),
                            'document': documents[item.get('index')] if item.get('index') < len(documents) else None
                        })
            # Fallback for older response structures with "data" field
            elif "data" in result and isinstance(result["data"], list):
                data_list = result["data"]
                for item in data_list:
                    if isinstance(item, dict) and "index" in item and "relevance_score" in item:
@ -112,26 +132,6 @@ class JinaReranker:
                            'score': item.get('relevance_score'),
                            'document': documents[item.get('index')] if item.get('index') < len(documents) else None
                        })
            # Check other possible response structures
            elif "results" in result:
                results_list = result["results"]
                for item in results_list:
                    if isinstance(item, dict) and "index" in item and "score" in item:
                        reranked_results.append({
                            'index': item.get('index'),
                            'score': item.get('score'),
                            'document': documents[item.get('index')] if item.get('index') < len(documents) else None
                        })
            elif "documents" in result:
                # Alternative API response structure
                docs_list = result["documents"]
                for i, doc in enumerate(docs_list):
                    if isinstance(doc, dict) and "score" in doc:
                        reranked_results.append({
                            'index': i,
                            'score': doc.get('score'),
                            'document': documents[i]
                        })
            print(f"Processed reranker results: {len(reranked_results)} items")
            return reranked_results
--- a/test_reranker.py
+++ b/test_reranker.py
@ -5,122 +5,59 @@ This script tests the reranker functionality by comparing results with and witho
 import json
 import time
 import os
 from pathlib import Path
 from typing import Dict, List, Any, Optional
-from query.query_processor import QueryProcessor
+# Import just what we need for the simple test
-from execution.search_executor import SearchExecutor
+from ranking.jina_reranker import JinaReranker, get_jina_reranker
 from execution.result_collector import ResultCollector
 from ranking.jina_reranker import get_jina_reranker
-
+def test_simple_reranker():
-def test_reranker():
+    """Test the Jina Reranker with a simple query and documents"""
-    """Test the reranker functionality."""
+    # Initialize the reranker directly without parameters (it will read from config)
    # Initialize components
    query_processor = QueryProcessor()
    search_executor = SearchExecutor()
    result_collector = ResultCollector()
    # Check if reranker is available
    try:
        reranker = get_jina_reranker()
-        reranker_available = True
+        print("Successfully initialized Jina Reranker")
-        print("Jina Reranker is available.")
+    except Exception as e:
-    except ValueError:
+        print(f"Error initializing Jina Reranker: {str(e)}")
-        reranker_available = False
+        return
        print("Jina Reranker is not available. Will only test basic scoring.")
-    # Process a test query
+    # Simple query and documents
-    query = "What are the latest advancements in quantum computing?"
+    query = "What is quantum computing?"
-    print(f"Processing query: {query}")
+    documents = [
        "Quantum computing is a type of computation that harnesses quantum mechanics.",
        "Classical computers use bits, while quantum computers use qubits.",
        "Machine learning is a subset of artificial intelligence.",
        "Quantum computers can solve certain problems faster than classical computers."
    ]
-    processed_query = query_processor.process_query(query)
+    print(f"Testing reranker with query: {query}")
-    print(f"Processed query: {processed_query}")
+    print(f"Documents: {documents}")
-    # Execute the search
+    # Rerank the documents
-    available_engines = search_executor.get_available_search_engines()
+    try:
-    print(f"Available search engines: {available_engines}")
+        reranked = reranker.rerank(query, documents)
-    
+        print(f"Reranked results: {json.dumps(reranked, indent=2)}")
    if 'search_engines' not in processed_query:
        processed_query['search_engines'] = available_engines
    # Execute the search
    search_results = search_executor.execute_search(
        structured_query=processed_query,
        num_results=10
    )
    # Print which engines returned results
    for engine, results in search_results.items():
        print(f"Engine {engine} returned {len(results)} results")
    # Add the query to each result for reranking
    enhanced_query = processed_query.get("enhanced_query", processed_query.get("original_query", query))
    print(f"Enhanced query for reranking: {enhanced_query}")
    # Print the structure of the first result from each engine
    print("\nResult structure examples:")
    for engine, results in search_results.items():
        if results:
            print(f"\n{engine} result example:")
            print(json.dumps(results[0], indent=2, default=str))
    # Flatten results for easier manipulation
    flattened_results = []
    for engine, results in search_results.items():
        for result in results:
            # Add the query and engine to each result
            result["query"] = enhanced_query
            result["engine"] = engine
            flattened_results.append(result)
    # Verify that the query is in the flattened results
    if flattened_results:
        print(f"\nVerifying query in flattened results:")
        print(f"Query in first result: {flattened_results[0].get('query', 'NOT FOUND')[:50]}...")
    # Process results without reranking
    print("\nProcessing results without reranking...")
    basic_results = result_collector.process_results(
        {"combined": flattened_results}, dedup=True, max_results=None, use_reranker=False
    )
    print(f"Processed {len(basic_results)} results with basic scoring")
    # Save basic results
    results_dir = Path(__file__).parent / "results"
    results_dir.mkdir(exist_ok=True)
    timestamp = int(time.time())
    basic_file = results_dir / f"basic_results_{timestamp}.json"
    with open(basic_file, "w") as f:
        json.dump(basic_results, f, indent=2)
    print(f"Basic results saved to {basic_file}")
    # Process results with reranking (if available)
    if reranker_available:
        print("\nProcessing results with reranking...")
        reranked_results = result_collector.process_results(
            {"combined": flattened_results}, dedup=True, max_results=None, use_reranker=True
        )
        print(f"Processed {len(reranked_results)} results with reranking")
-        # Save reranked results
+        # Save the results to a file for analysis
-        reranked_file = results_dir / f"reranked_results_{timestamp}.json"
+        results_dir = Path("results")
        results_dir.mkdir(exist_ok=True)
        results_file = results_dir / f"reranked_results_{int(time.time())}.json"
-        with open(reranked_file, "w") as f:
+        with open(results_file, "w") as f:
-            json.dump(reranked_results, f, indent=2)
+            json.dump(reranked, f, indent=2)
        print(f"Reranked results saved to {reranked_file}")
-        # Compare top 5 results
+        print(f"Results saved to {results_file}")
-        print("\nComparing top 5 results:")
+        return True
-        print("\nTop 5 results with basic scoring:")
+    except Exception as e:
-        for i, result in enumerate(basic_results[:5]):
+        print(f"Error reranking: {str(e)}")
-            print(f"{i+1}. {result.get('title')} (Score: {result.get('relevance_score')}, Source: {result.get('source')})")
+        return False
        print("\nTop 5 results with reranking:")
        for i, result in enumerate(reranked_results[:5]):
            print(f"{i+1}. {result.get('title')} (Score: {result.get('relevance_score')}, Source: {result.get('source')})")
 if __name__ == "__main__":
-    test_reranker()
+    # Just run the simple test
    success = test_simple_reranker()
    if success:
        print("Jina Reranker test completed successfully!")
    else:
        print("Jina Reranker test failed.")
--- a/test_simple_reranker.py
+++ b/test_simple_reranker.py
@ -0,0 +1,152 @@
 import json
 import sys
 import os
 import yaml
 from pathlib import Path
 # Add the project root to the path
 sys.path.append(os.path.dirname(os.path.abspath(__file__)))
 # Let's create a custom JinaReranker class specifically for testing
 class TestJinaReranker:
    """Custom JinaReranker for testing with explicit initialization parameters.

    Posts a query and a list of plain-string documents to a rerank HTTP
    endpoint and normalises the response into a list of dicts with the
    keys ``'index'``, ``'score'`` and ``'document'``.
    """

    # The class name starts with "Test", so pytest would otherwise try to
    # collect it as a test class (and warn about __init__); opt out.
    __test__ = False

    def __init__(self, api_key, model, endpoint, timeout=30):
        """Initialize with explicit parameters.

        Args:
            api_key: Token sent as ``Bearer`` in the Authorization header.
            model: Model name placed in the request payload.
            endpoint: URL the rerank request is POSTed to.
            timeout: Seconds to wait for the HTTP call; without a timeout
                ``requests`` may block indefinitely.
        """
        self.api_key = api_key
        self.model = model
        self.endpoint = endpoint
        self.timeout = timeout
        self.default_top_n = 10

    @staticmethod
    def _parse_results(result, documents):
        """Turn a decoded API response into the reranked-result list.

        Args:
            result: Decoded JSON body of the rerank response.
            documents: The documents that were sent, used to look up the
                text for each returned index.

        Returns:
            One ``{'index', 'score', 'document'}`` dict per response item
            that carries both ``index`` and ``relevance_score``. The
            ``document`` value is ``None`` when the index does not address
            an entry of ``documents``.
        """
        if not isinstance(result, dict):
            return []
        # "results" is checked first; "data" is the fallback for older
        # response structures. Both share the same item layout.
        items = next(
            (result[key] for key in ("results", "data")
             if isinstance(result.get(key), list)),
            [],
        )
        parsed = []
        for item in items:
            if not isinstance(item, dict):
                continue
            if "index" not in item or "relevance_score" not in item:
                continue
            idx = item["index"]
            # Reject negative indices explicitly: Python would otherwise
            # wrap them around and return an unrelated document.
            in_range = isinstance(idx, int) and 0 <= idx < len(documents)
            parsed.append({
                'index': idx,
                'score': item["relevance_score"],
                'document': documents[idx] if in_range else None
            })
        return parsed

    def rerank(self, query, documents, top_n=None):
        """
        Rerank documents based on their relevance to the query.

        Args:
            query: Query string sent to the API.
            documents: List of plain strings to rank.
            top_n: Maximum number of results to request; defaults to
                ``self.default_top_n``. Always capped at ``len(documents)``.

        Returns:
            The list built by ``_parse_results``, or an empty list when
            ``documents`` is empty, the API answers with a non-200 status,
            or any error occurs during the call.
        """
        if not documents:
            return []
        limit = self.default_top_n if top_n is None else top_n
        top_n = min(limit, len(documents))
        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {self.api_key}",
            "Accept": "application/json"
        }
        data = {
            "model": self.model,
            "query": query,
            "documents": documents,  # Plain array of strings
            "top_n": top_n
        }
        print(f"Making reranker API call with query: {query}")
        print(f"Request payload structure: model, query, documents (array of {len(documents)} strings), top_n={top_n}")
        # Imported lazily so the class can be defined without requests.
        import requests
        try:
            response = requests.post(
                self.endpoint, headers=headers, json=data, timeout=self.timeout
            )
            print(f"Reranker API response status: {response.status_code}")
            if response.status_code != 200:
                print(f"Reranker API error: {response.text}")
                return []
            result = response.json()
            print(f"Reranker API response structure: {list(result.keys())}")
            print(f"Full response: {json.dumps(result, indent=2)}")
            reranked_results = self._parse_results(result, documents)
            print(f"Processed reranker results: {len(reranked_results)} items")
            return reranked_results
        except Exception as e:
            # Best-effort by design: this script reports the failure and
            # carries on with an empty result instead of crashing.
            print(f"Error calling reranker API: {str(e)}")
            return []
 def load_config():
    """Load configuration from YAML file.

    The file is looked up at ``config/config.yaml`` relative to the
    directory containing this script.

    Returns:
        The parsed YAML content, or an empty dict when the file does not
        exist or contains no document.
    """
    config_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "config", "config.yaml")
    print(f"Loading config from {config_path}")
    if not os.path.exists(config_path):
        print(f"Config file not found at {config_path}")
        return {}
    with open(config_path, "r", encoding="utf-8") as f:
        config = yaml.safe_load(f)
    if config is None:
        # safe_load yields None for an empty file; return {} so callers get
        # the same type as in the missing-file case.
        config = {}
    print("Configuration loaded successfully")
    return config
 def test_simple_reranker():
    """Send one fixed query with four sample documents to the reranker.

    Reads the API key from the ``JINA_API_KEY`` environment variable and
    returns early, after printing a notice, when it is missing or empty.
    The reranked list is printed as indented JSON; nothing is returned.
    """
    api_key = os.environ.get("JINA_API_KEY", "")
    if not api_key:
        print("JINA_API_KEY not found in environment variables")
        return
    print("Found JINA_API_KEY in environment variables")

    # Build the client with explicit settings instead of project config.
    client = TestJinaReranker(
        api_key=api_key,
        model="jina-reranker-v2-base-multilingual",
        endpoint="https://api.jina.ai/v1/rerank",
    )

    # Fixed sample input: three quantum-computing sentences plus one
    # unrelated sentence about machine learning.
    sample_query = "What is quantum computing?"
    sample_docs = [
        "Quantum computing is a type of computation that harnesses quantum mechanics.",
        "Classical computers use bits, while quantum computers use qubits.",
        "Machine learning is a subset of artificial intelligence.",
        "Quantum computers can solve certain problems faster than classical computers.",
    ]
    print(f"Testing simple reranker with query: {sample_query}")
    print(f"Documents: {sample_docs}")

    ranked = client.rerank(sample_query, sample_docs)
    print(f"Reranked results: {json.dumps(ranked, indent=2)}")
 if __name__ == "__main__":
    # Script entry point: when executed directly (not imported), run the
    # single reranker check defined above.
    test_simple_reranker()