Fix Jina Reranker API integration with proper request and response handling

2025-02-27 17:16:52 -06:00 · 2025-02-27 17:16:52 -06:00 · a34b92c103
parent 16c7dca2c7
commit a34b92c103
3 changed files with 218 additions and 129 deletions
--- a/ranking/jina_reranker.py
+++ b/ranking/jina_reranker.py
@ -78,14 +78,16 @@ class JinaReranker:
            "Accept": "application/json"
        }
        
+        # The correct format is an array of plain strings, not objects with a "text" field
        data = {
            "model": self.model,
            "query": query,
-            "documents": documents,
+            "documents": documents,  # Plain array of strings
            "top_n": top_n
        }
        
        print(f"Making reranker API call with query: {query[:50]}... and {len(documents)} documents")
+        print(f"Request payload structure: model, query, documents (array of {len(documents)} strings), top_n={top_n}")
        
        try:
            response = requests.post(self.endpoint, headers=headers, json=data)
@ -98,12 +100,30 @@ class JinaReranker:
            response.raise_for_status()  # Raise exception for HTTP errors
            
            result = response.json()
+            print(f"Reranker API response structure: {list(result.keys())}")
            
            # Process and return the reranked results
            reranked_results = []
            
-            # Check for the specific response structure we observed
-            if "data" in result and isinstance(result["data"], list):
+            # Check for the specific response structure from the API
+            if "results" in result and isinstance(result["results"], list):
+                results_list = result["results"]
+                for item in results_list:
+                    if isinstance(item, dict) and "index" in item and "relevance_score" in item:
+                        reranked_results.append({
+                            'index': item.get('index'),
+                            'score': item.get('relevance_score'),
+                            'document': documents[item.get('index')] if item.get('index') < len(documents) else None
+                        })
+                    # Handle newer Jina API format with document.text
+                    elif isinstance(item, dict) and "index" in item and "document" in item and "relevance_score" in item:
+                        reranked_results.append({
+                            'index': item.get('index'),
+                            'score': item.get('relevance_score'),
+                            'document': documents[item.get('index')] if item.get('index') < len(documents) else None
+                        })
+            # Fallback for older response structures with "data" field
+            elif "data" in result and isinstance(result["data"], list):
                data_list = result["data"]
                for item in data_list:
                    if isinstance(item, dict) and "index" in item and "relevance_score" in item:
@ -112,26 +132,6 @@ class JinaReranker:
                            'score': item.get('relevance_score'),
                            'document': documents[item.get('index')] if item.get('index') < len(documents) else None
                        })
-            # Check other possible response structures
-            elif "results" in result:
-                results_list = result["results"]
-                for item in results_list:
-                    if isinstance(item, dict) and "index" in item and "score" in item:
-                        reranked_results.append({
-                            'index': item.get('index'),
-                            'score': item.get('score'),
-                            'document': documents[item.get('index')] if item.get('index') < len(documents) else None
-                        })
-            elif "documents" in result:
-                # Alternative API response structure
-                docs_list = result["documents"]
-                for i, doc in enumerate(docs_list):
-                    if isinstance(doc, dict) and "score" in doc:
-                        reranked_results.append({
-                            'index': i,
-                            'score': doc.get('score'),
-                            'document': documents[i]
-                        })
            
            print(f"Processed reranker results: {len(reranked_results)} items")
            return reranked_results
--- a/test_reranker.py
+++ b/test_reranker.py
@ -5,122 +5,59 @@ This script tests the reranker functionality by comparing results with and witho

 import json
 import time
+import os
 from pathlib import Path
+from typing import Dict, List, Any, Optional

-from query.query_processor import QueryProcessor
-from execution.search_executor import SearchExecutor
-from execution.result_collector import ResultCollector
-from ranking.jina_reranker import get_jina_reranker
+# Import just what we need for the simple test
+from ranking.jina_reranker import JinaReranker, get_jina_reranker

-
-def test_reranker():
-    """Test the reranker functionality."""
-    # Initialize components
-    query_processor = QueryProcessor()
-    search_executor = SearchExecutor()
-    result_collector = ResultCollector()
-    
-    # Check if reranker is available
+def test_simple_reranker():
+    """Test the Jina Reranker with a simple query and documents"""
+    # Initialize the reranker directly without parameters (it will read from config)
    try:
        reranker = get_jina_reranker()
-        reranker_available = True
-        print("Jina Reranker is available.")
-    except ValueError:
-        reranker_available = False
-        print("Jina Reranker is not available. Will only test basic scoring.")
+        print("Successfully initialized Jina Reranker")
+    except Exception as e:
+        print(f"Error initializing Jina Reranker: {str(e)}")
+        return
    
-    # Process a test query
-    query = "What are the latest advancements in quantum computing?"
-    print(f"Processing query: {query}")
+    # Simple query and documents
+    query = "What is quantum computing?"
+    documents = [
+        "Quantum computing is a type of computation that harnesses quantum mechanics.",
+        "Classical computers use bits, while quantum computers use qubits.",
+        "Machine learning is a subset of artificial intelligence.",
+        "Quantum computers can solve certain problems faster than classical computers."
+    ]
    
-    processed_query = query_processor.process_query(query)
-    print(f"Processed query: {processed_query}")
+    print(f"Testing reranker with query: {query}")
+    print(f"Documents: {documents}")
    
-    # Execute the search
-    available_engines = search_executor.get_available_search_engines()
-    print(f"Available search engines: {available_engines}")
+    # Rerank the documents
+    try:
+        reranked = reranker.rerank(query, documents)
+        print(f"Reranked results: {json.dumps(reranked, indent=2)}")
        
-    if 'search_engines' not in processed_query:
-        processed_query['search_engines'] = available_engines
-    
-    # Execute the search
-    search_results = search_executor.execute_search(
-        structured_query=processed_query,
-        num_results=10
-    )
-    
-    # Print which engines returned results
-    for engine, results in search_results.items():
-        print(f"Engine {engine} returned {len(results)} results")
-    
-    # Add the query to each result for reranking
-    enhanced_query = processed_query.get("enhanced_query", processed_query.get("original_query", query))
-    print(f"Enhanced query for reranking: {enhanced_query}")
-    
-    # Print the structure of the first result from each engine
-    print("\nResult structure examples:")
-    for engine, results in search_results.items():
-        if results:
-            print(f"\n{engine} result example:")
-            print(json.dumps(results[0], indent=2, default=str))
-    
-    # Flatten results for easier manipulation
-    flattened_results = []
-    for engine, results in search_results.items():
-        for result in results:
-            # Add the query and engine to each result
-            result["query"] = enhanced_query
-            result["engine"] = engine
-            flattened_results.append(result)
-    
-    # Verify that the query is in the flattened results
-    if flattened_results:
-        print(f"\nVerifying query in flattened results:")
-        print(f"Query in first result: {flattened_results[0].get('query', 'NOT FOUND')[:50]}...")
-    
-    # Process results without reranking
-    print("\nProcessing results without reranking...")
-    basic_results = result_collector.process_results(
-        {"combined": flattened_results}, dedup=True, max_results=None, use_reranker=False
-    )
-    print(f"Processed {len(basic_results)} results with basic scoring")
-    
-    # Save basic results
-    results_dir = Path(__file__).parent / "results"
+        # Save the results to a file for analysis
+        results_dir = Path("results")
        results_dir.mkdir(exist_ok=True)
+        results_file = results_dir / f"reranked_results_{int(time.time())}.json"
        
-    timestamp = int(time.time())
-    basic_file = results_dir / f"basic_results_{timestamp}.json"
-    
-    with open(basic_file, "w") as f:
-        json.dump(basic_results, f, indent=2)
-    print(f"Basic results saved to {basic_file}")
-    
-    # Process results with reranking (if available)
-    if reranker_available:
-        print("\nProcessing results with reranking...")
-        reranked_results = result_collector.process_results(
-            {"combined": flattened_results}, dedup=True, max_results=None, use_reranker=True
-        )
-        print(f"Processed {len(reranked_results)} results with reranking")
-        
-        # Save reranked results
-        reranked_file = results_dir / f"reranked_results_{timestamp}.json"
-        
-        with open(reranked_file, "w") as f:
-            json.dump(reranked_results, f, indent=2)
-        print(f"Reranked results saved to {reranked_file}")
-        
-        # Compare top 5 results
-        print("\nComparing top 5 results:")
-        print("\nTop 5 results with basic scoring:")
-        for i, result in enumerate(basic_results[:5]):
-            print(f"{i+1}. {result.get('title')} (Score: {result.get('relevance_score')}, Source: {result.get('source')})")
-        
-        print("\nTop 5 results with reranking:")
-        for i, result in enumerate(reranked_results[:5]):
-            print(f"{i+1}. {result.get('title')} (Score: {result.get('relevance_score')}, Source: {result.get('source')})")
+        with open(results_file, "w") as f:
+            json.dump(reranked, f, indent=2)
        
+        print(f"Results saved to {results_file}")
+        return True
+    except Exception as e:
+        print(f"Error reranking: {str(e)}")
+        return False

 if __name__ == "__main__":
-    test_reranker()
+    # Just run the simple test
+    success = test_simple_reranker()
+    
+    if success:
+        print("Jina Reranker test completed successfully!")
+    else:
+        print("Jina Reranker test failed.")
--- a/test_simple_reranker.py
+++ b/test_simple_reranker.py
@ -0,0 +1,152 @@
+import json
+import sys
+import os
+import yaml
+from pathlib import Path
+
+# Append the directory containing this script to sys.path so that modules
+# located next to it can be imported when the script is run directly.
+sys.path.append(os.path.dirname(os.path.abspath(__file__)))
+
+# Let's create a custom JinaReranker class specifically for testing
+class TestJinaReranker:
+    """Standalone rerank client with explicit initialization parameters.
+
+    Unlike a config-driven client, the API key, model name and endpoint URL
+    are passed directly to the constructor.
+    """
+
+    # The name starts with "Test" but this is a helper, not a test case;
+    # opt out of pytest collection so it does not warn about __init__.
+    __test__ = False
+
+    def __init__(self, api_key, model, endpoint, timeout=30):
+        """Store the connection settings.
+
+        Args:
+            api_key: Token sent as "Bearer <api_key>" in the Authorization header.
+            model: Value sent in the "model" field of the request payload.
+            endpoint: URL the rerank request is POSTed to.
+            timeout: Seconds to wait for the HTTP request before giving up.
+        """
+        self.api_key = api_key
+        self.model = model
+        self.endpoint = endpoint
+        self.timeout = timeout
+        self.default_top_n = 10
+
+    @staticmethod
+    def _extract_items(result):
+        """Return the list of scored items from a parsed response, or []."""
+        # "results" takes precedence; "data" is the fallback for older
+        # response structures.
+        for key in ("results", "data"):
+            items = result.get(key)
+            if isinstance(items, list):
+                return items
+        return []
+
+    def rerank(self, query, documents, top_n=None):
+        """Rerank documents based on their relevance to the query.
+
+        Args:
+            query: Query string sent to the API.
+            documents: List of document strings, sent as a plain array.
+            top_n: Maximum number of results to request. Defaults to
+                ``self.default_top_n``; always capped at ``len(documents)``.
+
+        Returns:
+            A list of dicts with keys 'index', 'score' and 'document', in the
+            order the API returned them. 'document' is None when the returned
+            index does not address an entry of ``documents``. An empty list is
+            returned when ``documents`` is empty, the API answers with a
+            non-200 status, the request fails, or the body is not a JSON object.
+        """
+        if not documents:
+            return []
+
+        # Never ask for more results than there are documents.
+        requested = self.default_top_n if top_n is None else top_n
+        top_n = min(requested, len(documents))
+
+        headers = {
+            "Content-Type": "application/json",
+            "Authorization": f"Bearer {self.api_key}",
+            "Accept": "application/json"
+        }
+
+        data = {
+            "model": self.model,
+            "query": query,
+            "documents": documents,  # Plain array of strings
+            "top_n": top_n
+        }
+
+        print(f"Making reranker API call with query: {query}")
+        print(f"Request payload structure: model, query, documents (array of {len(documents)} strings), top_n={top_n}")
+
+        # Imported here, after the empty-input return above, so that path
+        # does not require the requests package.
+        import requests
+
+        try:
+            # A timeout is mandatory: without it a stalled connection blocks forever.
+            response = requests.post(
+                self.endpoint, headers=headers, json=data, timeout=self.timeout
+            )
+            print(f"Reranker API response status: {response.status_code}")
+
+            if response.status_code != 200:
+                print(f"Reranker API error: {response.text}")
+                return []
+
+            result = response.json()
+        except (requests.RequestException, ValueError) as e:
+            # RequestException: transport-level failure; ValueError: body is not JSON.
+            print(f"Error calling reranker API: {str(e)}")
+            return []
+
+        if not isinstance(result, dict):
+            print(f"Error calling reranker API: unexpected response type {type(result).__name__}")
+            return []
+
+        print(f"Reranker API response structure: {list(result.keys())}")
+        print(f"Full response: {json.dumps(result, indent=2)}")
+
+        # Process and return the reranked results
+        reranked_results = []
+        for item in self._extract_items(result):
+            if not (isinstance(item, dict) and "index" in item and "relevance_score" in item):
+                continue
+            idx = item["index"]
+            # Reject negative or non-integer indices instead of letting them
+            # wrap around or raise and discard every result.
+            in_range = isinstance(idx, int) and 0 <= idx < len(documents)
+            reranked_results.append({
+                'index': idx,
+                'score': item["relevance_score"],
+                'document': documents[idx] if in_range else None
+            })
+
+        print(f"Processed reranker results: {len(reranked_results)} items")
+        return reranked_results
+
+def load_config():
+    """Load configuration from config/config.yaml next to this script.
+
+    Returns:
+        The parsed YAML content, or an empty dict when the file does not
+        exist or contains no document.
+    """
+    config_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "config", "config.yaml")
+    print(f"Loading config from {config_path}")
+
+    if not os.path.exists(config_path):
+        print(f"Config file not found at {config_path}")
+        return {}
+
+    with open(config_path, "r", encoding="utf-8") as f:
+        config = yaml.safe_load(f)
+    print("Configuration loaded successfully")
+    # safe_load yields None for an empty file; return {} so the result type
+    # matches the missing-file case.
+    return config if config is not None else {}
+
+def test_simple_reranker():
+    """Send one fixed query with four fixed documents to the reranker and print the result.
+
+    Reads the API key from the JINA_API_KEY environment variable and returns
+    early (after printing a message) when it is unset or empty.
+    """
+    api_key = os.environ.get("JINA_API_KEY", "")
+    if api_key:
+        print("Found JINA_API_KEY in environment variables")
+    else:
+        print("JINA_API_KEY not found in environment variables")
+        return
+
+    # Client configured with explicit model and endpoint values
+    client = TestJinaReranker(
+        api_key=api_key,
+        model="jina-reranker-v2-base-multilingual",
+        endpoint="https://api.jina.ai/v1/rerank",
+    )
+
+    # Fixed input: one question and four short candidate passages
+    question = "What is quantum computing?"
+    corpus = [
+        "Quantum computing is a type of computation that harnesses quantum mechanics.",
+        "Classical computers use bits, while quantum computers use qubits.",
+        "Machine learning is a subset of artificial intelligence.",
+        "Quantum computers can solve certain problems faster than classical computers.",
+    ]
+
+    print(f"Testing simple reranker with query: {question}")
+    print(f"Documents: {corpus}")
+
+    # Call the API and dump whatever came back
+    ranking = client.rerank(question, corpus)
+    pretty = json.dumps(ranking, indent=2)
+    print(f"Reranked results: {pretty}")
+
+if __name__ == "__main__":
+    # Script entry point: run the single reranker check when this file is
+    # executed directly rather than imported.
+    test_simple_reranker()