Fix Jina Reranker API integration with proper request and response handling

This commit is contained in:
Steve White 2025-02-27 17:16:52 -06:00
parent 16c7dca2c7
commit a34b92c103
3 changed files with 218 additions and 129 deletions

View File

@ -78,14 +78,16 @@ class JinaReranker:
"Accept": "application/json"
}
# The correct format is an array of plain strings, not objects with a "text" field
data = {
"model": self.model,
"query": query,
"documents": documents,
"documents": documents, # Plain array of strings
"top_n": top_n
}
print(f"Making reranker API call with query: {query[:50]}... and {len(documents)} documents")
print(f"Request payload structure: model, query, documents (array of {len(documents)} strings), top_n={top_n}")
try:
response = requests.post(self.endpoint, headers=headers, json=data)
@ -98,12 +100,30 @@ class JinaReranker:
response.raise_for_status() # Raise exception for HTTP errors
result = response.json()
print(f"Reranker API response structure: {list(result.keys())}")
# Process and return the reranked results
reranked_results = []
# Check for the specific response structure we observed
if "data" in result and isinstance(result["data"], list):
# Check for the specific response structure from the API
if "results" in result and isinstance(result["results"], list):
results_list = result["results"]
for item in results_list:
if isinstance(item, dict) and "index" in item and "relevance_score" in item:
reranked_results.append({
'index': item.get('index'),
'score': item.get('relevance_score'),
'document': documents[item.get('index')] if item.get('index') < len(documents) else None
})
# Handle newer Jina API format with document.text
elif isinstance(item, dict) and "index" in item and "document" in item and "relevance_score" in item:
reranked_results.append({
'index': item.get('index'),
'score': item.get('relevance_score'),
'document': documents[item.get('index')] if item.get('index') < len(documents) else None
})
# Fallback for older response structures with "data" field
elif "data" in result and isinstance(result["data"], list):
data_list = result["data"]
for item in data_list:
if isinstance(item, dict) and "index" in item and "relevance_score" in item:
@ -112,26 +132,6 @@ class JinaReranker:
'score': item.get('relevance_score'),
'document': documents[item.get('index')] if item.get('index') < len(documents) else None
})
# Check other possible response structures
elif "results" in result:
results_list = result["results"]
for item in results_list:
if isinstance(item, dict) and "index" in item and "score" in item:
reranked_results.append({
'index': item.get('index'),
'score': item.get('score'),
'document': documents[item.get('index')] if item.get('index') < len(documents) else None
})
elif "documents" in result:
# Alternative API response structure
docs_list = result["documents"]
for i, doc in enumerate(docs_list):
if isinstance(doc, dict) and "score" in doc:
reranked_results.append({
'index': i,
'score': doc.get('score'),
'document': documents[i]
})
print(f"Processed reranker results: {len(reranked_results)} items")
return reranked_results

View File

@ -5,122 +5,59 @@ This script tests the reranker functionality by comparing results with and witho
import json
import time
import os
from pathlib import Path
from typing import Dict, List, Any, Optional
from query.query_processor import QueryProcessor
from execution.search_executor import SearchExecutor
from execution.result_collector import ResultCollector
from ranking.jina_reranker import get_jina_reranker
# Import just what we need for the simple test
from ranking.jina_reranker import JinaReranker, get_jina_reranker
def test_reranker():
"""Test the reranker functionality."""
# Initialize components
query_processor = QueryProcessor()
search_executor = SearchExecutor()
result_collector = ResultCollector()
# Check if reranker is available
def test_simple_reranker():
"""Test the Jina Reranker with a simple query and documents"""
# Initialize the reranker directly without parameters (it will read from config)
try:
reranker = get_jina_reranker()
reranker_available = True
print("Jina Reranker is available.")
except ValueError:
reranker_available = False
print("Jina Reranker is not available. Will only test basic scoring.")
print("Successfully initialized Jina Reranker")
except Exception as e:
print(f"Error initializing Jina Reranker: {str(e)}")
return
# Process a test query
query = "What are the latest advancements in quantum computing?"
print(f"Processing query: {query}")
# Simple query and documents
query = "What is quantum computing?"
documents = [
"Quantum computing is a type of computation that harnesses quantum mechanics.",
"Classical computers use bits, while quantum computers use qubits.",
"Machine learning is a subset of artificial intelligence.",
"Quantum computers can solve certain problems faster than classical computers."
]
processed_query = query_processor.process_query(query)
print(f"Processed query: {processed_query}")
print(f"Testing reranker with query: {query}")
print(f"Documents: {documents}")
# Execute the search
available_engines = search_executor.get_available_search_engines()
print(f"Available search engines: {available_engines}")
if 'search_engines' not in processed_query:
processed_query['search_engines'] = available_engines
# Execute the search
search_results = search_executor.execute_search(
structured_query=processed_query,
num_results=10
)
# Print which engines returned results
for engine, results in search_results.items():
print(f"Engine {engine} returned {len(results)} results")
# Add the query to each result for reranking
enhanced_query = processed_query.get("enhanced_query", processed_query.get("original_query", query))
print(f"Enhanced query for reranking: {enhanced_query}")
# Print the structure of the first result from each engine
print("\nResult structure examples:")
for engine, results in search_results.items():
if results:
print(f"\n{engine} result example:")
print(json.dumps(results[0], indent=2, default=str))
# Flatten results for easier manipulation
flattened_results = []
for engine, results in search_results.items():
for result in results:
# Add the query and engine to each result
result["query"] = enhanced_query
result["engine"] = engine
flattened_results.append(result)
# Verify that the query is in the flattened results
if flattened_results:
print(f"\nVerifying query in flattened results:")
print(f"Query in first result: {flattened_results[0].get('query', 'NOT FOUND')[:50]}...")
# Process results without reranking
print("\nProcessing results without reranking...")
basic_results = result_collector.process_results(
{"combined": flattened_results}, dedup=True, max_results=None, use_reranker=False
)
print(f"Processed {len(basic_results)} results with basic scoring")
# Save basic results
results_dir = Path(__file__).parent / "results"
results_dir.mkdir(exist_ok=True)
timestamp = int(time.time())
basic_file = results_dir / f"basic_results_{timestamp}.json"
with open(basic_file, "w") as f:
json.dump(basic_results, f, indent=2)
print(f"Basic results saved to {basic_file}")
# Process results with reranking (if available)
if reranker_available:
print("\nProcessing results with reranking...")
reranked_results = result_collector.process_results(
{"combined": flattened_results}, dedup=True, max_results=None, use_reranker=True
)
print(f"Processed {len(reranked_results)} results with reranking")
# Rerank the documents
try:
reranked = reranker.rerank(query, documents)
print(f"Reranked results: {json.dumps(reranked, indent=2)}")
# Save reranked results
reranked_file = results_dir / f"reranked_results_{timestamp}.json"
# Save the results to a file for analysis
results_dir = Path("results")
results_dir.mkdir(exist_ok=True)
results_file = results_dir / f"reranked_results_{int(time.time())}.json"
with open(reranked_file, "w") as f:
json.dump(reranked_results, f, indent=2)
print(f"Reranked results saved to {reranked_file}")
with open(results_file, "w") as f:
json.dump(reranked, f, indent=2)
# Compare top 5 results
print("\nComparing top 5 results:")
print("\nTop 5 results with basic scoring:")
for i, result in enumerate(basic_results[:5]):
print(f"{i+1}. {result.get('title')} (Score: {result.get('relevance_score')}, Source: {result.get('source')})")
print("\nTop 5 results with reranking:")
for i, result in enumerate(reranked_results[:5]):
print(f"{i+1}. {result.get('title')} (Score: {result.get('relevance_score')}, Source: {result.get('source')})")
print(f"Results saved to {results_file}")
return True
except Exception as e:
print(f"Error reranking: {str(e)}")
return False
if __name__ == "__main__":
test_reranker()
# Just run the simple test
success = test_simple_reranker()
if success:
print("Jina Reranker test completed successfully!")
else:
print("Jina Reranker test failed.")

152
test_simple_reranker.py Normal file
View File

@ -0,0 +1,152 @@
import json
import sys
import os
import yaml
from pathlib import Path
# Add the project root to the path
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
# Let's create a custom JinaReranker class specifically for testing
class TestJinaReranker:
    """Custom JinaReranker for testing with explicit initialization parameters.

    All connection settings are passed to the constructor, and progress is
    reported with ``print`` so the script can be run by hand.
    """

    # The class name starts with "Test" and this file is named test_*.py, so
    # pytest would try to collect it as a test class (and warn because it
    # defines __init__). It is a helper, not a test case, so opt out.
    __test__ = False

    def __init__(self, api_key, model, endpoint, timeout=30):
        """Store the connection parameters.

        Args:
            api_key: Bearer token sent in the ``Authorization`` header.
            model: Model identifier placed in the request payload.
            endpoint: URL the rerank request is POSTed to.
            timeout: Seconds to wait for the HTTP call before giving up, so a
                stalled connection cannot hang the script forever.
        """
        self.api_key = api_key
        self.model = model
        self.endpoint = endpoint
        self.timeout = timeout
        self.default_top_n = 10

    def rerank(self, query, documents, top_n=None):
        """Rerank documents based on their relevance to the query.

        Args:
            query: Query string to rank the documents against.
            documents: List of plain document strings.
            top_n: Maximum number of results to request. Defaults to
                ``self.default_top_n``; always capped at ``len(documents)``.

        Returns:
            A list of dicts with keys ``'index'``, ``'score'`` and
            ``'document'``. An empty list is returned when ``documents`` is
            empty, when the API answers with a non-200 status, or when the
            request or response handling raises.
        """
        if not documents:
            return []

        # Never ask for more results than there are documents.
        requested = self.default_top_n if top_n is None else top_n
        top_n = min(requested, len(documents))

        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {self.api_key}",
            "Accept": "application/json",
        }
        data = {
            "model": self.model,
            "query": query,
            "documents": documents,  # Plain array of strings
            "top_n": top_n,
        }

        print(f"Making reranker API call with query: {query}")
        print(f"Request payload structure: model, query, documents (array of {len(documents)} strings), top_n={top_n}")

        # Kept as a local import: the module does not import requests at the
        # top, and the empty-input path above does not need it.
        import requests

        try:
            response = requests.post(
                self.endpoint,
                headers=headers,
                json=data,
                timeout=self.timeout,
            )
            print(f"Reranker API response status: {response.status_code}")

            if response.status_code != 200:
                print(f"Reranker API error: {response.text}")
                return []

            result = response.json()
            print(f"Reranker API response structure: {list(result.keys())}")
            print(f"Full response: {json.dumps(result, indent=2)}")

            reranked_results = self._parse_response(result, documents)
            print(f"Processed reranker results: {len(reranked_results)} items")
            return reranked_results
        except Exception as e:
            # Best-effort helper for a manual test script: report the problem
            # and hand back an empty ranking instead of crashing.
            print(f"Error calling reranker API: {str(e)}")
            return []

    @staticmethod
    def _parse_response(result, documents):
        """Convert a decoded API response into the list returned by rerank().

        The ranked items are read from ``result["results"]`` when that is a
        list, otherwise from ``result["data"]`` (older response structure).
        Items lacking ``index`` or ``relevance_score`` are skipped. Items that
        also carry a ``document`` field are handled the same way; the document
        text is always looked up in ``documents`` by index.
        """
        if not isinstance(result, dict):
            return []

        items = next(
            (
                result[key]
                for key in ("results", "data")
                if isinstance(result.get(key), list)
            ),
            [],
        )

        parsed = []
        for item in items:
            if not isinstance(item, dict):
                continue
            if "index" not in item or "relevance_score" not in item:
                continue
            idx = item["index"]
            # Guard both bounds and the type: a negative index would silently
            # wrap around to the wrong document, and a non-int would raise.
            in_range = isinstance(idx, int) and 0 <= idx < len(documents)
            parsed.append({
                'index': idx,
                'score': item["relevance_score"],
                'document': documents[idx] if in_range else None,
            })
        return parsed
def load_config():
    """Load configuration from the YAML file next to this script.

    The file is expected at ``config/config.yaml`` relative to the directory
    containing this module.

    Returns:
        The parsed configuration, or an empty dict when the file does not
        exist or contains no YAML document.
    """
    config_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "config", "config.yaml")
    print(f"Loading config from {config_path}")

    if not os.path.exists(config_path):
        print(f"Config file not found at {config_path}")
        return {}

    # Explicit encoding so parsing does not depend on the platform default.
    with open(config_path, "r", encoding="utf-8") as f:
        config = yaml.safe_load(f)
    print("Configuration loaded successfully")

    # safe_load yields None for an empty file; normalise that to the same
    # empty dict the missing-file path returns so callers get one type.
    return config if config is not None else {}
def test_simple_reranker():
    """Run one rerank request against the Jina API and print the outcome.

    The API key is read from the ``JINA_API_KEY`` environment variable; when
    it is missing or empty the function prints a notice and returns without
    contacting the API.
    """
    api_key = os.getenv("JINA_API_KEY", "")
    if not api_key:
        print("JINA_API_KEY not found in environment variables")
        return
    print("Found JINA_API_KEY in environment variables")

    # Fixed sample input: one query and four short candidate passages.
    query = "What is quantum computing?"
    documents = [
        "Quantum computing is a type of computation that harnesses quantum mechanics.",
        "Classical computers use bits, while quantum computers use qubits.",
        "Machine learning is a subset of artificial intelligence.",
        "Quantum computers can solve certain problems faster than classical computers.",
    ]

    # Build the client with explicit settings rather than reading a config.
    client = TestJinaReranker(
        api_key=api_key,
        model="jina-reranker-v2-base-multilingual",
        endpoint="https://api.jina.ai/v1/rerank",
    )

    print(f"Testing simple reranker with query: {query}")
    print(f"Documents: {documents}")

    ranking = client.rerank(query, documents)
    pretty = json.dumps(ranking, indent=2)
    print(f"Reranked results: {pretty}")
if __name__ == "__main__":
    # Script entry point: executing this file directly runs the single
    # reranker check defined above.
    test_simple_reranker()