Fix Jina Reranker API integration with proper request and response handling
This commit is contained in:
parent
16c7dca2c7
commit
a34b92c103
|
@ -78,14 +78,16 @@ class JinaReranker:
|
|||
"Accept": "application/json"
|
||||
}
|
||||
|
||||
# The correct format is an array of plain strings, not objects with a "text" field
|
||||
data = {
|
||||
"model": self.model,
|
||||
"query": query,
|
||||
"documents": documents,
|
||||
"documents": documents, # Plain array of strings
|
||||
"top_n": top_n
|
||||
}
|
||||
|
||||
print(f"Making reranker API call with query: {query[:50]}... and {len(documents)} documents")
|
||||
print(f"Request payload structure: model, query, documents (array of {len(documents)} strings), top_n={top_n}")
|
||||
|
||||
try:
|
||||
response = requests.post(self.endpoint, headers=headers, json=data)
|
||||
|
@ -98,12 +100,30 @@ class JinaReranker:
|
|||
response.raise_for_status() # Raise exception for HTTP errors
|
||||
|
||||
result = response.json()
|
||||
print(f"Reranker API response structure: {list(result.keys())}")
|
||||
|
||||
# Process and return the reranked results
|
||||
reranked_results = []
|
||||
|
||||
# Check for the specific response structure we observed
|
||||
if "data" in result and isinstance(result["data"], list):
|
||||
# Check for the specific response structure from the API
|
||||
if "results" in result and isinstance(result["results"], list):
|
||||
results_list = result["results"]
|
||||
for item in results_list:
|
||||
if isinstance(item, dict) and "index" in item and "relevance_score" in item:
|
||||
reranked_results.append({
|
||||
'index': item.get('index'),
|
||||
'score': item.get('relevance_score'),
|
||||
'document': documents[item.get('index')] if item.get('index') < len(documents) else None
|
||||
})
|
||||
# Handle newer Jina API format with document.text
|
||||
elif isinstance(item, dict) and "index" in item and "document" in item and "relevance_score" in item:
|
||||
reranked_results.append({
|
||||
'index': item.get('index'),
|
||||
'score': item.get('relevance_score'),
|
||||
'document': documents[item.get('index')] if item.get('index') < len(documents) else None
|
||||
})
|
||||
# Fallback for older response structures with "data" field
|
||||
elif "data" in result and isinstance(result["data"], list):
|
||||
data_list = result["data"]
|
||||
for item in data_list:
|
||||
if isinstance(item, dict) and "index" in item and "relevance_score" in item:
|
||||
|
@ -112,26 +132,6 @@ class JinaReranker:
|
|||
'score': item.get('relevance_score'),
|
||||
'document': documents[item.get('index')] if item.get('index') < len(documents) else None
|
||||
})
|
||||
# Check other possible response structures
|
||||
elif "results" in result:
|
||||
results_list = result["results"]
|
||||
for item in results_list:
|
||||
if isinstance(item, dict) and "index" in item and "score" in item:
|
||||
reranked_results.append({
|
||||
'index': item.get('index'),
|
||||
'score': item.get('score'),
|
||||
'document': documents[item.get('index')] if item.get('index') < len(documents) else None
|
||||
})
|
||||
elif "documents" in result:
|
||||
# Alternative API response structure
|
||||
docs_list = result["documents"]
|
||||
for i, doc in enumerate(docs_list):
|
||||
if isinstance(doc, dict) and "score" in doc:
|
||||
reranked_results.append({
|
||||
'index': i,
|
||||
'score': doc.get('score'),
|
||||
'document': documents[i]
|
||||
})
|
||||
|
||||
print(f"Processed reranker results: {len(reranked_results)} items")
|
||||
return reranked_results
|
||||
|
|
147
test_reranker.py
147
test_reranker.py
|
@ -5,122 +5,59 @@ This script tests the reranker functionality by comparing results with and witho
|
|||
|
||||
import json
|
||||
import time
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Any, Optional
|
||||
|
||||
from query.query_processor import QueryProcessor
|
||||
from execution.search_executor import SearchExecutor
|
||||
from execution.result_collector import ResultCollector
|
||||
from ranking.jina_reranker import get_jina_reranker
|
||||
# Import just what we need for the simple test
|
||||
from ranking.jina_reranker import JinaReranker, get_jina_reranker
|
||||
|
||||
|
||||
def test_reranker():
|
||||
"""Test the reranker functionality."""
|
||||
# Initialize components
|
||||
query_processor = QueryProcessor()
|
||||
search_executor = SearchExecutor()
|
||||
result_collector = ResultCollector()
|
||||
|
||||
# Check if reranker is available
|
||||
def test_simple_reranker():
|
||||
"""Test the Jina Reranker with a simple query and documents"""
|
||||
# Initialize the reranker directly without parameters (it will read from config)
|
||||
try:
|
||||
reranker = get_jina_reranker()
|
||||
reranker_available = True
|
||||
print("Jina Reranker is available.")
|
||||
except ValueError:
|
||||
reranker_available = False
|
||||
print("Jina Reranker is not available. Will only test basic scoring.")
|
||||
print("Successfully initialized Jina Reranker")
|
||||
except Exception as e:
|
||||
print(f"Error initializing Jina Reranker: {str(e)}")
|
||||
return
|
||||
|
||||
# Process a test query
|
||||
query = "What are the latest advancements in quantum computing?"
|
||||
print(f"Processing query: {query}")
|
||||
# Simple query and documents
|
||||
query = "What is quantum computing?"
|
||||
documents = [
|
||||
"Quantum computing is a type of computation that harnesses quantum mechanics.",
|
||||
"Classical computers use bits, while quantum computers use qubits.",
|
||||
"Machine learning is a subset of artificial intelligence.",
|
||||
"Quantum computers can solve certain problems faster than classical computers."
|
||||
]
|
||||
|
||||
processed_query = query_processor.process_query(query)
|
||||
print(f"Processed query: {processed_query}")
|
||||
print(f"Testing reranker with query: {query}")
|
||||
print(f"Documents: {documents}")
|
||||
|
||||
# Execute the search
|
||||
available_engines = search_executor.get_available_search_engines()
|
||||
print(f"Available search engines: {available_engines}")
|
||||
# Rerank the documents
|
||||
try:
|
||||
reranked = reranker.rerank(query, documents)
|
||||
print(f"Reranked results: {json.dumps(reranked, indent=2)}")
|
||||
|
||||
if 'search_engines' not in processed_query:
|
||||
processed_query['search_engines'] = available_engines
|
||||
|
||||
# Execute the search
|
||||
search_results = search_executor.execute_search(
|
||||
structured_query=processed_query,
|
||||
num_results=10
|
||||
)
|
||||
|
||||
# Print which engines returned results
|
||||
for engine, results in search_results.items():
|
||||
print(f"Engine {engine} returned {len(results)} results")
|
||||
|
||||
# Add the query to each result for reranking
|
||||
enhanced_query = processed_query.get("enhanced_query", processed_query.get("original_query", query))
|
||||
print(f"Enhanced query for reranking: {enhanced_query}")
|
||||
|
||||
# Print the structure of the first result from each engine
|
||||
print("\nResult structure examples:")
|
||||
for engine, results in search_results.items():
|
||||
if results:
|
||||
print(f"\n{engine} result example:")
|
||||
print(json.dumps(results[0], indent=2, default=str))
|
||||
|
||||
# Flatten results for easier manipulation
|
||||
flattened_results = []
|
||||
for engine, results in search_results.items():
|
||||
for result in results:
|
||||
# Add the query and engine to each result
|
||||
result["query"] = enhanced_query
|
||||
result["engine"] = engine
|
||||
flattened_results.append(result)
|
||||
|
||||
# Verify that the query is in the flattened results
|
||||
if flattened_results:
|
||||
print(f"\nVerifying query in flattened results:")
|
||||
print(f"Query in first result: {flattened_results[0].get('query', 'NOT FOUND')[:50]}...")
|
||||
|
||||
# Process results without reranking
|
||||
print("\nProcessing results without reranking...")
|
||||
basic_results = result_collector.process_results(
|
||||
{"combined": flattened_results}, dedup=True, max_results=None, use_reranker=False
|
||||
)
|
||||
print(f"Processed {len(basic_results)} results with basic scoring")
|
||||
|
||||
# Save basic results
|
||||
results_dir = Path(__file__).parent / "results"
|
||||
# Save the results to a file for analysis
|
||||
results_dir = Path("results")
|
||||
results_dir.mkdir(exist_ok=True)
|
||||
results_file = results_dir / f"reranked_results_{int(time.time())}.json"
|
||||
|
||||
timestamp = int(time.time())
|
||||
basic_file = results_dir / f"basic_results_{timestamp}.json"
|
||||
|
||||
with open(basic_file, "w") as f:
|
||||
json.dump(basic_results, f, indent=2)
|
||||
print(f"Basic results saved to {basic_file}")
|
||||
|
||||
# Process results with reranking (if available)
|
||||
if reranker_available:
|
||||
print("\nProcessing results with reranking...")
|
||||
reranked_results = result_collector.process_results(
|
||||
{"combined": flattened_results}, dedup=True, max_results=None, use_reranker=True
|
||||
)
|
||||
print(f"Processed {len(reranked_results)} results with reranking")
|
||||
|
||||
# Save reranked results
|
||||
reranked_file = results_dir / f"reranked_results_{timestamp}.json"
|
||||
|
||||
with open(reranked_file, "w") as f:
|
||||
json.dump(reranked_results, f, indent=2)
|
||||
print(f"Reranked results saved to {reranked_file}")
|
||||
|
||||
# Compare top 5 results
|
||||
print("\nComparing top 5 results:")
|
||||
print("\nTop 5 results with basic scoring:")
|
||||
for i, result in enumerate(basic_results[:5]):
|
||||
print(f"{i+1}. {result.get('title')} (Score: {result.get('relevance_score')}, Source: {result.get('source')})")
|
||||
|
||||
print("\nTop 5 results with reranking:")
|
||||
for i, result in enumerate(reranked_results[:5]):
|
||||
print(f"{i+1}. {result.get('title')} (Score: {result.get('relevance_score')}, Source: {result.get('source')})")
|
||||
with open(results_file, "w") as f:
|
||||
json.dump(reranked, f, indent=2)
|
||||
|
||||
print(f"Results saved to {results_file}")
|
||||
return True
|
||||
except Exception as e:
|
||||
print(f"Error reranking: {str(e)}")
|
||||
return False
|
||||
|
||||
if __name__ == "__main__":
|
||||
test_reranker()
|
||||
# Just run the simple test
|
||||
success = test_simple_reranker()
|
||||
|
||||
if success:
|
||||
print("Jina Reranker test completed successfully!")
|
||||
else:
|
||||
print("Jina Reranker test failed.")
|
||||
|
|
|
@ -0,0 +1,152 @@
|
|||
import json
|
||||
import sys
|
||||
import os
|
||||
import yaml
|
||||
from pathlib import Path
|
||||
|
||||
# Add the project root to the path
|
||||
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
|
||||
|
||||
# Let's create a custom JinaReranker class specifically for testing
|
||||
class TestJinaReranker:
    """Standalone Jina reranker client for testing, configured explicitly.

    Every connection parameter is passed to the constructor instead of being
    read from configuration, so the request/response handling can be
    exercised in isolation.
    """

    # The class name starts with "Test", so pytest would otherwise try to
    # collect this helper as a test class and warn about its constructor.
    __test__ = False

    def __init__(self, api_key, model, endpoint, timeout=30):
        """Store the connection parameters.

        Args:
            api_key: Bearer token sent in the ``Authorization`` header.
            model: Reranker model name sent in the request payload.
            endpoint: URL the rerank request is POSTed to.
            timeout: Seconds to wait for the HTTP request before giving up.
        """
        self.api_key = api_key
        self.model = model
        self.endpoint = endpoint
        self.timeout = timeout
        self.default_top_n = 10

    @staticmethod
    def _extract_items(result):
        """Return the list of result entries from a decoded API response.

        The ``"results"`` key is preferred; ``"data"`` is accepted as a
        fallback for older response layouts. Anything else yields ``[]``.
        """
        if not isinstance(result, dict):
            return []
        for key in ("results", "data"):
            entries = result.get(key)
            if isinstance(entries, list):
                return entries
        return []

    @staticmethod
    def _build_results(entries, documents):
        """Convert raw response entries into ``index``/``score``/``document`` dicts.

        Entries that are not dicts, or that lack ``"index"`` or
        ``"relevance_score"``, are skipped. The document is looked up only
        when the index is an int inside ``documents``; otherwise it is None.
        """
        reranked_results = []
        for item in entries:
            if not isinstance(item, dict):
                continue
            if "index" not in item or "relevance_score" not in item:
                continue
            idx = item["index"]
            # Require a real, non-negative position: a negative index would
            # silently pick a document from the end of the list.
            in_range = isinstance(idx, int) and 0 <= idx < len(documents)
            reranked_results.append({
                'index': idx,
                'score': item['relevance_score'],
                'document': documents[idx] if in_range else None,
            })
        return reranked_results

    def rerank(self, query, documents, top_n=None):
        """Rerank documents based on their relevance to the query.

        Args:
            query: Query string sent to the reranker.
            documents: List of plain document strings to rank.
            top_n: Maximum number of results to request. Defaults to
                ``self.default_top_n``; always capped at ``len(documents)``.

        Returns:
            A list of dicts with ``index``, ``score`` and ``document`` keys.
            An empty list is returned when ``documents`` is empty, when the
            API answers with a non-200 status, or when the call fails.
        """
        if not documents:
            return []

        # Never ask for more results than there are documents.
        requested = self.default_top_n if top_n is None else top_n
        top_n = min(requested, len(documents))

        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {self.api_key}",
            "Accept": "application/json",
        }

        data = {
            "model": self.model,
            "query": query,
            "documents": documents,  # Plain array of strings
            "top_n": top_n,
        }

        print(f"Making reranker API call with query: {query}")
        print(f"Request payload structure: model, query, documents (array of {len(documents)} strings), top_n={top_n}")

        # Imported lazily so the class can be defined without the dependency.
        import requests

        try:
            # Without a timeout an unresponsive server blocks forever.
            response = requests.post(
                self.endpoint, headers=headers, json=data, timeout=self.timeout
            )
            print(f"Reranker API response status: {response.status_code}")

            if response.status_code != 200:
                print(f"Reranker API error: {response.text}")
                return []

            result = response.json()
            print(f"Reranker API response structure: {list(result.keys())}")
            print(f"Full response: {json.dumps(result, indent=2)}")

            reranked_results = self._build_results(
                self._extract_items(result), documents
            )

            print(f"Processed reranker results: {len(reranked_results)} items")
            return reranked_results

        except Exception as e:
            # Deliberately broad: this test helper is best-effort and reports
            # any failure (network, decoding, unexpected shape) as no results.
            print(f"Error calling reranker API: {str(e)}")
            return []
|
||||
|
||||
def load_config():
    """Load configuration from the YAML file next to this script.

    The file is expected at ``config/config.yaml`` inside the directory that
    contains this module.

    Returns:
        The parsed YAML content, or an empty dict when the file does not
        exist or contains no YAML document.
    """
    config_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "config", "config.yaml")
    print(f"Loading config from {config_path}")

    if not os.path.exists(config_path):
        print(f"Config file not found at {config_path}")
        return {}

    with open(config_path, "r", encoding="utf-8") as f:
        config = yaml.safe_load(f)
    print("Configuration loaded successfully")

    # safe_load returns None for an empty file; normalise that to an empty
    # dict so callers get the same type as in the missing-file case.
    return config if config is not None else {}
|
||||
|
||||
def test_simple_reranker():
    """Run a small end-to-end rerank request against the Jina API.

    Reads the API key from the ``JINA_API_KEY`` environment variable and
    returns early (printing a notice) when it is not set. Otherwise a fixed
    query and four sample documents are sent through ``TestJinaReranker`` and
    the reranked results are printed as JSON.
    """
    api_key = os.getenv("JINA_API_KEY")
    if not api_key:
        print("JINA_API_KEY not found in environment variables")
        return

    print("Found JINA_API_KEY in environment variables")

    # Fixed sample input: three quantum-computing sentences and one distractor.
    query = "What is quantum computing?"
    documents = [
        "Quantum computing is a type of computation that harnesses quantum mechanics.",
        "Classical computers use bits, while quantum computers use qubits.",
        "Machine learning is a subset of artificial intelligence.",
        "Quantum computers can solve certain problems faster than classical computers.",
    ]

    client = TestJinaReranker(
        api_key=api_key,
        model="jina-reranker-v2-base-multilingual",
        endpoint="https://api.jina.ai/v1/rerank",
    )

    print(f"Testing simple reranker with query: {query}")
    print(f"Documents: {documents}")

    ranked = client.rerank(query, documents)
    pretty = json.dumps(ranked, indent=2)
    print(f"Reranked results: {pretty}")
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: execute the single smoke test against the live API.
    test_simple_reranker()
|
Loading…
Reference in New Issue