Fix Jina Reranker API integration to handle response format correctly

This commit is contained in:
Steve White 2025-02-27 17:07:51 -06:00
parent 59bf4a22ee
commit 16c7dca2c7
3 changed files with 115 additions and 35 deletions

View File

@@ -37,29 +37,52 @@ class ResultCollector:
Process search results from multiple search engines.
Args:
search_results: Dictionary mapping search engine names to lists of search results
dedup: Whether to deduplicate results based on URL
max_results: Maximum number of results to return (after processing)
search_results: Dictionary mapping search engine names to lists of results
dedup: Whether to deduplicate results
max_results: Maximum number of results to return
use_reranker: Whether to use the Jina Reranker for semantic ranking
Returns:
List of processed search results
"""
# Flatten and normalize results
all_results = self._flatten_results(search_results)
# Combine results from all search engines
all_results = []
# Check if we have a flattened structure (single key with all results)
if len(search_results) == 1 and "combined" in search_results:
all_results = search_results["combined"]
print(f"Processing {len(all_results)} combined results")
else:
# Traditional structure with separate engines
for engine, results in search_results.items():
for result in results:
# Add the source if not already present
if "source" not in result:
result["source"] = engine
all_results.append(result)
print(f"Processing {len(all_results)} results from {len(search_results)} engines")
# Deduplicate results if requested
if dedup:
all_results = self._deduplicate_results(all_results)
print(f"Deduplicated to {len(all_results)} results")
# Use reranker if available and requested, otherwise use basic scoring
if use_reranker and self.reranker_available:
# Use the reranker if available and requested
if use_reranker and self.reranker is not None:
try:
print("Using Jina Reranker for semantic ranking")
all_results = self._rerank_results(all_results)
print(f"Reranked {len(all_results)} results")
except Exception as e:
print(f"Error using reranker: {str(e)}")
# Fall back to basic scoring
all_results = self._score_and_sort_results(all_results)
else:
# Sort results by relevance (using a simple scoring algorithm)
# Use basic scoring
print("Using basic scoring")
all_results = self._score_and_sort_results(all_results)
# Limit results if requested
# Limit the number of results if requested
if max_results is not None:
all_results = all_results[:max_results]
@@ -67,29 +90,29 @@ class ResultCollector:
def _flatten_results(self, search_results: Dict[str, List[Dict[str, Any]]]) -> List[Dict[str, Any]]:
"""
Flatten results from multiple search engines into a single list.
Flatten search results from multiple search engines into a single list.
Args:
search_results: Dictionary mapping search engine names to lists of search results
search_results: Dictionary mapping search engine names to lists of results
Returns:
Flattened list of search results
"""
# This method is deprecated and kept for backward compatibility
# The process_results method now handles flattened results directly
all_results = []
# Check if we have a flattened structure (single key with all results)
if len(search_results) == 1 and "combined" in search_results:
return search_results["combined"]
# Traditional structure with separate engines
for engine, results in search_results.items():
for result in results:
# Ensure all results have the same basic structure
normalized_result = {
"title": result.get("title", ""),
"url": result.get("url", ""),
"snippet": result.get("snippet", ""),
"source": result.get("source", engine),
"domain": self._extract_domain(result.get("url", "")),
"timestamp": datetime.now().isoformat(),
"raw_data": result
}
all_results.append(normalized_result)
# Add the source if not already present
if "source" not in result:
result["source"] = engine
all_results.append(result)
return all_results
@@ -204,15 +227,33 @@ class ResultCollector:
# Use the reranker to rerank the snippets
reranked = self.reranker.rerank(query, snippets)
if not reranked:
print("Reranker returned empty results. Using basic scoring instead.")
return self._score_and_sort_results(results)
print(f"Reranked {len(reranked)} results")
# Create a new list of results based on the reranking
reranked_results = []
for item in reranked:
# Get the original result and add the new score
original_result = results[item['index']]
index = item.get('index')
score = item.get('score')
if index is None or score is None or index >= len(results):
print(f"Warning: Invalid reranker result item: {item}")
continue
original_result = results[index]
new_result = original_result.copy()
new_result['relevance_score'] = item['score'] * 10 # Scale up the score for consistency
new_result['relevance_score'] = float(score) * 10 # Scale up the score for consistency
reranked_results.append(new_result)
# If we didn't get any valid results, fall back to basic scoring
if not reranked_results:
print("No valid reranked results. Using basic scoring instead.")
return self._score_and_sort_results(results)
return reranked_results
except Exception as e:
print(f"Error reranking results: {str(e)}")

View File

@@ -85,21 +85,55 @@ class JinaReranker:
"top_n": top_n
}
print(f"Making reranker API call with query: {query[:50]}... and {len(documents)} documents")
try:
response = requests.post(self.endpoint, headers=headers, json=data)
print(f"Reranker API response status: {response.status_code}")
if response.status_code != 200:
print(f"Reranker API error: {response.text}")
return []
response.raise_for_status() # Raise exception for HTTP errors
result = response.json()
# Process and return the reranked results
reranked_results = []
for item in result.get('results', []):
# Check for the specific response structure we observed
if "data" in result and isinstance(result["data"], list):
data_list = result["data"]
for item in data_list:
if isinstance(item, dict) and "index" in item and "relevance_score" in item:
reranked_results.append({
'index': item.get('index'), # Original index in the documents list
'score': item.get('score'), # Relevance score
'document': documents[item.get('index')] # The actual document content
'index': item.get('index'),
'score': item.get('relevance_score'),
'document': documents[item.get('index')] if item.get('index') < len(documents) else None
})
# Check other possible response structures
elif "results" in result:
results_list = result["results"]
for item in results_list:
if isinstance(item, dict) and "index" in item and "score" in item:
reranked_results.append({
'index': item.get('index'),
'score': item.get('score'),
'document': documents[item.get('index')] if item.get('index') < len(documents) else None
})
elif "documents" in result:
# Alternative API response structure
docs_list = result["documents"]
for i, doc in enumerate(docs_list):
if isinstance(doc, dict) and "score" in doc:
reranked_results.append({
'index': i,
'score': doc.get('score'),
'document': documents[i]
})
print(f"Processed reranker results: {len(reranked_results)} items")
return reranked_results
except Exception as e:

View File

@@ -73,6 +73,11 @@ def test_reranker():
result["engine"] = engine
flattened_results.append(result)
# Verify that the query is in the flattened results
if flattened_results:
print(f"\nVerifying query in flattened results:")
print(f"Query in first result: {flattened_results[0].get('query', 'NOT FOUND')[:50]}...")
# Process results without reranking
print("\nProcessing results without reranking...")
basic_results = result_collector.process_results(