Fix Jina Reranker API integration to handle response format correctly
This commit is contained in:
parent
59bf4a22ee
commit
16c7dca2c7
|
@ -37,29 +37,52 @@ class ResultCollector:
|
|||
Process search results from multiple search engines.
|
||||
|
||||
Args:
|
||||
search_results: Dictionary mapping search engine names to lists of search results
|
||||
dedup: Whether to deduplicate results based on URL
|
||||
max_results: Maximum number of results to return (after processing)
|
||||
search_results: Dictionary mapping search engine names to lists of results
|
||||
dedup: Whether to deduplicate results
|
||||
max_results: Maximum number of results to return
|
||||
use_reranker: Whether to use the Jina Reranker for semantic ranking
|
||||
|
||||
Returns:
|
||||
List of processed search results
|
||||
"""
|
||||
# Flatten and normalize results
|
||||
all_results = self._flatten_results(search_results)
|
||||
# Combine results from all search engines
|
||||
all_results = []
|
||||
|
||||
# Check if we have a flattened structure (single key with all results)
|
||||
if len(search_results) == 1 and "combined" in search_results:
|
||||
all_results = search_results["combined"]
|
||||
print(f"Processing {len(all_results)} combined results")
|
||||
else:
|
||||
# Traditional structure with separate engines
|
||||
for engine, results in search_results.items():
|
||||
for result in results:
|
||||
# Add the source if not already present
|
||||
if "source" not in result:
|
||||
result["source"] = engine
|
||||
all_results.append(result)
|
||||
print(f"Processing {len(all_results)} results from {len(search_results)} engines")
|
||||
|
||||
# Deduplicate results if requested
|
||||
if dedup:
|
||||
all_results = self._deduplicate_results(all_results)
|
||||
print(f"Deduplicated to {len(all_results)} results")
|
||||
|
||||
# Use reranker if available and requested, otherwise use basic scoring
|
||||
if use_reranker and self.reranker_available:
|
||||
# Use the reranker if available and requested
|
||||
if use_reranker and self.reranker is not None:
|
||||
try:
|
||||
print("Using Jina Reranker for semantic ranking")
|
||||
all_results = self._rerank_results(all_results)
|
||||
print(f"Reranked {len(all_results)} results")
|
||||
except Exception as e:
|
||||
print(f"Error using reranker: {str(e)}")
|
||||
# Fall back to basic scoring
|
||||
all_results = self._score_and_sort_results(all_results)
|
||||
else:
|
||||
# Sort results by relevance (using a simple scoring algorithm)
|
||||
# Use basic scoring
|
||||
print("Using basic scoring")
|
||||
all_results = self._score_and_sort_results(all_results)
|
||||
|
||||
# Limit results if requested
|
||||
# Limit the number of results if requested
|
||||
if max_results is not None:
|
||||
all_results = all_results[:max_results]
|
||||
|
||||
|
@ -67,29 +90,29 @@ class ResultCollector:
|
|||
|
||||
def _flatten_results(self, search_results: Dict[str, List[Dict[str, Any]]]) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
Flatten results from multiple search engines into a single list.
|
||||
Flatten search results from multiple search engines into a single list.
|
||||
|
||||
Args:
|
||||
search_results: Dictionary mapping search engine names to lists of search results
|
||||
search_results: Dictionary mapping search engine names to lists of results
|
||||
|
||||
Returns:
|
||||
Flattened list of search results
|
||||
"""
|
||||
# This method is deprecated and kept for backward compatibility
|
||||
# The process_results method now handles flattened results directly
|
||||
all_results = []
|
||||
|
||||
# Check if we have a flattened structure (single key with all results)
|
||||
if len(search_results) == 1 and "combined" in search_results:
|
||||
return search_results["combined"]
|
||||
|
||||
# Traditional structure with separate engines
|
||||
for engine, results in search_results.items():
|
||||
for result in results:
|
||||
# Ensure all results have the same basic structure
|
||||
normalized_result = {
|
||||
"title": result.get("title", ""),
|
||||
"url": result.get("url", ""),
|
||||
"snippet": result.get("snippet", ""),
|
||||
"source": result.get("source", engine),
|
||||
"domain": self._extract_domain(result.get("url", "")),
|
||||
"timestamp": datetime.now().isoformat(),
|
||||
"raw_data": result
|
||||
}
|
||||
all_results.append(normalized_result)
|
||||
# Add the source if not already present
|
||||
if "source" not in result:
|
||||
result["source"] = engine
|
||||
all_results.append(result)
|
||||
|
||||
return all_results
|
||||
|
||||
|
@ -204,15 +227,33 @@ class ResultCollector:
|
|||
# Use the reranker to rerank the snippets
|
||||
reranked = self.reranker.rerank(query, snippets)
|
||||
|
||||
if not reranked:
|
||||
print("Reranker returned empty results. Using basic scoring instead.")
|
||||
return self._score_and_sort_results(results)
|
||||
|
||||
print(f"Reranked {len(reranked)} results")
|
||||
|
||||
# Create a new list of results based on the reranking
|
||||
reranked_results = []
|
||||
for item in reranked:
|
||||
# Get the original result and add the new score
|
||||
original_result = results[item['index']]
|
||||
index = item.get('index')
|
||||
score = item.get('score')
|
||||
|
||||
if index is None or score is None or index >= len(results):
|
||||
print(f"Warning: Invalid reranker result item: {item}")
|
||||
continue
|
||||
|
||||
original_result = results[index]
|
||||
new_result = original_result.copy()
|
||||
new_result['relevance_score'] = item['score'] * 10 # Scale up the score for consistency
|
||||
new_result['relevance_score'] = float(score) * 10 # Scale up the score for consistency
|
||||
reranked_results.append(new_result)
|
||||
|
||||
# If we didn't get any valid results, fall back to basic scoring
|
||||
if not reranked_results:
|
||||
print("No valid reranked results. Using basic scoring instead.")
|
||||
return self._score_and_sort_results(results)
|
||||
|
||||
return reranked_results
|
||||
except Exception as e:
|
||||
print(f"Error reranking results: {str(e)}")
|
||||
|
|
|
@ -85,21 +85,55 @@ class JinaReranker:
|
|||
"top_n": top_n
|
||||
}
|
||||
|
||||
print(f"Making reranker API call with query: {query[:50]}... and {len(documents)} documents")
|
||||
|
||||
try:
|
||||
response = requests.post(self.endpoint, headers=headers, json=data)
|
||||
print(f"Reranker API response status: {response.status_code}")
|
||||
|
||||
if response.status_code != 200:
|
||||
print(f"Reranker API error: {response.text}")
|
||||
return []
|
||||
|
||||
response.raise_for_status() # Raise exception for HTTP errors
|
||||
|
||||
result = response.json()
|
||||
|
||||
# Process and return the reranked results
|
||||
reranked_results = []
|
||||
for item in result.get('results', []):
|
||||
|
||||
# Check for the specific response structure we observed
|
||||
if "data" in result and isinstance(result["data"], list):
|
||||
data_list = result["data"]
|
||||
for item in data_list:
|
||||
if isinstance(item, dict) and "index" in item and "relevance_score" in item:
|
||||
reranked_results.append({
|
||||
'index': item.get('index'), # Original index in the documents list
|
||||
'score': item.get('score'), # Relevance score
|
||||
'document': documents[item.get('index')] # The actual document content
|
||||
'index': item.get('index'),
|
||||
'score': item.get('relevance_score'),
|
||||
'document': documents[item.get('index')] if item.get('index') < len(documents) else None
|
||||
})
|
||||
# Check other possible response structures
|
||||
elif "results" in result:
|
||||
results_list = result["results"]
|
||||
for item in results_list:
|
||||
if isinstance(item, dict) and "index" in item and "score" in item:
|
||||
reranked_results.append({
|
||||
'index': item.get('index'),
|
||||
'score': item.get('score'),
|
||||
'document': documents[item.get('index')] if item.get('index') < len(documents) else None
|
||||
})
|
||||
elif "documents" in result:
|
||||
# Alternative API response structure
|
||||
docs_list = result["documents"]
|
||||
for i, doc in enumerate(docs_list):
|
||||
if isinstance(doc, dict) and "score" in doc:
|
||||
reranked_results.append({
|
||||
'index': i,
|
||||
'score': doc.get('score'),
|
||||
'document': documents[i]
|
||||
})
|
||||
|
||||
print(f"Processed reranker results: {len(reranked_results)} items")
|
||||
return reranked_results
|
||||
|
||||
except Exception as e:
|
||||
|
|
|
@ -73,6 +73,11 @@ def test_reranker():
|
|||
result["engine"] = engine
|
||||
flattened_results.append(result)
|
||||
|
||||
# Verify that the query is in the flattened results
|
||||
if flattened_results:
|
||||
print(f"\nVerifying query in flattened results:")
|
||||
print(f"Query in first result: {flattened_results[0].get('query', 'NOT FOUND')[:50]}...")
|
||||
|
||||
# Process results without reranking
|
||||
print("\nProcessing results without reranking...")
|
||||
basic_results = result_collector.process_results(
|
||||
|
|
Loading…
Reference in New Issue