Fix Jina Reranker API integration to handle response format correctly
This commit is contained in:
parent
59bf4a22ee
commit
16c7dca2c7
|
@@ -35,31 +35,54 @@ class ResultCollector:
|
||||||
use_reranker: bool = True) -> List[Dict[str, Any]]:
|
use_reranker: bool = True) -> List[Dict[str, Any]]:
|
||||||
"""
|
"""
|
||||||
Process search results from multiple search engines.
|
Process search results from multiple search engines.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
search_results: Dictionary mapping search engine names to lists of search results
|
search_results: Dictionary mapping search engine names to lists of results
|
||||||
dedup: Whether to deduplicate results based on URL
|
dedup: Whether to deduplicate results
|
||||||
max_results: Maximum number of results to return (after processing)
|
max_results: Maximum number of results to return
|
||||||
use_reranker: Whether to use the Jina Reranker for semantic ranking
|
use_reranker: Whether to use the Jina Reranker for semantic ranking
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
List of processed search results
|
List of processed search results
|
||||||
"""
|
"""
|
||||||
# Flatten and normalize results
|
# Combine results from all search engines
|
||||||
all_results = self._flatten_results(search_results)
|
all_results = []
|
||||||
|
|
||||||
|
# Check if we have a flattened structure (single key with all results)
|
||||||
|
if len(search_results) == 1 and "combined" in search_results:
|
||||||
|
all_results = search_results["combined"]
|
||||||
|
print(f"Processing {len(all_results)} combined results")
|
||||||
|
else:
|
||||||
|
# Traditional structure with separate engines
|
||||||
|
for engine, results in search_results.items():
|
||||||
|
for result in results:
|
||||||
|
# Add the source if not already present
|
||||||
|
if "source" not in result:
|
||||||
|
result["source"] = engine
|
||||||
|
all_results.append(result)
|
||||||
|
print(f"Processing {len(all_results)} results from {len(search_results)} engines")
|
||||||
|
|
||||||
# Deduplicate results if requested
|
# Deduplicate results if requested
|
||||||
if dedup:
|
if dedup:
|
||||||
all_results = self._deduplicate_results(all_results)
|
all_results = self._deduplicate_results(all_results)
|
||||||
|
print(f"Deduplicated to {len(all_results)} results")
|
||||||
|
|
||||||
# Use reranker if available and requested, otherwise use basic scoring
|
# Use the reranker if available and requested
|
||||||
if use_reranker and self.reranker_available:
|
if use_reranker and self.reranker is not None:
|
||||||
all_results = self._rerank_results(all_results)
|
try:
|
||||||
|
print("Using Jina Reranker for semantic ranking")
|
||||||
|
all_results = self._rerank_results(all_results)
|
||||||
|
print(f"Reranked {len(all_results)} results")
|
||||||
|
except Exception as e:
|
||||||
|
print(f"Error using reranker: {str(e)}")
|
||||||
|
# Fall back to basic scoring
|
||||||
|
all_results = self._score_and_sort_results(all_results)
|
||||||
else:
|
else:
|
||||||
# Sort results by relevance (using a simple scoring algorithm)
|
# Use basic scoring
|
||||||
|
print("Using basic scoring")
|
||||||
all_results = self._score_and_sort_results(all_results)
|
all_results = self._score_and_sort_results(all_results)
|
||||||
|
|
||||||
# Limit results if requested
|
# Limit the number of results if requested
|
||||||
if max_results is not None:
|
if max_results is not None:
|
||||||
all_results = all_results[:max_results]
|
all_results = all_results[:max_results]
|
||||||
|
|
||||||
|
@@ -67,29 +90,29 @@ class ResultCollector:
|
||||||
|
|
||||||
def _flatten_results(self, search_results: Dict[str, List[Dict[str, Any]]]) -> List[Dict[str, Any]]:
|
def _flatten_results(self, search_results: Dict[str, List[Dict[str, Any]]]) -> List[Dict[str, Any]]:
|
||||||
"""
|
"""
|
||||||
Flatten results from multiple search engines into a single list.
|
Flatten search results from multiple search engines into a single list.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
search_results: Dictionary mapping search engine names to lists of search results
|
search_results: Dictionary mapping search engine names to lists of results
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Flattened list of search results
|
Flattened list of search results
|
||||||
"""
|
"""
|
||||||
|
# This method is deprecated and kept for backward compatibility
|
||||||
|
# The process_results method now handles flattened results directly
|
||||||
all_results = []
|
all_results = []
|
||||||
|
|
||||||
|
# Check if we have a flattened structure (single key with all results)
|
||||||
|
if len(search_results) == 1 and "combined" in search_results:
|
||||||
|
return search_results["combined"]
|
||||||
|
|
||||||
|
# Traditional structure with separate engines
|
||||||
for engine, results in search_results.items():
|
for engine, results in search_results.items():
|
||||||
for result in results:
|
for result in results:
|
||||||
# Ensure all results have the same basic structure
|
# Add the source if not already present
|
||||||
normalized_result = {
|
if "source" not in result:
|
||||||
"title": result.get("title", ""),
|
result["source"] = engine
|
||||||
"url": result.get("url", ""),
|
all_results.append(result)
|
||||||
"snippet": result.get("snippet", ""),
|
|
||||||
"source": result.get("source", engine),
|
|
||||||
"domain": self._extract_domain(result.get("url", "")),
|
|
||||||
"timestamp": datetime.now().isoformat(),
|
|
||||||
"raw_data": result
|
|
||||||
}
|
|
||||||
all_results.append(normalized_result)
|
|
||||||
|
|
||||||
return all_results
|
return all_results
|
||||||
|
|
||||||
|
@@ -204,15 +227,33 @@ class ResultCollector:
|
||||||
# Use the reranker to rerank the snippets
|
# Use the reranker to rerank the snippets
|
||||||
reranked = self.reranker.rerank(query, snippets)
|
reranked = self.reranker.rerank(query, snippets)
|
||||||
|
|
||||||
|
if not reranked:
|
||||||
|
print("Reranker returned empty results. Using basic scoring instead.")
|
||||||
|
return self._score_and_sort_results(results)
|
||||||
|
|
||||||
|
print(f"Reranked {len(reranked)} results")
|
||||||
|
|
||||||
# Create a new list of results based on the reranking
|
# Create a new list of results based on the reranking
|
||||||
reranked_results = []
|
reranked_results = []
|
||||||
for item in reranked:
|
for item in reranked:
|
||||||
# Get the original result and add the new score
|
# Get the original result and add the new score
|
||||||
original_result = results[item['index']]
|
index = item.get('index')
|
||||||
|
score = item.get('score')
|
||||||
|
|
||||||
|
if index is None or score is None or index >= len(results):
|
||||||
|
print(f"Warning: Invalid reranker result item: {item}")
|
||||||
|
continue
|
||||||
|
|
||||||
|
original_result = results[index]
|
||||||
new_result = original_result.copy()
|
new_result = original_result.copy()
|
||||||
new_result['relevance_score'] = item['score'] * 10 # Scale up the score for consistency
|
new_result['relevance_score'] = float(score) * 10 # Scale up the score for consistency
|
||||||
reranked_results.append(new_result)
|
reranked_results.append(new_result)
|
||||||
|
|
||||||
|
# If we didn't get any valid results, fall back to basic scoring
|
||||||
|
if not reranked_results:
|
||||||
|
print("No valid reranked results. Using basic scoring instead.")
|
||||||
|
return self._score_and_sort_results(results)
|
||||||
|
|
||||||
return reranked_results
|
return reranked_results
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"Error reranking results: {str(e)}")
|
print(f"Error reranking results: {str(e)}")
|
||||||
|
|
|
@@ -85,21 +85,55 @@ class JinaReranker:
|
||||||
"top_n": top_n
|
"top_n": top_n
|
||||||
}
|
}
|
||||||
|
|
||||||
|
print(f"Making reranker API call with query: {query[:50]}... and {len(documents)} documents")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
response = requests.post(self.endpoint, headers=headers, json=data)
|
response = requests.post(self.endpoint, headers=headers, json=data)
|
||||||
|
print(f"Reranker API response status: {response.status_code}")
|
||||||
|
|
||||||
|
if response.status_code != 200:
|
||||||
|
print(f"Reranker API error: {response.text}")
|
||||||
|
return []
|
||||||
|
|
||||||
response.raise_for_status() # Raise exception for HTTP errors
|
response.raise_for_status() # Raise exception for HTTP errors
|
||||||
|
|
||||||
result = response.json()
|
result = response.json()
|
||||||
|
|
||||||
# Process and return the reranked results
|
# Process and return the reranked results
|
||||||
reranked_results = []
|
reranked_results = []
|
||||||
for item in result.get('results', []):
|
|
||||||
reranked_results.append({
|
|
||||||
'index': item.get('index'), # Original index in the documents list
|
|
||||||
'score': item.get('score'), # Relevance score
|
|
||||||
'document': documents[item.get('index')] # The actual document content
|
|
||||||
})
|
|
||||||
|
|
||||||
|
# Check for the specific response structure we observed
|
||||||
|
if "data" in result and isinstance(result["data"], list):
|
||||||
|
data_list = result["data"]
|
||||||
|
for item in data_list:
|
||||||
|
if isinstance(item, dict) and "index" in item and "relevance_score" in item:
|
||||||
|
reranked_results.append({
|
||||||
|
'index': item.get('index'),
|
||||||
|
'score': item.get('relevance_score'),
|
||||||
|
'document': documents[item.get('index')] if item.get('index') < len(documents) else None
|
||||||
|
})
|
||||||
|
# Check other possible response structures
|
||||||
|
elif "results" in result:
|
||||||
|
results_list = result["results"]
|
||||||
|
for item in results_list:
|
||||||
|
if isinstance(item, dict) and "index" in item and "score" in item:
|
||||||
|
reranked_results.append({
|
||||||
|
'index': item.get('index'),
|
||||||
|
'score': item.get('score'),
|
||||||
|
'document': documents[item.get('index')] if item.get('index') < len(documents) else None
|
||||||
|
})
|
||||||
|
elif "documents" in result:
|
||||||
|
# Alternative API response structure
|
||||||
|
docs_list = result["documents"]
|
||||||
|
for i, doc in enumerate(docs_list):
|
||||||
|
if isinstance(doc, dict) and "score" in doc:
|
||||||
|
reranked_results.append({
|
||||||
|
'index': i,
|
||||||
|
'score': doc.get('score'),
|
||||||
|
'document': documents[i]
|
||||||
|
})
|
||||||
|
|
||||||
|
print(f"Processed reranker results: {len(reranked_results)} items")
|
||||||
return reranked_results
|
return reranked_results
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
|
|
@@ -73,6 +73,11 @@ def test_reranker():
|
||||||
result["engine"] = engine
|
result["engine"] = engine
|
||||||
flattened_results.append(result)
|
flattened_results.append(result)
|
||||||
|
|
||||||
|
# Verify that the query is in the flattened results
|
||||||
|
if flattened_results:
|
||||||
|
print(f"\nVerifying query in flattened results:")
|
||||||
|
print(f"Query in first result: {flattened_results[0].get('query', 'NOT FOUND')[:50]}...")
|
||||||
|
|
||||||
# Process results without reranking
|
# Process results without reranking
|
||||||
print("\nProcessing results without reranking...")
|
print("\nProcessing results without reranking...")
|
||||||
basic_results = result_collector.process_results(
|
basic_results = result_collector.process_results(
|
||||||
|
|
Loading…
Reference in New Issue