Fix Jina Reranker API integration to handle response format correctly

2025-02-27 17:07:51 -06:00 · 2025-02-27 17:07:51 -06:00 · 16c7dca2c7
parent 59bf4a22ee
commit 16c7dca2c7
3 changed files with 115 additions and 35 deletions
--- a/execution/result_collector.py
+++ b/execution/result_collector.py
@@ -37,29 +37,52 @@ class ResultCollector:
        Process search results from multiple search engines.
        
        Args:
-            search_results: Dictionary mapping search engine names to lists of search results
-            dedup: Whether to deduplicate results based on URL
-            max_results: Maximum number of results to return (after processing)
+            search_results: Dictionary mapping search engine names to lists of results
+            dedup: Whether to deduplicate results
+            max_results: Maximum number of results to return
            use_reranker: Whether to use the Jina Reranker for semantic ranking
            
        Returns:
            List of processed search results
        """
-        # Flatten and normalize results
-        all_results = self._flatten_results(search_results)
+        # Combine results from all search engines
+        all_results = []
+        
+        # Check if we have a flattened structure (single key with all results)
+        if len(search_results) == 1 and "combined" in search_results:
+            all_results = search_results["combined"]
+            print(f"Processing {len(all_results)} combined results")
+        else:
+            # Traditional structure with separate engines
+            for engine, results in search_results.items():
+                for result in results:
+                    # Add the source if not already present
+                    if "source" not in result:
+                        result["source"] = engine
+                    all_results.append(result)
+            print(f"Processing {len(all_results)} results from {len(search_results)} engines")
        
        # Deduplicate results if requested
        if dedup:
            all_results = self._deduplicate_results(all_results)
+            print(f"Deduplicated to {len(all_results)} results")
        
-        # Use reranker if available and requested, otherwise use basic scoring
-        if use_reranker and self.reranker_available:
+        # Use the reranker if available and requested
+        if use_reranker and self.reranker is not None:
+            try:
+                print("Using Jina Reranker for semantic ranking")
                all_results = self._rerank_results(all_results)
+                print(f"Reranked {len(all_results)} results")
+            except Exception as e:
+                print(f"Error using reranker: {str(e)}")
+                # Fall back to basic scoring
+                all_results = self._score_and_sort_results(all_results)
        else:
-            # Sort results by relevance (using a simple scoring algorithm)
+            # Use basic scoring
+            print("Using basic scoring")
            all_results = self._score_and_sort_results(all_results)
        
-        # Limit results if requested
+        # Limit the number of results if requested
        if max_results is not None:
            all_results = all_results[:max_results]
        
@@ -67,29 +90,29 @@ class ResultCollector:

    def _flatten_results(self, search_results: Dict[str, List[Dict[str, Any]]]) -> List[Dict[str, Any]]:
        """
-        Flatten results from multiple search engines into a single list.
+        Flatten search results from multiple search engines into a single list.
        
        Args:
-            search_results: Dictionary mapping search engine names to lists of search results
+            search_results: Dictionary mapping search engine names to lists of results
            
        Returns:
            Flattened list of search results
        """
+        # This method is deprecated and kept for backward compatibility
+        # The process_results method now handles flattened results directly
        all_results = []
        
+        # Check if we have a flattened structure (single key with all results)
+        if len(search_results) == 1 and "combined" in search_results:
+            return search_results["combined"]
+        
+        # Traditional structure with separate engines
        for engine, results in search_results.items():
            for result in results:
-                # Ensure all results have the same basic structure
-                normalized_result = {
-                    "title": result.get("title", ""),
-                    "url": result.get("url", ""),
-                    "snippet": result.get("snippet", ""),
-                    "source": result.get("source", engine),
-                    "domain": self._extract_domain(result.get("url", "")),
-                    "timestamp": datetime.now().isoformat(),
-                    "raw_data": result
-                }
-                all_results.append(normalized_result)
+                # Add the source if not already present
+                if "source" not in result:
+                    result["source"] = engine
+                all_results.append(result)
        
        return all_results

@@ -204,15 +227,33 @@ class ResultCollector:
            # Use the reranker to rerank the snippets
            reranked = self.reranker.rerank(query, snippets)
            
+            if not reranked:
+                print("Reranker returned empty results. Using basic scoring instead.")
+                return self._score_and_sort_results(results)
+                
+            print(f"Reranked {len(reranked)} results")
+            
            # Create a new list of results based on the reranking
            reranked_results = []
            for item in reranked:
                # Get the original result and add the new score
-                original_result = results[item['index']]
+                index = item.get('index')
+                score = item.get('score')
+                
+                if index is None or score is None or index >= len(results):
+                    print(f"Warning: Invalid reranker result item: {item}")
+                    continue
+                    
+                original_result = results[index]
                new_result = original_result.copy()
-                new_result['relevance_score'] = item['score'] * 10  # Scale up the score for consistency
+                new_result['relevance_score'] = float(score) * 10  # Scale up the score for consistency
                reranked_results.append(new_result)
            
+            # If we didn't get any valid results, fall back to basic scoring
+            if not reranked_results:
+                print("No valid reranked results. Using basic scoring instead.")
+                return self._score_and_sort_results(results)
+                
            return reranked_results
        except Exception as e:
            print(f"Error reranking results: {str(e)}")
--- a/ranking/jina_reranker.py
+++ b/ranking/jina_reranker.py
@@ -85,21 +85,55 @@ class JinaReranker:
            "top_n": top_n
        }
        
+        print(f"Making reranker API call with query: {query[:50]}... and {len(documents)} documents")
+        
        try:
            response = requests.post(self.endpoint, headers=headers, json=data)
+            print(f"Reranker API response status: {response.status_code}")
+            
+            if response.status_code != 200:
+                print(f"Reranker API error: {response.text}")
+                return []
+                
            response.raise_for_status()  # Raise exception for HTTP errors
            
            result = response.json()
            
            # Process and return the reranked results
            reranked_results = []
-            for item in result.get('results', []):
+            
+            # Check for the specific response structure we observed
+            if "data" in result and isinstance(result["data"], list):
+                data_list = result["data"]
+                for item in data_list:
+                    if isinstance(item, dict) and "index" in item and "relevance_score" in item:
                        reranked_results.append({
-                    'index': item.get('index'),  # Original index in the documents list
-                    'score': item.get('score'),  # Relevance score
-                    'document': documents[item.get('index')]  # The actual document content
+                            'index': item.get('index'),
+                            'score': item.get('relevance_score'),
+                            'document': documents[item.get('index')] if item.get('index') < len(documents) else None
+                        })
+            # Check other possible response structures
+            elif "results" in result:
+                results_list = result["results"]
+                for item in results_list:
+                    if isinstance(item, dict) and "index" in item and "score" in item:
+                        reranked_results.append({
+                            'index': item.get('index'),
+                            'score': item.get('score'),
+                            'document': documents[item.get('index')] if item.get('index') < len(documents) else None
+                        })
+            elif "documents" in result:
+                # Alternative API response structure
+                docs_list = result["documents"]
+                for i, doc in enumerate(docs_list):
+                    if isinstance(doc, dict) and "score" in doc:
+                        reranked_results.append({
+                            'index': i,
+                            'score': doc.get('score'),
+                            'document': documents[i]
                        })
            
+            print(f"Processed reranker results: {len(reranked_results)} items")
            return reranked_results
        
        except Exception as e:
--- a/test_reranker.py
+++ b/test_reranker.py
@@ -73,6 +73,11 @@ def test_reranker():
            result["engine"] = engine
            flattened_results.append(result)
    
+    # Verify that the query is in the flattened results
+    if flattened_results:
+        print(f"\nVerifying query in flattened results:")
+        print(f"Query in first result: {flattened_results[0].get('query', 'NOT FOUND')[:50]}...")
+    
    # Process results without reranking
    print("\nProcessing results without reranking...")
    basic_results = result_collector.process_results(