Fix Jina Reranker API integration to handle response format correctly

2025-02-27 17:07:51 -06:00 · 2025-02-27 17:07:51 -06:00 · 16c7dca2c7
parent 59bf4a22ee
commit 16c7dca2c7
3 changed files with 115 additions and 35 deletions
--- a/execution/result_collector.py
+++ b/execution/result_collector.py
@ -35,31 +35,54 @@ class ResultCollector:
                       use_reranker: bool = True) -> List[Dict[str, Any]]:
        """
        Process search results from multiple search engines.
-
+        
        Args:
-            search_results: Dictionary mapping search engine names to lists of search results
+            search_results: Dictionary mapping search engine names to lists of results
-            dedup: Whether to deduplicate results based on URL
+            dedup: Whether to deduplicate results
-            max_results: Maximum number of results to return (after processing)
+            max_results: Maximum number of results to return
            use_reranker: Whether to use the Jina Reranker for semantic ranking
-
+            
        Returns:
            List of processed search results
        """
-        # Flatten and normalize results
+        # Combine results from all search engines
-        all_results = self._flatten_results(search_results)
+        all_results = []
        # Check if we have a flattened structure (single key with all results)
        if len(search_results) == 1 and "combined" in search_results:
            all_results = search_results["combined"]
            print(f"Processing {len(all_results)} combined results")
        else:
            # Traditional structure with separate engines
            for engine, results in search_results.items():
                for result in results:
                    # Add the source if not already present
                    if "source" not in result:
                        result["source"] = engine
                    all_results.append(result)
            print(f"Processing {len(all_results)} results from {len(search_results)} engines")
        # Deduplicate results if requested
        if dedup:
            all_results = self._deduplicate_results(all_results)
            print(f"Deduplicated to {len(all_results)} results")
-        # Use reranker if available and requested, otherwise use basic scoring
+        # Use the reranker if available and requested
-        if use_reranker and self.reranker_available:
+        if use_reranker and self.reranker is not None:
-            all_results = self._rerank_results(all_results)
+            try:
                print("Using Jina Reranker for semantic ranking")
                all_results = self._rerank_results(all_results)
                print(f"Reranked {len(all_results)} results")
            except Exception as e:
                print(f"Error using reranker: {str(e)}")
                # Fall back to basic scoring
                all_results = self._score_and_sort_results(all_results)
        else:
-            # Sort results by relevance (using a simple scoring algorithm)
+            # Use basic scoring
            print("Using basic scoring")
            all_results = self._score_and_sort_results(all_results)
-        # Limit results if requested
+        # Limit the number of results if requested
        if max_results is not None:
            all_results = all_results[:max_results]
@ -67,29 +90,29 @@ class ResultCollector:
    def _flatten_results(self, search_results: Dict[str, List[Dict[str, Any]]]) -> List[Dict[str, Any]]:
        """
-        Flatten results from multiple search engines into a single list.
+        Flatten search results from multiple search engines into a single list.
-
+        
        Args:
-            search_results: Dictionary mapping search engine names to lists of search results
+            search_results: Dictionary mapping search engine names to lists of results
-
+            
        Returns:
            Flattened list of search results
        """
        # This method is deprecated and kept for backward compatibility
        # The process_results method now handles flattened results directly
        all_results = []
        # Check if we have a flattened structure (single key with all results)
        if len(search_results) == 1 and "combined" in search_results:
            return search_results["combined"]
        # Traditional structure with separate engines
        for engine, results in search_results.items():
            for result in results:
-                # Ensure all results have the same basic structure
+                # Add the source if not already present
-                normalized_result = {
+                if "source" not in result:
-                    "title": result.get("title", ""),
+                    result["source"] = engine
-                    "url": result.get("url", ""),
+                all_results.append(result)
                    "snippet": result.get("snippet", ""),
                    "source": result.get("source", engine),
                    "domain": self._extract_domain(result.get("url", "")),
                    "timestamp": datetime.now().isoformat(),
                    "raw_data": result
                }
                all_results.append(normalized_result)
        return all_results
@ -204,15 +227,33 @@ class ResultCollector:
            # Use the reranker to rerank the snippets
            reranked = self.reranker.rerank(query, snippets)
            if not reranked:
                print("Reranker returned empty results. Using basic scoring instead.")
                return self._score_and_sort_results(results)
            print(f"Reranked {len(reranked)} results")
            # Create a new list of results based on the reranking
            reranked_results = []
            for item in reranked:
                # Get the original result and add the new score
-                original_result = results[item['index']]
+                index = item.get('index')
                score = item.get('score')
                if index is None or score is None or index >= len(results):
                    print(f"Warning: Invalid reranker result item: {item}")
                    continue
                original_result = results[index]
                new_result = original_result.copy()
-                new_result['relevance_score'] = item['score'] * 10  # Scale up the score for consistency
+                new_result['relevance_score'] = float(score) * 10  # Scale up the score for consistency
                reranked_results.append(new_result)
            # If we didn't get any valid results, fall back to basic scoring
            if not reranked_results:
                print("No valid reranked results. Using basic scoring instead.")
                return self._score_and_sort_results(results)
            return reranked_results
        except Exception as e:
            print(f"Error reranking results: {str(e)}")
--- a/ranking/jina_reranker.py
+++ b/ranking/jina_reranker.py
@ -85,21 +85,55 @@ class JinaReranker:
            "top_n": top_n
        }
        print(f"Making reranker API call with query: {query[:50]}... and {len(documents)} documents")
        try:
            response = requests.post(self.endpoint, headers=headers, json=data)
            print(f"Reranker API response status: {response.status_code}")
            if response.status_code != 200:
                print(f"Reranker API error: {response.text}")
                return []
            response.raise_for_status()  # Raise exception for HTTP errors
            result = response.json()
            # Process and return the reranked results
            reranked_results = []
            for item in result.get('results', []):
                reranked_results.append({
                    'index': item.get('index'),  # Original index in the documents list
                    'score': item.get('score'),  # Relevance score
                    'document': documents[item.get('index')]  # The actual document content
                })
            # Check for the specific response structure we observed
            if "data" in result and isinstance(result["data"], list):
                data_list = result["data"]
                for item in data_list:
                    if isinstance(item, dict) and "index" in item and "relevance_score" in item:
                        reranked_results.append({
                            'index': item.get('index'),
                            'score': item.get('relevance_score'),
                            'document': documents[item.get('index')] if item.get('index') < len(documents) else None
                        })
            # Check other possible response structures
            elif "results" in result:
                results_list = result["results"]
                for item in results_list:
                    if isinstance(item, dict) and "index" in item and "score" in item:
                        reranked_results.append({
                            'index': item.get('index'),
                            'score': item.get('score'),
                            'document': documents[item.get('index')] if item.get('index') < len(documents) else None
                        })
            elif "documents" in result:
                # Alternative API response structure
                docs_list = result["documents"]
                for i, doc in enumerate(docs_list):
                    if isinstance(doc, dict) and "score" in doc:
                        reranked_results.append({
                            'index': i,
                            'score': doc.get('score'),
                            'document': documents[i]
                        })
            print(f"Processed reranker results: {len(reranked_results)} items")
            return reranked_results
        except Exception as e:
--- a/test_reranker.py
+++ b/test_reranker.py
@ -73,6 +73,11 @@ def test_reranker():
            result["engine"] = engine
            flattened_results.append(result)
    # Verify that the query is in the flattened results
    if flattened_results:
        print(f"\nVerifying query in flattened results:")
        print(f"Query in first result: {flattened_results[0].get('query', 'NOT FOUND')[:50]}...")
    # Process results without reranking
    print("\nProcessing results without reranking...")
    basic_results = result_collector.process_results(