Integrate Jina Reranker with ResultCollector for semantic ranking

2025-02-27 16:59:54 -06:00 · 2025-02-27 16:59:54 -06:00 · 16720d04c7
parent fc74a879b3
commit 16720d04c7
4 changed files with 214 additions and 7 deletions
--- a/.gradio/certificate.pem
+++ b/.gradio/certificate.pem
@ -0,0 +1,31 @@
+-----BEGIN CERTIFICATE-----
+MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
+TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
+cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
+WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
+ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
+MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
+h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
+0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
+A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
+T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
+B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
+B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
+KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
+OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
+jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
+qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
+rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
+HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
+hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
+ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
+3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
+NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
+ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
+TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
+jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
+oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
+4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
+mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
+emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
+-----END CERTIFICATE-----
--- a/execution/result_collector.py
+++ b/execution/result_collector.py
@ -10,6 +10,8 @@ from typing import Dict, List, Any, Optional, Set
 from urllib.parse import urlparse
 from datetime import datetime

+from ranking.jina_reranker import get_jina_reranker
+

 class ResultCollector:
    """
@ -19,12 +21,18 @@ class ResultCollector:

    def __init__(self):
        """Initialize the result collector."""
-        pass
+        try:
+            self.reranker = get_jina_reranker()
+            self.reranker_available = True
+        except ValueError:
+            print("Jina Reranker not available. Will use basic scoring instead.")
+            self.reranker_available = False

    def process_results(self, 
                       search_results: Dict[str, List[Dict[str, Any]]],
                       dedup: bool = True,
-                       max_results: Optional[int] = None) -> List[Dict[str, Any]]:
+                       max_results: Optional[int] = None,
+                       use_reranker: bool = True) -> List[Dict[str, Any]]:
        """
        Process search results from multiple search engines.

@ -32,6 +40,7 @@ class ResultCollector:
            search_results: Dictionary mapping search engine names to lists of search results
            dedup: Whether to deduplicate results based on URL
            max_results: Maximum number of results to return (after processing)
+            use_reranker: Whether to use the Jina Reranker for semantic ranking

        Returns:
            List of processed search results
@ -43,8 +52,12 @@ class ResultCollector:
        if dedup:
            all_results = self._deduplicate_results(all_results)
        
-        # Sort results by relevance (using a simple scoring algorithm)
-        all_results = self._score_and_sort_results(all_results)
+        # Use reranker if available and requested, otherwise use basic scoring
+        if use_reranker and self.reranker_available:
+            all_results = self._rerank_results(all_results)
+        else:
+            # Sort results by relevance (using a simple scoring algorithm)
+            all_results = self._score_and_sort_results(all_results)
        
        # Limit results if requested
        if max_results is not None:
@ -152,6 +165,52 @@ class ResultCollector:
        
        return sorted_results

+    def _rerank_results(self, results: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+        """
+        Rerank results semantically using the Jina Reranker.
+
+        The query text is read from the "query" key of the results. Falls back
+        to _score_and_sort_results when no result carries a query or when the
+        reranker call fails.
+
+        Args:
+            results: List of search results
+
+        Returns:
+            Copies of the results in the order returned by the reranker, each
+            with 'relevance_score' set to the reranker's score
+        """
+        if not results:
+            return []
+
+        # Use the first non-empty query instead of only looking at results[0],
+        # so one untagged leading result does not disable reranking.
+        query = next((r["query"] for r in results if r.get("query")), "")
+        if not query:
+            print("Warning: No query found in results. Using basic scoring instead.")
+            return self._score_and_sort_results(results)
+
+        # Combine title and snippet for better reranking; `or ''` keeps an
+        # explicit None value from being rendered as the text "None".
+        snippets = [
+            f"{r.get('title') or ''} {r.get('snippet') or ''}" for r in results
+        ]
+
+        try:
+            reranked = self.reranker.rerank(query, snippets)
+            # Each reranked item points back at its source result by index.
+            reranked_results = []
+            for item in reranked:
+                new_result = results[item['index']].copy()
+                new_result['relevance_score'] = item['score']
+                reranked_results.append(new_result)
+            return reranked_results
+        except Exception as e:
+            # Best effort: any reranker failure degrades to basic scoring
+            # rather than failing the whole search.
+            print(f"Error reranking results: {str(e)}")
+            return self._score_and_sort_results(results)
+
    def _extract_domain(self, url: str) -> str:
        """
        Extract the domain from a URL.
--- a/test_reranker.py
+++ b/test_reranker.py
@ -0,0 +1,106 @@
+"""
+Test script for the Jina Reranker integration.
+This script tests the reranker functionality by comparing results with and without reranking.
+"""
+
+import json
+import time
+from pathlib import Path
+
+from query.query_processor import QueryProcessor
+from execution.search_executor import SearchExecutor
+from execution.result_collector import ResultCollector
+from ranking.jina_reranker import get_jina_reranker
+
+
+def test_reranker():
+    """
+    Compare basic scoring with Jina reranking on one live search query.
+
+    Saves both rankings as JSON under ./results next to this file and
+    prints the top 5 of each; the reranked half is skipped when
+    get_jina_reranker() raises ValueError.
+    """
+    # Initialize components
+    query_processor = QueryProcessor()
+    search_executor = SearchExecutor()
+    result_collector = ResultCollector()
+
+    # Check if reranker is available (the instance itself is not needed here)
+    try:
+        get_jina_reranker()
+        reranker_available = True
+        print("Jina Reranker is available.")
+    except ValueError:
+        reranker_available = False
+        print("Jina Reranker is not available. Will only test basic scoring.")
+
+    # Process a test query
+    query = "What are the latest advancements in quantum computing?"
+    print(f"Processing query: {query}")
+
+    processed_query = query_processor.process_query(query)
+    print(f"Processed query: {processed_query}")
+
+    # Fall back to every available engine if the processed query names none
+    available_engines = search_executor.get_available_search_engines()
+    print(f"Available search engines: {available_engines}")
+    if 'search_engines' not in processed_query:
+        processed_query['search_engines'] = available_engines
+
+    # Execute the search
+    search_results = search_executor.execute_search(structured_query=processed_query, num_results=10)
+
+    # Print which engines returned results
+    for engine, results in search_results.items():
+        print(f"Engine {engine} returned {len(results)} results")
+
+    # Add the query to each result for reranking (resolved once, not per result)
+    rerank_query = processed_query.get("enhanced_query", processed_query.get("original_query", query))
+    for results in search_results.values():
+        for result in results:
+            result["query"] = rerank_query
+
+    # Process results without reranking
+    print("\nProcessing results without reranking...")
+    basic_results = result_collector.process_results(
+        search_results, dedup=True, max_results=None, use_reranker=False
+    )
+    print(f"Processed {len(basic_results)} results with basic scoring")
+
+    # Save basic results
+    results_dir = Path(__file__).parent / "results"
+    results_dir.mkdir(exist_ok=True)
+
+    timestamp = int(time.time())
+    basic_file = results_dir / f"basic_results_{timestamp}.json"
+    with open(basic_file, "w") as f:
+        json.dump(basic_results, f, indent=2)
+    print(f"Basic results saved to {basic_file}")
+
+    if not reranker_available:
+        return  # nothing to compare against
+
+    # Process results with reranking
+    print("\nProcessing results with reranking...")
+    reranked_results = result_collector.process_results(
+        search_results, dedup=True, max_results=None, use_reranker=True
+    )
+    print(f"Processed {len(reranked_results)} results with reranking")
+
+    # Save reranked results
+    reranked_file = results_dir / f"reranked_results_{timestamp}.json"
+    with open(reranked_file, "w") as f:
+        json.dump(reranked_results, f, indent=2)
+    print(f"Reranked results saved to {reranked_file}")
+
+    # Compare top 5 results
+    print("\nComparing top 5 results:")
+    for label, ranked in (("basic scoring", basic_results), ("reranking", reranked_results)):
+        print(f"\nTop 5 results with {label}:")
+        for i, result in enumerate(ranked[:5], start=1):
+            print(f"{i}. {result.get('title')} (Score: {result.get('relevance_score')}, Source: {result.get('source')})")
+
+
+if __name__ == "__main__":
+    test_reranker()
--- a/ui/gradio_interface.py
+++ b/ui/gradio_interface.py
@ -29,13 +29,14 @@ class GradioInterface:
        self.results_dir = Path(__file__).parent.parent / "results"
        self.results_dir.mkdir(exist_ok=True)

-    def process_query(self, query, num_results=10):
+    def process_query(self, query, num_results=10, use_reranker=True):
        """
        Process a query and return the results.
        
        Args:
            query (str): The query to process
            num_results (int): Number of results to return
+            use_reranker (bool): Whether to use the Jina Reranker for semantic ranking
            
        Returns:
            tuple: (markdown_results, json_results_path)
@ -72,10 +73,15 @@ class GradioInterface:
            for engine, results in search_results.items():
                print(f"Engine {engine} returned {len(results)} results")
            
+            # Add the query to each result for reranking
+            for engine, results in search_results.items():
+                for result in results:
+                    result["query"] = processed_query.get("enhanced_query", processed_query.get("original_query", query))
+            
            # Process the results - don't limit the number of results
            print(f"Processing results...")
            processed_results = self.result_collector.process_results(
-                search_results, dedup=True, max_results=None
+                search_results, dedup=True, max_results=None, use_reranker=use_reranker
            )
            print(f"Processed {len(processed_results)} results")
            
@ -187,6 +193,11 @@ class GradioInterface:
                        step=5,
                        label="Results Per Engine"
                    )
+                    use_reranker = gr.Checkbox(
+                        label="Use Semantic Reranker",
+                        value=True,
+                        info="Uses Jina AI's reranker for more relevant results"
+                    )
                    search_button = gr.Button("Search", variant="primary")
            
            gr.Examples(
@ -211,7 +222,7 @@ class GradioInterface:
            
            search_button.click(
                fn=self.process_query,
-                inputs=[query_input, num_results],
+                inputs=[query_input, num_results, use_reranker],
                outputs=[results_output, file_output]
            )