diff --git a/.gradio/certificate.pem b/.gradio/certificate.pem new file mode 100644 index 0000000..b85c803 --- /dev/null +++ b/.gradio/certificate.pem @@ -0,0 +1,31 @@ +-----BEGIN CERTIFICATE----- +MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw +TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh +cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4 +WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu +ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY +MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc +h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+ +0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U +A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW +T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH +B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC +B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv +KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn +OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn +jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw +qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI +rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV +HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq +hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL +ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ +3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK +NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5 +ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur +TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC +jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc +oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq +4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA 
def __init__(self):
    """
    Initialize the result collector.

    Tries to obtain the Jina Reranker via ``get_jina_reranker()``. If that
    raises ``ValueError`` the collector records that semantic reranking is
    unavailable so that basic scoring can be used instead.
    """
    try:
        self.reranker = get_jina_reranker()
        self.reranker_available = True
    except ValueError:
        print("Jina Reranker not available. Will use basic scoring instead.")
        # Define the attribute on the failure path too, so it exists on every
        # instance regardless of how construction went.
        self.reranker = None
        self.reranker_available = False
def _rerank_results(self, results: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """
    Rerank results using the Jina Reranker.

    Each result's title and snippet are joined into one document and passed,
    together with the query, to ``self.reranker.rerank``. Every item the
    reranker returns is read for an ``index`` (position in the submitted
    documents) and a ``score``; the output follows the order in which the
    reranker returned its items.

    Falls back to ``self._score_and_sort_results`` when no result carries a
    query or when reranking raises.

    Args:
        results: List of search results

    Returns:
        Reranked list of search results. Each entry is a shallow copy of the
        original result with ``relevance_score`` set to the reranker's score;
        the input dictionaries are not modified.
    """
    if not results:
        return []

    # Use the first non-empty query found on any result, so a first result
    # without one does not force the basic-scoring fallback.
    query = next((r["query"] for r in results if r.get("query")), "")
    if not query:
        # If no query is found, use a fallback approach
        print("Warning: No query found in results. Using basic scoring instead.")
        return self._score_and_sort_results(results)

    # Combine title and snippet for better reranking. `or ''` keeps a field
    # that is present but None from being rendered as the text "None".
    documents = [
        f"{result.get('title') or ''} {result.get('snippet') or ''}"
        for result in results
    ]

    try:
        reranked = self.reranker.rerank(query, documents)

        # Map every reranked item back to its original result and attach the
        # new score on a copy. This stays inside the try so that a malformed
        # item (missing key, bad index) also triggers the fallback.
        reranked_results = []
        for item in reranked:
            new_result = results[item['index']].copy()
            new_result['relevance_score'] = item['score']
            reranked_results.append(new_result)

        return reranked_results
    except Exception as e:
        print(f"Error reranking results: {str(e)}")
        # Fall back to basic scoring if reranking fails
        return self._score_and_sort_results(results)
def test_reranker():
    """
    Compare basic scoring with Jina reranking for one fixed query.

    Runs the query through ``QueryProcessor`` and ``SearchExecutor``, stamps
    the query onto every raw result, and processes the results with
    ``ResultCollector`` twice: once with ``use_reranker=False`` and, when the
    reranker can be obtained, once with ``use_reranker=True``. Each result
    list is written as JSON into a ``results`` directory next to this script,
    and the top five entries of both rankings are printed.
    """
    # Initialize components
    query_processor = QueryProcessor()
    search_executor = SearchExecutor()
    result_collector = ResultCollector()

    # Check if reranker is available. Only whether the factory succeeds
    # matters here, so its return value is not kept.
    try:
        get_jina_reranker()
    except ValueError:
        reranker_available = False
        print("Jina Reranker is not available. Will only test basic scoring.")
    else:
        reranker_available = True
        print("Jina Reranker is available.")

    # Process a test query
    query = "What are the latest advancements in quantum computing?"
    print(f"Processing query: {query}")

    processed_query = query_processor.process_query(query)
    print(f"Processed query: {processed_query}")

    available_engines = search_executor.get_available_search_engines()
    print(f"Available search engines: {available_engines}")

    if 'search_engines' not in processed_query:
        processed_query['search_engines'] = available_engines

    # Execute the search
    search_results = search_executor.execute_search(
        structured_query=processed_query,
        num_results=10
    )

    # Print which engines returned results
    for engine, results in search_results.items():
        print(f"Engine {engine} returned {len(results)} results")

    # Add the query to each result for reranking. The query text is the same
    # for every result, so resolve it once outside the loops.
    rerank_query = processed_query.get(
        "enhanced_query", processed_query.get("original_query", query)
    )
    for results in search_results.values():
        for result in results:
            result["query"] = rerank_query

    # Process results without reranking
    print("\nProcessing results without reranking...")
    basic_results = result_collector.process_results(
        search_results, dedup=True, max_results=None, use_reranker=False
    )
    print(f"Processed {len(basic_results)} results with basic scoring")

    # Save basic results
    results_dir = Path(__file__).parent / "results"
    results_dir.mkdir(exist_ok=True)

    # One timestamp is shared by both files so a run's outputs pair up.
    timestamp = int(time.time())
    basic_file = results_dir / f"basic_results_{timestamp}.json"

    with open(basic_file, "w", encoding="utf-8") as f:
        json.dump(basic_results, f, indent=2)
    print(f"Basic results saved to {basic_file}")

    # Process results with reranking (if available)
    if reranker_available:
        print("\nProcessing results with reranking...")
        reranked_results = result_collector.process_results(
            search_results, dedup=True, max_results=None, use_reranker=True
        )
        print(f"Processed {len(reranked_results)} results with reranking")

        # Save reranked results
        reranked_file = results_dir / f"reranked_results_{timestamp}.json"

        with open(reranked_file, "w", encoding="utf-8") as f:
            json.dump(reranked_results, f, indent=2)
        print(f"Reranked results saved to {reranked_file}")

        # Compare top 5 results
        print("\nComparing top 5 results:")
        print("\nTop 5 results with basic scoring:")
        for i, result in enumerate(basic_results[:5]):
            print(f"{i+1}. {result.get('title')} (Score: {result.get('relevance_score')}, Source: {result.get('source')})")

        print("\nTop 5 results with reranking:")
        for i, result in enumerate(reranked_results[:5]):
            print(f"{i+1}. {result.get('title')} (Score: {result.get('relevance_score')}, Source: {result.get('source')})")
Args: query (str): The query to process num_results (int): Number of results to return + use_reranker (bool): Whether to use the Jina Reranker for semantic ranking Returns: tuple: (markdown_results, json_results_path) @@ -72,10 +73,15 @@ class GradioInterface: for engine, results in search_results.items(): print(f"Engine {engine} returned {len(results)} results") + # Add the query to each result for reranking + for engine, results in search_results.items(): + for result in results: + result["query"] = processed_query.get("enhanced_query", processed_query.get("original_query", query)) + # Process the results - don't limit the number of results print(f"Processing results...") processed_results = self.result_collector.process_results( - search_results, dedup=True, max_results=None + search_results, dedup=True, max_results=None, use_reranker=use_reranker ) print(f"Processed {len(processed_results)} results") @@ -187,6 +193,11 @@ class GradioInterface: step=5, label="Results Per Engine" ) + use_reranker = gr.Checkbox( + label="Use Semantic Reranker", + value=True, + info="Uses Jina AI's reranker for more relevant results" + ) search_button = gr.Button("Search", variant="primary") gr.Examples( @@ -211,7 +222,7 @@ class GradioInterface: search_button.click( fn=self.process_query, - inputs=[query_input, num_results], + inputs=[query_input, num_results, use_reranker], outputs=[results_output, file_output] )