Integrate Jina Reranker with ResultCollector for semantic ranking

This commit is contained in:
Steve White 2025-02-27 16:59:54 -06:00
parent fc74a879b3
commit 16720d04c7
4 changed files with 214 additions and 7 deletions

31
.gradio/certificate.pem Normal file
View File

@ -0,0 +1,31 @@
-----BEGIN CERTIFICATE-----
MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
-----END CERTIFICATE-----

View File

@ -10,6 +10,8 @@ from typing import Dict, List, Any, Optional, Set
from urllib.parse import urlparse
from datetime import datetime
from ranking.jina_reranker import get_jina_reranker
class ResultCollector:
"""
@ -19,12 +21,18 @@ class ResultCollector:
def __init__(self):
    """
    Initialize the result collector.

    Tries to obtain the Jina Reranker through ``get_jina_reranker()``. If that
    call raises ``ValueError`` the collector prints a notice and records that
    the reranker is unavailable.

    Attributes set:
        reranker: The reranker object, or None when it could not be created.
        reranker_available: True when the reranker was created, else False.
    """
    # Bind the attribute up front so it exists on every code path; without
    # this, a failed lookup would leave `self.reranker` undefined.
    self.reranker = None
    try:
        self.reranker = get_jina_reranker()
    except ValueError:
        print("Jina Reranker not available. Will use basic scoring instead.")
        self.reranker_available = False
    else:
        self.reranker_available = True
def process_results(self,
search_results: Dict[str, List[Dict[str, Any]]],
dedup: bool = True,
max_results: Optional[int] = None) -> List[Dict[str, Any]]:
max_results: Optional[int] = None,
use_reranker: bool = True) -> List[Dict[str, Any]]:
"""
Process search results from multiple search engines.
@ -32,6 +40,7 @@ class ResultCollector:
search_results: Dictionary mapping search engine names to lists of search results
dedup: Whether to deduplicate results based on URL
max_results: Maximum number of results to return (after processing)
use_reranker: Whether to use the Jina Reranker for semantic ranking
Returns:
List of processed search results
@ -43,8 +52,12 @@ class ResultCollector:
if dedup:
all_results = self._deduplicate_results(all_results)
# Sort results by relevance (using a simple scoring algorithm)
all_results = self._score_and_sort_results(all_results)
# Use reranker if available and requested, otherwise use basic scoring
if use_reranker and self.reranker_available:
all_results = self._rerank_results(all_results)
else:
# Sort results by relevance (using a simple scoring algorithm)
all_results = self._score_and_sort_results(all_results)
# Limit results if requested
if max_results is not None:
@ -152,6 +165,52 @@ class ResultCollector:
return sorted_results
def _rerank_results(self, results: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """
    Rerank results using the Jina Reranker.

    Args:
        results: List of search results. The search query is read from the
            "query" key of the results; "title" and "snippet" provide the
            text that is sent to the reranker.

    Returns:
        Reranked list of search results, in the order returned by the
        reranker. Each entry is a shallow copy of the original result with
        "relevance_score" set to the reranker's score. When no query is
        found, or when reranking raises, the output of
        ``_score_and_sort_results`` is returned instead.
    """
    if not results:
        return []

    # All results should carry the same query, so use the first non-empty one
    # rather than depending on the first result alone.
    query = next((r.get("query") for r in results if r.get("query")), "")
    if not query:
        # If no query is found, use a fallback approach
        print("Warning: No query found in results. Using basic scoring instead.")
        return self._score_and_sort_results(results)

    # Combine title and snippet for better reranking. Missing or None fields
    # contribute an empty string instead of the literal text "None".
    documents = [
        f"{result.get('title') or ''} {result.get('snippet') or ''}"
        for result in results
    ]

    try:
        # Use the reranker to rerank the combined texts
        reranked = self.reranker.rerank(query, documents)

        # Build the output from copies so the caller's dicts are not mutated
        reranked_results = []
        for item in reranked:
            scored = results[item['index']].copy()
            scored['relevance_score'] = item['score']
            reranked_results.append(scored)
        return reranked_results
    except Exception as e:
        # Deliberate best-effort: any reranker failure falls back to basic scoring
        print(f"Error reranking results: {str(e)}")
        return self._score_and_sort_results(results)
def _extract_domain(self, url: str) -> str:
"""
Extract the domain from a URL.

106
test_reranker.py Normal file
View File

@ -0,0 +1,106 @@
"""
Test script for the Jina Reranker integration.
This script tests the reranker functionality by comparing results with and without reranking.
"""
import json
import time
from pathlib import Path
from query.query_processor import QueryProcessor
from execution.search_executor import SearchExecutor
from execution.result_collector import ResultCollector
from ranking.jina_reranker import get_jina_reranker
def test_reranker():
    """
    Test the reranker functionality.

    Runs one fixed query through the query processor and search executor,
    tags every result with the query, and processes the results with basic
    scoring. When the Jina Reranker is available the same results are also
    processed with reranking and the top 5 of both rankings are printed.
    Each ranking is written as JSON to a "results" directory next to this
    file.
    """
    # Initialize components
    query_processor = QueryProcessor()
    search_executor = SearchExecutor()
    result_collector = ResultCollector()

    # Check if reranker is available. Only success or failure matters here,
    # so the returned object is not kept.
    try:
        get_jina_reranker()
    except ValueError:
        reranker_available = False
        print("Jina Reranker is not available. Will only test basic scoring.")
    else:
        reranker_available = True
        print("Jina Reranker is available.")

    # Process a test query
    query = "What are the latest advancements in quantum computing?"
    print(f"Processing query: {query}")

    processed_query = query_processor.process_query(query)
    print(f"Processed query: {processed_query}")

    # Make sure the structured query names the engines to use
    available_engines = search_executor.get_available_search_engines()
    print(f"Available search engines: {available_engines}")

    if 'search_engines' not in processed_query:
        processed_query['search_engines'] = available_engines

    # Execute the search
    search_results = search_executor.execute_search(
        structured_query=processed_query,
        num_results=10
    )

    # Print which engines returned results
    for engine, results in search_results.items():
        print(f"Engine {engine} returned {len(results)} results")

    # Add the query to each result for reranking. The value is the same for
    # every result, so look it up once.
    rerank_query = processed_query.get(
        "enhanced_query", processed_query.get("original_query", query)
    )
    for results in search_results.values():
        for result in results:
            result["query"] = rerank_query

    # Process results without reranking
    print("\nProcessing results without reranking...")
    basic_results = result_collector.process_results(
        search_results, dedup=True, max_results=None, use_reranker=False
    )
    print(f"Processed {len(basic_results)} results with basic scoring")

    # Save basic results
    results_dir = Path(__file__).parent / "results"
    results_dir.mkdir(exist_ok=True)

    timestamp = int(time.time())
    basic_file = results_dir / f"basic_results_{timestamp}.json"
    with open(basic_file, "w", encoding="utf-8") as f:
        json.dump(basic_results, f, indent=2)
    print(f"Basic results saved to {basic_file}")

    # Process results with reranking (if available)
    if reranker_available:
        print("\nProcessing results with reranking...")
        reranked_results = result_collector.process_results(
            search_results, dedup=True, max_results=None, use_reranker=True
        )
        print(f"Processed {len(reranked_results)} results with reranking")

        # Save reranked results
        reranked_file = results_dir / f"reranked_results_{timestamp}.json"
        with open(reranked_file, "w", encoding="utf-8") as f:
            json.dump(reranked_results, f, indent=2)
        print(f"Reranked results saved to {reranked_file}")

        # Compare top 5 results; both rankings share one print format
        print("\nComparing top 5 results:")
        for label, ranked in (("basic scoring", basic_results),
                              ("reranking", reranked_results)):
            print(f"\nTop 5 results with {label}:")
            for i, result in enumerate(ranked[:5]):
                print(f"{i+1}. {result.get('title')} (Score: {result.get('relevance_score')}, Source: {result.get('source')})")


if __name__ == "__main__":
    # Run the comparison when this file is executed as a script
    test_reranker()

View File

@ -29,13 +29,14 @@ class GradioInterface:
self.results_dir = Path(__file__).parent.parent / "results"
self.results_dir.mkdir(exist_ok=True)
def process_query(self, query, num_results=10):
def process_query(self, query, num_results=10, use_reranker=True):
"""
Process a query and return the results.
Args:
query (str): The query to process
num_results (int): Number of results to return
use_reranker (bool): Whether to use the Jina Reranker for semantic ranking
Returns:
tuple: (markdown_results, json_results_path)
@ -72,10 +73,15 @@ class GradioInterface:
for engine, results in search_results.items():
print(f"Engine {engine} returned {len(results)} results")
# Add the query to each result for reranking
for engine, results in search_results.items():
for result in results:
result["query"] = processed_query.get("enhanced_query", processed_query.get("original_query", query))
# Process the results - don't limit the number of results
print(f"Processing results...")
processed_results = self.result_collector.process_results(
search_results, dedup=True, max_results=None
search_results, dedup=True, max_results=None, use_reranker=use_reranker
)
print(f"Processed {len(processed_results)} results")
@ -187,6 +193,11 @@ class GradioInterface:
step=5,
label="Results Per Engine"
)
use_reranker = gr.Checkbox(
label="Use Semantic Reranker",
value=True,
info="Uses Jina AI's reranker for more relevant results"
)
search_button = gr.Button("Search", variant="primary")
gr.Examples(
@ -211,7 +222,7 @@ class GradioInterface:
search_button.click(
fn=self.process_query,
inputs=[query_input, num_results],
inputs=[query_input, num_results, use_reranker],
outputs=[results_output, file_output]
)