Integrate Jina Reranker with ResultCollector for semantic ranking
This commit is contained in:
parent
fc74a879b3
commit
16720d04c7
|
@ -0,0 +1,31 @@
|
|||
-----BEGIN CERTIFICATE-----
|
||||
MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
|
||||
TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
|
||||
cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
|
||||
WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
|
||||
ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
|
||||
MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
|
||||
h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
|
||||
0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
|
||||
A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
|
||||
T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
|
||||
B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
|
||||
B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
|
||||
KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
|
||||
OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
|
||||
jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
|
||||
qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
|
||||
rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
|
||||
HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
|
||||
hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
|
||||
ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
|
||||
3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
|
||||
NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
|
||||
ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
|
||||
TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
|
||||
jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
|
||||
oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
|
||||
4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
|
||||
mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
|
||||
emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
|
||||
-----END CERTIFICATE-----
|
|
@ -10,6 +10,8 @@ from typing import Dict, List, Any, Optional, Set
|
|||
from urllib.parse import urlparse
|
||||
from datetime import datetime
|
||||
|
||||
from ranking.jina_reranker import get_jina_reranker
|
||||
|
||||
|
||||
class ResultCollector:
|
||||
"""
|
||||
|
@ -19,12 +21,18 @@ class ResultCollector:
|
|||
|
||||
def __init__(self):
    """
    Initialize the result collector.

    Tries to obtain a reranker from ``get_jina_reranker()``. When that call
    raises ``ValueError`` the collector stays usable and records that only
    basic scoring is available.

    Attributes set:
        reranker: The object returned by ``get_jina_reranker()``, or ``None``
            when it could not be created.
        reranker_available: ``True`` only when ``reranker`` was created.
    """
    # Define both attributes up front so they exist on every code path,
    # including the one where the reranker cannot be created.
    self.reranker = None
    self.reranker_available = False

    try:
        # Keep the try body to the single call that is expected to raise.
        self.reranker = get_jina_reranker()
    except ValueError:
        # NOTE(review): presumably raised when the reranker is not configured
        # (e.g. missing API key) -- confirm against ranking.jina_reranker.
        print("Jina Reranker not available. Will use basic scoring instead.")
    else:
        self.reranker_available = True
|
||||
|
||||
def process_results(self,
|
||||
search_results: Dict[str, List[Dict[str, Any]]],
|
||||
dedup: bool = True,
|
||||
max_results: Optional[int] = None) -> List[Dict[str, Any]]:
|
||||
max_results: Optional[int] = None,
|
||||
use_reranker: bool = True) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
Process search results from multiple search engines.
|
||||
|
||||
|
@ -32,6 +40,7 @@ class ResultCollector:
|
|||
search_results: Dictionary mapping search engine names to lists of search results
|
||||
dedup: Whether to deduplicate results based on URL
|
||||
max_results: Maximum number of results to return (after processing)
|
||||
use_reranker: Whether to use the Jina Reranker for semantic ranking
|
||||
|
||||
Returns:
|
||||
List of processed search results
|
||||
|
@ -43,8 +52,12 @@ class ResultCollector:
|
|||
if dedup:
|
||||
all_results = self._deduplicate_results(all_results)
|
||||
|
||||
# Sort results by relevance (using a simple scoring algorithm)
|
||||
all_results = self._score_and_sort_results(all_results)
|
||||
# Use reranker if available and requested, otherwise use basic scoring
|
||||
if use_reranker and self.reranker_available:
|
||||
all_results = self._rerank_results(all_results)
|
||||
else:
|
||||
# Sort results by relevance (using a simple scoring algorithm)
|
||||
all_results = self._score_and_sort_results(all_results)
|
||||
|
||||
# Limit results if requested
|
||||
if max_results is not None:
|
||||
|
@ -152,6 +165,52 @@ class ResultCollector:
|
|||
|
||||
return sorted_results
|
||||
|
||||
def _rerank_results(self, results: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """
    Rerank results using the Jina Reranker.

    The query is read from the ``"query"`` key of the results themselves. Each
    result is turned into one text document (title followed by snippet) and
    handed to ``self.reranker.rerank(query, documents)``. Every item returned
    by the reranker is read as a mapping with an ``'index'`` (position in
    ``results``) and a ``'score'``; the output follows the order in which the
    reranker returned its items.

    Falls back to ``self._score_and_sort_results(results)`` when no result
    carries a query or when the reranker call (or reading its output) fails.

    Args:
        results: List of search results

    Returns:
        Reranked list of search results. Entries are shallow copies of the
        inputs with ``'relevance_score'`` set to the reranker's score; the
        input dictionaries are not modified.
    """
    if not results:
        return []

    # All results are expected to carry the same query, so take the first
    # non-empty one instead of relying on results[0] alone: a single result
    # that is missing the key must not disable reranking for the whole batch.
    query = next((r.get("query") for r in results if r.get("query")), "")
    if not query:
        print("Warning: No query found in results. Using basic scoring instead.")
        return self._score_and_sort_results(results)

    # Combine title and snippet for better reranking. `or ''` guards against
    # keys that are present but hold None, which would otherwise be rendered
    # as the literal text "None" in the document sent to the reranker.
    documents = [
        f"{r.get('title') or ''} {r.get('snippet') or ''}" for r in results
    ]

    try:
        reranked = self.reranker.rerank(query, documents)

        reranked_results = []
        for item in reranked:
            # Copy so the caller's dictionaries keep their original scores.
            new_result = results[item['index']].copy()
            new_result['relevance_score'] = item['score']
            reranked_results.append(new_result)

        return reranked_results
    except Exception as e:
        # Deliberate best-effort boundary: any failure in the external
        # reranker degrades to basic scoring rather than failing the search.
        print(f"Error reranking results: {str(e)}")
        return self._score_and_sort_results(results)
|
||||
|
||||
def _extract_domain(self, url: str) -> str:
|
||||
"""
|
||||
Extract the domain from a URL.
|
||||
|
|
|
@ -0,0 +1,106 @@
|
|||
"""
|
||||
Test script for the Jina Reranker integration.
|
||||
This script tests the reranker functionality by comparing results with and without reranking.
|
||||
"""
|
||||
|
||||
import json
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
from query.query_processor import QueryProcessor
|
||||
from execution.search_executor import SearchExecutor
|
||||
from execution.result_collector import ResultCollector
|
||||
from ranking.jina_reranker import get_jina_reranker
|
||||
|
||||
|
||||
def test_reranker():
    """
    Test the reranker functionality.

    Runs one fixed query through the query processor and search executor,
    processes the combined results once with basic scoring and (when the
    reranker can be created) once with reranking, writes each result list to
    a timestamped JSON file in a ``results`` directory next to this script,
    and prints the top five entries of both rankings for comparison.
    """
    # Initialize components
    query_processor = QueryProcessor()
    search_executor = SearchExecutor()
    result_collector = ResultCollector()

    # Check if reranker is available (the instance itself is not needed here;
    # ResultCollector obtains its own).
    try:
        get_jina_reranker()
    except ValueError:
        reranker_available = False
        print("Jina Reranker is not available. Will only test basic scoring.")
    else:
        reranker_available = True
        print("Jina Reranker is available.")

    # Process a test query
    query = "What are the latest advancements in quantum computing?"
    print(f"Processing query: {query}")

    processed_query = query_processor.process_query(query)
    print(f"Processed query: {processed_query}")

    available_engines = search_executor.get_available_search_engines()
    print(f"Available search engines: {available_engines}")

    if 'search_engines' not in processed_query:
        processed_query['search_engines'] = available_engines

    # Execute the search
    search_results = search_executor.execute_search(
        structured_query=processed_query,
        num_results=10
    )

    # Print which engines returned results
    for engine, results in search_results.items():
        print(f"Engine {engine} returned {len(results)} results")

    # Add the query to each result for reranking. The query text is the same
    # for every result, so resolve it once.
    rerank_query = processed_query.get(
        "enhanced_query", processed_query.get("original_query", query)
    )
    for results in search_results.values():
        for result in results:
            result["query"] = rerank_query

    # Process results without reranking
    print("\nProcessing results without reranking...")
    basic_results = result_collector.process_results(
        search_results, dedup=True, max_results=None, use_reranker=False
    )
    print(f"Processed {len(basic_results)} results with basic scoring")

    # Save basic results
    results_dir = Path(__file__).parent / "results"
    results_dir.mkdir(exist_ok=True)

    timestamp = int(time.time())
    basic_file = results_dir / f"basic_results_{timestamp}.json"

    # Explicit encoding keeps the output independent of the platform default.
    with open(basic_file, "w", encoding="utf-8") as f:
        json.dump(basic_results, f, indent=2)
    print(f"Basic results saved to {basic_file}")

    # Process results with reranking (if available)
    if reranker_available:
        print("\nProcessing results with reranking...")
        reranked_results = result_collector.process_results(
            search_results, dedup=True, max_results=None, use_reranker=True
        )
        print(f"Processed {len(reranked_results)} results with reranking")

        # Save reranked results
        reranked_file = results_dir / f"reranked_results_{timestamp}.json"

        with open(reranked_file, "w", encoding="utf-8") as f:
            json.dump(reranked_results, f, indent=2)
        print(f"Reranked results saved to {reranked_file}")

        # Compare top 5 results
        print("\nComparing top 5 results:")
        print("\nTop 5 results with basic scoring:")
        for i, result in enumerate(basic_results[:5]):
            print(f"{i+1}. {result.get('title')} (Score: {result.get('relevance_score')}, Source: {result.get('source')})")

        print("\nTop 5 results with reranking:")
        for i, result in enumerate(reranked_results[:5]):
            print(f"{i+1}. {result.get('title')} (Score: {result.get('relevance_score')}, Source: {result.get('source')})")


if __name__ == "__main__":
    test_reranker()
|
|
@ -29,13 +29,14 @@ class GradioInterface:
|
|||
self.results_dir = Path(__file__).parent.parent / "results"
|
||||
self.results_dir.mkdir(exist_ok=True)
|
||||
|
||||
def process_query(self, query, num_results=10):
|
||||
def process_query(self, query, num_results=10, use_reranker=True):
|
||||
"""
|
||||
Process a query and return the results.
|
||||
|
||||
Args:
|
||||
query (str): The query to process
|
||||
num_results (int): Number of results to return
|
||||
use_reranker (bool): Whether to use the Jina Reranker for semantic ranking
|
||||
|
||||
Returns:
|
||||
tuple: (markdown_results, json_results_path)
|
||||
|
@ -72,10 +73,15 @@ class GradioInterface:
|
|||
for engine, results in search_results.items():
|
||||
print(f"Engine {engine} returned {len(results)} results")
|
||||
|
||||
# Add the query to each result for reranking
|
||||
for engine, results in search_results.items():
|
||||
for result in results:
|
||||
result["query"] = processed_query.get("enhanced_query", processed_query.get("original_query", query))
|
||||
|
||||
# Process the results - don't limit the number of results
|
||||
print(f"Processing results...")
|
||||
processed_results = self.result_collector.process_results(
|
||||
search_results, dedup=True, max_results=None
|
||||
search_results, dedup=True, max_results=None, use_reranker=use_reranker
|
||||
)
|
||||
print(f"Processed {len(processed_results)} results")
|
||||
|
||||
|
@ -187,6 +193,11 @@ class GradioInterface:
|
|||
step=5,
|
||||
label="Results Per Engine"
|
||||
)
|
||||
use_reranker = gr.Checkbox(
|
||||
label="Use Semantic Reranker",
|
||||
value=True,
|
||||
info="Uses Jina AI's reranker for more relevant results"
|
||||
)
|
||||
search_button = gr.Button("Search", variant="primary")
|
||||
|
||||
gr.Examples(
|
||||
|
@ -211,7 +222,7 @@ class GradioInterface:
|
|||
|
||||
search_button.click(
|
||||
fn=self.process_query,
|
||||
inputs=[query_input, num_results],
|
||||
inputs=[query_input, num_results, use_reranker],
|
||||
outputs=[results_output, file_output]
|
||||
)
|
||||
|
||||
|
|
Loading…
Reference in New Issue