From a34b92c10366647df080cd5ebcf894a6104c1276 Mon Sep 17 00:00:00 2001 From: Steve White Date: Thu, 27 Feb 2025 17:16:52 -0600 Subject: [PATCH] Fix Jina Reranker API integration with proper request and response handling --- ranking/jina_reranker.py | 46 ++++++------ test_reranker.py | 149 +++++++++++--------------------------- test_simple_reranker.py | 152 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 218 insertions(+), 129 deletions(-) create mode 100644 test_simple_reranker.py diff --git a/ranking/jina_reranker.py b/ranking/jina_reranker.py index 8d4fa32..0daf93f 100644 --- a/ranking/jina_reranker.py +++ b/ranking/jina_reranker.py @@ -78,14 +78,16 @@ class JinaReranker: "Accept": "application/json" } + # The correct format is an array of plain strings, not objects with a "text" field data = { "model": self.model, "query": query, - "documents": documents, + "documents": documents, # Plain array of strings "top_n": top_n } print(f"Making reranker API call with query: {query[:50]}... and {len(documents)} documents") + print(f"Request payload structure: model, query, documents (array of {len(documents)} strings), top_n={top_n}") try: response = requests.post(self.endpoint, headers=headers, json=data) @@ -98,12 +100,30 @@ class JinaReranker: response.raise_for_status() # Raise exception for HTTP errors result = response.json() + print(f"Reranker API response structure: {list(result.keys())}") # Process and return the reranked results reranked_results = [] - # Check for the specific response structure we observed - if "data" in result and isinstance(result["data"], list): + # Check for the specific response structure from the API + if "results" in result and isinstance(result["results"], list): + results_list = result["results"] + for item in results_list: + if isinstance(item, dict) and "index" in item and "relevance_score" in item: + reranked_results.append({ + 'index': item.get('index'), + 'score': item.get('relevance_score'), + 'document': documents[item.get('index')] if item.get('index') < len(documents) else None + }) + # Handle newer Jina API format with document.text + elif isinstance(item, dict) and "index" in item and "document" in item and "relevance_score" in item: + reranked_results.append({ + 'index': item.get('index'), + 'score': item.get('relevance_score'), + 'document': documents[item.get('index')] if item.get('index') < len(documents) else None + }) + # Fallback for older response structures with "data" field + elif "data" in result and isinstance(result["data"], list): data_list = result["data"] for item in data_list: if isinstance(item, dict) and "index" in item and "relevance_score" in item: @@ -112,26 +132,6 @@ class JinaReranker: 'score': item.get('relevance_score'), 'document': documents[item.get('index')] if item.get('index') < len(documents) else None }) - # Check other possible response structures - elif "results" in result: - results_list = result["results"] - for item in results_list: - if isinstance(item, dict) and "index" in item and "score" in item: - reranked_results.append({ - 'index': item.get('index'), - 'score': item.get('score'), - 'document': documents[item.get('index')] if item.get('index') < len(documents) else None - }) - elif "documents" in result: - # Alternative API response structure - docs_list = result["documents"] - for i, doc in enumerate(docs_list): - if isinstance(doc, dict) and "score" in doc: - reranked_results.append({ - 'index': i, - 'score': doc.get('score'), - 'document': documents[i] - }) print(f"Processed reranker results: {len(reranked_results)} items") return reranked_results diff --git a/test_reranker.py b/test_reranker.py index 969a658..4ace271 100644 --- a/test_reranker.py +++ b/test_reranker.py @@ -5,122 +5,59 @@ This script tests the reranker functionality by comparing results with and witho import json import time +import os from pathlib import Path +from typing import Dict, List, Any, Optional -from query.query_processor import QueryProcessor -from execution.search_executor import SearchExecutor -from execution.result_collector import ResultCollector -from ranking.jina_reranker import get_jina_reranker +# Import just what we need for the simple test +from ranking.jina_reranker import JinaReranker, get_jina_reranker - -def test_reranker(): - """Test the reranker functionality.""" - # Initialize components - query_processor = QueryProcessor() - search_executor = SearchExecutor() - result_collector = ResultCollector() - - # Check if reranker is available +def test_simple_reranker(): + """Test the Jina Reranker with a simple query and documents""" + # Initialize the reranker directly without parameters (it will read from config) try: reranker = get_jina_reranker() - reranker_available = True - print("Jina Reranker is available.") - except ValueError: - reranker_available = False - print("Jina Reranker is not available. Will only test basic scoring.") + print("Successfully initialized Jina Reranker") + except Exception as e: + print(f"Error initializing Jina Reranker: {str(e)}") + return - # Process a test query - query = "What are the latest advancements in quantum computing?" - print(f"Processing query: {query}") + # Simple query and documents + query = "What is quantum computing?" + documents = [ + "Quantum computing is a type of computation that harnesses quantum mechanics.", + "Classical computers use bits, while quantum computers use qubits.", + "Machine learning is a subset of artificial intelligence.", + "Quantum computers can solve certain problems faster than classical computers." + ] - processed_query = query_processor.process_query(query) - print(f"Processed query: {processed_query}") + print(f"Testing reranker with query: {query}") + print(f"Documents: {documents}") - # Execute the search - available_engines = search_executor.get_available_search_engines() - print(f"Available search engines: {available_engines}") - - if 'search_engines' not in processed_query: - processed_query['search_engines'] = available_engines - - # Execute the search - search_results = search_executor.execute_search( - structured_query=processed_query, - num_results=10 - ) - - # Print which engines returned results - for engine, results in search_results.items(): - print(f"Engine {engine} returned {len(results)} results") - - # Add the query to each result for reranking - enhanced_query = processed_query.get("enhanced_query", processed_query.get("original_query", query)) - print(f"Enhanced query for reranking: {enhanced_query}") - - # Print the structure of the first result from each engine - print("\nResult structure examples:") - for engine, results in search_results.items(): - if results: - print(f"\n{engine} result example:") - print(json.dumps(results[0], indent=2, default=str)) - - # Flatten results for easier manipulation - flattened_results = [] - for engine, results in search_results.items(): - for result in results: - # Add the query and engine to each result - result["query"] = enhanced_query - result["engine"] = engine - flattened_results.append(result) - - # Verify that the query is in the flattened results - if flattened_results: - print(f"\nVerifying query in flattened results:") - print(f"Query in first result: {flattened_results[0].get('query', 'NOT FOUND')[:50]}...") - - # Process results without reranking - print("\nProcessing results without reranking...") - basic_results = result_collector.process_results( - {"combined": flattened_results}, dedup=True, max_results=None, use_reranker=False - ) - print(f"Processed {len(basic_results)} results with basic scoring") - - # Save basic results - results_dir = Path(__file__).parent / "results" - results_dir.mkdir(exist_ok=True) - - timestamp = int(time.time()) - basic_file = results_dir / f"basic_results_{timestamp}.json" - - with open(basic_file, "w") as f: - json.dump(basic_results, f, indent=2) - print(f"Basic results saved to {basic_file}") - - # Process results with reranking (if available) - if reranker_available: - print("\nProcessing results with reranking...") - reranked_results = result_collector.process_results( - {"combined": flattened_results}, dedup=True, max_results=None, use_reranker=True - ) - print(f"Processed {len(reranked_results)} results with reranking") + # Rerank the documents + try: + reranked = reranker.rerank(query, documents) + print(f"Reranked results: {json.dumps(reranked, indent=2)}") - # Save reranked results - reranked_file = results_dir / f"reranked_results_{timestamp}.json" + # Save the results to a file for analysis + results_dir = Path("results") + results_dir.mkdir(exist_ok=True) + results_file = results_dir / f"reranked_results_{int(time.time())}.json" - with open(reranked_file, "w") as f: - json.dump(reranked_results, f, indent=2) - print(f"Reranked results saved to {reranked_file}") + with open(results_file, "w") as f: + json.dump(reranked, f, indent=2) - # Compare top 5 results - print("\nComparing top 5 results:") - print("\nTop 5 results with basic scoring:") - for i, result in enumerate(basic_results[:5]): - print(f"{i+1}. {result.get('title')} (Score: {result.get('relevance_score')}, Source: {result.get('source')})") - - print("\nTop 5 results with reranking:") - for i, result in enumerate(reranked_results[:5]): - print(f"{i+1}. {result.get('title')} (Score: {result.get('relevance_score')}, Source: {result.get('source')})") - + print(f"Results saved to {results_file}") + return True + except Exception as e: + print(f"Error reranking: {str(e)}") + return False if __name__ == "__main__": - test_reranker() + # Just run the simple test + success = test_simple_reranker() + + if success: + print("Jina Reranker test completed successfully!") + else: + print("Jina Reranker test failed.") diff --git a/test_simple_reranker.py b/test_simple_reranker.py new file mode 100644 index 0000000..3f35c36 --- /dev/null +++ b/test_simple_reranker.py @@ -0,0 +1,152 @@ +import json +import sys +import os +import yaml +from pathlib import Path + +# Add the project root to the path +sys.path.append(os.path.dirname(os.path.abspath(__file__))) + +# Let's create a custom JinaReranker class specifically for testing +class TestJinaReranker: + """Custom JinaReranker for testing with explicit initialization parameters""" + + def __init__(self, api_key, model, endpoint): + """Initialize with explicit parameters""" + self.api_key = api_key + self.model = model + self.endpoint = endpoint + self.default_top_n = 10 + + def rerank(self, query, documents, top_n=None): + """ + Rerank documents based on their relevance to the query. + """ + if not documents: + return [] + + # Use default top_n if not specified + if top_n is None: + top_n = min(self.default_top_n, len(documents)) + else: + top_n = min(top_n, len(documents)) + + headers = { + "Content-Type": "application/json", + "Authorization": f"Bearer {self.api_key}", + "Accept": "application/json" + } + + data = { + "model": self.model, + "query": query, + "documents": documents, # Plain array of strings + "top_n": top_n + } + + print(f"Making reranker API call with query: {query}") + print(f"Request payload structure: model, query, documents (array of {len(documents)} strings), top_n={top_n}") + + import requests + try: + response = requests.post(self.endpoint, headers=headers, json=data) + print(f"Reranker API response status: {response.status_code}") + + if response.status_code != 200: + print(f"Reranker API error: {response.text}") + return [] + + response.raise_for_status() # Raise exception for HTTP errors + + result = response.json() + print(f"Reranker API response structure: {list(result.keys())}") + print(f"Full response: {json.dumps(result, indent=2)}") + + # Process and return the reranked results + reranked_results = [] + + # Check for the specific response structure from the API + if "results" in result and isinstance(result["results"], list): + results_list = result["results"] + for item in results_list: + if isinstance(item, dict) and "index" in item and "relevance_score" in item: + reranked_results.append({ + 'index': item.get('index'), + 'score': item.get('relevance_score'), + 'document': documents[item.get('index')] if item.get('index') < len(documents) else None + }) + # Handle newer Jina API format with document.text + elif isinstance(item, dict) and "index" in item and "document" in item and "relevance_score" in item: + reranked_results.append({ + 'index': item.get('index'), + 'score': item.get('relevance_score'), + 'document': documents[item.get('index')] if item.get('index') < len(documents) else None + }) + # Fallback for older response structures + elif "data" in result and isinstance(result["data"], list): + data_list = result["data"] + for item in data_list: + if isinstance(item, dict) and "index" in item and "relevance_score" in item: + reranked_results.append({ + 'index': item.get('index'), + 'score': item.get('relevance_score'), + 'document': documents[item.get('index')] if item.get('index') < len(documents) else None + }) + + print(f"Processed reranker results: {len(reranked_results)} items") + return reranked_results + + except Exception as e: + print(f"Error calling reranker API: {str(e)}") + return [] + +def load_config(): + """Load configuration from YAML file""" + config_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "config", "config.yaml") + print(f"Loading config from {config_path}") + + if os.path.exists(config_path): + with open(config_path, "r") as f: + config = yaml.safe_load(f) + print("Configuration loaded successfully") + return config + else: + print(f"Config file not found at {config_path}") + return {} + +def test_simple_reranker(): + """Test the Jina Reranker with a simple query and documents""" + # Get Jina API key from environment + jina_api_key = os.environ.get("JINA_API_KEY", "") + if not jina_api_key: + print("JINA_API_KEY not found in environment variables") + return + + print(f"Found JINA_API_KEY in environment variables") + + # Initialize the reranker + reranker = TestJinaReranker( + api_key=jina_api_key, + model="jina-reranker-v2-base-multilingual", + endpoint="https://api.jina.ai/v1/rerank" + ) + + # Simple query and documents + query = "What is quantum computing?" + documents = [ + "Quantum computing is a type of computation that harnesses quantum mechanics.", + "Classical computers use bits, while quantum computers use qubits.", + "Machine learning is a subset of artificial intelligence.", + "Quantum computers can solve certain problems faster than classical computers." + ] + + print(f"Testing simple reranker with query: {query}") + print(f"Documents: {documents}") + + # Rerank the documents + reranked = reranker.rerank(query, documents) + print(f"Reranked results: {json.dumps(reranked, indent=2)}") + +if __name__ == "__main__": + # Just run the simple test + test_simple_reranker()