diff --git a/execution/result_collector.py b/execution/result_collector.py index 0d1af69..8366397 100644 --- a/execution/result_collector.py +++ b/execution/result_collector.py @@ -35,31 +35,54 @@ class ResultCollector: use_reranker: bool = True) -> List[Dict[str, Any]]: """ Process search results from multiple search engines. - + Args: - search_results: Dictionary mapping search engine names to lists of search results - dedup: Whether to deduplicate results based on URL - max_results: Maximum number of results to return (after processing) + search_results: Dictionary mapping search engine names to lists of results + dedup: Whether to deduplicate results + max_results: Maximum number of results to return use_reranker: Whether to use the Jina Reranker for semantic ranking - + Returns: List of processed search results """ - # Flatten and normalize results - all_results = self._flatten_results(search_results) + # Combine results from all search engines + all_results = [] + + # Check if we have a flattened structure (single key with all results) + if len(search_results) == 1 and "combined" in search_results: + all_results = search_results["combined"] + print(f"Processing {len(all_results)} combined results") + else: + # Traditional structure with separate engines + for engine, results in search_results.items(): + for result in results: + # Add the source if not already present + if "source" not in result: + result["source"] = engine + all_results.append(result) + print(f"Processing {len(all_results)} results from {len(search_results)} engines") # Deduplicate results if requested if dedup: all_results = self._deduplicate_results(all_results) + print(f"Deduplicated to {len(all_results)} results") - # Use reranker if available and requested, otherwise use basic scoring - if use_reranker and self.reranker_available: - all_results = self._rerank_results(all_results) + # Use the reranker if available and requested + if use_reranker and self.reranker is not None: + try: + print("Using Jina Reranker for semantic ranking") + all_results = self._rerank_results(all_results) + print(f"Reranked {len(all_results)} results") + except Exception as e: + print(f"Error using reranker: {str(e)}") + # Fall back to basic scoring + all_results = self._score_and_sort_results(all_results) else: - # Sort results by relevance (using a simple scoring algorithm) + # Use basic scoring + print("Using basic scoring") all_results = self._score_and_sort_results(all_results) - # Limit results if requested + # Limit the number of results if requested if max_results is not None: all_results = all_results[:max_results] @@ -67,29 +90,29 @@ class ResultCollector: def _flatten_results(self, search_results: Dict[str, List[Dict[str, Any]]]) -> List[Dict[str, Any]]: """ - Flatten results from multiple search engines into a single list. - + Flatten search results from multiple search engines into a single list. + Args: - search_results: Dictionary mapping search engine names to lists of search results - + search_results: Dictionary mapping search engine names to lists of results + Returns: Flattened list of search results """ + # This method is deprecated and kept for backward compatibility + # The process_results method now handles flattened results directly all_results = [] + # Check if we have a flattened structure (single key with all results) + if len(search_results) == 1 and "combined" in search_results: + return search_results["combined"] + + # Traditional structure with separate engines for engine, results in search_results.items(): for result in results: - # Ensure all results have the same basic structure - normalized_result = { - "title": result.get("title", ""), - "url": result.get("url", ""), - "snippet": result.get("snippet", ""), - "source": result.get("source", engine), - "domain": self._extract_domain(result.get("url", "")), - "timestamp": datetime.now().isoformat(), - "raw_data": result - } - all_results.append(normalized_result) + # Add the source if not already present + if "source" not in result: + result["source"] = engine + all_results.append(result) return all_results @@ -204,15 +227,33 @@ class ResultCollector: # Use the reranker to rerank the snippets reranked = self.reranker.rerank(query, snippets) + if not reranked: + print("Reranker returned empty results. Using basic scoring instead.") + return self._score_and_sort_results(results) + + print(f"Reranked {len(reranked)} results") + # Create a new list of results based on the reranking reranked_results = [] for item in reranked: # Get the original result and add the new score - original_result = results[item['index']] + index = item.get('index') + score = item.get('score') + + if index is None or score is None or index >= len(results): + print(f"Warning: Invalid reranker result item: {item}") + continue + + original_result = results[index] new_result = original_result.copy() - new_result['relevance_score'] = item['score'] * 10 # Scale up the score for consistency + new_result['relevance_score'] = float(score) * 10 # Scale up the score for consistency reranked_results.append(new_result) + # If we didn't get any valid results, fall back to basic scoring + if not reranked_results: + print("No valid reranked results. Using basic scoring instead.") + return self._score_and_sort_results(results) + return reranked_results except Exception as e: print(f"Error reranking results: {str(e)}") diff --git a/ranking/jina_reranker.py b/ranking/jina_reranker.py index 320b154..8d4fa32 100644 --- a/ranking/jina_reranker.py +++ b/ranking/jina_reranker.py @@ -85,21 +85,55 @@ class JinaReranker: "top_n": top_n } + print(f"Making reranker API call with query: {query[:50]}... and {len(documents)} documents") + try: response = requests.post(self.endpoint, headers=headers, json=data) + print(f"Reranker API response status: {response.status_code}") + + if response.status_code != 200: + print(f"Reranker API error: {response.text}") + return [] + response.raise_for_status() # Raise exception for HTTP errors result = response.json() # Process and return the reranked results reranked_results = [] - for item in result.get('results', []): - reranked_results.append({ - 'index': item.get('index'), # Original index in the documents list - 'score': item.get('score'), # Relevance score - 'document': documents[item.get('index')] # The actual document content - }) + # Check for the specific response structure we observed + if "data" in result and isinstance(result["data"], list): + data_list = result["data"] + for item in data_list: + if isinstance(item, dict) and "index" in item and "relevance_score" in item: + reranked_results.append({ + 'index': item.get('index'), + 'score': item.get('relevance_score'), + 'document': documents[item.get('index')] if item.get('index') < len(documents) else None + }) + # Check other possible response structures + elif "results" in result: + results_list = result["results"] + for item in results_list: + if isinstance(item, dict) and "index" in item and "score" in item: + reranked_results.append({ + 'index': item.get('index'), + 'score': item.get('score'), + 'document': documents[item.get('index')] if item.get('index') < len(documents) else None + }) + elif "documents" in result: + # Alternative API response structure + docs_list = result["documents"] + for i, doc in enumerate(docs_list): + if isinstance(doc, dict) and "score" in doc: + reranked_results.append({ + 'index': i, + 'score': doc.get('score'), + 'document': documents[i] + }) + + print(f"Processed reranker results: {len(reranked_results)} items") return reranked_results except Exception as e: diff --git a/test_reranker.py b/test_reranker.py index c591740..969a658 100644 --- a/test_reranker.py +++ b/test_reranker.py @@ -73,6 +73,11 @@ def test_reranker(): result["engine"] = engine flattened_results.append(result) + # Verify that the query is in the flattened results + if flattened_results: + print(f"\nVerifying query in flattened results:") + print(f"Query in first result: {flattened_results[0].get('query', 'NOT FOUND')[:50]}...") + # Process results without reranking print("\nProcessing results without reranking...") basic_results = result_collector.process_results(