"""
Search executor module.

Handles the execution of search queries across multiple search engines.
"""

import os
import json
import time
import asyncio
import concurrent.futures
from typing import Dict, List, Any, Optional, Union

from config.config import get_config
from .api_handlers.base_handler import BaseSearchHandler
from .api_handlers.serper_handler import SerperSearchHandler
from .api_handlers.scholar_handler import ScholarSearchHandler
from .api_handlers.arxiv_handler import ArxivSearchHandler


class SearchExecutor:
|
|
"""
|
|
Executes search queries across multiple search engines.
|
|
Manages rate limiting, error handling, and result aggregation.
|
|
"""
|
|
|
|
def __init__(self):
|
|
"""Initialize the search executor with available search handlers."""
|
|
self.config = get_config()
|
|
self.handlers = self._initialize_handlers()
|
|
self.available_handlers = {name: handler for name, handler in self.handlers.items()
|
|
if handler.is_available()}
|
|
|
|
def _initialize_handlers(self) -> Dict[str, BaseSearchHandler]:
|
|
"""
|
|
Initialize all search handlers.
|
|
|
|
Returns:
|
|
Dictionary mapping handler names to handler instances
|
|
"""
|
|
return {
|
|
"serper": SerperSearchHandler(),
|
|
"scholar": ScholarSearchHandler(),
|
|
"arxiv": ArxivSearchHandler()
|
|
}
|
|
|
|
def get_available_search_engines(self) -> List[str]:
|
|
"""
|
|
Get a list of available search engines.
|
|
|
|
Returns:
|
|
List of available search engine names
|
|
"""
|
|
return list(self.available_handlers.keys())
|
|
|
|
def execute_search(self,
|
|
structured_query: Dict[str, Any],
|
|
search_engines: Optional[List[str]] = None,
|
|
num_results: int = 10,
|
|
timeout: int = 30) -> Dict[str, List[Dict[str, Any]]]:
|
|
"""
|
|
Execute a search query across multiple search engines.
|
|
|
|
Args:
|
|
structured_query: Structured query from the query processor
|
|
search_engines: List of search engines to use (if None, use all available)
|
|
num_results: Number of results to return per search engine
|
|
timeout: Timeout in seconds for each search engine
|
|
|
|
Returns:
|
|
Dictionary mapping search engine names to lists of search results
|
|
"""
|
|
# Get the raw query
|
|
raw_query = structured_query.get("raw_query", "")
|
|
|
|
# Get the enhanced query if available, otherwise use the raw query
|
|
query = structured_query.get("enhanced_query", raw_query)
|
|
|
|
# Truncate the query if it's too long (Serper API has a 2048 character limit)
|
|
if len(query) > 2000:
|
|
query = query[:2000]
|
|
|
|
# If no search engines specified, use all available
|
|
if search_engines is None:
|
|
search_engines = list(self.available_handlers.keys())
|
|
else:
|
|
# Filter to only include available search engines
|
|
search_engines = [engine for engine in search_engines
|
|
if engine in self.available_handlers]
|
|
|
|
# Get the search queries for each engine
|
|
search_queries = structured_query.get("search_queries", {})
|
|
|
|
# Execute searches in parallel
|
|
results = {}
|
|
with concurrent.futures.ThreadPoolExecutor() as executor:
|
|
future_to_engine = {}
|
|
|
|
for engine in search_engines:
|
|
if engine not in self.available_handlers:
|
|
continue
|
|
|
|
# Get the appropriate query for this engine
|
|
engine_query = search_queries.get(engine, query)
|
|
|
|
# Submit the search task
|
|
future = executor.submit(
|
|
self._execute_single_search,
|
|
engine=engine,
|
|
query=engine_query,
|
|
num_results=num_results
|
|
)
|
|
future_to_engine[future] = engine
|
|
|
|
# Collect results as they complete
|
|
for future in concurrent.futures.as_completed(future_to_engine, timeout=timeout):
|
|
engine = future_to_engine[future]
|
|
try:
|
|
engine_results = future.result()
|
|
results[engine] = engine_results
|
|
except Exception as e:
|
|
print(f"Error executing search for {engine}: {e}")
|
|
results[engine] = []
|
|
|
|
return results
|
|
|
|
def _execute_single_search(self, engine: str, query: str, num_results: int) -> List[Dict[str, Any]]:
|
|
"""
|
|
Execute a search on a single search engine.
|
|
|
|
Args:
|
|
engine: Name of the search engine
|
|
query: Query to execute
|
|
num_results: Number of results to return
|
|
|
|
Returns:
|
|
List of search results
|
|
"""
|
|
handler = self.available_handlers.get(engine)
|
|
if not handler:
|
|
return []
|
|
|
|
try:
|
|
# Execute the search
|
|
results = handler.search(query, num_results=num_results)
|
|
return results
|
|
except Exception as e:
|
|
print(f"Error executing search for {engine}: {e}")
|
|
return []
|
|
|
|
async def execute_search_async(self,
|
|
structured_query: Dict[str, Any],
|
|
search_engines: Optional[List[str]] = None,
|
|
num_results: int = 10,
|
|
timeout: int = 30) -> Dict[str, List[Dict[str, Any]]]:
|
|
"""
|
|
Execute a search query across specified search engines asynchronously.
|
|
|
|
Args:
|
|
structured_query: The structured query from the query processor
|
|
search_engines: List of search engines to use (if None, use all available)
|
|
num_results: Number of results to return per search engine
|
|
timeout: Timeout in seconds for each search engine
|
|
|
|
Returns:
|
|
Dictionary mapping search engine names to lists of search results
|
|
"""
|
|
# If no search engines specified, use all available
|
|
if search_engines is None:
|
|
search_engines = list(self.available_handlers.keys())
|
|
else:
|
|
# Filter to only include available search engines
|
|
search_engines = [engine for engine in search_engines
|
|
if engine in self.available_handlers]
|
|
|
|
# Get the search queries for each engine
|
|
search_queries = structured_query.get("search_queries", {})
|
|
|
|
# Create tasks for each search engine
|
|
tasks = []
|
|
for engine in search_engines:
|
|
if engine not in self.available_handlers:
|
|
continue
|
|
|
|
# Get the appropriate query for this engine
|
|
query = search_queries.get(engine, structured_query.get("enhanced_query", ""))
|
|
|
|
# Create a task for this search
|
|
task = self._execute_single_search_async(engine, query, num_results)
|
|
tasks.append((engine, task))
|
|
|
|
# Execute all tasks with timeout
|
|
results = {}
|
|
for engine, task in tasks:
|
|
try:
|
|
engine_results = await asyncio.wait_for(task, timeout=timeout)
|
|
results[engine] = engine_results
|
|
except asyncio.TimeoutError:
|
|
print(f"Search timed out for {engine}")
|
|
results[engine] = []
|
|
except Exception as e:
|
|
print(f"Error executing search for {engine}: {e}")
|
|
results[engine] = []
|
|
|
|
return results
|
|
|
|
async def _execute_single_search_async(self, engine: str, query: str, num_results: int) -> List[Dict[str, Any]]:
|
|
"""
|
|
Execute a search on a single search engine asynchronously.
|
|
|
|
Args:
|
|
engine: Name of the search engine
|
|
query: Query to execute
|
|
num_results: Number of results to return
|
|
|
|
Returns:
|
|
List of search results
|
|
"""
|
|
# Execute in a thread pool since most API calls are blocking
|
|
loop = asyncio.get_event_loop()
|
|
return await loop.run_in_executor(
|
|
None, self._execute_single_search, engine, query, num_results
|
|
)
|