# ira/execution/search_executor.py
"""
Search executor module.
Handles the execution of search queries across multiple search engines.
"""
import os
import json
import time
import asyncio
import concurrent.futures
from typing import Dict, List, Any, Optional, Union
from config.config import get_config
from .api_handlers.base_handler import BaseSearchHandler
from .api_handlers.serper_handler import SerperSearchHandler
from .api_handlers.scholar_handler import ScholarSearchHandler
from .api_handlers.arxiv_handler import ArxivSearchHandler
class SearchExecutor:
"""
Executes search queries across multiple search engines.
Manages rate limiting, error handling, and result aggregation.
"""
def __init__(self):
"""Initialize the search executor with available search handlers."""
self.config = get_config()
self.handlers = self._initialize_handlers()
self.available_handlers = {name: handler for name, handler in self.handlers.items()
if handler.is_available()}
def _initialize_handlers(self) -> Dict[str, BaseSearchHandler]:
"""
Initialize all search handlers.
Returns:
Dictionary mapping handler names to handler instances
"""
return {
"serper": SerperSearchHandler(),
"scholar": ScholarSearchHandler(),
"arxiv": ArxivSearchHandler()
}
def get_available_search_engines(self) -> List[str]:
"""
Get a list of available search engines.
Returns:
List of available search engine names
"""
return list(self.available_handlers.keys())
def execute_search(self,
structured_query: Dict[str, Any],
search_engines: Optional[List[str]] = None,
num_results: int = 10,
timeout: int = 30) -> Dict[str, List[Dict[str, Any]]]:
"""
Execute a search query across multiple search engines.
Args:
structured_query: Structured query from the query processor
search_engines: List of search engines to use (if None, use all available)
num_results: Number of results to return per search engine
timeout: Timeout in seconds for each search engine
Returns:
Dictionary mapping search engine names to lists of search results
"""
# Get the raw query
raw_query = structured_query.get("raw_query", "")
# Get the enhanced query if available, otherwise use the raw query
query = structured_query.get("enhanced_query", raw_query)
# Truncate the query if it's too long (Serper API has a 2048 character limit)
if len(query) > 2000:
query = query[:2000]
# If no search engines specified, use all available
if search_engines is None:
search_engines = list(self.available_handlers.keys())
else:
# Filter to only include available search engines
search_engines = [engine for engine in search_engines
if engine in self.available_handlers]
# Get the search queries for each engine
search_queries = structured_query.get("search_queries", {})
# Execute searches in parallel
results = {}
with concurrent.futures.ThreadPoolExecutor() as executor:
future_to_engine = {}
for engine in search_engines:
if engine not in self.available_handlers:
continue
# Get the appropriate query for this engine
engine_query = search_queries.get(engine, query)
# Submit the search task
future = executor.submit(
self._execute_single_search,
engine=engine,
query=engine_query,
num_results=num_results
)
future_to_engine[future] = engine
# Collect results as they complete
for future in concurrent.futures.as_completed(future_to_engine, timeout=timeout):
engine = future_to_engine[future]
try:
engine_results = future.result()
results[engine] = engine_results
except Exception as e:
print(f"Error executing search for {engine}: {e}")
results[engine] = []
return results
def _execute_single_search(self, engine: str, query: str, num_results: int) -> List[Dict[str, Any]]:
"""
Execute a search on a single search engine.
Args:
engine: Name of the search engine
query: Query to execute
num_results: Number of results to return
Returns:
List of search results
"""
handler = self.available_handlers.get(engine)
if not handler:
return []
try:
# Execute the search
results = handler.search(query, num_results=num_results)
return results
except Exception as e:
print(f"Error executing search for {engine}: {e}")
return []
async def execute_search_async(self,
structured_query: Dict[str, Any],
search_engines: Optional[List[str]] = None,
num_results: int = 10,
timeout: int = 30) -> Dict[str, List[Dict[str, Any]]]:
"""
Execute a search query across specified search engines asynchronously.
Args:
structured_query: The structured query from the query processor
search_engines: List of search engines to use (if None, use all available)
num_results: Number of results to return per search engine
timeout: Timeout in seconds for each search engine
Returns:
Dictionary mapping search engine names to lists of search results
"""
# If no search engines specified, use all available
if search_engines is None:
search_engines = list(self.available_handlers.keys())
else:
# Filter to only include available search engines
search_engines = [engine for engine in search_engines
if engine in self.available_handlers]
# Get the search queries for each engine
search_queries = structured_query.get("search_queries", {})
# Create tasks for each search engine
tasks = []
for engine in search_engines:
if engine not in self.available_handlers:
continue
# Get the appropriate query for this engine
query = search_queries.get(engine, structured_query.get("enhanced_query", ""))
# Create a task for this search
task = self._execute_single_search_async(engine, query, num_results)
tasks.append((engine, task))
# Execute all tasks with timeout
results = {}
for engine, task in tasks:
try:
engine_results = await asyncio.wait_for(task, timeout=timeout)
results[engine] = engine_results
except asyncio.TimeoutError:
print(f"Search timed out for {engine}")
results[engine] = []
except Exception as e:
print(f"Error executing search for {engine}: {e}")
results[engine] = []
return results
async def _execute_single_search_async(self, engine: str, query: str, num_results: int) -> List[Dict[str, Any]]:
"""
Execute a search on a single search engine asynchronously.
Args:
engine: Name of the search engine
query: Query to execute
num_results: Number of results to return
Returns:
List of search results
"""
# Execute in a thread pool since most API calls are blocking
loop = asyncio.get_event_loop()
return await loop.run_in_executor(
None, self._execute_single_search, engine, query, num_results
)