# ira/execution/sub_question_executor.py
# (file-viewer metadata from the original listing: 208 lines, 8.3 KiB, Python)

"""
Sub-question search executor module.
This module handles the execution of search queries for decomposed sub-questions,
aggregating results from multiple search engines.
"""
import os
import time
import asyncio
from typing import Dict, List, Any, Optional, Union
import logging
import concurrent.futures
from config.config import get_config
from .search_executor import SearchExecutor
# Set up root logging (INFO level, timestamped records) and create the
# logger used throughout this module.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
)
logger = logging.getLogger(__name__)
class SubQuestionExecutor:
    """
    Executes search queries for sub-questions and aggregates results.

    Each sub-question of a structured query is searched through a
    ``SearchExecutor``; the per-engine results are stored on the sub-question
    and can afterwards be merged across sub-questions and ranked by priority.
    """

    def __init__(self):
        """Initialize the sub-question executor."""
        self.search_executor = SearchExecutor()
        self.config = get_config()

    async def execute_sub_question_searches(
        self,
        structured_query: Dict[str, Any],
        num_results_per_engine: int = 5,
        timeout: int = 60,
    ) -> Dict[str, Any]:
        """
        Execute searches for all sub-questions in a structured query.

        Sub-questions are processed one after another, with a one second
        pause after each successful search (except after the last
        sub-question) to avoid overwhelming the search APIs.

        Args:
            structured_query: The structured query containing sub-questions
                under the 'sub_questions' key. It is updated in place.
            num_results_per_engine: Number of results to return per search
                engine for each sub-question
            timeout: Timeout in seconds for each sub-question's searches

        Returns:
            The same ``structured_query`` object, whose 'sub_questions' list
            has been replaced by copies of the sub-questions extended with
            'search_results', 'search_result_count' and, when the search
            failed, 'error'. Sub-questions with an empty 'sub_question' text
            are skipped and do not appear in the updated list. If there are
            no sub-questions the query is returned unchanged.
        """
        sub_questions = structured_query.get('sub_questions', [])
        if not sub_questions:
            logger.info("No sub-questions found in the structured query")
            return structured_query

        total = len(sub_questions)
        logger.info(f"Executing searches for {total} sub-questions")

        # List of sub-question copies annotated with their search results
        sub_question_results = []

        # Process sub-questions sequentially to avoid overwhelming APIs
        for i, sq in enumerate(sub_questions):
            sub_q_text = sq.get('sub_question', '')
            if not sub_q_text:
                continue

            logger.info(f"Processing sub-question {i+1}/{total}: {sub_q_text}")

            # Create a mini structured query for this sub-question; the
            # query-type flags are inherited from the parent query.
            mini_query = {
                'original_query': sub_q_text,
                'enhanced_query': sub_q_text,
                'search_queries': sq.get('search_queries', {}),
                'is_current_events': structured_query.get('is_current_events', False),
                'is_academic': structured_query.get('is_academic', False),
                'is_code': structured_query.get('is_code', False),
            }

            # Shallow copy so the caller's sub-question dict is not modified
            sq_with_results = sq.copy()

            try:
                # NOTE(review): execute_search is called without ``await``, so
                # it is presumably synchronous; if it blocks on network I/O it
                # blocks the event loop for the duration - confirm.
                sq_results = self.search_executor.execute_search(
                    structured_query=mini_query,
                    num_results=num_results_per_engine,
                    timeout=timeout,
                )

                # Log results for each engine
                for engine, results in sq_results.items():
                    logger.info(f" Engine {engine} returned {len(results)} results")

                result_count = sum(len(results) for results in sq_results.values())
            except Exception as e:
                # Best effort: record the failure on the sub-question and
                # carry on with the remaining sub-questions.
                logger.error(f"Error executing search for sub-question: {str(e)}")
                sq_with_results['search_results'] = {}
                sq_with_results['search_result_count'] = 0
                sq_with_results['error'] = str(e)
                sub_question_results.append(sq_with_results)
                continue

            sq_with_results['search_results'] = sq_results
            sq_with_results['search_result_count'] = result_count
            sub_question_results.append(sq_with_results)

            # Small delay between sub-questions to avoid rate limiting. Kept
            # outside the try block so that a cancellation during the pause
            # propagates instead of being recorded as a search error.
            if i < total - 1:
                await asyncio.sleep(1)

        # Update the structured query with the results
        structured_query['sub_questions'] = sub_question_results

        total_results = sum(sq.get('search_result_count', 0) for sq in sub_question_results)
        logger.info(f"Completed searches for all sub-questions. Total results: {total_results}")

        return structured_query

    def get_combined_results(self, structured_query: Dict[str, Any]) -> Dict[str, List[Dict[str, Any]]]:
        """
        Get a combined view of results from all sub-questions.

        Every result is tagged with the 'sub_question', 'aspect' and
        'priority' of the sub-question it came from, unless the result
        already carries that key. The tags are written to copies, so the
        result dicts stored in ``structured_query`` are left untouched.
        Results that are empty or not dicts are dropped.

        Args:
            structured_query: The structured query with sub-question search results

        Returns:
            Dictionary mapping search engine names to lists of results, in
            sub-question order. Empty when there are no sub-questions.
        """
        combined_results: Dict[str, List[Dict[str, Any]]] = {}

        for sq in structured_query.get('sub_questions') or []:
            metadata = {
                'sub_question': sq.get('sub_question', ''),
                'aspect': sq.get('aspect', 'unknown'),
                'priority': sq.get('priority', 3),
            }

            # ``or`` guards against explicit None values for the results
            for engine, results in (sq.get('search_results') or {}).items():
                engine_results = combined_results.setdefault(engine, [])

                for result in results or []:
                    if not result or not isinstance(result, dict):
                        continue

                    # Copy before tagging; setdefault keeps any metadata the
                    # result already has.
                    tagged = dict(result)
                    for key, value in metadata.items():
                        tagged.setdefault(key, value)
                    engine_results.append(tagged)

        return combined_results

    def prioritize_results(
        self,
        combined_results: Dict[str, List[Dict[str, Any]]],
        max_results_per_engine: int = 10,
    ) -> Dict[str, List[Dict[str, Any]]]:
        """
        Prioritize results based on sub-question priority.

        Args:
            combined_results: Combined results from all sub-questions
            max_results_per_engine: Maximum number of results to keep per
                engine; values below zero are treated as zero

        Returns:
            Dictionary mapping search engine names to prioritized lists of
            results. Lower 'priority' numbers come first, results with a
            missing or None priority are ranked as priority 5, and results
            with equal priority keep their input order.
        """
        limit = max(max_results_per_engine, 0)

        def priority_of(result: Dict[str, Any]) -> Any:
            # A missing or None priority sorts as 5 so None is never compared
            value = result.get('priority')
            return 5 if value is None else value

        return {
            engine: sorted(results, key=priority_of)[:limit]
            for engine, results in combined_results.items()
        }
# Create a singleton instance for global use.
# It is built at import time, so SubQuestionExecutor.__init__ (which creates a
# SearchExecutor and calls get_config()) runs as soon as this module is imported.
sub_question_executor = SubQuestionExecutor()
def get_sub_question_executor() -> SubQuestionExecutor:
    """
    Get the global sub-question executor instance.

    Returns:
        The module-level ``sub_question_executor`` singleton; every call
        returns the same SubQuestionExecutor instance.
    """
    return sub_question_executor