"""
|
|
Sub-question search executor module.
|
|
|
|
This module handles the execution of search queries for decomposed sub-questions,
|
|
aggregating results from multiple search engines.
|
|
"""
|
|
|
|
import os
|
|
import time
|
|
import asyncio
|
|
from typing import Dict, List, Any, Optional, Union
|
|
import logging
|
|
import concurrent.futures
|
|
|
|
from config.config import get_config
|
|
from .search_executor import SearchExecutor
|
|
|
|
# Configure logging
|
|
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
|
logger = logging.getLogger(__name__)
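
# Expected input shape (assumed): the keys below are the ones this module reads via
# .get(...); the example values are placeholders, not real data.
#
#     structured_query = {
#         'is_current_events': False,
#         'is_academic': False,
#         'is_code': False,
#         'sub_questions': [
#             {
#                 'sub_question': 'What is the annual energy use of an electric car?',
#                 'aspect': 'energy consumption',
#                 'priority': 1,
#                 'search_queries': {...},   # per-engine query strings, shape defined upstream
#             },
#             # ...
#         ],
#     }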


class SubQuestionExecutor:
    """
    Executes search queries for sub-questions and aggregates results.
    """

    def __init__(self):
        """Initialize the sub-question executor."""
        self.search_executor = SearchExecutor()
        self.config = get_config()

    async def execute_sub_question_searches(self,
                                            structured_query: Dict[str, Any],
                                            num_results_per_engine: int = 5,
                                            timeout: int = 60) -> Dict[str, Any]:
        """
        Execute searches for all sub-questions in a structured query.

        Args:
            structured_query: The structured query containing sub-questions
            num_results_per_engine: Number of results to return per search engine for each sub-question
            timeout: Timeout in seconds for each sub-question's searches

        Returns:
            Updated structured query with sub-question search results
        """
        # Extract sub-questions from the structured query
        sub_questions = structured_query.get('sub_questions', [])

        if not sub_questions:
            logger.info("No sub-questions found in the structured query")
            return structured_query

        logger.info(f"Executing searches for {len(sub_questions)} sub-questions")

        # Get available search engines
        available_engines = self.search_executor.get_available_search_engines()

        # List to store the per-sub-question results
        sub_question_results = []

        # Process sub-questions sequentially to avoid overwhelming APIs
        for i, sq in enumerate(sub_questions):
            sub_q_text = sq.get('sub_question', '')
            aspect = sq.get('aspect', 'unknown')
            priority = sq.get('priority', 3)
            search_queries = sq.get('search_queries', {})

            if not sub_q_text:
                continue

            logger.info(f"Processing sub-question {i+1}/{len(sub_questions)}: {sub_q_text}")

            # Create a mini structured query for this sub-question
            mini_query = {
                'original_query': sub_q_text,
                'enhanced_query': sub_q_text,
                'search_queries': search_queries,
                'is_current_events': structured_query.get('is_current_events', False),
                'is_academic': structured_query.get('is_academic', False),
                'is_code': structured_query.get('is_code', False)
            }

            # Execute search for this sub-question
            try:
                # Use fewer results per engine for sub-questions to keep the total result count manageable
                sq_results = self.search_executor.execute_search(
                    structured_query=mini_query,
                    num_results=num_results_per_engine,
                    timeout=timeout
                )

                # Log results for each engine
                for engine, results in sq_results.items():
                    logger.info(f"  Engine {engine} returned {len(results)} results")

                # Store results with sub-question metadata
                sq_with_results = sq.copy()
                sq_with_results['search_results'] = sq_results
                sq_with_results['search_result_count'] = sum(len(results) for results in sq_results.values())
                sub_question_results.append(sq_with_results)

                # Add a small delay between sub-questions to avoid rate limiting
                if i < len(sub_questions) - 1:
                    await asyncio.sleep(1)

            except Exception as e:
                logger.error(f"Error executing search for sub-question: {str(e)}")
                # Add empty results if there was an error
                sq_with_results = sq.copy()
                sq_with_results['search_results'] = {}
                sq_with_results['search_result_count'] = 0
                sq_with_results['error'] = str(e)
                sub_question_results.append(sq_with_results)

        # Update the structured query with the results
        structured_query['sub_questions'] = sub_question_results

        # Calculate total results
        total_results = sum(sq.get('search_result_count', 0) for sq in sub_question_results)
        logger.info(f"Completed searches for all sub-questions. Total results: {total_results}")

        return structured_query
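
    # For reference, after execute_sub_question_searches() each entry in
    # structured_query['sub_questions'] carries the fields set above (sketch;
    # engine names and counts are placeholders):
    #
    #     {
    #         'sub_question': '...',
    #         'aspect': '...',
    #         'priority': <int>,
    #         'search_queries': {...},
    #         'search_results': {'<engine>': [<result dict>, ...], ...},
    #         'search_result_count': <int>,
    #         'error': '<message>',   # only present if the search raised an exception
    #     }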

    def get_combined_results(self, structured_query: Dict[str, Any]) -> Dict[str, List[Dict[str, Any]]]:
        """
        Get a combined view of results from all sub-questions.

        Args:
            structured_query: The structured query with sub-question search results

        Returns:
            Dictionary mapping search engine names to lists of results
        """
        sub_questions = structured_query.get('sub_questions', [])

        if not sub_questions:
            return {}

        # Dictionary to store combined results
        combined_results = {}

        # Process each sub-question
        for sq in sub_questions:
            sub_q_text = sq.get('sub_question', '')
            aspect = sq.get('aspect', 'unknown')
            priority = sq.get('priority', 3)
            search_results = sq.get('search_results', {})

            # Process results from each engine
            for engine, results in search_results.items():
                if engine not in combined_results:
                    combined_results[engine] = []

                # Add sub-question metadata to each result
                for result in results:
                    if result and isinstance(result, dict):
                        # Only add metadata if it doesn't already exist
                        if 'sub_question' not in result:
                            result['sub_question'] = sub_q_text
                        if 'aspect' not in result:
                            result['aspect'] = aspect
                        if 'priority' not in result:
                            result['priority'] = priority

                        # Add the result to the combined results
                        combined_results[engine].append(result)

        return combined_results

    def prioritize_results(self,
                           combined_results: Dict[str, List[Dict[str, Any]]],
                           max_results_per_engine: int = 10) -> Dict[str, List[Dict[str, Any]]]:
        """
        Prioritize results based on sub-question priority.

        Args:
            combined_results: Combined results from all sub-questions
            max_results_per_engine: Maximum number of results to keep per engine

        Returns:
            Dictionary mapping search engine names to prioritized lists of results
        """
        prioritized_results = {}

        # Process each engine's results
        for engine, results in combined_results.items():
            # Sort results by priority (lower number = higher priority)
            sorted_results = sorted(results, key=lambda r: r.get('priority', 5))

            # Keep only the top N results
            prioritized_results[engine] = sorted_results[:max_results_per_engine]

        return prioritized_results


# Create a singleton instance for global use
sub_question_executor = SubQuestionExecutor()


def get_sub_question_executor() -> SubQuestionExecutor:
    """
    Get the global sub-question executor instance.

    Returns:
        SubQuestionExecutor instance
    """
    return sub_question_executor
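

# Illustrative usage sketch (assumes an upstream decomposition step has produced a
# structured query with a 'sub_questions' list; the parameter values simply mirror
# the method defaults). The helper name is for illustration only.
async def _example_usage(structured_query: Dict[str, Any]) -> Dict[str, List[Dict[str, Any]]]:
    """Run sub-question searches, combine the results, and keep the top-priority ones."""
    executor = get_sub_question_executor()

    # Execute searches for every sub-question (sequentially, with a short delay between them)
    updated_query = await executor.execute_sub_question_searches(
        structured_query,
        num_results_per_engine=5,
        timeout=60,
    )

    # Merge the per-sub-question results into a single dict keyed by engine name
    combined = executor.get_combined_results(updated_query)

    # Keep only the highest-priority results for each engine
    return executor.prioritize_results(combined, max_results_per_engine=10)

# From async code:        results = await _example_usage(structured_query)
# From synchronous code:  results = asyncio.run(_example_usage(structured_query))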