# ira/query/query_decomposer.py

"""
Query decomposition module for the intelligent research system.
This module handles the decomposition of complex queries into sub-questions,
enabling more comprehensive research and better handling of multi-faceted queries.
"""
import asyncio
import json
import logging
import re
from typing import Dict, Any, List, Optional

from .llm_interface import get_llm_interface

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)


class QueryDecomposer:
    """
    Decomposer for complex research queries.

    This class handles breaking down complex queries into sub-questions,
    which can be processed separately and then synthesized into a comprehensive answer.
    """

    def __init__(self):
        """Initialize the query decomposer."""
        self.llm_interface = get_llm_interface()
    async def decompose_query(self, query: str, structured_query: Dict[str, Any]) -> Dict[str, Any]:
        """
        Decompose a complex query into sub-questions.

        Args:
            query: The original user query
            structured_query: The structured query object

        Returns:
            Updated structured query with sub-questions
        """
        # Skip decomposition for simple queries or for query types where it isn't helpful
        if len(query.split()) < 8:  # Skip very short queries
            logger.info(f"Query too short for decomposition: {query}")
            return structured_query

        # Skip decomposition for code queries, as they are usually specific
        if structured_query.get('is_code', False):
            logger.info(f"Skipping decomposition for code query: {query}")
            return structured_query

        # Get query characteristics from the structured query
        query_type = structured_query.get('type', 'unknown')
        intent = structured_query.get('intent', 'research')
        is_current_events = structured_query.get('is_current_events', False)
        is_academic = structured_query.get('is_academic', False)

        # Generate sub-questions based on the query and its type
        sub_questions = await self._generate_sub_questions(
            query,
            query_type=query_type,
            intent=intent,
            is_current_events=is_current_events,
            is_academic=is_academic
        )

        # Add the sub-questions to the structured query
        structured_query['sub_questions'] = sub_questions

        # Generate additional search queries for each sub-question
        if sub_questions:
            search_engines = structured_query.get('search_engines', [])
            await self._generate_search_queries_for_sub_questions(structured_query, search_engines)

        return structured_query
    async def _generate_sub_questions(
        self,
        query: str,
        query_type: str = 'unknown',
        intent: str = 'research',
        is_current_events: bool = False,
        is_academic: bool = False
    ) -> List[Dict[str, Any]]:
        """
        Generate sub-questions based on the query and its type.

        Args:
            query: The original user query
            query_type: The type of query (factual, exploratory, comparative)
            intent: The intent of the query
            is_current_events: Whether the query is about current events
            is_academic: Whether the query is about academic topics

        Returns:
            List of sub-questions
        """
        logger.info(f"Generating sub-questions for query: {query}")

        # Create the base prompt; it is extended below based on the query's characteristics
        system_prompt = """You are an expert at breaking down complex research questions into smaller, focused sub-questions.

Your task is to analyze a research query and decompose it into 3-5 distinct sub-questions that, when answered together, will provide a comprehensive response to the original query.

For each sub-question:
1. Focus on a single aspect or component of the original query
2. Make it specific and answerable through targeted search
3. Ensure it contributes unique information to the overall research

Return ONLY a JSON array of objects, where each object has:
- "sub_question": The text of the sub-question
- "aspect": A short phrase (2-4 words) describing what aspect of the original query this addresses
- "priority": A number from 1-5 where 1 is highest priority (most important to answer)

Example output format:
[
    {
        "sub_question": "What are the key components of quantum computing hardware?",
        "aspect": "hardware components",
        "priority": 1
    },
    {
        "sub_question": "How does quantum entanglement enable quantum computing?",
        "aspect": "quantum principles",
        "priority": 2
    }
]
"""
        # Tailor additional instructions based on query characteristics
        if is_current_events:
            system_prompt += """
Since this is a current events query:
- Include a sub-question about recent developments (last 6 months)
- Include a sub-question about historical context if relevant
- Focus on factual aspects rather than opinions
- Consider different stakeholders involved
"""

        if is_academic:
            system_prompt += """
Since this is an academic query:
- Include a sub-question about research methodologies if applicable
- Include a sub-question about competing theories or approaches
- Consider a sub-question about gaps in existing research
- Include a sub-question about practical applications or implications
"""

        if query_type == 'comparative':
            system_prompt += """
Since this is a comparative query:
- Ensure sub-questions address each item being compared
- Include sub-questions about specific comparison dimensions
- Consider including a sub-question about contexts where one option might be preferred
- Include a sub-question about common misconceptions in the comparison
"""
        # Create the prompt for the LLM
        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": f"Please decompose this research query into sub-questions: {query}"}
        ]

        # Generate sub-questions
        try:
            response = await self.llm_interface.generate_completion(messages)

            # Parse the response as JSON; extract the JSON array in case the model
            # wraps it in extra text - look for anything between [ and ]
            json_match = re.search(r'\[(.*?)\]', response, re.DOTALL)
            if json_match:
                response = f"[{json_match.group(1)}]"
            sub_questions = json.loads(response)

            # Validate the structure of each sub-question
            validated_sub_questions = []
            for sq in sub_questions:
                if 'sub_question' in sq and 'aspect' in sq:
                    # Ensure priority is an integer
                    if 'priority' not in sq or not isinstance(sq['priority'], int):
                        sq['priority'] = 3  # Default medium priority
                    validated_sub_questions.append(sq)

            logger.info(f"Generated {len(validated_sub_questions)} sub-questions for query: {query}")
            return validated_sub_questions
        except Exception as e:
            logger.error(f"Error generating sub-questions: {str(e)}")
            return []
    async def _generate_search_queries_for_sub_questions(
        self,
        structured_query: Dict[str, Any],
        search_engines: List[str]
    ) -> Dict[str, Any]:
        """
        Generate optimized search queries for each sub-question.

        Args:
            structured_query: The structured query containing sub-questions
            search_engines: List of search engines to generate queries for

        Returns:
            Updated structured query with search queries for sub-questions
        """
        sub_questions = structured_query.get('sub_questions', [])
        if not sub_questions:
            return structured_query

        # Structure to hold search queries for each sub-question
        sub_question_search_queries = []

        # Process each sub-question
        for sq in sub_questions:
            sub_q_text = sq.get('sub_question', '')
            if not sub_q_text:
                continue

            # Generate search queries for this sub-question
            search_queries = await self.llm_interface.generate_search_queries(sub_q_text, search_engines)

            # Add the search queries to a copy of the sub-question
            sq_with_queries = sq.copy()
            sq_with_queries['search_queries'] = search_queries
            sub_question_search_queries.append(sq_with_queries)

        # Update the structured query
        structured_query['sub_questions'] = sub_question_search_queries
        return structured_query

# Create a singleton instance for global use
query_decomposer = QueryDecomposer()


def get_query_decomposer() -> QueryDecomposer:
    """
    Get the global query decomposer instance.

    Returns:
        QueryDecomposer instance
    """
    return query_decomposer
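

if __name__ == "__main__":
    # Minimal usage sketch, not part of the module's public API. It assumes the
    # package is importable (e.g. run as `python -m ira.query.query_decomposer`)
    # and that the llm_interface module is configured with working credentials.
    # The structured_query fields mirror the keys read by decompose_query above;
    # the engine names and query values are purely illustrative.
    async def _demo() -> None:
        decomposer = get_query_decomposer()
        example_query = (
            "How do solid-state batteries compare to lithium-ion batteries "
            "for electric vehicles?"
        )
        example_structured_query = {
            "original_query": example_query,
            "type": "comparative",
            "intent": "research",
            "is_code": False,
            "is_current_events": False,
            "is_academic": True,
            "search_engines": ["google", "arxiv"],  # hypothetical engine names
        }
        result = await decomposer.decompose_query(example_query, example_structured_query)
        # Print the generated sub-questions with their priority and aspect
        for sq in result.get("sub_questions", []):
            print(sq.get("priority"), sq.get("aspect"), "-", sq.get("sub_question"))

    asyncio.run(_demo())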