"""
|
|
Query decomposition module for the intelligent research system.
|
|
|
|
This module handles the decomposition of complex queries into sub-questions,
|
|
enabling more comprehensive research and better handling of multi-faceted queries.
|
|
"""
|
|
|
|
import asyncio
import json
import logging
import re
from typing import Any, Dict, List

from .llm_interface import get_llm_interface

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)


class QueryDecomposer:
    """
    Decomposer for complex research queries.

    This class handles breaking down complex queries into sub-questions,
    which can be processed separately and then synthesized into a comprehensive answer.
    """

    def __init__(self):
        """Initialize the query decomposer."""
        self.llm_interface = get_llm_interface()

    async def decompose_query(self, query: str, structured_query: Dict[str, Any]) -> Dict[str, Any]:
        """
        Decompose a complex query into sub-questions.

        Args:
            query: The original user query
            structured_query: The structured query object

        Returns:
            Updated structured query with sub-questions
        """
        # Skip decomposition for very short queries, where it isn't helpful
        if len(query.split()) < 8:
            logger.info(f"Query too short for decomposition: {query}")
            return structured_query

        # Skip decomposition for code queries as they're usually specific
        if structured_query.get('is_code', False):
            logger.info(f"Skipping decomposition for code query: {query}")
            return structured_query

        # Get query type and characteristics from the structured query
        query_type = structured_query.get('type', 'unknown')
        intent = structured_query.get('intent', 'research')
        is_current_events = structured_query.get('is_current_events', False)
        is_academic = structured_query.get('is_academic', False)

        # Generate sub-questions based on the query and its type
        sub_questions = await self._generate_sub_questions(
            query,
            query_type=query_type,
            intent=intent,
            is_current_events=is_current_events,
            is_academic=is_academic
        )

        # Add the sub-questions to the structured query
        structured_query['sub_questions'] = sub_questions

        # Generate additional search queries for each sub-question
        if sub_questions:
            search_engines = structured_query.get('search_engines', [])
            await self._generate_search_queries_for_sub_questions(structured_query, search_engines)

        return structured_query

    async def _generate_sub_questions(
        self,
        query: str,
        query_type: str = 'unknown',
        intent: str = 'research',
        is_current_events: bool = False,
        is_academic: bool = False
    ) -> List[Dict[str, Any]]:
        """
        Generate sub-questions based on the query and its type.

        Args:
            query: The original user query
            query_type: The type of query (factual, exploratory, comparative)
            intent: The intent of the query
            is_current_events: Whether the query is about current events
            is_academic: Whether the query is about academic topics

        Returns:
            List of sub-questions
        """
logger.info(f"Generating sub-questions for query: {query}")
|
|
|
|
# Create prompt based on query type and characteristics
|
|
system_prompt = """You are an expert at breaking down complex research questions into smaller, focused sub-questions.
|
|
|
|
Your task is to analyze a research query and decompose it into 3-5 distinct sub-questions that, when answered together, will provide a comprehensive response to the original query.
|
|
|
|
For each sub-question:
|
|
1. Focus on a single aspect or component of the original query
|
|
2. Make it specific and answerable through targeted search
|
|
3. Ensure it contributes unique information to the overall research
|
|
|
|
Return ONLY a JSON array of objects, where each object has:
|
|
- "sub_question": The text of the sub-question
|
|
- "aspect": A short phrase (2-4 words) describing what aspect of the original query this addresses
|
|
- "priority": A number from 1-5 where 1 is highest priority (most important to answer)
|
|
|
|
Example output format:
|
|
[
|
|
{
|
|
"sub_question": "What are the key components of quantum computing hardware?",
|
|
"aspect": "hardware components",
|
|
"priority": 1
|
|
},
|
|
{
|
|
"sub_question": "How does quantum entanglement enable quantum computing?",
|
|
"aspect": "quantum principles",
|
|
"priority": 2
|
|
}
|
|
]
|
|
"""
|
|
|
|
# Tailor additional instructions based on query characteristics
|
|
if is_current_events:
|
|
system_prompt += """
|
|
Since this is a current events query:
|
|
- Include a sub-question about recent developments (last 6 months)
|
|
- Include a sub-question about historical context if relevant
|
|
- Focus on factual aspects rather than opinions
|
|
- Consider different stakeholders involved
|
|
"""
|
|
|
|
if is_academic:
|
|
system_prompt += """
|
|
Since this is an academic query:
|
|
- Include a sub-question about research methodologies if applicable
|
|
- Include a sub-question about competing theories or approaches
|
|
- Consider a sub-question about gaps in existing research
|
|
- Include a sub-question about practical applications or implications
|
|
"""
|
|
|
|
if query_type == 'comparative':
|
|
system_prompt += """
|
|
Since this is a comparative query:
|
|
- Ensure sub-questions address each item being compared
|
|
- Include sub-questions about specific comparison dimensions
|
|
- Consider including a sub-question about contexts where one option might be preferred
|
|
- Include a sub-question about common misconceptions in the comparison
|
|
"""
|
|
|
|
# Create the prompt for the LLM
|
|
messages = [
|
|
{"role": "system", "content": system_prompt},
|
|
{"role": "user", "content": f"Please decompose this research query into sub-questions: {query}"}
|
|
]
|
|
|
|
        # Generate sub-questions
        try:
            response = await self.llm_interface.generate_completion(messages)

            # Parse the response as JSON. Find the JSON array in the response -
            # capture everything between the first '[' and the last ']' so that
            # brackets inside sub-question text don't truncate the match.
            json_match = re.search(r'\[(.*)\]', response, re.DOTALL)
            if json_match:
                response = f"[{json_match.group(1)}]"

            sub_questions = json.loads(response)

            # Validate the structure of each sub-question
            validated_sub_questions = []
            for sq in sub_questions:
                if 'sub_question' in sq and 'aspect' in sq:
                    # Ensure priority is an integer
                    if 'priority' not in sq or not isinstance(sq['priority'], int):
                        sq['priority'] = 3  # Default medium priority
                    validated_sub_questions.append(sq)

            logger.info(f"Generated {len(validated_sub_questions)} sub-questions for query: {query}")
            return validated_sub_questions
        except Exception as e:
            logger.error(f"Error generating sub-questions: {str(e)}")
            return []

    async def _generate_search_queries_for_sub_questions(
        self,
        structured_query: Dict[str, Any],
        search_engines: List[str]
    ) -> Dict[str, Any]:
        """
        Generate optimized search queries for each sub-question.

        Args:
            structured_query: The structured query containing sub-questions
            search_engines: List of search engines to generate queries for

        Returns:
            Updated structured query with search queries for sub-questions
        """
        sub_questions = structured_query.get('sub_questions', [])
        if not sub_questions:
            return structured_query

        # Structure to hold search queries for each sub-question
        sub_question_search_queries = []

        # Process each sub-question
        for sq in sub_questions:
            sub_q_text = sq.get('sub_question', '')
            if not sub_q_text:
                continue

            # Generate search queries for this sub-question
            search_queries = await self.llm_interface.generate_search_queries(sub_q_text, search_engines)

            # Add search queries to the sub-question
            sq_with_queries = sq.copy()
            sq_with_queries['search_queries'] = search_queries
            sub_question_search_queries.append(sq_with_queries)

        # Update the structured query
        structured_query['sub_questions'] = sub_question_search_queries

        return structured_query


# Create a singleton instance for global use
query_decomposer = QueryDecomposer()


def get_query_decomposer() -> QueryDecomposer:
    """
    Get the global query decomposer instance.

    Returns:
        QueryDecomposer instance
    """
    return query_decomposer
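

# --- Usage sketch (illustrative only, not part of the module API) ---
# A minimal example of how the decomposer is expected to be driven by the rest
# of the pipeline. The structured_query fields shown ('type', 'intent',
# 'is_current_events', 'is_academic', 'search_engines') mirror the keys read by
# decompose_query(); the example values are assumptions. Running this requires
# executing the module within its package (relative import above) and a
# configured LLM interface from get_llm_interface().
if __name__ == "__main__":
    async def _demo():
        decomposer = get_query_decomposer()
        structured_query = {
            'type': 'comparative',                   # assumed example value
            'intent': 'research',
            'is_current_events': False,
            'is_academic': True,
            'search_engines': ['google', 'arxiv'],   # assumed example engines
        }
        result = await decomposer.decompose_query(
            "How do transformer and recurrent architectures compare for long-document summarization?",
            structured_query,
        )
        for sq in result.get('sub_questions', []):
            print(sq.get('priority'), '-', sq.get('sub_question'))

    asyncio.run(_demo())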