"""
Query decomposition module for the intelligent research system.

This module handles the decomposition of complex queries into sub-questions,
enabling more comprehensive research and better handling of multi-faceted queries.
"""

from typing import Dict, Any, List, Optional
import asyncio
import json
import logging

from .llm_interface import get_llm_interface

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Queries with fewer whitespace-separated words than this are not decomposed.
MIN_WORDS_FOR_DECOMPOSITION = 8

# Priority assigned when the LLM omits the field or returns an unusable value.
DEFAULT_PRIORITY = 3

_BASE_SYSTEM_PROMPT = """You are an expert at breaking down complex research questions into smaller, focused sub-questions.

Your task is to analyze a research query and decompose it into 3-5 distinct sub-questions that, when answered together, will provide a comprehensive response to the original query.

For each sub-question:
1. Focus on a single aspect or component of the original query
2. Make it specific and answerable through targeted search
3. Ensure it contributes unique information to the overall research

Return ONLY a JSON array of objects, where each object has:
- "sub_question": The text of the sub-question
- "aspect": A short phrase (2-4 words) describing what aspect of the original query this addresses
- "priority": A number from 1-5 where 1 is highest priority (most important to answer)

Example output format:
[
  {
    "sub_question": "What are the key components of quantum computing hardware?",
    "aspect": "hardware components",
    "priority": 1
  },
  {
    "sub_question": "How does quantum entanglement enable quantum computing?",
    "aspect": "quantum principles",
    "priority": 2
  }
]
"""

_CURRENT_EVENTS_GUIDANCE = """
Since this is a current events query:
- Include a sub-question about recent developments (last 6 months)
- Include a sub-question about historical context if relevant
- Focus on factual aspects rather than opinions
- Consider different stakeholders involved
"""

_ACADEMIC_GUIDANCE = """
Since this is an academic query:
- Include a sub-question about research methodologies if applicable
- Include a sub-question about competing theories or approaches
- Consider a sub-question about gaps in existing research
- Include a sub-question about practical applications or implications
"""

_COMPARATIVE_GUIDANCE = """
Since this is a comparative query:
- Ensure sub-questions address each item being compared
- Include sub-questions about specific comparison dimensions
- Consider including a sub-question about contexts where one option might be preferred
- Include a sub-question about common misconceptions in the comparison
"""


def _extract_json_array(text: str) -> Optional[List[Any]]:
    """
    Find the first JSON array in ``text`` that contains at least one object.

    The decoder is started at every ``[`` in turn, so brackets inside string
    values, nested arrays and surrounding prose (for example a leading "[1]"
    citation or a Markdown code fence) do not confuse the extraction.

    Args:
        text: Raw LLM response text

    Returns:
        The decoded list, or None when no suitable array is present
    """
    decoder = json.JSONDecoder()
    start = text.find('[')
    while start != -1:
        try:
            candidate, _ = decoder.raw_decode(text, start)
        except json.JSONDecodeError:
            candidate = None
        # Require at least one object so stray arrays such as "[1]" are skipped.
        if isinstance(candidate, list) and any(isinstance(item, dict) for item in candidate):
            return candidate
        start = text.find('[', start + 1)
    return None


def _coerce_priority(value: Any) -> int:
    """
    Convert a priority value returned by the LLM into an integer.

    Integers pass through unchanged. Whole-number floats and numeric strings
    are converted. Anything else, including booleans, yields DEFAULT_PRIORITY.

    Args:
        value: The raw "priority" value from a sub-question object

    Returns:
        An integer priority
    """
    # bool is a subclass of int; True/False is never a meaningful priority.
    if isinstance(value, bool):
        return DEFAULT_PRIORITY
    if isinstance(value, int):
        return value
    if isinstance(value, float) and value.is_integer():
        return int(value)
    if isinstance(value, str):
        try:
            return int(value.strip())
        except ValueError:
            return DEFAULT_PRIORITY
    return DEFAULT_PRIORITY


def _validate_sub_questions(candidates: List[Any]) -> List[Dict[str, Any]]:
    """
    Keep only well-formed sub-question objects and normalise their priority.

    An entry is kept when it is a dict with a non-blank string "sub_question"
    and an "aspect" key. Malformed entries are dropped individually instead of
    invalidating the whole response.

    Args:
        candidates: Decoded JSON array elements

    Returns:
        List of validated sub-question dicts (shallow copies of the input)
    """
    validated = []
    for item in candidates:
        if not isinstance(item, dict):
            continue
        text = item.get('sub_question')
        if not isinstance(text, str) or not text.strip():
            continue
        if 'aspect' not in item:
            continue
        entry = dict(item)
        entry['priority'] = _coerce_priority(item.get('priority'))
        validated.append(entry)
    return validated


class QueryDecomposer:
    """
    Decomposer for complex research queries.

    This class handles breaking down complex queries into sub-questions,
    which can be processed separately and then synthesized into a comprehensive answer.
    """

    def __init__(self):
        """Initialize the query decomposer."""
        self.llm_interface = get_llm_interface()

    async def decompose_query(self, query: str, structured_query: Dict[str, Any]) -> Dict[str, Any]:
        """
        Decompose a complex query into sub-questions.

        The structured query is updated in place and also returned. Short
        queries and code queries are returned untouched.

        Args:
            query: The original user query
            structured_query: The structured query object

        Returns:
            Updated structured query with sub-questions
        """
        # Skip very short queries: decomposition adds little for them
        if len(query.split()) < MIN_WORDS_FOR_DECOMPOSITION:
            logger.info("Query too short for decomposition: %s", query)
            return structured_query

        # Skip decomposition for code queries as they're usually specific
        if structured_query.get('is_code', False):
            logger.info("Skipping decomposition for code query: %s", query)
            return structured_query

        # Generate sub-questions based on the query and its characteristics
        sub_questions = await self._generate_sub_questions(
            query,
            query_type=structured_query.get('type', 'unknown'),
            intent=structured_query.get('intent', 'research'),
            is_current_events=structured_query.get('is_current_events', False),
            is_academic=structured_query.get('is_academic', False),
        )

        # Add the sub-questions to the structured query
        structured_query['sub_questions'] = sub_questions

        # Generate additional search queries for each sub-question
        if sub_questions:
            search_engines = structured_query.get('search_engines', [])
            await self._generate_search_queries_for_sub_questions(structured_query, search_engines)

        return structured_query

    async def _generate_sub_questions(
        self,
        query: str,
        query_type: str = 'unknown',
        intent: str = 'research',
        is_current_events: bool = False,
        is_academic: bool = False
    ) -> List[Dict[str, Any]]:
        """
        Generate sub-questions based on the query and its type.

        This is best-effort: any failure of the LLM call or an unusable
        response is logged and results in an empty list rather than an error.

        Args:
            query: The original user query
            query_type: The type of query (factual, exploratory, comparative)
            intent: The intent of the query (accepted for interface
                compatibility; not currently used when building the prompt)
            is_current_events: Whether the query is about current events
            is_academic: Whether the query is about academic topics

        Returns:
            List of sub-questions, each a dict with "sub_question", "aspect"
            and an integer "priority"
        """
        logger.info("Generating sub-questions for query: %s", query)

        # Tailor additional instructions based on query characteristics
        system_prompt = _BASE_SYSTEM_PROMPT
        if is_current_events:
            system_prompt += _CURRENT_EVENTS_GUIDANCE
        if is_academic:
            system_prompt += _ACADEMIC_GUIDANCE
        if query_type == 'comparative':
            system_prompt += _COMPARATIVE_GUIDANCE

        # Create the prompt for the LLM
        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": f"Please decompose this research query into sub-questions: {query}"}
        ]

        try:
            response = await self.llm_interface.generate_completion(messages)
        except Exception as e:
            # Deliberate best-effort boundary: decomposition is optional.
            logger.error("Error generating sub-questions: %s", e)
            return []

        if not isinstance(response, str):
            logger.error(
                "Error generating sub-questions: unexpected response type %s",
                type(response).__name__,
            )
            return []

        candidates = _extract_json_array(response)
        if candidates is None:
            logger.error("Error generating sub-questions: no JSON array of objects found in LLM response")
            return []

        validated_sub_questions = _validate_sub_questions(candidates)
        logger.info("Generated %d sub-questions for query: %s", len(validated_sub_questions), query)
        return validated_sub_questions

    async def _generate_search_queries_for_sub_questions(
        self,
        structured_query: Dict[str, Any],
        search_engines: List[str]
    ) -> Dict[str, Any]:
        """
        Generate optimized search queries for each sub-question.

        Sub-questions with no text are dropped. The remaining ones are copied,
        given a "search_queries" entry, and written back to
        ``structured_query['sub_questions']``.

        Args:
            structured_query: The structured query containing sub-questions
            search_engines: List of search engines to generate queries for

        Returns:
            Updated structured query with search queries for sub-questions
        """
        sub_questions = structured_query.get('sub_questions', [])
        if not sub_questions:
            return structured_query

        # Structure to hold search queries for each sub-question
        sub_question_search_queries = []

        # Calls are awaited one at a time, in sub-question order.
        for sq in sub_questions:
            sub_q_text = sq.get('sub_question', '')
            if not sub_q_text:
                continue

            # Generate search queries for this sub-question
            search_queries = await self.llm_interface.generate_search_queries(sub_q_text, search_engines)

            # Attach the queries to a copy so the original dict is not mutated
            sq_with_queries = sq.copy()
            sq_with_queries['search_queries'] = search_queries
            sub_question_search_queries.append(sq_with_queries)

        # Update the structured query
        structured_query['sub_questions'] = sub_question_search_queries
        return structured_query


# Create a singleton instance for global use
query_decomposer = QueryDecomposer()


def get_query_decomposer() -> QueryDecomposer:
    """
    Get the global query decomposer instance.

    Returns:
        QueryDecomposer instance
    """
    return query_decomposer