# ira/query/query_decomposer.py

"""
Query decomposition module for the intelligent research system.
This module handles the decomposition of complex queries into sub-questions,
enabling more comprehensive research and better handling of multi-faceted queries.
"""
import asyncio
import json
import logging
import re
from typing import Dict, Any, List, Optional

from .llm_interface import get_llm_interface

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)


class QueryDecomposer:
    """
    Decomposer for complex research queries.

    This class handles breaking down complex queries into sub-questions,
    which can be processed separately and then synthesized into a comprehensive answer.
    """

    def __init__(self):
        """Initialize the query decomposer."""
        self.llm_interface = get_llm_interface()
    async def decompose_query(self, query: str, structured_query: Dict[str, Any]) -> Dict[str, Any]:
        """
        Decompose a complex query into sub-questions.

        Args:
            query: The original user query
            structured_query: The structured query object

        Returns:
            Updated structured query with sub-questions
        """
        # Skip decomposition for simple queries or for query types where it isn't helpful
        if len(query.split()) < 8:  # Skip very short queries
            logger.info(f"Query too short for decomposition: {query}")
            return structured_query

        # Skip decomposition for code queries, as they are usually specific
        if structured_query.get('is_code', False):
            logger.info(f"Skipping decomposition for code query: {query}")
            return structured_query

        # Get query characteristics from the structured query
        query_type = structured_query.get('type', 'unknown')
        intent = structured_query.get('intent', 'research')
        is_current_events = structured_query.get('is_current_events', False)
        is_academic = structured_query.get('is_academic', False)

        # Generate sub-questions based on the query and its type
        sub_questions = await self._generate_sub_questions(
            query,
            query_type=query_type,
            intent=intent,
            is_current_events=is_current_events,
            is_academic=is_academic
        )

        # Add the sub-questions to the structured query
        structured_query['sub_questions'] = sub_questions

        # Generate additional search queries for each sub-question
        if sub_questions:
            search_engines = structured_query.get('search_engines', [])
            await self._generate_search_queries_for_sub_questions(structured_query, search_engines)

        return structured_query
    async def _generate_sub_questions(
        self,
        query: str,
        query_type: str = 'unknown',
        intent: str = 'research',
        is_current_events: bool = False,
        is_academic: bool = False
    ) -> List[Dict[str, Any]]:
        """
        Generate sub-questions based on the query and its type.

        Args:
            query: The original user query
            query_type: The type of query (factual, exploratory, comparative)
            intent: The intent of the query
            is_current_events: Whether the query is about current events
            is_academic: Whether the query is about academic topics

        Returns:
            List of sub-questions
        """
        logger.info(f"Generating sub-questions for query: {query}")

        # Create the base prompt; it is extended below based on the query's characteristics
        system_prompt = """You are an expert at breaking down complex research questions into smaller, focused sub-questions.

Your task is to analyze a research query and decompose it into 3-5 distinct sub-questions that, when answered together, will provide a comprehensive response to the original query.

For each sub-question:
1. Focus on a single aspect or component of the original query
2. Make it specific and answerable through targeted search
3. Ensure it contributes unique information to the overall research

Return ONLY a JSON array of objects, where each object has:
- "sub_question": The text of the sub-question
- "aspect": A short phrase (2-4 words) describing what aspect of the original query this addresses
- "priority": A number from 1-5 where 1 is highest priority (most important to answer)

Example output format:
[
    {
        "sub_question": "What are the key components of quantum computing hardware?",
        "aspect": "hardware components",
        "priority": 1
    },
    {
        "sub_question": "How does quantum entanglement enable quantum computing?",
        "aspect": "quantum principles",
        "priority": 2
    }
]
"""
        # Tailor additional instructions based on query characteristics
        if is_current_events:
            system_prompt += """
Since this is a current events query:
- Include a sub-question about recent developments (last 6 months)
- Include a sub-question about historical context if relevant
- Focus on factual aspects rather than opinions
- Consider different stakeholders involved
"""

        if is_academic:
            system_prompt += """
Since this is an academic query:
- Include a sub-question about research methodologies if applicable
- Include a sub-question about competing theories or approaches
- Consider a sub-question about gaps in existing research
- Include a sub-question about practical applications or implications
"""

        if query_type == 'comparative':
            system_prompt += """
Since this is a comparative query:
- Ensure sub-questions address each item being compared
- Include sub-questions about specific comparison dimensions
- Consider including a sub-question about contexts where one option might be preferred
- Include a sub-question about common misconceptions in the comparison
"""
        # Create the prompt for the LLM
        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": f"Please decompose this research query into sub-questions: {query}"}
        ]

        # Generate sub-questions
        try:
            response = await self.llm_interface.generate_completion(messages)

            # Parse the response as JSON; extract the JSON array in case the model
            # wraps it in extra text - look for anything between [ and ]
            json_match = re.search(r'\[(.*?)\]', response, re.DOTALL)
            if json_match:
                response = f"[{json_match.group(1)}]"
            sub_questions = json.loads(response)

            # Validate the structure of each sub-question
            validated_sub_questions = []
            for sq in sub_questions:
                if 'sub_question' in sq and 'aspect' in sq:
                    # Ensure priority is an integer
                    if 'priority' not in sq or not isinstance(sq['priority'], int):
                        sq['priority'] = 3  # Default medium priority
                    validated_sub_questions.append(sq)

            logger.info(f"Generated {len(validated_sub_questions)} sub-questions for query: {query}")
            return validated_sub_questions
        except Exception as e:
            logger.error(f"Error generating sub-questions: {str(e)}")
            return []
    async def _generate_search_queries_for_sub_questions(
        self,
        structured_query: Dict[str, Any],
        search_engines: List[str]
    ) -> Dict[str, Any]:
        """
        Generate optimized search queries for each sub-question.

        Args:
            structured_query: The structured query containing sub-questions
            search_engines: List of search engines to generate queries for

        Returns:
            Updated structured query with search queries for sub-questions
        """
        sub_questions = structured_query.get('sub_questions', [])
        if not sub_questions:
            return structured_query

        # Structure to hold search queries for each sub-question
        sub_question_search_queries = []

        # Process each sub-question
        for sq in sub_questions:
            sub_q_text = sq.get('sub_question', '')
            if not sub_q_text:
                continue

            # Generate search queries for this sub-question
            search_queries = await self.llm_interface.generate_search_queries(sub_q_text, search_engines)

            # Add the search queries to a copy of the sub-question
            sq_with_queries = sq.copy()
            sq_with_queries['search_queries'] = search_queries
            sub_question_search_queries.append(sq_with_queries)

        # Update the structured query
        structured_query['sub_questions'] = sub_question_search_queries
        return structured_query

# Create a singleton instance for global use
query_decomposer = QueryDecomposer()


def get_query_decomposer() -> QueryDecomposer:
    """
    Get the global query decomposer instance.

    Returns:
        QueryDecomposer instance
    """
    return query_decomposer
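

if __name__ == "__main__":
    # Minimal usage sketch, not part of the module's public API. It assumes the
    # package is importable (e.g. run as `python -m ira.query.query_decomposer`)
    # and that the llm_interface module is configured with working credentials.
    # The structured_query fields mirror the keys read by decompose_query above;
    # the engine names and query values are purely illustrative.
    async def _demo() -> None:
        decomposer = get_query_decomposer()
        example_query = (
            "How do solid-state batteries compare to lithium-ion batteries "
            "for electric vehicles?"
        )
        example_structured_query = {
            "original_query": example_query,
            "type": "comparative",
            "intent": "research",
            "is_code": False,
            "is_current_events": False,
            "is_academic": True,
            "search_engines": ["google", "arxiv"],  # hypothetical engine names
        }
        result = await decomposer.decompose_query(example_query, example_structured_query)
        # Print the generated sub-questions with their priority and aspect
        for sq in result.get("sub_questions", []):
            print(sq.get("priority"), sq.get("aspect"), "-", sq.get("sub_question"))

    asyncio.run(_demo())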