# ira/query/query_processor.py

"""
Query processor module for the intelligent research system.

This module handles the processing of user queries, including enhancement,
classification, and structuring for downstream modules.
"""

import re
from typing import Dict, Any, List, Optional

from .llm_interface import get_llm_interface


class QueryProcessor:
    """
    Processor for user research queries.

    This class handles the processing of user queries, including enhancement,
    classification, and structuring for downstream modules.

    Enhancement, classification and search-query generation are delegated to
    the LLM interface; the ``is_current_events`` / ``is_academic`` /
    ``is_code`` flags are computed locally with keyword heuristics.
    """

    def __init__(self):
        """Initialize the query processor with the result of get_llm_interface()."""
        self.llm_interface = get_llm_interface()

    @staticmethod
    def _count_matches(text: str, terms: List[str], inflect: bool = True) -> int:
        """
        Count how many entries of ``terms`` occur in ``text`` as whole words.

        A plain ``term in text`` test also fires inside unrelated words
        ('now' in 'know', 'war' in 'software', 'r' in almost anything), so
        each term is matched with letter boundaries instead. Digits and
        punctuation count as boundaries, so 'python3' and 'covid-19' still
        match 'python' and 'covid'.

        Args:
            text: The text to search; expected to be lower-cased already
            terms: Lower-case words or phrases to look for
            inflect: If True, a term that ends in a letter or digit may also
                be followed by a common English ending (plural 's'/'es',
                past 'ed'/'d', 'ing', 'ly'), so 'happen' still matches
                'happened' and 'paper' still matches 'papers'. Use False for
                proper names such as languages and frameworks.

        Returns:
            Number of entries in ``terms`` found at least once in ``text``
        """
        letter = r'[^\W\d_]'  # any single Unicode letter
        not_after_letter = '(?<!' + letter + ')'
        not_before_letter = '(?!' + letter + ')'

        count = 0
        for term in terms:
            if not term:
                continue
            pattern = re.escape(term)
            # Only anchor edges that are alphanumeric; terms such as '.net'
            # or 'c++' begin/end with punctuation that is itself a boundary.
            if term[0].isalnum():
                pattern = not_after_letter + pattern
            if term[-1].isalnum():
                if inflect:
                    past = 'd' if term.endswith('e') else 'ed'
                    pattern += '(?:s|es|' + past + '|ing|ly)?'
                pattern += not_before_letter
            if re.search(pattern, text):
                count += 1
        return count

    async def process_query(self, query: str) -> Dict[str, Any]:
        """
        Process a user query.

        The query is enhanced and classified through the LLM interface and
        the results are combined into a structured dictionary.

        Args:
            query: The raw user query

        Returns:
            Dictionary containing the processed query information
            (see ``_structure_query`` for the keys)
        """
        # Enhance the query
        enhanced_query = await self.llm_interface.enhance_query(query)

        # Classify the original (not the enhanced) query
        classification = await self.llm_interface.classify_query(query)

        # Structure the query for downstream modules
        return self._structure_query(query, enhanced_query, classification)

    def _structure_query(self, original_query: str, enhanced_query: str,
                         classification: Dict[str, Any]) -> Dict[str, Any]:
        """
        Structure a query for downstream modules.

        Args:
            original_query: The original user query
            enhanced_query: The enhanced query
            classification: The query classification; the 'type', 'intent'
                and 'entities' keys are read, each with a fallback default

        Returns:
            Dictionary containing the structured query
        """
        # Detect query types (heuristics run on the original query text)
        is_current_events = self._is_current_events_query(original_query, classification)
        is_academic = self._is_academic_query(original_query, classification)
        is_code = self._is_code_query(original_query, classification)

        return {
            'original_query': original_query,
            'enhanced_query': enhanced_query,
            'type': classification.get('type', 'unknown'),
            'intent': classification.get('intent', 'research'),
            'entities': classification.get('entities', []),
            'timestamp': None,  # Will be filled in by the caller
            'is_current_events': is_current_events,
            'is_academic': is_academic,
            'is_code': is_code,
            'metadata': {
                'classification': classification
            }
        }

    def _is_current_events_query(self, query: str, classification: Dict[str, Any]) -> bool:
        """
        Determine if a query is related to current events.

        Args:
            query: The original user query
            classification: The query classification (currently unused)

        Returns:
            True if the query is about current events, False otherwise
        """
        # Time-related keywords
        time_keywords = ['recent', 'latest', 'current', 'today', 'yesterday', 'week', 'month',
                         'this year', 'breaking', 'news', 'announced', 'election',
                         'now', 'trends', 'emerging']

        # Named entities typical of current events
        current_event_entities = ['trump', 'biden', 'president', 'government', 'congress',
                                  'senate', 'tariffs', 'election', 'policy', 'coronavirus',
                                  'covid', 'market', 'stocks', 'stock market', 'war']

        # Verbs used when asking what is happening or what happened
        action_verbs = ['happen', 'occurred', 'announced', 'said', 'stated', 'declared', 'launched']

        query_lower = query.lower()

        time_keyword_count = self._count_matches(query_lower, time_keywords)
        entity_count = self._count_matches(query_lower, current_event_entities)
        verb_matches = self._count_matches(query_lower, action_verbs)

        # Current events if: a time keyword together with an entity, two or
        # more time keywords, two or more entities, or any action verb.
        return (
            (time_keyword_count >= 1 and entity_count >= 1) or
            time_keyword_count >= 2 or
            entity_count >= 2 or
            verb_matches >= 1
        )

    def _is_academic_query(self, query: str, classification: Dict[str, Any]) -> bool:
        """
        Determine if a query is related to academic or scholarly research.

        Args:
            query: The original user query
            classification: The query classification (currently unused)

        Returns:
            True if the query is about academic research, False otherwise
        """
        query_lower = query.lower()

        # Academic terms
        academic_terms = [
            'paper', 'study', 'research', 'publication', 'journal', 'article', 'thesis',
            'dissertation', 'scholarly', 'academic', 'literature', 'published', 'author',
            'citation', 'cited', 'references', 'bibliography', 'doi', 'peer-reviewed',
            'peer reviewed', 'university', 'professor', 'conference', 'proceedings'
        ]

        # Research methodologies
        methods = [
            'methodology', 'experiment', 'hypothesis', 'theoretical', 'empirical',
            'qualitative', 'quantitative', 'data', 'analysis', 'statistical', 'results',
            'findings', 'conclusion', 'meta-analysis', 'systematic review', 'clinical trial'
        ]

        # Academic fields
        fields = [
            'science', 'physics', 'chemistry', 'biology', 'psychology', 'sociology',
            'economics', 'history', 'philosophy', 'engineering', 'computer science',
            'medicine', 'mathematics', 'geology', 'astronomy', 'linguistics'
        ]

        # Common academic question patterns
        academic_patterns = [
            'what does research say about',
            'what studies show',
            'according to research',
            'scholarly view',
            'academic consensus',
            'published papers on',
            'recent studies on',
            'literature review',
            'research findings',
            'scientific evidence'
        ]

        academic_term_count = self._count_matches(query_lower, academic_terms)
        method_count = self._count_matches(query_lower, methods)
        field_count = self._count_matches(query_lower, fields)
        pattern_matches = self._count_matches(query_lower, academic_patterns)

        # Academic if: two or more academic terms, any question pattern, an
        # academic term plus a method or field, or a method plus a field.
        return (
            academic_term_count >= 2 or
            pattern_matches >= 1 or
            (academic_term_count >= 1 and (method_count >= 1 or field_count >= 1)) or
            (method_count >= 1 and field_count >= 1)
        )

    def _is_code_query(self, query: str, classification: Dict[str, Any]) -> bool:
        """
        Determine if a query is related to programming or code.

        Args:
            query: The original user query
            classification: The query classification (currently unused)

        Returns:
            True if the query is about programming or code, False otherwise
        """
        query_lower = query.lower()

        # Programming languages and technologies.
        # NOTE: several names are also everyday words ('go', 'swift', 'r');
        # they only count when they appear as a standalone word.
        programming_langs = [
            'python', 'javascript', 'java', 'c++', 'c#', 'ruby', 'go', 'rust',
            'php', 'swift', 'kotlin', 'typescript', 'perl', 'scala', 'r',
            'html', 'css', 'sql', 'bash', 'powershell', 'dart', 'julia'
        ]

        # Programming frameworks and libraries
        frameworks = [
            'react', 'angular', 'vue', 'django', 'flask', 'spring', 'laravel',
            'express', 'tensorflow', 'pytorch', 'pandas', 'numpy', 'scikit-learn',
            'bootstrap', 'jquery', 'node', 'rails', 'asp.net', 'unity', 'flutter',
            'keras', '.net', 'core', 'maven', 'gradle', 'npm', 'pip'
        ]

        # Programming concepts and terms
        programming_terms = [
            'algorithm', 'function', 'class', 'method', 'variable', 'object', 'array',
            'string', 'integer', 'boolean', 'list', 'dictionary', 'hash', 'loop',
            'recursion', 'inheritance', 'interface', 'api', 'rest', 'json', 'xml',
            'database', 'query', 'schema', 'framework', 'library', 'package', 'module',
            'dependency', 'bug', 'error', 'exception', 'debugging', 'compiler', 'runtime',
            'syntax', 'parameter', 'argument', 'return', 'value', 'reference', 'pointer',
            'memory', 'stack', 'heap', 'thread', 'async', 'await', 'promise', 'callback',
            'event', 'listener', 'handler', 'middleware', 'frontend', 'backend', 'fullstack',
            'devops', 'ci/cd', 'docker', 'kubernetes', 'git', 'github', 'bitbucket', 'gitlab'
        ]

        # Programming question patterns
        code_patterns = [
            'how to code', 'how do i program', 'how to program', 'how to implement',
            'code example', 'example code', 'code snippet', 'write a function',
            'write a program', 'debugging', 'error message', 'getting error',
            'code review', 'refactor', 'optimize', 'performance issue',
            'best practice', 'design pattern', 'architecture', 'software design',
            'algorithm for', 'data structure', 'time complexity', 'space complexity',
            'big o', 'optimize code', 'refactor code', 'clean code', 'technical debt',
            'unit test', 'integration test', 'test coverage', 'mock', 'stub'
        ]

        # Language and framework names are matched exactly (no inflections):
        # 'go' must not match 'goes' or 'going'.
        lang_count = self._count_matches(query_lower, programming_langs, inflect=False)
        framework_count = self._count_matches(query_lower, frameworks, inflect=False)
        term_count = self._count_matches(query_lower, programming_terms)
        pattern_count = self._count_matches(query_lower, code_patterns)

        # A fenced block (```) or any non-blank line indented by four spaces
        # or a tab is treated as embedded code. The indentation must be
        # checked on the raw line: stripping it first would remove exactly
        # the whitespace being tested for.
        contains_code_block = '```' in query or any(
            line.startswith(('    ', '\t')) and bool(line.strip())
            for line in query.split('\n')
        )

        # Code-related if: any language, any framework, two or more general
        # programming terms, any question pattern, or an embedded code block.
        return (
            lang_count >= 1 or
            framework_count >= 1 or
            term_count >= 2 or
            pattern_count >= 1 or
            contains_code_block
        )

    async def generate_search_queries(self, structured_query: Dict[str, Any],
                               search_engines: List[str]) -> Dict[str, Any]:
        """
        Generate optimized search queries for different search engines.

        The given dictionary is updated in place (a 'search_queries' key is
        added) and the same object is returned.

        Args:
            structured_query: The structured query; must contain 'enhanced_query'
            search_engines: List of search engines to generate queries for

        Returns:
            Updated structured query with search queries

        Raises:
            KeyError: If ``structured_query`` has no 'enhanced_query' key
        """
        # Use the enhanced query for generating search queries
        enhanced_query = structured_query['enhanced_query']

        # Generate search queries for each engine
        search_queries = await self.llm_interface.generate_search_queries(
            enhanced_query, search_engines
        )

        # Add search queries to the structured query
        structured_query['search_queries'] = search_queries

        return structured_query


# Create a singleton instance for global use.
# NOTE: this runs at import time, so importing this module calls
# get_llm_interface() through QueryProcessor.__init__.
query_processor = QueryProcessor()


def get_query_processor() -> QueryProcessor:
    """
    Get the global query processor instance.

    Every call returns the same module-level ``query_processor`` object;
    no new instance is created here.

    Returns:
        QueryProcessor instance
    """
    return query_processor