ira/query/query_processor.py

391 lines
17 KiB
Python

"""
Query processor module for the intelligent research system.
This module handles the processing of user queries, including enhancement,
classification, decomposition, and structuring for downstream modules.
"""
import logging
import re
from typing import Any, Dict, List, Optional

from .llm_interface import get_llm_interface
from .query_decomposer import get_query_decomposer
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
class QueryProcessor:
    """
    Processor for user research queries.

    Handles query enhancement, type classification (factual / exploratory /
    comparative), domain classification (academic / code / current_events /
    general), decomposition into sub-questions, and structuring of the
    result for downstream modules.
    """

    # A *secondary* domain classification must reach this confidence before
    # it sets a domain flag; the primary domain always sets its own flag.
    # (Previously any secondary match set the flag unconditionally, which
    # made the threshold below dead code and caused false positives.)
    SECONDARY_DOMAIN_CONFIDENCE_THRESHOLD = 0.3

    # ------------------------------------------------------------------ #
    # Keyword data for the fallback (keyword-based) classifiers.         #
    # Matching is token-anchored (see _keyword_hits), so entries no      #
    # longer match inside unrelated words ('now' vs. 'know').            #
    # ------------------------------------------------------------------ #

    # Time-related cues typical of current-events queries.
    _TIME_KEYWORDS = (
        'recent', 'latest', 'current', 'today', 'yesterday', 'week', 'month',
        'this year', 'breaking', 'news', 'announced', 'election',
        'now', 'trends', 'emerging',
    )
    # Named entities typical of current events.
    _CURRENT_EVENT_ENTITIES = (
        'trump', 'biden', 'president', 'government', 'congress',
        'senate', 'tariffs', 'election', 'policy', 'coronavirus',
        'covid', 'market', 'stocks', 'stock market', 'war',
    )
    # Verbs asking what happened / was announced.
    _ACTION_VERBS = (
        'happen', 'occurred', 'announced', 'said', 'stated', 'declared',
        'launched',
    )

    # Scholarly vocabulary.
    _ACADEMIC_TERMS = (
        'paper', 'study', 'research', 'publication', 'journal', 'article',
        'thesis', 'dissertation', 'scholarly', 'academic', 'literature',
        'published', 'author', 'citation', 'cited', 'references',
        'bibliography', 'doi', 'peer-reviewed', 'peer reviewed',
        'university', 'professor', 'conference', 'proceedings',
    )
    # Research-methodology vocabulary.
    _RESEARCH_METHODS = (
        'methodology', 'experiment', 'hypothesis', 'theoretical', 'empirical',
        'qualitative', 'quantitative', 'data', 'analysis', 'statistical',
        'results', 'findings', 'conclusion', 'meta-analysis',
        'systematic review', 'clinical trial',
    )
    # Academic fields of study.
    _ACADEMIC_FIELDS = (
        'science', 'physics', 'chemistry', 'biology', 'psychology',
        'sociology', 'economics', 'history', 'philosophy', 'engineering',
        'computer science', 'medicine', 'mathematics', 'geology',
        'astronomy', 'linguistics',
    )
    # Common phrasings of academic questions.
    _ACADEMIC_PATTERNS = (
        'what does research say about',
        'what studies show',
        'according to research',
        'scholarly view',
        'academic consensus',
        'published papers on',
        'recent studies on',
        'literature review',
        'research findings',
        'scientific evidence',
    )

    # Programming languages. Matched as whole tokens only: with the old
    # substring matching, the one-letter name 'r' matched nearly every
    # query and 'go' matched words like 'good'.
    _PROGRAMMING_LANGS = (
        'python', 'javascript', 'java', 'c++', 'c#', 'ruby', 'go', 'rust',
        'php', 'swift', 'kotlin', 'typescript', 'perl', 'scala', 'r',
        'html', 'css', 'sql', 'bash', 'powershell', 'dart', 'julia',
    )
    # Frameworks / libraries / tooling ('pytorch' was listed twice in the
    # original; deduplicated so it is only counted once).
    _FRAMEWORKS = (
        'react', 'angular', 'vue', 'django', 'flask', 'spring', 'laravel',
        'express', 'tensorflow', 'pytorch', 'pandas', 'numpy',
        'scikit-learn', 'bootstrap', 'jquery', 'node', 'rails', 'asp.net',
        'unity', 'flutter', 'keras', '.net', 'core', 'maven', 'gradle',
        'npm', 'pip',
    )
    # General programming concepts and terms.
    _PROGRAMMING_TERMS = (
        'algorithm', 'function', 'class', 'method', 'variable', 'object',
        'array', 'string', 'integer', 'boolean', 'list', 'dictionary',
        'hash', 'loop', 'recursion', 'inheritance', 'interface', 'api',
        'rest', 'json', 'xml', 'database', 'query', 'schema', 'framework',
        'library', 'package', 'module', 'dependency', 'bug', 'error',
        'exception', 'debugging', 'compiler', 'runtime', 'syntax',
        'parameter', 'argument', 'return', 'value', 'reference', 'pointer',
        'memory', 'stack', 'heap', 'thread', 'async', 'await', 'promise',
        'callback', 'event', 'listener', 'handler', 'middleware',
        'frontend', 'backend', 'fullstack', 'devops', 'ci/cd', 'docker',
        'kubernetes', 'git', 'github', 'bitbucket', 'gitlab',
    )
    # Common phrasings of programming questions.
    _CODE_PATTERNS = (
        'how to code', 'how do i program', 'how to program',
        'how to implement', 'code example', 'example code', 'code snippet',
        'write a function', 'write a program', 'debugging', 'error message',
        'getting error', 'code review', 'refactor', 'optimize',
        'performance issue', 'best practice', 'design pattern',
        'architecture', 'software design', 'algorithm for',
        'data structure', 'time complexity', 'space complexity', 'big o',
        'optimize code', 'refactor code', 'clean code', 'technical debt',
        'unit test', 'integration test', 'test coverage', 'mock', 'stub',
    )

    def __init__(self):
        """Initialize the query processor and its collaborators."""
        self.llm_interface = get_llm_interface()
        self.query_decomposer = get_query_decomposer()

    @staticmethod
    def _keyword_hits(text: str, keywords, whole_word: bool = False) -> int:
        """
        Count how many of ``keywords`` occur in ``text``.

        Every keyword must start at a token boundary, so 'now' no longer
        matches inside 'know'. By default a match may continue into a longer
        word ('paper' still matches 'papers', 'happen' still matches
        'happened'); with ``whole_word=True`` it must also end at a token
        boundary, which is required for short language names like 'r'.

        Args:
            text: Lowercased text to search.
            keywords: Iterable of lowercase keywords or phrases.
            whole_word: Require the match to end at a token boundary too.

        Returns:
            Number of distinct keywords found in the text.
        """
        tail = r'(?![a-z0-9])' if whole_word else ''
        return sum(
            1
            for keyword in keywords
            if re.search(r'(?<![a-z0-9])' + re.escape(keyword) + tail, text)
        )

    async def process_query(self, query: str) -> Dict[str, Any]:
        """
        Process a user query end-to-end.

        Enhances the query, classifies its type and domain, structures it,
        and attempts to decompose it into sub-questions.

        Args:
            query: The raw user query.

        Returns:
            Dictionary containing the processed query information.
        """
        logger.info("Processing query: %s", query)

        # Enhance the query for better downstream retrieval.
        enhanced_query = await self.llm_interface.enhance_query(query)
        logger.info("Enhanced query: %s", enhanced_query)

        # Classify the query type (factual, exploratory, comparative).
        query_type_classification = await self.llm_interface.classify_query(query)
        logger.info("Query type classification: %s", query_type_classification)

        # Classify the query domain (academic, code, current_events, general).
        domain_classification = await self.llm_interface.classify_query_domain(query)
        logger.info("Query domain classification: %s", domain_classification)

        # Log classification details for monitoring ("or []" also guards a
        # None value, which the old truthiness check allowed through).
        for sec_type in domain_classification.get('secondary_types') or []:
            logger.info("Secondary domain: %s confidence=%s",
                        sec_type.get('type'), sec_type.get('confidence'))
        logger.info("Classification reasoning: %s",
                    domain_classification.get('reasoning', 'None provided'))

        try:
            # Structure the query using the LLM classification results.
            structured_query = self._structure_query_with_llm(
                query, enhanced_query, query_type_classification, domain_classification)
        except Exception as e:
            # Boundary fallback: any failure in the LLM-based path degrades
            # to the keyword-based classifier instead of failing the request.
            # logger.exception also records the traceback.
            logger.exception(
                "LLM domain classification failed: %s. Falling back to keyword-based classification.", e)
            structured_query = self._structure_query(
                query, enhanced_query, query_type_classification)

        # Decompose the query into sub-questions (if complex enough).
        structured_query = await self.query_decomposer.decompose_query(query, structured_query)

        sub_questions = structured_query.get('sub_questions')
        if sub_questions:
            logger.info("Decomposed into %d sub-questions", len(sub_questions))
        else:
            logger.info("Query was not decomposed into sub-questions")

        return structured_query

    def _structure_query_with_llm(self, original_query: str, enhanced_query: str,
                                  type_classification: Dict[str, Any],
                                  domain_classification: Dict[str, Any]) -> Dict[str, Any]:
        """
        Structure a query using LLM classification results.

        A domain flag (is_academic / is_code / is_current_events) is set when
        the domain is the primary classification, or appears as a secondary
        classification with confidence >= SECONDARY_DOMAIN_CONFIDENCE_THRESHOLD.
        (The original set the flag for *any* secondary match, so the
        threshold comparison could never change the outcome.)

        Args:
            original_query: The original user query.
            enhanced_query: The enhanced query.
            type_classification: Query-type classification (factual,
                exploratory, comparative).
            domain_classification: Query-domain classification (academic,
                code, current_events, general).

        Returns:
            Dictionary containing the structured query.
        """
        primary_domain = domain_classification.get('primary_type', 'general')
        primary_confidence = domain_classification.get('confidence', 0.5)
        secondary_domains = domain_classification.get('secondary_types', [])

        def domain_flag(domain: str) -> bool:
            # Primary always counts; secondary only above the threshold, to
            # avoid false positives from low-confidence classifications.
            if primary_domain == domain:
                return True
            return any(
                d.get('type') == domain
                and d.get('confidence', 0.0) >= self.SECONDARY_DOMAIN_CONFIDENCE_THRESHOLD
                for d in secondary_domains
            )

        return {
            'original_query': original_query,
            'enhanced_query': enhanced_query,
            'type': type_classification.get('type', 'unknown'),
            'intent': type_classification.get('intent', 'research'),
            'entities': type_classification.get('entities', []),
            'domain': primary_domain,
            'domain_confidence': primary_confidence,
            'secondary_domains': secondary_domains,
            'classification_reasoning': domain_classification.get('reasoning', ''),
            'timestamp': None,  # Will be filled in by the caller.
            'is_current_events': domain_flag('current_events'),
            'is_academic': domain_flag('academic'),
            'is_code': domain_flag('code'),
            'metadata': {
                'type_classification': type_classification,
                'domain_classification': domain_classification,
            },
        }

    def _structure_query(self, original_query: str, enhanced_query: str,
                         classification: Dict[str, Any]) -> Dict[str, Any]:
        """
        Structure a query for downstream modules using keyword-based
        classification. This is the fallback when LLM classification fails.

        Args:
            original_query: The original user query.
            enhanced_query: The enhanced query.
            classification: The query-type classification.

        Returns:
            Dictionary containing the structured query.
        """
        # Detect domains with the keyword-based heuristics below.
        is_current_events = self._is_current_events_query(original_query, classification)
        is_academic = self._is_academic_query(original_query, classification)
        is_code = self._is_code_query(original_query, classification)

        return {
            'original_query': original_query,
            'enhanced_query': enhanced_query,
            'type': classification.get('type', 'unknown'),
            'intent': classification.get('intent', 'research'),
            'entities': classification.get('entities', []),
            'timestamp': None,  # Will be filled in by the caller.
            'is_current_events': is_current_events,
            'is_academic': is_academic,
            'is_code': is_code,
            'metadata': {
                'classification': classification,
                'classification_method': 'keyword'  # Indicate this used the keyword-based method
            },
        }

    def _is_current_events_query(self, query: str, classification: Dict[str, Any]) -> bool:
        """
        Determine if a query is related to current events.

        Args:
            query: The original user query.
            classification: The query classification (currently unused; kept
                for interface symmetry with the other detectors).

        Returns:
            True if the query is about current events, False otherwise.
        """
        query_lower = query.lower()
        time_keyword_count = self._keyword_hits(query_lower, self._TIME_KEYWORDS)
        entity_count = self._keyword_hits(query_lower, self._CURRENT_EVENT_ENTITIES)
        verb_matches = self._keyword_hits(query_lower, self._ACTION_VERBS)

        # Heuristic: a time cue plus an entity, or a strong signal in any
        # single category.
        return (
            (time_keyword_count >= 1 and entity_count >= 1)
            or time_keyword_count >= 2
            or entity_count >= 2
            or verb_matches >= 1
        )

    def _is_academic_query(self, query: str, classification: Dict[str, Any]) -> bool:
        """
        Determine if a query is related to academic or scholarly research.

        Args:
            query: The original user query.
            classification: The query classification (currently unused; kept
                for interface symmetry with the other detectors).

        Returns:
            True if the query is about academic research, False otherwise.
        """
        query_lower = query.lower()
        academic_term_count = self._keyword_hits(query_lower, self._ACADEMIC_TERMS)
        method_count = self._keyword_hits(query_lower, self._RESEARCH_METHODS)
        field_count = self._keyword_hits(query_lower, self._ACADEMIC_FIELDS)
        pattern_matches = self._keyword_hits(query_lower, self._ACADEMIC_PATTERNS)

        # Academic if several scholarly terms appear, a known academic
        # phrasing is used, or terms combine with methods/fields.
        return (
            academic_term_count >= 2
            or pattern_matches >= 1
            or (academic_term_count >= 1 and (method_count >= 1 or field_count >= 1))
            or (method_count >= 1 and field_count >= 1)
        )

    def _is_code_query(self, query: str, classification: Dict[str, Any]) -> bool:
        """
        Determine if a query is related to programming or code.

        Args:
            query: The original user query.
            classification: The query classification (currently unused; kept
                for interface symmetry with the other detectors).

        Returns:
            True if the query is about programming or code, False otherwise.
        """
        query_lower = query.lower()
        # Language names must match as whole tokens; substring matching made
        # the single-letter name 'r' match almost every query.
        lang_count = self._keyword_hits(query_lower, self._PROGRAMMING_LANGS, whole_word=True)
        framework_count = self._keyword_hits(query_lower, self._FRAMEWORKS)
        term_count = self._keyword_hits(query_lower, self._PROGRAMMING_TERMS)
        pattern_count = self._keyword_hits(query_lower, self._CODE_PATTERNS)

        # A fenced block or an indented line suggests the query embeds code.
        # (The original tested line.strip().startswith(' '), which is always
        # False after strip(); test the raw line instead.)
        contains_code_block = '```' in query or any(
            line.startswith(('    ', '\t')) for line in query.split('\n')
        )

        return (
            lang_count >= 1
            or framework_count >= 1
            or term_count >= 2
            or pattern_count >= 1
            or contains_code_block
            or (lang_count + framework_count + term_count >= 2)
        )

    async def generate_search_queries(self, structured_query: Dict[str, Any],
                                      search_engines: List[str]) -> Dict[str, Any]:
        """
        Generate optimized search queries for different search engines.

        Args:
            structured_query: The structured query; must contain
                'enhanced_query' (a missing key is a caller bug and raises
                KeyError).
            search_engines: List of search engines to generate queries for.

        Returns:
            The structured query, updated in place with a 'search_queries'
            key.
        """
        enhanced_query = structured_query['enhanced_query']
        structured_query['search_queries'] = await self.llm_interface.generate_search_queries(
            enhanced_query, search_engines
        )
        return structured_query
# Module-level singleton so every consumer shares a single processor.
query_processor = QueryProcessor()


def get_query_processor() -> QueryProcessor:
    """Return the shared global QueryProcessor instance."""
    return query_processor