"""
Query processor module for the intelligent research system.

This module handles the processing of user queries, including enhancement,
classification, and structuring for downstream modules.
"""
|
|
|
|
import re
from typing import Dict, Any, List, Optional

from .llm_interface import get_llm_interface
|
|
|
|
|
|
class QueryProcessor:
    """
    Processor for user research queries.

    Enhances and classifies a raw query through the LLM interface, tags it
    with keyword-based heuristics (current events / academic / code), and
    packages everything into a dictionary for downstream modules.
    """

    # Keyword tables used by the heuristic detectors. They are class-level
    # tuples so they are built once rather than on every call.
    _TIME_KEYWORDS = (
        'recent', 'latest', 'current', 'today', 'yesterday', 'week', 'month',
        'this year', 'breaking', 'news', 'announced', 'election',
        'now', 'trends', 'emerging',
    )
    _CURRENT_EVENT_ENTITIES = (
        'trump', 'biden', 'president', 'government', 'congress',
        'senate', 'tariffs', 'election', 'policy', 'coronavirus',
        'covid', 'market', 'stocks', 'stock market', 'war',
    )
    _ACTION_VERBS = (
        'happen', 'occurred', 'announced', 'said', 'stated', 'declared',
        'launched',
    )

    _ACADEMIC_TERMS = (
        'paper', 'study', 'research', 'publication', 'journal', 'article',
        'thesis', 'dissertation', 'scholarly', 'academic', 'literature',
        'published', 'author', 'citation', 'cited', 'references',
        'bibliography', 'doi', 'peer-reviewed', 'peer reviewed', 'university',
        'professor', 'conference', 'proceedings',
    )
    _RESEARCH_METHODS = (
        'methodology', 'experiment', 'hypothesis', 'theoretical', 'empirical',
        'qualitative', 'quantitative', 'data', 'analysis', 'statistical',
        'results', 'findings', 'conclusion', 'meta-analysis',
        'systematic review', 'clinical trial',
    )
    _ACADEMIC_FIELDS = (
        'science', 'physics', 'chemistry', 'biology', 'psychology',
        'sociology', 'economics', 'history', 'philosophy', 'engineering',
        'computer science', 'medicine', 'mathematics', 'geology', 'astronomy',
        'linguistics',
    )
    _ACADEMIC_PATTERNS = (
        'what does research say about',
        'what studies show',
        'according to research',
        'scholarly view',
        'academic consensus',
        'published papers on',
        'recent studies on',
        'literature review',
        'research findings',
        'scientific evidence',
    )

    _PROGRAMMING_LANGS = (
        'python', 'javascript', 'java', 'c++', 'c#', 'ruby', 'go', 'rust',
        'php', 'swift', 'kotlin', 'typescript', 'perl', 'scala', 'r',
        'html', 'css', 'sql', 'bash', 'powershell', 'dart', 'julia',
    )
    _FRAMEWORKS = (
        'react', 'angular', 'vue', 'django', 'flask', 'spring', 'laravel',
        'express', 'tensorflow', 'pytorch', 'pandas', 'numpy', 'scikit-learn',
        'bootstrap', 'jquery', 'node', 'rails', 'asp.net', 'unity', 'flutter',
        'keras', '.net', 'core', 'maven', 'gradle', 'npm', 'pip',
    )
    _PROGRAMMING_TERMS = (
        'algorithm', 'function', 'class', 'method', 'variable', 'object',
        'array', 'string', 'integer', 'boolean', 'list', 'dictionary', 'hash',
        'loop', 'recursion', 'inheritance', 'interface', 'api', 'rest', 'json',
        'xml', 'database', 'query', 'schema', 'framework', 'library',
        'package', 'module', 'dependency', 'bug', 'error', 'exception',
        'debugging', 'compiler', 'runtime', 'syntax', 'parameter', 'argument',
        'return', 'value', 'reference', 'pointer', 'memory', 'stack', 'heap',
        'thread', 'async', 'await', 'promise', 'callback', 'event', 'listener',
        'handler', 'middleware', 'frontend', 'backend', 'fullstack', 'devops',
        'ci/cd', 'docker', 'kubernetes', 'git', 'github', 'bitbucket',
        'gitlab',
    )
    _CODE_PATTERNS = (
        'how to code', 'how do i program', 'how to program',
        'how to implement', 'code example', 'example code', 'code snippet',
        'write a function', 'write a program', 'debugging', 'error message',
        'getting error', 'code review', 'refactor', 'optimize',
        'performance issue', 'best practice', 'design pattern',
        'architecture', 'software design', 'algorithm for', 'data structure',
        'time complexity', 'space complexity', 'big o', 'optimize code',
        'refactor code', 'clean code', 'technical debt', 'unit test',
        'integration test', 'test coverage', 'mock', 'stub',
    )

    # Compiled pattern per keyword, filled lazily by _term_pattern().
    _pattern_cache: Dict[str, Any] = {}

    def __init__(self):
        """Initialize the query processor."""
        self.llm_interface = get_llm_interface()

    @classmethod
    def _term_pattern(cls, term: str):
        """
        Return the compiled regex used to look for ``term`` in a query.

        The term must start and end on a word boundary so that short
        keywords such as 'r', 'go', 'now' or 'war' no longer match inside
        unrelated words ("river", "google", "know", "software"). Terms of
        three or more characters also accept a simple inflection suffix
        (s, es, ed, ing) so 'happen' still matches "happened" and 'paper'
        still matches "papers". Boundaries are only enforced on edges that
        are alphanumeric, so 'c++' still matches "c++17".

        Args:
            term: A lowercase keyword or phrase

        Returns:
            Compiled regular expression for the term
        """
        compiled = cls._pattern_cache.get(term)
        if compiled is None:
            body = re.escape(term)
            if term[0].isalnum():
                body = r"(?<!\w)" + body
            if term[-1].isalnum():
                if len(term) >= 3:
                    body += r"(?:s|es|ed|ing)?"
                body += r"(?!\w)"
            compiled = cls._pattern_cache[term] = re.compile(body)
        return compiled

    @classmethod
    def _count_matches(cls, text: str, terms) -> int:
        """
        Count how many distinct entries of ``terms`` occur in ``text``.

        Args:
            text: Lowercased query text
            terms: Iterable of lowercase keywords or phrases

        Returns:
            Number of terms that occur at least once in the text
        """
        return sum(1 for term in terms if cls._term_pattern(term).search(text))

    async def process_query(self, query: str) -> Dict[str, Any]:
        """
        Process a user query.

        Args:
            query: The raw user query

        Returns:
            Dictionary containing the processed query information
        """
        # Both LLM calls receive the raw query; the enhanced text is only
        # carried along in the structured result.
        enhanced_query = await self.llm_interface.enhance_query(query)
        classification = await self.llm_interface.classify_query(query)

        # Structure the query for downstream modules
        return self._structure_query(query, enhanced_query, classification)

    def _structure_query(self, original_query: str, enhanced_query: str,
                         classification: Dict[str, Any]) -> Dict[str, Any]:
        """
        Structure a query for downstream modules.

        Args:
            original_query: The original user query
            enhanced_query: The enhanced query
            classification: The query classification

        Returns:
            Dictionary containing the structured query. 'type', 'intent' and
            'entities' fall back to 'unknown', 'research' and [] when the
            classification does not provide them.
        """
        # The heuristic flags are computed from the original (not enhanced)
        # query text.
        is_current_events = self._is_current_events_query(original_query, classification)
        is_academic = self._is_academic_query(original_query, classification)
        is_code = self._is_code_query(original_query, classification)

        return {
            'original_query': original_query,
            'enhanced_query': enhanced_query,
            'type': classification.get('type', 'unknown'),
            'intent': classification.get('intent', 'research'),
            'entities': classification.get('entities', []),
            'timestamp': None,  # Will be filled in by the caller
            'is_current_events': is_current_events,
            'is_academic': is_academic,
            'is_code': is_code,
            'metadata': {
                'classification': classification
            }
        }

    def _is_current_events_query(self, query: str, classification: Dict[str, Any]) -> bool:
        """
        Determine if a query is related to current events.

        Args:
            query: The original user query
            classification: The query classification (currently unused by
                this heuristic)

        Returns:
            True if the query is about current events, False otherwise
        """
        query_lower = query.lower()

        time_keyword_count = self._count_matches(query_lower, self._TIME_KEYWORDS)
        entity_count = self._count_matches(query_lower, self._CURRENT_EVENT_ENTITIES)
        # Verbs that ask what is happening or what happened
        verb_matches = self._count_matches(query_lower, self._ACTION_VERBS)

        # Either a time keyword together with an entity, two of either kind,
        # or any single action verb.
        return (
            (time_keyword_count >= 1 and entity_count >= 1)
            or time_keyword_count >= 2
            or entity_count >= 2
            or verb_matches >= 1
        )

    def _is_academic_query(self, query: str, classification: Dict[str, Any]) -> bool:
        """
        Determine if a query is related to academic or scholarly research.

        Args:
            query: The original user query
            classification: The query classification (currently unused by
                this heuristic)

        Returns:
            True if the query is about academic research, False otherwise
        """
        query_lower = query.lower()

        academic_term_count = self._count_matches(query_lower, self._ACADEMIC_TERMS)
        method_count = self._count_matches(query_lower, self._RESEARCH_METHODS)
        field_count = self._count_matches(query_lower, self._ACADEMIC_FIELDS)
        pattern_matches = self._count_matches(query_lower, self._ACADEMIC_PATTERNS)

        # Either multiple academic terms, a known question pattern, or a
        # combination of terms, methods, and fields.
        return (
            academic_term_count >= 2
            or pattern_matches >= 1
            or (academic_term_count >= 1 and (method_count >= 1 or field_count >= 1))
            or (method_count >= 1 and field_count >= 1)
        )

    def _is_code_query(self, query: str, classification: Dict[str, Any]) -> bool:
        """
        Determine if a query is related to programming or code.

        Args:
            query: The original user query
            classification: The query classification (currently unused by
                this heuristic)

        Returns:
            True if the query is about programming or code, False otherwise
        """
        query_lower = query.lower()

        lang_count = self._count_matches(query_lower, self._PROGRAMMING_LANGS)
        framework_count = self._count_matches(query_lower, self._FRAMEWORKS)
        term_count = self._count_matches(query_lower, self._PROGRAMMING_TERMS)
        pattern_count = self._count_matches(query_lower, self._CODE_PATTERNS)

        # A fenced block, or any non-blank line indented by four spaces or a
        # tab, is treated as embedded code. The indent is tested on the raw
        # line: stripping it first would remove the very indent being checked.
        contains_code_block = '```' in query or any(
            line.startswith(('    ', '\t')) and bool(line.strip())
            for line in query.split('\n')
        )

        # One language/framework/pattern hit, two generic programming terms,
        # or embedded code is enough.
        return (
            lang_count >= 1
            or framework_count >= 1
            or term_count >= 2
            or pattern_count >= 1
            or contains_code_block
        )

    async def generate_search_queries(self, structured_query: Dict[str, Any],
                                      search_engines: List[str]) -> Dict[str, Any]:
        """
        Generate optimized search queries for different search engines.

        Args:
            structured_query: The structured query; must contain
                'enhanced_query'
            search_engines: List of search engines to generate queries for

        Returns:
            The same structured query dictionary, updated in place with a
            'search_queries' entry
        """
        # Use the enhanced query for generating search queries
        enhanced_query = structured_query['enhanced_query']

        search_queries = await self.llm_interface.generate_search_queries(
            enhanced_query, search_engines
        )

        # Add search queries to the structured query
        structured_query['search_queries'] = search_queries

        return structured_query
|
|
|
|
|
|
# Shared module-level instance, constructed once when this module is imported
# and handed out by get_query_processor().
query_processor: QueryProcessor = QueryProcessor()
|
|
|
|
|
|
def get_query_processor() -> QueryProcessor:
    """
    Return the shared, module-level query processor.

    Returns:
        The QueryProcessor instance bound to ``query_processor``
    """
    return query_processor
|