diff --git a/.note/current_focus.md b/.note/current_focus.md index 090cd7e..cf1644c 100644 --- a/.note/current_focus.md +++ b/.note/current_focus.md @@ -47,6 +47,18 @@ - Tested the reranking functionality with the `JinaReranker` class - Checked that the report generation process works with the new structure +### Query Type Selection in Gradio UI +- ✅ Added a dropdown menu for query type selection in the "Generate Report" tab +- ✅ Included options for "auto-detect", "factual", "exploratory", and "comparative" +- ✅ Added descriptive tooltips explaining each query type +- ✅ Set "auto-detect" as the default option +- ✅ Modified the `generate_report` method in the `GradioInterface` class to handle the new query_type parameter +- ✅ Updated the report button click handler to pass the query type to the generate_report method +- ✅ Updated the `generate_report` method in the `ReportGenerator` class to accept a query_type parameter +- ✅ Modified the report synthesizer calls to pass the query_type parameter +- ✅ Added a "Query Types" section to the Gradio UI explaining each query type +- ✅ Committed changes with message "Add query type selection to Gradio UI and improve report generation" + ## Next Steps 1. Run comprehensive tests to ensure all functionality works with the new directory structure @@ -75,11 +87,20 @@ - Estimated difficulty: Easy to Moderate (2-3 days of work) 2. **UI Improvements**: - - **Add Chunk Processing Progress Indicators**: - - Modify the `report_synthesis.py` file to add logging during the map phase of the map-reduce process - - Add a counter variable to track which chunk is being processed - - Use the existing logging infrastructure to output progress messages in the UI - - Estimated difficulty: Easy (15-30 minutes of work) + - ✅ **Add Chunk Processing Progress Indicators**: + - ✅ Added a `set_progress_callback` method to the `ReportGenerator` class + - ✅ Implemented progress tracking in both standard and progressive report synthesizers + - ✅ Updated the Gradio UI to display progress during report generation + - ✅ Fixed issues with progress reporting in the UI + - ✅ Ensured proper initialization of the report generator in the UI + - ✅ Added proper error handling for progress updates + + - ✅ **Add Query Type Selection**: + - ✅ Added a dropdown menu for query type selection in the "Generate Report" tab + - ✅ Included options for "auto-detect", "factual", "exploratory", "comparative", and "code" + - ✅ Added descriptive tooltips explaining each query type + - ✅ Modified the report generation logic to handle the selected query type + - ✅ Added documentation to help users understand when to use each query type 3. **Visualization Components**: - Identify common data types in reports that would benefit from visualization @@ -96,8 +117,9 @@ - Implementing report versioning and comparison 2. **Integration with UI**: - - Adding report generation options to the UI - - Implementing progress indicators for document scraping and report generation + - ✅ Adding report generation options to the UI + - ✅ Implementing progress indicators for document scraping and report generation + - ✅ Adding query type selection to the UI - Creating visualization components for generated reports - Adding options to customize report generation parameters @@ -111,11 +133,11 @@ 1. 
**Report Templates Implementation**: - ✅ Created a dedicated `report_templates.py` module with a comprehensive template system - - ✅ Implemented `QueryType` enum for categorizing queries (FACTUAL, EXPLORATORY, COMPARATIVE) + - ✅ Implemented `QueryType` enum for categorizing queries (FACTUAL, EXPLORATORY, COMPARATIVE, CODE) - ✅ Created `DetailLevel` enum for different report detail levels (BRIEF, STANDARD, DETAILED, COMPREHENSIVE) - ✅ Designed a `ReportTemplate` class with validation for required sections - ✅ Implemented a `ReportTemplateManager` to manage and retrieve templates - - ✅ Created 12 different templates (3 query types × 4 detail levels) + - ✅ Created 16 different templates (4 query types × 4 detail levels) - ✅ Added testing with `test_report_templates.py` and `test_brief_report.py` - ✅ Updated memory bank documentation with template system details @@ -127,6 +149,12 @@ - ✅ Improved error handling in template retrieval with fallback to standard templates - ✅ Added better logging for template retrieval process +3. **UI Enhancements**: + - ✅ Added progress tracking for report generation + - ✅ Added query type selection dropdown + - ✅ Added documentation for query types and detail levels + - ✅ Improved error handling in the UI + ### Next Steps 1. **Further Refinement of Report Templates**: @@ -173,7 +201,20 @@ - ✅ Implemented optimization for token usage and processing efficiency - ✅ Fine-tuned prompts and parameters based on testing results -3. **Visualization Components**: +3. **Query Type Selection Enhancement**: + - ✅ Added query type selection dropdown to the UI + - ✅ Implemented handling of user-selected query types in the report generation process + - ✅ Added documentation to help users understand when to use each query type + - ✅ Added CODE as a new query type with specialized templates at all detail levels + - ✅ Implemented code query detection with language, framework, and pattern recognition + - ✅ Added GitHub and StackExchange search handlers for code-related queries + - ⏳ Test the query type selection with various queries to ensure it works correctly + - ⏳ Gather user feedback on the usefulness of manual query type selection + - ⏳ Consider adding more specialized templates for specific query types + - ⏳ Explore adding query type detection confidence scores to help users decide when to override + - ⏳ Add examples of each query type to help users understand the differences + +4. 
**Visualization Components**: - Identify common data types in reports that would benefit from visualization - Design and implement visualization components for these data types - Integrate visualization components into the report generation process @@ -194,3 +235,14 @@ - Tracks improvement scores to detect diminishing returns - Adapts batch size based on model context window - Provides progress tracking through callback mechanism +- Added query type selection to the UI: + - Allows users to explicitly select the query type (factual, exploratory, comparative, code) + - Provides auto-detect option for convenience + - Includes documentation to help users understand when to use each query type + - Passes the selected query type through the report generation pipeline +- Implemented specialized code query support: + - Added GitHub API for searching code repositories + - Added StackExchange API for programming Q&A content + - Created code detection based on programming languages, frameworks, and patterns + - Designed specialized report templates for code content with syntax highlighting + - Enhanced result ranking to prioritize code-related sources for programming queries diff --git a/README.md b/README.md index 13033df..85aa912 100644 --- a/README.md +++ b/README.md @@ -13,7 +13,12 @@ This system automates the research process by: ## Features - **Query Processing**: Enhances user queries with additional context and classifies them by type and intent -- **Multi-Source Search**: Executes searches across Serper (Google), Google Scholar, and arXiv +- **Multi-Source Search**: Executes searches across general web (Serper/Google), academic sources, and current news +- **Specialized Search Handlers**: + - **Current Events**: Optimized news search for recent developments + - **Academic Research**: Specialized academic search with OpenAlex, CORE, arXiv, and Google Scholar + - **Open Access Detection**: Finds freely available versions of paywalled papers using Unpaywall + - **Code/Programming**: Specialized code search using GitHub and StackExchange - **Intelligent Ranking**: Uses Jina AI's Re-Ranker to prioritize the most relevant results - **Result Deduplication**: Removes duplicate results across different search engines - **Modular Architecture**: Easily extensible with new search engines and LLM providers @@ -24,7 +29,7 @@ This system automates the research process by: - **Search Executor**: Executes searches across multiple engines - **Result Collector**: Processes and organizes search results - **Document Ranker**: Ranks documents by relevance -- **Report Generator**: Synthesizes information into a coherent report (coming soon) +- **Report Generator**: Synthesizes information into coherent reports with specialized templates for different query types ## Getting Started @@ -33,8 +38,13 @@ This system automates the research process by: - Python 3.8+ - API keys for: - Serper API (for Google and Scholar search) + - NewsAPI (for current events search) + - CORE API (for open access academic search) + - GitHub API (for code search) + - StackExchange API (for programming Q&A content) - Groq (or other LLM provider) - Jina AI (for reranking) + - Email for OpenAlex and Unpaywall (recommended but not required) ### Installation @@ -58,8 +68,11 @@ cp config/config.yaml.example config/config.yaml ```yaml api_keys: serper: "your-serper-api-key" + newsapi: "your-newsapi-key" groq: "your-groq-api-key" jina: "your-jina-api-key" + github: "your-github-api-key" + stackexchange: "your-stackexchange-api-key" ``` ### Usage 
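As a rough illustration of how the new code-aware search path can be exercised (a minimal sketch mirroring `tests/execution/test_all_handlers.py` rather than the project's documented usage; it assumes the repository root is on `PYTHONPATH` and that `config/config.yaml` has been populated with the relevant API keys):

```python
# Sketch: route a programming question through the new code-aware search path.
# Assumes config/config.yaml exists and the repo root is importable.
from execution.search_executor import SearchExecutor

executor = SearchExecutor()
results = executor.execute_search({
    'raw_query': 'implement merge sort in python',
    'enhanced_query': 'implement merge sort algorithm in python',
    'is_code': True,  # triggers the GitHub/StackExchange handlers added in this change
})

# Results come back grouped per engine, e.g. 'github', 'stackexchange', plus a general engine.
for engine, items in results.items():
    print(f"{engine}: {len(items)} results")
```

Setting `is_code` selects the GitHub and StackExchange handlers (plus one general engine as backup) before ranking and deduplication are applied.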
@@ -135,4 +148,10 @@ This project is licensed under the MIT License - see the LICENSE file for detail - [Jina AI](https://jina.ai/) for their embedding and reranking APIs - [Serper](https://serper.dev/) for their Google search API +- [NewsAPI](https://newsapi.org/) for their news search API +- [OpenAlex](https://openalex.org/) for their academic search API +- [CORE](https://core.ac.uk/) for their open access academic search API +- [Unpaywall](https://unpaywall.org/) for their open access discovery API - [Groq](https://groq.com/) for their fast LLM inference +- [GitHub](https://github.com/) for their code search API +- [StackExchange](https://stackexchange.com/) for their programming Q&A API diff --git a/config/config.yaml.example b/config/config.yaml.example index 8ea4d47..887317e 100644 --- a/config/config.yaml.example +++ b/config/config.yaml.example @@ -10,6 +10,10 @@ api_keys: anthropic: "your-anthropic-api-key" # Or set ANTHROPIC_API_KEY environment variable openrouter: "your-openrouter-api-key" # Or set OPENROUTER_API_KEY environment variable groq: "your-groq-api-key" # Or set GROQ_API_KEY environment variable + newsapi: "your-newsapi-key" # Or set NEWSAPI_API_KEY environment variable + core: "your-core-api-key" # Or set CORE_API_KEY environment variable + github: "your-github-api-key" # Or set GITHUB_API_KEY environment variable + stackexchange: "your-stackexchange-api-key" # Or set STACKEXCHANGE_API_KEY environment variable # LLM model configurations models: @@ -128,6 +132,35 @@ search_engines: arxiv: enabled: false max_results: 5 + + news: + enabled: true + max_results: 10 + days_back: 7 + use_headlines: false # Set to true to use top headlines endpoint + country: "us" # Country code for top headlines + language: "en" # Language code + + openalex: + enabled: true + max_results: 10 + filter_open_access: false # Set to true to only return open access publications + + core: + enabled: true + max_results: 10 + full_text: true # Set to true to search in full text of papers + + github: + enabled: true + max_results: 10 + sort: "best_match" # Options: best_match, stars, forks, updated + + stackexchange: + enabled: true + max_results: 10 + site: "stackoverflow" # Default site (stackoverflow, serverfault, superuser, etc.) + sort: "relevance" # Options: relevance, votes, creation, activity # Jina AI specific configurations jina: @@ -143,6 +176,22 @@ ui: title: "Intelligent Research System" description: "An automated system for finding, filtering, and synthesizing information" +# Academic search settings +academic_search: + email: "user@example.com" # Used for Unpaywall and OpenAlex APIs + + # OpenAlex settings + openalex: + default_sort: "relevance_score:desc" # Other options: cited_by_count:desc, publication_date:desc + + # Unpaywall settings + unpaywall: + # No specific settings needed + + # CORE settings + core: + # No specific settings needed + # System settings system: cache_dir: "data/cache" diff --git a/execution/api_handlers/github_handler.py b/execution/api_handlers/github_handler.py new file mode 100644 index 0000000..c95284e --- /dev/null +++ b/execution/api_handlers/github_handler.py @@ -0,0 +1,206 @@ +""" +GitHub API handler for code search. + +This module implements a search handler for GitHub's API, +allowing code searches across GitHub repositories. 
+""" + +import os +import requests +from typing import Dict, List, Any, Optional + +from config.config import get_config +from ..api_handlers.base_handler import BaseSearchHandler + + +class GitHubSearchHandler(BaseSearchHandler): + """Handler for GitHub code search.""" + + def __init__(self): + """Initialize the GitHub search handler.""" + self.config = get_config() + self.api_key = os.environ.get('GITHUB_API_KEY') or self.config.config_data.get('api_keys', {}).get('github') + self.api_url = "https://api.github.com" + self.search_endpoint = "/search/code" + self.user_agent = "SimSearch-Research-Assistant" + + def search(self, query: str, num_results: int = 10, **kwargs) -> List[Dict[str, Any]]: + """ + Execute a code search on GitHub. + + Args: + query: The search query + num_results: Number of results to return + **kwargs: Additional search parameters + - language: Filter by programming language + - sort: Sort by (indexed, stars, forks, updated) + - order: Sort order (asc, desc) + + Returns: + List of search results + """ + if not self.is_available(): + return [] + + # Prepare query parameters + params = { + "q": query, + "per_page": min(num_results, 30), # GitHub API limit + "page": 1 + } + + # Add optional parameters + if kwargs.get("language"): + params["q"] += f" language:{kwargs['language']}" + if kwargs.get("sort"): + params["sort"] = kwargs["sort"] + if kwargs.get("order"): + params["order"] = kwargs["order"] + + # Set up headers + headers = { + "Authorization": f"token {self.api_key}", + "Accept": "application/vnd.github.v3+json", + "User-Agent": self.user_agent + } + + try: + # Make the API request + response = requests.get( + f"{self.api_url}{self.search_endpoint}", + params=params, + headers=headers + ) + response.raise_for_status() + + # Process results + data = response.json() + results = [] + + for item in data.get("items", []): + # For each code result, fetch a bit of the file content + snippet = self._get_code_snippet(item) if item.get("url") else "Code snippet not available" + + # Construct a standardized result entry + result = { + "title": item.get("name", "Unnamed"), + "url": item.get("html_url", ""), + "snippet": snippet, + "source": "github", + "metadata": { + "repository": item.get("repository", {}).get("full_name", ""), + "path": item.get("path", ""), + "language": kwargs.get("language", ""), + "score": item.get("score", 0) + } + } + results.append(result) + + return results + + except requests.RequestException as e: + print(f"GitHub API error: {e}") + return [] + + def _get_code_snippet(self, item: Dict[str, Any]) -> str: + """ + Fetch a snippet of the code file. + + Args: + item: The GitHub code search result item + + Returns: + A string containing a snippet of the code + """ + try: + # Get the raw content URL + content_url = item.get("url") + if not content_url: + return "Content not available" + + # Request the content + headers = { + "Authorization": f"token {self.api_key}", + "Accept": "application/vnd.github.v3.raw", + "User-Agent": self.user_agent + } + + response = requests.get(content_url, headers=headers) + response.raise_for_status() + + # Get content and create a snippet + content = response.json().get("content", "") + if content: + # GitHub returns Base64 encoded content + import base64 + decoded = base64.b64decode(content).decode('utf-8') + + # Create a snippet (first ~500 chars) + snippet = decoded[:500] + ("..." 
if len(decoded) > 500 else "") + return snippet + return "Content not available" + + except Exception as e: + print(f"Error fetching code snippet: {e}") + return "Error fetching code snippet" + + def get_name(self) -> str: + """ + Get the name of the search handler. + + Returns: + Name of the search handler + """ + return "github" + + def is_available(self) -> bool: + """ + Check if the GitHub API is available and properly configured. + + Returns: + True if the API is available, False otherwise + """ + return self.api_key is not None + + def get_rate_limit_info(self) -> Dict[str, Any]: + """ + Get information about GitHub API rate limits. + + Returns: + Dictionary with rate limit information + """ + if not self.is_available(): + return {"error": "GitHub API not configured"} + + try: + headers = { + "Authorization": f"token {self.api_key}", + "Accept": "application/vnd.github.v3+json", + "User-Agent": self.user_agent + } + + response = requests.get( + f"{self.api_url}/rate_limit", + headers=headers + ) + response.raise_for_status() + + data = response.json() + rate_limits = data.get("resources", {}).get("search", {}) + + return { + "requests_per_minute": 30, # GitHub search API limit + "requests_per_hour": rate_limits.get("limit", 0), + "current_usage": { + "remaining": rate_limits.get("remaining", 0), + "reset_time": rate_limits.get("reset", 0) + } + } + + except Exception as e: + print(f"Error getting rate limit info: {e}") + return { + "error": str(e), + "requests_per_minute": 30, + "requests_per_hour": 5000 # Default limit + } \ No newline at end of file diff --git a/execution/api_handlers/stackexchange_handler.py b/execution/api_handlers/stackexchange_handler.py new file mode 100644 index 0000000..14730c6 --- /dev/null +++ b/execution/api_handlers/stackexchange_handler.py @@ -0,0 +1,231 @@ +""" +StackExchange API handler for programming question search. + +This module implements a search handler for the StackExchange API, +focusing on Stack Overflow and related programming Q&A sites. +""" + +import os +import requests +import time +from typing import Dict, List, Any, Optional +from urllib.parse import quote + +from config.config import get_config +from ..api_handlers.base_handler import BaseSearchHandler + + +class StackExchangeSearchHandler(BaseSearchHandler): + """Handler for StackExchange/Stack Overflow search.""" + + def __init__(self): + """Initialize the StackExchange search handler.""" + self.config = get_config() + self.api_key = os.environ.get('STACKEXCHANGE_API_KEY') or self.config.config_data.get('api_keys', {}).get('stackexchange') + self.api_url = "https://api.stackexchange.com/2.3" + self.search_endpoint = "/search/advanced" + self.last_request_time = 0 + self.min_request_interval = 1.0 # seconds between requests to avoid throttling + + def search(self, query: str, num_results: int = 10, **kwargs) -> List[Dict[str, Any]]: + """ + Execute a search on StackExchange. 
+ + Args: + query: The search query + num_results: Number of results to return + **kwargs: Additional search parameters + - site: StackExchange site to search (default: stackoverflow) + - sort: Sort by (relevance, votes, creation, activity) + - tags: List of tags to filter by + - accepted: Only return questions with accepted answers + + Returns: + List of search results + """ + if not self.is_available(): + return [] + + # Rate limiting to avoid API restrictions + self._respect_rate_limit() + + # Prepare query parameters + site = kwargs.get("site", "stackoverflow") + params = { + "q": query, + "site": site, + "pagesize": min(num_results, 30), # SE API limit per page + "page": 1, + "filter": "withbody", # Include question body + "key": self.api_key + } + + # Add optional parameters + if kwargs.get("sort"): + params["sort"] = kwargs["sort"] + if kwargs.get("tags"): + params["tagged"] = ";".join(kwargs["tags"]) + if kwargs.get("accepted"): + params["accepted"] = "True" + + try: + # Make the API request + response = requests.get( + f"{self.api_url}{self.search_endpoint}", + params=params + ) + response.raise_for_status() + + # Process results + data = response.json() + results = [] + + for item in data.get("items", []): + # Get answer count and score + answer_count = item.get("answer_count", 0) + score = item.get("score", 0) + has_accepted = item.get("is_answered", False) + + # Format tags + tags = item.get("tags", []) + tag_str = ", ".join(tags) + + # Create snippet from question body + body = item.get("body", "") + snippet = self._extract_snippet(body, max_length=300) + + # Additional metadata for result display + meta_info = f"Score: {score} | Answers: {answer_count}" + if has_accepted: + meta_info += " | Has accepted answer" + + # Format the snippet with meta information + full_snippet = f"{snippet}\n\nTags: {tag_str}\n{meta_info}" + + # Construct a standardized result entry + result = { + "title": item.get("title", "Unnamed Question"), + "url": item.get("link", ""), + "snippet": full_snippet, + "source": f"stackexchange_{site}", + "metadata": { + "score": score, + "answer_count": answer_count, + "has_accepted": has_accepted, + "tags": tags, + "question_id": item.get("question_id", ""), + "creation_date": item.get("creation_date", "") + } + } + results.append(result) + + return results + + except requests.RequestException as e: + print(f"StackExchange API error: {e}") + return [] + + def _extract_snippet(self, html_content: str, max_length: int = 300) -> str: + """ + Extract a readable snippet from HTML content. + + Args: + html_content: HTML content from Stack Overflow + max_length: Maximum length of the snippet + + Returns: + A plain text snippet + """ + try: + # Basic HTML tag removal (a more robust solution would use a library like BeautifulSoup) + import re + text = re.sub(r'<[^>]+>', ' ', html_content) + + # Remove excessive whitespace + text = re.sub(r'\s+', ' ', text).strip() + + # Truncate to max_length + if len(text) > max_length: + text = text[:max_length] + "..." + + return text + + except Exception as e: + print(f"Error extracting snippet: {e}") + return "Snippet extraction failed" + + def _respect_rate_limit(self): + """ + Ensure we don't exceed StackExchange API rate limits. 
+ """ + current_time = time.time() + time_since_last = current_time - self.last_request_time + + if time_since_last < self.min_request_interval: + sleep_time = self.min_request_interval - time_since_last + time.sleep(sleep_time) + + self.last_request_time = time.time() + + def get_name(self) -> str: + """ + Get the name of the search handler. + + Returns: + Name of the search handler + """ + return "stackexchange" + + def is_available(self) -> bool: + """ + Check if the StackExchange API is available. + Note: StackExchange API can be used without an API key with reduced quotas. + + Returns: + True if the API is available + """ + return True # Can be used with or without API key + + def get_rate_limit_info(self) -> Dict[str, Any]: + """ + Get information about StackExchange API rate limits. + + Returns: + Dictionary with rate limit information + """ + quota_max = 300 if self.api_key else 100 # Default quotas + + try: + # Make a request to check quota + params = { + "site": "stackoverflow" + } + if self.api_key: + params["key"] = self.api_key + + response = requests.get( + f"{self.api_url}/info", + params=params + ) + response.raise_for_status() + + data = response.json() + quota_remaining = data.get("quota_remaining", quota_max) + + return { + "requests_per_minute": 30, # Conservative estimate + "requests_per_day": quota_max, + "current_usage": { + "remaining": quota_remaining, + "max": quota_max, + "reset_time": "Daily" # SE resets quotas daily + } + } + + except Exception as e: + print(f"Error getting rate limit info: {e}") + return { + "error": str(e), + "requests_per_minute": 30, + "requests_per_day": quota_max + } \ No newline at end of file diff --git a/execution/result_collector.py b/execution/result_collector.py index 4adf62d..2ef8acb 100644 --- a/execution/result_collector.py +++ b/execution/result_collector.py @@ -27,6 +27,15 @@ class ResultCollector: except ValueError: print("Jina Reranker not available. Will use basic scoring instead.") self.reranker_available = False + + # Initialize result enrichers + try: + from .result_enrichers.unpaywall_enricher import UnpaywallEnricher + self.unpaywall_enricher = UnpaywallEnricher() + self.unpaywall_available = True + except (ImportError, ValueError): + print("Unpaywall enricher not available. 
Will not enrich results with open access links.") + self.unpaywall_available = False def process_results(self, search_results: Dict[str, List[Dict[str, Any]]], @@ -68,6 +77,16 @@ class ResultCollector: if dedup: print(f"Deduplicated to {len(flattened_results)} results") + # Enrich results with open access links if available + is_academic_query = any(result.get("source") in ["openalex", "core", "arxiv", "scholar"] for result in flattened_results) + if is_academic_query and hasattr(self, 'unpaywall_enricher') and self.unpaywall_available: + print("Enriching academic results with open access information") + try: + flattened_results = self.unpaywall_enricher.enrich_results(flattened_results) + print("Results enriched with open access information") + except Exception as e: + print(f"Error enriching results with Unpaywall: {str(e)}") + # Apply reranking if requested and available if use_reranker and self.reranker is not None: print("Using Jina Reranker for semantic ranking") @@ -161,12 +180,22 @@ class ResultCollector: source = result.get("source", "") if source == "scholar": score += 10 - elif source == "serper": - score += 9 + elif source == "openalex": + score += 10 # Top priority for academic queries + elif source == "core": + score += 9 # High priority for open access academic content elif source == "arxiv": - score += 8 + score += 8 # Good for preprints and specific fields + elif source == "github": + score += 9 # High priority for code/programming queries + elif source.startswith("stackexchange"): + score += 10 # Top priority for code/programming questions + elif source == "serper": + score += 7 # General web search + elif source == "news": + score += 8 # Good for current events elif source == "google": - score += 5 + score += 5 # Generic search # Boost score based on position in original results position = result.get("raw_data", {}).get("position", 0) diff --git a/execution/search_executor.py b/execution/search_executor.py index d1e4534..21d106e 100644 --- a/execution/search_executor.py +++ b/execution/search_executor.py @@ -15,6 +15,12 @@ from .api_handlers.base_handler import BaseSearchHandler from .api_handlers.serper_handler import SerperSearchHandler from .api_handlers.scholar_handler import ScholarSearchHandler from .api_handlers.arxiv_handler import ArxivSearchHandler +from .api_handlers.news_handler import NewsSearchHandler +from .api_handlers.openalex_handler import OpenAlexSearchHandler +from .api_handlers.core_handler import CoreSearchHandler +from .api_handlers.github_handler import GitHubSearchHandler +from .api_handlers.stackexchange_handler import StackExchangeSearchHandler +from .result_enrichers.unpaywall_enricher import UnpaywallEnricher class SearchExecutor: @@ -29,6 +35,9 @@ class SearchExecutor: self.handlers = self._initialize_handlers() self.available_handlers = {name: handler for name, handler in self.handlers.items() if handler.is_available()} + + # Initialize result enrichers + self.unpaywall_enricher = UnpaywallEnricher() def _initialize_handlers(self) -> Dict[str, BaseSearchHandler]: """ @@ -40,7 +49,12 @@ class SearchExecutor: return { "serper": SerperSearchHandler(), "scholar": ScholarSearchHandler(), - "arxiv": ArxivSearchHandler() + "arxiv": ArxivSearchHandler(), + "news": NewsSearchHandler(), + "openalex": OpenAlexSearchHandler(), + "core": CoreSearchHandler(), + "github": GitHubSearchHandler(), + "stackexchange": StackExchangeSearchHandler() } def get_available_search_engines(self) -> List[str]: @@ -82,14 +96,111 @@ class SearchExecutor: # If no search 
engines specified, use all available if search_engines is None: search_engines = list(self.available_handlers.keys()) + + # Handle specialized query types + + # Current events queries + if structured_query.get("is_current_events", False) and "news" in self.available_handlers: + print("Current events query detected, prioritizing news search") + # Make sure news is in the search engines + if "news" not in search_engines: + search_engines.append("news") + + # If a specific engine is requested, honor that - otherwise limit to news + a general search engine + # for a faster response with more relevant results + if not structured_query.get("specific_engines", False): + general_engines = ["serper", "google"] + # Find an available general engine + general_engine = next((e for e in general_engines if e in self.available_handlers), None) + if general_engine: + search_engines = ["news", general_engine] + else: + # Fall back to just news + search_engines = ["news"] + + # Academic queries + elif structured_query.get("is_academic", False): + print("Academic query detected, prioritizing academic search engines") + + # Define academic search engines in order of priority + academic_engines = ["openalex", "core", "arxiv", "scholar"] + available_academic = [engine for engine in academic_engines if engine in self.available_handlers] + + # Always include at least one general search engine for backup + general_engines = ["serper", "google"] + available_general = [engine for engine in general_engines if engine in self.available_handlers] + + if available_academic and not structured_query.get("specific_engines", False): + # Use available academic engines plus one general engine if available + search_engines = available_academic + if available_general: + search_engines.append(available_general[0]) + elif not available_academic: + # Just use general search if no academic engines are available + search_engines = available_general + + print(f"Selected engines for academic query: {search_engines}") + + # Code/programming queries + elif structured_query.get("is_code", False): + print("Code/programming query detected, prioritizing code search engines") + + # Define code search engines in order of priority + code_engines = ["github", "stackexchange"] + available_code = [engine for engine in code_engines if engine in self.available_handlers] + + # Always include at least one general search engine for backup + general_engines = ["serper", "google"] + available_general = [engine for engine in general_engines if engine in self.available_handlers] + + if available_code and not structured_query.get("specific_engines", False): + # Use available code engines plus one general engine if available + search_engines = available_code + if available_general: + search_engines.append(available_general[0]) + elif not available_code: + # Just use general search if no code engines are available + search_engines = available_general + + print(f"Selected engines for code query: {search_engines}") else: # Filter to only include available search engines search_engines = [engine for engine in search_engines if engine in self.available_handlers] + + # Add specialized handlers based on query type + + # For current events queries + if structured_query.get("is_current_events", False) and "news" in self.available_handlers and "news" not in search_engines: + print("Current events query detected, adding news search") + search_engines.append("news") + + # For academic queries + elif structured_query.get("is_academic", False): + academic_engines = 
["openalex", "core", "arxiv", "scholar"] + for engine in academic_engines: + if engine in self.available_handlers and engine not in search_engines: + print(f"Academic query detected, adding {engine} search") + search_engines.append(engine) + + # For code/programming queries + elif structured_query.get("is_code", False): + code_engines = ["github", "stackexchange"] + for engine in code_engines: + if engine in self.available_handlers and engine not in search_engines: + print(f"Code query detected, adding {engine} search") + search_engines.append(engine) # Get the search queries for each engine search_queries = structured_query.get("search_queries", {}) + # For news searches on current events queries, add special parameters + news_params = {} + if "news" in search_engines and structured_query.get("is_current_events", False): + # Set up news search parameters + news_params["days_back"] = 7 # Limit to 7 days for current events + news_params["sort_by"] = "publishedAt" # Sort by publication date + # Execute searches in parallel results = {} with concurrent.futures.ThreadPoolExecutor() as executor: @@ -102,12 +213,18 @@ class SearchExecutor: # Get the appropriate query for this engine engine_query = search_queries.get(engine, query) + # Additional parameters for certain engines + kwargs = {} + if engine == "news" and news_params: + kwargs = news_params + # Submit the search task future = executor.submit( self._execute_single_search, engine=engine, query=engine_query, - num_results=num_results + num_results=num_results, + **kwargs ) future_to_engine[future] = engine @@ -123,7 +240,7 @@ class SearchExecutor: return results - def _execute_single_search(self, engine: str, query: str, num_results: int) -> List[Dict[str, Any]]: + def _execute_single_search(self, engine: str, query: str, num_results: int, **kwargs) -> List[Dict[str, Any]]: """ Execute a search on a single search engine. 
@@ -131,6 +248,7 @@ class SearchExecutor: engine: Name of the search engine query: Query to execute num_results: Number of results to return + **kwargs: Additional parameters to pass to the search handler Returns: List of search results @@ -140,8 +258,8 @@ class SearchExecutor: return [] try: - # Execute the search - results = handler.search(query, num_results=num_results) + # Execute the search with any additional parameters + results = handler.search(query, num_results=num_results, **kwargs) return results except Exception as e: print(f"Error executing search for {engine}: {e}") @@ -164,17 +282,51 @@ class SearchExecutor: Returns: Dictionary mapping search engine names to lists of search results """ + # Get the enhanced query + query = structured_query.get("enhanced_query", structured_query.get("original_query", "")) + # If no search engines specified, use all available if search_engines is None: search_engines = list(self.available_handlers.keys()) + + # If this is a current events query, prioritize news handler if available + if structured_query.get("is_current_events", False) and "news" in self.available_handlers: + print("Current events query detected, prioritizing news search (async)") + # Make sure news is in the search engines + if "news" not in search_engines: + search_engines.append("news") + + # If a specific engine is requested, honor that - otherwise limit to news + a general search engine + # for a faster response with more relevant results + if not structured_query.get("specific_engines", False): + general_engines = ["serper", "google"] + # Find an available general engine + general_engine = next((e for e in general_engines if e in self.available_handlers), None) + if general_engine: + search_engines = ["news", general_engine] + else: + # Fall back to just news + search_engines = ["news"] else: # Filter to only include available search engines search_engines = [engine for engine in search_engines if engine in self.available_handlers] + + # If this is a current events query, add news handler if available and not already included + if structured_query.get("is_current_events", False) and "news" in self.available_handlers and "news" not in search_engines: + print("Current events query detected, adding news search (async)") + search_engines.append("news") # Get the search queries for each engine search_queries = structured_query.get("search_queries", {}) + # For news searches on current events queries, add special parameters + news_params = {} + if "news" in search_engines and structured_query.get("is_current_events", False): + # Set up news search parameters + news_params["days_back"] = 7 # Limit to 7 days for current events + news_params["sort_by"] = "publishedAt" # Sort by publication date + # Create tasks for each search engine tasks = [] for engine in search_engines: @@ -182,10 +334,15 @@ class SearchExecutor: continue # Get the appropriate query for this engine - query = search_queries.get(engine, structured_query.get("enhanced_query", "")) + engine_query = search_queries.get(engine, query) + + # Additional parameters for certain engines + kwargs = {} + if engine == "news" and news_params: + kwargs = news_params # Create a task for this search - task = self._execute_single_search_async(engine, query, num_results) + task = self._execute_single_search_async(engine, engine_query, num_results, **kwargs) tasks.append((engine, task)) # Execute all tasks with timeout @@ -203,7 +360,7 @@ class SearchExecutor: return results - async def _execute_single_search_async(self, engine: 
str, query: str, num_results: int) -> List[Dict[str, Any]]: + async def _execute_single_search_async(self, engine: str, query: str, num_results: int, **kwargs) -> List[Dict[str, Any]]: """ Execute a search on a single search engine asynchronously. @@ -211,12 +368,16 @@ class SearchExecutor: engine: Name of the search engine query: Query to execute num_results: Number of results to return + **kwargs: Additional parameters to pass to the search handler Returns: List of search results """ # Execute in a thread pool since most API calls are blocking loop = asyncio.get_event_loop() - return await loop.run_in_executor( - None, self._execute_single_search, engine, query, num_results - ) + + # Create a partial function with all the arguments + def execute_search(): + return self._execute_single_search(engine, query, num_results, **kwargs) + + return await loop.run_in_executor(None, execute_search) diff --git a/query/llm_interface.py b/query/llm_interface.py index 3f05b66..9acc062 100644 --- a/query/llm_interface.py +++ b/query/llm_interface.py @@ -305,8 +305,75 @@ class LLMInterface: """Implementation of search query generation.""" engines_str = ", ".join(search_engines) + # Special instructions for news searches + news_instructions = "" + if "news" in search_engines: + news_instructions = """ + For the 'news' search engine: + - Focus on recent events and timely information + - Include specific date ranges when relevant (e.g., "last week", "since June 1") + - Use names of people, organizations, or specific events + - For current events queries, prioritize factual keywords over conceptual terms + - Include terms like "latest", "recent", "update", "announcement" where appropriate + - Exclude general background terms that would dilute current event focus + - Generate 3 queries optimized for news search + """ + + # Special instructions for academic searches + academic_instructions = "" + if any(engine in search_engines for engine in ["openalex", "core", "arxiv"]): + academic_instructions = """ + For academic search engines ('openalex', 'core', 'arxiv'): + - Focus on specific academic terminology and precise research concepts + - Include field-specific keywords and methodological terms + - For 'openalex' search: + - Include author names, journal names, or specific methodology terms when relevant + - Be precise with scientific terminology + - Consider including "review" or "meta-analysis" for summary-type queries + - For 'core' search: + - Focus on open access content + - Include institutional keywords when relevant + - Balance specificity with breadth + - For 'arxiv' search: + - Use more technical/mathematical terminology + - Include relevant field categories (e.g., "cs.AI", "physics", "math") + - Be precise with notation and specialized terms + - Generate 3 queries optimized for each academic search engine + """ + + # Special instructions for code/programming searches + code_instructions = "" + if any(engine in search_engines for engine in ["github", "stackexchange"]): + code_instructions = """ + For code/programming search engines ('github', 'stackexchange'): + - Focus on specific technical terminology, programming languages, and frameworks + - Include specific error messages, function names, or library references when relevant + - For 'github' search: + - Include programming language keywords (e.g., "python", "javascript", "java") + - Specify file extensions when relevant (e.g., ".py", ".js", ".java") + - Include framework or library names (e.g., "react", "tensorflow", "django") + - Use 
code-specific syntax and terminology + - Focus on implementation details, patterns, or techniques + - For 'stackexchange' search: + - Phrase as a specific programming question or problem + - Include relevant error messages as exact quotes when applicable + - Include specific version information when relevant + - Use precise technical terms that would appear in developer discussions + - Focus on problem-solving aspects or best practices + - Generate 3 queries optimized for each code search engine + """ + messages = [ - {"role": "system", "content": f"You are an AI research assistant. Generate optimized search queries for the following search engines: {engines_str}. For each search engine, provide 3 variations of the query that are optimized for that engine's search algorithm and will yield comprehensive results."}, + {"role": "system", "content": f"""You are an AI research assistant. Generate optimized search queries for the following search engines: {engines_str}. + + For each search engine, provide 3 variations of the query that are optimized for that engine's search algorithm and will yield comprehensive results. + + {news_instructions} + {academic_instructions} + {code_instructions} + + Return your response as a JSON object where each key is a search engine name and the value is an array of 3 optimized queries. + """}, {"role": "user", "content": f"Generate optimized search queries for this research topic: {query}"} ] diff --git a/query/query_processor.py b/query/query_processor.py index ebd4166..1a95259 100644 --- a/query/query_processor.py +++ b/query/query_processor.py @@ -59,6 +59,11 @@ class QueryProcessor: Returns: Dictionary containing the structured query """ + # Detect query types + is_current_events = self._is_current_events_query(original_query, classification) + is_academic = self._is_academic_query(original_query, classification) + is_code = self._is_code_query(original_query, classification) + return { 'original_query': original_query, 'enhanced_query': enhanced_query, @@ -66,10 +71,193 @@ class QueryProcessor: 'intent': classification.get('intent', 'research'), 'entities': classification.get('entities', []), 'timestamp': None, # Will be filled in by the caller + 'is_current_events': is_current_events, + 'is_academic': is_academic, + 'is_code': is_code, 'metadata': { 'classification': classification } } + + def _is_current_events_query(self, query: str, classification: Dict[str, Any]) -> bool: + """ + Determine if a query is related to current events. 
+ + Args: + query: The original user query + classification: The query classification + + Returns: + True if the query is about current events, False otherwise + """ + # Check for time-related keywords in the query + time_keywords = ['recent', 'latest', 'current', 'today', 'yesterday', 'week', 'month', + 'this year', 'breaking', 'news', 'announced', 'election', + 'now', 'trends', 'emerging'] + + query_lower = query.lower() + + # Check for named entities typical of current events + current_event_entities = ['trump', 'biden', 'president', 'government', 'congress', + 'senate', 'tariffs', 'election', 'policy', 'coronavirus', + 'covid', 'market', 'stocks', 'stock market', 'war'] + + # Count matches for time keywords + time_keyword_count = sum(1 for keyword in time_keywords if keyword in query_lower) + + # Count matches for current event entities + entity_count = sum(1 for entity in current_event_entities if entity in query_lower) + + # If the query directly asks about what's happening or what happened + action_verbs = ['happen', 'occurred', 'announced', 'said', 'stated', 'declared', 'launched'] + verb_matches = sum(1 for verb in action_verbs if verb in query_lower) + + # Determine if this is likely a current events query + # Either multiple time keywords or current event entities, or a combination + is_current = (time_keyword_count >= 1 and entity_count >= 1) or time_keyword_count >= 2 or entity_count >= 2 or verb_matches >= 1 + + return is_current + + def _is_academic_query(self, query: str, classification: Dict[str, Any]) -> bool: + """ + Determine if a query is related to academic or scholarly research. + + Args: + query: The original user query + classification: The query classification + + Returns: + True if the query is about academic research, False otherwise + """ + query_lower = query.lower() + + # Check for academic terms + academic_terms = [ + 'paper', 'study', 'research', 'publication', 'journal', 'article', 'thesis', + 'dissertation', 'scholarly', 'academic', 'literature', 'published', 'author', + 'citation', 'cited', 'references', 'bibliography', 'doi', 'peer-reviewed', + 'peer reviewed', 'university', 'professor', 'conference', 'proceedings' + ] + + # Check for research methodologies + methods = [ + 'methodology', 'experiment', 'hypothesis', 'theoretical', 'empirical', + 'qualitative', 'quantitative', 'data', 'analysis', 'statistical', 'results', + 'findings', 'conclusion', 'meta-analysis', 'systematic review', 'clinical trial' + ] + + # Check for academic fields + fields = [ + 'science', 'physics', 'chemistry', 'biology', 'psychology', 'sociology', + 'economics', 'history', 'philosophy', 'engineering', 'computer science', + 'medicine', 'mathematics', 'geology', 'astronomy', 'linguistics' + ] + + # Count matches + academic_term_count = sum(1 for term in academic_terms if term in query_lower) + method_count = sum(1 for method in methods if method in query_lower) + field_count = sum(1 for field in fields if field in query_lower) + + # Check for common academic question patterns + academic_patterns = [ + 'what does research say about', + 'what studies show', + 'according to research', + 'scholarly view', + 'academic consensus', + 'published papers on', + 'recent studies on', + 'literature review', + 'research findings', + 'scientific evidence' + ] + + pattern_matches = sum(1 for pattern in academic_patterns if pattern in query_lower) + + # Determine if this is likely an academic query + # Either multiple academic terms, or a combination of terms, methods, and fields + is_academic = 
( + academic_term_count >= 2 or + pattern_matches >= 1 or + (academic_term_count >= 1 and (method_count >= 1 or field_count >= 1)) or + (method_count >= 1 and field_count >= 1) + ) + + return is_academic + + def _is_code_query(self, query: str, classification: Dict[str, Any]) -> bool: + """ + Determine if a query is related to programming or code. + + Args: + query: The original user query + classification: The query classification + + Returns: + True if the query is about programming or code, False otherwise + """ + query_lower = query.lower() + + # Check for programming languages and technologies + programming_langs = [ + 'python', 'javascript', 'java', 'c++', 'c#', 'ruby', 'go', 'rust', + 'php', 'swift', 'kotlin', 'typescript', 'perl', 'scala', 'r', + 'html', 'css', 'sql', 'bash', 'powershell', 'dart', 'julia' + ] + + # Check for programming frameworks and libraries + frameworks = [ + 'react', 'angular', 'vue', 'django', 'flask', 'spring', 'laravel', + 'express', 'tensorflow', 'pytorch', 'pandas', 'numpy', 'scikit-learn', + 'bootstrap', 'jquery', 'node', 'rails', 'asp.net', 'unity', 'flutter', + 'pytorch', 'keras', '.net', 'core', 'maven', 'gradle', 'npm', 'pip' + ] + + # Check for programming concepts and terms + programming_terms = [ + 'algorithm', 'function', 'class', 'method', 'variable', 'object', 'array', + 'string', 'integer', 'boolean', 'list', 'dictionary', 'hash', 'loop', + 'recursion', 'inheritance', 'interface', 'api', 'rest', 'json', 'xml', + 'database', 'query', 'schema', 'framework', 'library', 'package', 'module', + 'dependency', 'bug', 'error', 'exception', 'debugging', 'compiler', 'runtime', + 'syntax', 'parameter', 'argument', 'return', 'value', 'reference', 'pointer', + 'memory', 'stack', 'heap', 'thread', 'async', 'await', 'promise', 'callback', + 'event', 'listener', 'handler', 'middleware', 'frontend', 'backend', 'fullstack', + 'devops', 'ci/cd', 'docker', 'kubernetes', 'git', 'github', 'bitbucket', 'gitlab' + ] + + # Check for programming question patterns + code_patterns = [ + 'how to code', 'how do i program', 'how to program', 'how to implement', + 'code example', 'example code', 'code snippet', 'write a function', + 'write a program', 'debugging', 'error message', 'getting error', + 'code review', 'refactor', 'optimize', 'performance issue', + 'best practice', 'design pattern', 'architecture', 'software design', + 'algorithm for', 'data structure', 'time complexity', 'space complexity', + 'big o', 'optimize code', 'refactor code', 'clean code', 'technical debt', + 'unit test', 'integration test', 'test coverage', 'mock', 'stub' + ] + + # Count matches + lang_count = sum(1 for lang in programming_langs if lang in query_lower) + framework_count = sum(1 for framework in frameworks if framework in query_lower) + term_count = sum(1 for term in programming_terms if term in query_lower) + pattern_count = sum(1 for pattern in code_patterns if pattern in query_lower) + + # Check if the query contains code or a code block (denoted by backticks or indentation) + contains_code_block = '```' in query or any(line.strip().startswith(' ') for line in query.split('\n')) + + # Determine if this is likely a code-related query + is_code = ( + lang_count >= 1 or + framework_count >= 1 or + term_count >= 2 or + pattern_count >= 1 or + contains_code_block or + (lang_count + framework_count + term_count >= 2) + ) + + return is_code async def generate_search_queries(self, structured_query: Dict[str, Any], search_engines: List[str]) -> Dict[str, Any]: diff --git 
a/report/report_templates.py b/report/report_templates.py index b9b526b..6372876 100644 --- a/report/report_templates.py +++ b/report/report_templates.py @@ -6,6 +6,7 @@ class QueryType(Enum): FACTUAL = 'factual' EXPLORATORY = 'exploratory' COMPARATIVE = 'comparative' + CODE = 'code' class DetailLevel(Enum): BRIEF = 'brief' @@ -66,6 +67,13 @@ class ReportTemplateManager: query_type=QueryType.COMPARATIVE, required_sections=['{title}', '{comparison_criteria}', '{key_findings}'] )) + + self.add_template(ReportTemplate( + template="# {title}\n\n## Problem Statement\n{problem_statement}\n\n## Solution\n{solution}\n\n```{language}\n{code_snippet}\n```", + detail_level=DetailLevel.BRIEF, + query_type=QueryType.CODE, + required_sections=['{title}', '{problem_statement}', '{solution}', '{language}', '{code_snippet}'] + )) # Standard templates self.add_template(ReportTemplate( @@ -88,6 +96,13 @@ class ReportTemplateManager: query_type=QueryType.COMPARATIVE, required_sections=['{title}', '{comparison_criteria}', '{methodology}', '{key_findings}', '{analysis}'] )) + + self.add_template(ReportTemplate( + template="# {title}\n\n## Problem Statement\n{problem_statement}\n\n## Approach\n{approach}\n\n## Solution\n{solution}\n\n```{language}\n{code_snippet}\n```\n\n## Explanation\n{explanation}\n\n## Usage Example\n{usage_example}", + detail_level=DetailLevel.STANDARD, + query_type=QueryType.CODE, + required_sections=['{title}', '{problem_statement}', '{approach}', '{solution}', '{language}', '{code_snippet}', '{explanation}', '{usage_example}'] + )) # Detailed templates self.add_template(ReportTemplate( @@ -110,6 +125,13 @@ class ReportTemplateManager: query_type=QueryType.COMPARATIVE, required_sections=['{title}', '{comparison_criteria}', '{methodology}', '{key_findings}', '{analysis}', '{conclusion}'] )) + + self.add_template(ReportTemplate( + template="# {title}\n\n## Problem Statement\n{problem_statement}\n\n## Context and Requirements\n{context}\n\n## Approach\n{approach}\n\n## Solution\n{solution}\n\n```{language}\n{code_snippet}\n```\n\n## Explanation\n{explanation}\n\n## Alternative Approaches\n{alternatives}\n\n## Best Practices\n{best_practices}\n\n## Usage Examples\n{usage_examples}\n\n## Common Issues\n{common_issues}", + detail_level=DetailLevel.DETAILED, + query_type=QueryType.CODE, + required_sections=['{title}', '{problem_statement}', '{context}', '{approach}', '{solution}', '{language}', '{code_snippet}', '{explanation}', '{alternatives}', '{best_practices}', '{usage_examples}', '{common_issues}'] + )) # Comprehensive templates self.add_template(ReportTemplate( @@ -132,3 +154,10 @@ class ReportTemplateManager: query_type=QueryType.COMPARATIVE, required_sections=['{title}', '{exec_summary}', '{comparison_criteria}', '{methodology}', '{key_findings}', '{analysis}', '{conclusion}', '{references}', '{appendices}'] )) + + self.add_template(ReportTemplate( + template="# {title}\n\n## Executive Summary\n{exec_summary}\n\n## Problem Statement\n{problem_statement}\n\n## Technical Background\n{technical_background}\n\n## Architectural Considerations\n{architecture}\n\n## Detailed Solution\n{detailed_solution}\n\n### Implementation Details\n```{language}\n{code_snippet}\n```\n\n## Explanation of Algorithm/Approach\n{algorithm_explanation}\n\n## Performance Considerations\n{performance}\n\n## Alternative Implementations\n{alternatives}\n\n## Best Practices and Design Patterns\n{best_practices}\n\n## Testing and Validation\n{testing}\n\n## Usage Examples\n{usage_examples}\n\n## Common Pitfalls and 
Workarounds\n{pitfalls}\n\n## References\n{references}\n\n## Appendices\n{appendices}", + detail_level=DetailLevel.COMPREHENSIVE, + query_type=QueryType.CODE, + required_sections=['{title}', '{exec_summary}', '{problem_statement}', '{technical_background}', '{architecture}', '{detailed_solution}', '{language}', '{code_snippet}', '{algorithm_explanation}', '{performance}', '{alternatives}', '{best_practices}', '{testing}', '{usage_examples}', '{pitfalls}', '{references}', '{appendices}'] + )) diff --git a/scripts/query_to_report.py b/scripts/query_to_report.py index 3bb33dc..08b098c 100755 --- a/scripts/query_to_report.py +++ b/scripts/query_to_report.py @@ -38,7 +38,11 @@ async def query_to_report( chunk_size: Optional[int] = None, overlap_size: Optional[int] = None, detail_level: str = "standard", - use_mock: bool = False + use_mock: bool = False, + query_type: Optional[str] = None, + is_code: bool = False, + is_academic: bool = False, + is_current_events: bool = False ) -> str: """ Execute the full workflow from query to report. @@ -67,6 +71,18 @@ async def query_to_report( # Add timestamp structured_query['timestamp'] = datetime.now().isoformat() + # Add query type if specified + if query_type: + structured_query['type'] = query_type + + # Add domain-specific flags if specified + if is_code: + structured_query['is_code'] = True + if is_academic: + structured_query['is_academic'] = True + if is_current_events: + structured_query['is_current_events'] = True + logger.info(f"Query processed. Type: {structured_query['type']}, Intent: {structured_query['intent']}") logger.info(f"Enhanced query: {structured_query['enhanced_query']}") @@ -180,6 +196,15 @@ def main(): parser.add_argument('--detail-level', '-d', type=str, default='standard', choices=['brief', 'standard', 'detailed', 'comprehensive'], help='Level of detail for the report') + parser.add_argument('--query-type', '-q', type=str, + choices=['factual', 'exploratory', 'comparative', 'code'], + help='Type of query to process') + parser.add_argument('--is-code', action='store_true', + help='Flag this query as a code/programming query') + parser.add_argument('--is-academic', action='store_true', + help='Flag this query as an academic query') + parser.add_argument('--is-current-events', action='store_true', + help='Flag this query as a current events query') parser.add_argument('--use-mock', '-m', action='store_true', help='Use mock data instead of API calls') parser.add_argument('--verbose', '-v', action='store_true', help='Enable verbose logging') parser.add_argument('--list-detail-levels', action='store_true', @@ -210,6 +235,10 @@ def main(): chunk_size=args.chunk_size, overlap_size=args.overlap_size, detail_level=args.detail_level, + query_type=args.query_type, + is_code=args.is_code, + is_academic=args.is_academic, + is_current_events=args.is_current_events, use_mock=args.use_mock )) diff --git a/tests/execution/test_all_handlers.py b/tests/execution/test_all_handlers.py index fb1a498..660e9b7 100644 --- a/tests/execution/test_all_handlers.py +++ b/tests/execution/test_all_handlers.py @@ -9,23 +9,42 @@ def main(): # Initialize the search executor executor = SearchExecutor() - # Execute a simple search - results = executor.execute_search({ + # Execute search tests + print("\n=== TESTING GENERAL SEARCH ===") + general_results = executor.execute_search({ 'raw_query': 'quantum computing', 'enhanced_query': 'quantum computing' }) - # Print results by source - print(f'Results by source: {[engine for engine, res in results.items() if res]}') 
+ print("\n=== TESTING CODE SEARCH ===") + code_results = executor.execute_search({ + 'raw_query': 'implement merge sort in python', + 'enhanced_query': 'implement merge sort algorithm in python with time complexity analysis', + 'is_code': True + }) - # Print details + # Print general search results + print("\n=== GENERAL SEARCH RESULTS ===") + print(f'Results by source: {[engine for engine, res in general_results.items() if res]}') print('\nDetails:') - for engine, res in results.items(): + for engine, res in general_results.items(): print(f'{engine}: {len(res)} results') if res: - print(f' Sample result: {res[0]}') + print(f' Sample result: {res[0]["title"]}') - return results + # Print code search results + print("\n=== CODE SEARCH RESULTS ===") + print(f'Results by source: {[engine for engine, res in code_results.items() if res]}') + print('\nDetails:') + for engine, res in code_results.items(): + print(f'{engine}: {len(res)} results') + if res: + print(f' Sample result: {res[0]["title"]}') + + return { + 'general': general_results, + 'code': code_results + } if __name__ == "__main__": main() diff --git a/tests/integration/test_code_query.py b/tests/integration/test_code_query.py new file mode 100644 index 0000000..92d2512 --- /dev/null +++ b/tests/integration/test_code_query.py @@ -0,0 +1,82 @@ +#!/usr/bin/env python +""" +Integration test for code query to report workflow. + +This script tests the full pipeline from a code-related query to a report. +""" + +import os +import sys +import asyncio +import argparse +from datetime import datetime + +# Add parent directory to path to import modules +sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) + +from query.query_processor import get_query_processor +from scripts.query_to_report import query_to_report +from report.report_templates import QueryType +from report.report_detail_levels import DetailLevel + + +async def test_code_query(query: str = "How to implement a binary search in Python?", detail_level: str = "brief"): + """Test the code query to report workflow.""" + # Process the query to verify it's detected as code + print(f"\nTesting code query detection for: {query}") + query_processor = get_query_processor() + structured_query = await query_processor.process_query(query) + + # Check if query is detected as code + is_code = structured_query.get('is_code', False) + print(f"Detected as code query: {is_code}") + + if not is_code: + # Force code query type + print("Manually setting to code query type for testing") + structured_query['is_code'] = True + + # Generate timestamp for unique output files + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + output_file = f"test_code_query_{timestamp}.md" + + # Generate report + print(f"\nGenerating {detail_level} report for code query...") + await query_to_report( + query=query, + output_file=output_file, + detail_level=detail_level, + query_type=QueryType.CODE.value, + is_code=True + ) + + print(f"\nReport generated and saved to: {output_file}") + + # Display the start of the report + try: + with open(output_file, 'r', encoding='utf-8') as f: + content = f.read() + preview_length = min(500, len(content)) + print(f"\nReport preview:\n{'-' * 40}\n{content[:preview_length]}...\n{'-' * 40}") + print(f"Total length: {len(content)} characters") + except Exception as e: + print(f"Error reading report: {e}") + + return output_file + + +def main(): + """Parse arguments and run the test.""" + parser = argparse.ArgumentParser(description='Test code 
query to report pipeline') + parser.add_argument('--query', '-q', type=str, default="How to implement a binary search in Python?", + help='The code-related query to test') + parser.add_argument('--detail-level', '-d', type=str, default="brief", + choices=['brief', 'standard', 'detailed', 'comprehensive'], + help='Level of detail for the report') + + args = parser.parse_args() + asyncio.run(test_code_query(query=args.query, detail_level=args.detail_level)) + + +if __name__ == "__main__": + main() \ No newline at end of file
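As a complement to the integration test above, the new StackExchange handler can also be exercised on its own, outside the report pipeline. A minimal sketch, assuming the repository root is on `PYTHONPATH` and a `config/config.yaml` is present; per `is_available()`, an API key is optional and only raises the daily quota:

```python
# Sketch: query the new StackExchange handler directly with the optional filters it supports.
from execution.api_handlers.stackexchange_handler import StackExchangeSearchHandler

handler = StackExchangeSearchHandler()
results = handler.search(
    "binary search off-by-one error",
    num_results=5,
    site="stackoverflow",   # default site; shown here for clarity
    tags=["python"],        # joined into the 'tagged' request parameter
    accepted=True,          # only questions with an accepted answer
)

for r in results:
    meta = r["metadata"]
    print(f"[score {meta['score']}] {r['title']} -> {r['url']}")
```

The `tags` and `accepted` keyword arguments map onto the `tagged` and `accepted` parameters of the `/search/advanced` endpoint, as wired up in `stackexchange_handler.py`; the GitHub handler supports an analogous direct call with `language`, `sort`, and `order`.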