From 540bf22b5209502358f0c0147e86405a5cecf6c5 Mon Sep 17 00:00:00 2001 From: Steve White Date: Thu, 27 Feb 2025 16:21:54 -0600 Subject: [PATCH] Initial commit: Intelligent Research System with search execution module --- .gitignore | 49 ++ .note/code_structure.md | 142 +++++ .note/current_focus.md | 71 +++ .note/decision_log.md | 223 +++++++ .note/development_standards.md | 48 ++ .note/interfaces.md | 720 ++++++++++++++++++++++ .note/project_overview.md | 97 +++ .note/session_log.md | 350 +++++++++++ README.md | 138 +++++ config/__init__.py | 0 config/config.py | 181 ++++++ config/config.yaml.example | 150 +++++ execution/__init__.py | 4 + execution/api_handlers/__init__.py | 4 + execution/api_handlers/arxiv_handler.py | 162 +++++ execution/api_handlers/base_handler.py | 63 ++ execution/api_handlers/google_handler.py | 113 ++++ execution/api_handlers/scholar_handler.py | 125 ++++ execution/api_handlers/serper_handler.py | 134 ++++ execution/result_collector.py | 315 ++++++++++ execution/search_executor.py | 222 +++++++ jina-ai-metaprompt.md | 220 +++++++ jina_similarity.py | 112 ++++ markdown_segmenter.py | 62 ++ query/__init__.py | 0 query/llm_interface.py | 263 ++++++++ query/query_processor.py | 111 ++++ ranking/__init__.py | 0 ranking/jina_reranker.py | 155 +++++ requirements.txt | 7 + sample_chunk.txt | 1 + sample_query.txt | 1 + test_all_handlers.py | 31 + test_llm_interface.py | 47 ++ test_query_processor.py | 148 +++++ test_query_processor_comprehensive.py | 236 +++++++ test_search_execution.py | 267 ++++++++ test_similarity.py | 99 +++ ui/__init__.py | 0 39 files changed, 5071 insertions(+) create mode 100644 .gitignore create mode 100644 .note/code_structure.md create mode 100644 .note/current_focus.md create mode 100644 .note/decision_log.md create mode 100644 .note/development_standards.md create mode 100644 .note/interfaces.md create mode 100644 .note/project_overview.md create mode 100644 .note/session_log.md create mode 100644 README.md create mode 
100644 config/__init__.py create mode 100644 config/config.py create mode 100644 config/config.yaml.example create mode 100644 execution/__init__.py create mode 100644 execution/api_handlers/__init__.py create mode 100644 execution/api_handlers/arxiv_handler.py create mode 100644 execution/api_handlers/base_handler.py create mode 100644 execution/api_handlers/google_handler.py create mode 100644 execution/api_handlers/scholar_handler.py create mode 100644 execution/api_handlers/serper_handler.py create mode 100644 execution/result_collector.py create mode 100644 execution/search_executor.py create mode 100644 jina-ai-metaprompt.md create mode 100644 jina_similarity.py create mode 100644 markdown_segmenter.py create mode 100644 query/__init__.py create mode 100644 query/llm_interface.py create mode 100644 query/query_processor.py create mode 100644 ranking/__init__.py create mode 100644 ranking/jina_reranker.py create mode 100644 requirements.txt create mode 100644 sample_chunk.txt create mode 100644 sample_query.txt create mode 100644 test_all_handlers.py create mode 100644 test_llm_interface.py create mode 100644 test_query_processor.py create mode 100644 test_query_processor_comprehensive.py create mode 100644 test_search_execution.py create mode 100755 test_similarity.py create mode 100644 ui/__init__.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..dd3607b --- /dev/null +++ b/.gitignore @@ -0,0 +1,49 @@ +# Python +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg + +# Virtual Environment +venv/ +env/ +ENV/ +.venv/ + +# IDE +.idea/ +.vscode/ +*.swp +*.swo +.DS_Store + +# Project specific +config/config.yaml +*.json +!config/config.yaml.example +.env +.env.* +!.env.example + +# Logs +logs/ +*.log + +# Test results +*_test_results.json diff --git a/.note/code_structure.md b/.note/code_structure.md new 
file mode 100644 index 0000000..27b1d1b --- /dev/null +++ b/.note/code_structure.md @@ -0,0 +1,142 @@ +# Code Structure + +## Current Project Organization + +``` +sim-search/ +├── config/ +│ ├── __init__.py +│ ├── config.py # Configuration management +│ └── config.yaml # Configuration file +├── query/ +│ ├── __init__.py +│ ├── query_processor.py # Module for processing user queries +│ ├── query_classifier.py # Module for classifying query types +│ └── llm_interface.py # Module for interacting with LLM providers +├── execution/ +│ ├── __init__.py +│ ├── search_executor.py # Module for executing search queries +│ ├── result_collector.py # Module for collecting search results +│ └── api_handlers/ # Handlers for different search APIs +│ ├── __init__.py +│ ├── base_handler.py # Base class for search handlers +│ ├── serper_handler.py # Handler for Serper API (Google search) +│ ├── scholar_handler.py # Handler for Google Scholar via Serper +│ └── arxiv_handler.py # Handler for arXiv API +├── ranking/ +│ ├── __init__.py +│ ├── jina_reranker.py # Module for reranking documents using Jina AI +│ └── filter_manager.py # Module for filtering documents +├── test_search_execution.py # Test script for search execution +├── test_all_handlers.py # Test script for all search handlers +├── requirements.txt # Project dependencies +└── search_execution_test_results.json # Test results +``` + +## Module Details + +### Config Module + +The `config` module manages configuration settings for the entire system, including API keys, model selections, and other parameters. 
+ +### Files + +- `__init__.py`: Package initialization file +- `config.py`: Configuration management class +- `config.yaml`: YAML configuration file with settings for different components + +### Classes + +- `Config`: Singleton class for loading and accessing configuration settings + - `load_config(config_path)`: Loads configuration from a YAML file + - `get(key, default=None)`: Gets a configuration value by key + +### Query Module + +The `query` module handles the processing and enhancement of user queries, including classification and optimization for search. + +### Files + +- `__init__.py`: Package initialization file +- `query_processor.py`: Main module for processing user queries +- `query_classifier.py`: Module for classifying query types +- `llm_interface.py`: Interface for interacting with LLM providers + +### Classes + +- `QueryProcessor`: Main class for processing user queries + - `process_query(query)`: Processes a user query and returns enhanced results + - `classify_query(query)`: Classifies a query by type and intent + - `generate_search_queries(query, classification)`: Generates optimized search queries + +- `QueryClassifier`: Class for classifying queries + - `classify(query)`: Classifies a query by type, intent, and entities + +- `LLMInterface`: Interface for interacting with LLM providers + - `get_completion(prompt, model=None)`: Gets a completion from an LLM + - `enhance_query(query)`: Enhances a query with additional context + - `classify_query(query)`: Uses an LLM to classify a query + +### Execution Module + +The `execution` module handles the execution of search queries across multiple search engines and the collection of results. 
+ +### Files + +- `__init__.py`: Package initialization file +- `search_executor.py`: Module for executing search queries +- `result_collector.py`: Module for collecting and processing search results +- `api_handlers/`: Directory containing handlers for different search APIs + - `__init__.py`: Package initialization file + - `base_handler.py`: Base class for search handlers + - `serper_handler.py`: Handler for Serper API (Google search) + - `scholar_handler.py`: Handler for Google Scholar via Serper + - `arxiv_handler.py`: Handler for arXiv API + +### Classes + +- `SearchExecutor`: Class for executing search queries + - `execute_search(query_data)`: Executes a search across multiple engines + - `_execute_search_async(query, engines)`: Executes a search asynchronously + - `_execute_search_sync(query, engines)`: Executes a search synchronously + +- `ResultCollector`: Class for collecting and processing search results + - `process_results(search_results)`: Processes search results from multiple engines + - `deduplicate_results(results)`: Deduplicates results based on URL + - `save_results(results, file_path)`: Saves results to a file + +- `BaseSearchHandler`: Base class for search handlers + - `search(query, num_results)`: Abstract method for searching + - `_process_response(response)`: Processes the API response + +- `SerperSearchHandler`: Handler for Serper API + - `search(query, num_results)`: Searches using Serper API + - `_process_response(response)`: Processes the Serper API response + +- `ScholarSearchHandler`: Handler for Google Scholar via Serper + - `search(query, num_results)`: Searches Google Scholar + - `_process_response(response)`: Processes the Scholar API response + +- `ArxivSearchHandler`: Handler for arXiv API + - `search(query, num_results)`: Searches arXiv + - `_process_response(response)`: Processes the arXiv API response + +### Ranking Module + +The `ranking` module provides functionality for reranking and prioritizing documents based on their 
relevance to the user's query. + +### Files + +- `__init__.py`: Package initialization file +- `jina_reranker.py`: Module for reranking documents using Jina AI +- `filter_manager.py`: Module for filtering documents + +### Classes + +- `JinaReranker`: Class for reranking documents + - `rerank(documents, query)`: Reranks documents based on relevance to query + - `_prepare_inputs(documents, query)`: Prepares inputs for the reranker + +- `FilterManager`: Class for filtering documents + - `filter_by_date(documents, start_date, end_date)`: Filters by date + - `filter_by_source(documents, sources)`: Filters by source diff --git a/.note/current_focus.md b/.note/current_focus.md new file mode 100644 index 0000000..df0293c --- /dev/null +++ b/.note/current_focus.md @@ -0,0 +1,71 @@ +# Current Focus: Intelligent Research System Development + +## Latest Update (2025-02-27) + +We are currently developing an intelligent research system that automates the process of finding, filtering, and synthesizing information from various sources. The system is designed to be modular, allowing different components to utilize specific LLM models and endpoints based on their requirements. + +### Recent Progress + +1. **Configuration Enhancements**: + - Implemented module-specific model assignments in the configuration + - Added support for different LLM providers and endpoints + - Added configuration for Jina AI's reranker + - Added support for OpenRouter and Groq as LLM providers + - Configured the system to use Groq's Llama 3.1 and 3.3 models for testing + +2. **LLM Interface Updates**: + - Enhanced the LLMInterface to support different models for different modules + - Implemented dynamic model switching based on the module and function + - Added support for Groq and OpenRouter providers + - Added special handling for provider-specific requirements + - Modified the query enhancement prompt to return only the enhanced query text without explanations + +3. 
**Document Ranking Module**: + - Created a new JinaReranker class that uses Jina AI's Reranker API + - Implemented document reranking with metadata support + - Configured to use the "jina-reranker-v2-base-multilingual" model + +4. **Search Execution Module**: + - Fixed the Serper API integration for both regular search and Scholar search + - Streamlined the search execution process by removing redundant Google search handler + - Added query truncation to handle long queries (Serper API has a 2048 character limit) + - Enhanced error handling for API requests + - Improved result processing and deduplication + - Created comprehensive test scripts for all search handlers + +### Current Tasks + +1. **Report Generation Module Development**: + - Designing the report synthesis pipeline + - Implementing result summarization using Groq's Llama 3.3 70B Versatile model + - Creating formatting and export options + +2. **Gradio UI Development**: + - Designing the user interface for query input + - Implementing result display components + - Creating configuration options in the UI + +### Next Steps + +1. **Integrate Search Execution with Query Processor**: + - Ensure seamless flow from query processing to search execution + - Test end-to-end pipeline with various query types + - Fine-tune result scoring and filtering + +2. **Build the Report Generation Module**: + - Implement report synthesis using Groq's Llama 3.3 70B Versatile model + - Create formatting and export options + - Develop citation and reference management + +3. 
**Comprehensive System Testing**: + - Test the complete pipeline from query to report + - Evaluate performance with different query types and domains + - Optimize for speed and accuracy + +### Technical Notes + +- Using LiteLLM for unified LLM interface across different providers +- Implementing a modular architecture for flexibility and maintainability +- Using Jina AI's reranker for improved document ranking +- Using Groq's Llama 3.1 and 3.3 models for fast inference during testing +- Managing API keys securely through environment variables and configuration files diff --git a/.note/decision_log.md b/.note/decision_log.md new file mode 100644 index 0000000..8d5ebe0 --- /dev/null +++ b/.note/decision_log.md @@ -0,0 +1,223 @@ +# Decision Log + +## 2025-02-27: Initial Project Setup + +### Decision: Use Jina AI APIs for Semantic Search +- **Context**: Need for semantic search capabilities that understand context beyond keywords +- **Options Considered**: + 1. Build custom embedding solution + 2. Use open-source models locally + 3. Use Jina AI's APIs +- **Decision**: Use Jina AI's APIs for embedding generation and similarity computation +- **Rationale**: + - High-quality embeddings with state-of-the-art models + - No need to manage model deployment and infrastructure + - Simple API integration with reasonable pricing + - Support for long texts through segmentation + +### Decision: Separate Markdown Segmentation from Similarity Computation +- **Context**: Need to handle potentially long markdown documents +- **Options Considered**: + 1. Integrate segmentation directly into the similarity module + 2. 
Create a separate module for segmentation +- **Decision**: Create a separate module (markdown_segmenter.py) for document segmentation +- **Rationale**: + - Better separation of concerns + - More modular design allows for independent use of components + - Easier to maintain and extend each component separately + +### Decision: Use Environment Variables for API Keys +- **Context**: Need to securely manage API credentials +- **Options Considered**: + 1. Configuration files + 2. Environment variables + 3. Secret management service +- **Decision**: Use environment variables (JINA_API_KEY) +- **Rationale**: + - Simple to implement + - Standard practice for managing secrets + - Works well across different environments + - Prevents accidental commit of credentials to version control + +### Decision: Use Cosine Similarity with Normalized Vectors +- **Context**: Need a metric for comparing semantic similarity between text embeddings +- **Options Considered**: + 1. Euclidean distance + 2. Cosine similarity + 3. Dot product +- **Decision**: Use cosine similarity with normalized vectors +- **Rationale**: + - Standard approach for semantic similarity + - Normalized vectors simplify computation (dot product equals cosine similarity) + - Less sensitive to embedding magnitude, focusing on direction (meaning) + +## 2025-02-27: Research System Architecture + +### Decision: Implement a Multi-Stage Research Pipeline +- **Context**: Need to define the overall architecture for the intelligent research system +- **Options Considered**: + 1. Monolithic application with tightly coupled components + 2. Microservices architecture with independent services + 3. 
Pipeline architecture with distinct processing stages +- **Decision**: Implement an 8-stage pipeline architecture +- **Rationale**: + - Clear separation of concerns with each stage having a specific responsibility + - Easier to develop and test individual components + - Flexibility to swap or enhance specific stages without affecting others + - Natural flow of data through the system matches the research process + +### Decision: Use Multiple Search Sources +- **Context**: Need to gather comprehensive information from various sources +- **Options Considered**: + 1. Use a single search API for simplicity + 2. Implement custom web scraping for all sources + 3. Use multiple specialized search APIs +- **Decision**: Integrate multiple search sources (Google, Serper, Jina Search, Google Scholar, arXiv) +- **Rationale**: + - Different sources provide different types of information (academic, general, etc.) + - Increases the breadth and diversity of search results + - Specialized APIs like arXiv provide domain-specific information + - Redundancy ensures more comprehensive coverage + +### Decision: Use Jina AI for Semantic Processing +- **Context**: Need for advanced semantic understanding in document processing +- **Options Considered**: + 1. Use simple keyword matching + 2. Implement custom embedding models + 3. 
Use Jina AI's suite of APIs +- **Decision**: Use Jina AI's APIs for embedding generation, similarity computation, and reranking +- **Rationale**: + - High-quality embeddings with state-of-the-art models + - Comprehensive API suite covering multiple needs (embeddings, segmentation, reranking) + - Simple integration with reasonable pricing + - Consistent approach across different semantic processing tasks + +## 2025-02-27: Search Execution Architecture + +### Decision: Search Execution Architecture +- **Context**: We needed to implement a search execution module that could execute search queries across multiple search engines and process the results in a standardized way. + +- **Decision**: + 1. Create a modular search execution architecture: + - Implement a base handler interface (`BaseSearchHandler`) for all search API handlers + - Create specific handlers for each search engine (Google, Serper, Scholar, arXiv) + - Develop a central `SearchExecutor` class to manage execution across multiple engines + - Implement a `ResultCollector` class for processing and organizing results + + 2. Use parallel execution for search queries: + - Implement thread-based parallelism using `concurrent.futures` + - Add support for both synchronous and asynchronous execution + - Include timeout management and error handling + + 3. Standardize search results: + - Define a common result format across all search engines + - Include metadata specific to each search engine in a standardized way + - Implement deduplication and scoring for result ranking + +- **Rationale**: + - A modular architecture allows for easy addition of new search engines + - Parallel execution significantly improves search performance + - Standardized result format simplifies downstream processing + - Separation of concerns between execution and result processing + +- **Alternatives Considered**: + 1. 
Sequential execution of search queries: + - Simpler implementation + - Much slower performance + - Would not scale well with additional search engines + + 2. Separate modules for each search engine: + - Would lead to code duplication + - More difficult to maintain + - Less consistent result format + + 3. Using a third-party search aggregation service: + - Would introduce additional dependencies + - Less control over the search process + - Potential cost implications + +- **Impact**: + - Efficient execution of search queries across multiple engines + - Consistent result format for downstream processing + - Flexible architecture that can be extended with new search engines + - Clear separation of concerns between different components + +## 2025-02-27: Search Execution Module Refinements + +### Decision: Remove Google Search Handler +- **Context**: Both Google and Serper handlers were implemented, but Serper is essentially a front-end for Google search +- **Options Considered**: + 1. Keep both handlers for redundancy + 2. Remove the Google handler and only use Serper +- **Decision**: Remove the Google search handler +- **Rationale**: + - Redundant functionality as Serper provides the same results + - Simplifies the codebase and reduces maintenance + - Reduces API costs by avoiding duplicate searches + - Serper provides a more reliable and consistent API for Google search + +### Decision: Modify LLM Query Enhancement Prompt +- **Context**: The LLM was returning enhanced queries with explanations, which caused issues with search APIs +- **Options Considered**: + 1. Post-process the LLM output to extract just the query + 2. 
Modify the prompt to request only the enhanced query +- **Decision**: Modify the LLM prompt to request only the enhanced query without explanations +- **Rationale**: + - More reliable than post-processing, which could be error-prone + - Cleaner implementation that addresses the root cause + - Ensures consistent output format for downstream processing + - Reduces the risk of exceeding API character limits + +### Decision: Implement Query Truncation +- **Context**: Enhanced queries could exceed the Serper API's 2048 character limit +- **Options Considered**: + 1. Limit the LLM's output length + 2. Truncate queries before sending to the API + 3. Split long queries into multiple searches +- **Decision**: Implement query truncation in the search executor +- **Rationale**: + - Simple and effective solution + - Preserves as much of the enhanced query as possible + - Ensures API requests don't fail due to length constraints + - Can be easily adjusted if API limits change + +## 2025-02-27: Testing Strategy for Query Processor + +### Context +After integrating Groq and OpenRouter as additional LLM providers, we needed to verify that the query processor module functions correctly with these new providers. + +### Decision +1. Create dedicated test scripts to validate the query processor functionality: + - A basic test script for the core processing pipeline + - A comprehensive test script for detailed component testing + +2. Use monkey patching to ensure tests consistently use the Groq model: + - Create a global LLM interface with the Groq model + - Override the `get_llm_interface` function to always return this interface + - This approach allows testing without modifying the core code + +3. 
Test all key functionality of the query processor: + - Query enhancement + - Query classification + - Search query generation + - End-to-end processing pipeline + +### Rationale +- Dedicated test scripts provide a repeatable way to verify functionality +- Monkey patching allows testing with specific models without changing the core code +- Comprehensive testing ensures all components work correctly with the new providers +- Saving test results to a JSON file provides a reference for future development + +### Alternatives Considered +1. Modifying the query processor to accept a model parameter: + - Would require changing the core code + - Could introduce bugs in the production code + +2. Using environment variables to control model selection: + - Less precise control over which model is used + - Could interfere with other tests or production use + +### Impact +- Verified that the query processor works correctly with Groq models +- Established a testing approach that can be used for other modules +- Created reusable test scripts for future development diff --git a/.note/development_standards.md b/.note/development_standards.md new file mode 100644 index 0000000..1a9eab9 --- /dev/null +++ b/.note/development_standards.md @@ -0,0 +1,48 @@ +# Development Standards + +## Coding Conventions + +### Python Style +- Follow PEP 8 style guidelines for Python code +- Use 4 spaces for indentation (not tabs) +- Maximum line length of 79 characters +- Use docstrings for all modules, classes, and functions +- Include type hints for function parameters and return values + +### Documentation +- All modules should have a module-level docstring explaining their purpose +- All functions and classes should have docstrings following the Google style: + - Brief description + - Args section with parameter descriptions + - Returns section describing return values + - Raises section for exceptions that might be raised + +### Error Handling +- Use custom exception classes for domain-specific 
errors (e.g., TokenLimitError) +- Handle exceptions at appropriate levels +- Provide informative error messages +- Log errors with sufficient context for debugging + +## Project Structure +- Keep modules focused on a single responsibility +- Separate API interaction from business logic +- Use environment variables for configuration and secrets +- Include sample files for testing and demonstration + +## Testing +- Write unit tests for core functionality +- Include integration tests for API interactions +- Use sample files for consistent test cases +- Test error handling and edge cases + +## API Usage +- Always include proper authentication headers +- Handle API rate limits and errors gracefully +- Document API dependencies and version requirements +- Include comments with links to API documentation + +## Security +- Never hardcode API keys or credentials +- Use environment variables for sensitive information +- Validate and sanitize inputs +- Handle errors without exposing sensitive information diff --git a/.note/interfaces.md b/.note/interfaces.md new file mode 100644 index 0000000..a4eea7f --- /dev/null +++ b/.note/interfaces.md @@ -0,0 +1,720 @@ +# Component Interfaces + +## Current Interfaces + +### JinaSimilarity Class + +#### Initialization +```python +js = JinaSimilarity() +``` +- **Description**: Initializes the JinaSimilarity class +- **Requirements**: JINA_API_KEY environment variable must be set +- **Raises**: ValueError if JINA_API_KEY is not set + +#### count_tokens +```python +token_count = js.count_tokens(text) +``` +- **Description**: Counts the number of tokens in a text +- **Parameters**: + - `text` (str): The text to count tokens for +- **Returns**: int - Number of tokens in the text +- **Dependencies**: tiktoken library + +#### get_embedding +```python +embedding = js.get_embedding(text) +``` +- **Description**: Generates an embedding for a text using Jina AI's Embeddings API +- **Parameters**: + - `text` (str): The text to generate an embedding 
for (max 8,192 tokens) +- **Returns**: list - The embedding vector +- **Raises**: + - `TokenLimitError`: If the text exceeds 8,192 tokens + - `requests.exceptions.RequestException`: If the API call fails +- **Dependencies**: requests library, Jina AI API + +#### compute_similarity +```python +similarity, chunk_embedding, query_embedding = js.compute_similarity(chunk, query) +``` +- **Description**: Computes similarity between a text chunk and a query +- **Parameters**: + - `chunk` (str): The text chunk to compare against + - `query` (str): The query text +- **Returns**: Tuple containing: + - `similarity` (float): Cosine similarity score (0-1) + - `chunk_embedding` (list): Chunk embedding + - `query_embedding` (list): Query embedding +- **Raises**: + - `TokenLimitError`: If either text exceeds 8,192 tokens + - `requests.exceptions.RequestException`: If the API calls fail +- **Dependencies**: numpy library, get_embedding method + +### Markdown Segmenter + +#### segment_markdown +```python +segments = segment_markdown(file_path) +``` +- **Description**: Segments a markdown file using Jina AI's Segmenter API +- **Parameters**: + - `file_path` (str): Path to the markdown file +- **Returns**: dict - JSON structure containing the segments +- **Raises**: Exception if segmentation fails +- **Dependencies**: requests library, Jina AI API + +### Test Similarity Script + +#### Command-line Interface +``` +python test_similarity.py chunk_file query_file [--verbose] +``` +- **Description**: Computes similarity between text from two files +- **Arguments**: + - `chunk_file`: Path to the file containing the text chunk + - `query_file`: Path to the file containing the query + - `--verbose` or `-v`: Print token counts and embeddings +- **Output**: Similarity score and optional verbose information +- **Dependencies**: JinaSimilarity class + +#### read_file +```python +content = read_file(file_path) +``` +- **Description**: Reads content from a file +- **Parameters**: + - `file_path` 
(str): Path to the file to read +- **Returns**: str - Content of the file +- **Raises**: FileNotFoundError if the file doesn't exist + +## Search Execution Module + +### SearchExecutor Class + +#### Initialization +```python +from execution.search_executor import SearchExecutor +executor = SearchExecutor() +``` +- **Description**: Initializes the SearchExecutor class +- **Requirements**: Configuration file with API keys for search engines + +#### execute_search +```python +results = executor.execute_search(query_data) +``` +- **Description**: Executes a search across multiple search engines +- **Parameters**: + - `query_data` (dict): Dictionary containing query information with keys: + - `raw_query` (str): The original user query + - `enhanced_query` (str): The enhanced query from the LLM + - `search_engines` (list, optional): List of search engines to use + - `num_results` (int, optional): Number of results to return per engine +- **Returns**: Dict[str, List[Dict[str, Any]]] - Dictionary mapping search engine names to lists of search results +- **Example**: +```python +results = executor.execute_search({ + 'raw_query': 'quantum computing', + 'enhanced_query': 'recent advancements in quantum computing algorithms and hardware' +}) +``` + +### BaseSearchHandler Class + +#### search +```python +results = handler.search(query, num_results=10, **kwargs) +``` +- **Description**: Abstract method for searching implemented by all handlers +- **Parameters**: + - `query` (str): The search query + - `num_results` (int): Number of results to return + - `**kwargs`: Additional parameters specific to the search engine +- **Returns**: List[Dict[str, Any]] - List of search results +- **Example**: +```python +from execution.api_handlers.serper_handler import SerperSearchHandler +handler = SerperSearchHandler() +results = handler.search("quantum computing", num_results=5) +``` + +### SerperSearchHandler Class + +#### search +```python +from execution.api_handlers.serper_handler import 
SerperSearchHandler +handler = SerperSearchHandler() +results = handler.search(query, num_results=10, **kwargs) +``` +- **Description**: Executes a search using the Serper API +- **Parameters**: + - `query` (str): The search query + - `num_results` (int): Number of results to return + - `**kwargs`: Additional parameters for the Serper API +- **Returns**: List[Dict[str, Any]] - List of search results with keys: + - `title` (str): Title of the result + - `url` (str): URL of the result + - `snippet` (str): Snippet of text from the result + - `source` (str): Source of the result (always "serper") +- **Requirements**: Serper API key in configuration +- **Example**: +```python +results = handler.search("quantum computing", num_results=5) +``` + +### ScholarSearchHandler Class + +#### search +```python +from execution.api_handlers.scholar_handler import ScholarSearchHandler +handler = ScholarSearchHandler() +results = handler.search(query, num_results=10, **kwargs) +``` +- **Description**: Executes a search on Google Scholar using the Serper API +- **Parameters**: + - `query` (str): The search query + - `num_results` (int): Number of results to return + - `**kwargs`: Additional parameters for the Scholar API +- **Returns**: List[Dict[str, Any]] - List of search results with keys: + - `title` (str): Title of the paper + - `url` (str): URL of the paper + - `snippet` (str): Snippet of text from the paper + - `source` (str): Source of the result (always "scholar") + - `authors` (str): Authors of the paper + - `publication` (str): Publication venue + - `year` (int): Publication year +- **Requirements**: Serper API key in configuration +- **Example**: +```python +results = handler.search("quantum computing", num_results=5) +``` + +### ArxivSearchHandler Class + +#### search +```python +from execution.api_handlers.arxiv_handler import ArxivSearchHandler +handler = ArxivSearchHandler() +results = handler.search(query, num_results=10, **kwargs) +``` +- **Description**: Executes a 
search on arXiv +- **Parameters**: + - `query` (str): The search query + - `num_results` (int): Number of results to return + - `**kwargs`: Additional parameters for the arXiv API +- **Returns**: List[Dict[str, Any]] - List of search results with keys: + - `title` (str): Title of the paper + - `url` (str): URL of the paper + - `pdf_url` (str): URL to the PDF + - `snippet` (str): Abstract of the paper + - `source` (str): Source of the result (always "arxiv") + - `arxiv_id` (str): arXiv ID + - `authors` (list): List of author names + - `categories` (list): List of arXiv categories + - `published_date` (str): Publication date + - `updated_date` (str): Last update date + - `full_text` (str): Full abstract text +- **Example**: +```python +results = handler.search("quantum computing", num_results=5) +``` + +### ResultCollector Class + +#### process_results +```python +from execution.result_collector import ResultCollector +collector = ResultCollector() +processed_results = collector.process_results(search_results, dedup=True, max_results=None) +``` +- **Description**: Processes search results from multiple search engines +- **Parameters**: + - `search_results` (Dict[str, List[Dict[str, Any]]]): Dictionary mapping search engine names to lists of search results + - `dedup` (bool): Whether to deduplicate results based on URL + - `max_results` (Optional[int]): Maximum number of results to return +- **Returns**: List[Dict[str, Any]] - Combined and processed list of search results +- **Example**: +```python +processed_results = collector.process_results({ + 'serper': serper_results, + 'scholar': scholar_results, + 'arxiv': arxiv_results +}, dedup=True, max_results=20) +``` + +#### save_results +```python +collector.save_results(results, file_path) +``` +- **Description**: Saves search results to a JSON file +- **Parameters**: + - `results` (List[Dict[str, Any]]): List of search results + - `file_path` (str): Path to save the results +- **Example**: +```python 
+collector.save_results(processed_results, "search_results.json") +``` + +## Planned Interfaces for Research System + +### ResearchSystem Class + +#### Initialization +```python +rs = ResearchSystem(config=None) +``` +- **Description**: Initializes the ResearchSystem with optional configuration +- **Parameters**: + - `config` (dict, optional): Configuration options for the research system +- **Requirements**: Various API keys set in environment variables or config +- **Raises**: ValueError if required API keys are not set + +#### execute_research +```python +report = rs.execute_research(query, options=None) +``` +- **Description**: Executes a complete research pipeline from query to report +- **Parameters**: + - `query` (str): The research query + - `options` (dict, optional): Options to customize the research process +- **Returns**: dict - Research report with metadata +- **Raises**: Various exceptions for different stages of the pipeline + +#### save_report +```python +rs.save_report(report, file_path, format="markdown") +``` +- **Description**: Saves the research report to a file +- **Parameters**: + - `report` (dict): The research report to save + - `file_path` (str): Path to save the report + - `format` (str, optional): Format of the report (markdown, html, pdf) +- **Raises**: IOError if the file cannot be saved + +### QueryProcessor Class + +#### process_query +```python +structured_query = query_processor.process_query(query) +``` +- **Description**: Processes a raw query into a structured format +- **Parameters**: + - `query` (str): The raw research query +- **Returns**: dict - Structured query with metadata +- **Raises**: ValueError if the query is invalid + +### SearchStrategy Class + +#### develop_strategy +```python +search_plan = search_strategy.develop_strategy(structured_query) +``` +- **Description**: Develops a search strategy based on the query +- **Parameters**: + - `structured_query` (dict): The structured query +- **Returns**: dict - Search 
plan with target-specific queries +- **Raises**: ValueError if the query cannot be processed + +### SearchExecutor Class + +#### execute_search +```python +search_results = search_executor.execute_search(search_plan) +``` +- **Description**: Executes search queries against selected targets +- **Parameters**: + - `search_plan` (dict): The search plan with queries +- **Returns**: dict - Collection of search results +- **Raises**: APIError if the search APIs fail + +### JinaReranker Class + +#### rerank +```python +ranked_documents = jina_reranker.rerank(query, documents, top_n=None) +``` +- **Description**: Rerank documents based on their relevance to the query. +- **Parameters**: + - `query` (str): The query to rank documents against + - `documents` (List[str]): List of document strings to rerank + - `top_n` (Optional[int]): Number of top results to return (optional) +- **Returns**: List of dictionaries containing reranked documents with scores and indices + +#### rerank_with_metadata +```python +ranked_documents = jina_reranker.rerank_with_metadata(query, documents, document_key='content', top_n=None) +``` +- **Description**: Rerank documents with metadata based on their relevance to the query. +- **Parameters**: + - `query` (str): The query to rank documents against + - `documents` (List[Dict[str, Any]]): List of document dictionaries containing content and metadata + - `document_key` (str): The key in the document dictionaries that contains the text content + - `top_n` (Optional[int]): Number of top results to return (optional) +- **Returns**: List of dictionaries containing reranked documents with scores, indices, and original metadata + +#### get_jina_reranker +```python +jina_reranker = get_jina_reranker() +``` +- **Description**: Get the global Jina Reranker instance. 
+- **Returns**: JinaReranker instance + +### DocumentScraper Class + +#### scrape_documents +```python +markdown_documents = document_scraper.scrape_documents(ranked_documents) +``` +- **Description**: Scrapes and converts documents to markdown +- **Parameters**: + - `ranked_documents` (list): The ranked list of documents to scrape +- **Returns**: list - Collection of markdown documents +- **Raises**: ScrapingError if the documents cannot be scraped + +### DocumentSelector Class + +#### select_documents +```python +selected_documents = document_selector.select_documents(documents_with_scores) +``` +- **Description**: Selects the most relevant and diverse documents +- **Parameters**: + - `documents_with_scores` (list): Documents with similarity scores +- **Returns**: list - Curated set of documents +- **Raises**: ValueError if the selection criteria are invalid + +### ReportGenerator Class + +#### generate_report +```python +report = report_generator.generate_report(selected_documents, query) +``` +- **Description**: Generates a research report from selected documents +- **Parameters**: + - `selected_documents` (list): The selected documents + - `query` (str): The original query for context +- **Returns**: dict - Final research report +- **Raises**: GenerationError if the report cannot be generated + +## Search Execution Module + +### SearchExecutor Class + +The `SearchExecutor` class manages the execution of search queries across multiple search engines. 
+ +#### Initialization +```python +executor = SearchExecutor() +``` +- **Description**: Initializes the search executor with available search handlers +- **Requirements**: Appropriate API keys must be set for the search engines to be used + +#### execute_search +```python +results = executor.execute_search(structured_query, search_engines=["google", "scholar"], num_results=10) +``` +- **Description**: Executes search queries across specified search engines in parallel +- **Parameters**: + - `structured_query` (Dict[str, Any]): The structured query from the query processor + - `search_engines` (Optional[List[str]]): List of search engines to use + - `num_results` (int): Number of results to return per search engine + - `timeout` (int): Timeout in seconds for each search engine +- **Returns**: Dict[str, List[Dict[str, Any]]] - Dictionary mapping search engine names to lists of search results + +#### execute_search_async +```python +results = await executor.execute_search_async(structured_query, search_engines=["google", "scholar"]) +``` +- **Description**: Executes search queries across specified search engines asynchronously +- **Parameters**: Same as `execute_search` +- **Returns**: Dict[str, List[Dict[str, Any]]] - Dictionary mapping search engine names to lists of search results + +#### get_available_search_engines +```python +engines = executor.get_available_search_engines() +``` +- **Description**: Gets a list of available search engines +- **Returns**: List[str] - List of available search engine names + +### ResultCollector Class + +The `ResultCollector` class processes and organizes search results from multiple search engines. 
+ +#### Initialization +```python +collector = ResultCollector() +``` +- **Description**: Initializes the result collector + +#### process_results +```python +processed_results = collector.process_results(search_results, dedup=True, max_results=20) +``` +- **Description**: Processes search results from multiple search engines +- **Parameters**: + - `search_results` (Dict[str, List[Dict[str, Any]]]): Dictionary mapping search engine names to lists of search results + - `dedup` (bool): Whether to deduplicate results based on URL + - `max_results` (Optional[int]): Maximum number of results to return +- **Returns**: List[Dict[str, Any]] - List of processed search results + +#### filter_results +```python +filtered_results = collector.filter_results(results, filters={"domains": ["arxiv.org"], "min_score": 5}) +``` +- **Description**: Filters results based on specified criteria +- **Parameters**: + - `results` (List[Dict[str, Any]]): List of search results + - `filters` (Dict[str, Any]): Dictionary of filter criteria +- **Returns**: List[Dict[str, Any]] - Filtered list of search results + +#### group_results_by_domain +```python +grouped_results = collector.group_results_by_domain(results) +``` +- **Description**: Groups results by domain +- **Parameters**: + - `results` (List[Dict[str, Any]]): List of search results +- **Returns**: Dict[str, List[Dict[str, Any]]] - Dictionary mapping domains to lists of search results + +### BaseSearchHandler Interface + +The `BaseSearchHandler` class defines the interface for all search API handlers. 
+ +#### search +```python +results = handler.search(query, num_results=10, **kwargs) +``` +- **Description**: Executes a search query +- **Parameters**: + - `query` (str): The search query to execute + - `num_results` (int): Number of results to return + - `**kwargs`: Additional search parameters specific to the API +- **Returns**: List[Dict[str, Any]] - List of search results + +#### get_name +```python +name = handler.get_name() +``` +- **Description**: Gets the name of the search handler +- **Returns**: str - Name of the search handler + +#### is_available +```python +available = handler.is_available() +``` +- **Description**: Checks if the search API is available +- **Returns**: bool - True if the API is available, False otherwise + +#### get_rate_limit_info +```python +rate_limits = handler.get_rate_limit_info() +``` +- **Description**: Gets information about the API's rate limits +- **Returns**: Dict[str, Any] - Dictionary with rate limit information + +## Search Execution Testing + +The search execution module has been tested to ensure it correctly executes search queries across multiple search engines and processes the results. + +### Test Script (test_search_execution.py) + +```python +# Process a query and execute search +results = test_search_execution("What are the latest advancements in quantum computing?") + +# Save test results +save_test_results(results, "search_execution_test_results.json") +``` + +- **Purpose**: Tests the search execution module with various queries +- **Features**: + - Tests with multiple queries + - Uses all available search engines + - Saves results to a JSON file + - Provides detailed output of search results + +## Document Ranking Interface + +### JinaReranker + +The `JinaReranker` class provides an interface for reranking documents based on their relevance to a query using Jina AI's Reranker API. 
+ +#### Methods + +```python +def rerank(query: str, documents: List[str], top_n: Optional[int] = None) -> List[Dict[str, Any]]: + """ + Rerank documents based on their relevance to the query. + + Args: + query: The query to rank documents against + documents: List of document strings to rerank + top_n: Number of top results to return (optional) + + Returns: + List of dictionaries containing reranked documents with scores and indices + """ +``` + +```python +def rerank_with_metadata(query: str, documents: List[Dict[str, Any]], + document_key: str = 'content', + top_n: Optional[int] = None) -> List[Dict[str, Any]]: + """ + Rerank documents with metadata based on their relevance to the query. + + Args: + query: The query to rank documents against + documents: List of document dictionaries containing content and metadata + document_key: The key in the document dictionaries that contains the text content + top_n: Number of top results to return (optional) + + Returns: + List of dictionaries containing reranked documents with scores, indices, and original metadata + """ +``` + +#### Factory Function + +```python +def get_jina_reranker() -> JinaReranker: + """ + Get the global Jina Reranker instance. + + Returns: + JinaReranker instance + """ +``` + +#### Example Usage + +```python +from ranking.jina_reranker import get_jina_reranker + +# Get the reranker +reranker = get_jina_reranker() + +# Rerank documents +results = reranker.rerank( + query="What is quantum computing?", + documents=["Document about quantum physics", "Document about quantum computing", "Document about classical computing"], + top_n=2 +) + +# Process results +for result in results: + print(f"Score: {result['score']}, Document: {result['document']}") + +## Query Processor Testing + +The query processor module has been tested with the Groq LLM provider to ensure it functions correctly with the newly integrated models. 
+ +### Test Scripts + +Two test scripts have been created to validate the query processor functionality: + +#### Basic Test Script (test_query_processor.py) + +```python +# Get the query processor +processor = get_query_processor() + +# Process a query +result = processor.process_query("What are the latest advancements in quantum computing?") + +# Generate search queries +search_result = processor.generate_search_queries(result, ["google", "bing", "scholar"]) +``` + +- **Purpose**: Tests the core functionality of the query processor +- **Features**: + - Uses monkey patching to ensure the Groq model is used + - Provides detailed output of processing results + +#### Comprehensive Test Script (test_query_processor_comprehensive.py) + +```python +# Test query enhancement +enhanced_query = test_enhance_query("What is quantum computing?") + +# Test query classification +classification = test_classify_query("What is quantum computing?") + +# Test the full processing pipeline +structured_query = test_process_query("What is quantum computing?") + +# Test search query generation +search_result = test_generate_search_queries(structured_query, ["google", "bing", "scholar"]) +``` + +- **Purpose**: Tests all aspects of the query processor in detail +- **Features**: + - Tests individual components in isolation + - Tests a variety of query types + - Saves detailed test results to a JSON file + +## LLM Interface + +### LLMInterface Class + +The `LLMInterface` class provides a unified interface for interacting with various LLM providers through LiteLLM. 
+ +#### Initialization +```python +llm = LLMInterface(model_name="gpt-4") +``` +- **Description**: Initializes the LLM interface with the specified model +- **Parameters**: + - `model_name` (Optional[str]): The name of the model to use (defaults to config value) +- **Requirements**: Appropriate API key must be set in environment or config + +#### complete +```python +response = llm.complete(prompt, system_prompt=None, temperature=None, max_tokens=None) +``` +- **Description**: Generates a completion for the given prompt +- **Parameters**: + - `prompt` (str): The prompt to complete + - `system_prompt` (Optional[str]): System prompt for context + - `temperature` (Optional[float]): Temperature for generation + - `max_tokens` (Optional[int]): Maximum tokens to generate +- **Returns**: str - The generated completion +- **Raises**: LLMError if the completion fails + +#### complete_json +```python +json_response = llm.complete_json(prompt, system_prompt=None, json_schema=None) +``` +- **Description**: Generates a JSON response for the given prompt +- **Parameters**: + - `prompt` (str): The prompt to complete + - `system_prompt` (Optional[str]): System prompt for context + - `json_schema` (Optional[Dict]): JSON schema for validation +- **Returns**: Dict - The generated JSON response +- **Raises**: LLMError if the completion fails or JSON is invalid + +#### Supported Providers +- OpenAI +- Azure OpenAI +- Anthropic +- Ollama +- Groq +- OpenRouter + +#### Example Usage +```python +from query.llm_interface import LLMInterface + +# Initialize with specific model +llm = LLMInterface(model_name="llama-3.1-8b-instant") + +# Generate a completion +response = llm.complete( + prompt="Explain quantum computing", + system_prompt="You are a helpful assistant that explains complex topics simply.", + temperature=0.7 +) + +print(response) diff --git a/.note/project_overview.md b/.note/project_overview.md new file mode 100644 index 0000000..68af05c --- /dev/null +++ 
b/.note/project_overview.md @@ -0,0 +1,97 @@ +# Project Overview: Intelligent Research System with Semantic Search + +## Purpose +This project implements an intelligent research system that automates the process of finding, filtering, and synthesizing information from various sources. At its core, the system uses semantic similarity search powered by Jina AI's APIs to understand context beyond simple keyword matching, enabling more intelligent document processing and information retrieval. + +## Goals +1. Create an end-to-end research automation system that handles the entire process from query to final report +2. Leverage multiple search sources to gather comprehensive information (Serper, Google Scholar, arXiv) +3. Implement intelligent filtering and ranking of documents using semantic similarity +4. Produce synthesized reports that extract and combine the most relevant information +5. Build a modular and extensible architecture that can be enhanced with additional capabilities + +## High-Level Architecture +The system follows a modular pipeline: + +1. **Query Processing**: + - Accept and process user research queries + - Enhance queries with additional context and structure + - Classify queries by type, intent, and entities + - Generate optimized queries for different search engines + +2. **Search Execution**: + - Execute search queries across multiple search engines (Serper, Google Scholar, arXiv) + - Collect and process search results + - Handle deduplication and result filtering + +3. **Document Ranking**: + - Use Jina AI's Re-Ranker to order documents by relevance + - Filter out less relevant documents + - Apply additional filtering based on metadata (date, source, etc.) + +4. **Report Generation**: + - Synthesize a comprehensive report from the selected documents + - Format the report for readability + - Include citations and references + +5. 
**User Interface**: + - Provide a Gradio-based web interface for user interaction + - Display search results and generated reports + - Allow configuration of search parameters + +## Current Implementation Status + +The project currently has the following modules implemented: + +1. **Configuration Module**: + - Manages configuration settings for the entire system + - Handles API keys and model selections + - Supports different LLM providers and endpoints + +2. **Query Processing Module**: + - Processes and enhances user queries + - Classifies queries by type and intent + - Generates optimized search queries + - Integrates with LiteLLM for LLM provider support + +3. **Search Execution Module**: + - Executes search queries across multiple search engines + - Implements handlers for Serper, Google Scholar, and arXiv + - Collects and processes search results + - Handles deduplication and result filtering + +4. **Document Ranking Module**: + - Implements Jina AI's Re-Ranker for document ranking + - Supports reranking with metadata preservation + - Provides filtering capabilities + +## Dependencies +- `requests`: For making API calls to various APIs +- `numpy`: For vector operations in similarity computation +- `tiktoken`: For tokenization and token counting +- `litellm`: For unified LLM provider interface +- `pyyaml`: For configuration file parsing +- `feedparser`: For parsing RSS/Atom feeds (arXiv) +- `beautifulsoup4`: For HTML parsing +- `gradio`: For web interface (planned) + +## LLM Providers +The system supports multiple LLM providers through the LiteLLM interface: +- Groq (currently using Llama 3.1-8b-instant) +- OpenAI +- Anthropic +- OpenRouter +- Azure OpenAI + +## Search Engines +The system currently integrates with the following search engines: +- Serper API (for Google search) +- Google Scholar (via Serper API) +- arXiv (via official API) + +## Next Steps +1. Implement the Report Generation module +2. Develop the Gradio UI for user interaction +3. 
Add more search engines and LLM providers +4. Implement document retrieval and processing +5. Add support for saving and loading research sessions diff --git a/.note/session_log.md b/.note/session_log.md new file mode 100644 index 0000000..b8e88fc --- /dev/null +++ b/.note/session_log.md @@ -0,0 +1,350 @@ +# Session Log + +## Session: 2025-02-27 + +### Overview +Initial project setup and implementation of core functionality for semantic similarity search using Jina AI's APIs. + +### Key Activities +1. Created the core `JinaSimilarity` class in jina_similarity.py with the following features: + - Token counting using tiktoken + - Embedding generation using Jina AI's Embeddings API + - Similarity computation using cosine similarity + - Error handling for token limit violations + +2. Implemented the markdown segmenter in markdown_segmenter.py: + - Segmentation of markdown documents using Jina AI's Segmenter API + - Command-line interface for easy usage + +3. Developed a test script (test_similarity.py) with: + - Command-line argument parsing + - File reading functionality + - Verbose output option for debugging + - Error handling + +4. Created sample files for testing: + - sample_chunk.txt: Contains a paragraph about pangrams + - sample_query.txt: Contains a question about pangrams + +### Insights +- Jina AI's embedding model (jina-embeddings-v3) provides high-quality embeddings for semantic search +- The token limit of 8,192 tokens is sufficient for most use cases, but longer documents need segmentation +- Normalizing embeddings simplifies similarity computation (dot product equals cosine similarity) +- Separating segmentation from similarity computation provides better modularity + +### Challenges +- Ensuring proper error handling for API failures +- Managing token limits for large documents +- Balancing between chunking granularity and semantic coherence + +### Next Steps +1. Add tiktoken to requirements.txt +2. 
Implement caching for embeddings to reduce API calls +3. Add batch processing capabilities for multiple chunks/queries +4. Create comprehensive documentation and usage examples +5. Develop integration tests for reliability testing + +## Session: 2025-02-27 (Update) + +### Overview +Created memory bank for the project to maintain persistent knowledge about the codebase and development progress. + +### Key Activities +1. Created the `.note/` directory to store memory bank files +2. Created the following memory bank files: + - project_overview.md: Purpose, goals, and high-level architecture + - current_focus.md: Active work, recent changes, and next steps + - development_standards.md: Coding conventions and patterns + - decision_log.md: Key decisions with rationale + - code_structure.md: Codebase organization with module descriptions + - session_log.md: History of development sessions + - interfaces.md: Component interfaces and API documentation + +### Insights +- The project has a clear structure with well-defined components +- The use of Jina AI's APIs provides powerful semantic search capabilities +- The modular design allows for easy extension and maintenance +- Some improvements are needed, such as adding tiktoken to requirements.txt + +### Next Steps +1. Update requirements.txt to include all dependencies (tiktoken) +2. Implement caching mechanism for embeddings +3. Add batch processing capabilities +4. Create comprehensive documentation +5. Develop integration tests + +## Session: 2025-02-27 (Update 2) + +### Overview +Expanded the project scope to build a comprehensive intelligent research system with an 8-stage pipeline. + +### Key Activities +1. Defined the overall architecture for the intelligent research system: + - 8-stage pipeline from query acceptance to report generation + - Multiple search sources (Google, Serper, Jina Search, Google Scholar, arXiv) + - Semantic processing using Jina AI's APIs + +2. 
Updated the memory bank to reflect the broader vision: + - Revised project_overview.md with the complete research system goals + - Updated current_focus.md with next steps for each pipeline stage + - Enhanced code_structure.md with planned project organization + - Added new decisions to decision_log.md + +### Insights +- The modular pipeline architecture allows for incremental development +- Jina AI's suite of APIs provides a consistent approach to semantic processing +- Multiple search sources will provide more comprehensive research results +- The current similarity components fit naturally into stages 6-7 of the pipeline + +### Next Steps +1. Begin implementing the query processing module (stage 1) +2. Design the data structures for passing information between pipeline stages +3. Create a project roadmap with milestones for each stage +4. Prioritize development of core components for an end-to-end MVP + +## Session: 2025-02-27 (Update 3) + +### Overview +Planned the implementation of the Query Processing Module with LiteLLM integration and Gradio UI. + +### Key Activities +1. Researched LiteLLM integration: + - Explored LiteLLM documentation and usage patterns + - Investigated integration with Gradio for UI development + - Identified configuration requirements and best practices + +2. Developed implementation plan: + - Prioritized Query Processing Module with LiteLLM integration + - Planned Gradio UI implementation for user interaction + - Outlined configuration structure for API keys and settings + - Established a sequence for implementing remaining modules + +3. 
Updated memory bank: + - Revised current_focus.md with new implementation plan + - Added immediate and future steps for development + +### Insights +- LiteLLM provides a unified interface to multiple LLM providers, simplifying integration +- Gradio offers an easy way to create interactive UIs for AI applications +- The modular approach allows for incremental development and testing +- Existing similarity components can be integrated into the pipeline at a later stage + +### Next Steps +1. Update requirements.txt with new dependencies (litellm, gradio, etc.) +2. Create configuration structure for secure API key management +3. Implement LiteLLM interface for query enhancement and classification +4. Develop the query processor with structured output +5. Build the Gradio UI for user interaction + +## Session: 2025-02-27 (Update 4) + +### Overview +Implemented module-specific model configuration and created the Jina AI Reranker module. + +### Key Activities +1. Enhanced configuration structure: + - Added support for module-specific model assignments + - Configured different models for different tasks + - Added detailed endpoint configurations for various providers + +2. Updated LLMInterface: + - Modified to support module-specific model configurations + - Added support for different endpoint types (OpenAI, Azure, Ollama) + - Implemented method delegation to use appropriate models for each task + +3. Created Jina AI Reranker module: + - Implemented document reranking using Jina AI's Reranker API + - Added support for reranking documents with metadata + - Configured to use the "jina-reranker-v2-base-multilingual" model + +### Insights +- Using different models for different tasks allows for optimizing performance and cost +- Jina's reranker provides a specialized solution for document ranking +- The modular approach allows for easy swapping of components and models + +### Next Steps +1. Implement the remaining query processing components +2. 
Create the Gradio UI for user interaction
+3. Develop the search execution module to integrate with search APIs
+
+## Session: 2025-02-27 (Update 5)
+
+### Overview
+Added support for OpenRouter and Groq as LLM providers and configured the system to use Groq for testing.
+
+### Key Activities
+1. Enhanced configuration:
+   - Added API key configurations for OpenRouter and Groq
+   - Added model configurations for Groq's Llama models (3.1-8b-instant and 3.3-70b-versatile)
+   - Added model configurations for OpenRouter's models (Mixtral and Claude)
+   - Updated default model to use Groq's Llama 3.1-8b-instant for testing
+
+2. Updated LLM Interface:
+   - Enhanced the `_get_completion_params` method to handle Groq and OpenRouter providers
+   - Added special handling for OpenRouter's HTTP headers
+   - Updated the API key retrieval to support the new providers
+
+3. Configured module-specific models:
+   - Set most modules to use Groq's Llama 3.1-8b-instant model for testing
+   - Kept Jina's reranker for document reranking
+   - Set report synthesis to use Groq's Llama 3.3-70b-versatile model for higher quality
+
+### Insights
+- Using Groq for testing provides fast inference times with high-quality models
+- OpenRouter offers flexibility to access various models through a single API
+- The modular approach allows for easy switching between different providers
+
+### Next Steps
+1. Test the system with Groq's models to evaluate performance
+2. Implement the remaining query processing components
+3. Create the Gradio UI for user interaction
+
+## Session: 2025-02-27 (Update 6)
+
+### Overview
+Tested the query processor module with Groq models to ensure functionality with the newly integrated LLM providers.
+
+### Key Activities
+1. Created test scripts for the query processor:
+   - Developed a basic test script (`test_query_processor.py`) to verify the query processing pipeline
+   - Created a comprehensive test script (`test_query_processor_comprehensive.py`) to test all aspects of query processing
+   - Implemented monkey patching to ensure tests use the Groq models
+
+2. Verified query processor functionality:
+   - Tested query enhancement with Groq's Llama 3.1-8b-instant model
+   - Tested query classification with structured output
+   - Tested search query generation for multiple search engines
+   - Confirmed the entire processing pipeline works end-to-end
+
+3. Resolved integration issues:
+   - Fixed configuration loading to properly use the Groq API key
+   - Ensured LLM interface correctly initializes with Groq models
+   - Verified that the query processor correctly uses the LLM interface
+
+### Insights
+- Groq's Llama 3.1-8b-instant model performs well for query processing tasks with fast response times
+- The modular design allows for easy switching between different LLM providers
+- The query processor successfully enhances queries by adding context and structure
+- Query classification provides useful metadata for downstream processing
+
+### Next Steps
+1. Implement the search execution module to integrate with search APIs
+2. Create the Gradio UI for user interaction
+3. Test the full system with end-to-end workflows
+
+## Session: 2025-02-27 - Comprehensive Testing of Query Processor
+
+### Objectives
+- Create a comprehensive test script for the query processor
+- Test all aspects of the query processor with various query types
+- Document the testing approach and results
+
+### Accomplishments
+1. Created a comprehensive test script (`test_query_processor_comprehensive.py`):
+   - Implemented tests for query enhancement in isolation
+   - Implemented tests for query classification in isolation
+   - Implemented tests for the full processing pipeline
+   - Implemented tests for search query generation
+   - Added support for saving test results to a JSON file
+
+2. Tested a variety of query types:
+   - Factual queries (e.g., "What is quantum computing?")
+   - Comparative queries (e.g., "Compare blockchain and traditional databases")
+   - Domain-specific queries (e.g., "Explain the implications of blockchain in finance")
+   - Complex queries with multiple aspects
+
+3. Documented the testing approach:
+   - Updated the decision log with the testing strategy
+   - Added test script descriptions to the code structure document
+   - Added a section about query processor testing to the interfaces document
+   - Updated the project overview to reflect the current status
+
+### Insights
+- The query processor successfully handles a wide range of query types
+- The Groq model provides consistent and high-quality results for all tested functions
+- The monkey patching approach allows for effective testing without modifying core code
+- Saving test results to a JSON file provides a valuable reference for future development
+
+### Next Steps
+1. Implement the search execution module to integrate with search APIs
+2. Create the Gradio UI for user interaction
+3. Test the full system with end-to-end workflows
+
+## Session: 2025-02-27 - Search Execution Module Implementation
+
+### Objectives
+- Implement the search execution module to execute queries across multiple search engines
+- Create handlers for different search APIs
+- Develop a result collector for processing and organizing search results
+- Create a test script to verify functionality
+
+### Accomplishments
+1. Created a modular search execution framework:
+   - Implemented a base handler interface (`BaseSearchHandler`) for all search API handlers
+   - Created handlers for Google Search, Serper, Google Scholar, and arXiv
+   - Developed a `SearchExecutor` class for managing search execution across multiple engines
+   - Implemented parallel search execution using thread pools for efficiency
+
+2. Implemented a comprehensive result processing system:
+   - Created a `ResultCollector` class for processing and organizing search results
+   - Added functionality for deduplication, scoring, and sorting of results
+   - Implemented filtering capabilities based on various criteria
+   - Added support for saving and loading results to/from files
+
+3. Created a test script for the search execution module:
+   - Integrated with the query processor to test the full pipeline
+   - Added support for testing with multiple query types
+   - Implemented result saving for analysis
+
+### Insights
+- The modular design allows for easy addition of new search engines
+- Parallel execution significantly improves search performance
+- Standardized result format simplifies downstream processing
+- The search execution module integrates seamlessly with the query processor
+
+### Next Steps
+1. Test the search execution module with real API keys and live search engines
+2. Develop the Gradio UI for user interaction
+3. Implement the report generation module
+
+## Session: 2025-02-27 - Serper API Integration Fixes
+
+### Overview
+Fixed Serper API integration in the search execution module, ensuring proper functionality for both regular search and Scholar search.
+
+### Key Activities
+1. Fixed the Serper API integration:
+   - Modified the LLM interface to return only the enhanced query text without explanations
+   - Updated the query enhancement prompt to be more specific about the desired output format
+   - Added query truncation to handle long queries (Serper API has a 2048 character limit)
+
+2. Streamlined the search execution process:
+   - Removed the redundant Google search handler (as Serper serves as a front-end for Google search)
+   - Fixed the Serper API endpoint URL and request parameters
+   - Improved error handling for API requests
+
+3. Enhanced result processing:
+   - Improved the result collector to properly process and deduplicate results from multiple sources
+   - Added better debug output to help diagnose issues with search results
+
+4. Improved testing:
+   - Created a dedicated test script for all search handlers
+   - Added detailed output of search results for better debugging
+   - Implemented comprehensive testing across multiple queries
+
+### Insights
+- The Serper API has a 2048 character limit for queries, requiring truncation for long enhanced queries
+- The LLM's tendency to add explanations to enhanced queries can cause issues with search APIs
+- Proper error handling is crucial for API integrations, especially when dealing with multiple search engines
+- The Scholar handler uses the same Serper API but with a different endpoint (/scholar)
+
+### Challenges
+- Managing the length of enhanced queries to stay within API limits
+- Ensuring consistent result format across different search engines
+- Handling API-specific requirements and limitations
+
+### Next Steps
+1. Integrate the search execution module with the query processor
+2. Implement the report generation module
+3. Develop the Gradio UI for user interaction
+4. Test the complete pipeline from query to report
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..13033df
--- /dev/null
+++ b/README.md
@@ -0,0 +1,138 @@
+# Intelligent Research System
+
+An end-to-end research automation system that handles the entire process from query to final report, leveraging multiple search sources and semantic similarity to produce comprehensive research results.
+
+## Overview
+
+This system automates the research process by:
+1. Processing and enhancing user queries
+2.
Executing searches across multiple engines (Serper, Google Scholar, arXiv) +3. Ranking and filtering results based on relevance +4. Generating comprehensive research reports + +## Features + +- **Query Processing**: Enhances user queries with additional context and classifies them by type and intent +- **Multi-Source Search**: Executes searches across Serper (Google), Google Scholar, and arXiv +- **Intelligent Ranking**: Uses Jina AI's Re-Ranker to prioritize the most relevant results +- **Result Deduplication**: Removes duplicate results across different search engines +- **Modular Architecture**: Easily extensible with new search engines and LLM providers + +## Components + +- **Query Processor**: Enhances and classifies user queries +- **Search Executor**: Executes searches across multiple engines +- **Result Collector**: Processes and organizes search results +- **Document Ranker**: Ranks documents by relevance +- **Report Generator**: Synthesizes information into a coherent report (coming soon) + +## Getting Started + +### Prerequisites + +- Python 3.8+ +- API keys for: + - Serper API (for Google and Scholar search) + - Groq (or other LLM provider) + - Jina AI (for reranking) + +### Installation + +1. Clone the repository: +```bash +git clone https://github.com/yourusername/sim-search.git +cd sim-search +``` + +2. Install dependencies: +```bash +pip install -r requirements.txt +``` + +3. Create a configuration file: +```bash +cp config/config.yaml.example config/config.yaml +``` + +4. 
Edit the configuration file to add your API keys: +```yaml +api_keys: + serper: "your-serper-api-key" + groq: "your-groq-api-key" + jina: "your-jina-api-key" +``` + +### Usage + +#### Basic Usage + +```python +from query.query_processor import QueryProcessor +from execution.search_executor import SearchExecutor +from execution.result_collector import ResultCollector + +# Initialize components +query_processor = QueryProcessor() +search_executor = SearchExecutor() +result_collector = ResultCollector() + +# Process a query +processed_query = query_processor.process_query("What are the latest advancements in quantum computing?") + +# Execute search +search_results = search_executor.execute_search(processed_query) + +# Process results +processed_results = result_collector.process_results(search_results) + +# Print top results +for i, result in enumerate(processed_results[:5]): + print(f"{i+1}. {result['title']}") + print(f" URL: {result['url']}") + print(f" Snippet: {result['snippet'][:100]}...") + print() +``` + +#### Testing + +Run the test scripts to verify functionality: + +```bash +# Test search execution +python test_search_execution.py + +# Test all search handlers +python test_all_handlers.py +``` + +## Project Structure + +``` +sim-search/ +├── config/ # Configuration management +├── query/ # Query processing +├── execution/ # Search execution +│ └── api_handlers/ # Search API handlers +├── ranking/ # Document ranking +├── test_*.py # Test scripts +└── requirements.txt # Dependencies +``` + +## LLM Providers + +The system supports multiple LLM providers through the LiteLLM interface: +- Groq (currently using Llama 3.1-8b-instant) +- OpenAI +- Anthropic +- OpenRouter +- Azure OpenAI + +## License + +This project is licensed under the MIT License - see the LICENSE file for details. 
+ +## Acknowledgments + +- [Jina AI](https://jina.ai/) for their embedding and reranking APIs +- [Serper](https://serper.dev/) for their Google search API +- [Groq](https://groq.com/) for their fast LLM inference diff --git a/config/__init__.py b/config/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/config/config.py b/config/config.py new file mode 100644 index 0000000..5f7f3b5 --- /dev/null +++ b/config/config.py @@ -0,0 +1,181 @@ +""" +Configuration management for the intelligent research system. + +This module handles loading configuration from files and environment variables, +providing secure access to API keys and model settings. +""" + +import os +import yaml +from pathlib import Path +from typing import Dict, Any, Optional +from dotenv import load_dotenv + +# Load environment variables from .env file if it exists +load_dotenv() + +class Config: + """Configuration manager for the intelligent research system.""" + + def __init__(self, config_path: Optional[str] = None): + """ + Initialize the configuration manager. + + Args: + config_path: Path to the configuration file. If None, will look for + config.yaml in the same directory as this file. + """ + self.config_data = {} + self.config_path = config_path + + if not config_path: + # Default to config.yaml in the same directory as this file + self.config_path = Path(__file__).parent / "config.yaml" + + self.load_config() + + def load_config(self) -> None: + """Load configuration from file if it exists.""" + try: + if Path(self.config_path).exists(): + with open(self.config_path, 'r') as f: + self.config_data = yaml.safe_load(f) + print(f"Configuration loaded from {self.config_path}") + else: + print(f"Configuration file {self.config_path} not found. Using environment variables only.") + except Exception as e: + print(f"Error loading configuration: {e}") + + def get_api_key(self, provider: str) -> str: + """ + Get API key for the specified provider. 
+ + Args: + provider: The name of the API provider (e.g., 'openai', 'jina', 'serper') + + Returns: + The API key as a string + + Raises: + ValueError: If the API key is not found + """ + # First check environment variables (higher priority) + env_var_name = f"{provider.upper()}_API_KEY" + + # Special case for Jina AI which uses JINA_API_KEY + if provider.lower() == 'jina': + env_var_name = "JINA_API_KEY" + + # Special case for Groq which might use GROQ_API_KEY + if provider.lower() == 'groq': + env_var_name = "GROQ_API_KEY" + + # Special case for OpenRouter which might use OPENROUTER_API_KEY + if provider.lower() == 'openrouter': + env_var_name = "OPENROUTER_API_KEY" + + api_key = os.environ.get(env_var_name) + + # If not in environment, check config file + if not api_key and self.config_data and 'api_keys' in self.config_data: + api_key = self.config_data['api_keys'].get(provider) + + if not api_key: + raise ValueError(f"API key for {provider} not found. Set {env_var_name} environment variable or add to config file.") + + return api_key + + def get_model_config(self, model_name: str) -> Dict[str, Any]: + """ + Get configuration for a specific model. + + Args: + model_name: The name of the model + + Returns: + Dictionary containing model configuration + """ + if self.config_data and 'models' in self.config_data: + return self.config_data['models'].get(model_name, {}) + return {} + + def get_module_model(self, module_name: str, function_name: str) -> str: + """ + Get the model assigned to a specific module function. 
+ + Args: + module_name: The name of the module (e.g., 'query_processing') + function_name: The name of the function (e.g., 'enhance_query') + + Returns: + The name of the model to use, or the default model if not specified + """ + default = self.config_data.get('default_model', 'gpt-3.5-turbo') + + if (self.config_data and 'module_models' in self.config_data and + module_name in self.config_data['module_models'] and + function_name in self.config_data['module_models'][module_name]): + return self.config_data['module_models'][module_name][function_name] + + return default + + def get_search_config(self, search_engine: str) -> Dict[str, Any]: + """ + Get configuration for a specific search engine. + + Args: + search_engine: The name of the search engine + + Returns: + Dictionary containing search engine configuration + """ + if self.config_data and 'search_engines' in self.config_data: + return self.config_data['search_engines'].get(search_engine, {}) + return {} + + def get_ui_config(self) -> Dict[str, Any]: + """ + Get UI configuration. + + Returns: + Dictionary containing UI configuration + """ + if self.config_data and 'ui' in self.config_data: + return self.config_data['ui'] + return {} + + +# Create a singleton instance for global use +config = Config() + + +def get_config() -> Config: + """ + Get the global configuration instance. + + Returns: + The global Config instance + """ + return config + + +def get_api_key(service_name: str) -> Optional[str]: + """ + Get an API key for a specific service. 
+ + Args: + service_name: Name of the service to get the API key for + + Returns: + API key as a string, or None if not found + """ + # First check environment variables + env_var_name = f"{service_name.upper()}_API_KEY" + api_key = os.environ.get(env_var_name) + + # If not found in environment, check config file + if not api_key: + cfg = get_config() + api_key = cfg.config_data.get('api_keys', {}).get(service_name) + + return api_key diff --git a/config/config.yaml.example b/config/config.yaml.example new file mode 100644 index 0000000..8ea4d47 --- /dev/null +++ b/config/config.yaml.example @@ -0,0 +1,150 @@ +# Example configuration file for the intelligent research system +# Rename this file to config.yaml and fill in your API keys and settings + +# API keys (alternatively, set environment variables) +api_keys: + openai: "your-openai-api-key" # Or set OPENAI_API_KEY environment variable + jina: "your-jina-api-key" # Or set JINA_API_KEY environment variable + serper: "your-serper-api-key" # Or set SERPER_API_KEY environment variable + google: "your-google-api-key" # Or set GOOGLE_API_KEY environment variable + anthropic: "your-anthropic-api-key" # Or set ANTHROPIC_API_KEY environment variable + openrouter: "your-openrouter-api-key" # Or set OPENROUTER_API_KEY environment variable + groq: "your-groq-api-key" # Or set GROQ_API_KEY environment variable + +# LLM model configurations +models: + gpt-3.5-turbo: + provider: "openai" + temperature: 0.7 + max_tokens: 1000 + top_p: 1.0 + endpoint: null # Use default OpenAI endpoint + + gpt-4: + provider: "openai" + temperature: 0.5 + max_tokens: 2000 + top_p: 1.0 + endpoint: null # Use default OpenAI endpoint + + claude-2: + provider: "anthropic" + temperature: 0.7 + max_tokens: 1500 + top_p: 1.0 + endpoint: null # Use default Anthropic endpoint + + azure-gpt-4: + provider: "azure" + temperature: 0.5 + max_tokens: 2000 + top_p: 1.0 + endpoint: "https://your-azure-endpoint.openai.azure.com" + deployment_name: 
"your-deployment-name"
+    api_version: "2023-05-15"
+
+  local-llama:
+    provider: "ollama"
+    temperature: 0.8
+    max_tokens: 1000
+    endpoint: "http://localhost:11434/api/generate"
+    model_name: "llama2"
+
+  llama-3.1-8b-instant:
+    provider: "groq"
+    model_name: "llama-3.1-8b-instant"
+    temperature: 0.7
+    max_tokens: 1024
+    top_p: 1.0
+    endpoint: "https://api.groq.com/openai/v1"
+
+  llama-3.3-70b-versatile:
+    provider: "groq"
+    model_name: "llama-3.3-70b-versatile"
+    temperature: 0.5
+    max_tokens: 2048
+    top_p: 1.0
+    endpoint: "https://api.groq.com/openai/v1"
+
+  openrouter-mixtral:
+    provider: "openrouter"
+    model_name: "mistralai/mixtral-8x7b-instruct"
+    temperature: 0.7
+    max_tokens: 1024
+    top_p: 1.0
+    endpoint: "https://openrouter.ai/api/v1"
+
+  openrouter-claude:
+    provider: "openrouter"
+    model_name: "anthropic/claude-3-opus"
+    temperature: 0.5
+    max_tokens: 2048
+    top_p: 1.0
+    endpoint: "https://openrouter.ai/api/v1"
+
+# Default model to use if not specified for a module
+default_model: "llama-3.1-8b-instant"  # Using Groq's Llama 3.1 8B model for testing
+
+# Module-specific model assignments
+module_models:
+  # Query processing module
+  query_processing:
+    enhance_query: "llama-3.1-8b-instant"  # Use Groq's Llama 3.1 8B for query enhancement
+    classify_query: "llama-3.1-8b-instant"  # Use Groq's Llama 3.1 8B for classification
+    generate_search_queries: "llama-3.1-8b-instant"  # Use Groq's Llama 3.1 8B for generating search queries
+
+  # Search strategy module
+  search_strategy:
+    develop_strategy: "llama-3.1-8b-instant"  # Use Groq's Llama 3.1 8B for developing search strategies
+    target_selection: "llama-3.1-8b-instant"  # Use Groq's Llama 3.1 8B for target selection
+
+  # Document ranking module
+  document_ranking:
+    rerank_documents: "jina-reranker"  # Use Jina's reranker for document reranking
+
+  # Report generation module
+  report_generation:
+    synthesize_report: "llama-3.3-70b-versatile"  # Use Groq's Llama 3.3 70B for report synthesis
+    format_report: "llama-3.1-8b-instant"  # Use Groq's Llama 3.1 8B for formatting
+
+# Search engine configurations
+search_engines:
+  google:
+    enabled: true
+    max_results: 10
+
+  serper:
+    enabled: true
+    max_results: 10
+
+  jina:
+    enabled: true
+    max_results: 10
+
+  scholar:
+    enabled: false
+    max_results: 5
+
+  arxiv:
+    enabled: false
+    max_results: 5
+
+# Jina AI specific configurations
+jina:
+  reranker:
+    model: "jina-reranker-v2-base-multilingual"  # Default reranker model
+    top_n: 10  # Default number of top results to return
+
+# UI configuration
+ui:
+  theme: "light"  # light or dark
+  port: 7860
+  share: false
+  title: "Intelligent Research System"
+  description: "An automated system for finding, filtering, and synthesizing information"
+
+# System settings
+system:
+  cache_dir: "data/cache"
+  results_dir: "data/results"
+  log_level: "INFO"  # DEBUG, INFO, WARNING, ERROR, CRITICAL
diff --git a/execution/__init__.py b/execution/__init__.py
new file mode 100644
index 0000000..55bc0c8
--- /dev/null
+++ b/execution/__init__.py
@@ -0,0 +1,4 @@
+"""
+Search execution module for the intelligent research system.
+This module handles the execution of search queries across various search engines.
+"""
diff --git a/execution/api_handlers/__init__.py b/execution/api_handlers/__init__.py
new file mode 100644
index 0000000..54484f2
--- /dev/null
+++ b/execution/api_handlers/__init__.py
@@ -0,0 +1,4 @@
+"""
+API handlers for different search engines.
+Each handler implements a common interface for executing searches and processing results.
+"""
diff --git a/execution/api_handlers/arxiv_handler.py b/execution/api_handlers/arxiv_handler.py
new file mode 100644
index 0000000..5f35376
--- /dev/null
+++ b/execution/api_handlers/arxiv_handler.py
@@ -0,0 +1,162 @@
+"""
+arXiv API handler.
+Uses the official arXiv API to search for academic papers.
+""" + +import os +import json +import requests +import urllib.parse +import xml.etree.ElementTree as ET +from datetime import datetime +from typing import Dict, List, Any, Optional + +from .base_handler import BaseSearchHandler +from config.config import get_config + + +class ArxivSearchHandler(BaseSearchHandler): + """Handler for arXiv Search using the official API.""" + + def __init__(self): + """Initialize the arXiv search handler.""" + self.config = get_config() + self.base_url = "http://export.arxiv.org/api/query" + self.available = True # arXiv API is freely available without an API key + + def search(self, query: str, num_results: int = 10, **kwargs) -> List[Dict[str, Any]]: + """ + Execute an arXiv search query. + + Args: + query: The search query to execute + num_results: Number of results to return + **kwargs: Additional search parameters: + - sort_by: Sort order ("relevance", "lastUpdatedDate", "submittedDate") + - sort_order: Sort direction ("ascending", "descending") + - categories: List of arXiv categories to search within + - date_range: Date range for filtering (e.g., "all", "last_week", "last_month") + + Returns: + List of search results with standardized format + """ + # Set up the request parameters + params = { + "search_query": query, + "max_results": num_results, + "start": kwargs.get("start", 0) + } + + # Add sorting parameters + sort_by = kwargs.get("sort_by", "relevance") + if sort_by == "relevance": + params["sortBy"] = "relevance" + elif sort_by == "lastUpdatedDate": + params["sortBy"] = "lastUpdatedDate" + elif sort_by == "submittedDate": + params["sortBy"] = "submittedDate" + + sort_order = kwargs.get("sort_order", "descending") + if sort_order == "descending": + params["sortOrder"] = "descending" + elif sort_order == "ascending": + params["sortOrder"] = "ascending" + + # Add category filtering + if "categories" in kwargs and kwargs["categories"]: + categories = "+OR+".join([f"cat:{cat}" for cat in kwargs["categories"]]) + 
params["search_query"] = f"{params['search_query']}+AND+({categories})" + + try: + # Make the request + response = requests.get( + self.base_url, + params=params + ) + response.raise_for_status() + + # Parse the XML response + root = ET.fromstring(response.content) + + # Define namespaces + ns = { + 'atom': 'http://www.w3.org/2005/Atom', + 'arxiv': 'http://arxiv.org/schemas/atom' + } + + # Extract and standardize the results + results = [] + + for entry in root.findall('.//atom:entry', ns): + # Extract basic information + title = entry.find('./atom:title', ns).text.strip() + summary = entry.find('./atom:summary', ns).text.strip() + published = entry.find('./atom:published', ns).text + updated = entry.find('./atom:updated', ns).text + + # Extract authors + authors = [] + for author in entry.findall('./atom:author/atom:name', ns): + authors.append(author.text.strip()) + + # Extract links + links = {} + for link in entry.findall('./atom:link', ns): + link_rel = link.get('rel', '') + link_href = link.get('href', '') + links[link_rel] = link_href + + # Extract arXiv-specific information + arxiv_id = entry.find('./atom:id', ns).text.split('/')[-1] + + # Get categories + categories = [] + for category in entry.findall('./arxiv:primary_category', ns): + categories.append(category.get('term', '')) + for category in entry.findall('./atom:category', ns): + cat_term = category.get('term', '') + if cat_term and cat_term not in categories: + categories.append(cat_term) + + # Format the result + result = { + "title": title, + "url": links.get('alternate', ''), + "pdf_url": links.get('related', ''), + "snippet": summary[:200] + "..." 
if len(summary) > 200 else summary, + "source": "arxiv", + "arxiv_id": arxiv_id, + "authors": authors, + "categories": categories, + "published_date": published, + "updated_date": updated, + "full_text": summary + } + + results.append(result) + + return results + + except requests.exceptions.RequestException as e: + print(f"Error executing arXiv search: {e}") + return [] + except ET.ParseError as e: + print(f"Error parsing arXiv response: {e}") + return [] + + def get_name(self) -> str: + """Get the name of the search handler.""" + return "arxiv" + + def is_available(self) -> bool: + """Check if the arXiv API is available.""" + return self.available + + def get_rate_limit_info(self) -> Dict[str, Any]: + """Get information about the API's rate limits.""" + # arXiv API rate limits + return { + "requests_per_minute": 30, # arXiv recommends no more than 1 request per 3 seconds + "requests_per_day": 2000, # This is an estimate + "current_usage": None # arXiv doesn't provide usage info in responses + } diff --git a/execution/api_handlers/base_handler.py b/execution/api_handlers/base_handler.py new file mode 100644 index 0000000..73370bf --- /dev/null +++ b/execution/api_handlers/base_handler.py @@ -0,0 +1,63 @@ +""" +Base handler interface for search APIs. +All specific API handlers should inherit from this base class. +""" + +from abc import ABC, abstractmethod +from typing import Dict, List, Any, Optional + + +class BaseSearchHandler(ABC): + """Base class for all search API handlers.""" + + @abstractmethod + def search(self, query: str, num_results: int = 10, **kwargs) -> List[Dict[str, Any]]: + """ + Execute a search query and return results. 
+ + Args: + query: The search query to execute + num_results: Number of results to return + **kwargs: Additional search parameters specific to the API + + Returns: + List of search results, each as a dictionary with at least: + - title: Title of the result + - url: URL of the result + - snippet: Text snippet or description + - source: Source of the result (e.g., "google", "scholar") + """ + pass + + @abstractmethod + def get_name(self) -> str: + """ + Get the name of the search handler. + + Returns: + Name of the search handler (e.g., "google", "scholar") + """ + pass + + @abstractmethod + def is_available(self) -> bool: + """ + Check if the search API is available and properly configured. + + Returns: + True if the API is available, False otherwise + """ + pass + + @abstractmethod + def get_rate_limit_info(self) -> Dict[str, Any]: + """ + Get information about the API's rate limits. + + Returns: + Dictionary with rate limit information: + - requests_per_minute: Maximum requests per minute + - requests_per_day: Maximum requests per day + - current_usage: Current usage statistics if available + """ + pass diff --git a/execution/api_handlers/google_handler.py b/execution/api_handlers/google_handler.py new file mode 100644 index 0000000..90a2184 --- /dev/null +++ b/execution/api_handlers/google_handler.py @@ -0,0 +1,113 @@ +""" +Google Search API handler. +Uses the Serper API to access Google search results. 
+""" + +import os +import json +import requests +from typing import Dict, List, Any, Optional + +from .base_handler import BaseSearchHandler +from config.config import get_config, get_api_key + + +class GoogleSearchHandler(BaseSearchHandler): + """Handler for Google Search using the Serper API.""" + + def __init__(self): + """Initialize the Google search handler.""" + self.config = get_config() + self.api_key = get_api_key("serper") + self.base_url = "https://google.serper.dev/search" + self.available = self.api_key is not None + + def search(self, query: str, num_results: int = 10, **kwargs) -> List[Dict[str, Any]]: + """ + Execute a Google search query using Serper API. + + Args: + query: The search query to execute + num_results: Number of results to return + **kwargs: Additional search parameters: + - country: Country code (default: "us") + - language: Language code (default: "en") + - page: Page number (default: 1) + + Returns: + List of search results with standardized format + """ + if not self.available: + raise ValueError("Google Search API is not available. 
API key is missing.") + + # Set up the request parameters + params = { + "q": query, + "num": num_results, + "type": "search" # Specify search type + } + + # Add optional parameters + if "country" in kwargs: + params["gl"] = kwargs["country"] + if "language" in kwargs: + params["hl"] = kwargs["language"] + if "page" in kwargs: + params["page"] = kwargs["page"] + + # Set up the headers + headers = { + "X-API-KEY": self.api_key, + "Content-Type": "application/json" + } + + try: + # Make the request + response = requests.post( + self.base_url, + headers=headers, + json=params + ) + response.raise_for_status() + + # Parse the response + data = response.json() + + # Extract and standardize the results + results = [] + + # Process organic results + if "organic" in data: + for item in data["organic"][:num_results]: + result = { + "title": item.get("title", ""), + "url": item.get("link", ""), + "snippet": item.get("snippet", ""), + "source": "google", + "position": item.get("position", 0), + "raw_data": item + } + results.append(result) + + return results + + except requests.exceptions.RequestException as e: + print(f"Error executing Google search: {e}") + return [] + + def get_name(self) -> str: + """Get the name of the search handler.""" + return "google" + + def is_available(self) -> bool: + """Check if the Google Search API is available.""" + return self.available + + def get_rate_limit_info(self) -> Dict[str, Any]: + """Get information about the API's rate limits.""" + # These are example values - adjust based on your Serper plan + return { + "requests_per_minute": 60, + "requests_per_day": 2500, + "current_usage": None # Serper doesn't provide usage info in responses + } diff --git a/execution/api_handlers/scholar_handler.py b/execution/api_handlers/scholar_handler.py new file mode 100644 index 0000000..1f78b20 --- /dev/null +++ b/execution/api_handlers/scholar_handler.py @@ -0,0 +1,125 @@ +""" +Google Scholar API handler. 
+Uses the Serper API to access Google Scholar search results. +""" + +import os +import json +import requests +from typing import Dict, List, Any, Optional + +from .base_handler import BaseSearchHandler +from config.config import get_config, get_api_key + + +class ScholarSearchHandler(BaseSearchHandler): + """Handler for Google Scholar Search using the Serper API.""" + + def __init__(self): + """Initialize the Google Scholar search handler.""" + self.config = get_config() + self.api_key = get_api_key("serper") + self.base_url = "https://google.serper.dev/scholar" + self.available = self.api_key is not None + + def search(self, query: str, num_results: int = 10, **kwargs) -> List[Dict[str, Any]]: + """ + Execute a Google Scholar search query using Serper API. + + Args: + query: The search query to execute + num_results: Number of results to return + **kwargs: Additional search parameters: + - country: Country code (default: "us") + - language: Language code (default: "en") + - year_start: Start year for publication date filter + - year_end: End year for publication date filter + + Returns: + List of search results with standardized format + """ + if not self.available: + raise ValueError("Google Scholar API is not available. 
API key is missing.") + + # Set up the request parameters + params = { + "q": query, + "num": num_results, + "type": "scholar" # Specify search type as scholar + } + + # Add optional parameters + if "country" in kwargs: + params["gl"] = kwargs["country"] + if "language" in kwargs: + params["hl"] = kwargs["language"] + + # Add date range if specified + date_range = "" + if "year_start" in kwargs and "year_end" in kwargs: + date_range = f"as_ylo={kwargs['year_start']}&as_yhi={kwargs['year_end']}" + elif "year_start" in kwargs: + date_range = f"as_ylo={kwargs['year_start']}" + elif "year_end" in kwargs: + date_range = f"as_yhi={kwargs['year_end']}" + + if date_range: + params["tbs"] = date_range + + # Set up the headers + headers = { + "X-API-KEY": self.api_key, + "Content-Type": "application/json" + } + + try: + # Make the request + response = requests.post( + self.base_url, + headers=headers, + json=params + ) + response.raise_for_status() + + # Parse the response + data = response.json() + + # Process the results + results = [] + + # Process organic results + if "organic" in data: + for item in data["organic"]: + result = { + "title": item.get("title", ""), + "url": item.get("link", ""), + "snippet": item.get("snippet", ""), + "source": "scholar", + "authors": item.get("authors", ""), + "publication": item.get("publication", ""), + "year": item.get("year", "") + } + results.append(result) + + return results + + except requests.exceptions.RequestException as e: + print(f"Error executing Google Scholar search: {e}") + return [] + + def get_name(self) -> str: + """Get the name of the search handler.""" + return "scholar" + + def is_available(self) -> bool: + """Check if the Google Scholar API is available.""" + return self.available + + def get_rate_limit_info(self) -> Dict[str, Any]: + """Get information about the API's rate limits.""" + # These are example values - adjust based on your Serper plan + return { + "requests_per_minute": 30, # Lower for Scholar due to 
class SerperSearchHandler(BaseSearchHandler):
    """Handler for Serper's enhanced search API.

    Wraps Serper's ``/search`` endpoint and normalizes organic results
    and knowledge-graph entries into the common result format.
    """

    # Seconds to wait for the Serper API before giving up; without an
    # explicit timeout, requests.post() can block indefinitely.
    REQUEST_TIMEOUT = 30

    def __init__(self):
        """Initialize the Serper search handler."""
        self.config = get_config()
        self.api_key = get_api_key("serper")
        self.base_url = "https://google.serper.dev/search"
        # The handler is usable only when a Serper API key was configured.
        self.available = self.api_key is not None

    def search(self, query: str, num_results: int = 10, **kwargs) -> List[Dict[str, Any]]:
        """
        Execute a search query using Serper's enhanced API.

        Args:
            query: The search query to execute
            num_results: Number of results to return
            **kwargs: Additional search parameters:
                - search_type: Type of search ("web", "news", "images", "places")
                - country: Country code (default: "us")
                - language: Language code (default: "en")
                - page: Page number (default: 1)

        Returns:
            List of search results with standardized format

        Raises:
            ValueError: If no API key is configured for this handler.
        """
        if not self.available:
            raise ValueError("Serper API is not available. API key is missing.")

        # Set up the request parameters
        params = {
            "q": query,
            "num": num_results
        }

        # Add optional parameters
        search_type = kwargs.get("search_type", "search")
        params["type"] = search_type

        if "country" in kwargs:
            params["gl"] = kwargs["country"]
        if "language" in kwargs:
            params["hl"] = kwargs["language"]
        if "page" in kwargs:
            params["page"] = kwargs["page"]

        # Set up the headers. SECURITY: never log these headers -- they
        # carry the API key. (Earlier debug prints that echoed the key and
        # headers to stdout have been removed.)
        headers = {
            "X-API-KEY": self.api_key,
            "Content-Type": "application/json"
        }

        try:
            # Bounded timeout so a stalled API call cannot hang the caller.
            response = requests.post(
                self.base_url,
                headers=headers,
                json=params,
                timeout=self.REQUEST_TIMEOUT
            )
            response.raise_for_status()

            # Parse the response
            data = response.json()

            # Process the results
            results = []

            # Normalize organic results into the standard result schema.
            if "organic" in data:
                for item in data["organic"]:
                    result = {
                        "title": item.get("title", ""),
                        "url": item.get("link", ""),
                        "snippet": item.get("snippet", ""),
                        "source": "serper"
                    }
                    results.append(result)

            # Append the knowledge-graph entry (if any) as an extra result.
            if "knowledgeGraph" in data:
                kg = data["knowledgeGraph"]
                if "title" in kg and "description" in kg:
                    result = {
                        "title": kg.get("title", ""),
                        "url": kg.get("website", ""),
                        "snippet": kg.get("description", ""),
                        "source": "serper_kg"
                    }
                    results.append(result)

            return results

        except requests.exceptions.RequestException as e:
            # Best-effort: report the failure and return no results rather
            # than propagating a network error to the caller.
            print(f"Error executing Serper search: {e}")
            return []

    def get_name(self) -> str:
        """Get the name of the search handler."""
        return "serper"

    def is_available(self) -> bool:
        """Check if the Serper API is available."""
        return self.available

    def get_rate_limit_info(self) -> Dict[str, Any]:
        """Get information about the API's rate limits."""
        # These are example values - adjust based on your Serper plan
        return {
            "requests_per_minute": 60,
            "requests_per_day": 2500,
            "current_usage": None  # Serper doesn't provide usage info in responses
        }
class ResultCollector:
    """
    Collects and processes search results from multiple search engines.
    Handles deduplication, merging, and filtering of results.
    """

    def __init__(self):
        """Initialize the result collector."""
        pass

    def process_results(self,
                       search_results: Dict[str, List[Dict[str, Any]]],
                       dedup: bool = True,
                       max_results: Optional[int] = None) -> List[Dict[str, Any]]:
        """
        Process search results from multiple search engines.

        Args:
            search_results: Dictionary mapping search engine names to lists of search results
            dedup: Whether to deduplicate results based on URL
            max_results: Maximum number of results to return (after processing)

        Returns:
            List of processed search results
        """
        # Flatten and normalize results
        all_results = self._flatten_results(search_results)

        # Deduplicate results if requested
        if dedup:
            all_results = self._deduplicate_results(all_results)

        # Sort results by relevance (using a simple scoring algorithm)
        all_results = self._score_and_sort_results(all_results)

        # Limit results if requested
        if max_results is not None:
            all_results = all_results[:max_results]

        return all_results

    def _flatten_results(self, search_results: Dict[str, List[Dict[str, Any]]]) -> List[Dict[str, Any]]:
        """
        Flatten results from multiple search engines into a single list.

        Args:
            search_results: Dictionary mapping search engine names to lists of search results

        Returns:
            Flattened list of search results
        """
        all_results = []

        for engine, results in search_results.items():
            for result in results:
                # Ensure all results have the same basic structure; keep the
                # original payload under "raw_data" for downstream consumers.
                normalized_result = {
                    "title": result.get("title", ""),
                    "url": result.get("url", ""),
                    "snippet": result.get("snippet", ""),
                    "source": result.get("source", engine),
                    "domain": self._extract_domain(result.get("url", "")),
                    "timestamp": datetime.now().isoformat(),
                    "raw_data": result
                }
                all_results.append(normalized_result)

        return all_results

    def _deduplicate_results(self, results: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """
        Deduplicate results based on URL.

        Args:
            results: List of search results

        Returns:
            Deduplicated list of search results (first occurrence wins)
        """
        seen_urls = set()
        deduplicated_results = []

        for result in results:
            url = result.get("url", "")

            # Normalize URL for comparison
            normalized_url = self._normalize_url(url)

            if normalized_url and normalized_url not in seen_urls:
                seen_urls.add(normalized_url)
                deduplicated_results.append(result)

        return deduplicated_results

    def _score_and_sort_results(self, results: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """
        Score and sort results by relevance.

        Args:
            results: List of search results

        Returns:
            Sorted list of search results (highest score first); each
            result gains a "relevance_score" key as a side effect.
        """
        # Add a score to each result
        for result in results:
            score = 0

            # Boost score based on source (e.g., scholarly sources get higher scores)
            source = result.get("source", "")
            if source == "scholar":
                score += 10
            elif source == "arxiv":
                score += 8
            elif source == "google":
                score += 5
            elif source == "serper":
                score += 5

            # Boost score based on position in original results
            position = result.get("raw_data", {}).get("position", 0)
            if position > 0:
                score += max(0, 10 - position)

            # Boost score for results with more content
            snippet_length = len(result.get("snippet", ""))
            if snippet_length > 200:
                score += 3
            elif snippet_length > 100:
                score += 2
            elif snippet_length > 50:
                score += 1

            # Store the score
            result["relevance_score"] = score

        # Sort by score (descending)
        sorted_results = sorted(results, key=lambda x: x.get("relevance_score", 0), reverse=True)

        return sorted_results

    def _extract_domain(self, url: str) -> str:
        """
        Extract the domain from a URL.

        Args:
            url: URL to extract domain from

        Returns:
            Domain name, or "" if the URL cannot be parsed
        """
        try:
            parsed_url = urlparse(url)
            domain = parsed_url.netloc

            # Remove 'www.' prefix if present
            if domain.startswith('www.'):
                domain = domain[4:]

            return domain
        except Exception:
            # Narrowed from a bare except so KeyboardInterrupt/SystemExit
            # are not swallowed.
            return ""

    def _normalize_url(self, url: str) -> str:
        """
        Normalize a URL for comparison.

        Args:
            url: URL to normalize

        Returns:
            Normalized URL (scheme + host + path, lowercased, no trailing slash)
        """
        try:
            # Parse the URL
            parsed_url = urlparse(url)

            # Reconstruct with just the scheme, netloc, and path
            normalized = f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path}"

            # Remove trailing slash if present
            if normalized.endswith('/'):
                normalized = normalized[:-1]

            return normalized.lower()
        except Exception:
            # Narrowed from a bare except; fall back to a case-insensitive
            # comparison on the raw URL.
            return url.lower()

    def filter_results(self,
                      results: List[Dict[str, Any]],
                      filters: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
        Filter results based on specified criteria.

        Args:
            results: List of search results
            filters: Dictionary of filter criteria:
                - domains: List of domains to include or exclude
                - exclude_domains: Whether to exclude (True) or include (False) the specified domains
                - min_score: Minimum relevance score
                - sources: List of sources to include
                - date_range: Dictionary with 'start' and 'end' dates (ISO format)

        Returns:
            Filtered list of search results
        """
        filtered_results = results.copy()

        # Filter by domains
        if "domains" in filters and filters["domains"]:
            domains = set(filters["domains"])
            exclude_domains = filters.get("exclude_domains", False)

            if exclude_domains:
                filtered_results = [r for r in filtered_results if r.get("domain", "") not in domains]
            else:
                filtered_results = [r for r in filtered_results if r.get("domain", "") in domains]

        # Filter by minimum score
        if "min_score" in filters:
            min_score = filters["min_score"]
            filtered_results = [r for r in filtered_results if r.get("relevance_score", 0) >= min_score]

        # Filter by sources
        if "sources" in filters and filters["sources"]:
            sources = set(filters["sources"])
            filtered_results = [r for r in filtered_results if r.get("source", "") in sources]

        # Filter by date range. Results without a "date" key pass through.
        # NOTE(review): datetime.fromisoformat raises ValueError on
        # malformed dates -- confirm upstream guarantees ISO format.
        if "date_range" in filters:
            date_range = filters["date_range"]

            if "start" in date_range:
                start_date = datetime.fromisoformat(date_range["start"])
                filtered_results = [
                    r for r in filtered_results
                    if "date" not in r or not r["date"] or datetime.fromisoformat(r["date"]) >= start_date
                ]

            if "end" in date_range:
                end_date = datetime.fromisoformat(date_range["end"])
                filtered_results = [
                    r for r in filtered_results
                    if "date" not in r or not r["date"] or datetime.fromisoformat(r["date"]) <= end_date
                ]

        return filtered_results

    def group_results_by_domain(self, results: List[Dict[str, Any]]) -> Dict[str, List[Dict[str, Any]]]:
        """
        Group results by domain.

        Args:
            results: List of search results

        Returns:
            Dictionary mapping domains to lists of search results
        """
        grouped_results = {}

        for result in results:
            domain = result.get("domain", "unknown")

            if domain not in grouped_results:
                grouped_results[domain] = []

            grouped_results[domain].append(result)

        return grouped_results

    def save_results(self, results: List[Dict[str, Any]], file_path: str) -> None:
        """
        Save search results to a file.

        Args:
            results: List of search results
            file_path: Path to save results to
        """
        try:
            with open(file_path, 'w') as f:
                json.dump(results, f, indent=2)
            print(f"Results saved to {file_path}")
        except Exception as e:
            # Deliberately best-effort: report and continue.
            print(f"Error saving results: {e}")

    def load_results(self, file_path: str) -> List[Dict[str, Any]]:
        """
        Load search results from a file.

        Args:
            file_path: Path to load results from

        Returns:
            List of search results, or [] if the file cannot be read
        """
        try:
            with open(file_path, 'r') as f:
                results = json.load(f)
            return results
        except Exception as e:
            # Deliberately best-effort: report and return an empty list.
            print(f"Error loading results: {e}")
            return []
class SearchExecutor:
    """
    Executes search queries across multiple search engines.
    Manages rate limiting, error handling, and result aggregation.
    """

    def __init__(self):
        """Initialize the search executor with available search handlers."""
        self.config = get_config()
        self.handlers = self._initialize_handlers()
        # Keep only handlers whose API keys are configured.
        self.available_handlers = {name: handler for name, handler in self.handlers.items()
                                  if handler.is_available()}

    def _initialize_handlers(self) -> Dict[str, "BaseSearchHandler"]:
        """
        Initialize all search handlers.

        Returns:
            Dictionary mapping handler names to handler instances
        """
        return {
            "serper": SerperSearchHandler(),
            "scholar": ScholarSearchHandler(),
            "arxiv": ArxivSearchHandler()
        }

    def get_available_search_engines(self) -> List[str]:
        """
        Get a list of available search engines.

        Returns:
            List of available search engine names
        """
        return list(self.available_handlers.keys())

    def execute_search(self,
                      structured_query: Dict[str, Any],
                      search_engines: Optional[List[str]] = None,
                      num_results: int = 10,
                      timeout: int = 30) -> Dict[str, List[Dict[str, Any]]]:
        """
        Execute a search query across multiple search engines.

        Args:
            structured_query: Structured query from the query processor
            search_engines: List of search engines to use (if None, use all available)
            num_results: Number of results to return per search engine
            timeout: Timeout in seconds for the whole batch of searches

        Returns:
            Dictionary mapping search engine names to lists of search results.
            Engines that error out or time out map to an empty list, so a
            slow engine can never discard the others' results.
        """
        query = self._resolve_query(structured_query)
        search_engines = self._select_engines(search_engines)

        # Get the per-engine query overrides, if the processor supplied any.
        search_queries = structured_query.get("search_queries", {})

        # Execute searches in parallel
        results = {}
        with concurrent.futures.ThreadPoolExecutor() as executor:
            future_to_engine = {}

            for engine in search_engines:
                # Get the appropriate query for this engine
                engine_query = search_queries.get(engine, query)

                # Submit the search task
                future = executor.submit(
                    self._execute_single_search,
                    engine=engine,
                    query=engine_query,
                    num_results=num_results
                )
                future_to_engine[future] = engine

            # Collect results as they complete. The timeout on as_completed
            # raises TimeoutError when it expires; previously that exception
            # escaped this method and threw away every finished result, so
            # we now catch it and return partial results instead.
            try:
                for future in concurrent.futures.as_completed(future_to_engine, timeout=timeout):
                    engine = future_to_engine[future]
                    try:
                        results[engine] = future.result()
                    except Exception as e:
                        print(f"Error executing search for {engine}: {e}")
                        results[engine] = []
            except concurrent.futures.TimeoutError:
                print("Search batch timed out; returning partial results")
                for engine in future_to_engine.values():
                    if engine not in results:
                        results[engine] = []

        return results

    def _resolve_query(self, structured_query: Dict[str, Any]) -> str:
        """Derive the query string: enhanced if present, else raw, truncated."""
        raw_query = structured_query.get("raw_query", "")
        query = structured_query.get("enhanced_query", raw_query)
        # Truncate the query if it's too long (Serper API has a 2048 character limit)
        if len(query) > 2000:
            query = query[:2000]
        return query

    def _select_engines(self, search_engines: Optional[List[str]]) -> List[str]:
        """Return requested engines restricted to those actually available."""
        if search_engines is None:
            return list(self.available_handlers.keys())
        return [engine for engine in search_engines
                if engine in self.available_handlers]

    def _execute_single_search(self, engine: str, query: str, num_results: int) -> List[Dict[str, Any]]:
        """
        Execute a search on a single search engine.

        Args:
            engine: Name of the search engine
            query: Query to execute
            num_results: Number of results to return

        Returns:
            List of search results; [] for unknown engines or on error
        """
        handler = self.available_handlers.get(engine)
        if not handler:
            return []

        try:
            # Execute the search
            results = handler.search(query, num_results=num_results)
            return results
        except Exception as e:
            print(f"Error executing search for {engine}: {e}")
            return []

    async def execute_search_async(self,
                                  structured_query: Dict[str, Any],
                                  search_engines: Optional[List[str]] = None,
                                  num_results: int = 10,
                                  timeout: int = 30) -> Dict[str, List[Dict[str, Any]]]:
        """
        Execute a search query across specified search engines asynchronously.

        Args:
            structured_query: The structured query from the query processor
            search_engines: List of search engines to use (if None, use all available)
            num_results: Number of results to return per search engine
            timeout: Timeout in seconds for each search engine

        Returns:
            Dictionary mapping search engine names to lists of search results
        """
        # Derive the default query the same way as the sync path (it
        # previously fell back to "" instead of the raw query).
        query = self._resolve_query(structured_query)
        search_engines = self._select_engines(search_engines)

        # Get the per-engine query overrides, if the processor supplied any.
        search_queries = structured_query.get("search_queries", {})

        # Create tasks for each search engine
        tasks = []
        for engine in search_engines:
            engine_query = search_queries.get(engine, query)
            task = self._execute_single_search_async(engine, engine_query, num_results)
            tasks.append((engine, task))

        # Execute all tasks, bounding each engine individually.
        results = {}
        for engine, task in tasks:
            try:
                engine_results = await asyncio.wait_for(task, timeout=timeout)
                results[engine] = engine_results
            except asyncio.TimeoutError:
                print(f"Search timed out for {engine}")
                results[engine] = []
            except Exception as e:
                print(f"Error executing search for {engine}: {e}")
                results[engine] = []

        return results

    async def _execute_single_search_async(self, engine: str, query: str, num_results: int) -> List[Dict[str, Any]]:
        """
        Execute a search on a single search engine asynchronously.

        Args:
            engine: Name of the search engine
            query: Query to execute
            num_results: Number of results to return

        Returns:
            List of search results
        """
        # Execute in a thread pool since most API calls are blocking
        loop = asyncio.get_event_loop()
        return await loop.run_in_executor(
            None, self._execute_single_search, engine, query, num_results
        )
+ + Args: + engine: Name of the search engine + query: Query to execute + num_results: Number of results to return + + Returns: + List of search results + """ + # Execute in a thread pool since most API calls are blocking + loop = asyncio.get_event_loop() + return await loop.run_in_executor( + None, self._execute_single_search, engine, query, num_results + ) diff --git a/jina-ai-metaprompt.md b/jina-ai-metaprompt.md new file mode 100644 index 0000000..5b21e70 --- /dev/null +++ b/jina-ai-metaprompt.md @@ -0,0 +1,220 @@ +You are an AI engineer designed to help users use Jina AI Search Foundation API's for their specific use case. + +# Core principles + +1. Assume, the Bearer is stored in the environment variable named "JINA_API_KEY" and add the following comment to the implementation: "Get your Jina AI API key for free: https://jina.ai/?sui=apikey". +2. Use the simplest solution possible (use single API's whenever possible, do not overcomplicate things); +3. Answer "can't do" for tasks outside the scope of Jina AI Search Foundation; +4. Choose built-in features over custom implementations whenever possible; +5. Leverage multimodal models when needed; +6. You must use the Jina APIs for the implementation; +7. Never decline an implementation because of its complexity; +8. Generate production-ready code that follows exactly the requirements; +9. Never use placeholder data; +10. For every request to any of the Jina APIs, you must include the header -H "Accept: application/json" to specify that the response should be in JSON format; + +# Overview of all Jina AI APIs: +- Classification API: Given text or images, classify them into categories. +- Embeddings API: Given text or images, generate embeddings. +These embeddings can be used for similarity search, clustering, and other tasks. +- r.reader API: Input a single website URL and get an LLM-friendly version of that single website. +This is most useful when you already know where you want to get the information from. 
+- s.reader API: Given a search term, get an LLM-friendly version of all websites in the search results. +This is useful when you don't know where to get the information from, but you just know what you are looking for. +- g.reader API: Given a statement, find out if it is true or false. +This is useful for fact-checking, fake news detection, and general knowledge verification. +- Re-Ranker API: Given a query and a list of search results, re-rank them. +This is useful for improving the relevance of search results. +- Segmenter API: Given a text, e.g., the output from r.reader or s.reader, split it into segments. +This is useful for breaking down long texts into smaller, more manageable parts. +Usually this is done to get the chunks that are passed to the embeddings API. + +# Jina AI Search Foundation APIs' documentation + +11. Embeddings API +Endpoint: https://api.jina.ai/v1/embeddings +Purpose: Convert text/images to fixed-length vectors +Best for: semantic search, similarity matching, clustering, etc. 
+Method: POST +Authorization: HTTPBearer +Request body schema: {"application/json":{"model":{"type":"string","required":true,"description":"Identifier of the model to use.","options":[{"name":"jina-clip-v2","size":"885M","dimensions":1024},{"name":"jina-embeddings-v3","size":"570M","dimensions":1024}]},"input":{"type":"array","required":true,"description":"Array of input strings or objects to be embedded."},"embedding_type":{"type":"string or array of strings","required":false,"default":"float","description":"The format of the returned embeddings.","options":["float","base64","binary","ubinary"]},"task":{"type":"string","required":false,"description":"Specifies the intended downstream application to optimize embedding output.","options":["retrieval.query","retrieval.passage","text-matching","classification","separation"]},"dimensions":{"type":"integer","required":false,"description":"Truncates output embeddings to the specified size if set."},"normalized":{"type":"boolean","required":false,"default":false,"description":"If true, embeddings are normalized to unit L2 norm."},"late_chunking":{"type":"boolean","required":false,"default":false,"description":"If true, concatenates all sentences in input and treats as a single input for late chunking."}}} +Example request: {"model":"jina-embeddings-v3","input":["Hello, world!"]} +Example response: {"200":{"data":[{"embedding":"..."}],"usage":{"total_tokens":15}},"422":{"error":{"message":"Invalid input or parameters"}}} + +12. Reranker API +Endpoint: https://api.jina.ai/v1/rerank +Purpose: find the most relevant search results +Best for: refining search results, refining RAG (retrieval augmented generation) contextual chunks, etc. 
+Method: POST +Authorization: HTTPBearer +Request body schema: {"application/json":{"model":{"type":"string","required":true,"description":"Identifier of the model to use.","options":[{"name":"jina-reranker-v2-base-multilingual","size":"278M"},{"name":"jina-colbert-v2","size":"560M"}]},"query":{"type":"string or TextDoc","required":true,"description":"The search query."},"documents":{"type":"array of strings or objects","required":true,"description":"A list of text documents or strings to rerank. If a document object is provided, all text fields will be preserved in the response."},"top_n":{"type":"integer","required":false,"description":"The number of most relevant documents or indices to return, defaults to the length of documents."},"return_documents":{"type":"boolean","required":false,"default":true,"description":"If false, returns only the index and relevance score without the document text. If true, returns the index, text, and relevance score."}}} +Example request: {"model":"jina-reranker-v2-base-multilingual","query":"Search query","documents":["Document to rank 1","Document to rank 2"]} +Example response: {"results":[{"index":0,"document":{"text":"Document to rank 1"},"relevance_score":0.9},{"index":1,"document":{"text":"Document to rank 2"},"relevance_score":0.8}],"usage":{"total_tokens":15,"prompt_tokens":15}} + +13. Reader API +Endpoint: https://r.jina.ai/ +Purpose: retrieve/parse content from URL in a format optimized for downstream tasks like LLMs and other applications +Best for: extracting structured content from web pages, suitable for generative models and search applications +Method: POST +Authorization: HTTPBearer +Headers: +- **Authorization**: Bearer $JINA_API_KEY +- **Content-Type**: application/json +- **Accept**: application/json +- **X-Engine** (optional): Specifies the engine to retrieve/parse content. 
Use `readerlm-v2` for higher quality or `direct` for speed +- **X-Timeout** (optional): Specifies the maximum time (in seconds) to wait for the webpage to load +- **X-Target-Selector** (optional): CSS selectors to focus on specific elements within the page +- **X-Wait-For-Selector** (optional): CSS selectors to wait for specific elements before returning +- **X-Remove-Selector** (optional): CSS selectors to exclude certain parts of the page (e.g., headers, footers) +- **X-With-Links-Summary** (optional): `true` to gather all links at the end of the response +- **X-With-Images-Summary** (optional): `true` to gather all images at the end of the response +- **X-With-Generated-Alt** (optional): `true` to add alt text to images lacking captions +- **X-No-Cache** (optional): `true` to bypass cache for fresh retrieval +- **X-With-Iframe** (optional): `true` to include iframe content in the response +- **X-Return-Format** (optional): `markdown`, `html`, `text`, `screenshot`, or `pageshot` (for URL of full-page screenshot) +- **X-Token-Budget** (optional): Specifies maximum number of tokens to use for the request +- **X-Retain-Images** (optional): Use `none` to remove all images from the response + +Request body schema: {"application/json":{"url":{"type":"string","required":true},"options":{"type":"string","default":"Default","options":["Default","Markdown","HTML","Text","Screenshot","Pageshot"]}}} +Example cURL request: ```curl -X POST 'https://r.jina.ai/' -H "Accept: application/json" -H "Authorization: Bearer ..." 
-H "Content-Type: application/json" -H "X-No-Cache: true" -H "X-Remove-Selector: header,.class,#id" -H "X-Target-Selector: body,.class,#id" -H "X-Timeout: 10" -H "X-Wait-For-Selector: body,.class,#id" -H "X-With-Generated-Alt: true" -H "X-With-Iframe: true" -H "X-With-Images-Summary: true" -H "X-With-Links-Summary: true" -d '{"url":"https://jina.ai"}'``` +Example response: {"code":200,"status":20000,"data":{"title":"Jina AI - Your Search Foundation, Supercharged.","description":"Best-in-class embeddings, rerankers, LLM-reader, web scraper, classifiers. The best search AI for multilingual and multimodal data.","url":"https://jina.ai/","content":"Jina AI - Your Search Foundation, Supercharged.\n===============\n","images":{"Image 1":"https://jina.ai/Jina%20-%20Dark.svg"},"links":{"Newsroom":"https://jina.ai/#newsroom","Contact sales":"https://jina.ai/contact-sales","Commercial License":"https://jina.ai/COMMERCIAL-LICENSE-TERMS.pdf","Security":"https://jina.ai/legal/#security","Terms & Conditions":"https://jina.ai/legal/#terms-and-conditions","Privacy":"https://jina.ai/legal/#privacy-policy"},"usage":{"tokens +Pay attention to the response format of the reader API, the actual content of the page will be available in `response["data"]["content"]`, and links / images (if using "X-With-Links-Summary: true" or "X-With-Images-Summary: true") will be available in `response["data"]["links"]` and `response["data"]["images"]`. + +14. 
Search API +Endpoint: https://s.jina.ai/ +Purpose: search the web for information and return results in a format optimized for downstream tasks like LLMs and other applications +Best for: customizable web search with results optimized for enterprise search systems and LLMs, with options for Markdown, HTML, JSON, text, and image outputs +Method: POST +Authorization: HTTPBearer +Headers: +- **Authorization**: Bearer $JINA_API_KEY +- **Content-Type**: application/json +- **Accept**: application/json +- **X-Site** (optional): Use "X-Site: " for in-site searches limited to the given domain +- **X-With-Links-Summary** (optional): "true" to gather all page links at the end +- **X-With-Images-Summary** (optional): "true" to gather all images at the end +- **X-No-Cache** (optional): "true" to bypass cache and retrieve real-time data +- **X-With-Generated-Alt** (optional): "true" to generate captions for images without alt tags + +Request body schema: {"application/json":{"q":{"type":"string","required":true},"options":{"type":"string","default":"Default","options":["Default","Markdown","HTML","Text","Screenshot","Pageshot"]}}} +Example request cURL request: ```curl -X POST 'https://s.jina.ai/' -H "Authorization: Bearer ..." 
-H "Content-Type: application/json" -H "Accept: application/json" -H "X-No-Cache: true" -H "X-Site: https://jina.ai" -d '{"q":"When was Jina AI founded?","options":"Markdown"}'``` +Example response: {"code":200,"status":20000,"data":[{"title":"Jina AI - Your Search Foundation, Supercharged.","description":"Our frontier models form the search foundation for high-quality enterprise search...","url":"https://jina.ai/","content":"Jina AI - Your Search Foundation, Supercharged...","usage":{"tokens":10475}},{"title":"Jina AI CEO, Founder, Key Executive Team, Board of Directors & Employees","description":"An open-source vector search engine that supports structured filtering...","url":"https://www.cbinsights.com/company/jina-ai/people","content":"Jina AI Management Team...","usage":{"tokens":8472}}]} +Similarly to the reader API, you must pay attention to the response format of the search API, and you must ensure to extract the required content correctly. + +15. Grounding API +Endpoint: https://g.jina.ai/ +Purpose: verify the factual accuracy of a given statement by cross-referencing it with sources from the internet +Best for: ideal for validating claims or facts by using verifiable sources, such as company websites or social media profiles +Method: POST +Authorization: HTTPBearer +Headers: +- **Authorization**: Bearer $JINA_API_KEY +- **Content-Type**: application/json +- **Accept**: application/json +- **X-Site** (optional): comma-separated list of URLs to serve as grounding references for verifying the statement (if not specified, all sources found on the internet will be used) +- **X-No-Cache** (optional): "true" to bypass cache and retrieve real-time data + +Request body schema: {"application/json":{"statement":{"type":"string","required":true,"description":"The statement to verify for factual accuracy"}}} +Example cURL request: ```curl -X POST 'https://g.jina.ai/' -H "Accept: application/json" -H "Authorization: Bearer ..." 
-H "Content-Type: application/json" -H "X-Site: https://jina.ai, https://linkedin.com" -d '{"statement":"Jina AI was founded in 2020 in Berlin."}'``` +Example response: {"code":200,"status":20000,"data":{"factuality":1,"result":true,"reason":"The statement that Jina AI was founded in 2020 in Berlin is supported by the references. The first reference confirms the founding year as 2020 and the location as Berlin. The second and third references specify that Jina AI was founded in February 2020, which aligns with the year mentioned in the statement. Therefore, the statement is factually correct based on the provided references.","references":[{"url":"https://es.linkedin.com/company/jinaai?trk=ppro_cprof","keyQuote":"Founded in February 2020, Jina AI has swiftly emerged as a global pioneer in multimodal AI technology.","isSupportive":true},{"url":"https://jina.ai/about-us/","keyQuote":"Founded in 2020 in Berlin, Jina AI is a leading search AI company.","isSupportive":true},{"url":"https://www.linkedin.com/company/jinaai","keyQuote":"Founded in February 2020, Jina AI has swiftly emerged as a global pioneer in multimodal AI technology.","isSupportive":true}],"usage":{"tokens":7620}}} + +16. 
Segmenter API +Endpoint: https://segment.jina.ai/ +Purpose: tokenizes text, divide text into chunks +Best for: counting number of tokens in text, segmenting text into manageable chunks (ideal for downstream applications like RAG) +Method: POST +Authorization: HTTPBearer +Headers: +- **Authorization**: Bearer $JINA_API_KEY +- **Content-Type**: application/json +- **Accept**: application/json + +Request body schema: {"application/json":{"content":{"type":"string","required":true,"description":"The text content to segment."},"tokenizer":{"type":"string","required":false,"default":"cl100k_base","enum":["cl100k_base","o200k_base","p50k_base","r50k_base","p50k_edit","gpt2"],"description":"Specifies the tokenizer to use."},"return_tokens":{"type":"boolean","required":false,"default":false,"description":"If true, includes tokens and their IDs in the response."},"return_chunks":{"type":"boolean","required":false,"default":false,"description":"If true, segments the text into semantic chunks."},"max_chunk_length":{"type":"integer","required":false,"default":1000,"description":"Maximum characters per chunk (only effective if 'return_chunks' is true)."},"head":{"type":"integer","required":false,"description":"Returns the first N tokens (exclusive with 'tail')."},"tail":{"type":"integer","required":false,"description":"Returns the last N tokens (exclusive with 'head')."}}} +Example cURL request: ```curl -X POST 'https://segment.jina.ai/' -H "Content-Type: application/json" -H "Authorization: Bearer ..." -d '{"content":"\n Jina AI: Your Search Foundation, Supercharged! 🚀\n Ihrer Suchgrundlage, aufgeladen! 
🚀\n 您的搜索底座,从此不同!🚀\n 検索ベース,もう二度と同じことはありません!🚀\n","tokenizer":"cl100k_base","return_tokens":true,"return_chunks":true,"max_chunk_length":1000,"head":5}'``` +Example response: {"num_tokens":78,"tokenizer":"cl100k_base","usage":{"tokens":0},"num_chunks":4,"chunk_positions":[[3,55],[55,93],[93,110],[110,135]],"tokens":[[["J",[41]],["ina",[2259]],[" AI",[15592]],[":",[25]],[" Your",[4718]],[" Search",[7694]],[" Foundation",[5114]],[",",[11]],[" Super",[7445]],["charged",[38061]],["!",[0]],[" ",[11410]],["🚀",[248,222]],["\n",[198]],[" ",[256]]],[["I",[40]],["hr",[4171]],["er",[261]],[" Such",[15483]],["grund",[60885]],["lage",[56854]],[",",[11]],[" auf",[7367]],["gel",[29952]],["aden",[21825]],["!",[0]],[" ",[11410]],["🚀",[248,222]],["\n",[198]],[" ",[256]]],[["您",[88126]],["的",[9554]],["搜索",[80073]],["底",[11795,243]],["座",[11795,100]],[",",[3922]],["从",[46281]],["此",[33091]],["不",[16937]],["同",[42016]],["!",[6447]],["🚀",[9468,248,222]],["\n",[198]],[" ",[256]]],[["検",[162,97,250]],["索",[52084]],["ベ",[2845,247]],["ース",[61398]],[",",[11]],["も",[32977]],["う",[30297]],["二",[41920]],["度",[27479]],["と",[19732]],["同",[42016]],["じ",[100204]],["こ",[22957]],["と",[19732]],["は",[15682]],["あり",[57903]],["ま",[17129]],["せ",[72342]],["ん",[25827]],["!",[6447]],["🚀",[9468,248,222]],["\n",[198]]]],"chunks":["Jina AI: Your Search Foundation, Supercharged! 🚀\n ","Ihrer Suchgrundlage, aufgeladen! 🚀\n ","您的搜索底座,从此不同!🚀\n ","検索ベース,もう二度と同じことはありません!🚀\n"]} +Note: for the API to return chunks, you must specify `"return_chunks": true` as part of the request body. + +17. Classifier API +Endpoint: https://api.jina.ai/v1/classify +Purpose: zero-shot classification for text or images +Best for: text or image classification without training +Request body schema for text and images : {"application/json":{"model":{"type":"string","required":false,"description":"Identifier of the model to use. 
Required if classifier_id is not provided.","options":[{"name":"jina-clip-v2","size":"885M","dimensions":1024}]},"classifier_id":{"type":"string","required":false,"description":"The identifier of the classifier. If not provided, a new classifier will be created."},"input":{"type":"array","required":true,"description":"Array of inputs for classification. Each entry can either be a text object {\"text\": \"your_text_here\"} or an image object {\"image\": \"base64_image_string\"}. You cannot mix text and image objects in the same request."},"labels":{"type":"array of strings","required":true,"description":"List of labels used for classification."}}} +Example request: {"model":"jina-clip-v2","input":[{"image":"base64_image_string"}],"labels":["category1","category2"]} +Example response: {"200":{"data":[{"index":0,"prediction":"category1","object":"classification","score":0.85}],"usage":{"total_tokens":10}},"422":{"detail":[{"message":"Validation error","field":"input"}]}} +Request body schema for text: {"application/json":{"model":{"type":"string","required":false,"description":"Identifier of the model to use. Required if classifier_id is not provided.","options":[{"name":"jina-embeddings-v3","size":"223M","dimensions":768}]},"classifier_id":{"type":"string","required":false,"description":"The identifier of the classifier. If not provided, a new classifier will be created."},"input":{"type":"array","required":true,"description":"Array of text inputs for classification. 
Each entry should be a simple string representing the text to classify.","items":{"type":"string"}},"labels":{"type":"array","required":true,"description":"List of labels used for classification.","items":{"type":"string"}}}} +Example request: {"model": "jina-embeddings-v3", "input": ["walk", "marathon"], "labels": ["Simple task", "intensive task", "Creative writing"]} +Example response: {"usage":{"total_tokens":19},"data":[{"object":"classification","index":0,"prediction":"Simple task","score":0.35543856024742126,"predictions":[{"label":"Simple task","score":0.35543856024742126},{"label":"intensive task","score":0.33334434032440186},{"label":"Creative writing","score":0.3112170696258545}]},{"object":"classification","index":1,"prediction":"intensive task","score":0.3616286516189575,"predictions":[{"label":"Simple task","score":0.34063565731048584},{"label":"intensive task","score":0.3616286516189575},{"label":"Creative writing","score":0.2977357804775238}]}]} +Pay attention to the model used, when classifying images you must use `jina-clip-v2`, but when classifying text it is best to use `jina-embeddings-v3` (newest text embedding model from Jina)!!! + +**Note: all API's require authorization using the bearer token (get it from https://jina.ai/?sui=apikey)!** +Make sure that any code you generate uses the JINA_API_KEY environment variable, and remind the user to correctly set this variable before running the code! + +# Example solutions + +18. Basic search: +- For simple queries, use the search API with the given queries; +- For better relevancy, first use the search API to retrieve results, then use the reranker API to find the most relevant results; + +19. Classification tasks: +- To classify text snippets (multi-lingual texts), you can use the classification API with jina-embeddings-v3 model; +- To classify images, you can use the classification API with jina-clip-v2 model; + +20. 
Web content processing: +- To scrape a webpage, use the reader API directly; +- To embed the contents of a webpage, first use the reader API to scrape the text content of the webpage and then use the embeddings API; + +# Integration guidelines + +You should always: +- Handle API errors using try/catch blocks; +- Implement retries for network failures; +- Validate inputs before API calls; +- Pay attention to the response of each API and parse it to a usable state; + +You should not: +- Chain API's unnecessarily; +- Use reranker API without query-document pairs (reranker API needs a query as context to estimate relevancy); +- Directly use the response of an API without parsing it; + +# Limitations + +The Jina AI Search Foundation API's cannot perform any actions other than those already mentioned. +This includes: +- Generating text or images; +- Modifying or editing content; +- Executing code or perform calculations; +- Storing or caching results permanently; + +# Tips for responding to user requests + +21. Start by analyzing the task and identifying which API's should be used; + +22. If multiple API's are required, outline the purpose of each API; + +23. Write the code for calling each API as a separate function, and correctly handle any possible errors; +It is important to write reusable code, so that the user can reap the most benefits out of your response. +```python +def read(url): + ... + +def main(): + ... +``` +Note: make sure you parse the response of each API correctly so that it can be used in the code. +For example, if you want to read the content of the page, you should extract the content from the response of the reader API like `content = reader_response["data"]["content"]`. +Another example, if you want to extract all the URL from a page, you can use the reader API with the "X-With-Links-Summary: true" header and then you can extract the links like `links = reader_response["data"]["links"]`. + +24. 
Write the complete code, including input loading, calling the API functions, and saving/printing results; +Remember to use variables for required API keys, and point out to the user that they need to correctly set these variables. + +25. Finally, Jina AI API endpoints rate limits: +Embedding & Reranker APIs (api.jina.ai/v1/embeddings, /rerank): 500 RPM & 1M TPM with API key; 2k RPM & 5M TPM with premium key +Reader APIs: + - r.jina.ai: 200 RPM, 1k RPM premium + - s.jina.ai: 40 RPM, 100 RPM premium + - g.jina.ai: 10 RPM, 30 RPM premium +Classifier APIs (api.jina.ai/v1/classify): + - 200 RPM & 500k TPM; 1k RPM & 3M TPM premium +Segmenter API (segment.jina.ai): 200 RPM, 1k RPM premium + +Approach your task step by step. diff --git a/jina_similarity.py b/jina_similarity.py new file mode 100644 index 0000000..a4e1b61 --- /dev/null +++ b/jina_similarity.py @@ -0,0 +1,112 @@ +""" +A module for computing text similarity using Jina AI's Embeddings API. +Get your Jina AI API key for free: https://jina.ai/?sui=apikey + +The jina-embeddings-v3 model supports input lengths of up to 8,192 tokens. +For longer texts, consider using Jina's Segmenter API to split into smaller chunks. 
+""" + +import os +import requests +import numpy as np +import tiktoken +from typing import Tuple + +class TokenLimitError(Exception): + """Raised when input text exceeds the token limit.""" + pass + +class JinaSimilarity: + MAX_TOKENS = 8192 + + def __init__(self): + """Initialize the JinaSimilarity class.""" + self.api_key = os.environ.get("JINA_API_KEY") + if not self.api_key: + raise ValueError("JINA_API_KEY environment variable not set") + + self.headers = { + "Authorization": f"Bearer {self.api_key}", + "Accept": "application/json", + "Content-Type": "application/json" + } + self.embeddings_url = "https://api.jina.ai/v1/embeddings" + # Initialize tokenizer - using cl100k_base which is used by many modern models + self.tokenizer = tiktoken.get_encoding("cl100k_base") + + def count_tokens(self, text: str) -> int: + """Count the number of tokens in a text. + + Args: + text: The text to count tokens for + + Returns: + int: Number of tokens in the text + """ + return len(self.tokenizer.encode(text)) + + def get_embedding(self, text: str) -> list: + """Get embedding for a piece of text using Jina AI's Embeddings API. + + Args: + text: The text to get embeddings for (max 8,192 tokens) + + Returns: + list: The embedding vector + + Raises: + TokenLimitError: If the text exceeds 8,192 tokens + requests.exceptions.RequestException: If the API call fails + """ + num_tokens = self.count_tokens(text) + if num_tokens > self.MAX_TOKENS: + raise TokenLimitError( + f"Input text is {num_tokens} tokens, which exceeds the maximum of {self.MAX_TOKENS} tokens. " + "Consider using Jina's Segmenter API to split into smaller chunks." 
+ ) + + payload = { + "model": "jina-embeddings-v3", + "input": [text], + "normalized": True # For cosine similarity + } + + response = requests.post( + self.embeddings_url, + headers=self.headers, + json=payload + ) + response.raise_for_status() + + return response.json()["data"][0]["embedding"] + + def compute_similarity(self, chunk: str, query: str) -> Tuple[float, list, list]: + """Compute similarity between a text chunk and a query. + + Args: + chunk: The text chunk to compare against + query: The query text + + Returns: + Tuple containing: + - float: Cosine similarity score (0-1) + - list: Chunk embedding + - list: Query embedding + + Raises: + TokenLimitError: If the text exceeds 8,192 tokens + requests.exceptions.RequestException: If the API calls fail + """ + # Get embeddings for both texts + chunk_embedding = self.get_embedding(chunk) + query_embedding = self.get_embedding(query) + + # Convert to numpy arrays for efficient computation + chunk_vec = np.array(chunk_embedding) + query_vec = np.array(query_embedding) + + # Compute cosine similarity + # Since vectors are normalized, dot product equals cosine similarity + similarity = float(np.dot(chunk_vec, query_vec)) + + return similarity, chunk_embedding, query_embedding diff --git a/markdown_segmenter.py b/markdown_segmenter.py new file mode 100644 index 0000000..49cb273 --- /dev/null +++ b/markdown_segmenter.py @@ -0,0 +1,62 @@ +import os +import json +import requests + +# Get your Jina AI API key for free: https://jina.ai/?sui=apikey +JINA_API_KEY = os.getenv('JINA_API_KEY') + + +def segment_markdown(file_path): + """ + Segments a markdown file using Jina AI's Segmenter API. + + Args: + file_path (str): Path to the markdown file. + + Returns: + dict: JSON structure containing the segments. 
+ """ + try: + # Read the markdown file + with open(file_path, 'r') as file: + markdown_content = file.read() + + # Prepare the request to Jina Segmenter API + headers = { + 'Authorization': f'Bearer {JINA_API_KEY}', + 'Content-Type': 'application/json', + 'Accept': 'application/json' + } + data = { + 'content': markdown_content, + 'tokenizer': 'cl100k_base', + 'return_tokens': False, + 'return_chunks': True, + 'max_chunk_length': 1000 + } + + # Make the API request + response = requests.post( + 'https://segment.jina.ai/', + headers=headers, + json=data + ) + response.raise_for_status() + + # Return the segments as JSON + return response.json() + + except Exception as e: + print(f'Error segmenting markdown: {str(e)}') + return None + + +if __name__ == '__main__': + import sys + if len(sys.argv) != 2: + print('Usage: python markdown_segmenter.py ') + sys.exit(1) + + segments = segment_markdown(sys.argv[1]) + if segments: + print(json.dumps(segments, indent=2)) diff --git a/query/__init__.py b/query/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/query/llm_interface.py b/query/llm_interface.py new file mode 100644 index 0000000..70059df --- /dev/null +++ b/query/llm_interface.py @@ -0,0 +1,263 @@ +""" +LLM interface module using LiteLLM. + +This module provides a unified interface to various LLM providers through LiteLLM, +enabling query enhancement, classification, and other LLM-powered functionality. +""" + +import os +import json +from typing import Dict, Any, List, Optional, Tuple, Union + +import litellm +from litellm import completion + +from config.config import get_config + + +class LLMInterface: + """Interface for interacting with LLMs through LiteLLM.""" + + def __init__(self, model_name: Optional[str] = None): + """ + Initialize the LLM interface. + + Args: + model_name: Name of the LLM model to use. If None, uses the default model + from configuration. 
+ """ + self.config = get_config() + + # Use specified model or default from config + self.model_name = model_name or self.config.config_data.get('default_model', 'gpt-3.5-turbo') + + # Get model-specific configuration + self.model_config = self.config.get_model_config(self.model_name) + + # Set up LiteLLM with the appropriate provider + self._setup_provider() + + def _setup_provider(self) -> None: + """Set up the LLM provider based on the model configuration.""" + provider = self.model_config.get('provider', 'openai') + + try: + # Get API key for the provider + api_key = self.config.get_api_key(provider) + + # Set environment variable for the provider + os.environ[f"{provider.upper()}_API_KEY"] = api_key + + print(f"LLM interface initialized with model: {self.model_name} (provider: {provider})") + except ValueError as e: + print(f"Error setting up LLM provider: {e}") + + def _get_completion_params(self) -> Dict[str, Any]: + """ + Get parameters for LLM completion based on model configuration. 
+ + Returns: + Dictionary of parameters for LiteLLM completion + """ + params = { + 'temperature': self.model_config.get('temperature', 0.7), + 'max_tokens': self.model_config.get('max_tokens', 1000), + 'top_p': self.model_config.get('top_p', 1.0) + } + + # Handle different provider configurations + provider = self.model_config.get('provider', 'openai') + + if provider == 'azure': + # Azure OpenAI requires special handling + deployment_name = self.model_config.get('deployment_name') + api_version = self.model_config.get('api_version') + endpoint = self.model_config.get('endpoint') + + if deployment_name and endpoint: + # Format: azure/deployment_name + params['model'] = f"azure/{deployment_name}" + + # Set Azure-specific environment variables if not already set + if 'AZURE_API_BASE' not in os.environ and endpoint: + os.environ['AZURE_API_BASE'] = endpoint + + if 'AZURE_API_VERSION' not in os.environ and api_version: + os.environ['AZURE_API_VERSION'] = api_version + else: + # Fall back to default model if Azure config is incomplete + params['model'] = self.model_name + elif provider in ['ollama', 'groq', 'openrouter'] or self.model_config.get('endpoint'): + # For providers with custom endpoints + params['model'] = self.model_config.get('model_name', self.model_name) + params['api_base'] = self.model_config.get('endpoint') + + # Special handling for OpenRouter + if provider == 'openrouter': + # Set HTTP headers for OpenRouter if needed + params['headers'] = { + 'HTTP-Referer': 'https://sim-search.app', # Replace with your actual app URL + 'X-Title': 'Intelligent Research System' # Replace with your actual app name + } + else: + # Standard provider (OpenAI, Anthropic, etc.) + params['model'] = self.model_name + + return params + + def generate_completion(self, messages: List[Dict[str, str]], stream: bool = False) -> Union[str, Any]: + """ + Generate a completion using the configured LLM. 
+ + Args: + messages: List of message dictionaries with 'role' and 'content' keys + stream: Whether to stream the response + + Returns: + If stream is False, returns the completion text as a string + If stream is True, returns the completion response object for streaming + """ + try: + params = self._get_completion_params() + params['messages'] = messages + params['stream'] = stream + + response = completion(**params) + + if stream: + return response + else: + return response.choices[0].message.content + except Exception as e: + print(f"Error generating completion: {e}") + return f"Error: {str(e)}" + + def enhance_query(self, query: str) -> str: + """ + Enhance a user query using the LLM. + + Args: + query: The raw user query + + Returns: + Enhanced query with additional context and structure + """ + # Get the model assigned to this specific function + model_name = self.config.get_module_model('query_processing', 'enhance_query') + + # Create a new interface with the assigned model if different from current + if model_name != self.model_name: + interface = LLMInterface(model_name) + return interface._enhance_query_impl(query) + + return self._enhance_query_impl(query) + + def _enhance_query_impl(self, query: str) -> str: + """Implementation of query enhancement.""" + messages = [ + {"role": "system", "content": "You are an AI research assistant. Your task is to enhance the user's query by adding relevant context, clarifying ambiguities, and expanding key terms. Maintain the original intent of the query while making it more comprehensive and precise. Return ONLY the enhanced query text without any explanations, introductions, or additional text. 
The enhanced query should be ready to be sent directly to a search engine."}, + {"role": "user", "content": f"Enhance this research query: {query}"} + ] + + return self.generate_completion(messages) + + def classify_query(self, query: str) -> Dict[str, Any]: + """ + Classify a query to determine its type, intent, and key entities. + + Args: + query: The user query to classify + + Returns: + Dictionary containing query classification information + """ + # Get the model assigned to this specific function + model_name = self.config.get_module_model('query_processing', 'classify_query') + + # Create a new interface with the assigned model if different from current + if model_name != self.model_name: + interface = LLMInterface(model_name) + return interface._classify_query_impl(query) + + return self._classify_query_impl(query) + + def _classify_query_impl(self, query: str) -> Dict[str, Any]: + """Implementation of query classification.""" + messages = [ + {"role": "system", "content": "You are an AI research assistant. Analyze the user's query and classify it according to type (factual, exploratory, comparative, etc.), intent, and key entities. Respond with a JSON object containing these classifications."}, + {"role": "user", "content": f"Classify this research query: {query}"} + ] + + response = self.generate_completion(messages) + + try: + # Try to parse as JSON + classification = json.loads(response) + return classification + except json.JSONDecodeError: + # If not valid JSON, return a basic classification + return { + "type": "unknown", + "intent": "research", + "entities": [query], + "error": "Failed to parse LLM response as JSON" + } + + def generate_search_queries(self, query: str, search_engines: List[str]) -> Dict[str, List[str]]: + """ + Generate optimized search queries for different search engines. 
+ + Args: + query: The original user query + search_engines: List of search engines to generate queries for + + Returns: + Dictionary mapping search engines to lists of optimized queries + """ + # Get the model assigned to this specific function + model_name = self.config.get_module_model('query_processing', 'generate_search_queries') + + # Create a new interface with the assigned model if different from current + if model_name != self.model_name: + interface = LLMInterface(model_name) + return interface._generate_search_queries_impl(query, search_engines) + + return self._generate_search_queries_impl(query, search_engines) + + def _generate_search_queries_impl(self, query: str, search_engines: List[str]) -> Dict[str, List[str]]: + """Implementation of search query generation.""" + engines_str = ", ".join(search_engines) + + messages = [ + {"role": "system", "content": f"You are an AI research assistant. Generate optimized search queries for the following search engines: {engines_str}. For each search engine, provide 3 variations of the query that are optimized for that engine's search algorithm and will yield comprehensive results."}, + {"role": "user", "content": f"Generate optimized search queries for this research topic: {query}"} + ] + + response = self.generate_completion(messages) + + try: + # Try to parse as JSON + queries = json.loads(response) + return queries + except json.JSONDecodeError: + # If not valid JSON, return a basic query set + return {engine: [query] for engine in search_engines} + + +# Create a singleton instance for global use +llm_interface = LLMInterface() + + +def get_llm_interface(model_name: Optional[str] = None) -> LLMInterface: + """ + Get the global LLM interface instance or create a new one with a specific model. 
+ + Args: + model_name: Optional model name to use instead of the default + + Returns: + LLMInterface instance + """ + if model_name: + return LLMInterface(model_name) + return llm_interface diff --git a/query/query_processor.py b/query/query_processor.py new file mode 100644 index 0000000..f2d0f2f --- /dev/null +++ b/query/query_processor.py @@ -0,0 +1,111 @@ +""" +Query processor module for the intelligent research system. + +This module handles the processing of user queries, including enhancement, +classification, and structuring for downstream modules. +""" + +from typing import Dict, Any, List, Optional + +from .llm_interface import get_llm_interface + + +class QueryProcessor: + """ + Processor for user research queries. + + This class handles the processing of user queries, including enhancement, + classification, and structuring for downstream modules. + """ + + def __init__(self): + """Initialize the query processor.""" + self.llm_interface = get_llm_interface() + + def process_query(self, query: str) -> Dict[str, Any]: + """ + Process a user query. + + Args: + query: The raw user query + + Returns: + Dictionary containing the processed query information + """ + # Enhance the query + enhanced_query = self.llm_interface.enhance_query(query) + + # Classify the query + classification = self.llm_interface.classify_query(query) + + # Extract entities from the classification + entities = classification.get('entities', []) + + # Structure the query for downstream modules + structured_query = self._structure_query(query, enhanced_query, classification) + + return structured_query + + def _structure_query(self, original_query: str, enhanced_query: str, + classification: Dict[str, Any]) -> Dict[str, Any]: + """ + Structure a query for downstream modules. 
+ + Args: + original_query: The original user query + enhanced_query: The enhanced query + classification: The query classification + + Returns: + Dictionary containing the structured query + """ + return { + 'original_query': original_query, + 'enhanced_query': enhanced_query, + 'type': classification.get('type', 'unknown'), + 'intent': classification.get('intent', 'research'), + 'entities': classification.get('entities', []), + 'timestamp': None, # Will be filled in by the caller + 'metadata': { + 'classification': classification + } + } + + def generate_search_queries(self, structured_query: Dict[str, Any], + search_engines: List[str]) -> Dict[str, Any]: + """ + Generate optimized search queries for different search engines. + + Args: + structured_query: The structured query + search_engines: List of search engines to generate queries for + + Returns: + Updated structured query with search queries + """ + # Use the enhanced query for generating search queries + enhanced_query = structured_query['enhanced_query'] + + # Generate search queries for each engine + search_queries = self.llm_interface.generate_search_queries( + enhanced_query, search_engines + ) + + # Add search queries to the structured query + structured_query['search_queries'] = search_queries + + return structured_query + + +# Create a singleton instance for global use +query_processor = QueryProcessor() + + +def get_query_processor() -> QueryProcessor: + """ + Get the global query processor instance. + + Returns: + QueryProcessor instance + """ + return query_processor diff --git a/ranking/__init__.py b/ranking/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/ranking/jina_reranker.py b/ranking/jina_reranker.py new file mode 100644 index 0000000..320b154 --- /dev/null +++ b/ranking/jina_reranker.py @@ -0,0 +1,155 @@ +""" +Jina AI Reranker module for the intelligent research system. 
+ +This module provides functionality to rerank documents based on their relevance +to a query using Jina AI's Reranker API. +""" + +import os +import json +import requests +from typing import List, Dict, Any, Optional, Union + +from config.config import get_config + + +class JinaReranker: + """ + Document reranker using Jina AI's Reranker API. + + This class provides methods to rerank documents based on their relevance + to a query, improving the quality of search results. + """ + + def __init__(self): + """Initialize the Jina Reranker.""" + self.config = get_config() + self.api_key = self._get_api_key() + self.endpoint = "https://api.jina.ai/v1/rerank" + + # Get reranker configuration + self.reranker_config = self.config.config_data.get('jina', {}).get('reranker', {}) + self.model = self.reranker_config.get('model', 'jina-reranker-v2-base-multilingual') + self.default_top_n = self.reranker_config.get('top_n', 10) + + def _get_api_key(self) -> str: + """ + Get the Jina AI API key. + + Returns: + The API key as a string + + Raises: + ValueError: If the API key is not found + """ + try: + return self.config.get_api_key('jina') + except ValueError as e: + raise ValueError(f"Jina AI API key not found. {str(e)}") + + def rerank(self, query: str, documents: List[str], + top_n: Optional[int] = None) -> List[Dict[str, Any]]: + """ + Rerank documents based on their relevance to the query. 
+ + Args: + query: The query to rank documents against + documents: List of document strings to rerank + top_n: Number of top results to return (optional) + + Returns: + List of dictionaries containing reranked documents with scores and indices + + Raises: + Exception: If there's an error calling the Reranker API + """ + if not documents: + return [] + + # Use default top_n if not specified + if top_n is None: + top_n = min(self.default_top_n, len(documents)) + else: + top_n = min(top_n, len(documents)) + + headers = { + "Content-Type": "application/json", + "Authorization": f"Bearer {self.api_key}", + "Accept": "application/json" + } + + data = { + "model": self.model, + "query": query, + "documents": documents, + "top_n": top_n + } + + try: + response = requests.post(self.endpoint, headers=headers, json=data) + response.raise_for_status() # Raise exception for HTTP errors + + result = response.json() + + # Process and return the reranked results + reranked_results = [] + for item in result.get('results', []): + reranked_results.append({ + 'index': item.get('index'), # Original index in the documents list + 'score': item.get('score'), # Relevance score + 'document': documents[item.get('index')] # The actual document content + }) + + return reranked_results + + except Exception as e: + print(f"Error calling Jina Reranker API: {str(e)}") + # Return original documents with default ordering in case of error + return [{'index': i, 'score': 1.0, 'document': doc} for i, doc in enumerate(documents[:top_n])] + + def rerank_with_metadata(self, query: str, documents: List[Dict[str, Any]], + document_key: str = 'content', + top_n: Optional[int] = None) -> List[Dict[str, Any]]: + """ + Rerank documents with metadata based on their relevance to the query. 
+ + Args: + query: The query to rank documents against + documents: List of document dictionaries containing content and metadata + document_key: The key in the document dictionaries that contains the text content + top_n: Number of top results to return (optional) + + Returns: + List of dictionaries containing reranked documents with scores, indices, and original metadata + + Raises: + Exception: If there's an error calling the Reranker API + """ + if not documents: + return [] + + # Extract document contents + doc_contents = [doc.get(document_key, "") for doc in documents] + + # Rerank the document contents + reranked_results = self.rerank(query, doc_contents, top_n) + + # Add original metadata to the results + for result in reranked_results: + result['metadata'] = documents[result['index']] + + return reranked_results + + +# Create a singleton instance for global use +jina_reranker = JinaReranker() + + +def get_jina_reranker() -> JinaReranker: + """ + Get the global Jina Reranker instance. + + Returns: + JinaReranker instance + """ + return jina_reranker diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..6cd3c9c --- /dev/null +++ b/requirements.txt @@ -0,0 +1,7 @@ +requests>=2.31.0 +numpy>=1.24.0 +tiktoken>=0.5.0 +litellm>=1.0.0 +gradio>=4.0.0 +pyyaml>=6.0 +python-dotenv>=1.0.0 diff --git a/sample_chunk.txt b/sample_chunk.txt new file mode 100644 index 0000000..cac066c --- /dev/null +++ b/sample_chunk.txt @@ -0,0 +1 @@ +The quick brown fox jumps over the lazy dog. This classic pangram contains every letter of the English alphabet at least once. Pangrams are often used to display font samples and test keyboards and printers. While "The quick brown fox jumps over the lazy dog" is the most famous pangram in English, many other examples exist. diff --git a/sample_query.txt b/sample_query.txt new file mode 100644 index 0000000..4522b70 --- /dev/null +++ b/sample_query.txt @@ -0,0 +1 @@ +What is a pangram used for? 
def main():
    """Test all search handlers."""
    # Run one canned query through every configured search handler.
    executor = SearchExecutor()

    query = 'quantum computing'
    results = executor.execute_search({
        'raw_query': query,
        'enhanced_query': query,
    })

    # Summarise which engines actually produced hits.
    engines_with_hits = [name for name, hits in results.items() if hits]
    print(f'Results by source: {engines_with_hits}')

    # Per-engine detail, including one sample record when available.
    print('\nDetails:')
    for name, hits in results.items():
        print(f'{name}: {len(hits)} results')
        if hits:
            print(f'  Sample result: {hits[0]}')

    return results
+""" + +import os +import time +from query.llm_interface import LLMInterface + +def test_groq_model(): + """Test the Groq model.""" + # Ask for the API key + api_key = input("Enter your Groq API key: ") + os.environ["GROQ_API_KEY"] = api_key + + # Initialize the LLM interface with the Groq model + llm = LLMInterface(model_name="llama-3.1-8b-instant") + + # Test queries + test_queries = [ + "What are the latest advancements in quantum computing?", + "Compare renewable energy sources and their efficiency", + "Explain the impact of artificial intelligence on healthcare" + ] + + # Process each query + for query in test_queries: + print(f"\nProcessing query: '{query}'") + print("-" * 50) + + start_time = time.time() + response = llm._enhance_query_impl(query) + end_time = time.time() + + print(f"Processing time: {end_time - start_time:.2f} seconds") + print("\nEnhanced Query:") + print("-" * 50) + print(response) + print("-" * 50) + + # Wait a bit between queries + time.sleep(1) + +if __name__ == "__main__": + test_groq_model() diff --git a/test_query_processor.py b/test_query_processor.py new file mode 100644 index 0000000..2e11889 --- /dev/null +++ b/test_query_processor.py @@ -0,0 +1,148 @@ +#!/usr/bin/env python3 +""" +Test script for the query processor module. + +This script tests the query processor with the Groq models. 
+""" + +import os +import json +from datetime import datetime +from typing import Dict, Any + +from query.query_processor import QueryProcessor, get_query_processor +from query.llm_interface import LLMInterface, get_llm_interface +from config.config import get_config + +# Create a config.yaml file if it doesn't exist +config_dir = os.path.join(os.path.dirname(__file__), "config") +config_file = os.path.join(config_dir, "config.yaml") +if not os.path.exists(config_file): + example_file = os.path.join(config_dir, "config.yaml.example") + if os.path.exists(example_file): + with open(example_file, "r") as f: + example_content = f.read() + + with open(config_file, "w") as f: + f.write(example_content) + + print(f"Created config.yaml from example file") + +# Force the use of Groq model for testing +# First, create a global LLM interface with the Groq model +groq_interface = get_llm_interface("llama-3.1-8b-instant") +print(f"Using model: {groq_interface.model_name}") + +# Monkey patch the get_llm_interface function to always return our Groq interface +import query.llm_interface +original_get_llm_interface = query.llm_interface.get_llm_interface + +def patched_get_llm_interface(*args, **kwargs): + return groq_interface + +query.llm_interface.get_llm_interface = patched_get_llm_interface + +def test_process_query(query: str) -> Dict[str, Any]: + """ + Test the query processing functionality. 
+ + Args: + query: The query to process + + Returns: + The processed query result + """ + # Get the query processor (which will use our patched LLM interface) + processor = get_query_processor() + + # Process the query + print(f"\nProcessing query: '{query}'") + print("-" * 50) + + start_time = datetime.now() + result = processor.process_query(query) + end_time = datetime.now() + + # Add timestamp + result['timestamp'] = datetime.now().isoformat() + + # Calculate processing time + processing_time = (end_time - start_time).total_seconds() + print(f"Processing time: {processing_time:.2f} seconds") + + # Print the result in a formatted way + print("\nProcessed Query Result:") + print("-" * 50) + print(f"Original Query: {result['original_query']}") + print(f"Enhanced Query: {result['enhanced_query']}") + print(f"Query Type: {result['type']}") + print(f"Query Intent: {result['intent']}") + print(f"Entities: {', '.join(result['entities'])}") + print("-" * 50) + + return result + + +def test_generate_search_queries(structured_query: Dict[str, Any], + search_engines: list = None) -> Dict[str, Any]: + """ + Test the search query generation functionality. 
+ + Args: + structured_query: The structured query to generate search queries for + search_engines: List of search engines to generate queries for + + Returns: + The updated structured query with search queries + """ + if search_engines is None: + search_engines = ["google", "bing", "scholar"] + + # Get the query processor (which will use our patched LLM interface) + processor = get_query_processor() + + # Generate search queries + print(f"\nGenerating search queries for engines: {', '.join(search_engines)}") + print("-" * 50) + + start_time = datetime.now() + result = processor.generate_search_queries(structured_query, search_engines) + end_time = datetime.now() + + # Calculate processing time + processing_time = (end_time - start_time).total_seconds() + print(f"Processing time: {processing_time:.2f} seconds") + + # Print the generated search queries + print("\nGenerated Search Queries:") + print("-" * 50) + for engine, queries in result['search_queries'].items(): + print(f"\n{engine.upper()} Queries:") + for i, query in enumerate(queries, 1): + print(f" {i}. {query}") + print("-" * 50) + + return result + + +def main(): + """Run the query processor tests.""" + # Test queries + test_queries = [ + "What are the latest advancements in quantum computing?", + "Compare renewable energy sources and their efficiency", + "Explain the impact of artificial intelligence on healthcare" + ] + + # Process each query + for query in test_queries: + structured_query = test_process_query(query) + + # Generate search queries for the processed query + test_generate_search_queries(structured_query) + + print("\n" + "=" * 80 + "\n") + + +if __name__ == "__main__": + main() diff --git a/test_query_processor_comprehensive.py b/test_query_processor_comprehensive.py new file mode 100644 index 0000000..defa9eb --- /dev/null +++ b/test_query_processor_comprehensive.py @@ -0,0 +1,236 @@ +#!/usr/bin/env python3 +""" +Comprehensive test script for the query processor module. 
import os
import json
import time
from datetime import datetime
from typing import Dict, Any, List

from query.query_processor import QueryProcessor, get_query_processor
from query.llm_interface import LLMInterface, get_llm_interface
from config.config import get_config

# Bootstrap a config.yaml from the example file so the test can run on a
# fresh checkout without manual setup.
config_dir = os.path.join(os.path.dirname(__file__), "config")
config_file = os.path.join(config_dir, "config.yaml")
if not os.path.exists(config_file):
    example_file = os.path.join(config_dir, "config.yaml.example")
    if os.path.exists(example_file):
        with open(example_file, "r") as f:
            example_content = f.read()

        with open(config_file, "w") as f:
            f.write(example_content)

        # Plain string: there are no placeholders to interpolate.
        print("Created config.yaml from example file")

# Create a global LLM interface with the Groq model
groq_interface = get_llm_interface("llama-3.1-8b-instant")
print(f"Using model: {groq_interface.model_name}")

# Monkey patch get_llm_interface so every component under test shares the
# same Groq-backed interface; keep a handle to the original for restoring.
import query.llm_interface
original_get_llm_interface = query.llm_interface.get_llm_interface

def patched_get_llm_interface(*args, **kwargs):
    """Return the shared Groq interface regardless of the requested model."""
    return groq_interface

query.llm_interface.get_llm_interface = patched_get_llm_interface

# Test data: a spread of factual, research, comparative and domain queries
TEST_QUERIES = [
    # Simple factual queries
    "What is quantum computing?",
    "Who invented the internet?",

    # Complex research queries
    "What are the latest advancements in renewable energy?",
    "How does artificial intelligence impact healthcare?",

    # Comparative queries
    "Compare machine learning and deep learning",
    "What are the differences between solar and wind energy?",

    # Domain-specific queries
    "Explain the CRISPR-Cas9 gene editing technology",
    "What are the implications of blockchain for finance?"
]

SEARCH_ENGINES = ["google", "bing", "scholar"]


def test_enhance_query(query: str) -> str:
    """
    Test the query enhancement functionality.

    Args:
        query: The query to enhance

    Returns:
        The enhanced query
    """
    # Plain header string: no placeholders to interpolate.
    print("\nTesting Query Enhancement")
    print(f"Original Query: '{query}'")
    print("-" * 50)

    start_time = time.time()
    enhanced_query = groq_interface.enhance_query(query)
    end_time = time.time()

    print(f"Processing time: {end_time - start_time:.2f} seconds")
    print(f"Enhanced Query: '{enhanced_query}'")
    print("-" * 50)

    return enhanced_query


def test_classify_query(query: str) -> Dict[str, Any]:
    """
    Test the query classification functionality.

    Args:
        query: The query to classify

    Returns:
        The classification result
    """
    print("\nTesting Query Classification")
    print(f"Query: '{query}'")
    print("-" * 50)

    start_time = time.time()
    classification = groq_interface.classify_query(query)
    end_time = time.time()

    print(f"Processing time: {end_time - start_time:.2f} seconds")
    print(f"Classification: {json.dumps(classification, indent=2)}")
    print("-" * 50)

    return classification


def test_process_query(query: str) -> Dict[str, Any]:
    """
    Test the query processing functionality.

    Args:
        query: The query to process

    Returns:
        The processed query result
    """
    # Get the query processor (which will use our patched LLM interface)
    processor = get_query_processor()

    # Process the query
    print("\nTesting Query Processing")
    print(f"Query: '{query}'")
    print("-" * 50)

    start_time = time.time()
    result = processor.process_query(query)
    end_time = time.time()

    # Add timestamp
    result['timestamp'] = datetime.now().isoformat()

    # Calculate processing time
    print(f"Processing time: {end_time - start_time:.2f} seconds")

    # Print the result in a formatted way
    print(f"Original Query: {result['original_query']}")
    print(f"Enhanced Query: {result['enhanced_query']}")
    print(f"Query Type: {result['type']}")
    print(f"Query Intent: {result['intent']}")
    print(f"Entities: {', '.join(result['entities'])}")
    print("-" * 50)

    return result


def test_generate_search_queries(structured_query: Dict[str, Any],
                                 search_engines: List[str]) -> Dict[str, Any]:
    """
    Test the search query generation functionality.

    Args:
        structured_query: The structured query to generate search queries for
        search_engines: List of search engines to generate queries for

    Returns:
        The updated structured query with search queries
    """
    # Get the query processor (which will use our patched LLM interface)
    processor = get_query_processor()

    # Generate search queries
    print("\nTesting Search Query Generation")
    print(f"Engines: {', '.join(search_engines)}")
    print("-" * 50)

    start_time = time.time()
    result = processor.generate_search_queries(structured_query, search_engines)
    end_time = time.time()

    # Calculate processing time
    print(f"Processing time: {end_time - start_time:.2f} seconds")

    # Print the generated search queries
    for engine, queries in result['search_queries'].items():
        print(f"\n{engine.upper()} Queries:")
        for i, query in enumerate(queries, 1):
            print(f"  {i}. {query}")
    print("-" * 50)

    return result


def run_comprehensive_tests():
    """Run comprehensive tests on the query processor."""
    results = []

    for i, query in enumerate(TEST_QUERIES, 1):
        print(f"\n\nTEST {i}: {query}")
        print("=" * 80)

        # Test individual components
        enhanced_query = test_enhance_query(query)
        classification = test_classify_query(query)

        # Test the full query processing pipeline
        structured_query = test_process_query(query)

        # Test search query generation for a subset of queries
        if i % 2 == 0:  # Only test every other query to save time
            search_result = test_generate_search_queries(structured_query, SEARCH_ENGINES)
            structured_query = search_result

        # Save results
        results.append({
            "query": query,
            "enhanced_query": enhanced_query,
            "classification": classification,
            "structured_query": structured_query
        })

        print("\n" + "=" * 80 + "\n")

        # Add a delay between tests to avoid rate limiting
        if i < len(TEST_QUERIES):
            print("Waiting 2 seconds before next test...")
            time.sleep(2)

    # Save results to a file
    output_file = "query_processor_test_results.json"
    with open(output_file, "w") as f:
        json.dump(results, f, indent=2)

    print(f"\nTest results saved to {output_file}")


if __name__ == "__main__":
    run_comprehensive_tests()
+""" + +import os +import json +import time +from typing import Dict, List, Any, Optional + +# Import the necessary modules +try: + from query.query_processor import get_query_processor, QueryProcessor + from query.llm_interface import get_llm_interface + from execution.search_executor import SearchExecutor + from execution.result_collector import ResultCollector +except ImportError as e: + print(f"Import error: {e}") + print("Make sure all required modules are installed and available.") + exit(1) + + +def get_query_processor(): + """Get a query processor instance.""" + # First set the LLM interface to use Groq's model + from query.llm_interface import get_llm_interface + get_llm_interface(model_name="llama-3.1-8b-instant") + + # Then get the query processor which will use the configured LLM interface + from query.query_processor import get_query_processor + return get_query_processor() + + +def test_search_execution(query: str, search_engines: Optional[List[str]] = None) -> Dict[str, Any]: + """ + Test the search execution module. 
+ + Args: + query: The query to process and execute + search_engines: List of search engines to use (if None, use all available) + + Returns: + Dictionary with test results + """ + print(f"Testing search execution for query: {query}") + + # Process the query + processor = get_query_processor() + start_time = time.time() + structured_query = processor.process_query(query) + query_time = time.time() - start_time + + print(f"Query processed in {query_time:.2f} seconds") + print(f"Enhanced query: {structured_query.get('enhanced_query', '')}") + print(f"Classification: {structured_query.get('classification', {})}") + + # Execute the search + executor = SearchExecutor() + + # Get available search engines if none specified + if search_engines is None: + search_engines = executor.get_available_search_engines() + print(f"Using available search engines: {search_engines}") + + # Execute the search + start_time = time.time() + search_results = executor.execute_search(structured_query, search_engines=search_engines) + search_time = time.time() - start_time + + print(f"Search executed in {search_time:.2f} seconds") + + # Print raw search results for debugging + print("\nRaw search results:") + for engine, results in search_results.items(): + print(f" {engine}: {len(results)} results") + if results: + print(f" Sample result: {results[0]}") + + # Process the results + collector = ResultCollector() + processed_results = collector.process_results(search_results, dedup=True) + + # Print summary of results + total_results = len(processed_results) + print(f"Found {total_results} results after deduplication") + + # Print results by source + results_by_source = {} + for result in processed_results: + source = result.get("source", "unknown") + if source not in results_by_source: + results_by_source[source] = 0 + results_by_source[source] += 1 + + print("Results by source:") + for source, count in results_by_source.items(): + print(f" {source}: {count}") + + # Print top 3 results + if 
processed_results: + print("\nTop 3 results:") + for i, result in enumerate(processed_results[:3]): + print(f" {i+1}. {result['title']}") + print(f" URL: {result['url']}") + print(f" Snippet: {result['snippet'][:100]}...") + print() + + # Return test results + return { + "query": query, + "structured_query": structured_query, + "search_engines": search_engines, + "raw_results": search_results, + "processed_results": processed_results, + "timing": { + "query_processing": query_time, + "search_execution": search_time, + "total": query_time + search_time + }, + "summary": { + "total_results": total_results, + "results_by_source": results_by_source + } + } + + +def save_test_results(results: Dict[str, Any], file_path: str) -> None: + """ + Save test results to a file. + + Args: + results: Test results to save + file_path: Path to save results to + """ + try: + with open(file_path, 'w') as f: + json.dump(results, f, indent=2) + print(f"Test results saved to {file_path}") + except Exception as e: + print(f"Error saving test results: {e}") + + +def mock_test(): + """Run a mock test without actual API calls.""" + print("Running mock test without API calls...") + + # Create a mock structured query + structured_query = { + "original_query": "What are the latest advancements in quantum computing?", + "enhanced_query": "Explore the most recent breakthroughs and developments in quantum computing technology, including hardware innovations, quantum algorithms, and potential applications.", + "classification": { + "type": "exploratory", + "intent": "research", + "entities": ["quantum computing", "advancements", "technology"] + }, + "search_queries": { + "google": "latest advancements in quantum computing 2025 breakthroughs", + "scholar": "recent quantum computing developments research papers", + "arxiv": "quantum computing hardware algorithms applications" + } + } + + # Create mock search results + mock_results = { + "google": [ + { + "title": "Quantum Computing Breakthrough: New 
Qubit Design Achieves 99.9% Fidelity", + "url": "https://example.com/quantum-breakthrough", + "snippet": "Researchers at MIT have developed a new qubit design that achieves 99.9% fidelity, a major step toward practical quantum computing.", + "position": 1 + }, + { + "title": "IBM Unveils 1000-Qubit Quantum Computer", + "url": "https://example.com/ibm-quantum", + "snippet": "IBM has announced its latest quantum computer featuring 1000 qubits, significantly expanding computational capabilities.", + "position": 2 + } + ], + "arxiv": [ + { + "title": "Quantum Error Correction Using Surface Codes", + "url": "https://arxiv.org/abs/2301.12345", + "snippet": "This paper presents a new approach to quantum error correction using surface codes that improves error tolerance by an order of magnitude.", + "authors": ["Smith, J.", "Johnson, A."], + "published_date": "2025-01-15", + "position": 1 + } + ] + } + + # Process the results + collector = ResultCollector() + processed_results = collector.process_results(mock_results, dedup=True) + + # Print summary + total_results = len(processed_results) + print(f"Found {total_results} mock results after deduplication") + + # Print results by source + results_by_source = {} + for result in processed_results: + source = result.get("source", "unknown") + if source not in results_by_source: + results_by_source[source] = 0 + results_by_source[source] += 1 + + print("Results by source:") + for source, count in results_by_source.items(): + print(f" {source}: {count}") + + # Print top 3 results + if processed_results: + print("\nTop 3 results:") + for i, result in enumerate(processed_results[:3]): + print(f" {i+1}. 
{result['title']}") + print(f" URL: {result['url']}") + print(f" Snippet: {result['snippet'][:100]}...") + print() + + # Return mock test results + return { + "query": "What are the latest advancements in quantum computing?", + "structured_query": structured_query, + "search_engines": ["google", "arxiv"], + "raw_results": mock_results, + "processed_results": processed_results, + "timing": { + "query_processing": 0.5, + "search_execution": 1.2, + "total": 1.7 + }, + "summary": { + "total_results": total_results, + "results_by_source": results_by_source + } + } + + +def main(): + """Main function.""" + # Test queries + test_queries = [ + "What are the latest advancements in quantum computing?", + "Compare blockchain and traditional databases for financial applications", + "Explain the implications of blockchain technology in finance" + ] + + # Run tests + all_results = {} + for query in test_queries: + print("\n" + "="*80) + print(f"Testing query: {query}") + print("="*80) + + # Test with all available search engines + results = test_search_execution(query) + + # Save results for this query + all_results[query] = results + + print("\n") + + # Save all test results + save_test_results(all_results, "search_execution_test_results.json") + + +if __name__ == "__main__": + main() diff --git a/test_similarity.py b/test_similarity.py new file mode 100755 index 0000000..c2a8a11 --- /dev/null +++ b/test_similarity.py @@ -0,0 +1,99 @@ +#!/usr/bin/env python3 +""" +Test script for the JinaSimilarity module. +Computes similarity between text from two input files. +""" + +import argparse +import sys +from pathlib import Path +from jina_similarity import JinaSimilarity, TokenLimitError + +def read_file(file_path: str) -> str: + """Read content from a file. 
def main():
    """Parse CLI arguments, read the two input files and print their similarity."""
    parser = argparse.ArgumentParser(
        description='Compute similarity between text from two files using Jina AI.'
    )
    parser.add_argument(
        'chunk_file',
        type=str,
        help='Path to the file containing the text chunk'
    )
    parser.add_argument(
        'query_file',
        type=str,
        help='Path to the file containing the query'
    )
    parser.add_argument(
        '--verbose',
        '-v',
        action='store_true',
        help='Print token counts and embeddings'
    )

    args = parser.parse_args()

    # Check if files exist before doing any work
    chunk_path = Path(args.chunk_file)
    query_path = Path(args.query_file)

    if not chunk_path.is_file():
        print(f"Error: Chunk file not found: {args.chunk_file}", file=sys.stderr)
        sys.exit(1)
    if not query_path.is_file():
        print(f"Error: Query file not found: {args.query_file}", file=sys.stderr)
        sys.exit(1)

    try:
        # Read input files
        chunk_text = read_file(args.chunk_file)
        query_text = read_file(args.query_file)

        # Initialize similarity module
        js = JinaSimilarity()

        # Get token counts if verbose (plain strings: no placeholders)
        if args.verbose:
            chunk_tokens = js.count_tokens(chunk_text)
            query_tokens = js.count_tokens(query_text)
            print("\nToken counts:")
            print(f"Chunk: {chunk_tokens} tokens")
            print(f"Query: {query_tokens} tokens\n")

        # Compute similarity
        similarity, chunk_embedding, query_embedding = js.compute_similarity(
            chunk_text,
            query_text
        )

        # Print results
        print(f"Similarity score: {similarity:.4f}")

        if args.verbose:
            print("\nEmbeddings:")
            print(f"Chunk embedding (first 5): {chunk_embedding[:5]}...")
            print(f"Query embedding (first 5): {query_embedding[:5]}...")

    except TokenLimitError as e:
        # Token-limit violations get a dedicated, user-facing message
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)
    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)


if __name__ == '__main__':
    main()