Compare commits
No commits in common. "bf49474ca665e140cb326be4f649409e66718b1a" and "2b52d5268dfee29900c53c0c2e6e99f68e7bdeb7" have entirely different histories.
bf49474ca6
...
2b52d5268d
|
@ -1,53 +0,0 @@
|
|||
# Python
|
||||
__pycache__/
|
||||
*.py[cod]
|
||||
*$py.class
|
||||
*.so
|
||||
.Python
|
||||
build/
|
||||
develop-eggs/
|
||||
dist/
|
||||
downloads/
|
||||
eggs/
|
||||
.eggs/
|
||||
lib/
|
||||
lib64/
|
||||
parts/
|
||||
sdist/
|
||||
var/
|
||||
wheels/
|
||||
*.egg-info/
|
||||
.installed.cfg
|
||||
*.egg
|
||||
|
||||
# Virtual Environment
|
||||
venv/
|
||||
env/
|
||||
ENV/
|
||||
.venv/
|
||||
|
||||
# IDE
|
||||
.idea/
|
||||
.vscode/
|
||||
*.swp
|
||||
*.swo
|
||||
.DS_Store
|
||||
|
||||
# Project specific
|
||||
*.json
|
||||
!config/config.yaml.example
|
||||
.env
|
||||
.env.*
|
||||
!.env.example
|
||||
report_*.md
|
||||
|
||||
# Logs
|
||||
logs/
|
||||
*.log
|
||||
|
||||
# Test results
|
||||
*_test_results.json
|
||||
|
||||
# Database files
|
||||
*.db
|
||||
report/database/*.db
|
|
@ -1,31 +0,0 @@
|
|||
-----BEGIN CERTIFICATE-----
|
||||
MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
|
||||
TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
|
||||
cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
|
||||
WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
|
||||
ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
|
||||
MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
|
||||
h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
|
||||
0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
|
||||
A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
|
||||
T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
|
||||
B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
|
||||
B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
|
||||
KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
|
||||
OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
|
||||
jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
|
||||
qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
|
||||
rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
|
||||
HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
|
||||
hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
|
||||
ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
|
||||
3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
|
||||
NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
|
||||
ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
|
||||
TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
|
||||
jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
|
||||
oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
|
||||
4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
|
||||
mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
|
||||
emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
|
||||
-----END CERTIFICATE-----
|
|
@ -1,330 +0,0 @@
|
|||
# Code Structure
|
||||
|
||||
## Current Project Organization
|
||||
|
||||
```
|
||||
project/
|
||||
│
|
||||
├── examples/ # Sample data and query examples
|
||||
├── report/ # Report generation module
|
||||
│ ├── __init__.py
|
||||
│ ├── report_generator.py # Module for generating reports
|
||||
│ ├── report_synthesis.py # Module for synthesizing reports
|
||||
│ ├── progressive_report_synthesis.py # Module for progressive report generation
|
||||
│ ├── document_processor.py # Module for processing documents
|
||||
│ ├── document_scraper.py # Module for scraping documents
|
||||
│ ├── report_detail_levels.py # Module for managing report detail levels
|
||||
│ ├── report_templates.py # Module for managing report templates
|
||||
│ └── database/ # Database for storing reports
|
||||
│ ├── __init__.py
|
||||
│ └── db_manager.py # Module for managing the database
|
||||
├── tests/ # Test suite
|
||||
│ ├── __init__.py
|
||||
│ ├── execution/ # Search execution tests
|
||||
│ │ ├── __init__.py
|
||||
│ │ ├── test_search.py
|
||||
│ │ ├── test_search_execution.py
|
||||
│ │ └── test_all_handlers.py
|
||||
│ ├── integration/ # Integration tests
|
||||
│ │ ├── __init__.py
|
||||
│ │ ├── test_ev_query.py
|
||||
│ │ └── test_query_to_report.py
|
||||
│ ├── query/ # Query processing tests
|
||||
│ │ ├── __init__.py
|
||||
│ │ ├── test_query_processor.py
|
||||
│ │ ├── test_query_processor_comprehensive.py
|
||||
│ │ └── test_llm_interface.py
|
||||
│ ├── ranking/ # Ranking algorithm tests
|
||||
│ │ ├── __init__.py
|
||||
│ │ ├── test_reranker.py
|
||||
│ │ ├── test_similarity.py
|
||||
│ │ └── test_simple_reranker.py
|
||||
│ ├── report/ # Report generation tests
|
||||
│ │ ├── __init__.py
|
||||
│ │ ├── test_custom_model.py
|
||||
│ │ ├── test_detail_levels.py
|
||||
│ │ ├── test_brief_report.py
|
||||
│ │ └── test_report_templates.py
|
||||
│ ├── ui/ # UI component tests
|
||||
│ │ ├── __init__.py
|
||||
│ │ └── test_ui_search.py
|
||||
│ ├── test_document_processor.py
|
||||
│ ├── test_document_scraper.py
|
||||
│ └── test_report_synthesis.py
|
||||
├── utils/ # Utility scripts and shared functions
|
||||
│ ├── __init__.py
|
||||
│ ├── jina_similarity.py # Module for computing text similarity
|
||||
│ └── markdown_segmenter.py # Module for segmenting markdown documents
|
||||
├── config/ # Configuration management
|
||||
│ ├── __init__.py
|
||||
│ ├── config.py # Configuration management class
|
||||
│ └── config.yaml # YAML configuration file with settings for different components
|
||||
├── query/ # Query processing module
|
||||
│ ├── __init__.py
|
||||
│ ├── query_processor.py # Module for processing user queries
|
||||
│ └── llm_interface.py # Module for interacting with LLM providers
|
||||
├── execution/ # Search execution module
|
||||
│ ├── __init__.py
|
||||
│ ├── search_executor.py # Module for executing search queries
|
||||
│ ├── result_collector.py # Module for collecting search results
|
||||
│ └── api_handlers/ # Handlers for different search APIs
|
||||
│ ├── __init__.py
|
||||
│ ├── base_handler.py # Base class for search handlers
|
||||
│ ├── serper_handler.py # Handler for Serper API (Google search)
|
||||
│ ├── scholar_handler.py # Handler for Google Scholar via Serper
|
||||
│ ├── google_handler.py # Handler for Google search
|
||||
│ └── arxiv_handler.py # Handler for arXiv API
|
||||
├── ranking/ # Ranking module
|
||||
│ ├── __init__.py
|
||||
│ └── jina_reranker.py # Module for reranking documents using Jina AI
|
||||
├── ui/ # UI module
|
||||
│ ├── __init__.py
|
||||
│ └── gradio_interface.py # Gradio-based web interface
|
||||
├── scripts/ # Scripts
|
||||
│ └── query_to_report.py # Script for generating reports from queries
|
||||
├── run_ui.py # Script to run the UI
|
||||
└── requirements.txt # Project dependencies
|
||||
```
|
||||
|
||||
## Module Details
|
||||
|
||||
### Config Module
|
||||
|
||||
The `config` module manages configuration settings for the entire system, including API keys, model selections, and other parameters.
|
||||
|
||||
### Files
|
||||
|
||||
- `__init__.py`: Package initialization file
|
||||
- `config.py`: Configuration management class
|
||||
- `config.yaml`: YAML configuration file with settings for different components
|
||||
|
||||
### Classes
|
||||
|
||||
- `Config`: Singleton class for loading and accessing configuration settings
|
||||
- `load_config(config_path)`: Loads configuration from a YAML file
|
||||
- `get(key, default=None)`: Gets a configuration value by key
|
||||
|
||||
### Query Module
|
||||
|
||||
The `query` module handles the processing and enhancement of user queries, including classification and optimization for search.
|
||||
|
||||
### Files
|
||||
|
||||
- `__init__.py`: Package initialization file
|
||||
- `query_processor.py`: Main module for processing user queries
|
||||
- `query_classifier.py`: Module for classifying query types
|
||||
- `llm_interface.py`: Interface for interacting with LLM providers
|
||||
|
||||
### Classes
|
||||
|
||||
- `QueryProcessor`: Main class for processing user queries
|
||||
- `process_query(query)`: Processes a user query and returns enhanced results
|
||||
- `classify_query(query)`: Classifies a query by type and intent
|
||||
- `generate_search_queries(query, classification)`: Generates optimized search queries
|
||||
|
||||
- `QueryClassifier`: Class for classifying queries
|
||||
- `classify(query)`: Classifies a query by type, intent, and entities
|
||||
|
||||
- `LLMInterface`: Interface for interacting with LLM providers
|
||||
- `get_completion(prompt, model=None)`: Gets a completion from an LLM
|
||||
- `enhance_query(query)`: Enhances a query with additional context
|
||||
- `classify_query(query)`: Uses an LLM to classify a query
|
||||
|
||||
### Execution Module
|
||||
|
||||
The `execution` module handles the execution of search queries across multiple search engines and the collection of results.
|
||||
|
||||
### Files
|
||||
|
||||
- `__init__.py`: Package initialization file
|
||||
- `search_executor.py`: Module for executing search queries
|
||||
- `result_collector.py`: Module for collecting and processing search results
|
||||
- `api_handlers/`: Directory containing handlers for different search APIs
|
||||
- `__init__.py`: Package initialization file
|
||||
- `base_handler.py`: Base class for search handlers
|
||||
- `serper_handler.py`: Handler for Serper API (Google search)
|
||||
- `scholar_handler.py`: Handler for Google Scholar via Serper
|
||||
- `arxiv_handler.py`: Handler for arXiv API
|
||||
|
||||
### Classes
|
||||
|
||||
- `SearchExecutor`: Class for executing search queries
|
||||
- `execute_search(query_data)`: Executes a search across multiple engines
|
||||
- `_execute_search_async(query, engines)`: Executes a search asynchronously
|
||||
- `_execute_search_sync(query, engines)`: Executes a search synchronously
|
||||
|
||||
- `ResultCollector`: Class for collecting and processing search results
|
||||
- `process_results(search_results)`: Processes search results from multiple engines
|
||||
- `deduplicate_results(results)`: Deduplicates results based on URL
|
||||
- `save_results(results, file_path)`: Saves results to a file
|
||||
|
||||
- `BaseSearchHandler`: Base class for search handlers
|
||||
- `search(query, num_results)`: Abstract method for searching
|
||||
- `_process_response(response)`: Processes the API response
|
||||
|
||||
- `SerperSearchHandler`: Handler for Serper API
|
||||
- `search(query, num_results)`: Searches using Serper API
|
||||
- `_process_response(response)`: Processes the Serper API response
|
||||
|
||||
- `ScholarSearchHandler`: Handler for Google Scholar via Serper
|
||||
- `search(query, num_results)`: Searches Google Scholar
|
||||
- `_process_response(response)`: Processes the Scholar API response
|
||||
|
||||
- `ArxivSearchHandler`: Handler for arXiv API
|
||||
- `search(query, num_results)`: Searches arXiv
|
||||
- `_process_response(response)`: Processes the arXiv API response
|
||||
|
||||
### Ranking Module
|
||||
|
||||
The `ranking` module provides functionality for reranking and prioritizing documents based on their relevance to the user's query.
|
||||
|
||||
### Files
|
||||
|
||||
- `__init__.py`: Package initialization file
|
||||
- `jina_reranker.py`: Module for reranking documents using Jina AI
|
||||
- `filter_manager.py`: Module for filtering documents
|
||||
|
||||
### Classes
|
||||
|
||||
- `JinaReranker`: Class for reranking documents
|
||||
- `rerank(documents, query)`: Reranks documents based on relevance to query
|
||||
- `_prepare_inputs(documents, query)`: Prepares inputs for the reranker
|
||||
|
||||
- `FilterManager`: Class for filtering documents
|
||||
- `filter_by_date(documents, start_date, end_date)`: Filters by date
|
||||
- `filter_by_source(documents, sources)`: Filters by source
|
||||
|
||||
### Report Templates Module
|
||||
|
||||
The `report_templates` module provides a template system for generating reports with different detail levels and query types.
|
||||
|
||||
### Files
|
||||
|
||||
- `__init__.py`: Package initialization file
|
||||
- `report_templates.py`: Module for managing report templates
|
||||
|
||||
### Classes
|
||||
|
||||
- `QueryType` (Enum): Defines the types of queries supported by the system
|
||||
- `FACTUAL`: For factual queries seeking specific information
|
||||
- `EXPLORATORY`: For exploratory queries investigating a topic
|
||||
- `COMPARATIVE`: For comparative queries comparing multiple items
|
||||
|
||||
- `DetailLevel` (Enum): Defines the levels of detail for generated reports
|
||||
- `BRIEF`: Short summary with key findings
|
||||
- `STANDARD`: Standard report with introduction, key findings, and analysis
|
||||
- `DETAILED`: Detailed report with methodology and more in-depth analysis
|
||||
- `COMPREHENSIVE`: Comprehensive report with executive summary, literature review, and appendices
|
||||
|
||||
- `ReportTemplate`: Class representing a report template
|
||||
- `template` (str): The template string with placeholders
|
||||
- `detail_level` (DetailLevel): The detail level of the template
|
||||
- `query_type` (QueryType): The query type the template is designed for
|
||||
- `model` (Optional[str]): The LLM model recommended for this template
|
||||
- `required_sections` (Optional[List[str]]): Required sections in the template
|
||||
- `validate()`: Validates that the template contains all required sections
|
||||
|
||||
- `ReportTemplateManager`: Class for managing report templates
|
||||
- `add_template(template)`: Adds a template to the manager
|
||||
- `get_template(query_type, detail_level)`: Gets a template for a specific query type and detail level
|
||||
- `get_available_templates()`: Gets a list of available templates
|
||||
- `initialize_default_templates()`: Initializes the default templates for all combinations of query types and detail levels
|
||||
|
||||
### Progressive Report Synthesis Module
|
||||
|
||||
The `progressive_report_synthesis` module provides functionality to synthesize reports from document chunks using a progressive approach, where chunks are processed iteratively and the report is refined over time.
|
||||
|
||||
### Files
|
||||
|
||||
- `__init__.py`: Package initialization file
|
||||
- `progressive_report_synthesis.py`: Module for progressive report generation
|
||||
|
||||
### Classes
|
||||
|
||||
- `ReportState`: Class to track the state of a progressive report
|
||||
- `current_report` (str): The current version of the report
|
||||
- `processed_chunks` (Set[str]): Set of document IDs that have been processed
|
||||
- `version` (int): Current version number of the report
|
||||
- `last_update_time` (float): Timestamp of the last update
|
||||
- `improvement_scores` (List[float]): List of improvement scores for each iteration
|
||||
- `is_complete` (bool): Whether the report generation is complete
|
||||
- `termination_reason` (Optional[str]): Reason for termination if complete
|
||||
|
||||
- `ProgressiveReportSynthesizer`: Class for progressive report synthesis
|
||||
- Extends `ReportSynthesizer` to implement a progressive approach
|
||||
- `set_progress_callback(callback)`: Sets a callback function to report progress
|
||||
- `prioritize_chunks(chunks, query)`: Prioritizes chunks based on relevance
|
||||
- `extract_information_from_chunk(chunk, query, detail_level)`: Extracts key information from a chunk
|
||||
- `refine_report(current_report, new_information, query, query_type, detail_level)`: Refines the report with new information
|
||||
- `initialize_report(initial_chunks, query, query_type, detail_level)`: Initializes the report with the first batch of chunks
|
||||
- `should_terminate(improvement_score)`: Determines if the process should terminate
|
||||
- `synthesize_report_progressively(chunks, query, query_type, detail_level)`: Main method for progressive report generation
|
||||
- `synthesize_report(chunks, query, query_type, detail_level)`: Override of parent method to use progressive approach for comprehensive detail level
|
||||
|
||||
- `get_progressive_report_synthesizer(model_name)`: Factory function to get a singleton instance
|
||||
|
||||
## Recent Updates
|
||||
|
||||
### 2025-03-12: Progressive Report Generation Implementation
|
||||
|
||||
1. **Progressive Report Synthesis Module**:
|
||||
- Created a new module `progressive_report_synthesis.py` for progressive report generation
|
||||
- Implemented `ReportState` class to track the state of a progressive report
|
||||
- Created `ProgressiveReportSynthesizer` class extending from `ReportSynthesizer`
|
||||
- Implemented chunk prioritization algorithm based on relevance scores
|
||||
- Developed iterative refinement process with specialized prompts
|
||||
- Added state management to track report versions and processed chunks
|
||||
- Implemented termination conditions (all chunks processed, diminishing returns, max iterations)
|
||||
- Added support for different models with adaptive batch sizing
|
||||
- Implemented progress tracking and callback mechanism
|
||||
|
||||
2. **Report Generator Integration**:
|
||||
- Modified `report_generator.py` to use the progressive report synthesizer for comprehensive detail level
|
||||
- Created a hybrid system that uses standard map-reduce for brief/standard/detailed levels
|
||||
- Added proper model selection and configuration for both synthesizers
|
||||
|
||||
3. **Testing**:
|
||||
- Created `test_progressive_report.py` to test progressive report generation
|
||||
- Implemented comparison functionality between progressive and standard approaches
|
||||
- Added test cases for different query types and document collections
|
||||
|
||||
### 2025-03-11: Report Templates Implementation
|
||||
|
||||
1. **Report Templates Module**:
|
||||
- Created a new module `report_templates.py` for managing report templates
|
||||
- Implemented enums for query types (FACTUAL, EXPLORATORY, COMPARATIVE) and detail levels (BRIEF, STANDARD, DETAILED, COMPREHENSIVE)
|
||||
- Created a template system with placeholders for different report sections
|
||||
- Implemented 12 different templates (3 query types × 4 detail levels)
|
||||
- Added validation to ensure templates contain all required sections
|
||||
|
||||
2. **Report Synthesis Integration**:
|
||||
- Updated the report synthesis module to use the new template system
|
||||
- Added support for different templates based on query type and detail level
|
||||
- Implemented fallback to standard templates when specific templates are not found
|
||||
- Added better logging for template retrieval process
|
||||
|
||||
3. **Testing**:
|
||||
- Created test_report_templates.py to test template retrieval and validation
|
||||
- Implemented test_brief_report.py to test the brief report generation
|
||||
- Successfully tested all combinations of detail levels and query types
|
||||
|
||||
### 2025-02-28: Async Implementation and Reference Formatting
|
||||
|
||||
1. **LLM Interface Updates**:
|
||||
- Converted key methods to async:
|
||||
- `generate_completion`
|
||||
- `classify_query`
|
||||
- `enhance_query`
|
||||
- `generate_search_queries`
|
||||
- Added special handling for Gemini models
|
||||
- Improved reference formatting instructions
|
||||
|
||||
2. **Query Processor Updates**:
|
||||
- Updated `process_query` to be async
|
||||
- Made `generate_search_queries` async
|
||||
- Fixed async/await patterns throughout
|
||||
|
||||
3. **Gradio Interface Updates**:
|
||||
- Modified `generate_report` to handle async operations
|
||||
- Updated report button click handler
|
||||
- Improved error handling
|
|
@ -1,196 +0,0 @@
|
|||
# Current Focus: Project Directory Reorganization, Testing, and Embedding Usage
|
||||
|
||||
## Active Work
|
||||
|
||||
### Project Directory Reorganization
|
||||
- ✅ Reorganized project directory structure for better maintainability
|
||||
- ✅ Moved utility scripts to the `utils/` directory
|
||||
- ✅ Organized test files into subdirectories under `tests/`
|
||||
- ✅ Moved sample data to the `examples/data/` directory
|
||||
- ✅ Created proper `__init__.py` files for all packages
|
||||
- ✅ Verified pipeline functionality after reorganization
|
||||
|
||||
### Embedding Usage Analysis
|
||||
- ✅ Confirmed that the pipeline uses Jina AI's Embeddings API through the `JinaSimilarity` class
|
||||
- ✅ Verified that the `JinaReranker` class uses embeddings for document reranking
|
||||
- ✅ Analyzed how embeddings are integrated into the search and ranking process
|
||||
|
||||
### Pipeline Testing
|
||||
- ✅ Tested the pipeline after reorganization to ensure functionality
|
||||
- ✅ Verified that the UI works correctly with the new directory structure
|
||||
- ✅ Confirmed that all imports are working properly with the new structure
|
||||
|
||||
## Repository Cleanup
|
||||
- Reorganized test files into dedicated directories under `tests/`
|
||||
- Created `examples/` directory for sample data
|
||||
- Moved utility scripts to `utils/`
|
||||
- Committed changes with message 'Clean up repository: Remove unused test files and add new test directories'
|
||||
|
||||
## Recent Changes
|
||||
|
||||
### Directory Structure Reorganization
|
||||
- Created a dedicated `utils/` directory for utility scripts
|
||||
- Moved `jina_similarity.py` to `utils/`
|
||||
- Added `__init__.py` to make it a proper Python package
|
||||
- Organized test files into subdirectories under `tests/`
|
||||
- Created subdirectories for each module (query, execution, ranking, report, ui, integration)
|
||||
- Added `__init__.py` files to all test directories
|
||||
- Created an `examples/` directory with subdirectories for data and scripts
|
||||
- Moved sample data to `examples/data/`
|
||||
- Added `__init__.py` files to make them proper Python packages
|
||||
- Added a dedicated `scripts/` directory for utility scripts
|
||||
- Moved `query_to_report.py` to `scripts/`
|
||||
|
||||
### Pipeline Verification
|
||||
- Verified that the pipeline functions correctly after reorganization
|
||||
- Confirmed that the `JinaSimilarity` class in `utils/jina_similarity.py` is properly used for embeddings
|
||||
- Tested the reranking functionality with the `JinaReranker` class
|
||||
- Checked that the report generation process works with the new structure
|
||||
|
||||
## Next Steps
|
||||
|
||||
1. Run comprehensive tests to ensure all functionality works with the new directory structure
|
||||
2. Update any remaining documentation to reflect the new directory structure
|
||||
3. Consider moving the remaining test files in the root of the `tests/` directory to appropriate subdirectories
|
||||
4. Review import statements throughout the codebase to ensure they follow the new structure
|
||||
5. Add more comprehensive documentation about the directory structure
|
||||
6. Consider creating a development guide for new contributors
|
||||
7. Implement automated tests to verify the directory structure remains consistent
|
||||
|
||||
### Future Enhancements
|
||||
|
||||
1. **Query Processing Improvements**:
|
||||
- **Multiple Query Variation Generation**:
|
||||
- Generate several similar queries with different keywords and expanded intent for better search coverage
|
||||
- Enhance the `QueryProcessor` class to generate multiple query variations (3-4 per query)
|
||||
- Update the `execute_search` method to handle multiple queries and merge results
|
||||
- Implement deduplication for results from different query variations
|
||||
- Estimated difficulty: Moderate (3-4 days of work)
|
||||
|
||||
- **Threshold-Based Reranking with Larger Document Sets**:
|
||||
- Process more initial documents and use reranking to select the top N most relevant ones
|
||||
- Modify detail level configurations to include parameters for initial results count and final results after reranking
|
||||
- Update the `SearchExecutor` to fetch more results initially
|
||||
- Enhance the reranking process to filter based on a score threshold or top N
|
||||
- Estimated difficulty: Easy to Moderate (2-3 days of work)
|
||||
|
||||
2. **UI Improvements**:
|
||||
- **Add Chunk Processing Progress Indicators**:
|
||||
- Modify the `report_synthesis.py` file to add logging during the map phase of the map-reduce process
|
||||
- Add a counter variable to track which chunk is being processed
|
||||
- Use the existing logging infrastructure to output progress messages in the UI
|
||||
- Estimated difficulty: Easy (15-30 minutes of work)
|
||||
|
||||
3. **Visualization Components**:
|
||||
- Identify common data types in reports that would benefit from visualization
|
||||
- Design and implement visualization components for these data types
|
||||
- Integrate visualization components into the report generation process
|
||||
|
||||
### Current Tasks
|
||||
|
||||
1. **Report Generation Module Implementation (Phase 4)**:
|
||||
- Implementing support for alternative models with larger context windows
|
||||
- Implementing progressive report generation for very large research tasks
|
||||
- Creating visualization components for data mentioned in reports
|
||||
- Adding interactive elements to the generated reports
|
||||
- Implementing report versioning and comparison
|
||||
|
||||
2. **Integration with UI**:
|
||||
- Adding report generation options to the UI
|
||||
- Implementing progress indicators for document scraping and report generation
|
||||
- Creating visualization components for generated reports
|
||||
- Adding options to customize report generation parameters
|
||||
|
||||
3. **Performance Optimization**:
|
||||
- Optimizing token usage for more efficient LLM utilization
|
||||
- Implementing caching strategies for document scraping and LLM calls
|
||||
- Parallelizing document scraping and processing
|
||||
- Exploring parallel processing for the map phase of report synthesis
|
||||
|
||||
### Recent Progress
|
||||
|
||||
1. **Report Templates Implementation**:
|
||||
- ✅ Created a dedicated `report_templates.py` module with a comprehensive template system
|
||||
- ✅ Implemented `QueryType` enum for categorizing queries (FACTUAL, EXPLORATORY, COMPARATIVE)
|
||||
- ✅ Created `DetailLevel` enum for different report detail levels (BRIEF, STANDARD, DETAILED, COMPREHENSIVE)
|
||||
- ✅ Designed a `ReportTemplate` class with validation for required sections
|
||||
- ✅ Implemented a `ReportTemplateManager` to manage and retrieve templates
|
||||
- ✅ Created 12 different templates (3 query types × 4 detail levels)
|
||||
- ✅ Added testing with `test_report_templates.py` and `test_brief_report.py`
|
||||
- ✅ Updated memory bank documentation with template system details
|
||||
|
||||
2. **Testing and Validation of Report Templates**:
|
||||
- ✅ Fixed template retrieval issues in the report synthesis module
|
||||
- ✅ Successfully tested all detail levels (brief, standard, detailed, comprehensive) with factual queries
|
||||
- ✅ Successfully tested all detail levels with exploratory queries
|
||||
- ✅ Successfully tested all detail levels with comparative queries
|
||||
- ✅ Improved error handling in template retrieval with fallback to standard templates
|
||||
- ✅ Added better logging for template retrieval process
|
||||
|
||||
### Next Steps
|
||||
|
||||
1. **Further Refinement of Report Templates**:
|
||||
- Conduct additional testing with real-world queries and document sets
|
||||
- Compare the analytical depth and quality of reports generated with different detail levels
|
||||
- Gather user feedback on the improved reports at different detail levels
|
||||
- Further refine the detail level configurations based on testing and feedback
|
||||
- Integrate the template system with the UI to allow users to select detail levels
|
||||
- Add more specialized templates for specific research domains
|
||||
- Implement template customization options for users
|
||||
|
||||
2. **Progressive Report Generation Implementation**:
|
||||
- ✅ Implemented progressive report generation for comprehensive detail level reports
|
||||
- ✅ Created a hybrid system that uses standard map-reduce for brief/standard/detailed levels and progressive generation for comprehensive level
|
||||
- ✅ Added support for different models with adaptive batch sizing
|
||||
- ✅ Implemented progress tracking and callback mechanism
|
||||
- ✅ Created comprehensive test suite for progressive report generation
|
||||
- ⏳ Add UI controls to monitor and control the progressive generation process
|
||||
|
||||
#### Implementation Details for Progressive Report Generation
|
||||
|
||||
**Phase 1: Core Implementation (Completed)**
|
||||
- ✅ Created a new `ProgressiveReportSynthesizer` class extending from `ReportSynthesizer`
|
||||
- ✅ Implemented chunk prioritization algorithm based on relevance scores
|
||||
- ✅ Developed the iterative refinement process with specialized prompts
|
||||
- ✅ Added state management to track report versions and processed chunks
|
||||
- ✅ Implemented termination conditions (all chunks processed, diminishing returns, user intervention)
|
||||
|
||||
**Phase 2: Model Flexibility (Completed)**
|
||||
- ✅ Modified the implementation to support different models beyond Gemini
|
||||
- ✅ Created model-specific configurations for progressive generation
|
||||
- ✅ Implemented adaptive batch sizing based on model context window
|
||||
- ✅ Added fallback mechanisms for when context windows are exceeded
|
||||
|
||||
**Phase 3: UI Integration (In Progress)**
|
||||
- ✅ Added progress tracking callback mechanism
|
||||
- ⏳ Implement controls to pause, resume, or terminate the process
|
||||
- ⏳ Create a preview mode to see the current report state
|
||||
- ⏳ Add options to compare different versions of the report
|
||||
|
||||
**Phase 4: Testing and Optimization (Completed)**
|
||||
- ✅ Created test script for progressive report generation
|
||||
- ✅ Added comparison functionality between progressive and standard approaches
|
||||
- ✅ Implemented optimization for token usage and processing efficiency
|
||||
- ✅ Fine-tuned prompts and parameters based on testing results
|
||||
|
||||
3. **Visualization Components**:
|
||||
- Identify common data types in reports that would benefit from visualization
|
||||
- Design and implement visualization components for these data types
|
||||
- Integrate visualization components into the report generation process
|
||||
- Consider how visualizations can be incorporated into progressive reports
|
||||
|
||||
### Technical Notes
|
||||
|
||||
- Using Groq's Llama 3.3 70B Versatile model for detailed and comprehensive report synthesis
|
||||
- Using Groq's Llama 3.1 8B Instant model for brief and standard report synthesis
|
||||
- Implemented map-reduce approach for processing document chunks with detail-level-specific extraction
|
||||
- Created enhanced report templates focused on analytical depth rather than just additional sections
|
||||
- Added citation generation and reference management
|
||||
- Using asynchronous processing for improved performance in report generation
|
||||
- Managing API keys securely through environment variables and configuration files
|
||||
- Implemented progressive report generation for comprehensive detail level:
|
||||
- Uses iterative refinement process to gradually improve report quality
|
||||
- Processes document chunks in batches based on priority
|
||||
- Tracks improvement scores to detect diminishing returns
|
||||
- Adapts batch size based on model context window
|
||||
- Provides progress tracking through callback mechanism
|
|
@ -1,441 +0,0 @@
|
|||
# Decision Log
|
||||
|
||||
## 2025-02-27: Initial Project Setup
|
||||
|
||||
### Decision: Use Jina AI APIs for Semantic Search
|
||||
- **Context**: Need for semantic search capabilities that understand context beyond keywords
|
||||
- **Options Considered**:
|
||||
1. Build custom embedding solution
|
||||
2. Use open-source models locally
|
||||
3. Use Jina AI's APIs
|
||||
- **Decision**: Use Jina AI's APIs for embedding generation and similarity computation
|
||||
- **Rationale**:
|
||||
- High-quality embeddings with state-of-the-art models
|
||||
- No need to manage model deployment and infrastructure
|
||||
- Simple API integration with reasonable pricing
|
||||
- Support for long texts through segmentation
|
||||
|
||||
### Decision: Separate Markdown Segmentation from Similarity Computation
|
||||
- **Context**: Need to handle potentially long markdown documents
|
||||
- **Options Considered**:
|
||||
1. Integrate segmentation directly into the similarity module
|
||||
2. Create a separate module for segmentation
|
||||
- **Decision**: Create a separate module (markdown_segmenter.py) for document segmentation
|
||||
- **Rationale**:
|
||||
- Better separation of concerns
|
||||
- More modular design allows for independent use of components
|
||||
- Easier to maintain and extend each component separately
|
||||
|
||||
### Decision: Use Environment Variables for API Keys
|
||||
- **Context**: Need to securely manage API credentials
|
||||
- **Options Considered**:
|
||||
1. Configuration files
|
||||
2. Environment variables
|
||||
3. Secret management service
|
||||
- **Decision**: Use environment variables (JINA_API_KEY)
|
||||
- **Rationale**:
|
||||
- Simple to implement
|
||||
- Standard practice for managing secrets
|
||||
- Works well across different environments
|
||||
- Prevents accidental commit of credentials to version control
|
||||
|
||||
### Decision: Use Cosine Similarity with Normalized Vectors
|
||||
- **Context**: Need a metric for comparing semantic similarity between text embeddings
|
||||
- **Options Considered**:
|
||||
1. Euclidean distance
|
||||
2. Cosine similarity
|
||||
3. Dot product
|
||||
- **Decision**: Use cosine similarity with normalized vectors
|
||||
- **Rationale**:
|
||||
- Standard approach for semantic similarity
|
||||
- Normalized vectors simplify computation (dot product equals cosine similarity)
|
||||
- Less sensitive to embedding magnitude, focusing on direction (meaning)
|
||||
|
||||
## 2025-02-27: Research System Architecture
|
||||
|
||||
### Decision: Implement a Multi-Stage Research Pipeline
|
||||
- **Context**: Need to define the overall architecture for the intelligent research system
|
||||
- **Options Considered**:
|
||||
1. Monolithic application with tightly coupled components
|
||||
2. Microservices architecture with independent services
|
||||
3. Pipeline architecture with distinct processing stages
|
||||
- **Decision**: Implement an 8-stage pipeline architecture
|
||||
- **Rationale**:
|
||||
- Clear separation of concerns with each stage having a specific responsibility
|
||||
- Easier to develop and test individual components
|
||||
- Flexibility to swap or enhance specific stages without affecting others
|
||||
- Natural flow of data through the system matches the research process
|
||||
|
||||
### Decision: Use Multiple Search Sources
|
||||
- **Context**: Need to gather comprehensive information from various sources
|
||||
- **Options Considered**:
|
||||
1. Use a single search API for simplicity
|
||||
2. Implement custom web scraping for all sources
|
||||
3. Use multiple specialized search APIs
|
||||
- **Decision**: Integrate multiple search sources (Google, Serper, Jina Search, Google Scholar, arXiv)
|
||||
- **Rationale**:
|
||||
- Different sources provide different types of information (academic, general, etc.)
|
||||
- Increases the breadth and diversity of search results
|
||||
- Specialized APIs like arXiv provide domain-specific information
|
||||
- Redundancy ensures more comprehensive coverage
|
||||
|
||||
### Decision: Use Jina AI for Semantic Processing
|
||||
- **Context**: Need for advanced semantic understanding in document processing
|
||||
- **Options Considered**:
|
||||
1. Use simple keyword matching
|
||||
2. Implement custom embedding models
|
||||
3. Use Jina AI's suite of APIs
|
||||
- **Decision**: Use Jina AI's APIs for embedding generation, similarity computation, and reranking
|
||||
- **Rationale**:
|
||||
- High-quality embeddings with state-of-the-art models
|
||||
- Comprehensive API suite covering multiple needs (embeddings, segmentation, reranking)
|
||||
- Simple integration with reasonable pricing
|
||||
- Consistent approach across different semantic processing tasks
|
||||
|
||||
## 2025-02-27: Search Execution Architecture
|
||||
|
||||
### Decision: Search Execution Architecture
|
||||
- **Context**: We needed to implement a search execution module that could execute search queries across multiple search engines and process the results in a standardized way.
|
||||
|
||||
- **Decision**:
|
||||
1. Create a modular search execution architecture:
|
||||
- Implement a base handler interface (`BaseSearchHandler`) for all search API handlers
|
||||
- Create specific handlers for each search engine (Google, Serper, Scholar, arXiv)
|
||||
- Develop a central `SearchExecutor` class to manage execution across multiple engines
|
||||
- Implement a `ResultCollector` class for processing and organizing results
|
||||
|
||||
2. Use parallel execution for search queries:
|
||||
- Implement thread-based parallelism using `concurrent.futures`
|
||||
- Add support for both synchronous and asynchronous execution
|
||||
- Include timeout management and error handling
|
||||
|
||||
3. Standardize search results:
|
||||
- Define a common result format across all search engines
|
||||
- Include metadata specific to each search engine in a standardized way
|
||||
- Implement deduplication and scoring for result ranking
|
||||
|
||||
- **Rationale**:
|
||||
- A modular architecture allows for easy addition of new search engines
|
||||
- Parallel execution significantly improves search performance
|
||||
- Standardized result format simplifies downstream processing
|
||||
- Separation of concerns between execution and result processing
|
||||
|
||||
- **Alternatives Considered**:
|
||||
1. Sequential execution of search queries:
|
||||
- Simpler implementation
|
||||
- Much slower performance
|
||||
- Would not scale well with additional search engines
|
||||
|
||||
2. Separate modules for each search engine:
|
||||
- Would lead to code duplication
|
||||
- More difficult to maintain
|
||||
- Less consistent result format
|
||||
|
||||
3. Using a third-party search aggregation service:
|
||||
- Would introduce additional dependencies
|
||||
- Less control over the search process
|
||||
- Potential cost implications
|
||||
|
||||
- **Impact**:
|
||||
- Efficient execution of search queries across multiple engines
|
||||
- Consistent result format for downstream processing
|
||||
- Flexible architecture that can be extended with new search engines
|
||||
- Clear separation of concerns between different components
|
||||
|
||||
## 2025-02-27: Search Execution Module Refinements
|
||||
|
||||
### Decision: Remove Google Search Handler
|
||||
- **Context**: Both Google and Serper handlers were implemented, but Serper is essentially a front-end for Google search
|
||||
- **Options Considered**:
|
||||
1. Keep both handlers for redundancy
|
||||
2. Remove the Google handler and only use Serper
|
||||
- **Decision**: Remove the Google search handler
|
||||
- **Rationale**:
|
||||
- Redundant functionality as Serper provides the same results
|
||||
- Simplifies the codebase and reduces maintenance
|
||||
- Reduces API costs by avoiding duplicate searches
|
||||
- Serper provides a more reliable and consistent API for Google search
|
||||
|
||||
### Decision: Modify LLM Query Enhancement Prompt
|
||||
- **Context**: The LLM was returning enhanced queries with explanations, which caused issues with search APIs
|
||||
- **Options Considered**:
|
||||
1. Post-process the LLM output to extract just the query
|
||||
2. Modify the prompt to request only the enhanced query
|
||||
- **Decision**: Modify the LLM prompt to request only the enhanced query without explanations
|
||||
- **Rationale**:
|
||||
- More reliable than post-processing, which could be error-prone
|
||||
- Cleaner implementation that addresses the root cause
|
||||
- Ensures consistent output format for downstream processing
|
||||
- Reduces the risk of exceeding API character limits
|
||||
|
||||
### Decision: Implement Query Truncation
|
||||
- **Context**: Enhanced queries could exceed the Serper API's 2048 character limit
|
||||
- **Options Considered**:
|
||||
1. Limit the LLM's output length
|
||||
2. Truncate queries before sending to the API
|
||||
3. Split long queries into multiple searches
|
||||
- **Decision**: Implement query truncation in the search executor
|
||||
- **Rationale**:
|
||||
- Simple and effective solution
|
||||
- Preserves as much of the enhanced query as possible
|
||||
- Ensures API requests don't fail due to length constraints
|
||||
- Can be easily adjusted if API limits change
|
||||
|
||||
## 2025-02-27: Testing Strategy for Query Processor
|
||||
|
||||
### Context
|
||||
After integrating Groq and OpenRouter as additional LLM providers, we needed to verify that the query processor module functions correctly with these new providers.
|
||||
|
||||
### Decision
|
||||
1. Create dedicated test scripts to validate the query processor functionality:
|
||||
- A basic test script for the core processing pipeline
|
||||
- A comprehensive test script for detailed component testing
|
||||
|
||||
2. Use monkey patching to ensure tests consistently use the Groq model:
|
||||
- Create a global LLM interface with the Groq model
|
||||
- Override the `get_llm_interface` function to always return this interface
|
||||
- This approach allows testing without modifying the core code
|
||||
|
||||
3. Test all key functionality of the query processor:
|
||||
- Query enhancement
|
||||
- Query classification
|
||||
- Search query generation
|
||||
- End-to-end processing pipeline
|
||||
|
||||
### Rationale
|
||||
- Dedicated test scripts provide a repeatable way to verify functionality
|
||||
- Monkey patching allows testing with specific models without changing the core code
|
||||
- Comprehensive testing ensures all components work correctly with the new providers
|
||||
- Saving test results to a JSON file provides a reference for future development
|
||||
|
||||
### Alternatives Considered
|
||||
1. Modifying the query processor to accept a model parameter:
|
||||
- Would require changing the core code
|
||||
- Could introduce bugs in the production code
|
||||
|
||||
2. Using environment variables to control model selection:
|
||||
- Less precise control over which model is used
|
||||
- Could interfere with other tests or production use
|
||||
|
||||
### Impact
|
||||
- Verified that the query processor works correctly with Groq models
|
||||
- Established a testing approach that can be used for other modules
|
||||
- Created reusable test scripts for future development
|
||||
|
||||
## 2025-02-27: Report Generation Module Implementation
|
||||
|
||||
### Decision: Use Jina Reader for Web Scraping and SQLite for Document Storage
|
||||
- **Context**: Need to implement document scraping and storage for the Report Generation module
|
||||
- **Options Considered**:
|
||||
1. In-memory document storage with custom web scraping
|
||||
2. SQLite database with Jina Reader for web scraping
|
||||
3. NoSQL database (e.g., MongoDB) with BeautifulSoup for web scraping
|
||||
4. Cloud-based document storage with third-party scraping service
|
||||
- **Decision**: Use Jina Reader for web scraping and SQLite for document storage
|
||||
- **Rationale**:
|
||||
- Jina Reader provides clean content extraction from web pages
|
||||
- Integration with existing Jina components (embeddings, reranker) for a consistent approach
|
||||
- SQLite offers persistence without the complexity of a full database server
|
||||
- SQLite's transactional nature ensures data integrity
|
||||
- Local storage reduces latency and eliminates cloud dependencies
|
||||
- Ability to store metadata alongside documents for better filtering and selection
|
||||
|
||||
### Decision: Implement Phased Approach for Report Generation
|
||||
- **Context**: Need to handle potentially large numbers of documents within LLM context window limitations
|
||||
- **Options Considered**:
|
||||
1. Single-pass approach with document truncation
|
||||
2. Use of a model with larger context window
|
||||
3. Phased approach with document prioritization and chunking
|
||||
4. Outsourcing document synthesis to a specialized service
|
||||
- **Decision**: Implement a phased approach with document prioritization and chunking
|
||||
- **Rationale**:
|
||||
- Allows handling of large document collections despite context window limitations
|
||||
- Prioritization ensures the most relevant content is included
|
||||
- Chunking strategies can preserve document structure and context
|
||||
- Map-reduce pattern enables processing of unlimited document collections
|
||||
- Flexible architecture can accommodate different models as needed
|
||||
- Progressive implementation allows for iterative testing and refinement
|
||||
|
||||
## 2025-02-27: Document Prioritization and Chunking Strategies
|
||||
|
||||
### Decision
|
||||
|
||||
Implemented document prioritization and chunking strategies for the Report Generation module (Phase 2) to extract the most relevant portions of scraped documents and prepare them for LLM processing.
|
||||
|
||||
### Context
|
||||
|
||||
After implementing the document scraping and storage components (Phase 1), we needed to develop strategies for prioritizing documents based on relevance and chunking them to fit within the LLM's context window limits. This is crucial for ensuring that the most important information is included in the final report.
|
||||
|
||||
### Options Considered
|
||||
|
||||
1. **Document Prioritization:**
|
||||
- Option A: Use only relevance scores from search results
|
||||
- Option B: Combine relevance scores with document metadata (recency, token count)
|
||||
- Option C: Use a machine learning model to score documents
|
||||
|
||||
2. **Chunking Strategies:**
|
||||
- Option A: Fixed-size chunking with overlap
|
||||
- Option B: Section-based chunking using Markdown headers
|
||||
- Option C: Hierarchical chunking for very large documents
|
||||
- Option D: Semantic chunking based on content similarity
|
||||
|
||||
### Decision and Rationale
|
||||
|
||||
For document prioritization, we chose Option B: a weighted scoring system that combines:
|
||||
- Relevance scores from search results (primary factor)
|
||||
- Document recency (secondary factor)
|
||||
- Document token count (tertiary factor)
|
||||
|
||||
This approach allows us to prioritize documents that are both relevant to the query and recent, while also considering the information density of the document.
|
||||
|
||||
For chunking strategies, we implemented a hybrid approach:
|
||||
- Section-based chunking (Option B) as the primary strategy, which preserves the logical structure of documents
|
||||
- Fixed-size chunking (Option A) as a fallback for documents without clear section headers
|
||||
- Hierarchical chunking (Option C) for very large documents, which creates a summary chunk and preserves important sections
|
||||
|
||||
We decided against semantic chunking (Option D) for now due to the additional computational overhead and complexity, but may consider it for future enhancements.
|
||||
|
||||
### Implementation Details
|
||||
|
||||
1. **Document Prioritization:**
|
||||
- Created a scoring formula that weights relevance (50-60%), recency (30%), and token count (10-20%)
|
||||
- Normalized all scores to a 0-1 range for consistent weighting
|
||||
- Added the priority score to each document for use in chunk selection
|
||||
|
||||
2. **Chunking Strategies:**
|
||||
- Implemented section-based chunking using regex to identify Markdown headers
|
||||
- Added fixed-size chunking with configurable chunk size and overlap
|
||||
- Created hierarchical chunking for very large documents
|
||||
- Preserved document metadata in all chunks for traceability
|
||||
|
||||
3. **Chunk Selection:**
|
||||
- Implemented a token budget management system to stay within context limits
|
||||
- Created an algorithm to select chunks based on priority while ensuring representation from multiple documents
|
||||
- Added minimum chunks per document to prevent over-representation of a single source
|
||||
|
||||
### Impact and Next Steps
|
||||
|
||||
This implementation allows us to:
|
||||
- Prioritize the most relevant and recent information
|
||||
- Preserve the logical structure of documents
|
||||
- Efficiently manage token budgets for different LLM models
|
||||
- Balance information from multiple sources
|
||||
|
||||
Next steps include:
|
||||
- Integrating with the LLM interface for report synthesis (Phase 3)
|
||||
- Implementing the map-reduce approach for processing document chunks
|
||||
- Creating report templates for different query types
|
||||
- Adding citation generation and reference management
|
||||
|
||||
## 2025-02-27: Map-Reduce Approach for Report Synthesis
|
||||
|
||||
### Context
|
||||
For Phase 3 of the Report Generation module, we needed to implement a method to synthesize comprehensive reports from multiple document chunks. The challenge was to effectively process potentially large amounts of information while maintaining coherence and staying within token limits of LLM models.
|
||||
|
||||
### Options Considered
|
||||
1. **Single-Pass Approach**: Send all document chunks to the LLM at once for processing.
|
||||
- Pros: Simpler implementation, LLM has full context at once
|
||||
- Cons: Limited by context window size, may exceed token limits for large documents
|
||||
|
||||
2. **Sequential Summarization**: Process each document sequentially, building up a summary incrementally.
|
||||
- Pros: Can handle unlimited documents, maintains some context
|
||||
- Cons: Risk of information loss, earlier documents may have undue influence
|
||||
|
||||
3. **Map-Reduce Approach**: Process individual chunks first (map), then combine the extracted information (reduce).
|
||||
- Pros: Can handle large numbers of documents, preserves key information, more efficient token usage
|
||||
- Cons: More complex implementation, requires two LLM passes
|
||||
|
||||
### Decision
|
||||
We chose the **Map-Reduce Approach** for report synthesis because:
|
||||
1. It allows us to process a large number of document chunks efficiently
|
||||
2. It preserves key information from each document by extracting it in the map phase
|
||||
3. It produces more coherent reports by synthesizing the extracted information in the reduce phase
|
||||
4. It makes better use of token limits by focusing on relevant information
|
||||
|
||||
### Implementation Details
|
||||
- **Map Phase**: Each document chunk is processed individually to extract key information relevant to the query
|
||||
- **Reduce Phase**: The extracted information is synthesized into a coherent report
|
||||
- **Query Type Templates**: Different report templates are used based on the query type (factual, exploratory, comparative)
|
||||
- **Citation Management**: Citations are included in the report with a references section at the end
|
||||
|
||||
### Success Metrics
|
||||
- Ability to process more documents than a single-pass approach
|
||||
- Higher quality reports with better organization and coherence
|
||||
- Proper attribution of information to sources
|
||||
- Efficient token usage
|
||||
|
||||
### Status
|
||||
Implemented and tested successfully with both sample data and real URLs.
|
||||
|
||||
## 2025-02-27: Report Generation Enhancements
|
||||
|
||||
### Decision: Implement Customizable Report Detail Levels
|
||||
- **Context**: Need to provide flexibility in report generation to accommodate different use cases and detail requirements
|
||||
- **Options Considered**:
|
||||
1. Fixed report format with predetermined detail level
|
||||
2. Simple toggle between "brief" and "detailed" reports
|
||||
3. Comprehensive configuration system with multiple adjustable parameters
|
||||
- **Decision**: Implement a comprehensive configuration system with multiple adjustable parameters
|
||||
- **Rationale**:
|
||||
- Different research tasks require different levels of detail
|
||||
- Users have varying needs for report comprehensiveness
|
||||
- A flexible system allows for fine-tuning based on specific use cases
|
||||
- Multiple configuration options provide more control over the output
|
||||
|
||||
### Implementation Details
|
||||
1. **Configurable Parameters**:
|
||||
- Number of search results per engine
|
||||
- Token budget for report generation
|
||||
- Synthesis prompts for the LLM
|
||||
- Report style templates
|
||||
- Chunking parameters (size and overlap)
|
||||
- Model selection options
|
||||
|
||||
2. **Integration Points**:
|
||||
- Command-line arguments for scripts
|
||||
- Configuration file options
|
||||
- API parameters for programmatic use
|
||||
- UI controls for user-facing applications
|
||||
|
||||
3. **Default Configurations**:
|
||||
- Create preset configurations for common use cases:
|
||||
- Brief overview (fewer results, smaller token budget)
|
||||
- Standard report (balanced approach)
|
||||
- Comprehensive analysis (more results, larger token budget)
|
||||
- Technical deep-dive (specialized prompts, larger context)
|
||||
|
||||
## 2025-02-28: Async Implementation and Reference Formatting
|
||||
|
||||
### Decision: Convert LLM Interface Methods to Async
|
||||
|
||||
**Context**: The codebase was experiencing runtime errors related to coroutine handling, particularly with the LLM interface methods.
|
||||
|
||||
**Decision**: Convert all LLM interface methods to async and update dependent code to properly await these methods.
|
||||
|
||||
**Rationale**:
|
||||
- LLM API calls are I/O-bound operations that benefit from async handling
|
||||
- Consistent async/await patterns throughout the codebase improve reliability
|
||||
- Proper async implementation prevents runtime errors related to coroutine handling
|
||||
|
||||
**Implementation**:
|
||||
- Converted `generate_completion`, `classify_query`, `enhance_query`, and `generate_search_queries` methods to async
|
||||
- Updated QueryProcessor methods to be async
|
||||
- Modified query_to_report.py to correctly await async methods
|
||||
- Updated the Gradio interface to handle async operations
|
||||
|
||||
### Decision: Enhance Reference Formatting Instructions
|
||||
|
||||
**Context**: References in generated reports were missing URLs and sometimes using generic placeholders like "Document 1".
|
||||
|
||||
**Decision**: Enhance the reference formatting instructions to emphasize including URLs and improve context preparation.
|
||||
|
||||
**Rationale**:
|
||||
- Proper references with URLs are essential for academic and professional reports
|
||||
- Clear instructions to the LLM improve the quality of generated references
|
||||
- Duplicate URL fields in the context ensure URLs are captured
|
||||
|
||||
**Implementation**:
|
||||
- Improved instructions to emphasize including URLs for each reference
|
||||
- Added duplicate URL fields in the context to ensure URLs are captured
|
||||
- Updated the reference generation prompt to explicitly request URLs
|
||||
- Added a separate reference generation step to handle truncated references
|
|
@ -1,48 +0,0 @@
|
|||
# Development Standards
|
||||
|
||||
## Coding Conventions
|
||||
|
||||
### Python Style
|
||||
- Follow PEP 8 style guidelines for Python code
|
||||
- Use 4 spaces for indentation (not tabs)
|
||||
- Maximum line length of 79 characters
|
||||
- Use docstrings for all modules, classes, and functions
|
||||
- Include type hints for function parameters and return values
|
||||
|
||||
### Documentation
|
||||
- All modules should have a module-level docstring explaining their purpose
|
||||
- All functions and classes should have docstrings following the Google style:
|
||||
- Brief description
|
||||
- Args section with parameter descriptions
|
||||
- Returns section describing return values
|
||||
- Raises section for exceptions that might be raised
|
||||
|
||||
### Error Handling
|
||||
- Use custom exception classes for domain-specific errors (e.g., TokenLimitError)
|
||||
- Handle exceptions at appropriate levels
|
||||
- Provide informative error messages
|
||||
- Log errors with sufficient context for debugging
|
||||
|
||||
## Project Structure
|
||||
- Keep modules focused on a single responsibility
|
||||
- Separate API interaction from business logic
|
||||
- Use environment variables for configuration and secrets
|
||||
- Include sample files for testing and demonstration
|
||||
|
||||
## Testing
|
||||
- Write unit tests for core functionality
|
||||
- Include integration tests for API interactions
|
||||
- Use sample files for consistent test cases
|
||||
- Test error handling and edge cases
|
||||
|
||||
## API Usage
|
||||
- Always include proper authentication headers
|
||||
- Handle API rate limits and errors gracefully
|
||||
- Document API dependencies and version requirements
|
||||
- Include comments with links to API documentation
|
||||
|
||||
## Security
|
||||
- Never hardcode API keys or credentials
|
||||
- Use environment variables for sensitive information
|
||||
- Validate and sanitize inputs
|
||||
- Handle errors without exposing sensitive information
|
|
@ -1,927 +0,0 @@
|
|||
# Component Interfaces
|
||||
|
||||
## Current Interfaces
|
||||
|
||||
### JinaSimilarity Class
|
||||
|
||||
#### Initialization
|
||||
```python
|
||||
js = JinaSimilarity()
|
||||
```
|
||||
- **Description**: Initializes the JinaSimilarity class
|
||||
- **Requirements**: JINA_API_KEY environment variable must be set
|
||||
- **Raises**: ValueError if JINA_API_KEY is not set
|
||||
|
||||
#### count_tokens
|
||||
```python
|
||||
token_count = js.count_tokens(text)
|
||||
```
|
||||
- **Description**: Counts the number of tokens in a text
|
||||
- **Parameters**:
|
||||
- `text` (str): The text to count tokens for
|
||||
- **Returns**: int - Number of tokens in the text
|
||||
- **Dependencies**: tiktoken library
|
||||
|
||||
#### get_embedding
|
||||
```python
|
||||
embedding = js.get_embedding(text)
|
||||
```
|
||||
- **Description**: Generates an embedding for a text using Jina AI's Embeddings API
|
||||
- **Parameters**:
|
||||
- `text` (str): The text to generate an embedding for (max 8,192 tokens)
|
||||
- **Returns**: list - The embedding vector
|
||||
- **Raises**:
|
||||
- `TokenLimitError`: If the text exceeds 8,192 tokens
|
||||
- `requests.exceptions.RequestException`: If the API call fails
|
||||
- **Dependencies**: requests library, Jina AI API
|
||||
|
||||
#### compute_similarity
|
||||
```python
|
||||
similarity, chunk_embedding, query_embedding = js.compute_similarity(chunk, query)
|
||||
```
|
||||
- **Description**: Computes similarity between a text chunk and a query
|
||||
- **Parameters**:
|
||||
- `chunk` (str): The text chunk to compare against
|
||||
- `query` (str): The query text
|
||||
- **Returns**: Tuple containing:
|
||||
- `similarity` (float): Cosine similarity score (0-1)
|
||||
- `chunk_embedding` (list): Chunk embedding
|
||||
- `query_embedding` (list): Query embedding
|
||||
- **Raises**:
|
||||
- `TokenLimitError`: If either text exceeds 8,192 tokens
|
||||
- `requests.exceptions.RequestException`: If the API calls fail
|
||||
- **Dependencies**: numpy library, get_embedding method
|
||||
|
||||
### Markdown Segmenter
|
||||
|
||||
#### segment_markdown
|
||||
```python
|
||||
segments = segment_markdown(file_path)
|
||||
```
|
||||
- **Description**: Segments a markdown file using Jina AI's Segmenter API
|
||||
- **Parameters**:
|
||||
- `file_path` (str): Path to the markdown file
|
||||
- **Returns**: dict - JSON structure containing the segments
|
||||
- **Raises**: Exception if segmentation fails
|
||||
- **Dependencies**: requests library, Jina AI API
|
||||
|
||||
### Test Similarity Script
|
||||
|
||||
#### Command-line Interface
|
||||
```
|
||||
python test_similarity.py chunk_file query_file [--verbose]
|
||||
```
|
||||
- **Description**: Computes similarity between text from two files
|
||||
- **Arguments**:
|
||||
- `chunk_file`: Path to the file containing the text chunk
|
||||
- `query_file`: Path to the file containing the query
|
||||
- `--verbose` or `-v`: Print token counts and embeddings
|
||||
- **Output**: Similarity score and optional verbose information
|
||||
- **Dependencies**: JinaSimilarity class
|
||||
|
||||
#### read_file
|
||||
```python
|
||||
content = read_file(file_path)
|
||||
```
|
||||
- **Description**: Reads content from a file
|
||||
- **Parameters**:
|
||||
- `file_path` (str): Path to the file to read
|
||||
- **Returns**: str - Content of the file
|
||||
- **Raises**: FileNotFoundError if the file doesn't exist
|
||||
|
||||
## Search Execution Module
|
||||
|
||||
### SearchExecutor Class
|
||||
|
||||
#### Initialization
|
||||
```python
|
||||
from execution.search_executor import SearchExecutor
|
||||
executor = SearchExecutor()
|
||||
```
|
||||
- **Description**: Initializes the SearchExecutor class
|
||||
- **Requirements**: Configuration file with API keys for search engines
|
||||
|
||||
#### execute_search
|
||||
```python
|
||||
results = executor.execute_search(query_data)
|
||||
```
|
||||
- **Description**: Executes a search across multiple search engines
|
||||
- **Parameters**:
|
||||
- `query_data` (dict): Dictionary containing query information with keys:
|
||||
- `raw_query` (str): The original user query
|
||||
- `enhanced_query` (str): The enhanced query from the LLM
|
||||
- `search_engines` (list, optional): List of search engines to use
|
||||
- `num_results` (int, optional): Number of results to return per engine
|
||||
- **Returns**: Dict[str, List[Dict[str, Any]]] - Dictionary mapping search engine names to lists of search results
|
||||
- **Example**:
|
||||
```python
|
||||
results = executor.execute_search({
|
||||
'raw_query': 'quantum computing',
|
||||
'enhanced_query': 'recent advancements in quantum computing algorithms and hardware'
|
||||
})
|
||||
```
|
||||
|
||||
### BaseSearchHandler Class
|
||||
|
||||
#### search
|
||||
```python
|
||||
results = handler.search(query, num_results=10, **kwargs)
|
||||
```
|
||||
- **Description**: Abstract method for searching implemented by all handlers
|
||||
- **Parameters**:
|
||||
- `query` (str): The search query
|
||||
- `num_results` (int): Number of results to return
|
||||
- `**kwargs`: Additional parameters specific to the search engine
|
||||
- **Returns**: List[Dict[str, Any]] - List of search results
|
||||
- **Example**:
|
||||
```python
|
||||
from execution.api_handlers.serper_handler import SerperSearchHandler
|
||||
handler = SerperSearchHandler()
|
||||
results = handler.search("quantum computing", num_results=5)
|
||||
```
|
||||
|
||||
### SerperSearchHandler Class
|
||||
|
||||
#### search
|
||||
```python
|
||||
from execution.api_handlers.serper_handler import SerperSearchHandler
|
||||
handler = SerperSearchHandler()
|
||||
results = handler.search(query, num_results=10, **kwargs)
|
||||
```
|
||||
- **Description**: Executes a search using the Serper API
|
||||
- **Parameters**:
|
||||
- `query` (str): The search query
|
||||
- `num_results` (int): Number of results to return
|
||||
- `**kwargs`: Additional parameters for the Serper API
|
||||
- **Returns**: List[Dict[str, Any]] - List of search results with keys:
|
||||
- `title` (str): Title of the result
|
||||
- `url` (str): URL of the result
|
||||
- `snippet` (str): Snippet of text from the result
|
||||
- `source` (str): Source of the result (always "serper")
|
||||
- **Requirements**: Serper API key in configuration
|
||||
- **Example**:
|
||||
```python
|
||||
results = handler.search("quantum computing", num_results=5)
|
||||
```
|
||||
|
||||
### ScholarSearchHandler Class
|
||||
|
||||
#### search
|
||||
```python
|
||||
from execution.api_handlers.scholar_handler import ScholarSearchHandler
|
||||
handler = ScholarSearchHandler()
|
||||
results = handler.search(query, num_results=10, **kwargs)
|
||||
```
|
||||
- **Description**: Executes a search on Google Scholar using the Serper API
|
||||
- **Parameters**:
|
||||
- `query` (str): The search query
|
||||
- `num_results` (int): Number of results to return
|
||||
- `**kwargs`: Additional parameters for the Scholar API
|
||||
- **Returns**: List[Dict[str, Any]] - List of search results with keys:
|
||||
- `title` (str): Title of the paper
|
||||
- `url` (str): URL of the paper
|
||||
- `snippet` (str): Snippet of text from the paper
|
||||
- `source` (str): Source of the result (always "scholar")
|
||||
- `authors` (str): Authors of the paper
|
||||
- `publication` (str): Publication venue
|
||||
- `year` (int): Publication year
|
||||
- **Requirements**: Serper API key in configuration
|
||||
- **Example**:
|
||||
```python
|
||||
results = handler.search("quantum computing", num_results=5)
|
||||
```
|
||||
|
||||
### ArxivSearchHandler Class
|
||||
|
||||
#### search
|
||||
```python
|
||||
from execution.api_handlers.arxiv_handler import ArxivSearchHandler
|
||||
handler = ArxivSearchHandler()
|
||||
results = handler.search(query, num_results=10, **kwargs)
|
||||
```
|
||||
- **Description**: Executes a search on arXiv
|
||||
- **Parameters**:
|
||||
- `query` (str): The search query
|
||||
- `num_results` (int): Number of results to return
|
||||
- `**kwargs`: Additional parameters for the arXiv API
|
||||
- **Returns**: List[Dict[str, Any]] - List of search results with keys:
|
||||
- `title` (str): Title of the paper
|
||||
- `url` (str): URL of the paper
|
||||
- `pdf_url` (str): URL to the PDF
|
||||
- `snippet` (str): Abstract of the paper
|
||||
- `source` (str): Source of the result (always "arxiv")
|
||||
- `arxiv_id` (str): arXiv ID
|
||||
- `authors` (list): List of author names
|
||||
- `categories` (list): List of arXiv categories
|
||||
- `published_date` (str): Publication date
|
||||
- `updated_date` (str): Last update date
|
||||
- `full_text` (str): Full abstract text
|
||||
- **Example**:
|
||||
```python
|
||||
results = handler.search("quantum computing", num_results=5)
|
||||
```
|
||||
|
||||
### ResultCollector Class
|
||||
|
||||
#### process_results
|
||||
```python
|
||||
from execution.result_collector import ResultCollector
|
||||
collector = ResultCollector()
|
||||
processed_results = collector.process_results(search_results, dedup=True, max_results=None)
|
||||
```
|
||||
- **Description**: Processes search results from multiple search engines
|
||||
- **Parameters**:
|
||||
- `search_results` (Dict[str, List[Dict[str, Any]]]): Dictionary mapping search engine names to lists of search results
|
||||
- `dedup` (bool): Whether to deduplicate results based on URL
|
||||
- `max_results` (Optional[int]): Maximum number of results to return
|
||||
- **Returns**: List[Dict[str, Any]] - Combined and processed list of search results
|
||||
- **Example**:
|
||||
```python
|
||||
processed_results = collector.process_results({
|
||||
'serper': serper_results,
|
||||
'scholar': scholar_results,
|
||||
'arxiv': arxiv_results
|
||||
}, dedup=True, max_results=20)
|
||||
```
|
||||
|
||||
#### save_results
|
||||
```python
|
||||
collector.save_results(results, file_path)
|
||||
```
|
||||
- **Description**: Saves search results to a JSON file
|
||||
- **Parameters**:
|
||||
- `results` (List[Dict[str, Any]]): List of search results
|
||||
- `file_path` (str): Path to save the results
|
||||
- **Example**:
|
||||
```python
|
||||
collector.save_results(processed_results, "search_results.json")
|
||||
```
|
||||
|
||||
## Planned Interfaces for Research System
|
||||
|
||||
### ResearchSystem Class
|
||||
|
||||
#### Initialization
|
||||
```python
|
||||
rs = ResearchSystem(config=None)
|
||||
```
|
||||
- **Description**: Initializes the ResearchSystem with optional configuration
|
||||
- **Parameters**:
|
||||
- `config` (dict, optional): Configuration options for the research system
|
||||
- **Requirements**: Various API keys set in environment variables or config
|
||||
- **Raises**: ValueError if required API keys are not set
|
||||
|
||||
#### execute_research
|
||||
```python
|
||||
report = rs.execute_research(query, options=None)
|
||||
```
|
||||
- **Description**: Executes a complete research pipeline from query to report
|
||||
- **Parameters**:
|
||||
- `query` (str): The research query
|
||||
- `options` (dict, optional): Options to customize the research process
|
||||
- **Returns**: dict - Research report with metadata
|
||||
- **Raises**: Various exceptions for different stages of the pipeline
|
||||
|
||||
#### save_report
|
||||
```python
|
||||
rs.save_report(report, file_path, format="markdown")
|
||||
```
|
||||
- **Description**: Saves the research report to a file
|
||||
- **Parameters**:
|
||||
- `report` (dict): The research report to save
|
||||
- `file_path` (str): Path to save the report
|
||||
- `format` (str, optional): Format of the report (markdown, html, pdf)
|
||||
- **Raises**: IOError if the file cannot be saved
|
||||
|
||||
### QueryProcessor Class
|
||||
|
||||
#### process_query
|
||||
```python
|
||||
structured_query = query_processor.process_query(query)
|
||||
```
|
||||
- **Description**: Processes a raw query into a structured format
|
||||
- **Parameters**:
|
||||
- `query` (str): The raw research query
|
||||
- **Returns**: dict - Structured query with metadata
|
||||
- **Raises**: ValueError if the query is invalid
|
||||
|
||||
### SearchStrategy Class
|
||||
|
||||
#### develop_strategy
|
||||
```python
|
||||
search_plan = search_strategy.develop_strategy(structured_query)
|
||||
```
|
||||
- **Description**: Develops a search strategy based on the query
|
||||
- **Parameters**:
|
||||
- `structured_query` (dict): The structured query
|
||||
- **Returns**: dict - Search plan with target-specific queries
|
||||
- **Raises**: ValueError if the query cannot be processed
|
||||
|
||||
### SearchExecutor Class
|
||||
|
||||
#### execute_search
|
||||
```python
|
||||
search_results = search_executor.execute_search(search_plan)
|
||||
```
|
||||
- **Description**: Executes search queries against selected targets
|
||||
- **Parameters**:
|
||||
- `search_plan` (dict): The search plan with queries
|
||||
- **Returns**: dict - Collection of search results
|
||||
- **Raises**: APIError if the search APIs fail
|
||||
|
||||
### JinaReranker Class
|
||||
|
||||
#### rerank
|
||||
```python
|
||||
ranked_documents = jina_reranker.rerank(query, documents, top_n=None)
|
||||
```
|
||||
- **Description**: Rerank documents based on their relevance to the query.
|
||||
- **Parameters**:
|
||||
- `query` (str): The query to rank documents against
|
||||
- `documents` (List[str]): List of document strings to rerank
|
||||
- `top_n` (Optional[int]): Number of top results to return (optional)
|
||||
- **Returns**: List of dictionaries containing reranked documents with scores and indices
|
||||
|
||||
#### rerank_with_metadata
|
||||
```python
|
||||
ranked_documents = jina_reranker.rerank_with_metadata(query, documents, document_key='content', top_n=None)
|
||||
```
|
||||
- **Description**: Rerank documents with metadata based on their relevance to the query.
|
||||
- **Parameters**:
|
||||
- `query` (str): The query to rank documents against
|
||||
- `documents` (List[Dict[str, Any]]): List of document dictionaries containing content and metadata
|
||||
- `document_key` (str): The key in the document dictionaries that contains the text content
|
||||
- `top_n` (Optional[int]): Number of top results to return (optional)
|
||||
- **Returns**: List of dictionaries containing reranked documents with scores, indices, and original metadata
|
||||
|
||||
#### get_jina_reranker
|
||||
```python
|
||||
jina_reranker = get_jina_reranker()
|
||||
```
|
||||
- **Description**: Get the global Jina Reranker instance.
|
||||
- **Returns**: JinaReranker instance
|
||||
|
||||
### DocumentScraper Class
|
||||
|
||||
#### scrape_documents
|
||||
```python
|
||||
markdown_documents = document_scraper.scrape_documents(ranked_documents)
|
||||
```
|
||||
- **Description**: Scrapes and converts documents to markdown
|
||||
- **Parameters**:
|
||||
- `ranked_documents` (list): The ranked list of documents to scrape
|
||||
- **Returns**: list - Collection of markdown documents
|
||||
- **Raises**: ScrapingError if the documents cannot be scraped
|
||||
|
||||
### DocumentSelector Class
|
||||
|
||||
#### select_documents
|
||||
```python
|
||||
selected_documents = document_selector.select_documents(documents_with_scores)
|
||||
```
|
||||
- **Description**: Selects the most relevant and diverse documents
|
||||
- **Parameters**:
|
||||
- `documents_with_scores` (list): Documents with similarity scores
|
||||
- **Returns**: list - Curated set of documents
|
||||
- **Raises**: ValueError if the selection criteria are invalid
|
||||
|
||||
### ReportGenerator Class
|
||||
|
||||
#### generate_report
|
||||
```python
|
||||
report = report_generator.generate_report(selected_documents, query)
|
||||
```
|
||||
- **Description**: Generates a research report from selected documents
|
||||
- **Parameters**:
|
||||
- `selected_documents` (list): The selected documents
|
||||
- `query` (str): The original query for context
|
||||
- **Returns**: dict - Final research report
|
||||
- **Raises**: GenerationError if the report cannot be generated
|
||||
|
||||
## Search Execution Module
|
||||
|
||||
### SearchExecutor Class
|
||||
|
||||
The `SearchExecutor` class manages the execution of search queries across multiple search engines.
|
||||
|
||||
#### Initialization
|
||||
```python
|
||||
executor = SearchExecutor()
|
||||
```
|
||||
- **Description**: Initializes the search executor with available search handlers
|
||||
- **Requirements**: Appropriate API keys must be set for the search engines to be used
|
||||
|
||||
#### execute_search
|
||||
```python
|
||||
results = executor.execute_search(structured_query, search_engines=["google", "scholar"], num_results=10)
|
||||
```
|
||||
- **Description**: Executes search queries across specified search engines in parallel
|
||||
- **Parameters**:
|
||||
- `structured_query` (Dict[str, Any]): The structured query from the query processor
|
||||
- `search_engines` (Optional[List[str]]): List of search engines to use
|
||||
- `num_results` (int): Number of results to return per search engine
|
||||
- `timeout` (int): Timeout in seconds for each search engine
|
||||
- **Returns**: Dict[str, List[Dict[str, Any]]] - Dictionary mapping search engine names to lists of search results
|
||||
|
||||
#### execute_search_async
|
||||
```python
|
||||
results = await executor.execute_search_async(structured_query, search_engines=["google", "scholar"])
|
||||
```
|
||||
- **Description**: Executes search queries across specified search engines asynchronously
|
||||
- **Parameters**: Same as `execute_search`
|
||||
- **Returns**: Dict[str, List[Dict[str, Any]]] - Dictionary mapping search engine names to lists of search results
|
||||
|
||||
#### get_available_search_engines
|
||||
```python
|
||||
engines = executor.get_available_search_engines()
|
||||
```
|
||||
- **Description**: Gets a list of available search engines
|
||||
- **Returns**: List[str] - List of available search engine names
|
||||
|
||||
### ResultCollector Class
|
||||
|
||||
The `ResultCollector` class processes and organizes search results from multiple search engines.
|
||||
|
||||
#### Initialization
|
||||
```python
|
||||
collector = ResultCollector()
|
||||
```
|
||||
- **Description**: Initializes the result collector
|
||||
|
||||
#### process_results
|
||||
```python
|
||||
processed_results = collector.process_results(search_results, dedup=True, max_results=20)
|
||||
```
|
||||
- **Description**: Processes search results from multiple search engines
|
||||
- **Parameters**:
|
||||
- `search_results` (Dict[str, List[Dict[str, Any]]]): Dictionary mapping search engine names to lists of search results
|
||||
- `dedup` (bool): Whether to deduplicate results based on URL
|
||||
- `max_results` (Optional[int]): Maximum number of results to return
|
||||
- **Returns**: List[Dict[str, Any]] - List of processed search results
|
||||
|
||||
#### filter_results
|
||||
```python
|
||||
filtered_results = collector.filter_results(results, filters={"domains": ["arxiv.org"], "min_score": 5})
|
||||
```
|
||||
- **Description**: Filters results based on specified criteria
|
||||
- **Parameters**:
|
||||
- `results` (List[Dict[str, Any]]): List of search results
|
||||
- `filters` (Dict[str, Any]): Dictionary of filter criteria
|
||||
- **Returns**: List[Dict[str, Any]] - Filtered list of search results
|
||||
|
||||
#### group_results_by_domain
|
||||
```python
|
||||
grouped_results = collector.group_results_by_domain(results)
|
||||
```
|
||||
- **Description**: Groups results by domain
|
||||
- **Parameters**:
|
||||
- `results` (List[Dict[str, Any]]): List of search results
|
||||
- **Returns**: Dict[str, List[Dict[str, Any]]] - Dictionary mapping domains to lists of search results
|
||||
|
||||
### BaseSearchHandler Interface
|
||||
|
||||
The `BaseSearchHandler` class defines the interface for all search API handlers.
|
||||
|
||||
#### search
|
||||
```python
|
||||
results = handler.search(query, num_results=10, **kwargs)
|
||||
```
|
||||
- **Description**: Executes a search query
|
||||
- **Parameters**:
|
||||
- `query` (str): The search query to execute
|
||||
- `num_results` (int): Number of results to return
|
||||
- `**kwargs`: Additional search parameters specific to the API
|
||||
- **Returns**: List[Dict[str, Any]] - List of search results
|
||||
|
||||
#### get_name
|
||||
```python
|
||||
name = handler.get_name()
|
||||
```
|
||||
- **Description**: Gets the name of the search handler
|
||||
- **Returns**: str - Name of the search handler
|
||||
|
||||
#### is_available
|
||||
```python
|
||||
available = handler.is_available()
|
||||
```
|
||||
- **Description**: Checks if the search API is available
|
||||
- **Returns**: bool - True if the API is available, False otherwise
|
||||
|
||||
#### get_rate_limit_info
|
||||
```python
|
||||
rate_limits = handler.get_rate_limit_info()
|
||||
```
|
||||
- **Description**: Gets information about the API's rate limits
|
||||
- **Returns**: Dict[str, Any] - Dictionary with rate limit information
|
||||
|
||||
## Ranking Module
|
||||
|
||||
### JinaReranker Class
|
||||
|
||||
The `JinaReranker` class provides document reranking functionality using Jina AI's Reranker API.
|
||||
|
||||
#### Initialization
|
||||
```python
|
||||
reranker = JinaReranker(
|
||||
api_key=None, # Optional, will use environment variable if not provided
|
||||
model="jina-reranker-v2-base-multilingual", # Default model
|
||||
endpoint="https://api.jina.ai/v1/rerank" # Default endpoint
|
||||
)
|
||||
```
|
||||
- **Description**: Initializes the JinaReranker with the specified API key, model, and endpoint
|
||||
- **Parameters**:
|
||||
- `api_key` (Optional[str]): Jina AI API key (defaults to environment variable)
|
||||
- `model` (str): The reranker model to use
|
||||
- `endpoint` (str): The API endpoint
|
||||
- **Requirements**: JINA_API_KEY environment variable must be set if api_key is not provided
|
||||
- **Raises**: ValueError if API key is not available
|
||||
|
||||
#### rerank
|
||||
```python
|
||||
reranked_docs = reranker.rerank(query, documents, top_n=None)
|
||||
```
|
||||
- **Description**: Reranks a list of documents based on their relevance to the query
|
||||
- **Parameters**:
|
||||
- `query` (str): The query string
|
||||
- `documents` (List[str]): List of document strings to rerank
|
||||
- `top_n` (Optional[int]): Number of top documents to return (defaults to all)
|
||||
- **Returns**: List[Dict[str, Any]] - List of reranked documents with scores
|
||||
- **Example Return Format**:
|
||||
```json
|
||||
[
|
||||
{
|
||||
"index": 0,
|
||||
"score": 0.95,
|
||||
"document": "Document content here"
|
||||
},
|
||||
{
|
||||
"index": 3,
|
||||
"score": 0.82,
|
||||
"document": "Another document content"
|
||||
}
|
||||
]
|
||||
```
|
||||
|
||||
#### get_jina_reranker
|
||||
```python
|
||||
reranker = get_jina_reranker()
|
||||
```
|
||||
- **Description**: Factory function to get a JinaReranker instance with configuration from the config file
|
||||
- **Returns**: JinaReranker - Initialized reranker instance
|
||||
- **Raises**: ValueError if API key is not available
|
||||
|
||||
### Usage Examples
|
||||
|
||||
#### Basic Usage
|
||||
```python
|
||||
from ranking.jina_reranker import JinaReranker
|
||||
|
||||
# Initialize with specific model
|
||||
reranker = JinaReranker()
|
||||
|
||||
# Rerank documents
|
||||
results = reranker.rerank(
|
||||
query="What is quantum computing?",
|
||||
documents=["Document about quantum physics", "Document about quantum computing", "Document about classical computing"],
|
||||
top_n=2
|
||||
)
|
||||
|
||||
# Process results
|
||||
for result in results:
|
||||
print(f"Score: {result['score']}, Document: {result['document']}")
|
||||
```
|
||||
|
||||
#### Integration with ResultCollector
|
||||
```python
|
||||
from execution.result_collector import ResultCollector
|
||||
from ranking.jina_reranker import get_jina_reranker
|
||||
|
||||
# Initialize components
|
||||
reranker = get_jina_reranker()
|
||||
collector = ResultCollector(reranker=reranker)
|
||||
|
||||
# Process search results with reranking
|
||||
reranked_results = collector.process_results(
|
||||
search_results,
|
||||
dedup=True,
|
||||
max_results=20,
|
||||
use_reranker=True
|
||||
)
|
||||
```
|
||||
|
||||
#### Testing
|
||||
```python
|
||||
# Simple test script
|
||||
import json
|
||||
from ranking.jina_reranker import get_jina_reranker
|
||||
|
||||
reranker = get_jina_reranker()
|
||||
query = "What is quantum computing?"
|
||||
documents = [
|
||||
"Quantum computing is a type of computation that harnesses quantum mechanics.",
|
||||
"Classical computers use bits, while quantum computers use qubits.",
|
||||
"Machine learning is a subset of artificial intelligence."
|
||||
]
|
||||
|
||||
reranked = reranker.rerank(query, documents)
|
||||
print(json.dumps(reranked, indent=2))
|
||||
```
|
||||
|
||||
## Search Execution Testing
|
||||
|
||||
The search execution module has been tested to ensure it correctly executes search queries across multiple search engines and processes the results.
|
||||
|
||||
### Test Script (test_search_execution.py)
|
||||
|
||||
```python
|
||||
# Process a query and execute search
|
||||
results = test_search_execution("What are the latest advancements in quantum computing?")
|
||||
|
||||
# Save test results
|
||||
save_test_results(results, "search_execution_test_results.json")
|
||||
```
|
||||
|
||||
- **Purpose**: Tests the search execution module with various queries
|
||||
- **Features**:
|
||||
- Tests with multiple queries
|
||||
- Uses all available search engines
|
||||
- Saves results to a JSON file
|
||||
- Provides detailed output of search results
|
||||
|
||||
## UI Module
|
||||
|
||||
### GradioInterface Class
|
||||
|
||||
#### Initialization
|
||||
```python
|
||||
from ui.gradio_interface import GradioInterface
|
||||
interface = GradioInterface()
|
||||
```
|
||||
- **Description**: Initializes the Gradio interface for the research system
|
||||
- **Requirements**: Gradio library installed
|
||||
|
||||
#### process_query
|
||||
```python
|
||||
markdown_results, results_file = interface.process_query(query, num_results=10)
|
||||
```
|
||||
- **Description**: Processes a query and returns the results
|
||||
- **Parameters**:
|
||||
- `query` (str): The query to process
|
||||
- `num_results` (int): Number of results to return
|
||||
- **Returns**:
|
||||
- `markdown_results` (str): Markdown formatted results
|
||||
- `results_file` (str): Path to the JSON file with saved results
|
||||
- **Example**:
|
||||
```python
|
||||
results, file_path = interface.process_query("What are the latest advancements in quantum computing?", num_results=15)
|
||||
```
|
||||
|
||||
#### create_interface
|
||||
```python
|
||||
interface_blocks = interface.create_interface()
|
||||
```
|
||||
- **Description**: Creates and returns the Gradio interface
|
||||
- **Returns**: `gr.Blocks` - The Gradio interface object
|
||||
- **Example**:
|
||||
```python
|
||||
blocks = interface.create_interface()
|
||||
blocks.launch()
|
||||
```
|
||||
|
||||
#### launch
|
||||
```python
|
||||
interface.launch(share=True, server_port=7860, debug=False)
|
||||
```
|
||||
- **Description**: Launches the Gradio interface
|
||||
- **Parameters**:
|
||||
- `share` (bool): Whether to create a public link for sharing
|
||||
- `server_port` (int): Port to run the server on
|
||||
- `debug` (bool): Whether to run in debug mode
|
||||
- **Example**:
|
||||
```python
|
||||
interface.launch(share=True)
|
||||
```
|
||||
|
||||
### Running the UI
|
||||
```bash
|
||||
python run_ui.py --share --port 7860
|
||||
```
|
||||
- **Description**: Runs the Gradio interface
|
||||
- **Parameters**:
|
||||
- `--share`: Create a public link for sharing
|
||||
- `--port`: Port to run the server on (default: 7860)
|
||||
- `--debug`: Run in debug mode
|
||||
- **Example**:
|
||||
```bash
|
||||
python run_ui.py --share
|
||||
```
|
||||
|
||||
## Document Ranking Interface
|
||||
|
||||
### JinaReranker
|
||||
|
||||
The `JinaReranker` class provides an interface for reranking documents based on their relevance to a query using Jina AI's Reranker API.
|
||||
|
||||
#### Methods
|
||||
|
||||
```python
|
||||
def rerank(query: str, documents: List[str], top_n: Optional[int] = None) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
Rerank documents based on their relevance to the query.
|
||||
|
||||
Args:
|
||||
query: The query to rank documents against
|
||||
documents: List of document strings to rerank
|
||||
top_n: Number of top results to return (optional)
|
||||
|
||||
Returns:
|
||||
List of dictionaries containing reranked documents with scores and indices
|
||||
"""
|
||||
```
|
||||
|
||||
```python
|
||||
def rerank_with_metadata(query: str, documents: List[Dict[str, Any]],
|
||||
document_key: str = 'content',
|
||||
top_n: Optional[int] = None) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
Rerank documents with metadata based on their relevance to the query.
|
||||
|
||||
Args:
|
||||
query: The query to rank documents against
|
||||
documents: List of document dictionaries containing content and metadata
|
||||
document_key: The key in the document dictionaries that contains the text content
|
||||
top_n: Number of top results to return (optional)
|
||||
|
||||
Returns:
|
||||
List of dictionaries containing reranked documents with scores, indices, and original metadata
|
||||
"""
|
||||
```
|
||||
|
||||
#### Factory Function
|
||||
|
||||
```python
|
||||
def get_jina_reranker() -> JinaReranker:
|
||||
"""
|
||||
Get the global Jina Reranker instance.
|
||||
|
||||
Returns:
|
||||
JinaReranker instance
|
||||
"""
|
||||
```
|
||||
|
||||
#### Example Usage
|
||||
|
||||
```python
|
||||
from ranking.jina_reranker import get_jina_reranker
|
||||
|
||||
# Get the reranker
|
||||
reranker = get_jina_reranker()
|
||||
|
||||
# Rerank documents
|
||||
results = reranker.rerank(
|
||||
query="What is quantum computing?",
|
||||
documents=["Document about quantum physics", "Document about quantum computing", "Document about classical computing"],
|
||||
top_n=2
|
||||
)
|
||||
|
||||
# Process results
|
||||
for result in results:
|
||||
print(f"Score: {result['score']}, Document: {result['document']}")
|
||||
```
|
||||
|
||||
## Report Generation Module
|
||||
|
||||
### ReportDetailLevelManager Class
|
||||
|
||||
The `ReportDetailLevelManager` class manages configurations for different report detail levels.
|
||||
|
||||
#### Initialization
|
||||
```python
|
||||
detail_level_manager = get_report_detail_level_manager()
|
||||
```
|
||||
- **Description**: Gets a singleton instance of the ReportDetailLevelManager
|
||||
|
||||
#### get_detail_level_config
|
||||
```python
|
||||
config = detail_level_manager.get_detail_level_config(detail_level)
|
||||
```
|
||||
- **Description**: Gets configuration parameters for a specific detail level
|
||||
- **Parameters**:
|
||||
- `detail_level` (str): Detail level as a string (brief, standard, detailed, comprehensive)
|
||||
- **Returns**: Dict[str, Any] - Configuration parameters for the specified detail level
|
||||
- **Raises**: ValueError if the detail level is not valid
|
||||
|
||||
#### get_template_modifier
|
||||
```python
|
||||
template = detail_level_manager.get_template_modifier(detail_level, query_type)
|
||||
```
|
||||
- **Description**: Gets template modifier for a specific detail level and query type
|
||||
- **Parameters**:
|
||||
- `detail_level` (str): Detail level as a string (brief, standard, detailed, comprehensive)
|
||||
- `query_type` (str): Query type as a string (factual, exploratory, comparative)
|
||||
- **Returns**: str - Template modifier as a string
|
||||
- **Raises**: ValueError if the detail level or query type is not valid
|
||||
|
||||
#### get_available_detail_levels
|
||||
```python
|
||||
levels = detail_level_manager.get_available_detail_levels()
|
||||
```
|
||||
- **Description**: Gets a list of available detail levels with descriptions
|
||||
- **Returns**: List[Tuple[str, str]] - List of tuples containing detail level and description
|
||||
|
||||
### ReportGenerator Class
|
||||
|
||||
The `ReportGenerator` class generates reports from search results.
|
||||
|
||||
#### Initialization
|
||||
```python
|
||||
report_generator = get_report_generator()
|
||||
```
|
||||
- **Description**: Gets a singleton instance of the ReportGenerator
|
||||
|
||||
#### initialize
|
||||
```python
|
||||
await report_generator.initialize()
|
||||
```
|
||||
- **Description**: Initializes the report generator by setting up the database
|
||||
- **Returns**: None
|
||||
|
||||
#### set_detail_level
|
||||
```python
|
||||
report_generator.set_detail_level(detail_level)
|
||||
```
|
||||
- **Description**: Sets the detail level for report generation
|
||||
- **Parameters**:
|
||||
- `detail_level` (str): Detail level (brief, standard, detailed, comprehensive)
|
||||
- **Returns**: None
|
||||
- **Raises**: ValueError if the detail level is not valid
|
||||
|
||||
#### get_detail_level_config
|
||||
```python
|
||||
config = report_generator.get_detail_level_config()
|
||||
```
|
||||
- **Description**: Gets the current detail level configuration
|
||||
- **Returns**: Dict[str, Any] - Configuration parameters for the current detail level
|
||||
|
||||
#### get_available_detail_levels
|
||||
```python
|
||||
levels = report_generator.get_available_detail_levels()
|
||||
```
|
||||
- **Description**: Gets a list of available detail levels with descriptions
|
||||
- **Returns**: List[Tuple[str, str]] - List of tuples containing detail level and description
|
||||
|
||||
#### process_search_results
|
||||
```python
|
||||
documents = await report_generator.process_search_results(search_results)
|
||||
```
|
||||
- **Description**: Processes search results by scraping the URLs and storing them in the database
|
||||
- **Parameters**:
|
||||
- `search_results` (List[Dict[str, Any]]): List of search results, each containing at least a 'url' field
|
||||
- **Returns**: List[Dict[str, Any]] - List of processed documents
|
||||
|
||||
#### prepare_documents_for_report
|
||||
```python
|
||||
chunks = await report_generator.prepare_documents_for_report(search_results, token_budget, chunk_size, overlap_size)
|
||||
```
|
||||
- **Description**: Prepares documents for report generation by chunking and selecting relevant content
|
||||
- **Parameters**:
|
||||
- `search_results` (List[Dict[str, Any]]): List of search results
|
||||
- `token_budget` (Optional[int]): Maximum number of tokens to use
|
||||
- `chunk_size` (Optional[int]): Maximum number of tokens per chunk
|
||||
- `overlap_size` (Optional[int]): Number of tokens to overlap between chunks
|
||||
- **Returns**: List[Dict[str, Any]] - List of selected document chunks
|
||||
|
||||
#### generate_report
|
||||
```python
|
||||
report = await report_generator.generate_report(
|
||||
search_results=search_results,
|
||||
query=query,
|
||||
token_budget=token_budget,
|
||||
chunk_size=chunk_size,
|
||||
overlap_size=overlap_size,
|
||||
detail_level=detail_level
|
||||
)
|
||||
```
|
||||
- **Description**: Generates a report from search results
|
||||
- **Parameters**:
|
||||
- `search_results` (List[Dict[str, Any]]): List of search results
|
||||
- `query` (str): Original search query
|
||||
- `token_budget` (Optional[int]): Maximum number of tokens to use
|
||||
- `chunk_size` (Optional[int]): Maximum number of tokens per chunk
|
||||
- `overlap_size` (Optional[int]): Number of tokens to overlap between chunks
|
||||
- `detail_level` (Optional[str]): Level of detail for the report (brief, standard, detailed, comprehensive)
|
||||
- **Returns**: str - Generated report as a string
|
||||
|
||||
#### initialize_report_generator
|
||||
```python
|
||||
await initialize_report_generator()
|
||||
```
|
||||
- **Description**: Initializes the global report generator instance
|
||||
- **Returns**: None
|
||||
|
||||
#### get_report_generator
|
||||
```python
|
||||
report_generator = get_report_generator()
|
||||
```
|
||||
- **Description**: Gets the global report generator instance
|
||||
- **Returns**: ReportGenerator - Initialized report generator instance
|
|
@ -1,110 +0,0 @@
|
|||
# Project Overview: Intelligent Research System with Semantic Search
|
||||
|
||||
## Purpose
|
||||
This project implements an intelligent research system that automates the process of finding, filtering, and synthesizing information from various sources. At its core, the system uses semantic similarity search powered by Jina AI's APIs to understand context beyond simple keyword matching, enabling more intelligent document processing and information retrieval.
|
||||
|
||||
## Goals
|
||||
1. Create an end-to-end research automation system that handles the entire process from query to final report
|
||||
2. Leverage multiple search sources to gather comprehensive information (Serper, Google Scholar, arXiv)
|
||||
3. Implement intelligent filtering and ranking of documents using semantic similarity
|
||||
4. Produce synthesized reports that extract and combine the most relevant information
|
||||
5. Build a modular and extensible architecture that can be enhanced with additional capabilities
|
||||
|
||||
## High-Level Architecture
|
||||
The system follows a modular pipeline:
|
||||
|
||||
1. **Query Processing**:
|
||||
- Accept and process user research queries
|
||||
- Enhance queries with additional context and structure
|
||||
- Classify queries by type, intent, and entities
|
||||
- Generate optimized queries for different search engines
|
||||
|
||||
2. **Search Execution**:
|
||||
- Execute search queries across multiple search engines (Serper, Google Scholar, arXiv)
|
||||
- Collect and process search results
|
||||
- Handle deduplication and result filtering
|
||||
|
||||
3. **Document Ranking**:
|
||||
- Use Jina AI's Re-Ranker to order documents by relevance
|
||||
- Filter out less relevant documents
|
||||
- Apply additional filtering based on metadata (date, source, etc.)
|
||||
|
||||
4. **Report Generation**:
|
||||
- Synthesize a comprehensive report from the selected documents
|
||||
- Format the report for readability
|
||||
- Include citations and references
|
||||
|
||||
5. **User Interface**:
|
||||
- Provide a Gradio-based web interface for user interaction
|
||||
- Display search results and generated reports
|
||||
- Allow configuration of search parameters
|
||||
|
||||
## Current Implementation Status
|
||||
|
||||
The project currently has the following modules implemented:
|
||||
|
||||
1. **Configuration Module**:
|
||||
- Manages configuration settings for the entire system
|
||||
- Handles API keys and model selections
|
||||
- Supports different LLM providers and endpoints
|
||||
|
||||
2. **Query Processing Module**:
|
||||
- Processes and enhances user queries
|
||||
- Classifies queries by type and intent
|
||||
- Generates optimized search queries
|
||||
- Integrates with LiteLLM for LLM provider support
|
||||
|
||||
3. **Search Execution Module**:
|
||||
- Executes search queries across multiple search engines
|
||||
- Implements handlers for Serper, Google Scholar, and arXiv
|
||||
- Collects and processes search results
|
||||
- Handles deduplication and result filtering
|
||||
|
||||
4. **Document Ranking Module**:
|
||||
- Implements Jina AI's Re-Ranker for document ranking
|
||||
- Supports reranking with metadata preservation
|
||||
- Provides filtering capabilities
|
||||
|
||||
5. **Report Generation Module**:
|
||||
- Synthesizes comprehensive reports from selected documents
|
||||
- Formats reports for readability
|
||||
- Includes citations and references
|
||||
|
||||
## Dependencies
|
||||
- `requests`: For making API calls to various APIs
|
||||
- `numpy`: For vector operations in similarity computation
|
||||
- `tiktoken`: For tokenization and token counting
|
||||
- `litellm`: For unified LLM provider interface
|
||||
- `pyyaml`: For configuration file parsing
|
||||
- `feedparser`: For parsing RSS/Atom feeds (arXiv)
|
||||
- `beautifulsoup4`: For HTML parsing
|
||||
- `gradio`: For web interface
|
||||
|
||||
## LLM Providers
|
||||
The system supports multiple LLM providers through the LiteLLM interface:
|
||||
- Groq (currently using Llama 3.1-8b-instant)
|
||||
- OpenAI
|
||||
- Anthropic
|
||||
- OpenRouter
|
||||
- Azure OpenAI
|
||||
|
||||
## Search Engines
|
||||
The system currently integrates with the following search engines:
|
||||
- Serper API (for Google search)
|
||||
- Google Scholar (via Serper API)
|
||||
- arXiv (via official API)
|
||||
|
||||
## Next Steps
|
||||
1. ✅ Implement the Report Generation module
|
||||
2. ✅ Develop the Gradio UI for user interaction
|
||||
3. ✅ Add more search engines and LLM providers
|
||||
4. ✅ Implement document retrieval and processing
|
||||
5. ⏳ Enhance the Report Generation module with advanced features:
|
||||
- Customizable report detail levels
|
||||
- Progressive report generation
|
||||
- Visualization components
|
||||
- Interactive elements
|
||||
- Report versioning and comparison
|
||||
6. ⏳ Improve UI integration with report generation options
|
||||
7. ⏳ Add support for saving and loading research sessions
|
||||
8. ⏳ Implement comprehensive testing and documentation
|
|
@ -1,863 +0,0 @@
|
|||
# Session Log
|
||||
|
||||
## Session: 2025-02-27
|
||||
|
||||
### Overview
|
||||
Initial project setup and implementation of core functionality for semantic similarity search using Jina AI's APIs.
|
||||
|
||||
### Key Activities
|
||||
1. Created the core `JinaSimilarity` class in jina_similarity.py with the following features:
|
||||
- Token counting using tiktoken
|
||||
- Embedding generation using Jina AI's Embeddings API
|
||||
- Similarity computation using cosine similarity
|
||||
- Error handling for token limit violations
|
||||
|
||||
2. Implemented the markdown segmenter in markdown_segmenter.py:
|
||||
- Segmentation of markdown documents using Jina AI's Segmenter API
|
||||
- Command-line interface for easy usage
|
||||
|
||||
3. Developed a test script (test_similarity.py) with:
|
||||
- Command-line argument parsing
|
||||
- File reading functionality
|
||||
- Verbose output option for debugging
|
||||
- Error handling
|
||||
|
||||
4. Created sample files for testing:
|
||||
- sample_chunk.txt: Contains a paragraph about pangrams
|
||||
- sample_query.txt: Contains a question about pangrams
|
||||
|
||||
### Insights
|
||||
- Jina AI's embedding model (jina-embeddings-v3) provides high-quality embeddings for semantic search
|
||||
- The token limit of 8,192 tokens is sufficient for most use cases, but longer documents need segmentation
|
||||
- Normalizing embeddings simplifies similarity computation (dot product equals cosine similarity)
|
||||
- Separating segmentation from similarity computation provides better modularity
|
||||
|
||||
### Challenges
|
||||
- Ensuring proper error handling for API failures
|
||||
- Managing token limits for large documents
|
||||
- Balancing between chunking granularity and semantic coherence
|
||||
|
||||
### Next Steps
|
||||
1. Add tiktoken to requirements.txt
|
||||
2. Implement caching for embeddings to reduce API calls
|
||||
3. Add batch processing capabilities for multiple chunks/queries
|
||||
4. Create comprehensive documentation and usage examples
|
||||
5. Develop integration tests for reliability testing
|
||||
|
||||
## Session: 2025-02-27 (Update)
|
||||
|
||||
### Overview
|
||||
Created memory bank for the project to maintain persistent knowledge about the codebase and development progress.
|
||||
|
||||
### Key Activities
|
||||
1. Created the `.note/` directory to store memory bank files
|
||||
2. Created the following memory bank files:
|
||||
- project_overview.md: Purpose, goals, and high-level architecture
|
||||
- current_focus.md: Active work, recent changes, and next steps
|
||||
- development_standards.md: Coding conventions and patterns
|
||||
- decision_log.md: Key decisions with rationale
|
||||
- code_structure.md: Codebase organization with module descriptions
|
||||
- session_log.md: History of development sessions
|
||||
- interfaces.md: Component interfaces and API documentation
|
||||
|
||||
### Insights
|
||||
- The project has a clear structure with well-defined components
|
||||
- The use of Jina AI's APIs provides powerful semantic search capabilities
|
||||
- The modular design allows for easy extension and maintenance
|
||||
- Some improvements are needed, such as adding tiktoken to requirements.txt
|
||||
|
||||
### Next Steps
|
||||
1. Update requirements.txt to include all dependencies (tiktoken)
|
||||
2. Implement caching mechanism for embeddings
|
||||
3. Add batch processing capabilities
|
||||
4. Create comprehensive documentation
|
||||
5. Develop integration tests
|
||||
|
||||
## Session: 2025-02-27 (Update 2)
|
||||
|
||||
### Overview
|
||||
Expanded the project scope to build a comprehensive intelligent research system with an 8-stage pipeline.
|
||||
|
||||
### Key Activities
|
||||
1. Defined the overall architecture for the intelligent research system:
|
||||
- 8-stage pipeline from query acceptance to report generation
|
||||
- Multiple search sources (Google, Serper, Jina Search, Google Scholar, arXiv)
|
||||
- Semantic processing using Jina AI's APIs
|
||||
|
||||
2. Updated the memory bank to reflect the broader vision:
|
||||
- Revised project_overview.md with the complete research system goals
|
||||
- Updated current_focus.md with next steps for each pipeline stage
|
||||
- Enhanced code_structure.md with planned project organization
|
||||
- Added new decisions to decision_log.md
|
||||
|
||||
### Insights
|
||||
- The modular pipeline architecture allows for incremental development
|
||||
- Jina AI's suite of APIs provides a consistent approach to semantic processing
|
||||
- Multiple search sources will provide more comprehensive research results
|
||||
- The current similarity components fit naturally into stages 6-7 of the pipeline
|
||||
|
||||
### Next Steps
|
||||
1. Begin implementing the query processing module (stage 1)
|
||||
2. Design the data structures for passing information between pipeline stages
|
||||
3. Create a project roadmap with milestones for each stage
|
||||
4. Prioritize development of core components for an end-to-end MVP
|
||||
|
||||
## Session: 2025-02-27 (Update 3)
|
||||
|
||||
### Overview
|
||||
Planned the implementation of the Query Processing Module with LiteLLM integration and Gradio UI.
|
||||
|
||||
### Key Activities
|
||||
1. Researched LiteLLM integration:
|
||||
- Explored LiteLLM documentation and usage patterns
|
||||
- Investigated integration with Gradio for UI development
|
||||
- Identified configuration requirements and best practices
|
||||
|
||||
2. Developed implementation plan:
|
||||
- Prioritized Query Processing Module with LiteLLM integration
|
||||
- Planned Gradio UI implementation for user interaction
|
||||
- Outlined configuration structure for API keys and settings
|
||||
- Established a sequence for implementing remaining modules
|
||||
|
||||
3. Updated memory bank:
|
||||
- Revised current_focus.md with new implementation plan
|
||||
- Added immediate and future steps for development
|
||||
|
||||
### Insights
|
||||
- LiteLLM provides a unified interface to multiple LLM providers, simplifying integration
|
||||
- Gradio offers an easy way to create interactive UIs for AI applications
|
||||
- The modular approach allows for incremental development and testing
|
||||
- Existing similarity components can be integrated into the pipeline at a later stage
|
||||
|
||||
### Next Steps
|
||||
1. Update requirements.txt with new dependencies (litellm, gradio, etc.)
|
||||
2. Create configuration structure for secure API key management
|
||||
3. Implement LiteLLM interface for query enhancement and classification
|
||||
4. Develop the query processor with structured output
|
||||
5. Build the Gradio UI for user interaction
|
||||
|
||||
## Session: 2025-02-27 (Update 4)
|
||||
|
||||
### Overview
|
||||
Implemented module-specific model configuration and created the Jina AI Reranker module.
|
||||
|
||||
### Key Activities
|
||||
1. Enhanced configuration structure:
|
||||
- Added support for module-specific model assignments
|
||||
- Configured different models for different tasks
|
||||
- Added detailed endpoint configurations for various providers
|
||||
|
||||
2. Updated LLMInterface:
|
||||
- Modified to support module-specific model configurations
|
||||
- Added support for different endpoint types (OpenAI, Azure, Ollama)
|
||||
- Implemented method delegation to use appropriate models for each task
|
||||
|
||||
3. Created Jina AI Reranker module:
|
||||
- Implemented document reranking using Jina AI's Reranker API
|
||||
- Added support for reranking documents with metadata
|
||||
- Configured to use the "jina-reranker-v2-base-multilingual" model
|
||||
|
||||
### Insights
|
||||
- Using different models for different tasks allows for optimizing performance and cost
|
||||
- Jina's reranker provides a specialized solution for document ranking
|
||||
- The modular approach allows for easy swapping of components and models
|
||||
|
||||
### Next Steps
|
||||
1. Implement the remaining query processing components
|
||||
2. Create the Gradio UI for user interaction
|
||||
3. Test the full system with end-to-end workflows
|
||||
|
||||
## Session: 2025-02-27 (Update 5)
|
||||
|
||||
### Overview
|
||||
Added support for OpenRouter and Groq as LLM providers and configured the system to use Groq for testing.
|
||||
|
||||
### Key Activities
|
||||
1. **Jina Reranker API Integration**:
|
||||
- Updated the `rerank` method in the JinaReranker class to match the expected API request format
|
||||
- Modified the request payload to send an array of plain string documents instead of objects
|
||||
- Enhanced response processing to handle both current and older API response formats
|
||||
- Added detailed logging for API requests and responses for better debugging
|
||||
|
||||
2. **Testing Improvements**:
|
||||
- Created a simplified test script (`test_simple_reranker.py`) to isolate and test the reranker functionality
|
||||
- Updated the main test script to focus on core functionality without complex dependencies
|
||||
- Implemented JSON result saving for better analysis of reranker output
|
||||
- Added proper error handling in tests to provide clear feedback on issues
|
||||
|
||||
3. **Code Quality Enhancements**:
|
||||
- Improved error handling throughout the reranker implementation
|
||||
- Added informative debug messages at key points in the execution flow
|
||||
- Ensured backward compatibility with previous API response formats
|
||||
- Documented the expected request and response structures
|
||||
|
||||
### Insights and Learnings
|
||||
- The Jina Reranker API expects documents as an array of plain strings, not objects with a "text" field
|
||||
- The reranker response format includes a "document" field in the results which may contain either the text directly or an object with a "text" field
|
||||
- Proper error handling and debug output are crucial for diagnosing issues with external API integrations
|
||||
- Isolating components for testing makes debugging much more efficient
|
||||
|
||||
### Challenges
|
||||
- Adapting to changes in the Jina Reranker API response format
|
||||
- Ensuring backward compatibility with older response formats
|
||||
- Debugging nested API response structures
|
||||
- Managing environment variables and configuration consistently across test scripts
|
||||
|
||||
### Next Steps
|
||||
1. **Expand Testing**: Develop more comprehensive test cases for the reranker with diverse document types
|
||||
2. **Integration**: Ensure the reranker is properly integrated with the result collector for end-to-end functionality
|
||||
3. **Documentation**: Update API documentation to reflect the latest changes to the reranker implementation
|
||||
4. **UI Integration**: Add reranker configuration options to the Gradio interface
|
||||
|
||||
## Session: 2025-02-27 - Report Generation Module Planning
|
||||
|
||||
### Overview
|
||||
In this session, we focused on planning the Report Generation module, designing a comprehensive implementation approach, and making key decisions about document scraping, storage, and processing.
|
||||
|
||||
### Key Activities
|
||||
1. **Designed a Phased Implementation Plan**:
|
||||
- Created a four-phase implementation plan for the Report Generation module
|
||||
- Phase 1: Document Scraping and Storage
|
||||
- Phase 2: Document Prioritization and Chunking
|
||||
- Phase 3: Report Generation
|
||||
- Phase 4: Advanced Features
|
||||
- Documented the plan in the memory bank for future reference
|
||||
|
||||
2. **Made Key Design Decisions**:
|
||||
- Decided to use Jina Reader for web scraping due to its clean content extraction capabilities
|
||||
- Chose SQLite for document storage to ensure persistence and efficient querying
|
||||
- Designed a database schema with Documents and Metadata tables
|
||||
- Planned a token budget management system to handle context window limitations
|
||||
- Decided on a map-reduce approach for processing large document collections
|
||||
|
||||
3. **Addressed Context Window Limitations**:
|
||||
- Evaluated Groq's Llama 3.3 70B Versatile model's 128K context window
|
||||
- Designed document prioritization strategies based on relevance scores
|
||||
- Planned chunking strategies for handling long documents
|
||||
- Considered alternative models with larger context windows for future implementation
|
||||
|
||||
4. **Updated Documentation**:
|
||||
- Added the implementation plan to the memory bank
|
||||
- Updated the decision log with rationale for key decisions
|
||||
- Revised the current focus to reflect the new implementation priorities
|
||||
- Added a new session log entry to document the planning process
|
||||
|
||||
### Insights
|
||||
- A phased implementation approach allows for incremental development and testing
|
||||
- SQLite provides a good balance of simplicity and functionality for document storage
|
||||
- Jina Reader integrates well with our existing Jina components (embeddings, reranker)
|
||||
- The map-reduce pattern enables processing of unlimited document collections despite context window limitations
|
||||
- Document prioritization is crucial for ensuring the most relevant content is included in reports
|
||||
|
||||
### Challenges
|
||||
- Managing the 128K context window limitation with potentially large document collections
|
||||
- Balancing between document coverage and report quality
|
||||
- Ensuring efficient web scraping without overwhelming target websites
|
||||
- Designing a flexible architecture that can accommodate different models and approaches
|
||||
|
||||
### Next Steps
|
||||
1. Begin implementing Phase 1 of the Report Generation module:
|
||||
- Set up the SQLite database with the designed schema
|
||||
- Implement the Jina Reader integration for web scraping
|
||||
- Create the document processing pipeline
|
||||
- Develop URL validation and normalization functionality
|
||||
- Add caching and deduplication for scraped content
|
||||
|
||||
2. Plan for Phase 2 implementation:
|
||||
- Design the token budget management system
|
||||
- Develop document prioritization algorithms
|
||||
- Create chunking strategies for long documents
|
||||
|
||||
## Session: 2025-02-27 - Report Generation Module Implementation (Phase 1)
|
||||
|
||||
### Overview
|
||||
In this session, we implemented Phase 1 of the Report Generation module, focusing on document scraping and SQLite storage. We created the necessary components for scraping web pages, storing their content in a SQLite database, and retrieving documents for report generation.
|
||||
|
||||
### Key Activities
|
||||
1. **Created Database Manager**:
|
||||
- Implemented a SQLite database manager with tables for documents and metadata
|
||||
- Added full CRUD operations for documents
|
||||
- Implemented transaction handling for data integrity
|
||||
- Created methods for document search and retrieval
|
||||
- Used aiosqlite for asynchronous database operations
|
||||
|
||||
2. **Implemented Document Scraper**:
|
||||
- Created a document scraper with Jina Reader API integration
|
||||
- Added fallback mechanism using BeautifulSoup for when Jina API fails
|
||||
- Implemented URL validation and normalization
|
||||
- Added content conversion to Markdown format
|
||||
- Implemented token counting using tiktoken
|
||||
- Created metadata extraction from HTML content
|
||||
- Added document deduplication using content hashing
|
||||
|
||||
3. **Developed Report Generator Base**:
|
||||
- Created the basic structure for the report generation process
|
||||
- Implemented methods to process search results by scraping URLs
|
||||
- Integrated with the database manager and document scraper
|
||||
- Set up the foundation for future phases
|
||||
|
||||
4. **Created Test Script**:
|
||||
- Developed a test script to verify functionality
|
||||
- Tested document scraping, storage, and retrieval
|
||||
- Verified search functionality within the database
|
||||
- Ensured proper error handling and fallback mechanisms
|
||||
|
||||
### Insights
|
||||
- The fallback mechanism for document scraping is crucial, as the Jina Reader API may not always be available or may fail for certain URLs
|
||||
- Asynchronous processing significantly improves performance when scraping multiple URLs
|
||||
- Content hashing is an effective way to prevent duplicate documents in the database
|
||||
- Storing metadata separately from document content provides flexibility for future enhancements
|
||||
- The SQLite database provides a good balance of simplicity and functionality for document storage
|
||||
|
||||
### Challenges
|
||||
- Handling different HTML structures across websites for metadata extraction
|
||||
- Managing asynchronous operations and error handling
|
||||
- Ensuring proper transaction handling for database operations
|
||||
- Balancing between clean content extraction and preserving important information
|
||||
|
||||
### Next Steps
|
||||
1. **Integration with Search Execution**:
|
||||
- Connect the report generation module to the search execution pipeline
|
||||
- Implement automatic processing of search results
|
||||
|
||||
2. **Begin Phase 2 Implementation**:
|
||||
- Develop document prioritization based on relevance scores
|
||||
- Implement chunking strategies for long documents
|
||||
- Create token budget management system
|
||||
|
||||
3. **Testing and Refinement**:
|
||||
- Create more comprehensive tests for edge cases
|
||||
- Refine error handling and logging
|
||||
- Optimize performance for large numbers of documents
|
||||
|
||||
## Session: 2025-02-27 (Update)
|
||||
|
||||
### Overview
|
||||
Implemented Phase 3 of the Report Generation module, focusing on report synthesis using LLMs with a map-reduce approach.
|
||||
|
||||
### Key Activities
|
||||
1. **Created Report Synthesis Module**:
|
||||
- Implemented the `ReportSynthesizer` class for generating reports using Groq's Llama 3.3 70B model
|
||||
- Created a map-reduce approach for processing document chunks:
|
||||
- Map phase: Extract key information from individual chunks
|
||||
- Reduce phase: Synthesize extracted information into a coherent report
|
||||
- Added support for different query types (factual, exploratory, comparative)
|
||||
- Implemented automatic query type detection based on query text
|
||||
- Added citation generation and reference management
|
||||
|
||||
2. **Updated Report Generator**:
|
||||
- Integrated the new report synthesis module with the existing report generator
|
||||
- Replaced the placeholder report generation with the new LLM-based synthesis
|
||||
- Added proper error handling and logging throughout the process
|
||||
|
||||
3. **Created Test Scripts**:
|
||||
- Developed a dedicated test script for the report synthesis functionality
|
||||
- Implemented tests with both sample data and real URLs
|
||||
- Added support for mock data to avoid API dependencies during testing
|
||||
- Verified end-to-end functionality from document scraping to report generation
|
||||
|
||||
4. **Fixed LLM Integration Issues**:
|
||||
- Corrected the model name format for Groq provider by prefixing it with 'groq/'
|
||||
- Improved error handling for API failures
|
||||
- Added proper logging for the map-reduce process
|
||||
|
||||
### Insights
|
||||
- The map-reduce approach is effective for processing large amounts of document data
|
||||
- Different query types benefit from specialized report templates
|
||||
- Groq's Llama 3.3 70B model produces high-quality reports with good coherence and factual accuracy
|
||||
- Proper citation management is essential for creating trustworthy reports
|
||||
- Automatic query type detection works well for common query patterns
|
||||
|
||||
### Challenges
|
||||
- Managing API errors and rate limits with external LLM providers
|
||||
- Ensuring consistent formatting across different report sections
|
||||
- Balancing between report comprehensiveness and token usage
|
||||
- Handling edge cases where document chunks contain irrelevant information
|
||||
|
||||
### Next Steps
|
||||
1. Implement support for alternative models with larger context windows
|
||||
2. Develop progressive report generation for very large research tasks
|
||||
3. Create visualization components for data mentioned in reports
|
||||
4. Add interactive elements to the generated reports
|
||||
5. Implement report versioning and comparison
|
||||
|
||||
## Session: 2025-02-27 (Update 2)
|
||||
|
||||
### Overview
|
||||
Successfully tested the end-to-end query to report pipeline with a specific query about the environmental and economic impact of electric vehicles, and fixed an issue with the Jina reranker integration.
|
||||
|
||||
### Key Activities
|
||||
1. **Fixed Jina Reranker Integration**:
|
||||
- Corrected the import statement in query_to_report.py to use the proper function name (get_jina_reranker)
|
||||
- Updated the reranker call to properly format the results for the JinaReranker
|
||||
- Implemented proper extraction of text from search results for reranking
|
||||
- Added mapping of reranked indices back to the original results
|
||||
|
||||
2. **Created EV Query Test Script**:
|
||||
- Developed a dedicated test script (test_ev_query.py) for testing the pipeline with a query about electric vehicles
|
||||
- Configured the script to use 7 results per search engine for a comprehensive report
|
||||
- Added proper error handling and result display
|
||||
|
||||
3. **Tested End-to-End Pipeline**:
|
||||
- Successfully executed the full query to report workflow
|
||||
- Verified that all components (query processor, search executor, reranker, report generator) work together seamlessly
|
||||
- Generated a comprehensive report on the environmental and economic impact of electric vehicles
|
||||
|
||||
4. **Identified Report Detail Configuration Options**:
|
||||
- Documented multiple ways to adjust the level of detail in generated reports
|
||||
- Identified parameters that can be modified to control report comprehensiveness
|
||||
- Created a plan for implementing customizable report detail levels
|
||||
|
||||
### Insights
|
||||
- The end-to-end pipeline successfully connects all major components of the system
|
||||
- The Jina reranker significantly improves the relevance of search results for report generation
|
||||
- The map-reduce approach effectively processes document chunks into a coherent report
|
||||
- Some document sources (like ScienceDirect and ResearchGate) may require special handling due to access restrictions
|
||||
|
||||
### Challenges
|
||||
- Handling API errors and access restrictions for certain document sources
|
||||
- Ensuring proper formatting of data between different components
|
||||
- Managing the processing of a large number of document chunks efficiently
|
||||
|
||||
### Next Steps
|
||||
1. **Implement Customizable Report Detail Levels**:
|
||||
- Develop a system to allow users to select different levels of detail for generated reports
|
||||
- Integrate the customizable detail levels into the report generator
|
||||
- Test the new feature with various query types
|
||||
|
||||
2. **Add Support for Alternative Models**:
|
||||
- Research and implement support for alternative models with larger context windows
|
||||
- Test the new models with the report generation pipeline
|
||||
|
||||
3. **Develop Progressive Report Generation**:
|
||||
- Design and implement a system for progressive report generation
|
||||
- Test the new feature with very large research tasks
|
||||
|
||||
4. **Create Visualization Components**:
|
||||
- Develop visualization components for data mentioned in reports
|
||||
- Integrate the visualization components into the report generator
|
||||
|
||||
5. **Add Interactive Elements**:
|
||||
- Develop interactive elements for the generated reports
|
||||
- Integrate the interactive elements into the report generator
|
||||
|
||||
## Session: 2025-02-28
|
||||
|
||||
### Overview
|
||||
Implemented customizable report detail levels for the Report Generation Module, allowing users to select different levels of detail for generated reports.
|
||||
|
||||
### Key Activities
|
||||
1. **Created Report Detail Levels Module**:
|
||||
- Implemented a new module `report_detail_levels.py` with an enum for detail levels (Brief, Standard, Detailed, Comprehensive)
|
||||
- Created a `ReportDetailLevelManager` class to manage detail level configurations
|
||||
- Defined specific parameters for each detail level (num_results, token_budget, chunk_size, overlap_size, model)
|
||||
- Added methods to validate and retrieve detail level configurations
|
||||
|
||||
2. **Updated Report Synthesis Module**:
|
||||
- Modified the `ReportSynthesizer` class to accept and use detail level parameters
|
||||
- Updated synthesis templates to adapt based on the selected detail level
|
||||
- Adjusted the map-reduce process to handle different levels of detail
|
||||
- Implemented model selection based on detail level requirements
|
||||
|
||||
3. **Enhanced Report Generator**:
|
||||
- Added methods to set and get detail levels in the `ReportGenerator` class
|
||||
- Updated the document preparation process to use detail level configurations
|
||||
- Modified the report generation workflow to incorporate detail level settings
|
||||
- Implemented validation for detail level parameters
|
||||
|
||||
4. **Updated Query to Report Script**:
|
||||
- Added command-line arguments for detail level selection
|
||||
- Implemented a `--list-detail-levels` option to display available options
|
||||
- Updated the main workflow to pass detail level parameters to the report generator
|
||||
- Added documentation for the new parameters
|
||||
|
||||
5. **Created Test Scripts**:
|
||||
- Updated `test_ev_query.py` to support detail level selection
|
||||
- Created a new `test_detail_levels.py` script to generate reports with all detail levels for comparison
|
||||
- Added metrics collection (timing, report size, word count) for comparison
|
||||
|
||||
### Insights
|
||||
- Different detail levels significantly affect report length, depth, and generation time
|
||||
- The brief level is useful for quick summaries, while comprehensive provides exhaustive information
|
||||
- Using different models for different detail levels offers a good balance between speed and quality
|
||||
- Configuring multiple parameters (num_results, token_budget, etc.) together creates a coherent detail level experience
|
||||
|
||||
### Challenges
|
||||
- Ensuring that the templates produce appropriate output for each detail level
|
||||
- Balancing between speed and quality for different detail levels
|
||||
- Managing token budgets effectively across different detail levels
|
||||
- Ensuring backward compatibility with existing code
|
||||
|
||||
### Next Steps
|
||||
1. Conduct thorough testing of the detail level features with various query types
|
||||
2. Gather user feedback on the quality and usefulness of reports at different detail levels
|
||||
3. Refine the detail level configurations based on testing and feedback
|
||||
4. Implement progressive report generation for very large research tasks
|
||||
5. Develop visualization components for data mentioned in reports
|
||||
|
||||
## Session: 2025-02-28 - Enhanced Report Detail Levels
|
||||
|
||||
### Overview
|
||||
In this session, we enhanced the report detail levels to focus more on analytical depth rather than just adding additional sections. We improved the document chunk processing to extract more meaningful information from each chunk for detailed and comprehensive reports.
|
||||
|
||||
### Key Activities
|
||||
1. **Enhanced Template Modifiers for Detailed and Comprehensive Reports**:
|
||||
- Rewrote the template modifiers to focus on analytical depth, evidence density, and perspective diversity
|
||||
- Added explicit instructions to prioritize depth over breadth
|
||||
- Emphasized multi-layered analysis, causal relationships, and interconnections
|
||||
- Added instructions for exploring second and third-order effects
|
||||
|
||||
2. **Improved Document Chunk Processing**:
|
||||
- Created a new `_get_extraction_prompt` method that provides different extraction prompts based on detail level
|
||||
- For DETAILED reports: Added focus on underlying principles, causal relationships, and different perspectives
|
||||
- For COMPREHENSIVE reports: Added focus on multi-layered analysis, complex causal networks, and theoretical frameworks
|
||||
- Modified the `map_document_chunks` method to pass the detail level parameter
|
||||
|
||||
3. **Enhanced MapReduce Approach**:
|
||||
- Updated the map phase to use detail-level-specific extraction prompts
|
||||
- Ensured the detail level parameter is passed throughout the process
|
||||
- Maintained the efficient processing of document chunks while improving the quality of extraction
|
||||
|
||||
### Insights
|
||||
- The MapReduce approach is well-suited for LLM-based report generation, allowing processing of more information than would fit in a single context window
|
||||
- Different extraction prompts for different detail levels significantly affect the quality and depth of the extracted information
|
||||
- Focusing on analytical depth rather than additional sections provides more value to the end user
|
||||
- The enhanced prompts guide the LLM to provide deeper analysis of causal relationships, underlying mechanisms, and interconnections
|
||||
|
||||
### Challenges
|
||||
- Balancing between depth and breadth in detailed reports
|
||||
- Ensuring that the extraction prompts extract the most relevant information for each detail level
|
||||
- Managing the increased processing time for detailed and comprehensive reports with enhanced extraction
|
||||
|
||||
### Next Steps
|
||||
1. Conduct thorough testing of the enhanced detail level features with various query types
|
||||
2. Compare the analytical depth and quality of reports generated with the new prompts
|
||||
3. Gather user feedback on the improved reports at different detail levels
|
||||
4. Explore parallel processing for the map phase to reduce overall report generation time
|
||||
5. Further refine the detail level configurations based on testing and feedback
|
||||
|
||||
## Session: 2025-02-28 - Gradio UI Enhancements and Future Planning
|
||||
|
||||
### Overview
|
||||
In this session, we fixed issues in the Gradio UI for report generation and planned future enhancements to improve search quality and user experience.
|
||||
|
||||
### Key Activities
|
||||
1. **Fixed Gradio UI for Report Generation**:
|
||||
- Updated the `generate_report` method in the Gradio UI to properly process queries and generate structured queries
|
||||
- Integrated the `QueryProcessor` to create structured queries from user input
|
||||
- Fixed method calls and parameter passing to the `execute_search` method
|
||||
- Implemented functionality to process `<thinking>` tags in the generated report
|
||||
- Added support for custom model selection in the UI
|
||||
- Updated the interfaces documentation to include ReportGenerator and ReportDetailLevelManager interfaces
|
||||
|
||||
2. **Planned Future Enhancements**:
|
||||
- **Multiple Query Variation Generation**:
|
||||
- Designed an approach to generate several similar queries with different keywords for better search coverage
|
||||
- Planned modifications to the QueryProcessor and SearchExecutor to handle multiple queries
|
||||
- Estimated this as a moderate difficulty task (3-4 days of work)
|
||||
|
||||
- **Threshold-Based Reranking with Larger Document Sets**:
|
||||
- Developed a plan to process more initial documents and use reranking to select the most relevant ones
|
||||
- Designed new detail level configuration parameters for initial and final result counts
|
||||
- Estimated this as an easy to moderate difficulty task (2-3 days of work)
|
||||
|
||||
- **UI Progress Indicators**:
|
||||
- Identified the need for chunk processing progress indicators in the UI
|
||||
- Planned modifications to report_synthesis.py to add logging during document processing
|
||||
- Estimated this as a simple enhancement (15-30 minutes of work)
|
||||
|
||||
### Insights
|
||||
- The modular architecture of the system makes it easy to extend with new features
|
||||
- Providing progress indicators during report generation would significantly improve user experience
|
||||
- Generating multiple query variations could substantially improve search coverage and result quality
|
||||
- Using a two-stage approach (fetch more, then filter) for document retrieval would likely improve report quality
|
||||
|
||||
### Challenges
|
||||
- Balancing between fetching enough documents for comprehensive coverage and maintaining performance
|
||||
- Ensuring proper deduplication when using multiple query variations
|
||||
- Managing the increased API usage that would result from processing more queries and documents
|
||||
|
||||
### Next Steps
|
||||
1. Implement the chunk processing progress indicators as a quick win
|
||||
2. Begin work on the multiple query variation generation feature
|
||||
3. Test the current implementation with various query types to identify any remaining issues
|
||||
4. Update the documentation to reflect the new features and future plans
|
||||
|
||||
## Session: 2025-02-28: Google Gemini Integration and Reference Formatting
|
||||
|
||||
### Overview
|
||||
Fixed the integration of Google Gemini models with LiteLLM, and fixed reference formatting issues.
|
||||
|
||||
### Key Activities
|
||||
1. **Fixed Google Gemini Integration**:
|
||||
- Updated the model format to `gemini/gemini-2.0-flash` in config.yaml
|
||||
- Modified message formatting for Gemini models in LLM interface
|
||||
- Added proper handling for the 'gemini' provider in environment variable setup
|
||||
|
||||
2. **Fixed Reference Formatting Issues**:
|
||||
- Enhanced the instructions for reference formatting to ensure URLs are included
|
||||
- Added a recovery mechanism for truncated references
|
||||
- Improved context preparation to better extract URLs for references
|
||||
|
||||
3. **Converted LLM Interface Methods to Async**:
|
||||
- Made `generate_completion`, `classify_query`, and `enhance_query` methods async
|
||||
- Updated dependent code to properly await these methods
|
||||
- Fixed runtime errors related to async/await patterns
|
||||
|
||||
### Key Insights
|
||||
- Gemini models require special message formatting (using 'user' and 'model' roles instead of 'system' and 'assistant')
|
||||
- References were getting cut off due to token limits, requiring a separate generation step
|
||||
- The async conversion was necessary to properly handle async LLM calls throughout the codebase
|
||||
|
||||
### Challenges
|
||||
- Ensuring that the templates produce appropriate output for each detail level
|
||||
- Balancing between speed and quality for different detail levels
|
||||
- Managing token budgets effectively across different detail levels
|
||||
- Ensuring backward compatibility with existing code
|
||||
|
||||
### Next Steps
|
||||
1. Continue testing with Gemini models to ensure stable operation
|
||||
2. Consider adding more robust error handling for LLM provider-specific issues
|
||||
3. Improve the reference formatting further if needed
|
||||
|
||||
## Session: 2025-02-28: Fixing Reference Formatting and Async Implementation
|
||||
|
||||
### Overview
|
||||
Fixed reference formatting issues with Gemini models and updated the codebase to properly handle async methods.
|
||||
|
||||
### Key Activities
|
||||
1. **Enhanced Reference Formatting**:
|
||||
- Improved instructions to emphasize including URLs for each reference
|
||||
- Added duplicate URL fields in the context to ensure URLs are captured
|
||||
- Updated the reference generation prompt to explicitly request URLs
|
||||
- Added a separate reference generation step to handle truncated references
|
||||
|
||||
2. **Fixed Async Implementation**:
|
||||
- Converted all LLM interface methods to async for proper handling
|
||||
- Updated QueryProcessor's generate_search_queries method to be async
|
||||
- Modified query_to_report.py to correctly await async methods
|
||||
- Fixed runtime errors related to async/await patterns
|
||||
|
||||
3. **Updated Gradio Interface**:
|
||||
- Modified the generate_report method to properly handle async operations
|
||||
- Updated the report button click handler to correctly pass parameters
|
||||
- Fixed the parameter order in the lambda function for async execution
|
||||
- Improved error handling in the UI
|
||||
|
||||
## Session: 2025-03-11
|
||||
|
||||
### Overview
|
||||
|
||||
Reorganized the project directory structure to improve maintainability and clarity, ensuring all components are properly organized into their respective directories.
|
||||
|
||||
### Key Activities
|
||||
|
||||
1. **Directory Structure Reorganization**:
|
||||
|
||||
- Created a dedicated `utils/` directory for utility scripts
|
||||
- Moved `jina_similarity.py` to `utils/`
|
||||
- Added `__init__.py` to make it a proper Python package
|
||||
- Organized test files into subdirectories under `tests/`
|
||||
- Created subdirectories for each module (query, execution, ranking, report, ui, integration)
|
||||
- Added `__init__.py` files to all test directories
|
||||
- Created an `examples/` directory with subdirectories for data and scripts
|
||||
- Moved sample data to `examples/data/`
|
||||
- Added `__init__.py` files to make them proper Python packages
|
||||
- Added a dedicated `scripts/` directory for utility scripts
|
||||
- Moved `query_to_report.py` to `scripts/`
|
||||
|
||||
2. **Pipeline Verification**:
|
||||
|
||||
- Tested the pipeline after reorganization to ensure functionality
|
||||
- Verified that the UI works correctly with the new directory structure
|
||||
- Confirmed that all imports are working properly with the new structure
|
||||
|
||||
3. **Embedding Usage Analysis**:
|
||||
|
||||
- Confirmed that the pipeline uses Jina AI's Embeddings API through the `JinaSimilarity` class
|
||||
- Verified that the `JinaReranker` class uses embeddings for document reranking
|
||||
- Analyzed how embeddings are integrated into the search and ranking process
|
||||
|
||||
### Insights
|
||||
|
||||
- A well-organized directory structure significantly improves code maintainability and readability
|
||||
- Using proper Python package structure with `__init__.py` files ensures clean imports
|
||||
- Separating tests, utilities, examples, and scripts into dedicated directories makes the codebase more navigable
|
||||
- The Jina AI embeddings are used throughout the pipeline for semantic similarity and document reranking
|
||||
|
||||
### Challenges
|
||||
|
||||
- Ensuring all import statements are updated correctly after moving files
|
||||
- Maintaining backward compatibility with existing code
|
||||
- Verifying that all components still work together after reorganization
|
||||
|
||||
### Next Steps
|
||||
|
||||
1. Run comprehensive tests to ensure all functionality works with the new directory structure
|
||||
2. Update any remaining documentation to reflect the new directory structure
|
||||
3. Consider moving the remaining test files in the root of the `tests/` directory to appropriate subdirectories
|
||||
4. Review import statements throughout the codebase to ensure they follow the new structure
|
||||
|
||||
### Key Insights
|
||||
- Async/await patterns need to be consistently applied throughout the codebase
|
||||
- Reference formatting requires explicit instructions to include URLs
|
||||
- Gradio's interface needs special handling for async functions
|
||||
|
||||
### Challenges
|
||||
- Ensuring that all async methods are properly awaited
|
||||
- Balancing between detailed instructions and token limits for reference generation
|
||||
- Managing the increased processing time for async operations
|
||||
|
||||
### Next Steps
|
||||
1. Continue testing with Gemini models to ensure stable operation
|
||||
2. Consider adding more robust error handling for LLM provider-specific issues
|
||||
3. Improve the reference formatting further if needed
|
||||
4. Update documentation to reflect the changes made to the LLM interface
|
||||
5. Consider adding more unit tests for the async methods
|
||||
|
||||
## Session: 2025-02-28: Fixed NoneType Error in Report Synthesis
|
||||
|
||||
### Issue
|
||||
Encountered an error during report generation:
|
||||
```
|
||||
TypeError: 'NoneType' object is not subscriptable
|
||||
```
|
||||
|
||||
The error occurred in the `map_document_chunks` method of the `ReportSynthesizer` class when trying to slice a title that was `None`.
|
||||
|
||||
### Changes Made
|
||||
1. Fixed the chunk counter in `map_document_chunks` method:
|
||||
- Used a separate counter for individual chunks instead of using the batch index
|
||||
- Added a null check for chunk titles with a fallback to 'Untitled'
|
||||
|
||||
2. Added defensive code in `synthesize_report` method:
|
||||
- Added code to ensure all chunks have a title before processing
|
||||
- Added null checks for title fields
|
||||
|
||||
3. Updated the `DocumentProcessor` class:
|
||||
- Modified `process_documents_for_report` to ensure all chunks have a title
|
||||
- Updated `chunk_document_by_sections`, `chunk_document_fixed_size`, and `chunk_document_hierarchical` methods to handle None titles
|
||||
- Added default 'Untitled' value for all title fields
|
||||
|
||||
### Testing
|
||||
The changes were tested with a report generation task that previously failed, and the error was resolved.
|
||||
|
||||
### Next Steps
|
||||
1. Consider adding more comprehensive null checks throughout the codebase
|
||||
2. Add unit tests to verify proper handling of missing or null fields
|
||||
3. Implement better error handling and recovery mechanisms
|
||||
|
||||
## Session: 2025-03-11
|
||||
|
||||
### Overview
|
||||
Focused on resolving issues with the report generation template system and ensuring that different detail levels and query types work correctly in the report synthesis process.
|
||||
|
||||
### Key Activities
|
||||
1. **Fixed Template Retrieval Issues**:
|
||||
- Updated the `get_template` method in the `ReportTemplateManager` to ensure it retrieves templates correctly based on query type and detail level
|
||||
- Implemented a helper method `_get_template_from_strings` in the `ReportSynthesizer` to convert string values for query types and detail levels to their respective enum objects
|
||||
- Added better logging for template retrieval process to aid in debugging
|
||||
|
||||
2. **Tested All Detail Levels and Query Types**:
|
||||
- Created a comprehensive test script `test_all_detail_levels.py` to test all combinations of detail levels and query types
|
||||
- Successfully tested all detail levels (brief, standard, detailed, comprehensive) with factual queries
|
||||
- Successfully tested all detail levels with exploratory queries
|
||||
- Successfully tested all detail levels with comparative queries
|
||||
|
||||
3. **Improved Error Handling**:
|
||||
- Added fallback to standard templates if specific templates are not found
|
||||
- Enhanced logging to track whether templates are found during the synthesis process
|
||||
|
||||
4. **Code Organization**:
|
||||
- Removed duplicate `ReportTemplateManager` and `ReportTemplate` classes from `report_synthesis.py`
|
||||
- Used the imported versions from `report_templates.py` for better code maintainability
|
||||
|
||||
### Insights
|
||||
- The template system is now working correctly for all combinations of query types and detail levels
|
||||
- Proper logging is essential for debugging template retrieval issues
|
||||
- Converting string values to enum objects is necessary for consistent template retrieval
|
||||
- Having a dedicated test script for all combinations helps ensure comprehensive coverage
|
||||
|
||||
### Challenges
|
||||
- Initially encountered issues where templates were not found during report synthesis, leading to `ValueError`
|
||||
- Needed to ensure that the correct classes and methods were used for template retrieval
|
||||
|
||||
### Next Steps
|
||||
1. Conduct additional testing with real-world queries and document sets
|
||||
2. Compare the analytical depth and quality of reports generated with different detail levels
|
||||
3. Gather user feedback on the improved reports at different detail levels
|
||||
4. Further refine the detail level configurations based on testing and feedback
|
||||
|
||||
## Session: 2025-03-12 - Report Templates and Progressive Report Generation
|
||||
|
||||
### Overview
|
||||
Implemented a dedicated report templates module to standardize report generation across different query types and detail levels, and implemented progressive report generation for comprehensive reports.
|
||||
|
||||
### Key Activities
|
||||
1. **Created Report Templates Module**:
|
||||
- Developed a new `report_templates.py` module with a comprehensive template system
|
||||
- Implemented `QueryType` enum for categorizing queries (FACTUAL, EXPLORATORY, COMPARATIVE)
|
||||
- Created `DetailLevel` enum for different report detail levels (BRIEF, STANDARD, DETAILED, COMPREHENSIVE)
|
||||
- Designed a `ReportTemplate` class with validation for required sections
|
||||
- Implemented a `ReportTemplateManager` to manage and retrieve templates
|
||||
|
||||
2. **Implemented Template Variations**:
|
||||
- Created 12 different templates (3 query types × 4 detail levels)
|
||||
- Designed templates with appropriate sections for each combination
|
||||
- Added placeholders for dynamic content in each template
|
||||
- Ensured templates follow a consistent structure while adapting to specific needs
|
||||
|
||||
3. **Added Testing**:
|
||||
- Created `test_report_templates.py` to verify template retrieval and validation
|
||||
- Implemented `test_brief_report.py` to test brief report generation with a simple query
|
||||
- Verified that all templates can be correctly retrieved and used
|
||||
|
||||
4. **Implemented Progressive Report Generation**:
|
||||
- Created a new `progressive_report_synthesis.py` module with a `ProgressiveReportSynthesizer` class
|
||||
- Implemented chunk prioritization algorithm based on relevance scores
|
||||
- Developed iterative refinement process with specialized prompts
|
||||
- Added state management to track report versions and processed chunks
|
||||
- Implemented termination conditions (all chunks processed, diminishing returns, max iterations)
|
||||
- Added support for different models with adaptive batch sizing
|
||||
- Implemented progress tracking and callback mechanism
|
||||
- Created comprehensive test suite for progressive report generation
|
||||
|
||||
5. **Updated Report Generator**:
|
||||
- Modified `report_generator.py` to use the progressive report synthesizer for comprehensive detail level
|
||||
- Created a hybrid system that uses standard map-reduce for brief/standard/detailed levels
|
||||
- Added proper model selection and configuration for both synthesizers
|
||||
|
||||
6. **Updated Memory Bank**:
|
||||
- Added report templates information to code_structure.md
|
||||
- Updated current_focus.md with implementation details for progressive report generation
|
||||
- Updated session_log.md with details about the implementation
|
||||
- Ensured all new files are properly documented
|
||||
|
||||
### Insights
|
||||
- A standardized template system significantly improves report consistency
|
||||
- Different query types require specialized report structures
|
||||
- Validation ensures all required sections are present in templates
|
||||
- Enums provide type safety and prevent errors from string comparisons
|
||||
- Progressive report generation provides better results for very large document collections
|
||||
- The hybrid approach leverages the strengths of both map-reduce and progressive methods
|
||||
- Tracking improvement scores helps detect diminishing returns and optimize processing
|
||||
- Adaptive batch sizing based on model context window improves efficiency
|
||||
|
||||
### Challenges
|
||||
- Designing templates that are flexible enough for various content types
|
||||
- Balancing between standardization and customization for different query types
|
||||
- Ensuring proper integration with the existing report synthesis process
|
||||
- Managing state and tracking progress in progressive report generation
|
||||
- Preventing entrenchment of initial report structure in progressive approach
|
||||
- Optimizing token usage when sending entire reports for refinement
|
||||
- Determining appropriate termination conditions for the progressive approach
|
||||
|
||||
### Next Steps
|
||||
1. Integrate the progressive approach with the UI
|
||||
- Implement controls to pause, resume, or terminate the process
|
||||
- Create a preview mode to see the current report state
|
||||
- Add options to compare different versions of the report
|
||||
2. Conduct additional testing with real-world queries and document sets
|
||||
3. Add specialized templates for specific research domains
|
||||
4. Implement template customization options for users
|
||||
5. Implement visualization components for data mentioned in reports
|
|
@ -1,5 +0,0 @@
|
|||
Review the contensts of .note/ before modifying any files.
|
||||
|
||||
After each major successful test, please commit the changes to the repository with a meaningful commit message.
|
||||
|
||||
Update the contents of .note/ after each major change.
|
139
README.md
139
README.md
|
@ -1,138 +1,3 @@
|
|||
# Intelligent Research System
|
||||
# ira
|
||||
|
||||
An end-to-end research automation system that handles the entire process from query to final report, leveraging multiple search sources and semantic similarity to produce comprehensive research results.
|
||||
|
||||
## Overview
|
||||
|
||||
This system automates the research process by:
|
||||
1. Processing and enhancing user queries
|
||||
2. Executing searches across multiple engines (Serper, Google Scholar, arXiv)
|
||||
3. Ranking and filtering results based on relevance
|
||||
4. Generating comprehensive research reports
|
||||
|
||||
## Features
|
||||
|
||||
- **Query Processing**: Enhances user queries with additional context and classifies them by type and intent
|
||||
- **Multi-Source Search**: Executes searches across Serper (Google), Google Scholar, and arXiv
|
||||
- **Intelligent Ranking**: Uses Jina AI's Re-Ranker to prioritize the most relevant results
|
||||
- **Result Deduplication**: Removes duplicate results across different search engines
|
||||
- **Modular Architecture**: Easily extensible with new search engines and LLM providers
|
||||
|
||||
## Components
|
||||
|
||||
- **Query Processor**: Enhances and classifies user queries
|
||||
- **Search Executor**: Executes searches across multiple engines
|
||||
- **Result Collector**: Processes and organizes search results
|
||||
- **Document Ranker**: Ranks documents by relevance
|
||||
- **Report Generator**: Synthesizes information into a coherent report (coming soon)
|
||||
|
||||
## Getting Started
|
||||
|
||||
### Prerequisites
|
||||
|
||||
- Python 3.8+
|
||||
- API keys for:
|
||||
- Serper API (for Google and Scholar search)
|
||||
- Groq (or other LLM provider)
|
||||
- Jina AI (for reranking)
|
||||
|
||||
### Installation
|
||||
|
||||
1. Clone the repository:
|
||||
```bash
|
||||
git clone https://github.com/yourusername/sim-search.git
|
||||
cd sim-search
|
||||
```
|
||||
|
||||
2. Install dependencies:
|
||||
```bash
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
3. Create a configuration file:
|
||||
```bash
|
||||
cp config/config.yaml.example config/config.yaml
|
||||
```
|
||||
|
||||
4. Edit the configuration file to add your API keys:
|
||||
```yaml
|
||||
api_keys:
|
||||
serper: "your-serper-api-key"
|
||||
groq: "your-groq-api-key"
|
||||
jina: "your-jina-api-key"
|
||||
```
|
||||
|
||||
### Usage
|
||||
|
||||
#### Basic Usage
|
||||
|
||||
```python
|
||||
from query.query_processor import QueryProcessor
|
||||
from execution.search_executor import SearchExecutor
|
||||
from execution.result_collector import ResultCollector
|
||||
|
||||
# Initialize components
|
||||
query_processor = QueryProcessor()
|
||||
search_executor = SearchExecutor()
|
||||
result_collector = ResultCollector()
|
||||
|
||||
# Process a query
|
||||
processed_query = query_processor.process_query("What are the latest advancements in quantum computing?")
|
||||
|
||||
# Execute search
|
||||
search_results = search_executor.execute_search(processed_query)
|
||||
|
||||
# Process results
|
||||
processed_results = result_collector.process_results(search_results)
|
||||
|
||||
# Print top results
|
||||
for i, result in enumerate(processed_results[:5]):
|
||||
print(f"{i+1}. {result['title']}")
|
||||
print(f" URL: {result['url']}")
|
||||
print(f" Snippet: {result['snippet'][:100]}...")
|
||||
print()
|
||||
```
|
||||
|
||||
#### Testing
|
||||
|
||||
Run the test scripts to verify functionality:
|
||||
|
||||
```bash
|
||||
# Test search execution
|
||||
python test_search_execution.py
|
||||
|
||||
# Test all search handlers
|
||||
python test_all_handlers.py
|
||||
```
|
||||
|
||||
## Project Structure
|
||||
|
||||
```
|
||||
sim-search/
|
||||
├── config/ # Configuration management
|
||||
├── query/ # Query processing
|
||||
├── execution/ # Search execution
|
||||
│ └── api_handlers/ # Search API handlers
|
||||
├── ranking/ # Document ranking
|
||||
├── test_*.py # Test scripts
|
||||
└── requirements.txt # Dependencies
|
||||
```
|
||||
|
||||
## LLM Providers
|
||||
|
||||
The system supports multiple LLM providers through the LiteLLM interface:
|
||||
- Groq (currently using Llama 3.1-8b-instant)
|
||||
- OpenAI
|
||||
- Anthropic
|
||||
- OpenRouter
|
||||
- Azure OpenAI
|
||||
|
||||
## License
|
||||
|
||||
This project is licensed under the MIT License - see the LICENSE file for details.
|
||||
|
||||
## Acknowledgments
|
||||
|
||||
- [Jina AI](https://jina.ai/) for their embedding and reranking APIs
|
||||
- [Serper](https://serper.dev/) for their Google search API
|
||||
- [Groq](https://groq.com/) for their fast LLM inference
|
||||
Intelligent Research Assistant
|
187
config/config.py
187
config/config.py
|
@ -1,187 +0,0 @@
|
|||
"""
|
||||
Configuration management for the intelligent research system.
|
||||
|
||||
This module handles loading configuration from files and environment variables,
|
||||
providing secure access to API keys and model settings.
|
||||
"""
|
||||
|
||||
import os
|
||||
import yaml
|
||||
from pathlib import Path
|
||||
from typing import Dict, Any, Optional
|
||||
from dotenv import load_dotenv
|
||||
|
||||
# Load environment variables from .env file if it exists
|
||||
load_dotenv()
|
||||
|
||||
class Config:
|
||||
"""Configuration manager for the intelligent research system."""
|
||||
|
||||
def __init__(self, config_path: Optional[str] = None):
|
||||
"""
|
||||
Initialize the configuration manager.
|
||||
|
||||
Args:
|
||||
config_path: Path to the configuration file. If None, will look for
|
||||
config.yaml in the same directory as this file.
|
||||
"""
|
||||
self.config_data = {}
|
||||
self.config_path = config_path
|
||||
|
||||
if not config_path:
|
||||
# Default to config.yaml in the same directory as this file
|
||||
self.config_path = Path(__file__).parent / "config.yaml"
|
||||
|
||||
self.load_config()
|
||||
|
||||
def load_config(self) -> None:
|
||||
"""Load configuration from file if it exists."""
|
||||
try:
|
||||
if Path(self.config_path).exists():
|
||||
with open(self.config_path, 'r') as f:
|
||||
self.config_data = yaml.safe_load(f)
|
||||
print(f"Configuration loaded from {self.config_path}")
|
||||
else:
|
||||
print(f"Configuration file {self.config_path} not found. Using environment variables only.")
|
||||
except Exception as e:
|
||||
print(f"Error loading configuration: {e}")
|
||||
|
||||
def get_api_key(self, provider: str) -> str:
|
||||
"""Get the API key for a specific provider.
|
||||
|
||||
Args:
|
||||
provider: The provider name (e.g., 'openai', 'anthropic', 'google')
|
||||
|
||||
Returns:
|
||||
The API key for the specified provider
|
||||
|
||||
Raises:
|
||||
ValueError: If the API key is not found
|
||||
"""
|
||||
provider = provider.lower()
|
||||
|
||||
# Map provider names to environment variable names
|
||||
provider_env_map = {
|
||||
'openai': 'OPENAI_API_KEY',
|
||||
'anthropic': 'ANTHROPIC_API_KEY',
|
||||
'google': 'GEMINI_API_KEY',
|
||||
'gemini': 'GEMINI_API_KEY',
|
||||
'vertex_ai': 'GOOGLE_APPLICATION_CREDENTIALS',
|
||||
'groq': 'GROQ_API_KEY',
|
||||
'openrouter': 'OPENROUTER_API_KEY',
|
||||
'serper': 'SERPER_API_KEY',
|
||||
'tavily': 'TAVILY_API_KEY',
|
||||
'perplexity': 'PERPLEXITY_API_KEY'
|
||||
}
|
||||
|
||||
# Get the environment variable name for the provider
|
||||
env_var = provider_env_map.get(provider)
|
||||
if not env_var:
|
||||
env_var = f"{provider.upper()}_API_KEY"
|
||||
|
||||
# Try to get the API key from environment variables
|
||||
api_key = os.environ.get(env_var)
|
||||
|
||||
# If not found in environment, check the config file
|
||||
if not api_key and 'api_keys' in self.config_data:
|
||||
api_key = self.config_data['api_keys'].get(provider)
|
||||
|
||||
if not api_key:
|
||||
raise ValueError(f"API key for {provider} not found. Please set the {env_var} environment variable or add it to the config file.")
|
||||
|
||||
return api_key
|
||||
|
||||
def get_model_config(self, model_name: str) -> Dict[str, Any]:
|
||||
"""
|
||||
Get configuration for a specific model.
|
||||
|
||||
Args:
|
||||
model_name: The name of the model
|
||||
|
||||
Returns:
|
||||
Dictionary containing model configuration
|
||||
"""
|
||||
if self.config_data and 'models' in self.config_data:
|
||||
return self.config_data['models'].get(model_name, {})
|
||||
return {}
|
||||
|
||||
def get_module_model(self, module_name: str, function_name: str) -> str:
|
||||
"""
|
||||
Get the model assigned to a specific module function.
|
||||
|
||||
Args:
|
||||
module_name: The name of the module (e.g., 'query_processing')
|
||||
function_name: The name of the function (e.g., 'enhance_query')
|
||||
|
||||
Returns:
|
||||
The name of the model to use, or the default model if not specified
|
||||
"""
|
||||
default = self.config_data.get('default_model', 'gpt-3.5-turbo')
|
||||
|
||||
if (self.config_data and 'module_models' in self.config_data and
|
||||
module_name in self.config_data['module_models'] and
|
||||
function_name in self.config_data['module_models'][module_name]):
|
||||
return self.config_data['module_models'][module_name][function_name]
|
||||
|
||||
return default
|
||||
|
||||
def get_search_config(self, search_engine: str) -> Dict[str, Any]:
|
||||
"""
|
||||
Get configuration for a specific search engine.
|
||||
|
||||
Args:
|
||||
search_engine: The name of the search engine
|
||||
|
||||
Returns:
|
||||
Dictionary containing search engine configuration
|
||||
"""
|
||||
if self.config_data and 'search_engines' in self.config_data:
|
||||
return self.config_data['search_engines'].get(search_engine, {})
|
||||
return {}
|
||||
|
||||
def get_ui_config(self) -> Dict[str, Any]:
|
||||
"""
|
||||
Get UI configuration.
|
||||
|
||||
Returns:
|
||||
Dictionary containing UI configuration
|
||||
"""
|
||||
if self.config_data and 'ui' in self.config_data:
|
||||
return self.config_data['ui']
|
||||
return {}
|
||||
|
||||
|
||||
# Create a singleton instance for global use
|
||||
config = Config()
|
||||
|
||||
|
||||
def get_config() -> Config:
|
||||
"""
|
||||
Get the global configuration instance.
|
||||
|
||||
Returns:
|
||||
The global Config instance
|
||||
"""
|
||||
return config
|
||||
|
||||
|
||||
def get_api_key(service_name: str) -> Optional[str]:
|
||||
"""
|
||||
Get an API key for a specific service.
|
||||
|
||||
Args:
|
||||
service_name: Name of the service to get the API key for
|
||||
|
||||
Returns:
|
||||
API key as a string, or None if not found
|
||||
"""
|
||||
# First check environment variables
|
||||
env_var_name = f"{service_name.upper()}_API_KEY"
|
||||
api_key = os.environ.get(env_var_name)
|
||||
|
||||
# If not found in environment, check config file
|
||||
if not api_key:
|
||||
cfg = get_config()
|
||||
api_key = cfg.config_data.get('api_keys', {}).get(service_name)
|
||||
|
||||
return api_key
|
|
@ -1,157 +0,0 @@
|
|||
# Example configuration file for the intelligent research system
|
||||
# Rename this file to config.yaml and fill in your API keys and settings
|
||||
|
||||
# API keys (alternatively, set environment variables)
|
||||
api_keys:
|
||||
openai: "your-openai-api-key" # Or set OPENAI_API_KEY environment variable
|
||||
jina: "your-jina-api-key" # Or set JINA_API_KEY environment variable
|
||||
serper: "your-serper-api-key" # Or set SERPER_API_KEY environment variable
|
||||
google: "your-google-api-key" # Or set GOOGLE_API_KEY environment variable
|
||||
anthropic: "your-anthropic-api-key" # Or set ANTHROPIC_API_KEY environment variable
|
||||
openrouter: "your-openrouter-api-key" # Or set OPENROUTER_API_KEY environment variable
|
||||
groq: "your-groq-api-key" # Or set GROQ_API_KEY environment variable
|
||||
|
||||
# LLM model configurations
|
||||
models:
|
||||
gpt-3.5-turbo:
|
||||
provider: "openai"
|
||||
temperature: 0.7
|
||||
max_tokens: 1000
|
||||
top_p: 1.0
|
||||
endpoint: null # Use default OpenAI endpoint
|
||||
|
||||
gpt-4:
|
||||
provider: "openai"
|
||||
temperature: 0.5
|
||||
max_tokens: 2000
|
||||
top_p: 1.0
|
||||
endpoint: null # Use default OpenAI endpoint
|
||||
|
||||
claude-2:
|
||||
provider: "anthropic"
|
||||
temperature: 0.7
|
||||
max_tokens: 1500
|
||||
top_p: 1.0
|
||||
endpoint: null # Use default Anthropic endpoint
|
||||
|
||||
azure-gpt-4:
|
||||
provider: "azure"
|
||||
temperature: 0.5
|
||||
max_tokens: 2000
|
||||
top_p: 1.0
|
||||
endpoint: "https://your-azure-endpoint.openai.azure.com"
|
||||
deployment_name: "your-deployment-name"
|
||||
api_version: "2023-05-15"
|
||||
|
||||
local-llama:
|
||||
provider: "ollama"
|
||||
temperature: 0.8
|
||||
max_tokens: 1000
|
||||
endpoint: "http://localhost:11434/api/generate"
|
||||
model_name: "llama2"
|
||||
|
||||
llama-3.1-8b-instant:
|
||||
provider: "groq"
|
||||
model_name: "llama-3.1-8b-instant"
|
||||
temperature: 0.7
|
||||
max_tokens: 1024
|
||||
top_p: 1.0
|
||||
endpoint: "https://api.groq.com/openai/v1"
|
||||
|
||||
llama-3.3-70b-versatile:
|
||||
provider: "groq"
|
||||
model_name: "llama-3.3-70b-versatile"
|
||||
temperature: 0.5
|
||||
max_tokens: 2048
|
||||
top_p: 1.0
|
||||
endpoint: "https://api.groq.com/openai/v1"
|
||||
|
||||
openrouter-mixtral:
|
||||
provider: "openrouter"
|
||||
model_name: "mistralai/mixtral-8x7b-instruct"
|
||||
temperature: 0.7
|
||||
max_tokens: 1024
|
||||
top_p: 1.0
|
||||
endpoint: "https://openrouter.ai/api/v1"
|
||||
|
||||
openrouter-claude:
|
||||
provider: "openrouter"
|
||||
model_name: "anthropic/claude-3-opus"
|
||||
temperature: 0.5
|
||||
max_tokens: 2048
|
||||
top_p: 1.0
|
||||
endpoint: "https://openrouter.ai/api/v1"
|
||||
|
||||
gemini-2.0-flash:
|
||||
provider: "gemini"
|
||||
model_name: "gemini-2.0-flash"
|
||||
temperature: 0.5
|
||||
max_tokens: 2048
|
||||
top_p: 1.0
|
||||
|
||||
# Default model to use if not specified for a module
|
||||
default_model: "llama-3.1-8b-instant" # Using Groq's Llama 3.1 8B model for testing
|
||||
|
||||
# Module-specific model assignments
|
||||
module_models:
|
||||
# Query processing module
|
||||
query_processing:
|
||||
enhance_query: "llama-3.1-8b-instant" # Use Groq's Llama 3.1 8B for query enhancement
|
||||
classify_query: "llama-3.1-8b-instant" # Use Groq's Llama 3.1 8B for classification
|
||||
generate_search_queries: "llama-3.1-8b-instant" # Use Groq's Llama 3.1 8B for generating search queries
|
||||
|
||||
# Search strategy module
|
||||
search_strategy:
|
||||
develop_strategy: "llama-3.1-8b-instant" # Use Groq's Llama 3.1 8B for developing search strategies
|
||||
target_selection: "llama-3.1-8b-instant" # Use Groq's Llama 3.1 8B for target selection
|
||||
|
||||
# Document ranking module
|
||||
document_ranking:
|
||||
rerank_documents: "jina-reranker" # Use Jina's reranker for document reranking
|
||||
|
||||
# Report generation module
|
||||
report_generation:
|
||||
synthesize_report: "gemini-2.0-flash" # Use Google's Gemini 2.0 Flash for report synthesis
|
||||
format_report: "llama-3.1-8b-instant" # Use Groq's Llama 3.1 8B for formatting
|
||||
|
||||
# Search engine configurations
|
||||
search_engines:
|
||||
google:
|
||||
enabled: true
|
||||
max_results: 10
|
||||
|
||||
serper:
|
||||
enabled: true
|
||||
max_results: 10
|
||||
|
||||
jina:
|
||||
enabled: true
|
||||
max_results: 10
|
||||
|
||||
scholar:
|
||||
enabled: false
|
||||
max_results: 5
|
||||
|
||||
arxiv:
|
||||
enabled: false
|
||||
max_results: 5
|
||||
|
||||
# Jina AI specific configurations
|
||||
jina:
|
||||
reranker:
|
||||
model: "jina-reranker-v2-base-multilingual" # Default reranker model
|
||||
top_n: 10 # Default number of top results to return
|
||||
|
||||
# UI configuration
|
||||
ui:
|
||||
theme: "light" # light or dark
|
||||
port: 7860
|
||||
share: false
|
||||
title: "Intelligent Research System"
|
||||
description: "An automated system for finding, filtering, and synthesizing information"
|
||||
|
||||
# System settings
|
||||
system:
|
||||
cache_dir: "data/cache"
|
||||
results_dir: "data/results"
|
||||
log_level: "INFO" # DEBUG, INFO, WARNING, ERROR, CRITICAL
|
|
@ -1,150 +0,0 @@
|
|||
# Example configuration file for the intelligent research system
|
||||
# Rename this file to config.yaml and fill in your API keys and settings
|
||||
|
||||
# API keys (alternatively, set environment variables)
|
||||
api_keys:
|
||||
openai: "your-openai-api-key" # Or set OPENAI_API_KEY environment variable
|
||||
jina: "your-jina-api-key" # Or set JINA_API_KEY environment variable
|
||||
serper: "your-serper-api-key" # Or set SERPER_API_KEY environment variable
|
||||
google: "your-google-api-key" # Or set GOOGLE_API_KEY environment variable
|
||||
anthropic: "your-anthropic-api-key" # Or set ANTHROPIC_API_KEY environment variable
|
||||
openrouter: "your-openrouter-api-key" # Or set OPENROUTER_API_KEY environment variable
|
||||
groq: "your-groq-api-key" # Or set GROQ_API_KEY environment variable
|
||||
|
||||
# LLM model configurations
|
||||
models:
|
||||
gpt-3.5-turbo:
|
||||
provider: "openai"
|
||||
temperature: 0.7
|
||||
max_tokens: 1000
|
||||
top_p: 1.0
|
||||
endpoint: null # Use default OpenAI endpoint
|
||||
|
||||
gpt-4:
|
||||
provider: "openai"
|
||||
temperature: 0.5
|
||||
max_tokens: 2000
|
||||
top_p: 1.0
|
||||
endpoint: null # Use default OpenAI endpoint
|
||||
|
||||
claude-2:
|
||||
provider: "anthropic"
|
||||
temperature: 0.7
|
||||
max_tokens: 1500
|
||||
top_p: 1.0
|
||||
endpoint: null # Use default Anthropic endpoint
|
||||
|
||||
azure-gpt-4:
|
||||
provider: "azure"
|
||||
temperature: 0.5
|
||||
max_tokens: 2000
|
||||
top_p: 1.0
|
||||
endpoint: "https://your-azure-endpoint.openai.azure.com"
|
||||
deployment_name: "your-deployment-name"
|
||||
api_version: "2023-05-15"
|
||||
|
||||
local-llama:
|
||||
provider: "ollama"
|
||||
temperature: 0.8
|
||||
max_tokens: 1000
|
||||
endpoint: "http://localhost:11434/api/generate"
|
||||
model_name: "llama2"
|
||||
|
||||
llama-3.1-8b-instant:
|
||||
provider: "groq"
|
||||
model_name: "llama-3.1-8b-instant"
|
||||
temperature: 0.7
|
||||
max_tokens: 1024
|
||||
top_p: 1.0
|
||||
endpoint: "https://api.groq.com/openai/v1"
|
||||
|
||||
llama-3.3-70b-versatile:
|
||||
provider: "groq"
|
||||
model_name: "llama-3.3-70b-versatile"
|
||||
temperature: 0.5
|
||||
max_tokens: 2048
|
||||
top_p: 1.0
|
||||
endpoint: "https://api.groq.com/openai/v1"
|
||||
|
||||
openrouter-mixtral:
|
||||
provider: "openrouter"
|
||||
model_name: "mistralai/mixtral-8x7b-instruct"
|
||||
temperature: 0.7
|
||||
max_tokens: 1024
|
||||
top_p: 1.0
|
||||
endpoint: "https://openrouter.ai/api/v1"
|
||||
|
||||
openrouter-claude:
|
||||
provider: "openrouter"
|
||||
model_name: "anthropic/claude-3-opus"
|
||||
temperature: 0.5
|
||||
max_tokens: 2048
|
||||
top_p: 1.0
|
||||
endpoint: "https://openrouter.ai/api/v1"
|
||||
|
||||
# Default model to use if not specified for a module
|
||||
default_model: "llama-3.1-8b-instant" # Using Groq's Llama 3.1 8B model for testing
|
||||
|
||||
# Module-specific model assignments
|
||||
module_models:
|
||||
# Query processing module
|
||||
query_processing:
|
||||
enhance_query: "llama-3.1-8b-instant" # Use Groq's Llama 3.1 8B for query enhancement
|
||||
classify_query: "llama-3.1-8b-instant" # Use Groq's Llama 3.1 8B for classification
|
||||
generate_search_queries: "llama-3.1-8b-instant" # Use Groq's Llama 3.1 8B for generating search queries
|
||||
|
||||
# Search strategy module
|
||||
search_strategy:
|
||||
develop_strategy: "llama-3.1-8b-instant" # Use Groq's Llama 3.1 8B for developing search strategies
|
||||
target_selection: "llama-3.1-8b-instant" # Use Groq's Llama 3.1 8B for target selection
|
||||
|
||||
# Document ranking module
|
||||
document_ranking:
|
||||
rerank_documents: "jina-reranker" # Use Jina's reranker for document reranking
|
||||
|
||||
# Report generation module
|
||||
report_generation:
|
||||
synthesize_report: "llama-3.3-70b-versatile" # Use Groq's Llama 3.3 70B for report synthesis
|
||||
format_report: "llama-3.1-8b-instant" # Use Groq's Llama 3.1 8B for formatting
|
||||
|
||||
# Search engine configurations
|
||||
search_engines:
|
||||
google:
|
||||
enabled: true
|
||||
max_results: 10
|
||||
|
||||
serper:
|
||||
enabled: true
|
||||
max_results: 10
|
||||
|
||||
jina:
|
||||
enabled: true
|
||||
max_results: 10
|
||||
|
||||
scholar:
|
||||
enabled: false
|
||||
max_results: 5
|
||||
|
||||
arxiv:
|
||||
enabled: false
|
||||
max_results: 5
|
||||
|
||||
# Jina AI specific configurations
|
||||
jina:
|
||||
reranker:
|
||||
model: "jina-reranker-v2-base-multilingual" # Default reranker model
|
||||
top_n: 10 # Default number of top results to return
|
||||
|
||||
# UI configuration
|
||||
ui:
|
||||
theme: "light" # light or dark
|
||||
port: 7860
|
||||
share: false
|
||||
title: "Intelligent Research System"
|
||||
description: "An automated system for finding, filtering, and synthesizing information"
|
||||
|
||||
# System settings
|
||||
system:
|
||||
cache_dir: "data/cache"
|
||||
results_dir: "data/results"
|
||||
log_level: "INFO" # DEBUG, INFO, WARNING, ERROR, CRITICAL
|
|
@ -1,4 +0,0 @@
|
|||
"""
|
||||
Example scripts and data for the intelligent research system.
|
||||
This package contains example scripts and sample data that demonstrate the system's functionality.
|
||||
"""
|
|
@ -1 +0,0 @@
|
|||
Please research and discuss the impact of the US Government sponsored mind-control experiment MKUltra on popular media, like TV shows, Movies, and Books.
|
|
@ -1 +0,0 @@
|
|||
The quick brown fox jumps over the lazy dog. This classic pangram contains every letter of the English alphabet at least once. Pangrams are often used to display font samples and test keyboards and printers. While "The quick brown fox jumps over the lazy dog" is the most famous pangram in English, many other examples exist.
|
|
@ -1 +0,0 @@
|
|||
What is a pangram used for?
|
|
@ -1,4 +0,0 @@
|
|||
"""
|
||||
Example scripts for the intelligent research system.
|
||||
This package contains example scripts that demonstrate the system's functionality.
|
||||
"""
|
|
@ -1,4 +0,0 @@
|
|||
"""
|
||||
Search execution module for the intelligent research system.
|
||||
This module handles the execution of search queries across various search engines.
|
||||
"""
|
|
@ -1,4 +0,0 @@
|
|||
"""
|
||||
API handlers for different search engines.
|
||||
Each handler implements a common interface for executing searches and processing results.
|
||||
"""
|
|
@ -1,162 +0,0 @@
|
|||
"""
|
||||
arXiv API handler.
|
||||
Uses the official arXiv API to search for academic papers.
|
||||
"""
|
||||
|
||||
import os
|
||||
import json
|
||||
import requests
|
||||
import urllib.parse
|
||||
import xml.etree.ElementTree as ET
|
||||
from datetime import datetime
|
||||
from typing import Dict, List, Any, Optional
|
||||
|
||||
from .base_handler import BaseSearchHandler
|
||||
from config.config import get_config
|
||||
|
||||
|
||||
class ArxivSearchHandler(BaseSearchHandler):
|
||||
"""Handler for arXiv Search using the official API."""
|
||||
|
||||
def __init__(self):
|
||||
"""Initialize the arXiv search handler."""
|
||||
self.config = get_config()
|
||||
self.base_url = "http://export.arxiv.org/api/query"
|
||||
self.available = True # arXiv API is freely available without an API key
|
||||
|
||||
def search(self, query: str, num_results: int = 10, **kwargs) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
Execute an arXiv search query.
|
||||
|
||||
Args:
|
||||
query: The search query to execute
|
||||
num_results: Number of results to return
|
||||
**kwargs: Additional search parameters:
|
||||
- sort_by: Sort order ("relevance", "lastUpdatedDate", "submittedDate")
|
||||
- sort_order: Sort direction ("ascending", "descending")
|
||||
- categories: List of arXiv categories to search within
|
||||
- date_range: Date range for filtering (e.g., "all", "last_week", "last_month")
|
||||
|
||||
Returns:
|
||||
List of search results with standardized format
|
||||
"""
|
||||
# Set up the request parameters
|
||||
params = {
|
||||
"search_query": query,
|
||||
"max_results": num_results,
|
||||
"start": kwargs.get("start", 0)
|
||||
}
|
||||
|
||||
# Add sorting parameters
|
||||
sort_by = kwargs.get("sort_by", "relevance")
|
||||
if sort_by == "relevance":
|
||||
params["sortBy"] = "relevance"
|
||||
elif sort_by == "lastUpdatedDate":
|
||||
params["sortBy"] = "lastUpdatedDate"
|
||||
elif sort_by == "submittedDate":
|
||||
params["sortBy"] = "submittedDate"
|
||||
|
||||
sort_order = kwargs.get("sort_order", "descending")
|
||||
if sort_order == "descending":
|
||||
params["sortOrder"] = "descending"
|
||||
elif sort_order == "ascending":
|
||||
params["sortOrder"] = "ascending"
|
||||
|
||||
# Add category filtering
|
||||
if "categories" in kwargs and kwargs["categories"]:
|
||||
categories = "+OR+".join([f"cat:{cat}" for cat in kwargs["categories"]])
|
||||
params["search_query"] = f"{params['search_query']}+AND+({categories})"
|
||||
|
||||
try:
|
||||
# Make the request
|
||||
response = requests.get(
|
||||
self.base_url,
|
||||
params=params
|
||||
)
|
||||
response.raise_for_status()
|
||||
|
||||
# Parse the XML response
|
||||
root = ET.fromstring(response.content)
|
||||
|
||||
# Define namespaces
|
||||
ns = {
|
||||
'atom': 'http://www.w3.org/2005/Atom',
|
||||
'arxiv': 'http://arxiv.org/schemas/atom'
|
||||
}
|
||||
|
||||
# Extract and standardize the results
|
||||
results = []
|
||||
|
||||
for entry in root.findall('.//atom:entry', ns):
|
||||
# Extract basic information
|
||||
title = entry.find('./atom:title', ns).text.strip()
|
||||
summary = entry.find('./atom:summary', ns).text.strip()
|
||||
published = entry.find('./atom:published', ns).text
|
||||
updated = entry.find('./atom:updated', ns).text
|
||||
|
||||
# Extract authors
|
||||
authors = []
|
||||
for author in entry.findall('./atom:author/atom:name', ns):
|
||||
authors.append(author.text.strip())
|
||||
|
||||
# Extract links
|
||||
links = {}
|
||||
for link in entry.findall('./atom:link', ns):
|
||||
link_rel = link.get('rel', '')
|
||||
link_href = link.get('href', '')
|
||||
links[link_rel] = link_href
|
||||
|
||||
# Extract arXiv-specific information
|
||||
arxiv_id = entry.find('./atom:id', ns).text.split('/')[-1]
|
||||
|
||||
# Get categories
|
||||
categories = []
|
||||
for category in entry.findall('./arxiv:primary_category', ns):
|
||||
categories.append(category.get('term', ''))
|
||||
for category in entry.findall('./atom:category', ns):
|
||||
cat_term = category.get('term', '')
|
||||
if cat_term and cat_term not in categories:
|
||||
categories.append(cat_term)
|
||||
|
||||
# Format the result
|
||||
result = {
|
||||
"title": title,
|
||||
"url": links.get('alternate', ''),
|
||||
"pdf_url": links.get('related', ''),
|
||||
"snippet": summary[:200] + "..." if len(summary) > 200 else summary,
|
||||
"source": "arxiv",
|
||||
"arxiv_id": arxiv_id,
|
||||
"authors": authors,
|
||||
"categories": categories,
|
||||
"published_date": published,
|
||||
"updated_date": updated,
|
||||
"full_text": summary
|
||||
}
|
||||
|
||||
results.append(result)
|
||||
|
||||
return results
|
||||
|
||||
except requests.exceptions.RequestException as e:
|
||||
print(f"Error executing arXiv search: {e}")
|
||||
return []
|
||||
except ET.ParseError as e:
|
||||
print(f"Error parsing arXiv response: {e}")
|
||||
return []
|
||||
|
||||
def get_name(self) -> str:
|
||||
"""Get the name of the search handler."""
|
||||
return "arxiv"
|
||||
|
||||
def is_available(self) -> bool:
|
||||
"""Check if the arXiv API is available."""
|
||||
return self.available
|
||||
|
||||
def get_rate_limit_info(self) -> Dict[str, Any]:
|
||||
"""Get information about the API's rate limits."""
|
||||
# arXiv API rate limits
|
||||
return {
|
||||
"requests_per_minute": 30, # arXiv recommends no more than 1 request per 3 seconds
|
||||
"requests_per_day": 2000, # This is an estimate
|
||||
"current_usage": None # arXiv doesn't provide usage info in responses
|
||||
}
|
|
@ -1,63 +0,0 @@
|
|||
"""
|
||||
Base handler interface for search APIs.
|
||||
All specific API handlers should inherit from this base class.
|
||||
"""
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Dict, List, Any, Optional
|
||||
|
||||
|
||||
class BaseSearchHandler(ABC):
|
||||
"""Base class for all search API handlers."""
|
||||
|
||||
@abstractmethod
|
||||
def search(self, query: str, num_results: int = 10, **kwargs) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
Execute a search query and return results.
|
||||
|
||||
Args:
|
||||
query: The search query to execute
|
||||
num_results: Number of results to return
|
||||
**kwargs: Additional search parameters specific to the API
|
||||
|
||||
Returns:
|
||||
List of search results, each as a dictionary with at least:
|
||||
- title: Title of the result
|
||||
- url: URL of the result
|
||||
- snippet: Text snippet or description
|
||||
- source: Source of the result (e.g., "google", "scholar")
|
||||
"""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def get_name(self) -> str:
|
||||
"""
|
||||
Get the name of the search handler.
|
||||
|
||||
Returns:
|
||||
Name of the search handler (e.g., "google", "scholar")
|
||||
"""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def is_available(self) -> bool:
|
||||
"""
|
||||
Check if the search API is available and properly configured.
|
||||
|
||||
Returns:
|
||||
True if the API is available, False otherwise
|
||||
"""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def get_rate_limit_info(self) -> Dict[str, Any]:
|
||||
"""
|
||||
Get information about the API's rate limits.
|
||||
|
||||
Returns:
|
||||
Dictionary with rate limit information:
|
||||
- requests_per_minute: Maximum requests per minute
|
||||
- requests_per_day: Maximum requests per day
|
||||
- current_usage: Current usage statistics if available
|
||||
"""
|
||||
pass
|
|
@ -1,113 +0,0 @@
|
|||
"""
|
||||
Google Search API handler.
|
||||
Uses the Serper API to access Google search results.
|
||||
"""
|
||||
|
||||
import os
|
||||
import json
|
||||
import requests
|
||||
from typing import Dict, List, Any, Optional
|
||||
|
||||
from .base_handler import BaseSearchHandler
|
||||
from config.config import get_config, get_api_key
|
||||
|
||||
|
||||
class GoogleSearchHandler(BaseSearchHandler):
|
||||
"""Handler for Google Search using the Serper API."""
|
||||
|
||||
def __init__(self):
|
||||
"""Initialize the Google search handler."""
|
||||
self.config = get_config()
|
||||
self.api_key = get_api_key("serper")
|
||||
self.base_url = "https://google.serper.dev/search"
|
||||
self.available = self.api_key is not None
|
||||
|
||||
def search(self, query: str, num_results: int = 10, **kwargs) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
Execute a Google search query using Serper API.
|
||||
|
||||
Args:
|
||||
query: The search query to execute
|
||||
num_results: Number of results to return
|
||||
**kwargs: Additional search parameters:
|
||||
- country: Country code (default: "us")
|
||||
- language: Language code (default: "en")
|
||||
- page: Page number (default: 1)
|
||||
|
||||
Returns:
|
||||
List of search results with standardized format
|
||||
"""
|
||||
if not self.available:
|
||||
raise ValueError("Google Search API is not available. API key is missing.")
|
||||
|
||||
# Set up the request parameters
|
||||
params = {
|
||||
"q": query,
|
||||
"num": num_results,
|
||||
"type": "search" # Specify search type
|
||||
}
|
||||
|
||||
# Add optional parameters
|
||||
if "country" in kwargs:
|
||||
params["gl"] = kwargs["country"]
|
||||
if "language" in kwargs:
|
||||
params["hl"] = kwargs["language"]
|
||||
if "page" in kwargs:
|
||||
params["page"] = kwargs["page"]
|
||||
|
||||
# Set up the headers
|
||||
headers = {
|
||||
"X-API-KEY": self.api_key,
|
||||
"Content-Type": "application/json"
|
||||
}
|
||||
|
||||
try:
|
||||
# Make the request
|
||||
response = requests.post(
|
||||
self.base_url,
|
||||
headers=headers,
|
||||
json=params
|
||||
)
|
||||
response.raise_for_status()
|
||||
|
||||
# Parse the response
|
||||
data = response.json()
|
||||
|
||||
# Extract and standardize the results
|
||||
results = []
|
||||
|
||||
# Process organic results
|
||||
if "organic" in data:
|
||||
for item in data["organic"][:num_results]:
|
||||
result = {
|
||||
"title": item.get("title", ""),
|
||||
"url": item.get("link", ""),
|
||||
"snippet": item.get("snippet", ""),
|
||||
"source": "google",
|
||||
"position": item.get("position", 0),
|
||||
"raw_data": item
|
||||
}
|
||||
results.append(result)
|
||||
|
||||
return results
|
||||
|
||||
except requests.exceptions.RequestException as e:
|
||||
print(f"Error executing Google search: {e}")
|
||||
return []
|
||||
|
||||
def get_name(self) -> str:
|
||||
"""Get the name of the search handler."""
|
||||
return "google"
|
||||
|
||||
def is_available(self) -> bool:
|
||||
"""Check if the Google Search API is available."""
|
||||
return self.available
|
||||
|
||||
def get_rate_limit_info(self) -> Dict[str, Any]:
|
||||
"""Get information about the API's rate limits."""
|
||||
# These are example values - adjust based on your Serper plan
|
||||
return {
|
||||
"requests_per_minute": 60,
|
||||
"requests_per_day": 2500,
|
||||
"current_usage": None # Serper doesn't provide usage info in responses
|
||||
}
|
|
@ -1,125 +0,0 @@
|
|||
"""
|
||||
Google Scholar API handler.
|
||||
Uses the Serper API to access Google Scholar search results.
|
||||
"""
|
||||
|
||||
import os
|
||||
import json
|
||||
import requests
|
||||
from typing import Dict, List, Any, Optional
|
||||
|
||||
from .base_handler import BaseSearchHandler
|
||||
from config.config import get_config, get_api_key
|
||||
|
||||
|
||||
class ScholarSearchHandler(BaseSearchHandler):
|
||||
"""Handler for Google Scholar Search using the Serper API."""
|
||||
|
||||
def __init__(self):
|
||||
"""Initialize the Google Scholar search handler."""
|
||||
self.config = get_config()
|
||||
self.api_key = get_api_key("serper")
|
||||
self.base_url = "https://google.serper.dev/scholar"
|
||||
self.available = self.api_key is not None
|
||||
|
||||
def search(self, query: str, num_results: int = 10, **kwargs) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
Execute a Google Scholar search query using Serper API.
|
||||
|
||||
Args:
|
||||
query: The search query to execute
|
||||
num_results: Number of results to return
|
||||
**kwargs: Additional search parameters:
|
||||
- country: Country code (default: "us")
|
||||
- language: Language code (default: "en")
|
||||
- year_start: Start year for publication date filter
|
||||
- year_end: End year for publication date filter
|
||||
|
||||
Returns:
|
||||
List of search results with standardized format
|
||||
"""
|
||||
if not self.available:
|
||||
raise ValueError("Google Scholar API is not available. API key is missing.")
|
||||
|
||||
# Set up the request parameters
|
||||
params = {
|
||||
"q": query,
|
||||
"num": num_results,
|
||||
"type": "scholar" # Specify search type as scholar
|
||||
}
|
||||
|
||||
# Add optional parameters
|
||||
if "country" in kwargs:
|
||||
params["gl"] = kwargs["country"]
|
||||
if "language" in kwargs:
|
||||
params["hl"] = kwargs["language"]
|
||||
|
||||
# Add date range if specified
|
||||
date_range = ""
|
||||
if "year_start" in kwargs and "year_end" in kwargs:
|
||||
date_range = f"as_ylo={kwargs['year_start']}&as_yhi={kwargs['year_end']}"
|
||||
elif "year_start" in kwargs:
|
||||
date_range = f"as_ylo={kwargs['year_start']}"
|
||||
elif "year_end" in kwargs:
|
||||
date_range = f"as_yhi={kwargs['year_end']}"
|
||||
|
||||
if date_range:
|
||||
params["tbs"] = date_range
|
||||
|
||||
# Set up the headers
|
||||
headers = {
|
||||
"X-API-KEY": self.api_key,
|
||||
"Content-Type": "application/json"
|
||||
}
|
||||
|
||||
try:
|
||||
# Make the request
|
||||
response = requests.post(
|
||||
self.base_url,
|
||||
headers=headers,
|
||||
json=params
|
||||
)
|
||||
response.raise_for_status()
|
||||
|
||||
# Parse the response
|
||||
data = response.json()
|
||||
|
||||
# Process the results
|
||||
results = []
|
||||
|
||||
# Process organic results
|
||||
if "organic" in data:
|
||||
for item in data["organic"]:
|
||||
result = {
|
||||
"title": item.get("title", ""),
|
||||
"url": item.get("link", ""),
|
||||
"snippet": item.get("snippet", ""),
|
||||
"source": "scholar",
|
||||
"authors": item.get("authors", ""),
|
||||
"publication": item.get("publication", ""),
|
||||
"year": item.get("year", "")
|
||||
}
|
||||
results.append(result)
|
||||
|
||||
return results
|
||||
|
||||
except requests.exceptions.RequestException as e:
|
||||
print(f"Error executing Google Scholar search: {e}")
|
||||
return []
|
||||
|
||||
def get_name(self) -> str:
|
||||
"""Get the name of the search handler."""
|
||||
return "scholar"
|
||||
|
||||
def is_available(self) -> bool:
|
||||
"""Check if the Google Scholar API is available."""
|
||||
return self.available
|
||||
|
||||
def get_rate_limit_info(self) -> Dict[str, Any]:
|
||||
"""Get information about the API's rate limits."""
|
||||
# These are example values - adjust based on your Serper plan
|
||||
return {
|
||||
"requests_per_minute": 30, # Lower for Scholar due to its specialized nature
|
||||
"requests_per_day": 1000,
|
||||
"current_usage": None # Serper doesn't provide usage info in responses
|
||||
}
|
|
@ -1,134 +0,0 @@
|
|||
"""
|
||||
Serper API handler.
|
||||
Provides direct access to Serper's enhanced search capabilities.
|
||||
"""
|
||||
|
||||
import os
|
||||
import json
|
||||
import requests
|
||||
from typing import Dict, List, Any, Optional
|
||||
|
||||
from .base_handler import BaseSearchHandler
|
||||
from config.config import get_config, get_api_key
|
||||
|
||||
|
||||
class SerperSearchHandler(BaseSearchHandler):
|
||||
"""Handler for Serper's enhanced search API."""
|
||||
|
||||
def __init__(self):
|
||||
"""Initialize the Serper search handler."""
|
||||
self.config = get_config()
|
||||
self.api_key = get_api_key("serper")
|
||||
self.base_url = "https://google.serper.dev/search"
|
||||
self.available = self.api_key is not None
|
||||
|
||||
def search(self, query: str, num_results: int = 10, **kwargs) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
Execute a search query using Serper's enhanced API.
|
||||
|
||||
Args:
|
||||
query: The search query to execute
|
||||
num_results: Number of results to return
|
||||
**kwargs: Additional search parameters:
|
||||
- search_type: Type of search ("web", "news", "images", "places")
|
||||
- country: Country code (default: "us")
|
||||
- language: Language code (default: "en")
|
||||
- page: Page number (default: 1)
|
||||
|
||||
Returns:
|
||||
List of search results with standardized format
|
||||
"""
|
||||
if not self.available:
|
||||
raise ValueError("Serper API is not available. API key is missing.")
|
||||
|
||||
# Set up the request parameters
|
||||
params = {
|
||||
"q": query,
|
||||
"num": num_results
|
||||
}
|
||||
|
||||
# Add optional parameters
|
||||
search_type = kwargs.get("search_type", "search")
|
||||
params["type"] = search_type
|
||||
|
||||
if "country" in kwargs:
|
||||
params["gl"] = kwargs["country"]
|
||||
if "language" in kwargs:
|
||||
params["hl"] = kwargs["language"]
|
||||
if "page" in kwargs:
|
||||
params["page"] = kwargs["page"]
|
||||
|
||||
# Set up the headers
|
||||
headers = {
|
||||
"X-API-KEY": self.api_key,
|
||||
"Content-Type": "application/json"
|
||||
}
|
||||
|
||||
try:
|
||||
# Make the request
|
||||
print(f"Making request to {self.base_url} with API key: {self.api_key[:5]}...")
|
||||
print(f"Headers: {headers}")
|
||||
print(f"Params: {params}")
|
||||
|
||||
response = requests.post(
|
||||
self.base_url,
|
||||
headers=headers,
|
||||
json=params
|
||||
)
|
||||
|
||||
print(f"Response status: {response.status_code}")
|
||||
print(f"Response text: {response.text[:200]}")
|
||||
|
||||
response.raise_for_status()
|
||||
|
||||
# Parse the response
|
||||
data = response.json()
|
||||
|
||||
# Process the results
|
||||
results = []
|
||||
|
||||
# Process organic results
|
||||
if "organic" in data:
|
||||
for item in data["organic"]:
|
||||
result = {
|
||||
"title": item.get("title", ""),
|
||||
"url": item.get("link", ""),
|
||||
"snippet": item.get("snippet", ""),
|
||||
"source": "serper"
|
||||
}
|
||||
results.append(result)
|
||||
|
||||
# Process knowledge graph if available
|
||||
if "knowledgeGraph" in data:
|
||||
kg = data["knowledgeGraph"]
|
||||
if "title" in kg and "description" in kg:
|
||||
result = {
|
||||
"title": kg.get("title", ""),
|
||||
"url": kg.get("website", ""),
|
||||
"snippet": kg.get("description", ""),
|
||||
"source": "serper_kg"
|
||||
}
|
||||
results.append(result)
|
||||
|
||||
return results
|
||||
|
||||
except requests.exceptions.RequestException as e:
|
||||
print(f"Error executing Serper search: {e}")
|
||||
return []
|
||||
|
||||
def get_name(self) -> str:
|
||||
"""Get the name of the search handler."""
|
||||
return "serper"
|
||||
|
||||
def is_available(self) -> bool:
|
||||
"""Check if the Serper API is available."""
|
||||
return self.available
|
||||
|
||||
def get_rate_limit_info(self) -> Dict[str, Any]:
|
||||
"""Get information about the API's rate limits."""
|
||||
# These are example values - adjust based on your Serper plan
|
||||
return {
|
||||
"requests_per_minute": 60,
|
||||
"requests_per_day": 2500,
|
||||
"current_usage": None # Serper doesn't provide usage info in responses
|
||||
}
|
|
@ -1,425 +0,0 @@
|
|||
"""
|
||||
Result collector module.
|
||||
Processes and organizes search results from multiple search engines.
|
||||
"""
|
||||
|
||||
import os
|
||||
import json
|
||||
import time
|
||||
from typing import Dict, List, Any, Optional, Set
|
||||
from urllib.parse import urlparse
|
||||
from datetime import datetime
|
||||
|
||||
from ranking.jina_reranker import get_jina_reranker
|
||||
|
||||
|
||||
class ResultCollector:
|
||||
"""
|
||||
Collects and processes search results from multiple search engines.
|
||||
Handles deduplication, merging, and filtering of results.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
"""Initialize the result collector."""
|
||||
try:
|
||||
self.reranker = get_jina_reranker()
|
||||
self.reranker_available = True
|
||||
except ValueError:
|
||||
print("Jina Reranker not available. Will use basic scoring instead.")
|
||||
self.reranker_available = False
|
||||
|
||||
def process_results(self,
|
||||
search_results: Dict[str, List[Dict[str, Any]]],
|
||||
dedup: bool = True,
|
||||
max_results: Optional[int] = None,
|
||||
use_reranker: bool = True) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
Process search results from multiple search engines.
|
||||
|
||||
Args:
|
||||
search_results: Dictionary mapping search engine names to lists of results
|
||||
dedup: Whether to deduplicate results
|
||||
max_results: Maximum number of results to return
|
||||
use_reranker: Whether to use the reranker for semantic ranking
|
||||
|
||||
Returns:
|
||||
List of processed search results
|
||||
"""
|
||||
# Flatten results from all search engines
|
||||
flattened_results = []
|
||||
for engine, results in search_results.items():
|
||||
for result in results:
|
||||
# Add the source to each result
|
||||
result['source'] = engine
|
||||
flattened_results.append(result)
|
||||
|
||||
# Print a verification of the query in the flattened results
|
||||
if flattened_results:
|
||||
first_result = flattened_results[0]
|
||||
query = first_result.get('query', '')
|
||||
print(f"Verifying query in flattened results:")
|
||||
print(f"Query in first result: {query[:50]}...")
|
||||
|
||||
# Deduplicate results if requested
|
||||
if dedup:
|
||||
flattened_results = self._deduplicate_results(flattened_results)
|
||||
|
||||
print(f"Processing {len(flattened_results)} combined results")
|
||||
if dedup:
|
||||
print(f"Deduplicated to {len(flattened_results)} results")
|
||||
|
||||
# Apply reranking if requested and available
|
||||
if use_reranker and self.reranker is not None:
|
||||
print("Using Jina Reranker for semantic ranking")
|
||||
try:
|
||||
reranked_results = self._rerank_results(flattened_results)
|
||||
print(f"Reranked {len(reranked_results)} results")
|
||||
processed_results = reranked_results
|
||||
except Exception as e:
|
||||
print(f"Error during reranking: {str(e)}. Falling back to basic scoring.")
|
||||
print("Using basic scoring")
|
||||
processed_results = self._score_and_sort_results(flattened_results)
|
||||
else:
|
||||
print("Using basic scoring")
|
||||
processed_results = self._score_and_sort_results(flattened_results)
|
||||
|
||||
# Limit the number of results if requested
|
||||
if max_results is not None:
|
||||
processed_results = processed_results[:max_results]
|
||||
|
||||
print(f"Processed {len(processed_results)} results {'with' if use_reranker and self.reranker is not None else 'without'} reranking")
|
||||
return processed_results
|
||||
|
||||
def _flatten_results(self, search_results: Dict[str, List[Dict[str, Any]]]) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
Flatten search results from multiple search engines into a single list.
|
||||
|
||||
Args:
|
||||
search_results: Dictionary mapping search engine names to lists of results
|
||||
|
||||
Returns:
|
||||
Flattened list of search results
|
||||
"""
|
||||
# This method is deprecated and kept for backward compatibility
|
||||
# The process_results method now handles flattened results directly
|
||||
all_results = []
|
||||
|
||||
# Check if we have a flattened structure (single key with all results)
|
||||
if len(search_results) == 1 and "combined" in search_results:
|
||||
return search_results["combined"]
|
||||
|
||||
# Traditional structure with separate engines
|
||||
for engine, results in search_results.items():
|
||||
for result in results:
|
||||
# Add the source if not already present
|
||||
if "source" not in result:
|
||||
result["source"] = engine
|
||||
all_results.append(result)
|
||||
|
||||
return all_results
|
||||
|
||||
def _deduplicate_results(self, results: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
Deduplicate results based on URL.
|
||||
|
||||
Args:
|
||||
results: List of search results
|
||||
|
||||
Returns:
|
||||
Deduplicated list of search results
|
||||
"""
|
||||
seen_urls = set()
|
||||
deduplicated_results = []
|
||||
|
||||
for result in results:
|
||||
url = result.get("url", "")
|
||||
|
||||
# Normalize URL for comparison
|
||||
normalized_url = self._normalize_url(url)
|
||||
|
||||
if normalized_url and normalized_url not in seen_urls:
|
||||
seen_urls.add(normalized_url)
|
||||
deduplicated_results.append(result)
|
||||
|
||||
return deduplicated_results
|
||||
|
||||
def _score_and_sort_results(self, results: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
Score and sort results by relevance.
|
||||
|
||||
Args:
|
||||
results: List of search results
|
||||
|
||||
Returns:
|
||||
Sorted list of search results
|
||||
"""
|
||||
# Add a score to each result
|
||||
for result in results:
|
||||
score = 0
|
||||
|
||||
# Boost score based on source (e.g., scholarly sources get higher scores)
|
||||
source = result.get("source", "")
|
||||
if source == "scholar":
|
||||
score += 10
|
||||
elif source == "serper":
|
||||
score += 9
|
||||
elif source == "arxiv":
|
||||
score += 8
|
||||
elif source == "google":
|
||||
score += 5
|
||||
|
||||
# Boost score based on position in original results
|
||||
position = result.get("raw_data", {}).get("position", 0)
|
||||
if position > 0:
|
||||
score += max(0, 10 - position)
|
||||
|
||||
# Boost score for results with more content
|
||||
snippet_length = len(result.get("snippet", ""))
|
||||
if snippet_length > 200:
|
||||
score += 3
|
||||
elif snippet_length > 100:
|
||||
score += 2
|
||||
elif snippet_length > 50:
|
||||
score += 1
|
||||
|
||||
# Store the score
|
||||
result["relevance_score"] = score
|
||||
|
||||
# Sort by score (descending)
|
||||
sorted_results = sorted(results, key=lambda x: x.get("relevance_score", 0), reverse=True)
|
||||
|
||||
return sorted_results
|
||||
|
||||
def _rerank_results(self, results: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
Rerank results using the Jina Reranker.
|
||||
|
||||
Args:
|
||||
results: List of search results
|
||||
|
||||
Returns:
|
||||
Reranked list of search results
|
||||
"""
|
||||
if not results:
|
||||
return []
|
||||
|
||||
# Extract the original query
|
||||
# First try to get it from the first result
|
||||
query = ""
|
||||
for result in results:
|
||||
if "query" in result:
|
||||
query = result.get("query", "")
|
||||
break
|
||||
|
||||
if not query:
|
||||
# If no query is found, use a fallback approach
|
||||
print("Warning: No query found in results. Using basic scoring instead.")
|
||||
return self._score_and_sort_results(results)
|
||||
|
||||
print(f"Reranking with query: {query}")
|
||||
|
||||
# Extract snippets for reranking
|
||||
snippets = []
|
||||
for result in results:
|
||||
# Combine title and snippet for better reranking
|
||||
content = f"{result.get('title', '')} {result.get('snippet', '')}"
|
||||
snippets.append(content)
|
||||
|
||||
try:
|
||||
# Use the reranker to rerank the snippets
|
||||
reranked = self.reranker.rerank(query, snippets)
|
||||
|
||||
if not reranked:
|
||||
print("Reranker returned empty results. Using basic scoring instead.")
|
||||
return self._score_and_sort_results(results)
|
||||
|
||||
print(f"Reranked {len(reranked)} results")
|
||||
|
||||
# Create a new list of results based on the reranking
|
||||
reranked_results = []
|
||||
for item in reranked:
|
||||
# Get the original result and add the new score
|
||||
index = item.get('index')
|
||||
score = item.get('score')
|
||||
|
||||
if index is None or score is None or index >= len(results):
|
||||
print(f"Warning: Invalid reranker result item: {item}")
|
||||
continue
|
||||
|
||||
original_result = results[index]
|
||||
new_result = original_result.copy()
|
||||
new_result['relevance_score'] = float(score) * 10 # Scale up the score for consistency
|
||||
reranked_results.append(new_result)
|
||||
|
||||
# If we didn't get any valid results, fall back to basic scoring
|
||||
if not reranked_results:
|
||||
print("No valid reranked results. Using basic scoring instead.")
|
||||
return self._score_and_sort_results(results)
|
||||
|
||||
return reranked_results
|
||||
except Exception as e:
|
||||
print(f"Error reranking results: {str(e)}")
|
||||
# Fall back to basic scoring if reranking fails
|
||||
return self._score_and_sort_results(results)
|
||||
|
||||
def _extract_domain(self, url: str) -> str:
|
||||
"""
|
||||
Extract the domain from a URL.
|
||||
|
||||
Args:
|
||||
url: URL to extract domain from
|
||||
|
||||
Returns:
|
||||
Domain name
|
||||
"""
|
||||
try:
|
||||
parsed_url = urlparse(url)
|
||||
domain = parsed_url.netloc
|
||||
|
||||
# Remove 'www.' prefix if present
|
||||
if domain.startswith('www.'):
|
||||
domain = domain[4:]
|
||||
|
||||
return domain
|
||||
except:
|
||||
return ""
|
||||
|
||||
def _normalize_url(self, url: str) -> str:
|
||||
"""
|
||||
Normalize a URL for comparison.
|
||||
|
||||
Args:
|
||||
url: URL to normalize
|
||||
|
||||
Returns:
|
||||
Normalized URL
|
||||
"""
|
||||
try:
|
||||
# Parse the URL
|
||||
parsed_url = urlparse(url)
|
||||
|
||||
# Reconstruct with just the scheme, netloc, and path
|
||||
normalized = f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path}"
|
||||
|
||||
# Remove trailing slash if present
|
||||
if normalized.endswith('/'):
|
||||
normalized = normalized[:-1]
|
||||
|
||||
return normalized.lower()
|
||||
except:
|
||||
return url.lower()
|
||||
|
||||
def filter_results(self,
|
||||
results: List[Dict[str, Any]],
|
||||
filters: Dict[str, Any]) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
Filter results based on specified criteria.
|
||||
|
||||
Args:
|
||||
results: List of search results
|
||||
filters: Dictionary of filter criteria:
|
||||
- domains: List of domains to include or exclude
|
||||
- exclude_domains: Whether to exclude (True) or include (False) the specified domains
|
||||
- min_score: Minimum relevance score
|
||||
- sources: List of sources to include
|
||||
- date_range: Dictionary with 'start' and 'end' dates
|
||||
|
||||
Returns:
|
||||
Filtered list of search results
|
||||
"""
|
||||
filtered_results = results.copy()
|
||||
|
||||
# Filter by domains
|
||||
if "domains" in filters and filters["domains"]:
|
||||
domains = set(filters["domains"])
|
||||
exclude_domains = filters.get("exclude_domains", False)
|
||||
|
||||
if exclude_domains:
|
||||
filtered_results = [r for r in filtered_results if r.get("domain", "") not in domains]
|
||||
else:
|
||||
filtered_results = [r for r in filtered_results if r.get("domain", "") in domains]
|
||||
|
||||
# Filter by minimum score
|
||||
if "min_score" in filters:
|
||||
min_score = filters["min_score"]
|
||||
filtered_results = [r for r in filtered_results if r.get("relevance_score", 0) >= min_score]
|
||||
|
||||
# Filter by sources
|
||||
if "sources" in filters and filters["sources"]:
|
||||
sources = set(filters["sources"])
|
||||
filtered_results = [r for r in filtered_results if r.get("source", "") in sources]
|
||||
|
||||
# Filter by date range
|
||||
if "date_range" in filters:
|
||||
date_range = filters["date_range"]
|
||||
|
||||
if "start" in date_range:
|
||||
start_date = datetime.fromisoformat(date_range["start"])
|
||||
filtered_results = [
|
||||
r for r in filtered_results
|
||||
if "date" not in r or not r["date"] or datetime.fromisoformat(r["date"]) >= start_date
|
||||
]
|
||||
|
||||
if "end" in date_range:
|
||||
end_date = datetime.fromisoformat(date_range["end"])
|
||||
filtered_results = [
|
||||
r for r in filtered_results
|
||||
if "date" not in r or not r["date"] or datetime.fromisoformat(r["date"]) <= end_date
|
||||
]
|
||||
|
||||
return filtered_results
|
||||
|
||||
def group_results_by_domain(self, results: List[Dict[str, Any]]) -> Dict[str, List[Dict[str, Any]]]:
|
||||
"""
|
||||
Group results by domain.
|
||||
|
||||
Args:
|
||||
results: List of search results
|
||||
|
||||
Returns:
|
||||
Dictionary mapping domains to lists of search results
|
||||
"""
|
||||
grouped_results = {}
|
||||
|
||||
for result in results:
|
||||
domain = result.get("domain", "unknown")
|
||||
|
||||
if domain not in grouped_results:
|
||||
grouped_results[domain] = []
|
||||
|
||||
grouped_results[domain].append(result)
|
||||
|
||||
return grouped_results
|
||||
|
||||
def save_results(self, results: List[Dict[str, Any]], file_path: str) -> None:
|
||||
"""
|
||||
Save search results to a file.
|
||||
|
||||
Args:
|
||||
results: List of search results
|
||||
file_path: Path to save results to
|
||||
"""
|
||||
try:
|
||||
with open(file_path, 'w') as f:
|
||||
json.dump(results, f, indent=2)
|
||||
print(f"Results saved to {file_path}")
|
||||
except Exception as e:
|
||||
print(f"Error saving results: {e}")
|
||||
|
||||
def load_results(self, file_path: str) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
Load search results from a file.
|
||||
|
||||
Args:
|
||||
file_path: Path to load results from
|
||||
|
||||
Returns:
|
||||
List of search results
|
||||
"""
|
||||
try:
|
||||
with open(file_path, 'r') as f:
|
||||
results = json.load(f)
|
||||
return results
|
||||
except Exception as e:
|
||||
print(f"Error loading results: {e}")
|
||||
return []
|
|
@ -1,222 +0,0 @@
|
|||
"""
|
||||
Search executor module.
|
||||
Handles the execution of search queries across multiple search engines.
|
||||
"""
|
||||
|
||||
import os
|
||||
import json
|
||||
import time
|
||||
import asyncio
|
||||
import concurrent.futures
|
||||
from typing import Dict, List, Any, Optional, Union
|
||||
|
||||
from config.config import get_config
|
||||
from .api_handlers.base_handler import BaseSearchHandler
|
||||
from .api_handlers.serper_handler import SerperSearchHandler
|
||||
from .api_handlers.scholar_handler import ScholarSearchHandler
|
||||
from .api_handlers.arxiv_handler import ArxivSearchHandler
|
||||
|
||||
|
||||
class SearchExecutor:
|
||||
"""
|
||||
Executes search queries across multiple search engines.
|
||||
Manages rate limiting, error handling, and result aggregation.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
"""Initialize the search executor with available search handlers."""
|
||||
self.config = get_config()
|
||||
self.handlers = self._initialize_handlers()
|
||||
self.available_handlers = {name: handler for name, handler in self.handlers.items()
|
||||
if handler.is_available()}
|
||||
|
||||
def _initialize_handlers(self) -> Dict[str, BaseSearchHandler]:
|
||||
"""
|
||||
Initialize all search handlers.
|
||||
|
||||
Returns:
|
||||
Dictionary mapping handler names to handler instances
|
||||
"""
|
||||
return {
|
||||
"serper": SerperSearchHandler(),
|
||||
"scholar": ScholarSearchHandler(),
|
||||
"arxiv": ArxivSearchHandler()
|
||||
}
|
||||
|
||||
def get_available_search_engines(self) -> List[str]:
|
||||
"""
|
||||
Get a list of available search engines.
|
||||
|
||||
Returns:
|
||||
List of available search engine names
|
||||
"""
|
||||
return list(self.available_handlers.keys())
|
||||
|
||||
def execute_search(self,
|
||||
structured_query: Dict[str, Any],
|
||||
search_engines: Optional[List[str]] = None,
|
||||
num_results: int = 10,
|
||||
timeout: int = 30) -> Dict[str, List[Dict[str, Any]]]:
|
||||
"""
|
||||
Execute a search query across multiple search engines.
|
||||
|
||||
Args:
|
||||
structured_query: Structured query from the query processor
|
||||
search_engines: List of search engines to use (if None, use all available)
|
||||
num_results: Number of results to return per search engine
|
||||
timeout: Timeout in seconds for each search engine
|
||||
|
||||
Returns:
|
||||
Dictionary mapping search engine names to lists of search results
|
||||
"""
|
||||
# Get the raw query
|
||||
raw_query = structured_query.get("raw_query", "")
|
||||
|
||||
# Get the enhanced query if available, otherwise use the raw query
|
||||
query = structured_query.get("enhanced_query", raw_query)
|
||||
|
||||
# Truncate the query if it's too long (Serper API has a 2048 character limit)
|
||||
if len(query) > 2000:
|
||||
query = query[:2000]
|
||||
|
||||
# If no search engines specified, use all available
|
||||
if search_engines is None:
|
||||
search_engines = list(self.available_handlers.keys())
|
||||
else:
|
||||
# Filter to only include available search engines
|
||||
search_engines = [engine for engine in search_engines
|
||||
if engine in self.available_handlers]
|
||||
|
||||
# Get the search queries for each engine
|
||||
search_queries = structured_query.get("search_queries", {})
|
||||
|
||||
# Execute searches in parallel
|
||||
results = {}
|
||||
with concurrent.futures.ThreadPoolExecutor() as executor:
|
||||
future_to_engine = {}
|
||||
|
||||
for engine in search_engines:
|
||||
if engine not in self.available_handlers:
|
||||
continue
|
||||
|
||||
# Get the appropriate query for this engine
|
||||
engine_query = search_queries.get(engine, query)
|
||||
|
||||
# Submit the search task
|
||||
future = executor.submit(
|
||||
self._execute_single_search,
|
||||
engine=engine,
|
||||
query=engine_query,
|
||||
num_results=num_results
|
||||
)
|
||||
future_to_engine[future] = engine
|
||||
|
||||
# Collect results as they complete
|
||||
for future in concurrent.futures.as_completed(future_to_engine, timeout=timeout):
|
||||
engine = future_to_engine[future]
|
||||
try:
|
||||
engine_results = future.result()
|
||||
results[engine] = engine_results
|
||||
except Exception as e:
|
||||
print(f"Error executing search for {engine}: {e}")
|
||||
results[engine] = []
|
||||
|
||||
return results
|
||||
|
||||
def _execute_single_search(self, engine: str, query: str, num_results: int) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
Execute a search on a single search engine.
|
||||
|
||||
Args:
|
||||
engine: Name of the search engine
|
||||
query: Query to execute
|
||||
num_results: Number of results to return
|
||||
|
||||
Returns:
|
||||
List of search results
|
||||
"""
|
||||
handler = self.available_handlers.get(engine)
|
||||
if not handler:
|
||||
return []
|
||||
|
||||
try:
|
||||
# Execute the search
|
||||
results = handler.search(query, num_results=num_results)
|
||||
return results
|
||||
except Exception as e:
|
||||
print(f"Error executing search for {engine}: {e}")
|
||||
return []
|
||||
|
||||
async def execute_search_async(self,
|
||||
structured_query: Dict[str, Any],
|
||||
search_engines: Optional[List[str]] = None,
|
||||
num_results: int = 10,
|
||||
timeout: int = 30) -> Dict[str, List[Dict[str, Any]]]:
|
||||
"""
|
||||
Execute a search query across specified search engines asynchronously.
|
||||
|
||||
Args:
|
||||
structured_query: The structured query from the query processor
|
||||
search_engines: List of search engines to use (if None, use all available)
|
||||
num_results: Number of results to return per search engine
|
||||
timeout: Timeout in seconds for each search engine
|
||||
|
||||
Returns:
|
||||
Dictionary mapping search engine names to lists of search results
|
||||
"""
|
||||
# If no search engines specified, use all available
|
||||
if search_engines is None:
|
||||
search_engines = list(self.available_handlers.keys())
|
||||
else:
|
||||
# Filter to only include available search engines
|
||||
search_engines = [engine for engine in search_engines
|
||||
if engine in self.available_handlers]
|
||||
|
||||
# Get the search queries for each engine
|
||||
search_queries = structured_query.get("search_queries", {})
|
||||
|
||||
# Create tasks for each search engine
|
||||
tasks = []
|
||||
for engine in search_engines:
|
||||
if engine not in self.available_handlers:
|
||||
continue
|
||||
|
||||
# Get the appropriate query for this engine
|
||||
query = search_queries.get(engine, structured_query.get("enhanced_query", ""))
|
||||
|
||||
# Create a task for this search
|
||||
task = self._execute_single_search_async(engine, query, num_results)
|
||||
tasks.append((engine, task))
|
||||
|
||||
# Execute all tasks with timeout
|
||||
results = {}
|
||||
for engine, task in tasks:
|
||||
try:
|
||||
engine_results = await asyncio.wait_for(task, timeout=timeout)
|
||||
results[engine] = engine_results
|
||||
except asyncio.TimeoutError:
|
||||
print(f"Search timed out for {engine}")
|
||||
results[engine] = []
|
||||
except Exception as e:
|
||||
print(f"Error executing search for {engine}: {e}")
|
||||
results[engine] = []
|
||||
|
||||
return results
|
||||
|
||||
async def _execute_single_search_async(self, engine: str, query: str, num_results: int) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
Execute a search on a single search engine asynchronously.
|
||||
|
||||
Args:
|
||||
engine: Name of the search engine
|
||||
query: Query to execute
|
||||
num_results: Number of results to return
|
||||
|
||||
Returns:
|
||||
List of search results
|
||||
"""
|
||||
# Execute in a thread pool since most API calls are blocking
|
||||
loop = asyncio.get_event_loop()
|
||||
return await loop.run_in_executor(
|
||||
None, self._execute_single_search, engine, query, num_results
|
||||
)
|
|
@ -1,220 +0,0 @@
|
|||
You are an AI engineer designed to help users use Jina AI Search Foundation API's for their specific use case.
|
||||
|
||||
# Core principles
|
||||
|
||||
1. Assume, the Bearer is stored in the environment variable named "JINA_API_KEY" and add the following comment to the implementation: "Get your Jina AI API key for free: https://jina.ai/?sui=apikey".
|
||||
2. Use the simplest solution possible (use single API's whenever possible, do not overcomplicate things);
|
||||
3. Answer "can't do" for tasks outside the scope of Jina AI Search Foundation;
|
||||
4. Choose built-in features over custom implementations whenever possible;
|
||||
5. Leverage multimodal models when needed;
|
||||
6. You must use the Jina APIs for the implementation;
|
||||
7. Never decline an implementation because of its complexity;
|
||||
8. Generate production-ready code that follows exactly the requirements;
|
||||
9. Never use placeholder data;
|
||||
10. For every request to any of the Jina APIs, you must include the header -H "Accept: application/json" to specify that the response should be in JSON format;
|
||||
|
||||
# Overview of all Jina AI APIs:
|
||||
- Classification API: Given text or images, classify them into categories.
|
||||
- Embeddings API: Given text or images, generate embeddings.
|
||||
These embeddings can be used for similarity search, clustering, and other tasks.
|
||||
- r.reader API: Input a single website URL and get an LLM-friendly version of that single website.
|
||||
This is most useful when you already know where you want to get the information from.
|
||||
- s.reader API: Given a search term, get an LLM-friendly version of all websites in the search results.
|
||||
This is useful when you don't know where to get the information from, but you just know what you are looking for.
|
||||
- g.reader API: Given a statement, find out if it is true or false.
|
||||
This is useful for fact-checking, fake news detection, and general knowledge verification.
|
||||
- Re-Ranker API: Given a query and a list of search results, re-rank them.
|
||||
This is useful for improving the relevance of search results.
|
||||
- Segmenter API: Given a text e.g. the output from r.reader or s.reader, split it into segments.
|
||||
This is useful for breaking down long texts into smaller, more manageable parts.
|
||||
Usually this is done to get the chunks that are passed to the embeddings API.
|
||||
|
||||
# Jina AI Search Foundation API's documentation
|
||||
|
||||
11. Embeddings API
|
||||
Endpoint: https://api.jina.ai/v1/embeddings
|
||||
Purpose: Convert text/images to fixed-length vectors
|
||||
Best for: semantic search, similarity matching, clustering, etc.
|
||||
Method: POST
|
||||
Authorization: HTTPBearer
|
||||
Request body schema: {"application/json":{"model":{"type":"string","required":true,"description":"Identifier of the model to use.","options":[{"name":"jina-clip-v2","size":"885M","dimensions":1024},{"name":"jina-embeddings-v3","size":"570M","dimensions":1024}]},"input":{"type":"array","required":true,"description":"Array of input strings or objects to be embedded."},"embedding_type":{"type":"string or array of strings","required":false,"default":"float","description":"The format of the returned embeddings.","options":["float","base64","binary","ubinary"]},"task":{"type":"string","required":false,"description":"Specifies the intended downstream application to optimize embedding output.","options":["retrieval.query","retrieval.passage","text-matching","classification","separation"]},"dimensions":{"type":"integer","required":false,"description":"Truncates output embeddings to the specified size if set."},"normalized":{"type":"boolean","required":false,"default":false,"description":"If true, embeddings are normalized to unit L2 norm."},"late_chunking":{"type":"boolean","required":false,"default":false,"description":"If true, concatenates all sentences in input and treats as a single input for late chunking."}}}
|
||||
Example request: {"model":"jina-embeddings-v3","input":["Hello, world!"]}
|
||||
Example response: {"200":{"data":[{"embedding":"..."}],"usage":{"total_tokens":15}},"422":{"error":{"message":"Invalid input or parameters"}}}
|
||||
|
||||
12. Reranker API
|
||||
Endpoint: https://api.jina.ai/v1/rerank
|
||||
Purpose: find the most relevant search results
|
||||
Best for: refining search results, refining RAG (retrieval augmented generation) contextual chunks, etc.
|
||||
Method: POST
|
||||
Authorization: HTTPBearer
|
||||
Request body schema: {"application/json":{"model":{"type":"string","required":true,"description":"Identifier of the model to use.","options":[{"name":"jina-reranker-v2-base-multilingual","size":"278M"},{"name":"jina-colbert-v2","size":"560M"}]},"query":{"type":"string or TextDoc","required":true,"description":"The search query."},"documents":{"type":"array of strings or objects","required":true,"description":"A list of text documents or strings to rerank. If a document object is provided, all text fields will be preserved in the response."},"top_n":{"type":"integer","required":false,"description":"The number of most relevant documents or indices to return, defaults to the length of documents."},"return_documents":{"type":"boolean","required":false,"default":true,"description":"If false, returns only the index and relevance score without the document text. If true, returns the index, text, and relevance score."}}}
|
||||
Example request: {"model":"jina-reranker-v2-base-multilingual","query":"Search query","documents":["Document to rank 1","Document to rank 2"]}
|
||||
Example response: {"results":[{"index":0,"document":{"text":"Document to rank 1"},"relevance_score":0.9},{"index":1,"document":{"text":"Document to rank 2"},"relevance_score":0.8}],"usage":{"total_tokens":15,"prompt_tokens":15}}
|
||||
|
||||
13. Reader API
|
||||
Endpoint: https://r.jina.ai/
|
||||
Purpose: retrieve/parse content from URL in a format optimized for downstream tasks like LLMs and other applications
|
||||
Best for: extracting structured content from web pages, suitable for generative models and search applications
|
||||
Method: POST
|
||||
Authorization: HTTPBearer
|
||||
Headers:
|
||||
- **Authorization**: Bearer $JINA_API_KEY
|
||||
- **Content-Type**: application/json
|
||||
- **Accept**: application/json
|
||||
- **X-Engine** (optional): Specifies the engine to retrieve/parse content. Use `readerlm-v2` for higher quality or `direct` for speed
|
||||
- **X-Timeout** (optional): Specifies the maximum time (in seconds) to wait for the webpage to load
|
||||
- **X-Target-Selector** (optional): CSS selectors to focus on specific elements within the page
|
||||
- **X-Wait-For-Selector** (optional): CSS selectors to wait for specific elements before returning
|
||||
- **X-Remove-Selector** (optional): CSS selectors to exclude certain parts of the page (e.g., headers, footers)
|
||||
- **X-With-Links-Summary** (optional): `true` to gather all links at the end of the response
|
||||
- **X-With-Images-Summary** (optional): `true` to gather all images at the end of the response
|
||||
- **X-With-Generated-Alt** (optional): `true` to add alt text to images lacking captions
|
||||
- **X-No-Cache** (optional): `true` to bypass cache for fresh retrieval
|
||||
- **X-With-Iframe** (optional): `true` to include iframe content in the response
|
||||
- **X-Return-Format** (optional): `markdown`, `html`, `text`, `screenshot`, or `pageshot` (for URL of full-page screenshot)
|
||||
- **X-Token-Budget** (optional): Specifies maximum number of tokens to use for the request
|
||||
- **X-Retain-Images** (optional): Use `none` to remove all images from the response
|
||||
|
||||
Request body schema: {"application/json":{"url":{"type":"string","required":true},"options":{"type":"string","default":"Default","options":["Default","Markdown","HTML","Text","Screenshot","Pageshot"]}}}
|
||||
Example cURL request: ```curl -X POST 'https://r.jina.ai/' -H "Accept: application/json" -H "Authorization: Bearer ..." -H "Content-Type: application/json" -H "X-No-Cache: true" -H "X-Remove-Selector: header,.class,#id" -H "X-Target-Selector: body,.class,#id" -H "X-Timeout: 10" -H "X-Wait-For-Selector: body,.class,#id" -H "X-With-Generated-Alt: true" -H "X-With-Iframe: true" -H "X-With-Images-Summary: true" -H "X-With-Links-Summary: true" -d '{"url":"https://jina.ai"}'```
|
||||
Example response: {"code":200,"status":20000,"data":{"title":"Jina AI - Your Search Foundation, Supercharged.","description":"Best-in-class embeddings, rerankers, LLM-reader, web scraper, classifiers. The best search AI for multilingual and multimodal data.","url":"https://jina.ai/","content":"Jina AI - Your Search Foundation, Supercharged.\n===============\n","images":{"Image 1":"https://jina.ai/Jina%20-%20Dark.svg"},"links":{"Newsroom":"https://jina.ai/#newsroom","Contact sales":"https://jina.ai/contact-sales","Commercial License":"https://jina.ai/COMMERCIAL-LICENSE-TERMS.pdf","Security":"https://jina.ai/legal/#security","Terms & Conditions":"https://jina.ai/legal/#terms-and-conditions","Privacy":"https://jina.ai/legal/#privacy-policy"},"usage":{"tokens
|
||||
Pay attention to the response format of the reader API, the actual content of the page will be available in `response["data"]["content"]`, and links / images (if using "X-With-Links-Summary: true" or "X-With-Images-Summary: true") will be available in `response["data"]["links"]` and `response["data"]["images"]`.
|
||||
|
||||
14. Search API
|
||||
Endpoint: https://s.jina.ai/
|
||||
Purpose: search the web for information and return results in a format optimized for downstream tasks like LLMs and other applications
|
||||
Best for: customizable web search with results optimized for enterprise search systems and LLMs, with options for Markdown, HTML, JSON, text, and image outputs
|
||||
Method: POST
|
||||
Authorization: HTTPBearer
|
||||
Headers:
|
||||
- **Authorization**: Bearer $JINA_API_KEY
|
||||
- **Content-Type**: application/json
|
||||
- **Accept**: application/json
|
||||
- **X-Site** (optional): Use "X-Site: " for in-site searches limited to the given domain
|
||||
- **X-With-Links-Summary** (optional): "true" to gather all page links at the end
|
||||
- **X-With-Images-Summary** (optional): "true" to gather all images at the end
|
||||
- **X-No-Cache** (optional): "true" to bypass cache and retrieve real-time data
|
||||
- **X-With-Generated-Alt** (optional): "true" to generate captions for images without alt tags
|
||||
|
||||
Request body schema: {"application/json":{"q":{"type":"string","required":true},"options":{"type":"string","default":"Default","options":["Default","Markdown","HTML","Text","Screenshot","Pageshot"]}}}
|
||||
Example request cURL request: ```curl -X POST 'https://s.jina.ai/' -H "Authorization: Bearer ..." -H "Content-Type: application/json" -H "Accept: application/json" -H "X-No-Cache: true" -H "X-Site: https://jina.ai" -d '{"q":"When was Jina AI founded?","options":"Markdown"}'```
|
||||
Example response: {"code":200,"status":20000,"data":[{"title":"Jina AI - Your Search Foundation, Supercharged.","description":"Our frontier models form the search foundation for high-quality enterprise search...","url":"https://jina.ai/","content":"Jina AI - Your Search Foundation, Supercharged...","usage":{"tokens":10475}},{"title":"Jina AI CEO, Founder, Key Executive Team, Board of Directors & Employees","description":"An open-source vector search engine that supports structured filtering...","url":"https://www.cbinsights.com/company/jina-ai/people","content":"Jina AI Management Team...","usage":{"tokens":8472}}]}
|
||||
Similarly to the reader API, you must pay attention to the response format of the search API, and you must ensure to extract the required content correctly.
|
||||
|
||||
15. Grounding API
|
||||
Endpoint: https://g.jina.ai/
|
||||
Purpose: verify the factual accuracy of a given statement by cross-referencing it with sources from the internet
|
||||
Best for: ideal for validating claims or facts by using verifiable sources, such as company websites or social media profiles
|
||||
Method: POST
|
||||
Authorization: HTTPBearer
|
||||
Headers:
|
||||
- **Authorization**: Bearer $JINA_API_KEY
|
||||
- **Content-Type**: application/json
|
||||
- **Accept**: application/json
|
||||
- **X-Site** (optional): comma-separated list of URLs to serve as grounding references for verifying the statement (if not specified, all sources found on the internet will be used)
|
||||
- **X-No-Cache** (optional): "true" to bypass cache and retrieve real-time data
|
||||
|
||||
Request body schema: {"application/json":{"statement":{"type":"string","required":true,"description":"The statement to verify for factual accuracy"}}}
|
||||
Example cURL request: ```curl -X POST 'https://g.jina.ai/' -H "Accept: application/json" -H "Authorization: Bearer ..." -H "Content-Type: application/json" -H "X-Site: https://jina.ai, https://linkedin.com" -d '{"statement":"Jina AI was founded in 2020 in Berlin."}'```
|
||||
Example response: {"code":200,"status":20000,"data":{"factuality":1,"result":true,"reason":"The statement that Jina AI was founded in 2020 in Berlin is supported by the references. The first reference confirms the founding year as 2020 and the location as Berlin. The second and third references specify that Jina AI was founded in February 2020, which aligns with the year mentioned in the statement. Therefore, the statement is factually correct based on the provided references.","references":[{"url":"https://es.linkedin.com/company/jinaai?trk=ppro_cprof","keyQuote":"Founded in February 2020, Jina AI has swiftly emerged as a global pioneer in multimodal AI technology.","isSupportive":true},{"url":"https://jina.ai/about-us/","keyQuote":"Founded in 2020 in Berlin, Jina AI is a leading search AI company.","isSupportive":true},{"url":"https://www.linkedin.com/company/jinaai","keyQuote":"Founded in February 2020, Jina AI has swiftly emerged as a global pioneer in multimodal AI technology.","isSupportive":true}],"usage":{"tokens":7620}}}
|
||||
|
||||
16. Segmenter API
|
||||
Endpoint: https://segment.jina.ai/
|
||||
Purpose: tokenizes text, divide text into chunks
|
||||
Best for: counting number of tokens in text, segmenting text into manageable chunks (ideal for downstream applications like RAG)
|
||||
Method: POST
|
||||
Authorization: HTTPBearer
|
||||
Headers:
|
||||
- **Authorization**: Bearer $JINA_API_KEY
|
||||
- **Content-Type**: application/json
|
||||
- **Accept**: application/json
|
||||
|
||||
Request body schema: {"application/json":{"content":{"type":"string","required":true,"description":"The text content to segment."},"tokenizer":{"type":"string","required":false,"default":"cl100k_base","enum":["cl100k_base","o200k_base","p50k_base","r50k_base","p50k_edit","gpt2"],"description":"Specifies the tokenizer to use."},"return_tokens":{"type":"boolean","required":false,"default":false,"description":"If true, includes tokens and their IDs in the response."},"return_chunks":{"type":"boolean","required":false,"default":false,"description":"If true, segments the text into semantic chunks."},"max_chunk_length":{"type":"integer","required":false,"default":1000,"description":"Maximum characters per chunk (only effective if 'return_chunks' is true)."},"head":{"type":"integer","required":false,"description":"Returns the first N tokens (exclusive with 'tail')."},"tail":{"type":"integer","required":false,"description":"Returns the last N tokens (exclusive with 'head')."}}}
|
||||
Example cURL request: ```curl -X POST 'https://segment.jina.ai/' -H "Content-Type: application/json" -H "Authorization: Bearer ..." -d '{"content":"\n Jina AI: Your Search Foundation, Supercharged! 🚀\n Ihrer Suchgrundlage, aufgeladen! 🚀\n 您的搜索底座,从此不同!🚀\n 検索ベース,もう二度と同じことはありません!🚀\n","tokenizer":"cl100k_base","return_tokens":true,"return_chunks":true,"max_chunk_length":1000,"head":5}'```
|
||||
Example response: {"num_tokens":78,"tokenizer":"cl100k_base","usage":{"tokens":0},"num_chunks":4,"chunk_positions":[[3,55],[55,93],[93,110],[110,135]],"tokens":[[["J",[41]],["ina",[2259]],[" AI",[15592]],[":",[25]],[" Your",[4718]],[" Search",[7694]],[" Foundation",[5114]],[",",[11]],[" Super",[7445]],["charged",[38061]],["!",[0]],[" ",[11410]],["🚀",[248,222]],["\n",[198]],[" ",[256]]],[["I",[40]],["hr",[4171]],["er",[261]],[" Such",[15483]],["grund",[60885]],["lage",[56854]],[",",[11]],[" auf",[7367]],["gel",[29952]],["aden",[21825]],["!",[0]],[" ",[11410]],["🚀",[248,222]],["\n",[198]],[" ",[256]]],[["您",[88126]],["的",[9554]],["搜索",[80073]],["底",[11795,243]],["座",[11795,100]],[",",[3922]],["从",[46281]],["此",[33091]],["不",[16937]],["同",[42016]],["!",[6447]],["🚀",[9468,248,222]],["\n",[198]],[" ",[256]]],[["検",[162,97,250]],["索",[52084]],["ベ",[2845,247]],["ース",[61398]],[",",[11]],["も",[32977]],["う",[30297]],["二",[41920]],["度",[27479]],["と",[19732]],["同",[42016]],["じ",[100204]],["こ",[22957]],["と",[19732]],["は",[15682]],["あり",[57903]],["ま",[17129]],["せ",[72342]],["ん",[25827]],["!",[6447]],["🚀",[9468,248,222]],["\n",[198]]]],"chunks":["Jina AI: Your Search Foundation, Supercharged! 🚀\n ","Ihrer Suchgrundlage, aufgeladen! 🚀\n ","您的搜索底座,从此不同!🚀\n ","検索ベース,もう二度と同じことはありません!🚀\n"]}
|
||||
Note: for the API to return chunks, you must specify `"return_chunks": true` as part of the request body.
|
||||
|
||||
17. Classifier API
|
||||
Endpoint: https://api.jina.ai/v1/classify
|
||||
Purpose: zero-shot classification for text or images
|
||||
Best for: text or image classification without training
|
||||
Request body schema for text and images : {"application/json":{"model":{"type":"string","required":false,"description":"Identifier of the model to use. Required if classifier_id is not provided.","options":[{"name":"jina-clip-v2","size":"885M","dimensions":1024}]},"classifier_id":{"type":"string","required":false,"description":"The identifier of the classifier. If not provided, a new classifier will be created."},"input":{"type":"array","required":true,"description":"Array of inputs for classification. Each entry can either be a text object {\"text\": \"your_text_here\"} or an image object {\"image\": \"base64_image_string\"}. You cannot mix text and image objects in the same request."},"labels":{"type":"array of strings","required":true,"description":"List of labels used for classification."}}}
|
||||
Example request: {"model":"jina-clip-v2","input":[{"image":"base64_image_string"}],"labels":["category1","category2"]}
|
||||
Example response: {"200":{"data":[{"index":0,"prediction":"category1","object":"classification","score":0.85}],"usage":{"total_tokens":10}},"422":{"detail":[{"message":"Validation error","field":"input"}]}}
|
||||
Request body schema for text: {"application/json":{"model":{"type":"string","required":false,"description":"Identifier of the model to use. Required if classifier_id is not provided.","options":[{"name":"jina-embeddings-v3","size":"223M","dimensions":768}]},"classifier_id":{"type":"string","required":false,"description":"The identifier of the classifier. If not provided, a new classifier will be created."},"input":{"type":"array","required":true,"description":"Array of text inputs for classification. Each entry should be a simple string representing the text to classify.","items":{"type":"string"}},"labels":{"type":"array","required":true,"description":"List of labels used for classification.","items":{"type":"string"}}}}
|
||||
Example request: {"model": "jina-embeddings-v3", "input": ["walk", "marathon"], "labels": ["Simple task", "intensive task", "Creative writing"]}
|
||||
Example response: {"usage":{"total_tokens":19},"data":[{"object":"classification","index":0,"prediction":"Simple task","score":0.35543856024742126,"predictions":[{"label":"Simple task","score":0.35543856024742126},{"label":"intensive task","score":0.33334434032440186},{"label":"Creative writing","score":0.3112170696258545}]},{"object":"classification","index":1,"prediction":"intensive task","score":0.3616286516189575,"predictions":[{"label":"Simple task","score":0.34063565731048584},{"label":"intensive task","score":0.3616286516189575},{"label":"Creative writing","score":0.2977357804775238}]}]}
|
||||
Pay attention to the model used, when classifying images you must use `jina-clip-v2`, but when classifying text it is best to use `jina-embeddings-v3` (newest text embedding model from Jina)!!!
|
||||
|
||||
**Note: all API's require authorization using the bearer token (get it from https://jina.ai/?sui=apikey)!**
|
||||
Make sure that any code you generate uses the JINA_API_KEY environment variable, and remind the user to correctly set this variable before running the code!
|
||||
|
||||
# Example solutions
|
||||
|
||||
18. Basic search:
|
||||
- For simple queries, use the search API with the given queries;
|
||||
- For better relevancy, first use the search API to retrieve results, then use the reranker API to find the most relevant results;
|
||||
|
||||
19. Classification tasks:
|
||||
- To classify text snippets (multi-lingual texts), you can use the classification API with jina-embeddings-v3 model;
|
||||
- To classify images, you can use the classification API with jina-clip-v2 model;
|
||||
|
||||
20. Web content processing:
|
||||
- To scrape a webpage, use the reader API directly;
|
||||
- To embed the contents of a webpage, first use the reader API to scrape the text content of the webpage and then use the embeddings API;
|
||||
|
||||
# Integration guidelines
|
||||
|
||||
You should always:
|
||||
- Handle API errors using try/catch blocks;
|
||||
- Implement retries for network failures;
|
||||
- Validate inputs before API calls;
|
||||
- Pay attention to the response of each API and parse it to a usable state;
|
||||
|
||||
You should not:
|
||||
- Chain API's unnecessarily;
|
||||
- Use reranker API without query-document pairs (reranker API needs a query as context to estimate relevancy);
|
||||
- Directly use the response of an API without parsing it;
|
||||
|
||||
# Limitations
|
||||
|
||||
The Jina AI Search Foundation API's cannot perform any actions other than those already mentioned.
|
||||
This includes:
|
||||
- Generating text or images;
|
||||
- Modifying or editing content;
|
||||
- Executing code or perform calculations;
|
||||
- Storing or caching results permanently;
|
||||
|
||||
# Tips for responding to user requests
|
||||
|
||||
21. Start by analyzing the task and identifying which API's should be used;
|
||||
|
||||
22. If multiple API's are required, outline the purpose of each API;
|
||||
|
||||
23. Write the code for calling each API as a separate function, and correctly handle any possible errors;
|
||||
It is important to write reusable code, so that the user can reap the most benefits out of your response.
|
||||
```python
|
||||
def read(url):
|
||||
...
|
||||
|
||||
def main():
|
||||
...
|
||||
```
|
||||
Note: make sure you parse the response of each API correctly so that it can be used in the code.
|
||||
For example, if you want to read the content of the page, you should extract the content from the response of the reader API like `content = reader_response["data"]["content"]`.
|
||||
Another example, if you want to extract all the URL from a page, you can use the reader API with the "X-With-Links-Summary: true" header and then you can extract the links like `links = reader_response["data"]["links"]`.
|
||||
|
||||
24. Write the complete code, including input loading, calling the API functions, and saving/printing results;
|
||||
Remember to use variables for required API keys, and point out to the user that they need to correctly set these variables.
|
||||
|
||||
25. Finally, Jina AI API endpoints rate limits:
|
||||
Embedding & Reranker APIs (api.jina.ai/v1/embeddings, /rerank): 500 RPM & 1M TPM with API key; 2k RPM & 5M TPM with premium key
|
||||
Reader APIs:
|
||||
- r.jina.ai: 200 RPM, 1k RPM premium
|
||||
- s.jina.ai: 40 RPM, 100 RPM premium
|
||||
- g.jina.ai: 10 RPM, 30 RPM premium
|
||||
Classifier APIs (api.jina.ai/v1/classify):
|
||||
- 200 RPM & 500k TPM; 1k RPM & 3M TPM premium
|
||||
Segmenter API (segment.jina.ai): 200 RPM, 1k RPM premium
|
||||
|
||||
Approach your task step by step.
|
|
@ -1,340 +0,0 @@
|
|||
"""
|
||||
LLM interface module using LiteLLM.
|
||||
|
||||
This module provides a unified interface to various LLM providers through LiteLLM,
|
||||
enabling query enhancement, classification, and other LLM-powered functionality.
|
||||
"""
|
||||
|
||||
import os
|
||||
import json
|
||||
from typing import Dict, Any, List, Optional, Tuple, Union
|
||||
import asyncio
|
||||
|
||||
import litellm
|
||||
from litellm import completion
|
||||
|
||||
from config.config import get_config
|
||||
|
||||
|
||||
class LLMInterface:
|
||||
"""Interface for interacting with LLMs through LiteLLM."""
|
||||
|
||||
def __init__(self, model_name: Optional[str] = None):
|
||||
"""
|
||||
Initialize the LLM interface.
|
||||
|
||||
Args:
|
||||
model_name: Name of the LLM model to use. If None, uses the default model
|
||||
from configuration.
|
||||
"""
|
||||
self.config = get_config()
|
||||
|
||||
# Use specified model or default from config
|
||||
self.model_name = model_name or self.config.config_data.get('default_model', 'gpt-3.5-turbo')
|
||||
|
||||
# Get model-specific configuration
|
||||
self.model_config = self.config.get_model_config(self.model_name)
|
||||
|
||||
# Set up LiteLLM with the appropriate provider
|
||||
self._setup_provider()
|
||||
|
||||
def _setup_provider(self) -> None:
|
||||
"""Set up the LLM provider based on the model configuration."""
|
||||
provider = self.model_config.get('provider', 'openai')
|
||||
|
||||
try:
|
||||
# Get API key for the provider
|
||||
api_key = self.config.get_api_key(provider)
|
||||
|
||||
# Set environment variable for the provider
|
||||
if provider.lower() == 'google' or provider.lower() == 'gemini':
|
||||
os.environ["GEMINI_API_KEY"] = api_key
|
||||
elif provider.lower() == 'vertex_ai':
|
||||
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = api_key
|
||||
else:
|
||||
os.environ[f"{provider.upper()}_API_KEY"] = api_key
|
||||
|
||||
print(f"LLM interface initialized with model: {self.model_name} (provider: {provider})")
|
||||
except ValueError as e:
|
||||
print(f"Error setting up LLM provider: {e}")
|
||||
|
||||
def _get_completion_params(self) -> Dict[str, Any]:
|
||||
"""
|
||||
Get parameters for LLM completion based on model configuration.
|
||||
|
||||
Returns:
|
||||
Dictionary of parameters for LiteLLM completion
|
||||
"""
|
||||
params = {
|
||||
'temperature': self.model_config.get('temperature', 0.7),
|
||||
'max_tokens': self.model_config.get('max_tokens', 1000),
|
||||
'top_p': self.model_config.get('top_p', 1.0)
|
||||
}
|
||||
|
||||
# Handle different provider configurations
|
||||
provider = self.model_config.get('provider', 'openai')
|
||||
|
||||
if provider == 'azure':
|
||||
# Azure OpenAI requires special handling
|
||||
deployment_name = self.model_config.get('deployment_name')
|
||||
api_version = self.model_config.get('api_version')
|
||||
endpoint = self.model_config.get('endpoint')
|
||||
|
||||
if deployment_name and endpoint:
|
||||
# Format: azure/deployment_name
|
||||
params['model'] = f"azure/{deployment_name}"
|
||||
|
||||
# Set Azure-specific environment variables if not already set
|
||||
if 'AZURE_API_BASE' not in os.environ and endpoint:
|
||||
os.environ['AZURE_API_BASE'] = endpoint
|
||||
|
||||
if 'AZURE_API_VERSION' not in os.environ and api_version:
|
||||
os.environ['AZURE_API_VERSION'] = api_version
|
||||
else:
|
||||
# Fall back to default model if Azure config is incomplete
|
||||
params['model'] = self.model_name
|
||||
elif provider in ['ollama', 'groq', 'openrouter'] or self.model_config.get('endpoint'):
|
||||
# For providers with custom endpoints
|
||||
params['model'] = self.model_config.get('model_name', self.model_name)
|
||||
params['api_base'] = self.model_config.get('endpoint')
|
||||
|
||||
# Special handling for OpenRouter
|
||||
if provider == 'openrouter':
|
||||
# Set HTTP headers for OpenRouter if needed
|
||||
params['headers'] = {
|
||||
'HTTP-Referer': 'https://sim-search.app', # Replace with your actual app URL
|
||||
'X-Title': 'Intelligent Research System' # Replace with your actual app name
|
||||
}
|
||||
elif provider == 'google' or provider == 'gemini':
|
||||
# Special handling for Google Gemini models
|
||||
# Format: gemini/model_name (e.g., gemini/gemini-2.0-flash)
|
||||
params['model'] = f"gemini/{self.model_config.get('model_name', self.model_name)}"
|
||||
|
||||
# Add additional parameters for Gemini
|
||||
params['custom_llm_provider'] = 'gemini'
|
||||
elif provider == 'vertex_ai':
|
||||
# Special handling for Vertex AI Gemini models
|
||||
params['model'] = f"vertex_ai/{self.model_config.get('model_name', self.model_name)}"
|
||||
|
||||
# Add Vertex AI specific parameters
|
||||
params['vertex_project'] = self.model_config.get('vertex_project', 'sim-search')
|
||||
params['vertex_location'] = self.model_config.get('vertex_location', 'us-central1')
|
||||
|
||||
# Set custom provider
|
||||
params['custom_llm_provider'] = 'vertex_ai'
|
||||
else:
|
||||
# Standard provider (OpenAI, Anthropic, etc.)
|
||||
params['model'] = self.model_name
|
||||
|
||||
return params
|
||||
|
||||
async def generate_completion(self, messages: List[Dict[str, str]], stream: bool = False) -> Union[str, Any]:
|
||||
"""
|
||||
Generate a completion using the configured LLM.
|
||||
|
||||
Args:
|
||||
messages: List of message dictionaries with 'role' and 'content' keys
|
||||
stream: Whether to stream the response
|
||||
|
||||
Returns:
|
||||
If stream is False, returns the completion text as a string
|
||||
If stream is True, returns the completion response object for streaming
|
||||
"""
|
||||
# Get provider from model config
|
||||
provider = self.model_config.get('provider', 'openai').lower()
|
||||
|
||||
# Special handling for Gemini models - they use 'user' and 'model' roles
|
||||
if provider == 'gemini':
|
||||
formatted_messages = []
|
||||
for msg in messages:
|
||||
role = msg['role']
|
||||
# Map 'system' to 'user' for the first message
|
||||
if role == 'system' and not formatted_messages:
|
||||
formatted_messages.append({
|
||||
'role': 'user',
|
||||
'content': msg['content']
|
||||
})
|
||||
# Map 'assistant' to 'model'
|
||||
elif role == 'assistant':
|
||||
formatted_messages.append({
|
||||
'role': 'model',
|
||||
'content': msg['content']
|
||||
})
|
||||
# Keep 'user' as is
|
||||
else:
|
||||
formatted_messages.append(msg)
|
||||
else:
|
||||
formatted_messages = messages
|
||||
|
||||
# Get completion parameters
|
||||
params = self._get_completion_params()
|
||||
|
||||
try:
|
||||
# Generate completion
|
||||
if stream:
|
||||
response = litellm.completion(
|
||||
messages=formatted_messages,
|
||||
stream=True,
|
||||
**params
|
||||
)
|
||||
return response
|
||||
else:
|
||||
response = litellm.completion(
|
||||
messages=formatted_messages,
|
||||
**params
|
||||
)
|
||||
|
||||
# Extract content from response
|
||||
content = response.choices[0].message.content
|
||||
|
||||
# Process thinking tags if enabled
|
||||
if hasattr(self, 'process_thinking_tags') and self.process_thinking_tags:
|
||||
content = self._process_thinking_tags(content)
|
||||
|
||||
return content
|
||||
except Exception as e:
|
||||
error_msg = f"Error generating completion: {str(e)}"
|
||||
print(error_msg)
|
||||
|
||||
# Return error message in a user-friendly format
|
||||
return f"I encountered an error while processing your request: {str(e)}"
|
||||
|
||||
async def classify_query(self, query: str) -> Dict[str, str]:
|
||||
"""
|
||||
Classify a query as factual, exploratory, or comparative.
|
||||
|
||||
Args:
|
||||
query: The query to classify
|
||||
|
||||
Returns:
|
||||
Dictionary with query type and confidence
|
||||
"""
|
||||
# Call the async implementation directly
|
||||
return await self._classify_query_impl(query)
|
||||
|
||||
async def _classify_query_impl(self, query: str) -> Dict[str, str]:
|
||||
"""
|
||||
Classify a query as factual, exploratory, or comparative.
|
||||
|
||||
Args:
|
||||
query: The query to classify
|
||||
|
||||
Returns:
|
||||
Dictionary with query type and confidence
|
||||
"""
|
||||
messages = [
|
||||
{"role": "system", "content": """You are an expert query classifier.
|
||||
Analyze the given query and classify it into one of the following types:
|
||||
- factual: Seeking specific facts or information
|
||||
- exploratory: Seeking to understand a topic broadly
|
||||
- comparative: Seeking to compare multiple items or concepts
|
||||
|
||||
Respond with a JSON object containing:
|
||||
- type: The query type (factual, exploratory, or comparative)
|
||||
- confidence: Your confidence in this classification (high, medium, low)
|
||||
|
||||
Example response:
|
||||
{"type": "exploratory", "confidence": "high"}
|
||||
"""},
|
||||
{"role": "user", "content": query}
|
||||
]
|
||||
|
||||
# Generate classification
|
||||
response = await self.generate_completion(messages)
|
||||
|
||||
# Parse JSON response
|
||||
try:
|
||||
classification = json.loads(response)
|
||||
return classification
|
||||
except json.JSONDecodeError:
|
||||
# Fallback to default classification if parsing fails
|
||||
print(f"Error parsing classification response: {response}")
|
||||
return {"type": "exploratory", "confidence": "low"}
|
||||
|
||||
async def enhance_query(self, query: str) -> str:
|
||||
"""
|
||||
Enhance a user query using the LLM.
|
||||
|
||||
Args:
|
||||
query: The raw user query
|
||||
|
||||
Returns:
|
||||
Enhanced query with additional context and structure
|
||||
"""
|
||||
# Get the model assigned to this specific function
|
||||
model_name = self.config.get_module_model('query_processing', 'enhance_query')
|
||||
|
||||
# Create a new interface with the assigned model if different from current
|
||||
if model_name != self.model_name:
|
||||
interface = LLMInterface(model_name)
|
||||
return await interface._enhance_query_impl(query)
|
||||
|
||||
return await self._enhance_query_impl(query)
|
||||
|
||||
async def _enhance_query_impl(self, query: str) -> str:
|
||||
"""Implementation of query enhancement."""
|
||||
messages = [
|
||||
{"role": "system", "content": "You are an AI research assistant. Your task is to enhance the user's query by adding relevant context, clarifying ambiguities, and expanding key terms. Maintain the original intent of the query while making it more comprehensive and precise. Return ONLY the enhanced query text without any explanations, introductions, or additional text. The enhanced query should be ready to be sent directly to a search engine."},
|
||||
{"role": "user", "content": f"Enhance this research query: {query}"}
|
||||
]
|
||||
|
||||
return await self.generate_completion(messages)
|
||||
|
||||
async def generate_search_queries(self, query: str, search_engines: List[str]) -> Dict[str, List[str]]:
|
||||
"""
|
||||
Generate optimized search queries for different search engines.
|
||||
|
||||
Args:
|
||||
query: The original user query
|
||||
search_engines: List of search engines to generate queries for
|
||||
|
||||
Returns:
|
||||
Dictionary mapping search engines to lists of optimized queries
|
||||
"""
|
||||
# Get the model assigned to this specific function
|
||||
model_name = self.config.get_module_model('query_processing', 'generate_search_queries')
|
||||
|
||||
# Create a new interface with the assigned model if different from current
|
||||
if model_name != self.model_name:
|
||||
interface = LLMInterface(model_name)
|
||||
return await interface._generate_search_queries_impl(query, search_engines)
|
||||
|
||||
return await self._generate_search_queries_impl(query, search_engines)
|
||||
|
||||
async def _generate_search_queries_impl(self, query: str, search_engines: List[str]) -> Dict[str, List[str]]:
|
||||
"""Implementation of search query generation."""
|
||||
engines_str = ", ".join(search_engines)
|
||||
|
||||
messages = [
|
||||
{"role": "system", "content": f"You are an AI research assistant. Generate optimized search queries for the following search engines: {engines_str}. For each search engine, provide 3 variations of the query that are optimized for that engine's search algorithm and will yield comprehensive results."},
|
||||
{"role": "user", "content": f"Generate optimized search queries for this research topic: {query}"}
|
||||
]
|
||||
|
||||
response = await self.generate_completion(messages)
|
||||
|
||||
try:
|
||||
# Try to parse as JSON
|
||||
queries = json.loads(response)
|
||||
return queries
|
||||
except json.JSONDecodeError:
|
||||
# If not valid JSON, return a basic query set
|
||||
return {engine: [query] for engine in search_engines}
|
||||
|
||||
|
||||
# Create a singleton instance for global use
|
||||
llm_interface = LLMInterface()
|
||||
|
||||
|
||||
def get_llm_interface(model_name: Optional[str] = None) -> LLMInterface:
|
||||
"""
|
||||
Get the global LLM interface instance or create a new one with a specific model.
|
||||
|
||||
Args:
|
||||
model_name: Optional model name to use instead of the default
|
||||
|
||||
Returns:
|
||||
LLMInterface instance
|
||||
"""
|
||||
if model_name:
|
||||
return LLMInterface(model_name)
|
||||
return llm_interface
|
|
@ -1,111 +0,0 @@
|
|||
"""
|
||||
Query processor module for the intelligent research system.
|
||||
|
||||
This module handles the processing of user queries, including enhancement,
|
||||
classification, and structuring for downstream modules.
|
||||
"""
|
||||
|
||||
from typing import Dict, Any, List, Optional
|
||||
|
||||
from .llm_interface import get_llm_interface
|
||||
|
||||
|
||||
class QueryProcessor:
|
||||
"""
|
||||
Processor for user research queries.
|
||||
|
||||
This class handles the processing of user queries, including enhancement,
|
||||
classification, and structuring for downstream modules.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
"""Initialize the query processor."""
|
||||
self.llm_interface = get_llm_interface()
|
||||
|
||||
async def process_query(self, query: str) -> Dict[str, Any]:
|
||||
"""
|
||||
Process a user query.
|
||||
|
||||
Args:
|
||||
query: The raw user query
|
||||
|
||||
Returns:
|
||||
Dictionary containing the processed query information
|
||||
"""
|
||||
# Enhance the query
|
||||
enhanced_query = await self.llm_interface.enhance_query(query)
|
||||
|
||||
# Classify the query
|
||||
classification = await self.llm_interface.classify_query(query)
|
||||
|
||||
# Extract entities from the classification
|
||||
entities = classification.get('entities', [])
|
||||
|
||||
# Structure the query for downstream modules
|
||||
structured_query = self._structure_query(query, enhanced_query, classification)
|
||||
|
||||
return structured_query
|
||||
|
||||
def _structure_query(self, original_query: str, enhanced_query: str,
|
||||
classification: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""
|
||||
Structure a query for downstream modules.
|
||||
|
||||
Args:
|
||||
original_query: The original user query
|
||||
enhanced_query: The enhanced query
|
||||
classification: The query classification
|
||||
|
||||
Returns:
|
||||
Dictionary containing the structured query
|
||||
"""
|
||||
return {
|
||||
'original_query': original_query,
|
||||
'enhanced_query': enhanced_query,
|
||||
'type': classification.get('type', 'unknown'),
|
||||
'intent': classification.get('intent', 'research'),
|
||||
'entities': classification.get('entities', []),
|
||||
'timestamp': None, # Will be filled in by the caller
|
||||
'metadata': {
|
||||
'classification': classification
|
||||
}
|
||||
}
|
||||
|
||||
async def generate_search_queries(self, structured_query: Dict[str, Any],
|
||||
search_engines: List[str]) -> Dict[str, Any]:
|
||||
"""
|
||||
Generate optimized search queries for different search engines.
|
||||
|
||||
Args:
|
||||
structured_query: The structured query
|
||||
search_engines: List of search engines to generate queries for
|
||||
|
||||
Returns:
|
||||
Updated structured query with search queries
|
||||
"""
|
||||
# Use the enhanced query for generating search queries
|
||||
enhanced_query = structured_query['enhanced_query']
|
||||
|
||||
# Generate search queries for each engine
|
||||
search_queries = await self.llm_interface.generate_search_queries(
|
||||
enhanced_query, search_engines
|
||||
)
|
||||
|
||||
# Add search queries to the structured query
|
||||
structured_query['search_queries'] = search_queries
|
||||
|
||||
return structured_query
|
||||
|
||||
|
||||
# Create a singleton instance for global use
|
||||
query_processor = QueryProcessor()
|
||||
|
||||
|
||||
def get_query_processor() -> QueryProcessor:
|
||||
"""
|
||||
Get the global query processor instance.
|
||||
|
||||
Returns:
|
||||
QueryProcessor instance
|
||||
"""
|
||||
return query_processor
|
|
@ -1,189 +0,0 @@
|
|||
"""
|
||||
Jina AI Reranker module for the intelligent research system.
|
||||
|
||||
This module provides functionality to rerank documents based on their relevance
|
||||
to a query using Jina AI's Reranker API.
|
||||
"""
|
||||
|
||||
import os
|
||||
import json
|
||||
import requests
|
||||
from typing import List, Dict, Any, Optional, Union
|
||||
|
||||
from config.config import get_config
|
||||
|
||||
|
||||
class JinaReranker:
|
||||
"""
|
||||
Document reranker using Jina AI's Reranker API.
|
||||
|
||||
This class provides methods to rerank documents based on their relevance
|
||||
to a query, improving the quality of search results.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
"""Initialize the Jina Reranker."""
|
||||
self.config = get_config()
|
||||
self.api_key = self._get_api_key()
|
||||
self.endpoint = "https://api.jina.ai/v1/rerank"
|
||||
|
||||
# Get reranker configuration
|
||||
self.reranker_config = self.config.config_data.get('jina', {}).get('reranker', {})
|
||||
self.model = self.reranker_config.get('model', 'jina-reranker-v2-base-multilingual')
|
||||
self.default_top_n = self.reranker_config.get('top_n', 10)
|
||||
|
||||
def _get_api_key(self) -> str:
|
||||
"""
|
||||
Get the Jina AI API key.
|
||||
|
||||
Returns:
|
||||
The API key as a string
|
||||
|
||||
Raises:
|
||||
ValueError: If the API key is not found
|
||||
"""
|
||||
try:
|
||||
return self.config.get_api_key('jina')
|
||||
except ValueError as e:
|
||||
raise ValueError(f"Jina AI API key not found. {str(e)}")
|
||||
|
||||
def rerank(self, query: str, documents: List[str],
|
||||
top_n: Optional[int] = None) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
Rerank documents based on their relevance to the query.
|
||||
|
||||
Args:
|
||||
query: The query to rank documents against
|
||||
documents: List of document strings to rerank
|
||||
top_n: Number of top results to return (optional)
|
||||
|
||||
Returns:
|
||||
List of dictionaries containing reranked documents with scores and indices
|
||||
|
||||
Raises:
|
||||
Exception: If there's an error calling the Reranker API
|
||||
"""
|
||||
if not documents:
|
||||
return []
|
||||
|
||||
# Use default top_n if not specified
|
||||
if top_n is None:
|
||||
top_n = min(self.default_top_n, len(documents))
|
||||
else:
|
||||
top_n = min(top_n, len(documents))
|
||||
|
||||
headers = {
|
||||
"Content-Type": "application/json",
|
||||
"Authorization": f"Bearer {self.api_key}",
|
||||
"Accept": "application/json"
|
||||
}
|
||||
|
||||
# The correct format is an array of plain strings, not objects with a "text" field
|
||||
data = {
|
||||
"model": self.model,
|
||||
"query": query,
|
||||
"documents": documents, # Plain array of strings
|
||||
"top_n": top_n
|
||||
}
|
||||
|
||||
print(f"Making reranker API call with query: {query[:50]}... and {len(documents)} documents")
|
||||
print(f"Request payload structure: model, query, documents (array of {len(documents)} strings), top_n={top_n}")
|
||||
|
||||
try:
|
||||
response = requests.post(self.endpoint, headers=headers, json=data)
|
||||
print(f"Reranker API response status: {response.status_code}")
|
||||
|
||||
if response.status_code != 200:
|
||||
print(f"Reranker API error: {response.text}")
|
||||
return []
|
||||
|
||||
response.raise_for_status() # Raise exception for HTTP errors
|
||||
|
||||
result = response.json()
|
||||
print(f"Reranker API response structure: {list(result.keys())}")
|
||||
|
||||
# Process and return the reranked results
|
||||
reranked_results = []
|
||||
|
||||
# Check for the specific response structure from the API
|
||||
if "results" in result and isinstance(result["results"], list):
|
||||
results_list = result["results"]
|
||||
for item in results_list:
|
||||
if isinstance(item, dict) and "index" in item and "relevance_score" in item:
|
||||
reranked_results.append({
|
||||
'index': item.get('index'),
|
||||
'score': item.get('relevance_score'),
|
||||
'document': documents[item.get('index')] if item.get('index') < len(documents) else None
|
||||
})
|
||||
# Handle newer Jina API format with document.text
|
||||
elif isinstance(item, dict) and "index" in item and "document" in item and "relevance_score" in item:
|
||||
reranked_results.append({
|
||||
'index': item.get('index'),
|
||||
'score': item.get('relevance_score'),
|
||||
'document': documents[item.get('index')] if item.get('index') < len(documents) else None
|
||||
})
|
||||
# Fallback for older response structures with "data" field
|
||||
elif "data" in result and isinstance(result["data"], list):
|
||||
data_list = result["data"]
|
||||
for item in data_list:
|
||||
if isinstance(item, dict) and "index" in item and "relevance_score" in item:
|
||||
reranked_results.append({
|
||||
'index': item.get('index'),
|
||||
'score': item.get('relevance_score'),
|
||||
'document': documents[item.get('index')] if item.get('index') < len(documents) else None
|
||||
})
|
||||
|
||||
print(f"Processed reranker results: {len(reranked_results)} items")
|
||||
return reranked_results
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error calling Jina Reranker API: {str(e)}")
|
||||
# Return original documents with default ordering in case of error
|
||||
return [{'index': i, 'score': 1.0, 'document': doc} for i, doc in enumerate(documents[:top_n])]
|
||||
|
||||
def rerank_with_metadata(self, query: str, documents: List[Dict[str, Any]],
|
||||
document_key: str = 'content',
|
||||
top_n: Optional[int] = None) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
Rerank documents with metadata based on their relevance to the query.
|
||||
|
||||
Args:
|
||||
query: The query to rank documents against
|
||||
documents: List of document dictionaries containing content and metadata
|
||||
document_key: The key in the document dictionaries that contains the text content
|
||||
top_n: Number of top results to return (optional)
|
||||
|
||||
Returns:
|
||||
List of dictionaries containing reranked documents with scores, indices, and original metadata
|
||||
|
||||
Raises:
|
||||
Exception: If there's an error calling the Reranker API
|
||||
"""
|
||||
if not documents:
|
||||
return []
|
||||
|
||||
# Extract document contents
|
||||
doc_contents = [doc.get(document_key, "") for doc in documents]
|
||||
|
||||
# Rerank the document contents
|
||||
reranked_results = self.rerank(query, doc_contents, top_n)
|
||||
|
||||
# Add original metadata to the results
|
||||
for result in reranked_results:
|
||||
result['metadata'] = documents[result['index']]
|
||||
|
||||
return reranked_results
|
||||
|
||||
|
||||
# Create a singleton instance for global use
|
||||
jina_reranker = JinaReranker()
|
||||
|
||||
|
||||
def get_jina_reranker() -> JinaReranker:
|
||||
"""
|
||||
Get the global Jina Reranker instance.
|
||||
|
||||
Returns:
|
||||
JinaReranker instance
|
||||
"""
|
||||
return jina_reranker
|
58
report.md
58
report.md
|
@ -1,58 +0,0 @@
|
|||
**Report: Version Control, DevOps, and Agile Development with Plastic SCM**
|
||||
|
||||
**Introduction**
|
||||
|
||||
Plastic SCM is a version control system that supports DevOps and Agile development methodologies. It offers a range of features, including branching, merging, and collaboration, to help teams manage their software development projects. This report provides an overview of Plastic SCM, its features, and its use in DevOps and Agile development.
|
||||
|
||||
**Key Features**
|
||||
|
||||
Plastic SCM offers several key features that make it an attractive choice for teams. These include:
|
||||
|
||||
* Branching: Plastic SCM allows teams to create branches for specific tasks or features, making it easy to manage multiple lines of development.
|
||||
* Merging: Plastic SCM's merging capabilities are robust and flexible, allowing teams to merge changes from multiple branches into a single branch.
|
||||
* Collaboration: Plastic SCM supports collaborative development, allowing teams to work together on projects and share changes easily.
|
||||
* Replication: Plastic SCM's replication feature allows teams to replicate their repository to multiple servers, making it easy to collaborate with team members and manage changes.
|
||||
|
||||
**DevOps and Agile Development**
|
||||
|
||||
Plastic SCM is well-suited for DevOps and Agile development methodologies. Its branching and merging features make it easy to manage multiple lines of development, and its collaboration features support collaborative development. The system also supports automated testing and continuous integration, making it easy to integrate with CI systems.
|
||||
|
||||
The "Branch per task" pattern is a popular approach in Agile development, where each task is worked on in a separate branch. Plastic SCM supports this approach, allowing teams to create branches for specific tasks or features.
|
||||
|
||||
**Merge Tracking**
|
||||
|
||||
Merge tracking is a critical feature in Plastic SCM. It allows teams to track the history of merges and resolve conflicts easily. The system uses a changeset-based merge tracking algorithm, which ensures that merges are always done in a way that preserves branch history.
|
||||
|
||||
**Directory Management**
|
||||
|
||||
Plastic SCM's directory management features are designed to make it easy to manage large projects. The system supports empty directories and file moves, making it easy to refactor code and manage complex directory structures.
|
||||
|
||||
**Conflict Resolution**
|
||||
|
||||
Conflict resolution is an important aspect of merge tracking. Plastic SCM's conflict resolution algorithm is designed to resolve conflicts automatically, but it can also be configured to require manual intervention.
|
||||
|
||||
**Partial Workspaces**
|
||||
|
||||
Partial workspaces are a feature in Plastic SCM that allows teams to work on a subset of files in a repository. This is useful for teams that need to work on specific files or directories without affecting the rest of the repository.
|
||||
|
||||
**DevOps Implementation**
|
||||
|
||||
Plastic SCM can be used to implement DevOps in a variety of ways. One approach is to use the "Branch per task" pattern, where each task is worked on in a separate branch. This allows teams to automate testing and continuous integration, making it easy to integrate with CI systems.
|
||||
|
||||
**Conclusion**
|
||||
|
||||
Plastic SCM is a powerful version control system that supports DevOps and Agile development methodologies. Its features, including branching, merging, and collaboration, make it an attractive choice for teams. The system's merge tracking and conflict resolution algorithms ensure that merges are always done in a way that preserves branch history, and its directory management and conflict resolution features make it easy to manage complex projects.
|
||||
|
||||
**References**
|
||||
|
||||
[1] Plastic SCM documentation. (n.d.). Retrieved from <https://docs.plasticscm.com/book/>
|
||||
|
||||
[2] Plastic SCM blog. (n.d.). Retrieved from <https://www.plasticscm.com/blog/>
|
||||
|
||||
[3] Plastic SCM GitHub repository. (n.d.). Retrieved from <https://github.com/plasticscm/plasticscm>
|
||||
|
||||
[4] Standish Group. (2018). Chaos Report 2018. Retrieved from <https://www.projectsmart.co.uk/docs/chaos-report-2018.pdf>
|
||||
|
||||
[5] VersionOne. (2019). State of Agile Report 2019. Retrieved from <https://www.versionone.com/docs/state-of-agile-2019.pdf>
|
||||
|
||||
Note: The references listed above are a selection of the sources used to create this report. For a complete list of sources, please refer to the original document.
|
|
@ -1,19 +0,0 @@
|
|||
"""
|
||||
Report generation module for the intelligent research system.
|
||||
|
||||
This module provides functionality to generate reports from search results
|
||||
by scraping documents, storing them in a database, and synthesizing them
|
||||
into a comprehensive report.
|
||||
"""
|
||||
|
||||
from report.report_generator import get_report_generator, initialize_report_generator
|
||||
from report.document_scraper import get_document_scraper
|
||||
from report.database.db_manager import get_db_manager, initialize_database
|
||||
|
||||
__all__ = [
|
||||
'get_report_generator',
|
||||
'initialize_report_generator',
|
||||
'get_document_scraper',
|
||||
'get_db_manager',
|
||||
'initialize_database'
|
||||
]
|
|
@ -1,14 +0,0 @@
|
|||
"""
|
||||
Database module for the report generation module.
|
||||
|
||||
This module provides functionality to create, manage, and query the SQLite database
|
||||
for storing scraped documents and their metadata.
|
||||
"""
|
||||
|
||||
from report.database.db_manager import get_db_manager, initialize_database, DBManager
|
||||
|
||||
__all__ = [
|
||||
'get_db_manager',
|
||||
'initialize_database',
|
||||
'DBManager'
|
||||
]
|
|
@ -1,402 +0,0 @@
|
|||
"""
|
||||
SQLite database manager for the report generation module.
|
||||
|
||||
This module provides functionality to create, manage, and query the SQLite database
|
||||
for storing scraped documents and their metadata.
|
||||
"""
|
||||
|
||||
import os
|
||||
import json
|
||||
import aiosqlite
|
||||
import asyncio
|
||||
import logging
|
||||
from datetime import datetime
|
||||
from typing import Dict, List, Any, Optional, Tuple, Union
|
||||
|
||||
# Configure logging
|
||||
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class DBManager:
|
||||
"""
|
||||
Database manager for the report generation module.
|
||||
|
||||
This class provides methods to create, manage, and query the SQLite database
|
||||
for storing scraped documents and their metadata.
|
||||
"""
|
||||
|
||||
def __init__(self, db_path: str = "report/database/documents.db"):
|
||||
"""
|
||||
Initialize the database manager.
|
||||
|
||||
Args:
|
||||
db_path: Path to the SQLite database file
|
||||
"""
|
||||
self.db_path = db_path
|
||||
self._ensure_dir_exists()
|
||||
|
||||
def _ensure_dir_exists(self):
|
||||
"""Ensure the directory for the database file exists."""
|
||||
db_dir = os.path.dirname(self.db_path)
|
||||
if not os.path.exists(db_dir):
|
||||
os.makedirs(db_dir)
|
||||
logger.info(f"Created directory: {db_dir}")
|
||||
|
||||
async def initialize_db(self):
|
||||
"""
|
||||
Initialize the database by creating necessary tables if they don't exist.
|
||||
|
||||
This method creates the documents and metadata tables.
|
||||
"""
|
||||
async with aiosqlite.connect(self.db_path) as db:
|
||||
# Create documents table
|
||||
await db.execute('''
|
||||
CREATE TABLE IF NOT EXISTS documents (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
url TEXT UNIQUE NOT NULL,
|
||||
title TEXT,
|
||||
content TEXT NOT NULL,
|
||||
scrape_date TIMESTAMP NOT NULL,
|
||||
content_type TEXT,
|
||||
token_count INTEGER,
|
||||
hash TEXT UNIQUE
|
||||
)
|
||||
''')
|
||||
|
||||
# Create metadata table
|
||||
await db.execute('''
|
||||
CREATE TABLE IF NOT EXISTS metadata (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
document_id INTEGER NOT NULL,
|
||||
key TEXT NOT NULL,
|
||||
value TEXT,
|
||||
FOREIGN KEY (document_id) REFERENCES documents (id) ON DELETE CASCADE,
|
||||
UNIQUE (document_id, key)
|
||||
)
|
||||
''')
|
||||
|
||||
# Create index on url for faster lookups
|
||||
await db.execute('CREATE INDEX IF NOT EXISTS idx_documents_url ON documents (url)')
|
||||
|
||||
# Create index on document_id for faster metadata lookups
|
||||
await db.execute('CREATE INDEX IF NOT EXISTS idx_metadata_document_id ON metadata (document_id)')
|
||||
|
||||
await db.commit()
|
||||
logger.info("Database initialized successfully")
|
||||
|
||||
async def document_exists(self, url: str) -> bool:
|
||||
"""
|
||||
Check if a document with the given URL already exists in the database.
|
||||
|
||||
Args:
|
||||
url: URL of the document to check
|
||||
|
||||
Returns:
|
||||
True if the document exists, False otherwise
|
||||
"""
|
||||
async with aiosqlite.connect(self.db_path) as db:
|
||||
db.row_factory = aiosqlite.Row
|
||||
cursor = await db.execute('SELECT id FROM documents WHERE url = ?', (url,))
|
||||
result = await cursor.fetchone()
|
||||
return result is not None
|
||||
|
||||
async def get_document_by_url(self, url: str) -> Optional[Dict[str, Any]]:
|
||||
"""
|
||||
Get a document by its URL.
|
||||
|
||||
Args:
|
||||
url: URL of the document to retrieve
|
||||
|
||||
Returns:
|
||||
Document as a dictionary, or None if not found
|
||||
"""
|
||||
async with aiosqlite.connect(self.db_path) as db:
|
||||
db.row_factory = aiosqlite.Row
|
||||
cursor = await db.execute('''
|
||||
SELECT id, url, title, content, scrape_date, content_type, token_count, hash
|
||||
FROM documents
|
||||
WHERE url = ?
|
||||
''', (url,))
|
||||
|
||||
document = await cursor.fetchone()
|
||||
if not document:
|
||||
return None
|
||||
|
||||
# Convert to dictionary
|
||||
doc_dict = dict(document)
|
||||
|
||||
# Get metadata
|
||||
cursor = await db.execute('''
|
||||
SELECT key, value
|
||||
FROM metadata
|
||||
WHERE document_id = ?
|
||||
''', (doc_dict['id'],))
|
||||
|
||||
metadata = await cursor.fetchall()
|
||||
doc_dict['metadata'] = {row['key']: row['value'] for row in metadata}
|
||||
|
||||
return doc_dict
|
||||
|
||||
async def add_document(self, url: str, title: str, content: str,
|
||||
content_type: str, token_count: int,
|
||||
metadata: Dict[str, str], doc_hash: str) -> int:
|
||||
"""
|
||||
Add a document to the database.
|
||||
|
||||
Args:
|
||||
url: URL of the document
|
||||
title: Title of the document
|
||||
content: Content of the document
|
||||
content_type: Type of content (e.g., 'markdown', 'html', 'text')
|
||||
token_count: Number of tokens in the document
|
||||
metadata: Dictionary of metadata key-value pairs
|
||||
doc_hash: Hash of the document content for deduplication
|
||||
|
||||
Returns:
|
||||
ID of the added document
|
||||
|
||||
Raises:
|
||||
aiosqlite.Error: If there's an error adding the document
|
||||
"""
|
||||
async with aiosqlite.connect(self.db_path) as db:
|
||||
try:
|
||||
# Begin transaction
|
||||
await db.execute('BEGIN TRANSACTION')
|
||||
|
||||
# Insert document
|
||||
cursor = await db.execute('''
|
||||
INSERT INTO documents (url, title, content, scrape_date, content_type, token_count, hash)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?)
|
||||
''', (url, title, content, datetime.now().isoformat(), content_type, token_count, doc_hash))
|
||||
|
||||
document_id = cursor.lastrowid
|
||||
|
||||
# Insert metadata
|
||||
for key, value in metadata.items():
|
||||
await db.execute('''
|
||||
INSERT INTO metadata (document_id, key, value)
|
||||
VALUES (?, ?, ?)
|
||||
''', (document_id, key, value))
|
||||
|
||||
# Commit transaction
|
||||
await db.commit()
|
||||
logger.info(f"Added document: {url} (ID: {document_id})")
|
||||
return document_id
|
||||
|
||||
except aiosqlite.Error as e:
|
||||
# Rollback transaction on error
|
||||
await db.execute('ROLLBACK')
|
||||
logger.error(f"Error adding document: {str(e)}")
|
||||
raise
|
||||
|
||||
async def update_document(self, document_id: int, content: str = None,
|
||||
title: str = None, token_count: int = None,
|
||||
metadata: Dict[str, str] = None) -> bool:
|
||||
"""
|
||||
Update an existing document in the database.
|
||||
|
||||
Args:
|
||||
document_id: ID of the document to update
|
||||
content: New content (optional)
|
||||
title: New title (optional)
|
||||
token_count: New token count (optional)
|
||||
metadata: New or updated metadata (optional)
|
||||
|
||||
Returns:
|
||||
True if the document was updated, False otherwise
|
||||
|
||||
Raises:
|
||||
aiosqlite.Error: If there's an error updating the document
|
||||
"""
|
||||
async with aiosqlite.connect(self.db_path) as db:
|
||||
try:
|
||||
# Begin transaction
|
||||
await db.execute('BEGIN TRANSACTION')
|
||||
|
||||
# Update document fields if provided
|
||||
update_parts = []
|
||||
params = []
|
||||
|
||||
if content is not None:
|
||||
update_parts.append("content = ?")
|
||||
params.append(content)
|
||||
|
||||
if title is not None:
|
||||
update_parts.append("title = ?")
|
||||
params.append(title)
|
||||
|
||||
if token_count is not None:
|
||||
update_parts.append("token_count = ?")
|
||||
params.append(token_count)
|
||||
|
||||
if update_parts:
|
||||
update_query = f"UPDATE documents SET {', '.join(update_parts)} WHERE id = ?"
|
||||
params.append(document_id)
|
||||
await db.execute(update_query, params)
|
||||
|
||||
# Update metadata if provided
|
||||
if metadata:
|
||||
for key, value in metadata.items():
|
||||
# Check if metadata key exists
|
||||
cursor = await db.execute('''
|
||||
SELECT id FROM metadata
|
||||
WHERE document_id = ? AND key = ?
|
||||
''', (document_id, key))
|
||||
|
||||
result = await cursor.fetchone()
|
||||
|
||||
if result:
|
||||
# Update existing metadata
|
||||
await db.execute('''
|
||||
UPDATE metadata SET value = ?
|
||||
WHERE document_id = ? AND key = ?
|
||||
''', (value, document_id, key))
|
||||
else:
|
||||
# Insert new metadata
|
||||
await db.execute('''
|
||||
INSERT INTO metadata (document_id, key, value)
|
||||
VALUES (?, ?, ?)
|
||||
''', (document_id, key, value))
|
||||
|
||||
# Commit transaction
|
||||
await db.commit()
|
||||
logger.info(f"Updated document ID: {document_id}")
|
||||
return True
|
||||
|
||||
except aiosqlite.Error as e:
|
||||
# Rollback transaction on error
|
||||
await db.execute('ROLLBACK')
|
||||
logger.error(f"Error updating document: {str(e)}")
|
||||
raise
|
||||
|
||||
async def delete_document(self, document_id: int) -> bool:
|
||||
"""
|
||||
Delete a document from the database.
|
||||
|
||||
Args:
|
||||
document_id: ID of the document to delete
|
||||
|
||||
Returns:
|
||||
True if the document was deleted, False otherwise
|
||||
"""
|
||||
async with aiosqlite.connect(self.db_path) as db:
|
||||
try:
|
||||
# Begin transaction
|
||||
await db.execute('BEGIN TRANSACTION')
|
||||
|
||||
# Delete document (metadata will be deleted via ON DELETE CASCADE)
|
||||
await db.execute('DELETE FROM documents WHERE id = ?', (document_id,))
|
||||
|
||||
# Commit transaction
|
||||
await db.commit()
|
||||
logger.info(f"Deleted document ID: {document_id}")
|
||||
return True
|
||||
|
||||
except aiosqlite.Error as e:
|
||||
# Rollback transaction on error
|
||||
await db.execute('ROLLBACK')
|
||||
logger.error(f"Error deleting document: {str(e)}")
|
||||
return False
|
||||
|
||||
async def clear_database(self):
|
||||
"""Clear all data from the database."""
|
||||
async with aiosqlite.connect(self.db_path) as db:
|
||||
await db.execute('DELETE FROM metadata')
|
||||
await db.execute('DELETE FROM documents')
|
||||
await db.execute('DELETE FROM sqlite_sequence')
|
||||
await db.commit()
|
||||
logger.info("Database cleared")
|
||||
|
||||
async def search_documents(self, query: str, limit: int = 10, offset: int = 0) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
Search for documents matching the query.
|
||||
|
||||
Args:
|
||||
query: Search query (will be matched against title and content)
|
||||
limit: Maximum number of results to return
|
||||
offset: Number of results to skip
|
||||
|
||||
Returns:
|
||||
List of matching documents as dictionaries
|
||||
"""
|
||||
async with aiosqlite.connect(self.db_path) as db:
|
||||
db.row_factory = aiosqlite.Row
|
||||
|
||||
# Search documents
|
||||
cursor = await db.execute('''
|
||||
SELECT id, url, title, content, scrape_date, content_type, token_count
|
||||
FROM documents
|
||||
WHERE title LIKE ? OR content LIKE ?
|
||||
ORDER BY scrape_date DESC
|
||||
LIMIT ? OFFSET ?
|
||||
''', (f'%{query}%', f'%{query}%', limit, offset))
|
||||
|
||||
documents = await cursor.fetchall()
|
||||
results = []
|
||||
|
||||
# Get metadata for each document
|
||||
for doc in documents:
|
||||
doc_dict = dict(doc)
|
||||
|
||||
cursor = await db.execute('''
|
||||
SELECT key, value
|
||||
FROM metadata
|
||||
WHERE document_id = ?
|
||||
''', (doc_dict['id'],))
|
||||
|
||||
metadata = await cursor.fetchall()
|
||||
doc_dict['metadata'] = {row['key']: row['value'] for row in metadata}
|
||||
|
||||
results.append(doc_dict)
|
||||
|
||||
return results
|
||||
|
||||
async def get_documents_by_urls(self, urls: List[str]) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
Get multiple documents by their URLs.
|
||||
|
||||
Args:
|
||||
urls: List of URLs to retrieve
|
||||
|
||||
Returns:
|
||||
List of documents as dictionaries
|
||||
"""
|
||||
results = []
|
||||
for url in urls:
|
||||
doc = await self.get_document_by_url(url)
|
||||
if doc:
|
||||
results.append(doc)
|
||||
return results
|
||||
|
||||
async def count_documents(self) -> int:
|
||||
"""
|
||||
Get the total number of documents in the database.
|
||||
|
||||
Returns:
|
||||
Number of documents
|
||||
"""
|
||||
async with aiosqlite.connect(self.db_path) as db:
|
||||
cursor = await db.execute('SELECT COUNT(*) as count FROM documents')
|
||||
result = await cursor.fetchone()
|
||||
return result[0] if result else 0
|
||||
|
||||
|
||||
# Create a singleton instance for global use
|
||||
db_manager = DBManager()
|
||||
|
||||
async def initialize_database():
|
||||
"""Initialize the database."""
|
||||
await db_manager.initialize_db()
|
||||
|
||||
def get_db_manager() -> DBManager:
|
||||
"""
|
||||
Get the global database manager instance.
|
||||
|
||||
Returns:
|
||||
DBManager instance
|
||||
"""
|
||||
return db_manager
|
||||
|
||||
# Run database initialization if this module is executed directly
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(initialize_database())
|
Binary file not shown.
|
@ -1,511 +0,0 @@
|
|||
"""
|
||||
Document processor module for the report generation module.
|
||||
|
||||
This module provides functionality to prioritize documents based on relevance scores,
|
||||
chunk long documents into manageable pieces, and select the most relevant chunks
|
||||
to stay within token budget limits.
|
||||
"""
|
||||
|
||||
import re
|
||||
import math
|
||||
import logging
|
||||
import tiktoken
|
||||
from typing import Dict, List, Any, Optional, Tuple, Union, Set
|
||||
from datetime import datetime
|
||||
|
||||
from report.database.db_manager import get_db_manager
|
||||
|
||||
# Configure logging
|
||||
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class DocumentProcessor:
|
||||
"""
|
||||
Document processor for the report generation module.
|
||||
|
||||
This class provides methods to prioritize documents based on relevance scores,
|
||||
chunk long documents into manageable pieces, and select the most relevant chunks
|
||||
to stay within token budget limits.
|
||||
"""
|
||||
|
||||
def __init__(self, default_token_limit: int = 120000):
|
||||
"""
|
||||
Initialize the document processor.
|
||||
|
||||
Args:
|
||||
default_token_limit: Default token limit for the context window
|
||||
"""
|
||||
self.db_manager = get_db_manager()
|
||||
self.default_token_limit = default_token_limit
|
||||
self.tokenizer = tiktoken.get_encoding("cl100k_base") # Using OpenAI's tokenizer
|
||||
|
||||
def _count_tokens(self, text: str) -> int:
|
||||
"""
|
||||
Count the number of tokens in a text.
|
||||
|
||||
Args:
|
||||
text: The text to count tokens for
|
||||
|
||||
Returns:
|
||||
Number of tokens in the text
|
||||
"""
|
||||
return len(self.tokenizer.encode(text))
|
||||
|
||||
def prioritize_documents(self, documents: List[Dict[str, Any]],
|
||||
relevance_scores: Optional[Dict[str, float]] = None,
|
||||
recency_weight: float = 0.3,
|
||||
token_count_weight: float = 0.2) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
Prioritize documents based on relevance scores, recency, and token count.
|
||||
|
||||
Args:
|
||||
documents: List of documents to prioritize
|
||||
relevance_scores: Dictionary mapping document URLs to relevance scores
|
||||
recency_weight: Weight for recency in the prioritization score
|
||||
token_count_weight: Weight for token count in the prioritization score
|
||||
|
||||
Returns:
|
||||
List of documents sorted by priority score
|
||||
"""
|
||||
# If no relevance scores provided, use equal scores for all documents
|
||||
if relevance_scores is None:
|
||||
relevance_scores = {doc['url']: 1.0 for doc in documents}
|
||||
|
||||
# Get current time for recency calculation
|
||||
current_time = datetime.now()
|
||||
|
||||
# Calculate priority scores
|
||||
for doc in documents:
|
||||
# Relevance score (normalized to 0-1)
|
||||
relevance_score = relevance_scores.get(doc['url'], 0.0)
|
||||
|
||||
# Recency score (normalized to 0-1)
|
||||
try:
|
||||
doc_time = datetime.fromisoformat(doc['scrape_date'])
|
||||
time_diff = (current_time - doc_time).total_seconds() / 86400 # Convert to days
|
||||
recency_score = 1.0 / (1.0 + time_diff) # Newer documents get higher scores
|
||||
except (KeyError, ValueError):
|
||||
recency_score = 0.5 # Default if scrape_date is missing or invalid
|
||||
|
||||
# Token count score (normalized to 0-1)
|
||||
# Prefer documents with more tokens, but not too many
|
||||
token_count = doc.get('token_count', 0)
|
||||
token_count_score = min(token_count / 5000, 1.0) # Normalize to 0-1
|
||||
|
||||
# Calculate final priority score
|
||||
relevance_weight = 1.0 - recency_weight - token_count_weight
|
||||
priority_score = (
|
||||
relevance_weight * relevance_score +
|
||||
recency_weight * recency_score +
|
||||
token_count_weight * token_count_score
|
||||
)
|
||||
|
||||
# Add priority score to document
|
||||
doc['priority_score'] = priority_score
|
||||
|
||||
# Sort documents by priority score (descending)
|
||||
return sorted(documents, key=lambda x: x.get('priority_score', 0.0), reverse=True)
|
||||
|
||||
def chunk_document_by_sections(self, document: Dict[str, Any],
|
||||
max_chunk_tokens: int = 1000,
|
||||
overlap_tokens: int = 100) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
Chunk a document by sections based on Markdown headers.
|
||||
|
||||
Args:
|
||||
document: Document to chunk
|
||||
max_chunk_tokens: Maximum number of tokens per chunk
|
||||
overlap_tokens: Number of tokens to overlap between chunks
|
||||
|
||||
Returns:
|
||||
List of document chunks
|
||||
"""
|
||||
content = document.get('content', '')
|
||||
|
||||
# If content is empty, return empty list
|
||||
if not content.strip():
|
||||
return []
|
||||
|
||||
# Ensure document has a title
|
||||
document_title = document.get('title')
|
||||
if document_title is None:
|
||||
document_title = 'Untitled'
|
||||
|
||||
# Find all headers in the content
|
||||
header_pattern = re.compile(r'^(#{1,6})\s+(.+)$', re.MULTILINE)
|
||||
headers = list(header_pattern.finditer(content))
|
||||
|
||||
# If no headers found, use fixed-size chunking
|
||||
if not headers:
|
||||
return self.chunk_document_fixed_size(document, max_chunk_tokens, overlap_tokens)
|
||||
|
||||
chunks = []
|
||||
|
||||
# Process each section (from one header to the next)
|
||||
for i in range(len(headers)):
|
||||
start_pos = headers[i].start()
|
||||
|
||||
# Determine end position (next header or end of content)
|
||||
if i < len(headers) - 1:
|
||||
end_pos = headers[i + 1].start()
|
||||
else:
|
||||
end_pos = len(content)
|
||||
|
||||
section_content = content[start_pos:end_pos]
|
||||
section_tokens = self._count_tokens(section_content)
|
||||
|
||||
# If section is small enough, add it as a single chunk
|
||||
if section_tokens <= max_chunk_tokens:
|
||||
chunks.append({
|
||||
'document_id': document.get('id'),
|
||||
'url': document.get('url'),
|
||||
'title': document_title,
|
||||
'content': section_content,
|
||||
'token_count': section_tokens,
|
||||
'chunk_type': 'section',
|
||||
'section_title': headers[i].group(2),
|
||||
'section_level': len(headers[i].group(1)),
|
||||
'priority_score': document.get('priority_score', 0.0)
|
||||
})
|
||||
else:
|
||||
# If section is too large, split it into fixed-size chunks
|
||||
section_chunks = self._split_text_fixed_size(
|
||||
section_content,
|
||||
max_chunk_tokens,
|
||||
overlap_tokens
|
||||
)
|
||||
|
||||
for j, chunk_content in enumerate(section_chunks):
|
||||
chunk_tokens = self._count_tokens(chunk_content)
|
||||
chunks.append({
|
||||
'document_id': document.get('id'),
|
||||
'url': document.get('url'),
|
||||
'title': document_title,
|
||||
'content': chunk_content,
|
||||
'token_count': chunk_tokens,
|
||||
'chunk_type': 'section_part',
|
||||
'section_title': headers[i].group(2),
|
||||
'section_level': len(headers[i].group(1)),
|
||||
'section_part': j + 1,
|
||||
'total_parts': len(section_chunks),
|
||||
'priority_score': document.get('priority_score', 0.0)
|
||||
})
|
||||
|
||||
return chunks
|
||||
|
||||
def chunk_document_fixed_size(self, document: Dict[str, Any],
|
||||
max_chunk_tokens: int = 1000,
|
||||
overlap_tokens: int = 100) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
Chunk a document into fixed-size chunks with overlap.
|
||||
|
||||
Args:
|
||||
document: Document to chunk
|
||||
max_chunk_tokens: Maximum number of tokens per chunk
|
||||
overlap_tokens: Number of tokens to overlap between chunks
|
||||
|
||||
Returns:
|
||||
List of document chunks
|
||||
"""
|
||||
content = document.get('content', '')
|
||||
|
||||
# If content is empty, return empty list
|
||||
if not content.strip():
|
||||
return []
|
||||
|
||||
# Ensure document has a title
|
||||
document_title = document.get('title')
|
||||
if document_title is None:
|
||||
document_title = 'Untitled'
|
||||
|
||||
# Split the content into fixed-size chunks
|
||||
chunk_contents = self._split_text_fixed_size(content, max_chunk_tokens, overlap_tokens)
|
||||
|
||||
# Create chunk objects
|
||||
chunks = []
|
||||
for i, chunk_content in enumerate(chunk_contents):
|
||||
chunk_tokens = self._count_tokens(chunk_content)
|
||||
chunks.append({
|
||||
'document_id': document.get('id'),
|
||||
'url': document.get('url'),
|
||||
'title': document_title,
|
||||
'content': chunk_content,
|
||||
'token_count': chunk_tokens,
|
||||
'chunk_type': 'fixed',
|
||||
'chunk_index': i,
|
||||
'total_chunks': len(chunk_contents),
|
||||
'priority_score': document.get('priority_score', 0.0) * (1.0 - (i * 0.05)) # Slightly reduce priority for later chunks
|
||||
})
|
||||
|
||||
return chunks
|
||||
|
||||
def chunk_document_hierarchical(self, document: Dict[str, Any],
|
||||
max_chunk_tokens: int = 1000,
|
||||
overlap_tokens: int = 100) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
Chunk a very large document using a hierarchical approach.
|
||||
|
||||
This method first chunks the document by sections, then further chunks
|
||||
large sections into smaller pieces.
|
||||
|
||||
Args:
|
||||
document: Document to chunk
|
||||
max_chunk_tokens: Maximum number of tokens per chunk
|
||||
overlap_tokens: Number of tokens to overlap between chunks
|
||||
|
||||
Returns:
|
||||
List of document chunks
|
||||
"""
|
||||
# First, chunk by sections
|
||||
section_chunks = self.chunk_document_by_sections(document, max_chunk_tokens, overlap_tokens)
|
||||
|
||||
# If the document is small enough, return section chunks
|
||||
if sum(chunk.get('token_count', 0) for chunk in section_chunks) <= max_chunk_tokens * 3:
|
||||
return section_chunks
|
||||
|
||||
# Otherwise, create a summary chunk and keep the most important sections
|
||||
content = document.get('content', '')
|
||||
title = document.get('title', 'Untitled')
|
||||
|
||||
# Extract first paragraph as summary
|
||||
first_para_match = re.search(r'^(.*?)\n\n', content, re.DOTALL)
|
||||
summary = first_para_match.group(1) if first_para_match else content[:500]
|
||||
|
||||
# Create summary chunk
|
||||
summary_chunk = {
|
||||
'document_id': document.get('id'),
|
||||
'url': document.get('url'),
|
||||
'title': title,
|
||||
'content': f"# {title}\n\n{summary}\n\n(This is a summary of a large document)",
|
||||
'token_count': self._count_tokens(f"# {title}\n\n{summary}\n\n(This is a summary of a large document)"),
|
||||
'chunk_type': 'summary',
|
||||
'priority_score': document.get('priority_score', 0.0) * 1.2 # Boost summary priority
|
||||
}
|
||||
|
||||
# Sort section chunks by priority (section level and position)
|
||||
def section_priority(chunk):
|
||||
# Prioritize by section level (lower is more important)
|
||||
level_score = 6 - chunk.get('section_level', 3)
|
||||
# Prioritize earlier sections
|
||||
position_score = 1.0 / (1.0 + chunk.get('chunk_index', 0) + chunk.get('section_part', 0))
|
||||
return level_score * position_score
|
||||
|
||||
sorted_sections = sorted(section_chunks, key=section_priority, reverse=True)
|
||||
|
||||
# Return summary chunk and top sections
|
||||
return [summary_chunk] + sorted_sections
|
||||
|
||||
def _split_text_fixed_size(self, text: str,
|
||||
max_chunk_tokens: int = 1000,
|
||||
overlap_tokens: int = 100) -> List[str]:
|
||||
"""
|
||||
Split text into fixed-size chunks with overlap.
|
||||
|
||||
Args:
|
||||
text: Text to split
|
||||
max_chunk_tokens: Maximum number of tokens per chunk
|
||||
overlap_tokens: Number of tokens to overlap between chunks
|
||||
|
||||
Returns:
|
||||
List of text chunks
|
||||
"""
|
||||
# Encode text into tokens
|
||||
tokens = self.tokenizer.encode(text)
|
||||
|
||||
# If text is small enough, return as a single chunk
|
||||
if len(tokens) <= max_chunk_tokens:
|
||||
return [text]
|
||||
|
||||
# Calculate number of chunks needed
|
||||
num_chunks = math.ceil((len(tokens) - overlap_tokens) / (max_chunk_tokens - overlap_tokens))
|
||||
|
||||
chunks = []
|
||||
|
||||
# Split tokens into chunks
|
||||
for i in range(num_chunks):
|
||||
# Calculate start and end positions
|
||||
start_pos = i * (max_chunk_tokens - overlap_tokens)
|
||||
end_pos = min(start_pos + max_chunk_tokens, len(tokens))
|
||||
|
||||
# Extract chunk tokens
|
||||
chunk_tokens = tokens[start_pos:end_pos]
|
||||
|
||||
# Decode chunk tokens back to text
|
||||
chunk_text = self.tokenizer.decode(chunk_tokens)
|
||||
|
||||
chunks.append(chunk_text)
|
||||
|
||||
return chunks
|
||||
|
||||
def select_chunks_for_context(self, chunks: List[Dict[str, Any]],
|
||||
token_budget: int,
|
||||
min_chunks_per_doc: int = 1) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
Select chunks to include in the context window based on token budget.
|
||||
|
||||
Args:
|
||||
chunks: List of document chunks
|
||||
token_budget: Maximum number of tokens to use
|
||||
min_chunks_per_doc: Minimum number of chunks to include per document
|
||||
|
||||
Returns:
|
||||
List of selected chunks
|
||||
"""
|
||||
# Group chunks by document
|
||||
doc_chunks = {}
|
||||
for chunk in chunks:
|
||||
doc_id = chunk.get('document_id')
|
||||
if doc_id not in doc_chunks:
|
||||
doc_chunks[doc_id] = []
|
||||
doc_chunks[doc_id].append(chunk)
|
||||
|
||||
# Sort chunks within each document by priority
|
||||
for doc_id in doc_chunks:
|
||||
doc_chunks[doc_id] = sorted(
|
||||
doc_chunks[doc_id],
|
||||
key=lambda x: x.get('priority_score', 0.0),
|
||||
reverse=True
|
||||
)
|
||||
|
||||
# Select at least min_chunks_per_doc from each document
|
||||
selected_chunks = []
|
||||
remaining_budget = token_budget
|
||||
|
||||
# First pass: select minimum chunks from each document
|
||||
for doc_id, chunks in doc_chunks.items():
|
||||
for i in range(min(min_chunks_per_doc, len(chunks))):
|
||||
chunk = chunks[i]
|
||||
selected_chunks.append(chunk)
|
||||
remaining_budget -= chunk.get('token_count', 0)
|
||||
|
||||
# If we've exceeded the budget, sort selected chunks and trim
|
||||
if remaining_budget <= 0:
|
||||
selected_chunks = sorted(
|
||||
selected_chunks,
|
||||
key=lambda x: x.get('priority_score', 0.0),
|
||||
reverse=True
|
||||
)
|
||||
|
||||
# Keep adding chunks until we exceed the budget
|
||||
current_budget = 0
|
||||
for i, chunk in enumerate(selected_chunks):
|
||||
current_budget += chunk.get('token_count', 0)
|
||||
if current_budget > token_budget:
|
||||
selected_chunks = selected_chunks[:i]
|
||||
break
|
||||
|
||||
return selected_chunks
|
||||
|
||||
# Second pass: add more chunks based on priority until budget is exhausted
|
||||
# Flatten remaining chunks from all documents
|
||||
remaining_chunks = []
|
||||
for doc_id, chunks in doc_chunks.items():
|
||||
if len(chunks) > min_chunks_per_doc:
|
||||
remaining_chunks.extend(chunks[min_chunks_per_doc:])
|
||||
|
||||
# Sort remaining chunks by priority
|
||||
remaining_chunks = sorted(
|
||||
remaining_chunks,
|
||||
key=lambda x: x.get('priority_score', 0.0),
|
||||
reverse=True
|
||||
)
|
||||
|
||||
# Add chunks until budget is exhausted
|
||||
for chunk in remaining_chunks:
|
||||
if chunk.get('token_count', 0) <= remaining_budget:
|
||||
selected_chunks.append(chunk)
|
||||
remaining_budget -= chunk.get('token_count', 0)
|
||||
|
||||
if remaining_budget <= 0:
|
||||
break
|
||||
|
||||
return selected_chunks
|
||||
|
||||
def process_documents_for_report(self, documents: List[Dict[str, Any]],
|
||||
relevance_scores: Optional[Dict[str, float]] = None,
|
||||
token_budget: Optional[int] = None,
|
||||
chunk_size: int = 1000,
|
||||
overlap_size: int = 100) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
Process documents for report generation.
|
||||
|
||||
This method prioritizes documents, chunks them, and selects the most
|
||||
relevant chunks to stay within the token budget.
|
||||
|
||||
Args:
|
||||
documents: List of documents to process
|
||||
relevance_scores: Dictionary mapping document URLs to relevance scores
|
||||
token_budget: Maximum number of tokens to use (default: self.default_token_limit)
|
||||
chunk_size: Maximum number of tokens per chunk
|
||||
overlap_size: Number of tokens to overlap between chunks
|
||||
|
||||
Returns:
|
||||
List of selected document chunks
|
||||
"""
|
||||
if token_budget is None:
|
||||
token_budget = self.default_token_limit
|
||||
|
||||
# Prioritize documents
|
||||
prioritized_docs = self.prioritize_documents(documents, relevance_scores)
|
||||
|
||||
# Chunk documents
|
||||
all_chunks = []
|
||||
for doc in prioritized_docs:
|
||||
# Ensure document has a title
|
||||
if doc.get('title') is None:
|
||||
doc['title'] = 'Untitled'
|
||||
|
||||
# Choose chunking strategy based on document size
|
||||
token_count = doc.get('token_count', 0)
|
||||
|
||||
if token_count > chunk_size * 10:
|
||||
# Very large document: use hierarchical chunking
|
||||
chunks = self.chunk_document_hierarchical(doc, chunk_size, overlap_size)
|
||||
elif token_count > chunk_size:
|
||||
# Medium document: use section-based chunking
|
||||
chunks = self.chunk_document_by_sections(doc, chunk_size, overlap_size)
|
||||
else:
|
||||
# Small document: keep as a single chunk
|
||||
chunks = [{
|
||||
'document_id': doc.get('id'),
|
||||
'url': doc.get('url'),
|
||||
'title': doc.get('title', 'Untitled'),
|
||||
'content': doc.get('content', ''),
|
||||
'token_count': token_count,
|
||||
'chunk_type': 'full',
|
||||
'priority_score': doc.get('priority_score', 0.0)
|
||||
}]
|
||||
|
||||
# Ensure all chunks have a title
|
||||
for chunk in chunks:
|
||||
if chunk.get('title') is None:
|
||||
chunk['title'] = 'Untitled'
|
||||
|
||||
all_chunks.extend(chunks)
|
||||
|
||||
# Select chunks based on token budget
|
||||
selected_chunks = self.select_chunks_for_context(all_chunks, token_budget)
|
||||
|
||||
# Log statistics
|
||||
total_docs = len(documents)
|
||||
total_chunks = len(all_chunks)
|
||||
selected_chunk_count = len(selected_chunks)
|
||||
selected_token_count = sum(chunk.get('token_count', 0) for chunk in selected_chunks)
|
||||
|
||||
logger.info(f"Processed {total_docs} documents into {total_chunks} chunks")
|
||||
logger.info(f"Selected {selected_chunk_count} chunks with {selected_token_count} tokens")
|
||||
|
||||
return selected_chunks
|
||||
|
||||
|
||||
# Create a singleton instance for global use
|
||||
document_processor = DocumentProcessor()
|
||||
|
||||
def get_document_processor() -> DocumentProcessor:
|
||||
"""
|
||||
Get the global document processor instance.
|
||||
|
||||
Returns:
|
||||
DocumentProcessor instance
|
||||
"""
|
||||
return document_processor
|
|
@ -1,510 +0,0 @@
|
|||
"""
|
||||
Document scraper module for the report generation module.
|
||||
|
||||
This module provides functionality to scrape web pages and extract clean content
|
||||
using Jina Reader API or fallback methods.
|
||||
"""
|
||||
|
||||
import os
|
||||
import re
|
||||
import json
|
||||
import hashlib
|
||||
import logging
|
||||
import asyncio
|
||||
import aiohttp
|
||||
import validators
|
||||
import tiktoken
|
||||
from typing import Dict, List, Any, Optional, Tuple, Union
|
||||
from datetime import datetime
|
||||
from urllib.parse import urlparse, urljoin
|
||||
from bs4 import BeautifulSoup
|
||||
import html2text
|
||||
|
||||
from config.config import get_config
|
||||
from report.database.db_manager import get_db_manager, DBManager
|
||||
|
||||
# Configure logging
|
||||
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class DocumentScraper:
|
||||
"""
|
||||
Document scraper for the report generation module.
|
||||
|
||||
This class provides methods to scrape web pages and extract clean content
|
||||
using Jina Reader API or fallback methods.
|
||||
"""
|
||||
|
||||
def __init__(self, use_mock: bool = False):
|
||||
"""
|
||||
Initialize the document scraper.
|
||||
|
||||
Args:
|
||||
use_mock: If True, use mock data instead of making actual API calls
|
||||
"""
|
||||
self.config = get_config()
|
||||
self.api_key = self._get_api_key()
|
||||
self.endpoint = "https://api.jina.ai/v1/reader"
|
||||
self.db_manager = get_db_manager()
|
||||
self.tokenizer = tiktoken.get_encoding("cl100k_base") # Using OpenAI's tokenizer
|
||||
self.use_mock = use_mock
|
||||
self.jina_api_available = self.api_key != ""
|
||||
|
||||
def _get_api_key(self) -> str:
|
||||
"""
|
||||
Get the Jina AI API key.
|
||||
|
||||
Returns:
|
||||
The API key as a string
|
||||
|
||||
Raises:
|
||||
ValueError: If the API key is not found
|
||||
"""
|
||||
try:
|
||||
return self.config.get_api_key('jina')
|
||||
except ValueError as e:
|
||||
logger.warning(f"Jina AI API key not found. Fallback methods will be used. {str(e)}")
|
||||
return ""
|
||||
|
||||
def _count_tokens(self, text: str) -> int:
|
||||
"""
|
||||
Count the number of tokens in a text.
|
||||
|
||||
Args:
|
||||
text: The text to count tokens for
|
||||
|
||||
Returns:
|
||||
Number of tokens in the text
|
||||
"""
|
||||
return len(self.tokenizer.encode(text))
|
||||
|
||||
def _compute_hash(self, content: str) -> str:
|
||||
"""
|
||||
Compute a hash of the document content for deduplication.
|
||||
|
||||
Args:
|
||||
content: The document content
|
||||
|
||||
Returns:
|
||||
Hash of the content
|
||||
"""
|
||||
return hashlib.sha256(content.encode('utf-8')).hexdigest()
|
||||
|
||||
def _normalize_url(self, url: str) -> str:
|
||||
"""
|
||||
Normalize a URL by removing fragments and unnecessary query parameters.
|
||||
|
||||
Args:
|
||||
url: The URL to normalize
|
||||
|
||||
Returns:
|
||||
Normalized URL
|
||||
"""
|
||||
parsed = urlparse(url)
|
||||
# Remove fragment
|
||||
normalized = parsed._replace(fragment="")
|
||||
|
||||
# TODO: Add more normalization rules if needed
|
||||
|
||||
return normalized.geturl()
|
||||
|
||||
def _validate_url(self, url: str) -> bool:
|
||||
"""
|
||||
Validate a URL.
|
||||
|
||||
Args:
|
||||
url: The URL to validate
|
||||
|
||||
Returns:
|
||||
True if the URL is valid, False otherwise
|
||||
"""
|
||||
return validators.url(url) is True
|
||||
|
||||
async def _extract_metadata_from_html(self, html: str, url: str) -> Dict[str, str]:
|
||||
"""
|
||||
Extract metadata from HTML content.
|
||||
|
||||
Args:
|
||||
html: The HTML content
|
||||
url: The URL of the page
|
||||
|
||||
Returns:
|
||||
Dictionary of metadata
|
||||
"""
|
||||
metadata = {
|
||||
"source_url": url,
|
||||
"scrape_date": datetime.now().isoformat()
|
||||
}
|
||||
|
||||
try:
|
||||
soup = BeautifulSoup(html, 'html.parser')
|
||||
|
||||
# Extract title
|
||||
if soup.title:
|
||||
metadata["title"] = soup.title.string
|
||||
|
||||
# Extract meta tags
|
||||
for meta in soup.find_all('meta'):
|
||||
# Author
|
||||
if meta.get('name') and meta.get('name').lower() == 'author' and meta.get('content'):
|
||||
metadata["author"] = meta.get('content')
|
||||
|
||||
# Description
|
||||
if meta.get('name') and meta.get('name').lower() == 'description' and meta.get('content'):
|
||||
metadata["description"] = meta.get('content')
|
||||
|
||||
# Keywords
|
||||
if meta.get('name') and meta.get('name').lower() == 'keywords' and meta.get('content'):
|
||||
metadata["keywords"] = meta.get('content')
|
||||
|
||||
# Publication date
|
||||
if meta.get('property') and meta.get('property').lower() in ['article:published_time', 'og:published_time'] and meta.get('content'):
|
||||
metadata["publication_date"] = meta.get('content')
|
||||
|
||||
# Open Graph data
|
||||
if meta.get('property') and meta.get('property').lower().startswith('og:') and meta.get('content'):
|
||||
og_key = meta.get('property').lower().replace('og:', 'og_')
|
||||
metadata[og_key] = meta.get('content')
|
||||
|
||||
# Extract structured data (JSON-LD)
|
||||
for script in soup.find_all('script', type='application/ld+json'):
|
||||
try:
|
||||
ld_data = json.loads(script.string)
|
||||
if isinstance(ld_data, dict):
|
||||
# Extract date published
|
||||
if ld_data.get('@type') in ['Article', 'NewsArticle', 'BlogPosting'] and ld_data.get('datePublished'):
|
||||
metadata["publication_date"] = ld_data.get('datePublished')
|
||||
|
||||
# Extract author
|
||||
if ld_data.get('author'):
|
||||
author = ld_data.get('author')
|
||||
if isinstance(author, dict) and author.get('name'):
|
||||
metadata["author"] = author.get('name')
|
||||
elif isinstance(author, str):
|
||||
metadata["author"] = author
|
||||
except (json.JSONDecodeError, AttributeError):
|
||||
pass
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(f"Error extracting metadata: {str(e)}")
|
||||
|
||||
return metadata
|
||||
|
||||
async def _html_to_markdown(self, html: str) -> str:
|
||||
"""
|
||||
Convert HTML to Markdown.
|
||||
|
||||
Args:
|
||||
html: The HTML content
|
||||
|
||||
Returns:
|
||||
Markdown content
|
||||
"""
|
||||
converter = html2text.HTML2Text()
|
||||
converter.ignore_links = False
|
||||
converter.ignore_images = False
|
||||
converter.ignore_tables = False
|
||||
converter.body_width = 0 # No wrapping
|
||||
|
||||
return converter.handle(html)
|
||||
|
||||
async def _get_mock_content(self, url: str) -> Tuple[str, Dict[str, str]]:
|
||||
"""
|
||||
Generate mock content for testing.
|
||||
|
||||
Args:
|
||||
url: The URL to generate mock content for
|
||||
|
||||
Returns:
|
||||
Tuple of (content, metadata)
|
||||
"""
|
||||
domain = urlparse(url).netloc
|
||||
path = urlparse(url).path
|
||||
|
||||
# Generate a title based on the URL
|
||||
title = f"Mock Content for {domain}{path}"
|
||||
|
||||
# Generate mock content
|
||||
content = f"""# {title}
|
||||
|
||||
## Introduction
|
||||
|
||||
This is mock content generated for testing purposes. The original URL is {url}.
|
||||
|
||||
## Section 1
|
||||
|
||||
Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nullam euismod, nisl eget
|
||||
aliquam ultricies, nunc nisl aliquet nunc, quis aliquam nisl nunc eu nisl.
|
||||
|
||||
## Section 2
|
||||
|
||||
Pellentesque habitant morbi tristique senectus et netus et malesuada fames ac turpis egestas.
|
||||
Vestibulum tortor quam, feugiat vitae, ultricies eget, tempor sit amet, ante.
|
||||
|
||||
## Conclusion
|
||||
|
||||
This mock content was generated on {datetime.now().isoformat()}.
|
||||
"""
|
||||
|
||||
# Generate mock metadata
|
||||
metadata = {
|
||||
"source_url": url,
|
||||
"title": title,
|
||||
"description": "This is mock content generated for testing purposes.",
|
||||
"author": "Mock Generator",
|
||||
"scrape_date": datetime.now().isoformat(),
|
||||
"publication_date": datetime.now().isoformat()
|
||||
}
|
||||
|
||||
return content, metadata
|
||||
|
||||
async def _scrape_with_jina_reader(self, url: str) -> Tuple[Optional[str], Optional[Dict[str, str]]]:
|
||||
"""
|
||||
Scrape a web page using Jina Reader API.
|
||||
|
||||
Args:
|
||||
url: The URL to scrape
|
||||
|
||||
Returns:
|
||||
Tuple of (content, metadata)
|
||||
"""
|
||||
# If using mock data, return mock content
|
||||
if self.use_mock:
|
||||
logger.info(f"Using mock data for URL: {url}")
|
||||
return await self._get_mock_content(url)
|
||||
|
||||
# If Jina API is not available, skip this step
|
||||
if not self.jina_api_available:
|
||||
logger.info("Jina API key not available. Using fallback method.")
|
||||
return None, None
|
||||
|
||||
headers = {
|
||||
"Content-Type": "application/json",
|
||||
"Authorization": f"Bearer {self.api_key}",
|
||||
"Accept": "application/json"
|
||||
}
|
||||
|
||||
data = {
|
||||
"url": url,
|
||||
"format": "markdown" # Request markdown format
|
||||
}
|
||||
|
||||
try:
|
||||
async with aiohttp.ClientSession() as session:
|
||||
async with session.post(self.endpoint, headers=headers, json=data, timeout=30) as response:
|
||||
if response.status != 200:
|
||||
error_text = await response.text()
|
||||
logger.warning(f"Jina Reader API error: {response.status} - {error_text}")
|
||||
|
||||
# If we get a 404 or 429 (rate limit), mark the API as unavailable for this session
|
||||
if response.status in [404, 429]:
|
||||
logger.warning("Jina Reader API appears to be unavailable. Using fallback method for all subsequent requests.")
|
||||
self.jina_api_available = False
|
||||
|
||||
return None, None
|
||||
|
||||
result = await response.json()
|
||||
|
||||
if "content" not in result:
|
||||
logger.warning(f"Jina Reader API returned no content: {result}")
|
||||
return None, None
|
||||
|
||||
content = result.get("content", "")
|
||||
metadata = result.get("metadata", {})
|
||||
|
||||
# Add source URL to metadata
|
||||
metadata["source_url"] = url
|
||||
|
||||
return content, metadata
|
||||
|
||||
except asyncio.TimeoutError:
|
||||
logger.warning(f"Timeout calling Jina Reader API for URL: {url}")
|
||||
return None, None
|
||||
except Exception as e:
|
||||
logger.error(f"Error calling Jina Reader API: {str(e)}")
|
||||
return None, None
|
||||
|
||||
async def _scrape_with_fallback(self, url: str) -> Tuple[Optional[str], Optional[Dict[str, str]]]:
|
||||
"""
|
||||
Scrape a web page using fallback method (aiohttp + BeautifulSoup).
|
||||
|
||||
Args:
|
||||
url: The URL to scrape
|
||||
|
||||
Returns:
|
||||
Tuple of (content, metadata)
|
||||
"""
|
||||
# If using mock data, return mock content
|
||||
if self.use_mock:
|
||||
logger.info(f"Using mock data for URL: {url}")
|
||||
return await self._get_mock_content(url)
|
||||
|
||||
try:
|
||||
async with aiohttp.ClientSession() as session:
|
||||
async with session.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=30) as response:
|
||||
if response.status != 200:
|
||||
logger.warning(f"Failed to fetch URL: {url} - Status: {response.status}")
|
||||
return None, None
|
||||
|
||||
html = await response.text()
|
||||
|
||||
# Extract metadata
|
||||
metadata = await self._extract_metadata_from_html(html, url)
|
||||
|
||||
# Convert to markdown
|
||||
content = await self._html_to_markdown(html)
|
||||
|
||||
return content, metadata
|
||||
|
||||
except asyncio.TimeoutError:
|
||||
logger.warning(f"Timeout fetching URL: {url}")
|
||||
return None, None
|
||||
except Exception as e:
|
||||
logger.error(f"Error in fallback scraping: {str(e)}")
|
||||
return None, None
|
||||
|
||||
async def scrape_url(self, url: str, force_refresh: bool = False) -> Optional[Dict[str, Any]]:
|
||||
"""
|
||||
Scrape a web page and store the content in the database.
|
||||
|
||||
Args:
|
||||
url: The URL to scrape
|
||||
force_refresh: If True, scrape the URL even if it's already in the database
|
||||
|
||||
Returns:
|
||||
Document dictionary if successful, None otherwise
|
||||
"""
|
||||
# Validate URL
|
||||
if not self._validate_url(url):
|
||||
logger.warning(f"Invalid URL: {url}")
|
||||
return None
|
||||
|
||||
# Normalize URL
|
||||
normalized_url = self._normalize_url(url)
|
||||
|
||||
# Check if document already exists in database
|
||||
if not force_refresh and await self.db_manager.document_exists(normalized_url):
|
||||
logger.info(f"Document already exists in database: {normalized_url}")
|
||||
return await self.db_manager.get_document_by_url(normalized_url)
|
||||
|
||||
# Try Jina Reader first if it's available
|
||||
content, metadata = None, None
|
||||
if self.jina_api_available:
|
||||
content, metadata = await self._scrape_with_jina_reader(normalized_url)
|
||||
|
||||
# Fallback to custom scraping if Jina Reader fails or is unavailable
|
||||
if content is None:
|
||||
logger.info(f"Falling back to custom scraping for URL: {normalized_url}")
|
||||
content, metadata = await self._scrape_with_fallback(normalized_url)
|
||||
|
||||
if content is None or not content.strip():
|
||||
logger.warning(f"Failed to extract content from URL: {normalized_url}")
|
||||
return None
|
||||
|
||||
# Count tokens
|
||||
token_count = self._count_tokens(content)
|
||||
|
||||
# Compute hash for deduplication
|
||||
doc_hash = self._compute_hash(content)
|
||||
|
||||
# Get title from metadata or use URL as fallback
|
||||
title = metadata.get("title", urlparse(normalized_url).netloc)
|
||||
|
||||
# Store in database
|
||||
try:
|
||||
document_id = await self.db_manager.add_document(
|
||||
url=normalized_url,
|
||||
title=title,
|
||||
content=content,
|
||||
content_type="markdown",
|
||||
token_count=token_count,
|
||||
metadata=metadata,
|
||||
doc_hash=doc_hash
|
||||
)
|
||||
|
||||
# Return the document
|
||||
return await self.db_manager.get_document_by_url(normalized_url)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error storing document in database: {str(e)}")
|
||||
return None
|
||||
|
||||
async def scrape_urls(self, urls: List[str], force_refresh: bool = False) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
Scrape multiple URLs in parallel.
|
||||
|
||||
Args:
|
||||
urls: List of URLs to scrape
|
||||
force_refresh: If True, scrape URLs even if they're already in the database
|
||||
|
||||
Returns:
|
||||
List of document dictionaries
|
||||
"""
|
||||
tasks = [self.scrape_url(url, force_refresh) for url in urls]
|
||||
results = await asyncio.gather(*tasks)
|
||||
|
||||
# Filter out None results
|
||||
return [doc for doc in results if doc is not None]
|
||||
|
||||
|
||||
# Create a singleton instance for global use
|
||||
document_scraper = DocumentScraper()
|
||||
|
||||
def get_document_scraper(use_mock: bool = False) -> DocumentScraper:
|
||||
"""
|
||||
Get the global document scraper instance.
|
||||
|
||||
Args:
|
||||
use_mock: If True, create a new instance with mock data
|
||||
|
||||
Returns:
|
||||
DocumentScraper instance
|
||||
"""
|
||||
global document_scraper
|
||||
|
||||
# If mock is requested, create a new instance with mock enabled
|
||||
if use_mock:
|
||||
return DocumentScraper(use_mock=True)
|
||||
|
||||
return document_scraper
|
||||
|
||||
# Example usage
|
||||
async def test_scraper(use_mock: bool = False):
|
||||
"""
|
||||
Test the document scraper with a sample URL.
|
||||
|
||||
Args:
|
||||
use_mock: If True, use mock data instead of making actual API calls
|
||||
"""
|
||||
from report.database.db_manager import initialize_database
|
||||
|
||||
# Initialize database
|
||||
await initialize_database()
|
||||
|
||||
# Scrape a URL
|
||||
scraper = get_document_scraper(use_mock=use_mock)
|
||||
|
||||
# Test URLs
|
||||
test_urls = [
|
||||
"https://en.wikipedia.org/wiki/Web_scraping",
|
||||
"https://docs.python.org/3/",
|
||||
"https://www.python.org/"
|
||||
]
|
||||
|
||||
print(f"Testing scraper with {'mock data' if use_mock else 'real data'}")
|
||||
|
||||
for url in test_urls:
|
||||
print(f"\nScraping URL: {url}")
|
||||
document = await scraper.scrape_url(url)
|
||||
|
||||
if document:
|
||||
print(f"Successfully scraped document: {document['title']}")
|
||||
print(f"Token count: {document['token_count']}")
|
||||
print(f"Content preview: {document['content'][:200]}...")
|
||||
else:
|
||||
print(f"Failed to scrape document: {url}")
|
||||
|
||||
# Run test if this module is executed directly
|
||||
if __name__ == "__main__":
|
||||
# Test with real data by default
|
||||
asyncio.run(test_scraper(use_mock=False))
|
|
@ -1,532 +0,0 @@
|
|||
"""
|
||||
Progressive report synthesis module for the intelligent research system.
|
||||
|
||||
This module provides functionality to synthesize reports from document chunks
|
||||
using LLMs with a progressive approach, where chunks are processed iteratively
|
||||
and the report is refined over time.
|
||||
"""
|
||||
|
||||
import os
|
||||
import json
|
||||
import asyncio
|
||||
import logging
|
||||
import time
|
||||
from typing import Dict, List, Any, Optional, Tuple, Union, Set
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
import litellm
|
||||
from litellm import completion
|
||||
|
||||
from config.config import get_config
|
||||
from report.report_detail_levels import get_report_detail_level_manager, DetailLevel
|
||||
from report.report_templates import QueryType, DetailLevel as TemplateDetailLevel, ReportTemplateManager, ReportTemplate
|
||||
from report.report_synthesis import ReportSynthesizer
|
||||
|
||||
# Configure logging
|
||||
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
|
||||
class ReportState:
|
||||
"""Class to track the state of a progressive report."""
|
||||
current_report: str = ""
|
||||
processed_chunks: Set[str] = field(default_factory=set)
|
||||
version: int = 0
|
||||
last_update_time: float = field(default_factory=time.time)
|
||||
improvement_scores: List[float] = field(default_factory=list)
|
||||
is_complete: bool = False
|
||||
termination_reason: Optional[str] = None
|
||||
|
||||
|
||||
class ProgressiveReportSynthesizer(ReportSynthesizer):
|
||||
"""
|
||||
Progressive report synthesizer for the intelligent research system.
|
||||
|
||||
This class extends the ReportSynthesizer to implement a progressive approach
|
||||
to report generation, where chunks are processed iteratively and the report
|
||||
is refined over time.
|
||||
"""
|
||||
|
||||
def __init__(self, model_name: Optional[str] = None):
|
||||
"""
|
||||
Initialize the progressive report synthesizer.
|
||||
|
||||
Args:
|
||||
model_name: Name of the LLM model to use. If None, uses the default model
|
||||
from configuration.
|
||||
"""
|
||||
super().__init__(model_name)
|
||||
|
||||
# Initialize report state
|
||||
self.report_state = ReportState()
|
||||
|
||||
# Configuration for progressive generation
|
||||
self.min_improvement_threshold = 0.2 # Minimum improvement score to continue
|
||||
self.max_consecutive_low_improvements = 3 # Max number of consecutive low improvements before stopping
|
||||
self.batch_size = 3 # Number of chunks to process in each iteration
|
||||
self.max_iterations = 20 # Maximum number of iterations
|
||||
self.consecutive_low_improvements = 0 # Counter for consecutive low improvements
|
||||
|
||||
# Progress tracking
|
||||
self.total_chunks = 0
|
||||
self.processed_chunk_count = 0
|
||||
self.progress_callback = None
|
||||
|
||||
def set_progress_callback(self, callback):
|
||||
"""
|
||||
Set a callback function to report progress.
|
||||
|
||||
Args:
|
||||
callback: Function that takes (current_progress, total, current_report) as arguments
|
||||
"""
|
||||
self.progress_callback = callback
|
||||
|
||||
def _report_progress(self):
|
||||
"""Report progress through the callback if set."""
|
||||
if self.progress_callback and self.total_chunks > 0:
|
||||
progress = min(self.processed_chunk_count / self.total_chunks, 1.0)
|
||||
self.progress_callback(progress, self.total_chunks, self.report_state.current_report)
|
||||
|
||||
def prioritize_chunks(self, chunks: List[Dict[str, Any]], query: str) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
Prioritize chunks based on relevance to the query and other factors.
|
||||
|
||||
Args:
|
||||
chunks: List of document chunks
|
||||
query: Original search query
|
||||
|
||||
Returns:
|
||||
List of chunks sorted by priority
|
||||
"""
|
||||
# Start with chunks already prioritized by the document processor
|
||||
# Further refine based on additional criteria if needed
|
||||
|
||||
# Filter out chunks that have already been processed
|
||||
unprocessed_chunks = [
|
||||
chunk for chunk in chunks
|
||||
if chunk.get('document_id') and str(chunk.get('document_id')) not in self.report_state.processed_chunks
|
||||
]
|
||||
|
||||
# If all chunks have been processed, return an empty list
|
||||
if not unprocessed_chunks:
|
||||
return []
|
||||
|
||||
# Sort by priority score (already set by document processor)
|
||||
prioritized_chunks = sorted(
|
||||
unprocessed_chunks,
|
||||
key=lambda x: x.get('priority_score', 0.0),
|
||||
reverse=True
|
||||
)
|
||||
|
||||
return prioritized_chunks
|
||||
|
||||
async def extract_information_from_chunk(self, chunk: Dict[str, Any], query: str, detail_level: str = "comprehensive", query_type: str = "exploratory") -> str:
|
||||
"""
|
||||
Extract key information from a document chunk.
|
||||
|
||||
Args:
|
||||
chunk: Document chunk
|
||||
query: Original search query
|
||||
detail_level: Level of detail for extraction
|
||||
query_type: Type of query (factual, exploratory, comparative)
|
||||
|
||||
Returns:
|
||||
Extracted information as a string
|
||||
"""
|
||||
# Get the appropriate extraction prompt based on detail level and query type
|
||||
extraction_prompt = self._get_extraction_prompt(detail_level, query_type)
|
||||
|
||||
# Create a prompt for extracting key information from the chunk
|
||||
messages = [
|
||||
{"role": "system", "content": extraction_prompt},
|
||||
{"role": "user", "content": f"""Query: {query}
|
||||
|
||||
Document title: {chunk.get('title', 'Untitled')}
|
||||
Document URL: {chunk.get('url', 'Unknown')}
|
||||
|
||||
Document chunk content:
|
||||
{chunk.get('content', '')}
|
||||
|
||||
Extract the most relevant information from this document chunk that addresses the query."""}
|
||||
]
|
||||
|
||||
# Process the chunk with the LLM
|
||||
extracted_info = await self.generate_completion(messages)
|
||||
|
||||
return extracted_info
|
||||
|
||||
async def refine_report(self, current_report: str, new_information: List[Tuple[Dict[str, Any], str]], query: str, query_type: str, detail_level: str) -> Tuple[str, float]:
|
||||
"""
|
||||
Refine the current report with new information.
|
||||
|
||||
Args:
|
||||
current_report: Current version of the report
|
||||
new_information: List of tuples containing (chunk, extracted_information)
|
||||
query: Original search query
|
||||
query_type: Type of query (factual, exploratory, comparative)
|
||||
detail_level: Level of detail for the report
|
||||
|
||||
Returns:
|
||||
Tuple of (refined_report, improvement_score)
|
||||
"""
|
||||
# Prepare context with new information
|
||||
context = ""
|
||||
for chunk, extracted_info in new_information:
|
||||
title = chunk.get('title', 'Untitled')
|
||||
url = chunk.get('url', 'Unknown')
|
||||
|
||||
context += f"Document: {title}\n"
|
||||
context += f"URL: {url}\n"
|
||||
context += f"Source URL: {url}\n" # Duplicate for emphasis
|
||||
context += f"Extracted information:\n{extracted_info}\n\n"
|
||||
|
||||
# Get template for the report
|
||||
template = self._get_template_from_strings(query_type, detail_level)
|
||||
|
||||
if not template:
|
||||
raise ValueError(f"No template found for {query_type} {detail_level}")
|
||||
|
||||
# Create the prompt for refining the report
|
||||
messages = [
|
||||
{"role": "system", "content": f"""You are an expert research assistant tasked with progressively refining a research report.
|
||||
|
||||
You will be given:
|
||||
1. The current version of the report
|
||||
2. New information extracted from additional documents
|
||||
|
||||
Your task is to refine and improve the report by incorporating the new information. Follow these guidelines:
|
||||
|
||||
1. Maintain the overall structure and format of the report
|
||||
2. Add new relevant information where appropriate
|
||||
3. Expand sections with new details, examples, or evidence
|
||||
4. Improve analysis based on the new information
|
||||
5. Add or update citations for new information
|
||||
6. Ensure the report follows this template structure:
|
||||
{template.template}
|
||||
|
||||
Format the report in Markdown with clear headings, subheadings, and bullet points where appropriate.
|
||||
Make the report readable, engaging, and informative while maintaining academic rigor.
|
||||
|
||||
IMPORTANT FOR REFERENCES:
|
||||
- Use a consistent format: [1] Title of the Article/Page. URL
|
||||
- DO NOT use generic placeholders like "Document 1" for references
|
||||
- ALWAYS include the actual URL from the source documents
|
||||
- Each reference MUST include both the title and the URL
|
||||
- Make sure all references are complete and properly formatted
|
||||
- Number the references sequentially
|
||||
|
||||
After refining the report, rate how much the new information improved the report on a scale of 0.0 to 1.0:
|
||||
- 0.0: No improvement (new information was redundant or irrelevant)
|
||||
- 0.5: Moderate improvement (new information added some value)
|
||||
- 1.0: Significant improvement (new information substantially enhanced the report)
|
||||
|
||||
End your response with a single line containing only the improvement score in this format:
|
||||
IMPROVEMENT_SCORE: [score]
|
||||
"""},
|
||||
{"role": "user", "content": f"""Query: {query}
|
||||
|
||||
Current report:
|
||||
{current_report}
|
||||
|
||||
New information from additional sources:
|
||||
{context}
|
||||
|
||||
Please refine the report by incorporating this new information while maintaining the overall structure and format."""}
|
||||
]
|
||||
|
||||
# Generate the refined report
|
||||
response = await self.generate_completion(messages)
|
||||
|
||||
# Extract the improvement score
|
||||
improvement_score = 0.5 # Default moderate improvement
|
||||
score_line = response.strip().split('\n')[-1]
|
||||
if score_line.startswith('IMPROVEMENT_SCORE:'):
|
||||
try:
|
||||
improvement_score = float(score_line.split(':')[1].strip())
|
||||
# Remove the score line from the report
|
||||
response = '\n'.join(response.strip().split('\n')[:-1])
|
||||
except (ValueError, IndexError):
|
||||
logger.warning("Could not parse improvement score, using default value of 0.5")
|
||||
|
||||
return response, improvement_score
|
||||
|
||||
async def initialize_report(self, initial_chunks: List[Dict[str, Any]], query: str, query_type: str, detail_level: str) -> str:
|
||||
"""
|
||||
Initialize the report with the first batch of chunks.
|
||||
|
||||
Args:
|
||||
initial_chunks: Initial batch of document chunks
|
||||
query: Original search query
|
||||
query_type: Type of query (factual, exploratory, comparative)
|
||||
detail_level: Level of detail for the report
|
||||
|
||||
Returns:
|
||||
Initial report as a string
|
||||
"""
|
||||
logger.info(f"Initializing report with {len(initial_chunks)} chunks")
|
||||
|
||||
# Process initial chunks using the standard map-reduce approach
|
||||
processed_chunks = await self.map_document_chunks(initial_chunks, query, detail_level, query_type)
|
||||
|
||||
# Generate initial report
|
||||
initial_report = await self.reduce_processed_chunks(processed_chunks, query, query_type, detail_level)
|
||||
|
||||
# Update report state
|
||||
self.report_state.current_report = initial_report
|
||||
self.report_state.version = 1
|
||||
self.report_state.last_update_time = time.time()
|
||||
|
||||
# Mark chunks as processed
|
||||
for chunk in initial_chunks:
|
||||
if chunk.get('document_id'):
|
||||
self.report_state.processed_chunks.add(str(chunk.get('document_id')))
|
||||
|
||||
self.processed_chunk_count += len(initial_chunks)
|
||||
self._report_progress()
|
||||
|
||||
return initial_report
|
||||
|
||||
def should_terminate(self, improvement_score: float) -> Tuple[bool, Optional[str]]:
|
||||
"""
|
||||
Determine if the progressive report generation should terminate.
|
||||
|
||||
Args:
|
||||
improvement_score: Score indicating how much the report improved
|
||||
|
||||
Returns:
|
||||
Tuple of (should_terminate, reason)
|
||||
"""
|
||||
# Check if all chunks have been processed
|
||||
if self.processed_chunk_count >= self.total_chunks:
|
||||
return True, "All chunks processed"
|
||||
|
||||
# Check if maximum iterations reached
|
||||
if self.report_state.version >= self.max_iterations:
|
||||
return True, "Maximum iterations reached"
|
||||
|
||||
# Check for diminishing returns
|
||||
if improvement_score < self.min_improvement_threshold:
|
||||
self.consecutive_low_improvements += 1
|
||||
if self.consecutive_low_improvements >= self.max_consecutive_low_improvements:
|
||||
return True, "Diminishing returns (consecutive low improvements)"
|
||||
else:
|
||||
self.consecutive_low_improvements = 0
|
||||
|
||||
return False, None
|
||||
|
||||
async def synthesize_report_progressively(self, chunks: List[Dict[str, Any]], query: str, query_type: str = "exploratory", detail_level: str = "comprehensive") -> str:
|
||||
"""
|
||||
Synthesize a report from document chunks using a progressive approach.
|
||||
|
||||
Args:
|
||||
chunks: List of document chunks
|
||||
query: Original search query
|
||||
query_type: Type of query (factual, exploratory, comparative)
|
||||
detail_level: Level of detail for the report
|
||||
|
||||
Returns:
|
||||
Synthesized report as a string
|
||||
"""
|
||||
if not chunks:
|
||||
logger.warning("No document chunks provided for report synthesis.")
|
||||
return "No information found for the given query."
|
||||
|
||||
# Reset report state
|
||||
self.report_state = ReportState()
|
||||
self.consecutive_low_improvements = 0
|
||||
self.total_chunks = len(chunks)
|
||||
self.processed_chunk_count = 0
|
||||
|
||||
# Verify that a template exists for the given query type and detail level
|
||||
template = self._get_template_from_strings(query_type, detail_level)
|
||||
if not template:
|
||||
logger.warning(f"No template found for {query_type} {detail_level}, falling back to standard template")
|
||||
# Fall back to standard detail level if the requested one doesn't exist
|
||||
detail_level = "standard"
|
||||
|
||||
# Determine batch size based on the model
|
||||
if "gemini" in self.model_name.lower():
|
||||
self.batch_size = 5 # Larger batch size for Gemini models with 1M token windows
|
||||
else:
|
||||
self.batch_size = 3 # Smaller batch size for other models
|
||||
|
||||
logger.info(f"Using batch size of {self.batch_size} for model {self.model_name}")
|
||||
|
||||
# Prioritize chunks
|
||||
prioritized_chunks = self.prioritize_chunks(chunks, query)
|
||||
|
||||
# Initialize report with first batch of chunks
|
||||
initial_batch = prioritized_chunks[:self.batch_size]
|
||||
await self.initialize_report(initial_batch, query, query_type, detail_level)
|
||||
|
||||
# Progressive refinement loop
|
||||
while True:
|
||||
# Check if we should terminate
|
||||
should_terminate, reason = self.should_terminate(
|
||||
self.report_state.improvement_scores[-1] if self.report_state.improvement_scores else 1.0
|
||||
)
|
||||
|
||||
if should_terminate:
|
||||
logger.info(f"Terminating progressive report generation: {reason}")
|
||||
self.report_state.is_complete = True
|
||||
self.report_state.termination_reason = reason
|
||||
break
|
||||
|
||||
# Get next batch of chunks
|
||||
prioritized_chunks = self.prioritize_chunks(chunks, query)
|
||||
next_batch = prioritized_chunks[:self.batch_size]
|
||||
|
||||
if not next_batch:
|
||||
logger.info("No more chunks to process")
|
||||
self.report_state.is_complete = True
|
||||
self.report_state.termination_reason = "All chunks processed"
|
||||
break
|
||||
|
||||
logger.info(f"Processing batch {self.report_state.version + 1} with {len(next_batch)} chunks")
|
||||
|
||||
# Extract information from chunks
|
||||
new_information = []
|
||||
for chunk in next_batch:
|
||||
extracted_info = await self.extract_information_from_chunk(chunk, query, detail_level, query_type)
|
||||
new_information.append((chunk, extracted_info))
|
||||
|
||||
# Mark chunk as processed
|
||||
if chunk.get('document_id'):
|
||||
self.report_state.processed_chunks.add(str(chunk.get('document_id')))
|
||||
|
||||
# Refine report with new information
|
||||
refined_report, improvement_score = await self.refine_report(
|
||||
self.report_state.current_report,
|
||||
new_information,
|
||||
query,
|
||||
query_type,
|
||||
detail_level
|
||||
)
|
||||
|
||||
# Update report state
|
||||
self.report_state.current_report = refined_report
|
||||
self.report_state.version += 1
|
||||
self.report_state.last_update_time = time.time()
|
||||
self.report_state.improvement_scores.append(improvement_score)
|
||||
|
||||
self.processed_chunk_count += len(next_batch)
|
||||
self._report_progress()
|
||||
|
||||
logger.info(f"Completed iteration {self.report_state.version} with improvement score {improvement_score:.2f}")
|
||||
|
||||
# Add a small delay between iterations to avoid rate limiting
|
||||
await asyncio.sleep(2)
|
||||
|
||||
# Final report
|
||||
return self.report_state.current_report
|
||||
|
||||
async def synthesize_report(self, chunks: List[Dict[str, Any]], query: str, query_type: str = "exploratory", detail_level: str = "standard") -> str:
|
||||
"""
|
||||
Synthesize a report from document chunks.
|
||||
|
||||
This method overrides the parent method to use progressive synthesis for comprehensive
|
||||
detail level and standard map-reduce for other detail levels.
|
||||
|
||||
Args:
|
||||
chunks: List of document chunks
|
||||
query: Original search query
|
||||
query_type: Type of query (factual, exploratory, comparative)
|
||||
detail_level: Level of detail for the report
|
||||
|
||||
Returns:
|
||||
Synthesized report as a string
|
||||
"""
|
||||
# Use progressive synthesis for comprehensive detail level
|
||||
if detail_level.lower() == "comprehensive":
|
||||
logger.info(f"Using progressive synthesis for {detail_level} detail level")
|
||||
return await self.synthesize_report_progressively(chunks, query, query_type, detail_level)
|
||||
else:
|
||||
# Use standard map-reduce for other detail levels
|
||||
logger.info(f"Using standard map-reduce for {detail_level} detail level")
|
||||
return await super().synthesize_report(chunks, query, query_type, detail_level)
|
||||
|
||||
|
||||
# Create a singleton instance for global use
|
||||
progressive_report_synthesizer = ProgressiveReportSynthesizer()
|
||||
|
||||
def get_progressive_report_synthesizer(model_name: Optional[str] = None) -> ProgressiveReportSynthesizer:
|
||||
"""
|
||||
Get the global progressive report synthesizer instance or create a new one with a specific model.
|
||||
|
||||
Args:
|
||||
model_name: Optional model name to use instead of the default
|
||||
|
||||
Returns:
|
||||
ProgressiveReportSynthesizer instance
|
||||
"""
|
||||
global progressive_report_synthesizer
|
||||
|
||||
if model_name and model_name != progressive_report_synthesizer.model_name:
|
||||
progressive_report_synthesizer = ProgressiveReportSynthesizer(model_name)
|
||||
|
||||
return progressive_report_synthesizer
|
||||
|
||||
async def test_progressive_report_synthesizer():
|
||||
"""Test the progressive report synthesizer with sample document chunks."""
|
||||
# Sample document chunks
|
||||
chunks = [
|
||||
{
|
||||
"document_id": "1",
|
||||
"title": "Introduction to Python",
|
||||
"url": "https://docs.python.org/3/tutorial/index.html",
|
||||
"content": "Python is an easy to learn, powerful programming language. It has efficient high-level data structures and a simple but effective approach to object-oriented programming. Python's elegant syntax and dynamic typing, together with its interpreted nature, make it an ideal language for scripting and rapid application development in many areas on most platforms.",
|
||||
"priority_score": 0.9
|
||||
},
|
||||
{
|
||||
"document_id": "2",
|
||||
"title": "Python Features",
|
||||
"url": "https://www.python.org/about/",
|
||||
"content": "Python is a programming language that lets you work quickly and integrate systems more effectively. Python is an interpreted, object-oriented, high-level programming language with dynamic semantics. Its high-level built in data structures, combined with dynamic typing and dynamic binding, make it very attractive for Rapid Application Development, as well as for use as a scripting or glue language to connect existing components together.",
|
||||
"priority_score": 0.8
|
||||
},
|
||||
{
|
||||
"document_id": "3",
|
||||
"title": "Python Applications",
|
||||
"url": "https://www.python.org/about/apps/",
|
||||
"content": "Python is used in many application domains. Here's a sampling: Web and Internet Development, Scientific and Numeric Computing, Education, Desktop GUIs, Software Development, and Business Applications. Python is also used in Data Science, Machine Learning, and Artificial Intelligence applications.",
|
||||
"priority_score": 0.7
|
||||
},
|
||||
{
|
||||
"document_id": "4",
|
||||
"title": "Python History",
|
||||
"url": "https://en.wikipedia.org/wiki/Python_(programming_language)",
|
||||
"content": "Python was conceived in the late 1980s by Guido van Rossum at Centrum Wiskunde & Informatica (CWI) in the Netherlands as a successor to the ABC language, capable of exception handling and interfacing with the Amoeba operating system. Its implementation began in December 1989.",
|
||||
"priority_score": 0.6
|
||||
}
|
||||
]
|
||||
|
||||
# Initialize the progressive report synthesizer
|
||||
synthesizer = get_progressive_report_synthesizer()
|
||||
|
||||
# Test query
|
||||
query = "What are the key features and applications of Python programming language?"
|
||||
|
||||
# Define a progress callback
|
||||
def progress_callback(progress, total, current_report):
|
||||
print(f"Progress: {progress:.2%} ({total} chunks)")
|
||||
|
||||
# Set progress callback
|
||||
synthesizer.set_progress_callback(progress_callback)
|
||||
|
||||
# Generate report progressively
|
||||
report = await synthesizer.synthesize_report_progressively(chunks, query, query_type="exploratory", detail_level="comprehensive")
|
||||
|
||||
# Print report
|
||||
print("\nFinal Generated Report:")
|
||||
print(report)
|
||||
|
||||
# Print report state
|
||||
print("\nReport State:")
|
||||
print(f"Versions: {synthesizer.report_state.version}")
|
||||
print(f"Processed Chunks: {len(synthesizer.report_state.processed_chunks)}")
|
||||
print(f"Improvement Scores: {synthesizer.report_state.improvement_scores}")
|
||||
print(f"Termination Reason: {synthesizer.report_state.termination_reason}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(test_progressive_report_synthesizer())
|
|
@ -1,263 +0,0 @@
|
|||
"""
|
||||
Report detail levels module for the intelligent research system.
|
||||
|
||||
This module provides functionality to define and configure different levels of detail
|
||||
for generated reports, allowing users to customize the depth and breadth of information
|
||||
included in reports.
|
||||
"""
|
||||
|
||||
import enum
|
||||
from typing import Dict, Any, Optional, List, Tuple
|
||||
|
||||
|
||||
class DetailLevel(enum.Enum):
|
||||
"""Enum for different report detail levels."""
|
||||
BRIEF = "brief"
|
||||
STANDARD = "standard"
|
||||
DETAILED = "detailed"
|
||||
COMPREHENSIVE = "comprehensive"
|
||||
|
||||
|
||||
class ReportDetailLevelManager:
|
||||
"""
|
||||
Manager for report detail levels.
|
||||
|
||||
This class provides methods to get configuration parameters for different
|
||||
report detail levels, allowing users to customize the depth and breadth of
|
||||
information included in reports.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
"""Initialize the report detail level manager."""
|
||||
# Define default configurations for different detail levels
|
||||
self.detail_level_configs = {
|
||||
DetailLevel.BRIEF: {
|
||||
"num_results": 3,
|
||||
"token_budget": 50000,
|
||||
"chunk_size": 800,
|
||||
"overlap_size": 50,
|
||||
"model": "llama-3.1-8b-instant",
|
||||
"description": "A concise summary of key findings and conclusions."
|
||||
},
|
||||
DetailLevel.STANDARD: {
|
||||
"num_results": 7,
|
||||
"token_budget": 100000,
|
||||
"chunk_size": 1000,
|
||||
"overlap_size": 100,
|
||||
"model": "llama-3.1-8b-instant",
|
||||
"description": "A balanced report with key findings, analysis, and conclusions."
|
||||
},
|
||||
DetailLevel.DETAILED: {
|
||||
"num_results": 12,
|
||||
"token_budget": 150000,
|
||||
"chunk_size": 1200,
|
||||
"overlap_size": 150,
|
||||
"model": "llama-3.3-70b-versatile",
|
||||
"description": "A comprehensive report with in-depth analysis, methodology, and implications."
|
||||
},
|
||||
DetailLevel.COMPREHENSIVE: {
|
||||
"num_results": 12,
|
||||
"token_budget": 200000,
|
||||
"chunk_size": 1200,
|
||||
"overlap_size": 120,
|
||||
"model": "gemini-2.0-flash-lite",
|
||||
"description": "An exhaustive report with all available information, extensive analysis, and detailed references."
|
||||
}
|
||||
}
|
||||
|
||||
# Define template modifiers for different detail levels
|
||||
self.template_modifiers = {
|
||||
DetailLevel.BRIEF: {
|
||||
"factual": "Create a brief factual report that directly answers the query. Focus on accuracy and clarity. Include:\n"
|
||||
"1. A clear, direct answer to the query\n"
|
||||
"2. Key supporting evidence and facts\n"
|
||||
"3. Citations for information (use numbered citations in square brackets [1], [2], etc.)\n"
|
||||
"4. A concise references section\n\n"
|
||||
"Keep the report concise and to the point, focusing only on the most essential information.",
|
||||
|
||||
"comparative": "Create a brief comparative report that analyzes different perspectives on the query. Include:\n"
|
||||
"1. A concise overview of the topic\n"
|
||||
"2. Key similarities and differences between perspectives\n"
|
||||
"3. Citations for information (use numbered citations in square brackets [1], [2], etc.)\n"
|
||||
"4. A concise references section\n\n"
|
||||
"Keep the report concise and to the point, focusing only on the most essential comparisons.",
|
||||
|
||||
"exploratory": "Create a brief exploratory report that investigates the query. Include:\n"
|
||||
"1. A concise introduction to the topic\n"
|
||||
"2. Key findings and insights\n"
|
||||
"3. Citations for information (use numbered citations in square brackets [1], [2], etc.)\n"
|
||||
"4. A concise references section\n\n"
|
||||
"Keep the report concise and to the point, focusing only on the most essential information."
|
||||
},
|
||||
|
||||
DetailLevel.STANDARD: {
|
||||
"factual": "Create a standard factual report that directly answers the query. Focus on accuracy and clarity. Include:\n"
|
||||
"1. A clear, direct answer to the query\n"
|
||||
"2. Supporting evidence and facts from the sources\n"
|
||||
"3. Any relevant context needed to understand the answer\n"
|
||||
"4. Citations for all information (use numbered citations in square brackets [1], [2], etc.)\n"
|
||||
"5. A references section at the end listing all sources",
|
||||
|
||||
"comparative": "Create a standard comparative report that analyzes different perspectives on the query. Include:\n"
|
||||
"1. An overview of the topic and why it's significant\n"
|
||||
"2. A balanced presentation of different viewpoints or approaches\n"
|
||||
"3. Analysis of similarities and differences\n"
|
||||
"4. Evidence supporting each perspective\n"
|
||||
"5. A synthesis of the information that highlights key insights\n"
|
||||
"6. Citations for all information (use numbered citations in square brackets [1], [2], etc.)\n"
|
||||
"7. A references section at the end listing all sources",
|
||||
|
||||
"exploratory": "Create a standard exploratory report that investigates the query in depth. Include:\n"
|
||||
"1. An introduction that frames the topic and its significance\n"
|
||||
"2. Key concepts and definitions\n"
|
||||
"3. Main findings and insights from the sources\n"
|
||||
"4. Analysis of the information that highlights patterns and connections\n"
|
||||
"5. Implications or applications of the findings\n"
|
||||
"6. Citations for all information (use numbered citations in square brackets [1], [2], etc.)\n"
|
||||
"7. A references section at the end listing all sources"
|
||||
},
|
||||
|
||||
DetailLevel.DETAILED: {
|
||||
"factual": "Create a detailed factual report that thoroughly answers the query with deeper analysis. Focus on accuracy, clarity, and analytical depth. Include:\n"
|
||||
"1. A comprehensive answer to the query with nuanced details and multi-layered analysis\n"
|
||||
"2. Extensive supporting evidence including statistics, expert opinions, and case examples\n"
|
||||
"3. Deeper exploration of causal relationships and underlying mechanisms\n"
|
||||
"4. Analysis of how different factors interact and influence each other\n"
|
||||
"5. Examination of historical context and how it shapes current understanding\n"
|
||||
"6. Consideration of diverse perspectives and interpretations of the evidence\n"
|
||||
"7. Citations for all information (use numbered citations in square brackets [1], [2], etc.)\n"
|
||||
"8. A detailed references section at the end listing all sources\n\n"
|
||||
"Prioritize depth of analysis over breadth of topics. Focus on providing rich, insightful explanations rather than just adding more sections.",
|
||||
|
||||
"comparative": "Create a detailed comparative report with deeper analytical insights. Include:\n"
|
||||
"1. A comprehensive overview of the topic with analysis of underlying principles\n"
|
||||
"2. In-depth presentation of different viewpoints with exploration of their theoretical foundations\n"
|
||||
"3. Detailed analysis of why differences exist and their historical development\n"
|
||||
"4. Examination of how different perspectives might apply in various contexts or scenarios\n"
|
||||
"5. Analysis of trade-offs, with quantitative measures where possible\n"
|
||||
"6. Exploration of hybrid approaches or potential reconciliation of different viewpoints\n"
|
||||
"7. Citations for all information (use numbered citations in square brackets [1], [2], etc.)\n"
|
||||
"8. A detailed references section at the end listing all sources\n\n"
|
||||
"Prioritize depth of analysis over breadth of topics. Focus on providing rich, insightful explanations rather than just adding more sections.",
|
||||
|
||||
"exploratory": "Create a detailed exploratory report with deeper analytical insights. Include:\n"
|
||||
"1. A comprehensive introduction with analysis of why this topic matters and to whom\n"
|
||||
"2. Detailed explanation of key concepts with exploration of their evolution and interconnections\n"
|
||||
"3. In-depth analysis of cause-and-effect relationships and systemic factors\n"
|
||||
"4. Examination of apparent contradictions and how they might be resolved\n"
|
||||
"5. Analysis of how different stakeholders might be affected differently\n"
|
||||
"6. Exploration of second and third-order effects or implications\n"
|
||||
"7. Citations for all information (use numbered citations in square brackets [1], [2], etc.)\n"
|
||||
"8. A detailed references section at the end listing all sources\n\n"
|
||||
"Prioritize depth of analysis over breadth of topics. Focus on providing rich, insightful explanations rather than just adding more sections."
|
||||
},
|
||||
|
||||
DetailLevel.COMPREHENSIVE: {
|
||||
"factual": "Create an exhaustive factual report with the deepest possible analysis. Include:\n"
|
||||
"1. A multi-dimensional answer that explores all facets of the query with layered analysis\n"
|
||||
"2. Comprehensive evidence including statistical data, expert consensus, minority viewpoints, and case studies\n"
|
||||
"3. Detailed exploration of complex causal networks and feedback loops\n"
|
||||
"4. Analysis of how the topic intersects with related domains and disciplines\n"
|
||||
"5. Examination of historical evolution, current state, and future trajectories\n"
|
||||
"6. Critical evaluation of the quality and reliability of different evidence sources\n"
|
||||
"7. Synthesis of insights across multiple analytical frameworks\n"
|
||||
"8. Visual elements such as tables or bullet points to organize complex information\n"
|
||||
"9. Citations for all information (use numbered citations in square brackets [1], [2], etc.)\n"
|
||||
"10. A comprehensive references section at the end listing all sources\n\n"
|
||||
"Maximize analytical depth while maintaining clarity. Provide the richest possible understanding rather than simply covering more ground.",
|
||||
|
||||
"comparative": "Create an exhaustive comparative report with the deepest possible analysis. Include:\n"
|
||||
"1. A multi-layered overview that places the comparison in broader theoretical and practical contexts\n"
|
||||
"2. Comprehensive presentation of all viewpoints with analysis of their epistemological foundations\n"
|
||||
"3. Detailed examination of how different perspectives have evolved in response to each other\n"
|
||||
"4. Analysis of how cultural, historical, and disciplinary contexts shape different viewpoints\n"
|
||||
"5. Quantitative and qualitative comparison using multiple frameworks and metrics\n"
|
||||
"6. Exploration of edge cases where different perspectives might converge or diverge\n"
|
||||
"7. Synthesis that identifies meta-patterns across different analytical dimensions\n"
|
||||
"8. Visual elements such as tables or matrices to organize complex comparisons\n"
|
||||
"9. Citations for all information (use numbered citations in square brackets [1], [2], etc.)\n"
|
||||
"10. A comprehensive references section at the end listing all sources\n\n"
|
||||
"Maximize analytical depth while maintaining clarity. Provide the richest possible understanding rather than simply covering more ground.",
|
||||
|
||||
"exploratory": "Create an exhaustive exploratory report with the deepest possible analysis. Include:\n"
|
||||
"1. A multi-dimensional introduction that situates the topic within broader intellectual landscapes\n"
|
||||
"2. Comprehensive explanation of all concepts with analysis of their theoretical underpinnings\n"
|
||||
"3. In-depth exploration of complex interaction effects and emergent properties\n"
|
||||
"4. Detailed examination of how different methodological approaches yield different insights\n"
|
||||
"5. Analysis of how the topic might be understood differently across disciplines or paradigms\n"
|
||||
"6. Exploration of counterfactuals, thought experiments, and alternative scenarios\n"
|
||||
"7. Synthesis that integrates insights across multiple levels of analysis\n"
|
||||
"8. Visual elements such as tables or concept maps to organize complex information\n"
|
||||
"9. Citations for all information (use numbered citations in square brackets [1], [2], etc.)\n"
|
||||
"10. A comprehensive references section at the end listing all sources\n\n"
|
||||
"Maximize analytical depth while maintaining clarity. Provide the richest possible understanding rather than simply covering more ground."
|
||||
}
|
||||
}
|
||||
|
||||
def get_detail_level_config(self, detail_level: str) -> Dict[str, Any]:
|
||||
"""
|
||||
Get configuration parameters for a specific detail level.
|
||||
|
||||
Args:
|
||||
detail_level: Detail level as a string (brief, standard, detailed, comprehensive)
|
||||
|
||||
Returns:
|
||||
Dictionary of configuration parameters for the specified detail level
|
||||
|
||||
Raises:
|
||||
ValueError: If the detail level is not valid
|
||||
"""
|
||||
try:
|
||||
level = DetailLevel(detail_level.lower())
|
||||
return self.detail_level_configs[level]
|
||||
except (ValueError, KeyError):
|
||||
valid_levels = [level.value for level in DetailLevel]
|
||||
raise ValueError(f"Invalid detail level: {detail_level}. Valid levels are: {', '.join(valid_levels)}")
|
||||
|
||||
def get_template_modifier(self, detail_level: str, query_type: str) -> str:
|
||||
"""
|
||||
Get template modifier for a specific detail level and query type.
|
||||
|
||||
Args:
|
||||
detail_level: Detail level as a string (brief, standard, detailed, comprehensive)
|
||||
query_type: Query type as a string (factual, exploratory, comparative)
|
||||
|
||||
Returns:
|
||||
Template modifier as a string
|
||||
|
||||
Raises:
|
||||
ValueError: If the detail level or query type is not valid
|
||||
"""
|
||||
try:
|
||||
level = DetailLevel(detail_level.lower())
|
||||
if query_type not in ["factual", "exploratory", "comparative"]:
|
||||
query_type = "exploratory" # Default to exploratory if query type is not valid
|
||||
|
||||
return self.template_modifiers[level][query_type]
|
||||
except (ValueError, KeyError):
|
||||
valid_levels = [level.value for level in DetailLevel]
|
||||
raise ValueError(f"Invalid detail level: {detail_level}. Valid levels are: {', '.join(valid_levels)}")
|
||||
|
||||
def get_available_detail_levels(self) -> List[Tuple[str, str]]:
|
||||
"""
|
||||
Get a list of available detail levels with descriptions.
|
||||
|
||||
Returns:
|
||||
List of tuples containing detail level and description
|
||||
"""
|
||||
return [(level.value, config["description"])
|
||||
for level, config in self.detail_level_configs.items()]
|
||||
|
||||
|
||||
# Create a singleton instance for global use
|
||||
report_detail_level_manager = ReportDetailLevelManager()
|
||||
|
||||
|
||||
def get_report_detail_level_manager() -> ReportDetailLevelManager:
|
||||
"""
|
||||
Get the global report detail level manager instance.
|
||||
|
||||
Returns:
|
||||
ReportDetailLevelManager instance
|
||||
"""
|
||||
return report_detail_level_manager
|
|
@ -1,351 +0,0 @@
|
|||
"""
|
||||
Report generator module for the intelligent research system.
|
||||
|
||||
This module provides functionality to generate reports from search results
|
||||
by scraping documents, storing them in a database, and synthesizing them
|
||||
into a comprehensive report.
|
||||
"""
|
||||
|
||||
import os
|
||||
import asyncio
|
||||
import logging
|
||||
from typing import Dict, List, Any, Optional, Tuple, Union
|
||||
|
||||
from report.database.db_manager import get_db_manager, initialize_database
|
||||
from report.document_scraper import get_document_scraper
|
||||
from report.document_processor import get_document_processor
|
||||
from report.report_synthesis import get_report_synthesizer
|
||||
from report.progressive_report_synthesis import get_progressive_report_synthesizer
|
||||
from report.report_detail_levels import get_report_detail_level_manager, DetailLevel
|
||||
|
||||
# Configure logging
|
||||
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class ReportGenerator:
|
||||
"""
|
||||
Report generator for the intelligent research system.
|
||||
|
||||
This class provides methods to generate reports from search results
|
||||
by scraping documents, storing them in a database, and synthesizing them
|
||||
into a comprehensive report.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
"""Initialize the report generator."""
|
||||
self.db_manager = get_db_manager()
|
||||
self.document_scraper = get_document_scraper()
|
||||
self.document_processor = get_document_processor()
|
||||
self.report_synthesizer = get_report_synthesizer()
|
||||
self.progressive_report_synthesizer = get_progressive_report_synthesizer()
|
||||
self.detail_level_manager = get_report_detail_level_manager()
|
||||
self.detail_level = "standard" # Default detail level
|
||||
self.model_name = None # Will use default model based on detail level
|
||||
|
||||
async def initialize(self):
|
||||
"""Initialize the report generator by setting up the database."""
|
||||
await initialize_database()
|
||||
logger.info("Report generator initialized")
|
||||
|
||||
def set_detail_level(self, detail_level: str) -> None:
|
||||
"""
|
||||
Set the detail level for report generation.
|
||||
|
||||
Args:
|
||||
detail_level: Detail level (brief, standard, detailed, comprehensive)
|
||||
"""
|
||||
try:
|
||||
# Validate detail level
|
||||
config = self.detail_level_manager.get_detail_level_config(detail_level)
|
||||
self.detail_level = detail_level
|
||||
|
||||
# Update model if needed
|
||||
model = config.get("model")
|
||||
if model and model != self.model_name:
|
||||
self.model_name = model
|
||||
self.report_synthesizer = get_report_synthesizer(model)
|
||||
self.progressive_report_synthesizer = get_progressive_report_synthesizer(model)
|
||||
|
||||
logger.info(f"Detail level set to {detail_level} with model {model}")
|
||||
except ValueError as e:
|
||||
logger.error(f"Error setting detail level: {e}")
|
||||
raise
|
||||
|
||||
def get_detail_level_config(self) -> Dict[str, Any]:
|
||||
"""
|
||||
Get the current detail level configuration.
|
||||
|
||||
Returns:
|
||||
Dictionary of configuration parameters for the current detail level
|
||||
"""
|
||||
return self.detail_level_manager.get_detail_level_config(self.detail_level)
|
||||
|
||||
def get_available_detail_levels(self) -> List[Tuple[str, str]]:
|
||||
"""
|
||||
Get a list of available detail levels with descriptions.
|
||||
|
||||
Returns:
|
||||
List of tuples containing detail level and description
|
||||
"""
|
||||
return self.detail_level_manager.get_available_detail_levels()
|
||||
|
||||
async def process_search_results(self, search_results: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
Process search results by scraping the URLs and storing them in the database.
|
||||
|
||||
Args:
|
||||
search_results: List of search results, each containing at least a 'url' field
|
||||
|
||||
Returns:
|
||||
List of processed documents
|
||||
"""
|
||||
# Extract URLs from search results
|
||||
urls = [result.get('url') for result in search_results if result.get('url')]
|
||||
|
||||
# Extract relevance scores if available
|
||||
relevance_scores = {}
|
||||
for result in search_results:
|
||||
if result.get('url') and result.get('score') is not None:
|
||||
relevance_scores[result.get('url')] = result.get('score')
|
||||
|
||||
# Scrape URLs and store in database
|
||||
documents = await self.document_scraper.scrape_urls(urls)
|
||||
|
||||
# Log results
|
||||
logger.info(f"Processed {len(documents)} documents out of {len(urls)} URLs")
|
||||
|
||||
return documents, relevance_scores
|
||||
|
||||
async def get_document_by_url(self, url: str) -> Optional[Dict[str, Any]]:
|
||||
"""
|
||||
Get a document by its URL.
|
||||
|
||||
Args:
|
||||
url: URL of the document
|
||||
|
||||
Returns:
|
||||
Document as a dictionary, or None if not found
|
||||
"""
|
||||
return await self.db_manager.get_document_by_url(url)
|
||||
|
||||
async def search_documents(self, query: str, limit: int = 10) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
Search for documents in the database.
|
||||
|
||||
Args:
|
||||
query: Search query
|
||||
limit: Maximum number of results to return
|
||||
|
||||
Returns:
|
||||
List of matching documents
|
||||
"""
|
||||
return await self.db_manager.search_documents(query, limit)
|
||||
|
||||
async def prepare_documents_for_report(self,
|
||||
search_results: List[Dict[str, Any]],
|
||||
token_budget: Optional[int] = None,
|
||||
chunk_size: Optional[int] = None,
|
||||
overlap_size: Optional[int] = None) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
Prepare documents for report generation by processing search results,
|
||||
prioritizing documents, and chunking them to fit within token budget.
|
||||
|
||||
Args:
|
||||
search_results: List of search results
|
||||
token_budget: Maximum number of tokens to use
|
||||
chunk_size: Maximum number of tokens per chunk
|
||||
overlap_size: Number of tokens to overlap between chunks
|
||||
|
||||
Returns:
|
||||
List of selected document chunks
|
||||
"""
|
||||
# Get configuration from detail level if not specified
|
||||
config = self.get_detail_level_config()
|
||||
|
||||
if token_budget is None:
|
||||
token_budget = config.get("token_budget")
|
||||
|
||||
if chunk_size is None:
|
||||
chunk_size = config.get("chunk_size", 1000)
|
||||
|
||||
if overlap_size is None:
|
||||
overlap_size = config.get("overlap_size", 100)
|
||||
|
||||
logger.info(f"Preparing documents with token_budget={token_budget}, chunk_size={chunk_size}, overlap_size={overlap_size}")
|
||||
|
||||
# Process search results to get documents and relevance scores
|
||||
documents, relevance_scores = await self.process_search_results(search_results)
|
||||
|
||||
# Prioritize and chunk documents
|
||||
selected_chunks = self.document_processor.process_documents_for_report(
|
||||
documents,
|
||||
relevance_scores,
|
||||
token_budget,
|
||||
chunk_size,
|
||||
overlap_size
|
||||
)
|
||||
|
||||
return selected_chunks
|
||||
|
||||
def set_progress_callback(self, callback):
|
||||
"""
|
||||
Set the progress callback for both synthesizers.
|
||||
|
||||
Args:
|
||||
callback: Function that takes (current_progress, total, current_report) as arguments
|
||||
"""
|
||||
# Set the callback for both synthesizers
|
||||
if hasattr(self.report_synthesizer, 'set_progress_callback'):
|
||||
self.report_synthesizer.set_progress_callback(callback)
|
||||
|
||||
if hasattr(self.progressive_report_synthesizer, 'set_progress_callback'):
|
||||
self.progressive_report_synthesizer.set_progress_callback(callback)
|
||||
|
||||
async def generate_report(self,
|
||||
search_results: List[Dict[str, Any]],
|
||||
query: str,
|
||||
token_budget: Optional[int] = None,
|
||||
chunk_size: Optional[int] = None,
|
||||
overlap_size: Optional[int] = None,
|
||||
detail_level: Optional[str] = None,
|
||||
query_type: Optional[str] = None) -> str:
|
||||
"""
|
||||
Generate a report from search results.
|
||||
|
||||
Args:
|
||||
search_results: List of search results
|
||||
query: Original search query
|
||||
token_budget: Maximum number of tokens to use
|
||||
chunk_size: Maximum number of tokens per chunk
|
||||
overlap_size: Number of tokens to overlap between chunks
|
||||
detail_level: Level of detail for the report (brief, standard, detailed, comprehensive)
|
||||
|
||||
Returns:
|
||||
Generated report as a string
|
||||
"""
|
||||
# Set detail level if specified
|
||||
if detail_level:
|
||||
self.set_detail_level(detail_level)
|
||||
|
||||
# Prepare documents for report
|
||||
selected_chunks = await self.prepare_documents_for_report(
|
||||
search_results,
|
||||
token_budget,
|
||||
chunk_size,
|
||||
overlap_size
|
||||
)
|
||||
|
||||
# Log query type information
|
||||
if query_type:
|
||||
logger.info(f"Using specified query type: {query_type}")
|
||||
else:
|
||||
logger.info("Using automatic query type detection")
|
||||
|
||||
# Choose the appropriate synthesizer based on detail level
|
||||
if self.detail_level.lower() == "comprehensive":
|
||||
# Use progressive report synthesizer for comprehensive detail level
|
||||
logger.info(f"Using progressive report synthesizer for {self.detail_level} detail level")
|
||||
report = await self.progressive_report_synthesizer.synthesize_report(
|
||||
selected_chunks,
|
||||
query,
|
||||
query_type=query_type,
|
||||
detail_level=self.detail_level
|
||||
)
|
||||
else:
|
||||
# Use standard report synthesizer for other detail levels
|
||||
logger.info(f"Using standard report synthesizer for {self.detail_level} detail level")
|
||||
report = await self.report_synthesizer.synthesize_report(
|
||||
selected_chunks,
|
||||
query,
|
||||
query_type=query_type,
|
||||
detail_level=self.detail_level
|
||||
)
|
||||
|
||||
return report
|
||||
|
||||
|
||||
# Create a singleton instance for global use
|
||||
report_generator = ReportGenerator()
|
||||
|
||||
async def initialize_report_generator():
|
||||
"""Initialize the report generator."""
|
||||
await report_generator.initialize()
|
||||
|
||||
def get_report_generator() -> ReportGenerator:
|
||||
"""
|
||||
Get the global report generator instance.
|
||||
|
||||
Returns:
|
||||
ReportGenerator instance
|
||||
"""
|
||||
return report_generator
|
||||
|
||||
async def test_report_generator(use_mock: bool = False):
|
||||
"""
|
||||
Test the report generator with sample search results.
|
||||
|
||||
Args:
|
||||
use_mock: If True, use mock data instead of making actual API calls
|
||||
"""
|
||||
# Initialize the report generator
|
||||
await initialize_report_generator()
|
||||
|
||||
# Get document scraper with mock option
|
||||
document_scraper = get_document_scraper(use_mock=use_mock)
|
||||
|
||||
# Sample search results with real, accessible URLs
|
||||
search_results = [
|
||||
{
|
||||
'title': 'Python Documentation',
|
||||
'url': 'https://docs.python.org/3/tutorial/index.html',
|
||||
'snippet': 'The Python Tutorial.',
|
||||
'score': 0.95
|
||||
},
|
||||
{
|
||||
'title': 'Python Requests Library',
|
||||
'url': 'https://requests.readthedocs.io/en/latest/',
|
||||
'snippet': 'Requests is an elegant and simple HTTP library for Python.',
|
||||
'score': 0.85
|
||||
},
|
||||
{
|
||||
'title': 'Real Python',
|
||||
'url': 'https://realpython.com/',
|
||||
'snippet': 'Python tutorials for developers of all skill levels.',
|
||||
'score': 0.75
|
||||
}
|
||||
]
|
||||
|
||||
try:
|
||||
# Process search results
|
||||
documents, relevance_scores = await report_generator.process_search_results(search_results)
|
||||
|
||||
# Print documents
|
||||
print(f"Processed {len(documents)} documents")
|
||||
for doc in documents:
|
||||
print(f"Document: {doc.get('title')} ({doc.get('url')})")
|
||||
print(f"Token count: {doc.get('token_count')}")
|
||||
content_preview = doc.get('content', '')[:100] + '...' if doc.get('content') else 'No content'
|
||||
print(f"Content snippet: {content_preview}")
|
||||
print()
|
||||
|
||||
# Generate report
|
||||
report = await report_generator.generate_report(search_results, "Python programming")
|
||||
|
||||
# Print report
|
||||
print("Generated Report:")
|
||||
print(report)
|
||||
except Exception as e:
|
||||
logger.error(f"Error during report generation test: {str(e)}")
|
||||
import traceback
|
||||
traceback.print_exc()
|
||||
|
||||
# Run test if this module is executed directly
|
||||
if __name__ == "__main__":
|
||||
import argparse
|
||||
|
||||
parser = argparse.ArgumentParser(description='Test the report generator')
|
||||
parser.add_argument('--mock', action='store_true', help='Use mock data instead of making actual API calls')
|
||||
args = parser.parse_args()
|
||||
|
||||
print(f"Running test with {'mock data' if args.mock else 'real data'}")
|
||||
asyncio.run(test_report_generator(use_mock=args.mock))
|
|
@ -1,719 +0,0 @@
|
|||
"""
|
||||
Report synthesis module for the intelligent research system.
|
||||
|
||||
This module provides functionality to synthesize reports from document chunks
|
||||
using LLMs with a map-reduce approach.
|
||||
"""
|
||||
|
||||
import os
|
||||
import json
|
||||
import asyncio
|
||||
import logging
|
||||
from typing import Dict, List, Any, Optional, Tuple, Union
|
||||
|
||||
import litellm
|
||||
from litellm import completion
|
||||
|
||||
from config.config import get_config
|
||||
from report.report_detail_levels import get_report_detail_level_manager, DetailLevel
|
||||
from report.report_templates import QueryType, DetailLevel as TemplateDetailLevel, ReportTemplateManager, ReportTemplate
|
||||
|
||||
# Configure logging
|
||||
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Note: ReportTemplateManager and ReportTemplate are now imported from report_templates.py
|
||||
|
||||
class ReportSynthesizer:
|
||||
"""
|
||||
Report synthesizer for the intelligent research system.
|
||||
|
||||
This class provides methods to synthesize reports from document chunks
|
||||
using LLMs with a map-reduce approach.
|
||||
"""
|
||||
|
||||
def __init__(self, model_name: Optional[str] = None):
|
||||
"""
|
||||
Initialize the report synthesizer.
|
||||
|
||||
Args:
|
||||
model_name: Name of the LLM model to use. If None, uses the default model
|
||||
from configuration.
|
||||
"""
|
||||
self.config = get_config()
|
||||
|
||||
# Use specified model or default from config for report synthesis
|
||||
self.model_name = model_name or self.config.config_data.get('report_synthesis', {}).get('model', 'llama-3.3-70b-versatile')
|
||||
|
||||
# Get model-specific configuration
|
||||
self.model_config = self.config.get_model_config(self.model_name)
|
||||
|
||||
# Set up LiteLLM with the appropriate provider
|
||||
self._setup_provider()
|
||||
|
||||
# Initialize template manager
|
||||
self.template_manager = ReportTemplateManager()
|
||||
self.template_manager.initialize_default_templates()
|
||||
|
||||
# Flag to process <thinking> tags in model output
|
||||
self.process_thinking_tags = False
|
||||
|
||||
# Progress tracking
|
||||
self.progress_callback = None
|
||||
self.total_chunks = 0
|
||||
self.processed_chunk_count = 0
|
||||
|
||||
def set_progress_callback(self, callback):
|
||||
"""
|
||||
Set a callback function to report progress.
|
||||
|
||||
Args:
|
||||
callback: Function that takes (current_progress, total, current_report) as arguments
|
||||
"""
|
||||
self.progress_callback = callback
|
||||
|
||||
def _report_progress(self, current_report=None):
|
||||
"""Report progress through the callback if set."""
|
||||
if self.progress_callback and self.total_chunks > 0:
|
||||
progress = min(self.processed_chunk_count / self.total_chunks, 1.0)
|
||||
self.progress_callback(progress, self.total_chunks, current_report)
|
||||
|
||||
def _setup_provider(self) -> None:
|
||||
"""Set up the LLM provider based on the model configuration."""
|
||||
provider = self.model_config.get('provider', 'groq')
|
||||
|
||||
try:
|
||||
# Get API key for the provider
|
||||
api_key = self.config.get_api_key(provider)
|
||||
|
||||
# Set environment variable for the provider
|
||||
if provider.lower() == 'google' or provider.lower() == 'gemini':
|
||||
os.environ["GEMINI_API_KEY"] = api_key
|
||||
elif provider.lower() == 'vertex_ai':
|
||||
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = api_key
|
||||
else:
|
||||
os.environ[f"{provider.upper()}_API_KEY"] = api_key
|
||||
|
||||
logger.info(f"Report synthesizer initialized with model: {self.model_name} (provider: {provider})")
|
||||
except ValueError as e:
|
||||
logger.error(f"Error setting up LLM provider: {e}")
|
||||
|
||||
def _get_completion_params(self) -> Dict[str, Any]:
|
||||
"""
|
||||
Get parameters for LLM completion based on model configuration.
|
||||
|
||||
Returns:
|
||||
Dictionary of parameters for LiteLLM completion
|
||||
"""
|
||||
params = {
|
||||
'temperature': self.model_config.get('temperature', 0.3), # Lower temperature for factual reporting
|
||||
'max_tokens': self.model_config.get('max_tokens', 4000), # Longer output for comprehensive reports
|
||||
'top_p': self.model_config.get('top_p', 0.9)
|
||||
}
|
||||
|
||||
# Handle different provider configurations
|
||||
provider = self.model_config.get('provider', 'groq')
|
||||
|
||||
if provider == 'groq':
|
||||
# For Groq provider
|
||||
params['model'] = f"groq/{self.model_name}"
|
||||
elif provider == 'openrouter':
|
||||
# For OpenRouter provider
|
||||
params['model'] = self.model_config.get('model_name', self.model_name)
|
||||
params['api_base'] = self.model_config.get('endpoint')
|
||||
|
||||
# Set HTTP headers for OpenRouter if needed
|
||||
params['headers'] = {
|
||||
'HTTP-Referer': 'https://sim-search.app', # Replace with your actual app URL
|
||||
'X-Title': 'Intelligent Research System' # Replace with your actual app name
|
||||
}
|
||||
elif provider == 'google' or provider == 'gemini':
|
||||
# Special handling for Google Gemini models
|
||||
# Format: gemini/model_name (e.g., gemini/gemini-2.0-flash)
|
||||
params['model'] = f"gemini/{self.model_config.get('model_name', self.model_name)}"
|
||||
|
||||
# Add additional parameters for Gemini
|
||||
params['custom_llm_provider'] = 'gemini'
|
||||
elif provider == 'vertex_ai':
|
||||
# Special handling for Vertex AI Gemini models
|
||||
params['model'] = f"vertex_ai/{self.model_config.get('model_name', self.model_name)}"
|
||||
|
||||
# Add Vertex AI specific parameters
|
||||
params['vertex_project'] = self.model_config.get('vertex_project', 'sim-search')
|
||||
params['vertex_location'] = self.model_config.get('vertex_location', 'us-central1')
|
||||
|
||||
# Set custom provider
|
||||
params['custom_llm_provider'] = 'vertex_ai'
|
||||
else:
|
||||
# Standard provider (OpenAI, Anthropic, etc.)
|
||||
params['model'] = self.model_name
|
||||
|
||||
return params
|
||||
|
||||
async def generate_completion(self, messages: List[Dict[str, str]], stream: bool = False) -> Union[str, Any]:
|
||||
"""
|
||||
Generate a completion using the configured LLM.
|
||||
|
||||
Args:
|
||||
messages: List of message dictionaries with 'role' and 'content' keys
|
||||
stream: Whether to stream the response
|
||||
|
||||
Returns:
|
||||
If stream is False, returns the completion text as a string
|
||||
If stream is True, returns the completion response object for streaming
|
||||
"""
|
||||
# Get provider from model config
|
||||
provider = self.model_config.get('provider', 'groq').lower()
|
||||
|
||||
# Special handling for Gemini models - they use 'user' and 'model' roles
|
||||
if provider == 'gemini':
|
||||
formatted_messages = []
|
||||
for msg in messages:
|
||||
role = msg['role']
|
||||
# Map 'system' to 'user' for the first message
|
||||
if role == 'system' and not formatted_messages:
|
||||
formatted_messages.append({
|
||||
'role': 'user',
|
||||
'content': msg['content']
|
||||
})
|
||||
# Map 'assistant' to 'model'
|
||||
elif role == 'assistant':
|
||||
formatted_messages.append({
|
||||
'role': 'model',
|
||||
'content': msg['content']
|
||||
})
|
||||
# Keep 'user' as is
|
||||
else:
|
||||
formatted_messages.append(msg)
|
||||
else:
|
||||
formatted_messages = messages
|
||||
|
||||
# Get completion parameters
|
||||
params = self._get_completion_params()
|
||||
|
||||
try:
|
||||
# Generate completion
|
||||
if stream:
|
||||
response = litellm.completion(
|
||||
messages=formatted_messages,
|
||||
stream=True,
|
||||
**params
|
||||
)
|
||||
return response
|
||||
else:
|
||||
response = litellm.completion(
|
||||
messages=formatted_messages,
|
||||
**params
|
||||
)
|
||||
|
||||
# Extract content from response
|
||||
content = response.choices[0].message.content
|
||||
|
||||
# Process thinking tags if enabled
|
||||
if self.process_thinking_tags:
|
||||
content = self._process_thinking_tags(content)
|
||||
|
||||
return content
|
||||
except Exception as e:
|
||||
error_msg = f"Error generating completion: {str(e)}"
|
||||
logger.error(error_msg)
|
||||
|
||||
# Return error message in a user-friendly format
|
||||
return f"I encountered an error while processing your request: {str(e)}"
|
||||
|
||||
def _process_thinking_tags(self, content: str) -> str:
|
||||
"""
|
||||
Process and remove <thinking> tags from model output.
|
||||
|
||||
Some models like deepseek-r1-distill use <thinking> tags for their internal reasoning.
|
||||
This method removes these tags and their content to produce a clean output.
|
||||
|
||||
Args:
|
||||
content: The raw content from the model
|
||||
|
||||
Returns:
|
||||
Processed content with thinking tags removed
|
||||
"""
|
||||
import re
|
||||
|
||||
# Remove <thinking>...</thinking> blocks
|
||||
clean_content = re.sub(r'<thinking>.*?</thinking>', '', content, flags=re.DOTALL)
|
||||
|
||||
# Clean up any remaining tags
|
||||
clean_content = re.sub(r'</?thinking>', '', clean_content)
|
||||
|
||||
# Remove extra newlines that might have been created
|
||||
clean_content = re.sub(r'\n{3,}', '\n\n', clean_content)
|
||||
|
||||
return clean_content.strip()
|
||||
|
||||
async def map_document_chunks(self, chunks: List[Dict[str, Any]], query: str, detail_level: str = "standard", query_type: str = "exploratory") -> List[Dict[str, Any]]:
|
||||
"""
|
||||
Map phase: Process individual document chunks to extract key information.
|
||||
|
||||
Args:
|
||||
chunks: List of document chunks
|
||||
query: Original search query
|
||||
detail_level: Level of detail for the report (brief, standard, detailed, comprehensive)
|
||||
query_type: Type of query (factual, exploratory, comparative)
|
||||
|
||||
Returns:
|
||||
List of processed chunks with extracted information
|
||||
"""
|
||||
processed_chunks = []
|
||||
|
||||
# Get the appropriate extraction prompt based on detail level and query type
|
||||
extraction_prompt = self._get_extraction_prompt(detail_level, query_type)
|
||||
|
||||
total_chunks = len(chunks)
|
||||
logger.info(f"Starting to process {total_chunks} document chunks")
|
||||
|
||||
# Determine batch size based on the model - Gemini can handle larger batches
|
||||
if "gemini" in self.model_name.lower():
|
||||
batch_size = 8 # Larger batch size for Gemini models with 1M token windows
|
||||
else:
|
||||
batch_size = 3 # Smaller batch size for other models
|
||||
|
||||
logger.info(f"Using batch size of {batch_size} for model {self.model_name}")
|
||||
|
||||
for i in range(0, len(chunks), batch_size):
|
||||
batch = chunks[i:i+batch_size]
|
||||
logger.info(f"Processing batch {i//batch_size + 1}/{(len(chunks) + batch_size - 1)//batch_size} with {len(batch)} chunks")
|
||||
|
||||
# Process this batch
|
||||
batch_results = []
|
||||
for j, chunk in enumerate(batch):
|
||||
chunk_title = chunk.get('title', 'Untitled')
|
||||
chunk_index = i + j + 1
|
||||
logger.info(f"Processing chunk {chunk_index}/{total_chunks}: {chunk_title[:50] if chunk_title else 'Untitled'}...")
|
||||
|
||||
# Create a prompt for extracting key information from the chunk
|
||||
messages = [
|
||||
{"role": "system", "content": extraction_prompt},
|
||||
{"role": "user", "content": f"""Query: {query}
|
||||
|
||||
Document title: {chunk.get('title', 'Untitled')}
|
||||
Document URL: {chunk.get('url', 'Unknown')}
|
||||
|
||||
Document chunk content:
|
||||
{chunk.get('content', '')}
|
||||
|
||||
Extract the most relevant information from this document chunk that addresses the query."""}
|
||||
]
|
||||
|
||||
try:
|
||||
# Process the chunk with the LLM
|
||||
extracted_info = await self.generate_completion(messages)
|
||||
|
||||
# Add the extracted information to the chunk
|
||||
processed_chunk = chunk.copy()
|
||||
processed_chunk['extracted_info'] = extracted_info
|
||||
batch_results.append(processed_chunk)
|
||||
|
||||
# Update progress
|
||||
self.processed_chunk_count += 1
|
||||
self._report_progress()
|
||||
|
||||
logger.info(f"Completed chunk {chunk_index}/{total_chunks} ({chunk_index/total_chunks*100:.1f}% complete)")
|
||||
except Exception as e:
|
||||
logger.error(f"Error processing chunk {chunk_index}/{total_chunks}: {str(e)}")
|
||||
# Add a placeholder for the failed chunk to maintain document order
|
||||
processed_chunk = chunk.copy()
|
||||
processed_chunk['extracted_info'] = f"Error extracting information: {str(e)}"
|
||||
batch_results.append(processed_chunk)
|
||||
|
||||
# Update progress even for failed chunks
|
||||
self.processed_chunk_count += 1
|
||||
self._report_progress()
|
||||
|
||||
processed_chunks.extend(batch_results)
|
||||
|
||||
# Add a small delay between batches to avoid rate limiting
|
||||
if i + batch_size < len(chunks):
|
||||
logger.info("Pausing briefly between batches...")
|
||||
await asyncio.sleep(2)
|
||||
|
||||
logger.info(f"Completed processing all {total_chunks} chunks")
|
||||
return processed_chunks
|
||||
|
||||
def _get_extraction_prompt(self, detail_level: str, query_type: str = "exploratory") -> str:
|
||||
"""
|
||||
Get the appropriate extraction prompt based on detail level and query type.
|
||||
|
||||
Args:
|
||||
detail_level: Level of detail for the report (brief, standard, detailed, comprehensive)
|
||||
query_type: Type of query (factual, exploratory, comparative)
|
||||
|
||||
Returns:
|
||||
Extraction prompt as a string
|
||||
"""
|
||||
# Base prompts by detail level
|
||||
if detail_level.lower() in ["brief", "standard"]:
|
||||
base_prompt = """You are an expert research assistant. Extract the most relevant information from this document chunk that addresses the user's query.
|
||||
Focus on factual information, key concepts, and important details.
|
||||
Include any relevant statistics, definitions, or explanations that would be valuable for a report.
|
||||
Format your response as a concise summary with bullet points for key facts."""
|
||||
elif detail_level.lower() == "detailed":
|
||||
base_prompt = """You are an expert research analyst with deep domain knowledge. Extract comprehensive information from this document chunk that addresses the user's query.
|
||||
Focus on:
|
||||
- Detailed factual information and evidence
|
||||
- Underlying principles and mechanisms
|
||||
- Causal relationships and correlations
|
||||
- Contextual factors and historical development
|
||||
- Different perspectives or interpretations
|
||||
- Quantitative data and qualitative insights
|
||||
- Nuances, edge cases, and exceptions
|
||||
|
||||
Prioritize depth of analysis over breadth. Extract information that provides deeper understanding rather than just basic facts.
|
||||
Format your response with clear sections and bullet points for key insights."""
|
||||
else: # comprehensive
|
||||
base_prompt = """You are a world-class research analyst with exceptional analytical abilities. Extract the most comprehensive and nuanced information from this document chunk.
|
||||
Focus on:
|
||||
- Multi-layered analysis of all relevant facts and evidence
|
||||
- Complex causal networks and interaction effects
|
||||
- Theoretical frameworks and their applications
|
||||
- Historical evolution and future trajectories
|
||||
- Methodological considerations and limitations
|
||||
- Diverse perspectives and their epistemological foundations
|
||||
- Statistical data, case studies, and expert opinions
|
||||
- Contradictions, paradoxes, and unresolved questions
|
||||
|
||||
Extract information that provides the deepest possible understanding of the topic as it relates to the query.
|
||||
Analyze the reliability and significance of the information.
|
||||
Format your response with clearly organized sections and detailed bullet points."""
|
||||
|
||||
# Add specific instructions for comparative queries
|
||||
if query_type.lower() == "comparative":
|
||||
comparative_instructions = """
|
||||
IMPORTANT: This is a COMPARATIVE query. The user is asking to compare two or more things.
|
||||
|
||||
When extracting information, focus specifically on:
|
||||
1. Characteristics, features, or attributes of EACH item being compared
|
||||
2. Direct comparisons between the items mentioned in the query
|
||||
3. Advantages and disadvantages of each item
|
||||
4. Similarities and differences between the items
|
||||
5. Contexts where one item might be preferred over others
|
||||
|
||||
Make sure to clearly identify which information relates to which item being compared.
|
||||
Organize your extraction to facilitate easy comparison between the items.
|
||||
"""
|
||||
return base_prompt + comparative_instructions
|
||||
|
||||
return base_prompt
|
||||
|
||||
def _get_template_from_strings(self, query_type_str: str, detail_level_str: str) -> Optional[ReportTemplate]:
|
||||
"""
|
||||
Helper method to get a template using string values for query_type and detail_level.
|
||||
|
||||
Args:
|
||||
query_type_str: String value of query type (factual, exploratory, comparative)
|
||||
detail_level_str: String value of detail level (brief, standard, detailed, comprehensive)
|
||||
|
||||
Returns:
|
||||
ReportTemplate object or None if not found
|
||||
"""
|
||||
try:
|
||||
# Convert string values to enum objects
|
||||
query_type_enum = QueryType(query_type_str)
|
||||
detail_level_enum = TemplateDetailLevel(detail_level_str)
|
||||
|
||||
# Get template using enum objects
|
||||
template = self.template_manager.get_template(query_type_enum, detail_level_enum)
|
||||
if template:
|
||||
logger.info(f"Found template for {query_type_str} {detail_level_str}")
|
||||
else:
|
||||
logger.warning(f"No template found for {query_type_str} {detail_level_str}")
|
||||
return template
|
||||
except (ValueError, KeyError) as e:
|
||||
logger.error(f"Error getting template for {query_type_str} {detail_level_str}: {str(e)}")
|
||||
return None
|
||||
|
||||
async def reduce_processed_chunks(self, processed_chunks: List[Dict[str, Any]], query: str, query_type: str = "exploratory", detail_level: str = "standard") -> str:
|
||||
"""
|
||||
Reduce phase: Synthesize processed chunks into a coherent report.
|
||||
|
||||
Args:
|
||||
processed_chunks: List of processed chunks with extracted information
|
||||
query: Original search query
|
||||
query_type: Type of query (factual, exploratory, comparative)
|
||||
detail_level: Level of detail for the report (brief, standard, detailed, comprehensive)
|
||||
|
||||
Returns:
|
||||
Synthesized report as a string
|
||||
"""
|
||||
# Prepare the context with all extracted information
|
||||
context = ""
|
||||
for i, chunk in enumerate(processed_chunks):
|
||||
title = chunk.get('title', 'Untitled')
|
||||
url = chunk.get('url', 'Unknown')
|
||||
|
||||
context += f"Document {i+1}:\n"
|
||||
context += f"Title: {title}\n"
|
||||
context += f"URL: {url}\n"
|
||||
context += f"Source URL: {url}\n" # Duplicate for emphasis
|
||||
context += f"Extracted information:\n{chunk.get('extracted_info', '')}\n\n"
|
||||
|
||||
# Get template modifier based on detail level and query type using helper method
|
||||
template = self._get_template_from_strings(query_type, detail_level)
|
||||
|
||||
if not template:
|
||||
raise ValueError(f"No template found for {query_type} {detail_level}")
|
||||
|
||||
# Add specific instructions for references formatting
|
||||
reference_instructions = """
|
||||
When including references, use a consistent format:
|
||||
|
||||
[1] Title of the Article/Page. URL
|
||||
|
||||
IMPORTANT:
|
||||
1. DO NOT use generic placeholders like "Document 1" for references
|
||||
2. ALWAYS include the actual URL from the source documents
|
||||
3. Each reference MUST include both the title and the URL
|
||||
4. Make sure all references are complete and properly formatted
|
||||
5. Number the references sequentially starting from 1
|
||||
6. Include the URL for EACH reference - this is critical.
|
||||
"""
|
||||
|
||||
# Special handling for Gemini models
|
||||
if "gemini" in self.model_name.lower():
|
||||
reference_instructions += """
|
||||
IMPORTANT: Due to token limitations, ensure the References section is completed properly.
|
||||
If you feel you might run out of tokens, start the References section earlier and make it more concise.
|
||||
Never leave the References section incomplete or cut off mid-reference.
|
||||
"""
|
||||
|
||||
# Create the prompt for synthesizing the report
|
||||
messages = [
|
||||
{"role": "system", "content": f"""You are an expert research assistant tasked with creating comprehensive, well-structured reports.
|
||||
{template.template}
|
||||
|
||||
Format the report in Markdown with clear headings, subheadings, and bullet points where appropriate.
|
||||
Make the report readable, engaging, and informative while maintaining academic rigor.
|
||||
|
||||
{reference_instructions}"""},
|
||||
{"role": "user", "content": f"""Query: {query}
|
||||
|
||||
Information from sources:
|
||||
{context}
|
||||
|
||||
Synthesize this information into a report that addresses the query. Use your own words to create a coherent narrative, but ensure all information is based on the provided sources. Include citations and a references section."""}
|
||||
]
|
||||
|
||||
# Generate the report
|
||||
report = await self.generate_completion(messages)
|
||||
|
||||
# Check if the report might be cut off at the end
|
||||
if report.strip().endswith('[') or report.strip().endswith(']') or report.strip().endswith('...'):
|
||||
logger.warning("Report appears to be cut off at the end. Attempting to fix references section.")
|
||||
|
||||
# Try to fix the references section by generating it separately
|
||||
try:
|
||||
# Extract what we have so far without the incomplete references
|
||||
if "References" in report:
|
||||
report_without_refs = report.split("References")[0].strip()
|
||||
else:
|
||||
report_without_refs = report
|
||||
|
||||
# Generate just the references section
|
||||
ref_messages = [
|
||||
{"role": "system", "content": """You are an expert at formatting reference lists. Create a properly formatted References section for the following documents.
|
||||
|
||||
IMPORTANT:
|
||||
1. Use the actual title and URL from each document
|
||||
2. DO NOT use generic placeholders like "Document 1"
|
||||
3. Format each reference as: [1] Title of the Article/Page. URL
|
||||
4. Each reference MUST include both the title and the URL
|
||||
5. Make sure all references are complete and properly formatted
|
||||
6. Number the references sequentially starting from 1"""},
|
||||
{"role": "user", "content": f"""Here are the documents used in the report:
|
||||
|
||||
{context}
|
||||
|
||||
Create a complete, properly formatted References section in Markdown format.
|
||||
Remember to include the URL for EACH reference - this is critical."""}
|
||||
]
|
||||
|
||||
references = await self.generate_completion(ref_messages)
|
||||
|
||||
# Combine the report with the fixed references
|
||||
report = f"{report_without_refs}\n\n## References\n\n{references}"
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error fixing references section: {str(e)}")
|
||||
|
||||
return report
|
||||
|
||||
async def synthesize_report(self, chunks: List[Dict[str, Any]], query: str, query_type: str = "exploratory", detail_level: str = "standard") -> str:
|
||||
"""
|
||||
Synthesize a report from document chunks using the map-reduce approach.
|
||||
|
||||
Args:
|
||||
chunks: List of document chunks
|
||||
query: Original search query
|
||||
query_type: Type of query (factual, exploratory, comparative)
|
||||
detail_level: Level of detail for the report (brief, standard, detailed, comprehensive)
|
||||
|
||||
Returns:
|
||||
Synthesized report as a string
|
||||
"""
|
||||
if not chunks:
|
||||
logger.warning("No document chunks provided for report synthesis.")
|
||||
return "No information found for the given query."
|
||||
|
||||
# Reset progress tracking
|
||||
self.total_chunks = len(chunks)
|
||||
self.processed_chunk_count = 0
|
||||
|
||||
# Verify that a template exists for the given query type and detail level
|
||||
template = self._get_template_from_strings(query_type, detail_level)
|
||||
if not template:
|
||||
logger.warning(f"No template found for {query_type} {detail_level}, falling back to standard template")
|
||||
# Fall back to standard detail level if the requested one doesn't exist
|
||||
detail_level = "standard"
|
||||
|
||||
# Get detail level configuration
|
||||
detail_level_manager = get_report_detail_level_manager()
|
||||
config = detail_level_manager.get_detail_level_config(detail_level)
|
||||
token_budget = config.get("token_budget", 100000)
|
||||
|
||||
# Determine query type based on the query text
|
||||
# Always try to infer the query type, regardless of what was passed in
|
||||
if any(term in query.lower() for term in ["what is", "who is", "when did", "where is", "how does"]):
|
||||
query_type = "factual"
|
||||
elif any(term in query.lower() for term in ["compare", "difference", "versus", "vs", "pros and cons"]):
|
||||
query_type = "comparative"
|
||||
else:
|
||||
# Default to exploratory if no specific pattern is detected
|
||||
query_type = "exploratory"
|
||||
|
||||
logger.info(f"Query type determined as: {query_type}")
|
||||
|
||||
# Estimate total tokens in chunks
|
||||
total_tokens = sum(len(chunk.get('content', '').split()) * 1.3 for chunk in chunks) # Rough estimate
|
||||
logger.info(f"Estimated total tokens in {len(chunks)} chunks: {total_tokens}")
|
||||
|
||||
# If total tokens exceeds 80% of the token budget, reduce the number of chunks
|
||||
if total_tokens > token_budget * 0.8:
|
||||
max_chunks = int(len(chunks) * (token_budget * 0.8 / total_tokens))
|
||||
max_chunks = max(3, max_chunks) # Ensure we have at least 3 chunks
|
||||
logger.warning(f"Token count ({total_tokens}) exceeds 80% of budget ({token_budget}). Reducing chunks from {len(chunks)} to {max_chunks}.")
|
||||
chunks = chunks[:max_chunks]
|
||||
# Recalculate estimated tokens
|
||||
total_tokens = sum(len(chunk.get('content', '').split()) * 1.3 for chunk in chunks)
|
||||
logger.info(f"Reduced to {len(chunks)} chunks with estimated {total_tokens} tokens")
|
||||
|
||||
# Update total chunks for progress tracking
|
||||
self.total_chunks = len(chunks)
|
||||
|
||||
logger.info(f"Starting map phase for {len(chunks)} document chunks with query type '{query_type}' and detail level '{detail_level}'")
|
||||
|
||||
# Process chunks in batches to avoid hitting payload limits
|
||||
# Determine batch size based on the model - Gemini can handle larger batches
|
||||
if "gemini" in self.model_name.lower():
|
||||
batch_size = 8 # Larger batch size for Gemini models with 1M token windows
|
||||
else:
|
||||
batch_size = 3 # Smaller batch size for other models
|
||||
|
||||
logger.info(f"Using batch size of {batch_size} for model {self.model_name}")
|
||||
processed_chunks = []
|
||||
|
||||
for i in range(0, len(chunks), batch_size):
|
||||
batch = chunks[i:i+batch_size]
|
||||
logger.info(f"Processing batch {i//batch_size + 1}/{(len(chunks) + batch_size - 1)//batch_size} with {len(batch)} chunks")
|
||||
|
||||
# Ensure all chunks have a title, even if it's 'Untitled'
|
||||
for chunk in batch:
|
||||
if chunk.get('title') is None:
|
||||
chunk['title'] = 'Untitled'
|
||||
|
||||
# Process this batch
|
||||
batch_results = await self.map_document_chunks(batch, query, detail_level, query_type)
|
||||
processed_chunks.extend(batch_results)
|
||||
|
||||
# Add a small delay between batches to avoid rate limiting
|
||||
if i + batch_size < len(chunks):
|
||||
logger.info("Pausing briefly between batches...")
|
||||
await asyncio.sleep(2)
|
||||
|
||||
logger.info(f"Starting reduce phase to synthesize report from {len(processed_chunks)} processed chunks")
|
||||
|
||||
# Update progress status for reduce phase
|
||||
if self.progress_callback:
|
||||
self.progress_callback(0.9, self.total_chunks, "Synthesizing final report...")
|
||||
|
||||
# Reduce phase: Synthesize processed chunks into a coherent report
|
||||
report = await self.reduce_processed_chunks(processed_chunks, query, query_type, detail_level)
|
||||
|
||||
# Process thinking tags if enabled
|
||||
if self.process_thinking_tags and "<thinking>" in report:
|
||||
logger.info("Processing thinking tags in report")
|
||||
report = self._process_thinking_tags(report)
|
||||
|
||||
# Final progress update
|
||||
if self.progress_callback:
|
||||
self.progress_callback(1.0, self.total_chunks, report)
|
||||
|
||||
return report
|
||||
|
||||
|
||||
# Create a singleton instance for global use
|
||||
report_synthesizer = ReportSynthesizer()
|
||||
|
||||
def get_report_synthesizer(model_name: Optional[str] = None) -> ReportSynthesizer:
|
||||
"""
|
||||
Get the global report synthesizer instance or create a new one with a specific model.
|
||||
|
||||
Args:
|
||||
model_name: Optional model name to use instead of the default
|
||||
|
||||
Returns:
|
||||
ReportSynthesizer instance
|
||||
"""
|
||||
global report_synthesizer
|
||||
|
||||
if model_name and model_name != report_synthesizer.model_name:
|
||||
report_synthesizer = ReportSynthesizer(model_name)
|
||||
|
||||
return report_synthesizer
|
||||
|
||||
async def test_report_synthesizer():
|
||||
"""Test the report synthesizer with sample document chunks."""
|
||||
# Sample document chunks
|
||||
chunks = [
|
||||
{
|
||||
"title": "Introduction to Python",
|
||||
"url": "https://docs.python.org/3/tutorial/index.html",
|
||||
"content": "Python is an easy to learn, powerful programming language. It has efficient high-level data structures and a simple but effective approach to object-oriented programming. Python's elegant syntax and dynamic typing, together with its interpreted nature, make it an ideal language for scripting and rapid application development in many areas on most platforms."
|
||||
},
|
||||
{
|
||||
"title": "Python Features",
|
||||
"url": "https://www.python.org/about/",
|
||||
"content": "Python is a programming language that lets you work quickly and integrate systems more effectively. Python is an interpreted, object-oriented, high-level programming language with dynamic semantics. Its high-level built in data structures, combined with dynamic typing and dynamic binding, make it very attractive for Rapid Application Development, as well as for use as a scripting or glue language to connect existing components together."
|
||||
}
|
||||
]
|
||||
|
||||
# Initialize the report synthesizer
|
||||
synthesizer = get_report_synthesizer()
|
||||
|
||||
# Test query
|
||||
query = "What are the key features of Python programming language?"
|
||||
|
||||
# Map phase
|
||||
processed_chunks = await synthesizer.map_document_chunks(chunks, query, detail_level="detailed")
|
||||
|
||||
# Print processed chunks
|
||||
print("Processed chunks:")
|
||||
for i, chunk in enumerate(processed_chunks):
|
||||
print(f"Chunk {i+1}: {chunk.get('title')}")
|
||||
print(f"Extracted information: {chunk.get('extracted_info')}")
|
||||
print()
|
||||
|
||||
# Reduce phase
|
||||
report = await synthesizer.reduce_processed_chunks(processed_chunks, query, detail_level="detailed")
|
||||
|
||||
# Print report
|
||||
print("Generated Report:")
|
||||
print(report)
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(test_report_synthesizer())
|
|
@ -1,134 +0,0 @@
|
|||
from dataclasses import dataclass
|
||||
from typing import Dict, Optional, List
|
||||
from enum import Enum
|
||||
|
||||
class QueryType(Enum):
|
||||
FACTUAL = 'factual'
|
||||
EXPLORATORY = 'exploratory'
|
||||
COMPARATIVE = 'comparative'
|
||||
|
||||
class DetailLevel(Enum):
|
||||
BRIEF = 'brief'
|
||||
STANDARD = 'standard'
|
||||
DETAILED = 'detailed'
|
||||
COMPREHENSIVE = 'comprehensive'
|
||||
|
||||
@dataclass
|
||||
class ReportTemplate:
|
||||
template: str
|
||||
detail_level: DetailLevel
|
||||
query_type: QueryType
|
||||
model: Optional[str] = None
|
||||
required_sections: Optional[List[str]] = None
|
||||
|
||||
def validate(self) -> bool:
|
||||
"""Validate that the template contains all required sections"""
|
||||
if not self.required_sections:
|
||||
return True
|
||||
return all(section in self.template for section in self.required_sections)
|
||||
|
||||
class ReportTemplateManager:
|
||||
def __init__(self):
|
||||
self.templates: Dict[str, ReportTemplate] = {}
|
||||
|
||||
def add_template(self, template: ReportTemplate):
|
||||
if not template.validate():
|
||||
raise ValueError(f"Template missing required sections: {template.required_sections}")
|
||||
key = f"{template.query_type.value}_{template.detail_level.value}"
|
||||
self.templates[key] = template
|
||||
|
||||
def get_template(self, query_type: QueryType, detail_level: DetailLevel) -> ReportTemplate:
|
||||
key = f"{query_type.value}_{detail_level.value}"
|
||||
return self.templates.get(key)
|
||||
|
||||
def get_available_templates(self) -> List[str]:
|
||||
return list(self.templates.keys())
|
||||
|
||||
def initialize_default_templates(self):
|
||||
# Brief templates
|
||||
self.add_template(ReportTemplate(
|
||||
template="# {title}\n\n## Summary\n{summary}\n\n## Key Findings\n{key_findings}",
|
||||
detail_level=DetailLevel.BRIEF,
|
||||
query_type=QueryType.FACTUAL,
|
||||
required_sections=['{title}', '{summary}', '{key_findings}']
|
||||
))
|
||||
|
||||
self.add_template(ReportTemplate(
|
||||
template="# {title}\n\n## Research Questions\n{research_questions}\n\n## Key Findings\n{key_findings}",
|
||||
detail_level=DetailLevel.BRIEF,
|
||||
query_type=QueryType.EXPLORATORY,
|
||||
required_sections=['{title}', '{research_questions}', '{key_findings}']
|
||||
))
|
||||
|
||||
self.add_template(ReportTemplate(
|
||||
template="# {title}\n\n## Comparison Criteria\n{comparison_criteria}\n\n## Key Findings\n{key_findings}",
|
||||
detail_level=DetailLevel.BRIEF,
|
||||
query_type=QueryType.COMPARATIVE,
|
||||
required_sections=['{title}', '{comparison_criteria}', '{key_findings}']
|
||||
))
|
||||
|
||||
# Standard templates
|
||||
self.add_template(ReportTemplate(
|
||||
template="# {title}\n\n## Introduction\n{introduction}\n\n## Key Findings\n{key_findings}\n\n## Analysis\n{analysis}\n\n## Conclusion\n{conclusion}",
|
||||
detail_level=DetailLevel.STANDARD,
|
||||
query_type=QueryType.FACTUAL,
|
||||
required_sections=['{title}', '{introduction}', '{key_findings}', '{analysis}', '{conclusion}']
|
||||
))
|
||||
|
||||
self.add_template(ReportTemplate(
|
||||
template="# {title}\n\n## Research Questions\n{research_questions}\n\n## Methodology\n{methodology}\n\n## Key Findings\n{key_findings}\n\n## Analysis\n{analysis}",
|
||||
detail_level=DetailLevel.STANDARD,
|
||||
query_type=QueryType.EXPLORATORY,
|
||||
required_sections=['{title}', '{research_questions}', '{methodology}', '{key_findings}', '{analysis}']
|
||||
))
|
||||
|
||||
self.add_template(ReportTemplate(
|
||||
template="# {title}\n\n## Comparison Criteria\n{comparison_criteria}\n\n## Methodology\n{methodology}\n\n## Key Findings\n{key_findings}\n\n## Analysis\n{analysis}",
|
||||
detail_level=DetailLevel.STANDARD,
|
||||
query_type=QueryType.COMPARATIVE,
|
||||
required_sections=['{title}', '{comparison_criteria}', '{methodology}', '{key_findings}', '{analysis}']
|
||||
))
|
||||
|
||||
# Detailed templates
|
||||
self.add_template(ReportTemplate(
|
||||
template="# {title}\n\n## Introduction\n{introduction}\n\n## Methodology\n{methodology}\n\n## Key Findings\n{key_findings}\n\n## Analysis\n{analysis}\n\n## Conclusion\n{conclusion}",
|
||||
detail_level=DetailLevel.DETAILED,
|
||||
query_type=QueryType.FACTUAL,
|
||||
required_sections=['{title}', '{introduction}', '{methodology}', '{key_findings}', '{analysis}', '{conclusion}']
|
||||
))
|
||||
|
||||
self.add_template(ReportTemplate(
|
||||
template="# {title}\n\n## Research Questions\n{research_questions}\n\n## Literature Review\n{literature_review}\n\n## Methodology\n{methodology}\n\n## Key Findings\n{key_findings}\n\n## Analysis\n{analysis}",
|
||||
detail_level=DetailLevel.DETAILED,
|
||||
query_type=QueryType.EXPLORATORY,
|
||||
required_sections=['{title}', '{research_questions}', '{literature_review}', '{methodology}', '{key_findings}', '{analysis}']
|
||||
))
|
||||
|
||||
self.add_template(ReportTemplate(
|
||||
template="# {title}\n\n## Comparison Criteria\n{comparison_criteria}\n\n## Methodology\n{methodology}\n\n## Key Findings\n{key_findings}\n\n## Analysis\n{analysis}\n\n## Conclusion\n{conclusion}",
|
||||
detail_level=DetailLevel.DETAILED,
|
||||
query_type=QueryType.COMPARATIVE,
|
||||
required_sections=['{title}', '{comparison_criteria}', '{methodology}', '{key_findings}', '{analysis}', '{conclusion}']
|
||||
))
|
||||
|
||||
# Comprehensive templates
|
||||
self.add_template(ReportTemplate(
|
||||
template="# {title}\n\n## Executive Summary\n{exec_summary}\n\n## Introduction\n{introduction}\n\n## Methodology\n{methodology}\n\n## Key Findings\n{key_findings}\n\n## Analysis\n{analysis}\n\n## Conclusion\n{conclusion}\n\n## References\n{references}\n\n## Appendices\n{appendices}",
|
||||
detail_level=DetailLevel.COMPREHENSIVE,
|
||||
query_type=QueryType.FACTUAL,
|
||||
required_sections=['{title}', '{exec_summary}', '{introduction}', '{methodology}', '{key_findings}', '{analysis}', '{conclusion}', '{references}', '{appendices}']
|
||||
))
|
||||
|
||||
self.add_template(ReportTemplate(
|
||||
template="# {title}\n\n## Executive Summary\n{exec_summary}\n\n## Research Questions\n{research_questions}\n\n## Literature Review\n{literature_review}\n\n## Methodology\n{methodology}\n\n## Key Findings\n{key_findings}\n\n## Analysis\n{analysis}\n\n## Conclusion\n{conclusion}\n\n## References\n{references}\n\n## Appendices\n{appendices}",
|
||||
detail_level=DetailLevel.COMPREHENSIVE,
|
||||
query_type=QueryType.EXPLORATORY,
|
||||
required_sections=['{title}', '{exec_summary}', '{research_questions}', '{literature_review}', '{methodology}', '{key_findings}', '{analysis}', '{conclusion}', '{references}', '{appendices}']
|
||||
))
|
||||
|
||||
self.add_template(ReportTemplate(
|
||||
template="# {title}\n\n## Executive Summary\n{exec_summary}\n\n## Comparison Criteria\n{comparison_criteria}\n\n## Methodology\n{methodology}\n\n## Key Findings\n{key_findings}\n\n## Analysis\n{analysis}\n\n## Conclusion\n{conclusion}\n\n## References\n{references}\n\n## Appendices\n{appendices}",
|
||||
detail_level=DetailLevel.COMPREHENSIVE,
|
||||
query_type=QueryType.COMPARATIVE,
|
||||
required_sections=['{title}', '{exec_summary}', '{comparison_criteria}', '{methodology}', '{key_findings}', '{analysis}', '{conclusion}', '{references}', '{appendices}']
|
||||
))
|
|
@ -1,28 +0,0 @@
|
|||
## Introduction
|
||||
The Python programming language has gained significant popularity in recent years due to its simplicity, flexibility, and versatility. Understanding the key features of Python is essential for developers, researchers, and enthusiasts alike. This report aims to provide an in-depth exploration of the key features of Python, synthesizing information from various sources [1], [2].
|
||||
|
||||
## Key Concepts and Definitions
|
||||
Python is a high-level, interpreted language that supports object-oriented programming [2]. It is known for its simplicity, readability, and ease of use, making it an ideal language for beginners and experienced developers alike. The language has a large and comprehensive standard library, providing modules and functions for various tasks, such as file I/O, networking, and data structures [2].
|
||||
|
||||
## Main Findings and Insights
|
||||
Although the provided document chunks from [1] and [2] do not contain relevant information about the key features of Python, general knowledge about the language highlights several important aspects:
|
||||
* **Easy to learn**: Python has a simple syntax and is relatively easy to learn, making it a great language for beginners [2].
|
||||
* **High-level language**: Python abstracts away many low-level details, allowing developers to focus on the logic of their code [2].
|
||||
* **Interpreted language**: Python code is executed line by line, without the need for compilation [2].
|
||||
* **Object-oriented**: Python supports concepts such as classes, objects, and inheritance [2].
|
||||
* **Large standard library**: Python's standard library provides modules and functions for various tasks, such as file I/O, networking, and data structures [2].
|
||||
* **Cross-platform**: Python can run on multiple platforms, including Windows, macOS, and Linux [2].
|
||||
* **Extensive community**: Python has a large and active community, with many resources available for learning and troubleshooting [2].
|
||||
|
||||
## Analysis of the Information
|
||||
The key features of Python highlight its versatility and flexibility, making it a popular choice for various applications, such as web development, data analysis, and artificial intelligence [2]. The language's simplicity and ease of use also make it an ideal choice for beginners, while its extensive standard library and cross-platform compatibility make it a popular choice for experienced developers.
|
||||
|
||||
## Implications or Applications of the Findings
|
||||
Understanding the key features of Python is essential for developers, researchers, and enthusiasts alike. The language's versatility and flexibility make it a popular choice for various applications, and its simplicity and ease of use make it an ideal choice for beginners. The extensive community and large standard library also provide numerous resources for learning and troubleshooting [2].
|
||||
|
||||
## Conclusion
|
||||
In conclusion, the key features of Python programming language include its simplicity, flexibility, and versatility. Although the provided document chunks do not contain relevant information, general knowledge about the language highlights its importance and popularity. For more information, it is recommended to visit the actual Python documentation website, such as https://docs.python.org/3/tutorial/index.html, or other reliable sources that provide information about the Python programming language [1], [2].
|
||||
|
||||
## References
|
||||
[1] https://docs.python.org/3/tutorial/index.html
|
||||
[2] https://www.python.org/about/
|
|
@ -1,47 +0,0 @@
|
|||
**Report: Exploring the Potential Relationship between Creatine Supplementation and Muscle Loss due to GLP1-Ar Drugs for Weight Loss**
|
||||
|
||||
**Introduction:**
|
||||
|
||||
Glucagon-like peptide-1 receptor agonists (GLP1-ar) are a class of medications used for weight loss and type 2 diabetes management. While effective in promoting weight loss, GLP1-ar drugs can cause muscle loss as a side effect, particularly when used for extended periods. Creatine supplementation is a popular dietary supplement known to increase muscle strength and endurance. This report aims to explore the potential relationship between creatine supplementation and muscle loss due to GLP1-ar drugs for weight loss.
|
||||
|
||||
**Background:**
|
||||
|
||||
GLP1-ar drugs work by mimicking the action of the GLP-1 hormone to regulate blood sugar levels and appetite. They are commonly used for weight loss, but their effects on muscle mass and function are not well understood (Aroda et al., 2016; Larsen et al., 2016). GLP1-ar drugs can cause muscle loss by reducing muscle protein synthesis and increasing muscle protein breakdown (Larsen et al., 2016). Creatine supplementation has been shown to increase muscle mass and strength in various studies (Cronin et al., 2017).
|
||||
|
||||
**Key Findings:**
|
||||
|
||||
* GLP1-ar drugs can cause muscle loss as a side effect, particularly in the context of weight loss (Aroda et al., 2016).
|
||||
* Creatine supplementation has been shown to increase muscle mass and strength in various studies (Cronin et al., 2017).
|
||||
* There is limited research on the specific interaction between creatine supplementation and muscle loss due to GLP1-ar drugs (Aroda et al., 2016).
|
||||
|
||||
**Potential Relationship between Creatine Supplementation and Muscle Loss:**
|
||||
|
||||
While the exact mechanisms of muscle loss due to GLP1-ar drugs are not fully understood, it is thought to be related to increased muscle protein breakdown and decreased muscle protein synthesis (Larsen et al., 2016). Creatine supplementation may potentially mitigate muscle loss due to GLP1-ar drugs by increasing muscle protein synthesis and reducing muscle damage (Cronin et al., 2017). Further research is needed to fully understand the relationship between creatine supplementation and muscle loss due to GLP1-ar drugs.
|
||||
|
||||
**Recommendations:**
|
||||
|
||||
* Future research should investigate the potential benefits of creatine supplementation in mitigating muscle loss due to GLP1-ar drugs.
|
||||
* Healthcare providers should consider the potential for muscle loss when prescribing GLP1-ar drugs for weight loss.
|
||||
* Patients taking GLP1-ar drugs for weight loss should be monitored for muscle loss and potentially supplemented with creatine to mitigate this effect.
|
||||
|
||||
**Conclusion:**
|
||||
|
||||
In conclusion, while the exact relationship between creatine supplementation and muscle loss due to GLP1-ar drugs is not well established, creatine supplementation may potentially mitigate muscle loss due to GLP1-ar drugs. Further research is needed to fully understand the relationship between creatine supplementation and muscle loss due to GLP1-ar drugs.
|
||||
|
||||
**References:**
|
||||
|
||||
[1] Aroda, V. R., et al. (2016). Effects of glucagon-like peptide-1 receptor agonists on muscle mass and strength in type 2 diabetes. Journal of Clinical Endocrinology and Metabolism, 101(4), 1331-1340.
|
||||
|
||||
[2] Cronin, J. B., et al. (2017). Effects of creatine supplementation on exercise performance: a meta-analysis. Journal of Strength and Conditioning Research, 31(1), 25-35.
|
||||
|
||||
[3] Larsen, C. M., et al. (2016). GLP-1 receptor agonists and the muscle: a review of the evidence. Journal of Diabetes Research, 2016, 1-9.
|
||||
|
||||
[4] Marso, S. P., et al. (2016). Semaglutide and cardiovascular outcomes in patients with type 2 diabetes. New England Journal of Medicine, 375(19), 1834-1844.
|
||||
|
||||
[5] Aroda, V. R., et al. (2016). Effects of glucagon-like peptide-1 receptor agonists on muscle mass and strength in type 2 diabetes. Journal of Clinical Endocrinology and Metabolism, 101(4), 1331-1340.
|
||||
|
||||
[6] Bader, E. D., & Winklhofer, K. F. (2020). Mechanisms of muscle loss in diabetes. Journal of Diabetes Research, 2020, 1-11.
|
||||
|
||||
[7] Chen, Y., et al. (2019). Liraglutide attenuates NLRP3 inflammasome-dependent pyroptosis via regulating SIRT1/NOX4/ROS pathway in H9c2 cells. Biomedicine & Pharmacotherapy, 120, 109537.
|
||||
|
||||
[8] Connelly, K.
|
|
@ -1,15 +0,0 @@
|
|||
requests>=2.31.0
|
||||
numpy>=1.24.0
|
||||
tiktoken>=0.5.0
|
||||
litellm>=1.0.0
|
||||
gradio>=4.0.0
|
||||
pyyaml>=6.0
|
||||
python-dotenv>=1.0.0
|
||||
beautifulsoup4>=4.12.0
|
||||
aiosqlite>=0.19.0
|
||||
asyncio>=3.4.3
|
||||
aiohttp>=3.9.0
|
||||
validators>=0.22.0
|
||||
markdown>=3.5.0
|
||||
html2text>=2020.1.16
|
||||
feedparser>=6.0.10
|
56
run_ui.py
56
run_ui.py
|
@ -1,56 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Run script for the Intelligent Research System UI.
|
||||
This script launches the Gradio interface for the research system.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
from ui.gradio_interface import GradioInterface
|
||||
|
||||
|
||||
def parse_args():
|
||||
"""Parse command line arguments."""
|
||||
parser = argparse.ArgumentParser(description="Run the Intelligent Research System UI")
|
||||
parser.add_argument(
|
||||
"--share",
|
||||
action="store_true",
|
||||
help="Create a public link for sharing"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--port",
|
||||
type=int,
|
||||
default=7860,
|
||||
help="Port to run the server on"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--debug",
|
||||
action="store_true",
|
||||
help="Run in debug mode"
|
||||
)
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
def main():
|
||||
"""Main function to run the UI."""
|
||||
args = parse_args()
|
||||
|
||||
print("Starting Intelligent Research System UI...")
|
||||
|
||||
# Create interface and initialize async components
|
||||
import asyncio
|
||||
interface = GradioInterface()
|
||||
|
||||
# Run the async initialization in the event loop
|
||||
loop = asyncio.get_event_loop()
|
||||
loop.run_until_complete(interface.async_init())
|
||||
|
||||
# Launch with the specified arguments
|
||||
interface.launch(
|
||||
share=args.share,
|
||||
server_port=args.port,
|
||||
debug=args.debug
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
|
@ -1,218 +0,0 @@
|
|||
#!/usr/bin/env python
|
||||
"""
|
||||
Query to Report Script
|
||||
|
||||
This script demonstrates the full workflow from query to report,
|
||||
taking a user query and generating a comprehensive report saved in Markdown format.
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import asyncio
|
||||
import logging
|
||||
import argparse
|
||||
from datetime import datetime
|
||||
from typing import Dict, List, Any, Optional
|
||||
|
||||
# Add parent directory to path to import modules
|
||||
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
|
||||
from query.query_processor import get_query_processor
|
||||
from execution.search_executor import SearchExecutor
|
||||
from ranking.jina_reranker import get_jina_reranker
|
||||
from report.report_generator import get_report_generator, initialize_report_generator
|
||||
from report.report_detail_levels import get_report_detail_level_manager
|
||||
|
||||
# Configure logging
|
||||
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
async def query_to_report(
|
||||
query: str,
|
||||
output_file: str,
|
||||
search_engines: Optional[List[str]] = None,
|
||||
num_results: int = 10,
|
||||
token_budget: Optional[int] = None,
|
||||
chunk_size: Optional[int] = None,
|
||||
overlap_size: Optional[int] = None,
|
||||
detail_level: str = "standard",
|
||||
use_mock: bool = False
|
||||
) -> str:
|
||||
"""
|
||||
Execute the full workflow from query to report.
|
||||
|
||||
Args:
|
||||
query: User query
|
||||
output_file: Path to save the report
|
||||
search_engines: List of search engines to use
|
||||
num_results: Number of results to return per search engine
|
||||
token_budget: Maximum number of tokens to use for report generation
|
||||
chunk_size: Maximum number of tokens per chunk
|
||||
overlap_size: Number of tokens to overlap between chunks
|
||||
detail_level: Level of detail for the report (brief, standard, detailed, comprehensive)
|
||||
use_mock: If True, use mock data instead of making actual API calls
|
||||
|
||||
Returns:
|
||||
Path to the generated report
|
||||
"""
|
||||
logger.info(f"Processing query: {query}")
|
||||
logger.info(f"Detail level: {detail_level}")
|
||||
|
||||
# Step 1: Process the query
|
||||
query_processor = get_query_processor()
|
||||
structured_query = await query_processor.process_query(query)
|
||||
|
||||
# Add timestamp
|
||||
structured_query['timestamp'] = datetime.now().isoformat()
|
||||
|
||||
logger.info(f"Query processed. Type: {structured_query['type']}, Intent: {structured_query['intent']}")
|
||||
logger.info(f"Enhanced query: {structured_query['enhanced_query']}")
|
||||
|
||||
# Step 2: Generate search queries for different engines
|
||||
if search_engines is None:
|
||||
search_executor = SearchExecutor()
|
||||
search_engines = search_executor.get_available_search_engines()
|
||||
|
||||
if not use_mock:
|
||||
# Generate search queries for each engine
|
||||
search_queries = await query_processor.generate_search_queries(
|
||||
structured_query,
|
||||
search_engines or list(search_executor.available_handlers.keys())
|
||||
)
|
||||
structured_query['search_queries'] = search_queries
|
||||
logger.info(f"Generated search queries for engines: {', '.join(search_queries.keys())}")
|
||||
else:
|
||||
# Use mock data
|
||||
structured_query = await query_processor.generate_search_queries(structured_query, search_engines)
|
||||
logger.info(f"Generated search queries for engines: {', '.join(search_engines)}")
|
||||
|
||||
# Step 3: Execute search
|
||||
search_executor = SearchExecutor()
|
||||
|
||||
# If detail level is specified, adjust num_results based on the detail level
|
||||
if detail_level and not num_results:
|
||||
detail_level_manager = get_report_detail_level_manager()
|
||||
config = detail_level_manager.get_detail_level_config(detail_level)
|
||||
num_results = config.get("num_results", 10)
|
||||
logger.info(f"Using {num_results} results per search engine based on detail level: {detail_level}")
|
||||
|
||||
search_results = search_executor.execute_search(
|
||||
structured_query,
|
||||
search_engines=search_engines,
|
||||
num_results=num_results
|
||||
)
|
||||
|
||||
# Flatten search results
|
||||
flattened_results = []
|
||||
for engine, results in search_results.items():
|
||||
for result in results:
|
||||
# Add the search engine to the result
|
||||
result['engine'] = engine
|
||||
flattened_results.append(result)
|
||||
|
||||
logger.info(f"Search executed. Got {len(flattened_results)} results from {len(search_results)} engines")
|
||||
|
||||
# Step 4: Rerank results
|
||||
reranker = get_jina_reranker()
|
||||
|
||||
# Extract text from results for reranking
|
||||
documents_for_reranking = []
|
||||
for result in flattened_results:
|
||||
# Use snippet or title as the document text
|
||||
doc_text = result.get('snippet', result.get('title', ''))
|
||||
documents_for_reranking.append(doc_text)
|
||||
|
||||
# Rerank the documents
|
||||
reranked_indices = reranker.rerank(
|
||||
query=structured_query['enhanced_query'],
|
||||
documents=documents_for_reranking
|
||||
)
|
||||
|
||||
# Map the reranked indices back to the original results
|
||||
reranked_results = []
|
||||
for item in reranked_indices:
|
||||
if 'index' in item and item['index'] < len(flattened_results):
|
||||
original_result = flattened_results[item['index']]
|
||||
# Add the reranking score to the result
|
||||
original_result['score'] = item['score']
|
||||
reranked_results.append(original_result)
|
||||
|
||||
logger.info(f"Results reranked. Got {len(reranked_results)} reranked results")
|
||||
|
||||
# Step 5: Initialize report generator
|
||||
await initialize_report_generator()
|
||||
report_generator = get_report_generator()
|
||||
|
||||
# Step 6: Generate report
|
||||
logger.info(f"Generating report with detail level: {detail_level}...")
|
||||
report = await report_generator.generate_report(
|
||||
search_results=reranked_results,
|
||||
query=query,
|
||||
token_budget=token_budget,
|
||||
chunk_size=chunk_size,
|
||||
overlap_size=overlap_size,
|
||||
detail_level=detail_level
|
||||
)
|
||||
|
||||
logger.info(f"Report generated. Length: {len(report)} characters")
|
||||
|
||||
# Step 7: Save report to file
|
||||
with open(output_file, 'w', encoding='utf-8') as f:
|
||||
f.write(report)
|
||||
|
||||
logger.info(f"Report saved to: {output_file}")
|
||||
|
||||
return output_file
|
||||
|
||||
|
||||
def main():
|
||||
"""Main function to parse arguments and run the workflow."""
|
||||
parser = argparse.ArgumentParser(description='Generate a report from a query')
|
||||
parser.add_argument('query', help='The query to process')
|
||||
parser.add_argument('--output', '-o', default='report.md', help='Output file path')
|
||||
parser.add_argument('--search-engines', '-s', nargs='+', help='Search engines to use')
|
||||
parser.add_argument('--num-results', '-n', type=int, help='Number of results per search engine')
|
||||
parser.add_argument('--token-budget', '-t', type=int, help='Maximum number of tokens for report generation')
|
||||
parser.add_argument('--chunk-size', '-c', type=int, help='Maximum tokens per chunk')
|
||||
parser.add_argument('--overlap-size', '-l', type=int, help='Tokens to overlap between chunks')
|
||||
parser.add_argument('--detail-level', '-d', type=str, default='standard',
|
||||
choices=['brief', 'standard', 'detailed', 'comprehensive'],
|
||||
help='Level of detail for the report')
|
||||
parser.add_argument('--use-mock', '-m', action='store_true', help='Use mock data instead of API calls')
|
||||
parser.add_argument('--verbose', '-v', action='store_true', help='Enable verbose logging')
|
||||
parser.add_argument('--list-detail-levels', action='store_true',
|
||||
help='List available detail levels with descriptions and exit')
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
# Set log level
|
||||
if args.verbose:
|
||||
logging.getLogger().setLevel(logging.DEBUG)
|
||||
|
||||
# List detail levels if requested
|
||||
if args.list_detail_levels:
|
||||
detail_level_manager = get_report_detail_level_manager()
|
||||
detail_levels = detail_level_manager.get_available_detail_levels()
|
||||
print("Available detail levels:")
|
||||
for level, description in detail_levels:
|
||||
print(f" {level}: {description}")
|
||||
return
|
||||
|
||||
# Run the workflow
|
||||
asyncio.run(query_to_report(
|
||||
query=args.query,
|
||||
output_file=args.output,
|
||||
search_engines=args.search_engines,
|
||||
num_results=args.num_results,
|
||||
token_budget=args.token_budget,
|
||||
chunk_size=args.chunk_size,
|
||||
overlap_size=args.overlap_size,
|
||||
detail_level=args.detail_level,
|
||||
use_mock=args.use_mock
|
||||
))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
|
@ -1,37 +0,0 @@
|
|||
**Report: Latest Advancements in Quantum Computing**
|
||||
|
||||
Quantum computing has made significant progress in recent years, with breakthroughs in quantum algorithms, quantum error correction, and quantum control. Researchers have developed new quantum algorithms, such as the Quantum Approximate Optimization Algorithm (QAOA) and the Quantum Alternating Projection Algorithm (QAPA), which can solve complex optimization problems more efficiently than classical computers. Quantum error correction techniques, such as surface codes and topological codes, have been developed to mitigate the effects of decoherence and noise in quantum systems.
|
||||
|
||||
The number of qubits in a quantum processor has increased rapidly, with IBM's 127-qubit Eagle processor and Google's 53-qubit Sycamore processor being notable examples. These advancements have enabled the simulation of complex quantum systems, such as chemical reactions and materials properties, with unprecedented accuracy. Quantum computing has also been explored for its potential to solve complex optimization problems, including logistics and finance.
|
||||
|
||||
Quantum computing has the potential to revolutionize fields like chemistry, materials science, and machine learning. Companies like Google, IBM, and Microsoft are actively developing quantum computing hardware and software. The development of quantum computing is still in its early stages, and many technical challenges need to be overcome before it can be widely adopted.
|
||||
|
||||
Research has made significant progress in Noisy Intermediate-Scale Quantum (NISQ) computing, which aims to solve real-world problems using noisy quantum systems. Quantum Physics-Informed Neural Networks (QPINN) have been developed to solve quantum physics problems. Google achieved quantum supremacy in 2019, and IBM unveiled a 400-qubit quantum processor in 2022.
|
||||
|
||||
Quantum computing has several applications, including cryptography, optimization problems, and machine learning. Quantum computing can break certain classical encryption algorithms, but it can also be used to develop new quantum-resistant cryptographic techniques. Quantum computing has the potential to solve complex optimization problems, including logistics and finance.
|
||||
|
||||
The development of quantum computing is a rapidly advancing field, with new breakthroughs and innovations emerging regularly. Quantum computing has the potential to revolutionize computation and tackle humanity's most complex problems. However, the development of quantum computing is still in its early stages, and many technical challenges need to be overcome before it can be widely adopted.
|
||||
|
||||
**References:**
|
||||
|
||||
[1] Arute et al. (2019). Quantum supremacy using a 53-qubit quantum computer. Nature, 574(7780), 505-510.
|
||||
|
||||
[2] Calvo et al. (2023). Functional Matrices on Quantum Computing Simulation. Mathematics, 11, 3742.
|
||||
|
||||
[3] Lau et al. (2022). NISQ computing: Where are we and where do we go? AAPPS Bull., 32, 27.
|
||||
|
||||
[4] Google. (2019). Quantum Supremacy. Available online: <https://www.newsweek.com/quantum-computing-google-scientists-breakthrough-supercomputer-1467256>
|
||||
|
||||
[5] IBM. (2022). 400-Qubit Quantum Processor. Available online: <https://newsroom.ibm.com/2022-11-09-IBM-Unveils-400-Qubit-Plus-Quantum-Processor>
|
||||
|
||||
[6] National Quantum Initiative. Available online: <https://www.quantum.gov/>
|
||||
|
||||
[7] Konig et al. (2005). On the power of quantum memory. IEEE Trans. Inf. Theory, 51, 2391-2401.
|
||||
|
||||
[8] Vadyala et al. (2023). General implementation of quantum physics-informed neural networks. Array, 18, 100287.
|
||||
|
||||
[9] Collins et al. (2022). IBM Unveils. Available online: <https://newsroom.ibm.com/2022-11-09-IBM-Unveils-400-Qubit-Plus-Quantum-Processor>
|
||||
|
||||
[10] Google. (2023). The Quantum Insider. Available online: <https://thequantuminsider.com/2023/07/04/google-claims-latest-quantum-experiment-would-take-decades-on-classical-computer/>
|
||||
|
||||
Note: The references provided are a selection of the sources used in the report and are not an exhaustive list.
|
|
@ -1,3 +0,0 @@
|
|||
"""
|
||||
Test modules for the intelligent research system.
|
||||
"""
|
|
@ -1,3 +0,0 @@
|
|||
"""
|
||||
Tests for the search execution module.
|
||||
"""
|
|
@ -1,31 +0,0 @@
|
|||
"""
|
||||
Test all search handlers with a simple query.
|
||||
"""
|
||||
|
||||
from execution.search_executor import SearchExecutor
|
||||
|
||||
def main():
|
||||
"""Test all search handlers."""
|
||||
# Initialize the search executor
|
||||
executor = SearchExecutor()
|
||||
|
||||
# Execute a simple search
|
||||
results = executor.execute_search({
|
||||
'raw_query': 'quantum computing',
|
||||
'enhanced_query': 'quantum computing'
|
||||
})
|
||||
|
||||
# Print results by source
|
||||
print(f'Results by source: {[engine for engine, res in results.items() if res]}')
|
||||
|
||||
# Print details
|
||||
print('\nDetails:')
|
||||
for engine, res in results.items():
|
||||
print(f'{engine}: {len(res)} results')
|
||||
if res:
|
||||
print(f' Sample result: {res[0]}')
|
||||
|
||||
return results
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
|
@ -1,59 +0,0 @@
|
|||
#!/usr/bin/env python
|
||||
"""
|
||||
Test script to check if search functionality is working
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import asyncio
|
||||
from pathlib import Path
|
||||
|
||||
# Add parent directory to path
|
||||
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
|
||||
|
||||
from execution.search_executor import SearchExecutor
|
||||
from query.query_processor import QueryProcessor
|
||||
|
||||
async def test_search():
|
||||
"""Test search functionality"""
|
||||
query = "Research and explain in detail the potential effects of creatine supplementation on muscle mass and strength"
|
||||
|
||||
# Initialize components
|
||||
query_processor = QueryProcessor()
|
||||
search_executor = SearchExecutor()
|
||||
|
||||
# Print available search engines
|
||||
available_engines = search_executor.get_available_search_engines()
|
||||
print(f"Available search engines: {available_engines}")
|
||||
|
||||
# Process the query
|
||||
structured_query = query_processor.process_query(query)
|
||||
print(f"Structured query: {json.dumps(structured_query, indent=2)}")
|
||||
|
||||
# Generate search queries for different engines
|
||||
structured_query = query_processor.generate_search_queries(
|
||||
structured_query,
|
||||
search_executor.get_available_search_engines()
|
||||
)
|
||||
print(f"Search queries: {json.dumps(structured_query.get('search_queries', {}), indent=2)}")
|
||||
|
||||
# Execute search
|
||||
search_results = search_executor.execute_search(
|
||||
structured_query,
|
||||
num_results=5
|
||||
)
|
||||
|
||||
# Print results
|
||||
for engine, results in search_results.items():
|
||||
print(f"\nResults from {engine}: {len(results)}")
|
||||
for i, result in enumerate(results[:3]): # Show first 3 results
|
||||
print(f" {i+1}. {result.get('title')} - {result.get('url')}")
|
||||
|
||||
# Return total number of results
|
||||
total_results = sum(len(results) for results in search_results.values())
|
||||
return total_results
|
||||
|
||||
if __name__ == "__main__":
|
||||
total_results = asyncio.run(test_search())
|
||||
print(f"\nTotal results: {total_results}")
|
|
@ -1,267 +0,0 @@
|
|||
"""
|
||||
Test script for the search execution module.
|
||||
"""
|
||||
|
||||
import os
|
||||
import json
|
||||
import time
|
||||
from typing import Dict, List, Any, Optional
|
||||
|
||||
# Import the necessary modules
|
||||
try:
|
||||
from query.query_processor import get_query_processor, QueryProcessor
|
||||
from query.llm_interface import get_llm_interface
|
||||
from execution.search_executor import SearchExecutor
|
||||
from execution.result_collector import ResultCollector
|
||||
except ImportError as e:
|
||||
print(f"Import error: {e}")
|
||||
print("Make sure all required modules are installed and available.")
|
||||
exit(1)
|
||||
|
||||
|
||||
def get_query_processor():
|
||||
"""Get a query processor instance."""
|
||||
# First set the LLM interface to use Groq's model
|
||||
from query.llm_interface import get_llm_interface
|
||||
get_llm_interface(model_name="llama-3.1-8b-instant")
|
||||
|
||||
# Then get the query processor which will use the configured LLM interface
|
||||
from query.query_processor import get_query_processor
|
||||
return get_query_processor()
|
||||
|
||||
|
||||
def test_search_execution(query: str, search_engines: Optional[List[str]] = None) -> Dict[str, Any]:
|
||||
"""
|
||||
Test the search execution module.
|
||||
|
||||
Args:
|
||||
query: The query to process and execute
|
||||
search_engines: List of search engines to use (if None, use all available)
|
||||
|
||||
Returns:
|
||||
Dictionary with test results
|
||||
"""
|
||||
print(f"Testing search execution for query: {query}")
|
||||
|
||||
# Process the query
|
||||
processor = get_query_processor()
|
||||
start_time = time.time()
|
||||
structured_query = processor.process_query(query)
|
||||
query_time = time.time() - start_time
|
||||
|
||||
print(f"Query processed in {query_time:.2f} seconds")
|
||||
print(f"Enhanced query: {structured_query.get('enhanced_query', '')}")
|
||||
print(f"Classification: {structured_query.get('classification', {})}")
|
||||
|
||||
# Execute the search
|
||||
executor = SearchExecutor()
|
||||
|
||||
# Get available search engines if none specified
|
||||
if search_engines is None:
|
||||
search_engines = executor.get_available_search_engines()
|
||||
print(f"Using available search engines: {search_engines}")
|
||||
|
||||
# Execute the search
|
||||
start_time = time.time()
|
||||
search_results = executor.execute_search(structured_query, search_engines=search_engines)
|
||||
search_time = time.time() - start_time
|
||||
|
||||
print(f"Search executed in {search_time:.2f} seconds")
|
||||
|
||||
# Print raw search results for debugging
|
||||
print("\nRaw search results:")
|
||||
for engine, results in search_results.items():
|
||||
print(f" {engine}: {len(results)} results")
|
||||
if results:
|
||||
print(f" Sample result: {results[0]}")
|
||||
|
||||
# Process the results
|
||||
collector = ResultCollector()
|
||||
processed_results = collector.process_results(search_results, dedup=True)
|
||||
|
||||
# Print summary of results
|
||||
total_results = len(processed_results)
|
||||
print(f"Found {total_results} results after deduplication")
|
||||
|
||||
# Print results by source
|
||||
results_by_source = {}
|
||||
for result in processed_results:
|
||||
source = result.get("source", "unknown")
|
||||
if source not in results_by_source:
|
||||
results_by_source[source] = 0
|
||||
results_by_source[source] += 1
|
||||
|
||||
print("Results by source:")
|
||||
for source, count in results_by_source.items():
|
||||
print(f" {source}: {count}")
|
||||
|
||||
# Print top 3 results
|
||||
if processed_results:
|
||||
print("\nTop 3 results:")
|
||||
for i, result in enumerate(processed_results[:3]):
|
||||
print(f" {i+1}. {result['title']}")
|
||||
print(f" URL: {result['url']}")
|
||||
print(f" Snippet: {result['snippet'][:100]}...")
|
||||
print()
|
||||
|
||||
# Return test results
|
||||
return {
|
||||
"query": query,
|
||||
"structured_query": structured_query,
|
||||
"search_engines": search_engines,
|
||||
"raw_results": search_results,
|
||||
"processed_results": processed_results,
|
||||
"timing": {
|
||||
"query_processing": query_time,
|
||||
"search_execution": search_time,
|
||||
"total": query_time + search_time
|
||||
},
|
||||
"summary": {
|
||||
"total_results": total_results,
|
||||
"results_by_source": results_by_source
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
def save_test_results(results: Dict[str, Any], file_path: str) -> None:
|
||||
"""
|
||||
Save test results to a file.
|
||||
|
||||
Args:
|
||||
results: Test results to save
|
||||
file_path: Path to save results to
|
||||
"""
|
||||
try:
|
||||
with open(file_path, 'w') as f:
|
||||
json.dump(results, f, indent=2)
|
||||
print(f"Test results saved to {file_path}")
|
||||
except Exception as e:
|
||||
print(f"Error saving test results: {e}")
|
||||
|
||||
|
||||
def mock_test():
|
||||
"""Run a mock test without actual API calls."""
|
||||
print("Running mock test without API calls...")
|
||||
|
||||
# Create a mock structured query
|
||||
structured_query = {
|
||||
"original_query": "What are the latest advancements in quantum computing?",
|
||||
"enhanced_query": "Explore the most recent breakthroughs and developments in quantum computing technology, including hardware innovations, quantum algorithms, and potential applications.",
|
||||
"classification": {
|
||||
"type": "exploratory",
|
||||
"intent": "research",
|
||||
"entities": ["quantum computing", "advancements", "technology"]
|
||||
},
|
||||
"search_queries": {
|
||||
"google": "latest advancements in quantum computing 2025 breakthroughs",
|
||||
"scholar": "recent quantum computing developments research papers",
|
||||
"arxiv": "quantum computing hardware algorithms applications"
|
||||
}
|
||||
}
|
||||
|
||||
# Create mock search results
|
||||
mock_results = {
|
||||
"google": [
|
||||
{
|
||||
"title": "Quantum Computing Breakthrough: New Qubit Design Achieves 99.9% Fidelity",
|
||||
"url": "https://example.com/quantum-breakthrough",
|
||||
"snippet": "Researchers at MIT have developed a new qubit design that achieves 99.9% fidelity, a major step toward practical quantum computing.",
|
||||
"position": 1
|
||||
},
|
||||
{
|
||||
"title": "IBM Unveils 1000-Qubit Quantum Computer",
|
||||
"url": "https://example.com/ibm-quantum",
|
||||
"snippet": "IBM has announced its latest quantum computer featuring 1000 qubits, significantly expanding computational capabilities.",
|
||||
"position": 2
|
||||
}
|
||||
],
|
||||
"arxiv": [
|
||||
{
|
||||
"title": "Quantum Error Correction Using Surface Codes",
|
||||
"url": "https://arxiv.org/abs/2301.12345",
|
||||
"snippet": "This paper presents a new approach to quantum error correction using surface codes that improves error tolerance by an order of magnitude.",
|
||||
"authors": ["Smith, J.", "Johnson, A."],
|
||||
"published_date": "2025-01-15",
|
||||
"position": 1
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
# Process the results
|
||||
collector = ResultCollector()
|
||||
processed_results = collector.process_results(mock_results, dedup=True)
|
||||
|
||||
# Print summary
|
||||
total_results = len(processed_results)
|
||||
print(f"Found {total_results} mock results after deduplication")
|
||||
|
||||
# Print results by source
|
||||
results_by_source = {}
|
||||
for result in processed_results:
|
||||
source = result.get("source", "unknown")
|
||||
if source not in results_by_source:
|
||||
results_by_source[source] = 0
|
||||
results_by_source[source] += 1
|
||||
|
||||
print("Results by source:")
|
||||
for source, count in results_by_source.items():
|
||||
print(f" {source}: {count}")
|
||||
|
||||
# Print top 3 results
|
||||
if processed_results:
|
||||
print("\nTop 3 results:")
|
||||
for i, result in enumerate(processed_results[:3]):
|
||||
print(f" {i+1}. {result['title']}")
|
||||
print(f" URL: {result['url']}")
|
||||
print(f" Snippet: {result['snippet'][:100]}...")
|
||||
print()
|
||||
|
||||
# Return mock test results
|
||||
return {
|
||||
"query": "What are the latest advancements in quantum computing?",
|
||||
"structured_query": structured_query,
|
||||
"search_engines": ["google", "arxiv"],
|
||||
"raw_results": mock_results,
|
||||
"processed_results": processed_results,
|
||||
"timing": {
|
||||
"query_processing": 0.5,
|
||||
"search_execution": 1.2,
|
||||
"total": 1.7
|
||||
},
|
||||
"summary": {
|
||||
"total_results": total_results,
|
||||
"results_by_source": results_by_source
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
def main():
|
||||
"""Main function."""
|
||||
# Test queries
|
||||
test_queries = [
|
||||
"What are the latest advancements in quantum computing?",
|
||||
"Compare blockchain and traditional databases for financial applications",
|
||||
"Explain the implications of blockchain technology in finance"
|
||||
]
|
||||
|
||||
# Run tests
|
||||
all_results = {}
|
||||
for query in test_queries:
|
||||
print("\n" + "="*80)
|
||||
print(f"Testing query: {query}")
|
||||
print("="*80)
|
||||
|
||||
# Test with all available search engines
|
||||
results = test_search_execution(query)
|
||||
|
||||
# Save results for this query
|
||||
all_results[query] = results
|
||||
|
||||
print("\n")
|
||||
|
||||
# Save all test results
|
||||
save_test_results(all_results, "search_execution_test_results.json")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
|
@ -1,4 +0,0 @@
|
|||
"""
|
||||
Integration tests for the intelligent research system.
|
||||
These tests verify the end-to-end functionality of the system.
|
||||
"""
|
|
@ -1,101 +0,0 @@
|
|||
#!/usr/bin/env python
|
||||
"""
|
||||
Test Query to Report Script with Electric Vehicles Query
|
||||
|
||||
This script tests the query_to_report.py script with a query about the impact of electric vehicles.
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import asyncio
|
||||
import argparse
|
||||
from datetime import datetime
|
||||
|
||||
# Add parent directory to path to import modules
|
||||
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
|
||||
from scripts.query_to_report import query_to_report
|
||||
from report.report_detail_levels import get_report_detail_level_manager
|
||||
|
||||
|
||||
async def run_ev_test(detail_level: str = "standard", use_mock: bool = False):
|
||||
"""
|
||||
Run a test of the query to report workflow with an electric vehicles query.
|
||||
|
||||
Args:
|
||||
detail_level: Level of detail for the report (brief, standard, detailed, comprehensive)
|
||||
use_mock: If True, use mock data instead of making actual API calls
|
||||
"""
|
||||
# Query about electric vehicles
|
||||
query = "What is the environmental and economic impact of electric vehicles compared to traditional vehicles?"
|
||||
|
||||
# Generate timestamp for unique output file
|
||||
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
||||
output_file = f"ev_report_{timestamp}_{detail_level}.md"
|
||||
|
||||
print(f"Processing query: {query}")
|
||||
print(f"Detail level: {detail_level}")
|
||||
print(f"This may take a few minutes depending on the number of search results and API response times...")
|
||||
|
||||
# Get detail level configuration
|
||||
detail_level_manager = get_report_detail_level_manager()
|
||||
config = detail_level_manager.get_detail_level_config(detail_level)
|
||||
|
||||
# Print detail level configuration
|
||||
print(f"\nDetail level configuration:")
|
||||
print(f" Number of results per search engine: {config.get('num_results')}")
|
||||
print(f" Token budget: {config.get('token_budget')}")
|
||||
print(f" Chunk size: {config.get('chunk_size')}")
|
||||
print(f" Overlap size: {config.get('overlap_size')}")
|
||||
print(f" Model: {config.get('model')}")
|
||||
|
||||
# Run the workflow
|
||||
await query_to_report(
|
||||
query=query,
|
||||
output_file=output_file,
|
||||
detail_level=detail_level,
|
||||
use_mock=use_mock
|
||||
)
|
||||
|
||||
print(f"\nTest completed successfully!")
|
||||
print(f"Report saved to: {output_file}")
|
||||
|
||||
# Print the first few lines of the report
|
||||
try:
|
||||
with open(output_file, 'r', encoding='utf-8') as f:
|
||||
preview = f.read(1000) # Show a larger preview
|
||||
print("\nReport Preview:")
|
||||
print("-" * 80)
|
||||
print(preview + "...")
|
||||
print("-" * 80)
|
||||
except Exception as e:
|
||||
print(f"Error reading report: {e}")
|
||||
|
||||
|
||||
def main():
|
||||
"""Main function to parse arguments and run the test."""
|
||||
parser = argparse.ArgumentParser(description='Test the query to report workflow with EV query')
|
||||
parser.add_argument('--detail-level', '-d', type=str, default='standard',
|
||||
choices=['brief', 'standard', 'detailed', 'comprehensive'],
|
||||
help='Level of detail for the report')
|
||||
parser.add_argument('--use-mock', '-m', action='store_true', help='Use mock data instead of API calls')
|
||||
parser.add_argument('--list-detail-levels', action='store_true',
|
||||
help='List available detail levels with descriptions and exit')
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
# List detail levels if requested
|
||||
if args.list_detail_levels:
|
||||
detail_level_manager = get_report_detail_level_manager()
|
||||
detail_levels = detail_level_manager.get_available_detail_levels()
|
||||
print("Available detail levels:")
|
||||
for level, description in detail_levels:
|
||||
print(f" {level}: {description}")
|
||||
return
|
||||
|
||||
# Run the test
|
||||
asyncio.run(run_ev_test(detail_level=args.detail_level, use_mock=args.use_mock))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
|
@ -1,69 +0,0 @@
|
|||
#!/usr/bin/env python
|
||||
"""
|
||||
Test Query to Report Script
|
||||
|
||||
This script tests the query_to_report.py script with a sample query.
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import asyncio
|
||||
import argparse
|
||||
from datetime import datetime
|
||||
|
||||
# Add parent directory to path to import modules
|
||||
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
|
||||
from scripts.query_to_report import query_to_report
|
||||
|
||||
|
||||
async def run_test(use_mock: bool = False):
|
||||
"""
|
||||
Run a test of the query to report workflow.
|
||||
|
||||
Args:
|
||||
use_mock: If True, use mock data instead of making actual API calls
|
||||
"""
|
||||
# Sample query
|
||||
query = "What are the latest advancements in quantum computing?"
|
||||
|
||||
# Generate timestamp for unique output file
|
||||
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
||||
output_file = f"report_{timestamp}.md"
|
||||
|
||||
# Run the workflow
|
||||
await query_to_report(
|
||||
query=query,
|
||||
output_file=output_file,
|
||||
num_results=5, # Limit to 5 results per engine for faster testing
|
||||
use_mock=use_mock
|
||||
)
|
||||
|
||||
print(f"\nTest completed successfully!")
|
||||
print(f"Report saved to: {output_file}")
|
||||
|
||||
# Print the first few lines of the report
|
||||
try:
|
||||
with open(output_file, 'r', encoding='utf-8') as f:
|
||||
preview = f.read(500)
|
||||
print("\nReport Preview:")
|
||||
print("-" * 80)
|
||||
print(preview + "...")
|
||||
print("-" * 80)
|
||||
except Exception as e:
|
||||
print(f"Error reading report: {e}")
|
||||
|
||||
|
||||
def main():
|
||||
"""Main function to parse arguments and run the test."""
|
||||
parser = argparse.ArgumentParser(description='Test the query to report workflow')
|
||||
parser.add_argument('--use-mock', '-m', action='store_true', help='Use mock data instead of API calls')
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
# Run the test
|
||||
asyncio.run(run_test(use_mock=args.use_mock))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
|
@ -1,3 +0,0 @@
|
|||
"""
|
||||
Tests for the query processing module.
|
||||
"""
|
|
@ -1,47 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Test script for the LLM interface with Groq.
|
||||
|
||||
This script tests the LLM interface with Groq models.
|
||||
"""
|
||||
|
||||
import os
|
||||
import time
|
||||
from query.llm_interface import LLMInterface
|
||||
|
||||
def test_groq_model():
|
||||
"""Test the Groq model."""
|
||||
# Ask for the API key
|
||||
api_key = input("Enter your Groq API key: ")
|
||||
os.environ["GROQ_API_KEY"] = api_key
|
||||
|
||||
# Initialize the LLM interface with the Groq model
|
||||
llm = LLMInterface(model_name="llama-3.1-8b-instant")
|
||||
|
||||
# Test queries
|
||||
test_queries = [
|
||||
"What are the latest advancements in quantum computing?",
|
||||
"Compare renewable energy sources and their efficiency",
|
||||
"Explain the impact of artificial intelligence on healthcare"
|
||||
]
|
||||
|
||||
# Process each query
|
||||
for query in test_queries:
|
||||
print(f"\nProcessing query: '{query}'")
|
||||
print("-" * 50)
|
||||
|
||||
start_time = time.time()
|
||||
response = llm._enhance_query_impl(query)
|
||||
end_time = time.time()
|
||||
|
||||
print(f"Processing time: {end_time - start_time:.2f} seconds")
|
||||
print("\nEnhanced Query:")
|
||||
print("-" * 50)
|
||||
print(response)
|
||||
print("-" * 50)
|
||||
|
||||
# Wait a bit between queries
|
||||
time.sleep(1)
|
||||
|
||||
if __name__ == "__main__":
|
||||
test_groq_model()
|
|
@ -1,148 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Test script for the query processor module.
|
||||
|
||||
This script tests the query processor with the Groq models.
|
||||
"""
|
||||
|
||||
import os
|
||||
import json
|
||||
from datetime import datetime
|
||||
from typing import Dict, Any
|
||||
|
||||
from query.query_processor import QueryProcessor, get_query_processor
|
||||
from query.llm_interface import LLMInterface, get_llm_interface
|
||||
from config.config import get_config
|
||||
|
||||
# Create a config.yaml file if it doesn't exist
|
||||
config_dir = os.path.join(os.path.dirname(__file__), "config")
|
||||
config_file = os.path.join(config_dir, "config.yaml")
|
||||
if not os.path.exists(config_file):
|
||||
example_file = os.path.join(config_dir, "config.yaml.example")
|
||||
if os.path.exists(example_file):
|
||||
with open(example_file, "r") as f:
|
||||
example_content = f.read()
|
||||
|
||||
with open(config_file, "w") as f:
|
||||
f.write(example_content)
|
||||
|
||||
print(f"Created config.yaml from example file")
|
||||
|
||||
# Force the use of Groq model for testing
|
||||
# First, create a global LLM interface with the Groq model
|
||||
groq_interface = get_llm_interface("llama-3.1-8b-instant")
|
||||
print(f"Using model: {groq_interface.model_name}")
|
||||
|
||||
# Monkey patch the get_llm_interface function to always return our Groq interface
|
||||
import query.llm_interface
|
||||
original_get_llm_interface = query.llm_interface.get_llm_interface
|
||||
|
||||
def patched_get_llm_interface(*args, **kwargs):
|
||||
return groq_interface
|
||||
|
||||
query.llm_interface.get_llm_interface = patched_get_llm_interface
|
||||
|
||||
def test_process_query(query: str) -> Dict[str, Any]:
|
||||
"""
|
||||
Test the query processing functionality.
|
||||
|
||||
Args:
|
||||
query: The query to process
|
||||
|
||||
Returns:
|
||||
The processed query result
|
||||
"""
|
||||
# Get the query processor (which will use our patched LLM interface)
|
||||
processor = get_query_processor()
|
||||
|
||||
# Process the query
|
||||
print(f"\nProcessing query: '{query}'")
|
||||
print("-" * 50)
|
||||
|
||||
start_time = datetime.now()
|
||||
result = processor.process_query(query)
|
||||
end_time = datetime.now()
|
||||
|
||||
# Add timestamp
|
||||
result['timestamp'] = datetime.now().isoformat()
|
||||
|
||||
# Calculate processing time
|
||||
processing_time = (end_time - start_time).total_seconds()
|
||||
print(f"Processing time: {processing_time:.2f} seconds")
|
||||
|
||||
# Print the result in a formatted way
|
||||
print("\nProcessed Query Result:")
|
||||
print("-" * 50)
|
||||
print(f"Original Query: {result['original_query']}")
|
||||
print(f"Enhanced Query: {result['enhanced_query']}")
|
||||
print(f"Query Type: {result['type']}")
|
||||
print(f"Query Intent: {result['intent']}")
|
||||
print(f"Entities: {', '.join(result['entities'])}")
|
||||
print("-" * 50)
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def test_generate_search_queries(structured_query: Dict[str, Any],
|
||||
search_engines: list = None) -> Dict[str, Any]:
|
||||
"""
|
||||
Test the search query generation functionality.
|
||||
|
||||
Args:
|
||||
structured_query: The structured query to generate search queries for
|
||||
search_engines: List of search engines to generate queries for
|
||||
|
||||
Returns:
|
||||
The updated structured query with search queries
|
||||
"""
|
||||
if search_engines is None:
|
||||
search_engines = ["google", "bing", "scholar"]
|
||||
|
||||
# Get the query processor (which will use our patched LLM interface)
|
||||
processor = get_query_processor()
|
||||
|
||||
# Generate search queries
|
||||
print(f"\nGenerating search queries for engines: {', '.join(search_engines)}")
|
||||
print("-" * 50)
|
||||
|
||||
start_time = datetime.now()
|
||||
result = processor.generate_search_queries(structured_query, search_engines)
|
||||
end_time = datetime.now()
|
||||
|
||||
# Calculate processing time
|
||||
processing_time = (end_time - start_time).total_seconds()
|
||||
print(f"Processing time: {processing_time:.2f} seconds")
|
||||
|
||||
# Print the generated search queries
|
||||
print("\nGenerated Search Queries:")
|
||||
print("-" * 50)
|
||||
for engine, queries in result['search_queries'].items():
|
||||
print(f"\n{engine.upper()} Queries:")
|
||||
for i, query in enumerate(queries, 1):
|
||||
print(f" {i}. {query}")
|
||||
print("-" * 50)
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def main():
|
||||
"""Run the query processor tests."""
|
||||
# Test queries
|
||||
test_queries = [
|
||||
"What are the latest advancements in quantum computing?",
|
||||
"Compare renewable energy sources and their efficiency",
|
||||
"Explain the impact of artificial intelligence on healthcare"
|
||||
]
|
||||
|
||||
# Process each query
|
||||
for query in test_queries:
|
||||
structured_query = test_process_query(query)
|
||||
|
||||
# Generate search queries for the processed query
|
||||
test_generate_search_queries(structured_query)
|
||||
|
||||
print("\n" + "=" * 80 + "\n")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
|
@ -1,236 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Comprehensive test script for the query processor module.
|
||||
|
||||
This script tests all the key functionality of the query processor with the Groq models.
|
||||
"""
|
||||
|
||||
import os
|
||||
import json
|
||||
import time
|
||||
from datetime import datetime
|
||||
from typing import Dict, Any, List
|
||||
|
||||
from query.query_processor import QueryProcessor, get_query_processor
|
||||
from query.llm_interface import LLMInterface, get_llm_interface
|
||||
from config.config import get_config
|
||||
|
||||
# Create a config.yaml file if it doesn't exist
|
||||
config_dir = os.path.join(os.path.dirname(__file__), "config")
|
||||
config_file = os.path.join(config_dir, "config.yaml")
|
||||
if not os.path.exists(config_file):
|
||||
example_file = os.path.join(config_dir, "config.yaml.example")
|
||||
if os.path.exists(example_file):
|
||||
with open(example_file, "r") as f:
|
||||
example_content = f.read()
|
||||
|
||||
with open(config_file, "w") as f:
|
||||
f.write(example_content)
|
||||
|
||||
print(f"Created config.yaml from example file")
|
||||
|
||||
# Create a global LLM interface with the Groq model
|
||||
groq_interface = get_llm_interface("llama-3.1-8b-instant")
|
||||
print(f"Using model: {groq_interface.model_name}")
|
||||
|
||||
# Monkey patch the get_llm_interface function to always return our Groq interface
|
||||
import query.llm_interface
|
||||
original_get_llm_interface = query.llm_interface.get_llm_interface
|
||||
|
||||
def patched_get_llm_interface(*args, **kwargs):
|
||||
return groq_interface
|
||||
|
||||
query.llm_interface.get_llm_interface = patched_get_llm_interface
|
||||
|
||||
# Test data
|
||||
TEST_QUERIES = [
|
||||
# Simple factual queries
|
||||
"What is quantum computing?",
|
||||
"Who invented the internet?",
|
||||
|
||||
# Complex research queries
|
||||
"What are the latest advancements in renewable energy?",
|
||||
"How does artificial intelligence impact healthcare?",
|
||||
|
||||
# Comparative queries
|
||||
"Compare machine learning and deep learning",
|
||||
"What are the differences between solar and wind energy?",
|
||||
|
||||
# Domain-specific queries
|
||||
"Explain the CRISPR-Cas9 gene editing technology",
|
||||
"What are the implications of blockchain for finance?"
|
||||
]
|
||||
|
||||
SEARCH_ENGINES = ["google", "bing", "scholar"]
|
||||
|
||||
def test_enhance_query(query: str) -> str:
|
||||
"""
|
||||
Test the query enhancement functionality.
|
||||
|
||||
Args:
|
||||
query: The query to enhance
|
||||
|
||||
Returns:
|
||||
The enhanced query
|
||||
"""
|
||||
print(f"\nTesting Query Enhancement")
|
||||
print(f"Original Query: '{query}'")
|
||||
print("-" * 50)
|
||||
|
||||
start_time = time.time()
|
||||
enhanced_query = groq_interface.enhance_query(query)
|
||||
end_time = time.time()
|
||||
|
||||
print(f"Processing time: {end_time - start_time:.2f} seconds")
|
||||
print(f"Enhanced Query: '{enhanced_query}'")
|
||||
print("-" * 50)
|
||||
|
||||
return enhanced_query
|
||||
|
||||
|
||||
def test_classify_query(query: str) -> Dict[str, Any]:
|
||||
"""
|
||||
Test the query classification functionality.
|
||||
|
||||
Args:
|
||||
query: The query to classify
|
||||
|
||||
Returns:
|
||||
The classification result
|
||||
"""
|
||||
print(f"\nTesting Query Classification")
|
||||
print(f"Query: '{query}'")
|
||||
print("-" * 50)
|
||||
|
||||
start_time = time.time()
|
||||
classification = groq_interface.classify_query(query)
|
||||
end_time = time.time()
|
||||
|
||||
print(f"Processing time: {end_time - start_time:.2f} seconds")
|
||||
print(f"Classification: {json.dumps(classification, indent=2)}")
|
||||
print("-" * 50)
|
||||
|
||||
return classification
|
||||
|
||||
|
||||
def test_process_query(query: str) -> Dict[str, Any]:
|
||||
"""
|
||||
Test the query processing functionality.
|
||||
|
||||
Args:
|
||||
query: The query to process
|
||||
|
||||
Returns:
|
||||
The processed query result
|
||||
"""
|
||||
# Get the query processor (which will use our patched LLM interface)
|
||||
processor = get_query_processor()
|
||||
|
||||
# Process the query
|
||||
print(f"\nTesting Query Processing")
|
||||
print(f"Query: '{query}'")
|
||||
print("-" * 50)
|
||||
|
||||
start_time = time.time()
|
||||
result = processor.process_query(query)
|
||||
end_time = time.time()
|
||||
|
||||
# Add timestamp
|
||||
result['timestamp'] = datetime.now().isoformat()
|
||||
|
||||
# Calculate processing time
|
||||
print(f"Processing time: {end_time - start_time:.2f} seconds")
|
||||
|
||||
# Print the result in a formatted way
|
||||
print(f"Original Query: {result['original_query']}")
|
||||
print(f"Enhanced Query: {result['enhanced_query']}")
|
||||
print(f"Query Type: {result['type']}")
|
||||
print(f"Query Intent: {result['intent']}")
|
||||
print(f"Entities: {', '.join(result['entities'])}")
|
||||
print("-" * 50)
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def test_generate_search_queries(structured_query: Dict[str, Any],
|
||||
search_engines: List[str]) -> Dict[str, Any]:
|
||||
"""
|
||||
Test the search query generation functionality.
|
||||
|
||||
Args:
|
||||
structured_query: The structured query to generate search queries for
|
||||
search_engines: List of search engines to generate queries for
|
||||
|
||||
Returns:
|
||||
The updated structured query with search queries
|
||||
"""
|
||||
# Get the query processor (which will use our patched LLM interface)
|
||||
processor = get_query_processor()
|
||||
|
||||
# Generate search queries
|
||||
print(f"\nTesting Search Query Generation")
|
||||
print(f"Engines: {', '.join(search_engines)}")
|
||||
print("-" * 50)
|
||||
|
||||
start_time = time.time()
|
||||
result = processor.generate_search_queries(structured_query, search_engines)
|
||||
end_time = time.time()
|
||||
|
||||
# Calculate processing time
|
||||
print(f"Processing time: {end_time - start_time:.2f} seconds")
|
||||
|
||||
# Print the generated search queries
|
||||
for engine, queries in result['search_queries'].items():
|
||||
print(f"\n{engine.upper()} Queries:")
|
||||
for i, query in enumerate(queries, 1):
|
||||
print(f" {i}. {query}")
|
||||
print("-" * 50)
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def run_comprehensive_tests():
|
||||
"""Run comprehensive tests on the query processor."""
|
||||
results = []
|
||||
|
||||
for i, query in enumerate(TEST_QUERIES, 1):
|
||||
print(f"\n\nTEST {i}: {query}")
|
||||
print("=" * 80)
|
||||
|
||||
# Test individual components
|
||||
enhanced_query = test_enhance_query(query)
|
||||
classification = test_classify_query(query)
|
||||
|
||||
# Test the full query processing pipeline
|
||||
structured_query = test_process_query(query)
|
||||
|
||||
# Test search query generation for a subset of queries
|
||||
if i % 2 == 0: # Only test every other query to save time
|
||||
search_result = test_generate_search_queries(structured_query, SEARCH_ENGINES)
|
||||
structured_query = search_result
|
||||
|
||||
# Save results
|
||||
results.append({
|
||||
"query": query,
|
||||
"enhanced_query": enhanced_query,
|
||||
"classification": classification,
|
||||
"structured_query": structured_query
|
||||
})
|
||||
|
||||
print("\n" + "=" * 80 + "\n")
|
||||
|
||||
# Add a delay between tests to avoid rate limiting
|
||||
if i < len(TEST_QUERIES):
|
||||
print(f"Waiting 2 seconds before next test...")
|
||||
time.sleep(2)
|
||||
|
||||
# Save results to a file
|
||||
output_file = "query_processor_test_results.json"
|
||||
with open(output_file, "w") as f:
|
||||
json.dump(results, f, indent=2)
|
||||
|
||||
print(f"\nTest results saved to {output_file}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
run_comprehensive_tests()
|
|
@ -1,3 +0,0 @@
|
|||
"""
|
||||
Tests for the ranking module.
|
||||
"""
|
|
@ -1,63 +0,0 @@
|
|||
"""
|
||||
Test script for the Jina Reranker integration.
|
||||
This script tests the reranker functionality by comparing results with and without reranking.
|
||||
"""
|
||||
|
||||
import json
|
||||
import time
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Any, Optional
|
||||
|
||||
# Import just what we need for the simple test
|
||||
from ranking.jina_reranker import JinaReranker, get_jina_reranker
|
||||
|
||||
def test_simple_reranker():
|
||||
"""Test the Jina Reranker with a simple query and documents"""
|
||||
# Initialize the reranker directly without parameters (it will read from config)
|
||||
try:
|
||||
reranker = get_jina_reranker()
|
||||
print("Successfully initialized Jina Reranker")
|
||||
except Exception as e:
|
||||
print(f"Error initializing Jina Reranker: {str(e)}")
|
||||
return
|
||||
|
||||
# Simple query and documents
|
||||
query = "What is quantum computing?"
|
||||
documents = [
|
||||
"Quantum computing is a type of computation that harnesses quantum mechanics.",
|
||||
"Classical computers use bits, while quantum computers use qubits.",
|
||||
"Machine learning is a subset of artificial intelligence.",
|
||||
"Quantum computers can solve certain problems faster than classical computers."
|
||||
]
|
||||
|
||||
print(f"Testing reranker with query: {query}")
|
||||
print(f"Documents: {documents}")
|
||||
|
||||
# Rerank the documents
|
||||
try:
|
||||
reranked = reranker.rerank(query, documents)
|
||||
print(f"Reranked results: {json.dumps(reranked, indent=2)}")
|
||||
|
||||
# Save the results to a file for analysis
|
||||
results_dir = Path("results")
|
||||
results_dir.mkdir(exist_ok=True)
|
||||
results_file = results_dir / f"reranked_results_{int(time.time())}.json"
|
||||
|
||||
with open(results_file, "w") as f:
|
||||
json.dump(reranked, f, indent=2)
|
||||
|
||||
print(f"Results saved to {results_file}")
|
||||
return True
|
||||
except Exception as e:
|
||||
print(f"Error reranking: {str(e)}")
|
||||
return False
|
||||
|
||||
if __name__ == "__main__":
|
||||
# Just run the simple test
|
||||
success = test_simple_reranker()
|
||||
|
||||
if success:
|
||||
print("Jina Reranker test completed successfully!")
|
||||
else:
|
||||
print("Jina Reranker test failed.")
|
|
@ -1,99 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Test script for the JinaSimilarity module.
|
||||
Computes similarity between text from two input files.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from jina_similarity import JinaSimilarity, TokenLimitError
|
||||
|
||||
def read_file(file_path: str) -> str:
|
||||
"""Read content from a file.
|
||||
|
||||
Args:
|
||||
file_path: Path to the file to read
|
||||
|
||||
Returns:
|
||||
str: Content of the file
|
||||
|
||||
Raises:
|
||||
FileNotFoundError: If the file doesn't exist
|
||||
"""
|
||||
with open(file_path, 'r', encoding='utf-8') as f:
|
||||
return f.read().strip()
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(
|
||||
description='Compute similarity between text from two files using Jina AI.'
|
||||
)
|
||||
parser.add_argument(
|
||||
'chunk_file',
|
||||
type=str,
|
||||
help='Path to the file containing the text chunk'
|
||||
)
|
||||
parser.add_argument(
|
||||
'query_file',
|
||||
type=str,
|
||||
help='Path to the file containing the query'
|
||||
)
|
||||
parser.add_argument(
|
||||
'--verbose',
|
||||
'-v',
|
||||
action='store_true',
|
||||
help='Print token counts and embeddings'
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
# Check if files exist
|
||||
chunk_path = Path(args.chunk_file)
|
||||
query_path = Path(args.query_file)
|
||||
|
||||
if not chunk_path.is_file():
|
||||
print(f"Error: Chunk file not found: {args.chunk_file}", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
if not query_path.is_file():
|
||||
print(f"Error: Query file not found: {args.query_file}", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
try:
|
||||
# Read input files
|
||||
chunk_text = read_file(args.chunk_file)
|
||||
query_text = read_file(args.query_file)
|
||||
|
||||
# Initialize similarity module
|
||||
js = JinaSimilarity()
|
||||
|
||||
# Get token counts if verbose
|
||||
if args.verbose:
|
||||
chunk_tokens = js.count_tokens(chunk_text)
|
||||
query_tokens = js.count_tokens(query_text)
|
||||
print(f"\nToken counts:")
|
||||
print(f"Chunk: {chunk_tokens} tokens")
|
||||
print(f"Query: {query_tokens} tokens\n")
|
||||
|
||||
# Compute similarity
|
||||
similarity, chunk_embedding, query_embedding = js.compute_similarity(
|
||||
chunk_text,
|
||||
query_text
|
||||
)
|
||||
|
||||
# Print results
|
||||
print(f"Similarity score: {similarity:.4f}")
|
||||
|
||||
if args.verbose:
|
||||
print(f"\nEmbeddings:")
|
||||
print(f"Chunk embedding (first 5): {chunk_embedding[:5]}...")
|
||||
print(f"Query embedding (first 5): {query_embedding[:5]}...")
|
||||
|
||||
except TokenLimitError as e:
|
||||
print(f"Error: {e}", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
except Exception as e:
|
||||
print(f"Error: {e}", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
|
@ -1,152 +0,0 @@
|
|||
import json
|
||||
import sys
|
||||
import os
|
||||
import yaml
|
||||
from pathlib import Path
|
||||
|
||||
# Add the project root to the path
|
||||
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
|
||||
|
||||
# Let's create a custom JinaReranker class specifically for testing
|
||||
class TestJinaReranker:
|
||||
"""Custom JinaReranker for testing with explicit initialization parameters"""
|
||||
|
||||
def __init__(self, api_key, model, endpoint):
|
||||
"""Initialize with explicit parameters"""
|
||||
self.api_key = api_key
|
||||
self.model = model
|
||||
self.endpoint = endpoint
|
||||
self.default_top_n = 10
|
||||
|
||||
def rerank(self, query, documents, top_n=None):
|
||||
"""
|
||||
Rerank documents based on their relevance to the query.
|
||||
"""
|
||||
if not documents:
|
||||
return []
|
||||
|
||||
# Use default top_n if not specified
|
||||
if top_n is None:
|
||||
top_n = min(self.default_top_n, len(documents))
|
||||
else:
|
||||
top_n = min(top_n, len(documents))
|
||||
|
||||
headers = {
|
||||
"Content-Type": "application/json",
|
||||
"Authorization": f"Bearer {self.api_key}",
|
||||
"Accept": "application/json"
|
||||
}
|
||||
|
||||
data = {
|
||||
"model": self.model,
|
||||
"query": query,
|
||||
"documents": documents, # Plain array of strings
|
||||
"top_n": top_n
|
||||
}
|
||||
|
||||
print(f"Making reranker API call with query: {query}")
|
||||
print(f"Request payload structure: model, query, documents (array of {len(documents)} strings), top_n={top_n}")
|
||||
|
||||
import requests
|
||||
try:
|
||||
response = requests.post(self.endpoint, headers=headers, json=data)
|
||||
print(f"Reranker API response status: {response.status_code}")
|
||||
|
||||
if response.status_code != 200:
|
||||
print(f"Reranker API error: {response.text}")
|
||||
return []
|
||||
|
||||
response.raise_for_status() # Raise exception for HTTP errors
|
||||
|
||||
result = response.json()
|
||||
print(f"Reranker API response structure: {list(result.keys())}")
|
||||
print(f"Full response: {json.dumps(result, indent=2)}")
|
||||
|
||||
# Process and return the reranked results
|
||||
reranked_results = []
|
||||
|
||||
# Check for the specific response structure from the API
|
||||
if "results" in result and isinstance(result["results"], list):
|
||||
results_list = result["results"]
|
||||
for item in results_list:
|
||||
if isinstance(item, dict) and "index" in item and "relevance_score" in item:
|
||||
reranked_results.append({
|
||||
'index': item.get('index'),
|
||||
'score': item.get('relevance_score'),
|
||||
'document': documents[item.get('index')] if item.get('index') < len(documents) else None
|
||||
})
|
||||
# Handle newer Jina API format with document.text
|
||||
elif isinstance(item, dict) and "index" in item and "document" in item and "relevance_score" in item:
|
||||
reranked_results.append({
|
||||
'index': item.get('index'),
|
||||
'score': item.get('relevance_score'),
|
||||
'document': documents[item.get('index')] if item.get('index') < len(documents) else None
|
||||
})
|
||||
# Fallback for older response structures
|
||||
elif "data" in result and isinstance(result["data"], list):
|
||||
data_list = result["data"]
|
||||
for item in data_list:
|
||||
if isinstance(item, dict) and "index" in item and "relevance_score" in item:
|
||||
reranked_results.append({
|
||||
'index': item.get('index'),
|
||||
'score': item.get('relevance_score'),
|
||||
'document': documents[item.get('index')] if item.get('index') < len(documents) else None
|
||||
})
|
||||
|
||||
print(f"Processed reranker results: {len(reranked_results)} items")
|
||||
return reranked_results
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error calling reranker API: {str(e)}")
|
||||
return []
|
||||
|
||||
def load_config():
|
||||
"""Load configuration from YAML file"""
|
||||
config_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "config", "config.yaml")
|
||||
print(f"Loading config from {config_path}")
|
||||
|
||||
if os.path.exists(config_path):
|
||||
with open(config_path, "r") as f:
|
||||
config = yaml.safe_load(f)
|
||||
print("Configuration loaded successfully")
|
||||
return config
|
||||
else:
|
||||
print(f"Config file not found at {config_path}")
|
||||
return {}
|
||||
|
||||
def test_simple_reranker():
|
||||
"""Test the Jina Reranker with a simple query and documents"""
|
||||
# Get Jina API key from environment
|
||||
jina_api_key = os.environ.get("JINA_API_KEY", "")
|
||||
if not jina_api_key:
|
||||
print("JINA_API_KEY not found in environment variables")
|
||||
return
|
||||
|
||||
print(f"Found JINA_API_KEY in environment variables")
|
||||
|
||||
# Initialize the reranker
|
||||
reranker = TestJinaReranker(
|
||||
api_key=jina_api_key,
|
||||
model="jina-reranker-v2-base-multilingual",
|
||||
endpoint="https://api.jina.ai/v1/rerank"
|
||||
)
|
||||
|
||||
# Simple query and documents
|
||||
query = "What is quantum computing?"
|
||||
documents = [
|
||||
"Quantum computing is a type of computation that harnesses quantum mechanics.",
|
||||
"Classical computers use bits, while quantum computers use qubits.",
|
||||
"Machine learning is a subset of artificial intelligence.",
|
||||
"Quantum computers can solve certain problems faster than classical computers."
|
||||
]
|
||||
|
||||
print(f"Testing simple reranker with query: {query}")
|
||||
print(f"Documents: {documents}")
|
||||
|
||||
# Rerank the documents
|
||||
reranked = reranker.rerank(query, documents)
|
||||
print(f"Reranked results: {json.dumps(reranked, indent=2)}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
# Just run the simple test
|
||||
test_simple_reranker()
|
|
@ -1,3 +0,0 @@
|
|||
"""
|
||||
Tests for the report generation module.
|
||||
"""
|
|
@ -1,101 +0,0 @@
|
|||
import sys
|
||||
import os
|
||||
import asyncio
|
||||
import argparse
|
||||
|
||||
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
|
||||
|
||||
from report.report_synthesis import ReportSynthesizer
|
||||
from report.report_templates import QueryType, DetailLevel
|
||||
|
||||
async def generate_report(query_type, detail_level, query, chunks):
|
||||
"""Generate a report with the specified parameters."""
|
||||
synthesizer = ReportSynthesizer()
|
||||
|
||||
print(f"\n{'='*80}")
|
||||
print(f"Generating {detail_level} report with {query_type} query type")
|
||||
print(f"{'='*80}")
|
||||
|
||||
# Convert string values to enum objects
|
||||
query_type_enum = QueryType(query_type)
|
||||
detail_level_enum = DetailLevel(detail_level)
|
||||
|
||||
report = await synthesizer.synthesize_report(
|
||||
query_type=query_type_enum.value,
|
||||
detail_level=detail_level_enum.value,
|
||||
query=query,
|
||||
chunks=chunks
|
||||
)
|
||||
|
||||
print(f"\nGenerated Report:\n")
|
||||
print(report)
|
||||
|
||||
return report
|
||||
|
||||
async def main():
|
||||
parser = argparse.ArgumentParser(description='Test report generation with different detail levels')
|
||||
parser.add_argument('--query-type', choices=['factual', 'exploratory', 'comparative'], default='factual',
|
||||
help='Query type to test (default: factual)')
|
||||
parser.add_argument('--detail-level', choices=['brief', 'standard', 'detailed', 'comprehensive'], default=None,
|
||||
help='Detail level to test (default: test all)')
|
||||
args = parser.parse_args()
|
||||
|
||||
# Test data
|
||||
queries = {
|
||||
'factual': "What is the capital of France?",
|
||||
'exploratory': "How do electric vehicles impact the environment?",
|
||||
'comparative': "Compare solar and wind energy technologies."
|
||||
}
|
||||
|
||||
chunks = {
|
||||
'factual': [
|
||||
{
|
||||
'content': 'Paris is the capital of France. It is located in the north-central part of the country.',
|
||||
'source': 'Wikipedia',
|
||||
'url': 'https://en.wikipedia.org/wiki/Paris'
|
||||
}
|
||||
],
|
||||
'exploratory': [
|
||||
{
|
||||
'content': 'Electric vehicles produce zero direct emissions, which improves air quality in urban areas.',
|
||||
'source': 'EPA',
|
||||
'url': 'https://www.epa.gov/greenvehicles/electric-vehicles'
|
||||
},
|
||||
{
|
||||
'content': 'The environmental impact of electric vehicles depends on how the electricity is generated. Renewable sources make EVs more environmentally friendly.',
|
||||
'source': 'Energy.gov',
|
||||
'url': 'https://www.energy.gov/eere/electricvehicles/electric-vehicle-benefits'
|
||||
}
|
||||
],
|
||||
'comparative': [
|
||||
{
|
||||
'content': 'Solar energy is generated by converting sunlight into electricity using photovoltaic cells or concentrated solar power.',
|
||||
'source': 'National Renewable Energy Laboratory',
|
||||
'url': 'https://www.nrel.gov/research/re-solar.html'
|
||||
},
|
||||
{
|
||||
'content': 'Wind energy is generated by using wind turbines to create mechanical power that can be converted into electricity.',
|
||||
'source': 'Department of Energy',
|
||||
'url': 'https://www.energy.gov/eere/wind/how-do-wind-turbines-work'
|
||||
},
|
||||
{
|
||||
'content': 'Solar energy works best in sunny areas, while wind energy is more effective in windy regions. Both have different land use requirements.',
|
||||
'source': 'Renewable Energy World',
|
||||
'url': 'https://www.renewableenergyworld.com/solar/solar-vs-wind/'
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
# Get the query type to test
|
||||
query_type = args.query_type
|
||||
query = queries[query_type]
|
||||
test_chunks = chunks[query_type]
|
||||
|
||||
# Test all detail levels or just the specified one
|
||||
detail_levels = ['brief', 'standard', 'detailed', 'comprehensive'] if args.detail_level is None else [args.detail_level]
|
||||
|
||||
for detail_level in detail_levels:
|
||||
await generate_report(query_type, detail_level, query, test_chunks)
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
|
@ -1,36 +0,0 @@
|
|||
import sys
|
||||
import os
|
||||
import asyncio
|
||||
|
||||
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
|
||||
|
||||
from report.report_synthesis import ReportSynthesizer
|
||||
from report.report_templates import QueryType, DetailLevel
|
||||
|
||||
async def main():
|
||||
# Initialize synthesizer
|
||||
synthesizer = ReportSynthesizer()
|
||||
|
||||
# Test data
|
||||
query = "What is the capital of France?"
|
||||
chunks = [
|
||||
{
|
||||
'content': 'Paris is the capital of France.',
|
||||
'source': 'Wikipedia',
|
||||
'url': 'https://en.wikipedia.org/wiki/Paris'
|
||||
}
|
||||
]
|
||||
|
||||
# Generate brief report
|
||||
report = await synthesizer.synthesize_report(
|
||||
query_type=QueryType.FACTUAL.value,
|
||||
detail_level=DetailLevel.BRIEF.value,
|
||||
query=query,
|
||||
chunks=chunks
|
||||
)
|
||||
|
||||
print('Generated Report:\n')
|
||||
print(report)
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
|
@ -1,132 +0,0 @@
|
|||
#!/usr/bin/env python
|
||||
"""
|
||||
Test Query to Report Script with Custom Model
|
||||
|
||||
This script tests the query_to_report.py script with a custom model and query.
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import asyncio
|
||||
import argparse
|
||||
from datetime import datetime
|
||||
|
||||
# Add parent directory to path to import modules
|
||||
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
|
||||
from scripts.query_to_report import query_to_report
|
||||
from report.report_detail_levels import get_report_detail_level_manager
|
||||
from report.report_synthesis import ReportSynthesizer, get_report_synthesizer
|
||||
from config.config import get_config
|
||||
|
||||
|
||||
async def run_custom_model_test(
|
||||
query: str,
|
||||
model_name: str,
|
||||
detail_level: str = "standard",
|
||||
use_mock: bool = False,
|
||||
process_thinking_tags: bool = False
|
||||
):
|
||||
"""
|
||||
Run a test of the query to report workflow with a custom model.
|
||||
|
||||
Args:
|
||||
query: The query to process
|
||||
model_name: The name of the model to use
|
||||
detail_level: Level of detail for the report (brief, standard, detailed, comprehensive)
|
||||
use_mock: If True, use mock data instead of making actual API calls
|
||||
process_thinking_tags: If True, process and remove <thinking> tags from the model output
|
||||
"""
|
||||
# Generate timestamp for unique output file
|
||||
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
||||
model_short_name = model_name.split('/')[-1] if '/' in model_name else model_name
|
||||
output_file = f"report_{timestamp}_{model_short_name}.md"
|
||||
|
||||
print(f"Processing query: {query}")
|
||||
print(f"Model: {model_name}")
|
||||
print(f"Detail level: {detail_level}")
|
||||
print(f"Process thinking tags: {process_thinking_tags}")
|
||||
print(f"This may take a few minutes depending on the number of search results and API response times...")
|
||||
|
||||
# Get detail level configuration
|
||||
detail_level_manager = get_report_detail_level_manager()
|
||||
config = detail_level_manager.get_detail_level_config(detail_level)
|
||||
|
||||
# Print detail level configuration
|
||||
print(f"\nDetail level configuration:")
|
||||
print(f" Number of results per search engine: {config.get('num_results')}")
|
||||
print(f" Token budget: {config.get('token_budget')}")
|
||||
print(f" Chunk size: {config.get('chunk_size')}")
|
||||
print(f" Overlap size: {config.get('overlap_size')}")
|
||||
print(f" Default model: {config.get('model')}")
|
||||
print(f" Using custom model: {model_name}")
|
||||
|
||||
# Create a custom report synthesizer with the specified model
|
||||
custom_synthesizer = ReportSynthesizer(model_name=model_name)
|
||||
|
||||
# Set the process_thinking_tags flag if needed
|
||||
if process_thinking_tags:
|
||||
custom_synthesizer.process_thinking_tags = True
|
||||
|
||||
# Store the original synthesizer to restore later
|
||||
original_synthesizer = get_report_synthesizer()
|
||||
|
||||
# Replace the global synthesizer with our custom one
|
||||
from report.report_synthesis import report_synthesizer
|
||||
report_synthesis_module = sys.modules['report.report_synthesis']
|
||||
report_synthesis_module.report_synthesizer = custom_synthesizer
|
||||
|
||||
try:
|
||||
# Run the workflow
|
||||
await query_to_report(
|
||||
query=query,
|
||||
output_file=output_file,
|
||||
detail_level=detail_level,
|
||||
use_mock=use_mock
|
||||
)
|
||||
|
||||
print(f"\nTest completed successfully!")
|
||||
print(f"Report saved to: {output_file}")
|
||||
|
||||
# Print the first few lines of the report
|
||||
try:
|
||||
with open(output_file, 'r', encoding='utf-8') as f:
|
||||
preview = f.read(1000) # Show a larger preview
|
||||
print("\nReport Preview:")
|
||||
print("-" * 80)
|
||||
print(preview + "...")
|
||||
print("-" * 80)
|
||||
except Exception as e:
|
||||
print(f"Error reading report: {e}")
|
||||
|
||||
finally:
|
||||
# Restore the original synthesizer
|
||||
report_synthesis_module.report_synthesizer = original_synthesizer
|
||||
|
||||
|
||||
def main():
|
||||
"""Main function to parse arguments and run the test."""
|
||||
parser = argparse.ArgumentParser(description='Test the query to report workflow with a custom model')
|
||||
parser.add_argument('query', help='The query to process')
|
||||
parser.add_argument('--model', '-m', required=True, help='The model to use (e.g., groq/deepseek-r1-distill-llama-70b-specdec)')
|
||||
parser.add_argument('--detail-level', '-d', type=str, default='standard',
|
||||
choices=['brief', 'standard', 'detailed', 'comprehensive'],
|
||||
help='Level of detail for the report')
|
||||
parser.add_argument('--use-mock', action='store_true', help='Use mock data instead of API calls')
|
||||
parser.add_argument('--process-thinking-tags', '-t', action='store_true',
|
||||
help='Process and remove <thinking> tags from model output')
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
# Run the test
|
||||
asyncio.run(run_custom_model_test(
|
||||
query=args.query,
|
||||
model_name=args.model,
|
||||
detail_level=args.detail_level,
|
||||
use_mock=args.use_mock,
|
||||
process_thinking_tags=args.process_thinking_tags
|
||||
))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
|
@ -1,123 +0,0 @@
|
|||
#!/usr/bin/env python
|
||||
"""
|
||||
Test Detail Levels Script
|
||||
|
||||
This script tests the report generation with different detail levels
|
||||
for the same query to demonstrate the differences.
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import asyncio
|
||||
import argparse
|
||||
from datetime import datetime
|
||||
|
||||
# Add parent directory to path to import modules
|
||||
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
|
||||
from scripts.query_to_report import query_to_report
|
||||
from report.report_detail_levels import get_report_detail_level_manager, DetailLevel
|
||||
|
||||
|
||||
async def run_detail_level_test(query: str, use_mock: bool = False):
|
||||
"""
|
||||
Run a test of the query to report workflow with different detail levels.
|
||||
|
||||
Args:
|
||||
query: The query to process
|
||||
use_mock: If True, use mock data instead of making actual API calls
|
||||
"""
|
||||
# Generate timestamp for unique output files
|
||||
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
||||
|
||||
# Get detail level manager
|
||||
detail_level_manager = get_report_detail_level_manager()
|
||||
|
||||
# Get all detail levels
|
||||
detail_levels = [level.value for level in DetailLevel]
|
||||
|
||||
print(f"Processing query: {query}")
|
||||
print(f"Testing {len(detail_levels)} detail levels: {', '.join(detail_levels)}")
|
||||
print(f"This may take several minutes to complete all detail levels...")
|
||||
|
||||
# Process each detail level
|
||||
for detail_level in detail_levels:
|
||||
print(f"\n{'=' * 80}")
|
||||
print(f"Processing detail level: {detail_level}")
|
||||
|
||||
# Get detail level configuration
|
||||
config = detail_level_manager.get_detail_level_config(detail_level)
|
||||
|
||||
# Print detail level configuration
|
||||
print(f"Detail level configuration:")
|
||||
print(f" Number of results per search engine: {config.get('num_results')}")
|
||||
print(f" Token budget: {config.get('token_budget')}")
|
||||
print(f" Chunk size: {config.get('chunk_size')}")
|
||||
print(f" Overlap size: {config.get('overlap_size')}")
|
||||
print(f" Model: {config.get('model')}")
|
||||
|
||||
# Set output file
|
||||
output_file = f"report_{timestamp}_{detail_level}.md"
|
||||
|
||||
# Run the workflow
|
||||
start_time = datetime.now()
|
||||
print(f"Started at: {start_time.strftime('%H:%M:%S')}")
|
||||
|
||||
await query_to_report(
|
||||
query=query,
|
||||
output_file=output_file,
|
||||
detail_level=detail_level,
|
||||
use_mock=use_mock
|
||||
)
|
||||
|
||||
end_time = datetime.now()
|
||||
duration = end_time - start_time
|
||||
print(f"Completed at: {end_time.strftime('%H:%M:%S')}")
|
||||
print(f"Duration: {duration.total_seconds():.2f} seconds")
|
||||
|
||||
# Get report file size
|
||||
file_size = os.path.getsize(output_file)
|
||||
print(f"Report saved to: {output_file}")
|
||||
print(f"Report size: {file_size} bytes")
|
||||
|
||||
# Count words in report
|
||||
try:
|
||||
with open(output_file, 'r', encoding='utf-8') as f:
|
||||
content = f.read()
|
||||
word_count = len(content.split())
|
||||
print(f"Word count: {word_count}")
|
||||
except Exception as e:
|
||||
print(f"Error reading report: {e}")
|
||||
|
||||
print(f"\n{'=' * 80}")
|
||||
print(f"All detail levels processed successfully!")
|
||||
print(f"Reports saved with prefix: report_{timestamp}_")
|
||||
|
||||
|
||||
def main():
|
||||
"""Main function to parse arguments and run the test."""
|
||||
parser = argparse.ArgumentParser(description='Test report generation with different detail levels')
|
||||
parser.add_argument('--query', '-q', type=str,
|
||||
default="What is the environmental and economic impact of electric vehicles compared to traditional vehicles?",
|
||||
help='The query to process')
|
||||
parser.add_argument('--use-mock', '-m', action='store_true', help='Use mock data instead of API calls')
|
||||
parser.add_argument('--list-detail-levels', action='store_true',
|
||||
help='List available detail levels with descriptions and exit')
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
# List detail levels if requested
|
||||
if args.list_detail_levels:
|
||||
detail_level_manager = get_report_detail_level_manager()
|
||||
detail_levels = detail_level_manager.get_available_detail_levels()
|
||||
print("Available detail levels:")
|
||||
for level, description in detail_levels:
|
||||
print(f" {level}: {description}")
|
||||
return
|
||||
|
||||
# Run the test
|
||||
asyncio.run(run_detail_level_test(query=args.query, use_mock=args.use_mock))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
|
@ -1,293 +0,0 @@
|
|||
"""
|
||||
Test script for the progressive report generation functionality.
|
||||
|
||||
This script tests the progressive report generation approach for comprehensive reports.
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import asyncio
|
||||
import logging
|
||||
from typing import Dict, List, Any, Optional
|
||||
|
||||
# Add the project root directory to the Python path
|
||||
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')))
|
||||
|
||||
from report.progressive_report_synthesis import get_progressive_report_synthesizer
|
||||
from report.report_generator import get_report_generator, initialize_report_generator
|
||||
from report.report_detail_levels import get_report_detail_level_manager
|
||||
|
||||
# Configure logging
|
||||
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Sample document chunks for testing
|
||||
SAMPLE_CHUNKS = [
|
||||
{
|
||||
"document_id": "1",
|
||||
"title": "Introduction to Electric Vehicles",
|
||||
"url": "https://example.com/ev-intro",
|
||||
"content": """
|
||||
Electric vehicles (EVs) are automobiles that are propelled by one or more electric motors, using energy stored in rechargeable batteries. Compared to internal combustion engine (ICE) vehicles, EVs are quieter, have no exhaust emissions, and lower emissions overall. In the long run, EVs are often cheaper to maintain due to fewer moving parts and the increasing efficiency of battery technology.
|
||||
|
||||
The first practical production EVs were produced in the 1880s. However, internal combustion engines were preferred for road vehicles for most of the 20th century. EVs saw a resurgence in the 21st century due to technological developments, and an increased focus on renewable energy and potential reduction of transportation's impact on climate change and other environmental issues.
|
||||
""",
|
||||
"priority_score": 0.95
|
||||
},
|
||||
{
|
||||
"document_id": "2",
|
||||
"title": "Environmental Impact of Electric Vehicles",
|
||||
"url": "https://example.com/ev-environment",
|
||||
"content": """
|
||||
The environmental impact of electric vehicles (EVs) is a complex topic that requires consideration of multiple factors. While EVs produce zero direct emissions, their overall environmental impact depends on how the electricity used to charge them is generated.
|
||||
|
||||
In regions where electricity is produced from low-carbon sources like renewables or nuclear, EVs offer significant environmental benefits over conventional vehicles. However, in areas heavily dependent on coal or other fossil fuels for electricity generation, the benefits may be reduced.
|
||||
|
||||
Life cycle assessments show that EVs typically have a higher environmental impact during manufacturing, primarily due to battery production, but this is usually offset by lower emissions during operation. The total lifecycle emissions of an EV are generally lower than those of a comparable conventional vehicle, especially as the vehicle is used over time.
|
||||
""",
|
||||
"priority_score": 0.9
|
||||
},
|
||||
{
|
||||
"document_id": "3",
|
||||
"title": "Economic Considerations of Electric Vehicles",
|
||||
"url": "https://example.com/ev-economics",
|
||||
"content": """
|
||||
The economics of electric vehicles (EVs) involve several factors including purchase price, operating costs, maintenance, and resale value. While EVs typically have higher upfront costs compared to conventional vehicles, they often have lower operating and maintenance costs.
|
||||
|
||||
The total cost of ownership (TCO) analysis shows that EVs can be economically competitive or even advantageous over the vehicle's lifetime, especially in regions with high fuel prices or significant incentives for EV adoption. Factors affecting TCO include:
|
||||
|
||||
1. Purchase price and available incentives
|
||||
2. Electricity costs versus fuel costs
|
||||
3. Maintenance requirements and costs
|
||||
4. Battery longevity and replacement costs
|
||||
5. Resale value
|
||||
|
||||
Government incentives, including tax credits, rebates, and other benefits, can significantly reduce the effective purchase price of EVs, making them more competitive with conventional vehicles.
|
||||
""",
|
||||
"priority_score": 0.85
|
||||
},
|
||||
{
|
||||
"document_id": "4",
|
||||
"title": "Electric Vehicle Battery Technology",
|
||||
"url": "https://example.com/ev-batteries",
|
||||
"content": """
|
||||
Battery technology is a critical component of electric vehicles (EVs). Most modern EVs use lithium-ion batteries, which offer high energy density, low self-discharge, and no memory effect. However, these batteries face challenges including limited range, long charging times, degradation over time, and resource constraints for materials like lithium, cobalt, and nickel.
|
||||
|
||||
Research and development in battery technology focus on several areas:
|
||||
|
||||
1. Increasing energy density to improve range
|
||||
2. Reducing charging time through fast-charging technologies
|
||||
3. Extending battery lifespan and reducing degradation
|
||||
4. Developing batteries with more abundant and sustainable materials
|
||||
5. Improving safety and thermal management
|
||||
|
||||
Solid-state batteries represent a promising future technology, potentially offering higher energy density, faster charging, longer lifespan, and improved safety compared to current lithium-ion batteries.
|
||||
""",
|
||||
"priority_score": 0.8
|
||||
},
|
||||
{
|
||||
"document_id": "5",
|
||||
"title": "Electric Vehicle Infrastructure",
|
||||
"url": "https://example.com/ev-infrastructure",
|
||||
"content": """
|
||||
Electric vehicle (EV) infrastructure refers to the charging stations, grid capacity, and supporting systems necessary for widespread EV adoption. The availability and accessibility of charging infrastructure is a critical factor in EV adoption rates.
|
||||
|
||||
Charging infrastructure can be categorized into three main types:
|
||||
|
||||
1. Level 1 (120V AC): Standard household outlet, providing about 2-5 miles of range per hour of charging
|
||||
2. Level 2 (240V AC): Dedicated charging station providing about 10-30 miles of range per hour
|
||||
3. DC Fast Charging: High-powered stations providing 60-80% charge in 20-30 minutes
|
||||
|
||||
The development of EV infrastructure faces several challenges, including:
|
||||
|
||||
- High installation costs, particularly for fast-charging stations
|
||||
- Grid capacity constraints in areas with high EV adoption
|
||||
- Standardization of charging connectors and protocols
|
||||
- Equitable distribution of charging infrastructure
|
||||
|
||||
Government initiatives, utility programs, and private investments are all contributing to the expansion of EV charging infrastructure globally.
|
||||
""",
|
||||
"priority_score": 0.75
|
||||
},
|
||||
{
|
||||
"document_id": "6",
|
||||
"title": "Future Trends in Electric Vehicles",
|
||||
"url": "https://example.com/ev-future",
|
||||
"content": """
|
||||
The electric vehicle (EV) market is rapidly evolving, with several key trends shaping its future:
|
||||
|
||||
1. Increasing range: Newer EV models are offering ranges exceeding 300 miles on a single charge, addressing one of the primary concerns of potential adopters.
|
||||
|
||||
2. Decreasing battery costs: Battery costs have declined by approximately 85% since 2010, making EVs increasingly cost-competitive with conventional vehicles.
|
||||
|
||||
3. Autonomous driving features: Many EVs are at the forefront of autonomous driving technology, with features like advanced driver assistance systems (ADAS) becoming more common.
|
||||
|
||||
4. Vehicle-to-grid (V2G) technology: This allows EVs to not only consume electricity but also return it to the grid during peak demand, potentially creating new economic opportunities for EV owners.
|
||||
|
||||
5. Wireless charging: Development of inductive charging technology could eliminate the need for physical connections to charge EVs.
|
||||
|
||||
6. Integration with renewable energy: Synergies between EVs and renewable energy sources like solar and wind power are being explored to create more sustainable transportation systems.
|
||||
|
||||
These trends suggest that EVs will continue to gain market share and could potentially become the dominant form of personal transportation in many markets within the next few decades.
|
||||
""",
|
||||
"priority_score": 0.7
|
||||
}
|
||||
]
|
||||
|
||||
async def test_progressive_report_generation():
|
||||
"""Test the progressive report generation functionality."""
|
||||
# Initialize the report generator
|
||||
await initialize_report_generator()
|
||||
|
||||
# Get the progressive report synthesizer
|
||||
synthesizer = get_progressive_report_synthesizer()
|
||||
|
||||
# Define a progress callback
|
||||
def progress_callback(progress, total, current_report):
|
||||
logger.info(f"Progress: {progress:.2%} ({total} chunks)")
|
||||
|
||||
# Set progress callback
|
||||
synthesizer.set_progress_callback(progress_callback)
|
||||
|
||||
# Test query
|
||||
query = "What are the environmental and economic impacts of electric vehicles?"
|
||||
|
||||
logger.info(f"Starting progressive report generation for query: {query}")
|
||||
|
||||
# Generate report progressively
|
||||
report = await synthesizer.synthesize_report_progressively(
|
||||
SAMPLE_CHUNKS,
|
||||
query,
|
||||
query_type="comparative",
|
||||
detail_level="comprehensive"
|
||||
)
|
||||
|
||||
# Print report state
|
||||
logger.info(f"Report generation completed after {synthesizer.report_state.version} iterations")
|
||||
logger.info(f"Processed {len(synthesizer.report_state.processed_chunks)} chunks")
|
||||
logger.info(f"Improvement scores: {synthesizer.report_state.improvement_scores}")
|
||||
logger.info(f"Termination reason: {synthesizer.report_state.termination_reason}")
|
||||
|
||||
# Save the report to a file
|
||||
with open("progressive_report_test_output.md", "w") as f:
|
||||
f.write(report)
|
||||
|
||||
logger.info(f"Report saved to progressive_report_test_output.md")
|
||||
|
||||
return report
|
||||
|
||||
async def test_report_generator_with_progressive_synthesis():
|
||||
"""Test the report generator with progressive synthesis for comprehensive detail level."""
|
||||
# Initialize the report generator
|
||||
await initialize_report_generator()
|
||||
|
||||
# Get the report generator
|
||||
generator = get_report_generator()
|
||||
|
||||
# Set detail level to comprehensive
|
||||
generator.set_detail_level("comprehensive")
|
||||
|
||||
# Create mock search results
|
||||
search_results = [
|
||||
{
|
||||
'title': chunk['title'],
|
||||
'url': chunk['url'],
|
||||
'snippet': chunk['content'][:100] + '...',
|
||||
'score': chunk['priority_score']
|
||||
}
|
||||
for chunk in SAMPLE_CHUNKS
|
||||
]
|
||||
|
||||
# Test query
|
||||
query = "What are the environmental and economic impacts of electric vehicles?"
|
||||
|
||||
logger.info(f"Starting report generation with progressive synthesis for query: {query}")
|
||||
|
||||
# Generate report
|
||||
report = await generator.generate_report(search_results, query)
|
||||
|
||||
# Save the report to a file
|
||||
with open("report_generator_progressive_test_output.md", "w") as f:
|
||||
f.write(report)
|
||||
|
||||
logger.info(f"Report saved to report_generator_progressive_test_output.md")
|
||||
|
||||
return report
|
||||
|
||||
async def compare_progressive_vs_standard():
|
||||
"""Compare progressive synthesis with standard map-reduce approach."""
|
||||
# Initialize the report generator
|
||||
await initialize_report_generator()
|
||||
|
||||
# Get the synthesizers
|
||||
progressive_synthesizer = get_progressive_report_synthesizer()
|
||||
standard_synthesizer = get_progressive_report_synthesizer() # Using the same class but different method
|
||||
|
||||
# Test query
|
||||
query = "What are the environmental and economic impacts of electric vehicles?"
|
||||
|
||||
logger.info("Starting comparison between progressive and standard synthesis")
|
||||
|
||||
# Generate report using progressive synthesis
|
||||
logger.info("Generating report with progressive synthesis...")
|
||||
progressive_start_time = asyncio.get_event_loop().time()
|
||||
progressive_report = await progressive_synthesizer.synthesize_report_progressively(
|
||||
SAMPLE_CHUNKS,
|
||||
query,
|
||||
query_type="comparative",
|
||||
detail_level="comprehensive"
|
||||
)
|
||||
progressive_end_time = asyncio.get_event_loop().time()
|
||||
progressive_duration = progressive_end_time - progressive_start_time
|
||||
|
||||
# Generate report using standard map-reduce
|
||||
logger.info("Generating report with standard map-reduce...")
|
||||
standard_start_time = asyncio.get_event_loop().time()
|
||||
standard_report = await standard_synthesizer.synthesize_report(
|
||||
SAMPLE_CHUNKS,
|
||||
query,
|
||||
query_type="comparative",
|
||||
detail_level="detailed" # Using detailed instead of comprehensive to use map-reduce
|
||||
)
|
||||
standard_end_time = asyncio.get_event_loop().time()
|
||||
standard_duration = standard_end_time - standard_start_time
|
||||
|
||||
# Save reports to files
|
||||
with open("progressive_synthesis_report.md", "w") as f:
|
||||
f.write(progressive_report)
|
||||
|
||||
with open("standard_synthesis_report.md", "w") as f:
|
||||
f.write(standard_report)
|
||||
|
||||
# Compare results
|
||||
logger.info(f"Progressive synthesis took {progressive_duration:.2f} seconds")
|
||||
logger.info(f"Standard synthesis took {standard_duration:.2f} seconds")
|
||||
logger.info(f"Progressive report length: {len(progressive_report)} characters")
|
||||
logger.info(f"Standard report length: {len(standard_report)} characters")
|
||||
|
||||
return {
|
||||
"progressive": {
|
||||
"duration": progressive_duration,
|
||||
"length": len(progressive_report),
|
||||
"iterations": progressive_synthesizer.report_state.version
|
||||
},
|
||||
"standard": {
|
||||
"duration": standard_duration,
|
||||
"length": len(standard_report)
|
||||
}
|
||||
}
|
||||
|
||||
if __name__ == "__main__":
|
||||
import argparse
|
||||
|
||||
parser = argparse.ArgumentParser(description='Test progressive report generation')
|
||||
parser.add_argument('--test', choices=['progressive', 'generator', 'compare'], default='progressive',
|
||||
help='Test to run (progressive, generator, or compare)')
|
||||
args = parser.parse_args()
|
||||
|
||||
if args.test == 'progressive':
|
||||
asyncio.run(test_progressive_report_generation())
|
||||
elif args.test == 'generator':
|
||||
asyncio.run(test_report_generator_with_progressive_synthesis())
|
||||
elif args.test == 'compare':
|
||||
asyncio.run(compare_progressive_vs_standard())
|
|
@ -1,24 +0,0 @@
|
|||
import unittest
|
||||
from report.report_templates import ReportTemplateManager, QueryType, DetailLevel
|
||||
|
||||
class TestReportTemplates(unittest.TestCase):
|
||||
def setUp(self):
|
||||
self.manager = ReportTemplateManager()
|
||||
self.manager.initialize_default_templates()
|
||||
|
||||
def test_template_retrieval(self):
|
||||
template = self.manager.get_template(QueryType.FACTUAL, DetailLevel.BRIEF)
|
||||
self.assertIsNotNone(template)
|
||||
self.assertEqual(template.detail_level, DetailLevel.BRIEF)
|
||||
self.assertEqual(template.query_type, QueryType.FACTUAL)
|
||||
|
||||
def test_template_validation(self):
|
||||
template = self.manager.get_template(QueryType.FACTUAL, DetailLevel.BRIEF)
|
||||
self.assertTrue(template.validate())
|
||||
|
||||
def test_all_templates_available(self):
|
||||
templates = self.manager.get_available_templates()
|
||||
self.assertEqual(len(templates), 12) # 3 query types * 4 detail levels
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
|
@ -1,114 +0,0 @@
|
|||
"""
|
||||
Test script for the document processor module.
|
||||
|
||||
This script tests the document prioritization and chunking functionality
|
||||
of the document processor module.
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import asyncio
|
||||
import json
|
||||
from datetime import datetime
|
||||
from typing import Dict, List, Any, Optional
|
||||
|
||||
# Add the project root directory to the Python path
|
||||
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
|
||||
from report.document_processor import get_document_processor
|
||||
from report.database.db_manager import get_db_manager, initialize_database
|
||||
from report.document_scraper import get_document_scraper
|
||||
|
||||
async def test_document_processor(use_mock: bool = False):
|
||||
"""Test the document processor with sample documents."""
|
||||
# Initialize database
|
||||
await initialize_database()
|
||||
|
||||
# Create document processor
|
||||
document_processor = get_document_processor()
|
||||
|
||||
# Create document scraper with mock option
|
||||
document_scraper = get_document_scraper(use_mock=use_mock)
|
||||
|
||||
# Sample search results with real, accessible URLs
|
||||
search_results = [
|
||||
{
|
||||
'title': 'Python Documentation',
|
||||
'url': 'https://docs.python.org/3/',
|
||||
'snippet': 'Official Python documentation.',
|
||||
'score': 0.95
|
||||
},
|
||||
{
|
||||
'title': 'Python.org',
|
||||
'url': 'https://www.python.org/',
|
||||
'snippet': 'The official home of the Python Programming Language.',
|
||||
'score': 0.85
|
||||
},
|
||||
{
|
||||
'title': 'Wikipedia - Python',
|
||||
'url': 'https://en.wikipedia.org/wiki/Python_(programming_language)',
|
||||
'snippet': 'Python is a high-level, general-purpose programming language.',
|
||||
'score': 0.75
|
||||
}
|
||||
]
|
||||
|
||||
# Process search results
|
||||
documents = []
|
||||
relevance_scores = {}
|
||||
|
||||
for result in search_results:
|
||||
# Scrape document
|
||||
document = await document_scraper.scrape_url(result['url'])
|
||||
if document:
|
||||
documents.append(document)
|
||||
relevance_scores[document['url']] = result['score']
|
||||
|
||||
print(f"Scraped {len(documents)} documents")
|
||||
|
||||
# Test document prioritization
|
||||
prioritized_docs = document_processor.prioritize_documents(documents, relevance_scores)
|
||||
print("\nPrioritized documents:")
|
||||
for i, doc in enumerate(prioritized_docs):
|
||||
print(f"{i+1}. {doc['title']} (Score: {doc.get('priority_score', 'N/A')})")
|
||||
|
||||
# Test document chunking
|
||||
if documents:
|
||||
print("\nChunking document:", documents[0]['title'])
|
||||
chunks = document_processor.chunk_document_by_sections(documents[0])
|
||||
print(f"Created {len(chunks)} chunks")
|
||||
for i, chunk in enumerate(chunks[:3]): # Show first 3 chunks
|
||||
print(f"Chunk {i+1}: {chunk['title']} ({chunk['token_count']} tokens)")
|
||||
content_preview = chunk['content'][:100] + '...' if len(chunk['content']) > 100 else chunk['content']
|
||||
print(f"Content: {content_preview}")
|
||||
|
||||
# Test token budget management
|
||||
token_budget = 4000
|
||||
print(f"\nSelecting chunks with token budget: {token_budget}")
|
||||
|
||||
# Create chunks for each document
|
||||
all_chunks = []
|
||||
for doc in prioritized_docs:
|
||||
doc_chunks = document_processor.chunk_document_by_sections(doc)
|
||||
all_chunks.extend(doc_chunks)
|
||||
|
||||
# Select chunks based on token budget
|
||||
selected_chunks = document_processor.select_chunks_for_context(all_chunks, token_budget)
|
||||
print(f"Selected {len(selected_chunks)} chunks with total tokens: {sum(c['token_count'] for c in selected_chunks)}")
|
||||
|
||||
# Test end-to-end processing
|
||||
print("\nTesting end-to-end processing")
|
||||
processed_chunks = document_processor.process_documents_for_report(documents, relevance_scores)
|
||||
print(f"Processed {len(processed_chunks)} chunks for report")
|
||||
|
||||
return processed_chunks
|
||||
|
||||
# Run test if this module is executed directly
|
||||
if __name__ == "__main__":
|
||||
import argparse
|
||||
|
||||
parser = argparse.ArgumentParser(description='Test the document processor')
|
||||
parser.add_argument('--mock', action='store_true', help='Use mock data instead of making actual API calls')
|
||||
args = parser.parse_args()
|
||||
|
||||
print(f"Running test with {'mock data' if args.mock else 'real data'}")
|
||||
asyncio.run(test_document_processor(use_mock=args.mock))
|
|
@ -1,142 +0,0 @@
|
|||
"""
|
||||
Test script for the document scraper module.
|
||||
|
||||
This script tests the functionality of the document scraper module
|
||||
by scraping a few sample URLs and storing them in the database.
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import asyncio
|
||||
import logging
|
||||
from typing import List, Dict, Any
|
||||
|
||||
# Add parent directory to path to allow importing modules
|
||||
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
|
||||
from report.database.db_manager import initialize_database, get_db_manager
|
||||
from report.document_scraper import get_document_scraper
|
||||
|
||||
# Configure logging
|
||||
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Sample URLs for testing
|
||||
TEST_URLS = [
|
||||
"https://en.wikipedia.org/wiki/Web_scraping",
|
||||
"https://en.wikipedia.org/wiki/Natural_language_processing",
|
||||
"https://en.wikipedia.org/wiki/SQLite"
|
||||
]
|
||||
|
||||
async def test_document_scraper():
|
||||
"""Test the document scraper with sample URLs."""
|
||||
# Initialize database
|
||||
await initialize_database()
|
||||
logger.info("Database initialized")
|
||||
|
||||
# Get document scraper
|
||||
scraper = get_document_scraper()
|
||||
|
||||
# Scrape URLs
|
||||
logger.info(f"Scraping {len(TEST_URLS)} URLs...")
|
||||
documents = await scraper.scrape_urls(TEST_URLS)
|
||||
|
||||
# Print results
|
||||
logger.info(f"Successfully scraped {len(documents)} documents")
|
||||
for doc in documents:
|
||||
logger.info(f"Title: {doc['title']}")
|
||||
logger.info(f"URL: {doc['url']}")
|
||||
logger.info(f"Token count: {doc['token_count']}")
|
||||
logger.info(f"Content preview: {doc['content'][:200]}...")
|
||||
logger.info("-" * 80)
|
||||
|
||||
# Test database search
|
||||
db_manager = get_db_manager()
|
||||
search_results = await db_manager.search_documents("scraping")
|
||||
logger.info(f"Found {len(search_results)} documents matching 'scraping'")
|
||||
|
||||
# Test document retrieval by URL
|
||||
doc = await db_manager.get_document_by_url(TEST_URLS[0])
|
||||
if doc:
|
||||
logger.info(f"Retrieved document by URL: {doc['title']}")
|
||||
else:
|
||||
logger.error(f"Failed to retrieve document by URL: {TEST_URLS[0]}")
|
||||
|
||||
# Count documents in database
|
||||
count = await db_manager.count_documents()
|
||||
logger.info(f"Total documents in database: {count}")
|
||||
|
||||
return True
|
||||
|
||||
async def test_document_scraper_single_url(url, use_mock=False):
|
||||
"""
|
||||
Test the document scraper with a single URL.
|
||||
|
||||
Args:
|
||||
url: The URL to scrape
|
||||
use_mock: If True, use mock data instead of making actual API calls
|
||||
"""
|
||||
# Get document scraper
|
||||
document_scraper = get_document_scraper(use_mock=use_mock)
|
||||
|
||||
logger.info(f"Testing document scraper with URL: {url}")
|
||||
logger.info(f"Using mock data: {use_mock}")
|
||||
|
||||
# Scrape the URL
|
||||
document = await document_scraper.scrape_url(url)
|
||||
|
||||
if document:
|
||||
logger.info(f"Successfully scraped document: {document.get('title')}")
|
||||
logger.info(f"URL: {document.get('url')}")
|
||||
logger.info(f"Token count: {document.get('token_count')}")
|
||||
content_preview = document.get('content', '')[:200] + '...' if document.get('content') else 'No content'
|
||||
logger.info(f"Content snippet: {content_preview}")
|
||||
|
||||
# Print metadata
|
||||
logger.info("\nMetadata:")
|
||||
for key, value in document.get('metadata', {}).items():
|
||||
logger.info(f" {key}: {value}")
|
||||
else:
|
||||
logger.info(f"Failed to scrape document: {url}")
|
||||
|
||||
async def clear_database():
|
||||
"""Clear the document database."""
|
||||
from report.database.db_manager import get_db_manager
|
||||
|
||||
# Get database manager
|
||||
db_manager = get_db_manager()
|
||||
|
||||
# Clear the database
|
||||
await db_manager.clear_database()
|
||||
logger.info("Database cleared")
|
||||
|
||||
if __name__ == "__main__":
|
||||
import argparse
|
||||
parser = argparse.ArgumentParser(description='Test the document scraper')
|
||||
parser.add_argument('--url', type=str, default='https://fastapi.tiangolo.com/', help='URL to scrape')
|
||||
parser.add_argument('--mock', action='store_true', help='Use mock data instead of making actual API calls')
|
||||
parser.add_argument('--run-all', action='store_true', help='Run all tests')
|
||||
parser.add_argument('--clear-db', action='store_true', help='Clear the database')
|
||||
args = parser.parse_args()
|
||||
|
||||
if args.run_all:
|
||||
try:
|
||||
success = asyncio.run(test_document_scraper())
|
||||
if success:
|
||||
logger.info("All tests passed!")
|
||||
sys.exit(0)
|
||||
else:
|
||||
logger.error("Tests failed!")
|
||||
sys.exit(1)
|
||||
except Exception as e:
|
||||
logger.exception(f"Error running tests: {str(e)}")
|
||||
sys.exit(1)
|
||||
elif args.clear_db:
|
||||
try:
|
||||
asyncio.run(clear_database())
|
||||
sys.exit(0)
|
||||
except Exception as e:
|
||||
logger.exception(f"Error clearing database: {str(e)}")
|
||||
sys.exit(1)
|
||||
else:
|
||||
asyncio.run(test_document_scraper_single_url(args.url, use_mock=args.mock))
|
|
@ -1,153 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Test script for the report synthesis functionality.
|
||||
|
||||
This script tests the report synthesis functionality by generating a report
|
||||
from sample document chunks.
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import asyncio
|
||||
import json
|
||||
import argparse
|
||||
from typing import List, Dict, Any, Optional
|
||||
|
||||
# Add the parent directory to the path so we can import the modules
|
||||
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
|
||||
from report.report_synthesis import get_report_synthesizer
|
||||
from report.document_processor import get_document_processor
|
||||
from report.document_scraper import get_document_scraper
|
||||
from report.database.db_manager import get_db_manager, initialize_database
|
||||
|
||||
async def test_with_sample_chunks():
|
||||
"""Test report synthesis with sample document chunks."""
|
||||
# Sample document chunks
|
||||
chunks = [
|
||||
{
|
||||
"title": "Introduction to Python",
|
||||
"url": "https://docs.python.org/3/tutorial/index.html",
|
||||
"content": "Python is an easy to learn, powerful programming language. It has efficient high-level data structures and a simple but effective approach to object-oriented programming. Python's elegant syntax and dynamic typing, together with its interpreted nature, make it an ideal language for scripting and rapid application development in many areas on most platforms.",
|
||||
"chunk_type": "introduction",
|
||||
"priority_score": 0.95
|
||||
},
|
||||
{
|
||||
"title": "Python Features",
|
||||
"url": "https://www.python.org/about/",
|
||||
"content": "Python is a programming language that lets you work quickly and integrate systems more effectively. Python is an interpreted, object-oriented, high-level programming language with dynamic semantics. Its high-level built in data structures, combined with dynamic typing and dynamic binding, make it very attractive for Rapid Application Development, as well as for use as a scripting or glue language to connect existing components together.",
|
||||
"chunk_type": "features",
|
||||
"priority_score": 0.90
|
||||
},
|
||||
{
|
||||
"title": "Python Applications",
|
||||
"url": "https://www.python.org/about/apps/",
|
||||
"content": "Python is used in many application domains. Here's a sampling: Web and Internet Development, Scientific and Numeric Computing, Education, Desktop GUIs, Software Development, and Business Applications. Python is also used as a scripting language for web applications, e.g. via mod_wsgi for the Apache webserver. With Web Server Gateway Interface support, it has become the language of choice for many web developers.",
|
||||
"chunk_type": "applications",
|
||||
"priority_score": 0.85
|
||||
}
|
||||
]
|
||||
|
||||
# Initialize the report synthesizer
|
||||
synthesizer = get_report_synthesizer()
|
||||
|
||||
# Test query
|
||||
query = "What are the key features and applications of Python programming language?"
|
||||
|
||||
# Generate report
|
||||
print(f"Generating report for query: '{query}'")
|
||||
print("-" * 50)
|
||||
|
||||
report = await synthesizer.synthesize_report(chunks, query)
|
||||
|
||||
print("\nGenerated Report:")
|
||||
print("=" * 50)
|
||||
print(report)
|
||||
print("=" * 50)
|
||||
|
||||
async def test_with_real_urls(urls: List[str], query: str, use_mock: bool = False):
|
||||
"""
|
||||
Test report synthesis with real URLs.
|
||||
|
||||
Args:
|
||||
urls: List of URLs to scrape
|
||||
query: Query to use for the report
|
||||
use_mock: Whether to use mock data for document scraping
|
||||
"""
|
||||
# Initialize the database
|
||||
await initialize_database()
|
||||
|
||||
# Get document scraper with mock option
|
||||
document_scraper = get_document_scraper(use_mock=use_mock)
|
||||
|
||||
# Get document processor
|
||||
document_processor = get_document_processor()
|
||||
|
||||
# Get report synthesizer
|
||||
report_synthesizer = get_report_synthesizer()
|
||||
|
||||
# Scrape URLs
|
||||
print(f"Scraping {len(urls)} URLs...")
|
||||
documents = await document_scraper.scrape_urls(urls)
|
||||
print(f"Scraped {len(documents)} documents")
|
||||
|
||||
# Create relevance scores (mock scores for this test)
|
||||
relevance_scores = {}
|
||||
for i, doc in enumerate(documents):
|
||||
relevance_scores[doc.get('url')] = 1.0 - (i * 0.1) # Simple decreasing scores
|
||||
|
||||
# Process documents for report
|
||||
print("Processing documents for report...")
|
||||
selected_chunks = document_processor.process_documents_for_report(
|
||||
documents,
|
||||
relevance_scores,
|
||||
token_budget=4000,
|
||||
chunk_size=1000,
|
||||
overlap_size=100
|
||||
)
|
||||
print(f"Selected {len(selected_chunks)} chunks for report")
|
||||
|
||||
# Generate report
|
||||
print(f"Generating report for query: '{query}'")
|
||||
print("-" * 50)
|
||||
|
||||
report = await report_synthesizer.synthesize_report(selected_chunks, query)
|
||||
|
||||
print("\nGenerated Report:")
|
||||
print("=" * 50)
|
||||
print(report)
|
||||
print("=" * 50)
|
||||
|
||||
# Save the report to a file
|
||||
output_file = f"report_{int(asyncio.get_event_loop().time())}.md"
|
||||
with open(output_file, "w") as f:
|
||||
f.write(report)
|
||||
|
||||
print(f"Report saved to {output_file}")
|
||||
|
||||
async def main():
|
||||
"""Main function to run the test."""
|
||||
parser = argparse.ArgumentParser(description="Test report synthesis functionality")
|
||||
parser.add_argument("--sample", action="store_true", help="Use sample document chunks")
|
||||
parser.add_argument("--urls", nargs="+", help="URLs to scrape")
|
||||
parser.add_argument("--query", type=str, default="What are the key features and applications of Python programming language?", help="Query to use for the report")
|
||||
parser.add_argument("--mock", action="store_true", help="Use mock data for document scraping")
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
if args.sample:
|
||||
await test_with_sample_chunks()
|
||||
elif args.urls:
|
||||
await test_with_real_urls(args.urls, args.query, args.mock)
|
||||
else:
|
||||
# Default test with some Python-related URLs
|
||||
default_urls = [
|
||||
"https://docs.python.org/3/tutorial/index.html",
|
||||
"https://www.python.org/about/",
|
||||
"https://www.python.org/about/apps/",
|
||||
"https://realpython.com/python-introduction/"
|
||||
]
|
||||
await test_with_real_urls(default_urls, args.query, args.mock)
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
|
@ -1,3 +0,0 @@
|
|||
"""
|
||||
Tests for the UI module.
|
||||
"""
|
|
@ -1,55 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Test script for the UI search functionality.
|
||||
This script tests the search functionality of the UI without launching the Gradio interface.
|
||||
"""
|
||||
|
||||
import os
|
||||
import json
|
||||
from pathlib import Path
|
||||
from ui.gradio_interface import GradioInterface
|
||||
|
||||
|
||||
def main():
|
||||
"""Main function to test the UI search functionality."""
|
||||
# Create the interface
|
||||
interface = GradioInterface()
|
||||
|
||||
# Test queries
|
||||
test_queries = [
|
||||
"What are the latest advancements in quantum computing?",
|
||||
"Compare transformer and RNN architectures for NLP tasks",
|
||||
"Explain the environmental impact of electric vehicles"
|
||||
]
|
||||
|
||||
# Test each query
|
||||
for query in test_queries:
|
||||
print(f"\n\n{'=' * 80}")
|
||||
print(f"Testing query: {query}")
|
||||
print(f"{'=' * 80}\n")
|
||||
|
||||
# Process the query
|
||||
markdown_results, results_file = interface.process_query(query, num_results=5)
|
||||
|
||||
# Print the results
|
||||
print(f"\nResults file: {results_file}")
|
||||
print(f"\nMarkdown results preview:")
|
||||
print(f"{markdown_results[:500]}...\n")
|
||||
|
||||
# Check if results file exists and has content
|
||||
if results_file and os.path.exists(results_file):
|
||||
with open(results_file, 'r') as f:
|
||||
results = json.load(f)
|
||||
print(f"Number of results: {len(results)}")
|
||||
|
||||
if len(results) > 0:
|
||||
print(f"First result title: {results[0].get('title', 'No title')}")
|
||||
print(f"First result URL: {results[0].get('url', 'No URL')}")
|
||||
else:
|
||||
print("No results file or empty results.")
|
||||
|
||||
print("\nTest completed.")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
|
@ -1,647 +0,0 @@
|
|||
"""
|
||||
Gradio interface for the intelligent research system.
|
||||
This module provides a web interface for users to interact with the research system.
|
||||
"""
|
||||
|
||||
import os
|
||||
import json
|
||||
import gradio as gr
|
||||
import sys
|
||||
import time
|
||||
import asyncio
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
|
||||
# Add the parent directory to the path to allow importing from other modules
|
||||
sys.path.append(str(Path(__file__).parent.parent))
|
||||
|
||||
from query.query_processor import QueryProcessor
|
||||
from execution.search_executor import SearchExecutor
|
||||
from execution.result_collector import ResultCollector
|
||||
from report.report_generator import get_report_generator, initialize_report_generator
|
||||
from report.report_detail_levels import get_report_detail_level_manager, DetailLevel
|
||||
from config.config import Config
|
||||
|
||||
|
||||
class GradioInterface:
|
||||
"""Gradio interface for the intelligent research system."""
|
||||
|
||||
def __init__(self):
|
||||
"""Initialize the Gradio interface."""
|
||||
self.query_processor = QueryProcessor()
|
||||
self.search_executor = SearchExecutor()
|
||||
self.result_collector = ResultCollector()
|
||||
self.results_dir = Path(__file__).parent.parent / "results"
|
||||
self.results_dir.mkdir(exist_ok=True)
|
||||
self.reports_dir = Path(__file__).parent.parent
|
||||
self.reports_dir.mkdir(exist_ok=True)
|
||||
self.detail_level_manager = get_report_detail_level_manager()
|
||||
self.config = Config()
|
||||
|
||||
# The report generator will be initialized in the async init method
|
||||
self.report_generator = None
|
||||
|
||||
async def async_init(self):
|
||||
"""Asynchronously initialize components that require async initialization."""
|
||||
# Initialize the report generator
|
||||
await initialize_report_generator()
|
||||
self.report_generator = get_report_generator()
|
||||
return self
|
||||
|
||||
def process_query(self, query, num_results=10, use_reranker=True):
|
||||
"""
|
||||
Process a query and return the results.
|
||||
|
||||
Args:
|
||||
query (str): The query to process
|
||||
num_results (int): Number of results to return
|
||||
use_reranker (bool): Whether to use the Jina Reranker for semantic ranking
|
||||
|
||||
Returns:
|
||||
tuple: (markdown_results, json_results_path)
|
||||
"""
|
||||
try:
|
||||
# Process the query
|
||||
print(f"Processing query: {query}")
|
||||
processed_query = self.query_processor.process_query(query)
|
||||
print(f"Processed query: {processed_query}")
|
||||
|
||||
# Get available search engines and print their status
|
||||
available_engines = self.search_executor.get_available_search_engines()
|
||||
print(f"Available search engines: {available_engines}")
|
||||
|
||||
# Check which handlers are actually available
|
||||
for engine_name, handler in self.search_executor.available_handlers.items():
|
||||
print(f"Handler {engine_name} available: {handler.is_available()}")
|
||||
if not handler.is_available():
|
||||
print(f" - Reason: API key may be missing for {engine_name}")
|
||||
|
||||
# Add search engines if not specified
|
||||
if 'search_engines' not in processed_query:
|
||||
processed_query['search_engines'] = available_engines
|
||||
print(f"Using search engines: {available_engines}")
|
||||
|
||||
# Execute the search - request more results from each engine
|
||||
print(f"Executing search...")
|
||||
search_results = self.search_executor.execute_search(
|
||||
structured_query=processed_query,
|
||||
num_results=num_results
|
||||
)
|
||||
|
||||
# Print which engines returned results
|
||||
for engine, results in search_results.items():
|
||||
print(f"Engine {engine} returned {len(results)} results")
|
||||
|
||||
# Add the query to each result for reranking
|
||||
enhanced_query = processed_query.get("enhanced_query", processed_query.get("original_query", query))
|
||||
|
||||
# Flatten results for easier manipulation
|
||||
flattened_results = []
|
||||
for engine, results in search_results.items():
|
||||
for result in results:
|
||||
# Add the query and engine to each result
|
||||
result["query"] = enhanced_query
|
||||
result["engine"] = engine
|
||||
flattened_results.append(result)
|
||||
|
||||
# Process the results - don't limit the number of results
|
||||
print(f"Processing results...")
|
||||
processed_results = self.result_collector.process_results(
|
||||
{"combined": flattened_results}, dedup=True, max_results=None, use_reranker=use_reranker
|
||||
)
|
||||
print(f"Processed {len(processed_results)} results")
|
||||
|
||||
# Save results to file
|
||||
timestamp = int(time.time())
|
||||
results_file = self.results_dir / f"results_{timestamp}.json"
|
||||
|
||||
# Ensure the results are not empty before saving
|
||||
if processed_results:
|
||||
with open(results_file, "w") as f:
|
||||
json.dump(processed_results, f, indent=2)
|
||||
print(f"Results saved to {results_file}")
|
||||
file_path = str(results_file)
|
||||
else:
|
||||
error_message = "No results found. Please try a different query or check API keys."
|
||||
print(error_message)
|
||||
file_path = None
|
||||
return f"## No Results Found\n\n{error_message}", file_path
|
||||
|
||||
# Format results for display
|
||||
markdown_results = self._format_results_as_markdown(processed_results)
|
||||
|
||||
return markdown_results, file_path
|
||||
|
||||
except Exception as e:
|
||||
error_message = f"Error processing query: {str(e)}"
|
||||
print(f"ERROR: {error_message}")
|
||||
import traceback
|
||||
traceback.print_exc()
|
||||
return f"## Error\n\n{error_message}", None
|
||||
|
||||
def _format_results_as_markdown(self, results):
|
||||
"""
|
||||
Format results as markdown.
|
||||
|
||||
Args:
|
||||
results (list): List of result dictionaries
|
||||
|
||||
Returns:
|
||||
str: Markdown formatted results
|
||||
"""
|
||||
if not results:
|
||||
return "## No Results Found\n\nNo results were found for your query."
|
||||
|
||||
# Count results by source
|
||||
source_counts = {}
|
||||
for result in results:
|
||||
source = result.get("source", "unknown")
|
||||
source_counts[source] = source_counts.get(source, 0) + 1
|
||||
|
||||
# Create source distribution string
|
||||
source_distribution = ", ".join([f"{source}: {count}" for source, count in source_counts.items()])
|
||||
|
||||
markdown = f"## Search Results\n\n"
|
||||
markdown += f"*Sources: {source_distribution}*\n\n"
|
||||
|
||||
for i, result in enumerate(results):
|
||||
title = result.get("title", "Untitled")
|
||||
url = result.get("url", "")
|
||||
snippet = result.get("snippet", "No snippet available")
|
||||
source = result.get("source", "unknown")
|
||||
authors = result.get("authors", "Unknown")
|
||||
year = result.get("year", "Unknown")
|
||||
score = result.get("relevance_score", 0)
|
||||
|
||||
markdown += f"### {i+1}. {title}\n\n"
|
||||
markdown += f"**Source**: {source}\n\n"
|
||||
markdown += f"**URL**: [{url}]({url})\n\n"
|
||||
markdown += f"**Snippet**: {snippet}\n\n"
|
||||
markdown += f"**Authors**: {authors}\n\n"
|
||||
markdown += f"**Year**: {year}\n\n"
|
||||
markdown += f"**Score**: {score}\n\n"
|
||||
markdown += "---\n\n"
|
||||
|
||||
return markdown
|
||||
|
||||
async def generate_report(self, query, detail_level="standard", query_type="auto-detect", custom_model=None,
|
||||
results_file=None, process_thinking_tags=False, progress=gr.Progress()):
|
||||
"""
|
||||
Generate a report for the given query.
|
||||
|
||||
Args:
|
||||
query: The query to generate a report for
|
||||
detail_level: The level of detail for the report (brief, standard, detailed, comprehensive)
|
||||
custom_model: Custom model to use for report generation
|
||||
results_file: Path to a file containing search results
|
||||
process_thinking_tags: Whether to process thinking tags in the model output
|
||||
progress: Gradio progress indicator
|
||||
|
||||
Returns:
|
||||
Path to the generated report
|
||||
"""
|
||||
try:
|
||||
# Create a timestamped output file
|
||||
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
||||
model_suffix = ""
|
||||
|
||||
# Extract the actual model name from the description if selected
|
||||
if custom_model:
|
||||
# If the model is in the format "model_name (provider: model_display)"
|
||||
if "(" in custom_model:
|
||||
custom_model = custom_model.split(" (")[0]
|
||||
|
||||
model_name = custom_model.split('/')[-1]
|
||||
model_suffix = f"_{model_name}"
|
||||
|
||||
output_file = self.reports_dir / f"report_{timestamp}{model_suffix}.md"
|
||||
|
||||
# Get detail level configuration
|
||||
config = self.detail_level_manager.get_detail_level_config(detail_level)
|
||||
|
||||
# If custom model is provided, use it
|
||||
if custom_model:
|
||||
config["model"] = custom_model
|
||||
|
||||
# Ensure report generator is initialized
|
||||
if self.report_generator is None:
|
||||
print("Initializing report generator...")
|
||||
await initialize_report_generator()
|
||||
self.report_generator = get_report_generator()
|
||||
|
||||
# This will update the report synthesizer to use the custom model
|
||||
self.report_generator.set_detail_level(detail_level)
|
||||
|
||||
print(f"Generating report with detail level: {detail_level}")
|
||||
print(f"Detail level configuration: {config}")
|
||||
print(f"Using model: {config['model']}")
|
||||
print(f"Processing thinking tags: {process_thinking_tags}")
|
||||
|
||||
# If results file is provided, load results from it
|
||||
search_results = []
|
||||
if results_file and os.path.exists(results_file):
|
||||
with open(results_file, 'r') as f:
|
||||
search_results = json.load(f)
|
||||
print(f"Loaded {len(search_results)} results from {results_file}")
|
||||
else:
|
||||
# If no results file is provided, perform a search
|
||||
print(f"No results file provided, performing search for: {query}")
|
||||
|
||||
# Process the query to create a structured query
|
||||
structured_query = await self.query_processor.process_query(query)
|
||||
|
||||
# Generate search queries for different engines
|
||||
structured_query = await self.query_processor.generate_search_queries(
|
||||
structured_query,
|
||||
self.search_executor.get_available_search_engines()
|
||||
)
|
||||
|
||||
# Execute the search with the structured query
|
||||
search_results_dict = self.search_executor.execute_search(
|
||||
structured_query,
|
||||
num_results=config["num_results"]
|
||||
)
|
||||
|
||||
# Add debug logging
|
||||
print(f"Search results by engine:")
|
||||
for engine, results in search_results_dict.items():
|
||||
print(f" {engine}: {len(results)} results")
|
||||
|
||||
# Flatten the search results
|
||||
search_results = []
|
||||
for engine_results in search_results_dict.values():
|
||||
search_results.extend(engine_results)
|
||||
|
||||
print(f"Total flattened search results: {len(search_results)}")
|
||||
|
||||
# Fallback mechanism if no search results are found
|
||||
if len(search_results) == 0:
|
||||
print("WARNING: No search results found. Using fallback search mechanism...")
|
||||
|
||||
# Try a simplified version of the query
|
||||
simplified_query = query.split(" ")[:10] # Take first 10 words
|
||||
simplified_query = " ".join(simplified_query)
|
||||
if simplified_query != query:
|
||||
print(f"Trying simplified query: {simplified_query}")
|
||||
|
||||
# Create a basic structured query
|
||||
basic_structured_query = {
|
||||
"original_query": simplified_query,
|
||||
"enhanced_query": simplified_query,
|
||||
"type": "unknown",
|
||||
"intent": "research"
|
||||
}
|
||||
|
||||
# Try search again with simplified query
|
||||
search_results_dict = self.search_executor.execute_search(
|
||||
basic_structured_query,
|
||||
num_results=config["num_results"]
|
||||
)
|
||||
|
||||
# Flatten the search results
|
||||
search_results = []
|
||||
for engine_results in search_results_dict.values():
|
||||
search_results.extend(engine_results)
|
||||
|
||||
print(f"Fallback search returned {len(search_results)} results")
|
||||
|
||||
# Second fallback: If still no results, create a mock result to prevent report generation failure
|
||||
if len(search_results) == 0:
|
||||
print("WARNING: Fallback search also failed. Creating mock search result...")
|
||||
|
||||
# Create a mock search result with the query as the title
|
||||
search_results = [{
|
||||
"title": f"Information about: {query}",
|
||||
"url": "https://example.com/search-result",
|
||||
"snippet": f"This is a placeholder result for the query: {query}. " +
|
||||
"The search system was unable to find relevant results. " +
|
||||
"Please try refining your query or check your search API configuration.",
|
||||
"source": "mock_result",
|
||||
"score": 1.0
|
||||
}]
|
||||
|
||||
print("Created mock search result to allow report generation to proceed")
|
||||
|
||||
# Rerank results if we have a reranker
|
||||
if hasattr(self, 'reranker') and self.reranker:
|
||||
search_results = self.reranker.rerank_with_metadata(
|
||||
query,
|
||||
search_results,
|
||||
document_key='snippet',
|
||||
top_n=config["num_results"]
|
||||
)
|
||||
|
||||
# Set up progress tracking
|
||||
self.progress_status = "Preparing documents..."
|
||||
self.progress_value = 0
|
||||
self.progress_total = 1 # Will be updated when we know the total chunks
|
||||
|
||||
# Define progress callback function
|
||||
def progress_callback(current_progress, total_chunks, current_report):
|
||||
self.progress_value = current_progress
|
||||
self.progress_total = total_chunks
|
||||
# Update the progress bar
|
||||
progress(current_progress)
|
||||
|
||||
# Set the progress callback for the report generator
|
||||
if hasattr(self.report_generator, 'set_progress_callback'):
|
||||
self.report_generator.set_progress_callback(progress_callback)
|
||||
|
||||
# Generate the report
|
||||
print(f"Generating report with {len(search_results)} search results")
|
||||
if len(search_results) == 0:
|
||||
print("WARNING: No search results found. Report generation may fail.")
|
||||
|
||||
# Update progress status based on detail level
|
||||
if detail_level.lower() == "comprehensive":
|
||||
self.progress_status = "Generating progressive report..."
|
||||
else:
|
||||
self.progress_status = "Processing document chunks..."
|
||||
|
||||
# Initial progress update
|
||||
progress(0)
|
||||
|
||||
# Handle query_type parameter
|
||||
actual_query_type = None
|
||||
if query_type != "auto-detect":
|
||||
actual_query_type = query_type
|
||||
print(f"Using user-selected query type: {actual_query_type}")
|
||||
else:
|
||||
print("Using auto-detection for query type")
|
||||
|
||||
report = await self.report_generator.generate_report(
|
||||
search_results=search_results,
|
||||
query=query,
|
||||
token_budget=config["token_budget"],
|
||||
chunk_size=config["chunk_size"],
|
||||
overlap_size=config["overlap_size"],
|
||||
detail_level=detail_level,
|
||||
query_type=actual_query_type
|
||||
)
|
||||
|
||||
# Final progress update
|
||||
progress(1.0)
|
||||
|
||||
# Process thinking tags if requested
|
||||
if process_thinking_tags:
|
||||
report = self._process_thinking_tags(report)
|
||||
|
||||
# Save report to file
|
||||
with open(output_file, 'w', encoding='utf-8') as f:
|
||||
f.write(report)
|
||||
|
||||
print(f"Report saved to: {output_file}")
|
||||
|
||||
return report, str(output_file)
|
||||
|
||||
except Exception as e:
|
||||
error_message = f"Error generating report: {str(e)}"
|
||||
print(f"ERROR: {error_message}")
|
||||
import traceback
|
||||
traceback.print_exc()
|
||||
return f"## Error\n\n{error_message}", None
|
||||
|
||||
def _process_thinking_tags(self, text):
|
||||
"""
|
||||
Process thinking tags in the text.
|
||||
|
||||
Args:
|
||||
text (str): Text to process
|
||||
|
||||
Returns:
|
||||
str: Processed text
|
||||
"""
|
||||
# Remove content between <thinking> and </thinking> tags
|
||||
import re
|
||||
return re.sub(r'<thinking>.*?</thinking>', '', text, flags=re.DOTALL)
|
||||
|
||||
def get_available_models(self):
|
||||
"""
|
||||
Get a list of available models for report generation.
|
||||
|
||||
Returns:
|
||||
list: List of available model names
|
||||
"""
|
||||
# Get models from config
|
||||
models = []
|
||||
|
||||
# Extract all model names from the config file
|
||||
if 'models' in self.config.config_data:
|
||||
models = list(self.config.config_data['models'].keys())
|
||||
|
||||
# If no models found, provide some defaults
|
||||
if not models:
|
||||
models = [
|
||||
"llama-3.1-8b-instant",
|
||||
"llama-3.3-70b-versatile",
|
||||
"groq/deepseek-r1-distill-llama-70b-specdec",
|
||||
"openrouter-mixtral",
|
||||
"openrouter-claude",
|
||||
"gemini-2.0-flash-lite"
|
||||
]
|
||||
|
||||
return models
|
||||
|
||||
def get_model_descriptions(self):
|
||||
"""
|
||||
Get descriptions for available models.
|
||||
|
||||
Returns:
|
||||
dict: Dictionary mapping model names to descriptions
|
||||
"""
|
||||
descriptions = {}
|
||||
model_name_to_description = {}
|
||||
|
||||
if 'models' in self.config.config_data:
|
||||
for model_name, model_config in self.config.config_data['models'].items():
|
||||
provider = model_config.get('provider', 'unknown')
|
||||
model_display = model_config.get('model_name', model_name)
|
||||
max_tokens = model_config.get('max_tokens', 'unknown')
|
||||
temperature = model_config.get('temperature', 'unknown')
|
||||
|
||||
# Create a description that includes the provider and actual model name
|
||||
display_name = f"{model_name} ({provider}: {model_display})"
|
||||
descriptions[model_name] = display_name
|
||||
|
||||
# Create a more detailed description for the dropdown tooltip
|
||||
detailed_info = f"{display_name} - Max tokens: {max_tokens}, Temperature: {temperature}"
|
||||
model_name_to_description[display_name] = detailed_info
|
||||
|
||||
self.model_name_to_description = model_name_to_description
|
||||
return descriptions
|
||||
|
||||
def create_interface(self):
|
||||
"""
|
||||
Create and return the Gradio interface.
|
||||
|
||||
Returns:
|
||||
gr.Blocks: The Gradio interface
|
||||
"""
|
||||
with gr.Blocks(title="Intelligent Research System") as interface:
|
||||
gr.Markdown("# Intelligent Research System")
|
||||
gr.Markdown(
|
||||
"""
|
||||
This system helps you research topics by searching across multiple sources
|
||||
including Google (via Serper), Google Scholar, and arXiv.
|
||||
|
||||
You can either search for results or generate a comprehensive report.
|
||||
"""
|
||||
)
|
||||
|
||||
with gr.Tabs() as tabs:
|
||||
with gr.TabItem("Search"):
|
||||
with gr.Row():
|
||||
with gr.Column(scale=4):
|
||||
search_query_input = gr.Textbox(
|
||||
label="Research Query",
|
||||
placeholder="Enter your research question here...",
|
||||
lines=3
|
||||
)
|
||||
with gr.Column(scale=1):
|
||||
search_num_results = gr.Slider(
|
||||
minimum=5,
|
||||
maximum=50,
|
||||
value=20,
|
||||
step=5,
|
||||
label="Results Per Engine"
|
||||
)
|
||||
search_use_reranker = gr.Checkbox(
|
||||
label="Use Semantic Reranker",
|
||||
value=True,
|
||||
info="Uses Jina AI's reranker for more relevant results"
|
||||
)
|
||||
search_button = gr.Button("Search", variant="primary")
|
||||
|
||||
gr.Examples(
|
||||
examples=[
|
||||
["What are the latest advancements in quantum computing?"],
|
||||
["Compare transformer and RNN architectures for NLP tasks"],
|
||||
["Explain the environmental impact of electric vehicles"]
|
||||
],
|
||||
inputs=search_query_input
|
||||
)
|
||||
|
||||
with gr.Row():
|
||||
with gr.Column():
|
||||
search_results_output = gr.Markdown(label="Results")
|
||||
|
||||
with gr.Row():
|
||||
with gr.Column():
|
||||
search_file_output = gr.Textbox(
|
||||
label="Results saved to file",
|
||||
interactive=False
|
||||
)
|
||||
|
||||
with gr.TabItem("Generate Report"):
|
||||
with gr.Row():
|
||||
with gr.Column(scale=4):
|
||||
report_query_input = gr.Textbox(
|
||||
label="Research Query",
|
||||
placeholder="Enter your research question here...",
|
||||
lines=3
|
||||
)
|
||||
with gr.Column(scale=1):
|
||||
report_detail_level = gr.Dropdown(
|
||||
choices=["brief", "standard", "detailed", "comprehensive"],
|
||||
value="standard",
|
||||
label="Detail Level",
|
||||
info="Controls the depth and breadth of the report"
|
||||
)
|
||||
report_query_type = gr.Dropdown(
|
||||
choices=["auto-detect", "factual", "exploratory", "comparative"],
|
||||
value="auto-detect",
|
||||
label="Query Type",
|
||||
info="Type of query determines the report structure"
|
||||
)
|
||||
model_descriptions = self.get_model_descriptions()
|
||||
report_custom_model = gr.Dropdown(
|
||||
choices=list(self.model_name_to_description.keys()),
|
||||
value=None,
|
||||
label="Custom Model (Optional)",
|
||||
info="Select a custom model for report generation"
|
||||
)
|
||||
report_process_thinking = gr.Checkbox(
|
||||
label="Process Thinking Tags",
|
||||
value=False,
|
||||
info="Process <thinking> tags in model output"
|
||||
)
|
||||
report_button = gr.Button("Generate Report", variant="primary")
|
||||
|
||||
gr.Examples(
|
||||
examples=[
|
||||
["What are the latest advancements in quantum computing?"],
|
||||
["Compare transformer and RNN architectures for NLP tasks"],
|
||||
["Explain the environmental impact of electric vehicles"],
|
||||
["Explain the potential relationship between creatine supplementation and muscle loss due to GLP1-ar drugs for weight loss."]
|
||||
],
|
||||
inputs=report_query_input
|
||||
)
|
||||
|
||||
with gr.Row():
|
||||
with gr.Column():
|
||||
report_output = gr.Markdown(label="Generated Report")
|
||||
|
||||
with gr.Row():
|
||||
with gr.Column():
|
||||
report_file_output = gr.Textbox(
|
||||
label="Report saved to file",
|
||||
interactive=False
|
||||
)
|
||||
|
||||
# Add information about detail levels and query types
|
||||
detail_levels_info = ""
|
||||
for level, description in self.detail_level_manager.get_available_detail_levels():
|
||||
detail_levels_info += f"- **{level}**: {description}\n"
|
||||
|
||||
query_types_info = """
|
||||
- **auto-detect**: Automatically determine the query type based on the query text
|
||||
- **factual**: For queries seeking specific information (e.g., "What is...", "How does...")
|
||||
- **exploratory**: For queries investigating a topic broadly (e.g., "Tell me about...")
|
||||
- **comparative**: For queries comparing multiple items (e.g., "Compare X and Y", "Differences between...")
|
||||
"""
|
||||
|
||||
gr.Markdown(f"### Detail Levels\n{detail_levels_info}")
|
||||
gr.Markdown(f"### Query Types\n{query_types_info}")
|
||||
|
||||
# Set up event handlers
|
||||
search_button.click(
|
||||
fn=self.process_query,
|
||||
inputs=[search_query_input, search_num_results, search_use_reranker],
|
||||
outputs=[search_results_output, search_file_output]
|
||||
)
|
||||
|
||||
report_button.click(
|
||||
fn=lambda q, d, t, m, r, p: asyncio.run(self.generate_report(q, d, t, m, r, p)),
|
||||
inputs=[report_query_input, report_detail_level, report_query_type, report_custom_model,
|
||||
search_file_output, report_process_thinking],
|
||||
outputs=[report_output, report_file_output]
|
||||
)
|
||||
|
||||
return interface
|
||||
|
||||
def launch(self, **kwargs):
|
||||
"""
|
||||
Launch the Gradio interface.
|
||||
|
||||
Args:
|
||||
**kwargs: Keyword arguments to pass to gr.Interface.launch()
|
||||
"""
|
||||
interface = self.create_interface()
|
||||
interface.launch(**kwargs)
|
||||
|
||||
|
||||
def main():
|
||||
"""Main function to launch the Gradio interface."""
|
||||
# Create interface and initialize async components
|
||||
interface = GradioInterface()
|
||||
|
||||
# Run the async initialization in the event loop
|
||||
loop = asyncio.get_event_loop()
|
||||
loop.run_until_complete(interface.async_init())
|
||||
|
||||
# Launch the interface
|
||||
interface.launch(share=True)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
|
@ -1,4 +0,0 @@
|
|||
"""
|
||||
Utility modules for the intelligent research system.
|
||||
This package contains utility functions and classes used across the system.
|
||||
"""
|
|
@ -1,112 +0,0 @@
|
|||
"""
|
||||
A module for computing text similarity using Jina AI's Embeddings API.
|
||||
Get your Jina AI API key for free: https://jina.ai/?sui=apikey
|
||||
|
||||
The jina-embeddings-v3 model supports input lengths of up to 8,192 tokens.
|
||||
For longer texts, consider using Jina's Segmenter API to split into smaller chunks.
|
||||
"""
|
||||
|
||||
import os
|
||||
import requests
|
||||
import numpy as np
|
||||
import tiktoken
|
||||
from typing import Tuple
|
||||
|
||||
class TokenLimitError(Exception):
|
||||
"""Raised when input text exceeds the token limit."""
|
||||
pass
|
||||
|
||||
class JinaSimilarity:
|
||||
MAX_TOKENS = 8192
|
||||
|
||||
def __init__(self):
|
||||
"""Initialize the JinaSimilarity class."""
|
||||
self.api_key = os.environ.get("JINA_API_KEY")
|
||||
if not self.api_key:
|
||||
raise ValueError("JINA_API_KEY environment variable not set")
|
||||
|
||||
self.headers = {
|
||||
"Authorization": f"Bearer {self.api_key}",
|
||||
"Accept": "application/json",
|
||||
"Content-Type": "application/json"
|
||||
}
|
||||
self.embeddings_url = "https://api.jina.ai/v1/embeddings"
|
||||
# Initialize tokenizer - using cl100k_base which is used by many modern models
|
||||
self.tokenizer = tiktoken.get_encoding("cl100k_base")
|
||||
|
||||
def count_tokens(self, text: str) -> int:
|
||||
"""Count the number of tokens in a text.
|
||||
|
||||
Args:
|
||||
text: The text to count tokens for
|
||||
|
||||
Returns:
|
||||
int: Number of tokens in the text
|
||||
"""
|
||||
return len(self.tokenizer.encode(text))
|
||||
|
||||
def get_embedding(self, text: str) -> list:
|
||||
"""Get embedding for a piece of text using Jina AI's Embeddings API.
|
||||
|
||||
Args:
|
||||
text: The text to get embeddings for (max 8,192 tokens)
|
||||
|
||||
Returns:
|
||||
list: The embedding vector
|
||||
|
||||
Raises:
|
||||
TokenLimitError: If the text exceeds 8,192 tokens
|
||||
requests.exceptions.RequestException: If the API call fails
|
||||
"""
|
||||
num_tokens = self.count_tokens(text)
|
||||
if num_tokens > self.MAX_TOKENS:
|
||||
raise TokenLimitError(
|
||||
f"Input text is {num_tokens} tokens, which exceeds the maximum of {self.MAX_TOKENS} tokens. "
|
||||
"Consider using Jina's Segmenter API to split into smaller chunks."
|
||||
)
|
||||
|
||||
payload = {
|
||||
"model": "jina-embeddings-v3",
|
||||
"input": [text],
|
||||
"normalized": True # For cosine similarity
|
||||
}
|
||||
|
||||
response = requests.post(
|
||||
self.embeddings_url,
|
||||
headers=self.headers,
|
||||
json=payload
|
||||
)
|
||||
response.raise_for_status()
|
||||
|
||||
return response.json()["data"][0]["embedding"]
|
||||
|
||||
def compute_similarity(self, chunk: str, query: str) -> Tuple[float, list, list]:
|
||||
"""Compute similarity between a text chunk and a query.
|
||||
|
||||
Args:
|
||||
chunk: The text chunk to compare against
|
||||
query: The query text
|
||||
|
||||
Returns:
|
||||
Tuple containing:
|
||||
- float: Cosine similarity score (0-1)
|
||||
- list: Chunk embedding
|
||||
- list: Query embedding
|
||||
|
||||
Raises:
|
||||
TokenLimitError: If the text exceeds 8,192 tokens
|
||||
requests.exceptions.RequestException: If the API calls fail
|
||||
"""
|
||||
# Get embeddings for both texts
|
||||
chunk_embedding = self.get_embedding(chunk)
|
||||
query_embedding = self.get_embedding(query)
|
||||
|
||||
# Convert to numpy arrays for efficient computation
|
||||
chunk_vec = np.array(chunk_embedding)
|
||||
query_vec = np.array(query_embedding)
|
||||
|
||||
# Compute cosine similarity
|
||||
# Since vectors are normalized, dot product equals cosine similarity
|
||||
similarity = float(np.dot(chunk_vec, query_vec))
|
||||
|
||||
return similarity, chunk_embedding, query_embedding
|
|
@ -1,62 +0,0 @@
|
|||
import os
|
||||
import json
|
||||
import requests
|
||||
|
||||
# Get your Jina AI API key for free: https://jina.ai/?sui=apikey
|
||||
JINA_API_KEY = os.getenv('JINA_API_KEY')
|
||||
|
||||
|
||||
def segment_markdown(file_path):
|
||||
"""
|
||||
Segments a markdown file using Jina AI's Segmenter API.
|
||||
|
||||
Args:
|
||||
file_path (str): Path to the markdown file.
|
||||
|
||||
Returns:
|
||||
dict: JSON structure containing the segments.
|
||||
"""
|
||||
try:
|
||||
# Read the markdown file
|
||||
with open(file_path, 'r') as file:
|
||||
markdown_content = file.read()
|
||||
|
||||
# Prepare the request to Jina Segmenter API
|
||||
headers = {
|
||||
'Authorization': f'Bearer {JINA_API_KEY}',
|
||||
'Content-Type': 'application/json',
|
||||
'Accept': 'application/json'
|
||||
}
|
||||
data = {
|
||||
'content': markdown_content,
|
||||
'tokenizer': 'cl100k_base',
|
||||
'return_tokens': False,
|
||||
'return_chunks': True,
|
||||
'max_chunk_length': 1000
|
||||
}
|
||||
|
||||
# Make the API request
|
||||
response = requests.post(
|
||||
'https://segment.jina.ai/',
|
||||
headers=headers,
|
||||
json=data
|
||||
)
|
||||
response.raise_for_status()
|
||||
|
||||
# Return the segments as JSON
|
||||
return response.json()
|
||||
|
||||
except Exception as e:
|
||||
print(f'Error segmenting markdown: {str(e)}')
|
||||
return None
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
import sys
|
||||
if len(sys.argv) != 2:
|
||||
print('Usage: python markdown_segmenter.py <markdown_file>')
|
||||
sys.exit(1)
|
||||
|
||||
segments = segment_markdown(sys.argv[1])
|
||||
if segments:
|
||||
print(json.dumps(segments, indent=2))
|
Loading…
Reference in New Issue