# ira/ui/gradio_interface.py — 1566 lines, 70 KiB, Python

"""
Gradio interface for the intelligent research system.
This module provides a web interface for users to interact with the research system.
"""
import os
import json
import gradio as gr
import sys
import time
import asyncio
from pathlib import Path
from datetime import datetime
# Add the parent directory to the path to allow importing from other modules
sys.path.append(str(Path(__file__).parent.parent))
from query.query_processor import QueryProcessor
from execution.search_executor import SearchExecutor
from execution.result_collector import ResultCollector
from execution.sub_question_executor import get_sub_question_executor
from report.report_generator import get_report_generator, initialize_report_generator
from report.report_detail_levels import get_report_detail_level_manager, DetailLevel
from config.config import Config
class GradioInterface:
"""Gradio interface for the intelligent research system."""
def __init__(self):
    """Set up pipeline components, result/report directories, and config."""
    # Core search-pipeline components.
    self.query_processor = QueryProcessor()
    self.search_executor = SearchExecutor()
    self.result_collector = ResultCollector()
    self.sub_question_executor = get_sub_question_executor()

    base_dir = Path(__file__).parent.parent
    # Raw search results are saved here as timestamped JSON files.
    self.results_dir = base_dir / "results"
    self.results_dir.mkdir(exist_ok=True)
    # Generated reports, grouped into one subdirectory per day.
    self.reports_dir = base_dir / "reports"
    self.reports_dir.mkdir(exist_ok=True)
    self.reports_daily_dir = self.reports_dir / datetime.now().strftime("%Y-%m-%d")
    self.reports_daily_dir.mkdir(exist_ok=True)

    # JSON index of every generated report; create an empty one on first run.
    self.reports_metadata_file = self.reports_dir / "reports_metadata.json"
    if not self.reports_metadata_file.exists():
        self.reports_metadata_file.write_text(json.dumps({"reports": []}, indent=2))

    self.detail_level_manager = get_report_detail_level_manager()
    self.config = Config()
    # Deferred: requires an event loop — see async_init().
    self.report_generator = None
    # Progress display relies on Gradio's built-in gr.Progress tracking.
async def async_init(self):
    """Asynchronously initialize components that require async initialization.

    Must be awaited once after construction: creates and stores the shared
    report generator.

    Returns:
        GradioInterface: self, so construction can be chained/awaited.
    """
    # The report generator needs async setup before it can be fetched.
    await initialize_report_generator()
    self.report_generator = get_report_generator()
    return self
def process_query(self, query, num_results=10, use_reranker=True):
    """
    Process a query and return the results.

    Runs the full search pipeline: query processing, search execution across
    all available engines, result collection/deduplication (optionally with
    reranking), and persistence of the processed results to a timestamped
    JSON file.

    Args:
        query (str): The query to process
        num_results (int): Number of results to request from each engine
        use_reranker (bool): Whether to use the Jina Reranker for semantic ranking

    Returns:
        tuple: (markdown_results, json_results_path); the path is None when
            no results were found or an error occurred
    """
    try:
        # Step 1: structure/enhance the raw query.
        # NOTE(review): generate_report awaits query_processor.process_query,
        # but here it is called synchronously — confirm which API is correct.
        print(f"Processing query: {query}")
        processed_query = self.query_processor.process_query(query)
        print(f"Processed query: {processed_query}")
        # Log per-engine availability to aid debugging of missing API keys.
        available_engines = self.search_executor.get_available_search_engines()
        print(f"Available search engines: {available_engines}")
        for engine_name, handler in self.search_executor.available_handlers.items():
            print(f"Handler {engine_name} available: {handler.is_available()}")
            if not handler.is_available():
                print(f" - Reason: API key may be missing for {engine_name}")
        # Default to every available engine when the processor didn't pick any.
        if 'search_engines' not in processed_query:
            processed_query['search_engines'] = available_engines
            print(f"Using search engines: {available_engines}")
        # Step 2: run the search on each engine.
        print(f"Executing search...")
        search_results = self.search_executor.execute_search(
            structured_query=processed_query,
            num_results=num_results
        )
        for engine, results in search_results.items():
            print(f"Engine {engine} returned {len(results)} results")
        # Attach the query text and source engine to each result so the
        # reranker can score query/result relevance downstream.
        enhanced_query = processed_query.get("enhanced_query", processed_query.get("original_query", query))
        flattened_results = []
        for engine, results in search_results.items():
            for result in results:
                result["query"] = enhanced_query
                result["engine"] = engine
                flattened_results.append(result)
        # Step 3: dedup + (optionally) rerank; max_results=None keeps everything.
        print(f"Processing results...")
        processed_results = self.result_collector.process_results(
            {"combined": flattened_results}, dedup=True, max_results=None, use_reranker=use_reranker
        )
        print(f"Processed {len(processed_results)} results")
        # Step 4: persist the processed results for later report generation.
        timestamp = int(time.time())
        results_file = self.results_dir / f"results_{timestamp}.json"
        if processed_results:
            with open(results_file, "w") as f:
                json.dump(processed_results, f, indent=2)
            print(f"Results saved to {results_file}")
            file_path = str(results_file)
        else:
            error_message = "No results found. Please try a different query or check API keys."
            print(error_message)
            file_path = None
            return f"## No Results Found\n\n{error_message}", file_path
        # Step 5: render for the UI.
        markdown_results = self._format_results_as_markdown(processed_results)
        return markdown_results, file_path
    except Exception as e:
        # UI boundary: surface the failure as markdown rather than raising.
        error_message = f"Error processing query: {str(e)}"
        print(f"ERROR: {error_message}")
        import traceback
        traceback.print_exc()
        return f"## Error\n\n{error_message}", None
def _format_results_as_markdown(self, results):
"""
Format results as markdown.
Args:
results (list): List of result dictionaries
Returns:
str: Markdown formatted results
"""
if not results:
return "## No Results Found\n\nNo results were found for your query."
# Count results by source
source_counts = {}
for result in results:
source = result.get("source", "unknown")
source_counts[source] = source_counts.get(source, 0) + 1
# Create source distribution string
source_distribution = ", ".join([f"{source}: {count}" for source, count in source_counts.items()])
markdown = f"## Search Results\n\n"
markdown += f"*Sources: {source_distribution}*\n\n"
for i, result in enumerate(results):
title = result.get("title", "Untitled")
url = result.get("url", "")
snippet = result.get("snippet", "No snippet available")
source = result.get("source", "unknown")
authors = result.get("authors", "Unknown")
year = result.get("year", "Unknown")
score = result.get("relevance_score", 0)
markdown += f"### {i+1}. {title}\n\n"
markdown += f"**Source**: {source}\n\n"
markdown += f"**URL**: [{url}]({url})\n\n"
markdown += f"**Snippet**: {snippet}\n\n"
markdown += f"**Authors**: {authors}\n\n"
markdown += f"**Year**: {year}\n\n"
markdown += f"**Score**: {score}\n\n"
markdown += "---\n\n"
return markdown
async def generate_report(self, query, detail_level="standard", query_type="auto-detect", custom_model=None,
                          results_file=None, process_thinking_tags=False, initial_results=10, final_results=7,
                          progress=gr.Progress()):
    """
    Generate a report for the given query.

    Args:
        query: The query to generate a report for
        detail_level: The level of detail for the report (brief, standard, detailed, comprehensive)
        query_type: Report style to use, or "auto-detect" to let the generator decide
        custom_model: Custom model to use for report generation
        results_file: Path to a file containing search results
        process_thinking_tags: Whether to process thinking tags in the model output
        initial_results: Results to request per engine (overrides the detail-level default)
        final_results: Results to keep after reranking (overrides the detail-level default)
        progress: Gradio progress indicator

    Returns:
        tuple: (report markdown, path to the generated report file); on
            failure, (error markdown, None)
    """
    try:
        # Timestamped output name; optionally suffixed with the chosen model.
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        model_suffix = ""
        # Extract the actual model name from the UI display string if selected.
        if custom_model:
            # Display format is "model_name (provider: model_display)".
            original_custom_model = custom_model
            if "(" in custom_model:
                custom_model = custom_model.split(" (")[0]
            model_name = custom_model.split('/')[-1]
            model_suffix = f"_{model_name}"
            # Log the model selection for debugging.
            print(f"Selected model from UI: {original_custom_model}")
            print(f"Extracted model name: {custom_model}")
            print(f"Using model suffix: {model_suffix}")
        # Unique report ID: timestamp + short hash of the query text.
        import hashlib
        report_id = f"{timestamp}_{hashlib.md5(query.encode()).hexdigest()[:8]}"
        # Reports are written into the per-day subdirectory.
        output_file = self.reports_daily_dir / f"report_{report_id}{model_suffix}.md"
        # Base configuration for the chosen detail level.
        config = self.detail_level_manager.get_detail_level_config(detail_level)
        # UI overrides for result counts.
        if initial_results:
            config["initial_results_per_engine"] = initial_results
        if final_results:
            config["final_results_after_reranking"] = final_results
        # If a custom model is provided, override the detail-level default.
        if custom_model:
            model_name = custom_model.split(" (")[0] if " (" in custom_model else custom_model
            config["model"] = model_name
            print(f"Using custom model: {model_name}")
        # Lazily initialize the report generator if async_init was never run.
        if self.report_generator is None:
            print("Initializing report generator...")
            await initialize_report_generator()
            self.report_generator = get_report_generator()
        # Debug: report the default model implied by the detail level.
        detail_config = self.detail_level_manager.get_detail_level_config(detail_level)
        default_model = detail_config.get("model", "unknown")
        print(f"Default model for {detail_level} detail level: {default_model}")
        # Explicitly point the report generator at the custom model, if any.
        if custom_model:
            model_name = custom_model.split(" (")[0] if " (" in custom_model else custom_model
            print(f"Setting report generator to use custom model: {model_name}")
            if hasattr(self.report_generator, 'set_model'):
                self.report_generator.set_model(model_name)
                print(f"After setting custom model, report generator model is: {self.report_generator.model_name}")
            else:
                print("Warning: Report generator does not have set_model method. Using alternative approach.")
                # Fallback: mutate the generator's own detail-level config.
                current_config = self.report_generator.get_detail_level_config()
                if current_config:
                    current_config["model"] = model_name
                    print(f"Updated config model to: {model_name}")
        print(f"Generating report with detail level: {detail_level}")
        print(f"Detail level configuration: {config}")
        print(f"Using model: {config['model']}")
        print(f"Processing thinking tags: {process_thinking_tags}")
        # Either load previously saved results or run a fresh search.
        search_results = []
        if results_file and os.path.exists(results_file):
            with open(results_file, 'r') as f:
                search_results = json.load(f)
            print(f"Loaded {len(search_results)} results from {results_file}")
        else:
            print(f"No results file provided, performing search for: {query}")
            # Structure the query (may decompose it into sub-questions).
            structured_query = await self.query_processor.process_query(query)
            # Generate engine-specific search queries.
            structured_query = await self.query_processor.generate_search_queries(
                structured_query,
                self.search_executor.get_available_search_engines()
            )
            # Per-engine result count, needed throughout this branch.
            num_results_to_fetch = config.get("initial_results_per_engine", config.get("num_results", 10))
            # Empty default in case the query has no sub-questions.
            sub_question_results = {}
            has_sub_questions = 'sub_questions' in structured_query and structured_query['sub_questions']
            if has_sub_questions:
                print(f"Query was decomposed into {len(structured_query['sub_questions'])} sub-questions:")
                for i, sq in enumerate(structured_query['sub_questions']):
                    print(f" {i+1}. {sq.get('sub_question')} (aspect: {sq.get('aspect')}, priority: {sq.get('priority')})")
                # Run the (cheaper) sub-question searches first.
                progress(0.1, desc="Executing searches for sub-questions...")
                structured_query = await self.sub_question_executor.execute_sub_question_searches(
                    structured_query,
                    num_results_per_engine=3  # Use fewer results per engine for sub-questions
                )
                # Combine and prioritize sub-question results per engine.
                sub_question_results = self.sub_question_executor.get_combined_results(structured_query)
                print(f"Sub-questions returned results from {len(sub_question_results)} engines")
                sub_question_results = self.sub_question_executor.prioritize_results(
                    sub_question_results,
                    max_results_per_engine=num_results_to_fetch  # Use same limit as main query
                )
                progress(0.2, desc="Completed sub-question searches")
            # Main query search.
            progress(0.3, desc="Executing main search...")
            search_results_dict = self.search_executor.execute_search(
                structured_query,
                num_results=num_results_to_fetch
            )
            print(f"Main search results by engine:")
            for engine, results in search_results_dict.items():
                print(f" {engine}: {len(results)} results")
            # Merge sub-question results into the per-engine result lists.
            if has_sub_questions and 'sub_questions' in structured_query:
                print("Combining main search results with sub-question results")
                progress(0.4, desc="Combining results from sub-questions...")
                for engine, results in sub_question_results.items():
                    if engine in search_results_dict:
                        search_results_dict[engine].extend(results)
                        print(f" Added {len(results)} results from sub-questions to {engine}")
                    else:
                        # Engine only produced sub-question results.
                        search_results_dict[engine] = results
                        print(f" Added {len(results)} results from sub-questions as new engine {engine}")
            # Flatten per-engine lists into a single result list.
            search_results = []
            for engine_results in search_results_dict.values():
                search_results.extend(engine_results)
            print(f"Total flattened search results: {len(search_results)}")
            # First fallback: retry with a shortened query.
            if len(search_results) == 0:
                print("WARNING: No search results found. Using fallback search mechanism...")
                simplified_query = query.split(" ")[:10]  # Take first 10 words
                simplified_query = " ".join(simplified_query)
                if simplified_query != query:
                    print(f"Trying simplified query: {simplified_query}")
                    basic_structured_query = {
                        "original_query": simplified_query,
                        "enhanced_query": simplified_query,
                        "type": "unknown",
                        "intent": "research"
                    }
                    # NOTE(review): config["num_results"] may be absent here —
                    # other lookups use .get() with a default; confirm the
                    # detail-level config always includes "num_results".
                    search_results_dict = self.search_executor.execute_search(
                        basic_structured_query,
                        num_results=config["num_results"]
                    )
                    search_results = []
                    for engine_results in search_results_dict.values():
                        search_results.extend(engine_results)
                    print(f"Fallback search returned {len(search_results)} results")
                # Second fallback: synthesize a placeholder result so report
                # generation can still proceed.
                if len(search_results) == 0:
                    print("WARNING: Fallback search also failed. Creating mock search result...")
                    search_results = [{
                        "title": f"Information about: {query}",
                        "url": "https://example.com/search-result",
                        "snippet": f"This is a placeholder result for the query: {query}. " +
                                   "The search system was unable to find relevant results. " +
                                   "Please try refining your query or check your search API configuration.",
                        "source": "mock_result",
                        "score": 1.0
                    }]
                    print("Created mock search result to allow report generation to proceed")
        # Optional semantic reranking. NOTE(review): self.reranker is never
        # assigned in __init__, so this branch appears never to run — confirm
        # whether a reranker was meant to be attached to this interface.
        if hasattr(self, 'reranker') and self.reranker:
            # Use final_results_after_reranking if available, else num_results.
            top_n_results = config.get("final_results_after_reranking", config.get("num_results", 7))
            search_results = self.reranker.rerank_with_metadata(
                query,
                search_results,
                document_key='snippet',
                top_n=top_n_results
            )
        # Progress reporting: translate generator progress into a status line.
        def progress_callback(current_progress, total_chunks, current_report):
            current_chunk = int(current_progress * total_chunks) if total_chunks > 0 else 0
            if current_progress == 0:
                status_message = "Preparing documents..."
            elif current_progress >= 1.0:
                status_message = "Finalizing report..."
            else:
                status_message = f"Processing chunk {current_chunk}/{total_chunks}..."
            # Append the chunk title being processed, when the generator exposes it.
            if hasattr(self.report_generator, 'current_chunk_title'):
                chunk_title = self.report_generator.current_chunk_title
                if chunk_title:
                    status_message += f" ({chunk_title})"
            # Append the model in use (name only, provider prefix stripped).
            if hasattr(self.report_generator, 'model_name') and self.report_generator.model_name:
                model_display = self.report_generator.model_name.split('/')[-1]
                status_message += f" (Using model: {model_display})"
            return status_message
        if hasattr(self.report_generator, 'set_progress_callback'):
            # Wrapper that also drives Gradio's built-in progress bar, which
            # properly updates the UI during async operations.
            def ui_progress_callback(current_progress, total_chunks, current_report):
                status_message = progress_callback(current_progress, total_chunks, current_report)
                progress(current_progress, desc=status_message)
                return status_message
            self.report_generator.set_progress_callback(ui_progress_callback)
        # Generate the report.
        print(f"Generating report with {len(search_results)} search results")
        if len(search_results) == 0:
            print("WARNING: No search results found. Report generation may fail.")
        print(f"Report generator is using model: {self.report_generator.model_name}")
        if detail_level.lower() == "comprehensive":
            self.progress_status = "Generating progressive report..."
        else:
            self.progress_status = "Processing document chunks..."
        # "auto-detect" means: let the report generator infer the query type.
        actual_query_type = None
        if query_type != "auto-detect":
            actual_query_type = query_type
            print(f"Using user-selected query type: {actual_query_type}")
        else:
            print("Using auto-detection for query type")
        # structured_query is only bound in the fresh-search branch above;
        # default it so the call below is safe when results came from a file.
        if not locals().get('structured_query'):
            structured_query = None
        report = await self.report_generator.generate_report(
            search_results=search_results,
            query=query,
            token_budget=config["token_budget"],
            chunk_size=config["chunk_size"],
            overlap_size=config["overlap_size"],
            detail_level=detail_level,
            query_type=actual_query_type,
            structured_query=structured_query if structured_query and 'sub_questions' in structured_query else None
        )
        # Final progress update.
        progress(1.0)
        # Optionally strip <thinking> sections from the model output.
        if process_thinking_tags:
            report = self._process_thinking_tags(report)
        # Persist the report and record it in the metadata index.
        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(report)
        print(f"Report saved to: {output_file}")
        self._update_report_metadata(report_id, {
            "id": report_id,
            "timestamp": timestamp,
            "query": query,
            "detail_level": detail_level,
            "query_type": query_type,
            "model": custom_model if custom_model else config.get("model", "default"),
            "file_path": str(output_file),
            "file_size": output_file.stat().st_size,
            "creation_date": datetime.now().isoformat()
        })
        return report, str(output_file)
    except Exception as e:
        # UI boundary: surface the failure as markdown rather than raising.
        error_message = f"Error generating report: {str(e)}"
        print(f"ERROR: {error_message}")
        import traceback
        traceback.print_exc()
        return f"## Error\n\n{error_message}", None
def _process_thinking_tags(self, text):
"""
Process thinking tags in the text.
Args:
text (str): Text to process
Returns:
str: Processed text
"""
# Remove content between <thinking> and </thinking> tags
import re
return re.sub(r'<thinking>.*?</thinking>', '', text, flags=re.DOTALL)
def _update_report_metadata(self, report_id, metadata):
"""
Update the report metadata file with new report information.
Args:
report_id (str): Unique identifier for the report
metadata (dict): Report metadata to store
"""
try:
# Load existing metadata
with open(self.reports_metadata_file, 'r') as f:
all_metadata = json.load(f)
# Check if report already exists
existing_report = None
for i, report in enumerate(all_metadata.get('reports', [])):
if report.get('id') == report_id:
existing_report = i
break
# Update or add the report metadata
if existing_report is not None:
all_metadata['reports'][existing_report] = metadata
else:
all_metadata['reports'].append(metadata)
# Save updated metadata
with open(self.reports_metadata_file, 'w') as f:
json.dump(all_metadata, f, indent=2)
print(f"Updated metadata for report {report_id}")
except Exception as e:
print(f"Error updating report metadata: {str(e)}")
def get_all_reports(self):
    """Load every report entry from the metadata index.

    Returns:
        list: Report metadata dicts sorted newest-first by creation_date;
            an empty list when the index cannot be read.
    """
    try:
        with open(self.reports_metadata_file, 'r') as f:
            index = json.load(f)
        entries = index.get('reports', [])
        # Newest first; missing dates sort last.
        entries.sort(key=lambda entry: entry.get('creation_date', ''), reverse=True)
        return entries
    except Exception as e:
        print(f"Error getting report metadata: {str(e)}")
        return []
def delete_report(self, report_id):
    """Remove a report's file from disk and its entry from the index.

    Args:
        report_id (str): ID of the report to delete

    Returns:
        bool: True on success, False if the report is unknown or an
            error occurs.
    """
    try:
        with open(self.reports_metadata_file, 'r') as f:
            index = json.load(f)

        # Locate the matching entry.
        target = next(
            (entry for entry in index.get('reports', [])
             if entry.get('id') == report_id),
            None,
        )
        if not target:
            print(f"Report {report_id} not found")
            return False

        # Delete the report file itself, if it still exists.
        file_path = target.get('file_path')
        print(f"Deleting report: report_id={report_id}, file_path={file_path}")
        if file_path and Path(file_path).exists():
            print(f"File exists: {Path(file_path).exists()}")
            Path(file_path).unlink()
            print(f"Deleted report file: {file_path}")
        else:
            print(f"File not found or file_path is missing")

        # Drop the entry from the index and persist the change.
        index['reports'] = [
            entry for entry in index.get('reports', [])
            if entry.get('id') != report_id
        ]
        with open(self.reports_metadata_file, 'w') as f:
            json.dump(index, f, indent=2)
        print(f"Deleted report {report_id} from metadata")
        return True
    except Exception as e:
        print(f"Error deleting report: {str(e)}")
        return False
def get_available_models(self):
    """List model names usable for report generation.

    Returns:
        list: Model names declared in the config file, or a built-in
            default set when the config declares none.
    """
    config_data = self.config.config_data
    models = list(config_data['models'].keys()) if 'models' in config_data else []
    # Fall back to a known-good default set when the config has no models.
    return models or [
        "llama-3.1-8b-instant",
        "llama-3.3-70b-versatile",
        "groq/deepseek-r1-distill-llama-70b-specdec",
        "openrouter-mixtral",
        "openrouter-claude",
        "gemini-2.0-flash-lite",
    ]
def get_model_descriptions(self):
    """Build display names for the configured models.

    Also caches a display-name -> tooltip mapping on
    self.model_name_to_description for use by the dropdown UI.

    Returns:
        dict: Mapping of model name to "name (provider: model)" display
            string.
    """
    descriptions = {}
    tooltip_map = {}
    for model_name, model_config in self.config.config_data.get('models', {}).items():
        provider = model_config.get('provider', 'unknown')
        model_display = model_config.get('model_name', model_name)
        max_tokens = model_config.get('max_tokens', 'unknown')
        temperature = model_config.get('temperature', 'unknown')
        # Short form shown in the dropdown itself.
        display_name = f"{model_name} ({provider}: {model_display})"
        descriptions[model_name] = display_name
        # Longer form used as the dropdown tooltip.
        tooltip_map[display_name] = (
            f"{display_name} - Max tokens: {max_tokens}, Temperature: {temperature}"
        )
    self.model_name_to_description = tooltip_map
    return descriptions
def _get_reports_for_display(self):
"""Get reports formatted for display in the UI"""
reports = self.get_all_reports()
display_data = []
for report in reports:
# Format timestamp for display
timestamp = report.get('timestamp', '')
creation_date = report.get('creation_date', '')
if creation_date:
try:
# Convert ISO format to datetime and format for display
dt = datetime.fromisoformat(creation_date)
formatted_date = dt.strftime('%Y-%m-%d %H:%M:%S')
except:
formatted_date = creation_date
else:
formatted_date = timestamp
# Format file size
file_size = report.get('file_size', 0)
if file_size < 1024:
formatted_size = f"{file_size} B"
elif file_size < 1024 * 1024:
formatted_size = f"{file_size / 1024:.1f} KB"
else:
formatted_size = f"{file_size / (1024 * 1024):.1f} MB"
# Add row to display data
display_data.append([
report.get('id', ''),
report.get('query', '')[:50] + ('...' if len(report.get('query', '')) > 50 else ''),
report.get('model', '').split('/')[-1], # Show only the model name without provider
report.get('detail_level', ''),
formatted_date,
formatted_size,
Path(report.get('file_path', '')).name, # Just the filename
])
return display_data
def _delete_selected_reports(self, selected_choices):
"""Delete selected reports
Args:
selected_choices (list): List of selected checkbox values in format "ID: Query (Model)"
Returns:
tuple: Updated reports table data and updated checkbox choices
"""
if not selected_choices:
# If no reports are selected, just refresh the display
reports_data = self._get_reports_for_display()
choices = self._get_report_choices(reports_data)
return reports_data, choices, "No reports selected for deletion."
print(f"Selected choices for deletion: {selected_choices}")
# Extract report IDs from selected choices
selected_report_ids = []
for choice in selected_choices:
try:
# Convert to string and handle different input formats
choice_str = str(choice).strip().strip('"\'')
print(f"Processing choice: '{choice_str}'")
# Split at the first colon to get the ID
if ':' in choice_str:
report_id = choice_str.split(':', 1)[0].strip()
selected_report_ids.append(report_id)
else:
# If no colon, use the entire string as ID
selected_report_ids.append(choice_str)
print(f"Using full string as ID: '{choice_str}'")
except Exception as e:
print(f"Error processing choice {choice}: {e}")
print(f"Deleting report IDs: {selected_report_ids}")
# Delete selected reports
deleted_count = 0
for report_id in selected_report_ids:
if self.delete_report(report_id):
deleted_count += 1
print(f"Successfully deleted report: {report_id}")
else:
print(f"Failed to delete report: {report_id}")
print(f"Deleted {deleted_count} reports")
# Refresh the table and choices
reports_data = self._get_reports_for_display()
choices = self._get_report_choices(reports_data)
status_message = f"Deleted {deleted_count} report(s)."
return reports_data, choices, status_message
def _download_selected_reports(self, selected_choices):
"""Prepare selected reports for download
Args:
selected_choices (list): List of selected checkbox values in format "ID: Query (Model)"
Returns:
list: List of file paths to download
"""
if not selected_choices:
return []
print(f"Selected choices for download: {selected_choices}")
# Extract report IDs from selected choices
selected_report_ids = []
for choice in selected_choices:
try:
# Convert to string and handle different input formats
choice_str = str(choice).strip().strip('"\'')
print(f"Processing choice: '{choice_str}'")
# Split at the first colon to get the ID
if ':' in choice_str:
report_id = choice_str.split(':', 1)[0].strip()
selected_report_ids.append(report_id)
else:
# If no colon, use the entire string as ID
selected_report_ids.append(choice_str)
print(f"Using full string as ID: '{choice_str}'")
except Exception as e:
print(f"Error processing choice {choice}: {e}")
print(f"Extracted report IDs: {selected_report_ids}")
# Get file paths for selected reports
all_reports = self.get_all_reports()
files_to_download = []
for report_id in selected_report_ids:
report = next((r for r in all_reports if r.get('id') == report_id), None)
if report and "file_path" in report:
file_path = report["file_path"]
print(f"Downloading report: report_id={report_id}, file_path={file_path}")
# Verify the file exists
if os.path.exists(file_path):
files_to_download.append(file_path)
print(f"Added file for download: {file_path}")
else:
print(f"Warning: File does not exist: {file_path}")
else:
print(f"Warning: Could not find report with ID {report_id}")
return files_to_download
def _get_report_choices(self, reports_data):
"""Generate choices for the checkbox group based on reports data
Args:
reports_data (list): List of report data rows
Returns:
list: List of choices for the checkbox group in format "ID: Query (Model)"
"""
choices = []
# If reports_data is empty, return an empty list
if not reports_data:
return []
# Get all reports from the metadata file to ensure IDs are available
all_reports = self.get_all_reports()
# Create a mapping of report IDs to their full data
report_map = {report.get('id', ''): report for report in all_reports}
for row in reports_data:
try:
report_id = row[0]
if not report_id:
continue
# Get data from the table row
query = row[1]
model = row[2]
# Format: "ID: Query (Model)"
choice_text = f"{report_id}: {query} ({model})"
choices.append(choice_text)
except (IndexError, TypeError) as e:
print(f"Error processing report row: {e}")
continue
return choices
def _refresh_reports_with_html(self):
    """Refresh the reports list with updated HTML.

    Returns:
        tuple: Updated reports table rows, rendered checkbox HTML, and
            "[]" to reset the hidden selection field.
    """
    reports_data = self._get_reports_for_display()
    choices = self._get_report_choices(reports_data)
    # NOTE(review): create_checkbox_html is not defined in this class —
    # presumably a module-level helper elsewhere in this file; confirm.
    html_content = create_checkbox_html(choices)
    return reports_data, html_content, "[]"  # Reset the hidden field
def _delete_selected_reports_with_html(self, selected_json):
    """Delete reports selected via the HTML checkbox UI and refresh it.

    Args:
        selected_json (str): JSON string containing selected report IDs

    Returns:
        tuple: (updated table rows, refreshed checkbox HTML, "[]" to reset
            the hidden field, status message)
    """
    try:
        # Decode the hidden field; tolerate malformed JSON from the browser.
        if not selected_json or selected_json == "[]":
            selected = []
        else:
            try:
                selected = json.loads(selected_json)
                print(f"Parsed JSON selections: {selected}")
            except Exception as json_err:
                print(f"JSON parse error: {json_err}")
                # Crude fallback: split the bracketed list by hand.
                selected = [s.strip(' "') for s in selected_json.strip('[]').split(',')]
                print(f"Fallback parsing to: {selected}")

        updated_table, _, message = self._delete_selected_reports(selected)
        html_content = create_checkbox_html(self._get_report_choices(updated_table))
        return updated_table, html_content, "[]", f"{message}"
    except Exception as e:
        import traceback
        traceback.print_exc()
        return self._get_reports_for_display(), create_checkbox_html([]), "[]", f"Error: {str(e)}"
def _download_with_html(self, selected_json):
"""Prepare selected reports for download with improved JSON parsing
Args:
selected_json (str): JSON string containing selected report IDs
Returns:
list: Files prepared for download
"""
try:
# Parse JSON with error handling
if not selected_json or selected_json == "[]":
selected = []
else:
try:
selected = json.loads(selected_json)
print(f"Parsed JSON selections for download: {selected}")
except Exception as json_err:
print(f"JSON parse error: {json_err}")
# If JSON parsing fails, try to extract values directly
selected = [s.strip(' "') for s in selected_json.strip('[]').split(',')]
print(f"Fallback parsing to: {selected}")
# Get file paths for download
files = self._download_selected_reports(selected)
return files
except Exception as e:
import traceback
traceback.print_exc()
return []
def _cleanup_old_reports(self, days):
"""Delete reports older than the specified number of days
Args:
days (int): Number of days to keep reports for
Returns:
list: Updated reports table data
"""
try:
if days <= 0:
print("Cleanup skipped - days parameter is 0 or negative")
return self._get_reports_for_display()
# Calculate cutoff date
from datetime import timedelta
cutoff_date = datetime.now() - timedelta(days=days)
cutoff_str = cutoff_date.isoformat()
print(f"Cleaning up reports older than {cutoff_date.strftime('%Y-%m-%d %H:%M:%S')}")
# Get all reports
all_reports = self.get_all_reports()
print(f"Found {len(all_reports)} total reports")
reports_to_delete = []
# Find reports older than cutoff date
for report in all_reports:
creation_date = report.get('creation_date', '')
if not creation_date:
print(f"Warning: Report {report.get('id')} has no creation date")
continue
if creation_date < cutoff_str:
reports_to_delete.append(report.get('id'))
print(f"Marking report {report.get('id')} from {creation_date} for deletion")
print(f"Found {len(reports_to_delete)} reports to delete")
# Delete old reports
deleted_count = 0
for report_id in reports_to_delete:
if self.delete_report(report_id):
deleted_count += 1
print(f"Successfully deleted {deleted_count} reports")
# Refresh the table
updated_display = self._get_reports_for_display()
print(f"Returning updated display with {len(updated_display)} reports")
return updated_display
except Exception as e:
print(f"Error in cleanup_old_reports: {e}")
import traceback
traceback.print_exc()
# Return current display data in case of error
return self._get_reports_for_display()
def migrate_existing_reports(self):
"""Migrate existing reports from the root directory to the reports directory structure
Returns:
str: Status message indicating the result of the migration
"""
import re
import shutil
import os
# Pattern to match report files like report_20250317_122351_llama-3.3-70b-versatile.md
report_pattern = re.compile(r'report_(?P<date>\d{8})_(?P<time>\d{6})_?(?P<model>.*?)?\.md$')
# Get the root directory
root_dir = Path(__file__).parent.parent
# Find all report files in the root directory
migrated_count = 0
for file_path in root_dir.glob('report_*.md'):
if not file_path.is_file():
continue
# Extract information from the filename
match = report_pattern.match(file_path.name)
if not match:
continue
date_str = match.group('date')
time_str = match.group('time')
model = match.group('model') or 'unknown'
# Format date for directory structure (YYYY-MM-DD)
try:
year = date_str[:4]
month = date_str[4:6]
day = date_str[6:8]
formatted_date = f"{year}-{month}-{day}"
# Create timestamp for metadata
timestamp = f"{year}-{month}-{day} {time_str[:2]}:{time_str[2:4]}:{time_str[4:6]}"
creation_date = datetime.strptime(timestamp, "%Y-%m-%d %H:%M:%S").isoformat()
except ValueError:
# If date parsing fails, use current date
formatted_date = datetime.now().strftime("%Y-%m-%d")
creation_date = datetime.now().isoformat()
# Create directory for the date if it doesn't exist
date_dir = self.reports_dir / formatted_date
date_dir.mkdir(exist_ok=True)
# Generate a unique report ID
report_id = f"{date_str}_{time_str}"
# Copy the file to the new location
new_file_path = date_dir / file_path.name
shutil.copy2(file_path, new_file_path)
# Read the report content to extract query if possible
query = ""
try:
with open(file_path, 'r', encoding='utf-8') as f:
content = f.read(1000) # Read just the beginning to find the query
# Try to extract query from title or first few lines
title_match = re.search(r'#\s*(.+?)\n', content)
if title_match:
query = title_match.group(1).strip()
else:
# Just use the first line as query
query = content.split('\n')[0].strip()
except Exception as e:
print(f"Error reading file {file_path}: {e}")
# Create metadata for the report
file_size = os.path.getsize(file_path)
metadata = {
"id": report_id,
"query": query,
"model": model,
"detail_level": "unknown", # We don't know the detail level from the filename
"timestamp": timestamp,
"creation_date": creation_date,
"file_path": str(new_file_path),
"file_size": file_size
}
# Update the metadata file
self._update_report_metadata(report_id, metadata)
migrated_count += 1
return f"Migrated {migrated_count} existing reports to the new directory structure."
def create_interface(self):
"""
Create and return the Gradio interface.
Returns:
gr.Blocks: The Gradio interface
"""
with gr.Blocks(title="Intelligent Research System") as interface:
gr.Markdown("# Intelligent Research System")
gr.Markdown(
"""
This system helps you research topics by searching across multiple sources
including Google (via Serper), Google Scholar, arXiv, and news sources.
You can either search for results or generate a comprehensive report.
**Special Capabilities:**
- Automatically detects and optimizes current events queries
- Specialized search handlers for different types of information
- Semantic ranking for the most relevant results
"""
)
# Create tabs for different sections
with gr.Tabs() as tabs:
# Report Generation Tab
with gr.TabItem("Generate Report"):
with gr.Row():
with gr.Column(scale=4):
report_query_input = gr.Textbox(
label="Research Query",
placeholder="Enter your research question here...",
lines=3
)
with gr.Column(scale=1):
report_detail_level = gr.Dropdown(
choices=["brief", "standard", "detailed", "comprehensive"],
value="standard",
label="Detail Level",
info="Controls the depth and breadth of the report"
)
report_query_type = gr.Dropdown(
choices=["auto-detect", "factual", "exploratory", "comparative", "code"],
value="auto-detect",
label="Query Type",
info="Type of query determines the report structure"
)
model_descriptions = self.get_model_descriptions()
report_custom_model = gr.Dropdown(
choices=list(self.model_name_to_description.keys()),
value=None,
label="Custom Model (Optional)",
info="Select a custom model for report generation"
)
with gr.Row():
with gr.Column():
gr.Markdown("### Advanced Settings")
with gr.Row():
with gr.Column():
with gr.Accordion("Search Parameters", open=False):
with gr.Row():
initial_results_slider = gr.Slider(
minimum=5,
maximum=50,
value=10,
step=5,
label="Initial Results Per Engine",
info="Number of results to fetch from each search engine"
)
final_results_slider = gr.Slider(
minimum=3,
maximum=30,
value=7,
step=1,
label="Final Results After Reranking",
info="Number of results to keep after reranking"
)
with gr.Accordion("Processing Options", open=False):
with gr.Row():
report_process_thinking = gr.Checkbox(
label="Process Thinking Tags",
value=False,
info="Process <thinking> tags in model output"
)
with gr.Row():
report_button = gr.Button("Generate Report", variant="primary", size="lg")
# Note: We've removed the redundant progress indicators here
# The built-in Gradio progress tracking (gr.Progress) is used instead
# This is passed to the generate_report method and handles progress updates
gr.Examples(
examples=[
["What are the latest advancements in quantum computing?"],
["Compare transformer and RNN architectures for NLP tasks"],
["Explain the environmental impact of electric vehicles"],
["Explain the potential relationship between creatine supplementation and muscle loss due to GLP1-ar drugs for weight loss."],
["What recent actions has Trump taken regarding tariffs?"],
["What are the recent papers on large language model alignment?"],
["What are the main research findings on climate change adaptation strategies in agriculture?"]
],
inputs=report_query_input
)
with gr.Row():
with gr.Column():
report_output = gr.Markdown(label="Generated Report")
with gr.Row():
with gr.Column():
report_file_output = gr.Textbox(
label="Report saved to file",
interactive=False
)
# Add information about detail levels and query types
detail_levels_info = ""
for level, description in self.detail_level_manager.get_available_detail_levels():
detail_levels_info += f"- **{level}**: {description}\n"
query_types_info = """
- **auto-detect**: Automatically determine the query type based on the query text
- **factual**: For queries seeking specific information (e.g., "What is...", "How does...")
- **exploratory**: For queries investigating a topic broadly (e.g., "Tell me about...")
- **comparative**: For queries comparing multiple items (e.g., "Compare X and Y", "Differences between...")
- **code**: For queries related to programming, software development, or technical implementation
"""
gr.Markdown(f"### Detail Levels\n{detail_levels_info}")
gr.Markdown(f"### Query Types\n{query_types_info}")
# Report Management Tab - Reimplemented from scratch
with gr.TabItem("Manage Reports"):
with gr.Row():
gr.Markdown("## Report Management")
with gr.Row():
gr.Markdown("Select reports to download or delete. You can filter and sort the reports using the table controls.")
# Get the reports data
reports_data = self._get_reports_for_display()
# Create a state to store selected report IDs
selected_report_ids = gr.State([])
# We've removed the DataTable as requested by the user
# Selection controls
with gr.Row():
with gr.Column(scale=2):
# Create a checkbox group for selecting reports
report_choices = self._get_report_choices(reports_data)
reports_checkbox_group = gr.CheckboxGroup(
choices=report_choices,
label="Select Reports",
info="Check the reports you want to download or delete",
interactive=True
)
with gr.Column(scale=1):
# Action buttons
with gr.Row():
refresh_button = gr.Button("Refresh List", size="sm")
with gr.Row():
select_all_button = gr.Button("Select All", size="sm")
clear_selection_button = gr.Button("Clear Selection", size="sm")
with gr.Row():
download_button = gr.Button("Download Selected", size="sm")
delete_button = gr.Button("Delete Selected", variant="stop", size="sm")
with gr.Row():
cleanup_days = gr.Slider(
minimum=0,
maximum=90,
value=30,
step=1,
label="Delete Reports Older Than (Days)",
info="Set to 0 to disable automatic cleanup"
)
cleanup_button = gr.Button("Clean Up Old Reports", size="sm")
# File download component
with gr.Row():
file_output = gr.File(
label="Downloaded Reports",
file_count="multiple",
type="filepath",
interactive=False
)
# Status message
with gr.Row():
status_message = gr.Markdown("")
# Migration button for existing reports
with gr.Row():
with gr.Column():
gr.Markdown("### Migrate Existing Reports")
gr.Markdown("Use this button to migrate existing reports from the root directory to the new reports directory structure.")
migrate_button = gr.Button("Migrate Existing Reports", variant="primary")
# Set up event handlers
# Update the progress tracking in the generate_report method
async def generate_report_with_progress(query, detail_level, query_type, model_name, process_thinking, initial_results, final_results):
# Set up progress tracking
progress_data = gr.Progress(track_tqdm=True)
# Debug the model selection
print(f"Model selected from UI dropdown: {model_name}")
# Call the original generate_report method
result = await self.generate_report(
query,
detail_level,
query_type,
model_name,
None, # results_file is now None since we removed the search tab
process_thinking,
initial_results,
final_results
)
return result
report_button.click(
fn=lambda q, d, t, m, p, i, f: asyncio.run(generate_report_with_progress(q, d, t, m, p, i, f)),
inputs=[report_query_input, report_detail_level, report_query_type, report_custom_model,
report_process_thinking, initial_results_slider, final_results_slider],
outputs=[report_output, report_file_output]
)
# Report Management Tab Event Handlers
# Refresh reports list
def refresh_reports_list():
"""Refresh the reports list and update the UI components"""
reports_data = self._get_reports_for_display()
report_choices = self._get_report_choices(reports_data)
return reports_data, report_choices, "Reports list refreshed."
refresh_button.click(
fn=refresh_reports_list,
inputs=[],
outputs=[reports_checkbox_group, reports_checkbox_group, status_message]
)
# Select all reports
def select_all_reports():
"""Select all reports in the checkbox group"""
report_choices = self._get_report_choices(self._get_reports_for_display())
return report_choices, "Selected all reports."
select_all_button.click(
fn=select_all_reports,
inputs=[],
outputs=[reports_checkbox_group, status_message]
)
# Clear selection
def clear_selection():
"""Clear the selection in the checkbox group"""
return [], "Selection cleared."
clear_selection_button.click(
fn=clear_selection,
inputs=[],
outputs=[reports_checkbox_group, status_message]
)
# Download selected reports
def download_selected_reports(selected_choices):
"""Download selected reports"""
if not selected_choices:
return [], "No reports selected for download."
print(f"Selected choices for download: {selected_choices}")
files = self._download_selected_reports(selected_choices)
if files:
return files, f"Prepared {len(files)} report(s) for download."
else:
return [], "No files found for the selected reports."
download_button.click(
fn=download_selected_reports,
inputs=[reports_checkbox_group],
outputs=[file_output, status_message]
)
# Delete selected reports
def delete_selected_reports(selected_choices):
"""Delete selected reports and update the UI"""
if not selected_choices:
return self._get_reports_for_display(), [], "No reports selected for deletion."
print(f"Selected choices for deletion: {selected_choices}")
# Extract report IDs from selected choices
selected_report_ids = []
for choice in selected_choices:
try:
# Split at the first colon to get the ID
if ':' in choice:
report_id = choice.split(':', 1)[0].strip()
selected_report_ids.append(report_id)
else:
# If no colon, use the entire string as ID
selected_report_ids.append(choice)
except Exception as e:
print(f"Error processing choice {choice}: {e}")
# Delete selected reports
deleted_count = 0
for report_id in selected_report_ids:
if self.delete_report(report_id):
deleted_count += 1
# Refresh the table and choices
updated_reports_data = self._get_reports_for_display()
updated_choices = self._get_report_choices(updated_reports_data)
return updated_choices, f"Deleted {deleted_count} report(s)."
delete_button.click(
fn=delete_selected_reports,
inputs=[reports_checkbox_group],
outputs=[reports_checkbox_group, status_message]
)
# Clean up old reports
def cleanup_old_reports(days):
"""Delete reports older than the specified number of days"""
if days <= 0:
return self._get_reports_for_display(), self._get_report_choices(self._get_reports_for_display()), "Cleanup skipped - days parameter is 0 or negative."
updated_reports_data = self._cleanup_old_reports(days)
updated_choices = self._get_report_choices(updated_reports_data)
return updated_reports_data, updated_choices, f"Reports older than {days} days have been deleted."
cleanup_button.click(
fn=cleanup_old_reports,
inputs=[cleanup_days],
outputs=[reports_checkbox_group, status_message]
)
# Migration button event handler
def migrate_existing_reports():
"""Migrate existing reports from the root directory to the reports directory structure"""
print("Starting migration of existing reports...")
status = self.migrate_existing_reports()
print("Migration completed, refreshing display...")
# Refresh the reports list
updated_reports_data = self._get_reports_for_display()
updated_choices = self._get_report_choices(updated_reports_data)
return status, updated_reports_data, updated_choices
migrate_button.click(
fn=migrate_existing_reports,
inputs=[],
outputs=[status_message, reports_checkbox_group]
)
# Initialize the UI on page load
def init_reports_ui():
"""Initialize the reports UI with current data"""
print("Initializing reports UI...")
reports_data = self._get_reports_for_display()
choices = self._get_report_choices(reports_data)
print(f"Initializing reports UI with {len(reports_data)} reports and {len(choices)} choices")
return choices, "Reports management initialized successfully."
interface.load(
fn=init_reports_ui,
inputs=[],
outputs=[reports_checkbox_group, status_message]
)
return interface
def launch(self, **kwargs):
"""
Launch the Gradio interface.
Args:
**kwargs: Keyword arguments to pass to gr.Interface.launch()
"""
interface = self.create_interface()
interface.launch(**kwargs)
def main():
    """Entry point: build the interface, run async initialization, launch the UI."""
    # Create interface and initialize async components
    interface = GradioInterface()

    # asyncio.run() replaces the deprecated get_event_loop() /
    # run_until_complete pattern (DeprecationWarning since Python 3.10).
    asyncio.run(interface.async_init())

    # share=True exposes a public Gradio link in addition to the local server.
    interface.launch(share=True)


if __name__ == "__main__":
    main()