# ira/test_search_execution.py
"""
Test script for the search execution module.
"""
import os
import json
import time
from typing import Dict, List, Any, Optional
# Import the necessary modules
try:
from query.query_processor import get_query_processor, QueryProcessor
from query.llm_interface import get_llm_interface
from execution.search_executor import SearchExecutor
from execution.result_collector import ResultCollector
except ImportError as e:
print(f"Import error: {e}")
print("Make sure all required modules are installed and available.")
exit(1)
def get_query_processor(model_name: str = "llama-3.1-8b-instant"):
    """Get a query processor backed by a configured LLM interface.

    The shared LLM interface is configured first so that the query
    processor created afterwards picks up that configuration.

    Args:
        model_name: Name of the LLM model to configure. Defaults to the
            Groq ``llama-3.1-8b-instant`` model these tests were written
            against, so existing callers are unaffected.

    Returns:
        The query processor instance produced by the project factory.
    """
    # First set the LLM interface to use the requested model.
    from query.llm_interface import get_llm_interface
    get_llm_interface(model_name=model_name)
    # Then get the query processor, which will use the configured LLM
    # interface. The local import intentionally rebinds the module-level
    # name inside this wrapper.
    from query.query_processor import get_query_processor
    return get_query_processor()
def test_search_execution(query: str, search_engines: Optional[List[str]] = None) -> Dict[str, Any]:
    """
    Test the search execution module end to end.

    Processes the query, executes it against the selected search engines,
    deduplicates the collected results, and prints a human-readable summary.

    Args:
        query: The query to process and execute
        search_engines: List of search engines to use (if None, use all available)

    Returns:
        Dictionary with test results
    """
    print(f"Testing search execution for query: {query}")

    # Process the query into a structured form.
    processor = get_query_processor()
    start_time = time.time()
    structured_query = processor.process_query(query)
    query_time = time.time() - start_time
    print(f"Query processed in {query_time:.2f} seconds")
    print(f"Enhanced query: {structured_query.get('enhanced_query', '')}")
    print(f"Classification: {structured_query.get('classification', {})}")

    # Execute the search.
    executor = SearchExecutor()
    # Get available search engines if none specified.
    if search_engines is None:
        search_engines = executor.get_available_search_engines()
        print(f"Using available search engines: {search_engines}")
    start_time = time.time()
    search_results = executor.execute_search(structured_query, search_engines=search_engines)
    search_time = time.time() - start_time
    print(f"Search executed in {search_time:.2f} seconds")

    # Print raw search results for debugging.
    print("\nRaw search results:")
    for engine, results in search_results.items():
        print(f" {engine}: {len(results)} results")
        if results:
            print(f" Sample result: {results[0]}")

    # Deduplicate and normalize the results.
    collector = ResultCollector()
    processed_results = collector.process_results(search_results, dedup=True)

    total_results = len(processed_results)
    print(f"Found {total_results} results after deduplication")

    # Tally results per originating engine; .get guards against results
    # that lack a "source" key.
    results_by_source: Dict[str, int] = {}
    for result in processed_results:
        source = result.get("source", "unknown")
        results_by_source[source] = results_by_source.get(source, 0) + 1
    print("Results by source:")
    for source, count in results_by_source.items():
        print(f" {source}: {count}")

    # Print top 3 results. Use .get with defaults so a malformed result
    # (missing title/url/snippet) cannot crash the whole test run.
    if processed_results:
        print("\nTop 3 results:")
        for i, result in enumerate(processed_results[:3]):
            print(f" {i+1}. {result.get('title', '')}")
            print(f" URL: {result.get('url', '')}")
            print(f" Snippet: {result.get('snippet', '')[:100]}...")
            print()

    # Return test results.
    return {
        "query": query,
        "structured_query": structured_query,
        "search_engines": search_engines,
        "raw_results": search_results,
        "processed_results": processed_results,
        "timing": {
            "query_processing": query_time,
            "search_execution": search_time,
            "total": query_time + search_time
        },
        "summary": {
            "total_results": total_results,
            "results_by_source": results_by_source
        }
    }
def save_test_results(results: Dict[str, Any], file_path: str) -> None:
    """
    Save test results to a JSON file.

    Args:
        results: Test results to save
        file_path: Path to save results to
    """
    try:
        with open(file_path, 'w', encoding='utf-8') as f:
            json.dump(results, f, indent=2)
        print(f"Test results saved to {file_path}")
    except (OSError, TypeError, ValueError) as e:
        # OSError: the file cannot be written; TypeError/ValueError: the
        # results contain values json cannot serialize. Saving is
        # best-effort, so report the problem and continue.
        print(f"Error saving test results: {e}")
def mock_test():
    """Run a mock test without actual API calls."""
    print("Running mock test without API calls...")

    # Canned structured query, shaped like the query processor's output.
    fake_structured_query = {
        "original_query": "What are the latest advancements in quantum computing?",
        "enhanced_query": "Explore the most recent breakthroughs and developments in quantum computing technology, including hardware innovations, quantum algorithms, and potential applications.",
        "classification": {
            "type": "exploratory",
            "intent": "research",
            "entities": ["quantum computing", "advancements", "technology"]
        },
        "search_queries": {
            "google": "latest advancements in quantum computing 2025 breakthroughs",
            "scholar": "recent quantum computing developments research papers",
            "arxiv": "quantum computing hardware algorithms applications"
        }
    }

    # Canned search results keyed by engine name.
    fake_results = {
        "google": [
            {
                "title": "Quantum Computing Breakthrough: New Qubit Design Achieves 99.9% Fidelity",
                "url": "https://example.com/quantum-breakthrough",
                "snippet": "Researchers at MIT have developed a new qubit design that achieves 99.9% fidelity, a major step toward practical quantum computing.",
                "position": 1
            },
            {
                "title": "IBM Unveils 1000-Qubit Quantum Computer",
                "url": "https://example.com/ibm-quantum",
                "snippet": "IBM has announced its latest quantum computer featuring 1000 qubits, significantly expanding computational capabilities.",
                "position": 2
            }
        ],
        "arxiv": [
            {
                "title": "Quantum Error Correction Using Surface Codes",
                "url": "https://arxiv.org/abs/2301.12345",
                "snippet": "This paper presents a new approach to quantum error correction using surface codes that improves error tolerance by an order of magnitude.",
                "authors": ["Smith, J.", "Johnson, A."],
                "published_date": "2025-01-15",
                "position": 1
            }
        ]
    }

    # Run the canned results through the real collector for deduplication.
    collector = ResultCollector()
    deduped = collector.process_results(fake_results, dedup=True)

    result_count = len(deduped)
    print(f"Found {result_count} mock results after deduplication")

    # Tally deduplicated results per source engine.
    per_source = {}
    for item in deduped:
        origin = item.get("source", "unknown")
        per_source[origin] = per_source.get(origin, 0) + 1
    print("Results by source:")
    for origin, count in per_source.items():
        print(f" {origin}: {count}")

    # Show the first three deduplicated results, if any.
    if deduped:
        print("\nTop 3 results:")
        for rank, item in enumerate(deduped[:3], start=1):
            print(f" {rank}. {item['title']}")
            print(f" URL: {item['url']}")
            print(f" Snippet: {item['snippet'][:100]}...")
            print()

    # Return mock test results in the same shape as test_search_execution.
    return {
        "query": "What are the latest advancements in quantum computing?",
        "structured_query": fake_structured_query,
        "search_engines": ["google", "arxiv"],
        "raw_results": fake_results,
        "processed_results": deduped,
        "timing": {
            "query_processing": 0.5,
            "search_execution": 1.2,
            "total": 1.7
        },
        "summary": {
            "total_results": result_count,
            "results_by_source": per_source
        }
    }
def main():
    """Main function."""
    # Queries exercised by the test run.
    queries = [
        "What are the latest advancements in quantum computing?",
        "Compare blockchain and traditional databases for financial applications",
        "Explain the implications of blockchain technology in finance"
    ]

    collected = {}
    banner = "=" * 80
    for current_query in queries:
        print("\n" + banner)
        print(f"Testing query: {current_query}")
        print(banner)
        # Test with all available search engines and keep the result
        # keyed by the query text.
        collected[current_query] = test_search_execution(current_query)
        print("\n")

    # Persist everything gathered during the run.
    save_test_results(collected, "search_execution_test_results.json")
if __name__ == "__main__":
main()