# ira/test_search_execution.py
"""
Test script for the search execution module.
"""
import os
import json
import time
from typing import Dict, List, Any, Optional
# Import the necessary modules
try:
from query.query_processor import get_query_processor, QueryProcessor
from query.llm_interface import get_llm_interface
from execution.search_executor import SearchExecutor
from execution.result_collector import ResultCollector
except ImportError as e:
print(f"Import error: {e}")
print("Make sure all required modules are installed and available.")
exit(1)
def get_query_processor(model_name: str = "llama-3.1-8b-instant"):
    """Get a query processor backed by a configured LLM interface.

    The shared LLM interface is configured first so that the query
    processor created afterwards picks up that configuration.

    Args:
        model_name: Name of the LLM model to configure. Defaults to the
            Groq ``llama-3.1-8b-instant`` model these tests were written
            against, so existing callers are unaffected.

    Returns:
        The query processor instance produced by the project factory.
    """
    # First set the LLM interface to use the requested model.
    from query.llm_interface import get_llm_interface
    get_llm_interface(model_name=model_name)
    # Then get the query processor, which will use the configured LLM
    # interface. The local import intentionally rebinds the module-level
    # name inside this wrapper.
    from query.query_processor import get_query_processor
    return get_query_processor()
def test_search_execution(query: str, search_engines: Optional[List[str]] = None) -> Dict[str, Any]:
    """
    Test the search execution module end to end.

    Processes the query, executes it against the selected search engines,
    deduplicates the collected results, and prints a human-readable summary.

    Args:
        query: The query to process and execute
        search_engines: List of search engines to use (if None, use all available)

    Returns:
        Dictionary with test results
    """
    print(f"Testing search execution for query: {query}")

    # Process the query into a structured form.
    processor = get_query_processor()
    start_time = time.time()
    structured_query = processor.process_query(query)
    query_time = time.time() - start_time
    print(f"Query processed in {query_time:.2f} seconds")
    print(f"Enhanced query: {structured_query.get('enhanced_query', '')}")
    print(f"Classification: {structured_query.get('classification', {})}")

    # Execute the search.
    executor = SearchExecutor()
    # Get available search engines if none specified.
    if search_engines is None:
        search_engines = executor.get_available_search_engines()
        print(f"Using available search engines: {search_engines}")
    start_time = time.time()
    search_results = executor.execute_search(structured_query, search_engines=search_engines)
    search_time = time.time() - start_time
    print(f"Search executed in {search_time:.2f} seconds")

    # Print raw search results for debugging.
    print("\nRaw search results:")
    for engine, results in search_results.items():
        print(f" {engine}: {len(results)} results")
        if results:
            print(f" Sample result: {results[0]}")

    # Deduplicate and normalize the results.
    collector = ResultCollector()
    processed_results = collector.process_results(search_results, dedup=True)

    total_results = len(processed_results)
    print(f"Found {total_results} results after deduplication")

    # Tally results per originating engine; .get guards against results
    # that lack a "source" key.
    results_by_source: Dict[str, int] = {}
    for result in processed_results:
        source = result.get("source", "unknown")
        results_by_source[source] = results_by_source.get(source, 0) + 1
    print("Results by source:")
    for source, count in results_by_source.items():
        print(f" {source}: {count}")

    # Print top 3 results. Use .get with defaults so a malformed result
    # (missing title/url/snippet) cannot crash the whole test run.
    if processed_results:
        print("\nTop 3 results:")
        for i, result in enumerate(processed_results[:3]):
            print(f" {i+1}. {result.get('title', '')}")
            print(f" URL: {result.get('url', '')}")
            print(f" Snippet: {result.get('snippet', '')[:100]}...")
            print()

    # Return test results.
    return {
        "query": query,
        "structured_query": structured_query,
        "search_engines": search_engines,
        "raw_results": search_results,
        "processed_results": processed_results,
        "timing": {
            "query_processing": query_time,
            "search_execution": search_time,
            "total": query_time + search_time
        },
        "summary": {
            "total_results": total_results,
            "results_by_source": results_by_source
        }
    }
def save_test_results(results: Dict[str, Any], file_path: str) -> None:
    """
    Save test results to a JSON file.

    Args:
        results: Test results to save
        file_path: Path to save results to
    """
    try:
        with open(file_path, 'w', encoding='utf-8') as f:
            json.dump(results, f, indent=2)
        print(f"Test results saved to {file_path}")
    except (OSError, TypeError, ValueError) as e:
        # OSError: the file cannot be written; TypeError/ValueError: the
        # results contain values json cannot serialize. Saving is
        # best-effort, so report the problem and continue.
        print(f"Error saving test results: {e}")
def mock_test():
    """Run a mock test without actual API calls."""
    print("Running mock test without API calls...")

    # Canned structured query, shaped like the query processor's output.
    fake_structured_query = {
        "original_query": "What are the latest advancements in quantum computing?",
        "enhanced_query": "Explore the most recent breakthroughs and developments in quantum computing technology, including hardware innovations, quantum algorithms, and potential applications.",
        "classification": {
            "type": "exploratory",
            "intent": "research",
            "entities": ["quantum computing", "advancements", "technology"]
        },
        "search_queries": {
            "google": "latest advancements in quantum computing 2025 breakthroughs",
            "scholar": "recent quantum computing developments research papers",
            "arxiv": "quantum computing hardware algorithms applications"
        }
    }

    # Canned search results keyed by engine name.
    fake_results = {
        "google": [
            {
                "title": "Quantum Computing Breakthrough: New Qubit Design Achieves 99.9% Fidelity",
                "url": "https://example.com/quantum-breakthrough",
                "snippet": "Researchers at MIT have developed a new qubit design that achieves 99.9% fidelity, a major step toward practical quantum computing.",
                "position": 1
            },
            {
                "title": "IBM Unveils 1000-Qubit Quantum Computer",
                "url": "https://example.com/ibm-quantum",
                "snippet": "IBM has announced its latest quantum computer featuring 1000 qubits, significantly expanding computational capabilities.",
                "position": 2
            }
        ],
        "arxiv": [
            {
                "title": "Quantum Error Correction Using Surface Codes",
                "url": "https://arxiv.org/abs/2301.12345",
                "snippet": "This paper presents a new approach to quantum error correction using surface codes that improves error tolerance by an order of magnitude.",
                "authors": ["Smith, J.", "Johnson, A."],
                "published_date": "2025-01-15",
                "position": 1
            }
        ]
    }

    # Run the canned results through the real collector for deduplication.
    collector = ResultCollector()
    deduped = collector.process_results(fake_results, dedup=True)

    result_count = len(deduped)
    print(f"Found {result_count} mock results after deduplication")

    # Tally deduplicated results per source engine.
    per_source = {}
    for item in deduped:
        origin = item.get("source", "unknown")
        per_source[origin] = per_source.get(origin, 0) + 1
    print("Results by source:")
    for origin, count in per_source.items():
        print(f" {origin}: {count}")

    # Show the first three deduplicated results, if any.
    if deduped:
        print("\nTop 3 results:")
        for rank, item in enumerate(deduped[:3], start=1):
            print(f" {rank}. {item['title']}")
            print(f" URL: {item['url']}")
            print(f" Snippet: {item['snippet'][:100]}...")
            print()

    # Return mock test results in the same shape as test_search_execution.
    return {
        "query": "What are the latest advancements in quantum computing?",
        "structured_query": fake_structured_query,
        "search_engines": ["google", "arxiv"],
        "raw_results": fake_results,
        "processed_results": deduped,
        "timing": {
            "query_processing": 0.5,
            "search_execution": 1.2,
            "total": 1.7
        },
        "summary": {
            "total_results": result_count,
            "results_by_source": per_source
        }
    }
def main():
    """Main function."""
    # Queries exercised by the test run.
    queries = [
        "What are the latest advancements in quantum computing?",
        "Compare blockchain and traditional databases for financial applications",
        "Explain the implications of blockchain technology in finance"
    ]

    collected = {}
    banner = "=" * 80
    for current_query in queries:
        print("\n" + banner)
        print(f"Testing query: {current_query}")
        print(banner)
        # Test with all available search engines and keep the result
        # keyed by the query text.
        collected[current_query] = test_search_execution(current_query)
        print("\n")

    # Persist everything gathered during the run.
    save_test_results(collected, "search_execution_test_results.json")
if __name__ == "__main__":
main()