# ira/tests/integration/test_query_classification_s...

"""
Integration test for query classification and search execution.

This test demonstrates how the LLM-based query domain classification
affects the search engines selected for different types of queries.
"""

import os
import sys
import json
import asyncio
from typing import Dict, Any, List

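# Add the directory two levels above this file's directory to the import path
# so that the `query` and `execution` imports below can be resolved.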
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')))

from query.query_processor import get_query_processor
from execution.search_executor import get_search_executor


async def test_query_classification_search_integration():
    """Show how query classification affects search engine selection.

    For each sample query this coroutine:

    1. awaits ``query_processor.process_query(query)`` to obtain a structured
       query,
    2. reads and prints its classification fields (``domain``,
       ``domain_confidence``, ``is_academic``, ``is_code``,
       ``is_current_events``), falling back to ``'general'`` / ``0.0`` /
       ``False`` when a field is absent,
    3. awaits ``search_executor.execute_search(structured_query)`` and treats
       the keys of the returned mapping as the selected search engines.

    A summary of every query (classification fields, selected engines and the
    number of results per engine) is written as indented JSON to
    ``query_classification_search_results.json`` in the current working
    directory.

    The function contains no assertions; it is a reporting run whose output
    is meant to be inspected manually.
    """
    # Single source of truth for the report location (used for both the
    # write and the final status message).
    output_path = 'query_classification_search_results.json'

    query_processor = get_query_processor()
    search_executor = get_search_executor()

    # Test queries for different domains
    test_queries = [
        {
            "description": "Academic query about quantum computing",
            "query": "What are the latest theoretical advances in quantum computing algorithms?"
        },
        {
            "description": "Code query about implementing a neural network",
            "query": "How do I implement a convolutional neural network in TensorFlow?"
        },
        {
            "description": "Current events query about economic policy",
            "query": "What are the recent changes to Federal Reserve interest rates and their economic impact?"
        },
        {
            "description": "Mixed query with academic and code aspects",
            "query": "How are transformer models being implemented for natural language processing tasks?"
        }
    ]

    results = []

    for test_case in test_queries:
        query = test_case["query"]
        description = test_case["description"]

        print(f"\n=== Testing: {description} ===")
        print(f"Query: {query}")

        # Process the query
        structured_query = await query_processor.process_query(query)

        # Get domain classification results; defaults apply when the
        # processor did not populate a field.
        domain = structured_query.get('domain', 'general')
        domain_confidence = structured_query.get('domain_confidence', 0.0)
        is_academic = structured_query.get('is_academic', False)
        is_code = structured_query.get('is_code', False)
        is_current_events = structured_query.get('is_current_events', False)

        print(f"Domain: {domain} (confidence: {domain_confidence})")
        print(f"Is academic: {is_academic}")
        print(f"Is code: {is_code}")
        print(f"Is current events: {is_current_events}")

        # Execute search with default search engines based on classification
        search_results = await search_executor.execute_search(structured_query)

        # The keys of the returned mapping are reported as the selected engines.
        selected_engines = list(search_results.keys())
        print(f"Selected search engines: {selected_engines}")

        # Count results per engine. The loop variable is deliberately NOT
        # called ``results`` so it cannot be confused with the accumulator
        # list defined above.
        num_results_per_engine = {
            engine: len(engine_results)
            for engine, engine_results in search_results.items()
        }

        # Store the results
        results.append({
            "query": query,
            "description": description,
            "domain": domain,
            "domain_confidence": domain_confidence,
            "is_academic": is_academic,
            "is_code": is_code,
            "is_current_events": is_current_events,
            "selected_engines": selected_engines,
            "num_results_per_engine": num_results_per_engine,
        })

    # Save results to a file. The encoding is explicit so the report does not
    # depend on the platform's locale default.
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=2)

    print(f"\nResults saved to {output_path}")


if __name__ == "__main__":
    # Allow running this integration demo directly as a script: build the
    # coroutine first, then hand it to a fresh event loop.
    integration_run = test_query_classification_search_integration()
    asyncio.run(integration_run)