# ira/execution/api_handlers/arxiv_handler.py

"""
arXiv API handler.
Uses the official arXiv API to search for academic papers.
"""

import os
import json
import requests
import urllib.parse
import xml.etree.ElementTree as ET
from datetime import datetime
from typing import Dict, List, Any, Optional

from .base_handler import BaseSearchHandler
from config.config import get_config


class ArxivSearchHandler(BaseSearchHandler):
    """Handler for arXiv Search using the official API.

    Sends a query to the arXiv export API, parses the Atom feed that comes
    back and converts every entry into a plain result dictionary.
    """

    # XML namespaces used when walking the Atom feed.
    _NAMESPACES = {
        'atom': 'http://www.w3.org/2005/Atom',
        'arxiv': 'http://arxiv.org/schemas/atom',
    }

    # Values forwarded verbatim as the ``sortBy`` / ``sortOrder`` parameters.
    _SORT_FIELDS = ("relevance", "lastUpdatedDate", "submittedDate")
    _SORT_ORDERS = ("ascending", "descending")

    # Seconds to wait for the API before giving up on a request.
    _DEFAULT_TIMEOUT = 30

    # Maximum number of summary characters kept in the "snippet" field.
    _SNIPPET_LENGTH = 200

    def __init__(self):
        """Initialize the arXiv search handler."""
        self.config = get_config()
        self.base_url = "http://export.arxiv.org/api/query"
        self.available = True  # arXiv API is freely available without an API key

    def search(self, query: str, num_results: int = 10, **kwargs) -> List[Dict[str, Any]]:
        """
        Execute an arXiv search query.

        Args:
            query: The search query to execute
            num_results: Number of results to return
            **kwargs: Additional search parameters:
                - start: Index of the first result to return (default 0)
                - sort_by: Sort order ("relevance", "lastUpdatedDate", "submittedDate");
                  any other value leaves the sort field unset
                - sort_order: Sort direction ("ascending", "descending");
                  any other value leaves the sort direction unset
                - categories: List of arXiv categories to search within
                  (a single category string is also accepted)
                - timeout: Seconds to wait for the HTTP response (default 30)
                - date_range: Accepted for interface compatibility but
                  currently NOT applied; no date filtering is performed

        Returns:
            List of search results with standardized format. An empty list
            is returned when the request fails or the response is not
            well-formed XML (the error is printed).
        """
        params = {
            "search_query": self._build_search_query(query, kwargs.get("categories")),
            "max_results": num_results,
            "start": kwargs.get("start", 0),
        }

        # Only forward sort options that are known values.
        sort_by = kwargs.get("sort_by", "relevance")
        if sort_by in self._SORT_FIELDS:
            params["sortBy"] = sort_by

        sort_order = kwargs.get("sort_order", "descending")
        if sort_order in self._SORT_ORDERS:
            params["sortOrder"] = sort_order

        try:
            # A timeout keeps a stalled connection from blocking the caller
            # forever; a timeout error is a RequestException and is handled
            # like any other request failure.
            response = requests.get(
                self.base_url,
                params=params,
                timeout=kwargs.get("timeout", self._DEFAULT_TIMEOUT),
            )
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"Error executing arXiv search: {e}")
            return []

        try:
            root = ET.fromstring(response.content)
        except ET.ParseError as e:
            print(f"Error parsing arXiv response: {e}")
            return []

        return [
            self._parse_entry(entry)
            for entry in root.findall('.//atom:entry', self._NAMESPACES)
        ]

    @staticmethod
    def _build_search_query(query: str, categories: Any = None) -> str:
        """Combine the user query with an optional category filter.

        Operators are separated by plain spaces: the query is passed to
        ``requests`` via ``params``, which encodes spaces for us, whereas a
        literal ``+`` would be percent-encoded and reach the API as a plus
        sign instead of a separator.
        """
        if not categories:
            return query
        if isinstance(categories, str):
            # Guard against iterating a lone category character by character.
            categories = [categories]

        category_filter = " OR ".join(f"cat:{cat}" for cat in categories)
        if not query or not query.strip():
            return category_filter
        # Parenthesize both sides so an OR inside the user query cannot
        # change how the category restriction is applied.
        return f"({query}) AND ({category_filter})"

    @staticmethod
    def _extract_arxiv_id(entry_id: str) -> str:
        """Return the arXiv identifier contained in an entry's ``<id>`` text.

        Everything after ``/abs/`` is kept so that old-style identifiers
        such as ``hep-th/9901001v1`` retain their archive prefix. When the
        marker is absent the last path segment is used.
        """
        _, marker, identifier = entry_id.partition("/abs/")
        if marker:
            return identifier
        return entry_id.rsplit("/", 1)[-1]

    @classmethod
    def _entry_text(cls, entry: ET.Element, path: str) -> str:
        """Return the stripped text at ``path``, or "" if missing or empty."""
        return (entry.findtext(path, default="", namespaces=cls._NAMESPACES) or "").strip()

    @classmethod
    def _parse_entry(cls, entry: ET.Element) -> Dict[str, Any]:
        """Convert one Atom ``<entry>`` element into a result dictionary.

        Missing or empty child elements produce empty strings / lists
        instead of raising.
        """
        ns = cls._NAMESPACES

        # Titles may be wrapped over several lines in the feed; collapse
        # the whitespace so the title is a single line.
        title = " ".join(cls._entry_text(entry, 'atom:title').split())
        summary = cls._entry_text(entry, 'atom:summary')
        entry_id = cls._entry_text(entry, 'atom:id')

        authors = [
            name.text.strip()
            for name in entry.findall('atom:author/atom:name', ns)
            if name.text and name.text.strip()
        ]

        # Several links can share rel="related", so the PDF link is
        # recognised by its title / MIME type rather than by rel alone.
        page_url = ""
        pdf_url = ""
        for link in entry.findall('atom:link', ns):
            href = link.get('href', '')
            if link.get('title') == 'pdf' or link.get('type') == 'application/pdf':
                pdf_url = href
            elif link.get('rel') == 'alternate':
                page_url = href

        # Primary category first, then the remaining ones without duplicates.
        categories: List[str] = []
        category_elements = (
            entry.findall('arxiv:primary_category', ns)
            + entry.findall('atom:category', ns)
        )
        for element in category_elements:
            term = element.get('term', '')
            if term and term not in categories:
                categories.append(term)

        if len(summary) > cls._SNIPPET_LENGTH:
            snippet = summary[:cls._SNIPPET_LENGTH] + "..."
        else:
            snippet = summary

        return {
            "title": title,
            # Fall back to the entry id when no "alternate" link is present.
            "url": page_url or entry_id,
            "pdf_url": pdf_url,
            "snippet": snippet,
            "source": "arxiv",
            "arxiv_id": cls._extract_arxiv_id(entry_id),
            "authors": authors,
            "categories": categories,
            "published_date": cls._entry_text(entry, 'atom:published'),
            "updated_date": cls._entry_text(entry, 'atom:updated'),
            "full_text": summary,
        }

    def get_name(self) -> str:
        """Get the name of the search handler."""
        return "arxiv"

    def is_available(self) -> bool:
        """Check if the arXiv API is available."""
        return self.available

    def get_rate_limit_info(self) -> Dict[str, Any]:
        """Get information about the API's rate limits."""
        return {
            # arXiv recommends no more than 1 request per 3 seconds,
            # i.e. 60 / 3 = 20 requests per minute.
            "requests_per_minute": 20,
            "requests_per_day": 2000,   # This is an estimate
            "current_usage": None       # arXiv doesn't provide usage info in responses
        }