"""
arXiv API handler.

Uses the official arXiv API to search for academic papers.
"""

import os
import json
import requests
import urllib.parse
import xml.etree.ElementTree as ET
from datetime import datetime
from typing import Dict, List, Any, Optional

from .base_handler import BaseSearchHandler
from config.config import get_config


class ArxivSearchHandler(BaseSearchHandler):
    """Handler for arXiv Search using the official API."""

    # Seconds to wait on the arXiv API before giving up; without a timeout
    # a stalled connection would block the caller indefinitely.
    _DEFAULT_TIMEOUT = 30

    # Values forwarded to the API as ``sortBy`` / ``sortOrder``; anything
    # else is silently ignored so the API falls back to its own default.
    _SORT_FIELDS = ("relevance", "lastUpdatedDate", "submittedDate")
    _SORT_ORDERS = ("ascending", "descending")

    # XML namespaces used in the Atom feed returned by the API.
    _NAMESPACES = {
        "atom": "http://www.w3.org/2005/Atom",
        "arxiv": "http://arxiv.org/schemas/atom",
    }

    # Maximum number of summary characters kept in the "snippet" field.
    _SNIPPET_LENGTH = 200

    def __init__(self):
        """Initialize the arXiv search handler."""
        self.config = get_config()
        self.base_url = "http://export.arxiv.org/api/query"
        self.available = True  # arXiv API is freely available without an API key

    def search(self, query: str, num_results: int = 10, **kwargs) -> List[Dict[str, Any]]:
        """
        Execute an arXiv search query.

        Args:
            query: The search query to execute
            num_results: Number of results to return
            **kwargs: Additional search parameters:
                - start: Index of the first result to return (default 0)
                - sort_by: Sort order ("relevance", "lastUpdatedDate", "submittedDate")
                - sort_order: Sort direction ("ascending", "descending")
                - categories: List of arXiv categories to search within
                  (a single category string is also accepted)
                - timeout: Seconds to wait for the HTTP response (default 30)
                - date_range: Accepted for interface compatibility but
                  currently NOT applied; no date filtering is performed.

        Returns:
            List of search results with standardized format. An empty list
            is returned when the request fails or the response is not
            well-formed XML.
        """
        # Set up the request parameters
        params = {
            "search_query": query,
            "max_results": num_results,
            "start": kwargs.get("start", 0),
        }

        # Add sorting parameters (unrecognized values are left out)
        sort_by = kwargs.get("sort_by", "relevance")
        if sort_by in self._SORT_FIELDS:
            params["sortBy"] = sort_by

        sort_order = kwargs.get("sort_order", "descending")
        if sort_order in self._SORT_ORDERS:
            params["sortOrder"] = sort_order

        # Add category filtering
        categories = kwargs.get("categories")
        if categories:
            if isinstance(categories, str):
                # A bare string would otherwise be iterated per character.
                categories = [categories]
            # Use plain spaces as separators: requests percent-encodes the
            # parameter values, so a literal "+" would reach arXiv as "%2B"
            # instead of acting as a separator.
            category_filter = " OR ".join(f"cat:{cat}" for cat in categories)
            params["search_query"] = f"{query} AND ({category_filter})"

        try:
            # Make the request
            response = requests.get(
                self.base_url,
                params=params,
                timeout=kwargs.get("timeout", self._DEFAULT_TIMEOUT),
            )
            response.raise_for_status()

            # Parse the XML response
            root = ET.fromstring(response.content)
        except requests.exceptions.RequestException as e:
            print(f"Error executing arXiv search: {e}")
            return []
        except ET.ParseError as e:
            print(f"Error parsing arXiv response: {e}")
            return []

        # Extract and standardize the results
        return [
            self._parse_entry(entry)
            for entry in root.findall(".//atom:entry", self._NAMESPACES)
        ]

    @classmethod
    def _find_text(cls, entry: ET.Element, path: str) -> str:
        """Return the stripped text of the first match for *path*, or ""."""
        node = entry.find(path, cls._NAMESPACES)
        if node is None or node.text is None:
            return ""
        return node.text.strip()

    @classmethod
    def _parse_entry(cls, entry: ET.Element) -> Dict[str, Any]:
        """Convert one Atom ``<entry>`` element into a standardized result dict."""
        ns = cls._NAMESPACES

        # Extract basic information; missing elements yield empty strings
        # instead of raising AttributeError.
        title = cls._find_text(entry, "./atom:title")
        summary = cls._find_text(entry, "./atom:summary")
        published = cls._find_text(entry, "./atom:published")
        updated = cls._find_text(entry, "./atom:updated")

        # Extract authors
        authors = []
        for author in entry.findall("./atom:author/atom:name", ns):
            name = (author.text or "").strip()
            if name:
                authors.append(name)

        # Extract links. Several links can share rel="related" (e.g. PDF and
        # DOI), so the PDF link is identified by its title attribute and the
        # rel-keyed lookup is kept only as a fallback.
        links_by_rel = {}
        pdf_url = ""
        for link in entry.findall("./atom:link", ns):
            href = link.get("href", "")
            links_by_rel[link.get("rel", "")] = href
            if link.get("title") == "pdf":
                pdf_url = href
        if not pdf_url:
            pdf_url = links_by_rel.get("related", "")

        # Extract arXiv-specific information. Old-style identifiers contain a
        # slash (e.g. "hep-ex/0307015v1"), so keep everything after "/abs/"
        # rather than only the last path segment.
        id_text = cls._find_text(entry, "./atom:id")
        if "/abs/" in id_text:
            arxiv_id = id_text.split("/abs/", 1)[1]
        else:
            arxiv_id = id_text.split("/")[-1]

        # Get categories: primary category first, then the rest, de-duplicated
        categories = [
            category.get("term", "")
            for category in entry.findall("./arxiv:primary_category", ns)
        ]
        for category in entry.findall("./atom:category", ns):
            cat_term = category.get("term", "")
            if cat_term and cat_term not in categories:
                categories.append(cat_term)

        # Format the result
        if len(summary) > cls._SNIPPET_LENGTH:
            snippet = summary[:cls._SNIPPET_LENGTH] + "..."
        else:
            snippet = summary

        return {
            "title": title,
            "url": links_by_rel.get("alternate", ""),
            "pdf_url": pdf_url,
            "snippet": snippet,
            "source": "arxiv",
            "arxiv_id": arxiv_id,
            "authors": authors,
            "categories": categories,
            "published_date": published,
            "updated_date": updated,
            "full_text": summary,
        }

    def get_name(self) -> str:
        """Get the name of the search handler."""
        return "arxiv"

    def is_available(self) -> bool:
        """Check if the arXiv API is available."""
        return self.available

    def get_rate_limit_info(self) -> Dict[str, Any]:
        """Get information about the API's rate limits."""
        # arXiv API rate limits
        return {
            # arXiv recommends no more than 1 request per 3 seconds,
            # i.e. 60 / 3 = 20 requests per minute.
            "requests_per_minute": 20,
            "requests_per_day": 2000,  # This is an estimate
            "current_usage": None,  # arXiv doesn't provide usage info in responses
        }