# Source path: ira/execution/api_handlers/arxiv_handler.py
# (Repository viewer listing: 163 lines, 6.1 KiB, Python)

"""
arXiv API handler.
Uses the official arXiv API to search for academic papers.
"""
import os
import json
import requests
import urllib.parse
import xml.etree.ElementTree as ET
from datetime import datetime
from typing import Dict, List, Any, Optional
from .base_handler import BaseSearchHandler
from config.config import get_config
class ArxivSearchHandler(BaseSearchHandler):
    """Handler for arXiv Search using the official API.

    Sends a GET request to the arXiv Atom query endpoint and converts every
    ``<entry>`` of the returned feed into a plain result dictionary.
    """

    # XML namespaces used by the Atom feed returned by the arXiv API.
    _NAMESPACES = {
        'atom': 'http://www.w3.org/2005/Atom',
        'arxiv': 'http://arxiv.org/schemas/atom',
    }
    # Values that are forwarded to the API; anything else is ignored.
    _SORT_FIELDS = ("relevance", "lastUpdatedDate", "submittedDate")
    _SORT_ORDERS = ("ascending", "descending")
    # Maximum number of summary characters kept in the "snippet" field.
    _SNIPPET_LENGTH = 200
    # Seconds to wait for the HTTP request unless the caller overrides it.
    _DEFAULT_TIMEOUT = 30

    def __init__(self):
        """Initialize the arXiv search handler."""
        self.config = get_config()
        self.base_url = "http://export.arxiv.org/api/query"
        self.available = True  # arXiv API is freely available without an API key

    def search(self, query: str, num_results: int = 10, **kwargs) -> List[Dict[str, Any]]:
        """
        Execute an arXiv search query.

        Args:
            query: The search query to execute
            num_results: Number of results to return
            **kwargs: Additional search parameters:
                - start: Zero-based offset of the first result (default 0)
                - sort_by: Sort order ("relevance", "lastUpdatedDate", "submittedDate")
                - sort_order: Sort direction ("ascending", "descending")
                - categories: arXiv category, or list of categories, to search within
                - timeout: Seconds to wait for the HTTP request (default 30)
                - date_range: Accepted for interface compatibility but
                  currently NOT applied to the query

        Returns:
            List of search results with standardized format. An empty list is
            returned when the HTTP request fails or the response is not
            well-formed XML (the error is printed).
        """
        params = self._build_params(query, num_results, **kwargs)

        try:
            # A timeout is always passed so a stalled connection cannot block
            # the caller forever; timeouts are RequestException subclasses.
            response = requests.get(
                self.base_url,
                params=params,
                timeout=kwargs.get("timeout", self._DEFAULT_TIMEOUT),
            )
            response.raise_for_status()
            root = ET.fromstring(response.content)
        except requests.exceptions.RequestException as e:
            print(f"Error executing arXiv search: {e}")
            return []
        except ET.ParseError as e:
            print(f"Error parsing arXiv response: {e}")
            return []

        return [
            self._parse_entry(entry)
            for entry in root.findall('.//atom:entry', self._NAMESPACES)
        ]

    def _build_params(self, query: str, num_results: int, **kwargs) -> Dict[str, Any]:
        """Build the query-string parameters for one API request."""
        params = {
            "search_query": query,
            "max_results": num_results,
            "start": kwargs.get("start", 0),
        }

        sort_by = kwargs.get("sort_by", "relevance")
        if sort_by in self._SORT_FIELDS:
            params["sortBy"] = sort_by

        sort_order = kwargs.get("sort_order", "descending")
        if sort_order in self._SORT_ORDERS:
            params["sortOrder"] = sort_order

        categories = kwargs.get("categories")
        if categories:
            if isinstance(categories, str):
                # A bare string would otherwise be iterated character by character.
                categories = [categories]
            # Plain spaces are used as separators: requests percent-encodes
            # the parameter values, so a literal "+" would reach the server
            # as "%2B" (a real plus sign) instead of a separator.
            category_filter = " OR ".join(f"cat:{cat}" for cat in categories)
            params["search_query"] = f"{query} AND ({category_filter})"

        return params

    def _parse_entry(self, entry: ET.Element) -> Dict[str, Any]:
        """Convert one Atom ``<entry>`` element into a standardized result dict."""
        ns = self._NAMESPACES

        # findtext() yields '' for a missing or empty element, so an
        # incomplete entry produces empty fields instead of AttributeError.
        title = entry.findtext('./atom:title', default='', namespaces=ns).strip()
        summary = entry.findtext('./atom:summary', default='', namespaces=ns).strip()
        published = entry.findtext('./atom:published', default='', namespaces=ns)
        updated = entry.findtext('./atom:updated', default='', namespaces=ns)

        authors = [
            name.text.strip()
            for name in entry.findall('./atom:author/atom:name', ns)
            if name.text
        ]

        # Several links can share rel="related" (e.g. PDF and DOI), so the PDF
        # link is recognised by its title / MIME type rather than by rel.
        page_url = ''
        pdf_url = ''
        for link in entry.findall('./atom:link', ns):
            href = link.get('href', '')
            if link.get('rel', '') == 'alternate':
                page_url = href
            elif link.get('title') == 'pdf' or link.get('type') == 'application/pdf':
                pdf_url = href

        # Keep everything after "/abs/" so old-style identifiers such as
        # "hep-ex/0307015v1" retain their archive prefix.
        entry_id = entry.findtext('./atom:id', default='', namespaces=ns).strip()
        _, marker, tail = entry_id.partition('/abs/')
        arxiv_id = tail if marker else entry_id.split('/')[-1]

        # Primary category first, then the remaining ones, without duplicates.
        terms = [
            node.get('term', '')
            for node in entry.findall('./arxiv:primary_category', ns)
        ]
        terms += [
            node.get('term', '')
            for node in entry.findall('./atom:category', ns)
        ]
        categories = list(dict.fromkeys(term for term in terms if term))

        if len(summary) > self._SNIPPET_LENGTH:
            snippet = summary[:self._SNIPPET_LENGTH] + "..."
        else:
            snippet = summary

        return {
            "title": title,
            "url": page_url,
            "pdf_url": pdf_url,
            "snippet": snippet,
            "source": "arxiv",
            "arxiv_id": arxiv_id,
            "authors": authors,
            "categories": categories,
            "published_date": published,
            "updated_date": updated,
            "full_text": summary,
        }

    def get_name(self) -> str:
        """Get the name of the search handler."""
        return "arxiv"

    def is_available(self) -> bool:
        """Check if the arXiv API is available."""
        return self.available

    def get_rate_limit_info(self) -> Dict[str, Any]:
        """Get information about the API's rate limits."""
        return {
            # One request per 3 seconds works out to 20 requests per minute.
            "requests_per_minute": 20,
            "requests_per_day": 2000,  # This is an estimate
            "current_usage": None,  # arXiv doesn't provide usage info in responses
        }