"""
|
|
arXiv API handler.
|
|
Uses the official arXiv API to search for academic papers.
|
|
"""
|
|
|
|
import os
|
|
import json
|
|
import requests
|
|
import urllib.parse
|
|
import xml.etree.ElementTree as ET
|
|
from datetime import datetime
|
|
from typing import Dict, List, Any, Optional
|
|
|
|
from .base_handler import BaseSearchHandler
|
|
from config.config import get_config
|
|
|
|
|
|
class ArxivSearchHandler(BaseSearchHandler):
|
|
"""Handler for arXiv Search using the official API."""
|
|
|
|
def __init__(self):
|
|
"""Initialize the arXiv search handler."""
|
|
self.config = get_config()
|
|
self.base_url = "http://export.arxiv.org/api/query"
|
|
self.available = True # arXiv API is freely available without an API key
|
|
|
|
    def search(self, query: str, num_results: int = 10, **kwargs) -> List[Dict[str, Any]]:
        """
        Execute an arXiv search query.

        Args:
            query: The search query to execute
            num_results: Number of results to return
            **kwargs: Additional search parameters:
                - start: Index of the first result to return, for paging
                - sort_by: Sort order ("relevance", "lastUpdatedDate", "submittedDate")
                - sort_order: Sort direction ("ascending", "descending")
                - categories: List of arXiv categories to search within

        Returns:
            List of search results in a standardized format
        """
        # Set up the request parameters
        params = {
            "search_query": query,
            "max_results": num_results,
            "start": kwargs.get("start", 0)
        }

        # Add sorting parameters; the API accepts only these values, so
        # anything else is left unset and arXiv's default ordering applies
        sort_by = kwargs.get("sort_by", "relevance")
        if sort_by in ("relevance", "lastUpdatedDate", "submittedDate"):
            params["sortBy"] = sort_by

        sort_order = kwargs.get("sort_order", "descending")
        if sort_order in ("ascending", "descending"):
            params["sortOrder"] = sort_order

        # Add category filtering. Use literal spaces around the boolean
        # operators: requests URL-encodes parameter values itself, so a
        # hand-written "+" would reach arXiv as "%2B" rather than as a space.
        if kwargs.get("categories"):
            categories = " OR ".join(f"cat:{cat}" for cat in kwargs["categories"])
            params["search_query"] = f"{params['search_query']} AND ({categories})"
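        # E.g. query="transformer" with categories=["cs.CL", "cs.LG"] sends
        # search_query: transformer AND (cat:cs.CL OR cat:cs.LG)
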
        try:
            # Make the request; without a timeout, a stalled connection
            # could hang the caller indefinitely (30 s is an arbitrary bound)
            response = requests.get(
                self.base_url,
                params=params,
                timeout=30
            )
            response.raise_for_status()

            # Parse the Atom XML response
            root = ET.fromstring(response.content)

            # Namespaces used in the arXiv Atom feed
            ns = {
                'atom': 'http://www.w3.org/2005/Atom',
                'arxiv': 'http://arxiv.org/schemas/atom'
            }

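            # Each result is an <entry> element shaped roughly like:
            #   <entry>
            #     <id>http://arxiv.org/abs/2101.00001v1</id>
            #     <title>...</title>
            #     <summary>...</summary>
            #     <published>2021-01-01T00:00:00Z</published>
            #     <author><name>...</name></author>
            #     <link rel="alternate" type="text/html" href=".../abs/..."/>
            #     <link rel="related" title="pdf" href=".../pdf/..."/>
            #     <arxiv:primary_category term="cs.CL"/>
            #     <category term="cs.CL"/>
            #   </entry>
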
            # Extract and standardize the results
            results = []

            for entry in root.findall('.//atom:entry', ns):
                # Extract basic information
                title = entry.find('./atom:title', ns).text.strip()
                summary = entry.find('./atom:summary', ns).text.strip()
                published = entry.find('./atom:published', ns).text
                updated = entry.find('./atom:updated', ns).text

                # Extract authors
                authors = [
                    author.text.strip()
                    for author in entry.findall('./atom:author/atom:name', ns)
                ]

                # Extract links. The abstract page is the rel="alternate"
                # link; the PDF is the rel="related" link with title="pdf".
                # (A DOI link can also carry rel="related", so keying a dict
                # on rel alone could overwrite the PDF URL.)
                abstract_url = ''
                pdf_url = ''
                for link in entry.findall('./atom:link', ns):
                    if link.get('rel') == 'alternate':
                        abstract_url = link.get('href', '')
                    elif link.get('title') == 'pdf':
                        pdf_url = link.get('href', '')

                # Extract the arXiv identifier from the entry URL,
                # e.g. "2101.00001v1"
                arxiv_id = entry.find('./atom:id', ns).text.split('/')[-1]

                # Get categories, listing the primary category first
                categories = []
                for category in entry.findall('./arxiv:primary_category', ns):
                    categories.append(category.get('term', ''))
                for category in entry.findall('./atom:category', ns):
                    cat_term = category.get('term', '')
                    if cat_term and cat_term not in categories:
                        categories.append(cat_term)

                # Format the result
                result = {
                    "title": title,
                    "url": abstract_url,
                    "pdf_url": pdf_url,
                    "snippet": summary[:200] + "..." if len(summary) > 200 else summary,
                    "source": "arxiv",
                    "arxiv_id": arxiv_id,
                    "authors": authors,
                    "categories": categories,
                    "published_date": published,
                    "updated_date": updated,
                    "full_text": summary
                }

                results.append(result)

            return results

        except requests.exceptions.RequestException as e:
            print(f"Error executing arXiv search: {e}")
            return []
        except ET.ParseError as e:
            print(f"Error parsing arXiv response: {e}")
            return []

    def get_name(self) -> str:
        """Get the name of the search handler."""
        return "arxiv"

    def is_available(self) -> bool:
        """Check if the arXiv API is available."""
        return self.available

    def get_rate_limit_info(self) -> Dict[str, Any]:
        """Get information about the API's rate limits."""
        # arXiv asks clients to make no more than one request every 3 seconds,
        # i.e. at most 20 requests per minute
        return {
            "requests_per_minute": 20,
            "requests_per_day": 2000,  # Estimate; arXiv publishes no daily cap
            "current_usage": None  # arXiv doesn't report usage in responses
        }
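
# A minimal usage sketch (hypothetical caller; the module must be imported as
# part of its package because of the relative import above):
#
#     import time
#     handler = ArxivSearchHandler()
#     if handler.is_available():
#         delay = 60 / handler.get_rate_limit_info()["requests_per_minute"]
#         for page in range(3):
#             for paper in handler.search("quantum error correction",
#                                         num_results=10, start=page * 10,
#                                         categories=["quant-ph"]):
#                 print(paper["arxiv_id"], paper["title"])
#             time.sleep(delay)  # stay under arXiv's request rate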