ira/execution/api_handlers/openalex_handler.py

180 lines
7.0 KiB
Python

"""
OpenAlex API handler.
Provides access to academic research papers and scholarly information.
"""
import os
import requests
from typing import Dict, List, Any, Optional
from .base_handler import BaseSearchHandler
from config.config import get_config, get_api_key
class OpenAlexSearchHandler(BaseSearchHandler):
    """Handler for the OpenAlex academic search API.

    Queries the OpenAlex ``/works`` endpoint and converts each returned work
    into a flat dictionary (title, url, snippet, authors, year, ...).
    OpenAlex does not require an API key, so the handler always reports
    itself as available.
    """

    # Seconds to wait on the HTTP request. Without a timeout ``requests.get``
    # can block forever on a stalled connection.
    DEFAULT_TIMEOUT = 10

    # Largest page size OpenAlex documents for a single request.
    MAX_PER_PAGE = 200

    def __init__(self):
        """Initialize the OpenAlex search handler from the project config."""
        self.config = get_config()
        academic_settings = self.config.config_data.get("academic_search", {}) or {}

        # OpenAlex doesn't require an API key, but sending a contact email
        # with each request is recommended.
        self.email = academic_settings.get("email", "user@example.com")
        self.base_url = "https://api.openalex.org/works"
        self.available = True  # OpenAlex doesn't require an API key

        # Any OpenAlex-specific settings from config.
        self.academic_config = academic_settings.get("openalex", {})

    def search(self, query: str, num_results: int = 10, **kwargs) -> List[Dict[str, Any]]:
        """Execute a search query using OpenAlex.

        Args:
            query: The search query to execute.
            num_results: Number of results to request; clamped to the
                range 1..MAX_PER_PAGE before being sent.
            **kwargs: Additional search parameters:
                - filter_type: Filter by work type (article, book, etc.)
                - filter_year: Filter by publication year or range
                - filter_open_access: Only return open access publications
                - sort: Sort expression (defaults to "relevance_score:desc")
                - filter_concept: Filter by academic concept/field
                - timeout: Request timeout in seconds
                  (defaults to DEFAULT_TIMEOUT)

        Returns:
            A list of result dictionaries with the keys ``title``, ``url``,
            ``snippet``, ``source``, ``authors``, ``year``,
            ``citation_count``, ``access_status``, ``journal`` and ``doi``.
            An empty list is returned when the request fails or the
            response body cannot be parsed.
        """
        params = self._build_params(query, num_results, kwargs)
        timeout = kwargs.get("timeout", self.DEFAULT_TIMEOUT)

        try:
            response = requests.get(self.base_url, params=params, timeout=timeout)
            response.raise_for_status()
            data = response.json()
        except (requests.exceptions.RequestException, ValueError) as e:
            # ValueError covers a non-JSON body on requests versions whose
            # JSON error is not a RequestException subclass.
            print(f"Error executing OpenAlex search: {e}")
            return []

        if not isinstance(data, dict):
            return []

        # "results" may be missing or null; treat both as "no results".
        works = data.get("results") or []
        return [self._format_result(work) for work in works if isinstance(work, dict)]

    def _build_params(self, query: str, num_results: int, options: Dict[str, Any]) -> Dict[str, Any]:
        """Translate the search arguments into OpenAlex query parameters."""
        params: Dict[str, Any] = {
            "search": query,
            "per_page": max(1, min(num_results, self.MAX_PER_PAGE)),
            "mailto": self.email,  # Good practice for the API
        }

        filters = []
        # NOTE(review): confirm "type.id" is still an accepted OpenAlex
        # filter key; it is kept exactly as the handler has always sent it.
        if options.get("filter_type") is not None:
            filters.append(f"type.id:{options['filter_type']}")
        if options.get("filter_year") is not None:
            filters.append(f"publication_year:{options['filter_year']}")
        if options.get("filter_open_access", False):
            filters.append("is_oa:true")
        if options.get("filter_concept") is not None:
            filters.append(f"concepts.id:{options['filter_concept']}")
        if filters:
            # Multiple filters are sent as one comma-separated parameter.
            params["filter"] = ",".join(filters)

        # Default to sorting by relevance score.
        params["sort"] = options.get("sort", "relevance_score:desc")
        return params

    @staticmethod
    def _reconstruct_abstract(inverted_index: Any) -> str:
        """Rebuild abstract text from an OpenAlex inverted index.

        The index maps each word to the list of positions at which it
        occurs; the text is recovered by ordering the words by position.
        Returns "" when no index is given and a fixed placeholder message
        when the index does not have the expected shape.
        """
        if not inverted_index:
            return ""
        try:
            word_at = {}
            for word, positions in inverted_index.items():
                for position in positions:
                    word_at[position] = word
            return " ".join(word_at[position] for position in sorted(word_at))
        except (AttributeError, TypeError):
            # Not a mapping of word -> iterable of sortable positions.
            return "Abstract not available in readable format"

    @staticmethod
    def _resolve_url(item: Dict[str, Any], doi: Any) -> str:
        """Pick the best link for a work, preferring its DOI."""
        if doi:
            doi_text = str(doi)
            # OpenAlex reports DOIs as full "https://doi.org/..." URLs;
            # prefixing those again would produce a broken doubled link.
            if doi_text.lower().startswith(("http://", "https://")):
                return doi_text
            return f"https://doi.org/{doi_text}"

        primary_location = item.get("primary_location") or {}
        return (
            item.get("url")
            or primary_location.get("landing_page_url")
            or item.get("id")
            or ""
        )

    def _format_result(self, item: Dict[str, Any]) -> Dict[str, Any]:
        """Convert one OpenAlex work object into the result dictionary."""
        # Only the first three authorships are considered; entries without
        # a display name are dropped.
        authors = []
        for authorship in (item.get("authorships") or [])[:3]:
            author_name = (authorship.get("author") or {}).get("display_name", "")
            if author_name:
                authors.append(author_name)

        # "or" rather than a .get default: the API may send explicit nulls.
        title = item.get("title") or "Untitled"
        citation_count = item.get("cited_by_count") or 0
        pub_year = item.get("publication_year") or "Unknown"

        is_oa = (item.get("open_access") or {}).get("is_oa", False)
        oa_status = "Open Access" if is_oa else "Subscription"

        # Journal/venue name, when the work has a primary location source.
        journal = None
        source = (item.get("primary_location") or {}).get("source")
        if source:
            journal = source.get("display_name", "Unknown Journal")

        doi = item.get("doi")
        url = self._resolve_url(item, doi)

        snippet = self._reconstruct_abstract(item.get("abstract_inverted_index"))
        if not snippet:
            # Fallback if no abstract is available.
            snippet = f"Academic paper: {title}. Published in {journal or 'Unknown'} ({pub_year}). {citation_count} citations."

        return {
            "title": title,
            "url": url,
            "snippet": snippet,
            "source": "openalex",
            "authors": ", ".join(authors),
            "year": pub_year,
            "citation_count": citation_count,
            "access_status": oa_status,
            "journal": journal,
            "doi": doi,
        }

    def get_name(self) -> str:
        """Get the name of the search handler."""
        return "openalex"

    def is_available(self) -> bool:
        """Check if the OpenAlex API is available."""
        return self.available

    def get_rate_limit_info(self) -> Dict[str, Any]:
        """Get information about the API's rate limits."""
        return {
            "requests_per_minute": 100,  # OpenAlex is quite generous with rate limits
            "requests_per_day": 100000,  # 100k requests per day for anonymous users
            "current_usage": None,  # OpenAlex doesn't provide usage info in responses
        }