180 lines
7.0 KiB
Python
180 lines
7.0 KiB
Python
"""
OpenAlex API handler.

Provides access to academic research papers and scholarly information.
"""
|
|
|
|
import os
|
|
import requests
|
|
from typing import Dict, List, Any, Optional
|
|
|
|
from .base_handler import BaseSearchHandler
|
|
from config.config import get_config, get_api_key
|
|
|
|
|
|
class OpenAlexSearchHandler(BaseSearchHandler):
    """Handler for the OpenAlex academic search API.

    Sends search requests to the OpenAlex ``works`` endpoint and converts each
    returned work into a flat result dictionary (title, url, snippet, authors,
    year, citation count, access status, journal and DOI).
    """

    # Upper bound applied to ``num_results`` before it is sent as the page
    # size; NOTE(review): 200 is the maximum page size documented by OpenAlex.
    MAX_RESULTS_PER_PAGE = 200

    # Seconds to wait for the HTTP request when neither the caller nor the
    # configuration supplies a timeout. Without one, a stalled connection
    # would block the caller indefinitely.
    DEFAULT_TIMEOUT = 10

    def __init__(self):
        """Initialize the OpenAlex search handler from the application config."""
        self.config = get_config()

        # ``or {}`` guards against the section being present but set to null.
        academic_settings = self.config.config_data.get("academic_search") or {}

        # OpenAlex doesn't require an API key, but using an email is recommended
        self.email = academic_settings.get("email", "user@example.com")
        self.base_url = "https://api.openalex.org/works"
        self.available = True  # OpenAlex doesn't require an API key

        # Get any custom settings from config
        self.academic_config = academic_settings.get("openalex") or {}

    def search(self, query: str, num_results: int = 10, **kwargs) -> List[Dict[str, Any]]:
        """
        Execute a search query using OpenAlex.

        Args:
            query: The search query to execute
            num_results: Number of results to return. Values below 1 yield an
                empty list without contacting the API; values above
                ``MAX_RESULTS_PER_PAGE`` are clamped to that limit.
            **kwargs: Additional search parameters:
                - filter_type: Filter by work type (article, book, etc.)
                - filter_year: Filter by publication year or range
                - filter_open_access: Only return open access publications
                - sort: Sort by relevance, citations, publication date
                - filter_concept: Filter by academic concept/field
                - timeout: Seconds to wait for the HTTP request (defaults to
                  the ``timeout`` entry of the OpenAlex config section, then
                  to ``DEFAULT_TIMEOUT``)

        Returns:
            List of search results with standardized format. An empty list is
            returned when the request fails or the response cannot be parsed;
            the error is printed rather than raised.
        """
        if num_results < 1:
            return []

        params = self._build_params(query, num_results, kwargs)

        timeout = kwargs.get("timeout")
        if timeout is None:
            timeout = self.academic_config.get("timeout", self.DEFAULT_TIMEOUT)

        try:
            response = requests.get(self.base_url, params=params, timeout=timeout)
            response.raise_for_status()
            data = response.json()
        except (requests.exceptions.RequestException, ValueError) as e:
            # ValueError covers a non-JSON body on requests versions whose
            # JSON decode error is not a RequestException subclass.
            print(f"Error executing OpenAlex search: {e}")
            return []

        if not isinstance(data, dict):
            return []

        return [
            self._format_result(item)
            for item in data.get("results") or []
            if isinstance(item, dict)
        ]

    def _build_params(self, query: str, num_results: int, options: Dict[str, Any]) -> Dict[str, Any]:
        """Translate the search arguments into OpenAlex query-string parameters."""
        params = {
            "search": query,
            "per_page": min(num_results, self.MAX_RESULTS_PER_PAGE),
            "mailto": self.email,  # Good practice for the API
        }

        # A filter whose value is None is treated as "not requested" instead
        # of being rendered as the literal string "None".
        filters = []

        # Type filter (article, book, etc.)
        if options.get("filter_type") is not None:
            filters.append(f"type.id:{options['filter_type']}")

        # Year filter
        if options.get("filter_year") is not None:
            filters.append(f"publication_year:{options['filter_year']}")

        # Open access filter
        if options.get("filter_open_access", False):
            filters.append("is_oa:true")

        # Concept/field filter
        if options.get("filter_concept") is not None:
            filters.append(f"concepts.id:{options['filter_concept']}")

        # Combine filters if there are any
        if filters:
            params["filter"] = ",".join(filters)

        # Default to sorting by relevance score when no sort is requested
        params["sort"] = options.get("sort") or "relevance_score:desc"

        return params

    @staticmethod
    def _reconstruct_abstract(inverted_index: Dict[str, List[int]]) -> str:
        """Rebuild abstract text from a ``{word: [positions]}`` inverted index."""
        word_at = {}
        for word, positions in inverted_index.items():
            for position in positions:
                word_at[position] = word
        return " ".join(word_at[position] for position in sorted(word_at))

    @staticmethod
    def _resolve_url(item: Dict[str, Any], doi: Optional[str], primary_location: Dict[str, Any]) -> str:
        """Pick the best link for a work: DOI first, then other known URLs."""
        if doi:
            doi_text = str(doi)
            # NOTE(review): OpenAlex documents ``doi`` as a full
            # https://doi.org/... URL; prefixing it again would produce a
            # broken, doubled link. Bare DOIs are still prefixed.
            if doi_text.startswith(("http://", "https://")):
                return doi_text
            return f"https://doi.org/{doi_text}"
        return (
            item.get("url")
            or primary_location.get("landing_page_url")
            or item.get("id")
            or ""
        )

    def _format_result(self, item: Dict[str, Any]) -> Dict[str, Any]:
        """Convert one OpenAlex work object into the standardized result dict."""
        # Every nested lookup uses ``or {}`` / ``or []`` because the API may
        # send an explicit null, which ``dict.get(key, default)`` passes through.

        # Extract authors (only the first three authorships are considered)
        authors = []
        for authorship in (item.get("authorships") or [])[:3]:
            author_name = ((authorship or {}).get("author") or {}).get("display_name", "")
            if author_name:
                authors.append(author_name)

        title = item.get("title") or "Untitled"
        citation_count = item.get("cited_by_count") or 0

        pub_year = item.get("publication_year")
        if pub_year is None:
            pub_year = "Unknown"

        # Check if it's open access
        is_oa = (item.get("open_access") or {}).get("is_oa", False)
        oa_status = "Open Access" if is_oa else "Subscription"

        # Get journal/venue name
        primary_location = item.get("primary_location") or {}
        source = primary_location.get("source") or {}
        journal = source.get("display_name", "Unknown Journal") if source else None

        # Get DOI and the link derived from it
        doi = item.get("doi")
        url = self._resolve_url(item, doi, primary_location)

        # Convert abstract_inverted_index to readable text if available
        snippet = ""
        inverted_index = item.get("abstract_inverted_index")
        if inverted_index:
            try:
                snippet = self._reconstruct_abstract(inverted_index)
            except (AttributeError, TypeError):
                # Malformed index (not a mapping, non-iterable positions,
                # unsortable keys or non-string words).
                snippet = "Abstract not available in readable format"

        # Fallback if no abstract is available
        if not snippet:
            snippet = (
                f"Academic paper: {title}. "
                f"Published in {journal or 'Unknown'} ({pub_year}). "
                f"{citation_count} citations."
            )

        return {
            "title": title,
            "url": url,
            "snippet": snippet,
            "source": "openalex",
            "authors": ", ".join(authors),
            "year": pub_year,
            "citation_count": citation_count,
            "access_status": oa_status,
            "journal": journal,
            "doi": doi,
        }

    def get_name(self) -> str:
        """Get the name of the search handler."""
        return "openalex"

    def is_available(self) -> bool:
        """Check if the OpenAlex API is available."""
        return self.available

    def get_rate_limit_info(self) -> Dict[str, Any]:
        """Get information about the API's rate limits."""
        return {
            "requests_per_minute": 100,  # OpenAlex is quite generous with rate limits
            "requests_per_day": 100000,  # 100k requests per day for anonymous users
            "current_usage": None,  # OpenAlex doesn't provide usage info in responses
        }