ira/execution/result_enrichers/unpaywall_enricher.py

132 lines
4.5 KiB
Python

"""
Unpaywall enricher for finding open access versions of scholarly articles.
"""
import os
from typing import Any, Dict, List, Optional
from urllib.parse import quote

import requests

from config.config import get_api_key, get_config
class UnpaywallEnricher:
    """Enricher for finding open access versions of papers using Unpaywall.

    For every search result that carries a ``doi`` and is not already flagged
    as open access, the enricher queries the Unpaywall REST API and records on
    the result dict whether an open access copy exists and, if so, its URL.
    """

    # Seconds to wait on a single Unpaywall request before giving up. Without
    # a timeout ``requests`` waits forever and one stalled connection would
    # block the whole enrichment pass.
    DEFAULT_TIMEOUT = 10

    # Prefixes that may precede the bare DOI. Compared against lower-cased
    # input, so they are spelled in lower case.
    _DOI_PREFIXES = (
        "https://doi.org/",
        "http://doi.org/",
        "https://dx.doi.org/",
        "http://dx.doi.org/",
        "doi:",
    )

    def __init__(self):
        """Initialize the Unpaywall enricher from the application config."""
        self.config = get_config()
        academic_search = self.config.config_data.get("academic_search", {})

        # Unpaywall identifies callers by email instead of an API key.
        # NOTE(review): the fallback is a placeholder address; confirm the
        # service accepts it or require a configured email.
        self.email = academic_search.get("email", "user@example.com")
        self.base_url = "https://api.unpaywall.org/v2/"
        self.available = True  # Unpaywall doesn't require an API key, just an email

        # Optional per-service settings; "timeout" (seconds) is honoured here.
        self.academic_config = academic_search.get("unpaywall", {})
        self.timeout = (self.academic_config or {}).get("timeout", self.DEFAULT_TIMEOUT)

    def enrich_results(self, results: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """
        Enrich search results with open access links from Unpaywall.

        The result dicts are updated in place. Results without a DOI, results
        already marked open access, and results whose lookup fails are left
        untouched.

        Args:
            results: List of search results to enrich

        Returns:
            The same list, with ``open_access``, ``access_status`` and (when a
            URL is known) ``oa_url`` set on every result that was looked up.
        """
        if not self.available:
            return results

        for result in results:
            doi = result.get("doi")
            if not doi:
                continue

            # Skip results that are already marked as open access
            if result.get("open_access", False) or result.get("access_status") == "Open Access":
                continue

            oa_data = self._lookup_doi(doi)
            if not oa_data:
                continue

            if not oa_data.get("is_oa", False):
                result["open_access"] = False
                result["access_status"] = "Subscription"
                continue

            result["open_access"] = True
            result["access_status"] = "Open Access"

            best_oa_url = self._get_best_oa_url(oa_data)
            if best_oa_url:
                result["oa_url"] = best_oa_url

            # Only annotate real text: a "snippet" key holding None must not
            # abort enrichment of the remaining results.
            snippet = result.get("snippet")
            if isinstance(snippet, str):
                result["snippet"] = snippet + " [Open access version available]"

        return results

    @staticmethod
    def _normalize_doi(doi: Any) -> str:
        """Return the bare, lower-cased DOI, or "" when *doi* is not usable."""
        if not isinstance(doi, str):
            return ""
        normalized = doi.strip().lower()
        for prefix in UnpaywallEnricher._DOI_PREFIXES:
            if normalized.startswith(prefix):
                # Strip again: "doi: 10.1000/x" leaves a leading space behind.
                return normalized[len(prefix):].strip()
        return normalized

    def _lookup_doi(self, doi: str) -> Optional[Dict[str, Any]]:
        """
        Look up a DOI in Unpaywall.

        Args:
            doi: The DOI to look up; may carry a ``doi:`` or resolver-URL prefix

        Returns:
            Unpaywall data for the DOI, or None if the DOI is empty, the
            request fails, the response is not HTTP 200, or the body is not a
            JSON object.
        """
        normalized = self._normalize_doi(doi)
        if not normalized:
            return None

        # Percent-encode the DOI (keeping "/") so characters such as "#" or
        # "?" are not read as URL syntax. The email goes through ``params`` so
        # that a "+" in the address is escaped as well.
        url = f"{self.base_url}{quote(normalized, safe='/')}"
        try:
            response = requests.get(url, params={"email": self.email}, timeout=self.timeout)
            if response.status_code != 200:
                return None
            data = response.json()
        except (requests.RequestException, ValueError) as e:
            # Best-effort enrichment: report the failure and carry on.
            print(f"Error looking up DOI in Unpaywall: {e}")
            return None

        return data if isinstance(data, dict) else None

    def _get_best_oa_url(self, oa_data: Dict[str, Any]) -> Optional[str]:
        """
        Get the best open access URL from Unpaywall data.

        Preference order: the PDF URL, then the landing-page URL, of
        ``best_oa_location``; then the first PDF URL, then the first
        landing-page URL, found in ``oa_locations``.

        Args:
            oa_data: Unpaywall data for a DOI

        Returns:
            Best open access URL, or None if not available
        """
        best_oa_location = oa_data.get("best_oa_location")
        if best_oa_location:
            best_url = best_oa_location.get("url_for_pdf") or best_oa_location.get("url")
            # Fall through when the best location carries no URL, so the
            # remaining locations still get a chance.
            if best_url:
                return best_url

        oa_locations = oa_data.get("oa_locations") or []
        # Prefer PDF URLs across all locations before any HTML URL.
        for key in ("url_for_pdf", "url"):
            for location in oa_locations:
                if location and location.get(key):
                    return location[key]

        return None