132 lines
4.5 KiB
Python
132 lines
4.5 KiB
Python
"""
Unpaywall enricher for finding open access versions of scholarly articles.
"""
|
|
|
|
import os
|
|
import requests
|
|
from typing import Dict, List, Any, Optional
|
|
|
|
from config.config import get_config, get_api_key
|
|
|
|
|
|
class UnpaywallEnricher:
    """Enricher for finding open access versions of papers using Unpaywall.

    Each search result that carries a DOI and is not already flagged as open
    access is looked up in the Unpaywall REST API. Results are annotated in
    place with ``open_access``, ``access_status`` and, when a link is
    available, ``oa_url``.
    """

    # Seconds to wait for Unpaywall before abandoning a single lookup. It can
    # be overridden with a "timeout" entry in the "unpaywall" config section.
    DEFAULT_TIMEOUT = 10

    # Resolver / scheme prefixes that may precede a bare DOI (compared after
    # lower-casing).
    _DOI_PREFIXES = (
        "https://doi.org/",
        "http://doi.org/",
        "https://dx.doi.org/",
        "http://dx.doi.org/",
        "doi:",
    )

    def __init__(self):
        """Initialize the Unpaywall enricher from the application config."""
        self.config = get_config()
        academic_search = self.config.config_data.get("academic_search", {})

        # Unpaywall recommends using an email for API access
        self.email = academic_search.get("email", "user@example.com")
        self.base_url = "https://api.unpaywall.org/v2/"
        self.available = True  # Unpaywall doesn't require an API key, just an email

        # Get any custom settings from config
        self.academic_config = academic_search.get("unpaywall", {})

    def enrich_results(self, results: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """
        Enrich search results with open access links from Unpaywall.

        The result dictionaries are modified in place. Results without a DOI,
        results already marked as open access, and results whose lookup fails
        are left untouched.

        Args:
            results: List of search results to enrich

        Returns:
            The same list object, with its entries enriched
        """
        if not self.available:
            return results

        for result in results:
            doi = result.get("doi")
            if not doi:
                continue

            # Skip results that are already marked as open access
            if result.get("open_access", False) or result.get("access_status") == "Open Access":
                continue

            oa_data = self._lookup_doi(doi)
            if not oa_data:
                continue

            self._apply_oa_data(result, oa_data)

        return results

    def _apply_oa_data(self, result: Dict[str, Any], oa_data: Dict[str, Any]) -> None:
        """Write the access status (and OA link, if any) from *oa_data* into *result*."""
        if not oa_data.get("is_oa", False):
            result["open_access"] = False
            result["access_status"] = "Subscription"
            return

        result["open_access"] = True
        result["access_status"] = "Open Access"

        best_oa_url = self._get_best_oa_url(oa_data)
        if not best_oa_url:
            return

        result["oa_url"] = best_oa_url

        # Only annotate textual snippets; a None/non-string snippet would make
        # the concatenation raise TypeError.
        snippet = result.get("snippet")
        if isinstance(snippet, str):
            result["snippet"] = snippet + " [Open access version available]"

    @classmethod
    def _normalize_doi(cls, doi: Any) -> Optional[str]:
        """Return *doi* as a bare lower-case DOI, or None if nothing usable remains."""
        if not isinstance(doi, str):
            return None

        cleaned = doi.strip().lower()
        for prefix in cls._DOI_PREFIXES:
            if cleaned.startswith(prefix):
                # Strip again so that "doi: 10.1000/x" loses the space too.
                cleaned = cleaned[len(prefix):].strip()
                break

        return cleaned or None

    def _lookup_doi(self, doi: str) -> Optional[Dict[str, Any]]:
        """
        Look up a DOI in Unpaywall.

        Network and decoding errors are reported with ``print`` and swallowed
        so that one failing lookup does not abort the enrichment of the
        remaining results.

        Args:
            doi: The DOI to look up (bare, ``doi:``-prefixed, or a doi.org URL)

        Returns:
            Unpaywall data for the DOI, or None if the DOI is unusable, the
            response status is not 200, or the request fails
        """
        from urllib.parse import quote

        normalized = self._normalize_doi(doi)
        if normalized is None:
            return None

        # Percent-encode the DOI so characters such as '#' or '?' stay part of
        # the path instead of being read as a fragment or query string.
        encoded_doi = quote(normalized, safe="/")
        url = f"{self.base_url}{encoded_doi}"

        settings = self.academic_config if isinstance(self.academic_config, dict) else {}
        timeout = settings.get("timeout", self.DEFAULT_TIMEOUT)

        try:
            # `params` lets requests encode the email; the timeout keeps a
            # stalled connection from blocking the caller indefinitely.
            response = requests.get(url, params={"email": self.email}, timeout=timeout)
            if response.status_code == 200:
                return response.json()
            return None
        except (requests.RequestException, ValueError) as e:
            # ValueError covers a 200 response whose body is not valid JSON.
            print(f"Error looking up DOI in Unpaywall: {e}")
            return None

    def _get_best_oa_url(self, oa_data: Dict[str, Any]) -> Optional[str]:
        """
        Get the best open access URL from Unpaywall data.

        The ``best_oa_location`` entry is preferred. If it is missing or has
        no URL, ``oa_locations`` is scanned, preferring any PDF link over any
        landing-page link.

        Args:
            oa_data: Unpaywall data for a DOI

        Returns:
            Best open access URL, or None if not available
        """
        best_oa_location = oa_data.get("best_oa_location")
        if best_oa_location:
            best_url = best_oa_location.get("url_for_pdf") or best_oa_location.get("url")
            if best_url:
                return best_url
            # No usable link on the best location: fall through to the rest.

        oa_locations = [loc for loc in (oa_data.get("oa_locations") or []) if loc]

        # First pass prefers PDF URLs, second pass falls back to HTML URLs.
        for key in ("url_for_pdf", "url"):
            for location in oa_locations:
                if location.get(key):
                    return location[key]

        return None