"""
Unpaywall enricher for finding open access versions of scholarly articles.
"""

import logging
import os
from typing import Dict, List, Any, Optional
from urllib.parse import quote

import requests

from config.config import get_config, get_api_key

logger = logging.getLogger(__name__)


class UnpaywallEnricher:
    """Enricher for finding open access versions of papers using Unpaywall."""

    # Seconds to wait for Unpaywall before abandoning a lookup. __init__
    # replaces this with the configured value when one is present; keeping a
    # class-level default means a lookup never runs without a timeout.
    timeout = 10

    # Prefixes removed from a DOI before it is sent to the API. Matching is
    # done on the lower-cased DOI, so these must be lower case.
    _DOI_PREFIXES = (
        "https://doi.org/",
        "http://doi.org/",
        "https://dx.doi.org/",
        "http://dx.doi.org/",
        "doi:",
    )

    # Text appended to a result's snippet when an open access URL was found.
    _OA_SNIPPET_NOTE = " [Open access version available]"

    def __init__(self):
        """Initialize the Unpaywall enricher from the application config."""
        self.config = get_config()
        search_config = self.config.config_data.get("academic_search", {})

        # Unpaywall recommends using an email for API access
        self.email = search_config.get("email", "user@example.com")
        self.base_url = "https://api.unpaywall.org/v2/"
        self.available = True  # Unpaywall doesn't require an API key, just an email

        # Get any custom settings from config
        self.academic_config = search_config.get("unpaywall", {})

        # Optional "timeout" setting (seconds) in the unpaywall section;
        # falls back to the class default when absent.
        self.timeout = (self.academic_config or {}).get("timeout", type(self).timeout)

    def enrich_results(self, results: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """
        Enrich search results with open access links from Unpaywall.

        Results are updated in place. A result is looked up only when it has
        a "doi" and is not already marked as open access. After a successful
        lookup, "open_access" and "access_status" are set, and when a URL is
        found it is stored under "oa_url" and a note is appended to a string
        "snippet". Results whose lookup fails are left untouched.

        Args:
            results: List of search results to enrich

        Returns:
            The same list, with enriched entries
        """
        if not self.available or not results:
            return results

        for result in results:
            doi = result.get("doi")
            if not doi:
                continue

            # Skip results that are already marked as open access
            if result.get("open_access", False) or result.get("access_status") == "Open Access":
                continue

            oa_data = self._lookup_doi(doi)
            if not oa_data:
                continue

            if not oa_data.get("is_oa", False):
                result["open_access"] = False
                result["access_status"] = "Subscription"
                continue

            result["open_access"] = True
            result["access_status"] = "Open Access"

            best_oa_url = self._get_best_oa_url(oa_data)
            if not best_oa_url:
                continue
            result["oa_url"] = best_oa_url

            # Only annotate string snippets: a snippet that is present but
            # None (or any other non-string) would make "+=" raise.
            snippet = result.get("snippet")
            if isinstance(snippet, str):
                result["snippet"] = snippet + self._OA_SNIPPET_NOTE

        return results

    @classmethod
    def _normalize_doi(cls, doi: Any) -> str:
        """
        Reduce a DOI to its bare lower-case form.

        Surrounding whitespace and one leading resolver/"doi:" prefix are
        removed.

        Args:
            doi: The DOI as found in a search result

        Returns:
            The bare DOI, or an empty string if the input is not a string
            or nothing remains after stripping
        """
        if not isinstance(doi, str):
            return ""

        cleaned = doi.strip().lower()
        for prefix in cls._DOI_PREFIXES:
            if cleaned.startswith(prefix):
                # Strip again so "doi: 10.1/x" does not keep a leading space.
                cleaned = cleaned[len(prefix):].strip()
                break
        return cleaned

    def _lookup_doi(self, doi: str) -> Optional[Dict[str, Any]]:
        """
        Look up a DOI in Unpaywall.

        Lookups are best-effort: request failures and undecodable responses
        are logged and reported as "not found" instead of being raised.

        Args:
            doi: The DOI to look up

        Returns:
            Unpaywall data for the DOI, or None if not found
        """
        normalized = self._normalize_doi(doi)
        if not normalized:
            return None

        # Percent-encode the DOI so characters such as "#" or "?" stay part
        # of the path; "/" is kept because it separates prefix and suffix.
        encoded_doi = quote(normalized, safe="/")
        url = f"{self.base_url}{encoded_doi}"

        try:
            # The email goes through params so requests encodes it, and the
            # timeout keeps one stalled connection from blocking enrichment.
            response = requests.get(
                url,
                params={"email": self.email},
                timeout=self.timeout,
            )
            if response.status_code != 200:
                return None
            data = response.json()
        except (requests.RequestException, ValueError) as e:
            logger.warning("Error looking up DOI in Unpaywall: %s", e)
            return None

        # Callers use dict access on the payload; treat anything else as a miss.
        return data if isinstance(data, dict) else None

    def _get_best_oa_url(self, oa_data: Dict[str, Any]) -> Optional[str]:
        """
        Get the best open access URL from Unpaywall data.

        The "best_oa_location" entry is tried first. If it is missing or has
        no usable URL, the "oa_locations" list is searched, preferring any
        "url_for_pdf" over any plain "url".

        Args:
            oa_data: Unpaywall data for a DOI

        Returns:
            Best open access URL, or None if not available
        """
        best_location = oa_data.get("best_oa_location") or {}
        best_url = best_location.get("url_for_pdf") or best_location.get("url")
        if best_url:
            return best_url

        # "or []" covers an explicit null value; the filter drops null entries.
        locations = [loc for loc in (oa_data.get("oa_locations") or []) if loc]

        # First pass looks for PDF links, second pass accepts any URL.
        for field in ("url_for_pdf", "url"):
            for location in locations:
                candidate = location.get(field)
                if candidate:
                    return candidate

        return None