""" OpenAlex API handler. Provides access to academic research papers and scholarly information. """ import os import requests from typing import Dict, List, Any, Optional from .base_handler import BaseSearchHandler from config.config import get_config, get_api_key class OpenAlexSearchHandler(BaseSearchHandler): """Handler for OpenAlex academic search API.""" def __init__(self): """Initialize the OpenAlex search handler.""" self.config = get_config() # OpenAlex doesn't require an API key, but using an email is recommended self.email = self.config.config_data.get("academic_search", {}).get("email", "user@example.com") self.base_url = "https://api.openalex.org/works" self.available = True # OpenAlex doesn't require an API key # Get any custom settings from config self.academic_config = self.config.config_data.get("academic_search", {}).get("openalex", {}) def search(self, query: str, num_results: int = 10, **kwargs) -> List[Dict[str, Any]]: """ Execute a search query using OpenAlex. Args: query: The search query to execute num_results: Number of results to return **kwargs: Additional search parameters: - filter_type: Filter by work type (article, book, etc.) - filter_year: Filter by publication year or range - filter_open_access: Only return open access publications - sort: Sort by relevance, citations, publication date - filter_concept: Filter by academic concept/field Returns: List of search results with standardized format """ # Build the search URL with parameters params = { "search": query, "per_page": num_results, "mailto": self.email # Good practice for the API } # Add filters filters = [] # Type filter (article, book, etc.) if "filter_type" in kwargs: filters.append(f"type.id:{kwargs['filter_type']}") # Year filter if "filter_year" in kwargs: filters.append(f"publication_year:{kwargs['filter_year']}") # Open access filter if kwargs.get("filter_open_access", False): filters.append("is_oa:true") # Concept/field filter if "filter_concept" in kwargs: filters.append(f"concepts.id:{kwargs['filter_concept']}") # Combine filters if there are any if filters: params["filter"] = ",".join(filters) # Sort parameter if "sort" in kwargs: params["sort"] = kwargs["sort"] else: # Default to sorting by relevance score params["sort"] = "relevance_score:desc" try: # Make the request response = requests.get(self.base_url, params=params) response.raise_for_status() # Parse the response data = response.json() # Process the results results = [] for item in data.get("results", []): # Extract authors authors = [] for author in item.get("authorships", [])[:3]: author_name = author.get("author", {}).get("display_name", "") if author_name: authors.append(author_name) # Format citation count citation_count = item.get("cited_by_count", 0) # Get the publication year pub_year = item.get("publication_year", "Unknown") # Check if it's open access is_oa = item.get("open_access", {}).get("is_oa", False) oa_status = "Open Access" if is_oa else "Subscription" # Get journal/venue name journal = None if "primary_location" in item and item["primary_location"]: source = item.get("primary_location", {}).get("source", {}) if source: journal = source.get("display_name", "Unknown Journal") # Get DOI doi = item.get("doi") url = f"https://doi.org/{doi}" if doi else item.get("url", "") # Get abstract abstract = item.get("abstract_inverted_index", None) snippet = "" # Convert abstract_inverted_index to readable text if available if abstract: try: # The OpenAlex API uses an inverted index format # We need to reconstruct the text from this format words = {} for word, positions in abstract.items(): for pos in positions: words[pos] = word # Reconstruct the abstract from the positions snippet = " ".join([words.get(i, "") for i in sorted(words.keys())]) except: snippet = "Abstract not available in readable format" # Fallback if no abstract is available if not snippet: snippet = f"Academic paper: {item.get('title', 'Untitled')}. Published in {journal or 'Unknown'} ({pub_year}). {citation_count} citations." # Create the result result = { "title": item.get("title", "Untitled"), "url": url, "snippet": snippet, "source": "openalex", "authors": ", ".join(authors), "year": pub_year, "citation_count": citation_count, "access_status": oa_status, "journal": journal, "doi": doi } results.append(result) return results except requests.exceptions.RequestException as e: print(f"Error executing OpenAlex search: {e}") return [] def get_name(self) -> str: """Get the name of the search handler.""" return "openalex" def is_available(self) -> bool: """Check if the OpenAlex API is available.""" return self.available def get_rate_limit_info(self) -> Dict[str, Any]: """Get information about the API's rate limits.""" return { "requests_per_minute": 100, # OpenAlex is quite generous with rate limits "requests_per_day": 100000, # 100k requests per day for anonymous users "current_usage": None # OpenAlex doesn't provide usage info in responses }