""" CORE.ac.uk API handler. Provides access to open access academic papers from institutional repositories. """ import os import requests from typing import Dict, List, Any, Optional from .base_handler import BaseSearchHandler from config.config import get_config, get_api_key class CoreSearchHandler(BaseSearchHandler): """Handler for CORE.ac.uk academic search API.""" def __init__(self): """Initialize the CORE search handler.""" self.config = get_config() self.api_key = get_api_key("core") self.base_url = "https://api.core.ac.uk/v3/search/works" self.available = self.api_key is not None # Get any custom settings from config self.academic_config = self.config.config_data.get("academic_search", {}).get("core", {}) def search(self, query: str, num_results: int = 10, **kwargs) -> List[Dict[str, Any]]: """ Execute a search query using CORE.ac.uk. Args: query: The search query to execute num_results: Number of results to return **kwargs: Additional search parameters: - full_text: Whether to search in full text (default: True) - filter_year: Filter by publication year or range - sort: Sort by relevance or publication date - repositories: Limit to specific repositories Returns: List of search results with standardized format """ if not self.available: raise ValueError("CORE API is not available. API key is missing.") # Set up the request headers headers = { "Authorization": f"Bearer {self.api_key}", "Content-Type": "application/json" } # Set up the request body body = { "q": query, "limit": num_results, "offset": 0 } # Add full text search parameter full_text = kwargs.get("full_text", True) if full_text: body["fields"] = ["title", "authors", "year", "abstract", "fullText"] else: body["fields"] = ["title", "authors", "year", "abstract"] # Add year filter if specified if "filter_year" in kwargs: body["filters"] = [{"year": kwargs["filter_year"]}] # Add sort parameter if "sort" in kwargs: if kwargs["sort"] == "date": body["sort"] = [{"year": "desc"}] else: body["sort"] = [{"_score": "desc"}] # Default to relevance # Add repository filter if specified if "repositories" in kwargs: if "filters" not in body: body["filters"] = [] body["filters"].append({"repositoryIds": kwargs["repositories"]}) try: # Make the request response = requests.post(self.base_url, headers=headers, json=body) response.raise_for_status() # Parse the response data = response.json() # Process the results results = [] for item in data.get("results", []): # Extract authors authors = [] for author in item.get("authors", [])[:3]: author_name = author.get("name", "") if author_name: authors.append(author_name) # Get publication year pub_year = item.get("year", "Unknown") # Get DOI doi = item.get("doi", "") # Determine URL - prefer the download URL if available url = item.get("downloadUrl", "") if not url and doi: url = f"https://doi.org/{doi}" if not url: url = item.get("sourceFulltextUrls", [""])[0] if item.get("sourceFulltextUrls") else "" # Create snippet from abstract or first part of full text snippet = item.get("abstract", "") if not snippet and "fullText" in item: snippet = item.get("fullText", "")[:500] + "..." # If no snippet is available, create one from metadata if not snippet: journal = item.get("publisher", "Unknown Journal") snippet = f"Open access academic paper from {journal}. {pub_year}." # Create the result result = { "title": item.get("title", "Untitled"), "url": url, "snippet": snippet, "source": "core", "authors": ", ".join(authors), "year": pub_year, "journal": item.get("publisher", ""), "doi": doi, "open_access": True # CORE only indexes open access content } results.append(result) return results except requests.exceptions.RequestException as e: print(f"Error executing CORE search: {e}") return [] def get_name(self) -> str: """Get the name of the search handler.""" return "core" def is_available(self) -> bool: """Check if the CORE API is available.""" return self.available def get_rate_limit_info(self) -> Dict[str, Any]: """Get information about the API's rate limits.""" # These limits are based on the free tier return { "requests_per_minute": 30, "requests_per_day": 10000, "current_usage": None }