160 lines
5.8 KiB
Python
160 lines
5.8 KiB
Python
"""
CORE.ac.uk API handler.

Provides access to open access academic papers from institutional repositories.
"""
|
|
|
|
import os
|
|
import requests
|
|
from typing import Dict, List, Any, Optional
|
|
|
|
from .base_handler import BaseSearchHandler
|
|
from config.config import get_config, get_api_key
|
|
|
|
|
|
class CoreSearchHandler(BaseSearchHandler):
    """Handler for the CORE.ac.uk academic search API.

    Sends search queries to the CORE v3 ``search/works`` endpoint and maps
    each returned work onto a flat result dictionary (title, url, snippet,
    authors, year, journal, doi).
    """

    # Seconds to wait on the HTTP request. ``requests`` applies no timeout
    # unless one is passed, so without this a stalled connection would block
    # the caller indefinitely.
    DEFAULT_TIMEOUT = 30

    # Only the first few listed authors are considered for each result.
    MAX_AUTHORS = 3

    # Number of full-text characters used when a work has no abstract.
    SNIPPET_LENGTH = 500

    def __init__(self):
        """Initialize the CORE search handler from the project configuration."""
        self.config = get_config()
        self.api_key = get_api_key("core")
        self.base_url = "https://api.core.ac.uk/v3/search/works"
        # The handler is only usable when an API key has been configured.
        self.available = self.api_key is not None

        # Get any custom settings from config
        self.academic_config = (
            self.config.config_data.get("academic_search", {}).get("core", {})
        )

    def search(self, query: str, num_results: int = 10, **kwargs) -> List[Dict[str, Any]]:
        """Execute a search query using CORE.ac.uk.

        Args:
            query: The search query to execute.
            num_results: Number of results to request.
            **kwargs: Additional search parameters:
                - full_text: Whether to request the full text field
                  (default: True).
                - filter_year: Filter by publication year or range; ignored
                  when ``None``.
                - sort: ``"date"`` sorts by year descending; any other value
                  sorts by relevance score.
                - repositories: Limit to specific repositories; ignored when
                  ``None``.
                - timeout: Request timeout in seconds (default:
                  ``DEFAULT_TIMEOUT``). Pass ``None`` to wait indefinitely.

        Returns:
            A list of result dictionaries in the standardized format. The
            list is empty when the request fails or the response cannot be
            decoded.

        Raises:
            ValueError: If no API key is configured.
        """
        if not self.available:
            raise ValueError("CORE API is not available. API key is missing.")

        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json",
        }
        body = self._build_request_body(query, num_results, kwargs)
        timeout = kwargs.get("timeout", self.DEFAULT_TIMEOUT)

        # Keep the try body limited to the network round trip and decoding;
        # result formatting below is written so that it cannot raise on
        # missing or null fields.
        try:
            response = requests.post(
                self.base_url, headers=headers, json=body, timeout=timeout
            )
            response.raise_for_status()
            data = response.json()
        except (requests.exceptions.RequestException, ValueError) as e:
            # ValueError covers a non-JSON response body, which is not
            # reported as a RequestException by every version of requests.
            print(f"Error executing CORE search: {e}")
            return []

        if not isinstance(data, dict):
            return []

        return [
            self._format_result(item)
            for item in data.get("results") or []
            if isinstance(item, dict)
        ]

    @staticmethod
    def _build_request_body(query: str, num_results: int, options: Dict[str, Any]) -> Dict[str, Any]:
        """Translate the search arguments into the JSON request body."""
        body: Dict[str, Any] = {
            "q": query,
            "limit": num_results,
            "offset": 0,
        }

        # Full text is requested unless the caller opts out.
        fields = ["title", "authors", "year", "abstract"]
        if options.get("full_text", True):
            fields.append("fullText")
        body["fields"] = fields

        # An explicit None means "no filter" rather than a null filter value.
        filters = []
        if options.get("filter_year") is not None:
            filters.append({"year": options["filter_year"]})
        if options.get("repositories") is not None:
            filters.append({"repositoryIds": options["repositories"]})
        if filters:
            body["filters"] = filters

        if "sort" in options:
            if options["sort"] == "date":
                body["sort"] = [{"year": "desc"}]
            else:
                body["sort"] = [{"_score": "desc"}]  # Default to relevance

        return body

    @classmethod
    def _format_result(cls, item: Dict[str, Any]) -> Dict[str, Any]:
        """Convert one raw result item into the standardized result format."""
        # Fields may be absent or explicitly null in the JSON, so every
        # lookup falls back with ``or`` instead of relying on a .get default.
        raw_authors = item.get("authors")
        if not isinstance(raw_authors, list):
            raw_authors = []
        authors = []
        for author in raw_authors[:cls.MAX_AUTHORS]:
            name = author.get("name") if isinstance(author, dict) else author
            if name:
                authors.append(str(name))

        pub_year = item.get("year") or "Unknown"
        doi = item.get("doi") or ""
        publisher = item.get("publisher") or ""

        # Determine URL - prefer the download URL, then the DOI, then the
        # first source full-text URL.
        url = item.get("downloadUrl") or ""
        if not url and doi:
            url = f"https://doi.org/{doi}"
        if not url:
            sources = item.get("sourceFulltextUrls")
            if isinstance(sources, str):
                url = sources
            elif isinstance(sources, (list, tuple)) and sources:
                url = sources[0] or ""

        # Create snippet from the abstract or the first part of the full text.
        snippet = item.get("abstract") or ""
        if not snippet:
            full_text = item.get("fullText")
            if isinstance(full_text, str) and full_text:
                if len(full_text) > cls.SNIPPET_LENGTH:
                    # Mark the cut only when text was actually dropped.
                    snippet = full_text[:cls.SNIPPET_LENGTH] + "..."
                else:
                    snippet = full_text

        # If no snippet is available, create one from metadata.
        if not snippet:
            journal = publisher or "Unknown Journal"
            snippet = f"Open access academic paper from {journal}. {pub_year}."

        return {
            "title": item.get("title") or "Untitled",
            "url": url,
            "snippet": snippet,
            "source": "core",
            "authors": ", ".join(authors),
            "year": pub_year,
            "journal": publisher,
            "doi": doi,
            "open_access": True,  # CORE only indexes open access content
        }

    def get_name(self) -> str:
        """Get the name of the search handler."""
        return "core"

    def is_available(self) -> bool:
        """Check if the CORE API is available (an API key is configured)."""
        return self.available

    def get_rate_limit_info(self) -> Dict[str, Any]:
        """Get information about the API's rate limits."""
        # These limits are based on the free tier
        return {
            "requests_per_minute": 30,
            "requests_per_day": 10000,
            "current_usage": None,
        }