# Source path: ira/execution/api_handlers/core_handler.py
# (Repository viewer metadata at time of capture: 160 lines, 5.8 KiB, Python)

"""
CORE.ac.uk API handler.
Provides access to open access academic papers from institutional repositories.
"""
import os
import requests
from typing import Dict, List, Any, Optional
from .base_handler import BaseSearchHandler
from config.config import get_config, get_api_key
class CoreSearchHandler(BaseSearchHandler):
    """Handler for the CORE.ac.uk academic search API.

    Sends search queries to the CORE v3 ``search/works`` endpoint and converts
    each returned work into the flat result dictionary built by
    ``_format_result``.
    """

    # Seconds to wait for the CORE API before giving up. Can be overridden
    # with a "timeout" entry in the handler's config section.
    DEFAULT_TIMEOUT = 30
    # Maximum number of authors listed per result.
    MAX_AUTHORS = 3
    # Maximum number of full-text characters used as a snippet.
    SNIPPET_LENGTH = 500

    def __init__(self):
        """Initialize the CORE search handler from the project configuration."""
        self.config = get_config()
        self.api_key = get_api_key("core")
        self.base_url = "https://api.core.ac.uk/v3/search/works"
        # An empty key is as unusable as a missing one.
        self.available = bool(self.api_key)
        # Custom settings from config; tolerate sections that are absent or
        # explicitly set to null.
        academic_search = self.config.config_data.get("academic_search") or {}
        self.academic_config = academic_search.get("core") or {}

    def search(self, query: str, num_results: int = 10, **kwargs) -> List[Dict[str, Any]]:
        """Execute a search query using CORE.ac.uk.

        Args:
            query: The search query to execute.
            num_results: Number of results to request.
            **kwargs: Additional search parameters:
                - full_text: Whether to search in full text (default: True)
                - filter_year: Filter by publication year or range
                - sort: "date" sorts by year descending; any other value
                  sorts by relevance
                - repositories: Limit to specific repositories

        Returns:
            List of result dictionaries (see ``_format_result``). An empty
            list is returned when the HTTP request fails or the response is
            not the expected JSON object.

        Raises:
            ValueError: If no API key is configured.
        """
        if not self.available:
            raise ValueError("CORE API is not available. API key is missing.")

        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json"
        }
        body = self._build_request_body(query, num_results, kwargs)
        timeout = self.academic_config.get("timeout", self.DEFAULT_TIMEOUT)

        try:
            # Without a timeout a stalled connection would block the caller
            # indefinitely.
            response = requests.post(
                self.base_url, headers=headers, json=body, timeout=timeout
            )
            response.raise_for_status()
            data = response.json()
        except (requests.exceptions.RequestException, ValueError) as e:
            # ValueError covers an undecodable JSON body on requests versions
            # whose JSON error is not a RequestException subclass.
            print(f"Error executing CORE search: {e}")
            return []

        if not isinstance(data, dict):
            return []

        items = data.get("results") or []
        return [self._format_result(item) for item in items if isinstance(item, dict)]

    @staticmethod
    def _build_request_body(query: str, num_results: int, options: Dict[str, Any]) -> Dict[str, Any]:
        """Build the JSON request body from the query and optional parameters."""
        # NOTE(review): the "fields", "filters" and "sort" keys are kept exactly
        # as the handler has always sent them; confirm against the CORE v3 API
        # reference that the endpoint honours them in this shape.
        body: Dict[str, Any] = {
            "q": query,
            "limit": num_results,
            "offset": 0
        }

        fields = ["title", "authors", "year", "abstract"]
        if options.get("full_text", True):
            fields.append("fullText")
        body["fields"] = fields

        if "filter_year" in options:
            body["filters"] = [{"year": options["filter_year"]}]

        if "sort" in options:
            if options["sort"] == "date":
                body["sort"] = [{"year": "desc"}]
            else:
                body["sort"] = [{"_score": "desc"}]  # Default to relevance

        if "repositories" in options:
            body.setdefault("filters", []).append(
                {"repositoryIds": options["repositories"]}
            )

        return body

    @staticmethod
    def _author_name(author: Any) -> str:
        """Return the name of one author entry, or "" if it has none."""
        # Entries are read as {"name": ...} objects; plain strings are accepted
        # as well so an unexpected shape cannot crash result parsing.
        name = author.get("name") if isinstance(author, dict) else author
        return name if isinstance(name, str) else ""

    def _format_result(self, item: Dict[str, Any]) -> Dict[str, Any]:
        """Convert one work object from the response into a result dictionary.

        Keys that are missing or JSON null fall back to neutral defaults so a
        single incomplete record does not abort the whole search.
        """
        raw_authors = item.get("authors")
        if not isinstance(raw_authors, list):
            raw_authors = []
        # Only the first few author entries are considered; nameless entries
        # among them are dropped.
        candidate_names = (
            self._author_name(author) for author in raw_authors[:self.MAX_AUTHORS]
        )
        authors = [name for name in candidate_names if name]

        # NOTE(review): CORE v3 work objects appear to expose the year as
        # "yearPublished" rather than "year" - confirm against the API schema.
        pub_year = item.get("year") or item.get("yearPublished") or "Unknown"

        doi = item.get("doi") or ""

        # Prefer the direct download URL, then the DOI resolver, then the first
        # source full-text URL.
        url = item.get("downloadUrl") or ""
        if not url and doi:
            url = f"https://doi.org/{doi}"
        if not url:
            source_urls = item.get("sourceFulltextUrls") or []
            if isinstance(source_urls, str):
                source_urls = [source_urls]
            url = source_urls[0] if source_urls else ""

        publisher = item.get("publisher") or ""

        # Snippet: abstract first, then the start of the full text, then a
        # line synthesised from metadata.
        snippet = item.get("abstract") or ""
        if not snippet:
            full_text = item.get("fullText") or ""
            if full_text:
                snippet = full_text[:self.SNIPPET_LENGTH]
                # Only mark the snippet as truncated when text was cut off.
                if len(full_text) > self.SNIPPET_LENGTH:
                    snippet += "..."
        if not snippet:
            journal = publisher or "Unknown Journal"
            snippet = f"Open access academic paper from {journal}. {pub_year}."

        return {
            "title": item.get("title") or "Untitled",
            "url": url,
            "snippet": snippet,
            "source": "core",
            "authors": ", ".join(authors),
            "year": pub_year,
            "journal": publisher,
            "doi": doi,
            "open_access": True  # CORE only indexes open access content
        }

    def get_name(self) -> str:
        """Get the name of the search handler."""
        return "core"

    def is_available(self) -> bool:
        """Check if the CORE API is available (an API key is configured)."""
        return self.available

    def get_rate_limit_info(self) -> Dict[str, Any]:
        """Get information about the API's rate limits."""
        # These limits are based on the free tier
        return {
            "requests_per_minute": 30,
            "requests_per_day": 10000,
            "current_usage": None
        }