# ira/execution/api_handlers/core_handler.py

"""
CORE.ac.uk API handler.
Provides access to open access academic papers from institutional repositories.
"""

import os
import requests
from typing import Dict, List, Any, Optional

from .base_handler import BaseSearchHandler
from config.config import get_config, get_api_key


class CoreSearchHandler(BaseSearchHandler):
    """Handler for the CORE.ac.uk academic search API.

    Sends a JSON search request to the CORE v3 ``search/works`` endpoint and
    converts each returned work into the flat result dictionary produced by
    :meth:`search`.
    """

    # Seconds to wait for CORE before giving up; without a timeout a stalled
    # connection would block the caller indefinitely.
    REQUEST_TIMEOUT_SECONDS = 30

    # Maximum number of author names included in a result.
    MAX_AUTHORS = 3

    # Maximum number of full-text characters used as a fallback snippet.
    SNIPPET_LENGTH = 500

    def __init__(self):
        """Initialize the CORE search handler.

        Reads the API key for ``"core"`` and the optional
        ``academic_search.core`` section of the configuration. The handler
        is marked available only when an API key was found.
        """
        self.config = get_config()
        self.api_key = get_api_key("core")
        self.base_url = "https://api.core.ac.uk/v3/search/works"
        self.available = self.api_key is not None

        # Get any custom settings from config
        self.academic_config = self.config.config_data.get("academic_search", {}).get("core", {})

    def search(self, query: str, num_results: int = 10, **kwargs) -> List[Dict[str, Any]]:
        """
        Execute a search query using CORE.ac.uk.

        Args:
            query: The search query to execute
            num_results: Number of results to request
            **kwargs: Additional search parameters:
                - full_text: Whether to search in full text (default: True)
                - filter_year: Filter by publication year or range
                - sort: "date" sorts by year descending; any other value
                  sorts by relevance
                - repositories: Limit to specific repositories

        Returns:
            List of search results with standardized format. An empty list
            is returned when the request fails or the response cannot be
            parsed.

        Raises:
            ValueError: If no API key is configured.
        """
        if not self.available:
            raise ValueError("CORE API is not available. API key is missing.")

        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json"
        }
        body = self._build_request_body(query, num_results, kwargs)

        # Keep the try body limited to the network call and JSON decoding so
        # that programming errors in result formatting are not swallowed.
        try:
            response = requests.post(
                self.base_url,
                headers=headers,
                json=body,
                timeout=self.REQUEST_TIMEOUT_SECONDS,
            )
            response.raise_for_status()
            data = response.json()
        except (requests.exceptions.RequestException, ValueError) as e:
            # ValueError covers invalid JSON on requests versions where the
            # decode error is not a RequestException subclass.
            print(f"Error executing CORE search: {e}")
            return []

        if not isinstance(data, dict):
            return []

        # "results" may be missing or null; both are treated as no results.
        return [
            self._format_result(item)
            for item in data.get("results") or []
            if isinstance(item, dict)
        ]

    @staticmethod
    def _build_request_body(query: str, num_results: int, options: Dict[str, Any]) -> Dict[str, Any]:
        """Build the JSON request body from the query and optional search parameters."""
        body: Dict[str, Any] = {
            "q": query,
            "limit": num_results,
            "offset": 0
        }

        # Add full text search parameter
        fields = ["title", "authors", "year", "abstract"]
        if options.get("full_text", True):
            fields.append("fullText")
        body["fields"] = fields

        # Add year filter if specified
        if "filter_year" in options:
            body["filters"] = [{"year": options["filter_year"]}]

        # Add sort parameter; anything other than "date" means relevance
        if "sort" in options:
            if options["sort"] == "date":
                body["sort"] = [{"year": "desc"}]
            else:
                body["sort"] = [{"_score": "desc"}]

        # Add repository filter if specified, alongside any year filter
        if "repositories" in options:
            body.setdefault("filters", []).append({"repositoryIds": options["repositories"]})

        return body

    @classmethod
    def _format_result(cls, item: Dict[str, Any]) -> Dict[str, Any]:
        """Convert one raw CORE work record into the standardized result dictionary."""
        # Fields may be present but null in the JSON, so ``or`` is used
        # instead of ``dict.get`` defaults throughout.
        authors = []
        for author in (item.get("authors") or [])[:cls.MAX_AUTHORS]:
            if isinstance(author, dict):
                author_name = author.get("name") or ""
            elif isinstance(author, str):
                author_name = author
            else:
                author_name = ""
            if author_name:
                authors.append(author_name)

        pub_year = item.get("year")
        if pub_year is None:
            pub_year = "Unknown"

        doi = item.get("doi") or ""
        publisher = item.get("publisher") or ""

        # Determine URL - prefer the download URL, then DOI, then source URLs
        url = item.get("downloadUrl") or ""
        if not url and doi:
            url = f"https://doi.org/{doi}"
        if not url:
            source_urls = item.get("sourceFulltextUrls")
            if isinstance(source_urls, (list, tuple)) and source_urls:
                url = source_urls[0] or ""
            elif isinstance(source_urls, str):
                url = source_urls

        # Create snippet from abstract or first part of full text
        snippet = item.get("abstract") or ""
        if not snippet:
            full_text = item.get("fullText") or ""
            if isinstance(full_text, str) and full_text:
                snippet = full_text[:cls.SNIPPET_LENGTH]
                # Only mark the snippet as truncated when text was cut off.
                if len(full_text) > cls.SNIPPET_LENGTH:
                    snippet += "..."

        # If no snippet is available, create one from metadata
        if not snippet:
            journal = publisher or "Unknown Journal"
            snippet = f"Open access academic paper from {journal}. {pub_year}."

        return {
            "title": item.get("title") or "Untitled",
            "url": url,
            "snippet": snippet,
            "source": "core",
            "authors": ", ".join(authors),
            "year": pub_year,
            "journal": publisher,
            "doi": doi,
            "open_access": True  # CORE only indexes open access content
        }

    def get_name(self) -> str:
        """Get the name of the search handler."""
        return "core"

    def is_available(self) -> bool:
        """Check if the CORE API is available (an API key was configured)."""
        return self.available

    def get_rate_limit_info(self) -> Dict[str, Any]:
        """Get information about the API's rate limits.

        Returns:
            Dictionary with static per-minute and per-day request limits;
            ``current_usage`` is always ``None`` because usage is not tracked.
        """
        # These limits are based on the free tier
        return {
            "requests_per_minute": 30,
            "requests_per_day": 10000,
            "current_usage": None
        }