# ira/execution/api_handlers/scholar_handler.py
#
# 126 lines
# 4.2 KiB
# Python

"""
Google Scholar API handler.
Uses the Serper API to access Google Scholar search results.
"""
import os
import json
import requests
from typing import Dict, List, Any, Optional
from .base_handler import BaseSearchHandler
from config.config import get_config, get_api_key
class ScholarSearchHandler(BaseSearchHandler):
    """Handler for Google Scholar Search using the Serper API.

    Results are fetched with a single POST to ``base_url`` and converted to a
    list of flat dictionaries (see :meth:`search` for the exact keys).
    """

    # requests applies no timeout unless one is passed explicitly, so a
    # stalled connection would otherwise block the caller forever.
    REQUEST_TIMEOUT_SECONDS = 30

    def __init__(self):
        """Initialize the Google Scholar search handler.

        Loads the project configuration and the "serper" API key. The handler
        is marked available only when a key was found.
        """
        self.config = get_config()
        self.api_key = get_api_key("serper")
        self.base_url = "https://google.serper.dev/scholar"
        self.available = self.api_key is not None

    def search(self, query: str, num_results: int = 10, **kwargs) -> List[Dict[str, Any]]:
        """Execute a Google Scholar search query using the Serper API.

        Args:
            query: The search query to execute.
            num_results: Number of results to request.
            **kwargs: Additional search parameters. A value of ``None`` is
                treated the same as the key being absent.
                - country: Country code, sent as ``gl``.
                - language: Language code, sent as ``hl``.
                - year_start: Start year for the publication date filter.
                - year_end: End year for the publication date filter.
                - timeout: Request timeout in seconds (defaults to
                  ``REQUEST_TIMEOUT_SECONDS``).

        Returns:
            A list of dictionaries with the keys ``title``, ``url``,
            ``snippet``, ``source``, ``authors``, ``publication`` and
            ``year``. An empty list is returned when the request fails, the
            response is not valid JSON, or it contains no organic results.

        Raises:
            ValueError: If no API key is configured.
        """
        if not self.available:
            raise ValueError("Google Scholar API is not available. API key is missing.")

        params = self._build_params(query, num_results, kwargs)
        headers = {
            "X-API-KEY": self.api_key,
            "Content-Type": "application/json",
        }

        timeout = kwargs.get("timeout")
        if timeout is None:
            timeout = self.REQUEST_TIMEOUT_SECONDS

        try:
            response = requests.post(
                self.base_url,
                headers=headers,
                json=params,
                timeout=timeout,
            )
            response.raise_for_status()
            data = response.json()
        except (requests.exceptions.RequestException, ValueError) as e:
            # ValueError is listed as well because, depending on the installed
            # requests version, a JSON decoding failure may surface as a plain
            # ValueError rather than a RequestException subclass.
            print(f"Error executing Google Scholar search: {e}")
            return []

        # Tolerate a non-object payload or an explicit null "organic" field
        # instead of failing while iterating.
        organic = data.get("organic") if isinstance(data, dict) else None
        if not isinstance(organic, list):
            return []

        return [
            self._normalize_result(item)
            for item in organic
            if isinstance(item, dict)
        ]

    @staticmethod
    def _build_params(query: str, num_results: int, options: Dict[str, Any]) -> Dict[str, Any]:
        """Build the JSON request body from the query and optional filters."""
        params: Dict[str, Any] = {
            "q": query,
            "num": num_results,
            "type": "scholar",  # Specify search type as scholar
        }

        country = options.get("country")
        if country is not None:
            params["gl"] = country

        language = options.get("language")
        if language is not None:
            params["hl"] = language

        # NOTE(review): the year bounds are sent as an "as_ylo=..&as_yhi=.."
        # string under "tbs", exactly as before. Whether the endpoint honors
        # this encoding cannot be confirmed from this file - verify against
        # the Serper API documentation.
        year_start = options.get("year_start")
        year_end = options.get("year_end")
        date_parts = []
        if year_start is not None:
            date_parts.append(f"as_ylo={year_start}")
        if year_end is not None:
            date_parts.append(f"as_yhi={year_end}")
        if date_parts:
            params["tbs"] = "&".join(date_parts)

        return params

    @staticmethod
    def _normalize_result(item: Dict[str, Any]) -> Dict[str, Any]:
        """Convert one raw organic result into the standardized result dict."""
        return {
            "title": item.get("title", ""),
            "url": item.get("link", ""),
            "snippet": item.get("snippet", ""),
            "source": "scholar",
            "authors": item.get("authors", ""),
            "publication": item.get("publication", ""),
            "year": item.get("year", ""),
        }

    def get_name(self) -> str:
        """Get the name of the search handler."""
        return "scholar"

    def is_available(self) -> bool:
        """Check if the Google Scholar API is available (an API key was found)."""
        return self.available

    def get_rate_limit_info(self) -> Dict[str, Any]:
        """Get information about the API's rate limits.

        The numbers are hard-coded example values, not read from the API.
        """
        # These are example values - adjust based on your Serper plan
        return {
            "requests_per_minute": 30,  # Lower for Scholar due to its specialized nature
            "requests_per_day": 1000,
            "current_usage": None,  # Serper doesn't provide usage info in responses
        }