arxiva/arxiva.go

120 lines
2.8 KiB
Go

package arxiva
import (
"encoding/json"
"encoding/xml"
"fmt"
"io/ioutil"
"net/http"
"net/url"
"strings"
"time"
)
// httpClient is the shared HTTP client used for requests to the arXiv API.
// It carries an explicit timeout because the zero-value http.Client never
// times out, so a stalled connection would otherwise block the caller forever.
var httpClient = &http.Client{Timeout: 60 * time.Second}
// Paper is the record kept for a single arXiv result. It serialises to JSON
// with the keys "title", "abstract" and "arxiv_id".
type Paper struct {
	// Title is the paper's title.
	Title string `json:"title"`
	// Abstract is the paper's abstract text.
	Abstract string `json:"abstract"`
	// ArxivID is the identifier extracted from the entry's id URL.
	ArxivID string `json:"arxiv_id"`
}
// Feed is the decoding target for the XML document returned by the arXiv
// API: a root <feed> element holding zero or more <entry> children.
type Feed struct {
	// XMLName pins the expected root element name to "feed".
	XMLName xml.Name `xml:"feed"`
	// Entries collects every <entry> child of the feed, in document order.
	Entries []Entry `xml:"entry"`
}
// Entry is one <entry> element of the feed, reduced to the three child
// elements this package reads.
type Entry struct {
	// Title is the text of the <title> child.
	Title string `xml:"title"`
	// Abstract is the text of the <summary> child.
	Abstract string `xml:"summary"`
	// ID is the text of the <id> child.
	ID string `xml:"id"`
}
// SanitizeFilename returns query with every character that is awkward in a
// file name replaced by an underscore. The replaced characters are
// / \ ? % * : | " < > and the space character; everything else is kept as is.
func SanitizeFilename(query string) string {
	// All reserved characters are single-byte ASCII, so a byte-wise scan is
	// enough and leaves multi-byte sequences untouched.
	const reserved = "/\\?%*:|\"<> "

	var out strings.Builder
	out.Grow(len(query))
	for i := 0; i < len(query); i++ {
		c := query[i]
		if strings.IndexByte(reserved, c) >= 0 {
			c = '_'
		}
		out.WriteByte(c)
	}
	return out.String()
}
// extractArxivID returns the arXiv identifier contained in an entry id URL
// such as "http://arxiv.org/abs/2101.00001v1".
//
// Everything after the "/abs/" path segment is returned, so old-style
// identifiers that contain a slash themselves (for example
// "hep-th/9901001v1") keep their archive prefix. When the URL has no "/abs/"
// segment, the text after the last "/" is returned; a string without any "/"
// is returned unchanged.
func extractArxivID(idURL string) string {
	const marker = "/abs/"
	if i := strings.Index(idURL, marker); i >= 0 {
		return idURL[i+len(marker):]
	}
	// Fallback for unexpected URL shapes: keep only the final path segment.
	if i := strings.LastIndexByte(idURL, '/'); i >= 0 {
		return idURL[i+1:]
	}
	return idURL
}
// FetchPapers queries the arXiv export API for papers whose submission date
// lies between startDate and endDate and that match query, and returns the
// decoded results.
//
// Parameters:
//   - startDate, endDate: dates in YYYYMMDD form; endDate must not be earlier
//     than startDate.
//   - query: an arXiv search expression that is AND-ed with the date filter.
//     It is sent as given; when it is blank only the date filter is sent.
//   - maxPapers: value of the max_results request parameter; must be between
//     1 and 2000.
//
// It returns an error when an argument is invalid, the request fails, the
// server answers with a status other than 200 OK, or the body cannot be
// parsed as XML. When the feed contains no entries the returned slice is nil.
func FetchPapers(startDate string, endDate string, query string, maxPapers int) ([]Paper, error) {
	const (
		baseURL    = "http://export.arxiv.org/api/query"
		dateLayout = "20060102"
	)

	if maxPapers < 1 || maxPapers > 2000 {
		return nil, fmt.Errorf("maxPapers must be between 1 and 2000")
	}
	start, err := time.Parse(dateLayout, startDate)
	if err != nil {
		return nil, fmt.Errorf("invalid start date format: %w", err)
	}
	end, err := time.Parse(dateLayout, endDate)
	if err != nil {
		return nil, fmt.Errorf("invalid end date format: %w", err)
	}
	if end.Before(start) {
		return nil, fmt.Errorf("end date %s is before start date %s", endDate, startDate)
	}

	// A blank query would leave a dangling "AND" in the search expression,
	// so in that case only the date filter is sent.
	searchQuery := fmt.Sprintf("submittedDate:[%s TO %s]", startDate, endDate)
	if strings.TrimSpace(query) != "" {
		searchQuery += " AND " + query
	}

	params := url.Values{}
	params.Add("search_query", searchQuery)
	params.Add("max_results", fmt.Sprintf("%d", maxPapers))

	req, err := http.NewRequest(http.MethodGet, baseURL+"?"+params.Encode(), nil)
	if err != nil {
		return nil, fmt.Errorf("failed to create request: %w", err)
	}
	resp, err := httpClient.Do(req)
	if err != nil {
		return nil, fmt.Errorf("failed to fetch from arXiv: %w", err)
	}
	defer resp.Body.Close()

	// Reject error responses explicitly instead of trying to parse an error
	// page as a feed.
	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("failed to fetch from arXiv: unexpected status %s", resp.Status)
	}

	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		return nil, fmt.Errorf("failed to read response: %w", err)
	}
	var feed Feed
	if err := xml.Unmarshal(body, &feed); err != nil {
		return nil, fmt.Errorf("failed to parse XML: %w", err)
	}

	// Keep the nil result for an empty feed so callers comparing against nil
	// or marshalling the slice see the same value as before.
	if len(feed.Entries) == 0 {
		return nil, nil
	}
	papers := make([]Paper, 0, len(feed.Entries))
	for _, entry := range feed.Entries {
		papers = append(papers, Paper{
			Title:    strings.TrimSpace(entry.Title),
			Abstract: strings.TrimSpace(entry.Abstract),
			ArxivID:  extractArxivID(entry.ID),
		})
	}
	return papers, nil
}
// SaveToFile writes papers as indented JSON to a file in the current working
// directory named "<startDate>-<endDate>-<query>.json".
//
// Every component of the name is passed through SanitizeFilename, so path
// separators in any of the three arguments cannot redirect the write to
// another directory. The file is created with mode 0644 and an existing file
// of the same name is overwritten.
//
// It returns an error when papers is empty, when the JSON encoding fails, or
// when the file cannot be written; the underlying error is wrapped so callers
// can inspect it with errors.Is / errors.As.
func SaveToFile(papers []Paper, startDate string, endDate string, query string) error {
	if len(papers) == 0 {
		return fmt.Errorf("no papers to save")
	}

	filename := fmt.Sprintf(
		"%s-%s-%s.json",
		SanitizeFilename(startDate),
		SanitizeFilename(endDate),
		SanitizeFilename(query),
	)

	data, err := json.MarshalIndent(papers, "", " ")
	if err != nil {
		return fmt.Errorf("failed to marshal JSON: %w", err)
	}
	if err := ioutil.WriteFile(filename, data, 0644); err != nil {
		return fmt.Errorf("failed to write file: %w", err)
	}
	return nil
}