From d2190124f90021e4128c210de945957bae86c21b Mon Sep 17 00:00:00 2001 From: Steve White Date: Sat, 25 Jan 2025 11:07:24 -0600 Subject: [PATCH] Initial Commit --- README.md | 66 +++++++++++++++++++++++++++++++++ arxiva.go | 109 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ go.mod | 3 ++ 3 files changed, 178 insertions(+) create mode 100644 README.md create mode 100644 arxiva.go create mode 100644 go.mod diff --git a/README.md b/README.md new file mode 100644 index 0000000..a98f906 --- /dev/null +++ b/README.md @@ -0,0 +1,66 @@ +# Arxiva - arXiv Paper Fetcher Library + +Arxiva is a Go library for fetching and saving research papers from arXiv.org. + +## Features +- Fetch papers by date range and search query +- Save results as JSON files +- Automatic filename sanitization + +## Installation + +Add the package to your project: + +```bash +go get gitea.r8z.us/stwhite/arxiva +``` + +## Usage + +Import the package in your Go code: + +```go +import "gitea.r8z.us/stwhite/arxiva" +``` + +Example usage: + +```go +package main + +import ( + "log" + "gitea.r8z.us/stwhite/arxiva" +) + +func main() { + // Fetch papers + papers, err := arxiva.FetchPapers("20240101", "20240125", "quantum computing", 100) + if err != nil { + log.Fatal(err) + } + + // Save to file + err = arxiva.SaveToFile(papers, "20240101", "20240125", "quantum computing") + if err != nil { + log.Fatal(err) + } +} +``` + +## API Reference + +### FetchPapers(startDate, endDate, query string, maxResults int) ([]Paper, error) +Fetches papers from arXiv API + +### SaveToFile(papers []Paper, startDate, endDate, query string) error +Saves papers to a JSON file + +## Dependencies + +- Standard Go libraries +- arXiv API (no API key required) + +## License + +MIT License diff --git a/arxiva.go b/arxiva.go new file mode 100644 index 0000000..a019b05 --- /dev/null +++ b/arxiva.go @@ -0,0 +1,109 @@ +package arxiva + +import ( + "encoding/json" + "encoding/xml" + "fmt" + "io/ioutil" + "net/http" + "net/url" + 
// NOTE(review): this block continues arxiva.go from the patch; the import
// block opened on the preceding line also lists "strings" and "time".

const (
	// arxivAPIURL is the arXiv export API endpoint queried by FetchPapers.
	arxivAPIURL = "http://export.arxiv.org/api/query"

	// dateLayout is the YYYYMMDD layout required for startDate and endDate.
	dateLayout = "20060102"

	// maxPapersLimit is the largest maxPapers value FetchPapers accepts.
	maxPapersLimit = 2000

	// requestTimeout bounds a complete request, including reading the body.
	requestTimeout = 60 * time.Second
)

// httpClient is shared by all requests. http.DefaultClient has no timeout, so
// a stalled connection would otherwise block the caller forever.
var httpClient = &http.Client{Timeout: requestTimeout}

// filenameSanitizer maps every character SanitizeFilename rejects to "_" in a
// single pass over the input.
var filenameSanitizer = strings.NewReplacer(
	"/", "_", "\\", "_", "?", "_", "%", "_", "*", "_", ":", "_",
	"|", "_", "\"", "_", "<", "_", ">", "_", " ", "_",
)

// Paper is a single arXiv paper as returned by FetchPapers and written by
// SaveToFile.
type Paper struct {
	Title    string `json:"title"`
	Abstract string `json:"abstract"`
	ArxivID  string `json:"arxiv_id"`
}

// Feed is the root <feed> element of the XML document returned by the arXiv
// API.
type Feed struct {
	XMLName xml.Name `xml:"feed"`
	Entries []Entry  `xml:"entry"`
}

// Entry is one <entry> element of a Feed. Abstract is read from the
// <summary> element and ID from the <id> element.
type Entry struct {
	Title    string `xml:"title"`
	Abstract string `xml:"summary"`
	ID       string `xml:"id"`
}

// SanitizeFilename returns query with each of the characters
// / \ ? % * : | " < > and space replaced by an underscore.
func SanitizeFilename(query string) string {
	return filenameSanitizer.Replace(query)
}

// extractArxivID returns the arXiv identifier contained in an entry's id URL.
//
// Everything after the first "/abs/" is returned, so old-style identifiers
// keep their archive prefix ("http://arxiv.org/abs/hep-th/9901001v1" yields
// "hep-th/9901001v1"). When idURL has no "/abs/" component, the text after
// the last "/" (or the whole string if there is none) is returned.
func extractArxivID(idURL string) string {
	const marker = "/abs/"
	if i := strings.Index(idURL, marker); i >= 0 {
		return idURL[i+len(marker):]
	}
	// LastIndex returns -1 when there is no "/", which makes the slice start
	// at 0 and return idURL unchanged.
	return idURL[strings.LastIndex(idURL, "/")+1:]
}

// buildSearchQuery returns the search_query value for the given date range.
// A blank query yields the date clause alone, so the expression never ends in
// a dangling "AND".
func buildSearchQuery(startDate, endDate, query string) string {
	dateClause := fmt.Sprintf("submittedDate:[%s TO %s]", startDate, endDate)
	if strings.TrimSpace(query) == "" {
		return dateClause
	}
	return dateClause + " AND " + query
}

// FetchPapers queries the arXiv API for papers submitted between startDate
// and endDate that match query, and returns at most maxPapers results.
//
// startDate and endDate must be in YYYYMMDD form and endDate must not be
// before startDate. maxPapers must be between 1 and 2000. A blank query
// searches by date range only.
//
// An error is returned when an argument is invalid, when the request fails or
// times out, when the server answers with a status other than 200 OK, or when
// the response is not parseable XML. On success the returned slice is non-nil
// even when no papers matched.
func FetchPapers(startDate string, endDate string, query string, maxPapers int) ([]Paper, error) {
	if maxPapers < 1 || maxPapers > maxPapersLimit {
		return nil, fmt.Errorf("maxPapers must be between 1 and %d", maxPapersLimit)
	}

	start, err := time.Parse(dateLayout, startDate)
	if err != nil {
		return nil, fmt.Errorf("invalid start date format: %w", err)
	}
	end, err := time.Parse(dateLayout, endDate)
	if err != nil {
		return nil, fmt.Errorf("invalid end date format: %w", err)
	}
	// A reversed range cannot match anything; reject it before going to the
	// network.
	if end.Before(start) {
		return nil, fmt.Errorf("end date %s is before start date %s", endDate, startDate)
	}

	params := url.Values{}
	params.Set("search_query", buildSearchQuery(startDate, endDate, query))
	params.Set("max_results", fmt.Sprintf("%d", maxPapers))

	resp, err := httpClient.Get(arxivAPIURL + "?" + params.Encode())
	if err != nil {
		return nil, fmt.Errorf("failed to fetch from arXiv: %w", err)
	}
	defer resp.Body.Close()

	// Without this check the body of an error response would be parsed as if
	// it were a result feed.
	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("arXiv API returned unexpected status: %s", resp.Status)
	}

	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		return nil, fmt.Errorf("failed to read response: %w", err)
	}

	var feed Feed
	if err := xml.Unmarshal(body, &feed); err != nil {
		return nil, fmt.Errorf("failed to parse XML: %w", err)
	}

	papers := make([]Paper, 0, len(feed.Entries))
	for _, entry := range feed.Entries {
		papers = append(papers, Paper{
			Title:    strings.TrimSpace(entry.Title),
			Abstract: strings.TrimSpace(entry.Abstract),
			ArxivID:  extractArxivID(entry.ID),
		})
	}

	return papers, nil
}

// SaveToFile writes papers as indented JSON to a file named
// "<startDate>-<endDate>-<query>.json" in the current working directory.
//
// Every filename component is passed through SanitizeFilename, so none of
// them can introduce a path separator. A nil papers slice is written as an
// empty JSON array rather than null. The file is created with mode 0644 and
// an existing file of the same name is overwritten.
func SaveToFile(papers []Paper, startDate string, endDate string, query string) error {
	filename := fmt.Sprintf("%s-%s-%s.json",
		SanitizeFilename(startDate), SanitizeFilename(endDate), SanitizeFilename(query))

	if papers == nil {
		// encoding/json renders a nil slice as "null"; readers of the file
		// expect an array.
		papers = []Paper{}
	}

	data, err := json.MarshalIndent(papers, "", " ")
	if err != nil {
		return fmt.Errorf("failed to marshal JSON: %w", err)
	}

	if err := ioutil.WriteFile(filename, data, 0644); err != nil {
		return fmt.Errorf("failed to write file: %w", err)
	}

	return nil
}

// NOTE(review): the patch this block was taken from also adds a go.mod
// declaring module gitea.r8z.us/stwhite/arxiva with go 1.23.4.