Initial Commit

This commit is contained in:
Steve White 2025-01-25 11:07:24 -06:00
commit d2190124f9
3 changed files with 178 additions and 0 deletions

66
README.md Normal file
View File

@ -0,0 +1,66 @@
# Arxiva - arXiv Paper Fetcher Library
Arxiva is a Go library for fetching and saving research papers from arXiv.org.
## Features
- Fetch papers by date range and search query
- Save results as JSON files
- Automatic filename sanitization
## Installation
Add the package to your project:
```bash
go get gitea.r8z.us/stwhite/arxiva
```
## Usage
Import the package in your Go code:
```go
import "gitea.r8z.us/stwhite/arxiva"
```
Example usage:
```go
package main
import (
"log"
"gitea.r8z.us/stwhite/arxiva"
)
func main() {
// Fetch papers
papers, err := arxiva.FetchPapers("20240101", "20240125", "quantum computing", 100)
if err != nil {
log.Fatal(err)
}
// Save to file
err = arxiva.SaveToFile(papers, "20240101", "20240125", "quantum computing")
if err != nil {
log.Fatal(err)
}
}
```
## API Reference
### FetchPapers(startDate, endDate, query string, maxPapers int) ([]Paper, error)
Fetches up to maxPapers papers (1–2000) from the arXiv API for the given date range (YYYYMMDD) and search query
### SaveToFile(papers []Paper, startDate, endDate, query string) error
Saves papers to a JSON file
## Dependencies
- Standard Go libraries
- arXiv API (no API key required)
## License
MIT License

109
arxiva.go Normal file
View File

@ -0,0 +1,109 @@
package arxiva
import (
"encoding/json"
"encoding/xml"
"fmt"
"io/ioutil"
"net/http"
"net/url"
"strings"
"time"
)
// Paper is the JSON-serializable result record produced by FetchPapers:
// a paper's title, abstract, and bare arXiv identifier.
type Paper struct {
	Title    string `json:"title"`
	Abstract string `json:"abstract"`
	ArxivID  string `json:"arxiv_id"`
}

// Feed models the top-level Atom <feed> element returned by the arXiv API;
// only the <entry> children are decoded.
type Feed struct {
	XMLName xml.Name `xml:"feed"`
	Entries []Entry  `xml:"entry"`
}

// Entry models a single Atom <entry> from the arXiv feed. The <id> element
// holds a URL (e.g. an abs/ link) from which the arXiv ID is extracted.
type Entry struct {
	Title    string `xml:"title"`
	Abstract string `xml:"summary"`
	ID       string `xml:"id"`
}
// SanitizeFilename returns query with every character that is unsafe in a
// filename (path separators, shell/glob metacharacters, quotes, and spaces)
// replaced by an underscore.
func SanitizeFilename(query string) string {
	return strings.Map(func(r rune) rune {
		switch r {
		case '/', '\\', '?', '%', '*', ':', '|', '"', '<', '>', ' ':
			return '_'
		}
		return r
	}, query)
}
// extractArxivID returns the final path segment of an arXiv entry ID URL
// (everything after the last "/"), which is the bare arXiv identifier.
// A string with no "/" is returned unchanged.
func extractArxivID(idURL string) string {
	if slash := strings.LastIndex(idURL, "/"); slash >= 0 {
		return idURL[slash+1:]
	}
	return idURL
}
// FetchPapers queries the arXiv API for papers matching query that were
// submitted between startDate and endDate (both in YYYYMMDD format) and
// returns up to maxPapers results.
//
// It returns an error if either date is malformed, maxPapers is outside
// [1, 2000], the HTTP request fails or times out, the API responds with a
// non-200 status, or the Atom feed cannot be parsed.
func FetchPapers(startDate string, endDate string, query string, maxPapers int) ([]Paper, error) {
	if maxPapers < 1 || maxPapers > 2000 {
		return nil, fmt.Errorf("maxPapers must be between 1 and 2000")
	}
	// Validate both dates up front so callers get a clear message instead
	// of an empty result set from a malformed query.
	if _, err := time.Parse("20060102", startDate); err != nil {
		return nil, fmt.Errorf("invalid start date format: %v", err)
	}
	if _, err := time.Parse("20060102", endDate); err != nil {
		return nil, fmt.Errorf("invalid end date format: %v", err)
	}

	baseURL := "http://export.arxiv.org/api/query"
	searchQuery := fmt.Sprintf("submittedDate:[%s TO %s] AND %s", startDate, endDate, query)
	params := url.Values{}
	params.Add("search_query", searchQuery)
	params.Add("max_results", fmt.Sprintf("%d", maxPapers))

	// Use an explicit client with a timeout: http.Get has none, so a
	// stalled arXiv endpoint would otherwise hang the caller forever.
	client := &http.Client{Timeout: 30 * time.Second}
	resp, err := client.Get(baseURL + "?" + params.Encode())
	if err != nil {
		return nil, fmt.Errorf("failed to fetch from arXiv: %v", err)
	}
	defer resp.Body.Close()

	// Previously the status code was never checked, so a 4xx/5xx error
	// page was handed to the XML parser and surfaced as a confusing
	// parse error. Fail explicitly instead.
	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("arXiv API returned status %d", resp.StatusCode)
	}

	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		return nil, fmt.Errorf("failed to read response: %v", err)
	}

	var feed Feed
	if err := xml.Unmarshal(body, &feed); err != nil {
		return nil, fmt.Errorf("failed to parse XML: %v", err)
	}

	// Pre-size the result slice; the entry count is known.
	papers := make([]Paper, 0, len(feed.Entries))
	for _, entry := range feed.Entries {
		papers = append(papers, Paper{
			Title:    strings.TrimSpace(entry.Title),
			Abstract: strings.TrimSpace(entry.Abstract),
			ArxivID:  extractArxivID(entry.ID),
		})
	}
	return papers, nil
}
// SaveToFile writes papers as indented JSON to a file named
// "<startDate>-<endDate>-<sanitized query>.json" in the current working
// directory, created with 0644 permissions.
func SaveToFile(papers []Paper, startDate string, endDate string, query string) error {
	data, err := json.MarshalIndent(papers, "", "  ")
	if err != nil {
		return fmt.Errorf("failed to marshal JSON: %v", err)
	}
	filename := fmt.Sprintf("%s-%s-%s.json", startDate, endDate, SanitizeFilename(query))
	if err := ioutil.WriteFile(filename, data, 0644); err != nil {
		return fmt.Errorf("failed to write file: %v", err)
	}
	return nil
}

3
go.mod Normal file
View File

@ -0,0 +1,3 @@
module gitea.r8z.us/stwhite/arxiva
go 1.23.4