Initial Commit
commit d2190124f9

@@ -0,0 +1,66 @@

# Arxiva - arXiv Paper Fetcher Library

Arxiva is a Go library for fetching research papers from arXiv.org and saving the results locally as JSON.

## Features

- Fetch papers by date range and search query
- Save results as JSON files
- Automatic filename sanitization (see the example below)
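
Filename sanitization replaces characters that are problematic in filenames (slashes, colons, spaces, and similar) with underscores. A minimal sketch of calling the exported `SanitizeFilename` helper directly:

```go
package main

import (
    "fmt"

    "gitea.r8z.us/stwhite/arxiva"
)

func main() {
    // The space in the query becomes an underscore.
    fmt.Println(arxiva.SanitizeFilename("quantum computing")) // quantum_computing
}
```
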
## Installation

Add the package to your project:

```bash
go get gitea.r8z.us/stwhite/arxiva
```

## Usage

Import the package in your Go code:

```go
import "gitea.r8z.us/stwhite/arxiva"
```

Example usage:

```go
package main

import (
    "log"

    "gitea.r8z.us/stwhite/arxiva"
)

func main() {
    // Fetch papers
    papers, err := arxiva.FetchPapers("20240101", "20240125", "quantum computing", 100)
    if err != nil {
        log.Fatal(err)
    }

    // Save to file
    err = arxiva.SaveToFile(papers, "20240101", "20240125", "quantum computing")
    if err != nil {
        log.Fatal(err)
    }
}
```

## API Reference

### FetchPapers(startDate, endDate, query string, maxPapers int) ([]Paper, error)

Fetches papers from the arXiv API that match the search query and were submitted within the given date range. Dates use the `YYYYMMDD` format, and `maxPapers` must be between 1 and 2000.
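
Each `Paper` in the returned slice carries `Title`, `Abstract`, and `ArxivID` fields. A minimal sketch of listing the fetched papers:

```go
package main

import (
    "fmt"
    "log"

    "gitea.r8z.us/stwhite/arxiva"
)

func main() {
    papers, err := arxiva.FetchPapers("20240101", "20240125", "quantum computing", 100)
    if err != nil {
        log.Fatal(err)
    }
    // Each Paper carries the title, abstract, and arXiv identifier.
    for _, p := range papers {
        fmt.Printf("%s: %s\n", p.ArxivID, p.Title)
    }
}
```
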
### SaveToFile(papers []Paper, startDate, endDate, query string) error

Saves the papers to a JSON file named `{startDate}-{endDate}-{sanitized query}.json` in the current working directory.
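
For the usage example above, the output file would be `20240101-20240125-quantum_computing.json`, containing an indented JSON array of papers along these lines (values and exact formatting are illustrative):

```json
[
  {
    "title": "An illustrative paper title",
    "abstract": "An illustrative abstract...",
    "arxiv_id": "2401.01234"
  }
]
```
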
## Dependencies

- Go standard library only (no third-party packages)
- arXiv API (no API key required)

## License

MIT License

@@ -0,0 +1,109 @@

package arxiva

import (
    "encoding/json"
    "encoding/xml"
    "fmt"
    "io"
    "net/http"
    "net/url"
    "os"
    "strings"
    "time"
)

// Paper is a single arXiv paper as returned by FetchPapers.
type Paper struct {
    Title    string `json:"title"`
    Abstract string `json:"abstract"`
    ArxivID  string `json:"arxiv_id"`
}

// Feed mirrors the Atom feed returned by the arXiv API.
type Feed struct {
    XMLName xml.Name `xml:"feed"`
    Entries []Entry  `xml:"entry"`
}

// Entry is a single feed entry; arXiv carries the abstract in the Atom summary element.
type Entry struct {
    Title    string `xml:"title"`
    Abstract string `xml:"summary"`
    ID       string `xml:"id"`
}

// SanitizeFilename replaces characters that are unsafe in filenames with underscores.
func SanitizeFilename(query string) string {
    invalid := []string{"/", "\\", "?", "%", "*", ":", "|", "\"", "<", ">", " "}
    sanitized := query
    for _, char := range invalid {
        sanitized = strings.ReplaceAll(sanitized, char, "_")
    }
    return sanitized
}

// extractArxivID returns the last path segment of an Atom entry ID URL,
// which is the paper's arXiv identifier.
func extractArxivID(idURL string) string {
    parts := strings.Split(idURL, "/")
    return parts[len(parts)-1]
}

// FetchPapers queries the arXiv API for papers matching query that were
// submitted between startDate and endDate (both in YYYYMMDD format) and
// returns at most maxPapers results.
func FetchPapers(startDate string, endDate string, query string, maxPapers int) ([]Paper, error) {
    if maxPapers < 1 || maxPapers > 2000 {
        return nil, fmt.Errorf("maxPapers must be between 1 and 2000")
    }

    _, err := time.Parse("20060102", startDate)
    if err != nil {
        return nil, fmt.Errorf("invalid start date format: %v", err)
    }
    _, err = time.Parse("20060102", endDate)
    if err != nil {
        return nil, fmt.Errorf("invalid end date format: %v", err)
    }

    baseURL := "http://export.arxiv.org/api/query"
    searchQuery := fmt.Sprintf("submittedDate:[%s TO %s] AND %s", startDate, endDate, query)

    params := url.Values{}
    params.Add("search_query", searchQuery)
    params.Add("max_results", fmt.Sprintf("%d", maxPapers))

    resp, err := http.Get(baseURL + "?" + params.Encode())
    if err != nil {
        return nil, fmt.Errorf("failed to fetch from arXiv: %v", err)
    }
    defer resp.Body.Close()

    body, err := io.ReadAll(resp.Body)
    if err != nil {
        return nil, fmt.Errorf("failed to read response: %v", err)
    }

    var feed Feed
    if err := xml.Unmarshal(body, &feed); err != nil {
        return nil, fmt.Errorf("failed to parse XML: %v", err)
    }

    var papers []Paper
    for _, entry := range feed.Entries {
        paper := Paper{
            Title:    strings.TrimSpace(entry.Title),
            Abstract: strings.TrimSpace(entry.Abstract),
            ArxivID:  extractArxivID(entry.ID),
        }
        papers = append(papers, paper)
    }

    return papers, nil
}

// SaveToFile writes the papers as indented JSON to a file named
// {startDate}-{endDate}-{sanitized query}.json in the current directory.
func SaveToFile(papers []Paper, startDate string, endDate string, query string) error {
    filename := fmt.Sprintf("%s-%s-%s.json", startDate, endDate, SanitizeFilename(query))

    data, err := json.MarshalIndent(papers, "", " ")
    if err != nil {
        return fmt.Errorf("failed to marshal JSON: %v", err)
    }

    err = os.WriteFile(filename, data, 0644)
    if err != nil {
        return fmt.Errorf("failed to write file: %v", err)
    }

    return nil
}