package arxiva

import (
	"encoding/json"
	"encoding/xml"
	"fmt"
	"io"
	"net/http"
	"net/url"
	"os"
	"strings"
	"time"
)

// httpClient is shared by all requests to the arXiv API.
var httpClient = &http.Client{}
// Paper holds the fields extracted for a single arXiv result.
type Paper struct {
	Title    string `json:"title"`
	Abstract string `json:"abstract"`
	ArxivID  string `json:"arxiv_id"`
}

// Feed maps the top-level <feed> element of the arXiv Atom response.
type Feed struct {
	XMLName xml.Name `xml:"feed"`
	Entries []Entry  `xml:"entry"`
}

// Entry maps a single <entry> element within the feed.
type Entry struct {
	Title    string `xml:"title"`
	Abstract string `xml:"summary"`
	ID       string `xml:"id"`
}
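// For reference, the arXiv API returns an Atom document and the structs above
// map only the elements used here. A response looks roughly like this
// (illustrative values, other elements omitted):
//
//	<feed xmlns="http://www.w3.org/2005/Atom">
//	  <entry>
//	    <id>http://arxiv.org/abs/2401.00001v1</id>
//	    <title>An Example Title</title>
//	    <summary>An example abstract.</summary>
//	  </entry>
//	</feed>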
// SanitizeFilename replaces characters that are unsafe in file names
// (path separators, wildcards, quotes, spaces, and similar) with underscores.
func SanitizeFilename(query string) string {
	invalid := []string{"/", "\\", "?", "%", "*", ":", "|", "\"", "<", ">", " "}
	sanitized := query
	for _, char := range invalid {
		sanitized = strings.ReplaceAll(sanitized, char, "_")
	}
	return sanitized
}
// extractArxivID returns the final path segment of an Atom entry ID URL,
// which is the arXiv identifier (e.g. "2401.00001v1").
func extractArxivID(idURL string) string {
	parts := strings.Split(idURL, "/")
	return parts[len(parts)-1]
}
// FetchPapers queries the arXiv API for papers matching query that were
// submitted between startDate and endDate (both in YYYYMMDD format) and
// returns at most maxPapers results (1-2000).
func FetchPapers(startDate string, endDate string, query string, maxPapers int) ([]Paper, error) {
	if maxPapers < 1 || maxPapers > 2000 {
		return nil, fmt.Errorf("maxPapers must be between 1 and 2000")
	}

	_, err := time.Parse("20060102", startDate)
	if err != nil {
		return nil, fmt.Errorf("invalid start date format: %v", err)
	}
	_, err = time.Parse("20060102", endDate)
	if err != nil {
		return nil, fmt.Errorf("invalid end date format: %v", err)
	}

	baseURL := "http://export.arxiv.org/api/query"
	searchQuery := fmt.Sprintf("submittedDate:[%s TO %s] AND %s", startDate, endDate, query)

	params := url.Values{}
	params.Add("search_query", searchQuery)
	params.Add("max_results", fmt.Sprintf("%d", maxPapers))
	req, err := http.NewRequest("GET", baseURL+"?"+params.Encode(), nil)
	if err != nil {
		return nil, fmt.Errorf("failed to create request: %v", err)
	}
	resp, err := httpClient.Do(req)
	if err != nil {
		return nil, fmt.Errorf("failed to fetch from arXiv: %v", err)
	}
	defer resp.Body.Close()

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return nil, fmt.Errorf("failed to read response: %v", err)
	}

	var feed Feed
	if err := xml.Unmarshal(body, &feed); err != nil {
		return nil, fmt.Errorf("failed to parse XML: %v", err)
	}

	var papers []Paper
	for _, entry := range feed.Entries {
		paper := Paper{
			Title:    strings.TrimSpace(entry.Title),
			Abstract: strings.TrimSpace(entry.Abstract),
			ArxivID:  extractArxivID(entry.ID),
		}
		papers = append(papers, paper)
	}

	return papers, nil
}
// SaveToFile writes papers as indented JSON to a file named
// "<startDate>-<endDate>-<sanitized query>.json" in the working directory.
func SaveToFile(papers []Paper, startDate string, endDate string, query string) error {
	if len(papers) == 0 {
		return fmt.Errorf("no papers to save")
	}

	filename := fmt.Sprintf("%s-%s-%s.json", startDate, endDate, SanitizeFilename(query))

	data, err := json.MarshalIndent(papers, "", " ")
	if err != nil {
		return fmt.Errorf("failed to marshal JSON: %v", err)
	}

	err = os.WriteFile(filename, data, 0644)
	if err != nil {
		return fmt.Errorf("failed to write file: %v", err)
	}

	return nil
}
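// fetchAndSaveExample is an illustrative usage sketch: it chains FetchPapers
// and SaveToFile for a single query. The date range, query string, and result
// cap below are placeholder values, not defaults of this package.
func fetchAndSaveExample() error {
	papers, err := FetchPapers("20240101", "20240131", "cat:cs.CL", 100)
	if err != nil {
		return err
	}
	return SaveToFile(papers, "20240101", "20240131", "cat:cs.CL")
}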