215 lines
5.1 KiB
Go
215 lines
5.1 KiB
Go
|
package main
|
||
|
|
||
|
import (
|
||
|
"bytes"
|
||
|
"encoding/json"
|
||
|
"encoding/xml"
|
||
|
"flag"
|
||
|
"fmt"
|
||
|
"io"
|
||
|
"log"
|
||
|
"net/http"
|
||
|
"net/url"
|
||
|
"os"
|
||
|
"strings"
|
||
|
"time"
|
||
|
)
|
||
|
|
||
|
func main() {
|
||
|
searchQuery := flag.String("search", "", "Search query")
|
||
|
dateRange := flag.String("date-range", "", "Date range in YYYYMMDD-YYYYMMDD format")
|
||
|
outputFile := flag.String("output", "papers_data.json", "Output file path")
|
||
|
|
||
|
flag.Parse()
|
||
|
|
||
|
if *searchQuery == "" || *dateRange == "" {
|
||
|
log.Fatal("Both --search and --date-range are required")
|
||
|
}
|
||
|
|
||
|
// Parse date range
|
||
|
dates := strings.Split(*dateRange, "-")
|
||
|
if len(dates) != 2 {
|
||
|
log.Fatal("Invalid date range format. Use YYYYMMDD-YYYYMMDD")
|
||
|
}
|
||
|
|
||
|
startDate, err := time.Parse("20060102", dates[0])
|
||
|
if err != nil {
|
||
|
log.Fatalf("Invalid start date: %v", err)
|
||
|
}
|
||
|
|
||
|
endDate, err := time.Parse("20060102", dates[1])
|
||
|
if err != nil {
|
||
|
log.Fatalf("Invalid end date: %v", err)
|
||
|
}
|
||
|
|
||
|
// Create arXiv client
|
||
|
client := NewClient()
|
||
|
|
||
|
// Fetch papers
|
||
|
fmt.Println("Fetching papers...")
|
||
|
papers, err := client.FetchPapers(*searchQuery, startDate, endDate)
|
||
|
if err != nil {
|
||
|
log.Fatalf("Failed to fetch papers: %v", err)
|
||
|
}
|
||
|
|
||
|
fmt.Printf("Fetched %d papers\n", len(papers))
|
||
|
|
||
|
// Save papers to JSON
|
||
|
err = SavePapers(*outputFile, papers)
|
||
|
if err != nil {
|
||
|
log.Fatalf("Failed to save papers: %v", err)
|
||
|
}
|
||
|
|
||
|
fmt.Printf("Saved paper data to %s\n", *outputFile)
|
||
|
|
||
|
// LLM processing placeholder
|
||
|
fmt.Println("\nTo process these papers with an LLM:")
|
||
|
fmt.Println("1. Choose an LLM API (e.g., OpenAI, Anthropic, local model)")
|
||
|
fmt.Println("2. Implement the LLM integration in llm package")
|
||
|
fmt.Println("3. Run the program again with your criteria")
|
||
|
}
|
||
|
|
||
|
// Paper represents a single arXiv paper. The struct tags define the keys used
// when a Paper is encoded as JSON.
type Paper struct {
	Title    string `json:"title"`    // paper title
	Abstract string `json:"abstract"` // paper abstract
	ID       string `json:"arxiv_id"` // arXiv identifier
}
|
||
|
|
||
|
// Client handles arXiv API interactions.
type Client struct {
	httpClient *http.Client // HTTP client used to issue API requests
	baseURL    string       // endpoint URL that query parameters are appended to
}
|
||
|
|
||
|
// NewClient creates a new arXiv client
|
||
|
func NewClient() *Client {
|
||
|
return &Client{
|
||
|
httpClient: &http.Client{
|
||
|
Timeout: 30 * time.Second,
|
||
|
},
|
||
|
baseURL: "http://export.arxiv.org/api/query",
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// FetchPapers retrieves papers matching the search query within the date range
|
||
|
func (c *Client) FetchPapers(searchQuery string, startDate, endDate time.Time) ([]Paper, error) {
|
||
|
var papers []Paper
|
||
|
start := 0
|
||
|
batchSize := 100
|
||
|
delay := 3 * time.Second
|
||
|
|
||
|
for {
|
||
|
// Construct query with date range and pagination
|
||
|
query := fmt.Sprintf("%s AND submittedDate:[%s TO %s]",
|
||
|
searchQuery,
|
||
|
startDate.Format("20060102"),
|
||
|
endDate.Format("20060102"),
|
||
|
)
|
||
|
|
||
|
// Build URL with query parameters
|
||
|
params := url.Values{}
|
||
|
params.Add("search_query", query)
|
||
|
params.Add("start", fmt.Sprintf("%d", start))
|
||
|
params.Add("max_results", fmt.Sprintf("%d", batchSize))
|
||
|
|
||
|
reqURL := fmt.Sprintf("%s?%s", c.baseURL, params.Encode())
|
||
|
|
||
|
// Make HTTP request
|
||
|
fmt.Printf("Making request to: %s\n", reqURL)
|
||
|
resp, err := c.httpClient.Get(reqURL)
|
||
|
if err != nil {
|
||
|
return nil, fmt.Errorf("failed to fetch papers: %w", err)
|
||
|
}
|
||
|
defer resp.Body.Close()
|
||
|
|
||
|
// Read and log raw response for debugging
|
||
|
bodyBytes, err := io.ReadAll(resp.Body)
|
||
|
if err != nil {
|
||
|
return nil, fmt.Errorf("failed to read response body: %w", err)
|
||
|
}
|
||
|
fmt.Printf("Response status: %s\n", resp.Status)
|
||
|
fmt.Printf("Raw API response (first 1000 chars):\n%s\n", string(bodyBytes[:min(len(bodyBytes), 1000)]))
|
||
|
|
||
|
// Parse XML response
|
||
|
batch, totalResults, err := c.parseResponse(bytes.NewReader(bodyBytes))
|
||
|
if err != nil {
|
||
|
return nil, fmt.Errorf("failed to parse response: %w", err)
|
||
|
}
|
||
|
|
||
|
papers = append(papers, batch...)
|
||
|
start += len(batch)
|
||
|
|
||
|
// Check if we've fetched all papers
|
||
|
if start >= totalResults || len(batch) == 0 {
|
||
|
break
|
||
|
}
|
||
|
|
||
|
// Respect arXiv's rate limits
|
||
|
time.Sleep(delay)
|
||
|
}
|
||
|
|
||
|
return papers, nil
|
||
|
}
|
||
|
|
||
|
// parseResponse handles XML parsing of the arXiv API response
|
||
|
func (c *Client) parseResponse(r io.Reader) ([]Paper, int, error) {
|
||
|
type atomEntry struct {
|
||
|
Title string `xml:"title"`
|
||
|
Summary string `xml:"summary"`
|
||
|
ID string `xml:"id"`
|
||
|
}
|
||
|
|
||
|
type atomFeed struct {
|
||
|
XMLName xml.Name `xml:"feed"`
|
||
|
TotalResults int `xml:"totalResults"`
|
||
|
Entries []atomEntry `xml:"entry"`
|
||
|
}
|
||
|
|
||
|
var feed atomFeed
|
||
|
if err := xml.NewDecoder(r).Decode(&feed); err != nil {
|
||
|
return nil, 0, fmt.Errorf("failed to decode XML: %w", err)
|
||
|
}
|
||
|
|
||
|
var papers []Paper
|
||
|
for _, entry := range feed.Entries {
|
||
|
// Extract just the ID part from the full URL
|
||
|
idParts := strings.Split(entry.ID, "/abs/")
|
||
|
if len(idParts) < 2 {
|
||
|
continue
|
||
|
}
|
||
|
|
||
|
papers = append(papers, Paper{
|
||
|
Title: strings.TrimSpace(entry.Title),
|
||
|
Abstract: strings.TrimSpace(entry.Summary),
|
||
|
ID: idParts[1],
|
||
|
})
|
||
|
}
|
||
|
|
||
|
return papers, feed.TotalResults, nil
|
||
|
}
|
||
|
|
||
|
// SavePapers saves the papers to a JSON file
|
||
|
func SavePapers(filename string, papers []Paper) error {
|
||
|
file, err := os.Create(filename)
|
||
|
if err != nil {
|
||
|
return fmt.Errorf("failed to create file: %w", err)
|
||
|
}
|
||
|
defer file.Close()
|
||
|
|
||
|
encoder := json.NewEncoder(file)
|
||
|
encoder.SetIndent("", " ")
|
||
|
if err := encoder.Encode(papers); err != nil {
|
||
|
return fmt.Errorf("failed to encode JSON: %w", err)
|
||
|
}
|
||
|
|
||
|
return nil
|
||
|
}
|
||
|
|
||
|
// min returns the smaller of a and b.
func min(a, b int) int {
	if a < b {
		return a
	}
	return b
}
|