paper-system/arxiv-processor/main.go

215 lines
5.1 KiB
Go
Raw Permalink Normal View History

2025-01-24 15:26:47 +00:00
package main
import (
"bytes"
"encoding/json"
"encoding/xml"
"flag"
"fmt"
"io"
"log"
"net/http"
"net/url"
"os"
"strings"
"time"
)
// main parses the command-line flags, fetches the matching arXiv papers and
// stores them as JSON.
//
// Flags:
//
//	-search      search query (required)
//	-date-range  date range as YYYYMMDD-YYYYMMDD (required)
//	-output      destination file (default papers_data.json)
//
// Every failure ends the program through log.Fatal / log.Fatalf.
func main() {
	var (
		query   = flag.String("search", "", "Search query")
		window  = flag.String("date-range", "", "Date range in YYYYMMDD-YYYYMMDD format")
		outPath = flag.String("output", "papers_data.json", "Output file path")
	)
	flag.Parse()

	if *query == "" || *window == "" {
		log.Fatal("Both --search and --date-range are required")
	}

	// Split the range into its two endpoints and parse them in order, so a
	// bad start date is reported before a bad end date.
	endpoints := strings.Split(*window, "-")
	if len(endpoints) != 2 {
		log.Fatal("Invalid date range format. Use YYYYMMDD-YYYYMMDD")
	}
	const layout = "20060102"
	var bounds [2]time.Time
	for i, label := range [2]string{"start", "end"} {
		parsed, err := time.Parse(layout, endpoints[i])
		if err != nil {
			log.Fatalf("Invalid %s date: %v", label, err)
		}
		bounds[i] = parsed
	}

	arxiv := NewClient()

	fmt.Println("Fetching papers...")
	papers, err := arxiv.FetchPapers(*query, bounds[0], bounds[1])
	if err != nil {
		log.Fatalf("Failed to fetch papers: %v", err)
	}
	fmt.Printf("Fetched %d papers\n", len(papers))

	if err := SavePapers(*outPath, papers); err != nil {
		log.Fatalf("Failed to save papers: %v", err)
	}
	fmt.Printf("Saved paper data to %s\n", *outPath)

	// There is no LLM step yet; print the follow-up instructions instead.
	for _, hint := range []string{
		"\nTo process these papers with an LLM:",
		"1. Choose an LLM API (e.g., OpenAI, Anthropic, local model)",
		"2. Implement the LLM integration in llm package",
		"3. Run the program again with your criteria",
	} {
		fmt.Println(hint)
	}
}
// Paper is the record kept for one arXiv paper. The struct tags give the key
// names used when a Paper is encoded as JSON.
type Paper struct {
	Title    string `json:"title"`    // paper title
	Abstract string `json:"abstract"` // paper abstract
	ID       string `json:"arxiv_id"` // arXiv identifier
}
// Client bundles what is needed to talk to the arXiv API: the HTTP client
// that performs the requests and the URL they are sent to.
type Client struct {
	httpClient *http.Client // issues the HTTP requests
	baseURL    string       // endpoint that query parameters are appended to
}
// NewClient returns a Client that targets the arXiv export query endpoint and
// whose HTTP client has a 30 second timeout.
func NewClient() *Client {
	const (
		endpoint       = "http://export.arxiv.org/api/query"
		requestTimeout = 30 * time.Second
	)

	hc := &http.Client{Timeout: requestTimeout}
	return &Client{httpClient: hc, baseURL: endpoint}
}
// FetchPapers retrieves all papers that match searchQuery and fall inside the
// submittedDate range from startDate to endDate.
//
// Results are requested in pages of 100, sleeping three seconds between
// pages, until the total reported by the API has been collected or a page
// yields no papers. Both dates are sent in YYYYMMDD form. The request URL, the
// response status and the beginning of every raw response are printed to
// standard output.
//
// It returns the accumulated papers, or a nil slice and an error when a
// request fails, the server answers with a status other than 200 OK, or a
// response cannot be parsed.
func (c *Client) FetchPapers(searchQuery string, startDate, endDate time.Time) ([]Paper, error) {
	const (
		batchSize = 100
		delay     = 3 * time.Second // pause between pages to respect arXiv's rate limits
	)

	// The search expression is the same for every page, so build it once.
	query := fmt.Sprintf("%s AND submittedDate:[%s TO %s]",
		searchQuery,
		startDate.Format("20060102"),
		endDate.Format("20060102"),
	)

	var papers []Paper
	start := 0
	for {
		params := url.Values{}
		params.Add("search_query", query)
		params.Add("start", fmt.Sprintf("%d", start))
		params.Add("max_results", fmt.Sprintf("%d", batchSize))
		reqURL := fmt.Sprintf("%s?%s", c.baseURL, params.Encode())

		fmt.Printf("Making request to: %s\n", reqURL)
		body, err := c.fetchPage(reqURL)
		if err != nil {
			return nil, err
		}

		batch, totalResults, err := c.parseResponse(bytes.NewReader(body))
		if err != nil {
			return nil, fmt.Errorf("failed to parse response: %w", err)
		}
		papers = append(papers, batch...)
		start += len(batch)

		// Stop once everything has been collected; an empty page also ends
		// the loop so a misreported total cannot make it spin forever.
		if start >= totalResults || len(batch) == 0 {
			break
		}
		time.Sleep(delay)
	}
	return papers, nil
}

// fetchPage performs one GET request and returns the complete response body.
// The body is closed before fetchPage returns, so response bodies do not stay
// open across pages. A status other than 200 OK is reported as an error
// instead of handing the payload to the XML parser.
func (c *Client) fetchPage(reqURL string) ([]byte, error) {
	resp, err := c.httpClient.Get(reqURL)
	if err != nil {
		return nil, fmt.Errorf("failed to fetch papers: %w", err)
	}
	defer resp.Body.Close()

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return nil, fmt.Errorf("failed to read response body: %w", err)
	}

	// Log the status and the start of the payload for debugging. The cut is
	// made at 1000 bytes.
	fmt.Printf("Response status: %s\n", resp.Status)
	fmt.Printf("Raw API response (first 1000 chars):\n%s\n", string(body[:min(len(body), 1000)]))

	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("unexpected response status: %s", resp.Status)
	}
	return body, nil
}
// parseResponse decodes the Atom feed read from r and converts its entries
// into Papers.
//
// It returns the papers, the feed's totalResults value, and an error if the
// XML cannot be decoded. A paper's ID is the part of the entry id that follows
// the first "/abs/" (up to the next "/abs/", if there is one); entries whose
// id contains no "/abs/" are dropped. Titles and summaries have leading and
// trailing whitespace removed.
func (c *Client) parseResponse(r io.Reader) ([]Paper, int, error) {
	// Only the elements that are actually used are mapped.
	type entry struct {
		Title   string `xml:"title"`
		Summary string `xml:"summary"`
		ID      string `xml:"id"`
	}
	var doc struct {
		XMLName      xml.Name `xml:"feed"`
		TotalResults int      `xml:"totalResults"`
		Entries      []entry  `xml:"entry"`
	}

	decoder := xml.NewDecoder(r)
	if err := decoder.Decode(&doc); err != nil {
		return nil, 0, fmt.Errorf("failed to decode XML: %w", err)
	}

	var result []Paper
	for i := range doc.Entries {
		item := &doc.Entries[i]

		// Element 1 is the text between the first and second "/abs/".
		segments := strings.SplitN(item.ID, "/abs/", 3)
		if len(segments) < 2 {
			continue
		}

		result = append(result, Paper{
			Title:    strings.TrimSpace(item.Title),
			Abstract: strings.TrimSpace(item.Summary),
			ID:       segments[1],
		})
	}
	return result, doc.TotalResults, nil
}
// SavePapers writes papers to filename as indented JSON, creating the file or
// truncating an existing one.
//
// A nil slice is written as an empty array ("[]") rather than "null", so
// readers of the file always find a JSON array.
//
// It returns an error if the file cannot be created, the papers cannot be
// encoded, or the file cannot be closed.
func SavePapers(filename string, papers []Paper) error {
	file, err := os.Create(filename)
	if err != nil {
		return fmt.Errorf("failed to create file: %w", err)
	}

	if papers == nil {
		papers = []Paper{}
	}

	encoder := json.NewEncoder(file)
	encoder.SetIndent("", " ")
	if err := encoder.Encode(papers); err != nil {
		// The encoding failure is the error worth reporting; a close error
		// on top of it would add nothing.
		_ = file.Close()
		return fmt.Errorf("failed to encode JSON: %w", err)
	}

	// Check Close explicitly: a write problem that only shows up when the
	// file is closed must not be reported as success.
	if err := file.Close(); err != nil {
		return fmt.Errorf("failed to close file: %w", err)
	}
	return nil
}
// min returns the smaller of a and b.
func min(a, b int) int {
	if b < a {
		return b
	}
	return a
}