// This program fetches arXiv paper metadata matching a search query and a
// submission-date range, and writes the result to a JSON file.
package main

import (
	"bytes"
	"encoding/json"
	"encoding/xml"
	"flag"
	"fmt"
	"io"
	"log"
	"net/http"
	"net/url"
	"os"
	"strings"
	"time"
)

// main parses the command-line flags, fetches the matching papers, and saves
// them to the output file. Every failure terminates the program through
// log.Fatal / log.Fatalf.
func main() {
	searchQuery := flag.String("search", "", "Search query")
	dateRange := flag.String("date-range", "", "Date range in YYYYMMDD-YYYYMMDD format")
	outputFile := flag.String("output", "papers_data.json", "Output file path")
	flag.Parse()

	if *searchQuery == "" || *dateRange == "" {
		log.Fatal("Both --search and --date-range are required")
	}

	// Parse date range
	dates := strings.Split(*dateRange, "-")
	if len(dates) != 2 {
		log.Fatal("Invalid date range format. Use YYYYMMDD-YYYYMMDD")
	}
	startDate, err := time.Parse("20060102", dates[0])
	if err != nil {
		log.Fatalf("Invalid start date: %v", err)
	}
	endDate, err := time.Parse("20060102", dates[1])
	if err != nil {
		log.Fatalf("Invalid end date: %v", err)
	}
	// A reversed range can never match anything; fail before hitting the API.
	if endDate.Before(startDate) {
		log.Fatal("Invalid date range: end date is before start date")
	}

	// Create arXiv client
	client := NewClient()

	// Fetch papers
	fmt.Println("Fetching papers...")
	papers, err := client.FetchPapers(*searchQuery, startDate, endDate)
	if err != nil {
		log.Fatalf("Failed to fetch papers: %v", err)
	}
	fmt.Printf("Fetched %d papers\n", len(papers))

	// Save papers to JSON
	if err := SavePapers(*outputFile, papers); err != nil {
		log.Fatalf("Failed to save papers: %v", err)
	}
	fmt.Printf("Saved paper data to %s\n", *outputFile)

	// LLM processing placeholder
	fmt.Println("\nTo process these papers with an LLM:")
	fmt.Println("1. Choose an LLM API (e.g., OpenAI, Anthropic, local model)")
	fmt.Println("2. Implement the LLM integration in llm package")
	fmt.Println("3. Run the program again with your criteria")
}

// Paper represents a single arXiv paper.
type Paper struct {
	Title    string `json:"title"`    // entry title, surrounding whitespace trimmed
	Abstract string `json:"abstract"` // entry summary, surrounding whitespace trimmed
	ID       string `json:"arxiv_id"` // part of the entry ID URL after "/abs/"
}

// Client handles arXiv API interactions.
type Client struct {
	httpClient *http.Client // used for every API request
	baseURL    string       // query endpoint; request parameters are appended to it
}

// NewClient creates a new arXiv client that targets the public export
// endpoint and uses a 30-second HTTP timeout.
func NewClient() *Client {
	return &Client{
		httpClient: &http.Client{
			Timeout: 30 * time.Second,
		},
		baseURL: "http://export.arxiv.org/api/query",
	}
}

// FetchPapers retrieves papers matching the search query within the date range.
//
// Only the calendar date of startDate and endDate is used; both days are
// included in full. Results are requested in pages of 100, with a 3-second
// pause between pages. On any error the papers collected so far are discarded
// and a nil slice is returned together with the error.
func (c *Client) FetchPapers(searchQuery string, startDate, endDate time.Time) ([]Paper, error) {
	const (
		batchSize = 100
		delay     = 3 * time.Second
	)

	// The query is identical for every page, so build it once.
	query := buildQuery(searchQuery, startDate, endDate)

	var papers []Paper
	start := 0
	for {
		batch, totalResults, err := c.fetchBatch(query, start, batchSize)
		if err != nil {
			return nil, err
		}
		papers = append(papers, batch...)
		start += len(batch)

		// Check if we've fetched all papers
		if start >= totalResults || len(batch) == 0 {
			break
		}

		// Respect arXiv's rate limits
		time.Sleep(delay)
	}
	return papers, nil
}

// buildQuery combines the user's search query with a submittedDate filter.
//
// The arXiv API manual documents the filter bounds as YYYYMMDDTTTT (24-hour
// time to the minute), so each day is expanded to 0000 for the lower bound and
// 2359 for the upper bound, covering both days entirely.
func buildQuery(searchQuery string, startDate, endDate time.Time) string {
	return fmt.Sprintf("%s AND submittedDate:[%s0000 TO %s2359]",
		searchQuery,
		startDate.Format("20060102"),
		endDate.Format("20060102"),
	)
}

// fetchBatch requests one page of results and returns the parsed papers along
// with the total result count reported by the feed.
//
// It is a separate method so that the deferred Body.Close runs at the end of
// each page rather than piling up until the whole pagination loop finishes.
func (c *Client) fetchBatch(query string, start, maxResults int) ([]Paper, int, error) {
	// Build URL with query parameters
	params := url.Values{}
	params.Add("search_query", query)
	params.Add("start", fmt.Sprintf("%d", start))
	params.Add("max_results", fmt.Sprintf("%d", maxResults))
	reqURL := fmt.Sprintf("%s?%s", c.baseURL, params.Encode())

	// Make HTTP request
	fmt.Printf("Making request to: %s\n", reqURL)
	resp, err := c.httpClient.Get(reqURL)
	if err != nil {
		return nil, 0, fmt.Errorf("failed to fetch papers: %w", err)
	}
	defer resp.Body.Close()

	// Read and log raw response for debugging
	bodyBytes, err := io.ReadAll(resp.Body)
	if err != nil {
		return nil, 0, fmt.Errorf("failed to read response body: %w", err)
	}
	fmt.Printf("Response status: %s\n", resp.Status)
	fmt.Printf("Raw API response (first 1000 chars):\n%s\n", string(bodyBytes[:min(len(bodyBytes), 1000)]))

	// A non-200 reply is not a result page, even if its body happens to parse.
	if resp.StatusCode != http.StatusOK {
		return nil, 0, fmt.Errorf("failed to fetch papers: unexpected status %s", resp.Status)
	}

	// Parse XML response
	batch, totalResults, err := c.parseResponse(bytes.NewReader(bodyBytes))
	if err != nil {
		return nil, 0, fmt.Errorf("failed to parse response: %w", err)
	}
	return batch, totalResults, nil
}

// parseResponse handles XML parsing of the arXiv API response.
//
// It returns the papers found in the feed and the feed's totalResults value.
// Entries whose ID does not contain "/abs/" are skipped.
func (c *Client) parseResponse(r io.Reader) ([]Paper, int, error) {
	type atomEntry struct {
		Title   string `xml:"title"`
		Summary string `xml:"summary"`
		ID      string `xml:"id"`
	}
	type atomFeed struct {
		XMLName      xml.Name    `xml:"feed"`
		TotalResults int         `xml:"totalResults"`
		Entries      []atomEntry `xml:"entry"`
	}

	var feed atomFeed
	if err := xml.NewDecoder(r).Decode(&feed); err != nil {
		return nil, 0, fmt.Errorf("failed to decode XML: %w", err)
	}

	var papers []Paper
	for _, entry := range feed.Entries {
		// Extract just the ID part from the full URL
		idParts := strings.Split(entry.ID, "/abs/")
		if len(idParts) < 2 {
			continue
		}
		papers = append(papers, Paper{
			Title:    strings.TrimSpace(entry.Title),
			Abstract: strings.TrimSpace(entry.Summary),
			ID:       idParts[1],
		})
	}
	return papers, feed.TotalResults, nil
}

// SavePapers saves the papers to a JSON file, creating or truncating filename.
//
// A nil or empty slice is written as an empty JSON array. An error is returned
// if the file cannot be created, encoded to, or closed.
func SavePapers(filename string, papers []Paper) error {
	file, err := os.Create(filename)
	if err != nil {
		return fmt.Errorf("failed to create file: %w", err)
	}

	// encoding/json renders a nil slice as null; normalise so that the output
	// is always an array.
	if papers == nil {
		papers = []Paper{}
	}

	encoder := json.NewEncoder(file)
	encoder.SetIndent("", " ")
	if err := encoder.Encode(papers); err != nil {
		// The encode error is the one worth reporting; Close is best-effort here.
		_ = file.Close()
		return fmt.Errorf("failed to encode JSON: %w", err)
	}

	// Close can surface a deferred write failure, so its error must be checked.
	if err := file.Close(); err != nil {
		return fmt.Errorf("failed to close file: %w", err)
	}
	return nil
}

// min returns the smaller of a and b.
func min(a, b int) int {
	if a < b {
		return a
	}
	return b
}