package main

import (
	"flag"
	"fmt"
	"log"
	"os"
	"regexp"
	"strings"
	"time"

	"gitea.r8z.us/stwhite/arxiva"
	"gitea.r8z.us/stwhite/paperformatter"
	"gitea.r8z.us/stwhite/paperprocessor"
)

// sanitizeFilename replaces invalid filename characters to match arxiva's sanitization
func sanitizeFilename(s string) string {
	s = strings.ReplaceAll(s, ":", "_")
	s = strings.ReplaceAll(s, " ", "_")
	return s
}

// isValidDate checks if the date string is in YYYYMMDD format
func isValidDate(date string) bool {
	// Check basic format with regex
	matched, err := regexp.MatchString(`^\d{8}$`, date)
	if err != nil || !matched {
		return false
	}

	// Parse date to verify it's a valid date
	_, err = time.Parse("20060102", date)
	return err == nil
}

func main() {
	// Set custom usage message
	flag.Usage = func() {
		fmt.Fprintf(os.Stderr, "Usage: %s [options]\n\n", os.Args[0])
		fmt.Fprintf(os.Stderr, "Description:\n")
		fmt.Fprintf(os.Stderr, " Fetches papers from arXiv, processes them using an LLM, and generates both JSON and Markdown outputs.\n\n")
		fmt.Fprintf(os.Stderr, "Pipeline:\n")
		fmt.Fprintf(os.Stderr, " 1. Fetches papers from arXiv based on date range and query\n")
		fmt.Fprintf(os.Stderr, " 2. Saves raw papers to JSON (format: YYYYMMDD-YYYYMMDD-query.json)\n")
		fmt.Fprintf(os.Stderr, " 3. Processes papers using specified LLM model\n")
		fmt.Fprintf(os.Stderr, " 4. Formats results to both JSON and Markdown\n\n")
		fmt.Fprintf(os.Stderr, "Required flags:\n")
		fmt.Fprintf(os.Stderr, " -start : Start date (YYYYMMDD)\n")
		fmt.Fprintf(os.Stderr, " -end : End date (YYYYMMDD)\n")
		fmt.Fprintf(os.Stderr, " -query : Search query\n")
		fmt.Fprintf(os.Stderr, " -api-key : API key for LLM service\n\n")
		fmt.Fprintf(os.Stderr, "Options:\n")
		flag.PrintDefaults()
		fmt.Fprintf(os.Stderr, "\nExamples:\n")
		fmt.Fprintf(os.Stderr, " Basic usage:\n")
		fmt.Fprintf(os.Stderr, " %s -start 20240101 -end 20240131 -query \"machine learning\" -api-key \"your-key\"\n\n", os.Args[0])
		fmt.Fprintf(os.Stderr, " With custom model and outputs:\n")
		fmt.Fprintf(os.Stderr, " %s -start 20240101 -end 20240131 -query \"machine learning\" -api-key \"your-key\" \\\n", os.Args[0])
		fmt.Fprintf(os.Stderr, " -model \"gpt-4\" -json-output \"results.json\" -md-output \"summary.md\"\n")
	}

	// Parse command line arguments
	startDate := flag.String("start", "", "Start date in YYYYMMDD format")
	endDate := flag.String("end", "", "End date in YYYYMMDD format")
	query := flag.String("query", "", "Search query")
	maxResults := flag.Int("maxResults", 100, "Maximum number of results (1-2000)")
	model := flag.String("model", "phi-4", "Model to use for processing")
	apiKey := flag.String("api-key", "", "API key for service authentication")
	apiEndpoint := flag.String("api-endpoint", "http://localhost:1234/v1/chat/completions", "API endpoint URL")
	criteriaFile := flag.String("criteria", "criteria.md", "Path to evaluation criteria markdown file")
	jsonOutput := flag.String("json-output", "", "JSON output file path (default: YYYYMMDD-YYYYMMDD-query.json)")
	mdOutput := flag.String("md-output", "", "Markdown output file path (default: YYYYMMDD-YYYYMMDD-query.md)")

	flag.Parse()

	// Generate base filename from parameters with sanitization
	baseFilename := fmt.Sprintf("%s-%s-%s", *startDate, *endDate, sanitizeFilename(*query))

	// Set default output filenames if not provided
	if *jsonOutput == "" {
		*jsonOutput = baseFilename + ".json"
	}
	if *mdOutput == "" {
		*mdOutput = baseFilename + ".md"
	}

	// Validate required flags
	if *startDate == "" || *endDate == "" || *query == "" || *apiKey == "" {
		fmt.Fprintf(os.Stderr, "Error: start date, end date, query, and api-key are required\n\n")
		flag.Usage()
		os.Exit(1)
	}

	// Validate date format
	if !isValidDate(*startDate) || !isValidDate(*endDate) {
		fmt.Fprintf(os.Stderr, "Error: dates must be in YYYYMMDD format\n")
		os.Exit(1)
	}

	// Validate maxResults range
	if *maxResults < 1 || *maxResults > 2000 {
		fmt.Fprintf(os.Stderr, "Error: maxResults must be between 1 and 2000\n")
		os.Exit(1)
	}

	// Create processor configuration
	config := paperprocessor.Config{
		APIEndpoint:  *apiEndpoint,
		APIKey:       *apiKey,
		Model:        *model,
		RequestDelay: 2 * time.Second,
	}

	// Fetch papers using command line args
	papers, err := arxiva.FetchPapers(*startDate, *endDate, *query, *maxResults)
	if err != nil {
		log.Fatalf("Failed to fetch papers: %v", err)
	}

	// Save papers to JSON file using the same naming convention
	if err := arxiva.SaveToFile(papers, *startDate, *endDate, *query); err != nil {
		log.Fatalf("Failed to save papers: %v", err)
	}

	// Wait briefly for file system to sync and verify file exists
	time.Sleep(100 * time.Millisecond)
	if _, err := os.Stat(baseFilename + ".json"); os.IsNotExist(err) {
		log.Fatalf("Failed to find saved papers file: %s", baseFilename+".json")
	}

	// Process the saved file using the base filename
	if err := paperprocessor.ProcessFile(
		baseFilename+".json",
		*jsonOutput,
		*criteriaFile,
		config,
	); err != nil {
		log.Fatalf("Processing failed: %v", err)
	}

	// Format the processed results to markdown
	if err := paperformatter.FormatPapers(*jsonOutput, *mdOutput); err != nil {
		log.Fatalf("Formatting failed: %v", err)
	}

	log.Printf("Successfully processed papers. Results written to %s and formatted to %s", *jsonOutput, *mdOutput)
}