From 5983a9b8ac432cafdd2dd11e6b7f6291d589598e Mon Sep 17 00:00:00 2001 From: Steve White Date: Wed, 29 Jan 2025 09:09:35 -0600 Subject: [PATCH] Updated file handling --- papers.go | 188 +++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 136 insertions(+), 52 deletions(-) diff --git a/papers.go b/papers.go index ca4cf7a..754f3a5 100644 --- a/papers.go +++ b/papers.go @@ -1,8 +1,10 @@ package main import ( + "encoding/json" "flag" "fmt" + "io" "log" "os" "regexp" @@ -14,6 +16,47 @@ import ( "gitea.r8z.us/stwhite/paperprocessor" ) +// Paper represents the expected structure of papers in the input JSON file +type Paper struct { + Title string `json:"title"` + Abstract string `json:"abstract"` + ArxivID string `json:"arxiv_id"` +} + +// validateInputFile checks if the input file exists and has valid JSON structure +func validateInputFile(path string) ([]Paper, error) { + file, err := os.Open(path) + if err != nil { + return nil, fmt.Errorf("failed to open input file: %v", err) + } + defer file.Close() + + content, err := io.ReadAll(file) + if err != nil { + return nil, fmt.Errorf("failed to read input file: %v", err) + } + + var papers []Paper + if err := json.Unmarshal(content, &papers); err != nil { + return nil, fmt.Errorf("invalid JSON format: %v", err) + } + + // Validate required fields + for i, paper := range papers { + if paper.Title == "" { + return nil, fmt.Errorf("paper at index %d missing title", i) + } + if paper.Abstract == "" { + return nil, fmt.Errorf("paper at index %d missing abstract", i) + } + if paper.ArxivID == "" { + return nil, fmt.Errorf("paper at index %d missing arxiv_id", i) + } + } + + return papers, nil +} + // sanitizeFilename replaces invalid filename characters to match arxiva's sanitization func sanitizeFilename(s string) string { s = strings.ReplaceAll(s, ":", "_") @@ -39,31 +82,35 @@ func main() { flag.Usage = func() { fmt.Fprintf(os.Stderr, "Usage: %s [options]\n\n", os.Args[0]) fmt.Fprintf(os.Stderr, 
"Description:\n") - fmt.Fprintf(os.Stderr, " Fetches papers from arXiv, processes them using an LLM, and generates both JSON and Markdown outputs.\n\n") + fmt.Fprintf(os.Stderr, " Fetches papers from arXiv (or uses input file), processes them using an LLM, and generates both JSON and Markdown outputs.\n\n") fmt.Fprintf(os.Stderr, "Pipeline:\n") - fmt.Fprintf(os.Stderr, " 1. Fetches papers from arXiv based on date range and query\n") - fmt.Fprintf(os.Stderr, " 2. Saves raw papers to JSON (format: YYYYMMDD-YYYYMMDD-query.json)\n") - fmt.Fprintf(os.Stderr, " 3. Processes papers using specified LLM model\n") - fmt.Fprintf(os.Stderr, " 4. Formats results to both JSON and Markdown\n\n") + fmt.Fprintf(os.Stderr, " 1. Either:\n") + fmt.Fprintf(os.Stderr, " a) Fetches papers from arXiv based on date range and query, or\n") + fmt.Fprintf(os.Stderr, " b) Uses papers from provided input file\n") + fmt.Fprintf(os.Stderr, " 2. Processes papers using specified LLM model\n") + fmt.Fprintf(os.Stderr, " 3. 
Formats results to both JSON and Markdown\n\n") fmt.Fprintf(os.Stderr, "Required flags:\n") + fmt.Fprintf(os.Stderr, " -api-key : API key for LLM service\n\n") + fmt.Fprintf(os.Stderr, "Required for arXiv fetching (if not using -input):\n") fmt.Fprintf(os.Stderr, " -start : Start date (YYYYMMDD)\n") fmt.Fprintf(os.Stderr, " -end : End date (YYYYMMDD)\n") - fmt.Fprintf(os.Stderr, " -query : Search query\n") - fmt.Fprintf(os.Stderr, " -api-key : API key for LLM service\n\n") + fmt.Fprintf(os.Stderr, " -query : Search query\n\n") fmt.Fprintf(os.Stderr, "Options:\n") flag.PrintDefaults() fmt.Fprintf(os.Stderr, "\nExamples:\n") - fmt.Fprintf(os.Stderr, " Basic usage:\n") + fmt.Fprintf(os.Stderr, " Using arXiv:\n") fmt.Fprintf(os.Stderr, " %s -start 20240101 -end 20240131 -query \"machine learning\" -api-key \"your-key\"\n\n", os.Args[0]) - fmt.Fprintf(os.Stderr, " With custom model and outputs:\n") - fmt.Fprintf(os.Stderr, " %s -start 20240101 -end 20240131 -query \"machine learning\" -api-key \"your-key\" \\\n", os.Args[0]) - fmt.Fprintf(os.Stderr, " -model \"gpt-4\" -json-output \"results.json\" -md-output \"summary.md\"\n") + fmt.Fprintf(os.Stderr, " Using input file:\n") + fmt.Fprintf(os.Stderr, " %s -input papers.json -api-key \"your-key\"\n\n", os.Args[0]) + fmt.Fprintf(os.Stderr, " With custom options:\n") + fmt.Fprintf(os.Stderr, " %s -input papers.json -api-key \"your-key\" -model \"gpt-4\" -json-output \"results.json\" -md-output \"summary.md\"\n", os.Args[0]) } // Parse command line arguments - startDate := flag.String("start", "", "Start date in YYYYMMDD format") - endDate := flag.String("end", "", "End date in YYYYMMDD format") - query := flag.String("query", "", "Search query") + inputFile := flag.String("input", "", "Input JSON file containing papers (optional)") + startDate := flag.String("start", "", "Start date in YYYYMMDD format (required if not using -input)") + endDate := flag.String("end", "", "End date in YYYYMMDD format (required if not using 
-input)") + query := flag.String("query", "", "Search query (required if not using -input)") maxResults := flag.Int("maxResults", 100, "Maximum number of results (1-2000)") model := flag.String("model", "phi-4", "Model to use for processing") apiKey := flag.String("api-key", "", "API key for service authentication") @@ -73,34 +120,73 @@ func main() { mdOutput := flag.String("md-output", "", "Markdown output file path (default: YYYYMMDD-YYYYMMDD-query.md)") flag.Parse() - // Generate base filename from parameters with sanitization - baseFilename := fmt.Sprintf("%s-%s-%s", *startDate, *endDate, sanitizeFilename(*query)) - - // Set default output filenames if not provided - if *jsonOutput == "" { - *jsonOutput = baseFilename + ".json" - } - if *mdOutput == "" { - *mdOutput = baseFilename + ".md" - } - - // Validate required flags - if *startDate == "" || *endDate == "" || *query == "" || *apiKey == "" { - fmt.Fprintf(os.Stderr, "Error: start date, end date, query, and api-key are required\n\n") + // Validate required flags and input + if *apiKey == "" { + fmt.Fprintf(os.Stderr, "Error: api-key is required\n\n") flag.Usage() os.Exit(1) } - // Validate date format - if !isValidDate(*startDate) || !isValidDate(*endDate) { - fmt.Fprintf(os.Stderr, "Error: dates must be in YYYYMMDD format\n") - os.Exit(1) - } + var ( + papers []arxiva.Paper + err error + baseFilename string + ) - // Validate maxResults range - if *maxResults < 1 || *maxResults > 2000 { - fmt.Fprintf(os.Stderr, "Error: maxResults must be between 1 and 2000\n") - os.Exit(1) + if *inputFile != "" { + // Use input file + inputPapers, err := validateInputFile(*inputFile) + if err != nil { + log.Fatalf("Invalid input file: %v", err) + } + + // Convert input papers to arxiva.Paper format + papers = make([]arxiva.Paper, len(inputPapers)) + for i, p := range inputPapers { + papers[i] = arxiva.Paper{ + Title: p.Title, + Abstract: p.Abstract, + ArxivID: p.ArxivID, + } + } + + // Use input filename as base for outputs 
+ baseFilename = *inputFile + if ext := ".json"; strings.HasSuffix(baseFilename, ext) { + baseFilename = baseFilename[:len(baseFilename)-len(ext)] + } + } else { + // Validate arXiv fetching parameters + if *startDate == "" || *endDate == "" || *query == "" { + fmt.Fprintf(os.Stderr, "Error: start date, end date, and query are required when not using -input\n\n") + flag.Usage() + os.Exit(1) + } + + // Validate date format + if !isValidDate(*startDate) || !isValidDate(*endDate) { + fmt.Fprintf(os.Stderr, "Error: dates must be in YYYYMMDD format\n") + os.Exit(1) + } + + // Validate maxResults range + if *maxResults < 1 || *maxResults > 2000 { + fmt.Fprintf(os.Stderr, "Error: maxResults must be between 1 and 2000\n") + os.Exit(1) + } + + // Fetch papers from arXiv + papers, err = arxiva.FetchPapers(*startDate, *endDate, *query, *maxResults) + if err != nil { + log.Fatalf("Failed to fetch papers: %v", err) + } + + // Save papers to JSON file using the same naming convention + if err := arxiva.SaveToFile(papers, *startDate, *endDate, *query); err != nil { + log.Fatalf("Failed to save papers: %v", err) + } + + baseFilename = fmt.Sprintf("%s-%s-%s", *startDate, *endDate, sanitizeFilename(*query)) } // Create processor configuration @@ -111,26 +197,24 @@ func main() { RequestDelay: 2 * time.Second, } - // Fetch papers using command line args - papers, err := arxiva.FetchPapers(*startDate, *endDate, *query, *maxResults) - if err != nil { - log.Fatalf("Failed to fetch papers: %v", err) + // Get criteria filename without extension for output naming + criteriaBase := *criteriaFile + if ext := ".md"; strings.HasSuffix(criteriaBase, ext) { + criteriaBase = criteriaBase[:len(criteriaBase)-len(ext)] } - // Save papers to JSON file using the same naming convention - if err := arxiva.SaveToFile(papers, *startDate, *endDate, *query); err != nil { - log.Fatalf("Failed to save papers: %v", err) + // Set default output filenames if not provided + if *jsonOutput == "" { + *jsonOutput = 
fmt.Sprintf("%s-%s.json", baseFilename, criteriaBase) + } + if *mdOutput == "" { + *mdOutput = fmt.Sprintf("%s-%s.md", baseFilename, criteriaBase) } - // Wait briefly for file system to sync and verify file exists - time.Sleep(100 * time.Millisecond) - if _, err := os.Stat(baseFilename + ".json"); os.IsNotExist(err) { - log.Fatalf("Failed to find saved papers file: %s", baseFilename+".json") - } - - // Process the saved file using the base filename + // Process the papers + inputJson := baseFilename + ".json" if err := paperprocessor.ProcessFile( - baseFilename+".json", + inputJson, *jsonOutput, *criteriaFile, config,