Compare commits

2 Commits

2 changed files with 262 additions and 52 deletions

README.md (new file)

@@ -0,0 +1,95 @@
# Papers
A Go CLI tool for fetching, processing, and analyzing academic papers from arXiv using LLM-based evaluation.
## Features
- Fetch papers from the arXiv API based on a date range and search query
- Process papers using configurable LLM models (default: phi-4)
- Generate both JSON and Markdown outputs
- Customizable evaluation criteria
- Rate-limited API requests (2-second delay between requests; see the sketch below)
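The rate limiting is a simple fixed delay between successive LLM requests, configured through the processor's `RequestDelay` setting (2 seconds by default). A minimal sketch of the idea; the loop and paper values here are placeholders, not the tool's actual processing code:
```go
package main

import (
	"fmt"
	"time"
)

func main() {
	papers := []string{"paper-1", "paper-2", "paper-3"} // placeholder items

	for i, paper := range papers {
		fmt.Printf("processing %s\n", paper)
		// The LLM request for this paper would happen here.

		if i < len(papers)-1 {
			time.Sleep(2 * time.Second) // fixed 2-second delay between requests
		}
	}
}
```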
## Installation
```bash
go install gitea.r8z.us/stwhite/papers@latest
```
## Usage
Basic usage:
```bash
papers -start 20240101 -end 20240131 -query "machine learning" -api-key "your-key"
```
With custom model and output paths:
```bash
papers -start 20240101 -end 20240131 -query "machine learning" -api-key "your-key" \
-model "gpt-4" -json-output "results.json" -md-output "summary.md"
```
Fetch papers without processing:
```bash
papers -search-only -start 20240101 -end 20240131 -query "machine learning"
```
Use an input file (the expected format is sketched below):
```bash
papers -input papers.json -api-key "your-key"
```
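The `-input` file is expected to be a JSON array of papers with `title`, `abstract`, and `arxiv_id` fields, matching the validation in `papers.go`. A minimal sketch that writes such a file (the paper values are placeholders):
```go
package main

import (
	"encoding/json"
	"log"
	"os"
)

// Paper mirrors the fields papers.go validates in the input file.
type Paper struct {
	Title    string `json:"title"`
	Abstract string `json:"abstract"`
	ArxivID  string `json:"arxiv_id"`
}

func main() {
	papers := []Paper{
		{Title: "Example Paper", Abstract: "Example abstract.", ArxivID: "2401.00001"},
	}

	data, err := json.MarshalIndent(papers, "", "  ")
	if err != nil {
		log.Fatal(err)
	}
	if err := os.WriteFile("papers.json", data, 0o644); err != nil {
		log.Fatal(err)
	}
}
```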
### Required Flags
- `-api-key`: API key for the LLM service (not needed with `-search-only`)
- `-start`: Start date (YYYYMMDD format; required unless `-input` is used)
- `-end`: End date (YYYYMMDD format; required unless `-input` is used)
- `-query`: Search query (required unless `-input` is used)
### Optional Flags
- `-search-only`: Fetch papers from arXiv and save them to a JSON file without processing
- `-input`: Input JSON file containing papers to process instead of fetching from arXiv
- `-maxResults`: Maximum number of results to fetch (1-2000, default: 100)
- `-model`: LLM model to use for processing (default: "phi-4")
- `-api-endpoint`: API endpoint URL (default: "http://localhost:1234/v1/chat/completions")
- `-criteria`: Path to evaluation criteria markdown file (default: "criteria.md")
- `-json-output`: Custom JSON output file path (default combines the base name with the criteria file name, e.g. `YYYYMMDD-YYYYMMDD-query-criteria.json`; see the sketch after this list)
- `-md-output`: Custom Markdown output file path (default: `YYYYMMDD-YYYYMMDD-query-criteria.md`)
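When `-json-output` and `-md-output` are omitted, the defaults combine the base name (date range and query, or the input file name without `.json`) with the criteria file name without `.md`. A minimal sketch mirroring that naming logic; the values passed in `main` are examples only:
```go
package main

import (
	"fmt"
	"strings"
)

// defaultOutputs mirrors how papers.go builds default output paths from the
// base filename and the criteria file name.
func defaultOutputs(baseFilename, criteriaFile string) (jsonOut, mdOut string) {
	criteriaBase := strings.TrimSuffix(criteriaFile, ".md")
	return fmt.Sprintf("%s-%s.json", baseFilename, criteriaBase),
		fmt.Sprintf("%s-%s.md", baseFilename, criteriaBase)
}

func main() {
	jsonOut, mdOut := defaultOutputs("20240101-20240131-quantum", "criteria.md")
	fmt.Println(jsonOut) // 20240101-20240131-quantum-criteria.json
	fmt.Println(mdOut)   // 20240101-20240131-quantum-criteria.md
}
```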
## Pipeline
1. **Fetch**: Retrieves papers from arXiv based on the specified date range and query (see the sketch after this list)
2. **Save**: Stores raw paper data in JSON format
3. **Process**: Evaluates papers using the specified LLM model according to criteria
4. **Format**: Generates both JSON and Markdown outputs of the processed results
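A minimal sketch of the fetch-and-save steps, using the same `arxiva` calls that `papers.go` makes; the dates and query are example values, and the processing and formatting steps then run over the saved JSON file:
```go
package main

import (
	"log"

	"gitea.r8z.us/stwhite/arxiva"
)

func main() {
	// 1. Fetch up to 100 papers for the date range and query.
	papers, err := arxiva.FetchPapers("20240101", "20240131", "machine learning", 100)
	if err != nil {
		log.Fatalf("Failed to fetch papers: %v", err)
	}

	// 2. Save the raw papers using the YYYYMMDD-YYYYMMDD-query.json convention.
	if err := arxiva.SaveToFile(papers, "20240101", "20240131", "machine learning"); err != nil {
		log.Fatalf("Failed to save papers: %v", err)
	}

	log.Printf("Fetched and saved %d papers", len(papers))
}
```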
## Output Files
The tool generates two types of output files:
1. **JSON Output**: Contains the raw processing results
   - Default name format: `YYYYMMDD-YYYYMMDD-query-criteria.json` (the input file's base name replaces the date range and query when `-input` is used)
   - Can be customized with the `-json-output` flag
2. **Markdown Output**: Human-readable formatted results
   - Default name format: `YYYYMMDD-YYYYMMDD-query-criteria.md`
   - Can be customized with the `-md-output` flag
## Dependencies
- [arxiva](https://gitea.r8z.us/stwhite/arxiva): Paper fetching from arXiv
- [paperprocessor](https://gitea.r8z.us/stwhite/paperprocessor): LLM-based paper processing
- [paperformatter](https://gitea.r8z.us/stwhite/paperformatter): Output formatting
## Error Handling
The tool includes various error checks:
- Date format validation (YYYYMMDD; see the sketch below)
- Required flag validation
- Maximum results range validation (1-2000)
- File system operations verification
- API request error handling
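For example, date validation accepts only eight-digit `YYYYMMDD` strings. A minimal sketch of that kind of check; the actual `isValidDate` implementation in `papers.go` may differ:
```go
package main

import (
	"fmt"
	"regexp"
	"time"
)

var yyyymmdd = regexp.MustCompile(`^\d{8}$`)

// isValidDate reports whether s is an eight-digit string that parses
// as a real calendar date.
func isValidDate(s string) bool {
	if !yyyymmdd.MatchString(s) {
		return false
	}
	_, err := time.Parse("20060102", s) // rejects impossible dates like 20240230
	return err == nil
}

func main() {
	fmt.Println(isValidDate("20240131"))   // true
	fmt.Println(isValidDate("2024-01-31")) // false
}
```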
## License
[License information not provided in source]

papers.go

@@ -1,8 +1,10 @@
package main
import (
"encoding/json"
"flag"
"fmt"
"io"
"log"
"os"
"regexp"
@@ -14,6 +16,47 @@ import (
"gitea.r8z.us/stwhite/paperprocessor"
)
// Paper represents the expected structure of papers in the input JSON file
type Paper struct {
	Title    string `json:"title"`
	Abstract string `json:"abstract"`
	ArxivID  string `json:"arxiv_id"`
}

// validateInputFile checks if the input file exists and has valid JSON structure
func validateInputFile(path string) ([]Paper, error) {
	file, err := os.Open(path)
	if err != nil {
		return nil, fmt.Errorf("failed to open input file: %v", err)
	}
	defer file.Close()

	content, err := io.ReadAll(file)
	if err != nil {
		return nil, fmt.Errorf("failed to read input file: %v", err)
	}

	var papers []Paper
	if err := json.Unmarshal(content, &papers); err != nil {
		return nil, fmt.Errorf("invalid JSON format: %v", err)
	}

	// Validate required fields
	for i, paper := range papers {
		if paper.Title == "" {
			return nil, fmt.Errorf("paper at index %d missing title", i)
		}
		if paper.Abstract == "" {
			return nil, fmt.Errorf("paper at index %d missing abstract", i)
		}
		if paper.ArxivID == "" {
			return nil, fmt.Errorf("paper at index %d missing arxiv_id", i)
		}
	}

	return papers, nil
}
// sanitizeFilename replaces invalid filename characters to match arxiva's sanitization
func sanitizeFilename(s string) string {
s = strings.ReplaceAll(s, ":", "_")
@@ -39,31 +82,38 @@ func main() {
flag.Usage = func() {
fmt.Fprintf(os.Stderr, "Usage: %s [options]\n\n", os.Args[0])
fmt.Fprintf(os.Stderr, "Description:\n")
fmt.Fprintf(os.Stderr, " Fetches papers from arXiv, processes them using an LLM, and generates both JSON and Markdown outputs.\n\n")
fmt.Fprintf(os.Stderr, " Fetches papers from arXiv (or uses input file), processes them using an LLM, and generates both JSON and Markdown outputs.\n\n")
fmt.Fprintf(os.Stderr, "Pipeline:\n")
fmt.Fprintf(os.Stderr, " 1. Fetches papers from arXiv based on date range and query\n")
fmt.Fprintf(os.Stderr, " 2. Saves raw papers to JSON (format: YYYYMMDD-YYYYMMDD-query.json)\n")
fmt.Fprintf(os.Stderr, " 3. Processes papers using specified LLM model\n")
fmt.Fprintf(os.Stderr, " 4. Formats results to both JSON and Markdown\n\n")
fmt.Fprintf(os.Stderr, " 1. Either:\n")
fmt.Fprintf(os.Stderr, " a) Fetches papers from arXiv based on date range and query, or\n")
fmt.Fprintf(os.Stderr, " b) Uses papers from provided input file\n")
fmt.Fprintf(os.Stderr, " 2. Processes papers using specified LLM model\n")
fmt.Fprintf(os.Stderr, " 3. Formats results to both JSON and Markdown\n\n")
fmt.Fprintf(os.Stderr, "Required flags:\n")
fmt.Fprintf(os.Stderr, " -api-key : API key for LLM service\n\n")
fmt.Fprintf(os.Stderr, "Required for arXiv fetching (if not using -input):\n")
fmt.Fprintf(os.Stderr, " -start : Start date (YYYYMMDD)\n")
fmt.Fprintf(os.Stderr, " -end : End date (YYYYMMDD)\n")
fmt.Fprintf(os.Stderr, " -query : Search query\n")
fmt.Fprintf(os.Stderr, " -api-key : API key for LLM service\n\n")
fmt.Fprintf(os.Stderr, " -query : Search query\n\n")
fmt.Fprintf(os.Stderr, "Options:\n")
flag.PrintDefaults()
fmt.Fprintf(os.Stderr, "\nExamples:\n")
fmt.Fprintf(os.Stderr, " Basic usage:\n")
fmt.Fprintf(os.Stderr, " Using arXiv:\n")
fmt.Fprintf(os.Stderr, " %s -start 20240101 -end 20240131 -query \"machine learning\" -api-key \"your-key\"\n\n", os.Args[0])
fmt.Fprintf(os.Stderr, " With custom model and outputs:\n")
fmt.Fprintf(os.Stderr, " %s -start 20240101 -end 20240131 -query \"machine learning\" -api-key \"your-key\" \\\n", os.Args[0])
fmt.Fprintf(os.Stderr, " -model \"gpt-4\" -json-output \"results.json\" -md-output \"summary.md\"\n")
fmt.Fprintf(os.Stderr, " Using input file:\n")
fmt.Fprintf(os.Stderr, " %s -input papers.json -api-key \"your-key\"\n\n", os.Args[0])
fmt.Fprintf(os.Stderr, " With custom options:\n")
fmt.Fprintf(os.Stderr, " %s -input papers.json -api-key \"your-key\" -model \"gpt-4\" -json-output \"results.json\" -md-output \"summary.md\"\n", os.Args[0])
fmt.Fprintf(os.Stderr, " Search only:\n")
fmt.Fprintf(os.Stderr, " %s -search-only -start 20240101 -end 20240131 -query \"machine learning\" \n\n", os.Args[0])
}
// Parse command line arguments
startDate := flag.String("start", "", "Start date in YYYYMMDD format")
endDate := flag.String("end", "", "End date in YYYYMMDD format")
query := flag.String("query", "", "Search query")
searchOnly := flag.Bool("search-only", false, "Only fetch papers from arXiv and save to JSON file (do not process)")
inputFile := flag.String("input", "", "Input JSON file containing papers (optional)")
startDate := flag.String("start", "", "Start date in YYYYMMDD format (required if not using -input)")
endDate := flag.String("end", "", "End date in YYYYMMDD format (required if not using -input)")
query := flag.String("query", "", "Search query (required if not using -input)")
maxResults := flag.Int("maxResults", 100, "Maximum number of results (1-2000)")
model := flag.String("model", "phi-4", "Model to use for processing")
apiKey := flag.String("api-key", "", "API key for service authentication")
@@ -73,34 +123,101 @@ func main() {
mdOutput := flag.String("md-output", "", "Markdown output file path (default: YYYYMMDD-YYYYMMDD-query.md)")
flag.Parse()
// Generate base filename from parameters with sanitization
baseFilename := fmt.Sprintf("%s-%s-%s", *startDate, *endDate, sanitizeFilename(*query))
// Validate required flags and input
if *searchOnly {
if *startDate == "" || *endDate == "" || *query == "" {
fmt.Fprintf(os.Stderr, "Error: start date, end date, and query are required when using -search-only\n\n")
flag.Usage()
os.Exit(1)
}
// Set default output filenames if not provided
if *jsonOutput == "" {
*jsonOutput = baseFilename + ".json"
}
if *mdOutput == "" {
*mdOutput = baseFilename + ".md"
// Validate date format
if !isValidDate(*startDate) || !isValidDate(*endDate) {
fmt.Fprintf(os.Stderr, "Error: dates must be in YYYYMMDD format\n")
os.Exit(1)
}
// Validate maxResults range
if *maxResults < 1 || *maxResults > 2000 {
fmt.Fprintf(os.Stderr, "Error: maxResults must be between 1 and 2000\n")
os.Exit(1)
}
// Fetch papers from arXiv
papers, err := arxiva.FetchPapers(*startDate, *endDate, *query, *maxResults)
if err != nil {
log.Fatalf("Failed to fetch papers: %v", err)
}
// Save papers to JSON file using the same naming convention
if err := arxiva.SaveToFile(papers, *startDate, *endDate, *query); err != nil {
log.Fatalf("Failed to save papers: %v", err)
}
log.Printf("Successfully fetched and saved papers to %s-%s-%s.json", *startDate, *endDate, sanitizeFilename(*query))
os.Exit(0)
}
// Validate required flags
if *startDate == "" || *endDate == "" || *query == "" || *apiKey == "" {
fmt.Fprintf(os.Stderr, "Error: start date, end date, query, and api-key are required\n\n")
flag.Usage()
os.Exit(1)
}
var (
papers []arxiva.Paper
err error
baseFilename string
)
// Validate date format
if !isValidDate(*startDate) || !isValidDate(*endDate) {
fmt.Fprintf(os.Stderr, "Error: dates must be in YYYYMMDD format\n")
os.Exit(1)
}
if *inputFile != "" {
// Use input file
inputPapers, err := validateInputFile(*inputFile)
if err != nil {
log.Fatalf("Invalid input file: %v", err)
}
// Validate maxResults range
if *maxResults < 1 || *maxResults > 2000 {
fmt.Fprintf(os.Stderr, "Error: maxResults must be between 1 and 2000\n")
os.Exit(1)
// Convert input papers to arxiva.Paper format
papers = make([]arxiva.Paper, len(inputPapers))
for i, p := range inputPapers {
papers[i] = arxiva.Paper{
Title: p.Title,
Abstract: p.Abstract,
ArxivID: p.ArxivID,
}
}
// Use input filename as base for outputs
baseFilename = *inputFile
if ext := ".json"; strings.HasSuffix(baseFilename, ext) {
baseFilename = baseFilename[:len(baseFilename)-len(ext)]
}
} else {
// Validate arXiv fetching parameters
if *startDate == "" || *endDate == "" || *query == "" {
fmt.Fprintf(os.Stderr, "Error: start date, end date, and query are required when not using -input\n\n")
flag.Usage()
os.Exit(1)
}
// Validate date format
if !isValidDate(*startDate) || !isValidDate(*endDate) {
fmt.Fprintf(os.Stderr, "Error: dates must be in YYYYMMDD format\n")
os.Exit(1)
}
// Validate maxResults range
if *maxResults < 1 || *maxResults > 2000 {
fmt.Fprintf(os.Stderr, "Error: maxResults must be between 1 and 2000\n")
os.Exit(1)
}
// Fetch papers from arXiv
papers, err = arxiva.FetchPapers(*startDate, *endDate, *query, *maxResults)
if err != nil {
log.Fatalf("Failed to fetch papers: %v", err)
}
// Save papers to JSON file using the same naming convention
if err := arxiva.SaveToFile(papers, *startDate, *endDate, *query); err != nil {
log.Fatalf("Failed to save papers: %v", err)
}
baseFilename = fmt.Sprintf("%s-%s-%s", *startDate, *endDate, sanitizeFilename(*query))
}
// Create processor configuration
@@ -111,26 +228,24 @@ func main() {
RequestDelay: 2 * time.Second,
}
// Fetch papers using command line args
papers, err := arxiva.FetchPapers(*startDate, *endDate, *query, *maxResults)
if err != nil {
log.Fatalf("Failed to fetch papers: %v", err)
// Get criteria filename without extension for output naming
criteriaBase := *criteriaFile
if ext := ".md"; strings.HasSuffix(criteriaBase, ext) {
criteriaBase = criteriaBase[:len(criteriaBase)-len(ext)]
}
// Save papers to JSON file using the same naming convention
if err := arxiva.SaveToFile(papers, *startDate, *endDate, *query); err != nil {
log.Fatalf("Failed to save papers: %v", err)
// Set default output filenames if not provided
if *jsonOutput == "" {
*jsonOutput = fmt.Sprintf("%s-%s.json", baseFilename, criteriaBase)
}
if *mdOutput == "" {
*mdOutput = fmt.Sprintf("%s-%s.md", baseFilename, criteriaBase)
}
// Wait briefly for file system to sync and verify file exists
time.Sleep(100 * time.Millisecond)
if _, err := os.Stat(baseFilename + ".json"); os.IsNotExist(err) {
log.Fatalf("Failed to find saved papers file: %s", baseFilename+".json")
}
// Process the saved file using the base filename
// Process the papers
inputJson := baseFilename + ".json"
if err := paperprocessor.ProcessFile(
baseFilename+".json",
inputJson,
*jsonOutput,
*criteriaFile,
config,