From 7fd890828ef03b27c81a3ddeb6348d763294b170 Mon Sep 17 00:00:00 2001
From: Steve White
Date: Sun, 26 Jan 2025 14:15:57 -0600
Subject: [PATCH] Initial Commit of papers system.

---
 .clinerules |  48 ++++++++++++++
 go.mod      |  11 ++++
 go.sum      |   6 ++
 papers.go   | 175 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 240 insertions(+)
 create mode 100644 .clinerules
 create mode 100644 go.mod
 create mode 100644 go.sum
 create mode 100644 papers.go

diff --git a/.clinerules b/.clinerules
new file mode 100644
index 0000000..8c96197
--- /dev/null
+++ b/.clinerules
@@ -0,0 +1,48 @@
+## Here are the api signatures for arxiva
+### FetchPapers(startDate, endDate, query string, maxResults int) ([]Paper, error)
+startDate: Start date in format "YYYYMMDD"
+endDate: End date in format "YYYYMMDD"
+query: Search query
+maxResults: Maximum number of results (1-2000)
+Fetches papers from arXiv API
+
+### SaveToFile(papers []Paper, startDate, endDate, query string) error
+papers: Array of Paper structs
+startDate: Start date in format "YYYYMMDD"
+endDate: End date in format "YYYYMMDD"
+query: Search query
+Saves papers to a JSON file
+
+JSON file is named "YYYYMMDD-YYYYMMDD-query.json" (where YYYYMMDD is start date and YYYYMMDD is end date and query is search query)
+
+## here is the API signature for paperprocessor:
+
+### ProcessFile
+`func ProcessFile(inputPath, outputPath, criteriaPath string, config Config, debug bool) error`
+
+Processes papers from input JSON file and writes results to output JSON file
+
+Parameters:
+- inputPath: Path to input JSON file containing papers array
+- outputPath: Path to write processing results JSON
+- criteriaPath: Path to text file with evaluation criteria
+- config: Configuration settings for API and processing
+- debug: Enable debug logging when true
+
+Returns:
+- error: Processing error or nil if successful
+
+You create config like this:
+    config := paperprocessor.Config{
+        APIEndpoint: "http://localhost:1234/v1/chat/completions",
+        APIKey: apiKey,
+        Model: "qwen2-7b-instruct",
+        RequestDelay: 2 * time.Second, // 2 second delay between requests
+    }
+
+
+## Here is the usage for paperformatter:
+err := paperformatter.FormatPapers("input.json", "output.md")
+if err != nil {
+    log.Fatal(err)
+}
diff --git a/go.mod b/go.mod
new file mode 100644
index 0000000..38be507
--- /dev/null
+++ b/go.mod
@@ -0,0 +1,11 @@
+module gitea.r8z.us/stwhite/papers
+
+go 1.23.4
+
+toolchain go1.23.5
+
+require (
+	gitea.r8z.us/stwhite/arxiva v0.1.0
+	gitea.r8z.us/stwhite/paperformatter v0.1.3
+	gitea.r8z.us/stwhite/paperprocessor v0.1.5
+)
diff --git a/go.sum b/go.sum
new file mode 100644
index 0000000..021fbe3
--- /dev/null
+++ b/go.sum
@@ -0,0 +1,6 @@
+gitea.r8z.us/stwhite/arxiva v0.1.0 h1:v4rRQazWDQN6A4jqUjvJoTuGVlxHH9ee1SU/vculBN4=
+gitea.r8z.us/stwhite/arxiva v0.1.0/go.mod h1:V+xRJF205br/E1NM15S0htyfPnG8FERaluMTs97DcGM=
+gitea.r8z.us/stwhite/paperformatter v0.1.3 h1:Z8yIdfCmQ+c5A5To+Y3XPHNOdK5B/q5VqBPZkmYqKPc=
+gitea.r8z.us/stwhite/paperformatter v0.1.3/go.mod h1:As2zIT0NSsMirYqdvIfIXXxIHOcdsHANhnh0VNcAluQ=
+gitea.r8z.us/stwhite/paperprocessor v0.1.5 h1:c9HYWblP0D7mz0/mfcg4j98j1cisrcUPuQFSWuGpsIQ=
+gitea.r8z.us/stwhite/paperprocessor v0.1.5/go.mod h1:0wHe7XjtQICFrPKbO53SVrUiVw9yi8GOGo9J7znpo+E=
diff --git a/papers.go b/papers.go
new file mode 100644
index 0000000..2589392
--- /dev/null
+++ b/papers.go
@@ -0,0 +1,175 @@
+package main
+
+import (
+	"flag"
+	"fmt"
+	"log"
+	"os"
+	"regexp"
+	"strings"
+	"time"
+
+	"gitea.r8z.us/stwhite/arxiva"
+	"gitea.r8z.us/stwhite/paperformatter"
+	"gitea.r8z.us/stwhite/paperprocessor"
+)
+
+// dateLayout is the time.Parse layout for the YYYYMMDD dates given on the command line.
+const dateLayout = "20060102"
+
+// datePattern matches exactly eight digits. It is compiled once at package
+// scope rather than on every isValidDate call.
+var datePattern = regexp.MustCompile(`^\d{8}$`)
+
+// sanitizeFilename returns s with every ":" replaced by "_" so that the query
+// can be embedded in an output file name.
+func sanitizeFilename(s string) string {
+	return strings.ReplaceAll(s, ":", "_")
+}
+
+// isValidDate reports whether date consists of exactly eight digits in
+// YYYYMMDD form and names a real calendar date.
+func isValidDate(date string) bool {
+	if !datePattern.MatchString(date) {
+		return false
+	}
+
+	// The pattern only checks the shape; parsing rejects impossible dates
+	// such as month 13 or day 32.
+	_, err := time.Parse(dateLayout, date)
+	return err == nil
+}
+
+// main runs the whole pipeline: it parses and validates the command line
+// flags, fetches papers from arXiv, saves them to JSON, processes them with
+// the configured LLM endpoint and formats the processed results as Markdown.
+// Invalid flags exit with status 1; pipeline failures end in log.Fatalf.
+func main() {
+	// Set custom usage message
+	flag.Usage = func() {
+		fmt.Fprintf(os.Stderr, "Usage: %s [options]\n\n", os.Args[0])
+		fmt.Fprintf(os.Stderr, "Description:\n")
+		fmt.Fprintf(os.Stderr, " Fetches papers from arXiv, processes them using an LLM, and generates both JSON and Markdown outputs.\n\n")
+		fmt.Fprintf(os.Stderr, "Pipeline:\n")
+		fmt.Fprintf(os.Stderr, " 1. Fetches papers from arXiv based on date range and query\n")
+		fmt.Fprintf(os.Stderr, " 2. Saves raw papers to JSON (format: YYYYMMDD-YYYYMMDD-query.json)\n")
+		fmt.Fprintf(os.Stderr, " 3. Processes papers using specified LLM model\n")
+		fmt.Fprintf(os.Stderr, " 4. Formats results to both JSON and Markdown\n\n")
+		fmt.Fprintf(os.Stderr, "Required flags:\n")
+		fmt.Fprintf(os.Stderr, " -start : Start date (YYYYMMDD)\n")
+		fmt.Fprintf(os.Stderr, " -end : End date (YYYYMMDD)\n")
+		fmt.Fprintf(os.Stderr, " -query : Search query\n")
+		fmt.Fprintf(os.Stderr, " -api-key : API key for LLM service\n\n")
+		fmt.Fprintf(os.Stderr, "Options:\n")
+		flag.PrintDefaults()
+		fmt.Fprintf(os.Stderr, "\nExamples:\n")
+		fmt.Fprintf(os.Stderr, " Basic usage:\n")
+		fmt.Fprintf(os.Stderr, " %s -start 20240101 -end 20240131 -query \"machine learning\" -api-key \"your-key\"\n\n", os.Args[0])
+		fmt.Fprintf(os.Stderr, " With custom model and outputs:\n")
+		fmt.Fprintf(os.Stderr, " %s -start 20240101 -end 20240131 -query \"machine learning\" -api-key \"your-key\" \\\n", os.Args[0])
+		fmt.Fprintf(os.Stderr, " -model \"gpt-4\" -json-output \"results.json\" -md-output \"summary.md\"\n")
+	}
+
+	// Parse command line arguments
+	startDate := flag.String("start", "", "Start date in YYYYMMDD format")
+	endDate := flag.String("end", "", "End date in YYYYMMDD format")
+	query := flag.String("query", "", "Search query")
+	maxResults := flag.Int("maxResults", 100, "Maximum number of results (1-2000)")
+	model := flag.String("model", "phi-4", "Model to use for processing")
+	apiKey := flag.String("api-key", "", "API key for service authentication")
+	apiEndpoint := flag.String("api-endpoint", "http://localhost:1234/v1/chat/completions", "API endpoint URL")
+	criteriaFile := flag.String("criteria", "criteria.md", "Path to evaluation criteria markdown file")
+	jsonOutput := flag.String("json-output", "", "JSON output file path (default: YYYYMMDD-YYYYMMDD-query.json)")
+	mdOutput := flag.String("md-output", "", "Markdown output file path (default: YYYYMMDD-YYYYMMDD-query.md)")
+	flag.Parse()
+
+	// Validate required flags before deriving any file names from them
+	if *startDate == "" || *endDate == "" || *query == "" || *apiKey == "" {
+		fmt.Fprintf(os.Stderr, "Error: start date, end date, query, and api-key are required\n\n")
+		flag.Usage()
+		os.Exit(1)
+	}
+
+	// Validate date format
+	if !isValidDate(*startDate) || !isValidDate(*endDate) {
+		fmt.Fprintf(os.Stderr, "Error: dates must be in YYYYMMDD format\n")
+		os.Exit(1)
+	}
+
+	// YYYYMMDD strings sort lexicographically in chronological order, so a
+	// plain string comparison is enough to reject an inverted range.
+	if *startDate > *endDate {
+		fmt.Fprintf(os.Stderr, "Error: start date must not be after end date\n")
+		os.Exit(1)
+	}
+
+	// Validate maxResults range
+	if *maxResults < 1 || *maxResults > 2000 {
+		fmt.Fprintf(os.Stderr, "Error: maxResults must be between 1 and 2000\n")
+		os.Exit(1)
+	}
+
+	// Generate base filename from parameters with sanitization
+	baseFilename := fmt.Sprintf("%s-%s-%s", *startDate, *endDate, sanitizeFilename(*query))
+
+	// rawFile is the path where the fetched papers are expected to be saved.
+	// NOTE(review): arxiva.SaveToFile receives the unsanitized query; this
+	// assumes it derives the same name as baseFilename - confirm against arxiva.
+	rawFile := baseFilename + ".json"
+
+	// Set default output filenames if not provided
+	// NOTE(review): the default JSON output path equals rawFile, so the
+	// processed results presumably overwrite the raw papers - confirm intent.
+	if *jsonOutput == "" {
+		*jsonOutput = rawFile
+	}
+	if *mdOutput == "" {
+		*mdOutput = baseFilename + ".md"
+	}
+
+	// Create processor configuration
+	config := paperprocessor.Config{
+		APIEndpoint:  *apiEndpoint,
+		APIKey:       *apiKey,
+		Model:        *model,
+		RequestDelay: 2 * time.Second,
+	}
+
+	// Fetch papers using command line args
+	papers, err := arxiva.FetchPapers(*startDate, *endDate, *query, *maxResults)
+	if err != nil {
+		log.Fatalf("Failed to fetch papers: %v", err)
+	}
+
+	// Save papers to JSON file using the same naming convention
+	if err := arxiva.SaveToFile(papers, *startDate, *endDate, *query); err != nil {
+		log.Fatalf("Failed to save papers: %v", err)
+	}
+
+	// Wait briefly for file system to sync and verify the file is accessible.
+	// Any stat error is fatal, not only "does not exist", so that e.g. a
+	// permission problem is reported here rather than later in processing.
+	time.Sleep(100 * time.Millisecond)
+	if _, err := os.Stat(rawFile); err != nil {
+		log.Fatalf("Failed to find saved papers file %s: %v", rawFile, err)
+	}
+
+	// Process the saved file using the base filename
+	// NOTE(review): the ProcessFile signature documented in .clinerules has a
+	// trailing "debug bool" parameter - confirm this call matches paperprocessor v0.1.5.
+	if err := paperprocessor.ProcessFile(
+		rawFile,
+		*jsonOutput,
+		*criteriaFile,
+		config,
+	); err != nil {
+		log.Fatalf("Processing failed: %v", err)
+	}
+
+	// Format the processed results to markdown
+	if err := paperformatter.FormatPapers(*jsonOutput, *mdOutput); err != nil {
+		log.Fatalf("Formatting failed: %v", err)
+	}
+
+	log.Printf("Successfully processed papers. Results written to %s and formatted to %s", *jsonOutput, *mdOutput)
+}