2025-01-26 20:15:57 +00:00
package main
import (
2025-01-29 15:09:35 +00:00
"encoding/json"
2025-01-26 20:15:57 +00:00
"flag"
"fmt"
2025-01-29 15:09:35 +00:00
"io"
2025-01-26 20:15:57 +00:00
"log"
"os"
"strings"
"time"
"gitea.r8z.us/stwhite/arxiva"
"gitea.r8z.us/stwhite/paperformatter"
"gitea.r8z.us/stwhite/paperprocessor"
)
2025-01-29 15:09:35 +00:00
// Paper represents the expected structure of papers in the input JSON file
type Paper struct {
Title string ` json:"title" `
Abstract string ` json:"abstract" `
ArxivID string ` json:"arxiv_id" `
}
// validateInputFile checks if the input file exists and has valid JSON structure
func validateInputFile ( path string ) ( [ ] Paper , error ) {
file , err := os . Open ( path )
if err != nil {
return nil , fmt . Errorf ( "failed to open input file: %v" , err )
}
defer file . Close ( )
content , err := io . ReadAll ( file )
if err != nil {
return nil , fmt . Errorf ( "failed to read input file: %v" , err )
}
var papers [ ] Paper
if err := json . Unmarshal ( content , & papers ) ; err != nil {
return nil , fmt . Errorf ( "invalid JSON format: %v" , err )
}
// Validate required fields
for i , paper := range papers {
if paper . Title == "" {
return nil , fmt . Errorf ( "paper at index %d missing title" , i )
}
if paper . Abstract == "" {
return nil , fmt . Errorf ( "paper at index %d missing abstract" , i )
}
if paper . ArxivID == "" {
return nil , fmt . Errorf ( "paper at index %d missing arxiv_id" , i )
}
}
return papers , nil
}
2025-01-26 20:15:57 +00:00
func main ( ) {
// Set custom usage message
flag . Usage = func ( ) {
fmt . Fprintf ( os . Stderr , "Usage: %s [options]\n\n" , os . Args [ 0 ] )
fmt . Fprintf ( os . Stderr , "Description:\n" )
2025-01-29 15:09:35 +00:00
fmt . Fprintf ( os . Stderr , " Fetches papers from arXiv (or uses input file), processes them using an LLM, and generates both JSON and Markdown outputs.\n\n" )
2025-01-26 20:15:57 +00:00
fmt . Fprintf ( os . Stderr , "Pipeline:\n" )
2025-01-29 15:09:35 +00:00
fmt . Fprintf ( os . Stderr , " 1. Either:\n" )
fmt . Fprintf ( os . Stderr , " a) Fetches papers from arXiv based on date range and query, or\n" )
fmt . Fprintf ( os . Stderr , " b) Uses papers from provided input file\n" )
fmt . Fprintf ( os . Stderr , " 2. Processes papers using specified LLM model\n" )
fmt . Fprintf ( os . Stderr , " 3. Formats results to both JSON and Markdown\n\n" )
2025-01-26 20:15:57 +00:00
fmt . Fprintf ( os . Stderr , "Required flags:\n" )
2025-01-29 15:09:35 +00:00
fmt . Fprintf ( os . Stderr , " -api-key : API key for LLM service\n\n" )
fmt . Fprintf ( os . Stderr , "Required for arXiv fetching (if not using -input):\n" )
2025-01-26 20:15:57 +00:00
fmt . Fprintf ( os . Stderr , " -start : Start date (YYYYMMDD)\n" )
fmt . Fprintf ( os . Stderr , " -end : End date (YYYYMMDD)\n" )
2025-01-29 15:09:35 +00:00
fmt . Fprintf ( os . Stderr , " -query : Search query\n\n" )
2025-01-26 20:15:57 +00:00
fmt . Fprintf ( os . Stderr , "Options:\n" )
flag . PrintDefaults ( )
fmt . Fprintf ( os . Stderr , "\nExamples:\n" )
2025-01-29 15:09:35 +00:00
fmt . Fprintf ( os . Stderr , " Using arXiv:\n" )
2025-01-26 20:15:57 +00:00
fmt . Fprintf ( os . Stderr , " %s -start 20240101 -end 20240131 -query \"machine learning\" -api-key \"your-key\"\n\n" , os . Args [ 0 ] )
2025-01-29 15:09:35 +00:00
fmt . Fprintf ( os . Stderr , " Using input file:\n" )
fmt . Fprintf ( os . Stderr , " %s -input papers.json -api-key \"your-key\"\n\n" , os . Args [ 0 ] )
fmt . Fprintf ( os . Stderr , " With custom options:\n" )
fmt . Fprintf ( os . Stderr , " %s -input papers.json -api-key \"your-key\" -model \"gpt-4\" -json-output \"results.json\" -md-output \"summary.md\"\n" , os . Args [ 0 ] )
2025-01-29 15:25:20 +00:00
fmt . Fprintf ( os . Stderr , " Search only:\n" )
2025-01-29 17:48:13 +00:00
fmt . Fprintf ( os . Stderr , " %s -search-only -start 20240101 -end 20240131 -query \"machine learning\" \n" , os . Args [ 0 ] )
fmt . Fprintf ( os . Stderr , " Server mode:\n" )
fmt . Fprintf ( os . Stderr , " %s -serve -port 8080\n\n" , os . Args [ 0 ] )
2025-01-26 20:15:57 +00:00
}
// Parse command line arguments
2025-01-29 17:48:13 +00:00
serve := flag . Bool ( "serve" , false , "Run in server mode" )
port := flag . String ( "port" , "8080" , "Port to run server on" )
2025-01-29 15:25:20 +00:00
searchOnly := flag . Bool ( "search-only" , false , "Only fetch papers from arXiv and save to JSON file (do not process)" )
2025-01-29 15:09:35 +00:00
inputFile := flag . String ( "input" , "" , "Input JSON file containing papers (optional)" )
startDate := flag . String ( "start" , "" , "Start date in YYYYMMDD format (required if not using -input)" )
endDate := flag . String ( "end" , "" , "End date in YYYYMMDD format (required if not using -input)" )
query := flag . String ( "query" , "" , "Search query (required if not using -input)" )
2025-01-26 20:15:57 +00:00
maxResults := flag . Int ( "maxResults" , 100 , "Maximum number of results (1-2000)" )
model := flag . String ( "model" , "phi-4" , "Model to use for processing" )
apiKey := flag . String ( "api-key" , "" , "API key for service authentication" )
apiEndpoint := flag . String ( "api-endpoint" , "http://localhost:1234/v1/chat/completions" , "API endpoint URL" )
criteriaFile := flag . String ( "criteria" , "criteria.md" , "Path to evaluation criteria markdown file" )
jsonOutput := flag . String ( "json-output" , "" , "JSON output file path (default: YYYYMMDD-YYYYMMDD-query.json)" )
mdOutput := flag . String ( "md-output" , "" , "Markdown output file path (default: YYYYMMDD-YYYYMMDD-query.md)" )
flag . Parse ( )
2025-01-29 15:09:35 +00:00
// Validate required flags and input
2025-01-29 17:48:13 +00:00
if * serve {
server := NewServer ( * port , * apiEndpoint )
if err := server . Run ( ) ; err != nil {
log . Fatal ( err )
}
return
}
2025-01-29 15:25:20 +00:00
if * searchOnly {
if * startDate == "" || * endDate == "" || * query == "" {
fmt . Fprintf ( os . Stderr , "Error: start date, end date, and query are required when using -search-only\n\n" )
flag . Usage ( )
os . Exit ( 1 )
}
// Validate date format
2025-01-29 17:48:13 +00:00
if ! IsValidDate ( * startDate ) || ! IsValidDate ( * endDate ) {
2025-01-29 15:25:20 +00:00
fmt . Fprintf ( os . Stderr , "Error: dates must be in YYYYMMDD format\n" )
os . Exit ( 1 )
}
// Validate maxResults range
if * maxResults < 1 || * maxResults > 2000 {
fmt . Fprintf ( os . Stderr , "Error: maxResults must be between 1 and 2000\n" )
os . Exit ( 1 )
}
// Fetch papers from arXiv
papers , err := arxiva . FetchPapers ( * startDate , * endDate , * query , * maxResults )
if err != nil {
log . Fatalf ( "Failed to fetch papers: %v" , err )
}
// Save papers to JSON file using the same naming convention
if err := arxiva . SaveToFile ( papers , * startDate , * endDate , * query ) ; err != nil {
log . Fatalf ( "Failed to save papers: %v" , err )
}
2025-01-29 17:48:13 +00:00
log . Printf ( "Successfully fetched and saved papers to %s-%s-%s.json" , * startDate , * endDate , SanitizeFilename ( * query ) )
2025-01-29 15:25:20 +00:00
os . Exit ( 0 )
2025-01-26 20:15:57 +00:00
}
2025-01-29 15:09:35 +00:00
var (
papers [ ] arxiva . Paper
err error
baseFilename string
)
if * inputFile != "" {
// Use input file
inputPapers , err := validateInputFile ( * inputFile )
if err != nil {
log . Fatalf ( "Invalid input file: %v" , err )
}
// Convert input papers to arxiva.Paper format
papers = make ( [ ] arxiva . Paper , len ( inputPapers ) )
for i , p := range inputPapers {
papers [ i ] = arxiva . Paper {
Title : p . Title ,
Abstract : p . Abstract ,
ArxivID : p . ArxivID ,
}
}
// Use input filename as base for outputs
baseFilename = * inputFile
if ext := ".json" ; strings . HasSuffix ( baseFilename , ext ) {
baseFilename = baseFilename [ : len ( baseFilename ) - len ( ext ) ]
}
} else {
// Validate arXiv fetching parameters
if * startDate == "" || * endDate == "" || * query == "" {
fmt . Fprintf ( os . Stderr , "Error: start date, end date, and query are required when not using -input\n\n" )
flag . Usage ( )
os . Exit ( 1 )
}
// Validate date format
2025-01-29 17:48:13 +00:00
if ! IsValidDate ( * startDate ) || ! IsValidDate ( * endDate ) {
2025-01-29 15:09:35 +00:00
fmt . Fprintf ( os . Stderr , "Error: dates must be in YYYYMMDD format\n" )
os . Exit ( 1 )
}
// Validate maxResults range
if * maxResults < 1 || * maxResults > 2000 {
fmt . Fprintf ( os . Stderr , "Error: maxResults must be between 1 and 2000\n" )
os . Exit ( 1 )
}
// Fetch papers from arXiv
papers , err = arxiva . FetchPapers ( * startDate , * endDate , * query , * maxResults )
if err != nil {
log . Fatalf ( "Failed to fetch papers: %v" , err )
}
// Save papers to JSON file using the same naming convention
if err := arxiva . SaveToFile ( papers , * startDate , * endDate , * query ) ; err != nil {
log . Fatalf ( "Failed to save papers: %v" , err )
}
2025-01-29 17:48:13 +00:00
baseFilename = fmt . Sprintf ( "%s-%s-%s" , * startDate , * endDate , SanitizeFilename ( * query ) )
2025-01-26 20:15:57 +00:00
}
// Create processor configuration
config := paperprocessor . Config {
APIEndpoint : * apiEndpoint ,
APIKey : * apiKey ,
Model : * model ,
RequestDelay : 2 * time . Second ,
}
2025-01-29 15:09:35 +00:00
// Get criteria filename without extension for output naming
criteriaBase := * criteriaFile
if ext := ".md" ; strings . HasSuffix ( criteriaBase , ext ) {
criteriaBase = criteriaBase [ : len ( criteriaBase ) - len ( ext ) ]
2025-01-26 20:15:57 +00:00
}
2025-01-29 15:09:35 +00:00
// Set default output filenames if not provided
if * jsonOutput == "" {
* jsonOutput = fmt . Sprintf ( "%s-%s.json" , baseFilename , criteriaBase )
2025-01-26 20:15:57 +00:00
}
2025-01-29 15:09:35 +00:00
if * mdOutput == "" {
* mdOutput = fmt . Sprintf ( "%s-%s.md" , baseFilename , criteriaBase )
2025-01-26 20:15:57 +00:00
}
2025-01-29 15:09:35 +00:00
// Process the papers
inputJson := baseFilename + ".json"
2025-01-26 20:15:57 +00:00
if err := paperprocessor . ProcessFile (
2025-01-29 15:09:35 +00:00
inputJson ,
2025-01-26 20:15:57 +00:00
* jsonOutput ,
* criteriaFile ,
config ,
) ; err != nil {
log . Fatalf ( "Processing failed: %v" , err )
}
// Format the processed results to markdown
if err := paperformatter . FormatPapers ( * jsonOutput , * mdOutput ) ; err != nil {
log . Fatalf ( "Formatting failed: %v" , err )
}
log . Printf ( "Successfully processed papers. Results written to %s and formatted to %s" , * jsonOutput , * mdOutput )
}