enabled server mode and updated README.md

This commit is contained in:
Steve White 2025-01-29 11:48:13 -06:00
parent ac208fabdd
commit e552411298
5 changed files with 516 additions and 197 deletions

View File

@ -27,6 +27,7 @@ This is hard to pull off with keyword searches. You might exclude every paper th
- Generate both JSON and Markdown outputs - Generate both JSON and Markdown outputs
- Customizable evaluation criteria - Customizable evaluation criteria
- Rate-limited API requests (2-second delay between requests) - Rate-limited API requests (2-second delay between requests)
- HTTP API server mode for integration with other services
## Installation ## Installation
@ -65,6 +66,8 @@ papers -input papers.json -api-key "your-key" -criteria criteria.md
### Optional Flags ### Optional Flags
- `-serve`: Run in server mode with HTTP API endpoints
- `-port`: Port to run server on (default: "8080")
- `-search-only`: Fetch papers from arXiv and save to JSON file without processing - `-search-only`: Fetch papers from arXiv and save to JSON file without processing
- `-input`: Input JSON file containing papers (optional) - `-input`: Input JSON file containing papers (optional)
- `-maxResults`: Maximum number of results to fetch (1-2000, default: 100) - `-maxResults`: Maximum number of results to fetch (1-2000, default: 100)
@ -76,6 +79,21 @@ papers -input papers.json -api-key "your-key" -criteria criteria.md
**NB: default API endpoint is LMStudio, and Phi-4 does a great job filtering papers** **NB: default API endpoint is LMStudio, and Phi-4 does a great job filtering papers**
## Server Mode
The tool can be run as an HTTP server providing API endpoints for paper search and processing:
```bash
papers -serve -port 8080
```
This starts a server with the following endpoints:
- `POST /api/papers/search` - Search for papers on arXiv
- `POST /api/papers/process` - Process papers using LLM
- `POST /api/papers/search-process` - Combined search and process
See [API.md](API.md) for detailed API documentation.
## Pipeline ## Pipeline
1. **Fetch**: Retrieves papers from arXiv based on specified date range and query 1. **Fetch**: Retrieves papers from arXiv based on specified date range and query

2
go.sum
View File

@ -4,3 +4,5 @@ gitea.r8z.us/stwhite/paperformatter v0.1.3 h1:Z8yIdfCmQ+c5A5To+Y3XPHNOdK5B/q5VqB
gitea.r8z.us/stwhite/paperformatter v0.1.3/go.mod h1:As2zIT0NSsMirYqdvIfIXXxIHOcdsHANhnh0VNcAluQ= gitea.r8z.us/stwhite/paperformatter v0.1.3/go.mod h1:As2zIT0NSsMirYqdvIfIXXxIHOcdsHANhnh0VNcAluQ=
gitea.r8z.us/stwhite/paperprocessor v0.1.8 h1:pV810JZQFhuKcle4ix7stUz12LZNIgFCVWxSC/RYWpE= gitea.r8z.us/stwhite/paperprocessor v0.1.8 h1:pV810JZQFhuKcle4ix7stUz12LZNIgFCVWxSC/RYWpE=
gitea.r8z.us/stwhite/paperprocessor v0.1.8/go.mod h1:0wHe7XjtQICFrPKbO53SVrUiVw9yi8GOGo9J7znpo+E= gitea.r8z.us/stwhite/paperprocessor v0.1.8/go.mod h1:0wHe7XjtQICFrPKbO53SVrUiVw9yi8GOGo9J7znpo+E=
github.com/go-chi/chi/v5 v5.0.11 h1:BnpYbFZ3T3S1WMpD79r7R5ThWX40TaFB7L31Y8xqSwA=
github.com/go-chi/chi/v5 v5.0.11/go.mod h1:DslCQbL2OYiznFReuXYUmQ2hGd1aDpCnlMNITLSKoi8=

View File

@ -7,7 +7,6 @@ import (
"io" "io"
"log" "log"
"os" "os"
"regexp"
"strings" "strings"
"time" "time"
@ -57,26 +56,6 @@ func validateInputFile(path string) ([]Paper, error) {
return papers, nil return papers, nil
} }
// sanitizeFilename replaces invalid filename characters to match arxiva's sanitization
func sanitizeFilename(s string) string {
s = strings.ReplaceAll(s, ":", "_")
s = strings.ReplaceAll(s, " ", "_")
return s
}
// isValidDate checks if the date string is in YYYYMMDD format
func isValidDate(date string) bool {
// Check basic format with regex
matched, err := regexp.MatchString(`^\d{8}$`, date)
if err != nil || !matched {
return false
}
// Parse date to verify it's a valid date
_, err = time.Parse("20060102", date)
return err == nil
}
func main() { func main() {
// Set custom usage message // Set custom usage message
flag.Usage = func() { flag.Usage = func() {
@ -105,10 +84,14 @@ func main() {
fmt.Fprintf(os.Stderr, " With custom options:\n") fmt.Fprintf(os.Stderr, " With custom options:\n")
fmt.Fprintf(os.Stderr, " %s -input papers.json -api-key \"your-key\" -model \"gpt-4\" -json-output \"results.json\" -md-output \"summary.md\"\n", os.Args[0]) fmt.Fprintf(os.Stderr, " %s -input papers.json -api-key \"your-key\" -model \"gpt-4\" -json-output \"results.json\" -md-output \"summary.md\"\n", os.Args[0])
fmt.Fprintf(os.Stderr, " Search only:\n") fmt.Fprintf(os.Stderr, " Search only:\n")
fmt.Fprintf(os.Stderr, " %s -search-only -start 20240101 -end 20240131 -query \"machine learning\" \n\n", os.Args[0]) fmt.Fprintf(os.Stderr, " %s -search-only -start 20240101 -end 20240131 -query \"machine learning\" \n", os.Args[0])
fmt.Fprintf(os.Stderr, " Server mode:\n")
fmt.Fprintf(os.Stderr, " %s -serve -port 8080\n\n", os.Args[0])
} }
// Parse command line arguments // Parse command line arguments
serve := flag.Bool("serve", false, "Run in server mode")
port := flag.String("port", "8080", "Port to run server on")
searchOnly := flag.Bool("search-only", false, "Only fetch papers from arXiv and save to JSON file (do not process)") searchOnly := flag.Bool("search-only", false, "Only fetch papers from arXiv and save to JSON file (do not process)")
inputFile := flag.String("input", "", "Input JSON file containing papers (optional)") inputFile := flag.String("input", "", "Input JSON file containing papers (optional)")
startDate := flag.String("start", "", "Start date in YYYYMMDD format (required if not using -input)") startDate := flag.String("start", "", "Start date in YYYYMMDD format (required if not using -input)")
@ -124,6 +107,14 @@ func main() {
flag.Parse() flag.Parse()
// Validate required flags and input // Validate required flags and input
if *serve {
server := NewServer(*port, *apiEndpoint)
if err := server.Run(); err != nil {
log.Fatal(err)
}
return
}
if *searchOnly { if *searchOnly {
if *startDate == "" || *endDate == "" || *query == "" { if *startDate == "" || *endDate == "" || *query == "" {
fmt.Fprintf(os.Stderr, "Error: start date, end date, and query are required when using -search-only\n\n") fmt.Fprintf(os.Stderr, "Error: start date, end date, and query are required when using -search-only\n\n")
@ -132,7 +123,7 @@ func main() {
} }
// Validate date format // Validate date format
if !isValidDate(*startDate) || !isValidDate(*endDate) { if !IsValidDate(*startDate) || !IsValidDate(*endDate) {
fmt.Fprintf(os.Stderr, "Error: dates must be in YYYYMMDD format\n") fmt.Fprintf(os.Stderr, "Error: dates must be in YYYYMMDD format\n")
os.Exit(1) os.Exit(1)
} }
@ -154,7 +145,7 @@ func main() {
log.Fatalf("Failed to save papers: %v", err) log.Fatalf("Failed to save papers: %v", err)
} }
log.Printf("Successfully fetched and saved papers to %s-%s-%s.json", *startDate, *endDate, sanitizeFilename(*query)) log.Printf("Successfully fetched and saved papers to %s-%s-%s.json", *startDate, *endDate, SanitizeFilename(*query))
os.Exit(0) os.Exit(0)
} }
@ -195,7 +186,7 @@ func main() {
} }
// Validate date format // Validate date format
if !isValidDate(*startDate) || !isValidDate(*endDate) { if !IsValidDate(*startDate) || !IsValidDate(*endDate) {
fmt.Fprintf(os.Stderr, "Error: dates must be in YYYYMMDD format\n") fmt.Fprintf(os.Stderr, "Error: dates must be in YYYYMMDD format\n")
os.Exit(1) os.Exit(1)
} }
@ -217,7 +208,7 @@ func main() {
log.Fatalf("Failed to save papers: %v", err) log.Fatalf("Failed to save papers: %v", err)
} }
baseFilename = fmt.Sprintf("%s-%s-%s", *startDate, *endDate, sanitizeFilename(*query)) baseFilename = fmt.Sprintf("%s-%s-%s", *startDate, *endDate, SanitizeFilename(*query))
} }
// Create processor configuration // Create processor configuration

623
server.go
View File

@ -1,193 +1,474 @@
package main package main
import ( import (
"encoding/json" "encoding/json"
"fmt" "fmt"
"log" "log"
"net/http" "net/http"
"os"
"github.com/go-chi/chi/v5" "strings"
"github.com/go-chi/chi/v5/middleware" "sync"
"time"
"gitea.r8z.us/stwhite/arxiva"
"gitea.r8z.us/stwhite/paperformatter"
"gitea.r8z.us/stwhite/paperprocessor"
"github.com/go-chi/chi/v5"
"github.com/go-chi/chi/v5/middleware"
) )
type Server struct { type ProcessingJob struct {
router *chi.Mux ID string // Unique job identifier
port string Status string // "pending", "processing", "completed", "failed"
StartTime time.Time // When the job started
Error string // Error message if failed
JSONPath string // Path to JSON output file
MDPath string // Path to Markdown output file
MarkdownText string // Content of markdown file when completed
} }
func NewServer(port string) *Server { type Server struct {
s := &Server{ router *chi.Mux
router: chi.NewRouter(), port string
port: port, apiEndpoint string
} jobs map[string]*ProcessingJob // Track processing jobs
jobsMutex sync.RWMutex // Protect jobs map
s.setupRoutes() }
return s
func NewServer(port string, apiEndpoint string) *Server {
s := &Server{
router: chi.NewRouter(),
port: port,
apiEndpoint: apiEndpoint,
jobs: make(map[string]*ProcessingJob),
}
s.setupRoutes()
return s
} }
func (s *Server) setupRoutes() { func (s *Server) setupRoutes() {
s.router.Use(middleware.Logger) s.router.Use(middleware.Logger)
s.router.Use(middleware.Recoverer) s.router.Use(middleware.Recoverer)
s.router.Post("/api/papers/search", s.handleSearch) s.router.Post("/api/papers/search", s.handleSearch)
s.router.Post("/api/papers/process", s.handleProcess) s.router.Post("/api/papers/process", s.handleProcess)
s.router.Post("/api/papers/search-process", s.handleSearchAndProcess) s.router.Post("/api/papers/search-process", s.handleSearchAndProcess)
s.router.Get("/api/jobs/{jobID}", s.handleJobStatus)
} }
func (s *Server) Run() error { func (s *Server) Run() error {
addr := fmt.Sprintf(":%s", s.port) addr := fmt.Sprintf(":%s", s.port)
log.Printf("Starting server on %s", addr) log.Printf("Starting server on %s", addr)
return http.ListenAndServe(addr, s.router) return http.ListenAndServe(addr, s.router)
} }
func (s *Server) handleSearch(w http.ResponseWriter, r *http.Request) { func (s *Server) handleSearch(w http.ResponseWriter, r *http.Request) {
var req struct { var req struct {
StartDate string `json:"start_date"` StartDate string `json:"start_date"`
EndDate string `json:"end_date"` EndDate string `json:"end_date"`
Query string `json:"query"` Query string `json:"query"`
MaxResults int `json:"max_results"` MaxResults int `json:"max_results"`
} }
if err := json.NewDecoder(r.Body).Decode(&req); err != nil { if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
http.Error(w, "Invalid request body", http.StatusBadRequest) http.Error(w, "Invalid request body", http.StatusBadRequest)
return return
} }
// Reuse existing validation // Reuse existing validation
if !isValidDate(req.StartDate) || !isValidDate(req.EndDate) { if !IsValidDate(req.StartDate) || !IsValidDate(req.EndDate) {
http.Error(w, "Invalid date format", http.StatusBadRequest) http.Error(w, "Invalid date format", http.StatusBadRequest)
return return
} }
papers, err := arxiva.FetchPapers(req.StartDate, req.EndDate, req.Query, req.MaxResults) papers, err := arxiva.FetchPapers(req.StartDate, req.EndDate, req.Query, req.MaxResults)
if err != nil { if err != nil {
http.Error(w, err.Error(), http.StatusInternalServerError) http.Error(w, err.Error(), http.StatusInternalServerError)
return return
} }
json.NewEncoder(w).Encode(papers) json.NewEncoder(w).Encode(papers)
} }
func (s *Server) handleProcess(w http.ResponseWriter, r *http.Request) { func (s *Server) handleProcess(w http.ResponseWriter, r *http.Request) {
var req struct { var req struct {
InputFile string `json:"input_file"` Papers []arxiva.Paper `json:"papers,omitempty"` // Optional: Direct paper data
CriteriaFile string `json:"criteria_file"` InputFile string `json:"input_file,omitempty"` // Optional: Input file path
ApiKey string `json:"api_key"` CriteriaFile string `json:"criteria_file,omitempty"` // Optional: Criteria file path
Model string `json:"model"` Criteria string `json:"criteria,omitempty"` // Optional: Direct criteria text
} ApiKey string `json:"api_key"` // Required: API key
Model string `json:"model,omitempty"` // Optional: Model name
if err := json.NewDecoder(r.Body).Decode(&req); err != nil { }
http.Error(w, "Invalid request body", http.StatusBadRequest)
return if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
} http.Error(w, "Invalid request body", http.StatusBadRequest)
return
// Create processor configuration }
config := paperprocessor.Config{
APIEndpoint: *apiEndpoint, // This would need to be passed to Server struct // Validate required fields
APIKey: req.ApiKey, if req.CriteriaFile == "" && req.Criteria == "" {
Model: req.Model, http.Error(w, "either criteria_file or criteria must be provided", http.StatusBadRequest)
RequestDelay: 2 * time.Second, return
} }
if req.ApiKey == "" {
// Process the papers http.Error(w, "api_key is required", http.StatusBadRequest)
outputJSON := req.InputFile + "-processed.json" return
if err := paperprocessor.ProcessFile( }
req.InputFile,
outputJSON, // Create processor configuration
req.CriteriaFile, config := paperprocessor.Config{
config, APIEndpoint: s.apiEndpoint,
); err != nil { APIKey: req.ApiKey,
http.Error(w, fmt.Sprintf("Processing failed: %v", err), http.StatusInternalServerError) Model: req.Model,
return RequestDelay: 2 * time.Second,
} }
// Format to markdown var inputJSON string
outputMD := req.InputFile + "-processed.md"
if err := paperformatter.FormatPapers(outputJSON, outputMD); err != nil { // Handle direct paper data
http.Error(w, fmt.Sprintf("Formatting failed: %v", err), http.StatusInternalServerError) if len(req.Papers) > 0 {
return // Create temporary file for paper data
} tempFile, err := os.CreateTemp("", "papers-*.json")
if err != nil {
// Return the paths to the generated files http.Error(w, fmt.Sprintf("Failed to create temp file: %v", err), http.StatusInternalServerError)
json.NewEncoder(w).Encode(struct { return
JSONOutput string `json:"json_output"` }
MDOutput string `json:"md_output"` defer os.Remove(tempFile.Name()) // Clean up temp file
}{
JSONOutput: outputJSON, // Write papers to temp file
MDOutput: outputMD, if err := json.NewEncoder(tempFile).Encode(req.Papers); err != nil {
}) http.Error(w, fmt.Sprintf("Failed to write papers: %v", err), http.StatusInternalServerError)
return
}
tempFile.Close()
inputJSON = tempFile.Name()
} else if req.InputFile != "" {
inputJSON = req.InputFile
} else {
http.Error(w, "either papers or input_file must be provided", http.StatusBadRequest)
return
}
// Generate output filenames
timestamp := time.Now().Format("20060102150405")
outputJSON := fmt.Sprintf("processed-%s.json", timestamp)
outputMD := fmt.Sprintf("processed-%s.md", timestamp)
// Handle criteria
var criteriaFile string
if req.Criteria != "" {
// Create temporary file for criteria
tempFile, err := os.CreateTemp("", "criteria-*.md")
if err != nil {
http.Error(w, fmt.Sprintf("Failed to create temp criteria file: %v", err), http.StatusInternalServerError)
return
}
defer os.Remove(tempFile.Name()) // Clean up temp file
// Write criteria to temp file
if _, err := tempFile.WriteString(req.Criteria); err != nil {
http.Error(w, fmt.Sprintf("Failed to write criteria: %v", err), http.StatusInternalServerError)
return
}
tempFile.Close()
criteriaFile = tempFile.Name()
} else {
criteriaFile = req.CriteriaFile
}
// Process the papers
if err := paperprocessor.ProcessFile(
inputJSON,
outputJSON,
criteriaFile,
config,
); err != nil {
http.Error(w, fmt.Sprintf("Processing failed: %v", err), http.StatusInternalServerError)
return
}
// Format to markdown
if err := paperformatter.FormatPapers(outputJSON, outputMD); err != nil {
http.Error(w, fmt.Sprintf("Formatting failed: %v", err), http.StatusInternalServerError)
return
}
// Generate job ID and create job
jobID := fmt.Sprintf("job-%s", timestamp)
job := &ProcessingJob{
ID: jobID,
Status: "pending",
StartTime: time.Now(),
JSONPath: outputJSON,
MDPath: outputMD,
}
// Store job
s.jobsMutex.Lock()
s.jobs[jobID] = job
s.jobsMutex.Unlock()
// Start processing in background
go func() {
// Process the papers
if err := paperprocessor.ProcessFile(
inputJSON,
outputJSON,
criteriaFile,
config,
); err != nil {
s.jobsMutex.Lock()
job.Status = "failed"
job.Error = fmt.Sprintf("Processing failed: %v", err)
s.jobsMutex.Unlock()
return
}
// Format to markdown
if err := paperformatter.FormatPapers(outputJSON, outputMD); err != nil {
s.jobsMutex.Lock()
job.Status = "failed"
job.Error = fmt.Sprintf("Formatting failed: %v", err)
s.jobsMutex.Unlock()
return
}
// Read markdown content
mdContent, err := os.ReadFile(outputMD)
if err != nil {
s.jobsMutex.Lock()
job.Status = "failed"
job.Error = fmt.Sprintf("Failed to read markdown: %v", err)
s.jobsMutex.Unlock()
return
}
// Convert Windows line endings to Unix
mdString := strings.ReplaceAll(string(mdContent), "\r\n", "\n")
// Ensure file ends with newline
if !strings.HasSuffix(mdString, "\n") {
mdString += "\n"
}
// Update job with success
s.jobsMutex.Lock()
job.Status = "completed"
job.MarkdownText = mdString
s.jobsMutex.Unlock()
}()
// Return job ID immediately
json.NewEncoder(w).Encode(struct {
JobID string `json:"job_id"`
}{
JobID: jobID,
})
}
func (s *Server) handleJobStatus(w http.ResponseWriter, r *http.Request) {
jobID := chi.URLParam(r, "jobID")
s.jobsMutex.RLock()
job, exists := s.jobs[jobID]
s.jobsMutex.RUnlock()
if !exists {
http.Error(w, "Job not found", http.StatusNotFound)
return
}
response := struct {
ID string `json:"id"`
Status string `json:"status"`
StartTime time.Time `json:"start_time"`
Error string `json:"error,omitempty"`
MarkdownText string `json:"markdown_text,omitempty"`
}{
ID: job.ID,
Status: job.Status,
StartTime: job.StartTime,
Error: job.Error,
}
// Only include markdown text if job is completed
if job.Status == "completed" {
response.MarkdownText = job.MarkdownText
}
json.NewEncoder(w).Encode(response)
} }
func (s *Server) handleSearchAndProcess(w http.ResponseWriter, r *http.Request) { func (s *Server) handleSearchAndProcess(w http.ResponseWriter, r *http.Request) {
var req struct { var req struct {
StartDate string `json:"start_date"` StartDate string `json:"start_date"`
EndDate string `json:"end_date"` EndDate string `json:"end_date"`
Query string `json:"query"` Query string `json:"query"`
MaxResults int `json:"max_results"` MaxResults int `json:"max_results"`
CriteriaFile string `json:"criteria_file"` CriteriaFile string `json:"criteria_file,omitempty"`
ApiKey string `json:"api_key"` Criteria string `json:"criteria,omitempty"`
Model string `json:"model"` ApiKey string `json:"api_key"`
} Model string `json:"model,omitempty"`
}
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
http.Error(w, "Invalid request body", http.StatusBadRequest) if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
return http.Error(w, "Invalid request body", http.StatusBadRequest)
} return
}
// Validate dates
if !isValidDate(req.StartDate) || !isValidDate(req.EndDate) { // Validate dates
http.Error(w, "Invalid date format", http.StatusBadRequest) if !IsValidDate(req.StartDate) || !IsValidDate(req.EndDate) {
return http.Error(w, "Invalid date format", http.StatusBadRequest)
} return
}
// Fetch papers
papers, err := arxiva.FetchPapers(req.StartDate, req.EndDate, req.Query, req.MaxResults) // Fetch papers
if err != nil { papers, err := arxiva.FetchPapers(req.StartDate, req.EndDate, req.Query, req.MaxResults)
http.Error(w, err.Error(), http.StatusInternalServerError) if err != nil {
return http.Error(w, err.Error(), http.StatusInternalServerError)
} return
}
// Save papers to temporary JSON file
baseFilename := fmt.Sprintf("%s-%s-%s", req.StartDate, req.EndDate, sanitizeFilename(req.Query)) // Save papers to temporary JSON file
inputJSON := baseFilename + ".json" baseFilename := fmt.Sprintf("%s-%s-%s", req.StartDate, req.EndDate, SanitizeFilename(req.Query))
if err := arxiva.SaveToFile(papers, req.StartDate, req.EndDate, req.Query); err != nil { inputJSON := baseFilename + ".json"
http.Error(w, fmt.Sprintf("Failed to save papers: %v", err), http.StatusInternalServerError) if err := arxiva.SaveToFile(papers, req.StartDate, req.EndDate, req.Query); err != nil {
return http.Error(w, fmt.Sprintf("Failed to save papers: %v", err), http.StatusInternalServerError)
} return
}
// Create processor configuration
config := paperprocessor.Config{ // Create processor configuration
APIEndpoint: *apiEndpoint, // This would need to be passed to Server struct config := paperprocessor.Config{
APIKey: req.ApiKey, APIEndpoint: s.apiEndpoint,
Model: req.Model, APIKey: req.ApiKey,
RequestDelay: 2 * time.Second, Model: req.Model,
} RequestDelay: 2 * time.Second,
}
// Process the papers
outputJSON := baseFilename + "-processed.json" // Handle criteria
if err := paperprocessor.ProcessFile( var criteriaFile string
inputJSON, var tempCriteriaFile string // Track temporary file for cleanup
outputJSON,
req.CriteriaFile, if req.Criteria != "" {
config, // Create temporary file for criteria
); err != nil { tempFile, err := os.CreateTemp("", "criteria-*.md")
http.Error(w, fmt.Sprintf("Processing failed: %v", err), http.StatusInternalServerError) if err != nil {
return http.Error(w, fmt.Sprintf("Failed to create temp criteria file: %v", err), http.StatusInternalServerError)
} return
}
// Format to markdown tempCriteriaFile = tempFile.Name() // Save for cleanup after processing
outputMD := baseFilename + "-processed.md"
if err := paperformatter.FormatPapers(outputJSON, outputMD); err != nil { // Write criteria to temp file
http.Error(w, fmt.Sprintf("Formatting failed: %v", err), http.StatusInternalServerError) if _, err := tempFile.WriteString(req.Criteria); err != nil {
return os.Remove(tempCriteriaFile) // Clean up on error
} http.Error(w, fmt.Sprintf("Failed to write criteria: %v", err), http.StatusInternalServerError)
return
// Return the paths to the generated files }
json.NewEncoder(w).Encode(struct { tempFile.Close()
JSONOutput string `json:"json_output"` criteriaFile = tempCriteriaFile
MDOutput string `json:"md_output"` } else if req.CriteriaFile != "" {
}{ criteriaFile = req.CriteriaFile
JSONOutput: outputJSON, } else {
MDOutput: outputMD, http.Error(w, "either criteria_file or criteria must be provided", http.StatusBadRequest)
}) return
} }
// Process the papers
outputJSON := baseFilename + "-processed.json"
if err := paperprocessor.ProcessFile(
inputJSON,
outputJSON,
criteriaFile,
config,
); err != nil {
if tempCriteriaFile != "" {
os.Remove(tempCriteriaFile) // Clean up temp file on error
}
http.Error(w, fmt.Sprintf("Processing failed: %v", err), http.StatusInternalServerError)
return
}
// Format to markdown
outputMD := baseFilename + "-processed.md"
if err := paperformatter.FormatPapers(outputJSON, outputMD); err != nil {
http.Error(w, fmt.Sprintf("Formatting failed: %v", err), http.StatusInternalServerError)
return
}
// Generate job ID and create job
jobID := fmt.Sprintf("job-%s", baseFilename)
job := &ProcessingJob{
ID: jobID,
Status: "pending",
StartTime: time.Now(),
JSONPath: outputJSON,
MDPath: outputMD,
}
// Store job
s.jobsMutex.Lock()
s.jobs[jobID] = job
s.jobsMutex.Unlock()
// Start processing in background
go func() {
defer func() {
if tempCriteriaFile != "" {
os.Remove(tempCriteriaFile) // Clean up temp file after processing
}
}()
// Process the papers
if err := paperprocessor.ProcessFile(
inputJSON,
outputJSON,
criteriaFile,
config,
); err != nil {
s.jobsMutex.Lock()
job.Status = "failed"
job.Error = fmt.Sprintf("Processing failed: %v", err)
s.jobsMutex.Unlock()
return
}
// Format to markdown
if err := paperformatter.FormatPapers(outputJSON, outputMD); err != nil {
s.jobsMutex.Lock()
job.Status = "failed"
job.Error = fmt.Sprintf("Formatting failed: %v", err)
s.jobsMutex.Unlock()
return
}
// Read markdown content
mdContent, err := os.ReadFile(outputMD)
if err != nil {
s.jobsMutex.Lock()
job.Status = "failed"
job.Error = fmt.Sprintf("Failed to read markdown: %v", err)
s.jobsMutex.Unlock()
return
}
// Convert Windows line endings to Unix
mdString := strings.ReplaceAll(string(mdContent), "\r\n", "\n")
// Ensure file ends with newline
if !strings.HasSuffix(mdString, "\n") {
mdString += "\n"
}
// Update job with success
s.jobsMutex.Lock()
job.Status = "completed"
job.MarkdownText = mdString
s.jobsMutex.Unlock()
}()
// Return job ID immediately
json.NewEncoder(w).Encode(struct {
JobID string `json:"job_id"`
}{
JobID: jobID,
})
}

27
utils.go Normal file
View File

@ -0,0 +1,27 @@
package main
import (
"regexp"
"strings"
"time"
)
// IsValidDate reports whether date is a real calendar date written in
// YYYYMMDD format (e.g. "20240131").
func IsValidDate(date string) bool {
	// First gate: exactly eight ASCII digits, nothing else.
	if ok, err := regexp.MatchString(`^\d{8}$`, date); err != nil || !ok {
		return false
	}
	// Second gate: the digits must form an actual calendar date
	// (rejects month 13, Feb 30, etc.).
	if _, err := time.Parse("20060102", date); err != nil {
		return false
	}
	return true
}
// SanitizeFilename replaces characters that are awkward in filenames
// (colons and spaces) with underscores, mirroring the sanitization the
// arxiva package applies when it writes output files.
func SanitizeFilename(s string) string {
	r := strings.NewReplacer(":", "_", " ", "_")
	return r.Replace(s)
}