Initial Commit of papers system.
commit 7fd890828e
@@ -0,0 +1,47 @@

## Here are the API signatures for arxiva

### FetchPapers(startDate, endDate, query string, maxResults int) ([]Paper, error)

- startDate: Start date in format "YYYYMMDD"
- endDate: End date in format "YYYYMMDD"
- query: Search query
- maxResults: Maximum number of results (1-2000)

Fetches papers from the arXiv API.
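For example, a minimal fetch might look like this (the date range and query are placeholder values):

```go
papers, err := arxiva.FetchPapers("20240101", "20240131", "machine learning", 100)
if err != nil {
    log.Fatalf("failed to fetch papers: %v", err)
}
log.Printf("fetched %d papers", len(papers))
```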

### SaveToFile(papers []Paper, startDate, endDate, query string) error

- papers: Slice of Paper structs
- startDate: Start date in format "YYYYMMDD"
- endDate: End date in format "YYYYMMDD"
- query: Search query

Saves papers to a JSON file.

The JSON file is named "YYYYMMDD-YYYYMMDD-query.json", where the first YYYYMMDD is the start date, the second YYYYMMDD is the end date, and query is the search query.
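Continuing the fetch example above, saving the results would look like this (the filename shown in the comment follows the naming convention described):

```go
if err := arxiva.SaveToFile(papers, "20240101", "20240131", "machine learning"); err != nil {
    log.Fatalf("failed to save papers: %v", err)
}
// Expected output file per the naming convention: 20240101-20240131-machine learning.json
```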

## Here is the API signature for paperprocessor:

### ProcessFile

`func ProcessFile(inputPath, outputPath, criteriaPath string, config Config, debug bool) error`

Processes papers from an input JSON file and writes results to an output JSON file.

Parameters:

- inputPath: Path to input JSON file containing papers array
- outputPath: Path to write processing results JSON
- criteriaPath: Path to text file with evaluation criteria
- config: Configuration settings for API and processing
- debug: Enable debug logging when true

Returns:

- error: Processing error or nil if successful

You create the config like this:

    config := paperprocessor.Config{
        APIEndpoint:  "http://localhost:1234/v1/chat/completions",
        APIKey:       apiKey,
        Model:        "qwen2-7b-instruct",
        RequestDelay: 2 * time.Second, // 2 second delay between requests
    }
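Using the config above, a full call might look like this (the file paths are placeholders; the final argument disables debug logging):

```go
if err := paperprocessor.ProcessFile(
    "20240101-20240131-machine learning.json", // input papers JSON
    "results.json",                            // processed output JSON
    "criteria.md",                             // evaluation criteria file
    config,
    false, // debug
); err != nil {
    log.Fatal(err)
}
```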

## Here is the usage for paperformatter:

    err := paperformatter.FormatPapers("input.json", "output.md")
    if err != nil {
        log.Fatal(err)
    }
@@ -0,0 +1,11 @@

module gitea.r8z.us/stwhite/papers

go 1.23.4

toolchain go1.23.5

require (
    gitea.r8z.us/stwhite/arxiva v0.1.0
    gitea.r8z.us/stwhite/paperformatter v0.1.3
    gitea.r8z.us/stwhite/paperprocessor v0.1.5
)
@@ -0,0 +1,6 @@

gitea.r8z.us/stwhite/arxiva v0.1.0 h1:v4rRQazWDQN6A4jqUjvJoTuGVlxHH9ee1SU/vculBN4=
gitea.r8z.us/stwhite/arxiva v0.1.0/go.mod h1:V+xRJF205br/E1NM15S0htyfPnG8FERaluMTs97DcGM=
gitea.r8z.us/stwhite/paperformatter v0.1.3 h1:Z8yIdfCmQ+c5A5To+Y3XPHNOdK5B/q5VqBPZkmYqKPc=
gitea.r8z.us/stwhite/paperformatter v0.1.3/go.mod h1:As2zIT0NSsMirYqdvIfIXXxIHOcdsHANhnh0VNcAluQ=
gitea.r8z.us/stwhite/paperprocessor v0.1.5 h1:c9HYWblP0D7mz0/mfcg4j98j1cisrcUPuQFSWuGpsIQ=
gitea.r8z.us/stwhite/paperprocessor v0.1.5/go.mod h1:0wHe7XjtQICFrPKbO53SVrUiVw9yi8GOGo9J7znpo+E=
@@ -0,0 +1,145 @@

package main

import (
    "flag"
    "fmt"
    "log"
    "os"
    "regexp"
    "strings"
    "time"

    "gitea.r8z.us/stwhite/arxiva"
    "gitea.r8z.us/stwhite/paperformatter"
    "gitea.r8z.us/stwhite/paperprocessor"
)

// sanitizeFilename replaces invalid filename characters
func sanitizeFilename(s string) string {
    return strings.ReplaceAll(s, ":", "_")
}

// isValidDate checks if the date string is in YYYYMMDD format
func isValidDate(date string) bool {
    // Check basic format with regex
    matched, err := regexp.MatchString(`^\d{8}$`, date)
    if err != nil || !matched {
        return false
    }

    // Parse date to verify it's a valid date
    _, err = time.Parse("20060102", date)
    return err == nil
}
func main() {
    // Set custom usage message
    flag.Usage = func() {
        fmt.Fprintf(os.Stderr, "Usage: %s [options]\n\n", os.Args[0])
        fmt.Fprintf(os.Stderr, "Description:\n")
        fmt.Fprintf(os.Stderr, "  Fetches papers from arXiv, processes them using an LLM, and generates both JSON and Markdown outputs.\n\n")
        fmt.Fprintf(os.Stderr, "Pipeline:\n")
        fmt.Fprintf(os.Stderr, "  1. Fetches papers from arXiv based on date range and query\n")
        fmt.Fprintf(os.Stderr, "  2. Saves raw papers to JSON (format: YYYYMMDD-YYYYMMDD-query.json)\n")
        fmt.Fprintf(os.Stderr, "  3. Processes papers using specified LLM model\n")
        fmt.Fprintf(os.Stderr, "  4. Formats results to both JSON and Markdown\n\n")
        fmt.Fprintf(os.Stderr, "Required flags:\n")
        fmt.Fprintf(os.Stderr, "  -start   : Start date (YYYYMMDD)\n")
        fmt.Fprintf(os.Stderr, "  -end     : End date (YYYYMMDD)\n")
        fmt.Fprintf(os.Stderr, "  -query   : Search query\n")
        fmt.Fprintf(os.Stderr, "  -api-key : API key for LLM service\n\n")
        fmt.Fprintf(os.Stderr, "Options:\n")
        flag.PrintDefaults()
        fmt.Fprintf(os.Stderr, "\nExamples:\n")
        fmt.Fprintf(os.Stderr, "  Basic usage:\n")
        fmt.Fprintf(os.Stderr, "    %s -start 20240101 -end 20240131 -query \"machine learning\" -api-key \"your-key\"\n\n", os.Args[0])
        fmt.Fprintf(os.Stderr, "  With custom model and outputs:\n")
        fmt.Fprintf(os.Stderr, "    %s -start 20240101 -end 20240131 -query \"machine learning\" -api-key \"your-key\" \\\n", os.Args[0])
        fmt.Fprintf(os.Stderr, "      -model \"gpt-4\" -json-output \"results.json\" -md-output \"summary.md\"\n")
    }

    // Parse command line arguments
    startDate := flag.String("start", "", "Start date in YYYYMMDD format")
    endDate := flag.String("end", "", "End date in YYYYMMDD format")
    query := flag.String("query", "", "Search query")
    maxResults := flag.Int("maxResults", 100, "Maximum number of results (1-2000)")
    model := flag.String("model", "phi-4", "Model to use for processing")
    apiKey := flag.String("api-key", "", "API key for service authentication")
    apiEndpoint := flag.String("api-endpoint", "http://localhost:1234/v1/chat/completions", "API endpoint URL")
    criteriaFile := flag.String("criteria", "criteria.md", "Path to evaluation criteria markdown file")
    jsonOutput := flag.String("json-output", "", "JSON output file path (default: YYYYMMDD-YYYYMMDD-query.json)")
    mdOutput := flag.String("md-output", "", "Markdown output file path (default: YYYYMMDD-YYYYMMDD-query.md)")
    flag.Parse()

    // Generate base filename from parameters with sanitization
    baseFilename := fmt.Sprintf("%s-%s-%s", *startDate, *endDate, sanitizeFilename(*query))

    // Set default output filenames if not provided
    if *jsonOutput == "" {
        *jsonOutput = baseFilename + ".json"
    }
    if *mdOutput == "" {
        *mdOutput = baseFilename + ".md"
    }

    // Validate required flags
    if *startDate == "" || *endDate == "" || *query == "" || *apiKey == "" {
        fmt.Fprintf(os.Stderr, "Error: start date, end date, query, and api-key are required\n\n")
        flag.Usage()
        os.Exit(1)
    }

    // Validate date format
    if !isValidDate(*startDate) || !isValidDate(*endDate) {
        fmt.Fprintf(os.Stderr, "Error: dates must be in YYYYMMDD format\n")
        os.Exit(1)
    }

    // Validate maxResults range
    if *maxResults < 1 || *maxResults > 2000 {
        fmt.Fprintf(os.Stderr, "Error: maxResults must be between 1 and 2000\n")
        os.Exit(1)
    }
// Create processor configuration
|
||||
config := paperprocessor.Config{
|
||||
APIEndpoint: *apiEndpoint,
|
||||
APIKey: *apiKey,
|
||||
Model: *model,
|
||||
RequestDelay: 2 * time.Second,
|
||||
}
|
||||
|
||||
// Fetch papers using command line args
|
||||
papers, err := arxiva.FetchPapers(*startDate, *endDate, *query, *maxResults)
|
||||
if err != nil {
|
||||
log.Fatalf("Failed to fetch papers: %v", err)
|
||||
}
|
||||
|
||||
// Save papers to JSON file using the same naming convention
|
||||
if err := arxiva.SaveToFile(papers, *startDate, *endDate, *query); err != nil {
|
||||
log.Fatalf("Failed to save papers: %v", err)
|
||||
}
|
||||
|
||||
// Wait briefly for file system to sync and verify file exists
|
||||
time.Sleep(100 * time.Millisecond)
|
||||
if _, err := os.Stat(baseFilename + ".json"); os.IsNotExist(err) {
|
||||
log.Fatalf("Failed to find saved papers file: %s", baseFilename+".json")
|
||||
}
|
||||
|
||||
// Process the saved file using the base filename
|
||||
if err := paperprocessor.ProcessFile(
|
||||
baseFilename+".json",
|
||||
*jsonOutput,
|
||||
*criteriaFile,
|
||||
config,
|
||||
); err != nil {
|
||||
log.Fatalf("Processing failed: %v", err)
|
||||
}
|
||||
|
||||
// Format the processed results to markdown
|
||||
if err := paperformatter.FormatPapers(*jsonOutput, *mdOutput); err != nil {
|
||||
log.Fatalf("Formatting failed: %v", err)
|
||||
}
|
||||
|
||||
log.Printf("Successfully processed papers. Results written to %s and formatted to %s", *jsonOutput, *mdOutput)
|
||||
}
|