Compare commits
No commits in common. "4813904fc75994362297c6581d01beb19cd471a4" and "175a4f57c6a13904a541c8e5129b51a1bf72c796" have entirely different histories.

README.md (95 lines removed)

# Papers

A Go CLI tool for fetching, processing, and analyzing academic papers from arXiv using LLM-based evaluation.

## Features

- Fetch papers from arXiv API based on date range and search query
- Process papers using configurable LLM models (default: phi-4)
- Generate both JSON and Markdown outputs
- Customizable evaluation criteria
- Rate-limited API requests (2-second delay between requests; see the sketch after this list)

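The 2-second spacing matches the `RequestDelay: 2 * time.Second` value that papers.go passes to paperprocessor (visible in the diff further down). A minimal, self-contained sketch of the idea follows; the helper here is illustrative only and is not part of papers, arxiva, or paperprocessor:

```go
package main

import (
    "fmt"
    "time"
)

// rateLimited invokes fn for each item, sleeping for delay between calls.
// Illustrative sketch; the real throttling lives inside paperprocessor.
func rateLimited(items []string, delay time.Duration, fn func(string)) {
    for i, item := range items {
        if i > 0 {
            time.Sleep(delay) // fixed spacing between API requests
        }
        fn(item)
    }
}

func main() {
    ids := []string{"2401.00001", "2401.00002", "2401.00003"} // placeholder arXiv IDs
    rateLimited(ids, 2*time.Second, func(id string) {
        fmt.Println("processing", id) // stand-in for one LLM evaluation request
    })
}
```
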
## Installation

```bash
go install gitea.r8z.us/stwhite/papers@latest
```

## Usage

Basic usage:

```bash
papers -start 20240101 -end 20240131 -query "machine learning" -api-key "your-key"
```

With custom model and output paths:

```bash
papers -start 20240101 -end 20240131 -query "machine learning" -api-key "your-key" \
  -model "gpt-4" -json-output "results.json" -md-output "summary.md"
```

Fetch papers without processing:

```bash
papers -search-only -start 20240101 -end 20240131 -query "machine learning"
```

Use an input file instead of fetching:

```bash
papers -input papers.json -api-key "your-key"
```

### Required Flags

- `-start`: Start date (YYYYMMDD format)
- `-end`: End date (YYYYMMDD format)
- `-query`: Search query
- `-api-key`: API key for the LLM service (not needed with `-search-only`)

### Optional Flags

- `-search-only`: Fetch papers from arXiv and save to JSON file without processing
- `-input`: Input JSON file containing papers (optional)
- `-maxResults`: Maximum number of results to fetch (1-2000, default: 100)
- `-model`: LLM model to use for processing (default: "phi-4")
- `-api-endpoint`: API endpoint URL (default: "http://localhost:1234/v1/chat/completions")
- `-criteria`: Path to evaluation criteria markdown file (default: "criteria.md")
- `-json-output`: Custom JSON output file path (default: YYYYMMDD-YYYYMMDD-query.json)
- `-md-output`: Custom Markdown output file path (default: YYYYMMDD-YYYYMMDD-query.md)

## Pipeline

1. **Fetch**: Retrieves papers from arXiv based on specified date range and query
2. **Save**: Stores raw paper data in JSON format
3. **Process**: Evaluates papers using the specified LLM model according to criteria
4. **Format**: Generates both JSON and Markdown outputs of the processed results

A condensed sketch of steps 1-3 follows below.

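For orientation, this sketch strings together the fetch, save, and process calls exactly as they appear in the papers.go diff below. The final formatting step via paperformatter is omitted because its call is not shown there, only the two `Config` fields visible in the diff are set, and the start/end/query values are placeholders:

```go
package main

import (
    "fmt"
    "log"
    "time"

    "gitea.r8z.us/stwhite/arxiva"
    "gitea.r8z.us/stwhite/paperprocessor"
)

func main() {
    start, end, query := "20240101", "20240131", "machine learning"

    // 1. Fetch: retrieve papers for the date range and query.
    papers, err := arxiva.FetchPapers(start, end, query, 100)
    if err != nil {
        log.Fatalf("fetch: %v", err)
    }

    // 2. Save: write the raw papers to YYYYMMDD-YYYYMMDD-query.json.
    if err := arxiva.SaveToFile(papers, start, end, query); err != nil {
        log.Fatalf("save: %v", err)
    }

    // 3. Process: evaluate the saved file against criteria.md with the LLM.
    // papers.go sanitizes the query before building this filename; the real
    // program also passes the API key and model from flags, but those Config
    // field names are not visible in the diff, so they are left out here.
    config := paperprocessor.Config{
        APIEndpoint:  "http://localhost:1234/v1/chat/completions",
        RequestDelay: 2 * time.Second,
    }
    input := fmt.Sprintf("%s-%s-%s.json", start, end, query)
    if err := paperprocessor.ProcessFile(input, "results.json", "criteria.md", config); err != nil {
        log.Fatalf("process: %v", err)
    }
}
```
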
## Output Files

The tool generates two types of output files:

1. **JSON Output**: Contains the raw processing results
   - Default name format: `YYYYMMDD-YYYYMMDD-query.json`
   - Can be customized with `-json-output` flag

2. **Markdown Output**: Human-readable formatted results
   - Default name format: `YYYYMMDD-YYYYMMDD-query.md`
   - Can be customized with `-md-output` flag

How the default base name is derived is sketched below.

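The default base name combines the start date, end date, and a sanitized copy of the query. Only the `:` replacement inside `sanitizeFilename` is visible in the diff further down, so the sketch below mirrors just that part; the real function may normalize other characters as well:

```go
package main

import (
    "fmt"
    "strings"
)

// sanitizeQuery mirrors the ':' replacement visible in papers.go's
// sanitizeFilename; the full implementation is not shown in the diff.
func sanitizeQuery(s string) string {
    return strings.ReplaceAll(s, ":", "_")
}

func main() {
    start, end, query := "20240101", "20240131", "cat:cs.LG"
    base := fmt.Sprintf("%s-%s-%s", start, end, sanitizeQuery(query))
    fmt.Println(base + ".json") // 20240101-20240131-cat_cs.LG.json
    fmt.Println(base + ".md")   // 20240101-20240131-cat_cs.LG.md
}
```
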
## Dependencies

- [arxiva](https://gitea.r8z.us/stwhite/arxiva): Paper fetching from arXiv
- [paperprocessor](https://gitea.r8z.us/stwhite/paperprocessor): LLM-based paper processing
- [paperformatter](https://gitea.r8z.us/stwhite/paperformatter): Output formatting

## Error Handling

The tool includes various error checks:

- Date format validation (YYYYMMDD)
- Required flag validation
- Maximum results range validation (1-2000)
- File system operations verification
- API request error handling

A minimal sketch of the date and range checks follows below.

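The `isValidDate` implementation is not shown in the diff, so this sketch uses an assumed `^\d{8}$` pattern for the YYYYMMDD check and pairs it with the 1-2000 range check that does appear in papers.go:

```go
package main

import (
    "fmt"
    "regexp"
)

// yyyymmdd is an assumed stand-in for the pattern isValidDate might use;
// it checks the shape of the string only, not calendar validity.
var yyyymmdd = regexp.MustCompile(`^\d{8}$`)

func isValidDate(s string) bool {
    return yyyymmdd.MatchString(s)
}

func validateArgs(start, end string, maxResults int) error {
    if !isValidDate(start) || !isValidDate(end) {
        return fmt.Errorf("dates must be in YYYYMMDD format")
    }
    if maxResults < 1 || maxResults > 2000 {
        return fmt.Errorf("maxResults must be between 1 and 2000")
    }
    return nil
}

func main() {
    if err := validateArgs("20240101", "20240131", 100); err != nil {
        fmt.Println("error:", err)
        return
    }
    fmt.Println("arguments look valid")
}
```
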
## License

[License information not provided in source]

papers.go (201 changed lines)

@@ -1,10 +1,8 @@
 package main
 
 import (
-    "encoding/json"
     "flag"
     "fmt"
-    "io"
     "log"
     "os"
     "regexp"
@@ -16,47 +14,6 @@ import (
     "gitea.r8z.us/stwhite/paperprocessor"
 )
 
-// Paper represents the expected structure of papers in the input JSON file
-type Paper struct {
-    Title    string `json:"title"`
-    Abstract string `json:"abstract"`
-    ArxivID  string `json:"arxiv_id"`
-}
-
-// validateInputFile checks if the input file exists and has valid JSON structure
-func validateInputFile(path string) ([]Paper, error) {
-    file, err := os.Open(path)
-    if err != nil {
-        return nil, fmt.Errorf("failed to open input file: %v", err)
-    }
-    defer file.Close()
-
-    content, err := io.ReadAll(file)
-    if err != nil {
-        return nil, fmt.Errorf("failed to read input file: %v", err)
-    }
-
-    var papers []Paper
-    if err := json.Unmarshal(content, &papers); err != nil {
-        return nil, fmt.Errorf("invalid JSON format: %v", err)
-    }
-
-    // Validate required fields
-    for i, paper := range papers {
-        if paper.Title == "" {
-            return nil, fmt.Errorf("paper at index %d missing title", i)
-        }
-        if paper.Abstract == "" {
-            return nil, fmt.Errorf("paper at index %d missing abstract", i)
-        }
-        if paper.ArxivID == "" {
-            return nil, fmt.Errorf("paper at index %d missing arxiv_id", i)
-        }
-    }
-
-    return papers, nil
-}
-
 // sanitizeFilename replaces invalid filename characters to match arxiva's sanitization
 func sanitizeFilename(s string) string {
     s = strings.ReplaceAll(s, ":", "_")
@@ -82,38 +39,31 @@ func main() {
     flag.Usage = func() {
         fmt.Fprintf(os.Stderr, "Usage: %s [options]\n\n", os.Args[0])
         fmt.Fprintf(os.Stderr, "Description:\n")
-        fmt.Fprintf(os.Stderr, " Fetches papers from arXiv (or uses input file), processes them using an LLM, and generates both JSON and Markdown outputs.\n\n")
+        fmt.Fprintf(os.Stderr, " Fetches papers from arXiv, processes them using an LLM, and generates both JSON and Markdown outputs.\n\n")
         fmt.Fprintf(os.Stderr, "Pipeline:\n")
-        fmt.Fprintf(os.Stderr, " 1. Either:\n")
-        fmt.Fprintf(os.Stderr, " a) Fetches papers from arXiv based on date range and query, or\n")
-        fmt.Fprintf(os.Stderr, " b) Uses papers from provided input file\n")
-        fmt.Fprintf(os.Stderr, " 2. Processes papers using specified LLM model\n")
-        fmt.Fprintf(os.Stderr, " 3. Formats results to both JSON and Markdown\n\n")
+        fmt.Fprintf(os.Stderr, " 1. Fetches papers from arXiv based on date range and query\n")
+        fmt.Fprintf(os.Stderr, " 2. Saves raw papers to JSON (format: YYYYMMDD-YYYYMMDD-query.json)\n")
+        fmt.Fprintf(os.Stderr, " 3. Processes papers using specified LLM model\n")
+        fmt.Fprintf(os.Stderr, " 4. Formats results to both JSON and Markdown\n\n")
         fmt.Fprintf(os.Stderr, "Required flags:\n")
-        fmt.Fprintf(os.Stderr, " -api-key : API key for LLM service\n\n")
-        fmt.Fprintf(os.Stderr, "Required for arXiv fetching (if not using -input):\n")
         fmt.Fprintf(os.Stderr, " -start : Start date (YYYYMMDD)\n")
         fmt.Fprintf(os.Stderr, " -end : End date (YYYYMMDD)\n")
-        fmt.Fprintf(os.Stderr, " -query : Search query\n\n")
+        fmt.Fprintf(os.Stderr, " -query : Search query\n")
+        fmt.Fprintf(os.Stderr, " -api-key : API key for LLM service\n\n")
         fmt.Fprintf(os.Stderr, "Options:\n")
         flag.PrintDefaults()
         fmt.Fprintf(os.Stderr, "\nExamples:\n")
-        fmt.Fprintf(os.Stderr, " Using arXiv:\n")
+        fmt.Fprintf(os.Stderr, " Basic usage:\n")
         fmt.Fprintf(os.Stderr, " %s -start 20240101 -end 20240131 -query \"machine learning\" -api-key \"your-key\"\n\n", os.Args[0])
-        fmt.Fprintf(os.Stderr, " Using input file:\n")
-        fmt.Fprintf(os.Stderr, " %s -input papers.json -api-key \"your-key\"\n\n", os.Args[0])
-        fmt.Fprintf(os.Stderr, " With custom options:\n")
-        fmt.Fprintf(os.Stderr, " %s -input papers.json -api-key \"your-key\" -model \"gpt-4\" -json-output \"results.json\" -md-output \"summary.md\"\n", os.Args[0])
-        fmt.Fprintf(os.Stderr, " Search only:\n")
-        fmt.Fprintf(os.Stderr, " %s -search-only -start 20240101 -end 20240131 -query \"machine learning\" \n\n", os.Args[0])
+        fmt.Fprintf(os.Stderr, " With custom model and outputs:\n")
+        fmt.Fprintf(os.Stderr, " %s -start 20240101 -end 20240131 -query \"machine learning\" -api-key \"your-key\" \\\n", os.Args[0])
+        fmt.Fprintf(os.Stderr, " -model \"gpt-4\" -json-output \"results.json\" -md-output \"summary.md\"\n")
     }
 
     // Parse command line arguments
-    searchOnly := flag.Bool("search-only", false, "Only fetch papers from arXiv and save to JSON file (do not process)")
-    inputFile := flag.String("input", "", "Input JSON file containing papers (optional)")
-    startDate := flag.String("start", "", "Start date in YYYYMMDD format (required if not using -input)")
-    endDate := flag.String("end", "", "End date in YYYYMMDD format (required if not using -input)")
-    query := flag.String("query", "", "Search query (required if not using -input)")
+    startDate := flag.String("start", "", "Start date in YYYYMMDD format")
+    endDate := flag.String("end", "", "End date in YYYYMMDD format")
+    query := flag.String("query", "", "Search query")
     maxResults := flag.Int("maxResults", 100, "Maximum number of results (1-2000)")
     model := flag.String("model", "phi-4", "Model to use for processing")
     apiKey := flag.String("api-key", "", "API key for service authentication")
@@ -123,10 +73,20 @@ func main() {
     mdOutput := flag.String("md-output", "", "Markdown output file path (default: YYYYMMDD-YYYYMMDD-query.md)")
     flag.Parse()
 
-    // Validate required flags and input
-    if *searchOnly {
-        if *startDate == "" || *endDate == "" || *query == "" {
-            fmt.Fprintf(os.Stderr, "Error: start date, end date, and query are required when using -search-only\n\n")
+    // Generate base filename from parameters with sanitization
+    baseFilename := fmt.Sprintf("%s-%s-%s", *startDate, *endDate, sanitizeFilename(*query))
+
+    // Set default output filenames if not provided
+    if *jsonOutput == "" {
+        *jsonOutput = baseFilename + ".json"
+    }
+    if *mdOutput == "" {
+        *mdOutput = baseFilename + ".md"
+    }
+
+    // Validate required flags
+    if *startDate == "" || *endDate == "" || *query == "" || *apiKey == "" {
+        fmt.Fprintf(os.Stderr, "Error: start date, end date, query, and api-key are required\n\n")
         flag.Usage()
         os.Exit(1)
     }
@@ -143,83 +103,6 @@ func main() {
         os.Exit(1)
     }
 
-        // Fetch papers from arXiv
-        papers, err := arxiva.FetchPapers(*startDate, *endDate, *query, *maxResults)
-        if err != nil {
-            log.Fatalf("Failed to fetch papers: %v", err)
-        }
-
-        // Save papers to JSON file using the same naming convention
-        if err := arxiva.SaveToFile(papers, *startDate, *endDate, *query); err != nil {
-            log.Fatalf("Failed to save papers: %v", err)
-        }
-
-        log.Printf("Successfully fetched and saved papers to %s-%s-%s.json", *startDate, *endDate, sanitizeFilename(*query))
-        os.Exit(0)
-    }
-
-    var (
-        papers       []arxiva.Paper
-        err          error
-        baseFilename string
-    )
-
-    if *inputFile != "" {
-        // Use input file
-        inputPapers, err := validateInputFile(*inputFile)
-        if err != nil {
-            log.Fatalf("Invalid input file: %v", err)
-        }
-
-        // Convert input papers to arxiva.Paper format
-        papers = make([]arxiva.Paper, len(inputPapers))
-        for i, p := range inputPapers {
-            papers[i] = arxiva.Paper{
-                Title:    p.Title,
-                Abstract: p.Abstract,
-                ArxivID:  p.ArxivID,
-            }
-        }
-
-        // Use input filename as base for outputs
-        baseFilename = *inputFile
-        if ext := ".json"; strings.HasSuffix(baseFilename, ext) {
-            baseFilename = baseFilename[:len(baseFilename)-len(ext)]
-        }
-    } else {
-        // Validate arXiv fetching parameters
-        if *startDate == "" || *endDate == "" || *query == "" {
-            fmt.Fprintf(os.Stderr, "Error: start date, end date, and query are required when not using -input\n\n")
-            flag.Usage()
-            os.Exit(1)
-        }
-
-        // Validate date format
-        if !isValidDate(*startDate) || !isValidDate(*endDate) {
-            fmt.Fprintf(os.Stderr, "Error: dates must be in YYYYMMDD format\n")
-            os.Exit(1)
-        }
-
-        // Validate maxResults range
-        if *maxResults < 1 || *maxResults > 2000 {
-            fmt.Fprintf(os.Stderr, "Error: maxResults must be between 1 and 2000\n")
-            os.Exit(1)
-        }
-
-        // Fetch papers from arXiv
-        papers, err = arxiva.FetchPapers(*startDate, *endDate, *query, *maxResults)
-        if err != nil {
-            log.Fatalf("Failed to fetch papers: %v", err)
-        }
-
-        // Save papers to JSON file using the same naming convention
-        if err := arxiva.SaveToFile(papers, *startDate, *endDate, *query); err != nil {
-            log.Fatalf("Failed to save papers: %v", err)
-        }
-
-        baseFilename = fmt.Sprintf("%s-%s-%s", *startDate, *endDate, sanitizeFilename(*query))
-    }
-
     // Create processor configuration
     config := paperprocessor.Config{
         APIEndpoint: *apiEndpoint,
@@ -228,24 +111,26 @@ func main() {
         RequestDelay: 2 * time.Second,
     }
 
-    // Get criteria filename without extension for output naming
-    criteriaBase := *criteriaFile
-    if ext := ".md"; strings.HasSuffix(criteriaBase, ext) {
-        criteriaBase = criteriaBase[:len(criteriaBase)-len(ext)]
+    // Fetch papers using command line args
+    papers, err := arxiva.FetchPapers(*startDate, *endDate, *query, *maxResults)
+    if err != nil {
+        log.Fatalf("Failed to fetch papers: %v", err)
     }
 
-    // Set default output filenames if not provided
-    if *jsonOutput == "" {
-        *jsonOutput = fmt.Sprintf("%s-%s.json", baseFilename, criteriaBase)
-    }
-    if *mdOutput == "" {
-        *mdOutput = fmt.Sprintf("%s-%s.md", baseFilename, criteriaBase)
+    // Save papers to JSON file using the same naming convention
+    if err := arxiva.SaveToFile(papers, *startDate, *endDate, *query); err != nil {
+        log.Fatalf("Failed to save papers: %v", err)
     }
 
-    // Process the papers
-    inputJson := baseFilename + ".json"
+    // Wait briefly for file system to sync and verify file exists
+    time.Sleep(100 * time.Millisecond)
+    if _, err := os.Stat(baseFilename + ".json"); os.IsNotExist(err) {
+        log.Fatalf("Failed to find saved papers file: %s", baseFilename+".json")
+    }
+
+    // Process the saved file using the base filename
     if err := paperprocessor.ProcessFile(
-        inputJson,
+        baseFilename+".json",
         *jsonOutput,
         *criteriaFile,
         config,