// paper-system/llm_processor/processor/processor.go

package processor

import (
	"context"
	"encoding/json"
	"fmt"
	"strings"
	"time"

	"llm_processor/client"
	"llm_processor/models"
	"llm_processor/storage"
)

// Processor evaluates papers with an LLM reached through the OpenRouter
// client. Create one with NewProcessor.
type Processor struct {
	// modelName is passed to the client on every EvaluatePaper call.
	modelName string
	// batchSize is the number of papers ProcessPapers hands to processBatch
	// at a time.
	batchSize int
	// timeout bounds the contexts derived in ProcessPapers and processBatch.
	timeout   time.Duration
	// client performs the actual paper evaluation requests.
	client    *client.OpenRouterClient
}

// SetTimeout replaces the processor's timeout. The value is read when
// ProcessPapers and processBatch derive their contexts with
// context.WithTimeout, so it only affects calls started after it is set.
// No validation is performed on the supplied duration.
func (p *Processor) SetTimeout(timeout time.Duration) {
	p.timeout = timeout
}

// NewProcessor builds a Processor that evaluates papers with the model named
// modelName and walks the input batchSize papers at a time. The OpenRouter
// client is created from apiKey. The timeout starts at one hour; call
// SetTimeout to change it.
func NewProcessor(modelName string, batchSize int, apiKey string) *Processor {
	// Default timeout applied until SetTimeout is called.
	const defaultTimeout = time.Hour

	proc := new(Processor)
	proc.modelName = modelName
	proc.batchSize = batchSize
	proc.timeout = defaultTimeout
	proc.client = client.NewOpenRouterClient(apiKey)
	return proc
}

// ProcessPapers loads the papers stored at inputPath, registers criteriaPath
// with the storage package, initializes the results file at outputPath and
// then evaluates the papers batch by batch via processBatch.
//
// The whole run is bounded by a context derived from parentCtx with the
// processor's timeout; the run stops with an error as soon as that context is
// done. delay is forwarded to processBatch, which waits that long after each
// paper.
//
// A non-positive batch size is treated as "no batching": all papers are
// processed as a single batch.
//
// It returns the first error encountered while loading papers, initializing
// the results file or processing a batch, wrapped with context.
func (p *Processor) ProcessPapers(parentCtx context.Context, inputPath, outputPath, criteriaPath string, delay time.Duration) error {
	startTime := time.Now()
	ctx, cancel := context.WithTimeout(parentCtx, p.timeout)
	defer cancel()

	// Load papers from input file
	papers, err := storage.LoadPapers(inputPath)
	if err != nil {
		return fmt.Errorf("failed to load papers: %w", err)
	}

	// Set criteria path in storage
	storage.SetCriteriaPath(criteriaPath)

	// Initialize results file
	if err := storage.InitializeResultsFile(outputPath); err != nil {
		return fmt.Errorf("failed to initialize results file: %w", err)
	}

	// Guard the loop step: a step of 0 would never advance and a negative
	// step would produce a negative slice bound and panic.
	batchSize := p.batchSize
	if batchSize <= 0 {
		batchSize = len(papers)
	}

	// Process papers in batches
	for start := 0; start < len(papers); start += batchSize {
		// Do not start another batch once the deadline has passed or the
		// caller has cancelled.
		if err := ctx.Err(); err != nil {
			return fmt.Errorf("processing stopped after %d of %d papers: %w", start, len(papers), err)
		}

		end := start + batchSize
		if end > len(papers) {
			end = len(papers)
		}

		if err := p.processBatch(ctx, papers[start:end], delay, startTime, outputPath); err != nil {
			return fmt.Errorf("failed to process batch: %w", err)
		}
	}

	return nil
}

// maxEvaluationAttempts is the number of times processBatch calls the model
// for a single paper before recording an ERROR result for it.
const maxEvaluationAttempts = 3

// evaluationResponse is the JSON object the model is expected to return.
type evaluationResponse struct {
	Decision    string `json:"decision"`
	Explanation string `json:"explanation"`
}

// processBatch evaluates every paper in papers and saves one result per paper
// to outputPath.
//
// Each paper is sent to the client up to maxEvaluationAttempts times with an
// exponential backoff (1s, 2s) between attempts. If every attempt fails, the
// paper is saved with decision "ERROR" and processing continues with the next
// paper. After each paper a progress line is printed (elapsed time is measured
// from startTime) and, when delay is positive, the function waits for delay.
//
// The batch runs under a context derived from ctx with the processor's
// timeout. Backoff and delay waits are interruptible; once the context is
// done the function returns an error wrapping ctx.Err() instead of recording
// the remaining papers as failures.
//
// It also returns an error if the criteria cannot be loaded or a result
// cannot be saved.
func (p *Processor) processBatch(ctx context.Context, papers []models.Paper, delay time.Duration, startTime time.Time, outputPath string) error {
	ctx, cancel := context.WithTimeout(ctx, p.timeout)
	defer cancel()

	// Load criteria
	criteria, err := storage.LoadCriteria()
	if err != nil {
		return fmt.Errorf("failed to load criteria: %w", err)
	}

	for i, paper := range papers {
		// A cancelled or expired context makes every further evaluation fail;
		// stop here rather than writing an ERROR result for each paper.
		if err := ctx.Err(); err != nil {
			return fmt.Errorf("batch stopped before paper %d/%d: %w", i+1, len(papers), err)
		}

		var (
			evaluation string
			lastErr    error
		)
		for attempt := 0; attempt < maxEvaluationAttempts; attempt++ {
			evaluation, lastErr = p.client.EvaluatePaper(ctx, paper, criteria, p.modelName)
			if lastErr == nil {
				break
			}
			// A failure caused by cancellation is not a verdict on the paper.
			if err := ctx.Err(); err != nil {
				return fmt.Errorf("evaluation of paper %q interrupted: %w", paper.Title, err)
			}
			if attempt == maxEvaluationAttempts-1 {
				break // nothing left to retry, so do not wait
			}
			// Exponential backoff: 1s, 2s, ...
			if err := sleepWithContext(ctx, time.Second<<uint(attempt)); err != nil {
				return fmt.Errorf("evaluation of paper %q interrupted: %w", paper.Title, err)
			}
		}

		var evalResponse evaluationResponse
		if lastErr != nil {
			// Record the failure and continue with the next paper. The value
			// is built directly so that quotes or newlines in the error text
			// cannot corrupt it.
			evalResponse = evaluationResponse{
				Decision:    "ERROR",
				Explanation: fmt.Sprintf("Failed to evaluate paper after %d attempts: %v", maxEvaluationAttempts, lastErr),
			}
		} else {
			evalResponse = parseEvaluation(evaluation)
		}

		result := models.Result{
			Paper:       paper,
			Decision:    evalResponse.Decision,
			Explanation: sanitizeExplanation(evalResponse.Explanation),
		}

		// Save result with detailed logging
		fmt.Printf("Saving result for paper %q to %s\n", paper.Title, outputPath)
		if err := storage.SaveResult(result, outputPath); err != nil {
			fmt.Printf("Failed to save result for paper %q: %v\n", paper.Title, err)
			return fmt.Errorf("failed to save result: %w", err)
		}
		fmt.Printf("Successfully saved result for paper %q\n", paper.Title)

		// Print progress
		elapsed := time.Since(startTime).Seconds()
		fmt.Printf("Processed paper %d/%d (%s) - Total runtime: %.2f seconds\n",
			i+1, len(papers), paper.Title, elapsed)

		// Apply delay between papers if specified
		if delay > 0 {
			// An interrupted delay is deliberately not treated as a failure:
			// this paper's result is already saved, and the check at the top
			// of the loop reports the cancellation if papers remain.
			_ = sleepWithContext(ctx, delay)
		}
	}

	return nil
}

// parseEvaluation turns the raw model output into an evaluationResponse.
//
// It first decodes raw as JSON, then retries on the text between the first
// "{" and the last "}" (covers output wrapped in code fences or prose), and
// finally falls back to substring matching, where the decision defaults to
// "ERROR" and the explanation defaults to the raw text.
func parseEvaluation(raw string) evaluationResponse {
	var resp evaluationResponse
	if err := json.Unmarshal([]byte(raw), &resp); err == nil {
		return resp
	}

	if start, end := strings.Index(raw, "{"), strings.LastIndex(raw, "}"); start != -1 && end > start {
		// Decode into a fresh value: a failed Unmarshal may leave fields
		// partially populated.
		var embedded evaluationResponse
		if err := json.Unmarshal([]byte(raw[start:end+1]), &embedded); err == nil {
			return embedded
		}
	}

	resp = evaluationResponse{Decision: "ERROR", Explanation: raw}
	switch {
	case strings.Contains(raw, `"decision": "ACCEPT"`), strings.Contains(raw, `"decision":"ACCEPT"`):
		resp.Decision = "ACCEPT"
	case strings.Contains(raw, `"decision": "REJECT"`), strings.Contains(raw, `"decision":"REJECT"`):
		resp.Decision = "REJECT"
	}

	// Take whatever follows the first `"explanation":` and strip the
	// surrounding quotes, whitespace and trailing JSON/fence punctuation.
	if parts := strings.SplitN(raw, `"explanation"`, 2); len(parts) == 2 {
		if colon := strings.Index(parts[1], ":"); colon != -1 {
			value := strings.TrimLeft(parts[1][colon+1:], " \t\n\r\"")
			resp.Explanation = strings.TrimRight(value, " \t\n\r\"},`")
		}
	}
	return resp
}

// sanitizeExplanation prepares an explanation for output: it removes
// code-fence markers, unwraps the "explanation" field when the text is itself
// a JSON object containing one, and backslash-escapes the markdown characters
// "*", "_" and "`".
func sanitizeExplanation(explanation string) string {
	// Remove any markdown code block syntax
	explanation = strings.ReplaceAll(explanation, "```", "")

	if strings.HasPrefix(strings.TrimSpace(explanation), "{") {
		var nested evaluationResponse
		if err := json.Unmarshal([]byte(explanation), &nested); err == nil && nested.Explanation != "" {
			explanation = nested.Explanation
		}
	}

	// The replacements never produce one of the target characters, so a
	// single pass is equivalent to replacing them one after another.
	return strings.NewReplacer("*", "\\*", "_", "\\_", "`", "\\`").Replace(explanation)
}

// sleepWithContext waits for d or until ctx is done, whichever comes first.
// It returns nil when the full duration elapsed and ctx.Err() otherwise.
func sleepWithContext(ctx context.Context, d time.Duration) error {
	timer := time.NewTimer(d)
	defer timer.Stop()

	select {
	case <-ctx.Done():
		return ctx.Err()
	case <-timer.C:
		return nil
	}
}