diff --git a/README.md b/README.md index a9cad05..4825faf 100644 --- a/README.md +++ b/README.md @@ -7,8 +7,10 @@ PyNamer is a command-line tool that uses AI vision models to generate descriptiv - Uses LiteLLM to integrate with various vision-capable LLMs (default: GPT-4 Vision) - Configurable via YAML config file - Supports multiple image formats (jpg, jpeg, png, gif, webp) +- Automatically resizes large images before processing (configurable max dimension) - Dry-run mode to preview changes without renaming files - Handles filename collisions automatically +- Robust config file discovery (user config, package config, or explicit path) ## Installation @@ -47,17 +49,35 @@ You can customize the following settings: - LLM provider and model - API key and endpoint - Supported image formats +- Image resizing parameters (max dimension, output format) - Prompt templates for filename generation Example configuration file: ```yaml +# LLM API Configuration llm: provider: "openai" model: "gpt-4-vision-preview" api_key: "your-api-key-here" max_tokens: 100 temperature: 0.7 + +# Image Processing +image: + supported_formats: + - ".jpg" + - ".jpeg" + - ".png" + - ".gif" + - ".webp" + resize_max_dimension: 1024 # Max width/height before resizing + resize_format: "JPEG" # Encoding format for every image sent to the model (applied even when no resize is needed) + +# Prompt Configuration +prompt: + system_message: "You are a helpful assistant that generates concise, descriptive filenames..." + user_message: "Generate a descriptive filename for this image..." 
``` ## Usage diff --git a/src/pynamer/core.py b/src/pynamer/core.py index d3a5962..434cdac 100644 --- a/src/pynamer/core.py +++ b/src/pynamer/core.py @@ -1,14 +1,18 @@ """Core functionality for PyNamer.""" +import argparse import base64 +import io import os import sys from pathlib import Path import yaml from typing import Dict, List, Optional, Union + import litellm from litellm import completion import logging +from PIL import Image # Configure logging logging.basicConfig( @@ -24,7 +28,8 @@ class PyNamer: """Initialize the PyNamer with configuration. Args: - config_path: Path to the YAML configuration file + config_path: Optional path to the YAML configuration file. + If None, will look in default locations. """ if config_path is None: # Look for config in user's home directory first @@ -34,8 +39,13 @@ class PyNamer: logger.info(f"Using user config from {user_config_path}") else: # Fall back to default config in package - config_path = os.path.join(os.path.dirname(__file__), 'config.yaml') - logger.info(f"Using default config from {config_path}") + package_dir = os.path.dirname(os.path.abspath(__file__)) + config_path = os.path.join(package_dir, 'config.yaml') + if os.path.exists(config_path): + logger.info(f"Using package config from {config_path}") + else: + logger.error("No configuration file found in package directory") + sys.exit(1) self.config = self._load_config(config_path) self._setup_llm() @@ -75,21 +85,54 @@ class PyNamer: self.model = llm_config.get('model', 'gpt-4-vision-preview') self.max_tokens = llm_config.get('max_tokens', 100) self.temperature = llm_config.get('temperature', 0.7) + + # Image processing settings + image_config = self.config.get('image', {}) + self.resize_max_dimension = image_config.get('resize_max_dimension', 1024) + self.resize_format = image_config.get('resize_format', 'JPEG') logger.info(f"LLM setup complete. 
Using model: {self.model}") - - def _encode_image(self, image_path: str) -> str: - """Encode image to base64 for API submission. - + logger.info(f"Image resize settings: max_dimension={self.resize_max_dimension}, format={self.resize_format}") + + def _resize_and_encode_image(self, image_path: str) -> str: + """Resize image if necessary and encode to base64 for API submission. + Args: image_path: Path to the image file Returns: Base64 encoded image string """ - with open(image_path, "rb") as image_file: - return base64.b64encode(image_file.read()).decode('utf-8') - + try: + with Image.open(image_path) as img: + # Calculate new size maintaining aspect ratio + width, height = img.size + if max(width, height) > self.resize_max_dimension: + if width > height: + new_width = self.resize_max_dimension + new_height = int(height * (self.resize_max_dimension / width)) + else: + new_height = self.resize_max_dimension + new_width = int(width * (self.resize_max_dimension / height)) + + logger.debug(f"Resizing image from {width}x{height} to {new_width}x{new_height}") + img = img.resize((new_width, new_height), Image.Resampling.LANCZOS) + else: + logger.debug("Image size is within limits, no resize needed.") + + # Save resized image to a bytes buffer + buffer = io.BytesIO() + # Handle potential transparency issues when saving as JPEG + if self.resize_format.upper() == 'JPEG' and img.mode in ('RGBA', 'P'): + img = img.convert('RGB') + img.save(buffer, format=self.resize_format) + img_bytes = buffer.getvalue() + + return base64.b64encode(img_bytes).decode('utf-8') + except Exception as e: + logger.error(f"Error processing image {image_path}: {e}") + raise + def _is_supported_format(self, file_path: str) -> bool: """Check if the file format is supported. 
@@ -121,9 +164,12 @@ class PyNamer: return None try: - # Encode image - base64_image = self._encode_image(image_path) + # Resize and encode image + base64_image = self._resize_and_encode_image(image_path) + # Determine the mime type based on the resize format + mime_type = f"image/{self.resize_format.lower()}" + # Prepare messages for LLM system_message = self.config.get('prompt', {}).get('system_message', '') user_message = self.config.get('prompt', {}).get('user_message', '') @@ -136,7 +182,7 @@ class PyNamer: {"type": "text", "text": user_message}, { "type": "image_url", - "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"} + "image_url": {"url": f"data:{mime_type};base64,{base64_image}"} } ] }