ira/markdown_segmenter.py

63 lines
1.6 KiB
Python

import os
import json
import requests
# Jina AI API key, taken from the JINA_API_KEY environment variable
# (None when the variable is not set).
# A key can be obtained for free at: https://jina.ai/?sui=apikey
JINA_API_KEY = os.environ.get('JINA_API_KEY')
def segment_markdown(file_path, *, max_chunk_length=1000, timeout=60):
    """Segment a markdown file using Jina AI's Segmenter API.

    The file is read as UTF-8 and its content is POSTed as JSON to
    ``https://segment.jina.ai/`` with the ``cl100k_base`` tokenizer,
    requesting chunks and no tokens.

    Failures are reported best-effort: an error message is printed and
    ``None`` is returned instead of raising.

    Args:
        file_path (str): Path to the markdown file.
        max_chunk_length (int): Value sent as ``max_chunk_length`` in the
            request body. Defaults to 1000.
        timeout (float or tuple): Timeout in seconds handed to
            ``requests.post`` so a stalled connection cannot block forever.

    Returns:
        dict: Decoded JSON body of the API response, or ``None`` if the
        file could not be read, the request failed, or the response was
        not valid JSON.
    """
    # Read the markdown file. The encoding is pinned to UTF-8 so the result
    # does not depend on the platform's locale default.
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            markdown_content = file.read()
    except (OSError, ValueError, TypeError) as e:
        # OSError: missing/unreadable file; ValueError: includes
        # UnicodeDecodeError; TypeError: file_path is not path-like.
        print(f'Error segmenting markdown: {e}')
        return None

    # Prepare the request to Jina Segmenter API
    headers = {
        'Content-Type': 'application/json',
        'Accept': 'application/json',
    }
    if JINA_API_KEY:
        # Only authenticate when a key is configured; otherwise the literal
        # string "Bearer None" would be sent.
        # NOTE(review): confirm the service accepts unauthenticated calls.
        headers['Authorization'] = f'Bearer {JINA_API_KEY}'

    data = {
        'content': markdown_content,
        'tokenizer': 'cl100k_base',
        'return_tokens': False,
        'return_chunks': True,
        'max_chunk_length': max_chunk_length,
    }

    try:
        # Make the API request
        response = requests.post(
            'https://segment.jina.ai/',
            headers=headers,
            json=data,
            timeout=timeout,
        )
        response.raise_for_status()
        # Return the segments as JSON
        return response.json()
    except (requests.RequestException, ValueError) as e:
        # RequestException covers connection errors, timeouts and HTTP error
        # statuses; ValueError covers a body that is not valid JSON.
        print(f'Error segmenting markdown: {e}')
        return None
if __name__ == '__main__':
    # Command-line entry point: segment the markdown file named by the single
    # positional argument and print the resulting JSON, indented by 2 spaces.
    import sys

    if len(sys.argv) != 2:
        print('Usage: python markdown_segmenter.py <markdown_file>')
        sys.exit(1)

    segments = segment_markdown(sys.argv[1])
    if segments is None:
        # segment_markdown has already printed the error; signal the failure
        # to the calling shell instead of exiting with status 0.
        sys.exit(1)

    print(json.dumps(segments, indent=2))