63 lines
1.6 KiB
Python
63 lines
1.6 KiB
Python
import os
|
|
import json
|
|
import requests
|
|
|
|
# A free Jina AI API key can be obtained at: https://jina.ai/?sui=apikey
# The key is read from the environment once, at import time; the value is
# None when the JINA_API_KEY variable is not set.
JINA_API_KEY = os.environ.get('JINA_API_KEY')
|
|
|
|
|
|
def segment_markdown(file_path, *, timeout=60):
    """
    Segments a markdown file using Jina AI's Segmenter API.

    The file is read as UTF-8 and posted to https://segment.jina.ai/ with
    chunking enabled. Failures are reported with a printed message and a
    None return value rather than a raised exception.

    Args:
        file_path (str): Path to the markdown file.
        timeout (float | tuple): Timeout in seconds passed to
            requests.post, so a stalled connection cannot block forever.

    Returns:
        dict: JSON structure containing the segments, or None if the file
        could not be read, the request failed, or the response body was
        not valid JSON.
    """
    # Read the markdown file. The encoding is explicit so the result does
    # not depend on the platform's locale default.
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            markdown_content = file.read()
    except (OSError, ValueError) as e:
        # OSError: missing/unreadable file. ValueError: covers
        # UnicodeDecodeError and malformed paths (e.g. embedded NUL).
        print(f'Error segmenting markdown: {str(e)}')
        return None

    # Prepare the request to Jina Segmenter API
    headers = {
        'Content-Type': 'application/json',
        'Accept': 'application/json',
    }
    if JINA_API_KEY:
        # Only send the header when a key is configured; formatting an
        # unset key would transmit the literal token "None".
        headers['Authorization'] = f'Bearer {JINA_API_KEY}'

    data = {
        'content': markdown_content,
        'tokenizer': 'cl100k_base',
        'return_tokens': False,
        'return_chunks': True,
        'max_chunk_length': 1000,
    }

    try:
        # Make the API request
        response = requests.post(
            'https://segment.jina.ai/',
            headers=headers,
            json=data,
            timeout=timeout,
        )
        response.raise_for_status()

        # Return the segments as JSON
        return response.json()
    except (requests.RequestException, ValueError) as e:
        # RequestException: connection errors, timeouts, HTTP error status.
        # ValueError: response body that cannot be decoded as JSON.
        print(f'Error segmenting markdown: {str(e)}')
        return None
|
|
|
|
|
|
# Command-line entry point: segment the markdown file given as the single
# argument and print the API response as indented JSON.
if __name__ == '__main__':
    import sys

    # Exactly one argument (the markdown file path) is expected.
    if len(sys.argv) != 2:
        print('Usage: python markdown_segmenter.py <markdown_file>')
        sys.exit(1)

    segments = segment_markdown(sys.argv[1])
    if segments is None:
        # segment_markdown has already printed the error; signal the
        # failure to the calling shell instead of exiting with status 0.
        sys.exit(1)

    # Compare against None (above) rather than truthiness so that an empty
    # but valid JSON result is still printed.
    print(json.dumps(segments, indent=2))
|