Improve document scraper with better error handling and add mock option for testing
commit 34be5ce36f · parent 695e4b7ecd
@@ -35,13 +35,20 @@ class DocumentScraper:
     using Jina Reader API or fallback methods.
     """

-    def __init__(self):
-        """Initialize the document scraper."""
+    def __init__(self, use_mock: bool = False):
+        """
+        Initialize the document scraper.
+
+        Args:
+            use_mock: If True, use mock data instead of making actual API calls
+        """
         self.config = get_config()
         self.api_key = self._get_api_key()
         self.endpoint = "https://api.jina.ai/v1/reader"
         self.db_manager = get_db_manager()
         self.tokenizer = tiktoken.get_encoding("cl100k_base")  # Using OpenAI's tokenizer
+        self.use_mock = use_mock
+        self.jina_api_available = self.api_key != ""

     def _get_api_key(self) -> str:
         """
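
The new flag gives two construction paths. A minimal usage sketch, assuming the `report.document_scraper` module path seen in the imports later in this diff and a configured environment:

```python
from report.document_scraper import DocumentScraper

# Offline instance: scrape calls return canned content, no API key needed.
mock_scraper = DocumentScraper(use_mock=True)

# Default instance: tries Jina Reader first, then falls back to direct fetching.
scraper = DocumentScraper()
```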
@@ -201,6 +208,56 @@ class DocumentScraper:

         return converter.handle(html)

+    async def _get_mock_content(self, url: str) -> Tuple[str, Dict[str, str]]:
+        """
+        Generate mock content for testing.
+
+        Args:
+            url: The URL to generate mock content for
+
+        Returns:
+            Tuple of (content, metadata)
+        """
+        domain = urlparse(url).netloc
+        path = urlparse(url).path
+
+        # Generate a title based on the URL
+        title = f"Mock Content for {domain}{path}"
+
+        # Generate mock content
+        content = f"""# {title}
+
+## Introduction
+
+This is mock content generated for testing purposes. The original URL is {url}.
+
+## Section 1
+
+Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nullam euismod, nisl eget
+aliquam ultricies, nunc nisl aliquet nunc, quis aliquam nisl nunc eu nisl.
+
+## Section 2
+
+Pellentesque habitant morbi tristique senectus et netus et malesuada fames ac turpis egestas.
+Vestibulum tortor quam, feugiat vitae, ultricies eget, tempor sit amet, ante.
+
+## Conclusion
+
+This mock content was generated on {datetime.now().isoformat()}.
+"""
+
+        # Generate mock metadata
+        metadata = {
+            "source_url": url,
+            "title": title,
+            "description": "This is mock content generated for testing purposes.",
+            "author": "Mock Generator",
+            "scrape_date": datetime.now().isoformat(),
+            "publication_date": datetime.now().isoformat()
+        }
+
+        return content, metadata
+
     async def _scrape_with_jina_reader(self, url: str) -> Tuple[Optional[str], Optional[Dict[str, str]]]:
         """
         Scrape a web page using Jina Reader API.
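
The mock title is derived purely from the URL parts; `urlparse` here is the standard `urllib.parse.urlparse`, so the split is deterministic:

```python
from urllib.parse import urlparse

url = "https://docs.python.org/3/"
print(urlparse(url).netloc)  # docs.python.org
print(urlparse(url).path)    # /3/
# so the mock title becomes: "Mock Content for docs.python.org/3/"
```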
@@ -211,8 +268,14 @@ class DocumentScraper:
         Returns:
             Tuple of (content, metadata)
         """
-        if not self.api_key:
-            logger.warning("Jina API key not available. Using fallback method.")
+        # If using mock data, return mock content
+        if self.use_mock:
+            logger.info(f"Using mock data for URL: {url}")
+            return await self._get_mock_content(url)
+
+        # If Jina API is not available, skip this step
+        if not self.jina_api_available:
+            logger.info("Jina API key not available. Using fallback method.")
             return None, None

         headers = {
@@ -228,9 +291,16 @@ class DocumentScraper:

         try:
             async with aiohttp.ClientSession() as session:
-                async with session.post(self.endpoint, headers=headers, json=data) as response:
+                async with session.post(self.endpoint, headers=headers, json=data, timeout=30) as response:
                     if response.status != 200:
-                        logger.warning(f"Jina Reader API error: {response.status} - {await response.text()}")
+                        error_text = await response.text()
+                        logger.warning(f"Jina Reader API error: {response.status} - {error_text}")
+
+                        # If we get a 404 or 429 (rate limit), mark the API as unavailable for this session
+                        if response.status in [404, 429]:
+                            logger.warning("Jina Reader API appears to be unavailable. Using fallback method for all subsequent requests.")
+                            self.jina_api_available = False
+
                         return None, None

                     result = await response.json()
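
The bare `timeout=30` relies on aiohttp wrapping a plain number as the total timeout in seconds; the `except asyncio.TimeoutError` handlers added below match what aiohttp raises on expiry. A self-contained sketch of the more explicit `ClientTimeout` form (the URL is only an example):

```python
import asyncio
import aiohttp

async def fetch_status(url: str) -> int:
    # Explicit equivalent of timeout=30: a total budget, plus a connect limit.
    timeout = aiohttp.ClientTimeout(total=30, connect=10)
    async with aiohttp.ClientSession(timeout=timeout) as session:
        async with session.get(url) as response:
            return response.status

if __name__ == "__main__":
    print(asyncio.run(fetch_status("https://www.python.org/")))
```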
@@ -247,6 +317,9 @@ class DocumentScraper:

             return content, metadata

+        except asyncio.TimeoutError:
+            logger.warning(f"Timeout calling Jina Reader API for URL: {url}")
+            return None, None
         except Exception as e:
             logger.error(f"Error calling Jina Reader API: {str(e)}")
             return None, None
@@ -261,9 +334,14 @@ class DocumentScraper:
         Returns:
             Tuple of (content, metadata)
         """
+        # If using mock data, return mock content
+        if self.use_mock:
+            logger.info(f"Using mock data for URL: {url}")
+            return await self._get_mock_content(url)
+
         try:
             async with aiohttp.ClientSession() as session:
-                async with session.get(url, headers={"User-Agent": "Mozilla/5.0"}) as response:
+                async with session.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=30) as response:
                     if response.status != 200:
                         logger.warning(f"Failed to fetch URL: {url} - Status: {response.status}")
                         return None, None
@@ -278,6 +356,9 @@ class DocumentScraper:

             return content, metadata

+        except asyncio.TimeoutError:
+            logger.warning(f"Timeout fetching URL: {url}")
+            return None, None
         except Exception as e:
             logger.error(f"Error in fallback scraping: {str(e)}")
             return None, None
@@ -306,10 +387,12 @@ class DocumentScraper:
             logger.info(f"Document already exists in database: {normalized_url}")
             return await self.db_manager.get_document_by_url(normalized_url)

-        # Try Jina Reader first
-        content, metadata = await self._scrape_with_jina_reader(normalized_url)
+        # Try Jina Reader first if it's available
+        content, metadata = None, None
+        if self.jina_api_available:
+            content, metadata = await self._scrape_with_jina_reader(normalized_url)

-        # Fallback to custom scraping if Jina Reader fails
+        # Fallback to custom scraping if Jina Reader fails or is unavailable
         if content is None:
             logger.info(f"Falling back to custom scraping for URL: {normalized_url}")
             content, metadata = await self._scrape_with_fallback(normalized_url)
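
Taken together, `scrape_url` now behaves like the following simplified restatement (caching and database persistence from the real method omitted):

```python
async def scrape(scraper, url: str):
    """Simplified restatement of the new scrape_url control flow."""
    content, metadata = None, None

    # Jina Reader is attempted only while it is believed to be up;
    # a 404/429 earlier in the session flips jina_api_available to False.
    if scraper.jina_api_available:
        content, metadata = await scraper._scrape_with_jina_reader(url)

    # A single fallback branch covers both a failed call and a disabled API.
    if content is None:
        content, metadata = await scraper._scrape_with_fallback(url)

    return content, metadata
```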
@@ -367,34 +450,61 @@
 # Create a singleton instance for global use
 document_scraper = DocumentScraper()


-def get_document_scraper() -> DocumentScraper:
+def get_document_scraper(use_mock: bool = False) -> DocumentScraper:
     """
     Get the global document scraper instance.

+    Args:
+        use_mock: If True, create a new instance with mock data
+
     Returns:
         DocumentScraper instance
     """
+    global document_scraper
+
+    # If mock is requested, create a new instance with mock enabled
+    if use_mock:
+        return DocumentScraper(use_mock=True)
+
     return document_scraper


 # Example usage
-async def test_scraper():
-    """Test the document scraper with a sample URL."""
+async def test_scraper(use_mock: bool = False):
+    """
+    Test the document scraper with a sample URL.
+
+    Args:
+        use_mock: If True, use mock data instead of making actual API calls
+    """
     from report.database.db_manager import initialize_database

     # Initialize database
     await initialize_database()

     # Scrape a URL
-    scraper = get_document_scraper()
-    document = await scraper.scrape_url("https://en.wikipedia.org/wiki/Web_scraping")
+    scraper = get_document_scraper(use_mock=use_mock)

-    if document:
-        print(f"Successfully scraped document: {document['title']}")
-        print(f"Token count: {document['token_count']}")
-        print(f"Content preview: {document['content'][:500]}...")
-    else:
-        print("Failed to scrape document")
+    # Test URLs
+    test_urls = [
+        "https://en.wikipedia.org/wiki/Web_scraping",
+        "https://docs.python.org/3/",
+        "https://www.python.org/"
+    ]
+
+    print(f"Testing scraper with {'mock data' if use_mock else 'real data'}")
+
+    for url in test_urls:
+        print(f"\nScraping URL: {url}")
+        document = await scraper.scrape_url(url)
+
+        if document:
+            print(f"Successfully scraped document: {document['title']}")
+            print(f"Token count: {document['token_count']}")
+            print(f"Content preview: {document['content'][:200]}...")
+        else:
+            print(f"Failed to scrape document: {url}")


 # Run test if this module is executed directly
 if __name__ == "__main__":
-    asyncio.run(test_scraper())
+    # Test with real data by default
+    asyncio.run(test_scraper(use_mock=False))
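
One consequence worth noting: `use_mock=True` deliberately bypasses the module-level singleton, so mock runs never mutate shared state. A small sketch, assuming the package is importable and configured:

```python
from report.document_scraper import get_document_scraper

a = get_document_scraper()
b = get_document_scraper()
assert a is b                    # real scraping shares the singleton

m = get_document_scraper(use_mock=True)
assert m is not a                # mock runs get a fresh, isolated instance
```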
@@ -186,33 +186,42 @@ def get_report_generator() -> ReportGenerator:
     """
     return report_generator


-async def test_report_generator():
-    """Test the report generator with sample search results."""
+async def test_report_generator(use_mock: bool = False):
+    """
+    Test the report generator with sample search results.
+
+    Args:
+        use_mock: If True, use mock data instead of making actual API calls
+    """
     # Initialize the report generator
     await initialize_report_generator()

-    # Sample search results
+    # Get document scraper with mock option
+    document_scraper = get_document_scraper(use_mock=use_mock)
+
+    # Sample search results with real, accessible URLs
     search_results = [
-        {
-            'title': 'Example Document 1',
-            'url': 'https://example.com/doc1',
-            'snippet': 'This is an example document.',
-            'score': 0.95
-        },
-        {
-            'title': 'Example Document 2',
-            'url': 'https://example.com/doc2',
-            'snippet': 'This is another example document.',
-            'score': 0.85
-        },
         {
             'title': 'Python Documentation',
             'url': 'https://docs.python.org/3/',
             'snippet': 'Official Python documentation.',
+            'score': 0.95
+        },
+        {
+            'title': 'Python.org',
+            'url': 'https://www.python.org/',
+            'snippet': 'The official home of the Python Programming Language.',
+            'score': 0.85
+        },
+        {
+            'title': 'Wikipedia - Python',
+            'url': 'https://en.wikipedia.org/wiki/Python_(programming_language)',
+            'snippet': 'Python is a high-level, general-purpose programming language.',
             'score': 0.75
         }
     ]

-    # Process search results
-    documents, relevance_scores = await report_generator.process_search_results(search_results)
+    try:
+        # Process search results
+        documents, relevance_scores = await report_generator.process_search_results(search_results)
@@ -221,7 +230,8 @@ async def test_report_generator():
         for doc in documents:
             print(f"Document: {doc.get('title')} ({doc.get('url')})")
             print(f"Token count: {doc.get('token_count')}")
-            print(f"Content snippet: {doc.get('content')[:100]}...")
+            content_preview = doc.get('content', '')[:100] + '...' if doc.get('content') else 'No content'
+            print(f"Content snippet: {content_preview}")
             print()

         # Generate report
@@ -230,7 +240,18 @@ async def test_report_generator():
         # Print report
         print("Generated Report:")
         print(report)
+    except Exception as e:
+        logger.error(f"Error during report generation test: {str(e)}")
+        import traceback
+        traceback.print_exc()


 # Run test if this module is executed directly
 if __name__ == "__main__":
-    asyncio.run(test_report_generator())
+    import argparse
+
+    parser = argparse.ArgumentParser(description='Test the report generator')
+    parser.add_argument('--mock', action='store_true', help='Use mock data instead of making actual API calls')
+    args = parser.parse_args()
+
+    print(f"Running test with {'mock data' if args.mock else 'real data'}")
+    asyncio.run(test_report_generator(use_mock=args.mock))
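
The `--mock` flag maps straight onto the `use_mock` parameter; `store_true` means its mere presence on the command line flips the value. A quick, self-contained check of that mapping:

```python
import argparse

parser = argparse.ArgumentParser(description='Test the report generator')
parser.add_argument('--mock', action='store_true', help='Use mock data instead of making actual API calls')

# parse_args accepts an explicit argv list, handy for testing the parser itself.
print(parser.parse_args(['--mock']).mock)  # True
print(parser.parse_args([]).mock)          # False
```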
@@ -19,138 +19,96 @@ from report.document_processor import get_document_processor
 from report.database.db_manager import get_db_manager, initialize_database
 from report.document_scraper import get_document_scraper


-async def test_document_processor():
+async def test_document_processor(use_mock: bool = False):
     """Test the document processor with sample documents."""
-    # Initialize the database
+    # Initialize database
     await initialize_database()

-    # Get the document processor and scraper
+    # Create document processor
     document_processor = get_document_processor()
-    document_scraper = get_document_scraper()
-    db_manager = get_db_manager()

-    # Sample URLs to test with
-    test_urls = [
-        "https://en.wikipedia.org/wiki/Python_(programming_language)",
-        "https://en.wikipedia.org/wiki/Natural_language_processing",
-        "https://docs.python.org/3/tutorial/index.html",
-        "https://en.wikipedia.org/wiki/Machine_learning"
-    ]
+    # Create document scraper with mock option
+    document_scraper = get_document_scraper(use_mock=use_mock)
+
+    # Sample search results with real, accessible URLs
+    search_results = [
+        {
+            'title': 'Python Documentation',
+            'url': 'https://docs.python.org/3/',
+            'snippet': 'Official Python documentation.',
+            'score': 0.95
+        },
+        {
+            'title': 'Python.org',
+            'url': 'https://www.python.org/',
+            'snippet': 'The official home of the Python Programming Language.',
+            'score': 0.85
+        },
+        {
+            'title': 'Wikipedia - Python',
+            'url': 'https://en.wikipedia.org/wiki/Python_(programming_language)',
+            'snippet': 'Python is a high-level, general-purpose programming language.',
+            'score': 0.75
+        }
+    ]

-    # Scrape the URLs
-    print(f"Scraping {len(test_urls)} URLs...")
-    documents = await document_scraper.scrape_urls(test_urls)
+    # Process search results
+    documents = []
+    relevance_scores = {}
+
+    for result in search_results:
+        # Scrape document
+        document = await document_scraper.scrape_url(result['url'])
+        if document:
+            documents.append(document)
+            relevance_scores[document['url']] = result['score']

     print(f"Scraped {len(documents)} documents")

-    # Sample relevance scores
-    relevance_scores = {
-        "https://en.wikipedia.org/wiki/Python_(programming_language)": 0.95,
-        "https://en.wikipedia.org/wiki/Natural_language_processing": 0.85,
-        "https://docs.python.org/3/tutorial/index.html": 0.75,
-        "https://en.wikipedia.org/wiki/Machine_learning": 0.65
-    }
-
     # Test document prioritization
-    print("\nTesting document prioritization...")
     prioritized_docs = document_processor.prioritize_documents(documents, relevance_scores)
-    print("Prioritized documents:")
+    print("\nPrioritized documents:")
     for i, doc in enumerate(prioritized_docs):
-        print(f"{i+1}. {doc.get('title')} - Score: {doc.get('priority_score', 0.0):.2f}")
+        print(f"{i+1}. {doc['title']} (Score: {doc.get('priority_score', 'N/A')})")

     # Test document chunking
-    print("\nTesting document chunking...")
-
-    # Test section-based chunking
-    print("\nSection-based chunking:")
     if documents:
-        section_chunks = document_processor.chunk_document_by_sections(documents[0], 1000, 100)
-        print(f"Created {len(section_chunks)} section-based chunks")
-
-        for i, chunk in enumerate(section_chunks[:3]):  # Show first 3 chunks
-            print(f"Chunk {i+1}:")
-            print(f"  Type: {chunk.get('chunk_type')}")
-            print(f"  Section: {chunk.get('section_title', 'N/A')}")
-            print(f"  Tokens: {chunk.get('token_count')}")
-            content = chunk.get('content', '')
-            print(f"  Content preview: {content[:100]}...")
-
-    # Test fixed-size chunking
-    print("\nFixed-size chunking:")
-    if documents:
-        fixed_chunks = document_processor.chunk_document_fixed_size(documents[0], 1000, 100)
-        print(f"Created {len(fixed_chunks)} fixed-size chunks")
-
-        for i, chunk in enumerate(fixed_chunks[:3]):  # Show first 3 chunks
-            print(f"Chunk {i+1}:")
-            print(f"  Type: {chunk.get('chunk_type')}")
-            print(f"  Index: {chunk.get('chunk_index')}/{chunk.get('total_chunks')}")
-            print(f"  Tokens: {chunk.get('token_count')}")
-            content = chunk.get('content', '')
-            print(f"  Content preview: {content[:100]}...")
-
-    # Test hierarchical chunking
-    print("\nHierarchical chunking:")
-    if documents:
-        hierarchical_chunks = document_processor.chunk_document_hierarchical(documents[0], 1000, 100)
-        print(f"Created {len(hierarchical_chunks)} hierarchical chunks")
-
-        for i, chunk in enumerate(hierarchical_chunks[:3]):  # Show first 3 chunks
-            print(f"Chunk {i+1}:")
-            print(f"  Type: {chunk.get('chunk_type')}")
-            if chunk.get('chunk_type') == 'summary':
-                print(f"  Summary chunk")
-            else:
-                print(f"  Section: {chunk.get('section_title', 'N/A')}")
-            print(f"  Tokens: {chunk.get('token_count')}")
-            content = chunk.get('content', '')
-            print(f"  Content preview: {content[:100]}...")
-
-    # Test chunk selection
-    print("\nTesting chunk selection...")
-
-    # Create a mix of chunks from all documents
+        print("\nChunking document:", documents[0]['title'])
+        chunks = document_processor.chunk_document_by_sections(documents[0])
+        print(f"Created {len(chunks)} chunks")
+        for i, chunk in enumerate(chunks[:3]):  # Show first 3 chunks
+            print(f"Chunk {i+1}: {chunk['title']} ({chunk['token_count']} tokens)")
+            content_preview = chunk['content'][:100] + '...' if len(chunk['content']) > 100 else chunk['content']
+            print(f"Content: {content_preview}")

+    # Test token budget management
+    token_budget = 4000
+    print(f"\nSelecting chunks with token budget: {token_budget}")
+
+    # Create chunks for each document
     all_chunks = []
-    for doc in documents:
-        chunks = document_processor.chunk_document_by_sections(doc, 1000, 100)
-        all_chunks.extend(chunks)
+    for doc in prioritized_docs:
+        doc_chunks = document_processor.chunk_document_by_sections(doc)
+        all_chunks.extend(doc_chunks)

-    print(f"Total chunks: {len(all_chunks)}")
-
     # Select chunks based on token budget
-    token_budget = 10000
     selected_chunks = document_processor.select_chunks_for_context(all_chunks, token_budget)
+    print(f"Selected {len(selected_chunks)} chunks with total tokens: {sum(c['token_count'] for c in selected_chunks)}")

-    total_tokens = sum(chunk.get('token_count', 0) for chunk in selected_chunks)
-    print(f"Selected {len(selected_chunks)} chunks with {total_tokens} tokens (budget: {token_budget})")
+    # Test end-to-end processing
+    print("\nTesting end-to-end processing")
+    processed_chunks = document_processor.process_documents_for_report(documents, relevance_scores)
+    print(f"Processed {len(processed_chunks)} chunks for report")

-    # Test full document processing
-    print("\nTesting full document processing...")
-    processed_chunks = document_processor.process_documents_for_report(
-        documents,
-        relevance_scores,
-        token_budget=20000,
-        chunk_size=1000,
-        overlap_size=100
-    )
-
-    total_processed_tokens = sum(chunk.get('token_count', 0) for chunk in processed_chunks)
-    print(f"Processed {len(processed_chunks)} chunks with {total_processed_tokens} tokens")
-
-    # Show the top 5 chunks
-    print("\nTop 5 chunks:")
-    for i, chunk in enumerate(processed_chunks[:5]):
-        print(f"Chunk {i+1}:")
-        print(f"  Document: {chunk.get('title')}")
-        print(f"  Type: {chunk.get('chunk_type')}")
-        print(f"  Priority: {chunk.get('priority_score', 0.0):.2f}")
-        print(f"  Tokens: {chunk.get('token_count')}")
-        content = chunk.get('content', '')
-        print(f"  Content preview: {content[:100]}...")
-
-
-async def main():
-    """Main function to run the tests."""
-    await test_document_processor()
+    return processed_chunks


+# Run test if this module is executed directly
 if __name__ == "__main__":
-    asyncio.run(main())
+    import argparse
+
+    parser = argparse.ArgumentParser(description='Test the document processor')
+    parser.add_argument('--mock', action='store_true', help='Use mock data instead of making actual API calls')
+    args = parser.parse_args()
+
+    print(f"Running test with {'mock data' if args.mock else 'real data'}")
+    asyncio.run(test_document_processor(use_mock=args.mock))
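
`select_chunks_for_context` itself is not shown in this diff; under the assumption that it greedily packs priority-ordered chunks until the budget is spent, the token-budget test above exercises logic along these lines:

```python
# Hypothetical sketch of greedy, budget-bounded chunk selection.
def select_chunks(chunks: list[dict], token_budget: int) -> list[dict]:
    selected, used = [], 0
    for chunk in chunks:  # assumed pre-sorted, highest priority first
        cost = chunk['token_count']
        if used + cost <= token_budget:
            selected.append(chunk)
            used += cost
    return selected
```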