From 34be5ce36f6e545def39674e3a817162b8832678 Mon Sep 17 00:00:00 2001
From: Steve White
Date: Thu, 27 Feb 2025 17:51:05 -0600
Subject: [PATCH] Improve document scraper with better error handling and add
 mock option for testing

---
 report/document_scraper.py       | 154 ++++++++++++++++++++++----
 report/report_generator.py       |  87 +++++++++------
 tests/test_document_processor.py | 178 ++++++++++++------------------
 3 files changed, 254 insertions(+), 165 deletions(-)

diff --git a/report/document_scraper.py b/report/document_scraper.py
index d2000f2..1c64d7c 100644
--- a/report/document_scraper.py
+++ b/report/document_scraper.py
@@ -35,13 +35,20 @@ class DocumentScraper:
     using Jina Reader API or fallback methods.
     """
 
-    def __init__(self):
-        """Initialize the document scraper."""
+    def __init__(self, use_mock: bool = False):
+        """
+        Initialize the document scraper.
+
+        Args:
+            use_mock: If True, use mock data instead of making actual API calls
+        """
         self.config = get_config()
         self.api_key = self._get_api_key()
         self.endpoint = "https://api.jina.ai/v1/reader"
         self.db_manager = get_db_manager()
         self.tokenizer = tiktoken.get_encoding("cl100k_base")  # Using OpenAI's tokenizer
+        self.use_mock = use_mock
+        self.jina_api_available = self.api_key != ""
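+        # use_mock short-circuits both scrape paths to _get_mock_content;
+        # jina_api_available is cleared at runtime on 404/429 responses so
+        # that later requests skip straight to the fallback scraper.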
 
     def _get_api_key(self) -> str:
         """
@@ -201,6 +208,56 @@ class DocumentScraper:
 
         return converter.handle(html)
 
+    async def _get_mock_content(self, url: str) -> Tuple[str, Dict[str, str]]:
+        """
+        Generate mock content for testing.
+
+        Args:
+            url: The URL to generate mock content for
+
+        Returns:
+            Tuple of (content, metadata)
+        """
+        domain = urlparse(url).netloc
+        path = urlparse(url).path
+
+        # Generate a title based on the URL
+        title = f"Mock Content for {domain}{path}"
+
+        # Generate mock content
+        content = f"""# {title}
+
+## Introduction
+
+This is mock content generated for testing purposes. The original URL is {url}.
+
+## Section 1
+
+Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nullam euismod, nisl eget
+aliquam ultricies, nunc nisl aliquet nunc, quis aliquam nisl nunc eu nisl.
+
+## Section 2
+
+Pellentesque habitant morbi tristique senectus et netus et malesuada fames ac turpis egestas.
+Vestibulum tortor quam, feugiat vitae, ultricies eget, tempor sit amet, ante.
+
+## Conclusion
+
+This mock content was generated on {datetime.now().isoformat()}.
+"""
+
+        # Generate mock metadata
+        metadata = {
+            "source_url": url,
+            "title": title,
+            "description": "This is mock content generated for testing purposes.",
+            "author": "Mock Generator",
+            "scrape_date": datetime.now().isoformat(),
+            "publication_date": datetime.now().isoformat()
+        }
+
+        return content, metadata
+
     async def _scrape_with_jina_reader(self, url: str) -> Tuple[Optional[str], Optional[Dict[str, str]]]:
         """
         Scrape a web page using Jina Reader API.
 
         Args:
             url: The URL to scrape
 
         Returns:
             Tuple of (content, metadata)
         """
-        if not self.api_key:
-            logger.warning("Jina API key not available. Using fallback method.")
+        # If using mock data, return mock content
+        if self.use_mock:
+            logger.info(f"Using mock data for URL: {url}")
+            return await self._get_mock_content(url)
+
+        # If the Jina API is not available, skip this step
+        if not self.jina_api_available:
+            logger.info("Jina API key not available. Using fallback method.")
             return None, None
 
         headers = {
@@ -228,9 +291,16 @@ class DocumentScraper:
 
         try:
             async with aiohttp.ClientSession() as session:
-                async with session.post(self.endpoint, headers=headers, json=data) as response:
+                async with session.post(self.endpoint, headers=headers, json=data, timeout=30) as response:
                     if response.status != 200:
-                        logger.warning(f"Jina Reader API error: {response.status} - {await response.text()}")
+                        error_text = await response.text()
+                        logger.warning(f"Jina Reader API error: {response.status} - {error_text}")
+
+                        # If we get a 404 or 429 (rate limit), mark the API as unavailable for this session
+                        if response.status in [404, 429]:
+                            logger.warning("Jina Reader API appears to be unavailable. Using fallback method for all subsequent requests.")
+                            self.jina_api_available = False
+
                         return None, None
 
                     result = await response.json()
@@ -247,6 +317,9 @@ class DocumentScraper:
 
                     return content, metadata
 
+        except asyncio.TimeoutError:
+            logger.warning(f"Timeout calling Jina Reader API for URL: {url}")
+            return None, None
         except Exception as e:
             logger.error(f"Error calling Jina Reader API: {str(e)}")
             return None, None
@@ -261,9 +334,14 @@ class DocumentScraper:
         Returns:
             Tuple of (content, metadata)
         """
+        # If using mock data, return mock content
+        if self.use_mock:
+            logger.info(f"Using mock data for URL: {url}")
+            return await self._get_mock_content(url)
+
         try:
             async with aiohttp.ClientSession() as session:
-                async with session.get(url, headers={"User-Agent": "Mozilla/5.0"}) as response:
+                async with session.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=30) as response:
                     if response.status != 200:
                         logger.warning(f"Failed to fetch URL: {url} - Status: {response.status}")
                         return None, None
@@ -278,6 +356,9 @@ class DocumentScraper:
 
                     return content, metadata
 
+        except asyncio.TimeoutError:
+            logger.warning(f"Timeout fetching URL: {url}")
+            return None, None
         except Exception as e:
             logger.error(f"Error in fallback scraping: {str(e)}")
             return None, None
@@ -306,10 +387,12 @@ class DocumentScraper:
             logger.info(f"Document already exists in database: {normalized_url}")
             return await self.db_manager.get_document_by_url(normalized_url)
 
-        # Try Jina Reader first
-        content, metadata = await self._scrape_with_jina_reader(normalized_url)
+        # Try Jina Reader first if it's available
+        content, metadata = None, None
+        if self.jina_api_available:
+            content, metadata = await self._scrape_with_jina_reader(normalized_url)
 
-        # Fallback to custom scraping if Jina Reader fails
+        # Fall back to custom scraping if Jina Reader fails or is unavailable
         if content is None:
             logger.info(f"Falling back to custom scraping for URL: {normalized_url}")
             content, metadata = await self._scrape_with_fallback(normalized_url)
@@ -367,34 +450,61 @@ class DocumentScraper:
 
 # Create a singleton instance for global use
 document_scraper = DocumentScraper()
 
-def get_document_scraper() -> DocumentScraper:
+def get_document_scraper(use_mock: bool = False) -> DocumentScraper:
     """
     Get the global document scraper instance.
 
+    Args:
+        use_mock: If True, return a fresh instance that uses mock data
+
     Returns:
         DocumentScraper instance
     """
+    # If mock is requested, create a new instance with mock enabled rather
+    # than mutating the shared singleton
+    if use_mock:
+        return DocumentScraper(use_mock=True)
+
     return document_scraper
 
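+# Illustrative sketch of the mock option (hypothetical URL; must be awaited
+# from an async caller):
+#
+#     scraper = get_document_scraper(use_mock=True)   # no network access
+#     document = await scraper.scrape_url("https://example.com/any-page")
+#     print(document["title"], document["token_count"])
+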
 # Example usage
-async def test_scraper():
-    """Test the document scraper with a sample URL."""
+async def test_scraper(use_mock: bool = False):
+    """
+    Test the document scraper with a sample URL.
+
+    Args:
+        use_mock: If True, use mock data instead of making actual API calls
+    """
     from report.database.db_manager import initialize_database
 
     # Initialize database
     await initialize_database()
 
     # Scrape a URL
-    scraper = get_document_scraper()
-    document = await scraper.scrape_url("https://en.wikipedia.org/wiki/Web_scraping")
+    scraper = get_document_scraper(use_mock=use_mock)
 
-    if document:
-        print(f"Successfully scraped document: {document['title']}")
-        print(f"Token count: {document['token_count']}")
-        print(f"Content preview: {document['content'][:500]}...")
-    else:
-        print("Failed to scrape document")
+    # Test URLs
+    test_urls = [
+        "https://en.wikipedia.org/wiki/Web_scraping",
+        "https://docs.python.org/3/",
+        "https://www.python.org/"
+    ]
+
+    print(f"Testing scraper with {'mock data' if use_mock else 'real data'}")
+
+    for url in test_urls:
+        print(f"\nScraping URL: {url}")
+        document = await scraper.scrape_url(url)
+
+        if document:
+            print(f"Successfully scraped document: {document['title']}")
+            print(f"Token count: {document['token_count']}")
+            print(f"Content preview: {document['content'][:200]}...")
+        else:
+            print(f"Failed to scrape document: {url}")
 
 # Run test if this module is executed directly
 if __name__ == "__main__":
-    asyncio.run(test_scraper())
+    # Test with real data by default
+    asyncio.run(test_scraper(use_mock=False))
diff --git a/report/report_generator.py b/report/report_generator.py
index e62a1a4..41d656d 100644
--- a/report/report_generator.py
+++ b/report/report_generator.py
@@ -186,51 +186,72 @@ def get_report_generator() -> ReportGenerator:
     """
     return report_generator
 
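+# Illustrative CLI sketch (paths assume this repo layout; the --mock flag is
+# added in the __main__ block below):
+#
+#     python report/report_generator.py          # real scraping
+#     python report/report_generator.py --mock   # mock data, no API calls
+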
-async def test_report_generator():
-    """Test the report generator with sample search results."""
+async def test_report_generator(use_mock: bool = False):
+    """
+    Test the report generator with sample search results.
+
+    Args:
+        use_mock: If True, use mock data instead of making actual API calls
+    """
     # Initialize the report generator
     await initialize_report_generator()
 
-    # Sample search results
+    # Get a document scraper with the mock option
+    document_scraper = get_document_scraper(use_mock=use_mock)
+
+    # Sample search results with real, accessible URLs
    search_results = [
-        {
-            'title': 'Example Document 1',
-            'url': 'https://example.com/doc1',
-            'snippet': 'This is an example document.',
-            'score': 0.95
-        },
-        {
-            'title': 'Example Document 2',
-            'url': 'https://example.com/doc2',
-            'snippet': 'This is another example document.',
-            'score': 0.85
-        },
         {
             'title': 'Python Documentation',
             'url': 'https://docs.python.org/3/',
             'snippet': 'Official Python documentation.',
+            'score': 0.95
+        },
+        {
+            'title': 'Python.org',
+            'url': 'https://www.python.org/',
+            'snippet': 'The official home of the Python Programming Language.',
+            'score': 0.85
+        },
+        {
+            'title': 'Wikipedia - Python',
+            'url': 'https://en.wikipedia.org/wiki/Python_(programming_language)',
+            'snippet': 'Python is a high-level, general-purpose programming language.',
             'score': 0.75
         }
     ]
 
-    # Process search results
-    documents, relevance_scores = await report_generator.process_search_results(search_results)
-
-    # Print documents
-    print(f"Processed {len(documents)} documents")
-    for doc in documents:
-        print(f"Document: {doc.get('title')} ({doc.get('url')})")
-        print(f"Token count: {doc.get('token_count')}")
-        print(f"Content snippet: {doc.get('content')[:100]}...")
-        print()
-
-    # Generate report
-    report = await report_generator.generate_report(search_results, "Python programming")
-
-    # Print report
-    print("Generated Report:")
-    print(report)
+    try:
+        # Process search results
+        documents, relevance_scores = await report_generator.process_search_results(search_results)
+
+        # Print documents
+        print(f"Processed {len(documents)} documents")
+        for doc in documents:
+            print(f"Document: {doc.get('title')} ({doc.get('url')})")
+            print(f"Token count: {doc.get('token_count')}")
+            content_preview = doc.get('content', '')[:100] + '...' if doc.get('content') else 'No content'
+            print(f"Content snippet: {content_preview}")
+            print()
+
+        # Generate report
+        report = await report_generator.generate_report(search_results, "Python programming")
+
+        # Print report
+        print("Generated Report:")
+        print(report)
+    except Exception as e:
+        logger.error(f"Error during report generation test: {str(e)}")
+        import traceback
+        traceback.print_exc()
 
 # Run test if this module is executed directly
 if __name__ == "__main__":
-    asyncio.run(test_report_generator())
+    import argparse
+
+    parser = argparse.ArgumentParser(description='Test the report generator')
+    parser.add_argument('--mock', action='store_true', help='Use mock data instead of making actual API calls')
+    args = parser.parse_args()
+
+    print(f"Running test with {'mock data' if args.mock else 'real data'}")
+    asyncio.run(test_report_generator(use_mock=args.mock))
diff --git a/tests/test_document_processor.py b/tests/test_document_processor.py
index dd6f81d..031a5dc 100644
--- a/tests/test_document_processor.py
+++ b/tests/test_document_processor.py
@@ -19,138 +19,96 @@ from report.document_processor import get_document_processor
 from report.database.db_manager import get_db_manager, initialize_database
 from report.document_scraper import get_document_scraper
 
-async def test_document_processor():
-    """Test the document processor with sample documents."""
-    # Initialize the database
+async def test_document_processor(use_mock: bool = False):
+    """
+    Test the document processor with sample documents.
+
+    Args:
+        use_mock: If True, use mock data instead of making actual API calls
+    """
+    # Initialize database
     await initialize_database()
 
-    # Get the document processor and scraper
+    # Create document processor
     document_processor = get_document_processor()
-    document_scraper = get_document_scraper()
-    db_manager = get_db_manager()
 
-    # Sample URLs to test with
-    test_urls = [
-        "https://en.wikipedia.org/wiki/Python_(programming_language)",
-        "https://en.wikipedia.org/wiki/Natural_language_processing",
-        "https://docs.python.org/3/tutorial/index.html",
-        "https://en.wikipedia.org/wiki/Machine_learning"
+    # Create document scraper with mock option
+    document_scraper = get_document_scraper(use_mock=use_mock)
+
+    # Sample search results with real, accessible URLs
+    search_results = [
+        {
+            'title': 'Python Documentation',
+            'url': 'https://docs.python.org/3/',
+            'snippet': 'Official Python documentation.',
+            'score': 0.95
+        },
+        {
+            'title': 'Python.org',
+            'url': 'https://www.python.org/',
+            'snippet': 'The official home of the Python Programming Language.',
+            'score': 0.85
+        },
+        {
+            'title': 'Wikipedia - Python',
+            'url': 'https://en.wikipedia.org/wiki/Python_(programming_language)',
+            'snippet': 'Python is a high-level, general-purpose programming language.',
+            'score': 0.75
+        }
     ]
 
-    # Scrape the URLs
-    print(f"Scraping {len(test_urls)} URLs...")
-    documents = await document_scraper.scrape_urls(test_urls)
+    # Process search results
+    documents = []
+    relevance_scores = {}
+
+    for result in search_results:
+        # Scrape document
+        document = await document_scraper.scrape_url(result['url'])
+        if document:
+            documents.append(document)
+            relevance_scores[document['url']] = result['score']
+
     print(f"Scraped {len(documents)} documents")
 
-    # Sample relevance scores
-    relevance_scores = {
-        "https://en.wikipedia.org/wiki/Python_(programming_language)": 0.95,
-        "https://en.wikipedia.org/wiki/Natural_language_processing": 0.85,
-        "https://docs.python.org/3/tutorial/index.html": 0.75,
-        "https://en.wikipedia.org/wiki/Machine_learning": 0.65
-    }
-
     # Test document prioritization
-    print("\nTesting document prioritization...")
     prioritized_docs = document_processor.prioritize_documents(documents, relevance_scores)
-
-    print("Prioritized documents:")
+    print("\nPrioritized documents:")
     for i, doc in enumerate(prioritized_docs):
-        print(f"{i+1}. {doc.get('title')} - Score: {doc.get('priority_score', 0.0):.2f}")
+        print(f"{i+1}. {doc.get('title')} (Score: {doc.get('priority_score', 'N/A')})")
 
     # Test document chunking
-    print("\nTesting document chunking...")
-
-    # Test section-based chunking
-    print("\nSection-based chunking:")
     if documents:
-        section_chunks = document_processor.chunk_document_by_sections(documents[0], 1000, 100)
-        print(f"Created {len(section_chunks)} section-based chunks")
-
-        for i, chunk in enumerate(section_chunks[:3]):  # Show first 3 chunks
-            print(f"Chunk {i+1}:")
-            print(f"  Type: {chunk.get('chunk_type')}")
-            print(f"  Section: {chunk.get('section_title', 'N/A')}")
-            print(f"  Tokens: {chunk.get('token_count')}")
-            content = chunk.get('content', '')
-            print(f"  Content preview: {content[:100]}...")
+        print("\nChunking document:", documents[0]['title'])
+        chunks = document_processor.chunk_document_by_sections(documents[0])
+        print(f"Created {len(chunks)} chunks")
+        for i, chunk in enumerate(chunks[:3]):  # Show first 3 chunks
+            print(f"Chunk {i+1}: {chunk.get('title', 'N/A')} ({chunk.get('token_count', 0)} tokens)")
+            content_preview = chunk['content'][:100] + '...' if len(chunk['content']) > 100 else chunk['content']
+            print(f"Content: {content_preview}")
 
-    # Test fixed-size chunking
-    print("\nFixed-size chunking:")
-    if documents:
-        fixed_chunks = document_processor.chunk_document_fixed_size(documents[0], 1000, 100)
-        print(f"Created {len(fixed_chunks)} fixed-size chunks")
-
-        for i, chunk in enumerate(fixed_chunks[:3]):  # Show first 3 chunks
-            print(f"Chunk {i+1}:")
-            print(f"  Type: {chunk.get('chunk_type')}")
-            print(f"  Index: {chunk.get('chunk_index')}/{chunk.get('total_chunks')}")
-            print(f"  Tokens: {chunk.get('token_count')}")
-            content = chunk.get('content', '')
-            print(f"  Content preview: {content[:100]}...")
+    # Test token budget management
+    token_budget = 4000
+    print(f"\nSelecting chunks with token budget: {token_budget}")
 
-    # Test hierarchical chunking
-    print("\nHierarchical chunking:")
-    if documents:
-        hierarchical_chunks = document_processor.chunk_document_hierarchical(documents[0], 1000, 100)
-        print(f"Created {len(hierarchical_chunks)} hierarchical chunks")
-
-        for i, chunk in enumerate(hierarchical_chunks[:3]):  # Show first 3 chunks
-            print(f"Chunk {i+1}:")
-            print(f"  Type: {chunk.get('chunk_type')}")
-            if chunk.get('chunk_type') == 'summary':
-                print(f"  Summary chunk")
-            else:
-                print(f"  Section: {chunk.get('section_title', 'N/A')}")
-            print(f"  Tokens: {chunk.get('token_count')}")
-            content = chunk.get('content', '')
-            print(f"  Content preview: {content[:100]}...")
-
-    # Test chunk selection
-    print("\nTesting chunk selection...")
-
-    # Create a mix of chunks from all documents
+    # Create chunks for each document
     all_chunks = []
-    for doc in documents:
-        chunks = document_processor.chunk_document_by_sections(doc, 1000, 100)
-        all_chunks.extend(chunks)
-
-    print(f"Total chunks: {len(all_chunks)}")
+    for doc in prioritized_docs:
+        doc_chunks = document_processor.chunk_document_by_sections(doc)
+        all_chunks.extend(doc_chunks)
 
     # Select chunks based on token budget
-    token_budget = 10000
     selected_chunks = document_processor.select_chunks_for_context(all_chunks, token_budget)
+    print(f"Selected {len(selected_chunks)} chunks with total tokens: {sum(c['token_count'] for c in selected_chunks)}")
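+    # select_chunks_for_context is expected to keep the summed token_count of
+    # the selected chunks within token_budget (4000 above); the print above
+    # makes that observable in the test output.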
 
-    total_tokens = sum(chunk.get('token_count', 0) for chunk in selected_chunks)
-    print(f"Selected {len(selected_chunks)} chunks with {total_tokens} tokens (budget: {token_budget})")
+    # Test end-to-end processing
+    print("\nTesting end-to-end processing")
+    processed_chunks = document_processor.process_documents_for_report(documents, relevance_scores)
+    print(f"Processed {len(processed_chunks)} chunks for report")
 
-    # Test full document processing
-    print("\nTesting full document processing...")
-    processed_chunks = document_processor.process_documents_for_report(
-        documents,
-        relevance_scores,
-        token_budget=20000,
-        chunk_size=1000,
-        overlap_size=100
-    )
-
-    total_processed_tokens = sum(chunk.get('token_count', 0) for chunk in processed_chunks)
-    print(f"Processed {len(processed_chunks)} chunks with {total_processed_tokens} tokens")
-
-    # Show the top 5 chunks
-    print("\nTop 5 chunks:")
-    for i, chunk in enumerate(processed_chunks[:5]):
-        print(f"Chunk {i+1}:")
-        print(f"  Document: {chunk.get('title')}")
-        print(f"  Type: {chunk.get('chunk_type')}")
-        print(f"  Priority: {chunk.get('priority_score', 0.0):.2f}")
-        print(f"  Tokens: {chunk.get('token_count')}")
-        content = chunk.get('content', '')
-        print(f"  Content preview: {content[:100]}...")
-
-async def main():
-    """Main function to run the tests."""
-    await test_document_processor()
+    return processed_chunks
 
+# Run test if this module is executed directly
 if __name__ == "__main__":
-    asyncio.run(main())
+    import argparse
+
+    parser = argparse.ArgumentParser(description='Test the document processor')
+    parser.add_argument('--mock', action='store_true', help='Use mock data instead of making actual API calls')
+    args = parser.parse_args()
+
+    print(f"Running test with {'mock data' if args.mock else 'real data'}")
+    asyncio.run(test_document_processor(use_mock=args.mock))