Improve document scraper with better error handling and add mock option for testing

Steve White 2025-02-27 17:51:05 -06:00
parent 695e4b7ecd
commit 34be5ce36f
3 changed files with 254 additions and 165 deletions

View File

@@ -35,13 +35,20 @@ class DocumentScraper:
     using Jina Reader API or fallback methods.
     """
-    def __init__(self):
-        """Initialize the document scraper."""
+    def __init__(self, use_mock: bool = False):
+        """
+        Initialize the document scraper.
+
+        Args:
+            use_mock: If True, use mock data instead of making actual API calls
+        """
         self.config = get_config()
         self.api_key = self._get_api_key()
         self.endpoint = "https://api.jina.ai/v1/reader"
         self.db_manager = get_db_manager()
         self.tokenizer = tiktoken.get_encoding("cl100k_base")  # Using OpenAI's tokenizer
+        self.use_mock = use_mock
+        self.jina_api_available = self.api_key != ""

     def _get_api_key(self) -> str:
         """
@@ -201,6 +208,56 @@ class DocumentScraper:
         return converter.handle(html)

+    async def _get_mock_content(self, url: str) -> Tuple[str, Dict[str, str]]:
+        """
+        Generate mock content for testing.
+
+        Args:
+            url: The URL to generate mock content for
+
+        Returns:
+            Tuple of (content, metadata)
+        """
+        domain = urlparse(url).netloc
+        path = urlparse(url).path
+
+        # Generate a title based on the URL
+        title = f"Mock Content for {domain}{path}"
+
+        # Generate mock content
+        content = f"""# {title}
+
+## Introduction
+
+This is mock content generated for testing purposes. The original URL is {url}.
+
+## Section 1
+
+Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nullam euismod, nisl eget
+aliquam ultricies, nunc nisl aliquet nunc, quis aliquam nisl nunc eu nisl.
+
+## Section 2
+
+Pellentesque habitant morbi tristique senectus et netus et malesuada fames ac turpis egestas.
+Vestibulum tortor quam, feugiat vitae, ultricies eget, tempor sit amet, ante.
+
+## Conclusion
+
+This mock content was generated on {datetime.now().isoformat()}.
+"""
+
+        # Generate mock metadata
+        metadata = {
+            "source_url": url,
+            "title": title,
+            "description": "This is mock content generated for testing purposes.",
+            "author": "Mock Generator",
+            "scrape_date": datetime.now().isoformat(),
+            "publication_date": datetime.now().isoformat()
+        }
+
+        return content, metadata
+
     async def _scrape_with_jina_reader(self, url: str) -> Tuple[Optional[str], Optional[Dict[str, str]]]:
         """
         Scrape a web page using Jina Reader API.
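
The tuple `_get_mock_content` produces has the same `(content, metadata)` shape the real scrapers return, and the markdown body gives the downstream chunking code real sections to work with. A rough sketch of what a caller sees (illustrative only, assuming a configured environment as above):

    import asyncio
    from report.document_scraper import DocumentScraper

    async def show_mock_shape() -> None:
        scraper = DocumentScraper(use_mock=True)
        content, metadata = await scraper._get_mock_content("https://example.com/docs/page")
        # content: markdown with an H1 title and H2 sections derived from the URL
        assert content.startswith("# Mock Content for example.com/docs/page")
        # metadata: flat dict of strings, dates in ISO-8601
        assert metadata["author"] == "Mock Generator"
        print(metadata["title"])

    asyncio.run(show_mock_shape())
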
@@ -211,8 +268,14 @@ class DocumentScraper:
         Returns:
             Tuple of (content, metadata)
         """
-        if not self.api_key:
-            logger.warning("Jina API key not available. Using fallback method.")
+        # If using mock data, return mock content
+        if self.use_mock:
+            logger.info(f"Using mock data for URL: {url}")
+            return await self._get_mock_content(url)
+
+        # If Jina API is not available, skip this step
+        if not self.jina_api_available:
+            logger.info("Jina API key not available. Using fallback method.")
             return None, None

         headers = {
@@ -228,9 +291,16 @@ class DocumentScraper:
         try:
             async with aiohttp.ClientSession() as session:
-                async with session.post(self.endpoint, headers=headers, json=data) as response:
+                async with session.post(self.endpoint, headers=headers, json=data, timeout=30) as response:
                     if response.status != 200:
-                        logger.warning(f"Jina Reader API error: {response.status} - {await response.text()}")
+                        error_text = await response.text()
+                        logger.warning(f"Jina Reader API error: {response.status} - {error_text}")
+
+                        # If we get a 404 or 429 (rate limit), mark the API as unavailable for this session
+                        if response.status in [404, 429]:
+                            logger.warning("Jina Reader API appears to be unavailable. Using fallback method for all subsequent requests.")
+                            self.jina_api_available = False
+
                         return None, None

                     result = await response.json()
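
The 404/429 handling above is effectively a session-scoped circuit breaker: one boolean, flipped at most once, checked before every later call, while transient errors leave the flag alone. The same pattern in isolation (names here are illustrative, not from the repository):

    import aiohttp

    class FlakyApiClient:
        """Skip an API for the rest of the session once it proves unavailable."""

        def __init__(self, endpoint: str):
            self.endpoint = endpoint
            self.available = True  # assume healthy until proven otherwise

        async def fetch(self, session: aiohttp.ClientSession, payload: dict):
            if not self.available:
                return None  # short-circuit: no network round-trip at all
            async with session.post(self.endpoint, json=payload, timeout=30) as resp:
                if resp.status in (404, 429):
                    self.available = False  # gone or rate-limited: stop trying
                    return None
                if resp.status != 200:
                    return None  # transient failure: leave the flag alone
                return await resp.json()
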
@@ -247,6 +317,9 @@ class DocumentScraper:

                    return content, metadata
+        except asyncio.TimeoutError:
+            logger.warning(f"Timeout calling Jina Reader API for URL: {url}")
+            return None, None
         except Exception as e:
             logger.error(f"Error calling Jina Reader API: {str(e)}")
             return None, None
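
One note on the `timeout=30` arguments added above: aiohttp treats a bare number as a total-seconds budget, while the documented form in current aiohttp releases is an explicit `ClientTimeout` object. An equivalent sketch with the explicit form, including the `asyncio.TimeoutError` this commit now catches:

    import asyncio
    from typing import Optional

    import aiohttp

    async def fetch_with_timeout(url: str) -> Optional[str]:
        timeout = aiohttp.ClientTimeout(total=30)  # same budget as timeout=30
        try:
            async with aiohttp.ClientSession(timeout=timeout) as session:
                async with session.get(url) as response:
                    return await response.text()
        except asyncio.TimeoutError:
            # raised when the 30s budget expires, matching the new except clauses
            return None
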
@@ -261,9 +334,14 @@ class DocumentScraper:
         Returns:
             Tuple of (content, metadata)
         """
+        # If using mock data, return mock content
+        if self.use_mock:
+            logger.info(f"Using mock data for URL: {url}")
+            return await self._get_mock_content(url)
+
         try:
             async with aiohttp.ClientSession() as session:
-                async with session.get(url, headers={"User-Agent": "Mozilla/5.0"}) as response:
+                async with session.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=30) as response:
                     if response.status != 200:
                         logger.warning(f"Failed to fetch URL: {url} - Status: {response.status}")
                         return None, None
@@ -278,6 +356,9 @@ class DocumentScraper:

                    return content, metadata
+        except asyncio.TimeoutError:
+            logger.warning(f"Timeout fetching URL: {url}")
+            return None, None
         except Exception as e:
             logger.error(f"Error in fallback scraping: {str(e)}")
             return None, None
@@ -306,10 +387,12 @@ class DocumentScraper:
             logger.info(f"Document already exists in database: {normalized_url}")
             return await self.db_manager.get_document_by_url(normalized_url)

-        # Try Jina Reader first
-        content, metadata = await self._scrape_with_jina_reader(normalized_url)
+        # Try Jina Reader first if it's available
+        content, metadata = None, None
+        if self.jina_api_available:
+            content, metadata = await self._scrape_with_jina_reader(normalized_url)

-        # Fallback to custom scraping if Jina Reader fails
+        # Fallback to custom scraping if Jina Reader fails or is unavailable
         if content is None:
             logger.info(f"Falling back to custom scraping for URL: {normalized_url}")
             content, metadata = await self._scrape_with_fallback(normalized_url)
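
The resulting `scrape_url` flow is cache, then Jina (only while `jina_api_available` holds), then the generic fallback. Reduced to a skeleton with the scrapers passed in as callables (a sketch of the control flow, not the repository's actual signatures):

    from typing import Awaitable, Callable, Dict, Optional, Tuple

    ScrapeResult = Tuple[Optional[str], Optional[Dict[str, str]]]
    Scraper = Callable[[str], Awaitable[ScrapeResult]]

    async def scrape_with_chain(url: str, primary: Scraper, fallback: Scraper,
                                primary_available: bool) -> ScrapeResult:
        content, metadata = None, None
        if primary_available:
            content, metadata = await primary(url)
        # Fall through whenever the primary was skipped or came back empty
        if content is None:
            content, metadata = await fallback(url)
        return content, metadata
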
@@ -367,34 +450,61 @@ class DocumentScraper:
 # Create a singleton instance for global use
 document_scraper = DocumentScraper()

-def get_document_scraper() -> DocumentScraper:
+def get_document_scraper(use_mock: bool = False) -> DocumentScraper:
     """
     Get the global document scraper instance.

+    Args:
+        use_mock: If True, create a new instance with mock data
+
     Returns:
         DocumentScraper instance
     """
+    global document_scraper
+
+    # If mock is requested, create a new instance with mock enabled
+    if use_mock:
+        return DocumentScraper(use_mock=True)
+
     return document_scraper

 # Example usage
-async def test_scraper():
-    """Test the document scraper with a sample URL."""
+async def test_scraper(use_mock: bool = False):
+    """
+    Test the document scraper with a sample URL.
+
+    Args:
+        use_mock: If True, use mock data instead of making actual API calls
+    """
     from report.database.db_manager import initialize_database

     # Initialize database
     await initialize_database()

-    # Scrape a URL
-    scraper = get_document_scraper()
-    document = await scraper.scrape_url("https://en.wikipedia.org/wiki/Web_scraping")
+    scraper = get_document_scraper(use_mock=use_mock)

-    if document:
-        print(f"Successfully scraped document: {document['title']}")
-        print(f"Token count: {document['token_count']}")
-        print(f"Content preview: {document['content'][:500]}...")
-    else:
-        print("Failed to scrape document")
+    # Test URLs
+    test_urls = [
+        "https://en.wikipedia.org/wiki/Web_scraping",
+        "https://docs.python.org/3/",
+        "https://www.python.org/"
+    ]
+
+    print(f"Testing scraper with {'mock data' if use_mock else 'real data'}")
+
+    for url in test_urls:
+        print(f"\nScraping URL: {url}")
+        document = await scraper.scrape_url(url)
+
+        if document:
+            print(f"Successfully scraped document: {document['title']}")
+            print(f"Token count: {document['token_count']}")
+            print(f"Content preview: {document['content'][:200]}...")
+        else:
+            print(f"Failed to scrape document: {url}")

 # Run test if this module is executed directly
 if __name__ == "__main__":
-    asyncio.run(test_scraper())
+    # Test with real data by default
+    asyncio.run(test_scraper(use_mock=False))
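
Note the design choice in `get_document_scraper`: the module-level singleton stays a real scraper, and `use_mock=True` hands back a fresh throwaway instance, so enabling mocks in one test can never flip shared state. A quick sketch of the resulting behaviour (same environment assumptions as the earlier sketches):

    from report.document_scraper import get_document_scraper

    s1 = get_document_scraper()
    s2 = get_document_scraper()
    assert s1 is s2                          # one shared real instance
    mock = get_document_scraper(use_mock=True)
    assert mock is not s1 and mock.use_mock  # isolated per-call mock instance
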

View File

@@ -186,51 +186,72 @@ def get_report_generator() -> ReportGenerator:
     """
     return report_generator

-async def test_report_generator():
-    """Test the report generator with sample search results."""
+async def test_report_generator(use_mock: bool = False):
+    """
+    Test the report generator with sample search results.
+
+    Args:
+        use_mock: If True, use mock data instead of making actual API calls
+    """
     # Initialize the report generator
     await initialize_report_generator()

-    # Sample search results
+    # Get document scraper with mock option
+    document_scraper = get_document_scraper(use_mock=use_mock)
+
+    # Sample search results with real, accessible URLs
     search_results = [
         {
-            'title': 'Example Document 1',
-            'url': 'https://example.com/doc1',
-            'snippet': 'This is an example document.',
-            'score': 0.95
-        },
-        {
-            'title': 'Example Document 2',
-            'url': 'https://example.com/doc2',
-            'snippet': 'This is another example document.',
-            'score': 0.85
+            'title': 'Python Documentation',
+            'url': 'https://docs.python.org/3/',
+            'snippet': 'Official Python documentation.',
+            'score': 0.95
+        },
+        {
+            'title': 'Python.org',
+            'url': 'https://www.python.org/',
+            'snippet': 'The official home of the Python Programming Language.',
+            'score': 0.85
         },
         {
             'title': 'Wikipedia - Python',
             'url': 'https://en.wikipedia.org/wiki/Python_(programming_language)',
             'snippet': 'Python is a high-level, general-purpose programming language.',
             'score': 0.75
         }
     ]

-    # Process search results
-    documents, relevance_scores = await report_generator.process_search_results(search_results)
+    try:
+        # Process search results
+        documents, relevance_scores = await report_generator.process_search_results(search_results)

-    # Print documents
-    print(f"Processed {len(documents)} documents")
-    for doc in documents:
-        print(f"Document: {doc.get('title')} ({doc.get('url')})")
-        print(f"Token count: {doc.get('token_count')}")
-        print(f"Content snippet: {doc.get('content')[:100]}...")
-        print()
+        # Print documents
+        print(f"Processed {len(documents)} documents")
+        for doc in documents:
+            print(f"Document: {doc.get('title')} ({doc.get('url')})")
+            print(f"Token count: {doc.get('token_count')}")
+            content_preview = doc.get('content', '')[:100] + '...' if doc.get('content') else 'No content'
+            print(f"Content snippet: {content_preview}")
+            print()

-    # Generate report
-    report = await report_generator.generate_report(search_results, "Python programming")
+        # Generate report
+        report = await report_generator.generate_report(search_results, "Python programming")

-    # Print report
-    print("Generated Report:")
-    print(report)
+        # Print report
+        print("Generated Report:")
+        print(report)
+    except Exception as e:
+        logger.error(f"Error during report generation test: {str(e)}")
+        import traceback
+        traceback.print_exc()

 # Run test if this module is executed directly
 if __name__ == "__main__":
-    asyncio.run(test_report_generator())
+    import argparse
+    parser = argparse.ArgumentParser(description='Test the report generator')
+    parser.add_argument('--mock', action='store_true', help='Use mock data instead of making actual API calls')
+    args = parser.parse_args()
+
+    print(f"Running test with {'mock data' if args.mock else 'real data'}")
+    asyncio.run(test_report_generator(use_mock=args.mock))
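
With the `--mock` flag wired in, the same entry point can also be driven from other test code without touching argparse. A pytest-style wrapper, as a sketch (pytest and the pytest-asyncio plugin are assumptions, not dependencies of this commit, and the import path depends on where this module actually lives):

    import pytest

    from report.report_generator import test_report_generator  # module path assumed

    @pytest.mark.asyncio  # requires the pytest-asyncio plugin
    async def test_report_pipeline_offline():
        # Exercises the whole pipeline against mock documents: no network, no keys
        await test_report_generator(use_mock=True)
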

View File

@@ -19,138 +19,96 @@ from report.document_processor import get_document_processor
 from report.database.db_manager import get_db_manager, initialize_database
 from report.document_scraper import get_document_scraper

-async def test_document_processor():
+async def test_document_processor(use_mock: bool = False):
     """Test the document processor with sample documents."""
-    # Initialize the database
+    # Initialize database
     await initialize_database()

-    # Get the document processor and scraper
+    # Create document processor
     document_processor = get_document_processor()
-    document_scraper = get_document_scraper()
-    db_manager = get_db_manager()

-    # Sample URLs to test with
-    test_urls = [
-        "https://en.wikipedia.org/wiki/Python_(programming_language)",
-        "https://en.wikipedia.org/wiki/Natural_language_processing",
-        "https://docs.python.org/3/tutorial/index.html",
-        "https://en.wikipedia.org/wiki/Machine_learning"
-    ]
+    # Create document scraper with mock option
+    document_scraper = get_document_scraper(use_mock=use_mock)
+
+    # Sample search results with real, accessible URLs
+    search_results = [
+        {
+            'title': 'Python Documentation',
+            'url': 'https://docs.python.org/3/',
+            'snippet': 'Official Python documentation.',
+            'score': 0.95
+        },
+        {
+            'title': 'Python.org',
+            'url': 'https://www.python.org/',
+            'snippet': 'The official home of the Python Programming Language.',
+            'score': 0.85
+        },
+        {
+            'title': 'Wikipedia - Python',
+            'url': 'https://en.wikipedia.org/wiki/Python_(programming_language)',
+            'snippet': 'Python is a high-level, general-purpose programming language.',
+            'score': 0.75
+        }
+    ]

-    # Scrape the URLs
-    print(f"Scraping {len(test_urls)} URLs...")
-    documents = await document_scraper.scrape_urls(test_urls)
+    # Process search results
+    documents = []
+    relevance_scores = {}
+    for result in search_results:
+        # Scrape document
+        document = await document_scraper.scrape_url(result['url'])
+        if document:
+            documents.append(document)
+            relevance_scores[document['url']] = result['score']

     print(f"Scraped {len(documents)} documents")

-    # Sample relevance scores
-    relevance_scores = {
-        "https://en.wikipedia.org/wiki/Python_(programming_language)": 0.95,
-        "https://en.wikipedia.org/wiki/Natural_language_processing": 0.85,
-        "https://docs.python.org/3/tutorial/index.html": 0.75,
-        "https://en.wikipedia.org/wiki/Machine_learning": 0.65
-    }
-
     # Test document prioritization
     print("\nTesting document prioritization...")
     prioritized_docs = document_processor.prioritize_documents(documents, relevance_scores)

-    print("Prioritized documents:")
+    print("\nPrioritized documents:")
     for i, doc in enumerate(prioritized_docs):
-        print(f"{i+1}. {doc.get('title')} - Score: {doc.get('priority_score', 0.0):.2f}")
+        print(f"{i+1}. {doc['title']} (Score: {doc.get('priority_score', 'N/A')})")

-    # Test document chunking
-    print("\nTesting document chunking...")
-
-    # Test section-based chunking
-    print("\nSection-based chunking:")
     if documents:
-        section_chunks = document_processor.chunk_document_by_sections(documents[0], 1000, 100)
-        print(f"Created {len(section_chunks)} section-based chunks")
+        print("\nChunking document:", documents[0]['title'])
+        chunks = document_processor.chunk_document_by_sections(documents[0])
+        print(f"Created {len(chunks)} chunks")
+        for i, chunk in enumerate(chunks[:3]):  # Show first 3 chunks
+            print(f"Chunk {i+1}: {chunk['title']} ({chunk['token_count']} tokens)")
+            content_preview = chunk['content'][:100] + '...' if len(chunk['content']) > 100 else chunk['content']
+            print(f"Content: {content_preview}")

-        for i, chunk in enumerate(section_chunks[:3]):  # Show first 3 chunks
-            print(f"Chunk {i+1}:")
-            print(f"  Type: {chunk.get('chunk_type')}")
-            print(f"  Section: {chunk.get('section_title', 'N/A')}")
-            print(f"  Tokens: {chunk.get('token_count')}")
-            content = chunk.get('content', '')
-            print(f"  Content preview: {content[:100]}...")
-
-    # Test token budget management
-    token_budget = 4000
-    print(f"\nSelecting chunks with token budget: {token_budget}")
-
-    # Test fixed-size chunking
-    print("\nFixed-size chunking:")
-    if documents:
-        fixed_chunks = document_processor.chunk_document_fixed_size(documents[0], 1000, 100)
-        print(f"Created {len(fixed_chunks)} fixed-size chunks")
-        for i, chunk in enumerate(fixed_chunks[:3]):  # Show first 3 chunks
-            print(f"Chunk {i+1}:")
-            print(f"  Type: {chunk.get('chunk_type')}")
-            print(f"  Index: {chunk.get('chunk_index')}/{chunk.get('total_chunks')}")
-            print(f"  Tokens: {chunk.get('token_count')}")
-            content = chunk.get('content', '')
-            print(f"  Content preview: {content[:100]}...")
-
-    # Test hierarchical chunking
-    print("\nHierarchical chunking:")
-    if documents:
-        hierarchical_chunks = document_processor.chunk_document_hierarchical(documents[0], 1000, 100)
-        print(f"Created {len(hierarchical_chunks)} hierarchical chunks")
-        for i, chunk in enumerate(hierarchical_chunks[:3]):  # Show first 3 chunks
-            print(f"Chunk {i+1}:")
-            print(f"  Type: {chunk.get('chunk_type')}")
-            if chunk.get('chunk_type') == 'summary':
-                print(f"  Summary chunk")
-            else:
-                print(f"  Section: {chunk.get('section_title', 'N/A')}")
-            print(f"  Tokens: {chunk.get('token_count')}")
-            content = chunk.get('content', '')
-            print(f"  Content preview: {content[:100]}...")
-
-    # Test chunk selection
-    print("\nTesting chunk selection...")
-    # Create a mix of chunks from all documents
+    # Create chunks for each document
     all_chunks = []
-    for doc in documents:
-        chunks = document_processor.chunk_document_by_sections(doc, 1000, 100)
-        all_chunks.extend(chunks)
-    print(f"Total chunks: {len(all_chunks)}")
+    for doc in prioritized_docs:
+        doc_chunks = document_processor.chunk_document_by_sections(doc)
+        all_chunks.extend(doc_chunks)

+    # Select chunks based on token budget
+    token_budget = 10000
     selected_chunks = document_processor.select_chunks_for_context(all_chunks, token_budget)
-    print(f"Selected {len(selected_chunks)} chunks with total tokens: {sum(c['token_count'] for c in selected_chunks)}")
+    total_tokens = sum(chunk.get('token_count', 0) for chunk in selected_chunks)
+    print(f"Selected {len(selected_chunks)} chunks with {total_tokens} tokens (budget: {token_budget})")

-    # Test end-to-end processing
-    print("\nTesting end-to-end processing")
-    processed_chunks = document_processor.process_documents_for_report(documents, relevance_scores)
-    print(f"Processed {len(processed_chunks)} chunks for report")
+    # Test full document processing
+    print("\nTesting full document processing...")
+    processed_chunks = document_processor.process_documents_for_report(
+        documents,
+        relevance_scores,
+        token_budget=20000,
+        chunk_size=1000,
+        overlap_size=100
+    )
+    total_processed_tokens = sum(chunk.get('token_count', 0) for chunk in processed_chunks)
+    print(f"Processed {len(processed_chunks)} chunks with {total_processed_tokens} tokens")

+    # Show the top 5 chunks
+    print("\nTop 5 chunks:")
+    for i, chunk in enumerate(processed_chunks[:5]):
+        print(f"Chunk {i+1}:")
+        print(f"  Document: {chunk.get('title')}")
+        print(f"  Type: {chunk.get('chunk_type')}")
+        print(f"  Priority: {chunk.get('priority_score', 0.0):.2f}")
+        print(f"  Tokens: {chunk.get('token_count')}")
+        content = chunk.get('content', '')
+        print(f"  Content preview: {content[:100]}...")

-async def main():
-    """Main function to run the tests."""
-    await test_document_processor()
+    return processed_chunks

 # Run test if this module is executed directly
 if __name__ == "__main__":
-    asyncio.run(main())
+    import argparse
+    parser = argparse.ArgumentParser(description='Test the document processor')
+    parser.add_argument('--mock', action='store_true', help='Use mock data instead of making actual API calls')
+    args = parser.parse_args()
+
+    print(f"Running test with {'mock data' if args.mock else 'real data'}")
+    asyncio.run(test_document_processor(use_mock=args.mock))
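
The selection step in this test packs prioritized chunks into a fixed token budget. A minimal greedy sketch of that idea, using the same `token_count` and `priority_score` keys the test reads (the repository's `select_chunks_for_context` may be more sophisticated):

    from typing import Dict, List

    def select_chunks_greedy(chunks: List[Dict], token_budget: int) -> List[Dict]:
        # Highest-priority chunks first, then pack until the budget is spent
        ranked = sorted(chunks, key=lambda c: c.get("priority_score", 0.0), reverse=True)
        selected, used = [], 0
        for chunk in ranked:
            cost = chunk.get("token_count", 0)
            if used + cost <= token_budget:
                selected.append(chunk)
                used += cost
        return selected

    # e.g. select_chunks_greedy(all_chunks, 10000) mirrors the test above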