""" Document scraper module for the report generation module. This module provides functionality to scrape web pages and extract clean content using Jina Reader API or fallback methods. """ import os import re import json import hashlib import logging import asyncio import aiohttp import validators import tiktoken from typing import Dict, List, Any, Optional, Tuple, Union from datetime import datetime from urllib.parse import urlparse, urljoin from bs4 import BeautifulSoup import html2text from config.config import get_config from report.database.db_manager import get_db_manager, DBManager # Configure logging logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') logger = logging.getLogger(__name__) class DocumentScraper: """ Document scraper for the report generation module. This class provides methods to scrape web pages and extract clean content using Jina Reader API or fallback methods. """ def __init__(self, use_mock: bool = False): """ Initialize the document scraper. Args: use_mock: If True, use mock data instead of making actual API calls """ self.config = get_config() self.api_key = self._get_api_key() self.endpoint = "https://api.jina.ai/v1/reader" self.db_manager = get_db_manager() self.tokenizer = tiktoken.get_encoding("cl100k_base") # Using OpenAI's tokenizer self.use_mock = use_mock self.jina_api_available = self.api_key != "" def _get_api_key(self) -> str: """ Get the Jina AI API key. Returns: The API key as a string Raises: ValueError: If the API key is not found """ try: return self.config.get_api_key('jina') except ValueError as e: logger.warning(f"Jina AI API key not found. Fallback methods will be used. {str(e)}") return "" def _count_tokens(self, text: str) -> int: """ Count the number of tokens in a text. Args: text: The text to count tokens for Returns: Number of tokens in the text """ return len(self.tokenizer.encode(text)) def _compute_hash(self, content: str) -> str: """ Compute a hash of the document content for deduplication. Args: content: The document content Returns: Hash of the content """ return hashlib.sha256(content.encode('utf-8')).hexdigest() def _normalize_url(self, url: str) -> str: """ Normalize a URL by removing fragments and unnecessary query parameters. Args: url: The URL to normalize Returns: Normalized URL """ parsed = urlparse(url) # Remove fragment normalized = parsed._replace(fragment="") # TODO: Add more normalization rules if needed return normalized.geturl() def _validate_url(self, url: str) -> bool: """ Validate a URL. Args: url: The URL to validate Returns: True if the URL is valid, False otherwise """ return validators.url(url) is True async def _extract_metadata_from_html(self, html: str, url: str) -> Dict[str, str]: """ Extract metadata from HTML content. 

        Args:
            html: The HTML content
            url: The URL of the page

        Returns:
            Dictionary of metadata
        """
        metadata = {
            "source_url": url,
            "scrape_date": datetime.now().isoformat()
        }

        try:
            soup = BeautifulSoup(html, 'html.parser')

            # Extract title
            if soup.title:
                metadata["title"] = soup.title.string

            # Extract meta tags
            for meta in soup.find_all('meta'):
                # Author
                if meta.get('name') and meta.get('name').lower() == 'author' and meta.get('content'):
                    metadata["author"] = meta.get('content')

                # Description
                if meta.get('name') and meta.get('name').lower() == 'description' and meta.get('content'):
                    metadata["description"] = meta.get('content')

                # Keywords
                if meta.get('name') and meta.get('name').lower() == 'keywords' and meta.get('content'):
                    metadata["keywords"] = meta.get('content')

                # Publication date
                if meta.get('property') and meta.get('property').lower() in ['article:published_time', 'og:published_time'] and meta.get('content'):
                    metadata["publication_date"] = meta.get('content')

                # Open Graph data
                if meta.get('property') and meta.get('property').lower().startswith('og:') and meta.get('content'):
                    og_key = meta.get('property').lower().replace('og:', 'og_')
                    metadata[og_key] = meta.get('content')

            # Extract structured data (JSON-LD)
            for script in soup.find_all('script', type='application/ld+json'):
                try:
                    ld_data = json.loads(script.string)
                    if isinstance(ld_data, dict):
                        # Extract date published
                        if ld_data.get('@type') in ['Article', 'NewsArticle', 'BlogPosting'] and ld_data.get('datePublished'):
                            metadata["publication_date"] = ld_data.get('datePublished')

                        # Extract author
                        if ld_data.get('author'):
                            author = ld_data.get('author')
                            if isinstance(author, dict) and author.get('name'):
                                metadata["author"] = author.get('name')
                            elif isinstance(author, str):
                                metadata["author"] = author
                except (json.JSONDecodeError, AttributeError):
                    pass
        except Exception as e:
            logger.warning(f"Error extracting metadata: {str(e)}")

        return metadata

    async def _html_to_markdown(self, html: str) -> str:
        """
        Convert HTML to Markdown.

        Args:
            html: The HTML content

        Returns:
            Markdown content
        """
        converter = html2text.HTML2Text()
        converter.ignore_links = False
        converter.ignore_images = False
        converter.ignore_tables = False
        converter.body_width = 0  # No wrapping

        return converter.handle(html)

    async def _get_mock_content(self, url: str) -> Tuple[str, Dict[str, str]]:
        """
        Generate mock content for testing.

        Args:
            url: The URL to generate mock content for

        Returns:
            Tuple of (content, metadata)
        """
        domain = urlparse(url).netloc
        path = urlparse(url).path

        # Generate a title based on the URL
        title = f"Mock Content for {domain}{path}"

        # Generate mock content
        content = f"""# {title}

## Introduction

This is mock content generated for testing purposes. The original URL is {url}.

## Section 1

Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nullam euismod, nisl eget aliquam ultricies, nunc nisl aliquet nunc, quis aliquam nisl nunc eu nisl.

## Section 2

Pellentesque habitant morbi tristique senectus et netus et malesuada fames ac turpis egestas. Vestibulum tortor quam, feugiat vitae, ultricies eget, tempor sit amet, ante.

## Conclusion

This mock content was generated on {datetime.now().isoformat()}.
""" # Generate mock metadata metadata = { "source_url": url, "title": title, "description": "This is mock content generated for testing purposes.", "author": "Mock Generator", "scrape_date": datetime.now().isoformat(), "publication_date": datetime.now().isoformat() } return content, metadata async def _scrape_with_jina_reader(self, url: str) -> Tuple[Optional[str], Optional[Dict[str, str]]]: """ Scrape a web page using Jina Reader API. Args: url: The URL to scrape Returns: Tuple of (content, metadata) """ # If using mock data, return mock content if self.use_mock: logger.info(f"Using mock data for URL: {url}") return await self._get_mock_content(url) # If Jina API is not available, skip this step if not self.jina_api_available: logger.info("Jina API key not available. Using fallback method.") return None, None headers = { "Content-Type": "application/json", "Authorization": f"Bearer {self.api_key}", "Accept": "application/json" } data = { "url": url, "format": "markdown" # Request markdown format } try: async with aiohttp.ClientSession() as session: async with session.post(self.endpoint, headers=headers, json=data, timeout=30) as response: if response.status != 200: error_text = await response.text() logger.warning(f"Jina Reader API error: {response.status} - {error_text}") # If we get a 404 or 429 (rate limit), mark the API as unavailable for this session if response.status in [404, 429]: logger.warning("Jina Reader API appears to be unavailable. Using fallback method for all subsequent requests.") self.jina_api_available = False return None, None result = await response.json() if "content" not in result: logger.warning(f"Jina Reader API returned no content: {result}") return None, None content = result.get("content", "") metadata = result.get("metadata", {}) # Add source URL to metadata metadata["source_url"] = url return content, metadata except asyncio.TimeoutError: logger.warning(f"Timeout calling Jina Reader API for URL: {url}") return None, None except Exception as e: logger.error(f"Error calling Jina Reader API: {str(e)}") return None, None async def _scrape_with_fallback(self, url: str) -> Tuple[Optional[str], Optional[Dict[str, str]]]: """ Scrape a web page using fallback method (aiohttp + BeautifulSoup). Args: url: The URL to scrape Returns: Tuple of (content, metadata) """ # If using mock data, return mock content if self.use_mock: logger.info(f"Using mock data for URL: {url}") return await self._get_mock_content(url) try: async with aiohttp.ClientSession() as session: async with session.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=30) as response: if response.status != 200: logger.warning(f"Failed to fetch URL: {url} - Status: {response.status}") return None, None html = await response.text() # Extract metadata metadata = await self._extract_metadata_from_html(html, url) # Convert to markdown content = await self._html_to_markdown(html) return content, metadata except asyncio.TimeoutError: logger.warning(f"Timeout fetching URL: {url}") return None, None except Exception as e: logger.error(f"Error in fallback scraping: {str(e)}") return None, None async def scrape_url(self, url: str, force_refresh: bool = False) -> Optional[Dict[str, Any]]: """ Scrape a web page and store the content in the database. 

        Args:
            url: The URL to scrape
            force_refresh: If True, scrape the URL even if it's already in the database

        Returns:
            Document dictionary if successful, None otherwise
        """
        # Validate URL
        if not self._validate_url(url):
            logger.warning(f"Invalid URL: {url}")
            return None

        # Normalize URL
        normalized_url = self._normalize_url(url)

        # Check if document already exists in database
        if not force_refresh and await self.db_manager.document_exists(normalized_url):
            logger.info(f"Document already exists in database: {normalized_url}")
            return await self.db_manager.get_document_by_url(normalized_url)

        # Try Jina Reader first if it's available
        content, metadata = None, None
        if self.jina_api_available:
            content, metadata = await self._scrape_with_jina_reader(normalized_url)

        # Fall back to custom scraping if Jina Reader fails or is unavailable
        if content is None:
            logger.info(f"Falling back to custom scraping for URL: {normalized_url}")
            content, metadata = await self._scrape_with_fallback(normalized_url)

        if content is None or not content.strip():
            logger.warning(f"Failed to extract content from URL: {normalized_url}")
            return None

        # Count tokens
        token_count = self._count_tokens(content)

        # Compute hash for deduplication
        doc_hash = self._compute_hash(content)

        # Get title from metadata or use URL as fallback
        title = metadata.get("title", urlparse(normalized_url).netloc)

        # Store in database
        try:
            document_id = await self.db_manager.add_document(
                url=normalized_url,
                title=title,
                content=content,
                content_type="markdown",
                token_count=token_count,
                metadata=metadata,
                doc_hash=doc_hash
            )

            # Return the document
            return await self.db_manager.get_document_by_url(normalized_url)
        except Exception as e:
            logger.error(f"Error storing document in database: {str(e)}")
            return None

    async def scrape_urls(self, urls: List[str], force_refresh: bool = False) -> List[Dict[str, Any]]:
        """
        Scrape multiple URLs in parallel.

        Args:
            urls: List of URLs to scrape
            force_refresh: If True, scrape URLs even if they're already in the database

        Returns:
            List of document dictionaries
        """
        tasks = [self.scrape_url(url, force_refresh) for url in urls]
        results = await asyncio.gather(*tasks)

        # Filter out None results
        return [doc for doc in results if doc is not None]


# Create a singleton instance for global use
document_scraper = DocumentScraper()


def get_document_scraper(use_mock: bool = False) -> DocumentScraper:
    """
    Get the global document scraper instance.

    Args:
        use_mock: If True, create a new instance with mock data

    Returns:
        DocumentScraper instance
    """
    global document_scraper

    # If mock is requested, create a new instance with mock enabled
    if use_mock:
        return DocumentScraper(use_mock=True)

    return document_scraper


# Example usage
async def test_scraper(use_mock: bool = False):
    """
    Test the document scraper with sample URLs.

    Args:
        use_mock: If True, use mock data instead of making actual API calls
    """
    from report.database.db_manager import initialize_database

    # Initialize database
    await initialize_database()

    # Get the document scraper
    scraper = get_document_scraper(use_mock=use_mock)

    # Test URLs
    test_urls = [
        "https://en.wikipedia.org/wiki/Web_scraping",
        "https://docs.python.org/3/",
        "https://www.python.org/"
    ]

    print(f"Testing scraper with {'mock data' if use_mock else 'real data'}")

    for url in test_urls:
        print(f"\nScraping URL: {url}")
        document = await scraper.scrape_url(url)

        if document:
            print(f"Successfully scraped document: {document['title']}")
            print(f"Token count: {document['token_count']}")
            print(f"Content preview: {document['content'][:200]}...")
        else:
            print(f"Failed to scrape document: {url}")


# Run test if this module is executed directly
if __name__ == "__main__":
    # Test with real data by default
    asyncio.run(test_scraper(use_mock=False))