""" Document scraper module for the report generation module. This module provides functionality to scrape web pages and extract clean content using Jina Reader API or fallback methods. """ import os import re import json import hashlib import logging import asyncio import aiohttp import validators import tiktoken from typing import Dict, List, Any, Optional, Tuple, Union from datetime import datetime from urllib.parse import urlparse, urljoin from bs4 import BeautifulSoup import html2text from config.config import get_config from report.database.db_manager import get_db_manager, DBManager # Configure logging logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') logger = logging.getLogger(__name__) class DocumentScraper: """ Document scraper for the report generation module. This class provides methods to scrape web pages and extract clean content using Jina Reader API or fallback methods. """ def __init__(self): """Initialize the document scraper.""" self.config = get_config() self.api_key = self._get_api_key() self.endpoint = "https://api.jina.ai/v1/reader" self.db_manager = get_db_manager() self.tokenizer = tiktoken.get_encoding("cl100k_base") # Using OpenAI's tokenizer def _get_api_key(self) -> str: """ Get the Jina AI API key. Returns: The API key as a string Raises: ValueError: If the API key is not found """ try: return self.config.get_api_key('jina') except ValueError as e: logger.warning(f"Jina AI API key not found. Fallback methods will be used. {str(e)}") return "" def _count_tokens(self, text: str) -> int: """ Count the number of tokens in a text. Args: text: The text to count tokens for Returns: Number of tokens in the text """ return len(self.tokenizer.encode(text)) def _compute_hash(self, content: str) -> str: """ Compute a hash of the document content for deduplication. Args: content: The document content Returns: Hash of the content """ return hashlib.sha256(content.encode('utf-8')).hexdigest() def _normalize_url(self, url: str) -> str: """ Normalize a URL by removing fragments and unnecessary query parameters. Args: url: The URL to normalize Returns: Normalized URL """ parsed = urlparse(url) # Remove fragment normalized = parsed._replace(fragment="") # TODO: Add more normalization rules if needed return normalized.geturl() def _validate_url(self, url: str) -> bool: """ Validate a URL. Args: url: The URL to validate Returns: True if the URL is valid, False otherwise """ return validators.url(url) is True async def _extract_metadata_from_html(self, html: str, url: str) -> Dict[str, str]: """ Extract metadata from HTML content. 
    def _validate_url(self, url: str) -> bool:
        """
        Validate a URL.

        Args:
            url: The URL to validate

        Returns:
            True if the URL is valid, False otherwise
        """
        return validators.url(url) is True

    async def _extract_metadata_from_html(self, html: str, url: str) -> Dict[str, str]:
        """
        Extract metadata from HTML content.

        Args:
            html: The HTML content
            url: The URL of the page

        Returns:
            Dictionary of metadata
        """
        metadata = {
            "source_url": url,
            "scrape_date": datetime.now().isoformat()
        }

        try:
            soup = BeautifulSoup(html, 'html.parser')

            # Extract title
            if soup.title and soup.title.string:
                metadata["title"] = soup.title.string

            # Extract meta tags
            for meta in soup.find_all('meta'):
                # Author
                if meta.get('name') and meta.get('name').lower() == 'author' and meta.get('content'):
                    metadata["author"] = meta.get('content')

                # Description
                if meta.get('name') and meta.get('name').lower() == 'description' and meta.get('content'):
                    metadata["description"] = meta.get('content')

                # Keywords
                if meta.get('name') and meta.get('name').lower() == 'keywords' and meta.get('content'):
                    metadata["keywords"] = meta.get('content')

                # Publication date
                if meta.get('property') and meta.get('property').lower() in ['article:published_time', 'og:published_time'] and meta.get('content'):
                    metadata["publication_date"] = meta.get('content')

                # Open Graph data
                if meta.get('property') and meta.get('property').lower().startswith('og:') and meta.get('content'):
                    og_key = meta.get('property').lower().replace('og:', 'og_')
                    metadata[og_key] = meta.get('content')

            # Extract structured data (JSON-LD)
            for script in soup.find_all('script', type='application/ld+json'):
                try:
                    ld_data = json.loads(script.string)
                    if isinstance(ld_data, dict):
                        # Extract date published
                        if ld_data.get('@type') in ['Article', 'NewsArticle', 'BlogPosting'] and ld_data.get('datePublished'):
                            metadata["publication_date"] = ld_data.get('datePublished')

                        # Extract author
                        if ld_data.get('author'):
                            author = ld_data.get('author')
                            if isinstance(author, dict) and author.get('name'):
                                metadata["author"] = author.get('name')
                            elif isinstance(author, str):
                                metadata["author"] = author
                except (json.JSONDecodeError, TypeError, AttributeError):
                    # Ignore malformed or empty JSON-LD blocks
                    pass
        except Exception as e:
            logger.warning(f"Error extracting metadata: {str(e)}")

        return metadata

    async def _html_to_markdown(self, html: str) -> str:
        """
        Convert HTML to Markdown.

        Args:
            html: The HTML content

        Returns:
            Markdown content
        """
        converter = html2text.HTML2Text()
        converter.ignore_links = False
        converter.ignore_images = False
        converter.ignore_tables = False
        converter.body_width = 0  # No wrapping

        return converter.handle(html)

    async def _scrape_with_jina_reader(self, url: str) -> Tuple[Optional[str], Optional[Dict[str, str]]]:
        """
        Scrape a web page using Jina Reader API.

        Args:
            url: The URL to scrape

        Returns:
            Tuple of (content, metadata)
        """
        if not self.api_key:
            logger.warning("Jina API key not available. Using fallback method.")
            return None, None

        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {self.api_key}",
            "Accept": "application/json"
        }

        data = {
            "url": url,
            "format": "markdown"  # Request markdown format
        }

        try:
            async with aiohttp.ClientSession() as session:
                async with session.post(self.endpoint, headers=headers, json=data) as response:
                    if response.status != 200:
                        logger.warning(f"Jina Reader API error: {response.status} - {await response.text()}")
                        return None, None

                    result = await response.json()

                    if "content" not in result:
                        logger.warning(f"Jina Reader API returned no content: {result}")
                        return None, None

                    content = result.get("content", "")
                    metadata = result.get("metadata", {})

                    # Add source URL to metadata
                    metadata["source_url"] = url

                    return content, metadata
        except Exception as e:
            logger.error(f"Error calling Jina Reader API: {str(e)}")
            return None, None

    async def _scrape_with_fallback(self, url: str) -> Tuple[Optional[str], Optional[Dict[str, str]]]:
        """
        Scrape a web page using fallback method (aiohttp + BeautifulSoup).

        Args:
            url: The URL to scrape

        Returns:
            Tuple of (content, metadata)
        """
        try:
            async with aiohttp.ClientSession() as session:
                async with session.get(url, headers={"User-Agent": "Mozilla/5.0"}) as response:
                    if response.status != 200:
                        logger.warning(f"Failed to fetch URL: {url} - Status: {response.status}")
                        return None, None

                    html = await response.text()

            # Extract metadata
            metadata = await self._extract_metadata_from_html(html, url)

            # Convert to markdown
            content = await self._html_to_markdown(html)

            return content, metadata
        except Exception as e:
            logger.error(f"Error in fallback scraping: {str(e)}")
            return None, None

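    # Hedged sketch: the fallback path above converts the whole page, including
    # scripts, styles, and navigation chrome, into Markdown. If that output is
    # too noisy, a pre-pass like the one below could be applied to the HTML
    # before _html_to_markdown(). The tag list is an assumption and this helper
    # is illustrative only; nothing in the class calls it.
    @staticmethod
    def _strip_noise_html(html: str) -> str:
        """Remove elements that rarely contribute to the main article content."""
        soup = BeautifulSoup(html, 'html.parser')
        for tag in soup(['script', 'style', 'nav', 'header', 'footer', 'aside']):
            tag.decompose()
        return str(soup)
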
    async def scrape_url(self, url: str, force_refresh: bool = False) -> Optional[Dict[str, Any]]:
        """
        Scrape a web page and store the content in the database.

        Args:
            url: The URL to scrape
            force_refresh: If True, scrape the URL even if it's already in the database

        Returns:
            Document dictionary if successful, None otherwise
        """
        # Validate URL
        if not self._validate_url(url):
            logger.warning(f"Invalid URL: {url}")
            return None

        # Normalize URL
        normalized_url = self._normalize_url(url)

        # Check if document already exists in database
        if not force_refresh and await self.db_manager.document_exists(normalized_url):
            logger.info(f"Document already exists in database: {normalized_url}")
            return await self.db_manager.get_document_by_url(normalized_url)

        # Try Jina Reader first
        content, metadata = await self._scrape_with_jina_reader(normalized_url)

        # Fall back to custom scraping if Jina Reader fails
        if content is None:
            logger.info(f"Falling back to custom scraping for URL: {normalized_url}")
            content, metadata = await self._scrape_with_fallback(normalized_url)

        if content is None or not content.strip():
            logger.warning(f"Failed to extract content from URL: {normalized_url}")
            return None

        # Count tokens
        token_count = self._count_tokens(content)

        # Compute hash for deduplication
        doc_hash = self._compute_hash(content)

        # Get title from metadata or use URL as fallback
        title = metadata.get("title", urlparse(normalized_url).netloc)

        # Store in database
        try:
            document_id = await self.db_manager.add_document(
                url=normalized_url,
                title=title,
                content=content,
                content_type="markdown",
                token_count=token_count,
                metadata=metadata,
                doc_hash=doc_hash
            )

            # Return the document
            return await self.db_manager.get_document_by_url(normalized_url)
        except Exception as e:
            logger.error(f"Error storing document in database: {str(e)}")
            return None

    async def scrape_urls(self, urls: List[str], force_refresh: bool = False) -> List[Dict[str, Any]]:
        """
        Scrape multiple URLs in parallel.

        Args:
            urls: List of URLs to scrape
            force_refresh: If True, scrape URLs even if they're already in the database

        Returns:
            List of document dictionaries
        """
        tasks = [self.scrape_url(url, force_refresh) for url in urls]
        results = await asyncio.gather(*tasks)

        # Filter out None results
        return [doc for doc in results if doc is not None]


# Create a singleton instance for global use
document_scraper = DocumentScraper()


def get_document_scraper() -> DocumentScraper:
    """
    Get the global document scraper instance.

    Returns:
        DocumentScraper instance
    """
    return document_scraper


# Example usage
async def test_scraper():
    """Test the document scraper with a sample URL."""
    from report.database.db_manager import initialize_database

    # Initialize database
    await initialize_database()

    # Scrape a URL
    scraper = get_document_scraper()
    document = await scraper.scrape_url("https://en.wikipedia.org/wiki/Web_scraping")

    if document:
        print(f"Successfully scraped document: {document['title']}")
        print(f"Token count: {document['token_count']}")
        print(f"Content preview: {document['content'][:500]}...")
    else:
        print("Failed to scrape document")


# Run test if this module is executed directly
if __name__ == "__main__":
    asyncio.run(test_scraper())

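
# Hedged sketch: scrape_urls() above fires every request at once via
# asyncio.gather, which can overwhelm a host or the local network for large URL
# lists. A bounded variant using a semaphore is sketched below; the function
# name and the default limit of 5 are assumptions, not part of the module's API.
async def scrape_urls_bounded(urls: List[str], limit: int = 5,
                              force_refresh: bool = False) -> List[Dict[str, Any]]:
    """Scrape URLs with at most `limit` requests in flight at any time."""
    scraper = get_document_scraper()
    semaphore = asyncio.Semaphore(limit)

    async def scrape_one(url: str) -> Optional[Dict[str, Any]]:
        async with semaphore:
            return await scraper.scrape_url(url, force_refresh)

    results = await asyncio.gather(*(scrape_one(url) for url in urls))
    return [doc for doc in results if doc is not None]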