def parse_html(html: str, parser: str = "html.parser") -> BeautifulSoup:
    """
    Build a BeautifulSoup document tree from an HTML string.

    Args:
        html: Raw HTML markup to parse
        parser: Name of the parser backend handed to BeautifulSoup
            (e.g. html.parser, lxml, html5lib)

    Returns:
        The BeautifulSoup object produced from ``html``
    """
    # Both arguments are forwarded positionally, unchanged, to BeautifulSoup.
    soup = BeautifulSoup(html, parser)
    return soup