"""HTML processing utilities for ScrapeRL backend."""
import re
from typing import Any, Optional
from bs4 import BeautifulSoup, Tag, NavigableString
from app.utils.logging import get_logger
# Module-level logger obtained from the project's logging helper, keyed by this module's name.
logger = get_logger(__name__)
def parse_html(html: str, parser: str = "html.parser") -> BeautifulSoup:
    """
    Build a BeautifulSoup tree from an HTML string.

    Args:
        html: Markup text to parse.
        parser: Name of the parser backend handed to BeautifulSoup
            (for example "html.parser", "lxml" or "html5lib").

    Returns:
        The BeautifulSoup object created from ``html``.
    """
    # The second constructor argument selects the parser backend.
    soup = BeautifulSoup(html, features=parser)
    return soup
def clean_html(
html: str,
remove_scripts: bool = True,
remove_styles: bool = True,
remove_comments: bool = True,
remove_tags: Optional[list[str]] = None,
) -> str:
"""
Clean HTML by removing unwanted elements.
Args:
html: Raw HTML string
remove_scripts: Remove