"""Filter on span dictionaries

This module contains the internal representation of heading filters, which
are used to test if a span should be included in the ToC.
"""

import re

from typing import Optional
from re import Pattern

DEF_TOLERANCE: float = 1e-5

# Bit assigned to each boolean font attribute. The same layout is used for
# the expected values (`FontFilter.flags`), for the mask of attributes that
# were explicitly configured (`FontFilter.ign_mask`), and is compared directly
# against the integer found under a span's 'flags' key.
_FONT_FLAG_BITS = {
    'superscript': 0b00001,
    'italic': 0b00010,
    'serif': 0b00100,
    'monospace': 0b01000,
    'bold': 0b10000,
}


def admits_float(expect: Optional[float],
                 actual: Optional[float],
                 tolerance: float) -> bool:
    """Check if a float should be admitted by a filter

    Arguments
      expect: the value required by the filter, None means "don't care"
      actual: the value found in the span, None means "missing"
      tolerance: the largest absolute difference still counted as a match
    Returns
      True if expect is unset, or if actual is present and within tolerance
      of expect
    """
    if expect is None:
        # an unset constraint admits everything, even a missing value
        return True
    return actual is not None and abs(expect - actual) <= tolerance


class FontFilter:
    """Filter on font attributes"""
    name: Pattern
    size: Optional[float]
    size_tolerance: float
    color: Optional[int]
    flags: int
    # Besides the usual true (1) and false (0), a filter bit has a third
    # state, unset (x). With a the filter bit and b the span bit, the truth
    # table is
    #   a b diff?
    #   0 0 0
    #   0 1 1
    #   1 0 1
    #   1 1 0
    #   x 0 0
    #   x 1 0
    # Comparing bit by bit is wasteful, so two integer operations are used
    # instead.
    # step 1: XOR finds the bits that differ. Unset bits are stored as 0 in
    #         `flags`, which may produce false positives that are removed
    #         in the next step
    #   a b a^b
    #   0 0 0
    #   0 1 1
    #   1 0 1
    #   1 1 0
    # step 2: AND with the ignore mask (0 for ignored, i.e. unset, bits) to
    #         eliminate the false positives
    #   a^b mask (a^b)&mask
    #   0   1    0 <- no diff
    #   0   0    0 <- no diff
    #   1   1    1 <- found difference
    #   1   0    0 <- ignored
    ign_mask: int

    def __init__(self, font_dict: dict):
        """Build the filter from the 'font' section of a filter dict

        Arguments
          font_dict: may contain 'name' (regular expression), 'size',
                     'size_tolerance', 'color' and the boolean attributes
                     'superscript', 'italic', 'serif', 'monospace', 'bold'.
                     Every key is optional; a missing key is not checked.
        """
        self.name = re.compile(font_dict.get('name', ""))
        self.size = font_dict.get('size')
        self.size_tolerance = font_dict.get('size_tolerance', DEF_TOLERANCE)
        self.color = font_dict.get('color')

        flags = 0
        ign_mask = 0
        for attr, bit in _FONT_FLAG_BITS.items():
            if attr not in font_dict:
                # unset: leave both the flag and the mask bit at 0
                continue
            ign_mask |= bit
            # Coerce to bool so that any truthy value sets exactly this
            # attribute's bit. Multiplying the bit by a raw value such as 2
            # would spill into a neighbouring attribute's bit.
            if bool(font_dict[attr]):
                flags |= bit
        self.flags = flags
        self.ign_mask = ign_mask

    def admits(self, spn: dict) -> bool:
        """Check if the font attributes admit the span

        Argument
          spn: the span dict to be checked
        Returns
          False if the span doesn't match current font attribute
        """
        if not self.name.search(spn.get('font', "")):
            return False

        if self.color is not None and self.color != spn.get('color'):
            return False

        if not admits_float(self.size, spn.get('size'), self.size_tolerance):
            return False

        # A span without 'flags' falls back to the complement of the expected
        # flags, so every attribute that is actually checked will mismatch.
        flags = spn.get('flags', ~self.flags)
        # see the comment on `ign_mask` for the explanation of this test
        return not (flags ^ self.flags) & self.ign_mask


class BoundingBoxFilter:
    """Filter on bounding boxes"""
    left: Optional[float]
    top: Optional[float]
    right: Optional[float]
    bottom: Optional[float]
    tolerance: float

    def __init__(self, bbox_dict: dict):
        """Build the filter from the 'bbox' section of a filter dict

        Arguments
          bbox_dict: may contain 'left', 'top', 'right', 'bottom' and
                     'tolerance'. Every key is optional; a missing edge is
                     not checked.
        """
        self.left = bbox_dict.get('left')
        self.top = bbox_dict.get('top')
        self.right = bbox_dict.get('right')
        self.bottom = bbox_dict.get('bottom')
        self.tolerance = bbox_dict.get('tolerance', DEF_TOLERANCE)

    def admits(self, spn: dict) -> bool:
        """Check if the bounding box admits the span

        Argument
          spn: the span dict to be checked
        Returns
          False if the span doesn't match current bounding box setting
        """
        # A span without 'bbox' only passes when no edge is constrained.
        bbox = spn.get('bbox', (None, None, None, None))
        return (admits_float(self.left, bbox[0], self.tolerance) and
                admits_float(self.top, bbox[1], self.tolerance) and
                admits_float(self.right, bbox[2], self.tolerance) and
                admits_float(self.bottom, bbox[3], self.tolerance))


class ToCFilter:
    """Filter on span dictionary to pick out headings in the ToC"""
    # The level of the title, strictly > 0
    level: int
    # When set, the filter will be more *greedy* and extract all the text in
    # a block as soon as at least one match occurs
    greedy: bool
    font: FontFilter
    bbox: BoundingBoxFilter

    def __init__(self, fltr_dict: dict):
        """Build the filter from a filter dict

        Arguments
          fltr_dict: must contain 'level'; may contain 'greedy', 'font' and
                     'bbox'
        Raises
          ValueError if 'level' is missing or smaller than 1
        """
        lvl = fltr_dict.get('level')

        if lvl is None:
            raise ValueError("filter's 'level' is not set")
        if lvl < 1:
            raise ValueError("filter's 'level' must be >= 1")

        self.level = lvl
        self.greedy = fltr_dict.get('greedy', False)
        self.font = FontFilter(fltr_dict.get('font', {}))
        self.bbox = BoundingBoxFilter(fltr_dict.get('bbox', {}))

    def admits(self, spn: dict) -> bool:
        """Check if the filter admits the span

        Arguments
          spn: the span dict to be checked
        Returns
          False if the span doesn't match the filter
        """
        return self.font.admits(spn) and self.bbox.admits(spn)