| |
|
|
| from typing import List, Dict, Any |
| import re |
|
|
| |
# Phrases that mark a receipt line as boilerplate (greetings, company
# suffixes, payment / total summaries) rather than a purchased item.
# Entries are expected to be lowercase: they are matched against lowercased
# row text.
EXCLUDE_PHRASES = [
    "thank you", "thank", "goods sold", "not returnable", "returnable",
    "shopping at", "visit again", "customer copy", "merchant copy",
    "powered by", "terms and conditions", "t&c apply", "cashier",
    "counter", "sdn bhd", "bhd", "pte ltd", "pvt ltd", "llc", "inc",
    "gst summary", "tax summary", "payment", "change", "cash",
    "credit card", "debit card", "subtotal", "sub total", "grand total",
    "total includes", "includes gst", "tax invoice", "invoice",
]

# Tokens whose bounding box marks the table header line.
_HEADER_KEYWORDS = frozenset({
    "description", "item", "particulars", "qty", "quantity",
    "price", "amount", "rate", "uom", "unit",
})

# Tokens whose bounding box marks the start of the footer.
# NOTE(review): tokens are compared one at a time, so the multi-word entries
# can only match if a single element of `words` contains the whole phrase;
# they are kept for compatibility with the original keyword list.
_FOOTER_KEYWORDS = frozenset({
    "total", "subtotal", "tax", "grand total", "payment",
    "cash", "change", "gst summary", "tax summary",
})

# Words whose top edges differ by less than this from the first word of the
# current row are treated as belonging to the same row.
_ROW_Y_TOLERANCE = 15

# A numeric amount: either comma-grouped thousands ("1,234") or a plain digit
# run ("1234"), optionally followed by a decimal part. The plain-run
# alternative keeps "1234.50" in one piece instead of splitting it into
# "1234" and "50".
_NUMBER_RE = re.compile(r"(?:\d{1,3}(?:,\d{3})+|\d+)(?:\.\d*)?")


def _is_boilerplate(text_lower: str) -> bool:
    """Return True if the lowercased row text contains an excluded phrase as whole words."""
    # Whole-word matching keeps "cash" from rejecting "cashew" and "inc" from
    # rejecting "minced". EXCLUDE_PHRASES is read on every call so runtime
    # changes to the list are honoured.
    return any(
        re.search(r"(?<!\w)" + re.escape(phrase) + r"(?!\w)", text_lower)
        for phrase in EXCLUDE_PHRASES
    )


def _group_into_rows(table_words: List[Dict[str, Any]]) -> List[List[Dict[str, Any]]]:
    """Cluster words into rows by top-y proximity; each row is ordered left to right."""
    ordered = sorted(table_words, key=lambda w: w["box"][1])
    rows: List[List[Dict[str, Any]]] = []
    row = [ordered[0]]
    row_y = ordered[0]["box"][1]

    for entry in ordered[1:]:
        top = entry["box"][1]
        if abs(top - row_y) < _ROW_Y_TOLERANCE:
            row.append(entry)
            continue
        # The reference y is the first word of the row, not a running average.
        rows.append(sorted(row, key=lambda w: w["box"][0]))
        row = [entry]
        row_y = top

    rows.append(sorted(row, key=lambda w: w["box"][0]))
    return rows


def extract_table_items(words: List[str], boxes: List[List[int]]) -> List[Dict[str, Any]]:
    """
    Extract line items from OCR words using a geometric heuristic.

    Steps:
    1. Find the header y-position: the lowest bottom edge of any header
       keyword ('Description', 'Item', 'Qty', ...). If none is found, fall
       back to 25% of the largest top-y among the boxes.
    2. Find the footer y-position: the highest footer keyword ('Total',
       'Tax', ...) located below the header. If none is found, fall back to
       75% of the largest top-y.
    3. Keep only words lying strictly between header and footer.
    4. Group those words into rows by similar y, ordered left to right.
    5. For each row, skip boilerplate and rows without a positive number; the
       last number in the row becomes the amount and the remaining text the
       description.

    Args:
        words: OCR tokens.
        boxes: One box per token, unpacked as (x, y, width, height). If the
            two lists differ in length, only the leading pairs are used.

    Returns:
        A list of dicts with keys "description", "quantity" (always 1),
        "unit_price" and "total" (both set to the row's last number).
    """
    if not words or not boxes:
        return []

    # zip() tolerates a length mismatch instead of raising IndexError.
    tokens = list(zip(words, boxes))
    max_top = max(box[1] for box in boxes)

    # --- Header: lowest bottom edge among header keywords ------------------
    header_bottoms = [
        box[1] + box[3] for word, box in tokens if word.lower() in _HEADER_KEYWORDS
    ]
    header_y = max([0] + header_bottoms)
    if header_y == 0:
        # Resolve the fallback BEFORE searching for the footer, so that a
        # footer keyword near the top of the page (e.g. the "Tax" of
        # "Tax Invoice") is not picked up as the footer.
        header_y = max_top * 0.25

    # --- Footer: highest footer keyword located below the header -----------
    footer_tops = [
        box[1]
        for word, box in tokens
        if word.lower() in _FOOTER_KEYWORDS and box[1] > header_y
    ]
    footer_y = min(footer_tops) if footer_tops else max_top * 0.75

    # --- Words strictly between header and footer ---------------------------
    table_words = []
    for word, box in tokens:
        _, top, _, height = box
        if top > header_y and (top + height) < footer_y:
            table_words.append({"text": word, "box": box})

    if not table_words:
        return []

    # --- Rows -> structured items -------------------------------------------
    structured_items = []
    for row in _group_into_rows(table_words):
        full_text = " ".join(w["text"] for w in row)

        if _is_boilerplate(full_text.lower()):
            continue
        if len(full_text.strip()) < 3:
            continue

        matches = list(_NUMBER_RE.finditer(full_text))
        if not matches:
            # A row without any number cannot be a priced line item.
            continue

        last = matches[-1]
        try:
            value = float(last.group().replace(",", ""))
        except ValueError:
            continue
        if value <= 0:
            continue

        # Cut out only the matched amount; a plain str.replace() would also
        # delete identical digits elsewhere in the row (e.g. the 4 in "A4").
        description = (full_text[:last.start()] + full_text[last.end():]).strip()
        if len(description) < 2:
            continue

        structured_items.append({
            "description": description,
            "quantity": 1,
            "unit_price": value,
            "total": value,
        })

    return structured_items