| import re |
| from collections.abc import Sequence |
|
|
| import pandas as pd |
|
|
|
|
| def _normalize_header_value(value: object) -> str: |
| return str(value).strip().replace("\xa0", " ").lower() |
|
|
|
|
| def _is_numeric_like(value: str) -> bool: |
| return bool(re.fullmatch(r"[+-]?\d+(?:\.\d+)?", value)) |
|
|
|
|
| def _header_row_score(values: Sequence[object], expected_columns: Sequence[str] | None) -> int: |
| normalized = [_normalize_header_value(value) for value in values] |
| non_empty = [value for value in normalized if value and value != "nan"] |
| expected = { |
| _normalize_header_value(column) for column in (expected_columns or []) if column |
| } |
|
|
| expected_matches = len(expected.intersection(non_empty)) |
| text_like_count = sum(any(char.isalpha() for char in value) for value in non_empty) |
| numeric_like_count = sum(_is_numeric_like(value) for value in non_empty) |
| info_penalty = 6 if "info" in non_empty and len(non_empty) <= 2 else 0 |
|
|
| return ( |
| expected_matches * 100 |
| + text_like_count * 3 |
| + len(non_empty) |
| - numeric_like_count * 2 |
| - info_penalty |
| ) |
|
|
|
|
| def _probe_sheet_name(sheet_name) -> str | int: |
| if isinstance(sheet_name, (str, int)): |
| return sheet_name |
| if isinstance(sheet_name, Sequence) and sheet_name: |
| return sheet_name[0] |
| raise ValueError("sheet_name must be a sheet label or a non-empty sequence of sheets") |
|
|
|
|
| def detect_dump_header_row( |
| file_path, |
| sheet_name, |
| *, |
| expected_columns: Sequence[str] | None = None, |
| engine: str = "calamine", |
| ) -> int: |
| """ |
| Detect whether a dump sheet header is on the first or second row. |
| """ |
| probe_sheet = _probe_sheet_name(sheet_name) |
|
|
| if hasattr(file_path, "seek"): |
| file_path.seek(0) |
|
|
| preview = pd.read_excel( |
| file_path, |
| sheet_name=probe_sheet, |
| engine=engine, |
| header=None, |
| nrows=2, |
| ) |
|
|
| if hasattr(file_path, "seek"): |
| file_path.seek(0) |
|
|
| if preview.empty: |
| return 0 |
|
|
| best_row = 0 |
| best_score = _header_row_score(preview.iloc[0].tolist(), expected_columns) |
|
|
| max_candidate_row = min(len(preview.index), 2) |
| for row_index in range(1, max_candidate_row): |
| score = _header_row_score(preview.iloc[row_index].tolist(), expected_columns) |
| if score > best_score: |
| best_row = row_index |
| best_score = score |
|
|
| return best_row |
|
|
|
|
| def read_dump_excel( |
| file_path, |
| *, |
| sheet_name, |
| expected_columns: Sequence[str] | None = None, |
| engine: str = "calamine", |
| **kwargs, |
| ): |
| """ |
| Read dump sheets while supporting headers placed on row 1 or row 2. |
| """ |
| if "header" in kwargs or "skiprows" in kwargs: |
| raise ValueError("read_dump_excel manages header detection internally") |
|
|
| header_row = detect_dump_header_row( |
| file_path, |
| sheet_name, |
| expected_columns=expected_columns, |
| engine=engine, |
| ) |
|
|
| if hasattr(file_path, "seek"): |
| file_path.seek(0) |
|
|
| return pd.read_excel( |
| file_path, |
| sheet_name=sheet_name, |
| engine=engine, |
| header=header_row, |
| **kwargs, |
| ) |
|
|