db_query / utils /dump_excel.py
DavMelchi's picture
Handle dump headers on first or second row
7efc151
import re
from collections.abc import Sequence
import pandas as pd
def _normalize_header_value(value: object) -> str:
return str(value).strip().replace("\xa0", " ").lower()
def _is_numeric_like(value: str) -> bool:
return bool(re.fullmatch(r"[+-]?\d+(?:\.\d+)?", value))
def _header_row_score(values: Sequence[object], expected_columns: Sequence[str] | None) -> int:
normalized = [_normalize_header_value(value) for value in values]
non_empty = [value for value in normalized if value and value != "nan"]
expected = {
_normalize_header_value(column) for column in (expected_columns or []) if column
}
expected_matches = len(expected.intersection(non_empty))
text_like_count = sum(any(char.isalpha() for char in value) for value in non_empty)
numeric_like_count = sum(_is_numeric_like(value) for value in non_empty)
info_penalty = 6 if "info" in non_empty and len(non_empty) <= 2 else 0
return (
expected_matches * 100
+ text_like_count * 3
+ len(non_empty)
- numeric_like_count * 2
- info_penalty
)
def _probe_sheet_name(sheet_name) -> str | int:
if isinstance(sheet_name, (str, int)):
return sheet_name
if isinstance(sheet_name, Sequence) and sheet_name:
return sheet_name[0]
raise ValueError("sheet_name must be a sheet label or a non-empty sequence of sheets")
def detect_dump_header_row(
file_path,
sheet_name,
*,
expected_columns: Sequence[str] | None = None,
engine: str = "calamine",
) -> int:
"""
Detect whether a dump sheet header is on the first or second row.
"""
probe_sheet = _probe_sheet_name(sheet_name)
if hasattr(file_path, "seek"):
file_path.seek(0)
preview = pd.read_excel(
file_path,
sheet_name=probe_sheet,
engine=engine,
header=None,
nrows=2,
)
if hasattr(file_path, "seek"):
file_path.seek(0)
if preview.empty:
return 0
best_row = 0
best_score = _header_row_score(preview.iloc[0].tolist(), expected_columns)
max_candidate_row = min(len(preview.index), 2)
for row_index in range(1, max_candidate_row):
score = _header_row_score(preview.iloc[row_index].tolist(), expected_columns)
if score > best_score:
best_row = row_index
best_score = score
return best_row
def read_dump_excel(
file_path,
*,
sheet_name,
expected_columns: Sequence[str] | None = None,
engine: str = "calamine",
**kwargs,
):
"""
Read dump sheets while supporting headers placed on row 1 or row 2.
"""
if "header" in kwargs or "skiprows" in kwargs:
raise ValueError("read_dump_excel manages header detection internally")
header_row = detect_dump_header_row(
file_path,
sheet_name,
expected_columns=expected_columns,
engine=engine,
)
if hasattr(file_path, "seek"):
file_path.seek(0)
return pd.read_excel(
file_path,
sheet_name=sheet_name,
engine=engine,
header=header_row,
**kwargs,
)