Spaces:

DavMelchi
/

db_query

Running

App Files Files Community

db_query / utils /dump_excel.py

DavMelchi

Handle dump headers on first or second row

7efc151 about 1 month ago

raw

history blame contribute delete

3.15 kB

	import re
	from collections.abc import Sequence

	import pandas as pd


	def _normalize_header_value(value: object) -> str:
	return str(value).strip().replace("\xa0", " ").lower()


	def _is_numeric_like(value: str) -> bool:
	return bool(re.fullmatch(r"[+-]?\d+(?:\.\d+)?", value))


	def _header_row_score(values: Sequence[object], expected_columns: Sequence[str] \| None) -> int:
	normalized = [_normalize_header_value(value) for value in values]
	non_empty = [value for value in normalized if value and value != "nan"]
	expected = {
	_normalize_header_value(column) for column in (expected_columns or []) if column
	}

	expected_matches = len(expected.intersection(non_empty))
	text_like_count = sum(any(char.isalpha() for char in value) for value in non_empty)
	numeric_like_count = sum(_is_numeric_like(value) for value in non_empty)
	info_penalty = 6 if "info" in non_empty and len(non_empty) <= 2 else 0

	return (
	expected_matches * 100
	+ text_like_count * 3
	+ len(non_empty)
	- numeric_like_count * 2
	- info_penalty
	)


	def _probe_sheet_name(sheet_name) -> str \| int:
	if isinstance(sheet_name, (str, int)):
	return sheet_name
	if isinstance(sheet_name, Sequence) and sheet_name:
	return sheet_name[0]
	raise ValueError("sheet_name must be a sheet label or a non-empty sequence of sheets")


	def detect_dump_header_row(
	file_path,
	sheet_name,
	*,
	expected_columns: Sequence[str] \| None = None,
	engine: str = "calamine",
	) -> int:
	"""
	Detect whether a dump sheet header is on the first or second row.
	"""
	probe_sheet = _probe_sheet_name(sheet_name)

	if hasattr(file_path, "seek"):
	file_path.seek(0)

	preview = pd.read_excel(
	file_path,
	sheet_name=probe_sheet,
	engine=engine,
	header=None,
	nrows=2,
	)

	if hasattr(file_path, "seek"):
	file_path.seek(0)

	if preview.empty:
	return 0

	best_row = 0
	best_score = _header_row_score(preview.iloc[0].tolist(), expected_columns)

	max_candidate_row = min(len(preview.index), 2)
	for row_index in range(1, max_candidate_row):
	score = _header_row_score(preview.iloc[row_index].tolist(), expected_columns)
	if score > best_score:
	best_row = row_index
	best_score = score

	return best_row


	def read_dump_excel(
	file_path,
	*,
	sheet_name,
	expected_columns: Sequence[str] \| None = None,
	engine: str = "calamine",
	**kwargs,
	):
	"""
	Read dump sheets while supporting headers placed on row 1 or row 2.
	"""
	if "header" in kwargs or "skiprows" in kwargs:
	raise ValueError("read_dump_excel manages header detection internally")

	header_row = detect_dump_header_row(
	file_path,
	sheet_name,
	expected_columns=expected_columns,
	engine=engine,
	)

	if hasattr(file_path, "seek"):
	file_path.seek(0)

	return pd.read_excel(
	file_path,
	sheet_name=sheet_name,
	engine=engine,
	header=header_row,
	**kwargs,
	)