| | import re |
| | import pandas as pd |
| | import numpy as np |
| | import streamlit as st |
| |
|
| | |
| |
|
| | @st.cache_data |
| | def clean_dataframe_fallback(df): |
| | """Hardcoded data cleaning pipeline""" |
| | |
| | """Generic data cleaning pipeline with categorical preservation""" |
| | df_cleaned = df.copy() |
| |
|
| |
|
| | df_cleaned = df_cleaned.applymap( |
| | lambda x: re.sub(r"\(.*?\)", "", str(x)) if isinstance(x, str) else x) |
| |
|
| | |
| | df_cleaned = df_cleaned.applymap( |
| | lambda x: re.sub(r"ref\.", "", str(x), flags=re.IGNORECASE) if isinstance(x, str) else x) |
| | |
| | |
| | df_cleaned = df_cleaned.applymap( |
| | lambda x: re.sub(r"[^\w\s\d\.]", "", str(x)).strip() if isinstance(x, str) else x |
| | ) |
| |
|
| |
|
| | |
| | df_cleaned.columns = [col.strip().lower().replace(' ', '_') for col in df_cleaned.columns] |
| | |
| | |
| | measurement_units = { |
| | 'weight': r'\s*(kg|kilograms|lbs|pounds)$', |
| | 'height': r'\s*(cm|centimeters|inches|feet|ft)$' |
| | } |
| |
|
| |
|
| | |
| | |
| | preserve_pattern = re.compile(r'(name|brand|model|type|category|region|text|desc|color|size)', re.IGNORECASE) |
| | preserved_cols = [col for col in df_cleaned.columns if preserve_pattern.search(col)] |
| | |
| | |
| | id_pattern = re.compile(r'(_id|id_|num|no|number|identifier|code|idx|row)', re.IGNORECASE) |
| | id_cols = [col for col in df_cleaned.columns if id_pattern.search(col) and col not in preserved_cols] |
| | |
| | |
| | unique_cols = [col for col in df_cleaned.columns |
| | if df_cleaned[col].nunique() == len(df_cleaned) |
| | and col not in preserved_cols] |
| | |
| | redundant_cols = list(set(id_cols + unique_cols)) |
| | df_cleaned = df_cleaned.drop(columns=redundant_cols) |
| | print(f"Removed {len(redundant_cols)} redundant columns: {redundant_cols}") |
| |
|
| | |
| | for col in df_cleaned.columns: |
| | if col in preserved_cols: |
| | print(f"Preserving categorical column: {col}") |
| | continue |
| |
|
| | if any(unit in col for unit in measurement_units.keys()): |
| | pattern = measurement_units.get(col.split('_')[0], r'') |
| | df_cleaned[col] = df_cleaned[col].astype(str).str.replace(pattern, '', regex=True).str.strip() |
| | |
| |
|
| |
|
| | if pd.api.types.is_numeric_dtype(df_cleaned[col]): |
| | continue |
| | |
| | |
| | non_null_count = df_cleaned[col].dropna().shape[0] |
| | sample_size = min(100, non_null_count) |
| | sample = df_cleaned[col].dropna().sample(sample_size, random_state=42) |
| | numeric_pattern = r'^[-+]?\d*\.?\d+$' |
| | num_matches = sample.astype(str).str.fullmatch(numeric_pattern).mean() |
| | |
| | if num_matches > 0.8: |
| | |
| | cleaned = df_cleaned[col].replace(r'[^\d\.\-]', '', regex=True) |
| | converted = pd.to_numeric(cleaned, errors='coerce') |
| | success_rate = converted.notna().mean() |
| | |
| | if success_rate > 0.9: |
| | df_cleaned[col] = converted |
| | print(f"Converted {col} to numeric (success: {success_rate:.1%})") |
| |
|
| | |
| | date_cols = [] |
| | for col in df_cleaned.select_dtypes(exclude=np.number).columns: |
| | if col in preserved_cols: |
| | continue |
| | try: |
| | df_cleaned[col] = pd.to_datetime(df_cleaned[col], errors='raise') |
| | date_cols.append(col) |
| | print(f"Detected datetime: {col}") |
| | except: |
| | pass |
| |
|
| | |
| | currency_cols = [col for col in df_cleaned.columns if any(keyword in col.lower() for keyword in ["price", "gross", "budget"])] |
| | for col in currency_cols: |
| | df_cleaned[col] = df_cleaned[col].astype(str).str.replace(r'[^\d\.]', '', regex=True) |
| | df_cleaned[col] = pd.to_numeric(df_cleaned[col], errors='coerce') |
| |
|
| |
|
| |
|
| | |
| | numeric_cols = df_cleaned.select_dtypes(include=np.number).columns |
| | categorical_cols = df_cleaned.select_dtypes(exclude=np.number).columns |
| | |
| | |
| | for col in numeric_cols: |
| | if df_cleaned[col].isna().any(): |
| | df_cleaned[f'{col}_missing'] = df_cleaned[col].isna().astype(int) |
| | df_cleaned[col].fillna(df_cleaned[col].median(), inplace=True) |
| | |
| | |
| | for col in categorical_cols: |
| | if df_cleaned[col].isna().any(): |
| | mode_val = df_cleaned[col].mode()[0] if not df_cleaned[col].mode().empty else 'Unknown' |
| | df_cleaned[col] = df_cleaned[col].fillna(mode_val) |
| |
|
| | |
| | text_cols = [col for col in categorical_cols if col not in preserved_cols] |
| | for col in text_cols: |
| | df_cleaned[col] = df_cleaned[col].astype(str).apply(lambda x: re.sub(r'\s+', ' ', re.sub(r'[^\w\s]', '', x)).strip().lower()) |
| | |
| | |
| | |
| | numeric_cols = df_cleaned.select_dtypes(include=np.number).columns |
| | for col in numeric_cols: |
| | if df_cleaned[col].nunique() > 10: |
| | q1 = df_cleaned[col].quantile(0.05) |
| | q3 = df_cleaned[col].quantile(0.95) |
| | df_cleaned[col] = np.clip(df_cleaned[col], q1, q3) |
| |
|
| | |
| | df_cleaned = df_cleaned.drop_duplicates().reset_index(drop=True) |
| | |
| | return df_cleaned |
| | |