""" Layer 1A: Phase Statistics - Extract 6 stats (mean, std, min, max, AUC, slope) per signal per phase. - Total per batch: 8 phases * 6 signals * 6 stats = 288 features. """ import pandas as pd import numpy as np import pickle import os from scipy.stats import linregress import sys # Add parent directory to path for config import sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from config import CFG def extract_phase_stats_single(batch_df: pd.DataFrame, batch_id: str) -> pd.Series: """ Extracts 288 features for a single batch. Handles missing phases by returning zeros. """ feats = {} for phase in CFG.PHASES: p_df = batch_df[batch_df["Phase"] == phase] p_short = phase[:4] for sig in CFG.STAT_SIGNALS: s_short = sig[:8] prefix = f"{p_short}_{s_short}" if p_df.empty: # Rule: Phase has 0 rows: return all zeros feats[f"{prefix}_mean"] = 0.0 feats[f"{prefix}_std"] = 0.0 feats[f"{prefix}_min"] = 0.0 feats[f"{prefix}_max"] = 0.0 feats[f"{prefix}_auc"] = 0.0 feats[f"{prefix}_slope"] = 0.0 continue data = p_df[sig].values time = p_df["Time_Minutes"].values # Fill NaNs with 0 for internal calculations if any exist data = np.nan_to_num(data, nan=0.0) feats[f"{prefix}_mean"] = float(np.mean(data)) feats[f"{prefix}_min"] = float(np.min(data)) feats[f"{prefix}_max"] = float(np.max(data)) if len(data) == 1: # Rule: Phase has 1 row: std=0, slope=0, auc=value feats[f"{prefix}_std"] = 0.0 feats[f"{prefix}_auc"] = float(data[0]) feats[f"{prefix}_slope"] = 0.0 else: feats[f"{prefix}_std"] = float(np.std(data)) # AUC using trapezoid (NumPy 2.0 compatible) try: feats[f"{prefix}_auc"] = float(np.trapezoid(data)) except AttributeError: feats[f"{prefix}_auc"] = float(np.trapz(data)) # Slope slope = 0.0 if len(data) >= 2: # Final check: if all values are same, linregress might be unstable if np.all(data == data[0]): slope = 0.0 else: res = linregress(time, data) slope = res.slope if not np.isnan(res.slope) else 0.0 feats[f"{prefix}_slope"] = slope return pd.Series(feats, name=batch_id) def main(): print(">>> Starting Layer 1A: Phase Statistics") with open(os.path.join(CFG.PROC_DIR, "process_clean.pkl"), "rb") as f: df = pickle.load(f) batch_ids = df["Batch_ID"].unique() all_feats = [] for bid in batch_ids: b_df = df[df["Batch_ID"] == bid] all_feats.append(extract_phase_stats_single(b_df, bid)) df_feats = pd.concat(all_feats, axis=1).T df_feats.index.name = "Batch_ID" output_path = os.path.join(CFG.PROC_DIR, "phase_stats.pkl") with open(output_path, "wb") as f: pickle.dump(df_feats, f) print("="*60) print(f"✅ LAYER 1A COMPLETE") print(f" Output shape: {df_feats.shape}") print(f" NaN count: {df_feats.isna().sum().sum()}") print(f" Output file: {output_path}") print("="*60) if __name__ == "__main__": main()