| import pandas as pd |
| import numpy as np |
| import streamlit as st |
| import joblib |
| from pathlib import Path |
|
|
# Page chrome: this is the first Streamlit call made by the script.
st.set_page_config(
    page_title='Clustering Predictor (GMM)',
    page_icon='🧩',
    layout='centered',
)

st.title('🧩 Clustering Predictor (GMM)')
st.write('Single-row cluster prediction using saved preprocessing: StandardScaler → PCA → GaussianMixture.')


# Artifact locations are resolved relative to this script file rather than
# the current working directory.
BASE_DIR = Path(__file__).resolve().parent

FEATURES_PATH = BASE_DIR.joinpath('feature_names.pkl')
SCALER_PATH = BASE_DIR.joinpath('scaler.pkl')
PCA_PATH = BASE_DIR.joinpath('pca.pkl')
GMM_PATH = BASE_DIR.joinpath('gmm_model.pkl')
|
|
@st.cache_resource
def load_assets():
    """Load the saved feature list and the fitted scaler, PCA and GMM objects.

    Wrapped in ``st.cache_resource`` so the loaded objects can be reused
    across script reruns instead of being re-read from disk each time.

    Returns:
        A ``(feature_names, scaler, pca, model)`` tuple, in that order.

    Raises:
        FileNotFoundError: If any of the four ``.pkl`` files does not exist.
        ValueError: If the loaded artifacts disagree on dimensionality:
            the feature list vs. the PCA or scaler input width, or the PCA
            output width vs. the GMM input width.
    """
    required = (FEATURES_PATH, SCALER_PATH, PCA_PATH, GMM_PATH)
    missing = [p.name for p in required if not p.exists()]
    if missing:
        raise FileNotFoundError(f'Missing files in repo root: {missing}. Put them next to app.py.')

    # NOTE: joblib.load unpickles arbitrary Python objects. These files must
    # come from a trusted training run, never from user-supplied uploads.
    feature_names = joblib.load(FEATURES_PATH)
    scaler = joblib.load(SCALER_PATH)
    pca = joblib.load(PCA_PATH)
    model = joblib.load(GMM_PATH)

    n_features = len(feature_names)

    # Both the scaler and the PCA operate on vectors as wide as the feature
    # list. PCA is checked first so its existing error wording is unchanged;
    # the scaler check catches a stale scaler.pkl before prediction time.
    input_stages = (
        ('PCA', pca, 'pca.pkl'),
        ('the scaler', scaler, 'scaler.pkl'),
    )
    for label, estimator, filename in input_stages:
        expected = getattr(estimator, 'n_features_in_', None)
        if expected is not None and n_features != int(expected):
            raise ValueError(
                f'Feature mismatch: feature_names has {n_features} features, '
                f'but {label} expects {int(expected)}. '
                f'Re-export feature_names.pkl and {filename} from the same training run.'
            )

    # The GMM is fed the PCA output, so its input width must equal the number
    # of PCA components. Skipped when either attribute is unavailable.
    pca_out = getattr(pca, 'n_components_', None)
    gmm_in = getattr(model, 'n_features_in_', None)
    if pca_out is not None and gmm_in is not None and int(pca_out) != int(gmm_in):
        raise ValueError(
            f'Dimension mismatch: PCA outputs {int(pca_out)} components, '
            f'but the GMM expects {int(gmm_in)} features. '
            'Re-export pca.pkl and gmm_model.pkl from the same training run.'
        )

    return feature_names, scaler, pca, model
|
|
# Load everything up front; on any failure show the message in the page and
# halt the script so the widgets below never run without a model.
try:
    feature_names, scaler, pca, model = load_assets()
except Exception as load_error:
    st.error(f'{load_error}')
    st.stop()
|
|
def predict_cluster(values_dict: dict) -> int:
    """Predict the cluster label for a single row of feature values.

    Uses the module-level ``feature_names``, ``scaler``, ``pca`` and ``model``.

    Args:
        values_dict: Mapping of feature name to value. Columns are taken from
            ``feature_names``, so the row follows that order.

    Returns:
        The predicted cluster label as a plain ``int``.

    Raises:
        ValueError: If any column is NaN after numeric coercion.
    """
    row = pd.DataFrame([values_dict], columns=feature_names)

    # Coerce each column to a numeric dtype; unparseable entries become NaN
    # and are reported by the check below.
    for name in row.columns:
        row[name] = pd.to_numeric(row[name], errors='coerce')

    nan_columns = row.isna().any()
    if nan_columns.any():
        bad = row.columns[nan_columns].tolist()
        raise ValueError(f'NaN values found in columns: {bad}. Please provide valid numeric values.')

    # Apply the stages in order: scale, project with PCA, then assign a cluster.
    projected = pca.transform(scaler.transform(row))
    labels = model.predict(projected)
    return int(labels[0])
|
|
st.subheader('🧮 Single Prediction')
st.caption('Tip: Use a real row from your dataset for realistic values (all zeros may be unrealistic).')


with st.form('single_pred_form'):
    cols = st.columns(2)
    values = {}

    # Alternate the inputs between the two columns: even index goes to the
    # first column, odd index to the second.
    for position, feature in enumerate(feature_names):
        target_column = cols[position % 2]
        values[feature] = target_column.number_input(feature, value=0.0)

    submitted = st.form_submit_button('Predict cluster')


# Run the prediction only when the form was submitted; any error is shown
# in the page instead of propagating.
if submitted:
    try:
        cluster_id = predict_cluster(values)
        st.success(f'✅ Predicted cluster: **{cluster_id}**')
    except Exception as prediction_error:
        st.error(f'{prediction_error}')
|
|
# Collapsible reference: the feature columns the model expects.
with st.expander('Show expected feature columns'):
    st.write(feature_names)


# Collapsible diagnostics; 'NA' is shown when the PCA object lacks an attribute.
with st.expander('Debug shapes (advanced)'):
    for caption, shown_value in (
        ('Number of input features:', len(feature_names)),
        ('PCA expects n_features_in_:', getattr(pca, 'n_features_in_', 'NA')),
        ('PCA output components:', getattr(pca, 'n_components_', 'NA')),
    ):
        st.write(caption, shown_value)