Spaces:
Runtime error
Runtime error
| #!/usr/bin/env python3 | |
| import os | |
| from datetime import datetime | |
| from pathlib import Path | |
| from typing import Optional, Tuple, Union | |
| import gradio as gr | |
| import numpy as np | |
| import pandas as pd | |
| from sklearn.compose import ColumnTransformer | |
| from sklearn.linear_model import LogisticRegression | |
| from sklearn.metrics import ( | |
| average_precision_score, | |
| brier_score_loss, | |
| roc_auc_score, | |
| ) | |
| from sklearn.model_selection import train_test_split | |
| from sklearn.pipeline import Pipeline | |
| from sklearn.preprocessing import OneHotEncoder | |
# Resolve repository root relative to this file (the script lives one level
# below the repo root, so parent.parent is the root).
BASE_DIR = Path(__file__).resolve().parent.parent

# Output directory for generated prediction CSVs, relative to the repo root.
SCORES_DIR = BASE_DIR / "scores"

# Preset example dataset paths (bundled synthetic_v2 demo data).
PRESET_FEATURES = BASE_DIR / "examples" / "synthetic_v2" / "leads_features.csv"
PRESET_OUTCOMES = BASE_DIR / "examples" / "synthetic_v2" / "outcomes.csv"

# Candidate model features; mirrors scripts/batch_scoring.py so the dashboard
# and the batch job score leads identically. Only the columns actually present
# in the provided features CSV are used (see train_and_score).
FEATURE_CANDIDATES = [
    "living_area_sqft",
    "average_monthly_kwh",
    "average_monthly_bill_usd",
    "shading_factor",
    "roof_suitability_score",
    "seasonality_index",
    "electric_panel_amperage",
    "has_pool",
    "is_remote_worker_household",
    "tdsp",
    "rate_structure",
    "credit_score_range",
    "household_income_bracket",
    "preferred_financing_type",
    "neighborhood_type",
]

# Subset of FEATURE_CANDIDATES treated as categorical (one-hot encoded);
# everything else is passed through as numeric.
CATEGORICAL = [
    "tdsp",
    "rate_structure",
    "credit_score_range",
    "household_income_bracket",
    "preferred_financing_type",
    "neighborhood_type",
]
| def _safe_path(file_or_path: Optional[Union[str, gr.File]]) -> Optional[str]: | |
| """ | |
| Convert a gradio File object or string path to a usable string path. | |
| """ | |
| if file_or_path is None: | |
| return None | |
| if isinstance(file_or_path, str): | |
| return file_or_path | |
| # gradio File component returns a tempfile object-like with .name | |
| if hasattr(file_or_path, "name"): | |
| return file_or_path.name | |
| # Some versions return a dict with 'name' | |
| if isinstance(file_or_path, dict) and "name" in file_or_path: | |
| return file_or_path["name"] | |
| return None | |
| def _validate_inputs(df_features: pd.DataFrame, df_outcomes: pd.DataFrame) -> None: | |
| if "lead_id" not in df_features.columns: | |
| raise ValueError("Features CSV must contain a 'lead_id' column.") | |
| if "lead_id" not in df_outcomes.columns: | |
| raise ValueError("Outcomes CSV must contain a 'lead_id' column.") | |
| if "sold" not in df_outcomes.columns: | |
| raise ValueError("Outcomes CSV must contain a 'sold' column (0/1).") | |
def _compute_metrics(
    y_true: np.ndarray, y_prob: np.ndarray
) -> Tuple[Optional[float], Optional[float], Optional[float]]:
    """
    Compute (ROC AUC, PR AUC, Brier score) for binary predictions.

    Each metric degrades to None instead of raising: the Brier score falls
    back to None on any scoring error, and both AUC variants additionally
    require that y_true contain both classes (they are undefined otherwise).
    """
    # If the labels cannot be coerced to int, every metric is undefined —
    # the original per-metric try/except blocks all collapsed to None here.
    try:
        labels = y_true.astype(int)
    except Exception:
        return None, None, None

    # AUC-style metrics need at least one positive and one negative example.
    has_both_classes = len(np.unique(labels)) >= 2

    # Brier score is defined even when only one class is present.
    try:
        brier: Optional[float] = float(brier_score_loss(labels, y_prob))
    except Exception:
        brier = None

    auc: Optional[float] = None
    if has_both_classes:
        try:
            auc = float(roc_auc_score(labels, y_prob))
        except Exception:
            auc = None

    pr_auc: Optional[float] = None
    if has_both_classes:
        try:
            pr_auc = float(average_precision_score(labels, y_prob))
        except Exception:
            pr_auc = None

    return auc, pr_auc, brier
def train_and_score(
    mode: str,
    features_file: Optional[Union[str, gr.File]],
    outcomes_file: Optional[Union[str, gr.File]],
):
    """
    Train a baseline LogisticRegression lead-scoring model and score leads.

    mode: "Use example synthetic_v2" or "Upload CSVs"
    Returns:
        - metrics_markdown (str)
        - preds_preview (pd.DataFrame)
        - scored_preview (pd.DataFrame)
        - predictions_file (str path)
        - scored_file (str path)

    On any error the function returns an error-markdown string plus empty
    placeholders instead of raising, so the gradio UI degrades gracefully.
    """
    try:
        # Resolve input paths: bundled preset dataset, or user uploads.
        if mode == "Use example synthetic_v2":
            features_path = PRESET_FEATURES
            outcomes_path = PRESET_OUTCOMES
            if not features_path.exists() or not outcomes_path.exists():
                raise FileNotFoundError(
                    f"Preset files not found. Expected:\n- {PRESET_FEATURES}\n- {PRESET_OUTCOMES}"
                )
        else:
            f_path = _safe_path(features_file)
            o_path = _safe_path(outcomes_file)
            if not f_path or not o_path:
                raise ValueError("Please upload BOTH Features CSV and Outcomes CSV.")
            features_path = Path(f_path)
            outcomes_path = Path(o_path)
            if not features_path.exists():
                raise FileNotFoundError(f"Features file not found: {features_path}")
            if not outcomes_path.exists():
                raise FileNotFoundError(f"Outcomes file not found: {outcomes_path}")

        X = pd.read_csv(features_path)
        y_df = pd.read_csv(outcomes_path)[["lead_id", "sold"]]
        _validate_inputs(X, y_df)
        # Inner join: only leads with a known outcome participate in training.
        df = X.merge(y_df, on="lead_id", how="inner")

        # Select candidate features actually present in this dataset.
        available = [c for c in FEATURE_CANDIDATES if c in df.columns]
        if not available:
            raise ValueError(
                "No candidate features found in features CSV. "
                f"Expected any of: {', '.join(FEATURE_CANDIDATES)}"
            )
        numeric = [c for c in available if c not in CATEGORICAL]
        cat_cols = [c for c in available if c in CATEGORICAL]

        # Numeric columns pass through untouched; categoricals are one-hot
        # encoded (categories unseen at fit time are ignored at scoring time).
        preproc = ColumnTransformer(
            transformers=[
                ("num", "passthrough", numeric),
                ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
            ],
            remainder="drop",
        )
        model = LogisticRegression(max_iter=5000)
        pipe = Pipeline(steps=[("pre", preproc), ("clf", model)])

        y = df["sold"].astype(int)
        # Only stratify the train/test split if both classes are present;
        # stratify raises on a single-class target.
        if len(np.unique(y)) >= 2:
            train_df, test_df = train_test_split(
                df, test_size=0.25, random_state=42, stratify=y
            )
        else:
            train_df, test_df = train_test_split(
                df, test_size=0.25, random_state=42, stratify=None
            )
        pipe.fit(train_df[available], train_df["sold"].astype(int))

        # Evaluate on the held-out split (column 1 = P(sold == 1)).
        test_probs = pipe.predict_proba(test_df[available])[:, 1]
        auc, pr_auc, brier = _compute_metrics(test_df["sold"].values, test_probs)

        # Score all rows (train + test) for the exported predictions file.
        all_probs = pipe.predict_proba(df[available])[:, 1]
        preds = df[["lead_id"]].copy()
        preds["probability_to_buy"] = np.round(all_probs, 4)

        # Persist outputs under SCORES_DIR with timestamped filenames.
        SCORES_DIR.mkdir(parents=True, exist_ok=True)
        ts = datetime.now().strftime("%Y%m%d_%H%M%S")
        predictions_path = SCORES_DIR / f"predictions_{ts}.csv"
        scored_path = SCORES_DIR / f"leads_features_scored_{ts}.csv"
        preds.to_csv(predictions_path, index=False)
        # Left join back onto the full features frame so leads without an
        # outcome row still appear in the scored export (with a NaN score).
        scored = X.merge(preds, on="lead_id", how="left")
        scored.to_csv(scored_path, index=False)

        # Prepare outputs for the UI.
        def fmt(val: Optional[float]) -> str:
            # Metrics can be None when undefined (e.g. single-class test set).
            return f"{val:.3f}" if val is not None else "N/A"

        metrics_md = (
            "### Evaluation Metrics (test split)\n"
            f"- ROC AUC: {fmt(auc)}\n"
            f"- PR AUC: {fmt(pr_auc)}\n"
            f"- Brier Score: {fmt(brier)}\n\n"
            f"Outputs were saved to:\n"
            f"- {predictions_path}\n"
            f"- {scored_path}\n"
        )
        preds_preview = preds.head(20)
        scored_preview = scored.head(20)
        return (
            metrics_md,
            preds_preview,
            scored_preview,
            str(predictions_path),
            str(scored_path),
        )
    except Exception as e:
        # On error, return message and empty placeholders so every gradio
        # output component still receives a value.
        metrics_md = f"### Error\n{str(e)}"
        return metrics_md, pd.DataFrame(), pd.DataFrame(), None, None
with gr.Blocks(title="SOLAI Scoring Dashboard") as demo:
    # Header. Fix: the original help text hard-coded an absolute developer
    # path (/Users/git/solai/scores), which contradicted the actual output
    # location (SCORES_DIR = BASE_DIR / "scores"). Interpolate SCORES_DIR so
    # the message is correct on any host.
    gr.Markdown(
        f"""
# SOLAI Scoring Dashboard (Gradio)
Train a baseline Logistic Regression model on your solar lead dataset and generate probability_to_buy predictions.
- Default dataset: examples/synthetic_v2
- Outputs are always written to {SCORES_DIR} and are also downloadable below.
""".strip()
    )

    with gr.Row():
        mode = gr.Radio(
            choices=["Use example synthetic_v2", "Upload CSVs"],
            value="Use example synthetic_v2",
            label="Data Source",
        )

    # Upload widgets start hidden; they are revealed only in upload mode.
    with gr.Row():
        features_upload = gr.File(
            label="Features CSV (for 'Upload CSVs' mode)",
            file_types=[".csv"],
            visible=False,
        )
        outcomes_upload = gr.File(
            label="Outcomes CSV with columns [lead_id, sold] (for 'Upload CSVs' mode)",
            file_types=[".csv"],
            visible=False,
        )

    def toggle_uploads(selected_mode: str):
        """Show the two upload widgets only when 'Upload CSVs' is selected."""
        show = selected_mode == "Upload CSVs"
        return [
            gr.update(visible=show),
            gr.update(visible=show),
        ]

    mode.change(
        toggle_uploads,
        inputs=[mode],
        outputs=[features_upload, outcomes_upload],
    )

    with gr.Row():
        run_btn = gr.Button("Train + Score", variant="primary")
    with gr.Row():
        metrics_md = gr.Markdown()
    with gr.Row():
        preds_df = gr.Dataframe(label="predictions.csv (preview)", interactive=False)
    with gr.Row():
        scored_df = gr.Dataframe(label="leads_features_scored.csv (preview)", interactive=False)
    with gr.Row():
        pred_file = gr.File(label="Download predictions.csv")
        scored_file = gr.File(label="Download leads_features_scored.csv")

    # Wire the button to the training function; outputs map 1:1 onto the
    # five return values of train_and_score.
    run_btn.click(
        fn=train_and_score,
        inputs=[mode, features_upload, outcomes_upload],
        outputs=[metrics_md, preds_df, scored_df, pred_file, scored_file],
    )


if __name__ == "__main__":
    demo.launch()