Spaces:
Runtime error
Runtime error
File size: 10,134 Bytes
1e3f942 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 | #!/usr/bin/env python3
import os
from datetime import datetime
from pathlib import Path
from typing import Optional, Tuple, Union
import gradio as gr
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
average_precision_score,
brier_score_loss,
roc_auc_score,
)
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
# Resolve repository root relative to this file (repo root is one level
# above the directory containing this script).
BASE_DIR = Path(__file__).resolve().parent.parent
# Output directory where prediction/score CSVs are persisted.
SCORES_DIR = BASE_DIR / "scores"
# Preset example dataset paths (synthetic_v2 demo data) used when the UI
# runs in "Use example synthetic_v2" mode.
PRESET_FEATURES = BASE_DIR / "examples" / "synthetic_v2" / "leads_features.csv"
PRESET_OUTCOMES = BASE_DIR / "examples" / "synthetic_v2" / "outcomes.csv"
# Reuse same feature candidates and categorical sets as scripts/batch_scoring.py.
# Only the candidates actually present in the loaded CSV are used for training.
FEATURE_CANDIDATES = [
    "living_area_sqft",
    "average_monthly_kwh",
    "average_monthly_bill_usd",
    "shading_factor",
    "roof_suitability_score",
    "seasonality_index",
    "electric_panel_amperage",
    "has_pool",
    "is_remote_worker_household",
    "tdsp",
    "rate_structure",
    "credit_score_range",
    "household_income_bracket",
    "preferred_financing_type",
    "neighborhood_type",
]
# Subset of FEATURE_CANDIDATES treated as categorical (one-hot encoded);
# everything else passes through as numeric.
CATEGORICAL = [
    "tdsp",
    "rate_structure",
    "credit_score_range",
    "household_income_bracket",
    "preferred_financing_type",
    "neighborhood_type",
]
def _safe_path(file_or_path: Optional[Union[str, gr.File]]) -> Optional[str]:
    """
    Normalize a gradio upload value into a plain filesystem path string.

    Accepts None, a raw string path, a tempfile-like object exposing
    ``.name`` (classic gradio File return), or a dict carrying a ``name``
    key (some gradio versions). Returns None when no path is available.
    """
    if file_or_path is None:
        return None
    if isinstance(file_or_path, str):
        return file_or_path
    # Tempfile-like objects from the gradio File component carry the
    # path in their .name attribute; use a sentinel so a present-but-None
    # attribute is still honored.
    _missing = object()
    attr_name = getattr(file_or_path, "name", _missing)
    if attr_name is not _missing:
        return attr_name
    # Some gradio versions hand back a dict payload with a 'name' key.
    if isinstance(file_or_path, dict):
        return file_or_path.get("name")
    return None
def _validate_inputs(df_features: pd.DataFrame, df_outcomes: pd.DataFrame) -> None:
if "lead_id" not in df_features.columns:
raise ValueError("Features CSV must contain a 'lead_id' column.")
if "lead_id" not in df_outcomes.columns:
raise ValueError("Outcomes CSV must contain a 'lead_id' column.")
if "sold" not in df_outcomes.columns:
raise ValueError("Outcomes CSV must contain a 'sold' column (0/1).")
def _compute_metrics(y_true: np.ndarray, y_prob: np.ndarray) -> Tuple[Optional[float], Optional[float], Optional[float]]:
    """
    Compute (ROC AUC, PR AUC, Brier score) with graceful fallbacks.

    Brier score is defined even when only one class appears in y_true;
    the two ranking metrics require both classes, so each metric that
    cannot be computed comes back as None instead of raising.
    """
    def _attempt(metric_fn, requires_both_classes: bool) -> Optional[float]:
        # Each metric is isolated in its own try so one failure never
        # blocks the others.
        try:
            labels = y_true.astype(int)
            if requires_both_classes and len(np.unique(labels)) < 2:
                return None
            return float(metric_fn(labels, y_prob))
        except Exception:
            return None

    brier = _attempt(brier_score_loss, requires_both_classes=False)
    auc = _attempt(roc_auc_score, requires_both_classes=True)
    pr_auc = _attempt(average_precision_score, requires_both_classes=True)
    return auc, pr_auc, brier
def train_and_score(
    mode: str,
    features_file: Optional[Union[str, gr.File]],
    outcomes_file: Optional[Union[str, gr.File]],
):
    """
    Train a logistic-regression lead scorer and generate probabilities.

    Args:
        mode: "Use example synthetic_v2" (preset CSVs under examples/)
            or "Upload CSVs" (use the two uploaded files).
        features_file: uploaded features CSV; ignored in preset mode.
        outcomes_file: uploaded outcomes CSV with [lead_id, sold];
            ignored in preset mode.

    Returns:
        Tuple of five values consumed by the Gradio outputs:
        - metrics_markdown (str): evaluation summary or error text
        - preds_preview (pd.DataFrame): first 20 prediction rows
        - scored_preview (pd.DataFrame): first 20 scored feature rows
        - predictions_file (str | None): path of saved predictions CSV
        - scored_file (str | None): path of saved scored-features CSV

    Never raises: every exception is converted into an error markdown
    message plus empty placeholders so the UI always gets five outputs.
    """
    try:
        if mode == "Use example synthetic_v2":
            features_path = PRESET_FEATURES
            outcomes_path = PRESET_OUTCOMES
            if not features_path.exists() or not outcomes_path.exists():
                raise FileNotFoundError(
                    f"Preset files not found. Expected:\n- {PRESET_FEATURES}\n- {PRESET_OUTCOMES}"
                )
        else:
            # Upload mode: both files are mandatory.
            f_path = _safe_path(features_file)
            o_path = _safe_path(outcomes_file)
            if not f_path or not o_path:
                raise ValueError("Please upload BOTH Features CSV and Outcomes CSV.")
            features_path = Path(f_path)
            outcomes_path = Path(o_path)
            if not features_path.exists():
                raise FileNotFoundError(f"Features file not found: {features_path}")
            if not outcomes_path.exists():
                raise FileNotFoundError(f"Outcomes file not found: {outcomes_path}")
        X = pd.read_csv(features_path)
        y_df = pd.read_csv(outcomes_path)[["lead_id", "sold"]]
        _validate_inputs(X, y_df)
        # Inner join: only leads that have a recorded outcome are used
        # for training/evaluation.
        df = X.merge(y_df, on="lead_id", how="inner")
        # Select features present in this dataset
        available = [c for c in FEATURE_CANDIDATES if c in df.columns]
        if not available:
            raise ValueError(
                "No candidate features found in features CSV. "
                f"Expected any of: {', '.join(FEATURE_CANDIDATES)}"
            )
        numeric = [c for c in available if c not in CATEGORICAL]
        cat_cols = [c for c in available if c in CATEGORICAL]
        # Numerics pass through untouched; categoricals are one-hot
        # encoded. handle_unknown="ignore" keeps prediction robust to
        # category values unseen during training.
        preproc = ColumnTransformer(
            transformers=[
                ("num", "passthrough", numeric),
                ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
            ],
            remainder="drop",
        )
        model = LogisticRegression(max_iter=5000)
        pipe = Pipeline(steps=[("pre", preproc), ("clf", model)])
        y = df["sold"].astype(int)
        # Only stratify if both classes present (stratify raises otherwise).
        if len(np.unique(y)) >= 2:
            train_df, test_df = train_test_split(
                df, test_size=0.25, random_state=42, stratify=y
            )
        else:
            train_df, test_df = train_test_split(
                df, test_size=0.25, random_state=42, stratify=None
            )
        pipe.fit(train_df[available], train_df["sold"].astype(int))
        # Held-out probability of the positive class (column 1).
        test_probs = pipe.predict_proba(test_df[available])[:, 1]
        auc, pr_auc, brier = _compute_metrics(test_df["sold"].values, test_probs)
        # Score all rows (train + test) so every merged lead gets a score.
        all_probs = pipe.predict_proba(df[available])[:, 1]
        preds = df[["lead_id"]].copy()
        preds["probability_to_buy"] = np.round(all_probs, 4)
        # Persist outputs; timestamped filenames keep runs from
        # overwriting each other.
        SCORES_DIR.mkdir(parents=True, exist_ok=True)
        ts = datetime.now().strftime("%Y%m%d_%H%M%S")
        predictions_path = SCORES_DIR / f"predictions_{ts}.csv"
        scored_path = SCORES_DIR / f"leads_features_scored_{ts}.csv"
        preds.to_csv(predictions_path, index=False)
        # Left join back onto the full feature table: leads without an
        # outcome keep their rows with a NaN probability.
        scored = X.merge(preds, on="lead_id", how="left")
        scored.to_csv(scored_path, index=False)
        # Prepare outputs
        def fmt(val: Optional[float]) -> str:
            # Render a metric value, or "N/A" when it was not computable.
            return f"{val:.3f}" if val is not None else "N/A"
        metrics_md = (
            "### Evaluation Metrics (test split)\n"
            f"- ROC AUC: {fmt(auc)}\n"
            f"- PR AUC: {fmt(pr_auc)}\n"
            f"- Brier Score: {fmt(brier)}\n\n"
            f"Outputs were saved to:\n"
            f"- {predictions_path}\n"
            f"- {scored_path}\n"
        )
        preds_preview = preds.head(20)
        scored_preview = scored.head(20)
        return (
            metrics_md,
            preds_preview,
            scored_preview,
            str(predictions_path),
            str(scored_path),
        )
    except Exception as e:
        # On error, return message and empty placeholders so the five
        # Gradio outputs are always populated.
        metrics_md = f"### Error\n{str(e)}"
        return metrics_md, pd.DataFrame(), pd.DataFrame(), None, None
# --- Gradio UI wiring -------------------------------------------------------
with gr.Blocks(title="SOLAI Scoring Dashboard") as demo:
    # FIX: the intro text previously hard-coded the machine-specific path
    # /Users/git/solai/scores; interpolate SCORES_DIR so the text always
    # matches the actual output directory used by train_and_score.
    gr.Markdown(
        f"""
# SOLAI Scoring Dashboard (Gradio)
Train a baseline Logistic Regression model on your solar lead dataset and generate probability_to_buy predictions.
- Default dataset: examples/synthetic_v2
- Outputs are always written to {SCORES_DIR} and are also downloadable below.
""".strip()
    )
    with gr.Row():
        mode = gr.Radio(
            choices=["Use example synthetic_v2", "Upload CSVs"],
            value="Use example synthetic_v2",
            label="Data Source",
        )
    with gr.Row():
        # Upload widgets start hidden; they are revealed only when the
        # user switches to "Upload CSVs" mode (see toggle_uploads).
        features_upload = gr.File(
            label="Features CSV (for 'Upload CSVs' mode)",
            file_types=[".csv"],
            visible=False,
        )
        outcomes_upload = gr.File(
            label="Outcomes CSV with columns [lead_id, sold] (for 'Upload CSVs' mode)",
            file_types=[".csv"],
            visible=False,
        )

    def toggle_uploads(selected_mode: str):
        """Show/hide the two upload widgets based on the selected mode."""
        show = selected_mode == "Upload CSVs"
        return [
            gr.update(visible=show),
            gr.update(visible=show),
        ]

    mode.change(
        toggle_uploads,
        inputs=[mode],
        outputs=[features_upload, outcomes_upload],
    )
    with gr.Row():
        run_btn = gr.Button("Train + Score", variant="primary")
    with gr.Row():
        metrics_md = gr.Markdown()
    with gr.Row():
        preds_df = gr.Dataframe(label="predictions.csv (preview)", interactive=False)
    with gr.Row():
        scored_df = gr.Dataframe(label="leads_features_scored.csv (preview)", interactive=False)
    with gr.Row():
        pred_file = gr.File(label="Download predictions.csv")
        scored_file = gr.File(label="Download leads_features_scored.csv")
    # Wire the button to training/scoring; outputs map 1:1 to the five
    # return values of train_and_score.
    run_btn.click(
        fn=train_and_score,
        inputs=[mode, features_upload, outcomes_upload],
        outputs=[metrics_md, preds_df, scored_df, pred_file, scored_file],
    )
# Launch the Gradio app only when executed as a script (not on import).
if __name__ == "__main__":
    demo.launch()
|