File size: 10,134 Bytes
1e3f942
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
#!/usr/bin/env python3
import os
from datetime import datetime
from pathlib import Path
from typing import Optional, Tuple, Union

import gradio as gr
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    average_precision_score,
    brier_score_loss,
    roc_auc_score,
)
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder


# Resolve repository root relative to this file
BASE_DIR = Path(__file__).resolve().parent.parent

# Output directory and preset dataset paths relative to repository root
SCORES_DIR = BASE_DIR / "scores"

# Preset example dataset paths
PRESET_FEATURES = BASE_DIR / "examples" / "synthetic_v2" / "leads_features.csv"
PRESET_OUTCOMES = BASE_DIR / "examples" / "synthetic_v2" / "outcomes.csv"

# Reuse same feature candidates and categorical sets as scripts/batch_scoring.py
FEATURE_CANDIDATES = [
    "living_area_sqft",
    "average_monthly_kwh",
    "average_monthly_bill_usd",
    "shading_factor",
    "roof_suitability_score",
    "seasonality_index",
    "electric_panel_amperage",
    "has_pool",
    "is_remote_worker_household",
    "tdsp",
    "rate_structure",
    "credit_score_range",
    "household_income_bracket",
    "preferred_financing_type",
    "neighborhood_type",
]

CATEGORICAL = [
    "tdsp",
    "rate_structure",
    "credit_score_range",
    "household_income_bracket",
    "preferred_financing_type",
    "neighborhood_type",
]


def _safe_path(file_or_path: Optional[Union[str, gr.File]]) -> Optional[str]:
    """Normalize a gradio upload value to a filesystem path string.

    Gradio's File component has returned different shapes across versions:
    a plain string path, a tempfile-like object exposing ``.name``, or a
    dict carrying a ``name`` key. Returns None when no path can be derived.
    """
    if file_or_path is None:
        return None
    if isinstance(file_or_path, str):
        return file_or_path
    # Tempfile-like object returned by the gr.File component.
    name_attr = getattr(file_or_path, "name", None)
    if name_attr is not None:
        return name_attr
    # Dict payload used by some gradio versions.
    if isinstance(file_or_path, dict):
        return file_or_path.get("name")
    return None


def _validate_inputs(df_features: pd.DataFrame, df_outcomes: pd.DataFrame) -> None:
    """Raise ValueError unless the required columns are present.

    The features frame must carry 'lead_id'; the outcomes frame must carry
    both 'lead_id' and the binary 'sold' label.
    """
    checks = (
        (df_features, "lead_id", "Features CSV must contain a 'lead_id' column."),
        (df_outcomes, "lead_id", "Outcomes CSV must contain a 'lead_id' column."),
        (df_outcomes, "sold", "Outcomes CSV must contain a 'sold' column (0/1)."),
    )
    for frame, column, message in checks:
        if column not in frame.columns:
            raise ValueError(message)


def _compute_metrics(y_true: np.ndarray, y_prob: np.ndarray) -> Tuple[Optional[float], Optional[float], Optional[float]]:
    """Compute (ROC AUC, PR AUC, Brier score) with graceful fallbacks.

    Any metric that cannot be computed is reported as None instead of
    raising, so a degenerate test split never breaks the UI.

    Args:
        y_true: Binary ground-truth labels (coerced to int).
        y_prob: Predicted probabilities for the positive class.

    Returns:
        (auc, pr_auc, brier); each entry is a float or None.
    """
    # Coerce labels once instead of per-metric; if the cast itself fails
    # (e.g. NaNs in y_true), no metric is computable — mirror the original
    # behavior of returning all Nones.
    try:
        y_int = y_true.astype(int)
    except Exception:
        return None, None, None

    # Brier score is defined even when only one class is present.
    try:
        brier: Optional[float] = float(brier_score_loss(y_int, y_prob))
    except Exception:
        brier = None

    # ROC AUC and PR AUC require both classes in y_true; check once.
    auc: Optional[float] = None
    pr_auc: Optional[float] = None
    if len(np.unique(y_int)) >= 2:
        try:
            auc = float(roc_auc_score(y_int, y_prob))
        except Exception:
            auc = None
        try:
            pr_auc = float(average_precision_score(y_int, y_prob))
        except Exception:
            pr_auc = None

    return auc, pr_auc, brier


def train_and_score(
    mode: str,
    features_file: Optional[Union[str, gr.File]],
    outcomes_file: Optional[Union[str, gr.File]],
):
    """
    Train a baseline LogisticRegression on the merged features/outcomes
    data, evaluate on a held-out split, score every lead, and persist the
    results as timestamped CSVs under SCORES_DIR.

    mode: "Use example synthetic_v2" or "Upload CSVs"
    Returns:
      - metrics_markdown (str)
      - preds_preview (pd.DataFrame)
      - scored_preview (pd.DataFrame)
      - predictions_file (str path)
      - scored_file (str path)

    On any error the function returns an error-markdown string, two empty
    DataFrames and two None paths so every Gradio output stays well-formed.
    """
    try:
        # Resolve input paths from either the bundled preset or the uploads.
        if mode == "Use example synthetic_v2":
            features_path = PRESET_FEATURES
            outcomes_path = PRESET_OUTCOMES
            if not features_path.exists() or not outcomes_path.exists():
                raise FileNotFoundError(
                    f"Preset files not found. Expected:\n- {PRESET_FEATURES}\n- {PRESET_OUTCOMES}"
                )
        else:
            f_path = _safe_path(features_file)
            o_path = _safe_path(outcomes_file)
            if not f_path or not o_path:
                raise ValueError("Please upload BOTH Features CSV and Outcomes CSV.")
            features_path = Path(f_path)
            outcomes_path = Path(o_path)
            if not features_path.exists():
                raise FileNotFoundError(f"Features file not found: {features_path}")
            if not outcomes_path.exists():
                raise FileNotFoundError(f"Outcomes file not found: {outcomes_path}")

        X = pd.read_csv(features_path)
        y_df = pd.read_csv(outcomes_path)[["lead_id", "sold"]]

        _validate_inputs(X, y_df)

        # Inner join keeps only leads that have a recorded outcome.
        df = X.merge(y_df, on="lead_id", how="inner")

        # Select features present in this dataset
        available = [c for c in FEATURE_CANDIDATES if c in df.columns]
        if not available:
            raise ValueError(
                "No candidate features found in features CSV. "
                f"Expected any of: {', '.join(FEATURE_CANDIDATES)}"
            )

        numeric = [c for c in available if c not in CATEGORICAL]
        cat_cols = [c for c in available if c in CATEGORICAL]

        # Numerics pass through untouched; categoricals are one-hot encoded,
        # silently ignoring categories unseen at fit time.
        preproc = ColumnTransformer(
            transformers=[
                ("num", "passthrough", numeric),
                ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
            ],
            remainder="drop",
        )

        model = LogisticRegression(max_iter=5000)
        pipe = Pipeline(steps=[("pre", preproc), ("clf", model)])

        y = df["sold"].astype(int)
        # Only stratify if both classes present
        if len(np.unique(y)) >= 2:
            train_df, test_df = train_test_split(
                df, test_size=0.25, random_state=42, stratify=y
            )
        else:
            train_df, test_df = train_test_split(
                df, test_size=0.25, random_state=42, stratify=None
            )

        pipe.fit(train_df[available], train_df["sold"].astype(int))

        # Evaluate only on the held-out split.
        test_probs = pipe.predict_proba(test_df[available])[:, 1]
        auc, pr_auc, brier = _compute_metrics(test_df["sold"].values, test_probs)

        # Score all rows
        all_probs = pipe.predict_proba(df[available])[:, 1]
        preds = df[["lead_id"]].copy()
        preds["probability_to_buy"] = np.round(all_probs, 4)

        # Persist outputs
        SCORES_DIR.mkdir(parents=True, exist_ok=True)
        ts = datetime.now().strftime("%Y%m%d_%H%M%S")
        predictions_path = SCORES_DIR / f"predictions_{ts}.csv"
        scored_path = SCORES_DIR / f"leads_features_scored_{ts}.csv"

        preds.to_csv(predictions_path, index=False)

        # Left-join back onto the ORIGINAL features frame so every lead
        # appears; leads without an outcome row get a NaN probability.
        scored = X.merge(preds, on="lead_id", how="left")
        scored.to_csv(scored_path, index=False)

        # Prepare outputs
        def fmt(val: Optional[float]) -> str:
            """Render a metric for markdown, or 'N/A' when unavailable."""
            return f"{val:.3f}" if val is not None else "N/A"

        metrics_md = (
            "### Evaluation Metrics (test split)\n"
            f"- ROC AUC: {fmt(auc)}\n"
            f"- PR AUC: {fmt(pr_auc)}\n"
            f"- Brier Score: {fmt(brier)}\n\n"
            f"Outputs were saved to:\n"
            f"- {predictions_path}\n"
            f"- {scored_path}\n"
        )

        preds_preview = preds.head(20)
        scored_preview = scored.head(20)

        return (
            metrics_md,
            preds_preview,
            scored_preview,
            str(predictions_path),
            str(scored_path),
        )

    except Exception as e:
        # On error, return message and empty placeholders
        metrics_md = f"### Error\n{str(e)}"
        return metrics_md, pd.DataFrame(), pd.DataFrame(), None, None


# --- Gradio UI wiring ------------------------------------------------------
with gr.Blocks(title="SOLAI Scoring Dashboard") as demo:
    gr.Markdown(
        # f-string so the displayed output directory always matches the
        # actual SCORES_DIR instead of a hard-coded machine-specific path.
        f"""
# SOLAI Scoring Dashboard (Gradio)
Train a baseline Logistic Regression model on your solar lead dataset and generate probability_to_buy predictions.

- Default dataset: examples/synthetic_v2
- Outputs are always written to {SCORES_DIR} and are also downloadable below.
        """.strip()
    )

    # Data-source selector: bundled example dataset vs. user uploads.
    with gr.Row():
        mode = gr.Radio(
            choices=["Use example synthetic_v2", "Upload CSVs"],
            value="Use example synthetic_v2",
            label="Data Source",
        )

    # Upload widgets start hidden; toggle_uploads reveals them when the
    # "Upload CSVs" mode is selected.
    with gr.Row():
        features_upload = gr.File(
            label="Features CSV (for 'Upload CSVs' mode)",
            file_types=[".csv"],
            visible=False,
        )
        outcomes_upload = gr.File(
            label="Outcomes CSV with columns [lead_id, sold] (for 'Upload CSVs' mode)",
            file_types=[".csv"],
            visible=False,
        )

    def toggle_uploads(selected_mode: str):
        """Show the two upload widgets only in 'Upload CSVs' mode."""
        show = selected_mode == "Upload CSVs"
        return [
            gr.update(visible=show),
            gr.update(visible=show),
        ]

    mode.change(
        toggle_uploads,
        inputs=[mode],
        outputs=[features_upload, outcomes_upload],
    )

    with gr.Row():
        run_btn = gr.Button("Train + Score", variant="primary")

    # Output widgets: metrics summary, two table previews, two download files.
    with gr.Row():
        metrics_md = gr.Markdown()
    with gr.Row():
        preds_df = gr.Dataframe(label="predictions.csv (preview)", interactive=False)
    with gr.Row():
        scored_df = gr.Dataframe(label="leads_features_scored.csv (preview)", interactive=False)
    with gr.Row():
        pred_file = gr.File(label="Download predictions.csv")
        scored_file = gr.File(label="Download leads_features_scored.csv")

    run_btn.click(
        fn=train_and_score,
        inputs=[mode, features_upload, outcomes_upload],
        outputs=[metrics_md, preds_df, scored_df, pred_file, scored_file],
    )

if __name__ == "__main__":
    demo.launch()