# Uploaded via huggingface_hub (commit 1e3f942, verified) — author: ash-coded-it
#!/usr/bin/env python3
import os
from datetime import datetime
from pathlib import Path
from typing import Optional, Tuple, Union
import gradio as gr
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
average_precision_score,
brier_score_loss,
roc_auc_score,
)
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
# Resolve repository root relative to this file; the app is assumed to live
# one directory below the repo root (e.g. <root>/app/this_file.py).
BASE_DIR = Path(__file__).resolve().parent.parent
# Output directory for timestamped prediction CSVs, relative to repository root.
SCORES_DIR = BASE_DIR / "scores"
# Preset example dataset paths used by the "Use example synthetic_v2" mode.
PRESET_FEATURES = BASE_DIR / "examples" / "synthetic_v2" / "leads_features.csv"
PRESET_OUTCOMES = BASE_DIR / "examples" / "synthetic_v2" / "outcomes.csv"
# Reuse same feature candidates and categorical sets as scripts/batch_scoring.py.
# Columns absent from a given dataset are simply skipped at training time.
FEATURE_CANDIDATES = [
    # Numeric features (passed through unchanged)
    "living_area_sqft",
    "average_monthly_kwh",
    "average_monthly_bill_usd",
    "shading_factor",
    "roof_suitability_score",
    "seasonality_index",
    "electric_panel_amperage",
    "has_pool",
    "is_remote_worker_household",
    # Categorical features (one-hot encoded; see CATEGORICAL below)
    "tdsp",
    "rate_structure",
    "credit_score_range",
    "household_income_bracket",
    "preferred_financing_type",
    "neighborhood_type",
]
# Subset of FEATURE_CANDIDATES treated as categorical by the preprocessor.
CATEGORICAL = [
    "tdsp",
    "rate_structure",
    "credit_score_range",
    "household_income_bracket",
    "preferred_financing_type",
    "neighborhood_type",
]
def _safe_path(file_or_path: Optional[Union[str, gr.File]]) -> Optional[str]:
    """Normalize a Gradio file input into a filesystem path string.

    Accepts a plain string path, an uploaded-file object exposing ``.name``
    (the usual Gradio File payload), or a dict carrying a ``"name"`` key
    (older Gradio versions). Returns ``None`` when no path can be extracted.
    """
    if file_or_path is None:
        return None
    if isinstance(file_or_path, str):
        return file_or_path
    # Gradio's File component usually hands back a tempfile-like object.
    name_attr = getattr(file_or_path, "name", None)
    if name_attr is not None:
        return name_attr
    # Some Gradio versions return a plain dict instead.
    if isinstance(file_or_path, dict):
        return file_or_path.get("name")
    return None
def _validate_inputs(df_features: pd.DataFrame, df_outcomes: pd.DataFrame) -> None:
if "lead_id" not in df_features.columns:
raise ValueError("Features CSV must contain a 'lead_id' column.")
if "lead_id" not in df_outcomes.columns:
raise ValueError("Outcomes CSV must contain a 'lead_id' column.")
if "sold" not in df_outcomes.columns:
raise ValueError("Outcomes CSV must contain a 'sold' column (0/1).")
def _compute_metrics(y_true: np.ndarray, y_prob: np.ndarray) -> Tuple[Optional[float], Optional[float], Optional[float]]:
    """Return (ROC AUC, PR AUC, Brier score), with None for any metric that
    cannot be computed.

    Brier score is defined even when only one class is present; the two
    ranking metrics require both classes in y_true, so they fall back to
    None on a degenerate test split.
    """
    try:
        labels = y_true.astype(int)
        both_classes = len(np.unique(labels)) >= 2
    except Exception:
        # Labels not convertible to int: nothing is computable.
        return None, None, None

    def _safe(metric_fn) -> Optional[float]:
        # Degrade any sklearn failure to "metric unavailable" (shown as N/A).
        try:
            return float(metric_fn(labels, y_prob))
        except Exception:
            return None

    brier = _safe(brier_score_loss)
    auc = _safe(roc_auc_score) if both_classes else None
    pr_auc = _safe(average_precision_score) if both_classes else None
    return auc, pr_auc, brier
def _resolve_dataset_paths(
    mode: str,
    features_file: Optional[Union[str, gr.File]],
    outcomes_file: Optional[Union[str, gr.File]],
) -> Tuple[Path, Path]:
    """Return (features_path, outcomes_path) for the selected data source.

    Raises FileNotFoundError or ValueError with a user-facing message when
    the requested files are missing or not uploaded.
    """
    if mode == "Use example synthetic_v2":
        if not PRESET_FEATURES.exists() or not PRESET_OUTCOMES.exists():
            raise FileNotFoundError(
                f"Preset files not found. Expected:\n- {PRESET_FEATURES}\n- {PRESET_OUTCOMES}"
            )
        return PRESET_FEATURES, PRESET_OUTCOMES
    f_path = _safe_path(features_file)
    o_path = _safe_path(outcomes_file)
    if not f_path or not o_path:
        raise ValueError("Please upload BOTH Features CSV and Outcomes CSV.")
    features_path = Path(f_path)
    outcomes_path = Path(o_path)
    if not features_path.exists():
        raise FileNotFoundError(f"Features file not found: {features_path}")
    if not outcomes_path.exists():
        raise FileNotFoundError(f"Outcomes file not found: {outcomes_path}")
    return features_path, outcomes_path


def _build_pipeline(numeric: list, cat_cols: list) -> Pipeline:
    """Build the preprocessing + logistic-regression pipeline.

    Numeric columns pass through unchanged; categoricals are one-hot encoded
    with unknown categories ignored at scoring time so uploads with unseen
    category values do not crash prediction.
    """
    preproc = ColumnTransformer(
        transformers=[
            ("num", "passthrough", numeric),
            ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
        ],
        remainder="drop",
    )
    return Pipeline(
        steps=[("pre", preproc), ("clf", LogisticRegression(max_iter=5000))]
    )


def _fmt_metric(val: Optional[float]) -> str:
    """Render a metric for the markdown report; None becomes 'N/A'."""
    return f"{val:.3f}" if val is not None else "N/A"


def train_and_score(
    mode: str,
    features_file: Optional[Union[str, gr.File]],
    outcomes_file: Optional[Union[str, gr.File]],
):
    """Train a baseline Logistic Regression on the chosen dataset and score
    every lead.

    mode: "Use example synthetic_v2" or "Upload CSVs"

    Returns a 5-tuple consumed by the Gradio UI:
      - metrics_markdown (str)
      - preds_preview (pd.DataFrame)
      - scored_preview (pd.DataFrame)
      - predictions_file (str path, or None on error)
      - scored_file (str path, or None on error)

    This is a UI boundary: any exception is converted into an error-markdown
    message plus empty placeholders instead of propagating.
    """
    try:
        features_path, outcomes_path = _resolve_dataset_paths(
            mode, features_file, outcomes_file
        )
        X = pd.read_csv(features_path)
        y_df = pd.read_csv(outcomes_path)[["lead_id", "sold"]]
        _validate_inputs(X, y_df)
        df = X.merge(y_df, on="lead_id", how="inner")

        # Select candidate features actually present in this dataset.
        available = [c for c in FEATURE_CANDIDATES if c in df.columns]
        if not available:
            raise ValueError(
                "No candidate features found in features CSV. "
                f"Expected any of: {', '.join(FEATURE_CANDIDATES)}"
            )
        numeric = [c for c in available if c not in CATEGORICAL]
        cat_cols = [c for c in available if c in CATEGORICAL]
        pipe = _build_pipeline(numeric, cat_cols)

        y = df["sold"].astype(int)
        # Stratify only when both classes are present; sklearn rejects
        # stratification on a single-class target.
        stratify = y if len(np.unique(y)) >= 2 else None
        train_df, test_df = train_test_split(
            df, test_size=0.25, random_state=42, stratify=stratify
        )
        pipe.fit(train_df[available], train_df["sold"].astype(int))
        test_probs = pipe.predict_proba(test_df[available])[:, 1]
        auc, pr_auc, brier = _compute_metrics(test_df["sold"].values, test_probs)

        # Score ALL rows (train + test) for the downloadable outputs.
        all_probs = pipe.predict_proba(df[available])[:, 1]
        preds = df[["lead_id"]].copy()
        preds["probability_to_buy"] = np.round(all_probs, 4)

        # Persist timestamped outputs under the scores directory.
        SCORES_DIR.mkdir(parents=True, exist_ok=True)
        ts = datetime.now().strftime("%Y%m%d_%H%M%S")
        predictions_path = SCORES_DIR / f"predictions_{ts}.csv"
        scored_path = SCORES_DIR / f"leads_features_scored_{ts}.csv"
        preds.to_csv(predictions_path, index=False)
        # Left-merge back onto the raw features so rows without outcomes keep
        # their place (probability will be NaN for unmatched lead_ids).
        scored = X.merge(preds, on="lead_id", how="left")
        scored.to_csv(scored_path, index=False)

        metrics_md = (
            "### Evaluation Metrics (test split)\n"
            f"- ROC AUC: {_fmt_metric(auc)}\n"
            f"- PR AUC: {_fmt_metric(pr_auc)}\n"
            f"- Brier Score: {_fmt_metric(brier)}\n\n"
            f"Outputs were saved to:\n"
            f"- {predictions_path}\n"
            f"- {scored_path}\n"
        )
        return (
            metrics_md,
            preds.head(20),
            scored.head(20),
            str(predictions_path),
            str(scored_path),
        )
    except Exception as e:
        # On error, surface the message and empty placeholders to the UI.
        metrics_md = f"### Error\n{str(e)}"
        return metrics_md, pd.DataFrame(), pd.DataFrame(), None, None
# Gradio UI definition. NOTE(fix): the on-screen text previously hard-coded a
# machine-specific absolute path (/Users/git/solai/scores); it now reports the
# actual repo-relative output directory, SCORES_DIR.
with gr.Blocks(title="SOLAI Scoring Dashboard") as demo:
    gr.Markdown(
        f"""
# SOLAI Scoring Dashboard (Gradio)
Train a baseline Logistic Regression model on your solar lead dataset and generate probability_to_buy predictions.
- Default dataset: examples/synthetic_v2
- Outputs are always written to {SCORES_DIR} and are also downloadable below.
""".strip()
    )
    with gr.Row():
        mode = gr.Radio(
            choices=["Use example synthetic_v2", "Upload CSVs"],
            value="Use example synthetic_v2",
            label="Data Source",
        )
    # Upload widgets start hidden; they are revealed only in "Upload CSVs" mode.
    with gr.Row():
        features_upload = gr.File(
            label="Features CSV (for 'Upload CSVs' mode)",
            file_types=[".csv"],
            visible=False,
        )
        outcomes_upload = gr.File(
            label="Outcomes CSV with columns [lead_id, sold] (for 'Upload CSVs' mode)",
            file_types=[".csv"],
            visible=False,
        )

    def toggle_uploads(selected_mode: str):
        """Show the two upload widgets only when 'Upload CSVs' is selected."""
        show = selected_mode == "Upload CSVs"
        return [
            gr.update(visible=show),
            gr.update(visible=show),
        ]

    mode.change(
        toggle_uploads,
        inputs=[mode],
        outputs=[features_upload, outcomes_upload],
    )
    with gr.Row():
        run_btn = gr.Button("Train + Score", variant="primary")
    # Results area: metrics summary, two previews, and the downloadable files.
    with gr.Row():
        metrics_md = gr.Markdown()
    with gr.Row():
        preds_df = gr.Dataframe(label="predictions.csv (preview)", interactive=False)
    with gr.Row():
        scored_df = gr.Dataframe(label="leads_features_scored.csv (preview)", interactive=False)
    with gr.Row():
        pred_file = gr.File(label="Download predictions.csv")
        scored_file = gr.File(label="Download leads_features_scored.csv")
    run_btn.click(
        fn=train_and_score,
        inputs=[mode, features_upload, outcomes_upload],
        outputs=[metrics_md, preds_df, scored_df, pred_file, scored_file],
    )
if __name__ == "__main__":
    demo.launch()