import marimo


# Version stamp written by marimo when the notebook was generated.
__generated_with = "0.11.20"
# Notebook application object; "medium" sets the default output width.
app = marimo.App(width="medium")
|
|
|
|
@app.cell
def _(mo):
    # Notebook title, rendered as a markdown heading.
    mo.md("# Customer Churn Analysis")
    return
|
|
|
|
@app.cell
def _():
    # Third-party dependencies shared by the rest of the notebook,
    # grouped alphabetically.
    import altair as alt
    import marimo as mo
    import polars as pl
    return alt, mo, pl
|
|
|
|
@app.cell
def _(pl):
    # Load the churn dataset straight from the Hugging Face hub.
    # (Underscore-prefixed locals are private to this marimo cell.)
    _dataset_url = "hf://datasets/louiecerv/customer_churn/customer_churn_data.csv"
    df = pl.read_csv(_dataset_url)
    # Summary statistics are the cell's displayed output.
    df.describe()
    return (df,)
|
|
|
|
@app.cell
def _(df):
    # Preview the first rows of the raw data; the bare trailing
    # expression is what marimo renders as this cell's output.
    df.head()
    return
|
|
|
|
@app.cell
def _(df, pl):
    # Feature engineering, model definitions, training, and test-set
    # predictions for every estimator compared in this notebook.
    from sklearn.preprocessing import (
        RobustScaler,
        OneHotEncoder,
        MinMaxScaler,
        OrdinalEncoder,
    )
    from sklearn.pipeline import make_pipeline
    from sklearn.compose import make_column_transformer
    from sklearn.linear_model import (
        LogisticRegression,
        BayesianRidge,
        RidgeClassifier,
        SGDClassifier,
    )
    from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
    from sklearn.naive_bayes import BernoulliNB
    from sklearn.svm import SVC
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.ensemble import (
        VotingClassifier,
        BaggingClassifier,
        GradientBoostingClassifier,
        RandomForestClassifier,
    )
    from sklearn.feature_selection import RFE, RFECV, SequentialFeatureSelector
    from sklearn.model_selection import train_test_split

    # Numeric columns get scaled; the two combined categorical columns
    # get one-hot encoded.
    num_features = ["tenure", "monthly_charges", "total_charges"]
    cat_features = ["contract_One Two year", "internet_service_Fiber No"]
    random_state = 33

    # Collapse each pair of related indicator columns into a single
    # categorical column encoded as a "<left>_<right>" string.
    df2 = df.with_columns(
        (pl.col("contract_One year") + "_" + pl.col("contract_Two year")).alias(
            "contract_One Two year"
        ),
        (
            pl.col("internet_service_Fiber optic")
            + "_"
            + pl.col("internet_service_No")
        ).alias("internet_service_Fiber No"),
    )

    X, y = df2.select(num_features + cat_features), df2.select(["churn"])

    # Hold out 32% of the rows for evaluation.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.32, random_state=random_state
    )

    # Shared preprocessor, wrapped in each pipeline below so scalers and
    # encoders are fit only on the training split.
    preprocessor = make_column_transformer(
        (OneHotEncoder(), cat_features),
        (MinMaxScaler(), num_features),
    )

    knc = KNeighborsClassifier(algorithm="ball_tree")
    dtree = DecisionTreeClassifier(criterion="entropy", random_state=random_state)
    rfc = RandomForestClassifier(
        criterion="entropy", max_features=0.3, random_state=random_state
    )
    gbc = GradientBoostingClassifier(random_state=random_state)
    bag = BaggingClassifier(
        KNeighborsClassifier(),
        max_samples=0.8,
        max_features=0.8,
        random_state=random_state,
    )

    log_pipe = make_pipeline(
        preprocessor, LogisticRegression(max_iter=10000, random_state=random_state)
    )
    # NOTE: BayesianRidge is a regressor; it is evaluated with regression
    # metrics in the report cell, unlike the classifiers.
    bridge_pipe = make_pipeline(preprocessor, BayesianRidge(max_iter=10000))
    ridge_pipe = make_pipeline(
        preprocessor, RidgeClassifier(max_iter=10000, random_state=random_state)
    )
    sgd_pipe = make_pipeline(
        preprocessor,
        SGDClassifier(
            loss="hinge", penalty="l2", max_iter=10000, random_state=random_state
        ),
    )
    lda_pipe = make_pipeline(preprocessor, QuadraticDiscriminantAnalysis())
    bnb_pipe = make_pipeline(preprocessor, BernoulliNB())
    svc_pipe = make_pipeline(
        preprocessor, SVC(kernel="rbf", max_iter=10000, random_state=random_state)
    )
    dtree_pipe = make_pipeline(preprocessor, dtree)
    rfc_pipe = make_pipeline(preprocessor, rfc)
    knc_pipe = make_pipeline(preprocessor, knc)
    gbc_pipe = make_pipeline(preprocessor, gbc)
    vot_pipe = make_pipeline(
        preprocessor,
        # Soft voting averages predict_proba; both member estimators
        # support it.
        VotingClassifier(
            estimators=[
                ("qda", QuadraticDiscriminantAnalysis()),
                ("dtree", dtree),
            ],
            voting="soft",
            weights=[5, 2],
        ),
    )
    bag_pipe = make_pipeline(preprocessor, bag)

    # Fit every pipeline on the training split and predict the test split.
    log_pred = log_pipe.fit(X_train, y_train).predict(X_test)
    bridge_pred = bridge_pipe.fit(X_train, y_train).predict(X_test)
    ridge_pred = ridge_pipe.fit(X_train, y_train).predict(X_test)
    sgd_pred = sgd_pipe.fit(X_train, y_train).predict(X_test)
    lda_pred = lda_pipe.fit(X_train, y_train).predict(X_test)
    bnb_pred = bnb_pipe.fit(X_train, y_train).predict(X_test)
    svc_pred = svc_pipe.fit(X_train, y_train).predict(X_test)
    dtree_pred = dtree_pipe.fit(X_train, y_train).predict(X_test)
    # BUG FIX: this line previously re-fit dtree_pipe, so the "Random
    # Forest" metrics in the report were actually decision-tree metrics.
    rfc_pred = rfc_pipe.fit(X_train, y_train).predict(X_test)
    knc_pred = knc_pipe.fit(X_train, y_train).predict(X_test)
    gbc_pred = gbc_pipe.fit(X_train, y_train).predict(X_test)
    vot_pred = vot_pipe.fit(X_train, y_train).predict(X_test)
    bag_pred = bag_pipe.fit(X_train, y_train).predict(X_test)
    return (
        BaggingClassifier,
        BayesianRidge,
        BernoulliNB,
        DecisionTreeClassifier,
        GradientBoostingClassifier,
        KNeighborsClassifier,
        LogisticRegression,
        MinMaxScaler,
        OneHotEncoder,
        OrdinalEncoder,
        QuadraticDiscriminantAnalysis,
        RFE,
        RFECV,
        RandomForestClassifier,
        RidgeClassifier,
        RobustScaler,
        SGDClassifier,
        SVC,
        SequentialFeatureSelector,
        VotingClassifier,
        X,
        X_test,
        X_train,
        bag,
        bag_pipe,
        bag_pred,
        bnb_pipe,
        bnb_pred,
        bridge_pipe,
        bridge_pred,
        cat_features,
        df2,
        dtree,
        dtree_pipe,
        dtree_pred,
        gbc,
        gbc_pipe,
        gbc_pred,
        knc,
        knc_pipe,
        knc_pred,
        lda_pipe,
        lda_pred,
        log_pipe,
        log_pred,
        make_column_transformer,
        make_pipeline,
        num_features,
        preprocessor,
        random_state,
        rfc,
        rfc_pipe,
        rfc_pred,
        ridge_pipe,
        ridge_pred,
        sgd_pipe,
        sgd_pred,
        svc_pipe,
        svc_pred,
        train_test_split,
        vot_pipe,
        vot_pred,
        y,
        y_test,
        y_train,
    )
|
|
|
|
@app.cell
def _(
    bag_pred,
    bnb_pred,
    bridge_pred,
    dtree_pred,
    gbc_pred,
    knc_pred,
    lda_pred,
    log_pred,
    mo,
    rfc_pred,
    ridge_pred,
    sgd_pred,
    svc_pred,
    vot_pred,
    y_test,
):
    # Classification scores for the classifiers; regression errors for
    # the BayesianRidge model, which predicts a continuous value.
    from sklearn.metrics import (
        accuracy_score,
        precision_score,
        f1_score,
        recall_score,
        roc_auc_score,
        log_loss,
        mean_squared_error,
        root_mean_squared_error,
        mean_absolute_error,
        r2_score,
        explained_variance_score,
    )

    # One markdown report with a metrics section per model; the f-string
    # is the cell's displayed output.
    # NOTE(review): ROC-AUC and log-loss are computed from hard labels
    # here, not predicted probabilities — confirm this is intended.
    mo.md(f"""
    # Model Metrics

    ## Logistic Regression

    - Accuracy: {accuracy_score(y_test, log_pred)}
    - Precision: {precision_score(y_test, log_pred)}
    - Recall: {recall_score(y_test, log_pred)}
    - F1: {f1_score(y_test, log_pred)}
    - ROC-AUC: {roc_auc_score(y_test, log_pred)}
    - Log Loss: {log_loss(y_test, log_pred)}

    ## Ridge Classifier

    - Accuracy: {accuracy_score(y_test, ridge_pred)}
    - Precision: {precision_score(y_test, ridge_pred)}
    - Recall: {recall_score(y_test, ridge_pred)}
    - F1: {f1_score(y_test, ridge_pred)}
    - ROC-AUC: {roc_auc_score(y_test, ridge_pred)}
    - Log Loss: {log_loss(y_test, ridge_pred)}

    ## SGD Classifier

    - Accuracy: {accuracy_score(y_test, sgd_pred)}
    - Precision: {precision_score(y_test, sgd_pred)}
    - Recall: {recall_score(y_test, sgd_pred)}
    - F1: {f1_score(y_test, sgd_pred)}
    - ROC-AUC: {roc_auc_score(y_test, sgd_pred)}
    - Log Loss: {log_loss(y_test, sgd_pred)}

    ## Bayesian Ridge Regression

    - Mean Squared Error: {mean_squared_error(y_test, bridge_pred)}
    - Root Mean Squared Error: {root_mean_squared_error(y_test, bridge_pred)}
    - Mean Absolute Error: {mean_absolute_error(y_test, bridge_pred)}
    - R^2: {r2_score(y_test, bridge_pred)}
    - Explained Variance: {explained_variance_score(y_test, bridge_pred)}

    ## Quadratic Discriminant Analysis

    - Accuracy: {accuracy_score(y_test, lda_pred)}
    - Precision: {precision_score(y_test, lda_pred)}
    - Recall: {recall_score(y_test, lda_pred)}
    - F1: {f1_score(y_test, lda_pred)}
    - ROC-AUC: {roc_auc_score(y_test, lda_pred)}
    - Log Loss: {log_loss(y_test, lda_pred)}

    ## Bernoulli Naive Bayes

    - Accuracy: {accuracy_score(y_test, bnb_pred)}
    - Precision: {precision_score(y_test, bnb_pred)}
    - Recall: {recall_score(y_test, bnb_pred)}
    - F1: {f1_score(y_test, bnb_pred)}
    - ROC-AUC: {roc_auc_score(y_test, bnb_pred)}
    - Log Loss: {log_loss(y_test, bnb_pred)}

    ## C-Support Vector Classifier

    - Accuracy: {accuracy_score(y_test, svc_pred)}
    - Precision: {precision_score(y_test, svc_pred)}
    - Recall: {recall_score(y_test, svc_pred)}
    - F1: {f1_score(y_test, svc_pred)}
    - ROC-AUC: {roc_auc_score(y_test, svc_pred)}
    - Log Loss: {log_loss(y_test, svc_pred)}

    ## Decision Tree Classifier

    - Accuracy: {accuracy_score(y_test, dtree_pred)}
    - Precision: {precision_score(y_test, dtree_pred)}
    - Recall: {recall_score(y_test, dtree_pred)}
    - F1: {f1_score(y_test, dtree_pred)}
    - ROC-AUC: {roc_auc_score(y_test, dtree_pred)}
    - Log Loss: {log_loss(y_test, dtree_pred)}

    ## Random Forest Classifier

    - Accuracy: {accuracy_score(y_test, rfc_pred)}
    - Precision: {precision_score(y_test, rfc_pred)}
    - Recall: {recall_score(y_test, rfc_pred)}
    - F1: {f1_score(y_test, rfc_pred)}
    - ROC-AUC: {roc_auc_score(y_test, rfc_pred)}
    - Log Loss: {log_loss(y_test, rfc_pred)}

    ## K Neighbors Classifier

    - Accuracy: {accuracy_score(y_test, knc_pred)}
    - Precision: {precision_score(y_test, knc_pred)}
    - Recall: {recall_score(y_test, knc_pred)}
    - F1: {f1_score(y_test, knc_pred)}
    - ROC-AUC: {roc_auc_score(y_test, knc_pred)}
    - Log Loss: {log_loss(y_test, knc_pred)}

    ## Gradient Boosting Classifier

    - Accuracy: {accuracy_score(y_test, gbc_pred)}
    - Precision: {precision_score(y_test, gbc_pred)}
    - Recall: {recall_score(y_test, gbc_pred)}
    - F1: {f1_score(y_test, gbc_pred)}
    - ROC-AUC: {roc_auc_score(y_test, gbc_pred)}
    - Log Loss: {log_loss(y_test, gbc_pred)}

    ## Voting Classifier

    - Accuracy: {accuracy_score(y_test, vot_pred)}
    - Precision: {precision_score(y_test, vot_pred)}
    - Recall: {recall_score(y_test, vot_pred)}
    - F1: {f1_score(y_test, vot_pred)}
    - ROC-AUC: {roc_auc_score(y_test, vot_pred)}
    - Log Loss: {log_loss(y_test, vot_pred)}

    ## Bagging Classifier

    - Accuracy: {accuracy_score(y_test, bag_pred)}
    - Precision: {precision_score(y_test, bag_pred)}
    - Recall: {recall_score(y_test, bag_pred)}
    - F1: {f1_score(y_test, bag_pred)}
    - ROC-AUC: {roc_auc_score(y_test, bag_pred)}
    - Log Loss: {log_loss(y_test, bag_pred)}

    {
        mo.callout(
            "From the metrics, the Quadratic Discriminant Analysis and the Decision Tree Classifier perform the best, thus, they were chosen for the Voting Classifier",
            kind="info",
        )
    }
    """)
    return (
        accuracy_score,
        explained_variance_score,
        f1_score,
        log_loss,
        mean_absolute_error,
        mean_squared_error,
        precision_score,
        r2_score,
        recall_score,
        roc_auc_score,
        root_mean_squared_error,
    )
|
|
|
|
@app.cell
def _(mo):
    # Widgets for entering a single customer's attributes.
    # (Underscore-prefixed locals are private to this marimo cell.)
    _fields = {
        "tenure": mo.ui.number(label="Tenure", start=1, stop=72, step=1),
        "monthly_charges": mo.ui.number(
            label="Monthly Charges", start=20, stop=120, step=1
        ),
        "total_charges": mo.ui.number(
            label="Total Charges", start=20, stop=8000, step=1
        ),
        "contract": mo.ui.dropdown(
            label="Contract (Year)", options=["None", "One", "Two"]
        ),
        "service": mo.ui.dropdown(
            label="Service", options=["None", "Basic", "Fiber Optic"]
        ),
    }
    user_inputs = mo.ui.dictionary(_fields)

    # Stack the widgets vertically as this cell's output.
    mo.vstack(user_inputs.values())
    return (user_inputs,)
|
|
|
|
@app.cell
def _(mo, pl, user_inputs, vot_pipe):
    # Map the human-readable dropdown choices onto the "<bool>_<bool>"
    # string encoding of the combined one-hot columns built earlier.
    _contract_codes = {
        "None": "false_false",
        "One": "true_false",
        "Two": "false_true",
    }
    _service_codes = {
        "None": "false_false",
        "Basic": "true_false",
        "Fiber Optic": "false_true",
    }
    # .get() yields None while a dropdown has no selection yet, matching
    # the original match-statement fall-through behavior.
    contract = _contract_codes.get(user_inputs["contract"].value)
    service = _service_codes.get(user_inputs["service"].value)

    # Single-row frame with the same schema the pipeline was trained on.
    preds = pl.DataFrame({
        "tenure": user_inputs["tenure"].value,
        "monthly_charges": user_inputs["monthly_charges"].value,
        "total_charges": user_inputs["total_charges"].value,
        "contract_One Two year": contract,
        "internet_service_Fiber No": service,
    })

    # (predicted class, class probabilities) for the single input row.
    prediction = (vot_pipe.predict(preds), vot_pipe.predict_proba(preds))

    # BUG FIX: the label/probability expressions were previously inlined
    # in the f-string with nested double quotes, which is a SyntaxError
    # on Python < 3.12; extracting them keeps the output identical.
    _label = "Yes" if prediction[0][0] else "No"
    _prob = prediction[1][0][1] if prediction[0][0] else prediction[1][0][0]
    mo.md(f"Prediction: {_label}, with about {_prob * 100:.2f}% probability.")
    return contract, prediction, preds, service
|
|
|
|
if __name__ == "__main__":
    # Run the notebook as a standalone marimo app when executed directly.
    app.run()
|
|