| from sklearn.ensemble import RandomForestClassifier |
| from sklearn.pipeline import Pipeline |
| from scripts.download_data import download_data |
| from sklearn.metrics import f1_score |
| from sklearn.preprocessing import StandardScaler |
| from sklearn.preprocessing import PowerTransformer |
| from sklearn.preprocessing import OneHotEncoder |
| from sklearn.pipeline import Pipeline |
| from sklearn.compose import ColumnTransformer |
| from sklearn.preprocessing import QuantileTransformer |
| import pandas as pd |
|
|
def calculate_metric(model):
    """Score *model* with F1 on the test split returned by ``download_data()``.

    The second element of ``download_data()`` is used as the evaluation
    frame; its ``'cardio'`` column is the target and all remaining columns
    are passed to ``model.predict``.

    Args:
        model: Fitted estimator exposing ``predict``.

    Returns:
        The value returned by ``sklearn.metrics.f1_score`` for the
        predictions, computed with ``pos_label='positive'``.
    """
    _, evaluation_frame = download_data()

    features = evaluation_frame.drop(columns=['cardio'])
    target = evaluation_frame['cardio']

    predictions = model.predict(features)

    # NOTE(review): pos_label='positive' presumes the 'cardio' labels contain
    # the string 'positive' -- confirm against what download_data() returns.
    return f1_score(target, predictions, pos_label='positive')
|
|
|
|
def model_training():
    """Fit the preprocessing + random-forest pipeline and score it.

    The first element of ``download_data()`` is used as the training frame,
    with ``'cardio'`` as the target. Numeric columns go through a quantile
    transform, standard scaling and a power transform; categorical columns
    are one-hot encoded. The combined features feed a
    ``RandomForestClassifier``.

    Returns:
        A ``(pipeline, f1)`` tuple: the fitted ``Pipeline`` and the score
        produced by ``calculate_metric`` for it.
    """
    training_frame, _ = download_data()
    X_train = training_frame.drop(columns=['cardio'])
    y_train = training_frame['cardio']

    numeric_features = ['age', 'height', 'weight', 'ap_hi', 'ap_lo']
    categorical_features = [
        'gender', 'cholesterol', 'gluc', 'smoke', 'alco', 'active',
    ]

    # Step names are kept as-is: they form the parameter paths
    # (e.g. ``preprocessors__num_p__qt``) visible on the returned pipeline.
    numeric_steps = [
        ('qt', QuantileTransformer(output_distribution="normal")),
        ('scaler', StandardScaler()),
        ('power', PowerTransformer()),
    ]
    categorical_steps = [
        ('encoder', OneHotEncoder(handle_unknown='ignore')),
    ]

    preprocessing = ColumnTransformer(transformers=[
        ('num_p', Pipeline(numeric_steps), numeric_features),
        ('cat_p', Pipeline(categorical_steps), categorical_features),
    ])

    forest = RandomForestClassifier(
        n_estimators=200,
        criterion="gini",
        min_samples_split=15,
        max_depth=15,
        oob_score=True,
    )

    trained_pipeline = Pipeline([
        ('preprocessors', preprocessing),
        ('model', forest),
    ])
    trained_pipeline.fit(X_train, y_train)

    score = calculate_metric(trained_pipeline)
    return trained_pipeline, score
|
|
|
|