| |
| |
|
|
| |
| |
| |
|
|
| |
| import numpy as np |
| import pandas as pd |
| import sklearn |
| import pickle |
| from sklearn.linear_model import LogisticRegression |
| from sklearn.pipeline import make_pipeline |
| from sklearn.feature_extraction import DictVectorizer |
|
|
|
|
# Report the versions of the libraries used for training, one per line.
print(
    f'pandas=={pd.__version__}',
    f'numpy=={np.__version__}',
    f'sklearn=={sklearn.__version__}',
    sep='\n',
)
|
|
|
|
| |
def load_data(data_url=None):
    """Read the lead-scoring CSV into a DataFrame.

    Args:
        data_url: Location of the CSV file, passed straight to
            ``pandas.read_csv`` (so a URL or a local path both work).
            When omitted, the course lead-scoring dataset hosted on
            GitHub is downloaded.

    Returns:
        pandas.DataFrame: The parsed contents of the CSV.
    """
    if data_url is None:
        # Default source used by the training script.
        data_url = "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv"
    return pd.read_csv(data_url)
|
|
|
|
|
|
def train_model(df):
    """Fit a DictVectorizer + logistic-regression pipeline on the lead data.

    Args:
        df: DataFrame containing the feature columns ``lead_source``,
            ``number_of_courses_viewed`` and ``annual_income`` plus the
            target column ``converted``. The frame is not modified.

    Returns:
        The fitted scikit-learn pipeline.
    """
    categorical = ['lead_source']
    numeric = ['number_of_courses_viewed', 'annual_income']

    # Impute on a copy of the feature columns so the caller's DataFrame is
    # left untouched (the fill values are the same as before).
    features = df[categorical + numeric].copy()
    features[categorical] = features[categorical].fillna('NA')
    features[numeric] = features[numeric].fillna(0)

    # One dict per row, which is the input format DictVectorizer consumes.
    train_dict = features.to_dict(orient='records')
    y_train = df['converted']

    pipeline = make_pipeline(
        DictVectorizer(),
        LogisticRegression(solver='liblinear'),
    )
    pipeline.fit(train_dict, y_train)
    return pipeline
|
|
|
|
def save_model(filename, model):
    """Pickle ``model`` into ``filename`` and print a confirmation.

    Args:
        filename: Path of the file to write; it is opened in binary
            write mode, so existing content is replaced.
        model: Object to serialise with ``pickle``.
    """
    with open(filename, mode='wb') as sink:
        pickle.dump(model, sink)

    print(f"Model saved to {filename}")
|
|
|
|
# Script body: load the dataset, fit the pipeline, and persist it to disk.
# NOTE(review): these statements run whenever the module is executed or
# imported; consider an `if __name__ == "__main__":` guard if this file is
# ever imported from elsewhere.
df = load_data()
pipeline = train_model(df=df)
save_model(filename='model.bin', model=pipeline)
|
|
|
|
|
|