Spaces:

Emil25
/

mlops

Running

App Files Files Community

mlops / scripts /model_training.py

Emil25

Upload 4 files

61f924c verified almost 2 years ago

raw

history blame contribute delete

1.96 kB

	from sklearn.ensemble import RandomForestClassifier
	from sklearn.pipeline import Pipeline
	from scripts.download_data import download_data
	from sklearn.metrics import f1_score
	from sklearn.preprocessing import StandardScaler
	from sklearn.preprocessing import PowerTransformer
	from sklearn.preprocessing import OneHotEncoder
	from sklearn.pipeline import Pipeline
	from sklearn.compose import ColumnTransformer
	from sklearn.preprocessing import QuantileTransformer
	import pandas as pd

	def calculate_metric(model):
	    """Return the F1 score of ``model`` on the second split from ``download_data()``.

	    Args:
	        model: Any object exposing ``predict(X)``; it is called with the
	            split's columns minus ``'cardio'``.

	    Returns:
	        The value produced by ``sklearn.metrics.f1_score`` when comparing
	        the ``'cardio'`` column with the model's predictions.
	    """
	    # Only the second element of the pair is needed for scoring.
	    _unused_train, holdout = download_data()

	    features = holdout.drop(columns=['cardio'])
	    expected = holdout['cardio']
	    predicted = model.predict(features)

	    # NOTE(review): pos_label='positive' presumes the 'cardio' column holds
	    # the string label 'positive' -- confirm against download_data().
	    return f1_score(expected, predicted, pos_label='positive')


	def model_training():
	    """Fit the preprocessing + random-forest pipeline and score it.

	    The first split returned by ``download_data()`` is used for fitting,
	    with the ``'cardio'`` column as the target. Numeric columns go through
	    a quantile transform (normal output), standard scaling and a power
	    transform, in that order; categorical columns are one-hot encoded with
	    ``handle_unknown='ignore'``.

	    Returns:
	        A ``(pipeline, f1)`` tuple: the fitted ``Pipeline`` and the value
	        returned by ``calculate_metric`` for it.
	    """
	    training_frame, _unused_test = download_data()
	    features = training_frame.drop(columns=['cardio'])
	    target = training_frame['cardio']

	    numeric_features = ['age', 'height', 'weight', 'ap_hi', 'ap_lo']
	    categorical_features = [
	        'gender', 'cholesterol', 'gluc', 'smoke', 'alco', 'active',
	    ]

	    numeric_steps = Pipeline(steps=[
	        ('qt', QuantileTransformer(output_distribution="normal")),
	        ('scaler', StandardScaler()),
	        ('power', PowerTransformer()),
	    ])
	    categorical_steps = Pipeline(steps=[
	        ('encoder', OneHotEncoder(handle_unknown='ignore')),
	    ])

	    # Each sub-pipeline is applied only to its own list of columns.
	    column_router = ColumnTransformer(transformers=[
	        ('num_p', numeric_steps, numeric_features),
	        ('cat_p', categorical_steps, categorical_features),
	    ])

	    # oob_score=True is requested here, but the out-of-bag score is not
	    # read anywhere in this function.
	    forest = RandomForestClassifier(
	        n_estimators=200,
	        criterion="gini",
	        min_samples_split=15,
	        max_depth=15,
	        oob_score=True,
	    )

	    fitted = Pipeline(steps=[
	        ('preprocessors', column_router),
	        ('model', forest),
	    ])
	    fitted.fit(features, target)

	    # calculate_metric() fetches the data again via download_data() to
	    # obtain the split it scores against.
	    score = calculate_metric(fitted)
	    return fitted, score