| |
|
|
| import pickle |
| import os |
| from sklearn.metrics import classification_report, ConfusionMatrixDisplay |
| from sklearn.ensemble import RandomForestClassifier |
| from sklearn.impute import SimpleImputer, KNNImputer |
| from sklearn.pipeline import Pipeline |
| from sklearn.compose import ColumnTransformer |
| from sklearn.preprocessing import StandardScaler, RobustScaler, OneHotEncoder |
| from sklearn.model_selection import train_test_split |
| import pandas as pd |
| from ydata_profiling import ProfileReport |
| from sklearn import datasets |
| from subprocess import call |
|
|
| |
# Filesystem locations for the generated artefacts, anchored to this script's
# own directory so the script works regardless of the current working dir.
DIRPATH = os.path.dirname(os.path.realpath(__file__))
ASSETS_DIR = os.path.join(DIRPATH, "assets", "ml")
# Create the output directory up-front: the EDA report and pickle export
# below would otherwise fail with FileNotFoundError on a fresh checkout.
os.makedirs(ASSETS_DIR, exist_ok=True)
ml_fp = os.path.join(ASSETS_DIR, "ml_components.pkl")
req_fp = os.path.join(ASSETS_DIR, "requirements.txt")
eda_report_fp = os.path.join(ASSETS_DIR, "eda-report.html")
|
|
| |
# Load the iris toy dataset with the frame representation included.
iris = datasets.load_iris(return_X_y=False, as_frame=True)

df = iris['frame']
target_col = 'target'

# Build an exploratory-data-analysis report and persist it as HTML.
report_kwargs = {
    "title": "Dataset",
    "html": {'style': {'full_width': True}},
}
profile = ProfileReport(df, **report_kwargs)
profile.to_file(eda_report_fp)
|
|
| |
| |
# Columns that must never reach the model: common identifier spellings
# plus the target itself.
to_ignore_cols = [
    "ID",
    "Id", "id",
    target_col
]

# Select feature columns by dtype. Use order-preserving comprehensions
# instead of list(set(...) - set(...)): set iteration order depends on
# string hash randomization (PYTHONHASHSEED), so the original produced a
# different column order on every run — a reproducibility hazard.
_ignored = set(to_ignore_cols)
num_cols = [c for c in df.select_dtypes('number').columns if c not in _ignored]
cat_cols = [c for c in df.select_dtypes(exclude='number').columns if c not in _ignored]
print(f"\n[Info] The '{len(num_cols)}' numeric columns are : {num_cols}\nThe '{len(cat_cols)}' categorical columns are : {cat_cols}")
|
|
# Split features from the target BY NAME rather than by position:
# `iloc[:, -1]` silently grabs the wrong column if the frame is ever
# reordered or gains a column. For the iris frame both are equivalent.
X = df.drop(columns=[target_col])
y = df[target_col].values

# Hold out 20% for evaluation; stratify to keep the class balance and
# fix random_state for reproducibility.
X_train, X_eval, y_train, y_eval = train_test_split(
    X, y, test_size=0.2, random_state=0, stratify=y)

print(
    f"\n[Info] Dataset split : (X_train , y_train) = {(X_train.shape , y_train.shape)}, (X_eval y_eval) = {(X_eval.shape , y_eval.shape)}. \n")
|
|
| |
|
|
| |
# Imputation strategies: mean for numeric features, mode for categoricals.
# set_output keeps pandas DataFrames flowing through the pipeline.
num_imputer = SimpleImputer(strategy="mean").set_output(transform="pandas")
cat_imputer = SimpleImputer(
    strategy="most_frequent").set_output(transform="pandas")

# Pre-compute the category inventory so the final encoder is built with an
# explicit `categories=` list rather than inferring it at fit time.
if len(cat_cols) > 0:
    # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
    # pd.concat is the supported replacement (three stacked copies, as
    # the original did).
    df_imputed_stacked_cat = cat_imputer.fit_transform(
        pd.concat([df, df, df], ignore_index=True)[cat_cols])
    # `sparse` was renamed `sparse_output` in scikit-learn 1.2 and removed
    # in 1.4; the file already requires >=1.2 because of set_output().
    cat_ = OneHotEncoder(sparse_output=False, drop="first").fit(
        df_imputed_stacked_cat).categories_
else:
    # No categorical columns: let the (unused) encoder infer categories.
    cat_ = 'auto'

encoder = OneHotEncoder(categories=cat_, sparse_output=False, drop="first")
scaler = StandardScaler().set_output(transform="pandas")
|
|
|
|
| |
# Per-dtype preprocessing pipelines: impute first, then scale / encode.
num_pipe = Pipeline(steps=[("num_imputer", num_imputer), ("scaler", scaler)])
cat_pipe = Pipeline(steps=[("cat_imputer", cat_imputer), ("encoder", encoder)])

# Register a pipeline only when the dataset actually has columns of that
# kind. Plain `if` statements replace the original
# `list.append(...) if cond else None`, which abused a conditional
# expression purely for its side effect.
transformers = []
if num_cols:
    transformers.append(("numerical", num_pipe, num_cols))
if cat_cols:
    transformers.append(("categorical", cat_pipe, cat_cols))

preprocessor = ColumnTransformer(
    transformers=transformers).set_output(transform="pandas")

print(
    f"\n[Info] Features Transformer : {transformers}. \n")
|
|
|
|
| |
# End-to-end model: preprocessing followed by a random-forest classifier,
# with a fixed seed for reproducible training.
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('model', RandomForestClassifier(random_state=10)),
]
end2end_pipeline = Pipeline(pipeline_steps).set_output(transform="pandas")

print(
    f"\n[Info] Training.\n[Info] X_train : columns( {X_train.columns.tolist()}), shape: {X_train.shape} .\n")

# Fit the whole chain (imputation, scaling/encoding, classifier) in one go.
end2end_pipeline.fit(X_train, y_train)

print(
    f"\n[Info] Evaluation.\n")
y_eval_pred = end2end_pipeline.predict(X_eval)

# Per-class precision / recall / F1 on the held-out split.
print(classification_report(y_eval, y_eval_pred,
                            target_names=iris['target_names']))
|
|
| |
| |
|
|
| |
print(
    f"\n[Info] Exportation.\n")
# Bundle the fitted pipeline together with the label names so inference
# code can decode integer predictions without reloading the dataset.
to_export = {
    "labels": iris['target_names'],
    "pipeline": end2end_pipeline,
}

# NOTE(security): pickle is acceptable for artefacts we produce ourselves,
# but never unpickle files from untrusted sources.
with open(ml_fp, 'wb') as pkl_file:
    pickle.dump(to_export, pkl_file)

# Freeze the environment without going through a shell: the list form with
# an explicit stdout file avoids the quoting/injection issues of the
# original `shell=True` string (which broke if the path contained spaces).
with open(req_fp, 'w') as req_file:
    call(["pip", "freeze"], stdout=req_file)
|
|