Spaces:

Ci-Dave
/

Breast_Cancer_Classification_Analysis

Runtime error

App Files Files Community

Breast_Cancer_Classification_Analysis / app.py

Ci-Dave

Added files

7c2913f about 1 year ago

raw

history blame contribute delete

6.49 kB

	import streamlit as st
	import pandas as pd
	import numpy as np
	import matplotlib.pyplot as plt
	import seaborn as sns

	# Import sklearn tools
	from sklearn.datasets import load_breast_cancer
	from sklearn.model_selection import train_test_split
	from sklearn.preprocessing import StandardScaler
	from sklearn.linear_model import LogisticRegression
	from sklearn.svm import SVC
	from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
	from sklearn.neighbors import KNeighborsClassifier
	from sklearn.neural_network import MLPClassifier
	from sklearn.metrics import confusion_matrix, classification_report

	# Page chrome: browser-tab title, wide layout, then the on-page heading.
	st.set_page_config(page_title="Breast Cancer Classification App", layout="wide")
	st.title("Breast Cancer Classification Analysis")

	# Optional header image (requires the image file to be present):
	# st.image("breast_cancer_banner.jpg", caption="Breast Cancer Analysis", use_column_width=True)

	# Text of the collapsible "About" panel, kept in a variable so the layout
	# calls below stay short.
	about_text = """
	Overview: This application demonstrates classification of the Breast Cancer dataset using several machine learning models.

	Models included:
	- Logistic Regression
	- Support Vector Machine (SVM)
	- Random Forest
	- Gradient Boosting
	- K-Nearest Neighbors (KNN)
	- MLP Neural Network

	Features:
	- Data preprocessing and scaling
	- Visualization of confusion matrices, performance reports, and detailed result discussions
	- Interactive model selection and performance comparison
	"""

	# Calling .markdown() on the expander container renders inside the panel,
	# the same place a `with st.expander(...)` body would write to.
	about_panel = st.expander("About this App")
	about_panel.markdown(about_text)

	# Load the Breast Cancer dataset into a DataFrame: one column per feature
	# plus the class label (0 or 1) in "target".
	data = load_breast_cancer()
	df = pd.DataFrame(data.data, columns=data.feature_names)
	df['target'] = data.target

	# Show the first rows so the user can inspect the raw (unscaled) values.
	st.subheader("Dataset Overview")
	st.write(df.head())

	# Separate the feature matrix from the label column.
	X = df.drop("target", axis=1)
	y = df["target"]

	# Sidebar: held-out fraction, 0.1 to 0.5 in steps of 0.05, default 0.2.
	test_size = st.sidebar.slider("Test Set Size", 0.1, 0.5, 0.2, step=0.05)

	# Split BEFORE scaling. Fitting the scaler on the full dataset would let the
	# test rows influence the mean/variance used for preprocessing (data
	# leakage) and make the reported metrics optimistic. The fixed random_state
	# keeps the split reproducible across reruns.
	X_train_raw, X_test_raw, y_train, y_test = train_test_split(
	    X, y, test_size=test_size, random_state=42
	)

	# Learn the scaling statistics from the training rows only, then apply the
	# same transformation to the held-out rows.
	scaler = StandardScaler()
	X_train = scaler.fit_transform(X_train_raw)
	X_test = scaler.transform(X_test_raw)

	# Whole dataset under the training-fitted scaling; retained so the
	# X_scaled name remains available. The models use only X_train / X_test.
	X_scaled = scaler.transform(X)

	# Seed for every estimator that draws random numbers during fitting. Without
	# it, Random Forest, Gradient Boosting and the MLP give different metrics on
	# each Streamlit rerun even though the train/test split itself is seeded.
	MODEL_SEED = 42

	# Candidate classifiers, keyed by the label shown in the sidebar.
	models = {
	    "Logistic Regression": LogisticRegression(max_iter=10000),
	    "SVM": SVC(kernel='linear'),
	    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=MODEL_SEED),
	    "Gradient Boosting": GradientBoostingClassifier(random_state=MODEL_SEED),
	    "KNN": KNeighborsClassifier(),
	    "MLP Neural Network": MLPClassifier(max_iter=500, random_state=MODEL_SEED),
	}

	# Sidebar: model selection. Iterating a dict yields its keys in insertion
	# order, so the options appear in the order listed above.
	model_choice = st.sidebar.selectbox("Choose a model", list(models))
	selected_model = models[model_choice]

	# Fit the chosen model on the training split and predict the held-out rows
	# while a spinner is shown.
	with st.spinner("Training model..."):
	    selected_model.fit(X_train, y_train)
	    y_pred = selected_model.predict(X_test)

	# Human-readable class names for the numeric targets (0 -> malignant,
	# 1 -> benign). class_names fixes the row/column order of the matrix.
	label_mapping = {0: "malignant", 1: "benign"}
	class_names = list(label_mapping.values())
	y_test_labels = [label_mapping[label] for label in y_test]
	y_pred_labels = [label_mapping[label] for label in y_pred]

	# Evaluate model performance on the held-out rows.
	cm = confusion_matrix(y_test_labels, y_pred_labels, labels=class_names)
	cr = classification_report(y_test_labels, y_pred_labels, output_dict=True)

	# Display the confusion matrix as a small annotated heatmap.
	st.subheader(f"Confusion Matrix: {model_choice}")
	fig, ax = plt.subplots(figsize=(4, 3))
	sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=ax,
	            xticklabels=class_names, yticklabels=class_names)
	ax.set_xlabel("Predicted")
	ax.set_ylabel("True")
	# Lay out this specific figure rather than pyplot's implicit "current" one.
	fig.tight_layout()
	st.pyplot(fig)
	# pyplot keeps every figure it creates alive until it is closed; the script
	# reruns on each widget interaction, so close it once it has been rendered.
	plt.close(fig)

	# Display the classification report, one row per class/average and one
	# column per metric (hence the transpose of the report dict).
	st.subheader(f"Classification Report: {model_choice}")
	cr_df = pd.DataFrame(cr).transpose()
	st.dataframe(cr_df)

	# Result and Discussion section: one prepared write-up per model, looked up
	# by the label chosen in the sidebar.
	st.subheader("Result and Discussion")
	discussion_by_model = {
	    "Logistic Regression": """
	Logistic Regression Discussion:
	- Performance: The model shows robust performance with clear separation between classes.
	- Strengths: It is fast, interpretable, and performs well on linearly separable data.
	- Weaknesses: May underperform on non-linear boundaries and could be sensitive to outliers.
	""",
	    "SVM": """
	SVM Discussion:
	- Performance: The linear SVM performs well for this dataset, handling high-dimensional data efficiently.
	- Strengths: Effective in cases where the number of features is greater than the number of samples.
	- Weaknesses: Tuning parameters (like the kernel) is crucial and can be computationally expensive.
	""",
	    "Random Forest": """
	Random Forest Discussion:
	- Performance: Typically provides high accuracy and robust results due to ensemble learning.
	- Strengths: Handles non-linearity well and provides insights via feature importance.
	- Weaknesses: Can be less interpretable and may overfit if the trees are not properly tuned.
	""",
	    "Gradient Boosting": """
	Gradient Boosting Discussion:
	- Performance: Demonstrates strong performance by sequentially improving on previous errors.
	- Strengths: Excellent for handling complex data patterns.
	- Weaknesses: Sensitive to overfitting if hyperparameters are not carefully optimized.
	""",
	    "KNN": """
	KNN Discussion:
	- Performance: Simple yet effective for this dataset, though performance depends on the choice of 'k'.
	- Strengths: Easy to implement and understand.
	- Weaknesses: Computationally expensive for large datasets and sensitive to feature scaling.
	""",
	    "MLP Neural Network": """
	MLP Neural Network Discussion:
	- Performance: Provides competitive accuracy with a flexible model that can capture non-linear relationships.
	- Strengths: Can learn complex patterns with enough training data.
	- Weaknesses: Requires careful tuning of hyperparameters and more computational resources compared to simpler models.
	""",
	}

	# Any label without a prepared write-up gets the generic notice.
	discussion_text = discussion_by_model.get(
	    model_choice, "No discussion available for the selected model."
	)
	st.markdown(discussion_text)

	# Offer the classification report table as a CSV download; the CSV text is
	# encoded to UTF-8 bytes before being handed to the button.
	report_csv_bytes = cr_df.to_csv().encode('utf-8')
	st.download_button(
	    label="Download Classification Report as CSV",
	    data=report_csv_bytes,
	    file_name="classification_report.csv",
	    mime="text/csv",
	)