# Hugging Face upload metadata (not code): Dun3Co's picture — Upload app.py — cf5ff90 verified
# start by importing the necessary packages
#standard
import numpy as np
import pandas as pd
#plt packages
import seaborn as sns
import altair as alt
import matplotlib.pyplot as plt
#streamlit
import streamlit as st
#sklearn
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
# Page-level Streamlit configuration; must run before any other st.* call.
st.set_page_config(page_title="StressedOUT – Cached/DB", page_icon=":skull:", layout="wide")
# FIX: the visible title said "StressedOOUT" (typo) while page_title says
# "StressedOUT" — align the two.
st.title("StressedOUT – Looking into a dataset of stressed students (Cached)")
st.caption("Reads .csv files. Uses Streamlit caching and a form submit gate.")
# NOTE(review): despite the name, BASE_DIR is a CSV *file* path, not a
# directory; kept as-is because load_data() below reads it.
BASE_DIR = "StressLevelDataset.csv"
@st.cache_data
def load_data(path):
    """Read the CSV at *path* into a DataFrame; cached by Streamlit across reruns."""
    return pd.read_csv(path)

# Drop the outcome-like columns so the clustering works on the remaining
# lifestyle/context features only.
data = load_data(BASE_DIR).drop(
    columns=['future_career_concerns', 'anxiety_level', 'depression',
             'bullying', 'peer_pressure']
)
with st.sidebar:
    st.header("Filters")
    # All controls live inside a form so the script only reruns when the
    # user presses "Apply", not on every individual widget change.
    with st.form("filters"):
        analysis = st.radio(
            "Select your dataset",
            ('PCA reduced', 'No dimensionality reduction'),
            captions=('PCA reduced', 'No dimensionality reduction')
        )
        # k: number of KMeans clusters (2..10, default 4).
        k = st.slider("Select number of clusters (k)", 2, 10, 4, step=1)
        # iterations: how many KMeans update steps the demo figure shows.
        iterations = st.slider("Select number of iterations to show", 1, 10, 5, step=1)
        # seed: RNG seed for center initialization / KMeans random_state.
        seed = st.number_input("Random seed", min_value=0, max_value=100, value=42, step=1)
        st.write("For no dimensionality reduction, the first two features will be used for visualization.")
        # Axes for the no-reduction scatter plot (chosen from the data columns).
        feature_x = st.selectbox("Select X-axis feature", data.columns, index=0)
        feature_y = st.selectbox("Select Y-axis feature", data.columns, index=1)
        submitted = st.form_submit_button("Apply")

# Gate: stop the script until the form has been submitted at least once.
if not submitted:
    st.info("Adjust filters and click **Apply**.")
    st.stop()
def kmeans_iteration_demo(X, k, max_iters=iterations, random_seed=None):
    """Render the first few iterations of Lloyd's k-means as a row of panels.

    Panel 0 shows the random initialization; each subsequent panel shows the
    assignment of points to their nearest center, with red arrows marking how
    the centers moved from the previous step.

    Parameters
    ----------
    X : ndarray of shape (n_samples, 2)
        2-D points to cluster (only columns 0 and 1 are plotted).
    k : int
        Number of clusters (the sidebar allows 2..10).
    max_iters : int, default: sidebar ``iterations`` value
        Number of update steps drawn after the initialization panel.
    random_seed : int or None
        Seed for center initialization; falls back to the sidebar ``seed``.
    """
    if random_seed is None:
        random_seed = seed
    np.random.seed(random_seed)
    # Pick k distinct data points as the initial centers.
    centers = X[np.random.choice(len(X), k, replace=False)]
    fig, axes = plt.subplots(1, max_iters + 1, figsize=(20, 4))
    # BUG FIX: the original hard-coded a 5-entry color list, raising
    # IndexError whenever k > 5 (the slider allows up to 10). Sample a
    # colormap for exactly k colors instead.
    palette = plt.cm.tab10(np.linspace(0, 1, k))
    for iteration in range(max_iters + 1):
        ax = axes[iteration]
        if iteration == 0:
            # Show the raw points plus the random initial centers.
            ax.scatter(X[:, 0], X[:, 1], c='lightgray', alpha=0.6, s=30)
            ax.scatter(centers[:, 0], centers[:, 1], c='red', s=200, marker='X',
                       edgecolors='black', linewidths=2)
            ax.set_title(f'Iteration {iteration}\n(Random Initialization)')
        else:
            # Assignment step: label each point with its nearest center.
            distances = np.sqrt(((X - centers[:, np.newaxis]) ** 2).sum(axis=2))
            labels = np.argmin(distances, axis=0)
            for j in range(k):
                mask = labels == j
                ax.scatter(X[mask, 0], X[mask, 1],
                           c=[palette[j]], alpha=0.6, s=30, label=f'Cluster {j+1}')
            ax.scatter(centers[:, 0], centers[:, 1], c='black', s=200, marker='X',
                       edgecolors='white', linewidths=2)
            ax.set_title(f'Iteration {iteration}')
            # Update step: move each center to the mean of its members.
            # ROBUSTNESS FIX: a cluster can lose all its points, in which
            # case .mean() of the empty slice yields NaN and poisons the
            # center — keep the previous center for empty clusters.
            new_centers = np.array([
                X[labels == j].mean(axis=0) if np.any(labels == j) else centers[j]
                for j in range(k)
            ])
            # From the second iteration on, draw arrows showing how far
            # each center moved this step.
            if iteration > 1:
                for j in range(k):
                    ax.annotate('', xy=new_centers[j], xytext=centers[j],
                                arrowprops=dict(arrowstyle='->', lw=2, color='red', alpha=0.7))
            centers = new_centers
        ax.set_xlabel('PC1')
        ax.set_ylabel('PC2')
        ax.grid(True, alpha=0.3)
    plt.tight_layout()
    st.pyplot(fig)
if analysis == 'PCA reduced':
    # --- PCA path: standardize, project, inspect variance, then cluster. ---
    data_scaled = StandardScaler().fit_transform(data)
    data_reduced_df = pd.DataFrame(data_scaled, columns=data.columns)
    st.write('You selected PCA reduced')
    pca = PCA()
    pca_data = pca.fit_transform(data_reduced_df)
    pca_data_pd = pd.DataFrame(pca_data, columns=[f'PC{i+1}' for i in range(pca_data.shape[1])])
    st.write('The PCA reduced data is shown below')
    st.dataframe(pca_data_pd.head(10))

    # Per-component and cumulative explained-variance summary.
    explained_variance = pca.explained_variance_ratio_
    cumulative_variance = np.cumsum(explained_variance)
    st.write("Explained Variance by Component:")
    for i in range(min(10, len(explained_variance))):
        st.write(f"PC{i+1}: {explained_variance[i]:.3f} ({explained_variance[i]*100:.1f}%)")
    st.write(f"\nFirst 3 components explain {cumulative_variance[2]*100:.1f}% of total variance")
    st.write(f"First 5 components explain {cumulative_variance[4]*100:.1f}% of total variance")

    # Scree plot (left) and cumulative variance (right).
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))
    ax1.plot(range(1, len(explained_variance) + 1), explained_variance, marker='o', linestyle='--')
    ax1.set_title('Scree Plot')
    ax1.set_xlabel('Principal Component')
    ax1.set_ylabel('Variance Explained')
    ax1.axvline(x=3, color='r', linestyle='--', label='3 components')
    ax1.axvline(x=5, color='g', linestyle='--', label='5 components')
    ax1.legend()
    ax1.grid()
    ax2.plot(range(1, len(cumulative_variance) + 1), cumulative_variance, marker='o', linestyle='--', color='orange')
    ax2.set_title('Cumulative Variance Explained')
    ax2.set_xlabel('Number of Principal Components')
    ax2.set_ylabel('Cumulative Variance Explained')
    ax2.axhline(y=0.9, color='r', linestyle='--', label='90% variance')
    ax2.axhline(y=0.95, color='g', linestyle='--', label='95% variance')
    ax2.legend()
    ax2.grid()
    st.pyplot(fig)

    # Loadings table: how strongly each original feature contributes to
    # each of the first five components.
    components_df = pd.DataFrame(
        pca.components_[:5].T,  # First 5 components
        columns=[f'PC{i+1}' for i in range(5)],
        index=data_reduced_df.columns
    )
    st.write("PCA Component Loadings (first 5 components):")
    st.dataframe(components_df)

    # 3x2 grid: PC1/PC2/PC3 loadings plus pairwise scatter plots of the
    # students in component space. (The original comments said "cities" —
    # copied from another dataset; these are students.)
    fig, axes = plt.subplots(3, 2, figsize=(16, 12))
    pc1_loadings = components_df['PC1'].sort_values(key=abs, ascending=False)
    axes[0, 0].barh(range(len(pc1_loadings)), pc1_loadings.values)
    axes[0, 0].set_yticks(range(len(pc1_loadings)))
    axes[0, 0].set_yticklabels(pc1_loadings.index, fontsize=9)
    axes[0, 0].set_title(f'PC1 Loadings (Explains {explained_variance[0]*100:.1f}% of variance)')
    axes[0, 0].axvline(x=0, color='black', linestyle='-', alpha=0.3)
    pc2_loadings = components_df['PC2'].sort_values(key=abs, ascending=False)
    axes[0, 1].barh(range(len(pc2_loadings)), pc2_loadings.values, color='orange')
    axes[0, 1].set_yticks(range(len(pc2_loadings)))
    axes[0, 1].set_yticklabels(pc2_loadings.index, fontsize=9)
    axes[0, 1].set_title(f'PC2 Loadings (Explains {explained_variance[1]*100:.1f}% of variance)')
    axes[0, 1].axvline(x=0, color='black', linestyle='-', alpha=0.3)
    pc3_loadings = components_df['PC3'].sort_values(key=abs, ascending=False)
    axes[1, 0].barh(range(len(pc3_loadings)), pc3_loadings.values, color='green')
    axes[1, 0].set_yticks(range(len(pc3_loadings)))
    axes[1, 0].set_yticklabels(pc3_loadings.index, fontsize=9)
    axes[1, 0].set_title(f'PC3 Loadings (Explains {explained_variance[2]*100:.1f}% of variance)')
    axes[1, 0].axvline(x=0, color='black', linestyle='-', alpha=0.3)
    axes[1, 1].scatter(pca_data[:, 0], pca_data[:, 1], alpha=0.6)
    axes[1, 1].set_xlabel('PC1')
    axes[1, 1].set_ylabel('PC2')
    axes[1, 1].set_title('Students in PC1-PC2 Space')
    axes[1, 1].grid(True, alpha=0.3)
    axes[2, 0].scatter(pca_data[:, 0], pca_data[:, 2], alpha=0.6)
    axes[2, 0].set_xlabel('PC1')
    axes[2, 0].set_ylabel('PC3')
    axes[2, 0].set_title('Students in PC1-PC3 Space')
    axes[2, 0].grid(True, alpha=0.3)
    axes[2, 1].scatter(pca_data[:, 1], pca_data[:, 2], alpha=0.6)
    axes[2, 1].set_xlabel('PC2')
    axes[2, 1].set_ylabel('PC3')
    axes[2, 1].set_title('Students in PC2-PC3 Space')
    axes[2, 1].grid(True, alpha=0.3)
    plt.tight_layout()
    st.pyplot(fig)

    # KMeans clustering on the first 5 principal components.
    # FIX: was random_state=42 here while the no-reduction branch uses the
    # sidebar `seed` — now consistent. The sidebar default is 42, so the
    # default behavior is unchanged.
    kmeans = KMeans(n_clusters=k, random_state=seed)
    cluster_labels = kmeans.fit_predict(pca_data[:, :5])
    silhouette_avg = silhouette_score(pca_data[:, :5], cluster_labels)
    st.write(f"Silhouette Score for k={k}: {silhouette_avg:.3f}")
    pca_data_pd['Cluster'] = cluster_labels
    # FIX: the original refit a *second* PCA on the already-PCA-transformed
    # data just to get two columns; since pca_data's columns are the
    # principal components in order, PC1/PC2 are simply the first two
    # columns — take them directly.
    pca_2d_df = pd.DataFrame(pca_data[:, :2], columns=['PC1', 'PC2'])
    pca_2d_df['Cluster'] = cluster_labels
    st.write("2D PCA plot with KMeans clusters:")
    kmeans_iteration_demo(pca_2d_df[['PC1', 'PC2']].values, k)
else:
st.write('You selected No dimensionality reduction')
st.write('The original data is shown below')
st.dataframe(data.head(10))
# Standardize the data
data_scaled = StandardScaler().fit_transform(data)
data_scaled_df = pd.DataFrame(data_scaled, columns=data.columns)
st.dataframe(data_scaled_df.head(10))
# KMeans clustering on original scaled data
kmeans = KMeans(n_clusters=k, random_state=seed)
cluster_labels = kmeans.fit_predict(data_scaled_df)
silhouette_avg = silhouette_score(data_scaled_df, cluster_labels)
st.write(f"Silhouette Score for k={k}: {silhouette_avg:.3f}")
# Add cluster labels for plotting
data_scaled_df['Cluster'] = cluster_labels
# 2D scatter plot using two original features for visualization
fig, ax = plt.subplots(figsize=(8, 6))
scatter = ax.scatter(
data_scaled_df[feature_x], data_scaled_df[feature_y],
c=cluster_labels, cmap='tab10', alpha=0.7, s=50
)
ax.set_xlabel(feature_x)
ax.set_ylabel(feature_y)
ax.set_title('KMeans Clusters (Original Scaled Features)')
plt.colorbar(scatter, ax=ax, label='Cluster')
st.pyplot(fig)