# Hugging Face upload metadata (not code): Dun3Co's picture — Upload app.py — cf5ff90 verified
# start by importing the necessary packages
#standard
import numpy as np
import pandas as pd
#plt packages
import seaborn as sns
import altair as alt
import matplotlib.pyplot as plt
#streamlit
import streamlit as st
#sklearn
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
# Page-level Streamlit configuration; must run before any other st.* call.
st.set_page_config(page_title="StressedOUT – Cached/DB", page_icon=":skull:", layout="wide")
# FIX: the visible title said "StressedOOUT" (typo) while page_title says
# "StressedOUT" — align the two.
st.title("StressedOUT – Looking into a dataset of stressed students (Cached)")
st.caption("Reads .csv files. Uses Streamlit caching and a form submit gate.")
# NOTE(review): despite the name, BASE_DIR is a CSV *file* path, not a
# directory; kept as-is because load_data() below reads it.
BASE_DIR = "StressLevelDataset.csv"
@st.cache_data
def load_data(path):
    """Read the CSV at *path* into a DataFrame; cached by Streamlit across reruns."""
    return pd.read_csv(path)

# Drop the outcome-like columns so the clustering works on the remaining
# lifestyle/context features only.
data = load_data(BASE_DIR).drop(
    columns=['future_career_concerns', 'anxiety_level', 'depression',
             'bullying', 'peer_pressure']
)
with st.sidebar:
    st.header("Filters")
    # All controls live inside a form so the script only reruns when the
    # user presses "Apply", not on every individual widget change.
    with st.form("filters"):
        analysis = st.radio(
            "Select your dataset",
            ('PCA reduced', 'No dimensionality reduction'),
            captions=('PCA reduced', 'No dimensionality reduction')
        )
        # k: number of KMeans clusters (2..10, default 4).
        k = st.slider("Select number of clusters (k)", 2, 10, 4, step=1)
        # iterations: how many KMeans update steps the demo figure shows.
        iterations = st.slider("Select number of iterations to show", 1, 10, 5, step=1)
        # seed: RNG seed for center initialization / KMeans random_state.
        seed = st.number_input("Random seed", min_value=0, max_value=100, value=42, step=1)
        st.write("For no dimensionality reduction, the first two features will be used for visualization.")
        # Axes for the no-reduction scatter plot (chosen from the data columns).
        feature_x = st.selectbox("Select X-axis feature", data.columns, index=0)
        feature_y = st.selectbox("Select Y-axis feature", data.columns, index=1)
        submitted = st.form_submit_button("Apply")

# Gate: stop the script until the form has been submitted at least once.
if not submitted:
    st.info("Adjust filters and click **Apply**.")
    st.stop()
def kmeans_iteration_demo(X, k, max_iters=iterations, random_seed=None):
    """Render the first few iterations of Lloyd's k-means as a row of panels.

    Panel 0 shows the random initialization; each subsequent panel shows the
    assignment of points to their nearest center, with red arrows marking how
    the centers moved from the previous step.

    Parameters
    ----------
    X : ndarray of shape (n_samples, 2)
        2-D points to cluster (only columns 0 and 1 are plotted).
    k : int
        Number of clusters (the sidebar allows 2..10).
    max_iters : int, default: sidebar ``iterations`` value
        Number of update steps drawn after the initialization panel.
    random_seed : int or None
        Seed for center initialization; falls back to the sidebar ``seed``.
    """
    if random_seed is None:
        random_seed = seed
    np.random.seed(random_seed)
    # Pick k distinct data points as the initial centers.
    centers = X[np.random.choice(len(X), k, replace=False)]
    fig, axes = plt.subplots(1, max_iters + 1, figsize=(20, 4))
    # BUG FIX: the original hard-coded a 5-entry color list, raising
    # IndexError whenever k > 5 (the slider allows up to 10). Sample a
    # colormap for exactly k colors instead.
    palette = plt.cm.tab10(np.linspace(0, 1, k))
    for iteration in range(max_iters + 1):
        ax = axes[iteration]
        if iteration == 0:
            # Show the raw points plus the random initial centers.
            ax.scatter(X[:, 0], X[:, 1], c='lightgray', alpha=0.6, s=30)
            ax.scatter(centers[:, 0], centers[:, 1], c='red', s=200, marker='X',
                       edgecolors='black', linewidths=2)
            ax.set_title(f'Iteration {iteration}\n(Random Initialization)')
        else:
            # Assignment step: label each point with its nearest center.
            distances = np.sqrt(((X - centers[:, np.newaxis]) ** 2).sum(axis=2))
            labels = np.argmin(distances, axis=0)
            for j in range(k):
                mask = labels == j
                ax.scatter(X[mask, 0], X[mask, 1],
                           c=[palette[j]], alpha=0.6, s=30, label=f'Cluster {j+1}')
            ax.scatter(centers[:, 0], centers[:, 1], c='black', s=200, marker='X',
                       edgecolors='white', linewidths=2)
            ax.set_title(f'Iteration {iteration}')
            # Update step: move each center to the mean of its members.
            # ROBUSTNESS FIX: a cluster can lose all its points, in which
            # case .mean() of the empty slice yields NaN and poisons the
            # center — keep the previous center for empty clusters.
            new_centers = np.array([
                X[labels == j].mean(axis=0) if np.any(labels == j) else centers[j]
                for j in range(k)
            ])
            # From the second iteration on, draw arrows showing how far
            # each center moved this step.
            if iteration > 1:
                for j in range(k):
                    ax.annotate('', xy=new_centers[j], xytext=centers[j],
                                arrowprops=dict(arrowstyle='->', lw=2, color='red', alpha=0.7))
            centers = new_centers
        ax.set_xlabel('PC1')
        ax.set_ylabel('PC2')
        ax.grid(True, alpha=0.3)
    plt.tight_layout()
    st.pyplot(fig)
if analysis == 'PCA reduced':
    # --- PCA path: standardize, project, inspect variance, then cluster. ---
    data_scaled = StandardScaler().fit_transform(data)
    data_reduced_df = pd.DataFrame(data_scaled, columns=data.columns)
    st.write('You selected PCA reduced')
    pca = PCA()
    pca_data = pca.fit_transform(data_reduced_df)
    pca_data_pd = pd.DataFrame(pca_data, columns=[f'PC{i+1}' for i in range(pca_data.shape[1])])
    st.write('The PCA reduced data is shown below')
    st.dataframe(pca_data_pd.head(10))

    # Per-component and cumulative explained-variance summary.
    explained_variance = pca.explained_variance_ratio_
    cumulative_variance = np.cumsum(explained_variance)
    st.write("Explained Variance by Component:")
    for i in range(min(10, len(explained_variance))):
        st.write(f"PC{i+1}: {explained_variance[i]:.3f} ({explained_variance[i]*100:.1f}%)")
    st.write(f"\nFirst 3 components explain {cumulative_variance[2]*100:.1f}% of total variance")
    st.write(f"First 5 components explain {cumulative_variance[4]*100:.1f}% of total variance")

    # Scree plot (left) and cumulative variance (right).
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))
    ax1.plot(range(1, len(explained_variance) + 1), explained_variance, marker='o', linestyle='--')
    ax1.set_title('Scree Plot')
    ax1.set_xlabel('Principal Component')
    ax1.set_ylabel('Variance Explained')
    ax1.axvline(x=3, color='r', linestyle='--', label='3 components')
    ax1.axvline(x=5, color='g', linestyle='--', label='5 components')
    ax1.legend()
    ax1.grid()
    ax2.plot(range(1, len(cumulative_variance) + 1), cumulative_variance, marker='o', linestyle='--', color='orange')
    ax2.set_title('Cumulative Variance Explained')
    ax2.set_xlabel('Number of Principal Components')
    ax2.set_ylabel('Cumulative Variance Explained')
    ax2.axhline(y=0.9, color='r', linestyle='--', label='90% variance')
    ax2.axhline(y=0.95, color='g', linestyle='--', label='95% variance')
    ax2.legend()
    ax2.grid()
    st.pyplot(fig)

    # Loadings table: how strongly each original feature contributes to
    # each of the first five components.
    components_df = pd.DataFrame(
        pca.components_[:5].T,  # First 5 components
        columns=[f'PC{i+1}' for i in range(5)],
        index=data_reduced_df.columns
    )
    st.write("PCA Component Loadings (first 5 components):")
    st.dataframe(components_df)

    # 3x2 grid: PC1/PC2/PC3 loadings plus pairwise scatter plots of the
    # students in component space. (The original comments said "cities" —
    # copied from another dataset; these are students.)
    fig, axes = plt.subplots(3, 2, figsize=(16, 12))
    pc1_loadings = components_df['PC1'].sort_values(key=abs, ascending=False)
    axes[0, 0].barh(range(len(pc1_loadings)), pc1_loadings.values)
    axes[0, 0].set_yticks(range(len(pc1_loadings)))
    axes[0, 0].set_yticklabels(pc1_loadings.index, fontsize=9)
    axes[0, 0].set_title(f'PC1 Loadings (Explains {explained_variance[0]*100:.1f}% of variance)')
    axes[0, 0].axvline(x=0, color='black', linestyle='-', alpha=0.3)
    pc2_loadings = components_df['PC2'].sort_values(key=abs, ascending=False)
    axes[0, 1].barh(range(len(pc2_loadings)), pc2_loadings.values, color='orange')
    axes[0, 1].set_yticks(range(len(pc2_loadings)))
    axes[0, 1].set_yticklabels(pc2_loadings.index, fontsize=9)
    axes[0, 1].set_title(f'PC2 Loadings (Explains {explained_variance[1]*100:.1f}% of variance)')
    axes[0, 1].axvline(x=0, color='black', linestyle='-', alpha=0.3)
    pc3_loadings = components_df['PC3'].sort_values(key=abs, ascending=False)
    axes[1, 0].barh(range(len(pc3_loadings)), pc3_loadings.values, color='green')
    axes[1, 0].set_yticks(range(len(pc3_loadings)))
    axes[1, 0].set_yticklabels(pc3_loadings.index, fontsize=9)
    axes[1, 0].set_title(f'PC3 Loadings (Explains {explained_variance[2]*100:.1f}% of variance)')
    axes[1, 0].axvline(x=0, color='black', linestyle='-', alpha=0.3)
    axes[1, 1].scatter(pca_data[:, 0], pca_data[:, 1], alpha=0.6)
    axes[1, 1].set_xlabel('PC1')
    axes[1, 1].set_ylabel('PC2')
    axes[1, 1].set_title('Students in PC1-PC2 Space')
    axes[1, 1].grid(True, alpha=0.3)
    axes[2, 0].scatter(pca_data[:, 0], pca_data[:, 2], alpha=0.6)
    axes[2, 0].set_xlabel('PC1')
    axes[2, 0].set_ylabel('PC3')
    axes[2, 0].set_title('Students in PC1-PC3 Space')
    axes[2, 0].grid(True, alpha=0.3)
    axes[2, 1].scatter(pca_data[:, 1], pca_data[:, 2], alpha=0.6)
    axes[2, 1].set_xlabel('PC2')
    axes[2, 1].set_ylabel('PC3')
    axes[2, 1].set_title('Students in PC2-PC3 Space')
    axes[2, 1].grid(True, alpha=0.3)
    plt.tight_layout()
    st.pyplot(fig)

    # KMeans clustering on the first 5 principal components.
    # FIX: was random_state=42 here while the no-reduction branch uses the
    # sidebar `seed` — now consistent. The sidebar default is 42, so the
    # default behavior is unchanged.
    kmeans = KMeans(n_clusters=k, random_state=seed)
    cluster_labels = kmeans.fit_predict(pca_data[:, :5])
    silhouette_avg = silhouette_score(pca_data[:, :5], cluster_labels)
    st.write(f"Silhouette Score for k={k}: {silhouette_avg:.3f}")
    pca_data_pd['Cluster'] = cluster_labels
    # FIX: the original refit a *second* PCA on the already-PCA-transformed
    # data just to get two columns; since pca_data's columns are the
    # principal components in order, PC1/PC2 are simply the first two
    # columns — take them directly.
    pca_2d_df = pd.DataFrame(pca_data[:, :2], columns=['PC1', 'PC2'])
    pca_2d_df['Cluster'] = cluster_labels
    st.write("2D PCA plot with KMeans clusters:")
    kmeans_iteration_demo(pca_2d_df[['PC1', 'PC2']].values, k)
else:
st.write('You selected No dimensionality reduction')
st.write('The original data is shown below')
st.dataframe(data.head(10))
# Standardize the data
data_scaled = StandardScaler().fit_transform(data)
data_scaled_df = pd.DataFrame(data_scaled, columns=data.columns)
st.dataframe(data_scaled_df.head(10))
# KMeans clustering on original scaled data
kmeans = KMeans(n_clusters=k, random_state=seed)
cluster_labels = kmeans.fit_predict(data_scaled_df)
silhouette_avg = silhouette_score(data_scaled_df, cluster_labels)
st.write(f"Silhouette Score for k={k}: {silhouette_avg:.3f}")
# Add cluster labels for plotting
data_scaled_df['Cluster'] = cluster_labels
# 2D scatter plot using two original features for visualization
fig, ax = plt.subplots(figsize=(8, 6))
scatter = ax.scatter(
data_scaled_df[feature_x], data_scaled_df[feature_y],
c=cluster_labels, cmap='tab10', alpha=0.7, s=50
)
ax.set_xlabel(feature_x)
ax.set_ylabel(feature_y)
ax.set_title('KMeans Clusters (Original Scaled Features)')
plt.colorbar(scatter, ax=ax, label='Cluster')
st.pyplot(fig)