# StressedOUT — Streamlit app (student stress dataset clustering demo)
# start by importing the necessary packages
# standard
import numpy as np
import pandas as pd

# plotting
import seaborn as sns
import altair as alt
import matplotlib.pyplot as plt

# streamlit
import streamlit as st

# sklearn
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
st.set_page_config(page_title="StressedOUT – Cached/DB", page_icon=":skull:", layout="wide")
# Fixed typo: title previously said "StressedOOUT", inconsistent with page_title.
st.title("StressedOUT – Looking into a dataset of stressed students (Cached)")
st.caption("Reads .csv files. Uses Streamlit caching and a form submit gate.")

# Path to the dataset CSV (relative to the app's working directory).
BASE_DIR = "StressLevelDataset.csv"


@st.cache_data
def load_data(path):
    """Load the CSV at *path* into a DataFrame.

    Decorated with st.cache_data so the file is only re-read when *path*
    changes — the caption above promises caching, but the original
    implementation never enabled it.
    """
    return pd.read_csv(path)


# Drop the stress-outcome / highly related columns so clustering is driven
# by the remaining features.
data = load_data(BASE_DIR).drop(
    columns=['future_career_concerns', 'anxiety_level', 'depression',
             'bullying', 'peer_pressure']
)
with st.sidebar:
    st.header("Filters")
    with st.form("filters"):
        # Which feature space to cluster: PCA scores or the raw (scaled) features.
        analysis = st.radio(
            "Select your dataset",
            ('PCA reduced', 'No dimensionality reduction'),
            captions=('PCA reduced', 'No dimensionality reduction'),
        )
        # Clustering controls.
        k = st.slider("Select number of clusters (k)", 2, 10, 4, step=1)
        iterations = st.slider("Select number of iterations to show", 1, 10, 5, step=1)
        seed = st.number_input("Random seed", min_value=0, max_value=100, value=42, step=1)
        st.write("For no dimensionality reduction, the first two features will be used for visualization.")
        # Axes for the non-PCA scatter plot.
        feature_x = st.selectbox("Select X-axis feature", data.columns, index=0)
        feature_y = st.selectbox("Select Y-axis feature", data.columns, index=1)
        submitted = st.form_submit_button("Apply")

# Gate: nothing below runs until the user has pressed Apply at least once.
if not submitted:
    st.info("Adjust filters and click **Apply**.")
    st.stop()
def kmeans_iteration_demo(X, k, max_iters=iterations, random_state=seed):
    """Visualize the first *max_iters* Lloyd iterations of k-means on 2-D data.

    Parameters
    ----------
    X : ndarray of shape (n_samples, 2)
        Points to cluster; columns are plotted as PC1/PC2.
    k : int
        Number of clusters; initial centers are k distinct rows of X.
    max_iters : int
        Number of update steps shown after the random-initialization panel.
    random_state : int
        Seed for the center initialization (defaults to the sidebar seed,
        keeping the original call sites working unchanged).

    Renders one matplotlib figure (a panel per iteration) via st.pyplot.
    """
    # Local Generator instead of np.random.seed(), which mutated the
    # process-wide RNG state as a side effect.
    rng = np.random.default_rng(random_state)
    centers = X[rng.choice(len(X), k, replace=False)]

    # One distinct color per cluster. tab10 covers k up to 10, matching the
    # sidebar slider's range — the original hard-coded 5 color names and
    # raised IndexError for k > 5.
    cmap = plt.get_cmap('tab10')

    fig, axes = plt.subplots(1, max_iters + 1, figsize=(20, 4))
    for iteration in range(max_iters + 1):
        ax = axes[iteration]
        if iteration == 0:
            # Panel 0: unlabeled points plus the random initial centers.
            ax.scatter(X[:, 0], X[:, 1], c='lightgray', alpha=0.6, s=30)
            ax.scatter(centers[:, 0], centers[:, 1], c='red', s=200, marker='X',
                       edgecolors='black', linewidths=2)
            ax.set_title(f'Iteration {iteration}\n(Random Initialization)')
        else:
            # Assignment step: label each point with its nearest center.
            distances = np.sqrt(((X - centers[:, np.newaxis]) ** 2).sum(axis=2))
            labels = np.argmin(distances, axis=0)

            for j in range(k):
                mask = labels == j
                ax.scatter(X[mask, 0], X[mask, 1],
                           c=[cmap(j)], alpha=0.6, s=30, label=f'Cluster {j+1}')
            ax.scatter(centers[:, 0], centers[:, 1], c='black', s=200, marker='X',
                       edgecolors='white', linewidths=2)
            ax.set_title(f'Iteration {iteration}')

            # Update step: each center moves to its cluster mean. An empty
            # cluster keeps its previous center (the original took the mean
            # of an empty slice and produced NaN centers).
            new_centers = np.array([
                X[labels == j].mean(axis=0) if np.any(labels == j) else centers[j]
                for j in range(k)
            ])

            # Arrows show how far each center moved this iteration.
            if iteration > 1:
                for j in range(k):
                    ax.annotate('', xy=new_centers[j], xytext=centers[j],
                                arrowprops=dict(arrowstyle='->', lw=2, color='red', alpha=0.7))
            centers = new_centers

        ax.set_xlabel('PC1')
        ax.set_ylabel('PC2')
        ax.grid(True, alpha=0.3)

    plt.tight_layout()
    st.pyplot(fig)
if analysis == 'PCA reduced':
    # Standardize so PCA is not dominated by large-scale columns.
    # (Renamed from the original's misleading `data_reduced_df` — this is
    # scaled data, not yet dimensionality-reduced.)
    data_scaled = StandardScaler().fit_transform(data)
    data_scaled_df = pd.DataFrame(data_scaled, columns=data.columns)
    st.write('You selected PCA reduced')

    # Full PCA (all components) on the standardized data.
    pca = PCA()
    pca_data = pca.fit_transform(data_scaled_df)
    pca_data_pd = pd.DataFrame(pca_data, columns=[f'PC{i+1}' for i in range(pca_data.shape[1])])
    st.write('The PCA reduced data is shown below')
    st.dataframe(pca_data_pd.head(10))

    # Explained-variance summary (per component and cumulative).
    explained_variance = pca.explained_variance_ratio_
    cumulative_variance = np.cumsum(explained_variance)
    st.write("Explained Variance by Component:")
    for i in range(min(10, len(explained_variance))):
        st.write(f"PC{i+1}: {explained_variance[i]:.3f} ({explained_variance[i]*100:.1f}%)")
    st.write(f"\nFirst 3 components explain {cumulative_variance[2]*100:.1f}% of total variance")
    st.write(f"First 5 components explain {cumulative_variance[4]*100:.1f}% of total variance")

    # --- Scree plot and cumulative-variance plot ------------------------
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))
    ax1.plot(range(1, len(explained_variance) + 1), explained_variance, marker='o', linestyle='--')
    ax1.set_title('Scree Plot')
    ax1.set_xlabel('Principal Component')
    ax1.set_ylabel('Variance Explained')
    ax1.axvline(x=3, color='r', linestyle='--', label='3 components')
    ax1.axvline(x=5, color='g', linestyle='--', label='5 components')
    ax1.legend()
    ax1.grid()
    ax2.plot(range(1, len(cumulative_variance) + 1), cumulative_variance, marker='o', linestyle='--', color='orange')
    ax2.set_title('Cumulative Variance Explained')
    ax2.set_xlabel('Number of Principal Components')
    ax2.set_ylabel('Cumulative Variance Explained')
    ax2.axhline(y=0.9, color='r', linestyle='--', label='90% variance')
    ax2.axhline(y=0.95, color='g', linestyle='--', label='95% variance')
    ax2.legend()
    ax2.grid()
    st.pyplot(fig)

    # Loadings of the first 5 components (feature -> component weights).
    components_df = pd.DataFrame(
        pca.components_[:5].T,  # First 5 components
        columns=[f'PC{i+1}' for i in range(5)],
        index=data_scaled_df.columns
    )
    st.write("PCA Component Loadings (first 5 components):")
    st.dataframe(components_df)

    # --- Loading bars for PC1-PC3 and pairwise PC scatter plots ---------
    fig, axes = plt.subplots(3, 2, figsize=(16, 12))
    bar_colors = {1: None, 2: 'orange', 3: 'green'}  # None = matplotlib default
    bar_positions = {1: (0, 0), 2: (0, 1), 3: (1, 0)}
    for pc in (1, 2, 3):
        ax = axes[bar_positions[pc]]
        loadings = components_df[f'PC{pc}'].sort_values(key=abs, ascending=False)
        ax.barh(range(len(loadings)), loadings.values, color=bar_colors[pc])
        ax.set_yticks(range(len(loadings)))
        ax.set_yticklabels(loadings.index, fontsize=9)
        ax.set_title(f'PC{pc} Loadings (Explains {explained_variance[pc-1]*100:.1f}% of variance)')
        ax.axvline(x=0, color='black', linestyle='-', alpha=0.3)
    # Scatter plots of students in pairwise PC spaces.
    for ax, (a, b) in ((axes[1, 1], (1, 2)), (axes[2, 0], (1, 3)), (axes[2, 1], (2, 3))):
        ax.scatter(pca_data[:, a - 1], pca_data[:, b - 1], alpha=0.6)
        ax.set_xlabel(f'PC{a}')
        ax.set_ylabel(f'PC{b}')
        ax.set_title(f'Students in PC{a}-PC{b} Space')
        ax.grid(True, alpha=0.3)
    plt.tight_layout()
    st.pyplot(fig)

    # --- KMeans clustering on the first 5 principal components ----------
    # Uses the sidebar seed; the original hard-coded random_state=42 here,
    # so the seed widget silently had no effect in this branch.
    kmeans = KMeans(n_clusters=k, random_state=seed)
    cluster_labels = kmeans.fit_predict(pca_data[:, :5])
    silhouette_avg = silhouette_score(pca_data[:, :5], cluster_labels)
    st.write(f"Silhouette Score for k={k}: {silhouette_avg:.3f}")
    pca_data_pd['Cluster'] = cluster_labels

    # PCA scores are already decorrelated and variance-ordered, so the best
    # 2-D view is simply the first two columns — the original re-fit a
    # second PCA on pca_data, which is redundant (same subspace, possibly
    # sign-flipped).
    pca_2d_df = pca_data_pd[['PC1', 'PC2', 'Cluster']].copy()
    st.write("2D PCA plot with KMeans clusters:")
    kmeans_iteration_demo(pca_2d_df[['PC1', 'PC2']].values, k)
else:
    st.write('You selected No dimensionality reduction')
    st.write('The original data is shown below')
    st.dataframe(data.head(10))

    # Standardize so distance-based clustering weighs all features equally.
    data_scaled = StandardScaler().fit_transform(data)
    data_scaled_df = pd.DataFrame(data_scaled, columns=data.columns)
    st.dataframe(data_scaled_df.head(10))

    # KMeans on all scaled features, seeded from the sidebar widget.
    kmeans = KMeans(n_clusters=k, random_state=seed)
    cluster_labels = kmeans.fit_predict(data_scaled_df)
    silhouette_avg = silhouette_score(data_scaled_df, cluster_labels)
    st.write(f"Silhouette Score for k={k}: {silhouette_avg:.3f}")
    # Add cluster labels for plotting (after scoring, which must not see them).
    data_scaled_df['Cluster'] = cluster_labels

    # 2-D scatter of the two user-chosen features, colored by cluster.
    fig, ax = plt.subplots(figsize=(8, 6))
    scatter = ax.scatter(
        data_scaled_df[feature_x], data_scaled_df[feature_y],
        c=cluster_labels, cmap='tab10', alpha=0.7, s=50
    )
    ax.set_xlabel(feature_x)
    ax.set_ylabel(feature_y)
    ax.set_title('KMeans Clusters (Original Scaled Features)')
    plt.colorbar(scatter, ax=ax, label='Cluster')
    st.pyplot(fig)