| | import re |
| | import streamlit as st |
| | import plotly.express as px |
| | import plotly.graph_objects as go |
| | from plotly.subplots import make_subplots |
| | import pandas as pd |
| | import numpy as np |
| | from src.utils.logging import log_frontend_error, log_frontend_warning |
| |
|
# Upper bound on the number of rows handed to the chart builders.
SAMPLE_SIZE = 10_000
| |
|
| | |
@st.cache_data(show_spinner=False)
def compute_df_hash(df):
    """Return a cheap fingerprint for a dataframe.

    The fingerprint combines the frame's shape with the summed row hashes of
    at most its first 100 rows. Frames that share a shape and differ only
    beyond row 100 therefore produce the same value.
    """
    leading_rows = df.head(100)
    row_hash_total = pd.util.hash_pandas_object(leading_rows).sum()
    return hash((df.shape, row_hash_total))
| |
|
| |
|
@st.cache_data(show_spinner=False, ttl=3600)
def is_potential_date_column(series, sample_size=5):
    """Heuristically decide whether a column might hold dates.

    Args:
        series: The pandas Series to inspect.
        sample_size: Number of leading non-null values matched against the
            date patterns.

    Returns:
        True when the column label contains a date-related keyword, or when
        more than half of the sampled values start with a date-like pattern;
        False otherwise (including when there are no non-null values).
    """
    # Column labels are not guaranteed to be strings (integer labels, None),
    # so normalise before the keyword search instead of calling .lower() on
    # the raw label.
    label = str(series.name).lower()
    date_keywords = ('date', 'time', 'year', 'month', 'day')
    if any(keyword in label for keyword in date_keywords):
        return True

    sample = series.dropna().head(sample_size).astype(str)
    if sample.empty:
        return False

    date_patterns = (
        r'\d{4}-\d{2}-\d{2}',
        r'\d{2}/\d{2}/\d{4}',
        r'\d{2}-\w{3}-\d{2,4}',
        r'\d{1,2} \w{3,} \d{4}',
    )
    # re.match anchors at the start of the value only, so trailing text
    # (for example a time component) does not prevent a match.
    date_like = sum(
        1 for value in sample
        if any(re.match(pattern, value) for pattern in date_patterns)
    )
    return date_like / len(sample) > 0.5
| |
|
| |
|
| |
|
| |
|
| | |
@st.cache_data(show_spinner=False, ttl=3600)
def get_column_types(df):
    """Classify every column of ``df`` into a coarse visualisation type.

    Args:
        df: The dataframe whose columns are classified.

    Returns:
        A dict mapping each column label to one of "BINARY",
        "NUMERIC_DISCRETE", "NUMERIC_CONTINUOUS", "TEMPORAL", "ID",
        "CATEGORICAL" or "TEXT".
    """
    column_types = {}
    id_markers = ('id', 'code', 'key', 'uuid', 'identifier')

    for column in df.columns:
        values = df[column]

        if pd.api.types.is_numeric_dtype(values):
            # Count distinct values once; every numeric branch needs it.
            distinct = values.nunique()
            if distinct <= 2:
                column_types[column] = "BINARY"
            elif distinct < 20:
                column_types[column] = "NUMERIC_DISCRETE"
            else:
                column_types[column] = "NUMERIC_CONTINUOUS"
            continue

        if is_potential_date_column(values):
            try:
                converted = pd.to_datetime(values, errors='coerce')
                if not converted.isnull().all():
                    column_types[column] = "TEMPORAL"
                    continue
            except Exception:
                # Best effort: a column that cannot be parsed as dates is
                # simply classified by the cardinality rules below.
                pass

        distinct = values.nunique()
        # Labels may be non-string (e.g. integers), so normalise first.
        label = str(column).lower()
        if distinct > len(df) * 0.9 and any(marker in label for marker in id_markers):
            column_types[column] = "ID"
        elif distinct <= 20:
            column_types[column] = "CATEGORICAL"
        else:
            column_types[column] = "TEXT"

    return column_types
| |
|
| |
|
| |
|
| |
|
| | |
@st.cache_data(show_spinner=False, ttl=3600)
def get_corr_matrix(df):
    """Return the correlation matrix of the numeric columns, or None.

    Only the first 30 numeric columns are considered. None is returned when
    fewer than two numeric columns are available.
    """
    # Slicing to 30 is a no-op when there are 30 or fewer numeric columns.
    candidates = df.select_dtypes(include=[np.number]).columns[:30]
    if len(candidates) <= 1:
        return None
    return df[candidates].corr()
| |
|
| |
|
| |
|
| |
|
| |
|
| | |
@st.cache_data(show_spinner=False, ttl=3600)
def get_subsampled_data(df, column):
    """Return a single-column sample of ``df`` for faster plotting.

    Args:
        df: Source dataframe.
        column: Label of the column to sample.

    Returns:
        A one-column DataFrame. Frames larger than SAMPLE_SIZE whose column
        has fewer than 20 distinct values are sampled per distinct value, so
        every value keeps at least one row; otherwise a plain random sample
        of at most SAMPLE_SIZE rows is returned. An empty DataFrame is
        returned when the column does not exist.
    """
    if column not in df.columns:
        return pd.DataFrame()

    frame = df[[column]]

    # Check the cheap length condition first so nunique() is only computed
    # for frames that are actually large enough to need stratification.
    if len(df) > SAMPLE_SIZE and frame[column].nunique() < 20:
        try:
            fraction = min(0.5, SAMPLE_SIZE / len(df))
            # Iterate the groups explicitly rather than using groupby.apply:
            # apply no longer hands the grouping column to the function in
            # recent pandas releases, which would yield a column-less frame
            # here because the grouping column is the only column.
            parts = [
                group.sample(max(1, int(fraction * len(group))), random_state=42)
                for _, group in frame.groupby(column)
                if len(group) > 0
            ]
            # No groups at all (e.g. every value is null) gives an empty
            # frame, which callers treat as "nothing to plot".
            return pd.concat(parts) if parts else frame.iloc[:0]
        except Exception:
            # Best effort: fall back to plain random sampling below.
            pass

    return frame.sample(min(len(df), SAMPLE_SIZE), random_state=42)
| |
|
| |
|
| |
|
| |
|
| | |
def _labelled_bar(x, y, color):
    """Return a bar trace that prints each bar's value as its text label."""
    return go.Bar(x=x, y=y, marker_color=color, text=y, textposition='auto')


def _bar_pie_figure(counts, subplot_titles, column_widths, bar_color, pie_colors, title):
    """Build a side-by-side bar chart and pie chart from value counts."""
    fig = make_subplots(rows=1, cols=2,
                        subplot_titles=subplot_titles,
                        specs=[[{"type": "bar"}, {"type": "pie"}]],
                        column_widths=column_widths,
                        horizontal_spacing=0.1)
    fig.add_trace(_labelled_bar(counts.index, counts.values, bar_color), row=1, col=1)
    fig.add_trace(go.Pie(
        labels=counts.index,
        values=counts.values,
        marker=dict(colors=pie_colors),
        textinfo='percent+label'
    ), row=1, col=2)
    fig.update_layout(title_text=title)
    return fig


def _year_figure(values, column):
    """Build the per-year bar chart plus box plot used for year-like columns."""
    fig = make_subplots(rows=1, cols=2, subplot_titles=("Year Distribution", "Box Plot"),
                        specs=[[{"type": "bar"}, {"type": "box"}]],
                        column_widths=[0.7, 0.3], horizontal_spacing=0.1)
    year_counts = values.value_counts().sort_index()
    fig.add_trace(go.Bar(x=year_counts.index, y=year_counts.values, marker_color='#7B68EE'),
                  row=1, col=1)
    fig.add_trace(go.Box(x=values, marker_color='#7B68EE'), row=1, col=2)
    return fig


def _binary_figure(values, column):
    """Build the bar/pie pair for a two-valued column."""
    palette = ['#FF4B4B', '#4CAF50']
    return _bar_pie_figure(
        values.value_counts(),
        subplot_titles=("Distribution", "Percentage"),
        column_widths=[0.5, 0.5],
        bar_color=palette,
        pie_colors=palette,
        title=f"Binary Distribution: {column}",
    )


def _numeric_continuous_figure(values, column):
    """Build the histogram, box, violin and cumulative-distribution grid."""
    fig = make_subplots(rows=2, cols=2,
                        subplot_titles=("Distribution", "Box Plot", "Violin Plot",
                                        "Cumulative Distribution"),
                        specs=[[{"type": "histogram"}, {"type": "box"}],
                               [{"type": "violin"}, {"type": "scatter"}]],
                        vertical_spacing=0.15,
                        horizontal_spacing=0.1)

    fig.add_trace(go.Histogram(
        x=values,
        nbinsx=30,
        marker_color='#FF4B4B',
        opacity=0.7
    ), row=1, col=1)

    fig.add_trace(go.Box(
        x=values,
        marker_color='#FF4B4B',
        boxpoints='outliers'
    ), row=1, col=2)

    fig.add_trace(go.Violin(
        x=values,
        marker_color='#FF4B4B',
        box_visible=True,
        points='outliers'
    ), row=2, col=1)

    # Empirical CDF: sorted non-null values against their cumulative share.
    sorted_data = np.sort(values.dropna())
    cumulative = np.arange(1, len(sorted_data) + 1) / len(sorted_data)
    fig.add_trace(go.Scatter(
        x=sorted_data,
        y=cumulative,
        mode='lines',
        line=dict(color='#FF4B4B', width=2)
    ), row=2, col=2)

    fig.update_layout(height=600, title_text=f"Continuous Variable Analysis: {column}")
    return fig


def _numeric_discrete_figure(values, column):
    """Build the bar/pie pair for a low-cardinality numeric column."""
    return _bar_pie_figure(
        values.value_counts().sort_index(),
        subplot_titles=("Distribution", "Percentage"),
        column_widths=[0.7, 0.3],
        bar_color='#FF4B4B',
        pie_colors=px.colors.sequential.Reds,
        title=f"Discrete Numeric Distribution: {column}",
    )


def _categorical_figure(values, column):
    """Build the bar/pie pair for the 20 most frequent categories."""
    return _bar_pie_figure(
        values.value_counts().head(20),
        subplot_titles=("Category Distribution", "Percentage Breakdown"),
        column_widths=[0.6, 0.4],
        bar_color='#00FFA3',
        pie_colors=px.colors.sequential.Greens,
        title=f"Categorical Analysis: {column}",
    )


def _temporal_figure(values, column):
    """Build the month, year, cumulative and day-of-week grid for dates."""
    dates = pd.to_datetime(values, errors='coerce', format='mixed')
    valid_dates = dates[dates.notna()]

    fig = make_subplots(
        rows=2,
        cols=2,
        subplot_titles=("Monthly Pattern", "Yearly Pattern", "Cumulative Trend",
                        "Day of Week Distribution"),
        vertical_spacing=0.15,
        horizontal_spacing=0.1,
        specs=[[{"type": "bar"}, {"type": "bar"}],
               [{"type": "scatter"}, {"type": "bar"}]]
    )

    # With no parseable dates the grid is returned with empty panels.
    if not valid_dates.empty:
        monthly_counts = valid_dates.dt.month.value_counts().sort_index()
        month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                       'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
        # dt.month is 1-based, hence the offset into the name list.
        month_labels = [month_names[month - 1] for month in monthly_counts.index]
        fig.add_trace(_labelled_bar(month_labels, monthly_counts.values, '#7B68EE'),
                      row=1, col=1)

        yearly_counts = valid_dates.dt.year.value_counts().sort_index()
        fig.add_trace(_labelled_bar(yearly_counts.index, yearly_counts.values, '#7B68EE'),
                      row=1, col=2)

        sorted_dates = valid_dates.sort_values()
        cumulative = np.arange(1, len(sorted_dates) + 1)
        fig.add_trace(go.Scatter(
            x=sorted_dates,
            y=cumulative,
            mode='lines',
            line=dict(color='#7B68EE', width=2)
        ), row=2, col=1)

        dow_counts = valid_dates.dt.dayofweek.value_counts().sort_index()
        dow_names = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
        # dt.dayofweek is 0-based with Monday as 0.
        dow_labels = [dow_names[day] for day in dow_counts.index]
        fig.add_trace(_labelled_bar(dow_labels, dow_counts.values, '#7B68EE'),
                      row=2, col=2)

    fig.update_layout(height=600, title_text=f"Temporal Analysis: {column}")
    return fig


def _id_figure(values, column):
    """Build the ID-length histogram and the two-character prefix bar chart."""
    as_text = values.astype(str)
    id_lengths = as_text.str.len()
    id_prefixes = as_text.str[:2].value_counts().head(15)

    fig = make_subplots(
        rows=1,
        cols=2,
        subplot_titles=("ID Length Distribution", "Common ID Prefixes"),
        horizontal_spacing=0.1,
        specs=[[{"type": "histogram"}, {"type": "bar"}]]
    )
    fig.add_trace(go.Histogram(
        x=id_lengths,
        nbinsx=20,
        marker_color='#9C27B0'
    ), row=1, col=1)
    fig.add_trace(_labelled_bar(id_prefixes.index, id_prefixes.values, '#9C27B0'),
                  row=1, col=2)

    fig.update_layout(title_text=f"ID Analysis: {column}")
    return fig


def _text_figure(values, column):
    """Build the top-values bar chart and the text-length histogram."""
    value_counts = values.value_counts().head(15)
    text_lengths = values.astype(str).str.len()

    fig = make_subplots(
        rows=2,
        cols=1,
        subplot_titles=("Top Values", "Text Length Distribution"),
        vertical_spacing=0.2,
        specs=[[{"type": "bar"}], [{"type": "histogram"}]]
    )
    fig.add_trace(_labelled_bar(value_counts.index, value_counts.values, '#00B4D8'),
                  row=1, col=1)
    fig.add_trace(go.Histogram(
        x=text_lengths,
        nbinsx=20,
        marker_color='#00B4D8'
    ), row=2, col=1)

    fig.update_layout(height=600, title_text=f"Text Analysis: {column}")
    return fig


def _generic_figure(values, column):
    """Build the fallback histogram for unrecognised column types."""
    fig = go.Figure(go.Histogram(x=values, marker_color='#888'))
    fig.update_layout(title_text=f"Generic Analysis: {column}")
    return fig


# Maps each column type to the function that builds its figure.
_CHART_BUILDERS = {
    "BINARY": _binary_figure,
    "NUMERIC_CONTINUOUS": _numeric_continuous_figure,
    "NUMERIC_DISCRETE": _numeric_discrete_figure,
    "CATEGORICAL": _categorical_figure,
    "TEMPORAL": _temporal_figure,
    "ID": _id_figure,
    "TEXT": _text_figure,
}


@st.cache_data(show_spinner=False, ttl=1800, hash_funcs={
    pd.DataFrame: compute_df_hash,
    pd.Series: lambda s: hash((s.name, compute_df_hash(s.to_frame())))
})
def create_chart(df, column, column_type):
    """Build the Plotly figure for one column according to its type.

    Args:
        df: Source dataframe.
        column: Label of the column to plot.
        column_type: One of the type names in ``_CHART_BUILDERS``; any other
            value gets a plain histogram.

    Returns:
        A Plotly figure, or None when the column is missing, the sampled
        data is empty, or figure construction fails (the failure is logged).
    """
    if column not in df.columns:
        return None

    df_sample = get_subsampled_data(df, column)
    if df_sample.empty:
        return None

    try:
        values = df_sample[column]

        # Columns whose label mentions "year" get the year layout whatever
        # their detected type. Labels may be non-string, so normalise first.
        if "year" in str(column).lower():
            fig = _year_figure(values, column)
        else:
            builder = _CHART_BUILDERS.get(column_type, _generic_figure)
            fig = builder(values, column)

        # Shared styling. The 400px default applies only when the builder
        # did not choose its own height; forcing it unconditionally would
        # squash the two-row grids that ask for 600px.
        fig.update_layout(
            height=fig.layout.height or 400,
            showlegend=False,
            plot_bgcolor='rgba(0,0,0,0)',
            paper_bgcolor='rgba(0,0,0,0)',
            font=dict(color='#FFFFFF'),
            margin=dict(l=40, r=40, t=50, b=40)
        )

        return fig

    except Exception as e:
        log_frontend_error("Chart Generation", f"Error creating chart for {column}: {str(e)}")
        return None
| |
|
| |
|
| |
|
| |
|
def visualize_data(df):
    """Render the automated visualisation dashboard for ``df``.

    Shows a column multiselect, one chart plus a summary-statistics expander
    per selected column (two per row), and a correlation heatmap when a
    correlation matrix is available. Column types and the correlation matrix
    are kept in ``st.session_state`` and recomputed when the dataframe's
    fingerprint changes.

    Args:
        df: The dataframe to visualise; None or an empty frame shows an
            error message and returns.
    """
    if df is None or df.empty:
        st.error("No data available. Please upload and clean your data first.")
        return

    df_hash = compute_df_hash(df)

    # Seed the selection with the first four columns on first use.
    if "selected_viz_columns" not in st.session_state:
        st.session_state.selected_viz_columns = list(df.columns[:4])

    # Drop remembered selections that no longer exist in this dataframe.
    valid_columns = [col for col in st.session_state.selected_viz_columns if col in df.columns]

    def on_column_selection_change():
        st.session_state.selected_viz_columns = st.session_state.viz_column_selector
        st.session_state.current_tab_index = 2

    selected_columns = st.multiselect(
        "Select columns to visualize",
        options=df.columns,
        default=valid_columns,
        key="viz_column_selector",
        on_change=on_column_selection_change
    )

    # A missing "df_hash" entry reads as None and so never equals the
    # current fingerprint. "corr_matrix" is checked as well because it is
    # read unconditionally below.
    recompute_needed = (
        "column_types" not in st.session_state
        or "corr_matrix" not in st.session_state
        or st.session_state.get("df_hash") != df_hash
    )

    if recompute_needed:
        with st.spinner("Analyzing data structure..."):
            st.session_state.column_types = get_column_types(df)
            st.session_state.corr_matrix = get_corr_matrix(df)
            st.session_state.df_hash = df_hash
            st.session_state.current_tab_index = 2

            # The data changed, so previously stored test results are reset.
            if "test_results_calculated" in st.session_state:
                st.session_state.test_results_calculated = False

            for key in ['test_metrics', 'test_y_pred', 'test_y_test', 'test_cm', 'sampling_message']:
                if key in st.session_state:
                    del st.session_state[key]

    column_types = st.session_state.column_types
    corr_matrix = st.session_state.corr_matrix

    if not selected_columns:
        st.info("Please select columns to visualize")
        return

    with st.container():
        # Lay the charts out two per row.
        for row_start in range(0, len(selected_columns), 2):
            slots = st.columns(2)
            row_columns = selected_columns[row_start:row_start + 2]

            for slot, column in zip(slots, row_columns):
                with slot:
                    if column not in column_types:
                        st.warning(
                            f"Column '{column}' not found in the dataset or its type couldn't be determined."
                        )
                        continue

                    fig = create_chart(df, column, column_types[column])
                    if fig is None:
                        continue

                    # Labels may be non-string; normalise before building keys.
                    key_suffix = str(column).replace(' ', '_')
                    st.plotly_chart(fig, use_container_width=True, key=f"plot_{key_suffix}")
                    with st.expander(f"Summary Statistics - {column}", expanded=False):
                        if "NUMERIC" in column_types[column]:
                            st.dataframe(df[column].describe(), key=f"stats_{key_suffix}")
                        else:
                            st.dataframe(df[column].value_counts(), key=f"counts_{key_suffix}")

    if corr_matrix is not None:
        st.subheader("Correlation Analysis")
        fig = px.imshow(corr_matrix, title="Correlation Matrix", color_continuous_scale="RdBu")
        st.plotly_chart(fig, use_container_width=True, key="corr_matrix_plot")
| |
|