| | import re |
| | import pandas as pd |
| | import os |
| | |
| | from mistralai import Mistral |
| |
|
class DataProcessor:
    """Clean intervention session logs and derive attendance/engagement metrics.

    The processor works on pandas DataFrames in which one column records
    whether the intervention was held on a given day (see
    ``INTERVENTION_COLUMN_OPTIONS``) and every ``Student Attendance [...]``
    column records one student's response for that day.
    """

    # Accepted headers for the "was the session held?" column, in lookup order.
    INTERVENTION_COLUMN_OPTIONS = [
        'Did the intervention happen today?',
        'Did the intervention take place today?'
    ]
    # Normalised (stripped, lower-cased) answers that count as "session held".
    YES_RESPONSES = ['yes', 'assessment day']
    ENGAGED_STR = 'Engaged'
    PARTIALLY_ENGAGED_STR = 'Partially Engaged'
    NOT_ENGAGED_STR = 'Not Engaged'

    # Private helpers shared by several methods.
    _ABSENT_STR = 'Absent'
    _ATTENDANCE_PREFIX = 'Student Attendance'
    _CLOSED_NAME_RE = re.compile(r'\[(.+?)\]$')
    _UNCLOSED_NAME_RE = re.compile(r'\[(.+)$')

    def __init__(self, student_metrics_df=None):
        """Create a processor and its Mistral client.

        Args:
            student_metrics_df: Optional precomputed metrics frame to keep on
                the instance.

        Raises:
            ValueError: If the ``MISTRAL_API_KEY`` environment variable is
                missing or empty.
        """
        self.mistral_api_key = os.getenv('MISTRAL_API_KEY')
        if not self.mistral_api_key:
            raise ValueError("MISTRAL_API_KEY not set in environment variables")
        self.client = Mistral(api_key=self.mistral_api_key)
        self.student_metrics_df = student_metrics_df
        self.intervention_column = None

    def read_excel(self, uploaded_file):
        """Return the contents of ``uploaded_file`` as read by ``pd.read_excel``."""
        return pd.read_excel(uploaded_file)

    def format_session_data(self, df):
        """Normalise the date and time columns of a session log.

        The frame is modified in place and also returned. The first column
        named ``Date of Session`` or ``Date`` is reduced to plain dates; a
        warning is printed when neither exists. ``Timestamp``,
        ``Session Start Time`` and ``Session End Time`` must be present and
        are rewritten as ``%I:%M %p`` strings.
        """
        date_column = next(
            (col for col in df.columns if col in ["Date of Session", "Date"]),
            None,
        )
        if date_column:
            df[date_column] = pd.to_datetime(df[date_column], errors='coerce').dt.date
        else:
            print("Warning: Neither 'Date of Session' nor 'Date' column found in the dataframe.")

        df['Timestamp'] = self.safe_convert_to_datetime(df['Timestamp'], '%I:%M %p')
        df['Session Start Time'] = self.safe_convert_to_time(df['Session Start Time'], '%I:%M %p')
        df['Session End Time'] = self.safe_convert_to_time(df['Session End Time'], '%I:%M %p')
        return df

    def safe_convert_to_time(self, series, format_str='%I:%M %p'):
        """Parse ``HH:MM:SS`` values and optionally re-format them as strings.

        Values that cannot be parsed are coerced to missing. With a falsy
        ``format_str`` the parsed datetimes are returned as-is. This is a
        best-effort conversion: on any error the message is printed and the
        input is returned unchanged.
        """
        try:
            converted = pd.to_datetime(series, format='%H:%M:%S', errors='coerce')
            if format_str:
                return converted.dt.strftime(format_str)
            return converted
        except Exception as e:
            print(f"Error converting series to time: {e}")
            return series

    def safe_convert_to_datetime(self, series, format_str=None):
        """Parse values as datetimes and optionally re-format them as strings.

        Values that cannot be parsed are coerced to missing. This is a
        best-effort conversion: on any error the message is printed and the
        input is returned unchanged.
        """
        try:
            converted = pd.to_datetime(series, errors='coerce')
            if format_str:
                return converted.dt.strftime(format_str)
            return converted
        except Exception as e:
            print(f"Error converting series to datetime: {e}")
            return series

    def replace_student_names_with_initials(self, df):
        """Rename ``Student Attendance [Full Name]`` columns to use initials.

        The name is taken from the trailing ``[...]``; a missing closing
        bracket is tolerated. Columns without a bracketed name, and all other
        columns, keep their labels. When two students share the same initials
        the later ones get a numeric suffix (``JD``, ``JD2``, ...) so the
        resulting column labels stay unique. The frame is modified in place
        and also returned.
        """
        used_labels = set()
        updated_columns = []
        for col in df.columns:
            # Non-string labels (e.g. integers) cannot be attendance columns.
            if not isinstance(col, str) or self._ATTENDANCE_PREFIX not in col:
                updated_columns.append(col)
                continue

            match = self._CLOSED_NAME_RE.search(col) or self._UNCLOSED_NAME_RE.search(col)
            if not match:
                updated_columns.append(col)
                continue

            name = match.group(1).strip().rstrip(']')
            initials = ''.join(part[0] for part in name.split())

            # Duplicate column labels would make per-student selection return
            # several columns at once, so keep every label distinct.
            label = initials
            suffix = 2
            while label in used_labels:
                label = f'{initials}{suffix}'
                suffix += 1
            used_labels.add(label)

            updated_columns.append(f'Student Attendance [{label}]')
        df.columns = updated_columns
        return df

    def find_intervention_column(self, df):
        """Locate the intervention column in ``df`` and remember it.

        Returns:
            The first entry of ``INTERVENTION_COLUMN_OPTIONS`` present in
            ``df.columns``.

        Raises:
            ValueError: If none of the accepted headers is present.
        """
        for column in self.INTERVENTION_COLUMN_OPTIONS:
            if column in df.columns:
                self.intervention_column = column
                return column
        raise ValueError("No intervention column found in the dataframe.")

    def get_intervention_column(self, df):
        """Return the intervention column for ``df``, using the cached name when valid.

        The cached name is re-detected when it is not a column of ``df``, so
        frames that use different accepted headers can be processed by the
        same instance.

        Raises:
            ValueError: If ``df`` has none of the accepted headers.
        """
        cached = self.intervention_column
        if cached is None or cached not in df.columns:
            self.intervention_column = self.find_intervention_column(df)
        return self.intervention_column

    def _sessions_held_mask(self, df):
        """Return a boolean Series marking the rows where the session was held."""
        answers = df[self.get_intervention_column(df)]
        # astype(str) keeps this working for columns that are not string-typed
        # (for example an entirely empty column).
        return answers.astype(str).str.strip().str.lower().isin(self.YES_RESPONSES)

    @staticmethod
    def _percentage(part, whole):
        """Return ``part / whole`` as a rounded percentage, or 0 when ``whole`` is 0."""
        return round(part / whole * 100) if whole > 0 else 0

    def compute_intervention_statistics(self, df):
        """Summarise how often the intervention was held.

        Returns:
            A one-row DataFrame with the dosage percentage, the number of
            sessions held and not held, and the total number of rows.
        """
        total_days = len(df)
        sessions_held = self._sessions_held_mask(df).sum()
        intervention_frequency = (sessions_held / total_days) * 100 if total_days > 0 else 0
        return pd.DataFrame({
            'Intervention Dosage (%)': [round(intervention_frequency, 0)],
            'Intervention Sessions Held': [sessions_held],
            'Intervention Sessions Not Held': [total_days - sessions_held],
            'Total Number of Days Available': [total_days]
        })

    def classify_engagement(self, engagement_str):
        """Map a raw attendance response to an engagement label.

        Matching is case-insensitive, ignores surrounding whitespace and only
        looks at the start of the text, so trailing detail after the label is
        accepted.

        Returns:
            ``ENGAGED_STR``, ``PARTIALLY_ENGAGED_STR``, ``NOT_ENGAGED_STR`` or
            ``'Unknown'`` for anything else (including absences).
        """
        text = str(engagement_str).strip().lower()
        for label in (self.ENGAGED_STR, self.PARTIALLY_ENGAGED_STR, self.NOT_ENGAGED_STR):
            if text.startswith(label.lower()):
                return label
        return 'Unknown'

    def _metric_columns(self):
        """Return the metric column names in output order (without ``Student``)."""
        return [
            'Attended ≥ 90%',
            'Engagement ≥ 80%',
            'Attendance (%)',
            f'{self.ENGAGED_STR} (%)',
            f'{self.PARTIALLY_ENGAGED_STR} (%)',
            f'{self.NOT_ENGAGED_STR} (%)',
            'Absent (%)',
        ]

    def _summarize_student(self, responses, sessions_held):
        """Build one student's metrics from their responses on held sessions."""
        responses = responses.fillna(self._ABSENT_STR)
        labels = responses.apply(self.classify_engagement)

        counts = {
            self.ENGAGED_STR: 0,
            self.PARTIALLY_ENGAGED_STR: 0,
            self.NOT_ENGAGED_STR: 0,
        }
        for label in labels:
            if label in counts:
                counts[label] += 1

        # A session counts as attended when it received any engagement label.
        present = sum(counts.values())
        attendance_pct = self._percentage(present, sessions_held)
        engagement_pct = self._percentage(
            counts[self.ENGAGED_STR] + counts[self.PARTIALLY_ENGAGED_STR], present
        )

        # Absences are counted on the raw responses: classify_engagement
        # reports them as 'Unknown', so the label column never holds 'Absent'.
        absent_mask = (
            responses.astype(str).str.strip().str.lower()
            .str.startswith(self._ABSENT_STR.lower())
        )
        absent_pct = self._percentage(int(absent_mask.sum()), sessions_held)

        return {
            'Attended ≥ 90%': "Yes" if attendance_pct >= 90 else "No",
            'Engagement ≥ 80%': "Yes" if engagement_pct >= 80 else "No",
            'Attendance (%)': attendance_pct,
            f'{self.ENGAGED_STR} (%)': self._percentage(counts[self.ENGAGED_STR], present),
            f'{self.PARTIALLY_ENGAGED_STR} (%)': self._percentage(
                counts[self.PARTIALLY_ENGAGED_STR], present
            ),
            f'{self.NOT_ENGAGED_STR} (%)': self._percentage(counts[self.NOT_ENGAGED_STR], present),
            'Absent (%)': absent_pct,
        }

    def compute_student_metrics(self, df):
        """Compute attendance and engagement percentages for every active student.

        Only rows where the session was held are considered. A student whose
        column has no response on any held session is treated as inactive and
        omitted. Missing responses of active students count as absences.
        Attendance and absence are relative to the sessions held; the
        engagement breakdown is relative to the sessions the student attended.

        Returns:
            A DataFrame with one row per active student and the columns
            ``Student``, ``Attended ≥ 90%``, ``Engagement ≥ 80%``,
            ``Attendance (%)``, ``Engaged (%)``, ``Partially Engaged (%)``,
            ``Not Engaged (%)`` and ``Absent (%)``. It is empty, with the same
            columns, when there is no active student.
        """
        held_df = df[self._sessions_held_mask(df)]
        sessions_held = len(held_df)
        student_columns = [
            col for col in df.columns
            if isinstance(col, str) and col.startswith(self._ATTENDANCE_PREFIX)
        ]

        student_metrics = {}
        for col in student_columns:
            responses = held_df[col]
            if not responses.notna().any():
                continue  # no response on any held session: inactive student
            student_name = col.replace('Student Attendance [', '').replace(']', '').strip()
            student_metrics[student_name] = self._summarize_student(responses, sessions_held)

        if not student_metrics:
            return pd.DataFrame(columns=['Student', *self._metric_columns()])

        student_metrics_df = pd.DataFrame.from_dict(student_metrics, orient='index').reset_index()
        student_metrics_df.rename(columns={'index': 'Student'}, inplace=True)
        return student_metrics_df

    def compute_average_metrics(self, student_metrics_df):
        """Return the rounded mean attendance and mean engagement across students.

        Rows containing missing values are ignored. Engagement is the sum of
        the ``Engaged (%)`` and ``Partially Engaged (%)`` columns.

        Returns:
            ``(attendance_avg, engagement_avg)``; ``(0, 0)`` when no complete
            row is available.
        """
        active_students_df = student_metrics_df.dropna()
        if active_students_df.empty:
            # The mean of nothing is NaN, which round() cannot handle.
            return 0, 0

        attendance_avg_stats = active_students_df['Attendance (%)'].mean()
        total_engagement = (
            active_students_df[f'{self.ENGAGED_STR} (%)']
            + active_students_df[f'{self.PARTIALLY_ENGAGED_STR} (%)']
        )
        engagement_avg_stats = total_engagement.mean()

        return round(attendance_avg_stats), round(engagement_avg_stats)

    def evaluate_student(self, row, attendance_threshold=90, engagement_threshold=80):
        """Return the recommended follow-up for one student metrics row.

        The decision is based on the precomputed ``Attended ≥ 90%`` and
        ``Engagement ≥ 80%`` flags of ``row``; attendance is checked first.
        ``attendance_threshold`` and ``engagement_threshold`` are accepted for
        interface compatibility but are not used by the decision.
        """
        if row["Attended ≥ 90%"] == "No":
            return "Address Attendance"
        if row["Engagement ≥ 80%"] == "No":
            return "Address Engagement"
        return "Consider barriers, fidelity, and progress monitoring"