import gradio as gr
import pandas as pd
import numpy as np
import tempfile
import os
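# Analysis screen for the eval-results app: builds a hidden Gradio group with
# metric selection, a ground-truth column picker, per-judge result boxes, and
# a JSON download, then wires up the show/hide, calculate, and download events.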


def handle_analysis(df_state, model_selection_group, analyze_results_button):
    with gr.Group(visible=False) as analysis_group:
        gr.Markdown("## Analysis")

        accuracy_measurement_dropdown = gr.Dropdown(
            choices=['Accuracy', 'Pearson Correlation'],
            label='Select Evaluation Metric'
        )

        with gr.Row():
            ground_truth_dropdown = gr.Dropdown(
                choices=[],
                label='Select True Label Column'
            )

        with gr.Row():
            judge_a_result = gr.Textbox(
                label="Judge A Results",
                lines=10,
                interactive=False,
                visible=False
            )
            judge_b_result = gr.Textbox(
                label="Judge B Results",
                lines=10,
                interactive=False,
                visible=False
            )

        json_output = gr.File(label="Results .json", interactive=False, visible=False)

        with gr.Row():
            back_to_results_button = gr.Button("← Back to Results")
            calculate_button = gr.Button("Calculate")
            download_button = gr.Button("Download Results as JSON")

    def show_analysis_group(df):
        # `df` is the current value of df_state, passed in via `inputs`;
        # reading df_state.value here would only return the state's
        # initial value, not the per-session DataFrame.
        columns = df.columns.tolist() if df is not None else []
        return (
            gr.update(visible=True),
            gr.update(visible=False),
            gr.update(choices=columns),
        )

    analyze_results_button.click(
        fn=show_analysis_group,
        inputs=[df_state],
        outputs=[
            analysis_group,
            model_selection_group,
            ground_truth_dropdown
        ]
    )

    def back_to_results():
        return (
            gr.update(visible=False),
            gr.update(visible=True),
        )

    back_to_results_button.click(
        fn=back_to_results,
        inputs=[],
        outputs=[analysis_group, model_selection_group]
    )

    def calculate_multiple_accuracies(measurement, ground_truth_col, df):
        # Gradio unwraps gr.State inputs, so `df` is the DataFrame itself.
        # Check for it before touching df.columns.
        if df is None:
            msg = "No DataFrame available."
            return (
                gr.update(value=msg, visible=True),
                gr.update(value=msg, visible=True)
            )

        # Judge A is always the Selene score column; Judge B is the first
        # other 'score_*' column found in the results.
        col2_name = "score_selene"
        score_columns = [col for col in df.columns if col.startswith('score_') and col != 'score_selene']
        col3_name = score_columns[0] if score_columns else None
        if col3_name is None:
            msg = "No second judge column ('score_*') found in DataFrame."
            return (
                gr.update(value=msg, visible=True),
                gr.update(value=msg, visible=True)
            )

        missing_columns = [col for col in [ground_truth_col, col2_name, col3_name] if col not in df.columns]
        if missing_columns:
            msg = f"Selected columns not found in DataFrame: {', '.join(missing_columns)}."
            return (
                gr.update(value=msg, visible=True),
                gr.update(value=msg, visible=True)
            )

        result1 = calculate_accuracy(
            measurement, ground_truth_col, col2_name,
            df, compare_to_ground_truth=True
        )
        text_a = f"Comparison: '{ground_truth_col}' vs. 'Selene'\n{result1}"

        result2 = calculate_accuracy(
            measurement, ground_truth_col, col3_name,
            df, compare_to_ground_truth=True
        )
        model_name = col3_name.replace('score_', '').replace('_', ' ').title()
        text_b = f"Comparison: '{ground_truth_col}' vs. '{model_name}'\n{result2}"

        return (
            gr.update(value=text_a, visible=True),
            gr.update(value=text_b, visible=True)
        )

    calculate_button.click(
        fn=calculate_multiple_accuracies,
        inputs=[
            accuracy_measurement_dropdown,
            ground_truth_dropdown,
            df_state
        ],
        outputs=[judge_a_result, judge_b_result]
    )

    def create_json_download(df):
        # `df` is the unwrapped value of df_state.
        if df is None:
            return gr.update(value=None, visible=True)

        json_str = df.to_json(orient='records', indent=2)
        temp_dir = tempfile.gettempdir()
        file_path = os.path.join(temp_dir, 'atla_custom_eval_results.json')
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(json_str)
        return gr.update(value=file_path, visible=True)

    download_button.click(
        fn=create_json_download,
        inputs=[df_state],
        outputs=[json_output]
    )


def calculate_accuracy(measurement, col1, col2, df, compare_to_ground_truth=False):
    if df is None:
        return "No DataFrame available."
    if col1 not in df.columns or col2 not in df.columns:
        return "Selected columns not found in DataFrame."

    # Copy the two selected columns into a scratch frame under canonical names.
    results_df = pd.DataFrame()
    if compare_to_ground_truth:
        results_df['ground_truth'] = df[col1]
        results_df['predicted'] = df[col2]
    else:
        results_df['extracted_winner'] = df[col1]
        results_df['truth_result'] = df[col2]

    if measurement == 'Accuracy':
        result = process_pairwise_accuracy(results_df, compare_to_ground_truth)
        output_text = (
            f"Overall Accuracy: {result['overall_accuracy']}\n"
            f"Number of NaNs: {result['num_extracted_nan']}"
        )
    elif measurement == 'Pearson Correlation':
        result = process_single_rating_pearson_correlation(results_df, compare_to_ground_truth)
        output_text = (
            f"Pearson Correlation: {result['overall_pearson_correlation']}\n"
            f"Number of NaNs: {result['num_extracted_nan']}"
        )
    else:
        output_text = "Unknown measurement selected."

    return output_text


def process_pairwise_accuracy(results_df: pd.DataFrame, compare_to_ground_truth=False) -> dict:
    if compare_to_ground_truth:
        # Normalize both columns to floats so that e.g. '4' and 4.0 compare equal.
        results_df['ground_truth'] = results_df['ground_truth'].apply(convert_to_float_or_nan)
        results_df['predicted'] = results_df['predicted'].apply(convert_to_float_or_nan)

        # NaN == NaN is False, so unparseable predictions count as incorrect.
        results_df['results'] = results_df['ground_truth'] == results_df['predicted']
        num_extracted_nan = int(results_df['predicted'].isna().sum())
    else:
        results_df['results'] = results_df['extracted_winner'] == results_df['truth_result']
        num_extracted_nan = int(results_df['extracted_winner'].isna().sum())

    overall_accuracy = results_df['results'].mean()

    return {
        "overall_accuracy": overall_accuracy,
        "num_extracted_nan": num_extracted_nan,
    }


def process_single_rating_pearson_correlation(
    results_df: pd.DataFrame, compare_to_ground_truth=False
) -> dict:
    if compare_to_ground_truth:
        pred_col = 'predicted'
        truth_col = 'ground_truth'
    else:
        pred_col = 'extracted_winner'
        truth_col = 'truth_result'

    results_df[pred_col] = results_df[pred_col].apply(convert_to_float_or_nan)
    results_df[truth_col] = results_df[truth_col].apply(convert_to_float_or_nan)

    # Correlate only the rows where both sides parsed to a number.
    numerical_results = results_df.dropna(subset=[pred_col, truth_col])

    if len(numerical_results) == 0:
        pearson_corr = np.nan
    else:
        pearson_corr = numerical_results[pred_col].corr(numerical_results[truth_col])

    num_extracted_nan = int(results_df[pred_col].isna().sum())

    return {
        "overall_pearson_correlation": pearson_corr if not pd.isna(pearson_corr) else 0.0,
        "num_extracted_nan": num_extracted_nan,
    }
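
# Quick sanity check of the two metric helpers (illustrative, not part of the
# original app):
#
#   df = pd.DataFrame({'ground_truth': [1, 2, 3], 'predicted': [1, 2, 4]})
#   process_pairwise_accuracy(df, compare_to_ground_truth=True)
#   # -> {'overall_accuracy': 0.666..., 'num_extracted_nan': 0}
#   process_single_rating_pearson_correlation(df, compare_to_ground_truth=True)
#   # -> {'overall_pearson_correlation': 0.98..., 'num_extracted_nan': 0}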


def convert_to_float_or_nan(extracted_input):
    # Treat None/NaN as missing; anything that can't be parsed as a float
    # (e.g. free-text judge output) also becomes NaN.
    if extracted_input is None or pd.isna(extracted_input):
        return np.nan
    try:
        return float(extracted_input)
    except (ValueError, TypeError):
        return np.nan
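

# Minimal wiring sketch (an assumption, not part of the original file): one
# way handle_analysis could be hooked into a gr.Blocks app. The sample
# DataFrame, component names, and layout here are illustrative only.
if __name__ == "__main__":
    with gr.Blocks() as demo:
        sample_df = pd.DataFrame({
            'ground_truth': [1, 2, 3],
            'score_selene': [1, 2, 4],
            'score_gpt_4o': [1, 3, 3],  # hypothetical second judge column
        })
        df_state = gr.State(sample_df)

        with gr.Group() as model_selection_group:
            gr.Markdown("## Results")
            analyze_results_button = gr.Button("Analyze Results")

        handle_analysis(df_state, model_selection_group, analyze_results_button)

    demo.launch()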