import gradio as gr
import pandas as pd
import numpy as np
import tempfile
import os
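# Analysis screen for the eval-results app: builds a hidden Gradio group with
# metric selection, a ground-truth column picker, per-judge result boxes, and
# a JSON download, then wires up the show/hide, calculate, and download events.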


def handle_analysis(df_state, model_selection_group, analyze_results_button):
    with gr.Group(visible=False) as analysis_group:
        gr.Markdown("## Analysis")

        accuracy_measurement_dropdown = gr.Dropdown(
            choices=['Accuracy', 'Pearson Correlation'],
            label='Select Evaluation Metric'
        )

        with gr.Row():
            ground_truth_dropdown = gr.Dropdown(
                choices=[],
                label='Select True Label Column'
            )

        with gr.Row():
            judge_a_result = gr.Textbox(
                label="Judge A Results",
                lines=10,
                interactive=False,
                visible=False
            )
            judge_b_result = gr.Textbox(
                label="Judge B Results",
                lines=10,
                interactive=False,
                visible=False
            )

        json_output = gr.File(label="Results .json", interactive=False, visible=False)

        with gr.Row():
            back_to_results_button = gr.Button("← Back to Results")
            calculate_button = gr.Button("Calculate")
            download_button = gr.Button("Download Results as JSON")

    def show_analysis_group(df):
        # `df` is the current value of df_state, passed in via `inputs`;
        # reading df_state.value here would only return the state's
        # initial value, not the per-session DataFrame.
        columns = df.columns.tolist() if df is not None else []
        return (
            gr.update(visible=True),
            gr.update(visible=False),
            gr.update(choices=columns),
        )

    analyze_results_button.click(
        fn=show_analysis_group,
        inputs=[df_state],
        outputs=[
            analysis_group,
            model_selection_group,
            ground_truth_dropdown
        ]
    )

    def back_to_results():
        return (
            gr.update(visible=False),
            gr.update(visible=True),
        )

    back_to_results_button.click(
        fn=back_to_results,
        inputs=[],
        outputs=[analysis_group, model_selection_group]
    )

    def calculate_multiple_accuracies(measurement, ground_truth_col, df):
        # Gradio unwraps gr.State inputs, so `df` is the DataFrame itself.
        # Check for it before touching df.columns.
        if df is None:
            msg = "No DataFrame available."
            return (
                gr.update(value=msg, visible=True),
                gr.update(value=msg, visible=True)
            )

        # Judge A is always the Selene score column; Judge B is the first
        # other 'score_*' column found in the results.
        col2_name = "score_selene"
        score_columns = [col for col in df.columns if col.startswith('score_') and col != 'score_selene']
        col3_name = score_columns[0] if score_columns else None
        if col3_name is None:
            msg = "No second judge column ('score_*') found in DataFrame."
            return (
                gr.update(value=msg, visible=True),
                gr.update(value=msg, visible=True)
            )

        missing_columns = [col for col in [ground_truth_col, col2_name, col3_name] if col not in df.columns]
        if missing_columns:
            msg = f"Selected columns not found in DataFrame: {', '.join(missing_columns)}."
            return (
                gr.update(value=msg, visible=True),
                gr.update(value=msg, visible=True)
            )

        result1 = calculate_accuracy(
            measurement, ground_truth_col, col2_name,
            df, compare_to_ground_truth=True
        )
        text_a = f"Comparison: '{ground_truth_col}' vs. 'Selene'\n{result1}"

        result2 = calculate_accuracy(
            measurement, ground_truth_col, col3_name,
            df, compare_to_ground_truth=True
        )
        model_name = col3_name.replace('score_', '').replace('_', ' ').title()
        text_b = f"Comparison: '{ground_truth_col}' vs. '{model_name}'\n{result2}"

        return (
            gr.update(value=text_a, visible=True),
            gr.update(value=text_b, visible=True)
        )

    calculate_button.click(
        fn=calculate_multiple_accuracies,
        inputs=[
            accuracy_measurement_dropdown,
            ground_truth_dropdown,
            df_state
        ],
        outputs=[judge_a_result, judge_b_result]
    )

    def create_json_download(df):
        # `df` is the unwrapped value of df_state.
        if df is None:
            return gr.update(value=None, visible=True)

        json_str = df.to_json(orient='records', indent=2)
        temp_dir = tempfile.gettempdir()
        file_path = os.path.join(temp_dir, 'atla_custom_eval_results.json')
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(json_str)
        return gr.update(value=file_path, visible=True)

    download_button.click(
        fn=create_json_download,
        inputs=[df_state],
        outputs=[json_output]
    )


def calculate_accuracy(measurement, col1, col2, df, compare_to_ground_truth=False):
    if df is None:
        return "No DataFrame available."
    if col1 not in df.columns or col2 not in df.columns:
        return "Selected columns not found in DataFrame."

    # Copy the two selected columns into a scratch frame under canonical names.
    results_df = pd.DataFrame()
    if compare_to_ground_truth:
        results_df['ground_truth'] = df[col1]
        results_df['predicted'] = df[col2]
    else:
        results_df['extracted_winner'] = df[col1]
        results_df['truth_result'] = df[col2]

    if measurement == 'Accuracy':
        result = process_pairwise_accuracy(results_df, compare_to_ground_truth)
        output_text = (
            f"Overall Accuracy: {result['overall_accuracy']}\n"
            f"Number of NaNs: {result['num_extracted_nan']}"
        )
    elif measurement == 'Pearson Correlation':
        result = process_single_rating_pearson_correlation(results_df, compare_to_ground_truth)
        output_text = (
            f"Pearson Correlation: {result['overall_pearson_correlation']}\n"
            f"Number of NaNs: {result['num_extracted_nan']}"
        )
    else:
        output_text = "Unknown measurement selected."

    return output_text


def process_pairwise_accuracy(results_df: pd.DataFrame, compare_to_ground_truth=False) -> dict:
    if compare_to_ground_truth:
        # Normalize both columns to floats so that e.g. '4' and 4.0 compare equal.
        results_df['ground_truth'] = results_df['ground_truth'].apply(convert_to_float_or_nan)
        results_df['predicted'] = results_df['predicted'].apply(convert_to_float_or_nan)

        # NaN == NaN is False, so unparseable predictions count as incorrect.
        results_df['results'] = results_df['ground_truth'] == results_df['predicted']
        num_extracted_nan = int(results_df['predicted'].isna().sum())
    else:
        results_df['results'] = results_df['extracted_winner'] == results_df['truth_result']
        num_extracted_nan = int(results_df['extracted_winner'].isna().sum())

    overall_accuracy = results_df['results'].mean()

    return {
        "overall_accuracy": overall_accuracy,
        "num_extracted_nan": num_extracted_nan,
    }


def process_single_rating_pearson_correlation(
    results_df: pd.DataFrame, compare_to_ground_truth=False
) -> dict:
    if compare_to_ground_truth:
        pred_col = 'predicted'
        truth_col = 'ground_truth'
    else:
        pred_col = 'extracted_winner'
        truth_col = 'truth_result'

    results_df[pred_col] = results_df[pred_col].apply(convert_to_float_or_nan)
    results_df[truth_col] = results_df[truth_col].apply(convert_to_float_or_nan)

    # Correlate only the rows where both sides parsed to a number.
    numerical_results = results_df.dropna(subset=[pred_col, truth_col])

    if len(numerical_results) == 0:
        pearson_corr = np.nan
    else:
        pearson_corr = numerical_results[pred_col].corr(numerical_results[truth_col])

    num_extracted_nan = int(results_df[pred_col].isna().sum())

    return {
        "overall_pearson_correlation": pearson_corr if not pd.isna(pearson_corr) else 0.0,
        "num_extracted_nan": num_extracted_nan,
    }
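
# Quick sanity check of the two metric helpers (illustrative, not part of the
# original app):
#
#   df = pd.DataFrame({'ground_truth': [1, 2, 3], 'predicted': [1, 2, 4]})
#   process_pairwise_accuracy(df, compare_to_ground_truth=True)
#   # -> {'overall_accuracy': 0.666..., 'num_extracted_nan': 0}
#   process_single_rating_pearson_correlation(df, compare_to_ground_truth=True)
#   # -> {'overall_pearson_correlation': 0.98..., 'num_extracted_nan': 0}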


def convert_to_float_or_nan(extracted_input):
    # Treat None/NaN as missing; anything that can't be parsed as a float
    # (e.g. free-text judge output) also becomes NaN.
    if extracted_input is None or pd.isna(extracted_input):
        return np.nan
    try:
        return float(extracted_input)
    except (ValueError, TypeError):
        return np.nan
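

# Minimal wiring sketch (an assumption, not part of the original file): one
# way handle_analysis could be hooked into a gr.Blocks app. The sample
# DataFrame, component names, and layout here are illustrative only.
if __name__ == "__main__":
    with gr.Blocks() as demo:
        sample_df = pd.DataFrame({
            'ground_truth': [1, 2, 3],
            'score_selene': [1, 2, 4],
            'score_gpt_4o': [1, 3, 3],  # hypothetical second judge column
        })
        df_state = gr.State(sample_df)

        with gr.Group() as model_selection_group:
            gr.Markdown("## Results")
            analyze_results_button = gr.Button("Analyze Results")

        handle_analysis(df_state, model_selection_group, analyze_results_button)

    demo.launch()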