| import os |
| import re |
| import math |
| import pandas as pd |
| import numpy as np |
| import matplotlib.pyplot as plt |
| import matplotlib.ticker as mtick |
| import seaborn as sns |
| import nltk |
| import evaluate |
| import traceback |
|
|
# Metric objects from the `evaluate` package, created once at import time.
# `load_ms_marco_result` further down calls `.compute(...)` on both of them.
bert_score, meteor = (
    evaluate.load(metric_id) for metric_id in ("bertscore", "meteor")
)


print(f"loading: {__file__}")
|
|
| |
| |
|
|
| |
# A run of five or more consecutive whitespace / non-word characters.
pattern_non_word_char_repetition = re.compile(r"[\s\W]{5,}")

# A chunk of at least five characters (named group "repeat") that is followed,
# one or more times, by a copy of itself; whitespace / non-word characters may
# sit between the copies.  Matching ignores case and lets "." span newlines.
_TEXT_REPETITION_FLAGS = re.MULTILINE | re.DOTALL | re.IGNORECASE
pattern_text_repetitions = re.compile(
    r"(?P<repeat>.{5}.*?)(?:[\s\W]*(?P=repeat))+",
    _TEXT_REPETITION_FLAGS,
)
| |
| |
| |
| |
| |
| |
|
|
|
|
def del_non_word_char_repetition(text, debug=False):
    """Collapse long runs of whitespace / non-word characters into one tab.

    Args:
        text: Value to clean. Anything that is not a ``str`` is returned as is.
        debug: When true, print progress information.

    Returns:
        A ``(text, count)`` tuple where ``text`` is the cleaned value and
        ``count`` is how many characters shorter it is than the input
        (``0`` for non-string input).
    """
    if not isinstance(text, str):
        return text, 0

    if debug:
        print("----detect non-word characters repetition----")

    collapsed = pattern_non_word_char_repetition.sub("\t", text)
    removed = len(text) - len(collapsed)

    if debug and removed:
        print(f"removed non-word characters repetition: {removed}")
    return collapsed, removed
|
|
|
|
| |
def detect_text_repetitions(text, debug=False):
    """Count the characters of ``text`` that belong to repeated chunks.

    For every match of ``pattern_text_repetitions`` the length of the whole
    match minus the length of its first group (the chunk that is repeated)
    is added to the total, i.e. the first occurrence is not counted.

    Args:
        text: Value to scan. Anything that is not a ``str`` yields ``0``.
        debug: When true, print every match and its groups.

    Returns:
        The accumulated number of repeated characters.
    """
    if not isinstance(text, str):
        return 0

    if debug:
        print("----detect text repetitions----")

    repeated_chars = 0
    for found in pattern_text_repetitions.finditer(text):
        if debug:
            print(found)
            # Regex groups are numbered from 1.
            for group_no in range(1, len(found.groups()) + 1):
                print(
                    "Group {groupNum} found at {start}-{end}: `{group}`".format(
                        groupNum=group_no,
                        start=found.start(group_no),
                        end=found.end(group_no),
                        group=found.group(group_no),
                    )
                )

        repeated_chars += len(found.group(0)) - len(found.group(1))

    return repeated_chars
|
|
|
|
def detect_repetitions(text, debug=False):
    """Measure both kinds of repetition in ``text``.

    The non-word-character runs are collapsed first and the text-repetition
    scan then runs on the collapsed text.

    Args:
        text: Value to analyse. Non-string input yields ``(0, 0, 0)``.
        debug: Forwarded to the two helpers; also prints the result tuple.

    Returns:
        ``(non_word_char_count, text_repetition_count, total)``.
    """
    if not isinstance(text, str):
        return 0, 0, 0

    cleaned, non_word_count = del_non_word_char_repetition(text, debug=debug)
    text_count = detect_text_repetitions(cleaned, debug=debug)

    scores = (non_word_count, text_count, non_word_count + text_count)
    if debug:
        print(scores)
    return scores
|
|
|
|
def detect_scores(
    row, debug=False, answer_col="answer", ground_truth_col="ground_truth"
):
    """Compute repetition scores for one row, net of the ground truth's own.

    Args:
        row: Mapping-like row (e.g. a data frame row) indexed by column name.
        debug: Forwarded to ``detect_repetitions``.
        answer_col: Column holding the answer to score.
        ground_truth_col: Column holding the reference text. When truthy, the
            reference's scores are subtracted from the answer's (clamped at
            zero) and the total is recomputed from the clamped values.

    Returns:
        ``pd.Series([newline_score, repetition_score, total_repetitions])``.
    """
    newline_score, repetition_score, total_repetitions = detect_repetitions(
        row[answer_col], debug=debug
    )

    if ground_truth_col:
        reference_newline, reference_repetition, _ = detect_repetitions(
            row[ground_truth_col], debug=debug
        )
        # Repetition that is already present in the reference is not penalised.
        newline_score = max(newline_score - reference_newline, 0)
        repetition_score = max(repetition_score - reference_repetition, 0)
        total_repetitions = newline_score + repetition_score

    return pd.Series([newline_score, repetition_score, total_repetitions])
|
|
|
|
def load_with_newline_and_repetition_scores(result_file, force_recalculate=False):
    """Load a result CSV and make sure its repetition columns are present.

    If any of ``newline_score``, ``repetition_score``, ``total_repetitions``,
    ``nrr`` or ``rr`` is missing (or ``force_recalculate`` is true) the missing
    pieces are computed, ``answer_len`` is refreshed, and the file is
    overwritten in place. Otherwise the frame is returned untouched.

    Args:
        result_file: Path of the CSV file; lines starting with ``#`` are
            skipped and malformed lines only produce a warning.
        force_recalculate: Recompute every derived column even if present.

    Returns:
        The loaded (and possibly augmented) data frame.
    """
    print(f"loading result file: {result_file}")
    df = pd.read_csv(result_file, comment="#", on_bad_lines="warn")

    score_columns = ["newline_score", "repetition_score", "total_repetitions"]
    scores_missing = any(column not in df.columns for column in score_columns)
    rates_missing = any(column not in df.columns for column in ("nrr", "rr"))

    if not (force_recalculate or scores_missing or rates_missing):
        return df

    if force_recalculate or scores_missing:
        df[score_columns] = df.apply(detect_scores, axis=1)

    df["answer_len"] = df["answer"].apply(
        lambda x: len(x) if isinstance(x, str) else 0
    )

    # Vectorised instead of a row-wise apply: the row-wise form hands back a
    # whole DataFrame for a zero-row file, which cannot be stored in "nrr".
    has_answer = df["answer_len"] != 0
    repeated = df["newline_score"] + df["repetition_score"]
    # Substitute 1 for zero lengths so the division is always defined; those
    # rows are overridden with nrr = 1 just below.
    safe_len = np.where(has_answer, df["answer_len"], 1)
    df["nrr"] = np.where(has_answer, 1 - repeated / safe_len, 1.0)
    df["rr"] = 1 - df["nrr"]

    df.to_csv(result_file, index=False)

    return df
|
|
|
|
def replace_last(source_string, old_string, new_string):
    """Replace the last occurrence of ``old_string`` in ``source_string``.

    Args:
        source_string: Text to search.
        old_string: Substring whose final occurrence is replaced.
        new_string: Replacement text.

    Returns:
        The text with the last occurrence replaced, or ``source_string``
        unchanged when ``old_string`` does not occur in it.
    """
    head, sep, tail = source_string.rpartition(old_string)
    if not sep:
        # rpartition returns ("", "", source_string) when nothing matched;
        # concatenating would wrongly prepend new_string to the input.
        return source_string
    return head + new_string + tail
|
|
|
|
def load_for_repetition_penalty(
    csv_result_file, repetition_penalty, force_recalculate=False
):
    """Load the per-penalty result file that belongs to ``csv_result_file``.

    The per-penalty path is derived by replacing the final ``.csv`` with
    ``_RP_<penalty>.csv`` (penalty formatted with three decimals).

    Args:
        csv_result_file: Path of the overall result CSV.
        repetition_penalty: Repetition penalty value used in the file name.
        force_recalculate: Forwarded to
            ``load_with_newline_and_repetition_scores``.

    Returns:
        The data frame returned by ``load_with_newline_and_repetition_scores``.
    """
    penalty_suffix = f"_RP_{repetition_penalty:.3f}.csv"
    per_penalty_file = replace_last(csv_result_file, ".csv", penalty_suffix)
    return load_with_newline_and_repetition_scores(
        per_penalty_file, force_recalculate=force_recalculate
    )
|
|
|
|
# Penalty curves selectable by name in `calc_adjusted_performance`; each maps
# the non-repetition ratio to the factor the raw score is multiplied by.
rap_penalty_functions = dict(
    linear=lambda ratio: ratio,
    quadratic=lambda ratio: ratio * ratio,
    cubic=lambda ratio: ratio * ratio * ratio,
    logarithmic=lambda ratio: math.log(ratio + 1, 2),
    exponential=lambda ratio: math.exp(ratio - 1),
)
|
|
|
|
def calc_adjusted_performance(f, r, l=1, penalty_function="cubic"):
    """Scale a performance score by a penalty derived from repetition.

    Args:
        f: Raw performance score.
        r: Amount of repetition.
        l: Length that ``r`` is measured against; the non-repetition ratio is
            ``1 - r / l``, or ``0`` when ``l`` is not positive.
        penalty_function: Key into ``rap_penalty_functions``.

    Returns:
        ``f`` multiplied by the chosen penalty of the non-repetition ratio.
    """
    if l > 0:
        non_repetition_ratio = 1 - r / l
    else:
        non_repetition_ratio = 0
    penalty = rap_penalty_functions[penalty_function]
    return f * penalty(non_repetition_ratio)
|
|
|
|
def calculate_adjusted_performance(row):
    """Return repetition-adjusted precision and recall for one row.

    Args:
        row: Row providing ``total_repetitions``, ``answer_len``,
            ``precision`` and ``recall``.

    Returns:
        ``pd.Series([adjusted_precision, adjusted_recall])`` computed with
        the default penalty function of ``calc_adjusted_performance``.
    """
    repeated, length = row["total_repetitions"], row["answer_len"]
    return pd.Series(
        [
            calc_adjusted_performance(row[metric], repeated, length)
            for metric in ("precision", "recall")
        ]
    )
|
|
|
|
def load_performance_df(csv_result_file, repetition_penalty):
    """Read the evaluated-performance JSON that belongs to a result CSV.

    The JSON path is built from ``csv_result_file`` by replacing the final
    ``.csv`` with ``_RP_<penalty>-t2_evaluated.json`` (three decimals) and
    every ``/results/`` path segment with ``/eval/``.

    Args:
        csv_result_file: Path of the overall result CSV.
        repetition_penalty: Repetition penalty value used in the file name.

    Returns:
        The data frame produced by ``pd.read_json``.
    """
    evaluated_suffix = f"_RP_{repetition_penalty:.3f}-t2_evaluated.json"
    json_path = replace_last(csv_result_file, ".csv", evaluated_suffix).replace(
        "/results/", "/eval/"
    )
    print(f"loading json file: {json_path}")
    return pd.read_json(json_path)
|
|
|
|
def calculate_performance_score(
    csv_result_file, repetition_penalty, force_recalculate=False
):
    """Load or (re)build the per-penalty result file with performance columns.

    The per-penalty file name is ``csv_result_file`` with its final ``.csv``
    replaced by ``_rpp_<penalty>.csv`` (two decimals). When that file does not
    exist, recalculation is forced and starts from an empty frame. A refresh
    also happens when the frame has an ``f2`` column or lacks ``f1``.

    During a refresh the evaluated JSON is merged in on a best-effort basis
    (any error is printed and ignored), repetition scores, answer length and
    adjusted precision/recall are recomputed, and the file is written back.

    Args:
        csv_result_file: Path of the overall result CSV.
        repetition_penalty: Repetition penalty identifying the per-penalty file.
        force_recalculate: Rebuild the derived columns even if present.

    Returns:
        The per-penalty data frame.
    """
    result_file = replace_last(
        csv_result_file, ".csv", f"_rpp_{repetition_penalty:.2f}.csv"
    )

    if not os.path.exists(result_file):
        print(f"re-creating result file: {result_file}")
        df = pd.DataFrame()
        force_recalculate = True
    else:
        print(f"loading result file: {result_file}")
        df = load_with_newline_and_repetition_scores(
            result_file, force_recalculate=force_recalculate
        )

    needs_refresh = (
        force_recalculate or "f2" in df.columns or "f1" not in df.columns
    )
    if not needs_refresh:
        return df

    try:
        perf_df = load_performance_df(csv_result_file, repetition_penalty)

        # Remove columns that are about to be rebuilt (or are obsolete).
        stale_columns = [
            "precision",
            "recall",
            "f1",
            "f2",
            "entities_in_answer",
            "entities_in_question",
            "word_count",
        ]
        df.drop(columns=stale_columns, errors="ignore", inplace=True)

        for target, source in (
            ("id", "id"),
            ("question", "question"),
            ("answer", "pred_answer"),
        ):
            df[target] = perf_df[source]
        df["word_count"] = df["answer"].apply(
            lambda x: len(nltk.word_tokenize(x)) if isinstance(x, str) else 0
        )
        for column in ("ground_truth", "eval_gemini_1.0_pro"):
            df[column] = perf_df[column]

        # Each "score" entry is indexed 0/1/2 for precision/recall/f1.
        for position, metric in enumerate(("precision", "recall", "f1")):
            df[metric] = perf_df["score"].apply(
                lambda triple, position=position: triple[position]
            )
    except Exception as e:
        # Best effort: carry on with whatever columns the frame already has.
        print(f"\tignored error: {e}")

    df[["newline_score", "repetition_score", "total_repetitions"]] = df.apply(
        detect_scores, axis=1
    )
    df["answer_len"] = df["answer"].apply(
        lambda x: len(x) if isinstance(x, str) else 0
    )
    df[["adjusted_precision", "adjusted_recall"]] = df.apply(
        calculate_adjusted_performance, axis=1
    )

    df.to_csv(result_file, index=False)
    print(f"performance scores saved to result file: {result_file}")

    return df
|
|
|
|
def adjust_perf_scores_with_repetition_penalty(result, precision, recall):
    """Apply the repetition penalty to per-penalty precision and recall lists.

    Args:
        result: Dict whose ``"df_list_repetition_penalty"`` entry is a list of
            data frames with ``newline_score``, ``repetition_score`` and
            ``answer_len`` columns.
        precision: One precision value per data frame.
        recall: One recall value per data frame.

    Returns:
        ``(precision, recall)`` as new lists, each value passed through
        ``calc_adjusted_performance`` with the frame's mean newline score plus
        mean repetition score as repetition and its mean answer length as
        length.
    """
    frames = result["df_list_repetition_penalty"]

    def column_means(column):
        return [frame[column].mean() for frame in frames]

    newline_means = column_means("newline_score")
    repetition_means = column_means("repetition_score")
    length_means = column_means("answer_len")

    def penalise(scores):
        return [
            calc_adjusted_performance(score, newline + repetition, length)
            for score, newline, repetition, length in zip(
                scores, newline_means, repetition_means, length_means
            )
        ]

    return penalise(precision), penalise(recall)
|
|
|
|
def plot_performance_scores(
    result,
    models=None,
    title="Performance",
):
    """Plot F1 and repetition-adjusted F1 against repetition penalty.

    One figure is shown per model. The penalties with the best F1 and the best
    adjusted F1 are highlighted with a blue and an orange vertical band.

    Args:
        result: Mapping of model name to a dict with ``"df_overall"`` (needs a
            ``repetition_penalty`` column) and ``"df_list_repetition_penalty"``
            (list of frames with ``precision``, ``recall``, ``newline_score``,
            ``repetition_score`` and ``answer_len`` columns).
        models: Model names to plot; defaults to every key of ``result``.
        title: Text appended to the model name in the figure title.
    """

    def _f1(p, r):
        # Harmonic mean, defined as 0 when both inputs are 0 so that the
        # "best" search below never has to deal with a 0/0 result.
        return 0.0 if p + r == 0 else 2 * (p * r) / (p + r)

    if models is None:
        models = result.keys()
    for model in models:
        print(f"model: {model}")
        overall_df = result[model]["df_overall"]
        frames = result[model]["df_list_repetition_penalty"]

        precision = [frame["precision"].mean() for frame in frames]
        recall = [frame["recall"].mean() for frame in frames]
        f1 = [_f1(p, r) for p, r in zip(precision, recall)]
        best_f1_index = f1.index(max(f1))

        precision, recall = adjust_perf_scores_with_repetition_penalty(
            result[model], precision, recall
        )
        afrp = [_f1(p, r) for p, r in zip(precision, recall)]
        best_afrp_index = afrp.index(max(afrp))

        repetition_penalties = list(overall_df["repetition_penalty"])

        plt.figure(figsize=(10, 6))

        # Highlight the penalty that maximises each curve.
        for best_index, band_color in (
            (best_f1_index, "blue"),
            (best_afrp_index, "orange"),
        ):
            plt.axvspan(
                repetition_penalties[best_index] - 0.01,
                repetition_penalties[best_index] + 0.01,
                alpha=0.5,
                edgecolor="none",
                facecolor=band_color,
            )

        plt.plot(repetition_penalties, f1, label="F1", marker="D", color="blue")
        plt.plot(
            repetition_penalties,
            afrp,
            label="RAP - F1",
            marker="o",
            color="orange",
        )
        plt.xlabel("Repetition Penalties")
        plt.ylabel("Score")
        plt.gca().yaxis.set_major_formatter(mtick.PercentFormatter(1.0))
        plt.title(f"{model} {title}")
        plt.legend(bbox_to_anchor=(1.0, 0.5), loc="center left")

        plt.show()
|
|
|
|
def plot_best_afrp(
    result,
    models=None,
    title="Models with Best RAP - F1",
    ref_result=None,
):
    """Bar-plot each model's best repetition-adjusted F1 next to its raw F1.

    For every model the repetition penalty with the highest adjusted F1 is
    selected; the adjusted value and the raw F1 at that penalty are plotted.

    Args:
        result: Mapping of model name to a dict with ``"df_overall"`` (needs a
            ``repetition_penalty`` column) and ``"df_list_repetition_penalty"``
            (list of frames with ``precision``, ``recall``, ``newline_score``,
            ``repetition_score`` and ``answer_len`` columns).
        models: Model names to include; defaults to every key of ``result``.
        title: Figure title.
        ref_result: Optional mapping of reference model name to a CSV path
            with ``precision`` and ``recall`` columns. Reference models get
            the same value for F1 and adjusted F1 and a repetition of 0.

    Returns:
        ``(data_pivoted, best_mtr)``: the plotted table (metrics as rows,
        models as columns) and, per model, the mean newline score plus mean
        repetition score at the selected penalty.
    """

    def _f1(p, r):
        # Harmonic mean, 0 when both inputs are 0 (same convention as the
        # reference-model branch below).
        return 0.0 if p + r == 0 else 2 * (p * r) / (p + r)

    model_names = []
    best_f1 = []
    best_afrp = []
    best_repetition_penalty = []
    best_mtr = []

    if models is None:
        models = result.keys()
    for model in models:
        print(f"model: {model}")
        overall_df = result[model]["df_overall"]
        frames = result[model]["df_list_repetition_penalty"]

        precision = [frame["precision"].mean() for frame in frames]
        recall = [frame["recall"].mean() for frame in frames]
        f1 = [_f1(p, r) for p, r in zip(precision, recall)]

        newline_score = [frame["newline_score"].mean() for frame in frames]
        repetition_score = [frame["repetition_score"].mean() for frame in frames]
        answer_len = [frame["answer_len"].mean() for frame in frames]

        afrp = [
            calc_adjusted_performance(f, n + r, l)
            for f, n, r, l in zip(f1, newline_score, repetition_score, answer_len)
        ]

        best_afrp.append(max(afrp))
        best_afrp_index = afrp.index(best_afrp[-1])
        # best_afrp_index is a list position, so look it up positionally.
        best_repetition_penalty.append(
            overall_df["repetition_penalty"].iloc[best_afrp_index]
        )

        best_f1.append(f1[best_afrp_index])
        best_mtr.append(
            newline_score[best_afrp_index] + repetition_score[best_afrp_index]
        )

        model_names.append(f"{model} (RP={best_repetition_penalty[-1]})")

    if ref_result is not None:
        print("ref_result:", ref_result)
        for model in ref_result.keys():
            model_names.append(model)
            ref_df = pd.read_csv(ref_result[model])

            p = ref_df["precision"].mean()
            r = ref_df["recall"].mean()

            f1 = 2 * p * r / (p + r) if p + r > 0 else 0
            best_f1.append(f1)
            best_afrp.append(f1)
            best_mtr.append(0)

    print("model_names:", model_names)

    data = pd.DataFrame(
        {
            "Model": model_names,
            "RAP - F1": best_afrp,
            "F1": best_f1,
        }
    )

    # Long format -> one row per metric, one column per model.
    data_melted = data.melt(id_vars="Model", var_name="Metric", value_name="Score")
    data_pivoted = data_melted.pivot(index="Metric", columns="Model", values="Score")

    # pivot sorts labels; restore the model order and the metric order.
    data_pivoted = data_pivoted[model_names]
    data_pivoted = data_pivoted.reindex(["RAP - F1", "F1"])

    plt.figure(figsize=(15, 6))
    ax = data_pivoted.plot(kind="bar", ax=plt.gca(), width=0.9)
    plt.title(title)
    plt.legend(bbox_to_anchor=(1.0, 0.5), loc="center left")
    plt.xticks(rotation=0)
    ax.yaxis.set_major_formatter(mtick.PercentFormatter(1.0))

    # Leave head-room above the tallest bar for the rotated value labels.
    max_value = max([max(best_afrp), max(best_f1)]) * 1.12
    print("max_value:", max_value)
    ax.set_ylim(0, max_value)

    for p in ax.patches:
        ax.annotate(
            f"{p.get_height() * 100:.1f}",
            (p.get_x() + p.get_width() / 2.0, p.get_height()),
            ha="center",
            va="bottom",
            xytext=(0, 10),
            textcoords="offset points",
            rotation=90,
        )

    plt.show()
    return data_pivoted, best_mtr
|
|
|
|
def plot_best_performance(
    result,
    models=None,
    title="Models with Best F1 Score",
    adjusted_f1=False,
    ref_result=None,
):
    """Bar-plot precision, recall and F1 at each model's best-F1 penalty.

    Args:
        result: Mapping of model name to a dict with ``"df_overall"`` (needs a
            ``repetition_penalty`` column) and ``"df_list_repetition_penalty"``
            (list of frames with ``precision``, ``recall``, ``newline_score``
            and ``repetition_score`` columns; ``answer_len`` is also read when
            ``adjusted_f1`` is true).
        models: Model names to include; defaults to every key of ``result``.
        title: Figure title.
        adjusted_f1: When true, precision and recall are first passed through
            ``adjust_perf_scores_with_repetition_penalty`` and the bars are
            labelled "Adjusted ... with RP".
        ref_result: Optional mapping of reference model name to a CSV path
            with ``precision`` and ``recall`` columns.

    Returns:
        ``(data_pivoted, best_mtr)``: the plotted table (metrics as rows,
        models as columns) and, per model, the mean newline score plus mean
        repetition score at the selected penalty (0 for reference models).
    """

    def _f1(p, r):
        # Harmonic mean, defined as 0 when both inputs are 0.
        return 0.0 if p + r == 0 else 2 * (p * r) / (p + r)

    model_names = []
    best_precision = []
    best_recall = []
    best_f1 = []
    best_repetition_penalty = []
    best_mtr = []

    if models is None:
        models = result.keys()
    for model in models:
        print(f"model: {model}")
        overall_df = result[model]["df_overall"]
        frames = result[model]["df_list_repetition_penalty"]

        precision = [frame["precision"].mean() for frame in frames]
        recall = [frame["recall"].mean() for frame in frames]
        newline_score = [frame["newline_score"].mean() for frame in frames]
        repetition_score = [frame["repetition_score"].mean() for frame in frames]

        if adjusted_f1:
            precision, recall = adjust_perf_scores_with_repetition_penalty(
                result[model], precision, recall
            )

        f1 = [_f1(p, r) for p, r in zip(precision, recall)]

        best_f1.append(max(f1))
        best_f1_index = f1.index(best_f1[-1])
        # best_f1_index is a list position, so look it up positionally.
        best_repetition_penalty.append(
            overall_df["repetition_penalty"].iloc[best_f1_index]
        )

        best_precision.append(precision[best_f1_index])
        best_recall.append(recall[best_f1_index])
        best_mtr.append(newline_score[best_f1_index] + repetition_score[best_f1_index])

        print(
            f"best repetition penalty: {best_repetition_penalty[-1]}, best f1: {best_f1[-1]}, precision: {best_precision[-1]}, recall: {best_recall[-1]}"
        )

        best_frame = frames[best_f1_index]

        model_names.append(f"{model} (RP={best_repetition_penalty[-1]})")

        print(
            f"newline_score: {best_frame['newline_score'].sum()}, repetition_score: {best_frame['repetition_score'].sum()}"
        )

    if ref_result is not None:
        print("ref_result:", ref_result)
        for model in ref_result.keys():
            model_names.append(model)
            ref_df = pd.read_csv(ref_result[model])

            best_precision.append(ref_df["precision"].mean())
            best_recall.append(ref_df["recall"].mean())
            best_f1.append(_f1(best_precision[-1], best_recall[-1]))
            best_mtr.append(0)

    if adjusted_f1:
        metric_labels = (
            "Adjusted Precision with RP",
            "Adjusted Recall with RP",
            "Adjusted F1 with RP",
        )
    else:
        metric_labels = ("Precision", "Recall", "F1")
    data = pd.DataFrame(
        {
            "Model": model_names,
            metric_labels[0]: best_precision,
            metric_labels[1]: best_recall,
            metric_labels[2]: best_f1,
        }
    )
    columns = list(data.columns)

    # Long format -> one row per metric, one column per model.
    data_melted = data.melt(id_vars="Model", var_name="Metric", value_name="Score")
    data_pivoted = data_melted.pivot(index="Metric", columns="Model", values="Score")

    # pivot sorts labels; restore the model order and the metric order.
    data_pivoted = data_pivoted[model_names]
    data_pivoted = data_pivoted.reindex(columns[1:])

    plt.figure(figsize=(10, 6))
    ax = data_pivoted.plot(kind="bar", ax=plt.gca(), width=0.9)
    plt.title(title)
    plt.legend(bbox_to_anchor=(1.0, 0.5), loc="center left")
    plt.xticks(rotation=0)
    ax.yaxis.set_major_formatter(mtick.PercentFormatter(1.0))

    # Leave head-room above the tallest bar for the rotated value labels.
    max_value = max([max(best_precision), max(best_recall), max(best_f1)]) * 1.12
    print("max_value:", max_value)
    ax.set_ylim(0, max_value)

    for p in ax.patches:
        ax.annotate(
            f"{p.get_height() * 100:.1f}",
            (p.get_x() + p.get_width() / 2.0, p.get_height()),
            ha="center",
            va="bottom",
            xytext=(0, 10),
            textcoords="offset points",
            rotation=90,
        )

    plt.show()
    return data_pivoted, best_mtr
|
|
|
|
def plot_best_performance_ms_macro(
    result,
    models=None,
    title="Models with Best RAP - Performance",
    ref_result=None,
    skip_generic_prompt=False,
    include_adjusted_performance=True,
):
    """Bar-plot the best BLEU-1 / ROUGE-L based scores per model.

    The "overall" score is the harmonic mean of ``bleu1`` and ``rougeL`` taken
    from each model's ``"df_overall"`` frame. The penalty is selected by the
    repetition-adjusted score when ``include_adjusted_performance`` is true
    and by the overall score otherwise.

    Args:
        result: Mapping of model name to a dict with ``"df_overall"`` (needs
            ``bleu1``, ``rougeL`` and ``repetition_penalty`` columns) and
            ``"df_list_repetition_penalty"`` (list of frames with
            ``newline_score``, ``repetition_score`` and ``answer_len``).
        models: Model names to include; defaults to every key of ``result``.
        title: Figure title.
        ref_result: Optional mapping of reference model name to a CSV path
            whose first row provides ``bleu1`` and ``rougeL``.
        skip_generic_prompt: Skip models whose name contains
            "generic prompt" (compared case-insensitively).
        include_adjusted_performance: Plot adjusted vs. overall score when
            true; plot BLEU-1, ROUGE-L and overall score when false.

    Returns:
        ``(data_pivoted, best_mtr)``: the plotted table (metrics as rows,
        models as columns) and, per model, the mean newline score plus mean
        repetition score at the selected penalty (0 for reference models).
    """

    def _f1(p, r):
        # Harmonic mean, 0 when both inputs are 0. The values come from
        # plain Python floats here, so an unguarded 0/0 would raise.
        return 0.0 if p + r == 0 else 2 * (p * r) / (p + r)

    model_names = []
    best_f1 = []
    best_afrp = []
    best_repetition_penalty = []
    best_bleu1 = []
    best_rougeL = []
    best_mtr = []

    if models is None:
        models = result.keys()
    for model in models:
        # Model names in this file are spelled "(RAG - Generic Prompt)", so
        # the comparison has to ignore case to ever match.
        if skip_generic_prompt and "generic prompt" in model.lower():
            continue
        print(f"model: {model}")
        overall_df = result[model]["df_overall"]
        frames = result[model]["df_list_repetition_penalty"]

        bleu1 = list(overall_df["bleu1"])
        rougeL = list(overall_df["rougeL"])
        f1 = [_f1(p, r) for p, r in zip(bleu1, rougeL)]

        newline_score = [frame["newline_score"].mean() for frame in frames]
        repetition_score = [frame["repetition_score"].mean() for frame in frames]
        answer_len = [frame["answer_len"].mean() for frame in frames]

        afrp = [
            calc_adjusted_performance(f, n + r, l)
            for f, n, r, l in zip(f1, newline_score, repetition_score, answer_len)
        ]

        ranking = afrp if include_adjusted_performance else f1
        best_afrp.append(max(ranking))
        best_afrp_index = ranking.index(best_afrp[-1])
        # best_afrp_index is a list position, so look it up positionally.
        best_repetition_penalty.append(
            overall_df["repetition_penalty"].iloc[best_afrp_index]
        )

        best_f1.append(f1[best_afrp_index])
        best_bleu1.append(bleu1[best_afrp_index])
        best_rougeL.append(rougeL[best_afrp_index])
        best_mtr.append(
            newline_score[best_afrp_index] + repetition_score[best_afrp_index]
        )

        model_names.append(f"{model} (RP={best_repetition_penalty[-1]})")

    if ref_result is not None:
        print("ref_result:", ref_result)
        for model in ref_result.keys():
            model_names.append(model)
            ref_df = pd.read_csv(ref_result[model], comment="#", on_bad_lines="warn")

            p = ref_df["bleu1"].iloc[0]
            best_bleu1.append(p)

            r = ref_df["rougeL"].iloc[0]
            best_rougeL.append(r)

            f1 = 2 * p * r / (p + r) if p + r > 0 else 0
            best_f1.append(f1)
            best_afrp.append(f1)
            best_mtr.append(0)

    if include_adjusted_performance:
        data = pd.DataFrame(
            {
                "Model": model_names,
                "RAP - Perf Score": best_afrp,
                "Overall Perf Score": best_f1,
            }
        )
    else:
        data = pd.DataFrame(
            {
                "Model": model_names,
                "Bleu-1": best_bleu1,
                "Rouge-L": best_rougeL,
                "Overall Perf Score": best_f1,
            }
        )

    # Long format -> one row per metric, one column per model.
    data_melted = data.melt(id_vars="Model", var_name="Metric", value_name="Score")
    data_pivoted = data_melted.pivot(index="Metric", columns="Model", values="Score")

    # pivot sorts labels; restore the model order and the metric order.
    data_pivoted = data_pivoted[model_names]
    columns = list(data.columns)
    data_pivoted = data_pivoted.reindex(columns[1:])

    plt.figure(figsize=(10, 6))
    ax = data_pivoted.plot(kind="bar", ax=plt.gca(), width=0.9)
    plt.title(title)
    plt.legend(bbox_to_anchor=(1.0, 0.5), loc="center left")
    plt.xticks(rotation=0)
    ax.yaxis.set_major_formatter(mtick.PercentFormatter(1.0))

    a1 = max(best_afrp)
    a2 = max(best_f1)
    a3 = max(best_bleu1)
    a4 = max(best_rougeL)

    # Leave head-room above the tallest plotted bar for the value labels.
    max_value = (
        max([a1, a2] if include_adjusted_performance else [a1, a2, a3, a4]) * 1.12
    )
    print("max_value:", max_value)
    ax.set_ylim(0, max_value)

    for p in ax.patches:
        ax.annotate(
            f"{p.get_height() * 100:.1f}",
            (p.get_x() + p.get_width() / 2.0, p.get_height()),
            ha="center",
            va="bottom",
            xytext=(0, 10),
            textcoords="offset points",
            rotation=90,
        )

    plt.show()
    return data_pivoted, best_mtr
|
|
|
|
# Names of the open-source models covered by the result files listed in this
# module, in the order they are listed there.
all_open_source_models = list(
    (
        "gemma-1.1-2b-it",
        "Phi-3-mini-128k-instruct",
        "gemma-1.1-7b-it",
        "Llama-2-7b-chat-hf",
        "Mistral-7B-Instruct-v0.2",
        "Meta-Llama-3-8B-Instruct",
        "Llama-2-13b-chat-hf",
        "Llama-2-70b-chat-hf",
        "Meta-Llama-3-70B-Instruct",
    )
)
|
|
|
|
def load_for_repetition_penalty_ms_macro(
    csv_result_file, repetition_penalty, force_recalculate=False
):
    """Load the ``_rpp_<penalty>.csv`` file that belongs to ``csv_result_file``.

    The per-penalty path is derived by replacing the final ``.csv`` with
    ``_rpp_<penalty>.csv`` (penalty formatted with two decimals).

    Args:
        csv_result_file: Path of the overall result CSV.
        repetition_penalty: Repetition penalty value used in the file name.
        force_recalculate: Forwarded to
            ``load_with_newline_and_repetition_scores``.

    Returns:
        The data frame returned by ``load_with_newline_and_repetition_scores``.
    """
    penalty_suffix = f"_rpp_{repetition_penalty:.2f}.csv"
    per_penalty_file = replace_last(csv_result_file, ".csv", penalty_suffix)
    return load_with_newline_and_repetition_scores(
        per_penalty_file, force_recalculate=force_recalculate
    )
|
|
|
|
| |
def plot_performance_scores_ms_macro(
    result,
    models=None,
    title="Performance",
):
    """Plot overall and repetition-adjusted scores against repetition penalty.

    The overall score is the harmonic mean of the ``bleu1`` and ``rougeL``
    columns of each model's ``"df_overall"`` frame. One figure is shown per
    model, with the best penalty of each curve highlighted by a vertical band.

    Args:
        result: Mapping of model name to a dict with ``"df_overall"`` (needs
            ``bleu1``, ``rougeL`` and ``repetition_penalty`` columns) and
            ``"df_list_repetition_penalty"`` (list of frames with
            ``newline_score``, ``repetition_score`` and ``answer_len``).
        models: Model names to plot; defaults to every key of ``result``.
        title: Text appended to the model name in the figure title.
    """

    def _f1(p, r):
        # Harmonic mean, 0 when both inputs are 0. The values come from
        # plain Python floats here, so an unguarded 0/0 would raise.
        return 0.0 if p + r == 0 else 2 * (p * r) / (p + r)

    if models is None:
        models = result.keys()
    for model in models:
        print(f"model: {model}")
        overall_df = result[model]["df_overall"]

        bleu1 = list(overall_df["bleu1"])
        rougeL = list(overall_df["rougeL"])
        f1 = [_f1(p, r) for p, r in zip(bleu1, rougeL)]
        best_f1_index = f1.index(max(f1))

        bleu1, rougeL = adjust_perf_scores_with_repetition_penalty(
            result[model], bleu1, rougeL
        )
        afrp = [_f1(p, r) for p, r in zip(bleu1, rougeL)]
        best_afrp_index = afrp.index(max(afrp))

        repetition_penalties = list(overall_df["repetition_penalty"])

        plt.figure(figsize=(10, 6))

        # Highlight the penalty that maximises each curve.
        for best_index, band_color in (
            (best_f1_index, "blue"),
            (best_afrp_index, "orange"),
        ):
            plt.axvspan(
                repetition_penalties[best_index] - 0.01,
                repetition_penalties[best_index] + 0.01,
                alpha=0.5,
                edgecolor="none",
                facecolor=band_color,
            )

        plt.plot(
            repetition_penalties,
            f1,
            label="Overall Perf Score",
            marker="D",
            color="blue",
        )
        plt.plot(
            repetition_penalties,
            afrp,
            label="RAP - Perf Score",
            marker="o",
            color="orange",
        )

        plt.xlabel("Repetition Penalties")
        plt.ylabel("Score")
        plt.gca().yaxis.set_major_formatter(mtick.PercentFormatter(1.0))
        plt.title(f"{model} {title}")
        plt.legend(bbox_to_anchor=(1.0, 0.5), loc="center left")

        plt.show()
|
|
|
|
def plot_repetition_factors(result, groups):
    """Plot mean total repetitions against repetition penalty, one figure per group.

    Args:
        result: Mapping of model name to a dict with ``"df_overall"`` (needs a
            ``repetition_penalty`` column) and ``"df_list_repetition_penalty"``
            (list of frames with a ``total_repetitions`` column).
        groups: Substrings; a model is drawn in a group's figure when the
            substring occurs in the lower-cased model name.
    """
    for group in groups:
        plt.figure(figsize=(10, 6))

        tallest = 0
        for model in result.keys():
            if group not in model.lower():
                continue
            print(f"model: {model}")

            penalties = list(result[model]["df_overall"]["repetition_penalty"])
            mean_repetitions = [
                frame["total_repetitions"].mean()
                for frame in result[model]["df_list_repetition_penalty"]
            ]

            sns.lineplot(x=penalties, y=mean_repetitions, label=model)

            tallest = max(tallest, max(mean_repetitions))

        # 5% head-room above the highest curve.
        plt.ylim(0, tallest * 1.05)

        plt.grid(True)
        plt.xlabel("Repetition Penalties")
        plt.ylabel("Mean Total Repetitions")
        plt.title("Mean Total Repetitions vs Repetition Penalties")
        plt.legend()

        plt.show()
|
|
|
|
def plot_repetition_factors_by_group(result, group_filter=None):
    """Plot mean total repetitions against repetition penalty in one figure.

    Args:
        result: Mapping of model name to a dict with ``"df_overall"`` (needs a
            ``repetition_penalty`` column) and ``"df_list_repetition_penalty"``
            (list of frames with a ``total_repetitions`` column).
        group_filter: When given, only models whose name contains this
            substring (case-sensitive) are drawn.

    Models whose number of per-penalty frames differs from the number of
    repetition penalties are reported and skipped.
    """
    markers = ["D", "o", "s", "x"]
    colors = ["blue", "orange", "green", "red"]

    plt.figure(figsize=(10, 6))
    plotted = 0
    max_value = 0

    for model in result.keys():
        if group_filter is not None and group_filter not in model:
            continue

        print(f"model: {model}")

        df = result[model]["df_overall"]
        repetition_penalties = list(df["repetition_penalty"])

        mean_score = [
            frame["total_repetitions"].mean()
            for frame in result[model]["df_list_repetition_penalty"]
        ]
        if len(mean_score) != len(repetition_penalties):
            print(
                f"model: {model} has different length of repetition penalties and mean score"
            )
            print("repetition_panelties:", len(repetition_penalties))
            print("mean_score:", len(mean_score))
            continue

        max_value = max(max_value, max(mean_score))

        # The first four lines keep the D/blue, o/orange, s/green, x/red
        # pairing. After that the colours wrap around and the marker is
        # shifted by one per wrap, so later lines stay distinguishable
        # instead of indexing past the end of the two lists.
        color = colors[plotted % len(colors)]
        marker = markers[(plotted + plotted // len(colors)) % len(markers)]
        sns.lineplot(
            x=repetition_penalties,
            y=mean_score,
            label=model,
            marker=marker,
            color=color,
        )

        plotted += 1

    # 5% head-room above the highest curve.
    plt.ylim(0, max_value * 1.05)

    plt.xlabel("Repetition Penalties")
    plt.ylabel("Mean Total Repetitions")
    plt.title("Mean Total Repetitions vs Repetition Penalties")
    plt.legend(bbox_to_anchor=(1.0, 0.5), loc="center left")

    plt.show()
|
|
|
|
# Every result file is named "data/results/<model>(<variant>)_<suffix>.csv".
# Files are ordered model by model, with the three prompt variants of a model
# next to each other.
_RESULT_FILE_MODELS = (
    "gemma-1.1-2b-it",
    "Phi-3-mini-128k-instruct",
    "gemma-1.1-7b-it",
    "Llama-2-7b-chat-hf",
    "Mistral-7B-Instruct-v0.2",
    "Meta-Llama-3-8B-Instruct",
    "Llama-2-13b-chat-hf",
    "Llama-2-70b-chat-hf",
    "Meta-Llama-3-70B-Instruct",
)
_RESULT_FILE_VARIANTS = ("RAG - Generic Prompt", "RAG - Chat Template", "Non-RAG")

# Result files carrying the "_mm" suffix.
ms_marco_csv_result_files = [
    f"data/results/{model}({variant})_mm.csv"
    for model in _RESULT_FILE_MODELS
    for variant in _RESULT_FILE_VARIANTS
]


# Result files carrying the "_wd" suffix.
webqsp_csv_result_files = [
    f"data/results/{model}({variant})_wd.csv"
    for model in _RESULT_FILE_MODELS
    for variant in _RESULT_FILE_VARIANTS
]
|
|
|
|
def calc_rap_scores(
    result, precision="precision", recall="recall", penalty_function="cubic"
):
    """Compute per-penalty F1, non-repetition ratio and adjusted score.

    Args:
        result: Dict with ``"df_list_repetition_penalty"`` (list of frames with
            ``newline_score`` and ``repetition_score`` columns) and
            ``"df_overall"`` (needs an ``answer_len`` column).
        precision: Column name of the first score. If the first per-penalty
            frame has this column, both scores are taken as per-frame means;
            otherwise both are read from ``"df_overall"``.
        recall: Column name of the second score.
        penalty_function: Key into ``rap_penalty_functions``.

    Returns:
        ``(newline_score, repetition_score, f1, rap, nrr)``, one entry per
        repetition penalty: mean newline score, mean repetition score,
        harmonic mean of the two scores, the adjusted score, and
        ``1 - (newline + repetition) / answer_len``.
    """
    frames = result["df_list_repetition_penalty"]

    newline_score = [frame["newline_score"].mean() for frame in frames]
    repetition_score = [frame["repetition_score"].mean() for frame in frames]

    if precision in result["df_list_repetition_penalty"][0].columns:
        first_scores = [frame[precision].mean() for frame in frames]
        second_scores = [frame[recall].mean() for frame in frames]
    else:
        first_scores = result["df_overall"][precision]
        second_scores = result["df_overall"][recall]

    f1 = [2 * (p * r) / (p + r) for p, r in zip(first_scores, second_scores)]

    # f1 takes part in the zip only so that nrr is never longer than f1.
    nrr = [
        1 - (newline + repetition) / length
        for _score, newline, repetition, length in zip(
            f1, newline_score, repetition_score, result["df_overall"]["answer_len"]
        )
    ]

    # nrr is already a ratio, so 1 - nrr is passed as the repetition with the
    # default length of 1.
    rap = [
        calc_adjusted_performance(score, 1 - ratio, penalty_function=penalty_function)
        for score, ratio in zip(f1, nrr)
    ]

    return newline_score, repetition_score, f1, rap, nrr
|
|
|
|
def get_model_name(csv_result_file):
    """Extract the model name from a result file path.

    The path is split on every ``_`` and ``/``; the third piece is the name,
    e.g. ``"data/results/<name>_mm.csv"`` yields ``<name>``.

    Args:
        csv_result_file: Path of the result CSV.

    Returns:
        The third piece of the split path.

    Raises:
        IndexError: If the path splits into fewer than three pieces.
    """
    segments = re.split(r"[_/]", csv_result_file)
    print(f"parts: {segments}")
    return segments[2]
|
|
|
|
def load_webqsp_result(
    csv_result_files, force_recalculate=False, save=False, penalty_function="cubic"
):
    """Load overall and per-penalty results for every given result file.

    For each file the overall CSV is read, one per-penalty frame is obtained
    through ``calculate_performance_score`` for every value in its
    ``repetition_penalty`` column, and summary columns (``answer_len``,
    ``newline_score``, ``repetition_score``, ``total_repetitions``, ``perf``,
    ``nrr``, ``rap``, ``rr``, ``rrp``) are added to the overall frame.
    A failure for one file is printed with its traceback and the loop goes on.

    Args:
        csv_result_files: Paths of the overall result CSVs.
        force_recalculate: Forwarded to ``calculate_performance_score``.
        save: Write the augmented overall frame back to its CSV.
        penalty_function: Forwarded to ``calc_rap_scores``.

    Returns:
        Dict mapping model name to ``{"df_overall", "df_list_repetition_penalty",
        "file"}``.
    """
    result = {}
    for csv_result_file in csv_result_files:
        try:
            overall = pd.read_csv(csv_result_file)
            model_name = get_model_name(csv_result_file)
            print(f"\tmodel_name: {model_name}")

            per_penalty = [
                calculate_performance_score(
                    csv_result_file,
                    repetition_penalty,
                    force_recalculate=force_recalculate,
                )
                for repetition_penalty in overall["repetition_penalty"]
            ]

            overall["answer_len"] = [
                frame["answer_len"].mean() for frame in per_penalty
            ]

            # Registered before the scores are computed, so the entry stays in
            # the result even if one of the following steps fails.
            entry = {
                "df_overall": overall,
                "df_list_repetition_penalty": per_penalty,
                "file": csv_result_file,
            }
            result[model_name] = entry

            newline_score, repetition_score, perf, rap, nrr = calc_rap_scores(
                entry, penalty_function=penalty_function
            )
            overall["newline_score"] = newline_score
            overall["repetition_score"] = repetition_score
            overall["total_repetitions"] = (
                overall["newline_score"] + overall["repetition_score"]
            )
            overall["perf"] = perf
            overall["nrr"] = nrr
            overall["rap"] = rap
            overall["rr"] = 1 - overall["nrr"]
            overall["rrp"] = overall["rr"] * 100

            if save:
                overall.to_csv(csv_result_file, index=False)
        except Exception as e:
            print(f"Error: {e}")
            traceback.print_exc()

    return result
|
|
|
|
def load_ms_marco_result(
    csv_result_files,
    force_recalculate=False,
    calc_bertscore=True,
    save=False,
    penalty_function="cubic",
):
    """Load overall and per-penalty results, adding a BERTScore/METEOR column.

    For each file the overall CSV is read and one per-penalty frame is loaded
    for every value in its ``repetition_penalty`` column. If the score column
    (``bert_score`` when ``calc_bertscore`` is true, else ``meteor``) is not
    in the overall frame yet, it is computed from the per-penalty frames.
    Summary columns (``answer_len``, ``newline_score``, ``repetition_score``,
    ``total_repetitions``, ``perf``, ``nrr``, ``rap``, ``rr``, ``rrp``) are
    then added. A failure for one file is printed with its traceback and the
    loop continues with the next file.

    Args:
        csv_result_files: Paths of the overall result CSVs.
        force_recalculate: Forwarded to ``load_for_repetition_penalty_ms_macro``.
        calc_bertscore: Select BERTScore (true) or METEOR (false).
        save: Write the augmented overall frame back to its CSV. A file whose
            score column had to be computed is always written back; that does
            not affect the files processed after it.
        penalty_function: Forwarded to ``calc_rap_scores``.

    Returns:
        Dict mapping model name to ``{"df_overall", "df_list_repetition_penalty",
        "file"}``.
    """
    result = {}
    for csv_result_file in csv_result_files:
        try:
            df = pd.read_csv(csv_result_file)
            model_name = get_model_name(csv_result_file)
            print(f"\tmodel_name: {model_name}")

            dfs = [
                load_for_repetition_penalty_ms_macro(
                    csv_result_file,
                    repetition_penalty,
                    force_recalculate=force_recalculate,
                )
                for repetition_penalty in df["repetition_penalty"]
            ]

            df["answer_len"] = [df_rpp["answer_len"].mean() for df_rpp in dfs]

            col = "bert_score" if calc_bertscore else "meteor"
            score_unavailable = col not in df.columns
            # Decided per file: a freshly computed score is always persisted,
            # without changing the caller's `save` choice for later files.
            save_this_file = save or score_unavailable

            if score_unavailable:
                bert_meteor_scores = []
                for df_rpp in dfs:
                    if calc_bertscore:
                        bert_meteor_score = 0

                        for _, row in df_rpp.iterrows():
                            answer = row["answer"]
                            if not isinstance(answer, str):
                                answer = ""
                            # NOTE(review): ground_truth comes from a CSV, so
                            # [0] presumably selects its first character rather
                            # than a first reference answer - confirm the
                            # column format before relying on this score.
                            bert_meteor_score += bert_score.compute(
                                predictions=[answer],
                                references=[row["ground_truth"][0]],
                                lang="en",
                                model_type="microsoft/deberta-large-mnli",
                            )["f1"][0]

                        # Mean F1 over the rows of this per-penalty frame.
                        bert_meteor_score = bert_meteor_score / len(df_rpp)

                        print(f"bert_score: {bert_meteor_score}")
                    else:
                        bert_meteor_score = meteor.compute(
                            predictions=df_rpp["answer"],
                            references=df_rpp["ground_truth"],
                        )["meteor"]

                    bert_meteor_scores.append(bert_meteor_score)

                df[col] = bert_meteor_scores

            result[model_name] = {
                "df_overall": df,
                "df_list_repetition_penalty": dfs,
                "file": csv_result_file,
            }
            # The same column serves as both inputs, so the harmonic mean
            # computed by calc_rap_scores equals the score itself.
            newline_score, repetition_score, perf, rap, nrr = calc_rap_scores(
                result[model_name],
                precision=col,
                recall=col,
                penalty_function=penalty_function,
            )
            df["newline_score"] = newline_score
            df["repetition_score"] = repetition_score
            df["total_repetitions"] = df["newline_score"] + df["repetition_score"]
            df["perf"] = perf
            df["nrr"] = nrr
            df["rap"] = rap
            df["rr"] = df["nrr"].apply(lambda x: 1 - x)
            df["rrp"] = df["rr"].apply(lambda x: x * 100)

            if save_this_file:
                df.to_csv(csv_result_file, index=False)
        except Exception as e:
            print("An error occurred:", e)
            traceback.print_exc()
            print(f"csv_result_file: {csv_result_file}")

    return result
|
|