| import json |
| import metrics |
| import argparse |
| import numpy as np |
| import multiprocessing |
| from tqdm import trange |
| import signal, functools |
| import re, os, sys, random, time |
| from fraction import Fraction |
| from data_processing.answer_extraction import * |
| from functools import lru_cache |
| from eval.eval_script import * |
# Largest machine int, used as an "unbounded" sentinel; not referenced in this
# chunk — TODO confirm it is used elsewhere in the file/project.
MAX_INT = sys.maxsize
# Placeholder for predictions whose answer could not be extracted — presumably
# produced by the answer_extraction helpers imported above; confirm at callers.
INVALID_ANS = "[Invalid]"
# Large finite float used as a -infinity initializer (see prep_evaluator).
INF = 1e9
|
|
# Explicit public API of this module; helpers such as `prep_evaluator` are
# intentionally left out and reached through `Evaluator` internally.
__all__ = [
    "check_equal",
    "check_equal_without_timeout",
    "numberic_compare",  # NOTE: "numberic" spelling matches the definition below
    "Evaluator",
]
|
|
@lru_cache(maxsize=1000000)
def check_equal_without_timeout(ans_1, ans_2):
    """Memoized mathematical-equality check, delegating to ``math_equal``.

    Despite the name there is no timeout mechanism here; ``check_equal``
    catches ``TimeoutError`` in case a timeout wrapper is applied elsewhere.
    Both arguments must be hashable for the ``lru_cache`` to work.
    """
    verdict = math_equal(ans_1, ans_2)
    return verdict
|
|
def check_equal(ans_1, ans_2, cache_dict=None):
    """Check whether two answers are mathematically equal.

    When ``cache_dict`` (a plain ``str -> bool`` mapping) is supplied it is
    consulted first; on a miss the result is computed and written back so
    repeated queries stay cheap (the original only ever read the cache and
    printed a debug "Miss" message).

    Args:
        ans_1: first answer (must stringify for the cache key).
        ans_2: second answer.
        cache_dict: optional external cache keyed by ``"<a><##><b>"``.

    Returns:
        bool: True when the answers are judged equal; False on timeout.
    """
    try:
        key = None
        if cache_dict is not None:
            key = str(ans_1) + "<##>" + str(ans_2)
            if key in cache_dict:
                return cache_dict[key]
        result = check_equal_without_timeout(ans_1, ans_2)
        if key is not None:
            # Write the miss back — assumes cache_dict is mutable; confirm
            # the caller does not pass a read-only mapping.
            cache_dict[key] = result
        return result
    except TimeoutError:
        # A timeout wrapper may be applied around the underlying check
        # elsewhere; treat a timeout as "not equal" rather than crashing.
        return False
|
|
def numberic_compare(ai, aj, ci, cj, cache_dict=None):
    """Compare two extracted answers for mathematical equality.

    The completion arguments ``ci`` and ``cj`` are accepted only for
    comparator-interface compatibility and are ignored; the verdict comes
    entirely from ``check_equal`` on the answers (with an optional cache).
    """
    del ci, cj  # unused; kept for the (ai, aj, ci, cj) comparator signature
    return check_equal(ai, aj, cache_dict)
|
|
def prep_evaluator(
    predicts, completions, perplexities, answer, equal_func, check_equal
):
    """Score one problem's samples by picking the highest-perplexity answer(s).

    The samples whose ``mean_logprob`` equals the maximum are treated as the
    model's chosen answer; ties share credit equally, so ``correct`` is the
    fraction of argmax entries matching ``answer``.

    Args:
        predicts: extracted answers, one per sample.
        completions: raw completions — unused here, kept for interface parity
            with other evaluators.
        perplexities: per-sample mean log-probs (higher = more confident).
        answer: the gold answer.
        equal_func: pairwise comparator — unused here, interface parity.
        check_equal: callable ``(prediction, answer) -> bool``.

    Returns:
        ``(correct, answers)`` where ``correct`` is in [0, 1] and ``answers``
        is a per-sample list of ``[prediction, exp(perplexity), is_correct]``.
    """
    m = len(predicts)
    if m == 0:
        # No samples: nothing to score (matches the original's fall-through).
        return 0, []

    # Highest perplexity and how many samples attain it (ties split credit).
    max_perplexity = max(perplexities[:m])
    max_perplexity_count = float(perplexities[:m].count(max_perplexity))

    correct, answers = 0, []
    for ans_i, ppl in zip(predicts, perplexities):
        # Evaluate each prediction exactly once — the original called
        # check_equal twice for every argmax entry.
        is_correct = check_equal(ans_i, answer)
        answers.append([ans_i, np.exp(ppl), is_correct])
        if ppl >= max_perplexity and is_correct:
            correct += 1.0 / max_perplexity_count
    return correct, answers
|
|
class Evaluator:
    """Perplexity-based evaluator.

    For each problem it selects the highest-confidence sample(s), scores them
    against the gold answer, and reports accuracy plus calibration (ECE)
    aggregated over repeated random sub-samples of size K.
    """

    def __init__(self):
        # Human-readable name of the selection strategy.
        self.name = "Perplexity"

    def process(self, json_file, cache_file, equal_func, evaluator, K, seed=0):
        """Evaluate one random subset of K samples per problem.

        Args:
            json_file: dict with keys "predict", "completion", "mean_logprob"
                (each an n x m nested list) and "answer" (length n) —
                presumably loaded from a results JSON; confirm at the caller.
            cache_file: optional dict-like equality cache, or None.
            equal_func: pairwise comparator, e.g. ``numberic_compare``.
            evaluator: per-problem scorer, e.g. ``prep_evaluator``.
            K: number of sample columns drawn (without replacement).
            seed: RNG seed controlling which columns are drawn.

        Returns:
            ``(accuracy_percent, maximum, average, max_bins, avg_bins)`` where
            the metric values come from the ``metrics`` module.
        """
        results = json_file
        n = len(results["predict"])
        m = len(results["predict"][0])

        # Draw the same K sample indices for every problem.
        indices = list(range(m))
        random.seed(seed)
        random.shuffle(indices)
        indices = indices[:K]

        # Bind the shared cache into the comparison helpers when provided.
        if cache_file is not None:
            def cache_equal_func(ai, aj, ci, cj):
                return equal_func(ai, aj, ci, cj, cache_file)

            def cache_check_equal(ai, aj):
                return check_equal(ai, aj, cache_file)
        else:
            cache_equal_func = equal_func
            cache_check_equal = check_equal

        # Restrict every per-problem list to the sampled columns.
        predicts, completions, perplexities, answers = [], [], [], []
        for i in range(n):
            predicts.append([results["predict"][i][j] for j in indices])
            completions.append([results["completion"][i][j] for j in indices])
            perplexities.append([results["mean_logprob"][i][j] for j in indices])
            answers.append(results["answer"][i])

        # Score every problem sequentially (parallelism happens across seeds
        # in solve(), not across problems here).
        start_time = time.time()
        outputs = []
        for idx in trange(n):
            res = evaluator(
                predicts[idx],
                completions[idx],
                perplexities[idx],
                answers[idx],
                cache_equal_func,
                cache_check_equal,
            )
            outputs.append(res)
        print(f"Running Time with Single Process Mode with Seed #{seed}: {time.time() - start_time:.2f}S")

        # NOTE(review): the original re-scanned `outputs` here, unpacking each
        # (ans, prob, flag) triple without using it; that dead loop is removed
        # — the metrics below depend only on `outputs` itself.
        per_sample = [x[1] for x in outputs]
        maximum, max_bins = metrics.compute_maximum_metrics(per_sample)
        average, avg_bins = metrics.compute_average_metrics(per_sample)
        accs = np.mean([x[0] for x in outputs])
        return accs * 100.0, maximum, average, max_bins, avg_bins

    def worker(self, args):
        """Pool entry point: run ``process`` for one seed configuration."""
        json_file, cache_file, K, seed = args
        acc, maximum, average, max_bins, avg_bins = self.process(
            json_file=json_file,
            cache_file=cache_file,
            equal_func=numberic_compare,
            evaluator=prep_evaluator,
            K=K,
            seed=seed,
        )
        # Bin details are dropped; solve() only aggregates the scalar metrics.
        return acc, maximum, average

    def solve(self, json_file, cache_file=None, repeats=10, K=128):
        """Run ``repeats`` seeded evaluations in parallel and aggregate them.

        Returns a dict with "Accuracy" and "ECE" formatted as "mean ± std".

        NOTE(review): each Pool worker holds its own lru_cache / cache_file
        state — cache hits are not shared across processes; confirm that is
        acceptable for the workload.
        """
        with multiprocessing.Pool() as pool:
            results = pool.map(
                self.worker,
                [(json_file, cache_file, K, seed) for seed in range(repeats)],
            )
        accs, maxs, _ = zip(*results)
        accs, maxs = np.array(accs), np.array(maxs)
        return {
            "Accuracy": f"{accs.mean():.2f} ± {accs.std():.2f}",
            "ECE": f"{maxs[:, 0].mean() * 100.0:.2f} ± {maxs[:, 0].std() * 100.0:.2f}",
        }
|
|