| import pandas as pd |
| import numpy as np |
| import plotly.express as px |
| from plotly.graph_objs import Figure |
|
|
| from src.leaderboard.filter_models import FLAGGED_MODELS |
| from src.display.utils import human_baseline_row as HUMAN_BASELINE, AutoEvalColumn, Tasks, Task, BENCHMARK_COLS |
| from src.leaderboard.read_evals import EvalResult |
|
|
|
|
|
|
def create_scores_df(raw_data: list[EvalResult]) -> dict[str, pd.DataFrame]:
    """
    Generates, for every metric, the running-maximum scores over time.

    :param raw_data: A list of EvalResult entries with metric scores and dates.
    :return: A dict mapping each metric column name (benchmarks plus the
        overall average) to a DataFrame of records {"model", "date", "score"}
        recording each new best score in date order.
    """
    results_df = pd.DataFrame(raw_data)
    results_df.sort_values(by="date", inplace=True)

    # One score history per benchmark column, plus the overall average.
    scores = {k: [] for k in BENCHMARK_COLS + [AutoEvalColumn.average.name]}

    for task in [t.value for t in Tasks] + [Task("Average", "avg", AutoEvalColumn.average.name)]:
        current_max = 0
        last_date = ""
        column = task.col_name
        for _, row in results_df.iterrows():
            current_model = row["full_model"]
            # Flagged models are excluded from the best-score history.
            if current_model in FLAGGED_MODELS:
                continue

            current_date = row["date"]
            if task.benchmark == "Average":
                # Synthetic "Average" task: mean over all benchmark results.
                current_score = np.mean(list(row["results"].values()))
            else:
                current_score = row["results"][task.benchmark]

            if current_score > current_max:
                # Keep at most one record per date: a same-day improvement
                # overwrites the previous record instead of appending.
                if current_date == last_date and len(scores[column]) > 0:
                    scores[column][-1] = {"model": current_model, "date": current_date, "score": current_score}
                else:
                    scores[column].append({"model": current_model, "date": current_date, "score": current_score})
                current_max = current_score
                last_date = current_date

    return {k: pd.DataFrame(v) for k, v in scores.items()}
|
|
|
|
def create_plot_df(scores_df: dict[str, pd.DataFrame]) -> pd.DataFrame:
    """
    Transforms the per-metric score DataFrames into long format for plotting.

    :param scores_df: A mapping from metric column name to its score DataFrame
        (records with "model", "date" and "score" columns).
    :return: A single DataFrame, sorted by date, with a "task" column naming
        the metric each row belongs to.
    """
    dfs = []
    for col in BENCHMARK_COLS + [AutoEvalColumn.average.name]:
        d = scores_df[col].reset_index(drop=True)
        # Tag every row with its metric so px.line can color by "task".
        d["task"] = col
        dfs.append(d)

    concat_df = pd.concat(dfs, ignore_index=True)

    concat_df.sort_values(by="date", inplace=True)
    concat_df.reset_index(drop=True, inplace=True)
    return concat_df
|
|
|
|
def create_metric_plot_obj(
    df: pd.DataFrame, metrics: list[str], title: str
) -> Figure:
    """
    Build a Plotly line figure of metric scores over time, with a dotted
    horizontal line per metric marking its human baseline.

    :param df: Long-format DataFrame with "date", "score", "task" and
        "model" columns.
    :param metrics: Names of the metrics (tasks) to include in the plot.
    :param title: Title of the resulting figure.
    :return: A Plotly Figure with one line per metric and dotted baseline
        lines for the metrics that have a human baseline.
    """
    # Restrict the data to the requested metrics.
    df = df[df["task"].isin(metrics)]

    # Only draw baselines for metrics that actually appear in the plot.
    filtered_human_baselines = {
        metric: baseline
        for metric, baseline in HUMAN_BASELINE.items()
        if metric in metrics
    }

    fig = px.line(
        df,
        x="date",
        y="score",
        color="task",
        markers=True,
        custom_data=["task", "score", "model"],
        title=title,
    )

    hover_lines = [
        "Model Name: %{customdata[2]}",
        "Metric Name: %{customdata[0]}",
        "Date: %{x}",
        "Metric Value: %{y}",
    ]
    fig.update_traces(hovertemplate="<br>".join(hover_lines))

    # Scores are percentages; pin the y axis accordingly.
    fig.update_layout(yaxis_range=[0, 100])

    # Reuse each trace's line color for its matching baseline annotation.
    metric_color_mapping = {trace.name: trace.line.color for trace in fig.data}

    for metric, value in filtered_human_baselines.items():
        color = metric_color_mapping.get(metric, "blue")
        # HellaSwag's baseline sits near the top, so anchor its label above.
        location = "top left" if metric == "HellaSwag" else "bottom left"

        fig.add_hline(
            y=value,
            line_dash="dot",
            annotation_text=f"{metric} human baseline",
            annotation_position=location,
            annotation_font_size=10,
            annotation_font_color=color,
            line_color=color,
        )

    return fig
|
|
|
|
| |
| |
| |
|
|