Spaces:
Running
Running
new_benchmark
#6
by QuentinJG - opened
- .github/workflows/restart.yml +0 -19
- CHANGELOG.md +0 -3
- README.md +1 -2
- app.py +72 -812
- app/utils.py +20 -134
- data/dataset_handler.py +1 -91
- data/deprecated_model_handler.py +0 -128
- data/model_handler.py +93 -74
- data/pipeline_handler.py +0 -274
- requirements.txt +0 -6
.github/workflows/restart.yml
DELETED
|
@@ -1,19 +0,0 @@
|
|
| 1 |
-
name: Scheduled Restart
|
| 2 |
-
|
| 3 |
-
on:
|
| 4 |
-
schedule:
|
| 5 |
-
- cron: '0 */1 * * *' # at minute 0 of every hour
|
| 6 |
-
workflow_dispatch:
|
| 7 |
-
|
| 8 |
-
jobs:
|
| 9 |
-
restart:
|
| 10 |
-
runs-on: ubuntu-latest
|
| 11 |
-
steps:
|
| 12 |
-
- uses: actions/checkout@v4
|
| 13 |
-
- run: |
|
| 14 |
-
date > restart.txt
|
| 15 |
-
git config user.name "github-actions"
|
| 16 |
-
git config user.email "github-actions@github.com"
|
| 17 |
-
git add restart.txt
|
| 18 |
-
git commit -m "Scheduled restart"
|
| 19 |
-
git push
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
CHANGELOG.md
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
# 0.1.0
|
| 2 |
-
|
| 3 |
-
- We identified that the insurance policy data used in the Vidore V2 leaderboard falls within a grey area of licensing. While the data was publicly accessible, its licensing terms were not sufficiently clear to ensure compliance with our data usage standards. We have since removed this benchmark data from HuggingFace and removed the concerned columns from the leaderboard.
|
|
|
|
|
|
|
|
|
|
|
|
README.md
CHANGED
|
@@ -7,8 +7,7 @@ sdk: gradio
|
|
| 7 |
app_file: app.py
|
| 8 |
pinned: true
|
| 9 |
license: mit
|
| 10 |
-
sdk_version:
|
| 11 |
-
python_version: 3.12.11
|
| 12 |
---
|
| 13 |
|
| 14 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
|
| 7 |
app_file: app.py
|
| 8 |
pinned: true
|
| 9 |
license: mit
|
| 10 |
+
sdk_version: 4.37.1
|
|
|
|
| 11 |
---
|
| 12 |
|
| 13 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
app.py
CHANGED
|
@@ -1,79 +1,20 @@
|
|
| 1 |
-
import re
|
| 2 |
-
|
| 3 |
import gradio as gr
|
| 4 |
-
import pandas as pd
|
| 5 |
-
import plotly.express as px
|
| 6 |
-
|
| 7 |
-
from app.utils import (
|
| 8 |
-
add_rank_and_format,
|
| 9 |
-
deprecated_get_refresh_function,
|
| 10 |
-
filter_models,
|
| 11 |
-
get_pipeline_refresh_function,
|
| 12 |
-
get_refresh_function,
|
| 13 |
-
)
|
| 14 |
-
from data.deprecated_model_handler import DeprecatedModelHandler
|
| 15 |
-
from data.model_handler import ModelHandler
|
| 16 |
-
from data.pipeline_handler import PipelineHandler
|
| 17 |
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
"ndcg_at_5",
|
| 21 |
-
"ndcg_at_10",
|
| 22 |
-
"ndcg_at_100",
|
| 23 |
-
"recall_at_1",
|
| 24 |
-
"recall_at_5",
|
| 25 |
-
"recall_at_10",
|
| 26 |
-
"recall_at_100",
|
| 27 |
-
]
|
| 28 |
|
|
|
|
| 29 |
|
| 30 |
def main():
|
| 31 |
-
# Get new results
|
| 32 |
model_handler = ModelHandler()
|
| 33 |
initial_metric = "ndcg_at_5"
|
| 34 |
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
data_benchmark_2 = model_handler.render_df(initial_metric, benchmark_version=2)
|
| 39 |
-
data_benchmark_2 = add_rank_and_format(data_benchmark_2, benchmark_version=2)
|
| 40 |
-
|
| 41 |
-
num_datasets_1 = len(data_benchmark_1.columns) - 3
|
| 42 |
-
num_scores_1 = len(data_benchmark_1) * num_datasets_1
|
| 43 |
-
num_models_1 = len(data_benchmark_1)
|
| 44 |
-
|
| 45 |
-
num_datasets_2 = len(data_benchmark_2.columns) - 3
|
| 46 |
-
num_scores_2 = len(data_benchmark_2) * num_datasets_2
|
| 47 |
-
num_models_2 = len(data_benchmark_2)
|
| 48 |
-
|
| 49 |
-
deprecated_model_handler = DeprecatedModelHandler()
|
| 50 |
-
initial_metric = "ndcg_at_5"
|
| 51 |
-
initial_metric_v3 = "ndcg_at_10"
|
| 52 |
-
|
| 53 |
-
# Get pipeline evaluation results
|
| 54 |
-
pipeline_handler = PipelineHandler()
|
| 55 |
-
pipeline_handler.get_pipeline_data()
|
| 56 |
-
data_pipeline = pipeline_handler.render_df(initial_metric_v3, "english")
|
| 57 |
-
data_pipeline = add_rank_and_format(data_pipeline, benchmark_version=3, is_pipeline=True)
|
| 58 |
-
|
| 59 |
-
num_datasets_pipeline = len(data_pipeline.columns) - 5 # Excluding Rank, Model, Indexing time, search time, Average
|
| 60 |
-
num_scores_pipeline = len(data_pipeline) * num_datasets_pipeline
|
| 61 |
-
num_pipelines = len(data_pipeline)
|
| 62 |
-
|
| 63 |
-
deprecated_model_handler.get_vidore_data(initial_metric)
|
| 64 |
-
deprecated_data_benchmark_1 = deprecated_model_handler.render_df(initial_metric, benchmark_version=1)
|
| 65 |
-
deprecated_data_benchmark_1 = add_rank_and_format(deprecated_data_benchmark_1, benchmark_version=1)
|
| 66 |
-
|
| 67 |
-
deprecated_data_benchmark_2 = deprecated_model_handler.render_df(initial_metric, benchmark_version=2)
|
| 68 |
-
deprecated_data_benchmark_2 = add_rank_and_format(deprecated_data_benchmark_2, benchmark_version=2)
|
| 69 |
-
|
| 70 |
-
deprecated_num_datasets_1 = len(deprecated_data_benchmark_1.columns) - 3
|
| 71 |
-
deprecated_num_scores_1 = len(deprecated_data_benchmark_1) * deprecated_num_datasets_1
|
| 72 |
-
deprecated_num_models_1 = len(deprecated_data_benchmark_1)
|
| 73 |
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
|
| 78 |
css = """
|
| 79 |
table > thead {
|
|
@@ -96,812 +37,131 @@ def main():
|
|
| 96 |
.markdown {
|
| 97 |
font-size: 1rem;
|
| 98 |
}
|
| 99 |
-
|
| 100 |
-
.alert-info {
|
| 101 |
-
background-color: #e3f2fd;
|
| 102 |
-
border-left: 4px solid #2196f3;
|
| 103 |
-
padding: 5px 15px;
|
| 104 |
-
}
|
| 105 |
-
|
| 106 |
-
/* 1. Force text wrapping on all headers from the 3rd column onwards */
|
| 107 |
-
#pipeline-table table th:nth-child(n+3) * {
|
| 108 |
-
white-space: normal !important;
|
| 109 |
-
overflow: visible !important;
|
| 110 |
-
text-overflow: clip !important;
|
| 111 |
-
line-height: 1.2 !important;
|
| 112 |
-
word-break: normal !important; /* Prevents breaking in the middle of words */
|
| 113 |
-
overflow-wrap: normal !important; /* Prevents breaking in the middle of words */
|
| 114 |
-
}
|
| 115 |
-
|
| 116 |
-
/* 2. Set a fixed width and center alignment for headers and data cells from the 3rd column onwards */
|
| 117 |
-
#pipeline-table table th:nth-child(n+3),
|
| 118 |
-
#pipeline-table table td:nth-child(n+3) {
|
| 119 |
-
min-width: 100px !important; /* Just wide enough to fit long words like "Pharmaceuticals" */
|
| 120 |
-
max-width: 120px !important;
|
| 121 |
-
}
|
| 122 |
-
|
| 123 |
-
/* 3. Make the Model column (2nd column) wider for pipeline table */
|
| 124 |
-
#pipeline-table table th:nth-child(2),
|
| 125 |
-
#pipeline-table table td:nth-child(2) {
|
| 126 |
-
min-width: 400px !important;
|
| 127 |
-
max-width: 500px !important;
|
| 128 |
-
}
|
| 129 |
"""
|
| 130 |
|
| 131 |
with gr.Blocks(css=css) as block:
|
| 132 |
-
with gr.Tabs()
|
| 133 |
-
with gr.TabItem("
|
| 134 |
-
gr.Markdown("# ViDoRe
|
| 135 |
-
|
| 136 |
-
gr.Markdown(
|
| 137 |
-
"""
|
| 138 |
-
Visual Document Retrieval Benchmark 3 leaderboard. To submit results, refer to the corresponding tab.
|
| 139 |
-
|
| 140 |
-
Refer to:
|
| 141 |
-
- 🤗 The [blogpost](https://huggingface.co/blog/QuentinJG/introducing-vidore-v3) for all the details on the datasets,
|
| 142 |
-
- 🤗 The [dataset collection](https://huggingface.co/collections/vidore/vidore-benchmark-v3),
|
| 143 |
-
- 📝 The [ColPali paper](https://arxiv.org/abs/2407.01449) for details on metrics.
|
| 144 |
-
"""
|
| 145 |
-
)
|
| 146 |
-
|
| 147 |
-
gr.Markdown("""
|
| 148 |
-
As the reference results are now hosted on the [MTEB Leaderboard](https://mteb-leaderboard.hf.space/?benchmark_name=ViDoRe(v3)),
|
| 149 |
-
we embed it here.
|
| 150 |
-
""")
|
| 151 |
-
|
| 152 |
-
gr.Markdown(
|
| 153 |
-
"""**💡 To display English-only results:**
|
| 154 |
-
- Under *Customize this Benchmark*, unselect the French datasets (*Vidore3EnergyRetrieval*, *Vidore3FinanceFrRetrieval*, *Vidore3PhysicsRetrieval*),
|
| 155 |
-
- Go to the *Performance per language* tab (you might have to click on the three dots on the right of the tab bar to see it),
|
| 156 |
-
- The *eng-Latn* column will show English-only results (= English queries on English documents).""",
|
| 157 |
-
elem_classes="alert-info",
|
| 158 |
-
)
|
| 159 |
-
|
| 160 |
-
gr.HTML(
|
| 161 |
-
"""
|
| 162 |
-
<iframe
|
| 163 |
-
src="https://mteb-leaderboard.hf.space/?benchmark_name=ViDoRe(v3)"
|
| 164 |
-
style="width:100%; height:1000px; border:2px solid black; border-radius:10px;"
|
| 165 |
-
></iframe>
|
| 166 |
-
"""
|
| 167 |
-
)
|
| 168 |
-
|
| 169 |
-
with gr.TabItem("ViDoRe V3 (Pipeline)", id="vidore-v3-pipeline"):
|
| 170 |
-
gr.Markdown("# ViDoRe V3 (Pipeline Evaluation): Retrieval Performance for Complex Pipelines ⚙️")
|
| 171 |
-
gr.Markdown(
|
| 172 |
-
"### Assessing retrieval performance, latency, and compute costs of complex retrieval pipelines"
|
| 173 |
-
)
|
| 174 |
-
|
| 175 |
-
gr.Markdown(
|
| 176 |
-
"""
|
| 177 |
-
This leaderboard ranks full retrieval pipelines on **English-only queries** for **ViDoRe V3**. Instead of just testing standalone models, we evaluate real-world, multi-step retrieval systems. This includes everything from basic retrievers to advanced setups using AI agents, query reformulation, hybrid search, and any other creative retrieval pipeline one can imagine.
|
| 178 |
-
|
| 179 |
-
To show efficiency alongside accuracy, we include **Indexing latency** (seconds/doc) and **Search latency** (seconds/query). These numbers are self-reported and depend heavily on hardware, so treat them as rough estimates. Still, they give a helpful idea of how much computing power each pipeline needs.
|
| 180 |
-
|
| 181 |
-
⚠️ **Note:** Because this only uses English queries, these scores **cannot be directly compared** to the standard ViDoRe V3 results.
|
| 182 |
-
|
| 183 |
-
*Results are sourced from the [vidore-benchmark repository](https://github.com/illuin-tech/vidore-benchmark/tree/main/results).*
|
| 184 |
-
"""
|
| 185 |
-
)
|
| 186 |
-
|
| 187 |
-
if len(data_pipeline) > 0:
|
| 188 |
-
datasets_columns_pipeline = [
|
| 189 |
-
col
|
| 190 |
-
for col in data_pipeline.columns[4:]
|
| 191 |
-
if col not in ["Indexing latency (s/doc)", "Search latency (s/query)", "Average Score"]
|
| 192 |
-
]
|
| 193 |
-
|
| 194 |
-
with gr.Row():
|
| 195 |
-
metric_dropdown_pipeline = gr.Dropdown(
|
| 196 |
-
choices=METRICS, value=initial_metric_v3, label="Select Metric"
|
| 197 |
-
)
|
| 198 |
-
research_textbox_pipeline = gr.Textbox(
|
| 199 |
-
placeholder="🔍 Search Pipelines... [press enter]",
|
| 200 |
-
label="Filter Pipelines by Name",
|
| 201 |
-
)
|
| 202 |
-
|
| 203 |
-
with gr.Row():
|
| 204 |
-
if datasets_columns_pipeline:
|
| 205 |
-
column_checkboxes_pipeline = gr.CheckboxGroup(
|
| 206 |
-
choices=datasets_columns_pipeline,
|
| 207 |
-
value=datasets_columns_pipeline,
|
| 208 |
-
label="Select Datasets to Display",
|
| 209 |
-
)
|
| 210 |
-
else:
|
| 211 |
-
column_checkboxes_pipeline = gr.CheckboxGroup(
|
| 212 |
-
choices=[], value=[], label="Select Datasets to Display"
|
| 213 |
-
)
|
| 214 |
-
|
| 215 |
-
with gr.Row():
|
| 216 |
-
# Datatype: Rank, Pipeline, Indexing latency (s/doc), Search latency (s/query), Average + datasets
|
| 217 |
-
datatype_pipeline = ["number", "markdown", "number", "number", "number"] + ["number"] * len(
|
| 218 |
-
datasets_columns_pipeline
|
| 219 |
-
)
|
| 220 |
-
dataframe_pipeline = gr.Dataframe(
|
| 221 |
-
data_pipeline, datatype=datatype_pipeline, type="pandas", elem_id="pipeline-table"
|
| 222 |
-
)
|
| 223 |
-
|
| 224 |
-
def clean_pipeline_name(name):
|
| 225 |
-
if not isinstance(name, str):
|
| 226 |
-
return str(name)
|
| 227 |
-
# Remove Markdown links [text](url) -> text
|
| 228 |
-
name = re.sub(r"\[([^\]]+)\]\([^\)]+\)", r"\1", name)
|
| 229 |
-
# Remove HTML tags <a href="...">text</a> -> text
|
| 230 |
-
name = re.sub(r"<[^>]+>", "", name)
|
| 231 |
-
return name.strip()
|
| 232 |
-
|
| 233 |
-
def create_pipeline_plot(df, latency_col):
|
| 234 |
-
if df is None or len(df) == 0:
|
| 235 |
-
return None
|
| 236 |
-
|
| 237 |
-
# Ensure expected columns exist
|
| 238 |
-
if (
|
| 239 |
-
latency_col not in df.columns
|
| 240 |
-
or "Average Score" not in df.columns
|
| 241 |
-
or "Pipeline" not in df.columns
|
| 242 |
-
):
|
| 243 |
-
return None
|
| 244 |
-
|
| 245 |
-
# Clean the dataframe for plotting
|
| 246 |
-
plot_df = df.copy()
|
| 247 |
-
|
| 248 |
-
# Strip HTML and Markdown for clean hover text
|
| 249 |
-
plot_df["Cleaned Pipeline"] = plot_df["Pipeline"].apply(clean_pipeline_name)
|
| 250 |
-
|
| 251 |
-
plot_df[latency_col] = pd.to_numeric(plot_df[latency_col], errors="coerce")
|
| 252 |
-
plot_df["Average Score"] = pd.to_numeric(plot_df["Average Score"], errors="coerce")
|
| 253 |
-
|
| 254 |
-
plot_df = plot_df.dropna(subset=[latency_col, "Average Score"])
|
| 255 |
-
plot_df = plot_df[plot_df[latency_col] > 0]
|
| 256 |
-
plot_df = plot_df.sort_values(by=latency_col)
|
| 257 |
-
|
| 258 |
-
if len(plot_df) == 0:
|
| 259 |
-
return None
|
| 260 |
-
|
| 261 |
-
fig = px.scatter(
|
| 262 |
-
plot_df,
|
| 263 |
-
x=latency_col,
|
| 264 |
-
y="Average Score",
|
| 265 |
-
hover_name="Cleaned Pipeline", # Use the clean text!
|
| 266 |
-
title=f"Mean Performance vs {latency_col}",
|
| 267 |
-
color_discrete_sequence=["orange"],
|
| 268 |
-
)
|
| 269 |
-
|
| 270 |
-
fig.update_layout(
|
| 271 |
-
xaxis_title=latency_col,
|
| 272 |
-
yaxis_title="Average Score",
|
| 273 |
-
plot_bgcolor="white",
|
| 274 |
-
)
|
| 275 |
-
|
| 276 |
-
fig.update_xaxes(showgrid=True, gridcolor="lightgrey")
|
| 277 |
-
fig.update_yaxes(showgrid=True, gridcolor="lightgrey")
|
| 278 |
-
|
| 279 |
-
fig.update_traces(marker=dict(size=12, opacity=0.8, line=dict(width=1, color="DarkSlateGrey")))
|
| 280 |
-
return fig
|
| 281 |
-
|
| 282 |
-
with gr.Row():
|
| 283 |
-
latency_radio = gr.Radio(
|
| 284 |
-
choices=["Search latency (s/query)", "Indexing latency (s/doc)"],
|
| 285 |
-
value="Search latency (s/query)",
|
| 286 |
-
label="Select Latency Metric for X-Axis",
|
| 287 |
-
)
|
| 288 |
-
|
| 289 |
-
with gr.Row():
|
| 290 |
-
initial_fig = create_pipeline_plot(data_pipeline, "Search latency (s/query)")
|
| 291 |
-
performance_plot = gr.Plot(value=initial_fig)
|
| 292 |
-
|
| 293 |
-
def update_data_pipeline(metric, search_term, selected_columns):
|
| 294 |
-
pipeline_handler.get_pipeline_data()
|
| 295 |
-
data = pipeline_handler.render_df(metric, "english")
|
| 296 |
-
data = add_rank_and_format(
|
| 297 |
-
data, benchmark_version=3, selected_columns=selected_columns, is_pipeline=True
|
| 298 |
-
)
|
| 299 |
-
data = filter_models(data, search_term)
|
| 300 |
-
if selected_columns:
|
| 301 |
-
# Include core columns plus selected dataset columns
|
| 302 |
-
core_cols = ["Rank", "Pipeline", "Indexing latency (s/doc)", "Search latency (s/query)"]
|
| 303 |
-
if "Average Score" in data.columns:
|
| 304 |
-
core_cols.insert(4, "Average Score")
|
| 305 |
-
data = data[core_cols + selected_columns]
|
| 306 |
-
return data
|
| 307 |
-
|
| 308 |
-
with gr.Row():
|
| 309 |
-
refresh_button_pipeline = gr.Button("Refresh")
|
| 310 |
-
refresh_button_pipeline.click(
|
| 311 |
-
lambda metric: add_rank_and_format(
|
| 312 |
-
pipeline_handler.render_df(metric, "english"), benchmark_version=3, is_pipeline=True
|
| 313 |
-
),
|
| 314 |
-
inputs=[metric_dropdown_pipeline],
|
| 315 |
-
outputs=dataframe_pipeline,
|
| 316 |
-
concurrency_limit=20,
|
| 317 |
-
).then(
|
| 318 |
-
fn=create_pipeline_plot,
|
| 319 |
-
inputs=[dataframe_pipeline, latency_radio],
|
| 320 |
-
outputs=performance_plot,
|
| 321 |
-
)
|
| 322 |
-
|
| 323 |
-
with gr.Row():
|
| 324 |
-
gr.Markdown(
|
| 325 |
-
"""
|
| 326 |
-
**Note**: These results represent full pipeline evaluations on english queries ONLY (since other queries were mostly directly translated from their english counterparts).
|
| 327 |
-
We felt multilingual results were less critical (and much more costly to evaluate on the full set) for pipelines, since one user could just add a translation module to the pipeline and expect similar performance to the English results.
|
| 328 |
-
If you feel this is a mistake and multi-lingual results are critical for pipelines, please let us know by opening an issue in the GitHub repository!
|
| 329 |
-
"""
|
| 330 |
-
)
|
| 331 |
-
|
| 332 |
-
# Automatically refresh the dataframe when the dropdown value changes
|
| 333 |
-
def refresh_pipeline_data(metric):
|
| 334 |
-
"""Refresh pipeline data when metric changes."""
|
| 335 |
-
df = pipeline_handler.render_df(metric, "english")
|
| 336 |
-
return add_rank_and_format(df, benchmark_version=3, is_pipeline=True)
|
| 337 |
-
|
| 338 |
-
# Update dataframe and then update the plot
|
| 339 |
-
metric_dropdown_pipeline.change(
|
| 340 |
-
refresh_pipeline_data,
|
| 341 |
-
inputs=[metric_dropdown_pipeline],
|
| 342 |
-
outputs=dataframe_pipeline,
|
| 343 |
-
).then(
|
| 344 |
-
fn=create_pipeline_plot, inputs=[dataframe_pipeline, latency_radio], outputs=performance_plot
|
| 345 |
-
)
|
| 346 |
-
|
| 347 |
-
research_textbox_pipeline.submit(
|
| 348 |
-
lambda metric, search_term, selected_columns: update_data_pipeline(
|
| 349 |
-
metric, search_term, selected_columns
|
| 350 |
-
),
|
| 351 |
-
inputs=[metric_dropdown_pipeline, research_textbox_pipeline, column_checkboxes_pipeline],
|
| 352 |
-
outputs=dataframe_pipeline,
|
| 353 |
-
).then(
|
| 354 |
-
fn=create_pipeline_plot, inputs=[dataframe_pipeline, latency_radio], outputs=performance_plot
|
| 355 |
-
)
|
| 356 |
-
|
| 357 |
-
column_checkboxes_pipeline.change(
|
| 358 |
-
lambda metric, search_term, selected_columns: update_data_pipeline(
|
| 359 |
-
metric, search_term, selected_columns
|
| 360 |
-
),
|
| 361 |
-
inputs=[metric_dropdown_pipeline, research_textbox_pipeline, column_checkboxes_pipeline],
|
| 362 |
-
outputs=dataframe_pipeline,
|
| 363 |
-
).then(
|
| 364 |
-
fn=create_pipeline_plot, inputs=[dataframe_pipeline, latency_radio], outputs=performance_plot
|
| 365 |
-
)
|
| 366 |
-
|
| 367 |
-
# Update plot when the radio button changes
|
| 368 |
-
latency_radio.change(
|
| 369 |
-
fn=create_pipeline_plot, inputs=[dataframe_pipeline, latency_radio], outputs=performance_plot
|
| 370 |
-
)
|
| 371 |
-
|
| 372 |
-
gr.Markdown(
|
| 373 |
-
f"""
|
| 374 |
-
- **Total Datasets**: {num_datasets_pipeline}
|
| 375 |
-
- **Total Scores**: {num_scores_pipeline}
|
| 376 |
-
- **Total Pipelines**: {num_pipelines}
|
| 377 |
-
"""
|
| 378 |
-
+ r"""
|
| 379 |
-
Please consider citing:
|
| 380 |
-
|
| 381 |
-
```bibtex
|
| 382 |
-
@misc{faysse2024colpaliefficientdocumentretrieval,
|
| 383 |
-
title={ColPali: Efficient Document Retrieval with Vision Language Models},
|
| 384 |
-
author={Manuel Faysse and Hugues Sibille and Tony Wu and Bilel Omrani and Gautier Viaud and Céline Hudelot and Pierre Colombo},
|
| 385 |
-
year={2024},
|
| 386 |
-
eprint={2407.01449},
|
| 387 |
-
archivePrefix={arXiv},
|
| 388 |
-
primaryClass={cs.IR},
|
| 389 |
-
url={https://arxiv.org/abs/2407.01449},
|
| 390 |
-
}
|
| 391 |
-
@misc{loison2026vidore,
|
| 392 |
-
title={ViDoRe V3: A Comprehensive Evaluation of Retrieval Augmented Generation in Complex Real-World Scenarios},
|
| 393 |
-
author={Loison, Ant{\'o}nio and Mac{\'e}, Quentin and Edy, Antoine and Xing, Victor and Balough, Tom and Moreira, Gabriel and Liu, Bo and Faysse, Manuel and Hudelot, C{\'e}line and Viaud, Gautier},
|
| 394 |
-
journal={arXiv preprint arXiv:2601.08620},
|
| 395 |
-
year={2026}
|
| 396 |
-
}
|
| 397 |
-
```
|
| 398 |
-
"""
|
| 399 |
-
)
|
| 400 |
-
else:
|
| 401 |
-
gr.Markdown("**No pipeline evaluation results available yet. Check back later!**")
|
| 402 |
-
|
| 403 |
-
with gr.TabItem("ViDoRe V2", id="vidore-v2"):
|
| 404 |
-
gr.Markdown("# ViDoRe V2: A new visual Document Retrieval Benchmark 📚🔍")
|
| 405 |
-
gr.Markdown("### A harder dataset benchmark for visual document retrieval 👀")
|
| 406 |
|
| 407 |
gr.Markdown(
|
| 408 |
"""
|
| 409 |
-
Visual Document Retrieval Benchmark
|
| 410 |
|
| 411 |
-
Refer to the [ColPali paper](https://arxiv.org/abs/2407.01449) for details on metrics and models.
|
| 412 |
"""
|
| 413 |
)
|
| 414 |
-
|
|
|
|
|
|
|
| 415 |
|
| 416 |
with gr.Row():
|
| 417 |
-
|
| 418 |
-
|
| 419 |
-
|
| 420 |
-
label="Filter Models by Name",
|
| 421 |
-
)
|
| 422 |
-
column_checkboxes_2 = gr.CheckboxGroup(
|
| 423 |
-
choices=datasets_columns_2, value=datasets_columns_2, label="Select Columns to Display"
|
| 424 |
-
)
|
| 425 |
|
| 426 |
with gr.Row():
|
| 427 |
-
|
| 428 |
-
|
| 429 |
|
| 430 |
-
def
|
| 431 |
-
model_handler.get_vidore_data(metric)
|
| 432 |
-
data =
|
| 433 |
-
data = add_rank_and_format(data, benchmark_version=2, selected_columns=selected_columns)
|
| 434 |
data = filter_models(data, search_term)
|
| 435 |
-
# data = remove_duplicates(data) # Add this line
|
| 436 |
if selected_columns:
|
| 437 |
-
|
| 438 |
-
|
| 439 |
-
]
|
| 440 |
return data
|
| 441 |
|
| 442 |
with gr.Row():
|
| 443 |
-
|
| 444 |
-
|
| 445 |
-
get_refresh_function(model_handler, benchmark_version=2),
|
| 446 |
-
inputs=[metric_dropdown_2],
|
| 447 |
-
outputs=dataframe_2,
|
| 448 |
-
concurrency_limit=20,
|
| 449 |
-
)
|
| 450 |
|
| 451 |
-
with gr.Row():
|
| 452 |
-
gr.Markdown(
|
| 453 |
-
"""
|
| 454 |
-
**Note**: For now, all models were evaluated using the vidore-benchmark package and custom retrievers on our side.
|
| 455 |
-
Those numbers are not numbers obtained from the organisations that released those models.
|
| 456 |
-
"""
|
| 457 |
-
)
|
| 458 |
|
| 459 |
# Automatically refresh the dataframe when the dropdown value changes
|
| 460 |
-
|
| 461 |
-
|
| 462 |
-
|
| 463 |
-
|
| 464 |
-
|
| 465 |
-
research_textbox_2.submit(
|
| 466 |
-
lambda metric, search_term, selected_columns: update_data_2(metric, search_term, selected_columns),
|
| 467 |
-
inputs=[metric_dropdown_2, research_textbox_2, column_checkboxes_2],
|
| 468 |
-
outputs=dataframe_2,
|
| 469 |
)
|
| 470 |
-
|
| 471 |
-
lambda metric, search_term, selected_columns:
|
| 472 |
-
inputs=[
|
| 473 |
-
outputs=
|
| 474 |
)
|
| 475 |
|
| 476 |
-
|
| 477 |
-
f"""
|
| 478 |
-
- **Total Datasets**: {num_datasets_2}
|
| 479 |
-
- **Total Scores**: {num_scores_2}
|
| 480 |
-
- **Total Models**: {num_models_2}
|
| 481 |
-
"""
|
| 482 |
-
+ r"""
|
| 483 |
-
Please consider citing:
|
| 484 |
-
|
| 485 |
-
```bibtex
|
| 486 |
-
@misc{faysse2024colpaliefficientdocumentretrieval,
|
| 487 |
-
title={ColPali: Efficient Document Retrieval with Vision Language Models},
|
| 488 |
-
author={Manuel Faysse and Hugues Sibille and Tony Wu and Bilel Omrani and Gautier Viaud and Céline Hudelot and Pierre Colombo},
|
| 489 |
-
year={2024},
|
| 490 |
-
eprint={2407.01449},
|
| 491 |
-
archivePrefix={arXiv},
|
| 492 |
-
primaryClass={cs.IR},
|
| 493 |
-
url={https://arxiv.org/abs/2407.01449},
|
| 494 |
-
}
|
| 495 |
|
| 496 |
-
@misc{macé2025vidorebenchmarkv2raising,
|
| 497 |
-
title={ViDoRe Benchmark V2: Raising the Bar for Visual Retrieval},
|
| 498 |
-
author={Quentin Macé and António Loison and Manuel Faysse},
|
| 499 |
-
year={2025},
|
| 500 |
-
eprint={2505.17166},
|
| 501 |
-
archivePrefix={arXiv},
|
| 502 |
-
primaryClass={cs.IR},
|
| 503 |
-
url={https://arxiv.org/abs/2505.17166},
|
| 504 |
-
}
|
| 505 |
-
```
|
| 506 |
-
"""
|
| 507 |
-
)
|
| 508 |
-
with gr.TabItem("ViDoRe V1", id="vidore-v1"):
|
| 509 |
-
gr.Markdown("# ViDoRe: The Visual Document Retrieval Benchmark 1 📚🔍")
|
| 510 |
-
gr.Markdown("### From the paper - ColPali: Efficient Document Retrieval with Vision Language Models 👀")
|
| 511 |
-
|
| 512 |
-
gr.Markdown(
|
| 513 |
-
"""
|
| 514 |
-
Visual Document Retrieval Benchmark 1 leaderboard. To submit results, refer to the corresponding tab.
|
| 515 |
-
|
| 516 |
-
Refer to the [ColPali paper](https://arxiv.org/abs/2407.01449) for details on metrics, tasks and models.
|
| 517 |
-
"""
|
| 518 |
-
)
|
| 519 |
-
datasets_columns_1 = list(data_benchmark_1.columns[4:])
|
| 520 |
-
|
| 521 |
-
with gr.Row():
|
| 522 |
-
metric_dropdown_1 = gr.Dropdown(choices=METRICS, value=initial_metric, label="Select Metric")
|
| 523 |
-
research_textbox_1 = gr.Textbox(
|
| 524 |
-
placeholder="🔍 Search Models... [press enter]",
|
| 525 |
-
label="Filter Models by Name",
|
| 526 |
-
)
|
| 527 |
-
column_checkboxes_1 = gr.CheckboxGroup(
|
| 528 |
-
choices=datasets_columns_1, value=datasets_columns_1, label="Select Columns to Display"
|
| 529 |
-
)
|
| 530 |
-
|
| 531 |
-
with gr.Row():
|
| 532 |
-
datatype_1 = ["number", "markdown"] + ["number"] * (num_datasets_1 + 1)
|
| 533 |
-
dataframe_1 = gr.Dataframe(data_benchmark_1, datatype=datatype_1, type="pandas")
|
| 534 |
-
|
| 535 |
-
def update_data_1(metric, search_term, selected_columns):
|
| 536 |
-
model_handler.get_vidore_data(metric)
|
| 537 |
-
data = model_handler.render_df(metric, benchmark_version=1)
|
| 538 |
-
data = add_rank_and_format(data, benchmark_version=1, selected_columns=selected_columns)
|
| 539 |
-
data = filter_models(data, search_term)
|
| 540 |
-
if selected_columns:
|
| 541 |
-
data = data[
|
| 542 |
-
["Rank", "Model", "Model Size (Million Parameters)", "Average Score"] + selected_columns
|
| 543 |
-
]
|
| 544 |
-
return data
|
| 545 |
-
|
| 546 |
-
with gr.Row():
|
| 547 |
-
refresh_button_1 = gr.Button("Refresh")
|
| 548 |
-
refresh_button_1.click(
|
| 549 |
-
get_refresh_function(model_handler, benchmark_version=1),
|
| 550 |
-
inputs=[metric_dropdown_1],
|
| 551 |
-
outputs=dataframe_1,
|
| 552 |
-
concurrency_limit=20,
|
| 553 |
-
)
|
| 554 |
-
|
| 555 |
-
# Automatically refresh the dataframe when the dropdown value changes
|
| 556 |
-
metric_dropdown_1.change(
|
| 557 |
-
get_refresh_function(model_handler, benchmark_version=1),
|
| 558 |
-
inputs=[metric_dropdown_1],
|
| 559 |
-
outputs=dataframe_1,
|
| 560 |
-
)
|
| 561 |
-
research_textbox_1.submit(
|
| 562 |
-
lambda metric, search_term, selected_columns: update_data_1(metric, search_term, selected_columns),
|
| 563 |
-
inputs=[metric_dropdown_1, research_textbox_1, column_checkboxes_1],
|
| 564 |
-
outputs=dataframe_1,
|
| 565 |
-
)
|
| 566 |
-
column_checkboxes_1.change(
|
| 567 |
-
lambda metric, search_term, selected_columns: update_data_1(metric, search_term, selected_columns),
|
| 568 |
-
inputs=[metric_dropdown_1, research_textbox_1, column_checkboxes_1],
|
| 569 |
-
outputs=dataframe_1,
|
| 570 |
-
)
|
| 571 |
|
| 572 |
gr.Markdown(
|
| 573 |
f"""
|
| 574 |
-
- **Total Datasets**: {
|
| 575 |
-
- **Total Scores**: {
|
| 576 |
-
- **Total Models**: {
|
| 577 |
"""
|
| 578 |
+ r"""
|
| 579 |
Please consider citing:
|
| 580 |
|
| 581 |
```bibtex
|
| 582 |
@misc{faysse2024colpaliefficientdocumentretrieval,
|
| 583 |
-
title={ColPali: Efficient Document Retrieval with Vision Language Models},
|
| 584 |
author={Manuel Faysse and Hugues Sibille and Tony Wu and Bilel Omrani and Gautier Viaud and Céline Hudelot and Pierre Colombo},
|
| 585 |
year={2024},
|
| 586 |
eprint={2407.01449},
|
| 587 |
archivePrefix={arXiv},
|
| 588 |
primaryClass={cs.IR},
|
| 589 |
-
url={https://arxiv.org/abs/2407.01449},
|
| 590 |
}
|
| 591 |
-
@misc{macé2025vidorebenchmarkv2raising,
|
| 592 |
-
title={ViDoRe Benchmark V2: Raising the Bar for Visual Retrieval},
|
| 593 |
-
author={Quentin Macé and António Loison and Manuel Faysse},
|
| 594 |
-
year={2025},
|
| 595 |
-
eprint={2505.17166},
|
| 596 |
-
archivePrefix={arXiv},
|
| 597 |
-
primaryClass={cs.IR},
|
| 598 |
-
url={https://arxiv.org/abs/2505.17166},
|
| 599 |
-
}
|
| 600 |
-
|
| 601 |
```
|
| 602 |
"""
|
| 603 |
)
|
| 604 |
-
with gr.TabItem("📚 Submit your model"
|
| 605 |
gr.Markdown("# How to Submit a New Model to the Leaderboard")
|
| 606 |
gr.Markdown(
|
| 607 |
"""
|
| 608 |
-
|
| 609 |
-
To submit a new model to the original ViDoRe leaderboard, follow these steps:
|
| 610 |
|
| 611 |
1. **Evaluate your model**:
|
| 612 |
-
- Follow the evaluation
|
| 613 |
|
| 614 |
2. **Format your submission file**:
|
| 615 |
-
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 616 |
|
| 617 |
And you're done! Your model will appear on the leaderboard when you click refresh! Once the space
|
| 618 |
gets rebooted, it will appear on startup.
|
| 619 |
|
| 620 |
Note: For proper hyperlink redirection, please ensure that your model repository name is in
|
| 621 |
kebab-case, e.g. `my-model-name`.
|
| 622 |
-
|
| 623 |
-
|
| 624 |
-
## ViDoRe v3 Pipeline leaderboard :
|
| 625 |
-
To submit a new pipeline to the ViDoRe V3 pipeline leaderboard, follow these steps:
|
| 626 |
-
1. **Evaluate your pipeline**:
|
| 627 |
-
- Follow the evaluation procedure provided in the [ViDoRe GitHub repository](https://github.com/illuin-tech/vidore-benchmark/) for pipelines
|
| 628 |
-
2. **Open a PR on the ViDoRe GitHub repository including**:
|
| 629 |
-
- Your results, which are output directly in the correct format
|
| 630 |
-
- A short description of the pipeline and the main components used (some examples are available in the repository)
|
| 631 |
-
|
| 632 |
-
And you're done! Your pipeline will appear on the leaderboard after the PR is merged!
|
| 633 |
-
"""
|
| 634 |
-
)
|
| 635 |
-
with gr.TabItem("Deprecated ViDoRe V1", id="vidore-v1-deprecated"):
|
| 636 |
-
gr.Markdown(
|
| 637 |
-
"## <span style='color:red'>Deprecation notice: This leaderboard contains the results computed with the "
|
| 638 |
-
"[vidore-benchmark](https://github.com/illuin-tech/vidore-benchmark) package, "
|
| 639 |
-
"which is no longer maintained. Results should be computed using the "
|
| 640 |
-
"[mteb](https://github.com/embeddings-benchmark/mteb) package as described "
|
| 641 |
-
"[here](https://github.com/illuin-tech/vidore-benchmark/blob/main/README.md).</span>"
|
| 642 |
-
)
|
| 643 |
-
gr.Markdown(
|
| 644 |
-
"## <span style='color:red'>Missing results in the new leaderboard are being added as they are re-computed.</span>"
|
| 645 |
-
)
|
| 646 |
-
gr.Markdown(
|
| 647 |
-
"# <span style='color:red'>[Deprecated]</span> ViDoRe: The Visual Document Retrieval Benchmark 1 📚🔍"
|
| 648 |
-
)
|
| 649 |
-
gr.Markdown("### From the paper - ColPali: Efficient Document Retrieval with Vision Language Models 👀")
|
| 650 |
-
|
| 651 |
-
gr.Markdown(
|
| 652 |
-
"""
|
| 653 |
-
Visual Document Retrieval Benchmark 1 leaderboard. To submit results, refer to the corresponding tab.
|
| 654 |
-
|
| 655 |
-
Refer to the [ColPali paper](https://arxiv.org/abs/2407.01449) for details on metrics, tasks and models.
|
| 656 |
-
"""
|
| 657 |
-
)
|
| 658 |
-
deprecated_datasets_columns_1 = list(deprecated_data_benchmark_1.columns[3:])
|
| 659 |
-
|
| 660 |
-
with gr.Row():
|
| 661 |
-
deprecated_metric_dropdown_1 = gr.Dropdown(
|
| 662 |
-
choices=METRICS, value=initial_metric, label="Select Metric"
|
| 663 |
-
)
|
| 664 |
-
deprecated_research_textbox_1 = gr.Textbox(
|
| 665 |
-
placeholder="🔍 Search Models... [press enter]",
|
| 666 |
-
label="Filter Models by Name",
|
| 667 |
-
)
|
| 668 |
-
deprecated_column_checkboxes_1 = gr.CheckboxGroup(
|
| 669 |
-
choices=deprecated_datasets_columns_1,
|
| 670 |
-
value=deprecated_datasets_columns_1,
|
| 671 |
-
label="Select Columns to Display",
|
| 672 |
-
)
|
| 673 |
-
|
| 674 |
-
with gr.Row():
|
| 675 |
-
deprecated_datatype_1 = ["number", "markdown"] + ["number"] * (deprecated_num_datasets_1 + 1)
|
| 676 |
-
deprecated_dataframe_1 = gr.Dataframe(
|
| 677 |
-
deprecated_data_benchmark_1, datatype=deprecated_datatype_1, type="pandas"
|
| 678 |
-
)
|
| 679 |
-
|
| 680 |
-
def deprecated_update_data_1(metric, search_term, selected_columns):
|
| 681 |
-
deprecated_model_handler.get_vidore_data(metric)
|
| 682 |
-
data = deprecated_model_handler.render_df(metric, benchmark_version=1)
|
| 683 |
-
data = add_rank_and_format(data, benchmark_version=1, selected_columns=selected_columns)
|
| 684 |
-
data = filter_models(data, search_term)
|
| 685 |
-
# data = remove_duplicates(data) # Add this line
|
| 686 |
-
if selected_columns:
|
| 687 |
-
data = data[["Rank", "Model", "Average Score"] + selected_columns]
|
| 688 |
-
return data
|
| 689 |
-
|
| 690 |
-
with gr.Row():
|
| 691 |
-
deprecated_refresh_button_1 = gr.Button("Refresh")
|
| 692 |
-
deprecated_refresh_button_1.click(
|
| 693 |
-
deprecated_get_refresh_function(deprecated_model_handler, benchmark_version=1),
|
| 694 |
-
inputs=[deprecated_metric_dropdown_1],
|
| 695 |
-
outputs=deprecated_dataframe_1,
|
| 696 |
-
concurrency_limit=20,
|
| 697 |
-
)
|
| 698 |
-
|
| 699 |
-
# Automatically refresh the dataframe when the dropdown value changes
|
| 700 |
-
deprecated_metric_dropdown_1.change(
|
| 701 |
-
deprecated_get_refresh_function(deprecated_model_handler, benchmark_version=1),
|
| 702 |
-
inputs=[deprecated_metric_dropdown_1],
|
| 703 |
-
outputs=deprecated_dataframe_1,
|
| 704 |
-
)
|
| 705 |
-
deprecated_research_textbox_1.submit(
|
| 706 |
-
lambda metric, search_term, selected_columns: deprecated_update_data_1(
|
| 707 |
-
metric, search_term, selected_columns
|
| 708 |
-
),
|
| 709 |
-
inputs=[
|
| 710 |
-
deprecated_metric_dropdown_1,
|
| 711 |
-
deprecated_research_textbox_1,
|
| 712 |
-
deprecated_column_checkboxes_1,
|
| 713 |
-
],
|
| 714 |
-
outputs=deprecated_dataframe_1,
|
| 715 |
-
)
|
| 716 |
-
deprecated_column_checkboxes_1.change(
|
| 717 |
-
lambda metric, search_term, selected_columns: deprecated_update_data_1(
|
| 718 |
-
metric, search_term, selected_columns
|
| 719 |
-
),
|
| 720 |
-
inputs=[
|
| 721 |
-
deprecated_metric_dropdown_1,
|
| 722 |
-
deprecated_research_textbox_1,
|
| 723 |
-
deprecated_column_checkboxes_1,
|
| 724 |
-
],
|
| 725 |
-
outputs=deprecated_dataframe_1,
|
| 726 |
-
)
|
| 727 |
-
|
| 728 |
-
gr.Markdown(
|
| 729 |
-
f"""
|
| 730 |
-
- **Total Datasets**: {deprecated_num_datasets_1}
|
| 731 |
-
- **Total Scores**: {deprecated_num_scores_1}
|
| 732 |
-
- **Total Models**: {deprecated_num_models_1}
|
| 733 |
-
"""
|
| 734 |
-
+ r"""
|
| 735 |
-
Please consider citing:
|
| 736 |
-
|
| 737 |
-
```bibtex
|
| 738 |
-
@misc{faysse2024colpaliefficientdocumentretrieval,
|
| 739 |
-
title={ColPali: Efficient Document Retrieval with Vision Language Models},
|
| 740 |
-
author={Manuel Faysse and Hugues Sibille and Tony Wu and Bilel Omrani and Gautier Viaud and Céline Hudelot and Pierre Colombo},
|
| 741 |
-
year={2024},
|
| 742 |
-
eprint={2407.01449},
|
| 743 |
-
archivePrefix={arXiv},
|
| 744 |
-
primaryClass={cs.IR},
|
| 745 |
-
url={https://arxiv.org/abs/2407.01449},
|
| 746 |
-
}
|
| 747 |
-
|
| 748 |
-
@misc{macé2025vidorebenchmarkv2raising,
|
| 749 |
-
title={ViDoRe Benchmark V2: Raising the Bar for Visual Retrieval},
|
| 750 |
-
author={Quentin Macé and António Loison and Manuel Faysse},
|
| 751 |
-
year={2025},
|
| 752 |
-
eprint={2505.17166},
|
| 753 |
-
archivePrefix={arXiv},
|
| 754 |
-
primaryClass={cs.IR},
|
| 755 |
-
url={https://arxiv.org/abs/2505.17166},
|
| 756 |
-
}
|
| 757 |
-
```
|
| 758 |
-
"""
|
| 759 |
-
)
|
| 760 |
-
with gr.TabItem("Deprecated ViDoRe V2", id="vidore-v2-deprecated"):
|
| 761 |
-
gr.Markdown(
|
| 762 |
-
"## <span style='color:red'>Deprecation notice: This leaderboard contains the results computed with the "
|
| 763 |
-
"[vidore-benchmark](https://github.com/illuin-tech/vidore-benchmark) package, "
|
| 764 |
-
"which is no longer maintained. Results should be computed using the "
|
| 765 |
-
"[mteb](https://github.com/embeddings-benchmark/mteb) package as described "
|
| 766 |
-
"[here](https://github.com/illuin-tech/vidore-benchmark/blob/main/README.md).</span>"
|
| 767 |
-
)
|
| 768 |
-
gr.Markdown(
|
| 769 |
-
"## <span style='color:red'>Missing results in the new leaderboard are being added as they are re-computed.</span>"
|
| 770 |
-
)
|
| 771 |
-
gr.Markdown(
|
| 772 |
-
"# <span style='color:red'>[Deprecated]</span> ViDoRe V2: A new visual Document Retrieval Benchmark 📚🔍"
|
| 773 |
-
)
|
| 774 |
-
gr.Markdown("### A harder dataset benchmark for visual document retrieval 👀")
|
| 775 |
-
|
| 776 |
-
gr.Markdown(
|
| 777 |
"""
|
| 778 |
-
Visual Document Retrieval Benchmark 2 leaderboard. To submit results, refer to the corresponding tab.
|
| 779 |
-
|
| 780 |
-
Refer to the [ColPali paper](https://arxiv.org/abs/2407.01449) for details on metrics and models.
|
| 781 |
-
"""
|
| 782 |
)
|
| 783 |
-
deprecated_datasets_columns_2 = list(deprecated_data_benchmark_2.columns[3:])
|
| 784 |
-
|
| 785 |
-
with gr.Row():
|
| 786 |
-
deprecated_metric_dropdown_2 = gr.Dropdown(
|
| 787 |
-
choices=METRICS, value=initial_metric, label="Select Metric"
|
| 788 |
-
)
|
| 789 |
-
deprecated_research_textbox_2 = gr.Textbox(
|
| 790 |
-
placeholder="🔍 Search Models... [press enter]",
|
| 791 |
-
label="Filter Models by Name",
|
| 792 |
-
)
|
| 793 |
-
deprecated_column_checkboxes_2 = gr.CheckboxGroup(
|
| 794 |
-
choices=deprecated_datasets_columns_2,
|
| 795 |
-
value=deprecated_datasets_columns_2,
|
| 796 |
-
label="Select Columns to Display",
|
| 797 |
-
)
|
| 798 |
|
| 799 |
-
with gr.Row():
|
| 800 |
-
deprecated_datatype_2 = ["number", "markdown"] + ["number"] * (deprecated_num_datasets_2 + 1)
|
| 801 |
-
deprecated_dataframe_2 = gr.Dataframe(
|
| 802 |
-
deprecated_data_benchmark_2, datatype=deprecated_datatype_2, type="pandas"
|
| 803 |
-
)
|
| 804 |
-
|
| 805 |
-
def deprecated_update_data_2(metric, search_term, selected_columns):
|
| 806 |
-
deprecated_model_handler.get_vidore_data(metric)
|
| 807 |
-
data = deprecated_model_handler.render_df(metric, benchmark_version=2)
|
| 808 |
-
data = add_rank_and_format(data, benchmark_version=2, selected_columns=selected_columns)
|
| 809 |
-
data = filter_models(data, search_term)
|
| 810 |
-
# data = remove_duplicates(data) # Add this line
|
| 811 |
-
if selected_columns:
|
| 812 |
-
data = data[["Rank", "Model", "Average Score"] + selected_columns]
|
| 813 |
-
return data
|
| 814 |
-
|
| 815 |
-
with gr.Row():
|
| 816 |
-
deprecated_refresh_button_2 = gr.Button("Refresh")
|
| 817 |
-
deprecated_refresh_button_2.click(
|
| 818 |
-
deprecated_get_refresh_function(deprecated_model_handler, benchmark_version=2),
|
| 819 |
-
inputs=[deprecated_metric_dropdown_2],
|
| 820 |
-
outputs=deprecated_dataframe_2,
|
| 821 |
-
concurrency_limit=20,
|
| 822 |
-
)
|
| 823 |
-
|
| 824 |
-
with gr.Row():
|
| 825 |
-
gr.Markdown(
|
| 826 |
-
"""
|
| 827 |
-
**Note**: For now, all models were evaluated using the vidore-benchmark package and custom retrievers on our side.
|
| 828 |
-
Those numbers are not numbers obtained from the organisations that released those models.
|
| 829 |
-
"""
|
| 830 |
-
)
|
| 831 |
-
|
| 832 |
-
# Automatically refresh the dataframe when the dropdown value changes
|
| 833 |
-
deprecated_metric_dropdown_2.change(
|
| 834 |
-
deprecated_get_refresh_function(deprecated_model_handler, benchmark_version=2),
|
| 835 |
-
inputs=[deprecated_metric_dropdown_2],
|
| 836 |
-
outputs=deprecated_dataframe_2,
|
| 837 |
-
)
|
| 838 |
-
deprecated_research_textbox_2.submit(
|
| 839 |
-
lambda metric, search_term, selected_columns: deprecated_update_data_2(
|
| 840 |
-
metric, search_term, selected_columns
|
| 841 |
-
),
|
| 842 |
-
inputs=[
|
| 843 |
-
deprecated_metric_dropdown_2,
|
| 844 |
-
deprecated_research_textbox_2,
|
| 845 |
-
deprecated_column_checkboxes_2,
|
| 846 |
-
],
|
| 847 |
-
outputs=deprecated_dataframe_2,
|
| 848 |
-
)
|
| 849 |
-
deprecated_column_checkboxes_2.change(
|
| 850 |
-
lambda metric, search_term, selected_columns: deprecated_update_data_2(
|
| 851 |
-
metric, search_term, selected_columns
|
| 852 |
-
),
|
| 853 |
-
inputs=[
|
| 854 |
-
deprecated_metric_dropdown_2,
|
| 855 |
-
deprecated_research_textbox_2,
|
| 856 |
-
deprecated_column_checkboxes_2,
|
| 857 |
-
],
|
| 858 |
-
outputs=deprecated_dataframe_2,
|
| 859 |
-
)
|
| 860 |
-
|
| 861 |
-
gr.Markdown(
|
| 862 |
-
f"""
|
| 863 |
-
- **Total Datasets**: {deprecated_num_datasets_2}
|
| 864 |
-
- **Total Scores**: {deprecated_num_scores_2}
|
| 865 |
-
- **Total Models**: {deprecated_num_models_2}
|
| 866 |
-
"""
|
| 867 |
-
+ r"""
|
| 868 |
-
Please consider citing:
|
| 869 |
-
|
| 870 |
-
```bibtex
|
| 871 |
-
@misc{faysse2024colpaliefficientdocumentretrieval,
|
| 872 |
-
title={ColPali: Efficient Document Retrieval with Vision Language Models},
|
| 873 |
-
author={Manuel Faysse and Hugues Sibille and Tony Wu and Bilel Omrani and Gautier Viaud and Céline Hudelot and Pierre Colombo},
|
| 874 |
-
year={2024},
|
| 875 |
-
eprint={2407.01449},
|
| 876 |
-
archivePrefix={arXiv},
|
| 877 |
-
primaryClass={cs.IR},
|
| 878 |
-
url={https://arxiv.org/abs/2407.01449},
|
| 879 |
-
}
|
| 880 |
-
|
| 881 |
-
@misc{macé2025vidorebenchmarkv2raising,
|
| 882 |
-
title={ViDoRe Benchmark V2: Raising the Bar for Visual Retrieval},
|
| 883 |
-
author={Quentin Macé and António Loison and Manuel Faysse},
|
| 884 |
-
year={2025},
|
| 885 |
-
eprint={2505.17166},
|
| 886 |
-
archivePrefix={arXiv},
|
| 887 |
-
primaryClass={cs.IR},
|
| 888 |
-
url={https://arxiv.org/abs/2505.17166},
|
| 889 |
-
}
|
| 890 |
-
```
|
| 891 |
-
"""
|
| 892 |
-
)
|
| 893 |
-
|
| 894 |
-
def select_tab_from_url(request: gr.Request):
|
| 895 |
-
# Grab query parameters from the URL
|
| 896 |
-
query_params = dict(request.query_params)
|
| 897 |
-
# Look for ?tab=..., default to the first tab's ID if not found
|
| 898 |
-
target_tab = query_params.get("tab", "vidore-v3")
|
| 899 |
-
# Update the tabs component to select the target ID
|
| 900 |
-
return gr.update(selected=target_tab)
|
| 901 |
-
|
| 902 |
-
block.load(select_tab_from_url, inputs=None, outputs=tabs)
|
| 903 |
block.queue(max_size=10).launch(debug=True)
|
| 904 |
|
| 905 |
|
| 906 |
-
if __name__ == "__main__":
|
| 907 |
main()
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import gradio as gr
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
+
from app.utils import add_rank_and_format, filter_models, get_refresh_function
|
| 4 |
+
from data.model_handler import ModelHandler
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
|
| 6 |
+
METRICS = ["ndcg_at_5", "recall_at_1"]
|
| 7 |
|
| 8 |
def main():
|
|
|
|
| 9 |
model_handler = ModelHandler()
|
| 10 |
initial_metric = "ndcg_at_5"
|
| 11 |
|
| 12 |
+
data = model_handler.get_vidore_data(initial_metric)
|
| 13 |
+
data = add_rank_and_format(data)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
|
| 15 |
+
NUM_DATASETS = len(data.columns) - 3
|
| 16 |
+
NUM_SCORES = len(data) * NUM_DATASETS
|
| 17 |
+
NUM_MODELS = len(data)
|
| 18 |
|
| 19 |
css = """
|
| 20 |
table > thead {
|
|
|
|
| 37 |
.markdown {
|
| 38 |
font-size: 1rem;
|
| 39 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
"""
|
| 41 |
|
| 42 |
with gr.Blocks(css=css) as block:
|
| 43 |
+
with gr.Tabs():
|
| 44 |
+
with gr.TabItem("🏆 Leaderboard"):
|
| 45 |
+
gr.Markdown("# ViDoRe: The Visual Document Retrieval Benchmark 📚🔍")
|
| 46 |
+
gr.Markdown("### From the paper - ColPali: Efficient Document Retrieval with Vision Language Models 👀")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
|
| 48 |
gr.Markdown(
|
| 49 |
"""
|
| 50 |
+
Visual Document Retrieval Benchmark leaderboard. To submit results, refer to the corresponding tab.
|
| 51 |
|
| 52 |
+
Refer to the [ColPali paper](https://arxiv.org/abs/2407.01449) for details on metrics, tasks and models.
|
| 53 |
"""
|
| 54 |
)
|
| 55 |
+
datasets_columns = list(data.columns[3:])
|
| 56 |
+
anchor_columns = list(data.columns[:3])
|
| 57 |
+
default_columns = anchor_columns + datasets_columns
|
| 58 |
|
| 59 |
with gr.Row():
|
| 60 |
+
metric_dropdown = gr.Dropdown(choices=METRICS, value=initial_metric, label="Select Metric")
|
| 61 |
+
research_textbox = gr.Textbox(placeholder="🔍 Search Models... [press enter]", label="Filter Models by Name", )
|
| 62 |
+
column_checkboxes = gr.CheckboxGroup(choices=datasets_columns, value=default_columns, label="Select Columns to Display")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
|
| 64 |
with gr.Row():
|
| 65 |
+
datatype = ["number", "markdown"] + ["number"] * (NUM_DATASETS + 1)
|
| 66 |
+
dataframe = gr.Dataframe(data, datatype=datatype, type="pandas")
|
| 67 |
|
| 68 |
+
def update_data(metric, search_term, selected_columns):
|
| 69 |
+
data = model_handler.get_vidore_data(metric)
|
| 70 |
+
data = add_rank_and_format(data)
|
|
|
|
| 71 |
data = filter_models(data, search_term)
|
|
|
|
| 72 |
if selected_columns:
|
| 73 |
+
selected_columns = selected_columns
|
| 74 |
+
data = data[selected_columns]
|
|
|
|
| 75 |
return data
|
| 76 |
|
| 77 |
with gr.Row():
|
| 78 |
+
refresh_button = gr.Button("Refresh")
|
| 79 |
+
refresh_button.click(get_refresh_function(), inputs=[metric_dropdown], outputs=dataframe, concurrency_limit=20)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 80 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 81 |
|
| 82 |
# Automatically refresh the dataframe when the dropdown value changes
|
| 83 |
+
metric_dropdown.change(get_refresh_function(), inputs=[metric_dropdown], outputs=dataframe)
|
| 84 |
+
research_textbox.submit(
|
| 85 |
+
lambda metric, search_term, selected_columns: update_data(metric, search_term, selected_columns),
|
| 86 |
+
inputs=[metric_dropdown, research_textbox, column_checkboxes],
|
| 87 |
+
outputs=dataframe
|
|
|
|
|
|
|
|
|
|
|
|
|
| 88 |
)
|
| 89 |
+
column_checkboxes.change(
|
| 90 |
+
lambda metric, search_term, selected_columns: update_data(metric, search_term, selected_columns),
|
| 91 |
+
inputs=[metric_dropdown, research_textbox, column_checkboxes],
|
| 92 |
+
outputs=dataframe
|
| 93 |
)
|
| 94 |
|
| 95 |
+
#column_checkboxes.change(get_refresh_function(), inputs=[metric_dropdown, column_checkboxes], outputs=dataframe)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 96 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 97 |
|
| 98 |
gr.Markdown(
|
| 99 |
f"""
|
| 100 |
+
- **Total Datasets**: {NUM_DATASETS}
|
| 101 |
+
- **Total Scores**: {NUM_SCORES}
|
| 102 |
+
- **Total Models**: {NUM_MODELS}
|
| 103 |
"""
|
| 104 |
+ r"""
|
| 105 |
Please consider citing:
|
| 106 |
|
| 107 |
```bibtex
|
| 108 |
@misc{faysse2024colpaliefficientdocumentretrieval,
|
| 109 |
+
title={ColPali: Efficient Document Retrieval with Vision Language Models},
|
| 110 |
author={Manuel Faysse and Hugues Sibille and Tony Wu and Bilel Omrani and Gautier Viaud and Céline Hudelot and Pierre Colombo},
|
| 111 |
year={2024},
|
| 112 |
eprint={2407.01449},
|
| 113 |
archivePrefix={arXiv},
|
| 114 |
primaryClass={cs.IR},
|
| 115 |
+
url={https://arxiv.org/abs/2407.01449},
|
| 116 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 117 |
```
|
| 118 |
"""
|
| 119 |
)
|
| 120 |
+
with gr.TabItem("📚 Submit your model"):
|
| 121 |
gr.Markdown("# How to Submit a New Model to the Leaderboard")
|
| 122 |
gr.Markdown(
|
| 123 |
"""
|
| 124 |
+
To submit a new model to the ViDoRe leaderboard, follow these steps:
|
|
|
|
| 125 |
|
| 126 |
1. **Evaluate your model**:
|
| 127 |
+
- Follow the evaluation script provided in the [ViDoRe GitHub repository](https://github.com/illuin-tech/vidore-benchmark/)
|
| 128 |
|
| 129 |
2. **Format your submission file**:
|
| 130 |
+
- The submission file should automatically be generated, and named `results.json` with the
|
| 131 |
+
following structure:
|
| 132 |
+
```json
|
| 133 |
+
{
|
| 134 |
+
"dataset_name_1": {
|
| 135 |
+
"metric_1": score_1,
|
| 136 |
+
"metric_2": score_2,
|
| 137 |
+
...
|
| 138 |
+
},
|
| 139 |
+
"dataset_name_2": {
|
| 140 |
+
"metric_1": score_1,
|
| 141 |
+
"metric_2": score_2,
|
| 142 |
+
...
|
| 143 |
+
},
|
| 144 |
+
}
|
| 145 |
+
```
|
| 146 |
+
- The dataset names should be the same as the ViDoRe dataset names listed in the following
|
| 147 |
+
collection: [ViDoRe Benchmark](https://huggingface.co/collections/vidore/vidore-benchmark-667173f98e70a1c0fa4db00d).
|
| 148 |
+
|
| 149 |
+
3. **Submit your model**:
|
| 150 |
+
- Create a public HuggingFace model repository with your model.
|
| 151 |
+
- Add the tag `vidore` to your model in the metadata of the model card and place the
|
| 152 |
+
`results.json` file at the root.
|
| 153 |
|
| 154 |
And you're done! Your model will appear on the leaderboard when you click refresh! Once the space
|
| 155 |
gets rebooted, it will appear on startup.
|
| 156 |
|
| 157 |
Note: For proper hyperlink redirection, please ensure that your model repository name is in
|
| 158 |
kebab-case, e.g. `my-model-name`.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 159 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
| 160 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 161 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 162 |
block.queue(max_size=10).launch(debug=True)
|
| 163 |
|
| 164 |
|
| 165 |
+
if __name__ == "__main__":
|
| 166 |
main()
|
| 167 |
+
|
app/utils.py
CHANGED
|
@@ -1,143 +1,31 @@
|
|
| 1 |
-
|
| 2 |
-
import math
|
| 3 |
|
|
|
|
|
|
|
|
|
|
| 4 |
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
# Process folder name for link: only handle __ and -thisisapoint-
|
| 10 |
-
desanitized_folder = link_folder.replace("__", "/")
|
| 11 |
-
desanitized_folder = desanitized_folder.replace("-thisisapoint-", ".")
|
| 12 |
-
if link is None:
|
| 13 |
-
link = f"https://github.com/illuin-tech/vidore-benchmark/tree/main/results/pipeline_descriptions/{desanitized_folder}/description.json"
|
| 14 |
-
# Use word-wrap styling for potentially long pipeline aliases
|
| 15 |
-
return f'<a target="_blank" style="text-decoration: underline; word-wrap: break-word; white-space: normal; display: inline-block; max-width: 450px;" href="{link}">{model_name}</a>'
|
| 16 |
-
else:
|
| 17 |
-
# For regular models: replace __ and _ with /, and -thisisapoint- with .
|
| 18 |
-
desanitized_model_name = model_name.replace("__", "/")
|
| 19 |
-
desanitized_model_name = desanitized_model_name.replace("_", "/")
|
| 20 |
-
desanitized_model_name = desanitized_model_name.replace("-thisisapoint-", ".")
|
| 21 |
|
| 22 |
-
|
| 23 |
-
desanitized_model_name = desanitized_model_name.replace("/captioning", "")
|
| 24 |
-
if "/ocr" in desanitized_model_name:
|
| 25 |
-
desanitized_model_name = desanitized_model_name.replace("/ocr", "")
|
| 26 |
|
| 27 |
-
|
| 28 |
-
link = "https://huggingface.co/" + desanitized_model_name
|
| 29 |
|
| 30 |
-
return f'<a target="_blank" style="text-decoration: underline" href="{link}">{desanitized_model_name}</a>'
|
| 31 |
|
| 32 |
-
|
| 33 |
-
def add_rank(df, benchmark_version=1, selected_columns=None):
|
| 34 |
-
# Convert numeric columns to proper float type (they may be 'object' dtype due to mixed data)
|
| 35 |
-
for col in df.columns:
|
| 36 |
-
if col not in ["Model", "Pipeline", "_folder_name"]:
|
| 37 |
-
df[col] = pd.to_numeric(df[col], errors="coerce")
|
| 38 |
-
|
| 39 |
-
# Only fill NaN for numeric columns to avoid issues with string columns like _folder_name
|
| 40 |
-
numeric_cols = df.select_dtypes(include=["float64", "int64", "float32", "int32"]).columns
|
| 41 |
-
df[numeric_cols] = df[numeric_cols].fillna(0.0)
|
| 42 |
-
|
| 43 |
-
if selected_columns is None:
|
| 44 |
-
cols_to_rank = [
|
| 45 |
-
col
|
| 46 |
-
for col in df.columns
|
| 47 |
-
if col
|
| 48 |
-
not in [
|
| 49 |
-
"Model",
|
| 50 |
-
"Pipeline", # For pipeline tables
|
| 51 |
-
"Model Size (Million Parameters)",
|
| 52 |
-
"Memory Usage (GB, fp32)",
|
| 53 |
-
"Embedding Dimensions",
|
| 54 |
-
"Max Tokens",
|
| 55 |
-
"Compute Cost ($)",
|
| 56 |
-
"Queries per Second",
|
| 57 |
-
"_folder_name", # Hidden column for pipeline link generation
|
| 58 |
-
]
|
| 59 |
-
]
|
| 60 |
-
else:
|
| 61 |
-
cols_to_rank = selected_columns
|
| 62 |
-
|
| 63 |
-
if len(cols_to_rank) == 1:
|
| 64 |
-
df.sort_values(cols_to_rank[0], ascending=False, inplace=True)
|
| 65 |
-
else:
|
| 66 |
-
# Only add Average column if it doesn't already exist
|
| 67 |
-
if "Average Score" not in df.columns:
|
| 68 |
-
df.insert(len(df.columns) - len(cols_to_rank), "Average Score", df[cols_to_rank].mean(axis=1, skipna=False))
|
| 69 |
-
df.sort_values("Average Score", ascending=False, inplace=True)
|
| 70 |
-
df.insert(0, "Rank", list(range(1, len(df) + 1)))
|
| 71 |
-
# multiply values by 100 if they are floats and round to 2 decimal places
|
| 72 |
-
for col in df.columns:
|
| 73 |
-
if pd.api.types.is_numeric_dtype(df[col]) and col not in [
|
| 74 |
-
"Model Size (Million Parameters)",
|
| 75 |
-
"Compute Cost ($)",
|
| 76 |
-
"Queries per Second",
|
| 77 |
-
"Indexing latency (s/doc)",
|
| 78 |
-
"Search latency (s/query)",
|
| 79 |
-
"Rank",
|
| 80 |
-
]:
|
| 81 |
-
df[col] = df[col].apply(lambda x: round(x * 100, 3 - int(math.floor(math.log10(abs(x * 100)))) - 1))
|
| 82 |
-
elif pd.api.types.is_numeric_dtype(df[col]) and col in ["Indexing latency (s/doc)", "Search latency (s/query)"]:
|
| 83 |
-
df[col] = df[col].apply(lambda x: round(x, 3 - int(math.floor(math.log10(abs(x)))) - 1))
|
| 84 |
-
return df
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
def add_rank_and_format(df, benchmark_version=1, selected_columns=None, is_pipeline=False):
|
| 88 |
df = df.reset_index()
|
| 89 |
-
|
| 90 |
-
df =
|
| 91 |
-
df =
|
| 92 |
-
|
| 93 |
-
if is_pipeline and "_folder_name" in df.columns:
|
| 94 |
-
# For pipelines, use folder_name for link generation
|
| 95 |
-
df[column_name] = df.apply(
|
| 96 |
-
lambda row: make_clickable_model(row[column_name], is_pipeline=True, folder_name=row["_folder_name"]),
|
| 97 |
-
axis=1,
|
| 98 |
-
)
|
| 99 |
-
df = df.drop(columns=["_folder_name"])
|
| 100 |
-
else:
|
| 101 |
-
df[column_name] = df[column_name].apply(lambda x: make_clickable_model(x, is_pipeline=is_pipeline))
|
| 102 |
-
# df = remove_duplicates(df)
|
| 103 |
-
return df
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
def remove_duplicates(df):
|
| 107 |
-
"""Remove duplicate models based on their name (after the last '/' if present)."""
|
| 108 |
-
df["model_name"] = df["Model"].str.replace("_", "/")
|
| 109 |
-
df = df.sort_values("Rank").drop_duplicates(subset=["model_name"], keep="first")
|
| 110 |
-
df = df.drop("model_name", axis=1)
|
| 111 |
return df
|
| 112 |
|
| 113 |
-
|
| 114 |
-
def get_refresh_function(model_handler, benchmark_version):
|
| 115 |
-
def _refresh(metric):
|
| 116 |
-
model_handler.get_vidore_data(metric)
|
| 117 |
-
data_task_category = model_handler.render_df(metric, benchmark_version)
|
| 118 |
-
df = add_rank_and_format(data_task_category, benchmark_version)
|
| 119 |
-
return df
|
| 120 |
-
|
| 121 |
-
return _refresh
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
def deprecated_get_refresh_function(model_handler, benchmark_version):
|
| 125 |
-
def _refresh(metric):
|
| 126 |
-
model_handler.get_vidore_data(metric)
|
| 127 |
-
data_task_category = model_handler.render_df(metric, benchmark_version)
|
| 128 |
-
df = add_rank_and_format(data_task_category, benchmark_version)
|
| 129 |
-
return df
|
| 130 |
-
|
| 131 |
-
return _refresh
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
def get_pipeline_refresh_function(pipeline_handler):
|
| 135 |
-
"""Refresh function for pipeline evaluation results."""
|
| 136 |
-
|
| 137 |
def _refresh(metric):
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
df = add_rank_and_format(
|
| 141 |
return df
|
| 142 |
|
| 143 |
return _refresh
|
|
@@ -145,7 +33,5 @@ def get_pipeline_refresh_function(pipeline_handler):
|
|
| 145 |
|
| 146 |
def filter_models(data, search_term):
|
| 147 |
if search_term:
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
data = data[data[col_name].str.contains(search_term, case=False, na=False)]
|
| 151 |
-
return data
|
|
|
|
| 1 |
+
from data.model_handler import ModelHandler
|
|
|
|
| 2 |
|
| 3 |
+
def make_clickable_model(model_name, link=None):
|
| 4 |
+
if link is None:
|
| 5 |
+
desanitized_model_name = model_name.replace("_", "/")
|
| 6 |
|
| 7 |
+
if '/captioning' in desanitized_model_name:
|
| 8 |
+
desanitized_model_name = desanitized_model_name.replace('/captioning', '')
|
| 9 |
+
if '/ocr' in desanitized_model_name:
|
| 10 |
+
desanitized_model_name = desanitized_model_name.replace('/ocr', '')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
|
| 12 |
+
link = "https://huggingface.co/" + desanitized_model_name
|
|
|
|
|
|
|
|
|
|
| 13 |
|
| 14 |
+
return f'<a target="_blank" style="text-decoration: underline" href="{link}">{model_name}</a>'
|
|
|
|
| 15 |
|
|
|
|
| 16 |
|
| 17 |
+
def add_rank_and_format(df):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
df = df.reset_index()
|
| 19 |
+
df = df.rename(columns={"index": "Model"})
|
| 20 |
+
df = ModelHandler.add_rank(df)
|
| 21 |
+
df["Model"] = df["Model"].apply(make_clickable_model)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
return df
|
| 23 |
|
| 24 |
+
def get_refresh_function():
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
def _refresh(metric):
|
| 26 |
+
model_handler = ModelHandler()
|
| 27 |
+
data_task_category = model_handler.get_vidore_data(metric)
|
| 28 |
+
df = add_rank_and_format(data_task_category)
|
| 29 |
return df
|
| 30 |
|
| 31 |
return _refresh
|
|
|
|
| 33 |
|
| 34 |
def filter_models(data, search_term):
|
| 35 |
if search_term:
|
| 36 |
+
data = data[data['Model'].str.contains(search_term, case=False, na=False)]
|
| 37 |
+
return data
|
|
|
|
|
|
data/dataset_handler.py
CHANGED
|
@@ -1,22 +1,4 @@
|
|
| 1 |
-
|
| 2 |
-
"VidoreArxivQARetrieval",
|
| 3 |
-
"VidoreDocVQARetrieval",
|
| 4 |
-
"VidoreInfoVQARetrieval",
|
| 5 |
-
"VidoreShiftProjectRetrieval",
|
| 6 |
-
"VidoreSyntheticDocQAAIRetrieval",
|
| 7 |
-
"VidoreSyntheticDocQAEnergyRetrieval",
|
| 8 |
-
"VidoreSyntheticDocQAGovernmentReportsRetrieval",
|
| 9 |
-
"VidoreSyntheticDocQAHealthcareIndustryRetrieval",
|
| 10 |
-
"VidoreTabfquadRetrieval",
|
| 11 |
-
"VidoreTatdqaRetrieval",
|
| 12 |
-
]
|
| 13 |
-
VIDORE_V2_MTEB_NAMES = [
|
| 14 |
-
"Vidore2BioMedicalLecturesRetrieval",
|
| 15 |
-
"Vidore2EconomicsReportsRetrieval",
|
| 16 |
-
"Vidore2ESGReportsHLRetrieval",
|
| 17 |
-
"Vidore2ESGReportsRetrieval",
|
| 18 |
-
]
|
| 19 |
-
DEPRECATED_VIDORE_DATASETS_KEYWORDS = [
|
| 20 |
"arxivqa",
|
| 21 |
"docvqa",
|
| 22 |
"infovqa",
|
|
@@ -29,53 +11,8 @@ DEPRECATED_VIDORE_DATASETS_KEYWORDS = [
|
|
| 29 |
"healthcare_industry",
|
| 30 |
]
|
| 31 |
|
| 32 |
-
DEPRECATED_VIDORE_2_DATASETS_KEYWORDS = [
|
| 33 |
-
"restaurant_esg",
|
| 34 |
-
"rse_restaurant",
|
| 35 |
-
"mit_biomedical",
|
| 36 |
-
"economics_macro",
|
| 37 |
-
"biomedical_lectures",
|
| 38 |
-
"esg_reports",
|
| 39 |
-
"economics_reports",
|
| 40 |
-
]
|
| 41 |
-
|
| 42 |
|
| 43 |
def get_datasets_nickname(dataset_name) -> str:
|
| 44 |
-
if dataset_name == "VidoreArxivQARetrieval":
|
| 45 |
-
return "ArxivQA"
|
| 46 |
-
elif dataset_name == "VidoreDocVQARetrieval":
|
| 47 |
-
return "DocVQA"
|
| 48 |
-
elif dataset_name == "VidoreInfoVQARetrieval":
|
| 49 |
-
return "InfoVQA"
|
| 50 |
-
elif dataset_name == "VidoreTabfquadRetrieval":
|
| 51 |
-
return "TabFQuad"
|
| 52 |
-
elif dataset_name == "VidoreTatdqaRetrieval":
|
| 53 |
-
return "TAT-DQA"
|
| 54 |
-
elif dataset_name == "VidoreShiftProjectRetrieval":
|
| 55 |
-
return "Shift Project"
|
| 56 |
-
elif dataset_name == "VidoreSyntheticDocQAAIRetrieval":
|
| 57 |
-
return "Artificial Intelligence"
|
| 58 |
-
elif dataset_name == "VidoreSyntheticDocQAEnergyRetrieval":
|
| 59 |
-
return "Energy"
|
| 60 |
-
elif dataset_name == "VidoreSyntheticDocQAGovernmentReportsRetrieval":
|
| 61 |
-
return "Government Reports"
|
| 62 |
-
elif dataset_name == "VidoreSyntheticDocQAHealthcareIndustryRetrieval":
|
| 63 |
-
return "Healthcare Industry"
|
| 64 |
-
|
| 65 |
-
elif dataset_name == "Vidore2ESGReportsHLRetrieval":
|
| 66 |
-
return "ESG Restaurant Human English"
|
| 67 |
-
elif dataset_name == "Vidore2ESGReportsRetrieval":
|
| 68 |
-
return "ESG Restaurant Synthetic Multilingual"
|
| 69 |
-
elif dataset_name == "Vidore2BioMedicalLecturesRetrieval":
|
| 70 |
-
return "MIT Biomedical Multilingual"
|
| 71 |
-
elif dataset_name == "Vidore2EconomicsReportsRetrieval":
|
| 72 |
-
return "Economics Macro Multilingual"
|
| 73 |
-
|
| 74 |
-
else:
|
| 75 |
-
raise ValueError(f"Dataset {dataset_name} not found in ViDoRe")
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
def deprecated_get_datasets_nickname(dataset_name) -> str:
|
| 79 |
if "arxivqa" in dataset_name:
|
| 80 |
return "ArxivQA"
|
| 81 |
|
|
@@ -104,32 +41,5 @@ def deprecated_get_datasets_nickname(dataset_name) -> str:
|
|
| 104 |
elif "healthcare_industry" in dataset_name:
|
| 105 |
return "Healthcare Industry"
|
| 106 |
|
| 107 |
-
elif ("restaurant_esg" in dataset_name) or ("esg_reports_human" in dataset_name):
|
| 108 |
-
return "ESG Restaurant Human"
|
| 109 |
-
|
| 110 |
-
elif ("rse_restaurant" in dataset_name and "multilingual" in dataset_name) or (
|
| 111 |
-
"esg_reports" in dataset_name and "_eng_" not in dataset_name
|
| 112 |
-
):
|
| 113 |
-
return "ESG Restaurant Synthetic Multilingual"
|
| 114 |
-
|
| 115 |
-
elif ("rse_restaurant" in dataset_name) or ("esg_reports" in dataset_name and "_eng_" in dataset_name):
|
| 116 |
-
return "ESG Restaurant Synthetic"
|
| 117 |
-
|
| 118 |
-
elif ("mit_biomedical" in dataset_name and "multilingual" in dataset_name) or (
|
| 119 |
-
"biomedical_lectures" in dataset_name and "_eng_" not in dataset_name
|
| 120 |
-
):
|
| 121 |
-
return "MIT Biomedical Multilingual"
|
| 122 |
-
|
| 123 |
-
elif ("mit_biomedical" in dataset_name) or ("biomedical_lectures" in dataset_name and "_eng_" in dataset_name):
|
| 124 |
-
return "MIT Biomedical"
|
| 125 |
-
|
| 126 |
-
elif ("economics_macro" in dataset_name and "multilingual" in dataset_name) or (
|
| 127 |
-
"economics_reports" in dataset_name and "_eng_" not in dataset_name
|
| 128 |
-
):
|
| 129 |
-
return "Economics Macro Multilingual"
|
| 130 |
-
|
| 131 |
-
elif ("economics_macro" in dataset_name) or ("economics_reports" in dataset_name and "_eng_" in dataset_name):
|
| 132 |
-
return "Economics Macro"
|
| 133 |
-
|
| 134 |
else:
|
| 135 |
raise ValueError(f"Dataset {dataset_name} not found in ViDoRe")
|
|
|
|
| 1 |
+
VIDORE_DATASETS_KEYWORDS = [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
"arxivqa",
|
| 3 |
"docvqa",
|
| 4 |
"infovqa",
|
|
|
|
| 11 |
"healthcare_industry",
|
| 12 |
]
|
| 13 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
|
| 15 |
def get_datasets_nickname(dataset_name) -> str:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
if "arxivqa" in dataset_name:
|
| 17 |
return "ArxivQA"
|
| 18 |
|
|
|
|
| 41 |
elif "healthcare_industry" in dataset_name:
|
| 42 |
return "Healthcare Industry"
|
| 43 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
else:
|
| 45 |
raise ValueError(f"Dataset {dataset_name} not found in ViDoRe")
|
data/deprecated_model_handler.py
DELETED
|
@@ -1,128 +0,0 @@
|
|
| 1 |
-
import json
|
| 2 |
-
import os
|
| 3 |
-
from typing import Any, Dict
|
| 4 |
-
|
| 5 |
-
import pandas as pd
|
| 6 |
-
from huggingface_hub import HfApi, hf_hub_download, metadata_load
|
| 7 |
-
|
| 8 |
-
from .dataset_handler import (
|
| 9 |
-
DEPRECATED_VIDORE_2_DATASETS_KEYWORDS,
|
| 10 |
-
DEPRECATED_VIDORE_DATASETS_KEYWORDS,
|
| 11 |
-
deprecated_get_datasets_nickname,
|
| 12 |
-
)
|
| 13 |
-
|
| 14 |
-
BLOCKLIST = ["impactframes"]
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
class DeprecatedModelHandler:
|
| 18 |
-
def __init__(self, model_infos_path="model_infos.json"):
|
| 19 |
-
self.api = HfApi()
|
| 20 |
-
self.model_infos_path = model_infos_path
|
| 21 |
-
self.model_infos = self._load_model_infos()
|
| 22 |
-
|
| 23 |
-
def _load_model_infos(self) -> Dict:
|
| 24 |
-
if os.path.exists(self.model_infos_path):
|
| 25 |
-
with open(self.model_infos_path) as f:
|
| 26 |
-
return json.load(f)
|
| 27 |
-
return {}
|
| 28 |
-
|
| 29 |
-
def _save_model_infos(self):
|
| 30 |
-
with open(self.model_infos_path, "w") as f:
|
| 31 |
-
json.dump(self.model_infos, f)
|
| 32 |
-
|
| 33 |
-
def _are_results_in_new_vidore_format(self, results: Dict[str, Any]) -> bool:
|
| 34 |
-
return "metadata" in results and "metrics" in results
|
| 35 |
-
|
| 36 |
-
def _is_baseline_repo(self, repo_id: str) -> bool:
|
| 37 |
-
return repo_id == "vidore/baseline-results"
|
| 38 |
-
|
| 39 |
-
def sanitize_model_name(self, model_name):
|
| 40 |
-
return model_name.replace("/", "_").replace(".", "-thisisapoint-")
|
| 41 |
-
|
| 42 |
-
def fuze_model_infos(self, model_name, results):
|
| 43 |
-
for dataset, metrics in results.items():
|
| 44 |
-
if dataset not in self.model_infos[model_name]["results"].keys():
|
| 45 |
-
self.model_infos[model_name]["results"][dataset] = metrics
|
| 46 |
-
else:
|
| 47 |
-
continue
|
| 48 |
-
|
| 49 |
-
def get_vidore_data(self, metric="ndcg_at_5"):
|
| 50 |
-
models = self.api.list_models(filter="vidore")
|
| 51 |
-
repositories = [model.modelId for model in models] # type: ignore
|
| 52 |
-
|
| 53 |
-
# Sort repositories to process non-baseline repos first (to prioritize their results)
|
| 54 |
-
repositories.sort(key=lambda x: self._is_baseline_repo(x))
|
| 55 |
-
|
| 56 |
-
for repo_id in repositories:
|
| 57 |
-
org_name = repo_id.split("/")[0]
|
| 58 |
-
if org_name in BLOCKLIST:
|
| 59 |
-
continue
|
| 60 |
-
files = [f for f in self.api.list_repo_files(repo_id) if f.endswith("_metrics.json") or f == "results.json"]
|
| 61 |
-
|
| 62 |
-
if len(files) == 0:
|
| 63 |
-
continue
|
| 64 |
-
else:
|
| 65 |
-
for file in files:
|
| 66 |
-
if file.endswith("results.json"):
|
| 67 |
-
model_name = repo_id.replace("/", "_").replace(".", "-thisisapoint-")
|
| 68 |
-
else:
|
| 69 |
-
model_name = file.split("_metrics.json")[0]
|
| 70 |
-
model_name = model_name.replace("/", "_").replace(".", "-thisisapoint-")
|
| 71 |
-
|
| 72 |
-
# Skip if the model is from baseline and we already have results
|
| 73 |
-
|
| 74 |
-
readme_path = hf_hub_download(repo_id, filename="README.md")
|
| 75 |
-
meta = metadata_load(readme_path)
|
| 76 |
-
try:
|
| 77 |
-
result_path = hf_hub_download(repo_id, filename=file)
|
| 78 |
-
|
| 79 |
-
with open(result_path) as f:
|
| 80 |
-
results = json.load(f)
|
| 81 |
-
|
| 82 |
-
if self._are_results_in_new_vidore_format(results):
|
| 83 |
-
metadata = results["metadata"]
|
| 84 |
-
results = results["metrics"]
|
| 85 |
-
|
| 86 |
-
# Handles the case where the model is both in baseline and outside of it
|
| 87 |
-
# (prioritizes the non-baseline results)
|
| 88 |
-
if self._is_baseline_repo(repo_id) and self.sanitize_model_name(model_name) in self.model_infos:
|
| 89 |
-
self.fuze_model_infos(model_name, results)
|
| 90 |
-
|
| 91 |
-
self.model_infos[model_name] = {"meta": meta, "results": results}
|
| 92 |
-
except Exception as e:
|
| 93 |
-
print(f"Error loading {model_name} - {e}")
|
| 94 |
-
continue
|
| 95 |
-
|
| 96 |
-
# In order to keep only models relevant to a benchmark
|
| 97 |
-
def filter_models_by_benchmark(self, benchmark_version=1):
|
| 98 |
-
filtered_model_infos = {}
|
| 99 |
-
keywords = DEPRECATED_VIDORE_DATASETS_KEYWORDS if benchmark_version == 1 else DEPRECATED_VIDORE_2_DATASETS_KEYWORDS
|
| 100 |
-
|
| 101 |
-
for model, info in self.model_infos.items():
|
| 102 |
-
results = info["results"]
|
| 103 |
-
if any(any(keyword in dataset for keyword in keywords) for dataset in results.keys()):
|
| 104 |
-
filtered_model_infos[model] = info
|
| 105 |
-
|
| 106 |
-
return filtered_model_infos
|
| 107 |
-
|
| 108 |
-
# Compute the average of a metric for each model,
|
| 109 |
-
def render_df(self, metric="ndcg_at_5", benchmark_version=1):
|
| 110 |
-
model_res = {}
|
| 111 |
-
filtered_model_infos = self.filter_models_by_benchmark(benchmark_version)
|
| 112 |
-
if len(filtered_model_infos) > 0:
|
| 113 |
-
for model in filtered_model_infos.keys():
|
| 114 |
-
res = filtered_model_infos[model]["results"]
|
| 115 |
-
dataset_res = {}
|
| 116 |
-
keywords = DEPRECATED_VIDORE_DATASETS_KEYWORDS if benchmark_version == 1 else DEPRECATED_VIDORE_2_DATASETS_KEYWORDS
|
| 117 |
-
for dataset in res.keys():
|
| 118 |
-
if not any(keyword in dataset for keyword in keywords):
|
| 119 |
-
continue
|
| 120 |
-
|
| 121 |
-
dataset_nickname = deprecated_get_datasets_nickname(dataset)
|
| 122 |
-
dataset_res[dataset_nickname] = res[dataset][metric]
|
| 123 |
-
model_res[model] = dataset_res
|
| 124 |
-
|
| 125 |
-
df = pd.DataFrame(model_res).T
|
| 126 |
-
|
| 127 |
-
return df
|
| 128 |
-
return pd.DataFrame()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
data/model_handler.py
CHANGED
|
@@ -1,98 +1,117 @@
|
|
| 1 |
import json
|
| 2 |
import os
|
|
|
|
| 3 |
|
| 4 |
import pandas as pd
|
| 5 |
-
from
|
| 6 |
|
| 7 |
-
from .dataset_handler import
|
|
|
|
|
|
|
| 8 |
|
| 9 |
|
| 10 |
class ModelHandler:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
|
| 12 |
-
def
|
| 13 |
-
self.
|
|
|
|
|
|
|
|
|
|
| 14 |
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
])
|
| 22 |
|
| 23 |
def get_vidore_data(self, metric="ndcg_at_5"):
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
for model_name in model_names:
|
| 37 |
-
revisions = self.get_folders(os.path.join(local_path, folder_of_interest, model_name))
|
| 38 |
-
first_revision = revisions[0]
|
| 39 |
-
result_filenames = [
|
| 40 |
-
result_filename
|
| 41 |
-
for result_filename in os.listdir(os.path.join(local_path, folder_of_interest, model_name, first_revision))
|
| 42 |
-
# if result_filename.endswith(".json") and result_filename != "model_meta.json"
|
| 43 |
-
]
|
| 44 |
-
if "model_meta.json" in result_filenames:
|
| 45 |
-
with open(os.path.join(local_path, folder_of_interest, model_name, first_revision, "model_meta.json"), "r") as f:
|
| 46 |
-
meta = json.load(f)
|
| 47 |
else:
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
model_res = {}
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
res = filtered_model_infos[model]["results"]
|
| 79 |
dataset_res = {}
|
| 80 |
-
keywords = VIDORE_V1_MTEB_NAMES if benchmark_version == 1 else VIDORE_V2_MTEB_NAMES
|
| 81 |
-
if "n_parameters" in filtered_model_infos[model]["meta"]:
|
| 82 |
-
try:
|
| 83 |
-
dataset_res["Model Size (Million Parameters)"] = filtered_model_infos[model]["meta"]["n_parameters"] // 1_000_000
|
| 84 |
-
except TypeError:
|
| 85 |
-
dataset_res["Model Size (Million Parameters)"] = -1
|
| 86 |
-
else:
|
| 87 |
-
dataset_res["Model Size (Million Parameters)"] = -1
|
| 88 |
for dataset in res.keys():
|
| 89 |
-
|
|
|
|
|
|
|
| 90 |
continue
|
|
|
|
| 91 |
dataset_nickname = get_datasets_nickname(dataset)
|
| 92 |
-
dataset_res[dataset_nickname] = res[dataset][
|
| 93 |
model_res[model] = dataset_res
|
| 94 |
|
| 95 |
df = pd.DataFrame(model_res).T
|
| 96 |
|
| 97 |
return df
|
| 98 |
return pd.DataFrame()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import json
|
| 2 |
import os
|
| 3 |
+
from typing import Any, Dict
|
| 4 |
|
| 5 |
import pandas as pd
|
| 6 |
+
from huggingface_hub import HfApi, hf_hub_download, metadata_load
|
| 7 |
|
| 8 |
+
from .dataset_handler import VIDORE_DATASETS_KEYWORDS, get_datasets_nickname
|
| 9 |
+
|
| 10 |
+
BLOCKLIST = ["impactframes"]
|
| 11 |
|
| 12 |
|
| 13 |
class ModelHandler:
|
| 14 |
+
def __init__(self, model_infos_path="model_infos.json"):
|
| 15 |
+
self.api = HfApi()
|
| 16 |
+
self.model_infos_path = model_infos_path
|
| 17 |
+
self.model_infos = self._load_model_infos()
|
| 18 |
|
| 19 |
+
def _load_model_infos(self) -> Dict:
|
| 20 |
+
if os.path.exists(self.model_infos_path):
|
| 21 |
+
with open(self.model_infos_path) as f:
|
| 22 |
+
return json.load(f)
|
| 23 |
+
return {}
|
| 24 |
|
| 25 |
+
def _save_model_infos(self):
|
| 26 |
+
with open(self.model_infos_path, "w") as f:
|
| 27 |
+
json.dump(self.model_infos, f)
|
| 28 |
+
|
| 29 |
+
def _are_results_in_new_vidore_format(self, results: Dict[str, Any]) -> bool:
|
| 30 |
+
return "metadata" in results and "metrics" in results
|
|
|
|
| 31 |
|
| 32 |
def get_vidore_data(self, metric="ndcg_at_5"):
|
| 33 |
+
models = self.api.list_models(filter="vidore")
|
| 34 |
+
repositories = [model.modelId for model in models] # type: ignore
|
| 35 |
+
|
| 36 |
+
for repo_id in repositories:
|
| 37 |
+
org_name = repo_id.split("/")[0]
|
| 38 |
+
if org_name in BLOCKLIST:
|
| 39 |
+
continue
|
| 40 |
+
|
| 41 |
+
files = [f for f in self.api.list_repo_files(repo_id) if f.endswith("_metrics.json") or f == "results.json"]
|
| 42 |
+
|
| 43 |
+
if len(files) == 0:
|
| 44 |
+
continue
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 45 |
else:
|
| 46 |
+
for file in files:
|
| 47 |
+
if file.endswith("results.json"):
|
| 48 |
+
model_name = repo_id.replace("/", "_")
|
| 49 |
+
else:
|
| 50 |
+
model_name = file.split("_metrics.json")[0]
|
| 51 |
+
|
| 52 |
+
if model_name not in self.model_infos:
|
| 53 |
+
readme_path = hf_hub_download(repo_id, filename="README.md")
|
| 54 |
+
meta = metadata_load(readme_path)
|
| 55 |
+
try:
|
| 56 |
+
result_path = hf_hub_download(repo_id, filename=file)
|
| 57 |
+
|
| 58 |
+
with open(result_path) as f:
|
| 59 |
+
results = json.load(f)
|
| 60 |
+
|
| 61 |
+
if self._are_results_in_new_vidore_format(results):
|
| 62 |
+
metadata = results["metadata"]
|
| 63 |
+
results = results["metrics"]
|
| 64 |
+
|
| 65 |
+
self.model_infos[model_name] = {"meta": meta, "results": results}
|
| 66 |
+
except Exception as e:
|
| 67 |
+
print(f"Error loading {model_name} - {e}")
|
| 68 |
+
continue
|
| 69 |
+
|
| 70 |
+
# self._save_model_infos()
|
| 71 |
+
|
| 72 |
model_res = {}
|
| 73 |
+
if len(self.model_infos) > 0:
|
| 74 |
+
for model in self.model_infos.keys():
|
| 75 |
+
res = self.model_infos[model]["results"]
|
|
|
|
| 76 |
dataset_res = {}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 77 |
for dataset in res.keys():
|
| 78 |
+
# for each keyword check if it is in the dataset name if not continue
|
| 79 |
+
if not any(keyword in dataset for keyword in VIDORE_DATASETS_KEYWORDS):
|
| 80 |
+
print(f"{dataset} not found in ViDoRe datasets. Skipping ...")
|
| 81 |
continue
|
| 82 |
+
|
| 83 |
dataset_nickname = get_datasets_nickname(dataset)
|
| 84 |
+
dataset_res[dataset_nickname] = res[dataset][metric]
|
| 85 |
model_res[model] = dataset_res
|
| 86 |
|
| 87 |
df = pd.DataFrame(model_res).T
|
| 88 |
|
| 89 |
return df
|
| 90 |
return pd.DataFrame()
|
| 91 |
+
|
| 92 |
+
@staticmethod
|
| 93 |
+
def add_rank(df):
|
| 94 |
+
df.fillna(0.0, inplace=True)
|
| 95 |
+
cols_to_rank = [
|
| 96 |
+
col
|
| 97 |
+
for col in df.columns
|
| 98 |
+
if col
|
| 99 |
+
not in [
|
| 100 |
+
"Model",
|
| 101 |
+
"Model Size (Million Parameters)",
|
| 102 |
+
"Memory Usage (GB, fp32)",
|
| 103 |
+
"Embedding Dimensions",
|
| 104 |
+
"Max Tokens",
|
| 105 |
+
]
|
| 106 |
+
]
|
| 107 |
+
if len(cols_to_rank) == 1:
|
| 108 |
+
df.sort_values(cols_to_rank[0], ascending=False, inplace=True)
|
| 109 |
+
else:
|
| 110 |
+
df.insert(len(df.columns) - len(cols_to_rank), "Average", df[cols_to_rank].mean(axis=1, skipna=False))
|
| 111 |
+
df.sort_values("Average", ascending=False, inplace=True)
|
| 112 |
+
df.insert(0, "Rank", list(range(1, len(df) + 1)))
|
| 113 |
+
# multiply values by 100 if they are floats and round to 1 decimal place
|
| 114 |
+
for col in df.columns:
|
| 115 |
+
if df[col].dtype == "float64":
|
| 116 |
+
df[col] = df[col].apply(lambda x: round(x * 100, 1))
|
| 117 |
+
return df
|
data/pipeline_handler.py
DELETED
|
@@ -1,274 +0,0 @@
|
|
| 1 |
-
import os
|
| 2 |
-
import requests
|
| 3 |
-
from typing import Dict, List, Optional
|
| 4 |
-
|
| 5 |
-
import pandas as pd
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
class PipelineHandler:
|
| 9 |
-
"""Handler for ViDoRe v3 pipeline evaluation results from GitHub."""
|
| 10 |
-
|
| 11 |
-
def __init__(self):
|
| 12 |
-
self.pipeline_infos = {}
|
| 13 |
-
self.pipeline_aliases = {} # Maps folder_name -> pipeline_alias for display
|
| 14 |
-
self.github_base_url = "https://raw.githubusercontent.com/illuin-tech/vidore-benchmark/main/results/metrics"
|
| 15 |
-
self.github_descriptions_base_url = (
|
| 16 |
-
"https://raw.githubusercontent.com/illuin-tech/vidore-benchmark/main/results/pipeline_descriptions"
|
| 17 |
-
)
|
| 18 |
-
self.available_datasets = []
|
| 19 |
-
self.available_languages = ["english"] # Default languages available
|
| 20 |
-
|
| 21 |
-
# Setup GitHub authentication if token is available
|
| 22 |
-
self.github_token = os.environ.get("GITHUB_TOKEN")
|
| 23 |
-
self.headers = {}
|
| 24 |
-
if self.github_token:
|
| 25 |
-
self.headers["Authorization"] = f"token {self.github_token}"
|
| 26 |
-
print("GitHub token detected - using authenticated requests")
|
| 27 |
-
|
| 28 |
-
def get_pipeline_folders_from_github(self) -> List[str]:
|
| 29 |
-
"""Get list of pipeline folders from GitHub API."""
|
| 30 |
-
api_url = "https://api.github.com/repos/illuin-tech/vidore-benchmark/contents/results/metrics"
|
| 31 |
-
|
| 32 |
-
try:
|
| 33 |
-
response = requests.get(api_url, headers=self.headers)
|
| 34 |
-
response.raise_for_status()
|
| 35 |
-
contents = response.json()
|
| 36 |
-
|
| 37 |
-
# Filter for directories only
|
| 38 |
-
folders = [item["name"] for item in contents if item["type"] == "dir"]
|
| 39 |
-
return sorted(folders)
|
| 40 |
-
except Exception as e:
|
| 41 |
-
print(f"Error fetching pipeline folders from GitHub: {e}")
|
| 42 |
-
return []
|
| 43 |
-
|
| 44 |
-
def get_dataset_files_from_github(self, pipeline_name: str) -> List[str]:
|
| 45 |
-
"""Get list of dataset JSON files for a specific pipeline from GitHub API."""
|
| 46 |
-
api_url = f"https://api.github.com/repos/illuin-tech/vidore-benchmark/contents/results/metrics/{pipeline_name}"
|
| 47 |
-
|
| 48 |
-
try:
|
| 49 |
-
response = requests.get(api_url, headers=self.headers)
|
| 50 |
-
response.raise_for_status()
|
| 51 |
-
contents = response.json()
|
| 52 |
-
|
| 53 |
-
# Filter for JSON files that start with 'vidore_v3'
|
| 54 |
-
files = [
|
| 55 |
-
item["name"]
|
| 56 |
-
for item in contents
|
| 57 |
-
if item["type"] == "file" and item["name"].startswith("vidore_v3") and item["name"].endswith(".json")
|
| 58 |
-
]
|
| 59 |
-
return sorted(files)
|
| 60 |
-
except Exception as e:
|
| 61 |
-
print(f"Error fetching dataset files from {pipeline_name}: {e}")
|
| 62 |
-
return []
|
| 63 |
-
|
| 64 |
-
def fetch_json_from_github(self, pipeline_name: str, filename: str) -> Optional[Dict]:
|
| 65 |
-
"""Fetch a JSON file from GitHub raw content."""
|
| 66 |
-
url = f"{self.github_base_url}/{pipeline_name}/{filename}"
|
| 67 |
-
|
| 68 |
-
try:
|
| 69 |
-
response = requests.get(url, headers=self.headers)
|
| 70 |
-
response.raise_for_status()
|
| 71 |
-
return response.json()
|
| 72 |
-
except Exception as e:
|
| 73 |
-
print(f"Error fetching {filename} from {pipeline_name}: {e}")
|
| 74 |
-
return None
|
| 75 |
-
|
| 76 |
-
def fetch_pipeline_alias(self, pipeline_name: str) -> Optional[str]:
|
| 77 |
-
"""Fetch the pipeline_alias from description.json for a pipeline.
|
| 78 |
-
|
| 79 |
-
Uses raw.githubusercontent.com to avoid API rate limits.
|
| 80 |
-
"""
|
| 81 |
-
url = f"{self.github_descriptions_base_url}/{pipeline_name}/description.json"
|
| 82 |
-
|
| 83 |
-
try:
|
| 84 |
-
response = requests.get(url, headers=self.headers)
|
| 85 |
-
response.raise_for_status()
|
| 86 |
-
description = response.json()
|
| 87 |
-
return description.get("pipeline_alias")
|
| 88 |
-
except Exception as e:
|
| 89 |
-
print(f"Error fetching description for {pipeline_name}: {e}")
|
| 90 |
-
return None
|
| 91 |
-
|
| 92 |
-
def get_pipeline_data(self):
|
| 93 |
-
"""Fetch all pipeline data from GitHub."""
|
| 94 |
-
pipeline_folders = self.get_pipeline_folders_from_github()
|
| 95 |
-
datasets_set = set()
|
| 96 |
-
languages_set = set(["overall"])
|
| 97 |
-
|
| 98 |
-
for pipeline_name in pipeline_folders:
|
| 99 |
-
# Get all dataset files for this pipeline
|
| 100 |
-
dataset_files = self.get_dataset_files_from_github(pipeline_name)
|
| 101 |
-
|
| 102 |
-
if not dataset_files:
|
| 103 |
-
continue
|
| 104 |
-
|
| 105 |
-
pipeline_data = {}
|
| 106 |
-
for filename in dataset_files:
|
| 107 |
-
results = self.fetch_json_from_github(pipeline_name, filename)
|
| 108 |
-
if results:
|
| 109 |
-
# Extract dataset name from filename (remove vidore_v3_ prefix and .json suffix)
|
| 110 |
-
dataset_name = filename.replace("vidore_v3_", "").replace(".json", "")
|
| 111 |
-
datasets_set.add(dataset_name)
|
| 112 |
-
pipeline_data[dataset_name] = results
|
| 113 |
-
|
| 114 |
-
# Collect available languages
|
| 115 |
-
if "aggregated_metrics" in results and "by_language" in results["aggregated_metrics"]:
|
| 116 |
-
languages_set.update(results["aggregated_metrics"]["by_language"].keys())
|
| 117 |
-
|
| 118 |
-
if pipeline_data:
|
| 119 |
-
self.pipeline_infos[pipeline_name] = pipeline_data
|
| 120 |
-
# Fetch the pipeline alias for display (uses raw URL, not API)
|
| 121 |
-
alias = self.fetch_pipeline_alias(pipeline_name)
|
| 122 |
-
if alias:
|
| 123 |
-
self.pipeline_aliases[pipeline_name] = alias
|
| 124 |
-
|
| 125 |
-
self.available_datasets = sorted(list(datasets_set))
|
| 126 |
-
self.available_languages = sorted(list(languages_set))
|
| 127 |
-
|
| 128 |
-
def calculate_cost_metric(self, pipeline_datasets: Dict) -> float:
|
| 129 |
-
"""
|
| 130 |
-
Calculate a compute cost metric based on retrieval time across all datasets.
|
| 131 |
-
Returns cost in arbitrary units (could be refined based on actual compute costs).
|
| 132 |
-
"""
|
| 133 |
-
total_time_s = 0
|
| 134 |
-
|
| 135 |
-
for dataset_name, dataset_data in pipeline_datasets.items():
|
| 136 |
-
if "aggregated_metrics" not in dataset_data:
|
| 137 |
-
continue
|
| 138 |
-
|
| 139 |
-
timing = dataset_data["aggregated_metrics"].get("timing", {})
|
| 140 |
-
total_time_ms = timing.get("total_retrieval_time_milliseconds", 0)
|
| 141 |
-
total_time_s += total_time_ms / 1000.0
|
| 142 |
-
|
| 143 |
-
# Simple cost model: assume $0.01 per second of compute (adjustable)
|
| 144 |
-
cost = total_time_s * 0.01
|
| 145 |
-
|
| 146 |
-
return round(cost, 4)
|
| 147 |
-
|
| 148 |
-
def extract_dataset_metrics(
|
| 149 |
-
self, pipeline_datasets: Dict, metric: str = "ndcg_cut_5", language: str = "english"
|
| 150 |
-
) -> Dict[str, float]:
|
| 151 |
-
"""
|
| 152 |
-
Extract metrics for individual datasets from the aggregated results.
|
| 153 |
-
|
| 154 |
-
Args:
|
| 155 |
-
pipeline_datasets: Dictionary mapping dataset names to their data
|
| 156 |
-
metric: The metric to extract (e.g., 'ndcg_at_5')
|
| 157 |
-
language: The language to filter by ('english' for English-only results, or specific language)
|
| 158 |
-
|
| 159 |
-
Returns:
|
| 160 |
-
Dictionary mapping dataset names to metric values
|
| 161 |
-
"""
|
| 162 |
-
# Map metric names from UI format to API format
|
| 163 |
-
metric_mapping = {
|
| 164 |
-
"ndcg_at_1": "ndcg_cut_5", # Using cut_5 as approximation
|
| 165 |
-
"ndcg_at_5": "ndcg_cut_5",
|
| 166 |
-
"ndcg_at_10": "ndcg_cut_10",
|
| 167 |
-
"ndcg_at_100": "ndcg_cut_100",
|
| 168 |
-
"recall_at_1": "recall_5",
|
| 169 |
-
"recall_at_5": "recall_5",
|
| 170 |
-
"recall_at_10": "recall_10",
|
| 171 |
-
"recall_at_100": "recall_100",
|
| 172 |
-
}
|
| 173 |
-
|
| 174 |
-
actual_metric = metric_mapping.get(metric, metric)
|
| 175 |
-
dataset_metrics = {}
|
| 176 |
-
|
| 177 |
-
for dataset_name, dataset_data in pipeline_datasets.items():
|
| 178 |
-
if "aggregated_metrics" not in dataset_data:
|
| 179 |
-
continue
|
| 180 |
-
|
| 181 |
-
aggregated = dataset_data["aggregated_metrics"]
|
| 182 |
-
|
| 183 |
-
# Get metrics for the specified language
|
| 184 |
-
if language == "overall":
|
| 185 |
-
metrics_data = aggregated.get("overall", {})
|
| 186 |
-
else:
|
| 187 |
-
metrics_data = aggregated.get("by_language", {}).get(language, {})
|
| 188 |
-
|
| 189 |
-
if metrics_data:
|
| 190 |
-
# Format dataset name for display
|
| 191 |
-
display_name = dataset_name.replace("_", " ").title()
|
| 192 |
-
dataset_metrics[display_name] = metrics_data.get(actual_metric, 0.0)
|
| 193 |
-
|
| 194 |
-
return dataset_metrics
|
| 195 |
-
|
| 196 |
-
def render_df(self, metric: str = "ndcg_at_5", language: str = "overall") -> pd.DataFrame:
|
| 197 |
-
"""
|
| 198 |
-
Render a DataFrame with pipeline results.
|
| 199 |
-
|
| 200 |
-
Args:
|
| 201 |
-
metric: The metric to display (e.g., 'ndcg_at_5')
|
| 202 |
-
language: The language to filter by ('overall' for all languages, or specific language)
|
| 203 |
-
|
| 204 |
-
Returns:
|
| 205 |
-
DataFrame with columns: Pipeline Name, Compute Cost, Timing metrics, Dataset metrics
|
| 206 |
-
"""
|
| 207 |
-
pipeline_res = {}
|
| 208 |
-
|
| 209 |
-
for pipeline_name, pipeline_datasets in self.pipeline_infos.items():
|
| 210 |
-
row_data = {}
|
| 211 |
-
|
| 212 |
-
# Aggregate time metrics across all datasets
|
| 213 |
-
total_time_ms = 0
|
| 214 |
-
total_queries = 0
|
| 215 |
-
indexing_time_ms = 0
|
| 216 |
-
search_time_ms = 0
|
| 217 |
-
num_datasets = 0
|
| 218 |
-
|
| 219 |
-
for dataset_name, dataset_data in pipeline_datasets.items():
|
| 220 |
-
if "aggregated_metrics" in dataset_data:
|
| 221 |
-
timing = dataset_data["aggregated_metrics"].get("timing", {})
|
| 222 |
-
total_time_ms += timing.get("total_retrieval_time_milliseconds", 0)
|
| 223 |
-
total_queries += timing.get("num_queries", 0)
|
| 224 |
-
indexing_time_ms += timing.get("indexing_throughput_ms_per_doc", 0)
|
| 225 |
-
search_time_ms += timing.get("search_throughput_ms_per_query", 0)
|
| 226 |
-
num_datasets += 1
|
| 227 |
-
|
| 228 |
-
if total_queries > 0:
|
| 229 |
-
if total_time_ms > 0:
|
| 230 |
-
# row_data["Total retrieval time (s)"] = round(
|
| 231 |
-
# (total_time_ms / 1000), 0
|
| 232 |
-
# )
|
| 233 |
-
row_data["Indexing latency (s/doc)"] = (
|
| 234 |
-
(indexing_time_ms / 1000) / num_datasets if indexing_time_ms > 0 else None
|
| 235 |
-
)
|
| 236 |
-
row_data["Search latency (s/query)"] = (
|
| 237 |
-
(search_time_ms / 1000) / num_datasets if search_time_ms > 0 else None
|
| 238 |
-
)
|
| 239 |
-
else:
|
| 240 |
-
# row_data["Total retrieval time (s)"] = 0
|
| 241 |
-
row_data["Indexing latency (s/doc)"] = 0
|
| 242 |
-
row_data["Search latency (s/query)"] = 0
|
| 243 |
-
else:
|
| 244 |
-
# row_data["Total retrieval time (s)"] = -1
|
| 245 |
-
row_data["Indexing latency (s/doc)"] = -1
|
| 246 |
-
row_data["Search latency (s/query)"] = -1
|
| 247 |
-
|
| 248 |
-
# Add dataset metrics
|
| 249 |
-
dataset_metrics = self.extract_dataset_metrics(pipeline_datasets, metric, language)
|
| 250 |
-
row_data.update(dataset_metrics)
|
| 251 |
-
|
| 252 |
-
# Calculate average across datasets if there are multiple
|
| 253 |
-
if dataset_metrics:
|
| 254 |
-
row_data["Average Score"] = round(sum(dataset_metrics.values()) / len(dataset_metrics), 4)
|
| 255 |
-
|
| 256 |
-
# Use pipeline_alias for display if available, otherwise fallback to folder name
|
| 257 |
-
display_name = self.pipeline_aliases.get(pipeline_name, pipeline_name)
|
| 258 |
-
# Store folder name for link generation (will be used in utils.py)
|
| 259 |
-
row_data["_folder_name"] = pipeline_name
|
| 260 |
-
pipeline_res[display_name] = row_data
|
| 261 |
-
|
| 262 |
-
if pipeline_res:
|
| 263 |
-
df = pd.DataFrame(pipeline_res).T
|
| 264 |
-
# Reorder columns to have Average right after timing metrics
|
| 265 |
-
cols = list(df.columns)
|
| 266 |
-
if "Average Score" in cols:
|
| 267 |
-
cols.remove("Average Score")
|
| 268 |
-
# Insert Average after Search latency (s/query)
|
| 269 |
-
insert_pos = cols.index("Search latency (s/query)") + 1 if "Search latency (s/query)" in cols else 2
|
| 270 |
-
cols.insert(insert_pos, "Average Score")
|
| 271 |
-
df = df[cols]
|
| 272 |
-
return df
|
| 273 |
-
|
| 274 |
-
return pd.DataFrame()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
requirements.txt
DELETED
|
@@ -1,6 +0,0 @@
|
|
| 1 |
-
gitpython
|
| 2 |
-
gradio
|
| 3 |
-
pydub
|
| 4 |
-
huggingface_hub < 1.0.0
|
| 5 |
-
plotly
|
| 6 |
-
pandas
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|