.github/workflows/restart.yml DELETED
@@ -1,19 +0,0 @@
1
- name: Scheduled Restart
2
-
3
- on:
4
- schedule:
5
- - cron: '0 */1 * * *' # every hour
6
- workflow_dispatch:
7
-
8
- jobs:
9
- restart:
10
- runs-on: ubuntu-latest
11
- steps:
12
- - uses: actions/checkout@v4
13
- - run: |
14
- date > restart.txt
15
- git config user.name "github-actions"
16
- git config user.email "github-actions@github.com"
17
- git add restart.txt
18
- git commit -m "Scheduled restart"
19
- git push
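For context, this deleted workflow forced a rebuild of the Space by committing a dummy `restart.txt` on a schedule. The same effect can be achieved without dummy commits by calling the Hub API directly; a minimal sketch (the token and Space id are placeholders, not values from this repository):

```python
# Minimal sketch: restart a Space via the Hub API instead of pushing a dummy commit.
# Both the token and the repo_id below are placeholders.
from huggingface_hub import HfApi

api = HfApi(token="hf_...")  # needs write access to the Space
api.restart_space(repo_id="org/space-name")
```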
CHANGELOG.md DELETED
@@ -1,3 +0,0 @@
1
- # 0.1.0
2
-
3
- - We identified that the insurance policy data used in the Vidore V2 leaderboard falls within a grey area of licensing. While the data was publicly accessible, its licensing terms were not sufficiently clear to ensure compliance with our data usage standards. We have since removed this benchmark data from HuggingFace and removed the concerned columns from the leaderboard.
README.md CHANGED
@@ -7,8 +7,7 @@ sdk: gradio
7
  app_file: app.py
8
  pinned: true
9
  license: mit
10
- sdk_version: 6.3.0
11
- python_version: 3.12.11
12
  ---
13
 
14
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
7
  app_file: app.py
8
  pinned: true
9
  license: mit
10
+ sdk_version: 4.37.1
 
11
  ---
12
 
13
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py CHANGED
@@ -1,79 +1,20 @@
1
- import re
2
-
3
  import gradio as gr
4
- import pandas as pd
5
- import plotly.express as px
6
-
7
- from app.utils import (
8
- add_rank_and_format,
9
- deprecated_get_refresh_function,
10
- filter_models,
11
- get_pipeline_refresh_function,
12
- get_refresh_function,
13
- )
14
- from data.deprecated_model_handler import DeprecatedModelHandler
15
- from data.model_handler import ModelHandler
16
- from data.pipeline_handler import PipelineHandler
17
 
18
- METRICS = [
19
- "ndcg_at_1",
20
- "ndcg_at_5",
21
- "ndcg_at_10",
22
- "ndcg_at_100",
23
- "recall_at_1",
24
- "recall_at_5",
25
- "recall_at_10",
26
- "recall_at_100",
27
- ]
28
 
 
29
 
30
  def main():
31
- # Get new results
32
  model_handler = ModelHandler()
33
  initial_metric = "ndcg_at_5"
34
 
35
- data_benchmark_1 = model_handler.render_df(initial_metric, benchmark_version=1)
36
- data_benchmark_1 = add_rank_and_format(data_benchmark_1, benchmark_version=1)
37
-
38
- data_benchmark_2 = model_handler.render_df(initial_metric, benchmark_version=2)
39
- data_benchmark_2 = add_rank_and_format(data_benchmark_2, benchmark_version=2)
40
-
41
- num_datasets_1 = len(data_benchmark_1.columns) - 3
42
- num_scores_1 = len(data_benchmark_1) * num_datasets_1
43
- num_models_1 = len(data_benchmark_1)
44
-
45
- num_datasets_2 = len(data_benchmark_2.columns) - 3
46
- num_scores_2 = len(data_benchmark_2) * num_datasets_2
47
- num_models_2 = len(data_benchmark_2)
48
-
49
- deprecated_model_handler = DeprecatedModelHandler()
50
- initial_metric = "ndcg_at_5"
51
- initial_metric_v3 = "ndcg_at_10"
52
-
53
- # Get pipeline evaluation results
54
- pipeline_handler = PipelineHandler()
55
- pipeline_handler.get_pipeline_data()
56
- data_pipeline = pipeline_handler.render_df(initial_metric_v3, "english")
57
- data_pipeline = add_rank_and_format(data_pipeline, benchmark_version=3, is_pipeline=True)
58
-
59
- num_datasets_pipeline = len(data_pipeline.columns) - 5 # Excluding Rank, Model, Indexing time, search time, Average
60
- num_scores_pipeline = len(data_pipeline) * num_datasets_pipeline
61
- num_pipelines = len(data_pipeline)
62
-
63
- deprecated_model_handler.get_vidore_data(initial_metric)
64
- deprecated_data_benchmark_1 = deprecated_model_handler.render_df(initial_metric, benchmark_version=1)
65
- deprecated_data_benchmark_1 = add_rank_and_format(deprecated_data_benchmark_1, benchmark_version=1)
66
-
67
- deprecated_data_benchmark_2 = deprecated_model_handler.render_df(initial_metric, benchmark_version=2)
68
- deprecated_data_benchmark_2 = add_rank_and_format(deprecated_data_benchmark_2, benchmark_version=2)
69
-
70
- deprecated_num_datasets_1 = len(deprecated_data_benchmark_1.columns) - 3
71
- deprecated_num_scores_1 = len(deprecated_data_benchmark_1) * deprecated_num_datasets_1
72
- deprecated_num_models_1 = len(deprecated_data_benchmark_1)
73
 
74
- deprecated_num_datasets_2 = len(deprecated_data_benchmark_2.columns) - 3
75
- deprecated_num_scores_2 = len(deprecated_data_benchmark_2) * deprecated_num_datasets_2
76
- deprecated_num_models_2 = len(deprecated_data_benchmark_2)
77
 
78
  css = """
79
  table > thead {
@@ -96,812 +37,131 @@ def main():
96
  .markdown {
97
  font-size: 1rem;
98
  }
99
-
100
- .alert-info {
101
- background-color: #e3f2fd;
102
- border-left: 4px solid #2196f3;
103
- padding: 5px 15px;
104
- }
105
-
106
- /* 1. Force text wrapping on all headers from the 3rd column onwards */
107
- #pipeline-table table th:nth-child(n+3) * {
108
- white-space: normal !important;
109
- overflow: visible !important;
110
- text-overflow: clip !important;
111
- line-height: 1.2 !important;
112
- word-break: normal !important; /* Prevents breaking in the middle of words */
113
- overflow-wrap: normal !important; /* Prevents breaking in the middle of words */
114
- }
115
-
116
- /* 2. Set a fixed width and center alignment for headers and data cells from the 3rd column onwards */
117
- #pipeline-table table th:nth-child(n+3),
118
- #pipeline-table table td:nth-child(n+3) {
119
- min-width: 100px !important; /* Just wide enough to fit long words like "Pharmaceuticals" */
120
- max-width: 120px !important;
121
- }
122
-
123
- /* 3. Make the Model column (2nd column) wider for pipeline table */
124
- #pipeline-table table th:nth-child(2),
125
- #pipeline-table table td:nth-child(2) {
126
- min-width: 400px !important;
127
- max-width: 500px !important;
128
- }
129
  """
130
 
131
  with gr.Blocks(css=css) as block:
132
- with gr.Tabs() as tabs:
133
- with gr.TabItem("ViDoRe V3", id="vidore-v3"):
134
- gr.Markdown("# ViDoRe V3: a comprehensive evaluation of retrieval for enterprise use-case 👷‍♂️")
135
-
136
- gr.Markdown(
137
- """
138
- Visual Document Retrieval Benchmark 3 leaderboard. To submit results, refer to the corresponding tab.
139
-
140
- Refer to:
141
- - 🤗 The [blogpost](https://huggingface.co/blog/QuentinJG/introducing-vidore-v3) for all the details on the datasets,
142
- - 🤗 The [dataset collection](https://huggingface.co/collections/vidore/vidore-benchmark-v3),
143
- - 📝 The [ColPali paper](https://arxiv.org/abs/2407.01449) for details on metrics.
144
- """
145
- )
146
-
147
- gr.Markdown("""
148
- As the reference results are now hosted on the [MTEB Leaderboard](https://mteb-leaderboard.hf.space/?benchmark_name=ViDoRe(v3)),
149
- we embed it here.
150
- """)
151
-
152
- gr.Markdown(
153
- """**💡 To display English-only results:**
154
- - Under *Customize this Benchmark*, unselect the French datasets (*Vidore3EnergyRetrieval*, *Vidore3FinanceFrRetrieval*, *Vidore3PhysicsRetrieval*),
155
- - Go to the *Performance per language* tab (you might have to click on the three dots on the right of the tab bar to see it),
156
- - The *eng-Latn* column will show English-only results (= English queries on English documents).""",
157
- elem_classes="alert-info",
158
- )
159
-
160
- gr.HTML(
161
- """
162
- <iframe
163
- src="https://mteb-leaderboard.hf.space/?benchmark_name=ViDoRe(v3)"
164
- style="width:100%; height:1000px; border:2px solid black; border-radius:10px;"
165
- ></iframe>
166
- """
167
- )
168
-
169
- with gr.TabItem("ViDoRe V3 (Pipeline)", id="vidore-v3-pipeline"):
170
- gr.Markdown("# ViDoRe V3 (Pipeline Evaluation): Retrieval Performance for Complex Pipelines ⚙️")
171
- gr.Markdown(
172
- "### Assessing retrieval performance, latency, and compute costs of complex retrieval pipelines"
173
- )
174
-
175
- gr.Markdown(
176
- """
177
- This leaderboard ranks full retrieval pipelines on **English-only queries** for **ViDoRe V3**. Instead of just testing standalone models, we evaluate real-world, multi-step retrieval systems. This includes everything from basic retrievers to advanced setups using AI agents, query reformulation, hybrid search, and any other creative retrieval pipeline one can imagine.
178
-
179
- To show efficiency alongside accuracy, we include **Indexing latency** (seconds/doc) and **Search latency** (seconds/query). These numbers are self-reported and depend heavily on hardware, so treat them as rough estimates. Still, they give a helpful idea of how much computing power each pipeline needs.
180
-
181
- ⚠️ **Note:** Because this only uses English queries, these scores **cannot be directly compared** to the standard ViDoRe V3 results.
182
-
183
- *Results are sourced from the [vidore-benchmark repository](https://github.com/illuin-tech/vidore-benchmark/tree/main/results).*
184
- """
185
- )
186
-
187
- if len(data_pipeline) > 0:
188
- datasets_columns_pipeline = [
189
- col
190
- for col in data_pipeline.columns[4:]
191
- if col not in ["Indexing latency (s/doc)", "Search latency (s/query)", "Average Score"]
192
- ]
193
-
194
- with gr.Row():
195
- metric_dropdown_pipeline = gr.Dropdown(
196
- choices=METRICS, value=initial_metric_v3, label="Select Metric"
197
- )
198
- research_textbox_pipeline = gr.Textbox(
199
- placeholder="🔍 Search Pipelines... [press enter]",
200
- label="Filter Pipelines by Name",
201
- )
202
-
203
- with gr.Row():
204
- if datasets_columns_pipeline:
205
- column_checkboxes_pipeline = gr.CheckboxGroup(
206
- choices=datasets_columns_pipeline,
207
- value=datasets_columns_pipeline,
208
- label="Select Datasets to Display",
209
- )
210
- else:
211
- column_checkboxes_pipeline = gr.CheckboxGroup(
212
- choices=[], value=[], label="Select Datasets to Display"
213
- )
214
-
215
- with gr.Row():
216
- # Datatype: Rank, Pipeline, Indexing latency (s/doc), Search latency (s/query), Average + datasets
217
- datatype_pipeline = ["number", "markdown", "number", "number", "number"] + ["number"] * len(
218
- datasets_columns_pipeline
219
- )
220
- dataframe_pipeline = gr.Dataframe(
221
- data_pipeline, datatype=datatype_pipeline, type="pandas", elem_id="pipeline-table"
222
- )
223
-
224
- def clean_pipeline_name(name):
225
- if not isinstance(name, str):
226
- return str(name)
227
- # Remove Markdown links [text](url) -> text
228
- name = re.sub(r"\[([^\]]+)\]\([^\)]+\)", r"\1", name)
229
- # Remove HTML tags <a href="...">text</a> -> text
230
- name = re.sub(r"<[^>]+>", "", name)
231
- return name.strip()
232
-
233
- def create_pipeline_plot(df, latency_col):
234
- if df is None or len(df) == 0:
235
- return None
236
-
237
- # Ensure expected columns exist
238
- if (
239
- latency_col not in df.columns
240
- or "Average Score" not in df.columns
241
- or "Pipeline" not in df.columns
242
- ):
243
- return None
244
-
245
- # Clean the dataframe for plotting
246
- plot_df = df.copy()
247
-
248
- # Strip HTML and Markdown for clean hover text
249
- plot_df["Cleaned Pipeline"] = plot_df["Pipeline"].apply(clean_pipeline_name)
250
-
251
- plot_df[latency_col] = pd.to_numeric(plot_df[latency_col], errors="coerce")
252
- plot_df["Average Score"] = pd.to_numeric(plot_df["Average Score"], errors="coerce")
253
-
254
- plot_df = plot_df.dropna(subset=[latency_col, "Average Score"])
255
- plot_df = plot_df[plot_df[latency_col] > 0]
256
- plot_df = plot_df.sort_values(by=latency_col)
257
-
258
- if len(plot_df) == 0:
259
- return None
260
-
261
- fig = px.scatter(
262
- plot_df,
263
- x=latency_col,
264
- y="Average Score",
265
- hover_name="Cleaned Pipeline", # Use the clean text!
266
- title=f"Mean Performance vs {latency_col}",
267
- color_discrete_sequence=["orange"],
268
- )
269
-
270
- fig.update_layout(
271
- xaxis_title=latency_col,
272
- yaxis_title="Average Score",
273
- plot_bgcolor="white",
274
- )
275
-
276
- fig.update_xaxes(showgrid=True, gridcolor="lightgrey")
277
- fig.update_yaxes(showgrid=True, gridcolor="lightgrey")
278
-
279
- fig.update_traces(marker=dict(size=12, opacity=0.8, line=dict(width=1, color="DarkSlateGrey")))
280
- return fig
281
-
282
- with gr.Row():
283
- latency_radio = gr.Radio(
284
- choices=["Search latency (s/query)", "Indexing latency (s/doc)"],
285
- value="Search latency (s/query)",
286
- label="Select Latency Metric for X-Axis",
287
- )
288
-
289
- with gr.Row():
290
- initial_fig = create_pipeline_plot(data_pipeline, "Search latency (s/query)")
291
- performance_plot = gr.Plot(value=initial_fig)
292
-
293
- def update_data_pipeline(metric, search_term, selected_columns):
294
- pipeline_handler.get_pipeline_data()
295
- data = pipeline_handler.render_df(metric, "english")
296
- data = add_rank_and_format(
297
- data, benchmark_version=3, selected_columns=selected_columns, is_pipeline=True
298
- )
299
- data = filter_models(data, search_term)
300
- if selected_columns:
301
- # Include core columns plus selected dataset columns
302
- core_cols = ["Rank", "Pipeline", "Indexing latency (s/doc)", "Search latency (s/query)"]
303
- if "Average Score" in data.columns:
304
- core_cols.insert(4, "Average Score")
305
- data = data[core_cols + selected_columns]
306
- return data
307
-
308
- with gr.Row():
309
- refresh_button_pipeline = gr.Button("Refresh")
310
- refresh_button_pipeline.click(
311
- lambda metric: add_rank_and_format(
312
- pipeline_handler.render_df(metric, "english"), benchmark_version=3, is_pipeline=True
313
- ),
314
- inputs=[metric_dropdown_pipeline],
315
- outputs=dataframe_pipeline,
316
- concurrency_limit=20,
317
- ).then(
318
- fn=create_pipeline_plot,
319
- inputs=[dataframe_pipeline, latency_radio],
320
- outputs=performance_plot,
321
- )
322
-
323
- with gr.Row():
324
- gr.Markdown(
325
- """
326
- **Note**: These results represent full pipeline evaluations on English queries ONLY (since queries in other languages were mostly direct translations of their English counterparts).
327
- We felt multilingual results were less critical for pipelines (and much more costly to evaluate on the full set), since a user could simply add a translation module to the pipeline and expect performance similar to the English results.
328
- If you feel this is a mistake and multilingual results are critical for pipelines, please let us know by opening an issue in the GitHub repository!
329
- """
330
- )
331
-
332
- # Automatically refresh the dataframe when the dropdown value changes
333
- def refresh_pipeline_data(metric):
334
- """Refresh pipeline data when metric changes."""
335
- df = pipeline_handler.render_df(metric, "english")
336
- return add_rank_and_format(df, benchmark_version=3, is_pipeline=True)
337
-
338
- # Update dataframe and then update the plot
339
- metric_dropdown_pipeline.change(
340
- refresh_pipeline_data,
341
- inputs=[metric_dropdown_pipeline],
342
- outputs=dataframe_pipeline,
343
- ).then(
344
- fn=create_pipeline_plot, inputs=[dataframe_pipeline, latency_radio], outputs=performance_plot
345
- )
346
-
347
- research_textbox_pipeline.submit(
348
- lambda metric, search_term, selected_columns: update_data_pipeline(
349
- metric, search_term, selected_columns
350
- ),
351
- inputs=[metric_dropdown_pipeline, research_textbox_pipeline, column_checkboxes_pipeline],
352
- outputs=dataframe_pipeline,
353
- ).then(
354
- fn=create_pipeline_plot, inputs=[dataframe_pipeline, latency_radio], outputs=performance_plot
355
- )
356
-
357
- column_checkboxes_pipeline.change(
358
- lambda metric, search_term, selected_columns: update_data_pipeline(
359
- metric, search_term, selected_columns
360
- ),
361
- inputs=[metric_dropdown_pipeline, research_textbox_pipeline, column_checkboxes_pipeline],
362
- outputs=dataframe_pipeline,
363
- ).then(
364
- fn=create_pipeline_plot, inputs=[dataframe_pipeline, latency_radio], outputs=performance_plot
365
- )
366
-
367
- # Update plot when the radio button changes
368
- latency_radio.change(
369
- fn=create_pipeline_plot, inputs=[dataframe_pipeline, latency_radio], outputs=performance_plot
370
- )
371
-
372
- gr.Markdown(
373
- f"""
374
- - **Total Datasets**: {num_datasets_pipeline}
375
- - **Total Scores**: {num_scores_pipeline}
376
- - **Total Pipelines**: {num_pipelines}
377
- """
378
- + r"""
379
- Please consider citing:
380
-
381
- ```bibtex
382
- @misc{faysse2024colpaliefficientdocumentretrieval,
383
- title={ColPali: Efficient Document Retrieval with Vision Language Models},
384
- author={Manuel Faysse and Hugues Sibille and Tony Wu and Bilel Omrani and Gautier Viaud and Céline Hudelot and Pierre Colombo},
385
- year={2024},
386
- eprint={2407.01449},
387
- archivePrefix={arXiv},
388
- primaryClass={cs.IR},
389
- url={https://arxiv.org/abs/2407.01449},
390
- }
391
- @misc{loison2026vidore,
392
- title={ViDoRe V3: A Comprehensive Evaluation of Retrieval Augmented Generation in Complex Real-World Scenarios},
393
- author={Loison, Ant{\'o}nio and Mac{\'e}, Quentin and Edy, Antoine and Xing, Victor and Balough, Tom and Moreira, Gabriel and Liu, Bo and Faysse, Manuel and Hudelot, C{\'e}line and Viaud, Gautier},
394
- journal={arXiv preprint arXiv:2601.08620},
395
- year={2026}
396
- }
397
- ```
398
- """
399
- )
400
- else:
401
- gr.Markdown("**No pipeline evaluation results available yet. Check back later!**")
402
-
403
- with gr.TabItem("ViDoRe V2", id="vidore-v2"):
404
- gr.Markdown("# ViDoRe V2: A new visual Document Retrieval Benchmark 📚🔍")
405
- gr.Markdown("### A harder dataset benchmark for visual document retrieval 👀")
406
 
407
  gr.Markdown(
408
  """
409
- Visual Document Retrieval Benchmark 2 leaderboard. To submit results, refer to the corresponding tab.
410
 
411
- Refer to the [ColPali paper](https://arxiv.org/abs/2407.01449) for details on metrics and models.
412
  """
413
  )
414
- datasets_columns_2 = list(data_benchmark_2.columns[4:])
 
 
415
 
416
  with gr.Row():
417
- metric_dropdown_2 = gr.Dropdown(choices=METRICS, value=initial_metric, label="Select Metric")
418
- research_textbox_2 = gr.Textbox(
419
- placeholder="🔍 Search Models... [press enter]",
420
- label="Filter Models by Name",
421
- )
422
- column_checkboxes_2 = gr.CheckboxGroup(
423
- choices=datasets_columns_2, value=datasets_columns_2, label="Select Columns to Display"
424
- )
425
 
426
  with gr.Row():
427
- datatype_2 = ["number", "markdown"] + ["number"] * (num_datasets_2 + 1)
428
- dataframe_2 = gr.Dataframe(data_benchmark_2, datatype=datatype_2, type="pandas")
429
 
430
- def update_data_2(metric, search_term, selected_columns):
431
- model_handler.get_vidore_data(metric)
432
- data = model_handler.render_df(metric, benchmark_version=2)
433
- data = add_rank_and_format(data, benchmark_version=2, selected_columns=selected_columns)
434
  data = filter_models(data, search_term)
435
- # data = remove_duplicates(data) # Add this line
436
  if selected_columns:
437
- data = data[
438
- ["Rank", "Model", "Model Size (Million Parameters)", "Average Score"] + selected_columns
439
- ]
440
  return data
441
 
442
  with gr.Row():
443
- refresh_button_2 = gr.Button("Refresh")
444
- refresh_button_2.click(
445
- get_refresh_function(model_handler, benchmark_version=2),
446
- inputs=[metric_dropdown_2],
447
- outputs=dataframe_2,
448
- concurrency_limit=20,
449
- )
450
 
451
- with gr.Row():
452
- gr.Markdown(
453
- """
454
- **Note**: For now, all models were evaluated using the vidore-benchmark package and custom retrievers on our side.
455
- These numbers were not obtained from the organisations that released the models.
456
- """
457
- )
458
 
459
  # Automatically refresh the dataframe when the dropdown value changes
460
- metric_dropdown_2.change(
461
- get_refresh_function(model_handler, benchmark_version=2),
462
- inputs=[metric_dropdown_2],
463
- outputs=dataframe_2,
464
- )
465
- research_textbox_2.submit(
466
- lambda metric, search_term, selected_columns: update_data_2(metric, search_term, selected_columns),
467
- inputs=[metric_dropdown_2, research_textbox_2, column_checkboxes_2],
468
- outputs=dataframe_2,
469
  )
470
- column_checkboxes_2.change(
471
- lambda metric, search_term, selected_columns: update_data_2(metric, search_term, selected_columns),
472
- inputs=[metric_dropdown_2, research_textbox_2, column_checkboxes_2],
473
- outputs=dataframe_2,
474
  )
475
 
476
- gr.Markdown(
477
- f"""
478
- - **Total Datasets**: {num_datasets_2}
479
- - **Total Scores**: {num_scores_2}
480
- - **Total Models**: {num_models_2}
481
- """
482
- + r"""
483
- Please consider citing:
484
-
485
- ```bibtex
486
- @misc{faysse2024colpaliefficientdocumentretrieval,
487
- title={ColPali: Efficient Document Retrieval with Vision Language Models},
488
- author={Manuel Faysse and Hugues Sibille and Tony Wu and Bilel Omrani and Gautier Viaud and Céline Hudelot and Pierre Colombo},
489
- year={2024},
490
- eprint={2407.01449},
491
- archivePrefix={arXiv},
492
- primaryClass={cs.IR},
493
- url={https://arxiv.org/abs/2407.01449},
494
- }
495
 
496
- @misc{macé2025vidorebenchmarkv2raising,
497
- title={ViDoRe Benchmark V2: Raising the Bar for Visual Retrieval},
498
- author={Quentin Macé and António Loison and Manuel Faysse},
499
- year={2025},
500
- eprint={2505.17166},
501
- archivePrefix={arXiv},
502
- primaryClass={cs.IR},
503
- url={https://arxiv.org/abs/2505.17166},
504
- }
505
- ```
506
- """
507
- )
508
- with gr.TabItem("ViDoRe V1", id="vidore-v1"):
509
- gr.Markdown("# ViDoRe: The Visual Document Retrieval Benchmark 1 📚🔍")
510
- gr.Markdown("### From the paper - ColPali: Efficient Document Retrieval with Vision Language Models 👀")
511
-
512
- gr.Markdown(
513
- """
514
- Visual Document Retrieval Benchmark 1 leaderboard. To submit results, refer to the corresponding tab.
515
-
516
- Refer to the [ColPali paper](https://arxiv.org/abs/2407.01449) for details on metrics, tasks and models.
517
- """
518
- )
519
- datasets_columns_1 = list(data_benchmark_1.columns[4:])
520
-
521
- with gr.Row():
522
- metric_dropdown_1 = gr.Dropdown(choices=METRICS, value=initial_metric, label="Select Metric")
523
- research_textbox_1 = gr.Textbox(
524
- placeholder="🔍 Search Models... [press enter]",
525
- label="Filter Models by Name",
526
- )
527
- column_checkboxes_1 = gr.CheckboxGroup(
528
- choices=datasets_columns_1, value=datasets_columns_1, label="Select Columns to Display"
529
- )
530
-
531
- with gr.Row():
532
- datatype_1 = ["number", "markdown"] + ["number"] * (num_datasets_1 + 1)
533
- dataframe_1 = gr.Dataframe(data_benchmark_1, datatype=datatype_1, type="pandas")
534
-
535
- def update_data_1(metric, search_term, selected_columns):
536
- model_handler.get_vidore_data(metric)
537
- data = model_handler.render_df(metric, benchmark_version=1)
538
- data = add_rank_and_format(data, benchmark_version=1, selected_columns=selected_columns)
539
- data = filter_models(data, search_term)
540
- if selected_columns:
541
- data = data[
542
- ["Rank", "Model", "Model Size (Million Parameters)", "Average Score"] + selected_columns
543
- ]
544
- return data
545
-
546
- with gr.Row():
547
- refresh_button_1 = gr.Button("Refresh")
548
- refresh_button_1.click(
549
- get_refresh_function(model_handler, benchmark_version=1),
550
- inputs=[metric_dropdown_1],
551
- outputs=dataframe_1,
552
- concurrency_limit=20,
553
- )
554
-
555
- # Automatically refresh the dataframe when the dropdown value changes
556
- metric_dropdown_1.change(
557
- get_refresh_function(model_handler, benchmark_version=1),
558
- inputs=[metric_dropdown_1],
559
- outputs=dataframe_1,
560
- )
561
- research_textbox_1.submit(
562
- lambda metric, search_term, selected_columns: update_data_1(metric, search_term, selected_columns),
563
- inputs=[metric_dropdown_1, research_textbox_1, column_checkboxes_1],
564
- outputs=dataframe_1,
565
- )
566
- column_checkboxes_1.change(
567
- lambda metric, search_term, selected_columns: update_data_1(metric, search_term, selected_columns),
568
- inputs=[metric_dropdown_1, research_textbox_1, column_checkboxes_1],
569
- outputs=dataframe_1,
570
- )
571
 
572
  gr.Markdown(
573
  f"""
574
- - **Total Datasets**: {num_datasets_1}
575
- - **Total Scores**: {num_scores_1}
576
- - **Total Models**: {num_models_1}
577
  """
578
  + r"""
579
  Please consider citing:
580
 
581
  ```bibtex
582
  @misc{faysse2024colpaliefficientdocumentretrieval,
583
- title={ColPali: Efficient Document Retrieval with Vision Language Models},
584
  author={Manuel Faysse and Hugues Sibille and Tony Wu and Bilel Omrani and Gautier Viaud and Céline Hudelot and Pierre Colombo},
585
  year={2024},
586
  eprint={2407.01449},
587
  archivePrefix={arXiv},
588
  primaryClass={cs.IR},
589
- url={https://arxiv.org/abs/2407.01449},
590
  }
591
- @misc{macé2025vidorebenchmarkv2raising,
592
- title={ViDoRe Benchmark V2: Raising the Bar for Visual Retrieval},
593
- author={Quentin Macé and António Loison and Manuel Faysse},
594
- year={2025},
595
- eprint={2505.17166},
596
- archivePrefix={arXiv},
597
- primaryClass={cs.IR},
598
- url={https://arxiv.org/abs/2505.17166},
599
- }
600
-
601
  ```
602
  """
603
  )
604
- with gr.TabItem("📚 Submit your model", id="submission"):
605
  gr.Markdown("# How to Submit a New Model to the Leaderboard")
606
  gr.Markdown(
607
  """
608
- ## Original (ViDoRe V1-V3) leaderboard:
609
- To submit a new model to the original ViDoRe leaderboard, follow these steps:
610
 
611
  1. **Evaluate your model**:
612
- - Follow the MTEB-based evaluation procedure provided in the [ViDoRe GitHub repository](https://github.com/illuin-tech/vidore-benchmark/) (a sketch follows after these steps).
613
 
614
  2. **Format your submission file**:
615
- - Add the generated files to [MTEB results](https://github.com/embeddings-benchmark/results) project. Check the [Colpali results](https://github.com/embeddings-benchmark/results/tree/main/results/vidore__colpali-v1.3/1b5c8929330df1a66de441a9b5409a878f0de5b0) for an example.
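A minimal sketch of the evaluation step with the mteb package (the model name is illustrative; the task name is one of the ViDoRe V1 tasks listed in `data/dataset_handler.py`; see the vidore-benchmark repository for the exact, supported procedure):

```python
# Minimal sketch, assuming the mteb package is installed.
import mteb

model = mteb.get_model("vidore/colpali-v1.3")  # illustrative model name
tasks = mteb.get_tasks(tasks=["VidoreArxivQARetrieval"])  # one ViDoRe V1 task
evaluation = mteb.MTEB(tasks=tasks)
# The generated result files are what gets added to the MTEB results repository.
results = evaluation.run(model, output_folder="results")
```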
616
 
617
  And you're done! Your model will appear on the leaderboard when you click refresh! Once the space
618
  gets rebooted, it will appear on startup.
619
 
620
  Note: For proper hyperlink redirection, please ensure that your model repository name is in
621
  kebab-case, e.g. `my-model-name`.
622
-
623
-
624
- ## ViDoRe V3 Pipeline leaderboard:
625
- To submit a new pipeline to the ViDoRe V3 pipeline leaderboard, follow these steps:
626
- 1. **Evaluate your pipeline**:
627
- - Follow the evaluation procedure provided in the [ViDoRe GitHub repository](https://github.com/illuin-tech/vidore-benchmark/) for pipelines
628
- 2. **Open a PR on the ViDoRe GitHub repository including**:
629
- - Your results, which are output directly in the correct format
630
- - A short description of the pipeline and the main components used (some examples are available in the repository)
631
-
632
- And you're done! Your pipeline will appear on the leaderboard once the PR is merged!
633
- """
634
- )
635
- with gr.TabItem("Deprecated ViDoRe V1", id="vidore-v1-deprecated"):
636
- gr.Markdown(
637
- "## <span style='color:red'>Deprecation notice: This leaderboard contains the results computed with the "
638
- "[vidore-benchmark](https://github.com/illuin-tech/vidore-benchmark) package, "
639
- "which is no longer maintained. Results should be computed using the "
640
- "[mteb](https://github.com/embeddings-benchmark/mteb) package as described "
641
- "[here](https://github.com/illuin-tech/vidore-benchmark/blob/main/README.md).</span>"
642
- )
643
- gr.Markdown(
644
- "## <span style='color:red'>Missing results in the new leaderboard are being added as they are re-computed.</span>"
645
- )
646
- gr.Markdown(
647
- "# <span style='color:red'>[Deprecated]</span> ViDoRe: The Visual Document Retrieval Benchmark 1 📚🔍"
648
- )
649
- gr.Markdown("### From the paper - ColPali: Efficient Document Retrieval with Vision Language Models 👀")
650
-
651
- gr.Markdown(
652
- """
653
- Visual Document Retrieval Benchmark 1 leaderboard. To submit results, refer to the corresponding tab.
654
-
655
- Refer to the [ColPali paper](https://arxiv.org/abs/2407.01449) for details on metrics, tasks and models.
656
- """
657
- )
658
- deprecated_datasets_columns_1 = list(deprecated_data_benchmark_1.columns[3:])
659
-
660
- with gr.Row():
661
- deprecated_metric_dropdown_1 = gr.Dropdown(
662
- choices=METRICS, value=initial_metric, label="Select Metric"
663
- )
664
- deprecated_research_textbox_1 = gr.Textbox(
665
- placeholder="🔍 Search Models... [press enter]",
666
- label="Filter Models by Name",
667
- )
668
- deprecated_column_checkboxes_1 = gr.CheckboxGroup(
669
- choices=deprecated_datasets_columns_1,
670
- value=deprecated_datasets_columns_1,
671
- label="Select Columns to Display",
672
- )
673
-
674
- with gr.Row():
675
- deprecated_datatype_1 = ["number", "markdown"] + ["number"] * (deprecated_num_datasets_1 + 1)
676
- deprecated_dataframe_1 = gr.Dataframe(
677
- deprecated_data_benchmark_1, datatype=deprecated_datatype_1, type="pandas"
678
- )
679
-
680
- def deprecated_update_data_1(metric, search_term, selected_columns):
681
- deprecated_model_handler.get_vidore_data(metric)
682
- data = deprecated_model_handler.render_df(metric, benchmark_version=1)
683
- data = add_rank_and_format(data, benchmark_version=1, selected_columns=selected_columns)
684
- data = filter_models(data, search_term)
685
- # data = remove_duplicates(data) # Add this line
686
- if selected_columns:
687
- data = data[["Rank", "Model", "Average Score"] + selected_columns]
688
- return data
689
-
690
- with gr.Row():
691
- deprecated_refresh_button_1 = gr.Button("Refresh")
692
- deprecated_refresh_button_1.click(
693
- deprecated_get_refresh_function(deprecated_model_handler, benchmark_version=1),
694
- inputs=[deprecated_metric_dropdown_1],
695
- outputs=deprecated_dataframe_1,
696
- concurrency_limit=20,
697
- )
698
-
699
- # Automatically refresh the dataframe when the dropdown value changes
700
- deprecated_metric_dropdown_1.change(
701
- deprecated_get_refresh_function(deprecated_model_handler, benchmark_version=1),
702
- inputs=[deprecated_metric_dropdown_1],
703
- outputs=deprecated_dataframe_1,
704
- )
705
- deprecated_research_textbox_1.submit(
706
- lambda metric, search_term, selected_columns: deprecated_update_data_1(
707
- metric, search_term, selected_columns
708
- ),
709
- inputs=[
710
- deprecated_metric_dropdown_1,
711
- deprecated_research_textbox_1,
712
- deprecated_column_checkboxes_1,
713
- ],
714
- outputs=deprecated_dataframe_1,
715
- )
716
- deprecated_column_checkboxes_1.change(
717
- lambda metric, search_term, selected_columns: deprecated_update_data_1(
718
- metric, search_term, selected_columns
719
- ),
720
- inputs=[
721
- deprecated_metric_dropdown_1,
722
- deprecated_research_textbox_1,
723
- deprecated_column_checkboxes_1,
724
- ],
725
- outputs=deprecated_dataframe_1,
726
- )
727
-
728
- gr.Markdown(
729
- f"""
730
- - **Total Datasets**: {deprecated_num_datasets_1}
731
- - **Total Scores**: {deprecated_num_scores_1}
732
- - **Total Models**: {deprecated_num_models_1}
733
- """
734
- + r"""
735
- Please consider citing:
736
-
737
- ```bibtex
738
- @misc{faysse2024colpaliefficientdocumentretrieval,
739
- title={ColPali: Efficient Document Retrieval with Vision Language Models},
740
- author={Manuel Faysse and Hugues Sibille and Tony Wu and Bilel Omrani and Gautier Viaud and Céline Hudelot and Pierre Colombo},
741
- year={2024},
742
- eprint={2407.01449},
743
- archivePrefix={arXiv},
744
- primaryClass={cs.IR},
745
- url={https://arxiv.org/abs/2407.01449},
746
- }
747
-
748
- @misc{macé2025vidorebenchmarkv2raising,
749
- title={ViDoRe Benchmark V2: Raising the Bar for Visual Retrieval},
750
- author={Quentin Macé and António Loison and Manuel Faysse},
751
- year={2025},
752
- eprint={2505.17166},
753
- archivePrefix={arXiv},
754
- primaryClass={cs.IR},
755
- url={https://arxiv.org/abs/2505.17166},
756
- }
757
- ```
758
- """
759
- )
760
- with gr.TabItem("Deprecated ViDoRe V2", id="vidore-v2-deprecated"):
761
- gr.Markdown(
762
- "## <span style='color:red'>Deprecation notice: This leaderboard contains the results computed with the "
763
- "[vidore-benchmark](https://github.com/illuin-tech/vidore-benchmark) package, "
764
- "which is no longer maintained. Results should be computed using the "
765
- "[mteb](https://github.com/embeddings-benchmark/mteb) package as described "
766
- "[here](https://github.com/illuin-tech/vidore-benchmark/blob/main/README.md).</span>"
767
- )
768
- gr.Markdown(
769
- "## <span style='color:red'>Missing results in the new leaderboard are being added as they are re-computed.</span>"
770
- )
771
- gr.Markdown(
772
- "# <span style='color:red'>[Deprecated]</span> ViDoRe V2: A new visual Document Retrieval Benchmark 📚🔍"
773
- )
774
- gr.Markdown("### A harder dataset benchmark for visual document retrieval 👀")
775
-
776
- gr.Markdown(
777
  """
778
- Visual Document Retrieval Benchmark 2 leaderboard. To submit results, refer to the corresponding tab.
779
-
780
- Refer to the [ColPali paper](https://arxiv.org/abs/2407.01449) for details on metrics and models.
781
- """
782
  )
783
- deprecated_datasets_columns_2 = list(deprecated_data_benchmark_2.columns[3:])
784
-
785
- with gr.Row():
786
- deprecated_metric_dropdown_2 = gr.Dropdown(
787
- choices=METRICS, value=initial_metric, label="Select Metric"
788
- )
789
- deprecated_research_textbox_2 = gr.Textbox(
790
- placeholder="🔍 Search Models... [press enter]",
791
- label="Filter Models by Name",
792
- )
793
- deprecated_column_checkboxes_2 = gr.CheckboxGroup(
794
- choices=deprecated_datasets_columns_2,
795
- value=deprecated_datasets_columns_2,
796
- label="Select Columns to Display",
797
- )
798
 
799
- with gr.Row():
800
- deprecated_datatype_2 = ["number", "markdown"] + ["number"] * (deprecated_num_datasets_2 + 1)
801
- deprecated_dataframe_2 = gr.Dataframe(
802
- deprecated_data_benchmark_2, datatype=deprecated_datatype_2, type="pandas"
803
- )
804
-
805
- def deprecated_update_data_2(metric, search_term, selected_columns):
806
- deprecated_model_handler.get_vidore_data(metric)
807
- data = deprecated_model_handler.render_df(metric, benchmark_version=2)
808
- data = add_rank_and_format(data, benchmark_version=2, selected_columns=selected_columns)
809
- data = filter_models(data, search_term)
810
- # data = remove_duplicates(data) # Add this line
811
- if selected_columns:
812
- data = data[["Rank", "Model", "Average Score"] + selected_columns]
813
- return data
814
-
815
- with gr.Row():
816
- deprecated_refresh_button_2 = gr.Button("Refresh")
817
- deprecated_refresh_button_2.click(
818
- deprecated_get_refresh_function(deprecated_model_handler, benchmark_version=2),
819
- inputs=[deprecated_metric_dropdown_2],
820
- outputs=deprecated_dataframe_2,
821
- concurrency_limit=20,
822
- )
823
-
824
- with gr.Row():
825
- gr.Markdown(
826
- """
827
- **Note**: For now, all models were evaluated using the vidore-benchmark package and custom retrievers on our side.
828
- These numbers were not obtained from the organisations that released the models.
829
- """
830
- )
831
-
832
- # Automatically refresh the dataframe when the dropdown value changes
833
- deprecated_metric_dropdown_2.change(
834
- deprecated_get_refresh_function(deprecated_model_handler, benchmark_version=2),
835
- inputs=[deprecated_metric_dropdown_2],
836
- outputs=deprecated_dataframe_2,
837
- )
838
- deprecated_research_textbox_2.submit(
839
- lambda metric, search_term, selected_columns: deprecated_update_data_2(
840
- metric, search_term, selected_columns
841
- ),
842
- inputs=[
843
- deprecated_metric_dropdown_2,
844
- deprecated_research_textbox_2,
845
- deprecated_column_checkboxes_2,
846
- ],
847
- outputs=deprecated_dataframe_2,
848
- )
849
- deprecated_column_checkboxes_2.change(
850
- lambda metric, search_term, selected_columns: deprecated_update_data_2(
851
- metric, search_term, selected_columns
852
- ),
853
- inputs=[
854
- deprecated_metric_dropdown_2,
855
- deprecated_research_textbox_2,
856
- deprecated_column_checkboxes_2,
857
- ],
858
- outputs=deprecated_dataframe_2,
859
- )
860
-
861
- gr.Markdown(
862
- f"""
863
- - **Total Datasets**: {deprecated_num_datasets_2}
864
- - **Total Scores**: {deprecated_num_scores_2}
865
- - **Total Models**: {deprecated_num_models_2}
866
- """
867
- + r"""
868
- Please consider citing:
869
-
870
- ```bibtex
871
- @misc{faysse2024colpaliefficientdocumentretrieval,
872
- title={ColPali: Efficient Document Retrieval with Vision Language Models},
873
- author={Manuel Faysse and Hugues Sibille and Tony Wu and Bilel Omrani and Gautier Viaud and Céline Hudelot and Pierre Colombo},
874
- year={2024},
875
- eprint={2407.01449},
876
- archivePrefix={arXiv},
877
- primaryClass={cs.IR},
878
- url={https://arxiv.org/abs/2407.01449},
879
- }
880
-
881
- @misc{macé2025vidorebenchmarkv2raising,
882
- title={ViDoRe Benchmark V2: Raising the Bar for Visual Retrieval},
883
- author={Quentin Macé and António Loison and Manuel Faysse},
884
- year={2025},
885
- eprint={2505.17166},
886
- archivePrefix={arXiv},
887
- primaryClass={cs.IR},
888
- url={https://arxiv.org/abs/2505.17166},
889
- }
890
- ```
891
- """
892
- )
893
-
894
- def select_tab_from_url(request: gr.Request):
895
- # Grab query parameters from the URL
896
- query_params = dict(request.query_params)
897
- # Look for ?tab=..., default to the first tab's ID if not found
898
- target_tab = query_params.get("tab", "vidore-v3")
899
- # Update the tabs component to select the target ID
900
- return gr.update(selected=target_tab)
901
-
902
- block.load(select_tab_from_url, inputs=None, outputs=tabs)
903
  block.queue(max_size=10).launch(debug=True)
904
 
905
 
906
- if __name__ == "__main__":
907
  main()
1
  import gradio as gr
2
 
3
+ from app.utils import add_rank_and_format, filter_models, get_refresh_function
4
+ from data.model_handler import ModelHandler
5
 
6
+ METRICS = ["ndcg_at_5", "recall_at_1"]
7
 
8
  def main():
 
9
  model_handler = ModelHandler()
10
  initial_metric = "ndcg_at_5"
11
 
12
+ data = model_handler.get_vidore_data(initial_metric)
13
+ data = add_rank_and_format(data)
14
 
15
+ NUM_DATASETS = len(data.columns) - 3
16
+ NUM_SCORES = len(data) * NUM_DATASETS
17
+ NUM_MODELS = len(data)
18
 
19
  css = """
20
  table > thead {
 
37
  .markdown {
38
  font-size: 1rem;
39
  }
40
  """
41
 
42
  with gr.Blocks(css=css) as block:
43
+ with gr.Tabs():
44
+ with gr.TabItem("🏆 Leaderboard"):
45
+ gr.Markdown("# ViDoRe: The Visual Document Retrieval Benchmark 📚🔍")
46
+ gr.Markdown("### From the paper - ColPali: Efficient Document Retrieval with Vision Language Models 👀")
47
 
48
  gr.Markdown(
49
  """
50
+ Visual Document Retrieval Benchmark leaderboard. To submit results, refer to the corresponding tab.
51
 
52
+ Refer to the [ColPali paper](https://arxiv.org/abs/2407.01449) for details on metrics, tasks and models.
53
  """
54
  )
55
+ datasets_columns = list(data.columns[3:])
56
+ anchor_columns = list(data.columns[:3])
57
+ default_columns = anchor_columns + datasets_columns
58
 
59
  with gr.Row():
60
+ metric_dropdown = gr.Dropdown(choices=METRICS, value=initial_metric, label="Select Metric")
61
+ research_textbox = gr.Textbox(placeholder="🔍 Search Models... [press enter]", label="Filter Models by Name")
62
+ column_checkboxes = gr.CheckboxGroup(choices=datasets_columns, value=default_columns, label="Select Columns to Display")
63
 
64
  with gr.Row():
65
+ datatype = ["number", "markdown"] + ["number"] * (NUM_DATASETS + 1)
66
+ dataframe = gr.Dataframe(data, datatype=datatype, type="pandas")
67
 
68
+ def update_data(metric, search_term, selected_columns):
69
+ data = model_handler.get_vidore_data(metric)
70
+ data = add_rank_and_format(data)
 
71
  data = filter_models(data, search_term)
 
72
  if selected_columns:
74
+ data = data[selected_columns]
 
75
  return data
76
 
77
  with gr.Row():
78
+ refresh_button = gr.Button("Refresh")
79
+ refresh_button.click(get_refresh_function(), inputs=[metric_dropdown], outputs=dataframe, concurrency_limit=20)
80
81
 
82
  # Automatically refresh the dataframe when the dropdown value changes
83
+ metric_dropdown.change(get_refresh_function(), inputs=[metric_dropdown], outputs=dataframe)
84
+ research_textbox.submit(
85
+ lambda metric, search_term, selected_columns: update_data(metric, search_term, selected_columns),
86
+ inputs=[metric_dropdown, research_textbox, column_checkboxes],
87
+ outputs=dataframe
88
  )
89
+ column_checkboxes.change(
90
+ lambda metric, search_term, selected_columns: update_data(metric, search_term, selected_columns),
91
+ inputs=[metric_dropdown, research_textbox, column_checkboxes],
92
+ outputs=dataframe
93
  )
94
 
95
+ #column_checkboxes.change(get_refresh_function(), inputs=[metric_dropdown, column_checkboxes], outputs=dataframe)
96
97
 
98
  gr.Markdown(
99
  f"""
100
+ - **Total Datasets**: {NUM_DATASETS}
101
+ - **Total Scores**: {NUM_SCORES}
102
+ - **Total Models**: {NUM_MODELS}
103
  """
104
  + r"""
105
  Please consider citing:
106
 
107
  ```bibtex
108
  @misc{faysse2024colpaliefficientdocumentretrieval,
109
+ title={ColPali: Efficient Document Retrieval with Vision Language Models},
110
  author={Manuel Faysse and Hugues Sibille and Tony Wu and Bilel Omrani and Gautier Viaud and Céline Hudelot and Pierre Colombo},
111
  year={2024},
112
  eprint={2407.01449},
113
  archivePrefix={arXiv},
114
  primaryClass={cs.IR},
115
+ url={https://arxiv.org/abs/2407.01449},
116
  }
117
  ```
118
  """
119
  )
120
+ with gr.TabItem("📚 Submit your model"):
121
  gr.Markdown("# How to Submit a New Model to the Leaderboard")
122
  gr.Markdown(
123
  """
124
+ To submit a new model to the ViDoRe leaderboard, follow these steps:
 
125
 
126
  1. **Evaluate your model**:
127
+ - Follow the evaluation script provided in the [ViDoRe GitHub repository](https://github.com/illuin-tech/vidore-benchmark/)
128
 
129
  2. **Format your submission file**:
130
+ - The submission file should be generated automatically and named `results.json`, with the
131
+ following structure:
132
+ ```json
133
+ {
134
+ "dataset_name_1": {
135
+ "metric_1": score_1,
136
+ "metric_2": score_2,
137
+ ...
138
+ },
139
+ "dataset_name_2": {
140
+ "metric_1": score_1,
141
+ "metric_2": score_2,
142
+ ...
143
+ },
144
+ }
145
+ ```
146
+ - The dataset names should be the same as the ViDoRe dataset names listed in the following
147
+ collection: [ViDoRe Benchmark](https://huggingface.co/collections/vidore/vidore-benchmark-667173f98e70a1c0fa4db00d).
148
+
149
+ 3. **Submit your model**:
150
+ - Create a public HuggingFace model repository with your model.
151
+ - Add the tag `vidore` to your model in the metadata of the model card and place the
152
+ `results.json` file at the root.
153
 
154
  And you're done! Your model will appear on the leaderboard when you click refresh! Once the space
155
  gets rebooted, it will appear on startup.
156
 
157
  Note: For proper hyperlink redirection, please ensure that your model repository name is in
158
  kebab-case, e.g. `my-model-name`.
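A minimal sketch of steps 2 and 3 with `huggingface_hub` (the repo id is a placeholder; adapt it to your own repository):

```python
# Minimal sketch, assuming a results.json produced by the evaluation script
# and write access to a public model repository. repo_id is a placeholder.
from huggingface_hub import HfApi, ModelCard

repo_id = "my-org/my-model-name"

# Place results.json at the repository root.
HfApi().upload_file(
    path_or_fileobj="results.json",
    path_in_repo="results.json",
    repo_id=repo_id,
)

# Add the `vidore` tag to the model card metadata.
card = ModelCard.load(repo_id)
tags = card.data.tags or []
if "vidore" not in tags:
    card.data.tags = tags + ["vidore"]
    card.push_to_hub(repo_id)
```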
159
  """
160
  )
161
162
  block.queue(max_size=10).launch(debug=True)
163
 
164
 
165
+ if __name__ == "__main__":
166
  main()
167
+
app/utils.py CHANGED
@@ -1,143 +1,31 @@
1
- import pandas as pd
2
- import math
3
4
 
5
- def make_clickable_model(model_name, link=None, is_pipeline=False, folder_name=None):
6
- if is_pipeline:
7
- # For pipelines: use folder_name for link, model_name (alias) for display
8
- link_folder = folder_name if folder_name else model_name
9
- # Process folder name for link: only handle __ and -thisisapoint-
10
- desanitized_folder = link_folder.replace("__", "/")
11
- desanitized_folder = desanitized_folder.replace("-thisisapoint-", ".")
12
- if link is None:
13
- link = f"https://github.com/illuin-tech/vidore-benchmark/tree/main/results/pipeline_descriptions/{desanitized_folder}/description.json"
14
- # Use word-wrap styling for potentially long pipeline aliases
15
- return f'<a target="_blank" style="text-decoration: underline; word-wrap: break-word; white-space: normal; display: inline-block; max-width: 450px;" href="{link}">{model_name}</a>'
16
- else:
17
- # For regular models: replace __ and _ with /, and -thisisapoint- with .
18
- desanitized_model_name = model_name.replace("__", "/")
19
- desanitized_model_name = desanitized_model_name.replace("_", "/")
20
- desanitized_model_name = desanitized_model_name.replace("-thisisapoint-", ".")
21
 
22
- if "/captioning" in desanitized_model_name:
23
- desanitized_model_name = desanitized_model_name.replace("/captioning", "")
24
- if "/ocr" in desanitized_model_name:
25
- desanitized_model_name = desanitized_model_name.replace("/ocr", "")
26
 
27
- if link is None:
28
- link = "https://huggingface.co/" + desanitized_model_name
29
 
30
- return f'<a target="_blank" style="text-decoration: underline" href="{link}">{desanitized_model_name}</a>'
31
 
32
-
33
- def add_rank(df, benchmark_version=1, selected_columns=None):
34
- # Convert numeric columns to proper float type (they may be 'object' dtype due to mixed data)
35
- for col in df.columns:
36
- if col not in ["Model", "Pipeline", "_folder_name"]:
37
- df[col] = pd.to_numeric(df[col], errors="coerce")
38
-
39
- # Only fill NaN for numeric columns to avoid issues with string columns like _folder_name
40
- numeric_cols = df.select_dtypes(include=["float64", "int64", "float32", "int32"]).columns
41
- df[numeric_cols] = df[numeric_cols].fillna(0.0)
42
-
43
- if selected_columns is None:
44
- cols_to_rank = [
45
- col
46
- for col in df.columns
47
- if col
48
- not in [
49
- "Model",
50
- "Pipeline", # For pipeline tables
51
- "Model Size (Million Parameters)",
52
- "Memory Usage (GB, fp32)",
53
- "Embedding Dimensions",
54
- "Max Tokens",
55
- "Compute Cost ($)",
56
- "Queries per Second",
57
- "_folder_name", # Hidden column for pipeline link generation
58
- ]
59
- ]
60
- else:
61
- cols_to_rank = selected_columns
62
-
63
- if len(cols_to_rank) == 1:
64
- df.sort_values(cols_to_rank[0], ascending=False, inplace=True)
65
- else:
66
- # Only add Average column if it doesn't already exist
67
- if "Average Score" not in df.columns:
68
- df.insert(len(df.columns) - len(cols_to_rank), "Average Score", df[cols_to_rank].mean(axis=1, skipna=False))
69
- df.sort_values("Average Score", ascending=False, inplace=True)
70
- df.insert(0, "Rank", list(range(1, len(df) + 1)))
71
- # multiply values by 100 if they are floats and round to 2 decimal places
72
- for col in df.columns:
73
- if pd.api.types.is_numeric_dtype(df[col]) and col not in [
74
- "Model Size (Million Parameters)",
75
- "Compute Cost ($)",
76
- "Queries per Second",
77
- "Indexing latency (s/doc)",
78
- "Search latency (s/query)",
79
- "Rank",
80
- ]:
81
- df[col] = df[col].apply(lambda x: round(x * 100, 3 - int(math.floor(math.log10(abs(x * 100)))) - 1))
82
- elif pd.api.types.is_numeric_dtype(df[col]) and col in ["Indexing latency (s/doc)", "Search latency (s/query)"]:
83
- df[col] = df[col].apply(lambda x: round(x, 3 - int(math.floor(math.log10(abs(x)))) - 1))
84
- return df
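The two rounding expressions above keep three significant figures; note that they would raise a ValueError on exact zeros (which the earlier fillna(0.0) can produce), since log10 is undefined at 0. A small standalone sketch of the same idea with a zero guard:

```python
import math

def round_sig(x: float, sig: int = 3) -> float:
    # Round x to `sig` significant figures, e.g. round_sig(81.234) -> 81.2.
    # Guard against x == 0, where log10 is undefined.
    if x == 0:
        return 0.0
    return round(x, sig - int(math.floor(math.log10(abs(x)))) - 1)
```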
85
-
86
-
87
- def add_rank_and_format(df, benchmark_version=1, selected_columns=None, is_pipeline=False):
88
  df = df.reset_index()
89
- column_name = "Pipeline" if is_pipeline else "Model"
90
- df = df.rename(columns={"index": column_name})
91
- df = add_rank(df, benchmark_version, selected_columns)
92
-
93
- if is_pipeline and "_folder_name" in df.columns:
94
- # For pipelines, use folder_name for link generation
95
- df[column_name] = df.apply(
96
- lambda row: make_clickable_model(row[column_name], is_pipeline=True, folder_name=row["_folder_name"]),
97
- axis=1,
98
- )
99
- df = df.drop(columns=["_folder_name"])
100
- else:
101
- df[column_name] = df[column_name].apply(lambda x: make_clickable_model(x, is_pipeline=is_pipeline))
102
- # df = remove_duplicates(df)
103
- return df
104
-
105
-
106
- def remove_duplicates(df):
107
- """Remove duplicate models based on their name (after the last '/' if present)."""
108
- df["model_name"] = df["Model"].str.replace("_", "/")
109
- df = df.sort_values("Rank").drop_duplicates(subset=["model_name"], keep="first")
110
- df = df.drop("model_name", axis=1)
111
  return df
112
 
113
-
114
- def get_refresh_function(model_handler, benchmark_version):
115
- def _refresh(metric):
116
- model_handler.get_vidore_data(metric)
117
- data_task_category = model_handler.render_df(metric, benchmark_version)
118
- df = add_rank_and_format(data_task_category, benchmark_version)
119
- return df
120
-
121
- return _refresh
122
-
123
-
124
- def deprecated_get_refresh_function(model_handler, benchmark_version):
125
- def _refresh(metric):
126
- model_handler.get_vidore_data(metric)
127
- data_task_category = model_handler.render_df(metric, benchmark_version)
128
- df = add_rank_and_format(data_task_category, benchmark_version)
129
- return df
130
-
131
- return _refresh
132
-
133
-
134
- def get_pipeline_refresh_function(pipeline_handler):
135
- """Refresh function for pipeline evaluation results."""
136
-
137
  def _refresh(metric):
138
- pipeline_handler.get_pipeline_data()
139
- data = pipeline_handler.render_df(metric)
140
- df = add_rank_and_format(data, benchmark_version=3, is_pipeline=True)
141
  return df
142
 
143
  return _refresh
@@ -145,7 +33,5 @@ def get_pipeline_refresh_function(pipeline_handler):
145
 
146
  def filter_models(data, search_term):
147
  if search_term:
148
- # Use "Pipeline" column for pipeline tables, "Model" for others
149
- col_name = "Pipeline" if "Pipeline" in data.columns else "Model"
150
- data = data[data[col_name].str.contains(search_term, case=False, na=False)]
151
- return data
 
1
+ from data.model_handler import ModelHandler
 
2
 
3
+ def make_clickable_model(model_name, link=None):
4
+ if link is None:
5
+ desanitized_model_name = model_name.replace("_", "/")
6
 
7
+ if '/captioning' in desanitized_model_name:
8
+ desanitized_model_name = desanitized_model_name.replace('/captioning', '')
9
+ if '/ocr' in desanitized_model_name:
10
+ desanitized_model_name = desanitized_model_name.replace('/ocr', '')
11
 
12
+ link = "https://huggingface.co/" + desanitized_model_name
13
 
14
+ return f'<a target="_blank" style="text-decoration: underline" href="{link}">{model_name}</a>'
 
15
 
 
16
 
17
+ def add_rank_and_format(df):
18
  df = df.reset_index()
19
+ df = df.rename(columns={"index": "Model"})
20
+ df = ModelHandler.add_rank(df)
21
+ df["Model"] = df["Model"].apply(make_clickable_model)
22
  return df
23
 
24
+ def get_refresh_function():
25
  def _refresh(metric):
26
+ model_handler = ModelHandler()
27
+ data_task_category = model_handler.get_vidore_data(metric)
28
+ df = add_rank_and_format(data_task_category)
29
  return df
30
 
31
  return _refresh
 
33
 
34
  def filter_models(data, search_term):
35
  if search_term:
36
+ data = data[data['Model'].str.contains(search_term, case=False, na=False)]
37
+ return data
 
 
data/dataset_handler.py CHANGED
@@ -1,22 +1,4 @@
1
- VIDORE_V1_MTEB_NAMES = [
2
- "VidoreArxivQARetrieval",
3
- "VidoreDocVQARetrieval",
4
- "VidoreInfoVQARetrieval",
5
- "VidoreShiftProjectRetrieval",
6
- "VidoreSyntheticDocQAAIRetrieval",
7
- "VidoreSyntheticDocQAEnergyRetrieval",
8
- "VidoreSyntheticDocQAGovernmentReportsRetrieval",
9
- "VidoreSyntheticDocQAHealthcareIndustryRetrieval",
10
- "VidoreTabfquadRetrieval",
11
- "VidoreTatdqaRetrieval",
12
- ]
13
- VIDORE_V2_MTEB_NAMES = [
14
- "Vidore2BioMedicalLecturesRetrieval",
15
- "Vidore2EconomicsReportsRetrieval",
16
- "Vidore2ESGReportsHLRetrieval",
17
- "Vidore2ESGReportsRetrieval",
18
- ]
19
- DEPRECATED_VIDORE_DATASETS_KEYWORDS = [
20
  "arxivqa",
21
  "docvqa",
22
  "infovqa",
@@ -29,53 +11,8 @@ DEPRECATED_VIDORE_DATASETS_KEYWORDS = [
29
  "healthcare_industry",
30
  ]
31
 
32
- DEPRECATED_VIDORE_2_DATASETS_KEYWORDS = [
33
- "restaurant_esg",
34
- "rse_restaurant",
35
- "mit_biomedical",
36
- "economics_macro",
37
- "biomedical_lectures",
38
- "esg_reports",
39
- "economics_reports",
40
- ]
41
-
42
 
43
  def get_datasets_nickname(dataset_name) -> str:
44
- if dataset_name == "VidoreArxivQARetrieval":
45
- return "ArxivQA"
46
- elif dataset_name == "VidoreDocVQARetrieval":
47
- return "DocVQA"
48
- elif dataset_name == "VidoreInfoVQARetrieval":
49
- return "InfoVQA"
50
- elif dataset_name == "VidoreTabfquadRetrieval":
51
- return "TabFQuad"
52
- elif dataset_name == "VidoreTatdqaRetrieval":
53
- return "TAT-DQA"
54
- elif dataset_name == "VidoreShiftProjectRetrieval":
55
- return "Shift Project"
56
- elif dataset_name == "VidoreSyntheticDocQAAIRetrieval":
57
- return "Artificial Intelligence"
58
- elif dataset_name == "VidoreSyntheticDocQAEnergyRetrieval":
59
- return "Energy"
60
- elif dataset_name == "VidoreSyntheticDocQAGovernmentReportsRetrieval":
61
- return "Government Reports"
62
- elif dataset_name == "VidoreSyntheticDocQAHealthcareIndustryRetrieval":
63
- return "Healthcare Industry"
64
-
65
- elif dataset_name == "Vidore2ESGReportsHLRetrieval":
66
- return "ESG Restaurant Human English"
67
- elif dataset_name == "Vidore2ESGReportsRetrieval":
68
- return "ESG Restaurant Synthetic Multilingual"
69
- elif dataset_name == "Vidore2BioMedicalLecturesRetrieval":
70
- return "MIT Biomedical Multilingual"
71
- elif dataset_name == "Vidore2EconomicsReportsRetrieval":
72
- return "Economics Macro Multilingual"
73
-
74
- else:
75
- raise ValueError(f"Dataset {dataset_name} not found in ViDoRe")
76
-
77
-
78
- def deprecated_get_datasets_nickname(dataset_name) -> str:
79
  if "arxivqa" in dataset_name:
80
  return "ArxivQA"
81
 
@@ -104,32 +41,5 @@ def deprecated_get_datasets_nickname(dataset_name) -> str:
104
  elif "healthcare_industry" in dataset_name:
105
  return "Healthcare Industry"
106
 
107
- elif ("restaurant_esg" in dataset_name) or ("esg_reports_human" in dataset_name):
108
- return "ESG Restaurant Human"
109
-
110
- elif ("rse_restaurant" in dataset_name and "multilingual" in dataset_name) or (
111
- "esg_reports" in dataset_name and "_eng_" not in dataset_name
112
- ):
113
- return "ESG Restaurant Synthetic Multilingual"
114
-
115
- elif ("rse_restaurant" in dataset_name) or ("esg_reports" in dataset_name and "_eng_" in dataset_name):
116
- return "ESG Restaurant Synthetic"
117
-
118
- elif ("mit_biomedical" in dataset_name and "multilingual" in dataset_name) or (
119
- "biomedical_lectures" in dataset_name and "_eng_" not in dataset_name
120
- ):
121
- return "MIT Biomedical Multilingual"
122
-
123
- elif ("mit_biomedical" in dataset_name) or ("biomedical_lectures" in dataset_name and "_eng_" in dataset_name):
124
- return "MIT Biomedical"
125
-
126
- elif ("economics_macro" in dataset_name and "multilingual" in dataset_name) or (
127
- "economics_reports" in dataset_name and "_eng_" not in dataset_name
128
- ):
129
- return "Economics Macro Multilingual"
130
-
131
- elif ("economics_macro" in dataset_name) or ("economics_reports" in dataset_name and "_eng_" in dataset_name):
132
- return "Economics Macro"
133
-
134
  else:
135
  raise ValueError(f"Dataset {dataset_name} not found in ViDoRe")
 
1
+ VIDORE_DATASETS_KEYWORDS = [
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  "arxivqa",
3
  "docvqa",
4
  "infovqa",
 
11
  "healthcare_industry",
12
  ]
13
 
 
 
 
 
 
 
 
 
 
 
14
 
15
  def get_datasets_nickname(dataset_name) -> str:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  if "arxivqa" in dataset_name:
17
  return "ArxivQA"
18
 
 
41
  elif "healthcare_industry" in dataset_name:
42
  return "Healthcare Industry"
43
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
  else:
45
  raise ValueError(f"Dataset {dataset_name} not found in ViDoRe")
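In the restored version above, `get_datasets_nickname` resolves nicknames by substring matching, so the first keyword that occurs in a dataset name wins. A minimal table-driven sketch of the same behavior, for readers skimming the revert (the mapping is truncated and illustrative, not part of the repository):

NICKNAMES = {
    "arxivqa": "ArxivQA",
    "docvqa": "DocVQA",
    "infovqa": "InfoVQA",
    "healthcare_industry": "Healthcare Industry",
}

def get_datasets_nickname_sketch(dataset_name: str) -> str:
    # Dict insertion order is preserved, so iteration mirrors the elif chain.
    for keyword, nickname in NICKNAMES.items():
        if keyword in dataset_name:
            return nickname
    raise ValueError(f"Dataset {dataset_name} not found in ViDoRe")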
data/deprecated_model_handler.py DELETED
@@ -1,128 +0,0 @@
- import json
- import os
- from typing import Any, Dict
-
- import pandas as pd
- from huggingface_hub import HfApi, hf_hub_download, metadata_load
-
- from .dataset_handler import (
-     DEPRECATED_VIDORE_2_DATASETS_KEYWORDS,
-     DEPRECATED_VIDORE_DATASETS_KEYWORDS,
-     deprecated_get_datasets_nickname,
- )
-
- BLOCKLIST = ["impactframes"]
-
-
- class DeprecatedModelHandler:
-     def __init__(self, model_infos_path="model_infos.json"):
-         self.api = HfApi()
-         self.model_infos_path = model_infos_path
-         self.model_infos = self._load_model_infos()
-
-     def _load_model_infos(self) -> Dict:
-         if os.path.exists(self.model_infos_path):
-             with open(self.model_infos_path) as f:
-                 return json.load(f)
-         return {}
-
-     def _save_model_infos(self):
-         with open(self.model_infos_path, "w") as f:
-             json.dump(self.model_infos, f)
-
-     def _are_results_in_new_vidore_format(self, results: Dict[str, Any]) -> bool:
-         return "metadata" in results and "metrics" in results
-
-     def _is_baseline_repo(self, repo_id: str) -> bool:
-         return repo_id == "vidore/baseline-results"
-
-     def sanitize_model_name(self, model_name):
-         return model_name.replace("/", "_").replace(".", "-thisisapoint-")
-
-     def fuze_model_infos(self, model_name, results):
-         for dataset, metrics in results.items():
-             if dataset not in self.model_infos[model_name]["results"].keys():
-                 self.model_infos[model_name]["results"][dataset] = metrics
-             else:
-                 continue
-
-     def get_vidore_data(self, metric="ndcg_at_5"):
-         models = self.api.list_models(filter="vidore")
-         repositories = [model.modelId for model in models]  # type: ignore
-
-         # Sort repositories to process non-baseline repos first (to prioritize their results)
-         repositories.sort(key=lambda x: self._is_baseline_repo(x))
-
-         for repo_id in repositories:
-             org_name = repo_id.split("/")[0]
-             if org_name in BLOCKLIST:
-                 continue
-             files = [f for f in self.api.list_repo_files(repo_id) if f.endswith("_metrics.json") or f == "results.json"]
-
-             if len(files) == 0:
-                 continue
-             else:
-                 for file in files:
-                     if file.endswith("results.json"):
-                         model_name = repo_id.replace("/", "_").replace(".", "-thisisapoint-")
-                     else:
-                         model_name = file.split("_metrics.json")[0]
-                         model_name = model_name.replace("/", "_").replace(".", "-thisisapoint-")
-
-                     # Skip if the model is from baseline and we already have results
-
-                     readme_path = hf_hub_download(repo_id, filename="README.md")
-                     meta = metadata_load(readme_path)
-                     try:
-                         result_path = hf_hub_download(repo_id, filename=file)
-
-                         with open(result_path) as f:
-                             results = json.load(f)
-
-                         if self._are_results_in_new_vidore_format(results):
-                             metadata = results["metadata"]
-                             results = results["metrics"]
-
-                         # Handles the case where the model is both in baseline and outside of it
-                         # (prioritizes the non-baseline results)
-                         if self._is_baseline_repo(repo_id) and self.sanitize_model_name(model_name) in self.model_infos:
-                             self.fuze_model_infos(model_name, results)
-
-                         self.model_infos[model_name] = {"meta": meta, "results": results}
-                     except Exception as e:
-                         print(f"Error loading {model_name} - {e}")
-                         continue
-
-     # In order to keep only models relevant to a benchmark
-     def filter_models_by_benchmark(self, benchmark_version=1):
-         filtered_model_infos = {}
-         keywords = DEPRECATED_VIDORE_DATASETS_KEYWORDS if benchmark_version == 1 else DEPRECATED_VIDORE_2_DATASETS_KEYWORDS
-
-         for model, info in self.model_infos.items():
-             results = info["results"]
-             if any(any(keyword in dataset for keyword in keywords) for dataset in results.keys()):
-                 filtered_model_infos[model] = info
-
-         return filtered_model_infos
-
-     # Compute the average of a metric for each model
-     def render_df(self, metric="ndcg_at_5", benchmark_version=1):
-         model_res = {}
-         filtered_model_infos = self.filter_models_by_benchmark(benchmark_version)
-         if len(filtered_model_infos) > 0:
-             for model in filtered_model_infos.keys():
-                 res = filtered_model_infos[model]["results"]
-                 dataset_res = {}
-                 keywords = DEPRECATED_VIDORE_DATASETS_KEYWORDS if benchmark_version == 1 else DEPRECATED_VIDORE_2_DATASETS_KEYWORDS
-                 for dataset in res.keys():
-                     if not any(keyword in dataset for keyword in keywords):
-                         continue
-
-                     dataset_nickname = deprecated_get_datasets_nickname(dataset)
-                     dataset_res[dataset_nickname] = res[dataset][metric]
-                 model_res[model] = dataset_res
-
-             df = pd.DataFrame(model_res).T
-
-             return df
-         return pd.DataFrame()
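Two details of the deleted loader are worth noting. First, `sanitize_model_name` maps `/` to `_` and `.` to the sentinel `-thisisapoint-`; dots are recoverable, slashes are not. A hypothetical inverse, purely for illustration (`desanitize_model_name` does not exist in the repository):

def desanitize_model_name(sanitized: str) -> str:
    # Restore dots from the sentinel; the "/" -> "_" step is lossy,
    # so the org/model separator stays as "_".
    return sanitized.replace("-thisisapoint-", ".")

assert desanitize_model_name("vidore_colpali-v1-thisisapoint-2") == "vidore_colpali-v1.2"

Second, the baseline-merging path appears ineffective as written: after `fuze_model_infos` copies missing datasets into an existing entry, the unconditional `self.model_infos[model_name] = {"meta": meta, "results": results}` that follows replaces that entry with the baseline results anyway.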
data/model_handler.py CHANGED
@@ -1,98 +1,117 @@
  import json
  import os
+ from typing import Any, Dict

  import pandas as pd
- from git import Repo
+ from huggingface_hub import HfApi, hf_hub_download, metadata_load

- from .dataset_handler import VIDORE_V1_MTEB_NAMES, VIDORE_V2_MTEB_NAMES, get_datasets_nickname
+ from .dataset_handler import VIDORE_DATASETS_KEYWORDS, get_datasets_nickname
+
+ BLOCKLIST = ["impactframes"]


  class ModelHandler:
-     def __init__(self):
-         self.model_infos = {}
+     def __init__(self, model_infos_path="model_infos.json"):
+         self.api = HfApi()
+         self.model_infos_path = model_infos_path
+         self.model_infos = self._load_model_infos()

-     @staticmethod
-     def get_folders(dir_path):
-         return sorted([
-             path_
-             for path_ in os.listdir(dir_path)
-             if os.path.isdir(os.path.join(dir_path, path_))
-         ])
+     def _load_model_infos(self) -> Dict:
+         if os.path.exists(self.model_infos_path):
+             with open(self.model_infos_path) as f:
+                 return json.load(f)
+         return {}
+
+     def _save_model_infos(self):
+         with open(self.model_infos_path, "w") as f:
+             json.dump(self.model_infos, f)
+
+     def _are_results_in_new_vidore_format(self, results: Dict[str, Any]) -> bool:
+         return "metadata" in results and "metrics" in results

      def get_vidore_data(self, metric="ndcg_at_5"):
-         repo_url = "https://github.com/embeddings-benchmark/results.git"
-         local_path = "./results"
-         folder_of_interest = "results"
-
-         if os.path.exists(local_path):
-             repo = Repo(local_path)
-             origin = repo.remotes.origin
-             origin.pull()
-         else:
-             Repo.clone_from(repo_url, local_path, depth=1)
-
-         model_names = self.get_folders(os.path.join(local_path, folder_of_interest))
-         for model_name in model_names:
-             revisions = self.get_folders(os.path.join(local_path, folder_of_interest, model_name))
-             first_revision = revisions[0]
-             result_filenames = [
-                 result_filename
-                 for result_filename in os.listdir(os.path.join(local_path, folder_of_interest, model_name, first_revision))
-                 # if result_filename.endswith(".json") and result_filename != "model_meta.json"
-             ]
-             if "model_meta.json" in result_filenames:
-                 with open(os.path.join(local_path, folder_of_interest, model_name, first_revision, "model_meta.json"), "r") as f:
-                     meta = json.load(f)
+         models = self.api.list_models(filter="vidore")
+         repositories = [model.modelId for model in models]  # type: ignore
+
+         for repo_id in repositories:
+             org_name = repo_id.split("/")[0]
+             if org_name in BLOCKLIST:
+                 continue
+
+             files = [f for f in self.api.list_repo_files(repo_id) if f.endswith("_metrics.json") or f == "results.json"]
+
+             if len(files) == 0:
+                 continue
              else:
-                 meta = {}
-             results = {}
-             if all(f"{v1_dataset_name}.json" in result_filenames for v1_dataset_name in VIDORE_V1_MTEB_NAMES):
-                 for v1_dataset_name in VIDORE_V1_MTEB_NAMES:
-                     with open(os.path.join(local_path, folder_of_interest, model_name, first_revision, f"{v1_dataset_name}.json"), "r") as f:
-                         results[v1_dataset_name] = json.load(f)
-             if all(f"{v2_dataset_name}.json" in result_filenames for v2_dataset_name in VIDORE_V2_MTEB_NAMES):
-                 for v2_dataset_name in VIDORE_V2_MTEB_NAMES:
-                     with open(os.path.join(local_path, folder_of_interest, model_name, first_revision, f"{v2_dataset_name}.json"), "r") as f:
-                         results[v2_dataset_name] = json.load(f)
-             if model_name not in self.model_infos:
-                 self.model_infos[model_name] = {}
-             self.model_infos[model_name] = {"meta": meta, "results": results}
-
-     def filter_models_by_benchmark(self, benchmark_version=1):
-         filtered_model_infos = {}
-         keywords = VIDORE_V1_MTEB_NAMES if benchmark_version == 1 else VIDORE_V2_MTEB_NAMES
-
-         for model, info in self.model_infos.items():
-             results = info["results"]
-             if any(any(keyword in dataset for keyword in keywords) for dataset in results.keys()):
-                 filtered_model_infos[model] = info
-
-         return filtered_model_infos
-
-     def render_df(self, metric="ndcg_at_5", benchmark_version=1):
+                 for file in files:
+                     if file.endswith("results.json"):
+                         model_name = repo_id.replace("/", "_")
+                     else:
+                         model_name = file.split("_metrics.json")[0]
+
+                     if model_name not in self.model_infos:
+                         readme_path = hf_hub_download(repo_id, filename="README.md")
+                         meta = metadata_load(readme_path)
+                         try:
+                             result_path = hf_hub_download(repo_id, filename=file)
+
+                             with open(result_path) as f:
+                                 results = json.load(f)
+
+                             if self._are_results_in_new_vidore_format(results):
+                                 metadata = results["metadata"]
+                                 results = results["metrics"]
+
+                             self.model_infos[model_name] = {"meta": meta, "results": results}
+                         except Exception as e:
+                             print(f"Error loading {model_name} - {e}")
+                             continue
+
+         # self._save_model_infos()
+
          model_res = {}
-         filtered_model_infos = self.filter_models_by_benchmark(benchmark_version)
-         if len(filtered_model_infos) > 0:
-             for model in filtered_model_infos.keys():
-                 res = filtered_model_infos[model]["results"]
+         if len(self.model_infos) > 0:
+             for model in self.model_infos.keys():
+                 res = self.model_infos[model]["results"]
                  dataset_res = {}
-                 keywords = VIDORE_V1_MTEB_NAMES if benchmark_version == 1 else VIDORE_V2_MTEB_NAMES
-                 if "n_parameters" in filtered_model_infos[model]["meta"]:
-                     try:
-                         dataset_res["Model Size (Million Parameters)"] = filtered_model_infos[model]["meta"]["n_parameters"] // 1_000_000
-                     except TypeError:
-                         dataset_res["Model Size (Million Parameters)"] = -1
-                 else:
-                     dataset_res["Model Size (Million Parameters)"] = -1
                  for dataset in res.keys():
-                     if not any(keyword in dataset for keyword in keywords):
+                     # For each keyword, check whether it is in the dataset name; if not, skip it
+                     if not any(keyword in dataset for keyword in VIDORE_DATASETS_KEYWORDS):
+                         print(f"{dataset} not found in ViDoRe datasets. Skipping ...")
                          continue
+
                      dataset_nickname = get_datasets_nickname(dataset)
-                     dataset_res[dataset_nickname] = res[dataset]["scores"]["test"][0][metric]
+                     dataset_res[dataset_nickname] = res[dataset][metric]
                  model_res[model] = dataset_res

              df = pd.DataFrame(model_res).T

              return df
          return pd.DataFrame()
+
+     @staticmethod
+     def add_rank(df):
+         df.fillna(0.0, inplace=True)
+         cols_to_rank = [
+             col
+             for col in df.columns
+             if col
+             not in [
+                 "Model",
+                 "Model Size (Million Parameters)",
+                 "Memory Usage (GB, fp32)",
+                 "Embedding Dimensions",
+                 "Max Tokens",
+             ]
+         ]
+         if len(cols_to_rank) == 1:
+             df.sort_values(cols_to_rank[0], ascending=False, inplace=True)
+         else:
+             df.insert(len(df.columns) - len(cols_to_rank), "Average", df[cols_to_rank].mean(axis=1, skipna=False))
+             df.sort_values("Average", ascending=False, inplace=True)
+         df.insert(0, "Rank", list(range(1, len(df) + 1)))
+         # multiply values by 100 if they are floats and round to 1 decimal place
+         for col in df.columns:
+             if df[col].dtype == "float64":
+                 df[col] = df[col].apply(lambda x: round(x * 100, 1))
+         return df
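The restored `add_rank` averages the score columns, sorts descending, prepends a Rank column, and scales floats to percentages. A toy run of the same steps, inlined for illustration (the data is invented):

import pandas as pd

df = pd.DataFrame(
    {"Model": ["a", "b"], "DocVQA": [0.512, 0.438], "InfoVQA": [0.700, 0.651]}
).set_index("Model")

score_cols = ["DocVQA", "InfoVQA"]
df.insert(0, "Average", df[score_cols].mean(axis=1, skipna=False))  # row-wise mean
df.sort_values("Average", ascending=False, inplace=True)            # best model first
df.insert(0, "Rank", list(range(1, len(df) + 1)))                   # 1-based ranks
df[["Average"] + score_cols] = (df[["Average"] + score_cols] * 100).round(1)
print(df)  # model "a" ranks first with Average 60.6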
data/pipeline_handler.py DELETED
@@ -1,274 +0,0 @@
- import os
- import requests
- from typing import Dict, List, Optional
-
- import pandas as pd
-
-
- class PipelineHandler:
-     """Handler for ViDoRe v3 pipeline evaluation results from GitHub."""
-
-     def __init__(self):
-         self.pipeline_infos = {}
-         self.pipeline_aliases = {}  # Maps folder_name -> pipeline_alias for display
-         self.github_base_url = "https://raw.githubusercontent.com/illuin-tech/vidore-benchmark/main/results/metrics"
-         self.github_descriptions_base_url = (
-             "https://raw.githubusercontent.com/illuin-tech/vidore-benchmark/main/results/pipeline_descriptions"
-         )
-         self.available_datasets = []
-         self.available_languages = ["english"]  # Default languages available
-
-         # Setup GitHub authentication if token is available
-         self.github_token = os.environ.get("GITHUB_TOKEN")
-         self.headers = {}
-         if self.github_token:
-             self.headers["Authorization"] = f"token {self.github_token}"
-             print("GitHub token detected - using authenticated requests")
-
-     def get_pipeline_folders_from_github(self) -> List[str]:
-         """Get list of pipeline folders from GitHub API."""
-         api_url = "https://api.github.com/repos/illuin-tech/vidore-benchmark/contents/results/metrics"
-
-         try:
-             response = requests.get(api_url, headers=self.headers)
-             response.raise_for_status()
-             contents = response.json()
-
-             # Filter for directories only
-             folders = [item["name"] for item in contents if item["type"] == "dir"]
-             return sorted(folders)
-         except Exception as e:
-             print(f"Error fetching pipeline folders from GitHub: {e}")
-             return []
-
-     def get_dataset_files_from_github(self, pipeline_name: str) -> List[str]:
-         """Get list of dataset JSON files for a specific pipeline from GitHub API."""
-         api_url = f"https://api.github.com/repos/illuin-tech/vidore-benchmark/contents/results/metrics/{pipeline_name}"
-
-         try:
-             response = requests.get(api_url, headers=self.headers)
-             response.raise_for_status()
-             contents = response.json()
-
-             # Filter for JSON files that start with 'vidore_v3'
-             files = [
-                 item["name"]
-                 for item in contents
-                 if item["type"] == "file" and item["name"].startswith("vidore_v3") and item["name"].endswith(".json")
-             ]
-             return sorted(files)
-         except Exception as e:
-             print(f"Error fetching dataset files from {pipeline_name}: {e}")
-             return []
-
-     def fetch_json_from_github(self, pipeline_name: str, filename: str) -> Optional[Dict]:
-         """Fetch a JSON file from GitHub raw content."""
-         url = f"{self.github_base_url}/{pipeline_name}/{filename}"
-
-         try:
-             response = requests.get(url, headers=self.headers)
-             response.raise_for_status()
-             return response.json()
-         except Exception as e:
-             print(f"Error fetching {filename} from {pipeline_name}: {e}")
-             return None
-
-     def fetch_pipeline_alias(self, pipeline_name: str) -> Optional[str]:
-         """Fetch the pipeline_alias from description.json for a pipeline.
-
-         Uses raw.githubusercontent.com to avoid API rate limits.
-         """
-         url = f"{self.github_descriptions_base_url}/{pipeline_name}/description.json"
-
-         try:
-             response = requests.get(url, headers=self.headers)
-             response.raise_for_status()
-             description = response.json()
-             return description.get("pipeline_alias")
-         except Exception as e:
-             print(f"Error fetching description for {pipeline_name}: {e}")
-             return None
-
-     def get_pipeline_data(self):
-         """Fetch all pipeline data from GitHub."""
-         pipeline_folders = self.get_pipeline_folders_from_github()
-         datasets_set = set()
-         languages_set = set(["overall"])
-
-         for pipeline_name in pipeline_folders:
-             # Get all dataset files for this pipeline
-             dataset_files = self.get_dataset_files_from_github(pipeline_name)
-
-             if not dataset_files:
-                 continue
-
-             pipeline_data = {}
-             for filename in dataset_files:
-                 results = self.fetch_json_from_github(pipeline_name, filename)
-                 if results:
-                     # Extract dataset name from filename (remove vidore_v3_ prefix and .json suffix)
-                     dataset_name = filename.replace("vidore_v3_", "").replace(".json", "")
-                     datasets_set.add(dataset_name)
-                     pipeline_data[dataset_name] = results
-
-                     # Collect available languages
-                     if "aggregated_metrics" in results and "by_language" in results["aggregated_metrics"]:
-                         languages_set.update(results["aggregated_metrics"]["by_language"].keys())
-
-             if pipeline_data:
-                 self.pipeline_infos[pipeline_name] = pipeline_data
-                 # Fetch the pipeline alias for display (uses raw URL, not API)
-                 alias = self.fetch_pipeline_alias(pipeline_name)
-                 if alias:
-                     self.pipeline_aliases[pipeline_name] = alias
-
-         self.available_datasets = sorted(list(datasets_set))
-         self.available_languages = sorted(list(languages_set))
-
-     def calculate_cost_metric(self, pipeline_datasets: Dict) -> float:
-         """
-         Calculate a compute cost metric based on retrieval time across all datasets.
-         Returns cost in arbitrary units (could be refined based on actual compute costs).
-         """
-         total_time_s = 0
-
-         for dataset_name, dataset_data in pipeline_datasets.items():
-             if "aggregated_metrics" not in dataset_data:
-                 continue
-
-             timing = dataset_data["aggregated_metrics"].get("timing", {})
-             total_time_ms = timing.get("total_retrieval_time_milliseconds", 0)
-             total_time_s += total_time_ms / 1000.0
-
-         # Simple cost model: assume $0.01 per second of compute (adjustable)
-         cost = total_time_s * 0.01
-
-         return round(cost, 4)
-
-     def extract_dataset_metrics(
-         self, pipeline_datasets: Dict, metric: str = "ndcg_cut_5", language: str = "english"
-     ) -> Dict[str, float]:
-         """
-         Extract metrics for individual datasets from the aggregated results.
-
-         Args:
-             pipeline_datasets: Dictionary mapping dataset names to their data
-             metric: The metric to extract (e.g., 'ndcg_at_5')
-             language: The language to filter by ('english' for English-only results, or specific language)
-
-         Returns:
-             Dictionary mapping dataset names to metric values
-         """
-         # Map metric names from UI format to API format
-         metric_mapping = {
-             "ndcg_at_1": "ndcg_cut_5",  # Using cut_5 as approximation
-             "ndcg_at_5": "ndcg_cut_5",
-             "ndcg_at_10": "ndcg_cut_10",
-             "ndcg_at_100": "ndcg_cut_100",
-             "recall_at_1": "recall_5",
-             "recall_at_5": "recall_5",
-             "recall_at_10": "recall_10",
-             "recall_at_100": "recall_100",
-         }
-
-         actual_metric = metric_mapping.get(metric, metric)
-         dataset_metrics = {}
-
-         for dataset_name, dataset_data in pipeline_datasets.items():
-             if "aggregated_metrics" not in dataset_data:
-                 continue
-
-             aggregated = dataset_data["aggregated_metrics"]
-
-             # Get metrics for the specified language
-             if language == "overall":
-                 metrics_data = aggregated.get("overall", {})
-             else:
-                 metrics_data = aggregated.get("by_language", {}).get(language, {})
-
-             if metrics_data:
-                 # Format dataset name for display
-                 display_name = dataset_name.replace("_", " ").title()
-                 dataset_metrics[display_name] = metrics_data.get(actual_metric, 0.0)
-
-         return dataset_metrics
-
-     def render_df(self, metric: str = "ndcg_at_5", language: str = "overall") -> pd.DataFrame:
-         """
-         Render a DataFrame with pipeline results.
-
-         Args:
-             metric: The metric to display (e.g., 'ndcg_at_5')
-             language: The language to filter by ('overall' for all languages, or specific language)
-
-         Returns:
-             DataFrame with columns: Pipeline Name, Compute Cost, Timing metrics, Dataset metrics
-         """
-         pipeline_res = {}
-
-         for pipeline_name, pipeline_datasets in self.pipeline_infos.items():
-             row_data = {}
-
-             # Aggregate time metrics across all datasets
-             total_time_ms = 0
-             total_queries = 0
-             indexing_time_ms = 0
-             search_time_ms = 0
-             num_datasets = 0
-
-             for dataset_name, dataset_data in pipeline_datasets.items():
-                 if "aggregated_metrics" in dataset_data:
-                     timing = dataset_data["aggregated_metrics"].get("timing", {})
-                     total_time_ms += timing.get("total_retrieval_time_milliseconds", 0)
-                     total_queries += timing.get("num_queries", 0)
-                     indexing_time_ms += timing.get("indexing_throughput_ms_per_doc", 0)
-                     search_time_ms += timing.get("search_throughput_ms_per_query", 0)
-                     num_datasets += 1
-
-             if total_queries > 0:
-                 if total_time_ms > 0:
-                     # row_data["Total retrieval time (s)"] = round(
-                     #     (total_time_ms / 1000), 0
-                     # )
-                     row_data["Indexing latency (s/doc)"] = (
-                         (indexing_time_ms / 1000) / num_datasets if indexing_time_ms > 0 else None
-                     )
-                     row_data["Search latency (s/query)"] = (
-                         (search_time_ms / 1000) / num_datasets if search_time_ms > 0 else None
-                     )
-                 else:
-                     # row_data["Total retrieval time (s)"] = 0
-                     row_data["Indexing latency (s/doc)"] = 0
-                     row_data["Search latency (s/query)"] = 0
-             else:
-                 # row_data["Total retrieval time (s)"] = -1
-                 row_data["Indexing latency (s/doc)"] = -1
-                 row_data["Search latency (s/query)"] = -1
-
-             # Add dataset metrics
-             dataset_metrics = self.extract_dataset_metrics(pipeline_datasets, metric, language)
-             row_data.update(dataset_metrics)
-
-             # Calculate average across datasets if there are multiple
-             if dataset_metrics:
-                 row_data["Average Score"] = round(sum(dataset_metrics.values()) / len(dataset_metrics), 4)
-
-             # Use pipeline_alias for display if available, otherwise fallback to folder name
-             display_name = self.pipeline_aliases.get(pipeline_name, pipeline_name)
-             # Store folder name for link generation (will be used in utils.py)
-             row_data["_folder_name"] = pipeline_name
-             pipeline_res[display_name] = row_data
-
-         if pipeline_res:
-             df = pd.DataFrame(pipeline_res).T
-             # Reorder columns to have Average right after timing metrics
-             cols = list(df.columns)
-             if "Average Score" in cols:
-                 cols.remove("Average Score")
-                 # Insert Average after Search latency (s/query)
-                 insert_pos = cols.index("Search latency (s/query)") + 1 if "Search latency (s/query)" in cols else 2
-                 cols.insert(insert_pos, "Average Score")
-                 df = df[cols]
-             return df
-
-         return pd.DataFrame()
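The deleted `calculate_cost_metric` priced a pipeline at a flat $0.01 per second of total retrieval time summed across datasets; note also that the deleted `metric_mapping` approximated the UI's ndcg_at_1 and recall_at_1 with their @5 counterparts. A worked instance of the cost arithmetic (timing figures invented for illustration):

# Two datasets report 90_000 ms and 150_000 ms of total retrieval time:
total_time_s = (90_000 + 150_000) / 1000.0  # 240.0 seconds overall
cost = round(total_time_s * 0.01, 4)        # 2.4, in the method's arbitrary $ units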
requirements.txt DELETED
@@ -1,6 +0,0 @@
- gitpython
- gradio
- pydub
- huggingface_hub < 1.0.0
- plotly
- pandas