QuentinJG committed on
Commit
342cf1f
·
1 Parent(s): 06e780e

Pipeline leaderboard (#17)

Browse files

- pipeline leaderboard (0538054cb956e3c5f1624236f9d1b74d337f2449)

Files changed (3) hide show
  1. app.py +385 -38
  2. app/utils.py +98 -36
  3. data/pipeline_handler.py +274 -0
app.py CHANGED
@@ -1,9 +1,19 @@
1
- import gradio as gr
2
 
3
- from app.utils import (add_rank_and_format, deprecated_get_refresh_function,
4
- filter_models, get_refresh_function)
 
 
 
 
 
 
 
 
 
5
  from data.deprecated_model_handler import DeprecatedModelHandler
6
  from data.model_handler import ModelHandler
 
7
 
8
  METRICS = [
9
  "ndcg_at_1",
@@ -22,7 +32,6 @@ def main():
22
  model_handler = ModelHandler()
23
  initial_metric = "ndcg_at_5"
24
 
25
- model_handler.get_vidore_data(initial_metric)
26
  data_benchmark_1 = model_handler.render_df(initial_metric, benchmark_version=1)
27
  data_benchmark_1 = add_rank_and_format(data_benchmark_1, benchmark_version=1)
28
 
@@ -37,9 +46,19 @@ def main():
37
  num_scores_2 = len(data_benchmark_2) * num_datasets_2
38
  num_models_2 = len(data_benchmark_2)
39
 
40
- # Get deprecated results
41
  deprecated_model_handler = DeprecatedModelHandler()
42
  initial_metric = "ndcg_at_5"
 
 
 
 
 
 
 
 
 
 
 
43
 
44
  deprecated_model_handler.get_vidore_data(initial_metric)
45
  deprecated_data_benchmark_1 = deprecated_model_handler.render_df(initial_metric, benchmark_version=1)
@@ -83,11 +102,35 @@ def main():
83
  border-left: 4px solid #2196f3;
84
  padding: 5px 15px;
85
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
  """
87
 
88
  with gr.Blocks(css=css) as block:
89
- with gr.Tabs():
90
- with gr.TabItem("ViDoRe V3"):
91
  gr.Markdown("# ViDoRe V3: a comprehensive evaluation of retrieval for enterprise use-case 👷‍♂️")
92
 
93
  gr.Markdown(
@@ -104,14 +147,14 @@ def main():
104
  gr.Markdown("""
105
  As the reference results are now hosted on the [MTEB Leaderboard](https://mteb-leaderboard.hf.space/?benchmark_name=ViDoRe(v3)),
106
  we embed it here.
107
- """ )
108
 
109
  gr.Markdown(
110
- """**💡 To display English-only results:**
111
  - Under *Customize this Benchmark*, unselect the French datasets (*Vidore3EnergyRetrieval*, *Vidore3FinanceFrRetrieval*, *Vidore3PhysicsRetrieval*),
112
  - Go to the *Performance per language* tab (you might have to click on the three dots on the right of the tab bar to see it),
113
  - The *eng-Latn* column will show English-only results (= English queries on English documents).""",
114
- elem_classes="alert-info"
115
  )
116
 
117
  gr.HTML(
@@ -122,7 +165,242 @@ def main():
122
  ></iframe>
123
  """
124
  )
125
- with gr.TabItem("ViDoRe V2"):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
126
  gr.Markdown("# ViDoRe V2: A new visual Document Retrieval Benchmark 📚🔍")
127
  gr.Markdown("### A harder dataset benchmark for visual document retrieval 👀")
128
 
@@ -156,7 +434,9 @@ def main():
156
  data = filter_models(data, search_term)
157
  # data = remove_duplicates(data) # Add this line
158
  if selected_columns:
159
- data = data[["Rank", "Model", "Model Size (Million Parameters)", "Average"] + selected_columns]
 
 
160
  return data
161
 
162
  with gr.Row():
@@ -225,7 +505,7 @@ def main():
225
  ```
226
  """
227
  )
228
- with gr.TabItem("ViDoRe V1"):
229
  gr.Markdown("# ViDoRe: The Visual Document Retrieval Benchmark 1 📚🔍")
230
  gr.Markdown("### From the paper - ColPali: Efficient Document Retrieval with Vision Language Models 👀")
231
 
@@ -258,7 +538,9 @@ def main():
258
  data = add_rank_and_format(data, benchmark_version=1, selected_columns=selected_columns)
259
  data = filter_models(data, search_term)
260
  if selected_columns:
261
- data = data[["Rank", "Model", "Model Size (Million Parameters)", "Average"] + selected_columns]
 
 
262
  return data
263
 
264
  with gr.Row():
@@ -319,11 +601,12 @@ def main():
319
  ```
320
  """
321
  )
322
- with gr.TabItem("📚 Submit your model"):
323
  gr.Markdown("# How to Submit a New Model to the Leaderboard")
324
  gr.Markdown(
325
  """
326
- To submit a new model to the ViDoRe leaderboard, follow these steps:
 
327
 
328
  1. **Evaluate your model**:
329
  - Follow the evaluation procedure provided in the [ViDoRe GitHub repository](https://github.com/illuin-tech/vidore-benchmark/) that uses MTEB.
@@ -336,9 +619,20 @@ def main():
336
 
337
  Note: For proper hyperlink redirection, please ensure that your model repository name is in
338
  kebab-case, e.g. `my-model-name`.
 
 
 
 
 
 
 
 
 
 
 
339
  """
340
  )
341
- with gr.TabItem("Deprecated ViDoRe V1"):
342
  gr.Markdown(
343
  "## <span style='color:red'>Deprecation notice: This leaderboard contains the results computed with the "
344
  "[vidore-benchmark](https://github.com/illuin-tech/vidore-benchmark) package, "
@@ -346,8 +640,12 @@ def main():
346
  "[mteb](https://github.com/embeddings-benchmark/mteb) package as described "
347
  "[here](https://github.com/illuin-tech/vidore-benchmark/blob/main/README.md).</span>"
348
  )
349
- gr.Markdown("## <span style='color:red'>Missing results in the new leaderboard are being added as they are re-computed.</span>")
350
- gr.Markdown("# <span style='color:red'>[Deprecated]</span> ViDoRe: The Visual Document Retrieval Benchmark 1 📚🔍")
 
 
 
 
351
  gr.Markdown("### From the paper - ColPali: Efficient Document Retrieval with Vision Language Models 👀")
352
 
353
  gr.Markdown(
@@ -360,18 +658,24 @@ def main():
360
  deprecated_datasets_columns_1 = list(deprecated_data_benchmark_1.columns[3:])
361
 
362
  with gr.Row():
363
- deprecated_metric_dropdown_1 = gr.Dropdown(choices=METRICS, value=initial_metric, label="Select Metric")
 
 
364
  deprecated_research_textbox_1 = gr.Textbox(
365
  placeholder="🔍 Search Models... [press enter]",
366
  label="Filter Models by Name",
367
  )
368
  deprecated_column_checkboxes_1 = gr.CheckboxGroup(
369
- choices=deprecated_datasets_columns_1, value=deprecated_datasets_columns_1, label="Select Columns to Display"
 
 
370
  )
371
 
372
  with gr.Row():
373
  deprecated_datatype_1 = ["number", "markdown"] + ["number"] * (deprecated_num_datasets_1 + 1)
374
- deprecated_dataframe_1 = gr.Dataframe(deprecated_data_benchmark_1, datatype=deprecated_datatype_1, type="pandas")
 
 
375
 
376
  def deprecated_update_data_1(metric, search_term, selected_columns):
377
  deprecated_model_handler.get_vidore_data(metric)
@@ -380,7 +684,7 @@ def main():
380
  data = filter_models(data, search_term)
381
  # data = remove_duplicates(data) # Add this line
382
  if selected_columns:
383
- data = data[["Rank", "Model", "Average"] + selected_columns]
384
  return data
385
 
386
  with gr.Row():
@@ -399,13 +703,25 @@ def main():
399
  outputs=deprecated_dataframe_1,
400
  )
401
  deprecated_research_textbox_1.submit(
402
- lambda metric, search_term, selected_columns: deprecated_update_data_1(metric, search_term, selected_columns),
403
- inputs=[deprecated_metric_dropdown_1, deprecated_research_textbox_1, deprecated_column_checkboxes_1],
 
 
 
 
 
 
404
  outputs=deprecated_dataframe_1,
405
  )
406
  deprecated_column_checkboxes_1.change(
407
- lambda metric, search_term, selected_columns: deprecated_update_data_1(metric, search_term, selected_columns),
408
- inputs=[deprecated_metric_dropdown_1, deprecated_research_textbox_1, deprecated_column_checkboxes_1],
 
 
 
 
 
 
409
  outputs=deprecated_dataframe_1,
410
  )
411
 
@@ -441,7 +757,7 @@ def main():
441
  ```
442
  """
443
  )
444
- with gr.TabItem("Deprecated ViDoRe V2"):
445
  gr.Markdown(
446
  "## <span style='color:red'>Deprecation notice: This leaderboard contains the results computed with the "
447
  "[vidore-benchmark](https://github.com/illuin-tech/vidore-benchmark) package, "
@@ -449,8 +765,12 @@ def main():
449
  "[mteb](https://github.com/embeddings-benchmark/mteb) package as described "
450
  "[here](https://github.com/illuin-tech/vidore-benchmark/blob/main/README.md).</span>"
451
  )
452
- gr.Markdown("## <span style='color:red'>Missing results in the new leaderboard are being added as they are re-computed.</span>")
453
- gr.Markdown("# <span style='color:red'>[Deprecated]</span> ViDoRe V2: A new visual Document Retrieval Benchmark 📚🔍")
 
 
 
 
454
  gr.Markdown("### A harder dataset benchmark for visual document retrieval 👀")
455
 
456
  gr.Markdown(
@@ -463,18 +783,24 @@ def main():
463
  deprecated_datasets_columns_2 = list(deprecated_data_benchmark_2.columns[3:])
464
 
465
  with gr.Row():
466
- deprecated_metric_dropdown_2 = gr.Dropdown(choices=METRICS, value=initial_metric, label="Select Metric")
 
 
467
  deprecated_research_textbox_2 = gr.Textbox(
468
  placeholder="🔍 Search Models... [press enter]",
469
  label="Filter Models by Name",
470
  )
471
  deprecated_column_checkboxes_2 = gr.CheckboxGroup(
472
- choices=deprecated_datasets_columns_2, value=deprecated_datasets_columns_2, label="Select Columns to Display"
 
 
473
  )
474
 
475
  with gr.Row():
476
  deprecated_datatype_2 = ["number", "markdown"] + ["number"] * (deprecated_num_datasets_2 + 1)
477
- deprecated_dataframe_2 = gr.Dataframe(deprecated_data_benchmark_2, datatype=deprecated_datatype_2, type="pandas")
 
 
478
 
479
  def deprecated_update_data_2(metric, search_term, selected_columns):
480
  deprecated_model_handler.get_vidore_data(metric)
@@ -483,7 +809,7 @@ def main():
483
  data = filter_models(data, search_term)
484
  # data = remove_duplicates(data) # Add this line
485
  if selected_columns:
486
- data = data[["Rank", "Model", "Average"] + selected_columns]
487
  return data
488
 
489
  with gr.Row():
@@ -510,13 +836,25 @@ def main():
510
  outputs=deprecated_dataframe_2,
511
  )
512
  deprecated_research_textbox_2.submit(
513
- lambda metric, search_term, selected_columns: deprecated_update_data_2(metric, search_term, selected_columns),
514
- inputs=[deprecated_metric_dropdown_2, deprecated_research_textbox_2, deprecated_column_checkboxes_2],
 
 
 
 
 
 
515
  outputs=deprecated_dataframe_2,
516
  )
517
  deprecated_column_checkboxes_2.change(
518
- lambda metric, search_term, selected_columns: deprecated_update_data_2(metric, search_term, selected_columns),
519
- inputs=[deprecated_metric_dropdown_2, deprecated_research_textbox_2, deprecated_column_checkboxes_2],
 
 
 
 
 
 
520
  outputs=deprecated_dataframe_2,
521
  )
522
 
@@ -553,6 +891,15 @@ def main():
553
  """
554
  )
555
 
 
 
 
 
 
 
 
 
 
556
  block.queue(max_size=10).launch(debug=True)
557
 
558
 
 
1
+ import re
2
 
3
+ import gradio as gr
4
+ import pandas as pd
5
+ import plotly.express as px
6
+
7
+ from app.utils import (
8
+ add_rank_and_format,
9
+ deprecated_get_refresh_function,
10
+ filter_models,
11
+ get_pipeline_refresh_function,
12
+ get_refresh_function,
13
+ )
14
  from data.deprecated_model_handler import DeprecatedModelHandler
15
  from data.model_handler import ModelHandler
16
+ from data.pipeline_handler import PipelineHandler
17
 
18
  METRICS = [
19
  "ndcg_at_1",
 
32
  model_handler = ModelHandler()
33
  initial_metric = "ndcg_at_5"
34
 
 
35
  data_benchmark_1 = model_handler.render_df(initial_metric, benchmark_version=1)
36
  data_benchmark_1 = add_rank_and_format(data_benchmark_1, benchmark_version=1)
37
 
 
46
  num_scores_2 = len(data_benchmark_2) * num_datasets_2
47
  num_models_2 = len(data_benchmark_2)
48
 
 
49
  deprecated_model_handler = DeprecatedModelHandler()
50
  initial_metric = "ndcg_at_5"
51
+ initial_metric_v3 = "ndcg_at_10"
52
+
53
+ # Get pipeline evaluation results
54
+ pipeline_handler = PipelineHandler()
55
+ pipeline_handler.get_pipeline_data()
56
+ data_pipeline = pipeline_handler.render_df(initial_metric_v3, "english")
57
+ data_pipeline = add_rank_and_format(data_pipeline, benchmark_version=3, is_pipeline=True)
58
+
59
+ num_datasets_pipeline = len(data_pipeline.columns) - 5 # Excluding Rank, Model, Indexing time, search time, Average
60
+ num_scores_pipeline = len(data_pipeline) * num_datasets_pipeline
61
+ num_pipelines = len(data_pipeline)
62
 
63
  deprecated_model_handler.get_vidore_data(initial_metric)
64
  deprecated_data_benchmark_1 = deprecated_model_handler.render_df(initial_metric, benchmark_version=1)
 
102
  border-left: 4px solid #2196f3;
103
  padding: 5px 15px;
104
  }
105
+
106
+ /* 1. Force text wrapping on all headers from the 3rd column onwards */
107
+ #pipeline-table table th:nth-child(n+3) * {
108
+ white-space: normal !important;
109
+ overflow: visible !important;
110
+ text-overflow: clip !important;
111
+ line-height: 1.2 !important;
112
+ word-break: normal !important; /* Prevents breaking in the middle of words */
113
+ overflow-wrap: normal !important; /* Prevents breaking in the middle of words */
114
+ }
115
+
116
+ /* 2. Set a fixed width and center alignment for headers and data cells from the 3rd column onwards */
117
+ #pipeline-table table th:nth-child(n+3),
118
+ #pipeline-table table td:nth-child(n+3) {
119
+ min-width: 100px !important; /* Just wide enough to fit long words like "Pharmaceuticals" */
120
+ max-width: 120px !important;
121
+ }
122
+
123
+ /* 3. Make the Model column (2nd column) wider for pipeline table */
124
+ #pipeline-table table th:nth-child(2),
125
+ #pipeline-table table td:nth-child(2) {
126
+ min-width: 400px !important;
127
+ max-width: 500px !important;
128
+ }
129
  """
130
 
131
  with gr.Blocks(css=css) as block:
132
+ with gr.Tabs() as tabs:
133
+ with gr.TabItem("ViDoRe V3", id="vidore-v3"):
134
  gr.Markdown("# ViDoRe V3: a comprehensive evaluation of retrieval for enterprise use-case 👷‍♂️")
135
 
136
  gr.Markdown(
 
147
  gr.Markdown("""
148
  As the reference results are now hosted on the [MTEB Leaderboard](https://mteb-leaderboard.hf.space/?benchmark_name=ViDoRe(v3)),
149
  we embed it here.
150
+ """)
151
 
152
  gr.Markdown(
153
+ """**💡 To display English-only results:**
154
  - Under *Customize this Benchmark*, unselect the French datasets (*Vidore3EnergyRetrieval*, *Vidore3FinanceFrRetrieval*, *Vidore3PhysicsRetrieval*),
155
  - Go to the *Performance per language* tab (you might have to click on the three dots on the right of the tab bar to see it),
156
  - The *eng-Latn* column will show English-only results (= English queries on English documents).""",
157
+ elem_classes="alert-info",
158
  )
159
 
160
  gr.HTML(
 
165
  ></iframe>
166
  """
167
  )
168
+
169
+ with gr.TabItem("ViDoRe V3 (Pipeline)", id="vidore-v3-pipeline"):
170
+ gr.Markdown("# ViDoRe V3 (Pipeline Evaluation): Retrieval Performance for Complex Pipelines ⚙️")
171
+ gr.Markdown(
172
+ "### Assessing retrieval performance, latency, and compute costs of complex retrieval pipelines"
173
+ )
174
+
175
+ gr.Markdown(
176
+ """
177
+ This leaderboard ranks full retrieval pipelines on **English-only queries** for **ViDoRe V3**. Instead of just testing standalone models, we evaluate real-world, multi-step retrieval systems. This includes everything from basic retrievers to advanced setups using AI agents, query reformulation, hybrid search, and any other creative retrieval pipeline one can imagine.
178
+
179
+ To show efficiency alongside accuracy, we include **Indexing latency** (seconds/doc) and **Search latency** (seconds/query). These numbers are self-reported and depend heavily on hardware, so treat them as rough estimates. Still, they give a helpful idea of how much computing power each pipeline needs.
180
+
181
+ ⚠️ **Note:** Because this only uses English queries, these scores **cannot be directly compared** to the standard ViDoRe V3 results.
182
+
183
+ *Results are sourced from the [vidore-benchmark repository](https://github.com/illuin-tech/vidore-benchmark/tree/main/results).*
184
+ """
185
+ )
186
+
187
+ if len(data_pipeline) > 0:
188
+ datasets_columns_pipeline = [
189
+ col
190
+ for col in data_pipeline.columns[4:]
191
+ if col not in ["Indexing latency (s/doc)", "Search latency (s/query)", "Average Score"]
192
+ ]
193
+
194
+ with gr.Row():
195
+ metric_dropdown_pipeline = gr.Dropdown(
196
+ choices=METRICS, value=initial_metric_v3, label="Select Metric"
197
+ )
198
+ research_textbox_pipeline = gr.Textbox(
199
+ placeholder="🔍 Search Pipelines... [press enter]",
200
+ label="Filter Pipelines by Name",
201
+ )
202
+
203
+ with gr.Row():
204
+ if datasets_columns_pipeline:
205
+ column_checkboxes_pipeline = gr.CheckboxGroup(
206
+ choices=datasets_columns_pipeline,
207
+ value=datasets_columns_pipeline,
208
+ label="Select Datasets to Display",
209
+ )
210
+ else:
211
+ column_checkboxes_pipeline = gr.CheckboxGroup(
212
+ choices=[], value=[], label="Select Datasets to Display"
213
+ )
214
+
215
+ with gr.Row():
216
+ # Datatype: Rank, Pipeline, Indexing latency (s/doc), Search latency (s/query), Average + datasets
217
+ datatype_pipeline = ["number", "markdown", "number", "number", "number"] + ["number"] * len(
218
+ datasets_columns_pipeline
219
+ )
220
+ dataframe_pipeline = gr.Dataframe(
221
+ data_pipeline, datatype=datatype_pipeline, type="pandas", elem_id="pipeline-table"
222
+ )
223
+
224
+ def clean_pipeline_name(name):
225
+ if not isinstance(name, str):
226
+ return str(name)
227
+ # Remove Markdown links [text](url) -> text
228
+ name = re.sub(r"\[([^\]]+)\]\([^\)]+\)", r"\1", name)
229
+ # Remove HTML tags <a href="...">text</a> -> text
230
+ name = re.sub(r"<[^>]+>", "", name)
231
+ return name.strip()
232
+
233
+ def create_pipeline_plot(df, latency_col):
234
+ if df is None or len(df) == 0:
235
+ return None
236
+
237
+ # Ensure expected columns exist
238
+ if (
239
+ latency_col not in df.columns
240
+ or "Average Score" not in df.columns
241
+ or "Pipeline" not in df.columns
242
+ ):
243
+ return None
244
+
245
+ # Clean the dataframe for plotting
246
+ plot_df = df.copy()
247
+
248
+ # Strip HTML and Markdown for clean hover text
249
+ plot_df["Cleaned Pipeline"] = plot_df["Pipeline"].apply(clean_pipeline_name)
250
+
251
+ plot_df[latency_col] = pd.to_numeric(plot_df[latency_col], errors="coerce")
252
+ plot_df["Average Score"] = pd.to_numeric(plot_df["Average Score"], errors="coerce")
253
+
254
+ plot_df = plot_df.dropna(subset=[latency_col, "Average Score"])
255
+ plot_df = plot_df[plot_df[latency_col] > 0]
256
+ plot_df = plot_df.sort_values(by=latency_col)
257
+
258
+ if len(plot_df) == 0:
259
+ return None
260
+
261
+ fig = px.scatter(
262
+ plot_df,
263
+ x=latency_col,
264
+ y="Average Score",
265
+ hover_name="Cleaned Pipeline", # Use the clean text!
266
+ title=f"Mean Performance vs {latency_col}",
267
+ color_discrete_sequence=["orange"],
268
+ )
269
+
270
+ fig.update_layout(
271
+ xaxis_title=latency_col,
272
+ yaxis_title="Average Score",
273
+ plot_bgcolor="white",
274
+ )
275
+
276
+ fig.update_xaxes(showgrid=True, gridcolor="lightgrey")
277
+ fig.update_yaxes(showgrid=True, gridcolor="lightgrey")
278
+
279
+ fig.update_traces(marker=dict(size=12, opacity=0.8, line=dict(width=1, color="DarkSlateGrey")))
280
+ return fig
281
+
282
+ with gr.Row():
283
+ latency_radio = gr.Radio(
284
+ choices=["Search latency (s/query)", "Indexing latency (s/doc)"],
285
+ value="Search latency (s/query)",
286
+ label="Select Latency Metric for X-Axis",
287
+ )
288
+
289
+ with gr.Row():
290
+ initial_fig = create_pipeline_plot(data_pipeline, "Search latency (s/query)")
291
+ performance_plot = gr.Plot(value=initial_fig)
292
+
293
+ def update_data_pipeline(metric, search_term, selected_columns):
294
+ pipeline_handler.get_pipeline_data()
295
+ data = pipeline_handler.render_df(metric, "english")
296
+ data = add_rank_and_format(
297
+ data, benchmark_version=3, selected_columns=selected_columns, is_pipeline=True
298
+ )
299
+ data = filter_models(data, search_term)
300
+ if selected_columns:
301
+ # Include core columns plus selected dataset columns
302
+ core_cols = ["Rank", "Pipeline", "Indexing latency (s/doc)", "Search latency (s/query)"]
303
+ if "Average Score" in data.columns:
304
+ core_cols.insert(4, "Average Score")
305
+ data = data[core_cols + selected_columns]
306
+ return data
307
+
308
+ with gr.Row():
309
+ refresh_button_pipeline = gr.Button("Refresh")
310
+ refresh_button_pipeline.click(
311
+ lambda metric: add_rank_and_format(
312
+ pipeline_handler.render_df(metric, "english"), benchmark_version=3, is_pipeline=True
313
+ ),
314
+ inputs=[metric_dropdown_pipeline],
315
+ outputs=dataframe_pipeline,
316
+ concurrency_limit=20,
317
+ ).then(
318
+ fn=create_pipeline_plot,
319
+ inputs=[dataframe_pipeline, latency_radio],
320
+ outputs=performance_plot,
321
+ )
322
+
323
+ with gr.Row():
324
+ gr.Markdown(
325
+ """
326
+ **Note**: These results represent full pipeline evaluations on English queries ONLY (since other queries were mostly directly translated from their English counterparts).
327
+ We felt multi-lingual results were less critical (and much more costly to evaluate on the full set) for pipelines, since a user could just add a translation module to the pipeline and expect similar performance to the English results.
328
+ If you feel this is a mistake and multi-lingual results are critical for pipelines, please let us know by opening an issue in the GitHub repository!
329
+ """
330
+ )
331
+
332
+ # Automatically refresh the dataframe when the dropdown value changes
333
+ def refresh_pipeline_data(metric):
334
+ """Refresh pipeline data when metric changes."""
335
+ df = pipeline_handler.render_df(metric, "english")
336
+ return add_rank_and_format(df, benchmark_version=3, is_pipeline=True)
337
+
338
+ # Update dataframe and then update the plot
339
+ metric_dropdown_pipeline.change(
340
+ refresh_pipeline_data,
341
+ inputs=[metric_dropdown_pipeline],
342
+ outputs=dataframe_pipeline,
343
+ ).then(
344
+ fn=create_pipeline_plot, inputs=[dataframe_pipeline, latency_radio], outputs=performance_plot
345
+ )
346
+
347
+ research_textbox_pipeline.submit(
348
+ lambda metric, search_term, selected_columns: update_data_pipeline(
349
+ metric, search_term, selected_columns
350
+ ),
351
+ inputs=[metric_dropdown_pipeline, research_textbox_pipeline, column_checkboxes_pipeline],
352
+ outputs=dataframe_pipeline,
353
+ ).then(
354
+ fn=create_pipeline_plot, inputs=[dataframe_pipeline, latency_radio], outputs=performance_plot
355
+ )
356
+
357
+ column_checkboxes_pipeline.change(
358
+ lambda metric, search_term, selected_columns: update_data_pipeline(
359
+ metric, search_term, selected_columns
360
+ ),
361
+ inputs=[metric_dropdown_pipeline, research_textbox_pipeline, column_checkboxes_pipeline],
362
+ outputs=dataframe_pipeline,
363
+ ).then(
364
+ fn=create_pipeline_plot, inputs=[dataframe_pipeline, latency_radio], outputs=performance_plot
365
+ )
366
+
367
+ # Update plot when the radio button changes
368
+ latency_radio.change(
369
+ fn=create_pipeline_plot, inputs=[dataframe_pipeline, latency_radio], outputs=performance_plot
370
+ )
371
+
372
+ gr.Markdown(
373
+ f"""
374
+ - **Total Datasets**: {num_datasets_pipeline}
375
+ - **Total Scores**: {num_scores_pipeline}
376
+ - **Total Pipelines**: {num_pipelines}
377
+ """
378
+ + r"""
379
+ Please consider citing:
380
+
381
+ ```bibtex
382
+ @misc{faysse2024colpaliefficientdocumentretrieval,
383
+ title={ColPali: Efficient Document Retrieval with Vision Language Models},
384
+ author={Manuel Faysse and Hugues Sibille and Tony Wu and Bilel Omrani and Gautier Viaud and Céline Hudelot and Pierre Colombo},
385
+ year={2024},
386
+ eprint={2407.01449},
387
+ archivePrefix={arXiv},
388
+ primaryClass={cs.IR},
389
+ url={https://arxiv.org/abs/2407.01449},
390
+ }
391
+ @misc{loison2026vidore,
392
+ title={ViDoRe V3: A Comprehensive Evaluation of Retrieval Augmented Generation in Complex Real-World Scenarios},
393
+ author={Loison, Ant{\'o}nio and Mac{\'e}, Quentin and Edy, Antoine and Xing, Victor and Balough, Tom and Moreira, Gabriel and Liu, Bo and Faysse, Manuel and Hudelot, C{\'e}line and Viaud, Gautier},
394
+ journal={arXiv preprint arXiv:2601.08620},
395
+ year={2026}
396
+ }
397
+ ```
398
+ """
399
+ )
400
+ else:
401
+ gr.Markdown("**No pipeline evaluation results available yet. Check back later!**")
402
+
403
+ with gr.TabItem("ViDoRe V2", id="vidore-v2"):
404
  gr.Markdown("# ViDoRe V2: A new visual Document Retrieval Benchmark 📚🔍")
405
  gr.Markdown("### A harder dataset benchmark for visual document retrieval 👀")
406
 
 
434
  data = filter_models(data, search_term)
435
  # data = remove_duplicates(data) # Add this line
436
  if selected_columns:
437
+ data = data[
438
+ ["Rank", "Model", "Model Size (Million Parameters)", "Average Score"] + selected_columns
439
+ ]
440
  return data
441
 
442
  with gr.Row():
 
505
  ```
506
  """
507
  )
508
+ with gr.TabItem("ViDoRe V1", id="vidore-v1"):
509
  gr.Markdown("# ViDoRe: The Visual Document Retrieval Benchmark 1 📚🔍")
510
  gr.Markdown("### From the paper - ColPali: Efficient Document Retrieval with Vision Language Models 👀")
511
 
 
538
  data = add_rank_and_format(data, benchmark_version=1, selected_columns=selected_columns)
539
  data = filter_models(data, search_term)
540
  if selected_columns:
541
+ data = data[
542
+ ["Rank", "Model", "Model Size (Million Parameters)", "Average Score"] + selected_columns
543
+ ]
544
  return data
545
 
546
  with gr.Row():
 
601
  ```
602
  """
603
  )
604
+ with gr.TabItem("📚 Submit your model", id="submission"):
605
  gr.Markdown("# How to Submit a New Model to the Leaderboard")
606
  gr.Markdown(
607
  """
608
+ ## Original (ViDoRe v1-v3) leaderboard:
609
+ To submit a new model to the original ViDoRe leaderboard, follow these steps:
610
 
611
  1. **Evaluate your model**:
612
  - Follow the evaluation procedure provided in the [ViDoRe GitHub repository](https://github.com/illuin-tech/vidore-benchmark/) that uses MTEB.
 
619
 
620
  Note: For proper hyperlink redirection, please ensure that your model repository name is in
621
  kebab-case, e.g. `my-model-name`.
622
+
623
+
624
+ ## ViDoRe V3 Pipeline leaderboard:
625
+ To submit a new pipeline to the ViDoRe V3 pipeline leaderboard, follow these steps:
626
+ 1. **Evaluate your pipeline**:
627
+ - Follow the evaluation procedure provided in the [ViDoRe GitHub repository](https://github.com/illuin-tech/vidore-benchmark/) for pipelines
628
+ 2. **Open a PR on the ViDoRe GitHub repository including**:
629
+ - Your results, which are directly output in the correct format
630
+ - A short description of the pipeline and the main components used (some examples are available in the repository)
631
+
632
+ And you're done! Your pipeline will appear on the leaderboard once the PR is merged!
633
  """
634
  )
635
+ with gr.TabItem("Deprecated ViDoRe V1", id="vidore-v1-deprecated"):
636
  gr.Markdown(
637
  "## <span style='color:red'>Deprecation notice: This leaderboard contains the results computed with the "
638
  "[vidore-benchmark](https://github.com/illuin-tech/vidore-benchmark) package, "
 
640
  "[mteb](https://github.com/embeddings-benchmark/mteb) package as described "
641
  "[here](https://github.com/illuin-tech/vidore-benchmark/blob/main/README.md).</span>"
642
  )
643
+ gr.Markdown(
644
+ "## <span style='color:red'>Missing results in the new leaderboard are being added as they are re-computed.</span>"
645
+ )
646
+ gr.Markdown(
647
+ "# <span style='color:red'>[Deprecated]</span> ViDoRe: The Visual Document Retrieval Benchmark 1 📚🔍"
648
+ )
649
  gr.Markdown("### From the paper - ColPali: Efficient Document Retrieval with Vision Language Models 👀")
650
 
651
  gr.Markdown(
 
658
  deprecated_datasets_columns_1 = list(deprecated_data_benchmark_1.columns[3:])
659
 
660
  with gr.Row():
661
+ deprecated_metric_dropdown_1 = gr.Dropdown(
662
+ choices=METRICS, value=initial_metric, label="Select Metric"
663
+ )
664
  deprecated_research_textbox_1 = gr.Textbox(
665
  placeholder="🔍 Search Models... [press enter]",
666
  label="Filter Models by Name",
667
  )
668
  deprecated_column_checkboxes_1 = gr.CheckboxGroup(
669
+ choices=deprecated_datasets_columns_1,
670
+ value=deprecated_datasets_columns_1,
671
+ label="Select Columns to Display",
672
  )
673
 
674
  with gr.Row():
675
  deprecated_datatype_1 = ["number", "markdown"] + ["number"] * (deprecated_num_datasets_1 + 1)
676
+ deprecated_dataframe_1 = gr.Dataframe(
677
+ deprecated_data_benchmark_1, datatype=deprecated_datatype_1, type="pandas"
678
+ )
679
 
680
  def deprecated_update_data_1(metric, search_term, selected_columns):
681
  deprecated_model_handler.get_vidore_data(metric)
 
684
  data = filter_models(data, search_term)
685
  # data = remove_duplicates(data) # Add this line
686
  if selected_columns:
687
+ data = data[["Rank", "Model", "Average Score"] + selected_columns]
688
  return data
689
 
690
  with gr.Row():
 
703
  outputs=deprecated_dataframe_1,
704
  )
705
  deprecated_research_textbox_1.submit(
706
+ lambda metric, search_term, selected_columns: deprecated_update_data_1(
707
+ metric, search_term, selected_columns
708
+ ),
709
+ inputs=[
710
+ deprecated_metric_dropdown_1,
711
+ deprecated_research_textbox_1,
712
+ deprecated_column_checkboxes_1,
713
+ ],
714
  outputs=deprecated_dataframe_1,
715
  )
716
  deprecated_column_checkboxes_1.change(
717
+ lambda metric, search_term, selected_columns: deprecated_update_data_1(
718
+ metric, search_term, selected_columns
719
+ ),
720
+ inputs=[
721
+ deprecated_metric_dropdown_1,
722
+ deprecated_research_textbox_1,
723
+ deprecated_column_checkboxes_1,
724
+ ],
725
  outputs=deprecated_dataframe_1,
726
  )
727
 
 
757
  ```
758
  """
759
  )
760
+ with gr.TabItem("Deprecated ViDoRe V2", id="vidore-v2-deprecated"):
761
  gr.Markdown(
762
  "## <span style='color:red'>Deprecation notice: This leaderboard contains the results computed with the "
763
  "[vidore-benchmark](https://github.com/illuin-tech/vidore-benchmark) package, "
 
765
  "[mteb](https://github.com/embeddings-benchmark/mteb) package as described "
766
  "[here](https://github.com/illuin-tech/vidore-benchmark/blob/main/README.md).</span>"
767
  )
768
+ gr.Markdown(
769
+ "## <span style='color:red'>Missing results in the new leaderboard are being added as they are re-computed.</span>"
770
+ )
771
+ gr.Markdown(
772
+ "# <span style='color:red'>[Deprecated]</span> ViDoRe V2: A new visual Document Retrieval Benchmark 📚🔍"
773
+ )
774
  gr.Markdown("### A harder dataset benchmark for visual document retrieval 👀")
775
 
776
  gr.Markdown(
 
783
  deprecated_datasets_columns_2 = list(deprecated_data_benchmark_2.columns[3:])
784
 
785
  with gr.Row():
786
+ deprecated_metric_dropdown_2 = gr.Dropdown(
787
+ choices=METRICS, value=initial_metric, label="Select Metric"
788
+ )
789
  deprecated_research_textbox_2 = gr.Textbox(
790
  placeholder="🔍 Search Models... [press enter]",
791
  label="Filter Models by Name",
792
  )
793
  deprecated_column_checkboxes_2 = gr.CheckboxGroup(
794
+ choices=deprecated_datasets_columns_2,
795
+ value=deprecated_datasets_columns_2,
796
+ label="Select Columns to Display",
797
  )
798
 
799
  with gr.Row():
800
  deprecated_datatype_2 = ["number", "markdown"] + ["number"] * (deprecated_num_datasets_2 + 1)
801
+ deprecated_dataframe_2 = gr.Dataframe(
802
+ deprecated_data_benchmark_2, datatype=deprecated_datatype_2, type="pandas"
803
+ )
804
 
805
  def deprecated_update_data_2(metric, search_term, selected_columns):
806
  deprecated_model_handler.get_vidore_data(metric)
 
809
  data = filter_models(data, search_term)
810
  # data = remove_duplicates(data) # Add this line
811
  if selected_columns:
812
+ data = data[["Rank", "Model", "Average Score"] + selected_columns]
813
  return data
814
 
815
  with gr.Row():
 
836
  outputs=deprecated_dataframe_2,
837
  )
838
  deprecated_research_textbox_2.submit(
839
+ lambda metric, search_term, selected_columns: deprecated_update_data_2(
840
+ metric, search_term, selected_columns
841
+ ),
842
+ inputs=[
843
+ deprecated_metric_dropdown_2,
844
+ deprecated_research_textbox_2,
845
+ deprecated_column_checkboxes_2,
846
+ ],
847
  outputs=deprecated_dataframe_2,
848
  )
849
  deprecated_column_checkboxes_2.change(
850
+ lambda metric, search_term, selected_columns: deprecated_update_data_2(
851
+ metric, search_term, selected_columns
852
+ ),
853
+ inputs=[
854
+ deprecated_metric_dropdown_2,
855
+ deprecated_research_textbox_2,
856
+ deprecated_column_checkboxes_2,
857
+ ],
858
  outputs=deprecated_dataframe_2,
859
  )
860
 
 
891
  """
892
  )
893
 
894
+ def select_tab_from_url(request: gr.Request):
895
+ # Grab query parameters from the URL
896
+ query_params = dict(request.query_params)
897
+ # Look for ?tab=..., default to the first tab's ID if not found
898
+ target_tab = query_params.get("tab", "vidore-v3")
899
+ # Update the tabs component to select the target ID
900
+ return gr.update(selected=target_tab)
901
+
902
+ block.load(select_tab_from_url, inputs=None, outputs=tabs)
903
  block.queue(max_size=10).launch(debug=True)
904
 
905
 
app/utils.py CHANGED
@@ -1,7 +1,20 @@
1
-
2
-
3
- def make_clickable_model(model_name, link=None):
4
- if link is None:
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  desanitized_model_name = model_name.replace("__", "/")
6
  desanitized_model_name = desanitized_model_name.replace("_", "/")
7
  desanitized_model_name = desanitized_model_name.replace("-thisisapoint-", ".")
@@ -11,47 +24,81 @@ def make_clickable_model(model_name, link=None):
11
  if "/ocr" in desanitized_model_name:
12
  desanitized_model_name = desanitized_model_name.replace("/ocr", "")
13
 
14
- link = "https://huggingface.co/" + desanitized_model_name
 
15
 
16
  return f'<a target="_blank" style="text-decoration: underline" href="{link}">{desanitized_model_name}</a>'
17
 
18
 
19
  def add_rank(df, benchmark_version=1, selected_columns=None):
20
- df.fillna(0.0, inplace=True)
21
- if selected_columns is None:
22
- cols_to_rank = [
23
- col
24
- for col in df.columns
25
- if col
26
- not in [
27
- "Model",
28
- "Model Size (Million Parameters)",
29
- "Memory Usage (GB, fp32)",
30
- "Embedding Dimensions",
31
- "Max Tokens",
32
- ]
 
 
 
 
 
 
 
 
 
 
 
33
  ]
34
- else:
35
- cols_to_rank = selected_columns
36
-
37
- if len(cols_to_rank) == 1:
38
- df.sort_values(cols_to_rank[0], ascending=False, inplace=True)
39
- else:
40
- df.insert(len(df.columns) - len(cols_to_rank), "Average", df[cols_to_rank].mean(axis=1, skipna=False))
41
- df.sort_values("Average", ascending=False, inplace=True)
42
- df.insert(0, "Rank", list(range(1, len(df) + 1)))
43
- # multiply values by 100 if they are floats and round to 1 decimal place
44
- for col in df.columns:
45
- if df[col].dtype == "float64" and col != "Model Size (Million Parameters)":
46
- df[col] = df[col].apply(lambda x: round(x * 100, 1))
47
- return df
 
 
 
 
 
 
 
 
 
 
 
 
48
 
49
 
50
- def add_rank_and_format(df, benchmark_version=1, selected_columns=None):
51
  df = df.reset_index()
52
- df = df.rename(columns={"index": "Model"})
 
53
  df = add_rank(df, benchmark_version, selected_columns)
54
- df["Model"] = df["Model"].apply(make_clickable_model)
 
 
 
 
 
 
 
 
 
55
  # df = remove_duplicates(df)
56
  return df
57
 
@@ -73,6 +120,7 @@ def get_refresh_function(model_handler, benchmark_version):
73
 
74
  return _refresh
75
 
 
76
  def deprecated_get_refresh_function(model_handler, benchmark_version):
77
  def _refresh(metric):
78
  model_handler.get_vidore_data(metric)
@@ -83,7 +131,21 @@ def deprecated_get_refresh_function(model_handler, benchmark_version):
83
  return _refresh
84
 
85
 
 
 
 
 
 
 
 
 
 
 
 
 
86
  def filter_models(data, search_term):
87
  if search_term:
88
- data = data[data["Model"].str.contains(search_term, case=False, na=False)]
 
 
89
  return data
 
1
+ import pandas as pd
2
+ import math
3
+
4
+
5
+ def make_clickable_model(model_name, link=None, is_pipeline=False, folder_name=None):
6
+ if is_pipeline:
7
+ # For pipelines: use folder_name for link, model_name (alias) for display
8
+ link_folder = folder_name if folder_name else model_name
9
+ # Process folder name for link: only handle __ and -thisisapoint-
10
+ desanitized_folder = link_folder.replace("__", "/")
11
+ desanitized_folder = desanitized_folder.replace("-thisisapoint-", ".")
12
+ if link is None:
13
+ link = f"https://github.com/illuin-tech/vidore-benchmark/tree/main/results/pipeline_descriptions/{desanitized_folder}/description.json"
14
+ # Use word-wrap styling for potentially long pipeline aliases
15
+ return f'<a target="_blank" style="text-decoration: underline; word-wrap: break-word; white-space: normal; display: inline-block; max-width: 450px;" href="{link}">{model_name}</a>'
16
+ else:
17
+ # For regular models: replace __ and _ with /, and -thisisapoint- with .
18
  desanitized_model_name = model_name.replace("__", "/")
19
  desanitized_model_name = desanitized_model_name.replace("_", "/")
20
  desanitized_model_name = desanitized_model_name.replace("-thisisapoint-", ".")
 
24
  if "/ocr" in desanitized_model_name:
25
  desanitized_model_name = desanitized_model_name.replace("/ocr", "")
26
 
27
+ if link is None:
28
+ link = "https://huggingface.co/" + desanitized_model_name
29
 
30
  return f'<a target="_blank" style="text-decoration: underline" href="{link}">{desanitized_model_name}</a>'
31
 
32
 
33
def _round_sig(value, sig_digits=3):
    """Round ``value`` to ``sig_digits`` significant figures.

    Zero and non-finite values (NaN, +/-inf) are returned unchanged:
    ``math.log10`` is undefined for them and would otherwise raise.
    """
    if value == 0 or not math.isfinite(value):
        return value
    magnitude = int(math.floor(math.log10(abs(value))))
    return round(value, sig_digits - magnitude - 1)


def add_rank(df, benchmark_version=1, selected_columns=None):
    """Sort a results table, prepend a 1-based ``Rank`` column and format numbers.

    The frame is modified in place and also returned.

    Args:
        df: Results table. Every column except "Model", "Pipeline" and
            "_folder_name" is coerced to numeric; values that cannot be parsed
            become NaN and are then replaced by 0.0.
        benchmark_version: Accepted for interface compatibility; not used here.
        selected_columns: Columns to rank on. When ``None``, every column that
            is not a label/metadata column is used.

    Returns:
        The same DataFrame, sorted in descending order by the single ranking
        column (when there is exactly one) or by "Average Score" otherwise,
        with "Rank" as the first column. Score columns are multiplied by 100;
        scores and latencies are rounded to 3 significant figures.
    """
    label_columns = ("Model", "Pipeline", "_folder_name")

    # Columns may arrive as 'object' dtype because of mixed data; coerce them.
    for col in df.columns:
        if col not in label_columns:
            df[col] = pd.to_numeric(df[col], errors="coerce")

    # Only fill NaN in numeric columns so string columns such as _folder_name
    # are left untouched.
    numeric_cols = df.select_dtypes(include=["float64", "int64", "float32", "int32"]).columns
    df[numeric_cols] = df[numeric_cols].fillna(0.0)

    if selected_columns is None:
        not_ranked = (
            "Model",
            "Pipeline",  # For pipeline tables
            "Model Size (Million Parameters)",
            "Memory Usage (GB, fp32)",
            "Embedding Dimensions",
            "Max Tokens",
            "Compute Cost ($)",
            "Queries per Second",
            "_folder_name",  # Hidden column for pipeline link generation
        )
        cols_to_rank = [col for col in df.columns if col not in not_ranked]
    else:
        cols_to_rank = selected_columns

    if len(cols_to_rank) == 1:
        df.sort_values(cols_to_rank[0], ascending=False, inplace=True)
    else:
        # Only add the average column if the caller has not provided one.
        if "Average Score" not in df.columns:
            df.insert(
                len(df.columns) - len(cols_to_rank),
                "Average Score",
                df[cols_to_rank].mean(axis=1, skipna=False),
            )
        df.sort_values("Average Score", ascending=False, inplace=True)
    df.insert(0, "Rank", list(range(1, len(df) + 1)))

    # Columns that are neither scaled by 100 nor rounded.
    raw_columns = (
        "Model Size (Million Parameters)",
        "Compute Cost ($)",
        "Queries per Second",
        "Rank",
    )
    # Columns that are rounded but keep their original unit.
    latency_columns = ("Indexing latency (s/doc)", "Search latency (s/query)")

    for col in df.columns:
        if not pd.api.types.is_numeric_dtype(df[col]):
            continue
        if col in latency_columns:
            df[col] = df[col].apply(_round_sig)
        elif col not in raw_columns:
            # Scores are stored as fractions; display them as percentages.
            df[col] = df[col].apply(lambda x: _round_sig(x * 100))
    return df
85
 
86
 
87
def add_rank_and_format(df, benchmark_version=1, selected_columns=None, is_pipeline=False):
    """Rank a results table and turn its name column into HTML links.

    The frame's index is moved into a column named "Pipeline" (when
    ``is_pipeline`` is true) or "Model", the table is passed through
    ``add_rank``, and each name is replaced by the anchor produced by
    ``make_clickable_model``. For pipeline tables that carry a
    "_folder_name" column, that column supplies the link target and is
    dropped from the returned frame.
    """
    name_col = "Pipeline" if is_pipeline else "Model"
    ranked = add_rank(
        df.reset_index().rename(columns={"index": name_col}),
        benchmark_version,
        selected_columns,
    )

    if not (is_pipeline and "_folder_name" in ranked.columns):
        ranked[name_col] = ranked[name_col].apply(lambda x: make_clickable_model(x, is_pipeline=is_pipeline))
        return ranked

    def _linked_name(row):
        # The displayed text is the alias; the link is built from the folder name.
        return make_clickable_model(row[name_col], is_pipeline=True, folder_name=row["_folder_name"])

    ranked[name_col] = ranked.apply(_linked_name, axis=1)
    return ranked.drop(columns=["_folder_name"])
104
 
 
120
 
121
  return _refresh
122
 
123
+
124
  def deprecated_get_refresh_function(model_handler, benchmark_version):
125
  def _refresh(metric):
126
  model_handler.get_vidore_data(metric)
 
131
  return _refresh
132
 
133
 
134
def get_pipeline_refresh_function(pipeline_handler):
    """Build the refresh callback for the pipeline evaluation table.

    The returned callable takes a metric name, asks ``pipeline_handler`` to
    re-fetch its data, renders the table for that metric and returns it
    ranked and formatted as a pipeline table.
    """

    def _refresh(metric):
        pipeline_handler.get_pipeline_data()
        return add_rank_and_format(
            pipeline_handler.render_df(metric),
            benchmark_version=3,
            is_pipeline=True,
        )

    return _refresh
144
+
145
+
146
def filter_models(data, search_term):
    """Return the rows whose name column contains ``search_term``.

    The name column is "Pipeline" when the table has one, otherwise "Model".
    Matching is case-insensitive and literal: the search text comes straight
    from a user-facing textbox, so it must not be interpreted as a regular
    expression (input such as "(" or "c++" would otherwise raise ``re.error``).
    Rows with a missing name never match. An empty or ``None`` search term
    returns ``data`` unchanged.
    """
    if not search_term:
        return data
    col_name = "Pipeline" if "Pipeline" in data.columns else "Model"
    matches = data[col_name].str.contains(search_term, case=False, na=False, regex=False)
    return data[matches]
data/pipeline_handler.py ADDED
@@ -0,0 +1,274 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import requests
3
+ from typing import Dict, List, Optional
4
+
5
+ import pandas as pd
6
+
7
+
8
class PipelineHandler:
    """Handler for ViDoRe v3 pipeline evaluation results from GitHub."""

    # Seconds to wait for GitHub before giving up. Without a timeout a stalled
    # connection would block the caller indefinitely.
    REQUEST_TIMEOUT_S = 30

    def __init__(self):
        self.pipeline_infos = {}
        self.pipeline_aliases = {}  # Maps folder_name -> pipeline_alias for display
        self.github_base_url = "https://raw.githubusercontent.com/illuin-tech/vidore-benchmark/main/results/metrics"
        self.github_descriptions_base_url = (
            "https://raw.githubusercontent.com/illuin-tech/vidore-benchmark/main/results/pipeline_descriptions"
        )
        self.available_datasets = []
        self.available_languages = ["english"]  # Default languages available

        # Setup GitHub authentication if token is available
        self.github_token = os.environ.get("GITHUB_TOKEN")
        self.headers = {}
        if self.github_token:
            self.headers["Authorization"] = f"token {self.github_token}"
            print("GitHub token detected - using authenticated requests")

    def _get_json(self, url: str):
        """GET ``url`` with the handler's headers and return the decoded JSON body.

        Raises whatever ``requests`` raises on connection errors, timeouts,
        non-2xx responses or an undecodable body; callers decide how to recover.
        """
        response = requests.get(url, headers=self.headers, timeout=self.REQUEST_TIMEOUT_S)
        response.raise_for_status()
        return response.json()

    def get_pipeline_folders_from_github(self) -> List[str]:
        """Return the sorted pipeline folder names listed by the GitHub API.

        Best effort: any failure is printed and an empty list is returned.
        """
        api_url = "https://api.github.com/repos/illuin-tech/vidore-benchmark/contents/results/metrics"

        try:
            contents = self._get_json(api_url)
            # Filter for directories only
            return sorted(item["name"] for item in contents if item["type"] == "dir")
        except Exception as e:
            print(f"Error fetching pipeline folders from GitHub: {e}")
            return []

    def get_dataset_files_from_github(self, pipeline_name: str) -> List[str]:
        """Return the sorted ``vidore_v3*.json`` file names in a pipeline folder.

        Best effort: any failure is printed and an empty list is returned.
        """
        api_url = f"https://api.github.com/repos/illuin-tech/vidore-benchmark/contents/results/metrics/{pipeline_name}"

        try:
            contents = self._get_json(api_url)
            # Filter for JSON files that start with 'vidore_v3'
            return sorted(
                item["name"]
                for item in contents
                if item["type"] == "file" and item["name"].startswith("vidore_v3") and item["name"].endswith(".json")
            )
        except Exception as e:
            print(f"Error fetching dataset files from {pipeline_name}: {e}")
            return []

    def fetch_json_from_github(self, pipeline_name: str, filename: str) -> Optional[Dict]:
        """Fetch one results JSON file from GitHub raw content.

        Returns the decoded document, or ``None`` (after printing the error)
        when the download or decoding fails.
        """
        url = f"{self.github_base_url}/{pipeline_name}/{filename}"

        try:
            return self._get_json(url)
        except Exception as e:
            print(f"Error fetching {filename} from {pipeline_name}: {e}")
            return None

    def fetch_pipeline_alias(self, pipeline_name: str) -> Optional[str]:
        """Fetch the ``pipeline_alias`` from a pipeline's description.json.

        Uses raw.githubusercontent.com to avoid API rate limits. Returns
        ``None`` when the file cannot be fetched (the error is printed) or
        when it has no ``pipeline_alias`` key.
        """
        url = f"{self.github_descriptions_base_url}/{pipeline_name}/description.json"

        try:
            return self._get_json(url).get("pipeline_alias")
        except Exception as e:
            print(f"Error fetching description for {pipeline_name}: {e}")
            return None

    def get_pipeline_data(self):
        """Fetch all pipeline data from GitHub.

        Adds or overwrites entries in ``pipeline_infos`` and
        ``pipeline_aliases`` for every pipeline that yields at least one
        results file, then replaces ``available_datasets`` and
        ``available_languages`` with what was seen during this pass
        ("overall" is always included among the languages).
        """
        datasets_set = set()
        languages_set = {"overall"}

        for pipeline_name in self.get_pipeline_folders_from_github():
            dataset_files = self.get_dataset_files_from_github(pipeline_name)
            if not dataset_files:
                continue

            pipeline_data = {}
            for filename in dataset_files:
                results = self.fetch_json_from_github(pipeline_name, filename)
                if not results:
                    continue

                # Extract dataset name from filename (remove vidore_v3_ prefix and .json suffix)
                dataset_name = filename.replace("vidore_v3_", "").replace(".json", "")
                datasets_set.add(dataset_name)
                pipeline_data[dataset_name] = results

                # Collect available languages
                if "aggregated_metrics" in results and "by_language" in results["aggregated_metrics"]:
                    languages_set.update(results["aggregated_metrics"]["by_language"].keys())

            if pipeline_data:
                self.pipeline_infos[pipeline_name] = pipeline_data
                # Fetch the pipeline alias for display (uses raw URL, not API)
                alias = self.fetch_pipeline_alias(pipeline_name)
                if alias:
                    self.pipeline_aliases[pipeline_name] = alias

        self.available_datasets = sorted(datasets_set)
        self.available_languages = sorted(languages_set)

    def calculate_cost_metric(self, pipeline_datasets: Dict) -> float:
        """
        Calculate a compute cost metric based on retrieval time across all datasets.

        Sums ``total_retrieval_time_milliseconds`` over every dataset that has
        ``aggregated_metrics`` and prices it at $0.01 per second (an arbitrary,
        adjustable cost model). The result is rounded to 4 decimal places.
        """
        total_time_s = 0

        for dataset_data in pipeline_datasets.values():
            if "aggregated_metrics" not in dataset_data:
                continue

            timing = dataset_data["aggregated_metrics"].get("timing", {})
            total_time_s += timing.get("total_retrieval_time_milliseconds", 0) / 1000.0

        # Simple cost model: assume $0.01 per second of compute (adjustable)
        return round(total_time_s * 0.01, 4)

    def extract_dataset_metrics(
        self, pipeline_datasets: Dict, metric: str = "ndcg_cut_5", language: str = "english"
    ) -> Dict[str, float]:
        """
        Extract metrics for individual datasets from the aggregated results.

        Args:
            pipeline_datasets: Dictionary mapping dataset names to their data
            metric: The metric to extract, either in UI form (e.g. 'ndcg_at_5')
                or directly as a results key (e.g. 'ndcg_cut_5')
            language: 'overall' to read the overall aggregate, otherwise a key
                of the per-dataset ``by_language`` section (e.g. 'english')

        Returns:
            Dictionary mapping display names (underscores replaced by spaces,
            title-cased) to metric values. Datasets without
            ``aggregated_metrics`` or without data for ``language`` are
            omitted; a metric missing from an otherwise present section
            is reported as 0.0.
        """
        # Map metric names from UI format to the keys used in the results files.
        # NOTE(review): the *_at_1 entries deliberately reuse the cut-off-5
        # values as an approximation, so the "@1" views show "@5" numbers.
        metric_mapping = {
            "ndcg_at_1": "ndcg_cut_5",  # Using cut_5 as approximation
            "ndcg_at_5": "ndcg_cut_5",
            "ndcg_at_10": "ndcg_cut_10",
            "ndcg_at_100": "ndcg_cut_100",
            "recall_at_1": "recall_5",
            "recall_at_5": "recall_5",
            "recall_at_10": "recall_10",
            "recall_at_100": "recall_100",
        }

        actual_metric = metric_mapping.get(metric, metric)
        dataset_metrics = {}

        for dataset_name, dataset_data in pipeline_datasets.items():
            if "aggregated_metrics" not in dataset_data:
                continue

            aggregated = dataset_data["aggregated_metrics"]

            # Get metrics for the specified language
            if language == "overall":
                metrics_data = aggregated.get("overall", {})
            else:
                metrics_data = aggregated.get("by_language", {}).get(language, {})

            if metrics_data:
                # Format dataset name for display
                display_name = dataset_name.replace("_", " ").title()
                dataset_metrics[display_name] = metrics_data.get(actual_metric, 0.0)

        return dataset_metrics

    def render_df(self, metric: str = "ndcg_at_5", language: str = "overall") -> pd.DataFrame:
        """
        Render a DataFrame with pipeline results.

        Args:
            metric: The metric to display (e.g., 'ndcg_at_5')
            language: The language to filter by ('overall' for all languages, or specific language)

        Returns:
            DataFrame indexed by pipeline display name (alias when known,
            folder name otherwise) with the two latency columns, "Average Score"
            directly after "Search latency (s/query)", one column per dataset,
            and a "_folder_name" column holding the GitHub folder name.
            An empty DataFrame is returned when no pipeline data is loaded.

            Latency values are per-dataset averages in seconds. They are
            ``None`` when the summed throughput is 0, 0 when no retrieval
            time was recorded, and -1 when no queries were recorded.
        """
        pipeline_res = {}

        for pipeline_name, pipeline_datasets in self.pipeline_infos.items():
            row_data = {}

            # Aggregate time metrics across all datasets
            total_time_ms = 0
            total_queries = 0
            indexing_time_ms = 0
            search_time_ms = 0
            num_datasets = 0

            for dataset_data in pipeline_datasets.values():
                if "aggregated_metrics" not in dataset_data:
                    continue
                timing = dataset_data["aggregated_metrics"].get("timing", {})
                total_time_ms += timing.get("total_retrieval_time_milliseconds", 0)
                total_queries += timing.get("num_queries", 0)
                indexing_time_ms += timing.get("indexing_throughput_ms_per_doc", 0)
                search_time_ms += timing.get("search_throughput_ms_per_query", 0)
                num_datasets += 1

            if total_queries <= 0:
                # Sentinel: no queries were recorded for this pipeline.
                indexing_latency = -1
                search_latency = -1
            elif total_time_ms <= 0:
                indexing_latency = 0
                search_latency = 0
            else:
                # total_queries > 0 implies num_datasets >= 1, so the division is safe.
                indexing_latency = (indexing_time_ms / 1000) / num_datasets if indexing_time_ms > 0 else None
                search_latency = (search_time_ms / 1000) / num_datasets if search_time_ms > 0 else None

            row_data["Indexing latency (s/doc)"] = indexing_latency
            row_data["Search latency (s/query)"] = search_latency

            # Add dataset metrics
            dataset_metrics = self.extract_dataset_metrics(pipeline_datasets, metric, language)
            row_data.update(dataset_metrics)

            # Average across the datasets that reported this metric
            if dataset_metrics:
                row_data["Average Score"] = round(sum(dataset_metrics.values()) / len(dataset_metrics), 4)

            # Use pipeline_alias for display if available, otherwise fallback to folder name
            display_name = self.pipeline_aliases.get(pipeline_name, pipeline_name)
            # Store folder name for link generation (will be used in utils.py)
            row_data["_folder_name"] = pipeline_name
            pipeline_res[display_name] = row_data

        if not pipeline_res:
            return pd.DataFrame()

        df = pd.DataFrame(pipeline_res).T
        # Reorder columns to have Average right after timing metrics
        cols = list(df.columns)
        if "Average Score" in cols:
            cols.remove("Average Score")
            # Insert Average after Search latency (s/query)
            insert_pos = cols.index("Search latency (s/query)") + 1 if "Search latency (s/query)" in cols else 2
            cols.insert(insert_pos, "Average Score")
            df = df[cols]
        return df