Spaces:
Running
Running
Pipeline leaderboard (#17)
Browse files- pipeline leaderboard (0538054cb956e3c5f1624236f9d1b74d337f2449)
- app.py +385 -38
- app/utils.py +98 -36
- data/pipeline_handler.py +274 -0
app.py
CHANGED
|
@@ -1,9 +1,19 @@
|
|
| 1 |
-
import
|
| 2 |
|
| 3 |
-
|
| 4 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
from data.deprecated_model_handler import DeprecatedModelHandler
|
| 6 |
from data.model_handler import ModelHandler
|
|
|
|
| 7 |
|
| 8 |
METRICS = [
|
| 9 |
"ndcg_at_1",
|
|
@@ -22,7 +32,6 @@ def main():
|
|
| 22 |
model_handler = ModelHandler()
|
| 23 |
initial_metric = "ndcg_at_5"
|
| 24 |
|
| 25 |
-
model_handler.get_vidore_data(initial_metric)
|
| 26 |
data_benchmark_1 = model_handler.render_df(initial_metric, benchmark_version=1)
|
| 27 |
data_benchmark_1 = add_rank_and_format(data_benchmark_1, benchmark_version=1)
|
| 28 |
|
|
@@ -37,9 +46,19 @@ def main():
|
|
| 37 |
num_scores_2 = len(data_benchmark_2) * num_datasets_2
|
| 38 |
num_models_2 = len(data_benchmark_2)
|
| 39 |
|
| 40 |
-
# Get deprecated results
|
| 41 |
deprecated_model_handler = DeprecatedModelHandler()
|
| 42 |
initial_metric = "ndcg_at_5"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
|
| 44 |
deprecated_model_handler.get_vidore_data(initial_metric)
|
| 45 |
deprecated_data_benchmark_1 = deprecated_model_handler.render_df(initial_metric, benchmark_version=1)
|
|
@@ -83,11 +102,35 @@ def main():
|
|
| 83 |
border-left: 4px solid #2196f3;
|
| 84 |
padding: 5px 15px;
|
| 85 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 86 |
"""
|
| 87 |
|
| 88 |
with gr.Blocks(css=css) as block:
|
| 89 |
-
with gr.Tabs():
|
| 90 |
-
with gr.TabItem("ViDoRe V3"):
|
| 91 |
gr.Markdown("# ViDoRe V3: a comprehensive evaluation of retrieval for enterprise use-case 👷♂️")
|
| 92 |
|
| 93 |
gr.Markdown(
|
|
@@ -104,14 +147,14 @@ def main():
|
|
| 104 |
gr.Markdown("""
|
| 105 |
As the reference results are now hosted on the [MTEB Leaderboard](https://mteb-leaderboard.hf.space/?benchmark_name=ViDoRe(v3)),
|
| 106 |
we embed it here.
|
| 107 |
-
"""
|
| 108 |
|
| 109 |
gr.Markdown(
|
| 110 |
-
|
| 111 |
- Under *Customize this Benchmark*, unselect the French datasets (*Vidore3EnergyRetrieval*, *Vidore3FinanceFrRetrieval*, *Vidore3PhysicsRetrieval*),
|
| 112 |
- Go to the *Performance per language* tab (you might have to click on the three dots on the right of the tab bar to see it),
|
| 113 |
- The *eng-Latn* column will show English-only results (= English queries on English documents).""",
|
| 114 |
-
elem_classes="alert-info"
|
| 115 |
)
|
| 116 |
|
| 117 |
gr.HTML(
|
|
@@ -122,7 +165,242 @@ def main():
|
|
| 122 |
></iframe>
|
| 123 |
"""
|
| 124 |
)
|
| 125 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 126 |
gr.Markdown("# ViDoRe V2: A new visual Document Retrieval Benchmark 📚🔍")
|
| 127 |
gr.Markdown("### A harder dataset benchmark for visual document retrieval 👀")
|
| 128 |
|
|
@@ -156,7 +434,9 @@ def main():
|
|
| 156 |
data = filter_models(data, search_term)
|
| 157 |
# data = remove_duplicates(data) # Add this line
|
| 158 |
if selected_columns:
|
| 159 |
-
data = data[
|
|
|
|
|
|
|
| 160 |
return data
|
| 161 |
|
| 162 |
with gr.Row():
|
|
@@ -225,7 +505,7 @@ def main():
|
|
| 225 |
```
|
| 226 |
"""
|
| 227 |
)
|
| 228 |
-
with gr.TabItem("ViDoRe V1"):
|
| 229 |
gr.Markdown("# ViDoRe: The Visual Document Retrieval Benchmark 1 📚🔍")
|
| 230 |
gr.Markdown("### From the paper - ColPali: Efficient Document Retrieval with Vision Language Models 👀")
|
| 231 |
|
|
@@ -258,7 +538,9 @@ def main():
|
|
| 258 |
data = add_rank_and_format(data, benchmark_version=1, selected_columns=selected_columns)
|
| 259 |
data = filter_models(data, search_term)
|
| 260 |
if selected_columns:
|
| 261 |
-
data = data[
|
|
|
|
|
|
|
| 262 |
return data
|
| 263 |
|
| 264 |
with gr.Row():
|
|
@@ -319,11 +601,12 @@ def main():
|
|
| 319 |
```
|
| 320 |
"""
|
| 321 |
)
|
| 322 |
-
with gr.TabItem("📚 Submit your model"):
|
| 323 |
gr.Markdown("# How to Submit a New Model to the Leaderboard")
|
| 324 |
gr.Markdown(
|
| 325 |
"""
|
| 326 |
-
|
|
|
|
| 327 |
|
| 328 |
1. **Evaluate your model**:
|
| 329 |
- Follow the evaluation procedure provided in the [ViDoRe GitHub repository](https://github.com/illuin-tech/vidore-benchmark/) that uses MTEB.
|
|
@@ -336,9 +619,20 @@ def main():
|
|
| 336 |
|
| 337 |
Note: For proper hyperlink redirection, please ensure that your model repository name is in
|
| 338 |
kebab-case, e.g. `my-model-name`.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 339 |
"""
|
| 340 |
)
|
| 341 |
-
with gr.TabItem("Deprecated ViDoRe V1"):
|
| 342 |
gr.Markdown(
|
| 343 |
"## <span style='color:red'>Deprecation notice: This leaderboard contains the results computed with the "
|
| 344 |
"[vidore-benchmark](https://github.com/illuin-tech/vidore-benchmark) package, "
|
|
@@ -346,8 +640,12 @@ def main():
|
|
| 346 |
"[mteb](https://github.com/embeddings-benchmark/mteb) package as described "
|
| 347 |
"[here](https://github.com/illuin-tech/vidore-benchmark/blob/main/README.md).</span>"
|
| 348 |
)
|
| 349 |
-
gr.Markdown(
|
| 350 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 351 |
gr.Markdown("### From the paper - ColPali: Efficient Document Retrieval with Vision Language Models 👀")
|
| 352 |
|
| 353 |
gr.Markdown(
|
|
@@ -360,18 +658,24 @@ def main():
|
|
| 360 |
deprecated_datasets_columns_1 = list(deprecated_data_benchmark_1.columns[3:])
|
| 361 |
|
| 362 |
with gr.Row():
|
| 363 |
-
deprecated_metric_dropdown_1 = gr.Dropdown(
|
|
|
|
|
|
|
| 364 |
deprecated_research_textbox_1 = gr.Textbox(
|
| 365 |
placeholder="🔍 Search Models... [press enter]",
|
| 366 |
label="Filter Models by Name",
|
| 367 |
)
|
| 368 |
deprecated_column_checkboxes_1 = gr.CheckboxGroup(
|
| 369 |
-
choices=deprecated_datasets_columns_1,
|
|
|
|
|
|
|
| 370 |
)
|
| 371 |
|
| 372 |
with gr.Row():
|
| 373 |
deprecated_datatype_1 = ["number", "markdown"] + ["number"] * (deprecated_num_datasets_1 + 1)
|
| 374 |
-
deprecated_dataframe_1 = gr.Dataframe(
|
|
|
|
|
|
|
| 375 |
|
| 376 |
def deprecated_update_data_1(metric, search_term, selected_columns):
|
| 377 |
deprecated_model_handler.get_vidore_data(metric)
|
|
@@ -380,7 +684,7 @@ def main():
|
|
| 380 |
data = filter_models(data, search_term)
|
| 381 |
# data = remove_duplicates(data) # Add this line
|
| 382 |
if selected_columns:
|
| 383 |
-
data = data[["Rank", "Model", "Average"] + selected_columns]
|
| 384 |
return data
|
| 385 |
|
| 386 |
with gr.Row():
|
|
@@ -399,13 +703,25 @@ def main():
|
|
| 399 |
outputs=deprecated_dataframe_1,
|
| 400 |
)
|
| 401 |
deprecated_research_textbox_1.submit(
|
| 402 |
-
lambda metric, search_term, selected_columns: deprecated_update_data_1(
|
| 403 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 404 |
outputs=deprecated_dataframe_1,
|
| 405 |
)
|
| 406 |
deprecated_column_checkboxes_1.change(
|
| 407 |
-
lambda metric, search_term, selected_columns: deprecated_update_data_1(
|
| 408 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 409 |
outputs=deprecated_dataframe_1,
|
| 410 |
)
|
| 411 |
|
|
@@ -441,7 +757,7 @@ def main():
|
|
| 441 |
```
|
| 442 |
"""
|
| 443 |
)
|
| 444 |
-
with gr.TabItem("Deprecated ViDoRe V2"):
|
| 445 |
gr.Markdown(
|
| 446 |
"## <span style='color:red'>Deprecation notice: This leaderboard contains the results computed with the "
|
| 447 |
"[vidore-benchmark](https://github.com/illuin-tech/vidore-benchmark) package, "
|
|
@@ -449,8 +765,12 @@ def main():
|
|
| 449 |
"[mteb](https://github.com/embeddings-benchmark/mteb) package as described "
|
| 450 |
"[here](https://github.com/illuin-tech/vidore-benchmark/blob/main/README.md).</span>"
|
| 451 |
)
|
| 452 |
-
gr.Markdown(
|
| 453 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 454 |
gr.Markdown("### A harder dataset benchmark for visual document retrieval 👀")
|
| 455 |
|
| 456 |
gr.Markdown(
|
|
@@ -463,18 +783,24 @@ def main():
|
|
| 463 |
deprecated_datasets_columns_2 = list(deprecated_data_benchmark_2.columns[3:])
|
| 464 |
|
| 465 |
with gr.Row():
|
| 466 |
-
deprecated_metric_dropdown_2 = gr.Dropdown(
|
|
|
|
|
|
|
| 467 |
deprecated_research_textbox_2 = gr.Textbox(
|
| 468 |
placeholder="🔍 Search Models... [press enter]",
|
| 469 |
label="Filter Models by Name",
|
| 470 |
)
|
| 471 |
deprecated_column_checkboxes_2 = gr.CheckboxGroup(
|
| 472 |
-
choices=deprecated_datasets_columns_2,
|
|
|
|
|
|
|
| 473 |
)
|
| 474 |
|
| 475 |
with gr.Row():
|
| 476 |
deprecated_datatype_2 = ["number", "markdown"] + ["number"] * (deprecated_num_datasets_2 + 1)
|
| 477 |
-
deprecated_dataframe_2 = gr.Dataframe(
|
|
|
|
|
|
|
| 478 |
|
| 479 |
def deprecated_update_data_2(metric, search_term, selected_columns):
|
| 480 |
deprecated_model_handler.get_vidore_data(metric)
|
|
@@ -483,7 +809,7 @@ def main():
|
|
| 483 |
data = filter_models(data, search_term)
|
| 484 |
# data = remove_duplicates(data) # Add this line
|
| 485 |
if selected_columns:
|
| 486 |
-
data = data[["Rank", "Model", "Average"] + selected_columns]
|
| 487 |
return data
|
| 488 |
|
| 489 |
with gr.Row():
|
|
@@ -510,13 +836,25 @@ def main():
|
|
| 510 |
outputs=deprecated_dataframe_2,
|
| 511 |
)
|
| 512 |
deprecated_research_textbox_2.submit(
|
| 513 |
-
lambda metric, search_term, selected_columns: deprecated_update_data_2(
|
| 514 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 515 |
outputs=deprecated_dataframe_2,
|
| 516 |
)
|
| 517 |
deprecated_column_checkboxes_2.change(
|
| 518 |
-
lambda metric, search_term, selected_columns: deprecated_update_data_2(
|
| 519 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 520 |
outputs=deprecated_dataframe_2,
|
| 521 |
)
|
| 522 |
|
|
@@ -553,6 +891,15 @@ def main():
|
|
| 553 |
"""
|
| 554 |
)
|
| 555 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 556 |
block.queue(max_size=10).launch(debug=True)
|
| 557 |
|
| 558 |
|
|
|
|
| 1 |
+
import re
|
| 2 |
|
| 3 |
+
import gradio as gr
|
| 4 |
+
import pandas as pd
|
| 5 |
+
import plotly.express as px
|
| 6 |
+
|
| 7 |
+
from app.utils import (
|
| 8 |
+
add_rank_and_format,
|
| 9 |
+
deprecated_get_refresh_function,
|
| 10 |
+
filter_models,
|
| 11 |
+
get_pipeline_refresh_function,
|
| 12 |
+
get_refresh_function,
|
| 13 |
+
)
|
| 14 |
from data.deprecated_model_handler import DeprecatedModelHandler
|
| 15 |
from data.model_handler import ModelHandler
|
| 16 |
+
from data.pipeline_handler import PipelineHandler
|
| 17 |
|
| 18 |
METRICS = [
|
| 19 |
"ndcg_at_1",
|
|
|
|
| 32 |
model_handler = ModelHandler()
|
| 33 |
initial_metric = "ndcg_at_5"
|
| 34 |
|
|
|
|
| 35 |
data_benchmark_1 = model_handler.render_df(initial_metric, benchmark_version=1)
|
| 36 |
data_benchmark_1 = add_rank_and_format(data_benchmark_1, benchmark_version=1)
|
| 37 |
|
|
|
|
| 46 |
num_scores_2 = len(data_benchmark_2) * num_datasets_2
|
| 47 |
num_models_2 = len(data_benchmark_2)
|
| 48 |
|
|
|
|
| 49 |
deprecated_model_handler = DeprecatedModelHandler()
|
| 50 |
initial_metric = "ndcg_at_5"
|
| 51 |
+
initial_metric_v3 = "ndcg_at_10"
|
| 52 |
+
|
| 53 |
+
# Get pipeline evaluation results
|
| 54 |
+
pipeline_handler = PipelineHandler()
|
| 55 |
+
pipeline_handler.get_pipeline_data()
|
| 56 |
+
data_pipeline = pipeline_handler.render_df(initial_metric_v3, "english")
|
| 57 |
+
data_pipeline = add_rank_and_format(data_pipeline, benchmark_version=3, is_pipeline=True)
|
| 58 |
+
|
| 59 |
+
num_datasets_pipeline = len(data_pipeline.columns) - 5 # Excluding Rank, Pipeline, Indexing latency, Search latency, Average Score
|
| 60 |
+
num_scores_pipeline = len(data_pipeline) * num_datasets_pipeline
|
| 61 |
+
num_pipelines = len(data_pipeline)
|
| 62 |
|
| 63 |
deprecated_model_handler.get_vidore_data(initial_metric)
|
| 64 |
deprecated_data_benchmark_1 = deprecated_model_handler.render_df(initial_metric, benchmark_version=1)
|
|
|
|
| 102 |
border-left: 4px solid #2196f3;
|
| 103 |
padding: 5px 15px;
|
| 104 |
}
|
| 105 |
+
|
| 106 |
+
/* 1. Force text wrapping on all headers from the 3rd column onwards */
|
| 107 |
+
#pipeline-table table th:nth-child(n+3) * {
|
| 108 |
+
white-space: normal !important;
|
| 109 |
+
overflow: visible !important;
|
| 110 |
+
text-overflow: clip !important;
|
| 111 |
+
line-height: 1.2 !important;
|
| 112 |
+
word-break: normal !important; /* Prevents breaking in the middle of words */
|
| 113 |
+
overflow-wrap: normal !important; /* Prevents breaking in the middle of words */
|
| 114 |
+
}
|
| 115 |
+
|
| 116 |
+
/* 2. Set a fixed width and center alignment for headers and data cells from the 3rd column onwards */
|
| 117 |
+
#pipeline-table table th:nth-child(n+3),
|
| 118 |
+
#pipeline-table table td:nth-child(n+3) {
|
| 119 |
+
min-width: 100px !important; /* Just wide enough to fit long words like "Pharmaceuticals" */
|
| 120 |
+
max-width: 120px !important;
|
| 121 |
+
}
|
| 122 |
+
|
| 123 |
+
/* 3. Make the Model column (2nd column) wider for pipeline table */
|
| 124 |
+
#pipeline-table table th:nth-child(2),
|
| 125 |
+
#pipeline-table table td:nth-child(2) {
|
| 126 |
+
min-width: 400px !important;
|
| 127 |
+
max-width: 500px !important;
|
| 128 |
+
}
|
| 129 |
"""
|
| 130 |
|
| 131 |
with gr.Blocks(css=css) as block:
|
| 132 |
+
with gr.Tabs() as tabs:
|
| 133 |
+
with gr.TabItem("ViDoRe V3", id="vidore-v3"):
|
| 134 |
gr.Markdown("# ViDoRe V3: a comprehensive evaluation of retrieval for enterprise use-case 👷♂️")
|
| 135 |
|
| 136 |
gr.Markdown(
|
|
|
|
| 147 |
gr.Markdown("""
|
| 148 |
As the reference results are now hosted on the [MTEB Leaderboard](https://mteb-leaderboard.hf.space/?benchmark_name=ViDoRe(v3)),
|
| 149 |
we embed it here.
|
| 150 |
+
""")
|
| 151 |
|
| 152 |
gr.Markdown(
|
| 153 |
+
"""**💡 To display English-only results:**
|
| 154 |
- Under *Customize this Benchmark*, unselect the French datasets (*Vidore3EnergyRetrieval*, *Vidore3FinanceFrRetrieval*, *Vidore3PhysicsRetrieval*),
|
| 155 |
- Go to the *Performance per language* tab (you might have to click on the three dots on the right of the tab bar to see it),
|
| 156 |
- The *eng-Latn* column will show English-only results (= English queries on English documents).""",
|
| 157 |
+
elem_classes="alert-info",
|
| 158 |
)
|
| 159 |
|
| 160 |
gr.HTML(
|
|
|
|
| 165 |
></iframe>
|
| 166 |
"""
|
| 167 |
)
|
| 168 |
+
|
| 169 |
+
with gr.TabItem("ViDoRe V3 (Pipeline)", id="vidore-v3-pipeline"):
|
| 170 |
+
gr.Markdown("# ViDoRe V3 (Pipeline Evaluation): Retrieval Performance for Complex Pipelines ⚙️")
|
| 171 |
+
gr.Markdown(
|
| 172 |
+
"### Assessing retrieval performance, latency, and compute costs of complex retrieval pipelines"
|
| 173 |
+
)
|
| 174 |
+
|
| 175 |
+
gr.Markdown(
|
| 176 |
+
"""
|
| 177 |
+
This leaderboard ranks full retrieval pipelines on **English-only queries** for **ViDoRe V3**. Instead of just testing standalone models, we evaluate real-world, multi-step retrieval systems. This includes everything from basic retrievers to advanced setups using AI agents, query reformulation, hybrid search, and any other creative retrieval pipeline one can imagine.
|
| 178 |
+
|
| 179 |
+
To show efficiency alongside accuracy, we include **Indexing latency** (seconds/doc) and **Search latency** (seconds/query). These numbers are self-reported and depend heavily on hardware, so treat them as rough estimates. Still, they give a helpful idea of how much computing power each pipeline needs.
|
| 180 |
+
|
| 181 |
+
⚠️ **Note:** Because this only uses English queries, these scores **cannot be directly compared** to the standard ViDoRe V3 results.
|
| 182 |
+
|
| 183 |
+
*Results are sourced from the [vidore-benchmark repository](https://github.com/illuin-tech/vidore-benchmark/tree/main/results).*
|
| 184 |
+
"""
|
| 185 |
+
)
|
| 186 |
+
|
| 187 |
+
if len(data_pipeline) > 0:
|
| 188 |
+
datasets_columns_pipeline = [
|
| 189 |
+
col
|
| 190 |
+
for col in data_pipeline.columns[4:]
|
| 191 |
+
if col not in ["Indexing latency (s/doc)", "Search latency (s/query)", "Average Score"]
|
| 192 |
+
]
|
| 193 |
+
|
| 194 |
+
with gr.Row():
|
| 195 |
+
metric_dropdown_pipeline = gr.Dropdown(
|
| 196 |
+
choices=METRICS, value=initial_metric_v3, label="Select Metric"
|
| 197 |
+
)
|
| 198 |
+
research_textbox_pipeline = gr.Textbox(
|
| 199 |
+
placeholder="🔍 Search Pipelines... [press enter]",
|
| 200 |
+
label="Filter Pipelines by Name",
|
| 201 |
+
)
|
| 202 |
+
|
| 203 |
+
with gr.Row():
|
| 204 |
+
if datasets_columns_pipeline:
|
| 205 |
+
column_checkboxes_pipeline = gr.CheckboxGroup(
|
| 206 |
+
choices=datasets_columns_pipeline,
|
| 207 |
+
value=datasets_columns_pipeline,
|
| 208 |
+
label="Select Datasets to Display",
|
| 209 |
+
)
|
| 210 |
+
else:
|
| 211 |
+
column_checkboxes_pipeline = gr.CheckboxGroup(
|
| 212 |
+
choices=[], value=[], label="Select Datasets to Display"
|
| 213 |
+
)
|
| 214 |
+
|
| 215 |
+
with gr.Row():
|
| 216 |
+
# Datatype: Rank, Pipeline, Indexing latency (s/doc), Search latency (s/query), Average + datasets
|
| 217 |
+
datatype_pipeline = ["number", "markdown", "number", "number", "number"] + ["number"] * len(
|
| 218 |
+
datasets_columns_pipeline
|
| 219 |
+
)
|
| 220 |
+
dataframe_pipeline = gr.Dataframe(
|
| 221 |
+
data_pipeline, datatype=datatype_pipeline, type="pandas", elem_id="pipeline-table"
|
| 222 |
+
)
|
| 223 |
+
|
| 224 |
+
def clean_pipeline_name(name):
|
| 225 |
+
if not isinstance(name, str):
|
| 226 |
+
return str(name)
|
| 227 |
+
# Remove Markdown links [text](url) -> text
|
| 228 |
+
name = re.sub(r"\[([^\]]+)\]\([^\)]+\)", r"\1", name)
|
| 229 |
+
# Remove HTML tags <a href="...">text</a> -> text
|
| 230 |
+
name = re.sub(r"<[^>]+>", "", name)
|
| 231 |
+
return name.strip()
|
| 232 |
+
|
| 233 |
+
def create_pipeline_plot(df, latency_col):
|
| 234 |
+
if df is None or len(df) == 0:
|
| 235 |
+
return None
|
| 236 |
+
|
| 237 |
+
# Ensure expected columns exist
|
| 238 |
+
if (
|
| 239 |
+
latency_col not in df.columns
|
| 240 |
+
or "Average Score" not in df.columns
|
| 241 |
+
or "Pipeline" not in df.columns
|
| 242 |
+
):
|
| 243 |
+
return None
|
| 244 |
+
|
| 245 |
+
# Clean the dataframe for plotting
|
| 246 |
+
plot_df = df.copy()
|
| 247 |
+
|
| 248 |
+
# Strip HTML and Markdown for clean hover text
|
| 249 |
+
plot_df["Cleaned Pipeline"] = plot_df["Pipeline"].apply(clean_pipeline_name)
|
| 250 |
+
|
| 251 |
+
plot_df[latency_col] = pd.to_numeric(plot_df[latency_col], errors="coerce")
|
| 252 |
+
plot_df["Average Score"] = pd.to_numeric(plot_df["Average Score"], errors="coerce")
|
| 253 |
+
|
| 254 |
+
plot_df = plot_df.dropna(subset=[latency_col, "Average Score"])
|
| 255 |
+
plot_df = plot_df[plot_df[latency_col] > 0]
|
| 256 |
+
plot_df = plot_df.sort_values(by=latency_col)
|
| 257 |
+
|
| 258 |
+
if len(plot_df) == 0:
|
| 259 |
+
return None
|
| 260 |
+
|
| 261 |
+
fig = px.scatter(
|
| 262 |
+
plot_df,
|
| 263 |
+
x=latency_col,
|
| 264 |
+
y="Average Score",
|
| 265 |
+
hover_name="Cleaned Pipeline", # Use the clean text!
|
| 266 |
+
title=f"Mean Performance vs {latency_col}",
|
| 267 |
+
color_discrete_sequence=["orange"],
|
| 268 |
+
)
|
| 269 |
+
|
| 270 |
+
fig.update_layout(
|
| 271 |
+
xaxis_title=latency_col,
|
| 272 |
+
yaxis_title="Average Score",
|
| 273 |
+
plot_bgcolor="white",
|
| 274 |
+
)
|
| 275 |
+
|
| 276 |
+
fig.update_xaxes(showgrid=True, gridcolor="lightgrey")
|
| 277 |
+
fig.update_yaxes(showgrid=True, gridcolor="lightgrey")
|
| 278 |
+
|
| 279 |
+
fig.update_traces(marker=dict(size=12, opacity=0.8, line=dict(width=1, color="DarkSlateGrey")))
|
| 280 |
+
return fig
|
| 281 |
+
|
| 282 |
+
with gr.Row():
|
| 283 |
+
latency_radio = gr.Radio(
|
| 284 |
+
choices=["Search latency (s/query)", "Indexing latency (s/doc)"],
|
| 285 |
+
value="Search latency (s/query)",
|
| 286 |
+
label="Select Latency Metric for X-Axis",
|
| 287 |
+
)
|
| 288 |
+
|
| 289 |
+
with gr.Row():
|
| 290 |
+
initial_fig = create_pipeline_plot(data_pipeline, "Search latency (s/query)")
|
| 291 |
+
performance_plot = gr.Plot(value=initial_fig)
|
| 292 |
+
|
| 293 |
+
def update_data_pipeline(metric, search_term, selected_columns):
|
| 294 |
+
pipeline_handler.get_pipeline_data()
|
| 295 |
+
data = pipeline_handler.render_df(metric, "english")
|
| 296 |
+
data = add_rank_and_format(
|
| 297 |
+
data, benchmark_version=3, selected_columns=selected_columns, is_pipeline=True
|
| 298 |
+
)
|
| 299 |
+
data = filter_models(data, search_term)
|
| 300 |
+
if selected_columns:
|
| 301 |
+
# Include core columns plus selected dataset columns
|
| 302 |
+
core_cols = ["Rank", "Pipeline", "Indexing latency (s/doc)", "Search latency (s/query)"]
|
| 303 |
+
if "Average Score" in data.columns:
|
| 304 |
+
core_cols.insert(4, "Average Score")
|
| 305 |
+
data = data[core_cols + selected_columns]
|
| 306 |
+
return data
|
| 307 |
+
|
| 308 |
+
with gr.Row():
|
| 309 |
+
refresh_button_pipeline = gr.Button("Refresh")
|
| 310 |
+
refresh_button_pipeline.click(
|
| 311 |
+
lambda metric: add_rank_and_format(
|
| 312 |
+
pipeline_handler.render_df(metric, "english"), benchmark_version=3, is_pipeline=True
|
| 313 |
+
),
|
| 314 |
+
inputs=[metric_dropdown_pipeline],
|
| 315 |
+
outputs=dataframe_pipeline,
|
| 316 |
+
concurrency_limit=20,
|
| 317 |
+
).then(
|
| 318 |
+
fn=create_pipeline_plot,
|
| 319 |
+
inputs=[dataframe_pipeline, latency_radio],
|
| 320 |
+
outputs=performance_plot,
|
| 321 |
+
)
|
| 322 |
+
|
| 323 |
+
with gr.Row():
|
| 324 |
+
gr.Markdown(
|
| 325 |
+
"""
|
| 326 |
+
**Note**: These results represent full pipeline evaluations on English queries ONLY (since other queries were mostly directly translated from their English counterparts).
|
| 327 |
+
We felt multi-lingual results were less critical (and much more costly to evaluate on the full set) for pipelines, since a user could just add a translation module to the pipeline and expect similar performance to the English results.
|
| 328 |
+
If you feel this is a mistake and multi-lingual results are critical for pipelines, please let us know by opening an issue in the GitHub repository!
|
| 329 |
+
"""
|
| 330 |
+
)
|
| 331 |
+
|
| 332 |
+
# Automatically refresh the dataframe when the dropdown value changes
|
| 333 |
+
def refresh_pipeline_data(metric):
|
| 334 |
+
"""Refresh pipeline data when metric changes."""
|
| 335 |
+
df = pipeline_handler.render_df(metric, "english")
|
| 336 |
+
return add_rank_and_format(df, benchmark_version=3, is_pipeline=True)
|
| 337 |
+
|
| 338 |
+
# Update dataframe and then update the plot
|
| 339 |
+
metric_dropdown_pipeline.change(
|
| 340 |
+
refresh_pipeline_data,
|
| 341 |
+
inputs=[metric_dropdown_pipeline],
|
| 342 |
+
outputs=dataframe_pipeline,
|
| 343 |
+
).then(
|
| 344 |
+
fn=create_pipeline_plot, inputs=[dataframe_pipeline, latency_radio], outputs=performance_plot
|
| 345 |
+
)
|
| 346 |
+
|
| 347 |
+
research_textbox_pipeline.submit(
|
| 348 |
+
lambda metric, search_term, selected_columns: update_data_pipeline(
|
| 349 |
+
metric, search_term, selected_columns
|
| 350 |
+
),
|
| 351 |
+
inputs=[metric_dropdown_pipeline, research_textbox_pipeline, column_checkboxes_pipeline],
|
| 352 |
+
outputs=dataframe_pipeline,
|
| 353 |
+
).then(
|
| 354 |
+
fn=create_pipeline_plot, inputs=[dataframe_pipeline, latency_radio], outputs=performance_plot
|
| 355 |
+
)
|
| 356 |
+
|
| 357 |
+
column_checkboxes_pipeline.change(
|
| 358 |
+
lambda metric, search_term, selected_columns: update_data_pipeline(
|
| 359 |
+
metric, search_term, selected_columns
|
| 360 |
+
),
|
| 361 |
+
inputs=[metric_dropdown_pipeline, research_textbox_pipeline, column_checkboxes_pipeline],
|
| 362 |
+
outputs=dataframe_pipeline,
|
| 363 |
+
).then(
|
| 364 |
+
fn=create_pipeline_plot, inputs=[dataframe_pipeline, latency_radio], outputs=performance_plot
|
| 365 |
+
)
|
| 366 |
+
|
| 367 |
+
# Update plot when the radio button changes
|
| 368 |
+
latency_radio.change(
|
| 369 |
+
fn=create_pipeline_plot, inputs=[dataframe_pipeline, latency_radio], outputs=performance_plot
|
| 370 |
+
)
|
| 371 |
+
|
| 372 |
+
gr.Markdown(
|
| 373 |
+
f"""
|
| 374 |
+
- **Total Datasets**: {num_datasets_pipeline}
|
| 375 |
+
- **Total Scores**: {num_scores_pipeline}
|
| 376 |
+
- **Total Pipelines**: {num_pipelines}
|
| 377 |
+
"""
|
| 378 |
+
+ r"""
|
| 379 |
+
Please consider citing:
|
| 380 |
+
|
| 381 |
+
```bibtex
|
| 382 |
+
@misc{faysse2024colpaliefficientdocumentretrieval,
|
| 383 |
+
title={ColPali: Efficient Document Retrieval with Vision Language Models},
|
| 384 |
+
author={Manuel Faysse and Hugues Sibille and Tony Wu and Bilel Omrani and Gautier Viaud and Céline Hudelot and Pierre Colombo},
|
| 385 |
+
year={2024},
|
| 386 |
+
eprint={2407.01449},
|
| 387 |
+
archivePrefix={arXiv},
|
| 388 |
+
primaryClass={cs.IR},
|
| 389 |
+
url={https://arxiv.org/abs/2407.01449},
|
| 390 |
+
}
|
| 391 |
+
@misc{loison2026vidore,
|
| 392 |
+
title={ViDoRe V3: A Comprehensive Evaluation of Retrieval Augmented Generation in Complex Real-World Scenarios},
|
| 393 |
+
author={Loison, Ant{\'o}nio and Mac{\'e}, Quentin and Edy, Antoine and Xing, Victor and Balough, Tom and Moreira, Gabriel and Liu, Bo and Faysse, Manuel and Hudelot, C{\'e}line and Viaud, Gautier},
|
| 394 |
+
journal={arXiv preprint arXiv:2601.08620},
|
| 395 |
+
year={2026}
|
| 396 |
+
}
|
| 397 |
+
```
|
| 398 |
+
"""
|
| 399 |
+
)
|
| 400 |
+
else:
|
| 401 |
+
gr.Markdown("**No pipeline evaluation results available yet. Check back later!**")
|
| 402 |
+
|
| 403 |
+
with gr.TabItem("ViDoRe V2", id="vidore-v2"):
|
| 404 |
gr.Markdown("# ViDoRe V2: A new visual Document Retrieval Benchmark 📚🔍")
|
| 405 |
gr.Markdown("### A harder dataset benchmark for visual document retrieval 👀")
|
| 406 |
|
|
|
|
| 434 |
data = filter_models(data, search_term)
|
| 435 |
# data = remove_duplicates(data) # Add this line
|
| 436 |
if selected_columns:
|
| 437 |
+
data = data[
|
| 438 |
+
["Rank", "Model", "Model Size (Million Parameters)", "Average Score"] + selected_columns
|
| 439 |
+
]
|
| 440 |
return data
|
| 441 |
|
| 442 |
with gr.Row():
|
|
|
|
| 505 |
```
|
| 506 |
"""
|
| 507 |
)
|
| 508 |
+
with gr.TabItem("ViDoRe V1", id="vidore-v1"):
|
| 509 |
gr.Markdown("# ViDoRe: The Visual Document Retrieval Benchmark 1 📚🔍")
|
| 510 |
gr.Markdown("### From the paper - ColPali: Efficient Document Retrieval with Vision Language Models 👀")
|
| 511 |
|
|
|
|
| 538 |
data = add_rank_and_format(data, benchmark_version=1, selected_columns=selected_columns)
|
| 539 |
data = filter_models(data, search_term)
|
| 540 |
if selected_columns:
|
| 541 |
+
data = data[
|
| 542 |
+
["Rank", "Model", "Model Size (Million Parameters)", "Average Score"] + selected_columns
|
| 543 |
+
]
|
| 544 |
return data
|
| 545 |
|
| 546 |
with gr.Row():
|
|
|
|
| 601 |
```
|
| 602 |
"""
|
| 603 |
)
|
| 604 |
+
with gr.TabItem("📚 Submit your model", id="submission"):
|
| 605 |
gr.Markdown("# How to Submit a New Model to the Leaderboard")
|
| 606 |
gr.Markdown(
|
| 607 |
"""
|
| 608 |
+
## Original (ViDoRe v1-v3) leaderboard:
|
| 609 |
+
To submit a new model to the original ViDoRe leaderboard, follow these steps:
|
| 610 |
|
| 611 |
1. **Evaluate your model**:
|
| 612 |
- Follow the evaluation procedure provided in the [ViDoRe GitHub repository](https://github.com/illuin-tech/vidore-benchmark/) that uses MTEB.
|
|
|
|
| 619 |
|
| 620 |
Note: For proper hyperlink redirection, please ensure that your model repository name is in
|
| 621 |
kebab-case, e.g. `my-model-name`.
|
| 622 |
+
|
| 623 |
+
|
| 624 |
+
## ViDoRe V3 Pipeline leaderboard:
|
| 625 |
+
To submit a new pipeline to the ViDoRe V3 pipeline leaderboard, follow these steps:
|
| 626 |
+
1. **Evaluate your pipeline**:
|
| 627 |
+
- Follow the evaluation procedure provided in the [ViDoRe GitHub repository](https://github.com/illuin-tech/vidore-benchmark/) for pipelines
|
| 628 |
+
2. **Open a PR on the ViDoRe GitHub repository including**:
|
| 629 |
+
- Your results, which are output directly in the correct format
|
| 630 |
+
- A short description of the pipeline and the main components used (some examples are available in the repository)
|
| 631 |
+
|
| 632 |
+
And you're done! Your pipeline will appear on the leaderboard after the PR is merged!
|
| 633 |
"""
|
| 634 |
)
|
| 635 |
+
with gr.TabItem("Deprecated ViDoRe V1", id="vidore-v1-deprecated"):
|
| 636 |
gr.Markdown(
|
| 637 |
"## <span style='color:red'>Deprecation notice: This leaderboard contains the results computed with the "
|
| 638 |
"[vidore-benchmark](https://github.com/illuin-tech/vidore-benchmark) package, "
|
|
|
|
| 640 |
"[mteb](https://github.com/embeddings-benchmark/mteb) package as described "
|
| 641 |
"[here](https://github.com/illuin-tech/vidore-benchmark/blob/main/README.md).</span>"
|
| 642 |
)
|
| 643 |
+
gr.Markdown(
|
| 644 |
+
"## <span style='color:red'>Missing results in the new leaderboard are being added as they are re-computed.</span>"
|
| 645 |
+
)
|
| 646 |
+
gr.Markdown(
|
| 647 |
+
"# <span style='color:red'>[Deprecated]</span> ViDoRe: The Visual Document Retrieval Benchmark 1 📚🔍"
|
| 648 |
+
)
|
| 649 |
gr.Markdown("### From the paper - ColPali: Efficient Document Retrieval with Vision Language Models 👀")
|
| 650 |
|
| 651 |
gr.Markdown(
|
|
|
|
| 658 |
deprecated_datasets_columns_1 = list(deprecated_data_benchmark_1.columns[3:])
|
| 659 |
|
| 660 |
with gr.Row():
|
| 661 |
+
deprecated_metric_dropdown_1 = gr.Dropdown(
|
| 662 |
+
choices=METRICS, value=initial_metric, label="Select Metric"
|
| 663 |
+
)
|
| 664 |
deprecated_research_textbox_1 = gr.Textbox(
|
| 665 |
placeholder="🔍 Search Models... [press enter]",
|
| 666 |
label="Filter Models by Name",
|
| 667 |
)
|
| 668 |
deprecated_column_checkboxes_1 = gr.CheckboxGroup(
|
| 669 |
+
choices=deprecated_datasets_columns_1,
|
| 670 |
+
value=deprecated_datasets_columns_1,
|
| 671 |
+
label="Select Columns to Display",
|
| 672 |
)
|
| 673 |
|
| 674 |
with gr.Row():
|
| 675 |
deprecated_datatype_1 = ["number", "markdown"] + ["number"] * (deprecated_num_datasets_1 + 1)
|
| 676 |
+
deprecated_dataframe_1 = gr.Dataframe(
|
| 677 |
+
deprecated_data_benchmark_1, datatype=deprecated_datatype_1, type="pandas"
|
| 678 |
+
)
|
| 679 |
|
| 680 |
def deprecated_update_data_1(metric, search_term, selected_columns):
|
| 681 |
deprecated_model_handler.get_vidore_data(metric)
|
|
|
|
| 684 |
data = filter_models(data, search_term)
|
| 685 |
# data = remove_duplicates(data) # Add this line
|
| 686 |
if selected_columns:
|
| 687 |
+
data = data[["Rank", "Model", "Average Score"] + selected_columns]
|
| 688 |
return data
|
| 689 |
|
| 690 |
with gr.Row():
|
|
|
|
| 703 |
outputs=deprecated_dataframe_1,
|
| 704 |
)
|
| 705 |
deprecated_research_textbox_1.submit(
|
| 706 |
+
lambda metric, search_term, selected_columns: deprecated_update_data_1(
|
| 707 |
+
metric, search_term, selected_columns
|
| 708 |
+
),
|
| 709 |
+
inputs=[
|
| 710 |
+
deprecated_metric_dropdown_1,
|
| 711 |
+
deprecated_research_textbox_1,
|
| 712 |
+
deprecated_column_checkboxes_1,
|
| 713 |
+
],
|
| 714 |
outputs=deprecated_dataframe_1,
|
| 715 |
)
|
| 716 |
deprecated_column_checkboxes_1.change(
|
| 717 |
+
lambda metric, search_term, selected_columns: deprecated_update_data_1(
|
| 718 |
+
metric, search_term, selected_columns
|
| 719 |
+
),
|
| 720 |
+
inputs=[
|
| 721 |
+
deprecated_metric_dropdown_1,
|
| 722 |
+
deprecated_research_textbox_1,
|
| 723 |
+
deprecated_column_checkboxes_1,
|
| 724 |
+
],
|
| 725 |
outputs=deprecated_dataframe_1,
|
| 726 |
)
|
| 727 |
|
|
|
|
| 757 |
```
|
| 758 |
"""
|
| 759 |
)
|
| 760 |
+
with gr.TabItem("Deprecated ViDoRe V2", id="vidore-v2-deprecated"):
|
| 761 |
gr.Markdown(
|
| 762 |
"## <span style='color:red'>Deprecation notice: This leaderboard contains the results computed with the "
|
| 763 |
"[vidore-benchmark](https://github.com/illuin-tech/vidore-benchmark) package, "
|
|
|
|
| 765 |
"[mteb](https://github.com/embeddings-benchmark/mteb) package as described "
|
| 766 |
"[here](https://github.com/illuin-tech/vidore-benchmark/blob/main/README.md).</span>"
|
| 767 |
)
|
| 768 |
+
gr.Markdown(
|
| 769 |
+
"## <span style='color:red'>Missing results in the new leaderboard are being added as they are re-computed.</span>"
|
| 770 |
+
)
|
| 771 |
+
gr.Markdown(
|
| 772 |
+
"# <span style='color:red'>[Deprecated]</span> ViDoRe V2: A new visual Document Retrieval Benchmark 📚🔍"
|
| 773 |
+
)
|
| 774 |
gr.Markdown("### A harder dataset benchmark for visual document retrieval 👀")
|
| 775 |
|
| 776 |
gr.Markdown(
|
|
|
|
| 783 |
deprecated_datasets_columns_2 = list(deprecated_data_benchmark_2.columns[3:])
|
| 784 |
|
| 785 |
with gr.Row():
|
| 786 |
+
deprecated_metric_dropdown_2 = gr.Dropdown(
|
| 787 |
+
choices=METRICS, value=initial_metric, label="Select Metric"
|
| 788 |
+
)
|
| 789 |
deprecated_research_textbox_2 = gr.Textbox(
|
| 790 |
placeholder="🔍 Search Models... [press enter]",
|
| 791 |
label="Filter Models by Name",
|
| 792 |
)
|
| 793 |
deprecated_column_checkboxes_2 = gr.CheckboxGroup(
|
| 794 |
+
choices=deprecated_datasets_columns_2,
|
| 795 |
+
value=deprecated_datasets_columns_2,
|
| 796 |
+
label="Select Columns to Display",
|
| 797 |
)
|
| 798 |
|
| 799 |
with gr.Row():
|
| 800 |
deprecated_datatype_2 = ["number", "markdown"] + ["number"] * (deprecated_num_datasets_2 + 1)
|
| 801 |
+
deprecated_dataframe_2 = gr.Dataframe(
|
| 802 |
+
deprecated_data_benchmark_2, datatype=deprecated_datatype_2, type="pandas"
|
| 803 |
+
)
|
| 804 |
|
| 805 |
def deprecated_update_data_2(metric, search_term, selected_columns):
|
| 806 |
deprecated_model_handler.get_vidore_data(metric)
|
|
|
|
| 809 |
data = filter_models(data, search_term)
|
| 810 |
# data = remove_duplicates(data) # Add this line
|
| 811 |
if selected_columns:
|
| 812 |
+
data = data[["Rank", "Model", "Average Score"] + selected_columns]
|
| 813 |
return data
|
| 814 |
|
| 815 |
with gr.Row():
|
|
|
|
| 836 |
outputs=deprecated_dataframe_2,
|
| 837 |
)
|
| 838 |
deprecated_research_textbox_2.submit(
|
| 839 |
+
lambda metric, search_term, selected_columns: deprecated_update_data_2(
|
| 840 |
+
metric, search_term, selected_columns
|
| 841 |
+
),
|
| 842 |
+
inputs=[
|
| 843 |
+
deprecated_metric_dropdown_2,
|
| 844 |
+
deprecated_research_textbox_2,
|
| 845 |
+
deprecated_column_checkboxes_2,
|
| 846 |
+
],
|
| 847 |
outputs=deprecated_dataframe_2,
|
| 848 |
)
|
| 849 |
deprecated_column_checkboxes_2.change(
|
| 850 |
+
lambda metric, search_term, selected_columns: deprecated_update_data_2(
|
| 851 |
+
metric, search_term, selected_columns
|
| 852 |
+
),
|
| 853 |
+
inputs=[
|
| 854 |
+
deprecated_metric_dropdown_2,
|
| 855 |
+
deprecated_research_textbox_2,
|
| 856 |
+
deprecated_column_checkboxes_2,
|
| 857 |
+
],
|
| 858 |
outputs=deprecated_dataframe_2,
|
| 859 |
)
|
| 860 |
|
|
|
|
| 891 |
"""
|
| 892 |
)
|
| 893 |
|
| 894 |
+
def select_tab_from_url(request: gr.Request):
|
| 895 |
+
# Grab query parameters from the URL
|
| 896 |
+
query_params = dict(request.query_params)
|
| 897 |
+
# Look for ?tab=..., default to the first tab's ID if not found
|
| 898 |
+
target_tab = query_params.get("tab", "vidore-v3")
|
| 899 |
+
# Update the tabs component to select the target ID
|
| 900 |
+
return gr.update(selected=target_tab)
|
| 901 |
+
|
| 902 |
+
block.load(select_tab_from_url, inputs=None, outputs=tabs)
|
| 903 |
block.queue(max_size=10).launch(debug=True)
|
| 904 |
|
| 905 |
|
app/utils.py
CHANGED
|
@@ -1,7 +1,20 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
desanitized_model_name = model_name.replace("__", "/")
|
| 6 |
desanitized_model_name = desanitized_model_name.replace("_", "/")
|
| 7 |
desanitized_model_name = desanitized_model_name.replace("-thisisapoint-", ".")
|
|
@@ -11,47 +24,81 @@ def make_clickable_model(model_name, link=None):
|
|
| 11 |
if "/ocr" in desanitized_model_name:
|
| 12 |
desanitized_model_name = desanitized_model_name.replace("/ocr", "")
|
| 13 |
|
| 14 |
-
link
|
|
|
|
| 15 |
|
| 16 |
return f'<a target="_blank" style="text-decoration: underline" href="{link}">{desanitized_model_name}</a>'
|
| 17 |
|
| 18 |
|
| 19 |
def add_rank(df, benchmark_version=1, selected_columns=None):
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
]
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
|
| 49 |
|
| 50 |
-
def add_rank_and_format(df, benchmark_version=1, selected_columns=None):
|
| 51 |
df = df.reset_index()
|
| 52 |
-
|
|
|
|
| 53 |
df = add_rank(df, benchmark_version, selected_columns)
|
| 54 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 55 |
# df = remove_duplicates(df)
|
| 56 |
return df
|
| 57 |
|
|
@@ -73,6 +120,7 @@ def get_refresh_function(model_handler, benchmark_version):
|
|
| 73 |
|
| 74 |
return _refresh
|
| 75 |
|
|
|
|
| 76 |
def deprecated_get_refresh_function(model_handler, benchmark_version):
|
| 77 |
def _refresh(metric):
|
| 78 |
model_handler.get_vidore_data(metric)
|
|
@@ -83,7 +131,21 @@ def deprecated_get_refresh_function(model_handler, benchmark_version):
|
|
| 83 |
return _refresh
|
| 84 |
|
| 85 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 86 |
def filter_models(data, search_term):
|
| 87 |
if search_term:
|
| 88 |
-
|
|
|
|
|
|
|
| 89 |
return data
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
import math
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
def make_clickable_model(model_name, link=None, is_pipeline=False, folder_name=None):
|
| 6 |
+
if is_pipeline:
|
| 7 |
+
# For pipelines: use folder_name for link, model_name (alias) for display
|
| 8 |
+
link_folder = folder_name if folder_name else model_name
|
| 9 |
+
# Process folder name for link: only handle __ and -thisisapoint-
|
| 10 |
+
desanitized_folder = link_folder.replace("__", "/")
|
| 11 |
+
desanitized_folder = desanitized_folder.replace("-thisisapoint-", ".")
|
| 12 |
+
if link is None:
|
| 13 |
+
link = f"https://github.com/illuin-tech/vidore-benchmark/tree/main/results/pipeline_descriptions/{desanitized_folder}/description.json"
|
| 14 |
+
# Use word-wrap styling for potentially long pipeline aliases
|
| 15 |
+
return f'<a target="_blank" style="text-decoration: underline; word-wrap: break-word; white-space: normal; display: inline-block; max-width: 450px;" href="{link}">{model_name}</a>'
|
| 16 |
+
else:
|
| 17 |
+
# For regular models: replace __ and _ with /, and -thisisapoint- with .
|
| 18 |
desanitized_model_name = model_name.replace("__", "/")
|
| 19 |
desanitized_model_name = desanitized_model_name.replace("_", "/")
|
| 20 |
desanitized_model_name = desanitized_model_name.replace("-thisisapoint-", ".")
|
|
|
|
| 24 |
if "/ocr" in desanitized_model_name:
|
| 25 |
desanitized_model_name = desanitized_model_name.replace("/ocr", "")
|
| 26 |
|
| 27 |
+
if link is None:
|
| 28 |
+
link = "https://huggingface.co/" + desanitized_model_name
|
| 29 |
|
| 30 |
return f'<a target="_blank" style="text-decoration: underline" href="{link}">{desanitized_model_name}</a>'
|
| 31 |
|
| 32 |
|
| 33 |
def _round_to_sig_figs(value, sig_figs=3):
    """Round ``value`` to ``sig_figs`` significant figures.

    Zero and non-finite values (NaN, +/-inf) are returned unchanged: their
    base-10 logarithm is undefined, so the digit-count computation below
    would raise instead of producing a number.
    """
    if value == 0 or not math.isfinite(value):
        return value
    return round(value, sig_figs - int(math.floor(math.log10(abs(value)))) - 1)


def add_rank(df, benchmark_version=1, selected_columns=None):
    """Sort a results table, add a ``Rank`` column and format its numbers.

    The frame is modified in place and also returned.

    Args:
        df: DataFrame holding a "Model" or "Pipeline" name column plus
            numeric result columns.
        benchmark_version: Accepted for interface compatibility; not read here.
        selected_columns: Columns to rank on. When ``None``, every column
            that is not a name/metadata column is used.

    Returns:
        The same DataFrame, sorted best-first, with "Rank" as first column,
        an "Average Score" column when several columns are ranked, score
        columns scaled by 100, and values rounded to 3 significant figures.
    """
    # Columns may arrive as 'object' dtype because of mixed data; coerce
    # everything except the name/link columns to numbers (bad values -> NaN).
    for col in df.columns:
        if col not in ["Model", "Pipeline", "_folder_name"]:
            df[col] = pd.to_numeric(df[col], errors="coerce")

    # Only fill NaN in numeric columns so string columns such as
    # _folder_name are left untouched.
    numeric_cols = df.select_dtypes(include=["float64", "int64", "float32", "int32"]).columns
    df[numeric_cols] = df[numeric_cols].fillna(0.0)

    if selected_columns is None:
        non_ranked = [
            "Model",
            "Pipeline",  # For pipeline tables
            "Model Size (Million Parameters)",
            "Memory Usage (GB, fp32)",
            "Embedding Dimensions",
            "Max Tokens",
            "Compute Cost ($)",
            "Queries per Second",
            "_folder_name",  # Hidden column for pipeline link generation
        ]
        cols_to_rank = [col for col in df.columns if col not in non_ranked]
    else:
        cols_to_rank = selected_columns

    if len(cols_to_rank) == 1:
        df.sort_values(cols_to_rank[0], ascending=False, inplace=True)
    else:
        # Only add the average column if the caller did not already provide it.
        if "Average Score" not in df.columns:
            df.insert(
                len(df.columns) - len(cols_to_rank),
                "Average Score",
                df[cols_to_rank].mean(axis=1, skipna=False),
            )
        df.sort_values("Average Score", ascending=False, inplace=True)
    df.insert(0, "Rank", list(range(1, len(df) + 1)))

    # Latencies are shown as-is; the remaining numeric columns (other than
    # the ones listed in `unscaled`) are multiplied by 100. Both kinds are
    # rounded to 3 significant figures. Zeros are common here because missing
    # values were filled with 0.0 above, so the rounding must tolerate them.
    latency_cols = ["Indexing latency (s/doc)", "Search latency (s/query)"]
    unscaled = [
        "Model Size (Million Parameters)",
        "Compute Cost ($)",
        "Queries per Second",
        "Rank",
    ]
    for col in df.columns:
        if not pd.api.types.is_numeric_dtype(df[col]):
            continue
        if col in latency_cols:
            df[col] = df[col].apply(_round_to_sig_figs)
        elif col not in unscaled:
            df[col] = df[col].apply(lambda x: _round_to_sig_figs(x * 100))
    return df
|
| 85 |
|
| 86 |
|
| 87 |
+
def add_rank_and_format(df, benchmark_version=1, selected_columns=None, is_pipeline=False):
    """Rank a results table and turn its name column into HTML links.

    The frame's index is moved into a "Pipeline" (when ``is_pipeline``) or
    "Model" column, the table is ranked via ``add_rank``, and every name is
    replaced by the anchor produced by ``make_clickable_model``. For pipeline
    tables carrying a ``_folder_name`` column, that column supplies the link
    target and is dropped afterwards.
    """
    name_col = "Pipeline" if is_pipeline else "Model"
    ranked = df.reset_index().rename(columns={"index": name_col})
    ranked = add_rank(ranked, benchmark_version, selected_columns)

    has_folder_column = "_folder_name" in ranked.columns
    if not (is_pipeline and has_folder_column):
        ranked[name_col] = ranked[name_col].apply(
            lambda name: make_clickable_model(name, is_pipeline=is_pipeline)
        )
        return ranked

    def _link_for_row(row):
        # The alias is displayed; the folder name is what the link points to.
        return make_clickable_model(row[name_col], is_pipeline=True, folder_name=row["_folder_name"])

    ranked[name_col] = ranked.apply(_link_for_row, axis=1)
    return ranked.drop(columns=["_folder_name"])
|
| 104 |
|
|
|
|
| 120 |
|
| 121 |
return _refresh
|
| 122 |
|
| 123 |
+
|
| 124 |
def deprecated_get_refresh_function(model_handler, benchmark_version):
|
| 125 |
def _refresh(metric):
|
| 126 |
model_handler.get_vidore_data(metric)
|
|
|
|
| 131 |
return _refresh
|
| 132 |
|
| 133 |
|
| 134 |
+
def get_pipeline_refresh_function(pipeline_handler):
    """Build the refresh callback for the pipeline leaderboard.

    The returned function takes a metric name, asks ``pipeline_handler`` to
    reload its data, renders the table for that metric and returns it ranked
    and formatted as a pipeline table.
    """

    def _refresh(metric):
        # Reload first so the rendered table reflects the latest results.
        pipeline_handler.get_pipeline_data()
        rendered = pipeline_handler.render_df(metric)
        return add_rank_and_format(rendered, benchmark_version=3, is_pipeline=True)

    return _refresh
|
| 144 |
+
|
| 145 |
+
|
| 146 |
def filter_models(data, search_term):
    """Keep only the rows whose name column contains ``search_term``.

    Matching is case-insensitive and is done on the "Pipeline" column when
    the table has one, otherwise on "Model". The term is treated as a regular
    expression when it is a valid one; text that is not a valid pattern
    (e.g. an unbalanced parenthesis typed into the search box) is matched
    literally instead of raising.

    Args:
        data: DataFrame with a "Pipeline" or "Model" column of strings.
        search_term: Text to look for; empty/None returns ``data`` unchanged.

    Returns:
        The filtered DataFrame (or ``data`` itself when there is no term).
    """
    import re  # local import: only needed to validate the user's pattern

    if not search_term:
        return data

    # Use "Pipeline" column for pipeline tables, "Model" for others
    col_name = "Pipeline" if "Pipeline" in data.columns else "Model"

    try:
        re.compile(search_term)
        use_regex = True
    except re.error:
        use_regex = False

    mask = data[col_name].str.contains(search_term, case=False, na=False, regex=use_regex)
    return data[mask]
|
data/pipeline_handler.py
ADDED
|
@@ -0,0 +1,274 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import requests
|
| 3 |
+
from typing import Dict, List, Optional
|
| 4 |
+
|
| 5 |
+
import pandas as pd
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class PipelineHandler:
    """Handler for ViDoRe v3 pipeline evaluation results from GitHub.

    Results are read from the ``results/metrics`` tree of the
    ``illuin-tech/vidore-benchmark`` repository (one folder per pipeline, one
    ``vidore_v3*.json`` file per dataset) and display aliases from
    ``results/pipeline_descriptions/<pipeline>/description.json``.
    """

    # Upper bound (seconds) for every HTTP call, so that an unresponsive
    # server cannot block a leaderboard refresh forever.
    REQUEST_TIMEOUT_SECONDS = 10

    # UI metric name -> key used in the result files. The "@1" variants have
    # no key of their own here and use the "@5" key as an approximation.
    _METRIC_KEYS = {
        "ndcg_at_1": "ndcg_cut_5",
        "ndcg_at_5": "ndcg_cut_5",
        "ndcg_at_10": "ndcg_cut_10",
        "ndcg_at_100": "ndcg_cut_100",
        "recall_at_1": "recall_5",
        "recall_at_5": "recall_5",
        "recall_at_10": "recall_10",
        "recall_at_100": "recall_100",
    }

    def __init__(self):
        self.pipeline_infos = {}  # folder_name -> {dataset_name -> result JSON}
        self.pipeline_aliases = {}  # Maps folder_name -> pipeline_alias for display
        self.github_base_url = "https://raw.githubusercontent.com/illuin-tech/vidore-benchmark/main/results/metrics"
        self.github_descriptions_base_url = (
            "https://raw.githubusercontent.com/illuin-tech/vidore-benchmark/main/results/pipeline_descriptions"
        )
        self.available_datasets = []
        self.available_languages = ["english"]  # Default languages available

        # Setup GitHub authentication if token is available
        self.github_token = os.environ.get("GITHUB_TOKEN")
        self.headers = {}
        if self.github_token:
            self.headers["Authorization"] = f"token {self.github_token}"
            print("GitHub token detected - using authenticated requests")

    def _fetch_json(self, url: str):
        """GET ``url`` with the handler's headers and return the decoded JSON.

        Raises on network errors, timeouts, HTTP error statuses and invalid
        JSON; callers decide how to report the failure.
        """
        response = requests.get(url, headers=self.headers, timeout=self.REQUEST_TIMEOUT_SECONDS)
        response.raise_for_status()
        return response.json()

    def get_pipeline_folders_from_github(self) -> List[str]:
        """Get the sorted list of pipeline folders from the GitHub API.

        Returns an empty list (after printing the error) when the request or
        the parsing of its response fails.
        """
        api_url = "https://api.github.com/repos/illuin-tech/vidore-benchmark/contents/results/metrics"

        try:
            contents = self._fetch_json(api_url)
            # Filter for directories only
            folders = [item["name"] for item in contents if item["type"] == "dir"]
            return sorted(folders)
        except Exception as e:
            print(f"Error fetching pipeline folders from GitHub: {e}")
            return []

    def get_dataset_files_from_github(self, pipeline_name: str) -> List[str]:
        """Get the sorted list of dataset JSON files for a pipeline from the GitHub API.

        Only files named ``vidore_v3*.json`` are kept. Returns an empty list
        (after printing the error) on failure.
        """
        api_url = f"https://api.github.com/repos/illuin-tech/vidore-benchmark/contents/results/metrics/{pipeline_name}"

        try:
            contents = self._fetch_json(api_url)
            # Filter for JSON files that start with 'vidore_v3'
            files = [
                item["name"]
                for item in contents
                if item["type"] == "file" and item["name"].startswith("vidore_v3") and item["name"].endswith(".json")
            ]
            return sorted(files)
        except Exception as e:
            print(f"Error fetching dataset files from {pipeline_name}: {e}")
            return []

    def fetch_json_from_github(self, pipeline_name: str, filename: str) -> Optional[Dict]:
        """Fetch one result JSON file from GitHub raw content.

        Returns ``None`` (after printing the error) on failure.
        """
        url = f"{self.github_base_url}/{pipeline_name}/{filename}"

        try:
            return self._fetch_json(url)
        except Exception as e:
            print(f"Error fetching {filename} from {pipeline_name}: {e}")
            return None

    def fetch_pipeline_alias(self, pipeline_name: str) -> Optional[str]:
        """Fetch the ``pipeline_alias`` from description.json for a pipeline.

        Uses raw.githubusercontent.com to avoid API rate limits. Returns
        ``None`` when the file cannot be fetched or has no alias.
        """
        url = f"{self.github_descriptions_base_url}/{pipeline_name}/description.json"

        try:
            description = self._fetch_json(url)
            return description.get("pipeline_alias")
        except Exception as e:
            print(f"Error fetching description for {pipeline_name}: {e}")
            return None

    def get_pipeline_data(self):
        """Fetch all pipeline data from GitHub.

        Fills ``pipeline_infos`` and ``pipeline_aliases`` for every pipeline
        with at least one readable result file, and recomputes
        ``available_datasets`` / ``available_languages`` from what was
        fetched in this call. Entries stored by earlier calls are kept.
        """
        pipeline_folders = self.get_pipeline_folders_from_github()
        datasets_set = set()
        languages_set = {"overall"}

        for pipeline_name in pipeline_folders:
            # Get all dataset files for this pipeline
            dataset_files = self.get_dataset_files_from_github(pipeline_name)
            if not dataset_files:
                continue

            pipeline_data = {}
            for filename in dataset_files:
                results = self.fetch_json_from_github(pipeline_name, filename)
                if not results:
                    continue

                # Extract dataset name from filename (remove vidore_v3_ prefix and .json suffix)
                dataset_name = filename.replace("vidore_v3_", "").replace(".json", "")
                datasets_set.add(dataset_name)
                pipeline_data[dataset_name] = results

                # Collect available languages
                if "aggregated_metrics" in results and "by_language" in results["aggregated_metrics"]:
                    languages_set.update(results["aggregated_metrics"]["by_language"].keys())

            if pipeline_data:
                self.pipeline_infos[pipeline_name] = pipeline_data
                # Fetch the pipeline alias for display (uses raw URL, not API)
                alias = self.fetch_pipeline_alias(pipeline_name)
                if alias:
                    self.pipeline_aliases[pipeline_name] = alias

        self.available_datasets = sorted(datasets_set)
        self.available_languages = sorted(languages_set)

    def calculate_cost_metric(self, pipeline_datasets: Dict) -> float:
        """Calculate a compute cost metric based on retrieval time across all datasets.

        Sums ``timing.total_retrieval_time_milliseconds`` over the datasets
        that have ``aggregated_metrics`` and prices it with a simple model.
        Returns cost in arbitrary units (could be refined based on actual
        compute costs), rounded to 4 decimals.
        """
        total_time_s = 0

        for dataset_data in pipeline_datasets.values():
            if "aggregated_metrics" not in dataset_data:
                continue

            timing = dataset_data["aggregated_metrics"].get("timing", {})
            total_time_s += timing.get("total_retrieval_time_milliseconds", 0) / 1000.0

        # Simple cost model: assume $0.01 per second of compute (adjustable)
        cost = total_time_s * 0.01

        return round(cost, 4)

    def extract_dataset_metrics(
        self, pipeline_datasets: Dict, metric: str = "ndcg_cut_5", language: str = "english"
    ) -> Dict[str, float]:
        """Extract one metric per dataset from the aggregated results.

        Args:
            pipeline_datasets: Dictionary mapping dataset names to their data
            metric: The metric to extract, either in UI form (e.g.
                'ndcg_at_5') or already as a result-file key (e.g.
                'ndcg_cut_5'); unknown names are used as-is.
            language: 'overall' reads ``aggregated_metrics['overall']``; any
                other value reads ``aggregated_metrics['by_language'][language]``.

        Returns:
            Dictionary mapping display names (underscores replaced by spaces,
            title-cased) to metric values. Datasets without data for the
            language are omitted; a missing metric key yields 0.0.
        """
        actual_metric = self._METRIC_KEYS.get(metric, metric)
        dataset_metrics = {}

        for dataset_name, dataset_data in pipeline_datasets.items():
            if "aggregated_metrics" not in dataset_data:
                continue

            aggregated = dataset_data["aggregated_metrics"]

            # Get metrics for the specified language
            if language == "overall":
                metrics_data = aggregated.get("overall", {})
            else:
                metrics_data = aggregated.get("by_language", {}).get(language, {})

            if metrics_data:
                # Format dataset name for display
                display_name = dataset_name.replace("_", " ").title()
                dataset_metrics[display_name] = metrics_data.get(actual_metric, 0.0)

        return dataset_metrics

    def render_df(self, metric: str = "ndcg_at_5", language: str = "overall") -> pd.DataFrame:
        """Render a DataFrame with pipeline results.

        Args:
            metric: The metric to display (e.g., 'ndcg_at_5')
            language: The language to filter by ('overall' for all languages, or specific language)

        Returns:
            DataFrame indexed by pipeline display name with the two latency
            columns, "Average Score", one column per dataset and the hidden
            "_folder_name" column; an empty DataFrame when no pipeline is
            loaded. Latencies are 0 when no retrieval time was recorded and
            -1 when no queries were recorded.
        """
        pipeline_res = {}

        for pipeline_name, pipeline_datasets in self.pipeline_infos.items():
            row_data = {}

            # Aggregate time metrics across all datasets
            total_time_ms = 0
            total_queries = 0
            indexing_time_ms = 0
            search_time_ms = 0
            num_datasets = 0

            for dataset_data in pipeline_datasets.values():
                if "aggregated_metrics" not in dataset_data:
                    continue
                timing = dataset_data["aggregated_metrics"].get("timing", {})
                total_time_ms += timing.get("total_retrieval_time_milliseconds", 0)
                total_queries += timing.get("num_queries", 0)
                indexing_time_ms += timing.get("indexing_throughput_ms_per_doc", 0)
                search_time_ms += timing.get("search_throughput_ms_per_query", 0)
                num_datasets += 1

            if total_queries > 0:
                if total_time_ms > 0:
                    # Per-dataset mean, converted from milliseconds to seconds.
                    row_data["Indexing latency (s/doc)"] = (
                        (indexing_time_ms / 1000) / num_datasets if indexing_time_ms > 0 else None
                    )
                    row_data["Search latency (s/query)"] = (
                        (search_time_ms / 1000) / num_datasets if search_time_ms > 0 else None
                    )
                else:
                    row_data["Indexing latency (s/doc)"] = 0
                    row_data["Search latency (s/query)"] = 0
            else:
                row_data["Indexing latency (s/doc)"] = -1
                row_data["Search latency (s/query)"] = -1

            # Add dataset metrics
            dataset_metrics = self.extract_dataset_metrics(pipeline_datasets, metric, language)
            row_data.update(dataset_metrics)

            # Calculate average across datasets if there are multiple
            if dataset_metrics:
                row_data["Average Score"] = round(sum(dataset_metrics.values()) / len(dataset_metrics), 4)

            # Use pipeline_alias for display if available, otherwise fallback to folder name
            display_name = self.pipeline_aliases.get(pipeline_name, pipeline_name)
            # Rows are keyed by display name; if two pipelines share an alias,
            # disambiguate with the (unique) folder name instead of silently
            # overwriting the earlier pipeline's row.
            if display_name in pipeline_res:
                display_name = f"{display_name} ({pipeline_name})"
            # Store folder name for link generation (will be used in utils.py)
            row_data["_folder_name"] = pipeline_name
            pipeline_res[display_name] = row_data

        if not pipeline_res:
            return pd.DataFrame()

        df = pd.DataFrame(pipeline_res).T
        # Reorder columns to have Average right after timing metrics
        cols = list(df.columns)
        if "Average Score" in cols:
            cols.remove("Average Score")
            # Insert Average after Search latency (s/query)
            insert_pos = cols.index("Search latency (s/query)") + 1 if "Search latency (s/query)" in cols else 2
            cols.insert(insert_pos, "Average Score")
            df = df[cols]
        return df
|