Spaces:

vidore
/

vidore-leaderboard

Running

App Files Files Community

QuentinJG commited on Feb 27

Commit

342cf1f

1 Parent(s): 06e780e

Pipeline leaderboard (#17)

Browse files

- pipeline leaderboard (0538054cb956e3c5f1624236f9d1b74d337f2449)

Files changed (3) hide show

app.py +385 -38
app/utils.py +98 -36
data/pipeline_handler.py +274 -0

app.py CHANGED Viewed

@@ -1,9 +1,19 @@
-import gradio as gr
-from app.utils import (add_rank_and_format, deprecated_get_refresh_function,
-                       filter_models, get_refresh_function)
 from data.deprecated_model_handler import DeprecatedModelHandler
 from data.model_handler import ModelHandler
 METRICS = [
     "ndcg_at_1",
@@ -22,7 +32,6 @@ def main():
     model_handler = ModelHandler()
     initial_metric = "ndcg_at_5"
-    model_handler.get_vidore_data(initial_metric)
     data_benchmark_1 = model_handler.render_df(initial_metric, benchmark_version=1)
     data_benchmark_1 = add_rank_and_format(data_benchmark_1, benchmark_version=1)
@@ -37,9 +46,19 @@ def main():
     num_scores_2 = len(data_benchmark_2) * num_datasets_2
     num_models_2 = len(data_benchmark_2)
-    # Get deprecated results
     deprecated_model_handler = DeprecatedModelHandler()
     initial_metric = "ndcg_at_5"
     deprecated_model_handler.get_vidore_data(initial_metric)
     deprecated_data_benchmark_1 = deprecated_model_handler.render_df(initial_metric, benchmark_version=1)
@@ -83,11 +102,35 @@ def main():
         border-left: 4px solid #2196f3;
         padding: 5px 15px;
     }
     """
     with gr.Blocks(css=css) as block:
-        with gr.Tabs():
-            with gr.TabItem("ViDoRe V3"):
                 gr.Markdown("# ViDoRe V3: a comprehensive evaluation of retrieval for enterprise use-case 👷‍♂️")
                 gr.Markdown(
@@ -104,14 +147,14 @@ def main():
                 gr.Markdown("""
                 As the reference results are now hosted on the [MTEB Leaderboard](https://mteb-leaderboard.hf.space/?benchmark_name=ViDoRe(v3)),
                 we embed it here.
-                            """ )
                 gr.Markdown(
-                """**💡 To display English-only results:**
                     - Under *Customize this Benchmark*, unselect the French datasets (*Vidore3EnergyRetrieval*, *Vidore3FinanceFrRetrieval*, *Vidore3PhysicsRetrieval*),
                     - Go to the *Performance per language* tab (you might have to click on the three dots on the right of the tab bar to see it),
                     - The *eng-Latn* column will show English-only results (= English queries on English documents).""",
-                    elem_classes="alert-info"
                 )
                 gr.HTML(
@@ -122,7 +165,242 @@ def main():
                 ></iframe>
                 """
                 )
-            with gr.TabItem("ViDoRe V2"):
                 gr.Markdown("# ViDoRe V2: A new visual Document Retrieval Benchmark 📚🔍")
                 gr.Markdown("### A harder dataset benchmark for visual document retrieval 👀")
@@ -156,7 +434,9 @@ def main():
                     data = filter_models(data, search_term)
                     # data = remove_duplicates(data)  # Add this line
                     if selected_columns:
-                        data = data[["Rank", "Model", "Model Size (Million Parameters)", "Average"] + selected_columns]
                     return data
                 with gr.Row():
@@ -225,7 +505,7 @@ def main():
                 ```
                 """
                 )
-            with gr.TabItem("ViDoRe V1"):
                 gr.Markdown("# ViDoRe: The Visual Document Retrieval Benchmark 1 📚🔍")
                 gr.Markdown("### From the paper - ColPali: Efficient Document Retrieval with Vision Language Models 👀")
@@ -258,7 +538,9 @@ def main():
                     data = add_rank_and_format(data, benchmark_version=1, selected_columns=selected_columns)
                     data = filter_models(data, search_term)
                     if selected_columns:
-                        data = data[["Rank", "Model", "Model Size (Million Parameters)", "Average"] + selected_columns]
                     return data
                 with gr.Row():
@@ -319,11 +601,12 @@ def main():
                 ```
                 """
                 )
-            with gr.TabItem("📚 Submit your model"):
                 gr.Markdown("# How to Submit a New Model to the Leaderboard")
                 gr.Markdown(
                     """
-                    To submit a new model to the ViDoRe leaderboard, follow these steps:
                     1. **Evaluate your model**:
                        - Follow the evaluation procedure provided in the [ViDoRe GitHub repository](https://github.com/illuin-tech/vidore-benchmark/) that uses MTEB.
@@ -336,9 +619,20 @@ def main():
                     Note: For proper hyperlink redirection, please ensure that your model repository name is in
                     kebab-case, e.g. `my-model-name`.
                     """
                 )
-            with gr.TabItem("Deprecated ViDoRe V1"):
                 gr.Markdown(
                     "## <span style='color:red'>Deprecation notice: This leaderboard contains the results computed with the "
                     "[vidore-benchmark](https://github.com/illuin-tech/vidore-benchmark) package, "
@@ -346,8 +640,12 @@ def main():
                     "[mteb](https://github.com/embeddings-benchmark/mteb) package as described "
                     "[here](https://github.com/illuin-tech/vidore-benchmark/blob/main/README.md).</span>"
                 )
-                gr.Markdown("## <span style='color:red'>Missing results in the new leaderboard are being added as they are re-computed.</span>")
-                gr.Markdown("# <span style='color:red'>[Deprecated]</span> ViDoRe: The Visual Document Retrieval Benchmark 1 📚🔍")
                 gr.Markdown("### From the paper - ColPali: Efficient Document Retrieval with Vision Language Models 👀")
                 gr.Markdown(
@@ -360,18 +658,24 @@ def main():
                 deprecated_datasets_columns_1 = list(deprecated_data_benchmark_1.columns[3:])
                 with gr.Row():
-                    deprecated_metric_dropdown_1 = gr.Dropdown(choices=METRICS, value=initial_metric, label="Select Metric")
                     deprecated_research_textbox_1 = gr.Textbox(
                         placeholder="🔍 Search Models... [press enter]",
                         label="Filter Models by Name",
                     )
                     deprecated_column_checkboxes_1 = gr.CheckboxGroup(
-                        choices=deprecated_datasets_columns_1, value=deprecated_datasets_columns_1, label="Select Columns to Display"
                     )
                 with gr.Row():
                     deprecated_datatype_1 = ["number", "markdown"] + ["number"] * (deprecated_num_datasets_1 + 1)
-                    deprecated_dataframe_1 = gr.Dataframe(deprecated_data_benchmark_1, datatype=deprecated_datatype_1, type="pandas")
                 def deprecated_update_data_1(metric, search_term, selected_columns):
                     deprecated_model_handler.get_vidore_data(metric)
@@ -380,7 +684,7 @@ def main():
                     data = filter_models(data, search_term)
                     # data = remove_duplicates(data)  # Add this line
                     if selected_columns:
-                        data = data[["Rank", "Model", "Average"] + selected_columns]
                     return data
                 with gr.Row():
@@ -399,13 +703,25 @@ def main():
                     outputs=deprecated_dataframe_1,
                 )
                 deprecated_research_textbox_1.submit(
-                    lambda metric, search_term, selected_columns: deprecated_update_data_1(metric, search_term, selected_columns),
-                    inputs=[deprecated_metric_dropdown_1, deprecated_research_textbox_1, deprecated_column_checkboxes_1],
                     outputs=deprecated_dataframe_1,
                 )
                 deprecated_column_checkboxes_1.change(
-                    lambda metric, search_term, selected_columns: deprecated_update_data_1(metric, search_term, selected_columns),
-                    inputs=[deprecated_metric_dropdown_1, deprecated_research_textbox_1, deprecated_column_checkboxes_1],
                     outputs=deprecated_dataframe_1,
                 )
@@ -441,7 +757,7 @@ def main():
                 ```
                 """
                 )
-            with gr.TabItem("Deprecated ViDoRe V2"):
                 gr.Markdown(
                     "## <span style='color:red'>Deprecation notice: This leaderboard contains the results computed with the "
                     "[vidore-benchmark](https://github.com/illuin-tech/vidore-benchmark) package, "
@@ -449,8 +765,12 @@ def main():
                     "[mteb](https://github.com/embeddings-benchmark/mteb) package as described "
                     "[here](https://github.com/illuin-tech/vidore-benchmark/blob/main/README.md).</span>"
                 )
-                gr.Markdown("## <span style='color:red'>Missing results in the new leaderboard are being added as they are re-computed.</span>")
-                gr.Markdown("# <span style='color:red'>[Deprecated]</span> ViDoRe V2: A new visual Document Retrieval Benchmark 📚🔍")
                 gr.Markdown("### A harder dataset benchmark for visual document retrieval 👀")
                 gr.Markdown(
@@ -463,18 +783,24 @@ def main():
                 deprecated_datasets_columns_2 = list(deprecated_data_benchmark_2.columns[3:])
                 with gr.Row():
-                    deprecated_metric_dropdown_2 = gr.Dropdown(choices=METRICS, value=initial_metric, label="Select Metric")
                     deprecated_research_textbox_2 = gr.Textbox(
                         placeholder="🔍 Search Models... [press enter]",
                         label="Filter Models by Name",
                     )
                     deprecated_column_checkboxes_2 = gr.CheckboxGroup(
-                        choices=deprecated_datasets_columns_2, value=deprecated_datasets_columns_2, label="Select Columns to Display"
                     )
                 with gr.Row():
                     deprecated_datatype_2 = ["number", "markdown"] + ["number"] * (deprecated_num_datasets_2 + 1)
-                    deprecated_dataframe_2 = gr.Dataframe(deprecated_data_benchmark_2, datatype=deprecated_datatype_2, type="pandas")
                 def deprecated_update_data_2(metric, search_term, selected_columns):
                     deprecated_model_handler.get_vidore_data(metric)
@@ -483,7 +809,7 @@ def main():
                     data = filter_models(data, search_term)
                     # data = remove_duplicates(data)  # Add this line
                     if selected_columns:
-                        data = data[["Rank", "Model", "Average"] + selected_columns]
                     return data
                 with gr.Row():
@@ -510,13 +836,25 @@ def main():
                     outputs=deprecated_dataframe_2,
                 )
                 deprecated_research_textbox_2.submit(
-                    lambda metric, search_term, selected_columns: deprecated_update_data_2(metric, search_term, selected_columns),
-                    inputs=[deprecated_metric_dropdown_2, deprecated_research_textbox_2, deprecated_column_checkboxes_2],
                     outputs=deprecated_dataframe_2,
                 )
                 deprecated_column_checkboxes_2.change(
-                    lambda metric, search_term, selected_columns: deprecated_update_data_2(metric, search_term, selected_columns),
-                    inputs=[deprecated_metric_dropdown_2, deprecated_research_textbox_2, deprecated_column_checkboxes_2],
                     outputs=deprecated_dataframe_2,
                 )
@@ -553,6 +891,15 @@ def main():
                 """
                 )
     block.queue(max_size=10).launch(debug=True)

+import re
+import gradio as gr
+import pandas as pd
+import plotly.express as px
+from app.utils import (
+    add_rank_and_format,
+    deprecated_get_refresh_function,
+    filter_models,
+    get_pipeline_refresh_function,
+    get_refresh_function,
+)
 from data.deprecated_model_handler import DeprecatedModelHandler
 from data.model_handler import ModelHandler
+from data.pipeline_handler import PipelineHandler
 METRICS = [
     "ndcg_at_1",
     model_handler = ModelHandler()
     initial_metric = "ndcg_at_5"
     data_benchmark_1 = model_handler.render_df(initial_metric, benchmark_version=1)
     data_benchmark_1 = add_rank_and_format(data_benchmark_1, benchmark_version=1)
     num_scores_2 = len(data_benchmark_2) * num_datasets_2
     num_models_2 = len(data_benchmark_2)
     deprecated_model_handler = DeprecatedModelHandler()
     initial_metric = "ndcg_at_5"
+    initial_metric_v3 = "ndcg_at_10"
+    # Get pipeline evaluation results
+    pipeline_handler = PipelineHandler()
+    pipeline_handler.get_pipeline_data()
+    data_pipeline = pipeline_handler.render_df(initial_metric_v3, "english")
+    data_pipeline = add_rank_and_format(data_pipeline, benchmark_version=3, is_pipeline=True)
+    num_datasets_pipeline = len(data_pipeline.columns) - 5  # Excluding Rank, Model, Indexing time, search time, Average
+    num_scores_pipeline = len(data_pipeline) * num_datasets_pipeline
+    num_pipelines = len(data_pipeline)
     deprecated_model_handler.get_vidore_data(initial_metric)
     deprecated_data_benchmark_1 = deprecated_model_handler.render_df(initial_metric, benchmark_version=1)
         border-left: 4px solid #2196f3;
         padding: 5px 15px;
     }
+    /* 1. Force text wrapping on all headers from the 3rd column onwards */
+    #pipeline-table table th:nth-child(n+3) * {
+        white-space: normal !important;
+        overflow: visible !important;
+        text-overflow: clip !important;
+        line-height: 1.2 !important;
+        word-break: normal !important; /* Prevents breaking in the middle of words */
+        overflow-wrap: normal !important; /* Prevents breaking in the middle of words */
+    }
+    /* 2. Set a fixed width and center alignment for headers and data cells from the 3rd column onwards */
+    #pipeline-table table th:nth-child(n+3),
+    #pipeline-table table td:nth-child(n+3) {
+        min-width: 100px !important; /* Just wide enough to fit long words like "Pharmaceuticals" */
+        max-width: 120px !important;
+    }
+    /* 3. Make the Model column (2nd column) wider for pipeline table */
+    #pipeline-table table th:nth-child(2),
+    #pipeline-table table td:nth-child(2) {
+        min-width: 400px !important;
+        max-width: 500px !important;
+    }
     """
     with gr.Blocks(css=css) as block:
+        with gr.Tabs() as tabs:
+            with gr.TabItem("ViDoRe V3", id="vidore-v3"):
                 gr.Markdown("# ViDoRe V3: a comprehensive evaluation of retrieval for enterprise use-case 👷‍♂️")
                 gr.Markdown(
                 gr.Markdown("""
                 As the reference results are now hosted on the [MTEB Leaderboard](https://mteb-leaderboard.hf.space/?benchmark_name=ViDoRe(v3)),
                 we embed it here.
+                            """)
                 gr.Markdown(
+                    """**💡 To display English-only results:**
                     - Under *Customize this Benchmark*, unselect the French datasets (*Vidore3EnergyRetrieval*, *Vidore3FinanceFrRetrieval*, *Vidore3PhysicsRetrieval*),
                     - Go to the *Performance per language* tab (you might have to click on the three dots on the right of the tab bar to see it),
                     - The *eng-Latn* column will show English-only results (= English queries on English documents).""",
+                    elem_classes="alert-info",
                 )
                 gr.HTML(
                 ></iframe>
                 """
                 )
+            with gr.TabItem("ViDoRe V3 (Pipeline)", id="vidore-v3-pipeline"):
+                gr.Markdown("# ViDoRe V3 (Pipeline Evaluation): Retrieval Performance for Complex Pipelines ⚙️")
+                gr.Markdown(
+                    "### Assessing retrieval performance, latency, and compute costs of complex retrieval pipelines"
+                )
+                gr.Markdown(
+                    """
+                    This leaderboard ranks full retrieval pipelines on **English-only queries** for **ViDoRe V3**. Instead of just testing standalone models, we evaluate real-world, multi-step retrieval systems. This includes everything from basic retrievers to advanced setups using AI agents, query reformulation, hybrid search, and any other creative retrieval pipeline one can imagine.
+                    To show efficiency alongside accuracy, we include **Indexing latency** (seconds/doc) and **Search latency** (seconds/query). These numbers are self-reported and depend heavily on hardware, so treat them as rough estimates. Still, they give a helpful idea of how much computing power each pipeline needs.
+                    ⚠️ **Note:** Because this only uses English queries, these scores **cannot be directly compared** to the standard ViDoRe V3 results.
+                    *Results are sourced from the [vidore-benchmark repository](https://github.com/illuin-tech/vidore-benchmark/tree/main/results).*
+                    """
+                )
+                if len(data_pipeline) > 0:
+                    datasets_columns_pipeline = [
+                        col
+                        for col in data_pipeline.columns[4:]
+                        if col not in ["Indexing latency (s/doc)", "Search latency (s/query)", "Average Score"]
+                    ]
+                    with gr.Row():
+                        metric_dropdown_pipeline = gr.Dropdown(
+                            choices=METRICS, value=initial_metric_v3, label="Select Metric"
+                        )
+                        research_textbox_pipeline = gr.Textbox(
+                            placeholder="🔍 Search Pipelines... [press enter]",
+                            label="Filter Pipelines by Name",
+                        )
+                    with gr.Row():
+                        if datasets_columns_pipeline:
+                            column_checkboxes_pipeline = gr.CheckboxGroup(
+                                choices=datasets_columns_pipeline,
+                                value=datasets_columns_pipeline,
+                                label="Select Datasets to Display",
+                            )
+                        else:
+                            column_checkboxes_pipeline = gr.CheckboxGroup(
+                                choices=[], value=[], label="Select Datasets to Display"
+                            )
+                    with gr.Row():
+                        # Datatype: Rank, Pipeline, Indexing latency (s/doc), Search latency (s/query), Average + datasets
+                        datatype_pipeline = ["number", "markdown", "number", "number", "number"] + ["number"] * len(
+                            datasets_columns_pipeline
+                        )
+                        dataframe_pipeline = gr.Dataframe(
+                            data_pipeline, datatype=datatype_pipeline, type="pandas", elem_id="pipeline-table"
+                        )
+                    def clean_pipeline_name(name):  # Return a plain-text version of a pipeline name (link/tag markup removed).
+                        if not isinstance(name, str):
+                            return str(name)  # non-string values are only stringified; no markup removal or stripping
+                        # Replace Markdown links with their visible text: [text](url) -> text
+                        name = re.sub(r"\[([^\]]+)\]\([^\)]+\)", r"\1", name)
+                        # Drop anything that looks like an HTML tag: <a href="...">text</a> -> text
+                        name = re.sub(r"<[^>]+>", "", name)
+                        return name.strip()  # trim surrounding whitespace left after markup removal
+                    def create_pipeline_plot(df, latency_col):  # Build a scatter of "Average Score" vs. `latency_col`; returns None when nothing can be plotted.
+                        if df is None or len(df) == 0:
+                            return None  # no rows: nothing to plot
+                        # Bail out unless the latency, score and name columns are all present
+                        if (
+                            latency_col not in df.columns
+                            or "Average Score" not in df.columns
+                            or "Pipeline" not in df.columns
+                        ):
+                            return None
+                        # Work on a copy so the dataframe passed in is not modified
+                        plot_df = df.copy()
+                        # Plain-text names (Markdown/HTML removed by clean_pipeline_name) for the hover label
+                        plot_df["Cleaned Pipeline"] = plot_df["Pipeline"].apply(clean_pipeline_name)
+                        plot_df[latency_col] = pd.to_numeric(plot_df[latency_col], errors="coerce")  # unparseable values become NaN
+                        plot_df["Average Score"] = pd.to_numeric(plot_df["Average Score"], errors="coerce")  # unparseable values become NaN
+                        plot_df = plot_df.dropna(subset=[latency_col, "Average Score"])  # drop rows missing either axis value
+                        plot_df = plot_df[plot_df[latency_col] > 0]  # keep strictly positive latencies; presumably 0 means "not reported" - TODO confirm
+                        plot_df = plot_df.sort_values(by=latency_col)
+                        if len(plot_df) == 0:
+                            return None  # every row was filtered out
+                        fig = px.scatter(
+                            plot_df,
+                            x=latency_col,
+                            y="Average Score",
+                            hover_name="Cleaned Pipeline",  # Use the clean text!
+                            title=f"Mean Performance vs {latency_col}",
+                            color_discrete_sequence=["orange"],
+                        )
+                        fig.update_layout(
+                            xaxis_title=latency_col,
+                            yaxis_title="Average Score",
+                            plot_bgcolor="white",
+                        )
+                        fig.update_xaxes(showgrid=True, gridcolor="lightgrey")
+                        fig.update_yaxes(showgrid=True, gridcolor="lightgrey")
+                        fig.update_traces(marker=dict(size=12, opacity=0.8, line=dict(width=1, color="DarkSlateGrey")))
+                        return fig
+                    with gr.Row():
+                        latency_radio = gr.Radio(
+                            choices=["Search latency (s/query)", "Indexing latency (s/doc)"],
+                            value="Search latency (s/query)",
+                            label="Select Latency Metric for X-Axis",
+                        )
+                    with gr.Row():
+                        initial_fig = create_pipeline_plot(data_pipeline, "Search latency (s/query)")
+                        performance_plot = gr.Plot(value=initial_fig)
+                    def update_data_pipeline(metric, search_term, selected_columns):  # Rebuild the pipeline table for a metric, name filter and dataset-column selection.
+                        pipeline_handler.get_pipeline_data()  # presumably reloads the underlying results - confirm against PipelineHandler
+                        data = pipeline_handler.render_df(metric, "english")
+                        data = add_rank_and_format(
+                            data, benchmark_version=3, selected_columns=selected_columns, is_pipeline=True
+                        )
+                        data = filter_models(data, search_term)
+                        if selected_columns:
+                            # Keep the fixed leading columns, then only the dataset columns the user selected
+                            core_cols = ["Rank", "Pipeline", "Indexing latency (s/doc)", "Search latency (s/query)"]
+                            if "Average Score" in data.columns:
+                                core_cols.insert(4, "Average Score")  # index 4 == len(core_cols), so this appends after the latency columns
+                            data = data[core_cols + selected_columns]
+                        return data  # with an empty selection the frame is returned with all of its columns
+                    with gr.Row():
+                        refresh_button_pipeline = gr.Button("Refresh")
+                        refresh_button_pipeline.click(
+                            lambda metric: add_rank_and_format(
+                                pipeline_handler.render_df(metric, "english"), benchmark_version=3, is_pipeline=True
+                            ),
+                            inputs=[metric_dropdown_pipeline],
+                            outputs=dataframe_pipeline,
+                            concurrency_limit=20,
+                        ).then(
+                            fn=create_pipeline_plot,
+                            inputs=[dataframe_pipeline, latency_radio],
+                            outputs=performance_plot,
+                        )
+                    with gr.Row():
+                        gr.Markdown(
+                            """
+                        **Note**: These results represent full pipeline evaluations on English queries ONLY (since other queries were mostly directly translated from their English counterparts).
+                        We felt multi-lingual results were less critical (and much more costly to evaluate on the full set) for pipelines, since a user could just add a translation module to the pipeline and expect similar performance to the English results.
+                        If you feel this is a mistake and multi-lingual results are critical for pipelines, please let us know by opening an issue in the GitHub repository!
+                        """
+                        )
+                    # Automatically refresh the dataframe when the dropdown value changes
+                    def refresh_pipeline_data(metric):
+                        """Refresh pipeline data when metric changes."""
+                        df = pipeline_handler.render_df(metric, "english")  # NOTE(review): get_pipeline_data() is not called here, unlike update_data_pipeline - confirm this is intended
+                        return add_rank_and_format(df, benchmark_version=3, is_pipeline=True)  # no search-term or column filtering is applied
+                    # Update dataframe and then update the plot
+                    metric_dropdown_pipeline.change(
+                        refresh_pipeline_data,
+                        inputs=[metric_dropdown_pipeline],
+                        outputs=dataframe_pipeline,
+                    ).then(
+                        fn=create_pipeline_plot, inputs=[dataframe_pipeline, latency_radio], outputs=performance_plot
+                    )
+                    research_textbox_pipeline.submit(
+                        lambda metric, search_term, selected_columns: update_data_pipeline(
+                            metric, search_term, selected_columns
+                        ),
+                        inputs=[metric_dropdown_pipeline, research_textbox_pipeline, column_checkboxes_pipeline],
+                        outputs=dataframe_pipeline,
+                    ).then(
+                        fn=create_pipeline_plot, inputs=[dataframe_pipeline, latency_radio], outputs=performance_plot
+                    )
+                    column_checkboxes_pipeline.change(
+                        lambda metric, search_term, selected_columns: update_data_pipeline(
+                            metric, search_term, selected_columns
+                        ),
+                        inputs=[metric_dropdown_pipeline, research_textbox_pipeline, column_checkboxes_pipeline],
+                        outputs=dataframe_pipeline,
+                    ).then(
+                        fn=create_pipeline_plot, inputs=[dataframe_pipeline, latency_radio], outputs=performance_plot
+                    )
+                    # Update plot when the radio button changes
+                    latency_radio.change(
+                        fn=create_pipeline_plot, inputs=[dataframe_pipeline, latency_radio], outputs=performance_plot
+                    )
+                    gr.Markdown(
+                        f"""
+                    - **Total Datasets**: {num_datasets_pipeline}
+                    - **Total Scores**: {num_scores_pipeline}
+                    - **Total Pipelines**: {num_pipelines}
+                    """
+                        + r"""
+                    Please consider citing:
+                    ```bibtex
+                    @misc{faysse2024colpaliefficientdocumentretrieval,
+                      title={ColPali: Efficient Document Retrieval with Vision Language Models},
+                      author={Manuel Faysse and Hugues Sibille and Tony Wu and Bilel Omrani and Gautier Viaud and Céline Hudelot and Pierre Colombo},
+                      year={2024},
+                      eprint={2407.01449},
+                      archivePrefix={arXiv},
+                      primaryClass={cs.IR},
+                      url={https://arxiv.org/abs/2407.01449},
+                    }
+                    @misc{loison2026vidore,
+                      title={ViDoRe V3: A Comprehensive Evaluation of Retrieval Augmented Generation in Complex Real-World Scenarios},
+                      author={Loison, Ant{\'o}nio and Mac{\'e}, Quentin and Edy, Antoine and Xing, Victor and Balough, Tom and Moreira, Gabriel and Liu, Bo and Faysse, Manuel and Hudelot, C{\'e}line and Viaud, Gautier},
+                      journal={arXiv preprint arXiv:2601.08620},
+                      year={2026}
+                    }
+                    ```
+                    """
+                    )
+                else:
+                    gr.Markdown("**No pipeline evaluation results available yet. Check back later!**")
+            with gr.TabItem("ViDoRe V2", id="vidore-v2"):
                 gr.Markdown("# ViDoRe V2: A new visual Document Retrieval Benchmark 📚🔍")
                 gr.Markdown("### A harder dataset benchmark for visual document retrieval 👀")
                     data = filter_models(data, search_term)
                     # data = remove_duplicates(data)  # Add this line
                     if selected_columns:
+                        data = data[
+                            ["Rank", "Model", "Model Size (Million Parameters)", "Average Score"] + selected_columns
+                        ]
                     return data
                 with gr.Row():
                 ```
                 """
                 )
+            with gr.TabItem("ViDoRe V1", id="vidore-v1"):
                 gr.Markdown("# ViDoRe: The Visual Document Retrieval Benchmark 1 📚🔍")
                 gr.Markdown("### From the paper - ColPali: Efficient Document Retrieval with Vision Language Models 👀")
                     data = add_rank_and_format(data, benchmark_version=1, selected_columns=selected_columns)
                     data = filter_models(data, search_term)
                     if selected_columns:
+                        data = data[
+                            ["Rank", "Model", "Model Size (Million Parameters)", "Average Score"] + selected_columns
+                        ]
                     return data
                 with gr.Row():
                 ```
                 """
                 )
+            with gr.TabItem("📚 Submit your model", id="submission"):
                 gr.Markdown("# How to Submit a New Model to the Leaderboard")
                 gr.Markdown(
                     """
+                    ## Original (ViDoRe v1-v3) leaderboard:
+                    To submit a new model to the original ViDoRe leaderboard, follow these steps:
                     1. **Evaluate your model**:
                        - Follow the evaluation procedure provided in the [ViDoRe GitHub repository](https://github.com/illuin-tech/vidore-benchmark/) that uses MTEB.
                     Note: For proper hyperlink redirection, please ensure that your model repository name is in
                     kebab-case, e.g. `my-model-name`.
+                    ## ViDoRe v3 Pipeline leaderboard:
+                    To submit a new pipeline to the ViDoRe V3 pipeline leaderboard, follow these steps:
+                    1. **Evaluate your pipeline**:
+                       - Follow the evaluation procedure provided in the [ViDoRe GitHub repository](https://github.com/illuin-tech/vidore-benchmark/) for pipelines
+                    2. **Open a PR on the ViDoRe GitHub repository including**:
+                        - Your results, which are directly output in the correct format
+                        - A short description of the pipeline and the main components used (some examples are available in the repository)
+                    And you're done! Your pipeline will appear on the leaderboard after the PR is merged!
                     """
                 )
+            with gr.TabItem("Deprecated ViDoRe V1", id="vidore-v1-deprecated"):
                 gr.Markdown(
                     "## <span style='color:red'>Deprecation notice: This leaderboard contains the results computed with the "
                     "[vidore-benchmark](https://github.com/illuin-tech/vidore-benchmark) package, "
                     "[mteb](https://github.com/embeddings-benchmark/mteb) package as described "
                     "[here](https://github.com/illuin-tech/vidore-benchmark/blob/main/README.md).</span>"
                 )
+                gr.Markdown(
+                    "## <span style='color:red'>Missing results in the new leaderboard are being added as they are re-computed.</span>"
+                )
+                gr.Markdown(
+                    "# <span style='color:red'>[Deprecated]</span> ViDoRe: The Visual Document Retrieval Benchmark 1 📚🔍"
+                )
                 gr.Markdown("### From the paper - ColPali: Efficient Document Retrieval with Vision Language Models 👀")
                 gr.Markdown(
                 deprecated_datasets_columns_1 = list(deprecated_data_benchmark_1.columns[3:])
                 with gr.Row():
+                    deprecated_metric_dropdown_1 = gr.Dropdown(
+                        choices=METRICS, value=initial_metric, label="Select Metric"
+                    )
                     deprecated_research_textbox_1 = gr.Textbox(
                         placeholder="🔍 Search Models... [press enter]",
                         label="Filter Models by Name",
                     )
                     deprecated_column_checkboxes_1 = gr.CheckboxGroup(
+                        choices=deprecated_datasets_columns_1,
+                        value=deprecated_datasets_columns_1,
+                        label="Select Columns to Display",
                     )
                 with gr.Row():
                     deprecated_datatype_1 = ["number", "markdown"] + ["number"] * (deprecated_num_datasets_1 + 1)
+                    deprecated_dataframe_1 = gr.Dataframe(
+                        deprecated_data_benchmark_1, datatype=deprecated_datatype_1, type="pandas"
+                    )
                 def deprecated_update_data_1(metric, search_term, selected_columns):
                     deprecated_model_handler.get_vidore_data(metric)
                     data = filter_models(data, search_term)
                     # data = remove_duplicates(data)  # Add this line
                     if selected_columns:
+                        data = data[["Rank", "Model", "Average Score"] + selected_columns]
                     return data
                 with gr.Row():
                     outputs=deprecated_dataframe_1,
                 )
                 deprecated_research_textbox_1.submit(
+                    lambda metric, search_term, selected_columns: deprecated_update_data_1(
+                        metric, search_term, selected_columns
+                    ),
+                    inputs=[
+                        deprecated_metric_dropdown_1,
+                        deprecated_research_textbox_1,
+                        deprecated_column_checkboxes_1,
+                    ],
                     outputs=deprecated_dataframe_1,
                 )
                 deprecated_column_checkboxes_1.change(
+                    lambda metric, search_term, selected_columns: deprecated_update_data_1(
+                        metric, search_term, selected_columns
+                    ),
+                    inputs=[
+                        deprecated_metric_dropdown_1,
+                        deprecated_research_textbox_1,
+                        deprecated_column_checkboxes_1,
+                    ],
                     outputs=deprecated_dataframe_1,
                 )
                 ```
                 """
                 )
+            with gr.TabItem("Deprecated ViDoRe V2", id="vidore-v2-deprecated"):
                 gr.Markdown(
                     "## <span style='color:red'>Deprecation notice: This leaderboard contains the results computed with the "
                     "[vidore-benchmark](https://github.com/illuin-tech/vidore-benchmark) package, "
                     "[mteb](https://github.com/embeddings-benchmark/mteb) package as described "
                     "[here](https://github.com/illuin-tech/vidore-benchmark/blob/main/README.md).</span>"
                 )
+                gr.Markdown(
+                    "## <span style='color:red'>Missing results in the new leaderboard are being added as they are re-computed.</span>"
+                )
+                gr.Markdown(
+                    "# <span style='color:red'>[Deprecated]</span> ViDoRe V2: A new visual Document Retrieval Benchmark 📚🔍"
+                )
                 gr.Markdown("### A harder dataset benchmark for visual document retrieval 👀")
                 gr.Markdown(
                 deprecated_datasets_columns_2 = list(deprecated_data_benchmark_2.columns[3:])
                 with gr.Row():
+                    deprecated_metric_dropdown_2 = gr.Dropdown(
+                        choices=METRICS, value=initial_metric, label="Select Metric"
+                    )
                     deprecated_research_textbox_2 = gr.Textbox(
                         placeholder="🔍 Search Models... [press enter]",
                         label="Filter Models by Name",
                     )
                     deprecated_column_checkboxes_2 = gr.CheckboxGroup(
+                        choices=deprecated_datasets_columns_2,
+                        value=deprecated_datasets_columns_2,
+                        label="Select Columns to Display",
                     )
                 with gr.Row():
                     deprecated_datatype_2 = ["number", "markdown"] + ["number"] * (deprecated_num_datasets_2 + 1)
+                    deprecated_dataframe_2 = gr.Dataframe(
+                        deprecated_data_benchmark_2, datatype=deprecated_datatype_2, type="pandas"
+                    )
                 def deprecated_update_data_2(metric, search_term, selected_columns):
                     deprecated_model_handler.get_vidore_data(metric)
                     data = filter_models(data, search_term)
                     # data = remove_duplicates(data)  # Add this line
                     if selected_columns:
+                        data = data[["Rank", "Model", "Average Score"] + selected_columns]
                     return data
                 with gr.Row():
                     outputs=deprecated_dataframe_2,
                 )
                 deprecated_research_textbox_2.submit(
+                    lambda metric, search_term, selected_columns: deprecated_update_data_2(
+                        metric, search_term, selected_columns
+                    ),
+                    inputs=[
+                        deprecated_metric_dropdown_2,
+                        deprecated_research_textbox_2,
+                        deprecated_column_checkboxes_2,
+                    ],
                     outputs=deprecated_dataframe_2,
                 )
                 deprecated_column_checkboxes_2.change(
+                    lambda metric, search_term, selected_columns: deprecated_update_data_2(
+                        metric, search_term, selected_columns
+                    ),
+                    inputs=[
+                        deprecated_metric_dropdown_2,
+                        deprecated_research_textbox_2,
+                        deprecated_column_checkboxes_2,
+                    ],
                     outputs=deprecated_dataframe_2,
                 )
                 """
                 )
+        def select_tab_from_url(request: gr.Request):
+            # Grab query parameters from the URL
+            query_params = dict(request.query_params)
+            # Look for ?tab=..., default to the first tab's ID if not found
+            target_tab = query_params.get("tab", "vidore-v3")
+            # Update the tabs component to select the target ID
+            return gr.update(selected=target_tab)
+        block.load(select_tab_from_url, inputs=None, outputs=tabs)
     block.queue(max_size=10).launch(debug=True)

app/utils.py CHANGED Viewed

@@ -1,7 +1,20 @@
-def make_clickable_model(model_name, link=None):
-    if link is None:
         desanitized_model_name = model_name.replace("__", "/")
         desanitized_model_name = desanitized_model_name.replace("_", "/")
         desanitized_model_name = desanitized_model_name.replace("-thisisapoint-", ".")
@@ -11,47 +24,81 @@ def make_clickable_model(model_name, link=None):
         if "/ocr" in desanitized_model_name:
             desanitized_model_name = desanitized_model_name.replace("/ocr", "")
-        link = "https://huggingface.co/" + desanitized_model_name
     return f'<a target="_blank" style="text-decoration: underline" href="{link}">{desanitized_model_name}</a>'
 def add_rank(df, benchmark_version=1, selected_columns=None):
-        df.fillna(0.0, inplace=True)
-        if selected_columns is None:
-            cols_to_rank = [
-                col
-                for col in df.columns
-                if col
-                not in [
-                    "Model",
-                    "Model Size (Million Parameters)",
-                    "Memory Usage (GB, fp32)",
-                    "Embedding Dimensions",
-                    "Max Tokens",
-                ]
             ]
-        else:
-            cols_to_rank = selected_columns
-        if len(cols_to_rank) == 1:
-            df.sort_values(cols_to_rank[0], ascending=False, inplace=True)
-        else:
-            df.insert(len(df.columns) - len(cols_to_rank), "Average", df[cols_to_rank].mean(axis=1, skipna=False))
-            df.sort_values("Average", ascending=False, inplace=True)
-        df.insert(0, "Rank", list(range(1, len(df) + 1)))
-        # multiply values by 100 if they are floats and round to 1 decimal place
-        for col in df.columns:
-            if df[col].dtype == "float64" and col != "Model Size (Million Parameters)":
-                df[col] = df[col].apply(lambda x: round(x * 100, 1))
-        return df
-def add_rank_and_format(df, benchmark_version=1, selected_columns=None):
     df = df.reset_index()
-    df = df.rename(columns={"index": "Model"})
     df = add_rank(df, benchmark_version, selected_columns)
-    df["Model"] = df["Model"].apply(make_clickable_model)
     # df = remove_duplicates(df)
     return df
@@ -73,6 +120,7 @@ def get_refresh_function(model_handler, benchmark_version):
     return _refresh
 def deprecated_get_refresh_function(model_handler, benchmark_version):
     def _refresh(metric):
         model_handler.get_vidore_data(metric)
@@ -83,7 +131,21 @@ def deprecated_get_refresh_function(model_handler, benchmark_version):
     return _refresh
 def filter_models(data, search_term):
     if search_term:
-        data = data[data["Model"].str.contains(search_term, case=False, na=False)]
     return data

+import pandas as pd
+import math
+def make_clickable_model(model_name, link=None, is_pipeline=False, folder_name=None):
+    if is_pipeline:
+        # For pipelines: use folder_name for link, model_name (alias) for display
+        link_folder = folder_name if folder_name else model_name
+        # Process folder name for link: only handle __ and -thisisapoint-
+        desanitized_folder = link_folder.replace("__", "/")
+        desanitized_folder = desanitized_folder.replace("-thisisapoint-", ".")
+        if link is None:
+            link = f"https://github.com/illuin-tech/vidore-benchmark/tree/main/results/pipeline_descriptions/{desanitized_folder}/description.json"
+        # Use word-wrap styling for potentially long pipeline aliases
+        return f'<a target="_blank" style="text-decoration: underline; word-wrap: break-word; white-space: normal; display: inline-block; max-width: 450px;" href="{link}">{model_name}</a>'
+    else:
+        # For regular models: replace __ and _ with /, and -thisisapoint- with .
         desanitized_model_name = model_name.replace("__", "/")
         desanitized_model_name = desanitized_model_name.replace("_", "/")
         desanitized_model_name = desanitized_model_name.replace("-thisisapoint-", ".")
         if "/ocr" in desanitized_model_name:
             desanitized_model_name = desanitized_model_name.replace("/ocr", "")
+        if link is None:
+            link = "https://huggingface.co/" + desanitized_model_name
     return f'<a target="_blank" style="text-decoration: underline" href="{link}">{desanitized_model_name}</a>'
 def add_rank(df, benchmark_version=1, selected_columns=None):
+    # Convert numeric columns to proper float type (they may be 'object' dtype due to mixed data)
+    for col in df.columns:
+        if col not in ["Model", "Pipeline", "_folder_name"]:
+            df[col] = pd.to_numeric(df[col], errors="coerce")
+    # Only fill NaN for numeric columns to avoid issues with string columns like _folder_name
+    numeric_cols = df.select_dtypes(include=["float64", "int64", "float32", "int32"]).columns
+    df[numeric_cols] = df[numeric_cols].fillna(0.0)
+    if selected_columns is None:
+        cols_to_rank = [
+            col
+            for col in df.columns
+            if col
+            not in [
+                "Model",
+                "Pipeline",  # For pipeline tables
+                "Model Size (Million Parameters)",
+                "Memory Usage (GB, fp32)",
+                "Embedding Dimensions",
+                "Max Tokens",
+                "Compute Cost ($)",
+                "Queries per Second",
+                "_folder_name",  # Hidden column for pipeline link generation
             ]
+        ]
+    else:
+        cols_to_rank = selected_columns
+    if len(cols_to_rank) == 1:
+        df.sort_values(cols_to_rank[0], ascending=False, inplace=True)
+    else:
+        # Only add Average column if it doesn't already exist
+        if "Average Score" not in df.columns:
+            df.insert(len(df.columns) - len(cols_to_rank), "Average Score", df[cols_to_rank].mean(axis=1, skipna=False))
+        df.sort_values("Average Score", ascending=False, inplace=True)
+    df.insert(0, "Rank", list(range(1, len(df) + 1)))
+    # multiply values by 100 if they are floats and round to 2 decimal places
+    for col in df.columns:
+        if pd.api.types.is_numeric_dtype(df[col]) and col not in [
+            "Model Size (Million Parameters)",
+            "Compute Cost ($)",
+            "Queries per Second",
+            "Indexing latency (s/doc)",
+            "Search latency (s/query)",
+            "Rank",
+        ]:
+            df[col] = df[col].apply(lambda x: round(x * 100, 3 - int(math.floor(math.log10(abs(x * 100)))) - 1))
+        elif pd.api.types.is_numeric_dtype(df[col]) and col in ["Indexing latency (s/doc)", "Search latency (s/query)"]:
+            df[col] = df[col].apply(lambda x: round(x, 3 - int(math.floor(math.log10(abs(x)))) - 1))
+    return df
def add_rank_and_format(df, benchmark_version=1, selected_columns=None, is_pipeline=False):
    """Rank a results table and turn its name column into HTML links.

    The index of ``df`` becomes the "Pipeline" (when ``is_pipeline``) or
    "Model" column, the table is ranked with ``add_rank``, and every name is
    replaced by the anchor produced by ``make_clickable_model``. For pipeline
    tables carrying a "_folder_name" column, that column supplies the link
    target and is removed from the returned frame.
    """
    name_col = "Pipeline" if is_pipeline else "Model"
    ranked = add_rank(
        df.reset_index().rename(columns={"index": name_col}),
        benchmark_version,
        selected_columns,
    )

    if not (is_pipeline and "_folder_name" in ranked.columns):
        ranked[name_col] = ranked[name_col].apply(lambda x: make_clickable_model(x, is_pipeline=is_pipeline))
        return ranked

    def _pipeline_link(row):
        # The link is built from the folder name; the row name is what is displayed.
        return make_clickable_model(row[name_col], is_pipeline=True, folder_name=row["_folder_name"])

    ranked[name_col] = ranked.apply(_pipeline_link, axis=1)
    return ranked.drop(columns=["_folder_name"])
     return _refresh
 def deprecated_get_refresh_function(model_handler, benchmark_version):
     def _refresh(metric):
         model_handler.get_vidore_data(metric)
     return _refresh
def get_pipeline_refresh_function(pipeline_handler):
    """Build the refresh callback for the pipeline leaderboard.

    The returned function takes a metric name, asks ``pipeline_handler`` to
    re-fetch its data, renders the table for that metric and returns it ranked
    and formatted as a pipeline table.
    """

    def _refresh(metric):
        pipeline_handler.get_pipeline_data()
        rendered = pipeline_handler.render_df(metric)
        return add_rank_and_format(rendered, benchmark_version=3, is_pipeline=True)

    return _refresh
 def filter_models(data, search_term):
     if search_term:
+        # Use "Pipeline" column for pipeline tables, "Model" for others
+        col_name = "Pipeline" if "Pipeline" in data.columns else "Model"
+        data = data[data[col_name].str.contains(search_term, case=False, na=False)]
     return data

data/pipeline_handler.py ADDED Viewed

	@@ -0,0 +1,274 @@

+import os
+import requests
+from typing import Dict, List, Optional
+import pandas as pd
class PipelineHandler:
    """Handler for ViDoRe v3 pipeline evaluation results from GitHub.

    Pipeline folders and their dataset files are listed through the GitHub
    contents API; the JSON payloads are downloaded from
    raw.githubusercontent.com. Network or parsing failures are reported with
    ``print`` and degrade to an empty result instead of raising.
    """

    # Upper bound, in seconds, for any single HTTP request. Without an explicit
    # timeout ``requests`` can wait forever on an unresponsive server, which
    # would block the leaderboard refresh.
    REQUEST_TIMEOUT_S = 10

    def __init__(self):
        self.pipeline_infos = {}
        self.pipeline_aliases = {}  # Maps folder_name -> pipeline_alias for display
        self.github_base_url = "https://raw.githubusercontent.com/illuin-tech/vidore-benchmark/main/results/metrics"
        self.github_descriptions_base_url = (
            "https://raw.githubusercontent.com/illuin-tech/vidore-benchmark/main/results/pipeline_descriptions"
        )
        self.available_datasets = []
        self.available_languages = ["english"]  # Default languages available
        # Setup GitHub authentication if token is available
        self.github_token = os.environ.get("GITHUB_TOKEN")
        self.headers = {}
        if self.github_token:
            self.headers["Authorization"] = f"token {self.github_token}"
            print("GitHub token detected - using authenticated requests")

    def _request_json(self, url: str):
        """GET ``url`` with the handler's headers and return the decoded JSON.

        Raises whatever ``requests`` raises on connection errors, timeouts,
        non-2xx statuses or invalid JSON; callers decide how to degrade.
        """
        response = requests.get(url, headers=self.headers, timeout=self.REQUEST_TIMEOUT_S)
        response.raise_for_status()
        return response.json()

    def get_pipeline_folders_from_github(self) -> List[str]:
        """Get the sorted list of pipeline folders from the GitHub API.

        Returns an empty list when the listing cannot be fetched or parsed.
        """
        api_url = "https://api.github.com/repos/illuin-tech/vidore-benchmark/contents/results/metrics"
        # Deliberately broad: any failure here must not take the app down.
        try:
            contents = self._request_json(api_url)
            # Filter for directories only
            return sorted(item["name"] for item in contents if item["type"] == "dir")
        except Exception as e:
            print(f"Error fetching pipeline folders from GitHub: {e}")
            return []

    def get_dataset_files_from_github(self, pipeline_name: str) -> List[str]:
        """Get the sorted list of dataset JSON files of one pipeline from the GitHub API.

        Only files named ``vidore_v3*.json`` are kept. Returns an empty list
        when the listing cannot be fetched or parsed.
        """
        api_url = f"https://api.github.com/repos/illuin-tech/vidore-benchmark/contents/results/metrics/{pipeline_name}"
        try:
            contents = self._request_json(api_url)
            # Filter for JSON files that start with 'vidore_v3'
            return sorted(
                item["name"]
                for item in contents
                if item["type"] == "file" and item["name"].startswith("vidore_v3") and item["name"].endswith(".json")
            )
        except Exception as e:
            print(f"Error fetching dataset files from {pipeline_name}: {e}")
            return []

    def fetch_json_from_github(self, pipeline_name: str, filename: str) -> Optional[Dict]:
        """Fetch a JSON file from GitHub raw content; return ``None`` on failure."""
        url = f"{self.github_base_url}/{pipeline_name}/{filename}"
        try:
            return self._request_json(url)
        except Exception as e:
            print(f"Error fetching {filename} from {pipeline_name}: {e}")
            return None

    def fetch_pipeline_alias(self, pipeline_name: str) -> Optional[str]:
        """Fetch the pipeline_alias from description.json for a pipeline.

        Uses raw.githubusercontent.com to avoid API rate limits. Returns
        ``None`` when the file cannot be fetched or has no ``pipeline_alias``.
        """
        url = f"{self.github_descriptions_base_url}/{pipeline_name}/description.json"
        try:
            return self._request_json(url).get("pipeline_alias")
        except Exception as e:
            print(f"Error fetching description for {pipeline_name}: {e}")
            return None

    def get_pipeline_data(self):
        """Fetch all pipeline data from GitHub.

        Fills ``pipeline_infos`` (folder -> dataset -> results) and
        ``pipeline_aliases``, then recomputes ``available_datasets`` and
        ``available_languages`` from what was fetched in this call. Entries
        stored by earlier calls are overwritten per pipeline, never removed.
        """
        datasets_set = set()
        languages_set = {"overall"}
        for pipeline_name in self.get_pipeline_folders_from_github():
            # Get all dataset files for this pipeline
            dataset_files = self.get_dataset_files_from_github(pipeline_name)
            if not dataset_files:
                continue
            pipeline_data = {}
            for filename in dataset_files:
                results = self.fetch_json_from_github(pipeline_name, filename)
                if not results:
                    continue
                # Extract dataset name from filename (remove vidore_v3_ prefix and .json suffix)
                dataset_name = filename.replace("vidore_v3_", "").replace(".json", "")
                datasets_set.add(dataset_name)
                pipeline_data[dataset_name] = results
                # Collect available languages
                if "aggregated_metrics" in results and "by_language" in results["aggregated_metrics"]:
                    languages_set.update(results["aggregated_metrics"]["by_language"].keys())
            if pipeline_data:
                self.pipeline_infos[pipeline_name] = pipeline_data
                # Fetch the pipeline alias for display (uses raw URL, not API)
                alias = self.fetch_pipeline_alias(pipeline_name)
                if alias:
                    self.pipeline_aliases[pipeline_name] = alias
        self.available_datasets = sorted(datasets_set)
        self.available_languages = sorted(languages_set)

    def calculate_cost_metric(self, pipeline_datasets: Dict) -> float:
        """
        Calculate a compute cost metric based on retrieval time across all datasets.

        Datasets without an ``aggregated_metrics`` entry are ignored.
        Returns cost in arbitrary units (could be refined based on actual compute costs),
        rounded to 4 decimals.
        """
        total_time_s = 0
        for dataset_data in pipeline_datasets.values():
            if "aggregated_metrics" not in dataset_data:
                continue
            timing = dataset_data["aggregated_metrics"].get("timing", {})
            total_time_ms = timing.get("total_retrieval_time_milliseconds", 0)
            total_time_s += total_time_ms / 1000.0
        # Simple cost model: assume $0.01 per second of compute (adjustable)
        cost = total_time_s * 0.01
        return round(cost, 4)

    def extract_dataset_metrics(
        self, pipeline_datasets: Dict, metric: str = "ndcg_cut_5", language: str = "english"
    ) -> Dict[str, float]:
        """
        Extract metrics for individual datasets from the aggregated results.

        Args:
            pipeline_datasets: Dictionary mapping dataset names to their data
            metric: The metric to extract, in UI format (e.g., 'ndcg_at_5') or
                directly as a result-file key (e.g., 'ndcg_cut_5')
            language: 'overall' for the aggregated results, otherwise a key of
                the ``by_language`` section (e.g., 'english')

        Returns:
            Dictionary mapping display dataset names to metric values. Datasets
            without data for the requested language are omitted; a missing
            metric key yields 0.0.
        """
        # Map metric names from UI format to API format
        # NOTE(review): the "@1" metrics are served from the "@5" keys, so the
        # values displayed for ndcg_at_1 / recall_at_1 are really @5 scores.
        metric_mapping = {
            "ndcg_at_1": "ndcg_cut_5",  # Using cut_5 as approximation
            "ndcg_at_5": "ndcg_cut_5",
            "ndcg_at_10": "ndcg_cut_10",
            "ndcg_at_100": "ndcg_cut_100",
            "recall_at_1": "recall_5",
            "recall_at_5": "recall_5",
            "recall_at_10": "recall_10",
            "recall_at_100": "recall_100",
        }
        actual_metric = metric_mapping.get(metric, metric)
        dataset_metrics = {}
        for dataset_name, dataset_data in pipeline_datasets.items():
            if "aggregated_metrics" not in dataset_data:
                continue
            aggregated = dataset_data["aggregated_metrics"]
            # Get metrics for the specified language
            if language == "overall":
                metrics_data = aggregated.get("overall", {})
            else:
                metrics_data = aggregated.get("by_language", {}).get(language, {})
            if metrics_data:
                # Format dataset name for display
                display_name = dataset_name.replace("_", " ").title()
                dataset_metrics[display_name] = metrics_data.get(actual_metric, 0.0)
        return dataset_metrics

    def render_df(self, metric: str = "ndcg_at_5", language: str = "overall") -> pd.DataFrame:
        """
        Render a DataFrame with pipeline results.

        Args:
            metric: The metric to display (e.g., 'ndcg_at_5')
            language: The language to filter by ('overall' for all languages, or specific language)

        Returns:
            DataFrame indexed by pipeline display name with the latency
            columns, "Average Score", one column per dataset and the hidden
            "_folder_name" column. Empty DataFrame when no pipeline is loaded.
        """
        pipeline_res = {}
        for pipeline_name, pipeline_datasets in self.pipeline_infos.items():
            # Timing sections of every dataset that has aggregated metrics
            timings = [
                dataset_data["aggregated_metrics"].get("timing", {})
                for dataset_data in pipeline_datasets.values()
                if "aggregated_metrics" in dataset_data
            ]
            num_datasets = len(timings)
            total_time_ms = sum(t.get("total_retrieval_time_milliseconds", 0) for t in timings)
            total_queries = sum(t.get("num_queries", 0) for t in timings)
            indexing_time_ms = sum(t.get("indexing_throughput_ms_per_doc", 0) for t in timings)
            search_time_ms = sum(t.get("search_throughput_ms_per_query", 0) for t in timings)

            row_data = {}
            if total_queries <= 0:
                # No query count reported at all: -1 marks the latencies as unknown.
                row_data["Indexing latency (s/doc)"] = -1
                row_data["Search latency (s/query)"] = -1
            elif total_time_ms <= 0:
                row_data["Indexing latency (s/doc)"] = 0
                row_data["Search latency (s/query)"] = 0
            else:
                # Mean over datasets, converted from milliseconds to seconds.
                row_data["Indexing latency (s/doc)"] = (
                    (indexing_time_ms / 1000) / num_datasets if indexing_time_ms > 0 else None
                )
                row_data["Search latency (s/query)"] = (
                    (search_time_ms / 1000) / num_datasets if search_time_ms > 0 else None
                )

            # Add dataset metrics
            dataset_metrics = self.extract_dataset_metrics(pipeline_datasets, metric, language)
            row_data.update(dataset_metrics)
            # Average across the datasets that reported the metric
            if dataset_metrics:
                row_data["Average Score"] = round(sum(dataset_metrics.values()) / len(dataset_metrics), 4)
            # Use pipeline_alias for display if available, otherwise fallback to folder name
            display_name = self.pipeline_aliases.get(pipeline_name, pipeline_name)
            # Store folder name for link generation (will be used in utils.py)
            row_data["_folder_name"] = pipeline_name
            pipeline_res[display_name] = row_data

        if not pipeline_res:
            return pd.DataFrame()

        df = pd.DataFrame(pipeline_res).T
        # Reorder columns to have Average right after timing metrics
        cols = list(df.columns)
        if "Average Score" in cols:
            cols.remove("Average Score")
            # Insert Average after Search latency (s/query)
            insert_pos = cols.index("Search latency (s/query)") + 1 if "Search latency (s/query)" in cols else 2
            cols.insert(insert_pos, "Average Score")
            df = df[cols]
        return df