Spaces:

Metric-AI
/

ArmBench-TextEmbed

Running

File size: 12,168 Bytes

c5f9df5

import pandas as pd

# Raw result column -> header shown on the native-script summary leaderboard.
COLUMN_LABELS = dict(
    model_name="Model",
    model_size="Size",
    mteb_avg="MTEB",
    sts_spearman="STS",
    retrieval_top20="Retrieval",
    msmarco_top10="MS MARCO",
)

# Raw result column -> header shown on the transliteration summary leaderboard.
TRANSLIT_COLUMN_LABELS = dict(
    model_name="Model",
    model_size="Size",
    retrieval_translit_top20="Retrieval",
    msmarco_translit_top10="MS MARCO",
)

# Native-script metrics that are averaged into the overall score
# (transliteration metrics are deliberately left out).
SCORE_COLS = [
    "mteb_avg",
    "sts_spearman",
    "retrieval_top20",
    "msmarco_top10",
]


def prepare_leaderboard(df: pd.DataFrame) -> pd.DataFrame:
    """Build the native-script summary leaderboard from raw results.

    The function optionally turns ``model_name`` into a Markdown link,
    averages whichever ``SCORE_COLS`` are present into an ``average`` column,
    sorts by that average (best first), prepends a 1-based ``Rank`` column,
    keeps only the display columns and renames them via ``COLUMN_LABELS``.

    Args:
        df: Raw results with one row per model. ``model_name`` is expected;
            ``model_url``, ``model_size`` and the ``SCORE_COLS`` metrics are
            each used only when present.

    Returns:
        A new DataFrame ready for display. An empty input is returned as-is.
    """
    if df.empty:
        return df

    # Work on a private copy so the caller's frame is never modified.
    df = df.copy()

    # Render the model name as a Markdown link when a usable URL is available.
    # A missing or empty URL keeps the plain name (an empty URL would otherwise
    # produce a dead "[name]()" link).
    if "model_url" in df.columns:
        df["model_name"] = [
            f"[{name}]({url})" if pd.notna(url) and url != "" else name
            for name, url in zip(df["model_name"], df["model_url"])
        ]

    # Overall score: row-wise mean of the native-script metrics that exist
    # (NaN cells are skipped by DataFrame.mean).
    available_cols = [c for c in SCORE_COLS if c in df.columns]
    if available_cols:
        df["average"] = df[available_cols].mean(axis=1).round(4)

    # Best average first. A stable sort keeps tied models in their input
    # order, so the ranks are deterministic.
    if "average" in df.columns:
        df = df.sort_values(
            by="average", ascending=False, kind="mergesort"
        ).reset_index(drop=True)

    df.insert(0, "Rank", range(1, len(df) + 1))

    # Keep only the main leaderboard columns (translit metrics are excluded).
    size_col = ["model_size"] if "model_size" in df.columns else []
    display_cols = ["Rank", "model_name"] + size_col + available_cols + ["average"]
    # Explicit copy: assigning into the result of a column selection can
    # otherwise emit SettingWithCopyWarning.
    df = df[[c for c in display_cols if c in df.columns]].copy()

    # Show "-" for a missing or empty model size.
    if "model_size" in df.columns:
        df["model_size"] = df["model_size"].fillna("-").replace("", "-")

    df = df.round(4)

    return df.rename(columns={**COLUMN_LABELS, "average": "Average"})


def prepare_detailed_leaderboards(detailed_results: dict, model_order: list = None, use_multiindex: bool = True) -> pd.DataFrame:
    """Prepare a single combined detailed leaderboard with hierarchical columns.

    Every dataset frame found in ``detailed_results`` is left-joined onto the
    list of model names, one output column per known metric that the frame
    actually contains. Frames that are missing, empty, or lack a
    ``model_name`` column are skipped; rows without a model name are ignored
    and only the first row per model is used.

    Args:
        detailed_results: Dict mapping the dataset keys ``"mteb"``, ``"sts"``,
            ``"retrieval"`` and ``"msmarco"`` to DataFrames that carry a
            ``model_name`` column plus metric columns.
        model_order: Optional list of model names in desired order. Names not
            present in the data are dropped, repeated names are used once, and
            models missing from the list are appended alphabetically. When it
            is omitted or empty, rows are sorted by the first numeric column
            (descending, NaN last; ties stay alphabetical).
        use_multiindex: If True, return DataFrame with MultiIndex columns
            ``(dataset label, metric label)`` for hierarchical display.
            If False, use flat "Category | Metric" column names.

    Returns:
        pd.DataFrame: Combined table with a leading 1-based ``#`` column and a
        ``Model`` column, or an empty DataFrame when no dataset provides any
        model.
    """
    # Dataset configurations: (dataset_key, dataset_label, column_mappings)
    datasets = [
        ("mteb", "MTEB", {
            "FloresBitextMining_devtest": "Flores",
            "NTREXBitextMining_test": "NTREX",
            "Tatoeba_test": "Tatoeba",
            "MassiveIntentClassification_test": "Intent",
            "MassiveScenarioClassification_test": "Scenario",
            "SIB200Classification_test": "SIB200 Cls",
            "SIB200ClusteringS2S_test": "SIB200 Clust",
            "ArmenianParaphrasePC_test": "Paraphrase",
            "BelebeleRetrieval_test": "Belebele",
        }),
        ("sts", "STS", {
            "Pearson_correlation": "Pearson",
            "Spearman_correlation": "Spearman",
        }),
        ("retrieval", "Retrieval", {
            "top1 within document": "Top-1 Doc",
            "top3 within document": "Top-3 Doc",
            "top5 within document": "Top-5 Doc",
            "top20 group mean macro": "Top-20 Type",
            "top20 all": "Top-20 All",
        }),
        ("msmarco", "MS MARCO", {
            "reranking_mrr": "Rerank MRR",
            "retrieval_mrr": "Retr. MRR",
            "retrieval_top5_accuracy": "Top-5",
            "retrieval_top10_accuracy": "Top-10",
        }),
    ]

    # Normalise the usable frames once. The same filter then drives both the
    # model collection and the joins, so a frame without "model_name" can no
    # longer pass one step and crash the other.
    frames = {}
    for key, _, _ in datasets:
        df = detailed_results.get(key)
        if df is None or df.empty or "model_name" not in df.columns:
            continue
        # Null names cannot be sorted together with strings; duplicated names
        # would multiply rows in the left join below.
        df = df.dropna(subset=["model_name"]).drop_duplicates(subset=["model_name"], keep="first")
        if not df.empty:
            frames[key] = df

    all_models = set()
    for df in frames.values():
        all_models.update(df["model_name"])

    if not all_models:
        return pd.DataFrame()

    if model_order:
        # dict.fromkeys de-duplicates while preserving the requested order.
        ordered_models = list(dict.fromkeys(m for m in model_order if m in all_models))
        listed = set(ordered_models)
        # Models the caller did not list (e.g. newly added) go last, alphabetically.
        remaining = sorted(m for m in all_models if m not in listed)
        all_models_ordered = ordered_models + remaining
    else:
        all_models_ordered = sorted(all_models)

    # Build the table with flat "Label | Metric" names first; the matching
    # MultiIndex tuples are tracked in parallel.
    combined = pd.DataFrame({"Model": all_models_ordered})
    column_tuples = [("", "Model")]  # For MultiIndex: (level1, level2)

    for key, label, col_map in datasets:
        df = frames.get(key)
        if df is None:
            continue
        # Raw metric column -> flat display name, for the metrics this frame has.
        present = {
            orig_col: f"{label} | {new_col}"
            for orig_col, new_col in col_map.items()
            if orig_col in df.columns
        }
        if not present:
            continue
        column_tuples.extend((label, col_map[orig_col]) for orig_col in present)
        # One left join per dataset; a left merge keeps the row order of `combined`.
        combined = combined.merge(
            df[["model_name", *present]].rename(columns={"model_name": "Model", **present}),
            on="Model",
            how="left",
        )

    # Round numeric columns
    combined = combined.round(4)

    # Without an explicit order, fall back to sorting by the first numeric
    # column. The stable sort keeps tied models in alphabetical order.
    if not model_order:
        numeric_cols = combined.select_dtypes(include=["number"]).columns.tolist()
        if numeric_cols:
            combined = combined.sort_values(
                by=numeric_cols[0], ascending=False, na_position="last", kind="mergesort"
            )

    # Always reset index to ensure proper row ordering
    combined = combined.reset_index(drop=True)

    combined.insert(0, "#", range(1, len(combined) + 1))
    column_tuples.insert(0, ("", "#"))

    if use_multiindex:
        # Convert to MultiIndex columns for proper hierarchical display
        combined.columns = pd.MultiIndex.from_tuples(column_tuples)

    return combined


def prepare_translit_leaderboard(df: pd.DataFrame) -> pd.DataFrame:
    """Build the transliteration summary leaderboard from raw results.

    Only ``retrieval_translit_top20`` and ``msmarco_translit_top10`` are
    considered. Models with no value in any of the present translit columns
    are dropped; the rest are averaged, sorted best-first, ranked from 1 and
    renamed via ``TRANSLIT_COLUMN_LABELS``.

    Args:
        df: Raw results with one row per model. ``model_name`` is expected;
            ``model_url``, ``model_size`` and the translit metric columns are
            each used only when present.

    Returns:
        A new DataFrame ready for display. An empty input is returned as-is;
        an empty DataFrame is returned when no translit column exists or no
        model has translit data.
    """
    if df.empty:
        return df

    # Work on a private copy so the caller's frame is never modified.
    df = df.copy()

    # Render the model name as a Markdown link when a usable URL is available.
    # A missing or empty URL keeps the plain name (an empty URL would otherwise
    # produce a dead "[name]()" link).
    if "model_url" in df.columns:
        df["model_name"] = [
            f"[{name}]({url})" if pd.notna(url) and url != "" else name
            for name, url in zip(df["model_name"], df["model_url"])
        ]

    # Only include translit columns
    translit_cols = ["retrieval_translit_top20", "msmarco_translit_top10"]
    available_cols = [c for c in translit_cols if c in df.columns]

    if not available_cols:
        return pd.DataFrame()

    # Keep models that have at least one translit score. Explicit copy:
    # assigning a new column to the row-filtered result can otherwise emit
    # SettingWithCopyWarning.
    df = df.dropna(subset=available_cols, how="all").copy()

    if df.empty:
        return pd.DataFrame()

    # Row-wise mean of the present translit metrics (NaN cells are skipped).
    df["average"] = df[available_cols].mean(axis=1).round(4)

    # Best average first. A stable sort keeps tied models in their input
    # order, so the ranks are deterministic.
    df = df.sort_values(
        by="average", ascending=False, kind="mergesort"
    ).reset_index(drop=True)
    df.insert(0, "Rank", range(1, len(df) + 1))

    # Select columns - include model_size if available
    size_col = ["model_size"] if "model_size" in df.columns else []
    display_cols = ["Rank", "model_name"] + size_col + available_cols + ["average"]
    df = df[[c for c in display_cols if c in df.columns]].round(4)

    # Show "-" for a missing or empty model size.
    if "model_size" in df.columns:
        df["model_size"] = df["model_size"].fillna("-").replace("", "-")

    return df.rename(columns={**TRANSLIT_COLUMN_LABELS, "average": "Average"})


def prepare_translit_detailed(detailed_results: dict, model_order: list = None, use_multiindex: bool = True) -> pd.DataFrame:
    """Prepare a single combined translit detailed leaderboard with hierarchical columns.

    Every translit dataset frame found in ``detailed_results`` is left-joined
    onto the list of model names, one output column per known metric that the
    frame actually contains. Frames that are missing, empty, or lack a
    ``model_name`` column are skipped; rows without a model name are ignored
    and only the first row per model is used.

    Args:
        detailed_results: Dict with 'retrieval_translit' and 'msmarco_translit'
            DataFrames that carry a ``model_name`` column plus metric columns.
        model_order: Optional list of model names in desired order. Names not
            present in the data are dropped, repeated names are used once, and
            models missing from the list are appended alphabetically. When it
            is omitted or empty, rows are sorted by the first numeric column
            (descending, NaN last; ties stay alphabetical).
        use_multiindex: If True, return DataFrame with MultiIndex columns
            ``(dataset label, metric label)`` for hierarchical display.
            If False, use flat "Category | Metric" column names.

    Returns:
        pd.DataFrame: Combined table with a leading 1-based ``#`` column and a
        ``Model`` column, or an empty DataFrame when no dataset provides any
        model.
    """
    # Dataset configurations: (dataset_key, dataset_label, column_mappings)
    datasets = [
        ("retrieval_translit", "Retrieval", {
            "top1 within document": "Top-1 Doc",
            "top3 within document": "Top-3 Doc",
            "top5 within document": "Top-5 Doc",
            "top20 group mean macro": "Top-20 Type",
            "top20 all": "Top-20 All",
        }),
        ("msmarco_translit", "MS MARCO", {
            "reranking_mrr": "Rerank MRR",
            "retrieval_mrr": "Retr. MRR",
            "retrieval_top5_accuracy": "Top-5",
            "retrieval_top10_accuracy": "Top-10",
        }),
    ]

    # Normalise the usable frames once. The same filter then drives both the
    # model collection and the joins, so a frame without "model_name" can no
    # longer pass one step and crash the other.
    frames = {}
    for key, _, _ in datasets:
        df = detailed_results.get(key)
        if df is None or df.empty or "model_name" not in df.columns:
            continue
        # Null names cannot be sorted together with strings; duplicated names
        # would multiply rows in the left join below.
        df = df.dropna(subset=["model_name"]).drop_duplicates(subset=["model_name"], keep="first")
        if not df.empty:
            frames[key] = df

    all_models = set()
    for df in frames.values():
        all_models.update(df["model_name"])

    if not all_models:
        return pd.DataFrame()

    if model_order:
        # dict.fromkeys de-duplicates while preserving the requested order.
        ordered_models = list(dict.fromkeys(m for m in model_order if m in all_models))
        listed = set(ordered_models)
        # Models the caller did not list (e.g. newly added) go last, alphabetically.
        remaining = sorted(m for m in all_models if m not in listed)
        all_models_ordered = ordered_models + remaining
    else:
        all_models_ordered = sorted(all_models)

    # Build the table with flat "Label | Metric" names first; the matching
    # MultiIndex tuples are tracked in parallel.
    combined = pd.DataFrame({"Model": all_models_ordered})
    column_tuples = [("", "Model")]  # For MultiIndex: (level1, level2)

    for key, label, col_map in datasets:
        df = frames.get(key)
        if df is None:
            continue
        # Raw metric column -> flat display name, for the metrics this frame has.
        present = {
            orig_col: f"{label} | {new_col}"
            for orig_col, new_col in col_map.items()
            if orig_col in df.columns
        }
        if not present:
            continue
        column_tuples.extend((label, col_map[orig_col]) for orig_col in present)
        # One left join per dataset; a left merge keeps the row order of `combined`.
        combined = combined.merge(
            df[["model_name", *present]].rename(columns={"model_name": "Model", **present}),
            on="Model",
            how="left",
        )

    # Round numeric columns
    combined = combined.round(4)

    # Without an explicit order, fall back to sorting by the first numeric
    # column. The stable sort keeps tied models in alphabetical order.
    if not model_order:
        numeric_cols = combined.select_dtypes(include=["number"]).columns.tolist()
        if numeric_cols:
            combined = combined.sort_values(
                by=numeric_cols[0], ascending=False, na_position="last", kind="mergesort"
            )

    # Always reset index to ensure proper row ordering
    combined = combined.reset_index(drop=True)

    combined.insert(0, "#", range(1, len(combined) + 1))
    column_tuples.insert(0, ("", "#"))

    if use_multiindex:
        # Convert to MultiIndex columns for proper hierarchical display
        combined.columns = pd.MultiIndex.from_tuples(column_tuples)

    return combined