# ArmBench-TextEmbed / data_handler.py
# Initial release by Zaruhi (commit c5f9df5)
import pandas as pd
# Display labels for the main (native-script) leaderboard columns.
COLUMN_LABELS = {
    "model_name": "Model",
    "model_size": "Size",
    "mteb_avg": "MTEB",
    "sts_spearman": "STS",
    "retrieval_top20": "Retrieval",
    "msmarco_top10": "MS MARCO",
}

# Display labels for the transliteration summary leaderboard columns.
TRANSLIT_COLUMN_LABELS = {
    "model_name": "Model",
    "model_size": "Size",
    "retrieval_translit_top20": "Retrieval",
    "msmarco_translit_top10": "MS MARCO",
}

# Metrics used for computing overall average (native script only)
SCORE_COLS = ["mteb_avg", "sts_spearman", "retrieval_top20", "msmarco_top10"]


def prepare_leaderboard(df: pd.DataFrame) -> pd.DataFrame:
    """Build the main (native-script) leaderboard table for display.

    Links model names to their URLs, averages whichever SCORE_COLS metrics
    are present, ranks models by that average, and renames columns to their
    display labels. Transliteration metrics are deliberately excluded.
    """
    if df.empty:
        return df
    df = df.copy()

    # Render model names as markdown links when a URL is available.
    if "model_url" in df.columns:
        def _linkify(row):
            url = row.get("model_url")
            name = row["model_name"]
            return f"[{name}]({url})" if pd.notna(url) else name

        df["model_name"] = df.apply(_linkify, axis=1)

    # Overall average over the native-script metrics that exist in the data.
    available_cols = [c for c in SCORE_COLS if c in df.columns]
    if available_cols:
        df["average"] = df[available_cols].mean(axis=1).round(4)

    # Rank models from best to worst average.
    if "average" in df.columns:
        df = df.sort_values("average", ascending=False).reset_index(drop=True)
        df.insert(0, "Rank", range(1, len(df) + 1))

    # Restrict to the display columns (translit metrics stay out).
    wanted = ["Rank", "model_name"]
    if "model_size" in df.columns:
        wanted.append("model_size")
    wanted += available_cols + ["average"]
    df = df[[c for c in wanted if c in df.columns]]

    # Show "-" for unknown or blank model sizes.
    if "model_size" in df.columns:
        df["model_size"] = df["model_size"].fillna("-").replace("", "-")

    df = df.round(4)
    return df.rename(columns={**COLUMN_LABELS, "average": "Average"})
def prepare_detailed_leaderboards(detailed_results: dict, model_order: list = None, use_multiindex: bool = True) -> pd.DataFrame:
    """Assemble one combined detailed leaderboard across all benchmark datasets.

    Args:
        detailed_results: Mapping of dataset key -> per-model results DataFrame
            (as produced by ModelHandler.get_detailed_results()).
        model_order: Optional explicit row order (model names). Models present
            in the results but missing from this list are appended,
            alphabetically sorted.
        use_multiindex: When True, columns become a two-level MultiIndex
            (dataset, metric) so HTML/Gradio can render merged headers;
            when False, flat "Dataset | Metric" names are kept.

    Returns:
        pd.DataFrame: One row per model with per-dataset metric columns.
    """
    # (dataset_key, display_label, {source_column: display_metric_name})
    datasets = [
        ("mteb", "MTEB", {
            "FloresBitextMining_devtest": "Flores",
            "NTREXBitextMining_test": "NTREX",
            "Tatoeba_test": "Tatoeba",
            "MassiveIntentClassification_test": "Intent",
            "MassiveScenarioClassification_test": "Scenario",
            "SIB200Classification_test": "SIB200 Cls",
            "SIB200ClusteringS2S_test": "SIB200 Clust",
            "ArmenianParaphrasePC_test": "Paraphrase",
            "BelebeleRetrieval_test": "Belebele",
        }),
        ("sts", "STS", {
            "Pearson_correlation": "Pearson",
            "Spearman_correlation": "Spearman",
        }),
        ("retrieval", "Retrieval", {
            "top1 within document": "Top-1 Doc",
            "top3 within document": "Top-3 Doc",
            "top5 within document": "Top-5 Doc",
            "top20 group mean macro": "Top-20 Type",
            "top20 all": "Top-20 All",
        }),
        ("msmarco", "MS MARCO", {
            "reranking_mrr": "Rerank MRR",
            "retrieval_mrr": "Retr. MRR",
            "retrieval_top5_accuracy": "Top-5",
            "retrieval_top10_accuracy": "Top-10",
        }),
    ]

    # Union of every model name seen in any dataset.
    models = set()
    for key, _, _ in datasets:
        frame = detailed_results.get(key, pd.DataFrame())
        if not frame.empty and "model_name" in frame.columns:
            models.update(frame["model_name"].unique())
    if not models:
        return pd.DataFrame()

    # Honor the caller-supplied ordering; models it doesn't mention go last.
    if model_order:
        ordered = [m for m in model_order if m in models]
        ordered += sorted(m for m in models if m not in ordered)
    else:
        ordered = sorted(models)

    # Build the combined table one metric column at a time via left merges.
    combined = pd.DataFrame({"Model": ordered})
    column_tuples = [("", "Model")]  # (top header, sub header) per column

    for key, label, col_map in datasets:
        frame = detailed_results.get(key, pd.DataFrame())
        if frame.empty:
            continue
        frame = frame.drop_duplicates(subset=["model_name"], keep="first")
        for src, metric in col_map.items():
            if src not in frame.columns:
                continue
            flat_name = f"{label} | {metric}"
            column_tuples.append((label, metric))
            scores = frame[["model_name", src]].rename(
                columns={"model_name": "Model", src: flat_name}
            )
            combined = combined.merge(scores, on="Model", how="left")

    combined = combined.round(4)

    # Without an explicit order, sort by the first metric (backward compat).
    if not model_order:
        numeric = combined.select_dtypes(include=["number"]).columns.tolist()
        if numeric:
            combined = combined.sort_values(numeric[0], ascending=False, na_position="last")

    combined = combined.reset_index(drop=True)
    combined.insert(0, "#", range(1, len(combined) + 1))
    column_tuples.insert(0, ("", "#"))

    if use_multiindex:
        # Hierarchical columns so dataset headers render merged.
        combined.columns = pd.MultiIndex.from_tuples(column_tuples)
    return combined
def prepare_translit_leaderboard(df: pd.DataFrame) -> pd.DataFrame:
    """Build the transliteration summary leaderboard for display.

    Keeps only the translit metric columns, drops models without any
    translit scores, ranks the rest by their average, and renames columns
    to their display labels.
    """
    if df.empty:
        return df
    df = df.copy()

    # Render model names as markdown links when a URL is available.
    if "model_url" in df.columns:
        def _linkify(row):
            url = row.get("model_url")
            name = row["model_name"]
            return f"[{name}]({url})" if pd.notna(url) else name

        df["model_name"] = df.apply(_linkify, axis=1)

    # Only the transliteration metrics matter for this table.
    translit_cols = ["retrieval_translit_top20", "msmarco_translit_top10"]
    available_cols = [c for c in translit_cols if c in df.columns]
    if not available_cols:
        return pd.DataFrame()

    # Drop models with no translit scores at all.
    df = df.dropna(subset=available_cols, how="all")
    if df.empty:
        return pd.DataFrame()

    # Average, rank, and order best-first.
    df["average"] = df[available_cols].mean(axis=1).round(4)
    df = df.sort_values("average", ascending=False).reset_index(drop=True)
    df.insert(0, "Rank", range(1, len(df) + 1))

    # Restrict to the display columns; model_size is optional.
    wanted = ["Rank", "model_name"]
    if "model_size" in df.columns:
        wanted.append("model_size")
    wanted += available_cols + ["average"]
    df = df[[c for c in wanted if c in df.columns]].round(4)

    # Show "-" for unknown or blank model sizes.
    if "model_size" in df.columns:
        df["model_size"] = df["model_size"].fillna("-").replace("", "-")

    return df.rename(columns={**TRANSLIT_COLUMN_LABELS, "average": "Average"})
def prepare_translit_detailed(detailed_results: dict, model_order: list = None, use_multiindex: bool = True) -> pd.DataFrame:
    """Assemble the combined transliteration detailed leaderboard.

    Args:
        detailed_results: Mapping with 'retrieval_translit' and
            'msmarco_translit' per-model results DataFrames.
        model_order: Optional explicit row order (model names). Models present
            in the results but missing from this list are appended,
            alphabetically sorted.
        use_multiindex: When True, columns become a two-level MultiIndex
            (dataset, metric) so HTML/Gradio can render merged headers;
            when False, flat "Dataset | Metric" names are kept.

    Returns:
        pd.DataFrame: One row per model with per-dataset metric columns.
    """
    # (dataset_key, display_label, {source_column: display_metric_name})
    datasets = [
        ("retrieval_translit", "Retrieval", {
            "top1 within document": "Top-1 Doc",
            "top3 within document": "Top-3 Doc",
            "top5 within document": "Top-5 Doc",
            "top20 group mean macro": "Top-20 Type",
            "top20 all": "Top-20 All",
        }),
        ("msmarco_translit", "MS MARCO", {
            "reranking_mrr": "Rerank MRR",
            "retrieval_mrr": "Retr. MRR",
            "retrieval_top5_accuracy": "Top-5",
            "retrieval_top10_accuracy": "Top-10",
        }),
    ]

    # Union of every model name seen in any dataset.
    models = set()
    for key, _, _ in datasets:
        frame = detailed_results.get(key, pd.DataFrame())
        if not frame.empty and "model_name" in frame.columns:
            models.update(frame["model_name"].unique())
    if not models:
        return pd.DataFrame()

    # Honor the caller-supplied ordering; models it doesn't mention go last.
    if model_order:
        ordered = [m for m in model_order if m in models]
        ordered += sorted(m for m in models if m not in ordered)
    else:
        ordered = sorted(models)

    # Build the combined table one metric column at a time via left merges.
    combined = pd.DataFrame({"Model": ordered})
    column_tuples = [("", "Model")]  # (top header, sub header) per column

    for key, label, col_map in datasets:
        frame = detailed_results.get(key, pd.DataFrame())
        if frame.empty:
            continue
        frame = frame.drop_duplicates(subset=["model_name"], keep="first")
        for src, metric in col_map.items():
            if src not in frame.columns:
                continue
            flat_name = f"{label} | {metric}"
            column_tuples.append((label, metric))
            scores = frame[["model_name", src]].rename(
                columns={"model_name": "Model", src: flat_name}
            )
            combined = combined.merge(scores, on="Model", how="left")

    combined = combined.round(4)

    # Without an explicit order, sort by the first metric (backward compat).
    if not model_order:
        numeric = combined.select_dtypes(include=["number"]).columns.tolist()
        if numeric:
            combined = combined.sort_values(numeric[0], ascending=False, na_position="last")

    combined = combined.reset_index(drop=True)
    combined.insert(0, "#", range(1, len(combined) + 1))
    column_tuples.insert(0, ("", "#"))

    if use_multiindex:
        # Hierarchical columns so dataset headers render merged.
        combined.columns = pd.MultiIndex.from_tuples(column_tuples)
    return combined