# ArmBench-TextEmbed / data_handler.py
# Initial release by Zaruhi (commit c5f9df5)
import pandas as pd
# Display labels for the main (native-script) leaderboard columns.
COLUMN_LABELS = {
    "model_name": "Model",
    "model_size": "Size",
    "mteb_avg": "MTEB",
    "sts_spearman": "STS",
    "retrieval_top20": "Retrieval",
    "msmarco_top10": "MS MARCO",
}

# Display labels for the transliteration summary leaderboard columns.
TRANSLIT_COLUMN_LABELS = {
    "model_name": "Model",
    "model_size": "Size",
    "retrieval_translit_top20": "Retrieval",
    "msmarco_translit_top10": "MS MARCO",
}

# Metrics used for computing overall average (native script only)
SCORE_COLS = ["mteb_avg", "sts_spearman", "retrieval_top20", "msmarco_top10"]


def prepare_leaderboard(df: pd.DataFrame) -> pd.DataFrame:
    """Build the main (native-script) leaderboard table for display.

    Links model names to their URLs, averages whichever SCORE_COLS metrics
    are present, ranks models by that average, and renames columns to their
    display labels. Transliteration metrics are deliberately excluded.
    """
    if df.empty:
        return df
    df = df.copy()

    # Render model names as markdown links when a URL is available.
    if "model_url" in df.columns:
        def _linkify(row):
            url = row.get("model_url")
            name = row["model_name"]
            return f"[{name}]({url})" if pd.notna(url) else name

        df["model_name"] = df.apply(_linkify, axis=1)

    # Overall average over the native-script metrics that exist in the data.
    available_cols = [c for c in SCORE_COLS if c in df.columns]
    if available_cols:
        df["average"] = df[available_cols].mean(axis=1).round(4)

    # Rank models from best to worst average.
    if "average" in df.columns:
        df = df.sort_values("average", ascending=False).reset_index(drop=True)
        df.insert(0, "Rank", range(1, len(df) + 1))

    # Restrict to the display columns (translit metrics stay out).
    wanted = ["Rank", "model_name"]
    if "model_size" in df.columns:
        wanted.append("model_size")
    wanted += available_cols + ["average"]
    df = df[[c for c in wanted if c in df.columns]]

    # Show "-" for unknown or blank model sizes.
    if "model_size" in df.columns:
        df["model_size"] = df["model_size"].fillna("-").replace("", "-")

    df = df.round(4)
    return df.rename(columns={**COLUMN_LABELS, "average": "Average"})
def prepare_detailed_leaderboards(detailed_results: dict, model_order: list = None, use_multiindex: bool = True) -> pd.DataFrame:
    """Assemble one combined detailed leaderboard across all benchmark datasets.

    Args:
        detailed_results: Mapping of dataset key -> per-model results DataFrame
            (as produced by ModelHandler.get_detailed_results()).
        model_order: Optional explicit row order (model names). Models present
            in the results but missing from this list are appended,
            alphabetically sorted.
        use_multiindex: When True, columns become a two-level MultiIndex
            (dataset, metric) so HTML/Gradio can render merged headers;
            when False, flat "Dataset | Metric" names are kept.

    Returns:
        pd.DataFrame: One row per model with per-dataset metric columns.
    """
    # (dataset_key, display_label, {source_column: display_metric_name})
    datasets = [
        ("mteb", "MTEB", {
            "FloresBitextMining_devtest": "Flores",
            "NTREXBitextMining_test": "NTREX",
            "Tatoeba_test": "Tatoeba",
            "MassiveIntentClassification_test": "Intent",
            "MassiveScenarioClassification_test": "Scenario",
            "SIB200Classification_test": "SIB200 Cls",
            "SIB200ClusteringS2S_test": "SIB200 Clust",
            "ArmenianParaphrasePC_test": "Paraphrase",
            "BelebeleRetrieval_test": "Belebele",
        }),
        ("sts", "STS", {
            "Pearson_correlation": "Pearson",
            "Spearman_correlation": "Spearman",
        }),
        ("retrieval", "Retrieval", {
            "top1 within document": "Top-1 Doc",
            "top3 within document": "Top-3 Doc",
            "top5 within document": "Top-5 Doc",
            "top20 group mean macro": "Top-20 Type",
            "top20 all": "Top-20 All",
        }),
        ("msmarco", "MS MARCO", {
            "reranking_mrr": "Rerank MRR",
            "retrieval_mrr": "Retr. MRR",
            "retrieval_top5_accuracy": "Top-5",
            "retrieval_top10_accuracy": "Top-10",
        }),
    ]

    # Union of every model name seen in any dataset.
    models = set()
    for key, _, _ in datasets:
        frame = detailed_results.get(key, pd.DataFrame())
        if not frame.empty and "model_name" in frame.columns:
            models.update(frame["model_name"].unique())
    if not models:
        return pd.DataFrame()

    # Honor the caller-supplied ordering; models it doesn't mention go last.
    if model_order:
        ordered = [m for m in model_order if m in models]
        ordered += sorted(m for m in models if m not in ordered)
    else:
        ordered = sorted(models)

    # Build the combined table one metric column at a time via left merges.
    combined = pd.DataFrame({"Model": ordered})
    column_tuples = [("", "Model")]  # (top header, sub header) per column

    for key, label, col_map in datasets:
        frame = detailed_results.get(key, pd.DataFrame())
        if frame.empty:
            continue
        frame = frame.drop_duplicates(subset=["model_name"], keep="first")
        for src, metric in col_map.items():
            if src not in frame.columns:
                continue
            flat_name = f"{label} | {metric}"
            column_tuples.append((label, metric))
            scores = frame[["model_name", src]].rename(
                columns={"model_name": "Model", src: flat_name}
            )
            combined = combined.merge(scores, on="Model", how="left")

    combined = combined.round(4)

    # Without an explicit order, sort by the first metric (backward compat).
    if not model_order:
        numeric = combined.select_dtypes(include=["number"]).columns.tolist()
        if numeric:
            combined = combined.sort_values(numeric[0], ascending=False, na_position="last")

    combined = combined.reset_index(drop=True)
    combined.insert(0, "#", range(1, len(combined) + 1))
    column_tuples.insert(0, ("", "#"))

    if use_multiindex:
        # Hierarchical columns so dataset headers render merged.
        combined.columns = pd.MultiIndex.from_tuples(column_tuples)
    return combined
def prepare_translit_leaderboard(df: pd.DataFrame) -> pd.DataFrame:
    """Build the transliteration summary leaderboard for display.

    Keeps only the translit metric columns, drops models without any
    translit scores, ranks the rest by their average, and renames columns
    to their display labels.
    """
    if df.empty:
        return df
    df = df.copy()

    # Render model names as markdown links when a URL is available.
    if "model_url" in df.columns:
        def _linkify(row):
            url = row.get("model_url")
            name = row["model_name"]
            return f"[{name}]({url})" if pd.notna(url) else name

        df["model_name"] = df.apply(_linkify, axis=1)

    # Only the transliteration metrics matter for this table.
    translit_cols = ["retrieval_translit_top20", "msmarco_translit_top10"]
    available_cols = [c for c in translit_cols if c in df.columns]
    if not available_cols:
        return pd.DataFrame()

    # Drop models with no translit scores at all.
    df = df.dropna(subset=available_cols, how="all")
    if df.empty:
        return pd.DataFrame()

    # Average, rank, and order best-first.
    df["average"] = df[available_cols].mean(axis=1).round(4)
    df = df.sort_values("average", ascending=False).reset_index(drop=True)
    df.insert(0, "Rank", range(1, len(df) + 1))

    # Restrict to the display columns; model_size is optional.
    wanted = ["Rank", "model_name"]
    if "model_size" in df.columns:
        wanted.append("model_size")
    wanted += available_cols + ["average"]
    df = df[[c for c in wanted if c in df.columns]].round(4)

    # Show "-" for unknown or blank model sizes.
    if "model_size" in df.columns:
        df["model_size"] = df["model_size"].fillna("-").replace("", "-")

    return df.rename(columns={**TRANSLIT_COLUMN_LABELS, "average": "Average"})
def prepare_translit_detailed(detailed_results: dict, model_order: list = None, use_multiindex: bool = True) -> pd.DataFrame:
    """Assemble the combined transliteration detailed leaderboard.

    Args:
        detailed_results: Mapping with 'retrieval_translit' and
            'msmarco_translit' per-model results DataFrames.
        model_order: Optional explicit row order (model names). Models present
            in the results but missing from this list are appended,
            alphabetically sorted.
        use_multiindex: When True, columns become a two-level MultiIndex
            (dataset, metric) so HTML/Gradio can render merged headers;
            when False, flat "Dataset | Metric" names are kept.

    Returns:
        pd.DataFrame: One row per model with per-dataset metric columns.
    """
    # (dataset_key, display_label, {source_column: display_metric_name})
    datasets = [
        ("retrieval_translit", "Retrieval", {
            "top1 within document": "Top-1 Doc",
            "top3 within document": "Top-3 Doc",
            "top5 within document": "Top-5 Doc",
            "top20 group mean macro": "Top-20 Type",
            "top20 all": "Top-20 All",
        }),
        ("msmarco_translit", "MS MARCO", {
            "reranking_mrr": "Rerank MRR",
            "retrieval_mrr": "Retr. MRR",
            "retrieval_top5_accuracy": "Top-5",
            "retrieval_top10_accuracy": "Top-10",
        }),
    ]

    # Union of every model name seen in any dataset.
    models = set()
    for key, _, _ in datasets:
        frame = detailed_results.get(key, pd.DataFrame())
        if not frame.empty and "model_name" in frame.columns:
            models.update(frame["model_name"].unique())
    if not models:
        return pd.DataFrame()

    # Honor the caller-supplied ordering; models it doesn't mention go last.
    if model_order:
        ordered = [m for m in model_order if m in models]
        ordered += sorted(m for m in models if m not in ordered)
    else:
        ordered = sorted(models)

    # Build the combined table one metric column at a time via left merges.
    combined = pd.DataFrame({"Model": ordered})
    column_tuples = [("", "Model")]  # (top header, sub header) per column

    for key, label, col_map in datasets:
        frame = detailed_results.get(key, pd.DataFrame())
        if frame.empty:
            continue
        frame = frame.drop_duplicates(subset=["model_name"], keep="first")
        for src, metric in col_map.items():
            if src not in frame.columns:
                continue
            flat_name = f"{label} | {metric}"
            column_tuples.append((label, metric))
            scores = frame[["model_name", src]].rename(
                columns={"model_name": "Model", src: flat_name}
            )
            combined = combined.merge(scores, on="Model", how="left")

    combined = combined.round(4)

    # Without an explicit order, sort by the first metric (backward compat).
    if not model_order:
        numeric = combined.select_dtypes(include=["number"]).columns.tolist()
        if numeric:
            combined = combined.sort_values(numeric[0], ascending=False, na_position="last")

    combined = combined.reset_index(drop=True)
    combined.insert(0, "#", range(1, len(combined) + 1))
    column_tuples.insert(0, ("", "#"))

    if use_multiindex:
        # Hierarchical columns so dataset headers render merged.
        combined.columns = pd.MultiIndex.from_tuples(column_tuples)
    return combined