Spaces:
Running
Running
| import pandas as pd | |
# Maps raw result-column names to display labels for the main
# (native-script) leaderboard.
COLUMN_LABELS = {
    "model_name": "Model",
    "model_size": "Size",
    "mteb_avg": "MTEB",
    "sts_spearman": "STS",
    "retrieval_top20": "Retrieval",
    "msmarco_top10": "MS MARCO",
}

# Maps raw result-column names to display labels for the transliteration
# summary leaderboard.
TRANSLIT_COLUMN_LABELS = {
    "model_name": "Model",
    "model_size": "Size",
    "retrieval_translit_top20": "Retrieval",
    "msmarco_translit_top10": "MS MARCO",
}

# Metrics used for computing overall average (native script only)
SCORE_COLS = ["mteb_avg", "sts_spearman", "retrieval_top20", "msmarco_top10"]


def prepare_leaderboard(df: pd.DataFrame) -> pd.DataFrame:
    """Prepare embedding benchmark leaderboard from raw results DataFrame.

    Args:
        df: Raw results, one row per model. Recognized columns:
            "model_name", optional "model_url" (turns the name into a
            Markdown link), optional "model_size", and any subset of
            the native-script metric columns in SCORE_COLS.

    Returns:
        pd.DataFrame: Display-ready table with columns
        Rank | Model | [Size] | <metrics> | Average, sorted by Average
        descending when metric columns are present. Empty input is
        returned unchanged.
    """
    if df.empty:
        return df
    df = df.copy()

    # Render model_name as a Markdown hyperlink when a URL is available.
    if "model_url" in df.columns:
        df["model_name"] = df.apply(
            lambda row: f"[{row['model_name']}]({row['model_url']})"
            if pd.notna(row.get("model_url"))
            else row["model_name"],
            axis=1,
        )

    # Overall average over native-script metrics only (translit excluded).
    available_cols = [c for c in SCORE_COLS if c in df.columns]
    if available_cols:
        df["average"] = df[available_cols].mean(axis=1).round(4)
        # Best models first.
        df = df.sort_values(by="average", ascending=False).reset_index(drop=True)

    # Robustness fix: always provide a Rank column, even when no metric
    # columns exist (previously Rank was skipped in that edge case).
    df = df.reset_index(drop=True)
    df.insert(0, "Rank", range(1, len(df) + 1))

    # Keep only the display columns (translit excluded); include
    # model_size when available.
    size_col = ["model_size"] if "model_size" in df.columns else []
    display_cols = ["Rank", "model_name"] + size_col + available_cols + ["average"]
    df = df[[c for c in display_cols if c in df.columns]]

    # Show "-" for missing/blank model sizes.
    if "model_size" in df.columns:
        df["model_size"] = df["model_size"].fillna("-").replace("", "-")

    # Round numeric columns for display (non-numeric columns untouched).
    df = df.round(4)

    # Human-friendly column headers.
    return df.rename(columns={**COLUMN_LABELS, "average": "Average"})
def prepare_detailed_leaderboards(detailed_results: dict, model_order: list = None, use_multiindex: bool = True) -> pd.DataFrame:
    """Prepare a single combined detailed leaderboard with hierarchical columns.

    Args:
        detailed_results: Dict with DataFrames from ModelHandler.get_detailed_results()
        model_order: Optional list of model names in desired order. If provided, models will be
            displayed in this order instead of being sorted independently.
        use_multiindex: If True, return DataFrame with MultiIndex columns for proper
            hierarchical display (merged headers in HTML/Gradio).
            If False, use flat "Category | Metric" column names.

    Returns:
        pd.DataFrame: Combined table with dataset names as hierarchical column headers
    """
    # Dataset configurations: (dataset_key, dataset_label, column_mappings)
    datasets = [
        ("mteb", "MTEB", {
            "FloresBitextMining_devtest": "Flores",
            "NTREXBitextMining_test": "NTREX",
            "Tatoeba_test": "Tatoeba",
            "MassiveIntentClassification_test": "Intent",
            "MassiveScenarioClassification_test": "Scenario",
            "SIB200Classification_test": "SIB200 Cls",
            "SIB200ClusteringS2S_test": "SIB200 Clust",
            "ArmenianParaphrasePC_test": "Paraphrase",
            "BelebeleRetrieval_test": "Belebele",
        }),
        ("sts", "STS", {
            "Pearson_correlation": "Pearson",
            "Spearman_correlation": "Spearman",
        }),
        ("retrieval", "Retrieval", {
            "top1 within document": "Top-1 Doc",
            "top3 within document": "Top-3 Doc",
            "top5 within document": "Top-5 Doc",
            "top20 group mean macro": "Top-20 Type",
            "top20 all": "Top-20 All",
        }),
        ("msmarco", "MS MARCO", {
            "reranking_mrr": "Rerank MRR",
            "retrieval_mrr": "Retr. MRR",
            "retrieval_top5_accuracy": "Top-5",
            "retrieval_top10_accuracy": "Top-10",
        }),
    ]

    # Collect all models from all datasets.
    all_models = set()
    for key, _, _ in datasets:
        df = detailed_results.get(key, pd.DataFrame())
        if not df.empty and "model_name" in df.columns:
            all_models.update(df["model_name"].unique())
    if not all_models:
        return pd.DataFrame()

    # Use provided model_order if available, otherwise sort alphabetically.
    if model_order:
        # Keep only models that actually exist in detailed_results …
        ordered_models = [m for m in model_order if m in all_models]
        # … then append any new models not covered by model_order.
        remaining = sorted(m for m in all_models if m not in set(ordered_models))
        all_models_ordered = ordered_models + remaining
    else:
        all_models_ordered = sorted(all_models)

    # Build the combined frame with flat "Category | Metric" columns first.
    combined = pd.DataFrame({"Model": all_models_ordered})
    column_tuples = [("", "Model")]  # For MultiIndex: (level1, level2)
    for key, label, col_map in datasets:
        df = detailed_results.get(key, pd.DataFrame())
        if df.empty:
            continue
        df = df.drop_duplicates(subset=["model_name"], keep="first")
        # Perf fix: select and rename every available metric column at once
        # and merge a single time per dataset (the original merged once per
        # column, copying the whole combined frame each time).
        present = [(orig, new) for orig, new in col_map.items() if orig in df.columns]
        if not present:
            continue
        rename_map = {"model_name": "Model"}
        for orig_col, new_col in present:
            rename_map[orig_col] = f"{label} | {new_col}"
            column_tuples.append((label, new_col))
        subset = df[["model_name"] + [orig for orig, _ in present]].rename(columns=rename_map)
        combined = combined.merge(subset, on="Model", how="left")

    # Round numeric columns.
    combined = combined.round(4)

    # If no model_order was provided, sort by the first numeric column for
    # backward compatibility.
    if not model_order:
        numeric_cols = combined.select_dtypes(include=["number"]).columns.tolist()
        if numeric_cols:
            combined = combined.sort_values(by=numeric_cols[0], ascending=False, na_position="last")

    # Always reset index to ensure proper row ordering, then add row numbers.
    combined = combined.reset_index(drop=True)
    combined.insert(0, "#", range(1, len(combined) + 1))
    column_tuples.insert(0, ("", "#"))

    if use_multiindex:
        # Convert to MultiIndex columns for proper hierarchical display.
        combined.columns = pd.MultiIndex.from_tuples(column_tuples)
    return combined
def prepare_translit_leaderboard(df: pd.DataFrame) -> pd.DataFrame:
    """Prepare translit summary leaderboard from raw results DataFrame."""
    if df.empty:
        return df
    data = df.copy()

    # Turn model names into Markdown links when a URL is present.
    if "model_url" in data.columns:
        def _linkify(row):
            if pd.notna(row.get("model_url")):
                return f"[{row['model_name']}]({row['model_url']})"
            return row["model_name"]

        data["model_name"] = data.apply(_linkify, axis=1)

    # Restrict to transliteration metrics; bail out if none exist.
    translit_cols = ["retrieval_translit_top20", "msmarco_translit_top10"]
    available_cols = [c for c in translit_cols if c in data.columns]
    if not available_cols:
        return pd.DataFrame()

    # Drop models with no translit data at all.
    data = data.dropna(subset=available_cols, how="all")
    if data.empty:
        return pd.DataFrame()

    # Average the translit metrics and rank best-first.
    data["average"] = data[available_cols].mean(axis=1).round(4)
    data = data.sort_values(by="average", ascending=False).reset_index(drop=True)
    data.insert(0, "Rank", range(1, len(data) + 1))

    # Assemble display columns (model_size only when present).
    wanted = ["Rank", "model_name"]
    if "model_size" in data.columns:
        wanted.append("model_size")
    wanted += available_cols + ["average"]
    data = data[[c for c in wanted if c in data.columns]].round(4)

    # Missing/blank sizes render as "-".
    if "model_size" in data.columns:
        data["model_size"] = data["model_size"].fillna("-").replace("", "-")

    return data.rename(columns={**TRANSLIT_COLUMN_LABELS, "average": "Average"})
def prepare_translit_detailed(detailed_results: dict, model_order: list = None, use_multiindex: bool = True) -> pd.DataFrame:
    """Prepare a single combined translit detailed leaderboard with hierarchical columns.

    Args:
        detailed_results: Dict with 'retrieval_translit' and 'msmarco_translit' DataFrames
        model_order: Optional list of model names in desired order. If provided, models will be
            displayed in this order instead of being sorted independently.
        use_multiindex: If True, return DataFrame with MultiIndex columns for proper
            hierarchical display (merged headers in HTML/Gradio).
            If False, use flat "Category | Metric" column names.

    Returns:
        pd.DataFrame: Combined table with dataset names as hierarchical column headers
    """
    # Dataset configurations: (dataset_key, dataset_label, column_mappings)
    datasets = [
        ("retrieval_translit", "Retrieval", {
            "top1 within document": "Top-1 Doc",
            "top3 within document": "Top-3 Doc",
            "top5 within document": "Top-5 Doc",
            "top20 group mean macro": "Top-20 Type",
            "top20 all": "Top-20 All",
        }),
        ("msmarco_translit", "MS MARCO", {
            "reranking_mrr": "Rerank MRR",
            "retrieval_mrr": "Retr. MRR",
            "retrieval_top5_accuracy": "Top-5",
            "retrieval_top10_accuracy": "Top-10",
        }),
    ]

    # Collect all models from all datasets.
    all_models = set()
    for key, _, _ in datasets:
        df = detailed_results.get(key, pd.DataFrame())
        if not df.empty and "model_name" in df.columns:
            all_models.update(df["model_name"].unique())
    if not all_models:
        return pd.DataFrame()

    # Use provided model_order if available, otherwise sort alphabetically.
    if model_order:
        # Keep only models that actually exist in detailed_results …
        ordered_models = [m for m in model_order if m in all_models]
        # … then append any new models not covered by model_order.
        remaining = sorted(m for m in all_models if m not in set(ordered_models))
        all_models_ordered = ordered_models + remaining
    else:
        all_models_ordered = sorted(all_models)

    # Build the combined frame with flat "Category | Metric" columns first.
    combined = pd.DataFrame({"Model": all_models_ordered})
    column_tuples = [("", "Model")]  # For MultiIndex: (level1, level2)
    for key, label, col_map in datasets:
        df = detailed_results.get(key, pd.DataFrame())
        if df.empty:
            continue
        df = df.drop_duplicates(subset=["model_name"], keep="first")
        # Perf fix (kept consistent with prepare_detailed_leaderboards):
        # select and rename all available metric columns at once and merge a
        # single time per dataset instead of once per column.
        present = [(orig, new) for orig, new in col_map.items() if orig in df.columns]
        if not present:
            continue
        rename_map = {"model_name": "Model"}
        for orig_col, new_col in present:
            rename_map[orig_col] = f"{label} | {new_col}"
            column_tuples.append((label, new_col))
        subset = df[["model_name"] + [orig for orig, _ in present]].rename(columns=rename_map)
        combined = combined.merge(subset, on="Model", how="left")

    # Round numeric columns.
    combined = combined.round(4)

    # If no model_order was provided, sort by the first numeric column for
    # backward compatibility.
    if not model_order:
        numeric_cols = combined.select_dtypes(include=["number"]).columns.tolist()
        if numeric_cols:
            combined = combined.sort_values(by=numeric_cols[0], ascending=False, na_position="last")

    # Always reset index to ensure proper row ordering, then add row numbers.
    combined = combined.reset_index(drop=True)
    combined.insert(0, "#", range(1, len(combined) + 1))
    column_tuples.insert(0, ("", "#"))

    if use_multiindex:
        # Convert to MultiIndex columns for proper hierarchical display.
        combined.columns = pd.MultiIndex.from_tuples(column_tuples)
    return combined