File size: 12,168 Bytes
c5f9df5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
import pandas as pd

# Display headers for the main (native-script) leaderboard, keyed by the raw
# result column name. prepare_leaderboard() passes this mapping to
# DataFrame.rename, so keys absent from the frame are simply ignored.
COLUMN_LABELS = {
    "model_name": "Model",
    "model_size": "Size",
    "mteb_avg": "MTEB",
    "sts_spearman": "STS",
    "retrieval_top20": "Retrieval",
    "msmarco_top10": "MS MARCO",
}

# Display headers for the transliteration leaderboard, keyed by the raw result
# column name. prepare_translit_leaderboard() passes this mapping to
# DataFrame.rename. The translit metrics reuse the "Retrieval" / "MS MARCO"
# headers of the main table.
TRANSLIT_COLUMN_LABELS = {
    "model_name": "Model",
    "model_size": "Size",
    "retrieval_translit_top20": "Retrieval",
    "msmarco_translit_top10": "MS MARCO",
}

# Metrics used for computing overall average (native script only).
# The list order is also the column order of the score columns in the table
# built by prepare_leaderboard().
SCORE_COLS = ["mteb_avg", "sts_spearman", "retrieval_top20", "msmarco_top10"]


def prepare_leaderboard(df: pd.DataFrame) -> pd.DataFrame:
    """Prepare embedding benchmark leaderboard from raw results DataFrame.

    The input is never modified; all work happens on a copy.

    Args:
        df: Raw results, one row per model. Recognised columns are
            ``model_name``, ``model_url``, ``model_size`` and any of the
            metrics listed in ``SCORE_COLS``; every one of them is optional.

    Returns:
        pd.DataFrame: Rows sorted by the mean of the available ``SCORE_COLS``
        metrics (descending), with a 1-based ``Rank`` column first, numeric
        values rounded to 4 decimals and columns renamed via
        ``COLUMN_LABELS`` (``average`` becomes ``Average``). An empty input
        is returned unchanged.
    """
    if df.empty:
        return df

    df = df.copy()

    # Format model_name as a markdown hyperlink when a usable URL exists.
    # Blank URLs are treated like missing ones so we never emit "[name]()".
    if "model_url" in df.columns and "model_name" in df.columns:
        df["model_name"] = [
            f"[{name}]({url})" if pd.notna(url) and str(url).strip() else name
            for name, url in zip(df["model_name"], df["model_url"])
        ]

    # Calculate overall average (only native script metrics, exclude translit)
    available_cols = [c for c in SCORE_COLS if c in df.columns]
    if available_cols:
        df["average"] = df[available_cols].mean(axis=1).round(4)

    # Sort by average (also honours a pre-existing "average" column)
    if "average" in df.columns:
        df = df.sort_values(by="average", ascending=False).reset_index(drop=True)

    df.insert(0, "Rank", range(1, len(df) + 1))

    # Select only main leaderboard columns (exclude translit); include
    # model_size if available. The explicit copy makes the selection an
    # independent frame, so the assignment below is not a chained assignment
    # on a slice (SettingWithCopyWarning on pandas < 3).
    size_col = ["model_size"] if "model_size" in df.columns else []
    display_cols = ["Rank", "model_name"] + size_col + available_cols + ["average"]
    df = df[[c for c in display_cols if c in df.columns]].copy()

    # Replace missing model_size with "-"
    if "model_size" in df.columns:
        df["model_size"] = df["model_size"].fillna("-").replace("", "-")

    # Round numeric columns
    df = df.round(4)

    # Rename columns for display
    return df.rename(columns={**COLUMN_LABELS, "average": "Average"})


def prepare_detailed_leaderboards(detailed_results: dict, model_order: list = None, use_multiindex: bool = True) -> pd.DataFrame:
    """Prepare a single combined detailed leaderboard with hierarchical columns.

    Datasets that are missing, empty, or lack a ``model_name`` column are
    skipped. Rows without a model name are ignored, and only the first row per
    model is used within each dataset.

    Args:
        detailed_results: Dict with DataFrames from ModelHandler.get_detailed_results()
        model_order: Optional list of model names in desired order. If provided, models will be
                    displayed in this order instead of being sorted independently.
                    Unknown names are ignored, repeated names are used once, and
                    models not listed are appended alphabetically.
        use_multiindex: If True, return DataFrame with MultiIndex columns for proper
                        hierarchical display (merged headers in HTML/Gradio).
                        If False, use flat "Category | Metric" column names.

    Returns:
        pd.DataFrame: Combined table with dataset names as hierarchical column headers,
        a leading 1-based "#" column and values rounded to 4 decimals. An empty
        DataFrame is returned when no dataset contributes any model.
    """
    # Dataset configurations: (dataset_key, dataset_label, column_mappings)
    datasets = [
        ("mteb", "MTEB", {
            "FloresBitextMining_devtest": "Flores",
            "NTREXBitextMining_test": "NTREX",
            "Tatoeba_test": "Tatoeba",
            "MassiveIntentClassification_test": "Intent",
            "MassiveScenarioClassification_test": "Scenario",
            "SIB200Classification_test": "SIB200 Cls",
            "SIB200ClusteringS2S_test": "SIB200 Clust",
            "ArmenianParaphrasePC_test": "Paraphrase",
            "BelebeleRetrieval_test": "Belebele",
        }),
        ("sts", "STS", {
            "Pearson_correlation": "Pearson",
            "Spearman_correlation": "Spearman",
        }),
        ("retrieval", "Retrieval", {
            "top1 within document": "Top-1 Doc",
            "top3 within document": "Top-3 Doc",
            "top5 within document": "Top-5 Doc",
            "top20 group mean macro": "Top-20 Type",
            "top20 all": "Top-20 All",
        }),
        ("msmarco", "MS MARCO", {
            "reranking_mrr": "Rerank MRR",
            "retrieval_mrr": "Retr. MRR",
            "retrieval_top5_accuracy": "Top-5",
            "retrieval_top10_accuracy": "Top-10",
        }),
    ]

    # Normalise each usable dataset once: it must be joinable on model_name,
    # unnamed rows cannot be displayed, and only the first row per model counts.
    frames = {}
    for key, _, _ in datasets:
        raw = detailed_results.get(key)
        if raw is None or raw.empty or "model_name" not in raw.columns:
            continue
        cleaned = raw.dropna(subset=["model_name"]).drop_duplicates(subset=["model_name"], keep="first")
        if not cleaned.empty:
            frames[key] = cleaned

    # Collect all models from all datasets
    all_models = set()
    for frame in frames.values():
        all_models.update(frame["model_name"])

    if not all_models:
        return pd.DataFrame()

    # Use provided model_order if available, otherwise sort alphabetically
    requested = list(model_order) if model_order is not None else []
    if requested:
        # Keep only known models, preserving order and dropping repeats
        ordered_models = list(dict.fromkeys(m for m in requested if m in all_models))
        # Add any remaining models not in model_order (in case they're new)
        all_models_ordered = ordered_models + sorted(all_models.difference(ordered_models))
    else:
        all_models_ordered = sorted(all_models)

    # Build combined dataframe with flat columns first
    combined = pd.DataFrame({"Model": all_models_ordered})
    column_tuples = [("", "Model")]  # For MultiIndex: (level1, level2)

    for key, label, col_map in datasets:
        frame = frames.get(key)
        if frame is None:
            continue

        # Metrics of this dataset that are actually present, in configured order
        present = {orig: new for orig, new in col_map.items() if orig in frame.columns}
        if not present:
            continue

        column_tuples.extend((label, new) for new in present.values())
        flat_names = {orig: f"{label} | {new}" for orig, new in present.items()}
        # One left merge per dataset; model names are unique on the right side,
        # so the row count and row order of `combined` are preserved.
        combined = combined.merge(
            frame[["model_name", *present]].rename(columns={"model_name": "Model", **flat_names}),
            on="Model",
            how="left",
        )

    # Round numeric columns
    combined = combined.round(4)

    # If no model_order was provided, sort by first numeric column for backward compatibility
    if not requested:
        numeric_cols = combined.select_dtypes(include=["number"]).columns.tolist()
        if numeric_cols:
            combined = combined.sort_values(by=numeric_cols[0], ascending=False, na_position="last")

    # Always reset index to ensure proper row ordering
    combined = combined.reset_index(drop=True)

    combined.insert(0, "#", range(1, len(combined) + 1))
    column_tuples.insert(0, ("", "#"))

    if use_multiindex:
        # Convert to MultiIndex columns for proper hierarchical display
        combined.columns = pd.MultiIndex.from_tuples(column_tuples)

    return combined


def prepare_translit_leaderboard(df: pd.DataFrame) -> pd.DataFrame:
    """Prepare translit summary leaderboard from raw results DataFrame.

    The input is never modified; all work happens on a copy.

    Args:
        df: Raw results, one row per model. Recognised columns are
            ``model_name``, ``model_url``, ``model_size``,
            ``retrieval_translit_top20`` and ``msmarco_translit_top10``.

    Returns:
        pd.DataFrame: Models that have at least one translit score, sorted by
        the mean of the available translit metrics (descending), with a
        1-based ``Rank`` column first, numeric values rounded to 4 decimals
        and columns renamed via ``TRANSLIT_COLUMN_LABELS`` (``average``
        becomes ``Average``). An empty input is returned unchanged; a fresh
        empty DataFrame is returned when no translit column or no scored
        model exists.
    """
    if df.empty:
        return df

    df = df.copy()

    # Format model_name as a markdown hyperlink when a usable URL exists.
    # Blank URLs are treated like missing ones so we never emit "[name]()".
    if "model_url" in df.columns and "model_name" in df.columns:
        df["model_name"] = [
            f"[{name}]({url})" if pd.notna(url) and str(url).strip() else name
            for name, url in zip(df["model_name"], df["model_url"])
        ]

    # Only include translit columns
    translit_cols = ["retrieval_translit_top20", "msmarco_translit_top10"]
    available_cols = [c for c in translit_cols if c in df.columns]

    if not available_cols:
        return pd.DataFrame()

    # Filter to models that have translit data. The explicit copy makes the
    # filtered frame independent, so adding "average" below is not a chained
    # assignment on a slice (SettingWithCopyWarning on pandas < 3).
    df = df.dropna(subset=available_cols, how="all").copy()

    if df.empty:
        return pd.DataFrame()

    # Calculate average
    df["average"] = df[available_cols].mean(axis=1).round(4)

    # Sort by average
    df = df.sort_values(by="average", ascending=False).reset_index(drop=True)
    df.insert(0, "Rank", range(1, len(df) + 1))

    # Select columns - include model_size if available
    size_col = ["model_size"] if "model_size" in df.columns else []
    display_cols = ["Rank", "model_name"] + size_col + available_cols + ["average"]
    df = df[[c for c in display_cols if c in df.columns]].round(4)

    # Replace missing model_size with "-" if it's in the data
    if "model_size" in df.columns:
        df["model_size"] = df["model_size"].fillna("-").replace("", "-")

    return df.rename(columns={**TRANSLIT_COLUMN_LABELS, "average": "Average"})


def prepare_translit_detailed(detailed_results: dict, model_order: list = None, use_multiindex: bool = True) -> pd.DataFrame:
    """Prepare a single combined translit detailed leaderboard with hierarchical columns.

    Datasets that are missing, empty, or lack a ``model_name`` column are
    skipped. Rows without a model name are ignored, and only the first row per
    model is used within each dataset.

    Args:
        detailed_results: Dict with 'retrieval_translit' and 'msmarco_translit' DataFrames
        model_order: Optional list of model names in desired order. If provided, models will be
                    displayed in this order instead of being sorted independently.
                    Unknown names are ignored, repeated names are used once, and
                    models not listed are appended alphabetically.
        use_multiindex: If True, return DataFrame with MultiIndex columns for proper
                        hierarchical display (merged headers in HTML/Gradio).
                        If False, use flat "Category | Metric" column names.

    Returns:
        pd.DataFrame: Combined table with dataset names as hierarchical column headers,
        a leading 1-based "#" column and values rounded to 4 decimals. An empty
        DataFrame is returned when no dataset contributes any model.
    """
    datasets = [
        ("retrieval_translit", "Retrieval", {
            "top1 within document": "Top-1 Doc",
            "top3 within document": "Top-3 Doc",
            "top5 within document": "Top-5 Doc",
            "top20 group mean macro": "Top-20 Type",
            "top20 all": "Top-20 All",
        }),
        ("msmarco_translit", "MS MARCO", {
            "reranking_mrr": "Rerank MRR",
            "retrieval_mrr": "Retr. MRR",
            "retrieval_top5_accuracy": "Top-5",
            "retrieval_top10_accuracy": "Top-10",
        }),
    ]

    # Normalise each usable dataset once: it must be joinable on model_name,
    # unnamed rows cannot be displayed, and only the first row per model counts.
    frames = {}
    for key, _, _ in datasets:
        raw = detailed_results.get(key)
        if raw is None or raw.empty or "model_name" not in raw.columns:
            continue
        cleaned = raw.dropna(subset=["model_name"]).drop_duplicates(subset=["model_name"], keep="first")
        if not cleaned.empty:
            frames[key] = cleaned

    # Collect all models from all datasets
    all_models = set()
    for frame in frames.values():
        all_models.update(frame["model_name"])

    if not all_models:
        return pd.DataFrame()

    # Use provided model_order if available, otherwise sort alphabetically
    requested = list(model_order) if model_order is not None else []
    if requested:
        # Keep only known models, preserving order and dropping repeats
        ordered_models = list(dict.fromkeys(m for m in requested if m in all_models))
        # Add any remaining models not in model_order (in case they're new)
        all_models_ordered = ordered_models + sorted(all_models.difference(ordered_models))
    else:
        all_models_ordered = sorted(all_models)

    # Build combined dataframe
    combined = pd.DataFrame({"Model": all_models_ordered})
    column_tuples = [("", "Model")]  # For MultiIndex: (level1, level2)

    for key, label, col_map in datasets:
        frame = frames.get(key)
        if frame is None:
            continue

        # Metrics of this dataset that are actually present, in configured order
        present = {orig: new for orig, new in col_map.items() if orig in frame.columns}
        if not present:
            continue

        column_tuples.extend((label, new) for new in present.values())
        flat_names = {orig: f"{label} | {new}" for orig, new in present.items()}
        # One left merge per dataset; model names are unique on the right side,
        # so the row count and row order of `combined` are preserved.
        combined = combined.merge(
            frame[["model_name", *present]].rename(columns={"model_name": "Model", **flat_names}),
            on="Model",
            how="left",
        )

    # Round numeric columns
    combined = combined.round(4)

    # If no model_order was provided, sort by first numeric column for backward compatibility
    if not requested:
        numeric_cols = combined.select_dtypes(include=["number"]).columns.tolist()
        if numeric_cols:
            combined = combined.sort_values(by=numeric_cols[0], ascending=False, na_position="last")

    # Always reset index to ensure proper row ordering
    combined = combined.reset_index(drop=True)

    combined.insert(0, "#", range(1, len(combined) + 1))
    column_tuples.insert(0, ("", "#"))

    if use_multiindex:
        # Convert to MultiIndex columns for proper hierarchical display
        combined.columns = pd.MultiIndex.from_tuples(column_tuples)

    return combined