Spaces:
Running
Running
File size: 4,848 Bytes
19d93fe 4c1e130 19d93fe 4c1e130 0858809 19d93fe 9331159 baaa011 19d93fe ebac224 19d93fe baaa011 ebac224 4c1e130 19d93fe baaa011 19d93fe baaa011 19d93fe 4c1e130 19d93fe baaa011 19d93fe baaa011 19d93fe ebac224 19d93fe ebac224 19d93fe 4c1e130 ebac224 4c1e130 19d93fe 140b349 baaa011 140b349 19d93fe 140b349 4c1e130 ebac224 4c1e130 1045c52 19d93fe 4c1e130 9331159 4c1e130 9331159 4c1e130 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 | from git import Repo
import shutil
import os
import json
import pandas as pd
from .dataset_handler import VIDORE_V1_MTEB_NAMES, VIDORE_V2_MTEB_NAMES, get_datasets_nickname
class ModelHandler:
    """Load ViDoRe benchmark results from a local checkout of the MTEB results repository.

    ``model_infos`` maps each model folder name to a dict with two keys:
    ``"meta"`` (the parsed ``model_meta.json``, or ``{}`` when the file is
    absent) and ``"results"`` (dataset name -> parsed result JSON).
    """

    def __init__(self):
        # Filled by get_vidore_data(); empty until that method has been called.
        self.model_infos = {}

    @staticmethod
    def get_folders(dir_path: str) -> list:
        """Return the sorted names of the immediate sub-directories of ``dir_path``.

        Regular files are ignored. Raises whatever ``os.listdir`` raises
        (e.g. ``FileNotFoundError``) when ``dir_path`` cannot be listed.
        """
        return sorted(
            entry for entry in os.listdir(dir_path) if os.path.isdir(os.path.join(dir_path, entry))
        )

    @staticmethod
    def _read_json(path: str):
        """Parse the JSON file at ``path`` and return the decoded object."""
        # JSON is UTF-8 by specification; do not depend on the platform default encoding.
        with open(path, "r", encoding="utf-8") as f:
            return json.load(f)

    @classmethod
    def _first_revision_dir(cls, model_dir: str):
        """Return the path of the first revision folder of ``model_dir`` in sorted order.

        Returns ``None`` when ``model_dir`` contains no sub-directory, so the
        caller can skip the model instead of failing on an empty list.
        """
        revisions = cls.get_folders(model_dir)
        if not revisions:
            return None
        # NOTE: "first" is lexicographic order of the folder names, not commit chronology.
        return os.path.join(model_dir, revisions[0])

    @classmethod
    def _load_benchmark_results(cls, revision_dir: str, available_files, dataset_names) -> dict:
        """Load ``<name>.json`` for every name in ``dataset_names``, all or nothing.

        Returns ``{}`` unless every expected file name is present in
        ``available_files``; otherwise returns dataset name -> parsed JSON.
        """
        if not all(f"{name}.json" in available_files for name in dataset_names):
            return {}
        return {
            name: cls._read_json(os.path.join(revision_dir, f"{name}.json")) for name in dataset_names
        }

    @staticmethod
    def _model_size_millions(meta: dict) -> int:
        """Return ``meta["n_parameters"]`` floor-divided by one million, or ``-1`` if unusable."""
        try:
            # A missing key yields None, which raises TypeError just like any
            # other value that does not support floor division.
            return meta.get("n_parameters") // 1_000_000
        except TypeError:
            return -1

    def get_vidore_data(self, metric="ndcg_at_5"):
        """Refresh the local results checkout and populate ``self.model_infos``.

        Pulls ``./results`` when that path already exists, otherwise makes a
        depth-1 clone of the results repository into it. For every model
        folder under ``./results/results`` the first revision folder (sorted
        order) is read: ``model_meta.json`` when present, plus the result
        files of each benchmark version for which *all* expected dataset
        files exist. Models without any revision folder are skipped.

        Args:
            metric: Unused by this method; kept so existing callers that pass
                it keep working.

        Returns:
            None. The outcome is stored in ``self.model_infos``.
        """
        repo_url = "https://github.com/embeddings-benchmark/results.git"
        local_path = "./results"
        folder_of_interest = "results"

        if os.path.exists(local_path):
            Repo(local_path).remotes.origin.pull()
        else:
            Repo.clone_from(repo_url, local_path, depth=1)

        results_root = os.path.join(local_path, folder_of_interest)
        for model_name in self.get_folders(results_root):
            revision_dir = self._first_revision_dir(os.path.join(results_root, model_name))
            if revision_dir is None:
                # No revision folder: nothing to load for this model.
                continue

            available_files = set(os.listdir(revision_dir))

            if "model_meta.json" in available_files:
                meta = self._read_json(os.path.join(revision_dir, "model_meta.json"))
            else:
                meta = {}

            results = {}
            for dataset_names in (VIDORE_V1_MTEB_NAMES, VIDORE_V2_MTEB_NAMES):
                results.update(self._load_benchmark_results(revision_dir, available_files, dataset_names))

            self.model_infos[model_name] = {"meta": meta, "results": results}

    def filter_models_by_benchmark(self, benchmark_version=1):
        """Return the entries of ``self.model_infos`` that have results for one benchmark version.

        Args:
            benchmark_version: ``1`` selects ``VIDORE_V1_MTEB_NAMES``; any
                other value selects ``VIDORE_V2_MTEB_NAMES``.

        Returns:
            A new dict (model name -> info) holding the models for which at
            least one stored dataset name contains one of the selected names.
        """
        keywords = VIDORE_V1_MTEB_NAMES if benchmark_version == 1 else VIDORE_V2_MTEB_NAMES
        return {
            model: info
            for model, info in self.model_infos.items()
            if any(keyword in dataset for dataset in info["results"] for keyword in keywords)
        }

    def render_df(self, metric="ndcg_at_5", benchmark_version=1) -> pd.DataFrame:
        """Build a models-by-datasets score table for one benchmark version.

        Each row is a model. The first column is
        ``"Model Size (Million Parameters)"`` (``-1`` when unknown); the
        remaining columns are dataset nicknames holding
        ``scores["test"][0][metric]`` from the matching result file.

        Args:
            metric: Key looked up in the first ``"test"`` score entry.
            benchmark_version: ``1`` for the v1 dataset list, anything else for v2.

        Returns:
            The score table, or an empty ``DataFrame`` when no model matches.

        Raises:
            KeyError: If a matching result file lacks ``scores``, ``test`` or ``metric``.
        """
        filtered_model_infos = self.filter_models_by_benchmark(benchmark_version)
        if not filtered_model_infos:
            return pd.DataFrame()

        keywords = VIDORE_V1_MTEB_NAMES if benchmark_version == 1 else VIDORE_V2_MTEB_NAMES
        model_res = {}
        for model, info in filtered_model_infos.items():
            # Insert the size first so it stays the leading column.
            dataset_res = {"Model Size (Million Parameters)": self._model_size_millions(info["meta"])}
            for dataset, payload in info["results"].items():
                # A model may also carry results of the other benchmark version; leave those out.
                if not any(keyword in dataset for keyword in keywords):
                    continue
                dataset_res[get_datasets_nickname(dataset)] = payload["scores"]["test"][0][metric]
            model_res[model] = dataset_res

        # model_res is keyed by model, so transpose to get one row per model.
        return pd.DataFrame(model_res).T
|