benchmarks

Running

App Files Files Community

benchmarks / update_data.py

elevow

Update update_data.py

5b123b0 verified 5 days ago

raw

history blame contribute delete

14.2 kB

	# /// script
	# requires-python = ">=3.11"
	# dependencies = [
	# "httpx",
	# "huggingface_hub",
	# ]
	# ///
	"""
	Regenerate data.json and upload to the elevow/benchmarks Space.

	Source template: duplicated from davanstrien/benchmark-race
	https://huggingface.co/spaces/elevow/benchmarks

	Single file: All Aligned race branding, axis relabeling, optional org-groq tagging, and
	offline ``patch_output_dict`` live here (no separate inject script).

	1. Add HF ``model_id`` strings to ``MODEL_IDS_ALIGNED_ON_RACE`` (exact strings — use
	``DUMP_MODEL_IDS=1`` once to list them). That rewrites ``short_name`` and sets ``race_logo_key``.
	2. Upload the forked ``scripts/elevow-benchmarks/index.html`` to your Space (same folder as
	``data.json``). Upstream benchmark-race ignores ``race_logo_key``; without this file you will
	not see the Aligned logo or Aligned bar color.

	Run locally (from repo root or this folder):
	export HF_TOKEN=hf_...
	uv run scripts/elevow-benchmarks/update_data.py

	Or copy this file to your Space repo root on Hugging Face and run there.

	Schedule on HF Jobs (example — point to YOUR raw file):
	hf jobs scheduled uv run "0 8,20 * * *" \\
	--secrets HF_TOKEN \\
	https://huggingface.co/spaces/elevow/benchmarks/resolve/main/update_data.py

	Upload the forked UI in the same commit as data (one shot):
	UPLOAD_INDEX_HTML=1 uv run scripts/elevow-benchmarks/update_data.py
	"""

	from __future__ import annotations

	import json
	import os
	import re
	import tempfile
	from concurrent.futures import ThreadPoolExecutor, as_completed
	from datetime import datetime, timezone
	from pathlib import Path
	from typing import Any

	import httpx
	from huggingface_hub import HfApi

	# Space that receives the uploads. Upstream pushed to davanstrien/benchmark-race;
	# set BENCHMARK_SPACE_REPO to retarget without editing this file.
	SPACE_REPO = os.getenv("BENCHMARK_SPACE_REPO", "elevow/benchmarks")

	# Favicon-service URL used as the Aligned logo (domain_url is the percent-encoded site URL).
	ALIGNED_LOGO_URL = "https://www.google.com/s2/favicons?sz=128&domain_url=https%3A%2F%2Ftryaligned.ai"
	# Key under which the Aligned logo URL and bar color are registered in `logos` / `colors`,
	# and the value written to a branded row's `race_logo_key`.
	ALIGNED_LOGOS_KEY = "AlignedAI"
	ALIGNED_COLOR = "#059669"

	# Preferred knob: a single set that drives the Aligned bar label, race_logo_key and bar color.
	# Run once with DUMP_MODEL_IDS=1 to print every model_id the script saw, then paste the
	# exact strings here.
	MODEL_IDS_ALIGNED_ON_RACE: frozenset[str] = frozenset(
	    {
	        # "meta-llama/Llama-3.3-70B-Instruct",
	        # "meta-llama/Llama-4-Scout-17B-16E-Instruct",
	    }
	)

	# Legacy knobs: both are unioned with MODEL_IDS_ALIGNED_ON_RACE, so any of the three works.
	MODEL_IDS_USE_ALIGNED_LOGO: frozenset[str] = frozenset(())
	MODEL_IDS_ALIGNED_AXIS_LABEL: frozenset[str] = frozenset(())


	def _all_branded_model_ids() -> frozenset[str]:
	    """Return the union of the preferred and the two legacy Aligned branding sets."""
	    return MODEL_IDS_ALIGNED_ON_RACE | MODEL_IDS_USE_ALIGNED_LOGO | MODEL_IDS_ALIGNED_AXIS_LABEL

	# Opt-in switch. When True, every row whose HF org (the part of model_id before "/")
	# equals "groq" (compared case-insensitively) is also tagged with race_logo_key.
	# Such rows are rare on leaderboards, so this stays off by default.
	USE_ALIGNED_FOR_ORG_GROQ = False

	# Copy-paste example if you add a synthetic Aligned row by hand (ensure logos/colors cover provider).
	# This is reference text only: it is kept as a string and is not referenced by the code
	# visible in this file.
	SYNTHETIC_ALIGNED_ROW_EXAMPLE = r"""
	# After building `models` for one benchmark, you may append:
	# models.append({
	# "model_id": "tryaligned/Aligned-AI",
	# "short_name": "Aligned-AI",
	# "provider": "tryaligned",
	# "score": 0.0,
	# "date": "2026-01-01",
	# "race_logo_key": "AlignedAI",
	# })
	# Then ensure logos["AlignedAI"] is set and colors include "tryaligned".
	"""


	def aligned_groq_lane_for_model_id(model_id: str) -> str:
	    """Map an HF model_id to an Aligned lane name (mirrors client `alignedGroqLaneForRawModel`).

	    Substring checks are case-insensitive and the first matching rule wins:
	    "scout" -> Vision, "coder" -> Code, "llama-3.1" together with "8b" -> Fast;
	    everything else -> Reasoning.
	    """
	    lowered = model_id.lower()
	    # Ordered (required substrings, lane) rules; earlier rows take precedence.
	    rules = (
	        (("scout",), "Vision"),
	        (("coder",), "Code"),
	        (("llama-3.1", "8b"), "Fast"),
	    )
	    for needles, lane in rules:
	        if all(needle in lowered for needle in needles):
	            return lane
	    return "Reasoning"


	def aligned_axis_label_from_model_id(model_id: str) -> str:
	    """Build the bar label stored in `short_name` (the field benchmark-race renders).

	    The repo part of the id has "-" and "_" turned into spaces and whitespace collapsed;
	    names longer than 20 characters are cut to 18 plus an ellipsis. The final
	    "Aligned AI — <lane> · <name>" label is cut to 43 characters plus an ellipsis
	    when it exceeds 45.
	    """
	    repo_name = model_id.rsplit("/", 1)[-1]
	    spaced = repo_name.replace("-", " ").replace("_", " ")
	    words = re.sub(r"\s+", " ", spaced).strip()
	    if len(words) > 20:
	        words = words[:18] + "…"
	    text = f"Aligned AI — {aligned_groq_lane_for_model_id(model_id)} · {words}"
	    return text if len(text) <= 45 else text[:43] + "…"

	# Leaderboards to pull, one dict per benchmark:
	#   dataset - HF dataset repo whose /leaderboard API endpoint is queried
	#   key     - key the benchmark is stored under in data.json["benchmarks"]
	#   name    - display name (also used in progress output)
	#   gated   - when True the request carries the HF token, and the benchmark is
	#             skipped if no token is available
	BENCHMARK_CONFIGS = [
	{"dataset": "SWE-bench/SWE-bench_Verified", "key": "sweVerified", "name": "SWE-bench Verified", "gated": False},
	{"dataset": "ScaleAI/SWE-bench_Pro", "key": "swePro", "name": "SWE-bench Pro", "gated": False},
	{"dataset": "TIGER-Lab/MMLU-Pro", "key": "mmluPro", "name": "MMLU-Pro", "gated": False},
	{"dataset": "Idavidrein/gpqa", "key": "gpqa", "name": "GPQA Diamond", "gated": True},
	{"dataset": "cais/hle", "key": "hle", "name": "HLE", "gated": True},
	{"dataset": "MathArena/aime_2026", "key": "aime2026", "name": "AIME 2026", "gated": False},
	{"dataset": "MathArena/hmmt_feb_2026", "key": "hmmt2026", "name": "HMMT Feb 2026", "gated": False},
	{"dataset": "allenai/olmOCR-bench", "key": "olmOcr", "name": "olmOCR-bench", "gated": False},
	{"dataset": "harborframework/terminal-bench-2.0", "key": "terminalBench", "name": "Terminal-Bench 2.0", "gated": False},
	{"dataset": "FutureMa/EvasionBench", "key": "evasionBench", "name": "EvasionBench", "gated": False},
	]

	# Bar colors handed out to providers in sorted-name order; the list is cycled
	# (index modulo length) when there are more providers than colors.
	PALETTE = [
	"#6366f1", "#0d9488", "#d97706", "#e11d48", "#7c3aed",
	"#16a34a", "#2563eb", "#ea580c", "#8b5cf6", "#0891b2",
	"#c026d3", "#65a30d", "#dc2626", "#0284c7", "#a21caf",
	"#059669", "#9333ea", "#ca8a04", "#be185d", "#0369a1",
	]


	def inject_aligned_race_branding(
	    benchmarks: dict[str, Any],
	    logos: dict[str, str],
	    color_map: dict[str, str],
	) -> tuple[int, int]:
	    """Register the Aligned logo/color and brand the configured model rows in place.

	    Args:
	        benchmarks: benchmark key -> {"models": [row, ...], ...}; matching rows are mutated.
	        logos: logo-URL mapping; gains an ALIGNED_LOGOS_KEY entry.
	        color_map: bar-color mapping; gains an ALIGNED_LOGOS_KEY entry.

	    Returns:
	        (logo_tag_count, axis_relabel_count) for logging.
	    """
	    logos[ALIGNED_LOGOS_KEY] = ALIGNED_LOGO_URL
	    color_map[ALIGNED_LOGOS_KEY] = ALIGNED_COLOR

	    # The branded set does not change while we iterate: build the union once,
	    # not once per row.
	    branded_ids = _all_branded_model_ids()

	    logo_n = 0
	    axis_n = 0
	    for bm in benchmarks.values():
	        for m in bm.get("models") or []:
	            mid = m.get("model_id") or ""
	            provider = mid.split("/")[0]
	            branded = mid in branded_ids
	            use_groq_org = USE_ALIGNED_FOR_ORG_GROQ and provider.lower() == "groq"
	            if branded or use_groq_org:
	                m["race_logo_key"] = ALIGNED_LOGOS_KEY
	                logo_n += 1
	            if branded:
	                aligned_label = aligned_axis_label_from_model_id(mid)
	                orig_sn = m.get("short_name")
	                # A row branded by an earlier pass (e.g. patching an already published
	                # data.json) carries the Aligned label as short_name; fall back to the
	                # repo name so chart_full_name keeps describing the real model.
	                if not orig_sn or orig_sn == aligned_label:
	                    orig_sn = mid.split("/")[-1]
	                m["chart_full_name"] = f"Published HF model: {orig_sn.replace('-', ' ')}"
	                m["short_name"] = aligned_label
	                axis_n += 1

	    return logo_n, axis_n


	def _upload_index_html_fork(api: HfApi) -> None:
	    """Upload the index.html that sits next to this script when UPLOAD_INDEX_HTML is truthy.

	    Stock benchmark-race ignores race_logo_key, so the forked UI has to be uploaded
	    for the branding to show. Accepted flag values are "1", "true" and "yes"
	    (case-insensitive, surrounding whitespace ignored); anything else is a no-op.

	    Args:
	        api: client used for the upload to SPACE_REPO.
	    """
	    flag = os.environ.get("UPLOAD_INDEX_HTML", "").strip().lower()
	    if flag not in ("1", "true", "yes"):
	        return
	    index_path = Path(__file__).resolve().parent / "index.html"
	    if not index_path.is_file():
	        # Report the path that was actually checked: the script may have been copied
	        # out of scripts/elevow-benchmarks/ (e.g. to the Space repo root).
	        print(f"UPLOAD_INDEX_HTML set but {index_path} is missing.")
	        return
	    stamp = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M UTC")
	    api.upload_file(
	        path_or_fileobj=str(index_path),
	        path_in_repo="index.html",
	        repo_id=SPACE_REPO,
	        repo_type="space",
	        commit_message=f"Update index.html Aligned fork ({stamp})",
	    )
	    print(f"Uploaded index.html → {SPACE_REPO}")


	def patch_output_dict(output: dict[str, Any]) -> dict[str, Any]:
	    """Return an Aligned-branded deep copy of a loaded data.json dict.

	    *output* itself is left untouched; missing "logos" / "colors" maps are created
	    on the copy before branding is applied.
	    """
	    # A JSON round-trip serves as the deep copy for data that came from JSON.
	    patched = json.loads(json.dumps(output))
	    inject_aligned_race_branding(
	        patched.get("benchmarks") or {},
	        patched.setdefault("logos", {}),
	        patched.setdefault("colors", {}),
	    )
	    return patched


	def fetch_leaderboard(config: dict, hf_token: str \| None) -> list[dict]:
	url = f"https://huggingface.co/api/datasets/{config['dataset']}/leaderboard"
	headers = {}
	if config["gated"] and hf_token:
	headers["Authorization"] = f"Bearer {hf_token}"
	elif config["gated"]:
	print(f" {config['name']}: skipped (gated, no token)")
	return []

	print(f" {config['name']}: fetching scores...")
	try:
	resp = httpx.get(url, headers=headers, timeout=30)
	if resp.status_code != 200:
	print(f" skip (status {resp.status_code})")
	return []
	data = resp.json()
	if not isinstance(data, list):
	return []
	except Exception as e:
	print(f" error: {e}")
	return []

	seen: dict[str, float] = {}
	for entry in data:
	model_id = entry.get("modelId")
	score = entry.get("value")
	if model_id and score is not None:
	score = float(score)
	if model_id not in seen or score > seen[model_id]:
	seen[model_id] = score

	print(f" {len(seen)} models")
	return [{"model_id": mid, "score": s} for mid, s in seen.items()]


	def fetch_model_dates(model_ids: list[str], hf_token: str \| None) -> dict[str, dict]:
	api = HfApi()
	results: dict[str, dict] = {}

	def _get_info(mid: str):
	try:
	info = api.model_info(mid, token=hf_token)
	params_b = None
	if info.safetensors and hasattr(info.safetensors, "total"):
	params_b = round(info.safetensors.total / 1_000_000_000, 1)
	if params_b is None:
	m = re.findall(r"[-_/](\d+\.?\d*)[Bb](?:[-_/]\|$)", mid)
	if m:
	params_b = max(float(x) for x in m)
	return mid, info.created_at.strftime("%Y-%m-%d"), params_b
	except Exception:
	return mid, None, None

	with ThreadPoolExecutor(max_workers=8) as pool:
	futures = {pool.submit(_get_info, mid): mid for mid in model_ids}
	for f in as_completed(futures):
	mid, date, params = f.result()
	if date:
	results[mid] = {"date": date, "parameters_b": params}

	return results


	def fetch_logo(provider: str) -> str | None:
	    """Return the avatar URL of an HF organization, or None when it cannot be obtained.

	    Best effort by design: a non-200 status, a network error or timeout, or an
	    unexpected payload all yield None, so a missing logo never fails the run.
	    """
	    try:
	        resp = httpx.get(
	            f"https://huggingface.co/api/organizations/{provider}/avatar",
	            timeout=5,
	        )
	        if resp.status_code == 200:
	            return resp.json().get("avatarUrl")
	    except Exception:
	        # Deliberately swallowed: logos are optional decoration.
	        return None
	    return None


	def fetch_all_logos(providers: set[str]) -> dict[str, str]:
	    """Fetch avatar URLs for all providers concurrently (8 workers).

	    Returns a provider -> URL mapping; providers for which no URL came back are omitted.
	    """
	    with ThreadPoolExecutor(max_workers=8) as executor:
	        pending = {executor.submit(fetch_logo, name): name for name in providers}
	        # Collect results as they finish, keeping only truthy URLs.
	        return {
	            pending[done]: avatar
	            for done in as_completed(pending)
	            if (avatar := done.result())
	        }


	def main() -> None:
	    """Fetch all leaderboards, build data.json, apply Aligned branding and upload it.

	    Environment:
	        HF_TOKEN: forwarded to the gated leaderboard and model-info requests.
	        DUMP_MODEL_IDS: when set to a non-empty value, print every model id seen.
	        UPLOAD_INDEX_HTML: handled by _upload_index_html_fork after the data upload.

	    Raises:
	        SystemExit: when no benchmark produced any rows, so that a transient outage
	            does not replace the published data.json with an empty one.
	    """
	    hf_token = os.environ.get("HF_TOKEN")
	    print(f"Generating data.json → upload to {SPACE_REPO}\n")

	    all_scores: dict[str, dict] = {}
	    all_model_ids: set[str] = set()

	    for config in BENCHMARK_CONFIGS:
	        rows = fetch_leaderboard(config, hf_token)
	        if rows:
	            all_scores[config["key"]] = {"name": config["name"], "rows": rows}
	            all_model_ids.update(r["model_id"] for r in rows)

	    print(f"\n{len(all_model_ids)} unique models across {len(all_scores)} benchmarks")
	    if os.environ.get("DUMP_MODEL_IDS"):
	        print("\n-- DUMP_MODEL_IDS (copy into MODEL_IDS_ALIGNED_ON_RACE) --")
	        for mid in sorted(all_model_ids):
	            print(mid)
	        print("-- end --\n")

	    print("Fetching model dates...")
	    model_dates = fetch_model_dates(list(all_model_ids), hf_token)
	    print(f" got dates for {len(model_dates)}/{len(all_model_ids)} models")

	    all_providers: set[str] = set()
	    benchmarks: dict[str, Any] = {}

	    for key, info in all_scores.items():
	        models: list[dict] = []
	        for row in info["rows"]:
	            mid = row["model_id"]
	            # Rows without a resolved creation date are dropped.
	            if mid not in model_dates:
	                continue
	            provider = mid.split("/")[0] if "/" in mid else mid
	            short_name = mid.split("/")[-1]
	            all_providers.add(provider)
	            models.append({
	                "model_id": mid,
	                "short_name": short_name,
	                "provider": provider,
	                "score": round(row["score"], 2),
	                "date": model_dates[mid]["date"],
	            })
	        if models:
	            benchmarks[key] = {"name": info["name"], "models": models}

	    if not benchmarks:
	        # Every fetch failed or was skipped: keep whatever is currently published.
	        raise SystemExit(
	            "No benchmark rows collected; refusing to overwrite data.json with an empty payload."
	        )

	    print(f"\nFetching logos for {len(all_providers)} providers...")
	    logos = fetch_all_logos(all_providers)
	    print(f" got {len(logos)} logos")

	    # Colors are assigned by sorted provider name so they stay stable between runs.
	    color_map: dict[str, str] = {}
	    for i, provider in enumerate(sorted(all_providers)):
	        color_map[provider] = PALETTE[i % len(PALETTE)]

	    tagged, relabeled = inject_aligned_race_branding(benchmarks, logos, color_map)
	    print(
	        f" injected {ALIGNED_LOGOS_KEY} logo + color; "
	        f"race_logo_key on {tagged} row(s); "
	        f"Aligned axis short_name on {relabeled} row(s)"
	    )

	    output = {
	        "benchmarks": benchmarks,
	        "logos": logos,
	        "colors": color_map,
	        "generated_at": datetime.now(timezone.utc).isoformat(),
	    }

	    data_json = json.dumps(output, indent=2)
	    print(f"\nGenerated {len(data_json) / 1024:.1f} KB")
	    for bm in benchmarks.values():
	        print(f" {bm['name']}: {len(bm['models'])} models")

	    print(f"\nUploading data.json to {SPACE_REPO}...")
	    api = HfApi()
	    # delete=False: the file is closed before upload and removed in the finally below.
	    with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False, encoding="utf-8") as f:
	        f.write(data_json)
	        tmp_path = f.name

	    try:
	        api.upload_file(
	            path_or_fileobj=tmp_path,
	            path_in_repo="data.json",
	            repo_id=SPACE_REPO,
	            repo_type="space",
	            commit_message=f"Update data.json ({datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M UTC')})",
	        )
	        print("Done!")
	    finally:
	        Path(tmp_path).unlink(missing_ok=True)

	    _upload_index_html_fork(api)


	if __name__ == "__main__":
	    main()