Spaces:
Running
Running
| # /// script | |
| # requires-python = ">=3.11" | |
| # dependencies = [ | |
| # "httpx", | |
| # "huggingface_hub", | |
| # ] | |
| # /// | |
| """ | |
| Regenerate data.json and upload to the elevow/benchmarks Space. | |
| Source template: duplicated from davanstrien/benchmark-race | |
| https://huggingface.co/spaces/elevow/benchmarks | |
| **Single file:** All Aligned race branding, axis relabeling, optional org-groq tagging, and | |
| offline ``patch_output_dict`` live here (no separate inject script). | |
| 1. Add HF ``model_id`` strings to ``MODEL_IDS_ALIGNED_ON_RACE`` (exact strings — use | |
| ``DUMP_MODEL_IDS=1`` once to list them). That rewrites ``short_name`` and sets ``race_logo_key``. | |
| 2. **Upload the forked** ``scripts/elevow-benchmarks/index.html`` **to your Space** (same folder as | |
| ``data.json``). Upstream benchmark-race ignores ``race_logo_key``; without this file you will | |
| not see the Aligned logo or Aligned bar color. | |
| Run locally (from repo root or this folder): | |
| export HF_TOKEN=hf_... | |
| uv run scripts/elevow-benchmarks/update_data.py | |
| Or copy this file to your Space repo root on Hugging Face and run there. | |
| Schedule on HF Jobs (example — point to YOUR raw file): | |
| hf jobs scheduled uv run "0 8,20 * * *" \\ | |
| --secrets HF_TOKEN \\ | |
| https://huggingface.co/spaces/elevow/benchmarks/resolve/main/update_data.py | |
| Upload the forked UI in the same commit as data (one shot): | |
| UPLOAD_INDEX_HTML=1 uv run scripts/elevow-benchmarks/update_data.py | |
| """ | |
| from __future__ import annotations | |
| import json | |
| import os | |
| import re | |
| import tempfile | |
| from concurrent.futures import ThreadPoolExecutor, as_completed | |
| from datetime import datetime, timezone | |
| from pathlib import Path | |
| from typing import Any | |
| import httpx | |
| from huggingface_hub import HfApi | |
# Upload target: your fork (was davanstrien/benchmark-race in upstream).
# Override with BENCHMARK_SPACE_REPO when testing against a scratch Space.
SPACE_REPO = os.environ.get("BENCHMARK_SPACE_REPO", "elevow/benchmarks")
# Aligned logo served via Google's favicon service; the target domain
# (https://tryaligned.ai) is URL-encoded into the query string.
ALIGNED_LOGO_URL = (
    "https://www.google.com/s2/favicons?sz=128&domain_url="
    "https%3A%2F%2Ftryaligned.ai"
)
# Key written into the output `logos` / `colors` maps and into each branded
# row's `race_logo_key` (the forked index.html looks this key up).
ALIGNED_LOGOS_KEY = "AlignedAI"
# Bar color applied to Aligned-branded rows.
ALIGNED_COLOR = "#059669"
# Preferred: one list for both **Aligned bar label** + **race_logo_key** + Aligned bar color.
# Run with DUMP_MODEL_IDS=1 once to print every model_id the script saw (copy exact strings).
MODEL_IDS_ALIGNED_ON_RACE: frozenset[str] = frozenset(
    {
        # "meta-llama/Llama-3.3-70B-Instruct",
        # "meta-llama/Llama-4-Scout-17B-16E-Instruct",
    }
)
# Legacy: unioned with MODEL_IDS_ALIGNED_ON_RACE (you can use any of these three sets).
# All three are merged by _all_branded_model_ids(); membership in any of them
# brands the row identically.
MODEL_IDS_USE_ALIGNED_LOGO: frozenset[str] = frozenset()
MODEL_IDS_ALIGNED_AXIS_LABEL: frozenset[str] = frozenset()
def _all_branded_model_ids() -> frozenset[str]:
    """Union of every model-id set that should receive Aligned branding."""
    return frozenset().union(
        MODEL_IDS_ALIGNED_ON_RACE,
        MODEL_IDS_USE_ALIGNED_LOGO,
        MODEL_IDS_ALIGNED_AXIS_LABEL,
    )
# If True, tag every row whose HF org is literally "groq" with race_logo_key (rare on leaderboards).
USE_ALIGNED_FOR_ORG_GROQ = False
# Copy-paste example if you add a synthetic Aligned row by hand (ensure logos/colors cover provider).
# NOTE: this string is documentation only — it is never executed or uploaded.
SYNTHETIC_ALIGNED_ROW_EXAMPLE = r"""
# After building `models` for one benchmark, you may append:
# models.append({
#     "model_id": "tryaligned/Aligned-AI",
#     "short_name": "Aligned-AI",
#     "provider": "tryaligned",
#     "score": 0.0,
#     "date": "2026-01-01",
#     "race_logo_key": "AlignedAI",
# })
# Then ensure logos["AlignedAI"] is set and colors include "tryaligned".
"""
def aligned_groq_lane_for_model_id(model_id: str) -> str:
    """Match client `alignedGroqLaneForRawModel` heuristics on HF model_id."""
    lowered = model_id.lower()
    # Order matters: "scout" wins over "coder", which wins over the 3.1-8B check.
    if "scout" in lowered:
        return "Vision"
    if "coder" in lowered:
        return "Code"
    is_llama_31_8b = "llama-3.1" in lowered and "8b" in lowered
    return "Fast" if is_llama_31_8b else "Reasoning"
def aligned_axis_label_from_model_id(model_id: str) -> str:
    """Bar label for forked data.json (benchmark-race reads `m.short_name`)."""
    # Humanize the repo-name tail: separators become spaces, whitespace collapsed.
    tail = model_id.rsplit("/", 1)[-1]
    slug = " ".join(tail.replace("-", " ").replace("_", " ").split())
    if len(slug) > 20:
        slug = slug[:18] + "…"
    lane = aligned_groq_lane_for_model_id(model_id)
    label = f"Aligned AI — {lane} · {slug}"
    # Keep axis labels readable; truncate with an ellipsis past 45 chars.
    return label if len(label) <= 45 else label[:43] + "…"
# Each entry: `dataset` = HF dataset repo queried via /api/datasets/{id}/leaderboard,
# `key` = JSON key used in the output `benchmarks` map, `name` = display name,
# `gated` = requires HF_TOKEN (the fetch is skipped when no token is set).
BENCHMARK_CONFIGS = [
    {"dataset": "SWE-bench/SWE-bench_Verified", "key": "sweVerified", "name": "SWE-bench Verified", "gated": False},
    {"dataset": "ScaleAI/SWE-bench_Pro", "key": "swePro", "name": "SWE-bench Pro", "gated": False},
    {"dataset": "TIGER-Lab/MMLU-Pro", "key": "mmluPro", "name": "MMLU-Pro", "gated": False},
    {"dataset": "Idavidrein/gpqa", "key": "gpqa", "name": "GPQA Diamond", "gated": True},
    {"dataset": "cais/hle", "key": "hle", "name": "HLE", "gated": True},
    {"dataset": "MathArena/aime_2026", "key": "aime2026", "name": "AIME 2026", "gated": False},
    {"dataset": "MathArena/hmmt_feb_2026", "key": "hmmt2026", "name": "HMMT Feb 2026", "gated": False},
    {"dataset": "allenai/olmOCR-bench", "key": "olmOcr", "name": "olmOCR-bench", "gated": False},
    {"dataset": "harborframework/terminal-bench-2.0", "key": "terminalBench", "name": "Terminal-Bench 2.0", "gated": False},
    {"dataset": "FutureMa/EvasionBench", "key": "evasionBench", "name": "EvasionBench", "gated": False},
]
# Fallback bar colors, assigned to providers in sorted order; the index wraps
# (i % len(PALETTE)) when there are more providers than palette entries.
PALETTE = [
    "#6366f1", "#0d9488", "#d97706", "#e11d48", "#7c3aed",
    "#16a34a", "#2563eb", "#ea580c", "#8b5cf6", "#0891b2",
    "#c026d3", "#65a30d", "#dc2626", "#0284c7", "#a21caf",
    "#059669", "#9333ea", "#ca8a04", "#be185d", "#0369a1",
]
def inject_aligned_race_branding(
    benchmarks: dict[str, Any],
    logos: dict[str, str],
    color_map: dict[str, str],
) -> tuple[int, int]:
    """Add Aligned logo URL, optional per-model race_logo_key, bar color, and axis labels.

    Mutates `benchmarks`, `logos`, and `color_map` in place.
    Returns (logo_tag_count, axis_relabel_count) for logging.
    """
    logos[ALIGNED_LOGOS_KEY] = ALIGNED_LOGO_URL
    color_map[ALIGNED_LOGOS_KEY] = ALIGNED_COLOR
    # Hoisted out of the loop: the original rebuilt this frozenset union once
    # per model row, which is loop-invariant work.
    branded_ids = _all_branded_model_ids()
    logo_n = 0
    axis_n = 0
    for bm in benchmarks.values():
        for m in bm.get("models") or []:
            mid = m.get("model_id") or ""
            provider = mid.split("/")[0] if "/" in mid else mid
            branded = mid in branded_ids
            use_groq_org = USE_ALIGNED_FOR_ORG_GROQ and provider.lower() == "groq"
            if branded or use_groq_org:
                # The forked index.html reads race_logo_key for logo + bar color.
                m["race_logo_key"] = ALIGNED_LOGOS_KEY
                logo_n += 1
            if branded:
                # Preserve the original (pre-relabel) name for tooltips/debugging.
                orig_sn = m.get("short_name") or (mid.split("/")[-1] if "/" in mid else mid)
                m["chart_full_name"] = f"Published HF model: {orig_sn.replace('-', ' ')}"
                m["short_name"] = aligned_axis_label_from_model_id(mid)
                axis_n += 1
    return logo_n, axis_n
def _upload_index_html_fork(api: HfApi) -> None:
    """Stock benchmark-race ignores race_logo_key; upload sibling index.html when asked."""
    opted_in = os.environ.get("UPLOAD_INDEX_HTML", "").lower() in ("1", "true", "yes")
    if not opted_in:
        return
    index_path = Path(__file__).resolve().parent / "index.html"
    if not index_path.is_file():
        print("UPLOAD_INDEX_HTML set but scripts/elevow-benchmarks/index.html is missing.")
        return
    stamp = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M UTC")
    api.upload_file(
        path_or_fileobj=str(index_path),
        path_in_repo="index.html",
        repo_id=SPACE_REPO,
        repo_type="space",
        commit_message=f"Update index.html Aligned fork ({stamp})",
    )
    print(f"Uploaded index.html → {SPACE_REPO}")
def patch_output_dict(output: dict[str, Any]) -> dict[str, Any]:
    """Deep-copy a loaded data.json dict, apply Aligned branding in place, return the copy."""
    # JSON round-trip doubles as a deep copy and normalizes to JSON-native types,
    # matching exactly what a data.json loaded from disk would contain.
    patched = json.loads(json.dumps(output))
    inject_aligned_race_branding(
        patched.get("benchmarks") or {},
        patched.setdefault("logos", {}),
        patched.setdefault("colors", {}),
    )
    return patched
def fetch_leaderboard(config: dict, hf_token: str | None) -> list[dict]:
    """Fetch one benchmark leaderboard; return [{"model_id", "score"}] rows.

    Gated datasets are skipped when no token is available. Any HTTP or parse
    failure yields an empty list (best-effort: one bad benchmark never aborts
    the whole run). Duplicate model ids keep their highest score.
    """
    headers: dict[str, str] = {}
    if config["gated"]:
        if not hf_token:
            print(f" {config['name']}: skipped (gated, no token)")
            return []
        headers["Authorization"] = f"Bearer {hf_token}"
    url = f"https://huggingface.co/api/datasets/{config['dataset']}/leaderboard"
    print(f" {config['name']}: fetching scores...")
    try:
        resp = httpx.get(url, headers=headers, timeout=30)
        if resp.status_code != 200:
            print(f" skip (status {resp.status_code})")
            return []
        data = resp.json()
        if not isinstance(data, list):
            return []
    except Exception as e:
        print(f" error: {e}")
        return []
    best: dict[str, float] = {}
    for entry in data:
        mid = entry.get("modelId")
        raw = entry.get("value")
        if not mid or raw is None:
            continue
        val = float(raw)
        if mid not in best or val > best[mid]:
            best[mid] = val
    print(f" {len(best)} models")
    return [{"model_id": m, "score": s} for m, s in best.items()]
def fetch_model_dates(model_ids: list[str], hf_token: str | None) -> dict[str, dict]:
    """Resolve creation date and (best-effort) parameter count for each model id.

    Models whose lookup fails (or which have no date) are simply omitted.
    Returns {model_id: {"date": "YYYY-MM-DD", "parameters_b": float | None}}.
    """
    api = HfApi()
    results: dict[str, dict] = {}

    def _probe(mid: str):
        # One HF API call per model; any failure collapses to (mid, None, None).
        try:
            info = api.model_info(mid, token=hf_token)
            size_b = None
            if info.safetensors and hasattr(info.safetensors, "total"):
                size_b = round(info.safetensors.total / 1_000_000_000, 1)
            if size_b is None:
                # Fall back to parsing a "...-70B-..." style size out of the id.
                hits = re.findall(r"[-_/](\d+\.?\d*)[Bb](?:[-_/]|$)", mid)
                if hits:
                    size_b = max(float(h) for h in hits)
            return mid, info.created_at.strftime("%Y-%m-%d"), size_b
        except Exception:
            return mid, None, None

    with ThreadPoolExecutor(max_workers=8) as pool:
        pending = [pool.submit(_probe, mid) for mid in model_ids]
        for fut in as_completed(pending):
            mid, date, size_b = fut.result()
            if date:
                results[mid] = {"date": date, "parameters_b": size_b}
    return results
def fetch_logo(provider: str) -> str | None:
    """Best-effort lookup of an HF organization's avatar URL; None on any failure."""
    endpoint = f"https://huggingface.co/api/organizations/{provider}/avatar"
    try:
        response = httpx.get(endpoint, timeout=5)
        if response.status_code == 200:
            return response.json().get("avatarUrl")
    except Exception:
        # Logos are cosmetic; a network or parse failure just means "no logo".
        pass
    return None
def fetch_all_logos(providers: set[str]) -> dict[str, str]:
    """Resolve provider avatars concurrently; providers without a logo are omitted."""
    found: dict[str, str] = {}
    with ThreadPoolExecutor(max_workers=8) as pool:
        by_future = {pool.submit(fetch_logo, name): name for name in providers}
        for fut in as_completed(by_future):
            avatar_url = fut.result()
            if avatar_url:
                found[by_future[fut]] = avatar_url
    return found
def main() -> None:
    """Fetch all leaderboards, assemble data.json, apply Aligned branding, upload to the Space."""
    hf_token = os.environ.get("HF_TOKEN")
    print(f"Generating data.json → upload to {SPACE_REPO}\n")
    # 1) Pull raw scores per benchmark; collect the union of model ids seen.
    all_scores: dict[str, dict] = {}
    all_model_ids: set[str] = set()
    for config in BENCHMARK_CONFIGS:
        rows = fetch_leaderboard(config, hf_token)
        if rows:
            all_scores[config["key"]] = {"name": config["name"], "rows": rows}
            all_model_ids.update(r["model_id"] for r in rows)
    print(f"\n{len(all_model_ids)} unique models across {len(all_scores)} benchmarks")
    # Optional dump so exact ids can be copied into MODEL_IDS_ALIGNED_ON_RACE.
    if os.environ.get("DUMP_MODEL_IDS"):
        print("\n-- DUMP_MODEL_IDS (copy into MODEL_IDS_ALIGNED_ON_RACE) --")
        for mid in sorted(all_model_ids):
            print(mid)
        print("-- end --\n")
    # 2) Resolve creation dates; models without a date are dropped below.
    print("Fetching model dates...")
    model_dates = fetch_model_dates(list(all_model_ids), hf_token)
    print(f" got dates for {len(model_dates)}/{len(all_model_ids)} models")
    # 3) Build the benchmarks payload; provider = HF org prefix of the model id.
    all_providers: set[str] = set()
    benchmarks: dict[str, Any] = {}
    for key, info in all_scores.items():
        models: list[dict] = []
        for row in info["rows"]:
            mid = row["model_id"]
            if mid not in model_dates:
                continue  # no creation date → cannot place the model on the race timeline
            provider = mid.split("/")[0] if "/" in mid else mid
            short_name = mid.split("/")[-1]
            all_providers.add(provider)
            models.append({
                "model_id": mid,
                "short_name": short_name,
                "provider": provider,
                "score": round(row["score"], 2),
                "date": model_dates[mid]["date"],
            })
        if models:
            benchmarks[key] = {"name": info["name"], "models": models}
    # 4) Provider logos + deterministic colors (sorted order, palette cycles).
    print(f"\nFetching logos for {len(all_providers)} providers...")
    logos = fetch_all_logos(all_providers)
    print(f" got {len(logos)} logos")
    color_map: dict[str, str] = {}
    for i, provider in enumerate(sorted(all_providers)):
        color_map[provider] = PALETTE[i % len(PALETTE)]
    # 5) Aligned branding pass (logo key, bar color, axis relabels).
    tagged, relabeled = inject_aligned_race_branding(benchmarks, logos, color_map)
    print(
        f" injected {ALIGNED_LOGOS_KEY} logo + color; "
        f"race_logo_key on {tagged} row(s); "
        f"Aligned axis short_name on {relabeled} row(s)"
    )
    # 6) Serialize and upload; the temp file is removed even if the upload fails.
    output = {
        "benchmarks": benchmarks,
        "logos": logos,
        "colors": color_map,
        "generated_at": datetime.now(timezone.utc).isoformat(),
    }
    data_json = json.dumps(output, indent=2)
    print(f"\nGenerated {len(data_json) / 1024:.1f} KB")
    for key, bm in benchmarks.items():
        print(f" {bm['name']}: {len(bm['models'])} models")
    print(f"\nUploading data.json to {SPACE_REPO}...")
    api = HfApi()
    with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False, encoding="utf-8") as f:
        f.write(data_json)
        tmp_path = f.name
    try:
        api.upload_file(
            path_or_fileobj=tmp_path,
            path_in_repo="data.json",
            repo_id=SPACE_REPO,
            repo_type="space",
            commit_message=f"Update data.json ({datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M UTC')})",
        )
        print("Done!")
    finally:
        Path(tmp_path).unlink(missing_ok=True)
    # Optionally ship the forked index.html in the same run (UPLOAD_INDEX_HTML=1).
    _upload_index_html_fork(api)


if __name__ == "__main__":
    main()