[
  {
    "suite": "publish",
    "model": "codexresponses.gpt-5.4-mini",
    "model_slug": "codexresponses-gpt-5-4-mini",
    "source_kind": "clean-final",
    "label": "skill-with-shell-codexresponses-gpt-5-4-mini-publication-final",
    "eval": "numeric-data",
    "artifact_path": "results/publish/models/codexresponses-gpt-5-4-mini/artifacts/numeric-data.html",
    "screenshot_desktop_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/numeric-data-desktop.png",
    "screenshot_mobile_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/numeric-data-mobile.png",
    "screenshot_deep_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/numeric-data-deep.png",
    "screenshot_mobile_deep_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/numeric-data-mobile-deep.png",
    "artifact_bytes": 41655,
    "generation_ok": true,
    "generation_duration_s": 233.57,
    "input_tokens": 257043,
    "output_tokens": 19565,
    "total_tokens": 276608,
    "billing_tokens": 276608,
    "reasoning_tokens": 13843,
    "tool_use_tokens": 0,
    "cache_read_tokens": 0,
    "cache_write_tokens": 0,
    "cache_hit_tokens": 236032,
    "total_cache_tokens": 236032,
    "effective_input_tokens": 21011,
    "display_input_tokens": 257043,
    "usage_event_count": 12,
    "tool_calls": 16,
    "turn_count": 12,
    "self_check_attempted": true,
    "self_check_ran": true,
    "self_check_succeeded": true,
    "self_check_runs": 2,
    "self_check_failed_runs": 1,
    "self_check_successful_runs": 1,
    "self_correction_edits": 0,
    "self_corrected_after_checker": false,
    "self_correction_verified": true,
    "assistant_turns_trace": 12,
    "self_check_mode": "run-checker-cli",
    "self_check_evidence": "ran checker CLI: python /home/shaun/source/birch-html/skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-codexresponses-gpt-5-4-mini-publica",
    "deterministic_failures": 0,
    "deterministic_warnings": 2,
    "vlm_failures": 0,
    "vlm_warnings": 0,
    "deterministic_failure_units": 0,
    "deterministic_warning_units": 1,
    "vlm_failure_units": 0,
    "vlm_warning_units": 0,
    "desktop_failures": 0,
    "desktop_warnings": 1,
    "mobile_failures": 0,
    "mobile_warnings": 0,
    "deep_failures": 0,
    "deep_warnings": 1,
    "mobile_deep_failures": 0,
    "mobile_deep_warnings": 0,
    "artifact_present": true,
    "artifact_score_100": 99,
    "task_score": 19.8,
    "task_score_max": 20,
    "quality_score": 99,
    "quality_cap_reason": "",
    "quality_class": "warn"
  },
  {
    "suite": "publish",
    "model": "codexresponses.gpt-5.4-mini",
    "model_slug": "codexresponses-gpt-5-4-mini",
    "source_kind": "clean-final",
    "label": "skill-with-shell-codexresponses-gpt-5-4-mini-publication-final",
    "eval": "code-review",
    "artifact_path": "results/publish/models/codexresponses-gpt-5-4-mini/artifacts/code-review.html",
    "screenshot_desktop_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/code-review-desktop.png",
    "screenshot_mobile_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/code-review-mobile.png",
    "screenshot_deep_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/code-review-deep.png",
    "screenshot_mobile_deep_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/code-review-mobile-deep.png",
    "artifact_bytes": 40247,
    "generation_ok": true,
    "generation_duration_s": 251.091,
    "input_tokens": 1602209,
    "output_tokens": 16541,
    "total_tokens": 1618750,
    "billing_tokens": 1618750,
    "reasoning_tokens": 10735,
    "tool_use_tokens": 0,
    "cache_read_tokens": 0,
    "cache_write_tokens": 0,
    "cache_hit_tokens": 1516544,
    "total_cache_tokens": 1516544,
    "effective_input_tokens": 85665,
    "display_input_tokens": 1602209,
    "usage_event_count": 24,
    "tool_calls": 39,
    "turn_count": 24,
    "self_check_attempted": true,
    "self_check_ran": true,
    "self_check_succeeded": true,
    "self_check_runs": 3,
    "self_check_failed_runs": 1,
    "self_check_successful_runs": 2,
    "self_correction_edits": 0,
    "self_corrected_after_checker": false,
    "self_correction_verified": true,
    "assistant_turns_trace": 24,
    "self_check_mode": "checker-cli-error,run-checker-cli",
    "self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run python skill/scripts/check_birch_renderings.py --help | sed -n '1,220p' | checker CLI usage error | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-codexresponses-gpt-5-4-mini-publicatio | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-codexres",
    "deterministic_failures": 0,
    "deterministic_warnings": 0,
    "vlm_failures": 0,
    "vlm_warnings": 0,
    "deterministic_failure_units": 0,
    "deterministic_warning_units": 0,
    "vlm_failure_units": 0,
    "vlm_warning_units": 0,
    "desktop_failures": 0,
    "desktop_warnings": 0,
    "mobile_failures": 0,
    "mobile_warnings": 0,
    "deep_failures": 0,
    "deep_warnings": 0,
    "mobile_deep_failures": 0,
    "mobile_deep_warnings": 0,
    "artifact_present": true,
    "artifact_score_100": 100.0,
    "task_score": 20.0,
    "task_score_max": 20,
    "quality_score": 100.0,
    "quality_cap_reason": "",
    "quality_class": "clean"
  },
  {
    "suite": "publish",
    "model": "codexresponses.gpt-5.4-mini",
    "model_slug": "codexresponses-gpt-5-4-mini",
    "source_kind": "clean-final",
    "label": "skill-with-shell-codexresponses-gpt-5-4-mini-publication-final",
    "eval": "module-explainer",
    "artifact_path": "results/publish/models/codexresponses-gpt-5-4-mini/artifacts/module-explainer.html",
    "screenshot_desktop_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/module-explainer-desktop.png",
    "screenshot_mobile_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/module-explainer-mobile.png",
    "screenshot_deep_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/module-explainer-deep.png",
    "screenshot_mobile_deep_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/module-explainer-mobile-deep.png",
    "artifact_bytes": 51503,
    "generation_ok": true,
    "generation_duration_s": 228.357,
    "input_tokens": 538144,
    "output_tokens": 20613,
    "total_tokens": 558757,
    "billing_tokens": 558757,
    "reasoning_tokens": 12973,
    "tool_use_tokens": 0,
    "cache_read_tokens": 0,
    "cache_write_tokens": 0,
    "cache_hit_tokens": 489472,
    "total_cache_tokens": 489472,
    "effective_input_tokens": 48672,
    "display_input_tokens": 538144,
    "usage_event_count": 14,
    "tool_calls": 29,
    "turn_count": 14,
    "self_check_attempted": true,
    "self_check_ran": true,
    "self_check_succeeded": true,
    "self_check_runs": 2,
    "self_check_failed_runs": 0,
    "self_check_successful_runs": 2,
    "self_correction_edits": 0,
    "self_corrected_after_checker": false,
    "self_correction_verified": false,
    "assistant_turns_trace": 14,
    "self_check_mode": "checker-shell-reference,read-checker,run-checker-cli",
    "self_check_evidence": "read /home/shaun/source/birch-html/scripts/check_birch_renderings.py | shell referenced checker: rg -n \"^def (contract_findings|compare_stats|screenshot_findings|artifact_screenshot_findings|geometry_findings|render_markdown|capture|find_chrome|capture_height_for_viewport|css_ | ran checker CLI: mkdir -p /home/shaun/source/birch-html/eval-runs/skill-with-shell-codexresponses-gpt-5-4-mini-publication-final && cat > /home/shaun/source/birch-html/eval-runs/skill-with-shell-co | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-codexresponses-gpt-5-4-mini-publication-fina",
    "deterministic_failures": 0,
    "deterministic_warnings": 0,
    "vlm_failures": 0,
    "vlm_warnings": 0,
    "deterministic_failure_units": 0,
    "deterministic_warning_units": 0,
    "vlm_failure_units": 0,
    "vlm_warning_units": 0,
    "desktop_failures": 0,
    "desktop_warnings": 0,
    "mobile_failures": 0,
    "mobile_warnings": 0,
    "deep_failures": 0,
    "deep_warnings": 0,
    "mobile_deep_failures": 0,
    "mobile_deep_warnings": 0,
    "artifact_present": true,
    "artifact_score_100": 100.0,
    "task_score": 20.0,
    "task_score_max": 20,
    "quality_score": 100.0,
    "quality_cap_reason": "",
    "quality_class": "clean"
  },
  {
    "suite": "publish",
    "model": "codexresponses.gpt-5.4-mini",
    "model_slug": "codexresponses-gpt-5-4-mini",
    "source_kind": "clean-final",
    "label": "skill-with-shell-codexresponses-gpt-5-4-mini-publication-final",
    "eval": "implementation-plan",
    "artifact_path": "results/publish/models/codexresponses-gpt-5-4-mini/artifacts/implementation-plan.html",
    "screenshot_desktop_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/implementation-plan-desktop.png",
    "screenshot_mobile_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/implementation-plan-mobile.png",
    "screenshot_deep_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/implementation-plan-deep.png",
    "screenshot_mobile_deep_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/implementation-plan-mobile-deep.png",
    "artifact_bytes": 48838,
    "generation_ok": true,
    "generation_duration_s": 249.193,
    "input_tokens": 122451,
    "output_tokens": 13529,
    "total_tokens": 135980,
    "billing_tokens": 135980,
    "reasoning_tokens": 8129,
    "tool_use_tokens": 0,
    "cache_read_tokens": 0,
    "cache_write_tokens": 0,
    "cache_hit_tokens": 103936,
    "total_cache_tokens": 103936,
    "effective_input_tokens": 18515,
    "display_input_tokens": 122451,
    "usage_event_count": 8,
    "tool_calls": 11,
    "turn_count": 8,
    "self_check_attempted": true,
    "self_check_ran": true,
    "self_check_succeeded": true,
    "self_check_runs": 2,
    "self_check_failed_runs": 1,
    "self_check_successful_runs": 1,
    "self_correction_edits": 0,
    "self_corrected_after_checker": false,
    "self_correction_verified": true,
    "assistant_turns_trace": 8,
    "self_check_mode": "run-checker-cli",
    "self_check_evidence": "ran checker CLI: cat > /home/shaun/source/birch-html/eval-runs/skill-with-shell-codexresponses-gpt-5-4-mini-publication-final/implementation-plan.html <<'EOF'\n<!doctype html>\n<html lang=\"en\">\n<head | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-codexresponses-gpt-5-4-mini-publicatio | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-codexres",
    "deterministic_failures": 0,
    "deterministic_warnings": 0,
    "vlm_failures": 0,
    "vlm_warnings": 0,
    "deterministic_failure_units": 0,
    "deterministic_warning_units": 0,
    "vlm_failure_units": 0,
    "vlm_warning_units": 0,
    "desktop_failures": 0,
    "desktop_warnings": 0,
    "mobile_failures": 0,
    "mobile_warnings": 0,
    "deep_failures": 0,
    "deep_warnings": 0,
    "mobile_deep_failures": 0,
    "mobile_deep_warnings": 0,
    "artifact_present": true,
    "artifact_score_100": 100.0,
    "task_score": 20.0,
    "task_score_max": 20,
    "quality_score": 100.0,
    "quality_cap_reason": "",
    "quality_class": "clean"
  },
  {
    "suite": "publish",
    "model": "codexresponses.gpt-5.4-mini",
    "model_slug": "codexresponses-gpt-5-4-mini",
    "source_kind": "clean-final",
    "label": "skill-with-shell-codexresponses-gpt-5-4-mini-publication-final",
    "eval": "benchmark-comparison",
    "artifact_path": "results/publish/models/codexresponses-gpt-5-4-mini/artifacts/benchmark-comparison.html",
    "screenshot_desktop_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/benchmark-comparison-desktop.png",
    "screenshot_mobile_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/benchmark-comparison-mobile.png",
    "screenshot_deep_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/benchmark-comparison-deep.png",
    "screenshot_mobile_deep_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/benchmark-comparison-mobile-deep.png",
    "artifact_bytes": 55271,
    "generation_ok": true,
    "generation_duration_s": 193.592,
    "input_tokens": 280048,
    "output_tokens": 17564,
    "total_tokens": 297612,
    "billing_tokens": 297612,
    "reasoning_tokens": 9912,
    "tool_use_tokens": 0,
    "cache_read_tokens": 0,
    "cache_write_tokens": 0,
    "cache_hit_tokens": 261120,
    "total_cache_tokens": 261120,
    "effective_input_tokens": 18928,
    "display_input_tokens": 280048,
    "usage_event_count": 14,
    "tool_calls": 18,
    "turn_count": 14,
    "self_check_attempted": true,
    "self_check_ran": true,
    "self_check_succeeded": true,
    "self_check_runs": 4,
    "self_check_failed_runs": 3,
    "self_check_successful_runs": 1,
    "self_correction_edits": 0,
    "self_corrected_after_checker": false,
    "self_correction_verified": true,
    "assistant_turns_trace": 14,
    "self_check_mode": "run-checker-cli",
    "self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && mkdir -p eval-runs/skill-with-shell-codexresponses-gpt-5-4-mini-publication-final && uv run --with matplotlib python - <<'PY'\nfrom pathlib impor | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-codexresponses-gpt-5-4-mini-publicatio | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-codexres | ran checker CLI: python3 - <<'PY'\nfrom pathlib import Path\npath = Path('/home/shaun/source/birch-html/eval-runs/skill-with-shell-codexresponses-gpt-5-4-mini-publication-final/benchmark-comparison.h | ran checker CLI: python3 - <<'PY'\nfrom pathlib import Path\nimport re\npath = Path('/home/shaun/source/birch-html/eval-runs/skill-with-shell-codexresponses-gpt-5-4-mini-publication-final/benchmark-co",
    "deterministic_failures": 0,
    "deterministic_warnings": 0,
    "vlm_failures": 0,
    "vlm_warnings": 0,
    "deterministic_failure_units": 0,
    "deterministic_warning_units": 0,
    "vlm_failure_units": 0,
    "vlm_warning_units": 0,
    "desktop_failures": 0,
    "desktop_warnings": 0,
    "mobile_failures": 0,
    "mobile_warnings": 0,
    "deep_failures": 0,
    "deep_warnings": 0,
    "mobile_deep_failures": 0,
    "mobile_deep_warnings": 0,
    "artifact_present": true,
    "artifact_score_100": 100.0,
    "task_score": 20.0,
    "task_score_max": 20,
    "quality_score": 100.0,
    "quality_cap_reason": "",
    "quality_class": "clean"
  },
  {
    "suite": "publish",
    "model": "codexresponses.gpt-5.5",
    "model_slug": "codexresponses-gpt-5-5",
    "source_kind": "clean-final",
    "label": "skill-with-shell-codexresponses-gpt-5-5-publication-final",
    "eval": "numeric-data",
    "artifact_path": "results/publish/models/codexresponses-gpt-5-5/artifacts/numeric-data.html",
    "screenshot_desktop_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/numeric-data-desktop.png",
    "screenshot_mobile_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/numeric-data-mobile.png",
    "screenshot_deep_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/numeric-data-deep.png",
    "screenshot_mobile_deep_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/numeric-data-mobile-deep.png",
    "artifact_bytes": 41967,
    "generation_ok": true,
    "generation_duration_s": 118.283,
    "input_tokens": 95354,
    "output_tokens": 5337,
    "total_tokens": 100691,
    "billing_tokens": 100691,
    "reasoning_tokens": 402,
    "tool_use_tokens": 0,
    "cache_read_tokens": 0,
    "cache_write_tokens": 0,
    "cache_hit_tokens": 78848,
    "total_cache_tokens": 78848,
    "effective_input_tokens": 16506,
    "display_input_tokens": 95354,
    "usage_event_count": 10,
    "tool_calls": 10,
    "turn_count": 10,
    "self_check_attempted": true,
    "self_check_ran": true,
    "self_check_succeeded": true,
    "self_check_runs": 2,
    "self_check_failed_runs": 1,
    "self_check_successful_runs": 1,
    "self_correction_edits": 1,
    "self_corrected_after_checker": true,
    "self_correction_verified": true,
    "assistant_turns_trace": 10,
    "self_check_mode": "run-checker-cli",
    "self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-codexresponses-gpt-5-5-publication-fin | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-codexres",
    "deterministic_failures": 0,
    "deterministic_warnings": 0,
    "vlm_failures": 0,
    "vlm_warnings": 0,
    "deterministic_failure_units": 0,
    "deterministic_warning_units": 0,
    "vlm_failure_units": 0,
    "vlm_warning_units": 0,
    "desktop_failures": 0,
    "desktop_warnings": 0,
    "mobile_failures": 0,
    "mobile_warnings": 0,
    "deep_failures": 0,
    "deep_warnings": 0,
    "mobile_deep_failures": 0,
    "mobile_deep_warnings": 0,
    "artifact_present": true,
    "artifact_score_100": 100.0,
    "task_score": 20.0,
    "task_score_max": 20,
    "quality_score": 100.0,
    "quality_cap_reason": "",
    "quality_class": "clean"
  },
  {
    "suite": "publish",
    "model": "codexresponses.gpt-5.5",
    "model_slug": "codexresponses-gpt-5-5",
    "source_kind": "clean-final",
    "label": "skill-with-shell-codexresponses-gpt-5-5-publication-final",
    "eval": "code-review",
    "artifact_path": "results/publish/models/codexresponses-gpt-5-5/artifacts/code-review.html",
    "screenshot_desktop_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/code-review-desktop.png",
    "screenshot_mobile_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/code-review-mobile.png",
    "screenshot_deep_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/code-review-deep.png",
    "screenshot_mobile_deep_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/code-review-mobile-deep.png",
    "artifact_bytes": 44204,
    "generation_ok": true,
    "generation_duration_s": 164.43,
    "input_tokens": 388756,
    "output_tokens": 7268,
    "total_tokens": 396024,
    "billing_tokens": 396024,
    "reasoning_tokens": 2335,
    "tool_use_tokens": 0,
    "cache_read_tokens": 0,
    "cache_write_tokens": 0,
    "cache_hit_tokens": 346624,
    "total_cache_tokens": 346624,
    "effective_input_tokens": 42132,
    "display_input_tokens": 388756,
    "usage_event_count": 16,
    "tool_calls": 22,
    "turn_count": 16,
    "self_check_attempted": true,
    "self_check_ran": true,
    "self_check_succeeded": true,
    "self_check_runs": 3,
    "self_check_failed_runs": 0,
    "self_check_successful_runs": 3,
    "self_correction_edits": 0,
    "self_corrected_after_checker": false,
    "self_correction_verified": false,
    "assistant_turns_trace": 16,
    "self_check_mode": "checker-cli-error,run-checker-cli",
    "self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-codexresponses-gpt-5-5-publication-fin | checker CLI usage error | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-codexres",
    "deterministic_failures": 0,
    "deterministic_warnings": 0,
    "vlm_failures": 0,
    "vlm_warnings": 0,
    "deterministic_failure_units": 0,
    "deterministic_warning_units": 0,
    "vlm_failure_units": 0,
    "vlm_warning_units": 0,
    "desktop_failures": 0,
    "desktop_warnings": 0,
    "mobile_failures": 0,
    "mobile_warnings": 0,
    "deep_failures": 0,
    "deep_warnings": 0,
    "mobile_deep_failures": 0,
    "mobile_deep_warnings": 0,
    "artifact_present": true,
    "artifact_score_100": 100.0,
    "task_score": 20.0,
    "task_score_max": 20,
    "quality_score": 100.0,
    "quality_cap_reason": "",
    "quality_class": "clean"
  },
  {
    "suite": "publish",
    "model": "codexresponses.gpt-5.5",
    "model_slug": "codexresponses-gpt-5-5",
    "source_kind": "clean-final",
    "label": "skill-with-shell-codexresponses-gpt-5-5-publication-final",
    "eval": "module-explainer",
    "artifact_path": "results/publish/models/codexresponses-gpt-5-5/artifacts/module-explainer.html",
    "screenshot_desktop_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/module-explainer-desktop.png",
    "screenshot_mobile_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/module-explainer-mobile.png",
    "screenshot_deep_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/module-explainer-deep.png",
    "screenshot_mobile_deep_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/module-explainer-mobile-deep.png",
    "artifact_bytes": 57189,
    "generation_ok": true,
    "generation_duration_s": 178.972,
    "input_tokens": 450726,
    "output_tokens": 9063,
    "total_tokens": 459789,
    "billing_tokens": 459789,
    "reasoning_tokens": 477,
    "tool_use_tokens": 0,
    "cache_read_tokens": 0,
    "cache_write_tokens": 0,
    "cache_hit_tokens": 400896,
    "total_cache_tokens": 400896,
    "effective_input_tokens": 49830,
    "display_input_tokens": 450726,
    "usage_event_count": 14,
    "tool_calls": 25,
    "turn_count": 14,
    "self_check_attempted": true,
    "self_check_ran": true,
    "self_check_succeeded": true,
    "self_check_runs": 2,
    "self_check_failed_runs": 0,
    "self_check_successful_runs": 2,
    "self_correction_edits": 0,
    "self_corrected_after_checker": false,
    "self_correction_verified": false,
    "assistant_turns_trace": 14,
    "self_check_mode": "read-checker,run-checker-cli",
    "self_check_evidence": "read /home/shaun/source/birch-html/scripts/check_birch_renderings.py | ran checker CLI: cat > /home/shaun/source/birch-html/eval-runs/skill-with-shell-codexresponses-gpt-5-5-publication-final/module-explainer.html <<'EOF'\n<!doctype html>\n<html lang=\"en\">\n<head>\n  <met | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-codexresponses-gpt-5-5-publication-final/mod | ran checker CLI: python3 - <<'PY'\nfrom pathlib import Path\np=Path('/home/shaun/source/birch-html/eval-runs/skill-with-shell-codexresponses-gpt-5-5-publication-final/module-explainer.html')\ns=p.read",
    "deterministic_failures": 0,
    "deterministic_warnings": 0,
    "vlm_failures": 1,
    "vlm_warnings": 1,
    "deterministic_failure_units": 0,
    "deterministic_warning_units": 0,
    "vlm_failure_units": 1,
    "vlm_warning_units": 1,
    "desktop_failures": 0,
    "desktop_warnings": 0,
    "mobile_failures": 0,
    "mobile_warnings": 0,
    "deep_failures": 0,
    "deep_warnings": 0,
    "mobile_deep_failures": 0,
    "mobile_deep_warnings": 0,
    "artifact_present": true,
    "artifact_score_100": 91,
    "task_score": 18.2,
    "task_score_max": 20,
    "quality_score": 91,
    "quality_cap_reason": "",
    "quality_class": "fail"
  },
  {
    "suite": "publish",
    "model": "codexresponses.gpt-5.5",
    "model_slug": "codexresponses-gpt-5-5",
    "source_kind": "clean-final",
    "label": "skill-with-shell-codexresponses-gpt-5-5-publication-final",
    "eval": "implementation-plan",
    "artifact_path": "results/publish/models/codexresponses-gpt-5-5/artifacts/implementation-plan.html",
    "screenshot_desktop_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/implementation-plan-desktop.png",
    "screenshot_mobile_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/implementation-plan-mobile.png",
    "screenshot_deep_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/implementation-plan-deep.png",
    "screenshot_mobile_deep_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/implementation-plan-mobile-deep.png",
    "artifact_bytes": 49708,
    "generation_ok": true,
    "generation_duration_s": 144.313,
    "input_tokens": 129170,
    "output_tokens": 6893,
    "total_tokens": 136063,
    "billing_tokens": 136063,
    "reasoning_tokens": 369,
    "tool_use_tokens": 0,
    "cache_read_tokens": 0,
    "cache_write_tokens": 0,
    "cache_hit_tokens": 91136,
    "total_cache_tokens": 91136,
    "effective_input_tokens": 38034,
    "display_input_tokens": 129170,
    "usage_event_count": 11,
    "tool_calls": 13,
    "turn_count": 11,
    "self_check_attempted": true,
    "self_check_ran": true,
    "self_check_succeeded": true,
    "self_check_runs": 3,
    "self_check_failed_runs": 1,
    "self_check_successful_runs": 2,
    "self_correction_edits": 0,
    "self_corrected_after_checker": false,
    "self_correction_verified": true,
    "assistant_turns_trace": 11,
    "self_check_mode": "run-checker-cli",
    "self_check_evidence": "ran checker CLI: mkdir -p /home/shaun/source/birch-html/eval-runs/skill-with-shell-codexresponses-gpt-5-5-publication-final && cat > /home/shaun/source/birch-html/eval-runs/skill-with-shell-codexre | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-codexresponses-gpt-5-5-publication-fin | ran checker CLI: cd /home/shaun/source/birch-html && python3 - <<'PY'\nfrom pathlib import Path\np=Path('eval-runs/skill-with-shell-codexresponses-gpt-5-5-publication-final/implementation-plan.html')",
    "deterministic_failures": 0,
    "deterministic_warnings": 0,
    "vlm_failures": 0,
    "vlm_warnings": 0,
    "deterministic_failure_units": 0,
    "deterministic_warning_units": 0,
    "vlm_failure_units": 0,
    "vlm_warning_units": 0,
    "desktop_failures": 0,
    "desktop_warnings": 0,
    "mobile_failures": 0,
    "mobile_warnings": 0,
    "deep_failures": 0,
    "deep_warnings": 0,
    "mobile_deep_failures": 0,
    "mobile_deep_warnings": 0,
    "artifact_present": true,
    "artifact_score_100": 100.0,
    "task_score": 20.0,
    "task_score_max": 20,
    "quality_score": 100.0,
    "quality_cap_reason": "",
    "quality_class": "clean"
  },
  {
    "suite": "publish",
    "model": "codexresponses.gpt-5.5",
    "model_slug": "codexresponses-gpt-5-5",
    "source_kind": "clean-final",
    "label": "skill-with-shell-codexresponses-gpt-5-5-publication-final",
    "eval": "benchmark-comparison",
    "artifact_path": "results/publish/models/codexresponses-gpt-5-5/artifacts/benchmark-comparison.html",
    "screenshot_desktop_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/benchmark-comparison-desktop.png",
    "screenshot_mobile_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/benchmark-comparison-mobile.png",
    "screenshot_deep_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/benchmark-comparison-deep.png",
    "screenshot_mobile_deep_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/benchmark-comparison-mobile-deep.png",
    "artifact_bytes": 53609,
    "generation_ok": true,
    "generation_duration_s": 142.604,
    "input_tokens": 126650,
    "output_tokens": 6524,
    "total_tokens": 133174,
    "billing_tokens": 133174,
    "reasoning_tokens": 491,
    "tool_use_tokens": 0,
    "cache_read_tokens": 0,
    "cache_write_tokens": 0,
    "cache_hit_tokens": 101376,
    "total_cache_tokens": 101376,
    "effective_input_tokens": 25274,
    "display_input_tokens": 126650,
    "usage_event_count": 11,
    "tool_calls": 13,
    "turn_count": 11,
    "self_check_attempted": true,
    "self_check_ran": true,
    "self_check_succeeded": true,
    "self_check_runs": 1,
    "self_check_failed_runs": 0,
    "self_check_successful_runs": 1,
    "self_correction_edits": 0,
    "self_corrected_after_checker": false,
    "self_correction_verified": false,
    "assistant_turns_trace": 11,
    "self_check_mode": "run-checker-cli",
    "self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && python3 - <<'PY'\nfrom pathlib import Path\np=Path('eval-runs/skill-with-shell-codexresponses-gpt-5-5-publication-final/benchmark-comparison.html' | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-codexres",
    "deterministic_failures": 0,
    "deterministic_warnings": 0,
    "vlm_failures": 0,
    "vlm_warnings": 0,
    "deterministic_failure_units": 0,
    "deterministic_warning_units": 0,
    "vlm_failure_units": 0,
    "vlm_warning_units": 0,
    "desktop_failures": 0,
    "desktop_warnings": 0,
    "mobile_failures": 0,
    "mobile_warnings": 0,
    "deep_failures": 0,
    "deep_warnings": 0,
    "mobile_deep_failures": 0,
    "mobile_deep_warnings": 0,
    "artifact_present": true,
    "artifact_score_100": 100.0,
    "task_score": 20.0,
    "task_score_max": 20,
    "quality_score": 100.0,
    "quality_cap_reason": "",
    "quality_class": "clean"
  },
  {
    "suite": "publish",
    "model": "codexspark",
    "model_slug": "codexspark",
    "source_kind": "clean-final",
    "label": "skill-with-shell-codexspark-publication-final",
    "eval": "numeric-data",
    "artifact_path": "results/publish/models/codexspark/artifacts/numeric-data.html",
    "screenshot_desktop_path": "results/publish/models/codexspark/reports/screenshots/numeric-data-desktop.png",
    "screenshot_mobile_path": "results/publish/models/codexspark/reports/screenshots/numeric-data-mobile.png",
    "screenshot_deep_path": "results/publish/models/codexspark/reports/screenshots/numeric-data-deep.png",
    "screenshot_mobile_deep_path": "results/publish/models/codexspark/reports/screenshots/numeric-data-mobile-deep.png",
    "artifact_bytes": 17281,
    "generation_ok": true,
    "generation_duration_s": 82.34,
    "input_tokens": 825347,
    "output_tokens": 23923,
    "total_tokens": 849270,
    "billing_tokens": 849270,
    "reasoning_tokens": 13374,
    "tool_use_tokens": 0,
    "cache_read_tokens": 0,
    "cache_write_tokens": 0,
    "cache_hit_tokens": 770688,
    "total_cache_tokens": 770688,
    "effective_input_tokens": 54659,
    "display_input_tokens": 825347,
    "usage_event_count": 32,
    "tool_calls": 31,
    "turn_count": 32,
    "self_check_attempted": false,
    "self_check_ran": false,
    "self_check_succeeded": false,
    "self_check_runs": 0,
    "self_check_failed_runs": 0,
    "self_check_successful_runs": 0,
    "self_correction_edits": 0,
    "self_corrected_after_checker": false,
    "self_correction_verified": false,
    "assistant_turns_trace": 32,
    "self_check_mode": "",
    "self_check_evidence": "",
    "deterministic_failures": 6,
    "deterministic_warnings": 2,
    "vlm_failures": 1,
    "vlm_warnings": 0,
    "deterministic_failure_units": 2,
    "deterministic_warning_units": 1,
    "vlm_failure_units": 1,
    "vlm_warning_units": 0,
    "desktop_failures": 1,
    "desktop_warnings": 1,
    "mobile_failures": 2,
    "mobile_warnings": 0,
    "deep_failures": 1,
    "deep_warnings": 1,
    "mobile_deep_failures": 2,
    "mobile_deep_warnings": 0,
    "artifact_present": true,
    "artifact_score_100": 35.0,
    "task_score": 7.0,
    "task_score_max": 20,
    "quality_score": 35.0,
    "quality_cap_reason": "missing_birch_css",
    "quality_class": "fail"
  },
  {
    "suite": "publish",
    "model": "codexspark",
    "model_slug": "codexspark",
    "source_kind": "clean-final",
    "label": "skill-with-shell-codexspark-publication-final",
    "eval": "code-review",
    "artifact_path": "results/publish/models/codexspark/artifacts/code-review.html",
    "screenshot_desktop_path": "results/publish/models/codexspark/reports/screenshots/code-review-desktop.png",
    "screenshot_mobile_path": "results/publish/models/codexspark/reports/screenshots/code-review-mobile.png",
    "screenshot_deep_path": "results/publish/models/codexspark/reports/screenshots/code-review-deep.png",
    "screenshot_mobile_deep_path": "results/publish/models/codexspark/reports/screenshots/code-review-mobile-deep.png",
    "artifact_bytes": 9658,
    "generation_ok": false,
    "generation_duration_s": 60.395,
    "input_tokens": 1737615,
    "output_tokens": 21291,
    "total_tokens": 1758906,
    "billing_tokens": 1758906,
    "reasoning_tokens": 17081,
    "tool_use_tokens": 0,
    "cache_read_tokens": 0,
    "cache_write_tokens": 0,
    "cache_hit_tokens": 1702656,
    "total_cache_tokens": 1702656,
    "effective_input_tokens": 86941,
    "display_input_tokens": 1789597,
    "usage_event_count": 41,
    "tool_calls": 32,
    "turn_count": 26,
    "self_check_attempted": true,
    "self_check_ran": true,
    "self_check_succeeded": true,
    "self_check_runs": 3,
    "self_check_failed_runs": 0,
    "self_check_successful_runs": 3,
    "self_correction_edits": 0,
    "self_corrected_after_checker": false,
    "self_correction_verified": false,
    "assistant_turns_trace": 41,
    "self_check_mode": "checker-shell-reference,read-checker",
    "self_check_evidence": "read /home/shaun/source/birch-html/scripts/check_birch_renderings.py | shell referenced checker: nl -ba /home/shaun/source/birch-html/scripts/check_birch_renderings.py | sed -n '1,260p' | shell referenced checker: nl -ba /home/shaun/source/birch-html/scripts/check_birch_renderings.py | sed -n '260,560p' | shell referenced checker: nl -ba /home/shaun/source/birch-html/scripts/check_birch_renderings.py | sed -n '560,920p' | shell referenced checker: nl -ba /home/shaun/source/birch-html/scripts/check_birch_renderings.py | sed -n '920,1320p'",
    "deterministic_failures": 8,
    "deterministic_warnings": 0,
    "vlm_failures": 0,
    "vlm_warnings": 0,
    "deterministic_failure_units": 2,
    "deterministic_warning_units": 0,
    "vlm_failure_units": 0,
    "vlm_warning_units": 0,
    "desktop_failures": 2,
    "desktop_warnings": 0,
    "mobile_failures": 2,
    "mobile_warnings": 0,
    "deep_failures": 2,
    "deep_warnings": 0,
    "mobile_deep_failures": 2,
    "mobile_deep_warnings": 0,
    "artifact_present": true,
    "artifact_score_100": 35.0,
    "task_score": 7.0,
    "task_score_max": 20,
    "quality_score": 35.0,
    "quality_cap_reason": "missing_birch_css",
    "quality_class": "fail"
  },
  {
    "suite": "publish",
    "model": "codexspark",
    "model_slug": "codexspark",
    "source_kind": "clean-final",
    "label": "skill-with-shell-codexspark-publication-final",
    "eval": "module-explainer",
    "artifact_path": "results/publish/models/codexspark/artifacts/module-explainer.html",
    "screenshot_desktop_path": "results/publish/models/codexspark/reports/screenshots/module-explainer-desktop.png",
    "screenshot_mobile_path": "results/publish/models/codexspark/reports/screenshots/module-explainer-mobile.png",
    "screenshot_deep_path": "results/publish/models/codexspark/reports/screenshots/module-explainer-deep.png",
    "screenshot_mobile_deep_path": "results/publish/models/codexspark/reports/screenshots/module-explainer-mobile-deep.png",
    "artifact_bytes": 16366,
    "generation_ok": false,
    "generation_duration_s": 87.747,
    "input_tokens": 2740590,
    "output_tokens": 27049,
    "total_tokens": 2767639,
    "billing_tokens": 2767639,
    "reasoning_tokens": 15704,
    "tool_use_tokens": 0,
    "cache_read_tokens": 0,
    "cache_write_tokens": 0,
    "cache_hit_tokens": 2024320,
    "total_cache_tokens": 2024320,
    "effective_input_tokens": 202803,
    "display_input_tokens": 2227123,
    "usage_event_count": 35,
    "tool_calls": 51,
    "turn_count": 42,
    "self_check_attempted": true,
    "self_check_ran": false,
    "self_check_succeeded": false,
    "self_check_runs": 0,
    "self_check_failed_runs": 0,
    "self_check_successful_runs": 0,
    "self_correction_edits": 0,
    "self_corrected_after_checker": false,
    "self_correction_verified": false,
    "assistant_turns_trace": 35,
    "self_check_mode": "checker-shell-reference,read-checker,run-checker-cli",
    "self_check_evidence": "read /home/shaun/source/birch-html/scripts/check_birch_renderings.py | shell referenced checker: cd /home/shaun/source/birch-html && rg -n \"class\\s*=\\\"(flow-node|flow-edge|flow-list|flow-step|metric-row|chart-panel|finding|code-block|copyable|timeline)\" styles/birch-system.css | shell referenced checker: cd /home/shaun/source/birch-html && wc -l scripts/check_birch_renderings.py | shell referenced checker: cd /home/shaun/source/birch-html && mkdir -p eval-runs/skill-with-shell-codexspark-publication-final && cat > eval-runs/skill-with-shell-codexspark-publication-final/module-explain | ran checker CLI: cd /home/shaun/source/birch-html && cat > eval-runs/skill-with-shell-codexspark-publication-final/module-explainer.html <<'EOF'\n<!doctype html>\n<html lang=\"en\">\n<head>\n  <meta char",
    "deterministic_failures": 0,
    "deterministic_warnings": 4,
    "vlm_failures": 4,
    "vlm_warnings": 0,
    "deterministic_failure_units": 0,
    "deterministic_warning_units": 1,
    "vlm_failure_units": 1,
    "vlm_warning_units": 0,
    "desktop_failures": 0,
    "desktop_warnings": 1,
    "mobile_failures": 0,
    "mobile_warnings": 1,
    "deep_failures": 0,
    "deep_warnings": 1,
    "mobile_deep_failures": 0,
    "mobile_deep_warnings": 1,
    "artifact_present": true,
    "artifact_score_100": 91,
    "task_score": 18.2,
    "task_score_max": 20,
    "quality_score": 91,
    "quality_cap_reason": "",
    "quality_class": "fail"
  },
  {
    "suite": "publish",
    "model": "codexspark",
    "model_slug": "codexspark",
    "source_kind": "clean-final",
    "label": "skill-with-shell-codexspark-publication-final",
    "eval": "implementation-plan",
    "artifact_path": "results/publish/models/codexspark/artifacts/implementation-plan.html",
    "screenshot_desktop_path": "results/publish/models/codexspark/reports/screenshots/implementation-plan-desktop.png",
    "screenshot_mobile_path": "results/publish/models/codexspark/reports/screenshots/implementation-plan-mobile.png",
    "screenshot_deep_path": "results/publish/models/codexspark/reports/screenshots/implementation-plan-deep.png",
    "screenshot_mobile_deep_path": "results/publish/models/codexspark/reports/screenshots/implementation-plan-mobile-deep.png",
    "artifact_bytes": 46864,
    "generation_ok": true,
    "generation_duration_s": 91.953,
    "input_tokens": 1108319,
    "output_tokens": 14746,
    "total_tokens": 1123065,
    "billing_tokens": 1123065,
    "reasoning_tokens": 8043,
    "tool_use_tokens": 0,
    "cache_read_tokens": 0,
    "cache_write_tokens": 0,
    "cache_hit_tokens": 1055232,
    "total_cache_tokens": 1055232,
    "effective_input_tokens": 53087,
    "display_input_tokens": 1108319,
    "usage_event_count": 35,
    "tool_calls": 37,
    "turn_count": 35,
    "self_check_attempted": true,
    "self_check_ran": true,
    "self_check_succeeded": true,
    "self_check_runs": 2,
    "self_check_failed_runs": 0,
    "self_check_successful_runs": 2,
    "self_correction_edits": 0,
    "self_corrected_after_checker": false,
    "self_correction_verified": false,
    "assistant_turns_trace": 35,
    "self_check_mode": "checker-cli-error,checker-shell-reference,read-checker,run-checker-cli",
    "self_check_evidence": "read /home/shaun/source/birch-html/skill/scripts/check_birch_renderings.py | ran checker CLI: cat > /home/shaun/source/birch-html/eval-runs/skill-with-shell-codexspark-publication-final/implementation-plan.html <<'EOF'\n<!doctype html>\n<html lang=\"en\">\n<head>\n  <meta charset | ran checker CLI: python3 /home/shaun/source/birch-html/skill/scripts/check_birch_renderings.py --help | head -n 120 | checker CLI usage error | ran checker CLI: cd /home/shaun/source/birch-html && uv run skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-codexspark-publication-final/implementation-plan.html --no- | ran checker CLI: python - <<'PY'\nfrom pathlib import Path\nfrom inspect import getsourcelines\nimport importlib.util\np=Path('/home/shaun/source/birch-html/skill/scripts/check_birch_renderings.py')\nte",
    "deterministic_failures": 0,
    "deterministic_warnings": 0,
    "vlm_failures": 0,
    "vlm_warnings": 0,
    "deterministic_failure_units": 0,
    "deterministic_warning_units": 0,
    "vlm_failure_units": 0,
    "vlm_warning_units": 0,
    "desktop_failures": 0,
    "desktop_warnings": 0,
    "mobile_failures": 0,
    "mobile_warnings": 0,
    "deep_failures": 0,
    "deep_warnings": 0,
    "mobile_deep_failures": 0,
    "mobile_deep_warnings": 0,
    "artifact_present": true,
    "artifact_score_100": 100.0,
    "task_score": 20.0,
    "task_score_max": 20,
    "quality_score": 100.0,
    "quality_cap_reason": "",
    "quality_class": "clean"
  },
  {
    "suite": "publish",
    "model": "codexspark",
    "model_slug": "codexspark",
    "source_kind": "clean-final",
    "label": "skill-with-shell-codexspark-publication-final",
    "eval": "benchmark-comparison",
    "artifact_path": "results/publish/models/codexspark/artifacts/benchmark-comparison.html",
    "screenshot_desktop_path": "results/publish/models/codexspark/reports/screenshots/benchmark-comparison-desktop.png",
    "screenshot_mobile_path": "results/publish/models/codexspark/reports/screenshots/benchmark-comparison-mobile.png",
    "screenshot_deep_path": "results/publish/models/codexspark/reports/screenshots/benchmark-comparison-deep.png",
    "screenshot_mobile_deep_path": "results/publish/models/codexspark/reports/screenshots/benchmark-comparison-mobile-deep.png",
    "artifact_bytes": 55786,
    "generation_ok": true,
    "generation_duration_s": 41.038,
    "input_tokens": 681289,
    "output_tokens": 5651,
    "total_tokens": 686940,
    "billing_tokens": 686940,
    "reasoning_tokens": 4100,
    "tool_use_tokens": 0,
    "cache_read_tokens": 0,
    "cache_write_tokens": 0,
    "cache_hit_tokens": 628224,
    "total_cache_tokens": 628224,
    "effective_input_tokens": 53065,
    "display_input_tokens": 681289,
    "usage_event_count": 24,
    "tool_calls": 23,
    "turn_count": 24,
    "self_check_attempted": false,
    "self_check_ran": false,
    "self_check_succeeded": false,
    "self_check_runs": 0,
    "self_check_failed_runs": 0,
    "self_check_successful_runs": 0,
    "self_correction_edits": 0,
    "self_corrected_after_checker": false,
    "self_correction_verified": false,
    "assistant_turns_trace": 24,
    "self_check_mode": "",
    "self_check_evidence": "",
    "deterministic_failures": 0,
    "deterministic_warnings": 0,
    "vlm_failures": 0,
    "vlm_warnings": 0,
    "deterministic_failure_units": 0,
    "deterministic_warning_units": 0,
    "vlm_failure_units": 0,
    "vlm_warning_units": 0,
    "desktop_failures": 0,
    "desktop_warnings": 0,
    "mobile_failures": 0,
    "mobile_warnings": 0,
    "deep_failures": 0,
    "deep_warnings": 0,
    "mobile_deep_failures": 0,
    "mobile_deep_warnings": 0,
    "artifact_present": true,
    "artifact_score_100": 100.0,
    "task_score": 20.0,
    "task_score_max": 20,
    "quality_score": 100.0,
    "quality_cap_reason": "",
    "quality_class": "clean"
  },
  {
    "suite": "publish",
    "model": "deepseek",
    "model_slug": "deepseek",
    "source_kind": "clean-final",
    "label": "skill-with-shell-deepseek-publication-final",
    "eval": "numeric-data",
    "artifact_path": "results/publish/models/deepseek/artifacts/numeric-data.html",
    "screenshot_desktop_path": "results/publish/models/deepseek/reports/screenshots/numeric-data-desktop.png",
    "screenshot_mobile_path": "results/publish/models/deepseek/reports/screenshots/numeric-data-mobile.png",
    "screenshot_deep_path": "results/publish/models/deepseek/reports/screenshots/numeric-data-deep.png",
    "screenshot_mobile_deep_path": "results/publish/models/deepseek/reports/screenshots/numeric-data-mobile-deep.png",
    "artifact_bytes": 62489,
    "generation_ok": true,
    "generation_duration_s": 280.24,
    "input_tokens": 594128,
    "output_tokens": 18097,
    "total_tokens": 612225,
    "billing_tokens": 612225,
    "reasoning_tokens": 0,
    "tool_use_tokens": 0,
    "cache_read_tokens": 0,
    "cache_write_tokens": 0,
    "cache_hit_tokens": 560512,
    "total_cache_tokens": 560512,
    "effective_input_tokens": 33616,
    "display_input_tokens": 594128,
    "usage_event_count": 18,
    "tool_calls": 20,
    "turn_count": 18,
    "self_check_attempted": true,
    "self_check_ran": true,
    "self_check_succeeded": true,
    "self_check_runs": 2,
    "self_check_failed_runs": 1,
    "self_check_successful_runs": 1,
    "self_correction_edits": 0,
    "self_corrected_after_checker": false,
    "self_correction_verified": true,
    "assistant_turns_trace": 18,
    "self_check_mode": "run-checker-cli",
    "self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-deepseek-publication-final/numeric-dat | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-deepseek",
    "deterministic_failures": 0,
    "deterministic_warnings": 0,
    "vlm_failures": 0,
    "vlm_warnings": 0,
    "deterministic_failure_units": 0,
    "deterministic_warning_units": 0,
    "vlm_failure_units": 0,
    "vlm_warning_units": 0,
    "desktop_failures": 0,
    "desktop_warnings": 0,
    "mobile_failures": 0,
    "mobile_warnings": 0,
    "deep_failures": 0,
    "deep_warnings": 0,
    "mobile_deep_failures": 0,
    "mobile_deep_warnings": 0,
    "artifact_present": true,
    "artifact_score_100": 100.0,
    "task_score": 20.0,
    "task_score_max": 20,
    "quality_score": 100.0,
    "quality_cap_reason": "",
    "quality_class": "clean"
  },
  {
    "suite": "publish",
    "model": "deepseek",
    "model_slug": "deepseek",
    "source_kind": "clean-final",
    "label": "skill-with-shell-deepseek-publication-final",
    "eval": "code-review",
    "artifact_path": "results/publish/models/deepseek/artifacts/code-review.html",
    "screenshot_desktop_path": "results/publish/models/deepseek/reports/screenshots/code-review-desktop.png",
    "screenshot_mobile_path": "results/publish/models/deepseek/reports/screenshots/code-review-mobile.png",
    "screenshot_deep_path": "results/publish/models/deepseek/reports/screenshots/code-review-deep.png",
    "screenshot_mobile_deep_path": "results/publish/models/deepseek/reports/screenshots/code-review-mobile-deep.png",
    "artifact_bytes": 62789,
    "generation_ok": true,
    "generation_duration_s": 294.1,
    "input_tokens": 784186,
    "output_tokens": 14634,
    "total_tokens": 798820,
    "billing_tokens": 798820,
    "reasoning_tokens": 0,
    "tool_use_tokens": 0,
    "cache_read_tokens": 0,
    "cache_write_tokens": 0,
    "cache_hit_tokens": 749440,
    "total_cache_tokens": 749440,
    "effective_input_tokens": 34746,
    "display_input_tokens": 784186,
    "usage_event_count": 26,
    "tool_calls": 30,
    "turn_count": 26,
    "self_check_attempted": true,
    "self_check_ran": true,
    "self_check_succeeded": true,
    "self_check_runs": 3,
    "self_check_failed_runs": 1,
    "self_check_successful_runs": 2,
    "self_correction_edits": 0,
    "self_corrected_after_checker": false,
    "self_correction_verified": true,
    "assistant_turns_trace": 26,
    "self_check_mode": "checker-shell-reference,run-checker-cli",
    "self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-deepseek-publication-final/code-review | shell referenced checker: cd /home/shaun/source/birch-html && head -30 skill/scripts/check_birch_renderings.py | grep -A5 \"add_argument\" | shell referenced checker: cd /home/shaun/source/birch-html && grep -n \"artifact\\|--artifact\" skill/scripts/check_birch_renderings.py | head -10 | ran checker CLI: cd /home/shaun/source/birch-html && rm -f skill/reports/birch-rendering-check.json skill/reports/birch-rendering-check.md && uv run --with pillow python skill/scripts/check_birch_r | shell referenced checker: cd /home/shaun/source/birch-html && grep -n \"ROOT\\s*=\" skill/scripts/check_birch_renderings.py | head -3 | ran checker CLI: cd /home/shaun/source/birch-html && rm -f skill/reports/birch-rendering-check-code-review.json && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /ho",
    "deterministic_failures": 0,
    "deterministic_warnings": 0,
    "vlm_failures": 0,
    "vlm_warnings": 0,
    "deterministic_failure_units": 0,
    "deterministic_warning_units": 0,
    "vlm_failure_units": 0,
    "vlm_warning_units": 0,
    "desktop_failures": 0,
    "desktop_warnings": 0,
    "mobile_failures": 0,
    "mobile_warnings": 0,
    "deep_failures": 0,
    "deep_warnings": 0,
    "mobile_deep_failures": 0,
    "mobile_deep_warnings": 0,
    "artifact_present": true,
    "artifact_score_100": 100.0,
    "task_score": 20.0,
    "task_score_max": 20,
    "quality_score": 100.0,
    "quality_cap_reason": "",
    "quality_class": "clean"
  },
  {
    "suite": "publish",
    "model": "deepseek",
    "model_slug": "deepseek",
    "source_kind": "clean-final",
    "label": "skill-with-shell-deepseek-publication-final",
    "eval": "module-explainer",
    "artifact_path": "results/publish/models/deepseek/artifacts/module-explainer.html",
    "screenshot_desktop_path": "results/publish/models/deepseek/reports/screenshots/module-explainer-desktop.png",
    "screenshot_mobile_path": "results/publish/models/deepseek/reports/screenshots/module-explainer-mobile.png",
    "screenshot_deep_path": "results/publish/models/deepseek/reports/screenshots/module-explainer-deep.png",
    "screenshot_mobile_deep_path": "results/publish/models/deepseek/reports/screenshots/module-explainer-mobile-deep.png",
    "artifact_bytes": 31473,
    "generation_ok": false,
    "generation_duration_s": 177.334,
    "input_tokens": 215656,
    "output_tokens": 9938,
    "total_tokens": 225594,
    "billing_tokens": 225594,
    "reasoning_tokens": 0,
    "tool_use_tokens": 0,
    "cache_read_tokens": 0,
    "cache_write_tokens": 0,
    "cache_hit_tokens": 449920,
    "total_cache_tokens": 449920,
    "effective_input_tokens": 48511,
    "display_input_tokens": 498431,
    "usage_event_count": 10,
    "tool_calls": 10,
    "turn_count": 6,
    "self_check_attempted": true,
    "self_check_ran": true,
    "self_check_succeeded": true,
    "self_check_runs": 2,
    "self_check_failed_runs": 1,
    "self_check_successful_runs": 1,
    "self_correction_edits": 0,
    "self_corrected_after_checker": false,
    "self_correction_verified": true,
    "assistant_turns_trace": 10,
    "self_check_mode": "read-checker",
    "self_check_evidence": "read /home/shaun/source/birch-html/scripts/check_birch_renderings.py",
    "deterministic_failures": 8,
    "deterministic_warnings": 1,
    "vlm_failures": 7,
    "vlm_warnings": 0,
    "deterministic_failure_units": 3,
    "deterministic_warning_units": 1,
    "vlm_failure_units": 2,
    "vlm_warning_units": 0,
    "desktop_failures": 1,
    "desktop_warnings": 1,
    "mobile_failures": 3,
    "mobile_warnings": 0,
    "deep_failures": 1,
    "deep_warnings": 0,
    "mobile_deep_failures": 3,
    "mobile_deep_warnings": 0,
    "artifact_present": true,
    "artifact_score_100": 20.0,
    "task_score": 4.0,
    "task_score_max": 20,
    "quality_score": 20.0,
    "quality_cap_reason": "missing_birch_css_and_visibly_unstyled",
    "quality_class": "fail"
  },
  {
    "suite": "publish",
    "model": "deepseek",
    "model_slug": "deepseek",
    "source_kind": "clean-final",
    "label": "skill-with-shell-deepseek-publication-final",
    "eval": "implementation-plan",
    "artifact_path": "results/publish/models/deepseek/artifacts/implementation-plan.html",
    "screenshot_desktop_path": "results/publish/models/deepseek/reports/screenshots/implementation-plan-desktop.png",
    "screenshot_mobile_path": "results/publish/models/deepseek/reports/screenshots/implementation-plan-mobile.png",
    "screenshot_deep_path": "results/publish/models/deepseek/reports/screenshots/implementation-plan-deep.png",
    "screenshot_mobile_deep_path": "results/publish/models/deepseek/reports/screenshots/implementation-plan-mobile-deep.png",
    "artifact_bytes": 52099,
    "generation_ok": true,
    "generation_duration_s": 112.544,
    "input_tokens": 173739,
    "output_tokens": 6911,
    "total_tokens": 180650,
    "billing_tokens": 180650,
    "reasoning_tokens": 0,
    "tool_use_tokens": 0,
    "cache_read_tokens": 0,
    "cache_write_tokens": 0,
    "cache_hit_tokens": 160128,
    "total_cache_tokens": 160128,
    "effective_input_tokens": 13611,
    "display_input_tokens": 173739,
    "usage_event_count": 12,
    "tool_calls": 15,
    "turn_count": 12,
    "self_check_attempted": true,
    "self_check_ran": true,
    "self_check_succeeded": true,
    "self_check_runs": 1,
    "self_check_failed_runs": 0,
    "self_check_successful_runs": 1,
    "self_correction_edits": 0,
    "self_corrected_after_checker": false,
    "self_correction_verified": false,
    "assistant_turns_trace": 12,
    "self_check_mode": "run-checker-cli",
    "self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-deepseek-publication-final/implementat",
    "deterministic_failures": 0,
    "deterministic_warnings": 0,
    "vlm_failures": 0,
    "vlm_warnings": 0,
    "deterministic_failure_units": 0,
    "deterministic_warning_units": 0,
    "vlm_failure_units": 0,
    "vlm_warning_units": 0,
    "desktop_failures": 0,
    "desktop_warnings": 0,
    "mobile_failures": 0,
    "mobile_warnings": 0,
    "deep_failures": 0,
    "deep_warnings": 0,
    "mobile_deep_failures": 0,
    "mobile_deep_warnings": 0,
    "artifact_present": true,
    "artifact_score_100": 100.0,
    "task_score": 20.0,
    "task_score_max": 20,
    "quality_score": 100.0,
    "quality_cap_reason": "",
    "quality_class": "clean"
  },
  {
    "suite": "publish",
    "model": "deepseek",
    "model_slug": "deepseek",
    "source_kind": "clean-final",
    "label": "skill-with-shell-deepseek-publication-final",
    "eval": "benchmark-comparison",
    "artifact_path": "results/publish/models/deepseek/artifacts/benchmark-comparison.html",
    "screenshot_desktop_path": "results/publish/models/deepseek/reports/screenshots/benchmark-comparison-desktop.png",
    "screenshot_mobile_path": "results/publish/models/deepseek/reports/screenshots/benchmark-comparison-mobile.png",
    "screenshot_deep_path": "results/publish/models/deepseek/reports/screenshots/benchmark-comparison-deep.png",
    "screenshot_mobile_deep_path": "results/publish/models/deepseek/reports/screenshots/benchmark-comparison-mobile-deep.png",
    "artifact_bytes": 78962,
    "generation_ok": true,
    "generation_duration_s": 378.136,
    "input_tokens": 767427,
    "output_tokens": 27984,
    "total_tokens": 795411,
    "billing_tokens": 795411,
    "reasoning_tokens": 0,
    "tool_use_tokens": 0,
    "cache_read_tokens": 0,
    "cache_write_tokens": 0,
    "cache_hit_tokens": 717696,
    "total_cache_tokens": 717696,
    "effective_input_tokens": 49731,
    "display_input_tokens": 767427,
    "usage_event_count": 18,
    "tool_calls": 22,
    "turn_count": 18,
    "self_check_attempted": true,
    "self_check_ran": false,
    "self_check_succeeded": false,
    "self_check_runs": 0,
    "self_check_failed_runs": 0,
    "self_check_successful_runs": 0,
    "self_correction_edits": 0,
    "self_corrected_after_checker": false,
    "self_correction_verified": false,
    "assistant_turns_trace": 18,
    "self_check_mode": "checker-shell-reference",
    "self_check_evidence": "shell referenced checker: cd /home/shaun/source/birch-html && ls skill/scripts/check_birch_renderings.py 2>&1 && echo \"---\" && head -5 eval-runs/skill-with-shell-deepseek-publication-final/benchmark-compari",
    "deterministic_failures": 0,
    "deterministic_warnings": 0,
    "vlm_failures": 0,
    "vlm_warnings": 0,
    "deterministic_failure_units": 0,
    "deterministic_warning_units": 0,
    "vlm_failure_units": 0,
    "vlm_warning_units": 0,
    "desktop_failures": 0,
    "desktop_warnings": 0,
    "mobile_failures": 0,
    "mobile_warnings": 0,
    "deep_failures": 0,
    "deep_warnings": 0,
    "mobile_deep_failures": 0,
    "mobile_deep_warnings": 0,
    "artifact_present": true,
    "artifact_score_100": 100.0,
    "task_score": 20.0,
    "task_score_max": 20,
    "quality_score": 100.0,
    "quality_cap_reason": "",
    "quality_class": "clean"
  },
  {
    "suite": "publish",
    "model": "gemini35flash",
    "model_slug": "gemini35flash",
    "source_kind": "clean-final",
    "label": "skill-with-shell-gemini35flash-publication-final",
    "eval": "numeric-data",
    "artifact_path": "results/publish/models/gemini35flash/artifacts/numeric-data.html",
    "screenshot_desktop_path": "results/publish/models/gemini35flash/reports/screenshots/numeric-data-desktop.png",
    "screenshot_mobile_path": "results/publish/models/gemini35flash/reports/screenshots/numeric-data-mobile.png",
    "screenshot_deep_path": "results/publish/models/gemini35flash/reports/screenshots/numeric-data-deep.png",
    "screenshot_mobile_deep_path": "results/publish/models/gemini35flash/reports/screenshots/numeric-data-mobile-deep.png",
    "artifact_bytes": 53215,
    "generation_ok": true,
    "generation_duration_s": 114.216,
    "input_tokens": 1371616,
    "output_tokens": 5260,
    "total_tokens": 1376876,
    "billing_tokens": 1376876,
    "reasoning_tokens": 12418,
    "tool_use_tokens": 0,
    "cache_read_tokens": 0,
    "cache_write_tokens": 0,
    "cache_hit_tokens": 1116684,
    "total_cache_tokens": 1116684,
    "effective_input_tokens": 254932,
    "display_input_tokens": 1371616,
    "usage_event_count": 29,
    "tool_calls": 28,
    "turn_count": 29,
    "self_check_attempted": true,
    "self_check_ran": true,
    "self_check_succeeded": true,
    "self_check_runs": 2,
    "self_check_failed_runs": 1,
    "self_check_successful_runs": 1,
    "self_correction_edits": 0,
    "self_corrected_after_checker": false,
    "self_correction_verified": true,
    "assistant_turns_trace": 29,
    "self_check_mode": "run-checker-cli",
    "self_check_evidence": "ran checker CLI: uv run --with pillow python3 skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-gemini35flash-publication-final/numeric-data.html | ran checker CLI: uv run --with pillow python3 skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-gemini35flash-publication-final/numeric-dat",
    "deterministic_failures": 0,
    "deterministic_warnings": 0,
    "vlm_failures": 0,
    "vlm_warnings": 0,
    "deterministic_failure_units": 0,
    "deterministic_warning_units": 0,
    "vlm_failure_units": 0,
    "vlm_warning_units": 0,
    "desktop_failures": 0,
    "desktop_warnings": 0,
    "mobile_failures": 0,
    "mobile_warnings": 0,
    "deep_failures": 0,
    "deep_warnings": 0,
    "mobile_deep_failures": 0,
    "mobile_deep_warnings": 0,
    "artifact_present": true,
    "artifact_score_100": 100.0,
    "task_score": 20.0,
    "task_score_max": 20,
    "quality_score": 100.0,
    "quality_cap_reason": "",
    "quality_class": "clean"
  },
  {
    "suite": "publish",
    "model": "gemini35flash",
    "model_slug": "gemini35flash",
    "source_kind": "clean-final",
    "label": "skill-with-shell-gemini35flash-publication-final",
    "eval": "code-review",
    "artifact_path": "results/publish/models/gemini35flash/artifacts/code-review.html",
    "screenshot_desktop_path": "results/publish/models/gemini35flash/reports/screenshots/code-review-desktop.png",
    "screenshot_mobile_path": "results/publish/models/gemini35flash/reports/screenshots/code-review-mobile.png",
    "screenshot_deep_path": "results/publish/models/gemini35flash/reports/screenshots/code-review-deep.png",
    "screenshot_mobile_deep_path": "results/publish/models/gemini35flash/reports/screenshots/code-review-mobile-deep.png",
    "artifact_bytes": 53047,
    "generation_ok": true,
    "generation_duration_s": 193.238,
    "input_tokens": 1684136,
    "output_tokens": 6902,
    "total_tokens": 1691038,
    "billing_tokens": 1691038,
    "reasoning_tokens": 23273,
    "tool_use_tokens": 0,
    "cache_read_tokens": 0,
    "cache_write_tokens": 0,
    "cache_hit_tokens": 1424691,
    "total_cache_tokens": 1424691,
    "effective_input_tokens": 259445,
    "display_input_tokens": 1684136,
    "usage_event_count": 34,
    "tool_calls": 33,
    "turn_count": 34,
    "self_check_attempted": true,
    "self_check_ran": true,
    "self_check_succeeded": true,
    "self_check_runs": 3,
    "self_check_failed_runs": 1,
    "self_check_successful_runs": 2,
    "self_correction_edits": 0,
    "self_corrected_after_checker": false,
    "self_correction_verified": true,
    "assistant_turns_trace": 34,
    "self_check_mode": "checker-cli-error,run-checker-cli",
    "self_check_evidence": "ran checker CLI: python3 /home/shaun/source/birch-html/skill/scripts/check_birch_renderings.py --help | checker CLI usage error | ran checker CLI: python3 /home/shaun/source/birch-html/skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-gemini35flash-publication-final/co | ran checker CLI: python3 /home/shaun/source/birch-html/skill/scripts/check_birch_renderings.py --no-capture --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-gemini35flash-publica",
    "deterministic_failures": 0,
    "deterministic_warnings": 0,
    "vlm_failures": 0,
    "vlm_warnings": 0,
    "deterministic_failure_units": 0,
    "deterministic_warning_units": 0,
    "vlm_failure_units": 0,
    "vlm_warning_units": 0,
    "desktop_failures": 0,
    "desktop_warnings": 0,
    "mobile_failures": 0,
    "mobile_warnings": 0,
    "deep_failures": 0,
    "deep_warnings": 0,
    "mobile_deep_failures": 0,
    "mobile_deep_warnings": 0,
    "artifact_present": true,
    "artifact_score_100": 100.0,
    "task_score": 20.0,
    "task_score_max": 20,
    "quality_score": 100.0,
    "quality_cap_reason": "",
    "quality_class": "clean"
  },
  {
    "suite": "publish",
    "model": "gemini35flash",
    "model_slug": "gemini35flash",
    "source_kind": "clean-final",
    "label": "skill-with-shell-gemini35flash-publication-final",
    "eval": "module-explainer",
    "artifact_path": "results/publish/models/gemini35flash/artifacts/module-explainer.html",
    "screenshot_desktop_path": "results/publish/models/gemini35flash/reports/screenshots/module-explainer-desktop.png",
    "screenshot_mobile_path": "results/publish/models/gemini35flash/reports/screenshots/module-explainer-mobile.png",
    "screenshot_deep_path": "results/publish/models/gemini35flash/reports/screenshots/module-explainer-deep.png",
    "screenshot_mobile_deep_path": "results/publish/models/gemini35flash/reports/screenshots/module-explainer-mobile-deep.png",
    "artifact_bytes": 57420,
    "generation_ok": true,
    "generation_duration_s": 203.178,
    "input_tokens": 2196880,
    "output_tokens": 10222,
    "total_tokens": 2207102,
    "billing_tokens": 2207102,
    "reasoning_tokens": 22501,
    "tool_use_tokens": 0,
    "cache_read_tokens": 0,
    "cache_write_tokens": 0,
    "cache_hit_tokens": 1965131,
    "total_cache_tokens": 1965131,
    "effective_input_tokens": 231749,
    "display_input_tokens": 2196880,
    "usage_event_count": 33,
    "tool_calls": 32,
    "turn_count": 33,
    "self_check_attempted": true,
    "self_check_ran": true,
    "self_check_succeeded": false,
    "self_check_runs": 2,
    "self_check_failed_runs": 2,
    "self_check_successful_runs": 0,
    "self_correction_edits": 0,
    "self_corrected_after_checker": false,
    "self_correction_verified": false,
    "assistant_turns_trace": 33,
    "self_check_mode": "read-checker,run-checker-cli",
    "self_check_evidence": "read scripts/check_birch_renderings.py | ran checker CLI: python3 scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-gemini35flash-publication-final/module-explainer.html",
    "deterministic_failures": 0,
    "deterministic_warnings": 0,
    "vlm_failures": 0,
    "vlm_warnings": 0,
    "deterministic_failure_units": 0,
    "deterministic_warning_units": 0,
    "vlm_failure_units": 0,
    "vlm_warning_units": 0,
    "desktop_failures": 0,
    "desktop_warnings": 0,
    "mobile_failures": 0,
    "mobile_warnings": 0,
    "deep_failures": 0,
    "deep_warnings": 0,
    "mobile_deep_failures": 0,
    "mobile_deep_warnings": 0,
    "artifact_present": true,
    "artifact_score_100": 100.0,
    "task_score": 20.0,
    "task_score_max": 20,
    "quality_score": 100.0,
    "quality_cap_reason": "",
    "quality_class": "clean"
  },
  {
    "suite": "publish",
    "model": "gemini35flash",
    "model_slug": "gemini35flash",
    "source_kind": "clean-final",
    "label": "skill-with-shell-gemini35flash-publication-final",
    "eval": "implementation-plan",
    "artifact_path": "results/publish/models/gemini35flash/artifacts/implementation-plan.html",
    "screenshot_desktop_path": "results/publish/models/gemini35flash/reports/screenshots/implementation-plan-desktop.png",
    "screenshot_mobile_path": "results/publish/models/gemini35flash/reports/screenshots/implementation-plan-mobile.png",
    "screenshot_deep_path": "results/publish/models/gemini35flash/reports/screenshots/implementation-plan-deep.png",
    "screenshot_mobile_deep_path": "results/publish/models/gemini35flash/reports/screenshots/implementation-plan-mobile-deep.png",
    "artifact_bytes": 49628,
    "generation_ok": true,
    "generation_duration_s": 201.715,
    "input_tokens": 2346900,
    "output_tokens": 9173,
    "total_tokens": 2356073,
    "billing_tokens": 2356073,
    "reasoning_tokens": 15150,
    "tool_use_tokens": 0,
    "cache_read_tokens": 0,
    "cache_write_tokens": 0,
    "cache_hit_tokens": 2043078,
    "total_cache_tokens": 2043078,
    "effective_input_tokens": 303822,
    "display_input_tokens": 2346900,
    "usage_event_count": 34,
    "tool_calls": 33,
    "turn_count": 34,
    "self_check_attempted": true,
    "self_check_ran": true,
    "self_check_succeeded": true,
    "self_check_runs": 5,
    "self_check_failed_runs": 4,
    "self_check_successful_runs": 1,
    "self_correction_edits": 0,
    "self_corrected_after_checker": false,
    "self_correction_verified": false,
    "assistant_turns_trace": 34,
    "self_check_mode": "checker-cli-error,run-checker-cli",
    "self_check_evidence": "ran checker CLI: python3 skill/scripts/check_birch_renderings.py --help | checker CLI usage error | ran checker CLI: python3 skill/scripts/check_birch_renderings.py --artifact temp_plan.html | ran checker CLI: python3 skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/temp_plan.html | ran checker CLI: python3 skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-gemini35flash-publication-final/implementation-plan.html",
    "deterministic_failures": 0,
    "deterministic_warnings": 0,
    "vlm_failures": 0,
    "vlm_warnings": 0,
    "deterministic_failure_units": 0,
    "deterministic_warning_units": 0,
    "vlm_failure_units": 0,
    "vlm_warning_units": 0,
    "desktop_failures": 0,
    "desktop_warnings": 0,
    "mobile_failures": 0,
    "mobile_warnings": 0,
    "deep_failures": 0,
    "deep_warnings": 0,
    "mobile_deep_failures": 0,
    "mobile_deep_warnings": 0,
    "artifact_present": true,
    "artifact_score_100": 100.0,
    "task_score": 20.0,
    "task_score_max": 20,
    "quality_score": 100.0,
    "quality_cap_reason": "",
    "quality_class": "clean"
  },
  {
    "suite": "publish",
    "model": "gemini35flash",
    "model_slug": "gemini35flash",
    "source_kind": "clean-final",
    "label": "skill-with-shell-gemini35flash-publication-final",
    "eval": "benchmark-comparison",
    "artifact_path": "results/publish/models/gemini35flash/artifacts/benchmark-comparison.html",
    "screenshot_desktop_path": "results/publish/models/gemini35flash/reports/screenshots/benchmark-comparison-desktop.png",
    "screenshot_mobile_path": "results/publish/models/gemini35flash/reports/screenshots/benchmark-comparison-mobile.png",
    "screenshot_deep_path": "results/publish/models/gemini35flash/reports/screenshots/benchmark-comparison-deep.png",
    "screenshot_mobile_deep_path": "results/publish/models/gemini35flash/reports/screenshots/benchmark-comparison-mobile-deep.png",
    "artifact_bytes": 97390,
    "generation_ok": true,
    "generation_duration_s": 62.077,
    "input_tokens": 495825,
    "output_tokens": 829,
    "total_tokens": 496654,
    "billing_tokens": 496654,
    "reasoning_tokens": 4961,
    "tool_use_tokens": 0,
    "cache_read_tokens": 0,
    "cache_write_tokens": 0,
    "cache_hit_tokens": 387138,
    "total_cache_tokens": 387138,
    "effective_input_tokens": 108687,
    "display_input_tokens": 495825,
    "usage_event_count": 17,
    "tool_calls": 16,
    "turn_count": 17,
    "self_check_attempted": true,
    "self_check_ran": true,
    "self_check_succeeded": false,
    "self_check_runs": 1,
    "self_check_failed_runs": 1,
    "self_check_successful_runs": 0,
    "self_correction_edits": 0,
    "self_corrected_after_checker": false,
    "self_correction_verified": false,
    "assistant_turns_trace": 17,
    "self_check_mode": "run-checker-cli",
    "self_check_evidence": "ran checker CLI: python3 /home/shaun/source/birch-html/skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-gemini35flash-publication-final/be",
    "deterministic_failures": 0,
    "deterministic_warnings": 0,
    "vlm_failures": 0,
    "vlm_warnings": 0,
    "deterministic_failure_units": 0,
    "deterministic_warning_units": 0,
    "vlm_failure_units": 0,
    "vlm_warning_units": 0,
    "desktop_failures": 0,
    "desktop_warnings": 0,
    "mobile_failures": 0,
    "mobile_warnings": 0,
    "deep_failures": 0,
    "deep_warnings": 0,
    "mobile_deep_failures": 0,
    "mobile_deep_warnings": 0,
    "artifact_present": true,
    "artifact_score_100": 100.0,
    "task_score": 20.0,
    "task_score_max": 20,
    "quality_score": 100.0,
    "quality_cap_reason": "",
    "quality_class": "clean"
  },
  {
    "suite": "publish",
    "model": "glm51",
    "model_slug": "glm51",
    "source_kind": "clean-final",
    "label": "skill-with-shell-glm51-publication-final",
    "eval": "numeric-data",
    "artifact_path": "results/publish/models/glm51/artifacts/numeric-data.html",
    "screenshot_desktop_path": "results/publish/models/glm51/reports/screenshots/numeric-data-desktop.png",
    "screenshot_mobile_path": "results/publish/models/glm51/reports/screenshots/numeric-data-mobile.png",
    "screenshot_deep_path": "results/publish/models/glm51/reports/screenshots/numeric-data-deep.png",
    "screenshot_mobile_deep_path": "results/publish/models/glm51/reports/screenshots/numeric-data-mobile-deep.png",
    "artifact_bytes": 62971,
    "generation_ok": true,
    "generation_duration_s": 300.114,
    "input_tokens": 459899,
    "output_tokens": 16275,
    "total_tokens": 476174,
    "billing_tokens": 476174,
    "reasoning_tokens": 0,
    "tool_use_tokens": 0,
    "cache_read_tokens": 0,
    "cache_write_tokens": 0,
    "cache_hit_tokens": 369152,
    "total_cache_tokens": 369152,
    "effective_input_tokens": 90747,
    "display_input_tokens": 459899,
    "usage_event_count": 15,
    "tool_calls": 16,
    "turn_count": 15,
    "self_check_attempted": true,
    "self_check_ran": true,
    "self_check_succeeded": false,
    "self_check_runs": 1,
    "self_check_failed_runs": 1,
    "self_check_successful_runs": 0,
    "self_correction_edits": 0,
    "self_corrected_after_checker": false,
    "self_correction_verified": false,
    "assistant_turns_trace": 15,
    "self_check_mode": "run-checker-cli",
    "self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-glm51-publication-final/numeric-data.h",
    "deterministic_failures": 0,
    "deterministic_warnings": 0,
    "vlm_failures": 0,
    "vlm_warnings": 2,
    "deterministic_failure_units": 0,
    "deterministic_warning_units": 0,
    "vlm_failure_units": 0,
    "vlm_warning_units": 1,
    "desktop_failures": 0,
    "desktop_warnings": 0,
    "mobile_failures": 0,
    "mobile_warnings": 0,
    "deep_failures": 0,
    "deep_warnings": 0,
    "mobile_deep_failures": 0,
    "mobile_deep_warnings": 0,
    "artifact_present": true,
    "artifact_score_100": 99,
    "task_score": 19.8,
    "task_score_max": 20,
    "quality_score": 99,
    "quality_cap_reason": "",
    "quality_class": "warn"
  },
  {
    "suite": "publish",
    "model": "glm51",
    "model_slug": "glm51",
    "source_kind": "clean-final",
    "label": "skill-with-shell-glm51-publication-final",
    "eval": "code-review",
    "artifact_path": "results/publish/models/glm51/artifacts/code-review.html",
    "screenshot_desktop_path": "results/publish/models/glm51/reports/screenshots/code-review-desktop.png",
    "screenshot_mobile_path": "results/publish/models/glm51/reports/screenshots/code-review-mobile.png",
    "screenshot_deep_path": "results/publish/models/glm51/reports/screenshots/code-review-deep.png",
    "screenshot_mobile_deep_path": "results/publish/models/glm51/reports/screenshots/code-review-mobile-deep.png",
    "artifact_bytes": 48933,
    "generation_ok": true,
    "generation_duration_s": 133.324,
    "input_tokens": 254816,
    "output_tokens": 8008,
    "total_tokens": 262824,
    "billing_tokens": 262824,
    "reasoning_tokens": 0,
    "tool_use_tokens": 0,
    "cache_read_tokens": 0,
    "cache_write_tokens": 0,
    "cache_hit_tokens": 202560,
    "total_cache_tokens": 202560,
    "effective_input_tokens": 52256,
    "display_input_tokens": 254816,
    "usage_event_count": 11,
    "tool_calls": 13,
    "turn_count": 11,
    "self_check_attempted": true,
    "self_check_ran": true,
    "self_check_succeeded": true,
    "self_check_runs": 1,
    "self_check_failed_runs": 0,
    "self_check_successful_runs": 1,
    "self_correction_edits": 0,
    "self_corrected_after_checker": false,
    "self_correction_verified": false,
    "assistant_turns_trace": 11,
    "self_check_mode": "run-checker-cli",
    "self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-glm51-publication-final/code-review.ht",
    "deterministic_failures": 0,
    "deterministic_warnings": 0,
    "vlm_failures": 2,
    "vlm_warnings": 0,
    "deterministic_failure_units": 0,
    "deterministic_warning_units": 0,
    "vlm_failure_units": 1,
    "vlm_warning_units": 0,
    "desktop_failures": 0,
    "desktop_warnings": 0,
    "mobile_failures": 0,
    "mobile_warnings": 0,
    "deep_failures": 0,
    "deep_warnings": 0,
    "mobile_deep_failures": 0,
    "mobile_deep_warnings": 0,
    "artifact_present": true,
    "artifact_score_100": 92,
    "task_score": 18.4,
    "task_score_max": 20,
    "quality_score": 92,
    "quality_cap_reason": "",
    "quality_class": "fail"
  },
  {
    "suite": "publish",
    "model": "glm51",
    "model_slug": "glm51",
    "source_kind": "clean-final",
    "label": "skill-with-shell-glm51-publication-final",
    "eval": "module-explainer",
    "artifact_path": "results/publish/models/glm51/artifacts/module-explainer.html",
    "screenshot_desktop_path": "results/publish/models/glm51/reports/screenshots/module-explainer-desktop.png",
    "screenshot_mobile_path": "results/publish/models/glm51/reports/screenshots/module-explainer-mobile.png",
    "screenshot_deep_path": "results/publish/models/glm51/reports/screenshots/module-explainer-deep.png",
    "screenshot_mobile_deep_path": "results/publish/models/glm51/reports/screenshots/module-explainer-mobile-deep.png",
    "artifact_bytes": 54229,
    "generation_ok": true,
    "generation_duration_s": 94.822,
    "input_tokens": 358438,
    "output_tokens": 6652,
    "total_tokens": 365090,
    "billing_tokens": 365090,
    "reasoning_tokens": 0,
    "tool_use_tokens": 0,
    "cache_read_tokens": 0,
    "cache_write_tokens": 0,
    "cache_hit_tokens": 254656,
    "total_cache_tokens": 254656,
    "effective_input_tokens": 103782,
    "display_input_tokens": 358438,
    "usage_event_count": 9,
    "tool_calls": 15,
    "turn_count": 9,
    "self_check_attempted": true,
    "self_check_ran": true,
    "self_check_succeeded": true,
    "self_check_runs": 1,
    "self_check_failed_runs": 0,
    "self_check_successful_runs": 1,
    "self_correction_edits": 0,
    "self_corrected_after_checker": false,
    "self_correction_verified": false,
    "assistant_turns_trace": 9,
    "self_check_mode": "read-checker,run-checker-cli",
    "self_check_evidence": "read /home/shaun/source/birch-html/scripts/check_birch_renderings.py | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-glm51-publication-final/module-explainer.htm",
    "deterministic_failures": 0,
    "deterministic_warnings": 0,
    "vlm_failures": 0,
    "vlm_warnings": 0,
    "deterministic_failure_units": 0,
    "deterministic_warning_units": 0,
    "vlm_failure_units": 0,
    "vlm_warning_units": 0,
    "desktop_failures": 0,
    "desktop_warnings": 0,
    "mobile_failures": 0,
    "mobile_warnings": 0,
    "deep_failures": 0,
    "deep_warnings": 0,
    "mobile_deep_failures": 0,
    "mobile_deep_warnings": 0,
    "artifact_present": true,
    "artifact_score_100": 100.0,
    "task_score": 20.0,
    "task_score_max": 20,
    "quality_score": 100.0,
    "quality_cap_reason": "",
    "quality_class": "clean"
  },
  {
    "suite": "publish",
    "model": "glm51",
    "model_slug": "glm51",
    "source_kind": "clean-final",
    "label": "skill-with-shell-glm51-publication-final",
    "eval": "implementation-plan",
    "artifact_path": "results/publish/models/glm51/artifacts/implementation-plan.html",
    "screenshot_desktop_path": "results/publish/models/glm51/reports/screenshots/implementation-plan-desktop.png",
    "screenshot_mobile_path": "results/publish/models/glm51/reports/screenshots/implementation-plan-mobile.png",
    "screenshot_deep_path": "results/publish/models/glm51/reports/screenshots/implementation-plan-deep.png",
    "screenshot_mobile_deep_path": "results/publish/models/glm51/reports/screenshots/implementation-plan-mobile-deep.png",
    "artifact_bytes": 60535,
    "generation_ok": true,
    "generation_duration_s": 90.03,
    "input_tokens": 210191,
    "output_tokens": 7574,
    "total_tokens": 217765,
    "billing_tokens": 217765,
    "reasoning_tokens": 0,
    "tool_use_tokens": 0,
    "cache_read_tokens": 0,
    "cache_write_tokens": 0,
    "cache_hit_tokens": 180736,
    "total_cache_tokens": 180736,
    "effective_input_tokens": 29455,
    "display_input_tokens": 210191,
    "usage_event_count": 15,
    "tool_calls": 16,
    "turn_count": 15,
    "self_check_attempted": true,
    "self_check_ran": true,
    "self_check_succeeded": true,
    "self_check_runs": 2,
    "self_check_failed_runs": 0,
    "self_check_successful_runs": 2,
    "self_correction_edits": 0,
    "self_corrected_after_checker": false,
    "self_correction_verified": false,
    "assistant_turns_trace": 15,
    "self_check_mode": "run-checker-cli",
    "self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-glm51-publication-final/implementation",
    "deterministic_failures": 2,
    "deterministic_warnings": 0,
    "vlm_failures": 0,
    "vlm_warnings": 2,
    "deterministic_failure_units": 1,
    "deterministic_warning_units": 0,
    "vlm_failure_units": 0,
    "vlm_warning_units": 1,
    "desktop_failures": 0,
    "desktop_warnings": 0,
    "mobile_failures": 1,
    "mobile_warnings": 0,
    "deep_failures": 0,
    "deep_warnings": 0,
    "mobile_deep_failures": 1,
    "mobile_deep_warnings": 0,
    "artifact_present": true,
    "artifact_score_100": 93,
    "task_score": 18.6,
    "task_score_max": 20,
    "quality_score": 93,
    "quality_cap_reason": "",
    "quality_class": "fail"
  },
  {
    "suite": "publish",
    "model": "glm51",
    "model_slug": "glm51",
    "source_kind": "clean-final",
    "label": "skill-with-shell-glm51-publication-final",
    "eval": "benchmark-comparison",
    "artifact_path": "results/publish/models/glm51/artifacts/benchmark-comparison.html",
    "screenshot_desktop_path": "results/publish/models/glm51/reports/screenshots/benchmark-comparison-desktop.png",
    "screenshot_mobile_path": "results/publish/models/glm51/reports/screenshots/benchmark-comparison-mobile.png",
    "screenshot_deep_path": "results/publish/models/glm51/reports/screenshots/benchmark-comparison-deep.png",
    "screenshot_mobile_deep_path": "results/publish/models/glm51/reports/screenshots/benchmark-comparison-mobile-deep.png",
    "artifact_bytes": 64863,
    "generation_ok": true,
    "generation_duration_s": 149.159,
    "input_tokens": 274201,
    "output_tokens": 14416,
    "total_tokens": 288617,
    "billing_tokens": 288617,
    "reasoning_tokens": 0,
    "tool_use_tokens": 0,
    "cache_read_tokens": 0,
    "cache_write_tokens": 0,
    "cache_hit_tokens": 214336,
    "total_cache_tokens": 214336,
    "effective_input_tokens": 59865,
    "display_input_tokens": 274201,
    "usage_event_count": 12,
    "tool_calls": 14,
    "turn_count": 12,
    "self_check_attempted": true,
    "self_check_ran": true,
    "self_check_succeeded": true,
    "self_check_runs": 1,
    "self_check_failed_runs": 0,
    "self_check_successful_runs": 1,
    "self_correction_edits": 0,
    "self_corrected_after_checker": false,
    "self_correction_verified": false,
    "assistant_turns_trace": 12,
    "self_check_mode": "run-checker-cli",
    "self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-glm51-publication-final/benchmark-comp",
    "deterministic_failures": 0,
    "deterministic_warnings": 0,
    "vlm_failures": 0,
    "vlm_warnings": 0,
    "deterministic_failure_units": 0,
    "deterministic_warning_units": 0,
    "vlm_failure_units": 0,
    "vlm_warning_units": 0,
    "desktop_failures": 0,
    "desktop_warnings": 0,
    "mobile_failures": 0,
    "mobile_warnings": 0,
    "deep_failures": 0,
    "deep_warnings": 0,
    "mobile_deep_failures": 0,
    "mobile_deep_warnings": 0,
    "artifact_present": true,
    "artifact_score_100": 100.0,
    "task_score": 20.0,
    "task_score_max": 20,
    "quality_score": 100.0,
    "quality_cap_reason": "",
    "quality_class": "clean"
  },
  {
    "suite": "publish",
    "model": "glm52",
    "model_slug": "glm52",
    "source_kind": "clean-final",
    "label": "skill-with-shell-glm52-publication-final",
    "eval": "numeric-data",
    "artifact_path": "results/publish/models/glm52/artifacts/numeric-data.html",
    "screenshot_desktop_path": "results/publish/models/glm52/reports/screenshots/numeric-data-desktop.png",
    "screenshot_mobile_path": "results/publish/models/glm52/reports/screenshots/numeric-data-mobile.png",
    "screenshot_deep_path": "results/publish/models/glm52/reports/screenshots/numeric-data-deep.png",
    "screenshot_mobile_deep_path": "results/publish/models/glm52/reports/screenshots/numeric-data-mobile-deep.png",
    "artifact_bytes": 51395,
    "generation_ok": true,
    "generation_duration_s": 274.73,
    "input_tokens": 271862,
    "output_tokens": 16133,
    "total_tokens": 287995,
    "billing_tokens": 287995,
    "reasoning_tokens": 0,
    "tool_use_tokens": 0,
    "cache_read_tokens": 0,
    "cache_write_tokens": 0,
    "cache_hit_tokens": 239936,
    "total_cache_tokens": 239936,
    "effective_input_tokens": 31926,
    "display_input_tokens": 271862,
    "usage_event_count": 14,
    "tool_calls": 17,
    "turn_count": 14,
    "self_check_attempted": true,
    "self_check_ran": true,
    "self_check_succeeded": true,
    "self_check_runs": 1,
    "self_check_failed_runs": 0,
    "self_check_successful_runs": 1,
    "self_correction_edits": 0,
    "self_corrected_after_checker": false,
    "self_correction_verified": false,
    "assistant_turns_trace": 14,
    "self_check_mode": "run-checker-cli",
    "self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run skill/scripts/finish_birch_html.py eval-runs/skill-with-shell-glm52-publication-final/numeric-data.html && echo \"=== finished ===\" && uv  | ran checker CLI: cd /home/shaun/source/birch-html && F=eval-runs/skill-with-shell-glm52-publication-final/numeric-data.html\necho \"=== my local style block (after system block) ===\"\npython3 - \"$F\" <",
    "deterministic_failures": 0,
    "deterministic_warnings": 0,
    "vlm_failures": 0,
    "vlm_warnings": 0,
    "deterministic_failure_units": 0,
    "deterministic_warning_units": 0,
    "vlm_failure_units": 0,
    "vlm_warning_units": 0,
    "desktop_failures": 0,
    "desktop_warnings": 0,
    "mobile_failures": 0,
    "mobile_warnings": 0,
    "deep_failures": 0,
    "deep_warnings": 0,
    "mobile_deep_failures": 0,
    "mobile_deep_warnings": 0,
    "artifact_present": true,
    "artifact_score_100": 100.0,
    "task_score": 20.0,
    "task_score_max": 20,
    "quality_score": 100.0,
    "quality_cap_reason": "",
    "quality_class": "clean"
  },
  {
    "suite": "publish",
    "model": "glm52",
    "model_slug": "glm52",
    "source_kind": "clean-final",
    "label": "skill-with-shell-glm52-publication-final",
    "eval": "code-review",
    "artifact_path": "results/publish/models/glm52/artifacts/code-review.html",
    "screenshot_desktop_path": "results/publish/models/glm52/reports/screenshots/code-review-desktop.png",
    "screenshot_mobile_path": "results/publish/models/glm52/reports/screenshots/code-review-mobile.png",
    "screenshot_deep_path": "results/publish/models/glm52/reports/screenshots/code-review-deep.png",
    "screenshot_mobile_deep_path": "results/publish/models/glm52/reports/screenshots/code-review-mobile-deep.png",
    "artifact_bytes": 50845,
    "generation_ok": true,
    "generation_duration_s": 771.097,
    "input_tokens": 1249523,
    "output_tokens": 43260,
    "total_tokens": 1292783,
    "billing_tokens": 1292783,
    "reasoning_tokens": 0,
    "tool_use_tokens": 0,
    "cache_read_tokens": 0,
    "cache_write_tokens": 0,
    "cache_hit_tokens": 1177792,
    "total_cache_tokens": 1177792,
    "effective_input_tokens": 71731,
    "display_input_tokens": 1249523,
    "usage_event_count": 25,
    "tool_calls": 32,
    "turn_count": 25,
    "self_check_attempted": true,
    "self_check_ran": true,
    "self_check_succeeded": true,
    "self_check_runs": 3,
    "self_check_failed_runs": 0,
    "self_check_successful_runs": 3,
    "self_correction_edits": 0,
    "self_corrected_after_checker": false,
    "self_correction_verified": false,
    "assistant_turns_trace": 25,
    "self_check_mode": "checker-cli-error,run-checker-cli",
    "self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-glm52-publication-final/code-review.ht | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact \"$(pwd)/eval-runs/skill-with-shell-glm52-publication-final/code-r | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --help 2>&1 | rg -i 'viewport|artifact|mobile|width' | head; echo \"=== run m | checker CLI usage error",
    "deterministic_failures": 0,
    "deterministic_warnings": 0,
    "vlm_failures": 0,
    "vlm_warnings": 0,
    "deterministic_failure_units": 0,
    "deterministic_warning_units": 0,
    "vlm_failure_units": 0,
    "vlm_warning_units": 0,
    "desktop_failures": 0,
    "desktop_warnings": 0,
    "mobile_failures": 0,
    "mobile_warnings": 0,
    "deep_failures": 0,
    "deep_warnings": 0,
    "mobile_deep_failures": 0,
    "mobile_deep_warnings": 0,
    "artifact_present": true,
    "artifact_score_100": 100.0,
    "task_score": 20.0,
    "task_score_max": 20,
    "quality_score": 100.0,
    "quality_cap_reason": "",
    "quality_class": "clean"
  },
  {
    "suite": "publish",
    "model": "glm52",
    "model_slug": "glm52",
    "source_kind": "clean-final",
    "label": "skill-with-shell-glm52-publication-final",
    "eval": "module-explainer",
    "artifact_path": "results/publish/models/glm52/artifacts/module-explainer.html",
    "screenshot_desktop_path": "results/publish/models/glm52/reports/screenshots/module-explainer-desktop.png",
    "screenshot_mobile_path": "results/publish/models/glm52/reports/screenshots/module-explainer-mobile.png",
    "screenshot_deep_path": "results/publish/models/glm52/reports/screenshots/module-explainer-deep.png",
    "screenshot_mobile_deep_path": "results/publish/models/glm52/reports/screenshots/module-explainer-mobile-deep.png",
    "artifact_bytes": 59952,
    "generation_ok": true,
    "generation_duration_s": 751.331,
    "input_tokens": 1204327,
    "output_tokens": 40435,
    "total_tokens": 1244762,
    "billing_tokens": 1244762,
    "reasoning_tokens": 0,
    "tool_use_tokens": 0,
    "cache_read_tokens": 0,
    "cache_write_tokens": 0,
    "cache_hit_tokens": 1118592,
    "total_cache_tokens": 1118592,
    "effective_input_tokens": 85735,
    "display_input_tokens": 1204327,
    "usage_event_count": 22,
    "tool_calls": 32,
    "turn_count": 22,
    "self_check_attempted": true,
    "self_check_ran": true,
    "self_check_succeeded": true,
    "self_check_runs": 2,
    "self_check_failed_runs": 0,
    "self_check_successful_runs": 2,
    "self_correction_edits": 0,
    "self_corrected_after_checker": false,
    "self_correction_verified": false,
    "assistant_turns_trace": 22,
    "self_check_mode": "read-checker,run-checker-cli",
    "self_check_evidence": "read /home/shaun/source/birch-html/scripts/check_birch_renderings.py | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-glm52-publication-final/module-explainer.htm",
    "deterministic_failures": 0,
    "deterministic_warnings": 0,
    "vlm_failures": 0,
    "vlm_warnings": 0,
    "deterministic_failure_units": 0,
    "deterministic_warning_units": 0,
    "vlm_failure_units": 0,
    "vlm_warning_units": 0,
    "desktop_failures": 0,
    "desktop_warnings": 0,
    "mobile_failures": 0,
    "mobile_warnings": 0,
    "deep_failures": 0,
    "deep_warnings": 0,
    "mobile_deep_failures": 0,
    "mobile_deep_warnings": 0,
    "artifact_present": true,
    "artifact_score_100": 100.0,
    "task_score": 20.0,
    "task_score_max": 20,
    "quality_score": 100.0,
    "quality_cap_reason": "",
    "quality_class": "clean"
  },
  {
    "suite": "publish",
    "model": "glm52",
    "model_slug": "glm52",
    "source_kind": "clean-final",
    "label": "skill-with-shell-glm52-publication-final",
    "eval": "implementation-plan",
    "artifact_path": "results/publish/models/glm52/artifacts/implementation-plan.html",
    "screenshot_desktop_path": "results/publish/models/glm52/reports/screenshots/implementation-plan-desktop.png",
    "screenshot_mobile_path": "results/publish/models/glm52/reports/screenshots/implementation-plan-mobile.png",
    "screenshot_deep_path": "results/publish/models/glm52/reports/screenshots/implementation-plan-deep.png",
    "screenshot_mobile_deep_path": "results/publish/models/glm52/reports/screenshots/implementation-plan-mobile-deep.png",
    "artifact_bytes": 56320,
    "generation_ok": true,
    "generation_duration_s": 456.209,
    "input_tokens": 991570,
    "output_tokens": 24123,
    "total_tokens": 1015693,
    "billing_tokens": 1015693,
    "reasoning_tokens": 0,
    "tool_use_tokens": 0,
    "cache_read_tokens": 0,
    "cache_write_tokens": 0,
    "cache_hit_tokens": 911168,
    "total_cache_tokens": 911168,
    "effective_input_tokens": 80402,
    "display_input_tokens": 991570,
    "usage_event_count": 18,
    "tool_calls": 26,
    "turn_count": 18,
    "self_check_attempted": true,
    "self_check_ran": true,
    "self_check_succeeded": true,
    "self_check_runs": 3,
    "self_check_failed_runs": 0,
    "self_check_successful_runs": 3,
    "self_correction_edits": 0,
    "self_corrected_after_checker": false,
    "self_correction_verified": false,
    "assistant_turns_trace": 18,
    "self_check_mode": "read-checker,run-checker-cli",
    "self_check_evidence": "read /home/shaun/source/birch-html/skill/scripts/check_birch_renderings.py | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-glm52-publication-final/implementation | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-glm52-pu",
    "deterministic_failures": 0,
    "deterministic_warnings": 0,
    "vlm_failures": 0,
    "vlm_warnings": 0,
    "deterministic_failure_units": 0,
    "deterministic_warning_units": 0,
    "vlm_failure_units": 0,
    "vlm_warning_units": 0,
    "desktop_failures": 0,
    "desktop_warnings": 0,
    "mobile_failures": 0,
    "mobile_warnings": 0,
    "deep_failures": 0,
    "deep_warnings": 0,
    "mobile_deep_failures": 0,
    "mobile_deep_warnings": 0,
    "artifact_present": true,
    "artifact_score_100": 100.0,
    "task_score": 20.0,
    "task_score_max": 20,
    "quality_score": 100.0,
    "quality_cap_reason": "",
    "quality_class": "clean"
  },
  {
    "suite": "publish",
    "model": "glm52",
    "model_slug": "glm52",
    "source_kind": "clean-final",
    "label": "skill-with-shell-glm52-publication-final",
    "eval": "benchmark-comparison",
    "artifact_path": "results/publish/models/glm52/artifacts/benchmark-comparison.html",
    "screenshot_desktop_path": "results/publish/models/glm52/reports/screenshots/benchmark-comparison-desktop.png",
    "screenshot_mobile_path": "results/publish/models/glm52/reports/screenshots/benchmark-comparison-mobile.png",
    "screenshot_deep_path": "results/publish/models/glm52/reports/screenshots/benchmark-comparison-deep.png",
    "screenshot_mobile_deep_path": "results/publish/models/glm52/reports/screenshots/benchmark-comparison-mobile-deep.png",
    "artifact_bytes": 60487,
    "generation_ok": true,
    "generation_duration_s": 380.184,
    "input_tokens": 522022,
    "output_tokens": 23534,
    "total_tokens": 545556,
    "billing_tokens": 545556,
    "reasoning_tokens": 0,
    "tool_use_tokens": 0,
    "cache_read_tokens": 0,
    "cache_write_tokens": 0,
    "cache_hit_tokens": 459648,
    "total_cache_tokens": 459648,
    "effective_input_tokens": 62374,
    "display_input_tokens": 522022,
    "usage_event_count": 16,
    "tool_calls": 19,
    "turn_count": 16,
    "self_check_attempted": true,
    "self_check_ran": true,
    "self_check_succeeded": true,
    "self_check_runs": 2,
    "self_check_failed_runs": 0,
    "self_check_successful_runs": 2,
    "self_correction_edits": 0,
    "self_corrected_after_checker": false,
    "self_correction_verified": false,
    "assistant_turns_trace": 16,
    "self_check_mode": "run-checker-cli",
    "self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-glm52-publication-final/benchmark-comp | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact \"$(pwd)/eval-runs/skill-with-shell-glm52-publication-final/benchm",
    "deterministic_failures": 0,
    "deterministic_warnings": 0,
    "vlm_failures": 0,
    "vlm_warnings": 0,
    "deterministic_failure_units": 0,
    "deterministic_warning_units": 0,
    "vlm_failure_units": 0,
    "vlm_warning_units": 0,
    "desktop_failures": 0,
    "desktop_warnings": 0,
    "mobile_failures": 0,
    "mobile_warnings": 0,
    "deep_failures": 0,
    "deep_warnings": 0,
    "mobile_deep_failures": 0,
    "mobile_deep_warnings": 0,
    "artifact_present": true,
    "artifact_score_100": 100.0,
    "task_score": 20.0,
    "task_score_max": 20,
    "quality_score": 100.0,
    "quality_cap_reason": "",
    "quality_class": "clean"
  },
  {
    "suite": "publish",
    "model": "gpt-5.3-codex",
    "model_slug": "gpt-5-3-codex",
    "source_kind": "clean-final",
    "label": "skill-with-shell-gpt-5-3-codex-publication-final",
    "eval": "numeric-data",
    "artifact_path": "results/publish/models/gpt-5-3-codex/artifacts/numeric-data.html",
    "screenshot_desktop_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/numeric-data-desktop.png",
    "screenshot_mobile_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/numeric-data-mobile.png",
    "screenshot_deep_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/numeric-data-deep.png",
    "screenshot_mobile_deep_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/numeric-data-mobile-deep.png",
    "artifact_bytes": 40305,
    "generation_ok": true,
    "generation_duration_s": 63.372,
    "input_tokens": 91503,
    "output_tokens": 5097,
    "total_tokens": 96600,
    "billing_tokens": 96600,
    "reasoning_tokens": 1083,
    "tool_use_tokens": 0,
    "cache_read_tokens": 0,
    "cache_write_tokens": 0,
    "cache_hit_tokens": 76800,
    "total_cache_tokens": 76800,
    "effective_input_tokens": 14703,
    "display_input_tokens": 91503,
    "usage_event_count": 8,
    "tool_calls": 11,
    "turn_count": 8,
    "self_check_attempted": false,
    "self_check_ran": false,
    "self_check_succeeded": false,
    "self_check_runs": 0,
    "self_check_failed_runs": 0,
    "self_check_successful_runs": 0,
    "self_correction_edits": 0,
    "self_corrected_after_checker": false,
    "self_correction_verified": false,
    "assistant_turns_trace": 8,
    "self_check_mode": "",
    "self_check_evidence": "",
    "deterministic_failures": 2,
    "deterministic_warnings": 2,
    "vlm_failures": 0,
    "vlm_warnings": 0,
    "deterministic_failure_units": 1,
    "deterministic_warning_units": 1,
    "vlm_failure_units": 0,
    "vlm_warning_units": 0,
    "desktop_failures": 0,
    "desktop_warnings": 1,
    "mobile_failures": 1,
    "mobile_warnings": 0,
    "deep_failures": 0,
    "deep_warnings": 1,
    "mobile_deep_failures": 1,
    "mobile_deep_warnings": 0,
    "artifact_present": true,
    "artifact_score_100": 93,
    "task_score": 18.6,
    "task_score_max": 20,
    "quality_score": 93,
    "quality_cap_reason": "",
    "quality_class": "fail"
  },
  {
    "suite": "publish",
    "model": "gpt-5.3-codex",
    "model_slug": "gpt-5-3-codex",
    "source_kind": "clean-final",
    "label": "skill-with-shell-gpt-5-3-codex-publication-final",
    "eval": "code-review",
    "artifact_path": "results/publish/models/gpt-5-3-codex/artifacts/code-review.html",
    "screenshot_desktop_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/code-review-desktop.png",
    "screenshot_mobile_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/code-review-mobile.png",
    "screenshot_deep_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/code-review-deep.png",
    "screenshot_mobile_deep_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/code-review-mobile-deep.png",
    "artifact_bytes": 39494,
    "generation_ok": true,
    "generation_duration_s": 94.334,
    "input_tokens": 461816,
    "output_tokens": 6027,
    "total_tokens": 467843,
    "billing_tokens": 467843,
    "reasoning_tokens": 2855,
    "tool_use_tokens": 0,
    "cache_read_tokens": 0,
    "cache_write_tokens": 0,
    "cache_hit_tokens": 384640,
    "total_cache_tokens": 384640,
    "effective_input_tokens": 77176,
    "display_input_tokens": 461816,
    "usage_event_count": 17,
    "tool_calls": 18,
    "turn_count": 17,
    "self_check_attempted": true,
    "self_check_ran": true,
    "self_check_succeeded": false,
    "self_check_runs": 1,
    "self_check_failed_runs": 1,
    "self_check_successful_runs": 0,
    "self_correction_edits": 0,
    "self_corrected_after_checker": false,
    "self_correction_verified": false,
    "assistant_turns_trace": 17,
    "self_check_mode": "run-checker-cli",
    "self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-gpt-5-3-codex-publication-final/code-r",
    "deterministic_failures": 0,
    "deterministic_warnings": 0,
    "vlm_failures": 0,
    "vlm_warnings": 0,
    "deterministic_failure_units": 0,
    "deterministic_warning_units": 0,
    "vlm_failure_units": 0,
    "vlm_warning_units": 0,
    "desktop_failures": 0,
    "desktop_warnings": 0,
    "mobile_failures": 0,
    "mobile_warnings": 0,
    "deep_failures": 0,
    "deep_warnings": 0,
    "mobile_deep_failures": 0,
    "mobile_deep_warnings": 0,
    "artifact_present": true,
    "artifact_score_100": 100.0,
    "task_score": 20.0,
    "task_score_max": 20,
    "quality_score": 100.0,
    "quality_cap_reason": "",
    "quality_class": "clean"
  },
  {
    "suite": "publish",
    "model": "gpt-5.3-codex",
    "model_slug": "gpt-5-3-codex",
    "source_kind": "clean-final",
    "label": "skill-with-shell-gpt-5-3-codex-publication-final",
    "eval": "module-explainer",
    "artifact_path": "results/publish/models/gpt-5-3-codex/artifacts/module-explainer.html",
    "screenshot_desktop_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/module-explainer-desktop.png",
    "screenshot_mobile_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/module-explainer-mobile.png",
    "screenshot_deep_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/module-explainer-deep.png",
    "screenshot_mobile_deep_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/module-explainer-mobile-deep.png",
    "artifact_bytes": 46290,
    "generation_ok": true,
    "generation_duration_s": 93.641,
    "input_tokens": 555669,
    "output_tokens": 7177,
    "total_tokens": 562846,
    "billing_tokens": 562846,
    "reasoning_tokens": 1701,
    "tool_use_tokens": 0,
    "cache_read_tokens": 0,
    "cache_write_tokens": 0,
    "cache_hit_tokens": 450304,
    "total_cache_tokens": 450304,
    "effective_input_tokens": 105365,
    "display_input_tokens": 555669,
    "usage_event_count": 17,
    "tool_calls": 23,
    "turn_count": 17,
    "self_check_attempted": true,
    "self_check_ran": true,
    "self_check_succeeded": true,
    "self_check_runs": 2,
    "self_check_failed_runs": 1,
    "self_check_successful_runs": 1,
    "self_correction_edits": 0,
    "self_corrected_after_checker": false,
    "self_correction_verified": true,
    "assistant_turns_trace": 17,
    "self_check_mode": "checker-cli-error,checker-shell-reference,read-checker,run-checker-cli",
    "self_check_evidence": "read /home/shaun/source/birch-html/scripts/check_birch_renderings.py | shell referenced checker: rg '^def ' -n /home/shaun/source/birch-html/scripts/check_birch_renderings.py | ran checker CLI: mkdir -p /home/shaun/source/birch-html/eval-runs/skill-with-shell-gpt-5-3-codex-publication-final && cat > /home/shaun/source/birch-html/eval-runs/skill-with-shell-gpt-5-3-codex-pu | ran checker CLI: uv run --with pillow python /home/shaun/source/birch-html/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-gpt-5-3-codex-publication-final/module-explainer.h | checker CLI usage error",
    "deterministic_failures": 0,
    "deterministic_warnings": 0,
    "vlm_failures": 1,
    "vlm_warnings": 1,
    "deterministic_failure_units": 0,
    "deterministic_warning_units": 0,
    "vlm_failure_units": 1,
    "vlm_warning_units": 1,
    "desktop_failures": 0,
    "desktop_warnings": 0,
    "mobile_failures": 0,
    "mobile_warnings": 0,
    "deep_failures": 0,
    "deep_warnings": 0,
    "mobile_deep_failures": 0,
    "mobile_deep_warnings": 0,
    "artifact_present": true,
    "artifact_score_100": 91,
    "task_score": 18.2,
    "task_score_max": 20,
    "quality_score": 91,
    "quality_cap_reason": "",
    "quality_class": "fail"
  },
  {
    "suite": "publish",
    "model": "gpt-5.3-codex",
    "model_slug": "gpt-5-3-codex",
    "source_kind": "clean-final",
    "label": "skill-with-shell-gpt-5-3-codex-publication-final",
    "eval": "implementation-plan",
    "artifact_path": "results/publish/models/gpt-5-3-codex/artifacts/implementation-plan.html",
    "screenshot_desktop_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/implementation-plan-desktop.png",
    "screenshot_mobile_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/implementation-plan-mobile.png",
    "screenshot_deep_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/implementation-plan-deep.png",
    "screenshot_mobile_deep_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/implementation-plan-mobile-deep.png",
    "artifact_bytes": 45485,
    "generation_ok": true,
    "generation_duration_s": 59.362,
    "input_tokens": 90659,
    "output_tokens": 4766,
    "total_tokens": 95425,
    "billing_tokens": 95425,
    "reasoning_tokens": 589,
    "tool_use_tokens": 0,
    "cache_read_tokens": 0,
    "cache_write_tokens": 0,
    "cache_hit_tokens": 71168,
    "total_cache_tokens": 71168,
    "effective_input_tokens": 19491,
    "display_input_tokens": 90659,
    "usage_event_count": 9,
    "tool_calls": 10,
    "turn_count": 9,
    "self_check_attempted": true,
    "self_check_ran": true,
    "self_check_succeeded": true,
    "self_check_runs": 2,
    "self_check_failed_runs": 1,
    "self_check_successful_runs": 1,
    "self_correction_edits": 0,
    "self_corrected_after_checker": false,
    "self_correction_verified": true,
    "assistant_turns_trace": 9,
    "self_check_mode": "run-checker-cli",
    "self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-gpt-5-3-codex-publication-final/implem | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-gpt-5-3-",
    "deterministic_failures": 0,
    "deterministic_warnings": 0,
    "vlm_failures": 0,
    "vlm_warnings": 0,
    "deterministic_failure_units": 0,
    "deterministic_warning_units": 0,
    "vlm_failure_units": 0,
    "vlm_warning_units": 0,
    "desktop_failures": 0,
    "desktop_warnings": 0,
    "mobile_failures": 0,
    "mobile_warnings": 0,
    "deep_failures": 0,
    "deep_warnings": 0,
    "mobile_deep_failures": 0,
    "mobile_deep_warnings": 0,
    "artifact_present": true,
    "artifact_score_100": 100.0,
    "task_score": 20.0,
    "task_score_max": 20,
    "quality_score": 100.0,
    "quality_cap_reason": "",
    "quality_class": "clean"
  },
  {
    "suite": "publish",
    "model": "gpt-5.3-codex",
    "model_slug": "gpt-5-3-codex",
    "source_kind": "clean-final",
    "label": "skill-with-shell-gpt-5-3-codex-publication-final",
    "eval": "benchmark-comparison",
    "artifact_path": "results/publish/models/gpt-5-3-codex/artifacts/benchmark-comparison.html",
    "screenshot_desktop_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/benchmark-comparison-desktop.png",
    "screenshot_mobile_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/benchmark-comparison-mobile.png",
    "screenshot_deep_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/benchmark-comparison-deep.png",
    "screenshot_mobile_deep_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/benchmark-comparison-mobile-deep.png",
    "artifact_bytes": 46793,
    "generation_ok": true,
    "generation_duration_s": 61.812,
    "input_tokens": 60483,
    "output_tokens": 5615,
    "total_tokens": 66098,
    "billing_tokens": 66098,
    "reasoning_tokens": 746,
    "tool_use_tokens": 0,
    "cache_read_tokens": 0,
    "cache_write_tokens": 0,
    "cache_hit_tokens": 53376,
    "total_cache_tokens": 53376,
    "effective_input_tokens": 7107,
    "display_input_tokens": 60483,
    "usage_event_count": 7,
    "tool_calls": 8,
    "turn_count": 7,
    "self_check_attempted": false,
    "self_check_ran": false,
    "self_check_succeeded": false,
    "self_check_runs": 0,
    "self_check_failed_runs": 0,
    "self_check_successful_runs": 0,
    "self_correction_edits": 0,
    "self_corrected_after_checker": false,
    "self_correction_verified": false,
    "assistant_turns_trace": 7,
    "self_check_mode": "",
    "self_check_evidence": "",
    "deterministic_failures": 4,
    "deterministic_warnings": 0,
    "vlm_failures": 0,
    "vlm_warnings": 0,
    "deterministic_failure_units": 2,
    "deterministic_warning_units": 0,
    "vlm_failure_units": 0,
    "vlm_warning_units": 0,
    "desktop_failures": 2,
    "desktop_warnings": 0,
    "mobile_failures": 0,
    "mobile_warnings": 0,
    "deep_failures": 2,
    "deep_warnings": 0,
    "mobile_deep_failures": 0,
    "mobile_deep_warnings": 0,
    "artifact_present": true,
    "artifact_score_100": 88,
    "task_score": 17.6,
    "task_score_max": 20,
    "quality_score": 88,
    "quality_cap_reason": "",
    "quality_class": "fail"
  },
  {
    "suite": "publish",
    "model": "grok-4.3",
    "model_slug": "grok-4-3",
    "source_kind": "clean-final",
    "label": "skill-with-shell-grok-4-3-publication-final",
    "eval": "numeric-data",
    "artifact_path": "results/publish/models/grok-4-3/artifacts/numeric-data.html",
    "screenshot_desktop_path": "results/publish/models/grok-4-3/reports/screenshots/numeric-data-desktop.png",
    "screenshot_mobile_path": "results/publish/models/grok-4-3/reports/screenshots/numeric-data-mobile.png",
    "screenshot_deep_path": "results/publish/models/grok-4-3/reports/screenshots/numeric-data-deep.png",
    "screenshot_mobile_deep_path": "results/publish/models/grok-4-3/reports/screenshots/numeric-data-mobile-deep.png",
    "artifact_bytes": 36903,
    "generation_ok": true,
    "generation_duration_s": 49.028,
    "input_tokens": 73338,
    "output_tokens": 3307,
    "total_tokens": 76645,
    "billing_tokens": 76645,
    "reasoning_tokens": 925,
    "tool_use_tokens": 0,
    "cache_read_tokens": 0,
    "cache_write_tokens": 0,
    "cache_hit_tokens": 62720,
    "total_cache_tokens": 62720,
    "effective_input_tokens": 10618,
    "display_input_tokens": 73338,
    "usage_event_count": 10,
    "tool_calls": 9,
    "turn_count": 10,
    "self_check_attempted": false,
    "self_check_ran": false,
    "self_check_succeeded": false,
    "self_check_runs": 0,
    "self_check_failed_runs": 0,
    "self_check_successful_runs": 0,
    "self_correction_edits": 0,
    "self_corrected_after_checker": false,
    "self_correction_verified": false,
    "assistant_turns_trace": 10,
    "self_check_mode": "",
    "self_check_evidence": "",
    "deterministic_failures": 0,
    "deterministic_warnings": 0,
    "vlm_failures": 0,
    "vlm_warnings": 0,
    "deterministic_failure_units": 0,
    "deterministic_warning_units": 0,
    "vlm_failure_units": 0,
    "vlm_warning_units": 0,
    "desktop_failures": 0,
    "desktop_warnings": 0,
    "mobile_failures": 0,
    "mobile_warnings": 0,
    "deep_failures": 0,
    "deep_warnings": 0,
    "mobile_deep_failures": 0,
    "mobile_deep_warnings": 0,
    "artifact_present": true,
    "artifact_score_100": 100.0,
    "task_score": 20.0,
    "task_score_max": 20,
    "quality_score": 100.0,
    "quality_cap_reason": "",
    "quality_class": "clean"
  },
  {
    "suite": "publish",
    "model": "grok-4.3",
    "model_slug": "grok-4-3",
    "source_kind": "clean-final",
    "label": "skill-with-shell-grok-4-3-publication-final",
    "eval": "code-review",
    "artifact_path": "results/publish/models/grok-4-3/artifacts/code-review.html",
    "screenshot_desktop_path": "results/publish/models/grok-4-3/reports/screenshots/code-review-desktop.png",
    "screenshot_mobile_path": "results/publish/models/grok-4-3/reports/screenshots/code-review-mobile.png",
    "screenshot_deep_path": "results/publish/models/grok-4-3/reports/screenshots/code-review-deep.png",
    "screenshot_mobile_deep_path": "results/publish/models/grok-4-3/reports/screenshots/code-review-mobile-deep.png",
    "artifact_bytes": 38297,
    "generation_ok": true,
    "generation_duration_s": 55.392,
    "input_tokens": 190492,
    "output_tokens": 4553,
    "total_tokens": 195045,
    "billing_tokens": 195045,
    "reasoning_tokens": 2340,
    "tool_use_tokens": 0,
    "cache_read_tokens": 0,
    "cache_write_tokens": 0,
    "cache_hit_tokens": 147520,
    "total_cache_tokens": 147520,
    "effective_input_tokens": 42972,
    "display_input_tokens": 190492,
    "usage_event_count": 11,
    "tool_calls": 10,
    "turn_count": 11,
    "self_check_attempted": false,
    "self_check_ran": false,
    "self_check_succeeded": false,
    "self_check_runs": 0,
    "self_check_failed_runs": 0,
    "self_check_successful_runs": 0,
    "self_correction_edits": 0,
    "self_corrected_after_checker": false,
    "self_correction_verified": false,
    "assistant_turns_trace": 11,
    "self_check_mode": "",
    "self_check_evidence": "",
    "deterministic_failures": 0,
    "deterministic_warnings": 0,
    "vlm_failures": 0,
    "vlm_warnings": 0,
    "deterministic_failure_units": 0,
    "deterministic_warning_units": 0,
    "vlm_failure_units": 0,
    "vlm_warning_units": 0,
    "desktop_failures": 0,
    "desktop_warnings": 0,
    "mobile_failures": 0,
    "mobile_warnings": 0,
    "deep_failures": 0,
    "deep_warnings": 0,
    "mobile_deep_failures": 0,
    "mobile_deep_warnings": 0,
    "artifact_present": true,
    "artifact_score_100": 100.0,
    "task_score": 20.0,
    "task_score_max": 20,
    "quality_score": 100.0,
    "quality_cap_reason": "",
    "quality_class": "clean"
  },
  {
    "suite": "publish",
    "model": "grok-4.3",
    "model_slug": "grok-4-3",
    "source_kind": "clean-final",
    "label": "skill-with-shell-grok-4-3-publication-final",
    "eval": "module-explainer",
    "artifact_path": "results/publish/models/grok-4-3/artifacts/module-explainer.html",
    "screenshot_desktop_path": "results/publish/models/grok-4-3/reports/screenshots/module-explainer-desktop.png",
    "screenshot_mobile_path": "results/publish/models/grok-4-3/reports/screenshots/module-explainer-mobile.png",
    "screenshot_deep_path": "results/publish/models/grok-4-3/reports/screenshots/module-explainer-deep.png",
    "screenshot_mobile_deep_path": "results/publish/models/grok-4-3/reports/screenshots/module-explainer-mobile-deep.png",
    "artifact_bytes": 9279,
    "generation_ok": false,
    "generation_duration_s": 40.052,
    "input_tokens": 125766,
    "output_tokens": 3826,
    "total_tokens": 129592,
    "billing_tokens": 129592,
    "reasoning_tokens": 1202,
    "tool_use_tokens": 0,
    "cache_read_tokens": 0,
    "cache_write_tokens": 0,
    "cache_hit_tokens": 46784,
    "total_cache_tokens": 46784,
    "effective_input_tokens": 53433,
    "display_input_tokens": 100217,
    "usage_event_count": 15,
    "tool_calls": 6,
    "turn_count": 7,
    "self_check_attempted": true,
    "self_check_ran": false,
    "self_check_succeeded": false,
    "self_check_runs": 0,
    "self_check_failed_runs": 0,
    "self_check_successful_runs": 0,
    "self_correction_edits": 0,
    "self_corrected_after_checker": false,
    "self_correction_verified": false,
    "assistant_turns_trace": 15,
    "self_check_mode": "read-checker",
    "self_check_evidence": "read /home/shaun/source/birch-html/scripts/check_birch_renderings.py",
    "deterministic_failures": 8,
    "deterministic_warnings": 0,
    "vlm_failures": 3,
    "vlm_warnings": 0,
    "deterministic_failure_units": 2,
    "deterministic_warning_units": 0,
    "vlm_failure_units": 2,
    "vlm_warning_units": 0,
    "desktop_failures": 2,
    "desktop_warnings": 0,
    "mobile_failures": 2,
    "mobile_warnings": 0,
    "deep_failures": 2,
    "deep_warnings": 0,
    "mobile_deep_failures": 2,
    "mobile_deep_warnings": 0,
    "artifact_present": true,
    "artifact_score_100": 35.0,
    "task_score": 7.0,
    "task_score_max": 20,
    "quality_score": 35.0,
    "quality_cap_reason": "missing_birch_css",
    "quality_class": "fail"
  },
  {
    "suite": "publish",
    "model": "grok-4.3",
    "model_slug": "grok-4-3",
    "source_kind": "clean-final",
    "label": "skill-with-shell-grok-4-3-publication-final",
    "eval": "implementation-plan",
    "artifact_path": "results/publish/models/grok-4-3/artifacts/implementation-plan.html",
    "screenshot_desktop_path": "results/publish/models/grok-4-3/reports/screenshots/implementation-plan-desktop.png",
    "screenshot_mobile_path": "results/publish/models/grok-4-3/reports/screenshots/implementation-plan-mobile.png",
    "screenshot_deep_path": "results/publish/models/grok-4-3/reports/screenshots/implementation-plan-deep.png",
    "screenshot_mobile_deep_path": "results/publish/models/grok-4-3/reports/screenshots/implementation-plan-mobile-deep.png",
    "artifact_bytes": 16152,
    "generation_ok": false,
    "generation_duration_s": 41.596,
    "input_tokens": 32235,
    "output_tokens": 5236,
    "total_tokens": 37471,
    "billing_tokens": 37471,
    "reasoning_tokens": 1207,
    "tool_use_tokens": 0,
    "cache_read_tokens": 0,
    "cache_write_tokens": 0,
    "cache_hit_tokens": 39488,
    "total_cache_tokens": 39488,
    "effective_input_tokens": 20479,
    "display_input_tokens": 59967,
    "usage_event_count": 8,
    "tool_calls": 4,
    "turn_count": 5,
    "self_check_attempted": false,
    "self_check_ran": false,
    "self_check_succeeded": false,
    "self_check_runs": 0,
    "self_check_failed_runs": 0,
    "self_check_successful_runs": 0,
    "self_correction_edits": 0,
    "self_corrected_after_checker": false,
    "self_correction_verified": false,
    "assistant_turns_trace": 8,
    "self_check_mode": "",
    "self_check_evidence": "",
    "deterministic_failures": 4,
    "deterministic_warnings": 0,
    "vlm_failures": 4,
    "vlm_warnings": 0,
    "deterministic_failure_units": 1,
    "deterministic_warning_units": 0,
    "vlm_failure_units": 1,
    "vlm_warning_units": 0,
    "desktop_failures": 1,
    "desktop_warnings": 0,
    "mobile_failures": 1,
    "mobile_warnings": 0,
    "deep_failures": 1,
    "deep_warnings": 0,
    "mobile_deep_failures": 1,
    "mobile_deep_warnings": 0,
    "artifact_present": true,
    "artifact_score_100": 20.0,
    "task_score": 4.0,
    "task_score_max": 20,
    "quality_score": 20.0,
    "quality_cap_reason": "missing_birch_css_and_visibly_unstyled",
    "quality_class": "fail"
  },
  {
    "suite": "publish",
    "model": "grok-4.3",
    "model_slug": "grok-4-3",
    "source_kind": "clean-final",
    "label": "skill-with-shell-grok-4-3-publication-final",
    "eval": "benchmark-comparison",
    "artifact_path": "results/publish/models/grok-4-3/artifacts/benchmark-comparison.html",
    "screenshot_desktop_path": "results/publish/models/grok-4-3/reports/screenshots/benchmark-comparison-desktop.png",
    "screenshot_mobile_path": "results/publish/models/grok-4-3/reports/screenshots/benchmark-comparison-mobile.png",
    "screenshot_deep_path": "results/publish/models/grok-4-3/reports/screenshots/benchmark-comparison-deep.png",
    "screenshot_mobile_deep_path": "results/publish/models/grok-4-3/reports/screenshots/benchmark-comparison-mobile-deep.png",
    "artifact_bytes": 10364,
    "generation_ok": false,
    "generation_duration_s": 98.19,
    "input_tokens": 153411,
    "output_tokens": 7388,
    "total_tokens": 160799,
    "billing_tokens": 160799,
    "reasoning_tokens": 2517,
    "tool_use_tokens": 0,
    "cache_read_tokens": 0,
    "cache_write_tokens": 0,
    "cache_hit_tokens": 39488,
    "total_cache_tokens": 39488,
    "effective_input_tokens": 6645,
    "display_input_tokens": 46133,
    "usage_event_count": 8,
    "tool_calls": 15,
    "turn_count": 16,
    "self_check_attempted": false,
    "self_check_ran": false,
    "self_check_succeeded": false,
    "self_check_runs": 0,
    "self_check_failed_runs": 0,
    "self_check_successful_runs": 0,
    "self_correction_edits": 0,
    "self_corrected_after_checker": false,
    "self_correction_verified": false,
    "assistant_turns_trace": 8,
    "self_check_mode": "",
    "self_check_evidence": "",
    "deterministic_failures": 4,
    "deterministic_warnings": 0,
    "vlm_failures": 4,
    "vlm_warnings": 1,
    "deterministic_failure_units": 1,
    "deterministic_warning_units": 0,
    "vlm_failure_units": 1,
    "vlm_warning_units": 1,
    "desktop_failures": 1,
    "desktop_warnings": 0,
    "mobile_failures": 1,
    "mobile_warnings": 0,
    "deep_failures": 1,
    "deep_warnings": 0,
    "mobile_deep_failures": 1,
    "mobile_deep_warnings": 0,
    "artifact_present": true,
    "artifact_score_100": 35.0,
    "task_score": 7.0,
    "task_score_max": 20,
    "quality_score": 35.0,
    "quality_cap_reason": "missing_birch_css",
    "quality_class": "fail"
  },
  {
    "suite": "publish",
    "model": "haiku45",
    "model_slug": "haiku45",
    "source_kind": "clean-final",
    "label": "skill-with-shell-haiku45-publication-final",
    "eval": "numeric-data",
    "artifact_path": "results/publish/models/haiku45/artifacts/numeric-data.html",
    "screenshot_desktop_path": "results/publish/models/haiku45/reports/screenshots/numeric-data-desktop.png",
    "screenshot_mobile_path": "results/publish/models/haiku45/reports/screenshots/numeric-data-mobile.png",
    "screenshot_deep_path": "results/publish/models/haiku45/reports/screenshots/numeric-data-deep.png",
    "screenshot_mobile_deep_path": "results/publish/models/haiku45/reports/screenshots/numeric-data-mobile-deep.png",
    "artifact_bytes": 23937,
    "generation_ok": false,
    "generation_duration_s": 67.62,
    "input_tokens": 119520,
    "output_tokens": 7707,
    "total_tokens": 127227,
    "billing_tokens": 127227,
    "reasoning_tokens": 0,
    "tool_use_tokens": 0,
    "cache_read_tokens": 7297,
    "cache_write_tokens": 12081,
    "cache_hit_tokens": 0,
    "total_cache_tokens": 19378,
    "effective_input_tokens": 11280,
    "display_input_tokens": 30658,
    "usage_event_count": 4,
    "tool_calls": 9,
    "turn_count": 10,
    "self_check_attempted": false,
    "self_check_ran": false,
    "self_check_succeeded": false,
    "self_check_runs": 0,
    "self_check_failed_runs": 0,
    "self_check_successful_runs": 0,
    "self_correction_edits": 0,
    "self_corrected_after_checker": false,
    "self_correction_verified": false,
    "assistant_turns_trace": 4,
    "self_check_mode": "run-checker-cli",
    "self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-haiku45-publication-final/numeric-data",
    "deterministic_failures": 16,
    "deterministic_warnings": 12,
    "vlm_failures": 1,
    "vlm_warnings": 0,
    "deterministic_failure_units": 4,
    "deterministic_warning_units": 3,
    "vlm_failure_units": 1,
    "vlm_warning_units": 0,
    "desktop_failures": 4,
    "desktop_warnings": 3,
    "mobile_failures": 4,
    "mobile_warnings": 3,
    "deep_failures": 4,
    "deep_warnings": 3,
    "mobile_deep_failures": 4,
    "mobile_deep_warnings": 3,
    "artifact_present": true,
    "artifact_score_100": 35.0,
    "task_score": 7.0,
    "task_score_max": 20,
    "quality_score": 35.0,
    "quality_cap_reason": "missing_birch_css",
    "quality_class": "fail"
  },
  {
    "suite": "publish",
    "model": "haiku45",
    "model_slug": "haiku45",
    "source_kind": "clean-final",
    "label": "skill-with-shell-haiku45-publication-final",
    "eval": "code-review",
    "artifact_path": "results/publish/models/haiku45/artifacts/code-review.html",
    "screenshot_desktop_path": "results/publish/models/haiku45/reports/screenshots/code-review-desktop.png",
    "screenshot_mobile_path": "results/publish/models/haiku45/reports/screenshots/code-review-mobile.png",
    "screenshot_deep_path": "results/publish/models/haiku45/reports/screenshots/code-review-deep.png",
    "screenshot_mobile_deep_path": "results/publish/models/haiku45/reports/screenshots/code-review-mobile-deep.png",
    "artifact_bytes": 53526,
    "generation_ok": true,
    "generation_duration_s": 94.461,
    "input_tokens": 301467,
    "output_tokens": 10117,
    "total_tokens": 311584,
    "billing_tokens": 311584,
    "reasoning_tokens": 0,
    "tool_use_tokens": 0,
    "cache_read_tokens": 228528,
    "cache_write_tokens": 34499,
    "cache_hit_tokens": 0,
    "total_cache_tokens": 263027,
    "effective_input_tokens": 38440,
    "display_input_tokens": 301467,
    "usage_event_count": 11,
    "tool_calls": 11,
    "turn_count": 11,
    "self_check_attempted": true,
    "self_check_ran": true,
    "self_check_succeeded": true,
    "self_check_runs": 1,
    "self_check_failed_runs": 0,
    "self_check_successful_runs": 1,
    "self_correction_edits": 0,
    "self_corrected_after_checker": false,
    "self_correction_verified": false,
    "assistant_turns_trace": 11,
    "self_check_mode": "run-checker-cli",
    "self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-haiku45-",
    "deterministic_failures": 6,
    "deterministic_warnings": 0,
    "vlm_failures": 0,
    "vlm_warnings": 2,
    "deterministic_failure_units": 2,
    "deterministic_warning_units": 0,
    "vlm_failure_units": 0,
    "vlm_warning_units": 1,
    "desktop_failures": 1,
    "desktop_warnings": 0,
    "mobile_failures": 2,
    "mobile_warnings": 0,
    "deep_failures": 1,
    "deep_warnings": 0,
    "mobile_deep_failures": 2,
    "mobile_deep_warnings": 0,
    "artifact_present": true,
    "artifact_score_100": 87,
    "task_score": 17.4,
    "task_score_max": 20,
    "quality_score": 87,
    "quality_cap_reason": "",
    "quality_class": "fail"
  },
  {
    "suite": "publish",
    "model": "haiku45",
    "model_slug": "haiku45",
    "source_kind": "clean-final",
    "label": "skill-with-shell-haiku45-publication-final",
    "eval": "module-explainer",
    "artifact_path": "results/publish/models/haiku45/artifacts/module-explainer.html",
    "screenshot_desktop_path": "results/publish/models/haiku45/reports/screenshots/module-explainer-desktop.png",
    "screenshot_mobile_path": "results/publish/models/haiku45/reports/screenshots/module-explainer-mobile.png",
    "screenshot_deep_path": "results/publish/models/haiku45/reports/screenshots/module-explainer-deep.png",
    "screenshot_mobile_deep_path": "results/publish/models/haiku45/reports/screenshots/module-explainer-mobile-deep.png",
    "artifact_bytes": 57853,
    "generation_ok": false,
    "generation_duration_s": 75.42,
    "input_tokens": 211164,
    "output_tokens": 9407,
    "total_tokens": 220571,
    "billing_tokens": 220571,
    "reasoning_tokens": 0,
    "tool_use_tokens": 0,
    "cache_read_tokens": 0,
    "cache_write_tokens": 55031,
    "cache_hit_tokens": 0,
    "total_cache_tokens": 55031,
    "effective_input_tokens": 80985,
    "display_input_tokens": 136016,
    "usage_event_count": 3,
    "tool_calls": 10,
    "turn_count": 6,
    "self_check_attempted": true,
    "self_check_ran": false,
    "self_check_succeeded": false,
    "self_check_runs": 0,
    "self_check_failed_runs": 0,
    "self_check_successful_runs": 0,
    "self_correction_edits": 0,
    "self_corrected_after_checker": false,
    "self_correction_verified": false,
    "assistant_turns_trace": 3,
    "self_check_mode": "read-checker",
    "self_check_evidence": "read /home/shaun/source/birch-html/scripts/check_birch_renderings.py",
    "deterministic_failures": 0,
    "deterministic_warnings": 0,
    "vlm_failures": 0,
    "vlm_warnings": 0,
    "deterministic_failure_units": 0,
    "deterministic_warning_units": 0,
    "vlm_failure_units": 0,
    "vlm_warning_units": 0,
    "desktop_failures": 0,
    "desktop_warnings": 0,
    "mobile_failures": 0,
    "mobile_warnings": 0,
    "deep_failures": 0,
    "deep_warnings": 0,
    "mobile_deep_failures": 0,
    "mobile_deep_warnings": 0,
    "artifact_present": true,
    "artifact_score_100": 100.0,
    "task_score": 20.0,
    "task_score_max": 20,
    "quality_score": 100.0,
    "quality_cap_reason": "",
    "quality_class": "clean"
  },
  {
    "suite": "publish",
    "model": "haiku45",
    "model_slug": "haiku45",
    "source_kind": "clean-final",
    "label": "skill-with-shell-haiku45-publication-final",
    "eval": "implementation-plan",
    "artifact_path": "results/publish/models/haiku45/artifacts/implementation-plan.html",
    "screenshot_desktop_path": "results/publish/models/haiku45/reports/screenshots/implementation-plan-desktop.png",
    "screenshot_mobile_path": "results/publish/models/haiku45/reports/screenshots/implementation-plan-mobile.png",
    "screenshot_deep_path": "results/publish/models/haiku45/reports/screenshots/implementation-plan-deep.png",
    "screenshot_mobile_deep_path": "results/publish/models/haiku45/reports/screenshots/implementation-plan-mobile-deep.png",
    "artifact_bytes": 50641,
    "generation_ok": true,
    "generation_duration_s": 67.418,
    "input_tokens": 123711,
    "output_tokens": 7166,
    "total_tokens": 130877,
    "billing_tokens": 130877,
    "reasoning_tokens": 0,
    "tool_use_tokens": 0,
    "cache_read_tokens": 91600,
    "cache_write_tokens": 16126,
    "cache_hit_tokens": 0,
    "total_cache_tokens": 107726,
    "effective_input_tokens": 15985,
    "display_input_tokens": 123711,
    "usage_event_count": 9,
    "tool_calls": 9,
    "turn_count": 9,
    "self_check_attempted": true,
    "self_check_ran": true,
    "self_check_succeeded": true,
    "self_check_runs": 1,
    "self_check_failed_runs": 0,
    "self_check_successful_runs": 1,
    "self_correction_edits": 0,
    "self_corrected_after_checker": false,
    "self_correction_verified": false,
    "assistant_turns_trace": 9,
    "self_check_mode": "run-checker-cli",
    "self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-haiku45-",
    "deterministic_failures": 0,
    "deterministic_warnings": 0,
    "vlm_failures": 0,
    "vlm_warnings": 0,
    "deterministic_failure_units": 0,
    "deterministic_warning_units": 0,
    "vlm_failure_units": 0,
    "vlm_warning_units": 0,
    "desktop_failures": 0,
    "desktop_warnings": 0,
    "mobile_failures": 0,
    "mobile_warnings": 0,
    "deep_failures": 0,
    "deep_warnings": 0,
    "mobile_deep_failures": 0,
    "mobile_deep_warnings": 0,
    "artifact_present": true,
    "artifact_score_100": 100.0,
    "task_score": 20.0,
    "task_score_max": 20,
    "quality_score": 100.0,
    "quality_cap_reason": "",
    "quality_class": "clean"
  },
  {
    "suite": "publish",
    "model": "haiku45",
    "model_slug": "haiku45",
    "source_kind": "clean-final",
    "label": "skill-with-shell-haiku45-publication-final",
    "eval": "benchmark-comparison",
    "artifact_path": "results/publish/models/haiku45/artifacts/benchmark-comparison.html",
    "screenshot_desktop_path": "results/publish/models/haiku45/reports/screenshots/benchmark-comparison-desktop.png",
    "screenshot_mobile_path": "results/publish/models/haiku45/reports/screenshots/benchmark-comparison-mobile.png",
    "screenshot_deep_path": "results/publish/models/haiku45/reports/screenshots/benchmark-comparison-deep.png",
    "screenshot_mobile_deep_path": "results/publish/models/haiku45/reports/screenshots/benchmark-comparison-mobile-deep.png",
    "artifact_bytes": 49137,
    "generation_ok": true,
    "generation_duration_s": 65.28,
    "input_tokens": 151349,
    "output_tokens": 7796,
    "total_tokens": 159145,
    "billing_tokens": 159145,
    "reasoning_tokens": 0,
    "tool_use_tokens": 0,
    "cache_read_tokens": 122743,
    "cache_write_tokens": 12640,
    "cache_hit_tokens": 0,
    "total_cache_tokens": 135383,
    "effective_input_tokens": 15966,
    "display_input_tokens": 151349,
    "usage_event_count": 11,
    "tool_calls": 10,
    "turn_count": 11,
    "self_check_attempted": false,
    "self_check_ran": false,
    "self_check_succeeded": false,
    "self_check_runs": 0,
    "self_check_failed_runs": 0,
    "self_check_successful_runs": 0,
    "self_correction_edits": 0,
    "self_corrected_after_checker": false,
    "self_correction_verified": false,
    "assistant_turns_trace": 11,
    "self_check_mode": "",
    "self_check_evidence": "",
    "deterministic_failures": 4,
    "deterministic_warnings": 0,
    "vlm_failures": 0,
    "vlm_warnings": 3,
    "deterministic_failure_units": 1,
    "deterministic_warning_units": 0,
    "vlm_failure_units": 0,
    "vlm_warning_units": 1,
    "desktop_failures": 1,
    "desktop_warnings": 0,
    "mobile_failures": 1,
    "mobile_warnings": 0,
    "deep_failures": 1,
    "deep_warnings": 0,
    "mobile_deep_failures": 1,
    "mobile_deep_warnings": 0,
    "artifact_present": true,
    "artifact_score_100": 93,
    "task_score": 18.6,
    "task_score_max": 20,
    "quality_score": 93,
    "quality_cap_reason": "",
    "quality_class": "fail"
  },
  {
    "suite": "publish",
    "model": "kimi",
    "model_slug": "kimi",
    "source_kind": "clean-final",
    "label": "skill-with-shell-kimi-publication-final",
    "eval": "numeric-data",
    "artifact_path": "results/publish/models/kimi/artifacts/numeric-data.html",
    "screenshot_desktop_path": "results/publish/models/kimi/reports/screenshots/numeric-data-desktop.png",
    "screenshot_mobile_path": "results/publish/models/kimi/reports/screenshots/numeric-data-mobile.png",
    "screenshot_deep_path": "results/publish/models/kimi/reports/screenshots/numeric-data-deep.png",
    "screenshot_mobile_deep_path": "results/publish/models/kimi/reports/screenshots/numeric-data-mobile-deep.png",
    "artifact_bytes": 67620,
    "generation_ok": true,
    "generation_duration_s": 194.344,
    "input_tokens": 470039,
    "output_tokens": 5317,
    "total_tokens": 475356,
    "billing_tokens": 475356,
    "reasoning_tokens": 0,
    "tool_use_tokens": 0,
    "cache_read_tokens": 0,
    "cache_write_tokens": 0,
    "cache_hit_tokens": 425472,
    "total_cache_tokens": 425472,
    "effective_input_tokens": 44567,
    "display_input_tokens": 470039,
    "usage_event_count": 20,
    "tool_calls": 23,
    "turn_count": 20,
    "self_check_attempted": true,
    "self_check_ran": true,
    "self_check_succeeded": true,
    "self_check_runs": 3,
    "self_check_failed_runs": 1,
    "self_check_successful_runs": 2,
    "self_correction_edits": 0,
    "self_corrected_after_checker": false,
    "self_correction_verified": true,
    "assistant_turns_trace": 20,
    "self_check_mode": "run-checker-cli",
    "self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-kimi-publication-final/numeric-data.ht | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-kimi-pub",
    "deterministic_failures": 0,
    "deterministic_warnings": 0,
    "vlm_failures": 0,
    "vlm_warnings": 0,
    "deterministic_failure_units": 0,
    "deterministic_warning_units": 0,
    "vlm_failure_units": 0,
    "vlm_warning_units": 0,
    "desktop_failures": 0,
    "desktop_warnings": 0,
    "mobile_failures": 0,
    "mobile_warnings": 0,
    "deep_failures": 0,
    "deep_warnings": 0,
    "mobile_deep_failures": 0,
    "mobile_deep_warnings": 0,
    "artifact_present": true,
    "artifact_score_100": 100.0,
    "task_score": 20.0,
    "task_score_max": 20,
    "quality_score": 100.0,
    "quality_cap_reason": "",
    "quality_class": "clean"
  },
  {
    "suite": "publish",
    "model": "kimi",
    "model_slug": "kimi",
    "source_kind": "clean-final",
    "label": "skill-with-shell-kimi-publication-final",
    "eval": "code-review",
    "artifact_path": "results/publish/models/kimi/artifacts/code-review.html",
    "screenshot_desktop_path": "results/publish/models/kimi/reports/screenshots/code-review-desktop.png",
    "screenshot_mobile_path": "results/publish/models/kimi/reports/screenshots/code-review-mobile.png",
    "screenshot_deep_path": "results/publish/models/kimi/reports/screenshots/code-review-deep.png",
    "screenshot_mobile_deep_path": "results/publish/models/kimi/reports/screenshots/code-review-mobile-deep.png",
    "artifact_bytes": 44300,
    "generation_ok": true,
    "generation_duration_s": 627.536,
    "input_tokens": 1248543,
    "output_tokens": 24596,
    "total_tokens": 1273139,
    "billing_tokens": 1273139,
    "reasoning_tokens": 0,
    "tool_use_tokens": 0,
    "cache_read_tokens": 0,
    "cache_write_tokens": 0,
    "cache_hit_tokens": 1192448,
    "total_cache_tokens": 1192448,
    "effective_input_tokens": 56095,
    "display_input_tokens": 1248543,
    "usage_event_count": 33,
    "tool_calls": 36,
    "turn_count": 33,
    "self_check_attempted": true,
    "self_check_ran": true,
    "self_check_succeeded": true,
    "self_check_runs": 2,
    "self_check_failed_runs": 1,
    "self_check_successful_runs": 1,
    "self_correction_edits": 0,
    "self_corrected_after_checker": false,
    "self_correction_verified": true,
    "assistant_turns_trace": 33,
    "self_check_mode": "checker-shell-reference,read-checker,run-checker-cli",
    "self_check_evidence": "read /home/shaun/source/birch-html/skill/scripts/check_birch_renderings.py | shell referenced checker: grep -n \"CANDLE_CLASSES\\|BIRCH_CLASSES\\|LAYOUT_CLASSES\\|SEMANTIC_CLASSES\" /home/shaun/source/birch-html/skill/scripts/check_birch_renderings.py | head -20 | shell referenced checker: grep -n \"callout\" /home/shaun/source/birch-html/skill/scripts/check_birch_renderings.py | shell referenced checker: grep -n \"eyebrow\\|lede\\|muted\\|caption\\|subtle\\|note\\|entity\\|label-cell\" /home/shaun/source/birch-html/skill/scripts/check_birch_renderings.py | head -20 | shell referenced checker: grep -n \"code-block\" /home/shaun/source/birch-html/skill/scripts/check_birch_renderings.py | head -20 | shell referenced checker: grep -n \"data-tone\" /home/shaun/source/birch-html/skill/scripts/check_birch_renderings.py | head -20",
    "deterministic_failures": 0,
    "deterministic_warnings": 0,
    "vlm_failures": 0,
    "vlm_warnings": 0,
    "deterministic_failure_units": 0,
    "deterministic_warning_units": 0,
    "vlm_failure_units": 0,
    "vlm_warning_units": 0,
    "desktop_failures": 0,
    "desktop_warnings": 0,
    "mobile_failures": 0,
    "mobile_warnings": 0,
    "deep_failures": 0,
    "deep_warnings": 0,
    "mobile_deep_failures": 0,
    "mobile_deep_warnings": 0,
    "artifact_present": true,
    "artifact_score_100": 100.0,
    "task_score": 20.0,
    "task_score_max": 20,
    "quality_score": 100.0,
    "quality_cap_reason": "",
    "quality_class": "clean"
  },
  {
    "suite": "publish",
    "model": "kimi",
    "model_slug": "kimi",
    "source_kind": "clean-final",
    "label": "skill-with-shell-kimi-publication-final",
    "eval": "module-explainer",
    "artifact_path": "results/publish/models/kimi/artifacts/module-explainer.html",
    "screenshot_desktop_path": "results/publish/models/kimi/reports/screenshots/module-explainer-desktop.png",
    "screenshot_mobile_path": "results/publish/models/kimi/reports/screenshots/module-explainer-mobile.png",
    "screenshot_deep_path": "results/publish/models/kimi/reports/screenshots/module-explainer-deep.png",
    "screenshot_mobile_deep_path": "results/publish/models/kimi/reports/screenshots/module-explainer-mobile-deep.png",
    "artifact_bytes": 17730,
    "generation_ok": false,
    "generation_duration_s": 142.653,
    "input_tokens": 54919,
    "output_tokens": 5427,
    "total_tokens": 60346,
    "billing_tokens": 60346,
    "reasoning_tokens": 0,
    "tool_use_tokens": 0,
    "cache_read_tokens": 0,
    "cache_write_tokens": 0,
    "cache_hit_tokens": 0,
    "total_cache_tokens": 0,
    "effective_input_tokens": 54919,
    "display_input_tokens": 54919,
    "usage_event_count": 5,
    "tool_calls": 10,
    "turn_count": 5,
    "self_check_attempted": true,
    "self_check_ran": false,
    "self_check_succeeded": false,
    "self_check_runs": 0,
    "self_check_failed_runs": 0,
    "self_check_successful_runs": 0,
    "self_correction_edits": 0,
    "self_corrected_after_checker": false,
    "self_correction_verified": false,
    "assistant_turns_trace": 5,
    "self_check_mode": "read-checker",
    "self_check_evidence": "read /home/shaun/source/birch-html/scripts/check_birch_renderings.py",
    "deterministic_failures": 6,
    "deterministic_warnings": 0,
    "vlm_failures": 7,
    "vlm_warnings": 1,
    "deterministic_failure_units": 2,
    "deterministic_warning_units": 0,
    "vlm_failure_units": 3,
    "vlm_warning_units": 1,
    "desktop_failures": 1,
    "desktop_warnings": 0,
    "mobile_failures": 2,
    "mobile_warnings": 0,
    "deep_failures": 1,
    "deep_warnings": 0,
    "mobile_deep_failures": 2,
    "mobile_deep_warnings": 0,
    "artifact_present": true,
    "artifact_score_100": 20.0,
    "task_score": 4.0,
    "task_score_max": 20,
    "quality_score": 20.0,
    "quality_cap_reason": "missing_birch_css_and_visibly_unstyled",
    "quality_class": "fail"
  },
  {
    "suite": "publish",
    "model": "kimi",
    "model_slug": "kimi",
    "source_kind": "clean-final",
    "label": "skill-with-shell-kimi-publication-final",
    "eval": "implementation-plan",
    "artifact_path": "results/publish/models/kimi/artifacts/implementation-plan.html",
    "screenshot_desktop_path": "results/publish/models/kimi/reports/screenshots/implementation-plan-desktop.png",
    "screenshot_mobile_path": "results/publish/models/kimi/reports/screenshots/implementation-plan-mobile.png",
    "screenshot_deep_path": "results/publish/models/kimi/reports/screenshots/implementation-plan-deep.png",
    "screenshot_mobile_deep_path": "results/publish/models/kimi/reports/screenshots/implementation-plan-mobile-deep.png",
    "artifact_bytes": 50937,
    "generation_ok": true,
    "generation_duration_s": 372.779,
    "input_tokens": 468652,
    "output_tokens": 19358,
    "total_tokens": 488010,
    "billing_tokens": 488010,
    "reasoning_tokens": 0,
    "tool_use_tokens": 0,
    "cache_read_tokens": 0,
    "cache_write_tokens": 0,
    "cache_hit_tokens": 415232,
    "total_cache_tokens": 415232,
    "effective_input_tokens": 53420,
    "display_input_tokens": 468652,
    "usage_event_count": 15,
    "tool_calls": 16,
    "turn_count": 15,
    "self_check_attempted": true,
    "self_check_ran": true,
    "self_check_succeeded": true,
    "self_check_runs": 1,
    "self_check_failed_runs": 0,
    "self_check_successful_runs": 1,
    "self_correction_edits": 0,
    "self_corrected_after_checker": false,
    "self_correction_verified": false,
    "assistant_turns_trace": 15,
    "self_check_mode": "run-checker-cli",
    "self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-kimi-publication-final/implementation-",
    "deterministic_failures": 0,
    "deterministic_warnings": 0,
    "vlm_failures": 0,
    "vlm_warnings": 0,
    "deterministic_failure_units": 0,
    "deterministic_warning_units": 0,
    "vlm_failure_units": 0,
    "vlm_warning_units": 0,
    "desktop_failures": 0,
    "desktop_warnings": 0,
    "mobile_failures": 0,
    "mobile_warnings": 0,
    "deep_failures": 0,
    "deep_warnings": 0,
    "mobile_deep_failures": 0,
    "mobile_deep_warnings": 0,
    "artifact_present": true,
    "artifact_score_100": 100.0,
    "task_score": 20.0,
    "task_score_max": 20,
    "quality_score": 100.0,
    "quality_cap_reason": "",
    "quality_class": "clean"
  },
  {
    "suite": "publish",
    "model": "kimi",
    "model_slug": "kimi",
    "source_kind": "clean-final",
    "label": "skill-with-shell-kimi-publication-final",
    "eval": "benchmark-comparison",
    "artifact_path": "results/publish/models/kimi/artifacts/benchmark-comparison.html",
    "screenshot_desktop_path": "results/publish/models/kimi/reports/screenshots/benchmark-comparison-desktop.png",
    "screenshot_mobile_path": "results/publish/models/kimi/reports/screenshots/benchmark-comparison-mobile.png",
    "screenshot_deep_path": "results/publish/models/kimi/reports/screenshots/benchmark-comparison-deep.png",
    "screenshot_mobile_deep_path": "results/publish/models/kimi/reports/screenshots/benchmark-comparison-mobile-deep.png",
    "artifact_bytes": 51725,
    "generation_ok": true,
    "generation_duration_s": 427.336,
    "input_tokens": 358341,
    "output_tokens": 15297,
    "total_tokens": 373638,
    "billing_tokens": 373638,
    "reasoning_tokens": 0,
    "tool_use_tokens": 0,
    "cache_read_tokens": 0,
    "cache_write_tokens": 0,
    "cache_hit_tokens": 299776,
    "total_cache_tokens": 299776,
    "effective_input_tokens": 58565,
    "display_input_tokens": 358341,
    "usage_event_count": 14,
    "tool_calls": 14,
    "turn_count": 14,
    "self_check_attempted": true,
    "self_check_ran": true,
    "self_check_succeeded": true,
    "self_check_runs": 1,
    "self_check_failed_runs": 0,
    "self_check_successful_runs": 1,
    "self_correction_edits": 0,
    "self_corrected_after_checker": false,
    "self_correction_verified": false,
    "assistant_turns_trace": 14,
    "self_check_mode": "run-checker-cli",
    "self_check_evidence": "ran checker CLI: uv run --with pillow python /home/shaun/source/birch-html/skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-kimi-publicati",
    "deterministic_failures": 0,
    "deterministic_warnings": 0,
    "vlm_failures": 0,
    "vlm_warnings": 1,
    "deterministic_failure_units": 0,
    "deterministic_warning_units": 0,
    "vlm_failure_units": 0,
    "vlm_warning_units": 1,
    "desktop_failures": 0,
    "desktop_warnings": 0,
    "mobile_failures": 0,
    "mobile_warnings": 0,
    "deep_failures": 0,
    "deep_warnings": 0,
    "mobile_deep_failures": 0,
    "mobile_deep_warnings": 0,
    "artifact_present": true,
    "artifact_score_100": 99,
    "task_score": 19.8,
    "task_score_max": 20,
    "quality_score": 99,
    "quality_cap_reason": "",
    "quality_class": "warn"
  },
  {
    "suite": "publish",
    "model": "kimi27",
    "model_slug": "kimi27",
    "source_kind": "clean-final",
    "label": "skill-with-shell-kimi27-publication-final",
    "eval": "numeric-data",
    "artifact_path": "results/publish/models/kimi27/artifacts/numeric-data.html",
    "screenshot_desktop_path": "results/publish/models/kimi27/reports/screenshots/numeric-data-desktop.png",
    "screenshot_mobile_path": "results/publish/models/kimi27/reports/screenshots/numeric-data-mobile.png",
    "screenshot_deep_path": "results/publish/models/kimi27/reports/screenshots/numeric-data-deep.png",
    "screenshot_mobile_deep_path": "results/publish/models/kimi27/reports/screenshots/numeric-data-mobile-deep.png",
    "artifact_bytes": 41967,
    "generation_ok": true,
    "generation_duration_s": 210.371,
    "input_tokens": 1978925,
    "output_tokens": 17532,
    "total_tokens": 1996457,
    "billing_tokens": 1996457,
    "reasoning_tokens": 0,
    "tool_use_tokens": 0,
    "cache_read_tokens": 0,
    "cache_write_tokens": 0,
    "cache_hit_tokens": 1687898,
    "total_cache_tokens": 1687898,
    "effective_input_tokens": 291027,
    "display_input_tokens": 1978925,
    "usage_event_count": 30,
    "tool_calls": 32,
    "turn_count": 30,
    "self_check_attempted": true,
    "self_check_ran": true,
    "self_check_succeeded": true,
    "self_check_runs": 3,
    "self_check_failed_runs": 0,
    "self_check_successful_runs": 3,
    "self_correction_edits": 0,
    "self_corrected_after_checker": false,
    "self_correction_verified": false,
    "assistant_turns_trace": 30,
    "self_check_mode": "read-checker,run-checker-cli",
    "self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && if [ -f skill/scripts/check_birch_renderings.py ]; then uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs | read /home/shaun/source/birch-html/skill/scripts/check_birch_renderings.py | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-kimi27-p | ran checker CLI: cd /home/shaun/source/birch-html && uv run skill/scripts/finish_birch_html.py eval-runs/skill-with-shell-kimi27-publication-final/numeric-data.html && uv run --with pillow python s",
    "deterministic_failures": 0,
    "deterministic_warnings": 0,
    "vlm_failures": 0,
    "vlm_warnings": 0,
    "deterministic_failure_units": 0,
    "deterministic_warning_units": 0,
    "vlm_failure_units": 0,
    "vlm_warning_units": 0,
    "desktop_failures": 0,
    "desktop_warnings": 0,
    "mobile_failures": 0,
    "mobile_warnings": 0,
    "deep_failures": 0,
    "deep_warnings": 0,
    "mobile_deep_failures": 0,
    "mobile_deep_warnings": 0,
    "artifact_present": true,
    "artifact_score_100": 100.0,
    "task_score": 20.0,
    "task_score_max": 20,
    "quality_score": 100.0,
    "quality_cap_reason": "",
    "quality_class": "clean"
  },
  {
    "suite": "publish",
    "model": "kimi27",
    "model_slug": "kimi27",
    "source_kind": "clean-final",
    "label": "skill-with-shell-kimi27-publication-final",
    "eval": "code-review",
    "artifact_path": "results/publish/models/kimi27/artifacts/code-review.html",
    "screenshot_desktop_path": "results/publish/models/kimi27/reports/screenshots/code-review-desktop.png",
    "screenshot_mobile_path": "results/publish/models/kimi27/reports/screenshots/code-review-mobile.png",
    "screenshot_deep_path": "results/publish/models/kimi27/reports/screenshots/code-review-deep.png",
    "screenshot_mobile_deep_path": "results/publish/models/kimi27/reports/screenshots/code-review-mobile-deep.png",
    "artifact_bytes": 47402,
    "generation_ok": true,
    "generation_duration_s": 253.252,
    "input_tokens": 1509119,
    "output_tokens": 28034,
    "total_tokens": 1537153,
    "billing_tokens": 1537153,
    "reasoning_tokens": 0,
    "tool_use_tokens": 0,
    "cache_read_tokens": 0,
    "cache_write_tokens": 0,
    "cache_hit_tokens": 1144217,
    "total_cache_tokens": 1144217,
    "effective_input_tokens": 364902,
    "display_input_tokens": 1509119,
    "usage_event_count": 25,
    "tool_calls": 30,
    "turn_count": 25,
    "self_check_attempted": true,
    "self_check_ran": true,
    "self_check_succeeded": true,
    "self_check_runs": 2,
    "self_check_failed_runs": 0,
    "self_check_successful_runs": 2,
    "self_correction_edits": 0,
    "self_corrected_after_checker": false,
    "self_correction_verified": false,
    "assistant_turns_trace": 25,
    "self_check_mode": "run-checker-cli",
    "self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-kimi27-publication-final/code-review.html 2> | ran checker CLI: cd /home/shaun/source/birch-html && uv run python skill/scripts/finish_birch_html.py eval-runs/skill-with-shell-kimi27-publication-final/code-review.html && uv run --with pillow py",
    "deterministic_failures": 0,
    "deterministic_warnings": 0,
    "vlm_failures": 0,
    "vlm_warnings": 0,
    "deterministic_failure_units": 0,
    "deterministic_warning_units": 0,
    "vlm_failure_units": 0,
    "vlm_warning_units": 0,
    "desktop_failures": 0,
    "desktop_warnings": 0,
    "mobile_failures": 0,
    "mobile_warnings": 0,
    "deep_failures": 0,
    "deep_warnings": 0,
    "mobile_deep_failures": 0,
    "mobile_deep_warnings": 0,
    "artifact_present": true,
    "artifact_score_100": 100.0,
    "task_score": 20.0,
    "task_score_max": 20,
    "quality_score": 100.0,
    "quality_cap_reason": "",
    "quality_class": "clean"
  },
  {
    "suite": "publish",
    "model": "kimi27",
    "model_slug": "kimi27",
    "source_kind": "clean-final",
    "label": "skill-with-shell-kimi27-publication-final",
    "eval": "module-explainer",
    "artifact_path": "results/publish/models/kimi27/artifacts/module-explainer.html",
    "screenshot_desktop_path": "results/publish/models/kimi27/reports/screenshots/module-explainer-desktop.png",
    "screenshot_mobile_path": "results/publish/models/kimi27/reports/screenshots/module-explainer-mobile.png",
    "screenshot_deep_path": "results/publish/models/kimi27/reports/screenshots/module-explainer-deep.png",
    "screenshot_mobile_deep_path": "results/publish/models/kimi27/reports/screenshots/module-explainer-mobile-deep.png",
    "artifact_bytes": 52748,
    "generation_ok": true,
    "generation_duration_s": 136.617,
    "input_tokens": 582570,
    "output_tokens": 12473,
    "total_tokens": 595043,
    "billing_tokens": 595043,
    "reasoning_tokens": 0,
    "tool_use_tokens": 0,
    "cache_read_tokens": 0,
    "cache_write_tokens": 0,
    "cache_hit_tokens": 334281,
    "total_cache_tokens": 334281,
    "effective_input_tokens": 248289,
    "display_input_tokens": 582570,
    "usage_event_count": 7,
    "tool_calls": 14,
    "turn_count": 7,
    "self_check_attempted": true,
    "self_check_ran": true,
    "self_check_succeeded": true,
    "self_check_runs": 1,
    "self_check_failed_runs": 0,
    "self_check_successful_runs": 1,
    "self_correction_edits": 0,
    "self_corrected_after_checker": false,
    "self_correction_verified": false,
    "assistant_turns_trace": 7,
    "self_check_mode": "read-checker,run-checker-cli",
    "self_check_evidence": "read /home/shaun/source/birch-html/scripts/check_birch_renderings.py | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-kimi27-publication-final/module-explainer.ht",
    "deterministic_failures": 0,
    "deterministic_warnings": 0,
    "vlm_failures": 0,
    "vlm_warnings": 0,
    "deterministic_failure_units": 0,
    "deterministic_warning_units": 0,
    "vlm_failure_units": 0,
    "vlm_warning_units": 0,
    "desktop_failures": 0,
    "desktop_warnings": 0,
    "mobile_failures": 0,
    "mobile_warnings": 0,
    "deep_failures": 0,
    "deep_warnings": 0,
    "mobile_deep_failures": 0,
    "mobile_deep_warnings": 0,
    "artifact_present": true,
    "artifact_score_100": 100.0,
    "task_score": 20.0,
    "task_score_max": 20,
    "quality_score": 100.0,
    "quality_cap_reason": "",
    "quality_class": "clean"
  },
  {
    "suite": "publish",
    "model": "kimi27",
    "model_slug": "kimi27",
    "source_kind": "clean-final",
    "label": "skill-with-shell-kimi27-publication-final",
    "eval": "implementation-plan",
    "artifact_path": "results/publish/models/kimi27/artifacts/implementation-plan.html",
    "screenshot_desktop_path": "results/publish/models/kimi27/reports/screenshots/implementation-plan-desktop.png",
    "screenshot_mobile_path": "results/publish/models/kimi27/reports/screenshots/implementation-plan-mobile.png",
    "screenshot_deep_path": "results/publish/models/kimi27/reports/screenshots/implementation-plan-deep.png",
    "screenshot_mobile_deep_path": "results/publish/models/kimi27/reports/screenshots/implementation-plan-mobile-deep.png",
    "artifact_bytes": 52277,
    "generation_ok": true,
    "generation_duration_s": 72.968,
    "input_tokens": 487122,
    "output_tokens": 6684,
    "total_tokens": 493806,
    "billing_tokens": 493806,
    "reasoning_tokens": 0,
    "tool_use_tokens": 0,
    "cache_read_tokens": 0,
    "cache_write_tokens": 0,
    "cache_hit_tokens": 332350,
    "total_cache_tokens": 332350,
    "effective_input_tokens": 154772,
    "display_input_tokens": 487122,
    "usage_event_count": 9,
    "tool_calls": 9,
    "turn_count": 9,
    "self_check_attempted": true,
    "self_check_ran": true,
    "self_check_succeeded": true,
    "self_check_runs": 2,
    "self_check_failed_runs": 1,
    "self_check_successful_runs": 1,
    "self_correction_edits": 0,
    "self_corrected_after_checker": false,
    "self_correction_verified": true,
    "assistant_turns_trace": 9,
    "self_check_mode": "run-checker-cli",
    "self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-kimi27-publication-final/implementatio | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-kimi27-p",
    "deterministic_failures": 0,
    "deterministic_warnings": 0,
    "vlm_failures": 0,
    "vlm_warnings": 0,
    "deterministic_failure_units": 0,
    "deterministic_warning_units": 0,
    "vlm_failure_units": 0,
    "vlm_warning_units": 0,
    "desktop_failures": 0,
    "desktop_warnings": 0,
    "mobile_failures": 0,
    "mobile_warnings": 0,
    "deep_failures": 0,
    "deep_warnings": 0,
    "mobile_deep_failures": 0,
    "mobile_deep_warnings": 0,
    "artifact_present": true,
    "artifact_score_100": 100.0,
    "task_score": 20.0,
    "task_score_max": 20,
    "quality_score": 100.0,
    "quality_cap_reason": "",
    "quality_class": "clean"
  },
  {
    "suite": "publish",
    "model": "kimi27",
    "model_slug": "kimi27",
    "source_kind": "clean-final",
    "label": "skill-with-shell-kimi27-publication-final",
    "eval": "benchmark-comparison",
    "artifact_path": "results/publish/models/kimi27/artifacts/benchmark-comparison.html",
    "screenshot_desktop_path": "results/publish/models/kimi27/reports/screenshots/benchmark-comparison-desktop.png",
    "screenshot_mobile_path": "results/publish/models/kimi27/reports/screenshots/benchmark-comparison-mobile.png",
    "screenshot_deep_path": "results/publish/models/kimi27/reports/screenshots/benchmark-comparison-deep.png",
    "screenshot_mobile_deep_path": "results/publish/models/kimi27/reports/screenshots/benchmark-comparison-mobile-deep.png",
    "artifact_bytes": 59856,
    "generation_ok": true,
    "generation_duration_s": 159.927,
    "input_tokens": 1290293,
    "output_tokens": 18058,
    "total_tokens": 1308351,
    "billing_tokens": 1308351,
    "reasoning_tokens": 0,
    "tool_use_tokens": 0,
    "cache_read_tokens": 0,
    "cache_write_tokens": 0,
    "cache_hit_tokens": 1169139,
    "total_cache_tokens": 1169139,
    "effective_input_tokens": 121154,
    "display_input_tokens": 1290293,
    "usage_event_count": 16,
    "tool_calls": 19,
    "turn_count": 16,
    "self_check_attempted": true,
    "self_check_ran": true,
    "self_check_succeeded": true,
    "self_check_runs": 2,
    "self_check_failed_runs": 1,
    "self_check_successful_runs": 1,
    "self_correction_edits": 0,
    "self_corrected_after_checker": false,
    "self_correction_verified": true,
    "assistant_turns_trace": 16,
    "self_check_mode": "read-checker,run-checker-cli",
    "self_check_evidence": "read /home/shaun/source/birch-html/skill/scripts/check_birch_renderings.py | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python3 skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-kimi27-publication-final/benchmark-co | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python3 skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-kimi27-",
    "deterministic_failures": 0,
    "deterministic_warnings": 0,
    "vlm_failures": 0,
    "vlm_warnings": 0,
    "deterministic_failure_units": 0,
    "deterministic_warning_units": 0,
    "vlm_failure_units": 0,
    "vlm_warning_units": 0,
    "desktop_failures": 0,
    "desktop_warnings": 0,
    "mobile_failures": 0,
    "mobile_warnings": 0,
    "deep_failures": 0,
    "deep_warnings": 0,
    "mobile_deep_failures": 0,
    "mobile_deep_warnings": 0,
    "artifact_present": true,
    "artifact_score_100": 100.0,
    "task_score": 20.0,
    "task_score_max": 20,
    "quality_score": 100.0,
    "quality_cap_reason": "",
    "quality_class": "clean"
  },
  {
    "suite": "publish",
    "model": "minimax27",
    "model_slug": "minimax27",
    "source_kind": "clean-final",
    "label": "skill-with-shell-minimax27-publication-final",
    "eval": "numeric-data",
    "artifact_path": "results/publish/models/minimax27/artifacts/numeric-data.html",
    "screenshot_desktop_path": "results/publish/models/minimax27/reports/screenshots/numeric-data-desktop.png",
    "screenshot_mobile_path": "results/publish/models/minimax27/reports/screenshots/numeric-data-mobile.png",
    "screenshot_deep_path": "results/publish/models/minimax27/reports/screenshots/numeric-data-deep.png",
    "screenshot_mobile_deep_path": "results/publish/models/minimax27/reports/screenshots/numeric-data-mobile-deep.png",
    "artifact_bytes": 50838,
    "generation_ok": false,
    "generation_duration_s": 160.154,
    "input_tokens": 87235,
    "output_tokens": 10902,
    "total_tokens": 98137,
    "billing_tokens": 98137,
    "reasoning_tokens": 0,
    "tool_use_tokens": 0,
    "cache_read_tokens": 0,
    "cache_write_tokens": 0,
    "cache_hit_tokens": 116736,
    "total_cache_tokens": 116736,
    "effective_input_tokens": 81499,
    "display_input_tokens": 198235,
    "usage_event_count": 12,
    "tool_calls": 9,
    "turn_count": 10,
    "self_check_attempted": true,
    "self_check_ran": true,
    "self_check_succeeded": true,
    "self_check_runs": 2,
    "self_check_failed_runs": 1,
    "self_check_successful_runs": 1,
    "self_correction_edits": 0,
    "self_corrected_after_checker": false,
    "self_correction_verified": true,
    "assistant_turns_trace": 12,
    "self_check_mode": "",
    "self_check_evidence": "",
    "deterministic_failures": 0,
    "deterministic_warnings": 0,
    "vlm_failures": 0,
    "vlm_warnings": 0,
    "deterministic_failure_units": 0,
    "deterministic_warning_units": 0,
    "vlm_failure_units": 0,
    "vlm_warning_units": 0,
    "desktop_failures": 0,
    "desktop_warnings": 0,
    "mobile_failures": 0,
    "mobile_warnings": 0,
    "deep_failures": 0,
    "deep_warnings": 0,
    "mobile_deep_failures": 0,
    "mobile_deep_warnings": 0,
    "artifact_present": true,
    "artifact_score_100": 100.0,
    "task_score": 20.0,
    "task_score_max": 20,
    "quality_score": 100.0,
    "quality_cap_reason": "",
    "quality_class": "clean"
  },
  {
    "suite": "publish",
    "model": "minimax27",
    "model_slug": "minimax27",
    "source_kind": "clean-final",
    "label": "skill-with-shell-minimax27-publication-final",
    "eval": "code-review",
    "artifact_path": "results/publish/models/minimax27/artifacts/code-review.html",
    "screenshot_desktop_path": "results/publish/models/minimax27/reports/screenshots/code-review-desktop.png",
    "screenshot_mobile_path": "results/publish/models/minimax27/reports/screenshots/code-review-mobile.png",
    "screenshot_deep_path": "results/publish/models/minimax27/reports/screenshots/code-review-deep.png",
    "screenshot_mobile_deep_path": "results/publish/models/minimax27/reports/screenshots/code-review-mobile-deep.png",
    "artifact_bytes": 43165,
    "generation_ok": true,
    "generation_duration_s": 211.215,
    "input_tokens": 444148,
    "output_tokens": 7213,
    "total_tokens": 451361,
    "billing_tokens": 451361,
    "reasoning_tokens": 0,
    "tool_use_tokens": 0,
    "cache_read_tokens": 0,
    "cache_write_tokens": 0,
    "cache_hit_tokens": 355328,
    "total_cache_tokens": 355328,
    "effective_input_tokens": 88820,
    "display_input_tokens": 444148,
    "usage_event_count": 18,
    "tool_calls": 20,
    "turn_count": 18,
    "self_check_attempted": false,
    "self_check_ran": false,
    "self_check_succeeded": false,
    "self_check_runs": 0,
    "self_check_failed_runs": 0,
    "self_check_successful_runs": 0,
    "self_correction_edits": 0,
    "self_corrected_after_checker": false,
    "self_correction_verified": false,
    "assistant_turns_trace": 18,
    "self_check_mode": "",
    "self_check_evidence": "",
    "deterministic_failures": 0,
    "deterministic_warnings": 0,
    "vlm_failures": 0,
    "vlm_warnings": 0,
    "deterministic_failure_units": 0,
    "deterministic_warning_units": 0,
    "vlm_failure_units": 0,
    "vlm_warning_units": 0,
    "desktop_failures": 0,
    "desktop_warnings": 0,
    "mobile_failures": 0,
    "mobile_warnings": 0,
    "deep_failures": 0,
    "deep_warnings": 0,
    "mobile_deep_failures": 0,
    "mobile_deep_warnings": 0,
    "artifact_present": true,
    "artifact_score_100": 100.0,
    "task_score": 20.0,
    "task_score_max": 20,
    "quality_score": 100.0,
    "quality_cap_reason": "",
    "quality_class": "clean"
  },
  {
    "suite": "publish",
    "model": "minimax27",
    "model_slug": "minimax27",
    "source_kind": "clean-final",
    "label": "skill-with-shell-minimax27-publication-final",
    "eval": "module-explainer",
    "artifact_path": "results/publish/models/minimax27/artifacts/module-explainer.html",
    "screenshot_desktop_path": "results/publish/models/minimax27/reports/screenshots/module-explainer-desktop.png",
    "screenshot_mobile_path": "results/publish/models/minimax27/reports/screenshots/module-explainer-mobile.png",
    "screenshot_deep_path": "results/publish/models/minimax27/reports/screenshots/module-explainer-deep.png",
    "screenshot_mobile_deep_path": "results/publish/models/minimax27/reports/screenshots/module-explainer-mobile-deep.png",
    "artifact_bytes": 50511,
    "generation_ok": false,
    "generation_duration_s": 183.748,
    "input_tokens": 185140,
    "output_tokens": 15068,
    "total_tokens": 200208,
    "billing_tokens": 200208,
    "reasoning_tokens": 0,
    "tool_use_tokens": 0,
    "cache_read_tokens": 0,
    "cache_write_tokens": 0,
    "cache_hit_tokens": 232320,
    "total_cache_tokens": 232320,
    "effective_input_tokens": 148313,
    "display_input_tokens": 380633,
    "usage_event_count": 9,
    "tool_calls": 9,
    "turn_count": 5,
    "self_check_attempted": true,
    "self_check_ran": false,
    "self_check_succeeded": false,
    "self_check_runs": 0,
    "self_check_failed_runs": 0,
    "self_check_successful_runs": 0,
    "self_correction_edits": 0,
    "self_corrected_after_checker": false,
    "self_correction_verified": false,
    "assistant_turns_trace": 9,
    "self_check_mode": "read-checker",
    "self_check_evidence": "read /home/shaun/source/birch-html/scripts/check_birch_renderings.py",
    "deterministic_failures": 4,
    "deterministic_warnings": 0,
    "vlm_failures": 4,
    "vlm_warnings": 0,
    "deterministic_failure_units": 1,
    "deterministic_warning_units": 0,
    "vlm_failure_units": 1,
    "vlm_warning_units": 0,
    "desktop_failures": 1,
    "desktop_warnings": 0,
    "mobile_failures": 1,
    "mobile_warnings": 0,
    "deep_failures": 1,
    "deep_warnings": 0,
    "mobile_deep_failures": 1,
    "mobile_deep_warnings": 0,
    "artifact_present": true,
    "artifact_score_100": 20.0,
    "task_score": 4.0,
    "task_score_max": 20,
    "quality_score": 20.0,
    "quality_cap_reason": "missing_birch_css_and_visibly_unstyled",
    "quality_class": "fail"
  },
  {
    "suite": "publish",
    "model": "minimax27",
    "model_slug": "minimax27",
    "source_kind": "clean-final",
    "label": "skill-with-shell-minimax27-publication-final",
    "eval": "implementation-plan",
    "artifact_path": "results/publish/models/minimax27/artifacts/implementation-plan.html",
    "screenshot_desktop_path": "results/publish/models/minimax27/reports/screenshots/implementation-plan-desktop.png",
    "screenshot_mobile_path": "results/publish/models/minimax27/reports/screenshots/implementation-plan-mobile.png",
    "screenshot_deep_path": "results/publish/models/minimax27/reports/screenshots/implementation-plan-deep.png",
    "screenshot_mobile_deep_path": "results/publish/models/minimax27/reports/screenshots/implementation-plan-mobile-deep.png",
    "artifact_bytes": 21904,
    "generation_ok": false,
    "generation_duration_s": 64.763,
    "input_tokens": 27146,
    "output_tokens": 4563,
    "total_tokens": 31709,
    "billing_tokens": 31709,
    "reasoning_tokens": 0,
    "tool_use_tokens": 0,
    "cache_read_tokens": 0,
    "cache_write_tokens": 0,
    "cache_hit_tokens": 7040,
    "total_cache_tokens": 7040,
    "effective_input_tokens": 11494,
    "display_input_tokens": 18534,
    "usage_event_count": 3,
    "tool_calls": 3,
    "turn_count": 4,
    "self_check_attempted": false,
    "self_check_ran": false,
    "self_check_succeeded": false,
    "self_check_runs": 0,
    "self_check_failed_runs": 0,
    "self_check_successful_runs": 0,
    "self_correction_edits": 0,
    "self_corrected_after_checker": false,
    "self_correction_verified": false,
    "assistant_turns_trace": 3,
    "self_check_mode": "",
    "self_check_evidence": "",
    "deterministic_failures": 14,
    "deterministic_warnings": 4,
    "vlm_failures": 0,
    "vlm_warnings": 0,
    "deterministic_failure_units": 4,
    "deterministic_warning_units": 1,
    "vlm_failure_units": 0,
    "vlm_warning_units": 0,
    "desktop_failures": 3,
    "desktop_warnings": 1,
    "mobile_failures": 4,
    "mobile_warnings": 1,
    "deep_failures": 3,
    "deep_warnings": 1,
    "mobile_deep_failures": 4,
    "mobile_deep_warnings": 1,
    "artifact_present": true,
    "artifact_score_100": 35.0,
    "task_score": 7.0,
    "task_score_max": 20,
    "quality_score": 35.0,
    "quality_cap_reason": "missing_birch_css",
    "quality_class": "fail"
  },
  {
    "suite": "publish",
    "model": "minimax27",
    "model_slug": "minimax27",
    "source_kind": "clean-final",
    "label": "skill-with-shell-minimax27-publication-final",
    "eval": "benchmark-comparison",
    "artifact_path": "results/publish/models/minimax27/artifacts/benchmark-comparison.html",
    "screenshot_desktop_path": "results/publish/models/minimax27/reports/screenshots/benchmark-comparison-desktop.png",
    "screenshot_mobile_path": "results/publish/models/minimax27/reports/screenshots/benchmark-comparison-mobile.png",
    "screenshot_deep_path": "results/publish/models/minimax27/reports/screenshots/benchmark-comparison-deep.png",
    "screenshot_mobile_deep_path": "results/publish/models/minimax27/reports/screenshots/benchmark-comparison-mobile-deep.png",
    "artifact_bytes": 79228,
    "generation_ok": false,
    "generation_duration_s": 420.033,
    "input_tokens": 511926,
    "output_tokens": 33192,
    "total_tokens": 545118,
    "billing_tokens": 545118,
    "reasoning_tokens": 0,
    "tool_use_tokens": 0,
    "cache_read_tokens": 0,
    "cache_write_tokens": 0,
    "cache_hit_tokens": 129664,
    "total_cache_tokens": 129664,
    "effective_input_tokens": 154885,
    "display_input_tokens": 284549,
    "usage_event_count": 7,
    "tool_calls": 14,
    "turn_count": 13,
    "self_check_attempted": true,
    "self_check_ran": true,
    "self_check_succeeded": true,
    "self_check_runs": 1,
    "self_check_failed_runs": 0,
    "self_check_successful_runs": 1,
    "self_correction_edits": 0,
    "self_corrected_after_checker": false,
    "self_correction_verified": false,
    "assistant_turns_trace": 7,
    "self_check_mode": "run-checker-cli",
    "self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-minimax27-publication-final/benchmark-comparison.html 2>&1 ",
    "deterministic_failures": 8,
    "deterministic_warnings": 0,
    "vlm_failures": 0,
    "vlm_warnings": 4,
    "deterministic_failure_units": 2,
    "deterministic_warning_units": 0,
    "vlm_failure_units": 0,
    "vlm_warning_units": 1,
    "desktop_failures": 2,
    "desktop_warnings": 0,
    "mobile_failures": 2,
    "mobile_warnings": 0,
    "deep_failures": 2,
    "deep_warnings": 0,
    "mobile_deep_failures": 2,
    "mobile_deep_warnings": 0,
    "artifact_present": true,
    "artifact_score_100": 35.0,
    "task_score": 7.0,
    "task_score_max": 20,
    "quality_score": 35.0,
    "quality_cap_reason": "missing_birch_css",
    "quality_class": "fail"
  },
  {
    "suite": "publish",
    "model": "opus47",
    "model_slug": "opus47",
    "source_kind": "clean-final",
    "label": "skill-with-shell-opus47-publication-final",
    "eval": "numeric-data",
    "artifact_path": "results/publish/models/opus47/artifacts/numeric-data.html",
    "screenshot_desktop_path": "results/publish/models/opus47/reports/screenshots/numeric-data-desktop.png",
    "screenshot_mobile_path": "results/publish/models/opus47/reports/screenshots/numeric-data-mobile.png",
    "screenshot_deep_path": "results/publish/models/opus47/reports/screenshots/numeric-data-deep.png",
    "screenshot_mobile_deep_path": "results/publish/models/opus47/reports/screenshots/numeric-data-mobile-deep.png",
    "artifact_bytes": 45758,
    "generation_ok": true,
    "generation_duration_s": 106.088,
    "input_tokens": 161380,
    "output_tokens": 8823,
    "total_tokens": 170203,
    "billing_tokens": 170203,
    "reasoning_tokens": 0,
    "tool_use_tokens": 0,
    "cache_read_tokens": 114642,
    "cache_write_tokens": 25769,
    "cache_hit_tokens": 0,
    "total_cache_tokens": 140411,
    "effective_input_tokens": 20969,
    "display_input_tokens": 161380,
    "usage_event_count": 10,
    "tool_calls": 12,
    "turn_count": 10,
    "self_check_attempted": true,
    "self_check_ran": true,
    "self_check_succeeded": true,
    "self_check_runs": 2,
    "self_check_failed_runs": 0,
    "self_check_successful_runs": 2,
    "self_correction_edits": 0,
    "self_corrected_after_checker": false,
    "self_correction_verified": false,
    "assistant_turns_trace": 10,
    "self_check_mode": "run-checker-cli",
    "self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-opus47-publication-final/numeric-data. | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-opus47-p",
    "deterministic_failures": 0,
    "deterministic_warnings": 0,
    "vlm_failures": 0,
    "vlm_warnings": 0,
    "deterministic_failure_units": 0,
    "deterministic_warning_units": 0,
    "vlm_failure_units": 0,
    "vlm_warning_units": 0,
    "desktop_failures": 0,
    "desktop_warnings": 0,
    "mobile_failures": 0,
    "mobile_warnings": 0,
    "deep_failures": 0,
    "deep_warnings": 0,
    "mobile_deep_failures": 0,
    "mobile_deep_warnings": 0,
    "artifact_present": true,
    "artifact_score_100": 100.0,
    "task_score": 20.0,
    "task_score_max": 20,
    "quality_score": 100.0,
    "quality_cap_reason": "",
    "quality_class": "clean"
  },
  {
    "suite": "publish",
    "model": "opus47",
    "model_slug": "opus47",
    "source_kind": "clean-final",
    "label": "skill-with-shell-opus47-publication-final",
    "eval": "code-review",
    "artifact_path": "results/publish/models/opus47/artifacts/code-review.html",
    "screenshot_desktop_path": "results/publish/models/opus47/reports/screenshots/code-review-desktop.png",
    "screenshot_mobile_path": "results/publish/models/opus47/reports/screenshots/code-review-mobile.png",
    "screenshot_deep_path": "results/publish/models/opus47/reports/screenshots/code-review-deep.png",
    "screenshot_mobile_deep_path": "results/publish/models/opus47/reports/screenshots/code-review-mobile-deep.png",
    "artifact_bytes": 50191,
    "generation_ok": true,
    "generation_duration_s": 268.356,
    "input_tokens": 571314,
    "output_tokens": 17059,
    "total_tokens": 588373,
    "billing_tokens": 588373,
    "reasoning_tokens": 0,
    "tool_use_tokens": 0,
    "cache_read_tokens": 441950,
    "cache_write_tokens": 55976,
    "cache_hit_tokens": 0,
    "total_cache_tokens": 497926,
    "effective_input_tokens": 73388,
    "display_input_tokens": 571314,
    "usage_event_count": 14,
    "tool_calls": 18,
    "turn_count": 14,
    "self_check_attempted": true,
    "self_check_ran": true,
    "self_check_succeeded": true,
    "self_check_runs": 3,
    "self_check_failed_runs": 0,
    "self_check_successful_runs": 3,
    "self_correction_edits": 0,
    "self_corrected_after_checker": false,
    "self_correction_verified": false,
    "assistant_turns_trace": 14,
    "self_check_mode": "run-checker-cli",
    "self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-opus47-publication-final/code-review.h | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-opus47-p",
    "deterministic_failures": 0,
    "deterministic_warnings": 0,
    "vlm_failures": 0,
    "vlm_warnings": 0,
    "deterministic_failure_units": 0,
    "deterministic_warning_units": 0,
    "vlm_failure_units": 0,
    "vlm_warning_units": 0,
    "desktop_failures": 0,
    "desktop_warnings": 0,
    "mobile_failures": 0,
    "mobile_warnings": 0,
    "deep_failures": 0,
    "deep_warnings": 0,
    "mobile_deep_failures": 0,
    "mobile_deep_warnings": 0,
    "artifact_present": true,
    "artifact_score_100": 100.0,
    "task_score": 20.0,
    "task_score_max": 20,
    "quality_score": 100.0,
    "quality_cap_reason": "",
    "quality_class": "clean"
  },
  {
    "suite": "publish",
    "model": "opus47",
    "model_slug": "opus47",
    "source_kind": "clean-final",
    "label": "skill-with-shell-opus47-publication-final",
    "eval": "module-explainer",
    "artifact_path": "results/publish/models/opus47/artifacts/module-explainer.html",
    "screenshot_desktop_path": "results/publish/models/opus47/reports/screenshots/module-explainer-desktop.png",
    "screenshot_mobile_path": "results/publish/models/opus47/reports/screenshots/module-explainer-mobile.png",
    "screenshot_deep_path": "results/publish/models/opus47/reports/screenshots/module-explainer-deep.png",
    "screenshot_mobile_deep_path": "results/publish/models/opus47/reports/screenshots/module-explainer-mobile-deep.png",
    "artifact_bytes": 58814,
    "generation_ok": true,
    "generation_duration_s": 206.748,
    "input_tokens": 653611,
    "output_tokens": 15632,
    "total_tokens": 669243,
    "billing_tokens": 669243,
    "reasoning_tokens": 0,
    "tool_use_tokens": 0,
    "cache_read_tokens": 502232,
    "cache_write_tokens": 65941,
    "cache_hit_tokens": 0,
    "total_cache_tokens": 568173,
    "effective_input_tokens": 85438,
    "display_input_tokens": 653611,
    "usage_event_count": 13,
    "tool_calls": 19,
    "turn_count": 13,
    "self_check_attempted": true,
    "self_check_ran": true,
    "self_check_succeeded": true,
    "self_check_runs": 1,
    "self_check_failed_runs": 0,
    "self_check_successful_runs": 1,
    "self_correction_edits": 0,
    "self_corrected_after_checker": false,
    "self_correction_verified": false,
    "assistant_turns_trace": 13,
    "self_check_mode": "read-checker,run-checker-cli",
    "self_check_evidence": "read /home/shaun/source/birch-html/scripts/check_birch_renderings.py | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-opus47-publication-final/module-explainer.ht",
    "deterministic_failures": 0,
    "deterministic_warnings": 0,
    "vlm_failures": 0,
    "vlm_warnings": 0,
    "deterministic_failure_units": 0,
    "deterministic_warning_units": 0,
    "vlm_failure_units": 0,
    "vlm_warning_units": 0,
    "desktop_failures": 0,
    "desktop_warnings": 0,
    "mobile_failures": 0,
    "mobile_warnings": 0,
    "deep_failures": 0,
    "deep_warnings": 0,
    "mobile_deep_failures": 0,
    "mobile_deep_warnings": 0,
    "artifact_present": true,
    "artifact_score_100": 100.0,
    "task_score": 20.0,
    "task_score_max": 20,
    "quality_score": 100.0,
    "quality_cap_reason": "",
    "quality_class": "clean"
  },
  {
    "suite": "publish",
    "model": "opus47",
    "model_slug": "opus47",
    "source_kind": "clean-final",
    "label": "skill-with-shell-opus47-publication-final",
    "eval": "implementation-plan",
    "artifact_path": "results/publish/models/opus47/artifacts/implementation-plan.html",
    "screenshot_desktop_path": "results/publish/models/opus47/reports/screenshots/implementation-plan-desktop.png",
    "screenshot_mobile_path": "results/publish/models/opus47/reports/screenshots/implementation-plan-mobile.png",
    "screenshot_deep_path": "results/publish/models/opus47/reports/screenshots/implementation-plan-deep.png",
    "screenshot_mobile_deep_path": "results/publish/models/opus47/reports/screenshots/implementation-plan-mobile-deep.png",
    "artifact_bytes": 53012,
    "generation_ok": true,
    "generation_duration_s": 141.632,
    "input_tokens": 206186,
    "output_tokens": 9414,
    "total_tokens": 215600,
    "billing_tokens": 215600,
    "reasoning_tokens": 0,
    "tool_use_tokens": 0,
    "cache_read_tokens": 160139,
    "cache_write_tokens": 23940,
    "cache_hit_tokens": 0,
    "total_cache_tokens": 184079,
    "effective_input_tokens": 22107,
    "display_input_tokens": 206186,
    "usage_event_count": 11,
    "tool_calls": 12,
    "turn_count": 11,
    "self_check_attempted": true,
    "self_check_ran": true,
    "self_check_succeeded": true,
    "self_check_runs": 2,
    "self_check_failed_runs": 0,
    "self_check_successful_runs": 2,
    "self_correction_edits": 0,
    "self_corrected_after_checker": false,
    "self_correction_verified": false,
    "assistant_turns_trace": 11,
    "self_check_mode": "run-checker-cli",
    "self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-opus47-publication-final/implementatio | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-opus47-p",
    "deterministic_failures": 0,
    "deterministic_warnings": 0,
    "vlm_failures": 0,
    "vlm_warnings": 0,
    "deterministic_failure_units": 0,
    "deterministic_warning_units": 0,
    "vlm_failure_units": 0,
    "vlm_warning_units": 0,
    "desktop_failures": 0,
    "desktop_warnings": 0,
    "mobile_failures": 0,
    "mobile_warnings": 0,
    "deep_failures": 0,
    "deep_warnings": 0,
    "mobile_deep_failures": 0,
    "mobile_deep_warnings": 0,
    "artifact_present": true,
    "artifact_score_100": 100.0,
    "task_score": 20.0,
    "task_score_max": 20,
    "quality_score": 100.0,
    "quality_cap_reason": "",
    "quality_class": "clean"
  },
  {
    "suite": "publish",
    "model": "opus47",
    "model_slug": "opus47",
    "source_kind": "clean-final",
    "label": "skill-with-shell-opus47-publication-final",
    "eval": "benchmark-comparison",
    "artifact_path": "results/publish/models/opus47/artifacts/benchmark-comparison.html",
    "screenshot_desktop_path": "results/publish/models/opus47/reports/screenshots/benchmark-comparison-desktop.png",
    "screenshot_mobile_path": "results/publish/models/opus47/reports/screenshots/benchmark-comparison-mobile.png",
    "screenshot_deep_path": "results/publish/models/opus47/reports/screenshots/benchmark-comparison-deep.png",
    "screenshot_mobile_deep_path": "results/publish/models/opus47/reports/screenshots/benchmark-comparison-mobile-deep.png",
    "artifact_bytes": 64934,
    "generation_ok": true,
    "generation_duration_s": 150.046,
    "input_tokens": 388331,
    "output_tokens": 9617,
    "total_tokens": 397948,
    "billing_tokens": 397948,
    "reasoning_tokens": 0,
    "tool_use_tokens": 0,
    "cache_read_tokens": 328368,
    "cache_write_tokens": 33477,
    "cache_hit_tokens": 0,
    "total_cache_tokens": 361845,
    "effective_input_tokens": 26486,
    "display_input_tokens": 388331,
    "usage_event_count": 19,
    "tool_calls": 22,
    "turn_count": 19,
    "self_check_attempted": true,
    "self_check_ran": true,
    "self_check_succeeded": true,
    "self_check_runs": 2,
    "self_check_failed_runs": 0,
    "self_check_successful_runs": 2,
    "self_correction_edits": 0,
    "self_corrected_after_checker": false,
    "self_correction_verified": false,
    "assistant_turns_trace": 19,
    "self_check_mode": "run-checker-cli",
    "self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-opus47-publication-final/benchmark-com | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-opus47-p",
    "deterministic_failures": 0,
    "deterministic_warnings": 0,
    "vlm_failures": 0,
    "vlm_warnings": 0,
    "deterministic_failure_units": 0,
    "deterministic_warning_units": 0,
    "vlm_failure_units": 0,
    "vlm_warning_units": 0,
    "desktop_failures": 0,
    "desktop_warnings": 0,
    "mobile_failures": 0,
    "mobile_warnings": 0,
    "deep_failures": 0,
    "deep_warnings": 0,
    "mobile_deep_failures": 0,
    "mobile_deep_warnings": 0,
    "artifact_present": true,
    "artifact_score_100": 100.0,
    "task_score": 20.0,
    "task_score_max": 20,
    "quality_score": 100.0,
    "quality_cap_reason": "",
    "quality_class": "clean"
  },
  {
    "suite": "publish",
    "model": "sonnet46",
    "model_slug": "sonnet46",
    "source_kind": "clean-final",
    "label": "skill-with-shell-sonnet46-publication-final",
    "eval": "numeric-data",
    "artifact_path": "results/publish/models/sonnet46/artifacts/numeric-data.html",
    "screenshot_desktop_path": "results/publish/models/sonnet46/reports/screenshots/numeric-data-desktop.png",
    "screenshot_mobile_path": "results/publish/models/sonnet46/reports/screenshots/numeric-data-mobile.png",
    "screenshot_deep_path": "results/publish/models/sonnet46/reports/screenshots/numeric-data-deep.png",
    "screenshot_mobile_deep_path": "results/publish/models/sonnet46/reports/screenshots/numeric-data-mobile-deep.png",
    "artifact_bytes": 52394,
    "generation_ok": true,
    "generation_duration_s": 203.959,
    "input_tokens": 302149,
    "output_tokens": 14758,
    "total_tokens": 316907,
    "billing_tokens": 316907,
    "reasoning_tokens": 0,
    "tool_use_tokens": 0,
    "cache_read_tokens": 234504,
    "cache_write_tokens": 38197,
    "cache_hit_tokens": 0,
    "total_cache_tokens": 272701,
    "effective_input_tokens": 29448,
    "display_input_tokens": 302149,
    "usage_event_count": 13,
    "tool_calls": 15,
    "turn_count": 13,
    "self_check_attempted": true,
    "self_check_ran": true,
    "self_check_succeeded": true,
    "self_check_runs": 2,
    "self_check_failed_runs": 1,
    "self_check_successful_runs": 1,
    "self_correction_edits": 0,
    "self_corrected_after_checker": false,
    "self_correction_verified": true,
    "assistant_turns_trace": 13,
    "self_check_mode": "run-checker-cli",
    "self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-sonnet46-publication-final/numeric-dat | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-sonnet46",
    "deterministic_failures": 0,
    "deterministic_warnings": 0,
    "vlm_failures": 0,
    "vlm_warnings": 0,
    "deterministic_failure_units": 0,
    "deterministic_warning_units": 0,
    "vlm_failure_units": 0,
    "vlm_warning_units": 0,
    "desktop_failures": 0,
    "desktop_warnings": 0,
    "mobile_failures": 0,
    "mobile_warnings": 0,
    "deep_failures": 0,
    "deep_warnings": 0,
    "mobile_deep_failures": 0,
    "mobile_deep_warnings": 0,
    "artifact_present": true,
    "artifact_score_100": 100.0,
    "task_score": 20.0,
    "task_score_max": 20,
    "quality_score": 100.0,
    "quality_cap_reason": "",
    "quality_class": "clean"
  },
  {
    "suite": "publish",
    "model": "sonnet46",
    "model_slug": "sonnet46",
    "source_kind": "clean-final",
    "label": "skill-with-shell-sonnet46-publication-final",
    "eval": "code-review",
    "artifact_path": "results/publish/models/sonnet46/artifacts/code-review.html",
    "screenshot_desktop_path": "results/publish/models/sonnet46/reports/screenshots/code-review-desktop.png",
    "screenshot_mobile_path": "results/publish/models/sonnet46/reports/screenshots/code-review-mobile.png",
    "screenshot_deep_path": "results/publish/models/sonnet46/reports/screenshots/code-review-deep.png",
    "screenshot_mobile_deep_path": "results/publish/models/sonnet46/reports/screenshots/code-review-mobile-deep.png",
    "artifact_bytes": 57805,
    "generation_ok": true,
    "generation_duration_s": 302.047,
    "input_tokens": 477280,
    "output_tokens": 18427,
    "total_tokens": 495707,
    "billing_tokens": 495707,
    "reasoning_tokens": 0,
    "tool_use_tokens": 0,
    "cache_read_tokens": 368349,
    "cache_write_tokens": 44875,
    "cache_hit_tokens": 0,
    "total_cache_tokens": 413224,
    "effective_input_tokens": 64056,
    "display_input_tokens": 477280,
    "usage_event_count": 14,
    "tool_calls": 18,
    "turn_count": 14,
    "self_check_attempted": true,
    "self_check_ran": true,
    "self_check_succeeded": true,
    "self_check_runs": 2,
    "self_check_failed_runs": 0,
    "self_check_successful_runs": 2,
    "self_correction_edits": 0,
    "self_corrected_after_checker": false,
    "self_correction_verified": false,
    "assistant_turns_trace": 14,
    "self_check_mode": "run-checker-cli",
    "self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-sonnet46-publication-final/code-review | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-sonnet46",
    "deterministic_failures": 0,
    "deterministic_warnings": 0,
    "vlm_failures": 0,
    "vlm_warnings": 0,
    "deterministic_failure_units": 0,
    "deterministic_warning_units": 0,
    "vlm_failure_units": 0,
    "vlm_warning_units": 0,
    "desktop_failures": 0,
    "desktop_warnings": 0,
    "mobile_failures": 0,
    "mobile_warnings": 0,
    "deep_failures": 0,
    "deep_warnings": 0,
    "mobile_deep_failures": 0,
    "mobile_deep_warnings": 0,
    "artifact_present": true,
    "artifact_score_100": 100.0,
    "task_score": 20.0,
    "task_score_max": 20,
    "quality_score": 100.0,
    "quality_cap_reason": "",
    "quality_class": "clean"
  },
  {
    "suite": "publish",
    "model": "sonnet46",
    "model_slug": "sonnet46",
    "source_kind": "clean-final",
    "label": "skill-with-shell-sonnet46-publication-final",
    "eval": "module-explainer",
    "artifact_path": "results/publish/models/sonnet46/artifacts/module-explainer.html",
    "screenshot_desktop_path": "results/publish/models/sonnet46/reports/screenshots/module-explainer-desktop.png",
    "screenshot_mobile_path": "results/publish/models/sonnet46/reports/screenshots/module-explainer-mobile.png",
    "screenshot_deep_path": "results/publish/models/sonnet46/reports/screenshots/module-explainer-deep.png",
    "screenshot_mobile_deep_path": "results/publish/models/sonnet46/reports/screenshots/module-explainer-mobile-deep.png",
    "artifact_bytes": 66525,
    "generation_ok": true,
    "generation_duration_s": 978.64,
    "input_tokens": 2649057,
    "output_tokens": 62243,
    "total_tokens": 2711300,
    "billing_tokens": 2711300,
    "reasoning_tokens": 0,
    "tool_use_tokens": 0,
    "cache_read_tokens": 2413844,
    "cache_write_tokens": 135163,
    "cache_hit_tokens": 0,
    "total_cache_tokens": 2549007,
    "effective_input_tokens": 100050,
    "display_input_tokens": 2649057,
    "usage_event_count": 34,
    "tool_calls": 38,
    "turn_count": 34,
    "self_check_attempted": true,
    "self_check_ran": true,
    "self_check_succeeded": true,
    "self_check_runs": 2,
    "self_check_failed_runs": 1,
    "self_check_successful_runs": 1,
    "self_correction_edits": 0,
    "self_corrected_after_checker": false,
    "self_correction_verified": true,
    "assistant_turns_trace": 34,
    "self_check_mode": "read-checker,run-checker-cli",
    "self_check_evidence": "read /home/shaun/source/birch-html/scripts/check_birch_renderings.py | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-sonnet46-publication-final/module-explainer. | ran checker CLI: cd /home/shaun/source/birch-html && uv run skill/scripts/finish_birch_html.py eval-runs/skill-with-shell-sonnet46-publication-final/module-explainer.html && uv run --with pillow py",
    "deterministic_failures": 0,
    "deterministic_warnings": 0,
    "vlm_failures": 0,
    "vlm_warnings": 0,
    "deterministic_failure_units": 0,
    "deterministic_warning_units": 0,
    "vlm_failure_units": 0,
    "vlm_warning_units": 0,
    "desktop_failures": 0,
    "desktop_warnings": 0,
    "mobile_failures": 0,
    "mobile_warnings": 0,
    "deep_failures": 0,
    "deep_warnings": 0,
    "mobile_deep_failures": 0,
    "mobile_deep_warnings": 0,
    "artifact_present": true,
    "artifact_score_100": 100.0,
    "task_score": 20.0,
    "task_score_max": 20,
    "quality_score": 100.0,
    "quality_cap_reason": "",
    "quality_class": "clean"
  },
  {
    "suite": "publish",
    "model": "sonnet46",
    "model_slug": "sonnet46",
    "source_kind": "clean-final",
    "label": "skill-with-shell-sonnet46-publication-final",
    "eval": "implementation-plan",
    "artifact_path": "results/publish/models/sonnet46/artifacts/implementation-plan.html",
    "screenshot_desktop_path": "results/publish/models/sonnet46/reports/screenshots/implementation-plan-desktop.png",
    "screenshot_mobile_path": "results/publish/models/sonnet46/reports/screenshots/implementation-plan-mobile.png",
    "screenshot_deep_path": "results/publish/models/sonnet46/reports/screenshots/implementation-plan-deep.png",
    "screenshot_mobile_deep_path": "results/publish/models/sonnet46/reports/screenshots/implementation-plan-mobile-deep.png",
    "artifact_bytes": 49926,
    "generation_ok": true,
    "generation_duration_s": 196.05,
    "input_tokens": 257093,
    "output_tokens": 12916,
    "total_tokens": 270009,
    "billing_tokens": 270009,
    "reasoning_tokens": 0,
    "tool_use_tokens": 0,
    "cache_read_tokens": 210864,
    "cache_write_tokens": 24527,
    "cache_hit_tokens": 0,
    "total_cache_tokens": 235391,
    "effective_input_tokens": 21702,
    "display_input_tokens": 257093,
    "usage_event_count": 14,
    "tool_calls": 15,
    "turn_count": 14,
    "self_check_attempted": true,
    "self_check_ran": true,
    "self_check_succeeded": true,
    "self_check_runs": 2,
    "self_check_failed_runs": 0,
    "self_check_successful_runs": 2,
    "self_correction_edits": 0,
    "self_corrected_after_checker": false,
    "self_correction_verified": false,
    "assistant_turns_trace": 14,
    "self_check_mode": "run-checker-cli",
    "self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-sonnet46-publication-final/implementat | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-sonnet46",
    "deterministic_failures": 0,
    "deterministic_warnings": 0,
    "vlm_failures": 0,
    "vlm_warnings": 0,
    "deterministic_failure_units": 0,
    "deterministic_warning_units": 0,
    "vlm_failure_units": 0,
    "vlm_warning_units": 0,
    "desktop_failures": 0,
    "desktop_warnings": 0,
    "mobile_failures": 0,
    "mobile_warnings": 0,
    "deep_failures": 0,
    "deep_warnings": 0,
    "mobile_deep_failures": 0,
    "mobile_deep_warnings": 0,
    "artifact_present": true,
    "artifact_score_100": 100.0,
    "task_score": 20.0,
    "task_score_max": 20,
    "quality_score": 100.0,
    "quality_cap_reason": "",
    "quality_class": "clean"
  },
  {
    "suite": "publish",
    "model": "sonnet46",
    "model_slug": "sonnet46",
    "source_kind": "clean-final",
    "label": "skill-with-shell-sonnet46-publication-final",
    "eval": "benchmark-comparison",
    "artifact_path": "results/publish/models/sonnet46/artifacts/benchmark-comparison.html",
    "screenshot_desktop_path": "results/publish/models/sonnet46/reports/screenshots/benchmark-comparison-desktop.png",
    "screenshot_mobile_path": "results/publish/models/sonnet46/reports/screenshots/benchmark-comparison-mobile.png",
    "screenshot_deep_path": "results/publish/models/sonnet46/reports/screenshots/benchmark-comparison-deep.png",
    "screenshot_mobile_deep_path": "results/publish/models/sonnet46/reports/screenshots/benchmark-comparison-mobile-deep.png",
    "artifact_bytes": 122208,
    "generation_ok": true,
    "generation_duration_s": 623.147,
    "input_tokens": 1192904,
    "output_tokens": 48270,
    "total_tokens": 1241174,
    "billing_tokens": 1241174,
    "reasoning_tokens": 0,
    "tool_use_tokens": 0,
    "cache_read_tokens": 987803,
    "cache_write_tokens": 129337,
    "cache_hit_tokens": 0,
    "total_cache_tokens": 1117140,
    "effective_input_tokens": 75764,
    "display_input_tokens": 1192904,
    "usage_event_count": 18,
    "tool_calls": 22,
    "turn_count": 18,
    "self_check_attempted": true,
    "self_check_ran": true,
    "self_check_succeeded": true,
    "self_check_runs": 3,
    "self_check_failed_runs": 0,
    "self_check_successful_runs": 3,
    "self_correction_edits": 0,
    "self_corrected_after_checker": false,
    "self_correction_verified": false,
    "assistant_turns_trace": 18,
    "self_check_mode": "run-checker-cli",
    "self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-sonnet46-publication-final/benchmark-c | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-sonnet46",
    "deterministic_failures": 0,
    "deterministic_warnings": 0,
    "vlm_failures": 0,
    "vlm_warnings": 0,
    "deterministic_failure_units": 0,
    "deterministic_warning_units": 0,
    "vlm_failure_units": 0,
    "vlm_warning_units": 0,
    "desktop_failures": 0,
    "desktop_warnings": 0,
    "mobile_failures": 0,
    "mobile_warnings": 0,
    "deep_failures": 0,
    "deep_warnings": 0,
    "mobile_deep_failures": 0,
    "mobile_deep_warnings": 0,
    "artifact_present": true,
    "artifact_score_100": 100.0,
    "task_score": 20.0,
    "task_score_max": 20,
    "quality_score": 100.0,
    "quality_cap_reason": "",
    "quality_class": "clean"
  }
]