{ "architectures": [ "CognicaPoEForCausalLM" ], "model_type": "cognica_poe", "auto_map": { "AutoConfig": "configuration_cognica_poe.CognicaPoEConfig", "AutoModelForCausalLM": "modeling_cognica_poe.CognicaPoEForCausalLM", "AutoTokenizer": [ "tokenization_cognica_poe.CognicaPoETokenizer", null ] }, "hidden_size": 2048, "intermediate_size": 12800, "num_hidden_layers": 32, "num_attention_heads": 16, "num_key_value_heads": 8, "head_dim": 128, "max_position_embeddings": 2048, "vocab_size": 32768, "padded_vocab_size": 32768, "hidden_act": "relu_squared", "rms_norm_eps": 1e-06, "rope_theta": 100000, "tie_word_embeddings": false, "window_pattern": "SSSL", "use_cache": true, "torch_dtype": "bfloat16", "transformers_version": "4.45.0", "bos_token_id": 32759, "eos_token_id": 32759, "pad_token_id": null, "poe_mode": "flat", "poe_every": 1, "poe_alpha": 0.0, "poe_head_count": 4, "poe_stage_layers": [ 16, 6, 5, 5 ], "per_stage_head": true, "dual_head": false, "poe_note": "Trained with poe_mode=flat, poe_alpha=0.0 (uniform stage average), per_stage_head=True, asymmetric poe_stage_layers=(16,6,5,5) -> 4 stages with boundaries at layers (15, 21, 26, 31). Each per-stage head composes additively with the shared lm_head: logits_k = lm_head(x_k) + lm_head_stages[k](x_k). PoE aggregation uses geometric mean of per-stage softmax distributions (alpha=0). See README and the companion paper.", "training": { "total_batch_size": 786432, "sequence_len": 2048, "num_iterations": 83923, "target_tokens": 65990000000, "optimizer": "DistMuonAdamW", "embedding_lr": 0.3, "unembedding_lr": 0.008, "matrix_lr": 0.015, "weight_decay": 0.28, "warmdown_ratio": 0.65, "warmup_steps": 1000, "case_aug_prob": 0.15, "chinchilla_ratio_total": 21.85, "dataset": "frontier_v1 mix (63.07B tokens, 848 sharded parquets): FineWeb-Edu 33.5%, DCLM-Baseline 24.1%, Stack v2 (codeparrot mirror) 15.7%, Wikipedia 5.2%, CulturaX (ko/zh/ja/es/fr) 5.2%, ProofPile-2 4.2%, OpenWebMath 4.2%, Gutenberg 4.2%, PG-19 2.1%, UltraChat 1.0%, OpenHermes-2.5 0.6%" }, "training_step": 108000, "training_val_bpb": 0.863411181532091, "poe_wand_p99_bounds_per_stage_head": [] }