{
  "architectures": [
    "CognicaPoEForCausalLM"
  ],
  "model_type": "cognica_poe",
  "auto_map": {
    "AutoConfig": "configuration_cognica_poe.CognicaPoEConfig",
    "AutoModelForCausalLM": "modeling_cognica_poe.CognicaPoEForCausalLM",
    "AutoTokenizer": [
      "tokenization_cognica_poe.CognicaPoETokenizer",
      null
    ]
  },
  "hidden_size": 2048,
  "intermediate_size": 12800,
  "num_hidden_layers": 32,
  "num_attention_heads": 16,
  "num_key_value_heads": 8,
  "head_dim": 128,
  "max_position_embeddings": 2048,
  "vocab_size": 32768,
  "padded_vocab_size": 32768,
  "hidden_act": "relu_squared",
  "rms_norm_eps": 1e-06,
  "rope_theta": 100000,
  "tie_word_embeddings": false,
  "window_pattern": "SSSL",
  "use_cache": true,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.45.0",
  "bos_token_id": 32759,
  "eos_token_id": 32759,
  "pad_token_id": null,
  "poe_mode": "flat",
  "poe_every": 1,
  "poe_alpha": 0.0,
  "poe_head_count": 4,
  "poe_stage_layers": [
    16,
    6,
    5,
    5
  ],
  "per_stage_head": true,
  "dual_head": false,
  "poe_note": "Trained with poe_mode=flat, poe_alpha=0.0 (uniform stage average), per_stage_head=True, asymmetric poe_stage_layers=(16,6,5,5) -> 4 stages with boundaries at layers (15, 21, 26, 31). Each per-stage head composes additively with the shared lm_head: logits_k = lm_head(x_k) + lm_head_stages[k](x_k). PoE aggregation uses geometric mean of per-stage softmax distributions (alpha=0). See README and the companion paper.",
  "training": {
    "total_batch_size": 786432,
    "sequence_len": 2048,
    "num_iterations": 83923,
    "target_tokens": 65990000000,
    "optimizer": "DistMuonAdamW",
    "embedding_lr": 0.3,
    "unembedding_lr": 0.008,
    "matrix_lr": 0.015,
    "weight_decay": 0.28,
    "warmdown_ratio": 0.65,
    "warmup_steps": 1000,
    "case_aug_prob": 0.15,
    "chinchilla_ratio_total": 21.85,
    "dataset": "frontier_v1 mix (63.07B tokens, 848 sharded parquets): FineWeb-Edu 33.5%, DCLM-Baseline 24.1%, Stack v2 (codeparrot mirror) 15.7%, Wikipedia 5.2%, CulturaX (ko/zh/ja/es/fr) 5.2%, ProofPile-2 4.2%, OpenWebMath 4.2%, Gutenberg 4.2%, PG-19 2.1%, UltraChat 1.0%, OpenHermes-2.5 0.6%"
  },
  "training_step": 106000,
  "training_val_bpb": 0.8717988846209657,
  "poe_wand_p99_bounds_per_stage_head": [
    3.8243,
    1.9192,
    1.3995
  ]
}
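The `poe_note` field describes how per-stage logits are formed and aggregated. Below is a minimal sketch of that aggregation, assuming the hidden states `x_k` are taken at the stage boundary layers (15, 21, 26, 31) and that `lm_head` / `lm_head_stages` are the shared and per-stage vocabulary projections named in the note; the hypothetical `poe_logits` helper is for illustration only, and the actual implementation lives in `modeling_cognica_poe.py`.

```python
import torch
import torch.nn.functional as F


def poe_logits(stage_hiddens, lm_head, lm_head_stages):
    """Sketch of the PoE head described in poe_note (hypothetical signature).

    stage_hiddens  : list of 4 tensors (batch, seq, hidden_size), one per
                     stage boundary layer (15, 21, 26, 31).
    lm_head        : shared vocabulary projection module.
    lm_head_stages : list of per-stage projections (per_stage_head = true).
    """
    # Each stage composes additively with the shared head:
    #   logits_k = lm_head(x_k) + lm_head_stages[k](x_k)
    stage_logits = [
        lm_head(x) + head(x) for x, head in zip(stage_hiddens, lm_head_stages)
    ]

    # poe_mode = "flat", poe_alpha = 0.0: a uniform geometric mean of the
    # per-stage softmax distributions, i.e. an unweighted average of
    # log-probabilities followed by renormalization over the vocabulary.
    log_probs = torch.stack([F.log_softmax(l, dim=-1) for l in stage_logits])
    mixed = log_probs.mean(dim=0)
    return mixed - torch.logsumexp(mixed, dim=-1, keepdim=True)
```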
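Because `auto_map` registers the custom config, model, and tokenizer classes shipped alongside this file (`configuration_cognica_poe.py`, `modeling_cognica_poe.py`, `tokenization_cognica_poe.py`), the checkpoint loads through the standard Auto classes with `trust_remote_code=True`. A minimal sketch, with a placeholder repo id:

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Placeholder: substitute the actual local path or Hub repo id of this checkpoint.
model_id = "path/or/repo-id-of-this-checkpoint"

tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,  # matches "torch_dtype" in this config
    trust_remote_code=True,
)
```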