{ "architectures": [ "CognicaPoEForCausalLM" ], "model_type": "cognica_poe", "auto_map": { "AutoConfig": "configuration_cognica_poe.CognicaPoEConfig", "AutoModelForCausalLM": "modeling_cognica_poe.CognicaPoEForCausalLM", "AutoTokenizer": [ "tokenization_cognica_poe.CognicaPoETokenizer", null ] }, "hidden_size": 2048, "intermediate_size": 12800, "num_hidden_layers": 32, "num_attention_heads": 16, "num_key_value_heads": 8, "head_dim": 128, "max_position_embeddings": 2048, "vocab_size": 32768, "padded_vocab_size": 32768, "hidden_act": "relu_squared", "rms_norm_eps": 1e-06, "rope_theta": 100000, "tie_word_embeddings": false, "window_pattern": "SSSL", "use_cache": true, "torch_dtype": "bfloat16", "transformers_version": "4.45.0", "bos_token_id": 32759, "eos_token_id": 32759, "pad_token_id": null, "poe_mode": "flat", "poe_every": 1, "poe_alpha": 0.0, "poe_head_count": 4, "poe_stage_layers": [ 16, 6, 5, 5 ], "per_stage_head": true, "dual_head": false, "poe_note": "Trained with poe_mode=flat, poe_alpha=0.0 (uniform stage average), per_stage_head=True, asymmetric poe_stage_layers=(16,6,5,5) -> 4 stages with boundaries at layers (15, 21, 26, 31). Each per-stage head composes additively with the shared lm_head: logits_k = lm_head(x_k) + lm_head_stages[k](x_k). PoE aggregation uses geometric mean of per-stage softmax distributions (alpha=0). See README and the companion paper.", "training": { "total_batch_size": 786432, "sequence_len": 2048, "num_iterations": 83923, "target_tokens": 65990000000, "optimizer": "DistMuonAdamW", "embedding_lr": 0.3, "unembedding_lr": 0.008, "matrix_lr": 0.015, "weight_decay": 0.28, "warmdown_ratio": 0.65, "warmup_steps": 1000, "case_aug_prob": 0.15, "chinchilla_ratio_total": 21.85, "dataset": "frontier_v1 mix (63.07B tokens, 848 sharded parquets): FineWeb-Edu 33.5%, DCLM-Baseline 24.1%, Stack v2 (codeparrot mirror) 15.7%, Wikipedia 5.2%, CulturaX (ko/zh/ja/es/fr) 5.2%, ProofPile-2 4.2%, OpenWebMath 4.2%, Gutenberg 4.2%, PG-19 2.1%, UltraChat 1.0%, OpenHermes-2.5 0.6%" }, "training_step": 108000, "training_val_bpb": 0.863411181532091, "poe_wand_p99_bounds_per_stage_head": [] }