{
"architectures": [
"CognicaPoEForCausalLM"
],
"model_type": "cognica_poe",
"auto_map": {
"AutoConfig": "configuration_cognica_poe.CognicaPoEConfig",
"AutoModelForCausalLM": "modeling_cognica_poe.CognicaPoEForCausalLM",
"AutoTokenizer": [
"tokenization_cognica_poe.CognicaPoETokenizer",
null
]
},
"hidden_size": 2048,
"intermediate_size": 12800,
"num_hidden_layers": 32,
"num_attention_heads": 16,
"num_key_value_heads": 8,
"head_dim": 128,
"max_position_embeddings": 2048,
"vocab_size": 32768,
"padded_vocab_size": 32768,
"hidden_act": "relu_squared",
"rms_norm_eps": 1e-06,
"rope_theta": 100000,
"tie_word_embeddings": false,
"window_pattern": "SSSL",
"use_cache": true,
"torch_dtype": "bfloat16",
"transformers_version": "4.45.0",
"bos_token_id": 32759,
"eos_token_id": 32759,
"pad_token_id": null,
"poe_mode": "flat",
"poe_every": 1,
"poe_alpha": 0.0,
"poe_head_count": 4,
"poe_stage_layers": [
16,
6,
5,
5
],
"per_stage_head": true,
"dual_head": false,
"poe_note": "Trained with poe_mode=flat, poe_alpha=0.0 (uniform stage average), per_stage_head=True, asymmetric poe_stage_layers=(16,6,5,5) -> 4 stages with boundaries at layers (15, 21, 26, 31). Each per-stage head composes additively with the shared lm_head: logits_k = lm_head(x_k) + lm_head_stages[k](x_k). PoE aggregation uses geometric mean of per-stage softmax distributions (alpha=0). See README and the companion paper.",
"training": {
"total_batch_size": 786432,
"sequence_len": 2048,
"num_iterations": 83923,
"target_tokens": 65990000000,
"optimizer": "DistMuonAdamW",
"embedding_lr": 0.3,
"unembedding_lr": 0.008,
"matrix_lr": 0.015,
"weight_decay": 0.28,
"warmdown_ratio": 0.65,
"warmup_steps": 1000,
"case_aug_prob": 0.15,
"chinchilla_ratio_total": 21.85,
"dataset": "frontier_v1 mix (63.07B tokens, 848 sharded parquets): FineWeb-Edu 33.5%, DCLM-Baseline 24.1%, Stack v2 (codeparrot mirror) 15.7%, Wikipedia 5.2%, CulturaX (ko/zh/ja/es/fr) 5.2%, ProofPile-2 4.2%, OpenWebMath 4.2%, Gutenberg 4.2%, PG-19 2.1%, UltraChat 1.0%, OpenHermes-2.5 0.6%"
},
"training_step": 106000,
"training_val_bpb": 0.8717988846209657,
"poe_wand_p99_bounds_per_stage_head": [
3.8243,
1.9192,
1.3995
]
}