{
  "architectures": [
    "CognicaPoEForCausalLM"
  ],
  "model_type": "cognica_poe",
  "auto_map": {
    "AutoConfig": "configuration_cognica_poe.CognicaPoEConfig",
    "AutoModelForCausalLM": "modeling_cognica_poe.CognicaPoEForCausalLM",
    "AutoTokenizer": [
      "tokenization_cognica_poe.CognicaPoETokenizer",
      null
    ]
  },
  "hidden_size": 2048,
  "intermediate_size": 12800,
  "num_hidden_layers": 32,
  "num_attention_heads": 16,
  "num_key_value_heads": 8,
  "head_dim": 128,
  "max_position_embeddings": 2048,
  "vocab_size": 32768,
  "padded_vocab_size": 32768,
  "hidden_act": "relu_squared",
  "rms_norm_eps": 1e-06,
  "rope_theta": 100000,
  "tie_word_embeddings": false,
  "window_pattern": "SSSL",
  "use_cache": true,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.45.0",
  "bos_token_id": 32759,
  "eos_token_id": 32759,
  "pad_token_id": null,
  "poe_mode": "flat",
  "poe_every": 1,
  "poe_alpha": 0.0,
  "poe_head_count": 4,
  "poe_stage_layers": [
    16,
    6,
    5,
    5
  ],
  "per_stage_head": true,
  "dual_head": false,
  "poe_note": "Trained with poe_mode=flat, poe_alpha=0.0 (uniform stage average), per_stage_head=True, asymmetric poe_stage_layers=(16,6,5,5) -> 4 stages with boundaries at layers (15, 21, 26, 31). Each per-stage head composes additively with the shared lm_head: logits_k = lm_head(x_k) + lm_head_stages[k](x_k). PoE aggregation uses geometric mean of per-stage softmax distributions (alpha=0). See README and the companion paper.",
  "training": {
    "total_batch_size": 786432,
    "sequence_len": 2048,
    "num_iterations": 83923,
    "target_tokens": 65990000000,
    "optimizer": "DistMuonAdamW",
    "embedding_lr": 0.3,
    "unembedding_lr": 0.008,
    "matrix_lr": 0.015,
    "weight_decay": 0.28,
    "warmdown_ratio": 0.65,
    "warmup_steps": 1000,
    "case_aug_prob": 0.15,
    "chinchilla_ratio_total": 21.85,
    "dataset": "frontier_v1 mix (63.07B tokens, 848 sharded parquets): FineWeb-Edu 33.5%, DCLM-Baseline 24.1%, Stack v2 (codeparrot mirror) 15.7%, Wikipedia 5.2%, CulturaX (ko/zh/ja/es/fr) 5.2%, ProofPile-2 4.2%, OpenWebMath 4.2%, Gutenberg 4.2%, PG-19 2.1%, UltraChat 1.0%, OpenHermes-2.5 0.6%"
  },
  "training_step": 106000,
  "training_val_bpb": 0.8717988846209657,
  "poe_wand_p99_bounds_per_stage_head": [
    3.8243,
    1.9192,
    1.3995
  ]
}
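The `poe_note` field describes how per-stage logits are formed and aggregated. Below is a minimal sketch of that aggregation, assuming the hidden states `x_k` are taken at the stage boundary layers (15, 21, 26, 31) and that `lm_head` / `lm_head_stages` are the shared and per-stage vocabulary projections named in the note; the hypothetical `poe_logits` helper is for illustration only, and the actual implementation lives in `modeling_cognica_poe.py`.

```python
import torch
import torch.nn.functional as F


def poe_logits(stage_hiddens, lm_head, lm_head_stages):
    """Sketch of the PoE head described in poe_note (hypothetical signature).

    stage_hiddens  : list of 4 tensors (batch, seq, hidden_size), one per
                     stage boundary layer (15, 21, 26, 31).
    lm_head        : shared vocabulary projection module.
    lm_head_stages : list of per-stage projections (per_stage_head = true).
    """
    # Each stage composes additively with the shared head:
    #   logits_k = lm_head(x_k) + lm_head_stages[k](x_k)
    stage_logits = [
        lm_head(x) + head(x) for x, head in zip(stage_hiddens, lm_head_stages)
    ]

    # poe_mode = "flat", poe_alpha = 0.0: a uniform geometric mean of the
    # per-stage softmax distributions, i.e. an unweighted average of
    # log-probabilities followed by renormalization over the vocabulary.
    log_probs = torch.stack([F.log_softmax(l, dim=-1) for l in stage_logits])
    mixed = log_probs.mean(dim=0)
    return mixed - torch.logsumexp(mixed, dim=-1, keepdim=True)
```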
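Because `auto_map` registers the custom config, model, and tokenizer classes shipped alongside this file (`configuration_cognica_poe.py`, `modeling_cognica_poe.py`, `tokenization_cognica_poe.py`), the checkpoint loads through the standard Auto classes with `trust_remote_code=True`. A minimal sketch, with a placeholder repo id:

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Placeholder: substitute the actual local path or Hub repo id of this checkpoint.
model_id = "path/or/repo-id-of-this-checkpoint"

tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,  # matches "torch_dtype" in this config
    trust_remote_code=True,
)
```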