diff --git "a/debug.log" "b/debug.log" new file mode 100644--- /dev/null +++ "b/debug.log" @@ -0,0 +1,5594 @@ +[2025-11-16 21:41:52,880] [DEBUG] [axolotl.utils.config.resolve_dtype:66] [PID:7990] bf16 support detected, enabling for this configuration. +[2025-11-16 21:41:53,127] [DEBUG] [axolotl.utils.config.log_gpu_memory_usage:127] [PID:7990] baseline 0.000GB () +[2025-11-16 21:41:53,128] [INFO] [axolotl.cli.config.load_cfg:248] [PID:7990] config: +{ + "accelerator_config": { + "dispatch_batches": false, + "split_batches": false + }, + "activation_offloading": false, + "adam_beta1": 0.9, + "adam_beta2": 0.98, + "adam_epsilon": 1e-06, + "axolotl_config_path": "embeddings-12b.yaml", + "base_model": "Goader/gemma-3-12b-pt-focus", + "base_model_config": "Goader/gemma-3-12b-pt-focus", + "batch_size": 8, + "bf16": true, + "capabilities": { + "bf16": true, + "compute_capability": "sm_90", + "fp8": false, + "n_gpu": 1, + "n_node": 1 + }, + "context_parallel_size": 1, + "dataloader_num_workers": 8, + "dataloader_prefetch_factor": 9, + "dataset_num_proc": 64, + "dataset_prepared_path": "last_run_prepared_embeddings", + "ddp": false, + "ddp_find_unused_parameters": false, + "ddp_timeout": 7200, + "device": "cuda:0", + "dion_rank_fraction": 1.0, + "dion_rank_multiple_of": 1, + "env_capabilities": { + "torch_version": "2.8.0" + }, + "eval_batch_size": 4, + "eval_causal_lm_metrics": [ + "sacrebleu", + "comet", + "ter", + "chrf" + ], + "eval_max_new_tokens": 128, + "eval_sample_packing": true, + "eval_table_size": 0, + "experimental_skip_move_to_device": true, + "flash_attention": true, + "fp16": false, + "gradient_accumulation_steps": 2, + "gradient_checkpointing": false, + "include_tkps": true, + "is_multimodal": true, + "learning_rate": 5e-05, + "liger_fused_linear_cross_entropy": true, + "liger_glu_activation": true, + "liger_layer_norm": true, + "liger_rms_norm": true, + "liger_rope": true, + "lisa_layers_attribute": "model.layers", + "load_best_model_at_end": false, + "load_in_4bit": false, + "load_in_8bit": false, + "local_rank": 0, + "logging_steps": 10, + "lora_dropout": 0.0, + "loraplus_lr_embedding": 1e-06, + "lr_scheduler": "warmup_stable_decay", + "lr_scheduler_kwargs": { + "min_lr_ratio": 0.05, + "num_decay_steps": 10000 + }, + "max_grad_norm": 1.0, + "max_steps": 15000, + "mean_resizing_embeddings": false, + "micro_batch_size": 4, + "model_config_type": "gemma3", + "num_epochs": 1.0, + "optimizer": "adamw_torch_fused", + "otel_metrics_host": "localhost", + "otel_metrics_port": 8000, + "output_dir": "./outputs/gemma-3-12b-focus-pt", + "pad_to_sequence_len": true, + "plugins": [ + "axolotl.integrations.liger.LigerPlugin" + ], + "pretrain_multipack_attn": true, + "pretraining_dataset": [ + { + "message_property_mappings": { + "content": "content", + "role": "role" + }, + "path": "Goader/kobza-2m-jsonl", + "trust_remote_code": false, + "type": "pretrain" + } + ], + "processor_config": "Goader/gemma-3-12b-pt-focus", + "profiler_steps_start": 0, + "qlora_sharded_model_loading": false, + "ray_num_workers": 1, + "resources_per_worker": { + "GPU": 1 + }, + "sample_packing": true, + "sample_packing_bin_size": 200, + "sample_packing_group_size": 100000, + "save_only_model": false, + "save_safetensors": true, + "save_steps": 5000, + "save_total_limit": 30, + "sequence_len": 1024, + "shuffle_before_merging_datasets": false, + "shuffle_merged_datasets": false, + "skip_prepare_dataset": false, + "streaming_multipack_buffer_size": 10000, + "strict": false, + "tensor_parallel_size": 1, + "tiled_mlp_use_original_mlp": true, + "tokenizer_config": "lapa-llm/tokenizer", + "tokenizer_save_jinja_files": true, + "torch_dtype": "torch.bfloat16", + "train_on_inputs": true, + "trl": { + "log_completions": false, + "mask_truncated_completions": false, + "ref_model_mixup_alpha": 0.9, + "ref_model_sync_steps": 64, + "scale_rewards": true, + "sync_ref_model": false, + "use_vllm": false, + "vllm_server_host": "0.0.0.0", + "vllm_server_port": 8000 + }, + "unfrozen_parameters": [ + "^lm_head.weight$", + "^model.language_model.embed_tokens.weight$" + ], + "use_otel_metrics": false, + "use_ray": false, + "use_wandb": true, + "val_set_size": 0.0, + "vllm": { + "device": "auto", + "dtype": "auto", + "gpu_memory_utilization": 0.9, + "host": "0.0.0.0", + "port": 8000 + }, + "wandb_project": "matt", + "warmup_ratio": 0.1, + "weight_decay": 0.01, + "world_size": 1 +} +[2025-11-16 21:41:53,556] [DEBUG] [axolotl.loaders.utils.check_model_config:83] [PID:7990] Loaded image size: 896 from model config +[2025-11-16 21:41:54,991] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:278] [PID:7990] EOS: 1 / +[2025-11-16 21:41:54,991] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:279] [PID:7990] BOS: 2 / +[2025-11-16 21:41:54,991] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:280] [PID:7990] PAD: 0 / +[2025-11-16 21:41:54,991] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:281] [PID:7990] UNK: 3 / +[2025-11-16 21:41:58,518] [DEBUG] [axolotl.utils.data.streaming.wrap_streaming_dataset:231] [PID:7990] NOT shuffling merged pretraining datasets +[2025-11-16 21:41:58,519] [DEBUG] [axolotl.train.setup_model_and_tokenizer:65] [PID:7990] Loading tokenizer... lapa-llm/tokenizer +[2025-11-16 21:42:00,142] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:278] [PID:7990] EOS: 1 / +[2025-11-16 21:42:00,142] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:279] [PID:7990] BOS: 2 / +[2025-11-16 21:42:00,142] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:280] [PID:7990] PAD: 0 / +[2025-11-16 21:42:00,142] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:281] [PID:7990] UNK: 3 / +[2025-11-16 21:42:07,755] [DEBUG] [axolotl.train.setup_model_and_tokenizer:74] [PID:7990] Loading model +[2025-11-16 21:42:07,984] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_evaluation_loop:87] [PID:7990] Patched Trainer.evaluation_loop with nanmean loss calculation +[2025-11-16 21:42:07,985] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_maybe_log_save_evaluate:138] [PID:7990] Patched Trainer._maybe_log_save_evaluate with nanmean loss calculation +[2025-11-16 21:42:07,985] [INFO] [axolotl.loaders.patch_manager._apply_multipack_patches:301] [PID:7990] Applying multipack dataloader patch for sample packing... +[2025-11-16 21:42:08,002] [INFO] [axolotl.integrations.liger.plugin.pre_model_load:98] [PID:7990] Applying LIGER to gemma3 with kwargs: {'rope': True, 'cross_entropy': None, 'fused_linear_cross_entropy': True, 'rms_norm': True, 'layer_norm': True, 'geglu': True} + Loading checkpoint shards: 0%| | 0/11 [00:00