checkpoints:
  checkpoint_interval: 500
  checkpoints_path: /scratch/joel_niklaus/tmp/finephrase/checkpoints/mix-fw_edu_hq-article_granite3_1b_hq
  checkpoints_path_is_shared_file_system: false
  load_lr_scheduler: true
  load_optimizer: true
  resume_checkpoint_path: null
  save_final_state: true
  save_initial_state: false
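# Note: with checkpoint_interval: 500 and train_steps: 10000 (tokens section
# below), this run writes an intermediate checkpoint every 500 steps, i.e. 20
# over the full run, plus the final state; the initial state is not saved.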
data_stages:
- data:
    dataset:
      dataset_folder:
      - /scratch/joel_niklaus/tmp/finephrase/dataset/fw_edu_hq/
      - /scratch/joel_niklaus/tmp/finephrase/dataset/article_granite3_1b_hq/
      dataset_max_tokens: null
      dataset_read_path: null
      dataset_weights:
      - 0.5
      - 0.5
      pad_samples_to_global_batch_size: false
      return_positions: true
      shuffle_files: false
      skip_in_stream: false
      token_size_in_bytes: 4
      tokenizer_name: hynky/Llama-3.2-1B-no-bos
      use_old_brrr_dataloader: false
      vocab_size: 128256
    num_loading_workers: 0
    seed: 6
  name: stable
  start_training_step: 1
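# Note: the two tokenized dataset folders are mixed 50/50 via dataset_weights.
# token_size_in_bytes: 4 presumably follows from vocab_size: 128256 exceeding
# the 65,536 ids representable in 2 bytes, so each token id is stored as a
# 4-byte integer.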
general:
  benchmark_csv_path: null
  consumed_train_samples: 3840000
  ignore_sanity_checks: true
  project: finephrase
  run: mix-fw_edu_hq-article_granite3_1b_hq
  seed: 6
  step: 7500
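# Consistency check: consumed_train_samples / step = 3,840,000 / 7,500 = 512
# samples per step, matching dp * micro_batch_size * batch_accumulation_per_replica
# = 64 * 2 * 4 = 512 from the parallelism and tokens sections below.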
lighteval:
  batch_size: 8
  eval_interval: 500
  eval_interval_file: null
  local_checkpoint_dir: /scratch/joel_niklaus/tmp/finephrase/evals-ckpt
  logs_path: /fsx/joel_niklaus/logs/finephrase/experiments/evals
  nanotron_path: /fsx/joel_niklaus/projects/finephrase/nanotron
  output_dir: s3://finephrase/experiments/evals-test
  parallelism:
    context_parallel_size: 1
    dp: 1
    expert_parallel_size: 1
    moe_layer_recompute: false
    pp: 1
    pp_engine: afab
    recompute_layer: false
    tp: 1
    tp_linear_async_communication: true
    tp_mode: ALL_REDUCE
    tp_recompute_allgather: true
  s3_save_path: null
  slurm:
    cpus_per_task: 88
    gpus_per_node: 8
    hf_cache: /fsx/joel_niklaus/.cache/huggingface
    partition: hopper-prod
    qos: normal
    reservation: null
    time: '1:00:00'
  tasks:
    custom_tasks: /fsx/joel_niklaus/projects/finephrase/task_list.py
    dataset_loading_processes: 8
    max_samples: 1000
    multichoice_continuations_start_space: null
    no_multichoice_continuations_start_space: null
    num_fewshot_seeds: null
    tasks: /fsx/joel_niklaus/projects/finephrase/tasks.txt
  upload_to_wandb: false
  wandb: null
  wandb_entity: null
  wandb_project: null
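# Note: evaluations run every eval_interval: 500 training steps on a single
# rank (dp = tp = pp = 1 in the parallelism block above), capped at
# max_samples: 1000 examples per task; upload of results to Weights & Biases
# is disabled.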
logging:
  iteration_step_info_interval: 5
  log_level: info
  log_level_replica: info
metrics_logging:
  log_detail_interval: 200
  log_level: 1
model:
  ddp_bucket_cap_mb: 50
  dtype: bfloat16
  init_method:
    scaling_method: NUM_LAYERS
    std: 0.02
  make_vocab_size_divisible_by: 1
  model_config:
    _attn_implementation: flash_attention_2
    _fused_rms_norm: true
    _fused_rotary_emb: true
    _use_doc_masking: true
    _use_qkv_packed: true
    attention_bias: false
    bos_token_id: 128000
    eos_token_id: 128001
    flex_attention_mask: null
    hidden_act: silu
    hidden_size: 2048
    initializer_range: 0.02
    intermediate_size: 6144
    is_qwen2_config: true
    max_position_embeddings: 4096
    moe_config: null
    no_rope_layer: null
    num_attention_heads: 16
    num_hidden_layers: 28
    num_key_value_heads: 8
    pad_token_id: null
    pretraining_tp: 1
    rms_norm_eps: 1.0e-06
    rope_interleaved: false
    rope_scaling: null
    rope_theta: 10000
    sliding_window_size: null
    tie_word_embeddings: true
    use_cache: true
    vocab_size: 128256
    z_loss_coefficient: 1.0e-05
    z_loss_enabled: false
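# Architecture notes (derived from the values above, as a rough back-of-envelope):
# head_dim = hidden_size / num_attention_heads = 2048 / 16 = 128, and with
# num_key_value_heads: 8 this is grouped-query attention (2 query heads per KV
# head). Parameter count estimate: tied embeddings 128256 * 2048 = ~0.26B, plus
# 28 layers of ~50.3M each (attention ~12.6M, SwiGLU MLP 3 * 2048 * 6144 =
# ~37.7M), i.e. roughly 1.7B parameters in total. z_loss_coefficient is set but
# z_loss_enabled is false, so no z-loss is applied.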
optimizer:
  accumulate_grad_in_fp32: true
  clip_grad: 1.0
  learning_rate_scheduler:
    learning_rate: 0.0005
    lr_decay_starting_step: 9000
    lr_decay_steps: 1000
    lr_decay_style: linear
    lr_warmup_steps: 100
    lr_warmup_style: linear
    min_decay_lr: 5.0e-05
  optimizer_factory:
    adam_beta1: 0.9
    adam_beta2: 0.95
    adam_eps: 1.0e-08
    name: adamW
    torch_adam_is_fused: true
  weight_decay: 0.1
  weight_decay_exclude_named_params:
  - .*token_embedding.*
  zero_stage: 0
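# LR schedule implied by the values above: linear warmup (presumably from 0) to
# 5.0e-4 over the first 100 steps, constant at 5.0e-4 until step 9000, then a
# linear decay over 1000 steps down to min_decay_lr = 5.0e-5, ending exactly at
# train_steps = 10000. zero_stage: 0 means each data-parallel rank holds a full
# copy of the AdamW states, and weight decay is skipped for parameters matching
# the .*token_embedding.* regex.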
parallelism:
  context_parallel_size: 1
  dp: 64
  expert_parallel_size: 1
  moe_layer_recompute: false
  pp: 1
  pp_engine: 1f1b
  recompute_layer: false
  tp: 1
  tp_linear_async_communication: true
  tp_mode: REDUCE_SCATTER
  tp_recompute_allgather: true
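# Note: with dp: 64 and tp = pp = 1, training spans 64 GPUs in total,
# presumably 8 nodes of 8 GPUs each given gpus_per_node: 8 in the slurm block
# of the lighteval section.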
profiler: null
s3_upload:
  remove_after_upload: true
  s5cmd_concurrency: 10
  s5cmd_numworkers: 32
  s5cmd_path: /fsx/joel_niklaus/projects/finephrase/.venv/bin/s5cmd
  upload_s3_path: s3://finephrase/experiments/checkpoints/mix-fw_edu_hq-article_granite3_1b_hq
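# Note: checkpoints are mirrored to the S3 path above via s5cmd; since
# remove_after_upload is true, the local copy under /scratch is deleted after
# upload, keeping scratch usage bounded.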
tokenizer:
  tokenizer_max_length: 4096
  tokenizer_name_or_path: hynky/Llama-3.2-1B-no-bos
  tokenizer_revision: null
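# Note: tokenizer_max_length matches sequence_length and
# max_position_embeddings (all 4096).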
tokens:
  batch_accumulation_per_replica: 4
  limit_test_batches: 0
  limit_val_batches: 0
  micro_batch_size: 2
  sequence_length: 4096
  train_steps: 10000
  val_check_interval: 0
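# Back-of-envelope token budget: 64 dp ranks * micro_batch_size 2 *
# batch_accumulation_per_replica 4 = 512 sequences per step; at
# sequence_length 4096 that is ~2.1M tokens per step, or ~21B tokens over the
# full 10,000 train_steps.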
