checkpoints:
  checkpoint_interval: 500
  checkpoints_path: /scratch/joel_niklaus/tmp/finephrase/checkpoints/mix-fw_edu_hq-article_granite3_1b_hq
  checkpoints_path_is_shared_file_system: false
  load_lr_scheduler: true
  load_optimizer: true
  resume_checkpoint_path: null
  save_final_state: true
  save_initial_state: false
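# Note: with checkpoint_interval: 500 and train_steps: 10000 (tokens section
# below), this run writes an intermediate checkpoint every 500 steps, i.e. 20
# over the full run, plus the final state; the initial state is not saved.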
data_stages:
- data:
    dataset:
      dataset_folder:
      - /scratch/joel_niklaus/tmp/finephrase/dataset/fw_edu_hq/
      - /scratch/joel_niklaus/tmp/finephrase/dataset/article_granite3_1b_hq/
      dataset_max_tokens: null
      dataset_read_path: null
      dataset_weights:
      - 0.5
      - 0.5
      pad_samples_to_global_batch_size: false
      return_positions: true
      shuffle_files: false
      skip_in_stream: false
      token_size_in_bytes: 4
      tokenizer_name: hynky/Llama-3.2-1B-no-bos
      use_old_brrr_dataloader: false
      vocab_size: 128256
    num_loading_workers: 0
    seed: 6
  name: stable
  start_training_step: 1
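# Note: the two tokenized dataset folders are mixed 50/50 via dataset_weights.
# token_size_in_bytes: 4 presumably follows from vocab_size: 128256 exceeding
# the 65,536 ids representable in 2 bytes, so each token id is stored as a
# 4-byte integer.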
general:
  benchmark_csv_path: null
  consumed_train_samples: 3840000
  ignore_sanity_checks: true
  project: finephrase
  run: mix-fw_edu_hq-article_granite3_1b_hq
  seed: 6
  step: 7500
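# Consistency check: consumed_train_samples / step = 3,840,000 / 7,500 = 512
# samples per step, matching dp * micro_batch_size * batch_accumulation_per_replica
# = 64 * 2 * 4 = 512 from the parallelism and tokens sections below.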
lighteval:
  batch_size: 8
  eval_interval: 500
  eval_interval_file: null
  local_checkpoint_dir: /scratch/joel_niklaus/tmp/finephrase/evals-ckpt
  logs_path: /fsx/joel_niklaus/logs/finephrase/experiments/evals
  nanotron_path: /fsx/joel_niklaus/projects/finephrase/nanotron
  output_dir: s3://finephrase/experiments/evals-test
  parallelism:
    context_parallel_size: 1
    dp: 1
    expert_parallel_size: 1
    moe_layer_recompute: false
    pp: 1
    pp_engine: afab
    recompute_layer: false
    tp: 1
    tp_linear_async_communication: true
    tp_mode: ALL_REDUCE
    tp_recompute_allgather: true
  s3_save_path: null
  slurm:
    cpus_per_task: 88
    gpus_per_node: 8
    hf_cache: /fsx/joel_niklaus/.cache/huggingface
    partition: hopper-prod
    qos: normal
    reservation: null
    time: '1:00:00'
  tasks:
    custom_tasks: /fsx/joel_niklaus/projects/finephrase/task_list.py
    dataset_loading_processes: 8
    max_samples: 1000
    multichoice_continuations_start_space: null
    no_multichoice_continuations_start_space: null
    num_fewshot_seeds: null
    tasks: /fsx/joel_niklaus/projects/finephrase/tasks.txt
  upload_to_wandb: false
  wandb: null
  wandb_entity: null
  wandb_project: null
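# Note: evaluations run every eval_interval: 500 training steps on a single
# rank (dp = tp = pp = 1 in the parallelism block above), capped at
# max_samples: 1000 examples per task; upload of results to Weights & Biases
# is disabled.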
logging:
  iteration_step_info_interval: 5
  log_level: info
  log_level_replica: info
metrics_logging:
  log_detail_interval: 200
  log_level: 1
model:
  ddp_bucket_cap_mb: 50
  dtype: bfloat16
  init_method:
    scaling_method: NUM_LAYERS
    std: 0.02
  make_vocab_size_divisible_by: 1
  model_config:
    _attn_implementation: flash_attention_2
    _fused_rms_norm: true
    _fused_rotary_emb: true
    _use_doc_masking: true
    _use_qkv_packed: true
    attention_bias: false
    bos_token_id: 128000
    eos_token_id: 128001
    flex_attention_mask: null
    hidden_act: silu
    hidden_size: 2048
    initializer_range: 0.02
    intermediate_size: 6144
    is_qwen2_config: true
    max_position_embeddings: 4096
    moe_config: null
    no_rope_layer: null
    num_attention_heads: 16
    num_hidden_layers: 28
    num_key_value_heads: 8
    pad_token_id: null
    pretraining_tp: 1
    rms_norm_eps: 1.0e-06
    rope_interleaved: false
    rope_scaling: null
    rope_theta: 10000
    sliding_window_size: null
    tie_word_embeddings: true
    use_cache: true
    vocab_size: 128256
    z_loss_coefficient: 1.0e-05
    z_loss_enabled: false
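# Architecture notes (derived from the values above, as a rough back-of-envelope):
# head_dim = hidden_size / num_attention_heads = 2048 / 16 = 128, and with
# num_key_value_heads: 8 this is grouped-query attention (2 query heads per KV
# head). Parameter count estimate: tied embeddings 128256 * 2048 = ~0.26B, plus
# 28 layers of ~50.3M each (attention ~12.6M, SwiGLU MLP 3 * 2048 * 6144 =
# ~37.7M), i.e. roughly 1.7B parameters in total. z_loss_coefficient is set but
# z_loss_enabled is false, so no z-loss is applied.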
optimizer:
  accumulate_grad_in_fp32: true
  clip_grad: 1.0
  learning_rate_scheduler:
    learning_rate: 0.0005
    lr_decay_starting_step: 9000
    lr_decay_steps: 1000
    lr_decay_style: linear
    lr_warmup_steps: 100
    lr_warmup_style: linear
    min_decay_lr: 5.0e-05
  optimizer_factory:
    adam_beta1: 0.9
    adam_beta2: 0.95
    adam_eps: 1.0e-08
    name: adamW
    torch_adam_is_fused: true
  weight_decay: 0.1
  weight_decay_exclude_named_params:
  - .*token_embedding.*
  zero_stage: 0
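# LR schedule implied by the values above: linear warmup (presumably from 0) to
# 5.0e-4 over the first 100 steps, constant at 5.0e-4 until step 9000, then a
# linear decay over 1000 steps down to min_decay_lr = 5.0e-5, ending exactly at
# train_steps = 10000. zero_stage: 0 means each data-parallel rank holds a full
# copy of the AdamW states, and weight decay is skipped for parameters matching
# the .*token_embedding.* regex.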
parallelism:
  context_parallel_size: 1
  dp: 64
  expert_parallel_size: 1
  moe_layer_recompute: false
  pp: 1
  pp_engine: 1f1b
  recompute_layer: false
  tp: 1
  tp_linear_async_communication: true
  tp_mode: REDUCE_SCATTER
  tp_recompute_allgather: true
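# Note: with dp: 64 and tp = pp = 1, training spans 64 GPUs in total,
# presumably 8 nodes of 8 GPUs each given gpus_per_node: 8 in the slurm block
# of the lighteval section.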
profiler: null
s3_upload:
  remove_after_upload: true
  s5cmd_concurrency: 10
  s5cmd_numworkers: 32
  s5cmd_path: /fsx/joel_niklaus/projects/finephrase/.venv/bin/s5cmd
  upload_s3_path: s3://finephrase/experiments/checkpoints/mix-fw_edu_hq-article_granite3_1b_hq
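# Note: checkpoints are mirrored to the S3 path above via s5cmd; since
# remove_after_upload is true, the local copy under /scratch is deleted after
# upload, keeping scratch usage bounded.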
tokenizer:
  tokenizer_max_length: 4096
  tokenizer_name_or_path: hynky/Llama-3.2-1B-no-bos
  tokenizer_revision: null
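# Note: tokenizer_max_length matches sequence_length and
# max_position_embeddings (all 4096).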
tokens:
  batch_accumulation_per_replica: 4
  limit_test_batches: 0
  limit_val_batches: 0
  micro_batch_size: 2
  sequence_length: 4096
  train_steps: 10000
  val_check_interval: 0
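# Back-of-envelope token budget: 64 dp ranks * micro_batch_size 2 *
# batch_accumulation_per_replica 4 = 512 sequences per step; at
# sequence_length 4096 that is ~2.1M tokens per step, or ~21B tokens over the
# full 10,000 train_steps.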
