Text Generation
Transformers
Safetensors
qwen2
Generated from Trainer
open-r1
trl
grpo
conversational
text-generation-inference
Instructions to use zhimeng/Qwen2.5-1.5B-Open-R1-Code-GRPO with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use zhimeng/Qwen2.5-1.5B-Open-R1-Code-GRPO with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="zhimeng/Qwen2.5-1.5B-Open-R1-Code-GRPO")
messages = [
    {"role": "user", "content": "Who are you?"},
]
pipe(messages)
```

```python
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("zhimeng/Qwen2.5-1.5B-Open-R1-Code-GRPO")
model = AutoModelForCausalLM.from_pretrained("zhimeng/Qwen2.5-1.5B-Open-R1-Code-GRPO")
messages = [
    {"role": "user", "content": "Who are you?"},
]
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

outputs = model.generate(**inputs, max_new_tokens=40)
print(tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:]))
```
- Notebooks
- Google Colab
- Kaggle
- Local Apps
- vLLM
How to use zhimeng/Qwen2.5-1.5B-Open-R1-Code-GRPO with vLLM:
Install from pip and serve model
```bash
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "zhimeng/Qwen2.5-1.5B-Open-R1-Code-GRPO"

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "zhimeng/Qwen2.5-1.5B-Open-R1-Code-GRPO",
        "messages": [
            {
                "role": "user",
                "content": "What is the capital of France?"
            }
        ]
    }'
```
Use Docker
docker model run hf.co/zhimeng/Qwen2.5-1.5B-Open-R1-Code-GRPO
- SGLang
How to use zhimeng/Qwen2.5-1.5B-Open-R1-Code-GRPO with SGLang:
Install from pip and serve model
```bash
# Install SGLang from pip:
pip install sglang

# Start the SGLang server:
python3 -m sglang.launch_server \
    --model-path "zhimeng/Qwen2.5-1.5B-Open-R1-Code-GRPO" \
    --host 0.0.0.0 \
    --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "zhimeng/Qwen2.5-1.5B-Open-R1-Code-GRPO",
        "messages": [
            {
                "role": "user",
                "content": "What is the capital of France?"
            }
        ]
    }'
```
Use Docker images
```bash
docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "zhimeng/Qwen2.5-1.5B-Open-R1-Code-GRPO" \
        --host 0.0.0.0 \
        --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "zhimeng/Qwen2.5-1.5B-Open-R1-Code-GRPO",
        "messages": [
            {
                "role": "user",
                "content": "What is the capital of France?"
            }
        ]
    }'
```
- Docker Model Runner
How to use zhimeng/Qwen2.5-1.5B-Open-R1-Code-GRPO with Docker Model Runner:
docker model run hf.co/zhimeng/Qwen2.5-1.5B-Open-R1-Code-GRPO
| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.05596753882748006, | |
| "eval_steps": 500, | |
| "global_step": 500, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 706.75, | |
| "completions/max_terminated_length": 706.75, | |
| "completions/mean_length": 561.5, | |
| "completions/mean_terminated_length": 561.5, | |
| "completions/min_length": 304.25, | |
| "completions/min_terminated_length": 304.25, | |
| "epoch": 0.00011193507765496012, | |
| "grad_norm": 0.6556430486006678, | |
| "kl": 0.0, | |
| "learning_rate": 0.0, | |
| "loss": -0.0315, | |
| "num_tokens": 39768.0, | |
| "reward": 0.02219326765043661, | |
| "reward_std": 0.04227059497497976, | |
| "rewards/code_reward/mean": 0.02219326765043661, | |
| "rewards/code_reward/std": 0.0422705952078104, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 1 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 609.25, | |
| "completions/max_terminated_length": 609.25, | |
| "completions/mean_length": 333.3125, | |
| "completions/mean_terminated_length": 333.3125, | |
| "completions/min_length": 168.75, | |
| "completions/min_terminated_length": 168.75, | |
| "epoch": 0.00022387015530992023, | |
| "grad_norm": 0.9104023477920656, | |
| "kl": 0.0, | |
| "learning_rate": 3.3333333333333335e-07, | |
| "loss": -0.0684, | |
| "num_tokens": 62690.0, | |
| "reward": 0.09372148709371686, | |
| "reward_std": 0.05263180285692215, | |
| "rewards/code_reward/mean": 0.09372148709371686, | |
| "rewards/code_reward/std": 0.052631803788244724, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 2 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 613.25, | |
| "completions/max_terminated_length": 613.25, | |
| "completions/mean_length": 431.03125, | |
| "completions/mean_terminated_length": 431.03125, | |
| "completions/min_length": 231.5, | |
| "completions/min_terminated_length": 231.5, | |
| "epoch": 0.00033580523296488035, | |
| "grad_norm": 0.7467116220042507, | |
| "kl": 6.216764450073242e-05, | |
| "learning_rate": 6.666666666666667e-07, | |
| "loss": 0.0176, | |
| "num_tokens": 93451.0, | |
| "reward": 0.02661502081900835, | |
| "reward_std": 0.03690493572503328, | |
| "rewards/code_reward/mean": 0.02661502081900835, | |
| "rewards/code_reward/std": 0.03690493851900101, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 3 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 572.0, | |
| "completions/max_terminated_length": 572.0, | |
| "completions/mean_length": 379.625, | |
| "completions/mean_terminated_length": 379.625, | |
| "completions/min_length": 205.0, | |
| "completions/min_terminated_length": 205.0, | |
| "epoch": 0.00044774031061984047, | |
| "grad_norm": 0.9821405105801766, | |
| "kl": 7.545948028564453e-05, | |
| "learning_rate": 1.0000000000000002e-06, | |
| "loss": -0.0724, | |
| "num_tokens": 120351.0, | |
| "reward": 0.07474911026656628, | |
| "reward_std": 0.10846673045307398, | |
| "rewards/code_reward/mean": 0.07474911026656628, | |
| "rewards/code_reward/std": 0.1084667295217514, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 4 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 699.25, | |
| "completions/max_terminated_length": 699.25, | |
| "completions/mean_length": 458.15625, | |
| "completions/mean_terminated_length": 458.15625, | |
| "completions/min_length": 172.75, | |
| "completions/min_terminated_length": 172.75, | |
| "epoch": 0.0005596753882748006, | |
| "grad_norm": 0.7621492143305323, | |
| "kl": 4.2319297790527344e-05, | |
| "learning_rate": 1.3333333333333334e-06, | |
| "loss": -0.022, | |
| "num_tokens": 155684.0, | |
| "reward": 0.021726191509515047, | |
| "reward_std": 0.02874244563281536, | |
| "rewards/code_reward/mean": 0.021726191509515047, | |
| "rewards/code_reward/std": 0.02874244749546051, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 5 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 764.5, | |
| "completions/max_terminated_length": 764.5, | |
| "completions/mean_length": 439.15625, | |
| "completions/mean_terminated_length": 439.15625, | |
| "completions/min_length": 246.0, | |
| "completions/min_terminated_length": 246.0, | |
| "epoch": 0.0006716104659297607, | |
| "grad_norm": 1.0850029834641424, | |
| "kl": 4.9442052841186523e-05, | |
| "learning_rate": 1.6666666666666667e-06, | |
| "loss": 0.0318, | |
| "num_tokens": 189689.0, | |
| "reward": 0.36408869456499815, | |
| "reward_std": 0.2700020968914032, | |
| "rewards/code_reward/mean": 0.36408869456499815, | |
| "rewards/code_reward/std": 0.2700021122582257, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 6 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 783.75, | |
| "completions/max_terminated_length": 783.75, | |
| "completions/mean_length": 459.84375, | |
| "completions/mean_terminated_length": 459.84375, | |
| "completions/min_length": 260.0, | |
| "completions/min_terminated_length": 260.0, | |
| "epoch": 0.0007835455435847208, | |
| "grad_norm": 1.1358452129000465, | |
| "kl": 9.28044319152832e-05, | |
| "learning_rate": 2.0000000000000003e-06, | |
| "loss": -0.0229, | |
| "num_tokens": 223804.0, | |
| "reward": 0.07010709377937019, | |
| "reward_std": 0.09961615316569805, | |
| "rewards/code_reward/mean": 0.07010709377937019, | |
| "rewards/code_reward/std": 0.09961615689098835, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 7 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 560.25, | |
| "completions/max_terminated_length": 560.25, | |
| "completions/mean_length": 390.28125, | |
| "completions/mean_terminated_length": 390.28125, | |
| "completions/min_length": 230.25, | |
| "completions/min_terminated_length": 230.25, | |
| "epoch": 0.0008954806212396809, | |
| "grad_norm": 1.0894200859707648, | |
| "kl": 6.967782974243164e-05, | |
| "learning_rate": 2.3333333333333336e-06, | |
| "loss": -0.0832, | |
| "num_tokens": 250717.0, | |
| "reward": 0.17307570209959522, | |
| "reward_std": 0.17698997142724693, | |
| "rewards/code_reward/mean": 0.17307570209959522, | |
| "rewards/code_reward/std": 0.17698997911065817, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 8 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 759.5, | |
| "completions/max_terminated_length": 759.5, | |
| "completions/mean_length": 530.4375, | |
| "completions/mean_terminated_length": 530.4375, | |
| "completions/min_length": 260.25, | |
| "completions/min_terminated_length": 260.25, | |
| "epoch": 0.0010074156988946412, | |
| "grad_norm": 0.6609406173759865, | |
| "kl": 0.00013196468353271484, | |
| "learning_rate": 2.666666666666667e-06, | |
| "loss": 0.0563, | |
| "num_tokens": 290563.0, | |
| "reward": 0.0257048096973449, | |
| "reward_std": 0.036920994287356734, | |
| "rewards/code_reward/mean": 0.0257048096973449, | |
| "rewards/code_reward/std": 0.03692099452018738, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 9 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 749.25, | |
| "completions/max_terminated_length": 749.25, | |
| "completions/mean_length": 468.71875, | |
| "completions/mean_terminated_length": 468.71875, | |
| "completions/min_length": 277.25, | |
| "completions/min_terminated_length": 277.25, | |
| "epoch": 0.0011193507765496012, | |
| "grad_norm": 0.816974823658789, | |
| "kl": 0.00015497207641601562, | |
| "learning_rate": 3e-06, | |
| "loss": -0.0226, | |
| "num_tokens": 325418.0, | |
| "reward": 0.09049492585472763, | |
| "reward_std": 0.18241150537505746, | |
| "rewards/code_reward/mean": 0.09049492585472763, | |
| "rewards/code_reward/std": 0.18241151235997677, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 10 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 523.5, | |
| "completions/max_terminated_length": 523.5, | |
| "completions/mean_length": 360.09375, | |
| "completions/mean_terminated_length": 360.09375, | |
| "completions/min_length": 172.75, | |
| "completions/min_terminated_length": 172.75, | |
| "epoch": 0.0012312858542045614, | |
| "grad_norm": 1.3579676028203136, | |
| "kl": 0.0004100799560546875, | |
| "learning_rate": 3.3333333333333333e-06, | |
| "loss": 0.0149, | |
| "num_tokens": 348085.0, | |
| "reward": 0.07968997955322266, | |
| "reward_std": 0.19473078846931458, | |
| "rewards/code_reward/mean": 0.07968997955322266, | |
| "rewards/code_reward/std": 0.19473078846931458, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 11 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 719.25, | |
| "completions/max_terminated_length": 719.25, | |
| "completions/mean_length": 476.03125, | |
| "completions/mean_terminated_length": 476.03125, | |
| "completions/min_length": 289.0, | |
| "completions/min_terminated_length": 289.0, | |
| "epoch": 0.0013432209318595214, | |
| "grad_norm": 1.0254632341024847, | |
| "kl": 0.00038051605224609375, | |
| "learning_rate": 3.6666666666666666e-06, | |
| "loss": -0.0456, | |
| "num_tokens": 379078.0, | |
| "reward": 0.06544117676094174, | |
| "reward_std": 0.11873381165787578, | |
| "rewards/code_reward/mean": 0.06544117676094174, | |
| "rewards/code_reward/std": 0.11873381165787578, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 12 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 643.75, | |
| "completions/max_terminated_length": 643.75, | |
| "completions/mean_length": 456.40625, | |
| "completions/mean_terminated_length": 456.40625, | |
| "completions/min_length": 220.0, | |
| "completions/min_terminated_length": 220.0, | |
| "epoch": 0.0014551560095144816, | |
| "grad_norm": 0.7844627176055754, | |
| "kl": 0.0006098747253417969, | |
| "learning_rate": 4.000000000000001e-06, | |
| "loss": 0.0275, | |
| "num_tokens": 410643.0, | |
| "reward": 0.08870427880901843, | |
| "reward_std": 0.1394934863783419, | |
| "rewards/code_reward/mean": 0.08870427880901843, | |
| "rewards/code_reward/std": 0.13949348265305161, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 13 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 946.5, | |
| "completions/max_terminated_length": 779.25, | |
| "completions/mean_length": 474.78125, | |
| "completions/mean_terminated_length": 433.4776916503906, | |
| "completions/min_length": 212.5, | |
| "completions/min_terminated_length": 212.5, | |
| "epoch": 0.0015670910871694416, | |
| "grad_norm": 1.0068663809227436, | |
| "kl": 0.001148223876953125, | |
| "learning_rate": 4.333333333333334e-06, | |
| "loss": -0.0596, | |
| "num_tokens": 444292.0, | |
| "reward": 0.09596806723857298, | |
| "reward_std": 0.1574952198425308, | |
| "rewards/code_reward/mean": 0.09596806723857298, | |
| "rewards/code_reward/std": 0.15749522170517594, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 14 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 754.75, | |
| "completions/max_terminated_length": 754.75, | |
| "completions/mean_length": 491.375, | |
| "completions/mean_terminated_length": 491.375, | |
| "completions/min_length": 320.5, | |
| "completions/min_terminated_length": 320.5, | |
| "epoch": 0.0016790261648244019, | |
| "grad_norm": 0.6616890996356674, | |
| "kl": 0.0012340545654296875, | |
| "learning_rate": 4.666666666666667e-06, | |
| "loss": 0.0086, | |
| "num_tokens": 480536.0, | |
| "reward": 0.04718137255986221, | |
| "reward_std": 0.06556019705021754, | |
| "rewards/code_reward/mean": 0.04718137255986221, | |
| "rewards/code_reward/std": 0.0655602045590058, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 15 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 671.0, | |
| "completions/max_terminated_length": 671.0, | |
| "completions/mean_length": 434.03125, | |
| "completions/mean_terminated_length": 434.03125, | |
| "completions/min_length": 225.25, | |
| "completions/min_terminated_length": 225.25, | |
| "epoch": 0.0017909612424793619, | |
| "grad_norm": 0.8195686424785313, | |
| "kl": 0.0021371841430664062, | |
| "learning_rate": 5e-06, | |
| "loss": -0.0293, | |
| "num_tokens": 517233.0, | |
| "reward": 0.14742368459701538, | |
| "reward_std": 0.13962780684232712, | |
| "rewards/code_reward/mean": 0.14742368459701538, | |
| "rewards/code_reward/std": 0.13962781056761742, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 16 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 670.25, | |
| "completions/max_terminated_length": 670.25, | |
| "completions/mean_length": 466.125, | |
| "completions/mean_terminated_length": 466.125, | |
| "completions/min_length": 264.25, | |
| "completions/min_terminated_length": 264.25, | |
| "epoch": 0.001902896320134322, | |
| "grad_norm": 0.7604607536894525, | |
| "kl": 0.00327301025390625, | |
| "learning_rate": 4.999952797253148e-06, | |
| "loss": -0.0188, | |
| "num_tokens": 552989.0, | |
| "reward": 0.03991336654871702, | |
| "reward_std": 0.05816664919257164, | |
| "rewards/code_reward/mean": 0.03991336654871702, | |
| "rewards/code_reward/std": 0.05816664732992649, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 17 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 639.0, | |
| "completions/max_terminated_length": 639.0, | |
| "completions/mean_length": 413.21875, | |
| "completions/mean_terminated_length": 413.21875, | |
| "completions/min_length": 243.25, | |
| "completions/min_terminated_length": 243.25, | |
| "epoch": 0.0020148313977892823, | |
| "grad_norm": 0.8446607287984066, | |
| "kl": 0.00414276123046875, | |
| "learning_rate": 4.9998111909931225e-06, | |
| "loss": -0.0223, | |
| "num_tokens": 582628.0, | |
| "reward": 0.027369487448595464, | |
| "reward_std": 0.05444430746138096, | |
| "rewards/code_reward/mean": 0.027369487448595464, | |
| "rewards/code_reward/std": 0.05444430839270353, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 18 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 1086.75, | |
| "completions/max_terminated_length": 799.25, | |
| "completions/mean_length": 567.1875, | |
| "completions/mean_terminated_length": 521.8258972167969, | |
| "completions/min_length": 304.5, | |
| "completions/min_terminated_length": 304.5, | |
| "epoch": 0.0021267664754442426, | |
| "grad_norm": 0.6815859244268851, | |
| "kl": 0.00553131103515625, | |
| "learning_rate": 4.999575187161439e-06, | |
| "loss": 0.0078, | |
| "num_tokens": 620538.0, | |
| "reward": 0.03672935510985553, | |
| "reward_std": 0.029647217132151127, | |
| "rewards/code_reward/mean": 0.03672935510985553, | |
| "rewards/code_reward/std": 0.029647217132151127, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 19 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 805.25, | |
| "completions/max_terminated_length": 805.25, | |
| "completions/mean_length": 507.21875, | |
| "completions/mean_terminated_length": 507.21875, | |
| "completions/min_length": 351.0, | |
| "completions/min_terminated_length": 351.0, | |
| "epoch": 0.0022387015530992023, | |
| "grad_norm": 0.4364759611369158, | |
| "kl": 0.00673675537109375, | |
| "learning_rate": 4.9992447956603455e-06, | |
| "loss": 0.0085, | |
| "num_tokens": 659145.0, | |
| "reward": 0.00214460794813931, | |
| "reward_std": 0.006065867375582457, | |
| "rewards/code_reward/mean": 0.00214460794813931, | |
| "rewards/code_reward/std": 0.006065867375582457, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 20 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 488.25, | |
| "completions/max_terminated_length": 488.25, | |
| "completions/mean_length": 313.0, | |
| "completions/mean_terminated_length": 313.0, | |
| "completions/min_length": 191.5, | |
| "completions/min_terminated_length": 191.5, | |
| "epoch": 0.0023506366307541626, | |
| "grad_norm": 1.1196346766853063, | |
| "kl": 0.009761810302734375, | |
| "learning_rate": 4.998820030352409e-06, | |
| "loss": -0.0401, | |
| "num_tokens": 681841.0, | |
| "reward": 0.21865647949744016, | |
| "reward_std": 0.16933009633794427, | |
| "rewards/code_reward/mean": 0.21865647949744016, | |
| "rewards/code_reward/std": 0.16933008842170238, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 21 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 1162.75, | |
| "completions/max_terminated_length": 852.25, | |
| "completions/mean_length": 618.125, | |
| "completions/mean_terminated_length": 572.4866180419922, | |
| "completions/min_length": 325.75, | |
| "completions/min_terminated_length": 325.75, | |
| "epoch": 0.002462571708409123, | |
| "grad_norm": 0.8324675842567315, | |
| "kl": 0.00992584228515625, | |
| "learning_rate": 4.998300909059929e-06, | |
| "loss": 0.0359, | |
| "num_tokens": 722261.0, | |
| "reward": 0.03132503107190132, | |
| "reward_std": 0.04826245130971074, | |
| "rewards/code_reward/mean": 0.03132503107190132, | |
| "rewards/code_reward/std": 0.048262451542541385, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 22 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 668.5, | |
| "completions/max_terminated_length": 668.5, | |
| "completions/mean_length": 449.5625, | |
| "completions/mean_terminated_length": 449.5625, | |
| "completions/min_length": 302.0, | |
| "completions/min_terminated_length": 302.0, | |
| "epoch": 0.002574506786064083, | |
| "grad_norm": 0.8188216899529877, | |
| "kl": 0.012542724609375, | |
| "learning_rate": 4.997687453564198e-06, | |
| "loss": -0.0757, | |
| "num_tokens": 756751.0, | |
| "reward": 0.0606757253408432, | |
| "reward_std": 0.08386795781552792, | |
| "rewards/code_reward/mean": 0.0606757253408432, | |
| "rewards/code_reward/std": 0.08386795967817307, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 23 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 554.0, | |
| "completions/max_terminated_length": 554.0, | |
| "completions/mean_length": 397.21875, | |
| "completions/mean_terminated_length": 397.21875, | |
| "completions/min_length": 221.5, | |
| "completions/min_terminated_length": 221.5, | |
| "epoch": 0.002686441863719043, | |
| "grad_norm": 0.9882922552317236, | |
| "kl": 0.020965576171875, | |
| "learning_rate": 4.9969796896045775e-06, | |
| "loss": -0.0241, | |
| "num_tokens": 784286.0, | |
| "reward": 0.03399203496519476, | |
| "reward_std": 0.06442949129268527, | |
| "rewards/code_reward/mean": 0.03399203496519476, | |
| "rewards/code_reward/std": 0.06442949641495943, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 24 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 692.75, | |
| "completions/max_terminated_length": 692.75, | |
| "completions/mean_length": 500.4375, | |
| "completions/mean_terminated_length": 500.4375, | |
| "completions/min_length": 286.75, | |
| "completions/min_terminated_length": 286.75, | |
| "epoch": 0.002798376941374003, | |
| "grad_norm": 1.0525741278330438, | |
| "kl": 0.0211029052734375, | |
| "learning_rate": 4.996177646877426e-06, | |
| "loss": 0.0871, | |
| "num_tokens": 818260.0, | |
| "reward": 0.1732453762087971, | |
| "reward_std": 0.19011690141633153, | |
| "rewards/code_reward/mean": 0.1732453762087971, | |
| "rewards/code_reward/std": 0.19011691492050886, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 25 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 651.5, | |
| "completions/max_terminated_length": 651.5, | |
| "completions/mean_length": 481.875, | |
| "completions/mean_terminated_length": 481.875, | |
| "completions/min_length": 325.25, | |
| "completions/min_terminated_length": 325.25, | |
| "epoch": 0.0029103120190289633, | |
| "grad_norm": 0.8323803477726922, | |
| "kl": 0.02197265625, | |
| "learning_rate": 4.995281359034851e-06, | |
| "loss": -0.0176, | |
| "num_tokens": 858344.0, | |
| "reward": 0.07887662292341702, | |
| "reward_std": 0.06681955535896122, | |
| "rewards/code_reward/mean": 0.07887662292341702, | |
| "rewards/code_reward/std": 0.06681956280954182, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 26 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 1061.75, | |
| "completions/max_terminated_length": 923.25, | |
| "completions/mean_length": 631.0, | |
| "completions/mean_terminated_length": 599.9107360839844, | |
| "completions/min_length": 343.5, | |
| "completions/min_terminated_length": 343.5, | |
| "epoch": 0.0030222470966839235, | |
| "grad_norm": 0.758606194839006, | |
| "kl": 0.01599884033203125, | |
| "learning_rate": 4.994290863683296e-06, | |
| "loss": -0.0909, | |
| "num_tokens": 900352.0, | |
| "reward": 0.019329323433339596, | |
| "reward_std": 0.05079583264887333, | |
| "rewards/code_reward/mean": 0.019329323433339596, | |
| "rewards/code_reward/std": 0.05079583264887333, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 27 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 1012.75, | |
| "completions/max_terminated_length": 680.75, | |
| "completions/mean_length": 559.90625, | |
| "completions/mean_terminated_length": 515.2723236083984, | |
| "completions/min_length": 366.5, | |
| "completions/min_terminated_length": 366.5, | |
| "epoch": 0.0031341821743388833, | |
| "grad_norm": 0.6052468908705184, | |
| "kl": 0.019195556640625, | |
| "learning_rate": 4.99320620238196e-06, | |
| "loss": -0.0222, | |
| "num_tokens": 934653.0, | |
| "reward": 0.3267045458778739, | |
| "reward_std": 0.13263714499771595, | |
| "rewards/code_reward/mean": 0.3267045458778739, | |
| "rewards/code_reward/std": 0.1326371468603611, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 28 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 755.25, | |
| "completions/max_terminated_length": 755.25, | |
| "completions/mean_length": 560.125, | |
| "completions/mean_terminated_length": 560.125, | |
| "completions/min_length": 388.0, | |
| "completions/min_terminated_length": 388.0, | |
| "epoch": 0.0032461172519938435, | |
| "grad_norm": 0.7871054475315462, | |
| "kl": 0.026947021484375, | |
| "learning_rate": 4.99202742064106e-06, | |
| "loss": -0.0025, | |
| "num_tokens": 963673.0, | |
| "reward": 0.1587616038741544, | |
| "reward_std": 0.16337263770401478, | |
| "rewards/code_reward/mean": 0.1587616038741544, | |
| "rewards/code_reward/std": 0.16337264538742602, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 29 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 682.0, | |
| "completions/max_terminated_length": 682.0, | |
| "completions/mean_length": 521.5625, | |
| "completions/mean_terminated_length": 521.5625, | |
| "completions/min_length": 357.5, | |
| "completions/min_terminated_length": 357.5, | |
| "epoch": 0.0033580523296488037, | |
| "grad_norm": 0.7847358134959852, | |
| "kl": 0.0301513671875, | |
| "learning_rate": 4.990754567919917e-06, | |
| "loss": -0.0139, | |
| "num_tokens": 1001035.0, | |
| "reward": 0.06875000009313226, | |
| "reward_std": 0.08176466450095177, | |
| "rewards/code_reward/mean": 0.06875000009313226, | |
| "rewards/code_reward/std": 0.08176466636359692, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 30 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 795.75, | |
| "completions/max_terminated_length": 795.75, | |
| "completions/mean_length": 541.53125, | |
| "completions/mean_terminated_length": 541.53125, | |
| "completions/min_length": 275.75, | |
| "completions/min_terminated_length": 275.75, | |
| "epoch": 0.003469987407303764, | |
| "grad_norm": 0.7341146031808772, | |
| "kl": 0.038360595703125, | |
| "learning_rate": 4.989387697624881e-06, | |
| "loss": -0.0057, | |
| "num_tokens": 1032172.0, | |
| "reward": 0.12754360469989479, | |
| "reward_std": 0.13973576435819268, | |
| "rewards/code_reward/mean": 0.12754360469989479, | |
| "rewards/code_reward/std": 0.13973576435819268, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 31 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 855.25, | |
| "completions/max_terminated_length": 855.25, | |
| "completions/mean_length": 615.25, | |
| "completions/mean_terminated_length": 615.25, | |
| "completions/min_length": 476.5, | |
| "completions/min_terminated_length": 476.5, | |
| "epoch": 0.0035819224849587238, | |
| "grad_norm": 0.6710232004425948, | |
| "kl": 0.03857421875, | |
| "learning_rate": 4.987926867107095e-06, | |
| "loss": 0.0172, | |
| "num_tokens": 1071724.0, | |
| "reward": 0.04409082653000951, | |
| "reward_std": 0.08446959964931011, | |
| "rewards/code_reward/mean": 0.04409082653000951, | |
| "rewards/code_reward/std": 0.08446960058063269, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 32 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 600.25, | |
| "completions/max_terminated_length": 600.25, | |
| "completions/mean_length": 480.375, | |
| "completions/mean_terminated_length": 480.375, | |
| "completions/min_length": 348.0, | |
| "completions/min_terminated_length": 348.0, | |
| "epoch": 0.003693857562613684, | |
| "grad_norm": 0.9880874575660351, | |
| "kl": 0.045196533203125, | |
| "learning_rate": 4.986372137660078e-06, | |
| "loss": 0.0123, | |
| "num_tokens": 1105880.0, | |
| "reward": 0.2375063351355493, | |
| "reward_std": 0.21191274048760533, | |
| "rewards/code_reward/mean": 0.2375063351355493, | |
| "rewards/code_reward/std": 0.21191274095326662, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 33 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 810.25, | |
| "completions/max_terminated_length": 810.25, | |
| "completions/mean_length": 586.9375, | |
| "completions/mean_terminated_length": 586.9375, | |
| "completions/min_length": 352.0, | |
| "completions/min_terminated_length": 352.0, | |
| "epoch": 0.003805792640268644, | |
| "grad_norm": 0.7360918797059441, | |
| "kl": 0.036865234375, | |
| "learning_rate": 4.984723574517165e-06, | |
| "loss": 0.0507, | |
| "num_tokens": 1137102.0, | |
| "reward": 0.1048327736207284, | |
| "reward_std": 0.12198017432820052, | |
| "rewards/code_reward/mean": 0.1048327736207284, | |
| "rewards/code_reward/std": 0.12198018177878112, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 34 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 671.25, | |
| "completions/max_terminated_length": 671.25, | |
| "completions/mean_length": 481.6875, | |
| "completions/mean_terminated_length": 481.6875, | |
| "completions/min_length": 354.0, | |
| "completions/min_terminated_length": 354.0, | |
| "epoch": 0.0039177277179236044, | |
| "grad_norm": 0.8321841584943117, | |
| "kl": 0.04937744140625, | |
| "learning_rate": 4.9829812468487655e-06, | |
| "loss": 0.0045, | |
| "num_tokens": 1169516.0, | |
| "reward": 0.13488754630088806, | |
| "reward_std": 0.15473865950480103, | |
| "rewards/code_reward/mean": 0.13488754630088806, | |
| "rewards/code_reward/std": 0.15473866136744618, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 35 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 758.25, | |
| "completions/max_terminated_length": 758.25, | |
| "completions/mean_length": 503.5625, | |
| "completions/mean_terminated_length": 503.5625, | |
| "completions/min_length": 332.75, | |
| "completions/min_terminated_length": 332.75, | |
| "epoch": 0.004029662795578565, | |
| "grad_norm": 0.6486333735221288, | |
| "kl": 0.04339599609375, | |
| "learning_rate": 4.981145227759457e-06, | |
| "loss": 0.032, | |
| "num_tokens": 1200070.0, | |
| "reward": 0.1878063678741455, | |
| "reward_std": 0.2491721287369728, | |
| "rewards/code_reward/mean": 0.1878063678741455, | |
| "rewards/code_reward/std": 0.2491721287369728, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 36 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 699.0, | |
| "completions/max_terminated_length": 699.0, | |
| "completions/mean_length": 447.4375, | |
| "completions/mean_terminated_length": 447.4375, | |
| "completions/min_length": 265.75, | |
| "completions/min_terminated_length": 265.75, | |
| "epoch": 0.004141597873233525, | |
| "grad_norm": 0.7244692818471474, | |
| "kl": 0.06103515625, | |
| "learning_rate": 4.979215594284924e-06, | |
| "loss": 0.0021, | |
| "num_tokens": 1226348.0, | |
| "reward": 0.20452898740768433, | |
| "reward_std": 0.11701454967260361, | |
| "rewards/code_reward/mean": 0.20452898740768433, | |
| "rewards/code_reward/std": 0.1170145571231842, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 37 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1067.0, | |
| "completions/max_terminated_length": 1067.0, | |
| "completions/mean_length": 693.0625, | |
| "completions/mean_terminated_length": 693.0625, | |
| "completions/min_length": 482.0, | |
| "completions/min_terminated_length": 482.0, | |
| "epoch": 0.004253532950888485, | |
| "grad_norm": 0.849851302505875, | |
| "kl": 0.04730224609375, | |
| "learning_rate": 4.977192427388722e-06, | |
| "loss": 0.0179, | |
| "num_tokens": 1269150.0, | |
| "reward": 0.27298991987481713, | |
| "reward_std": 0.22980366041883826, | |
| "rewards/code_reward/mean": 0.27298991987481713, | |
| "rewards/code_reward/std": 0.22980366088449955, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 38 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 683.75, | |
| "completions/max_terminated_length": 683.75, | |
| "completions/mean_length": 558.6875, | |
| "completions/mean_terminated_length": 558.6875, | |
| "completions/min_length": 429.5, | |
| "completions/min_terminated_length": 429.5, | |
| "epoch": 0.0043654680285434445, | |
| "grad_norm": 0.7007952533946012, | |
| "kl": 0.0660400390625, | |
| "learning_rate": 4.9750758119588824e-06, | |
| "loss": -0.0113, | |
| "num_tokens": 1303044.0, | |
| "reward": 0.09943000599741936, | |
| "reward_std": 0.06831434741616249, | |
| "rewards/code_reward/mean": 0.09943000599741936, | |
| "rewards/code_reward/std": 0.06831434927880764, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 39 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 757.25, | |
| "completions/max_terminated_length": 757.25, | |
| "completions/mean_length": 580.4375, | |
| "completions/mean_terminated_length": 580.4375, | |
| "completions/min_length": 434.5, | |
| "completions/min_terminated_length": 434.5, | |
| "epoch": 0.004477403106198405, | |
| "grad_norm": 0.8837854421885906, | |
| "kl": 0.0677490234375, | |
| "learning_rate": 4.972865836804349e-06, | |
| "loss": 0.036, | |
| "num_tokens": 1341114.0, | |
| "reward": 0.06429215706884861, | |
| "reward_std": 0.10811880882829428, | |
| "rewards/code_reward/mean": 0.06429215706884861, | |
| "rewards/code_reward/std": 0.10811881255358458, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 40 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 751.25, | |
| "completions/max_terminated_length": 751.25, | |
| "completions/mean_length": 564.15625, | |
| "completions/mean_terminated_length": 564.15625, | |
| "completions/min_length": 286.25, | |
| "completions/min_terminated_length": 286.25, | |
| "epoch": 0.004589338183853365, | |
| "grad_norm": 0.8908962386984877, | |
| "kl": 0.05224609375, | |
| "learning_rate": 4.970562594651254e-06, | |
| "loss": 0.0452, | |
| "num_tokens": 1374855.0, | |
| "reward": 0.0228785730432719, | |
| "reward_std": 0.06361539242789149, | |
| "rewards/code_reward/mean": 0.0228785730432719, | |
| "rewards/code_reward/std": 0.06361539429053664, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 41 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 771.25, | |
| "completions/max_terminated_length": 771.25, | |
| "completions/mean_length": 548.25, | |
| "completions/mean_terminated_length": 548.25, | |
| "completions/min_length": 359.5, | |
| "completions/min_terminated_length": 359.5, | |
| "epoch": 0.004701273261508325, | |
| "grad_norm": 0.6625726727295415, | |
| "kl": 0.06036376953125, | |
| "learning_rate": 4.968166182139026e-06, | |
| "loss": 0.0256, | |
| "num_tokens": 1408383.0, | |
| "reward": 0.12928921589627862, | |
| "reward_std": 0.1418905109167099, | |
| "rewards/code_reward/mean": 0.12928921589627862, | |
| "rewards/code_reward/std": 0.1418905109167099, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 42 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 734.5, | |
| "completions/max_terminated_length": 734.5, | |
| "completions/mean_length": 542.28125, | |
| "completions/mean_terminated_length": 542.28125, | |
| "completions/min_length": 352.0, | |
| "completions/min_terminated_length": 352.0, | |
| "epoch": 0.004813208339163285, | |
| "grad_norm": 0.6465061355449234, | |
| "kl": 0.06768798828125, | |
| "learning_rate": 4.9656766998163306e-06, | |
| "loss": 0.0207, | |
| "num_tokens": 1446992.0, | |
| "reward": 0.10072244703769684, | |
| "reward_std": 0.11095328629016876, | |
| "rewards/code_reward/mean": 0.10072244703769684, | |
| "rewards/code_reward/std": 0.11095329001545906, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 43 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 1169.75, | |
| "completions/max_terminated_length": 960.0, | |
| "completions/mean_length": 709.5625, | |
| "completions/mean_terminated_length": 673.1160888671875, | |
| "completions/min_length": 441.75, | |
| "completions/min_terminated_length": 441.75, | |
| "epoch": 0.004925143416818246, | |
| "grad_norm": 0.5674013554100908, | |
| "kl": 0.05987548828125, | |
| "learning_rate": 4.963094252136865e-06, | |
| "loss": 0.0051, | |
| "num_tokens": 1489002.0, | |
| "reward": 0.026442307978868484, | |
| "reward_std": 0.02707473188638687, | |
| "rewards/code_reward/mean": 0.026442307978868484, | |
| "rewards/code_reward/std": 0.027074730023741722, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 44 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 1275.0, | |
| "completions/max_terminated_length": 1007.25, | |
| "completions/mean_length": 751.75, | |
| "completions/mean_terminated_length": 665.2291717529297, | |
| "completions/min_length": 437.0, | |
| "completions/min_terminated_length": 437.0, | |
| "epoch": 0.005037078494473206, | |
| "grad_norm": 0.485271480797118, | |
| "kl": 0.04827880859375, | |
| "learning_rate": 4.960418947454958e-06, | |
| "loss": 0.0172, | |
| "num_tokens": 1535994.0, | |
| "reward": 0.0928819477558136, | |
| "reward_std": 0.059048041701316833, | |
| "rewards/code_reward/mean": 0.0928819477558136, | |
| "rewards/code_reward/std": 0.059048037976026535, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 45 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 862.25, | |
| "completions/max_terminated_length": 862.25, | |
| "completions/mean_length": 557.3125, | |
| "completions/mean_terminated_length": 557.3125, | |
| "completions/min_length": 322.75, | |
| "completions/min_terminated_length": 322.75, | |
| "epoch": 0.005149013572128166, | |
| "grad_norm": 0.5321904564182419, | |
| "kl": 0.0755615234375, | |
| "learning_rate": 4.957650898021038e-06, | |
| "loss": 0.0296, | |
| "num_tokens": 1574756.0, | |
| "reward": 0.001838235417380929, | |
| "reward_std": 0.0021725620608776808, | |
| "rewards/code_reward/mean": 0.001838235417380929, | |
| "rewards/code_reward/std": 0.0021725620608776808, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 46 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 822.75, | |
| "completions/max_terminated_length": 822.75, | |
| "completions/mean_length": 576.875, | |
| "completions/mean_terminated_length": 576.875, | |
| "completions/min_length": 400.25, | |
| "completions/min_terminated_length": 400.25, | |
| "epoch": 0.005260948649783125, | |
| "grad_norm": 0.936192416992597, | |
| "kl": 0.07989501953125, | |
| "learning_rate": 4.954790219976915e-06, | |
| "loss": 0.0045, | |
| "num_tokens": 1610288.0, | |
| "reward": 0.1439773216843605, | |
| "reward_std": 0.18783001974225044, | |
| "rewards/code_reward/mean": 0.1439773216843605, | |
| "rewards/code_reward/std": 0.1878300216048956, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 47 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 1394.5, | |
| "completions/max_terminated_length": 718.5, | |
| "completions/mean_length": 634.0, | |
| "completions/mean_terminated_length": 540.5089416503906, | |
| "completions/min_length": 364.25, | |
| "completions/min_terminated_length": 364.25, | |
| "epoch": 0.005372883727438086, | |
| "grad_norm": 0.7287543485940715, | |
| "kl": 0.0684814453125, | |
| "learning_rate": 4.95183703335091e-06, | |
| "loss": -0.0097, | |
| "num_tokens": 1643344.0, | |
| "reward": 0.06508574914187193, | |
| "reward_std": 0.14080872386693954, | |
| "rewards/code_reward/mean": 0.06508574914187193, | |
| "rewards/code_reward/std": 0.14080872386693954, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 48 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 1348.0, | |
| "completions/max_terminated_length": 1062.25, | |
| "completions/mean_length": 779.1875, | |
| "completions/mean_terminated_length": 739.1562652587891, | |
| "completions/min_length": 460.5, | |
| "completions/min_terminated_length": 460.5, | |
| "epoch": 0.005484818805093046, | |
| "grad_norm": 0.5005111196440745, | |
| "kl": 0.0516357421875, | |
| "learning_rate": 4.948791462052819e-06, | |
| "loss": 0.0943, | |
| "num_tokens": 1697622.0, | |
| "reward": 0.03162594046443701, | |
| "reward_std": 0.04561105836182833, | |
| "rewards/code_reward/mean": 0.03162594046443701, | |
| "rewards/code_reward/std": 0.0456110592931509, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 49 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 698.0, | |
| "completions/max_terminated_length": 698.0, | |
| "completions/mean_length": 550.5, | |
| "completions/mean_terminated_length": 550.5, | |
| "completions/min_length": 308.0, | |
| "completions/min_terminated_length": 308.0, | |
| "epoch": 0.005596753882748006, | |
| "grad_norm": 0.6600509702450322, | |
| "kl": 0.080078125, | |
| "learning_rate": 4.945653633868716e-06, | |
| "loss": -0.0276, | |
| "num_tokens": 1736334.0, | |
| "reward": 0.020256503019481897, | |
| "reward_std": 0.05186595255509019, | |
| "rewards/code_reward/mean": 0.020256503019481897, | |
| "rewards/code_reward/std": 0.05186595278792083, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 50 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 974.0, | |
| "completions/max_terminated_length": 974.0, | |
| "completions/mean_length": 716.1875, | |
| "completions/mean_terminated_length": 716.1875, | |
| "completions/min_length": 532.75, | |
| "completions/min_terminated_length": 532.75, | |
| "epoch": 0.005708688960402966, | |
| "grad_norm": 0.7327612072681561, | |
| "kl": 0.07684326171875, | |
| "learning_rate": 4.942423680455584e-06, | |
| "loss": 0.0245, | |
| "num_tokens": 1782508.0, | |
| "reward": 0.12191444495692849, | |
| "reward_std": 0.18840329442173243, | |
| "rewards/code_reward/mean": 0.12191444495692849, | |
| "rewards/code_reward/std": 0.18840329255908728, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 51 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 1145.75, | |
| "completions/max_terminated_length": 794.5, | |
| "completions/mean_length": 594.21875, | |
| "completions/mean_terminated_length": 546.3482208251953, | |
| "completions/min_length": 401.5, | |
| "completions/min_terminated_length": 401.5, | |
| "epoch": 0.0058206240380579265, | |
| "grad_norm": 0.8852205460832024, | |
| "kl": 0.0771484375, | |
| "learning_rate": 4.939101737335802e-06, | |
| "loss": 0.0285, | |
| "num_tokens": 1817019.0, | |
| "reward": 0.2491304986178875, | |
| "reward_std": 0.283006114885211, | |
| "rewards/code_reward/mean": 0.2491304986178875, | |
| "rewards/code_reward/std": 0.2830061223357916, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 52 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 842.0, | |
| "completions/max_terminated_length": 842.0, | |
| "completions/mean_length": 592.90625, | |
| "completions/mean_terminated_length": 592.90625, | |
| "completions/min_length": 387.5, | |
| "completions/min_terminated_length": 387.5, | |
| "epoch": 0.005932559115712887, | |
| "grad_norm": 0.9735177234540268, | |
| "kl": 0.090576171875, | |
| "learning_rate": 4.935687943891447e-06, | |
| "loss": 0.0151, | |
| "num_tokens": 1857712.0, | |
| "reward": 0.05411792593076825, | |
| "reward_std": 0.11774230282753706, | |
| "rewards/code_reward/mean": 0.05411792593076825, | |
| "rewards/code_reward/std": 0.11774230748414993, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 53 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 1032.25, | |
| "completions/max_terminated_length": 899.0, | |
| "completions/mean_length": 757.0, | |
| "completions/mean_terminated_length": 704.8645935058594, | |
| "completions/min_length": 557.75, | |
| "completions/min_terminated_length": 557.75, | |
| "epoch": 0.006044494193367847, | |
| "grad_norm": 0.528475149602509, | |
| "kl": 0.081939697265625, | |
| "learning_rate": 4.932182443358458e-06, | |
| "loss": 0.0384, | |
| "num_tokens": 1906368.0, | |
| "reward": 0.08750000596046448, | |
| "reward_std": 0.10493762046098709, | |
| "rewards/code_reward/mean": 0.08750000596046448, | |
| "rewards/code_reward/std": 0.10493762046098709, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 54 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 1305.0, | |
| "completions/max_terminated_length": 659.0, | |
| "completions/mean_length": 582.5, | |
| "completions/mean_terminated_length": 488.51341247558594, | |
| "completions/min_length": 289.25, | |
| "completions/min_terminated_length": 289.25, | |
| "epoch": 0.006156429271022806, | |
| "grad_norm": 0.9161325800813456, | |
| "kl": 0.06402587890625, | |
| "learning_rate": 4.928585382820616e-06, | |
| "loss": 0.0907, | |
| "num_tokens": 1939632.0, | |
| "reward": 0.17478298512287438, | |
| "reward_std": 0.1567421266809106, | |
| "rewards/code_reward/mean": 0.17478298512287438, | |
| "rewards/code_reward/std": 0.15674212691374123, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 55 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 1085.75, | |
| "completions/max_terminated_length": 757.0, | |
| "completions/mean_length": 603.28125, | |
| "completions/mean_terminated_length": 558.6964416503906, | |
| "completions/min_length": 429.0, | |
| "completions/min_terminated_length": 429.0, | |
| "epoch": 0.006268364348677767, | |
| "grad_norm": 0.8910583501574744, | |
| "kl": 0.07037353515625, | |
| "learning_rate": 4.924896913203376e-06, | |
| "loss": 0.0682, | |
| "num_tokens": 1979449.0, | |
| "reward": 0.10180415771901608, | |
| "reward_std": 0.13188505358994007, | |
| "rewards/code_reward/mean": 0.10180415771901608, | |
| "rewards/code_reward/std": 0.1318850601091981, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 56 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 806.5, | |
| "completions/max_terminated_length": 806.5, | |
| "completions/mean_length": 581.03125, | |
| "completions/mean_terminated_length": 581.03125, | |
| "completions/min_length": 407.5, | |
| "completions/min_terminated_length": 407.5, | |
| "epoch": 0.006380299426332727, | |
| "grad_norm": 1.03891552766019, | |
| "kl": 0.08428955078125, | |
| "learning_rate": 4.921117189267535e-06, | |
| "loss": -0.054, | |
| "num_tokens": 2018810.0, | |
| "reward": 0.1397020157892257, | |
| "reward_std": 0.1752215747255832, | |
| "rewards/code_reward/mean": 0.1397020157892257, | |
| "rewards/code_reward/std": 0.17522158951032907, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 57 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 745.0, | |
| "completions/max_terminated_length": 745.0, | |
| "completions/mean_length": 548.53125, | |
| "completions/mean_terminated_length": 548.53125, | |
| "completions/min_length": 340.25, | |
| "completions/min_terminated_length": 340.25, | |
| "epoch": 0.006492234503987687, | |
| "grad_norm": 0.5842843696982555, | |
| "kl": 0.0770263671875, | |
| "learning_rate": 4.917246369602742e-06, | |
| "loss": 0.0329, | |
| "num_tokens": 2054963.0, | |
| "reward": 0.10270743072032928, | |
| "reward_std": 0.14296946674585342, | |
| "rewards/code_reward/mean": 0.10270743072032928, | |
| "rewards/code_reward/std": 0.14296947047114372, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 58 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 741.75, | |
| "completions/max_terminated_length": 741.75, | |
| "completions/mean_length": 581.90625, | |
| "completions/mean_terminated_length": 581.90625, | |
| "completions/min_length": 426.0, | |
| "completions/min_terminated_length": 426.0, | |
| "epoch": 0.006604169581642647, | |
| "grad_norm": 0.8753670558307941, | |
| "kl": 0.07373046875, | |
| "learning_rate": 4.9132846166208355e-06, | |
| "loss": 0.0316, | |
| "num_tokens": 2089368.0, | |
| "reward": 0.008248626545537263, | |
| "reward_std": 0.008944235043600202, | |
| "rewards/code_reward/mean": 0.008248626545537263, | |
| "rewards/code_reward/std": 0.008944234810769558, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 59 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 706.5, | |
| "completions/max_terminated_length": 706.5, | |
| "completions/mean_length": 519.03125, | |
| "completions/mean_terminated_length": 519.03125, | |
| "completions/min_length": 317.0, | |
| "completions/min_terminated_length": 317.0, | |
| "epoch": 0.0067161046592976075, | |
| "grad_norm": 0.8925995523113797, | |
| "kl": 0.0953369140625, | |
| "learning_rate": 4.9092320965490365e-06, | |
| "loss": 0.0961, | |
| "num_tokens": 2119905.0, | |
| "reward": 0.11875520087778568, | |
| "reward_std": 0.2019189279526472, | |
| "rewards/code_reward/mean": 0.11875520087778568, | |
| "rewards/code_reward/std": 0.20191893354058266, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 60 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 580.75, | |
| "completions/max_terminated_length": 580.75, | |
| "completions/mean_length": 387.21875, | |
| "completions/mean_terminated_length": 387.21875, | |
| "completions/min_length": 223.5, | |
| "completions/min_terminated_length": 223.5, | |
| "epoch": 0.006828039736952568, | |
| "grad_norm": 0.7379134154137567, | |
| "kl": 0.0865478515625, | |
| "learning_rate": 4.905088979422971e-06, | |
| "loss": 0.0293, | |
| "num_tokens": 2148904.0, | |
| "reward": 0.021313310135155916, | |
| "reward_std": 0.018601362127810717, | |
| "rewards/code_reward/mean": 0.021313310135155916, | |
| "rewards/code_reward/std": 0.018601362593472004, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 61 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 1017.5, | |
| "completions/max_terminated_length": 684.25, | |
| "completions/mean_length": 573.875, | |
| "completions/mean_terminated_length": 528.5982208251953, | |
| "completions/min_length": 326.25, | |
| "completions/min_terminated_length": 326.25, | |
| "epoch": 0.006939974814607528, | |
| "grad_norm": 0.8052241518254897, | |
| "kl": 0.0723876953125, | |
| "learning_rate": 4.900855439079536e-06, | |
| "loss": 0.0677, | |
| "num_tokens": 2190676.0, | |
| "reward": 0.055714288260787725, | |
| "reward_std": 0.04886896489188075, | |
| "rewards/code_reward/mean": 0.055714288260787725, | |
| "rewards/code_reward/std": 0.048868965124711394, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 62 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 643.75, | |
| "completions/max_terminated_length": 643.75, | |
| "completions/mean_length": 476.25, | |
| "completions/mean_terminated_length": 476.25, | |
| "completions/min_length": 310.5, | |
| "completions/min_terminated_length": 310.5, | |
| "epoch": 0.007051909892262488, | |
| "grad_norm": 0.4680852161414822, | |
| "kl": 0.08984375, | |
| "learning_rate": 4.8965316531496055e-06, | |
| "loss": 0.0257, | |
| "num_tokens": 2221412.0, | |
| "reward": 0.0011574074160307646, | |
| "reward_std": 0.0021431019995361567, | |
| "rewards/code_reward/mean": 0.0011574074160307646, | |
| "rewards/code_reward/std": 0.0021431019995361567, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 63 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 1011.5, | |
| "completions/max_terminated_length": 646.75, | |
| "completions/mean_length": 617.40625, | |
| "completions/mean_terminated_length": 520.71875, | |
| "completions/min_length": 379.75, | |
| "completions/min_terminated_length": 379.75, | |
| "epoch": 0.0071638449699174475, | |
| "grad_norm": 0.8146582859250777, | |
| "kl": 0.0728759765625, | |
| "learning_rate": 4.892117803050578e-06, | |
| "loss": -0.0054, | |
| "num_tokens": 2258025.0, | |
| "reward": 0.2782451882958412, | |
| "reward_std": 0.2655297741293907, | |
| "rewards/code_reward/mean": 0.2782451882958412, | |
| "rewards/code_reward/std": 0.2655297741293907, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 64 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 1142.0, | |
| "completions/max_terminated_length": 750.5, | |
| "completions/mean_length": 635.8125, | |
| "completions/mean_terminated_length": 583.9464340209961, | |
| "completions/min_length": 439.5, | |
| "completions/min_terminated_length": 439.5, | |
| "epoch": 0.007275780047572408, | |
| "grad_norm": 0.6310220375924765, | |
| "kl": 0.07208251953125, | |
| "learning_rate": 4.887614073978761e-06, | |
| "loss": 0.0001, | |
| "num_tokens": 2296595.0, | |
| "reward": 0.08172532171010971, | |
| "reward_std": 0.08684739097952843, | |
| "rewards/code_reward/mean": 0.08172532171010971, | |
| "rewards/code_reward/std": 0.08684739866293967, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 65 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 660.25, | |
| "completions/max_terminated_length": 660.25, | |
| "completions/mean_length": 459.875, | |
| "completions/mean_terminated_length": 459.875, | |
| "completions/min_length": 291.5, | |
| "completions/min_terminated_length": 291.5, | |
| "epoch": 0.007387715125227368, | |
| "grad_norm": 0.7464732599550883, | |
| "kl": 0.10107421875, | |
| "learning_rate": 4.883020654901609e-06, | |
| "loss": -0.0305, | |
| "num_tokens": 2331455.0, | |
| "reward": 0.15218659490346909, | |
| "reward_std": 0.10367358289659023, | |
| "rewards/code_reward/mean": 0.15218659490346909, | |
| "rewards/code_reward/std": 0.10367358289659023, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 66 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 657.5, | |
| "completions/max_terminated_length": 657.5, | |
| "completions/mean_length": 434.59375, | |
| "completions/mean_terminated_length": 434.59375, | |
| "completions/min_length": 252.75, | |
| "completions/min_terminated_length": 252.75, | |
| "epoch": 0.007499650202882328, | |
| "grad_norm": 0.9414797694529051, | |
| "kl": 0.112060546875, | |
| "learning_rate": 4.878337738549785e-06, | |
| "loss": 0.0316, | |
| "num_tokens": 2356986.0, | |
| "reward": 0.10024832468479872, | |
| "reward_std": 0.18917413474991918, | |
| "rewards/code_reward/mean": 0.10024832468479872, | |
| "rewards/code_reward/std": 0.18917413474991918, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 67 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 927.25, | |
| "completions/max_terminated_length": 927.25, | |
| "completions/mean_length": 541.59375, | |
| "completions/mean_terminated_length": 541.59375, | |
| "completions/min_length": 374.0, | |
| "completions/min_terminated_length": 374.0, | |
| "epoch": 0.007611585280537288, | |
| "grad_norm": 0.9428500397053137, | |
| "kl": 0.09423828125, | |
| "learning_rate": 4.873565521409082e-06, | |
| "loss": 0.0535, | |
| "num_tokens": 2388693.0, | |
| "reward": 0.13753276504576206, | |
| "reward_std": 0.11505712405778468, | |
| "rewards/code_reward/mean": 0.13753276504576206, | |
| "rewards/code_reward/std": 0.11505712429061532, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 68 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 1034.75, | |
| "completions/max_terminated_length": 878.0, | |
| "completions/mean_length": 647.96875, | |
| "completions/mean_terminated_length": 614.2187652587891, | |
| "completions/min_length": 344.75, | |
| "completions/min_terminated_length": 344.75, | |
| "epoch": 0.007723520358192249, | |
| "grad_norm": 0.6413922701473375, | |
| "kl": 0.0836181640625, | |
| "learning_rate": 4.868704203712173e-06, | |
| "loss": 0.0367, | |
| "num_tokens": 2430900.0, | |
| "reward": 0.02671811357140541, | |
| "reward_std": 0.036580079700797796, | |
| "rewards/code_reward/mean": 0.02671811357140541, | |
| "rewards/code_reward/std": 0.03658008202910423, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 69 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 733.25, | |
| "completions/max_terminated_length": 733.25, | |
| "completions/mean_length": 464.53125, | |
| "completions/mean_terminated_length": 464.53125, | |
| "completions/min_length": 264.25, | |
| "completions/min_terminated_length": 264.25, | |
| "epoch": 0.007835455435847209, | |
| "grad_norm": 0.9506305895466796, | |
| "kl": 0.12646484375, | |
| "learning_rate": 4.86375398943021e-06, | |
| "loss": 0.0209, | |
| "num_tokens": 2466109.0, | |
| "reward": 0.10709889512509108, | |
| "reward_std": 0.14360995404422283, | |
| "rewards/code_reward/mean": 0.10709889512509108, | |
| "rewards/code_reward/std": 0.14360996149480343, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 70 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 498.5, | |
| "completions/max_terminated_length": 498.5, | |
| "completions/mean_length": 333.0, | |
| "completions/mean_terminated_length": 333.0, | |
| "completions/min_length": 150.75, | |
| "completions/min_terminated_length": 150.75, | |
| "epoch": 0.007947390513502168, | |
| "grad_norm": 0.9998509369749687, | |
| "kl": 0.118896484375, | |
| "learning_rate": 4.858715086264274e-06, | |
| "loss": 0.078, | |
| "num_tokens": 2489565.0, | |
| "reward": 0.08216492831707001, | |
| "reward_std": 0.06686047837138176, | |
| "rewards/code_reward/mean": 0.08216492831707001, | |
| "rewards/code_reward/std": 0.06686047837138176, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 71 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 539.5, | |
| "completions/max_terminated_length": 539.5, | |
| "completions/mean_length": 371.84375, | |
| "completions/mean_terminated_length": 371.84375, | |
| "completions/min_length": 243.0, | |
| "completions/min_terminated_length": 243.0, | |
| "epoch": 0.00805932559115713, | |
| "grad_norm": 0.8636085966435983, | |
| "kl": 0.1112060546875, | |
| "learning_rate": 4.853587705636646e-06, | |
| "loss": 0.0659, | |
| "num_tokens": 2518496.0, | |
| "reward": 0.3326955884695053, | |
| "reward_std": 0.1749400496482849, | |
| "rewards/code_reward/mean": 0.3326955884695053, | |
| "rewards/code_reward/std": 0.17494005151093006, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 72 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 421.25, | |
| "completions/max_terminated_length": 421.25, | |
| "completions/mean_length": 293.46875, | |
| "completions/mean_terminated_length": 293.46875, | |
| "completions/min_length": 154.5, | |
| "completions/min_terminated_length": 154.5, | |
| "epoch": 0.008171260668812089, | |
| "grad_norm": 0.8623456501670774, | |
| "kl": 0.1112060546875, | |
| "learning_rate": 4.84837206268195e-06, | |
| "loss": -0.0657, | |
| "num_tokens": 2541951.0, | |
| "reward": 0.41282894741743803, | |
| "reward_std": 0.14345367066562176, | |
| "rewards/code_reward/mean": 0.41282894741743803, | |
| "rewards/code_reward/std": 0.14345368463546038, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 73 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 546.75, | |
| "completions/max_terminated_length": 546.75, | |
| "completions/mean_length": 413.25, | |
| "completions/mean_terminated_length": 413.25, | |
| "completions/min_length": 245.0, | |
| "completions/min_terminated_length": 245.0, | |
| "epoch": 0.00828319574646705, | |
| "grad_norm": 0.6709675033187479, | |
| "kl": 0.1011962890625, | |
| "learning_rate": 4.8430683762381195e-06, | |
| "loss": 0.0792, | |
| "num_tokens": 2571935.0, | |
| "reward": 0.09283980540931225, | |
| "reward_std": 0.18030504882335663, | |
| "rewards/code_reward/mean": 0.09283980540931225, | |
| "rewards/code_reward/std": 0.18030504882335663, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 74 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 701.5, | |
| "completions/max_terminated_length": 701.5, | |
| "completions/mean_length": 471.4375, | |
| "completions/mean_terminated_length": 471.4375, | |
| "completions/min_length": 277.75, | |
| "completions/min_terminated_length": 277.75, | |
| "epoch": 0.008395130824122009, | |
| "grad_norm": 0.9648853970330805, | |
| "kl": 0.1002197265625, | |
| "learning_rate": 4.837676868837213e-06, | |
| "loss": 0.0101, | |
| "num_tokens": 2603869.0, | |
| "reward": 0.09730057418346405, | |
| "reward_std": 0.08919950015842915, | |
| "rewards/code_reward/mean": 0.09730057418346405, | |
| "rewards/code_reward/std": 0.08919950760900974, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 75 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 705.5, | |
| "completions/max_terminated_length": 705.5, | |
| "completions/mean_length": 496.84375, | |
| "completions/mean_terminated_length": 496.84375, | |
| "completions/min_length": 335.25, | |
| "completions/min_terminated_length": 335.25, | |
| "epoch": 0.00850706590177697, | |
| "grad_norm": 1.0521374237347803, | |
| "kl": 0.1251220703125, | |
| "learning_rate": 4.832197766696085e-06, | |
| "loss": 0.0548, | |
| "num_tokens": 2632288.0, | |
| "reward": 0.18549207970499992, | |
| "reward_std": 0.23195349983870983, | |
| "rewards/code_reward/mean": 0.18549207970499992, | |
| "rewards/code_reward/std": 0.23195349797606468, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 76 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 760.0, | |
| "completions/max_terminated_length": 760.0, | |
| "completions/mean_length": 525.625, | |
| "completions/mean_terminated_length": 525.625, | |
| "completions/min_length": 347.5, | |
| "completions/min_terminated_length": 347.5, | |
| "epoch": 0.00861900097943193, | |
| "grad_norm": 0.025045228642194366, | |
| "kl": 0.1104736328125, | |
| "learning_rate": 4.826631299706887e-06, | |
| "loss": 0.0011, | |
| "num_tokens": 2669812.0, | |
| "reward": 0.0, | |
| "reward_std": 0.0, | |
| "rewards/code_reward/mean": 0.0, | |
| "rewards/code_reward/std": 0.0, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 77 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 583.25, | |
| "completions/max_terminated_length": 583.25, | |
| "completions/mean_length": 442.46875, | |
| "completions/mean_terminated_length": 442.46875, | |
| "completions/min_length": 208.5, | |
| "completions/min_terminated_length": 208.5, | |
| "epoch": 0.008730936057086889, | |
| "grad_norm": 1.1367736845257632, | |
| "kl": 0.1265869140625, | |
| "learning_rate": 4.820977701427424e-06, | |
| "loss": 0.0331, | |
| "num_tokens": 2700491.0, | |
| "reward": 0.047237071208655834, | |
| "reward_std": 0.10418419446796179, | |
| "rewards/code_reward/mean": 0.047237071208655834, | |
| "rewards/code_reward/std": 0.10418420331552625, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 78 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 689.5, | |
| "completions/max_terminated_length": 689.5, | |
| "completions/mean_length": 505.6875, | |
| "completions/mean_terminated_length": 505.6875, | |
| "completions/min_length": 330.25, | |
| "completions/min_terminated_length": 330.25, | |
| "epoch": 0.00884287113474185, | |
| "grad_norm": 0.5193528707901873, | |
| "kl": 0.103515625, | |
| "learning_rate": 4.81523720907136e-06, | |
| "loss": 0.0032, | |
| "num_tokens": 2734425.0, | |
| "reward": 0.04880136996507645, | |
| "reward_std": 0.053204361349344254, | |
| "rewards/code_reward/mean": 0.04880136996507645, | |
| "rewards/code_reward/std": 0.053204361349344254, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 79 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 586.5, | |
| "completions/max_terminated_length": 586.5, | |
| "completions/mean_length": 443.75, | |
| "completions/mean_terminated_length": 443.75, | |
| "completions/min_length": 276.25, | |
| "completions/min_terminated_length": 276.25, | |
| "epoch": 0.00895480621239681, | |
| "grad_norm": 1.0427847884848562, | |
| "kl": 0.1318359375, | |
| "learning_rate": 4.809410063498254e-06, | |
| "loss": 0.0817, | |
| "num_tokens": 2765545.0, | |
| "reward": 0.2281582325231284, | |
| "reward_std": 0.21775340917520225, | |
| "rewards/code_reward/mean": 0.2281582325231284, | |
| "rewards/code_reward/std": 0.21775341662578285, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 80 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 1028.0, | |
| "completions/max_terminated_length": 725.0, | |
| "completions/mean_length": 588.875, | |
| "completions/mean_terminated_length": 541.5669708251953, | |
| "completions/min_length": 409.75, | |
| "completions/min_terminated_length": 409.75, | |
| "epoch": 0.00906674129005177, | |
| "grad_norm": 0.4518948869491964, | |
| "kl": 0.1085205078125, | |
| "learning_rate": 4.8034965092034656e-06, | |
| "loss": -0.0053, | |
| "num_tokens": 2810309.0, | |
| "reward": 0.06289062649011612, | |
| "reward_std": 0.0699656680226326, | |
| "rewards/code_reward/mean": 0.06289062649011612, | |
| "rewards/code_reward/std": 0.0699656680226326, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 81 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 656.0, | |
| "completions/max_terminated_length": 656.0, | |
| "completions/mean_length": 440.5, | |
| "completions/mean_terminated_length": 440.5, | |
| "completions/min_length": 277.5, | |
| "completions/min_terminated_length": 277.5, | |
| "epoch": 0.00917867636770673, | |
| "grad_norm": 0.8869624498321675, | |
| "kl": 0.1461181640625, | |
| "learning_rate": 4.797496794307889e-06, | |
| "loss": 0.0534, | |
| "num_tokens": 2842813.0, | |
| "reward": 0.010840552393347025, | |
| "reward_std": 0.02040189504623413, | |
| "rewards/code_reward/mean": 0.010840552393347025, | |
| "rewards/code_reward/std": 0.02040189504623413, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 82 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 715.5, | |
| "completions/max_terminated_length": 715.5, | |
| "completions/mean_length": 515.71875, | |
| "completions/mean_terminated_length": 515.71875, | |
| "completions/min_length": 315.5, | |
| "completions/min_terminated_length": 315.5, | |
| "epoch": 0.009290611445361691, | |
| "grad_norm": 0.8955728645359476, | |
| "kl": 0.1201171875, | |
| "learning_rate": 4.791411170547545e-06, | |
| "loss": -0.0048, | |
| "num_tokens": 2878988.0, | |
| "reward": 0.0949601458851248, | |
| "reward_std": 0.043580688536167145, | |
| "rewards/code_reward/mean": 0.0949601458851248, | |
| "rewards/code_reward/std": 0.043580688536167145, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 83 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 712.75, | |
| "completions/max_terminated_length": 712.75, | |
| "completions/mean_length": 503.875, | |
| "completions/mean_terminated_length": 503.875, | |
| "completions/min_length": 348.0, | |
| "completions/min_terminated_length": 348.0, | |
| "epoch": 0.00940254652301665, | |
| "grad_norm": 0.5731316506045098, | |
| "kl": 0.10693359375, | |
| "learning_rate": 4.785239893263017e-06, | |
| "loss": 0.0657, | |
| "num_tokens": 2916960.0, | |
| "reward": 0.055458965012803674, | |
| "reward_std": 0.06723660067655146, | |
| "rewards/code_reward/mean": 0.055458965012803674, | |
| "rewards/code_reward/std": 0.06723660079296678, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 84 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 815.25, | |
| "completions/max_terminated_length": 511.75, | |
| "completions/mean_length": 416.78125, | |
| "completions/mean_terminated_length": 372.52679443359375, | |
| "completions/min_length": 259.5, | |
| "completions/min_terminated_length": 259.5, | |
| "epoch": 0.00951448160067161, | |
| "grad_norm": 1.1451584135613004, | |
| "kl": 0.138671875, | |
| "learning_rate": 4.778983221388742e-06, | |
| "loss": 0.0337, | |
| "num_tokens": 2944785.0, | |
| "reward": 0.020502878935076296, | |
| "reward_std": 0.0165937872370705, | |
| "rewards/code_reward/mean": 0.020502878935076296, | |
| "rewards/code_reward/std": 0.0165937872370705, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 85 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 617.25, | |
| "completions/max_terminated_length": 617.25, | |
| "completions/mean_length": 446.9375, | |
| "completions/mean_terminated_length": 446.9375, | |
| "completions/min_length": 280.5, | |
| "completions/min_terminated_length": 280.5, | |
| "epoch": 0.00962641667832657, | |
| "grad_norm": 0.6695429277037601, | |
| "kl": 0.143798828125, | |
| "learning_rate": 4.77264141744214e-06, | |
| "loss": 0.0076, | |
| "num_tokens": 2976815.0, | |
| "reward": 0.012867647223174572, | |
| "reward_std": 0.027845492586493492, | |
| "rewards/code_reward/mean": 0.012867647223174572, | |
| "rewards/code_reward/std": 0.02784549444913864, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 86 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 511.75, | |
| "completions/max_terminated_length": 511.75, | |
| "completions/mean_length": 362.0625, | |
| "completions/mean_terminated_length": 362.0625, | |
| "completions/min_length": 138.0, | |
| "completions/min_terminated_length": 138.0, | |
| "epoch": 0.00973835175598153, | |
| "grad_norm": 1.0671674919636422, | |
| "kl": 0.11083984375, | |
| "learning_rate": 4.766214747512603e-06, | |
| "loss": 0.0937, | |
| "num_tokens": 2998881.0, | |
| "reward": 0.26682692021131516, | |
| "reward_std": 0.16957121342420578, | |
| "rewards/code_reward/mean": 0.26682692021131516, | |
| "rewards/code_reward/std": 0.16957121714949608, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 87 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 641.0, | |
| "completions/max_terminated_length": 641.0, | |
| "completions/mean_length": 465.0, | |
| "completions/mean_terminated_length": 465.0, | |
| "completions/min_length": 320.25, | |
| "completions/min_terminated_length": 320.25, | |
| "epoch": 0.009850286833636491, | |
| "grad_norm": 1.0133211477792508, | |
| "kl": 0.1400146484375, | |
| "learning_rate": 4.759703481250331e-06, | |
| "loss": 0.0317, | |
| "num_tokens": 3034097.0, | |
| "reward": 0.07334506892948411, | |
| "reward_std": 0.11912691406905651, | |
| "rewards/code_reward/mean": 0.07334506892948411, | |
| "rewards/code_reward/std": 0.11912691593170166, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 88 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 448.0, | |
| "completions/max_terminated_length": 448.0, | |
| "completions/mean_length": 289.90625, | |
| "completions/mean_terminated_length": 289.90625, | |
| "completions/min_length": 127.0, | |
| "completions/min_terminated_length": 127.0, | |
| "epoch": 0.00996222191129145, | |
| "grad_norm": 1.0515861072357229, | |
| "kl": 0.1114501953125, | |
| "learning_rate": 4.753107891855015e-06, | |
| "loss": -0.0234, | |
| "num_tokens": 3056046.0, | |
| "reward": 0.28452102770097554, | |
| "reward_std": 0.07553892768919468, | |
| "rewards/code_reward/mean": 0.28452102770097554, | |
| "rewards/code_reward/std": 0.07553892862051725, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 89 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 519.75, | |
| "completions/max_terminated_length": 519.75, | |
| "completions/mean_length": 333.6875, | |
| "completions/mean_terminated_length": 333.6875, | |
| "completions/min_length": 148.0, | |
| "completions/min_terminated_length": 148.0, | |
| "epoch": 0.010074156988946412, | |
| "grad_norm": 0.8537837657218319, | |
| "kl": 0.1558837890625, | |
| "learning_rate": 4.746428256064375e-06, | |
| "loss": 0.0327, | |
| "num_tokens": 3079588.0, | |
| "reward": 0.14791105315089226, | |
| "reward_std": 0.09374275244772434, | |
| "rewards/code_reward/mean": 0.14791105315089226, | |
| "rewards/code_reward/std": 0.09374275989830494, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 90 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 498.5, | |
| "completions/max_terminated_length": 498.5, | |
| "completions/mean_length": 349.25, | |
| "completions/mean_terminated_length": 349.25, | |
| "completions/min_length": 185.5, | |
| "completions/min_terminated_length": 185.5, | |
| "epoch": 0.010186092066601371, | |
| "grad_norm": 1.1628188809572875, | |
| "kl": 0.16552734375, | |
| "learning_rate": 4.7396648541425534e-06, | |
| "loss": 0.071, | |
| "num_tokens": 3108756.0, | |
| "reward": 0.08873509289696813, | |
| "reward_std": 0.11791448388248682, | |
| "rewards/code_reward/mean": 0.08873509289696813, | |
| "rewards/code_reward/std": 0.11791448295116425, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 91 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 591.75, | |
| "completions/max_terminated_length": 591.75, | |
| "completions/mean_length": 385.59375, | |
| "completions/mean_terminated_length": 385.59375, | |
| "completions/min_length": 168.5, | |
| "completions/min_terminated_length": 168.5, | |
| "epoch": 0.010298027144256332, | |
| "grad_norm": 1.0836904121924078, | |
| "kl": 0.140380859375, | |
| "learning_rate": 4.732817969868348e-06, | |
| "loss": -0.0549, | |
| "num_tokens": 3141759.0, | |
| "reward": 0.1353156054392457, | |
| "reward_std": 0.0778092760592699, | |
| "rewards/code_reward/mean": 0.1353156054392457, | |
| "rewards/code_reward/std": 0.07780927885323763, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 92 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 555.0, | |
| "completions/max_terminated_length": 555.0, | |
| "completions/mean_length": 406.0, | |
| "completions/mean_terminated_length": 406.0, | |
| "completions/min_length": 236.75, | |
| "completions/min_terminated_length": 236.75, | |
| "epoch": 0.010409962221911291, | |
| "grad_norm": 0.9641885111816164, | |
| "kl": 0.142333984375, | |
| "learning_rate": 4.7258878905233095e-06, | |
| "loss": -0.0439, | |
| "num_tokens": 3167615.0, | |
| "reward": 0.22870281734503806, | |
| "reward_std": 0.08901303343009204, | |
| "rewards/code_reward/mean": 0.22870281734503806, | |
| "rewards/code_reward/std": 0.08901303354650736, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 93 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 550.25, | |
| "completions/max_terminated_length": 550.25, | |
| "completions/mean_length": 400.0, | |
| "completions/mean_terminated_length": 400.0, | |
| "completions/min_length": 247.5, | |
| "completions/min_terminated_length": 247.5, | |
| "epoch": 0.01052189729956625, | |
| "grad_norm": 0.6901557941958711, | |
| "kl": 0.139892578125, | |
| "learning_rate": 4.718874906879688e-06, | |
| "loss": 0.0551, | |
| "num_tokens": 3198319.0, | |
| "reward": 0.05615717824548483, | |
| "reward_std": 0.09094760753214359, | |
| "rewards/code_reward/mean": 0.05615717824548483, | |
| "rewards/code_reward/std": 0.09094761684536934, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 94 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 546.0, | |
| "completions/max_terminated_length": 546.0, | |
| "completions/mean_length": 405.3125, | |
| "completions/mean_terminated_length": 405.3125, | |
| "completions/min_length": 270.0, | |
| "completions/min_terminated_length": 270.0, | |
| "epoch": 0.010633832377221212, | |
| "grad_norm": 1.1099151469644146, | |
| "kl": 0.1552734375, | |
| "learning_rate": 4.711779313188231e-06, | |
| "loss": 0.0123, | |
| "num_tokens": 3232577.0, | |
| "reward": 0.1259501683525741, | |
| "reward_std": 0.10068924725055695, | |
| "rewards/code_reward/mean": 0.1259501683525741, | |
| "rewards/code_reward/std": 0.10068925376981497, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 95 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 447.75, | |
| "completions/max_terminated_length": 447.75, | |
| "completions/mean_length": 318.53125, | |
| "completions/mean_terminated_length": 318.53125, | |
| "completions/min_length": 214.25, | |
| "completions/min_terminated_length": 214.25, | |
| "epoch": 0.010745767454876171, | |
| "grad_norm": 0.9797800882072801, | |
| "kl": 0.160888671875, | |
| "learning_rate": 4.70460140716584e-06, | |
| "loss": 0.0019, | |
| "num_tokens": 3260770.0, | |
| "reward": 0.15165849681943655, | |
| "reward_std": 0.020956640131771564, | |
| "rewards/code_reward/mean": 0.15165849681943655, | |
| "rewards/code_reward/std": 0.020956639666110277, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 96 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 615.25, | |
| "completions/max_terminated_length": 615.25, | |
| "completions/mean_length": 405.21875, | |
| "completions/mean_terminated_length": 405.21875, | |
| "completions/min_length": 257.5, | |
| "completions/min_terminated_length": 257.5, | |
| "epoch": 0.010857702532531132, | |
| "grad_norm": 0.8843772526949033, | |
| "kl": 0.1278076171875, | |
| "learning_rate": 4.697341489983076e-06, | |
| "loss": -0.0262, | |
| "num_tokens": 3292233.0, | |
| "reward": 0.22608212963677943, | |
| "reward_std": 0.17254789546132088, | |
| "rewards/code_reward/mean": 0.22608212963677943, | |
| "rewards/code_reward/std": 0.17254790337756276, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 97 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 630.0, | |
| "completions/max_terminated_length": 630.0, | |
| "completions/mean_length": 432.28125, | |
| "completions/mean_terminated_length": 432.28125, | |
| "completions/min_length": 277.75, | |
| "completions/min_terminated_length": 277.75, | |
| "epoch": 0.010969637610186092, | |
| "grad_norm": 0.7844902602771369, | |
| "kl": 0.14306640625, | |
| "learning_rate": 4.6899998662515215e-06, | |
| "loss": -0.0089, | |
| "num_tokens": 3325538.0, | |
| "reward": 0.034402412828058004, | |
| "reward_std": 0.08192514721304178, | |
| "rewards/code_reward/mean": 0.034402412828058004, | |
| "rewards/code_reward/std": 0.08192514767870307, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 98 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 498.25, | |
| "completions/max_terminated_length": 498.25, | |
| "completions/mean_length": 350.1875, | |
| "completions/mean_terminated_length": 350.1875, | |
| "completions/min_length": 214.5, | |
| "completions/min_terminated_length": 214.5, | |
| "epoch": 0.011081572687841053, | |
| "grad_norm": 1.053730221138067, | |
| "kl": 0.1533203125, | |
| "learning_rate": 4.682576844011007e-06, | |
| "loss": 0.011, | |
| "num_tokens": 3360072.0, | |
| "reward": 0.03388687747064978, | |
| "reward_std": 0.09327089437283576, | |
| "rewards/code_reward/mean": 0.03388687747064978, | |
| "rewards/code_reward/std": 0.09327089437283576, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 99 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 522.5, | |
| "completions/max_terminated_length": 522.5, | |
| "completions/mean_length": 355.53125, | |
| "completions/mean_terminated_length": 355.53125, | |
| "completions/min_length": 199.0, | |
| "completions/min_terminated_length": 199.0, | |
| "epoch": 0.011193507765496012, | |
| "grad_norm": 1.1649821726930594, | |
| "kl": 0.138671875, | |
| "learning_rate": 4.675072734716678e-06, | |
| "loss": 0.0495, | |
| "num_tokens": 3386449.0, | |
| "reward": 0.2362132353009656, | |
| "reward_std": 0.2431453033350408, | |
| "rewards/code_reward/mean": 0.2362132353009656, | |
| "rewards/code_reward/std": 0.2431453038007021, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 100 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 550.25, | |
| "completions/max_terminated_length": 550.25, | |
| "completions/mean_length": 408.90625, | |
| "completions/mean_terminated_length": 408.90625, | |
| "completions/min_length": 278.25, | |
| "completions/min_terminated_length": 278.25, | |
| "epoch": 0.011305442843150973, | |
| "grad_norm": 1.0129951041942997, | |
| "kl": 0.1279296875, | |
| "learning_rate": 4.667487853225931e-06, | |
| "loss": 0.0108, | |
| "num_tokens": 3415838.0, | |
| "reward": 0.11401251330971718, | |
| "reward_std": 0.17430034466087818, | |
| "rewards/code_reward/mean": 0.11401251330971718, | |
| "rewards/code_reward/std": 0.17430034838616848, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 101 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 497.5, | |
| "completions/max_terminated_length": 497.5, | |
| "completions/mean_length": 321.125, | |
| "completions/mean_terminated_length": 321.125, | |
| "completions/min_length": 112.75, | |
| "completions/min_terminated_length": 112.75, | |
| "epoch": 0.011417377920805933, | |
| "grad_norm": 0.046761590657522806, | |
| "kl": 0.1375732421875, | |
| "learning_rate": 4.659822517785203e-06, | |
| "loss": 0.0014, | |
| "num_tokens": 3437298.0, | |
| "reward": 0.0, | |
| "reward_std": 0.0, | |
| "rewards/code_reward/mean": 0.0, | |
| "rewards/code_reward/std": 0.0, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 102 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 1112.5, | |
| "completions/max_terminated_length": 826.0, | |
| "completions/mean_length": 582.5, | |
| "completions/mean_terminated_length": 539.5535736083984, | |
| "completions/min_length": 330.75, | |
| "completions/min_terminated_length": 330.75, | |
| "epoch": 0.011529312998460892, | |
| "grad_norm": 0.936453543752835, | |
| "kl": 0.1123046875, | |
| "learning_rate": 4.6520770500166165e-06, | |
| "loss": -0.0705, | |
| "num_tokens": 3476618.0, | |
| "reward": 0.11124547757208347, | |
| "reward_std": 0.17285121232271194, | |
| "rewards/code_reward/mean": 0.11124547757208347, | |
| "rewards/code_reward/std": 0.17285121977329254, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 103 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 504.0, | |
| "completions/max_terminated_length": 504.0, | |
| "completions/mean_length": 329.78125, | |
| "completions/mean_terminated_length": 329.78125, | |
| "completions/min_length": 99.0, | |
| "completions/min_terminated_length": 99.0, | |
| "epoch": 0.011641248076115853, | |
| "grad_norm": 0.9390473948263904, | |
| "kl": 0.153564453125, | |
| "learning_rate": 4.644251774904487e-06, | |
| "loss": -0.1439, | |
| "num_tokens": 3504803.0, | |
| "reward": 0.15121639240533113, | |
| "reward_std": 0.10038218321278691, | |
| "rewards/code_reward/mean": 0.15121639240533113, | |
| "rewards/code_reward/std": 0.10038218321278691, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 104 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 631.0, | |
| "completions/max_terminated_length": 631.0, | |
| "completions/mean_length": 424.125, | |
| "completions/mean_terminated_length": 424.125, | |
| "completions/min_length": 292.75, | |
| "completions/min_terminated_length": 292.75, | |
| "epoch": 0.011753183153770812, | |
| "grad_norm": 1.2049832685071231, | |
| "kl": 0.167236328125, | |
| "learning_rate": 4.636347020781684e-06, | |
| "loss": -0.0759, | |
| "num_tokens": 3539471.0, | |
| "reward": 0.07833968009799719, | |
| "reward_std": 0.13395367993507534, | |
| "rewards/code_reward/mean": 0.07833968009799719, | |
| "rewards/code_reward/std": 0.13395368051715195, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 105 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 706.0, | |
| "completions/max_terminated_length": 706.0, | |
| "completions/mean_length": 419.71875, | |
| "completions/mean_terminated_length": 419.71875, | |
| "completions/min_length": 227.5, | |
| "completions/min_terminated_length": 227.5, | |
| "epoch": 0.011865118231425774, | |
| "grad_norm": 0.8526628663219907, | |
| "kl": 0.146240234375, | |
| "learning_rate": 4.6283631193158605e-06, | |
| "loss": 0.039, | |
| "num_tokens": 3576414.0, | |
| "reward": 0.12546709179878235, | |
| "reward_std": 0.13694094121456146, | |
| "rewards/code_reward/mean": 0.12546709179878235, | |
| "rewards/code_reward/std": 0.13694094866514206, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 106 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 574.5, | |
| "completions/max_terminated_length": 574.5, | |
| "completions/mean_length": 430.21875, | |
| "completions/mean_terminated_length": 430.21875, | |
| "completions/min_length": 278.0, | |
| "completions/min_terminated_length": 278.0, | |
| "epoch": 0.011977053309080733, | |
| "grad_norm": 1.1258481541663952, | |
| "kl": 0.1580810546875, | |
| "learning_rate": 4.620300405495532e-06, | |
| "loss": -0.0443, | |
| "num_tokens": 3613485.0, | |
| "reward": 0.11382943368516862, | |
| "reward_std": 0.1571302842348814, | |
| "rewards/code_reward/mean": 0.11382943368516862, | |
| "rewards/code_reward/std": 0.1571302842348814, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 107 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 879.0, | |
| "completions/max_terminated_length": 879.0, | |
| "completions/mean_length": 514.90625, | |
| "completions/mean_terminated_length": 514.90625, | |
| "completions/min_length": 229.75, | |
| "completions/min_terminated_length": 229.75, | |
| "epoch": 0.012088988386735694, | |
| "grad_norm": 0.8215610976673118, | |
| "kl": 0.1373291015625, | |
| "learning_rate": 4.612159217616022e-06, | |
| "loss": -0.0201, | |
| "num_tokens": 3648370.0, | |
| "reward": 0.2085580751299858, | |
| "reward_std": 0.1858917400240898, | |
| "rewards/code_reward/mean": 0.2085580751299858, | |
| "rewards/code_reward/std": 0.185891754925251, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 108 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 700.25, | |
| "completions/max_terminated_length": 700.25, | |
| "completions/mean_length": 498.0625, | |
| "completions/mean_terminated_length": 498.0625, | |
| "completions/min_length": 314.25, | |
| "completions/min_terminated_length": 314.25, | |
| "epoch": 0.012200923464390653, | |
| "grad_norm": 1.0580602071827545, | |
| "kl": 0.15966796875, | |
| "learning_rate": 4.603939897265268e-06, | |
| "loss": 0.0414, | |
| "num_tokens": 3684604.0, | |
| "reward": 0.08671755698742345, | |
| "reward_std": 0.06942056433763355, | |
| "rewards/code_reward/mean": 0.08671755698742345, | |
| "rewards/code_reward/std": 0.06942057458218187, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 109 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 1063.25, | |
| "completions/max_terminated_length": 766.75, | |
| "completions/mean_length": 572.75, | |
| "completions/mean_terminated_length": 522.6160736083984, | |
| "completions/min_length": 313.75, | |
| "completions/min_terminated_length": 313.75, | |
| "epoch": 0.012312858542045613, | |
| "grad_norm": 0.6888409929879039, | |
| "kl": 0.118896484375, | |
| "learning_rate": 4.595642789309492e-06, | |
| "loss": -0.246, | |
| "num_tokens": 3720668.0, | |
| "reward": 0.10784313827753067, | |
| "reward_std": 0.164918415248394, | |
| "rewards/code_reward/mean": 0.10784313827753067, | |
| "rewards/code_reward/std": 0.164918415248394, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 110 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 660.5, | |
| "completions/max_terminated_length": 660.5, | |
| "completions/mean_length": 428.15625, | |
| "completions/mean_terminated_length": 428.15625, | |
| "completions/min_length": 161.0, | |
| "completions/min_terminated_length": 161.0, | |
| "epoch": 0.012424793619700574, | |
| "grad_norm": 0.6151737403026374, | |
| "kl": 0.153564453125, | |
| "learning_rate": 4.587268241878724e-06, | |
| "loss": 0.0941, | |
| "num_tokens": 3757241.0, | |
| "reward": 0.0126953125, | |
| "reward_std": 0.03590776585042477, | |
| "rewards/code_reward/mean": 0.0126953125, | |
| "rewards/code_reward/std": 0.03590776678174734, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 111 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 644.25, | |
| "completions/max_terminated_length": 644.25, | |
| "completions/mean_length": 468.65625, | |
| "completions/mean_terminated_length": 468.65625, | |
| "completions/min_length": 265.25, | |
| "completions/min_terminated_length": 265.25, | |
| "epoch": 0.012536728697355533, | |
| "grad_norm": 0.8581479811610659, | |
| "kl": 0.162841796875, | |
| "learning_rate": 4.578816606352205e-06, | |
| "loss": -0.0116, | |
| "num_tokens": 3786094.0, | |
| "reward": 0.10341486724792048, | |
| "reward_std": 0.09727730182930827, | |
| "rewards/code_reward/mean": 0.10341486724792048, | |
| "rewards/code_reward/std": 0.09727730927988887, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 112 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 602.75, | |
| "completions/max_terminated_length": 602.75, | |
| "completions/mean_length": 443.21875, | |
| "completions/mean_terminated_length": 443.21875, | |
| "completions/min_length": 278.25, | |
| "completions/min_terminated_length": 278.25, | |
| "epoch": 0.012648663775010494, | |
| "grad_norm": 1.032518001055115, | |
| "kl": 0.150634765625, | |
| "learning_rate": 4.570288237343632e-06, | |
| "loss": 0.0057, | |
| "num_tokens": 3815197.0, | |
| "reward": 0.19150842766975984, | |
| "reward_std": 0.17536822147667408, | |
| "rewards/code_reward/mean": 0.19150842766975984, | |
| "rewards/code_reward/std": 0.17536823637783527, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 113 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 720.0, | |
| "completions/max_terminated_length": 720.0, | |
| "completions/mean_length": 471.78125, | |
| "completions/mean_terminated_length": 471.78125, | |
| "completions/min_length": 229.75, | |
| "completions/min_terminated_length": 229.75, | |
| "epoch": 0.012760598852665454, | |
| "grad_norm": 0.7883185431993377, | |
| "kl": 0.154296875, | |
| "learning_rate": 4.561683492686289e-06, | |
| "loss": -0.0316, | |
| "num_tokens": 3849462.0, | |
| "reward": 0.15483782812952995, | |
| "reward_std": 0.09077820833772421, | |
| "rewards/code_reward/mean": 0.15483782812952995, | |
| "rewards/code_reward/std": 0.09077820833772421, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 114 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 707.5, | |
| "completions/max_terminated_length": 707.5, | |
| "completions/mean_length": 486.5, | |
| "completions/mean_terminated_length": 486.5, | |
| "completions/min_length": 265.5, | |
| "completions/min_terminated_length": 265.5, | |
| "epoch": 0.012872533930320415, | |
| "grad_norm": 0.8556740089529604, | |
| "kl": 0.157958984375, | |
| "learning_rate": 4.5530027334180285e-06, | |
| "loss": 0.117, | |
| "num_tokens": 3887790.0, | |
| "reward": 0.127931407361757, | |
| "reward_std": 0.11615415895357728, | |
| "rewards/code_reward/mean": 0.127931407361757, | |
| "rewards/code_reward/std": 0.1161541665205732, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 115 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 605.0, | |
| "completions/max_terminated_length": 605.0, | |
| "completions/mean_length": 393.375, | |
| "completions/mean_terminated_length": 393.375, | |
| "completions/min_length": 257.25, | |
| "completions/min_terminated_length": 257.25, | |
| "epoch": 0.012984469007975374, | |
| "grad_norm": 0.7897406376014194, | |
| "kl": 0.19970703125, | |
| "learning_rate": 4.544246323766122e-06, | |
| "loss": 0.0248, | |
| "num_tokens": 3919554.0, | |
| "reward": 0.3160191457718611, | |
| "reward_std": 0.041524797677993774, | |
| "rewards/code_reward/mean": 0.3160191457718611, | |
| "rewards/code_reward/std": 0.041524799540638924, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 116 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 539.25, | |
| "completions/max_terminated_length": 539.25, | |
| "completions/mean_length": 380.90625, | |
| "completions/mean_terminated_length": 380.90625, | |
| "completions/min_length": 217.0, | |
| "completions/min_terminated_length": 217.0, | |
| "epoch": 0.013096404085630335, | |
| "grad_norm": 1.1134080298302058, | |
| "kl": 0.165771484375, | |
| "learning_rate": 4.535414631131983e-06, | |
| "loss": -0.0078, | |
| "num_tokens": 3944911.0, | |
| "reward": 0.24838980130152777, | |
| "reward_std": 0.09577816107776016, | |
| "rewards/code_reward/mean": 0.24838980130152777, | |
| "rewards/code_reward/std": 0.09577816678211093, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 117 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 570.5, | |
| "completions/max_terminated_length": 570.5, | |
| "completions/mean_length": 429.4375, | |
| "completions/mean_terminated_length": 429.4375, | |
| "completions/min_length": 246.0, | |
| "completions/min_terminated_length": 246.0, | |
| "epoch": 0.013208339163285295, | |
| "grad_norm": 0.9495608914425865, | |
| "kl": 0.1414794921875, | |
| "learning_rate": 4.526508026075746e-06, | |
| "loss": 0.0082, | |
| "num_tokens": 3972925.0, | |
| "reward": 0.14121240563690662, | |
| "reward_std": 0.18964817747473717, | |
| "rewards/code_reward/mean": 0.14121240563690662, | |
| "rewards/code_reward/std": 0.18964817561209202, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 118 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 786.5, | |
| "completions/max_terminated_length": 786.5, | |
| "completions/mean_length": 586.34375, | |
| "completions/mean_terminated_length": 586.34375, | |
| "completions/min_length": 383.0, | |
| "completions/min_terminated_length": 383.0, | |
| "epoch": 0.013320274240940254, | |
| "grad_norm": 0.02608214090069118, | |
| "kl": 0.1209716796875, | |
| "learning_rate": 4.517526882300721e-06, | |
| "loss": 0.0012, | |
| "num_tokens": 4010480.0, | |
| "reward": 0.0, | |
| "reward_std": 0.0, | |
| "rewards/code_reward/mean": 0.0, | |
| "rewards/code_reward/std": 0.0, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 119 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 593.0, | |
| "completions/max_terminated_length": 593.0, | |
| "completions/mean_length": 434.34375, | |
| "completions/mean_terminated_length": 434.34375, | |
| "completions/min_length": 315.25, | |
| "completions/min_terminated_length": 315.25, | |
| "epoch": 0.013432209318595215, | |
| "grad_norm": 1.1103364222586907, | |
| "kl": 0.16455078125, | |
| "learning_rate": 4.508471576637713e-06, | |
| "loss": 0.0019, | |
| "num_tokens": 4047539.0, | |
| "reward": 0.11785737407626584, | |
| "reward_std": 0.09593676403164864, | |
| "rewards/code_reward/mean": 0.11785737407626584, | |
| "rewards/code_reward/std": 0.09593676414806396, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 120 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 587.5, | |
| "completions/max_terminated_length": 587.5, | |
| "completions/mean_length": 426.9375, | |
| "completions/mean_terminated_length": 426.9375, | |
| "completions/min_length": 296.25, | |
| "completions/min_terminated_length": 296.25, | |
| "epoch": 0.013544144396250174, | |
| "grad_norm": 0.9365073082574228, | |
| "kl": 0.1627197265625, | |
| "learning_rate": 4.499342489029211e-06, | |
| "loss": -0.0644, | |
| "num_tokens": 4073449.0, | |
| "reward": 0.25551173387793824, | |
| "reward_std": 0.1488891058252193, | |
| "rewards/code_reward/mean": 0.25551173387793824, | |
| "rewards/code_reward/std": 0.1488891058252193, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 121 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 777.5, | |
| "completions/max_terminated_length": 777.5, | |
| "completions/mean_length": 561.3125, | |
| "completions/mean_terminated_length": 561.3125, | |
| "completions/min_length": 308.5, | |
| "completions/min_terminated_length": 308.5, | |
| "epoch": 0.013656079473905135, | |
| "grad_norm": 0.9171469588767179, | |
| "kl": 0.16162109375, | |
| "learning_rate": 4.490140002513449e-06, | |
| "loss": 0.0833, | |
| "num_tokens": 4117531.0, | |
| "reward": 0.06771073397248983, | |
| "reward_std": 0.10771879553794861, | |
| "rewards/code_reward/mean": 0.06771073397248983, | |
| "rewards/code_reward/std": 0.1077187992632389, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 122 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 825.0, | |
| "completions/max_terminated_length": 825.0, | |
| "completions/mean_length": 602.0, | |
| "completions/mean_terminated_length": 602.0, | |
| "completions/min_length": 318.0, | |
| "completions/min_terminated_length": 318.0, | |
| "epoch": 0.013768014551560095, | |
| "grad_norm": 0.5430581678238743, | |
| "kl": 0.1229248046875, | |
| "learning_rate": 4.48086450320833e-06, | |
| "loss": 0.0193, | |
| "num_tokens": 4164115.0, | |
| "reward": 0.06402191519737244, | |
| "reward_std": 0.10564571619033813, | |
| "rewards/code_reward/mean": 0.06402191519737244, | |
| "rewards/code_reward/std": 0.10564571805298328, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 123 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 700.75, | |
| "completions/max_terminated_length": 700.75, | |
| "completions/mean_length": 464.5, | |
| "completions/mean_terminated_length": 464.5, | |
| "completions/min_length": 276.25, | |
| "completions/min_terminated_length": 276.25, | |
| "epoch": 0.013879949629215056, | |
| "grad_norm": 0.9177689772341072, | |
| "kl": 0.154296875, | |
| "learning_rate": 4.4715163802952266e-06, | |
| "loss": 0.0239, | |
| "num_tokens": 4192843.0, | |
| "reward": 0.1863182729575783, | |
| "reward_std": 0.09033735934644938, | |
| "rewards/code_reward/mean": 0.1863182729575783, | |
| "rewards/code_reward/std": 0.09033736307173967, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 124 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 752.5, | |
| "completions/max_terminated_length": 752.5, | |
| "completions/mean_length": 489.78125, | |
| "completions/mean_terminated_length": 489.78125, | |
| "completions/min_length": 206.25, | |
| "completions/min_terminated_length": 206.25, | |
| "epoch": 0.013991884706870015, | |
| "grad_norm": 0.8322506450321084, | |
| "kl": 0.202392578125, | |
| "learning_rate": 4.462096026002655e-06, | |
| "loss": 0.0184, | |
| "num_tokens": 4227268.0, | |
| "reward": 0.21419981867074966, | |
| "reward_std": 0.21183521673083305, | |
| "rewards/code_reward/mean": 0.21419981867074966, | |
| "rewards/code_reward/std": 0.21183520928025246, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 125 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 720.0, | |
| "completions/max_terminated_length": 720.0, | |
| "completions/mean_length": 540.03125, | |
| "completions/mean_terminated_length": 540.03125, | |
| "completions/min_length": 284.25, | |
| "completions/min_terminated_length": 284.25, | |
| "epoch": 0.014103819784524976, | |
| "grad_norm": 0.9689144701432464, | |
| "kl": 0.146728515625, | |
| "learning_rate": 4.4526038355898144e-06, | |
| "loss": -0.0308, | |
| "num_tokens": 4261797.0, | |
| "reward": 0.261167012155056, | |
| "reward_std": 0.22630748711526394, | |
| "rewards/code_reward/mean": 0.261167012155056, | |
| "rewards/code_reward/std": 0.22630748711526394, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 126 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 673.75, | |
| "completions/max_terminated_length": 673.75, | |
| "completions/mean_length": 450.34375, | |
| "completions/mean_terminated_length": 450.34375, | |
| "completions/min_length": 210.25, | |
| "completions/min_terminated_length": 210.25, | |
| "epoch": 0.014215754862179936, | |
| "grad_norm": 0.4213790833134132, | |
| "kl": 0.144287109375, | |
| "learning_rate": 4.4430402073300035e-06, | |
| "loss": 0.0292, | |
| "num_tokens": 4290992.0, | |
| "reward": 0.012987012974917889, | |
| "reward_std": 0.013883699662983418, | |
| "rewards/code_reward/mean": 0.012987012974917889, | |
| "rewards/code_reward/std": 0.013883701525628567, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 127 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 664.25, | |
| "completions/max_terminated_length": 664.25, | |
| "completions/mean_length": 453.6875, | |
| "completions/mean_terminated_length": 453.6875, | |
| "completions/min_length": 196.0, | |
| "completions/min_terminated_length": 196.0, | |
| "epoch": 0.014327689939834895, | |
| "grad_norm": 0.8712327296844586, | |
| "kl": 0.145263671875, | |
| "learning_rate": 4.433405542493909e-06, | |
| "loss": -0.0154, | |
| "num_tokens": 4323358.0, | |
| "reward": 0.12838431354612112, | |
| "reward_std": 0.14957262016832829, | |
| "rewards/code_reward/mean": 0.12838431354612112, | |
| "rewards/code_reward/std": 0.14957262203097343, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 128 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 840.75, | |
| "completions/max_terminated_length": 840.75, | |
| "completions/mean_length": 554.40625, | |
| "completions/mean_terminated_length": 554.40625, | |
| "completions/min_length": 317.5, | |
| "completions/min_terminated_length": 317.5, | |
| "epoch": 0.014439625017489856, | |
| "grad_norm": 0.8824605681531097, | |
| "kl": 0.15185546875, | |
| "learning_rate": 4.4237002453327734e-06, | |
| "loss": 0.096, | |
| "num_tokens": 4357363.0, | |
| "reward": 0.22759733814746141, | |
| "reward_std": 0.2646235190331936, | |
| "rewards/code_reward/mean": 0.22759733814746141, | |
| "rewards/code_reward/std": 0.26462352089583874, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 129 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 601.75, | |
| "completions/max_terminated_length": 601.75, | |
| "completions/mean_length": 434.46875, | |
| "completions/mean_terminated_length": 434.46875, | |
| "completions/min_length": 266.0, | |
| "completions/min_terminated_length": 266.0, | |
| "epoch": 0.014551560095144815, | |
| "grad_norm": 1.011223461250626, | |
| "kl": 0.16943359375, | |
| "learning_rate": 4.4139247230614245e-06, | |
| "loss": 0.0213, | |
| "num_tokens": 4390298.0, | |
| "reward": 0.17828914523124695, | |
| "reward_std": 0.2471884172409773, | |
| "rewards/code_reward/mean": 0.17828914523124695, | |
| "rewards/code_reward/std": 0.24718842469155788, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 130 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 502.25, | |
| "completions/max_terminated_length": 502.25, | |
| "completions/mean_length": 342.0625, | |
| "completions/mean_terminated_length": 342.0625, | |
| "completions/min_length": 229.75, | |
| "completions/min_terminated_length": 229.75, | |
| "epoch": 0.014663495172799777, | |
| "grad_norm": 0.7287897396776124, | |
| "kl": 0.223876953125, | |
| "learning_rate": 4.404079385841201e-06, | |
| "loss": -0.0213, | |
| "num_tokens": 4411124.0, | |
| "reward": 0.599999999627471, | |
| "reward_std": 0.13620114093646407, | |
| "rewards/code_reward/mean": 0.599999999627471, | |
| "rewards/code_reward/std": 0.13620115583762527, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 131 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 576.0, | |
| "completions/max_terminated_length": 576.0, | |
| "completions/mean_length": 406.1875, | |
| "completions/mean_terminated_length": 406.1875, | |
| "completions/min_length": 241.5, | |
| "completions/min_terminated_length": 241.5, | |
| "epoch": 0.014775430250454736, | |
| "grad_norm": 1.1828069225002862, | |
| "kl": 0.21142578125, | |
| "learning_rate": 4.394164646762734e-06, | |
| "loss": 0.0079, | |
| "num_tokens": 4436370.0, | |
| "reward": 0.070248453237582, | |
| "reward_std": 0.07654083496890962, | |
| "rewards/code_reward/mean": 0.070248453237582, | |
| "rewards/code_reward/std": 0.07654083543457091, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 132 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 777.75, | |
| "completions/max_terminated_length": 777.75, | |
| "completions/mean_length": 456.53125, | |
| "completions/mean_terminated_length": 456.53125, | |
| "completions/min_length": 206.25, | |
| "completions/min_terminated_length": 206.25, | |
| "epoch": 0.014887365328109697, | |
| "grad_norm": 0.8073699112875448, | |
| "kl": 0.1446533203125, | |
| "learning_rate": 4.384180921828618e-06, | |
| "loss": 0.0692, | |
| "num_tokens": 4466595.0, | |
| "reward": 0.17728960141539574, | |
| "reward_std": 0.20221376791596413, | |
| "rewards/code_reward/mean": 0.17728960141539574, | |
| "rewards/code_reward/std": 0.20221376977860928, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 133 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 632.5, | |
| "completions/max_terminated_length": 632.5, | |
| "completions/mean_length": 455.34375, | |
| "completions/mean_terminated_length": 455.34375, | |
| "completions/min_length": 254.75, | |
| "completions/min_terminated_length": 254.75, | |
| "epoch": 0.014999300405764656, | |
| "grad_norm": 0.8462919317415648, | |
| "kl": 0.156494140625, | |
| "learning_rate": 4.374128629935955e-06, | |
| "loss": 0.0137, | |
| "num_tokens": 4500494.0, | |
| "reward": 0.1631067901616916, | |
| "reward_std": 0.13719243195373565, | |
| "rewards/code_reward/mean": 0.1631067901616916, | |
| "rewards/code_reward/std": 0.13719243567902595, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 134 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 654.75, | |
| "completions/max_terminated_length": 654.75, | |
| "completions/mean_length": 447.09375, | |
| "completions/mean_terminated_length": 447.09375, | |
| "completions/min_length": 267.75, | |
| "completions/min_terminated_length": 267.75, | |
| "epoch": 0.015111235483419617, | |
| "grad_norm": 1.0654308580054503, | |
| "kl": 0.18505859375, | |
| "learning_rate": 4.364008192858781e-06, | |
| "loss": -0.0584, | |
| "num_tokens": 4531953.0, | |
| "reward": 0.30278054997324944, | |
| "reward_std": 0.2559507302939892, | |
| "rewards/code_reward/mean": 0.30278054997324944, | |
| "rewards/code_reward/std": 0.2559507489204407, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 135 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 674.0, | |
| "completions/max_terminated_length": 674.0, | |
| "completions/mean_length": 524.3125, | |
| "completions/mean_terminated_length": 524.3125, | |
| "completions/min_length": 310.25, | |
| "completions/min_terminated_length": 310.25, | |
| "epoch": 0.015223170561074577, | |
| "grad_norm": 0.7375182423747296, | |
| "kl": 0.1689453125, | |
| "learning_rate": 4.353820035230366e-06, | |
| "loss": -0.0053, | |
| "num_tokens": 4570779.0, | |
| "reward": 0.27923886105418205, | |
| "reward_std": 0.0807168073952198, | |
| "rewards/code_reward/mean": 0.27923886105418205, | |
| "rewards/code_reward/std": 0.0807168073952198, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 136 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 954.0, | |
| "completions/max_terminated_length": 635.25, | |
| "completions/mean_length": 518.625, | |
| "completions/mean_terminated_length": 471.20982360839844, | |
| "completions/min_length": 182.75, | |
| "completions/min_terminated_length": 182.75, | |
| "epoch": 0.015335105638729536, | |
| "grad_norm": 0.7408425833924634, | |
| "kl": 0.128662109375, | |
| "learning_rate": 4.3435645845254e-06, | |
| "loss": -0.0565, | |
| "num_tokens": 4603031.0, | |
| "reward": 0.08707524091005325, | |
| "reward_std": 0.1465706154704094, | |
| "rewards/code_reward/mean": 0.08707524091005325, | |
| "rewards/code_reward/std": 0.1465706117451191, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 137 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 649.75, | |
| "completions/max_terminated_length": 649.75, | |
| "completions/mean_length": 465.34375, | |
| "completions/mean_terminated_length": 465.34375, | |
| "completions/min_length": 238.25, | |
| "completions/min_terminated_length": 238.25, | |
| "epoch": 0.015447040716384497, | |
| "grad_norm": 0.9226530191775734, | |
| "kl": 0.196533203125, | |
| "learning_rate": 4.333242271042054e-06, | |
| "loss": 0.0199, | |
| "num_tokens": 4640226.0, | |
| "reward": 0.12062139442423359, | |
| "reward_std": 0.12237106915563345, | |
| "rewards/code_reward/mean": 0.12062139442423359, | |
| "rewards/code_reward/std": 0.1223710693884641, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 138 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 645.0, | |
| "completions/max_terminated_length": 645.0, | |
| "completions/mean_length": 463.03125, | |
| "completions/mean_terminated_length": 463.03125, | |
| "completions/min_length": 170.5, | |
| "completions/min_terminated_length": 170.5, | |
| "epoch": 0.015558975794039457, | |
| "grad_norm": 0.5630743662662061, | |
| "kl": 0.133544921875, | |
| "learning_rate": 4.32285352788393e-06, | |
| "loss": -0.0273, | |
| "num_tokens": 4672011.0, | |
| "reward": 0.0625, | |
| "reward_std": 0.06681530922651291, | |
| "rewards/code_reward/mean": 0.0625, | |
| "rewards/code_reward/std": 0.06681530922651291, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 139 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 667.25, | |
| "completions/max_terminated_length": 667.25, | |
| "completions/mean_length": 473.21875, | |
| "completions/mean_terminated_length": 473.21875, | |
| "completions/min_length": 248.0, | |
| "completions/min_terminated_length": 248.0, | |
| "epoch": 0.015670910871694418, | |
| "grad_norm": 0.8457340493749413, | |
| "kl": 0.203125, | |
| "learning_rate": 4.312398790941882e-06, | |
| "loss": 0.0252, | |
| "num_tokens": 4707650.0, | |
| "reward": 0.01744219067040831, | |
| "reward_std": 0.03082139673642814, | |
| "rewards/code_reward/mean": 0.01744219067040831, | |
| "rewards/code_reward/std": 0.030821396969258785, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 140 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 774.0, | |
| "completions/max_terminated_length": 774.0, | |
| "completions/mean_length": 459.03125, | |
| "completions/mean_terminated_length": 459.03125, | |
| "completions/min_length": 282.75, | |
| "completions/min_terminated_length": 282.75, | |
| "epoch": 0.015782845949349377, | |
| "grad_norm": 1.0215167669033631, | |
| "kl": 0.1568603515625, | |
| "learning_rate": 4.301878498875735e-06, | |
| "loss": -0.0223, | |
| "num_tokens": 4738659.0, | |
| "reward": 0.14855818077921867, | |
| "reward_std": 0.18304241634905338, | |
| "rewards/code_reward/mean": 0.14855818077921867, | |
| "rewards/code_reward/std": 0.18304241262376308, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 141 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 573.75, | |
| "completions/max_terminated_length": 573.75, | |
| "completions/mean_length": 438.84375, | |
| "completions/mean_terminated_length": 438.84375, | |
| "completions/min_length": 252.0, | |
| "completions/min_terminated_length": 252.0, | |
| "epoch": 0.015894781027004336, | |
| "grad_norm": 0.958766664227721, | |
| "kl": 0.20068359375, | |
| "learning_rate": 4.291293093095873e-06, | |
| "loss": 0.0597, | |
| "num_tokens": 4769838.0, | |
| "reward": 0.0944940485060215, | |
| "reward_std": 0.07186714326962829, | |
| "rewards/code_reward/mean": 0.0944940485060215, | |
| "rewards/code_reward/std": 0.07186715072020888, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 142 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 705.25, | |
| "completions/max_terminated_length": 705.25, | |
| "completions/mean_length": 510.0625, | |
| "completions/mean_terminated_length": 510.0625, | |
| "completions/min_length": 270.25, | |
| "completions/min_terminated_length": 270.25, | |
| "epoch": 0.0160067161046593, | |
| "grad_norm": 0.8340511277589133, | |
| "kl": 0.191650390625, | |
| "learning_rate": 4.280643017744723e-06, | |
| "loss": -0.0546, | |
| "num_tokens": 4813416.0, | |
| "reward": 0.017440817784518003, | |
| "reward_std": 0.015970090869814157, | |
| "rewards/code_reward/mean": 0.017440817784518003, | |
| "rewards/code_reward/std": 0.015970090869814157, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 143 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 676.5, | |
| "completions/max_terminated_length": 676.5, | |
| "completions/mean_length": 436.625, | |
| "completions/mean_terminated_length": 436.625, | |
| "completions/min_length": 141.0, | |
| "completions/min_terminated_length": 141.0, | |
| "epoch": 0.01611865118231426, | |
| "grad_norm": 0.9354512924987337, | |
| "kl": 0.1859130859375, | |
| "learning_rate": 4.269928719678117e-06, | |
| "loss": 0.0158, | |
| "num_tokens": 4850540.0, | |
| "reward": 0.18274498358368874, | |
| "reward_std": 0.1233069859445095, | |
| "rewards/code_reward/mean": 0.18274498358368874, | |
| "rewards/code_reward/std": 0.1233069896697998, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 144 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 652.0, | |
| "completions/max_terminated_length": 652.0, | |
| "completions/mean_length": 417.8125, | |
| "completions/mean_terminated_length": 417.8125, | |
| "completions/min_length": 162.0, | |
| "completions/min_terminated_length": 162.0, | |
| "epoch": 0.016230586259969218, | |
| "grad_norm": 0.8775436859559402, | |
| "kl": 0.200927734375, | |
| "learning_rate": 4.2591506484465426e-06, | |
| "loss": 0.06, | |
| "num_tokens": 4880958.0, | |
| "reward": 0.1889239656738937, | |
| "reward_std": 0.06604543374851346, | |
| "rewards/code_reward/mean": 0.1889239656738937, | |
| "rewards/code_reward/std": 0.06604543328285217, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 145 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 642.5, | |
| "completions/max_terminated_length": 642.5, | |
| "completions/mean_length": 453.6875, | |
| "completions/mean_terminated_length": 453.6875, | |
| "completions/min_length": 225.75, | |
| "completions/min_terminated_length": 225.75, | |
| "epoch": 0.016342521337624177, | |
| "grad_norm": 1.0579923330835856, | |
| "kl": 0.190673828125, | |
| "learning_rate": 4.248309256276283e-06, | |
| "loss": 0.0058, | |
| "num_tokens": 4908772.0, | |
| "reward": 0.22657467075623572, | |
| "reward_std": 0.27265046804677695, | |
| "rewards/code_reward/mean": 0.22657467075623572, | |
| "rewards/code_reward/std": 0.27265046804677695, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 146 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 558.0, | |
| "completions/max_terminated_length": 558.0, | |
| "completions/mean_length": 373.4375, | |
| "completions/mean_terminated_length": 373.4375, | |
| "completions/min_length": 167.75, | |
| "completions/min_terminated_length": 167.75, | |
| "epoch": 0.016454456415279137, | |
| "grad_norm": 1.2602004582489772, | |
| "kl": 0.2349853515625, | |
| "learning_rate": 4.23740499805044e-06, | |
| "loss": 0.0749, | |
| "num_tokens": 4935178.0, | |
| "reward": 0.3513445816934109, | |
| "reward_std": 0.20541435480117798, | |
| "rewards/code_reward/mean": 0.3513445816934109, | |
| "rewards/code_reward/std": 0.20541436225175858, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 147 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 619.25, | |
| "completions/max_terminated_length": 619.25, | |
| "completions/mean_length": 427.78125, | |
| "completions/mean_terminated_length": 427.78125, | |
| "completions/min_length": 160.0, | |
| "completions/min_terminated_length": 160.0, | |
| "epoch": 0.0165663914929341, | |
| "grad_norm": 1.1160890350070682, | |
| "kl": 0.17919921875, | |
| "learning_rate": 4.22643833128985e-06, | |
| "loss": 0.0269, | |
| "num_tokens": 4966539.0, | |
| "reward": 0.279205069411546, | |
| "reward_std": 0.04902365058660507, | |
| "rewards/code_reward/mean": 0.279205069411546, | |
| "rewards/code_reward/std": 0.04902365151792765, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 148 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 665.25, | |
| "completions/max_terminated_length": 665.25, | |
| "completions/mean_length": 371.625, | |
| "completions/mean_terminated_length": 371.625, | |
| "completions/min_length": 169.0, | |
| "completions/min_terminated_length": 169.0, | |
| "epoch": 0.01667832657058906, | |
| "grad_norm": 1.1750545317228818, | |
| "kl": 0.23681640625, | |
| "learning_rate": 4.215409716133885e-06, | |
| "loss": 0.015, | |
| "num_tokens": 5001903.0, | |
| "reward": 0.17107138480059803, | |
| "reward_std": 0.16521674406249076, | |
| "rewards/code_reward/mean": 0.17107138480059803, | |
| "rewards/code_reward/std": 0.16521676117554307, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 149 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 576.25, | |
| "completions/max_terminated_length": 576.25, | |
| "completions/mean_length": 350.125, | |
| "completions/mean_terminated_length": 350.125, | |
| "completions/min_length": 144.5, | |
| "completions/min_terminated_length": 144.5, | |
| "epoch": 0.016790261648244018, | |
| "grad_norm": 1.0413783228416158, | |
| "kl": 0.249755859375, | |
| "learning_rate": 4.204319615321151e-06, | |
| "loss": 0.0077, | |
| "num_tokens": 5030091.0, | |
| "reward": 0.09492883179336786, | |
| "reward_std": 0.12909611221402884, | |
| "rewards/code_reward/mean": 0.09492883179336786, | |
| "rewards/code_reward/std": 0.12909611966460943, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 150 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 585.75, | |
| "completions/max_terminated_length": 585.75, | |
| "completions/mean_length": 353.1875, | |
| "completions/mean_terminated_length": 353.1875, | |
| "completions/min_length": 91.25, | |
| "completions/min_terminated_length": 91.25, | |
| "epoch": 0.016902196725898978, | |
| "grad_norm": 1.579544739950842, | |
| "kl": 0.50390625, | |
| "learning_rate": 4.193168494170065e-06, | |
| "loss": 0.0444, | |
| "num_tokens": 5057441.0, | |
| "reward": 0.600965291261673, | |
| "reward_std": 0.2557707913219929, | |
| "rewards/code_reward/mean": 0.600965291261673, | |
| "rewards/code_reward/std": 0.2557708006352186, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 151 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 577.25, | |
| "completions/max_terminated_length": 577.25, | |
| "completions/mean_length": 340.625, | |
| "completions/mean_terminated_length": 340.625, | |
| "completions/min_length": 143.0, | |
| "completions/min_terminated_length": 143.0, | |
| "epoch": 0.01701413180355394, | |
| "grad_norm": 1.3003579285788192, | |
| "kl": 0.190673828125, | |
| "learning_rate": 4.181956820559339e-06, | |
| "loss": 0.132, | |
| "num_tokens": 5082069.0, | |
| "reward": 0.32964441180229187, | |
| "reward_std": 0.2922050729393959, | |
| "rewards/code_reward/mean": 0.32964441180229187, | |
| "rewards/code_reward/std": 0.2922050729393959, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 152 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 463.75, | |
| "completions/max_terminated_length": 463.75, | |
| "completions/mean_length": 249.5625, | |
| "completions/mean_terminated_length": 249.5625, | |
| "completions/min_length": 95.0, | |
| "completions/min_terminated_length": 95.0, | |
| "epoch": 0.0171260668812089, | |
| "grad_norm": 1.2964116512514992, | |
| "kl": 0.23046875, | |
| "learning_rate": 4.170685064908342e-06, | |
| "loss": 0.0824, | |
| "num_tokens": 5110151.0, | |
| "reward": 0.128064907155931, | |
| "reward_std": 0.0706186261959374, | |
| "rewards/code_reward/mean": 0.128064907155931, | |
| "rewards/code_reward/std": 0.07061862386763096, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 153 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 499.75, | |
| "completions/max_terminated_length": 499.75, | |
| "completions/mean_length": 290.53125, | |
| "completions/mean_terminated_length": 290.53125, | |
| "completions/min_length": 119.25, | |
| "completions/min_terminated_length": 119.25, | |
| "epoch": 0.01723800195886386, | |
| "grad_norm": 0.8703199526169038, | |
| "kl": 0.276611328125, | |
| "learning_rate": 4.159353700157365e-06, | |
| "loss": -0.0831, | |
| "num_tokens": 5137592.0, | |
| "reward": 0.11129332333803177, | |
| "reward_std": 0.10705379582941532, | |
| "rewards/code_reward/mean": 0.11129332333803177, | |
| "rewards/code_reward/std": 0.1070537967607379, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 154 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 492.0, | |
| "completions/max_terminated_length": 492.0, | |
| "completions/mean_length": 322.75, | |
| "completions/mean_terminated_length": 322.75, | |
| "completions/min_length": 119.75, | |
| "completions/min_terminated_length": 119.75, | |
| "epoch": 0.01734993703651882, | |
| "grad_norm": 1.224937538572504, | |
| "kl": 0.201416015625, | |
| "learning_rate": 4.14796320174778e-06, | |
| "loss": -0.0439, | |
| "num_tokens": 5162960.0, | |
| "reward": 0.1461925357580185, | |
| "reward_std": 0.23236336186528206, | |
| "rewards/code_reward/mean": 0.1461925357580185, | |
| "rewards/code_reward/std": 0.2323633674532175, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 155 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 587.0, | |
| "completions/max_terminated_length": 587.0, | |
| "completions/mean_length": 347.59375, | |
| "completions/mean_terminated_length": 347.59375, | |
| "completions/min_length": 153.0, | |
| "completions/min_terminated_length": 153.0, | |
| "epoch": 0.017461872114173778, | |
| "grad_norm": 0.6766785249872507, | |
| "kl": 0.1558837890625, | |
| "learning_rate": 4.136514047602087e-06, | |
| "loss": 0.0103, | |
| "num_tokens": 5192755.0, | |
| "reward": 0.0625, | |
| "reward_std": 0.1157275140285492, | |
| "rewards/code_reward/mean": 0.0625, | |
| "rewards/code_reward/std": 0.1157275140285492, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 156 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 517.5, | |
| "completions/max_terminated_length": 517.5, | |
| "completions/mean_length": 301.0625, | |
| "completions/mean_terminated_length": 301.0625, | |
| "completions/min_length": 112.25, | |
| "completions/min_terminated_length": 112.25, | |
| "epoch": 0.01757380719182874, | |
| "grad_norm": 1.2551889406028631, | |
| "kl": 0.197998046875, | |
| "learning_rate": 4.1250067181038635e-06, | |
| "loss": -0.0209, | |
| "num_tokens": 5216477.0, | |
| "reward": 0.17783564236015081, | |
| "reward_std": 0.24008767772465944, | |
| "rewards/code_reward/mean": 0.17783564236015081, | |
| "rewards/code_reward/std": 0.24008767493069172, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 157 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 404.0, | |
| "completions/max_terminated_length": 404.0, | |
| "completions/mean_length": 232.6875, | |
| "completions/mean_terminated_length": 232.6875, | |
| "completions/min_length": 86.25, | |
| "completions/min_terminated_length": 86.25, | |
| "epoch": 0.0176857422694837, | |
| "grad_norm": 1.4488039878157586, | |
| "kl": 0.1767578125, | |
| "learning_rate": 4.113441696077608e-06, | |
| "loss": -0.0524, | |
| "num_tokens": 5237427.0, | |
| "reward": 0.03743714070878923, | |
| "reward_std": 0.10588822257705033, | |
| "rewards/code_reward/mean": 0.03743714070878923, | |
| "rewards/code_reward/std": 0.10588822374120355, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 158 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 477.25, | |
| "completions/max_terminated_length": 477.25, | |
| "completions/mean_length": 279.34375, | |
| "completions/mean_terminated_length": 279.34375, | |
| "completions/min_length": 133.25, | |
| "completions/min_terminated_length": 133.25, | |
| "epoch": 0.01779767734713866, | |
| "grad_norm": 1.5189669762242, | |
| "kl": 0.238037109375, | |
| "learning_rate": 4.101819466768484e-06, | |
| "loss": -0.1518, | |
| "num_tokens": 5268406.0, | |
| "reward": 0.08647377614397556, | |
| "reward_std": 0.06177530816057697, | |
| "rewards/code_reward/mean": 0.08647377614397556, | |
| "rewards/code_reward/std": 0.06177531188586727, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 159 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 491.5, | |
| "completions/max_terminated_length": 491.5, | |
| "completions/mean_length": 260.4375, | |
| "completions/mean_terminated_length": 260.4375, | |
| "completions/min_length": 71.75, | |
| "completions/min_terminated_length": 71.75, | |
| "epoch": 0.01790961242479362, | |
| "grad_norm": 1.0554249659877584, | |
| "kl": 0.147705078125, | |
| "learning_rate": 4.0901405178219535e-06, | |
| "loss": 0.0005, | |
| "num_tokens": 5291300.0, | |
| "reward": 0.04570723883807659, | |
| "reward_std": 0.07690948667004704, | |
| "rewards/code_reward/mean": 0.04570723883807659, | |
| "rewards/code_reward/std": 0.07690948317758739, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 160 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 576.0, | |
| "completions/max_terminated_length": 576.0, | |
| "completions/mean_length": 341.28125, | |
| "completions/mean_terminated_length": 341.28125, | |
| "completions/min_length": 155.75, | |
| "completions/min_terminated_length": 155.75, | |
| "epoch": 0.018021547502448578, | |
| "grad_norm": 0.9717735557494784, | |
| "kl": 0.212890625, | |
| "learning_rate": 4.078405339263326e-06, | |
| "loss": -0.0304, | |
| "num_tokens": 5321093.0, | |
| "reward": 0.053125000558793545, | |
| "reward_std": 0.07709404267370701, | |
| "rewards/code_reward/mean": 0.053125000558793545, | |
| "rewards/code_reward/std": 0.07709404919296503, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 161 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 480.5, | |
| "completions/max_terminated_length": 480.5, | |
| "completions/mean_length": 237.9375, | |
| "completions/mean_terminated_length": 237.9375, | |
| "completions/min_length": 116.75, | |
| "completions/min_terminated_length": 116.75, | |
| "epoch": 0.01813348258010354, | |
| "grad_norm": 1.173075355333341, | |
| "kl": 0.188232421875, | |
| "learning_rate": 4.06661442347719e-06, | |
| "loss": -0.0205, | |
| "num_tokens": 5348659.0, | |
| "reward": 0.2592630833387375, | |
| "reward_std": 0.15858712047338486, | |
| "rewards/code_reward/mean": 0.2592630833387375, | |
| "rewards/code_reward/std": 0.15858712792396545, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 162 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 530.75, | |
| "completions/max_terminated_length": 530.75, | |
| "completions/mean_length": 297.1875, | |
| "completions/mean_terminated_length": 297.1875, | |
| "completions/min_length": 115.0, | |
| "completions/min_terminated_length": 115.0, | |
| "epoch": 0.0182454176577585, | |
| "grad_norm": 1.287995519073581, | |
| "kl": 0.18896484375, | |
| "learning_rate": 4.054768265186758e-06, | |
| "loss": -0.0652, | |
| "num_tokens": 5372217.0, | |
| "reward": 0.33238982781767845, | |
| "reward_std": 0.27272730600088835, | |
| "rewards/code_reward/mean": 0.33238982781767845, | |
| "rewards/code_reward/std": 0.2727273255586624, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 163 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 636.5, | |
| "completions/max_terminated_length": 636.5, | |
| "completions/mean_length": 292.53125, | |
| "completions/mean_terminated_length": 292.53125, | |
| "completions/min_length": 117.25, | |
| "completions/min_terminated_length": 117.25, | |
| "epoch": 0.01835735273541346, | |
| "grad_norm": 1.1060543366676017, | |
| "kl": 0.165283203125, | |
| "learning_rate": 4.0428673614331036e-06, | |
| "loss": 0.0064, | |
| "num_tokens": 5397890.0, | |
| "reward": 0.20836169831454754, | |
| "reward_std": 0.17235604114830494, | |
| "rewards/code_reward/mean": 0.20836169831454754, | |
| "rewards/code_reward/std": 0.17235605791211128, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 164 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 814.5, | |
| "completions/max_terminated_length": 814.5, | |
| "completions/mean_length": 351.71875, | |
| "completions/mean_terminated_length": 351.71875, | |
| "completions/min_length": 145.75, | |
| "completions/min_terminated_length": 145.75, | |
| "epoch": 0.01846928781306842, | |
| "grad_norm": 0.7382591236061559, | |
| "kl": 0.1884765625, | |
| "learning_rate": 4.030912211554316e-06, | |
| "loss": 0.0313, | |
| "num_tokens": 5423913.0, | |
| "reward": 0.13007790176197886, | |
| "reward_std": 0.05658754054456949, | |
| "rewards/code_reward/mean": 0.13007790176197886, | |
| "rewards/code_reward/std": 0.056587545201182365, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 165 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 539.25, | |
| "completions/max_terminated_length": 539.25, | |
| "completions/mean_length": 299.21875, | |
| "completions/mean_terminated_length": 299.21875, | |
| "completions/min_length": 131.25, | |
| "completions/min_terminated_length": 131.25, | |
| "epoch": 0.018581222890723382, | |
| "grad_norm": 1.373915385466877, | |
| "kl": 0.193359375, | |
| "learning_rate": 4.018903317164539e-06, | |
| "loss": -0.1003, | |
| "num_tokens": 5448488.0, | |
| "reward": 0.08751785231288522, | |
| "reward_std": 0.11654674645978957, | |
| "rewards/code_reward/mean": 0.08751785231288522, | |
| "rewards/code_reward/std": 0.11654674645978957, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 166 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 620.0, | |
| "completions/max_terminated_length": 620.0, | |
| "completions/mean_length": 359.65625, | |
| "completions/mean_terminated_length": 359.65625, | |
| "completions/min_length": 132.5, | |
| "completions/min_terminated_length": 132.5, | |
| "epoch": 0.01869315796837834, | |
| "grad_norm": 1.0340989098627629, | |
| "kl": 0.17822265625, | |
| "learning_rate": 4.006841182132932e-06, | |
| "loss": -0.0343, | |
| "num_tokens": 5474285.0, | |
| "reward": 0.1759367436170578, | |
| "reward_std": 0.2240792140364647, | |
| "rewards/code_reward/mean": 0.1759367436170578, | |
| "rewards/code_reward/std": 0.224079217761755, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 167 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 461.75, | |
| "completions/max_terminated_length": 461.75, | |
| "completions/mean_length": 281.5625, | |
| "completions/mean_terminated_length": 281.5625, | |
| "completions/min_length": 102.25, | |
| "completions/min_terminated_length": 102.25, | |
| "epoch": 0.0188050930460333, | |
| "grad_norm": 1.4134315706278993, | |
| "kl": 0.210205078125, | |
| "learning_rate": 3.9947263125625195e-06, | |
| "loss": 0.013, | |
| "num_tokens": 5498599.0, | |
| "reward": 0.3900106647051871, | |
| "reward_std": 0.2551127364858985, | |
| "rewards/code_reward/mean": 0.3900106647051871, | |
| "rewards/code_reward/std": 0.25511275534518063, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 168 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 1069.5, | |
| "completions/max_terminated_length": 671.0, | |
| "completions/mean_length": 497.46875, | |
| "completions/mean_terminated_length": 441.75, | |
| "completions/min_length": 222.75, | |
| "completions/min_terminated_length": 222.75, | |
| "epoch": 0.01891702812368826, | |
| "grad_norm": 1.0149191598734515, | |
| "kl": 0.197998046875, | |
| "learning_rate": 3.982559216768967e-06, | |
| "loss": 0.0765, | |
| "num_tokens": 5530310.0, | |
| "reward": 0.1429782696068287, | |
| "reward_std": 0.16231020726263523, | |
| "rewards/code_reward/mean": 0.1429782696068287, | |
| "rewards/code_reward/std": 0.16231020539999008, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 169 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 601.5, | |
| "completions/max_terminated_length": 601.5, | |
| "completions/mean_length": 391.875, | |
| "completions/mean_terminated_length": 391.875, | |
| "completions/min_length": 166.5, | |
| "completions/min_terminated_length": 166.5, | |
| "epoch": 0.01902896320134322, | |
| "grad_norm": 0.8227739015604961, | |
| "kl": 0.18212890625, | |
| "learning_rate": 3.970340405259245e-06, | |
| "loss": 0.1136, | |
| "num_tokens": 5562970.0, | |
| "reward": 0.1685887835919857, | |
| "reward_std": 0.17748497053980827, | |
| "rewards/code_reward/mean": 0.1685887835919857, | |
| "rewards/code_reward/std": 0.17748496308922768, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 170 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 595.25, | |
| "completions/max_terminated_length": 595.25, | |
| "completions/mean_length": 413.125, | |
| "completions/mean_terminated_length": 413.125, | |
| "completions/min_length": 230.0, | |
| "completions/min_terminated_length": 230.0, | |
| "epoch": 0.019140898278998182, | |
| "grad_norm": 1.0311180144750687, | |
| "kl": 0.2353515625, | |
| "learning_rate": 3.958070390710214e-06, | |
| "loss": -0.0245, | |
| "num_tokens": 5591150.0, | |
| "reward": 0.1419127695262432, | |
| "reward_std": 0.12009143829345703, | |
| "rewards/code_reward/mean": 0.1419127695262432, | |
| "rewards/code_reward/std": 0.12009144574403763, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 171 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 715.0, | |
| "completions/max_terminated_length": 715.0, | |
| "completions/mean_length": 454.03125, | |
| "completions/mean_terminated_length": 454.03125, | |
| "completions/min_length": 216.0, | |
| "completions/min_terminated_length": 216.0, | |
| "epoch": 0.01925283335665314, | |
| "grad_norm": 0.9357441509233236, | |
| "kl": 0.19580078125, | |
| "learning_rate": 3.945749687947109e-06, | |
| "loss": -0.0136, | |
| "num_tokens": 5620991.0, | |
| "reward": 0.06574675627052784, | |
| "reward_std": 0.07077404530718923, | |
| "rewards/code_reward/mean": 0.06574675627052784, | |
| "rewards/code_reward/std": 0.07077404530718923, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 172 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 678.75, | |
| "completions/max_terminated_length": 678.75, | |
| "completions/mean_length": 487.78125, | |
| "completions/mean_terminated_length": 487.78125, | |
| "completions/min_length": 288.25, | |
| "completions/min_terminated_length": 288.25, | |
| "epoch": 0.0193647684343081, | |
| "grad_norm": 1.0872281099064747, | |
| "kl": 0.210205078125, | |
| "learning_rate": 3.933378813921942e-06, | |
| "loss": -0.0373, | |
| "num_tokens": 5656416.0, | |
| "reward": 0.1226367698982358, | |
| "reward_std": 0.20265722228214145, | |
| "rewards/code_reward/mean": 0.1226367698982358, | |
| "rewards/code_reward/std": 0.20265722228214145, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 173 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 523.75, | |
| "completions/max_terminated_length": 523.75, | |
| "completions/mean_length": 321.375, | |
| "completions/mean_terminated_length": 321.375, | |
| "completions/min_length": 95.75, | |
| "completions/min_terminated_length": 95.75, | |
| "epoch": 0.01947670351196306, | |
| "grad_norm": 1.0504957433338074, | |
| "kl": 0.21142578125, | |
| "learning_rate": 3.920958287691811e-06, | |
| "loss": -0.0152, | |
| "num_tokens": 5680844.0, | |
| "reward": 0.4488864839076996, | |
| "reward_std": 0.3014371059834957, | |
| "rewards/code_reward/mean": 0.4488864839076996, | |
| "rewards/code_reward/std": 0.3014371246099472, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 174 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 637.0, | |
| "completions/max_terminated_length": 637.0, | |
| "completions/mean_length": 430.875, | |
| "completions/mean_terminated_length": 430.875, | |
| "completions/min_length": 249.5, | |
| "completions/min_terminated_length": 249.5, | |
| "epoch": 0.019588638589618023, | |
| "grad_norm": 0.7795786053104797, | |
| "kl": 0.229736328125, | |
| "learning_rate": 3.908488630397121e-06, | |
| "loss": 0.0764, | |
| "num_tokens": 5713200.0, | |
| "reward": 0.04957035928964615, | |
| "reward_std": 0.06563462410122156, | |
| "rewards/code_reward/mean": 0.04957035928964615, | |
| "rewards/code_reward/std": 0.06563462410122156, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 175 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 609.75, | |
| "completions/max_terminated_length": 609.75, | |
| "completions/mean_length": 444.75, | |
| "completions/mean_terminated_length": 444.75, | |
| "completions/min_length": 283.0, | |
| "completions/min_terminated_length": 283.0, | |
| "epoch": 0.019700573667272982, | |
| "grad_norm": 0.8790829345446359, | |
| "kl": 0.202392578125, | |
| "learning_rate": 3.8959703652397175e-06, | |
| "loss": 0.0125, | |
| "num_tokens": 5742760.0, | |
| "reward": 0.06789090437814593, | |
| "reward_std": 0.10605220403522253, | |
| "rewards/code_reward/mean": 0.06789090437814593, | |
| "rewards/code_reward/std": 0.10605220403522253, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 176 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 921.75, | |
| "completions/max_terminated_length": 651.0, | |
| "completions/mean_length": 502.5625, | |
| "completions/mean_terminated_length": 457.6607208251953, | |
| "completions/min_length": 233.75, | |
| "completions/min_terminated_length": 233.75, | |
| "epoch": 0.019812508744927942, | |
| "grad_norm": 0.6856544409285574, | |
| "kl": 0.212890625, | |
| "learning_rate": 3.883404017460935e-06, | |
| "loss": 0.0149, | |
| "num_tokens": 5776802.0, | |
| "reward": 0.125, | |
| "reward_std": 0.13363061845302582, | |
| "rewards/code_reward/mean": 0.125, | |
| "rewards/code_reward/std": 0.13363061845302582, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 177 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 573.25, | |
| "completions/max_terminated_length": 573.25, | |
| "completions/mean_length": 400.78125, | |
| "completions/mean_terminated_length": 400.78125, | |
| "completions/min_length": 223.0, | |
| "completions/min_terminated_length": 223.0, | |
| "epoch": 0.0199244438225829, | |
| "grad_norm": 0.7758589030560867, | |
| "kl": 0.256591796875, | |
| "learning_rate": 3.870790114319559e-06, | |
| "loss": -0.0786, | |
| "num_tokens": 5804987.0, | |
| "reward": 0.2419273192062974, | |
| "reward_std": 0.05694087781012058, | |
| "rewards/code_reward/mean": 0.2419273192062974, | |
| "rewards/code_reward/std": 0.05694088339805603, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 178 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 721.25, | |
| "completions/max_terminated_length": 721.25, | |
| "completions/mean_length": 484.3125, | |
| "completions/mean_terminated_length": 484.3125, | |
| "completions/min_length": 227.75, | |
| "completions/min_terminated_length": 227.75, | |
| "epoch": 0.02003637890023786, | |
| "grad_norm": 0.8831663988161996, | |
| "kl": 0.20166015625, | |
| "learning_rate": 3.858129185069701e-06, | |
| "loss": -0.0142, | |
| "num_tokens": 5838165.0, | |
| "reward": 0.15460877772420645, | |
| "reward_std": 0.1456797532737255, | |
| "rewards/code_reward/mean": 0.15460877772420645, | |
| "rewards/code_reward/std": 0.1456797607243061, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 179 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 943.5, | |
| "completions/max_terminated_length": 603.5, | |
| "completions/mean_length": 490.1875, | |
| "completions/mean_terminated_length": 443.3526916503906, | |
| "completions/min_length": 240.0, | |
| "completions/min_terminated_length": 240.0, | |
| "epoch": 0.020148313977892823, | |
| "grad_norm": 1.051749696084055, | |
| "kl": 0.194091796875, | |
| "learning_rate": 3.845421760938597e-06, | |
| "loss": 0.0364, | |
| "num_tokens": 5868107.0, | |
| "reward": 0.052815594244748354, | |
| "reward_std": 0.11768656317144632, | |
| "rewards/code_reward/mean": 0.052815594244748354, | |
| "rewards/code_reward/std": 0.11768656317144632, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 180 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 1064.75, | |
| "completions/max_terminated_length": 742.5, | |
| "completions/mean_length": 527.46875, | |
| "completions/mean_terminated_length": 478.9464340209961, | |
| "completions/min_length": 230.5, | |
| "completions/min_terminated_length": 230.5, | |
| "epoch": 0.020260249055547783, | |
| "grad_norm": 0.9254371836384011, | |
| "kl": 0.224365234375, | |
| "learning_rate": 3.832668375104312e-06, | |
| "loss": 0.0786, | |
| "num_tokens": 5900842.0, | |
| "reward": 0.11509167775511742, | |
| "reward_std": 0.2528133289888501, | |
| "rewards/code_reward/mean": 0.11509167775511742, | |
| "rewards/code_reward/std": 0.2528133289888501, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 181 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 607.75, | |
| "completions/max_terminated_length": 607.75, | |
| "completions/mean_length": 416.75, | |
| "completions/mean_terminated_length": 416.75, | |
| "completions/min_length": 192.75, | |
| "completions/min_terminated_length": 192.75, | |
| "epoch": 0.020372184133202742, | |
| "grad_norm": 1.1539914541007663, | |
| "kl": 0.260009765625, | |
| "learning_rate": 3.8198695626733725e-06, | |
| "loss": -0.0358, | |
| "num_tokens": 5926258.0, | |
| "reward": 0.2809056378901005, | |
| "reward_std": 0.25853854790329933, | |
| "rewards/code_reward/mean": 0.2809056378901005, | |
| "rewards/code_reward/std": 0.2585385534912348, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 182 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 621.0, | |
| "completions/max_terminated_length": 621.0, | |
| "completions/mean_length": 439.6875, | |
| "completions/mean_terminated_length": 439.6875, | |
| "completions/min_length": 224.0, | |
| "completions/min_terminated_length": 224.0, | |
| "epoch": 0.0204841192108577, | |
| "grad_norm": 0.7671721848701089, | |
| "kl": 0.207763671875, | |
| "learning_rate": 3.8070258606583156e-06, | |
| "loss": -0.0237, | |
| "num_tokens": 5955016.0, | |
| "reward": 0.118256576359272, | |
| "reward_std": 0.10753975436091423, | |
| "rewards/code_reward/mean": 0.118256576359272, | |
| "rewards/code_reward/std": 0.10753976181149483, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 183 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 727.75, | |
| "completions/max_terminated_length": 727.75, | |
| "completions/mean_length": 514.6875, | |
| "completions/mean_terminated_length": 514.6875, | |
| "completions/min_length": 295.75, | |
| "completions/min_terminated_length": 295.75, | |
| "epoch": 0.020596054288512664, | |
| "grad_norm": 0.9937661413851208, | |
| "kl": 0.22119140625, | |
| "learning_rate": 3.7941378079551544e-06, | |
| "loss": -0.0669, | |
| "num_tokens": 5988158.0, | |
| "reward": 0.017067496781237423, | |
| "reward_std": 0.02749600470997393, | |
| "rewards/code_reward/mean": 0.017067496781237423, | |
| "rewards/code_reward/std": 0.027496004942804575, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 184 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 1087.5, | |
| "completions/max_terminated_length": 682.0, | |
| "completions/mean_length": 524.875, | |
| "completions/mean_terminated_length": 471.5446472167969, | |
| "completions/min_length": 291.25, | |
| "completions/min_terminated_length": 291.25, | |
| "epoch": 0.020707989366167624, | |
| "grad_norm": 0.6638578813804005, | |
| "kl": 0.237060546875, | |
| "learning_rate": 3.7812059453207677e-06, | |
| "loss": 0.1742, | |
| "num_tokens": 6023682.0, | |
| "reward": 0.1103343702852726, | |
| "reward_std": 0.11875982582569122, | |
| "rewards/code_reward/mean": 0.1103343702852726, | |
| "rewards/code_reward/std": 0.11875982582569122, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 185 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 968.75, | |
| "completions/max_terminated_length": 968.75, | |
| "completions/mean_length": 510.34375, | |
| "completions/mean_terminated_length": 510.34375, | |
| "completions/min_length": 203.5, | |
| "completions/min_terminated_length": 203.5, | |
| "epoch": 0.020819924443822583, | |
| "grad_norm": 1.111764046709291, | |
| "kl": 0.22021484375, | |
| "learning_rate": 3.768230815350213e-06, | |
| "loss": -0.2216, | |
| "num_tokens": 6058277.0, | |
| "reward": 0.08250047732144594, | |
| "reward_std": 0.18678564205765724, | |
| "rewards/code_reward/mean": 0.08250047732144594, | |
| "rewards/code_reward/std": 0.18678564997389913, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 186 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 1082.75, | |
| "completions/max_terminated_length": 816.5, | |
| "completions/mean_length": 579.78125, | |
| "completions/mean_terminated_length": 536.25, | |
| "completions/min_length": 290.75, | |
| "completions/min_terminated_length": 290.75, | |
| "epoch": 0.020931859521477542, | |
| "grad_norm": 0.6213788285878431, | |
| "kl": 0.198974609375, | |
| "learning_rate": 3.7552129624539557e-06, | |
| "loss": 0.0099, | |
| "num_tokens": 6096662.0, | |
| "reward": 0.15393732488155365, | |
| "reward_std": 0.158139206469059, | |
| "rewards/code_reward/mean": 0.15393732488155365, | |
| "rewards/code_reward/std": 0.158139206469059, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 187 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 1410.25, | |
| "completions/max_terminated_length": 706.5, | |
| "completions/mean_length": 560.1875, | |
| "completions/mean_terminated_length": 458.20983123779297, | |
| "completions/min_length": 253.0, | |
| "completions/min_terminated_length": 253.0, | |
| "epoch": 0.0210437945991325, | |
| "grad_norm": 0.7369665680166173, | |
| "kl": 0.207275390625, | |
| "learning_rate": 3.7421529328350316e-06, | |
| "loss": 0.0253, | |
| "num_tokens": 6130348.0, | |
| "reward": 0.016329039994161576, | |
| "reward_std": 0.015793586208019406, | |
| "rewards/code_reward/mean": 0.016329039994161576, | |
| "rewards/code_reward/std": 0.01579358527669683, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 188 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 808.75, | |
| "completions/max_terminated_length": 808.75, | |
| "completions/mean_length": 575.625, | |
| "completions/mean_terminated_length": 575.625, | |
| "completions/min_length": 396.0, | |
| "completions/min_terminated_length": 396.0, | |
| "epoch": 0.021155729676787464, | |
| "grad_norm": 0.8246271999593108, | |
| "kl": 0.204345703125, | |
| "learning_rate": 3.7290512744661274e-06, | |
| "loss": 0.0457, | |
| "num_tokens": 6171304.0, | |
| "reward": 0.07762476638890803, | |
| "reward_std": 0.16000637132674456, | |
| "rewards/code_reward/mean": 0.07762476638890803, | |
| "rewards/code_reward/std": 0.16000637412071228, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 189 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.1875, | |
| "completions/max_length": 1693.5, | |
| "completions/max_terminated_length": 887.75, | |
| "completions/mean_length": 880.3125, | |
| "completions/mean_terminated_length": 621.9541931152344, | |
| "completions/min_length": 358.75, | |
| "completions/min_terminated_length": 358.75, | |
| "epoch": 0.021267664754442424, | |
| "grad_norm": 0.7247939676119982, | |
| "kl": 0.166748046875, | |
| "learning_rate": 3.715908537066589e-06, | |
| "loss": -0.0747, | |
| "num_tokens": 6218770.0, | |
| "reward": 0.18021205358672887, | |
| "reward_std": 0.20904676476493478, | |
| "rewards/code_reward/mean": 0.18021205358672887, | |
| "rewards/code_reward/std": 0.2090467723319307, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 190 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.1875, | |
| "completions/max_length": 1439.5, | |
| "completions/max_terminated_length": 634.25, | |
| "completions/mean_length": 787.9375, | |
| "completions/mean_terminated_length": 455.0029830932617, | |
| "completions/min_length": 289.5, | |
| "completions/min_terminated_length": 289.5, | |
| "epoch": 0.021379599832097383, | |
| "grad_norm": 0.5466790904917126, | |
| "kl": 0.152587890625, | |
| "learning_rate": 3.7027252720793538e-06, | |
| "loss": 0.1295, | |
| "num_tokens": 6262056.0, | |
| "reward": 0.193359375, | |
| "reward_std": 0.15328529477119446, | |
| "rewards/code_reward/mean": 0.193359375, | |
| "rewards/code_reward/std": 0.15328530967235565, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 191 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 1206.75, | |
| "completions/max_terminated_length": 850.0, | |
| "completions/mean_length": 588.03125, | |
| "completions/mean_terminated_length": 537.1607208251953, | |
| "completions/min_length": 300.25, | |
| "completions/min_terminated_length": 300.25, | |
| "epoch": 0.021491534909752343, | |
| "grad_norm": 0.8256952276538428, | |
| "kl": 0.239501953125, | |
| "learning_rate": 3.689502032647817e-06, | |
| "loss": -0.1993, | |
| "num_tokens": 6310129.0, | |
| "reward": 0.11067206133157015, | |
| "reward_std": 0.11477606277912855, | |
| "rewards/code_reward/mean": 0.11067206133157015, | |
| "rewards/code_reward/std": 0.11477606697008014, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 192 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 1253.5, | |
| "completions/max_terminated_length": 952.0, | |
| "completions/mean_length": 544.9375, | |
| "completions/mean_terminated_length": 496.7276840209961, | |
| "completions/min_length": 312.75, | |
| "completions/min_terminated_length": 312.75, | |
| "epoch": 0.021603469987407305, | |
| "grad_norm": 0.9219945943560189, | |
| "kl": 0.228759765625, | |
| "learning_rate": 3.6762393735926245e-06, | |
| "loss": 0.0478, | |
| "num_tokens": 6343727.0, | |
| "reward": 0.08743459376273677, | |
| "reward_std": 0.060669250786304474, | |
| "rewards/code_reward/mean": 0.08743459376273677, | |
| "rewards/code_reward/std": 0.06066925637423992, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 193 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 747.75, | |
| "completions/max_terminated_length": 747.75, | |
| "completions/mean_length": 467.28125, | |
| "completions/mean_terminated_length": 467.28125, | |
| "completions/min_length": 274.25, | |
| "completions/min_terminated_length": 274.25, | |
| "epoch": 0.021715405065062265, | |
| "grad_norm": 1.087269428253631, | |
| "kl": 0.224853515625, | |
| "learning_rate": 3.6629378513883852e-06, | |
| "loss": -0.0435, | |
| "num_tokens": 6369656.0, | |
| "reward": 0.01468671576003544, | |
| "reward_std": 0.0163727342733182, | |
| "rewards/code_reward/mean": 0.01468671576003544, | |
| "rewards/code_reward/std": 0.0163727342733182, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 194 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 1376.0, | |
| "completions/max_terminated_length": 711.5, | |
| "completions/mean_length": 528.34375, | |
| "completions/mean_terminated_length": 428.1741180419922, | |
| "completions/min_length": 242.0, | |
| "completions/min_terminated_length": 242.0, | |
| "epoch": 0.021827340142717224, | |
| "grad_norm": 0.877705585064893, | |
| "kl": 0.191162109375, | |
| "learning_rate": 3.6495980241403307e-06, | |
| "loss": -0.0435, | |
| "num_tokens": 6402635.0, | |
| "reward": 0.28124301601201296, | |
| "reward_std": 0.1287369872443378, | |
| "rewards/code_reward/mean": 0.28124301601201296, | |
| "rewards/code_reward/std": 0.1287369979545474, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 195 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 1505.75, | |
| "completions/max_terminated_length": 884.0, | |
| "completions/mean_length": 629.875, | |
| "completions/mean_terminated_length": 531.4151840209961, | |
| "completions/min_length": 338.5, | |
| "completions/min_terminated_length": 338.5, | |
| "epoch": 0.021939275220372183, | |
| "grad_norm": 0.7532827929414431, | |
| "kl": 0.193115234375, | |
| "learning_rate": 3.636220451560896e-06, | |
| "loss": 0.067, | |
| "num_tokens": 6441607.0, | |
| "reward": 0.07744654751149938, | |
| "reward_std": 0.049496792489662766, | |
| "rewards/code_reward/mean": 0.07744654751149938, | |
| "rewards/code_reward/std": 0.04949679644778371, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 196 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.09375, | |
| "completions/max_length": 1397.75, | |
| "completions/max_terminated_length": 630.75, | |
| "completions/mean_length": 608.0, | |
| "completions/mean_terminated_length": 451.27679443359375, | |
| "completions/min_length": 247.0, | |
| "completions/min_terminated_length": 247.0, | |
| "epoch": 0.022051210298027143, | |
| "grad_norm": 0.8485874245870415, | |
| "kl": 0.191162109375, | |
| "learning_rate": 3.622805694946235e-06, | |
| "loss": -0.1349, | |
| "num_tokens": 6479383.0, | |
| "reward": 0.26938944309949875, | |
| "reward_std": 0.26605916023254395, | |
| "rewards/code_reward/mean": 0.26938944309949875, | |
| "rewards/code_reward/std": 0.26605917513370514, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 197 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 1237.75, | |
| "completions/max_terminated_length": 865.5, | |
| "completions/mean_length": 552.3125, | |
| "completions/mean_terminated_length": 501.0982208251953, | |
| "completions/min_length": 266.25, | |
| "completions/min_terminated_length": 266.25, | |
| "epoch": 0.022163145375682106, | |
| "grad_norm": 0.824562163366152, | |
| "kl": 0.19140625, | |
| "learning_rate": 3.609354317152667e-06, | |
| "loss": -0.025, | |
| "num_tokens": 6511257.0, | |
| "reward": 0.05427030206192285, | |
| "reward_std": 0.047577258897945285, | |
| "rewards/code_reward/mean": 0.05427030206192285, | |
| "rewards/code_reward/std": 0.04757725913077593, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 198 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 1021.5, | |
| "completions/max_terminated_length": 694.75, | |
| "completions/mean_length": 498.78125, | |
| "completions/mean_terminated_length": 447.3214340209961, | |
| "completions/min_length": 226.5, | |
| "completions/min_terminated_length": 226.5, | |
| "epoch": 0.022275080453337065, | |
| "grad_norm": 1.0082207311300992, | |
| "kl": 0.229736328125, | |
| "learning_rate": 3.595866882573063e-06, | |
| "loss": 0.0149, | |
| "num_tokens": 6541178.0, | |
| "reward": 0.2747242748737335, | |
| "reward_std": 0.2067141029983759, | |
| "rewards/code_reward/mean": 0.2747242748737335, | |
| "rewards/code_reward/std": 0.20671410486102104, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 199 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 980.0, | |
| "completions/max_terminated_length": 573.25, | |
| "completions/mean_length": 469.53125, | |
| "completions/mean_terminated_length": 416.4330368041992, | |
| "completions/min_length": 246.5, | |
| "completions/min_terminated_length": 246.5, | |
| "epoch": 0.022387015530992024, | |
| "grad_norm": 0.9952889000470718, | |
| "kl": 0.216064453125, | |
| "learning_rate": 3.5823439571131675e-06, | |
| "loss": -0.1906, | |
| "num_tokens": 6570243.0, | |
| "reward": 0.1142054102383554, | |
| "reward_std": 0.18550929613411427, | |
| "rewards/code_reward/mean": 0.1142054102383554, | |
| "rewards/code_reward/std": 0.18550931010395288, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 200 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 764.25, | |
| "completions/max_terminated_length": 764.25, | |
| "completions/mean_length": 500.96875, | |
| "completions/mean_terminated_length": 500.96875, | |
| "completions/min_length": 323.5, | |
| "completions/min_terminated_length": 323.5, | |
| "epoch": 0.022498950608646984, | |
| "grad_norm": 0.9455318918616956, | |
| "kl": 0.227294921875, | |
| "learning_rate": 3.5687861081678477e-06, | |
| "loss": 0.031, | |
| "num_tokens": 6603946.0, | |
| "reward": 0.16931893583387136, | |
| "reward_std": 0.20531310141086578, | |
| "rewards/code_reward/mean": 0.16931893583387136, | |
| "rewards/code_reward/std": 0.20531310513615608, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 201 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 1030.5, | |
| "completions/max_terminated_length": 688.5, | |
| "completions/mean_length": 418.1875, | |
| "completions/mean_terminated_length": 367.9821472167969, | |
| "completions/min_length": 194.75, | |
| "completions/min_terminated_length": 194.75, | |
| "epoch": 0.022610885686301947, | |
| "grad_norm": 1.0325105547845679, | |
| "kl": 0.2138671875, | |
| "learning_rate": 3.555193904597291e-06, | |
| "loss": 0.0613, | |
| "num_tokens": 6636552.0, | |
| "reward": 0.3791414946317673, | |
| "reward_std": 0.17875608056783676, | |
| "rewards/code_reward/mean": 0.3791414946317673, | |
| "rewards/code_reward/std": 0.17875608801841736, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 202 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 754.25, | |
| "completions/max_terminated_length": 754.25, | |
| "completions/mean_length": 455.34375, | |
| "completions/mean_terminated_length": 455.34375, | |
| "completions/min_length": 257.0, | |
| "completions/min_terminated_length": 257.0, | |
| "epoch": 0.022722820763956906, | |
| "grad_norm": 0.9727989512601006, | |
| "kl": 0.22119140625, | |
| "learning_rate": 3.541567916703138e-06, | |
| "loss": 0.0159, | |
| "num_tokens": 6668595.0, | |
| "reward": 0.19191165082156658, | |
| "reward_std": 0.15741402097046375, | |
| "rewards/code_reward/mean": 0.19191165082156658, | |
| "rewards/code_reward/std": 0.15741403214633465, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 203 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 989.25, | |
| "completions/max_terminated_length": 615.5, | |
| "completions/mean_length": 476.40625, | |
| "completions/mean_terminated_length": 425.71875762939453, | |
| "completions/min_length": 255.25, | |
| "completions/min_terminated_length": 255.25, | |
| "epoch": 0.022834755841611865, | |
| "grad_norm": 0.7440983993424026, | |
| "kl": 0.203857421875, | |
| "learning_rate": 3.5279087162045517e-06, | |
| "loss": 0.0571, | |
| "num_tokens": 6702376.0, | |
| "reward": 0.11487132962793112, | |
| "reward_std": 0.1093948557972908, | |
| "rewards/code_reward/mean": 0.11487132962793112, | |
| "rewards/code_reward/std": 0.1093948557972908, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 204 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 861.25, | |
| "completions/max_terminated_length": 861.25, | |
| "completions/mean_length": 514.59375, | |
| "completions/mean_terminated_length": 514.59375, | |
| "completions/min_length": 285.0, | |
| "completions/min_terminated_length": 285.0, | |
| "epoch": 0.022946690919266825, | |
| "grad_norm": 0.9601225973651024, | |
| "kl": 0.18505859375, | |
| "learning_rate": 3.5142168762142265e-06, | |
| "loss": 0.0168, | |
| "num_tokens": 6739939.0, | |
| "reward": 0.07519801473245025, | |
| "reward_std": 0.09981238306500018, | |
| "rewards/code_reward/mean": 0.07519801473245025, | |
| "rewards/code_reward/std": 0.09981238329783082, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 205 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 914.25, | |
| "completions/max_terminated_length": 670.75, | |
| "completions/mean_length": 427.03125, | |
| "completions/mean_terminated_length": 379.75, | |
| "completions/min_length": 184.25, | |
| "completions/min_terminated_length": 184.25, | |
| "epoch": 0.023058625996921784, | |
| "grad_norm": 0.8984794673889067, | |
| "kl": 0.207763671875, | |
| "learning_rate": 3.500492971214347e-06, | |
| "loss": 0.1382, | |
| "num_tokens": 6769180.0, | |
| "reward": 0.22681757621467113, | |
| "reward_std": 0.20832497254014015, | |
| "rewards/code_reward/mean": 0.22681757621467113, | |
| "rewards/code_reward/std": 0.20832498744130135, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 206 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 1280.25, | |
| "completions/max_terminated_length": 1128.0, | |
| "completions/mean_length": 650.65625, | |
| "completions/mean_terminated_length": 608.4821472167969, | |
| "completions/min_length": 309.0, | |
| "completions/min_terminated_length": 309.0, | |
| "epoch": 0.023170561074576747, | |
| "grad_norm": 0.7922918264942025, | |
| "kl": 0.1728515625, | |
| "learning_rate": 3.48673757703248e-06, | |
| "loss": -0.0944, | |
| "num_tokens": 6805289.0, | |
| "reward": 0.14009581343270838, | |
| "reward_std": 0.11429419624619186, | |
| "rewards/code_reward/mean": 0.14009581343270838, | |
| "rewards/code_reward/std": 0.11429420742206275, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 207 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 1129.5, | |
| "completions/max_terminated_length": 776.25, | |
| "completions/mean_length": 539.34375, | |
| "completions/mean_terminated_length": 491.6026916503906, | |
| "completions/min_length": 313.0, | |
| "completions/min_terminated_length": 313.0, | |
| "epoch": 0.023282496152231706, | |
| "grad_norm": 0.9119324704049495, | |
| "kl": 0.1783447265625, | |
| "learning_rate": 3.472951270817418e-06, | |
| "loss": -0.064, | |
| "num_tokens": 6837436.0, | |
| "reward": 0.05461701576132327, | |
| "reward_std": 0.09534355666255578, | |
| "rewards/code_reward/mean": 0.05461701576132327, | |
| "rewards/code_reward/std": 0.09534356038784608, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 208 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 903.25, | |
| "completions/max_terminated_length": 903.25, | |
| "completions/mean_length": 384.75, | |
| "completions/mean_terminated_length": 384.75, | |
| "completions/min_length": 159.75, | |
| "completions/min_terminated_length": 159.75, | |
| "epoch": 0.023394431229886666, | |
| "grad_norm": 1.2240979756453376, | |
| "kl": 0.183349609375, | |
| "learning_rate": 3.4591346310149578e-06, | |
| "loss": 0.0503, | |
| "num_tokens": 6864492.0, | |
| "reward": 0.4224591121310368, | |
| "reward_std": 0.2887880225898698, | |
| "rewards/code_reward/mean": 0.4224591121310368, | |
| "rewards/code_reward/std": 0.2887880523921922, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 209 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 767.75, | |
| "completions/max_terminated_length": 767.75, | |
| "completions/mean_length": 399.28125, | |
| "completions/mean_terminated_length": 399.28125, | |
| "completions/min_length": 212.0, | |
| "completions/min_terminated_length": 212.0, | |
| "epoch": 0.023506366307541625, | |
| "grad_norm": 1.1994406910258044, | |
| "kl": 0.204833984375, | |
| "learning_rate": 3.445288237343632e-06, | |
| "loss": -0.0509, | |
| "num_tokens": 6891213.0, | |
| "reward": 0.09320073015987873, | |
| "reward_std": 0.12864024192094803, | |
| "rewards/code_reward/mean": 0.09320073015987873, | |
| "rewards/code_reward/std": 0.12864025123417377, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 210 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 1011.25, | |
| "completions/max_terminated_length": 669.75, | |
| "completions/mean_length": 461.25, | |
| "completions/mean_terminated_length": 410.2276840209961, | |
| "completions/min_length": 192.0, | |
| "completions/min_terminated_length": 192.0, | |
| "epoch": 0.023618301385196588, | |
| "grad_norm": 1.1498675164259378, | |
| "kl": 0.2158203125, | |
| "learning_rate": 3.4314126707703895e-06, | |
| "loss": 0.0824, | |
| "num_tokens": 6919749.0, | |
| "reward": 0.2663097037002444, | |
| "reward_std": 0.21830029226839542, | |
| "rewards/code_reward/mean": 0.2663097037002444, | |
| "rewards/code_reward/std": 0.21830029599368572, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 211 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 868.25, | |
| "completions/max_terminated_length": 621.75, | |
| "completions/mean_length": 385.84375, | |
| "completions/mean_terminated_length": 338.625, | |
| "completions/min_length": 178.0, | |
| "completions/min_terminated_length": 178.0, | |
| "epoch": 0.023730236462851547, | |
| "grad_norm": 1.220829227031836, | |
| "kl": 0.20703125, | |
| "learning_rate": 3.4175085134862128e-06, | |
| "loss": 0.1624, | |
| "num_tokens": 6948192.0, | |
| "reward": 0.27685857750475407, | |
| "reward_std": 0.24184924457222223, | |
| "rewards/code_reward/mean": 0.27685857750475407, | |
| "rewards/code_reward/std": 0.24184925481677055, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 212 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 540.25, | |
| "completions/max_terminated_length": 540.25, | |
| "completions/mean_length": 334.1875, | |
| "completions/mean_terminated_length": 334.1875, | |
| "completions/min_length": 196.25, | |
| "completions/min_terminated_length": 196.25, | |
| "epoch": 0.023842171540506506, | |
| "grad_norm": 1.0550764471568521, | |
| "kl": 0.224365234375, | |
| "learning_rate": 3.4035763488816953e-06, | |
| "loss": 0.1182, | |
| "num_tokens": 6973222.0, | |
| "reward": 0.5495182275772095, | |
| "reward_std": 0.17330202460289001, | |
| "rewards/code_reward/mean": 0.5495182275772095, | |
| "rewards/code_reward/std": 0.17330202646553516, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 213 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 656.0, | |
| "completions/max_terminated_length": 656.0, | |
| "completions/mean_length": 360.46875, | |
| "completions/mean_terminated_length": 360.46875, | |
| "completions/min_length": 144.75, | |
| "completions/min_terminated_length": 144.75, | |
| "epoch": 0.023954106618161466, | |
| "grad_norm": 1.0108308143941853, | |
| "kl": 0.258544921875, | |
| "learning_rate": 3.3896167615225594e-06, | |
| "loss": 0.0636, | |
| "num_tokens": 6998765.0, | |
| "reward": 0.13169488031417131, | |
| "reward_std": 0.07389534078538418, | |
| "rewards/code_reward/mean": 0.13169488031417131, | |
| "rewards/code_reward/std": 0.07389534404501319, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 214 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 1026.5, | |
| "completions/max_terminated_length": 619.5, | |
| "completions/mean_length": 401.0, | |
| "completions/mean_terminated_length": 346.1383972167969, | |
| "completions/min_length": 137.25, | |
| "completions/min_terminated_length": 137.25, | |
| "epoch": 0.024066041695816425, | |
| "grad_norm": 0.750694546999131, | |
| "kl": 0.19580078125, | |
| "learning_rate": 3.375630337125133e-06, | |
| "loss": 0.1223, | |
| "num_tokens": 7028501.0, | |
| "reward": 0.07791783940047026, | |
| "reward_std": 0.08552672585938126, | |
| "rewards/code_reward/mean": 0.07791783940047026, | |
| "rewards/code_reward/std": 0.0855267186416313, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 215 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 568.0, | |
| "completions/max_terminated_length": 568.0, | |
| "completions/mean_length": 329.9375, | |
| "completions/mean_terminated_length": 329.9375, | |
| "completions/min_length": 163.75, | |
| "completions/min_terminated_length": 163.75, | |
| "epoch": 0.024177976773471388, | |
| "grad_norm": 0.9861460225219651, | |
| "kl": 0.20947265625, | |
| "learning_rate": 3.361617662531772e-06, | |
| "loss": 0.0307, | |
| "num_tokens": 7059667.0, | |
| "reward": 0.28345959074795246, | |
| "reward_std": 0.10491538979113102, | |
| "rewards/code_reward/mean": 0.28345959074795246, | |
| "rewards/code_reward/std": 0.10491538792848587, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 216 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 487.5, | |
| "completions/max_terminated_length": 487.5, | |
| "completions/mean_length": 300.9375, | |
| "completions/mean_terminated_length": 300.9375, | |
| "completions/min_length": 150.5, | |
| "completions/min_terminated_length": 150.5, | |
| "epoch": 0.024289911851126347, | |
| "grad_norm": 1.2867180675052194, | |
| "kl": 0.19677734375, | |
| "learning_rate": 3.347579325686237e-06, | |
| "loss": 0.0498, | |
| "num_tokens": 7084721.0, | |
| "reward": 0.38907771836966276, | |
| "reward_std": 0.32206146977841854, | |
| "rewards/code_reward/mean": 0.38907771836966276, | |
| "rewards/code_reward/std": 0.3220614865422249, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 217 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 508.75, | |
| "completions/max_terminated_length": 508.75, | |
| "completions/mean_length": 332.0625, | |
| "completions/mean_terminated_length": 332.0625, | |
| "completions/min_length": 201.25, | |
| "completions/min_terminated_length": 201.25, | |
| "epoch": 0.024401846928781307, | |
| "grad_norm": 1.221746654434671, | |
| "kl": 0.192626953125, | |
| "learning_rate": 3.333515915609027e-06, | |
| "loss": -0.0326, | |
| "num_tokens": 7112387.0, | |
| "reward": 0.05860341805964708, | |
| "reward_std": 0.07969626039266586, | |
| "rewards/code_reward/mean": 0.05860341805964708, | |
| "rewards/code_reward/std": 0.07969625853002071, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 218 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 567.25, | |
| "completions/max_terminated_length": 567.25, | |
| "completions/mean_length": 317.25, | |
| "completions/mean_terminated_length": 317.25, | |
| "completions/min_length": 156.0, | |
| "completions/min_terminated_length": 156.0, | |
| "epoch": 0.024513782006436266, | |
| "grad_norm": 1.0695893302551942, | |
| "kl": 0.23388671875, | |
| "learning_rate": 3.3194280223726616e-06, | |
| "loss": 0.027, | |
| "num_tokens": 7138323.0, | |
| "reward": 0.14415738731622696, | |
| "reward_std": 0.14080366492271423, | |
| "rewards/code_reward/mean": 0.14415738731622696, | |
| "rewards/code_reward/std": 0.14080367609858513, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 219 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 387.5, | |
| "completions/max_terminated_length": 387.5, | |
| "completions/mean_length": 239.6875, | |
| "completions/mean_terminated_length": 239.6875, | |
| "completions/min_length": 113.0, | |
| "completions/min_terminated_length": 113.0, | |
| "epoch": 0.024625717084091225, | |
| "grad_norm": 1.763480550756643, | |
| "kl": 0.20849609375, | |
| "learning_rate": 3.305316237076927e-06, | |
| "loss": -0.1439, | |
| "num_tokens": 7159529.0, | |
| "reward": 0.10015321767423302, | |
| "reward_std": 0.17220470518805087, | |
| "rewards/code_reward/mean": 0.10015321767423302, | |
| "rewards/code_reward/std": 0.17220470635220408, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 220 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 487.5, | |
| "completions/max_terminated_length": 487.5, | |
| "completions/mean_length": 327.15625, | |
| "completions/mean_terminated_length": 327.15625, | |
| "completions/min_length": 177.5, | |
| "completions/min_terminated_length": 177.5, | |
| "epoch": 0.024737652161746188, | |
| "grad_norm": 0.8169319139286209, | |
| "kl": 0.1607666015625, | |
| "learning_rate": 3.291181151824071e-06, | |
| "loss": 0.0895, | |
| "num_tokens": 7191342.0, | |
| "reward": 0.1822916641831398, | |
| "reward_std": 0.2553221881389618, | |
| "rewards/code_reward/mean": 0.1822916641831398, | |
| "rewards/code_reward/std": 0.2553221881389618, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 221 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 436.25, | |
| "completions/max_terminated_length": 436.25, | |
| "completions/mean_length": 269.84375, | |
| "completions/mean_terminated_length": 269.84375, | |
| "completions/min_length": 153.25, | |
| "completions/min_terminated_length": 153.25, | |
| "epoch": 0.024849587239401148, | |
| "grad_norm": 1.1996415284846553, | |
| "kl": 0.2158203125, | |
| "learning_rate": 3.27702335969396e-06, | |
| "loss": -0.0201, | |
| "num_tokens": 7216737.0, | |
| "reward": 0.10358373820781708, | |
| "reward_std": 0.1254219285910949, | |
| "rewards/code_reward/mean": 0.10358373820781708, | |
| "rewards/code_reward/std": 0.1254219323163852, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 222 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 349.5, | |
| "completions/max_terminated_length": 349.5, | |
| "completions/mean_length": 217.03125, | |
| "completions/mean_terminated_length": 217.03125, | |
| "completions/min_length": 117.0, | |
| "completions/min_terminated_length": 117.0, | |
| "epoch": 0.024961522317056107, | |
| "grad_norm": 1.2608189223620252, | |
| "kl": 0.2314453125, | |
| "learning_rate": 3.2628434547191985e-06, | |
| "loss": 0.0994, | |
| "num_tokens": 7235498.0, | |
| "reward": 0.14160977257415652, | |
| "reward_std": 0.0918192695826292, | |
| "rewards/code_reward/mean": 0.14160977257415652, | |
| "rewards/code_reward/std": 0.0918192733079195, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 223 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 318.0, | |
| "completions/max_terminated_length": 318.0, | |
| "completions/mean_length": 223.5625, | |
| "completions/mean_terminated_length": 223.5625, | |
| "completions/min_length": 149.25, | |
| "completions/min_terminated_length": 149.25, | |
| "epoch": 0.025073457394711066, | |
| "grad_norm": 1.600431868786668, | |
| "kl": 0.225341796875, | |
| "learning_rate": 3.2486420318601973e-06, | |
| "loss": 0.0364, | |
| "num_tokens": 7262236.0, | |
| "reward": 0.27536666474770755, | |
| "reward_std": 0.14216232020407915, | |
| "rewards/code_reward/mean": 0.27536666474770755, | |
| "rewards/code_reward/std": 0.14216232066974044, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 224 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 559.75, | |
| "completions/max_terminated_length": 559.75, | |
| "completions/mean_length": 338.53125, | |
| "completions/mean_terminated_length": 338.53125, | |
| "completions/min_length": 173.0, | |
| "completions/min_terminated_length": 173.0, | |
| "epoch": 0.02518539247236603, | |
| "grad_norm": 0.8719466687856433, | |
| "kl": 0.18359375, | |
| "learning_rate": 3.2344196869802187e-06, | |
| "loss": 0.0315, | |
| "num_tokens": 7298189.0, | |
| "reward": 0.015560166910290718, | |
| "reward_std": 0.012789241969585419, | |
| "rewards/code_reward/mean": 0.015560166910290718, | |
| "rewards/code_reward/std": 0.012789241969585419, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 225 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 465.25, | |
| "completions/max_terminated_length": 465.25, | |
| "completions/mean_length": 276.65625, | |
| "completions/mean_terminated_length": 276.65625, | |
| "completions/min_length": 160.0, | |
| "completions/min_terminated_length": 160.0, | |
| "epoch": 0.02529732755002099, | |
| "grad_norm": 1.0186655813271028, | |
| "kl": 0.19287109375, | |
| "learning_rate": 3.2201770168203694e-06, | |
| "loss": 0.115, | |
| "num_tokens": 7334746.0, | |
| "reward": 0.10499188816174865, | |
| "reward_std": 0.10288760857656598, | |
| "rewards/code_reward/mean": 0.10499188816174865, | |
| "rewards/code_reward/std": 0.10288760880939662, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 226 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 314.75, | |
| "completions/max_terminated_length": 314.75, | |
| "completions/mean_length": 205.90625, | |
| "completions/mean_terminated_length": 205.90625, | |
| "completions/min_length": 130.75, | |
| "completions/min_terminated_length": 130.75, | |
| "epoch": 0.025409262627675948, | |
| "grad_norm": 0.7605637028441592, | |
| "kl": 0.16796875, | |
| "learning_rate": 3.205914618974563e-06, | |
| "loss": 0.0079, | |
| "num_tokens": 7353919.0, | |
| "reward": 0.01245777029544115, | |
| "reward_std": 0.01250904705375433, | |
| "rewards/code_reward/mean": 0.01245777029544115, | |
| "rewards/code_reward/std": 0.01250904705375433, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 227 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 410.0, | |
| "completions/max_terminated_length": 410.0, | |
| "completions/mean_length": 191.46875, | |
| "completions/mean_terminated_length": 191.46875, | |
| "completions/min_length": 112.0, | |
| "completions/min_terminated_length": 112.0, | |
| "epoch": 0.025521197705330907, | |
| "grad_norm": 1.2858515605192504, | |
| "kl": 0.259765625, | |
| "learning_rate": 3.1916330918644496e-06, | |
| "loss": 0.0768, | |
| "num_tokens": 7377150.0, | |
| "reward": 0.12344044167548418, | |
| "reward_std": 0.12209718860685825, | |
| "rewards/code_reward/mean": 0.12344044167548418, | |
| "rewards/code_reward/std": 0.12209718953818083, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 228 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 326.0, | |
| "completions/max_terminated_length": 326.0, | |
| "completions/mean_length": 226.6875, | |
| "completions/mean_terminated_length": 226.6875, | |
| "completions/min_length": 149.75, | |
| "completions/min_terminated_length": 149.75, | |
| "epoch": 0.025633132782985867, | |
| "grad_norm": 0.9529571983949628, | |
| "kl": 0.20458984375, | |
| "learning_rate": 3.177333034714303e-06, | |
| "loss": -0.0135, | |
| "num_tokens": 7402444.0, | |
| "reward": 0.06041666865348816, | |
| "reward_std": 0.038540102541446686, | |
| "rewards/code_reward/mean": 0.06041666865348816, | |
| "rewards/code_reward/std": 0.038540102541446686, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 229 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 312.75, | |
| "completions/max_terminated_length": 312.75, | |
| "completions/mean_length": 147.1875, | |
| "completions/mean_terminated_length": 147.1875, | |
| "completions/min_length": 67.25, | |
| "completions/min_terminated_length": 67.25, | |
| "epoch": 0.02574506786064083, | |
| "grad_norm": 1.403338545743401, | |
| "kl": 0.30029296875, | |
| "learning_rate": 3.1630150475258813e-06, | |
| "loss": 0.0342, | |
| "num_tokens": 7429962.0, | |
| "reward": 0.0703125, | |
| "reward_std": 0.11608850955963135, | |
| "rewards/code_reward/mean": 0.0703125, | |
| "rewards/code_reward/std": 0.1160885114222765, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 230 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 374.0, | |
| "completions/max_terminated_length": 374.0, | |
| "completions/mean_length": 183.46875, | |
| "completions/mean_terminated_length": 183.46875, | |
| "completions/min_length": 92.75, | |
| "completions/min_terminated_length": 92.75, | |
| "epoch": 0.02585700293829579, | |
| "grad_norm": 1.2487410703105084, | |
| "kl": 0.259765625, | |
| "learning_rate": 3.148679731053252e-06, | |
| "loss": -0.0378, | |
| "num_tokens": 7455169.0, | |
| "reward": 0.21321137621998787, | |
| "reward_std": 0.2805868834257126, | |
| "rewards/code_reward/mean": 0.21321137621998787, | |
| "rewards/code_reward/std": 0.2805868834257126, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 231 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 303.25, | |
| "completions/max_terminated_length": 303.25, | |
| "completions/mean_length": 153.84375, | |
| "completions/mean_terminated_length": 153.84375, | |
| "completions/min_length": 89.75, | |
| "completions/min_terminated_length": 89.75, | |
| "epoch": 0.025968938015950748, | |
| "grad_norm": 1.0579237191929745, | |
| "kl": 0.232421875, | |
| "learning_rate": 3.1343276867775805e-06, | |
| "loss": 0.0811, | |
| "num_tokens": 7480004.0, | |
| "reward": 0.1274509804788977, | |
| "reward_std": 0.2177756354212761, | |
| "rewards/code_reward/mean": 0.1274509804788977, | |
| "rewards/code_reward/std": 0.2177756503224373, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 232 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 246.75, | |
| "completions/max_terminated_length": 246.75, | |
| "completions/mean_length": 166.1875, | |
| "completions/mean_terminated_length": 166.1875, | |
| "completions/min_length": 100.5, | |
| "completions/min_terminated_length": 100.5, | |
| "epoch": 0.026080873093605707, | |
| "grad_norm": 1.373681707551141, | |
| "kl": 0.2509765625, | |
| "learning_rate": 3.1199595168819043e-06, | |
| "loss": 0.005, | |
| "num_tokens": 7508034.0, | |
| "reward": 0.2599347122013569, | |
| "reward_std": 0.22181765362620354, | |
| "rewards/code_reward/mean": 0.2599347122013569, | |
| "rewards/code_reward/std": 0.22181766107678413, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 233 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 369.25, | |
| "completions/max_terminated_length": 369.25, | |
| "completions/mean_length": 237.15625, | |
| "completions/mean_terminated_length": 237.15625, | |
| "completions/min_length": 121.0, | |
| "completions/min_terminated_length": 121.0, | |
| "epoch": 0.02619280817126067, | |
| "grad_norm": 1.449039323532629, | |
| "kl": 0.205322265625, | |
| "learning_rate": 3.105575824225852e-06, | |
| "loss": 0.0024, | |
| "num_tokens": 7536911.0, | |
| "reward": 0.17171062319539487, | |
| "reward_std": 0.1438203388825059, | |
| "rewards/code_reward/mean": 0.17171062319539487, | |
| "rewards/code_reward/std": 0.1438203463330865, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 234 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 345.75, | |
| "completions/max_terminated_length": 345.75, | |
| "completions/mean_length": 170.21875, | |
| "completions/mean_terminated_length": 170.21875, | |
| "completions/min_length": 95.75, | |
| "completions/min_terminated_length": 95.75, | |
| "epoch": 0.02630474324891563, | |
| "grad_norm": 1.0698079544355648, | |
| "kl": 0.1624755859375, | |
| "learning_rate": 3.091177212320363e-06, | |
| "loss": -0.1894, | |
| "num_tokens": 7554142.0, | |
| "reward": 0.3125, | |
| "reward_std": 0.1462521031498909, | |
| "rewards/code_reward/mean": 0.3125, | |
| "rewards/code_reward/std": 0.1462521031498909, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 235 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 269.0, | |
| "completions/max_terminated_length": 269.0, | |
| "completions/mean_length": 175.46875, | |
| "completions/mean_terminated_length": 175.46875, | |
| "completions/min_length": 112.75, | |
| "completions/min_terminated_length": 112.75, | |
| "epoch": 0.02641667832657059, | |
| "grad_norm": 1.4540255844594339, | |
| "kl": 0.2158203125, | |
| "learning_rate": 3.0767642853023538e-06, | |
| "loss": -0.0223, | |
| "num_tokens": 7584357.0, | |
| "reward": 0.21975820884108543, | |
| "reward_std": 0.1363154649734497, | |
| "rewards/code_reward/mean": 0.21975820884108543, | |
| "rewards/code_reward/std": 0.1363154649734497, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 236 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 228.25, | |
| "completions/max_terminated_length": 228.25, | |
| "completions/mean_length": 116.59375, | |
| "completions/mean_terminated_length": 116.59375, | |
| "completions/min_length": 75.0, | |
| "completions/min_terminated_length": 75.0, | |
| "epoch": 0.02652861340422555, | |
| "grad_norm": 1.9465041168415773, | |
| "kl": 0.247802734375, | |
| "learning_rate": 3.062337647909376e-06, | |
| "loss": -0.039, | |
| "num_tokens": 7602040.0, | |
| "reward": 0.4692905358970165, | |
| "reward_std": 0.24660581722855568, | |
| "rewards/code_reward/mean": 0.4692905358970165, | |
| "rewards/code_reward/std": 0.24660583958029747, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 237 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 583.75, | |
| "completions/max_terminated_length": 583.75, | |
| "completions/mean_length": 281.21875, | |
| "completions/mean_terminated_length": 281.21875, | |
| "completions/min_length": 133.5, | |
| "completions/min_terminated_length": 133.5, | |
| "epoch": 0.026640548481880508, | |
| "grad_norm": 1.333783365108213, | |
| "kl": 0.1785888671875, | |
| "learning_rate": 3.04789790545424e-06, | |
| "loss": 0.0396, | |
| "num_tokens": 7627319.0, | |
| "reward": 0.17131002363748848, | |
| "reward_std": 0.18950149056036025, | |
| "rewards/code_reward/mean": 0.17131002363748848, | |
| "rewards/code_reward/std": 0.18950149248121306, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 238 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 231.5, | |
| "completions/max_terminated_length": 231.5, | |
| "completions/mean_length": 128.28125, | |
| "completions/mean_terminated_length": 128.28125, | |
| "completions/min_length": 69.25, | |
| "completions/min_terminated_length": 69.25, | |
| "epoch": 0.02675248355953547, | |
| "grad_norm": 0.8510436652715602, | |
| "kl": 0.250732421875, | |
| "learning_rate": 3.033445663799621e-06, | |
| "loss": -0.0327, | |
| "num_tokens": 7644360.0, | |
| "reward": 0.07549504935741425, | |
| "reward_std": 0.07982433587312698, | |
| "rewards/code_reward/mean": 0.07549504935741425, | |
| "rewards/code_reward/std": 0.07982433587312698, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 239 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 198.25, | |
| "completions/max_terminated_length": 198.25, | |
| "completions/mean_length": 118.28125, | |
| "completions/mean_terminated_length": 118.28125, | |
| "completions/min_length": 79.5, | |
| "completions/min_terminated_length": 79.5, | |
| "epoch": 0.02686441863719043, | |
| "grad_norm": 2.420152417522015, | |
| "kl": 0.25146484375, | |
| "learning_rate": 3.018981529332633e-06, | |
| "loss": 0.0544, | |
| "num_tokens": 7661793.0, | |
| "reward": 0.07239184161880985, | |
| "reward_std": 0.05399157607462257, | |
| "rewards/code_reward/mean": 0.07239184161880985, | |
| "rewards/code_reward/std": 0.05399157712236047, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 240 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 332.0, | |
| "completions/max_terminated_length": 332.0, | |
| "completions/mean_length": 212.125, | |
| "completions/mean_terminated_length": 212.125, | |
| "completions/min_length": 119.25, | |
| "completions/min_terminated_length": 119.25, | |
| "epoch": 0.02697635371484539, | |
| "grad_norm": 1.3615781255434425, | |
| "kl": 0.1885986328125, | |
| "learning_rate": 3.00450610893939e-06, | |
| "loss": 0.0817, | |
| "num_tokens": 7685573.0, | |
| "reward": 0.2050044471397996, | |
| "reward_std": 0.1304325871169567, | |
| "rewards/code_reward/mean": 0.2050044471397996, | |
| "rewards/code_reward/std": 0.13043258781544864, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 241 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 230.75, | |
| "completions/max_terminated_length": 230.75, | |
| "completions/mean_length": 144.375, | |
| "completions/mean_terminated_length": 144.375, | |
| "completions/min_length": 89.5, | |
| "completions/min_terminated_length": 89.5, | |
| "epoch": 0.02708828879250035, | |
| "grad_norm": 1.4458971248752004, | |
| "kl": 0.3115234375, | |
| "learning_rate": 2.9900200099795396e-06, | |
| "loss": 0.1362, | |
| "num_tokens": 7711785.0, | |
| "reward": 0.12025879789143801, | |
| "reward_std": 0.10159321606624871, | |
| "rewards/code_reward/mean": 0.12025879789143801, | |
| "rewards/code_reward/std": 0.10159321606624871, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 242 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 615.5, | |
| "completions/max_terminated_length": 615.5, | |
| "completions/mean_length": 211.875, | |
| "completions/mean_terminated_length": 211.875, | |
| "completions/min_length": 117.75, | |
| "completions/min_terminated_length": 117.75, | |
| "epoch": 0.02720022387015531, | |
| "grad_norm": 0.9925754447279824, | |
| "kl": 0.1759033203125, | |
| "learning_rate": 2.9755238402607826e-06, | |
| "loss": 0.0145, | |
| "num_tokens": 7736909.0, | |
| "reward": 0.2357253096997738, | |
| "reward_std": 0.11498994007706642, | |
| "rewards/code_reward/mean": 0.2357253096997738, | |
| "rewards/code_reward/std": 0.11498994193971157, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 243 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 201.25, | |
| "completions/max_terminated_length": 201.25, | |
| "completions/mean_length": 117.375, | |
| "completions/mean_terminated_length": 117.375, | |
| "completions/min_length": 68.75, | |
| "completions/min_terminated_length": 68.75, | |
| "epoch": 0.02731215894781027, | |
| "grad_norm": 1.7694959761730793, | |
| "kl": 0.1773681640625, | |
| "learning_rate": 2.961018208013367e-06, | |
| "loss": 0.0806, | |
| "num_tokens": 7753785.0, | |
| "reward": 0.26032672822475433, | |
| "reward_std": 0.2158903395757079, | |
| "rewards/code_reward/mean": 0.26032672822475433, | |
| "rewards/code_reward/std": 0.21589034423232079, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 244 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 259.5, | |
| "completions/max_terminated_length": 259.5, | |
| "completions/mean_length": 134.0, | |
| "completions/mean_terminated_length": 134.0, | |
| "completions/min_length": 77.75, | |
| "completions/min_terminated_length": 77.75, | |
| "epoch": 0.02742409402546523, | |
| "grad_norm": 1.5239142070937466, | |
| "kl": 0.1982421875, | |
| "learning_rate": 2.9465037218645694e-06, | |
| "loss": 0.0341, | |
| "num_tokens": 7770921.0, | |
| "reward": 0.15353127755224705, | |
| "reward_std": 0.1622099713422358, | |
| "rewards/code_reward/mean": 0.15353127755224705, | |
| "rewards/code_reward/std": 0.16220997110940516, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 245 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 193.25, | |
| "completions/max_terminated_length": 193.25, | |
| "completions/mean_length": 129.40625, | |
| "completions/mean_terminated_length": 129.40625, | |
| "completions/min_length": 83.75, | |
| "completions/min_terminated_length": 83.75, | |
| "epoch": 0.02753602910312019, | |
| "grad_norm": 1.4700135521247835, | |
| "kl": 0.311767578125, | |
| "learning_rate": 2.9319809908131604e-06, | |
| "loss": -0.0235, | |
| "num_tokens": 7793438.0, | |
| "reward": 0.22987624257802963, | |
| "reward_std": 0.19782325625419617, | |
| "rewards/code_reward/mean": 0.22987624257802963, | |
| "rewards/code_reward/std": 0.19782325625419617, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 246 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 221.25, | |
| "completions/max_terminated_length": 221.25, | |
| "completions/mean_length": 132.5625, | |
| "completions/mean_terminated_length": 132.5625, | |
| "completions/min_length": 78.0, | |
| "completions/min_terminated_length": 78.0, | |
| "epoch": 0.02764796418077515, | |
| "grad_norm": 0.7938111312829735, | |
| "kl": 0.2685546875, | |
| "learning_rate": 2.917450624203847e-06, | |
| "loss": 0.0108, | |
| "num_tokens": 7811344.0, | |
| "reward": 0.1285112500190735, | |
| "reward_std": 0.03530046343803406, | |
| "rewards/code_reward/mean": 0.1285112500190735, | |
| "rewards/code_reward/std": 0.03530046343803406, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 247 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 283.0, | |
| "completions/max_terminated_length": 283.0, | |
| "completions/mean_length": 155.75, | |
| "completions/mean_terminated_length": 155.75, | |
| "completions/min_length": 72.75, | |
| "completions/min_terminated_length": 72.75, | |
| "epoch": 0.02775989925843011, | |
| "grad_norm": 1.461851676320045, | |
| "kl": 0.321533203125, | |
| "learning_rate": 2.9029132317017118e-06, | |
| "loss": 0.0822, | |
| "num_tokens": 7836400.0, | |
| "reward": 0.07058638549642637, | |
| "reward_std": 0.09412376256659627, | |
| "rewards/code_reward/mean": 0.07058638549642637, | |
| "rewards/code_reward/std": 0.09412377001717687, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 248 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 184.25, | |
| "completions/max_terminated_length": 184.25, | |
| "completions/mean_length": 131.59375, | |
| "completions/mean_terminated_length": 131.59375, | |
| "completions/min_length": 88.0, | |
| "completions/min_terminated_length": 88.0, | |
| "epoch": 0.02787183433608507, | |
| "grad_norm": 1.6504030899968662, | |
| "kl": 0.275634765625, | |
| "learning_rate": 2.888369423266629e-06, | |
| "loss": 0.0701, | |
| "num_tokens": 7857059.0, | |
| "reward": 0.059203914599493146, | |
| "reward_std": 0.09994567523244768, | |
| "rewards/code_reward/mean": 0.059203914599493146, | |
| "rewards/code_reward/std": 0.09994568361435086, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 249 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 415.5, | |
| "completions/max_terminated_length": 415.5, | |
| "completions/mean_length": 206.15625, | |
| "completions/mean_terminated_length": 206.15625, | |
| "completions/min_length": 104.25, | |
| "completions/min_terminated_length": 104.25, | |
| "epoch": 0.02798376941374003, | |
| "grad_norm": 1.159655895359276, | |
| "kl": 0.165283203125, | |
| "learning_rate": 2.8738198091276712e-06, | |
| "loss": -0.0308, | |
| "num_tokens": 7882080.0, | |
| "reward": 0.10000000149011612, | |
| "reward_std": 0.1331607922911644, | |
| "rewards/code_reward/mean": 0.10000000149011612, | |
| "rewards/code_reward/std": 0.1331607922911644, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 250 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 266.25, | |
| "completions/max_terminated_length": 266.25, | |
| "completions/mean_length": 179.0, | |
| "completions/mean_terminated_length": 179.0, | |
| "completions/min_length": 106.0, | |
| "completions/min_terminated_length": 106.0, | |
| "epoch": 0.02809570449139499, | |
| "grad_norm": 1.3860909228149765, | |
| "kl": 0.3243408203125, | |
| "learning_rate": 2.859264999757509e-06, | |
| "loss": -0.0087, | |
| "num_tokens": 7904552.0, | |
| "reward": 0.17192643135786057, | |
| "reward_std": 0.12011632975190878, | |
| "rewards/code_reward/mean": 0.17192643135786057, | |
| "rewards/code_reward/std": 0.12011633953079581, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 251 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 177.0, | |
| "completions/max_terminated_length": 177.0, | |
| "completions/mean_length": 111.34375, | |
| "completions/mean_terminated_length": 111.34375, | |
| "completions/min_length": 78.25, | |
| "completions/min_terminated_length": 78.25, | |
| "epoch": 0.028207639569049953, | |
| "grad_norm": 2.5730420152805027, | |
| "kl": 0.35986328125, | |
| "learning_rate": 2.8447056058467928e-06, | |
| "loss": -0.0585, | |
| "num_tokens": 7929307.0, | |
| "reward": 0.17651335208211094, | |
| "reward_std": 0.22341035841964185, | |
| "rewards/code_reward/mean": 0.17651335208211094, | |
| "rewards/code_reward/std": 0.22341035841964185, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 252 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 225.5, | |
| "completions/max_terminated_length": 225.5, | |
| "completions/mean_length": 145.40625, | |
| "completions/mean_terminated_length": 145.40625, | |
| "completions/min_length": 86.25, | |
| "completions/min_terminated_length": 86.25, | |
| "epoch": 0.028319574646704912, | |
| "grad_norm": 1.8094443062077088, | |
| "kl": 0.36767578125, | |
| "learning_rate": 2.830142238278531e-06, | |
| "loss": 0.0709, | |
| "num_tokens": 7952504.0, | |
| "reward": 0.2017338698497042, | |
| "reward_std": 0.20673675020225346, | |
| "rewards/code_reward/mean": 0.2017338698497042, | |
| "rewards/code_reward/std": 0.20673674996942282, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 253 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 203.25, | |
| "completions/max_terminated_length": 203.25, | |
| "completions/mean_length": 119.84375, | |
| "completions/mean_terminated_length": 119.84375, | |
| "completions/min_length": 75.75, | |
| "completions/min_terminated_length": 75.75, | |
| "epoch": 0.02843150972435987, | |
| "grad_norm": 2.3402410563756315, | |
| "kl": 0.354736328125, | |
| "learning_rate": 2.81557550810246e-06, | |
| "loss": -0.0806, | |
| "num_tokens": 7976539.0, | |
| "reward": 0.3907702271826565, | |
| "reward_std": 0.26296099089086056, | |
| "rewards/code_reward/mean": 0.3907702271826565, | |
| "rewards/code_reward/std": 0.26296099927276373, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 254 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 213.0, | |
| "completions/max_terminated_length": 213.0, | |
| "completions/mean_length": 128.8125, | |
| "completions/mean_terminated_length": 128.8125, | |
| "completions/min_length": 72.5, | |
| "completions/min_terminated_length": 72.5, | |
| "epoch": 0.02854344480201483, | |
| "grad_norm": 1.6470365991478257, | |
| "kl": 0.3701171875, | |
| "learning_rate": 2.8010060265094026e-06, | |
| "loss": 0.0623, | |
| "num_tokens": 7998165.0, | |
| "reward": 0.14626706298440695, | |
| "reward_std": 0.13755429768934846, | |
| "rewards/code_reward/mean": 0.14626706298440695, | |
| "rewards/code_reward/std": 0.13755429675802588, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 255 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 464.0, | |
| "completions/max_terminated_length": 464.0, | |
| "completions/mean_length": 214.6875, | |
| "completions/mean_terminated_length": 214.6875, | |
| "completions/min_length": 67.5, | |
| "completions/min_terminated_length": 67.5, | |
| "epoch": 0.02865537987966979, | |
| "grad_norm": 1.5339855216333798, | |
| "kl": 0.224365234375, | |
| "learning_rate": 2.786434404805629e-06, | |
| "loss": 0.0387, | |
| "num_tokens": 8031691.0, | |
| "reward": 0.09155143890529871, | |
| "reward_std": 0.09001913899555802, | |
| "rewards/code_reward/mean": 0.09155143890529871, | |
| "rewards/code_reward/std": 0.09001914283726364, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 256 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 225.0, | |
| "completions/max_terminated_length": 225.0, | |
| "completions/mean_length": 145.6875, | |
| "completions/mean_terminated_length": 145.6875, | |
| "completions/min_length": 88.75, | |
| "completions/min_terminated_length": 88.75, | |
| "epoch": 0.028767314957324753, | |
| "grad_norm": 1.8276516553873643, | |
| "kl": 0.37451171875, | |
| "learning_rate": 2.771861254387199e-06, | |
| "loss": -0.0261, | |
| "num_tokens": 8052065.0, | |
| "reward": 0.21322817541658878, | |
| "reward_std": 0.16002243757247925, | |
| "rewards/code_reward/mean": 0.21322817541658878, | |
| "rewards/code_reward/std": 0.16002243757247925, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 257 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 243.75, | |
| "completions/max_terminated_length": 243.75, | |
| "completions/mean_length": 164.90625, | |
| "completions/mean_terminated_length": 164.90625, | |
| "completions/min_length": 109.25, | |
| "completions/min_terminated_length": 109.25, | |
| "epoch": 0.028879250034979712, | |
| "grad_norm": 1.559190798859591, | |
| "kl": 0.28076171875, | |
| "learning_rate": 2.7572871867143204e-06, | |
| "loss": 0.0077, | |
| "num_tokens": 8082230.0, | |
| "reward": 0.19782285764813423, | |
| "reward_std": 0.23767431639134884, | |
| "rewards/code_reward/mean": 0.19782285764813423, | |
| "rewards/code_reward/std": 0.23767432384192944, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 258 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 329.5, | |
| "completions/max_terminated_length": 329.5, | |
| "completions/mean_length": 156.125, | |
| "completions/mean_terminated_length": 156.125, | |
| "completions/min_length": 83.5, | |
| "completions/min_terminated_length": 83.5, | |
| "epoch": 0.02899118511263467, | |
| "grad_norm": 0.9033316433119087, | |
| "kl": 0.302978515625, | |
| "learning_rate": 2.742712813285681e-06, | |
| "loss": 0.0697, | |
| "num_tokens": 8106786.0, | |
| "reward": 0.0914294570684433, | |
| "reward_std": 0.09275190159678459, | |
| "rewards/code_reward/mean": 0.0914294570684433, | |
| "rewards/code_reward/std": 0.09275190159678459, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 259 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 201.0, | |
| "completions/max_terminated_length": 201.0, | |
| "completions/mean_length": 123.71875, | |
| "completions/mean_terminated_length": 123.71875, | |
| "completions/min_length": 71.0, | |
| "completions/min_terminated_length": 71.0, | |
| "epoch": 0.02910312019028963, | |
| "grad_norm": 1.9152198651462768, | |
| "kl": 0.333984375, | |
| "learning_rate": 2.7281387456128017e-06, | |
| "loss": 0.014, | |
| "num_tokens": 8126217.0, | |
| "reward": 0.21696891635656357, | |
| "reward_std": 0.28822916746139526, | |
| "rewards/code_reward/mean": 0.21696891635656357, | |
| "rewards/code_reward/std": 0.28822918236255646, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 260 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 369.75, | |
| "completions/max_terminated_length": 369.75, | |
| "completions/mean_length": 167.9375, | |
| "completions/mean_terminated_length": 167.9375, | |
| "completions/min_length": 73.25, | |
| "completions/min_terminated_length": 73.25, | |
| "epoch": 0.029215055267944594, | |
| "grad_norm": 0.9135668163744887, | |
| "kl": 0.24755859375, | |
| "learning_rate": 2.7135655951943716e-06, | |
| "loss": -0.0166, | |
| "num_tokens": 8150855.0, | |
| "reward": 0.03386699501425028, | |
| "reward_std": 0.06380424555391073, | |
| "rewards/code_reward/mean": 0.03386699501425028, | |
| "rewards/code_reward/std": 0.06380424555391073, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 261 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 271.75, | |
| "completions/max_terminated_length": 271.75, | |
| "completions/mean_length": 116.125, | |
| "completions/mean_terminated_length": 116.125, | |
| "completions/min_length": 61.75, | |
| "completions/min_terminated_length": 61.75, | |
| "epoch": 0.029326990345599553, | |
| "grad_norm": 1.6076094252791506, | |
| "kl": 0.31201171875, | |
| "learning_rate": 2.698993973490598e-06, | |
| "loss": 0.0945, | |
| "num_tokens": 8165387.0, | |
| "reward": 0.20722341747023165, | |
| "reward_std": 0.11584698176011443, | |
| "rewards/code_reward/mean": 0.20722341747023165, | |
| "rewards/code_reward/std": 0.11584698967635632, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 262 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 260.5, | |
| "completions/max_terminated_length": 260.5, | |
| "completions/mean_length": 160.3125, | |
| "completions/mean_terminated_length": 160.3125, | |
| "completions/min_length": 107.5, | |
| "completions/min_terminated_length": 107.5, | |
| "epoch": 0.029438925423254513, | |
| "grad_norm": 1.2712990086273, | |
| "kl": 0.266357421875, | |
| "learning_rate": 2.6844244918975416e-06, | |
| "loss": 0.0136, | |
| "num_tokens": 8185445.0, | |
| "reward": 0.12302206363528967, | |
| "reward_std": 0.1178859043866396, | |
| "rewards/code_reward/mean": 0.12302206363528967, | |
| "rewards/code_reward/std": 0.11788590624928474, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 263 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 276.75, | |
| "completions/max_terminated_length": 276.75, | |
| "completions/mean_length": 177.96875, | |
| "completions/mean_terminated_length": 177.96875, | |
| "completions/min_length": 109.25, | |
| "completions/min_terminated_length": 109.25, | |
| "epoch": 0.029550860500909472, | |
| "grad_norm": 1.5932680686482104, | |
| "kl": 0.27490234375, | |
| "learning_rate": 2.66985776172147e-06, | |
| "loss": -0.0647, | |
| "num_tokens": 8214836.0, | |
| "reward": 0.3632364124059677, | |
| "reward_std": 0.24340662360191345, | |
| "rewards/code_reward/mean": 0.3632364124059677, | |
| "rewards/code_reward/std": 0.24340663105249405, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 264 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 200.0, | |
| "completions/max_terminated_length": 200.0, | |
| "completions/mean_length": 131.03125, | |
| "completions/mean_terminated_length": 131.03125, | |
| "completions/min_length": 77.0, | |
| "completions/min_terminated_length": 77.0, | |
| "epoch": 0.02966279557856443, | |
| "grad_norm": 1.3583412717133385, | |
| "kl": 0.3232421875, | |
| "learning_rate": 2.6552943941532088e-06, | |
| "loss": 0.0688, | |
| "num_tokens": 8233101.0, | |
| "reward": 0.19121321476995945, | |
| "reward_std": 0.1444133589975536, | |
| "rewards/code_reward/mean": 0.19121321476995945, | |
| "rewards/code_reward/std": 0.14441336272284389, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 265 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 195.25, | |
| "completions/max_terminated_length": 195.25, | |
| "completions/mean_length": 131.9375, | |
| "completions/mean_terminated_length": 131.9375, | |
| "completions/min_length": 98.75, | |
| "completions/min_terminated_length": 98.75, | |
| "epoch": 0.029774730656219394, | |
| "grad_norm": 1.2376740863800209, | |
| "kl": 0.347900390625, | |
| "learning_rate": 2.6407350002424927e-06, | |
| "loss": -0.0064, | |
| "num_tokens": 8253363.0, | |
| "reward": 0.24439102411270142, | |
| "reward_std": 0.17834187299013138, | |
| "rewards/code_reward/mean": 0.24439102411270142, | |
| "rewards/code_reward/std": 0.17834188044071198, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 266 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 177.5, | |
| "completions/max_terminated_length": 177.5, | |
| "completions/mean_length": 106.03125, | |
| "completions/mean_terminated_length": 106.03125, | |
| "completions/min_length": 66.5, | |
| "completions/min_terminated_length": 66.5, | |
| "epoch": 0.029886665733874353, | |
| "grad_norm": 1.4198187332614487, | |
| "kl": 0.30126953125, | |
| "learning_rate": 2.626180190872329e-06, | |
| "loss": -0.047, | |
| "num_tokens": 8267084.0, | |
| "reward": 0.04570374824106693, | |
| "reward_std": 0.034461796283721924, | |
| "rewards/code_reward/mean": 0.04570374824106693, | |
| "rewards/code_reward/std": 0.03446180047467351, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 267 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 271.0, | |
| "completions/max_terminated_length": 271.0, | |
| "completions/mean_length": 119.0, | |
| "completions/mean_terminated_length": 119.0, | |
| "completions/min_length": 69.0, | |
| "completions/min_terminated_length": 69.0, | |
| "epoch": 0.029998600811529313, | |
| "grad_norm": 2.766302572356958, | |
| "kl": 0.254150390625, | |
| "learning_rate": 2.611630576733372e-06, | |
| "loss": 0.0719, | |
| "num_tokens": 8285988.0, | |
| "reward": 0.20828989439178258, | |
| "reward_std": 0.1855767808156088, | |
| "rewards/code_reward/mean": 0.20828989439178258, | |
| "rewards/code_reward/std": 0.18557679950026795, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 268 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 470.0, | |
| "completions/max_terminated_length": 470.0, | |
| "completions/mean_length": 205.78125, | |
| "completions/mean_terminated_length": 205.78125, | |
| "completions/min_length": 125.5, | |
| "completions/min_terminated_length": 125.5, | |
| "epoch": 0.030110535889184272, | |
| "grad_norm": 0.559152868059546, | |
| "kl": 0.216064453125, | |
| "learning_rate": 2.5970867682982885e-06, | |
| "loss": 0.0113, | |
| "num_tokens": 8315381.0, | |
| "reward": 0.01448170654475689, | |
| "reward_std": 0.03370444104075432, | |
| "rewards/code_reward/mean": 0.01448170654475689, | |
| "rewards/code_reward/std": 0.03370444104075432, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 269 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 264.0, | |
| "completions/max_terminated_length": 264.0, | |
| "completions/mean_length": 146.375, | |
| "completions/mean_terminated_length": 146.375, | |
| "completions/min_length": 88.75, | |
| "completions/min_terminated_length": 88.75, | |
| "epoch": 0.030222470966839235, | |
| "grad_norm": 1.7716322097224022, | |
| "kl": 0.329833984375, | |
| "learning_rate": 2.582549375796154e-06, | |
| "loss": 0.1976, | |
| "num_tokens": 8336289.0, | |
| "reward": 0.1647916678339243, | |
| "reward_std": 0.1911229882389307, | |
| "rewards/code_reward/mean": 0.1647916678339243, | |
| "rewards/code_reward/std": 0.1911229882389307, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 270 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 275.25, | |
| "completions/max_terminated_length": 275.25, | |
| "completions/mean_length": 184.71875, | |
| "completions/mean_terminated_length": 184.71875, | |
| "completions/min_length": 112.25, | |
| "completions/min_terminated_length": 112.25, | |
| "epoch": 0.030334406044494194, | |
| "grad_norm": 0.9190427217910049, | |
| "kl": 0.28369140625, | |
| "learning_rate": 2.568019009186841e-06, | |
| "loss": -0.014, | |
| "num_tokens": 8358944.0, | |
| "reward": 0.20673798964708112, | |
| "reward_std": 0.11309454750153236, | |
| "rewards/code_reward/mean": 0.20673798964708112, | |
| "rewards/code_reward/std": 0.11309454750153236, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 271 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 260.5, | |
| "completions/max_terminated_length": 260.5, | |
| "completions/mean_length": 145.5625, | |
| "completions/mean_terminated_length": 145.5625, | |
| "completions/min_length": 82.0, | |
| "completions/min_terminated_length": 82.0, | |
| "epoch": 0.030446341122149154, | |
| "grad_norm": 1.575353161865142, | |
| "kl": 0.3779296875, | |
| "learning_rate": 2.5534962781354317e-06, | |
| "loss": 0.1436, | |
| "num_tokens": 8380378.0, | |
| "reward": 0.240084670484066, | |
| "reward_std": 0.27030207961797714, | |
| "rewards/code_reward/mean": 0.240084670484066, | |
| "rewards/code_reward/std": 0.27030208706855774, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 272 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 411.25, | |
| "completions/max_terminated_length": 411.25, | |
| "completions/mean_length": 185.34375, | |
| "completions/mean_terminated_length": 185.34375, | |
| "completions/min_length": 99.25, | |
| "completions/min_terminated_length": 99.25, | |
| "epoch": 0.030558276199804113, | |
| "grad_norm": 1.529799496784454, | |
| "kl": 0.27734375, | |
| "learning_rate": 2.538981791986634e-06, | |
| "loss": -0.072, | |
| "num_tokens": 8410077.0, | |
| "reward": 0.36352282762527466, | |
| "reward_std": 0.24801481142640114, | |
| "rewards/code_reward/mean": 0.36352282762527466, | |
| "rewards/code_reward/std": 0.24801481887698174, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 273 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 322.5, | |
| "completions/max_terminated_length": 322.5, | |
| "completions/mean_length": 156.71875, | |
| "completions/mean_terminated_length": 156.71875, | |
| "completions/min_length": 78.0, | |
| "completions/min_terminated_length": 78.0, | |
| "epoch": 0.030670211277459072, | |
| "grad_norm": 1.670315398590079, | |
| "kl": 0.28173828125, | |
| "learning_rate": 2.524476159739218e-06, | |
| "loss": -0.0316, | |
| "num_tokens": 8433564.0, | |
| "reward": 0.08395027136430144, | |
| "reward_std": 0.10782372578978539, | |
| "rewards/code_reward/mean": 0.08395027136430144, | |
| "rewards/code_reward/std": 0.10782372625544667, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 274 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 242.0, | |
| "completions/max_terminated_length": 242.0, | |
| "completions/mean_length": 130.09375, | |
| "completions/mean_terminated_length": 130.09375, | |
| "completions/min_length": 76.75, | |
| "completions/min_terminated_length": 76.75, | |
| "epoch": 0.030782146355114035, | |
| "grad_norm": 1.8106311365726324, | |
| "kl": 0.3251953125, | |
| "learning_rate": 2.5099799900204607e-06, | |
| "loss": 0.0782, | |
| "num_tokens": 8452687.0, | |
| "reward": 0.32567203789949417, | |
| "reward_std": 0.27224994264543056, | |
| "rewards/code_reward/mean": 0.32567203789949417, | |
| "rewards/code_reward/std": 0.27224994637072086, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 275 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 657.75, | |
| "completions/max_terminated_length": 223.0, | |
| "completions/mean_length": 171.0, | |
| "completions/mean_terminated_length": 111.20089340209961, | |
| "completions/min_length": 57.25, | |
| "completions/min_terminated_length": 57.25, | |
| "epoch": 0.030894081432768995, | |
| "grad_norm": 1.8321308568390302, | |
| "kl": 0.321533203125, | |
| "learning_rate": 2.4954938910606108e-06, | |
| "loss": 0.1624, | |
| "num_tokens": 8475671.0, | |
| "reward": 0.14915229193866253, | |
| "reward_std": 0.11303082318045199, | |
| "rewards/code_reward/mean": 0.14915229193866253, | |
| "rewards/code_reward/std": 0.1130308248102665, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 276 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 378.25, | |
| "completions/max_terminated_length": 378.25, | |
| "completions/mean_length": 218.875, | |
| "completions/mean_terminated_length": 218.875, | |
| "completions/min_length": 114.5, | |
| "completions/min_terminated_length": 114.5, | |
| "epoch": 0.031006016510423954, | |
| "grad_norm": 1.5281779015369072, | |
| "kl": 0.2308349609375, | |
| "learning_rate": 2.481018470667368e-06, | |
| "loss": 0.1693, | |
| "num_tokens": 8502299.0, | |
| "reward": 0.18173168785870075, | |
| "reward_std": 0.10828323196619749, | |
| "rewards/code_reward/mean": 0.18173168785870075, | |
| "rewards/code_reward/std": 0.10828323615714908, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 277 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 260.5, | |
| "completions/max_terminated_length": 260.5, | |
| "completions/mean_length": 150.34375, | |
| "completions/mean_terminated_length": 150.34375, | |
| "completions/min_length": 71.75, | |
| "completions/min_terminated_length": 71.75, | |
| "epoch": 0.031117951588078913, | |
| "grad_norm": 1.6397888672992769, | |
| "kl": 0.310546875, | |
| "learning_rate": 2.4665543362003802e-06, | |
| "loss": 0.0215, | |
| "num_tokens": 8528406.0, | |
| "reward": 0.12074580090120435, | |
| "reward_std": 0.17130711488425732, | |
| "rewards/code_reward/mean": 0.12074580090120435, | |
| "rewards/code_reward/std": 0.17130712047219276, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 278 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 241.5, | |
| "completions/max_terminated_length": 241.5, | |
| "completions/mean_length": 120.4375, | |
| "completions/mean_terminated_length": 120.4375, | |
| "completions/min_length": 69.75, | |
| "completions/min_terminated_length": 69.75, | |
| "epoch": 0.031229886665733873, | |
| "grad_norm": 1.8254531380803452, | |
| "kl": 0.32275390625, | |
| "learning_rate": 2.4521020945457615e-06, | |
| "loss": 0.0678, | |
| "num_tokens": 8549612.0, | |
| "reward": 0.6036184206604958, | |
| "reward_std": 0.3284572381526232, | |
| "rewards/code_reward/mean": 0.6036184206604958, | |
| "rewards/code_reward/std": 0.3284572381526232, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 279 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 345.0, | |
| "completions/max_terminated_length": 345.0, | |
| "completions/mean_length": 126.84375, | |
| "completions/mean_terminated_length": 126.84375, | |
| "completions/min_length": 56.25, | |
| "completions/min_terminated_length": 56.25, | |
| "epoch": 0.031341821743388835, | |
| "grad_norm": 2.4668184273965648, | |
| "kl": 0.4033203125, | |
| "learning_rate": 2.4376623520906255e-06, | |
| "loss": 0.1532, | |
| "num_tokens": 8569279.0, | |
| "reward": 0.18534822203218937, | |
| "reward_std": 0.146333621814847, | |
| "rewards/code_reward/mean": 0.18534822203218937, | |
| "rewards/code_reward/std": 0.1463336320593953, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 280 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 239.0, | |
| "completions/max_terminated_length": 239.0, | |
| "completions/mean_length": 150.90625, | |
| "completions/mean_terminated_length": 150.90625, | |
| "completions/min_length": 94.5, | |
| "completions/min_terminated_length": 94.5, | |
| "epoch": 0.031453756821043795, | |
| "grad_norm": 1.2334442728026884, | |
| "kl": 0.3759765625, | |
| "learning_rate": 2.4232357146976478e-06, | |
| "loss": 0.0028, | |
| "num_tokens": 8597444.0, | |
| "reward": 0.1862155646085739, | |
| "reward_std": 0.10262486711144447, | |
| "rewards/code_reward/mean": 0.1862155646085739, | |
| "rewards/code_reward/std": 0.10262487456202507, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 281 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 510.5, | |
| "completions/max_terminated_length": 510.5, | |
| "completions/mean_length": 210.78125, | |
| "completions/mean_terminated_length": 210.78125, | |
| "completions/min_length": 97.25, | |
| "completions/min_terminated_length": 97.25, | |
| "epoch": 0.031565691898698754, | |
| "grad_norm": 1.484546879319261, | |
| "kl": 0.309326171875, | |
| "learning_rate": 2.408822787679637e-06, | |
| "loss": -0.0366, | |
| "num_tokens": 8622829.0, | |
| "reward": 0.1472295392304659, | |
| "reward_std": 0.10856602992862463, | |
| "rewards/code_reward/mean": 0.1472295392304659, | |
| "rewards/code_reward/std": 0.10856604157015681, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 282 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 266.25, | |
| "completions/max_terminated_length": 266.25, | |
| "completions/mean_length": 177.84375, | |
| "completions/mean_terminated_length": 177.84375, | |
| "completions/min_length": 121.5, | |
| "completions/min_terminated_length": 121.5, | |
| "epoch": 0.031677626976353714, | |
| "grad_norm": 1.6429453484339698, | |
| "kl": 0.318359375, | |
| "learning_rate": 2.3944241757741475e-06, | |
| "loss": 0.0429, | |
| "num_tokens": 8643536.0, | |
| "reward": 0.1551339291036129, | |
| "reward_std": 0.21810386329889297, | |
| "rewards/code_reward/mean": 0.1551339291036129, | |
| "rewards/code_reward/std": 0.21810387633740902, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 283 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 218.5, | |
| "completions/max_terminated_length": 218.5, | |
| "completions/mean_length": 132.6875, | |
| "completions/mean_terminated_length": 132.6875, | |
| "completions/min_length": 90.75, | |
| "completions/min_terminated_length": 90.75, | |
| "epoch": 0.03178956205400867, | |
| "grad_norm": 1.9278006659321785, | |
| "kl": 0.294677734375, | |
| "learning_rate": 2.380040483118097e-06, | |
| "loss": -0.0661, | |
| "num_tokens": 8660110.0, | |
| "reward": 0.1566466533113271, | |
| "reward_std": 0.15316736698150635, | |
| "rewards/code_reward/mean": 0.1566466533113271, | |
| "rewards/code_reward/std": 0.1531673688441515, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 284 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 200.75, | |
| "completions/max_terminated_length": 200.75, | |
| "completions/mean_length": 111.15625, | |
| "completions/mean_terminated_length": 111.15625, | |
| "completions/min_length": 67.25, | |
| "completions/min_terminated_length": 67.25, | |
| "epoch": 0.03190149713166363, | |
| "grad_norm": 1.924152560934373, | |
| "kl": 0.46728515625, | |
| "learning_rate": 2.365672313222419e-06, | |
| "loss": 0.0708, | |
| "num_tokens": 8676963.0, | |
| "reward": 0.3067304156720638, | |
| "reward_std": 0.17022380698472261, | |
| "rewards/code_reward/mean": 0.3067304156720638, | |
| "rewards/code_reward/std": 0.1702238107100129, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 285 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 146.5, | |
| "completions/max_terminated_length": 146.5, | |
| "completions/mean_length": 88.375, | |
| "completions/mean_terminated_length": 88.375, | |
| "completions/min_length": 53.5, | |
| "completions/min_terminated_length": 53.5, | |
| "epoch": 0.0320134322093186, | |
| "grad_norm": 2.52648816592765, | |
| "kl": 0.4990234375, | |
| "learning_rate": 2.351320268946749e-06, | |
| "loss": -0.0968, | |
| "num_tokens": 8696055.0, | |
| "reward": 0.21885720640420914, | |
| "reward_std": 0.18590925447642803, | |
| "rewards/code_reward/mean": 0.21885720640420914, | |
| "rewards/code_reward/std": 0.18590926192700863, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 286 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 949.0, | |
| "completions/max_terminated_length": 493.0, | |
| "completions/mean_length": 296.96875, | |
| "completions/mean_terminated_length": 238.07143020629883, | |
| "completions/min_length": 110.0, | |
| "completions/min_terminated_length": 110.0, | |
| "epoch": 0.03212536728697356, | |
| "grad_norm": 1.4087649689505792, | |
| "kl": 0.279296875, | |
| "learning_rate": 2.336984952474119e-06, | |
| "loss": 0.1631, | |
| "num_tokens": 8732022.0, | |
| "reward": 0.12815122242318466, | |
| "reward_std": 0.13949624670203775, | |
| "rewards/code_reward/mean": 0.12815122242318466, | |
| "rewards/code_reward/std": 0.13949625426903367, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 287 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 313.5, | |
| "completions/max_terminated_length": 313.5, | |
| "completions/mean_length": 165.6875, | |
| "completions/mean_terminated_length": 165.6875, | |
| "completions/min_length": 89.75, | |
| "completions/min_terminated_length": 89.75, | |
| "epoch": 0.03223730236462852, | |
| "grad_norm": 1.5911200294302021, | |
| "kl": 0.309326171875, | |
| "learning_rate": 2.322666965285697e-06, | |
| "loss": -0.0499, | |
| "num_tokens": 8752596.0, | |
| "reward": 0.2135441319551319, | |
| "reward_std": 0.1789869824424386, | |
| "rewards/code_reward/mean": 0.2135441319551319, | |
| "rewards/code_reward/std": 0.17898700083605945, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 288 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 487.25, | |
| "completions/max_terminated_length": 487.25, | |
| "completions/mean_length": 240.5625, | |
| "completions/mean_terminated_length": 240.5625, | |
| "completions/min_length": 111.5, | |
| "completions/min_terminated_length": 111.5, | |
| "epoch": 0.03234923744228348, | |
| "grad_norm": 1.6229086799739825, | |
| "kl": 0.305908203125, | |
| "learning_rate": 2.3083669081355507e-06, | |
| "loss": 0.1546, | |
| "num_tokens": 8783550.0, | |
| "reward": 0.060625465121120214, | |
| "reward_std": 0.031893965788185596, | |
| "rewards/code_reward/mean": 0.060625465121120214, | |
| "rewards/code_reward/std": 0.03189396392554045, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 289 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 736.75, | |
| "completions/max_terminated_length": 277.75, | |
| "completions/mean_length": 226.15625, | |
| "completions/mean_terminated_length": 167.17411041259766, | |
| "completions/min_length": 81.25, | |
| "completions/min_terminated_length": 81.25, | |
| "epoch": 0.032461172519938436, | |
| "grad_norm": 0.9678300330589205, | |
| "kl": 0.138671875, | |
| "learning_rate": 2.2940853810254377e-06, | |
| "loss": 0.1927, | |
| "num_tokens": 8806243.0, | |
| "reward": 0.301976312417537, | |
| "reward_std": 0.05864762840792537, | |
| "rewards/code_reward/mean": 0.301976312417537, | |
| "rewards/code_reward/std": 0.058647628873586655, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 290 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 419.75, | |
| "completions/max_terminated_length": 419.75, | |
| "completions/mean_length": 161.28125, | |
| "completions/mean_terminated_length": 161.28125, | |
| "completions/min_length": 66.5, | |
| "completions/min_terminated_length": 66.5, | |
| "epoch": 0.032573107597593395, | |
| "grad_norm": 2.0533336308641092, | |
| "kl": 0.43505859375, | |
| "learning_rate": 2.2798229831796313e-06, | |
| "loss": 0.0806, | |
| "num_tokens": 8830396.0, | |
| "reward": 0.08084819512441754, | |
| "reward_std": 0.043647464364767075, | |
| "rewards/code_reward/mean": 0.08084819512441754, | |
| "rewards/code_reward/std": 0.043647464364767075, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 291 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 739.75, | |
| "completions/max_terminated_length": 282.0, | |
| "completions/mean_length": 217.4375, | |
| "completions/mean_terminated_length": 157.30357360839844, | |
| "completions/min_length": 89.0, | |
| "completions/min_terminated_length": 89.0, | |
| "epoch": 0.032685042675248355, | |
| "grad_norm": 1.637832465555356, | |
| "kl": 0.2890625, | |
| "learning_rate": 2.2655803130197816e-06, | |
| "loss": 0.2231, | |
| "num_tokens": 8853858.0, | |
| "reward": 0.2264392450451851, | |
| "reward_std": 0.21030585933476686, | |
| "rewards/code_reward/mean": 0.2264392450451851, | |
| "rewards/code_reward/std": 0.21030588168650866, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 292 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 349.5, | |
| "completions/max_terminated_length": 349.5, | |
| "completions/mean_length": 193.40625, | |
| "completions/mean_terminated_length": 193.40625, | |
| "completions/min_length": 119.25, | |
| "completions/min_terminated_length": 119.25, | |
| "epoch": 0.032796977752903314, | |
| "grad_norm": 1.574199000350919, | |
| "kl": 0.384765625, | |
| "learning_rate": 2.2513579681398034e-06, | |
| "loss": 0.0158, | |
| "num_tokens": 8878935.0, | |
| "reward": 0.2135722152888775, | |
| "reward_std": 0.17660537734627724, | |
| "rewards/code_reward/mean": 0.2135722152888775, | |
| "rewards/code_reward/std": 0.17660538339987397, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 293 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 311.25, | |
| "completions/max_terminated_length": 311.25, | |
| "completions/mean_length": 177.75, | |
| "completions/mean_terminated_length": 177.75, | |
| "completions/min_length": 91.25, | |
| "completions/min_terminated_length": 91.25, | |
| "epoch": 0.03290891283055827, | |
| "grad_norm": 1.6072325452218685, | |
| "kl": 0.363037109375, | |
| "learning_rate": 2.237156545280803e-06, | |
| "loss": 0.0884, | |
| "num_tokens": 8901727.0, | |
| "reward": 0.3473220057785511, | |
| "reward_std": 0.18608891125768423, | |
| "rewards/code_reward/mean": 0.3473220057785511, | |
| "rewards/code_reward/std": 0.18608891125768423, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 294 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 419.75, | |
| "completions/max_terminated_length": 419.75, | |
| "completions/mean_length": 209.28125, | |
| "completions/mean_terminated_length": 209.28125, | |
| "completions/min_length": 105.75, | |
| "completions/min_terminated_length": 105.75, | |
| "epoch": 0.03302084790821324, | |
| "grad_norm": 1.6249151148932088, | |
| "kl": 0.248779296875, | |
| "learning_rate": 2.2229766403060403e-06, | |
| "loss": -0.0182, | |
| "num_tokens": 8925072.0, | |
| "reward": 0.307357229758054, | |
| "reward_std": 0.13686690758913755, | |
| "rewards/code_reward/mean": 0.307357229758054, | |
| "rewards/code_reward/std": 0.13686690386384726, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 295 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 282.0, | |
| "completions/max_terminated_length": 282.0, | |
| "completions/mean_length": 178.40625, | |
| "completions/mean_terminated_length": 178.40625, | |
| "completions/min_length": 90.25, | |
| "completions/min_terminated_length": 90.25, | |
| "epoch": 0.0331327829858682, | |
| "grad_norm": 1.6672544303984767, | |
| "kl": 0.32666015625, | |
| "learning_rate": 2.2088188481759305e-06, | |
| "loss": 0.0041, | |
| "num_tokens": 8942397.0, | |
| "reward": 0.17835952731547877, | |
| "reward_std": 0.14297430915758014, | |
| "rewards/code_reward/mean": 0.17835952731547877, | |
| "rewards/code_reward/std": 0.1429743110202253, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 296 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 393.0, | |
| "completions/max_terminated_length": 393.0, | |
| "completions/mean_length": 168.09375, | |
| "completions/mean_terminated_length": 168.09375, | |
| "completions/min_length": 82.75, | |
| "completions/min_terminated_length": 82.75, | |
| "epoch": 0.03324471806352316, | |
| "grad_norm": 1.5915878451347125, | |
| "kl": 0.42919921875, | |
| "learning_rate": 2.194683762923073e-06, | |
| "loss": -0.0342, | |
| "num_tokens": 8967448.0, | |
| "reward": 0.23388671875, | |
| "reward_std": 0.09839868592098355, | |
| "rewards/code_reward/mean": 0.23388671875, | |
| "rewards/code_reward/std": 0.09839868592098355, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 297 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 1053.75, | |
| "completions/max_terminated_length": 626.25, | |
| "completions/mean_length": 331.40625, | |
| "completions/mean_terminated_length": 275.5669708251953, | |
| "completions/min_length": 122.5, | |
| "completions/min_terminated_length": 122.5, | |
| "epoch": 0.03335665314117812, | |
| "grad_norm": 1.1259318778679914, | |
| "kl": 0.24853515625, | |
| "learning_rate": 2.1805719776273387e-06, | |
| "loss": 0.1031, | |
| "num_tokens": 8996029.0, | |
| "reward": 0.21752450801432133, | |
| "reward_std": 0.22587602585554123, | |
| "rewards/code_reward/mean": 0.21752450801432133, | |
| "rewards/code_reward/std": 0.22587604075670242, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 298 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 810.25, | |
| "completions/max_terminated_length": 459.5, | |
| "completions/mean_length": 282.46875, | |
| "completions/mean_terminated_length": 228.1651840209961, | |
| "completions/min_length": 90.25, | |
| "completions/min_terminated_length": 90.25, | |
| "epoch": 0.03346858821883308, | |
| "grad_norm": 1.327314761495312, | |
| "kl": 0.2470703125, | |
| "learning_rate": 2.166484084390974e-06, | |
| "loss": -0.0158, | |
| "num_tokens": 9024660.0, | |
| "reward": 0.4248046875, | |
| "reward_std": 0.41644760966300964, | |
| "rewards/code_reward/mean": 0.4248046875, | |
| "rewards/code_reward/std": 0.41644763946533203, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 299 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 816.5, | |
| "completions/max_terminated_length": 452.75, | |
| "completions/mean_length": 257.34375, | |
| "completions/mean_terminated_length": 201.17857360839844, | |
| "completions/min_length": 88.5, | |
| "completions/min_terminated_length": 88.5, | |
| "epoch": 0.033580523296488037, | |
| "grad_norm": 1.2867387634028893, | |
| "kl": 0.251953125, | |
| "learning_rate": 2.1524206743137636e-06, | |
| "loss": -0.2782, | |
| "num_tokens": 9049823.0, | |
| "reward": 0.2559996712952852, | |
| "reward_std": 0.17017995577771217, | |
| "rewards/code_reward/mean": 0.2559996712952852, | |
| "rewards/code_reward/std": 0.1701799522852525, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 300 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 544.5, | |
| "completions/max_terminated_length": 544.5, | |
| "completions/mean_length": 296.03125, | |
| "completions/mean_terminated_length": 296.03125, | |
| "completions/min_length": 139.25, | |
| "completions/min_terminated_length": 139.25, | |
| "epoch": 0.033692458374142996, | |
| "grad_norm": 1.3498052592722694, | |
| "kl": 0.25732421875, | |
| "learning_rate": 2.1383823374682287e-06, | |
| "loss": 0.0851, | |
| "num_tokens": 9079328.0, | |
| "reward": 0.38671875, | |
| "reward_std": 0.3085732739418745, | |
| "rewards/code_reward/mean": 0.38671875, | |
| "rewards/code_reward/std": 0.3085732851177454, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 301 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 362.0, | |
| "completions/max_terminated_length": 362.0, | |
| "completions/mean_length": 205.40625, | |
| "completions/mean_terminated_length": 205.40625, | |
| "completions/min_length": 79.75, | |
| "completions/min_terminated_length": 79.75, | |
| "epoch": 0.033804393451797955, | |
| "grad_norm": 1.1786971151666847, | |
| "kl": 0.253662109375, | |
| "learning_rate": 2.124369662874868e-06, | |
| "loss": 0.0537, | |
| "num_tokens": 9103917.0, | |
| "reward": 0.1022359449416399, | |
| "reward_std": 0.1313032009638846, | |
| "rewards/code_reward/mean": 0.1022359449416399, | |
| "rewards/code_reward/std": 0.13130320748314261, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 302 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 785.5, | |
| "completions/max_terminated_length": 785.5, | |
| "completions/mean_length": 290.25, | |
| "completions/mean_terminated_length": 290.25, | |
| "completions/min_length": 83.75, | |
| "completions/min_terminated_length": 83.75, | |
| "epoch": 0.033916328529452915, | |
| "grad_norm": 1.2753854412194898, | |
| "kl": 0.23779296875, | |
| "learning_rate": 2.110383238477441e-06, | |
| "loss": 0.1839, | |
| "num_tokens": 9131989.0, | |
| "reward": 0.3315134688746184, | |
| "reward_std": 0.21372198988683522, | |
| "rewards/code_reward/mean": 0.3315134688746184, | |
| "rewards/code_reward/std": 0.21372198243625462, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 303 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 834.75, | |
| "completions/max_terminated_length": 440.25, | |
| "completions/mean_length": 297.6875, | |
| "completions/mean_terminated_length": 241.50446701049805, | |
| "completions/min_length": 94.0, | |
| "completions/min_terminated_length": 94.0, | |
| "epoch": 0.03402826360710788, | |
| "grad_norm": 0.9686437804358383, | |
| "kl": 0.25732421875, | |
| "learning_rate": 2.096423651118305e-06, | |
| "loss": 0.0919, | |
| "num_tokens": 9155547.0, | |
| "reward": 0.18424479104578495, | |
| "reward_std": 0.2077017817646265, | |
| "rewards/code_reward/mean": 0.18424479104578495, | |
| "rewards/code_reward/std": 0.20770180504769087, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 304 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 807.0, | |
| "completions/max_terminated_length": 623.5, | |
| "completions/mean_length": 361.53125, | |
| "completions/mean_terminated_length": 315.49554443359375, | |
| "completions/min_length": 142.25, | |
| "completions/min_terminated_length": 142.25, | |
| "epoch": 0.03414019868476284, | |
| "grad_norm": 1.203308108515461, | |
| "kl": 0.24169921875, | |
| "learning_rate": 2.082491486513788e-06, | |
| "loss": 0.0801, | |
| "num_tokens": 9183796.0, | |
| "reward": 0.22409930732101202, | |
| "reward_std": 0.2232498861849308, | |
| "rewards/code_reward/mean": 0.22409930732101202, | |
| "rewards/code_reward/std": 0.22324990667402744, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 305 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 560.25, | |
| "completions/max_terminated_length": 560.25, | |
| "completions/mean_length": 283.53125, | |
| "completions/mean_terminated_length": 283.53125, | |
| "completions/min_length": 173.75, | |
| "completions/min_terminated_length": 173.75, | |
| "epoch": 0.0342521337624178, | |
| "grad_norm": 1.0443749007299015, | |
| "kl": 0.22998046875, | |
| "learning_rate": 2.0685873292296116e-06, | |
| "loss": -0.0535, | |
| "num_tokens": 9212077.0, | |
| "reward": 0.3671575216576457, | |
| "reward_std": 0.18197334744036198, | |
| "rewards/code_reward/mean": 0.3671575216576457, | |
| "rewards/code_reward/std": 0.18197335489094257, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 306 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 959.25, | |
| "completions/max_terminated_length": 559.0, | |
| "completions/mean_length": 339.875, | |
| "completions/mean_terminated_length": 283.05357360839844, | |
| "completions/min_length": 119.0, | |
| "completions/min_terminated_length": 119.0, | |
| "epoch": 0.03436406884007276, | |
| "grad_norm": 0.9280325752065158, | |
| "kl": 0.26708984375, | |
| "learning_rate": 2.054711762656369e-06, | |
| "loss": 0.0166, | |
| "num_tokens": 9245945.0, | |
| "reward": 0.20142045244574547, | |
| "reward_std": 0.1855016816407442, | |
| "rewards/code_reward/mean": 0.20142045244574547, | |
| "rewards/code_reward/std": 0.18550169840455055, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 307 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 917.75, | |
| "completions/max_terminated_length": 533.5, | |
| "completions/mean_length": 328.03125, | |
| "completions/mean_terminated_length": 271.48661041259766, | |
| "completions/min_length": 96.25, | |
| "completions/min_terminated_length": 96.25, | |
| "epoch": 0.03447600391772772, | |
| "grad_norm": 1.110612410740818, | |
| "kl": 0.26904296875, | |
| "learning_rate": 2.040865368985044e-06, | |
| "loss": 0.1496, | |
| "num_tokens": 9271114.0, | |
| "reward": 0.2448565848171711, | |
| "reward_std": 0.2629811018705368, | |
| "rewards/code_reward/mean": 0.2448565848171711, | |
| "rewards/code_reward/std": 0.26298110000789165, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 308 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 690.5, | |
| "completions/max_terminated_length": 690.5, | |
| "completions/mean_length": 286.53125, | |
| "completions/mean_terminated_length": 286.53125, | |
| "completions/min_length": 103.25, | |
| "completions/min_terminated_length": 103.25, | |
| "epoch": 0.03458793899538268, | |
| "grad_norm": 0.9114432475860804, | |
| "kl": 0.25927734375, | |
| "learning_rate": 2.027048729182583e-06, | |
| "loss": 0.0919, | |
| "num_tokens": 9294987.0, | |
| "reward": 0.30750996619462967, | |
| "reward_std": 0.21396427508443594, | |
| "rewards/code_reward/mean": 0.30750996619462967, | |
| "rewards/code_reward/std": 0.21396427601575851, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 309 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 448.75, | |
| "completions/max_terminated_length": 448.75, | |
| "completions/mean_length": 235.46875, | |
| "completions/mean_terminated_length": 235.46875, | |
| "completions/min_length": 110.0, | |
| "completions/min_terminated_length": 110.0, | |
| "epoch": 0.03469987407303764, | |
| "grad_norm": 1.1384514018728271, | |
| "kl": 0.2802734375, | |
| "learning_rate": 2.0132624229675205e-06, | |
| "loss": 0.0654, | |
| "num_tokens": 9320514.0, | |
| "reward": 0.31562499701976776, | |
| "reward_std": 0.12151388870552182, | |
| "rewards/code_reward/mean": 0.31562499701976776, | |
| "rewards/code_reward/std": 0.12151389149948955, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 310 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 1206.0, | |
| "completions/max_terminated_length": 789.0, | |
| "completions/mean_length": 407.65625, | |
| "completions/mean_terminated_length": 296.8125, | |
| "completions/min_length": 103.75, | |
| "completions/min_terminated_length": 103.75, | |
| "epoch": 0.034811809150692596, | |
| "grad_norm": 1.0596592897748647, | |
| "kl": 0.222412109375, | |
| "learning_rate": 1.9995070287856546e-06, | |
| "loss": 0.1233, | |
| "num_tokens": 9352039.0, | |
| "reward": 0.10777858644723892, | |
| "reward_std": 0.1648613102734089, | |
| "rewards/code_reward/mean": 0.10777858644723892, | |
| "rewards/code_reward/std": 0.1648613139986992, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 311 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 472.25, | |
| "completions/max_terminated_length": 472.25, | |
| "completions/mean_length": 266.09375, | |
| "completions/mean_terminated_length": 266.09375, | |
| "completions/min_length": 116.0, | |
| "completions/min_terminated_length": 116.0, | |
| "epoch": 0.034923744228347556, | |
| "grad_norm": 1.3278661225451498, | |
| "kl": 0.267333984375, | |
| "learning_rate": 1.985783123785774e-06, | |
| "loss": 0.1761, | |
| "num_tokens": 9375930.0, | |
| "reward": 0.6153363855555654, | |
| "reward_std": 0.09626698028296232, | |
| "rewards/code_reward/mean": 0.6153363855555654, | |
| "rewards/code_reward/std": 0.09626698028296232, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 312 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 448.25, | |
| "completions/max_terminated_length": 448.25, | |
| "completions/mean_length": 285.46875, | |
| "completions/mean_terminated_length": 285.46875, | |
| "completions/min_length": 127.5, | |
| "completions/min_terminated_length": 127.5, | |
| "epoch": 0.035035679306002515, | |
| "grad_norm": 1.2384888571370045, | |
| "kl": 0.280029296875, | |
| "learning_rate": 1.9720912837954486e-06, | |
| "loss": 0.0208, | |
| "num_tokens": 9399217.0, | |
| "reward": 0.26853298489004374, | |
| "reward_std": 0.2630241848528385, | |
| "rewards/code_reward/mean": 0.26853298489004374, | |
| "rewards/code_reward/std": 0.26302417647093534, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 313 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 755.0, | |
| "completions/max_terminated_length": 755.0, | |
| "completions/mean_length": 319.59375, | |
| "completions/mean_terminated_length": 319.59375, | |
| "completions/min_length": 97.0, | |
| "completions/min_terminated_length": 97.0, | |
| "epoch": 0.03514761438365748, | |
| "grad_norm": 1.5140046934528983, | |
| "kl": 0.2685546875, | |
| "learning_rate": 1.958432083296862e-06, | |
| "loss": 0.1386, | |
| "num_tokens": 9427972.0, | |
| "reward": 0.3911227434873581, | |
| "reward_std": 0.2716307928785682, | |
| "rewards/code_reward/mean": 0.3911227434873581, | |
| "rewards/code_reward/std": 0.2716307919472456, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 314 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 329.25, | |
| "completions/max_terminated_length": 329.25, | |
| "completions/mean_length": 162.96875, | |
| "completions/mean_terminated_length": 162.96875, | |
| "completions/min_length": 85.75, | |
| "completions/min_terminated_length": 85.75, | |
| "epoch": 0.03525954946131244, | |
| "grad_norm": 1.4191024334529987, | |
| "kl": 0.25927734375, | |
| "learning_rate": 1.9448060954027093e-06, | |
| "loss": 0.0713, | |
| "num_tokens": 9447267.0, | |
| "reward": 0.5250866562128067, | |
| "reward_std": 0.19822602486237884, | |
| "rewards/code_reward/mean": 0.5250866562128067, | |
| "rewards/code_reward/std": 0.19822603231295943, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 315 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 490.5, | |
| "completions/max_terminated_length": 490.5, | |
| "completions/mean_length": 262.53125, | |
| "completions/mean_terminated_length": 262.53125, | |
| "completions/min_length": 117.5, | |
| "completions/min_terminated_length": 117.5, | |
| "epoch": 0.0353714845389674, | |
| "grad_norm": 1.2448753208331158, | |
| "kl": 0.245361328125, | |
| "learning_rate": 1.931213891832153e-06, | |
| "loss": 0.251, | |
| "num_tokens": 9471212.0, | |
| "reward": 0.19074449688196182, | |
| "reward_std": 0.07534042606130242, | |
| "rewards/code_reward/mean": 0.19074449688196182, | |
| "rewards/code_reward/std": 0.07534042652696371, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 316 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 464.25, | |
| "completions/max_terminated_length": 464.25, | |
| "completions/mean_length": 219.875, | |
| "completions/mean_terminated_length": 219.875, | |
| "completions/min_length": 93.25, | |
| "completions/min_terminated_length": 93.25, | |
| "epoch": 0.03548341961662236, | |
| "grad_norm": 1.3212499853268918, | |
| "kl": 0.286865234375, | |
| "learning_rate": 1.9176560428868336e-06, | |
| "loss": -0.0392, | |
| "num_tokens": 9494912.0, | |
| "reward": 0.23464674223214388, | |
| "reward_std": 0.1325080880196765, | |
| "rewards/code_reward/mean": 0.23464674223214388, | |
| "rewards/code_reward/std": 0.13250808895099908, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 317 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 814.5, | |
| "completions/max_terminated_length": 676.5, | |
| "completions/mean_length": 313.59375, | |
| "completions/mean_terminated_length": 265.1071472167969, | |
| "completions/min_length": 79.5, | |
| "completions/min_terminated_length": 79.5, | |
| "epoch": 0.03559535469427732, | |
| "grad_norm": 1.7038540009433336, | |
| "kl": 0.28857421875, | |
| "learning_rate": 1.9041331174269373e-06, | |
| "loss": 0.4071, | |
| "num_tokens": 9524787.0, | |
| "reward": 0.3640685440041125, | |
| "reward_std": 0.17312923236750066, | |
| "rewards/code_reward/mean": 0.3640685440041125, | |
| "rewards/code_reward/std": 0.173129228875041, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 318 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 331.0, | |
| "completions/max_terminated_length": 331.0, | |
| "completions/mean_length": 232.84375, | |
| "completions/mean_terminated_length": 232.84375, | |
| "completions/min_length": 120.25, | |
| "completions/min_terminated_length": 120.25, | |
| "epoch": 0.03570728977193228, | |
| "grad_norm": 1.3967647406257568, | |
| "kl": 0.264892578125, | |
| "learning_rate": 1.8906456828473341e-06, | |
| "loss": 0.0554, | |
| "num_tokens": 9548390.0, | |
| "reward": 0.19295948650687933, | |
| "reward_std": 0.11571824550628662, | |
| "rewards/code_reward/mean": 0.19295948650687933, | |
| "rewards/code_reward/std": 0.1157182501628995, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 319 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 901.25, | |
| "completions/max_terminated_length": 495.0, | |
| "completions/mean_length": 317.78125, | |
| "completions/mean_terminated_length": 262.4151840209961, | |
| "completions/min_length": 105.25, | |
| "completions/min_terminated_length": 105.25, | |
| "epoch": 0.03581922484958724, | |
| "grad_norm": 1.2881984024223554, | |
| "kl": 0.2744140625, | |
| "learning_rate": 1.8771943050537656e-06, | |
| "loss": -0.0248, | |
| "num_tokens": 9578255.0, | |
| "reward": 0.0896820523776114, | |
| "reward_std": 0.08542403136380017, | |
| "rewards/code_reward/mean": 0.0896820523776114, | |
| "rewards/code_reward/std": 0.08542404044419527, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 320 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 1355.5, | |
| "completions/max_terminated_length": 495.5, | |
| "completions/mean_length": 346.8125, | |
| "completions/mean_terminated_length": 230.4151840209961, | |
| "completions/min_length": 79.5, | |
| "completions/min_terminated_length": 79.5, | |
| "epoch": 0.0359311599272422, | |
| "grad_norm": 1.2010225217377024, | |
| "kl": 0.206298828125, | |
| "learning_rate": 1.8637795484391046e-06, | |
| "loss": 0.562, | |
| "num_tokens": 9612289.0, | |
| "reward": 0.38001057505607605, | |
| "reward_std": 0.18961793556809425, | |
| "rewards/code_reward/mean": 0.38001057505607605, | |
| "rewards/code_reward/std": 0.18961793649941683, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 321 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 392.0, | |
| "completions/max_terminated_length": 392.0, | |
| "completions/mean_length": 219.4375, | |
| "completions/mean_terminated_length": 219.4375, | |
| "completions/min_length": 105.5, | |
| "completions/min_terminated_length": 105.5, | |
| "epoch": 0.036043095004897156, | |
| "grad_norm": 1.188208321486219, | |
| "kl": 0.26806640625, | |
| "learning_rate": 1.8504019758596698e-06, | |
| "loss": -0.046, | |
| "num_tokens": 9634663.0, | |
| "reward": 0.1651124432682991, | |
| "reward_std": 0.14856510423123837, | |
| "rewards/code_reward/mean": 0.1651124432682991, | |
| "rewards/code_reward/std": 0.14856510609388351, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 322 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 482.25, | |
| "completions/max_terminated_length": 482.25, | |
| "completions/mean_length": 215.125, | |
| "completions/mean_terminated_length": 215.125, | |
| "completions/min_length": 102.0, | |
| "completions/min_terminated_length": 102.0, | |
| "epoch": 0.03615503008255212, | |
| "grad_norm": 1.1155401373073985, | |
| "kl": 0.293701171875, | |
| "learning_rate": 1.8370621486116163e-06, | |
| "loss": 0.172, | |
| "num_tokens": 9655579.0, | |
| "reward": 0.074991176254116, | |
| "reward_std": 0.06526870373636484, | |
| "rewards/code_reward/mean": 0.074991176254116, | |
| "rewards/code_reward/std": 0.06526870559900999, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 323 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 332.75, | |
| "completions/max_terminated_length": 332.75, | |
| "completions/mean_length": 169.78125, | |
| "completions/mean_terminated_length": 169.78125, | |
| "completions/min_length": 57.5, | |
| "completions/min_terminated_length": 57.5, | |
| "epoch": 0.03626696516020708, | |
| "grad_norm": 1.4175182217669504, | |
| "kl": 0.36181640625, | |
| "learning_rate": 1.823760626407377e-06, | |
| "loss": 0.0677, | |
| "num_tokens": 9678716.0, | |
| "reward": 0.5682446430437267, | |
| "reward_std": 0.253071456681937, | |
| "rewards/code_reward/mean": 0.5682446430437267, | |
| "rewards/code_reward/std": 0.2530714562162757, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 324 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 295.0, | |
| "completions/max_terminated_length": 295.0, | |
| "completions/mean_length": 195.03125, | |
| "completions/mean_terminated_length": 195.03125, | |
| "completions/min_length": 82.0, | |
| "completions/min_terminated_length": 82.0, | |
| "epoch": 0.03637890023786204, | |
| "grad_norm": 1.466909812259168, | |
| "kl": 0.302490234375, | |
| "learning_rate": 1.8104979673521838e-06, | |
| "loss": 0.0551, | |
| "num_tokens": 9697405.0, | |
| "reward": 0.3620302341878414, | |
| "reward_std": 0.24883326888084412, | |
| "rewards/code_reward/mean": 0.3620302341878414, | |
| "rewards/code_reward/std": 0.2488332763314247, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 325 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 393.75, | |
| "completions/max_terminated_length": 393.75, | |
| "completions/mean_length": 216.21875, | |
| "completions/mean_terminated_length": 216.21875, | |
| "completions/min_length": 80.5, | |
| "completions/min_terminated_length": 80.5, | |
| "epoch": 0.036490835315517, | |
| "grad_norm": 1.6340135294611648, | |
| "kl": 0.302978515625, | |
| "learning_rate": 1.7972747279206482e-06, | |
| "loss": 0.0425, | |
| "num_tokens": 9716260.0, | |
| "reward": 0.2104739099740982, | |
| "reward_std": 0.11729209683835506, | |
| "rewards/code_reward/mean": 0.2104739099740982, | |
| "rewards/code_reward/std": 0.11729210242629051, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 326 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 426.0, | |
| "completions/max_terminated_length": 426.0, | |
| "completions/mean_length": 241.09375, | |
| "completions/mean_terminated_length": 241.09375, | |
| "completions/min_length": 119.0, | |
| "completions/min_terminated_length": 119.0, | |
| "epoch": 0.03660277039317196, | |
| "grad_norm": 1.1688628621437198, | |
| "kl": 0.2314453125, | |
| "learning_rate": 1.7840914629334122e-06, | |
| "loss": -0.0012, | |
| "num_tokens": 9739031.0, | |
| "reward": 0.16952253691852093, | |
| "reward_std": 0.042415026342496276, | |
| "rewards/code_reward/mean": 0.16952253691852093, | |
| "rewards/code_reward/std": 0.04241502704098821, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 327 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 523.5, | |
| "completions/max_terminated_length": 523.5, | |
| "completions/mean_length": 207.65625, | |
| "completions/mean_terminated_length": 207.65625, | |
| "completions/min_length": 90.5, | |
| "completions/min_terminated_length": 90.5, | |
| "epoch": 0.03671470547082692, | |
| "grad_norm": 1.0941067328247338, | |
| "kl": 0.2578125, | |
| "learning_rate": 1.7709487255338731e-06, | |
| "loss": 0.0704, | |
| "num_tokens": 9761348.0, | |
| "reward": 0.21878245938569307, | |
| "reward_std": 0.10285742627456784, | |
| "rewards/code_reward/mean": 0.21878245938569307, | |
| "rewards/code_reward/std": 0.10285743046551943, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 328 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 244.5, | |
| "completions/max_terminated_length": 244.5, | |
| "completions/mean_length": 134.71875, | |
| "completions/mean_terminated_length": 134.71875, | |
| "completions/min_length": 77.75, | |
| "completions/min_terminated_length": 77.75, | |
| "epoch": 0.03682664054848188, | |
| "grad_norm": 1.9134234308602869, | |
| "kl": 0.34326171875, | |
| "learning_rate": 1.7578470671649684e-06, | |
| "loss": 0.0705, | |
| "num_tokens": 9781267.0, | |
| "reward": 0.17752246744930744, | |
| "reward_std": 0.12675740150734782, | |
| "rewards/code_reward/mean": 0.17752246744930744, | |
| "rewards/code_reward/std": 0.12675740336999297, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 329 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 795.0, | |
| "completions/max_terminated_length": 382.5, | |
| "completions/mean_length": 250.09375, | |
| "completions/mean_terminated_length": 192.71875381469727, | |
| "completions/min_length": 71.5, | |
| "completions/min_terminated_length": 71.5, | |
| "epoch": 0.03693857562613684, | |
| "grad_norm": 1.5415476088332252, | |
| "kl": 0.39501953125, | |
| "learning_rate": 1.744787037546045e-06, | |
| "loss": 0.2168, | |
| "num_tokens": 9808326.0, | |
| "reward": 0.21277573192492127, | |
| "reward_std": 0.23475970514118671, | |
| "rewards/code_reward/mean": 0.21277573192492127, | |
| "rewards/code_reward/std": 0.234759708866477, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 330 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 282.75, | |
| "completions/max_terminated_length": 282.75, | |
| "completions/mean_length": 171.125, | |
| "completions/mean_terminated_length": 171.125, | |
| "completions/min_length": 96.25, | |
| "completions/min_terminated_length": 96.25, | |
| "epoch": 0.0370505107037918, | |
| "grad_norm": 1.7106509716957057, | |
| "kl": 0.269775390625, | |
| "learning_rate": 1.731769184649788e-06, | |
| "loss": -0.0451, | |
| "num_tokens": 9829634.0, | |
| "reward": 0.09405737672932446, | |
| "reward_std": 0.1773677747696638, | |
| "rewards/code_reward/mean": 0.09405737672932446, | |
| "rewards/code_reward/std": 0.17736777663230896, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 331 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 274.75, | |
| "completions/max_terminated_length": 274.75, | |
| "completions/mean_length": 149.90625, | |
| "completions/mean_terminated_length": 149.90625, | |
| "completions/min_length": 60.5, | |
| "completions/min_terminated_length": 60.5, | |
| "epoch": 0.037162445781446764, | |
| "grad_norm": 2.6731683502607013, | |
| "kl": 0.664306640625, | |
| "learning_rate": 1.7187940546792325e-06, | |
| "loss": 0.0639, | |
| "num_tokens": 9848823.0, | |
| "reward": 0.0996803566813469, | |
| "reward_std": 0.07073929067701101, | |
| "rewards/code_reward/mean": 0.0996803566813469, | |
| "rewards/code_reward/std": 0.0707392911426723, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 332 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 291.75, | |
| "completions/max_terminated_length": 291.75, | |
| "completions/mean_length": 160.0, | |
| "completions/mean_terminated_length": 160.0, | |
| "completions/min_length": 67.5, | |
| "completions/min_terminated_length": 67.5, | |
| "epoch": 0.03727438085910172, | |
| "grad_norm": 1.6268477182520276, | |
| "kl": 0.28955078125, | |
| "learning_rate": 1.7058621920448465e-06, | |
| "loss": 0.0592, | |
| "num_tokens": 9869263.0, | |
| "reward": 0.15218693669885397, | |
| "reward_std": 0.21367748617194593, | |
| "rewards/code_reward/mean": 0.15218693669885397, | |
| "rewards/code_reward/std": 0.21367749362252653, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 333 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 377.25, | |
| "completions/max_terminated_length": 377.25, | |
| "completions/mean_length": 193.34375, | |
| "completions/mean_terminated_length": 193.34375, | |
| "completions/min_length": 74.25, | |
| "completions/min_terminated_length": 74.25, | |
| "epoch": 0.03738631593675668, | |
| "grad_norm": 1.582329034740645, | |
| "kl": 0.268798828125, | |
| "learning_rate": 1.6929741393416855e-06, | |
| "loss": -0.0098, | |
| "num_tokens": 9902154.0, | |
| "reward": 0.19454657658934593, | |
| "reward_std": 0.20047161541879177, | |
| "rewards/code_reward/mean": 0.19454657658934593, | |
| "rewards/code_reward/std": 0.20047162100672722, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 334 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 293.75, | |
| "completions/max_terminated_length": 293.75, | |
| "completions/mean_length": 170.0625, | |
| "completions/mean_terminated_length": 170.0625, | |
| "completions/min_length": 79.5, | |
| "completions/min_terminated_length": 79.5, | |
| "epoch": 0.03749825101441164, | |
| "grad_norm": 1.5771763046671186, | |
| "kl": 0.250244140625, | |
| "learning_rate": 1.6801304373266286e-06, | |
| "loss": 0.0037, | |
| "num_tokens": 9921964.0, | |
| "reward": 0.20569872483611107, | |
| "reward_std": 0.1606605793349445, | |
| "rewards/code_reward/mean": 0.20569872483611107, | |
| "rewards/code_reward/std": 0.16066057654097676, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 335 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 485.75, | |
| "completions/max_terminated_length": 485.75, | |
| "completions/mean_length": 203.8125, | |
| "completions/mean_terminated_length": 203.8125, | |
| "completions/min_length": 88.75, | |
| "completions/min_terminated_length": 88.75, | |
| "epoch": 0.0376101860920666, | |
| "grad_norm": 1.7807442268855076, | |
| "kl": 0.286865234375, | |
| "learning_rate": 1.667331624895689e-06, | |
| "loss": 0.0622, | |
| "num_tokens": 9952862.0, | |
| "reward": 0.1456711394712329, | |
| "reward_std": 0.22775039146654308, | |
| "rewards/code_reward/mean": 0.1456711394712329, | |
| "rewards/code_reward/std": 0.22775039146654308, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 336 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 295.5, | |
| "completions/max_terminated_length": 295.5, | |
| "completions/mean_length": 133.75, | |
| "completions/mean_terminated_length": 133.75, | |
| "completions/min_length": 63.75, | |
| "completions/min_terminated_length": 63.75, | |
| "epoch": 0.03772212116972156, | |
| "grad_norm": 1.640447948869111, | |
| "kl": 0.3310546875, | |
| "learning_rate": 1.6545782390614037e-06, | |
| "loss": 0.0577, | |
| "num_tokens": 9970606.0, | |
| "reward": 0.47745162434875965, | |
| "reward_std": 0.23838305938988924, | |
| "rewards/code_reward/mean": 0.47745162434875965, | |
| "rewards/code_reward/std": 0.23838307429105043, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 337 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 235.0, | |
| "completions/max_terminated_length": 235.0, | |
| "completions/mean_length": 127.84375, | |
| "completions/mean_terminated_length": 127.84375, | |
| "completions/min_length": 71.75, | |
| "completions/min_terminated_length": 71.75, | |
| "epoch": 0.03783405624737652, | |
| "grad_norm": 2.018865032840707, | |
| "kl": 0.361328125, | |
| "learning_rate": 1.6418708149302992e-06, | |
| "loss": 0.0511, | |
| "num_tokens": 9992753.0, | |
| "reward": 0.4375, | |
| "reward_std": 0.17353582940995693, | |
| "rewards/code_reward/mean": 0.4375, | |
| "rewards/code_reward/std": 0.17353583686053753, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 338 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 210.0, | |
| "completions/max_terminated_length": 210.0, | |
| "completions/mean_length": 118.28125, | |
| "completions/mean_terminated_length": 118.28125, | |
| "completions/min_length": 63.75, | |
| "completions/min_terminated_length": 63.75, | |
| "epoch": 0.03794599132503148, | |
| "grad_norm": 1.3053097026379892, | |
| "kl": 0.3037109375, | |
| "learning_rate": 1.6292098856804423e-06, | |
| "loss": 0.0182, | |
| "num_tokens": 10005258.0, | |
| "reward": 0.3365098312497139, | |
| "reward_std": 0.20095888897776604, | |
| "rewards/code_reward/mean": 0.3365098312497139, | |
| "rewards/code_reward/std": 0.20095889456570148, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 339 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 431.0, | |
| "completions/max_terminated_length": 431.0, | |
| "completions/mean_length": 233.75, | |
| "completions/mean_terminated_length": 233.75, | |
| "completions/min_length": 133.25, | |
| "completions/min_terminated_length": 133.25, | |
| "epoch": 0.03805792640268644, | |
| "grad_norm": 0.9354111744654126, | |
| "kl": 0.2366943359375, | |
| "learning_rate": 1.6165959825390661e-06, | |
| "loss": 0.0313, | |
| "num_tokens": 10030994.0, | |
| "reward": 0.05368073424324393, | |
| "reward_std": 0.018324243370443583, | |
| "rewards/code_reward/mean": 0.05368073424324393, | |
| "rewards/code_reward/std": 0.018324245465919375, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 340 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 428.25, | |
| "completions/max_terminated_length": 428.25, | |
| "completions/mean_length": 181.5625, | |
| "completions/mean_terminated_length": 181.5625, | |
| "completions/min_length": 89.75, | |
| "completions/min_terminated_length": 89.75, | |
| "epoch": 0.038169861480341405, | |
| "grad_norm": 1.6930152926719917, | |
| "kl": 0.37255859375, | |
| "learning_rate": 1.604029634760284e-06, | |
| "loss": 0.0426, | |
| "num_tokens": 10053388.0, | |
| "reward": 0.24092174973338842, | |
| "reward_std": 0.18441250827163458, | |
| "rewards/code_reward/mean": 0.24092174973338842, | |
| "rewards/code_reward/std": 0.18441250827163458, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 341 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 191.25, | |
| "completions/max_terminated_length": 191.25, | |
| "completions/mean_length": 103.25, | |
| "completions/mean_terminated_length": 103.25, | |
| "completions/min_length": 50.25, | |
| "completions/min_terminated_length": 50.25, | |
| "epoch": 0.038281796557996364, | |
| "grad_norm": 2.4426044186229556, | |
| "kl": 0.513427734375, | |
| "learning_rate": 1.59151136960288e-06, | |
| "loss": -0.1329, | |
| "num_tokens": 10074740.0, | |
| "reward": 0.43505216389894485, | |
| "reward_std": 0.09191552549600601, | |
| "rewards/code_reward/mean": 0.43505216389894485, | |
| "rewards/code_reward/std": 0.09191552549600601, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 342 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 375.5, | |
| "completions/max_terminated_length": 375.5, | |
| "completions/mean_length": 191.9375, | |
| "completions/mean_terminated_length": 191.9375, | |
| "completions/min_length": 79.75, | |
| "completions/min_terminated_length": 79.75, | |
| "epoch": 0.038393731635651324, | |
| "grad_norm": 1.4272881181440114, | |
| "kl": 0.2998046875, | |
| "learning_rate": 1.5790417123081903e-06, | |
| "loss": 0.0731, | |
| "num_tokens": 10095146.0, | |
| "reward": 0.40253712981939316, | |
| "reward_std": 0.4054878391325474, | |
| "rewards/code_reward/mean": 0.40253712981939316, | |
| "rewards/code_reward/std": 0.40548786148428917, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 343 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 410.0, | |
| "completions/max_terminated_length": 410.0, | |
| "completions/mean_length": 190.59375, | |
| "completions/mean_terminated_length": 190.59375, | |
| "completions/min_length": 76.0, | |
| "completions/min_terminated_length": 76.0, | |
| "epoch": 0.03850566671330628, | |
| "grad_norm": 1.5919937138741604, | |
| "kl": 0.22802734375, | |
| "learning_rate": 1.5666211860780583e-06, | |
| "loss": 0.1869, | |
| "num_tokens": 10115245.0, | |
| "reward": 0.11901041585952044, | |
| "reward_std": 0.06176098808646202, | |
| "rewards/code_reward/mean": 0.11901041585952044, | |
| "rewards/code_reward/std": 0.06176098808646202, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 344 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 709.5, | |
| "completions/max_terminated_length": 495.5, | |
| "completions/mean_length": 248.71875, | |
| "completions/mean_terminated_length": 197.40625762939453, | |
| "completions/min_length": 66.5, | |
| "completions/min_terminated_length": 66.5, | |
| "epoch": 0.03861760179096124, | |
| "grad_norm": 1.8660163924674529, | |
| "kl": 0.5517578125, | |
| "learning_rate": 1.5542503120528918e-06, | |
| "loss": 0.1448, | |
| "num_tokens": 10142828.0, | |
| "reward": 0.4072798676788807, | |
| "reward_std": 0.128664406016469, | |
| "rewards/code_reward/mean": 0.4072798676788807, | |
| "rewards/code_reward/std": 0.128664406016469, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 345 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 419.0, | |
| "completions/max_terminated_length": 419.0, | |
| "completions/mean_length": 211.875, | |
| "completions/mean_terminated_length": 211.875, | |
| "completions/min_length": 81.0, | |
| "completions/min_terminated_length": 81.0, | |
| "epoch": 0.0387295368686162, | |
| "grad_norm": 1.2237817773466955, | |
| "kl": 0.3408203125, | |
| "learning_rate": 1.5419296092897866e-06, | |
| "loss": 0.1399, | |
| "num_tokens": 10168664.0, | |
| "reward": 0.02313591120764613, | |
| "reward_std": 0.02397587802261114, | |
| "rewards/code_reward/mean": 0.02313591120764613, | |
| "rewards/code_reward/std": 0.023975879419595003, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 346 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 286.75, | |
| "completions/max_terminated_length": 286.75, | |
| "completions/mean_length": 148.8125, | |
| "completions/mean_terminated_length": 148.8125, | |
| "completions/min_length": 71.5, | |
| "completions/min_terminated_length": 71.5, | |
| "epoch": 0.03884147194627116, | |
| "grad_norm": 2.0973241700512655, | |
| "kl": 0.444580078125, | |
| "learning_rate": 1.529659594740755e-06, | |
| "loss": 0.0837, | |
| "num_tokens": 10185306.0, | |
| "reward": 0.3948034793138504, | |
| "reward_std": 0.1760760466568172, | |
| "rewards/code_reward/mean": 0.3948034793138504, | |
| "rewards/code_reward/std": 0.17607605503872037, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 347 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 301.0, | |
| "completions/max_terminated_length": 301.0, | |
| "completions/mean_length": 158.78125, | |
| "completions/mean_terminated_length": 158.78125, | |
| "completions/min_length": 66.75, | |
| "completions/min_terminated_length": 66.75, | |
| "epoch": 0.03895340702392612, | |
| "grad_norm": 1.4572287618545743, | |
| "kl": 0.26953125, | |
| "learning_rate": 1.5174407832310338e-06, | |
| "loss": 0.0326, | |
| "num_tokens": 10203115.0, | |
| "reward": 0.4470205195248127, | |
| "reward_std": 0.19961272552609444, | |
| "rewards/code_reward/mean": 0.4470205195248127, | |
| "rewards/code_reward/std": 0.19961273297667503, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 348 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 289.0, | |
| "completions/max_terminated_length": 289.0, | |
| "completions/mean_length": 160.59375, | |
| "completions/mean_terminated_length": 160.59375, | |
| "completions/min_length": 91.75, | |
| "completions/min_terminated_length": 91.75, | |
| "epoch": 0.03906534210158108, | |
| "grad_norm": 1.587914542250274, | |
| "kl": 0.302978515625, | |
| "learning_rate": 1.5052736874374815e-06, | |
| "loss": 0.0176, | |
| "num_tokens": 10226750.0, | |
| "reward": 0.1811899826861918, | |
| "reward_std": 0.15240496955811977, | |
| "rewards/code_reward/mean": 0.1811899826861918, | |
| "rewards/code_reward/std": 0.15240497328341007, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 349 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 255.5, | |
| "completions/max_terminated_length": 255.5, | |
| "completions/mean_length": 134.0625, | |
| "completions/mean_terminated_length": 134.0625, | |
| "completions/min_length": 62.25, | |
| "completions/min_terminated_length": 62.25, | |
| "epoch": 0.039177277179236046, | |
| "grad_norm": 2.127239539473459, | |
| "kl": 0.37744140625, | |
| "learning_rate": 1.4931588178670695e-06, | |
| "loss": 0.0222, | |
| "num_tokens": 10248072.0, | |
| "reward": 0.3137185089290142, | |
| "reward_std": 0.06511987652629614, | |
| "rewards/code_reward/mean": 0.3137185089290142, | |
| "rewards/code_reward/std": 0.06511988304555416, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 350 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 295.25, | |
| "completions/max_terminated_length": 295.25, | |
| "completions/mean_length": 149.15625, | |
| "completions/mean_terminated_length": 149.15625, | |
| "completions/min_length": 61.25, | |
| "completions/min_terminated_length": 61.25, | |
| "epoch": 0.039289212256891005, | |
| "grad_norm": 1.4510463271520762, | |
| "kl": 0.26611328125, | |
| "learning_rate": 1.4810966828354605e-06, | |
| "loss": 0.1718, | |
| "num_tokens": 10270941.0, | |
| "reward": 0.18543480592779815, | |
| "reward_std": 0.12552618235349655, | |
| "rewards/code_reward/mean": 0.18543480592779815, | |
| "rewards/code_reward/std": 0.12552619352936745, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 351 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 232.25, | |
| "completions/max_terminated_length": 232.25, | |
| "completions/mean_length": 142.46875, | |
| "completions/mean_terminated_length": 142.46875, | |
| "completions/min_length": 60.5, | |
| "completions/min_terminated_length": 60.5, | |
| "epoch": 0.039401147334545965, | |
| "grad_norm": 2.0984464768903726, | |
| "kl": 0.345703125, | |
| "learning_rate": 1.469087788445684e-06, | |
| "loss": -0.0558, | |
| "num_tokens": 10291156.0, | |
| "reward": 0.22514494694769382, | |
| "reward_std": 0.23393048718571663, | |
| "rewards/code_reward/mean": 0.22514494694769382, | |
| "rewards/code_reward/std": 0.23393050953745842, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 352 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 250.5, | |
| "completions/max_terminated_length": 250.5, | |
| "completions/mean_length": 148.25, | |
| "completions/mean_terminated_length": 148.25, | |
| "completions/min_length": 71.5, | |
| "completions/min_terminated_length": 71.5, | |
| "epoch": 0.039513082412200924, | |
| "grad_norm": 1.6718356673761792, | |
| "kl": 0.290771484375, | |
| "learning_rate": 1.4571326385668965e-06, | |
| "loss": -0.0597, | |
| "num_tokens": 10315236.0, | |
| "reward": 0.3922019712626934, | |
| "reward_std": 0.3044360801577568, | |
| "rewards/code_reward/mean": 0.3922019712626934, | |
| "rewards/code_reward/std": 0.3044360838830471, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 353 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 239.5, | |
| "completions/max_terminated_length": 239.5, | |
| "completions/mean_length": 131.21875, | |
| "completions/mean_terminated_length": 131.21875, | |
| "completions/min_length": 61.0, | |
| "completions/min_terminated_length": 61.0, | |
| "epoch": 0.039625017489855884, | |
| "grad_norm": 1.884602182380735, | |
| "kl": 0.362060546875, | |
| "learning_rate": 1.4452317348132434e-06, | |
| "loss": 0.1699, | |
| "num_tokens": 10342059.0, | |
| "reward": 0.2568647051230073, | |
| "reward_std": 0.057626438327133656, | |
| "rewards/code_reward/mean": 0.2568647051230073, | |
| "rewards/code_reward/std": 0.05762644065544009, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 354 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 389.0, | |
| "completions/max_terminated_length": 389.0, | |
| "completions/mean_length": 216.78125, | |
| "completions/mean_terminated_length": 216.78125, | |
| "completions/min_length": 93.25, | |
| "completions/min_terminated_length": 93.25, | |
| "epoch": 0.03973695256751084, | |
| "grad_norm": 1.5608540814114584, | |
| "kl": 0.2548828125, | |
| "learning_rate": 1.4333855765228104e-06, | |
| "loss": 0.0906, | |
| "num_tokens": 10365764.0, | |
| "reward": 0.1356297740712762, | |
| "reward_std": 0.07451130566187203, | |
| "rewards/code_reward/mean": 0.1356297740712762, | |
| "rewards/code_reward/std": 0.07451130612753332, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 355 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 319.25, | |
| "completions/max_terminated_length": 319.25, | |
| "completions/mean_length": 133.78125, | |
| "completions/mean_terminated_length": 133.78125, | |
| "completions/min_length": 56.0, | |
| "completions/min_terminated_length": 56.0, | |
| "epoch": 0.0398488876451658, | |
| "grad_norm": 1.8748645260654178, | |
| "kl": 0.329833984375, | |
| "learning_rate": 1.421594660736675e-06, | |
| "loss": -0.0276, | |
| "num_tokens": 10390429.0, | |
| "reward": 0.4849093444645405, | |
| "reward_std": 0.17599604558199644, | |
| "rewards/code_reward/mean": 0.4849093444645405, | |
| "rewards/code_reward/std": 0.17599604465067387, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 356 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 256.5, | |
| "completions/max_terminated_length": 256.5, | |
| "completions/mean_length": 138.90625, | |
| "completions/mean_terminated_length": 138.90625, | |
| "completions/min_length": 55.25, | |
| "completions/min_terminated_length": 55.25, | |
| "epoch": 0.03996082272282076, | |
| "grad_norm": 1.9096248965574965, | |
| "kl": 0.291015625, | |
| "learning_rate": 1.4098594821780476e-06, | |
| "loss": -0.0702, | |
| "num_tokens": 10411850.0, | |
| "reward": 0.1599155543372035, | |
| "reward_std": 0.14085367415100336, | |
| "rewards/code_reward/mean": 0.1599155543372035, | |
| "rewards/code_reward/std": 0.14085367461666465, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 357 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 518.0, | |
| "completions/max_terminated_length": 518.0, | |
| "completions/mean_length": 214.6875, | |
| "completions/mean_terminated_length": 214.6875, | |
| "completions/min_length": 64.25, | |
| "completions/min_terminated_length": 64.25, | |
| "epoch": 0.04007275780047572, | |
| "grad_norm": 0.9256594634997231, | |
| "kl": 0.1795654296875, | |
| "learning_rate": 1.3981805332315174e-06, | |
| "loss": 0.0024, | |
| "num_tokens": 10434984.0, | |
| "reward": 0.3102440594229847, | |
| "reward_std": 0.08654948882758617, | |
| "rewards/code_reward/mean": 0.3102440594229847, | |
| "rewards/code_reward/std": 0.08654948882758617, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 358 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 166.5, | |
| "completions/max_terminated_length": 166.5, | |
| "completions/mean_length": 91.75, | |
| "completions/mean_terminated_length": 91.75, | |
| "completions/min_length": 40.0, | |
| "completions/min_terminated_length": 40.0, | |
| "epoch": 0.04018469287813069, | |
| "grad_norm": 1.8840891544197063, | |
| "kl": 0.41650390625, | |
| "learning_rate": 1.3865583039223929e-06, | |
| "loss": -0.1494, | |
| "num_tokens": 10457064.0, | |
| "reward": 0.215488045476377, | |
| "reward_std": 0.08538101147860289, | |
| "rewards/code_reward/mean": 0.215488045476377, | |
| "rewards/code_reward/std": 0.08538101892918348, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 359 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 268.25, | |
| "completions/max_terminated_length": 268.25, | |
| "completions/mean_length": 169.90625, | |
| "completions/mean_terminated_length": 169.90625, | |
| "completions/min_length": 92.25, | |
| "completions/min_terminated_length": 92.25, | |
| "epoch": 0.04029662795578565, | |
| "grad_norm": 1.543399169236028, | |
| "kl": 0.379638671875, | |
| "learning_rate": 1.374993281896137e-06, | |
| "loss": -0.0768, | |
| "num_tokens": 10481869.0, | |
| "reward": 0.22161551751196384, | |
| "reward_std": 0.23532075341790915, | |
| "rewards/code_reward/mean": 0.22161551751196384, | |
| "rewards/code_reward/std": 0.2353207627311349, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 360 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 324.5, | |
| "completions/max_terminated_length": 324.5, | |
| "completions/mean_length": 172.65625, | |
| "completions/mean_terminated_length": 172.65625, | |
| "completions/min_length": 72.5, | |
| "completions/min_terminated_length": 72.5, | |
| "epoch": 0.040408563033440606, | |
| "grad_norm": 1.4416318985457022, | |
| "kl": 0.3037109375, | |
| "learning_rate": 1.3634859523979134e-06, | |
| "loss": -0.0104, | |
| "num_tokens": 10507626.0, | |
| "reward": 0.19733425695449114, | |
| "reward_std": 0.2422337755560875, | |
| "rewards/code_reward/mean": 0.19733425695449114, | |
| "rewards/code_reward/std": 0.2422337755560875, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 361 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 250.0, | |
| "completions/max_terminated_length": 250.0, | |
| "completions/mean_length": 132.1875, | |
| "completions/mean_terminated_length": 132.1875, | |
| "completions/min_length": 57.5, | |
| "completions/min_terminated_length": 57.5, | |
| "epoch": 0.040520498111095565, | |
| "grad_norm": 1.7918502710892381, | |
| "kl": 0.283935546875, | |
| "learning_rate": 1.3520367982522208e-06, | |
| "loss": 0.0692, | |
| "num_tokens": 10528088.0, | |
| "reward": 0.31125493720173836, | |
| "reward_std": 0.22952541639097035, | |
| "rewards/code_reward/mean": 0.31125493720173836, | |
| "rewards/code_reward/std": 0.22952541639097035, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 362 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 342.75, | |
| "completions/max_terminated_length": 342.75, | |
| "completions/mean_length": 163.375, | |
| "completions/mean_terminated_length": 163.375, | |
| "completions/min_length": 65.5, | |
| "completions/min_terminated_length": 65.5, | |
| "epoch": 0.040632433188750525, | |
| "grad_norm": 1.5949676136183217, | |
| "kl": 0.316162109375, | |
| "learning_rate": 1.3406462998426358e-06, | |
| "loss": -0.0073, | |
| "num_tokens": 10547284.0, | |
| "reward": 0.33256023190915585, | |
| "reward_std": 0.23680819105356932, | |
| "rewards/code_reward/mean": 0.33256023190915585, | |
| "rewards/code_reward/std": 0.2368081919848919, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 363 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 246.75, | |
| "completions/max_terminated_length": 246.75, | |
| "completions/mean_length": 127.90625, | |
| "completions/mean_terminated_length": 127.90625, | |
| "completions/min_length": 53.0, | |
| "completions/min_terminated_length": 53.0, | |
| "epoch": 0.040744368266405484, | |
| "grad_norm": 1.889587616740625, | |
| "kl": 0.40771484375, | |
| "learning_rate": 1.3293149350916595e-06, | |
| "loss": -0.0672, | |
| "num_tokens": 10564489.0, | |
| "reward": 0.3066699914634228, | |
| "reward_std": 0.09056703024543822, | |
| "rewards/code_reward/mean": 0.3066699914634228, | |
| "rewards/code_reward/std": 0.09056703303940594, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 364 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 926.75, | |
| "completions/max_terminated_length": 492.75, | |
| "completions/mean_length": 281.25, | |
| "completions/mean_terminated_length": 221.99107360839844, | |
| "completions/min_length": 88.0, | |
| "completions/min_terminated_length": 88.0, | |
| "epoch": 0.04085630334406044, | |
| "grad_norm": 1.1530140221693261, | |
| "kl": 0.166748046875, | |
| "learning_rate": 1.3180431794406623e-06, | |
| "loss": 0.5007, | |
| "num_tokens": 10590441.0, | |
| "reward": 0.25817783176898956, | |
| "reward_std": 0.1975763700902462, | |
| "rewards/code_reward/mean": 0.25817783176898956, | |
| "rewards/code_reward/std": 0.1975763738155365, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 365 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 429.0, | |
| "completions/max_terminated_length": 429.0, | |
| "completions/mean_length": 214.03125, | |
| "completions/mean_terminated_length": 214.03125, | |
| "completions/min_length": 90.0, | |
| "completions/min_terminated_length": 90.0, | |
| "epoch": 0.0409682384217154, | |
| "grad_norm": 1.5427579182120241, | |
| "kl": 0.2431640625, | |
| "learning_rate": 1.3068315058299358e-06, | |
| "loss": 0.0483, | |
| "num_tokens": 10611458.0, | |
| "reward": 0.2973039257340133, | |
| "reward_std": 0.24095657613361254, | |
| "rewards/code_reward/mean": 0.2973039257340133, | |
| "rewards/code_reward/std": 0.2409565910929814, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 366 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 360.0, | |
| "completions/max_terminated_length": 360.0, | |
| "completions/mean_length": 169.53125, | |
| "completions/mean_terminated_length": 169.53125, | |
| "completions/min_length": 81.0, | |
| "completions/min_terminated_length": 81.0, | |
| "epoch": 0.04108017349937036, | |
| "grad_norm": 1.5032805435473915, | |
| "kl": 0.297119140625, | |
| "learning_rate": 1.2956803846788503e-06, | |
| "loss": 0.0369, | |
| "num_tokens": 10640499.0, | |
| "reward": 0.21608419064432383, | |
| "reward_std": 0.08488713996484876, | |
| "rewards/code_reward/mean": 0.21608419064432383, | |
| "rewards/code_reward/std": 0.08488714415580034, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 367 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 334.0, | |
| "completions/max_terminated_length": 334.0, | |
| "completions/mean_length": 154.53125, | |
| "completions/mean_terminated_length": 154.53125, | |
| "completions/min_length": 63.5, | |
| "completions/min_terminated_length": 63.5, | |
| "epoch": 0.04119210857702533, | |
| "grad_norm": 1.2609885337673685, | |
| "kl": 0.3779296875, | |
| "learning_rate": 1.284590283866116e-06, | |
| "loss": -0.103, | |
| "num_tokens": 10665612.0, | |
| "reward": 0.6632775068283081, | |
| "reward_std": 0.22238866239786148, | |
| "rewards/code_reward/mean": 0.6632775068283081, | |
| "rewards/code_reward/std": 0.22238866239786148, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 368 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 287.25, | |
| "completions/max_terminated_length": 287.25, | |
| "completions/mean_length": 146.5, | |
| "completions/mean_terminated_length": 146.5, | |
| "completions/min_length": 69.0, | |
| "completions/min_terminated_length": 69.0, | |
| "epoch": 0.04130404365468029, | |
| "grad_norm": 1.6198013908337032, | |
| "kl": 0.283203125, | |
| "learning_rate": 1.2735616687101518e-06, | |
| "loss": 0.0275, | |
| "num_tokens": 10687588.0, | |
| "reward": 0.04903295123949647, | |
| "reward_std": 0.02168478211387992, | |
| "rewards/code_reward/mean": 0.04903295123949647, | |
| "rewards/code_reward/std": 0.02168478397652507, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 369 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 321.0, | |
| "completions/max_terminated_length": 321.0, | |
| "completions/mean_length": 169.125, | |
| "completions/mean_terminated_length": 169.125, | |
| "completions/min_length": 76.5, | |
| "completions/min_terminated_length": 76.5, | |
| "epoch": 0.04141597873233525, | |
| "grad_norm": 0.9528743354172768, | |
| "kl": 0.283203125, | |
| "learning_rate": 1.2625950019495614e-06, | |
| "loss": 0.0836, | |
| "num_tokens": 10710032.0, | |
| "reward": 0.17378074233420193, | |
| "reward_std": 0.162479427177459, | |
| "rewards/code_reward/mean": 0.17378074233420193, | |
| "rewards/code_reward/std": 0.16247944394126534, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 370 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 193.75, | |
| "completions/max_terminated_length": 193.75, | |
| "completions/mean_length": 113.15625, | |
| "completions/mean_terminated_length": 113.15625, | |
| "completions/min_length": 53.25, | |
| "completions/min_terminated_length": 53.25, | |
| "epoch": 0.041527913809990206, | |
| "grad_norm": 1.5988336537399075, | |
| "kl": 0.29150390625, | |
| "learning_rate": 1.251690743723718e-06, | |
| "loss": -0.0146, | |
| "num_tokens": 10732597.0, | |
| "reward": 0.6325892880558968, | |
| "reward_std": 0.16140316799283028, | |
| "rewards/code_reward/mean": 0.6325892880558968, | |
| "rewards/code_reward/std": 0.16140317544341087, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 371 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 251.0, | |
| "completions/max_terminated_length": 251.0, | |
| "completions/mean_length": 107.4375, | |
| "completions/mean_terminated_length": 107.4375, | |
| "completions/min_length": 54.5, | |
| "completions/min_terminated_length": 54.5, | |
| "epoch": 0.041639848887645166, | |
| "grad_norm": 1.5312147706022616, | |
| "kl": 0.34375, | |
| "learning_rate": 1.2408493515534581e-06, | |
| "loss": 0.0191, | |
| "num_tokens": 10749987.0, | |
| "reward": 0.47150277020409703, | |
| "reward_std": 0.04196681221947074, | |
| "rewards/code_reward/mean": 0.47150277020409703, | |
| "rewards/code_reward/std": 0.04196681268513203, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 372 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 254.5, | |
| "completions/max_terminated_length": 254.5, | |
| "completions/mean_length": 179.5625, | |
| "completions/mean_terminated_length": 179.5625, | |
| "completions/min_length": 109.25, | |
| "completions/min_terminated_length": 109.25, | |
| "epoch": 0.041751783965300125, | |
| "grad_norm": 1.2602835048997771, | |
| "kl": 0.37158203125, | |
| "learning_rate": 1.2300712803218834e-06, | |
| "loss": 0.0472, | |
| "num_tokens": 10773077.0, | |
| "reward": 0.2942133641336113, | |
| "reward_std": 0.06957495538517833, | |
| "rewards/code_reward/mean": 0.2942133641336113, | |
| "rewards/code_reward/std": 0.06957494793459773, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 373 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 411.25, | |
| "completions/max_terminated_length": 411.25, | |
| "completions/mean_length": 237.9375, | |
| "completions/mean_terminated_length": 237.9375, | |
| "completions/min_length": 83.5, | |
| "completions/min_terminated_length": 83.5, | |
| "epoch": 0.041863719042955085, | |
| "grad_norm": 1.77643334650333, | |
| "kl": 0.2418212890625, | |
| "learning_rate": 1.2193569822552772e-06, | |
| "loss": -0.0534, | |
| "num_tokens": 10800323.0, | |
| "reward": 0.34760985895991325, | |
| "reward_std": 0.09279043786227703, | |
| "rewards/code_reward/mean": 0.34760985895991325, | |
| "rewards/code_reward/std": 0.09279044345021248, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 374 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 704.75, | |
| "completions/max_terminated_length": 458.25, | |
| "completions/mean_length": 272.03125, | |
| "completions/mean_terminated_length": 221.75000762939453, | |
| "completions/min_length": 117.5, | |
| "completions/min_terminated_length": 117.5, | |
| "epoch": 0.041975654120610044, | |
| "grad_norm": 1.7108480467121594, | |
| "kl": 0.26763916015625, | |
| "learning_rate": 1.2087069069041268e-06, | |
| "loss": 0.075, | |
| "num_tokens": 10826268.0, | |
| "reward": 0.09669792652130127, | |
| "reward_std": 0.14347750786691904, | |
| "rewards/code_reward/mean": 0.09669792652130127, | |
| "rewards/code_reward/std": 0.14347750786691904, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 375 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 446.25, | |
| "completions/max_terminated_length": 446.25, | |
| "completions/mean_length": 234.9375, | |
| "completions/mean_terminated_length": 234.9375, | |
| "completions/min_length": 107.5, | |
| "completions/min_terminated_length": 107.5, | |
| "epoch": 0.042087589198265, | |
| "grad_norm": 1.2434311763858945, | |
| "kl": 0.213134765625, | |
| "learning_rate": 1.1981215011242654e-06, | |
| "loss": 0.131, | |
| "num_tokens": 10854130.0, | |
| "reward": 0.043518811551621184, | |
| "reward_std": 0.047120289877057076, | |
| "rewards/code_reward/mean": 0.043518811551621184, | |
| "rewards/code_reward/std": 0.047120293602347374, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 376 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 239.0, | |
| "completions/max_terminated_length": 239.0, | |
| "completions/mean_length": 136.03125, | |
| "completions/mean_terminated_length": 136.03125, | |
| "completions/min_length": 54.25, | |
| "completions/min_terminated_length": 54.25, | |
| "epoch": 0.04219952427591997, | |
| "grad_norm": 1.671522684663737, | |
| "kl": 0.36328125, | |
| "learning_rate": 1.1876012090581184e-06, | |
| "loss": 0.0534, | |
| "num_tokens": 10877771.0, | |
| "reward": 0.5510788485407829, | |
| "reward_std": 0.11537208966910839, | |
| "rewards/code_reward/mean": 0.5510788485407829, | |
| "rewards/code_reward/std": 0.11537209153175354, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 377 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 304.25, | |
| "completions/max_terminated_length": 304.25, | |
| "completions/mean_length": 165.71875, | |
| "completions/mean_terminated_length": 165.71875, | |
| "completions/min_length": 77.0, | |
| "completions/min_terminated_length": 77.0, | |
| "epoch": 0.04231145935357493, | |
| "grad_norm": 1.1250436666643788, | |
| "kl": 0.33447265625, | |
| "learning_rate": 1.177146472116071e-06, | |
| "loss": -0.0308, | |
| "num_tokens": 10904074.0, | |
| "reward": 0.04710310218797531, | |
| "reward_std": 0.030758424138184637, | |
| "rewards/code_reward/mean": 0.04710310218797531, | |
| "rewards/code_reward/std": 0.030758424138184637, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 378 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 289.5, | |
| "completions/max_terminated_length": 289.5, | |
| "completions/mean_length": 151.8125, | |
| "completions/mean_terminated_length": 151.8125, | |
| "completions/min_length": 73.5, | |
| "completions/min_terminated_length": 73.5, | |
| "epoch": 0.04242339443122989, | |
| "grad_norm": 1.360047496133987, | |
| "kl": 0.3671875, | |
| "learning_rate": 1.1667577289579462e-06, | |
| "loss": -0.0164, | |
| "num_tokens": 10930116.0, | |
| "reward": 0.4695088779553771, | |
| "reward_std": 0.12898865342140198, | |
| "rewards/code_reward/mean": 0.4695088779553771, | |
| "rewards/code_reward/std": 0.12898865342140198, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 379 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 281.5, | |
| "completions/max_terminated_length": 281.5, | |
| "completions/mean_length": 191.65625, | |
| "completions/mean_terminated_length": 191.65625, | |
| "completions/min_length": 91.75, | |
| "completions/min_terminated_length": 91.75, | |
| "epoch": 0.04253532950888485, | |
| "grad_norm": 1.396767149969482, | |
| "kl": 0.222412109375, | |
| "learning_rate": 1.1564354154746007e-06, | |
| "loss": 0.0289, | |
| "num_tokens": 10951289.0, | |
| "reward": 0.38920454680919647, | |
| "reward_std": 0.1452226829715073, | |
| "rewards/code_reward/mean": 0.38920454680919647, | |
| "rewards/code_reward/std": 0.14522269228473306, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 380 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 232.25, | |
| "completions/max_terminated_length": 232.25, | |
| "completions/mean_length": 126.9375, | |
| "completions/mean_terminated_length": 126.9375, | |
| "completions/min_length": 56.75, | |
| "completions/min_terminated_length": 56.75, | |
| "epoch": 0.04264726458653981, | |
| "grad_norm": 2.272607027139455, | |
| "kl": 0.4501953125, | |
| "learning_rate": 1.146179964769635e-06, | |
| "loss": -0.0172, | |
| "num_tokens": 10973007.0, | |
| "reward": 0.5514450334012508, | |
| "reward_std": 0.1807562008034438, | |
| "rewards/code_reward/mean": 0.5514450334012508, | |
| "rewards/code_reward/std": 0.1807561982423067, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 381 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 377.25, | |
| "completions/max_terminated_length": 377.25, | |
| "completions/mean_length": 200.40625, | |
| "completions/mean_terminated_length": 200.40625, | |
| "completions/min_length": 75.5, | |
| "completions/min_terminated_length": 75.5, | |
| "epoch": 0.042759199664194766, | |
| "grad_norm": 1.8894869007416317, | |
| "kl": 0.28662109375, | |
| "learning_rate": 1.1359918071412195e-06, | |
| "loss": 0.0651, | |
| "num_tokens": 11000324.0, | |
| "reward": 0.3411928308196366, | |
| "reward_std": 0.15844399761408567, | |
| "rewards/code_reward/mean": 0.3411928308196366, | |
| "rewards/code_reward/std": 0.15844399388879538, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 382 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 199.25, | |
| "completions/max_terminated_length": 199.25, | |
| "completions/mean_length": 122.78125, | |
| "completions/mean_terminated_length": 122.78125, | |
| "completions/min_length": 74.75, | |
| "completions/min_terminated_length": 74.75, | |
| "epoch": 0.042871134741849726, | |
| "grad_norm": 2.4633639922004784, | |
| "kl": 0.448486328125, | |
| "learning_rate": 1.1258713700640456e-06, | |
| "loss": -0.0042, | |
| "num_tokens": 11025333.0, | |
| "reward": 0.39490123838186264, | |
| "reward_std": 0.09689067304134369, | |
| "rewards/code_reward/mean": 0.39490123838186264, | |
| "rewards/code_reward/std": 0.09689067304134369, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 383 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 291.0, | |
| "completions/max_terminated_length": 291.0, | |
| "completions/mean_length": 170.78125, | |
| "completions/mean_terminated_length": 170.78125, | |
| "completions/min_length": 94.25, | |
| "completions/min_terminated_length": 94.25, | |
| "epoch": 0.042983069819504685, | |
| "grad_norm": 1.5596521493483513, | |
| "kl": 0.26416015625, | |
| "learning_rate": 1.115819078171383e-06, | |
| "loss": -0.0304, | |
| "num_tokens": 11052478.0, | |
| "reward": 0.11266797501593828, | |
| "reward_std": 0.04459898290224373, | |
| "rewards/code_reward/mean": 0.11266797501593828, | |
| "rewards/code_reward/std": 0.04459898569621146, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 384 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 396.75, | |
| "completions/max_terminated_length": 396.75, | |
| "completions/mean_length": 161.8125, | |
| "completions/mean_terminated_length": 161.8125, | |
| "completions/min_length": 66.25, | |
| "completions/min_terminated_length": 66.25, | |
| "epoch": 0.043095004897159644, | |
| "grad_norm": 1.2904801129088306, | |
| "kl": 0.337890625, | |
| "learning_rate": 1.1058353532372667e-06, | |
| "loss": 0.0852, | |
| "num_tokens": 11072608.0, | |
| "reward": 0.39945168420672417, | |
| "reward_std": 0.24530693516135216, | |
| "rewards/code_reward/mean": 0.39945168420672417, | |
| "rewards/code_reward/std": 0.24530693143606186, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 385 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 270.5, | |
| "completions/max_terminated_length": 270.5, | |
| "completions/mean_length": 145.25, | |
| "completions/mean_terminated_length": 145.25, | |
| "completions/min_length": 80.5, | |
| "completions/min_terminated_length": 80.5, | |
| "epoch": 0.04320693997481461, | |
| "grad_norm": 1.533287258494552, | |
| "kl": 0.321533203125, | |
| "learning_rate": 1.0959206141587998e-06, | |
| "loss": -0.0497, | |
| "num_tokens": 11094568.0, | |
| "reward": 0.32392971869558096, | |
| "reward_std": 0.0603926875628531, | |
| "rewards/code_reward/mean": 0.32392971869558096, | |
| "rewards/code_reward/std": 0.060392691288143396, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 386 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 460.25, | |
| "completions/max_terminated_length": 460.25, | |
| "completions/mean_length": 244.65625, | |
| "completions/mean_terminated_length": 244.65625, | |
| "completions/min_length": 101.0, | |
| "completions/min_terminated_length": 101.0, | |
| "epoch": 0.04331887505246957, | |
| "grad_norm": 1.673535799840682, | |
| "kl": 0.1998291015625, | |
| "learning_rate": 1.0860752769385766e-06, | |
| "loss": -0.0478, | |
| "num_tokens": 11115893.0, | |
| "reward": 0.19461633265018463, | |
| "reward_std": 0.2882770374417305, | |
| "rewards/code_reward/mean": 0.19461633265018463, | |
| "rewards/code_reward/std": 0.2882770411670208, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 387 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 260.75, | |
| "completions/max_terminated_length": 260.75, | |
| "completions/mean_length": 164.25, | |
| "completions/mean_terminated_length": 164.25, | |
| "completions/min_length": 75.0, | |
| "completions/min_terminated_length": 75.0, | |
| "epoch": 0.04343081013012453, | |
| "grad_norm": 1.8694819476907374, | |
| "kl": 0.28662109375, | |
| "learning_rate": 1.0762997546672279e-06, | |
| "loss": -0.1618, | |
| "num_tokens": 11140117.0, | |
| "reward": 0.13581378757953644, | |
| "reward_std": 0.1375128449872136, | |
| "rewards/code_reward/mean": 0.13581378757953644, | |
| "rewards/code_reward/std": 0.1375128524377942, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 388 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 363.25, | |
| "completions/max_terminated_length": 363.25, | |
| "completions/mean_length": 193.84375, | |
| "completions/mean_terminated_length": 193.84375, | |
| "completions/min_length": 98.5, | |
| "completions/min_terminated_length": 98.5, | |
| "epoch": 0.04354274520777949, | |
| "grad_norm": 1.0753739650687322, | |
| "kl": 0.333984375, | |
| "learning_rate": 1.0665944575060914e-06, | |
| "loss": 0.0196, | |
| "num_tokens": 11165288.0, | |
| "reward": 0.2044280730187893, | |
| "reward_std": 0.20719696558080614, | |
| "rewards/code_reward/mean": 0.2044280730187893, | |
| "rewards/code_reward/std": 0.20719696604646742, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 389 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 206.5, | |
| "completions/max_terminated_length": 206.5, | |
| "completions/mean_length": 101.28125, | |
| "completions/mean_terminated_length": 101.28125, | |
| "completions/min_length": 45.75, | |
| "completions/min_terminated_length": 45.75, | |
| "epoch": 0.04365468028543445, | |
| "grad_norm": 2.0710686171209, | |
| "kl": 0.34033203125, | |
| "learning_rate": 1.056959792669997e-06, | |
| "loss": 0.0855, | |
| "num_tokens": 11184777.0, | |
| "reward": 0.3098377622663975, | |
| "reward_std": 0.11287019960582256, | |
| "rewards/code_reward/mean": 0.3098377622663975, | |
| "rewards/code_reward/std": 0.11287020146846771, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 390 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 224.5, | |
| "completions/max_terminated_length": 224.5, | |
| "completions/mean_length": 128.0625, | |
| "completions/mean_terminated_length": 128.0625, | |
| "completions/min_length": 68.0, | |
| "completions/min_terminated_length": 68.0, | |
| "epoch": 0.04376661536308941, | |
| "grad_norm": 1.6186447722026904, | |
| "kl": 0.3369140625, | |
| "learning_rate": 1.0473961644101856e-06, | |
| "loss": 0.0431, | |
| "num_tokens": 11207051.0, | |
| "reward": 0.40973464399576187, | |
| "reward_std": 0.2817695839330554, | |
| "rewards/code_reward/mean": 0.40973464399576187, | |
| "rewards/code_reward/std": 0.281769591383636, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 391 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 391.25, | |
| "completions/max_terminated_length": 391.25, | |
| "completions/mean_length": 222.3125, | |
| "completions/mean_terminated_length": 222.3125, | |
| "completions/min_length": 73.25, | |
| "completions/min_terminated_length": 73.25, | |
| "epoch": 0.04387855044074437, | |
| "grad_norm": 1.1817397898346427, | |
| "kl": 0.214599609375, | |
| "learning_rate": 1.037903973997345e-06, | |
| "loss": 0.0475, | |
| "num_tokens": 11225013.0, | |
| "reward": 0.30992063134908676, | |
| "reward_std": 0.14644738845527172, | |
| "rewards/code_reward/mean": 0.30992063134908676, | |
| "rewards/code_reward/std": 0.14644739404320717, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 392 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 259.5, | |
| "completions/max_terminated_length": 259.5, | |
| "completions/mean_length": 149.8125, | |
| "completions/mean_terminated_length": 149.8125, | |
| "completions/min_length": 86.25, | |
| "completions/min_terminated_length": 86.25, | |
| "epoch": 0.043990485518399326, | |
| "grad_norm": 1.2696184824554122, | |
| "kl": 0.26806640625, | |
| "learning_rate": 1.0284836197047737e-06, | |
| "loss": -0.0078, | |
| "num_tokens": 11242503.0, | |
| "reward": 0.4278051145374775, | |
| "reward_std": 0.09913837909698486, | |
| "rewards/code_reward/mean": 0.4278051145374775, | |
| "rewards/code_reward/std": 0.09913837816566229, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 393 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 282.0, | |
| "completions/max_terminated_length": 282.0, | |
| "completions/mean_length": 150.03125, | |
| "completions/mean_terminated_length": 150.03125, | |
| "completions/min_length": 65.5, | |
| "completions/min_terminated_length": 65.5, | |
| "epoch": 0.044102420596054286, | |
| "grad_norm": 1.45552546760352, | |
| "kl": 0.28857421875, | |
| "learning_rate": 1.0191354967916712e-06, | |
| "loss": 0.0331, | |
| "num_tokens": 11269880.0, | |
| "reward": 0.24055082583799958, | |
| "reward_std": 0.11007735197199509, | |
| "rewards/code_reward/mean": 0.24055082583799958, | |
| "rewards/code_reward/std": 0.11007736308965832, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 394 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 444.0, | |
| "completions/max_terminated_length": 444.0, | |
| "completions/mean_length": 169.375, | |
| "completions/mean_terminated_length": 169.375, | |
| "completions/min_length": 73.0, | |
| "completions/min_terminated_length": 73.0, | |
| "epoch": 0.04421435567370925, | |
| "grad_norm": 1.368061303454261, | |
| "kl": 0.311279296875, | |
| "learning_rate": 1.0098599974865515e-06, | |
| "loss": 0.0704, | |
| "num_tokens": 11297700.0, | |
| "reward": 0.07068161107599735, | |
| "reward_std": 0.11775721522280946, | |
| "rewards/code_reward/mean": 0.07068161107599735, | |
| "rewards/code_reward/std": 0.11775722278980538, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 395 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 298.0, | |
| "completions/max_terminated_length": 298.0, | |
| "completions/mean_length": 166.28125, | |
| "completions/mean_terminated_length": 166.28125, | |
| "completions/min_length": 74.0, | |
| "completions/min_terminated_length": 74.0, | |
| "epoch": 0.04432629075136421, | |
| "grad_norm": 2.02580412202015, | |
| "kl": 0.312255859375, | |
| "learning_rate": 1.0006575109707898e-06, | |
| "loss": 0.1445, | |
| "num_tokens": 11315909.0, | |
| "reward": 0.2796209901571274, | |
| "reward_std": 0.20085123018361628, | |
| "rewards/code_reward/mean": 0.2796209901571274, | |
| "rewards/code_reward/std": 0.200851232977584, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 396 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 423.5, | |
| "completions/max_terminated_length": 423.5, | |
| "completions/mean_length": 216.125, | |
| "completions/mean_terminated_length": 216.125, | |
| "completions/min_length": 94.5, | |
| "completions/min_terminated_length": 94.5, | |
| "epoch": 0.04443822582901917, | |
| "grad_norm": 1.711611197709339, | |
| "kl": 0.398681640625, | |
| "learning_rate": 9.915284233622877e-07, | |
| "loss": -0.0014, | |
| "num_tokens": 11345121.0, | |
| "reward": 0.3469575219787657, | |
| "reward_std": 0.2414399441331625, | |
| "rewards/code_reward/mean": 0.3469575219787657, | |
| "rewards/code_reward/std": 0.2414399590343237, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 397 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 339.25, | |
| "completions/max_terminated_length": 339.25, | |
| "completions/mean_length": 163.15625, | |
| "completions/mean_terminated_length": 163.15625, | |
| "completions/min_length": 64.0, | |
| "completions/min_terminated_length": 64.0, | |
| "epoch": 0.04455016090667413, | |
| "grad_norm": 1.6559828290200511, | |
| "kl": 0.32080078125, | |
| "learning_rate": 9.824731176992796e-07, | |
| "loss": 0.0554, | |
| "num_tokens": 11366862.0, | |
| "reward": 0.21360408567124978, | |
| "reward_std": 0.14055794943124056, | |
| "rewards/code_reward/mean": 0.21360408567124978, | |
| "rewards/code_reward/std": 0.14055794943124056, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 398 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 250.5, | |
| "completions/max_terminated_length": 250.5, | |
| "completions/mean_length": 163.09375, | |
| "completions/mean_terminated_length": 163.09375, | |
| "completions/min_length": 97.25, | |
| "completions/min_terminated_length": 97.25, | |
| "epoch": 0.04466209598432909, | |
| "grad_norm": 1.4395787879499458, | |
| "kl": 0.2861328125, | |
| "learning_rate": 9.734919739242543e-07, | |
| "loss": 0.0094, | |
| "num_tokens": 11390465.0, | |
| "reward": 0.37181805819272995, | |
| "reward_std": 0.13883061078377068, | |
| "rewards/code_reward/mean": 0.37181805819272995, | |
| "rewards/code_reward/std": 0.1388306178851053, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 399 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 351.75, | |
| "completions/max_terminated_length": 351.75, | |
| "completions/mean_length": 168.9375, | |
| "completions/mean_terminated_length": 168.9375, | |
| "completions/min_length": 75.0, | |
| "completions/min_terminated_length": 75.0, | |
| "epoch": 0.04477403106198405, | |
| "grad_norm": 1.5497285885828123, | |
| "kl": 0.28857421875, | |
| "learning_rate": 9.645853688680177e-07, | |
| "loss": -0.0077, | |
| "num_tokens": 11412903.0, | |
| "reward": 0.22598881646990776, | |
| "reward_std": 0.05764714028919116, | |
| "rewards/code_reward/mean": 0.22598881646990776, | |
| "rewards/code_reward/std": 0.05764713906683028, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 400 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 810.0, | |
| "completions/max_terminated_length": 618.5, | |
| "completions/mean_length": 326.8125, | |
| "completions/mean_terminated_length": 279.2276916503906, | |
| "completions/min_length": 143.75, | |
| "completions/min_terminated_length": 143.75, | |
| "epoch": 0.04488596613963901, | |
| "grad_norm": 1.3195383455561336, | |
| "kl": 0.1973876953125, | |
| "learning_rate": 9.557536762338786e-07, | |
| "loss": 0.1984, | |
| "num_tokens": 11445705.0, | |
| "reward": 0.4391447389498353, | |
| "reward_std": 0.2860143817961216, | |
| "rewards/code_reward/mean": 0.4391447389498353, | |
| "rewards/code_reward/std": 0.28601440228521824, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 401 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 287.25, | |
| "completions/max_terminated_length": 287.25, | |
| "completions/mean_length": 152.84375, | |
| "completions/mean_terminated_length": 152.84375, | |
| "completions/min_length": 75.25, | |
| "completions/min_terminated_length": 75.25, | |
| "epoch": 0.04499790121729397, | |
| "grad_norm": 1.3849099249418695, | |
| "kl": 0.2900390625, | |
| "learning_rate": 9.46997266581973e-07, | |
| "loss": 0.0243, | |
| "num_tokens": 11470668.0, | |
| "reward": 0.5938801132142544, | |
| "reward_std": 0.22660082660149783, | |
| "rewards/code_reward/mean": 0.5938801132142544, | |
| "rewards/code_reward/std": 0.22660081752110273, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 402 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 446.5, | |
| "completions/max_terminated_length": 446.5, | |
| "completions/mean_length": 251.75, | |
| "completions/mean_terminated_length": 251.75, | |
| "completions/min_length": 149.25, | |
| "completions/min_terminated_length": 149.25, | |
| "epoch": 0.04510983629494893, | |
| "grad_norm": 1.1229376006957024, | |
| "kl": 0.2052001953125, | |
| "learning_rate": 9.383165073137115e-07, | |
| "loss": -0.0179, | |
| "num_tokens": 11493260.0, | |
| "reward": 0.36087851971387863, | |
| "reward_std": 0.14250769466161728, | |
| "rewards/code_reward/mean": 0.36087851971387863, | |
| "rewards/code_reward/std": 0.14250769466161728, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 403 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 299.75, | |
| "completions/max_terminated_length": 299.75, | |
| "completions/mean_length": 161.46875, | |
| "completions/mean_terminated_length": 161.46875, | |
| "completions/min_length": 73.0, | |
| "completions/min_terminated_length": 73.0, | |
| "epoch": 0.04522177137260389, | |
| "grad_norm": 1.3839235753481969, | |
| "kl": 0.34619140625, | |
| "learning_rate": 9.297117626563687e-07, | |
| "loss": 0.1469, | |
| "num_tokens": 11513939.0, | |
| "reward": 0.6742284968495369, | |
| "reward_std": 0.05968676181510091, | |
| "rewards/code_reward/mean": 0.6742284968495369, | |
| "rewards/code_reward/std": 0.059686762280762196, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 404 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 180.5, | |
| "completions/max_terminated_length": 180.5, | |
| "completions/mean_length": 114.53125, | |
| "completions/mean_terminated_length": 114.53125, | |
| "completions/min_length": 73.75, | |
| "completions/min_terminated_length": 73.75, | |
| "epoch": 0.04533370645025885, | |
| "grad_norm": 2.4362013118805237, | |
| "kl": 0.326171875, | |
| "learning_rate": 9.211833936477957e-07, | |
| "loss": 0.0929, | |
| "num_tokens": 11532444.0, | |
| "reward": 0.33231060579419136, | |
| "reward_std": 0.09006076445803046, | |
| "rewards/code_reward/mean": 0.33231060579419136, | |
| "rewards/code_reward/std": 0.09006076492369175, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 405 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 474.75, | |
| "completions/max_terminated_length": 474.75, | |
| "completions/mean_length": 210.15625, | |
| "completions/mean_terminated_length": 210.15625, | |
| "completions/min_length": 100.25, | |
| "completions/min_terminated_length": 100.25, | |
| "epoch": 0.04544564152791381, | |
| "grad_norm": 1.3572230509856378, | |
| "kl": 0.225341796875, | |
| "learning_rate": 9.127317581212753e-07, | |
| "loss": -0.13, | |
| "num_tokens": 11553801.0, | |
| "reward": 0.4415045604109764, | |
| "reward_std": 0.1545610846951604, | |
| "rewards/code_reward/mean": 0.4415045604109764, | |
| "rewards/code_reward/std": 0.1545610912144184, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 406 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 242.0, | |
| "completions/max_terminated_length": 242.0, | |
| "completions/mean_length": 135.875, | |
| "completions/mean_terminated_length": 135.875, | |
| "completions/min_length": 65.25, | |
| "completions/min_terminated_length": 65.25, | |
| "epoch": 0.04555757660556877, | |
| "grad_norm": 1.7906307431939532, | |
| "kl": 0.37890625, | |
| "learning_rate": 9.043572106905084e-07, | |
| "loss": -0.0301, | |
| "num_tokens": 11574885.0, | |
| "reward": 0.2516532065346837, | |
| "reward_std": 0.1726220678538084, | |
| "rewards/code_reward/mean": 0.2516532065346837, | |
| "rewards/code_reward/std": 0.17262207716703415, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 407 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 714.25, | |
| "completions/max_terminated_length": 451.0, | |
| "completions/mean_length": 230.8125, | |
| "completions/mean_terminated_length": 174.68303680419922, | |
| "completions/min_length": 66.5, | |
| "completions/min_terminated_length": 66.5, | |
| "epoch": 0.04566951168322373, | |
| "grad_norm": 2.5575273200953244, | |
| "kl": 0.662841796875, | |
| "learning_rate": 8.960601027347321e-07, | |
| "loss": 0.2496, | |
| "num_tokens": 11602207.0, | |
| "reward": 0.5628770813345909, | |
| "reward_std": 0.1447618722449988, | |
| "rewards/code_reward/mean": 0.5628770813345909, | |
| "rewards/code_reward/std": 0.14476187201216817, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 408 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 240.75, | |
| "completions/max_terminated_length": 240.75, | |
| "completions/mean_length": 148.78125, | |
| "completions/mean_terminated_length": 148.78125, | |
| "completions/min_length": 83.0, | |
| "completions/min_terminated_length": 83.0, | |
| "epoch": 0.04578144676087869, | |
| "grad_norm": 1.6621037231884788, | |
| "kl": 0.3466796875, | |
| "learning_rate": 8.878407823839788e-07, | |
| "loss": 0.0366, | |
| "num_tokens": 11618608.0, | |
| "reward": 0.25418527983129025, | |
| "reward_std": 0.11003150884062052, | |
| "rewards/code_reward/mean": 0.25418527983129025, | |
| "rewards/code_reward/std": 0.11003150977194309, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 409 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 213.5, | |
| "completions/max_terminated_length": 213.5, | |
| "completions/mean_length": 146.90625, | |
| "completions/mean_terminated_length": 146.90625, | |
| "completions/min_length": 91.25, | |
| "completions/min_terminated_length": 91.25, | |
| "epoch": 0.04589338183853365, | |
| "grad_norm": 1.4559367674936419, | |
| "kl": 0.26123046875, | |
| "learning_rate": 8.796995945044689e-07, | |
| "loss": 0.0374, | |
| "num_tokens": 11637933.0, | |
| "reward": 0.36544950399547815, | |
| "reward_std": 0.024054846144281328, | |
| "rewards/code_reward/mean": 0.36544950399547815, | |
| "rewards/code_reward/std": 0.02405484637711197, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 410 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 236.5, | |
| "completions/max_terminated_length": 236.5, | |
| "completions/mean_length": 158.5, | |
| "completions/mean_terminated_length": 158.5, | |
| "completions/min_length": 87.25, | |
| "completions/min_terminated_length": 87.25, | |
| "epoch": 0.04600531691618861, | |
| "grad_norm": 1.6433420190741899, | |
| "kl": 0.28955078125, | |
| "learning_rate": 8.716368806841405e-07, | |
| "loss": -0.0265, | |
| "num_tokens": 11658509.0, | |
| "reward": 0.3223713766783476, | |
| "reward_std": 0.1642971858382225, | |
| "rewards/code_reward/mean": 0.3223713766783476, | |
| "rewards/code_reward/std": 0.16429719096049666, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 411 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 392.25, | |
| "completions/max_terminated_length": 392.25, | |
| "completions/mean_length": 195.6875, | |
| "completions/mean_terminated_length": 195.6875, | |
| "completions/min_length": 119.25, | |
| "completions/min_terminated_length": 119.25, | |
| "epoch": 0.04611725199384357, | |
| "grad_norm": 1.0424618486352764, | |
| "kl": 0.280517578125, | |
| "learning_rate": 8.636529792183171e-07, | |
| "loss": 0.0174, | |
| "num_tokens": 11683955.0, | |
| "reward": 0.3553215153515339, | |
| "reward_std": 0.1284142378717661, | |
| "rewards/code_reward/mean": 0.3553215153515339, | |
| "rewards/code_reward/std": 0.1284142378717661, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 412 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 547.0, | |
| "completions/max_terminated_length": 547.0, | |
| "completions/mean_length": 249.4375, | |
| "completions/mean_terminated_length": 249.4375, | |
| "completions/min_length": 124.25, | |
| "completions/min_terminated_length": 124.25, | |
| "epoch": 0.046229187071498534, | |
| "grad_norm": 0.9268703240625641, | |
| "kl": 0.181884765625, | |
| "learning_rate": 8.557482250955144e-07, | |
| "loss": 0.0329, | |
| "num_tokens": 11707953.0, | |
| "reward": 0.5278465449810028, | |
| "reward_std": 0.059144818456843495, | |
| "rewards/code_reward/mean": 0.5278465449810028, | |
| "rewards/code_reward/std": 0.05914481892250478, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 413 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 324.75, | |
| "completions/max_terminated_length": 324.75, | |
| "completions/mean_length": 194.625, | |
| "completions/mean_terminated_length": 194.625, | |
| "completions/min_length": 112.5, | |
| "completions/min_terminated_length": 112.5, | |
| "epoch": 0.046341122149153494, | |
| "grad_norm": 1.2660479068382882, | |
| "kl": 0.270751953125, | |
| "learning_rate": 8.479229499833844e-07, | |
| "loss": 0.0482, | |
| "num_tokens": 11731301.0, | |
| "reward": 0.20539462007582188, | |
| "reward_std": 0.1615639952942729, | |
| "rewards/code_reward/mean": 0.20539462007582188, | |
| "rewards/code_reward/std": 0.1615639952942729, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 414 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 262.25, | |
| "completions/max_terminated_length": 262.25, | |
| "completions/mean_length": 178.9375, | |
| "completions/mean_terminated_length": 178.9375, | |
| "completions/min_length": 97.75, | |
| "completions/min_terminated_length": 97.75, | |
| "epoch": 0.04645305722680845, | |
| "grad_norm": 1.460499815260377, | |
| "kl": 0.33349609375, | |
| "learning_rate": 8.401774822147976e-07, | |
| "loss": 0.0594, | |
| "num_tokens": 11754019.0, | |
| "reward": 0.3291256055235863, | |
| "reward_std": 0.1305392780341208, | |
| "rewards/code_reward/mean": 0.3291256055235863, | |
| "rewards/code_reward/std": 0.13053929095622152, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 415 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 680.5, | |
| "completions/max_terminated_length": 680.5, | |
| "completions/mean_length": 255.3125, | |
| "completions/mean_terminated_length": 255.3125, | |
| "completions/min_length": 91.25, | |
| "completions/min_terminated_length": 91.25, | |
| "epoch": 0.04656499230446341, | |
| "grad_norm": 1.5950867620592668, | |
| "kl": 0.260498046875, | |
| "learning_rate": 8.325121467740695e-07, | |
| "loss": 0.0056, | |
| "num_tokens": 11784677.0, | |
| "reward": 0.4860835336148739, | |
| "reward_std": 0.19814053922891617, | |
| "rewards/code_reward/mean": 0.4860835336148739, | |
| "rewards/code_reward/std": 0.19814054295420647, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 416 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 380.75, | |
| "completions/max_terminated_length": 380.75, | |
| "completions/mean_length": 203.8125, | |
| "completions/mean_terminated_length": 203.8125, | |
| "completions/min_length": 100.25, | |
| "completions/min_terminated_length": 100.25, | |
| "epoch": 0.04667692738211837, | |
| "grad_norm": 1.4255259834081973, | |
| "kl": 0.23291015625, | |
| "learning_rate": 8.249272652833226e-07, | |
| "loss": 0.0277, | |
| "num_tokens": 11812087.0, | |
| "reward": 0.1272990070283413, | |
| "reward_std": 0.05336737190373242, | |
| "rewards/code_reward/mean": 0.1272990070283413, | |
| "rewards/code_reward/std": 0.053367371554486454, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 417 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 292.0, | |
| "completions/max_terminated_length": 292.0, | |
| "completions/mean_length": 193.78125, | |
| "completions/mean_terminated_length": 193.78125, | |
| "completions/min_length": 109.75, | |
| "completions/min_terminated_length": 109.75, | |
| "epoch": 0.04678886245977333, | |
| "grad_norm": 1.3483016005578479, | |
| "kl": 0.382568359375, | |
| "learning_rate": 8.174231559889931e-07, | |
| "loss": -0.0485, | |
| "num_tokens": 11828464.0, | |
| "reward": 0.3802599012851715, | |
| "reward_std": 0.24896394088864326, | |
| "rewards/code_reward/mean": 0.3802599012851715, | |
| "rewards/code_reward/std": 0.24896394088864326, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 418 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 240.75, | |
| "completions/max_terminated_length": 240.75, | |
| "completions/mean_length": 158.25, | |
| "completions/mean_terminated_length": 158.25, | |
| "completions/min_length": 95.5, | |
| "completions/min_terminated_length": 95.5, | |
| "epoch": 0.04690079753742829, | |
| "grad_norm": 1.4554834982831377, | |
| "kl": 0.31591796875, | |
| "learning_rate": 8.100001337484787e-07, | |
| "loss": 0.0997, | |
| "num_tokens": 11847600.0, | |
| "reward": 0.47475508879870176, | |
| "reward_std": 0.10649433638900518, | |
| "rewards/code_reward/mean": 0.47475508879870176, | |
| "rewards/code_reward/std": 0.10649433825165033, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 419 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 308.5, | |
| "completions/max_terminated_length": 308.5, | |
| "completions/mean_length": 141.1875, | |
| "completions/mean_terminated_length": 141.1875, | |
| "completions/min_length": 68.75, | |
| "completions/min_terminated_length": 68.75, | |
| "epoch": 0.04701273261508325, | |
| "grad_norm": 2.2293342326111887, | |
| "kl": 0.29931640625, | |
| "learning_rate": 8.026585100169251e-07, | |
| "loss": -0.2137, | |
| "num_tokens": 11868750.0, | |
| "reward": 0.5101216156035662, | |
| "reward_std": 0.014662902103736997, | |
| "rewards/code_reward/mean": 0.5101216156035662, | |
| "rewards/code_reward/std": 0.014662901870906353, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 420 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 297.5, | |
| "completions/max_terminated_length": 297.5, | |
| "completions/mean_length": 166.3125, | |
| "completions/mean_terminated_length": 166.3125, | |
| "completions/min_length": 73.0, | |
| "completions/min_terminated_length": 73.0, | |
| "epoch": 0.04712466769273821, | |
| "grad_norm": 1.7014274221258614, | |
| "kl": 0.2890625, | |
| "learning_rate": 7.953985928341601e-07, | |
| "loss": 0.0663, | |
| "num_tokens": 11895960.0, | |
| "reward": 0.5694793821312487, | |
| "reward_std": 0.16413932980503887, | |
| "rewards/code_reward/mean": 0.5694793821312487, | |
| "rewards/code_reward/std": 0.16413932980503887, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 421 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 279.75, | |
| "completions/max_terminated_length": 279.75, | |
| "completions/mean_length": 171.78125, | |
| "completions/mean_terminated_length": 171.78125, | |
| "completions/min_length": 101.5, | |
| "completions/min_terminated_length": 101.5, | |
| "epoch": 0.047236602770393175, | |
| "grad_norm": 1.1761445629861498, | |
| "kl": 0.256103515625, | |
| "learning_rate": 7.882206868117693e-07, | |
| "loss": -0.0198, | |
| "num_tokens": 11919857.0, | |
| "reward": 0.7860226929187775, | |
| "reward_std": 0.15767237346153706, | |
| "rewards/code_reward/mean": 0.7860226929187775, | |
| "rewards/code_reward/std": 0.1576723720645532, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 422 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 335.5, | |
| "completions/max_terminated_length": 335.5, | |
| "completions/mean_length": 185.5, | |
| "completions/mean_terminated_length": 185.5, | |
| "completions/min_length": 92.5, | |
| "completions/min_terminated_length": 92.5, | |
| "epoch": 0.047348537848048135, | |
| "grad_norm": 1.8051288181053138, | |
| "kl": 0.281982421875, | |
| "learning_rate": 7.81125093120313e-07, | |
| "loss": 0.0279, | |
| "num_tokens": 11942537.0, | |
| "reward": 0.3795018047094345, | |
| "reward_std": 0.15914139337837696, | |
| "rewards/code_reward/mean": 0.3795018047094345, | |
| "rewards/code_reward/std": 0.15914138592779636, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 423 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 387.25, | |
| "completions/max_terminated_length": 387.25, | |
| "completions/mean_length": 178.40625, | |
| "completions/mean_terminated_length": 178.40625, | |
| "completions/min_length": 84.75, | |
| "completions/min_terminated_length": 84.75, | |
| "epoch": 0.047460472925703094, | |
| "grad_norm": 1.4032630704318265, | |
| "kl": 0.30078125, | |
| "learning_rate": 7.741121094766916e-07, | |
| "loss": -0.1775, | |
| "num_tokens": 11966390.0, | |
| "reward": 0.6128955632448196, | |
| "reward_std": 0.11446365877054632, | |
| "rewards/code_reward/mean": 0.6128955632448196, | |
| "rewards/code_reward/std": 0.11446366063319147, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 424 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 384.25, | |
| "completions/max_terminated_length": 384.25, | |
| "completions/mean_length": 168.40625, | |
| "completions/mean_terminated_length": 168.40625, | |
| "completions/min_length": 70.25, | |
| "completions/min_terminated_length": 70.25, | |
| "epoch": 0.047572408003358053, | |
| "grad_norm": 1.7373521797945488, | |
| "kl": 0.27392578125, | |
| "learning_rate": 7.671820301316532e-07, | |
| "loss": 0.1031, | |
| "num_tokens": 11991491.0, | |
| "reward": 0.5329861111240461, | |
| "reward_std": 0.2041158601641655, | |
| "rewards/code_reward/mean": 0.5329861111240461, | |
| "rewards/code_reward/std": 0.2041158601641655, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 425 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 376.5, | |
| "completions/max_terminated_length": 376.5, | |
| "completions/mean_length": 191.46875, | |
| "completions/mean_terminated_length": 191.46875, | |
| "completions/min_length": 92.0, | |
| "completions/min_terminated_length": 92.0, | |
| "epoch": 0.04768434308101301, | |
| "grad_norm": 2.0077272533591266, | |
| "kl": 0.25927734375, | |
| "learning_rate": 7.603351458574474e-07, | |
| "loss": 0.1358, | |
| "num_tokens": 12013706.0, | |
| "reward": 0.2916666716337204, | |
| "reward_std": 0.07259188406169415, | |
| "rewards/code_reward/mean": 0.2916666716337204, | |
| "rewards/code_reward/std": 0.072591882199049, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 426 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 479.5, | |
| "completions/max_terminated_length": 479.5, | |
| "completions/mean_length": 218.28125, | |
| "completions/mean_terminated_length": 218.28125, | |
| "completions/min_length": 85.25, | |
| "completions/min_terminated_length": 85.25, | |
| "epoch": 0.04779627815866797, | |
| "grad_norm": 1.3492407743127255, | |
| "kl": 0.30029296875, | |
| "learning_rate": 7.535717439356255e-07, | |
| "loss": 0.031, | |
| "num_tokens": 12042155.0, | |
| "reward": 0.5065476968884468, | |
| "reward_std": 0.2563619986176491, | |
| "rewards/code_reward/mean": 0.5065476968884468, | |
| "rewards/code_reward/std": 0.2563620023429394, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 427 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 268.5, | |
| "completions/max_terminated_length": 268.5, | |
| "completions/mean_length": 178.71875, | |
| "completions/mean_terminated_length": 178.71875, | |
| "completions/min_length": 96.0, | |
| "completions/min_terminated_length": 96.0, | |
| "epoch": 0.04790821323632293, | |
| "grad_norm": 1.279825584779865, | |
| "kl": 0.3115234375, | |
| "learning_rate": 7.46892108144986e-07, | |
| "loss": -0.0739, | |
| "num_tokens": 12066530.0, | |
| "reward": 0.5710227191448212, | |
| "reward_std": 0.18481200002133846, | |
| "rewards/code_reward/mean": 0.5710227191448212, | |
| "rewards/code_reward/std": 0.18481199722737074, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 428 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 262.75, | |
| "completions/max_terminated_length": 262.75, | |
| "completions/mean_length": 157.84375, | |
| "completions/mean_terminated_length": 157.84375, | |
| "completions/min_length": 91.5, | |
| "completions/min_terminated_length": 91.5, | |
| "epoch": 0.04802014831397789, | |
| "grad_norm": 1.2537079099692274, | |
| "kl": 0.32080078125, | |
| "learning_rate": 7.402965187496697e-07, | |
| "loss": -0.062, | |
| "num_tokens": 12093773.0, | |
| "reward": 0.40365831553936005, | |
| "reward_std": 0.06347889173775911, | |
| "rewards/code_reward/mean": 0.40365831553936005, | |
| "rewards/code_reward/std": 0.06347889162134379, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 429 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 342.75, | |
| "completions/max_terminated_length": 342.75, | |
| "completions/mean_length": 178.625, | |
| "completions/mean_terminated_length": 178.625, | |
| "completions/min_length": 84.25, | |
| "completions/min_terminated_length": 84.25, | |
| "epoch": 0.04813208339163285, | |
| "grad_norm": 1.5682030729649754, | |
| "kl": 0.3115234375, | |
| "learning_rate": 7.337852524873974e-07, | |
| "loss": 0.0633, | |
| "num_tokens": 12119825.0, | |
| "reward": 0.4007348418235779, | |
| "reward_std": 0.2273004651069641, | |
| "rewards/code_reward/mean": 0.4007348418235779, | |
| "rewards/code_reward/std": 0.227300476282835, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 430 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 307.75, | |
| "completions/max_terminated_length": 307.75, | |
| "completions/mean_length": 166.8125, | |
| "completions/mean_terminated_length": 166.8125, | |
| "completions/min_length": 99.0, | |
| "completions/min_terminated_length": 99.0, | |
| "epoch": 0.04824401846928781, | |
| "grad_norm": 1.414420572830449, | |
| "kl": 0.25830078125, | |
| "learning_rate": 7.273585825578608e-07, | |
| "loss": -0.004, | |
| "num_tokens": 12141963.0, | |
| "reward": 0.09160848939791322, | |
| "reward_std": 0.09675811271881685, | |
| "rewards/code_reward/mean": 0.09160848939791322, | |
| "rewards/code_reward/std": 0.09675811271881685, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 431 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 289.5, | |
| "completions/max_terminated_length": 289.5, | |
| "completions/mean_length": 160.6875, | |
| "completions/mean_terminated_length": 160.6875, | |
| "completions/min_length": 85.75, | |
| "completions/min_terminated_length": 85.75, | |
| "epoch": 0.048355953546942776, | |
| "grad_norm": 1.2819351419525538, | |
| "kl": 0.299072265625, | |
| "learning_rate": 7.21016778611259e-07, | |
| "loss": 0.0442, | |
| "num_tokens": 12160385.0, | |
| "reward": 0.3042712155729532, | |
| "reward_std": 0.1732648597098887, | |
| "rewards/code_reward/mean": 0.3042712155729532, | |
| "rewards/code_reward/std": 0.17326486064121127, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 432 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 321.0, | |
| "completions/max_terminated_length": 321.0, | |
| "completions/mean_length": 199.65625, | |
| "completions/mean_terminated_length": 199.65625, | |
| "completions/min_length": 90.25, | |
| "completions/min_terminated_length": 90.25, | |
| "epoch": 0.048467888624597735, | |
| "grad_norm": 1.4343614720564808, | |
| "kl": 0.3349609375, | |
| "learning_rate": 7.147601067369835e-07, | |
| "loss": -0.0444, | |
| "num_tokens": 12183238.0, | |
| "reward": 0.2736266343854368, | |
| "reward_std": 0.11790771875530481, | |
| "rewards/code_reward/mean": 0.2736266343854368, | |
| "rewards/code_reward/std": 0.11790771875530481, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 433 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 430.5, | |
| "completions/max_terminated_length": 430.5, | |
| "completions/mean_length": 220.1875, | |
| "completions/mean_terminated_length": 220.1875, | |
| "completions/min_length": 140.25, | |
| "completions/min_terminated_length": 140.25, | |
| "epoch": 0.048579823702252695, | |
| "grad_norm": 1.5149123099259716, | |
| "kl": 0.208740234375, | |
| "learning_rate": 7.085888294524561e-07, | |
| "loss": 0.0555, | |
| "num_tokens": 12205652.0, | |
| "reward": 0.1229942380450666, | |
| "reward_std": 0.1744669363833964, | |
| "rewards/code_reward/mean": 0.1229942380450666, | |
| "rewards/code_reward/std": 0.17446694057434797, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 434 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 280.5, | |
| "completions/max_terminated_length": 280.5, | |
| "completions/mean_length": 172.625, | |
| "completions/mean_terminated_length": 172.625, | |
| "completions/min_length": 68.25, | |
| "completions/min_terminated_length": 68.25, | |
| "epoch": 0.048691758779907654, | |
| "grad_norm": 1.1887088926887228, | |
| "kl": 0.293701171875, | |
| "learning_rate": 7.025032056921117e-07, | |
| "loss": 0.0018, | |
| "num_tokens": 12225632.0, | |
| "reward": 0.36100322124548256, | |
| "reward_std": 0.18402530439198017, | |
| "rewards/code_reward/mean": 0.36100322124548256, | |
| "rewards/code_reward/std": 0.18402530439198017, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 435 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 266.0, | |
| "completions/max_terminated_length": 266.0, | |
| "completions/mean_length": 165.9375, | |
| "completions/mean_terminated_length": 165.9375, | |
| "completions/min_length": 77.5, | |
| "completions/min_terminated_length": 77.5, | |
| "epoch": 0.04880369385756261, | |
| "grad_norm": 1.3337355570198512, | |
| "kl": 0.3330078125, | |
| "learning_rate": 6.965034907965349e-07, | |
| "loss": -0.0914, | |
| "num_tokens": 12244678.0, | |
| "reward": 0.5988663695752621, | |
| "reward_std": 0.20593830198049545, | |
| "rewards/code_reward/mean": 0.5988663695752621, | |
| "rewards/code_reward/std": 0.20593830046709627, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 436 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 690.25, | |
| "completions/max_terminated_length": 308.5, | |
| "completions/mean_length": 242.875, | |
| "completions/mean_terminated_length": 187.48214721679688, | |
| "completions/min_length": 99.75, | |
| "completions/min_terminated_length": 99.75, | |
| "epoch": 0.04891562893521757, | |
| "grad_norm": 1.333932781238413, | |
| "kl": 0.2191162109375, | |
| "learning_rate": 6.905899365017462e-07, | |
| "loss": 0.1709, | |
| "num_tokens": 12265450.0, | |
| "reward": 0.28224857337772846, | |
| "reward_std": 0.13666313188150525, | |
| "rewards/code_reward/mean": 0.28224857337772846, | |
| "rewards/code_reward/std": 0.13666313188150525, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 437 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 410.5, | |
| "completions/max_terminated_length": 410.5, | |
| "completions/mean_length": 176.8125, | |
| "completions/mean_terminated_length": 176.8125, | |
| "completions/min_length": 76.25, | |
| "completions/min_terminated_length": 76.25, | |
| "epoch": 0.04902756401287253, | |
| "grad_norm": 1.2985967740803328, | |
| "kl": 0.283447265625, | |
| "learning_rate": 6.847627909286409e-07, | |
| "loss": 0.1118, | |
| "num_tokens": 12284524.0, | |
| "reward": 0.40528881177306175, | |
| "reward_std": 0.15535564813762903, | |
| "rewards/code_reward/mean": 0.40528881177306175, | |
| "rewards/code_reward/std": 0.15535564627498388, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 438 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 376.25, | |
| "completions/max_terminated_length": 376.25, | |
| "completions/mean_length": 202.625, | |
| "completions/mean_terminated_length": 202.625, | |
| "completions/min_length": 91.75, | |
| "completions/min_terminated_length": 91.75, | |
| "epoch": 0.04913949909052749, | |
| "grad_norm": 1.1950680004085388, | |
| "kl": 0.315185546875, | |
| "learning_rate": 6.790222985725761e-07, | |
| "loss": 0.025, | |
| "num_tokens": 12306080.0, | |
| "reward": 0.48305153474211693, | |
| "reward_std": 0.12105439510196447, | |
| "rewards/code_reward/mean": 0.48305153474211693, | |
| "rewards/code_reward/std": 0.12105440441519022, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 439 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 223.5, | |
| "completions/max_terminated_length": 223.5, | |
| "completions/mean_length": 148.75, | |
| "completions/mean_terminated_length": 148.75, | |
| "completions/min_length": 79.5, | |
| "completions/min_terminated_length": 79.5, | |
| "epoch": 0.04925143416818245, | |
| "grad_norm": 1.7349156594295274, | |
| "kl": 0.23681640625, | |
| "learning_rate": 6.733687002931141e-07, | |
| "loss": -0.0181, | |
| "num_tokens": 12327448.0, | |
| "reward": 0.3754356447607279, | |
| "reward_std": 0.11030078027397394, | |
| "rewards/code_reward/mean": 0.3754356447607279, | |
| "rewards/code_reward/std": 0.11030078679323196, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 440 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 367.5, | |
| "completions/max_terminated_length": 367.5, | |
| "completions/mean_length": 209.40625, | |
| "completions/mean_terminated_length": 209.40625, | |
| "completions/min_length": 101.75, | |
| "completions/min_terminated_length": 101.75, | |
| "epoch": 0.04936336924583742, | |
| "grad_norm": 1.4076525318028708, | |
| "kl": 0.3115234375, | |
| "learning_rate": 6.678022333039158e-07, | |
| "loss": -0.0373, | |
| "num_tokens": 12347901.0, | |
| "reward": 0.09711253456771374, | |
| "reward_std": 0.057268128264695406, | |
| "rewards/code_reward/mean": 0.09711253456771374, | |
| "rewards/code_reward/std": 0.05726812733337283, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 441 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 265.25, | |
| "completions/max_terminated_length": 265.25, | |
| "completions/mean_length": 161.5, | |
| "completions/mean_terminated_length": 161.5, | |
| "completions/min_length": 74.75, | |
| "completions/min_terminated_length": 74.75, | |
| "epoch": 0.049475304323492376, | |
| "grad_norm": 1.4640386964818788, | |
| "kl": 0.293701171875, | |
| "learning_rate": 6.623231311627876e-07, | |
| "loss": 0.0452, | |
| "num_tokens": 12372045.0, | |
| "reward": 0.4493050128221512, | |
| "reward_std": 0.08245376159902662, | |
| "rewards/code_reward/mean": 0.4493050128221512, | |
| "rewards/code_reward/std": 0.08245376858394593, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 442 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 381.25, | |
| "completions/max_terminated_length": 381.25, | |
| "completions/mean_length": 170.96875, | |
| "completions/mean_terminated_length": 170.96875, | |
| "completions/min_length": 85.0, | |
| "completions/min_terminated_length": 85.0, | |
| "epoch": 0.049587239401147336, | |
| "grad_norm": 2.1086386312820666, | |
| "kl": 0.27001953125, | |
| "learning_rate": 6.569316237618811e-07, | |
| "loss": 0.1217, | |
| "num_tokens": 12396860.0, | |
| "reward": 0.34658948611468077, | |
| "reward_std": 0.23270382836926728, | |
| "rewards/code_reward/mean": 0.34658948611468077, | |
| "rewards/code_reward/std": 0.23270384327042848, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 443 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 419.75, | |
| "completions/max_terminated_length": 419.75, | |
| "completions/mean_length": 220.5625, | |
| "completions/mean_terminated_length": 220.5625, | |
| "completions/min_length": 120.25, | |
| "completions/min_terminated_length": 120.25, | |
| "epoch": 0.049699174478802295, | |
| "grad_norm": 1.4630455489204783, | |
| "kl": 0.26171875, | |
| "learning_rate": 6.516279373180499e-07, | |
| "loss": 0.2184, | |
| "num_tokens": 12422750.0, | |
| "reward": 0.37283046543598175, | |
| "reward_std": 0.12992971763014793, | |
| "rewards/code_reward/mean": 0.37283046543598175, | |
| "rewards/code_reward/std": 0.12992972321808338, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 444 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 268.25, | |
| "completions/max_terminated_length": 268.25, | |
| "completions/mean_length": 143.125, | |
| "completions/mean_terminated_length": 143.125, | |
| "completions/min_length": 72.75, | |
| "completions/min_terminated_length": 72.75, | |
| "epoch": 0.049811109556457255, | |
| "grad_norm": 1.6080266837223955, | |
| "kl": 0.27734375, | |
| "learning_rate": 6.464122943633543e-07, | |
| "loss": -0.0419, | |
| "num_tokens": 12441410.0, | |
| "reward": 0.14340316224843264, | |
| "reward_std": 0.15656377002596855, | |
| "rewards/code_reward/mean": 0.14340316224843264, | |
| "rewards/code_reward/std": 0.15656376257538795, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 445 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 356.0, | |
| "completions/max_terminated_length": 356.0, | |
| "completions/mean_length": 222.3125, | |
| "completions/mean_terminated_length": 222.3125, | |
| "completions/min_length": 124.0, | |
| "completions/min_terminated_length": 124.0, | |
| "epoch": 0.049923044634112214, | |
| "grad_norm": 1.2494209477442706, | |
| "kl": 0.2666015625, | |
| "learning_rate": 6.412849137357271e-07, | |
| "loss": -0.0008, | |
| "num_tokens": 12469060.0, | |
| "reward": 0.41476833214983344, | |
| "reward_std": 0.1326767287682742, | |
| "rewards/code_reward/mean": 0.41476833214983344, | |
| "rewards/code_reward/std": 0.1326767250429839, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 446 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 367.25, | |
| "completions/max_terminated_length": 367.25, | |
| "completions/mean_length": 207.5, | |
| "completions/mean_terminated_length": 207.5, | |
| "completions/min_length": 119.5, | |
| "completions/min_terminated_length": 119.5, | |
| "epoch": 0.05003497971176717, | |
| "grad_norm": 1.5486642122578553, | |
| "kl": 0.235107421875, | |
| "learning_rate": 6.3624601056979e-07, | |
| "loss": 0.1428, | |
| "num_tokens": 12493716.0, | |
| "reward": 0.5466772168874741, | |
| "reward_std": 0.3743356466293335, | |
| "rewards/code_reward/mean": 0.5466772168874741, | |
| "rewards/code_reward/std": 0.37433566339313984, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 447 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 450.5, | |
| "completions/max_terminated_length": 450.5, | |
| "completions/mean_length": 191.28125, | |
| "completions/mean_terminated_length": 191.28125, | |
| "completions/min_length": 94.5, | |
| "completions/min_terminated_length": 94.5, | |
| "epoch": 0.05014691478942213, | |
| "grad_norm": 1.3722991110183906, | |
| "kl": 0.2452392578125, | |
| "learning_rate": 6.312957962878278e-07, | |
| "loss": 0.2083, | |
| "num_tokens": 12519901.0, | |
| "reward": 0.4153126999735832, | |
| "reward_std": 0.04972913861274719, | |
| "rewards/code_reward/mean": 0.4153126999735832, | |
| "rewards/code_reward/std": 0.04972913861274719, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 448 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 242.25, | |
| "completions/max_terminated_length": 242.25, | |
| "completions/mean_length": 156.4375, | |
| "completions/mean_terminated_length": 156.4375, | |
| "completions/min_length": 76.75, | |
| "completions/min_terminated_length": 76.75, | |
| "epoch": 0.05025884986707709, | |
| "grad_norm": 1.6733807611752833, | |
| "kl": 0.339599609375, | |
| "learning_rate": 6.264344785909181e-07, | |
| "loss": 0.0653, | |
| "num_tokens": 12537763.0, | |
| "reward": 0.27301738993264735, | |
| "reward_std": 0.14166639209724963, | |
| "rewards/code_reward/mean": 0.27301738993264735, | |
| "rewards/code_reward/std": 0.14166639978066087, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 449 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 685.0, | |
| "completions/max_terminated_length": 364.25, | |
| "completions/mean_length": 219.84375, | |
| "completions/mean_terminated_length": 163.70536041259766, | |
| "completions/min_length": 73.0, | |
| "completions/min_terminated_length": 73.0, | |
| "epoch": 0.05037078494473206, | |
| "grad_norm": 1.3730882038714862, | |
| "kl": 0.3033447265625, | |
| "learning_rate": 6.216622614502149e-07, | |
| "loss": 0.2151, | |
| "num_tokens": 12564502.0, | |
| "reward": 0.27368341060355306, | |
| "reward_std": 0.12103560357354581, | |
| "rewards/code_reward/mean": 0.27368341060355306, | |
| "rewards/code_reward/std": 0.1210356056690216, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 450 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 374.0, | |
| "completions/max_terminated_length": 374.0, | |
| "completions/mean_length": 192.75, | |
| "completions/mean_terminated_length": 192.75, | |
| "completions/min_length": 78.0, | |
| "completions/min_terminated_length": 78.0, | |
| "epoch": 0.05048272002238702, | |
| "grad_norm": 1.1120930858197564, | |
| "kl": 0.256103515625, | |
| "learning_rate": 6.169793450983916e-07, | |
| "loss": 0.0663, | |
| "num_tokens": 12595766.0, | |
| "reward": 0.2891203761100769, | |
| "reward_std": 0.005760519183240831, | |
| "rewards/code_reward/mean": 0.2891203761100769, | |
| "rewards/code_reward/std": 0.005760519299656153, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 451 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 274.75, | |
| "completions/max_terminated_length": 274.75, | |
| "completions/mean_length": 180.71875, | |
| "completions/mean_terminated_length": 180.71875, | |
| "completions/min_length": 107.0, | |
| "completions/min_terminated_length": 107.0, | |
| "epoch": 0.05059465510004198, | |
| "grad_norm": 1.5436514492166131, | |
| "kl": 0.3642578125, | |
| "learning_rate": 6.123859260212393e-07, | |
| "loss": 0.07, | |
| "num_tokens": 12617805.0, | |
| "reward": 0.3369871713221073, | |
| "reward_std": 0.1278561158105731, | |
| "rewards/code_reward/mean": 0.3369871713221073, | |
| "rewards/code_reward/std": 0.12785612046718597, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 452 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 417.5, | |
| "completions/max_terminated_length": 417.5, | |
| "completions/mean_length": 189.8125, | |
| "completions/mean_terminated_length": 189.8125, | |
| "completions/min_length": 80.25, | |
| "completions/min_terminated_length": 80.25, | |
| "epoch": 0.050706590177696936, | |
| "grad_norm": 1.655439639393973, | |
| "kl": 0.302978515625, | |
| "learning_rate": 6.07882196949423e-07, | |
| "loss": -0.0797, | |
| "num_tokens": 12641655.0, | |
| "reward": 0.19023456424474716, | |
| "reward_std": 0.11184495687484741, | |
| "rewards/code_reward/mean": 0.19023456424474716, | |
| "rewards/code_reward/std": 0.11184496060013771, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 453 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 437.0, | |
| "completions/max_terminated_length": 437.0, | |
| "completions/mean_length": 267.09375, | |
| "completions/mean_terminated_length": 267.09375, | |
| "completions/min_length": 137.5, | |
| "completions/min_terminated_length": 137.5, | |
| "epoch": 0.050818525255351896, | |
| "grad_norm": 1.0464296647386528, | |
| "kl": 0.1806640625, | |
| "learning_rate": 6.034683468503948e-07, | |
| "loss": -0.0225, | |
| "num_tokens": 12663874.0, | |
| "reward": 0.3470753263682127, | |
| "reward_std": 0.1855549574829638, | |
| "rewards/code_reward/mean": 0.3470753263682127, | |
| "rewards/code_reward/std": 0.1855549574829638, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 454 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 679.0, | |
| "completions/max_terminated_length": 679.0, | |
| "completions/mean_length": 245.15625, | |
| "completions/mean_terminated_length": 245.15625, | |
| "completions/min_length": 89.75, | |
| "completions/min_terminated_length": 89.75, | |
| "epoch": 0.050930460333006855, | |
| "grad_norm": 1.3181084417709616, | |
| "kl": 0.22265625, | |
| "learning_rate": 5.991445609204641e-07, | |
| "loss": -0.2186, | |
| "num_tokens": 12690983.0, | |
| "reward": 0.3848821893334389, | |
| "reward_std": 0.1244323942810297, | |
| "rewards/code_reward/mean": 0.3848821893334389, | |
| "rewards/code_reward/std": 0.12443239195272326, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 455 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 298.5, | |
| "completions/max_terminated_length": 298.5, | |
| "completions/mean_length": 179.1875, | |
| "completions/mean_terminated_length": 179.1875, | |
| "completions/min_length": 97.5, | |
| "completions/min_terminated_length": 97.5, | |
| "epoch": 0.051042395410661814, | |
| "grad_norm": 1.1842353804957628, | |
| "kl": 0.24755859375, | |
| "learning_rate": 5.949110205770292e-07, | |
| "loss": -0.0126, | |
| "num_tokens": 12714285.0, | |
| "reward": 0.23306879866868258, | |
| "reward_std": 0.058227866189554334, | |
| "rewards/code_reward/mean": 0.23306879866868258, | |
| "rewards/code_reward/std": 0.05822786991484463, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 456 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 424.75, | |
| "completions/max_terminated_length": 424.75, | |
| "completions/mean_length": 212.28125, | |
| "completions/mean_terminated_length": 212.28125, | |
| "completions/min_length": 89.25, | |
| "completions/min_terminated_length": 89.25, | |
| "epoch": 0.051154330488316774, | |
| "grad_norm": 1.361690839229129, | |
| "kl": 0.21875, | |
| "learning_rate": 5.90767903450964e-07, | |
| "loss": 0.0986, | |
| "num_tokens": 12738542.0, | |
| "reward": 0.07210950274020433, | |
| "reward_std": 0.05199644831009209, | |
| "rewards/code_reward/mean": 0.07210950274020433, | |
| "rewards/code_reward/std": 0.05199644842650741, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 457 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 288.75, | |
| "completions/max_terminated_length": 288.75, | |
| "completions/mean_length": 172.09375, | |
| "completions/mean_terminated_length": 172.09375, | |
| "completions/min_length": 76.25, | |
| "completions/min_terminated_length": 76.25, | |
| "epoch": 0.05126626556597173, | |
| "grad_norm": 1.627223053243688, | |
| "kl": 0.25048828125, | |
| "learning_rate": 5.867153833791652e-07, | |
| "loss": -0.0443, | |
| "num_tokens": 12761809.0, | |
| "reward": 0.2288264101371169, | |
| "reward_std": 0.18154687527567148, | |
| "rewards/code_reward/mean": 0.2288264101371169, | |
| "rewards/code_reward/std": 0.18154688365757465, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 458 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 853.75, | |
| "completions/max_terminated_length": 403.0, | |
| "completions/mean_length": 259.6875, | |
| "completions/mean_terminated_length": 201.59375381469727, | |
| "completions/min_length": 119.25, | |
| "completions/min_terminated_length": 119.25, | |
| "epoch": 0.0513782006436267, | |
| "grad_norm": 1.37901004815236, | |
| "kl": 0.231201171875, | |
| "learning_rate": 5.827536303972587e-07, | |
| "loss": 0.3202, | |
| "num_tokens": 12797623.0, | |
| "reward": 0.4052652306854725, | |
| "reward_std": 0.09384694416075945, | |
| "rewards/code_reward/mean": 0.4052652306854725, | |
| "rewards/code_reward/std": 0.09384695184417069, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 459 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 467.5, | |
| "completions/max_terminated_length": 467.5, | |
| "completions/mean_length": 190.0625, | |
| "completions/mean_terminated_length": 190.0625, | |
| "completions/min_length": 88.25, | |
| "completions/min_terminated_length": 88.25, | |
| "epoch": 0.05149013572128166, | |
| "grad_norm": 1.1377121561938153, | |
| "kl": 0.241943359375, | |
| "learning_rate": 5.78882810732465e-07, | |
| "loss": -0.0275, | |
| "num_tokens": 12819217.0, | |
| "reward": 0.45869156159460545, | |
| "reward_std": 0.021421764977276325, | |
| "rewards/code_reward/mean": 0.45869156159460545, | |
| "rewards/code_reward/std": 0.021421766839921474, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 460 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 263.25, | |
| "completions/max_terminated_length": 263.25, | |
| "completions/mean_length": 155.9375, | |
| "completions/mean_terminated_length": 155.9375, | |
| "completions/min_length": 84.5, | |
| "completions/min_terminated_length": 84.5, | |
| "epoch": 0.05160207079893662, | |
| "grad_norm": 1.1807451529495485, | |
| "kl": 0.306884765625, | |
| "learning_rate": 5.75103086796625e-07, | |
| "loss": -0.0194, | |
| "num_tokens": 12843095.0, | |
| "reward": 0.018822902347892523, | |
| "reward_std": 0.015156067907810211, | |
| "rewards/code_reward/mean": 0.018822902347892523, | |
| "rewards/code_reward/std": 0.015156067907810211, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 461 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 404.0, | |
| "completions/max_terminated_length": 404.0, | |
| "completions/mean_length": 258.4375, | |
| "completions/mean_terminated_length": 258.4375, | |
| "completions/min_length": 161.25, | |
| "completions/min_terminated_length": 161.25, | |
| "epoch": 0.05171400587659158, | |
| "grad_norm": 1.336417383594584, | |
| "kl": 0.199951171875, | |
| "learning_rate": 5.714146171793846e-07, | |
| "loss": 0.1694, | |
| "num_tokens": 12866173.0, | |
| "reward": 0.13850605115294456, | |
| "reward_std": 0.0918186865746975, | |
| "rewards/code_reward/mean": 0.13850605115294456, | |
| "rewards/code_reward/std": 0.09181869029998779, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 462 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 452.5, | |
| "completions/max_terminated_length": 452.5, | |
| "completions/mean_length": 190.40625, | |
| "completions/mean_terminated_length": 190.40625, | |
| "completions/min_length": 83.75, | |
| "completions/min_terminated_length": 83.75, | |
| "epoch": 0.05182594095424654, | |
| "grad_norm": 1.7230754097104986, | |
| "kl": 0.298583984375, | |
| "learning_rate": 5.678175566415422e-07, | |
| "loss": 0.0863, | |
| "num_tokens": 12892290.0, | |
| "reward": 0.33602308854460716, | |
| "reward_std": 0.07540364377200603, | |
| "rewards/code_reward/mean": 0.33602308854460716, | |
| "rewards/code_reward/std": 0.07540364749729633, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 463 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 373.75, | |
| "completions/max_terminated_length": 373.75, | |
| "completions/mean_length": 159.21875, | |
| "completions/mean_terminated_length": 159.21875, | |
| "completions/min_length": 69.25, | |
| "completions/min_terminated_length": 69.25, | |
| "epoch": 0.051937876031901496, | |
| "grad_norm": 1.5832482382803177, | |
| "kl": 0.1796875, | |
| "learning_rate": 5.643120561085528e-07, | |
| "loss": -0.0099, | |
| "num_tokens": 12911025.0, | |
| "reward": 0.5693264603614807, | |
| "reward_std": 0.09451888594776392, | |
| "rewards/code_reward/mean": 0.5693264603614807, | |
| "rewards/code_reward/std": 0.09451888781040907, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 464 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 771.25, | |
| "completions/max_terminated_length": 698.25, | |
| "completions/mean_length": 317.71875, | |
| "completions/mean_terminated_length": 270.93304443359375, | |
| "completions/min_length": 121.25, | |
| "completions/min_terminated_length": 121.25, | |
| "epoch": 0.052049811109556456, | |
| "grad_norm": 0.5755700855439375, | |
| "kl": 0.2149658203125, | |
| "learning_rate": 5.608982626641991e-07, | |
| "loss": 0.033, | |
| "num_tokens": 12946576.0, | |
| "reward": 0.3332868255674839, | |
| "reward_std": 0.08039725571870804, | |
| "rewards/code_reward/mean": 0.3332868255674839, | |
| "rewards/code_reward/std": 0.08039725571870804, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 465 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 229.25, | |
| "completions/max_terminated_length": 229.25, | |
| "completions/mean_length": 158.65625, | |
| "completions/mean_terminated_length": 158.65625, | |
| "completions/min_length": 84.25, | |
| "completions/min_terminated_length": 84.25, | |
| "epoch": 0.052161746187211415, | |
| "grad_norm": 1.4999164462281511, | |
| "kl": 0.25244140625, | |
| "learning_rate": 5.575763195444166e-07, | |
| "loss": 0.1101, | |
| "num_tokens": 12962893.0, | |
| "reward": 0.26572345197200775, | |
| "reward_std": 0.08325213519856334, | |
| "rewards/code_reward/mean": 0.26572345197200775, | |
| "rewards/code_reward/std": 0.08325213845819235, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 466 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 256.75, | |
| "completions/max_terminated_length": 256.75, | |
| "completions/mean_length": 156.65625, | |
| "completions/mean_terminated_length": 156.65625, | |
| "completions/min_length": 84.25, | |
| "completions/min_terminated_length": 84.25, | |
| "epoch": 0.052273681264866374, | |
| "grad_norm": 1.5654218625865297, | |
| "kl": 0.29931640625, | |
| "learning_rate": 5.543463661312847e-07, | |
| "loss": 0.0124, | |
| "num_tokens": 12989394.0, | |
| "reward": 0.4510860964655876, | |
| "reward_std": 0.038132989313453436, | |
| "rewards/code_reward/mean": 0.4510860964655876, | |
| "rewards/code_reward/std": 0.03813299024477601, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 467 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 284.75, | |
| "completions/max_terminated_length": 284.75, | |
| "completions/mean_length": 189.0625, | |
| "completions/mean_terminated_length": 189.0625, | |
| "completions/min_length": 110.25, | |
| "completions/min_terminated_length": 110.25, | |
| "epoch": 0.05238561634252134, | |
| "grad_norm": 1.1317457646109947, | |
| "kl": 0.223388671875, | |
| "learning_rate": 5.512085379471808e-07, | |
| "loss": -0.0249, | |
| "num_tokens": 13014908.0, | |
| "reward": 0.3786849081516266, | |
| "reward_std": 0.1657154718413949, | |
| "rewards/code_reward/mean": 0.3786849081516266, | |
| "rewards/code_reward/std": 0.16571548115462065, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 468 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 308.25, | |
| "completions/max_terminated_length": 308.25, | |
| "completions/mean_length": 182.21875, | |
| "completions/mean_terminated_length": 182.21875, | |
| "completions/min_length": 90.75, | |
| "completions/min_terminated_length": 90.75, | |
| "epoch": 0.0524975514201763, | |
| "grad_norm": 1.3580873503854825, | |
| "kl": 0.2578125, | |
| "learning_rate": 5.481629666490903e-07, | |
| "loss": 0.041, | |
| "num_tokens": 13033779.0, | |
| "reward": 0.6068142354488373, | |
| "reward_std": 0.21611789241433144, | |
| "rewards/code_reward/mean": 0.6068142354488373, | |
| "rewards/code_reward/std": 0.21611790172755718, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 469 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 379.75, | |
| "completions/max_terminated_length": 379.75, | |
| "completions/mean_length": 216.59375, | |
| "completions/mean_terminated_length": 216.59375, | |
| "completions/min_length": 120.5, | |
| "completions/min_terminated_length": 120.5, | |
| "epoch": 0.05260948649783126, | |
| "grad_norm": 1.2530077335631926, | |
| "kl": 0.28271484375, | |
| "learning_rate": 5.452097800230853e-07, | |
| "loss": 0.0203, | |
| "num_tokens": 13058070.0, | |
| "reward": 0.4219468259252608, | |
| "reward_std": 0.08188007143326104, | |
| "rewards/code_reward/mean": 0.4219468259252608, | |
| "rewards/code_reward/std": 0.08188007143326104, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 470 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 373.25, | |
| "completions/max_terminated_length": 373.25, | |
| "completions/mean_length": 230.09375, | |
| "completions/mean_terminated_length": 230.09375, | |
| "completions/min_length": 153.25, | |
| "completions/min_terminated_length": 153.25, | |
| "epoch": 0.05272142157548622, | |
| "grad_norm": 1.8321679081555318, | |
| "kl": 0.239013671875, | |
| "learning_rate": 5.423491019789623e-07, | |
| "loss": -0.1368, | |
| "num_tokens": 13082769.0, | |
| "reward": 0.30861951038241386, | |
| "reward_std": 0.16406975965946913, | |
| "rewards/code_reward/mean": 0.30861951038241386, | |
| "rewards/code_reward/std": 0.16406976664438844, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 471 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 404.75, | |
| "completions/max_terminated_length": 404.75, | |
| "completions/mean_length": 182.65625, | |
| "completions/mean_terminated_length": 182.65625, | |
| "completions/min_length": 90.0, | |
| "completions/min_terminated_length": 90.0, | |
| "epoch": 0.05283335665314118, | |
| "grad_norm": 0.9729476481805254, | |
| "kl": 0.255126953125, | |
| "learning_rate": 5.395810525450425e-07, | |
| "loss": 0.0919, | |
| "num_tokens": 13106534.0, | |
| "reward": 0.21364107308909297, | |
| "reward_std": 0.09037529258057475, | |
| "rewards/code_reward/mean": 0.21364107308909297, | |
| "rewards/code_reward/std": 0.09037529304623604, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 472 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 315.5, | |
| "completions/max_terminated_length": 315.5, | |
| "completions/mean_length": 170.96875, | |
| "completions/mean_terminated_length": 170.96875, | |
| "completions/min_length": 92.75, | |
| "completions/min_terminated_length": 92.75, | |
| "epoch": 0.05294529173079614, | |
| "grad_norm": 0.9407242390344841, | |
| "kl": 0.236083984375, | |
| "learning_rate": 5.369057478631359e-07, | |
| "loss": 0.0076, | |
| "num_tokens": 13125717.0, | |
| "reward": 0.18790849673678167, | |
| "reward_std": 0.11648390302434564, | |
| "rewards/code_reward/mean": 0.18790849673678167, | |
| "rewards/code_reward/std": 0.11648390302434564, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 473 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 429.0, | |
| "completions/max_terminated_length": 429.0, | |
| "completions/mean_length": 287.9375, | |
| "completions/mean_terminated_length": 287.9375, | |
| "completions/min_length": 140.0, | |
| "completions/min_terminated_length": 140.0, | |
| "epoch": 0.0530572268084511, | |
| "grad_norm": 0.6022039821926372, | |
| "kl": 0.2093505859375, | |
| "learning_rate": 5.343233001836694e-07, | |
| "loss": -0.0311, | |
| "num_tokens": 13152515.0, | |
| "reward": 0.46875, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/code_reward/mean": 0.46875, | |
| "rewards/code_reward/std": 0.0883883461356163, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 474 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 255.75, | |
| "completions/max_terminated_length": 255.75, | |
| "completions/mean_length": 157.5625, | |
| "completions/mean_terminated_length": 157.5625, | |
| "completions/min_length": 93.0, | |
| "completions/min_terminated_length": 93.0, | |
| "epoch": 0.053169161886106056, | |
| "grad_norm": 1.3145736475742311, | |
| "kl": 0.292236328125, | |
| "learning_rate": 5.318338178609754e-07, | |
| "loss": -0.0802, | |
| "num_tokens": 13178797.0, | |
| "reward": 0.07483806018717587, | |
| "reward_std": 0.034171308507211506, | |
| "rewards/code_reward/mean": 0.07483806018717587, | |
| "rewards/code_reward/std": 0.03417130699381232, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 475 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 396.25, | |
| "completions/max_terminated_length": 396.25, | |
| "completions/mean_length": 235.125, | |
| "completions/mean_terminated_length": 235.125, | |
| "completions/min_length": 119.25, | |
| "completions/min_terminated_length": 119.25, | |
| "epoch": 0.053281096963761015, | |
| "grad_norm": 1.0884074198105211, | |
| "kl": 0.156005859375, | |
| "learning_rate": 5.294374053487459e-07, | |
| "loss": 0.0987, | |
| "num_tokens": 13203897.0, | |
| "reward": 0.15914655849337578, | |
| "reward_std": 0.08056560717523098, | |
| "rewards/code_reward/mean": 0.15914655849337578, | |
| "rewards/code_reward/std": 0.0805656099691987, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 476 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 263.75, | |
| "completions/max_terminated_length": 263.75, | |
| "completions/mean_length": 168.9375, | |
| "completions/mean_terminated_length": 168.9375, | |
| "completions/min_length": 99.0, | |
| "completions/min_terminated_length": 99.0, | |
| "epoch": 0.05339303204141598, | |
| "grad_norm": 0.9450638032428148, | |
| "kl": 0.3330078125, | |
| "learning_rate": 5.271341631956511e-07, | |
| "loss": -0.038, | |
| "num_tokens": 13233671.0, | |
| "reward": 0.5676594115793705, | |
| "reward_std": 0.0675080195069313, | |
| "rewards/code_reward/mean": 0.5676594115793705, | |
| "rewards/code_reward/std": 0.0675080232322216, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 477 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 334.75, | |
| "completions/max_terminated_length": 334.75, | |
| "completions/mean_length": 192.8125, | |
| "completions/mean_terminated_length": 192.8125, | |
| "completions/min_length": 88.75, | |
| "completions/min_terminated_length": 88.75, | |
| "epoch": 0.05350496711907094, | |
| "grad_norm": 1.336348928731024, | |
| "kl": 0.270751953125, | |
| "learning_rate": 5.249241880411181e-07, | |
| "loss": -0.0158, | |
| "num_tokens": 13265057.0, | |
| "reward": 0.381644893437624, | |
| "reward_std": 0.09550960175693035, | |
| "rewards/code_reward/mean": 0.381644893437624, | |
| "rewards/code_reward/std": 0.09550959896296263, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 478 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 299.25, | |
| "completions/max_terminated_length": 299.25, | |
| "completions/mean_length": 193.1875, | |
| "completions/mean_terminated_length": 193.1875, | |
| "completions/min_length": 114.0, | |
| "completions/min_terminated_length": 114.0, | |
| "epoch": 0.0536169021967259, | |
| "grad_norm": 1.5049532201349622, | |
| "kl": 0.23974609375, | |
| "learning_rate": 5.228075726112785e-07, | |
| "loss": 0.0894, | |
| "num_tokens": 13285927.0, | |
| "reward": 0.16660759504884481, | |
| "reward_std": 0.04511617706157267, | |
| "rewards/code_reward/mean": 0.16660759504884481, | |
| "rewards/code_reward/std": 0.04511618078686297, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 479 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 325.5, | |
| "completions/max_terminated_length": 325.5, | |
| "completions/mean_length": 193.0, | |
| "completions/mean_terminated_length": 193.0, | |
| "completions/min_length": 116.75, | |
| "completions/min_terminated_length": 116.75, | |
| "epoch": 0.05372883727438086, | |
| "grad_norm": 1.3864869015820263, | |
| "kl": 0.21337890625, | |
| "learning_rate": 5.207844057150768e-07, | |
| "loss": 0.158, | |
| "num_tokens": 13303407.0, | |
| "reward": 0.6197916716337204, | |
| "reward_std": 0.20998739823698997, | |
| "rewards/code_reward/mean": 0.6197916716337204, | |
| "rewards/code_reward/std": 0.20998739078640938, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 480 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 265.5, | |
| "completions/max_terminated_length": 265.5, | |
| "completions/mean_length": 170.75, | |
| "completions/mean_terminated_length": 170.75, | |
| "completions/min_length": 92.75, | |
| "completions/min_terminated_length": 92.75, | |
| "epoch": 0.05384077235203582, | |
| "grad_norm": 1.7579933265437377, | |
| "kl": 0.256591796875, | |
| "learning_rate": 5.188547722405437e-07, | |
| "loss": 0.0498, | |
| "num_tokens": 13323183.0, | |
| "reward": 0.30053258687257767, | |
| "reward_std": 0.14006465952843428, | |
| "rewards/code_reward/mean": 0.30053258687257767, | |
| "rewards/code_reward/std": 0.1400646585971117, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 481 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 254.75, | |
| "completions/max_terminated_length": 254.75, | |
| "completions/mean_length": 146.4375, | |
| "completions/mean_terminated_length": 146.4375, | |
| "completions/min_length": 70.25, | |
| "completions/min_terminated_length": 70.25, | |
| "epoch": 0.05395270742969078, | |
| "grad_norm": 1.4622693544793195, | |
| "kl": 0.31591796875, | |
| "learning_rate": 5.170187531512351e-07, | |
| "loss": 0.0813, | |
| "num_tokens": 13348197.0, | |
| "reward": 0.2889851483050734, | |
| "reward_std": 0.04471541300881654, | |
| "rewards/code_reward/mean": 0.2889851483050734, | |
| "rewards/code_reward/std": 0.044715409399941564, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 482 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 292.5, | |
| "completions/max_terminated_length": 292.5, | |
| "completions/mean_length": 178.875, | |
| "completions/mean_terminated_length": 178.875, | |
| "completions/min_length": 100.75, | |
| "completions/min_terminated_length": 100.75, | |
| "epoch": 0.05406464250734574, | |
| "grad_norm": 1.1459398821291238, | |
| "kl": 0.280029296875, | |
| "learning_rate": 5.152764254828348e-07, | |
| "loss": 0.1023, | |
| "num_tokens": 13372969.0, | |
| "reward": 0.5509072579443455, | |
| "reward_std": 0.1530819907784462, | |
| "rewards/code_reward/mean": 0.5509072579443455, | |
| "rewards/code_reward/std": 0.1530819982290268, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 483 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 220.25, | |
| "completions/max_terminated_length": 220.25, | |
| "completions/mean_length": 151.3125, | |
| "completions/mean_terminated_length": 151.3125, | |
| "completions/min_length": 80.5, | |
| "completions/min_terminated_length": 80.5, | |
| "epoch": 0.0541765775850007, | |
| "grad_norm": 1.5763809825687893, | |
| "kl": 0.2509765625, | |
| "learning_rate": 5.136278623399225e-07, | |
| "loss": -0.0076, | |
| "num_tokens": 13397611.0, | |
| "reward": 0.43999266996979713, | |
| "reward_std": 0.15885511133819818, | |
| "rewards/code_reward/mean": 0.43999266996979713, | |
| "rewards/code_reward/std": 0.15885510575026274, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 484 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 348.25, | |
| "completions/max_terminated_length": 348.25, | |
| "completions/mean_length": 190.71875, | |
| "completions/mean_terminated_length": 190.71875, | |
| "completions/min_length": 70.25, | |
| "completions/min_terminated_length": 70.25, | |
| "epoch": 0.05428851266265566, | |
| "grad_norm": 1.543893067503445, | |
| "kl": 0.22705078125, | |
| "learning_rate": 5.120731328929058e-07, | |
| "loss": 0.1822, | |
| "num_tokens": 13421994.0, | |
| "reward": 0.4371974468231201, | |
| "reward_std": 0.06127816252410412, | |
| "rewards/code_reward/mean": 0.4371974468231201, | |
| "rewards/code_reward/std": 0.06127816252410412, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 485 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 271.0, | |
| "completions/max_terminated_length": 271.0, | |
| "completions/mean_length": 158.1875, | |
| "completions/mean_terminated_length": 158.1875, | |
| "completions/min_length": 79.25, | |
| "completions/min_terminated_length": 79.25, | |
| "epoch": 0.05440044774031062, | |
| "grad_norm": 2.287656708183697, | |
| "kl": 0.248046875, | |
| "learning_rate": 5.106123023751187e-07, | |
| "loss": 0.1385, | |
| "num_tokens": 13446792.0, | |
| "reward": 0.37210020469501615, | |
| "reward_std": 0.13278006156906486, | |
| "rewards/code_reward/mean": 0.37210020469501615, | |
| "rewards/code_reward/std": 0.132780060172081, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 486 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 296.75, | |
| "completions/max_terminated_length": 296.75, | |
| "completions/mean_length": 184.6875, | |
| "completions/mean_terminated_length": 184.6875, | |
| "completions/min_length": 116.75, | |
| "completions/min_terminated_length": 116.75, | |
| "epoch": 0.05451238281796558, | |
| "grad_norm": 1.5495749732421749, | |
| "kl": 0.2958984375, | |
| "learning_rate": 5.092454320800833e-07, | |
| "loss": 0.0935, | |
| "num_tokens": 13472366.0, | |
| "reward": 0.29983099177479744, | |
| "reward_std": 0.1254198516253382, | |
| "rewards/code_reward/mean": 0.29983099177479744, | |
| "rewards/code_reward/std": 0.1254198516253382, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 487 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 726.5, | |
| "completions/max_terminated_length": 252.75, | |
| "completions/mean_length": 214.78125, | |
| "completions/mean_terminated_length": 153.8258934020996, | |
| "completions/min_length": 74.75, | |
| "completions/min_terminated_length": 74.75, | |
| "epoch": 0.05462431789562054, | |
| "grad_norm": 1.1195840237499781, | |
| "kl": 0.3201904296875, | |
| "learning_rate": 5.079725793589405e-07, | |
| "loss": 0.0209, | |
| "num_tokens": 13498479.0, | |
| "reward": 0.5496091386303306, | |
| "reward_std": 0.08358209393918514, | |
| "rewards/code_reward/mean": 0.5496091386303306, | |
| "rewards/code_reward/std": 0.08358209580183029, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 488 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 250.5, | |
| "completions/max_terminated_length": 250.5, | |
| "completions/mean_length": 134.15625, | |
| "completions/mean_terminated_length": 134.15625, | |
| "completions/min_length": 74.75, | |
| "completions/min_terminated_length": 74.75, | |
| "epoch": 0.0547362529732755, | |
| "grad_norm": 1.6494391124504628, | |
| "kl": 0.277099609375, | |
| "learning_rate": 5.067937976180407e-07, | |
| "loss": 0.158, | |
| "num_tokens": 13520068.0, | |
| "reward": 0.4070262387394905, | |
| "reward_std": 0.1368686156347394, | |
| "rewards/code_reward/mean": 0.4070262387394905, | |
| "rewards/code_reward/std": 0.13686862308532, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 489 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 212.0, | |
| "completions/max_terminated_length": 212.0, | |
| "completions/mean_length": 142.4375, | |
| "completions/mean_terminated_length": 142.4375, | |
| "completions/min_length": 90.75, | |
| "completions/min_terminated_length": 90.75, | |
| "epoch": 0.05484818805093046, | |
| "grad_norm": 1.8021016908957268, | |
| "kl": 0.306640625, | |
| "learning_rate": 5.057091363167046e-07, | |
| "loss": -0.0293, | |
| "num_tokens": 13540754.0, | |
| "reward": 0.2056608572602272, | |
| "reward_std": 0.09596531838178635, | |
| "rewards/code_reward/mean": 0.2056608572602272, | |
| "rewards/code_reward/std": 0.09596531558781862, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 490 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 342.75, | |
| "completions/max_terminated_length": 342.75, | |
| "completions/mean_length": 205.40625, | |
| "completions/mean_terminated_length": 205.40625, | |
| "completions/min_length": 105.25, | |
| "completions/min_terminated_length": 105.25, | |
| "epoch": 0.05496012312858542, | |
| "grad_norm": 1.1900185406401222, | |
| "kl": 0.287109375, | |
| "learning_rate": 5.047186409651489e-07, | |
| "loss": -0.002, | |
| "num_tokens": 13564991.0, | |
| "reward": 0.4886061754077673, | |
| "reward_std": 0.2226488906890154, | |
| "rewards/code_reward/mean": 0.4886061754077673, | |
| "rewards/code_reward/std": 0.22264889813959599, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 491 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 295.5, | |
| "completions/max_terminated_length": 295.5, | |
| "completions/mean_length": 171.8125, | |
| "completions/mean_terminated_length": 171.8125, | |
| "completions/min_length": 101.75, | |
| "completions/min_terminated_length": 101.75, | |
| "epoch": 0.05507205820624038, | |
| "grad_norm": 1.2327221390909047, | |
| "kl": 0.21728515625, | |
| "learning_rate": 5.038223531225742e-07, | |
| "loss": -0.0388, | |
| "num_tokens": 13586065.0, | |
| "reward": 0.5599798411130905, | |
| "reward_std": 0.1288916040211916, | |
| "rewards/code_reward/mean": 0.5599798411130905, | |
| "rewards/code_reward/std": 0.12889160588383675, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 492 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 265.5, | |
| "completions/max_terminated_length": 265.5, | |
| "completions/mean_length": 154.625, | |
| "completions/mean_terminated_length": 154.625, | |
| "completions/min_length": 77.25, | |
| "completions/min_terminated_length": 77.25, | |
| "epoch": 0.05518399328389534, | |
| "grad_norm": 1.7391260110979152, | |
| "kl": 0.26513671875, | |
| "learning_rate": 5.030203103954232e-07, | |
| "loss": -0.1875, | |
| "num_tokens": 13605021.0, | |
| "reward": 0.431189201772213, | |
| "reward_std": 0.26025911793112755, | |
| "rewards/code_reward/mean": 0.431189201772213, | |
| "rewards/code_reward/std": 0.2602591188624501, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 493 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 289.25, | |
| "completions/max_terminated_length": 289.25, | |
| "completions/mean_length": 161.4375, | |
| "completions/mean_terminated_length": 161.4375, | |
| "completions/min_length": 81.0, | |
| "completions/min_terminated_length": 81.0, | |
| "epoch": 0.0552959283615503, | |
| "grad_norm": 1.4124131086208604, | |
| "kl": 0.2646484375, | |
| "learning_rate": 5.023125464358026e-07, | |
| "loss": 0.0666, | |
| "num_tokens": 13624443.0, | |
| "reward": 0.33201567456126213, | |
| "reward_std": 0.02758226078003645, | |
| "rewards/code_reward/mean": 0.33201567456126213, | |
| "rewards/code_reward/std": 0.027582260314375162, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 494 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 228.25, | |
| "completions/max_terminated_length": 228.25, | |
| "completions/mean_length": 148.9375, | |
| "completions/mean_terminated_length": 148.9375, | |
| "completions/min_length": 84.5, | |
| "completions/min_terminated_length": 84.5, | |
| "epoch": 0.055407863439205264, | |
| "grad_norm": 1.6145368535030744, | |
| "kl": 0.35107421875, | |
| "learning_rate": 5.016990909400709e-07, | |
| "loss": -0.0059, | |
| "num_tokens": 13651457.0, | |
| "reward": 0.326155461370945, | |
| "reward_std": 0.1283545382320881, | |
| "rewards/code_reward/mean": 0.326155461370945, | |
| "rewards/code_reward/std": 0.1283545382320881, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 495 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 320.5, | |
| "completions/max_terminated_length": 320.5, | |
| "completions/mean_length": 155.40625, | |
| "completions/mean_terminated_length": 155.40625, | |
| "completions/min_length": 78.75, | |
| "completions/min_terminated_length": 78.75, | |
| "epoch": 0.05551979851686022, | |
| "grad_norm": 1.6838493896543587, | |
| "kl": 0.251220703125, | |
| "learning_rate": 5.011799696475915e-07, | |
| "loss": 0.0376, | |
| "num_tokens": 13676038.0, | |
| "reward": 0.4704548120498657, | |
| "reward_std": 0.23622475564479828, | |
| "rewards/code_reward/mean": 0.4704548120498657, | |
| "rewards/code_reward/std": 0.23622475564479828, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 496 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 291.0, | |
| "completions/max_terminated_length": 291.0, | |
| "completions/mean_length": 189.15625, | |
| "completions/mean_terminated_length": 189.15625, | |
| "completions/min_length": 118.75, | |
| "completions/min_terminated_length": 118.75, | |
| "epoch": 0.05563173359451518, | |
| "grad_norm": 1.7113766777536588, | |
| "kl": 0.2880859375, | |
| "learning_rate": 5.007552043396547e-07, | |
| "loss": -0.0331, | |
| "num_tokens": 13705947.0, | |
| "reward": 0.404338245280087, | |
| "reward_std": 0.2840197389014065, | |
| "rewards/code_reward/mean": 0.404338245280087, | |
| "rewards/code_reward/std": 0.2840197426266968, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 497 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 386.75, | |
| "completions/max_terminated_length": 386.75, | |
| "completions/mean_length": 227.40625, | |
| "completions/mean_terminated_length": 227.40625, | |
| "completions/min_length": 119.25, | |
| "completions/min_terminated_length": 119.25, | |
| "epoch": 0.05574366867217014, | |
| "grad_norm": 1.3321241616316153, | |
| "kl": 0.192626953125, | |
| "learning_rate": 5.004248128385618e-07, | |
| "loss": 0.1036, | |
| "num_tokens": 13729656.0, | |
| "reward": 0.19941096380352974, | |
| "reward_std": 0.1264824215322733, | |
| "rewards/code_reward/mean": 0.19941096380352974, | |
| "rewards/code_reward/std": 0.1264824327081442, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 498 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 302.5, | |
| "completions/max_terminated_length": 302.5, | |
| "completions/mean_length": 159.75, | |
| "completions/mean_terminated_length": 159.75, | |
| "completions/min_length": 73.5, | |
| "completions/min_terminated_length": 73.5, | |
| "epoch": 0.0558556037498251, | |
| "grad_norm": 1.4751264574193441, | |
| "kl": 0.318359375, | |
| "learning_rate": 5.001888090068784e-07, | |
| "loss": -0.0364, | |
| "num_tokens": 13749472.0, | |
| "reward": 0.44989876449108124, | |
| "reward_std": 0.047168461605906487, | |
| "rewards/code_reward/mean": 0.44989876449108124, | |
| "rewards/code_reward/std": 0.04716846067458391, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 499 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 222.0, | |
| "completions/max_terminated_length": 222.0, | |
| "completions/mean_length": 119.34375, | |
| "completions/mean_terminated_length": 119.34375, | |
| "completions/min_length": 56.75, | |
| "completions/min_terminated_length": 56.75, | |
| "epoch": 0.05596753882748006, | |
| "grad_norm": 0.48660836341370794, | |
| "kl": 0.421875, | |
| "learning_rate": 5.000472027468528e-07, | |
| "loss": 0.0205, | |
| "num_tokens": 13768043.0, | |
| "reward": 0.5847536753863096, | |
| "reward_std": 0.04997873678803444, | |
| "rewards/code_reward/mean": 0.5847536753863096, | |
| "rewards/code_reward/std": 0.04997873678803444, | |
| "rewards/format_reward/mean": 0.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.05596753882748006, | |
| "step": 500, | |
| "total_flos": 0.0, | |
| "train_loss": 0.02433196935596061, | |
| "train_runtime": 50427.8575, | |
| "train_samples_per_second": 0.317, | |
| "train_steps_per_second": 0.01 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 500, | |
| "num_input_tokens_seen": 13768043, | |
| "num_train_epochs": 1, | |
| "save_steps": 50, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 8, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |