| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 1.9917808219178084, |
| "eval_steps": 100000.0, |
| "global_step": 364, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.005479452054794521, |
| "grad_norm": 432.0, |
| "learning_rate": 0.0, |
| "loss": 5.7373, |
| "mean_token_accuracy": 0.6561740338802338, |
| "step": 1 |
| }, |
| { |
| "epoch": 0.010958904109589041, |
| "grad_norm": 334.0, |
| "learning_rate": 2.0000000000000003e-06, |
| "loss": 5.8256, |
| "mean_token_accuracy": 0.6489620804786682, |
| "step": 2 |
| }, |
| { |
| "epoch": 0.01643835616438356, |
| "grad_norm": 181.0, |
| "learning_rate": 4.000000000000001e-06, |
| "loss": 5.088, |
| "mean_token_accuracy": 0.6650368273258209, |
| "step": 3 |
| }, |
| { |
| "epoch": 0.021917808219178082, |
| "grad_norm": 75.5, |
| "learning_rate": 6e-06, |
| "loss": 3.994, |
| "mean_token_accuracy": 0.6906364560127258, |
| "step": 4 |
| }, |
| { |
| "epoch": 0.0273972602739726, |
| "grad_norm": 44.75, |
| "learning_rate": 8.000000000000001e-06, |
| "loss": 3.4044, |
| "mean_token_accuracy": 0.7045449316501617, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.03287671232876712, |
| "grad_norm": 28.125, |
| "learning_rate": 1e-05, |
| "loss": 2.8925, |
| "mean_token_accuracy": 0.7223854660987854, |
| "step": 6 |
| }, |
| { |
| "epoch": 0.038356164383561646, |
| "grad_norm": 20.0, |
| "learning_rate": 1.2e-05, |
| "loss": 2.3515, |
| "mean_token_accuracy": 0.7618080377578735, |
| "step": 7 |
| }, |
| { |
| "epoch": 0.043835616438356165, |
| "grad_norm": 19.0, |
| "learning_rate": 1.4e-05, |
| "loss": 2.0269, |
| "mean_token_accuracy": 0.7856993675231934, |
| "step": 8 |
| }, |
| { |
| "epoch": 0.049315068493150684, |
| "grad_norm": 12.8125, |
| "learning_rate": 1.6000000000000003e-05, |
| "loss": 1.7614, |
| "mean_token_accuracy": 0.8198637962341309, |
| "step": 9 |
| }, |
| { |
| "epoch": 0.0547945205479452, |
| "grad_norm": 15.875, |
| "learning_rate": 1.8e-05, |
| "loss": 1.4779, |
| "mean_token_accuracy": 0.8413160443305969, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.06027397260273973, |
| "grad_norm": 8.3125, |
| "learning_rate": 2e-05, |
| "loss": 1.3764, |
| "mean_token_accuracy": 0.8465317487716675, |
| "step": 11 |
| }, |
| { |
| "epoch": 0.06575342465753424, |
| "grad_norm": 13.6875, |
| "learning_rate": 1.9943502824858758e-05, |
| "loss": 1.2839, |
| "mean_token_accuracy": 0.8562129735946655, |
| "step": 12 |
| }, |
| { |
| "epoch": 0.07123287671232877, |
| "grad_norm": 10.6875, |
| "learning_rate": 1.9887005649717518e-05, |
| "loss": 1.186, |
| "mean_token_accuracy": 0.8632737696170807, |
| "step": 13 |
| }, |
| { |
| "epoch": 0.07671232876712329, |
| "grad_norm": 6.46875, |
| "learning_rate": 1.9830508474576275e-05, |
| "loss": 1.2342, |
| "mean_token_accuracy": 0.8569013476371765, |
| "step": 14 |
| }, |
| { |
| "epoch": 0.0821917808219178, |
| "grad_norm": 3.671875, |
| "learning_rate": 1.977401129943503e-05, |
| "loss": 1.1773, |
| "mean_token_accuracy": 0.861639678478241, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.08767123287671233, |
| "grad_norm": 4.1875, |
| "learning_rate": 1.9717514124293785e-05, |
| "loss": 1.0634, |
| "mean_token_accuracy": 0.8734889626502991, |
| "step": 16 |
| }, |
| { |
| "epoch": 0.09315068493150686, |
| "grad_norm": 3.109375, |
| "learning_rate": 1.9661016949152545e-05, |
| "loss": 1.1534, |
| "mean_token_accuracy": 0.8641785085201263, |
| "step": 17 |
| }, |
| { |
| "epoch": 0.09863013698630137, |
| "grad_norm": 2.546875, |
| "learning_rate": 1.96045197740113e-05, |
| "loss": 1.1272, |
| "mean_token_accuracy": 0.8654375374317169, |
| "step": 18 |
| }, |
| { |
| "epoch": 0.10410958904109589, |
| "grad_norm": 2.375, |
| "learning_rate": 1.9548022598870058e-05, |
| "loss": 1.0411, |
| "mean_token_accuracy": 0.8728566467761993, |
| "step": 19 |
| }, |
| { |
| "epoch": 0.1095890410958904, |
| "grad_norm": 2.390625, |
| "learning_rate": 1.9491525423728814e-05, |
| "loss": 1.0268, |
| "mean_token_accuracy": 0.8728775084018707, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.11506849315068493, |
| "grad_norm": 2.515625, |
| "learning_rate": 1.9435028248587574e-05, |
| "loss": 1.0674, |
| "mean_token_accuracy": 0.8690488934516907, |
| "step": 21 |
| }, |
| { |
| "epoch": 0.12054794520547946, |
| "grad_norm": 2.59375, |
| "learning_rate": 1.937853107344633e-05, |
| "loss": 0.9995, |
| "mean_token_accuracy": 0.8791483938694, |
| "step": 22 |
| }, |
| { |
| "epoch": 0.12602739726027398, |
| "grad_norm": 2.328125, |
| "learning_rate": 1.9322033898305087e-05, |
| "loss": 1.0724, |
| "mean_token_accuracy": 0.8647687137126923, |
| "step": 23 |
| }, |
| { |
| "epoch": 0.13150684931506848, |
| "grad_norm": 2.6875, |
| "learning_rate": 1.9265536723163844e-05, |
| "loss": 1.0266, |
| "mean_token_accuracy": 0.8738211989402771, |
| "step": 24 |
| }, |
| { |
| "epoch": 0.136986301369863, |
| "grad_norm": 2.046875, |
| "learning_rate": 1.92090395480226e-05, |
| "loss": 1.0761, |
| "mean_token_accuracy": 0.8665441274642944, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.14246575342465753, |
| "grad_norm": 2.1875, |
| "learning_rate": 1.9152542372881357e-05, |
| "loss": 0.9683, |
| "mean_token_accuracy": 0.8777200281620026, |
| "step": 26 |
| }, |
| { |
| "epoch": 0.14794520547945206, |
| "grad_norm": 1.984375, |
| "learning_rate": 1.9096045197740114e-05, |
| "loss": 0.9868, |
| "mean_token_accuracy": 0.8754189312458038, |
| "step": 27 |
| }, |
| { |
| "epoch": 0.15342465753424658, |
| "grad_norm": 2.140625, |
| "learning_rate": 1.9039548022598874e-05, |
| "loss": 1.0158, |
| "mean_token_accuracy": 0.8761765658855438, |
| "step": 28 |
| }, |
| { |
| "epoch": 0.1589041095890411, |
| "grad_norm": 1.8046875, |
| "learning_rate": 1.898305084745763e-05, |
| "loss": 0.8789, |
| "mean_token_accuracy": 0.8906848430633545, |
| "step": 29 |
| }, |
| { |
| "epoch": 0.1643835616438356, |
| "grad_norm": 1.84375, |
| "learning_rate": 1.8926553672316387e-05, |
| "loss": 0.9747, |
| "mean_token_accuracy": 0.8772170841693878, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.16986301369863013, |
| "grad_norm": 1.9296875, |
| "learning_rate": 1.8870056497175144e-05, |
| "loss": 0.9559, |
| "mean_token_accuracy": 0.879201203584671, |
| "step": 31 |
| }, |
| { |
| "epoch": 0.17534246575342466, |
| "grad_norm": 2.046875, |
| "learning_rate": 1.88135593220339e-05, |
| "loss": 0.9505, |
| "mean_token_accuracy": 0.8790359497070312, |
| "step": 32 |
| }, |
| { |
| "epoch": 0.18082191780821918, |
| "grad_norm": 1.875, |
| "learning_rate": 1.8757062146892657e-05, |
| "loss": 0.9744, |
| "mean_token_accuracy": 0.8767738342285156, |
| "step": 33 |
| }, |
| { |
| "epoch": 0.1863013698630137, |
| "grad_norm": 1.8046875, |
| "learning_rate": 1.8700564971751413e-05, |
| "loss": 0.9881, |
| "mean_token_accuracy": 0.8765853643417358, |
| "step": 34 |
| }, |
| { |
| "epoch": 0.1917808219178082, |
| "grad_norm": 1.78125, |
| "learning_rate": 1.864406779661017e-05, |
| "loss": 0.9423, |
| "mean_token_accuracy": 0.8802583813667297, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.19726027397260273, |
| "grad_norm": 1.9296875, |
| "learning_rate": 1.858757062146893e-05, |
| "loss": 0.9277, |
| "mean_token_accuracy": 0.8835765421390533, |
| "step": 36 |
| }, |
| { |
| "epoch": 0.20273972602739726, |
| "grad_norm": 1.8125, |
| "learning_rate": 1.8531073446327686e-05, |
| "loss": 0.9619, |
| "mean_token_accuracy": 0.8785396814346313, |
| "step": 37 |
| }, |
| { |
| "epoch": 0.20821917808219179, |
| "grad_norm": 1.8046875, |
| "learning_rate": 1.8474576271186443e-05, |
| "loss": 0.9415, |
| "mean_token_accuracy": 0.8792627155780792, |
| "step": 38 |
| }, |
| { |
| "epoch": 0.2136986301369863, |
| "grad_norm": 1.890625, |
| "learning_rate": 1.84180790960452e-05, |
| "loss": 1.1318, |
| "mean_token_accuracy": 0.8636864423751831, |
| "step": 39 |
| }, |
| { |
| "epoch": 0.2191780821917808, |
| "grad_norm": 1.828125, |
| "learning_rate": 1.8361581920903956e-05, |
| "loss": 0.9542, |
| "mean_token_accuracy": 0.8771731555461884, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.22465753424657534, |
| "grad_norm": 1.921875, |
| "learning_rate": 1.8305084745762713e-05, |
| "loss": 0.9122, |
| "mean_token_accuracy": 0.881610095500946, |
| "step": 41 |
| }, |
| { |
| "epoch": 0.23013698630136986, |
| "grad_norm": 1.78125, |
| "learning_rate": 1.824858757062147e-05, |
| "loss": 0.8768, |
| "mean_token_accuracy": 0.8854578733444214, |
| "step": 42 |
| }, |
| { |
| "epoch": 0.2356164383561644, |
| "grad_norm": 1.75, |
| "learning_rate": 1.8192090395480226e-05, |
| "loss": 0.8985, |
| "mean_token_accuracy": 0.8832479119300842, |
| "step": 43 |
| }, |
| { |
| "epoch": 0.2410958904109589, |
| "grad_norm": 1.71875, |
| "learning_rate": 1.8135593220338986e-05, |
| "loss": 0.9174, |
| "mean_token_accuracy": 0.88409024477005, |
| "step": 44 |
| }, |
| { |
| "epoch": 0.2465753424657534, |
| "grad_norm": 1.9375, |
| "learning_rate": 1.8079096045197743e-05, |
| "loss": 0.9281, |
| "mean_token_accuracy": 0.8829980492591858, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.25205479452054796, |
| "grad_norm": 1.796875, |
| "learning_rate": 1.80225988700565e-05, |
| "loss": 0.9484, |
| "mean_token_accuracy": 0.8815726637840271, |
| "step": 46 |
| }, |
| { |
| "epoch": 0.25753424657534246, |
| "grad_norm": 1.796875, |
| "learning_rate": 1.7966101694915256e-05, |
| "loss": 0.9032, |
| "mean_token_accuracy": 0.8823907375335693, |
| "step": 47 |
| }, |
| { |
| "epoch": 0.26301369863013696, |
| "grad_norm": 1.7421875, |
| "learning_rate": 1.7909604519774012e-05, |
| "loss": 0.954, |
| "mean_token_accuracy": 0.8775873780250549, |
| "step": 48 |
| }, |
| { |
| "epoch": 0.2684931506849315, |
| "grad_norm": 1.8515625, |
| "learning_rate": 1.785310734463277e-05, |
| "loss": 0.8923, |
| "mean_token_accuracy": 0.8820536136627197, |
| "step": 49 |
| }, |
| { |
| "epoch": 0.273972602739726, |
| "grad_norm": 1.78125, |
| "learning_rate": 1.7796610169491526e-05, |
| "loss": 0.9235, |
| "mean_token_accuracy": 0.8839817941188812, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.27945205479452057, |
| "grad_norm": 1.796875, |
| "learning_rate": 1.7740112994350286e-05, |
| "loss": 0.9095, |
| "mean_token_accuracy": 0.8809832036495209, |
| "step": 51 |
| }, |
| { |
| "epoch": 0.28493150684931506, |
| "grad_norm": 1.6484375, |
| "learning_rate": 1.7683615819209042e-05, |
| "loss": 0.8763, |
| "mean_token_accuracy": 0.8849304616451263, |
| "step": 52 |
| }, |
| { |
| "epoch": 0.29041095890410956, |
| "grad_norm": 1.78125, |
| "learning_rate": 1.76271186440678e-05, |
| "loss": 0.9113, |
| "mean_token_accuracy": 0.8810350298881531, |
| "step": 53 |
| }, |
| { |
| "epoch": 0.2958904109589041, |
| "grad_norm": 1.8359375, |
| "learning_rate": 1.7570621468926555e-05, |
| "loss": 0.9356, |
| "mean_token_accuracy": 0.8820917904376984, |
| "step": 54 |
| }, |
| { |
| "epoch": 0.3013698630136986, |
| "grad_norm": 1.765625, |
| "learning_rate": 1.7514124293785312e-05, |
| "loss": 0.9315, |
| "mean_token_accuracy": 0.8792105317115784, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.30684931506849317, |
| "grad_norm": 1.796875, |
| "learning_rate": 1.745762711864407e-05, |
| "loss": 0.9129, |
| "mean_token_accuracy": 0.884558379650116, |
| "step": 56 |
| }, |
| { |
| "epoch": 0.31232876712328766, |
| "grad_norm": 1.6953125, |
| "learning_rate": 1.7401129943502825e-05, |
| "loss": 0.9175, |
| "mean_token_accuracy": 0.8841174840927124, |
| "step": 57 |
| }, |
| { |
| "epoch": 0.3178082191780822, |
| "grad_norm": 1.75, |
| "learning_rate": 1.734463276836158e-05, |
| "loss": 0.9441, |
| "mean_token_accuracy": 0.8799735009670258, |
| "step": 58 |
| }, |
| { |
| "epoch": 0.3232876712328767, |
| "grad_norm": 1.671875, |
| "learning_rate": 1.728813559322034e-05, |
| "loss": 0.8381, |
| "mean_token_accuracy": 0.8915592730045319, |
| "step": 59 |
| }, |
| { |
| "epoch": 0.3287671232876712, |
| "grad_norm": 1.75, |
| "learning_rate": 1.7231638418079098e-05, |
| "loss": 0.878, |
| "mean_token_accuracy": 0.8879745006561279, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.33424657534246577, |
| "grad_norm": 1.796875, |
| "learning_rate": 1.7175141242937855e-05, |
| "loss": 0.894, |
| "mean_token_accuracy": 0.8848598599433899, |
| "step": 61 |
| }, |
| { |
| "epoch": 0.33972602739726027, |
| "grad_norm": 1.8125, |
| "learning_rate": 1.711864406779661e-05, |
| "loss": 0.9141, |
| "mean_token_accuracy": 0.882304459810257, |
| "step": 62 |
| }, |
| { |
| "epoch": 0.3452054794520548, |
| "grad_norm": 1.7265625, |
| "learning_rate": 1.7062146892655368e-05, |
| "loss": 0.9312, |
| "mean_token_accuracy": 0.8793206214904785, |
| "step": 63 |
| }, |
| { |
| "epoch": 0.3506849315068493, |
| "grad_norm": 1.6953125, |
| "learning_rate": 1.7005649717514125e-05, |
| "loss": 0.8844, |
| "mean_token_accuracy": 0.8880196511745453, |
| "step": 64 |
| }, |
| { |
| "epoch": 0.3561643835616438, |
| "grad_norm": 2.125, |
| "learning_rate": 1.694915254237288e-05, |
| "loss": 0.7884, |
| "mean_token_accuracy": 0.9000090658664703, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.36164383561643837, |
| "grad_norm": 1.8359375, |
| "learning_rate": 1.689265536723164e-05, |
| "loss": 0.856, |
| "mean_token_accuracy": 0.8874339759349823, |
| "step": 66 |
| }, |
| { |
| "epoch": 0.36712328767123287, |
| "grad_norm": 1.9921875, |
| "learning_rate": 1.6836158192090398e-05, |
| "loss": 0.9166, |
| "mean_token_accuracy": 0.8831603229045868, |
| "step": 67 |
| }, |
| { |
| "epoch": 0.3726027397260274, |
| "grad_norm": 1.8359375, |
| "learning_rate": 1.6779661016949154e-05, |
| "loss": 0.9728, |
| "mean_token_accuracy": 0.8779590725898743, |
| "step": 68 |
| }, |
| { |
| "epoch": 0.3780821917808219, |
| "grad_norm": 1.8203125, |
| "learning_rate": 1.672316384180791e-05, |
| "loss": 0.9378, |
| "mean_token_accuracy": 0.8816089332103729, |
| "step": 69 |
| }, |
| { |
| "epoch": 0.3835616438356164, |
| "grad_norm": 1.796875, |
| "learning_rate": 1.6666666666666667e-05, |
| "loss": 0.9222, |
| "mean_token_accuracy": 0.8854541778564453, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.38904109589041097, |
| "grad_norm": 1.828125, |
| "learning_rate": 1.6610169491525424e-05, |
| "loss": 0.9116, |
| "mean_token_accuracy": 0.8790720105171204, |
| "step": 71 |
| }, |
| { |
| "epoch": 0.39452054794520547, |
| "grad_norm": 1.7890625, |
| "learning_rate": 1.655367231638418e-05, |
| "loss": 0.8784, |
| "mean_token_accuracy": 0.8863929808139801, |
| "step": 72 |
| }, |
| { |
| "epoch": 0.4, |
| "grad_norm": 1.7578125, |
| "learning_rate": 1.6497175141242937e-05, |
| "loss": 0.8976, |
| "mean_token_accuracy": 0.8833265900611877, |
| "step": 73 |
| }, |
| { |
| "epoch": 0.4054794520547945, |
| "grad_norm": 1.7421875, |
| "learning_rate": 1.6440677966101697e-05, |
| "loss": 0.8872, |
| "mean_token_accuracy": 0.8866287767887115, |
| "step": 74 |
| }, |
| { |
| "epoch": 0.410958904109589, |
| "grad_norm": 1.75, |
| "learning_rate": 1.6384180790960454e-05, |
| "loss": 0.9294, |
| "mean_token_accuracy": 0.8842478394508362, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.41643835616438357, |
| "grad_norm": 1.6953125, |
| "learning_rate": 1.632768361581921e-05, |
| "loss": 0.8754, |
| "mean_token_accuracy": 0.887272983789444, |
| "step": 76 |
| }, |
| { |
| "epoch": 0.42191780821917807, |
| "grad_norm": 1.6953125, |
| "learning_rate": 1.6271186440677967e-05, |
| "loss": 0.8779, |
| "mean_token_accuracy": 0.8887339234352112, |
| "step": 77 |
| }, |
| { |
| "epoch": 0.4273972602739726, |
| "grad_norm": 1.7265625, |
| "learning_rate": 1.6214689265536724e-05, |
| "loss": 0.8863, |
| "mean_token_accuracy": 0.8841427862644196, |
| "step": 78 |
| }, |
| { |
| "epoch": 0.4328767123287671, |
| "grad_norm": 1.671875, |
| "learning_rate": 1.615819209039548e-05, |
| "loss": 0.8632, |
| "mean_token_accuracy": 0.8868177235126495, |
| "step": 79 |
| }, |
| { |
| "epoch": 0.4383561643835616, |
| "grad_norm": 1.8515625, |
| "learning_rate": 1.6101694915254237e-05, |
| "loss": 0.9242, |
| "mean_token_accuracy": 0.8839016258716583, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.4438356164383562, |
| "grad_norm": 1.796875, |
| "learning_rate": 1.6045197740112997e-05, |
| "loss": 0.908, |
| "mean_token_accuracy": 0.8833514153957367, |
| "step": 81 |
| }, |
| { |
| "epoch": 0.44931506849315067, |
| "grad_norm": 1.796875, |
| "learning_rate": 1.5988700564971753e-05, |
| "loss": 0.8747, |
| "mean_token_accuracy": 0.8873944878578186, |
| "step": 82 |
| }, |
| { |
| "epoch": 0.4547945205479452, |
| "grad_norm": 1.75, |
| "learning_rate": 1.593220338983051e-05, |
| "loss": 0.9303, |
| "mean_token_accuracy": 0.8823438286781311, |
| "step": 83 |
| }, |
| { |
| "epoch": 0.4602739726027397, |
| "grad_norm": 1.6953125, |
| "learning_rate": 1.5875706214689266e-05, |
| "loss": 0.8441, |
| "mean_token_accuracy": 0.888810396194458, |
| "step": 84 |
| }, |
| { |
| "epoch": 0.4657534246575342, |
| "grad_norm": 1.734375, |
| "learning_rate": 1.5819209039548023e-05, |
| "loss": 0.8512, |
| "mean_token_accuracy": 0.8866813480854034, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.4712328767123288, |
| "grad_norm": 1.7109375, |
| "learning_rate": 1.576271186440678e-05, |
| "loss": 0.8743, |
| "mean_token_accuracy": 0.8884658217430115, |
| "step": 86 |
| }, |
| { |
| "epoch": 0.4767123287671233, |
| "grad_norm": 1.703125, |
| "learning_rate": 1.5706214689265536e-05, |
| "loss": 0.8523, |
| "mean_token_accuracy": 0.8898887932300568, |
| "step": 87 |
| }, |
| { |
| "epoch": 0.4821917808219178, |
| "grad_norm": 1.734375, |
| "learning_rate": 1.5649717514124293e-05, |
| "loss": 0.8947, |
| "mean_token_accuracy": 0.8856352865695953, |
| "step": 88 |
| }, |
| { |
| "epoch": 0.4876712328767123, |
| "grad_norm": 1.734375, |
| "learning_rate": 1.5593220338983053e-05, |
| "loss": 0.8895, |
| "mean_token_accuracy": 0.8845047950744629, |
| "step": 89 |
| }, |
| { |
| "epoch": 0.4931506849315068, |
| "grad_norm": 1.7109375, |
| "learning_rate": 1.553672316384181e-05, |
| "loss": 0.8215, |
| "mean_token_accuracy": 0.8922451138496399, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.4986301369863014, |
| "grad_norm": 1.765625, |
| "learning_rate": 1.5480225988700566e-05, |
| "loss": 0.8677, |
| "mean_token_accuracy": 0.8868069648742676, |
| "step": 91 |
| }, |
| { |
| "epoch": 0.5041095890410959, |
| "grad_norm": 1.921875, |
| "learning_rate": 1.5423728813559326e-05, |
| "loss": 0.8843, |
| "mean_token_accuracy": 0.8863621056079865, |
| "step": 92 |
| }, |
| { |
| "epoch": 0.5095890410958904, |
| "grad_norm": 1.828125, |
| "learning_rate": 1.536723163841808e-05, |
| "loss": 0.8593, |
| "mean_token_accuracy": 0.8890643417835236, |
| "step": 93 |
| }, |
| { |
| "epoch": 0.5150684931506849, |
| "grad_norm": 1.7734375, |
| "learning_rate": 1.5310734463276836e-05, |
| "loss": 0.8961, |
| "mean_token_accuracy": 0.885326474905014, |
| "step": 94 |
| }, |
| { |
| "epoch": 0.5205479452054794, |
| "grad_norm": 1.65625, |
| "learning_rate": 1.5254237288135594e-05, |
| "loss": 0.8115, |
| "mean_token_accuracy": 0.8935891091823578, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.5260273972602739, |
| "grad_norm": 1.71875, |
| "learning_rate": 1.5197740112994352e-05, |
| "loss": 0.8157, |
| "mean_token_accuracy": 0.8906770646572113, |
| "step": 96 |
| }, |
| { |
| "epoch": 0.5315068493150685, |
| "grad_norm": 1.703125, |
| "learning_rate": 1.5141242937853109e-05, |
| "loss": 0.9447, |
| "mean_token_accuracy": 0.8807980120182037, |
| "step": 97 |
| }, |
| { |
| "epoch": 0.536986301369863, |
| "grad_norm": 1.6484375, |
| "learning_rate": 1.5084745762711865e-05, |
| "loss": 0.8652, |
| "mean_token_accuracy": 0.886509358882904, |
| "step": 98 |
| }, |
| { |
| "epoch": 0.5424657534246575, |
| "grad_norm": 1.7578125, |
| "learning_rate": 1.5028248587570622e-05, |
| "loss": 0.8307, |
| "mean_token_accuracy": 0.8903360962867737, |
| "step": 99 |
| }, |
| { |
| "epoch": 0.547945205479452, |
| "grad_norm": 6.25, |
| "learning_rate": 1.497175141242938e-05, |
| "loss": 0.918, |
| "mean_token_accuracy": 0.8805244266986847, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.5534246575342465, |
| "grad_norm": 1.75, |
| "learning_rate": 1.4915254237288137e-05, |
| "loss": 0.8078, |
| "mean_token_accuracy": 0.8928936421871185, |
| "step": 101 |
| }, |
| { |
| "epoch": 0.5589041095890411, |
| "grad_norm": 1.7109375, |
| "learning_rate": 1.4858757062146894e-05, |
| "loss": 0.8592, |
| "mean_token_accuracy": 0.889348953962326, |
| "step": 102 |
| }, |
| { |
| "epoch": 0.5643835616438356, |
| "grad_norm": 1.6953125, |
| "learning_rate": 1.480225988700565e-05, |
| "loss": 0.7756, |
| "mean_token_accuracy": 0.8963395059108734, |
| "step": 103 |
| }, |
| { |
| "epoch": 0.5698630136986301, |
| "grad_norm": 1.7890625, |
| "learning_rate": 1.4745762711864408e-05, |
| "loss": 0.8789, |
| "mean_token_accuracy": 0.886920839548111, |
| "step": 104 |
| }, |
| { |
| "epoch": 0.5753424657534246, |
| "grad_norm": 1.9140625, |
| "learning_rate": 1.4689265536723165e-05, |
| "loss": 0.8967, |
| "mean_token_accuracy": 0.8798324763774872, |
| "step": 105 |
| }, |
| { |
| "epoch": 0.5808219178082191, |
| "grad_norm": 1.6953125, |
| "learning_rate": 1.4632768361581922e-05, |
| "loss": 0.843, |
| "mean_token_accuracy": 0.8907739818096161, |
| "step": 106 |
| }, |
| { |
| "epoch": 0.5863013698630137, |
| "grad_norm": 1.78125, |
| "learning_rate": 1.4576271186440678e-05, |
| "loss": 0.8541, |
| "mean_token_accuracy": 0.8892745971679688, |
| "step": 107 |
| }, |
| { |
| "epoch": 0.5917808219178082, |
| "grad_norm": 1.8046875, |
| "learning_rate": 1.4519774011299436e-05, |
| "loss": 0.8462, |
| "mean_token_accuracy": 0.8868447542190552, |
| "step": 108 |
| }, |
| { |
| "epoch": 0.5972602739726027, |
| "grad_norm": 1.703125, |
| "learning_rate": 1.4463276836158193e-05, |
| "loss": 0.8057, |
| "mean_token_accuracy": 0.8942738175392151, |
| "step": 109 |
| }, |
| { |
| "epoch": 0.6027397260273972, |
| "grad_norm": 1.78125, |
| "learning_rate": 1.440677966101695e-05, |
| "loss": 0.9267, |
| "mean_token_accuracy": 0.879928857088089, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.6082191780821918, |
| "grad_norm": 1.8046875, |
| "learning_rate": 1.4350282485875708e-05, |
| "loss": 0.8554, |
| "mean_token_accuracy": 0.8891916871070862, |
| "step": 111 |
| }, |
| { |
| "epoch": 0.6136986301369863, |
| "grad_norm": 1.7265625, |
| "learning_rate": 1.4293785310734465e-05, |
| "loss": 0.8786, |
| "mean_token_accuracy": 0.8868878483772278, |
| "step": 112 |
| }, |
| { |
| "epoch": 0.6191780821917808, |
| "grad_norm": 1.765625, |
| "learning_rate": 1.4237288135593221e-05, |
| "loss": 0.8238, |
| "mean_token_accuracy": 0.8923602402210236, |
| "step": 113 |
| }, |
| { |
| "epoch": 0.6246575342465753, |
| "grad_norm": 1.65625, |
| "learning_rate": 1.4180790960451978e-05, |
| "loss": 0.7824, |
| "mean_token_accuracy": 0.898999810218811, |
| "step": 114 |
| }, |
| { |
| "epoch": 0.6301369863013698, |
| "grad_norm": 1.8359375, |
| "learning_rate": 1.4124293785310736e-05, |
| "loss": 0.8753, |
| "mean_token_accuracy": 0.8899329602718353, |
| "step": 115 |
| }, |
| { |
| "epoch": 0.6356164383561644, |
| "grad_norm": 1.7578125, |
| "learning_rate": 1.4067796610169493e-05, |
| "loss": 0.8015, |
| "mean_token_accuracy": 0.8944090604782104, |
| "step": 116 |
| }, |
| { |
| "epoch": 0.6410958904109589, |
| "grad_norm": 1.7265625, |
| "learning_rate": 1.4011299435028249e-05, |
| "loss": 0.8078, |
| "mean_token_accuracy": 0.8907613754272461, |
| "step": 117 |
| }, |
| { |
| "epoch": 0.6465753424657534, |
| "grad_norm": 1.84375, |
| "learning_rate": 1.3954802259887006e-05, |
| "loss": 0.8241, |
| "mean_token_accuracy": 0.8921016752719879, |
| "step": 118 |
| }, |
| { |
| "epoch": 0.6520547945205479, |
| "grad_norm": 1.7109375, |
| "learning_rate": 1.3898305084745764e-05, |
| "loss": 0.8554, |
| "mean_token_accuracy": 0.8893947899341583, |
| "step": 119 |
| }, |
| { |
| "epoch": 0.6575342465753424, |
| "grad_norm": 1.7265625, |
| "learning_rate": 1.384180790960452e-05, |
| "loss": 0.7452, |
| "mean_token_accuracy": 0.9025129973888397, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.663013698630137, |
| "grad_norm": 1.8125, |
| "learning_rate": 1.3785310734463277e-05, |
| "loss": 0.8917, |
| "mean_token_accuracy": 0.8841381669044495, |
| "step": 121 |
| }, |
| { |
| "epoch": 0.6684931506849315, |
| "grad_norm": 1.6953125, |
| "learning_rate": 1.3728813559322034e-05, |
| "loss": 0.7997, |
| "mean_token_accuracy": 0.8949976563453674, |
| "step": 122 |
| }, |
| { |
| "epoch": 0.673972602739726, |
| "grad_norm": 1.65625, |
| "learning_rate": 1.3672316384180792e-05, |
| "loss": 0.77, |
| "mean_token_accuracy": 0.8981111347675323, |
| "step": 123 |
| }, |
| { |
| "epoch": 0.6794520547945205, |
| "grad_norm": 1.8828125, |
| "learning_rate": 1.3615819209039549e-05, |
| "loss": 0.8735, |
| "mean_token_accuracy": 0.88605797290802, |
| "step": 124 |
| }, |
| { |
| "epoch": 0.684931506849315, |
| "grad_norm": 1.7890625, |
| "learning_rate": 1.3559322033898305e-05, |
| "loss": 0.8578, |
| "mean_token_accuracy": 0.8875480592250824, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.6904109589041096, |
| "grad_norm": 1.7890625, |
| "learning_rate": 1.3502824858757064e-05, |
| "loss": 0.842, |
| "mean_token_accuracy": 0.8861750662326813, |
| "step": 126 |
| }, |
| { |
| "epoch": 0.6958904109589041, |
| "grad_norm": 1.75, |
| "learning_rate": 1.344632768361582e-05, |
| "loss": 0.7661, |
| "mean_token_accuracy": 0.8968884646892548, |
| "step": 127 |
| }, |
| { |
| "epoch": 0.7013698630136986, |
| "grad_norm": 1.8203125, |
| "learning_rate": 1.3389830508474577e-05, |
| "loss": 0.8218, |
| "mean_token_accuracy": 0.89415243268013, |
| "step": 128 |
| }, |
| { |
| "epoch": 0.7068493150684931, |
| "grad_norm": 1.8046875, |
| "learning_rate": 1.3333333333333333e-05, |
| "loss": 0.84, |
| "mean_token_accuracy": 0.8905225694179535, |
| "step": 129 |
| }, |
| { |
| "epoch": 0.7123287671232876, |
| "grad_norm": 1.75, |
| "learning_rate": 1.3276836158192092e-05, |
| "loss": 0.7886, |
| "mean_token_accuracy": 0.8948017656803131, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.7178082191780822, |
| "grad_norm": 1.7265625, |
| "learning_rate": 1.3220338983050848e-05, |
| "loss": 0.7962, |
| "mean_token_accuracy": 0.8944378197193146, |
| "step": 131 |
| }, |
| { |
| "epoch": 0.7232876712328767, |
| "grad_norm": 1.6875, |
| "learning_rate": 1.3163841807909605e-05, |
| "loss": 0.7744, |
| "mean_token_accuracy": 0.8996008336544037, |
| "step": 132 |
| }, |
| { |
| "epoch": 0.7287671232876712, |
| "grad_norm": 1.8984375, |
| "learning_rate": 1.3107344632768361e-05, |
| "loss": 0.8511, |
| "mean_token_accuracy": 0.8891786336898804, |
| "step": 133 |
| }, |
| { |
| "epoch": 0.7342465753424657, |
| "grad_norm": 1.7265625, |
| "learning_rate": 1.305084745762712e-05, |
| "loss": 0.7581, |
| "mean_token_accuracy": 0.8997050821781158, |
| "step": 134 |
| }, |
| { |
| "epoch": 0.7397260273972602, |
| "grad_norm": 1.7265625, |
| "learning_rate": 1.2994350282485876e-05, |
| "loss": 0.8179, |
| "mean_token_accuracy": 0.889685183763504, |
| "step": 135 |
| }, |
| { |
| "epoch": 0.7452054794520548, |
| "grad_norm": 1.71875, |
| "learning_rate": 1.2937853107344633e-05, |
| "loss": 0.7822, |
| "mean_token_accuracy": 0.8955155909061432, |
| "step": 136 |
| }, |
| { |
| "epoch": 0.7506849315068493, |
| "grad_norm": 1.7890625, |
| "learning_rate": 1.288135593220339e-05, |
| "loss": 0.8004, |
| "mean_token_accuracy": 0.8967688679695129, |
| "step": 137 |
| }, |
| { |
| "epoch": 0.7561643835616438, |
| "grad_norm": 1.71875, |
| "learning_rate": 1.282485875706215e-05, |
| "loss": 0.7105, |
| "mean_token_accuracy": 0.903778463602066, |
| "step": 138 |
| }, |
| { |
| "epoch": 0.7616438356164383, |
| "grad_norm": 1.9609375, |
| "learning_rate": 1.2768361581920904e-05, |
| "loss": 0.8719, |
| "mean_token_accuracy": 0.8871429562568665, |
| "step": 139 |
| }, |
| { |
| "epoch": 0.7671232876712328, |
| "grad_norm": 1.8203125, |
| "learning_rate": 1.2711864406779661e-05, |
| "loss": 0.8294, |
| "mean_token_accuracy": 0.8909505307674408, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.7726027397260274, |
| "grad_norm": 1.7265625, |
| "learning_rate": 1.265536723163842e-05, |
| "loss": 0.8538, |
| "mean_token_accuracy": 0.8904447853565216, |
| "step": 141 |
| }, |
| { |
| "epoch": 0.7780821917808219, |
| "grad_norm": 1.75, |
| "learning_rate": 1.2598870056497177e-05, |
| "loss": 0.7883, |
| "mean_token_accuracy": 0.8976991772651672, |
| "step": 142 |
| }, |
| { |
| "epoch": 0.7835616438356164, |
| "grad_norm": 1.6953125, |
| "learning_rate": 1.2542372881355932e-05, |
| "loss": 0.8273, |
| "mean_token_accuracy": 0.8918053209781647, |
| "step": 143 |
| }, |
| { |
| "epoch": 0.7890410958904109, |
| "grad_norm": 1.765625, |
| "learning_rate": 1.2485875706214689e-05, |
| "loss": 0.8064, |
| "mean_token_accuracy": 0.8940227329730988, |
| "step": 144 |
| }, |
| { |
| "epoch": 0.7945205479452054, |
| "grad_norm": 1.7109375, |
| "learning_rate": 1.2429378531073449e-05, |
| "loss": 0.8038, |
| "mean_token_accuracy": 0.893746942281723, |
| "step": 145 |
| }, |
| { |
| "epoch": 0.8, |
| "grad_norm": 1.6953125, |
| "learning_rate": 1.2372881355932205e-05, |
| "loss": 0.7326, |
| "mean_token_accuracy": 0.9016251862049103, |
| "step": 146 |
| }, |
| { |
| "epoch": 0.8054794520547945, |
| "grad_norm": 1.765625, |
| "learning_rate": 1.2316384180790962e-05, |
| "loss": 0.8565, |
| "mean_token_accuracy": 0.8860533237457275, |
| "step": 147 |
| }, |
| { |
| "epoch": 0.810958904109589, |
| "grad_norm": 1.7109375, |
| "learning_rate": 1.2259887005649717e-05, |
| "loss": 0.814, |
| "mean_token_accuracy": 0.8916102349758148, |
| "step": 148 |
| }, |
| { |
| "epoch": 0.8164383561643835, |
| "grad_norm": 1.734375, |
| "learning_rate": 1.2203389830508477e-05, |
| "loss": 0.826, |
| "mean_token_accuracy": 0.8912359774112701, |
| "step": 149 |
| }, |
| { |
| "epoch": 0.821917808219178, |
| "grad_norm": 1.7890625, |
| "learning_rate": 1.2146892655367234e-05, |
| "loss": 0.7332, |
| "mean_token_accuracy": 0.901479035615921, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.8273972602739726, |
| "grad_norm": 1.7890625, |
| "learning_rate": 1.209039548022599e-05, |
| "loss": 0.8137, |
| "mean_token_accuracy": 0.8948657810688019, |
| "step": 151 |
| }, |
| { |
| "epoch": 0.8328767123287671, |
| "grad_norm": 1.78125, |
| "learning_rate": 1.2033898305084745e-05, |
| "loss": 0.7541, |
| "mean_token_accuracy": 0.8997522294521332, |
| "step": 152 |
| }, |
| { |
| "epoch": 0.8383561643835616, |
| "grad_norm": 1.796875, |
| "learning_rate": 1.1977401129943505e-05, |
| "loss": 0.7506, |
| "mean_token_accuracy": 0.9004031419754028, |
| "step": 153 |
| }, |
| { |
| "epoch": 0.8438356164383561, |
| "grad_norm": 1.9140625, |
| "learning_rate": 1.1920903954802262e-05, |
| "loss": 0.8034, |
| "mean_token_accuracy": 0.8948807418346405, |
| "step": 154 |
| }, |
| { |
| "epoch": 0.8493150684931506, |
| "grad_norm": 1.7109375, |
| "learning_rate": 1.1864406779661018e-05, |
| "loss": 0.8141, |
| "mean_token_accuracy": 0.8910720944404602, |
| "step": 155 |
| }, |
| { |
| "epoch": 0.8547945205479452, |
| "grad_norm": 1.8828125, |
| "learning_rate": 1.1807909604519776e-05, |
| "loss": 0.8654, |
| "mean_token_accuracy": 0.8876812160015106, |
| "step": 156 |
| }, |
| { |
| "epoch": 0.8602739726027397, |
| "grad_norm": 1.7109375, |
| "learning_rate": 1.1751412429378533e-05, |
| "loss": 0.7872, |
| "mean_token_accuracy": 0.8944331705570221, |
| "step": 157 |
| }, |
| { |
| "epoch": 0.8657534246575342, |
| "grad_norm": 1.7265625, |
| "learning_rate": 1.169491525423729e-05, |
| "loss": 0.8064, |
| "mean_token_accuracy": 0.8923972845077515, |
| "step": 158 |
| }, |
| { |
| "epoch": 0.8712328767123287, |
| "grad_norm": 1.796875, |
| "learning_rate": 1.1638418079096046e-05, |
| "loss": 0.8012, |
| "mean_token_accuracy": 0.8913676142692566, |
| "step": 159 |
| }, |
| { |
| "epoch": 0.8767123287671232, |
| "grad_norm": 1.796875, |
| "learning_rate": 1.1581920903954804e-05, |
| "loss": 0.7828, |
| "mean_token_accuracy": 0.8984484672546387, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.8821917808219178, |
| "grad_norm": 1.6953125, |
| "learning_rate": 1.1525423728813561e-05, |
| "loss": 0.7947, |
| "mean_token_accuracy": 0.8937750458717346, |
| "step": 161 |
| }, |
| { |
| "epoch": 0.8876712328767123, |
| "grad_norm": 1.875, |
| "learning_rate": 1.1468926553672318e-05, |
| "loss": 0.8615, |
| "mean_token_accuracy": 0.8844136893749237, |
| "step": 162 |
| }, |
| { |
| "epoch": 0.8931506849315068, |
| "grad_norm": 1.765625, |
| "learning_rate": 1.1412429378531074e-05, |
| "loss": 0.7851, |
| "mean_token_accuracy": 0.8954149484634399, |
| "step": 163 |
| }, |
| { |
| "epoch": 0.8986301369863013, |
| "grad_norm": 1.859375, |
| "learning_rate": 1.1355932203389833e-05, |
| "loss": 0.8646, |
| "mean_token_accuracy": 0.8902953565120697, |
| "step": 164 |
| }, |
| { |
| "epoch": 0.9041095890410958, |
| "grad_norm": 1.9140625, |
| "learning_rate": 1.1299435028248589e-05, |
| "loss": 0.8723, |
| "mean_token_accuracy": 0.8843137621879578, |
| "step": 165 |
| }, |
| { |
| "epoch": 0.9095890410958904, |
| "grad_norm": 1.7421875, |
| "learning_rate": 1.1242937853107346e-05, |
| "loss": 0.8073, |
| "mean_token_accuracy": 0.8933804333209991, |
| "step": 166 |
| }, |
| { |
| "epoch": 0.915068493150685, |
| "grad_norm": 1.7734375, |
| "learning_rate": 1.1186440677966102e-05, |
| "loss": 0.8571, |
| "mean_token_accuracy": 0.8872124254703522, |
| "step": 167 |
| }, |
| { |
| "epoch": 0.9205479452054794, |
| "grad_norm": 1.6953125, |
| "learning_rate": 1.112994350282486e-05, |
| "loss": 0.7582, |
| "mean_token_accuracy": 0.8991103768348694, |
| "step": 168 |
| }, |
| { |
| "epoch": 0.9260273972602739, |
| "grad_norm": 1.875, |
| "learning_rate": 1.1073446327683617e-05, |
| "loss": 0.7952, |
| "mean_token_accuracy": 0.8964874744415283, |
| "step": 169 |
| }, |
| { |
| "epoch": 0.9315068493150684, |
| "grad_norm": 1.6875, |
| "learning_rate": 1.1016949152542374e-05, |
| "loss": 0.7878, |
| "mean_token_accuracy": 0.894893229007721, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.936986301369863, |
| "grad_norm": 1.828125, |
| "learning_rate": 1.096045197740113e-05, |
| "loss": 0.8082, |
| "mean_token_accuracy": 0.8924291729927063, |
| "step": 171 |
| }, |
| { |
| "epoch": 0.9424657534246575, |
| "grad_norm": 1.828125, |
| "learning_rate": 1.0903954802259889e-05, |
| "loss": 0.8655, |
| "mean_token_accuracy": 0.8870832324028015, |
| "step": 172 |
| }, |
| { |
| "epoch": 0.947945205479452, |
| "grad_norm": 1.9296875, |
| "learning_rate": 1.0847457627118645e-05, |
| "loss": 0.7696, |
| "mean_token_accuracy": 0.8987171053886414, |
| "step": 173 |
| }, |
| { |
| "epoch": 0.9534246575342465, |
| "grad_norm": 1.8984375, |
| "learning_rate": 1.0790960451977402e-05, |
| "loss": 0.7408, |
| "mean_token_accuracy": 0.9013040661811829, |
| "step": 174 |
| }, |
| { |
| "epoch": 0.958904109589041, |
| "grad_norm": 1.953125, |
| "learning_rate": 1.073446327683616e-05, |
| "loss": 0.8175, |
| "mean_token_accuracy": 0.8909050524234772, |
| "step": 175 |
| }, |
| { |
| "epoch": 0.9643835616438357, |
| "grad_norm": 1.7421875, |
| "learning_rate": 1.0677966101694917e-05, |
| "loss": 0.7626, |
| "mean_token_accuracy": 0.895355612039566, |
| "step": 176 |
| }, |
| { |
| "epoch": 0.9698630136986301, |
| "grad_norm": 2.453125, |
| "learning_rate": 1.0621468926553673e-05, |
| "loss": 0.8223, |
| "mean_token_accuracy": 0.8929450511932373, |
| "step": 177 |
| }, |
| { |
| "epoch": 0.9753424657534246, |
| "grad_norm": 1.9453125, |
| "learning_rate": 1.056497175141243e-05, |
| "loss": 0.8457, |
| "mean_token_accuracy": 0.8877719342708588, |
| "step": 178 |
| }, |
| { |
| "epoch": 0.9808219178082191, |
| "grad_norm": 1.7890625, |
| "learning_rate": 1.0508474576271188e-05, |
| "loss": 0.8065, |
| "mean_token_accuracy": 0.8943466544151306, |
| "step": 179 |
| }, |
| { |
| "epoch": 0.9863013698630136, |
| "grad_norm": 1.765625, |
| "learning_rate": 1.0451977401129945e-05, |
| "loss": 0.7862, |
| "mean_token_accuracy": 0.8981420993804932, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.9917808219178083, |
| "grad_norm": 1.6796875, |
| "learning_rate": 1.0395480225988701e-05, |
| "loss": 0.7166, |
| "mean_token_accuracy": 0.9056210517883301, |
| "step": 181 |
| }, |
| { |
| "epoch": 0.9972602739726028, |
| "grad_norm": 1.7734375, |
| "learning_rate": 1.0338983050847458e-05, |
| "loss": 0.716, |
| "mean_token_accuracy": 0.9040741622447968, |
| "step": 182 |
| }, |
| { |
| "epoch": 1.0, |
| "grad_norm": 1.203125, |
| "learning_rate": 1.0282485875706216e-05, |
| "loss": 0.366, |
| "mean_token_accuracy": 0.8993819952011108, |
| "step": 183 |
| }, |
| { |
| "epoch": 1.0054794520547945, |
| "grad_norm": 1.96875, |
| "learning_rate": 1.0225988700564973e-05, |
| "loss": 0.6589, |
| "mean_token_accuracy": 0.9100659489631653, |
| "step": 184 |
| }, |
| { |
| "epoch": 1.010958904109589, |
| "grad_norm": 1.921875, |
| "learning_rate": 1.016949152542373e-05, |
| "loss": 0.7632, |
| "mean_token_accuracy": 0.8995705246925354, |
| "step": 185 |
| }, |
| { |
| "epoch": 1.0164383561643835, |
| "grad_norm": 1.7890625, |
| "learning_rate": 1.0112994350282486e-05, |
| "loss": 0.7087, |
| "mean_token_accuracy": 0.9034123718738556, |
| "step": 186 |
| }, |
| { |
| "epoch": 1.021917808219178, |
| "grad_norm": 1.8125, |
| "learning_rate": 1.0056497175141244e-05, |
| "loss": 0.7332, |
| "mean_token_accuracy": 0.8997578024864197, |
| "step": 187 |
| }, |
| { |
| "epoch": 1.0273972602739727, |
| "grad_norm": 1.96875, |
| "learning_rate": 1e-05, |
| "loss": 0.7192, |
| "mean_token_accuracy": 0.9052003026008606, |
| "step": 188 |
| }, |
| { |
| "epoch": 1.0328767123287672, |
| "grad_norm": 1.8828125, |
| "learning_rate": 9.943502824858759e-06, |
| "loss": 0.6996, |
| "mean_token_accuracy": 0.9051499664783478, |
| "step": 189 |
| }, |
| { |
| "epoch": 1.0383561643835617, |
| "grad_norm": 1.9296875, |
| "learning_rate": 9.887005649717516e-06, |
| "loss": 0.6603, |
| "mean_token_accuracy": 0.9084216058254242, |
| "step": 190 |
| }, |
| { |
| "epoch": 1.0438356164383562, |
| "grad_norm": 2.09375, |
| "learning_rate": 9.830508474576272e-06, |
| "loss": 0.6934, |
| "mean_token_accuracy": 0.905331015586853, |
| "step": 191 |
| }, |
| { |
| "epoch": 1.0493150684931507, |
| "grad_norm": 1.9609375, |
| "learning_rate": 9.774011299435029e-06, |
| "loss": 0.6852, |
| "mean_token_accuracy": 0.907427966594696, |
| "step": 192 |
| }, |
| { |
| "epoch": 1.0547945205479452, |
| "grad_norm": 1.8671875, |
| "learning_rate": 9.717514124293787e-06, |
| "loss": 0.6364, |
| "mean_token_accuracy": 0.9119535982608795, |
| "step": 193 |
| }, |
| { |
| "epoch": 1.0602739726027397, |
| "grad_norm": 1.7890625, |
| "learning_rate": 9.661016949152544e-06, |
| "loss": 0.7319, |
| "mean_token_accuracy": 0.902245968580246, |
| "step": 194 |
| }, |
| { |
| "epoch": 1.0657534246575342, |
| "grad_norm": 1.84375, |
| "learning_rate": 9.6045197740113e-06, |
| "loss": 0.7186, |
| "mean_token_accuracy": 0.9031082093715668, |
| "step": 195 |
| }, |
| { |
| "epoch": 1.0712328767123287, |
| "grad_norm": 1.921875, |
| "learning_rate": 9.548022598870057e-06, |
| "loss": 0.7056, |
| "mean_token_accuracy": 0.9083144962787628, |
| "step": 196 |
| }, |
| { |
| "epoch": 1.0767123287671232, |
| "grad_norm": 1.9296875, |
| "learning_rate": 9.491525423728815e-06, |
| "loss": 0.6955, |
| "mean_token_accuracy": 0.9058070778846741, |
| "step": 197 |
| }, |
| { |
| "epoch": 1.0821917808219177, |
| "grad_norm": 1.8203125, |
| "learning_rate": 9.435028248587572e-06, |
| "loss": 0.658, |
| "mean_token_accuracy": 0.9093527495861053, |
| "step": 198 |
| }, |
| { |
| "epoch": 1.0876712328767124, |
| "grad_norm": 1.8203125, |
| "learning_rate": 9.378531073446328e-06, |
| "loss": 0.6913, |
| "mean_token_accuracy": 0.907717764377594, |
| "step": 199 |
| }, |
| { |
| "epoch": 1.093150684931507, |
| "grad_norm": 1.7890625, |
| "learning_rate": 9.322033898305085e-06, |
| "loss": 0.6677, |
| "mean_token_accuracy": 0.9069054424762726, |
| "step": 200 |
| }, |
| { |
| "epoch": 1.0986301369863014, |
| "grad_norm": 1.84375, |
| "learning_rate": 9.265536723163843e-06, |
| "loss": 0.6996, |
| "mean_token_accuracy": 0.9076306223869324, |
| "step": 201 |
| }, |
| { |
| "epoch": 1.104109589041096, |
| "grad_norm": 1.8359375, |
| "learning_rate": 9.2090395480226e-06, |
| "loss": 0.7397, |
| "mean_token_accuracy": 0.8992961049079895, |
| "step": 202 |
| }, |
| { |
| "epoch": 1.1095890410958904, |
| "grad_norm": 1.7890625, |
| "learning_rate": 9.152542372881356e-06, |
| "loss": 0.7211, |
| "mean_token_accuracy": 0.9030121862888336, |
| "step": 203 |
| }, |
| { |
| "epoch": 1.115068493150685, |
| "grad_norm": 2.359375, |
| "learning_rate": 9.096045197740113e-06, |
| "loss": 0.7143, |
| "mean_token_accuracy": 0.9033773839473724, |
| "step": 204 |
| }, |
| { |
| "epoch": 1.1205479452054794, |
| "grad_norm": 2.03125, |
| "learning_rate": 9.039548022598871e-06, |
| "loss": 0.7024, |
| "mean_token_accuracy": 0.905937910079956, |
| "step": 205 |
| }, |
| { |
| "epoch": 1.126027397260274, |
| "grad_norm": 1.953125, |
| "learning_rate": 8.983050847457628e-06, |
| "loss": 0.671, |
| "mean_token_accuracy": 0.9082026183605194, |
| "step": 206 |
| }, |
| { |
| "epoch": 1.1315068493150684, |
| "grad_norm": 1.875, |
| "learning_rate": 8.926553672316384e-06, |
| "loss": 0.713, |
| "mean_token_accuracy": 0.903482049703598, |
| "step": 207 |
| }, |
| { |
| "epoch": 1.1369863013698631, |
| "grad_norm": 1.765625, |
| "learning_rate": 8.870056497175143e-06, |
| "loss": 0.7127, |
| "mean_token_accuracy": 0.9049177765846252, |
| "step": 208 |
| }, |
| { |
| "epoch": 1.1424657534246576, |
| "grad_norm": 1.953125, |
| "learning_rate": 8.8135593220339e-06, |
| "loss": 0.7038, |
| "mean_token_accuracy": 0.9026915431022644, |
| "step": 209 |
| }, |
| { |
| "epoch": 1.1479452054794521, |
| "grad_norm": 1.9609375, |
| "learning_rate": 8.757062146892656e-06, |
| "loss": 0.7828, |
| "mean_token_accuracy": 0.8936010599136353, |
| "step": 210 |
| }, |
| { |
| "epoch": 1.1534246575342466, |
| "grad_norm": 1.8203125, |
| "learning_rate": 8.700564971751413e-06, |
| "loss": 0.6303, |
| "mean_token_accuracy": 0.9128701090812683, |
| "step": 211 |
| }, |
| { |
| "epoch": 1.158904109589041, |
| "grad_norm": 1.8359375, |
| "learning_rate": 8.64406779661017e-06, |
| "loss": 0.6587, |
| "mean_token_accuracy": 0.9105681478977203, |
| "step": 212 |
| }, |
| { |
| "epoch": 1.1643835616438356, |
| "grad_norm": 1.8359375, |
| "learning_rate": 8.587570621468927e-06, |
| "loss": 0.6522, |
| "mean_token_accuracy": 0.9126417934894562, |
| "step": 213 |
| }, |
| { |
| "epoch": 1.16986301369863, |
| "grad_norm": 1.921875, |
| "learning_rate": 8.531073446327684e-06, |
| "loss": 0.6384, |
| "mean_token_accuracy": 0.9142054915428162, |
| "step": 214 |
| }, |
| { |
| "epoch": 1.1753424657534246, |
| "grad_norm": 1.84375, |
| "learning_rate": 8.47457627118644e-06, |
| "loss": 0.6443, |
| "mean_token_accuracy": 0.9108563363552094, |
| "step": 215 |
| }, |
| { |
| "epoch": 1.180821917808219, |
| "grad_norm": 1.8828125, |
| "learning_rate": 8.418079096045199e-06, |
| "loss": 0.6724, |
| "mean_token_accuracy": 0.9071345031261444, |
| "step": 216 |
| }, |
| { |
| "epoch": 1.1863013698630138, |
| "grad_norm": 1.984375, |
| "learning_rate": 8.361581920903955e-06, |
| "loss": 0.7334, |
| "mean_token_accuracy": 0.9011849761009216, |
| "step": 217 |
| }, |
| { |
| "epoch": 1.191780821917808, |
| "grad_norm": 1.890625, |
| "learning_rate": 8.305084745762712e-06, |
| "loss": 0.6668, |
| "mean_token_accuracy": 0.9077614843845367, |
| "step": 218 |
| }, |
| { |
| "epoch": 1.1972602739726028, |
| "grad_norm": 1.8828125, |
| "learning_rate": 8.248587570621469e-06, |
| "loss": 0.7365, |
| "mean_token_accuracy": 0.9032963216304779, |
| "step": 219 |
| }, |
| { |
| "epoch": 1.2027397260273973, |
| "grad_norm": 1.890625, |
| "learning_rate": 8.192090395480227e-06, |
| "loss": 0.6649, |
| "mean_token_accuracy": 0.9089824557304382, |
| "step": 220 |
| }, |
| { |
| "epoch": 1.2082191780821918, |
| "grad_norm": 1.7890625, |
| "learning_rate": 8.135593220338983e-06, |
| "loss": 0.6773, |
| "mean_token_accuracy": 0.9071504771709442, |
| "step": 221 |
| }, |
| { |
| "epoch": 1.2136986301369863, |
| "grad_norm": 1.9765625, |
| "learning_rate": 8.07909604519774e-06, |
| "loss": 0.69, |
| "mean_token_accuracy": 0.9098499119281769, |
| "step": 222 |
| }, |
| { |
| "epoch": 1.2191780821917808, |
| "grad_norm": 1.8671875, |
| "learning_rate": 8.022598870056498e-06, |
| "loss": 0.6418, |
| "mean_token_accuracy": 0.9123164415359497, |
| "step": 223 |
| }, |
| { |
| "epoch": 1.2246575342465753, |
| "grad_norm": 1.8359375, |
| "learning_rate": 7.966101694915255e-06, |
| "loss": 0.6542, |
| "mean_token_accuracy": 0.9111972451210022, |
| "step": 224 |
| }, |
| { |
| "epoch": 1.2301369863013698, |
| "grad_norm": 1.8828125, |
| "learning_rate": 7.909604519774012e-06, |
| "loss": 0.7119, |
| "mean_token_accuracy": 0.9055051803588867, |
| "step": 225 |
| }, |
| { |
| "epoch": 1.2356164383561643, |
| "grad_norm": 2.0, |
| "learning_rate": 7.853107344632768e-06, |
| "loss": 0.6844, |
| "mean_token_accuracy": 0.9083731472492218, |
| "step": 226 |
| }, |
| { |
| "epoch": 1.2410958904109588, |
| "grad_norm": 1.9296875, |
| "learning_rate": 7.796610169491526e-06, |
| "loss": 0.6986, |
| "mean_token_accuracy": 0.9047116041183472, |
| "step": 227 |
| }, |
| { |
| "epoch": 1.2465753424657535, |
| "grad_norm": 1.8359375, |
| "learning_rate": 7.740112994350283e-06, |
| "loss": 0.6755, |
| "mean_token_accuracy": 0.9082626402378082, |
| "step": 228 |
| }, |
| { |
| "epoch": 1.252054794520548, |
| "grad_norm": 1.890625, |
| "learning_rate": 7.68361581920904e-06, |
| "loss": 0.6935, |
| "mean_token_accuracy": 0.9040849804878235, |
| "step": 229 |
| }, |
| { |
| "epoch": 1.2575342465753425, |
| "grad_norm": 2.109375, |
| "learning_rate": 7.627118644067797e-06, |
| "loss": 0.6887, |
| "mean_token_accuracy": 0.9099703729152679, |
| "step": 230 |
| }, |
| { |
| "epoch": 1.263013698630137, |
| "grad_norm": 2.0, |
| "learning_rate": 7.5706214689265545e-06, |
| "loss": 0.7033, |
| "mean_token_accuracy": 0.9058608114719391, |
| "step": 231 |
| }, |
| { |
| "epoch": 1.2684931506849315, |
| "grad_norm": 1.9375, |
| "learning_rate": 7.514124293785311e-06, |
| "loss": 0.7058, |
| "mean_token_accuracy": 0.9063239991664886, |
| "step": 232 |
| }, |
| { |
| "epoch": 1.273972602739726, |
| "grad_norm": 1.9765625, |
| "learning_rate": 7.4576271186440685e-06, |
| "loss": 0.7218, |
| "mean_token_accuracy": 0.9018439948558807, |
| "step": 233 |
| }, |
| { |
| "epoch": 1.2794520547945205, |
| "grad_norm": 1.9140625, |
| "learning_rate": 7.401129943502825e-06, |
| "loss": 0.7134, |
| "mean_token_accuracy": 0.9037202000617981, |
| "step": 234 |
| }, |
| { |
| "epoch": 1.284931506849315, |
| "grad_norm": 1.96875, |
| "learning_rate": 7.3446327683615825e-06, |
| "loss": 0.6849, |
| "mean_token_accuracy": 0.9080476462841034, |
| "step": 235 |
| }, |
| { |
| "epoch": 1.2904109589041095, |
| "grad_norm": 2.015625, |
| "learning_rate": 7.288135593220339e-06, |
| "loss": 0.747, |
| "mean_token_accuracy": 0.9032059609889984, |
| "step": 236 |
| }, |
| { |
| "epoch": 1.2958904109589042, |
| "grad_norm": 1.875, |
| "learning_rate": 7.2316384180790965e-06, |
| "loss": 0.6309, |
| "mean_token_accuracy": 0.9128157496452332, |
| "step": 237 |
| }, |
| { |
| "epoch": 1.3013698630136985, |
| "grad_norm": 1.859375, |
| "learning_rate": 7.175141242937854e-06, |
| "loss": 0.7132, |
| "mean_token_accuracy": 0.9027237892150879, |
| "step": 238 |
| }, |
| { |
| "epoch": 1.3068493150684932, |
| "grad_norm": 1.9140625, |
| "learning_rate": 7.1186440677966106e-06, |
| "loss": 0.6806, |
| "mean_token_accuracy": 0.9067763984203339, |
| "step": 239 |
| }, |
| { |
| "epoch": 1.3123287671232877, |
| "grad_norm": 1.90625, |
| "learning_rate": 7.062146892655368e-06, |
| "loss": 0.7434, |
| "mean_token_accuracy": 0.9005565345287323, |
| "step": 240 |
| }, |
| { |
| "epoch": 1.3178082191780822, |
| "grad_norm": 1.875, |
| "learning_rate": 7.0056497175141246e-06, |
| "loss": 0.6809, |
| "mean_token_accuracy": 0.9072897434234619, |
| "step": 241 |
| }, |
| { |
| "epoch": 1.3232876712328767, |
| "grad_norm": 1.984375, |
| "learning_rate": 6.949152542372882e-06, |
| "loss": 0.7208, |
| "mean_token_accuracy": 0.906402200460434, |
| "step": 242 |
| }, |
| { |
| "epoch": 1.3287671232876712, |
| "grad_norm": 1.9921875, |
| "learning_rate": 6.892655367231639e-06, |
| "loss": 0.7418, |
| "mean_token_accuracy": 0.9020512700080872, |
| "step": 243 |
| }, |
| { |
| "epoch": 1.3342465753424657, |
| "grad_norm": 2.09375, |
| "learning_rate": 6.836158192090396e-06, |
| "loss": 0.7158, |
| "mean_token_accuracy": 0.9037717282772064, |
| "step": 244 |
| }, |
| { |
| "epoch": 1.3397260273972602, |
| "grad_norm": 1.96875, |
| "learning_rate": 6.779661016949153e-06, |
| "loss": 0.7274, |
| "mean_token_accuracy": 0.9033022224903107, |
| "step": 245 |
| }, |
| { |
| "epoch": 1.345205479452055, |
| "grad_norm": 2.015625, |
| "learning_rate": 6.72316384180791e-06, |
| "loss": 0.6736, |
| "mean_token_accuracy": 0.9079640805721283, |
| "step": 246 |
| }, |
| { |
| "epoch": 1.3506849315068492, |
| "grad_norm": 1.9921875, |
| "learning_rate": 6.666666666666667e-06, |
| "loss": 0.6624, |
| "mean_token_accuracy": 0.9096674621105194, |
| "step": 247 |
| }, |
| { |
| "epoch": 1.356164383561644, |
| "grad_norm": 2.015625, |
| "learning_rate": 6.610169491525424e-06, |
| "loss": 0.7013, |
| "mean_token_accuracy": 0.9066566228866577, |
| "step": 248 |
| }, |
| { |
| "epoch": 1.3616438356164384, |
| "grad_norm": 1.9375, |
| "learning_rate": 6.553672316384181e-06, |
| "loss": 0.7107, |
| "mean_token_accuracy": 0.9048294425010681, |
| "step": 249 |
| }, |
| { |
| "epoch": 1.367123287671233, |
| "grad_norm": 2.046875, |
| "learning_rate": 6.497175141242938e-06, |
| "loss": 0.6964, |
| "mean_token_accuracy": 0.9053798615932465, |
| "step": 250 |
| }, |
| { |
| "epoch": 1.3726027397260274, |
| "grad_norm": 1.859375, |
| "learning_rate": 6.440677966101695e-06, |
| "loss": 0.6184, |
| "mean_token_accuracy": 0.9159266352653503, |
| "step": 251 |
| }, |
| { |
| "epoch": 1.378082191780822, |
| "grad_norm": 1.8984375, |
| "learning_rate": 6.384180790960452e-06, |
| "loss": 0.6435, |
| "mean_token_accuracy": 0.9122322797775269, |
| "step": 252 |
| }, |
| { |
| "epoch": 1.3835616438356164, |
| "grad_norm": 1.8359375, |
| "learning_rate": 6.32768361581921e-06, |
| "loss": 0.6622, |
| "mean_token_accuracy": 0.910215824842453, |
| "step": 253 |
| }, |
| { |
| "epoch": 1.389041095890411, |
| "grad_norm": 1.9140625, |
| "learning_rate": 6.271186440677966e-06, |
| "loss": 0.6991, |
| "mean_token_accuracy": 0.907020092010498, |
| "step": 254 |
| }, |
| { |
| "epoch": 1.3945205479452054, |
| "grad_norm": 1.9921875, |
| "learning_rate": 6.2146892655367244e-06, |
| "loss": 0.6726, |
| "mean_token_accuracy": 0.9085971713066101, |
| "step": 255 |
| }, |
| { |
| "epoch": 1.4, |
| "grad_norm": 1.84375, |
| "learning_rate": 6.158192090395481e-06, |
| "loss": 0.6498, |
| "mean_token_accuracy": 0.9123066365718842, |
| "step": 256 |
| }, |
| { |
| "epoch": 1.4054794520547946, |
| "grad_norm": 1.8203125, |
| "learning_rate": 6.1016949152542385e-06, |
| "loss": 0.6304, |
| "mean_token_accuracy": 0.914670318365097, |
| "step": 257 |
| }, |
| { |
| "epoch": 1.410958904109589, |
| "grad_norm": 1.8671875, |
| "learning_rate": 6.045197740112995e-06, |
| "loss": 0.6677, |
| "mean_token_accuracy": 0.9098580479621887, |
| "step": 258 |
| }, |
| { |
| "epoch": 1.4164383561643836, |
| "grad_norm": 1.90625, |
| "learning_rate": 5.9887005649717525e-06, |
| "loss": 0.6927, |
| "mean_token_accuracy": 0.9073116779327393, |
| "step": 259 |
| }, |
| { |
| "epoch": 1.4219178082191781, |
| "grad_norm": 2.25, |
| "learning_rate": 5.932203389830509e-06, |
| "loss": 0.7589, |
| "mean_token_accuracy": 0.8991564214229584, |
| "step": 260 |
| }, |
| { |
| "epoch": 1.4273972602739726, |
| "grad_norm": 1.921875, |
| "learning_rate": 5.8757062146892665e-06, |
| "loss": 0.7304, |
| "mean_token_accuracy": 0.9021161198616028, |
| "step": 261 |
| }, |
| { |
| "epoch": 1.4328767123287671, |
| "grad_norm": 1.828125, |
| "learning_rate": 5.819209039548023e-06, |
| "loss": 0.6267, |
| "mean_token_accuracy": 0.9146751463413239, |
| "step": 262 |
| }, |
| { |
| "epoch": 1.4383561643835616, |
| "grad_norm": 1.9140625, |
| "learning_rate": 5.7627118644067805e-06, |
| "loss": 0.7622, |
| "mean_token_accuracy": 0.8948341906070709, |
| "step": 263 |
| }, |
| { |
| "epoch": 1.4438356164383561, |
| "grad_norm": 1.8515625, |
| "learning_rate": 5.706214689265537e-06, |
| "loss": 0.6191, |
| "mean_token_accuracy": 0.913863331079483, |
| "step": 264 |
| }, |
| { |
| "epoch": 1.4493150684931506, |
| "grad_norm": 2.09375, |
| "learning_rate": 5.6497175141242946e-06, |
| "loss": 0.6872, |
| "mean_token_accuracy": 0.9074727296829224, |
| "step": 265 |
| }, |
| { |
| "epoch": 1.4547945205479453, |
| "grad_norm": 2.046875, |
| "learning_rate": 5.593220338983051e-06, |
| "loss": 0.6691, |
| "mean_token_accuracy": 0.910420149564743, |
| "step": 266 |
| }, |
| { |
| "epoch": 1.4602739726027396, |
| "grad_norm": 1.9296875, |
| "learning_rate": 5.536723163841809e-06, |
| "loss": 0.6667, |
| "mean_token_accuracy": 0.9060676395893097, |
| "step": 267 |
| }, |
| { |
| "epoch": 1.4657534246575343, |
| "grad_norm": 1.9375, |
| "learning_rate": 5.480225988700565e-06, |
| "loss": 0.6477, |
| "mean_token_accuracy": 0.9133667647838593, |
| "step": 268 |
| }, |
| { |
| "epoch": 1.4712328767123288, |
| "grad_norm": 1.921875, |
| "learning_rate": 5.423728813559323e-06, |
| "loss": 0.6444, |
| "mean_token_accuracy": 0.9106875658035278, |
| "step": 269 |
| }, |
| { |
| "epoch": 1.4767123287671233, |
| "grad_norm": 1.9140625, |
| "learning_rate": 5.36723163841808e-06, |
| "loss": 0.6404, |
| "mean_token_accuracy": 0.9118073582649231, |
| "step": 270 |
| }, |
| { |
| "epoch": 1.4821917808219178, |
| "grad_norm": 1.890625, |
| "learning_rate": 5.310734463276837e-06, |
| "loss": 0.6769, |
| "mean_token_accuracy": 0.9056595265865326, |
| "step": 271 |
| }, |
| { |
| "epoch": 1.4876712328767123, |
| "grad_norm": 1.9765625, |
| "learning_rate": 5.254237288135594e-06, |
| "loss": 0.6808, |
| "mean_token_accuracy": 0.9079870283603668, |
| "step": 272 |
| }, |
| { |
| "epoch": 1.4931506849315068, |
| "grad_norm": 1.953125, |
| "learning_rate": 5.197740112994351e-06, |
| "loss": 0.7428, |
| "mean_token_accuracy": 0.9004494547843933, |
| "step": 273 |
| }, |
| { |
| "epoch": 1.4986301369863013, |
| "grad_norm": 1.8828125, |
| "learning_rate": 5.141242937853108e-06, |
| "loss": 0.6059, |
| "mean_token_accuracy": 0.9166204333305359, |
| "step": 274 |
| }, |
| { |
| "epoch": 1.504109589041096, |
| "grad_norm": 2.203125, |
| "learning_rate": 5.084745762711865e-06, |
| "loss": 0.6387, |
| "mean_token_accuracy": 0.9125949144363403, |
| "step": 275 |
| }, |
| { |
| "epoch": 1.5095890410958903, |
| "grad_norm": 1.9296875, |
| "learning_rate": 5.028248587570622e-06, |
| "loss": 0.674, |
| "mean_token_accuracy": 0.910666286945343, |
| "step": 276 |
| }, |
| { |
| "epoch": 1.515068493150685, |
| "grad_norm": 1.9296875, |
| "learning_rate": 4.9717514124293796e-06, |
| "loss": 0.653, |
| "mean_token_accuracy": 0.9097437858581543, |
| "step": 277 |
| }, |
| { |
| "epoch": 1.5205479452054793, |
| "grad_norm": 1.9921875, |
| "learning_rate": 4.915254237288136e-06, |
| "loss": 0.6891, |
| "mean_token_accuracy": 0.9078112840652466, |
| "step": 278 |
| }, |
| { |
| "epoch": 1.526027397260274, |
| "grad_norm": 2.140625, |
| "learning_rate": 4.8587570621468936e-06, |
| "loss": 0.7249, |
| "mean_token_accuracy": 0.9042028188705444, |
| "step": 279 |
| }, |
| { |
| "epoch": 1.5315068493150685, |
| "grad_norm": 1.9140625, |
| "learning_rate": 4.80225988700565e-06, |
| "loss": 0.6809, |
| "mean_token_accuracy": 0.9092828631401062, |
| "step": 280 |
| }, |
| { |
| "epoch": 1.536986301369863, |
| "grad_norm": 1.890625, |
| "learning_rate": 4.745762711864408e-06, |
| "loss": 0.6141, |
| "mean_token_accuracy": 0.9147942662239075, |
| "step": 281 |
| }, |
| { |
| "epoch": 1.5424657534246575, |
| "grad_norm": 2.046875, |
| "learning_rate": 4.689265536723164e-06, |
| "loss": 0.7209, |
| "mean_token_accuracy": 0.9041316211223602, |
| "step": 282 |
| }, |
| { |
| "epoch": 1.547945205479452, |
| "grad_norm": 1.84375, |
| "learning_rate": 4.632768361581922e-06, |
| "loss": 0.6145, |
| "mean_token_accuracy": 0.9166280925273895, |
| "step": 283 |
| }, |
| { |
| "epoch": 1.5534246575342465, |
| "grad_norm": 2.125, |
| "learning_rate": 4.576271186440678e-06, |
| "loss": 0.673, |
| "mean_token_accuracy": 0.9096376001834869, |
| "step": 284 |
| }, |
| { |
| "epoch": 1.558904109589041, |
| "grad_norm": 1.9609375, |
| "learning_rate": 4.519774011299436e-06, |
| "loss": 0.6784, |
| "mean_token_accuracy": 0.9063901305198669, |
| "step": 285 |
| }, |
| { |
| "epoch": 1.5643835616438357, |
| "grad_norm": 1.9453125, |
| "learning_rate": 4.463276836158192e-06, |
| "loss": 0.6594, |
| "mean_token_accuracy": 0.9109133183956146, |
| "step": 286 |
| }, |
| { |
| "epoch": 1.56986301369863, |
| "grad_norm": 1.9453125, |
| "learning_rate": 4.40677966101695e-06, |
| "loss": 0.6409, |
| "mean_token_accuracy": 0.9116988480091095, |
| "step": 287 |
| }, |
| { |
| "epoch": 1.5753424657534247, |
| "grad_norm": 1.90625, |
| "learning_rate": 4.350282485875706e-06, |
| "loss": 0.6547, |
| "mean_token_accuracy": 0.9099021852016449, |
| "step": 288 |
| }, |
| { |
| "epoch": 1.580821917808219, |
| "grad_norm": 1.90625, |
| "learning_rate": 4.293785310734464e-06, |
| "loss": 0.6769, |
| "mean_token_accuracy": 0.9094822108745575, |
| "step": 289 |
| }, |
| { |
| "epoch": 1.5863013698630137, |
| "grad_norm": 2.0, |
| "learning_rate": 4.23728813559322e-06, |
| "loss": 0.7131, |
| "mean_token_accuracy": 0.903436928987503, |
| "step": 290 |
| }, |
| { |
| "epoch": 1.5917808219178082, |
| "grad_norm": 1.9375, |
| "learning_rate": 4.180790960451978e-06, |
| "loss": 0.6988, |
| "mean_token_accuracy": 0.9046348929405212, |
| "step": 291 |
| }, |
| { |
| "epoch": 1.5972602739726027, |
| "grad_norm": 2.015625, |
| "learning_rate": 4.124293785310734e-06, |
| "loss": 0.7204, |
| "mean_token_accuracy": 0.9052576124668121, |
| "step": 292 |
| }, |
| { |
| "epoch": 1.6027397260273972, |
| "grad_norm": 1.984375, |
| "learning_rate": 4.067796610169492e-06, |
| "loss": 0.6905, |
| "mean_token_accuracy": 0.9082843363285065, |
| "step": 293 |
| }, |
| { |
| "epoch": 1.6082191780821917, |
| "grad_norm": 2.015625, |
| "learning_rate": 4.011299435028249e-06, |
| "loss": 0.7316, |
| "mean_token_accuracy": 0.9009381234645844, |
| "step": 294 |
| }, |
| { |
| "epoch": 1.6136986301369864, |
| "grad_norm": 1.9921875, |
| "learning_rate": 3.954802259887006e-06, |
| "loss": 0.7316, |
| "mean_token_accuracy": 0.9005630910396576, |
| "step": 295 |
| }, |
| { |
| "epoch": 1.6191780821917807, |
| "grad_norm": 1.984375, |
| "learning_rate": 3.898305084745763e-06, |
| "loss": 0.6994, |
| "mean_token_accuracy": 0.905254602432251, |
| "step": 296 |
| }, |
| { |
| "epoch": 1.6246575342465754, |
| "grad_norm": 1.953125, |
| "learning_rate": 3.84180790960452e-06, |
| "loss": 0.6493, |
| "mean_token_accuracy": 0.9093045294284821, |
| "step": 297 |
| }, |
| { |
| "epoch": 1.6301369863013697, |
| "grad_norm": 1.90625, |
| "learning_rate": 3.7853107344632772e-06, |
| "loss": 0.6282, |
| "mean_token_accuracy": 0.9144234955310822, |
| "step": 298 |
| }, |
| { |
| "epoch": 1.6356164383561644, |
| "grad_norm": 1.9296875, |
| "learning_rate": 3.7288135593220342e-06, |
| "loss": 0.6878, |
| "mean_token_accuracy": 0.9078341126441956, |
| "step": 299 |
| }, |
| { |
| "epoch": 1.641095890410959, |
| "grad_norm": 1.890625, |
| "learning_rate": 3.6723163841807913e-06, |
| "loss": 0.6357, |
| "mean_token_accuracy": 0.9131599366664886, |
| "step": 300 |
| }, |
| { |
| "epoch": 1.6465753424657534, |
| "grad_norm": 1.875, |
| "learning_rate": 3.6158192090395483e-06, |
| "loss": 0.7303, |
| "mean_token_accuracy": 0.9041744768619537, |
| "step": 301 |
| }, |
| { |
| "epoch": 1.652054794520548, |
| "grad_norm": 1.9375, |
| "learning_rate": 3.5593220338983053e-06, |
| "loss": 0.6885, |
| "mean_token_accuracy": 0.9045538306236267, |
| "step": 302 |
| }, |
| { |
| "epoch": 1.6575342465753424, |
| "grad_norm": 1.9453125, |
| "learning_rate": 3.5028248587570623e-06, |
| "loss": 0.6915, |
| "mean_token_accuracy": 0.9064763784408569, |
| "step": 303 |
| }, |
| { |
| "epoch": 1.6630136986301371, |
| "grad_norm": 2.078125, |
| "learning_rate": 3.4463276836158193e-06, |
| "loss": 0.7073, |
| "mean_token_accuracy": 0.9064249396324158, |
| "step": 304 |
| }, |
| { |
| "epoch": 1.6684931506849314, |
| "grad_norm": 1.953125, |
| "learning_rate": 3.3898305084745763e-06, |
| "loss": 0.6497, |
| "mean_token_accuracy": 0.9118121564388275, |
| "step": 305 |
| }, |
| { |
| "epoch": 1.6739726027397261, |
| "grad_norm": 1.90625, |
| "learning_rate": 3.3333333333333333e-06, |
| "loss": 0.6628, |
| "mean_token_accuracy": 0.9104187488555908, |
| "step": 306 |
| }, |
| { |
| "epoch": 1.6794520547945204, |
| "grad_norm": 1.90625, |
| "learning_rate": 3.2768361581920903e-06, |
| "loss": 0.6534, |
| "mean_token_accuracy": 0.909776359796524, |
| "step": 307 |
| }, |
| { |
| "epoch": 1.6849315068493151, |
| "grad_norm": 1.875, |
| "learning_rate": 3.2203389830508473e-06, |
| "loss": 0.6717, |
| "mean_token_accuracy": 0.9090818762779236, |
| "step": 308 |
| }, |
| { |
| "epoch": 1.6904109589041096, |
| "grad_norm": 1.984375, |
| "learning_rate": 3.163841807909605e-06, |
| "loss": 0.6735, |
| "mean_token_accuracy": 0.9090877175331116, |
| "step": 309 |
| }, |
| { |
| "epoch": 1.6958904109589041, |
| "grad_norm": 1.9609375, |
| "learning_rate": 3.1073446327683622e-06, |
| "loss": 0.6932, |
| "mean_token_accuracy": 0.9055996835231781, |
| "step": 310 |
| }, |
| { |
| "epoch": 1.7013698630136986, |
| "grad_norm": 1.9140625, |
| "learning_rate": 3.0508474576271192e-06, |
| "loss": 0.6919, |
| "mean_token_accuracy": 0.9036900401115417, |
| "step": 311 |
| }, |
| { |
| "epoch": 1.7068493150684931, |
| "grad_norm": 1.9453125, |
| "learning_rate": 2.9943502824858762e-06, |
| "loss": 0.6737, |
| "mean_token_accuracy": 0.9093412756919861, |
| "step": 312 |
| }, |
| { |
| "epoch": 1.7123287671232876, |
| "grad_norm": 1.84375, |
| "learning_rate": 2.9378531073446333e-06, |
| "loss": 0.6038, |
| "mean_token_accuracy": 0.9156779944896698, |
| "step": 313 |
| }, |
| { |
| "epoch": 1.7178082191780821, |
| "grad_norm": 1.8984375, |
| "learning_rate": 2.8813559322033903e-06, |
| "loss": 0.6681, |
| "mean_token_accuracy": 0.9089544415473938, |
| "step": 314 |
| }, |
| { |
| "epoch": 1.7232876712328768, |
| "grad_norm": 2.015625, |
| "learning_rate": 2.8248587570621473e-06, |
| "loss": 0.6544, |
| "mean_token_accuracy": 0.911058783531189, |
| "step": 315 |
| }, |
| { |
| "epoch": 1.7287671232876711, |
| "grad_norm": 1.9453125, |
| "learning_rate": 2.7683615819209043e-06, |
| "loss": 0.6405, |
| "mean_token_accuracy": 0.9108243882656097, |
| "step": 316 |
| }, |
| { |
| "epoch": 1.7342465753424658, |
| "grad_norm": 1.921875, |
| "learning_rate": 2.7118644067796613e-06, |
| "loss": 0.6556, |
| "mean_token_accuracy": 0.9137302935123444, |
| "step": 317 |
| }, |
| { |
| "epoch": 1.7397260273972601, |
| "grad_norm": 2.28125, |
| "learning_rate": 2.6553672316384183e-06, |
| "loss": 0.6323, |
| "mean_token_accuracy": 0.9138603806495667, |
| "step": 318 |
| }, |
| { |
| "epoch": 1.7452054794520548, |
| "grad_norm": 1.8984375, |
| "learning_rate": 2.5988700564971753e-06, |
| "loss": 0.662, |
| "mean_token_accuracy": 0.908368855714798, |
| "step": 319 |
| }, |
| { |
| "epoch": 1.7506849315068493, |
| "grad_norm": 1.9296875, |
| "learning_rate": 2.5423728813559323e-06, |
| "loss": 0.602, |
| "mean_token_accuracy": 0.9159910678863525, |
| "step": 320 |
| }, |
| { |
| "epoch": 1.7561643835616438, |
| "grad_norm": 1.9140625, |
| "learning_rate": 2.4858757062146898e-06, |
| "loss": 0.6691, |
| "mean_token_accuracy": 0.9087992608547211, |
| "step": 321 |
| }, |
| { |
| "epoch": 1.7616438356164383, |
| "grad_norm": 1.796875, |
| "learning_rate": 2.4293785310734468e-06, |
| "loss": 0.596, |
| "mean_token_accuracy": 0.917241632938385, |
| "step": 322 |
| }, |
| { |
| "epoch": 1.7671232876712328, |
| "grad_norm": 2.015625, |
| "learning_rate": 2.372881355932204e-06, |
| "loss": 0.7606, |
| "mean_token_accuracy": 0.8987223207950592, |
| "step": 323 |
| }, |
| { |
| "epoch": 1.7726027397260276, |
| "grad_norm": 1.8984375, |
| "learning_rate": 2.316384180790961e-06, |
| "loss": 0.6812, |
| "mean_token_accuracy": 0.9057367146015167, |
| "step": 324 |
| }, |
| { |
| "epoch": 1.7780821917808218, |
| "grad_norm": 1.953125, |
| "learning_rate": 2.259887005649718e-06, |
| "loss": 0.6828, |
| "mean_token_accuracy": 0.9079809784889221, |
| "step": 325 |
| }, |
| { |
| "epoch": 1.7835616438356166, |
| "grad_norm": 1.984375, |
| "learning_rate": 2.203389830508475e-06, |
| "loss": 0.7241, |
| "mean_token_accuracy": 0.9030264317989349, |
| "step": 326 |
| }, |
| { |
| "epoch": 1.7890410958904108, |
| "grad_norm": 1.90625, |
| "learning_rate": 2.146892655367232e-06, |
| "loss": 0.6908, |
| "mean_token_accuracy": 0.9060869216918945, |
| "step": 327 |
| }, |
| { |
| "epoch": 1.7945205479452055, |
| "grad_norm": 1.984375, |
| "learning_rate": 2.090395480225989e-06, |
| "loss": 0.6877, |
| "mean_token_accuracy": 0.9075476229190826, |
| "step": 328 |
| }, |
| { |
| "epoch": 1.8, |
| "grad_norm": 2.0, |
| "learning_rate": 2.033898305084746e-06, |
| "loss": 0.6607, |
| "mean_token_accuracy": 0.9095008373260498, |
| "step": 329 |
| }, |
| { |
| "epoch": 1.8054794520547945, |
| "grad_norm": 1.859375, |
| "learning_rate": 1.977401129943503e-06, |
| "loss": 0.5915, |
| "mean_token_accuracy": 0.9170799255371094, |
| "step": 330 |
| }, |
| { |
| "epoch": 1.810958904109589, |
| "grad_norm": 2.03125, |
| "learning_rate": 1.92090395480226e-06, |
| "loss": 0.7045, |
| "mean_token_accuracy": 0.9037127196788788, |
| "step": 331 |
| }, |
| { |
| "epoch": 1.8164383561643835, |
| "grad_norm": 1.9765625, |
| "learning_rate": 1.8644067796610171e-06, |
| "loss": 0.6665, |
| "mean_token_accuracy": 0.9105578064918518, |
| "step": 332 |
| }, |
| { |
| "epoch": 1.821917808219178, |
| "grad_norm": 1.8671875, |
| "learning_rate": 1.8079096045197741e-06, |
| "loss": 0.6302, |
| "mean_token_accuracy": 0.9144696891307831, |
| "step": 333 |
| }, |
| { |
| "epoch": 1.8273972602739725, |
| "grad_norm": 1.984375, |
| "learning_rate": 1.7514124293785311e-06, |
| "loss": 0.7235, |
| "mean_token_accuracy": 0.9047011733055115, |
| "step": 334 |
| }, |
| { |
| "epoch": 1.8328767123287673, |
| "grad_norm": 1.875, |
| "learning_rate": 1.6949152542372882e-06, |
| "loss": 0.6352, |
| "mean_token_accuracy": 0.9123874604701996, |
| "step": 335 |
| }, |
| { |
| "epoch": 1.8383561643835615, |
| "grad_norm": 1.9609375, |
| "learning_rate": 1.6384180790960452e-06, |
| "loss": 0.6611, |
| "mean_token_accuracy": 0.9092899858951569, |
| "step": 336 |
| }, |
| { |
| "epoch": 1.8438356164383563, |
| "grad_norm": 2.015625, |
| "learning_rate": 1.5819209039548026e-06, |
| "loss": 0.7016, |
| "mean_token_accuracy": 0.9032659232616425, |
| "step": 337 |
| }, |
| { |
| "epoch": 1.8493150684931505, |
| "grad_norm": 2.03125, |
| "learning_rate": 1.5254237288135596e-06, |
| "loss": 0.7062, |
| "mean_token_accuracy": 0.9050180912017822, |
| "step": 338 |
| }, |
| { |
| "epoch": 1.8547945205479452, |
| "grad_norm": 2.0, |
| "learning_rate": 1.4689265536723166e-06, |
| "loss": 0.7284, |
| "mean_token_accuracy": 0.9015350937843323, |
| "step": 339 |
| }, |
| { |
| "epoch": 1.8602739726027397, |
| "grad_norm": 1.96875, |
| "learning_rate": 1.4124293785310736e-06, |
| "loss": 0.6355, |
| "mean_token_accuracy": 0.9123442471027374, |
| "step": 340 |
| }, |
| { |
| "epoch": 1.8657534246575342, |
| "grad_norm": 1.9921875, |
| "learning_rate": 1.3559322033898307e-06, |
| "loss": 0.6472, |
| "mean_token_accuracy": 0.9134511053562164, |
| "step": 341 |
| }, |
| { |
| "epoch": 1.8712328767123287, |
| "grad_norm": 2.421875, |
| "learning_rate": 1.2994350282485877e-06, |
| "loss": 0.6786, |
| "mean_token_accuracy": 0.9099195599555969, |
| "step": 342 |
| }, |
| { |
| "epoch": 1.8767123287671232, |
| "grad_norm": 2.015625, |
| "learning_rate": 1.2429378531073449e-06, |
| "loss": 0.747, |
| "mean_token_accuracy": 0.9031140804290771, |
| "step": 343 |
| }, |
| { |
| "epoch": 1.882191780821918, |
| "grad_norm": 2.0, |
| "learning_rate": 1.186440677966102e-06, |
| "loss": 0.6904, |
| "mean_token_accuracy": 0.9059503078460693, |
| "step": 344 |
| }, |
| { |
| "epoch": 1.8876712328767122, |
| "grad_norm": 1.9375, |
| "learning_rate": 1.129943502824859e-06, |
| "loss": 0.6317, |
| "mean_token_accuracy": 0.913758397102356, |
| "step": 345 |
| }, |
| { |
| "epoch": 1.893150684931507, |
| "grad_norm": 1.921875, |
| "learning_rate": 1.073446327683616e-06, |
| "loss": 0.6718, |
| "mean_token_accuracy": 0.9079194068908691, |
| "step": 346 |
| }, |
| { |
| "epoch": 1.8986301369863012, |
| "grad_norm": 2.046875, |
| "learning_rate": 1.016949152542373e-06, |
| "loss": 0.6781, |
| "mean_token_accuracy": 0.9061055779457092, |
| "step": 347 |
| }, |
| { |
| "epoch": 1.904109589041096, |
| "grad_norm": 1.9375, |
| "learning_rate": 9.6045197740113e-07, |
| "loss": 0.7169, |
| "mean_token_accuracy": 0.9037725031375885, |
| "step": 348 |
| }, |
| { |
| "epoch": 1.9095890410958904, |
| "grad_norm": 2.078125, |
| "learning_rate": 9.039548022598871e-07, |
| "loss": 0.6882, |
| "mean_token_accuracy": 0.9081202149391174, |
| "step": 349 |
| }, |
| { |
| "epoch": 1.915068493150685, |
| "grad_norm": 1.984375, |
| "learning_rate": 8.474576271186441e-07, |
| "loss": 0.6714, |
| "mean_token_accuracy": 0.9084700644016266, |
| "step": 350 |
| }, |
| { |
| "epoch": 1.9205479452054794, |
| "grad_norm": 1.8984375, |
| "learning_rate": 7.909604519774013e-07, |
| "loss": 0.5951, |
| "mean_token_accuracy": 0.915435403585434, |
| "step": 351 |
| }, |
| { |
| "epoch": 1.926027397260274, |
| "grad_norm": 1.921875, |
| "learning_rate": 7.344632768361583e-07, |
| "loss": 0.6639, |
| "mean_token_accuracy": 0.9109577238559723, |
| "step": 352 |
| }, |
| { |
| "epoch": 1.9315068493150684, |
| "grad_norm": 1.9296875, |
| "learning_rate": 6.779661016949153e-07, |
| "loss": 0.6354, |
| "mean_token_accuracy": 0.9138354063034058, |
| "step": 353 |
| }, |
| { |
| "epoch": 1.936986301369863, |
| "grad_norm": 1.984375, |
| "learning_rate": 6.214689265536724e-07, |
| "loss": 0.6613, |
| "mean_token_accuracy": 0.9091556370258331, |
| "step": 354 |
| }, |
| { |
| "epoch": 1.9424657534246577, |
| "grad_norm": 1.8828125, |
| "learning_rate": 5.649717514124295e-07, |
| "loss": 0.6825, |
| "mean_token_accuracy": 0.9077677130699158, |
| "step": 355 |
| }, |
| { |
| "epoch": 1.947945205479452, |
| "grad_norm": 1.890625, |
| "learning_rate": 5.084745762711865e-07, |
| "loss": 0.6799, |
| "mean_token_accuracy": 0.9061026573181152, |
| "step": 356 |
| }, |
| { |
| "epoch": 1.9534246575342467, |
| "grad_norm": 1.96875, |
| "learning_rate": 4.5197740112994353e-07, |
| "loss": 0.631, |
| "mean_token_accuracy": 0.914902925491333, |
| "step": 357 |
| }, |
| { |
| "epoch": 1.958904109589041, |
| "grad_norm": 1.9375, |
| "learning_rate": 3.9548022598870065e-07, |
| "loss": 0.7215, |
| "mean_token_accuracy": 0.9018444120883942, |
| "step": 358 |
| }, |
| { |
| "epoch": 1.9643835616438357, |
| "grad_norm": 1.90625, |
| "learning_rate": 3.3898305084745766e-07, |
| "loss": 0.7234, |
| "mean_token_accuracy": 0.9011828899383545, |
| "step": 359 |
| }, |
| { |
| "epoch": 1.9698630136986301, |
| "grad_norm": 2.046875, |
| "learning_rate": 2.8248587570621473e-07, |
| "loss": 0.6796, |
| "mean_token_accuracy": 0.9087766408920288, |
| "step": 360 |
| }, |
| { |
| "epoch": 1.9753424657534246, |
| "grad_norm": 1.921875, |
| "learning_rate": 2.2598870056497177e-07, |
| "loss": 0.6692, |
| "mean_token_accuracy": 0.9075906872749329, |
| "step": 361 |
| }, |
| { |
| "epoch": 1.9808219178082191, |
| "grad_norm": 1.9453125, |
| "learning_rate": 1.6949152542372883e-07, |
| "loss": 0.7053, |
| "mean_token_accuracy": 0.9050938785076141, |
| "step": 362 |
| }, |
| { |
| "epoch": 1.9863013698630136, |
| "grad_norm": 1.9375, |
| "learning_rate": 1.1299435028248588e-07, |
| "loss": 0.6466, |
| "mean_token_accuracy": 0.9130730330944061, |
| "step": 363 |
| }, |
| { |
| "epoch": 1.9917808219178084, |
| "grad_norm": 1.8671875, |
| "learning_rate": 5.649717514124294e-08, |
| "loss": 0.6636, |
| "mean_token_accuracy": 0.9109528958797455, |
| "step": 364 |
| } |
| ], |
| "logging_steps": 1.0, |
| "max_steps": 364, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 2, |
| "save_steps": 100000, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 4.568355335831552e+17, |
| "train_batch_size": 4, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|