{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.08685649186629352, "eval_steps": 1000, "global_step": 430000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0, "eval_calculated_loss": 15.195395469665527, "eval_loss": 10.872434616088867, "eval_perplexity": 3974444.1891521593, "eval_runtime": 113.1527, "eval_samples_per_second": 8.82, "eval_steps_per_second": 2.209, "step": 0 }, { "epoch": 2.019918415495198e-05, "grad_norm": 25.910245895385742, "learning_rate": 1.9999604095830626e-05, "loss": 3.5944, "step": 100 }, { "epoch": 4.039836830990396e-05, "grad_norm": 18.22606086730957, "learning_rate": 1.9999200111984325e-05, "loss": 2.3508, "step": 200 }, { "epoch": 6.059755246485594e-05, "grad_norm": 22.640230178833008, "learning_rate": 1.9998796128138024e-05, "loss": 2.3644, "step": 300 }, { "epoch": 8.079673661980792e-05, "grad_norm": 19.121156692504883, "learning_rate": 1.999839214429172e-05, "loss": 2.3056, "step": 400 }, { "epoch": 0.00010099592077475991, "grad_norm": 25.438858032226562, "learning_rate": 1.999798816044542e-05, "loss": 2.2826, "step": 500 }, { "epoch": 0.00012119510492971189, "grad_norm": 20.913928985595703, "learning_rate": 1.9997584176599115e-05, "loss": 2.2766, "step": 600 }, { "epoch": 0.00014139428908466388, "grad_norm": 18.063331604003906, "learning_rate": 1.9997180192752814e-05, "loss": 2.2752, "step": 700 }, { "epoch": 0.00016159347323961584, "grad_norm": 18.578174591064453, "learning_rate": 1.9996776208906513e-05, "loss": 2.3266, "step": 800 }, { "epoch": 0.00018179265739456783, "grad_norm": 14.369830131530762, "learning_rate": 1.999637222506021e-05, "loss": 2.3299, "step": 900 }, { "epoch": 0.00020199184154951982, "grad_norm": 16.384193420410156, "learning_rate": 1.9995968241213908e-05, "loss": 2.334, "step": 1000 }, { "epoch": 0.00020199184154951982, "eval_calculated_loss": 8.00036334991455, "eval_loss": 2.3184101581573486, "eval_perplexity": 2982.0413146731817, "eval_runtime": 111.9479, "eval_samples_per_second": 8.915, "eval_steps_per_second": 2.233, "step": 1000 }, { "epoch": 0.0002221910257044718, "grad_norm": 16.618349075317383, "learning_rate": 1.9995564257367607e-05, "loss": 2.2621, "step": 1100 }, { "epoch": 0.00024239020985942377, "grad_norm": 20.238191604614258, "learning_rate": 1.9995160273521303e-05, "loss": 2.2648, "step": 1200 }, { "epoch": 0.00026258939401437574, "grad_norm": 16.140003204345703, "learning_rate": 1.9994756289675002e-05, "loss": 2.2632, "step": 1300 }, { "epoch": 0.00028278857816932775, "grad_norm": 18.15752410888672, "learning_rate": 1.99943523058287e-05, "loss": 2.2618, "step": 1400 }, { "epoch": 0.0003029877623242797, "grad_norm": 26.219144821166992, "learning_rate": 1.99939483219824e-05, "loss": 2.2145, "step": 1500 }, { "epoch": 0.0003231869464792317, "grad_norm": 9.240769386291504, "learning_rate": 1.9993544338136096e-05, "loss": 2.2663, "step": 1600 }, { "epoch": 0.0003433861306341837, "grad_norm": 11.76053237915039, "learning_rate": 1.9993140354289795e-05, "loss": 2.2002, "step": 1700 }, { "epoch": 0.00036358531478913566, "grad_norm": 15.234793663024902, "learning_rate": 1.9992736370443494e-05, "loss": 2.1741, "step": 1800 }, { "epoch": 0.0003837844989440876, "grad_norm": 16.69493293762207, "learning_rate": 1.999233238659719e-05, "loss": 2.0676, "step": 1900 }, { "epoch": 0.00040398368309903964, "grad_norm": 26.615434646606445, "learning_rate": 1.999192840275089e-05, "loss": 2.3167, "step": 2000 }, { "epoch": 0.00040398368309903964, "eval_calculated_loss": 8.013657569885254, "eval_loss": 2.2550275325775146, "eval_perplexity": 3021.9499169678766, "eval_runtime": 112.2141, "eval_samples_per_second": 8.894, "eval_steps_per_second": 2.228, "step": 2000 }, { "epoch": 0.0004241828672539916, "grad_norm": 19.514629364013672, "learning_rate": 1.9991524418904585e-05, "loss": 2.1699, "step": 2100 }, { "epoch": 0.0004443820514089436, "grad_norm": 10.794641494750977, "learning_rate": 1.9991120435058284e-05, "loss": 2.1946, "step": 2200 }, { "epoch": 0.0004645812355638956, "grad_norm": 14.217839241027832, "learning_rate": 1.9990716451211983e-05, "loss": 2.2505, "step": 2300 }, { "epoch": 0.00048478041971884754, "grad_norm": 15.775816917419434, "learning_rate": 1.9990312467365682e-05, "loss": 2.1256, "step": 2400 }, { "epoch": 0.0005049796038737995, "grad_norm": 22.818836212158203, "learning_rate": 1.998990848351938e-05, "loss": 2.1466, "step": 2500 }, { "epoch": 0.0005251787880287515, "grad_norm": 11.562753677368164, "learning_rate": 1.9989504499673077e-05, "loss": 2.1966, "step": 2600 }, { "epoch": 0.0005453779721837035, "grad_norm": 15.451811790466309, "learning_rate": 1.9989100515826776e-05, "loss": 2.1711, "step": 2700 }, { "epoch": 0.0005655771563386555, "grad_norm": 13.218891143798828, "learning_rate": 1.9988696531980475e-05, "loss": 2.1963, "step": 2800 }, { "epoch": 0.0005857763404936075, "grad_norm": 19.49493980407715, "learning_rate": 1.998829254813417e-05, "loss": 2.1485, "step": 2900 }, { "epoch": 0.0006059755246485594, "grad_norm": 24.019865036010742, "learning_rate": 1.998788856428787e-05, "loss": 2.2345, "step": 3000 }, { "epoch": 0.0006059755246485594, "eval_calculated_loss": 8.221076965332031, "eval_loss": 2.240705966949463, "eval_perplexity": 3718.504927717358, "eval_runtime": 110.0706, "eval_samples_per_second": 9.067, "eval_steps_per_second": 2.271, "step": 3000 }, { "epoch": 0.0006261747088035114, "grad_norm": 19.43185806274414, "learning_rate": 1.9987484580441566e-05, "loss": 2.1975, "step": 3100 }, { "epoch": 0.0006463738929584634, "grad_norm": 15.528987884521484, "learning_rate": 1.9987080596595265e-05, "loss": 2.1599, "step": 3200 }, { "epoch": 0.0006665730771134154, "grad_norm": 21.222124099731445, "learning_rate": 1.9986676612748964e-05, "loss": 2.2313, "step": 3300 }, { "epoch": 0.0006867722612683674, "grad_norm": 15.070540428161621, "learning_rate": 1.9986272628902663e-05, "loss": 2.1972, "step": 3400 }, { "epoch": 0.0007069714454233194, "grad_norm": 15.285359382629395, "learning_rate": 1.9985868645056362e-05, "loss": 2.1384, "step": 3500 }, { "epoch": 0.0007271706295782713, "grad_norm": 14.576860427856445, "learning_rate": 1.9985464661210058e-05, "loss": 2.1814, "step": 3600 }, { "epoch": 0.0007473698137332233, "grad_norm": 20.040666580200195, "learning_rate": 1.9985060677363757e-05, "loss": 2.1974, "step": 3700 }, { "epoch": 0.0007675689978881752, "grad_norm": 19.252782821655273, "learning_rate": 1.9984656693517453e-05, "loss": 2.1569, "step": 3800 }, { "epoch": 0.0007877681820431273, "grad_norm": 15.978367805480957, "learning_rate": 1.9984252709671152e-05, "loss": 2.1051, "step": 3900 }, { "epoch": 0.0008079673661980793, "grad_norm": 13.873286247253418, "learning_rate": 1.998384872582485e-05, "loss": 2.2285, "step": 4000 }, { "epoch": 0.0008079673661980793, "eval_calculated_loss": 8.216127395629883, "eval_loss": 2.2571475505828857, "eval_perplexity": 3700.1454017479055, "eval_runtime": 111.3523, "eval_samples_per_second": 8.963, "eval_steps_per_second": 2.245, "step": 4000 }, { "epoch": 0.0008281665503530312, "grad_norm": 22.396608352661133, "learning_rate": 1.9983444741978547e-05, "loss": 2.1719, "step": 4100 }, { "epoch": 0.0008483657345079832, "grad_norm": 13.320940017700195, "learning_rate": 1.9983040758132246e-05, "loss": 2.2068, "step": 4200 }, { "epoch": 0.0008685649186629352, "grad_norm": 20.007034301757812, "learning_rate": 1.9982636774285945e-05, "loss": 2.1399, "step": 4300 }, { "epoch": 0.0008887641028178872, "grad_norm": 12.159279823303223, "learning_rate": 1.9982232790439644e-05, "loss": 2.1292, "step": 4400 }, { "epoch": 0.0009089632869728392, "grad_norm": 15.521031379699707, "learning_rate": 1.9981828806593344e-05, "loss": 2.0842, "step": 4500 }, { "epoch": 0.0009291624711277912, "grad_norm": 12.431594848632812, "learning_rate": 1.998142482274704e-05, "loss": 2.015, "step": 4600 }, { "epoch": 0.0009493616552827431, "grad_norm": 17.10834312438965, "learning_rate": 1.998102083890074e-05, "loss": 2.2095, "step": 4700 }, { "epoch": 0.0009695608394376951, "grad_norm": 8.77836799621582, "learning_rate": 1.9980616855054434e-05, "loss": 2.137, "step": 4800 }, { "epoch": 0.000989760023592647, "grad_norm": 12.702131271362305, "learning_rate": 1.9980212871208133e-05, "loss": 2.1431, "step": 4900 }, { "epoch": 0.001009959207747599, "grad_norm": 15.861292839050293, "learning_rate": 1.9979808887361832e-05, "loss": 2.1324, "step": 5000 }, { "epoch": 0.001009959207747599, "eval_calculated_loss": 8.174903869628906, "eval_loss": 2.2724368572235107, "eval_perplexity": 3550.713576123446, "eval_runtime": 111.5924, "eval_samples_per_second": 8.943, "eval_steps_per_second": 2.24, "step": 5000 }, { "epoch": 0.001030158391902551, "grad_norm": 14.1941556930542, "learning_rate": 1.9979404903515528e-05, "loss": 2.0905, "step": 5100 }, { "epoch": 0.001050357576057503, "grad_norm": 17.618680953979492, "learning_rate": 1.9979000919669227e-05, "loss": 2.2055, "step": 5200 }, { "epoch": 0.0010705567602124551, "grad_norm": 8.960135459899902, "learning_rate": 1.9978596935822926e-05, "loss": 2.1484, "step": 5300 }, { "epoch": 0.001090755944367407, "grad_norm": 12.967617988586426, "learning_rate": 1.9978192951976626e-05, "loss": 2.1458, "step": 5400 }, { "epoch": 0.001110955128522359, "grad_norm": 7.830617904663086, "learning_rate": 1.997778896813032e-05, "loss": 2.1513, "step": 5500 }, { "epoch": 0.001131154312677311, "grad_norm": 20.921924591064453, "learning_rate": 1.997738498428402e-05, "loss": 2.1403, "step": 5600 }, { "epoch": 0.001151353496832263, "grad_norm": 12.650078773498535, "learning_rate": 1.997698100043772e-05, "loss": 2.1335, "step": 5700 }, { "epoch": 0.001171552680987215, "grad_norm": 12.180737495422363, "learning_rate": 1.9976577016591415e-05, "loss": 2.166, "step": 5800 }, { "epoch": 0.001191751865142167, "grad_norm": 21.112455368041992, "learning_rate": 1.9976173032745114e-05, "loss": 2.2004, "step": 5900 }, { "epoch": 0.0012119510492971189, "grad_norm": 14.288626670837402, "learning_rate": 1.9975769048898814e-05, "loss": 2.1718, "step": 6000 }, { "epoch": 0.0012119510492971189, "eval_calculated_loss": 8.323880195617676, "eval_loss": 2.2688639163970947, "eval_perplexity": 4121.119775633763, "eval_runtime": 111.8907, "eval_samples_per_second": 8.919, "eval_steps_per_second": 2.234, "step": 6000 }, { "epoch": 0.0012321502334520708, "grad_norm": 15.341775894165039, "learning_rate": 1.997536506505251e-05, "loss": 2.1305, "step": 6100 }, { "epoch": 0.0012523494176070228, "grad_norm": 14.539010047912598, "learning_rate": 1.997496108120621e-05, "loss": 2.0739, "step": 6200 }, { "epoch": 0.0012725486017619748, "grad_norm": 14.811124801635742, "learning_rate": 1.9974557097359904e-05, "loss": 2.0979, "step": 6300 }, { "epoch": 0.0012927477859169267, "grad_norm": 14.591126441955566, "learning_rate": 1.9974153113513607e-05, "loss": 2.1376, "step": 6400 }, { "epoch": 0.001312946970071879, "grad_norm": 10.39746379852295, "learning_rate": 1.9973749129667302e-05, "loss": 2.1622, "step": 6500 }, { "epoch": 0.0013331461542268309, "grad_norm": 16.617488861083984, "learning_rate": 1.9973345145821e-05, "loss": 2.1278, "step": 6600 }, { "epoch": 0.0013533453383817828, "grad_norm": 11.628128051757812, "learning_rate": 1.99729411619747e-05, "loss": 2.1003, "step": 6700 }, { "epoch": 0.0013735445225367348, "grad_norm": 13.467378616333008, "learning_rate": 1.9972537178128396e-05, "loss": 2.0524, "step": 6800 }, { "epoch": 0.0013937437066916867, "grad_norm": 18.945878982543945, "learning_rate": 1.9972133194282096e-05, "loss": 2.0721, "step": 6900 }, { "epoch": 0.0014139428908466387, "grad_norm": 19.877120971679688, "learning_rate": 1.997172921043579e-05, "loss": 2.1042, "step": 7000 }, { "epoch": 0.0014139428908466387, "eval_calculated_loss": 7.999070167541504, "eval_loss": 2.270808219909668, "eval_perplexity": 2978.1874837994965, "eval_runtime": 112.2636, "eval_samples_per_second": 8.89, "eval_steps_per_second": 2.227, "step": 7000 }, { "epoch": 0.0014341420750015907, "grad_norm": 15.634549140930176, "learning_rate": 1.997132522658949e-05, "loss": 2.0542, "step": 7100 }, { "epoch": 0.0014543412591565426, "grad_norm": 24.16468048095703, "learning_rate": 1.997092124274319e-05, "loss": 2.0379, "step": 7200 }, { "epoch": 0.0014745404433114946, "grad_norm": 9.60893726348877, "learning_rate": 1.9970517258896885e-05, "loss": 2.0216, "step": 7300 }, { "epoch": 0.0014947396274664466, "grad_norm": 17.191179275512695, "learning_rate": 1.9970113275050584e-05, "loss": 2.069, "step": 7400 }, { "epoch": 0.0015149388116213985, "grad_norm": 14.517882347106934, "learning_rate": 1.9969709291204284e-05, "loss": 2.1824, "step": 7500 }, { "epoch": 0.0015351379957763505, "grad_norm": 18.14604377746582, "learning_rate": 1.9969305307357983e-05, "loss": 2.1703, "step": 7600 }, { "epoch": 0.0015553371799313027, "grad_norm": 10.891212463378906, "learning_rate": 1.9968901323511682e-05, "loss": 2.0912, "step": 7700 }, { "epoch": 0.0015755363640862546, "grad_norm": 12.502071380615234, "learning_rate": 1.9968497339665378e-05, "loss": 2.1163, "step": 7800 }, { "epoch": 0.0015957355482412066, "grad_norm": 16.225780487060547, "learning_rate": 1.9968093355819077e-05, "loss": 2.1911, "step": 7900 }, { "epoch": 0.0016159347323961586, "grad_norm": 12.36819839477539, "learning_rate": 1.9967689371972772e-05, "loss": 2.0413, "step": 8000 }, { "epoch": 0.0016159347323961586, "eval_calculated_loss": 8.131377220153809, "eval_loss": 2.236302375793457, "eval_perplexity": 3399.478172517282, "eval_runtime": 112.6477, "eval_samples_per_second": 8.859, "eval_steps_per_second": 2.219, "step": 8000 }, { "epoch": 0.0016361339165511105, "grad_norm": 22.086515426635742, "learning_rate": 1.996728538812647e-05, "loss": 2.1228, "step": 8100 }, { "epoch": 0.0016563331007060625, "grad_norm": 11.615912437438965, "learning_rate": 1.996688140428017e-05, "loss": 2.1545, "step": 8200 }, { "epoch": 0.0016765322848610144, "grad_norm": 17.12405776977539, "learning_rate": 1.9966477420433866e-05, "loss": 2.1091, "step": 8300 }, { "epoch": 0.0016967314690159664, "grad_norm": 15.868494033813477, "learning_rate": 1.9966073436587566e-05, "loss": 2.1557, "step": 8400 }, { "epoch": 0.0017169306531709184, "grad_norm": 14.905299186706543, "learning_rate": 1.9965669452741265e-05, "loss": 2.1224, "step": 8500 }, { "epoch": 0.0017371298373258703, "grad_norm": 8.594021797180176, "learning_rate": 1.9965265468894964e-05, "loss": 2.0612, "step": 8600 }, { "epoch": 0.0017573290214808223, "grad_norm": 10.705985069274902, "learning_rate": 1.996486148504866e-05, "loss": 2.0761, "step": 8700 }, { "epoch": 0.0017775282056357745, "grad_norm": 13.676326751708984, "learning_rate": 1.996445750120236e-05, "loss": 2.0293, "step": 8800 }, { "epoch": 0.0017977273897907264, "grad_norm": 11.932296752929688, "learning_rate": 1.9964053517356058e-05, "loss": 2.1927, "step": 8900 }, { "epoch": 0.0018179265739456784, "grad_norm": 18.86102294921875, "learning_rate": 1.9963649533509753e-05, "loss": 2.0697, "step": 9000 }, { "epoch": 0.0018179265739456784, "eval_calculated_loss": 8.520842552185059, "eval_loss": 2.2402846813201904, "eval_perplexity": 5018.280138967454, "eval_runtime": 114.3336, "eval_samples_per_second": 8.729, "eval_steps_per_second": 2.187, "step": 9000 }, { "epoch": 0.0018381257581006304, "grad_norm": 17.254270553588867, "learning_rate": 1.9963245549663453e-05, "loss": 2.076, "step": 9100 }, { "epoch": 0.0018583249422555823, "grad_norm": 11.008255958557129, "learning_rate": 1.9962841565817152e-05, "loss": 2.0997, "step": 9200 }, { "epoch": 0.0018785241264105343, "grad_norm": 17.414348602294922, "learning_rate": 1.9962437581970847e-05, "loss": 2.1021, "step": 9300 }, { "epoch": 0.0018987233105654863, "grad_norm": 9.155817985534668, "learning_rate": 1.9962033598124547e-05, "loss": 2.0387, "step": 9400 }, { "epoch": 0.0019189224947204382, "grad_norm": 18.29210090637207, "learning_rate": 1.9961629614278246e-05, "loss": 2.2233, "step": 9500 }, { "epoch": 0.0019391216788753902, "grad_norm": 19.88136100769043, "learning_rate": 1.9961225630431945e-05, "loss": 2.1061, "step": 9600 }, { "epoch": 0.001959320863030342, "grad_norm": 17.7725830078125, "learning_rate": 1.996082164658564e-05, "loss": 2.1102, "step": 9700 }, { "epoch": 0.001979520047185294, "grad_norm": 21.010528564453125, "learning_rate": 1.996041766273934e-05, "loss": 2.0979, "step": 9800 }, { "epoch": 0.001999719231340246, "grad_norm": 14.356607437133789, "learning_rate": 1.996001367889304e-05, "loss": 2.139, "step": 9900 }, { "epoch": 0.002019918415495198, "grad_norm": 17.772459030151367, "learning_rate": 1.9959609695046735e-05, "loss": 2.1305, "step": 10000 }, { "epoch": 0.002019918415495198, "eval_calculated_loss": 8.188838005065918, "eval_loss": 2.2532951831817627, "eval_perplexity": 3600.536010137951, "eval_runtime": 112.3018, "eval_samples_per_second": 8.887, "eval_steps_per_second": 2.226, "step": 10000 }, { "epoch": 0.00204011759965015, "grad_norm": 16.128952026367188, "learning_rate": 1.9959205711200434e-05, "loss": 2.0798, "step": 10100 }, { "epoch": 0.002060316783805102, "grad_norm": 11.240351676940918, "learning_rate": 1.9958801727354133e-05, "loss": 2.1108, "step": 10200 }, { "epoch": 0.002080515967960054, "grad_norm": 19.229631423950195, "learning_rate": 1.995839774350783e-05, "loss": 2.1032, "step": 10300 }, { "epoch": 0.002100715152115006, "grad_norm": 8.384011268615723, "learning_rate": 1.9957993759661528e-05, "loss": 2.1229, "step": 10400 }, { "epoch": 0.002120914336269958, "grad_norm": 8.813220977783203, "learning_rate": 1.9957589775815223e-05, "loss": 1.9492, "step": 10500 }, { "epoch": 0.0021411135204249102, "grad_norm": 12.245566368103027, "learning_rate": 1.9957185791968926e-05, "loss": 2.1693, "step": 10600 }, { "epoch": 0.002161312704579862, "grad_norm": 15.503602981567383, "learning_rate": 1.9956781808122622e-05, "loss": 2.1765, "step": 10700 }, { "epoch": 0.002181511888734814, "grad_norm": 11.552753448486328, "learning_rate": 1.995637782427632e-05, "loss": 2.0278, "step": 10800 }, { "epoch": 0.002201711072889766, "grad_norm": 13.686413764953613, "learning_rate": 1.995597384043002e-05, "loss": 2.0879, "step": 10900 }, { "epoch": 0.002221910257044718, "grad_norm": 12.884256362915039, "learning_rate": 1.9955569856583716e-05, "loss": 2.0529, "step": 11000 }, { "epoch": 0.002221910257044718, "eval_calculated_loss": 8.2538423538208, "eval_loss": 2.2255513668060303, "eval_perplexity": 3842.3612052896897, "eval_runtime": 111.533, "eval_samples_per_second": 8.948, "eval_steps_per_second": 2.241, "step": 11000 }, { "epoch": 0.00224210944119967, "grad_norm": 11.18860912322998, "learning_rate": 1.9955165872737415e-05, "loss": 2.0589, "step": 11100 }, { "epoch": 0.002262308625354622, "grad_norm": 15.950249671936035, "learning_rate": 1.995476188889111e-05, "loss": 2.1229, "step": 11200 }, { "epoch": 0.002282507809509574, "grad_norm": 17.725683212280273, "learning_rate": 1.995435790504481e-05, "loss": 2.1181, "step": 11300 }, { "epoch": 0.002302706993664526, "grad_norm": 16.85702133178711, "learning_rate": 1.995395392119851e-05, "loss": 2.1371, "step": 11400 }, { "epoch": 0.002322906177819478, "grad_norm": 11.12031364440918, "learning_rate": 1.9953549937352205e-05, "loss": 2.0366, "step": 11500 }, { "epoch": 0.00234310536197443, "grad_norm": 18.28437614440918, "learning_rate": 1.9953145953505907e-05, "loss": 2.0218, "step": 11600 }, { "epoch": 0.002363304546129382, "grad_norm": 16.387279510498047, "learning_rate": 1.9952741969659603e-05, "loss": 2.1128, "step": 11700 }, { "epoch": 0.002383503730284334, "grad_norm": 13.82940673828125, "learning_rate": 1.9952337985813302e-05, "loss": 2.1075, "step": 11800 }, { "epoch": 0.0024037029144392858, "grad_norm": 20.83962059020996, "learning_rate": 1.9951934001966998e-05, "loss": 2.0045, "step": 11900 }, { "epoch": 0.0024239020985942377, "grad_norm": 16.14495086669922, "learning_rate": 1.9951530018120697e-05, "loss": 2.1294, "step": 12000 }, { "epoch": 0.0024239020985942377, "eval_calculated_loss": 8.375478744506836, "eval_loss": 2.224027633666992, "eval_perplexity": 4339.345218968523, "eval_runtime": 111.9712, "eval_samples_per_second": 8.913, "eval_steps_per_second": 2.233, "step": 12000 }, { "epoch": 0.0024441012827491897, "grad_norm": 16.718814849853516, "learning_rate": 1.9951126034274396e-05, "loss": 2.0798, "step": 12100 }, { "epoch": 0.0024643004669041416, "grad_norm": 16.511919021606445, "learning_rate": 1.9950722050428092e-05, "loss": 2.137, "step": 12200 }, { "epoch": 0.0024844996510590936, "grad_norm": 9.742353439331055, "learning_rate": 1.995031806658179e-05, "loss": 2.1007, "step": 12300 }, { "epoch": 0.0025046988352140456, "grad_norm": 19.58485221862793, "learning_rate": 1.994991408273549e-05, "loss": 2.1079, "step": 12400 }, { "epoch": 0.0025248980193689975, "grad_norm": 13.82087516784668, "learning_rate": 1.9949510098889186e-05, "loss": 2.0136, "step": 12500 }, { "epoch": 0.0025450972035239495, "grad_norm": 18.029054641723633, "learning_rate": 1.9949106115042888e-05, "loss": 2.0646, "step": 12600 }, { "epoch": 0.0025652963876789015, "grad_norm": 17.239728927612305, "learning_rate": 1.9948702131196584e-05, "loss": 2.0334, "step": 12700 }, { "epoch": 0.0025854955718338534, "grad_norm": 8.519083023071289, "learning_rate": 1.9948298147350283e-05, "loss": 2.0673, "step": 12800 }, { "epoch": 0.002605694755988806, "grad_norm": 14.200940132141113, "learning_rate": 1.994789416350398e-05, "loss": 2.0958, "step": 12900 }, { "epoch": 0.002625893940143758, "grad_norm": 10.00262451171875, "learning_rate": 1.9947490179657678e-05, "loss": 2.0551, "step": 13000 }, { "epoch": 0.002625893940143758, "eval_calculated_loss": 8.174849510192871, "eval_loss": 2.229931116104126, "eval_perplexity": 3550.520566581918, "eval_runtime": 111.7465, "eval_samples_per_second": 8.931, "eval_steps_per_second": 2.237, "step": 13000 }, { "epoch": 0.0026460931242987097, "grad_norm": 14.405789375305176, "learning_rate": 1.9947086195811377e-05, "loss": 2.1111, "step": 13100 }, { "epoch": 0.0026662923084536617, "grad_norm": 13.659997940063477, "learning_rate": 1.9946682211965073e-05, "loss": 2.1172, "step": 13200 }, { "epoch": 0.0026864914926086137, "grad_norm": 13.050477027893066, "learning_rate": 1.9946278228118772e-05, "loss": 2.0971, "step": 13300 }, { "epoch": 0.0027066906767635656, "grad_norm": 15.96218204498291, "learning_rate": 1.994587424427247e-05, "loss": 2.2217, "step": 13400 }, { "epoch": 0.0027268898609185176, "grad_norm": 9.249089241027832, "learning_rate": 1.9945470260426167e-05, "loss": 2.0653, "step": 13500 }, { "epoch": 0.0027470890450734696, "grad_norm": 18.265544891357422, "learning_rate": 1.9945066276579866e-05, "loss": 2.0546, "step": 13600 }, { "epoch": 0.0027672882292284215, "grad_norm": 17.93002700805664, "learning_rate": 1.9944662292733565e-05, "loss": 2.079, "step": 13700 }, { "epoch": 0.0027874874133833735, "grad_norm": 14.305792808532715, "learning_rate": 1.9944258308887264e-05, "loss": 2.0866, "step": 13800 }, { "epoch": 0.0028076865975383255, "grad_norm": 11.79248046875, "learning_rate": 1.994385432504096e-05, "loss": 2.0415, "step": 13900 }, { "epoch": 0.0028278857816932774, "grad_norm": 8.845921516418457, "learning_rate": 1.994345034119466e-05, "loss": 2.0206, "step": 14000 }, { "epoch": 0.0028278857816932774, "eval_calculated_loss": 8.099516868591309, "eval_loss": 2.218456268310547, "eval_perplexity": 3292.8767987105393, "eval_runtime": 108.735, "eval_samples_per_second": 9.178, "eval_steps_per_second": 2.299, "step": 14000 }, { "epoch": 0.0028480849658482294, "grad_norm": 16.03417205810547, "learning_rate": 1.9943046357348358e-05, "loss": 2.0717, "step": 14100 }, { "epoch": 0.0028682841500031813, "grad_norm": 9.276649475097656, "learning_rate": 1.9942642373502054e-05, "loss": 2.0969, "step": 14200 }, { "epoch": 0.0028884833341581333, "grad_norm": 19.115127563476562, "learning_rate": 1.9942238389655753e-05, "loss": 2.1088, "step": 14300 }, { "epoch": 0.0029086825183130853, "grad_norm": 12.772050857543945, "learning_rate": 1.994183440580945e-05, "loss": 2.2394, "step": 14400 }, { "epoch": 0.0029288817024680372, "grad_norm": 10.176413536071777, "learning_rate": 1.9941430421963148e-05, "loss": 2.0571, "step": 14500 }, { "epoch": 0.002949080886622989, "grad_norm": 18.262754440307617, "learning_rate": 1.9941026438116847e-05, "loss": 2.0258, "step": 14600 }, { "epoch": 0.002969280070777941, "grad_norm": 13.194957733154297, "learning_rate": 1.9940622454270546e-05, "loss": 2.124, "step": 14700 }, { "epoch": 0.002989479254932893, "grad_norm": 16.792816162109375, "learning_rate": 1.9940218470424245e-05, "loss": 2.1189, "step": 14800 }, { "epoch": 0.003009678439087845, "grad_norm": 11.42923641204834, "learning_rate": 1.993981448657794e-05, "loss": 2.0965, "step": 14900 }, { "epoch": 0.003029877623242797, "grad_norm": 10.031203269958496, "learning_rate": 1.993941050273164e-05, "loss": 2.0235, "step": 15000 }, { "epoch": 0.003029877623242797, "eval_calculated_loss": 8.181089401245117, "eval_loss": 2.2253119945526123, "eval_perplexity": 3572.7446940529394, "eval_runtime": 112.0588, "eval_samples_per_second": 8.906, "eval_steps_per_second": 2.231, "step": 15000 }, { "epoch": 0.003050076807397749, "grad_norm": 14.893531799316406, "learning_rate": 1.993900651888534e-05, "loss": 2.026, "step": 15100 }, { "epoch": 0.003070275991552701, "grad_norm": 15.90219783782959, "learning_rate": 1.9938602535039035e-05, "loss": 2.1375, "step": 15200 }, { "epoch": 0.0030904751757076534, "grad_norm": 14.414021492004395, "learning_rate": 1.9938198551192734e-05, "loss": 2.1384, "step": 15300 }, { "epoch": 0.0031106743598626053, "grad_norm": 17.65363121032715, "learning_rate": 1.993779456734643e-05, "loss": 2.069, "step": 15400 }, { "epoch": 0.0031308735440175573, "grad_norm": 14.43399715423584, "learning_rate": 1.993739058350013e-05, "loss": 2.1005, "step": 15500 }, { "epoch": 0.0031510727281725093, "grad_norm": 15.697698593139648, "learning_rate": 1.9936986599653828e-05, "loss": 2.0597, "step": 15600 }, { "epoch": 0.0031712719123274612, "grad_norm": 10.32991886138916, "learning_rate": 1.9936582615807524e-05, "loss": 2.0348, "step": 15700 }, { "epoch": 0.003191471096482413, "grad_norm": 16.40020179748535, "learning_rate": 1.9936178631961226e-05, "loss": 2.1552, "step": 15800 }, { "epoch": 0.003211670280637365, "grad_norm": 10.746498107910156, "learning_rate": 1.9935774648114922e-05, "loss": 2.0655, "step": 15900 }, { "epoch": 0.003231869464792317, "grad_norm": 15.481362342834473, "learning_rate": 1.993537066426862e-05, "loss": 2.0509, "step": 16000 }, { "epoch": 0.003231869464792317, "eval_calculated_loss": 8.235836029052734, "eval_loss": 2.221240282058716, "eval_perplexity": 3773.7935796362035, "eval_runtime": 111.107, "eval_samples_per_second": 8.982, "eval_steps_per_second": 2.25, "step": 16000 }, { "epoch": 0.003252068648947269, "grad_norm": 16.55597686767578, "learning_rate": 1.9934966680422317e-05, "loss": 2.0392, "step": 16100 }, { "epoch": 0.003272267833102221, "grad_norm": 13.35317611694336, "learning_rate": 1.9934562696576016e-05, "loss": 2.0851, "step": 16200 }, { "epoch": 0.003292467017257173, "grad_norm": 9.272472381591797, "learning_rate": 1.9934158712729715e-05, "loss": 2.0969, "step": 16300 }, { "epoch": 0.003312666201412125, "grad_norm": 14.339399337768555, "learning_rate": 1.993375472888341e-05, "loss": 2.0203, "step": 16400 }, { "epoch": 0.003332865385567077, "grad_norm": 18.10075569152832, "learning_rate": 1.993335074503711e-05, "loss": 2.1243, "step": 16500 }, { "epoch": 0.003353064569722029, "grad_norm": 16.49703025817871, "learning_rate": 1.993294676119081e-05, "loss": 2.1295, "step": 16600 }, { "epoch": 0.003373263753876981, "grad_norm": 16.95433807373047, "learning_rate": 1.9932542777344505e-05, "loss": 2.0563, "step": 16700 }, { "epoch": 0.003393462938031933, "grad_norm": 8.706789016723633, "learning_rate": 1.9932138793498204e-05, "loss": 2.0232, "step": 16800 }, { "epoch": 0.0034136621221868848, "grad_norm": 13.800054550170898, "learning_rate": 1.9931734809651903e-05, "loss": 2.0338, "step": 16900 }, { "epoch": 0.0034338613063418367, "grad_norm": 12.451580047607422, "learning_rate": 1.9931330825805602e-05, "loss": 2.0215, "step": 17000 }, { "epoch": 0.0034338613063418367, "eval_calculated_loss": 8.189416885375977, "eval_loss": 2.2341392040252686, "eval_perplexity": 3602.620892930455, "eval_runtime": 111.0276, "eval_samples_per_second": 8.989, "eval_steps_per_second": 2.252, "step": 17000 }, { "epoch": 0.0034540604904967887, "grad_norm": 16.189687728881836, "learning_rate": 1.9930926841959298e-05, "loss": 2.027, "step": 17100 }, { "epoch": 0.0034742596746517407, "grad_norm": 14.01804256439209, "learning_rate": 1.9930522858112997e-05, "loss": 2.1159, "step": 17200 }, { "epoch": 0.0034944588588066926, "grad_norm": 11.502208709716797, "learning_rate": 1.9930118874266696e-05, "loss": 2.1222, "step": 17300 }, { "epoch": 0.0035146580429616446, "grad_norm": 17.18349838256836, "learning_rate": 1.9929714890420392e-05, "loss": 2.0262, "step": 17400 }, { "epoch": 0.0035348572271165966, "grad_norm": 17.982885360717773, "learning_rate": 1.992931090657409e-05, "loss": 2.0751, "step": 17500 }, { "epoch": 0.003555056411271549, "grad_norm": 13.454726219177246, "learning_rate": 1.9928906922727787e-05, "loss": 2.032, "step": 17600 }, { "epoch": 0.003575255595426501, "grad_norm": 20.344341278076172, "learning_rate": 1.9928502938881486e-05, "loss": 2.1274, "step": 17700 }, { "epoch": 0.003595454779581453, "grad_norm": 12.8789701461792, "learning_rate": 1.9928098955035185e-05, "loss": 2.1513, "step": 17800 }, { "epoch": 0.003615653963736405, "grad_norm": 11.671751022338867, "learning_rate": 1.9927694971188884e-05, "loss": 2.0416, "step": 17900 }, { "epoch": 0.003635853147891357, "grad_norm": 12.824463844299316, "learning_rate": 1.9927290987342584e-05, "loss": 2.0001, "step": 18000 }, { "epoch": 0.003635853147891357, "eval_calculated_loss": 8.412993431091309, "eval_loss": 2.2174038887023926, "eval_perplexity": 4505.22643171546, "eval_runtime": 111.2355, "eval_samples_per_second": 8.972, "eval_steps_per_second": 2.247, "step": 18000 }, { "epoch": 0.0036560523320463088, "grad_norm": 9.710125923156738, "learning_rate": 1.992688700349628e-05, "loss": 2.0267, "step": 18100 }, { "epoch": 0.0036762515162012607, "grad_norm": 9.904170036315918, "learning_rate": 1.992648301964998e-05, "loss": 1.9607, "step": 18200 }, { "epoch": 0.0036964507003562127, "grad_norm": 16.79892349243164, "learning_rate": 1.9926079035803678e-05, "loss": 2.1698, "step": 18300 }, { "epoch": 0.0037166498845111647, "grad_norm": 10.15634536743164, "learning_rate": 1.9925675051957373e-05, "loss": 2.0241, "step": 18400 }, { "epoch": 0.0037368490686661166, "grad_norm": 12.359416007995605, "learning_rate": 1.9925271068111072e-05, "loss": 1.99, "step": 18500 }, { "epoch": 0.0037570482528210686, "grad_norm": 19.242616653442383, "learning_rate": 1.9924867084264768e-05, "loss": 2.0002, "step": 18600 }, { "epoch": 0.0037772474369760205, "grad_norm": 8.066484451293945, "learning_rate": 1.9924463100418467e-05, "loss": 1.9825, "step": 18700 }, { "epoch": 0.0037974466211309725, "grad_norm": 9.346282005310059, "learning_rate": 1.9924059116572166e-05, "loss": 2.0471, "step": 18800 }, { "epoch": 0.0038176458052859245, "grad_norm": 15.94672679901123, "learning_rate": 1.9923655132725866e-05, "loss": 2.0958, "step": 18900 }, { "epoch": 0.0038378449894408764, "grad_norm": 17.419845581054688, "learning_rate": 1.9923251148879565e-05, "loss": 2.0392, "step": 19000 }, { "epoch": 0.0038378449894408764, "eval_calculated_loss": 8.109663009643555, "eval_loss": 2.215837001800537, "eval_perplexity": 3326.4568571103264, "eval_runtime": 111.8065, "eval_samples_per_second": 8.926, "eval_steps_per_second": 2.236, "step": 19000 }, { "epoch": 0.0038580441735958284, "grad_norm": 11.516481399536133, "learning_rate": 1.992284716503326e-05, "loss": 2.0646, "step": 19100 }, { "epoch": 0.0038782433577507804, "grad_norm": 13.174059867858887, "learning_rate": 1.992244318118696e-05, "loss": 2.0514, "step": 19200 }, { "epoch": 0.0038984425419057323, "grad_norm": 8.800369262695312, "learning_rate": 1.9922039197340655e-05, "loss": 2.1226, "step": 19300 }, { "epoch": 0.003918641726060684, "grad_norm": 12.778773307800293, "learning_rate": 1.9921635213494354e-05, "loss": 2.1521, "step": 19400 }, { "epoch": 0.003938840910215636, "grad_norm": 12.775114059448242, "learning_rate": 1.9921231229648054e-05, "loss": 2.0402, "step": 19500 }, { "epoch": 0.003959040094370588, "grad_norm": 9.759988784790039, "learning_rate": 1.992082724580175e-05, "loss": 2.0527, "step": 19600 }, { "epoch": 0.00397923927852554, "grad_norm": 12.398292541503906, "learning_rate": 1.992042326195545e-05, "loss": 2.1076, "step": 19700 }, { "epoch": 0.003999438462680492, "grad_norm": 11.28762149810791, "learning_rate": 1.9920019278109148e-05, "loss": 2.0985, "step": 19800 }, { "epoch": 0.004019637646835444, "grad_norm": 8.985289573669434, "learning_rate": 1.9919615294262847e-05, "loss": 2.0234, "step": 19900 }, { "epoch": 0.004039836830990396, "grad_norm": 11.115205764770508, "learning_rate": 1.9919211310416546e-05, "loss": 2.0466, "step": 20000 }, { "epoch": 0.004039836830990396, "eval_calculated_loss": 8.2469482421875, "eval_loss": 2.198920726776123, "eval_perplexity": 3815.9626400902202, "eval_runtime": 112.1642, "eval_samples_per_second": 8.898, "eval_steps_per_second": 2.229, "step": 20000 }, { "epoch": 0.004060036015145348, "grad_norm": 15.438053131103516, "learning_rate": 1.991880732657024e-05, "loss": 2.0324, "step": 20100 }, { "epoch": 0.0040802351993003, "grad_norm": 12.605445861816406, "learning_rate": 1.991840334272394e-05, "loss": 2.067, "step": 20200 }, { "epoch": 0.004100434383455252, "grad_norm": 12.566864967346191, "learning_rate": 1.9917999358877636e-05, "loss": 2.1261, "step": 20300 }, { "epoch": 0.004120633567610204, "grad_norm": 16.848543167114258, "learning_rate": 1.9917595375031336e-05, "loss": 2.0191, "step": 20400 }, { "epoch": 0.004140832751765156, "grad_norm": 12.748955726623535, "learning_rate": 1.9917191391185035e-05, "loss": 2.1113, "step": 20500 }, { "epoch": 0.004161031935920108, "grad_norm": 9.857080459594727, "learning_rate": 1.991678740733873e-05, "loss": 2.0861, "step": 20600 }, { "epoch": 0.00418123112007506, "grad_norm": 7.566249370574951, "learning_rate": 1.991638342349243e-05, "loss": 1.9787, "step": 20700 }, { "epoch": 0.004201430304230012, "grad_norm": 15.660636901855469, "learning_rate": 1.991597943964613e-05, "loss": 2.081, "step": 20800 }, { "epoch": 0.004221629488384964, "grad_norm": 16.804439544677734, "learning_rate": 1.9915575455799828e-05, "loss": 2.0816, "step": 20900 }, { "epoch": 0.004241828672539916, "grad_norm": 13.673198699951172, "learning_rate": 1.9915171471953524e-05, "loss": 2.1496, "step": 21000 }, { "epoch": 0.004241828672539916, "eval_calculated_loss": 8.21810245513916, "eval_loss": 2.215019941329956, "eval_perplexity": 3707.460630737652, "eval_runtime": 111.3205, "eval_samples_per_second": 8.965, "eval_steps_per_second": 2.246, "step": 21000 }, { "epoch": 0.0042620278566948685, "grad_norm": 15.613951683044434, "learning_rate": 1.9914767488107223e-05, "loss": 2.0086, "step": 21100 }, { "epoch": 0.0042822270408498205, "grad_norm": 9.769705772399902, "learning_rate": 1.9914363504260922e-05, "loss": 2.0269, "step": 21200 }, { "epoch": 0.0043024262250047724, "grad_norm": 10.865740776062012, "learning_rate": 1.9913959520414617e-05, "loss": 2.0507, "step": 21300 }, { "epoch": 0.004322625409159724, "grad_norm": 13.432695388793945, "learning_rate": 1.9913555536568317e-05, "loss": 2.0853, "step": 21400 }, { "epoch": 0.004342824593314676, "grad_norm": 16.049549102783203, "learning_rate": 1.9913151552722016e-05, "loss": 2.0132, "step": 21500 }, { "epoch": 0.004363023777469628, "grad_norm": 10.666481971740723, "learning_rate": 1.991274756887571e-05, "loss": 2.052, "step": 21600 }, { "epoch": 0.00438322296162458, "grad_norm": 20.86705207824707, "learning_rate": 1.991234358502941e-05, "loss": 2.1067, "step": 21700 }, { "epoch": 0.004403422145779532, "grad_norm": 11.019103050231934, "learning_rate": 1.9911939601183106e-05, "loss": 2.1009, "step": 21800 }, { "epoch": 0.004423621329934484, "grad_norm": 17.342477798461914, "learning_rate": 1.9911535617336805e-05, "loss": 2.1241, "step": 21900 }, { "epoch": 0.004443820514089436, "grad_norm": 8.50845718383789, "learning_rate": 1.9911131633490505e-05, "loss": 2.0563, "step": 22000 }, { "epoch": 0.004443820514089436, "eval_calculated_loss": 8.346796035766602, "eval_loss": 2.2326948642730713, "eval_perplexity": 4216.649084379963, "eval_runtime": 112.2501, "eval_samples_per_second": 8.891, "eval_steps_per_second": 2.227, "step": 22000 }, { "epoch": 0.004464019698244388, "grad_norm": 15.097211837768555, "learning_rate": 1.9910727649644204e-05, "loss": 2.0801, "step": 22100 }, { "epoch": 0.00448421888239934, "grad_norm": 14.364411354064941, "learning_rate": 1.9910323665797903e-05, "loss": 2.0924, "step": 22200 }, { "epoch": 0.004504418066554292, "grad_norm": 17.62268829345703, "learning_rate": 1.99099196819516e-05, "loss": 2.0123, "step": 22300 }, { "epoch": 0.004524617250709244, "grad_norm": 9.558741569519043, "learning_rate": 1.9909515698105298e-05, "loss": 2.1025, "step": 22400 }, { "epoch": 0.004544816434864196, "grad_norm": 14.964478492736816, "learning_rate": 1.9909111714258993e-05, "loss": 2.0792, "step": 22500 }, { "epoch": 0.004565015619019148, "grad_norm": 14.342308044433594, "learning_rate": 1.9908707730412693e-05, "loss": 2.0226, "step": 22600 }, { "epoch": 0.0045852148031741, "grad_norm": 10.554933547973633, "learning_rate": 1.9908303746566392e-05, "loss": 2.0069, "step": 22700 }, { "epoch": 0.004605413987329052, "grad_norm": 13.65635871887207, "learning_rate": 1.9907899762720087e-05, "loss": 2.0231, "step": 22800 }, { "epoch": 0.004625613171484004, "grad_norm": 20.441370010375977, "learning_rate": 1.9907495778873787e-05, "loss": 2.0678, "step": 22900 }, { "epoch": 0.004645812355638956, "grad_norm": 19.751319885253906, "learning_rate": 1.9907091795027486e-05, "loss": 1.9919, "step": 23000 }, { "epoch": 0.004645812355638956, "eval_calculated_loss": 8.435108184814453, "eval_loss": 2.195997953414917, "eval_perplexity": 4605.968239131692, "eval_runtime": 111.9995, "eval_samples_per_second": 8.911, "eval_steps_per_second": 2.232, "step": 23000 }, { "epoch": 0.004666011539793908, "grad_norm": 11.68442153930664, "learning_rate": 1.9906687811181185e-05, "loss": 2.0259, "step": 23100 }, { "epoch": 0.00468621072394886, "grad_norm": 14.436915397644043, "learning_rate": 1.9906283827334884e-05, "loss": 2.0469, "step": 23200 }, { "epoch": 0.004706409908103812, "grad_norm": 18.566843032836914, "learning_rate": 1.990587984348858e-05, "loss": 2.06, "step": 23300 }, { "epoch": 0.004726609092258764, "grad_norm": 15.728690147399902, "learning_rate": 1.990547585964228e-05, "loss": 2.1111, "step": 23400 }, { "epoch": 0.004746808276413716, "grad_norm": 17.367023468017578, "learning_rate": 1.9905071875795975e-05, "loss": 2.1095, "step": 23500 }, { "epoch": 0.004767007460568668, "grad_norm": 8.298946380615234, "learning_rate": 1.9904667891949674e-05, "loss": 2.1248, "step": 23600 }, { "epoch": 0.0047872066447236196, "grad_norm": 12.979907035827637, "learning_rate": 1.9904263908103373e-05, "loss": 1.9756, "step": 23700 }, { "epoch": 0.0048074058288785715, "grad_norm": 16.8901424407959, "learning_rate": 1.990385992425707e-05, "loss": 2.0506, "step": 23800 }, { "epoch": 0.0048276050130335235, "grad_norm": 14.642086029052734, "learning_rate": 1.9903455940410768e-05, "loss": 2.0, "step": 23900 }, { "epoch": 0.0048478041971884754, "grad_norm": 18.58380699157715, "learning_rate": 1.9903051956564467e-05, "loss": 2.0825, "step": 24000 }, { "epoch": 0.0048478041971884754, "eval_calculated_loss": 8.425640106201172, "eval_loss": 2.2331039905548096, "eval_perplexity": 4562.564369620386, "eval_runtime": 113.6124, "eval_samples_per_second": 8.784, "eval_steps_per_second": 2.2, "step": 24000 }, { "epoch": 0.004868003381343427, "grad_norm": 17.74709129333496, "learning_rate": 1.9902647972718166e-05, "loss": 2.0431, "step": 24100 }, { "epoch": 0.004888202565498379, "grad_norm": 10.769476890563965, "learning_rate": 1.9902243988871862e-05, "loss": 2.0158, "step": 24200 }, { "epoch": 0.004908401749653331, "grad_norm": 14.74028491973877, "learning_rate": 1.990184000502556e-05, "loss": 2.0442, "step": 24300 }, { "epoch": 0.004928600933808283, "grad_norm": 15.694328308105469, "learning_rate": 1.990143602117926e-05, "loss": 1.967, "step": 24400 }, { "epoch": 0.004948800117963235, "grad_norm": 14.124894142150879, "learning_rate": 1.9901032037332956e-05, "loss": 2.1077, "step": 24500 }, { "epoch": 0.004968999302118187, "grad_norm": 11.142488479614258, "learning_rate": 1.9900628053486655e-05, "loss": 2.1287, "step": 24600 }, { "epoch": 0.004989198486273139, "grad_norm": 13.29405403137207, "learning_rate": 1.9900224069640354e-05, "loss": 2.0456, "step": 24700 }, { "epoch": 0.005009397670428091, "grad_norm": 10.714554786682129, "learning_rate": 1.989982008579405e-05, "loss": 2.0172, "step": 24800 }, { "epoch": 0.005029596854583043, "grad_norm": 21.411832809448242, "learning_rate": 1.989941610194775e-05, "loss": 2.1151, "step": 24900 }, { "epoch": 0.005049796038737995, "grad_norm": 10.372939109802246, "learning_rate": 1.9899012118101445e-05, "loss": 2.0068, "step": 25000 }, { "epoch": 0.005049796038737995, "eval_calculated_loss": 8.302364349365234, "eval_loss": 2.2044384479522705, "eval_perplexity": 4033.39748985258, "eval_runtime": 111.7648, "eval_samples_per_second": 8.929, "eval_steps_per_second": 2.237, "step": 25000 }, { "epoch": 0.005069995222892947, "grad_norm": 8.303022384643555, "learning_rate": 1.9898608134255147e-05, "loss": 2.0627, "step": 25100 }, { "epoch": 0.005090194407047899, "grad_norm": 14.745580673217773, "learning_rate": 1.9898204150408843e-05, "loss": 2.0271, "step": 25200 }, { "epoch": 0.005110393591202851, "grad_norm": 15.371898651123047, "learning_rate": 1.9897800166562542e-05, "loss": 2.0291, "step": 25300 }, { "epoch": 0.005130592775357803, "grad_norm": 12.00602912902832, "learning_rate": 1.989739618271624e-05, "loss": 1.9788, "step": 25400 }, { "epoch": 0.005150791959512755, "grad_norm": 12.994361877441406, "learning_rate": 1.9896992198869937e-05, "loss": 2.0105, "step": 25500 }, { "epoch": 0.005170991143667707, "grad_norm": 13.48690128326416, "learning_rate": 1.9896588215023636e-05, "loss": 1.9981, "step": 25600 }, { "epoch": 0.005191190327822659, "grad_norm": 12.47059440612793, "learning_rate": 1.9896184231177335e-05, "loss": 2.0286, "step": 25700 }, { "epoch": 0.005211389511977612, "grad_norm": 11.680514335632324, "learning_rate": 1.989578024733103e-05, "loss": 2.1054, "step": 25800 }, { "epoch": 0.005231588696132564, "grad_norm": 10.08999252319336, "learning_rate": 1.989537626348473e-05, "loss": 2.0495, "step": 25900 }, { "epoch": 0.005251787880287516, "grad_norm": 13.10912036895752, "learning_rate": 1.9894972279638426e-05, "loss": 2.0916, "step": 26000 }, { "epoch": 0.005251787880287516, "eval_calculated_loss": 8.35053825378418, "eval_loss": 2.1956231594085693, "eval_perplexity": 4232.4582668114335, "eval_runtime": 111.1901, "eval_samples_per_second": 8.976, "eval_steps_per_second": 2.248, "step": 26000 }, { "epoch": 0.0052719870644424675, "grad_norm": 13.165558815002441, "learning_rate": 1.9894568295792128e-05, "loss": 2.0134, "step": 26100 }, { "epoch": 0.0052921862485974195, "grad_norm": 10.724847793579102, "learning_rate": 1.9894164311945824e-05, "loss": 2.0007, "step": 26200 }, { "epoch": 0.0053123854327523715, "grad_norm": 23.102054595947266, "learning_rate": 1.9893760328099523e-05, "loss": 2.0538, "step": 26300 }, { "epoch": 0.005332584616907323, "grad_norm": 14.941573143005371, "learning_rate": 1.9893356344253222e-05, "loss": 2.0079, "step": 26400 }, { "epoch": 0.005352783801062275, "grad_norm": 12.683263778686523, "learning_rate": 1.9892952360406918e-05, "loss": 2.0223, "step": 26500 }, { "epoch": 0.005372982985217227, "grad_norm": 14.855751037597656, "learning_rate": 1.9892548376560617e-05, "loss": 2.0197, "step": 26600 }, { "epoch": 0.005393182169372179, "grad_norm": 7.569177150726318, "learning_rate": 1.9892144392714313e-05, "loss": 2.0679, "step": 26700 }, { "epoch": 0.005413381353527131, "grad_norm": 15.061573028564453, "learning_rate": 1.9891740408868012e-05, "loss": 2.0398, "step": 26800 }, { "epoch": 0.005433580537682083, "grad_norm": 12.878744125366211, "learning_rate": 1.989133642502171e-05, "loss": 2.0799, "step": 26900 }, { "epoch": 0.005453779721837035, "grad_norm": 18.369998931884766, "learning_rate": 1.9890932441175407e-05, "loss": 2.037, "step": 27000 }, { "epoch": 0.005453779721837035, "eval_calculated_loss": 8.307608604431152, "eval_loss": 2.1971194744110107, "eval_perplexity": 4054.60521582887, "eval_runtime": 111.4278, "eval_samples_per_second": 8.956, "eval_steps_per_second": 2.244, "step": 27000 }, { "epoch": 0.005473978905991987, "grad_norm": 14.205077171325684, "learning_rate": 1.9890528457329106e-05, "loss": 2.0845, "step": 27100 }, { "epoch": 0.005494178090146939, "grad_norm": 15.075255393981934, "learning_rate": 1.9890124473482805e-05, "loss": 2.015, "step": 27200 }, { "epoch": 0.005514377274301891, "grad_norm": 12.495539665222168, "learning_rate": 1.9889720489636504e-05, "loss": 2.1176, "step": 27300 }, { "epoch": 0.005534576458456843, "grad_norm": 16.419275283813477, "learning_rate": 1.98893165057902e-05, "loss": 2.0604, "step": 27400 }, { "epoch": 0.005554775642611795, "grad_norm": 17.74167251586914, "learning_rate": 1.98889125219439e-05, "loss": 2.0151, "step": 27500 }, { "epoch": 0.005574974826766747, "grad_norm": 12.319457054138184, "learning_rate": 1.9888508538097598e-05, "loss": 1.997, "step": 27600 }, { "epoch": 0.005595174010921699, "grad_norm": 13.761833190917969, "learning_rate": 1.9888104554251294e-05, "loss": 2.0795, "step": 27700 }, { "epoch": 0.005615373195076651, "grad_norm": 8.775877952575684, "learning_rate": 1.9887700570404993e-05, "loss": 2.0347, "step": 27800 }, { "epoch": 0.005635572379231603, "grad_norm": 10.419814109802246, "learning_rate": 1.9887296586558692e-05, "loss": 2.06, "step": 27900 }, { "epoch": 0.005655771563386555, "grad_norm": 14.39529800415039, "learning_rate": 1.9886892602712388e-05, "loss": 2.0416, "step": 28000 }, { "epoch": 0.005655771563386555, "eval_calculated_loss": 8.302370071411133, "eval_loss": 2.1887247562408447, "eval_perplexity": 4033.420569204174, "eval_runtime": 110.8663, "eval_samples_per_second": 9.002, "eval_steps_per_second": 2.255, "step": 28000 }, { "epoch": 0.005675970747541507, "grad_norm": 6.437203884124756, "learning_rate": 1.9886488618866087e-05, "loss": 1.9432, "step": 28100 }, { "epoch": 0.005696169931696459, "grad_norm": 14.101344108581543, "learning_rate": 1.9886084635019786e-05, "loss": 1.9705, "step": 28200 }, { "epoch": 0.005716369115851411, "grad_norm": 8.806755065917969, "learning_rate": 1.9885680651173485e-05, "loss": 1.9599, "step": 28300 }, { "epoch": 0.005736568300006363, "grad_norm": 15.57079029083252, "learning_rate": 1.988527666732718e-05, "loss": 2.0809, "step": 28400 }, { "epoch": 0.005756767484161315, "grad_norm": 14.696571350097656, "learning_rate": 1.988487268348088e-05, "loss": 1.9478, "step": 28500 }, { "epoch": 0.005776966668316267, "grad_norm": 14.017302513122559, "learning_rate": 1.988446869963458e-05, "loss": 1.9757, "step": 28600 }, { "epoch": 0.005797165852471219, "grad_norm": 8.698016166687012, "learning_rate": 1.9884064715788275e-05, "loss": 2.0491, "step": 28700 }, { "epoch": 0.0058173650366261705, "grad_norm": 16.822898864746094, "learning_rate": 1.9883660731941974e-05, "loss": 2.0698, "step": 28800 }, { "epoch": 0.0058375642207811225, "grad_norm": 14.459988594055176, "learning_rate": 1.9883256748095673e-05, "loss": 2.0097, "step": 28900 }, { "epoch": 0.0058577634049360745, "grad_norm": 8.349377632141113, "learning_rate": 1.988285276424937e-05, "loss": 2.0603, "step": 29000 }, { "epoch": 0.0058577634049360745, "eval_calculated_loss": 8.242944717407227, "eval_loss": 2.2092907428741455, "eval_perplexity": 3800.7158798560813, "eval_runtime": 110.005, "eval_samples_per_second": 9.072, "eval_steps_per_second": 2.273, "step": 29000 }, { "epoch": 0.005877962589091026, "grad_norm": 16.197235107421875, "learning_rate": 1.9882448780403068e-05, "loss": 2.1042, "step": 29100 }, { "epoch": 0.005898161773245978, "grad_norm": 12.635161399841309, "learning_rate": 1.9882044796556767e-05, "loss": 2.0674, "step": 29200 }, { "epoch": 0.00591836095740093, "grad_norm": 7.8919219970703125, "learning_rate": 1.9881640812710466e-05, "loss": 1.9967, "step": 29300 }, { "epoch": 0.005938560141555882, "grad_norm": 11.32049560546875, "learning_rate": 1.9881236828864162e-05, "loss": 2.0076, "step": 29400 }, { "epoch": 0.005958759325710834, "grad_norm": 15.19477653503418, "learning_rate": 1.988083284501786e-05, "loss": 2.0288, "step": 29500 }, { "epoch": 0.005978958509865786, "grad_norm": 12.250726699829102, "learning_rate": 1.988042886117156e-05, "loss": 2.1007, "step": 29600 }, { "epoch": 0.005999157694020738, "grad_norm": 12.456631660461426, "learning_rate": 1.9880024877325256e-05, "loss": 2.0521, "step": 29700 }, { "epoch": 0.00601935687817569, "grad_norm": 12.784562110900879, "learning_rate": 1.9879620893478955e-05, "loss": 2.0782, "step": 29800 }, { "epoch": 0.006039556062330642, "grad_norm": 12.124025344848633, "learning_rate": 1.987921690963265e-05, "loss": 1.9623, "step": 29900 }, { "epoch": 0.006059755246485594, "grad_norm": 17.722209930419922, "learning_rate": 1.987881292578635e-05, "loss": 2.0753, "step": 30000 }, { "epoch": 0.006059755246485594, "eval_calculated_loss": 8.150581359863281, "eval_loss": 2.1909983158111572, "eval_perplexity": 3465.393120448087, "eval_runtime": 111.5725, "eval_samples_per_second": 8.945, "eval_steps_per_second": 2.241, "step": 30000 }, { "epoch": 0.006079954430640546, "grad_norm": 12.432003021240234, "learning_rate": 1.987840894194005e-05, "loss": 2.0413, "step": 30100 }, { "epoch": 0.006100153614795498, "grad_norm": 17.013704299926758, "learning_rate": 1.9878004958093745e-05, "loss": 2.1076, "step": 30200 }, { "epoch": 0.00612035279895045, "grad_norm": 12.792585372924805, "learning_rate": 1.9877600974247448e-05, "loss": 1.9939, "step": 30300 }, { "epoch": 0.006140551983105402, "grad_norm": 11.985084533691406, "learning_rate": 1.9877196990401143e-05, "loss": 2.0324, "step": 30400 }, { "epoch": 0.006160751167260355, "grad_norm": 12.927572250366211, "learning_rate": 1.9876793006554842e-05, "loss": 2.0465, "step": 30500 }, { "epoch": 0.006180950351415307, "grad_norm": 11.570544242858887, "learning_rate": 1.987638902270854e-05, "loss": 2.0255, "step": 30600 }, { "epoch": 0.006201149535570259, "grad_norm": 16.081274032592773, "learning_rate": 1.9875985038862237e-05, "loss": 2.0821, "step": 30700 }, { "epoch": 0.006221348719725211, "grad_norm": 16.33824348449707, "learning_rate": 1.9875581055015936e-05, "loss": 2.0318, "step": 30800 }, { "epoch": 0.006241547903880163, "grad_norm": 9.063811302185059, "learning_rate": 1.9875177071169632e-05, "loss": 1.959, "step": 30900 }, { "epoch": 0.006261747088035115, "grad_norm": 9.499013900756836, "learning_rate": 1.987477308732333e-05, "loss": 2.0402, "step": 31000 }, { "epoch": 0.006261747088035115, "eval_calculated_loss": 8.11413860321045, "eval_loss": 2.203726291656494, "eval_perplexity": 3341.3780918043794, "eval_runtime": 112.1905, "eval_samples_per_second": 8.896, "eval_steps_per_second": 2.228, "step": 31000 }, { "epoch": 0.0062819462721900666, "grad_norm": 10.735247611999512, "learning_rate": 1.987436910347703e-05, "loss": 2.0585, "step": 31100 }, { "epoch": 0.0063021454563450185, "grad_norm": 14.403847694396973, "learning_rate": 1.9873965119630726e-05, "loss": 2.0246, "step": 31200 }, { "epoch": 0.0063223446404999705, "grad_norm": 16.1884708404541, "learning_rate": 1.987356113578443e-05, "loss": 2.0527, "step": 31300 }, { "epoch": 0.0063425438246549224, "grad_norm": 8.576621055603027, "learning_rate": 1.9873157151938124e-05, "loss": 1.9783, "step": 31400 }, { "epoch": 0.006362743008809874, "grad_norm": 13.534814834594727, "learning_rate": 1.9872753168091824e-05, "loss": 2.0337, "step": 31500 }, { "epoch": 0.006382942192964826, "grad_norm": 16.004358291625977, "learning_rate": 1.987234918424552e-05, "loss": 2.0475, "step": 31600 }, { "epoch": 0.006403141377119778, "grad_norm": 16.217954635620117, "learning_rate": 1.987194520039922e-05, "loss": 2.0251, "step": 31700 }, { "epoch": 0.00642334056127473, "grad_norm": 11.783538818359375, "learning_rate": 1.9871541216552918e-05, "loss": 1.9801, "step": 31800 }, { "epoch": 0.006443539745429682, "grad_norm": 8.650589942932129, "learning_rate": 1.9871137232706613e-05, "loss": 2.0017, "step": 31900 }, { "epoch": 0.006463738929584634, "grad_norm": 11.255512237548828, "learning_rate": 1.9870733248860312e-05, "loss": 2.0702, "step": 32000 }, { "epoch": 0.006463738929584634, "eval_calculated_loss": 8.621613502502441, "eval_loss": 2.209042549133301, "eval_perplexity": 5550.334651567819, "eval_runtime": 111.2254, "eval_samples_per_second": 8.973, "eval_steps_per_second": 2.248, "step": 32000 }, { "epoch": 0.006483938113739586, "grad_norm": 10.08043098449707, "learning_rate": 1.987032926501401e-05, "loss": 2.0822, "step": 32100 }, { "epoch": 0.006504137297894538, "grad_norm": 10.624674797058105, "learning_rate": 1.9869925281167707e-05, "loss": 1.9679, "step": 32200 }, { "epoch": 0.00652433648204949, "grad_norm": 13.383779525756836, "learning_rate": 1.9869521297321406e-05, "loss": 2.0278, "step": 32300 }, { "epoch": 0.006544535666204442, "grad_norm": 7.623569965362549, "learning_rate": 1.9869117313475106e-05, "loss": 1.9295, "step": 32400 }, { "epoch": 0.006564734850359394, "grad_norm": 12.939375877380371, "learning_rate": 1.9868713329628805e-05, "loss": 1.9392, "step": 32500 }, { "epoch": 0.006584934034514346, "grad_norm": 8.32258129119873, "learning_rate": 1.98683093457825e-05, "loss": 2.0566, "step": 32600 }, { "epoch": 0.006605133218669298, "grad_norm": 11.53382682800293, "learning_rate": 1.98679053619362e-05, "loss": 2.0464, "step": 32700 }, { "epoch": 0.00662533240282425, "grad_norm": 13.934680938720703, "learning_rate": 1.98675013780899e-05, "loss": 2.0527, "step": 32800 }, { "epoch": 0.006645531586979202, "grad_norm": 14.294280052185059, "learning_rate": 1.9867097394243594e-05, "loss": 1.974, "step": 32900 }, { "epoch": 0.006665730771134154, "grad_norm": 14.105250358581543, "learning_rate": 1.9866693410397294e-05, "loss": 1.9726, "step": 33000 }, { "epoch": 0.006665730771134154, "eval_calculated_loss": 8.465253829956055, "eval_loss": 2.2065563201904297, "eval_perplexity": 4746.932172522866, "eval_runtime": 110.8203, "eval_samples_per_second": 9.006, "eval_steps_per_second": 2.256, "step": 33000 }, { "epoch": 0.006685929955289106, "grad_norm": 8.299141883850098, "learning_rate": 1.986628942655099e-05, "loss": 2.0388, "step": 33100 }, { "epoch": 0.006706129139444058, "grad_norm": 14.818499565124512, "learning_rate": 1.986588544270469e-05, "loss": 1.9977, "step": 33200 }, { "epoch": 0.00672632832359901, "grad_norm": 11.689865112304688, "learning_rate": 1.9865481458858388e-05, "loss": 1.9971, "step": 33300 }, { "epoch": 0.006746527507753962, "grad_norm": 11.393363952636719, "learning_rate": 1.9865077475012087e-05, "loss": 1.96, "step": 33400 }, { "epoch": 0.006766726691908914, "grad_norm": 16.217853546142578, "learning_rate": 1.9864673491165786e-05, "loss": 1.9944, "step": 33500 }, { "epoch": 0.006786925876063866, "grad_norm": 12.2222900390625, "learning_rate": 1.986426950731948e-05, "loss": 1.9036, "step": 33600 }, { "epoch": 0.006807125060218818, "grad_norm": 10.150970458984375, "learning_rate": 1.986386552347318e-05, "loss": 2.0703, "step": 33700 }, { "epoch": 0.0068273242443737696, "grad_norm": 16.8414249420166, "learning_rate": 1.986346153962688e-05, "loss": 2.0912, "step": 33800 }, { "epoch": 0.0068475234285287215, "grad_norm": 14.300328254699707, "learning_rate": 1.9863057555780575e-05, "loss": 1.9844, "step": 33900 }, { "epoch": 0.0068677226126836735, "grad_norm": 12.847623825073242, "learning_rate": 1.9862653571934275e-05, "loss": 1.9953, "step": 34000 }, { "epoch": 0.0068677226126836735, "eval_calculated_loss": 8.450090408325195, "eval_loss": 2.188781261444092, "eval_perplexity": 4675.495420114787, "eval_runtime": 112.0126, "eval_samples_per_second": 8.91, "eval_steps_per_second": 2.232, "step": 34000 }, { "epoch": 0.0068879217968386254, "grad_norm": 13.699974060058594, "learning_rate": 1.986224958808797e-05, "loss": 2.0709, "step": 34100 }, { "epoch": 0.006908120980993577, "grad_norm": 12.574729919433594, "learning_rate": 1.986184560424167e-05, "loss": 1.9678, "step": 34200 }, { "epoch": 0.006928320165148529, "grad_norm": 8.329707145690918, "learning_rate": 1.986144162039537e-05, "loss": 1.9189, "step": 34300 }, { "epoch": 0.006948519349303481, "grad_norm": 8.789560317993164, "learning_rate": 1.9861037636549068e-05, "loss": 2.0629, "step": 34400 }, { "epoch": 0.006968718533458433, "grad_norm": 11.655618667602539, "learning_rate": 1.9860633652702767e-05, "loss": 2.0447, "step": 34500 }, { "epoch": 0.006988917717613385, "grad_norm": 5.668132781982422, "learning_rate": 1.9860229668856463e-05, "loss": 2.0082, "step": 34600 }, { "epoch": 0.007009116901768337, "grad_norm": 6.784494400024414, "learning_rate": 1.9859825685010162e-05, "loss": 2.0399, "step": 34700 }, { "epoch": 0.007029316085923289, "grad_norm": 11.962926864624023, "learning_rate": 1.9859421701163857e-05, "loss": 2.0036, "step": 34800 }, { "epoch": 0.007049515270078241, "grad_norm": 13.51230239868164, "learning_rate": 1.9859017717317557e-05, "loss": 2.0163, "step": 34900 }, { "epoch": 0.007069714454233193, "grad_norm": 15.357996940612793, "learning_rate": 1.9858613733471256e-05, "loss": 2.0849, "step": 35000 }, { "epoch": 0.007069714454233193, "eval_calculated_loss": 8.485404014587402, "eval_loss": 2.1926209926605225, "eval_perplexity": 4843.553936164792, "eval_runtime": 109.7855, "eval_samples_per_second": 9.09, "eval_steps_per_second": 2.277, "step": 35000 }, { "epoch": 0.007089913638388146, "grad_norm": 16.081483840942383, "learning_rate": 1.985820974962495e-05, "loss": 2.1098, "step": 35100 }, { "epoch": 0.007110112822543098, "grad_norm": 11.002547264099121, "learning_rate": 1.985780576577865e-05, "loss": 1.9231, "step": 35200 }, { "epoch": 0.00713031200669805, "grad_norm": 22.631996154785156, "learning_rate": 1.985740178193235e-05, "loss": 1.9907, "step": 35300 }, { "epoch": 0.007150511190853002, "grad_norm": 13.509586334228516, "learning_rate": 1.985699779808605e-05, "loss": 1.9834, "step": 35400 }, { "epoch": 0.007170710375007954, "grad_norm": 10.289020538330078, "learning_rate": 1.9856593814239748e-05, "loss": 1.9555, "step": 35500 }, { "epoch": 0.007190909559162906, "grad_norm": 13.982701301574707, "learning_rate": 1.9856189830393444e-05, "loss": 2.0296, "step": 35600 }, { "epoch": 0.007211108743317858, "grad_norm": 8.867023468017578, "learning_rate": 1.9855785846547143e-05, "loss": 2.037, "step": 35700 }, { "epoch": 0.00723130792747281, "grad_norm": 7.199387550354004, "learning_rate": 1.985538186270084e-05, "loss": 2.0337, "step": 35800 }, { "epoch": 0.007251507111627762, "grad_norm": 9.168804168701172, "learning_rate": 1.9854977878854538e-05, "loss": 2.0443, "step": 35900 }, { "epoch": 0.007271706295782714, "grad_norm": 16.700153350830078, "learning_rate": 1.9854573895008237e-05, "loss": 2.0161, "step": 36000 }, { "epoch": 0.007271706295782714, "eval_calculated_loss": 8.459732055664062, "eval_loss": 2.2094264030456543, "eval_perplexity": 4720.792918432857, "eval_runtime": 112.7111, "eval_samples_per_second": 8.854, "eval_steps_per_second": 2.218, "step": 36000 }, { "epoch": 0.0072919054799376656, "grad_norm": 19.214948654174805, "learning_rate": 1.9854169911161933e-05, "loss": 1.9811, "step": 36100 }, { "epoch": 0.0073121046640926175, "grad_norm": 10.98000431060791, "learning_rate": 1.9853765927315632e-05, "loss": 1.9557, "step": 36200 }, { "epoch": 0.0073323038482475695, "grad_norm": 14.5835542678833, "learning_rate": 1.985336194346933e-05, "loss": 1.9323, "step": 36300 }, { "epoch": 0.0073525030324025215, "grad_norm": 14.409607887268066, "learning_rate": 1.9852957959623027e-05, "loss": 1.9553, "step": 36400 }, { "epoch": 0.007372702216557473, "grad_norm": 16.744993209838867, "learning_rate": 1.9852553975776726e-05, "loss": 2.0025, "step": 36500 }, { "epoch": 0.007392901400712425, "grad_norm": 10.739960670471191, "learning_rate": 1.9852149991930425e-05, "loss": 2.034, "step": 36600 }, { "epoch": 0.007413100584867377, "grad_norm": 10.741344451904297, "learning_rate": 1.9851746008084124e-05, "loss": 1.9967, "step": 36700 }, { "epoch": 0.007433299769022329, "grad_norm": 16.565181732177734, "learning_rate": 1.985134202423782e-05, "loss": 2.0993, "step": 36800 }, { "epoch": 0.007453498953177281, "grad_norm": 11.329813003540039, "learning_rate": 1.985093804039152e-05, "loss": 2.0721, "step": 36900 }, { "epoch": 0.007473698137332233, "grad_norm": 14.010645866394043, "learning_rate": 1.9850534056545218e-05, "loss": 2.0203, "step": 37000 }, { "epoch": 0.007473698137332233, "eval_calculated_loss": 8.432367324829102, "eval_loss": 2.2230653762817383, "eval_perplexity": 4593.361210034377, "eval_runtime": 111.7665, "eval_samples_per_second": 8.929, "eval_steps_per_second": 2.237, "step": 37000 }, { "epoch": 0.007493897321487185, "grad_norm": 13.896880149841309, "learning_rate": 1.9850130072698914e-05, "loss": 2.0928, "step": 37100 }, { "epoch": 0.007514096505642137, "grad_norm": 14.589510917663574, "learning_rate": 1.9849726088852613e-05, "loss": 1.9712, "step": 37200 }, { "epoch": 0.007534295689797089, "grad_norm": 12.921557426452637, "learning_rate": 1.984932210500631e-05, "loss": 1.9918, "step": 37300 }, { "epoch": 0.007554494873952041, "grad_norm": 14.445589065551758, "learning_rate": 1.9848918121160008e-05, "loss": 1.9668, "step": 37400 }, { "epoch": 0.007574694058106993, "grad_norm": 17.431726455688477, "learning_rate": 1.9848514137313707e-05, "loss": 1.961, "step": 37500 }, { "epoch": 0.007594893242261945, "grad_norm": 8.175025939941406, "learning_rate": 1.9848110153467406e-05, "loss": 1.9424, "step": 37600 }, { "epoch": 0.007615092426416897, "grad_norm": 9.766443252563477, "learning_rate": 1.9847706169621105e-05, "loss": 1.9956, "step": 37700 }, { "epoch": 0.007635291610571849, "grad_norm": 15.242705345153809, "learning_rate": 1.98473021857748e-05, "loss": 2.0642, "step": 37800 }, { "epoch": 0.007655490794726801, "grad_norm": 10.698137283325195, "learning_rate": 1.98468982019285e-05, "loss": 2.0308, "step": 37900 }, { "epoch": 0.007675689978881753, "grad_norm": 6.2672319412231445, "learning_rate": 1.9846494218082196e-05, "loss": 2.0778, "step": 38000 }, { "epoch": 0.007675689978881753, "eval_calculated_loss": 8.519001960754395, "eval_loss": 2.2016165256500244, "eval_perplexity": 5009.052030740691, "eval_runtime": 110.3599, "eval_samples_per_second": 9.043, "eval_steps_per_second": 2.265, "step": 38000 }, { "epoch": 0.007695889163036705, "grad_norm": 12.424135208129883, "learning_rate": 1.9846090234235895e-05, "loss": 1.9893, "step": 38100 }, { "epoch": 0.007716088347191657, "grad_norm": 8.223752975463867, "learning_rate": 1.9845686250389594e-05, "loss": 1.959, "step": 38200 }, { "epoch": 0.007736287531346609, "grad_norm": 10.241130828857422, "learning_rate": 1.984528226654329e-05, "loss": 2.0557, "step": 38300 }, { "epoch": 0.007756486715501561, "grad_norm": 16.2491512298584, "learning_rate": 1.984487828269699e-05, "loss": 2.0233, "step": 38400 }, { "epoch": 0.007776685899656513, "grad_norm": 12.530030250549316, "learning_rate": 1.9844474298850688e-05, "loss": 2.0474, "step": 38500 }, { "epoch": 0.007796885083811465, "grad_norm": 15.984992980957031, "learning_rate": 1.9844070315004387e-05, "loss": 2.0901, "step": 38600 }, { "epoch": 0.007817084267966417, "grad_norm": 15.04157829284668, "learning_rate": 1.9843666331158086e-05, "loss": 2.0223, "step": 38700 }, { "epoch": 0.007837283452121369, "grad_norm": 12.580413818359375, "learning_rate": 1.9843262347311782e-05, "loss": 2.1142, "step": 38800 }, { "epoch": 0.00785748263627632, "grad_norm": 15.48827838897705, "learning_rate": 1.984285836346548e-05, "loss": 2.0252, "step": 38900 }, { "epoch": 0.007877681820431272, "grad_norm": 7.944377422332764, "learning_rate": 1.9842454379619177e-05, "loss": 2.065, "step": 39000 }, { "epoch": 0.007877681820431272, "eval_calculated_loss": 8.504120826721191, "eval_loss": 2.1887786388397217, "eval_perplexity": 4935.063537811582, "eval_runtime": 111.8912, "eval_samples_per_second": 8.919, "eval_steps_per_second": 2.234, "step": 39000 }, { "epoch": 0.007897881004586224, "grad_norm": 7.970875263214111, "learning_rate": 1.9842050395772876e-05, "loss": 2.0389, "step": 39100 }, { "epoch": 0.007918080188741176, "grad_norm": 15.152000427246094, "learning_rate": 1.9841646411926575e-05, "loss": 2.027, "step": 39200 }, { "epoch": 0.007938279372896128, "grad_norm": 13.495182991027832, "learning_rate": 1.984124242808027e-05, "loss": 1.9826, "step": 39300 }, { "epoch": 0.00795847855705108, "grad_norm": 11.979135513305664, "learning_rate": 1.984083844423397e-05, "loss": 2.0177, "step": 39400 }, { "epoch": 0.007978677741206032, "grad_norm": 10.474676132202148, "learning_rate": 1.984043446038767e-05, "loss": 2.1174, "step": 39500 }, { "epoch": 0.007998876925360984, "grad_norm": 12.76319408416748, "learning_rate": 1.9840030476541368e-05, "loss": 1.9566, "step": 39600 }, { "epoch": 0.008019076109515936, "grad_norm": 13.413045883178711, "learning_rate": 1.9839626492695064e-05, "loss": 2.0434, "step": 39700 }, { "epoch": 0.008039275293670888, "grad_norm": 11.830572128295898, "learning_rate": 1.9839222508848763e-05, "loss": 2.1083, "step": 39800 }, { "epoch": 0.00805947447782584, "grad_norm": 8.898438453674316, "learning_rate": 1.9838818525002462e-05, "loss": 2.0204, "step": 39900 }, { "epoch": 0.008079673661980792, "grad_norm": 13.034883499145508, "learning_rate": 1.9838414541156158e-05, "loss": 2.0789, "step": 40000 }, { "epoch": 0.008079673661980792, "eval_calculated_loss": 8.336727142333984, "eval_loss": 2.1868879795074463, "eval_perplexity": 4174.405125963974, "eval_runtime": 111.6901, "eval_samples_per_second": 8.935, "eval_steps_per_second": 2.238, "step": 40000 }, { "epoch": 0.008099872846135744, "grad_norm": 19.909770965576172, "learning_rate": 1.9838010557309857e-05, "loss": 1.9074, "step": 40100 }, { "epoch": 0.008120072030290696, "grad_norm": 13.346073150634766, "learning_rate": 1.9837606573463556e-05, "loss": 2.0506, "step": 40200 }, { "epoch": 0.008140271214445648, "grad_norm": 17.11739730834961, "learning_rate": 1.9837202589617252e-05, "loss": 2.0479, "step": 40300 }, { "epoch": 0.0081604703986006, "grad_norm": 15.690287590026855, "learning_rate": 1.983679860577095e-05, "loss": 2.0071, "step": 40400 }, { "epoch": 0.008180669582755552, "grad_norm": 14.468833923339844, "learning_rate": 1.9836394621924647e-05, "loss": 2.0135, "step": 40500 }, { "epoch": 0.008200868766910504, "grad_norm": 20.831480026245117, "learning_rate": 1.983599063807835e-05, "loss": 2.0509, "step": 40600 }, { "epoch": 0.008221067951065456, "grad_norm": 18.10137367248535, "learning_rate": 1.9835586654232045e-05, "loss": 1.9168, "step": 40700 }, { "epoch": 0.008241267135220408, "grad_norm": 11.501856803894043, "learning_rate": 1.9835182670385744e-05, "loss": 1.9954, "step": 40800 }, { "epoch": 0.00826146631937536, "grad_norm": 11.797844886779785, "learning_rate": 1.9834778686539443e-05, "loss": 1.9613, "step": 40900 }, { "epoch": 0.008281665503530312, "grad_norm": 12.507508277893066, "learning_rate": 1.983437470269314e-05, "loss": 2.0329, "step": 41000 }, { "epoch": 0.008281665503530312, "eval_calculated_loss": 8.430296897888184, "eval_loss": 2.195378303527832, "eval_perplexity": 4583.860829551353, "eval_runtime": 112.1973, "eval_samples_per_second": 8.895, "eval_steps_per_second": 2.228, "step": 41000 }, { "epoch": 0.008301864687685264, "grad_norm": 11.455669403076172, "learning_rate": 1.9833970718846838e-05, "loss": 1.9248, "step": 41100 }, { "epoch": 0.008322063871840216, "grad_norm": 11.583222389221191, "learning_rate": 1.9833566735000537e-05, "loss": 2.0095, "step": 41200 }, { "epoch": 0.008342263055995168, "grad_norm": 11.7819185256958, "learning_rate": 1.9833162751154233e-05, "loss": 2.0711, "step": 41300 }, { "epoch": 0.00836246224015012, "grad_norm": 6.9709367752075195, "learning_rate": 1.9832758767307932e-05, "loss": 1.9689, "step": 41400 }, { "epoch": 0.008382661424305072, "grad_norm": 9.221981048583984, "learning_rate": 1.9832354783461628e-05, "loss": 2.0048, "step": 41500 }, { "epoch": 0.008402860608460024, "grad_norm": 16.439449310302734, "learning_rate": 1.9831950799615327e-05, "loss": 1.9791, "step": 41600 }, { "epoch": 0.008423059792614975, "grad_norm": 14.33887767791748, "learning_rate": 1.9831546815769026e-05, "loss": 2.0338, "step": 41700 }, { "epoch": 0.008443258976769927, "grad_norm": 12.718589782714844, "learning_rate": 1.9831142831922725e-05, "loss": 2.0347, "step": 41800 }, { "epoch": 0.00846345816092488, "grad_norm": 13.341283798217773, "learning_rate": 1.9830738848076424e-05, "loss": 1.9823, "step": 41900 }, { "epoch": 0.008483657345079831, "grad_norm": 8.024139404296875, "learning_rate": 1.983033486423012e-05, "loss": 1.9702, "step": 42000 }, { "epoch": 0.008483657345079831, "eval_calculated_loss": 8.245535850524902, "eval_loss": 2.2091779708862305, "eval_perplexity": 3810.576810618221, "eval_runtime": 111.6092, "eval_samples_per_second": 8.942, "eval_steps_per_second": 2.24, "step": 42000 }, { "epoch": 0.008503856529234783, "grad_norm": 13.12780475616455, "learning_rate": 1.982993088038382e-05, "loss": 2.0017, "step": 42100 }, { "epoch": 0.008524055713389737, "grad_norm": 11.197165489196777, "learning_rate": 1.9829526896537515e-05, "loss": 1.9657, "step": 42200 }, { "epoch": 0.008544254897544689, "grad_norm": 15.177810668945312, "learning_rate": 1.9829122912691214e-05, "loss": 2.0148, "step": 42300 }, { "epoch": 0.008564454081699641, "grad_norm": 13.956751823425293, "learning_rate": 1.9828718928844913e-05, "loss": 1.9131, "step": 42400 }, { "epoch": 0.008584653265854593, "grad_norm": 9.388537406921387, "learning_rate": 1.982831494499861e-05, "loss": 2.0096, "step": 42500 }, { "epoch": 0.008604852450009545, "grad_norm": 14.376363754272461, "learning_rate": 1.9827910961152308e-05, "loss": 2.0173, "step": 42600 }, { "epoch": 0.008625051634164497, "grad_norm": 11.57795524597168, "learning_rate": 1.9827506977306007e-05, "loss": 1.9553, "step": 42700 }, { "epoch": 0.008645250818319449, "grad_norm": 8.853484153747559, "learning_rate": 1.9827102993459706e-05, "loss": 1.99, "step": 42800 }, { "epoch": 0.0086654500024744, "grad_norm": 14.889068603515625, "learning_rate": 1.9826699009613402e-05, "loss": 2.0248, "step": 42900 }, { "epoch": 0.008685649186629353, "grad_norm": 21.62253761291504, "learning_rate": 1.98262950257671e-05, "loss": 2.0787, "step": 43000 }, { "epoch": 0.008685649186629353, "eval_calculated_loss": 8.486618995666504, "eval_loss": 2.237224578857422, "eval_perplexity": 4849.44233897748, "eval_runtime": 112.9267, "eval_samples_per_second": 8.838, "eval_steps_per_second": 2.214, "step": 43000 }, { "epoch": 0.008705848370784305, "grad_norm": 13.908287048339844, "learning_rate": 1.98258910419208e-05, "loss": 2.0259, "step": 43100 }, { "epoch": 0.008726047554939257, "grad_norm": 7.358977317810059, "learning_rate": 1.9825487058074496e-05, "loss": 2.0138, "step": 43200 }, { "epoch": 0.008746246739094209, "grad_norm": 10.591900825500488, "learning_rate": 1.9825083074228195e-05, "loss": 1.9895, "step": 43300 }, { "epoch": 0.00876644592324916, "grad_norm": 14.291583061218262, "learning_rate": 1.9824679090381894e-05, "loss": 1.887, "step": 43400 }, { "epoch": 0.008786645107404113, "grad_norm": 9.744098663330078, "learning_rate": 1.982427510653559e-05, "loss": 1.9708, "step": 43500 }, { "epoch": 0.008806844291559065, "grad_norm": 13.054421424865723, "learning_rate": 1.982387112268929e-05, "loss": 1.9866, "step": 43600 }, { "epoch": 0.008827043475714016, "grad_norm": 9.797351837158203, "learning_rate": 1.982346713884299e-05, "loss": 1.972, "step": 43700 }, { "epoch": 0.008847242659868968, "grad_norm": 12.749700546264648, "learning_rate": 1.9823063154996688e-05, "loss": 1.9404, "step": 43800 }, { "epoch": 0.00886744184402392, "grad_norm": 15.539433479309082, "learning_rate": 1.9822659171150383e-05, "loss": 2.0405, "step": 43900 }, { "epoch": 0.008887641028178872, "grad_norm": 10.18968391418457, "learning_rate": 1.9822255187304082e-05, "loss": 1.9495, "step": 44000 }, { "epoch": 0.008887641028178872, "eval_calculated_loss": 8.534730911254883, "eval_loss": 2.226102352142334, "eval_perplexity": 5088.462043101177, "eval_runtime": 114.1812, "eval_samples_per_second": 8.74, "eval_steps_per_second": 2.19, "step": 44000 }, { "epoch": 0.008907840212333824, "grad_norm": 16.962804794311523, "learning_rate": 1.982185120345778e-05, "loss": 1.9851, "step": 44100 }, { "epoch": 0.008928039396488776, "grad_norm": 13.846346855163574, "learning_rate": 1.9821447219611477e-05, "loss": 2.0, "step": 44200 }, { "epoch": 0.008948238580643728, "grad_norm": 16.351301193237305, "learning_rate": 1.9821043235765176e-05, "loss": 2.1305, "step": 44300 }, { "epoch": 0.00896843776479868, "grad_norm": 12.137742042541504, "learning_rate": 1.9820639251918876e-05, "loss": 1.8972, "step": 44400 }, { "epoch": 0.008988636948953632, "grad_norm": 13.792881965637207, "learning_rate": 1.982023526807257e-05, "loss": 1.9491, "step": 44500 }, { "epoch": 0.009008836133108584, "grad_norm": 14.181328773498535, "learning_rate": 1.981983128422627e-05, "loss": 2.0893, "step": 44600 }, { "epoch": 0.009029035317263536, "grad_norm": 13.777767181396484, "learning_rate": 1.9819427300379966e-05, "loss": 2.0353, "step": 44700 }, { "epoch": 0.009049234501418488, "grad_norm": 9.483240127563477, "learning_rate": 1.981902331653367e-05, "loss": 2.0389, "step": 44800 }, { "epoch": 0.00906943368557344, "grad_norm": 12.807628631591797, "learning_rate": 1.9818619332687364e-05, "loss": 2.0424, "step": 44900 }, { "epoch": 0.009089632869728392, "grad_norm": 9.879388809204102, "learning_rate": 1.9818215348841064e-05, "loss": 1.9705, "step": 45000 }, { "epoch": 0.009089632869728392, "eval_calculated_loss": 8.594109535217285, "eval_loss": 2.195486545562744, "eval_perplexity": 5399.758639784684, "eval_runtime": 111.9904, "eval_samples_per_second": 8.911, "eval_steps_per_second": 2.232, "step": 45000 }, { "epoch": 0.009109832053883344, "grad_norm": 13.538002014160156, "learning_rate": 1.9817811364994763e-05, "loss": 2.061, "step": 45100 }, { "epoch": 0.009130031238038296, "grad_norm": 14.195423126220703, "learning_rate": 1.981740738114846e-05, "loss": 1.9529, "step": 45200 }, { "epoch": 0.009150230422193248, "grad_norm": 10.191577911376953, "learning_rate": 1.9817003397302158e-05, "loss": 1.9345, "step": 45300 }, { "epoch": 0.0091704296063482, "grad_norm": 8.407819747924805, "learning_rate": 1.9816599413455853e-05, "loss": 1.9004, "step": 45400 }, { "epoch": 0.009190628790503152, "grad_norm": 14.963154792785645, "learning_rate": 1.9816195429609552e-05, "loss": 2.0828, "step": 45500 }, { "epoch": 0.009210827974658104, "grad_norm": 14.666624069213867, "learning_rate": 1.981579144576325e-05, "loss": 1.9822, "step": 45600 }, { "epoch": 0.009231027158813056, "grad_norm": 14.210261344909668, "learning_rate": 1.9815387461916947e-05, "loss": 2.0321, "step": 45700 }, { "epoch": 0.009251226342968008, "grad_norm": 16.217239379882812, "learning_rate": 1.981498347807065e-05, "loss": 1.9746, "step": 45800 }, { "epoch": 0.00927142552712296, "grad_norm": 13.723954200744629, "learning_rate": 1.9814579494224346e-05, "loss": 2.0189, "step": 45900 }, { "epoch": 0.009291624711277912, "grad_norm": 11.22737979888916, "learning_rate": 1.9814175510378045e-05, "loss": 1.8612, "step": 46000 }, { "epoch": 0.009291624711277912, "eval_calculated_loss": 8.585322380065918, "eval_loss": 2.205509662628174, "eval_perplexity": 5352.51798230034, "eval_runtime": 112.2672, "eval_samples_per_second": 8.89, "eval_steps_per_second": 2.227, "step": 46000 }, { "epoch": 0.009311823895432864, "grad_norm": 16.302425384521484, "learning_rate": 1.9813771526531744e-05, "loss": 2.0395, "step": 46100 }, { "epoch": 0.009332023079587816, "grad_norm": 14.061258316040039, "learning_rate": 1.981336754268544e-05, "loss": 2.031, "step": 46200 }, { "epoch": 0.009352222263742768, "grad_norm": 15.286768913269043, "learning_rate": 1.981296355883914e-05, "loss": 1.9522, "step": 46300 }, { "epoch": 0.00937242144789772, "grad_norm": 7.225491046905518, "learning_rate": 1.9812559574992834e-05, "loss": 2.0356, "step": 46400 }, { "epoch": 0.009392620632052671, "grad_norm": 12.068641662597656, "learning_rate": 1.9812155591146533e-05, "loss": 1.9616, "step": 46500 }, { "epoch": 0.009412819816207623, "grad_norm": 9.710037231445312, "learning_rate": 1.9811751607300233e-05, "loss": 1.9501, "step": 46600 }, { "epoch": 0.009433019000362575, "grad_norm": 11.472894668579102, "learning_rate": 1.981134762345393e-05, "loss": 2.0095, "step": 46700 }, { "epoch": 0.009453218184517527, "grad_norm": 15.260313987731934, "learning_rate": 1.981094363960763e-05, "loss": 1.9189, "step": 46800 }, { "epoch": 0.00947341736867248, "grad_norm": 15.02626895904541, "learning_rate": 1.9810539655761327e-05, "loss": 2.0156, "step": 46900 }, { "epoch": 0.009493616552827431, "grad_norm": 14.58222770690918, "learning_rate": 1.9810135671915026e-05, "loss": 2.0017, "step": 47000 }, { "epoch": 0.009493616552827431, "eval_calculated_loss": 8.405712127685547, "eval_loss": 2.1936798095703125, "eval_perplexity": 4472.541649467629, "eval_runtime": 111.1044, "eval_samples_per_second": 8.983, "eval_steps_per_second": 2.25, "step": 47000 }, { "epoch": 0.009513815736982383, "grad_norm": 10.097427368164062, "learning_rate": 1.980973168806872e-05, "loss": 2.0243, "step": 47100 }, { "epoch": 0.009534014921137335, "grad_norm": 6.984368801116943, "learning_rate": 1.980932770422242e-05, "loss": 2.0094, "step": 47200 }, { "epoch": 0.009554214105292287, "grad_norm": 12.490196228027344, "learning_rate": 1.980892372037612e-05, "loss": 1.9665, "step": 47300 }, { "epoch": 0.009574413289447239, "grad_norm": 17.38823127746582, "learning_rate": 1.9808519736529815e-05, "loss": 2.0373, "step": 47400 }, { "epoch": 0.009594612473602191, "grad_norm": 9.531137466430664, "learning_rate": 1.9808115752683515e-05, "loss": 1.8937, "step": 47500 }, { "epoch": 0.009614811657757143, "grad_norm": 6.860366344451904, "learning_rate": 1.9807711768837214e-05, "loss": 1.9491, "step": 47600 }, { "epoch": 0.009635010841912095, "grad_norm": 9.476030349731445, "learning_rate": 1.980730778499091e-05, "loss": 1.9683, "step": 47700 }, { "epoch": 0.009655210026067047, "grad_norm": 18.873661041259766, "learning_rate": 1.980690380114461e-05, "loss": 1.9731, "step": 47800 }, { "epoch": 0.009675409210221999, "grad_norm": 14.04172420501709, "learning_rate": 1.9806499817298308e-05, "loss": 2.0331, "step": 47900 }, { "epoch": 0.009695608394376951, "grad_norm": 9.616294860839844, "learning_rate": 1.9806095833452007e-05, "loss": 2.011, "step": 48000 }, { "epoch": 0.009695608394376951, "eval_calculated_loss": 8.475849151611328, "eval_loss": 2.212171792984009, "eval_perplexity": 4797.494836594926, "eval_runtime": 114.3476, "eval_samples_per_second": 8.728, "eval_steps_per_second": 2.186, "step": 48000 }, { "epoch": 0.009715807578531903, "grad_norm": 14.868757247924805, "learning_rate": 1.9805691849605703e-05, "loss": 1.9636, "step": 48100 }, { "epoch": 0.009736006762686855, "grad_norm": 12.546900749206543, "learning_rate": 1.9805287865759402e-05, "loss": 1.9948, "step": 48200 }, { "epoch": 0.009756205946841807, "grad_norm": 17.454397201538086, "learning_rate": 1.98048838819131e-05, "loss": 2.029, "step": 48300 }, { "epoch": 0.009776405130996759, "grad_norm": 6.154905319213867, "learning_rate": 1.9804479898066797e-05, "loss": 2.1346, "step": 48400 }, { "epoch": 0.00979660431515171, "grad_norm": 10.797252655029297, "learning_rate": 1.9804075914220496e-05, "loss": 2.0075, "step": 48500 }, { "epoch": 0.009816803499306663, "grad_norm": 12.816378593444824, "learning_rate": 1.980367193037419e-05, "loss": 2.0672, "step": 48600 }, { "epoch": 0.009837002683461615, "grad_norm": 14.031821250915527, "learning_rate": 1.980326794652789e-05, "loss": 2.0598, "step": 48700 }, { "epoch": 0.009857201867616567, "grad_norm": 15.716894149780273, "learning_rate": 1.980286396268159e-05, "loss": 2.0064, "step": 48800 }, { "epoch": 0.009877401051771519, "grad_norm": 12.84679126739502, "learning_rate": 1.980245997883529e-05, "loss": 2.0227, "step": 48900 }, { "epoch": 0.00989760023592647, "grad_norm": 15.792977333068848, "learning_rate": 1.9802055994988988e-05, "loss": 1.9647, "step": 49000 }, { "epoch": 0.00989760023592647, "eval_calculated_loss": 8.472697257995605, "eval_loss": 2.212716579437256, "eval_perplexity": 4782.397448427354, "eval_runtime": 112.7442, "eval_samples_per_second": 8.852, "eval_steps_per_second": 2.217, "step": 49000 }, { "epoch": 0.009917799420081422, "grad_norm": 12.62405014038086, "learning_rate": 1.9801652011142684e-05, "loss": 2.1005, "step": 49100 }, { "epoch": 0.009937998604236374, "grad_norm": 8.988811492919922, "learning_rate": 1.9801248027296383e-05, "loss": 1.997, "step": 49200 }, { "epoch": 0.009958197788391326, "grad_norm": 10.262998580932617, "learning_rate": 1.9800844043450082e-05, "loss": 1.8804, "step": 49300 }, { "epoch": 0.009978396972546278, "grad_norm": 13.962371826171875, "learning_rate": 1.9800440059603778e-05, "loss": 1.9967, "step": 49400 }, { "epoch": 0.00999859615670123, "grad_norm": 15.679228782653809, "learning_rate": 1.9800036075757477e-05, "loss": 1.9677, "step": 49500 }, { "epoch": 0.010018795340856182, "grad_norm": 11.444239616394043, "learning_rate": 1.9799632091911173e-05, "loss": 1.9622, "step": 49600 }, { "epoch": 0.010038994525011134, "grad_norm": 11.26085376739502, "learning_rate": 1.9799228108064872e-05, "loss": 1.9456, "step": 49700 }, { "epoch": 0.010059193709166086, "grad_norm": 9.829758644104004, "learning_rate": 1.979882412421857e-05, "loss": 1.989, "step": 49800 }, { "epoch": 0.010079392893321038, "grad_norm": 13.269829750061035, "learning_rate": 1.979842014037227e-05, "loss": 2.029, "step": 49900 }, { "epoch": 0.01009959207747599, "grad_norm": 13.821464538574219, "learning_rate": 1.979801615652597e-05, "loss": 1.9084, "step": 50000 }, { "epoch": 0.01009959207747599, "eval_calculated_loss": 8.49938678741455, "eval_loss": 2.1795763969421387, "eval_perplexity": 4911.755966052304, "eval_runtime": 114.0098, "eval_samples_per_second": 8.754, "eval_steps_per_second": 2.193, "step": 50000 }, { "epoch": 0.010119791261630942, "grad_norm": 13.649625778198242, "learning_rate": 1.9797612172679665e-05, "loss": 2.0921, "step": 50100 }, { "epoch": 0.010139990445785894, "grad_norm": 12.733220100402832, "learning_rate": 1.9797208188833364e-05, "loss": 1.9901, "step": 50200 }, { "epoch": 0.010160189629940846, "grad_norm": 10.816847801208496, "learning_rate": 1.979680420498706e-05, "loss": 2.0335, "step": 50300 }, { "epoch": 0.010180388814095798, "grad_norm": 10.008528709411621, "learning_rate": 1.979640022114076e-05, "loss": 1.9574, "step": 50400 }, { "epoch": 0.01020058799825075, "grad_norm": 8.977941513061523, "learning_rate": 1.9795996237294458e-05, "loss": 1.9658, "step": 50500 }, { "epoch": 0.010220787182405702, "grad_norm": 15.642790794372559, "learning_rate": 1.9795592253448154e-05, "loss": 1.9989, "step": 50600 }, { "epoch": 0.010240986366560654, "grad_norm": 13.312954902648926, "learning_rate": 1.9795188269601853e-05, "loss": 2.0242, "step": 50700 }, { "epoch": 0.010261185550715606, "grad_norm": 13.020797729492188, "learning_rate": 1.9794784285755552e-05, "loss": 1.9123, "step": 50800 }, { "epoch": 0.010281384734870558, "grad_norm": 14.440340042114258, "learning_rate": 1.9794380301909248e-05, "loss": 1.9407, "step": 50900 }, { "epoch": 0.01030158391902551, "grad_norm": 14.855812072753906, "learning_rate": 1.9793976318062947e-05, "loss": 1.9939, "step": 51000 }, { "epoch": 0.01030158391902551, "eval_calculated_loss": 8.50253963470459, "eval_loss": 2.197661876678467, "eval_perplexity": 4927.266420738316, "eval_runtime": 113.1438, "eval_samples_per_second": 8.821, "eval_steps_per_second": 2.21, "step": 51000 }, { "epoch": 0.010321783103180462, "grad_norm": 9.868553161621094, "learning_rate": 1.9793572334216646e-05, "loss": 2.0299, "step": 51100 }, { "epoch": 0.010341982287335414, "grad_norm": 12.750384330749512, "learning_rate": 1.9793168350370345e-05, "loss": 1.9926, "step": 51200 }, { "epoch": 0.010362181471490366, "grad_norm": 13.057293891906738, "learning_rate": 1.979276436652404e-05, "loss": 1.9867, "step": 51300 }, { "epoch": 0.010382380655645318, "grad_norm": 8.166776657104492, "learning_rate": 1.979236038267774e-05, "loss": 1.9881, "step": 51400 }, { "epoch": 0.010402579839800271, "grad_norm": 7.104085445404053, "learning_rate": 1.979195639883144e-05, "loss": 1.9665, "step": 51500 }, { "epoch": 0.010422779023955223, "grad_norm": 11.413253784179688, "learning_rate": 1.9791552414985135e-05, "loss": 1.9543, "step": 51600 }, { "epoch": 0.010442978208110175, "grad_norm": 10.005061149597168, "learning_rate": 1.9791148431138834e-05, "loss": 1.9602, "step": 51700 }, { "epoch": 0.010463177392265127, "grad_norm": 14.402604103088379, "learning_rate": 1.9790744447292533e-05, "loss": 1.9759, "step": 51800 }, { "epoch": 0.01048337657642008, "grad_norm": 17.35906219482422, "learning_rate": 1.979034046344623e-05, "loss": 2.0418, "step": 51900 }, { "epoch": 0.010503575760575031, "grad_norm": 9.023457527160645, "learning_rate": 1.9789936479599928e-05, "loss": 1.9275, "step": 52000 }, { "epoch": 0.010503575760575031, "eval_calculated_loss": 8.468161582946777, "eval_loss": 2.1913435459136963, "eval_perplexity": 4760.755165930006, "eval_runtime": 114.1116, "eval_samples_per_second": 8.746, "eval_steps_per_second": 2.191, "step": 52000 }, { "epoch": 0.010523774944729983, "grad_norm": 11.715458869934082, "learning_rate": 1.9789532495753627e-05, "loss": 1.9409, "step": 52100 }, { "epoch": 0.010543974128884935, "grad_norm": 6.026762962341309, "learning_rate": 1.9789128511907326e-05, "loss": 1.9324, "step": 52200 }, { "epoch": 0.010564173313039887, "grad_norm": 9.82934856414795, "learning_rate": 1.9788724528061022e-05, "loss": 2.0304, "step": 52300 }, { "epoch": 0.010584372497194839, "grad_norm": 12.730053901672363, "learning_rate": 1.978832054421472e-05, "loss": 1.9588, "step": 52400 }, { "epoch": 0.010604571681349791, "grad_norm": 12.848283767700195, "learning_rate": 1.978791656036842e-05, "loss": 2.0221, "step": 52500 }, { "epoch": 0.010624770865504743, "grad_norm": 17.791444778442383, "learning_rate": 1.9787512576522116e-05, "loss": 2.0495, "step": 52600 }, { "epoch": 0.010644970049659695, "grad_norm": 6.594224452972412, "learning_rate": 1.9787108592675815e-05, "loss": 1.8849, "step": 52700 }, { "epoch": 0.010665169233814647, "grad_norm": 12.514652252197266, "learning_rate": 1.978670460882951e-05, "loss": 1.9291, "step": 52800 }, { "epoch": 0.010685368417969599, "grad_norm": 8.210865020751953, "learning_rate": 1.978630062498321e-05, "loss": 1.9758, "step": 52900 }, { "epoch": 0.01070556760212455, "grad_norm": 20.003868103027344, "learning_rate": 1.978589664113691e-05, "loss": 1.968, "step": 53000 }, { "epoch": 0.01070556760212455, "eval_calculated_loss": 8.442296981811523, "eval_loss": 2.199124813079834, "eval_perplexity": 4639.198910944646, "eval_runtime": 116.1371, "eval_samples_per_second": 8.593, "eval_steps_per_second": 2.153, "step": 53000 }, { "epoch": 0.010725766786279503, "grad_norm": 14.205262184143066, "learning_rate": 1.9785492657290608e-05, "loss": 1.9423, "step": 53100 }, { "epoch": 0.010745965970434455, "grad_norm": 10.539705276489258, "learning_rate": 1.9785088673444307e-05, "loss": 1.8881, "step": 53200 }, { "epoch": 0.010766165154589407, "grad_norm": 12.530035018920898, "learning_rate": 1.9784684689598003e-05, "loss": 1.9979, "step": 53300 }, { "epoch": 0.010786364338744359, "grad_norm": 9.857118606567383, "learning_rate": 1.9784280705751702e-05, "loss": 1.9785, "step": 53400 }, { "epoch": 0.01080656352289931, "grad_norm": 20.84344482421875, "learning_rate": 1.9783876721905398e-05, "loss": 2.023, "step": 53500 }, { "epoch": 0.010826762707054263, "grad_norm": 15.949945449829102, "learning_rate": 1.9783472738059097e-05, "loss": 1.9727, "step": 53600 }, { "epoch": 0.010846961891209215, "grad_norm": 12.451590538024902, "learning_rate": 1.9783068754212796e-05, "loss": 1.7977, "step": 53700 }, { "epoch": 0.010867161075364166, "grad_norm": 17.44866943359375, "learning_rate": 1.9782664770366492e-05, "loss": 2.0331, "step": 53800 }, { "epoch": 0.010887360259519118, "grad_norm": 7.055886268615723, "learning_rate": 1.978226078652019e-05, "loss": 1.9411, "step": 53900 }, { "epoch": 0.01090755944367407, "grad_norm": 9.230459213256836, "learning_rate": 1.978185680267389e-05, "loss": 2.0076, "step": 54000 }, { "epoch": 0.01090755944367407, "eval_calculated_loss": 8.497525215148926, "eval_loss": 2.1825642585754395, "eval_perplexity": 4902.6208828172785, "eval_runtime": 116.8483, "eval_samples_per_second": 8.541, "eval_steps_per_second": 2.14, "step": 54000 }, { "epoch": 0.010927758627829022, "grad_norm": 13.88126277923584, "learning_rate": 1.978145281882759e-05, "loss": 1.9752, "step": 54100 }, { "epoch": 0.010947957811983974, "grad_norm": 8.018758773803711, "learning_rate": 1.978104883498129e-05, "loss": 2.0077, "step": 54200 }, { "epoch": 0.010968156996138926, "grad_norm": 11.512068748474121, "learning_rate": 1.9780644851134984e-05, "loss": 2.0018, "step": 54300 }, { "epoch": 0.010988356180293878, "grad_norm": 16.45362663269043, "learning_rate": 1.9780240867288683e-05, "loss": 2.032, "step": 54400 }, { "epoch": 0.01100855536444883, "grad_norm": 13.808854103088379, "learning_rate": 1.977983688344238e-05, "loss": 1.8836, "step": 54500 }, { "epoch": 0.011028754548603782, "grad_norm": 5.108974933624268, "learning_rate": 1.9779432899596078e-05, "loss": 1.9969, "step": 54600 }, { "epoch": 0.011048953732758734, "grad_norm": 9.465174674987793, "learning_rate": 1.9779028915749777e-05, "loss": 1.9947, "step": 54700 }, { "epoch": 0.011069152916913686, "grad_norm": 9.800680160522461, "learning_rate": 1.9778624931903473e-05, "loss": 1.9746, "step": 54800 }, { "epoch": 0.011089352101068638, "grad_norm": 9.729010581970215, "learning_rate": 1.9778220948057172e-05, "loss": 1.9209, "step": 54900 }, { "epoch": 0.01110955128522359, "grad_norm": 13.018278121948242, "learning_rate": 1.977781696421087e-05, "loss": 1.8762, "step": 55000 }, { "epoch": 0.01110955128522359, "eval_calculated_loss": 8.518715858459473, "eval_loss": 2.1787071228027344, "eval_perplexity": 5007.619134446546, "eval_runtime": 115.492, "eval_samples_per_second": 8.641, "eval_steps_per_second": 2.165, "step": 55000 }, { "epoch": 0.011129750469378542, "grad_norm": 13.910865783691406, "learning_rate": 1.977741298036457e-05, "loss": 2.0101, "step": 55100 }, { "epoch": 0.011149949653533494, "grad_norm": 12.283522605895996, "learning_rate": 1.9777008996518266e-05, "loss": 1.9584, "step": 55200 }, { "epoch": 0.011170148837688446, "grad_norm": 9.163033485412598, "learning_rate": 1.9776605012671965e-05, "loss": 2.0292, "step": 55300 }, { "epoch": 0.011190348021843398, "grad_norm": 11.402956008911133, "learning_rate": 1.9776201028825664e-05, "loss": 2.0388, "step": 55400 }, { "epoch": 0.01121054720599835, "grad_norm": 8.443638801574707, "learning_rate": 1.977579704497936e-05, "loss": 1.961, "step": 55500 }, { "epoch": 0.011230746390153302, "grad_norm": 12.389243125915527, "learning_rate": 1.977539306113306e-05, "loss": 1.9713, "step": 55600 }, { "epoch": 0.011250945574308254, "grad_norm": 17.078989028930664, "learning_rate": 1.977498907728676e-05, "loss": 1.9602, "step": 55700 }, { "epoch": 0.011271144758463206, "grad_norm": 8.420044898986816, "learning_rate": 1.9774585093440454e-05, "loss": 2.043, "step": 55800 }, { "epoch": 0.011291343942618158, "grad_norm": 7.513820171356201, "learning_rate": 1.9774181109594153e-05, "loss": 1.9509, "step": 55900 }, { "epoch": 0.01131154312677311, "grad_norm": 6.280425548553467, "learning_rate": 1.977377712574785e-05, "loss": 1.9763, "step": 56000 }, { "epoch": 0.01131154312677311, "eval_calculated_loss": 8.292081832885742, "eval_loss": 2.1797027587890625, "eval_perplexity": 3992.1365105879227, "eval_runtime": 117.0019, "eval_samples_per_second": 8.53, "eval_steps_per_second": 2.137, "step": 56000 }, { "epoch": 0.011331742310928062, "grad_norm": 8.747842788696289, "learning_rate": 1.9773373141901548e-05, "loss": 1.9869, "step": 56100 }, { "epoch": 0.011351941495083014, "grad_norm": 8.540254592895508, "learning_rate": 1.9772969158055247e-05, "loss": 1.9915, "step": 56200 }, { "epoch": 0.011372140679237966, "grad_norm": 15.794537544250488, "learning_rate": 1.9772565174208946e-05, "loss": 1.9508, "step": 56300 }, { "epoch": 0.011392339863392918, "grad_norm": 15.173884391784668, "learning_rate": 1.9772161190362646e-05, "loss": 1.9729, "step": 56400 }, { "epoch": 0.01141253904754787, "grad_norm": 12.197762489318848, "learning_rate": 1.977175720651634e-05, "loss": 1.9578, "step": 56500 }, { "epoch": 0.011432738231702821, "grad_norm": 14.158513069152832, "learning_rate": 1.977135322267004e-05, "loss": 1.9076, "step": 56600 }, { "epoch": 0.011452937415857773, "grad_norm": 20.657779693603516, "learning_rate": 1.977094923882374e-05, "loss": 1.9924, "step": 56700 }, { "epoch": 0.011473136600012725, "grad_norm": 15.949647903442383, "learning_rate": 1.9770545254977435e-05, "loss": 1.9376, "step": 56800 }, { "epoch": 0.011493335784167677, "grad_norm": 14.40701675415039, "learning_rate": 1.9770141271131134e-05, "loss": 2.041, "step": 56900 }, { "epoch": 0.01151353496832263, "grad_norm": 12.495902061462402, "learning_rate": 1.976973728728483e-05, "loss": 1.8719, "step": 57000 }, { "epoch": 0.01151353496832263, "eval_calculated_loss": 8.640325546264648, "eval_loss": 2.190984010696411, "eval_perplexity": 5655.170544446691, "eval_runtime": 115.3993, "eval_samples_per_second": 8.648, "eval_steps_per_second": 2.166, "step": 57000 }, { "epoch": 0.011533734152477581, "grad_norm": 11.368563652038574, "learning_rate": 1.976933330343853e-05, "loss": 1.9829, "step": 57100 }, { "epoch": 0.011553933336632533, "grad_norm": 12.83647632598877, "learning_rate": 1.976892931959223e-05, "loss": 2.0063, "step": 57200 }, { "epoch": 0.011574132520787485, "grad_norm": 19.073041915893555, "learning_rate": 1.9768525335745928e-05, "loss": 2.0488, "step": 57300 }, { "epoch": 0.011594331704942437, "grad_norm": 11.273963928222656, "learning_rate": 1.9768121351899627e-05, "loss": 2.0211, "step": 57400 }, { "epoch": 0.011614530889097389, "grad_norm": 15.034774780273438, "learning_rate": 1.9767717368053322e-05, "loss": 1.9706, "step": 57500 }, { "epoch": 0.011634730073252341, "grad_norm": 14.191093444824219, "learning_rate": 1.976731338420702e-05, "loss": 1.9575, "step": 57600 }, { "epoch": 0.011654929257407293, "grad_norm": 11.542226791381836, "learning_rate": 1.9766909400360717e-05, "loss": 2.0231, "step": 57700 }, { "epoch": 0.011675128441562245, "grad_norm": 10.606226921081543, "learning_rate": 1.9766505416514416e-05, "loss": 1.9935, "step": 57800 }, { "epoch": 0.011695327625717197, "grad_norm": 9.295251846313477, "learning_rate": 1.9766101432668116e-05, "loss": 1.9869, "step": 57900 }, { "epoch": 0.011715526809872149, "grad_norm": 15.209702491760254, "learning_rate": 1.976569744882181e-05, "loss": 1.9375, "step": 58000 }, { "epoch": 0.011715526809872149, "eval_calculated_loss": 8.495692253112793, "eval_loss": 2.1714253425598145, "eval_perplexity": 4893.642795621693, "eval_runtime": 113.9285, "eval_samples_per_second": 8.76, "eval_steps_per_second": 2.194, "step": 58000 }, { "epoch": 0.011735725994027101, "grad_norm": 12.322779655456543, "learning_rate": 1.976529346497551e-05, "loss": 1.988, "step": 58100 }, { "epoch": 0.011755925178182053, "grad_norm": 7.429297924041748, "learning_rate": 1.976488948112921e-05, "loss": 1.9349, "step": 58200 }, { "epoch": 0.011776124362337005, "grad_norm": 7.004861831665039, "learning_rate": 1.976448549728291e-05, "loss": 1.9686, "step": 58300 }, { "epoch": 0.011796323546491957, "grad_norm": 11.298835754394531, "learning_rate": 1.9764081513436604e-05, "loss": 1.9401, "step": 58400 }, { "epoch": 0.011816522730646909, "grad_norm": 11.349494934082031, "learning_rate": 1.9763677529590303e-05, "loss": 1.9553, "step": 58500 }, { "epoch": 0.01183672191480186, "grad_norm": 13.339435577392578, "learning_rate": 1.9763273545744003e-05, "loss": 2.0069, "step": 58600 }, { "epoch": 0.011856921098956813, "grad_norm": 16.372365951538086, "learning_rate": 1.97628695618977e-05, "loss": 1.9649, "step": 58700 }, { "epoch": 0.011877120283111765, "grad_norm": 11.735032081604004, "learning_rate": 1.9762465578051397e-05, "loss": 2.0024, "step": 58800 }, { "epoch": 0.011897319467266717, "grad_norm": 14.197381973266602, "learning_rate": 1.9762061594205097e-05, "loss": 1.9758, "step": 58900 }, { "epoch": 0.011917518651421669, "grad_norm": 11.596463203430176, "learning_rate": 1.9761657610358792e-05, "loss": 1.8878, "step": 59000 }, { "epoch": 0.011917518651421669, "eval_calculated_loss": 8.694454193115234, "eval_loss": 2.183953285217285, "eval_perplexity": 5969.713367650796, "eval_runtime": 114.5923, "eval_samples_per_second": 8.709, "eval_steps_per_second": 2.182, "step": 59000 }, { "epoch": 0.01193771783557662, "grad_norm": 14.160085678100586, "learning_rate": 1.976125362651249e-05, "loss": 1.9555, "step": 59100 }, { "epoch": 0.011957917019731572, "grad_norm": 18.621742248535156, "learning_rate": 1.9760849642666187e-05, "loss": 1.9336, "step": 59200 }, { "epoch": 0.011978116203886524, "grad_norm": 10.191388130187988, "learning_rate": 1.976044565881989e-05, "loss": 2.0337, "step": 59300 }, { "epoch": 0.011998315388041476, "grad_norm": 13.028531074523926, "learning_rate": 1.9760041674973585e-05, "loss": 2.0268, "step": 59400 }, { "epoch": 0.012018514572196428, "grad_norm": 16.464231491088867, "learning_rate": 1.9759637691127285e-05, "loss": 1.9877, "step": 59500 }, { "epoch": 0.01203871375635138, "grad_norm": 9.454071998596191, "learning_rate": 1.9759233707280984e-05, "loss": 1.983, "step": 59600 }, { "epoch": 0.012058912940506332, "grad_norm": 19.99734878540039, "learning_rate": 1.975882972343468e-05, "loss": 2.0351, "step": 59700 }, { "epoch": 0.012079112124661284, "grad_norm": 12.699873924255371, "learning_rate": 1.975842573958838e-05, "loss": 1.9606, "step": 59800 }, { "epoch": 0.012099311308816236, "grad_norm": 13.97014331817627, "learning_rate": 1.9758021755742078e-05, "loss": 2.0167, "step": 59900 }, { "epoch": 0.012119510492971188, "grad_norm": 10.598409652709961, "learning_rate": 1.9757617771895773e-05, "loss": 1.892, "step": 60000 }, { "epoch": 0.012119510492971188, "eval_calculated_loss": 8.524048805236816, "eval_loss": 2.178335189819336, "eval_perplexity": 5034.395836674214, "eval_runtime": 115.3351, "eval_samples_per_second": 8.653, "eval_steps_per_second": 2.168, "step": 60000 }, { "epoch": 0.01213970967712614, "grad_norm": 8.735347747802734, "learning_rate": 1.9757213788049473e-05, "loss": 1.9138, "step": 60100 }, { "epoch": 0.012159908861281092, "grad_norm": 10.524636268615723, "learning_rate": 1.975680980420317e-05, "loss": 1.9252, "step": 60200 }, { "epoch": 0.012180108045436044, "grad_norm": 8.53785228729248, "learning_rate": 1.975640582035687e-05, "loss": 1.9821, "step": 60300 }, { "epoch": 0.012200307229590996, "grad_norm": 9.305787086486816, "learning_rate": 1.9756001836510567e-05, "loss": 2.0047, "step": 60400 }, { "epoch": 0.012220506413745948, "grad_norm": 10.733519554138184, "learning_rate": 1.9755597852664266e-05, "loss": 1.9871, "step": 60500 }, { "epoch": 0.0122407055979009, "grad_norm": 10.296757698059082, "learning_rate": 1.9755193868817965e-05, "loss": 1.93, "step": 60600 }, { "epoch": 0.012260904782055852, "grad_norm": 8.372322082519531, "learning_rate": 1.975478988497166e-05, "loss": 2.0422, "step": 60700 }, { "epoch": 0.012281103966210804, "grad_norm": 8.790604591369629, "learning_rate": 1.975438590112536e-05, "loss": 1.9758, "step": 60800 }, { "epoch": 0.012301303150365758, "grad_norm": 11.030494689941406, "learning_rate": 1.9753981917279055e-05, "loss": 1.9124, "step": 60900 }, { "epoch": 0.01232150233452071, "grad_norm": 11.18093490600586, "learning_rate": 1.9753577933432755e-05, "loss": 1.9605, "step": 61000 }, { "epoch": 0.01232150233452071, "eval_calculated_loss": 8.611672401428223, "eval_loss": 2.185499668121338, "eval_perplexity": 5495.431564520953, "eval_runtime": 115.0816, "eval_samples_per_second": 8.672, "eval_steps_per_second": 2.172, "step": 61000 }, { "epoch": 0.012341701518675662, "grad_norm": 15.449681282043457, "learning_rate": 1.9753173949586454e-05, "loss": 2.0362, "step": 61100 }, { "epoch": 0.012361900702830613, "grad_norm": 11.129823684692383, "learning_rate": 1.975276996574015e-05, "loss": 1.904, "step": 61200 }, { "epoch": 0.012382099886985565, "grad_norm": 12.128745079040527, "learning_rate": 1.9752365981893852e-05, "loss": 1.9833, "step": 61300 }, { "epoch": 0.012402299071140517, "grad_norm": 10.286901473999023, "learning_rate": 1.9751961998047548e-05, "loss": 1.9893, "step": 61400 }, { "epoch": 0.01242249825529547, "grad_norm": 13.499273300170898, "learning_rate": 1.9751558014201247e-05, "loss": 1.9016, "step": 61500 }, { "epoch": 0.012442697439450421, "grad_norm": 11.576932907104492, "learning_rate": 1.9751154030354943e-05, "loss": 2.0175, "step": 61600 }, { "epoch": 0.012462896623605373, "grad_norm": 13.734441757202148, "learning_rate": 1.9750750046508642e-05, "loss": 2.0636, "step": 61700 }, { "epoch": 0.012483095807760325, "grad_norm": 15.676130294799805, "learning_rate": 1.975034606266234e-05, "loss": 1.9046, "step": 61800 }, { "epoch": 0.012503294991915277, "grad_norm": 7.811785697937012, "learning_rate": 1.9749942078816037e-05, "loss": 1.9893, "step": 61900 }, { "epoch": 0.01252349417607023, "grad_norm": 14.226452827453613, "learning_rate": 1.9749538094969736e-05, "loss": 1.9549, "step": 62000 }, { "epoch": 0.01252349417607023, "eval_calculated_loss": 8.477521896362305, "eval_loss": 2.182260513305664, "eval_perplexity": 4805.526536519848, "eval_runtime": 116.0678, "eval_samples_per_second": 8.598, "eval_steps_per_second": 2.154, "step": 62000 }, { "epoch": 0.012543693360225181, "grad_norm": 17.040544509887695, "learning_rate": 1.9749134111123435e-05, "loss": 1.9463, "step": 62100 }, { "epoch": 0.012563892544380133, "grad_norm": 14.19808292388916, "learning_rate": 1.974873012727713e-05, "loss": 2.0043, "step": 62200 }, { "epoch": 0.012584091728535085, "grad_norm": 7.704442977905273, "learning_rate": 1.974832614343083e-05, "loss": 1.8626, "step": 62300 }, { "epoch": 0.012604290912690037, "grad_norm": 11.587897300720215, "learning_rate": 1.974792215958453e-05, "loss": 1.9122, "step": 62400 }, { "epoch": 0.012624490096844989, "grad_norm": 5.792624473571777, "learning_rate": 1.9747518175738228e-05, "loss": 1.9244, "step": 62500 }, { "epoch": 0.012644689280999941, "grad_norm": 8.93792724609375, "learning_rate": 1.9747114191891924e-05, "loss": 2.0611, "step": 62600 }, { "epoch": 0.012664888465154893, "grad_norm": 8.672538757324219, "learning_rate": 1.9746710208045623e-05, "loss": 1.9185, "step": 62700 }, { "epoch": 0.012685087649309845, "grad_norm": 6.317646503448486, "learning_rate": 1.9746306224199322e-05, "loss": 1.9221, "step": 62800 }, { "epoch": 0.012705286833464797, "grad_norm": 8.404720306396484, "learning_rate": 1.9745902240353018e-05, "loss": 2.0157, "step": 62900 }, { "epoch": 0.012725486017619749, "grad_norm": 12.48885726928711, "learning_rate": 1.9745498256506717e-05, "loss": 1.9315, "step": 63000 }, { "epoch": 0.012725486017619749, "eval_calculated_loss": 8.567061424255371, "eval_loss": 2.1793267726898193, "eval_perplexity": 5255.662912471723, "eval_runtime": 116.8883, "eval_samples_per_second": 8.538, "eval_steps_per_second": 2.139, "step": 63000 }, { "epoch": 0.0127456852017747, "grad_norm": 16.113243103027344, "learning_rate": 1.9745094272660416e-05, "loss": 2.0046, "step": 63100 }, { "epoch": 0.012765884385929653, "grad_norm": 10.5829439163208, "learning_rate": 1.9744690288814112e-05, "loss": 1.8776, "step": 63200 }, { "epoch": 0.012786083570084605, "grad_norm": 10.938643455505371, "learning_rate": 1.974428630496781e-05, "loss": 1.9124, "step": 63300 }, { "epoch": 0.012806282754239557, "grad_norm": 15.219682693481445, "learning_rate": 1.974388232112151e-05, "loss": 1.9617, "step": 63400 }, { "epoch": 0.012826481938394509, "grad_norm": 9.992905616760254, "learning_rate": 1.974347833727521e-05, "loss": 1.9286, "step": 63500 }, { "epoch": 0.01284668112254946, "grad_norm": 12.800617218017578, "learning_rate": 1.9743074353428905e-05, "loss": 1.9813, "step": 63600 }, { "epoch": 0.012866880306704413, "grad_norm": 11.809919357299805, "learning_rate": 1.9742670369582604e-05, "loss": 2.0395, "step": 63700 }, { "epoch": 0.012887079490859365, "grad_norm": 17.467103958129883, "learning_rate": 1.9742266385736303e-05, "loss": 1.9703, "step": 63800 }, { "epoch": 0.012907278675014316, "grad_norm": 13.897904396057129, "learning_rate": 1.974186240189e-05, "loss": 1.9879, "step": 63900 }, { "epoch": 0.012927477859169268, "grad_norm": 6.919246196746826, "learning_rate": 1.9741458418043698e-05, "loss": 1.9425, "step": 64000 }, { "epoch": 0.012927477859169268, "eval_calculated_loss": 8.45793342590332, "eval_loss": 2.176229238510132, "eval_perplexity": 4712.309591264832, "eval_runtime": 114.8517, "eval_samples_per_second": 8.689, "eval_steps_per_second": 2.177, "step": 64000 }, { "epoch": 0.01294767704332422, "grad_norm": 12.060063362121582, "learning_rate": 1.9741054434197394e-05, "loss": 1.8869, "step": 64100 }, { "epoch": 0.012967876227479172, "grad_norm": 12.96971607208252, "learning_rate": 1.9740650450351093e-05, "loss": 1.9175, "step": 64200 }, { "epoch": 0.012988075411634124, "grad_norm": 13.333728790283203, "learning_rate": 1.9740246466504792e-05, "loss": 1.9359, "step": 64300 }, { "epoch": 0.013008274595789076, "grad_norm": 12.01270580291748, "learning_rate": 1.973984248265849e-05, "loss": 1.9175, "step": 64400 }, { "epoch": 0.013028473779944028, "grad_norm": 14.310555458068848, "learning_rate": 1.973943849881219e-05, "loss": 1.952, "step": 64500 }, { "epoch": 0.01304867296409898, "grad_norm": 11.044120788574219, "learning_rate": 1.9739034514965886e-05, "loss": 1.8683, "step": 64600 }, { "epoch": 0.013068872148253932, "grad_norm": 22.99897003173828, "learning_rate": 1.9738630531119585e-05, "loss": 1.9106, "step": 64700 }, { "epoch": 0.013089071332408884, "grad_norm": 13.964509963989258, "learning_rate": 1.9738226547273284e-05, "loss": 2.0427, "step": 64800 }, { "epoch": 0.013109270516563836, "grad_norm": 14.350369453430176, "learning_rate": 1.973782256342698e-05, "loss": 1.9822, "step": 64900 }, { "epoch": 0.013129469700718788, "grad_norm": 8.233635902404785, "learning_rate": 1.973741857958068e-05, "loss": 1.8031, "step": 65000 }, { "epoch": 0.013129469700718788, "eval_calculated_loss": 8.57400894165039, "eval_loss": 2.1801528930664062, "eval_perplexity": 5292.30385639415, "eval_runtime": 116.3235, "eval_samples_per_second": 8.58, "eval_steps_per_second": 2.149, "step": 65000 }, { "epoch": 0.01314966888487374, "grad_norm": 6.17851448059082, "learning_rate": 1.9737014595734375e-05, "loss": 1.8861, "step": 65100 }, { "epoch": 0.013169868069028692, "grad_norm": 7.901808738708496, "learning_rate": 1.9736610611888074e-05, "loss": 2.0733, "step": 65200 }, { "epoch": 0.013190067253183644, "grad_norm": 9.012036323547363, "learning_rate": 1.9736206628041773e-05, "loss": 1.9027, "step": 65300 }, { "epoch": 0.013210266437338596, "grad_norm": 6.889059543609619, "learning_rate": 1.973580264419547e-05, "loss": 1.9437, "step": 65400 }, { "epoch": 0.013230465621493548, "grad_norm": 8.172494888305664, "learning_rate": 1.973539866034917e-05, "loss": 1.9495, "step": 65500 }, { "epoch": 0.0132506648056485, "grad_norm": 12.076960563659668, "learning_rate": 1.9734994676502867e-05, "loss": 2.0382, "step": 65600 }, { "epoch": 0.013270863989803452, "grad_norm": 8.512677192687988, "learning_rate": 1.9734590692656566e-05, "loss": 2.0384, "step": 65700 }, { "epoch": 0.013291063173958404, "grad_norm": 14.805746078491211, "learning_rate": 1.9734186708810262e-05, "loss": 2.0235, "step": 65800 }, { "epoch": 0.013311262358113356, "grad_norm": 11.315361976623535, "learning_rate": 1.973378272496396e-05, "loss": 1.9481, "step": 65900 }, { "epoch": 0.013331461542268308, "grad_norm": 11.304340362548828, "learning_rate": 1.973337874111766e-05, "loss": 1.9681, "step": 66000 }, { "epoch": 0.013331461542268308, "eval_calculated_loss": 8.679269790649414, "eval_loss": 2.1903867721557617, "eval_perplexity": 5879.75157381771, "eval_runtime": 117.5356, "eval_samples_per_second": 8.491, "eval_steps_per_second": 2.127, "step": 66000 }, { "epoch": 0.01335166072642326, "grad_norm": 6.039035797119141, "learning_rate": 1.9732974757271356e-05, "loss": 2.0009, "step": 66100 }, { "epoch": 0.013371859910578212, "grad_norm": 13.11530590057373, "learning_rate": 1.9732570773425055e-05, "loss": 2.0342, "step": 66200 }, { "epoch": 0.013392059094733164, "grad_norm": 10.649621963500977, "learning_rate": 1.9732166789578754e-05, "loss": 1.9316, "step": 66300 }, { "epoch": 0.013412258278888116, "grad_norm": 15.414111137390137, "learning_rate": 1.973176280573245e-05, "loss": 1.9911, "step": 66400 }, { "epoch": 0.013432457463043068, "grad_norm": 8.565655708312988, "learning_rate": 1.973135882188615e-05, "loss": 1.9721, "step": 66500 }, { "epoch": 0.01345265664719802, "grad_norm": 14.160994529724121, "learning_rate": 1.9730954838039848e-05, "loss": 1.9286, "step": 66600 }, { "epoch": 0.013472855831352971, "grad_norm": 9.534680366516113, "learning_rate": 1.9730550854193547e-05, "loss": 1.9231, "step": 66700 }, { "epoch": 0.013493055015507923, "grad_norm": 8.554953575134277, "learning_rate": 1.9730146870347243e-05, "loss": 1.906, "step": 66800 }, { "epoch": 0.013513254199662875, "grad_norm": 17.71461296081543, "learning_rate": 1.9729742886500942e-05, "loss": 2.0328, "step": 66900 }, { "epoch": 0.013533453383817827, "grad_norm": 15.710007667541504, "learning_rate": 1.972933890265464e-05, "loss": 1.9308, "step": 67000 }, { "epoch": 0.013533453383817827, "eval_calculated_loss": 8.566352844238281, "eval_loss": 2.1746551990509033, "eval_perplexity": 5251.9401738402485, "eval_runtime": 117.0258, "eval_samples_per_second": 8.528, "eval_steps_per_second": 2.136, "step": 67000 }, { "epoch": 0.01355365256797278, "grad_norm": 13.18702220916748, "learning_rate": 1.9728934918808337e-05, "loss": 1.9165, "step": 67100 }, { "epoch": 0.013573851752127731, "grad_norm": 7.510580539703369, "learning_rate": 1.9728530934962036e-05, "loss": 1.9554, "step": 67200 }, { "epoch": 0.013594050936282683, "grad_norm": 10.631264686584473, "learning_rate": 1.9728126951115735e-05, "loss": 1.9789, "step": 67300 }, { "epoch": 0.013614250120437635, "grad_norm": 7.961525917053223, "learning_rate": 1.972772296726943e-05, "loss": 1.8663, "step": 67400 }, { "epoch": 0.013634449304592587, "grad_norm": 8.975910186767578, "learning_rate": 1.972731898342313e-05, "loss": 1.9632, "step": 67500 }, { "epoch": 0.013654648488747539, "grad_norm": 5.693676948547363, "learning_rate": 1.972691499957683e-05, "loss": 1.8782, "step": 67600 }, { "epoch": 0.013674847672902491, "grad_norm": 9.377995491027832, "learning_rate": 1.972651101573053e-05, "loss": 1.9228, "step": 67700 }, { "epoch": 0.013695046857057443, "grad_norm": 8.463482856750488, "learning_rate": 1.9726107031884224e-05, "loss": 1.8661, "step": 67800 }, { "epoch": 0.013715246041212395, "grad_norm": 11.985474586486816, "learning_rate": 1.9725703048037923e-05, "loss": 1.9341, "step": 67900 }, { "epoch": 0.013735445225367347, "grad_norm": 13.960906982421875, "learning_rate": 1.9725299064191622e-05, "loss": 2.0204, "step": 68000 }, { "epoch": 0.013735445225367347, "eval_calculated_loss": 8.56175422668457, "eval_loss": 2.1808924674987793, "eval_perplexity": 5227.84395667352, "eval_runtime": 116.5071, "eval_samples_per_second": 8.566, "eval_steps_per_second": 2.146, "step": 68000 }, { "epoch": 0.013755644409522299, "grad_norm": 9.529396057128906, "learning_rate": 1.9724895080345318e-05, "loss": 1.9271, "step": 68100 }, { "epoch": 0.013775843593677251, "grad_norm": 17.40645980834961, "learning_rate": 1.9724491096499017e-05, "loss": 1.9728, "step": 68200 }, { "epoch": 0.013796042777832203, "grad_norm": 12.686241149902344, "learning_rate": 1.9724087112652713e-05, "loss": 1.9277, "step": 68300 }, { "epoch": 0.013816241961987155, "grad_norm": 17.979093551635742, "learning_rate": 1.9723683128806412e-05, "loss": 1.9566, "step": 68400 }, { "epoch": 0.013836441146142107, "grad_norm": 9.442341804504395, "learning_rate": 1.972327914496011e-05, "loss": 1.8405, "step": 68500 }, { "epoch": 0.013856640330297059, "grad_norm": 8.580099105834961, "learning_rate": 1.972287516111381e-05, "loss": 1.9414, "step": 68600 }, { "epoch": 0.01387683951445201, "grad_norm": 10.870562553405762, "learning_rate": 1.972247117726751e-05, "loss": 1.9278, "step": 68700 }, { "epoch": 0.013897038698606963, "grad_norm": 15.867297172546387, "learning_rate": 1.9722067193421205e-05, "loss": 1.9489, "step": 68800 }, { "epoch": 0.013917237882761915, "grad_norm": 15.138749122619629, "learning_rate": 1.9721663209574904e-05, "loss": 1.9026, "step": 68900 }, { "epoch": 0.013937437066916867, "grad_norm": 9.298121452331543, "learning_rate": 1.97212592257286e-05, "loss": 2.0112, "step": 69000 }, { "epoch": 0.013937437066916867, "eval_calculated_loss": 8.512927055358887, "eval_loss": 2.1825478076934814, "eval_perplexity": 4978.714754871577, "eval_runtime": 114.3495, "eval_samples_per_second": 8.728, "eval_steps_per_second": 2.186, "step": 69000 }, { "epoch": 0.013957636251071819, "grad_norm": 13.665872573852539, "learning_rate": 1.97208552418823e-05, "loss": 2.0459, "step": 69100 }, { "epoch": 0.01397783543522677, "grad_norm": 14.067502975463867, "learning_rate": 1.9720451258036e-05, "loss": 1.9834, "step": 69200 }, { "epoch": 0.013998034619381722, "grad_norm": 13.204691886901855, "learning_rate": 1.9720047274189694e-05, "loss": 1.9258, "step": 69300 }, { "epoch": 0.014018233803536674, "grad_norm": 14.770075798034668, "learning_rate": 1.9719643290343393e-05, "loss": 1.9349, "step": 69400 }, { "epoch": 0.014038432987691626, "grad_norm": 6.959907531738281, "learning_rate": 1.9719239306497092e-05, "loss": 1.9583, "step": 69500 }, { "epoch": 0.014058632171846578, "grad_norm": 9.348843574523926, "learning_rate": 1.971883532265079e-05, "loss": 1.8906, "step": 69600 }, { "epoch": 0.01407883135600153, "grad_norm": 12.414327621459961, "learning_rate": 1.971843133880449e-05, "loss": 1.9855, "step": 69700 }, { "epoch": 0.014099030540156482, "grad_norm": 9.491443634033203, "learning_rate": 1.9718027354958186e-05, "loss": 1.9437, "step": 69800 }, { "epoch": 0.014119229724311434, "grad_norm": 15.106160163879395, "learning_rate": 1.9717623371111886e-05, "loss": 2.0366, "step": 69900 }, { "epoch": 0.014139428908466386, "grad_norm": 8.697134017944336, "learning_rate": 1.971721938726558e-05, "loss": 2.0269, "step": 70000 }, { "epoch": 0.014139428908466386, "eval_calculated_loss": 8.596267700195312, "eval_loss": 2.1827614307403564, "eval_perplexity": 5411.424793985483, "eval_runtime": 115.4146, "eval_samples_per_second": 8.647, "eval_steps_per_second": 2.166, "step": 70000 }, { "epoch": 0.014159628092621338, "grad_norm": 19.61542510986328, "learning_rate": 1.971681540341928e-05, "loss": 1.9856, "step": 70100 }, { "epoch": 0.014179827276776292, "grad_norm": 23.49608612060547, "learning_rate": 1.971641141957298e-05, "loss": 1.8873, "step": 70200 }, { "epoch": 0.014200026460931244, "grad_norm": 11.15372371673584, "learning_rate": 1.9716007435726675e-05, "loss": 2.0244, "step": 70300 }, { "epoch": 0.014220225645086196, "grad_norm": 8.39646053314209, "learning_rate": 1.9715603451880374e-05, "loss": 2.1156, "step": 70400 }, { "epoch": 0.014240424829241148, "grad_norm": 12.018199920654297, "learning_rate": 1.9715199468034074e-05, "loss": 1.9635, "step": 70500 }, { "epoch": 0.0142606240133961, "grad_norm": 15.45269775390625, "learning_rate": 1.971479548418777e-05, "loss": 1.8629, "step": 70600 }, { "epoch": 0.014280823197551052, "grad_norm": 7.450088024139404, "learning_rate": 1.971439150034147e-05, "loss": 1.9676, "step": 70700 }, { "epoch": 0.014301022381706004, "grad_norm": 9.752771377563477, "learning_rate": 1.9713987516495167e-05, "loss": 2.055, "step": 70800 }, { "epoch": 0.014321221565860956, "grad_norm": 7.682866096496582, "learning_rate": 1.9713583532648867e-05, "loss": 1.9576, "step": 70900 }, { "epoch": 0.014341420750015908, "grad_norm": 11.500426292419434, "learning_rate": 1.9713179548802562e-05, "loss": 1.928, "step": 71000 }, { "epoch": 0.014341420750015908, "eval_calculated_loss": 8.634943008422852, "eval_loss": 2.177037239074707, "eval_perplexity": 5624.813148198547, "eval_runtime": 117.613, "eval_samples_per_second": 8.485, "eval_steps_per_second": 2.126, "step": 71000 }, { "epoch": 0.01436161993417086, "grad_norm": 11.272709846496582, "learning_rate": 1.971277556495626e-05, "loss": 1.9418, "step": 71100 }, { "epoch": 0.014381819118325812, "grad_norm": 21.36680793762207, "learning_rate": 1.971237158110996e-05, "loss": 2.0229, "step": 71200 }, { "epoch": 0.014402018302480763, "grad_norm": 8.226950645446777, "learning_rate": 1.9711967597263656e-05, "loss": 1.9795, "step": 71300 }, { "epoch": 0.014422217486635715, "grad_norm": 15.437986373901367, "learning_rate": 1.9711563613417355e-05, "loss": 1.9831, "step": 71400 }, { "epoch": 0.014442416670790667, "grad_norm": 7.653742790222168, "learning_rate": 1.971115962957105e-05, "loss": 2.0325, "step": 71500 }, { "epoch": 0.01446261585494562, "grad_norm": 10.110993385314941, "learning_rate": 1.971075564572475e-05, "loss": 1.9321, "step": 71600 }, { "epoch": 0.014482815039100571, "grad_norm": 14.33385944366455, "learning_rate": 1.971035166187845e-05, "loss": 2.002, "step": 71700 }, { "epoch": 0.014503014223255523, "grad_norm": 10.72745418548584, "learning_rate": 1.970994767803215e-05, "loss": 1.951, "step": 71800 }, { "epoch": 0.014523213407410475, "grad_norm": 18.215965270996094, "learning_rate": 1.9709543694185848e-05, "loss": 1.9135, "step": 71900 }, { "epoch": 0.014543412591565427, "grad_norm": 12.204795837402344, "learning_rate": 1.9709139710339543e-05, "loss": 1.9004, "step": 72000 }, { "epoch": 0.014543412591565427, "eval_calculated_loss": 8.69466495513916, "eval_loss": 2.184701919555664, "eval_perplexity": 5970.97168912095, "eval_runtime": 116.1843, "eval_samples_per_second": 8.59, "eval_steps_per_second": 2.152, "step": 72000 }, { "epoch": 0.01456361177572038, "grad_norm": 18.41004180908203, "learning_rate": 1.9708735726493243e-05, "loss": 1.956, "step": 72100 }, { "epoch": 0.014583810959875331, "grad_norm": 12.236845016479492, "learning_rate": 1.9708331742646942e-05, "loss": 1.9479, "step": 72200 }, { "epoch": 0.014604010144030283, "grad_norm": 11.605778694152832, "learning_rate": 1.9707927758800637e-05, "loss": 1.8987, "step": 72300 }, { "epoch": 0.014624209328185235, "grad_norm": 12.75858211517334, "learning_rate": 1.9707523774954337e-05, "loss": 1.9728, "step": 72400 }, { "epoch": 0.014644408512340187, "grad_norm": 12.685344696044922, "learning_rate": 1.9707119791108032e-05, "loss": 1.9645, "step": 72500 }, { "epoch": 0.014664607696495139, "grad_norm": 7.561262130737305, "learning_rate": 1.970671580726173e-05, "loss": 1.917, "step": 72600 }, { "epoch": 0.014684806880650091, "grad_norm": 9.463238716125488, "learning_rate": 1.970631182341543e-05, "loss": 2.0498, "step": 72700 }, { "epoch": 0.014705006064805043, "grad_norm": 14.383054733276367, "learning_rate": 1.970590783956913e-05, "loss": 1.9558, "step": 72800 }, { "epoch": 0.014725205248959995, "grad_norm": 21.82784080505371, "learning_rate": 1.970550385572283e-05, "loss": 1.9595, "step": 72900 }, { "epoch": 0.014745404433114947, "grad_norm": 10.659093856811523, "learning_rate": 1.9705099871876525e-05, "loss": 1.9531, "step": 73000 }, { "epoch": 0.014745404433114947, "eval_calculated_loss": 8.456175804138184, "eval_loss": 2.190964937210083, "eval_perplexity": 4704.0344078147655, "eval_runtime": 116.9674, "eval_samples_per_second": 8.532, "eval_steps_per_second": 2.137, "step": 73000 }, { "epoch": 0.014765603617269899, "grad_norm": 12.748675346374512, "learning_rate": 1.9704695888030224e-05, "loss": 2.0626, "step": 73100 }, { "epoch": 0.01478580280142485, "grad_norm": 11.506717681884766, "learning_rate": 1.970429190418392e-05, "loss": 1.8892, "step": 73200 }, { "epoch": 0.014806001985579803, "grad_norm": 11.702174186706543, "learning_rate": 1.970388792033762e-05, "loss": 1.9899, "step": 73300 }, { "epoch": 0.014826201169734755, "grad_norm": 10.86549186706543, "learning_rate": 1.9703483936491318e-05, "loss": 1.9879, "step": 73400 }, { "epoch": 0.014846400353889707, "grad_norm": 7.8760986328125, "learning_rate": 1.9703079952645013e-05, "loss": 1.934, "step": 73500 }, { "epoch": 0.014866599538044659, "grad_norm": 6.947813510894775, "learning_rate": 1.9702675968798713e-05, "loss": 1.9936, "step": 73600 }, { "epoch": 0.01488679872219961, "grad_norm": 14.365093231201172, "learning_rate": 1.9702271984952412e-05, "loss": 1.9283, "step": 73700 }, { "epoch": 0.014906997906354563, "grad_norm": 11.076435089111328, "learning_rate": 1.970186800110611e-05, "loss": 2.0169, "step": 73800 }, { "epoch": 0.014927197090509515, "grad_norm": 8.661486625671387, "learning_rate": 1.9701464017259807e-05, "loss": 1.9529, "step": 73900 }, { "epoch": 0.014947396274664466, "grad_norm": 9.152626991271973, "learning_rate": 1.9701060033413506e-05, "loss": 1.909, "step": 74000 }, { "epoch": 0.014947396274664466, "eval_calculated_loss": 8.531876564025879, "eval_loss": 2.1737639904022217, "eval_perplexity": 5073.958514468986, "eval_runtime": 115.5224, "eval_samples_per_second": 8.639, "eval_steps_per_second": 2.164, "step": 74000 }, { "epoch": 0.014967595458819418, "grad_norm": 15.548705101013184, "learning_rate": 1.9700656049567205e-05, "loss": 1.8819, "step": 74100 }, { "epoch": 0.01498779464297437, "grad_norm": 14.858419418334961, "learning_rate": 1.97002520657209e-05, "loss": 1.9136, "step": 74200 }, { "epoch": 0.015007993827129322, "grad_norm": 9.007251739501953, "learning_rate": 1.96998480818746e-05, "loss": 1.9071, "step": 74300 }, { "epoch": 0.015028193011284274, "grad_norm": 12.38598346710205, "learning_rate": 1.96994440980283e-05, "loss": 1.9215, "step": 74400 }, { "epoch": 0.015048392195439226, "grad_norm": 6.534239768981934, "learning_rate": 1.9699040114181995e-05, "loss": 1.834, "step": 74500 }, { "epoch": 0.015068591379594178, "grad_norm": 11.458577156066895, "learning_rate": 1.9698636130335694e-05, "loss": 1.9528, "step": 74600 }, { "epoch": 0.01508879056374913, "grad_norm": 9.785414695739746, "learning_rate": 1.969823214648939e-05, "loss": 1.9559, "step": 74700 }, { "epoch": 0.015108989747904082, "grad_norm": 7.9829607009887695, "learning_rate": 1.9697828162643092e-05, "loss": 1.8863, "step": 74800 }, { "epoch": 0.015129188932059034, "grad_norm": 8.70184326171875, "learning_rate": 1.9697424178796788e-05, "loss": 1.915, "step": 74900 }, { "epoch": 0.015149388116213986, "grad_norm": 6.039971351623535, "learning_rate": 1.9697020194950487e-05, "loss": 1.9669, "step": 75000 }, { "epoch": 0.015149388116213986, "eval_calculated_loss": 8.635236740112305, "eval_loss": 2.1812522411346436, "eval_perplexity": 5626.4655767408585, "eval_runtime": 118.5015, "eval_samples_per_second": 8.422, "eval_steps_per_second": 2.11, "step": 75000 }, { "epoch": 0.015169587300368938, "grad_norm": 8.402853012084961, "learning_rate": 1.9696616211104186e-05, "loss": 1.9794, "step": 75100 }, { "epoch": 0.01518978648452389, "grad_norm": 10.304044723510742, "learning_rate": 1.9696212227257882e-05, "loss": 1.9269, "step": 75200 }, { "epoch": 0.015209985668678842, "grad_norm": 8.872735977172852, "learning_rate": 1.969580824341158e-05, "loss": 2.0235, "step": 75300 }, { "epoch": 0.015230184852833794, "grad_norm": 11.908730506896973, "learning_rate": 1.969540425956528e-05, "loss": 1.9341, "step": 75400 }, { "epoch": 0.015250384036988746, "grad_norm": 9.53458023071289, "learning_rate": 1.9695000275718976e-05, "loss": 1.9937, "step": 75500 }, { "epoch": 0.015270583221143698, "grad_norm": 10.336732864379883, "learning_rate": 1.9694596291872675e-05, "loss": 1.9658, "step": 75600 }, { "epoch": 0.01529078240529865, "grad_norm": 16.25728988647461, "learning_rate": 1.969419230802637e-05, "loss": 1.9436, "step": 75700 }, { "epoch": 0.015310981589453602, "grad_norm": 10.853204727172852, "learning_rate": 1.9693788324180073e-05, "loss": 1.9205, "step": 75800 }, { "epoch": 0.015331180773608554, "grad_norm": 8.358878135681152, "learning_rate": 1.969338434033377e-05, "loss": 1.9463, "step": 75900 }, { "epoch": 0.015351379957763506, "grad_norm": 11.785439491271973, "learning_rate": 1.9692980356487468e-05, "loss": 2.0425, "step": 76000 }, { "epoch": 0.015351379957763506, "eval_calculated_loss": 8.667360305786133, "eval_loss": 2.1841490268707275, "eval_perplexity": 5810.142090749896, "eval_runtime": 115.0656, "eval_samples_per_second": 8.673, "eval_steps_per_second": 2.173, "step": 76000 }, { "epoch": 0.015371579141918458, "grad_norm": 11.043523788452148, "learning_rate": 1.9692576372641167e-05, "loss": 1.9636, "step": 76100 }, { "epoch": 0.01539177832607341, "grad_norm": 7.601599216461182, "learning_rate": 1.9692172388794863e-05, "loss": 1.9679, "step": 76200 }, { "epoch": 0.015411977510228362, "grad_norm": 9.51773738861084, "learning_rate": 1.9691768404948562e-05, "loss": 1.9247, "step": 76300 }, { "epoch": 0.015432176694383314, "grad_norm": 7.433879852294922, "learning_rate": 1.9691364421102258e-05, "loss": 1.9682, "step": 76400 }, { "epoch": 0.015452375878538266, "grad_norm": 12.214580535888672, "learning_rate": 1.9690960437255957e-05, "loss": 2.0916, "step": 76500 }, { "epoch": 0.015472575062693218, "grad_norm": 11.108043670654297, "learning_rate": 1.9690556453409656e-05, "loss": 1.9191, "step": 76600 }, { "epoch": 0.01549277424684817, "grad_norm": 6.974573612213135, "learning_rate": 1.969015246956335e-05, "loss": 1.9538, "step": 76700 }, { "epoch": 0.015512973431003121, "grad_norm": 13.892779350280762, "learning_rate": 1.968974848571705e-05, "loss": 1.8547, "step": 76800 }, { "epoch": 0.015533172615158073, "grad_norm": 12.951510429382324, "learning_rate": 1.968934450187075e-05, "loss": 1.8803, "step": 76900 }, { "epoch": 0.015553371799313025, "grad_norm": 14.275099754333496, "learning_rate": 1.968894051802445e-05, "loss": 1.9578, "step": 77000 }, { "epoch": 0.015553371799313025, "eval_calculated_loss": 8.588242530822754, "eval_loss": 2.18302321434021, "eval_perplexity": 5368.170985178306, "eval_runtime": 116.3464, "eval_samples_per_second": 8.578, "eval_steps_per_second": 2.149, "step": 77000 }, { "epoch": 0.015573570983467977, "grad_norm": 10.260730743408203, "learning_rate": 1.9688536534178145e-05, "loss": 1.8997, "step": 77100 }, { "epoch": 0.01559377016762293, "grad_norm": 7.019965648651123, "learning_rate": 1.9688132550331844e-05, "loss": 1.973, "step": 77200 }, { "epoch": 0.015613969351777881, "grad_norm": 11.144935607910156, "learning_rate": 1.9687728566485543e-05, "loss": 1.9393, "step": 77300 }, { "epoch": 0.015634168535932833, "grad_norm": 11.265685081481934, "learning_rate": 1.968732458263924e-05, "loss": 1.9982, "step": 77400 }, { "epoch": 0.015654367720087787, "grad_norm": 10.266606330871582, "learning_rate": 1.9686920598792938e-05, "loss": 1.9347, "step": 77500 }, { "epoch": 0.015674566904242737, "grad_norm": 13.713878631591797, "learning_rate": 1.9686516614946637e-05, "loss": 1.8952, "step": 77600 }, { "epoch": 0.01569476608839769, "grad_norm": 12.354329109191895, "learning_rate": 1.9686112631100333e-05, "loss": 1.9334, "step": 77700 }, { "epoch": 0.01571496527255264, "grad_norm": 12.432307243347168, "learning_rate": 1.9685708647254032e-05, "loss": 1.9479, "step": 77800 }, { "epoch": 0.015735164456707595, "grad_norm": 13.903250694274902, "learning_rate": 1.968530466340773e-05, "loss": 1.8667, "step": 77900 }, { "epoch": 0.015755363640862545, "grad_norm": 15.848904609680176, "learning_rate": 1.968490067956143e-05, "loss": 1.9839, "step": 78000 }, { "epoch": 0.015755363640862545, "eval_calculated_loss": 8.759418487548828, "eval_loss": 2.1817219257354736, "eval_perplexity": 6370.406030259124, "eval_runtime": 114.0411, "eval_samples_per_second": 8.751, "eval_steps_per_second": 2.192, "step": 78000 }, { "epoch": 0.0157755628250175, "grad_norm": 9.802268028259277, "learning_rate": 1.9684496695715126e-05, "loss": 1.9945, "step": 78100 }, { "epoch": 0.01579576200917245, "grad_norm": 11.386712074279785, "learning_rate": 1.9684092711868825e-05, "loss": 1.9048, "step": 78200 }, { "epoch": 0.015815961193327403, "grad_norm": 13.039191246032715, "learning_rate": 1.9683688728022524e-05, "loss": 2.0029, "step": 78300 }, { "epoch": 0.015836160377482353, "grad_norm": 9.326837539672852, "learning_rate": 1.968328474417622e-05, "loss": 1.9116, "step": 78400 }, { "epoch": 0.015856359561637307, "grad_norm": 14.944392204284668, "learning_rate": 1.968288076032992e-05, "loss": 1.9536, "step": 78500 }, { "epoch": 0.015876558745792257, "grad_norm": 13.72722339630127, "learning_rate": 1.9682476776483618e-05, "loss": 1.9616, "step": 78600 }, { "epoch": 0.01589675792994721, "grad_norm": 15.647879600524902, "learning_rate": 1.9682072792637314e-05, "loss": 1.9132, "step": 78700 }, { "epoch": 0.01591695711410216, "grad_norm": 9.324875831604004, "learning_rate": 1.9681668808791013e-05, "loss": 2.0576, "step": 78800 }, { "epoch": 0.015937156298257114, "grad_norm": 12.271368980407715, "learning_rate": 1.9681264824944712e-05, "loss": 1.859, "step": 78900 }, { "epoch": 0.015957355482412065, "grad_norm": 10.942290306091309, "learning_rate": 1.968086084109841e-05, "loss": 1.9021, "step": 79000 }, { "epoch": 0.015957355482412065, "eval_calculated_loss": 8.583382606506348, "eval_loss": 2.174382448196411, "eval_perplexity": 5342.145372950266, "eval_runtime": 115.0045, "eval_samples_per_second": 8.678, "eval_steps_per_second": 2.174, "step": 79000 }, { "epoch": 0.01597755466656702, "grad_norm": 9.230875015258789, "learning_rate": 1.9680456857252107e-05, "loss": 1.9879, "step": 79100 }, { "epoch": 0.01599775385072197, "grad_norm": 9.966829299926758, "learning_rate": 1.9680052873405806e-05, "loss": 1.92, "step": 79200 }, { "epoch": 0.016017953034876922, "grad_norm": 13.05021858215332, "learning_rate": 1.9679648889559505e-05, "loss": 1.9302, "step": 79300 }, { "epoch": 0.016038152219031872, "grad_norm": 10.91209888458252, "learning_rate": 1.96792449057132e-05, "loss": 1.9238, "step": 79400 }, { "epoch": 0.016058351403186826, "grad_norm": 6.4987592697143555, "learning_rate": 1.96788409218669e-05, "loss": 2.0302, "step": 79500 }, { "epoch": 0.016078550587341776, "grad_norm": 12.1759614944458, "learning_rate": 1.9678436938020596e-05, "loss": 1.8694, "step": 79600 }, { "epoch": 0.01609874977149673, "grad_norm": 9.711325645446777, "learning_rate": 1.9678032954174295e-05, "loss": 2.0324, "step": 79700 }, { "epoch": 0.01611894895565168, "grad_norm": 6.012800216674805, "learning_rate": 1.9677628970327994e-05, "loss": 1.9951, "step": 79800 }, { "epoch": 0.016139148139806634, "grad_norm": 8.894773483276367, "learning_rate": 1.967722498648169e-05, "loss": 1.9912, "step": 79900 }, { "epoch": 0.016159347323961584, "grad_norm": 7.1914777755737305, "learning_rate": 1.9676821002635392e-05, "loss": 1.9473, "step": 80000 }, { "epoch": 0.016159347323961584, "eval_calculated_loss": 8.566746711730957, "eval_loss": 2.1721248626708984, "eval_perplexity": 5254.009149772636, "eval_runtime": 114.7428, "eval_samples_per_second": 8.698, "eval_steps_per_second": 2.179, "step": 80000 }, { "epoch": 0.016179546508116538, "grad_norm": 16.680898666381836, "learning_rate": 1.9676417018789088e-05, "loss": 1.912, "step": 80100 }, { "epoch": 0.016199745692271488, "grad_norm": 10.255926132202148, "learning_rate": 1.9676013034942787e-05, "loss": 2.0138, "step": 80200 }, { "epoch": 0.016219944876426442, "grad_norm": 9.66007137298584, "learning_rate": 1.9675609051096486e-05, "loss": 1.922, "step": 80300 }, { "epoch": 0.016240144060581392, "grad_norm": 11.722455978393555, "learning_rate": 1.9675205067250182e-05, "loss": 1.9225, "step": 80400 }, { "epoch": 0.016260343244736346, "grad_norm": 7.745282173156738, "learning_rate": 1.967480108340388e-05, "loss": 1.9922, "step": 80500 }, { "epoch": 0.016280542428891296, "grad_norm": 7.626348495483398, "learning_rate": 1.9674397099557577e-05, "loss": 1.9187, "step": 80600 }, { "epoch": 0.01630074161304625, "grad_norm": 11.418168067932129, "learning_rate": 1.9673993115711276e-05, "loss": 1.8699, "step": 80700 }, { "epoch": 0.0163209407972012, "grad_norm": 11.88425350189209, "learning_rate": 1.9673589131864975e-05, "loss": 1.952, "step": 80800 }, { "epoch": 0.016341139981356154, "grad_norm": 10.640769004821777, "learning_rate": 1.967318514801867e-05, "loss": 1.9658, "step": 80900 }, { "epoch": 0.016361339165511104, "grad_norm": 10.965596199035645, "learning_rate": 1.9672781164172374e-05, "loss": 1.9355, "step": 81000 }, { "epoch": 0.016361339165511104, "eval_calculated_loss": 8.728536605834961, "eval_loss": 2.180121421813965, "eval_perplexity": 6176.68257298532, "eval_runtime": 115.3118, "eval_samples_per_second": 8.655, "eval_steps_per_second": 2.168, "step": 81000 }, { "epoch": 0.016381538349666058, "grad_norm": 14.395133972167969, "learning_rate": 1.967237718032607e-05, "loss": 1.963, "step": 81100 }, { "epoch": 0.016401737533821008, "grad_norm": 7.909218788146973, "learning_rate": 1.967197319647977e-05, "loss": 1.9182, "step": 81200 }, { "epoch": 0.01642193671797596, "grad_norm": 15.878961563110352, "learning_rate": 1.9671569212633464e-05, "loss": 1.9543, "step": 81300 }, { "epoch": 0.01644213590213091, "grad_norm": 8.866228103637695, "learning_rate": 1.9671165228787163e-05, "loss": 1.9848, "step": 81400 }, { "epoch": 0.016462335086285865, "grad_norm": 9.022475242614746, "learning_rate": 1.9670761244940862e-05, "loss": 1.9361, "step": 81500 }, { "epoch": 0.016482534270440816, "grad_norm": 7.391636371612549, "learning_rate": 1.9670357261094558e-05, "loss": 1.8476, "step": 81600 }, { "epoch": 0.01650273345459577, "grad_norm": 9.36715030670166, "learning_rate": 1.9669953277248257e-05, "loss": 1.876, "step": 81700 }, { "epoch": 0.01652293263875072, "grad_norm": 9.40855598449707, "learning_rate": 1.9669549293401956e-05, "loss": 1.958, "step": 81800 }, { "epoch": 0.016543131822905673, "grad_norm": 9.63371753692627, "learning_rate": 1.9669145309555652e-05, "loss": 1.8921, "step": 81900 }, { "epoch": 0.016563331007060624, "grad_norm": 12.275777816772461, "learning_rate": 1.966874132570935e-05, "loss": 1.9205, "step": 82000 }, { "epoch": 0.016563331007060624, "eval_calculated_loss": 8.562515258789062, "eval_loss": 2.177161931991577, "eval_perplexity": 5231.824028050777, "eval_runtime": 114.8408, "eval_samples_per_second": 8.69, "eval_steps_per_second": 2.177, "step": 82000 }, { "epoch": 0.016583530191215577, "grad_norm": 6.232553958892822, "learning_rate": 1.966833734186305e-05, "loss": 1.9574, "step": 82100 }, { "epoch": 0.016603729375370527, "grad_norm": 15.45679759979248, "learning_rate": 1.966793335801675e-05, "loss": 1.9621, "step": 82200 }, { "epoch": 0.01662392855952548, "grad_norm": 14.278603553771973, "learning_rate": 1.9667529374170445e-05, "loss": 1.901, "step": 82300 }, { "epoch": 0.01664412774368043, "grad_norm": 11.552295684814453, "learning_rate": 1.9667125390324144e-05, "loss": 1.9051, "step": 82400 }, { "epoch": 0.016664326927835385, "grad_norm": 11.254755973815918, "learning_rate": 1.9666721406477844e-05, "loss": 2.0065, "step": 82500 }, { "epoch": 0.016684526111990335, "grad_norm": 11.469532012939453, "learning_rate": 1.966631742263154e-05, "loss": 1.9718, "step": 82600 }, { "epoch": 0.01670472529614529, "grad_norm": 10.861802101135254, "learning_rate": 1.966591343878524e-05, "loss": 1.9511, "step": 82700 }, { "epoch": 0.01672492448030024, "grad_norm": 11.456778526306152, "learning_rate": 1.9665509454938938e-05, "loss": 1.9236, "step": 82800 }, { "epoch": 0.016745123664455193, "grad_norm": 5.398102283477783, "learning_rate": 1.9665105471092633e-05, "loss": 1.913, "step": 82900 }, { "epoch": 0.016765322848610143, "grad_norm": 9.378028869628906, "learning_rate": 1.9664701487246332e-05, "loss": 1.9854, "step": 83000 }, { "epoch": 0.016765322848610143, "eval_calculated_loss": 8.759657859802246, "eval_loss": 2.1816229820251465, "eval_perplexity": 6371.931111229526, "eval_runtime": 114.4037, "eval_samples_per_second": 8.723, "eval_steps_per_second": 2.185, "step": 83000 }, { "epoch": 0.016785522032765097, "grad_norm": 7.5000481605529785, "learning_rate": 1.966429750340003e-05, "loss": 1.9085, "step": 83100 }, { "epoch": 0.016805721216920047, "grad_norm": 7.777898788452148, "learning_rate": 1.966389351955373e-05, "loss": 1.8998, "step": 83200 }, { "epoch": 0.016825920401075, "grad_norm": 18.04599380493164, "learning_rate": 1.9663489535707426e-05, "loss": 1.9391, "step": 83300 }, { "epoch": 0.01684611958522995, "grad_norm": 10.000171661376953, "learning_rate": 1.9663085551861125e-05, "loss": 2.0032, "step": 83400 }, { "epoch": 0.016866318769384905, "grad_norm": 12.27891731262207, "learning_rate": 1.9662681568014825e-05, "loss": 1.9378, "step": 83500 }, { "epoch": 0.016886517953539855, "grad_norm": 9.9215726852417, "learning_rate": 1.966227758416852e-05, "loss": 1.9626, "step": 83600 }, { "epoch": 0.01690671713769481, "grad_norm": 10.71828842163086, "learning_rate": 1.966187360032222e-05, "loss": 1.9925, "step": 83700 }, { "epoch": 0.01692691632184976, "grad_norm": 8.898046493530273, "learning_rate": 1.9661469616475915e-05, "loss": 1.926, "step": 83800 }, { "epoch": 0.016947115506004713, "grad_norm": 10.726910591125488, "learning_rate": 1.9661065632629614e-05, "loss": 2.0044, "step": 83900 }, { "epoch": 0.016967314690159663, "grad_norm": 9.346013069152832, "learning_rate": 1.9660661648783313e-05, "loss": 1.9274, "step": 84000 }, { "epoch": 0.016967314690159663, "eval_calculated_loss": 8.679731369018555, "eval_loss": 2.1690053939819336, "eval_perplexity": 5882.46616641052, "eval_runtime": 114.9756, "eval_samples_per_second": 8.68, "eval_steps_per_second": 2.174, "step": 84000 }, { "epoch": 0.016987513874314616, "grad_norm": 11.498566627502441, "learning_rate": 1.9660257664937013e-05, "loss": 1.9508, "step": 84100 }, { "epoch": 0.017007713058469567, "grad_norm": 14.278846740722656, "learning_rate": 1.9659853681090712e-05, "loss": 2.0472, "step": 84200 }, { "epoch": 0.01702791224262452, "grad_norm": 11.444978713989258, "learning_rate": 1.9659449697244407e-05, "loss": 1.9659, "step": 84300 }, { "epoch": 0.017048111426779474, "grad_norm": 10.822953224182129, "learning_rate": 1.9659045713398107e-05, "loss": 1.9405, "step": 84400 }, { "epoch": 0.017068310610934424, "grad_norm": 12.94221019744873, "learning_rate": 1.9658641729551802e-05, "loss": 1.9643, "step": 84500 }, { "epoch": 0.017088509795089378, "grad_norm": 13.139805793762207, "learning_rate": 1.96582377457055e-05, "loss": 1.864, "step": 84600 }, { "epoch": 0.017108708979244328, "grad_norm": 6.512974739074707, "learning_rate": 1.96578337618592e-05, "loss": 1.8787, "step": 84700 }, { "epoch": 0.017128908163399282, "grad_norm": 10.231914520263672, "learning_rate": 1.9657429778012896e-05, "loss": 1.9887, "step": 84800 }, { "epoch": 0.017149107347554232, "grad_norm": 5.4094133377075195, "learning_rate": 1.9657025794166595e-05, "loss": 1.9334, "step": 84900 }, { "epoch": 0.017169306531709186, "grad_norm": 9.502370834350586, "learning_rate": 1.9656621810320295e-05, "loss": 1.8968, "step": 85000 }, { "epoch": 0.017169306531709186, "eval_calculated_loss": 8.565424919128418, "eval_loss": 2.1791579723358154, "eval_perplexity": 5247.069027056672, "eval_runtime": 115.1863, "eval_samples_per_second": 8.664, "eval_steps_per_second": 2.17, "step": 85000 }, { "epoch": 0.017189505715864136, "grad_norm": 8.728424072265625, "learning_rate": 1.965621782647399e-05, "loss": 1.937, "step": 85100 }, { "epoch": 0.01720970490001909, "grad_norm": 10.406803131103516, "learning_rate": 1.9655813842627693e-05, "loss": 1.9688, "step": 85200 }, { "epoch": 0.01722990408417404, "grad_norm": 5.545611381530762, "learning_rate": 1.965540985878139e-05, "loss": 2.0326, "step": 85300 }, { "epoch": 0.017250103268328994, "grad_norm": 9.8101167678833, "learning_rate": 1.9655005874935088e-05, "loss": 2.0521, "step": 85400 }, { "epoch": 0.017270302452483944, "grad_norm": 10.769133567810059, "learning_rate": 1.9654601891088783e-05, "loss": 1.9597, "step": 85500 }, { "epoch": 0.017290501636638898, "grad_norm": 14.408909797668457, "learning_rate": 1.9654197907242483e-05, "loss": 1.9794, "step": 85600 }, { "epoch": 0.017310700820793848, "grad_norm": 9.278264045715332, "learning_rate": 1.9653793923396182e-05, "loss": 2.009, "step": 85700 }, { "epoch": 0.0173309000049488, "grad_norm": 15.429107666015625, "learning_rate": 1.9653389939549877e-05, "loss": 2.0145, "step": 85800 }, { "epoch": 0.017351099189103752, "grad_norm": 14.083562850952148, "learning_rate": 1.9652985955703577e-05, "loss": 1.8954, "step": 85900 }, { "epoch": 0.017371298373258705, "grad_norm": 16.26299476623535, "learning_rate": 1.9652581971857276e-05, "loss": 2.0432, "step": 86000 }, { "epoch": 0.017371298373258705, "eval_calculated_loss": 8.668024063110352, "eval_loss": 2.176945686340332, "eval_perplexity": 5813.999895298758, "eval_runtime": 115.9745, "eval_samples_per_second": 8.605, "eval_steps_per_second": 2.156, "step": 86000 }, { "epoch": 0.017391497557413656, "grad_norm": 9.689414024353027, "learning_rate": 1.965217798801097e-05, "loss": 1.9261, "step": 86100 }, { "epoch": 0.01741169674156861, "grad_norm": 10.804335594177246, "learning_rate": 1.965177400416467e-05, "loss": 1.9067, "step": 86200 }, { "epoch": 0.01743189592572356, "grad_norm": 10.011388778686523, "learning_rate": 1.965137002031837e-05, "loss": 1.9221, "step": 86300 }, { "epoch": 0.017452095109878513, "grad_norm": 10.956096649169922, "learning_rate": 1.965096603647207e-05, "loss": 1.9839, "step": 86400 }, { "epoch": 0.017472294294033464, "grad_norm": 9.454009056091309, "learning_rate": 1.9650562052625765e-05, "loss": 1.8881, "step": 86500 }, { "epoch": 0.017492493478188417, "grad_norm": 13.800542831420898, "learning_rate": 1.9650158068779464e-05, "loss": 1.9521, "step": 86600 }, { "epoch": 0.017512692662343367, "grad_norm": 12.073776245117188, "learning_rate": 1.9649754084933163e-05, "loss": 1.9125, "step": 86700 }, { "epoch": 0.01753289184649832, "grad_norm": 11.074697494506836, "learning_rate": 1.964935010108686e-05, "loss": 2.0174, "step": 86800 }, { "epoch": 0.01755309103065327, "grad_norm": 11.53991985321045, "learning_rate": 1.9648946117240558e-05, "loss": 1.9408, "step": 86900 }, { "epoch": 0.017573290214808225, "grad_norm": 6.382812976837158, "learning_rate": 1.9648542133394253e-05, "loss": 1.9325, "step": 87000 }, { "epoch": 0.017573290214808225, "eval_calculated_loss": 8.56582260131836, "eval_loss": 2.187227964401245, "eval_perplexity": 5249.1561079280655, "eval_runtime": 114.5207, "eval_samples_per_second": 8.715, "eval_steps_per_second": 2.183, "step": 87000 }, { "epoch": 0.017593489398963175, "grad_norm": 10.295441627502441, "learning_rate": 1.9648138149547953e-05, "loss": 1.9424, "step": 87100 }, { "epoch": 0.01761368858311813, "grad_norm": 12.570395469665527, "learning_rate": 1.9647734165701652e-05, "loss": 1.9854, "step": 87200 }, { "epoch": 0.01763388776727308, "grad_norm": 12.932522773742676, "learning_rate": 1.964733018185535e-05, "loss": 1.9167, "step": 87300 }, { "epoch": 0.017654086951428033, "grad_norm": 12.410618782043457, "learning_rate": 1.964692619800905e-05, "loss": 1.9594, "step": 87400 }, { "epoch": 0.017674286135582983, "grad_norm": 16.652917861938477, "learning_rate": 1.9646522214162746e-05, "loss": 1.9089, "step": 87500 }, { "epoch": 0.017694485319737937, "grad_norm": 11.336516380310059, "learning_rate": 1.9646118230316445e-05, "loss": 1.9192, "step": 87600 }, { "epoch": 0.017714684503892887, "grad_norm": 10.535503387451172, "learning_rate": 1.964571424647014e-05, "loss": 1.9028, "step": 87700 }, { "epoch": 0.01773488368804784, "grad_norm": 10.97453784942627, "learning_rate": 1.964531026262384e-05, "loss": 1.9877, "step": 87800 }, { "epoch": 0.01775508287220279, "grad_norm": 10.235799789428711, "learning_rate": 1.964490627877754e-05, "loss": 1.9095, "step": 87900 }, { "epoch": 0.017775282056357745, "grad_norm": 16.19926643371582, "learning_rate": 1.9644502294931235e-05, "loss": 1.9597, "step": 88000 }, { "epoch": 0.017775282056357745, "eval_calculated_loss": 8.503422737121582, "eval_loss": 2.1877877712249756, "eval_perplexity": 4931.619623502665, "eval_runtime": 114.4366, "eval_samples_per_second": 8.721, "eval_steps_per_second": 2.185, "step": 88000 }, { "epoch": 0.017795481240512695, "grad_norm": 11.81240463256836, "learning_rate": 1.9644098311084934e-05, "loss": 1.8468, "step": 88100 }, { "epoch": 0.01781568042466765, "grad_norm": 10.979573249816895, "learning_rate": 1.9643694327238633e-05, "loss": 1.9063, "step": 88200 }, { "epoch": 0.0178358796088226, "grad_norm": 8.161264419555664, "learning_rate": 1.9643290343392332e-05, "loss": 1.9248, "step": 88300 }, { "epoch": 0.017856078792977553, "grad_norm": 9.752946853637695, "learning_rate": 1.964288635954603e-05, "loss": 1.8409, "step": 88400 }, { "epoch": 0.017876277977132503, "grad_norm": 21.125791549682617, "learning_rate": 1.9642482375699727e-05, "loss": 1.9678, "step": 88500 }, { "epoch": 0.017896477161287457, "grad_norm": 16.290634155273438, "learning_rate": 1.9642078391853426e-05, "loss": 1.8608, "step": 88600 }, { "epoch": 0.017916676345442407, "grad_norm": 14.855155944824219, "learning_rate": 1.964167440800712e-05, "loss": 1.9203, "step": 88700 }, { "epoch": 0.01793687552959736, "grad_norm": 7.282357692718506, "learning_rate": 1.964127042416082e-05, "loss": 1.9368, "step": 88800 }, { "epoch": 0.01795707471375231, "grad_norm": 8.843650817871094, "learning_rate": 1.964086644031452e-05, "loss": 1.9392, "step": 88900 }, { "epoch": 0.017977273897907264, "grad_norm": 12.740259170532227, "learning_rate": 1.9640462456468216e-05, "loss": 1.9168, "step": 89000 }, { "epoch": 0.017977273897907264, "eval_calculated_loss": 8.53454303741455, "eval_loss": 2.1905517578125, "eval_perplexity": 5087.506143992786, "eval_runtime": 116.7304, "eval_samples_per_second": 8.55, "eval_steps_per_second": 2.142, "step": 89000 }, { "epoch": 0.017997473082062215, "grad_norm": 9.11579704284668, "learning_rate": 1.9640058472621915e-05, "loss": 1.8864, "step": 89100 }, { "epoch": 0.01801767226621717, "grad_norm": 10.116637229919434, "learning_rate": 1.9639654488775614e-05, "loss": 1.9549, "step": 89200 }, { "epoch": 0.01803787145037212, "grad_norm": 11.981674194335938, "learning_rate": 1.9639250504929313e-05, "loss": 1.8468, "step": 89300 }, { "epoch": 0.018058070634527072, "grad_norm": 12.578632354736328, "learning_rate": 1.963884652108301e-05, "loss": 1.8313, "step": 89400 }, { "epoch": 0.018078269818682022, "grad_norm": 11.221511840820312, "learning_rate": 1.9638442537236708e-05, "loss": 1.8312, "step": 89500 }, { "epoch": 0.018098469002836976, "grad_norm": 11.164252281188965, "learning_rate": 1.9638038553390407e-05, "loss": 1.9729, "step": 89600 }, { "epoch": 0.018118668186991926, "grad_norm": 9.455153465270996, "learning_rate": 1.9637634569544103e-05, "loss": 1.9303, "step": 89700 }, { "epoch": 0.01813886737114688, "grad_norm": 8.67717170715332, "learning_rate": 1.9637230585697802e-05, "loss": 1.899, "step": 89800 }, { "epoch": 0.01815906655530183, "grad_norm": 16.374034881591797, "learning_rate": 1.96368266018515e-05, "loss": 2.015, "step": 89900 }, { "epoch": 0.018179265739456784, "grad_norm": 11.249991416931152, "learning_rate": 1.9636422618005197e-05, "loss": 1.9772, "step": 90000 }, { "epoch": 0.018179265739456784, "eval_calculated_loss": 8.793445587158203, "eval_loss": 2.189730644226074, "eval_perplexity": 6590.902625980539, "eval_runtime": 116.4143, "eval_samples_per_second": 8.573, "eval_steps_per_second": 2.148, "step": 90000 }, { "epoch": 0.018199464923611734, "grad_norm": 14.340103149414062, "learning_rate": 1.9636018634158896e-05, "loss": 1.9721, "step": 90100 }, { "epoch": 0.018219664107766688, "grad_norm": 10.992175102233887, "learning_rate": 1.963561465031259e-05, "loss": 1.8808, "step": 90200 }, { "epoch": 0.018239863291921638, "grad_norm": 10.559914588928223, "learning_rate": 1.9635210666466294e-05, "loss": 1.9646, "step": 90300 }, { "epoch": 0.018260062476076592, "grad_norm": 10.853055000305176, "learning_rate": 1.963480668261999e-05, "loss": 2.0232, "step": 90400 }, { "epoch": 0.018280261660231542, "grad_norm": 10.230924606323242, "learning_rate": 1.963440269877369e-05, "loss": 1.867, "step": 90500 }, { "epoch": 0.018300460844386496, "grad_norm": 12.287362098693848, "learning_rate": 1.9633998714927388e-05, "loss": 1.8804, "step": 90600 }, { "epoch": 0.018320660028541446, "grad_norm": 10.427842140197754, "learning_rate": 1.9633594731081084e-05, "loss": 1.9592, "step": 90700 }, { "epoch": 0.0183408592126964, "grad_norm": 13.822956085205078, "learning_rate": 1.9633190747234783e-05, "loss": 1.9001, "step": 90800 }, { "epoch": 0.01836105839685135, "grad_norm": 12.165316581726074, "learning_rate": 1.9632786763388482e-05, "loss": 1.9528, "step": 90900 }, { "epoch": 0.018381257581006304, "grad_norm": 11.874184608459473, "learning_rate": 1.9632382779542178e-05, "loss": 1.9543, "step": 91000 }, { "epoch": 0.018381257581006304, "eval_calculated_loss": 8.631385803222656, "eval_loss": 2.196997880935669, "eval_perplexity": 5604.840078831869, "eval_runtime": 117.9791, "eval_samples_per_second": 8.459, "eval_steps_per_second": 2.119, "step": 91000 }, { "epoch": 0.018401456765161254, "grad_norm": 12.227421760559082, "learning_rate": 1.9631978795695877e-05, "loss": 1.8992, "step": 91100 }, { "epoch": 0.018421655949316208, "grad_norm": 16.443490982055664, "learning_rate": 1.9631574811849573e-05, "loss": 1.9335, "step": 91200 }, { "epoch": 0.018441855133471158, "grad_norm": 7.090055465698242, "learning_rate": 1.9631170828003272e-05, "loss": 1.9563, "step": 91300 }, { "epoch": 0.01846205431762611, "grad_norm": 13.327200889587402, "learning_rate": 1.963076684415697e-05, "loss": 1.9343, "step": 91400 }, { "epoch": 0.01848225350178106, "grad_norm": 13.093523025512695, "learning_rate": 1.963036286031067e-05, "loss": 1.8817, "step": 91500 }, { "epoch": 0.018502452685936015, "grad_norm": 12.538352012634277, "learning_rate": 1.962995887646437e-05, "loss": 1.8639, "step": 91600 }, { "epoch": 0.018522651870090966, "grad_norm": 15.607866287231445, "learning_rate": 1.9629554892618065e-05, "loss": 1.8676, "step": 91700 }, { "epoch": 0.01854285105424592, "grad_norm": 10.634490013122559, "learning_rate": 1.9629150908771764e-05, "loss": 1.8773, "step": 91800 }, { "epoch": 0.01856305023840087, "grad_norm": 9.72269344329834, "learning_rate": 1.962874692492546e-05, "loss": 1.9812, "step": 91900 }, { "epoch": 0.018583249422555823, "grad_norm": 12.345704078674316, "learning_rate": 1.962834294107916e-05, "loss": 1.9175, "step": 92000 }, { "epoch": 0.018583249422555823, "eval_calculated_loss": 8.477407455444336, "eval_loss": 2.161731719970703, "eval_perplexity": 4804.976619118812, "eval_runtime": 117.198, "eval_samples_per_second": 8.516, "eval_steps_per_second": 2.133, "step": 92000 }, { "epoch": 0.018603448606710773, "grad_norm": 15.002909660339355, "learning_rate": 1.9627938957232858e-05, "loss": 1.9984, "step": 92100 }, { "epoch": 0.018623647790865727, "grad_norm": 9.99566650390625, "learning_rate": 1.9627534973386554e-05, "loss": 1.9512, "step": 92200 }, { "epoch": 0.018643846975020677, "grad_norm": 5.014530658721924, "learning_rate": 1.9627130989540253e-05, "loss": 1.8704, "step": 92300 }, { "epoch": 0.01866404615917563, "grad_norm": 8.978522300720215, "learning_rate": 1.9626727005693952e-05, "loss": 1.9215, "step": 92400 }, { "epoch": 0.01868424534333058, "grad_norm": 15.043296813964844, "learning_rate": 1.962632302184765e-05, "loss": 1.9168, "step": 92500 }, { "epoch": 0.018704444527485535, "grad_norm": 11.01242733001709, "learning_rate": 1.9625919038001347e-05, "loss": 1.9454, "step": 92600 }, { "epoch": 0.018724643711640485, "grad_norm": 8.234987258911133, "learning_rate": 1.9625515054155046e-05, "loss": 1.8901, "step": 92700 }, { "epoch": 0.01874484289579544, "grad_norm": 12.834805488586426, "learning_rate": 1.9625111070308745e-05, "loss": 1.9782, "step": 92800 }, { "epoch": 0.01876504207995039, "grad_norm": 11.656557083129883, "learning_rate": 1.962470708646244e-05, "loss": 1.9629, "step": 92900 }, { "epoch": 0.018785241264105343, "grad_norm": 10.805724143981934, "learning_rate": 1.962430310261614e-05, "loss": 1.8819, "step": 93000 }, { "epoch": 0.018785241264105343, "eval_calculated_loss": 8.614938735961914, "eval_loss": 2.164815902709961, "eval_perplexity": 5513.410829579887, "eval_runtime": 117.9507, "eval_samples_per_second": 8.461, "eval_steps_per_second": 2.12, "step": 93000 }, { "epoch": 0.018805440448260293, "grad_norm": 9.788331031799316, "learning_rate": 1.962389911876984e-05, "loss": 1.9047, "step": 93100 }, { "epoch": 0.018825639632415247, "grad_norm": 8.347597122192383, "learning_rate": 1.9623495134923535e-05, "loss": 1.9359, "step": 93200 }, { "epoch": 0.018845838816570197, "grad_norm": 5.786556243896484, "learning_rate": 1.9623091151077234e-05, "loss": 1.8611, "step": 93300 }, { "epoch": 0.01886603800072515, "grad_norm": 12.274849891662598, "learning_rate": 1.9622687167230933e-05, "loss": 1.9803, "step": 93400 }, { "epoch": 0.0188862371848801, "grad_norm": 10.04313850402832, "learning_rate": 1.9622283183384632e-05, "loss": 1.9178, "step": 93500 }, { "epoch": 0.018906436369035055, "grad_norm": 12.101825714111328, "learning_rate": 1.9621879199538328e-05, "loss": 1.8323, "step": 93600 }, { "epoch": 0.01892663555319001, "grad_norm": 9.803915023803711, "learning_rate": 1.9621475215692027e-05, "loss": 1.964, "step": 93700 }, { "epoch": 0.01894683473734496, "grad_norm": 10.685532569885254, "learning_rate": 1.9621071231845726e-05, "loss": 1.9492, "step": 93800 }, { "epoch": 0.018967033921499912, "grad_norm": 10.902131080627441, "learning_rate": 1.9620667247999422e-05, "loss": 1.9416, "step": 93900 }, { "epoch": 0.018987233105654863, "grad_norm": 9.646952629089355, "learning_rate": 1.962026326415312e-05, "loss": 1.9655, "step": 94000 }, { "epoch": 0.018987233105654863, "eval_calculated_loss": 8.532927513122559, "eval_loss": 2.1567440032958984, "eval_perplexity": 5079.293789646607, "eval_runtime": 117.3231, "eval_samples_per_second": 8.506, "eval_steps_per_second": 2.131, "step": 94000 }, { "epoch": 0.019007432289809816, "grad_norm": 12.642590522766113, "learning_rate": 1.961985928030682e-05, "loss": 1.9455, "step": 94100 }, { "epoch": 0.019027631473964766, "grad_norm": 8.07237720489502, "learning_rate": 1.9619455296460516e-05, "loss": 2.0036, "step": 94200 }, { "epoch": 0.01904783065811972, "grad_norm": 12.847275733947754, "learning_rate": 1.9619051312614215e-05, "loss": 1.9422, "step": 94300 }, { "epoch": 0.01906802984227467, "grad_norm": 6.982450485229492, "learning_rate": 1.961864732876791e-05, "loss": 1.938, "step": 94400 }, { "epoch": 0.019088229026429624, "grad_norm": 19.05004119873047, "learning_rate": 1.9618243344921614e-05, "loss": 1.987, "step": 94500 }, { "epoch": 0.019108428210584574, "grad_norm": 11.949305534362793, "learning_rate": 1.961783936107531e-05, "loss": 1.9539, "step": 94600 }, { "epoch": 0.019128627394739528, "grad_norm": 12.36069393157959, "learning_rate": 1.961743537722901e-05, "loss": 1.8923, "step": 94700 }, { "epoch": 0.019148826578894478, "grad_norm": 10.926715850830078, "learning_rate": 1.9617031393382708e-05, "loss": 1.9655, "step": 94800 }, { "epoch": 0.019169025763049432, "grad_norm": 6.322956562042236, "learning_rate": 1.9616627409536403e-05, "loss": 1.9135, "step": 94900 }, { "epoch": 0.019189224947204382, "grad_norm": 10.378437995910645, "learning_rate": 1.9616223425690102e-05, "loss": 1.9214, "step": 95000 }, { "epoch": 0.019189224947204382, "eval_calculated_loss": 8.475489616394043, "eval_loss": 2.158468723297119, "eval_perplexity": 4795.770278284727, "eval_runtime": 117.3964, "eval_samples_per_second": 8.501, "eval_steps_per_second": 2.13, "step": 95000 }, { "epoch": 0.019209424131359336, "grad_norm": 8.114861488342285, "learning_rate": 1.9615819441843798e-05, "loss": 1.9614, "step": 95100 }, { "epoch": 0.019229623315514286, "grad_norm": 13.571133613586426, "learning_rate": 1.9615415457997497e-05, "loss": 1.8451, "step": 95200 }, { "epoch": 0.01924982249966924, "grad_norm": 11.188097953796387, "learning_rate": 1.9615011474151196e-05, "loss": 1.9691, "step": 95300 }, { "epoch": 0.01927002168382419, "grad_norm": 9.125367164611816, "learning_rate": 1.9614607490304892e-05, "loss": 1.9321, "step": 95400 }, { "epoch": 0.019290220867979144, "grad_norm": 8.05715274810791, "learning_rate": 1.9614203506458595e-05, "loss": 1.9669, "step": 95500 }, { "epoch": 0.019310420052134094, "grad_norm": 8.43848705291748, "learning_rate": 1.961379952261229e-05, "loss": 1.8892, "step": 95600 }, { "epoch": 0.019330619236289048, "grad_norm": 8.159011840820312, "learning_rate": 1.961339553876599e-05, "loss": 1.906, "step": 95700 }, { "epoch": 0.019350818420443998, "grad_norm": 15.49233627319336, "learning_rate": 1.961299155491969e-05, "loss": 1.977, "step": 95800 }, { "epoch": 0.01937101760459895, "grad_norm": 7.3712639808654785, "learning_rate": 1.9612587571073384e-05, "loss": 1.8979, "step": 95900 }, { "epoch": 0.019391216788753902, "grad_norm": 12.118559837341309, "learning_rate": 1.9612183587227083e-05, "loss": 1.9032, "step": 96000 }, { "epoch": 0.019391216788753902, "eval_calculated_loss": 8.308650970458984, "eval_loss": 2.1708984375, "eval_perplexity": 4058.833802046557, "eval_runtime": 114.7305, "eval_samples_per_second": 8.699, "eval_steps_per_second": 2.179, "step": 96000 }, { "epoch": 0.019411415972908855, "grad_norm": 4.089985370635986, "learning_rate": 1.961177960338078e-05, "loss": 1.8881, "step": 96100 }, { "epoch": 0.019431615157063806, "grad_norm": 14.715400695800781, "learning_rate": 1.961137561953448e-05, "loss": 1.95, "step": 96200 }, { "epoch": 0.01945181434121876, "grad_norm": 7.5255937576293945, "learning_rate": 1.9610971635688177e-05, "loss": 2.0087, "step": 96300 }, { "epoch": 0.01947201352537371, "grad_norm": 11.924358367919922, "learning_rate": 1.9610567651841873e-05, "loss": 1.922, "step": 96400 }, { "epoch": 0.019492212709528663, "grad_norm": 10.348092079162598, "learning_rate": 1.9610163667995572e-05, "loss": 1.9357, "step": 96500 }, { "epoch": 0.019512411893683614, "grad_norm": 9.972105979919434, "learning_rate": 1.960975968414927e-05, "loss": 1.9277, "step": 96600 }, { "epoch": 0.019532611077838567, "grad_norm": 9.17884349822998, "learning_rate": 1.960935570030297e-05, "loss": 1.9207, "step": 96700 }, { "epoch": 0.019552810261993517, "grad_norm": 12.54970645904541, "learning_rate": 1.9608951716456666e-05, "loss": 2.0048, "step": 96800 }, { "epoch": 0.01957300944614847, "grad_norm": 8.883353233337402, "learning_rate": 1.9608547732610365e-05, "loss": 1.9561, "step": 96900 }, { "epoch": 0.01959320863030342, "grad_norm": 12.455787658691406, "learning_rate": 1.9608143748764065e-05, "loss": 1.9902, "step": 97000 }, { "epoch": 0.01959320863030342, "eval_calculated_loss": 8.384799003601074, "eval_loss": 2.1478347778320312, "eval_perplexity": 4379.978101064696, "eval_runtime": 114.7865, "eval_samples_per_second": 8.694, "eval_steps_per_second": 2.178, "step": 97000 }, { "epoch": 0.019613407814458375, "grad_norm": 13.146993637084961, "learning_rate": 1.960773976491776e-05, "loss": 1.9329, "step": 97100 }, { "epoch": 0.019633606998613325, "grad_norm": 8.160852432250977, "learning_rate": 1.960733578107146e-05, "loss": 1.9244, "step": 97200 }, { "epoch": 0.01965380618276828, "grad_norm": 18.778162002563477, "learning_rate": 1.960693179722516e-05, "loss": 1.9822, "step": 97300 }, { "epoch": 0.01967400536692323, "grad_norm": 8.838996887207031, "learning_rate": 1.9606527813378854e-05, "loss": 1.9623, "step": 97400 }, { "epoch": 0.019694204551078183, "grad_norm": 9.904072761535645, "learning_rate": 1.9606123829532553e-05, "loss": 1.8759, "step": 97500 }, { "epoch": 0.019714403735233133, "grad_norm": 5.229916095733643, "learning_rate": 1.9605719845686253e-05, "loss": 1.92, "step": 97600 }, { "epoch": 0.019734602919388087, "grad_norm": 6.602996349334717, "learning_rate": 1.9605315861839952e-05, "loss": 1.8956, "step": 97700 }, { "epoch": 0.019754802103543037, "grad_norm": 6.18559455871582, "learning_rate": 1.9604911877993647e-05, "loss": 1.9099, "step": 97800 }, { "epoch": 0.01977500128769799, "grad_norm": 10.04875659942627, "learning_rate": 1.9604507894147347e-05, "loss": 1.9597, "step": 97900 }, { "epoch": 0.01979520047185294, "grad_norm": 7.430987358093262, "learning_rate": 1.9604103910301046e-05, "loss": 1.9088, "step": 98000 }, { "epoch": 0.01979520047185294, "eval_calculated_loss": 8.433289527893066, "eval_loss": 2.157395601272583, "eval_perplexity": 4597.5991756482545, "eval_runtime": 118.2202, "eval_samples_per_second": 8.442, "eval_steps_per_second": 2.115, "step": 98000 }, { "epoch": 0.019815399656007895, "grad_norm": 9.214753150939941, "learning_rate": 1.960369992645474e-05, "loss": 1.9464, "step": 98100 }, { "epoch": 0.019835598840162845, "grad_norm": 13.940532684326172, "learning_rate": 1.960329594260844e-05, "loss": 1.9576, "step": 98200 }, { "epoch": 0.0198557980243178, "grad_norm": 17.096738815307617, "learning_rate": 1.9602891958762136e-05, "loss": 1.9645, "step": 98300 }, { "epoch": 0.01987599720847275, "grad_norm": 11.605273246765137, "learning_rate": 1.9602487974915835e-05, "loss": 2.0301, "step": 98400 }, { "epoch": 0.019896196392627703, "grad_norm": 10.906980514526367, "learning_rate": 1.9602083991069535e-05, "loss": 1.9904, "step": 98500 }, { "epoch": 0.019916395576782653, "grad_norm": 9.425110816955566, "learning_rate": 1.9601680007223234e-05, "loss": 1.8706, "step": 98600 }, { "epoch": 0.019936594760937607, "grad_norm": 11.457566261291504, "learning_rate": 1.9601276023376933e-05, "loss": 1.9114, "step": 98700 }, { "epoch": 0.019956793945092557, "grad_norm": 10.550619125366211, "learning_rate": 1.960087203953063e-05, "loss": 1.9117, "step": 98800 }, { "epoch": 0.01997699312924751, "grad_norm": 11.223888397216797, "learning_rate": 1.9600468055684328e-05, "loss": 1.9247, "step": 98900 }, { "epoch": 0.01999719231340246, "grad_norm": 8.145689964294434, "learning_rate": 1.9600064071838027e-05, "loss": 1.9152, "step": 99000 }, { "epoch": 0.01999719231340246, "eval_calculated_loss": 8.603516578674316, "eval_loss": 2.1747255325317383, "eval_perplexity": 5450.794073886107, "eval_runtime": 115.9156, "eval_samples_per_second": 8.61, "eval_steps_per_second": 2.157, "step": 99000 }, { "epoch": 0.020017391497557414, "grad_norm": 12.680281639099121, "learning_rate": 1.9599660087991723e-05, "loss": 1.9794, "step": 99100 }, { "epoch": 0.020037590681712365, "grad_norm": 8.86703109741211, "learning_rate": 1.9599256104145422e-05, "loss": 1.9391, "step": 99200 }, { "epoch": 0.02005778986586732, "grad_norm": 7.248746395111084, "learning_rate": 1.9598852120299117e-05, "loss": 1.9443, "step": 99300 }, { "epoch": 0.02007798905002227, "grad_norm": 13.294442176818848, "learning_rate": 1.9598448136452817e-05, "loss": 1.9338, "step": 99400 }, { "epoch": 0.020098188234177222, "grad_norm": 12.509722709655762, "learning_rate": 1.9598044152606516e-05, "loss": 2.0138, "step": 99500 }, { "epoch": 0.020118387418332172, "grad_norm": 12.70292854309082, "learning_rate": 1.959764016876021e-05, "loss": 1.8794, "step": 99600 }, { "epoch": 0.020138586602487126, "grad_norm": 13.163321495056152, "learning_rate": 1.9597236184913914e-05, "loss": 1.897, "step": 99700 }, { "epoch": 0.020158785786642076, "grad_norm": 13.91247844696045, "learning_rate": 1.959683220106761e-05, "loss": 1.94, "step": 99800 }, { "epoch": 0.02017898497079703, "grad_norm": 16.410734176635742, "learning_rate": 1.959642821722131e-05, "loss": 1.9518, "step": 99900 }, { "epoch": 0.02019918415495198, "grad_norm": 17.118438720703125, "learning_rate": 1.9596024233375005e-05, "loss": 1.9754, "step": 100000 }, { "epoch": 0.02019918415495198, "eval_calculated_loss": 8.426875114440918, "eval_loss": 2.15352201461792, "eval_perplexity": 4568.202655159141, "eval_runtime": 115.1195, "eval_samples_per_second": 8.669, "eval_steps_per_second": 2.172, "step": 100000 }, { "epoch": 0.020219383339106934, "grad_norm": 15.366546630859375, "learning_rate": 1.9595620249528704e-05, "loss": 1.9117, "step": 100100 }, { "epoch": 0.020239582523261884, "grad_norm": 14.730748176574707, "learning_rate": 1.9595216265682403e-05, "loss": 2.0346, "step": 100200 }, { "epoch": 0.020259781707416838, "grad_norm": 10.821325302124023, "learning_rate": 1.95948122818361e-05, "loss": 1.9368, "step": 100300 }, { "epoch": 0.020279980891571788, "grad_norm": 12.1116943359375, "learning_rate": 1.9594408297989798e-05, "loss": 2.0091, "step": 100400 }, { "epoch": 0.020300180075726742, "grad_norm": 7.93390417098999, "learning_rate": 1.9594004314143497e-05, "loss": 1.9051, "step": 100500 }, { "epoch": 0.020320379259881692, "grad_norm": 8.476318359375, "learning_rate": 1.9593600330297193e-05, "loss": 1.9237, "step": 100600 }, { "epoch": 0.020340578444036646, "grad_norm": 12.024627685546875, "learning_rate": 1.9593196346450895e-05, "loss": 1.9197, "step": 100700 }, { "epoch": 0.020360777628191596, "grad_norm": 18.548677444458008, "learning_rate": 1.959279236260459e-05, "loss": 1.9236, "step": 100800 }, { "epoch": 0.02038097681234655, "grad_norm": 16.26833724975586, "learning_rate": 1.959238837875829e-05, "loss": 1.897, "step": 100900 }, { "epoch": 0.0204011759965015, "grad_norm": 13.273887634277344, "learning_rate": 1.9591984394911986e-05, "loss": 1.8477, "step": 101000 }, { "epoch": 0.0204011759965015, "eval_calculated_loss": 8.61137580871582, "eval_loss": 2.1746397018432617, "eval_perplexity": 5493.80190125248, "eval_runtime": 115.8088, "eval_samples_per_second": 8.618, "eval_steps_per_second": 2.159, "step": 101000 }, { "epoch": 0.020421375180656454, "grad_norm": 12.98752212524414, "learning_rate": 1.9591580411065685e-05, "loss": 1.834, "step": 101100 }, { "epoch": 0.020441574364811404, "grad_norm": 11.131392478942871, "learning_rate": 1.9591176427219384e-05, "loss": 1.9241, "step": 101200 }, { "epoch": 0.020461773548966358, "grad_norm": 7.571663856506348, "learning_rate": 1.959077244337308e-05, "loss": 1.9544, "step": 101300 }, { "epoch": 0.020481972733121308, "grad_norm": 10.38593864440918, "learning_rate": 1.959036845952678e-05, "loss": 1.9507, "step": 101400 }, { "epoch": 0.02050217191727626, "grad_norm": 9.369426727294922, "learning_rate": 1.9589964475680478e-05, "loss": 1.9037, "step": 101500 }, { "epoch": 0.02052237110143121, "grad_norm": 10.060220718383789, "learning_rate": 1.9589560491834174e-05, "loss": 2.0535, "step": 101600 }, { "epoch": 0.020542570285586165, "grad_norm": 9.950952529907227, "learning_rate": 1.9589156507987873e-05, "loss": 1.8642, "step": 101700 }, { "epoch": 0.020562769469741116, "grad_norm": 6.597073554992676, "learning_rate": 1.9588752524141572e-05, "loss": 1.9301, "step": 101800 }, { "epoch": 0.02058296865389607, "grad_norm": 6.954657077789307, "learning_rate": 1.958834854029527e-05, "loss": 1.9813, "step": 101900 }, { "epoch": 0.02060316783805102, "grad_norm": 14.86670207977295, "learning_rate": 1.9587944556448967e-05, "loss": 1.9248, "step": 102000 }, { "epoch": 0.02060316783805102, "eval_calculated_loss": 8.519978523254395, "eval_loss": 2.1574668884277344, "eval_perplexity": 5013.946072394294, "eval_runtime": 115.2984, "eval_samples_per_second": 8.656, "eval_steps_per_second": 2.168, "step": 102000 }, { "epoch": 0.020623367022205973, "grad_norm": 10.703827857971191, "learning_rate": 1.9587540572602666e-05, "loss": 1.9961, "step": 102100 }, { "epoch": 0.020643566206360923, "grad_norm": 12.576465606689453, "learning_rate": 1.9587136588756365e-05, "loss": 1.9797, "step": 102200 }, { "epoch": 0.020663765390515877, "grad_norm": 6.370522975921631, "learning_rate": 1.958673260491006e-05, "loss": 1.8488, "step": 102300 }, { "epoch": 0.020683964574670827, "grad_norm": 11.042531967163086, "learning_rate": 1.958632862106376e-05, "loss": 1.9355, "step": 102400 }, { "epoch": 0.02070416375882578, "grad_norm": 11.385001182556152, "learning_rate": 1.9585924637217456e-05, "loss": 1.8879, "step": 102500 }, { "epoch": 0.02072436294298073, "grad_norm": 11.380036354064941, "learning_rate": 1.9585520653371155e-05, "loss": 1.8695, "step": 102600 }, { "epoch": 0.020744562127135685, "grad_norm": 19.433185577392578, "learning_rate": 1.9585116669524854e-05, "loss": 1.929, "step": 102700 }, { "epoch": 0.020764761311290635, "grad_norm": 8.307475090026855, "learning_rate": 1.9584712685678553e-05, "loss": 1.9598, "step": 102800 }, { "epoch": 0.02078496049544559, "grad_norm": 11.513336181640625, "learning_rate": 1.9584308701832252e-05, "loss": 1.9531, "step": 102900 }, { "epoch": 0.020805159679600543, "grad_norm": 9.98449993133545, "learning_rate": 1.9583904717985948e-05, "loss": 1.9426, "step": 103000 }, { "epoch": 0.020805159679600543, "eval_calculated_loss": 8.633782386779785, "eval_loss": 2.1564900875091553, "eval_perplexity": 5618.288655286782, "eval_runtime": 114.8832, "eval_samples_per_second": 8.687, "eval_steps_per_second": 2.176, "step": 103000 }, { "epoch": 0.020825358863755493, "grad_norm": 6.974603176116943, "learning_rate": 1.9583500734139647e-05, "loss": 2.0107, "step": 103100 }, { "epoch": 0.020845558047910447, "grad_norm": 4.853410243988037, "learning_rate": 1.9583096750293343e-05, "loss": 1.8965, "step": 103200 }, { "epoch": 0.020865757232065397, "grad_norm": 11.729667663574219, "learning_rate": 1.9582692766447042e-05, "loss": 1.9651, "step": 103300 }, { "epoch": 0.02088595641622035, "grad_norm": 12.45013427734375, "learning_rate": 1.958228878260074e-05, "loss": 2.0069, "step": 103400 }, { "epoch": 0.0209061556003753, "grad_norm": 6.452690601348877, "learning_rate": 1.9581884798754437e-05, "loss": 1.9377, "step": 103500 }, { "epoch": 0.020926354784530254, "grad_norm": 11.824859619140625, "learning_rate": 1.9581480814908136e-05, "loss": 1.9619, "step": 103600 }, { "epoch": 0.020946553968685205, "grad_norm": 13.698244094848633, "learning_rate": 1.9581076831061835e-05, "loss": 1.8635, "step": 103700 }, { "epoch": 0.02096675315284016, "grad_norm": 4.6446733474731445, "learning_rate": 1.9580672847215534e-05, "loss": 1.9251, "step": 103800 }, { "epoch": 0.02098695233699511, "grad_norm": 12.618769645690918, "learning_rate": 1.9580268863369233e-05, "loss": 1.9276, "step": 103900 }, { "epoch": 0.021007151521150062, "grad_norm": 10.287954330444336, "learning_rate": 1.957986487952293e-05, "loss": 1.8688, "step": 104000 }, { "epoch": 0.021007151521150062, "eval_calculated_loss": 8.702544212341309, "eval_loss": 2.1707711219787598, "eval_perplexity": 6018.204345569905, "eval_runtime": 114.5087, "eval_samples_per_second": 8.715, "eval_steps_per_second": 2.183, "step": 104000 }, { "epoch": 0.021027350705305013, "grad_norm": 6.8389129638671875, "learning_rate": 1.9579460895676628e-05, "loss": 1.9528, "step": 104100 }, { "epoch": 0.021047549889459966, "grad_norm": 8.774535179138184, "learning_rate": 1.9579056911830324e-05, "loss": 1.9057, "step": 104200 }, { "epoch": 0.021067749073614916, "grad_norm": 8.196966171264648, "learning_rate": 1.9578652927984023e-05, "loss": 1.9315, "step": 104300 }, { "epoch": 0.02108794825776987, "grad_norm": 9.788582801818848, "learning_rate": 1.9578248944137722e-05, "loss": 1.9358, "step": 104400 }, { "epoch": 0.02110814744192482, "grad_norm": 7.050076961517334, "learning_rate": 1.9577844960291418e-05, "loss": 1.9523, "step": 104500 }, { "epoch": 0.021128346626079774, "grad_norm": 6.221020698547363, "learning_rate": 1.9577440976445117e-05, "loss": 1.8748, "step": 104600 }, { "epoch": 0.021148545810234724, "grad_norm": 10.378198623657227, "learning_rate": 1.9577036992598816e-05, "loss": 1.8663, "step": 104700 }, { "epoch": 0.021168744994389678, "grad_norm": 12.603347778320312, "learning_rate": 1.9576633008752515e-05, "loss": 1.9006, "step": 104800 }, { "epoch": 0.021188944178544628, "grad_norm": 9.972136497497559, "learning_rate": 1.957622902490621e-05, "loss": 1.8865, "step": 104900 }, { "epoch": 0.021209143362699582, "grad_norm": 11.163573265075684, "learning_rate": 1.957582504105991e-05, "loss": 1.9027, "step": 105000 }, { "epoch": 0.021209143362699582, "eval_calculated_loss": 8.447463035583496, "eval_loss": 2.156996011734009, "eval_perplexity": 4663.227274447137, "eval_runtime": 114.4683, "eval_samples_per_second": 8.719, "eval_steps_per_second": 2.184, "step": 105000 }, { "epoch": 0.021229342546854532, "grad_norm": 11.195130348205566, "learning_rate": 1.957542105721361e-05, "loss": 1.9658, "step": 105100 }, { "epoch": 0.021249541731009486, "grad_norm": 9.68360710144043, "learning_rate": 1.9575017073367305e-05, "loss": 1.8433, "step": 105200 }, { "epoch": 0.021269740915164436, "grad_norm": 7.2122955322265625, "learning_rate": 1.9574613089521004e-05, "loss": 1.9598, "step": 105300 }, { "epoch": 0.02128994009931939, "grad_norm": 12.781706809997559, "learning_rate": 1.9574209105674703e-05, "loss": 1.9887, "step": 105400 }, { "epoch": 0.02131013928347434, "grad_norm": 6.93292760848999, "learning_rate": 1.95738051218284e-05, "loss": 1.9029, "step": 105500 }, { "epoch": 0.021330338467629294, "grad_norm": 13.795820236206055, "learning_rate": 1.9573401137982098e-05, "loss": 1.8948, "step": 105600 }, { "epoch": 0.021350537651784244, "grad_norm": 8.724305152893066, "learning_rate": 1.9572997154135794e-05, "loss": 1.9026, "step": 105700 }, { "epoch": 0.021370736835939198, "grad_norm": 12.092317581176758, "learning_rate": 1.9572593170289493e-05, "loss": 2.0564, "step": 105800 }, { "epoch": 0.021390936020094148, "grad_norm": 7.181171417236328, "learning_rate": 1.9572189186443192e-05, "loss": 1.8869, "step": 105900 }, { "epoch": 0.0214111352042491, "grad_norm": 7.489689350128174, "learning_rate": 1.957178520259689e-05, "loss": 1.919, "step": 106000 }, { "epoch": 0.0214111352042491, "eval_calculated_loss": 8.688108444213867, "eval_loss": 2.164332866668701, "eval_perplexity": 5931.951007553572, "eval_runtime": 115.5815, "eval_samples_per_second": 8.635, "eval_steps_per_second": 2.163, "step": 106000 }, { "epoch": 0.021431334388404052, "grad_norm": 9.6091947555542, "learning_rate": 1.957138121875059e-05, "loss": 1.9436, "step": 106100 }, { "epoch": 0.021451533572559005, "grad_norm": 23.19289207458496, "learning_rate": 1.9570977234904286e-05, "loss": 1.888, "step": 106200 }, { "epoch": 0.021471732756713956, "grad_norm": 9.486663818359375, "learning_rate": 1.9570573251057985e-05, "loss": 1.942, "step": 106300 }, { "epoch": 0.02149193194086891, "grad_norm": 11.705130577087402, "learning_rate": 1.9570169267211684e-05, "loss": 1.9023, "step": 106400 }, { "epoch": 0.02151213112502386, "grad_norm": 11.115427017211914, "learning_rate": 1.956976528336538e-05, "loss": 1.8221, "step": 106500 }, { "epoch": 0.021532330309178813, "grad_norm": 12.137297630310059, "learning_rate": 1.956936129951908e-05, "loss": 1.9046, "step": 106600 }, { "epoch": 0.021552529493333764, "grad_norm": 9.117584228515625, "learning_rate": 1.9568957315672775e-05, "loss": 1.872, "step": 106700 }, { "epoch": 0.021572728677488717, "grad_norm": 12.303617477416992, "learning_rate": 1.9568553331826474e-05, "loss": 1.9449, "step": 106800 }, { "epoch": 0.021592927861643667, "grad_norm": 5.270529747009277, "learning_rate": 1.9568149347980173e-05, "loss": 1.9034, "step": 106900 }, { "epoch": 0.02161312704579862, "grad_norm": 12.26543140411377, "learning_rate": 1.9567745364133872e-05, "loss": 1.973, "step": 107000 }, { "epoch": 0.02161312704579862, "eval_calculated_loss": 8.602093696594238, "eval_loss": 2.1657047271728516, "eval_perplexity": 5443.04375188093, "eval_runtime": 114.6435, "eval_samples_per_second": 8.705, "eval_steps_per_second": 2.181, "step": 107000 }, { "epoch": 0.02163332622995357, "grad_norm": 10.698525428771973, "learning_rate": 1.956734138028757e-05, "loss": 1.9871, "step": 107100 }, { "epoch": 0.021653525414108525, "grad_norm": 11.58796501159668, "learning_rate": 1.9566937396441267e-05, "loss": 1.9264, "step": 107200 }, { "epoch": 0.021673724598263475, "grad_norm": 11.356884002685547, "learning_rate": 1.9566533412594966e-05, "loss": 1.8557, "step": 107300 }, { "epoch": 0.02169392378241843, "grad_norm": 12.358782768249512, "learning_rate": 1.9566129428748662e-05, "loss": 1.9659, "step": 107400 }, { "epoch": 0.02171412296657338, "grad_norm": 5.522743225097656, "learning_rate": 1.956572544490236e-05, "loss": 1.8595, "step": 107500 }, { "epoch": 0.021734322150728333, "grad_norm": 11.092134475708008, "learning_rate": 1.956532146105606e-05, "loss": 1.9028, "step": 107600 }, { "epoch": 0.021754521334883283, "grad_norm": 9.81352424621582, "learning_rate": 1.9564917477209756e-05, "loss": 1.9089, "step": 107700 }, { "epoch": 0.021774720519038237, "grad_norm": 12.729276657104492, "learning_rate": 1.9564513493363455e-05, "loss": 1.9046, "step": 107800 }, { "epoch": 0.021794919703193187, "grad_norm": 12.502304077148438, "learning_rate": 1.9564109509517154e-05, "loss": 1.9146, "step": 107900 }, { "epoch": 0.02181511888734814, "grad_norm": 13.263702392578125, "learning_rate": 1.9563705525670853e-05, "loss": 1.9606, "step": 108000 }, { "epoch": 0.02181511888734814, "eval_calculated_loss": 8.717714309692383, "eval_loss": 2.1692049503326416, "eval_perplexity": 6110.197096667301, "eval_runtime": 115.7666, "eval_samples_per_second": 8.621, "eval_steps_per_second": 2.16, "step": 108000 }, { "epoch": 0.02183531807150309, "grad_norm": 20.95808219909668, "learning_rate": 1.956330154182455e-05, "loss": 1.9193, "step": 108100 }, { "epoch": 0.021855517255658045, "grad_norm": 6.982318878173828, "learning_rate": 1.956289755797825e-05, "loss": 1.947, "step": 108200 }, { "epoch": 0.021875716439812995, "grad_norm": 10.291969299316406, "learning_rate": 1.9562493574131947e-05, "loss": 1.9545, "step": 108300 }, { "epoch": 0.02189591562396795, "grad_norm": 8.480717658996582, "learning_rate": 1.9562089590285643e-05, "loss": 1.9283, "step": 108400 }, { "epoch": 0.0219161148081229, "grad_norm": 7.270088195800781, "learning_rate": 1.9561685606439342e-05, "loss": 1.9349, "step": 108500 }, { "epoch": 0.021936313992277853, "grad_norm": 12.933526039123535, "learning_rate": 1.956128162259304e-05, "loss": 1.9615, "step": 108600 }, { "epoch": 0.021956513176432803, "grad_norm": 7.565719127655029, "learning_rate": 1.9560877638746737e-05, "loss": 1.9642, "step": 108700 }, { "epoch": 0.021976712360587757, "grad_norm": 11.675732612609863, "learning_rate": 1.9560473654900436e-05, "loss": 1.9312, "step": 108800 }, { "epoch": 0.021996911544742707, "grad_norm": 14.90842342376709, "learning_rate": 1.9560069671054132e-05, "loss": 2.011, "step": 108900 }, { "epoch": 0.02201711072889766, "grad_norm": 9.105809211730957, "learning_rate": 1.9559665687207835e-05, "loss": 1.8735, "step": 109000 }, { "epoch": 0.02201711072889766, "eval_calculated_loss": 8.79406452178955, "eval_loss": 2.179229736328125, "eval_perplexity": 6594.983226549836, "eval_runtime": 114.4692, "eval_samples_per_second": 8.719, "eval_steps_per_second": 2.184, "step": 109000 }, { "epoch": 0.02203730991305261, "grad_norm": 12.271614074707031, "learning_rate": 1.955926170336153e-05, "loss": 1.871, "step": 109100 }, { "epoch": 0.022057509097207564, "grad_norm": 8.161776542663574, "learning_rate": 1.955885771951523e-05, "loss": 1.871, "step": 109200 }, { "epoch": 0.022077708281362515, "grad_norm": 9.014863967895508, "learning_rate": 1.955845373566893e-05, "loss": 1.8192, "step": 109300 }, { "epoch": 0.02209790746551747, "grad_norm": 12.581456184387207, "learning_rate": 1.9558049751822624e-05, "loss": 1.9175, "step": 109400 }, { "epoch": 0.02211810664967242, "grad_norm": 6.990269184112549, "learning_rate": 1.9557645767976323e-05, "loss": 1.927, "step": 109500 }, { "epoch": 0.022138305833827372, "grad_norm": 10.000133514404297, "learning_rate": 1.9557241784130023e-05, "loss": 1.9283, "step": 109600 }, { "epoch": 0.022158505017982322, "grad_norm": 8.315985679626465, "learning_rate": 1.955683780028372e-05, "loss": 1.8145, "step": 109700 }, { "epoch": 0.022178704202137276, "grad_norm": 12.183423042297363, "learning_rate": 1.9556433816437417e-05, "loss": 1.887, "step": 109800 }, { "epoch": 0.022198903386292226, "grad_norm": 6.160093307495117, "learning_rate": 1.9556029832591113e-05, "loss": 1.9775, "step": 109900 }, { "epoch": 0.02221910257044718, "grad_norm": 9.388866424560547, "learning_rate": 1.9555625848744816e-05, "loss": 1.927, "step": 110000 }, { "epoch": 0.02221910257044718, "eval_calculated_loss": 8.743273735046387, "eval_loss": 2.1707398891448975, "eval_perplexity": 6268.3831843959515, "eval_runtime": 116.293, "eval_samples_per_second": 8.582, "eval_steps_per_second": 2.15, "step": 110000 }, { "epoch": 0.02223930175460213, "grad_norm": 11.805925369262695, "learning_rate": 1.955522186489851e-05, "loss": 1.835, "step": 110100 }, { "epoch": 0.022259500938757084, "grad_norm": 11.094228744506836, "learning_rate": 1.955481788105221e-05, "loss": 1.9783, "step": 110200 }, { "epoch": 0.022279700122912034, "grad_norm": 12.70694351196289, "learning_rate": 1.955441389720591e-05, "loss": 1.9014, "step": 110300 }, { "epoch": 0.022299899307066988, "grad_norm": 10.158271789550781, "learning_rate": 1.9554009913359605e-05, "loss": 1.9868, "step": 110400 }, { "epoch": 0.022320098491221938, "grad_norm": 10.783295631408691, "learning_rate": 1.9553605929513305e-05, "loss": 1.9207, "step": 110500 }, { "epoch": 0.022340297675376892, "grad_norm": 15.285638809204102, "learning_rate": 1.9553201945667e-05, "loss": 1.8804, "step": 110600 }, { "epoch": 0.022360496859531842, "grad_norm": 8.995826721191406, "learning_rate": 1.95527979618207e-05, "loss": 1.9939, "step": 110700 }, { "epoch": 0.022380696043686796, "grad_norm": 8.730541229248047, "learning_rate": 1.95523939779744e-05, "loss": 1.913, "step": 110800 }, { "epoch": 0.022400895227841746, "grad_norm": 14.748241424560547, "learning_rate": 1.9551989994128094e-05, "loss": 1.9213, "step": 110900 }, { "epoch": 0.0224210944119967, "grad_norm": 9.831111907958984, "learning_rate": 1.9551586010281793e-05, "loss": 1.9137, "step": 111000 }, { "epoch": 0.0224210944119967, "eval_calculated_loss": 8.779928207397461, "eval_loss": 2.1639797687530518, "eval_perplexity": 6502.410331628701, "eval_runtime": 114.2135, "eval_samples_per_second": 8.738, "eval_steps_per_second": 2.189, "step": 111000 }, { "epoch": 0.02244129359615165, "grad_norm": 10.733758926391602, "learning_rate": 1.9551182026435493e-05, "loss": 1.9686, "step": 111100 }, { "epoch": 0.022461492780306604, "grad_norm": 9.120345115661621, "learning_rate": 1.9550778042589192e-05, "loss": 1.9146, "step": 111200 }, { "epoch": 0.022481691964461554, "grad_norm": 11.018587112426758, "learning_rate": 1.955037405874289e-05, "loss": 1.8883, "step": 111300 }, { "epoch": 0.022501891148616508, "grad_norm": 8.589083671569824, "learning_rate": 1.9549970074896587e-05, "loss": 1.8545, "step": 111400 }, { "epoch": 0.022522090332771458, "grad_norm": 9.234447479248047, "learning_rate": 1.9549566091050286e-05, "loss": 1.9447, "step": 111500 }, { "epoch": 0.02254228951692641, "grad_norm": 14.499650001525879, "learning_rate": 1.954916210720398e-05, "loss": 1.9092, "step": 111600 }, { "epoch": 0.02256248870108136, "grad_norm": 7.079584121704102, "learning_rate": 1.954875812335768e-05, "loss": 1.9861, "step": 111700 }, { "epoch": 0.022582687885236315, "grad_norm": 13.82696533203125, "learning_rate": 1.954835413951138e-05, "loss": 1.8469, "step": 111800 }, { "epoch": 0.022602887069391266, "grad_norm": 10.191110610961914, "learning_rate": 1.9547950155665075e-05, "loss": 1.9196, "step": 111900 }, { "epoch": 0.02262308625354622, "grad_norm": 7.0008158683776855, "learning_rate": 1.9547546171818775e-05, "loss": 1.9384, "step": 112000 }, { "epoch": 0.02262308625354622, "eval_calculated_loss": 8.771933555603027, "eval_loss": 2.169485569000244, "eval_perplexity": 6450.633071561352, "eval_runtime": 115.3584, "eval_samples_per_second": 8.651, "eval_steps_per_second": 2.167, "step": 112000 }, { "epoch": 0.02264328543770117, "grad_norm": 14.61336612701416, "learning_rate": 1.9547142187972474e-05, "loss": 2.0946, "step": 112100 }, { "epoch": 0.022663484621856123, "grad_norm": 9.58180046081543, "learning_rate": 1.9546738204126173e-05, "loss": 1.9599, "step": 112200 }, { "epoch": 0.022683683806011073, "grad_norm": 11.260405540466309, "learning_rate": 1.954633422027987e-05, "loss": 1.9446, "step": 112300 }, { "epoch": 0.022703882990166027, "grad_norm": 14.126409530639648, "learning_rate": 1.9545930236433568e-05, "loss": 1.9076, "step": 112400 }, { "epoch": 0.02272408217432098, "grad_norm": 10.310724258422852, "learning_rate": 1.9545526252587267e-05, "loss": 1.9408, "step": 112500 }, { "epoch": 0.02274428135847593, "grad_norm": 12.03943157196045, "learning_rate": 1.9545122268740963e-05, "loss": 1.9152, "step": 112600 }, { "epoch": 0.022764480542630885, "grad_norm": 6.887315273284912, "learning_rate": 1.9544718284894662e-05, "loss": 1.8713, "step": 112700 }, { "epoch": 0.022784679726785835, "grad_norm": 10.992171287536621, "learning_rate": 1.954431430104836e-05, "loss": 1.9988, "step": 112800 }, { "epoch": 0.02280487891094079, "grad_norm": 11.574909210205078, "learning_rate": 1.9543910317202057e-05, "loss": 1.9534, "step": 112900 }, { "epoch": 0.02282507809509574, "grad_norm": 13.597509384155273, "learning_rate": 1.9543506333355756e-05, "loss": 1.9088, "step": 113000 }, { "epoch": 0.02282507809509574, "eval_calculated_loss": 8.623072624206543, "eval_loss": 2.1527295112609863, "eval_perplexity": 5558.439176629256, "eval_runtime": 115.7416, "eval_samples_per_second": 8.623, "eval_steps_per_second": 2.16, "step": 113000 }, { "epoch": 0.022845277279250693, "grad_norm": 11.889699935913086, "learning_rate": 1.9543102349509455e-05, "loss": 1.9119, "step": 113100 }, { "epoch": 0.022865476463405643, "grad_norm": 17.05116844177246, "learning_rate": 1.9542698365663154e-05, "loss": 1.8209, "step": 113200 }, { "epoch": 0.022885675647560597, "grad_norm": 5.74518346786499, "learning_rate": 1.954229438181685e-05, "loss": 1.9137, "step": 113300 }, { "epoch": 0.022905874831715547, "grad_norm": 5.76929235458374, "learning_rate": 1.954189039797055e-05, "loss": 1.9029, "step": 113400 }, { "epoch": 0.0229260740158705, "grad_norm": 11.674982070922852, "learning_rate": 1.9541486414124248e-05, "loss": 1.8761, "step": 113500 }, { "epoch": 0.02294627320002545, "grad_norm": 10.589188575744629, "learning_rate": 1.9541082430277944e-05, "loss": 1.8674, "step": 113600 }, { "epoch": 0.022966472384180404, "grad_norm": 9.295524597167969, "learning_rate": 1.9540678446431643e-05, "loss": 1.9066, "step": 113700 }, { "epoch": 0.022986671568335355, "grad_norm": 4.656947612762451, "learning_rate": 1.954027446258534e-05, "loss": 1.8938, "step": 113800 }, { "epoch": 0.02300687075249031, "grad_norm": 9.981327056884766, "learning_rate": 1.9539870478739038e-05, "loss": 1.944, "step": 113900 }, { "epoch": 0.02302706993664526, "grad_norm": 5.600552082061768, "learning_rate": 1.9539466494892737e-05, "loss": 1.9571, "step": 114000 }, { "epoch": 0.02302706993664526, "eval_calculated_loss": 8.58819580078125, "eval_loss": 2.156484603881836, "eval_perplexity": 5367.920136186506, "eval_runtime": 115.722, "eval_samples_per_second": 8.624, "eval_steps_per_second": 2.16, "step": 114000 }, { "epoch": 0.023047269120800212, "grad_norm": 9.208000183105469, "learning_rate": 1.9539062511046433e-05, "loss": 1.8801, "step": 114100 }, { "epoch": 0.023067468304955163, "grad_norm": 9.087783813476562, "learning_rate": 1.9538658527200135e-05, "loss": 1.8968, "step": 114200 }, { "epoch": 0.023087667489110116, "grad_norm": 7.924561023712158, "learning_rate": 1.953825454335383e-05, "loss": 1.9456, "step": 114300 }, { "epoch": 0.023107866673265066, "grad_norm": 9.197439193725586, "learning_rate": 1.953785055950753e-05, "loss": 1.9153, "step": 114400 }, { "epoch": 0.02312806585742002, "grad_norm": 10.417551040649414, "learning_rate": 1.953744657566123e-05, "loss": 1.9222, "step": 114500 }, { "epoch": 0.02314826504157497, "grad_norm": 11.022684097290039, "learning_rate": 1.9537042591814925e-05, "loss": 1.881, "step": 114600 }, { "epoch": 0.023168464225729924, "grad_norm": 5.606407165527344, "learning_rate": 1.9536638607968624e-05, "loss": 1.8814, "step": 114700 }, { "epoch": 0.023188663409884874, "grad_norm": 5.729255199432373, "learning_rate": 1.953623462412232e-05, "loss": 1.999, "step": 114800 }, { "epoch": 0.023208862594039828, "grad_norm": 9.581171989440918, "learning_rate": 1.953583064027602e-05, "loss": 1.857, "step": 114900 }, { "epoch": 0.023229061778194778, "grad_norm": 13.858840942382812, "learning_rate": 1.9535426656429718e-05, "loss": 1.9399, "step": 115000 }, { "epoch": 0.023229061778194778, "eval_calculated_loss": 8.461751937866211, "eval_loss": 2.1578352451324463, "eval_perplexity": 4730.33800075451, "eval_runtime": 115.6302, "eval_samples_per_second": 8.631, "eval_steps_per_second": 2.162, "step": 115000 }, { "epoch": 0.023249260962349732, "grad_norm": 9.170641899108887, "learning_rate": 1.9535022672583414e-05, "loss": 1.9604, "step": 115100 }, { "epoch": 0.023269460146504682, "grad_norm": 7.1122026443481445, "learning_rate": 1.9534618688737116e-05, "loss": 1.8665, "step": 115200 }, { "epoch": 0.023289659330659636, "grad_norm": 6.043238162994385, "learning_rate": 1.9534214704890812e-05, "loss": 2.0108, "step": 115300 }, { "epoch": 0.023309858514814586, "grad_norm": 13.749519348144531, "learning_rate": 1.953381072104451e-05, "loss": 2.0115, "step": 115400 }, { "epoch": 0.02333005769896954, "grad_norm": 14.469870567321777, "learning_rate": 1.9533406737198207e-05, "loss": 1.9567, "step": 115500 }, { "epoch": 0.02335025688312449, "grad_norm": 9.346266746520996, "learning_rate": 1.9533002753351906e-05, "loss": 1.9354, "step": 115600 }, { "epoch": 0.023370456067279444, "grad_norm": 10.46854305267334, "learning_rate": 1.9532598769505605e-05, "loss": 1.8719, "step": 115700 }, { "epoch": 0.023390655251434394, "grad_norm": 13.285515785217285, "learning_rate": 1.95321947856593e-05, "loss": 1.8901, "step": 115800 }, { "epoch": 0.023410854435589348, "grad_norm": 14.527497291564941, "learning_rate": 1.9531790801813e-05, "loss": 1.8616, "step": 115900 }, { "epoch": 0.023431053619744298, "grad_norm": 9.409163475036621, "learning_rate": 1.95313868179667e-05, "loss": 1.9024, "step": 116000 }, { "epoch": 0.023431053619744298, "eval_calculated_loss": 8.754914283752441, "eval_loss": 2.1713435649871826, "eval_perplexity": 6341.776947247167, "eval_runtime": 115.3729, "eval_samples_per_second": 8.65, "eval_steps_per_second": 2.167, "step": 116000 }, { "epoch": 0.02345125280389925, "grad_norm": 11.064241409301758, "learning_rate": 1.9530982834120395e-05, "loss": 1.8858, "step": 116100 }, { "epoch": 0.023471451988054202, "grad_norm": 7.093085765838623, "learning_rate": 1.9530578850274097e-05, "loss": 1.8807, "step": 116200 }, { "epoch": 0.023491651172209155, "grad_norm": 14.094633102416992, "learning_rate": 1.9530174866427793e-05, "loss": 2.0888, "step": 116300 }, { "epoch": 0.023511850356364106, "grad_norm": 10.236852645874023, "learning_rate": 1.9529770882581492e-05, "loss": 1.9016, "step": 116400 }, { "epoch": 0.02353204954051906, "grad_norm": 9.467764854431152, "learning_rate": 1.9529366898735188e-05, "loss": 1.8511, "step": 116500 }, { "epoch": 0.02355224872467401, "grad_norm": 8.13488483428955, "learning_rate": 1.9528962914888887e-05, "loss": 1.9488, "step": 116600 }, { "epoch": 0.023572447908828963, "grad_norm": 12.796955108642578, "learning_rate": 1.9528558931042586e-05, "loss": 1.9451, "step": 116700 }, { "epoch": 0.023592647092983914, "grad_norm": 10.511743545532227, "learning_rate": 1.9528154947196282e-05, "loss": 1.9253, "step": 116800 }, { "epoch": 0.023612846277138867, "grad_norm": 10.312297821044922, "learning_rate": 1.952775096334998e-05, "loss": 1.9212, "step": 116900 }, { "epoch": 0.023633045461293817, "grad_norm": 5.516084671020508, "learning_rate": 1.952734697950368e-05, "loss": 1.9073, "step": 117000 }, { "epoch": 0.023633045461293817, "eval_calculated_loss": 8.579399108886719, "eval_loss": 2.161792278289795, "eval_perplexity": 5320.90727860682, "eval_runtime": 114.7415, "eval_samples_per_second": 8.698, "eval_steps_per_second": 2.179, "step": 117000 }, { "epoch": 0.02365324464544877, "grad_norm": 15.051648139953613, "learning_rate": 1.9526942995657376e-05, "loss": 1.9399, "step": 117100 }, { "epoch": 0.02367344382960372, "grad_norm": 8.570009231567383, "learning_rate": 1.9526539011811075e-05, "loss": 1.8519, "step": 117200 }, { "epoch": 0.023693643013758675, "grad_norm": 9.521416664123535, "learning_rate": 1.9526135027964774e-05, "loss": 1.9789, "step": 117300 }, { "epoch": 0.023713842197913625, "grad_norm": 16.419347763061523, "learning_rate": 1.9525731044118473e-05, "loss": 1.9598, "step": 117400 }, { "epoch": 0.02373404138206858, "grad_norm": 7.823553562164307, "learning_rate": 1.952532706027217e-05, "loss": 1.9928, "step": 117500 }, { "epoch": 0.02375424056622353, "grad_norm": 9.282140731811523, "learning_rate": 1.9524923076425868e-05, "loss": 1.9174, "step": 117600 }, { "epoch": 0.023774439750378483, "grad_norm": 7.67407751083374, "learning_rate": 1.9524519092579567e-05, "loss": 1.9271, "step": 117700 }, { "epoch": 0.023794638934533433, "grad_norm": 6.843902111053467, "learning_rate": 1.9524115108733263e-05, "loss": 1.9286, "step": 117800 }, { "epoch": 0.023814838118688387, "grad_norm": 14.962915420532227, "learning_rate": 1.9523711124886962e-05, "loss": 1.967, "step": 117900 }, { "epoch": 0.023835037302843337, "grad_norm": 6.138215065002441, "learning_rate": 1.9523307141040658e-05, "loss": 2.0014, "step": 118000 }, { "epoch": 0.023835037302843337, "eval_calculated_loss": 8.555452346801758, "eval_loss": 2.159203290939331, "eval_perplexity": 5195.002302780004, "eval_runtime": 114.8902, "eval_samples_per_second": 8.687, "eval_steps_per_second": 2.176, "step": 118000 }, { "epoch": 0.02385523648699829, "grad_norm": 15.556174278259277, "learning_rate": 1.9522903157194357e-05, "loss": 1.9589, "step": 118100 }, { "epoch": 0.02387543567115324, "grad_norm": 5.683952331542969, "learning_rate": 1.9522499173348056e-05, "loss": 1.8721, "step": 118200 }, { "epoch": 0.023895634855308195, "grad_norm": 12.132006645202637, "learning_rate": 1.9522095189501755e-05, "loss": 1.935, "step": 118300 }, { "epoch": 0.023915834039463145, "grad_norm": 8.775909423828125, "learning_rate": 1.9521691205655454e-05, "loss": 1.9217, "step": 118400 }, { "epoch": 0.0239360332236181, "grad_norm": 8.338315963745117, "learning_rate": 1.952128722180915e-05, "loss": 1.8685, "step": 118500 }, { "epoch": 0.02395623240777305, "grad_norm": 12.862604141235352, "learning_rate": 1.952088323796285e-05, "loss": 1.896, "step": 118600 }, { "epoch": 0.023976431591928003, "grad_norm": 9.901778221130371, "learning_rate": 1.9520479254116545e-05, "loss": 1.9317, "step": 118700 }, { "epoch": 0.023996630776082953, "grad_norm": 7.316587924957275, "learning_rate": 1.9520075270270244e-05, "loss": 1.9007, "step": 118800 }, { "epoch": 0.024016829960237907, "grad_norm": 11.35329532623291, "learning_rate": 1.9519671286423943e-05, "loss": 1.9215, "step": 118900 }, { "epoch": 0.024037029144392857, "grad_norm": 10.924333572387695, "learning_rate": 1.951926730257764e-05, "loss": 1.9284, "step": 119000 }, { "epoch": 0.024037029144392857, "eval_calculated_loss": 8.672449111938477, "eval_loss": 2.1525251865386963, "eval_perplexity": 5839.784134908261, "eval_runtime": 115.9985, "eval_samples_per_second": 8.604, "eval_steps_per_second": 2.155, "step": 119000 }, { "epoch": 0.02405722832854781, "grad_norm": 11.197850227355957, "learning_rate": 1.9518863318731338e-05, "loss": 1.9235, "step": 119100 }, { "epoch": 0.02407742751270276, "grad_norm": 9.137248992919922, "learning_rate": 1.9518459334885037e-05, "loss": 2.013, "step": 119200 }, { "epoch": 0.024097626696857714, "grad_norm": 11.6725435256958, "learning_rate": 1.9518055351038736e-05, "loss": 2.021, "step": 119300 }, { "epoch": 0.024117825881012665, "grad_norm": 13.563395500183105, "learning_rate": 1.9517651367192436e-05, "loss": 1.8816, "step": 119400 }, { "epoch": 0.02413802506516762, "grad_norm": 8.979905128479004, "learning_rate": 1.951724738334613e-05, "loss": 1.9258, "step": 119500 }, { "epoch": 0.02415822424932257, "grad_norm": 14.272371292114258, "learning_rate": 1.951684339949983e-05, "loss": 1.8755, "step": 119600 }, { "epoch": 0.024178423433477522, "grad_norm": 8.281030654907227, "learning_rate": 1.9516439415653526e-05, "loss": 1.9381, "step": 119700 }, { "epoch": 0.024198622617632472, "grad_norm": 6.432582378387451, "learning_rate": 1.9516035431807225e-05, "loss": 1.9184, "step": 119800 }, { "epoch": 0.024218821801787426, "grad_norm": 11.942901611328125, "learning_rate": 1.9515631447960924e-05, "loss": 1.7901, "step": 119900 }, { "epoch": 0.024239020985942376, "grad_norm": 7.5337114334106445, "learning_rate": 1.951522746411462e-05, "loss": 1.949, "step": 120000 }, { "epoch": 0.024239020985942376, "eval_calculated_loss": 8.673171043395996, "eval_loss": 2.148559331893921, "eval_perplexity": 5844.001580950687, "eval_runtime": 116.5053, "eval_samples_per_second": 8.566, "eval_steps_per_second": 2.146, "step": 120000 }, { "epoch": 0.02425922017009733, "grad_norm": 8.514386177062988, "learning_rate": 1.951482348026832e-05, "loss": 1.905, "step": 120100 }, { "epoch": 0.02427941935425228, "grad_norm": 8.621369361877441, "learning_rate": 1.951441949642202e-05, "loss": 1.9164, "step": 120200 }, { "epoch": 0.024299618538407234, "grad_norm": 6.7766337394714355, "learning_rate": 1.9514015512575714e-05, "loss": 1.8348, "step": 120300 }, { "epoch": 0.024319817722562184, "grad_norm": 11.239591598510742, "learning_rate": 1.9513611528729413e-05, "loss": 1.8839, "step": 120400 }, { "epoch": 0.024340016906717138, "grad_norm": 6.962913513183594, "learning_rate": 1.9513207544883112e-05, "loss": 1.8658, "step": 120500 }, { "epoch": 0.024360216090872088, "grad_norm": 8.432662963867188, "learning_rate": 1.951280356103681e-05, "loss": 1.8811, "step": 120600 }, { "epoch": 0.024380415275027042, "grad_norm": 10.365650177001953, "learning_rate": 1.9512399577190507e-05, "loss": 1.9708, "step": 120700 }, { "epoch": 0.024400614459181992, "grad_norm": 6.497849941253662, "learning_rate": 1.9511995593344206e-05, "loss": 1.8707, "step": 120800 }, { "epoch": 0.024420813643336946, "grad_norm": 9.25075912475586, "learning_rate": 1.9511591609497905e-05, "loss": 1.9525, "step": 120900 }, { "epoch": 0.024441012827491896, "grad_norm": 10.062418937683105, "learning_rate": 1.95111876256516e-05, "loss": 1.8481, "step": 121000 }, { "epoch": 0.024441012827491896, "eval_calculated_loss": 8.506257057189941, "eval_loss": 2.166487216949463, "eval_perplexity": 4945.617239462317, "eval_runtime": 115.9364, "eval_samples_per_second": 8.608, "eval_steps_per_second": 2.156, "step": 121000 }, { "epoch": 0.02446121201164685, "grad_norm": 10.04286003112793, "learning_rate": 1.95107836418053e-05, "loss": 1.9121, "step": 121100 }, { "epoch": 0.0244814111958018, "grad_norm": 11.455218315124512, "learning_rate": 1.9510379657958996e-05, "loss": 1.8865, "step": 121200 }, { "epoch": 0.024501610379956754, "grad_norm": 9.906312942504883, "learning_rate": 1.9509975674112695e-05, "loss": 1.8466, "step": 121300 }, { "epoch": 0.024521809564111704, "grad_norm": 12.66657543182373, "learning_rate": 1.9509571690266394e-05, "loss": 1.8964, "step": 121400 }, { "epoch": 0.024542008748266658, "grad_norm": 5.785588264465332, "learning_rate": 1.9509167706420093e-05, "loss": 1.9971, "step": 121500 }, { "epoch": 0.024562207932421608, "grad_norm": 12.177464485168457, "learning_rate": 1.9508763722573793e-05, "loss": 1.8702, "step": 121600 }, { "epoch": 0.02458240711657656, "grad_norm": 7.7578277587890625, "learning_rate": 1.950835973872749e-05, "loss": 1.8694, "step": 121700 }, { "epoch": 0.024602606300731515, "grad_norm": 7.1899333000183105, "learning_rate": 1.9507955754881187e-05, "loss": 1.8801, "step": 121800 }, { "epoch": 0.024622805484886465, "grad_norm": 9.884145736694336, "learning_rate": 1.9507551771034887e-05, "loss": 1.948, "step": 121900 }, { "epoch": 0.02464300466904142, "grad_norm": 6.807309150695801, "learning_rate": 1.9507147787188582e-05, "loss": 1.8488, "step": 122000 }, { "epoch": 0.02464300466904142, "eval_calculated_loss": 8.662817001342773, "eval_loss": 2.1638503074645996, "eval_perplexity": 5783.804721020707, "eval_runtime": 114.8931, "eval_samples_per_second": 8.686, "eval_steps_per_second": 2.176, "step": 122000 }, { "epoch": 0.02466320385319637, "grad_norm": 8.876616477966309, "learning_rate": 1.950674380334228e-05, "loss": 1.8684, "step": 122100 }, { "epoch": 0.024683403037351323, "grad_norm": 13.303281784057617, "learning_rate": 1.9506339819495977e-05, "loss": 1.8548, "step": 122200 }, { "epoch": 0.024703602221506273, "grad_norm": 11.172639846801758, "learning_rate": 1.9505935835649676e-05, "loss": 1.9203, "step": 122300 }, { "epoch": 0.024723801405661227, "grad_norm": 19.405988693237305, "learning_rate": 1.9505531851803375e-05, "loss": 1.8683, "step": 122400 }, { "epoch": 0.024744000589816177, "grad_norm": 9.086236000061035, "learning_rate": 1.9505127867957075e-05, "loss": 1.8445, "step": 122500 }, { "epoch": 0.02476419977397113, "grad_norm": 8.047343254089355, "learning_rate": 1.9504723884110774e-05, "loss": 1.8989, "step": 122600 }, { "epoch": 0.02478439895812608, "grad_norm": 10.778080940246582, "learning_rate": 1.950431990026447e-05, "loss": 1.928, "step": 122700 }, { "epoch": 0.024804598142281035, "grad_norm": 8.663052558898926, "learning_rate": 1.950391591641817e-05, "loss": 1.8551, "step": 122800 }, { "epoch": 0.024824797326435985, "grad_norm": 14.202831268310547, "learning_rate": 1.9503511932571864e-05, "loss": 1.9668, "step": 122900 }, { "epoch": 0.02484499651059094, "grad_norm": 11.088659286499023, "learning_rate": 1.9503107948725563e-05, "loss": 1.8291, "step": 123000 }, { "epoch": 0.02484499651059094, "eval_calculated_loss": 8.534655570983887, "eval_loss": 2.1541154384613037, "eval_perplexity": 5088.0786914329865, "eval_runtime": 114.4645, "eval_samples_per_second": 8.719, "eval_steps_per_second": 2.184, "step": 123000 }, { "epoch": 0.02486519569474589, "grad_norm": 8.151483535766602, "learning_rate": 1.9502703964879263e-05, "loss": 1.9456, "step": 123100 }, { "epoch": 0.024885394878900843, "grad_norm": 9.888504981994629, "learning_rate": 1.950229998103296e-05, "loss": 1.9473, "step": 123200 }, { "epoch": 0.024905594063055793, "grad_norm": 7.148754119873047, "learning_rate": 1.9501895997186657e-05, "loss": 1.9257, "step": 123300 }, { "epoch": 0.024925793247210747, "grad_norm": 5.595034122467041, "learning_rate": 1.9501492013340357e-05, "loss": 1.9315, "step": 123400 }, { "epoch": 0.024945992431365697, "grad_norm": 16.649805068969727, "learning_rate": 1.9501088029494056e-05, "loss": 1.8398, "step": 123500 }, { "epoch": 0.02496619161552065, "grad_norm": 11.18706226348877, "learning_rate": 1.950068404564775e-05, "loss": 1.8451, "step": 123600 }, { "epoch": 0.0249863907996756, "grad_norm": 8.3052978515625, "learning_rate": 1.950028006180145e-05, "loss": 1.9251, "step": 123700 }, { "epoch": 0.025006589983830554, "grad_norm": 9.516090393066406, "learning_rate": 1.949987607795515e-05, "loss": 2.0106, "step": 123800 }, { "epoch": 0.025026789167985505, "grad_norm": 9.698432922363281, "learning_rate": 1.9499472094108845e-05, "loss": 1.929, "step": 123900 }, { "epoch": 0.02504698835214046, "grad_norm": 10.74876594543457, "learning_rate": 1.9499068110262545e-05, "loss": 1.8188, "step": 124000 }, { "epoch": 0.02504698835214046, "eval_calculated_loss": 8.682193756103516, "eval_loss": 2.1670069694519043, "eval_perplexity": 5896.968923499344, "eval_runtime": 115.6129, "eval_samples_per_second": 8.632, "eval_steps_per_second": 2.162, "step": 124000 }, { "epoch": 0.02506718753629541, "grad_norm": 9.93574333190918, "learning_rate": 1.9498664126416244e-05, "loss": 1.9169, "step": 124100 }, { "epoch": 0.025087386720450362, "grad_norm": 12.151721954345703, "learning_rate": 1.949826014256994e-05, "loss": 1.8798, "step": 124200 }, { "epoch": 0.025107585904605313, "grad_norm": 13.664654731750488, "learning_rate": 1.949785615872364e-05, "loss": 1.885, "step": 124300 }, { "epoch": 0.025127785088760266, "grad_norm": 13.734745979309082, "learning_rate": 1.9497452174877334e-05, "loss": 1.9096, "step": 124400 }, { "epoch": 0.025147984272915216, "grad_norm": 9.692512512207031, "learning_rate": 1.9497048191031037e-05, "loss": 1.896, "step": 124500 }, { "epoch": 0.02516818345707017, "grad_norm": 10.294355392456055, "learning_rate": 1.9496644207184733e-05, "loss": 2.0177, "step": 124600 }, { "epoch": 0.02518838264122512, "grad_norm": 13.47653865814209, "learning_rate": 1.9496240223338432e-05, "loss": 1.907, "step": 124700 }, { "epoch": 0.025208581825380074, "grad_norm": 8.108979225158691, "learning_rate": 1.949583623949213e-05, "loss": 1.8698, "step": 124800 }, { "epoch": 0.025228781009535024, "grad_norm": 11.38154125213623, "learning_rate": 1.9495432255645827e-05, "loss": 1.8727, "step": 124900 }, { "epoch": 0.025248980193689978, "grad_norm": 11.382820129394531, "learning_rate": 1.9495028271799526e-05, "loss": 1.9072, "step": 125000 }, { "epoch": 0.025248980193689978, "eval_calculated_loss": 8.67243766784668, "eval_loss": 2.159693956375122, "eval_perplexity": 5839.717304264956, "eval_runtime": 114.3322, "eval_samples_per_second": 8.729, "eval_steps_per_second": 2.187, "step": 125000 }, { "epoch": 0.025269179377844928, "grad_norm": 8.509298324584961, "learning_rate": 1.9494624287953225e-05, "loss": 1.9182, "step": 125100 }, { "epoch": 0.025289378561999882, "grad_norm": 8.024401664733887, "learning_rate": 1.949422030410692e-05, "loss": 1.9703, "step": 125200 }, { "epoch": 0.025309577746154832, "grad_norm": 9.4053955078125, "learning_rate": 1.949381632026062e-05, "loss": 1.9565, "step": 125300 }, { "epoch": 0.025329776930309786, "grad_norm": 10.24446964263916, "learning_rate": 1.9493412336414315e-05, "loss": 1.849, "step": 125400 }, { "epoch": 0.025349976114464736, "grad_norm": 8.640008926391602, "learning_rate": 1.9493008352568015e-05, "loss": 1.9164, "step": 125500 }, { "epoch": 0.02537017529861969, "grad_norm": 7.809895038604736, "learning_rate": 1.9492604368721714e-05, "loss": 1.7625, "step": 125600 }, { "epoch": 0.02539037448277464, "grad_norm": 10.773448944091797, "learning_rate": 1.9492200384875413e-05, "loss": 1.8555, "step": 125700 }, { "epoch": 0.025410573666929594, "grad_norm": 8.98731517791748, "learning_rate": 1.9491796401029112e-05, "loss": 1.8543, "step": 125800 }, { "epoch": 0.025430772851084544, "grad_norm": 6.51132345199585, "learning_rate": 1.9491392417182808e-05, "loss": 1.8838, "step": 125900 }, { "epoch": 0.025450972035239498, "grad_norm": 5.917710781097412, "learning_rate": 1.9490988433336507e-05, "loss": 1.8893, "step": 126000 }, { "epoch": 0.025450972035239498, "eval_calculated_loss": 8.81694221496582, "eval_loss": 2.1471824645996094, "eval_perplexity": 6747.600336568029, "eval_runtime": 115.0912, "eval_samples_per_second": 8.671, "eval_steps_per_second": 2.172, "step": 126000 }, { "epoch": 0.025471171219394448, "grad_norm": 9.476507186889648, "learning_rate": 1.9490584449490203e-05, "loss": 1.9128, "step": 126100 }, { "epoch": 0.0254913704035494, "grad_norm": 7.479661464691162, "learning_rate": 1.94901804656439e-05, "loss": 1.9628, "step": 126200 }, { "epoch": 0.025511569587704352, "grad_norm": 15.233835220336914, "learning_rate": 1.94897764817976e-05, "loss": 1.9177, "step": 126300 }, { "epoch": 0.025531768771859305, "grad_norm": 12.707551956176758, "learning_rate": 1.9489372497951297e-05, "loss": 1.8577, "step": 126400 }, { "epoch": 0.025551967956014256, "grad_norm": 6.40065336227417, "learning_rate": 1.9488968514104996e-05, "loss": 1.8403, "step": 126500 }, { "epoch": 0.02557216714016921, "grad_norm": 11.854357719421387, "learning_rate": 1.9488564530258695e-05, "loss": 1.8952, "step": 126600 }, { "epoch": 0.02559236632432416, "grad_norm": 11.100446701049805, "learning_rate": 1.9488160546412394e-05, "loss": 1.7924, "step": 126700 }, { "epoch": 0.025612565508479113, "grad_norm": 7.243961334228516, "learning_rate": 1.9487756562566093e-05, "loss": 1.9307, "step": 126800 }, { "epoch": 0.025632764692634064, "grad_norm": 7.645382881164551, "learning_rate": 1.948735257871979e-05, "loss": 1.8792, "step": 126900 }, { "epoch": 0.025652963876789017, "grad_norm": 9.23530387878418, "learning_rate": 1.9486948594873488e-05, "loss": 1.8408, "step": 127000 }, { "epoch": 0.025652963876789017, "eval_calculated_loss": 8.699297904968262, "eval_loss": 2.1493964195251465, "eval_perplexity": 5998.699081601197, "eval_runtime": 114.9668, "eval_samples_per_second": 8.681, "eval_steps_per_second": 2.175, "step": 127000 }, { "epoch": 0.025673163060943967, "grad_norm": 9.255523681640625, "learning_rate": 1.9486544611027184e-05, "loss": 1.9022, "step": 127100 }, { "epoch": 0.02569336224509892, "grad_norm": 6.138650894165039, "learning_rate": 1.9486140627180883e-05, "loss": 1.8192, "step": 127200 }, { "epoch": 0.02571356142925387, "grad_norm": 16.76350975036621, "learning_rate": 1.9485736643334582e-05, "loss": 1.9175, "step": 127300 }, { "epoch": 0.025733760613408825, "grad_norm": 13.224411010742188, "learning_rate": 1.9485332659488278e-05, "loss": 1.9721, "step": 127400 }, { "epoch": 0.025753959797563775, "grad_norm": 12.182456016540527, "learning_rate": 1.9484928675641977e-05, "loss": 1.9448, "step": 127500 }, { "epoch": 0.02577415898171873, "grad_norm": 12.673949241638184, "learning_rate": 1.9484524691795676e-05, "loss": 1.9463, "step": 127600 }, { "epoch": 0.02579435816587368, "grad_norm": 10.463150024414062, "learning_rate": 1.9484120707949375e-05, "loss": 1.8443, "step": 127700 }, { "epoch": 0.025814557350028633, "grad_norm": 6.336469650268555, "learning_rate": 1.948371672410307e-05, "loss": 1.9295, "step": 127800 }, { "epoch": 0.025834756534183583, "grad_norm": 12.261128425598145, "learning_rate": 1.948331274025677e-05, "loss": 1.9703, "step": 127900 }, { "epoch": 0.025854955718338537, "grad_norm": 5.6836018562316895, "learning_rate": 1.948290875641047e-05, "loss": 1.8713, "step": 128000 }, { "epoch": 0.025854955718338537, "eval_calculated_loss": 8.659810066223145, "eval_loss": 2.1513895988464355, "eval_perplexity": 5766.43931688611, "eval_runtime": 116.0302, "eval_samples_per_second": 8.601, "eval_steps_per_second": 2.155, "step": 128000 }, { "epoch": 0.025875154902493487, "grad_norm": 7.811306953430176, "learning_rate": 1.9482504772564165e-05, "loss": 1.8253, "step": 128100 }, { "epoch": 0.02589535408664844, "grad_norm": 10.852075576782227, "learning_rate": 1.9482100788717864e-05, "loss": 1.8709, "step": 128200 }, { "epoch": 0.02591555327080339, "grad_norm": 12.01059341430664, "learning_rate": 1.9481696804871563e-05, "loss": 1.9133, "step": 128300 }, { "epoch": 0.025935752454958345, "grad_norm": 5.904566764831543, "learning_rate": 1.948129282102526e-05, "loss": 1.9446, "step": 128400 }, { "epoch": 0.025955951639113295, "grad_norm": 8.99965763092041, "learning_rate": 1.9480888837178958e-05, "loss": 1.8452, "step": 128500 }, { "epoch": 0.02597615082326825, "grad_norm": 9.525450706481934, "learning_rate": 1.9480484853332654e-05, "loss": 1.8428, "step": 128600 }, { "epoch": 0.0259963500074232, "grad_norm": 9.489052772521973, "learning_rate": 1.9480080869486356e-05, "loss": 1.8527, "step": 128700 }, { "epoch": 0.026016549191578153, "grad_norm": 8.845267295837402, "learning_rate": 1.9479676885640052e-05, "loss": 1.9908, "step": 128800 }, { "epoch": 0.026036748375733103, "grad_norm": 7.041219711303711, "learning_rate": 1.947927290179375e-05, "loss": 1.9319, "step": 128900 }, { "epoch": 0.026056947559888056, "grad_norm": 10.48880672454834, "learning_rate": 1.947886891794745e-05, "loss": 1.9374, "step": 129000 }, { "epoch": 0.026056947559888056, "eval_calculated_loss": 8.694695472717285, "eval_loss": 2.1420342922210693, "eval_perplexity": 5971.1539114964335, "eval_runtime": 115.2617, "eval_samples_per_second": 8.659, "eval_steps_per_second": 2.169, "step": 129000 }, { "epoch": 0.026077146744043007, "grad_norm": 7.891235828399658, "learning_rate": 1.9478464934101146e-05, "loss": 1.8571, "step": 129100 }, { "epoch": 0.02609734592819796, "grad_norm": 9.76472282409668, "learning_rate": 1.9478060950254845e-05, "loss": 1.8954, "step": 129200 }, { "epoch": 0.02611754511235291, "grad_norm": 10.8591947555542, "learning_rate": 1.947765696640854e-05, "loss": 1.9169, "step": 129300 }, { "epoch": 0.026137744296507864, "grad_norm": 10.25788688659668, "learning_rate": 1.947725298256224e-05, "loss": 1.8456, "step": 129400 }, { "epoch": 0.026157943480662815, "grad_norm": 6.508614540100098, "learning_rate": 1.947684899871594e-05, "loss": 1.8168, "step": 129500 }, { "epoch": 0.02617814266481777, "grad_norm": 10.583121299743652, "learning_rate": 1.9476445014869635e-05, "loss": 1.9079, "step": 129600 }, { "epoch": 0.02619834184897272, "grad_norm": 9.31694507598877, "learning_rate": 1.9476041031023337e-05, "loss": 1.8725, "step": 129700 }, { "epoch": 0.026218541033127672, "grad_norm": 11.93203067779541, "learning_rate": 1.9475637047177033e-05, "loss": 1.8285, "step": 129800 }, { "epoch": 0.026238740217282622, "grad_norm": 5.208664417266846, "learning_rate": 1.9475233063330732e-05, "loss": 1.9763, "step": 129900 }, { "epoch": 0.026258939401437576, "grad_norm": 13.8796968460083, "learning_rate": 1.947482907948443e-05, "loss": 1.9348, "step": 130000 }, { "epoch": 0.026258939401437576, "eval_calculated_loss": 8.795588493347168, "eval_loss": 2.161043405532837, "eval_perplexity": 6605.041455690949, "eval_runtime": 115.4216, "eval_samples_per_second": 8.647, "eval_steps_per_second": 2.166, "step": 130000 }, { "epoch": 0.026279138585592526, "grad_norm": 7.951629161834717, "learning_rate": 1.9474425095638127e-05, "loss": 1.9663, "step": 130100 }, { "epoch": 0.02629933776974748, "grad_norm": 11.827552795410156, "learning_rate": 1.9474021111791826e-05, "loss": 1.9221, "step": 130200 }, { "epoch": 0.02631953695390243, "grad_norm": 9.267682075500488, "learning_rate": 1.9473617127945522e-05, "loss": 1.8848, "step": 130300 }, { "epoch": 0.026339736138057384, "grad_norm": 11.777013778686523, "learning_rate": 1.947321314409922e-05, "loss": 1.8508, "step": 130400 }, { "epoch": 0.026359935322212334, "grad_norm": 10.622955322265625, "learning_rate": 1.947280916025292e-05, "loss": 1.931, "step": 130500 }, { "epoch": 0.026380134506367288, "grad_norm": 10.582629203796387, "learning_rate": 1.9472405176406616e-05, "loss": 1.9023, "step": 130600 }, { "epoch": 0.026400333690522238, "grad_norm": 9.655990600585938, "learning_rate": 1.947200119256032e-05, "loss": 1.8955, "step": 130700 }, { "epoch": 0.026420532874677192, "grad_norm": 5.00461483001709, "learning_rate": 1.9471597208714014e-05, "loss": 1.8787, "step": 130800 }, { "epoch": 0.026440732058832142, "grad_norm": 8.361371040344238, "learning_rate": 1.9471193224867713e-05, "loss": 1.8432, "step": 130900 }, { "epoch": 0.026460931242987096, "grad_norm": 6.838481426239014, "learning_rate": 1.947078924102141e-05, "loss": 2.0115, "step": 131000 }, { "epoch": 0.026460931242987096, "eval_calculated_loss": 8.700460433959961, "eval_loss": 2.157085418701172, "eval_perplexity": 6005.676798309172, "eval_runtime": 115.2289, "eval_samples_per_second": 8.661, "eval_steps_per_second": 2.17, "step": 131000 }, { "epoch": 0.02648113042714205, "grad_norm": 9.943859100341797, "learning_rate": 1.9470385257175108e-05, "loss": 1.8951, "step": 131100 }, { "epoch": 0.026501329611297, "grad_norm": 10.06558609008789, "learning_rate": 1.9469981273328807e-05, "loss": 1.8724, "step": 131200 }, { "epoch": 0.026521528795451953, "grad_norm": 8.725265502929688, "learning_rate": 1.9469577289482503e-05, "loss": 1.8568, "step": 131300 }, { "epoch": 0.026541727979606904, "grad_norm": 7.407872676849365, "learning_rate": 1.9469173305636202e-05, "loss": 1.864, "step": 131400 }, { "epoch": 0.026561927163761857, "grad_norm": 12.15917682647705, "learning_rate": 1.94687693217899e-05, "loss": 1.9483, "step": 131500 }, { "epoch": 0.026582126347916808, "grad_norm": 8.531879425048828, "learning_rate": 1.9468365337943597e-05, "loss": 1.8526, "step": 131600 }, { "epoch": 0.02660232553207176, "grad_norm": 12.534575462341309, "learning_rate": 1.9467961354097296e-05, "loss": 1.9038, "step": 131700 }, { "epoch": 0.02662252471622671, "grad_norm": 10.177078247070312, "learning_rate": 1.9467557370250995e-05, "loss": 1.8374, "step": 131800 }, { "epoch": 0.026642723900381665, "grad_norm": 11.42243480682373, "learning_rate": 1.9467153386404694e-05, "loss": 1.8919, "step": 131900 }, { "epoch": 0.026662923084536615, "grad_norm": 12.040313720703125, "learning_rate": 1.946674940255839e-05, "loss": 1.8132, "step": 132000 }, { "epoch": 0.026662923084536615, "eval_calculated_loss": 8.793431282043457, "eval_loss": 2.1544861793518066, "eval_perplexity": 6590.80834303656, "eval_runtime": 115.2948, "eval_samples_per_second": 8.656, "eval_steps_per_second": 2.168, "step": 132000 }, { "epoch": 0.02668312226869157, "grad_norm": 8.887813568115234, "learning_rate": 1.946634541871209e-05, "loss": 1.8868, "step": 132100 }, { "epoch": 0.02670332145284652, "grad_norm": 11.211295127868652, "learning_rate": 1.946594143486579e-05, "loss": 1.9127, "step": 132200 }, { "epoch": 0.026723520637001473, "grad_norm": 13.514474868774414, "learning_rate": 1.9465537451019484e-05, "loss": 1.978, "step": 132300 }, { "epoch": 0.026743719821156423, "grad_norm": 7.3022332191467285, "learning_rate": 1.9465133467173183e-05, "loss": 1.8404, "step": 132400 }, { "epoch": 0.026763919005311377, "grad_norm": 5.652934551239014, "learning_rate": 1.9464729483326882e-05, "loss": 1.8957, "step": 132500 }, { "epoch": 0.026784118189466327, "grad_norm": 11.507928848266602, "learning_rate": 1.9464325499480578e-05, "loss": 1.8819, "step": 132600 }, { "epoch": 0.02680431737362128, "grad_norm": 9.268664360046387, "learning_rate": 1.9463921515634277e-05, "loss": 1.9627, "step": 132700 }, { "epoch": 0.02682451655777623, "grad_norm": 10.99177360534668, "learning_rate": 1.9463517531787976e-05, "loss": 1.8827, "step": 132800 }, { "epoch": 0.026844715741931185, "grad_norm": 11.513261795043945, "learning_rate": 1.9463113547941675e-05, "loss": 1.8499, "step": 132900 }, { "epoch": 0.026864914926086135, "grad_norm": 11.597780227661133, "learning_rate": 1.946270956409537e-05, "loss": 1.9372, "step": 133000 }, { "epoch": 0.026864914926086135, "eval_calculated_loss": 8.84054946899414, "eval_loss": 2.1434452533721924, "eval_perplexity": 6908.787762271369, "eval_runtime": 115.9651, "eval_samples_per_second": 8.606, "eval_steps_per_second": 2.156, "step": 133000 }, { "epoch": 0.02688511411024109, "grad_norm": 9.124756813049316, "learning_rate": 1.946230558024907e-05, "loss": 1.8404, "step": 133100 }, { "epoch": 0.02690531329439604, "grad_norm": 7.804595947265625, "learning_rate": 1.946190159640277e-05, "loss": 1.9066, "step": 133200 }, { "epoch": 0.026925512478550993, "grad_norm": 6.118130683898926, "learning_rate": 1.9461497612556465e-05, "loss": 1.9014, "step": 133300 }, { "epoch": 0.026945711662705943, "grad_norm": 6.848764419555664, "learning_rate": 1.9461093628710164e-05, "loss": 1.933, "step": 133400 }, { "epoch": 0.026965910846860897, "grad_norm": 10.223786354064941, "learning_rate": 1.946068964486386e-05, "loss": 1.8996, "step": 133500 }, { "epoch": 0.026986110031015847, "grad_norm": 6.95790433883667, "learning_rate": 1.946028566101756e-05, "loss": 1.85, "step": 133600 }, { "epoch": 0.0270063092151708, "grad_norm": 10.427995681762695, "learning_rate": 1.945988167717126e-05, "loss": 1.8303, "step": 133700 }, { "epoch": 0.02702650839932575, "grad_norm": 13.626007080078125, "learning_rate": 1.9459477693324957e-05, "loss": 1.8762, "step": 133800 }, { "epoch": 0.027046707583480704, "grad_norm": 12.731369018554688, "learning_rate": 1.9459073709478657e-05, "loss": 1.882, "step": 133900 }, { "epoch": 0.027066906767635655, "grad_norm": 10.514532089233398, "learning_rate": 1.9458669725632352e-05, "loss": 1.9362, "step": 134000 }, { "epoch": 0.027066906767635655, "eval_calculated_loss": 8.741464614868164, "eval_loss": 2.151218891143799, "eval_perplexity": 6257.053177654317, "eval_runtime": 113.9637, "eval_samples_per_second": 8.757, "eval_steps_per_second": 2.194, "step": 134000 }, { "epoch": 0.02708710595179061, "grad_norm": 14.691906929016113, "learning_rate": 1.945826574178605e-05, "loss": 1.9086, "step": 134100 }, { "epoch": 0.02710730513594556, "grad_norm": 5.928524971008301, "learning_rate": 1.9457861757939747e-05, "loss": 1.9437, "step": 134200 }, { "epoch": 0.027127504320100512, "grad_norm": 13.480024337768555, "learning_rate": 1.9457457774093446e-05, "loss": 1.8935, "step": 134300 }, { "epoch": 0.027147703504255462, "grad_norm": 8.485321044921875, "learning_rate": 1.9457053790247145e-05, "loss": 1.9441, "step": 134400 }, { "epoch": 0.027167902688410416, "grad_norm": 4.932061672210693, "learning_rate": 1.945664980640084e-05, "loss": 1.8908, "step": 134500 }, { "epoch": 0.027188101872565366, "grad_norm": 6.713749408721924, "learning_rate": 1.945624582255454e-05, "loss": 1.9282, "step": 134600 }, { "epoch": 0.02720830105672032, "grad_norm": 9.505495071411133, "learning_rate": 1.945584183870824e-05, "loss": 1.9081, "step": 134700 }, { "epoch": 0.02722850024087527, "grad_norm": 6.588253021240234, "learning_rate": 1.9455437854861935e-05, "loss": 1.846, "step": 134800 }, { "epoch": 0.027248699425030224, "grad_norm": 9.137443542480469, "learning_rate": 1.9455033871015638e-05, "loss": 1.8636, "step": 134900 }, { "epoch": 0.027268898609185174, "grad_norm": 6.671466827392578, "learning_rate": 1.9454629887169333e-05, "loss": 1.8673, "step": 135000 }, { "epoch": 0.027268898609185174, "eval_calculated_loss": 8.661575317382812, "eval_loss": 2.14789080619812, "eval_perplexity": 5776.6275203007135, "eval_runtime": 115.5733, "eval_samples_per_second": 8.635, "eval_steps_per_second": 2.163, "step": 135000 }, { "epoch": 0.027289097793340128, "grad_norm": 8.771307945251465, "learning_rate": 1.9454225903323033e-05, "loss": 1.9186, "step": 135100 }, { "epoch": 0.027309296977495078, "grad_norm": 8.617480278015137, "learning_rate": 1.945382191947673e-05, "loss": 1.9417, "step": 135200 }, { "epoch": 0.027329496161650032, "grad_norm": 10.633512496948242, "learning_rate": 1.9453417935630427e-05, "loss": 1.9062, "step": 135300 }, { "epoch": 0.027349695345804982, "grad_norm": 12.457283973693848, "learning_rate": 1.9453013951784127e-05, "loss": 1.9329, "step": 135400 }, { "epoch": 0.027369894529959936, "grad_norm": 11.716483116149902, "learning_rate": 1.9452609967937822e-05, "loss": 1.8412, "step": 135500 }, { "epoch": 0.027390093714114886, "grad_norm": 14.707319259643555, "learning_rate": 1.945220598409152e-05, "loss": 1.8184, "step": 135600 }, { "epoch": 0.02741029289826984, "grad_norm": 7.245187759399414, "learning_rate": 1.945180200024522e-05, "loss": 1.9184, "step": 135700 }, { "epoch": 0.02743049208242479, "grad_norm": 8.797752380371094, "learning_rate": 1.9451398016398916e-05, "loss": 1.941, "step": 135800 }, { "epoch": 0.027450691266579744, "grad_norm": 10.341994285583496, "learning_rate": 1.9450994032552615e-05, "loss": 1.8903, "step": 135900 }, { "epoch": 0.027470890450734694, "grad_norm": 11.423296928405762, "learning_rate": 1.9450590048706315e-05, "loss": 1.8537, "step": 136000 }, { "epoch": 0.027470890450734694, "eval_calculated_loss": 8.519530296325684, "eval_loss": 2.1625194549560547, "eval_perplexity": 5011.699190339682, "eval_runtime": 114.8961, "eval_samples_per_second": 8.686, "eval_steps_per_second": 2.176, "step": 136000 }, { "epoch": 0.027491089634889648, "grad_norm": 9.30046272277832, "learning_rate": 1.9450186064860014e-05, "loss": 1.9009, "step": 136100 }, { "epoch": 0.027511288819044598, "grad_norm": 10.331964492797852, "learning_rate": 1.944978208101371e-05, "loss": 1.9033, "step": 136200 }, { "epoch": 0.02753148800319955, "grad_norm": 6.770610809326172, "learning_rate": 1.944937809716741e-05, "loss": 1.8532, "step": 136300 }, { "epoch": 0.027551687187354502, "grad_norm": 11.300037384033203, "learning_rate": 1.9448974113321108e-05, "loss": 1.8625, "step": 136400 }, { "epoch": 0.027571886371509455, "grad_norm": 6.972611904144287, "learning_rate": 1.9448570129474803e-05, "loss": 1.9042, "step": 136500 }, { "epoch": 0.027592085555664406, "grad_norm": 9.159838676452637, "learning_rate": 1.9448166145628503e-05, "loss": 1.8381, "step": 136600 }, { "epoch": 0.02761228473981936, "grad_norm": 8.941594123840332, "learning_rate": 1.94477621617822e-05, "loss": 1.9393, "step": 136700 }, { "epoch": 0.02763248392397431, "grad_norm": 7.790648460388184, "learning_rate": 1.9447358177935897e-05, "loss": 1.9911, "step": 136800 }, { "epoch": 0.027652683108129263, "grad_norm": 6.847464561462402, "learning_rate": 1.9446954194089597e-05, "loss": 1.8617, "step": 136900 }, { "epoch": 0.027672882292284214, "grad_norm": 9.57892894744873, "learning_rate": 1.9446550210243296e-05, "loss": 2.0969, "step": 137000 }, { "epoch": 0.027672882292284214, "eval_calculated_loss": 8.597387313842773, "eval_loss": 2.142209053039551, "eval_perplexity": 5417.486892007622, "eval_runtime": 114.1708, "eval_samples_per_second": 8.741, "eval_steps_per_second": 2.19, "step": 137000 }, { "epoch": 0.027693081476439167, "grad_norm": 12.038168907165527, "learning_rate": 1.9446146226396995e-05, "loss": 1.9523, "step": 137100 }, { "epoch": 0.027713280660594117, "grad_norm": 13.44192886352539, "learning_rate": 1.944574224255069e-05, "loss": 1.9622, "step": 137200 }, { "epoch": 0.02773347984474907, "grad_norm": 11.0913724899292, "learning_rate": 1.944533825870439e-05, "loss": 1.9934, "step": 137300 }, { "epoch": 0.02775367902890402, "grad_norm": 6.566337585449219, "learning_rate": 1.944493427485809e-05, "loss": 1.8989, "step": 137400 }, { "epoch": 0.027773878213058975, "grad_norm": 11.821962356567383, "learning_rate": 1.9444530291011785e-05, "loss": 1.9106, "step": 137500 }, { "epoch": 0.027794077397213925, "grad_norm": 7.424932956695557, "learning_rate": 1.9444126307165484e-05, "loss": 1.9419, "step": 137600 }, { "epoch": 0.02781427658136888, "grad_norm": 11.329866409301758, "learning_rate": 1.944372232331918e-05, "loss": 1.8853, "step": 137700 }, { "epoch": 0.02783447576552383, "grad_norm": 8.478400230407715, "learning_rate": 1.944331833947288e-05, "loss": 1.9338, "step": 137800 }, { "epoch": 0.027854674949678783, "grad_norm": 10.56600570678711, "learning_rate": 1.9442914355626578e-05, "loss": 1.8418, "step": 137900 }, { "epoch": 0.027874874133833733, "grad_norm": 11.560397148132324, "learning_rate": 1.9442510371780277e-05, "loss": 1.918, "step": 138000 }, { "epoch": 0.027874874133833733, "eval_calculated_loss": 8.701836585998535, "eval_loss": 2.1549012660980225, "eval_perplexity": 6013.947212046341, "eval_runtime": 115.3055, "eval_samples_per_second": 8.655, "eval_steps_per_second": 2.168, "step": 138000 }, { "epoch": 0.027895073317988687, "grad_norm": 9.51849365234375, "learning_rate": 1.9442106387933976e-05, "loss": 1.9358, "step": 138100 }, { "epoch": 0.027915272502143637, "grad_norm": 10.731077194213867, "learning_rate": 1.944170240408767e-05, "loss": 1.8707, "step": 138200 }, { "epoch": 0.02793547168629859, "grad_norm": 9.16917610168457, "learning_rate": 1.944129842024137e-05, "loss": 1.8886, "step": 138300 }, { "epoch": 0.02795567087045354, "grad_norm": 8.686304092407227, "learning_rate": 1.9440894436395067e-05, "loss": 1.8549, "step": 138400 }, { "epoch": 0.027975870054608495, "grad_norm": 7.452663898468018, "learning_rate": 1.9440490452548766e-05, "loss": 1.7746, "step": 138500 }, { "epoch": 0.027996069238763445, "grad_norm": 10.865402221679688, "learning_rate": 1.9440086468702465e-05, "loss": 1.8768, "step": 138600 }, { "epoch": 0.0280162684229184, "grad_norm": 8.085914611816406, "learning_rate": 1.943968248485616e-05, "loss": 1.9573, "step": 138700 }, { "epoch": 0.02803646760707335, "grad_norm": 10.896288871765137, "learning_rate": 1.943927850100986e-05, "loss": 1.8913, "step": 138800 }, { "epoch": 0.028056666791228303, "grad_norm": 8.284097671508789, "learning_rate": 1.943887451716356e-05, "loss": 1.8123, "step": 138900 }, { "epoch": 0.028076865975383253, "grad_norm": 7.8322343826293945, "learning_rate": 1.9438470533317258e-05, "loss": 1.9491, "step": 139000 }, { "epoch": 0.028076865975383253, "eval_calculated_loss": 8.829712867736816, "eval_loss": 2.156332015991211, "eval_perplexity": 6834.324178899495, "eval_runtime": 115.7342, "eval_samples_per_second": 8.623, "eval_steps_per_second": 2.16, "step": 139000 }, { "epoch": 0.028097065159538206, "grad_norm": 10.495026588439941, "learning_rate": 1.9438066549470954e-05, "loss": 1.8519, "step": 139100 }, { "epoch": 0.028117264343693157, "grad_norm": 12.149530410766602, "learning_rate": 1.9437662565624653e-05, "loss": 1.8789, "step": 139200 }, { "epoch": 0.02813746352784811, "grad_norm": 5.874023914337158, "learning_rate": 1.9437258581778352e-05, "loss": 1.8863, "step": 139300 }, { "epoch": 0.02815766271200306, "grad_norm": 14.31913948059082, "learning_rate": 1.9436854597932048e-05, "loss": 1.8778, "step": 139400 }, { "epoch": 0.028177861896158014, "grad_norm": 7.667793273925781, "learning_rate": 1.9436450614085747e-05, "loss": 2.0104, "step": 139500 }, { "epoch": 0.028198061080312965, "grad_norm": 14.792919158935547, "learning_rate": 1.9436046630239446e-05, "loss": 1.8735, "step": 139600 }, { "epoch": 0.02821826026446792, "grad_norm": 10.170815467834473, "learning_rate": 1.943564264639314e-05, "loss": 1.83, "step": 139700 }, { "epoch": 0.02823845944862287, "grad_norm": 7.282669544219971, "learning_rate": 1.943523866254684e-05, "loss": 1.861, "step": 139800 }, { "epoch": 0.028258658632777822, "grad_norm": 8.59855842590332, "learning_rate": 1.9434834678700537e-05, "loss": 1.9488, "step": 139900 }, { "epoch": 0.028278857816932772, "grad_norm": 5.438498020172119, "learning_rate": 1.9434430694854236e-05, "loss": 1.9302, "step": 140000 }, { "epoch": 0.028278857816932772, "eval_calculated_loss": 9.01622200012207, "eval_loss": 2.1468141078948975, "eval_perplexity": 8235.604121255708, "eval_runtime": 116.095, "eval_samples_per_second": 8.596, "eval_steps_per_second": 2.153, "step": 140000 }, { "epoch": 0.028299057001087726, "grad_norm": 6.288539409637451, "learning_rate": 1.9434026711007935e-05, "loss": 1.8926, "step": 140100 }, { "epoch": 0.028319256185242676, "grad_norm": 12.223179817199707, "learning_rate": 1.9433622727161634e-05, "loss": 1.8859, "step": 140200 }, { "epoch": 0.02833945536939763, "grad_norm": 10.005512237548828, "learning_rate": 1.9433218743315333e-05, "loss": 1.9192, "step": 140300 }, { "epoch": 0.028359654553552584, "grad_norm": 10.60089111328125, "learning_rate": 1.943281475946903e-05, "loss": 1.8133, "step": 140400 }, { "epoch": 0.028379853737707534, "grad_norm": 5.289458751678467, "learning_rate": 1.9432410775622728e-05, "loss": 1.9473, "step": 140500 }, { "epoch": 0.028400052921862488, "grad_norm": 16.239694595336914, "learning_rate": 1.9432006791776427e-05, "loss": 1.9132, "step": 140600 }, { "epoch": 0.028420252106017438, "grad_norm": 9.06106185913086, "learning_rate": 1.9431602807930123e-05, "loss": 1.9015, "step": 140700 }, { "epoch": 0.02844045129017239, "grad_norm": 15.967580795288086, "learning_rate": 1.9431198824083822e-05, "loss": 1.9833, "step": 140800 }, { "epoch": 0.028460650474327342, "grad_norm": 7.879931926727295, "learning_rate": 1.9430794840237518e-05, "loss": 1.813, "step": 140900 }, { "epoch": 0.028480849658482296, "grad_norm": 11.536020278930664, "learning_rate": 1.9430390856391217e-05, "loss": 1.939, "step": 141000 }, { "epoch": 0.028480849658482296, "eval_calculated_loss": 8.707555770874023, "eval_loss": 2.141162872314453, "eval_perplexity": 6048.440631083087, "eval_runtime": 116.0643, "eval_samples_per_second": 8.599, "eval_steps_per_second": 2.154, "step": 141000 }, { "epoch": 0.028501048842637246, "grad_norm": 6.09442663192749, "learning_rate": 1.9429986872544916e-05, "loss": 1.8241, "step": 141100 }, { "epoch": 0.0285212480267922, "grad_norm": 11.423694610595703, "learning_rate": 1.9429582888698615e-05, "loss": 1.8164, "step": 141200 }, { "epoch": 0.02854144721094715, "grad_norm": 9.979031562805176, "learning_rate": 1.9429178904852314e-05, "loss": 1.9309, "step": 141300 }, { "epoch": 0.028561646395102103, "grad_norm": 14.488486289978027, "learning_rate": 1.942877492100601e-05, "loss": 1.8989, "step": 141400 }, { "epoch": 0.028581845579257054, "grad_norm": 7.749299049377441, "learning_rate": 1.942837093715971e-05, "loss": 1.9172, "step": 141500 }, { "epoch": 0.028602044763412007, "grad_norm": 8.404234886169434, "learning_rate": 1.9427966953313405e-05, "loss": 1.95, "step": 141600 }, { "epoch": 0.028622243947566958, "grad_norm": 8.310734748840332, "learning_rate": 1.9427562969467104e-05, "loss": 1.9582, "step": 141700 }, { "epoch": 0.02864244313172191, "grad_norm": 12.154207229614258, "learning_rate": 1.9427158985620803e-05, "loss": 1.8759, "step": 141800 }, { "epoch": 0.02866264231587686, "grad_norm": 15.374611854553223, "learning_rate": 1.94267550017745e-05, "loss": 1.8734, "step": 141900 }, { "epoch": 0.028682841500031815, "grad_norm": 9.323626518249512, "learning_rate": 1.9426351017928198e-05, "loss": 1.9244, "step": 142000 }, { "epoch": 0.028682841500031815, "eval_calculated_loss": 8.905502319335938, "eval_loss": 2.164216995239258, "eval_perplexity": 7372.427592426938, "eval_runtime": 113.7881, "eval_samples_per_second": 8.771, "eval_steps_per_second": 2.197, "step": 142000 }, { "epoch": 0.028703040684186765, "grad_norm": 12.357133865356445, "learning_rate": 1.9425947034081897e-05, "loss": 1.8948, "step": 142100 }, { "epoch": 0.02872323986834172, "grad_norm": 10.25709056854248, "learning_rate": 1.9425543050235596e-05, "loss": 1.9328, "step": 142200 }, { "epoch": 0.02874343905249667, "grad_norm": 7.621326923370361, "learning_rate": 1.9425139066389295e-05, "loss": 1.9055, "step": 142300 }, { "epoch": 0.028763638236651623, "grad_norm": 7.441956996917725, "learning_rate": 1.942473508254299e-05, "loss": 1.9303, "step": 142400 }, { "epoch": 0.028783837420806573, "grad_norm": 11.477213859558105, "learning_rate": 1.942433109869669e-05, "loss": 1.9084, "step": 142500 }, { "epoch": 0.028804036604961527, "grad_norm": 10.837919235229492, "learning_rate": 1.9423927114850386e-05, "loss": 1.9459, "step": 142600 }, { "epoch": 0.028824235789116477, "grad_norm": 8.61233139038086, "learning_rate": 1.9423523131004085e-05, "loss": 1.8847, "step": 142700 }, { "epoch": 0.02884443497327143, "grad_norm": 10.981130599975586, "learning_rate": 1.9423119147157784e-05, "loss": 2.0019, "step": 142800 }, { "epoch": 0.02886463415742638, "grad_norm": 9.70471477508545, "learning_rate": 1.942271516331148e-05, "loss": 1.8506, "step": 142900 }, { "epoch": 0.028884833341581335, "grad_norm": 11.387918472290039, "learning_rate": 1.942231117946518e-05, "loss": 1.9043, "step": 143000 }, { "epoch": 0.028884833341581335, "eval_calculated_loss": 8.588996887207031, "eval_loss": 2.1535727977752686, "eval_perplexity": 5372.222027005395, "eval_runtime": 117.7382, "eval_samples_per_second": 8.476, "eval_steps_per_second": 2.123, "step": 143000 }, { "epoch": 0.028905032525736285, "grad_norm": 17.570785522460938, "learning_rate": 1.9421907195618878e-05, "loss": 1.9144, "step": 143100 }, { "epoch": 0.02892523170989124, "grad_norm": 5.81889009475708, "learning_rate": 1.9421503211772577e-05, "loss": 1.9531, "step": 143200 }, { "epoch": 0.02894543089404619, "grad_norm": 6.482121467590332, "learning_rate": 1.9421099227926273e-05, "loss": 1.9221, "step": 143300 }, { "epoch": 0.028965630078201143, "grad_norm": 7.926971912384033, "learning_rate": 1.9420695244079972e-05, "loss": 1.9126, "step": 143400 }, { "epoch": 0.028985829262356093, "grad_norm": 6.403306007385254, "learning_rate": 1.942029126023367e-05, "loss": 1.8703, "step": 143500 }, { "epoch": 0.029006028446511047, "grad_norm": 8.77118968963623, "learning_rate": 1.9419887276387367e-05, "loss": 1.9532, "step": 143600 }, { "epoch": 0.029026227630665997, "grad_norm": 10.467899322509766, "learning_rate": 1.9419483292541066e-05, "loss": 1.9149, "step": 143700 }, { "epoch": 0.02904642681482095, "grad_norm": 10.16403579711914, "learning_rate": 1.9419079308694765e-05, "loss": 1.8995, "step": 143800 }, { "epoch": 0.0290666259989759, "grad_norm": 10.59488582611084, "learning_rate": 1.941867532484846e-05, "loss": 1.9435, "step": 143900 }, { "epoch": 0.029086825183130854, "grad_norm": 9.68643856048584, "learning_rate": 1.941827134100216e-05, "loss": 1.9514, "step": 144000 }, { "epoch": 0.029086825183130854, "eval_calculated_loss": 8.685250282287598, "eval_loss": 2.148764133453369, "eval_perplexity": 5915.020737287629, "eval_runtime": 114.9276, "eval_samples_per_second": 8.684, "eval_steps_per_second": 2.175, "step": 144000 }, { "epoch": 0.029107024367285805, "grad_norm": 13.09060287475586, "learning_rate": 1.9417867357155856e-05, "loss": 1.9077, "step": 144100 }, { "epoch": 0.02912722355144076, "grad_norm": 5.942892551422119, "learning_rate": 1.941746337330956e-05, "loss": 1.8473, "step": 144200 }, { "epoch": 0.02914742273559571, "grad_norm": 7.725119590759277, "learning_rate": 1.9417059389463254e-05, "loss": 1.8629, "step": 144300 }, { "epoch": 0.029167621919750662, "grad_norm": 4.719742774963379, "learning_rate": 1.9416655405616953e-05, "loss": 1.8576, "step": 144400 }, { "epoch": 0.029187821103905612, "grad_norm": 6.905200958251953, "learning_rate": 1.9416251421770652e-05, "loss": 1.8678, "step": 144500 }, { "epoch": 0.029208020288060566, "grad_norm": 4.444155216217041, "learning_rate": 1.9415847437924348e-05, "loss": 1.8717, "step": 144600 }, { "epoch": 0.029228219472215516, "grad_norm": 8.802597999572754, "learning_rate": 1.9415443454078047e-05, "loss": 1.9289, "step": 144700 }, { "epoch": 0.02924841865637047, "grad_norm": 8.605241775512695, "learning_rate": 1.9415039470231743e-05, "loss": 1.8176, "step": 144800 }, { "epoch": 0.02926861784052542, "grad_norm": 10.210579872131348, "learning_rate": 1.9414635486385442e-05, "loss": 1.9725, "step": 144900 }, { "epoch": 0.029288817024680374, "grad_norm": 9.188620567321777, "learning_rate": 1.941423150253914e-05, "loss": 1.9403, "step": 145000 }, { "epoch": 0.029288817024680374, "eval_calculated_loss": 8.697754859924316, "eval_loss": 2.1531760692596436, "eval_perplexity": 5989.449956456742, "eval_runtime": 113.4425, "eval_samples_per_second": 8.797, "eval_steps_per_second": 2.204, "step": 145000 }, { "epoch": 0.029309016208835324, "grad_norm": 7.510270118713379, "learning_rate": 1.9413827518692837e-05, "loss": 1.8203, "step": 145100 }, { "epoch": 0.029329215392990278, "grad_norm": 7.921109676361084, "learning_rate": 1.941342353484654e-05, "loss": 1.8826, "step": 145200 }, { "epoch": 0.029349414577145228, "grad_norm": 8.004108428955078, "learning_rate": 1.9413019551000235e-05, "loss": 1.8906, "step": 145300 }, { "epoch": 0.029369613761300182, "grad_norm": 13.9486083984375, "learning_rate": 1.9412615567153934e-05, "loss": 1.9408, "step": 145400 }, { "epoch": 0.029389812945455132, "grad_norm": 5.60446834564209, "learning_rate": 1.9412211583307633e-05, "loss": 1.8637, "step": 145500 }, { "epoch": 0.029410012129610086, "grad_norm": 6.971690654754639, "learning_rate": 1.941180759946133e-05, "loss": 1.8075, "step": 145600 }, { "epoch": 0.029430211313765036, "grad_norm": 8.62724781036377, "learning_rate": 1.941140361561503e-05, "loss": 1.87, "step": 145700 }, { "epoch": 0.02945041049791999, "grad_norm": 8.306726455688477, "learning_rate": 1.9410999631768724e-05, "loss": 1.9485, "step": 145800 }, { "epoch": 0.02947060968207494, "grad_norm": 12.874725341796875, "learning_rate": 1.9410595647922423e-05, "loss": 1.8655, "step": 145900 }, { "epoch": 0.029490808866229894, "grad_norm": 6.215710163116455, "learning_rate": 1.9410191664076122e-05, "loss": 1.8578, "step": 146000 }, { "epoch": 0.029490808866229894, "eval_calculated_loss": 8.624828338623047, "eval_loss": 2.1528937816619873, "eval_perplexity": 5568.206780477289, "eval_runtime": 115.0937, "eval_samples_per_second": 8.671, "eval_steps_per_second": 2.172, "step": 146000 }, { "epoch": 0.029511008050384844, "grad_norm": 5.977174758911133, "learning_rate": 1.9409787680229818e-05, "loss": 1.9653, "step": 146100 }, { "epoch": 0.029531207234539798, "grad_norm": 10.191570281982422, "learning_rate": 1.9409383696383517e-05, "loss": 1.9061, "step": 146200 }, { "epoch": 0.029551406418694748, "grad_norm": 9.741968154907227, "learning_rate": 1.9408979712537216e-05, "loss": 1.8536, "step": 146300 }, { "epoch": 0.0295716056028497, "grad_norm": 11.662590026855469, "learning_rate": 1.9408575728690915e-05, "loss": 1.8906, "step": 146400 }, { "epoch": 0.029591804787004652, "grad_norm": 10.068302154541016, "learning_rate": 1.940817174484461e-05, "loss": 1.9245, "step": 146500 }, { "epoch": 0.029612003971159605, "grad_norm": 9.201475143432617, "learning_rate": 1.940776776099831e-05, "loss": 1.9119, "step": 146600 }, { "epoch": 0.029632203155314556, "grad_norm": 8.920313835144043, "learning_rate": 1.940736377715201e-05, "loss": 1.9449, "step": 146700 }, { "epoch": 0.02965240233946951, "grad_norm": 10.678736686706543, "learning_rate": 1.9406959793305705e-05, "loss": 1.9169, "step": 146800 }, { "epoch": 0.02967260152362446, "grad_norm": 7.605050563812256, "learning_rate": 1.9406555809459404e-05, "loss": 1.8883, "step": 146900 }, { "epoch": 0.029692800707779413, "grad_norm": 12.023672103881836, "learning_rate": 1.9406151825613103e-05, "loss": 1.9427, "step": 147000 }, { "epoch": 0.029692800707779413, "eval_calculated_loss": 8.726242065429688, "eval_loss": 2.155641794204712, "eval_perplexity": 6162.526172678326, "eval_runtime": 115.5658, "eval_samples_per_second": 8.636, "eval_steps_per_second": 2.163, "step": 147000 }, { "epoch": 0.029712999891934364, "grad_norm": 8.463896751403809, "learning_rate": 1.94057478417668e-05, "loss": 1.9558, "step": 147100 }, { "epoch": 0.029733199076089317, "grad_norm": 10.066676139831543, "learning_rate": 1.94053438579205e-05, "loss": 1.8906, "step": 147200 }, { "epoch": 0.029753398260244267, "grad_norm": 8.647018432617188, "learning_rate": 1.9404939874074197e-05, "loss": 1.9822, "step": 147300 }, { "epoch": 0.02977359744439922, "grad_norm": 6.699999809265137, "learning_rate": 1.9404535890227897e-05, "loss": 1.8534, "step": 147400 }, { "epoch": 0.02979379662855417, "grad_norm": 7.748482704162598, "learning_rate": 1.9404131906381592e-05, "loss": 1.9033, "step": 147500 }, { "epoch": 0.029813995812709125, "grad_norm": 12.691795349121094, "learning_rate": 1.940372792253529e-05, "loss": 1.8945, "step": 147600 }, { "epoch": 0.029834194996864075, "grad_norm": 7.813727855682373, "learning_rate": 1.940332393868899e-05, "loss": 1.9474, "step": 147700 }, { "epoch": 0.02985439418101903, "grad_norm": 8.93797779083252, "learning_rate": 1.9402919954842686e-05, "loss": 1.8401, "step": 147800 }, { "epoch": 0.02987459336517398, "grad_norm": 8.463555335998535, "learning_rate": 1.9402515970996385e-05, "loss": 1.9385, "step": 147900 }, { "epoch": 0.029894792549328933, "grad_norm": 13.271930694580078, "learning_rate": 1.9402111987150085e-05, "loss": 1.9204, "step": 148000 }, { "epoch": 0.029894792549328933, "eval_calculated_loss": 8.797550201416016, "eval_loss": 2.17789888381958, "eval_perplexity": 6618.011336209953, "eval_runtime": 113.7619, "eval_samples_per_second": 8.773, "eval_steps_per_second": 2.198, "step": 148000 }, { "epoch": 0.029914991733483883, "grad_norm": 6.308937072753906, "learning_rate": 1.940170800330378e-05, "loss": 1.8863, "step": 148100 }, { "epoch": 0.029935190917638837, "grad_norm": 12.372295379638672, "learning_rate": 1.940130401945748e-05, "loss": 1.948, "step": 148200 }, { "epoch": 0.029955390101793787, "grad_norm": 9.129865646362305, "learning_rate": 1.940090003561118e-05, "loss": 1.913, "step": 148300 }, { "epoch": 0.02997558928594874, "grad_norm": 9.321149826049805, "learning_rate": 1.9400496051764878e-05, "loss": 1.8712, "step": 148400 }, { "epoch": 0.02999578847010369, "grad_norm": 7.758849620819092, "learning_rate": 1.9400092067918573e-05, "loss": 1.8801, "step": 148500 }, { "epoch": 0.030015987654258645, "grad_norm": 6.748317718505859, "learning_rate": 1.9399688084072273e-05, "loss": 1.8332, "step": 148600 }, { "epoch": 0.030036186838413595, "grad_norm": 6.142191410064697, "learning_rate": 1.9399284100225972e-05, "loss": 1.8607, "step": 148700 }, { "epoch": 0.03005638602256855, "grad_norm": 9.979125022888184, "learning_rate": 1.9398880116379667e-05, "loss": 1.9411, "step": 148800 }, { "epoch": 0.0300765852067235, "grad_norm": 9.226143836975098, "learning_rate": 1.9398476132533367e-05, "loss": 1.9497, "step": 148900 }, { "epoch": 0.030096784390878453, "grad_norm": 8.756196022033691, "learning_rate": 1.9398072148687062e-05, "loss": 1.8548, "step": 149000 }, { "epoch": 0.030096784390878453, "eval_calculated_loss": 8.494766235351562, "eval_loss": 2.159644365310669, "eval_perplexity": 4889.113292999442, "eval_runtime": 118.1127, "eval_samples_per_second": 8.45, "eval_steps_per_second": 2.117, "step": 149000 }, { "epoch": 0.030116983575033403, "grad_norm": 10.828848838806152, "learning_rate": 1.939766816484076e-05, "loss": 1.8294, "step": 149100 }, { "epoch": 0.030137182759188356, "grad_norm": 12.095913887023926, "learning_rate": 1.939726418099446e-05, "loss": 1.9345, "step": 149200 }, { "epoch": 0.030157381943343307, "grad_norm": 8.063175201416016, "learning_rate": 1.9396860197148156e-05, "loss": 1.8632, "step": 149300 }, { "epoch": 0.03017758112749826, "grad_norm": 9.511073112487793, "learning_rate": 1.939645621330186e-05, "loss": 1.91, "step": 149400 }, { "epoch": 0.03019778031165321, "grad_norm": 8.740058898925781, "learning_rate": 1.9396052229455555e-05, "loss": 1.8828, "step": 149500 }, { "epoch": 0.030217979495808164, "grad_norm": 9.390707015991211, "learning_rate": 1.9395648245609254e-05, "loss": 1.9401, "step": 149600 }, { "epoch": 0.030238178679963115, "grad_norm": 10.985479354858398, "learning_rate": 1.939524426176295e-05, "loss": 1.9051, "step": 149700 }, { "epoch": 0.030258377864118068, "grad_norm": 7.412342548370361, "learning_rate": 1.939484027791665e-05, "loss": 1.8721, "step": 149800 }, { "epoch": 0.030278577048273022, "grad_norm": 8.795125961303711, "learning_rate": 1.9394436294070348e-05, "loss": 1.9324, "step": 149900 }, { "epoch": 0.030298776232427972, "grad_norm": 13.969797134399414, "learning_rate": 1.9394032310224043e-05, "loss": 1.931, "step": 150000 }, { "epoch": 0.030298776232427972, "eval_calculated_loss": 8.72269344329834, "eval_loss": 2.160431146621704, "eval_perplexity": 6140.696451540683, "eval_runtime": 115.1778, "eval_samples_per_second": 8.665, "eval_steps_per_second": 2.171, "step": 150000 }, { "epoch": 0.030318975416582926, "grad_norm": 9.14804744720459, "learning_rate": 1.9393628326377743e-05, "loss": 1.8717, "step": 150100 }, { "epoch": 0.030339174600737876, "grad_norm": 9.954785346984863, "learning_rate": 1.939322434253144e-05, "loss": 1.9619, "step": 150200 }, { "epoch": 0.03035937378489283, "grad_norm": 11.208419799804688, "learning_rate": 1.9392820358685137e-05, "loss": 1.8412, "step": 150300 }, { "epoch": 0.03037957296904778, "grad_norm": 14.982954978942871, "learning_rate": 1.939241637483884e-05, "loss": 1.8989, "step": 150400 }, { "epoch": 0.030399772153202734, "grad_norm": 9.91010570526123, "learning_rate": 1.9392012390992536e-05, "loss": 1.896, "step": 150500 }, { "epoch": 0.030419971337357684, "grad_norm": 14.04091739654541, "learning_rate": 1.9391608407146235e-05, "loss": 1.879, "step": 150600 }, { "epoch": 0.030440170521512638, "grad_norm": 6.775964260101318, "learning_rate": 1.939120442329993e-05, "loss": 1.9091, "step": 150700 }, { "epoch": 0.030460369705667588, "grad_norm": 6.893741607666016, "learning_rate": 1.939080043945363e-05, "loss": 1.9523, "step": 150800 }, { "epoch": 0.03048056888982254, "grad_norm": 6.70244836807251, "learning_rate": 1.939039645560733e-05, "loss": 1.8796, "step": 150900 }, { "epoch": 0.030500768073977492, "grad_norm": 8.14001178741455, "learning_rate": 1.9389992471761025e-05, "loss": 1.8314, "step": 151000 }, { "epoch": 0.030500768073977492, "eval_calculated_loss": 8.679781913757324, "eval_loss": 2.1619527339935303, "eval_perplexity": 5882.763501640524, "eval_runtime": 117.8432, "eval_samples_per_second": 8.469, "eval_steps_per_second": 2.121, "step": 151000 }, { "epoch": 0.030520967258132446, "grad_norm": 9.769774436950684, "learning_rate": 1.9389588487914724e-05, "loss": 1.8382, "step": 151100 }, { "epoch": 0.030541166442287396, "grad_norm": 11.710790634155273, "learning_rate": 1.9389184504068423e-05, "loss": 1.7876, "step": 151200 }, { "epoch": 0.03056136562644235, "grad_norm": 13.295123100280762, "learning_rate": 1.938878052022212e-05, "loss": 1.9457, "step": 151300 }, { "epoch": 0.0305815648105973, "grad_norm": 8.115666389465332, "learning_rate": 1.9388376536375818e-05, "loss": 1.889, "step": 151400 }, { "epoch": 0.030601763994752253, "grad_norm": 10.925188064575195, "learning_rate": 1.9387972552529517e-05, "loss": 1.9662, "step": 151500 }, { "epoch": 0.030621963178907204, "grad_norm": 8.936375617980957, "learning_rate": 1.9387568568683216e-05, "loss": 1.8902, "step": 151600 }, { "epoch": 0.030642162363062157, "grad_norm": 5.648778915405273, "learning_rate": 1.938716458483691e-05, "loss": 1.8867, "step": 151700 }, { "epoch": 0.030662361547217108, "grad_norm": 9.965012550354004, "learning_rate": 1.938676060099061e-05, "loss": 1.9079, "step": 151800 }, { "epoch": 0.03068256073137206, "grad_norm": 7.122488498687744, "learning_rate": 1.938635661714431e-05, "loss": 1.8919, "step": 151900 }, { "epoch": 0.03070275991552701, "grad_norm": 4.180229663848877, "learning_rate": 1.9385952633298006e-05, "loss": 1.8718, "step": 152000 }, { "epoch": 0.03070275991552701, "eval_calculated_loss": 8.64743709564209, "eval_loss": 2.1618664264678955, "eval_perplexity": 5695.530911283125, "eval_runtime": 115.7672, "eval_samples_per_second": 8.621, "eval_steps_per_second": 2.16, "step": 152000 }, { "epoch": 0.030722959099681965, "grad_norm": 9.065752029418945, "learning_rate": 1.9385548649451705e-05, "loss": 1.9445, "step": 152100 }, { "epoch": 0.030743158283836915, "grad_norm": 11.440138816833496, "learning_rate": 1.93851446656054e-05, "loss": 1.9264, "step": 152200 }, { "epoch": 0.03076335746799187, "grad_norm": 6.316174507141113, "learning_rate": 1.93847406817591e-05, "loss": 1.7717, "step": 152300 }, { "epoch": 0.03078355665214682, "grad_norm": 6.4864068031311035, "learning_rate": 1.93843366979128e-05, "loss": 1.8319, "step": 152400 }, { "epoch": 0.030803755836301773, "grad_norm": 7.695703506469727, "learning_rate": 1.9383932714066498e-05, "loss": 1.8096, "step": 152500 }, { "epoch": 0.030823955020456723, "grad_norm": 10.564781188964844, "learning_rate": 1.9383528730220197e-05, "loss": 1.8795, "step": 152600 }, { "epoch": 0.030844154204611677, "grad_norm": 9.183577537536621, "learning_rate": 1.9383124746373893e-05, "loss": 1.9513, "step": 152700 }, { "epoch": 0.030864353388766627, "grad_norm": 7.75216817855835, "learning_rate": 1.9382720762527592e-05, "loss": 1.873, "step": 152800 }, { "epoch": 0.03088455257292158, "grad_norm": 8.821394920349121, "learning_rate": 1.938231677868129e-05, "loss": 1.9741, "step": 152900 }, { "epoch": 0.03090475175707653, "grad_norm": 5.387707233428955, "learning_rate": 1.9381912794834987e-05, "loss": 1.8837, "step": 153000 }, { "epoch": 0.03090475175707653, "eval_calculated_loss": 8.73408317565918, "eval_loss": 2.1544034481048584, "eval_perplexity": 6211.037161144503, "eval_runtime": 114.5955, "eval_samples_per_second": 8.709, "eval_steps_per_second": 2.182, "step": 153000 }, { "epoch": 0.030924950941231485, "grad_norm": 7.06566047668457, "learning_rate": 1.9381508810988686e-05, "loss": 1.9269, "step": 153100 }, { "epoch": 0.030945150125386435, "grad_norm": 9.185441970825195, "learning_rate": 1.938110482714238e-05, "loss": 1.8832, "step": 153200 }, { "epoch": 0.03096534930954139, "grad_norm": 9.615150451660156, "learning_rate": 1.938070084329608e-05, "loss": 1.87, "step": 153300 }, { "epoch": 0.03098554849369634, "grad_norm": 6.875046730041504, "learning_rate": 1.938029685944978e-05, "loss": 1.8992, "step": 153400 }, { "epoch": 0.031005747677851293, "grad_norm": 9.858625411987305, "learning_rate": 1.937989287560348e-05, "loss": 1.907, "step": 153500 }, { "epoch": 0.031025946862006243, "grad_norm": 8.318817138671875, "learning_rate": 1.9379488891757178e-05, "loss": 1.8644, "step": 153600 }, { "epoch": 0.031046146046161197, "grad_norm": 10.200340270996094, "learning_rate": 1.9379084907910874e-05, "loss": 1.965, "step": 153700 }, { "epoch": 0.031066345230316147, "grad_norm": 9.276193618774414, "learning_rate": 1.9378680924064573e-05, "loss": 1.9173, "step": 153800 }, { "epoch": 0.0310865444144711, "grad_norm": 21.395109176635742, "learning_rate": 1.937827694021827e-05, "loss": 1.8548, "step": 153900 }, { "epoch": 0.03110674359862605, "grad_norm": 10.485021591186523, "learning_rate": 1.9377872956371968e-05, "loss": 1.8836, "step": 154000 }, { "epoch": 0.03110674359862605, "eval_calculated_loss": 8.873324394226074, "eval_loss": 2.160630941390991, "eval_perplexity": 7138.974333368017, "eval_runtime": 114.7194, "eval_samples_per_second": 8.699, "eval_steps_per_second": 2.179, "step": 154000 }, { "epoch": 0.031126942782781004, "grad_norm": 12.467246055603027, "learning_rate": 1.9377468972525667e-05, "loss": 1.8733, "step": 154100 }, { "epoch": 0.031147141966935955, "grad_norm": 12.886040687561035, "learning_rate": 1.9377064988679363e-05, "loss": 1.9088, "step": 154200 }, { "epoch": 0.03116734115109091, "grad_norm": 14.070168495178223, "learning_rate": 1.9376661004833062e-05, "loss": 1.9112, "step": 154300 }, { "epoch": 0.03118754033524586, "grad_norm": 9.309741020202637, "learning_rate": 1.937625702098676e-05, "loss": 1.8846, "step": 154400 }, { "epoch": 0.031207739519400812, "grad_norm": 12.814202308654785, "learning_rate": 1.9375853037140457e-05, "loss": 1.8727, "step": 154500 }, { "epoch": 0.031227938703555762, "grad_norm": 12.505725860595703, "learning_rate": 1.9375449053294156e-05, "loss": 1.8935, "step": 154600 }, { "epoch": 0.031248137887710716, "grad_norm": 9.069246292114258, "learning_rate": 1.9375045069447855e-05, "loss": 1.8639, "step": 154700 }, { "epoch": 0.031268337071865666, "grad_norm": 8.464688301086426, "learning_rate": 1.9374641085601554e-05, "loss": 1.8742, "step": 154800 }, { "epoch": 0.03128853625602062, "grad_norm": 11.329187393188477, "learning_rate": 1.937423710175525e-05, "loss": 1.9301, "step": 154900 }, { "epoch": 0.031308735440175574, "grad_norm": 9.013127326965332, "learning_rate": 1.937383311790895e-05, "loss": 1.8682, "step": 155000 }, { "epoch": 0.031308735440175574, "eval_calculated_loss": 8.595398902893066, "eval_loss": 2.1539549827575684, "eval_perplexity": 5406.725404427241, "eval_runtime": 114.6095, "eval_samples_per_second": 8.708, "eval_steps_per_second": 2.181, "step": 155000 }, { "epoch": 0.03132893462433052, "grad_norm": 9.004318237304688, "learning_rate": 1.9373429134062648e-05, "loss": 1.8953, "step": 155100 }, { "epoch": 0.031349133808485474, "grad_norm": 7.233293056488037, "learning_rate": 1.9373025150216344e-05, "loss": 1.8827, "step": 155200 }, { "epoch": 0.03136933299264043, "grad_norm": 6.159506797790527, "learning_rate": 1.9372621166370043e-05, "loss": 1.8842, "step": 155300 }, { "epoch": 0.03138953217679538, "grad_norm": 9.899197578430176, "learning_rate": 1.937221718252374e-05, "loss": 1.9043, "step": 155400 }, { "epoch": 0.03140973136095033, "grad_norm": 13.766695022583008, "learning_rate": 1.9371813198677438e-05, "loss": 1.8923, "step": 155500 }, { "epoch": 0.03142993054510528, "grad_norm": 13.843588829040527, "learning_rate": 1.9371409214831137e-05, "loss": 1.9365, "step": 155600 }, { "epoch": 0.031450129729260236, "grad_norm": 9.942315101623535, "learning_rate": 1.9371005230984836e-05, "loss": 1.8652, "step": 155700 }, { "epoch": 0.03147032891341519, "grad_norm": 13.639331817626953, "learning_rate": 1.9370601247138535e-05, "loss": 1.9018, "step": 155800 }, { "epoch": 0.031490528097570136, "grad_norm": 12.41340446472168, "learning_rate": 1.937019726329223e-05, "loss": 1.9296, "step": 155900 }, { "epoch": 0.03151072728172509, "grad_norm": 10.437644004821777, "learning_rate": 1.936979327944593e-05, "loss": 2.0237, "step": 156000 }, { "epoch": 0.03151072728172509, "eval_calculated_loss": 8.579217910766602, "eval_loss": 2.15397310256958, "eval_perplexity": 5319.943227555375, "eval_runtime": 114.7118, "eval_samples_per_second": 8.7, "eval_steps_per_second": 2.179, "step": 156000 }, { "epoch": 0.031530926465880044, "grad_norm": 7.995079517364502, "learning_rate": 1.936938929559963e-05, "loss": 1.8808, "step": 156100 }, { "epoch": 0.031551125650035, "grad_norm": 10.87572193145752, "learning_rate": 1.9368985311753325e-05, "loss": 1.8568, "step": 156200 }, { "epoch": 0.031571324834189944, "grad_norm": 10.02473258972168, "learning_rate": 1.9368581327907024e-05, "loss": 1.9465, "step": 156300 }, { "epoch": 0.0315915240183449, "grad_norm": 8.775519371032715, "learning_rate": 1.936817734406072e-05, "loss": 1.873, "step": 156400 }, { "epoch": 0.03161172320249985, "grad_norm": 10.018585205078125, "learning_rate": 1.936777336021442e-05, "loss": 1.9304, "step": 156500 }, { "epoch": 0.031631922386654805, "grad_norm": 8.910362243652344, "learning_rate": 1.9367369376368118e-05, "loss": 1.8815, "step": 156600 }, { "epoch": 0.03165212157080975, "grad_norm": 13.162079811096191, "learning_rate": 1.9366965392521817e-05, "loss": 1.8206, "step": 156700 }, { "epoch": 0.031672320754964706, "grad_norm": 9.723376274108887, "learning_rate": 1.9366561408675516e-05, "loss": 1.8515, "step": 156800 }, { "epoch": 0.03169251993911966, "grad_norm": 9.098092079162598, "learning_rate": 1.9366157424829212e-05, "loss": 1.9469, "step": 156900 }, { "epoch": 0.03171271912327461, "grad_norm": 15.852664947509766, "learning_rate": 1.936575344098291e-05, "loss": 1.9423, "step": 157000 }, { "epoch": 0.03171271912327461, "eval_calculated_loss": 8.762299537658691, "eval_loss": 2.1746773719787598, "eval_perplexity": 6388.785953278626, "eval_runtime": 114.4019, "eval_samples_per_second": 8.724, "eval_steps_per_second": 2.185, "step": 157000 }, { "epoch": 0.03173291830742956, "grad_norm": 7.055413246154785, "learning_rate": 1.9365349457136607e-05, "loss": 1.8973, "step": 157100 }, { "epoch": 0.031753117491584514, "grad_norm": 8.945793151855469, "learning_rate": 1.9364945473290306e-05, "loss": 1.9511, "step": 157200 }, { "epoch": 0.03177331667573947, "grad_norm": 5.916433811187744, "learning_rate": 1.9364541489444005e-05, "loss": 1.933, "step": 157300 }, { "epoch": 0.03179351585989442, "grad_norm": 14.776032447814941, "learning_rate": 1.93641375055977e-05, "loss": 1.9646, "step": 157400 }, { "epoch": 0.03181371504404937, "grad_norm": 4.945358753204346, "learning_rate": 1.93637335217514e-05, "loss": 1.9975, "step": 157500 }, { "epoch": 0.03183391422820432, "grad_norm": 10.146398544311523, "learning_rate": 1.93633295379051e-05, "loss": 1.8347, "step": 157600 }, { "epoch": 0.031854113412359275, "grad_norm": 7.083983421325684, "learning_rate": 1.93629255540588e-05, "loss": 1.8915, "step": 157700 }, { "epoch": 0.03187431259651423, "grad_norm": 18.269588470458984, "learning_rate": 1.9362521570212494e-05, "loss": 1.8354, "step": 157800 }, { "epoch": 0.031894511780669176, "grad_norm": 9.938824653625488, "learning_rate": 1.9362117586366193e-05, "loss": 1.8355, "step": 157900 }, { "epoch": 0.03191471096482413, "grad_norm": 11.448747634887695, "learning_rate": 1.9361713602519892e-05, "loss": 1.9026, "step": 158000 }, { "epoch": 0.03191471096482413, "eval_calculated_loss": 8.94283676147461, "eval_loss": 2.1569290161132812, "eval_perplexity": 7652.875675458709, "eval_runtime": 115.3813, "eval_samples_per_second": 8.65, "eval_steps_per_second": 2.167, "step": 158000 }, { "epoch": 0.03193491014897908, "grad_norm": 9.473236083984375, "learning_rate": 1.9361309618673588e-05, "loss": 1.9207, "step": 158100 }, { "epoch": 0.03195510933313404, "grad_norm": 11.579065322875977, "learning_rate": 1.9360905634827287e-05, "loss": 1.9285, "step": 158200 }, { "epoch": 0.03197530851728898, "grad_norm": 7.025585174560547, "learning_rate": 1.9360501650980986e-05, "loss": 1.918, "step": 158300 }, { "epoch": 0.03199550770144394, "grad_norm": 9.150583267211914, "learning_rate": 1.9360097667134682e-05, "loss": 1.8517, "step": 158400 }, { "epoch": 0.03201570688559889, "grad_norm": 2.9125454425811768, "learning_rate": 1.935969368328838e-05, "loss": 1.9574, "step": 158500 }, { "epoch": 0.032035906069753844, "grad_norm": 5.268874168395996, "learning_rate": 1.935928969944208e-05, "loss": 1.8509, "step": 158600 }, { "epoch": 0.03205610525390879, "grad_norm": 7.290364742279053, "learning_rate": 1.935888571559578e-05, "loss": 1.8577, "step": 158700 }, { "epoch": 0.032076304438063745, "grad_norm": 9.121620178222656, "learning_rate": 1.9358481731749475e-05, "loss": 1.973, "step": 158800 }, { "epoch": 0.0320965036222187, "grad_norm": 7.642507553100586, "learning_rate": 1.9358077747903174e-05, "loss": 1.8417, "step": 158900 }, { "epoch": 0.03211670280637365, "grad_norm": 12.389857292175293, "learning_rate": 1.9357673764056873e-05, "loss": 1.8881, "step": 159000 }, { "epoch": 0.03211670280637365, "eval_calculated_loss": 8.859965324401855, "eval_loss": 2.1483519077301025, "eval_perplexity": 7044.238477147658, "eval_runtime": 114.6648, "eval_samples_per_second": 8.704, "eval_steps_per_second": 2.18, "step": 159000 }, { "epoch": 0.0321369019905286, "grad_norm": 8.215250968933105, "learning_rate": 1.935726978021057e-05, "loss": 1.9051, "step": 159100 }, { "epoch": 0.03215710117468355, "grad_norm": 9.273605346679688, "learning_rate": 1.935686579636427e-05, "loss": 1.7959, "step": 159200 }, { "epoch": 0.032177300358838506, "grad_norm": 10.729673385620117, "learning_rate": 1.9356461812517967e-05, "loss": 1.8876, "step": 159300 }, { "epoch": 0.03219749954299346, "grad_norm": 8.111745834350586, "learning_rate": 1.9356057828671663e-05, "loss": 1.8642, "step": 159400 }, { "epoch": 0.032217698727148414, "grad_norm": 13.467341423034668, "learning_rate": 1.9355653844825362e-05, "loss": 1.9488, "step": 159500 }, { "epoch": 0.03223789791130336, "grad_norm": 8.367358207702637, "learning_rate": 1.9355249860979058e-05, "loss": 1.856, "step": 159600 }, { "epoch": 0.032258097095458314, "grad_norm": 8.35740852355957, "learning_rate": 1.935484587713276e-05, "loss": 1.8548, "step": 159700 }, { "epoch": 0.03227829627961327, "grad_norm": 6.894195556640625, "learning_rate": 1.9354441893286456e-05, "loss": 1.8979, "step": 159800 }, { "epoch": 0.03229849546376822, "grad_norm": 5.82987117767334, "learning_rate": 1.9354037909440155e-05, "loss": 1.9008, "step": 159900 }, { "epoch": 0.03231869464792317, "grad_norm": 6.470695495605469, "learning_rate": 1.9353633925593855e-05, "loss": 1.8758, "step": 160000 }, { "epoch": 0.03231869464792317, "eval_calculated_loss": 8.685526847839355, "eval_loss": 2.1534032821655273, "eval_perplexity": 5916.656854497897, "eval_runtime": 116.2207, "eval_samples_per_second": 8.587, "eval_steps_per_second": 2.151, "step": 160000 }, { "epoch": 0.03233889383207812, "grad_norm": 9.612197875976562, "learning_rate": 1.935322994174755e-05, "loss": 1.8756, "step": 160100 }, { "epoch": 0.032359093016233076, "grad_norm": 8.769372940063477, "learning_rate": 1.935282595790125e-05, "loss": 1.8654, "step": 160200 }, { "epoch": 0.03237929220038803, "grad_norm": 8.882165908813477, "learning_rate": 1.9352421974054945e-05, "loss": 1.8816, "step": 160300 }, { "epoch": 0.032399491384542976, "grad_norm": 12.410551071166992, "learning_rate": 1.9352017990208644e-05, "loss": 1.891, "step": 160400 }, { "epoch": 0.03241969056869793, "grad_norm": 11.886222839355469, "learning_rate": 1.9351614006362343e-05, "loss": 1.89, "step": 160500 }, { "epoch": 0.032439889752852884, "grad_norm": 7.942470073699951, "learning_rate": 1.935121002251604e-05, "loss": 1.8714, "step": 160600 }, { "epoch": 0.03246008893700784, "grad_norm": 11.170023918151855, "learning_rate": 1.935080603866974e-05, "loss": 1.8428, "step": 160700 }, { "epoch": 0.032480288121162784, "grad_norm": 10.141267776489258, "learning_rate": 1.9350402054823437e-05, "loss": 1.9378, "step": 160800 }, { "epoch": 0.03250048730531774, "grad_norm": 13.674053192138672, "learning_rate": 1.9349998070977137e-05, "loss": 1.9136, "step": 160900 }, { "epoch": 0.03252068648947269, "grad_norm": 3.9234325885772705, "learning_rate": 1.9349594087130836e-05, "loss": 1.8095, "step": 161000 }, { "epoch": 0.03252068648947269, "eval_calculated_loss": 8.665081024169922, "eval_loss": 2.154268503189087, "eval_perplexity": 5796.914221441247, "eval_runtime": 115.709, "eval_samples_per_second": 8.625, "eval_steps_per_second": 2.161, "step": 161000 }, { "epoch": 0.032540885673627645, "grad_norm": 7.824481010437012, "learning_rate": 1.934919010328453e-05, "loss": 1.862, "step": 161100 }, { "epoch": 0.03256108485778259, "grad_norm": 8.096741676330566, "learning_rate": 1.934878611943823e-05, "loss": 1.9294, "step": 161200 }, { "epoch": 0.032581284041937546, "grad_norm": 10.548776626586914, "learning_rate": 1.9348382135591926e-05, "loss": 1.9491, "step": 161300 }, { "epoch": 0.0326014832260925, "grad_norm": 5.3527679443359375, "learning_rate": 1.9347978151745625e-05, "loss": 1.9164, "step": 161400 }, { "epoch": 0.03262168241024745, "grad_norm": 11.40318775177002, "learning_rate": 1.9347574167899325e-05, "loss": 1.9181, "step": 161500 }, { "epoch": 0.0326418815944024, "grad_norm": 12.404004096984863, "learning_rate": 1.934717018405302e-05, "loss": 1.9248, "step": 161600 }, { "epoch": 0.032662080778557354, "grad_norm": 6.889045238494873, "learning_rate": 1.934676620020672e-05, "loss": 1.9622, "step": 161700 }, { "epoch": 0.03268227996271231, "grad_norm": 10.463480949401855, "learning_rate": 1.934636221636042e-05, "loss": 1.9418, "step": 161800 }, { "epoch": 0.03270247914686726, "grad_norm": 9.15534496307373, "learning_rate": 1.9345958232514118e-05, "loss": 1.8084, "step": 161900 }, { "epoch": 0.03272267833102221, "grad_norm": 7.19870662689209, "learning_rate": 1.9345554248667813e-05, "loss": 1.9679, "step": 162000 }, { "epoch": 0.03272267833102221, "eval_calculated_loss": 8.932210922241211, "eval_loss": 2.1365087032318115, "eval_perplexity": 7571.987959863217, "eval_runtime": 115.0248, "eval_samples_per_second": 8.676, "eval_steps_per_second": 2.173, "step": 162000 }, { "epoch": 0.03274287751517716, "grad_norm": 11.900522232055664, "learning_rate": 1.9345150264821513e-05, "loss": 1.864, "step": 162100 }, { "epoch": 0.032763076699332115, "grad_norm": 11.452325820922852, "learning_rate": 1.9344746280975212e-05, "loss": 1.9353, "step": 162200 }, { "epoch": 0.03278327588348707, "grad_norm": 13.345973014831543, "learning_rate": 1.9344342297128907e-05, "loss": 1.9584, "step": 162300 }, { "epoch": 0.032803475067642016, "grad_norm": 10.451742172241211, "learning_rate": 1.9343938313282607e-05, "loss": 1.872, "step": 162400 }, { "epoch": 0.03282367425179697, "grad_norm": 9.205824851989746, "learning_rate": 1.9343534329436306e-05, "loss": 1.9162, "step": 162500 }, { "epoch": 0.03284387343595192, "grad_norm": 11.009294509887695, "learning_rate": 1.934313034559e-05, "loss": 1.8778, "step": 162600 }, { "epoch": 0.03286407262010688, "grad_norm": 8.80064868927002, "learning_rate": 1.93427263617437e-05, "loss": 1.8888, "step": 162700 }, { "epoch": 0.03288427180426182, "grad_norm": 17.00140953063965, "learning_rate": 1.93423223778974e-05, "loss": 1.8952, "step": 162800 }, { "epoch": 0.03290447098841678, "grad_norm": 17.204851150512695, "learning_rate": 1.93419183940511e-05, "loss": 1.928, "step": 162900 }, { "epoch": 0.03292467017257173, "grad_norm": 7.314293384552002, "learning_rate": 1.9341514410204795e-05, "loss": 1.8693, "step": 163000 }, { "epoch": 0.03292467017257173, "eval_calculated_loss": 8.755518913269043, "eval_loss": 2.1441190242767334, "eval_perplexity": 6345.612532214269, "eval_runtime": 114.9804, "eval_samples_per_second": 8.68, "eval_steps_per_second": 2.174, "step": 163000 }, { "epoch": 0.032944869356726685, "grad_norm": 11.337770462036133, "learning_rate": 1.9341110426358494e-05, "loss": 1.8841, "step": 163100 }, { "epoch": 0.03296506854088163, "grad_norm": 10.024971961975098, "learning_rate": 1.9340706442512193e-05, "loss": 1.955, "step": 163200 }, { "epoch": 0.032985267725036585, "grad_norm": 8.325066566467285, "learning_rate": 1.934030245866589e-05, "loss": 1.9125, "step": 163300 }, { "epoch": 0.03300546690919154, "grad_norm": 10.64697551727295, "learning_rate": 1.9339898474819588e-05, "loss": 1.9121, "step": 163400 }, { "epoch": 0.03302566609334649, "grad_norm": 8.242648124694824, "learning_rate": 1.9339494490973287e-05, "loss": 1.8273, "step": 163500 }, { "epoch": 0.03304586527750144, "grad_norm": 9.067059516906738, "learning_rate": 1.9339090507126983e-05, "loss": 1.835, "step": 163600 }, { "epoch": 0.03306606446165639, "grad_norm": 11.433392524719238, "learning_rate": 1.933868652328068e-05, "loss": 1.8575, "step": 163700 }, { "epoch": 0.03308626364581135, "grad_norm": 13.134499549865723, "learning_rate": 1.9338282539434377e-05, "loss": 1.8886, "step": 163800 }, { "epoch": 0.0331064628299663, "grad_norm": 12.654879570007324, "learning_rate": 1.933787855558808e-05, "loss": 1.9111, "step": 163900 }, { "epoch": 0.03312666201412125, "grad_norm": 8.855031967163086, "learning_rate": 1.9337474571741776e-05, "loss": 1.8448, "step": 164000 }, { "epoch": 0.03312666201412125, "eval_calculated_loss": 8.815940856933594, "eval_loss": 2.148885726928711, "eval_perplexity": 6740.846954613696, "eval_runtime": 116.135, "eval_samples_per_second": 8.593, "eval_steps_per_second": 2.153, "step": 164000 }, { "epoch": 0.0331468611982762, "grad_norm": 14.212641716003418, "learning_rate": 1.9337070587895475e-05, "loss": 1.8801, "step": 164100 }, { "epoch": 0.033167060382431154, "grad_norm": 9.067956924438477, "learning_rate": 1.9336666604049174e-05, "loss": 1.8292, "step": 164200 }, { "epoch": 0.03318725956658611, "grad_norm": 8.256628036499023, "learning_rate": 1.933626262020287e-05, "loss": 1.9025, "step": 164300 }, { "epoch": 0.033207458750741055, "grad_norm": 10.68395709991455, "learning_rate": 1.933585863635657e-05, "loss": 1.8584, "step": 164400 }, { "epoch": 0.03322765793489601, "grad_norm": 16.04056167602539, "learning_rate": 1.9335454652510265e-05, "loss": 1.7978, "step": 164500 }, { "epoch": 0.03324785711905096, "grad_norm": 9.976346015930176, "learning_rate": 1.9335050668663964e-05, "loss": 1.9062, "step": 164600 }, { "epoch": 0.033268056303205916, "grad_norm": 9.664307594299316, "learning_rate": 1.9334646684817663e-05, "loss": 1.8717, "step": 164700 }, { "epoch": 0.03328825548736086, "grad_norm": 7.81104850769043, "learning_rate": 1.933424270097136e-05, "loss": 1.833, "step": 164800 }, { "epoch": 0.033308454671515816, "grad_norm": 9.60290813446045, "learning_rate": 1.933383871712506e-05, "loss": 1.8397, "step": 164900 }, { "epoch": 0.03332865385567077, "grad_norm": 5.701851844787598, "learning_rate": 1.9333434733278757e-05, "loss": 1.9358, "step": 165000 }, { "epoch": 0.03332865385567077, "eval_calculated_loss": 8.608086585998535, "eval_loss": 2.144423723220825, "eval_perplexity": 5475.761249360762, "eval_runtime": 113.2366, "eval_samples_per_second": 8.813, "eval_steps_per_second": 2.208, "step": 165000 }, { "epoch": 0.033348853039825724, "grad_norm": 10.740530014038086, "learning_rate": 1.9333030749432456e-05, "loss": 1.9002, "step": 165100 }, { "epoch": 0.03336905222398067, "grad_norm": 16.20566177368164, "learning_rate": 1.933262676558615e-05, "loss": 1.8878, "step": 165200 }, { "epoch": 0.033389251408135624, "grad_norm": 6.594677925109863, "learning_rate": 1.933222278173985e-05, "loss": 1.847, "step": 165300 }, { "epoch": 0.03340945059229058, "grad_norm": 13.0748872756958, "learning_rate": 1.933181879789355e-05, "loss": 1.8032, "step": 165400 }, { "epoch": 0.03342964977644553, "grad_norm": 7.844029426574707, "learning_rate": 1.9331414814047246e-05, "loss": 1.834, "step": 165500 }, { "epoch": 0.03344984896060048, "grad_norm": 12.126371383666992, "learning_rate": 1.9331010830200945e-05, "loss": 1.8054, "step": 165600 }, { "epoch": 0.03347004814475543, "grad_norm": 11.347332954406738, "learning_rate": 1.9330606846354644e-05, "loss": 1.8422, "step": 165700 }, { "epoch": 0.033490247328910386, "grad_norm": 9.47719669342041, "learning_rate": 1.933020286250834e-05, "loss": 1.8464, "step": 165800 }, { "epoch": 0.03351044651306534, "grad_norm": 4.736513614654541, "learning_rate": 1.932979887866204e-05, "loss": 1.7991, "step": 165900 }, { "epoch": 0.033530645697220286, "grad_norm": 7.199169158935547, "learning_rate": 1.9329394894815738e-05, "loss": 1.853, "step": 166000 }, { "epoch": 0.033530645697220286, "eval_calculated_loss": 8.682769775390625, "eval_loss": 2.146501064300537, "eval_perplexity": 5900.366669824525, "eval_runtime": 115.5171, "eval_samples_per_second": 8.639, "eval_steps_per_second": 2.164, "step": 166000 }, { "epoch": 0.03355084488137524, "grad_norm": 8.460418701171875, "learning_rate": 1.9328990910969437e-05, "loss": 1.9044, "step": 166100 }, { "epoch": 0.033571044065530194, "grad_norm": 11.308565139770508, "learning_rate": 1.9328586927123133e-05, "loss": 1.8471, "step": 166200 }, { "epoch": 0.03359124324968515, "grad_norm": 10.887795448303223, "learning_rate": 1.9328182943276832e-05, "loss": 1.9511, "step": 166300 }, { "epoch": 0.033611442433840094, "grad_norm": 9.305035591125488, "learning_rate": 1.932777895943053e-05, "loss": 1.8547, "step": 166400 }, { "epoch": 0.03363164161799505, "grad_norm": 12.884174346923828, "learning_rate": 1.9327374975584227e-05, "loss": 1.893, "step": 166500 }, { "epoch": 0.03365184080215, "grad_norm": 12.50661849975586, "learning_rate": 1.9326970991737926e-05, "loss": 1.9265, "step": 166600 }, { "epoch": 0.033672039986304955, "grad_norm": 8.560576438903809, "learning_rate": 1.9326567007891625e-05, "loss": 1.9477, "step": 166700 }, { "epoch": 0.0336922391704599, "grad_norm": 8.9891939163208, "learning_rate": 1.932616302404532e-05, "loss": 1.8571, "step": 166800 }, { "epoch": 0.033712438354614856, "grad_norm": 5.419243335723877, "learning_rate": 1.932575904019902e-05, "loss": 1.8653, "step": 166900 }, { "epoch": 0.03373263753876981, "grad_norm": 6.483798027038574, "learning_rate": 1.932535505635272e-05, "loss": 1.9061, "step": 167000 }, { "epoch": 0.03373263753876981, "eval_calculated_loss": 8.685222625732422, "eval_loss": 2.151376485824585, "eval_perplexity": 5914.857150452378, "eval_runtime": 115.1078, "eval_samples_per_second": 8.67, "eval_steps_per_second": 2.172, "step": 167000 }, { "epoch": 0.03375283672292476, "grad_norm": 12.501021385192871, "learning_rate": 1.9324951072506418e-05, "loss": 1.8474, "step": 167100 }, { "epoch": 0.03377303590707971, "grad_norm": 9.922167778015137, "learning_rate": 1.9324547088660114e-05, "loss": 1.9315, "step": 167200 }, { "epoch": 0.033793235091234664, "grad_norm": 7.023649215698242, "learning_rate": 1.9324143104813813e-05, "loss": 1.918, "step": 167300 }, { "epoch": 0.03381343427538962, "grad_norm": 9.719534873962402, "learning_rate": 1.9323739120967512e-05, "loss": 1.8694, "step": 167400 }, { "epoch": 0.03383363345954457, "grad_norm": 10.926530838012695, "learning_rate": 1.9323335137121208e-05, "loss": 1.9157, "step": 167500 }, { "epoch": 0.03385383264369952, "grad_norm": 13.49268913269043, "learning_rate": 1.9322931153274907e-05, "loss": 1.8588, "step": 167600 }, { "epoch": 0.03387403182785447, "grad_norm": 13.635334968566895, "learning_rate": 1.9322527169428603e-05, "loss": 1.9154, "step": 167700 }, { "epoch": 0.033894231012009425, "grad_norm": 7.895483493804932, "learning_rate": 1.9322123185582302e-05, "loss": 1.8628, "step": 167800 }, { "epoch": 0.03391443019616438, "grad_norm": 10.83584213256836, "learning_rate": 1.9321719201736e-05, "loss": 1.8781, "step": 167900 }, { "epoch": 0.033934629380319326, "grad_norm": 13.697803497314453, "learning_rate": 1.93213152178897e-05, "loss": 1.8303, "step": 168000 }, { "epoch": 0.033934629380319326, "eval_calculated_loss": 8.763497352600098, "eval_loss": 2.159778118133545, "eval_perplexity": 6396.443121570687, "eval_runtime": 114.3049, "eval_samples_per_second": 8.731, "eval_steps_per_second": 2.187, "step": 168000 }, { "epoch": 0.03395482856447428, "grad_norm": 10.530407905578613, "learning_rate": 1.93209112340434e-05, "loss": 1.9157, "step": 168100 }, { "epoch": 0.03397502774862923, "grad_norm": 8.003859519958496, "learning_rate": 1.9320507250197095e-05, "loss": 1.9176, "step": 168200 }, { "epoch": 0.03399522693278419, "grad_norm": 12.555131912231445, "learning_rate": 1.9320103266350794e-05, "loss": 1.885, "step": 168300 }, { "epoch": 0.03401542611693913, "grad_norm": 7.6587934494018555, "learning_rate": 1.931969928250449e-05, "loss": 1.9424, "step": 168400 }, { "epoch": 0.03403562530109409, "grad_norm": 10.83288860321045, "learning_rate": 1.931929529865819e-05, "loss": 1.9986, "step": 168500 }, { "epoch": 0.03405582448524904, "grad_norm": 12.39575481414795, "learning_rate": 1.9318891314811888e-05, "loss": 1.8952, "step": 168600 }, { "epoch": 0.034076023669403994, "grad_norm": 5.927823066711426, "learning_rate": 1.9318487330965584e-05, "loss": 1.8377, "step": 168700 }, { "epoch": 0.03409622285355895, "grad_norm": 12.271119117736816, "learning_rate": 1.9318083347119283e-05, "loss": 1.8859, "step": 168800 }, { "epoch": 0.034116422037713895, "grad_norm": 9.380400657653809, "learning_rate": 1.9317679363272982e-05, "loss": 1.9309, "step": 168900 }, { "epoch": 0.03413662122186885, "grad_norm": 12.272589683532715, "learning_rate": 1.9317275379426678e-05, "loss": 1.9127, "step": 169000 }, { "epoch": 0.03413662122186885, "eval_calculated_loss": 8.628913879394531, "eval_loss": 2.1643459796905518, "eval_perplexity": 5591.002450915387, "eval_runtime": 115.9913, "eval_samples_per_second": 8.604, "eval_steps_per_second": 2.155, "step": 169000 }, { "epoch": 0.0341568204060238, "grad_norm": 11.741931915283203, "learning_rate": 1.931687139558038e-05, "loss": 1.8735, "step": 169100 }, { "epoch": 0.034177019590178756, "grad_norm": 7.034543037414551, "learning_rate": 1.9316467411734076e-05, "loss": 1.8434, "step": 169200 }, { "epoch": 0.0341972187743337, "grad_norm": 9.554359436035156, "learning_rate": 1.9316063427887775e-05, "loss": 1.9486, "step": 169300 }, { "epoch": 0.034217417958488656, "grad_norm": 3.5753724575042725, "learning_rate": 1.931565944404147e-05, "loss": 1.9309, "step": 169400 }, { "epoch": 0.03423761714264361, "grad_norm": 9.838029861450195, "learning_rate": 1.931525546019517e-05, "loss": 1.9173, "step": 169500 }, { "epoch": 0.034257816326798564, "grad_norm": 12.489981651306152, "learning_rate": 1.931485147634887e-05, "loss": 1.9649, "step": 169600 }, { "epoch": 0.03427801551095351, "grad_norm": 4.973383903503418, "learning_rate": 1.9314447492502565e-05, "loss": 1.8772, "step": 169700 }, { "epoch": 0.034298214695108464, "grad_norm": 7.560512542724609, "learning_rate": 1.9314043508656264e-05, "loss": 1.8021, "step": 169800 }, { "epoch": 0.03431841387926342, "grad_norm": 9.011916160583496, "learning_rate": 1.9313639524809963e-05, "loss": 1.9629, "step": 169900 }, { "epoch": 0.03433861306341837, "grad_norm": 8.34902572631836, "learning_rate": 1.931323554096366e-05, "loss": 1.868, "step": 170000 }, { "epoch": 0.03433861306341837, "eval_calculated_loss": 8.622265815734863, "eval_loss": 2.148010492324829, "eval_perplexity": 5553.956389430748, "eval_runtime": 116.6967, "eval_samples_per_second": 8.552, "eval_steps_per_second": 2.142, "step": 170000 }, { "epoch": 0.03435881224757332, "grad_norm": 5.6360883712768555, "learning_rate": 1.9312831557117358e-05, "loss": 1.9599, "step": 170100 }, { "epoch": 0.03437901143172827, "grad_norm": 7.907002925872803, "learning_rate": 1.9312427573271057e-05, "loss": 1.8621, "step": 170200 }, { "epoch": 0.034399210615883226, "grad_norm": 8.626396179199219, "learning_rate": 1.9312023589424756e-05, "loss": 1.8709, "step": 170300 }, { "epoch": 0.03441940980003818, "grad_norm": 11.600446701049805, "learning_rate": 1.9311619605578452e-05, "loss": 1.8611, "step": 170400 }, { "epoch": 0.034439608984193126, "grad_norm": 9.432430267333984, "learning_rate": 1.931121562173215e-05, "loss": 1.9136, "step": 170500 }, { "epoch": 0.03445980816834808, "grad_norm": 14.064164161682129, "learning_rate": 1.931081163788585e-05, "loss": 1.9251, "step": 170600 }, { "epoch": 0.034480007352503034, "grad_norm": 9.861713409423828, "learning_rate": 1.9310407654039546e-05, "loss": 1.8302, "step": 170700 }, { "epoch": 0.03450020653665799, "grad_norm": 8.609768867492676, "learning_rate": 1.9310003670193245e-05, "loss": 1.9154, "step": 170800 }, { "epoch": 0.034520405720812934, "grad_norm": 6.329281806945801, "learning_rate": 1.930959968634694e-05, "loss": 1.9341, "step": 170900 }, { "epoch": 0.03454060490496789, "grad_norm": 7.913784027099609, "learning_rate": 1.930919570250064e-05, "loss": 1.828, "step": 171000 }, { "epoch": 0.03454060490496789, "eval_calculated_loss": 8.516435623168945, "eval_loss": 2.1499826908111572, "eval_perplexity": 4996.213593175418, "eval_runtime": 116.5162, "eval_samples_per_second": 8.565, "eval_steps_per_second": 2.146, "step": 171000 }, { "epoch": 0.03456080408912284, "grad_norm": 4.703094959259033, "learning_rate": 1.930879171865434e-05, "loss": 1.8075, "step": 171100 }, { "epoch": 0.034581003273277795, "grad_norm": 8.751540184020996, "learning_rate": 1.930838773480804e-05, "loss": 1.9117, "step": 171200 }, { "epoch": 0.03460120245743274, "grad_norm": 10.210956573486328, "learning_rate": 1.9307983750961737e-05, "loss": 1.8936, "step": 171300 }, { "epoch": 0.034621401641587696, "grad_norm": 7.018980026245117, "learning_rate": 1.9307579767115433e-05, "loss": 1.9379, "step": 171400 }, { "epoch": 0.03464160082574265, "grad_norm": 6.15402364730835, "learning_rate": 1.9307175783269132e-05, "loss": 1.9477, "step": 171500 }, { "epoch": 0.0346618000098976, "grad_norm": 13.942824363708496, "learning_rate": 1.930677179942283e-05, "loss": 1.961, "step": 171600 }, { "epoch": 0.03468199919405255, "grad_norm": 12.224624633789062, "learning_rate": 1.9306367815576527e-05, "loss": 1.8218, "step": 171700 }, { "epoch": 0.034702198378207504, "grad_norm": 8.078397750854492, "learning_rate": 1.9305963831730226e-05, "loss": 1.8753, "step": 171800 }, { "epoch": 0.03472239756236246, "grad_norm": 7.704348087310791, "learning_rate": 1.9305559847883922e-05, "loss": 1.8918, "step": 171900 }, { "epoch": 0.03474259674651741, "grad_norm": 9.276354789733887, "learning_rate": 1.930515586403762e-05, "loss": 1.8176, "step": 172000 }, { "epoch": 0.03474259674651741, "eval_calculated_loss": 8.691880226135254, "eval_loss": 2.159313678741455, "eval_perplexity": 5954.367281195127, "eval_runtime": 115.8958, "eval_samples_per_second": 8.611, "eval_steps_per_second": 2.157, "step": 172000 }, { "epoch": 0.03476279593067236, "grad_norm": 6.757080554962158, "learning_rate": 1.930475188019132e-05, "loss": 1.8202, "step": 172100 }, { "epoch": 0.03478299511482731, "grad_norm": 6.329104423522949, "learning_rate": 1.930434789634502e-05, "loss": 1.8127, "step": 172200 }, { "epoch": 0.034803194298982265, "grad_norm": 10.59896183013916, "learning_rate": 1.930394391249872e-05, "loss": 1.8182, "step": 172300 }, { "epoch": 0.03482339348313722, "grad_norm": 11.715038299560547, "learning_rate": 1.9303539928652414e-05, "loss": 1.8316, "step": 172400 }, { "epoch": 0.034843592667292166, "grad_norm": 17.093265533447266, "learning_rate": 1.9303135944806113e-05, "loss": 1.8662, "step": 172500 }, { "epoch": 0.03486379185144712, "grad_norm": 6.646310806274414, "learning_rate": 1.930273196095981e-05, "loss": 1.8996, "step": 172600 }, { "epoch": 0.03488399103560207, "grad_norm": 11.429150581359863, "learning_rate": 1.930232797711351e-05, "loss": 1.8836, "step": 172700 }, { "epoch": 0.03490419021975703, "grad_norm": 12.97887134552002, "learning_rate": 1.9301923993267207e-05, "loss": 1.9178, "step": 172800 }, { "epoch": 0.03492438940391197, "grad_norm": 9.375802993774414, "learning_rate": 1.9301520009420903e-05, "loss": 1.8471, "step": 172900 }, { "epoch": 0.03494458858806693, "grad_norm": 9.99439525604248, "learning_rate": 1.9301116025574602e-05, "loss": 1.9022, "step": 173000 }, { "epoch": 0.03494458858806693, "eval_calculated_loss": 8.70579719543457, "eval_loss": 2.144232988357544, "eval_perplexity": 6037.813339128403, "eval_runtime": 116.4725, "eval_samples_per_second": 8.569, "eval_steps_per_second": 2.146, "step": 173000 }, { "epoch": 0.03496478777222188, "grad_norm": 15.337119102478027, "learning_rate": 1.93007120417283e-05, "loss": 1.9572, "step": 173100 }, { "epoch": 0.034984986956376835, "grad_norm": 6.583555698394775, "learning_rate": 1.9300308057882e-05, "loss": 1.8821, "step": 173200 }, { "epoch": 0.03500518614053178, "grad_norm": 7.3887176513671875, "learning_rate": 1.9299904074035696e-05, "loss": 1.787, "step": 173300 }, { "epoch": 0.035025385324686735, "grad_norm": 8.00586223602295, "learning_rate": 1.9299500090189395e-05, "loss": 1.9787, "step": 173400 }, { "epoch": 0.03504558450884169, "grad_norm": 11.257737159729004, "learning_rate": 1.9299096106343095e-05, "loss": 1.8569, "step": 173500 }, { "epoch": 0.03506578369299664, "grad_norm": 8.749199867248535, "learning_rate": 1.929869212249679e-05, "loss": 1.7547, "step": 173600 }, { "epoch": 0.03508598287715159, "grad_norm": 6.26441764831543, "learning_rate": 1.929828813865049e-05, "loss": 1.7985, "step": 173700 }, { "epoch": 0.03510618206130654, "grad_norm": 8.29989242553711, "learning_rate": 1.929788415480419e-05, "loss": 1.9626, "step": 173800 }, { "epoch": 0.0351263812454615, "grad_norm": 10.507983207702637, "learning_rate": 1.9297480170957884e-05, "loss": 1.8691, "step": 173900 }, { "epoch": 0.03514658042961645, "grad_norm": 11.22715950012207, "learning_rate": 1.9297076187111583e-05, "loss": 1.9084, "step": 174000 }, { "epoch": 0.03514658042961645, "eval_calculated_loss": 8.885384559631348, "eval_loss": 2.1591544151306152, "eval_perplexity": 7225.592811370175, "eval_runtime": 115.5518, "eval_samples_per_second": 8.637, "eval_steps_per_second": 2.164, "step": 174000 }, { "epoch": 0.0351667796137714, "grad_norm": 6.52236270904541, "learning_rate": 1.9296672203265283e-05, "loss": 1.819, "step": 174100 }, { "epoch": 0.03518697879792635, "grad_norm": 10.422433853149414, "learning_rate": 1.9296268219418982e-05, "loss": 1.8943, "step": 174200 }, { "epoch": 0.035207177982081304, "grad_norm": 11.109524726867676, "learning_rate": 1.9295864235572677e-05, "loss": 1.7767, "step": 174300 }, { "epoch": 0.03522737716623626, "grad_norm": 7.900753498077393, "learning_rate": 1.9295460251726377e-05, "loss": 1.9018, "step": 174400 }, { "epoch": 0.035247576350391205, "grad_norm": 8.60867977142334, "learning_rate": 1.9295056267880076e-05, "loss": 1.8307, "step": 174500 }, { "epoch": 0.03526777553454616, "grad_norm": 9.324322700500488, "learning_rate": 1.929465228403377e-05, "loss": 1.9472, "step": 174600 }, { "epoch": 0.03528797471870111, "grad_norm": 8.042919158935547, "learning_rate": 1.929424830018747e-05, "loss": 1.9518, "step": 174700 }, { "epoch": 0.035308173902856066, "grad_norm": 6.033627510070801, "learning_rate": 1.929384431634117e-05, "loss": 1.905, "step": 174800 }, { "epoch": 0.03532837308701101, "grad_norm": 8.407913208007812, "learning_rate": 1.9293440332494865e-05, "loss": 1.9413, "step": 174900 }, { "epoch": 0.035348572271165966, "grad_norm": 9.538910865783691, "learning_rate": 1.9293036348648565e-05, "loss": 1.8869, "step": 175000 }, { "epoch": 0.035348572271165966, "eval_calculated_loss": 8.496248245239258, "eval_loss": 2.154132843017578, "eval_perplexity": 4896.3643790051465, "eval_runtime": 114.4162, "eval_samples_per_second": 8.723, "eval_steps_per_second": 2.185, "step": 175000 }, { "epoch": 0.03536877145532092, "grad_norm": 7.0011491775512695, "learning_rate": 1.929263236480226e-05, "loss": 1.83, "step": 175100 }, { "epoch": 0.035388970639475874, "grad_norm": 6.190149307250977, "learning_rate": 1.929222838095596e-05, "loss": 1.9287, "step": 175200 }, { "epoch": 0.03540916982363082, "grad_norm": 10.161815643310547, "learning_rate": 1.929182439710966e-05, "loss": 1.8942, "step": 175300 }, { "epoch": 0.035429369007785774, "grad_norm": 6.961226463317871, "learning_rate": 1.9291420413263358e-05, "loss": 1.8683, "step": 175400 }, { "epoch": 0.03544956819194073, "grad_norm": 5.269403457641602, "learning_rate": 1.9291016429417057e-05, "loss": 1.8183, "step": 175500 }, { "epoch": 0.03546976737609568, "grad_norm": 9.816115379333496, "learning_rate": 1.9290612445570753e-05, "loss": 1.8884, "step": 175600 }, { "epoch": 0.03548996656025063, "grad_norm": 11.391027450561523, "learning_rate": 1.929020846172445e-05, "loss": 1.9596, "step": 175700 }, { "epoch": 0.03551016574440558, "grad_norm": 4.859189510345459, "learning_rate": 1.9289804477878147e-05, "loss": 1.8419, "step": 175800 }, { "epoch": 0.035530364928560536, "grad_norm": 5.057873725891113, "learning_rate": 1.9289400494031847e-05, "loss": 1.8616, "step": 175900 }, { "epoch": 0.03555056411271549, "grad_norm": 7.0868377685546875, "learning_rate": 1.9288996510185546e-05, "loss": 1.8918, "step": 176000 }, { "epoch": 0.03555056411271549, "eval_calculated_loss": 8.823039054870605, "eval_loss": 2.1522586345672607, "eval_perplexity": 6788.865039884128, "eval_runtime": 114.9195, "eval_samples_per_second": 8.684, "eval_steps_per_second": 2.175, "step": 176000 }, { "epoch": 0.035570763296870436, "grad_norm": 5.746357440948486, "learning_rate": 1.928859252633924e-05, "loss": 1.8647, "step": 176100 }, { "epoch": 0.03559096248102539, "grad_norm": 9.570052146911621, "learning_rate": 1.928818854249294e-05, "loss": 1.9607, "step": 176200 }, { "epoch": 0.035611161665180344, "grad_norm": 7.259548187255859, "learning_rate": 1.928778455864664e-05, "loss": 1.8604, "step": 176300 }, { "epoch": 0.0356313608493353, "grad_norm": 7.872356414794922, "learning_rate": 1.928738057480034e-05, "loss": 1.8445, "step": 176400 }, { "epoch": 0.035651560033490244, "grad_norm": 14.27995491027832, "learning_rate": 1.9286976590954038e-05, "loss": 1.9406, "step": 176500 }, { "epoch": 0.0356717592176452, "grad_norm": 8.635711669921875, "learning_rate": 1.9286572607107734e-05, "loss": 1.7737, "step": 176600 }, { "epoch": 0.03569195840180015, "grad_norm": 6.806236267089844, "learning_rate": 1.9286168623261433e-05, "loss": 1.861, "step": 176700 }, { "epoch": 0.035712157585955105, "grad_norm": 10.11923885345459, "learning_rate": 1.928576463941513e-05, "loss": 1.8916, "step": 176800 }, { "epoch": 0.03573235677011005, "grad_norm": 9.30077838897705, "learning_rate": 1.9285360655568828e-05, "loss": 1.8625, "step": 176900 }, { "epoch": 0.035752555954265006, "grad_norm": 8.817821502685547, "learning_rate": 1.9284956671722527e-05, "loss": 1.8693, "step": 177000 }, { "epoch": 0.035752555954265006, "eval_calculated_loss": 8.802331924438477, "eval_loss": 2.1533966064453125, "eval_perplexity": 6649.732614119759, "eval_runtime": 117.0621, "eval_samples_per_second": 8.525, "eval_steps_per_second": 2.136, "step": 177000 }, { "epoch": 0.03577275513841996, "grad_norm": 6.8481035232543945, "learning_rate": 1.9284552687876223e-05, "loss": 1.8841, "step": 177100 }, { "epoch": 0.03579295432257491, "grad_norm": 6.573037624359131, "learning_rate": 1.928414870402992e-05, "loss": 1.8947, "step": 177200 }, { "epoch": 0.03581315350672986, "grad_norm": 9.411544799804688, "learning_rate": 1.928374472018362e-05, "loss": 1.9024, "step": 177300 }, { "epoch": 0.035833352690884814, "grad_norm": 5.929566383361816, "learning_rate": 1.928334073633732e-05, "loss": 1.8852, "step": 177400 }, { "epoch": 0.03585355187503977, "grad_norm": 7.605165004730225, "learning_rate": 1.9282936752491016e-05, "loss": 1.8987, "step": 177500 }, { "epoch": 0.03587375105919472, "grad_norm": 11.251494407653809, "learning_rate": 1.9282532768644715e-05, "loss": 1.8307, "step": 177600 }, { "epoch": 0.03589395024334967, "grad_norm": 10.710345268249512, "learning_rate": 1.9282128784798414e-05, "loss": 1.7866, "step": 177700 }, { "epoch": 0.03591414942750462, "grad_norm": 6.052304267883301, "learning_rate": 1.928172480095211e-05, "loss": 1.8602, "step": 177800 }, { "epoch": 0.035934348611659575, "grad_norm": 7.743120193481445, "learning_rate": 1.928132081710581e-05, "loss": 1.8989, "step": 177900 }, { "epoch": 0.03595454779581453, "grad_norm": 5.352055549621582, "learning_rate": 1.9280916833259508e-05, "loss": 1.9354, "step": 178000 }, { "epoch": 0.03595454779581453, "eval_calculated_loss": 8.864767074584961, "eval_loss": 2.165762186050415, "eval_perplexity": 7078.144489498647, "eval_runtime": 115.8049, "eval_samples_per_second": 8.618, "eval_steps_per_second": 2.159, "step": 178000 }, { "epoch": 0.03597474697996948, "grad_norm": 8.41420841217041, "learning_rate": 1.9280512849413204e-05, "loss": 1.8858, "step": 178100 }, { "epoch": 0.03599494616412443, "grad_norm": 6.979867458343506, "learning_rate": 1.9280108865566903e-05, "loss": 1.7948, "step": 178200 }, { "epoch": 0.03601514534827938, "grad_norm": 6.518843173980713, "learning_rate": 1.92797048817206e-05, "loss": 1.9211, "step": 178300 }, { "epoch": 0.03603534453243434, "grad_norm": 15.507050514221191, "learning_rate": 1.92793008978743e-05, "loss": 1.9718, "step": 178400 }, { "epoch": 0.03605554371658929, "grad_norm": 10.502939224243164, "learning_rate": 1.9278896914027997e-05, "loss": 1.9214, "step": 178500 }, { "epoch": 0.03607574290074424, "grad_norm": 9.733681678771973, "learning_rate": 1.9278492930181696e-05, "loss": 1.8627, "step": 178600 }, { "epoch": 0.03609594208489919, "grad_norm": 8.314748764038086, "learning_rate": 1.9278088946335395e-05, "loss": 1.8094, "step": 178700 }, { "epoch": 0.036116141269054144, "grad_norm": 6.511251926422119, "learning_rate": 1.927768496248909e-05, "loss": 1.8995, "step": 178800 }, { "epoch": 0.0361363404532091, "grad_norm": 5.3394598960876465, "learning_rate": 1.927728097864279e-05, "loss": 1.9496, "step": 178900 }, { "epoch": 0.036156539637364045, "grad_norm": 6.386066913604736, "learning_rate": 1.9276876994796486e-05, "loss": 1.8752, "step": 179000 }, { "epoch": 0.036156539637364045, "eval_calculated_loss": 8.641555786132812, "eval_loss": 2.1538212299346924, "eval_perplexity": 5662.132041989633, "eval_runtime": 115.6192, "eval_samples_per_second": 8.632, "eval_steps_per_second": 2.162, "step": 179000 }, { "epoch": 0.036176738821519, "grad_norm": 8.553481101989746, "learning_rate": 1.9276473010950185e-05, "loss": 1.8096, "step": 179100 }, { "epoch": 0.03619693800567395, "grad_norm": 12.055900573730469, "learning_rate": 1.9276069027103884e-05, "loss": 1.9321, "step": 179200 }, { "epoch": 0.036217137189828906, "grad_norm": 9.668008804321289, "learning_rate": 1.927566504325758e-05, "loss": 1.8211, "step": 179300 }, { "epoch": 0.03623733637398385, "grad_norm": 9.840803146362305, "learning_rate": 1.9275261059411282e-05, "loss": 1.8602, "step": 179400 }, { "epoch": 0.036257535558138806, "grad_norm": 5.5796732902526855, "learning_rate": 1.9274857075564978e-05, "loss": 1.8822, "step": 179500 }, { "epoch": 0.03627773474229376, "grad_norm": 9.44176197052002, "learning_rate": 1.9274453091718677e-05, "loss": 1.8816, "step": 179600 }, { "epoch": 0.036297933926448714, "grad_norm": 7.541870594024658, "learning_rate": 1.9274049107872376e-05, "loss": 1.8382, "step": 179700 }, { "epoch": 0.03631813311060366, "grad_norm": 5.8545637130737305, "learning_rate": 1.9273645124026072e-05, "loss": 1.8639, "step": 179800 }, { "epoch": 0.036338332294758614, "grad_norm": 10.119240760803223, "learning_rate": 1.927324114017977e-05, "loss": 1.8926, "step": 179900 }, { "epoch": 0.03635853147891357, "grad_norm": 7.253044605255127, "learning_rate": 1.9272837156333467e-05, "loss": 1.8857, "step": 180000 }, { "epoch": 0.03635853147891357, "eval_calculated_loss": 8.643569946289062, "eval_loss": 2.160858154296875, "eval_perplexity": 5673.54797564801, "eval_runtime": 116.6264, "eval_samples_per_second": 8.557, "eval_steps_per_second": 2.144, "step": 180000 }, { "epoch": 0.03637873066306852, "grad_norm": 9.755952835083008, "learning_rate": 1.9272433172487166e-05, "loss": 1.9068, "step": 180100 }, { "epoch": 0.03639892984722347, "grad_norm": 7.838980674743652, "learning_rate": 1.9272029188640865e-05, "loss": 1.8411, "step": 180200 }, { "epoch": 0.03641912903137842, "grad_norm": 10.947504997253418, "learning_rate": 1.927162520479456e-05, "loss": 1.844, "step": 180300 }, { "epoch": 0.036439328215533376, "grad_norm": 8.383092880249023, "learning_rate": 1.927122122094826e-05, "loss": 1.8162, "step": 180400 }, { "epoch": 0.03645952739968833, "grad_norm": 6.868319511413574, "learning_rate": 1.927081723710196e-05, "loss": 1.925, "step": 180500 }, { "epoch": 0.036479726583843276, "grad_norm": 14.619441032409668, "learning_rate": 1.9270413253255658e-05, "loss": 1.916, "step": 180600 }, { "epoch": 0.03649992576799823, "grad_norm": 12.167838096618652, "learning_rate": 1.9270009269409354e-05, "loss": 1.9093, "step": 180700 }, { "epoch": 0.036520124952153184, "grad_norm": 10.289194107055664, "learning_rate": 1.9269605285563053e-05, "loss": 1.9192, "step": 180800 }, { "epoch": 0.03654032413630814, "grad_norm": 10.987932205200195, "learning_rate": 1.9269201301716752e-05, "loss": 1.9177, "step": 180900 }, { "epoch": 0.036560523320463084, "grad_norm": 8.117514610290527, "learning_rate": 1.9268797317870448e-05, "loss": 1.8472, "step": 181000 }, { "epoch": 0.036560523320463084, "eval_calculated_loss": 8.807206153869629, "eval_loss": 2.145449638366699, "eval_perplexity": 6682.22405758387, "eval_runtime": 117.6237, "eval_samples_per_second": 8.485, "eval_steps_per_second": 2.125, "step": 181000 }, { "epoch": 0.03658072250461804, "grad_norm": 6.606201171875, "learning_rate": 1.9268393334024147e-05, "loss": 1.8916, "step": 181100 }, { "epoch": 0.03660092168877299, "grad_norm": 7.120405673980713, "learning_rate": 1.9267989350177846e-05, "loss": 1.9358, "step": 181200 }, { "epoch": 0.036621120872927945, "grad_norm": 8.028386116027832, "learning_rate": 1.9267585366331542e-05, "loss": 1.9142, "step": 181300 }, { "epoch": 0.03664132005708289, "grad_norm": 5.305957317352295, "learning_rate": 1.926718138248524e-05, "loss": 1.8294, "step": 181400 }, { "epoch": 0.036661519241237846, "grad_norm": 3.6817586421966553, "learning_rate": 1.926677739863894e-05, "loss": 1.8166, "step": 181500 }, { "epoch": 0.0366817184253928, "grad_norm": 9.322967529296875, "learning_rate": 1.926637341479264e-05, "loss": 1.8423, "step": 181600 }, { "epoch": 0.03670191760954775, "grad_norm": 5.2085161209106445, "learning_rate": 1.9265969430946335e-05, "loss": 1.8987, "step": 181700 }, { "epoch": 0.0367221167937027, "grad_norm": 7.176606178283691, "learning_rate": 1.9265565447100034e-05, "loss": 1.8589, "step": 181800 }, { "epoch": 0.036742315977857654, "grad_norm": 7.8761420249938965, "learning_rate": 1.9265161463253733e-05, "loss": 1.9127, "step": 181900 }, { "epoch": 0.03676251516201261, "grad_norm": 7.365816593170166, "learning_rate": 1.926475747940743e-05, "loss": 1.9022, "step": 182000 }, { "epoch": 0.03676251516201261, "eval_calculated_loss": 8.909403800964355, "eval_loss": 2.1481103897094727, "eval_perplexity": 7401.247166197875, "eval_runtime": 115.0593, "eval_samples_per_second": 8.674, "eval_steps_per_second": 2.173, "step": 182000 }, { "epoch": 0.03678271434616756, "grad_norm": 12.9530668258667, "learning_rate": 1.9264353495561128e-05, "loss": 2.0173, "step": 182100 }, { "epoch": 0.03680291353032251, "grad_norm": 4.581433296203613, "learning_rate": 1.9263949511714827e-05, "loss": 1.8551, "step": 182200 }, { "epoch": 0.03682311271447746, "grad_norm": 9.221866607666016, "learning_rate": 1.9263545527868523e-05, "loss": 1.8498, "step": 182300 }, { "epoch": 0.036843311898632415, "grad_norm": 14.401711463928223, "learning_rate": 1.9263141544022222e-05, "loss": 1.9208, "step": 182400 }, { "epoch": 0.03686351108278737, "grad_norm": 10.368573188781738, "learning_rate": 1.926273756017592e-05, "loss": 1.8482, "step": 182500 }, { "epoch": 0.036883710266942316, "grad_norm": 9.070782661437988, "learning_rate": 1.926233357632962e-05, "loss": 1.9194, "step": 182600 }, { "epoch": 0.03690390945109727, "grad_norm": 8.625724792480469, "learning_rate": 1.9261929592483316e-05, "loss": 1.8764, "step": 182700 }, { "epoch": 0.03692410863525222, "grad_norm": 10.296141624450684, "learning_rate": 1.9261525608637015e-05, "loss": 1.8192, "step": 182800 }, { "epoch": 0.03694430781940718, "grad_norm": 7.8852410316467285, "learning_rate": 1.9261121624790714e-05, "loss": 1.9093, "step": 182900 }, { "epoch": 0.03696450700356212, "grad_norm": 8.814546585083008, "learning_rate": 1.926071764094441e-05, "loss": 1.925, "step": 183000 }, { "epoch": 0.03696450700356212, "eval_calculated_loss": 8.765191078186035, "eval_loss": 2.147709369659424, "eval_perplexity": 6407.286120885254, "eval_runtime": 115.2866, "eval_samples_per_second": 8.657, "eval_steps_per_second": 2.169, "step": 183000 }, { "epoch": 0.03698470618771708, "grad_norm": 13.959532737731934, "learning_rate": 1.926031365709811e-05, "loss": 2.0185, "step": 183100 }, { "epoch": 0.03700490537187203, "grad_norm": 7.717657566070557, "learning_rate": 1.9259909673251805e-05, "loss": 1.9288, "step": 183200 }, { "epoch": 0.037025104556026985, "grad_norm": 10.531784057617188, "learning_rate": 1.9259505689405504e-05, "loss": 1.9472, "step": 183300 }, { "epoch": 0.03704530374018193, "grad_norm": 12.905617713928223, "learning_rate": 1.9259101705559203e-05, "loss": 1.8373, "step": 183400 }, { "epoch": 0.037065502924336885, "grad_norm": 7.78174352645874, "learning_rate": 1.92586977217129e-05, "loss": 1.8457, "step": 183500 }, { "epoch": 0.03708570210849184, "grad_norm": 6.0078206062316895, "learning_rate": 1.92582937378666e-05, "loss": 1.8987, "step": 183600 }, { "epoch": 0.03710590129264679, "grad_norm": 9.602002143859863, "learning_rate": 1.9257889754020297e-05, "loss": 1.9034, "step": 183700 }, { "epoch": 0.03712610047680174, "grad_norm": 10.83747673034668, "learning_rate": 1.9257485770173996e-05, "loss": 1.8892, "step": 183800 }, { "epoch": 0.03714629966095669, "grad_norm": 10.454262733459473, "learning_rate": 1.9257081786327692e-05, "loss": 1.8323, "step": 183900 }, { "epoch": 0.03716649884511165, "grad_norm": 4.942661762237549, "learning_rate": 1.925667780248139e-05, "loss": 1.9056, "step": 184000 }, { "epoch": 0.03716649884511165, "eval_calculated_loss": 8.858805656433105, "eval_loss": 2.1587486267089844, "eval_perplexity": 7036.074234241927, "eval_runtime": 117.3598, "eval_samples_per_second": 8.504, "eval_steps_per_second": 2.13, "step": 184000 }, { "epoch": 0.0371866980292666, "grad_norm": 7.396584987640381, "learning_rate": 1.925627381863509e-05, "loss": 1.9038, "step": 184100 }, { "epoch": 0.03720689721342155, "grad_norm": 5.863966941833496, "learning_rate": 1.9255869834788786e-05, "loss": 1.9412, "step": 184200 }, { "epoch": 0.0372270963975765, "grad_norm": 9.993583679199219, "learning_rate": 1.9255465850942485e-05, "loss": 1.8423, "step": 184300 }, { "epoch": 0.037247295581731454, "grad_norm": 7.376931190490723, "learning_rate": 1.9255061867096184e-05, "loss": 1.8529, "step": 184400 }, { "epoch": 0.03726749476588641, "grad_norm": 8.116250991821289, "learning_rate": 1.925465788324988e-05, "loss": 1.8329, "step": 184500 }, { "epoch": 0.037287693950041355, "grad_norm": 6.289344310760498, "learning_rate": 1.9254253899403583e-05, "loss": 1.8591, "step": 184600 }, { "epoch": 0.03730789313419631, "grad_norm": 7.241195201873779, "learning_rate": 1.925384991555728e-05, "loss": 1.8502, "step": 184700 }, { "epoch": 0.03732809231835126, "grad_norm": 15.89418888092041, "learning_rate": 1.9253445931710977e-05, "loss": 1.8928, "step": 184800 }, { "epoch": 0.037348291502506216, "grad_norm": 11.819393157958984, "learning_rate": 1.9253041947864673e-05, "loss": 1.8715, "step": 184900 }, { "epoch": 0.03736849068666116, "grad_norm": 6.865317344665527, "learning_rate": 1.9252637964018372e-05, "loss": 1.842, "step": 185000 }, { "epoch": 0.03736849068666116, "eval_calculated_loss": 9.034724235534668, "eval_loss": 2.1687569618225098, "eval_perplexity": 8389.399600139703, "eval_runtime": 115.326, "eval_samples_per_second": 8.654, "eval_steps_per_second": 2.168, "step": 185000 }, { "epoch": 0.037388689870816116, "grad_norm": 11.395030975341797, "learning_rate": 1.925223398017207e-05, "loss": 1.8682, "step": 185100 }, { "epoch": 0.03740888905497107, "grad_norm": 7.036442279815674, "learning_rate": 1.9251829996325767e-05, "loss": 1.8166, "step": 185200 }, { "epoch": 0.037429088239126024, "grad_norm": 11.08996868133545, "learning_rate": 1.9251426012479466e-05, "loss": 1.8504, "step": 185300 }, { "epoch": 0.03744928742328097, "grad_norm": 8.456270217895508, "learning_rate": 1.9251022028633165e-05, "loss": 1.8942, "step": 185400 }, { "epoch": 0.037469486607435924, "grad_norm": 9.790822982788086, "learning_rate": 1.925061804478686e-05, "loss": 1.9065, "step": 185500 }, { "epoch": 0.03748968579159088, "grad_norm": 9.20614242553711, "learning_rate": 1.925021406094056e-05, "loss": 1.9018, "step": 185600 }, { "epoch": 0.03750988497574583, "grad_norm": 11.162237167358398, "learning_rate": 1.924981007709426e-05, "loss": 1.8251, "step": 185700 }, { "epoch": 0.03753008415990078, "grad_norm": 12.115494728088379, "learning_rate": 1.924940609324796e-05, "loss": 1.9624, "step": 185800 }, { "epoch": 0.03755028334405573, "grad_norm": 11.382243156433105, "learning_rate": 1.9249002109401654e-05, "loss": 1.9503, "step": 185900 }, { "epoch": 0.037570482528210686, "grad_norm": 14.17362117767334, "learning_rate": 1.9248598125555353e-05, "loss": 1.8462, "step": 186000 }, { "epoch": 0.037570482528210686, "eval_calculated_loss": 8.759293556213379, "eval_loss": 2.1542155742645264, "eval_perplexity": 6369.610216638474, "eval_runtime": 116.2833, "eval_samples_per_second": 8.582, "eval_steps_per_second": 2.15, "step": 186000 }, { "epoch": 0.03759068171236564, "grad_norm": 10.914087295532227, "learning_rate": 1.9248194141709053e-05, "loss": 1.9941, "step": 186100 }, { "epoch": 0.037610880896520586, "grad_norm": 10.856618881225586, "learning_rate": 1.924779015786275e-05, "loss": 1.948, "step": 186200 }, { "epoch": 0.03763108008067554, "grad_norm": 9.326981544494629, "learning_rate": 1.9247386174016447e-05, "loss": 1.8449, "step": 186300 }, { "epoch": 0.037651279264830494, "grad_norm": 12.619038581848145, "learning_rate": 1.9246982190170143e-05, "loss": 1.9158, "step": 186400 }, { "epoch": 0.03767147844898545, "grad_norm": 10.468655586242676, "learning_rate": 1.9246578206323842e-05, "loss": 1.8112, "step": 186500 }, { "epoch": 0.037691677633140394, "grad_norm": 8.628546714782715, "learning_rate": 1.924617422247754e-05, "loss": 1.8621, "step": 186600 }, { "epoch": 0.03771187681729535, "grad_norm": 10.120201110839844, "learning_rate": 1.924577023863124e-05, "loss": 1.9151, "step": 186700 }, { "epoch": 0.0377320760014503, "grad_norm": 7.397960662841797, "learning_rate": 1.924536625478494e-05, "loss": 1.9212, "step": 186800 }, { "epoch": 0.037752275185605255, "grad_norm": 7.0347700119018555, "learning_rate": 1.9244962270938635e-05, "loss": 1.8873, "step": 186900 }, { "epoch": 0.0377724743697602, "grad_norm": 12.240497589111328, "learning_rate": 1.9244558287092335e-05, "loss": 1.8371, "step": 187000 }, { "epoch": 0.0377724743697602, "eval_calculated_loss": 8.960716247558594, "eval_loss": 2.1498329639434814, "eval_perplexity": 7790.935702916603, "eval_runtime": 116.0414, "eval_samples_per_second": 8.6, "eval_steps_per_second": 2.154, "step": 187000 }, { "epoch": 0.037792673553915156, "grad_norm": 10.022509574890137, "learning_rate": 1.9244154303246034e-05, "loss": 1.8797, "step": 187100 }, { "epoch": 0.03781287273807011, "grad_norm": 9.702503204345703, "learning_rate": 1.924375031939973e-05, "loss": 1.8584, "step": 187200 }, { "epoch": 0.03783307192222506, "grad_norm": 9.723908424377441, "learning_rate": 1.924334633555343e-05, "loss": 1.8588, "step": 187300 }, { "epoch": 0.03785327110638002, "grad_norm": 12.206493377685547, "learning_rate": 1.9242942351707124e-05, "loss": 1.8757, "step": 187400 }, { "epoch": 0.037873470290534964, "grad_norm": 7.785461902618408, "learning_rate": 1.9242538367860823e-05, "loss": 1.8185, "step": 187500 }, { "epoch": 0.03789366947468992, "grad_norm": 7.438809871673584, "learning_rate": 1.9242134384014523e-05, "loss": 1.9254, "step": 187600 }, { "epoch": 0.03791386865884487, "grad_norm": 10.319104194641113, "learning_rate": 1.924173040016822e-05, "loss": 1.8663, "step": 187700 }, { "epoch": 0.037934067842999825, "grad_norm": 9.679981231689453, "learning_rate": 1.924132641632192e-05, "loss": 1.8221, "step": 187800 }, { "epoch": 0.03795426702715477, "grad_norm": 10.702319145202637, "learning_rate": 1.9240922432475617e-05, "loss": 1.8201, "step": 187900 }, { "epoch": 0.037974466211309725, "grad_norm": 5.7000813484191895, "learning_rate": 1.9240518448629316e-05, "loss": 1.9292, "step": 188000 }, { "epoch": 0.037974466211309725, "eval_calculated_loss": 8.676486015319824, "eval_loss": 2.16723370552063, "eval_perplexity": 5863.40642760522, "eval_runtime": 114.6128, "eval_samples_per_second": 8.708, "eval_steps_per_second": 2.181, "step": 188000 }, { "epoch": 0.03799466539546468, "grad_norm": 8.915352821350098, "learning_rate": 1.924011446478301e-05, "loss": 2.002, "step": 188100 }, { "epoch": 0.03801486457961963, "grad_norm": 12.192968368530273, "learning_rate": 1.923971048093671e-05, "loss": 1.8916, "step": 188200 }, { "epoch": 0.03803506376377458, "grad_norm": 10.272913932800293, "learning_rate": 1.923930649709041e-05, "loss": 1.8387, "step": 188300 }, { "epoch": 0.03805526294792953, "grad_norm": 8.301383972167969, "learning_rate": 1.9238902513244105e-05, "loss": 1.9187, "step": 188400 }, { "epoch": 0.03807546213208449, "grad_norm": 8.257829666137695, "learning_rate": 1.9238498529397805e-05, "loss": 1.8357, "step": 188500 }, { "epoch": 0.03809566131623944, "grad_norm": 10.053245544433594, "learning_rate": 1.9238094545551504e-05, "loss": 1.9545, "step": 188600 }, { "epoch": 0.03811586050039439, "grad_norm": 10.278520584106445, "learning_rate": 1.9237690561705203e-05, "loss": 1.9365, "step": 188700 }, { "epoch": 0.03813605968454934, "grad_norm": 8.513630867004395, "learning_rate": 1.92372865778589e-05, "loss": 1.8314, "step": 188800 }, { "epoch": 0.038156258868704294, "grad_norm": 11.264870643615723, "learning_rate": 1.9236882594012598e-05, "loss": 1.9147, "step": 188900 }, { "epoch": 0.03817645805285925, "grad_norm": 7.991087913513184, "learning_rate": 1.9236478610166297e-05, "loss": 1.886, "step": 189000 }, { "epoch": 0.03817645805285925, "eval_calculated_loss": 8.800079345703125, "eval_loss": 2.1569089889526367, "eval_perplexity": 6634.770425917558, "eval_runtime": 115.6558, "eval_samples_per_second": 8.629, "eval_steps_per_second": 2.162, "step": 189000 }, { "epoch": 0.038196657237014195, "grad_norm": 6.121678352355957, "learning_rate": 1.9236074626319993e-05, "loss": 1.8103, "step": 189100 }, { "epoch": 0.03821685642116915, "grad_norm": 11.118112564086914, "learning_rate": 1.923567064247369e-05, "loss": 1.9548, "step": 189200 }, { "epoch": 0.0382370556053241, "grad_norm": 8.144820213317871, "learning_rate": 1.923526665862739e-05, "loss": 1.8995, "step": 189300 }, { "epoch": 0.038257254789479056, "grad_norm": 10.166838645935059, "learning_rate": 1.9234862674781087e-05, "loss": 1.8567, "step": 189400 }, { "epoch": 0.038277453973634, "grad_norm": 8.51311206817627, "learning_rate": 1.9234458690934786e-05, "loss": 1.8945, "step": 189500 }, { "epoch": 0.038297653157788956, "grad_norm": 5.624179363250732, "learning_rate": 1.9234054707088485e-05, "loss": 1.7397, "step": 189600 }, { "epoch": 0.03831785234194391, "grad_norm": 12.036606788635254, "learning_rate": 1.923365072324218e-05, "loss": 1.9355, "step": 189700 }, { "epoch": 0.038338051526098864, "grad_norm": 7.774526596069336, "learning_rate": 1.923324673939588e-05, "loss": 1.9063, "step": 189800 }, { "epoch": 0.03835825071025381, "grad_norm": 5.6215667724609375, "learning_rate": 1.923284275554958e-05, "loss": 1.8335, "step": 189900 }, { "epoch": 0.038378449894408764, "grad_norm": 8.797396659851074, "learning_rate": 1.9232438771703278e-05, "loss": 1.8509, "step": 190000 }, { "epoch": 0.038378449894408764, "eval_calculated_loss": 8.742568016052246, "eval_loss": 2.1553781032562256, "eval_perplexity": 6263.9610279036215, "eval_runtime": 116.27, "eval_samples_per_second": 8.583, "eval_steps_per_second": 2.15, "step": 190000 }, { "epoch": 0.03839864907856372, "grad_norm": 6.79378080368042, "learning_rate": 1.9232034787856974e-05, "loss": 1.8631, "step": 190100 }, { "epoch": 0.03841884826271867, "grad_norm": 8.842333793640137, "learning_rate": 1.9231630804010673e-05, "loss": 1.8544, "step": 190200 }, { "epoch": 0.03843904744687362, "grad_norm": 10.558267593383789, "learning_rate": 1.9231226820164372e-05, "loss": 1.9145, "step": 190300 }, { "epoch": 0.03845924663102857, "grad_norm": 5.368472576141357, "learning_rate": 1.9230822836318068e-05, "loss": 1.919, "step": 190400 }, { "epoch": 0.038479445815183526, "grad_norm": 8.330390930175781, "learning_rate": 1.9230418852471767e-05, "loss": 1.9864, "step": 190500 }, { "epoch": 0.03849964499933848, "grad_norm": 7.551198482513428, "learning_rate": 1.9230014868625463e-05, "loss": 1.895, "step": 190600 }, { "epoch": 0.038519844183493426, "grad_norm": 6.901564121246338, "learning_rate": 1.922961088477916e-05, "loss": 1.8954, "step": 190700 }, { "epoch": 0.03854004336764838, "grad_norm": 9.384733200073242, "learning_rate": 1.922920690093286e-05, "loss": 1.8603, "step": 190800 }, { "epoch": 0.038560242551803334, "grad_norm": 9.172077178955078, "learning_rate": 1.922880291708656e-05, "loss": 1.9145, "step": 190900 }, { "epoch": 0.03858044173595829, "grad_norm": 8.177810668945312, "learning_rate": 1.922839893324026e-05, "loss": 1.9929, "step": 191000 }, { "epoch": 0.03858044173595829, "eval_calculated_loss": 8.77212905883789, "eval_loss": 2.153428792953491, "eval_perplexity": 6451.894314478277, "eval_runtime": 116.1575, "eval_samples_per_second": 8.592, "eval_steps_per_second": 2.152, "step": 191000 }, { "epoch": 0.038600640920113234, "grad_norm": 6.675577640533447, "learning_rate": 1.9227994949393955e-05, "loss": 1.9354, "step": 191100 }, { "epoch": 0.03862084010426819, "grad_norm": 5.253208637237549, "learning_rate": 1.9227590965547654e-05, "loss": 1.8394, "step": 191200 }, { "epoch": 0.03864103928842314, "grad_norm": 9.375314712524414, "learning_rate": 1.922718698170135e-05, "loss": 1.9024, "step": 191300 }, { "epoch": 0.038661238472578095, "grad_norm": 9.940751075744629, "learning_rate": 1.922678299785505e-05, "loss": 1.9164, "step": 191400 }, { "epoch": 0.03868143765673304, "grad_norm": 9.08710765838623, "learning_rate": 1.9226379014008748e-05, "loss": 1.8999, "step": 191500 }, { "epoch": 0.038701636840887996, "grad_norm": 12.83515453338623, "learning_rate": 1.9225975030162444e-05, "loss": 1.7939, "step": 191600 }, { "epoch": 0.03872183602504295, "grad_norm": 6.624438285827637, "learning_rate": 1.9225571046316143e-05, "loss": 1.9388, "step": 191700 }, { "epoch": 0.0387420352091979, "grad_norm": 8.720972061157227, "learning_rate": 1.9225167062469842e-05, "loss": 1.8393, "step": 191800 }, { "epoch": 0.03876223439335285, "grad_norm": 11.169381141662598, "learning_rate": 1.922476307862354e-05, "loss": 1.8753, "step": 191900 }, { "epoch": 0.038782433577507804, "grad_norm": 11.21078109741211, "learning_rate": 1.922435909477724e-05, "loss": 1.8218, "step": 192000 }, { "epoch": 0.038782433577507804, "eval_calculated_loss": 8.832292556762695, "eval_loss": 2.153034210205078, "eval_perplexity": 6851.977370064998, "eval_runtime": 114.8323, "eval_samples_per_second": 8.691, "eval_steps_per_second": 2.177, "step": 192000 }, { "epoch": 0.03880263276166276, "grad_norm": 8.838940620422363, "learning_rate": 1.9223955110930936e-05, "loss": 1.916, "step": 192100 }, { "epoch": 0.03882283194581771, "grad_norm": 10.563461303710938, "learning_rate": 1.9223551127084635e-05, "loss": 1.9009, "step": 192200 }, { "epoch": 0.03884303112997266, "grad_norm": 12.017937660217285, "learning_rate": 1.922314714323833e-05, "loss": 1.9385, "step": 192300 }, { "epoch": 0.03886323031412761, "grad_norm": 6.799251556396484, "learning_rate": 1.922274315939203e-05, "loss": 1.896, "step": 192400 }, { "epoch": 0.038883429498282565, "grad_norm": 8.059572219848633, "learning_rate": 1.922233917554573e-05, "loss": 1.8764, "step": 192500 }, { "epoch": 0.03890362868243752, "grad_norm": 3.7331676483154297, "learning_rate": 1.9221935191699425e-05, "loss": 1.9597, "step": 192600 }, { "epoch": 0.038923827866592466, "grad_norm": 9.69257640838623, "learning_rate": 1.9221531207853124e-05, "loss": 1.9069, "step": 192700 }, { "epoch": 0.03894402705074742, "grad_norm": 11.19588851928711, "learning_rate": 1.9221127224006823e-05, "loss": 1.8904, "step": 192800 }, { "epoch": 0.03896422623490237, "grad_norm": 7.841254234313965, "learning_rate": 1.9220723240160522e-05, "loss": 1.8655, "step": 192900 }, { "epoch": 0.03898442541905733, "grad_norm": 16.166717529296875, "learning_rate": 1.9220319256314218e-05, "loss": 1.8917, "step": 193000 }, { "epoch": 0.03898442541905733, "eval_calculated_loss": 8.747621536254883, "eval_loss": 2.164707899093628, "eval_perplexity": 6295.696201138315, "eval_runtime": 116.4917, "eval_samples_per_second": 8.567, "eval_steps_per_second": 2.146, "step": 193000 }, { "epoch": 0.03900462460321227, "grad_norm": 5.1951141357421875, "learning_rate": 1.9219915272467917e-05, "loss": 1.83, "step": 193100 }, { "epoch": 0.03902482378736723, "grad_norm": 6.443612575531006, "learning_rate": 1.9219511288621616e-05, "loss": 1.8379, "step": 193200 }, { "epoch": 0.03904502297152218, "grad_norm": 10.286405563354492, "learning_rate": 1.9219107304775312e-05, "loss": 1.917, "step": 193300 }, { "epoch": 0.039065222155677135, "grad_norm": 9.113442420959473, "learning_rate": 1.921870332092901e-05, "loss": 1.9532, "step": 193400 }, { "epoch": 0.03908542133983208, "grad_norm": 9.518424034118652, "learning_rate": 1.921829933708271e-05, "loss": 1.8615, "step": 193500 }, { "epoch": 0.039105620523987035, "grad_norm": 9.218367576599121, "learning_rate": 1.9217895353236406e-05, "loss": 1.8771, "step": 193600 }, { "epoch": 0.03912581970814199, "grad_norm": 7.302276611328125, "learning_rate": 1.9217491369390105e-05, "loss": 2.031, "step": 193700 }, { "epoch": 0.03914601889229694, "grad_norm": 4.820382595062256, "learning_rate": 1.92170873855438e-05, "loss": 1.8899, "step": 193800 }, { "epoch": 0.03916621807645189, "grad_norm": 6.062244415283203, "learning_rate": 1.9216683401697503e-05, "loss": 1.8828, "step": 193900 }, { "epoch": 0.03918641726060684, "grad_norm": 8.305116653442383, "learning_rate": 1.92162794178512e-05, "loss": 1.9282, "step": 194000 }, { "epoch": 0.03918641726060684, "eval_calculated_loss": 8.820188522338867, "eval_loss": 2.156078338623047, "eval_perplexity": 6769.540714628295, "eval_runtime": 116.1317, "eval_samples_per_second": 8.594, "eval_steps_per_second": 2.153, "step": 194000 }, { "epoch": 0.039206616444761797, "grad_norm": 9.274125099182129, "learning_rate": 1.9215875434004898e-05, "loss": 1.9367, "step": 194100 }, { "epoch": 0.03922681562891675, "grad_norm": 11.487557411193848, "learning_rate": 1.9215471450158597e-05, "loss": 1.8634, "step": 194200 }, { "epoch": 0.0392470148130717, "grad_norm": 10.27464771270752, "learning_rate": 1.9215067466312293e-05, "loss": 1.8315, "step": 194300 }, { "epoch": 0.03926721399722665, "grad_norm": 10.689704895019531, "learning_rate": 1.9214663482465992e-05, "loss": 1.8304, "step": 194400 }, { "epoch": 0.039287413181381604, "grad_norm": 10.70937728881836, "learning_rate": 1.9214259498619688e-05, "loss": 1.8598, "step": 194500 }, { "epoch": 0.03930761236553656, "grad_norm": 10.290186882019043, "learning_rate": 1.9213855514773387e-05, "loss": 1.8959, "step": 194600 }, { "epoch": 0.039327811549691505, "grad_norm": 6.964941501617432, "learning_rate": 1.9213451530927086e-05, "loss": 1.7826, "step": 194700 }, { "epoch": 0.03934801073384646, "grad_norm": 9.228343963623047, "learning_rate": 1.9213047547080782e-05, "loss": 1.9003, "step": 194800 }, { "epoch": 0.03936820991800141, "grad_norm": 4.369132995605469, "learning_rate": 1.921264356323448e-05, "loss": 1.9751, "step": 194900 }, { "epoch": 0.039388409102156366, "grad_norm": 7.334677219390869, "learning_rate": 1.921223957938818e-05, "loss": 1.937, "step": 195000 }, { "epoch": 0.039388409102156366, "eval_calculated_loss": 8.735472679138184, "eval_loss": 2.1507885456085205, "eval_perplexity": 6219.673417552783, "eval_runtime": 116.6008, "eval_samples_per_second": 8.559, "eval_steps_per_second": 2.144, "step": 195000 }, { "epoch": 0.03940860828631131, "grad_norm": 10.138018608093262, "learning_rate": 1.921183559554188e-05, "loss": 1.956, "step": 195100 }, { "epoch": 0.039428807470466266, "grad_norm": 7.39090633392334, "learning_rate": 1.921143161169558e-05, "loss": 1.9249, "step": 195200 }, { "epoch": 0.03944900665462122, "grad_norm": 9.126243591308594, "learning_rate": 1.9211027627849274e-05, "loss": 1.8945, "step": 195300 }, { "epoch": 0.039469205838776174, "grad_norm": 8.288269996643066, "learning_rate": 1.9210623644002973e-05, "loss": 1.8365, "step": 195400 }, { "epoch": 0.03948940502293112, "grad_norm": 9.086993217468262, "learning_rate": 1.921021966015667e-05, "loss": 1.9679, "step": 195500 }, { "epoch": 0.039509604207086074, "grad_norm": 9.450535774230957, "learning_rate": 1.9209815676310368e-05, "loss": 1.9032, "step": 195600 }, { "epoch": 0.03952980339124103, "grad_norm": 5.627622604370117, "learning_rate": 1.9209411692464067e-05, "loss": 1.9589, "step": 195700 }, { "epoch": 0.03955000257539598, "grad_norm": 10.263936042785645, "learning_rate": 1.9209007708617763e-05, "loss": 1.8751, "step": 195800 }, { "epoch": 0.03957020175955093, "grad_norm": 12.04333782196045, "learning_rate": 1.9208603724771462e-05, "loss": 1.9063, "step": 195900 }, { "epoch": 0.03959040094370588, "grad_norm": 8.310556411743164, "learning_rate": 1.920819974092516e-05, "loss": 1.9085, "step": 196000 }, { "epoch": 0.03959040094370588, "eval_calculated_loss": 8.963282585144043, "eval_loss": 2.137153148651123, "eval_perplexity": 7810.955551894553, "eval_runtime": 115.0246, "eval_samples_per_second": 8.676, "eval_steps_per_second": 2.173, "step": 196000 }, { "epoch": 0.039610600127860836, "grad_norm": 16.685335159301758, "learning_rate": 1.920779575707886e-05, "loss": 1.8289, "step": 196100 }, { "epoch": 0.03963079931201579, "grad_norm": 6.464444160461426, "learning_rate": 1.9207391773232556e-05, "loss": 1.8501, "step": 196200 }, { "epoch": 0.039650998496170736, "grad_norm": 4.55080509185791, "learning_rate": 1.9206987789386255e-05, "loss": 1.8507, "step": 196300 }, { "epoch": 0.03967119768032569, "grad_norm": 13.132109642028809, "learning_rate": 1.9206583805539954e-05, "loss": 1.777, "step": 196400 }, { "epoch": 0.039691396864480644, "grad_norm": 6.258199691772461, "learning_rate": 1.920617982169365e-05, "loss": 1.8224, "step": 196500 }, { "epoch": 0.0397115960486356, "grad_norm": 10.072660446166992, "learning_rate": 1.920577583784735e-05, "loss": 1.8622, "step": 196600 }, { "epoch": 0.03973179523279055, "grad_norm": 7.234834671020508, "learning_rate": 1.920537185400105e-05, "loss": 1.8508, "step": 196700 }, { "epoch": 0.0397519944169455, "grad_norm": 10.774652481079102, "learning_rate": 1.9204967870154744e-05, "loss": 1.9028, "step": 196800 }, { "epoch": 0.03977219360110045, "grad_norm": 7.1877946853637695, "learning_rate": 1.9204563886308443e-05, "loss": 1.8328, "step": 196900 }, { "epoch": 0.039792392785255405, "grad_norm": 8.671481132507324, "learning_rate": 1.9204159902462142e-05, "loss": 1.8449, "step": 197000 }, { "epoch": 0.039792392785255405, "eval_calculated_loss": 8.720308303833008, "eval_loss": 2.1451637744903564, "eval_perplexity": 6126.0674870849325, "eval_runtime": 115.2819, "eval_samples_per_second": 8.657, "eval_steps_per_second": 2.169, "step": 197000 }, { "epoch": 0.03981259196941036, "grad_norm": 6.420688152313232, "learning_rate": 1.920375591861584e-05, "loss": 1.9277, "step": 197100 }, { "epoch": 0.039832791153565306, "grad_norm": 8.814358711242676, "learning_rate": 1.9203351934769537e-05, "loss": 1.7965, "step": 197200 }, { "epoch": 0.03985299033772026, "grad_norm": 6.279759883880615, "learning_rate": 1.9202947950923236e-05, "loss": 1.9058, "step": 197300 }, { "epoch": 0.03987318952187521, "grad_norm": 4.893459796905518, "learning_rate": 1.9202543967076935e-05, "loss": 1.9293, "step": 197400 }, { "epoch": 0.03989338870603017, "grad_norm": 8.133665084838867, "learning_rate": 1.920213998323063e-05, "loss": 1.8087, "step": 197500 }, { "epoch": 0.039913587890185113, "grad_norm": 12.520848274230957, "learning_rate": 1.920173599938433e-05, "loss": 1.8802, "step": 197600 }, { "epoch": 0.03993378707434007, "grad_norm": 6.983875274658203, "learning_rate": 1.920133201553803e-05, "loss": 1.8587, "step": 197700 }, { "epoch": 0.03995398625849502, "grad_norm": 8.400647163391113, "learning_rate": 1.9200928031691725e-05, "loss": 1.7907, "step": 197800 }, { "epoch": 0.039974185442649975, "grad_norm": 12.3555269241333, "learning_rate": 1.9200524047845424e-05, "loss": 1.8392, "step": 197900 }, { "epoch": 0.03999438462680492, "grad_norm": 4.834227085113525, "learning_rate": 1.920012006399912e-05, "loss": 1.84, "step": 198000 }, { "epoch": 0.03999438462680492, "eval_calculated_loss": 8.702235221862793, "eval_loss": 2.15303635597229, "eval_perplexity": 6016.345064994152, "eval_runtime": 116.1681, "eval_samples_per_second": 8.591, "eval_steps_per_second": 2.152, "step": 198000 }, { "epoch": 0.040014583810959875, "grad_norm": 9.216290473937988, "learning_rate": 1.9199716080152823e-05, "loss": 1.8608, "step": 198100 }, { "epoch": 0.04003478299511483, "grad_norm": 7.627422332763672, "learning_rate": 1.919931209630652e-05, "loss": 1.861, "step": 198200 }, { "epoch": 0.04005498217926978, "grad_norm": 9.890130043029785, "learning_rate": 1.9198908112460217e-05, "loss": 1.8878, "step": 198300 }, { "epoch": 0.04007518136342473, "grad_norm": 11.87736701965332, "learning_rate": 1.9198504128613917e-05, "loss": 1.9142, "step": 198400 }, { "epoch": 0.04009538054757968, "grad_norm": 25.417097091674805, "learning_rate": 1.9198100144767612e-05, "loss": 1.892, "step": 198500 }, { "epoch": 0.04011557973173464, "grad_norm": 6.256922245025635, "learning_rate": 1.919769616092131e-05, "loss": 1.9343, "step": 198600 }, { "epoch": 0.04013577891588959, "grad_norm": 5.34206485748291, "learning_rate": 1.9197292177075007e-05, "loss": 1.8326, "step": 198700 }, { "epoch": 0.04015597810004454, "grad_norm": 6.527047634124756, "learning_rate": 1.9196888193228706e-05, "loss": 1.8175, "step": 198800 }, { "epoch": 0.04017617728419949, "grad_norm": 12.728317260742188, "learning_rate": 1.9196484209382405e-05, "loss": 1.9316, "step": 198900 }, { "epoch": 0.040196376468354444, "grad_norm": 8.482234954833984, "learning_rate": 1.91960802255361e-05, "loss": 1.85, "step": 199000 }, { "epoch": 0.040196376468354444, "eval_calculated_loss": 8.703866004943848, "eval_loss": 2.1559948921203613, "eval_perplexity": 6026.164423181354, "eval_runtime": 114.6548, "eval_samples_per_second": 8.704, "eval_steps_per_second": 2.18, "step": 199000 }, { "epoch": 0.0402165756525094, "grad_norm": 8.785974502563477, "learning_rate": 1.9195676241689804e-05, "loss": 1.8392, "step": 199100 }, { "epoch": 0.040236774836664345, "grad_norm": 7.740417957305908, "learning_rate": 1.91952722578435e-05, "loss": 1.9412, "step": 199200 }, { "epoch": 0.0402569740208193, "grad_norm": 11.07666015625, "learning_rate": 1.91948682739972e-05, "loss": 1.862, "step": 199300 }, { "epoch": 0.04027717320497425, "grad_norm": 8.411739349365234, "learning_rate": 1.9194464290150894e-05, "loss": 1.9669, "step": 199400 }, { "epoch": 0.040297372389129206, "grad_norm": 6.379426956176758, "learning_rate": 1.9194060306304593e-05, "loss": 1.806, "step": 199500 }, { "epoch": 0.04031757157328415, "grad_norm": 5.333110809326172, "learning_rate": 1.9193656322458293e-05, "loss": 1.8056, "step": 199600 }, { "epoch": 0.040337770757439106, "grad_norm": 8.40981674194336, "learning_rate": 1.9193252338611988e-05, "loss": 1.8772, "step": 199700 }, { "epoch": 0.04035796994159406, "grad_norm": 7.882214069366455, "learning_rate": 1.9192848354765687e-05, "loss": 1.7926, "step": 199800 }, { "epoch": 0.040378169125749014, "grad_norm": 10.435395240783691, "learning_rate": 1.9192444370919387e-05, "loss": 1.8429, "step": 199900 }, { "epoch": 0.04039836830990396, "grad_norm": 7.487983703613281, "learning_rate": 1.9192040387073082e-05, "loss": 1.8069, "step": 200000 }, { "epoch": 0.04039836830990396, "eval_calculated_loss": 8.797872543334961, "eval_loss": 2.156423568725586, "eval_perplexity": 6620.144942540574, "eval_runtime": 116.398, "eval_samples_per_second": 8.574, "eval_steps_per_second": 2.148, "step": 200000 }, { "epoch": 0.040418567494058914, "grad_norm": 10.260890007019043, "learning_rate": 1.9191636403226785e-05, "loss": 1.8166, "step": 200100 }, { "epoch": 0.04043876667821387, "grad_norm": 9.908806800842285, "learning_rate": 1.919123241938048e-05, "loss": 1.8354, "step": 200200 }, { "epoch": 0.04045896586236882, "grad_norm": 9.756707191467285, "learning_rate": 1.919082843553418e-05, "loss": 1.8721, "step": 200300 }, { "epoch": 0.04047916504652377, "grad_norm": 7.318221092224121, "learning_rate": 1.9190424451687875e-05, "loss": 1.8694, "step": 200400 }, { "epoch": 0.04049936423067872, "grad_norm": 6.3385491371154785, "learning_rate": 1.9190020467841575e-05, "loss": 1.8891, "step": 200500 }, { "epoch": 0.040519563414833676, "grad_norm": 7.155636310577393, "learning_rate": 1.9189616483995274e-05, "loss": 1.8247, "step": 200600 }, { "epoch": 0.04053976259898863, "grad_norm": 5.3379106521606445, "learning_rate": 1.918921250014897e-05, "loss": 1.811, "step": 200700 }, { "epoch": 0.040559961783143576, "grad_norm": 10.028721809387207, "learning_rate": 1.918880851630267e-05, "loss": 1.8249, "step": 200800 }, { "epoch": 0.04058016096729853, "grad_norm": 12.235604286193848, "learning_rate": 1.9188404532456368e-05, "loss": 1.9419, "step": 200900 }, { "epoch": 0.040600360151453484, "grad_norm": 10.906242370605469, "learning_rate": 1.9188000548610063e-05, "loss": 1.878, "step": 201000 }, { "epoch": 0.040600360151453484, "eval_calculated_loss": 8.6763277053833, "eval_loss": 2.150991916656494, "eval_perplexity": 5862.478265576429, "eval_runtime": 116.1603, "eval_samples_per_second": 8.592, "eval_steps_per_second": 2.152, "step": 201000 }, { "epoch": 0.04062055933560844, "grad_norm": 8.82659912109375, "learning_rate": 1.9187596564763763e-05, "loss": 1.8455, "step": 201100 }, { "epoch": 0.040640758519763384, "grad_norm": 5.211157321929932, "learning_rate": 1.918719258091746e-05, "loss": 1.8896, "step": 201200 }, { "epoch": 0.04066095770391834, "grad_norm": 7.598946571350098, "learning_rate": 1.918678859707116e-05, "loss": 1.8872, "step": 201300 }, { "epoch": 0.04068115688807329, "grad_norm": 11.124927520751953, "learning_rate": 1.9186384613224857e-05, "loss": 1.8729, "step": 201400 }, { "epoch": 0.040701356072228245, "grad_norm": 10.32551097869873, "learning_rate": 1.9185980629378556e-05, "loss": 1.8993, "step": 201500 }, { "epoch": 0.04072155525638319, "grad_norm": 7.5982584953308105, "learning_rate": 1.9185576645532255e-05, "loss": 1.8599, "step": 201600 }, { "epoch": 0.040741754440538146, "grad_norm": 5.777035236358643, "learning_rate": 1.918517266168595e-05, "loss": 1.9439, "step": 201700 }, { "epoch": 0.0407619536246931, "grad_norm": 8.381050109863281, "learning_rate": 1.918476867783965e-05, "loss": 1.9074, "step": 201800 }, { "epoch": 0.04078215280884805, "grad_norm": 8.055017471313477, "learning_rate": 1.9184364693993345e-05, "loss": 1.8742, "step": 201900 }, { "epoch": 0.040802351993003, "grad_norm": 8.445761680603027, "learning_rate": 1.9183960710147045e-05, "loss": 1.8141, "step": 202000 }, { "epoch": 0.040802351993003, "eval_calculated_loss": 8.899706840515137, "eval_loss": 2.151169776916504, "eval_perplexity": 7329.824416603445, "eval_runtime": 114.7165, "eval_samples_per_second": 8.7, "eval_steps_per_second": 2.179, "step": 202000 }, { "epoch": 0.040822551177157954, "grad_norm": 9.612513542175293, "learning_rate": 1.9183556726300744e-05, "loss": 1.886, "step": 202100 }, { "epoch": 0.04084275036131291, "grad_norm": 5.811991214752197, "learning_rate": 1.9183152742454443e-05, "loss": 1.8142, "step": 202200 }, { "epoch": 0.04086294954546786, "grad_norm": 5.465378284454346, "learning_rate": 1.9182748758608142e-05, "loss": 1.9, "step": 202300 }, { "epoch": 0.04088314872962281, "grad_norm": 10.064153671264648, "learning_rate": 1.9182344774761838e-05, "loss": 1.9616, "step": 202400 }, { "epoch": 0.04090334791377776, "grad_norm": 5.648518085479736, "learning_rate": 1.9181940790915537e-05, "loss": 1.7825, "step": 202500 }, { "epoch": 0.040923547097932715, "grad_norm": 10.937169075012207, "learning_rate": 1.9181536807069236e-05, "loss": 1.8728, "step": 202600 }, { "epoch": 0.04094374628208767, "grad_norm": 8.633846282958984, "learning_rate": 1.918113282322293e-05, "loss": 1.8053, "step": 202700 }, { "epoch": 0.040963945466242616, "grad_norm": 5.777010917663574, "learning_rate": 1.918072883937663e-05, "loss": 1.9042, "step": 202800 }, { "epoch": 0.04098414465039757, "grad_norm": 6.338438987731934, "learning_rate": 1.9180324855530327e-05, "loss": 1.8634, "step": 202900 }, { "epoch": 0.04100434383455252, "grad_norm": 10.789576530456543, "learning_rate": 1.9179920871684026e-05, "loss": 1.8876, "step": 203000 }, { "epoch": 0.04100434383455252, "eval_calculated_loss": 8.925415992736816, "eval_loss": 2.1508677005767822, "eval_perplexity": 7520.711243595524, "eval_runtime": 114.369, "eval_samples_per_second": 8.726, "eval_steps_per_second": 2.186, "step": 203000 }, { "epoch": 0.04102454301870748, "grad_norm": 9.623980522155762, "learning_rate": 1.9179516887837725e-05, "loss": 1.9121, "step": 203100 }, { "epoch": 0.04104474220286242, "grad_norm": 11.223094940185547, "learning_rate": 1.9179112903991424e-05, "loss": 1.8714, "step": 203200 }, { "epoch": 0.04106494138701738, "grad_norm": 5.84520959854126, "learning_rate": 1.9178708920145123e-05, "loss": 1.9315, "step": 203300 }, { "epoch": 0.04108514057117233, "grad_norm": 6.134271144866943, "learning_rate": 1.917830493629882e-05, "loss": 1.8937, "step": 203400 }, { "epoch": 0.041105339755327285, "grad_norm": 6.660818099975586, "learning_rate": 1.9177900952452518e-05, "loss": 1.7996, "step": 203500 }, { "epoch": 0.04112553893948223, "grad_norm": 5.901226997375488, "learning_rate": 1.9177496968606214e-05, "loss": 1.9126, "step": 203600 }, { "epoch": 0.041145738123637185, "grad_norm": 8.107010841369629, "learning_rate": 1.9177092984759913e-05, "loss": 1.9246, "step": 203700 }, { "epoch": 0.04116593730779214, "grad_norm": 10.130114555358887, "learning_rate": 1.9176689000913612e-05, "loss": 1.9042, "step": 203800 }, { "epoch": 0.04118613649194709, "grad_norm": 8.689801216125488, "learning_rate": 1.9176285017067308e-05, "loss": 1.787, "step": 203900 }, { "epoch": 0.04120633567610204, "grad_norm": 12.473678588867188, "learning_rate": 1.9175881033221007e-05, "loss": 1.8406, "step": 204000 }, { "epoch": 0.04120633567610204, "eval_calculated_loss": 9.061912536621094, "eval_loss": 2.1543405055999756, "eval_perplexity": 8620.622153395178, "eval_runtime": 115.1384, "eval_samples_per_second": 8.668, "eval_steps_per_second": 2.171, "step": 204000 }, { "epoch": 0.04122653486025699, "grad_norm": 10.928964614868164, "learning_rate": 1.9175477049374706e-05, "loss": 1.8941, "step": 204100 }, { "epoch": 0.041246734044411947, "grad_norm": 6.528296947479248, "learning_rate": 1.91750730655284e-05, "loss": 1.7851, "step": 204200 }, { "epoch": 0.0412669332285669, "grad_norm": 6.1570563316345215, "learning_rate": 1.91746690816821e-05, "loss": 1.8422, "step": 204300 }, { "epoch": 0.04128713241272185, "grad_norm": 8.017904281616211, "learning_rate": 1.91742650978358e-05, "loss": 1.8721, "step": 204400 }, { "epoch": 0.0413073315968768, "grad_norm": 9.161508560180664, "learning_rate": 1.91738611139895e-05, "loss": 1.9038, "step": 204500 }, { "epoch": 0.041327530781031754, "grad_norm": 9.952686309814453, "learning_rate": 1.9173457130143195e-05, "loss": 1.9042, "step": 204600 }, { "epoch": 0.04134772996518671, "grad_norm": 6.418429851531982, "learning_rate": 1.9173053146296894e-05, "loss": 1.8478, "step": 204700 }, { "epoch": 0.041367929149341655, "grad_norm": 8.840271949768066, "learning_rate": 1.9172649162450593e-05, "loss": 1.9412, "step": 204800 }, { "epoch": 0.04138812833349661, "grad_norm": 12.747471809387207, "learning_rate": 1.917224517860429e-05, "loss": 1.8362, "step": 204900 }, { "epoch": 0.04140832751765156, "grad_norm": 7.227093696594238, "learning_rate": 1.9171841194757988e-05, "loss": 1.8971, "step": 205000 }, { "epoch": 0.04140832751765156, "eval_calculated_loss": 8.830872535705566, "eval_loss": 2.1394405364990234, "eval_perplexity": 6842.254323016154, "eval_runtime": 114.433, "eval_samples_per_second": 8.721, "eval_steps_per_second": 2.185, "step": 205000 }, { "epoch": 0.041428526701806516, "grad_norm": 12.339046478271484, "learning_rate": 1.9171437210911684e-05, "loss": 1.8529, "step": 205100 }, { "epoch": 0.04144872588596146, "grad_norm": 10.103473663330078, "learning_rate": 1.9171033227065383e-05, "loss": 1.9598, "step": 205200 }, { "epoch": 0.041468925070116416, "grad_norm": 7.742167949676514, "learning_rate": 1.9170629243219082e-05, "loss": 1.7994, "step": 205300 }, { "epoch": 0.04148912425427137, "grad_norm": 5.372630596160889, "learning_rate": 1.917022525937278e-05, "loss": 1.8247, "step": 205400 }, { "epoch": 0.041509323438426324, "grad_norm": 6.624951362609863, "learning_rate": 1.916982127552648e-05, "loss": 1.8903, "step": 205500 }, { "epoch": 0.04152952262258127, "grad_norm": 7.686150074005127, "learning_rate": 1.9169417291680176e-05, "loss": 1.8704, "step": 205600 }, { "epoch": 0.041549721806736224, "grad_norm": 4.398284435272217, "learning_rate": 1.9169013307833875e-05, "loss": 1.8331, "step": 205700 }, { "epoch": 0.04156992099089118, "grad_norm": 10.608869552612305, "learning_rate": 1.9168609323987574e-05, "loss": 1.8325, "step": 205800 }, { "epoch": 0.04159012017504613, "grad_norm": 8.445277214050293, "learning_rate": 1.916820534014127e-05, "loss": 1.866, "step": 205900 }, { "epoch": 0.041610319359201085, "grad_norm": 4.5129804611206055, "learning_rate": 1.916780135629497e-05, "loss": 1.9396, "step": 206000 }, { "epoch": 0.041610319359201085, "eval_calculated_loss": 8.997671127319336, "eval_loss": 2.1393935680389404, "eval_perplexity": 8084.234833876248, "eval_runtime": 117.2306, "eval_samples_per_second": 8.513, "eval_steps_per_second": 2.133, "step": 206000 }, { "epoch": 0.04163051854335603, "grad_norm": 5.684268474578857, "learning_rate": 1.9167397372448665e-05, "loss": 1.8829, "step": 206100 }, { "epoch": 0.041650717727510986, "grad_norm": 11.033501625061035, "learning_rate": 1.9166993388602364e-05, "loss": 1.9275, "step": 206200 }, { "epoch": 0.04167091691166594, "grad_norm": 8.640451431274414, "learning_rate": 1.9166589404756063e-05, "loss": 1.8402, "step": 206300 }, { "epoch": 0.04169111609582089, "grad_norm": 9.046432495117188, "learning_rate": 1.9166185420909762e-05, "loss": 1.8504, "step": 206400 }, { "epoch": 0.04171131527997584, "grad_norm": 6.564426422119141, "learning_rate": 1.916578143706346e-05, "loss": 1.8391, "step": 206500 }, { "epoch": 0.041731514464130794, "grad_norm": 6.5555644035339355, "learning_rate": 1.9165377453217157e-05, "loss": 1.8742, "step": 206600 }, { "epoch": 0.04175171364828575, "grad_norm": 4.981863498687744, "learning_rate": 1.9164973469370856e-05, "loss": 1.8772, "step": 206700 }, { "epoch": 0.0417719128324407, "grad_norm": 10.192010879516602, "learning_rate": 1.9164569485524552e-05, "loss": 1.8025, "step": 206800 }, { "epoch": 0.04179211201659565, "grad_norm": 10.951991081237793, "learning_rate": 1.916416550167825e-05, "loss": 1.7281, "step": 206900 }, { "epoch": 0.0418123112007506, "grad_norm": 8.597357749938965, "learning_rate": 1.916376151783195e-05, "loss": 1.9458, "step": 207000 }, { "epoch": 0.0418123112007506, "eval_calculated_loss": 8.928159713745117, "eval_loss": 2.145660400390625, "eval_perplexity": 7541.374310915254, "eval_runtime": 116.8682, "eval_samples_per_second": 8.54, "eval_steps_per_second": 2.139, "step": 207000 }, { "epoch": 0.041832510384905555, "grad_norm": 8.840129852294922, "learning_rate": 1.9163357533985646e-05, "loss": 1.9385, "step": 207100 }, { "epoch": 0.04185270956906051, "grad_norm": 10.445405006408691, "learning_rate": 1.9162953550139345e-05, "loss": 1.8747, "step": 207200 }, { "epoch": 0.041872908753215456, "grad_norm": 9.782524108886719, "learning_rate": 1.9162549566293044e-05, "loss": 1.8437, "step": 207300 }, { "epoch": 0.04189310793737041, "grad_norm": 6.390991687774658, "learning_rate": 1.9162145582446743e-05, "loss": 1.7623, "step": 207400 }, { "epoch": 0.04191330712152536, "grad_norm": 6.593479156494141, "learning_rate": 1.9161741598600442e-05, "loss": 1.9116, "step": 207500 }, { "epoch": 0.04193350630568032, "grad_norm": 9.215277671813965, "learning_rate": 1.9161337614754138e-05, "loss": 1.9118, "step": 207600 }, { "epoch": 0.041953705489835263, "grad_norm": 9.492897033691406, "learning_rate": 1.9160933630907837e-05, "loss": 1.8856, "step": 207700 }, { "epoch": 0.04197390467399022, "grad_norm": 6.333498954772949, "learning_rate": 1.9160529647061533e-05, "loss": 1.789, "step": 207800 }, { "epoch": 0.04199410385814517, "grad_norm": 10.276865005493164, "learning_rate": 1.9160125663215232e-05, "loss": 1.871, "step": 207900 }, { "epoch": 0.042014303042300125, "grad_norm": 7.308897972106934, "learning_rate": 1.915972167936893e-05, "loss": 1.73, "step": 208000 }, { "epoch": 0.042014303042300125, "eval_calculated_loss": 9.019959449768066, "eval_loss": 2.1523001194000244, "eval_perplexity": 8266.441868330212, "eval_runtime": 115.6627, "eval_samples_per_second": 8.629, "eval_steps_per_second": 2.161, "step": 208000 }, { "epoch": 0.04203450222645507, "grad_norm": 8.757702827453613, "learning_rate": 1.9159317695522627e-05, "loss": 1.8817, "step": 208100 }, { "epoch": 0.042054701410610025, "grad_norm": 13.332296371459961, "learning_rate": 1.9158913711676326e-05, "loss": 1.9693, "step": 208200 }, { "epoch": 0.04207490059476498, "grad_norm": 8.663498878479004, "learning_rate": 1.9158509727830025e-05, "loss": 1.8596, "step": 208300 }, { "epoch": 0.04209509977891993, "grad_norm": 9.174738883972168, "learning_rate": 1.9158105743983724e-05, "loss": 1.8489, "step": 208400 }, { "epoch": 0.04211529896307488, "grad_norm": 5.517697811126709, "learning_rate": 1.915770176013742e-05, "loss": 1.9229, "step": 208500 }, { "epoch": 0.04213549814722983, "grad_norm": 6.75238037109375, "learning_rate": 1.915729777629112e-05, "loss": 1.9038, "step": 208600 }, { "epoch": 0.04215569733138479, "grad_norm": 6.900482177734375, "learning_rate": 1.915689379244482e-05, "loss": 1.858, "step": 208700 }, { "epoch": 0.04217589651553974, "grad_norm": 6.432459354400635, "learning_rate": 1.9156489808598514e-05, "loss": 1.8807, "step": 208800 }, { "epoch": 0.04219609569969469, "grad_norm": 5.592904567718506, "learning_rate": 1.9156085824752213e-05, "loss": 1.8546, "step": 208900 }, { "epoch": 0.04221629488384964, "grad_norm": 12.140987396240234, "learning_rate": 1.9155681840905912e-05, "loss": 1.8554, "step": 209000 }, { "epoch": 0.04221629488384964, "eval_calculated_loss": 8.781381607055664, "eval_loss": 2.1414670944213867, "eval_perplexity": 6511.867803660689, "eval_runtime": 116.033, "eval_samples_per_second": 8.601, "eval_steps_per_second": 2.155, "step": 209000 }, { "epoch": 0.042236494068004594, "grad_norm": 10.906598091125488, "learning_rate": 1.9155277857059608e-05, "loss": 1.9443, "step": 209100 }, { "epoch": 0.04225669325215955, "grad_norm": 10.416831016540527, "learning_rate": 1.9154873873213307e-05, "loss": 1.7877, "step": 209200 }, { "epoch": 0.042276892436314495, "grad_norm": 9.313643455505371, "learning_rate": 1.9154469889367003e-05, "loss": 1.9457, "step": 209300 }, { "epoch": 0.04229709162046945, "grad_norm": 8.68413257598877, "learning_rate": 1.9154065905520702e-05, "loss": 1.8619, "step": 209400 }, { "epoch": 0.0423172908046244, "grad_norm": 7.964523792266846, "learning_rate": 1.91536619216744e-05, "loss": 1.8665, "step": 209500 }, { "epoch": 0.042337489988779356, "grad_norm": 8.142499923706055, "learning_rate": 1.91532579378281e-05, "loss": 1.9776, "step": 209600 }, { "epoch": 0.0423576891729343, "grad_norm": 8.650781631469727, "learning_rate": 1.91528539539818e-05, "loss": 1.8955, "step": 209700 }, { "epoch": 0.042377888357089256, "grad_norm": 8.223980903625488, "learning_rate": 1.9152449970135495e-05, "loss": 1.9664, "step": 209800 }, { "epoch": 0.04239808754124421, "grad_norm": 8.863676071166992, "learning_rate": 1.9152045986289194e-05, "loss": 1.9137, "step": 209900 }, { "epoch": 0.042418286725399164, "grad_norm": 10.316213607788086, "learning_rate": 1.915164200244289e-05, "loss": 1.879, "step": 210000 }, { "epoch": 0.042418286725399164, "eval_calculated_loss": 8.752593994140625, "eval_loss": 2.139275074005127, "eval_perplexity": 6327.079246131675, "eval_runtime": 116.0475, "eval_samples_per_second": 8.6, "eval_steps_per_second": 2.154, "step": 210000 }, { "epoch": 0.04243848590955411, "grad_norm": 8.482129096984863, "learning_rate": 1.915123801859659e-05, "loss": 1.8393, "step": 210100 }, { "epoch": 0.042458685093709064, "grad_norm": 7.937465667724609, "learning_rate": 1.915083403475029e-05, "loss": 1.7357, "step": 210200 }, { "epoch": 0.04247888427786402, "grad_norm": 7.247908592224121, "learning_rate": 1.9150430050903984e-05, "loss": 1.8658, "step": 210300 }, { "epoch": 0.04249908346201897, "grad_norm": 8.91141128540039, "learning_rate": 1.9150026067057683e-05, "loss": 1.7944, "step": 210400 }, { "epoch": 0.04251928264617392, "grad_norm": 10.109628677368164, "learning_rate": 1.9149622083211382e-05, "loss": 1.9434, "step": 210500 }, { "epoch": 0.04253948183032887, "grad_norm": 9.638161659240723, "learning_rate": 1.914921809936508e-05, "loss": 1.83, "step": 210600 }, { "epoch": 0.042559681014483826, "grad_norm": 7.158429145812988, "learning_rate": 1.914881411551878e-05, "loss": 1.8998, "step": 210700 }, { "epoch": 0.04257988019863878, "grad_norm": 9.779906272888184, "learning_rate": 1.9148410131672476e-05, "loss": 1.8654, "step": 210800 }, { "epoch": 0.042600079382793726, "grad_norm": 7.648034572601318, "learning_rate": 1.9148006147826175e-05, "loss": 1.8889, "step": 210900 }, { "epoch": 0.04262027856694868, "grad_norm": 7.930856704711914, "learning_rate": 1.914760216397987e-05, "loss": 1.9122, "step": 211000 }, { "epoch": 0.04262027856694868, "eval_calculated_loss": 8.86432933807373, "eval_loss": 2.135540008544922, "eval_perplexity": 7075.046805258038, "eval_runtime": 116.0306, "eval_samples_per_second": 8.601, "eval_steps_per_second": 2.155, "step": 211000 }, { "epoch": 0.042640477751103634, "grad_norm": 9.21384048461914, "learning_rate": 1.914719818013357e-05, "loss": 1.9583, "step": 211100 }, { "epoch": 0.04266067693525859, "grad_norm": 13.15894603729248, "learning_rate": 1.914679419628727e-05, "loss": 1.8843, "step": 211200 }, { "epoch": 0.042680876119413534, "grad_norm": 9.083513259887695, "learning_rate": 1.9146390212440965e-05, "loss": 1.9493, "step": 211300 }, { "epoch": 0.04270107530356849, "grad_norm": 7.425697326660156, "learning_rate": 1.9145986228594664e-05, "loss": 1.8734, "step": 211400 }, { "epoch": 0.04272127448772344, "grad_norm": 6.376762866973877, "learning_rate": 1.9145582244748363e-05, "loss": 1.8219, "step": 211500 }, { "epoch": 0.042741473671878395, "grad_norm": 9.918581008911133, "learning_rate": 1.9145178260902063e-05, "loss": 1.8764, "step": 211600 }, { "epoch": 0.04276167285603334, "grad_norm": 10.603432655334473, "learning_rate": 1.914477427705576e-05, "loss": 1.9138, "step": 211700 }, { "epoch": 0.042781872040188296, "grad_norm": 8.386831283569336, "learning_rate": 1.9144370293209457e-05, "loss": 1.9228, "step": 211800 }, { "epoch": 0.04280207122434325, "grad_norm": 7.5070905685424805, "learning_rate": 1.9143966309363157e-05, "loss": 1.8109, "step": 211900 }, { "epoch": 0.0428222704084982, "grad_norm": 8.648841857910156, "learning_rate": 1.9143562325516852e-05, "loss": 1.826, "step": 212000 }, { "epoch": 0.0428222704084982, "eval_calculated_loss": 8.794268608093262, "eval_loss": 2.1539084911346436, "eval_perplexity": 6596.329309653469, "eval_runtime": 117.9258, "eval_samples_per_second": 8.463, "eval_steps_per_second": 2.12, "step": 212000 }, { "epoch": 0.04284246959265315, "grad_norm": 5.1313886642456055, "learning_rate": 1.914315834167055e-05, "loss": 1.8295, "step": 212100 }, { "epoch": 0.042862668776808104, "grad_norm": 6.8429388999938965, "learning_rate": 1.914275435782425e-05, "loss": 1.8097, "step": 212200 }, { "epoch": 0.04288286796096306, "grad_norm": 8.551163673400879, "learning_rate": 1.9142350373977946e-05, "loss": 1.8947, "step": 212300 }, { "epoch": 0.04290306714511801, "grad_norm": 12.240937232971191, "learning_rate": 1.9141946390131645e-05, "loss": 1.8523, "step": 212400 }, { "epoch": 0.04292326632927296, "grad_norm": 17.22944450378418, "learning_rate": 1.914154240628534e-05, "loss": 1.8151, "step": 212500 }, { "epoch": 0.04294346551342791, "grad_norm": 4.994401454925537, "learning_rate": 1.9141138422439044e-05, "loss": 1.8079, "step": 212600 }, { "epoch": 0.042963664697582865, "grad_norm": 10.61722469329834, "learning_rate": 1.914073443859274e-05, "loss": 1.8981, "step": 212700 }, { "epoch": 0.04298386388173782, "grad_norm": 11.347257614135742, "learning_rate": 1.914033045474644e-05, "loss": 1.9185, "step": 212800 }, { "epoch": 0.043004063065892766, "grad_norm": 6.7535400390625, "learning_rate": 1.9139926470900138e-05, "loss": 1.9017, "step": 212900 }, { "epoch": 0.04302426225004772, "grad_norm": 8.252427101135254, "learning_rate": 1.9139522487053833e-05, "loss": 1.8302, "step": 213000 }, { "epoch": 0.04302426225004772, "eval_calculated_loss": 8.975553512573242, "eval_loss": 2.162919759750366, "eval_perplexity": 7907.393703303067, "eval_runtime": 117.7372, "eval_samples_per_second": 8.477, "eval_steps_per_second": 2.123, "step": 213000 }, { "epoch": 0.04304446143420267, "grad_norm": 9.20660400390625, "learning_rate": 1.9139118503207533e-05, "loss": 1.7644, "step": 213100 }, { "epoch": 0.04306466061835763, "grad_norm": 9.77186393737793, "learning_rate": 1.913871451936123e-05, "loss": 1.8467, "step": 213200 }, { "epoch": 0.04308485980251257, "grad_norm": 6.236815452575684, "learning_rate": 1.9138310535514927e-05, "loss": 1.9232, "step": 213300 }, { "epoch": 0.04310505898666753, "grad_norm": 9.243416786193848, "learning_rate": 1.9137906551668627e-05, "loss": 1.8127, "step": 213400 }, { "epoch": 0.04312525817082248, "grad_norm": 9.30537223815918, "learning_rate": 1.9137502567822322e-05, "loss": 1.9157, "step": 213500 }, { "epoch": 0.043145457354977435, "grad_norm": 7.540266990661621, "learning_rate": 1.9137098583976025e-05, "loss": 1.8792, "step": 213600 }, { "epoch": 0.04316565653913238, "grad_norm": 11.966422080993652, "learning_rate": 1.913669460012972e-05, "loss": 1.8265, "step": 213700 }, { "epoch": 0.043185855723287335, "grad_norm": 7.825847625732422, "learning_rate": 1.913629061628342e-05, "loss": 1.8357, "step": 213800 }, { "epoch": 0.04320605490744229, "grad_norm": 9.242196083068848, "learning_rate": 1.913588663243712e-05, "loss": 1.9687, "step": 213900 }, { "epoch": 0.04322625409159724, "grad_norm": 10.24905014038086, "learning_rate": 1.9135482648590815e-05, "loss": 1.9197, "step": 214000 }, { "epoch": 0.04322625409159724, "eval_calculated_loss": 8.85120964050293, "eval_loss": 2.1426444053649902, "eval_perplexity": 6982.830578113388, "eval_runtime": 117.8297, "eval_samples_per_second": 8.47, "eval_steps_per_second": 2.122, "step": 214000 }, { "epoch": 0.04324645327575219, "grad_norm": 5.815005302429199, "learning_rate": 1.9135078664744514e-05, "loss": 1.8443, "step": 214100 }, { "epoch": 0.04326665245990714, "grad_norm": 5.8815598487854, "learning_rate": 1.913467468089821e-05, "loss": 1.8526, "step": 214200 }, { "epoch": 0.043286851644062097, "grad_norm": 12.368746757507324, "learning_rate": 1.913427069705191e-05, "loss": 1.8663, "step": 214300 }, { "epoch": 0.04330705082821705, "grad_norm": 6.6092963218688965, "learning_rate": 1.9133866713205608e-05, "loss": 1.9036, "step": 214400 }, { "epoch": 0.043327250012372, "grad_norm": 5.457396984100342, "learning_rate": 1.9133462729359303e-05, "loss": 1.9174, "step": 214500 }, { "epoch": 0.04334744919652695, "grad_norm": 12.003748893737793, "learning_rate": 1.9133058745513006e-05, "loss": 1.8552, "step": 214600 }, { "epoch": 0.043367648380681904, "grad_norm": 9.763564109802246, "learning_rate": 1.91326547616667e-05, "loss": 1.9442, "step": 214700 }, { "epoch": 0.04338784756483686, "grad_norm": 8.937064170837402, "learning_rate": 1.91322507778204e-05, "loss": 1.8638, "step": 214800 }, { "epoch": 0.043408046748991805, "grad_norm": 8.593820571899414, "learning_rate": 1.9131846793974097e-05, "loss": 1.8527, "step": 214900 }, { "epoch": 0.04342824593314676, "grad_norm": 7.414796352386475, "learning_rate": 1.9131442810127796e-05, "loss": 1.7477, "step": 215000 }, { "epoch": 0.04342824593314676, "eval_calculated_loss": 8.876818656921387, "eval_loss": 2.144538402557373, "eval_perplexity": 7163.963418852375, "eval_runtime": 120.1292, "eval_samples_per_second": 8.308, "eval_steps_per_second": 2.081, "step": 215000 }, { "epoch": 0.04344844511730171, "grad_norm": 5.371884822845459, "learning_rate": 1.9131038826281495e-05, "loss": 1.8332, "step": 215100 }, { "epoch": 0.043468644301456666, "grad_norm": 10.723823547363281, "learning_rate": 1.913063484243519e-05, "loss": 1.9385, "step": 215200 }, { "epoch": 0.04348884348561162, "grad_norm": 5.449325084686279, "learning_rate": 1.913023085858889e-05, "loss": 1.908, "step": 215300 }, { "epoch": 0.043509042669766566, "grad_norm": 10.215658187866211, "learning_rate": 1.912982687474259e-05, "loss": 1.9036, "step": 215400 }, { "epoch": 0.04352924185392152, "grad_norm": 4.988282203674316, "learning_rate": 1.9129422890896285e-05, "loss": 1.8057, "step": 215500 }, { "epoch": 0.043549441038076474, "grad_norm": 9.271337509155273, "learning_rate": 1.9129018907049984e-05, "loss": 1.8977, "step": 215600 }, { "epoch": 0.04356964022223143, "grad_norm": 8.808655738830566, "learning_rate": 1.9128614923203683e-05, "loss": 1.8687, "step": 215700 }, { "epoch": 0.043589839406386374, "grad_norm": 4.175015926361084, "learning_rate": 1.9128210939357382e-05, "loss": 1.8092, "step": 215800 }, { "epoch": 0.04361003859054133, "grad_norm": 5.523985862731934, "learning_rate": 1.9127806955511078e-05, "loss": 1.8782, "step": 215900 }, { "epoch": 0.04363023777469628, "grad_norm": 9.314783096313477, "learning_rate": 1.9127402971664777e-05, "loss": 1.9156, "step": 216000 }, { "epoch": 0.04363023777469628, "eval_calculated_loss": 8.727312088012695, "eval_loss": 2.159050464630127, "eval_perplexity": 6169.123743997131, "eval_runtime": 115.9471, "eval_samples_per_second": 8.607, "eval_steps_per_second": 2.156, "step": 216000 }, { "epoch": 0.043650436958851235, "grad_norm": 8.598088264465332, "learning_rate": 1.9126998987818476e-05, "loss": 1.816, "step": 216100 }, { "epoch": 0.04367063614300618, "grad_norm": 6.853077411651611, "learning_rate": 1.912659500397217e-05, "loss": 1.8405, "step": 216200 }, { "epoch": 0.043690835327161136, "grad_norm": 9.568017959594727, "learning_rate": 1.912619102012587e-05, "loss": 1.746, "step": 216300 }, { "epoch": 0.04371103451131609, "grad_norm": 9.818849563598633, "learning_rate": 1.912578703627957e-05, "loss": 1.8533, "step": 216400 }, { "epoch": 0.04373123369547104, "grad_norm": 6.748567581176758, "learning_rate": 1.9125383052433266e-05, "loss": 1.8467, "step": 216500 }, { "epoch": 0.04375143287962599, "grad_norm": 7.000861167907715, "learning_rate": 1.9124979068586965e-05, "loss": 1.9592, "step": 216600 }, { "epoch": 0.043771632063780944, "grad_norm": 6.943436622619629, "learning_rate": 1.9124575084740664e-05, "loss": 1.8336, "step": 216700 }, { "epoch": 0.0437918312479359, "grad_norm": 8.608758926391602, "learning_rate": 1.9124171100894363e-05, "loss": 1.9787, "step": 216800 }, { "epoch": 0.04381203043209085, "grad_norm": 8.623042106628418, "learning_rate": 1.912376711704806e-05, "loss": 1.8608, "step": 216900 }, { "epoch": 0.0438322296162458, "grad_norm": 9.823525428771973, "learning_rate": 1.9123363133201758e-05, "loss": 1.9267, "step": 217000 }, { "epoch": 0.0438322296162458, "eval_calculated_loss": 8.960296630859375, "eval_loss": 2.1433815956115723, "eval_perplexity": 7787.667182004057, "eval_runtime": 120.1393, "eval_samples_per_second": 8.307, "eval_steps_per_second": 2.081, "step": 217000 }, { "epoch": 0.04385242880040075, "grad_norm": 9.438272476196289, "learning_rate": 1.9122959149355457e-05, "loss": 1.9066, "step": 217100 }, { "epoch": 0.043872627984555705, "grad_norm": 6.064783573150635, "learning_rate": 1.9122555165509153e-05, "loss": 1.8743, "step": 217200 }, { "epoch": 0.04389282716871066, "grad_norm": 8.56527328491211, "learning_rate": 1.9122151181662852e-05, "loss": 1.9791, "step": 217300 }, { "epoch": 0.043913026352865606, "grad_norm": 13.001959800720215, "learning_rate": 1.9121747197816548e-05, "loss": 1.8712, "step": 217400 }, { "epoch": 0.04393322553702056, "grad_norm": 8.268044471740723, "learning_rate": 1.9121343213970247e-05, "loss": 1.8099, "step": 217500 }, { "epoch": 0.04395342472117551, "grad_norm": 6.83350944519043, "learning_rate": 1.9120939230123946e-05, "loss": 1.8156, "step": 217600 }, { "epoch": 0.04397362390533047, "grad_norm": 7.982903957366943, "learning_rate": 1.9120535246277645e-05, "loss": 1.8795, "step": 217700 }, { "epoch": 0.043993823089485413, "grad_norm": 11.473187446594238, "learning_rate": 1.9120131262431344e-05, "loss": 1.8829, "step": 217800 }, { "epoch": 0.04401402227364037, "grad_norm": 6.4735517501831055, "learning_rate": 1.911972727858504e-05, "loss": 1.9174, "step": 217900 }, { "epoch": 0.04403422145779532, "grad_norm": 10.567347526550293, "learning_rate": 1.911932329473874e-05, "loss": 1.8583, "step": 218000 }, { "epoch": 0.04403422145779532, "eval_calculated_loss": 8.671298027038574, "eval_loss": 2.1495301723480225, "eval_perplexity": 5833.065914934463, "eval_runtime": 119.1158, "eval_samples_per_second": 8.378, "eval_steps_per_second": 2.099, "step": 218000 }, { "epoch": 0.044054420641950275, "grad_norm": 11.370668411254883, "learning_rate": 1.9118919310892438e-05, "loss": 1.8639, "step": 218100 }, { "epoch": 0.04407461982610522, "grad_norm": 5.206957817077637, "learning_rate": 1.9118515327046134e-05, "loss": 1.7997, "step": 218200 }, { "epoch": 0.044094819010260175, "grad_norm": 8.761788368225098, "learning_rate": 1.9118111343199833e-05, "loss": 1.9269, "step": 218300 }, { "epoch": 0.04411501819441513, "grad_norm": 5.018133640289307, "learning_rate": 1.911770735935353e-05, "loss": 1.9438, "step": 218400 }, { "epoch": 0.04413521737857008, "grad_norm": 5.780869007110596, "learning_rate": 1.9117303375507228e-05, "loss": 1.9539, "step": 218500 }, { "epoch": 0.04415541656272503, "grad_norm": 8.560293197631836, "learning_rate": 1.9116899391660927e-05, "loss": 1.8614, "step": 218600 }, { "epoch": 0.04417561574687998, "grad_norm": 9.864809036254883, "learning_rate": 1.9116495407814623e-05, "loss": 1.9109, "step": 218700 }, { "epoch": 0.04419581493103494, "grad_norm": 7.794581413269043, "learning_rate": 1.9116091423968325e-05, "loss": 1.7974, "step": 218800 }, { "epoch": 0.04421601411518989, "grad_norm": 9.03325080871582, "learning_rate": 1.911568744012202e-05, "loss": 1.8626, "step": 218900 }, { "epoch": 0.04423621329934484, "grad_norm": 5.859469413757324, "learning_rate": 1.911528345627572e-05, "loss": 1.808, "step": 219000 }, { "epoch": 0.04423621329934484, "eval_calculated_loss": 8.764714241027832, "eval_loss": 2.1472702026367188, "eval_perplexity": 6404.2316170879085, "eval_runtime": 129.9255, "eval_samples_per_second": 7.681, "eval_steps_per_second": 1.924, "step": 219000 }, { "epoch": 0.04425641248349979, "grad_norm": 3.509976625442505, "learning_rate": 1.9114879472429416e-05, "loss": 1.86, "step": 219100 }, { "epoch": 0.044276611667654744, "grad_norm": 5.703763961791992, "learning_rate": 1.9114475488583115e-05, "loss": 1.8262, "step": 219200 }, { "epoch": 0.0442968108518097, "grad_norm": 12.3679780960083, "learning_rate": 1.9114071504736814e-05, "loss": 1.7718, "step": 219300 }, { "epoch": 0.044317010035964645, "grad_norm": 9.06519603729248, "learning_rate": 1.911366752089051e-05, "loss": 1.8615, "step": 219400 }, { "epoch": 0.0443372092201196, "grad_norm": 10.217348098754883, "learning_rate": 1.911326353704421e-05, "loss": 1.8613, "step": 219500 }, { "epoch": 0.04435740840427455, "grad_norm": 4.709558963775635, "learning_rate": 1.9112859553197908e-05, "loss": 1.8341, "step": 219600 }, { "epoch": 0.044377607588429506, "grad_norm": 8.504166603088379, "learning_rate": 1.9112455569351604e-05, "loss": 1.8287, "step": 219700 }, { "epoch": 0.04439780677258445, "grad_norm": 5.626651287078857, "learning_rate": 1.9112051585505303e-05, "loss": 1.9358, "step": 219800 }, { "epoch": 0.044418005956739406, "grad_norm": 5.211250305175781, "learning_rate": 1.9111647601659002e-05, "loss": 1.8556, "step": 219900 }, { "epoch": 0.04443820514089436, "grad_norm": 9.52608585357666, "learning_rate": 1.91112436178127e-05, "loss": 1.9437, "step": 220000 }, { "epoch": 0.04443820514089436, "eval_calculated_loss": 8.753735542297363, "eval_loss": 2.148793935775757, "eval_perplexity": 6334.306035862093, "eval_runtime": 129.5779, "eval_samples_per_second": 7.702, "eval_steps_per_second": 1.929, "step": 220000 }, { "epoch": 0.044458404325049314, "grad_norm": 8.711750030517578, "learning_rate": 1.9110839633966397e-05, "loss": 1.9107, "step": 220100 }, { "epoch": 0.04447860350920426, "grad_norm": 11.07898998260498, "learning_rate": 1.9110435650120096e-05, "loss": 1.9192, "step": 220200 }, { "epoch": 0.044498802693359214, "grad_norm": 8.849239349365234, "learning_rate": 1.9110031666273795e-05, "loss": 1.8794, "step": 220300 }, { "epoch": 0.04451900187751417, "grad_norm": 9.936403274536133, "learning_rate": 1.910962768242749e-05, "loss": 1.8586, "step": 220400 }, { "epoch": 0.04453920106166912, "grad_norm": 5.07237434387207, "learning_rate": 1.910922369858119e-05, "loss": 1.9231, "step": 220500 }, { "epoch": 0.04455940024582407, "grad_norm": 10.711737632751465, "learning_rate": 1.9108819714734886e-05, "loss": 1.8472, "step": 220600 }, { "epoch": 0.04457959942997902, "grad_norm": 9.187674522399902, "learning_rate": 1.9108415730888585e-05, "loss": 1.8843, "step": 220700 }, { "epoch": 0.044599798614133976, "grad_norm": 9.896595001220703, "learning_rate": 1.9108011747042284e-05, "loss": 1.8052, "step": 220800 }, { "epoch": 0.04461999779828893, "grad_norm": 8.485130310058594, "learning_rate": 1.9107607763195983e-05, "loss": 1.8095, "step": 220900 }, { "epoch": 0.044640196982443876, "grad_norm": 7.270910739898682, "learning_rate": 1.9107203779349682e-05, "loss": 1.8554, "step": 221000 }, { "epoch": 0.044640196982443876, "eval_calculated_loss": 8.670506477355957, "eval_loss": 2.152801513671875, "eval_perplexity": 5828.4505803351085, "eval_runtime": 127.183, "eval_samples_per_second": 7.847, "eval_steps_per_second": 1.966, "step": 221000 }, { "epoch": 0.04466039616659883, "grad_norm": 7.343337059020996, "learning_rate": 1.9106799795503378e-05, "loss": 1.8787, "step": 221100 }, { "epoch": 0.044680595350753784, "grad_norm": 12.791946411132812, "learning_rate": 1.9106395811657077e-05, "loss": 1.7584, "step": 221200 }, { "epoch": 0.04470079453490874, "grad_norm": 8.440247535705566, "learning_rate": 1.9105991827810776e-05, "loss": 1.9265, "step": 221300 }, { "epoch": 0.044720993719063684, "grad_norm": 9.085601806640625, "learning_rate": 1.9105587843964472e-05, "loss": 1.8835, "step": 221400 }, { "epoch": 0.04474119290321864, "grad_norm": 6.537060260772705, "learning_rate": 1.910518386011817e-05, "loss": 1.8462, "step": 221500 }, { "epoch": 0.04476139208737359, "grad_norm": 10.06937026977539, "learning_rate": 1.9104779876271867e-05, "loss": 1.9066, "step": 221600 }, { "epoch": 0.044781591271528545, "grad_norm": 7.666464328765869, "learning_rate": 1.9104375892425566e-05, "loss": 1.8703, "step": 221700 }, { "epoch": 0.04480179045568349, "grad_norm": 9.1937894821167, "learning_rate": 1.9103971908579265e-05, "loss": 1.9092, "step": 221800 }, { "epoch": 0.044821989639838446, "grad_norm": 8.469026565551758, "learning_rate": 1.9103567924732964e-05, "loss": 1.9098, "step": 221900 }, { "epoch": 0.0448421888239934, "grad_norm": 13.712198257446289, "learning_rate": 1.9103163940886663e-05, "loss": 1.9557, "step": 222000 }, { "epoch": 0.0448421888239934, "eval_calculated_loss": 8.77527904510498, "eval_loss": 2.160231351852417, "eval_perplexity": 6472.249735783144, "eval_runtime": 127.763, "eval_samples_per_second": 7.811, "eval_steps_per_second": 1.957, "step": 222000 }, { "epoch": 0.04486238800814835, "grad_norm": 7.901589393615723, "learning_rate": 1.910275995704036e-05, "loss": 1.9105, "step": 222100 }, { "epoch": 0.0448825871923033, "grad_norm": 4.156330585479736, "learning_rate": 1.910235597319406e-05, "loss": 1.763, "step": 222200 }, { "epoch": 0.044902786376458254, "grad_norm": 6.266919136047363, "learning_rate": 1.9101951989347754e-05, "loss": 1.8694, "step": 222300 }, { "epoch": 0.04492298556061321, "grad_norm": 9.511520385742188, "learning_rate": 1.9101548005501453e-05, "loss": 1.8984, "step": 222400 }, { "epoch": 0.04494318474476816, "grad_norm": 11.175813674926758, "learning_rate": 1.9101144021655152e-05, "loss": 1.8735, "step": 222500 }, { "epoch": 0.04496338392892311, "grad_norm": 9.45095157623291, "learning_rate": 1.9100740037808848e-05, "loss": 1.9093, "step": 222600 }, { "epoch": 0.04498358311307806, "grad_norm": 7.500711917877197, "learning_rate": 1.9100336053962547e-05, "loss": 1.9965, "step": 222700 }, { "epoch": 0.045003782297233015, "grad_norm": 6.93779182434082, "learning_rate": 1.9099932070116246e-05, "loss": 1.8686, "step": 222800 }, { "epoch": 0.04502398148138797, "grad_norm": 10.924238204956055, "learning_rate": 1.9099528086269945e-05, "loss": 1.7783, "step": 222900 }, { "epoch": 0.045044180665542916, "grad_norm": 6.130155563354492, "learning_rate": 1.9099124102423645e-05, "loss": 1.7605, "step": 223000 }, { "epoch": 0.045044180665542916, "eval_calculated_loss": 8.716991424560547, "eval_loss": 2.159419536590576, "eval_perplexity": 6105.781722130111, "eval_runtime": 130.4203, "eval_samples_per_second": 7.652, "eval_steps_per_second": 1.917, "step": 223000 }, { "epoch": 0.04506437984969787, "grad_norm": 4.628193378448486, "learning_rate": 1.909872011857734e-05, "loss": 1.7631, "step": 223100 }, { "epoch": 0.04508457903385282, "grad_norm": 6.4933648109436035, "learning_rate": 1.909831613473104e-05, "loss": 1.9018, "step": 223200 }, { "epoch": 0.04510477821800778, "grad_norm": 8.80773639678955, "learning_rate": 1.9097912150884735e-05, "loss": 1.9146, "step": 223300 }, { "epoch": 0.04512497740216272, "grad_norm": 4.847875595092773, "learning_rate": 1.9097508167038434e-05, "loss": 1.7753, "step": 223400 }, { "epoch": 0.04514517658631768, "grad_norm": 8.585896492004395, "learning_rate": 1.9097104183192133e-05, "loss": 1.8388, "step": 223500 }, { "epoch": 0.04516537577047263, "grad_norm": 10.853940963745117, "learning_rate": 1.909670019934583e-05, "loss": 1.8045, "step": 223600 }, { "epoch": 0.045185574954627584, "grad_norm": 11.127128601074219, "learning_rate": 1.909629621549953e-05, "loss": 1.9149, "step": 223700 }, { "epoch": 0.04520577413878253, "grad_norm": 7.970984935760498, "learning_rate": 1.9095892231653227e-05, "loss": 1.7932, "step": 223800 }, { "epoch": 0.045225973322937485, "grad_norm": 6.629143238067627, "learning_rate": 1.9095488247806923e-05, "loss": 1.7822, "step": 223900 }, { "epoch": 0.04524617250709244, "grad_norm": 16.219600677490234, "learning_rate": 1.9095084263960622e-05, "loss": 1.8582, "step": 224000 }, { "epoch": 0.04524617250709244, "eval_calculated_loss": 8.894572257995605, "eval_loss": 2.1659181118011475, "eval_perplexity": 7292.285284641346, "eval_runtime": 132.1126, "eval_samples_per_second": 7.554, "eval_steps_per_second": 1.892, "step": 224000 }, { "epoch": 0.04526637169124739, "grad_norm": 8.092475891113281, "learning_rate": 1.909468028011432e-05, "loss": 1.9007, "step": 224100 }, { "epoch": 0.04528657087540234, "grad_norm": 9.335886001586914, "learning_rate": 1.909427629626802e-05, "loss": 1.8179, "step": 224200 }, { "epoch": 0.04530677005955729, "grad_norm": 4.408821105957031, "learning_rate": 1.9093872312421716e-05, "loss": 1.8238, "step": 224300 }, { "epoch": 0.045326969243712247, "grad_norm": 7.410131454467773, "learning_rate": 1.9093468328575415e-05, "loss": 1.908, "step": 224400 }, { "epoch": 0.0453471684278672, "grad_norm": 8.043240547180176, "learning_rate": 1.9093064344729115e-05, "loss": 1.8231, "step": 224500 }, { "epoch": 0.04536736761202215, "grad_norm": 7.644871234893799, "learning_rate": 1.909266036088281e-05, "loss": 1.7536, "step": 224600 }, { "epoch": 0.0453875667961771, "grad_norm": 7.3484272956848145, "learning_rate": 1.909225637703651e-05, "loss": 1.8341, "step": 224700 }, { "epoch": 0.045407765980332054, "grad_norm": 4.709966659545898, "learning_rate": 1.9091852393190205e-05, "loss": 1.8596, "step": 224800 }, { "epoch": 0.04542796516448701, "grad_norm": 6.598545074462891, "learning_rate": 1.9091448409343904e-05, "loss": 1.8272, "step": 224900 }, { "epoch": 0.04544816434864196, "grad_norm": 6.585177898406982, "learning_rate": 1.9091044425497603e-05, "loss": 1.9486, "step": 225000 }, { "epoch": 0.04544816434864196, "eval_calculated_loss": 8.910396575927734, "eval_loss": 2.165977954864502, "eval_perplexity": 7408.59858763203, "eval_runtime": 121.0626, "eval_samples_per_second": 8.244, "eval_steps_per_second": 2.065, "step": 225000 }, { "epoch": 0.04546836353279691, "grad_norm": 8.598965644836426, "learning_rate": 1.9090640441651303e-05, "loss": 1.7652, "step": 225100 }, { "epoch": 0.04548856271695186, "grad_norm": 8.25394058227539, "learning_rate": 1.9090236457805e-05, "loss": 1.8595, "step": 225200 }, { "epoch": 0.045508761901106816, "grad_norm": 6.348060607910156, "learning_rate": 1.9089832473958697e-05, "loss": 1.9006, "step": 225300 }, { "epoch": 0.04552896108526177, "grad_norm": 6.5691609382629395, "learning_rate": 1.9089428490112397e-05, "loss": 1.8666, "step": 225400 }, { "epoch": 0.045549160269416716, "grad_norm": 10.12369441986084, "learning_rate": 1.9089024506266092e-05, "loss": 1.8591, "step": 225500 }, { "epoch": 0.04556935945357167, "grad_norm": 8.177905082702637, "learning_rate": 1.908862052241979e-05, "loss": 1.8209, "step": 225600 }, { "epoch": 0.045589558637726624, "grad_norm": 8.574554443359375, "learning_rate": 1.908821653857349e-05, "loss": 1.8953, "step": 225700 }, { "epoch": 0.04560975782188158, "grad_norm": 4.579465389251709, "learning_rate": 1.9087812554727186e-05, "loss": 1.9201, "step": 225800 }, { "epoch": 0.045629957006036524, "grad_norm": 6.99273157119751, "learning_rate": 1.9087408570880885e-05, "loss": 1.855, "step": 225900 }, { "epoch": 0.04565015619019148, "grad_norm": 10.352133750915527, "learning_rate": 1.9087004587034585e-05, "loss": 1.8726, "step": 226000 }, { "epoch": 0.04565015619019148, "eval_calculated_loss": 8.909202575683594, "eval_loss": 2.1657187938690186, "eval_perplexity": 7399.757997993041, "eval_runtime": 125.795, "eval_samples_per_second": 7.934, "eval_steps_per_second": 1.987, "step": 226000 }, { "epoch": 0.04567035537434643, "grad_norm": 8.182644844055176, "learning_rate": 1.9086600603188284e-05, "loss": 1.9066, "step": 226100 }, { "epoch": 0.045690554558501385, "grad_norm": 8.5428466796875, "learning_rate": 1.9086196619341983e-05, "loss": 1.8834, "step": 226200 }, { "epoch": 0.04571075374265633, "grad_norm": 7.543524265289307, "learning_rate": 1.908579263549568e-05, "loss": 1.7495, "step": 226300 }, { "epoch": 0.045730952926811286, "grad_norm": 9.38264274597168, "learning_rate": 1.9085388651649378e-05, "loss": 1.8215, "step": 226400 }, { "epoch": 0.04575115211096624, "grad_norm": 7.848899841308594, "learning_rate": 1.9084984667803073e-05, "loss": 1.869, "step": 226500 }, { "epoch": 0.04577135129512119, "grad_norm": 9.68488597869873, "learning_rate": 1.9084580683956773e-05, "loss": 1.8457, "step": 226600 }, { "epoch": 0.04579155047927614, "grad_norm": 9.429545402526855, "learning_rate": 1.908417670011047e-05, "loss": 1.808, "step": 226700 }, { "epoch": 0.045811749663431094, "grad_norm": 7.146281719207764, "learning_rate": 1.9083772716264167e-05, "loss": 1.8841, "step": 226800 }, { "epoch": 0.04583194884758605, "grad_norm": 8.276765823364258, "learning_rate": 1.9083368732417867e-05, "loss": 1.9255, "step": 226900 }, { "epoch": 0.045852148031741, "grad_norm": 9.56286907196045, "learning_rate": 1.9082964748571566e-05, "loss": 1.8407, "step": 227000 }, { "epoch": 0.045852148031741, "eval_calculated_loss": 8.957907676696777, "eval_loss": 2.154520273208618, "eval_perplexity": 7769.085006892506, "eval_runtime": 127.2364, "eval_samples_per_second": 7.844, "eval_steps_per_second": 1.965, "step": 227000 }, { "epoch": 0.04587234721589595, "grad_norm": 6.017195701599121, "learning_rate": 1.9082560764725265e-05, "loss": 1.8247, "step": 227100 }, { "epoch": 0.0458925464000509, "grad_norm": 7.732224464416504, "learning_rate": 1.908215678087896e-05, "loss": 1.8678, "step": 227200 }, { "epoch": 0.045912745584205855, "grad_norm": 9.369988441467285, "learning_rate": 1.908175279703266e-05, "loss": 1.8131, "step": 227300 }, { "epoch": 0.04593294476836081, "grad_norm": 7.405807018280029, "learning_rate": 1.908134881318636e-05, "loss": 1.9138, "step": 227400 }, { "epoch": 0.045953143952515756, "grad_norm": 8.971450805664062, "learning_rate": 1.9080944829340055e-05, "loss": 1.8143, "step": 227500 }, { "epoch": 0.04597334313667071, "grad_norm": 7.249116897583008, "learning_rate": 1.9080540845493754e-05, "loss": 1.9111, "step": 227600 }, { "epoch": 0.04599354232082566, "grad_norm": 11.111312866210938, "learning_rate": 1.9080136861647453e-05, "loss": 1.9372, "step": 227700 }, { "epoch": 0.04601374150498062, "grad_norm": 8.709527969360352, "learning_rate": 1.907973287780115e-05, "loss": 1.8426, "step": 227800 }, { "epoch": 0.046033940689135563, "grad_norm": 5.897391319274902, "learning_rate": 1.9079328893954848e-05, "loss": 1.7849, "step": 227900 }, { "epoch": 0.04605413987329052, "grad_norm": 6.349254608154297, "learning_rate": 1.9078924910108543e-05, "loss": 1.8309, "step": 228000 }, { "epoch": 0.04605413987329052, "eval_calculated_loss": 8.90362548828125, "eval_loss": 2.1545052528381348, "eval_perplexity": 7358.60376792246, "eval_runtime": 127.7034, "eval_samples_per_second": 7.815, "eval_steps_per_second": 1.958, "step": 228000 }, { "epoch": 0.04607433905744547, "grad_norm": 8.246236801147461, "learning_rate": 1.9078520926262246e-05, "loss": 1.7974, "step": 228100 }, { "epoch": 0.046094538241600425, "grad_norm": 9.52185344696045, "learning_rate": 1.907811694241594e-05, "loss": 1.8557, "step": 228200 }, { "epoch": 0.04611473742575537, "grad_norm": 7.179449081420898, "learning_rate": 1.907771295856964e-05, "loss": 1.8676, "step": 228300 }, { "epoch": 0.046134936609910325, "grad_norm": 8.750191688537598, "learning_rate": 1.907730897472334e-05, "loss": 1.8349, "step": 228400 }, { "epoch": 0.04615513579406528, "grad_norm": 7.512239933013916, "learning_rate": 1.9076904990877036e-05, "loss": 1.9234, "step": 228500 }, { "epoch": 0.04617533497822023, "grad_norm": 8.653946876525879, "learning_rate": 1.9076501007030735e-05, "loss": 1.8633, "step": 228600 }, { "epoch": 0.04619553416237518, "grad_norm": 8.368672370910645, "learning_rate": 1.9076097023184434e-05, "loss": 1.8926, "step": 228700 }, { "epoch": 0.04621573334653013, "grad_norm": 7.641295433044434, "learning_rate": 1.907569303933813e-05, "loss": 1.8992, "step": 228800 }, { "epoch": 0.04623593253068509, "grad_norm": 7.871522903442383, "learning_rate": 1.907528905549183e-05, "loss": 1.9388, "step": 228900 }, { "epoch": 0.04625613171484004, "grad_norm": 7.970442295074463, "learning_rate": 1.9074885071645525e-05, "loss": 1.8626, "step": 229000 }, { "epoch": 0.04625613171484004, "eval_calculated_loss": 8.941476821899414, "eval_loss": 2.1469407081604004, "eval_perplexity": 7642.475300507252, "eval_runtime": 129.2984, "eval_samples_per_second": 7.719, "eval_steps_per_second": 1.934, "step": 229000 }, { "epoch": 0.04627633089899499, "grad_norm": 6.413226127624512, "learning_rate": 1.9074481087799227e-05, "loss": 1.9339, "step": 229100 }, { "epoch": 0.04629653008314994, "grad_norm": 7.9401984214782715, "learning_rate": 1.9074077103952923e-05, "loss": 1.8507, "step": 229200 }, { "epoch": 0.046316729267304894, "grad_norm": 16.151063919067383, "learning_rate": 1.9073673120106622e-05, "loss": 1.9212, "step": 229300 }, { "epoch": 0.04633692845145985, "grad_norm": 6.922092914581299, "learning_rate": 1.907326913626032e-05, "loss": 1.9004, "step": 229400 }, { "epoch": 0.046357127635614795, "grad_norm": 4.677104473114014, "learning_rate": 1.9072865152414017e-05, "loss": 1.8243, "step": 229500 }, { "epoch": 0.04637732681976975, "grad_norm": 6.403905391693115, "learning_rate": 1.9072461168567716e-05, "loss": 1.8783, "step": 229600 }, { "epoch": 0.0463975260039247, "grad_norm": 6.3270673751831055, "learning_rate": 1.907205718472141e-05, "loss": 1.878, "step": 229700 }, { "epoch": 0.046417725188079656, "grad_norm": 11.396222114562988, "learning_rate": 1.907165320087511e-05, "loss": 1.8233, "step": 229800 }, { "epoch": 0.0464379243722346, "grad_norm": 9.902535438537598, "learning_rate": 1.907124921702881e-05, "loss": 1.8215, "step": 229900 }, { "epoch": 0.046458123556389556, "grad_norm": 11.488368034362793, "learning_rate": 1.9070845233182506e-05, "loss": 1.958, "step": 230000 }, { "epoch": 0.046458123556389556, "eval_calculated_loss": 8.81992244720459, "eval_loss": 2.157660961151123, "eval_perplexity": 6767.739747780527, "eval_runtime": 129.2473, "eval_samples_per_second": 7.722, "eval_steps_per_second": 1.934, "step": 230000 }, { "epoch": 0.04647832274054451, "grad_norm": 6.85070276260376, "learning_rate": 1.9070441249336205e-05, "loss": 1.8876, "step": 230100 }, { "epoch": 0.046498521924699464, "grad_norm": 10.95838451385498, "learning_rate": 1.9070037265489904e-05, "loss": 1.7882, "step": 230200 }, { "epoch": 0.04651872110885441, "grad_norm": 5.6098432540893555, "learning_rate": 1.9069633281643603e-05, "loss": 1.8303, "step": 230300 }, { "epoch": 0.046538920293009364, "grad_norm": 11.193291664123535, "learning_rate": 1.90692292977973e-05, "loss": 1.797, "step": 230400 }, { "epoch": 0.04655911947716432, "grad_norm": 4.442704200744629, "learning_rate": 1.9068825313950998e-05, "loss": 1.8954, "step": 230500 }, { "epoch": 0.04657931866131927, "grad_norm": 7.222259998321533, "learning_rate": 1.9068421330104697e-05, "loss": 1.7986, "step": 230600 }, { "epoch": 0.04659951784547422, "grad_norm": 9.019906997680664, "learning_rate": 1.9068017346258393e-05, "loss": 1.8512, "step": 230700 }, { "epoch": 0.04661971702962917, "grad_norm": 10.187920570373535, "learning_rate": 1.9067613362412092e-05, "loss": 1.8729, "step": 230800 }, { "epoch": 0.046639916213784126, "grad_norm": 10.076736450195312, "learning_rate": 1.906720937856579e-05, "loss": 1.8077, "step": 230900 }, { "epoch": 0.04666011539793908, "grad_norm": 9.514508247375488, "learning_rate": 1.9066805394719487e-05, "loss": 1.8287, "step": 231000 }, { "epoch": 0.04666011539793908, "eval_calculated_loss": 8.911035537719727, "eval_loss": 2.1394968032836914, "eval_perplexity": 7413.3339117462165, "eval_runtime": 121.5848, "eval_samples_per_second": 8.208, "eval_steps_per_second": 2.056, "step": 231000 }, { "epoch": 0.046680314582094026, "grad_norm": 7.226778507232666, "learning_rate": 1.9066401410873186e-05, "loss": 1.9307, "step": 231100 }, { "epoch": 0.04670051376624898, "grad_norm": 6.908090114593506, "learning_rate": 1.9065997427026885e-05, "loss": 1.887, "step": 231200 }, { "epoch": 0.046720712950403934, "grad_norm": 5.782750129699707, "learning_rate": 1.9065593443180584e-05, "loss": 1.7883, "step": 231300 }, { "epoch": 0.04674091213455889, "grad_norm": 6.0197882652282715, "learning_rate": 1.906518945933428e-05, "loss": 1.8935, "step": 231400 }, { "epoch": 0.046761111318713834, "grad_norm": 7.9698686599731445, "learning_rate": 1.906478547548798e-05, "loss": 1.8483, "step": 231500 }, { "epoch": 0.04678131050286879, "grad_norm": 9.161744117736816, "learning_rate": 1.9064381491641678e-05, "loss": 1.9384, "step": 231600 }, { "epoch": 0.04680150968702374, "grad_norm": 6.881001949310303, "learning_rate": 1.9063977507795374e-05, "loss": 1.86, "step": 231700 }, { "epoch": 0.046821708871178695, "grad_norm": 7.99090576171875, "learning_rate": 1.9063573523949073e-05, "loss": 1.8688, "step": 231800 }, { "epoch": 0.04684190805533364, "grad_norm": 7.307586669921875, "learning_rate": 1.9063169540102772e-05, "loss": 1.8467, "step": 231900 }, { "epoch": 0.046862107239488596, "grad_norm": 12.214798927307129, "learning_rate": 1.9062765556256468e-05, "loss": 1.7419, "step": 232000 }, { "epoch": 0.046862107239488596, "eval_calculated_loss": 9.056154251098633, "eval_loss": 2.1468122005462646, "eval_perplexity": 8571.124796380585, "eval_runtime": 121.5703, "eval_samples_per_second": 8.209, "eval_steps_per_second": 2.056, "step": 232000 }, { "epoch": 0.04688230642364355, "grad_norm": 10.02348804473877, "learning_rate": 1.9062361572410167e-05, "loss": 1.9084, "step": 232100 }, { "epoch": 0.0469025056077985, "grad_norm": 9.455307006835938, "learning_rate": 1.9061957588563863e-05, "loss": 1.8832, "step": 232200 }, { "epoch": 0.04692270479195345, "grad_norm": 8.946393966674805, "learning_rate": 1.9061553604717565e-05, "loss": 1.891, "step": 232300 }, { "epoch": 0.046942903976108404, "grad_norm": 7.6481428146362305, "learning_rate": 1.906114962087126e-05, "loss": 1.7628, "step": 232400 }, { "epoch": 0.04696310316026336, "grad_norm": 4.818081378936768, "learning_rate": 1.906074563702496e-05, "loss": 1.9579, "step": 232500 }, { "epoch": 0.04698330234441831, "grad_norm": 10.835078239440918, "learning_rate": 1.906034165317866e-05, "loss": 1.8807, "step": 232600 }, { "epoch": 0.04700350152857326, "grad_norm": 9.59549331665039, "learning_rate": 1.9059937669332355e-05, "loss": 1.8688, "step": 232700 }, { "epoch": 0.04702370071272821, "grad_norm": 6.588180065155029, "learning_rate": 1.9059533685486054e-05, "loss": 1.7968, "step": 232800 }, { "epoch": 0.047043899896883165, "grad_norm": 11.766359329223633, "learning_rate": 1.905912970163975e-05, "loss": 1.9079, "step": 232900 }, { "epoch": 0.04706409908103812, "grad_norm": 11.84742259979248, "learning_rate": 1.905872571779345e-05, "loss": 1.9047, "step": 233000 }, { "epoch": 0.04706409908103812, "eval_calculated_loss": 8.738611221313477, "eval_loss": 2.147418260574341, "eval_perplexity": 6239.224790245066, "eval_runtime": 119.8613, "eval_samples_per_second": 8.326, "eval_steps_per_second": 2.086, "step": 233000 }, { "epoch": 0.047084298265193066, "grad_norm": 5.345065116882324, "learning_rate": 1.9058321733947148e-05, "loss": 1.835, "step": 233100 }, { "epoch": 0.04710449744934802, "grad_norm": 10.255525588989258, "learning_rate": 1.9057917750100844e-05, "loss": 1.905, "step": 233200 }, { "epoch": 0.04712469663350297, "grad_norm": 10.27133846282959, "learning_rate": 1.9057513766254546e-05, "loss": 1.9006, "step": 233300 }, { "epoch": 0.04714489581765793, "grad_norm": 6.736246585845947, "learning_rate": 1.9057109782408242e-05, "loss": 1.8915, "step": 233400 }, { "epoch": 0.04716509500181287, "grad_norm": 12.231513977050781, "learning_rate": 1.905670579856194e-05, "loss": 1.8902, "step": 233500 }, { "epoch": 0.04718529418596783, "grad_norm": 5.27718448638916, "learning_rate": 1.905630181471564e-05, "loss": 1.8183, "step": 233600 }, { "epoch": 0.04720549337012278, "grad_norm": 7.040873050689697, "learning_rate": 1.9055897830869336e-05, "loss": 1.8617, "step": 233700 }, { "epoch": 0.047225692554277734, "grad_norm": 9.407166481018066, "learning_rate": 1.9055493847023035e-05, "loss": 1.7923, "step": 233800 }, { "epoch": 0.04724589173843268, "grad_norm": 10.687407493591309, "learning_rate": 1.905508986317673e-05, "loss": 1.8861, "step": 233900 }, { "epoch": 0.047266090922587635, "grad_norm": 8.731328010559082, "learning_rate": 1.905468587933043e-05, "loss": 1.9401, "step": 234000 }, { "epoch": 0.047266090922587635, "eval_calculated_loss": 8.90762996673584, "eval_loss": 2.1536240577697754, "eval_perplexity": 7388.130217726216, "eval_runtime": 121.7756, "eval_samples_per_second": 8.195, "eval_steps_per_second": 2.053, "step": 234000 }, { "epoch": 0.04728629010674259, "grad_norm": 5.776238441467285, "learning_rate": 1.905428189548413e-05, "loss": 1.8221, "step": 234100 }, { "epoch": 0.04730648929089754, "grad_norm": 6.887906551361084, "learning_rate": 1.9053877911637825e-05, "loss": 1.7797, "step": 234200 }, { "epoch": 0.047326688475052496, "grad_norm": 5.318334102630615, "learning_rate": 1.9053473927791527e-05, "loss": 1.8107, "step": 234300 }, { "epoch": 0.04734688765920744, "grad_norm": 4.944169998168945, "learning_rate": 1.9053069943945223e-05, "loss": 1.7785, "step": 234400 }, { "epoch": 0.047367086843362396, "grad_norm": 5.60725736618042, "learning_rate": 1.9052665960098922e-05, "loss": 1.8163, "step": 234500 }, { "epoch": 0.04738728602751735, "grad_norm": 5.39339017868042, "learning_rate": 1.9052261976252618e-05, "loss": 1.8505, "step": 234600 }, { "epoch": 0.047407485211672304, "grad_norm": 6.493779182434082, "learning_rate": 1.9051857992406317e-05, "loss": 1.8068, "step": 234700 }, { "epoch": 0.04742768439582725, "grad_norm": 8.94505786895752, "learning_rate": 1.9051454008560016e-05, "loss": 1.9121, "step": 234800 }, { "epoch": 0.047447883579982204, "grad_norm": 8.024632453918457, "learning_rate": 1.9051050024713712e-05, "loss": 1.9679, "step": 234900 }, { "epoch": 0.04746808276413716, "grad_norm": 4.466652870178223, "learning_rate": 1.905064604086741e-05, "loss": 1.8697, "step": 235000 }, { "epoch": 0.04746808276413716, "eval_calculated_loss": 8.939727783203125, "eval_loss": 2.1641898155212402, "eval_perplexity": 7629.119998346012, "eval_runtime": 120.8032, "eval_samples_per_second": 8.261, "eval_steps_per_second": 2.069, "step": 235000 }, { "epoch": 0.04748828194829211, "grad_norm": 8.453546524047852, "learning_rate": 1.905024205702111e-05, "loss": 1.9163, "step": 235100 }, { "epoch": 0.04750848113244706, "grad_norm": 4.43382453918457, "learning_rate": 1.9049838073174806e-05, "loss": 1.8562, "step": 235200 }, { "epoch": 0.04752868031660201, "grad_norm": 4.801021099090576, "learning_rate": 1.9049434089328505e-05, "loss": 1.849, "step": 235300 }, { "epoch": 0.047548879500756966, "grad_norm": 5.523567199707031, "learning_rate": 1.9049030105482204e-05, "loss": 1.8572, "step": 235400 }, { "epoch": 0.04756907868491192, "grad_norm": 10.044441223144531, "learning_rate": 1.9048626121635903e-05, "loss": 1.866, "step": 235500 }, { "epoch": 0.047589277869066866, "grad_norm": 11.375443458557129, "learning_rate": 1.90482221377896e-05, "loss": 1.8715, "step": 235600 }, { "epoch": 0.04760947705322182, "grad_norm": 8.854352951049805, "learning_rate": 1.90478181539433e-05, "loss": 1.8233, "step": 235700 }, { "epoch": 0.047629676237376774, "grad_norm": 8.427626609802246, "learning_rate": 1.9047414170096997e-05, "loss": 1.8969, "step": 235800 }, { "epoch": 0.04764987542153173, "grad_norm": 8.468038558959961, "learning_rate": 1.9047010186250693e-05, "loss": 1.7865, "step": 235900 }, { "epoch": 0.047670074605686674, "grad_norm": 6.595878601074219, "learning_rate": 1.9046606202404392e-05, "loss": 1.7984, "step": 236000 }, { "epoch": 0.047670074605686674, "eval_calculated_loss": 8.897857666015625, "eval_loss": 2.169373035430908, "eval_perplexity": 7316.282816455932, "eval_runtime": 122.7203, "eval_samples_per_second": 8.132, "eval_steps_per_second": 2.037, "step": 236000 }, { "epoch": 0.04769027378984163, "grad_norm": 8.068780899047852, "learning_rate": 1.9046202218558088e-05, "loss": 1.8476, "step": 236100 }, { "epoch": 0.04771047297399658, "grad_norm": 5.790914535522461, "learning_rate": 1.9045798234711787e-05, "loss": 1.8409, "step": 236200 }, { "epoch": 0.047730672158151535, "grad_norm": 10.487488746643066, "learning_rate": 1.9045394250865486e-05, "loss": 1.7942, "step": 236300 }, { "epoch": 0.04775087134230648, "grad_norm": 6.2112579345703125, "learning_rate": 1.9044990267019185e-05, "loss": 1.9582, "step": 236400 }, { "epoch": 0.047771070526461436, "grad_norm": 7.959331512451172, "learning_rate": 1.9044586283172885e-05, "loss": 1.9187, "step": 236500 }, { "epoch": 0.04779126971061639, "grad_norm": 8.084004402160645, "learning_rate": 1.904418229932658e-05, "loss": 1.7883, "step": 236600 }, { "epoch": 0.04781146889477134, "grad_norm": 9.01605224609375, "learning_rate": 1.904377831548028e-05, "loss": 1.8607, "step": 236700 }, { "epoch": 0.04783166807892629, "grad_norm": 5.309074401855469, "learning_rate": 1.904337433163398e-05, "loss": 1.7499, "step": 236800 }, { "epoch": 0.047851867263081244, "grad_norm": 8.77957534790039, "learning_rate": 1.9042970347787674e-05, "loss": 1.9239, "step": 236900 }, { "epoch": 0.0478720664472362, "grad_norm": 7.1075615882873535, "learning_rate": 1.9042566363941373e-05, "loss": 1.871, "step": 237000 }, { "epoch": 0.0478720664472362, "eval_calculated_loss": 8.79887866973877, "eval_loss": 2.1656129360198975, "eval_perplexity": 6626.808997042736, "eval_runtime": 120.3274, "eval_samples_per_second": 8.294, "eval_steps_per_second": 2.078, "step": 237000 }, { "epoch": 0.04789226563139115, "grad_norm": 9.740921020507812, "learning_rate": 1.904216238009507e-05, "loss": 1.8272, "step": 237100 }, { "epoch": 0.0479124648155461, "grad_norm": 6.273146152496338, "learning_rate": 1.9041758396248768e-05, "loss": 1.8353, "step": 237200 }, { "epoch": 0.04793266399970105, "grad_norm": 10.195385932922363, "learning_rate": 1.9041354412402467e-05, "loss": 1.8267, "step": 237300 }, { "epoch": 0.047952863183856005, "grad_norm": 14.358573913574219, "learning_rate": 1.9040950428556167e-05, "loss": 1.8442, "step": 237400 }, { "epoch": 0.04797306236801096, "grad_norm": 8.426995277404785, "learning_rate": 1.9040546444709866e-05, "loss": 1.808, "step": 237500 }, { "epoch": 0.047993261552165906, "grad_norm": 9.985697746276855, "learning_rate": 1.904014246086356e-05, "loss": 1.842, "step": 237600 }, { "epoch": 0.04801346073632086, "grad_norm": 10.615689277648926, "learning_rate": 1.903973847701726e-05, "loss": 1.9842, "step": 237700 }, { "epoch": 0.04803365992047581, "grad_norm": 10.1068115234375, "learning_rate": 1.9039334493170956e-05, "loss": 1.9396, "step": 237800 }, { "epoch": 0.04805385910463077, "grad_norm": 6.73759126663208, "learning_rate": 1.9038930509324655e-05, "loss": 1.8104, "step": 237900 }, { "epoch": 0.04807405828878571, "grad_norm": 11.536015510559082, "learning_rate": 1.9038526525478355e-05, "loss": 1.8624, "step": 238000 }, { "epoch": 0.04807405828878571, "eval_calculated_loss": 8.686287879943848, "eval_loss": 2.165492534637451, "eval_perplexity": 5921.161334124834, "eval_runtime": 121.6067, "eval_samples_per_second": 8.207, "eval_steps_per_second": 2.056, "step": 238000 }, { "epoch": 0.04809425747294067, "grad_norm": 11.024856567382812, "learning_rate": 1.903812254163205e-05, "loss": 1.8675, "step": 238100 }, { "epoch": 0.04811445665709562, "grad_norm": 7.301053524017334, "learning_rate": 1.903771855778575e-05, "loss": 1.8748, "step": 238200 }, { "epoch": 0.048134655841250575, "grad_norm": 2.5337612628936768, "learning_rate": 1.903731457393945e-05, "loss": 1.8658, "step": 238300 }, { "epoch": 0.04815485502540552, "grad_norm": 7.563669681549072, "learning_rate": 1.9036910590093144e-05, "loss": 1.8952, "step": 238400 }, { "epoch": 0.048175054209560475, "grad_norm": 10.289495468139648, "learning_rate": 1.9036506606246843e-05, "loss": 1.8268, "step": 238500 }, { "epoch": 0.04819525339371543, "grad_norm": 6.070688724517822, "learning_rate": 1.9036102622400543e-05, "loss": 1.8935, "step": 238600 }, { "epoch": 0.04821545257787038, "grad_norm": 7.881333827972412, "learning_rate": 1.903569863855424e-05, "loss": 1.8821, "step": 238700 }, { "epoch": 0.04823565176202533, "grad_norm": 6.960472106933594, "learning_rate": 1.9035294654707937e-05, "loss": 1.9028, "step": 238800 }, { "epoch": 0.04825585094618028, "grad_norm": 9.410379409790039, "learning_rate": 1.9034890670861637e-05, "loss": 1.8831, "step": 238900 }, { "epoch": 0.04827605013033524, "grad_norm": 7.827146530151367, "learning_rate": 1.9034486687015336e-05, "loss": 1.9061, "step": 239000 }, { "epoch": 0.04827605013033524, "eval_calculated_loss": 8.710476875305176, "eval_loss": 2.1551074981689453, "eval_perplexity": 6066.134588178687, "eval_runtime": 122.6353, "eval_samples_per_second": 8.138, "eval_steps_per_second": 2.039, "step": 239000 }, { "epoch": 0.04829624931449019, "grad_norm": 7.884547710418701, "learning_rate": 1.903408270316903e-05, "loss": 1.825, "step": 239100 }, { "epoch": 0.04831644849864514, "grad_norm": 4.973730564117432, "learning_rate": 1.903367871932273e-05, "loss": 1.7624, "step": 239200 }, { "epoch": 0.04833664768280009, "grad_norm": 7.965257167816162, "learning_rate": 1.903327473547643e-05, "loss": 1.9476, "step": 239300 }, { "epoch": 0.048356846866955044, "grad_norm": 4.243851184844971, "learning_rate": 1.9032870751630125e-05, "loss": 1.8859, "step": 239400 }, { "epoch": 0.04837704605111, "grad_norm": 11.59663200378418, "learning_rate": 1.9032466767783825e-05, "loss": 1.9146, "step": 239500 }, { "epoch": 0.048397245235264945, "grad_norm": 6.331230163574219, "learning_rate": 1.9032062783937524e-05, "loss": 1.846, "step": 239600 }, { "epoch": 0.0484174444194199, "grad_norm": 14.218033790588379, "learning_rate": 1.9031658800091223e-05, "loss": 1.8298, "step": 239700 }, { "epoch": 0.04843764360357485, "grad_norm": 7.058395862579346, "learning_rate": 1.903125481624492e-05, "loss": 1.8527, "step": 239800 }, { "epoch": 0.048457842787729806, "grad_norm": 6.220451831817627, "learning_rate": 1.9030850832398618e-05, "loss": 1.8663, "step": 239900 }, { "epoch": 0.04847804197188475, "grad_norm": 10.222018241882324, "learning_rate": 1.9030446848552317e-05, "loss": 1.9111, "step": 240000 }, { "epoch": 0.04847804197188475, "eval_calculated_loss": 8.61007022857666, "eval_loss": 2.146693229675293, "eval_perplexity": 5486.633982765899, "eval_runtime": 119.0074, "eval_samples_per_second": 8.386, "eval_steps_per_second": 2.101, "step": 240000 }, { "epoch": 0.048498241156039706, "grad_norm": 8.31572151184082, "learning_rate": 1.9030042864706013e-05, "loss": 1.877, "step": 240100 }, { "epoch": 0.04851844034019466, "grad_norm": 6.876774311065674, "learning_rate": 1.902963888085971e-05, "loss": 1.9405, "step": 240200 }, { "epoch": 0.048538639524349614, "grad_norm": 5.816372871398926, "learning_rate": 1.9029234897013407e-05, "loss": 1.7898, "step": 240300 }, { "epoch": 0.04855883870850456, "grad_norm": 7.577704429626465, "learning_rate": 1.9028830913167107e-05, "loss": 1.7831, "step": 240400 }, { "epoch": 0.048579037892659514, "grad_norm": 10.742887496948242, "learning_rate": 1.9028426929320806e-05, "loss": 1.8626, "step": 240500 }, { "epoch": 0.04859923707681447, "grad_norm": 10.94094181060791, "learning_rate": 1.9028022945474505e-05, "loss": 1.8066, "step": 240600 }, { "epoch": 0.04861943626096942, "grad_norm": 7.871347427368164, "learning_rate": 1.9027618961628204e-05, "loss": 1.7777, "step": 240700 }, { "epoch": 0.04863963544512437, "grad_norm": 8.446471214294434, "learning_rate": 1.90272149777819e-05, "loss": 1.8476, "step": 240800 }, { "epoch": 0.04865983462927932, "grad_norm": 14.207817077636719, "learning_rate": 1.90268109939356e-05, "loss": 1.8756, "step": 240900 }, { "epoch": 0.048680033813434276, "grad_norm": 6.425985813140869, "learning_rate": 1.9026407010089295e-05, "loss": 1.8682, "step": 241000 }, { "epoch": 0.048680033813434276, "eval_calculated_loss": 8.756113052368164, "eval_loss": 2.1486783027648926, "eval_perplexity": 6349.3838289540145, "eval_runtime": 120.5477, "eval_samples_per_second": 8.279, "eval_steps_per_second": 2.074, "step": 241000 }, { "epoch": 0.04870023299758923, "grad_norm": 7.14463472366333, "learning_rate": 1.9026003026242994e-05, "loss": 1.8594, "step": 241100 }, { "epoch": 0.048720432181744176, "grad_norm": 5.377647876739502, "learning_rate": 1.9025599042396693e-05, "loss": 1.8513, "step": 241200 }, { "epoch": 0.04874063136589913, "grad_norm": 11.97607135772705, "learning_rate": 1.902519505855039e-05, "loss": 1.9264, "step": 241300 }, { "epoch": 0.048760830550054084, "grad_norm": 9.756447792053223, "learning_rate": 1.9024791074704088e-05, "loss": 1.9198, "step": 241400 }, { "epoch": 0.04878102973420904, "grad_norm": 6.493258953094482, "learning_rate": 1.9024387090857787e-05, "loss": 1.8664, "step": 241500 }, { "epoch": 0.048801228918363984, "grad_norm": 7.992952346801758, "learning_rate": 1.9023983107011486e-05, "loss": 1.8629, "step": 241600 }, { "epoch": 0.04882142810251894, "grad_norm": 4.7888593673706055, "learning_rate": 1.9023579123165185e-05, "loss": 1.9125, "step": 241700 }, { "epoch": 0.04884162728667389, "grad_norm": 10.00471305847168, "learning_rate": 1.902317513931888e-05, "loss": 1.812, "step": 241800 }, { "epoch": 0.048861826470828845, "grad_norm": 8.250213623046875, "learning_rate": 1.902277115547258e-05, "loss": 1.8758, "step": 241900 }, { "epoch": 0.04888202565498379, "grad_norm": 12.111825942993164, "learning_rate": 1.9022367171626276e-05, "loss": 1.7996, "step": 242000 }, { "epoch": 0.04888202565498379, "eval_calculated_loss": 8.59214973449707, "eval_loss": 2.151857376098633, "eval_perplexity": 5389.186551889972, "eval_runtime": 121.8077, "eval_samples_per_second": 8.193, "eval_steps_per_second": 2.052, "step": 242000 }, { "epoch": 0.048902224839138746, "grad_norm": 4.571228981018066, "learning_rate": 1.9021963187779975e-05, "loss": 1.7509, "step": 242100 }, { "epoch": 0.0489224240232937, "grad_norm": 11.529388427734375, "learning_rate": 1.9021559203933674e-05, "loss": 1.8992, "step": 242200 }, { "epoch": 0.04894262320744865, "grad_norm": 12.139969825744629, "learning_rate": 1.902115522008737e-05, "loss": 1.8769, "step": 242300 }, { "epoch": 0.0489628223916036, "grad_norm": 6.037910461425781, "learning_rate": 1.902075123624107e-05, "loss": 1.9428, "step": 242400 }, { "epoch": 0.048983021575758554, "grad_norm": 7.428520679473877, "learning_rate": 1.9020347252394768e-05, "loss": 1.8766, "step": 242500 }, { "epoch": 0.04900322075991351, "grad_norm": 25.53704071044922, "learning_rate": 1.9019943268548467e-05, "loss": 1.8688, "step": 242600 }, { "epoch": 0.04902341994406846, "grad_norm": 7.90401554107666, "learning_rate": 1.9019539284702163e-05, "loss": 1.8931, "step": 242700 }, { "epoch": 0.04904361912822341, "grad_norm": 9.313445091247559, "learning_rate": 1.9019135300855862e-05, "loss": 1.856, "step": 242800 }, { "epoch": 0.04906381831237836, "grad_norm": 5.614658355712891, "learning_rate": 1.901873131700956e-05, "loss": 1.8285, "step": 242900 }, { "epoch": 0.049084017496533315, "grad_norm": 8.129128456115723, "learning_rate": 1.9018327333163257e-05, "loss": 1.9467, "step": 243000 }, { "epoch": 0.049084017496533315, "eval_calculated_loss": 8.715229988098145, "eval_loss": 2.154472827911377, "eval_perplexity": 6095.03624209175, "eval_runtime": 121.1987, "eval_samples_per_second": 8.234, "eval_steps_per_second": 2.063, "step": 243000 }, { "epoch": 0.04910421668068827, "grad_norm": 6.827504634857178, "learning_rate": 1.9017923349316956e-05, "loss": 1.9116, "step": 243100 }, { "epoch": 0.049124415864843216, "grad_norm": 8.384531021118164, "learning_rate": 1.9017519365470655e-05, "loss": 1.9072, "step": 243200 }, { "epoch": 0.04914461504899817, "grad_norm": 8.26894474029541, "learning_rate": 1.901711538162435e-05, "loss": 1.8796, "step": 243300 }, { "epoch": 0.04916481423315312, "grad_norm": 9.337692260742188, "learning_rate": 1.901671139777805e-05, "loss": 1.8439, "step": 243400 }, { "epoch": 0.04918501341730808, "grad_norm": 8.641589164733887, "learning_rate": 1.9016307413931746e-05, "loss": 1.8478, "step": 243500 }, { "epoch": 0.04920521260146303, "grad_norm": 6.716281890869141, "learning_rate": 1.9015903430085448e-05, "loss": 1.908, "step": 243600 }, { "epoch": 0.04922541178561798, "grad_norm": 9.126908302307129, "learning_rate": 1.9015499446239144e-05, "loss": 1.8447, "step": 243700 }, { "epoch": 0.04924561096977293, "grad_norm": 11.408358573913574, "learning_rate": 1.9015095462392843e-05, "loss": 1.8739, "step": 243800 }, { "epoch": 0.049265810153927884, "grad_norm": 10.937552452087402, "learning_rate": 1.9014691478546542e-05, "loss": 1.813, "step": 243900 }, { "epoch": 0.04928600933808284, "grad_norm": 8.356169700622559, "learning_rate": 1.9014287494700238e-05, "loss": 1.8013, "step": 244000 }, { "epoch": 0.04928600933808284, "eval_calculated_loss": 8.640751838684082, "eval_loss": 2.156252145767212, "eval_perplexity": 5657.581814696991, "eval_runtime": 119.2611, "eval_samples_per_second": 8.368, "eval_steps_per_second": 2.096, "step": 244000 }, { "epoch": 0.049306208522237785, "grad_norm": 6.270507335662842, "learning_rate": 1.9013883510853937e-05, "loss": 1.8023, "step": 244100 }, { "epoch": 0.04932640770639274, "grad_norm": 6.44399881362915, "learning_rate": 1.9013479527007636e-05, "loss": 1.8647, "step": 244200 }, { "epoch": 0.04934660689054769, "grad_norm": 11.908512115478516, "learning_rate": 1.9013075543161332e-05, "loss": 1.843, "step": 244300 }, { "epoch": 0.049366806074702646, "grad_norm": 11.794540405273438, "learning_rate": 1.901267155931503e-05, "loss": 1.8878, "step": 244400 }, { "epoch": 0.04938700525885759, "grad_norm": 10.728631973266602, "learning_rate": 1.9012267575468727e-05, "loss": 1.9099, "step": 244500 }, { "epoch": 0.049407204443012546, "grad_norm": 10.787117004394531, "learning_rate": 1.9011863591622426e-05, "loss": 1.7285, "step": 244600 }, { "epoch": 0.0494274036271675, "grad_norm": 9.480835914611816, "learning_rate": 1.9011459607776125e-05, "loss": 1.8606, "step": 244700 }, { "epoch": 0.049447602811322454, "grad_norm": 7.4326863288879395, "learning_rate": 1.9011055623929824e-05, "loss": 1.79, "step": 244800 }, { "epoch": 0.0494678019954774, "grad_norm": 11.19866943359375, "learning_rate": 1.9010651640083523e-05, "loss": 1.8808, "step": 244900 }, { "epoch": 0.049488001179632354, "grad_norm": 6.323408126831055, "learning_rate": 1.901024765623722e-05, "loss": 1.8812, "step": 245000 }, { "epoch": 0.049488001179632354, "eval_calculated_loss": 8.83846378326416, "eval_loss": 2.139986276626587, "eval_perplexity": 6894.393218689328, "eval_runtime": 120.9519, "eval_samples_per_second": 8.251, "eval_steps_per_second": 2.067, "step": 245000 }, { "epoch": 0.04950820036378731, "grad_norm": 13.07207202911377, "learning_rate": 1.9009843672390918e-05, "loss": 1.826, "step": 245100 }, { "epoch": 0.04952839954794226, "grad_norm": 12.586216926574707, "learning_rate": 1.9009439688544614e-05, "loss": 1.8914, "step": 245200 }, { "epoch": 0.04954859873209721, "grad_norm": 6.539427757263184, "learning_rate": 1.9009035704698313e-05, "loss": 1.9696, "step": 245300 }, { "epoch": 0.04956879791625216, "grad_norm": 6.186250686645508, "learning_rate": 1.9008631720852012e-05, "loss": 1.8765, "step": 245400 }, { "epoch": 0.049588997100407116, "grad_norm": 9.339799880981445, "learning_rate": 1.9008227737005708e-05, "loss": 1.896, "step": 245500 }, { "epoch": 0.04960919628456207, "grad_norm": 7.487159729003906, "learning_rate": 1.9007823753159407e-05, "loss": 1.7928, "step": 245600 }, { "epoch": 0.049629395468717016, "grad_norm": 10.421879768371582, "learning_rate": 1.9007419769313106e-05, "loss": 1.908, "step": 245700 }, { "epoch": 0.04964959465287197, "grad_norm": 9.731350898742676, "learning_rate": 1.9007015785466805e-05, "loss": 1.8547, "step": 245800 }, { "epoch": 0.049669793837026924, "grad_norm": 9.125784873962402, "learning_rate": 1.90066118016205e-05, "loss": 1.8354, "step": 245900 }, { "epoch": 0.04968999302118188, "grad_norm": 9.782923698425293, "learning_rate": 1.90062078177742e-05, "loss": 1.8169, "step": 246000 }, { "epoch": 0.04968999302118188, "eval_calculated_loss": 8.684916496276855, "eval_loss": 2.1580288410186768, "eval_perplexity": 5913.0467155810165, "eval_runtime": 123.5487, "eval_samples_per_second": 8.078, "eval_steps_per_second": 2.023, "step": 246000 }, { "epoch": 0.049710192205336824, "grad_norm": 6.656792640686035, "learning_rate": 1.90058038339279e-05, "loss": 1.8504, "step": 246100 }, { "epoch": 0.04973039138949178, "grad_norm": 5.384629249572754, "learning_rate": 1.9005399850081595e-05, "loss": 1.9247, "step": 246200 }, { "epoch": 0.04975059057364673, "grad_norm": 8.245911598205566, "learning_rate": 1.9004995866235294e-05, "loss": 1.8877, "step": 246300 }, { "epoch": 0.049770789757801685, "grad_norm": 7.342944622039795, "learning_rate": 1.9004591882388993e-05, "loss": 1.9114, "step": 246400 }, { "epoch": 0.04979098894195663, "grad_norm": 5.988204002380371, "learning_rate": 1.900418789854269e-05, "loss": 1.9107, "step": 246500 }, { "epoch": 0.049811188126111586, "grad_norm": 10.320978164672852, "learning_rate": 1.9003783914696388e-05, "loss": 1.8369, "step": 246600 }, { "epoch": 0.04983138731026654, "grad_norm": 5.901266098022461, "learning_rate": 1.9003379930850084e-05, "loss": 1.9, "step": 246700 }, { "epoch": 0.04985158649442149, "grad_norm": 7.879960536956787, "learning_rate": 1.9002975947003786e-05, "loss": 1.7614, "step": 246800 }, { "epoch": 0.04987178567857644, "grad_norm": 10.78330135345459, "learning_rate": 1.9002571963157482e-05, "loss": 1.7755, "step": 246900 }, { "epoch": 0.049891984862731394, "grad_norm": 22.727073669433594, "learning_rate": 1.900216797931118e-05, "loss": 1.9328, "step": 247000 }, { "epoch": 0.049891984862731394, "eval_calculated_loss": 8.909180641174316, "eval_loss": 2.1485884189605713, "eval_perplexity": 7399.595689712666, "eval_runtime": 129.0746, "eval_samples_per_second": 7.732, "eval_steps_per_second": 1.937, "step": 247000 }, { "epoch": 0.04991218404688635, "grad_norm": 9.78345012664795, "learning_rate": 1.900176399546488e-05, "loss": 1.7834, "step": 247100 }, { "epoch": 0.0499323832310413, "grad_norm": 9.387948036193848, "learning_rate": 1.9001360011618576e-05, "loss": 1.8102, "step": 247200 }, { "epoch": 0.04995258241519625, "grad_norm": 8.591599464416504, "learning_rate": 1.9000956027772275e-05, "loss": 1.7743, "step": 247300 }, { "epoch": 0.0499727815993512, "grad_norm": 11.186663627624512, "learning_rate": 1.9000552043925974e-05, "loss": 1.8497, "step": 247400 }, { "epoch": 0.049992980783506155, "grad_norm": 9.403977394104004, "learning_rate": 1.900014806007967e-05, "loss": 1.8729, "step": 247500 }, { "epoch": 0.05001317996766111, "grad_norm": 4.558881759643555, "learning_rate": 1.899974407623337e-05, "loss": 1.9017, "step": 247600 }, { "epoch": 0.050033379151816056, "grad_norm": 5.349502086639404, "learning_rate": 1.8999340092387065e-05, "loss": 1.8355, "step": 247700 }, { "epoch": 0.05005357833597101, "grad_norm": 5.7885236740112305, "learning_rate": 1.8998936108540767e-05, "loss": 1.9266, "step": 247800 }, { "epoch": 0.05007377752012596, "grad_norm": 7.943942546844482, "learning_rate": 1.8998532124694463e-05, "loss": 1.8986, "step": 247900 }, { "epoch": 0.05009397670428092, "grad_norm": 6.063719749450684, "learning_rate": 1.8998128140848162e-05, "loss": 1.8775, "step": 248000 }, { "epoch": 0.05009397670428092, "eval_calculated_loss": 8.94555950164795, "eval_loss": 2.1483190059661865, "eval_perplexity": 7673.740859849727, "eval_runtime": 130.7658, "eval_samples_per_second": 7.632, "eval_steps_per_second": 1.912, "step": 248000 }, { "epoch": 0.05011417588843586, "grad_norm": 8.587395668029785, "learning_rate": 1.899772415700186e-05, "loss": 1.8867, "step": 248100 }, { "epoch": 0.05013437507259082, "grad_norm": 6.969587802886963, "learning_rate": 1.8997320173155557e-05, "loss": 1.8545, "step": 248200 }, { "epoch": 0.05015457425674577, "grad_norm": 6.325963973999023, "learning_rate": 1.8996916189309256e-05, "loss": 1.8781, "step": 248300 }, { "epoch": 0.050174773440900725, "grad_norm": 8.668638229370117, "learning_rate": 1.8996512205462952e-05, "loss": 1.861, "step": 248400 }, { "epoch": 0.05019497262505567, "grad_norm": 5.91864538192749, "learning_rate": 1.899610822161665e-05, "loss": 1.7818, "step": 248500 }, { "epoch": 0.050215171809210625, "grad_norm": 4.768704414367676, "learning_rate": 1.899570423777035e-05, "loss": 1.8033, "step": 248600 }, { "epoch": 0.05023537099336558, "grad_norm": 11.668099403381348, "learning_rate": 1.8995300253924046e-05, "loss": 1.8388, "step": 248700 }, { "epoch": 0.05025557017752053, "grad_norm": 10.819196701049805, "learning_rate": 1.899489627007775e-05, "loss": 1.7632, "step": 248800 }, { "epoch": 0.05027576936167548, "grad_norm": 7.880147933959961, "learning_rate": 1.8994492286231444e-05, "loss": 1.8302, "step": 248900 }, { "epoch": 0.05029596854583043, "grad_norm": 10.0785551071167, "learning_rate": 1.8994088302385143e-05, "loss": 1.8731, "step": 249000 }, { "epoch": 0.05029596854583043, "eval_calculated_loss": 8.879867553710938, "eval_loss": 2.1459648609161377, "eval_perplexity": 7185.838935070343, "eval_runtime": 132.902, "eval_samples_per_second": 7.509, "eval_steps_per_second": 1.881, "step": 249000 }, { "epoch": 0.05031616772998539, "grad_norm": 4.945219993591309, "learning_rate": 1.8993684318538843e-05, "loss": 1.7451, "step": 249100 }, { "epoch": 0.05033636691414034, "grad_norm": 10.76941967010498, "learning_rate": 1.8993280334692538e-05, "loss": 1.8399, "step": 249200 }, { "epoch": 0.05035656609829529, "grad_norm": 6.243673801422119, "learning_rate": 1.8992876350846237e-05, "loss": 1.8893, "step": 249300 }, { "epoch": 0.05037676528245024, "grad_norm": 15.672439575195312, "learning_rate": 1.8992472366999933e-05, "loss": 1.8173, "step": 249400 }, { "epoch": 0.050396964466605194, "grad_norm": 5.392640590667725, "learning_rate": 1.8992068383153632e-05, "loss": 1.8917, "step": 249500 }, { "epoch": 0.05041716365076015, "grad_norm": 3.989332675933838, "learning_rate": 1.899166439930733e-05, "loss": 1.9057, "step": 249600 }, { "epoch": 0.050437362834915095, "grad_norm": 4.625633716583252, "learning_rate": 1.8991260415461027e-05, "loss": 1.9195, "step": 249700 }, { "epoch": 0.05045756201907005, "grad_norm": 7.4487714767456055, "learning_rate": 1.8990856431614726e-05, "loss": 1.8503, "step": 249800 }, { "epoch": 0.050477761203225, "grad_norm": 10.711529731750488, "learning_rate": 1.8990452447768425e-05, "loss": 1.8459, "step": 249900 }, { "epoch": 0.050497960387379956, "grad_norm": 7.633238315582275, "learning_rate": 1.8990048463922125e-05, "loss": 1.8962, "step": 250000 }, { "epoch": 0.050497960387379956, "eval_calculated_loss": 9.102846145629883, "eval_loss": 2.139979600906372, "eval_perplexity": 8980.81707648608, "eval_runtime": 127.2913, "eval_samples_per_second": 7.84, "eval_steps_per_second": 1.964, "step": 250000 }, { "epoch": 0.0505181595715349, "grad_norm": 6.7051615715026855, "learning_rate": 1.898964448007582e-05, "loss": 1.845, "step": 250100 }, { "epoch": 0.050538358755689856, "grad_norm": 10.021074295043945, "learning_rate": 1.898924049622952e-05, "loss": 1.8907, "step": 250200 }, { "epoch": 0.05055855793984481, "grad_norm": 9.283757209777832, "learning_rate": 1.898883651238322e-05, "loss": 1.8089, "step": 250300 }, { "epoch": 0.050578757123999764, "grad_norm": 9.498982429504395, "learning_rate": 1.8988432528536914e-05, "loss": 1.9386, "step": 250400 }, { "epoch": 0.05059895630815471, "grad_norm": 8.916112899780273, "learning_rate": 1.8988028544690613e-05, "loss": 1.8288, "step": 250500 }, { "epoch": 0.050619155492309664, "grad_norm": 5.384042263031006, "learning_rate": 1.8987624560844313e-05, "loss": 1.7535, "step": 250600 }, { "epoch": 0.05063935467646462, "grad_norm": 8.434881210327148, "learning_rate": 1.8987220576998008e-05, "loss": 1.8176, "step": 250700 }, { "epoch": 0.05065955386061957, "grad_norm": 10.503287315368652, "learning_rate": 1.8986816593151707e-05, "loss": 1.8416, "step": 250800 }, { "epoch": 0.05067975304477452, "grad_norm": 7.115686893463135, "learning_rate": 1.8986412609305407e-05, "loss": 1.792, "step": 250900 }, { "epoch": 0.05069995222892947, "grad_norm": 9.095532417297363, "learning_rate": 1.8986008625459106e-05, "loss": 1.7986, "step": 251000 }, { "epoch": 0.05069995222892947, "eval_calculated_loss": 8.882851600646973, "eval_loss": 2.1441829204559326, "eval_perplexity": 7207.31384085545, "eval_runtime": 134.0633, "eval_samples_per_second": 7.444, "eval_steps_per_second": 1.865, "step": 251000 }, { "epoch": 0.050720151413084426, "grad_norm": 10.037092208862305, "learning_rate": 1.89856046416128e-05, "loss": 1.7973, "step": 251100 }, { "epoch": 0.05074035059723938, "grad_norm": 9.30965518951416, "learning_rate": 1.89852006577665e-05, "loss": 1.8784, "step": 251200 }, { "epoch": 0.050760549781394326, "grad_norm": 6.412376880645752, "learning_rate": 1.89847966739202e-05, "loss": 1.8491, "step": 251300 }, { "epoch": 0.05078074896554928, "grad_norm": 4.523944854736328, "learning_rate": 1.8984392690073895e-05, "loss": 1.8574, "step": 251400 }, { "epoch": 0.050800948149704234, "grad_norm": 7.408275127410889, "learning_rate": 1.8983988706227595e-05, "loss": 1.9076, "step": 251500 }, { "epoch": 0.05082114733385919, "grad_norm": 7.976487636566162, "learning_rate": 1.898358472238129e-05, "loss": 1.9701, "step": 251600 }, { "epoch": 0.050841346518014134, "grad_norm": 6.142745018005371, "learning_rate": 1.898318073853499e-05, "loss": 1.8389, "step": 251700 }, { "epoch": 0.05086154570216909, "grad_norm": 4.4674072265625, "learning_rate": 1.898277675468869e-05, "loss": 1.7299, "step": 251800 }, { "epoch": 0.05088174488632404, "grad_norm": 4.493817329406738, "learning_rate": 1.8982372770842388e-05, "loss": 1.8982, "step": 251900 }, { "epoch": 0.050901944070478995, "grad_norm": 12.053571701049805, "learning_rate": 1.8981968786996087e-05, "loss": 1.8329, "step": 252000 }, { "epoch": 0.050901944070478995, "eval_calculated_loss": 8.706426620483398, "eval_loss": 2.136192798614502, "eval_perplexity": 6041.614886353215, "eval_runtime": 131.5673, "eval_samples_per_second": 7.585, "eval_steps_per_second": 1.9, "step": 252000 }, { "epoch": 0.05092214325463394, "grad_norm": 7.8474225997924805, "learning_rate": 1.8981564803149783e-05, "loss": 1.7965, "step": 252100 }, { "epoch": 0.050942342438788896, "grad_norm": 13.054023742675781, "learning_rate": 1.898116081930348e-05, "loss": 1.9017, "step": 252200 }, { "epoch": 0.05096254162294385, "grad_norm": 8.732861518859863, "learning_rate": 1.898075683545718e-05, "loss": 1.8511, "step": 252300 }, { "epoch": 0.0509827408070988, "grad_norm": 7.372416019439697, "learning_rate": 1.8980352851610877e-05, "loss": 1.8645, "step": 252400 }, { "epoch": 0.05100293999125375, "grad_norm": 8.092021942138672, "learning_rate": 1.8979948867764576e-05, "loss": 1.8508, "step": 252500 }, { "epoch": 0.051023139175408704, "grad_norm": 4.23358678817749, "learning_rate": 1.897954488391827e-05, "loss": 1.8414, "step": 252600 }, { "epoch": 0.05104333835956366, "grad_norm": 4.979587554931641, "learning_rate": 1.897914090007197e-05, "loss": 1.8603, "step": 252700 }, { "epoch": 0.05106353754371861, "grad_norm": 8.531682014465332, "learning_rate": 1.897873691622567e-05, "loss": 1.9144, "step": 252800 }, { "epoch": 0.051083736727873565, "grad_norm": 5.80830717086792, "learning_rate": 1.8978332932379365e-05, "loss": 1.8301, "step": 252900 }, { "epoch": 0.05110393591202851, "grad_norm": 10.870638847351074, "learning_rate": 1.8977928948533068e-05, "loss": 1.7889, "step": 253000 }, { "epoch": 0.05110393591202851, "eval_calculated_loss": 8.96341609954834, "eval_loss": 2.1368610858917236, "eval_perplexity": 7811.998496594574, "eval_runtime": 130.5619, "eval_samples_per_second": 7.644, "eval_steps_per_second": 1.915, "step": 253000 }, { "epoch": 0.051124135096183465, "grad_norm": 8.749419212341309, "learning_rate": 1.8977524964686764e-05, "loss": 1.8393, "step": 253100 }, { "epoch": 0.05114433428033842, "grad_norm": 8.32186508178711, "learning_rate": 1.8977120980840463e-05, "loss": 1.9154, "step": 253200 }, { "epoch": 0.05116453346449337, "grad_norm": 8.532902717590332, "learning_rate": 1.897671699699416e-05, "loss": 1.7699, "step": 253300 }, { "epoch": 0.05118473264864832, "grad_norm": 8.092824935913086, "learning_rate": 1.8976313013147858e-05, "loss": 1.924, "step": 253400 }, { "epoch": 0.05120493183280327, "grad_norm": 8.758020401000977, "learning_rate": 1.8975909029301557e-05, "loss": 1.8349, "step": 253500 }, { "epoch": 0.05122513101695823, "grad_norm": 7.2197370529174805, "learning_rate": 1.8975505045455253e-05, "loss": 1.8334, "step": 253600 }, { "epoch": 0.05124533020111318, "grad_norm": 6.9694414138793945, "learning_rate": 1.897510106160895e-05, "loss": 1.7959, "step": 253700 }, { "epoch": 0.05126552938526813, "grad_norm": 8.634186744689941, "learning_rate": 1.897469707776265e-05, "loss": 1.8436, "step": 253800 }, { "epoch": 0.05128572856942308, "grad_norm": 12.430952072143555, "learning_rate": 1.8974293093916347e-05, "loss": 1.766, "step": 253900 }, { "epoch": 0.051305927753578034, "grad_norm": 5.5233354568481445, "learning_rate": 1.8973889110070046e-05, "loss": 1.9469, "step": 254000 }, { "epoch": 0.051305927753578034, "eval_calculated_loss": 8.997846603393555, "eval_loss": 2.1227333545684814, "eval_perplexity": 8085.653548139523, "eval_runtime": 131.6395, "eval_samples_per_second": 7.581, "eval_steps_per_second": 1.899, "step": 254000 }, { "epoch": 0.05132612693773299, "grad_norm": 12.872769355773926, "learning_rate": 1.8973485126223745e-05, "loss": 1.7591, "step": 254100 }, { "epoch": 0.051346326121887935, "grad_norm": 6.813149452209473, "learning_rate": 1.8973081142377444e-05, "loss": 1.8309, "step": 254200 }, { "epoch": 0.05136652530604289, "grad_norm": 5.5572590827941895, "learning_rate": 1.897267715853114e-05, "loss": 1.8353, "step": 254300 }, { "epoch": 0.05138672449019784, "grad_norm": 9.917732238769531, "learning_rate": 1.897227317468484e-05, "loss": 1.8456, "step": 254400 }, { "epoch": 0.051406923674352796, "grad_norm": 13.208770751953125, "learning_rate": 1.8971869190838538e-05, "loss": 1.876, "step": 254500 }, { "epoch": 0.05142712285850774, "grad_norm": 7.632747173309326, "learning_rate": 1.8971465206992234e-05, "loss": 1.8256, "step": 254600 }, { "epoch": 0.051447322042662696, "grad_norm": 7.520971298217773, "learning_rate": 1.8971061223145933e-05, "loss": 1.8302, "step": 254700 }, { "epoch": 0.05146752122681765, "grad_norm": 16.38874626159668, "learning_rate": 1.8970657239299632e-05, "loss": 1.8058, "step": 254800 }, { "epoch": 0.051487720410972604, "grad_norm": 9.232110023498535, "learning_rate": 1.8970253255453328e-05, "loss": 1.7602, "step": 254900 }, { "epoch": 0.05150791959512755, "grad_norm": 9.024076461791992, "learning_rate": 1.8969849271607027e-05, "loss": 1.8495, "step": 255000 }, { "epoch": 0.05150791959512755, "eval_calculated_loss": 8.98412036895752, "eval_loss": 2.1485862731933594, "eval_perplexity": 7975.426205791578, "eval_runtime": 130.2435, "eval_samples_per_second": 7.663, "eval_steps_per_second": 1.919, "step": 255000 }, { "epoch": 0.051528118779282504, "grad_norm": 9.242718696594238, "learning_rate": 1.8969445287760726e-05, "loss": 1.8179, "step": 255100 }, { "epoch": 0.05154831796343746, "grad_norm": 11.13328742980957, "learning_rate": 1.8969041303914425e-05, "loss": 1.8771, "step": 255200 }, { "epoch": 0.05156851714759241, "grad_norm": 8.649642944335938, "learning_rate": 1.896863732006812e-05, "loss": 1.8552, "step": 255300 }, { "epoch": 0.05158871633174736, "grad_norm": 11.743189811706543, "learning_rate": 1.896823333622182e-05, "loss": 1.838, "step": 255400 }, { "epoch": 0.05160891551590231, "grad_norm": 6.896992206573486, "learning_rate": 1.896782935237552e-05, "loss": 1.9796, "step": 255500 }, { "epoch": 0.051629114700057266, "grad_norm": 7.315978050231934, "learning_rate": 1.8967425368529215e-05, "loss": 1.8593, "step": 255600 }, { "epoch": 0.05164931388421222, "grad_norm": 8.14692497253418, "learning_rate": 1.8967021384682914e-05, "loss": 1.8317, "step": 255700 }, { "epoch": 0.051669513068367166, "grad_norm": 4.412042140960693, "learning_rate": 1.896661740083661e-05, "loss": 1.8531, "step": 255800 }, { "epoch": 0.05168971225252212, "grad_norm": 4.949193477630615, "learning_rate": 1.896621341699031e-05, "loss": 1.8155, "step": 255900 }, { "epoch": 0.051709911436677074, "grad_norm": 5.809614181518555, "learning_rate": 1.8965809433144008e-05, "loss": 1.8786, "step": 256000 }, { "epoch": 0.051709911436677074, "eval_calculated_loss": 8.98672866821289, "eval_loss": 2.1314430236816406, "eval_perplexity": 7996.255656937415, "eval_runtime": 130.8154, "eval_samples_per_second": 7.629, "eval_steps_per_second": 1.911, "step": 256000 }, { "epoch": 0.05173011062083203, "grad_norm": 10.489334106445312, "learning_rate": 1.8965405449297707e-05, "loss": 1.8473, "step": 256100 }, { "epoch": 0.051750309804986974, "grad_norm": 6.788070201873779, "learning_rate": 1.8965001465451406e-05, "loss": 1.802, "step": 256200 }, { "epoch": 0.05177050898914193, "grad_norm": 6.503628253936768, "learning_rate": 1.8964597481605102e-05, "loss": 1.8411, "step": 256300 }, { "epoch": 0.05179070817329688, "grad_norm": 5.502723693847656, "learning_rate": 1.89641934977588e-05, "loss": 1.8968, "step": 256400 }, { "epoch": 0.051810907357451835, "grad_norm": 6.514665603637695, "learning_rate": 1.8963789513912497e-05, "loss": 1.8598, "step": 256500 }, { "epoch": 0.05183110654160678, "grad_norm": 7.661776065826416, "learning_rate": 1.8963385530066196e-05, "loss": 1.9192, "step": 256600 }, { "epoch": 0.051851305725761736, "grad_norm": 8.066819190979004, "learning_rate": 1.8962981546219895e-05, "loss": 1.8923, "step": 256700 }, { "epoch": 0.05187150490991669, "grad_norm": 11.137585639953613, "learning_rate": 1.896257756237359e-05, "loss": 1.9219, "step": 256800 }, { "epoch": 0.05189170409407164, "grad_norm": 9.48623275756836, "learning_rate": 1.896217357852729e-05, "loss": 1.7818, "step": 256900 }, { "epoch": 0.05191190327822659, "grad_norm": 6.95361852645874, "learning_rate": 1.896176959468099e-05, "loss": 1.8465, "step": 257000 }, { "epoch": 0.05191190327822659, "eval_calculated_loss": 8.97596549987793, "eval_loss": 2.1357598304748535, "eval_perplexity": 7910.652120289121, "eval_runtime": 132.5433, "eval_samples_per_second": 7.53, "eval_steps_per_second": 1.886, "step": 257000 }, { "epoch": 0.051932102462381544, "grad_norm": 4.784207344055176, "learning_rate": 1.8961365610834688e-05, "loss": 1.8321, "step": 257100 }, { "epoch": 0.0519523016465365, "grad_norm": 6.2403717041015625, "learning_rate": 1.8960961626988387e-05, "loss": 1.9434, "step": 257200 }, { "epoch": 0.05197250083069145, "grad_norm": 9.566238403320312, "learning_rate": 1.8960557643142083e-05, "loss": 1.8351, "step": 257300 }, { "epoch": 0.0519927000148464, "grad_norm": 12.923303604125977, "learning_rate": 1.8960153659295782e-05, "loss": 1.9092, "step": 257400 }, { "epoch": 0.05201289919900135, "grad_norm": 4.941838264465332, "learning_rate": 1.8959749675449478e-05, "loss": 1.7848, "step": 257500 }, { "epoch": 0.052033098383156305, "grad_norm": 9.35424518585205, "learning_rate": 1.8959345691603177e-05, "loss": 1.8564, "step": 257600 }, { "epoch": 0.05205329756731126, "grad_norm": 9.436524391174316, "learning_rate": 1.8958941707756876e-05, "loss": 1.9115, "step": 257700 }, { "epoch": 0.052073496751466206, "grad_norm": 6.978128910064697, "learning_rate": 1.8958537723910572e-05, "loss": 1.7884, "step": 257800 }, { "epoch": 0.05209369593562116, "grad_norm": 6.138431072235107, "learning_rate": 1.895813374006427e-05, "loss": 1.8997, "step": 257900 }, { "epoch": 0.05211389511977611, "grad_norm": 9.001023292541504, "learning_rate": 1.895772975621797e-05, "loss": 1.9381, "step": 258000 }, { "epoch": 0.05211389511977611, "eval_calculated_loss": 9.046321868896484, "eval_loss": 2.1482982635498047, "eval_perplexity": 8487.26317684221, "eval_runtime": 128.7007, "eval_samples_per_second": 7.754, "eval_steps_per_second": 1.942, "step": 258000 }, { "epoch": 0.05213409430393107, "grad_norm": 8.787799835205078, "learning_rate": 1.895732577237167e-05, "loss": 1.8895, "step": 258100 }, { "epoch": 0.05215429348808601, "grad_norm": 12.208064079284668, "learning_rate": 1.8956921788525365e-05, "loss": 1.8528, "step": 258200 }, { "epoch": 0.05217449267224097, "grad_norm": 9.388934135437012, "learning_rate": 1.8956517804679064e-05, "loss": 1.8959, "step": 258300 }, { "epoch": 0.05219469185639592, "grad_norm": 11.522174835205078, "learning_rate": 1.8956113820832763e-05, "loss": 1.7682, "step": 258400 }, { "epoch": 0.052214891040550875, "grad_norm": 11.254547119140625, "learning_rate": 1.895570983698646e-05, "loss": 1.8638, "step": 258500 }, { "epoch": 0.05223509022470582, "grad_norm": 5.544787406921387, "learning_rate": 1.8955305853140158e-05, "loss": 1.8476, "step": 258600 }, { "epoch": 0.052255289408860775, "grad_norm": 7.933061122894287, "learning_rate": 1.8954901869293857e-05, "loss": 1.7933, "step": 258700 }, { "epoch": 0.05227548859301573, "grad_norm": 6.030482769012451, "learning_rate": 1.8954497885447553e-05, "loss": 1.8296, "step": 258800 }, { "epoch": 0.05229568777717068, "grad_norm": 11.48824691772461, "learning_rate": 1.8954093901601252e-05, "loss": 1.8506, "step": 258900 }, { "epoch": 0.05231588696132563, "grad_norm": 8.800088882446289, "learning_rate": 1.8953689917754948e-05, "loss": 1.8905, "step": 259000 }, { "epoch": 0.05231588696132563, "eval_calculated_loss": 8.975776672363281, "eval_loss": 2.149339199066162, "eval_perplexity": 7909.158512531557, "eval_runtime": 131.6895, "eval_samples_per_second": 7.578, "eval_steps_per_second": 1.898, "step": 259000 }, { "epoch": 0.05233608614548058, "grad_norm": 6.565278053283691, "learning_rate": 1.8953285933908647e-05, "loss": 1.7772, "step": 259100 }, { "epoch": 0.05235628532963554, "grad_norm": 6.203109264373779, "learning_rate": 1.8952881950062346e-05, "loss": 1.7356, "step": 259200 }, { "epoch": 0.05237648451379049, "grad_norm": 3.3854076862335205, "learning_rate": 1.8952477966216045e-05, "loss": 1.79, "step": 259300 }, { "epoch": 0.05239668369794544, "grad_norm": 5.864636421203613, "learning_rate": 1.8952073982369744e-05, "loss": 1.9071, "step": 259400 }, { "epoch": 0.05241688288210039, "grad_norm": 8.283792495727539, "learning_rate": 1.895166999852344e-05, "loss": 1.8599, "step": 259500 }, { "epoch": 0.052437082066255344, "grad_norm": 7.282087802886963, "learning_rate": 1.895126601467714e-05, "loss": 1.8505, "step": 259600 }, { "epoch": 0.0524572812504103, "grad_norm": 7.113352298736572, "learning_rate": 1.895086203083084e-05, "loss": 1.8348, "step": 259700 }, { "epoch": 0.052477480434565245, "grad_norm": 8.357288360595703, "learning_rate": 1.8950458046984534e-05, "loss": 1.8186, "step": 259800 }, { "epoch": 0.0524976796187202, "grad_norm": 7.9371442794799805, "learning_rate": 1.8950054063138233e-05, "loss": 1.8466, "step": 259900 }, { "epoch": 0.05251787880287515, "grad_norm": 7.254012584686279, "learning_rate": 1.894965007929193e-05, "loss": 1.8304, "step": 260000 }, { "epoch": 0.05251787880287515, "eval_calculated_loss": 8.917665481567383, "eval_loss": 2.134284734725952, "eval_perplexity": 7462.647190805526, "eval_runtime": 130.9685, "eval_samples_per_second": 7.62, "eval_steps_per_second": 1.909, "step": 260000 }, { "epoch": 0.052538077987030106, "grad_norm": 8.144756317138672, "learning_rate": 1.8949246095445628e-05, "loss": 1.8607, "step": 260100 }, { "epoch": 0.05255827717118505, "grad_norm": 5.624141693115234, "learning_rate": 1.8948842111599327e-05, "loss": 1.8455, "step": 260200 }, { "epoch": 0.052578476355340006, "grad_norm": 8.852350234985352, "learning_rate": 1.8948438127753026e-05, "loss": 1.8388, "step": 260300 }, { "epoch": 0.05259867553949496, "grad_norm": 4.558782577514648, "learning_rate": 1.8948034143906725e-05, "loss": 1.7429, "step": 260400 }, { "epoch": 0.052618874723649914, "grad_norm": 4.619109153747559, "learning_rate": 1.894763016006042e-05, "loss": 1.9235, "step": 260500 }, { "epoch": 0.05263907390780486, "grad_norm": 11.506093978881836, "learning_rate": 1.894722617621412e-05, "loss": 1.775, "step": 260600 }, { "epoch": 0.052659273091959814, "grad_norm": 6.027554988861084, "learning_rate": 1.8946822192367816e-05, "loss": 1.7801, "step": 260700 }, { "epoch": 0.05267947227611477, "grad_norm": 5.190069198608398, "learning_rate": 1.8946418208521515e-05, "loss": 1.7812, "step": 260800 }, { "epoch": 0.05269967146026972, "grad_norm": 11.975712776184082, "learning_rate": 1.8946014224675214e-05, "loss": 1.8861, "step": 260900 }, { "epoch": 0.05271987064442467, "grad_norm": 4.984888553619385, "learning_rate": 1.894561024082891e-05, "loss": 1.8509, "step": 261000 }, { "epoch": 0.05271987064442467, "eval_calculated_loss": 8.721879005432129, "eval_loss": 2.137293577194214, "eval_perplexity": 6135.697271862589, "eval_runtime": 121.3956, "eval_samples_per_second": 8.221, "eval_steps_per_second": 2.059, "step": 261000 }, { "epoch": 0.05274006982857962, "grad_norm": 9.19296646118164, "learning_rate": 1.894520625698261e-05, "loss": 1.7829, "step": 261100 }, { "epoch": 0.052760269012734576, "grad_norm": 11.320028305053711, "learning_rate": 1.894480227313631e-05, "loss": 1.8044, "step": 261200 }, { "epoch": 0.05278046819688953, "grad_norm": 6.5360798835754395, "learning_rate": 1.8944398289290007e-05, "loss": 1.75, "step": 261300 }, { "epoch": 0.052800667381044476, "grad_norm": 5.970553398132324, "learning_rate": 1.8943994305443703e-05, "loss": 1.8608, "step": 261400 }, { "epoch": 0.05282086656519943, "grad_norm": 9.053762435913086, "learning_rate": 1.8943590321597402e-05, "loss": 1.8482, "step": 261500 }, { "epoch": 0.052841065749354384, "grad_norm": 9.470288276672363, "learning_rate": 1.89431863377511e-05, "loss": 1.882, "step": 261600 }, { "epoch": 0.05286126493350934, "grad_norm": 8.776025772094727, "learning_rate": 1.8942782353904797e-05, "loss": 1.8295, "step": 261700 }, { "epoch": 0.052881464117664284, "grad_norm": 6.840857982635498, "learning_rate": 1.8942378370058496e-05, "loss": 1.7937, "step": 261800 }, { "epoch": 0.05290166330181924, "grad_norm": 7.125694274902344, "learning_rate": 1.8941974386212195e-05, "loss": 1.8092, "step": 261900 }, { "epoch": 0.05292186248597419, "grad_norm": 9.809179306030273, "learning_rate": 1.894157040236589e-05, "loss": 1.8117, "step": 262000 }, { "epoch": 0.05292186248597419, "eval_calculated_loss": 8.995569229125977, "eval_loss": 2.1371264457702637, "eval_perplexity": 8067.260440756718, "eval_runtime": 124.0708, "eval_samples_per_second": 8.044, "eval_steps_per_second": 2.015, "step": 262000 }, { "epoch": 0.052942061670129145, "grad_norm": 7.277595520019531, "learning_rate": 1.894116641851959e-05, "loss": 1.9078, "step": 262100 }, { "epoch": 0.0529622608542841, "grad_norm": 10.671294212341309, "learning_rate": 1.8940762434673286e-05, "loss": 1.8339, "step": 262200 }, { "epoch": 0.052982460038439046, "grad_norm": 7.537107944488525, "learning_rate": 1.894035845082699e-05, "loss": 1.9593, "step": 262300 }, { "epoch": 0.053002659222594, "grad_norm": 9.893152236938477, "learning_rate": 1.8939954466980684e-05, "loss": 1.8743, "step": 262400 }, { "epoch": 0.05302285840674895, "grad_norm": 9.145313262939453, "learning_rate": 1.8939550483134383e-05, "loss": 1.8619, "step": 262500 }, { "epoch": 0.05304305759090391, "grad_norm": 8.658838272094727, "learning_rate": 1.8939146499288083e-05, "loss": 1.852, "step": 262600 }, { "epoch": 0.053063256775058854, "grad_norm": 9.128313064575195, "learning_rate": 1.8938742515441778e-05, "loss": 1.8701, "step": 262700 }, { "epoch": 0.05308345595921381, "grad_norm": 8.290282249450684, "learning_rate": 1.8938338531595477e-05, "loss": 1.9155, "step": 262800 }, { "epoch": 0.05310365514336876, "grad_norm": 5.912005424499512, "learning_rate": 1.8937934547749177e-05, "loss": 1.8429, "step": 262900 }, { "epoch": 0.053123854327523715, "grad_norm": 8.230057716369629, "learning_rate": 1.8937530563902872e-05, "loss": 1.8126, "step": 263000 }, { "epoch": 0.053123854327523715, "eval_calculated_loss": 8.96555233001709, "eval_loss": 2.1335337162017822, "eval_perplexity": 7828.704563456199, "eval_runtime": 123.0971, "eval_samples_per_second": 8.107, "eval_steps_per_second": 2.031, "step": 263000 }, { "epoch": 0.05314405351167866, "grad_norm": 9.344315528869629, "learning_rate": 1.893712658005657e-05, "loss": 1.8626, "step": 263100 }, { "epoch": 0.053164252695833615, "grad_norm": 11.898512840270996, "learning_rate": 1.8936722596210267e-05, "loss": 1.8255, "step": 263200 }, { "epoch": 0.05318445187998857, "grad_norm": 8.151082038879395, "learning_rate": 1.893631861236397e-05, "loss": 1.879, "step": 263300 }, { "epoch": 0.05320465106414352, "grad_norm": 10.140399932861328, "learning_rate": 1.8935914628517665e-05, "loss": 1.8519, "step": 263400 }, { "epoch": 0.05322485024829847, "grad_norm": 5.630188941955566, "learning_rate": 1.8935510644671365e-05, "loss": 1.8868, "step": 263500 }, { "epoch": 0.05324504943245342, "grad_norm": 7.623091220855713, "learning_rate": 1.8935106660825064e-05, "loss": 1.858, "step": 263600 }, { "epoch": 0.05326524861660838, "grad_norm": 10.479933738708496, "learning_rate": 1.893470267697876e-05, "loss": 1.8336, "step": 263700 }, { "epoch": 0.05328544780076333, "grad_norm": 5.489277362823486, "learning_rate": 1.893429869313246e-05, "loss": 1.8025, "step": 263800 }, { "epoch": 0.05330564698491828, "grad_norm": 12.816245079040527, "learning_rate": 1.8933894709286154e-05, "loss": 1.8441, "step": 263900 }, { "epoch": 0.05332584616907323, "grad_norm": 11.175862312316895, "learning_rate": 1.8933490725439853e-05, "loss": 1.9196, "step": 264000 }, { "epoch": 0.05332584616907323, "eval_calculated_loss": 8.910318374633789, "eval_loss": 2.146388530731201, "eval_perplexity": 7408.019248288994, "eval_runtime": 123.7693, "eval_samples_per_second": 8.063, "eval_steps_per_second": 2.02, "step": 264000 }, { "epoch": 0.053346045353228184, "grad_norm": 8.435454368591309, "learning_rate": 1.8933086741593553e-05, "loss": 1.827, "step": 264100 }, { "epoch": 0.05336624453738314, "grad_norm": 6.469120025634766, "learning_rate": 1.8932682757747248e-05, "loss": 1.7945, "step": 264200 }, { "epoch": 0.053386443721538085, "grad_norm": 8.142306327819824, "learning_rate": 1.8932278773900947e-05, "loss": 1.8593, "step": 264300 }, { "epoch": 0.05340664290569304, "grad_norm": 6.894611835479736, "learning_rate": 1.8931874790054647e-05, "loss": 1.9114, "step": 264400 }, { "epoch": 0.05342684208984799, "grad_norm": 7.390207290649414, "learning_rate": 1.8931470806208346e-05, "loss": 1.8978, "step": 264500 }, { "epoch": 0.053447041274002946, "grad_norm": 8.60483169555664, "learning_rate": 1.893106682236204e-05, "loss": 1.8636, "step": 264600 }, { "epoch": 0.05346724045815789, "grad_norm": 9.716324806213379, "learning_rate": 1.893066283851574e-05, "loss": 1.854, "step": 264700 }, { "epoch": 0.053487439642312846, "grad_norm": 9.938554763793945, "learning_rate": 1.893025885466944e-05, "loss": 1.906, "step": 264800 }, { "epoch": 0.0535076388264678, "grad_norm": 8.135493278503418, "learning_rate": 1.8929854870823135e-05, "loss": 1.7621, "step": 264900 }, { "epoch": 0.053527838010622754, "grad_norm": 10.29218578338623, "learning_rate": 1.8929450886976835e-05, "loss": 1.8566, "step": 265000 }, { "epoch": 0.053527838010622754, "eval_calculated_loss": 8.840622901916504, "eval_loss": 2.1391685009002686, "eval_perplexity": 6909.295113374648, "eval_runtime": 120.4258, "eval_samples_per_second": 8.287, "eval_steps_per_second": 2.076, "step": 265000 }, { "epoch": 0.0535480371947777, "grad_norm": 8.900711059570312, "learning_rate": 1.8929046903130534e-05, "loss": 1.8047, "step": 265100 }, { "epoch": 0.053568236378932654, "grad_norm": 8.244138717651367, "learning_rate": 1.892864291928423e-05, "loss": 1.9164, "step": 265200 }, { "epoch": 0.05358843556308761, "grad_norm": 4.613393783569336, "learning_rate": 1.892823893543793e-05, "loss": 1.997, "step": 265300 }, { "epoch": 0.05360863474724256, "grad_norm": 9.136329650878906, "learning_rate": 1.8927834951591628e-05, "loss": 1.8687, "step": 265400 }, { "epoch": 0.05362883393139751, "grad_norm": 11.01395320892334, "learning_rate": 1.8927430967745327e-05, "loss": 1.847, "step": 265500 }, { "epoch": 0.05364903311555246, "grad_norm": 10.474855422973633, "learning_rate": 1.8927026983899023e-05, "loss": 1.8757, "step": 265600 }, { "epoch": 0.053669232299707416, "grad_norm": 6.0403313636779785, "learning_rate": 1.892662300005272e-05, "loss": 1.8179, "step": 265700 }, { "epoch": 0.05368943148386237, "grad_norm": 9.080132484436035, "learning_rate": 1.892621901620642e-05, "loss": 1.8879, "step": 265800 }, { "epoch": 0.053709630668017316, "grad_norm": 11.760141372680664, "learning_rate": 1.8925815032360117e-05, "loss": 1.8056, "step": 265900 }, { "epoch": 0.05372982985217227, "grad_norm": 9.389424324035645, "learning_rate": 1.8925411048513816e-05, "loss": 1.806, "step": 266000 }, { "epoch": 0.05372982985217227, "eval_calculated_loss": 8.856968879699707, "eval_loss": 2.151919364929199, "eval_perplexity": 7023.162398503849, "eval_runtime": 126.0752, "eval_samples_per_second": 7.916, "eval_steps_per_second": 1.983, "step": 266000 }, { "epoch": 0.053750029036327224, "grad_norm": 7.849495887756348, "learning_rate": 1.8925007064667515e-05, "loss": 1.918, "step": 266100 }, { "epoch": 0.05377022822048218, "grad_norm": 8.524956703186035, "learning_rate": 1.892460308082121e-05, "loss": 1.8065, "step": 266200 }, { "epoch": 0.053790427404637124, "grad_norm": 8.33759880065918, "learning_rate": 1.892419909697491e-05, "loss": 1.8659, "step": 266300 }, { "epoch": 0.05381062658879208, "grad_norm": 5.064263820648193, "learning_rate": 1.892379511312861e-05, "loss": 1.8558, "step": 266400 }, { "epoch": 0.05383082577294703, "grad_norm": 4.931060791015625, "learning_rate": 1.8923391129282308e-05, "loss": 1.8146, "step": 266500 }, { "epoch": 0.053851024957101985, "grad_norm": 10.122589111328125, "learning_rate": 1.8922987145436004e-05, "loss": 1.8124, "step": 266600 }, { "epoch": 0.05387122414125693, "grad_norm": 8.767791748046875, "learning_rate": 1.8922583161589703e-05, "loss": 1.8974, "step": 266700 }, { "epoch": 0.053891423325411886, "grad_norm": 6.240015029907227, "learning_rate": 1.8922179177743402e-05, "loss": 1.866, "step": 266800 }, { "epoch": 0.05391162250956684, "grad_norm": 9.567181587219238, "learning_rate": 1.8921775193897098e-05, "loss": 1.8294, "step": 266900 }, { "epoch": 0.05393182169372179, "grad_norm": 11.278148651123047, "learning_rate": 1.8921371210050797e-05, "loss": 1.9336, "step": 267000 }, { "epoch": 0.05393182169372179, "eval_calculated_loss": 8.770978927612305, "eval_loss": 2.1432082653045654, "eval_perplexity": 6444.478055016327, "eval_runtime": 123.4478, "eval_samples_per_second": 8.084, "eval_steps_per_second": 2.025, "step": 267000 }, { "epoch": 0.05395202087787674, "grad_norm": 4.768433094024658, "learning_rate": 1.8920967226204492e-05, "loss": 1.788, "step": 267100 }, { "epoch": 0.053972220062031694, "grad_norm": 8.33971881866455, "learning_rate": 1.892056324235819e-05, "loss": 1.8393, "step": 267200 }, { "epoch": 0.05399241924618665, "grad_norm": 4.601705074310303, "learning_rate": 1.892015925851189e-05, "loss": 1.7832, "step": 267300 }, { "epoch": 0.0540126184303416, "grad_norm": 8.917844772338867, "learning_rate": 1.8919755274665586e-05, "loss": 1.8266, "step": 267400 }, { "epoch": 0.05403281761449655, "grad_norm": 5.443627834320068, "learning_rate": 1.891935129081929e-05, "loss": 1.9318, "step": 267500 }, { "epoch": 0.0540530167986515, "grad_norm": 8.94868278503418, "learning_rate": 1.8918947306972985e-05, "loss": 1.7374, "step": 267600 }, { "epoch": 0.054073215982806455, "grad_norm": 10.135793685913086, "learning_rate": 1.8918543323126684e-05, "loss": 1.7781, "step": 267700 }, { "epoch": 0.05409341516696141, "grad_norm": 9.09152889251709, "learning_rate": 1.8918139339280383e-05, "loss": 1.8311, "step": 267800 }, { "epoch": 0.054113614351116356, "grad_norm": 8.205487251281738, "learning_rate": 1.891773535543408e-05, "loss": 1.7884, "step": 267900 }, { "epoch": 0.05413381353527131, "grad_norm": 11.102087020874023, "learning_rate": 1.8917331371587778e-05, "loss": 1.8161, "step": 268000 }, { "epoch": 0.05413381353527131, "eval_calculated_loss": 8.873443603515625, "eval_loss": 2.1455841064453125, "eval_perplexity": 7139.825416153899, "eval_runtime": 124.1467, "eval_samples_per_second": 8.039, "eval_steps_per_second": 2.014, "step": 268000 }, { "epoch": 0.05415401271942626, "grad_norm": 3.78941011428833, "learning_rate": 1.8916927387741474e-05, "loss": 1.8233, "step": 268100 }, { "epoch": 0.05417421190358122, "grad_norm": 9.259247779846191, "learning_rate": 1.8916523403895173e-05, "loss": 1.8883, "step": 268200 }, { "epoch": 0.05419441108773616, "grad_norm": 5.801112651824951, "learning_rate": 1.8916119420048872e-05, "loss": 1.8042, "step": 268300 }, { "epoch": 0.05421461027189112, "grad_norm": 5.014223098754883, "learning_rate": 1.8915715436202568e-05, "loss": 1.8247, "step": 268400 }, { "epoch": 0.05423480945604607, "grad_norm": 6.780404090881348, "learning_rate": 1.891531145235627e-05, "loss": 1.8959, "step": 268500 }, { "epoch": 0.054255008640201025, "grad_norm": 7.090856075286865, "learning_rate": 1.8914907468509966e-05, "loss": 1.9654, "step": 268600 }, { "epoch": 0.05427520782435597, "grad_norm": 8.51171875, "learning_rate": 1.8914503484663665e-05, "loss": 1.9487, "step": 268700 }, { "epoch": 0.054295407008510925, "grad_norm": 7.326169490814209, "learning_rate": 1.891409950081736e-05, "loss": 1.8716, "step": 268800 }, { "epoch": 0.05431560619266588, "grad_norm": 10.885299682617188, "learning_rate": 1.891369551697106e-05, "loss": 1.8428, "step": 268900 }, { "epoch": 0.05433580537682083, "grad_norm": 5.281418800354004, "learning_rate": 1.891329153312476e-05, "loss": 1.8287, "step": 269000 }, { "epoch": 0.05433580537682083, "eval_calculated_loss": 9.187053680419922, "eval_loss": 2.1381235122680664, "eval_perplexity": 9769.823510286937, "eval_runtime": 124.2632, "eval_samples_per_second": 8.031, "eval_steps_per_second": 2.012, "step": 269000 }, { "epoch": 0.05435600456097578, "grad_norm": 9.00594425201416, "learning_rate": 1.8912887549278455e-05, "loss": 1.8222, "step": 269100 }, { "epoch": 0.05437620374513073, "grad_norm": 9.580602645874023, "learning_rate": 1.8912483565432154e-05, "loss": 1.8338, "step": 269200 }, { "epoch": 0.05439640292928569, "grad_norm": 4.23262882232666, "learning_rate": 1.8912079581585853e-05, "loss": 1.8795, "step": 269300 }, { "epoch": 0.05441660211344064, "grad_norm": 10.827473640441895, "learning_rate": 1.891167559773955e-05, "loss": 1.8323, "step": 269400 }, { "epoch": 0.05443680129759559, "grad_norm": 9.935784339904785, "learning_rate": 1.8911271613893248e-05, "loss": 1.812, "step": 269500 }, { "epoch": 0.05445700048175054, "grad_norm": 4.216614246368408, "learning_rate": 1.8910867630046947e-05, "loss": 1.8398, "step": 269600 }, { "epoch": 0.054477199665905494, "grad_norm": 9.116621017456055, "learning_rate": 1.8910463646200646e-05, "loss": 1.874, "step": 269700 }, { "epoch": 0.05449739885006045, "grad_norm": 11.011263847351074, "learning_rate": 1.8910059662354342e-05, "loss": 1.8338, "step": 269800 }, { "epoch": 0.054517598034215395, "grad_norm": 4.906750202178955, "learning_rate": 1.890965567850804e-05, "loss": 1.9309, "step": 269900 }, { "epoch": 0.05453779721837035, "grad_norm": 8.594338417053223, "learning_rate": 1.890925169466174e-05, "loss": 1.9241, "step": 270000 }, { "epoch": 0.05453779721837035, "eval_calculated_loss": 8.866072654724121, "eval_loss": 2.143726110458374, "eval_perplexity": 7087.391609480832, "eval_runtime": 124.4236, "eval_samples_per_second": 8.021, "eval_steps_per_second": 2.009, "step": 270000 }, { "epoch": 0.0545579964025253, "grad_norm": 8.641648292541504, "learning_rate": 1.8908847710815436e-05, "loss": 1.8915, "step": 270100 }, { "epoch": 0.054578195586680256, "grad_norm": 6.825341701507568, "learning_rate": 1.8908443726969135e-05, "loss": 1.9451, "step": 270200 }, { "epoch": 0.0545983947708352, "grad_norm": 6.34080171585083, "learning_rate": 1.8908039743122834e-05, "loss": 1.8157, "step": 270300 }, { "epoch": 0.054618593954990156, "grad_norm": 10.469839096069336, "learning_rate": 1.890763575927653e-05, "loss": 1.9469, "step": 270400 }, { "epoch": 0.05463879313914511, "grad_norm": 11.308684349060059, "learning_rate": 1.890723177543023e-05, "loss": 1.7957, "step": 270500 }, { "epoch": 0.054658992323300064, "grad_norm": 10.52429485321045, "learning_rate": 1.8906827791583928e-05, "loss": 1.9223, "step": 270600 }, { "epoch": 0.05467919150745501, "grad_norm": 9.531723022460938, "learning_rate": 1.8906423807737627e-05, "loss": 1.8254, "step": 270700 }, { "epoch": 0.054699390691609964, "grad_norm": 9.87861156463623, "learning_rate": 1.8906019823891323e-05, "loss": 1.8509, "step": 270800 }, { "epoch": 0.05471958987576492, "grad_norm": 7.772528171539307, "learning_rate": 1.8905615840045022e-05, "loss": 1.8534, "step": 270900 }, { "epoch": 0.05473978905991987, "grad_norm": 7.0633931159973145, "learning_rate": 1.890521185619872e-05, "loss": 1.8829, "step": 271000 }, { "epoch": 0.05473978905991987, "eval_calculated_loss": 8.957479476928711, "eval_loss": 2.1450469493865967, "eval_perplexity": 7765.758998643266, "eval_runtime": 124.1588, "eval_samples_per_second": 8.038, "eval_steps_per_second": 2.014, "step": 271000 }, { "epoch": 0.05475998824407482, "grad_norm": 10.909952163696289, "learning_rate": 1.8904807872352417e-05, "loss": 1.7966, "step": 271100 }, { "epoch": 0.05478018742822977, "grad_norm": 6.863527774810791, "learning_rate": 1.8904403888506116e-05, "loss": 1.9094, "step": 271200 }, { "epoch": 0.054800386612384726, "grad_norm": 9.602267265319824, "learning_rate": 1.8903999904659812e-05, "loss": 1.8242, "step": 271300 }, { "epoch": 0.05482058579653968, "grad_norm": 8.201351165771484, "learning_rate": 1.890359592081351e-05, "loss": 1.8998, "step": 271400 }, { "epoch": 0.05484078498069463, "grad_norm": 8.28017807006836, "learning_rate": 1.890319193696721e-05, "loss": 1.9286, "step": 271500 }, { "epoch": 0.05486098416484958, "grad_norm": 7.0805792808532715, "learning_rate": 1.890278795312091e-05, "loss": 1.8769, "step": 271600 }, { "epoch": 0.054881183349004534, "grad_norm": 7.117795944213867, "learning_rate": 1.890238396927461e-05, "loss": 1.894, "step": 271700 }, { "epoch": 0.05490138253315949, "grad_norm": 3.9619085788726807, "learning_rate": 1.8901979985428304e-05, "loss": 1.8328, "step": 271800 }, { "epoch": 0.05492158171731444, "grad_norm": 11.217453956604004, "learning_rate": 1.8901576001582003e-05, "loss": 1.7991, "step": 271900 }, { "epoch": 0.05494178090146939, "grad_norm": 11.48288345336914, "learning_rate": 1.89011720177357e-05, "loss": 1.9403, "step": 272000 }, { "epoch": 0.05494178090146939, "eval_calculated_loss": 8.835973739624023, "eval_loss": 2.150623083114624, "eval_perplexity": 6877.24723468595, "eval_runtime": 126.8407, "eval_samples_per_second": 7.868, "eval_steps_per_second": 1.971, "step": 272000 }, { "epoch": 0.05496198008562434, "grad_norm": 8.595284461975098, "learning_rate": 1.8900768033889398e-05, "loss": 1.8559, "step": 272100 }, { "epoch": 0.054982179269779295, "grad_norm": 5.541689872741699, "learning_rate": 1.8900364050043097e-05, "loss": 1.8598, "step": 272200 }, { "epoch": 0.05500237845393425, "grad_norm": 4.8343729972839355, "learning_rate": 1.8899960066196793e-05, "loss": 1.8681, "step": 272300 }, { "epoch": 0.055022577638089196, "grad_norm": 6.086266040802002, "learning_rate": 1.8899556082350492e-05, "loss": 1.8635, "step": 272400 }, { "epoch": 0.05504277682224415, "grad_norm": 9.329236030578613, "learning_rate": 1.889915209850419e-05, "loss": 1.7989, "step": 272500 }, { "epoch": 0.0550629760063991, "grad_norm": 8.666624069213867, "learning_rate": 1.889874811465789e-05, "loss": 1.787, "step": 272600 }, { "epoch": 0.05508317519055406, "grad_norm": 6.191939830780029, "learning_rate": 1.889834413081159e-05, "loss": 1.8701, "step": 272700 }, { "epoch": 0.055103374374709004, "grad_norm": 8.859508514404297, "learning_rate": 1.8897940146965285e-05, "loss": 1.8306, "step": 272800 }, { "epoch": 0.05512357355886396, "grad_norm": 8.752134323120117, "learning_rate": 1.8897536163118984e-05, "loss": 1.8879, "step": 272900 }, { "epoch": 0.05514377274301891, "grad_norm": 5.232884883880615, "learning_rate": 1.889713217927268e-05, "loss": 1.8182, "step": 273000 }, { "epoch": 0.05514377274301891, "eval_calculated_loss": 8.794251441955566, "eval_loss": 2.1434850692749023, "eval_perplexity": 6596.216077128141, "eval_runtime": 126.081, "eval_samples_per_second": 7.916, "eval_steps_per_second": 1.983, "step": 273000 }, { "epoch": 0.055163971927173865, "grad_norm": 9.640660285949707, "learning_rate": 1.889672819542638e-05, "loss": 1.8504, "step": 273100 }, { "epoch": 0.05518417111132881, "grad_norm": 8.387462615966797, "learning_rate": 1.889632421158008e-05, "loss": 1.7848, "step": 273200 }, { "epoch": 0.055204370295483765, "grad_norm": 11.009815216064453, "learning_rate": 1.8895920227733774e-05, "loss": 1.8629, "step": 273300 }, { "epoch": 0.05522456947963872, "grad_norm": 4.546616077423096, "learning_rate": 1.8895516243887473e-05, "loss": 1.8581, "step": 273400 }, { "epoch": 0.05524476866379367, "grad_norm": 6.000464916229248, "learning_rate": 1.8895112260041172e-05, "loss": 1.8996, "step": 273500 }, { "epoch": 0.05526496784794862, "grad_norm": 6.975701332092285, "learning_rate": 1.8894708276194868e-05, "loss": 1.9297, "step": 273600 }, { "epoch": 0.05528516703210357, "grad_norm": 7.031562805175781, "learning_rate": 1.8894304292348567e-05, "loss": 1.831, "step": 273700 }, { "epoch": 0.05530536621625853, "grad_norm": 11.08057975769043, "learning_rate": 1.8893900308502266e-05, "loss": 1.8736, "step": 273800 }, { "epoch": 0.05532556540041348, "grad_norm": 5.1088128089904785, "learning_rate": 1.8893496324655965e-05, "loss": 1.8681, "step": 273900 }, { "epoch": 0.05534576458456843, "grad_norm": 5.899173736572266, "learning_rate": 1.889309234080966e-05, "loss": 1.8476, "step": 274000 }, { "epoch": 0.05534576458456843, "eval_calculated_loss": 9.016838073730469, "eval_loss": 2.1348021030426025, "eval_perplexity": 8240.679422823181, "eval_runtime": 119.7284, "eval_samples_per_second": 8.336, "eval_steps_per_second": 2.088, "step": 274000 }, { "epoch": 0.05536596376872338, "grad_norm": 7.178084373474121, "learning_rate": 1.889268835696336e-05, "loss": 1.905, "step": 274100 }, { "epoch": 0.055386162952878334, "grad_norm": 7.5173749923706055, "learning_rate": 1.889228437311706e-05, "loss": 1.9373, "step": 274200 }, { "epoch": 0.05540636213703329, "grad_norm": 7.6161322593688965, "learning_rate": 1.8891880389270755e-05, "loss": 1.8551, "step": 274300 }, { "epoch": 0.055426561321188235, "grad_norm": 4.829672336578369, "learning_rate": 1.8891476405424454e-05, "loss": 1.7809, "step": 274400 }, { "epoch": 0.05544676050534319, "grad_norm": 7.995138645172119, "learning_rate": 1.889107242157815e-05, "loss": 1.8828, "step": 274500 }, { "epoch": 0.05546695968949814, "grad_norm": 5.763426780700684, "learning_rate": 1.889066843773185e-05, "loss": 1.8255, "step": 274600 }, { "epoch": 0.055487158873653096, "grad_norm": 6.327158451080322, "learning_rate": 1.8890264453885548e-05, "loss": 1.8305, "step": 274700 }, { "epoch": 0.05550735805780804, "grad_norm": 7.132756233215332, "learning_rate": 1.8889860470039247e-05, "loss": 1.8535, "step": 274800 }, { "epoch": 0.055527557241962996, "grad_norm": 6.191182613372803, "learning_rate": 1.8889456486192947e-05, "loss": 1.7748, "step": 274900 }, { "epoch": 0.05554775642611795, "grad_norm": 8.719108581542969, "learning_rate": 1.8889052502346642e-05, "loss": 1.9017, "step": 275000 }, { "epoch": 0.05554775642611795, "eval_calculated_loss": 8.747678756713867, "eval_loss": 2.139511823654175, "eval_perplexity": 6296.05645407139, "eval_runtime": 125.6937, "eval_samples_per_second": 7.94, "eval_steps_per_second": 1.989, "step": 275000 }, { "epoch": 0.055567955610272904, "grad_norm": 3.5589964389801025, "learning_rate": 1.888864851850034e-05, "loss": 1.7784, "step": 275100 }, { "epoch": 0.05558815479442785, "grad_norm": 9.934723854064941, "learning_rate": 1.8888244534654037e-05, "loss": 1.8194, "step": 275200 }, { "epoch": 0.055608353978582804, "grad_norm": 9.453248977661133, "learning_rate": 1.8887840550807736e-05, "loss": 1.8948, "step": 275300 }, { "epoch": 0.05562855316273776, "grad_norm": 7.533228397369385, "learning_rate": 1.8887436566961435e-05, "loss": 1.8688, "step": 275400 }, { "epoch": 0.05564875234689271, "grad_norm": 8.012956619262695, "learning_rate": 1.888703258311513e-05, "loss": 1.7591, "step": 275500 }, { "epoch": 0.05566895153104766, "grad_norm": 8.024110794067383, "learning_rate": 1.888662859926883e-05, "loss": 1.9039, "step": 275600 }, { "epoch": 0.05568915071520261, "grad_norm": 9.152833938598633, "learning_rate": 1.888622461542253e-05, "loss": 1.8225, "step": 275700 }, { "epoch": 0.055709349899357566, "grad_norm": 5.900786399841309, "learning_rate": 1.888582063157623e-05, "loss": 1.8621, "step": 275800 }, { "epoch": 0.05572954908351252, "grad_norm": 10.414346694946289, "learning_rate": 1.8885416647729928e-05, "loss": 1.742, "step": 275900 }, { "epoch": 0.055749748267667466, "grad_norm": 7.433250904083252, "learning_rate": 1.8885012663883623e-05, "loss": 1.7655, "step": 276000 }, { "epoch": 0.055749748267667466, "eval_calculated_loss": 9.096254348754883, "eval_loss": 2.133514165878296, "eval_perplexity": 8921.812042800702, "eval_runtime": 124.6302, "eval_samples_per_second": 8.008, "eval_steps_per_second": 2.006, "step": 276000 }, { "epoch": 0.05576994745182242, "grad_norm": 10.137136459350586, "learning_rate": 1.8884608680037323e-05, "loss": 1.9047, "step": 276100 }, { "epoch": 0.055790146635977374, "grad_norm": 9.66201114654541, "learning_rate": 1.8884204696191018e-05, "loss": 1.8903, "step": 276200 }, { "epoch": 0.05581034582013233, "grad_norm": 5.860875606536865, "learning_rate": 1.8883800712344717e-05, "loss": 1.9235, "step": 276300 }, { "epoch": 0.055830545004287274, "grad_norm": 7.340824127197266, "learning_rate": 1.8883396728498417e-05, "loss": 1.9608, "step": 276400 }, { "epoch": 0.05585074418844223, "grad_norm": 10.24260139465332, "learning_rate": 1.8882992744652112e-05, "loss": 1.8277, "step": 276500 }, { "epoch": 0.05587094337259718, "grad_norm": 8.419562339782715, "learning_rate": 1.888258876080581e-05, "loss": 1.9033, "step": 276600 }, { "epoch": 0.055891142556752135, "grad_norm": 5.643170356750488, "learning_rate": 1.888218477695951e-05, "loss": 1.9123, "step": 276700 }, { "epoch": 0.05591134174090708, "grad_norm": 9.960338592529297, "learning_rate": 1.888178079311321e-05, "loss": 1.8834, "step": 276800 }, { "epoch": 0.055931540925062036, "grad_norm": 4.2921833992004395, "learning_rate": 1.8881376809266905e-05, "loss": 1.8568, "step": 276900 }, { "epoch": 0.05595174010921699, "grad_norm": 10.309284210205078, "learning_rate": 1.8880972825420605e-05, "loss": 1.8473, "step": 277000 }, { "epoch": 0.05595174010921699, "eval_calculated_loss": 9.072406768798828, "eval_loss": 2.1453771591186523, "eval_perplexity": 8711.565318513192, "eval_runtime": 124.3778, "eval_samples_per_second": 8.024, "eval_steps_per_second": 2.01, "step": 277000 }, { "epoch": 0.05597193929337194, "grad_norm": 6.915069103240967, "learning_rate": 1.8880568841574304e-05, "loss": 1.8919, "step": 277100 }, { "epoch": 0.05599213847752689, "grad_norm": 6.8476338386535645, "learning_rate": 1.8880164857728e-05, "loss": 1.8666, "step": 277200 }, { "epoch": 0.056012337661681844, "grad_norm": 8.47436809539795, "learning_rate": 1.88797608738817e-05, "loss": 1.8535, "step": 277300 }, { "epoch": 0.0560325368458368, "grad_norm": 8.532501220703125, "learning_rate": 1.8879356890035398e-05, "loss": 1.8619, "step": 277400 }, { "epoch": 0.05605273602999175, "grad_norm": 9.330723762512207, "learning_rate": 1.8878952906189093e-05, "loss": 1.7652, "step": 277500 }, { "epoch": 0.0560729352141467, "grad_norm": 6.718904495239258, "learning_rate": 1.8878548922342793e-05, "loss": 1.8072, "step": 277600 }, { "epoch": 0.05609313439830165, "grad_norm": 7.624213695526123, "learning_rate": 1.8878144938496488e-05, "loss": 1.8835, "step": 277700 }, { "epoch": 0.056113333582456605, "grad_norm": 10.099466323852539, "learning_rate": 1.887774095465019e-05, "loss": 1.8754, "step": 277800 }, { "epoch": 0.05613353276661156, "grad_norm": 9.434712409973145, "learning_rate": 1.8877336970803887e-05, "loss": 1.868, "step": 277900 }, { "epoch": 0.056153731950766506, "grad_norm": 5.100517272949219, "learning_rate": 1.8876932986957586e-05, "loss": 1.8691, "step": 278000 }, { "epoch": 0.056153731950766506, "eval_calculated_loss": 8.905409812927246, "eval_loss": 2.1401329040527344, "eval_perplexity": 7371.74562717059, "eval_runtime": 124.7607, "eval_samples_per_second": 7.999, "eval_steps_per_second": 2.004, "step": 278000 }, { "epoch": 0.05617393113492146, "grad_norm": 7.669050693511963, "learning_rate": 1.8876529003111285e-05, "loss": 1.9306, "step": 278100 }, { "epoch": 0.05619413031907641, "grad_norm": 9.161352157592773, "learning_rate": 1.887612501926498e-05, "loss": 1.8132, "step": 278200 }, { "epoch": 0.05621432950323137, "grad_norm": 10.330587387084961, "learning_rate": 1.887572103541868e-05, "loss": 1.8073, "step": 278300 }, { "epoch": 0.05623452868738631, "grad_norm": 5.805448532104492, "learning_rate": 1.887531705157238e-05, "loss": 1.8707, "step": 278400 }, { "epoch": 0.05625472787154127, "grad_norm": 8.651413917541504, "learning_rate": 1.8874913067726075e-05, "loss": 1.7918, "step": 278500 }, { "epoch": 0.05627492705569622, "grad_norm": 8.659919738769531, "learning_rate": 1.8874509083879774e-05, "loss": 1.8331, "step": 278600 }, { "epoch": 0.056295126239851175, "grad_norm": 5.096415996551514, "learning_rate": 1.887410510003347e-05, "loss": 1.755, "step": 278700 }, { "epoch": 0.05631532542400612, "grad_norm": 7.520391941070557, "learning_rate": 1.887370111618717e-05, "loss": 1.8949, "step": 278800 }, { "epoch": 0.056335524608161075, "grad_norm": 9.043025970458984, "learning_rate": 1.8873297132340868e-05, "loss": 1.869, "step": 278900 }, { "epoch": 0.05635572379231603, "grad_norm": 9.233643531799316, "learning_rate": 1.8872893148494567e-05, "loss": 1.81, "step": 279000 }, { "epoch": 0.05635572379231603, "eval_calculated_loss": 8.91842269897461, "eval_loss": 2.150230646133423, "eval_perplexity": 7468.300177162122, "eval_runtime": 122.8425, "eval_samples_per_second": 8.124, "eval_steps_per_second": 2.035, "step": 279000 }, { "epoch": 0.05637592297647098, "grad_norm": 9.039660453796387, "learning_rate": 1.8872489164648266e-05, "loss": 1.7937, "step": 279100 }, { "epoch": 0.05639612216062593, "grad_norm": 8.863395690917969, "learning_rate": 1.887208518080196e-05, "loss": 1.8713, "step": 279200 }, { "epoch": 0.05641632134478088, "grad_norm": 3.8841583728790283, "learning_rate": 1.887168119695566e-05, "loss": 1.8569, "step": 279300 }, { "epoch": 0.05643652052893584, "grad_norm": 9.705349922180176, "learning_rate": 1.8871277213109356e-05, "loss": 1.7902, "step": 279400 }, { "epoch": 0.05645671971309079, "grad_norm": 8.797986030578613, "learning_rate": 1.8870873229263056e-05, "loss": 1.9302, "step": 279500 }, { "epoch": 0.05647691889724574, "grad_norm": 12.501291275024414, "learning_rate": 1.8870469245416755e-05, "loss": 1.8505, "step": 279600 }, { "epoch": 0.05649711808140069, "grad_norm": 6.297443389892578, "learning_rate": 1.887006526157045e-05, "loss": 1.894, "step": 279700 }, { "epoch": 0.056517317265555644, "grad_norm": 5.903554439544678, "learning_rate": 1.886966127772415e-05, "loss": 1.892, "step": 279800 }, { "epoch": 0.0565375164497106, "grad_norm": 3.3455371856689453, "learning_rate": 1.886925729387785e-05, "loss": 1.889, "step": 279900 }, { "epoch": 0.056557715633865545, "grad_norm": 7.502070903778076, "learning_rate": 1.8868853310031548e-05, "loss": 1.8294, "step": 280000 }, { "epoch": 0.056557715633865545, "eval_calculated_loss": 8.932680130004883, "eval_loss": 2.1497814655303955, "eval_perplexity": 7575.541629039802, "eval_runtime": 125.6968, "eval_samples_per_second": 7.94, "eval_steps_per_second": 1.989, "step": 280000 }, { "epoch": 0.0565779148180205, "grad_norm": 5.67266845703125, "learning_rate": 1.8868449326185244e-05, "loss": 1.8231, "step": 280100 }, { "epoch": 0.05659811400217545, "grad_norm": 11.055676460266113, "learning_rate": 1.8868045342338943e-05, "loss": 1.8403, "step": 280200 }, { "epoch": 0.056618313186330406, "grad_norm": 9.937235832214355, "learning_rate": 1.8867641358492642e-05, "loss": 1.7933, "step": 280300 }, { "epoch": 0.05663851237048535, "grad_norm": 6.529809951782227, "learning_rate": 1.8867237374646338e-05, "loss": 1.8473, "step": 280400 }, { "epoch": 0.056658711554640306, "grad_norm": 6.835728645324707, "learning_rate": 1.8866833390800037e-05, "loss": 1.8454, "step": 280500 }, { "epoch": 0.05667891073879526, "grad_norm": 4.149314880371094, "learning_rate": 1.8866429406953736e-05, "loss": 1.8006, "step": 280600 }, { "epoch": 0.056699109922950214, "grad_norm": 6.002569198608398, "learning_rate": 1.886602542310743e-05, "loss": 1.8281, "step": 280700 }, { "epoch": 0.05671930910710517, "grad_norm": 5.513141632080078, "learning_rate": 1.886562143926113e-05, "loss": 1.7618, "step": 280800 }, { "epoch": 0.056739508291260114, "grad_norm": 8.147765159606934, "learning_rate": 1.886521745541483e-05, "loss": 1.8311, "step": 280900 }, { "epoch": 0.05675970747541507, "grad_norm": 6.387617111206055, "learning_rate": 1.886481347156853e-05, "loss": 1.873, "step": 281000 }, { "epoch": 0.05675970747541507, "eval_calculated_loss": 8.863032341003418, "eval_loss": 2.151414632797241, "eval_perplexity": 7065.876438534216, "eval_runtime": 123.3276, "eval_samples_per_second": 8.092, "eval_steps_per_second": 2.027, "step": 281000 }, { "epoch": 0.05677990665957002, "grad_norm": 6.217083930969238, "learning_rate": 1.8864409487722225e-05, "loss": 1.7987, "step": 281100 }, { "epoch": 0.056800105843724975, "grad_norm": 7.763278961181641, "learning_rate": 1.8864005503875924e-05, "loss": 1.8206, "step": 281200 }, { "epoch": 0.05682030502787992, "grad_norm": 8.837784767150879, "learning_rate": 1.8863601520029623e-05, "loss": 1.9021, "step": 281300 }, { "epoch": 0.056840504212034876, "grad_norm": 8.170931816101074, "learning_rate": 1.886319753618332e-05, "loss": 1.8841, "step": 281400 }, { "epoch": 0.05686070339618983, "grad_norm": 3.7434990406036377, "learning_rate": 1.8862793552337018e-05, "loss": 1.8284, "step": 281500 }, { "epoch": 0.05688090258034478, "grad_norm": 9.986892700195312, "learning_rate": 1.8862389568490717e-05, "loss": 1.8492, "step": 281600 }, { "epoch": 0.05690110176449973, "grad_norm": 5.173144817352295, "learning_rate": 1.8861985584644413e-05, "loss": 1.7905, "step": 281700 }, { "epoch": 0.056921300948654684, "grad_norm": 8.05146312713623, "learning_rate": 1.8861581600798112e-05, "loss": 1.9277, "step": 281800 }, { "epoch": 0.05694150013280964, "grad_norm": 13.096009254455566, "learning_rate": 1.8861177616951808e-05, "loss": 1.8848, "step": 281900 }, { "epoch": 0.05696169931696459, "grad_norm": 10.18315315246582, "learning_rate": 1.886077363310551e-05, "loss": 1.9285, "step": 282000 }, { "epoch": 0.05696169931696459, "eval_calculated_loss": 8.829548835754395, "eval_loss": 2.1397573947906494, "eval_perplexity": 6833.203223094732, "eval_runtime": 122.4985, "eval_samples_per_second": 8.147, "eval_steps_per_second": 2.041, "step": 282000 }, { "epoch": 0.05698189850111954, "grad_norm": 7.368905544281006, "learning_rate": 1.8860369649259206e-05, "loss": 1.8573, "step": 282100 }, { "epoch": 0.05700209768527449, "grad_norm": 9.454216003417969, "learning_rate": 1.8859965665412905e-05, "loss": 1.8838, "step": 282200 }, { "epoch": 0.057022296869429445, "grad_norm": 6.477413177490234, "learning_rate": 1.8859561681566604e-05, "loss": 1.8447, "step": 282300 }, { "epoch": 0.0570424960535844, "grad_norm": 7.279499053955078, "learning_rate": 1.88591576977203e-05, "loss": 1.8197, "step": 282400 }, { "epoch": 0.057062695237739346, "grad_norm": 10.389497756958008, "learning_rate": 1.8858753713874e-05, "loss": 1.804, "step": 282500 }, { "epoch": 0.0570828944218943, "grad_norm": 11.44560718536377, "learning_rate": 1.8858349730027695e-05, "loss": 1.9147, "step": 282600 }, { "epoch": 0.05710309360604925, "grad_norm": 10.171195983886719, "learning_rate": 1.8857945746181394e-05, "loss": 1.9158, "step": 282700 }, { "epoch": 0.05712329279020421, "grad_norm": 7.864814758300781, "learning_rate": 1.8857541762335093e-05, "loss": 1.7692, "step": 282800 }, { "epoch": 0.057143491974359154, "grad_norm": 7.6262736320495605, "learning_rate": 1.885713777848879e-05, "loss": 1.8474, "step": 282900 }, { "epoch": 0.05716369115851411, "grad_norm": 6.068788528442383, "learning_rate": 1.885673379464249e-05, "loss": 1.8632, "step": 283000 }, { "epoch": 0.05716369115851411, "eval_calculated_loss": 8.911909103393555, "eval_loss": 2.1418750286102295, "eval_perplexity": 7419.812775224504, "eval_runtime": 125.8627, "eval_samples_per_second": 7.929, "eval_steps_per_second": 1.986, "step": 283000 }, { "epoch": 0.05718389034266906, "grad_norm": 13.944077491760254, "learning_rate": 1.8856329810796187e-05, "loss": 1.8523, "step": 283100 }, { "epoch": 0.057204089526824015, "grad_norm": 10.603829383850098, "learning_rate": 1.8855925826949886e-05, "loss": 1.8663, "step": 283200 }, { "epoch": 0.05722428871097896, "grad_norm": 5.125158786773682, "learning_rate": 1.8855521843103585e-05, "loss": 1.8553, "step": 283300 }, { "epoch": 0.057244487895133915, "grad_norm": 6.640757083892822, "learning_rate": 1.885511785925728e-05, "loss": 1.7719, "step": 283400 }, { "epoch": 0.05726468707928887, "grad_norm": 6.303879737854004, "learning_rate": 1.885471387541098e-05, "loss": 1.7822, "step": 283500 }, { "epoch": 0.05728488626344382, "grad_norm": 7.935580730438232, "learning_rate": 1.8854309891564676e-05, "loss": 1.7796, "step": 283600 }, { "epoch": 0.05730508544759877, "grad_norm": 7.75797176361084, "learning_rate": 1.8853905907718375e-05, "loss": 1.8693, "step": 283700 }, { "epoch": 0.05732528463175372, "grad_norm": 7.938492298126221, "learning_rate": 1.8853501923872074e-05, "loss": 1.8171, "step": 283800 }, { "epoch": 0.05734548381590868, "grad_norm": 10.951003074645996, "learning_rate": 1.885309794002577e-05, "loss": 1.8741, "step": 283900 }, { "epoch": 0.05736568300006363, "grad_norm": 6.343652248382568, "learning_rate": 1.8852693956179472e-05, "loss": 1.914, "step": 284000 }, { "epoch": 0.05736568300006363, "eval_calculated_loss": 8.941390991210938, "eval_loss": 2.152419090270996, "eval_perplexity": 7641.819369740441, "eval_runtime": 125.6675, "eval_samples_per_second": 7.942, "eval_steps_per_second": 1.989, "step": 284000 }, { "epoch": 0.05738588218421858, "grad_norm": 9.449675559997559, "learning_rate": 1.8852289972333168e-05, "loss": 1.9647, "step": 284100 }, { "epoch": 0.05740608136837353, "grad_norm": 8.993537902832031, "learning_rate": 1.8851885988486867e-05, "loss": 1.9291, "step": 284200 }, { "epoch": 0.057426280552528484, "grad_norm": 5.404361248016357, "learning_rate": 1.8851482004640563e-05, "loss": 1.8709, "step": 284300 }, { "epoch": 0.05744647973668344, "grad_norm": 7.828668594360352, "learning_rate": 1.8851078020794262e-05, "loss": 1.8086, "step": 284400 }, { "epoch": 0.057466678920838385, "grad_norm": 7.424640655517578, "learning_rate": 1.885067403694796e-05, "loss": 1.8445, "step": 284500 }, { "epoch": 0.05748687810499334, "grad_norm": 9.658270835876465, "learning_rate": 1.8850270053101657e-05, "loss": 1.8715, "step": 284600 }, { "epoch": 0.05750707728914829, "grad_norm": 7.361419200897217, "learning_rate": 1.8849866069255356e-05, "loss": 1.8212, "step": 284700 }, { "epoch": 0.057527276473303246, "grad_norm": 10.040990829467773, "learning_rate": 1.8849462085409055e-05, "loss": 1.8264, "step": 284800 }, { "epoch": 0.05754747565745819, "grad_norm": 5.217296123504639, "learning_rate": 1.884905810156275e-05, "loss": 1.8265, "step": 284900 }, { "epoch": 0.057567674841613146, "grad_norm": 5.577116012573242, "learning_rate": 1.884865411771645e-05, "loss": 1.8532, "step": 285000 }, { "epoch": 0.057567674841613146, "eval_calculated_loss": 9.028349876403809, "eval_loss": 2.154672384262085, "eval_perplexity": 8336.092633569955, "eval_runtime": 120.8416, "eval_samples_per_second": 8.259, "eval_steps_per_second": 2.069, "step": 285000 }, { "epoch": 0.0575878740257681, "grad_norm": 8.130290985107422, "learning_rate": 1.884825013387015e-05, "loss": 1.9271, "step": 285100 }, { "epoch": 0.057608073209923054, "grad_norm": 6.534633159637451, "learning_rate": 1.884784615002385e-05, "loss": 1.8085, "step": 285200 }, { "epoch": 0.057628272394078, "grad_norm": 8.644725799560547, "learning_rate": 1.8847442166177544e-05, "loss": 1.8591, "step": 285300 }, { "epoch": 0.057648471578232954, "grad_norm": 10.969131469726562, "learning_rate": 1.8847038182331243e-05, "loss": 1.8914, "step": 285400 }, { "epoch": 0.05766867076238791, "grad_norm": 6.606294631958008, "learning_rate": 1.8846634198484942e-05, "loss": 1.8606, "step": 285500 }, { "epoch": 0.05768886994654286, "grad_norm": 5.591391086578369, "learning_rate": 1.8846230214638638e-05, "loss": 1.8593, "step": 285600 }, { "epoch": 0.05770906913069781, "grad_norm": 7.542158126831055, "learning_rate": 1.8845826230792337e-05, "loss": 1.8882, "step": 285700 }, { "epoch": 0.05772926831485276, "grad_norm": 5.561542987823486, "learning_rate": 1.8845422246946033e-05, "loss": 1.8866, "step": 285800 }, { "epoch": 0.057749467499007716, "grad_norm": 9.306262969970703, "learning_rate": 1.8845018263099732e-05, "loss": 1.9024, "step": 285900 }, { "epoch": 0.05776966668316267, "grad_norm": 7.716490745544434, "learning_rate": 1.884461427925343e-05, "loss": 1.919, "step": 286000 }, { "epoch": 0.05776966668316267, "eval_calculated_loss": 8.884687423706055, "eval_loss": 2.152413845062256, "eval_perplexity": 7220.557346445516, "eval_runtime": 122.5619, "eval_samples_per_second": 8.143, "eval_steps_per_second": 2.04, "step": 286000 }, { "epoch": 0.057789865867317616, "grad_norm": 7.7936882972717285, "learning_rate": 1.884421029540713e-05, "loss": 1.8444, "step": 286100 }, { "epoch": 0.05781006505147257, "grad_norm": 6.609826564788818, "learning_rate": 1.884380631156083e-05, "loss": 1.7576, "step": 286200 }, { "epoch": 0.057830264235627524, "grad_norm": 9.667623519897461, "learning_rate": 1.8843402327714525e-05, "loss": 1.7947, "step": 286300 }, { "epoch": 0.05785046341978248, "grad_norm": 5.695378303527832, "learning_rate": 1.8842998343868224e-05, "loss": 1.8144, "step": 286400 }, { "epoch": 0.057870662603937424, "grad_norm": 7.10432243347168, "learning_rate": 1.8842594360021923e-05, "loss": 1.7669, "step": 286500 }, { "epoch": 0.05789086178809238, "grad_norm": 8.933112144470215, "learning_rate": 1.884219037617562e-05, "loss": 1.8275, "step": 286600 }, { "epoch": 0.05791106097224733, "grad_norm": 7.373722076416016, "learning_rate": 1.8841786392329318e-05, "loss": 1.8113, "step": 286700 }, { "epoch": 0.057931260156402285, "grad_norm": 8.973371505737305, "learning_rate": 1.8841382408483014e-05, "loss": 1.9357, "step": 286800 }, { "epoch": 0.05795145934055723, "grad_norm": 12.044899940490723, "learning_rate": 1.8840978424636713e-05, "loss": 1.7267, "step": 286900 }, { "epoch": 0.057971658524712186, "grad_norm": 5.867072582244873, "learning_rate": 1.8840574440790412e-05, "loss": 1.8709, "step": 287000 }, { "epoch": 0.057971658524712186, "eval_calculated_loss": 9.105512619018555, "eval_loss": 2.1476528644561768, "eval_perplexity": 9004.796141791066, "eval_runtime": 122.3193, "eval_samples_per_second": 8.159, "eval_steps_per_second": 2.044, "step": 287000 }, { "epoch": 0.05799185770886714, "grad_norm": 8.000768661499023, "learning_rate": 1.8840170456944108e-05, "loss": 1.889, "step": 287100 }, { "epoch": 0.05801205689302209, "grad_norm": 8.289738655090332, "learning_rate": 1.883976647309781e-05, "loss": 1.9274, "step": 287200 }, { "epoch": 0.05803225607717704, "grad_norm": 4.692756652832031, "learning_rate": 1.8839362489251506e-05, "loss": 1.7724, "step": 287300 }, { "epoch": 0.058052455261331994, "grad_norm": 7.936660289764404, "learning_rate": 1.8838958505405205e-05, "loss": 1.7815, "step": 287400 }, { "epoch": 0.05807265444548695, "grad_norm": 4.1101202964782715, "learning_rate": 1.88385545215589e-05, "loss": 1.7337, "step": 287500 }, { "epoch": 0.0580928536296419, "grad_norm": 8.616670608520508, "learning_rate": 1.88381505377126e-05, "loss": 1.8791, "step": 287600 }, { "epoch": 0.05811305281379685, "grad_norm": 4.515771865844727, "learning_rate": 1.88377465538663e-05, "loss": 1.7707, "step": 287700 }, { "epoch": 0.0581332519979518, "grad_norm": 13.756794929504395, "learning_rate": 1.8837342570019995e-05, "loss": 1.8092, "step": 287800 }, { "epoch": 0.058153451182106755, "grad_norm": 8.672130584716797, "learning_rate": 1.8836938586173694e-05, "loss": 1.8601, "step": 287900 }, { "epoch": 0.05817365036626171, "grad_norm": 7.711603164672852, "learning_rate": 1.8836534602327393e-05, "loss": 1.8485, "step": 288000 }, { "epoch": 0.05817365036626171, "eval_calculated_loss": 9.04776382446289, "eval_loss": 2.14197039604187, "eval_perplexity": 8499.510260977135, "eval_runtime": 123.9053, "eval_samples_per_second": 8.055, "eval_steps_per_second": 2.018, "step": 288000 }, { "epoch": 0.058193849550416656, "grad_norm": 8.764796257019043, "learning_rate": 1.883613061848109e-05, "loss": 1.8617, "step": 288100 }, { "epoch": 0.05821404873457161, "grad_norm": 8.02717399597168, "learning_rate": 1.883572663463479e-05, "loss": 1.8639, "step": 288200 }, { "epoch": 0.05823424791872656, "grad_norm": 9.946102142333984, "learning_rate": 1.8835322650788487e-05, "loss": 1.9307, "step": 288300 }, { "epoch": 0.05825444710288152, "grad_norm": 8.751115798950195, "learning_rate": 1.8834918666942187e-05, "loss": 1.8374, "step": 288400 }, { "epoch": 0.05827464628703646, "grad_norm": 8.751819610595703, "learning_rate": 1.8834514683095882e-05, "loss": 1.8386, "step": 288500 }, { "epoch": 0.05829484547119142, "grad_norm": 8.76799488067627, "learning_rate": 1.883411069924958e-05, "loss": 1.8415, "step": 288600 }, { "epoch": 0.05831504465534637, "grad_norm": 5.46577262878418, "learning_rate": 1.883370671540328e-05, "loss": 1.8383, "step": 288700 }, { "epoch": 0.058335243839501325, "grad_norm": 7.347733974456787, "learning_rate": 1.8833302731556976e-05, "loss": 1.817, "step": 288800 }, { "epoch": 0.05835544302365627, "grad_norm": 7.465794563293457, "learning_rate": 1.8832898747710675e-05, "loss": 1.8172, "step": 288900 }, { "epoch": 0.058375642207811225, "grad_norm": 10.16101360321045, "learning_rate": 1.8832494763864375e-05, "loss": 1.8795, "step": 289000 }, { "epoch": 0.058375642207811225, "eval_calculated_loss": 9.03345775604248, "eval_loss": 2.150200128555298, "eval_perplexity": 8378.78132292881, "eval_runtime": 123.993, "eval_samples_per_second": 8.049, "eval_steps_per_second": 2.016, "step": 289000 }, { "epoch": 0.05839584139196618, "grad_norm": 8.103070259094238, "learning_rate": 1.883209078001807e-05, "loss": 1.8562, "step": 289100 }, { "epoch": 0.05841604057612113, "grad_norm": 2.1547601222991943, "learning_rate": 1.883168679617177e-05, "loss": 1.8329, "step": 289200 }, { "epoch": 0.05843623976027608, "grad_norm": 6.923336982727051, "learning_rate": 1.883128281232547e-05, "loss": 1.7422, "step": 289300 }, { "epoch": 0.05845643894443103, "grad_norm": 6.27888822555542, "learning_rate": 1.8830878828479168e-05, "loss": 1.8987, "step": 289400 }, { "epoch": 0.05847663812858599, "grad_norm": 5.705938339233398, "learning_rate": 1.8830474844632863e-05, "loss": 1.8941, "step": 289500 }, { "epoch": 0.05849683731274094, "grad_norm": 9.813825607299805, "learning_rate": 1.8830070860786563e-05, "loss": 1.8481, "step": 289600 }, { "epoch": 0.05851703649689589, "grad_norm": 7.035060405731201, "learning_rate": 1.882966687694026e-05, "loss": 1.8493, "step": 289700 }, { "epoch": 0.05853723568105084, "grad_norm": 7.663398265838623, "learning_rate": 1.8829262893093957e-05, "loss": 1.842, "step": 289800 }, { "epoch": 0.058557434865205794, "grad_norm": 8.571109771728516, "learning_rate": 1.8828858909247657e-05, "loss": 1.9147, "step": 289900 }, { "epoch": 0.05857763404936075, "grad_norm": 8.670381546020508, "learning_rate": 1.8828454925401352e-05, "loss": 1.8294, "step": 290000 }, { "epoch": 0.05857763404936075, "eval_calculated_loss": 8.952350616455078, "eval_loss": 2.1408040523529053, "eval_perplexity": 7726.031469830199, "eval_runtime": 123.6065, "eval_samples_per_second": 8.074, "eval_steps_per_second": 2.023, "step": 290000 }, { "epoch": 0.058597833233515695, "grad_norm": 3.9622297286987305, "learning_rate": 1.882805094155505e-05, "loss": 1.8735, "step": 290100 }, { "epoch": 0.05861803241767065, "grad_norm": 8.827208518981934, "learning_rate": 1.882764695770875e-05, "loss": 1.8984, "step": 290200 }, { "epoch": 0.0586382316018256, "grad_norm": 7.399585247039795, "learning_rate": 1.882724297386245e-05, "loss": 1.8085, "step": 290300 }, { "epoch": 0.058658430785980556, "grad_norm": 9.308290481567383, "learning_rate": 1.882683899001615e-05, "loss": 1.8489, "step": 290400 }, { "epoch": 0.05867862997013551, "grad_norm": 7.4223713874816895, "learning_rate": 1.8826435006169845e-05, "loss": 1.9251, "step": 290500 }, { "epoch": 0.058698829154290456, "grad_norm": 9.312091827392578, "learning_rate": 1.8826031022323544e-05, "loss": 1.7737, "step": 290600 }, { "epoch": 0.05871902833844541, "grad_norm": 8.067108154296875, "learning_rate": 1.882562703847724e-05, "loss": 1.9447, "step": 290700 }, { "epoch": 0.058739227522600364, "grad_norm": 4.001662254333496, "learning_rate": 1.882522305463094e-05, "loss": 1.7712, "step": 290800 }, { "epoch": 0.05875942670675532, "grad_norm": 8.489066123962402, "learning_rate": 1.8824819070784638e-05, "loss": 1.9064, "step": 290900 }, { "epoch": 0.058779625890910264, "grad_norm": 6.467322826385498, "learning_rate": 1.8824415086938333e-05, "loss": 1.7693, "step": 291000 }, { "epoch": 0.058779625890910264, "eval_calculated_loss": 9.078303337097168, "eval_loss": 2.139507293701172, "eval_perplexity": 8763.085404976244, "eval_runtime": 122.993, "eval_samples_per_second": 8.114, "eval_steps_per_second": 2.033, "step": 291000 }, { "epoch": 0.05879982507506522, "grad_norm": 8.048219680786133, "learning_rate": 1.8824011103092033e-05, "loss": 1.871, "step": 291100 }, { "epoch": 0.05882002425922017, "grad_norm": 8.119820594787598, "learning_rate": 1.882360711924573e-05, "loss": 1.8558, "step": 291200 }, { "epoch": 0.058840223443375125, "grad_norm": 7.596703052520752, "learning_rate": 1.882320313539943e-05, "loss": 1.7746, "step": 291300 }, { "epoch": 0.05886042262753007, "grad_norm": 9.53935718536377, "learning_rate": 1.882279915155313e-05, "loss": 1.8562, "step": 291400 }, { "epoch": 0.058880621811685026, "grad_norm": 4.813135147094727, "learning_rate": 1.8822395167706826e-05, "loss": 1.7856, "step": 291500 }, { "epoch": 0.05890082099583998, "grad_norm": 7.201075553894043, "learning_rate": 1.8821991183860525e-05, "loss": 1.8813, "step": 291600 }, { "epoch": 0.05892102017999493, "grad_norm": 9.571510314941406, "learning_rate": 1.882158720001422e-05, "loss": 1.7981, "step": 291700 }, { "epoch": 0.05894121936414988, "grad_norm": 5.642327785491943, "learning_rate": 1.882118321616792e-05, "loss": 1.8516, "step": 291800 }, { "epoch": 0.058961418548304834, "grad_norm": 5.706386566162109, "learning_rate": 1.882077923232162e-05, "loss": 1.8813, "step": 291900 }, { "epoch": 0.05898161773245979, "grad_norm": 11.412710189819336, "learning_rate": 1.8820375248475314e-05, "loss": 1.9098, "step": 292000 }, { "epoch": 0.05898161773245979, "eval_calculated_loss": 8.96032428741455, "eval_loss": 2.146820306777954, "eval_perplexity": 7787.88256502953, "eval_runtime": 123.471, "eval_samples_per_second": 8.083, "eval_steps_per_second": 2.025, "step": 292000 }, { "epoch": 0.05900181691661474, "grad_norm": 6.749436378479004, "learning_rate": 1.8819971264629014e-05, "loss": 1.8569, "step": 292100 }, { "epoch": 0.05902201610076969, "grad_norm": 5.565478324890137, "learning_rate": 1.8819567280782713e-05, "loss": 1.8215, "step": 292200 }, { "epoch": 0.05904221528492464, "grad_norm": 7.462223052978516, "learning_rate": 1.8819163296936412e-05, "loss": 1.8982, "step": 292300 }, { "epoch": 0.059062414469079595, "grad_norm": 11.229182243347168, "learning_rate": 1.8818759313090108e-05, "loss": 1.8571, "step": 292400 }, { "epoch": 0.05908261365323455, "grad_norm": 7.170738220214844, "learning_rate": 1.8818355329243807e-05, "loss": 1.8809, "step": 292500 }, { "epoch": 0.059102812837389496, "grad_norm": 11.169904708862305, "learning_rate": 1.8817951345397506e-05, "loss": 1.8976, "step": 292600 }, { "epoch": 0.05912301202154445, "grad_norm": 7.03305196762085, "learning_rate": 1.88175473615512e-05, "loss": 1.8593, "step": 292700 }, { "epoch": 0.0591432112056994, "grad_norm": 8.465617179870605, "learning_rate": 1.88171433777049e-05, "loss": 1.8263, "step": 292800 }, { "epoch": 0.05916341038985436, "grad_norm": 6.483994960784912, "learning_rate": 1.88167393938586e-05, "loss": 1.8924, "step": 292900 }, { "epoch": 0.059183609574009304, "grad_norm": 8.460470199584961, "learning_rate": 1.8816335410012296e-05, "loss": 1.8353, "step": 293000 }, { "epoch": 0.059183609574009304, "eval_calculated_loss": 8.968130111694336, "eval_loss": 2.1372995376586914, "eval_perplexity": 7848.9112877088155, "eval_runtime": 125.5536, "eval_samples_per_second": 7.949, "eval_steps_per_second": 1.991, "step": 293000 }, { "epoch": 0.05920380875816426, "grad_norm": 7.796407222747803, "learning_rate": 1.8815931426165995e-05, "loss": 1.9046, "step": 293100 }, { "epoch": 0.05922400794231921, "grad_norm": 6.784018039703369, "learning_rate": 1.881552744231969e-05, "loss": 1.8403, "step": 293200 }, { "epoch": 0.059244207126474165, "grad_norm": 3.4973504543304443, "learning_rate": 1.881512345847339e-05, "loss": 1.778, "step": 293300 }, { "epoch": 0.05926440631062911, "grad_norm": 8.88313102722168, "learning_rate": 1.881471947462709e-05, "loss": 1.7406, "step": 293400 }, { "epoch": 0.059284605494784065, "grad_norm": 4.571095943450928, "learning_rate": 1.8814315490780788e-05, "loss": 1.7881, "step": 293500 }, { "epoch": 0.05930480467893902, "grad_norm": 4.136029243469238, "learning_rate": 1.8813911506934487e-05, "loss": 1.8912, "step": 293600 }, { "epoch": 0.05932500386309397, "grad_norm": 4.088936805725098, "learning_rate": 1.8813507523088183e-05, "loss": 1.8922, "step": 293700 }, { "epoch": 0.05934520304724892, "grad_norm": 4.449787139892578, "learning_rate": 1.8813103539241882e-05, "loss": 1.8514, "step": 293800 }, { "epoch": 0.05936540223140387, "grad_norm": 6.124415874481201, "learning_rate": 1.881269955539558e-05, "loss": 1.8456, "step": 293900 }, { "epoch": 0.05938560141555883, "grad_norm": 7.795497894287109, "learning_rate": 1.8812295571549277e-05, "loss": 1.8633, "step": 294000 }, { "epoch": 0.05938560141555883, "eval_calculated_loss": 8.931365013122559, "eval_loss": 2.152730703353882, "eval_perplexity": 7565.585454552238, "eval_runtime": 118.9812, "eval_samples_per_second": 8.388, "eval_steps_per_second": 2.101, "step": 294000 }, { "epoch": 0.05940580059971378, "grad_norm": 8.354081153869629, "learning_rate": 1.8811891587702976e-05, "loss": 1.8335, "step": 294100 }, { "epoch": 0.05942599978386873, "grad_norm": 7.298532962799072, "learning_rate": 1.881148760385667e-05, "loss": 1.8549, "step": 294200 }, { "epoch": 0.05944619896802368, "grad_norm": 16.073911666870117, "learning_rate": 1.881108362001037e-05, "loss": 1.8252, "step": 294300 }, { "epoch": 0.059466398152178634, "grad_norm": 4.452024936676025, "learning_rate": 1.881067963616407e-05, "loss": 1.8769, "step": 294400 }, { "epoch": 0.05948659733633359, "grad_norm": 7.612700939178467, "learning_rate": 1.881027565231777e-05, "loss": 1.8504, "step": 294500 }, { "epoch": 0.059506796520488535, "grad_norm": 7.387007713317871, "learning_rate": 1.8809871668471468e-05, "loss": 1.8274, "step": 294600 }, { "epoch": 0.05952699570464349, "grad_norm": 6.587162017822266, "learning_rate": 1.8809467684625164e-05, "loss": 1.88, "step": 294700 }, { "epoch": 0.05954719488879844, "grad_norm": 8.963000297546387, "learning_rate": 1.8809063700778863e-05, "loss": 1.8201, "step": 294800 }, { "epoch": 0.059567394072953396, "grad_norm": 13.76458740234375, "learning_rate": 1.880865971693256e-05, "loss": 1.9146, "step": 294900 }, { "epoch": 0.05958759325710834, "grad_norm": 10.244553565979004, "learning_rate": 1.8808255733086258e-05, "loss": 1.7793, "step": 295000 }, { "epoch": 0.05958759325710834, "eval_calculated_loss": 8.77422046661377, "eval_loss": 2.1447534561157227, "eval_perplexity": 6465.401976507899, "eval_runtime": 124.59, "eval_samples_per_second": 8.01, "eval_steps_per_second": 2.007, "step": 295000 }, { "epoch": 0.059607792441263296, "grad_norm": 4.403097629547119, "learning_rate": 1.8807851749239957e-05, "loss": 1.8825, "step": 295100 }, { "epoch": 0.05962799162541825, "grad_norm": 17.561044692993164, "learning_rate": 1.8807447765393653e-05, "loss": 1.7668, "step": 295200 }, { "epoch": 0.059648190809573204, "grad_norm": 9.193973541259766, "learning_rate": 1.8807043781547352e-05, "loss": 1.8135, "step": 295300 }, { "epoch": 0.05966838999372815, "grad_norm": 7.576569557189941, "learning_rate": 1.880663979770105e-05, "loss": 1.868, "step": 295400 }, { "epoch": 0.059688589177883104, "grad_norm": 5.62177038192749, "learning_rate": 1.880623581385475e-05, "loss": 1.7775, "step": 295500 }, { "epoch": 0.05970878836203806, "grad_norm": 5.827737331390381, "learning_rate": 1.8805831830008446e-05, "loss": 1.9268, "step": 295600 }, { "epoch": 0.05972898754619301, "grad_norm": 6.361175060272217, "learning_rate": 1.8805427846162145e-05, "loss": 1.8145, "step": 295700 }, { "epoch": 0.05974918673034796, "grad_norm": 8.640932083129883, "learning_rate": 1.8805023862315844e-05, "loss": 1.7731, "step": 295800 }, { "epoch": 0.05976938591450291, "grad_norm": 7.467153072357178, "learning_rate": 1.880461987846954e-05, "loss": 1.858, "step": 295900 }, { "epoch": 0.059789585098657866, "grad_norm": 8.637483596801758, "learning_rate": 1.880421589462324e-05, "loss": 1.8172, "step": 296000 }, { "epoch": 0.059789585098657866, "eval_calculated_loss": 8.871946334838867, "eval_loss": 2.1386051177978516, "eval_perplexity": 7129.143178286482, "eval_runtime": 123.0412, "eval_samples_per_second": 8.111, "eval_steps_per_second": 2.032, "step": 296000 }, { "epoch": 0.05980978428281282, "grad_norm": 5.19301176071167, "learning_rate": 1.8803811910776938e-05, "loss": 1.8578, "step": 296100 }, { "epoch": 0.059829983466967766, "grad_norm": 8.43254280090332, "learning_rate": 1.8803407926930634e-05, "loss": 1.8352, "step": 296200 }, { "epoch": 0.05985018265112272, "grad_norm": 9.765015602111816, "learning_rate": 1.8803003943084333e-05, "loss": 1.8525, "step": 296300 }, { "epoch": 0.059870381835277674, "grad_norm": 9.782549858093262, "learning_rate": 1.8802599959238032e-05, "loss": 1.8566, "step": 296400 }, { "epoch": 0.05989058101943263, "grad_norm": 7.245795249938965, "learning_rate": 1.880219597539173e-05, "loss": 1.8332, "step": 296500 }, { "epoch": 0.059910780203587574, "grad_norm": 4.290226936340332, "learning_rate": 1.8801791991545427e-05, "loss": 1.8881, "step": 296600 }, { "epoch": 0.05993097938774253, "grad_norm": 7.797712326049805, "learning_rate": 1.8801388007699126e-05, "loss": 1.759, "step": 296700 }, { "epoch": 0.05995117857189748, "grad_norm": 8.675239562988281, "learning_rate": 1.8800984023852825e-05, "loss": 1.7724, "step": 296800 }, { "epoch": 0.059971377756052435, "grad_norm": 7.941835403442383, "learning_rate": 1.880058004000652e-05, "loss": 1.8871, "step": 296900 }, { "epoch": 0.05999157694020738, "grad_norm": 5.934349536895752, "learning_rate": 1.880017605616022e-05, "loss": 1.7868, "step": 297000 }, { "epoch": 0.05999157694020738, "eval_calculated_loss": 8.934648513793945, "eval_loss": 2.151684284210205, "eval_perplexity": 7590.4678878593595, "eval_runtime": 122.7263, "eval_samples_per_second": 8.132, "eval_steps_per_second": 2.037, "step": 297000 }, { "epoch": 0.060011776124362336, "grad_norm": 7.2742767333984375, "learning_rate": 1.879977207231392e-05, "loss": 1.8407, "step": 297100 }, { "epoch": 0.06003197530851729, "grad_norm": 7.976162433624268, "learning_rate": 1.8799368088467615e-05, "loss": 1.8355, "step": 297200 }, { "epoch": 0.06005217449267224, "grad_norm": 7.598889350891113, "learning_rate": 1.8798964104621314e-05, "loss": 1.952, "step": 297300 }, { "epoch": 0.06007237367682719, "grad_norm": 9.059739112854004, "learning_rate": 1.879856012077501e-05, "loss": 1.8012, "step": 297400 }, { "epoch": 0.060092572860982144, "grad_norm": 7.6322126388549805, "learning_rate": 1.8798156136928712e-05, "loss": 1.8202, "step": 297500 }, { "epoch": 0.0601127720451371, "grad_norm": 8.767751693725586, "learning_rate": 1.8797752153082408e-05, "loss": 1.8322, "step": 297600 }, { "epoch": 0.06013297122929205, "grad_norm": 8.597329139709473, "learning_rate": 1.8797348169236107e-05, "loss": 1.8421, "step": 297700 }, { "epoch": 0.060153170413447, "grad_norm": 8.705891609191895, "learning_rate": 1.8796944185389806e-05, "loss": 1.7981, "step": 297800 }, { "epoch": 0.06017336959760195, "grad_norm": 7.034084320068359, "learning_rate": 1.8796540201543502e-05, "loss": 1.7713, "step": 297900 }, { "epoch": 0.060193568781756905, "grad_norm": 5.624641418457031, "learning_rate": 1.87961362176972e-05, "loss": 1.8101, "step": 298000 }, { "epoch": 0.060193568781756905, "eval_calculated_loss": 8.87320613861084, "eval_loss": 2.157646894454956, "eval_perplexity": 7138.130159481218, "eval_runtime": 123.5959, "eval_samples_per_second": 8.075, "eval_steps_per_second": 2.023, "step": 298000 }, { "epoch": 0.06021376796591186, "grad_norm": 7.83519983291626, "learning_rate": 1.8795732233850897e-05, "loss": 1.8487, "step": 298100 }, { "epoch": 0.060233967150066806, "grad_norm": 8.752517700195312, "learning_rate": 1.8795328250004596e-05, "loss": 1.8844, "step": 298200 }, { "epoch": 0.06025416633422176, "grad_norm": 5.755463600158691, "learning_rate": 1.8794924266158295e-05, "loss": 1.8, "step": 298300 }, { "epoch": 0.06027436551837671, "grad_norm": 5.51276969909668, "learning_rate": 1.879452028231199e-05, "loss": 1.847, "step": 298400 }, { "epoch": 0.06029456470253167, "grad_norm": 7.496716022491455, "learning_rate": 1.8794116298465693e-05, "loss": 1.8999, "step": 298500 }, { "epoch": 0.06031476388668661, "grad_norm": 4.61299991607666, "learning_rate": 1.879371231461939e-05, "loss": 1.8859, "step": 298600 }, { "epoch": 0.06033496307084157, "grad_norm": 10.455416679382324, "learning_rate": 1.8793308330773088e-05, "loss": 1.8489, "step": 298700 }, { "epoch": 0.06035516225499652, "grad_norm": 4.959133625030518, "learning_rate": 1.8792904346926787e-05, "loss": 1.9012, "step": 298800 }, { "epoch": 0.060375361439151475, "grad_norm": 6.239687919616699, "learning_rate": 1.8792500363080483e-05, "loss": 1.8654, "step": 298900 }, { "epoch": 0.06039556062330642, "grad_norm": 8.473806381225586, "learning_rate": 1.8792096379234182e-05, "loss": 1.8105, "step": 299000 }, { "epoch": 0.06039556062330642, "eval_calculated_loss": 8.89892292022705, "eval_loss": 2.1515986919403076, "eval_perplexity": 7324.080670148992, "eval_runtime": 124.7503, "eval_samples_per_second": 8.0, "eval_steps_per_second": 2.004, "step": 299000 }, { "epoch": 0.060415759807461375, "grad_norm": 8.274065971374512, "learning_rate": 1.8791692395387878e-05, "loss": 1.9285, "step": 299100 }, { "epoch": 0.06043595899161633, "grad_norm": 7.983005046844482, "learning_rate": 1.8791288411541577e-05, "loss": 1.9793, "step": 299200 }, { "epoch": 0.06045615817577128, "grad_norm": 5.265191555023193, "learning_rate": 1.8790884427695276e-05, "loss": 1.8083, "step": 299300 }, { "epoch": 0.06047635735992623, "grad_norm": 6.762467861175537, "learning_rate": 1.8790480443848972e-05, "loss": 1.7395, "step": 299400 }, { "epoch": 0.06049655654408118, "grad_norm": 4.726485252380371, "learning_rate": 1.879007646000267e-05, "loss": 1.7443, "step": 299500 }, { "epoch": 0.060516755728236137, "grad_norm": 9.697747230529785, "learning_rate": 1.878967247615637e-05, "loss": 1.814, "step": 299600 }, { "epoch": 0.06053695491239109, "grad_norm": 6.460058689117432, "learning_rate": 1.878926849231007e-05, "loss": 1.848, "step": 299700 }, { "epoch": 0.060557154096546044, "grad_norm": 8.234662055969238, "learning_rate": 1.8788864508463765e-05, "loss": 1.8653, "step": 299800 }, { "epoch": 0.06057735328070099, "grad_norm": 10.354362487792969, "learning_rate": 1.8788460524617464e-05, "loss": 1.9711, "step": 299900 }, { "epoch": 0.060597552464855944, "grad_norm": 11.669537544250488, "learning_rate": 1.8788056540771163e-05, "loss": 1.9181, "step": 300000 }, { "epoch": 0.060597552464855944, "eval_calculated_loss": 8.926397323608398, "eval_loss": 2.1472949981689453, "eval_perplexity": 7528.095172161079, "eval_runtime": 122.2397, "eval_samples_per_second": 8.164, "eval_steps_per_second": 2.045, "step": 300000 }, { "epoch": 0.0606177516490109, "grad_norm": 8.791509628295898, "learning_rate": 1.878765255692486e-05, "loss": 1.7994, "step": 300100 }, { "epoch": 0.06063795083316585, "grad_norm": 9.452177047729492, "learning_rate": 1.8787248573078558e-05, "loss": 1.7818, "step": 300200 }, { "epoch": 0.0606581500173208, "grad_norm": 8.201678276062012, "learning_rate": 1.8786844589232257e-05, "loss": 1.8235, "step": 300300 }, { "epoch": 0.06067834920147575, "grad_norm": 6.944396495819092, "learning_rate": 1.8786440605385953e-05, "loss": 1.924, "step": 300400 }, { "epoch": 0.060698548385630706, "grad_norm": 10.114995002746582, "learning_rate": 1.8786036621539652e-05, "loss": 1.8582, "step": 300500 }, { "epoch": 0.06071874756978566, "grad_norm": 4.375490188598633, "learning_rate": 1.878563263769335e-05, "loss": 1.8265, "step": 300600 }, { "epoch": 0.060738946753940606, "grad_norm": 8.894621849060059, "learning_rate": 1.878522865384705e-05, "loss": 1.8956, "step": 300700 }, { "epoch": 0.06075914593809556, "grad_norm": 10.209686279296875, "learning_rate": 1.8784824670000746e-05, "loss": 1.8441, "step": 300800 }, { "epoch": 0.060779345122250514, "grad_norm": 4.875696659088135, "learning_rate": 1.8784420686154445e-05, "loss": 1.7966, "step": 300900 }, { "epoch": 0.06079954430640547, "grad_norm": 7.2163238525390625, "learning_rate": 1.8784016702308145e-05, "loss": 1.8469, "step": 301000 }, { "epoch": 0.06079954430640547, "eval_calculated_loss": 9.05814266204834, "eval_loss": 2.1499903202056885, "eval_perplexity": 8588.18467017603, "eval_runtime": 123.7504, "eval_samples_per_second": 8.065, "eval_steps_per_second": 2.02, "step": 301000 }, { "epoch": 0.060819743490560414, "grad_norm": 5.446864128112793, "learning_rate": 1.878361271846184e-05, "loss": 1.9138, "step": 301100 }, { "epoch": 0.06083994267471537, "grad_norm": 9.220932960510254, "learning_rate": 1.878320873461554e-05, "loss": 1.8371, "step": 301200 }, { "epoch": 0.06086014185887032, "grad_norm": 10.253331184387207, "learning_rate": 1.8782804750769235e-05, "loss": 1.8032, "step": 301300 }, { "epoch": 0.060880341043025275, "grad_norm": 3.5361156463623047, "learning_rate": 1.8782400766922934e-05, "loss": 1.9056, "step": 301400 }, { "epoch": 0.06090054022718022, "grad_norm": 7.462442874908447, "learning_rate": 1.8781996783076633e-05, "loss": 1.846, "step": 301500 }, { "epoch": 0.060920739411335176, "grad_norm": 7.107940196990967, "learning_rate": 1.878159279923033e-05, "loss": 1.8363, "step": 301600 }, { "epoch": 0.06094093859549013, "grad_norm": 7.123429298400879, "learning_rate": 1.878118881538403e-05, "loss": 1.8441, "step": 301700 }, { "epoch": 0.06096113777964508, "grad_norm": 6.913952827453613, "learning_rate": 1.8780784831537727e-05, "loss": 1.8632, "step": 301800 }, { "epoch": 0.06098133696380003, "grad_norm": 5.5179972648620605, "learning_rate": 1.8780380847691427e-05, "loss": 1.7596, "step": 301900 }, { "epoch": 0.061001536147954984, "grad_norm": 6.567690372467041, "learning_rate": 1.8779976863845126e-05, "loss": 1.7948, "step": 302000 }, { "epoch": 0.061001536147954984, "eval_calculated_loss": 9.089357376098633, "eval_loss": 2.13462495803833, "eval_perplexity": 8860.490259509143, "eval_runtime": 126.5892, "eval_samples_per_second": 7.884, "eval_steps_per_second": 1.975, "step": 302000 }, { "epoch": 0.06102173533210994, "grad_norm": 10.963176727294922, "learning_rate": 1.877957287999882e-05, "loss": 1.8348, "step": 302100 }, { "epoch": 0.06104193451626489, "grad_norm": 6.733486652374268, "learning_rate": 1.877916889615252e-05, "loss": 1.8622, "step": 302200 }, { "epoch": 0.06106213370041984, "grad_norm": 4.613994598388672, "learning_rate": 1.8778764912306216e-05, "loss": 1.8263, "step": 302300 }, { "epoch": 0.06108233288457479, "grad_norm": 5.9916768074035645, "learning_rate": 1.8778360928459915e-05, "loss": 1.8026, "step": 302400 }, { "epoch": 0.061102532068729745, "grad_norm": 4.373349189758301, "learning_rate": 1.8777956944613615e-05, "loss": 1.8577, "step": 302500 }, { "epoch": 0.0611227312528847, "grad_norm": 7.143569469451904, "learning_rate": 1.877755296076731e-05, "loss": 1.8466, "step": 302600 }, { "epoch": 0.061142930437039646, "grad_norm": 7.889708042144775, "learning_rate": 1.8777148976921013e-05, "loss": 1.8032, "step": 302700 }, { "epoch": 0.0611631296211946, "grad_norm": 9.935575485229492, "learning_rate": 1.877674499307471e-05, "loss": 1.8363, "step": 302800 }, { "epoch": 0.06118332880534955, "grad_norm": 5.392722129821777, "learning_rate": 1.8776341009228408e-05, "loss": 1.8305, "step": 302900 }, { "epoch": 0.06120352798950451, "grad_norm": 11.33907699584961, "learning_rate": 1.8775937025382103e-05, "loss": 1.8555, "step": 303000 }, { "epoch": 0.06120352798950451, "eval_calculated_loss": 9.142171859741211, "eval_loss": 2.1567559242248535, "eval_perplexity": 9341.030525208247, "eval_runtime": 121.3499, "eval_samples_per_second": 8.224, "eval_steps_per_second": 2.06, "step": 303000 }, { "epoch": 0.061223727173659453, "grad_norm": 10.479214668273926, "learning_rate": 1.8775533041535803e-05, "loss": 1.8178, "step": 303100 }, { "epoch": 0.06124392635781441, "grad_norm": 12.197513580322266, "learning_rate": 1.87751290576895e-05, "loss": 1.7784, "step": 303200 }, { "epoch": 0.06126412554196936, "grad_norm": 5.701066493988037, "learning_rate": 1.8774725073843197e-05, "loss": 1.8592, "step": 303300 }, { "epoch": 0.061284324726124315, "grad_norm": 10.288422584533691, "learning_rate": 1.8774321089996897e-05, "loss": 1.868, "step": 303400 }, { "epoch": 0.06130452391027926, "grad_norm": 5.125488758087158, "learning_rate": 1.8773917106150596e-05, "loss": 1.749, "step": 303500 }, { "epoch": 0.061324723094434215, "grad_norm": 8.474486351013184, "learning_rate": 1.877351312230429e-05, "loss": 1.8802, "step": 303600 }, { "epoch": 0.06134492227858917, "grad_norm": 3.5771403312683105, "learning_rate": 1.8773109138457994e-05, "loss": 1.8778, "step": 303700 }, { "epoch": 0.06136512146274412, "grad_norm": 6.7525434494018555, "learning_rate": 1.877270515461169e-05, "loss": 1.8014, "step": 303800 }, { "epoch": 0.06138532064689907, "grad_norm": 9.787223815917969, "learning_rate": 1.877230117076539e-05, "loss": 1.8436, "step": 303900 }, { "epoch": 0.06140551983105402, "grad_norm": 5.92941951751709, "learning_rate": 1.8771897186919084e-05, "loss": 1.8973, "step": 304000 }, { "epoch": 0.06140551983105402, "eval_calculated_loss": 8.951493263244629, "eval_loss": 2.1418750286102295, "eval_perplexity": 7719.410370661395, "eval_runtime": 124.7952, "eval_samples_per_second": 7.997, "eval_steps_per_second": 2.003, "step": 304000 }, { "epoch": 0.06142571901520898, "grad_norm": 4.456434726715088, "learning_rate": 1.8771493203072784e-05, "loss": 1.8838, "step": 304100 }, { "epoch": 0.06144591819936393, "grad_norm": 10.941431999206543, "learning_rate": 1.8771089219226483e-05, "loss": 1.855, "step": 304200 }, { "epoch": 0.06146611738351888, "grad_norm": 9.704672813415527, "learning_rate": 1.877068523538018e-05, "loss": 1.8994, "step": 304300 }, { "epoch": 0.06148631656767383, "grad_norm": 7.381129741668701, "learning_rate": 1.8770281251533878e-05, "loss": 1.8274, "step": 304400 }, { "epoch": 0.061506515751828784, "grad_norm": 8.791516304016113, "learning_rate": 1.8769877267687577e-05, "loss": 1.8004, "step": 304500 }, { "epoch": 0.06152671493598374, "grad_norm": 4.963751316070557, "learning_rate": 1.8769473283841272e-05, "loss": 1.8954, "step": 304600 }, { "epoch": 0.061546914120138685, "grad_norm": 7.543796062469482, "learning_rate": 1.876906929999497e-05, "loss": 1.7705, "step": 304700 }, { "epoch": 0.06156711330429364, "grad_norm": 8.514450073242188, "learning_rate": 1.876866531614867e-05, "loss": 1.7204, "step": 304800 }, { "epoch": 0.06158731248844859, "grad_norm": 7.248682022094727, "learning_rate": 1.876826133230237e-05, "loss": 1.863, "step": 304900 }, { "epoch": 0.061607511672603546, "grad_norm": 7.631741046905518, "learning_rate": 1.8767857348456066e-05, "loss": 1.9446, "step": 305000 }, { "epoch": 0.061607511672603546, "eval_calculated_loss": 9.020033836364746, "eval_loss": 2.158501386642456, "eval_perplexity": 8267.056803678639, "eval_runtime": 122.3464, "eval_samples_per_second": 8.157, "eval_steps_per_second": 2.043, "step": 305000 }, { "epoch": 0.06162771085675849, "grad_norm": 4.396106719970703, "learning_rate": 1.8767453364609765e-05, "loss": 1.8415, "step": 305100 }, { "epoch": 0.061647910040913446, "grad_norm": 4.184022903442383, "learning_rate": 1.8767049380763464e-05, "loss": 1.788, "step": 305200 }, { "epoch": 0.0616681092250684, "grad_norm": 10.05771541595459, "learning_rate": 1.876664539691716e-05, "loss": 1.8192, "step": 305300 }, { "epoch": 0.061688308409223354, "grad_norm": 6.228518962860107, "learning_rate": 1.876624141307086e-05, "loss": 1.8416, "step": 305400 }, { "epoch": 0.0617085075933783, "grad_norm": 4.917849540710449, "learning_rate": 1.8765837429224554e-05, "loss": 1.8667, "step": 305500 }, { "epoch": 0.061728706777533254, "grad_norm": 6.639430046081543, "learning_rate": 1.8765433445378254e-05, "loss": 1.9142, "step": 305600 }, { "epoch": 0.06174890596168821, "grad_norm": 9.862544059753418, "learning_rate": 1.8765029461531953e-05, "loss": 1.9024, "step": 305700 }, { "epoch": 0.06176910514584316, "grad_norm": 6.295173168182373, "learning_rate": 1.8764625477685652e-05, "loss": 1.835, "step": 305800 }, { "epoch": 0.06178930432999811, "grad_norm": 9.370795249938965, "learning_rate": 1.876422149383935e-05, "loss": 1.8931, "step": 305900 }, { "epoch": 0.06180950351415306, "grad_norm": 9.919925689697266, "learning_rate": 1.8763817509993047e-05, "loss": 1.8419, "step": 306000 }, { "epoch": 0.06180950351415306, "eval_calculated_loss": 8.957195281982422, "eval_loss": 2.1550347805023193, "eval_perplexity": 7763.552322759672, "eval_runtime": 125.2992, "eval_samples_per_second": 7.965, "eval_steps_per_second": 1.995, "step": 306000 }, { "epoch": 0.061829702698308016, "grad_norm": 9.219781875610352, "learning_rate": 1.8763413526146746e-05, "loss": 1.8355, "step": 306100 }, { "epoch": 0.06184990188246297, "grad_norm": 5.97334098815918, "learning_rate": 1.876300954230044e-05, "loss": 1.8546, "step": 306200 }, { "epoch": 0.061870101066617916, "grad_norm": 9.01024055480957, "learning_rate": 1.876260555845414e-05, "loss": 1.905, "step": 306300 }, { "epoch": 0.06189030025077287, "grad_norm": 5.8268256187438965, "learning_rate": 1.876220157460784e-05, "loss": 1.9061, "step": 306400 }, { "epoch": 0.061910499434927824, "grad_norm": 7.193397521972656, "learning_rate": 1.8761797590761536e-05, "loss": 1.7594, "step": 306500 }, { "epoch": 0.06193069861908278, "grad_norm": 7.0762176513671875, "learning_rate": 1.8761393606915235e-05, "loss": 1.8188, "step": 306600 }, { "epoch": 0.061950897803237724, "grad_norm": 4.920135021209717, "learning_rate": 1.8760989623068934e-05, "loss": 1.8469, "step": 306700 }, { "epoch": 0.06197109698739268, "grad_norm": 8.540083885192871, "learning_rate": 1.8760585639222633e-05, "loss": 1.7416, "step": 306800 }, { "epoch": 0.06199129617154763, "grad_norm": 3.821953296661377, "learning_rate": 1.8760181655376332e-05, "loss": 1.8781, "step": 306900 }, { "epoch": 0.062011495355702585, "grad_norm": 10.12619686126709, "learning_rate": 1.8759777671530028e-05, "loss": 1.8646, "step": 307000 }, { "epoch": 0.062011495355702585, "eval_calculated_loss": 8.973339080810547, "eval_loss": 2.1574819087982178, "eval_perplexity": 7889.902692998293, "eval_runtime": 122.581, "eval_samples_per_second": 8.142, "eval_steps_per_second": 2.039, "step": 307000 }, { "epoch": 0.06203169453985753, "grad_norm": 9.248090744018555, "learning_rate": 1.8759373687683727e-05, "loss": 1.8152, "step": 307100 }, { "epoch": 0.062051893724012486, "grad_norm": 5.349910736083984, "learning_rate": 1.8758969703837423e-05, "loss": 1.7999, "step": 307200 }, { "epoch": 0.06207209290816744, "grad_norm": 7.818264484405518, "learning_rate": 1.8758565719991122e-05, "loss": 1.8822, "step": 307300 }, { "epoch": 0.06209229209232239, "grad_norm": 7.146326065063477, "learning_rate": 1.875816173614482e-05, "loss": 1.8925, "step": 307400 }, { "epoch": 0.06211249127647734, "grad_norm": 7.604658603668213, "learning_rate": 1.8757757752298517e-05, "loss": 1.9365, "step": 307500 }, { "epoch": 0.062132690460632294, "grad_norm": 8.035287857055664, "learning_rate": 1.8757353768452216e-05, "loss": 1.9097, "step": 307600 }, { "epoch": 0.06215288964478725, "grad_norm": 10.933748245239258, "learning_rate": 1.8756949784605915e-05, "loss": 1.8441, "step": 307700 }, { "epoch": 0.0621730888289422, "grad_norm": 5.795827388763428, "learning_rate": 1.875654580075961e-05, "loss": 1.7807, "step": 307800 }, { "epoch": 0.06219328801309715, "grad_norm": 13.149075508117676, "learning_rate": 1.875614181691331e-05, "loss": 1.9607, "step": 307900 }, { "epoch": 0.0622134871972521, "grad_norm": 9.568521499633789, "learning_rate": 1.875573783306701e-05, "loss": 1.8517, "step": 308000 }, { "epoch": 0.0622134871972521, "eval_calculated_loss": 9.019303321838379, "eval_loss": 2.146230459213257, "eval_perplexity": 8261.01980391973, "eval_runtime": 123.8664, "eval_samples_per_second": 8.057, "eval_steps_per_second": 2.018, "step": 308000 }, { "epoch": 0.062233686381407055, "grad_norm": 6.11726188659668, "learning_rate": 1.8755333849220708e-05, "loss": 1.8325, "step": 308100 }, { "epoch": 0.06225388556556201, "grad_norm": 10.048039436340332, "learning_rate": 1.8754929865374404e-05, "loss": 1.9089, "step": 308200 }, { "epoch": 0.062274084749716956, "grad_norm": 9.022970199584961, "learning_rate": 1.8754525881528103e-05, "loss": 1.7947, "step": 308300 }, { "epoch": 0.06229428393387191, "grad_norm": 5.0875959396362305, "learning_rate": 1.8754121897681802e-05, "loss": 1.8884, "step": 308400 }, { "epoch": 0.06231448311802686, "grad_norm": 6.859148979187012, "learning_rate": 1.8753717913835498e-05, "loss": 1.8123, "step": 308500 }, { "epoch": 0.06233468230218182, "grad_norm": 7.007096290588379, "learning_rate": 1.8753313929989197e-05, "loss": 1.8214, "step": 308600 }, { "epoch": 0.06235488148633676, "grad_norm": 9.715551376342773, "learning_rate": 1.8752909946142893e-05, "loss": 1.8101, "step": 308700 }, { "epoch": 0.06237508067049172, "grad_norm": 11.334957122802734, "learning_rate": 1.8752505962296592e-05, "loss": 1.8539, "step": 308800 }, { "epoch": 0.06239527985464667, "grad_norm": 10.655929565429688, "learning_rate": 1.875210197845029e-05, "loss": 1.8968, "step": 308900 }, { "epoch": 0.062415479038801625, "grad_norm": 5.124995231628418, "learning_rate": 1.875169799460399e-05, "loss": 1.7352, "step": 309000 }, { "epoch": 0.062415479038801625, "eval_calculated_loss": 9.135059356689453, "eval_loss": 2.15047287940979, "eval_perplexity": 9274.828128550616, "eval_runtime": 123.5671, "eval_samples_per_second": 8.077, "eval_steps_per_second": 2.023, "step": 309000 }, { "epoch": 0.06243567822295658, "grad_norm": 4.849394798278809, "learning_rate": 1.875129401075769e-05, "loss": 1.8602, "step": 309100 }, { "epoch": 0.062455877407111525, "grad_norm": 9.695449829101562, "learning_rate": 1.8750890026911385e-05, "loss": 1.8602, "step": 309200 }, { "epoch": 0.06247607659126648, "grad_norm": 7.05778169631958, "learning_rate": 1.8750486043065084e-05, "loss": 1.8366, "step": 309300 }, { "epoch": 0.06249627577542143, "grad_norm": 12.628386497497559, "learning_rate": 1.8750082059218783e-05, "loss": 1.7717, "step": 309400 }, { "epoch": 0.06251647495957638, "grad_norm": 6.676788330078125, "learning_rate": 1.874967807537248e-05, "loss": 1.8626, "step": 309500 }, { "epoch": 0.06253667414373133, "grad_norm": 8.754571914672852, "learning_rate": 1.8749274091526178e-05, "loss": 1.7498, "step": 309600 }, { "epoch": 0.06255687332788629, "grad_norm": 7.482773780822754, "learning_rate": 1.8748870107679874e-05, "loss": 1.8518, "step": 309700 }, { "epoch": 0.06257707251204124, "grad_norm": 7.825121879577637, "learning_rate": 1.8748466123833573e-05, "loss": 1.7788, "step": 309800 }, { "epoch": 0.0625972716961962, "grad_norm": 7.502450942993164, "learning_rate": 1.8748062139987272e-05, "loss": 1.8344, "step": 309900 }, { "epoch": 0.06261747088035115, "grad_norm": 9.410930633544922, "learning_rate": 1.874765815614097e-05, "loss": 1.8411, "step": 310000 }, { "epoch": 0.06261747088035115, "eval_calculated_loss": 9.124639511108398, "eval_loss": 2.1453285217285156, "eval_perplexity": 9178.687606214005, "eval_runtime": 122.9474, "eval_samples_per_second": 8.117, "eval_steps_per_second": 2.033, "step": 310000 }, { "epoch": 0.0626376700645061, "grad_norm": 3.9534294605255127, "learning_rate": 1.874725417229467e-05, "loss": 1.8752, "step": 310100 }, { "epoch": 0.06265786924866104, "grad_norm": 5.999114513397217, "learning_rate": 1.8746850188448366e-05, "loss": 1.7982, "step": 310200 }, { "epoch": 0.062678068432816, "grad_norm": 4.97440767288208, "learning_rate": 1.8746446204602065e-05, "loss": 1.8279, "step": 310300 }, { "epoch": 0.06269826761697095, "grad_norm": 9.847051620483398, "learning_rate": 1.874604222075576e-05, "loss": 1.8908, "step": 310400 }, { "epoch": 0.0627184668011259, "grad_norm": 8.342706680297852, "learning_rate": 1.874563823690946e-05, "loss": 1.892, "step": 310500 }, { "epoch": 0.06273866598528086, "grad_norm": 11.136197090148926, "learning_rate": 1.874523425306316e-05, "loss": 1.8469, "step": 310600 }, { "epoch": 0.06275886516943581, "grad_norm": 8.768720626831055, "learning_rate": 1.8744830269216855e-05, "loss": 1.8576, "step": 310700 }, { "epoch": 0.06277906435359076, "grad_norm": 11.896618843078613, "learning_rate": 1.8744426285370554e-05, "loss": 1.8338, "step": 310800 }, { "epoch": 0.06279926353774572, "grad_norm": 6.622823238372803, "learning_rate": 1.8744022301524253e-05, "loss": 1.9107, "step": 310900 }, { "epoch": 0.06281946272190066, "grad_norm": 8.045440673828125, "learning_rate": 1.8743618317677952e-05, "loss": 1.8871, "step": 311000 }, { "epoch": 0.06281946272190066, "eval_calculated_loss": 9.111844062805176, "eval_loss": 2.1485586166381836, "eval_perplexity": 9061.990372337787, "eval_runtime": 123.9086, "eval_samples_per_second": 8.054, "eval_steps_per_second": 2.018, "step": 311000 }, { "epoch": 0.06283966190605561, "grad_norm": 6.713430404663086, "learning_rate": 1.8743214333831648e-05, "loss": 1.9516, "step": 311100 }, { "epoch": 0.06285986109021056, "grad_norm": 10.496493339538574, "learning_rate": 1.8742810349985347e-05, "loss": 1.8936, "step": 311200 }, { "epoch": 0.06288006027436552, "grad_norm": 8.000434875488281, "learning_rate": 1.8742406366139046e-05, "loss": 1.905, "step": 311300 }, { "epoch": 0.06290025945852047, "grad_norm": 8.927785873413086, "learning_rate": 1.8742002382292742e-05, "loss": 1.9183, "step": 311400 }, { "epoch": 0.06292045864267543, "grad_norm": 14.25734806060791, "learning_rate": 1.874159839844644e-05, "loss": 1.8693, "step": 311500 }, { "epoch": 0.06294065782683038, "grad_norm": 7.591400623321533, "learning_rate": 1.874119441460014e-05, "loss": 1.9, "step": 311600 }, { "epoch": 0.06296085701098533, "grad_norm": 4.450354099273682, "learning_rate": 1.8740790430753836e-05, "loss": 1.7804, "step": 311700 }, { "epoch": 0.06298105619514027, "grad_norm": 7.512557029724121, "learning_rate": 1.8740386446907535e-05, "loss": 1.8068, "step": 311800 }, { "epoch": 0.06300125537929523, "grad_norm": 7.5620503425598145, "learning_rate": 1.873998246306123e-05, "loss": 1.849, "step": 311900 }, { "epoch": 0.06302145456345018, "grad_norm": 9.570419311523438, "learning_rate": 1.8739578479214933e-05, "loss": 1.8275, "step": 312000 }, { "epoch": 0.06302145456345018, "eval_calculated_loss": 9.353179931640625, "eval_loss": 2.141878366470337, "eval_perplexity": 11535.447117178319, "eval_runtime": 123.8345, "eval_samples_per_second": 8.059, "eval_steps_per_second": 2.019, "step": 312000 }, { "epoch": 0.06304165374760513, "grad_norm": 6.424056053161621, "learning_rate": 1.873917449536863e-05, "loss": 1.853, "step": 312100 }, { "epoch": 0.06306185293176009, "grad_norm": 7.442103385925293, "learning_rate": 1.8738770511522328e-05, "loss": 1.8269, "step": 312200 }, { "epoch": 0.06308205211591504, "grad_norm": 8.64204216003418, "learning_rate": 1.8738366527676027e-05, "loss": 1.8859, "step": 312300 }, { "epoch": 0.06310225130007, "grad_norm": 6.493527412414551, "learning_rate": 1.8737962543829723e-05, "loss": 1.8785, "step": 312400 }, { "epoch": 0.06312245048422495, "grad_norm": 9.792562484741211, "learning_rate": 1.8737558559983422e-05, "loss": 1.8647, "step": 312500 }, { "epoch": 0.06314264966837989, "grad_norm": 12.842633247375488, "learning_rate": 1.873715457613712e-05, "loss": 1.7641, "step": 312600 }, { "epoch": 0.06316284885253484, "grad_norm": 5.480280876159668, "learning_rate": 1.8736750592290817e-05, "loss": 1.8284, "step": 312700 }, { "epoch": 0.0631830480366898, "grad_norm": 4.944769859313965, "learning_rate": 1.8736346608444516e-05, "loss": 1.8004, "step": 312800 }, { "epoch": 0.06320324722084475, "grad_norm": 5.286147117614746, "learning_rate": 1.8735942624598212e-05, "loss": 1.8191, "step": 312900 }, { "epoch": 0.0632234464049997, "grad_norm": 7.07968282699585, "learning_rate": 1.8735538640751915e-05, "loss": 1.8676, "step": 313000 }, { "epoch": 0.0632234464049997, "eval_calculated_loss": 9.075636863708496, "eval_loss": 2.1483960151672363, "eval_perplexity": 8739.749996390603, "eval_runtime": 124.6011, "eval_samples_per_second": 8.01, "eval_steps_per_second": 2.006, "step": 313000 }, { "epoch": 0.06324364558915466, "grad_norm": 9.860682487487793, "learning_rate": 1.873513465690561e-05, "loss": 1.8084, "step": 313100 }, { "epoch": 0.06326384477330961, "grad_norm": 9.910196304321289, "learning_rate": 1.873473067305931e-05, "loss": 1.8531, "step": 313200 }, { "epoch": 0.06328404395746456, "grad_norm": 7.370165824890137, "learning_rate": 1.873432668921301e-05, "loss": 1.8576, "step": 313300 }, { "epoch": 0.0633042431416195, "grad_norm": 9.925126075744629, "learning_rate": 1.8733922705366704e-05, "loss": 1.8909, "step": 313400 }, { "epoch": 0.06332444232577446, "grad_norm": 11.338434219360352, "learning_rate": 1.8733518721520403e-05, "loss": 1.9275, "step": 313500 }, { "epoch": 0.06334464150992941, "grad_norm": 6.546390533447266, "learning_rate": 1.87331147376741e-05, "loss": 1.8216, "step": 313600 }, { "epoch": 0.06336484069408437, "grad_norm": 4.160061836242676, "learning_rate": 1.8732710753827798e-05, "loss": 1.8161, "step": 313700 }, { "epoch": 0.06338503987823932, "grad_norm": 7.356183052062988, "learning_rate": 1.8732306769981497e-05, "loss": 1.8545, "step": 313800 }, { "epoch": 0.06340523906239427, "grad_norm": 7.7637434005737305, "learning_rate": 1.8731902786135193e-05, "loss": 1.8788, "step": 313900 }, { "epoch": 0.06342543824654923, "grad_norm": 10.864518165588379, "learning_rate": 1.8731498802288892e-05, "loss": 1.9083, "step": 314000 }, { "epoch": 0.06342543824654923, "eval_calculated_loss": 9.130982398986816, "eval_loss": 2.1456453800201416, "eval_perplexity": 9237.092023093668, "eval_runtime": 124.6027, "eval_samples_per_second": 8.009, "eval_steps_per_second": 2.006, "step": 314000 }, { "epoch": 0.06344563743070418, "grad_norm": 4.796252727508545, "learning_rate": 1.873109481844259e-05, "loss": 1.8327, "step": 314100 }, { "epoch": 0.06346583661485912, "grad_norm": 6.12101411819458, "learning_rate": 1.873069083459629e-05, "loss": 1.8067, "step": 314200 }, { "epoch": 0.06348603579901407, "grad_norm": 9.383954048156738, "learning_rate": 1.873028685074999e-05, "loss": 1.7975, "step": 314300 }, { "epoch": 0.06350623498316903, "grad_norm": 5.619745254516602, "learning_rate": 1.8729882866903685e-05, "loss": 1.8149, "step": 314400 }, { "epoch": 0.06352643416732398, "grad_norm": 6.950753211975098, "learning_rate": 1.8729478883057385e-05, "loss": 1.8379, "step": 314500 }, { "epoch": 0.06354663335147893, "grad_norm": 8.495321273803711, "learning_rate": 1.872907489921108e-05, "loss": 1.7979, "step": 314600 }, { "epoch": 0.06356683253563389, "grad_norm": 7.565228462219238, "learning_rate": 1.872867091536478e-05, "loss": 1.814, "step": 314700 }, { "epoch": 0.06358703171978884, "grad_norm": 7.119283199310303, "learning_rate": 1.872826693151848e-05, "loss": 1.9027, "step": 314800 }, { "epoch": 0.0636072309039438, "grad_norm": 6.007784843444824, "learning_rate": 1.8727862947672174e-05, "loss": 1.8131, "step": 314900 }, { "epoch": 0.06362743008809874, "grad_norm": 8.84888744354248, "learning_rate": 1.8727458963825873e-05, "loss": 1.9713, "step": 315000 }, { "epoch": 0.06362743008809874, "eval_calculated_loss": 9.078725814819336, "eval_loss": 2.1437289714813232, "eval_perplexity": 8766.788395497722, "eval_runtime": 123.3213, "eval_samples_per_second": 8.093, "eval_steps_per_second": 2.027, "step": 315000 }, { "epoch": 0.06364762927225369, "grad_norm": 6.163305282592773, "learning_rate": 1.8727054979979573e-05, "loss": 1.8311, "step": 315100 }, { "epoch": 0.06366782845640864, "grad_norm": 11.389727592468262, "learning_rate": 1.872665099613327e-05, "loss": 1.8751, "step": 315200 }, { "epoch": 0.0636880276405636, "grad_norm": 7.579365253448486, "learning_rate": 1.8726247012286967e-05, "loss": 1.8298, "step": 315300 }, { "epoch": 0.06370822682471855, "grad_norm": 5.270155906677246, "learning_rate": 1.8725843028440667e-05, "loss": 1.8371, "step": 315400 }, { "epoch": 0.0637284260088735, "grad_norm": 7.240290641784668, "learning_rate": 1.8725439044594366e-05, "loss": 1.8409, "step": 315500 }, { "epoch": 0.06374862519302846, "grad_norm": 6.05098819732666, "learning_rate": 1.872503506074806e-05, "loss": 1.9378, "step": 315600 }, { "epoch": 0.06376882437718341, "grad_norm": 8.696502685546875, "learning_rate": 1.872463107690176e-05, "loss": 1.8439, "step": 315700 }, { "epoch": 0.06378902356133835, "grad_norm": 8.81367015838623, "learning_rate": 1.872422709305546e-05, "loss": 1.8112, "step": 315800 }, { "epoch": 0.0638092227454933, "grad_norm": 5.533753871917725, "learning_rate": 1.8723823109209155e-05, "loss": 1.8022, "step": 315900 }, { "epoch": 0.06382942192964826, "grad_norm": 8.17931079864502, "learning_rate": 1.8723419125362854e-05, "loss": 1.8767, "step": 316000 }, { "epoch": 0.06382942192964826, "eval_calculated_loss": 9.128663063049316, "eval_loss": 2.1481165885925293, "eval_perplexity": 9215.692929043156, "eval_runtime": 122.8066, "eval_samples_per_second": 8.127, "eval_steps_per_second": 2.036, "step": 316000 }, { "epoch": 0.06384962111380321, "grad_norm": 4.097798824310303, "learning_rate": 1.872301514151655e-05, "loss": 1.8095, "step": 316100 }, { "epoch": 0.06386982029795817, "grad_norm": 4.150618076324463, "learning_rate": 1.8722611157670253e-05, "loss": 1.7601, "step": 316200 }, { "epoch": 0.06389001948211312, "grad_norm": 5.888782501220703, "learning_rate": 1.872220717382395e-05, "loss": 1.8314, "step": 316300 }, { "epoch": 0.06391021866626807, "grad_norm": 8.549055099487305, "learning_rate": 1.8721803189977648e-05, "loss": 1.864, "step": 316400 }, { "epoch": 0.06393041785042303, "grad_norm": 6.2362589836120605, "learning_rate": 1.8721399206131347e-05, "loss": 1.8856, "step": 316500 }, { "epoch": 0.06395061703457797, "grad_norm": 7.045931816101074, "learning_rate": 1.8720995222285042e-05, "loss": 1.8887, "step": 316600 }, { "epoch": 0.06397081621873292, "grad_norm": 9.662400245666504, "learning_rate": 1.872059123843874e-05, "loss": 1.824, "step": 316700 }, { "epoch": 0.06399101540288787, "grad_norm": 5.4185471534729, "learning_rate": 1.8720187254592437e-05, "loss": 1.8718, "step": 316800 }, { "epoch": 0.06401121458704283, "grad_norm": 7.262552738189697, "learning_rate": 1.8719783270746136e-05, "loss": 1.8696, "step": 316900 }, { "epoch": 0.06403141377119778, "grad_norm": 10.377701759338379, "learning_rate": 1.8719379286899836e-05, "loss": 1.8855, "step": 317000 }, { "epoch": 0.06403141377119778, "eval_calculated_loss": 9.116873741149902, "eval_loss": 2.1391072273254395, "eval_perplexity": 9107.684085083756, "eval_runtime": 124.4645, "eval_samples_per_second": 8.018, "eval_steps_per_second": 2.009, "step": 317000 }, { "epoch": 0.06405161295535274, "grad_norm": 7.9338860511779785, "learning_rate": 1.871897530305353e-05, "loss": 1.7668, "step": 317100 }, { "epoch": 0.06407181213950769, "grad_norm": 11.832696914672852, "learning_rate": 1.8718571319207234e-05, "loss": 1.8483, "step": 317200 }, { "epoch": 0.06409201132366264, "grad_norm": 6.247273921966553, "learning_rate": 1.871816733536093e-05, "loss": 1.8741, "step": 317300 }, { "epoch": 0.06411221050781758, "grad_norm": 5.516573429107666, "learning_rate": 1.871776335151463e-05, "loss": 1.9198, "step": 317400 }, { "epoch": 0.06413240969197254, "grad_norm": 7.21619987487793, "learning_rate": 1.8717359367668328e-05, "loss": 1.8448, "step": 317500 }, { "epoch": 0.06415260887612749, "grad_norm": 7.884905815124512, "learning_rate": 1.8716955383822024e-05, "loss": 1.9045, "step": 317600 }, { "epoch": 0.06417280806028244, "grad_norm": 8.018383979797363, "learning_rate": 1.8716551399975723e-05, "loss": 1.7711, "step": 317700 }, { "epoch": 0.0641930072444374, "grad_norm": 6.763605117797852, "learning_rate": 1.871614741612942e-05, "loss": 1.7632, "step": 317800 }, { "epoch": 0.06421320642859235, "grad_norm": 9.340757369995117, "learning_rate": 1.8715743432283118e-05, "loss": 1.8359, "step": 317900 }, { "epoch": 0.0642334056127473, "grad_norm": 6.388706684112549, "learning_rate": 1.8715339448436817e-05, "loss": 1.882, "step": 318000 }, { "epoch": 0.0642334056127473, "eval_calculated_loss": 8.949272155761719, "eval_loss": 2.1538333892822266, "eval_perplexity": 7702.283757588611, "eval_runtime": 127.2911, "eval_samples_per_second": 7.84, "eval_steps_per_second": 1.964, "step": 318000 }, { "epoch": 0.06425360479690226, "grad_norm": 7.592050552368164, "learning_rate": 1.8714935464590512e-05, "loss": 1.8499, "step": 318100 }, { "epoch": 0.0642738039810572, "grad_norm": 9.427456855773926, "learning_rate": 1.8714531480744215e-05, "loss": 1.857, "step": 318200 }, { "epoch": 0.06429400316521215, "grad_norm": 7.281992435455322, "learning_rate": 1.871412749689791e-05, "loss": 1.8984, "step": 318300 }, { "epoch": 0.0643142023493671, "grad_norm": 9.643452644348145, "learning_rate": 1.871372351305161e-05, "loss": 1.8476, "step": 318400 }, { "epoch": 0.06433440153352206, "grad_norm": 5.880373954772949, "learning_rate": 1.8713319529205306e-05, "loss": 1.9167, "step": 318500 }, { "epoch": 0.06435460071767701, "grad_norm": 8.705901145935059, "learning_rate": 1.8712915545359005e-05, "loss": 1.7565, "step": 318600 }, { "epoch": 0.06437479990183197, "grad_norm": 9.433892250061035, "learning_rate": 1.8712511561512704e-05, "loss": 1.8556, "step": 318700 }, { "epoch": 0.06439499908598692, "grad_norm": 5.033259391784668, "learning_rate": 1.87121075776664e-05, "loss": 1.8807, "step": 318800 }, { "epoch": 0.06441519827014187, "grad_norm": 6.4000349044799805, "learning_rate": 1.87117035938201e-05, "loss": 1.837, "step": 318900 }, { "epoch": 0.06443539745429683, "grad_norm": 8.251852035522461, "learning_rate": 1.8711299609973798e-05, "loss": 1.8561, "step": 319000 }, { "epoch": 0.06443539745429683, "eval_calculated_loss": 8.985816955566406, "eval_loss": 2.137880802154541, "eval_perplexity": 7988.968691844334, "eval_runtime": 124.2033, "eval_samples_per_second": 8.035, "eval_steps_per_second": 2.013, "step": 319000 }, { "epoch": 0.06445559663845177, "grad_norm": 7.938558101654053, "learning_rate": 1.8710895626127494e-05, "loss": 1.804, "step": 319100 }, { "epoch": 0.06447579582260672, "grad_norm": 7.444107532501221, "learning_rate": 1.8710491642281193e-05, "loss": 1.8393, "step": 319200 }, { "epoch": 0.06449599500676167, "grad_norm": 9.572016716003418, "learning_rate": 1.8710087658434892e-05, "loss": 1.8475, "step": 319300 }, { "epoch": 0.06451619419091663, "grad_norm": 9.941235542297363, "learning_rate": 1.870968367458859e-05, "loss": 1.8641, "step": 319400 }, { "epoch": 0.06453639337507158, "grad_norm": 5.1659770011901855, "learning_rate": 1.8709279690742287e-05, "loss": 1.8377, "step": 319500 }, { "epoch": 0.06455659255922654, "grad_norm": 5.172203063964844, "learning_rate": 1.8708875706895986e-05, "loss": 1.8095, "step": 319600 }, { "epoch": 0.06457679174338149, "grad_norm": 9.470183372497559, "learning_rate": 1.8708471723049685e-05, "loss": 1.731, "step": 319700 }, { "epoch": 0.06459699092753644, "grad_norm": 5.551164627075195, "learning_rate": 1.870806773920338e-05, "loss": 1.8425, "step": 319800 }, { "epoch": 0.06461719011169138, "grad_norm": 6.908609390258789, "learning_rate": 1.870766375535708e-05, "loss": 1.8468, "step": 319900 }, { "epoch": 0.06463738929584634, "grad_norm": 7.230128288269043, "learning_rate": 1.870725977151078e-05, "loss": 1.8213, "step": 320000 }, { "epoch": 0.06463738929584634, "eval_calculated_loss": 8.88950252532959, "eval_loss": 2.1423869132995605, "eval_perplexity": 7255.408903404786, "eval_runtime": 121.4479, "eval_samples_per_second": 8.218, "eval_steps_per_second": 2.058, "step": 320000 }, { "epoch": 0.06465758848000129, "grad_norm": 8.262033462524414, "learning_rate": 1.8706855787664475e-05, "loss": 1.8516, "step": 320100 }, { "epoch": 0.06467778766415624, "grad_norm": 7.20474910736084, "learning_rate": 1.8706451803818174e-05, "loss": 1.8107, "step": 320200 }, { "epoch": 0.0646979868483112, "grad_norm": 7.100156307220459, "learning_rate": 1.8706047819971873e-05, "loss": 1.7625, "step": 320300 }, { "epoch": 0.06471818603246615, "grad_norm": 9.437070846557617, "learning_rate": 1.8705643836125572e-05, "loss": 1.9337, "step": 320400 }, { "epoch": 0.0647383852166211, "grad_norm": 6.648512840270996, "learning_rate": 1.8705239852279268e-05, "loss": 1.8597, "step": 320500 }, { "epoch": 0.06475858440077606, "grad_norm": 6.951407432556152, "learning_rate": 1.8704835868432967e-05, "loss": 1.8052, "step": 320600 }, { "epoch": 0.064778783584931, "grad_norm": 7.607064723968506, "learning_rate": 1.8704431884586666e-05, "loss": 1.7755, "step": 320700 }, { "epoch": 0.06479898276908595, "grad_norm": 8.065730094909668, "learning_rate": 1.8704027900740362e-05, "loss": 1.8404, "step": 320800 }, { "epoch": 0.0648191819532409, "grad_norm": 6.466313362121582, "learning_rate": 1.870362391689406e-05, "loss": 1.8234, "step": 320900 }, { "epoch": 0.06483938113739586, "grad_norm": 5.7923970222473145, "learning_rate": 1.8703219933047757e-05, "loss": 1.7672, "step": 321000 }, { "epoch": 0.06483938113739586, "eval_calculated_loss": 9.020661354064941, "eval_loss": 2.147510051727295, "eval_perplexity": 8272.246156186455, "eval_runtime": 124.5714, "eval_samples_per_second": 8.011, "eval_steps_per_second": 2.007, "step": 321000 }, { "epoch": 0.06485958032155081, "grad_norm": 8.113118171691895, "learning_rate": 1.8702815949201456e-05, "loss": 1.8299, "step": 321100 }, { "epoch": 0.06487977950570577, "grad_norm": 7.760495185852051, "learning_rate": 1.8702411965355155e-05, "loss": 1.7808, "step": 321200 }, { "epoch": 0.06489997868986072, "grad_norm": 7.350905418395996, "learning_rate": 1.8702007981508854e-05, "loss": 1.8534, "step": 321300 }, { "epoch": 0.06492017787401567, "grad_norm": 9.00823974609375, "learning_rate": 1.8701603997662553e-05, "loss": 1.8138, "step": 321400 }, { "epoch": 0.06494037705817061, "grad_norm": 6.571842670440674, "learning_rate": 1.870120001381625e-05, "loss": 1.8427, "step": 321500 }, { "epoch": 0.06496057624232557, "grad_norm": 5.984094142913818, "learning_rate": 1.8700796029969948e-05, "loss": 1.8198, "step": 321600 }, { "epoch": 0.06498077542648052, "grad_norm": 7.571002960205078, "learning_rate": 1.8700392046123644e-05, "loss": 1.8671, "step": 321700 }, { "epoch": 0.06500097461063548, "grad_norm": 5.761745929718018, "learning_rate": 1.8699988062277343e-05, "loss": 1.8646, "step": 321800 }, { "epoch": 0.06502117379479043, "grad_norm": 3.879593849182129, "learning_rate": 1.8699584078431042e-05, "loss": 1.8317, "step": 321900 }, { "epoch": 0.06504137297894538, "grad_norm": 5.114414215087891, "learning_rate": 1.8699180094584738e-05, "loss": 1.8074, "step": 322000 }, { "epoch": 0.06504137297894538, "eval_calculated_loss": 8.894097328186035, "eval_loss": 2.1365692615509033, "eval_perplexity": 7288.822783267421, "eval_runtime": 124.128, "eval_samples_per_second": 8.04, "eval_steps_per_second": 2.014, "step": 322000 }, { "epoch": 0.06506157216310034, "grad_norm": 3.9066874980926514, "learning_rate": 1.8698776110738437e-05, "loss": 1.9382, "step": 322100 }, { "epoch": 0.06508177134725529, "grad_norm": 5.849746227264404, "learning_rate": 1.8698372126892136e-05, "loss": 1.7712, "step": 322200 }, { "epoch": 0.06510197053141023, "grad_norm": 5.755077362060547, "learning_rate": 1.8697968143045832e-05, "loss": 1.8488, "step": 322300 }, { "epoch": 0.06512216971556518, "grad_norm": 6.895565509796143, "learning_rate": 1.8697564159199534e-05, "loss": 1.9133, "step": 322400 }, { "epoch": 0.06514236889972014, "grad_norm": 6.566089153289795, "learning_rate": 1.869716017535323e-05, "loss": 1.8444, "step": 322500 }, { "epoch": 0.06516256808387509, "grad_norm": 4.755077362060547, "learning_rate": 1.869675619150693e-05, "loss": 1.8387, "step": 322600 }, { "epoch": 0.06518276726803005, "grad_norm": 8.915486335754395, "learning_rate": 1.8696352207660625e-05, "loss": 1.8323, "step": 322700 }, { "epoch": 0.065202966452185, "grad_norm": 6.474851131439209, "learning_rate": 1.8695948223814324e-05, "loss": 1.7883, "step": 322800 }, { "epoch": 0.06522316563633995, "grad_norm": 7.989998817443848, "learning_rate": 1.8695544239968023e-05, "loss": 1.8285, "step": 322900 }, { "epoch": 0.0652433648204949, "grad_norm": 5.396895885467529, "learning_rate": 1.869514025612172e-05, "loss": 1.8772, "step": 323000 }, { "epoch": 0.0652433648204949, "eval_calculated_loss": 8.78897476196289, "eval_loss": 2.1340973377227783, "eval_perplexity": 6561.501624801805, "eval_runtime": 121.5675, "eval_samples_per_second": 8.209, "eval_steps_per_second": 2.056, "step": 323000 }, { "epoch": 0.06526356400464985, "grad_norm": 6.650256156921387, "learning_rate": 1.8694736272275418e-05, "loss": 1.8598, "step": 323100 }, { "epoch": 0.0652837631888048, "grad_norm": 7.49819803237915, "learning_rate": 1.8694332288429117e-05, "loss": 1.8053, "step": 323200 }, { "epoch": 0.06530396237295975, "grad_norm": 8.147951126098633, "learning_rate": 1.8693928304582813e-05, "loss": 1.9181, "step": 323300 }, { "epoch": 0.06532416155711471, "grad_norm": 10.845536231994629, "learning_rate": 1.8693524320736512e-05, "loss": 1.7557, "step": 323400 }, { "epoch": 0.06534436074126966, "grad_norm": 6.366534233093262, "learning_rate": 1.869312033689021e-05, "loss": 1.8389, "step": 323500 }, { "epoch": 0.06536455992542461, "grad_norm": 4.136862754821777, "learning_rate": 1.869271635304391e-05, "loss": 1.8211, "step": 323600 }, { "epoch": 0.06538475910957957, "grad_norm": 8.467758178710938, "learning_rate": 1.8692312369197606e-05, "loss": 1.8357, "step": 323700 }, { "epoch": 0.06540495829373452, "grad_norm": 6.531941890716553, "learning_rate": 1.8691908385351305e-05, "loss": 1.8229, "step": 323800 }, { "epoch": 0.06542515747788946, "grad_norm": 13.646995544433594, "learning_rate": 1.8691504401505004e-05, "loss": 1.8402, "step": 323900 }, { "epoch": 0.06544535666204442, "grad_norm": 4.978155136108398, "learning_rate": 1.86911004176587e-05, "loss": 1.8664, "step": 324000 }, { "epoch": 0.06544535666204442, "eval_calculated_loss": 8.843877792358398, "eval_loss": 2.1394596099853516, "eval_perplexity": 6931.820751354313, "eval_runtime": 125.1506, "eval_samples_per_second": 7.974, "eval_steps_per_second": 1.998, "step": 324000 }, { "epoch": 0.06546555584619937, "grad_norm": 6.3595404624938965, "learning_rate": 1.86906964338124e-05, "loss": 1.92, "step": 324100 }, { "epoch": 0.06548575503035432, "grad_norm": 10.078269958496094, "learning_rate": 1.8690292449966095e-05, "loss": 1.9221, "step": 324200 }, { "epoch": 0.06550595421450928, "grad_norm": 7.195315837860107, "learning_rate": 1.8689888466119794e-05, "loss": 1.8145, "step": 324300 }, { "epoch": 0.06552615339866423, "grad_norm": 5.172389507293701, "learning_rate": 1.8689484482273493e-05, "loss": 1.8266, "step": 324400 }, { "epoch": 0.06554635258281918, "grad_norm": 5.402004241943359, "learning_rate": 1.8689080498427192e-05, "loss": 1.8655, "step": 324500 }, { "epoch": 0.06556655176697414, "grad_norm": 8.021435737609863, "learning_rate": 1.868867651458089e-05, "loss": 1.7399, "step": 324600 }, { "epoch": 0.06558675095112908, "grad_norm": 10.758060455322266, "learning_rate": 1.8688272530734587e-05, "loss": 1.8489, "step": 324700 }, { "epoch": 0.06560695013528403, "grad_norm": 7.953063011169434, "learning_rate": 1.8687868546888286e-05, "loss": 1.8418, "step": 324800 }, { "epoch": 0.06562714931943898, "grad_norm": 8.572832107543945, "learning_rate": 1.8687464563041985e-05, "loss": 1.8229, "step": 324900 }, { "epoch": 0.06564734850359394, "grad_norm": 8.093379974365234, "learning_rate": 1.868706057919568e-05, "loss": 1.9191, "step": 325000 }, { "epoch": 0.06564734850359394, "eval_calculated_loss": 8.991694450378418, "eval_loss": 2.1366655826568604, "eval_perplexity": 8036.062073868456, "eval_runtime": 124.2679, "eval_samples_per_second": 8.031, "eval_steps_per_second": 2.012, "step": 325000 }, { "epoch": 0.06566754768774889, "grad_norm": 9.340394973754883, "learning_rate": 1.868665659534938e-05, "loss": 1.8438, "step": 325100 }, { "epoch": 0.06568774687190385, "grad_norm": 7.861692905426025, "learning_rate": 1.8686252611503076e-05, "loss": 1.8448, "step": 325200 }, { "epoch": 0.0657079460560588, "grad_norm": 10.473594665527344, "learning_rate": 1.8685848627656775e-05, "loss": 1.8445, "step": 325300 }, { "epoch": 0.06572814524021375, "grad_norm": 8.64989948272705, "learning_rate": 1.8685444643810474e-05, "loss": 1.7459, "step": 325400 }, { "epoch": 0.0657483444243687, "grad_norm": 12.785391807556152, "learning_rate": 1.8685040659964173e-05, "loss": 1.7699, "step": 325500 }, { "epoch": 0.06576854360852365, "grad_norm": 9.09941291809082, "learning_rate": 1.8684636676117873e-05, "loss": 1.8631, "step": 325600 }, { "epoch": 0.0657887427926786, "grad_norm": 10.284366607666016, "learning_rate": 1.8684232692271568e-05, "loss": 1.7993, "step": 325700 }, { "epoch": 0.06580894197683355, "grad_norm": 18.499267578125, "learning_rate": 1.8683828708425267e-05, "loss": 1.8392, "step": 325800 }, { "epoch": 0.06582914116098851, "grad_norm": 6.650815486907959, "learning_rate": 1.8683424724578963e-05, "loss": 1.7767, "step": 325900 }, { "epoch": 0.06584934034514346, "grad_norm": 5.764721870422363, "learning_rate": 1.8683020740732662e-05, "loss": 1.868, "step": 326000 }, { "epoch": 0.06584934034514346, "eval_calculated_loss": 8.900279998779297, "eval_loss": 2.1493916511535645, "eval_perplexity": 7334.026770234477, "eval_runtime": 123.9663, "eval_samples_per_second": 8.051, "eval_steps_per_second": 2.017, "step": 326000 }, { "epoch": 0.06586953952929842, "grad_norm": 11.097825050354004, "learning_rate": 1.868261675688636e-05, "loss": 1.7992, "step": 326100 }, { "epoch": 0.06588973871345337, "grad_norm": 6.537656784057617, "learning_rate": 1.8682212773040057e-05, "loss": 1.9553, "step": 326200 }, { "epoch": 0.06590993789760831, "grad_norm": 6.3429856300354, "learning_rate": 1.8681808789193756e-05, "loss": 1.8558, "step": 326300 }, { "epoch": 0.06593013708176326, "grad_norm": 8.231230735778809, "learning_rate": 1.8681404805347455e-05, "loss": 1.9728, "step": 326400 }, { "epoch": 0.06595033626591822, "grad_norm": 7.443426132202148, "learning_rate": 1.8681000821501155e-05, "loss": 1.854, "step": 326500 }, { "epoch": 0.06597053545007317, "grad_norm": 7.1345391273498535, "learning_rate": 1.868059683765485e-05, "loss": 1.8228, "step": 326600 }, { "epoch": 0.06599073463422812, "grad_norm": 6.828729152679443, "learning_rate": 1.868019285380855e-05, "loss": 1.8568, "step": 326700 }, { "epoch": 0.06601093381838308, "grad_norm": 8.708218574523926, "learning_rate": 1.867978886996225e-05, "loss": 1.8775, "step": 326800 }, { "epoch": 0.06603113300253803, "grad_norm": 5.61966609954834, "learning_rate": 1.8679384886115944e-05, "loss": 1.8571, "step": 326900 }, { "epoch": 0.06605133218669298, "grad_norm": 6.609435081481934, "learning_rate": 1.8678980902269643e-05, "loss": 1.8653, "step": 327000 }, { "epoch": 0.06605133218669298, "eval_calculated_loss": 8.998481750488281, "eval_loss": 2.1450679302215576, "eval_perplexity": 8090.790758769092, "eval_runtime": 123.5889, "eval_samples_per_second": 8.075, "eval_steps_per_second": 2.023, "step": 327000 }, { "epoch": 0.06607153137084792, "grad_norm": 5.85704231262207, "learning_rate": 1.8678576918423343e-05, "loss": 1.9274, "step": 327100 }, { "epoch": 0.06609173055500288, "grad_norm": 3.763065814971924, "learning_rate": 1.8678172934577038e-05, "loss": 1.872, "step": 327200 }, { "epoch": 0.06611192973915783, "grad_norm": 8.414139747619629, "learning_rate": 1.8677768950730737e-05, "loss": 1.7705, "step": 327300 }, { "epoch": 0.06613212892331279, "grad_norm": 4.755809307098389, "learning_rate": 1.8677364966884433e-05, "loss": 1.8167, "step": 327400 }, { "epoch": 0.06615232810746774, "grad_norm": 5.848087310791016, "learning_rate": 1.8676960983038136e-05, "loss": 1.7709, "step": 327500 }, { "epoch": 0.0661725272916227, "grad_norm": 6.7345051765441895, "learning_rate": 1.867655699919183e-05, "loss": 1.8776, "step": 327600 }, { "epoch": 0.06619272647577765, "grad_norm": 8.043341636657715, "learning_rate": 1.867615301534553e-05, "loss": 1.8422, "step": 327700 }, { "epoch": 0.0662129256599326, "grad_norm": 9.562589645385742, "learning_rate": 1.867574903149923e-05, "loss": 1.8053, "step": 327800 }, { "epoch": 0.06623312484408755, "grad_norm": 4.5761942863464355, "learning_rate": 1.8675345047652925e-05, "loss": 1.7901, "step": 327900 }, { "epoch": 0.0662533240282425, "grad_norm": 4.249932765960693, "learning_rate": 1.8674941063806625e-05, "loss": 1.9021, "step": 328000 }, { "epoch": 0.0662533240282425, "eval_calculated_loss": 9.007538795471191, "eval_loss": 2.142821788787842, "eval_perplexity": 8164.402262774419, "eval_runtime": 124.9417, "eval_samples_per_second": 7.988, "eval_steps_per_second": 2.001, "step": 328000 }, { "epoch": 0.06627352321239745, "grad_norm": 7.4814581871032715, "learning_rate": 1.8674537079960324e-05, "loss": 1.8754, "step": 328100 }, { "epoch": 0.0662937223965524, "grad_norm": 5.908868312835693, "learning_rate": 1.867413309611402e-05, "loss": 1.8201, "step": 328200 }, { "epoch": 0.06631392158070736, "grad_norm": 7.613265514373779, "learning_rate": 1.867372911226772e-05, "loss": 1.8572, "step": 328300 }, { "epoch": 0.06633412076486231, "grad_norm": 7.345720291137695, "learning_rate": 1.8673325128421414e-05, "loss": 1.8952, "step": 328400 }, { "epoch": 0.06635431994901726, "grad_norm": 5.810305118560791, "learning_rate": 1.8672921144575113e-05, "loss": 1.8821, "step": 328500 }, { "epoch": 0.06637451913317222, "grad_norm": 10.45711612701416, "learning_rate": 1.8672517160728812e-05, "loss": 1.8124, "step": 328600 }, { "epoch": 0.06639471831732717, "grad_norm": 7.396759033203125, "learning_rate": 1.867211317688251e-05, "loss": 1.8417, "step": 328700 }, { "epoch": 0.06641491750148211, "grad_norm": 7.554502964019775, "learning_rate": 1.867170919303621e-05, "loss": 1.7989, "step": 328800 }, { "epoch": 0.06643511668563706, "grad_norm": 9.253334999084473, "learning_rate": 1.8671305209189906e-05, "loss": 1.804, "step": 328900 }, { "epoch": 0.06645531586979202, "grad_norm": 10.224385261535645, "learning_rate": 1.8670901225343606e-05, "loss": 1.7868, "step": 329000 }, { "epoch": 0.06645531586979202, "eval_calculated_loss": 8.877494812011719, "eval_loss": 2.1507294178009033, "eval_perplexity": 7168.809007184982, "eval_runtime": 119.8519, "eval_samples_per_second": 8.327, "eval_steps_per_second": 2.086, "step": 329000 }, { "epoch": 0.06647551505394697, "grad_norm": 5.085386753082275, "learning_rate": 1.86704972414973e-05, "loss": 1.8374, "step": 329100 }, { "epoch": 0.06649571423810192, "grad_norm": 8.325943946838379, "learning_rate": 1.8670093257651e-05, "loss": 1.8815, "step": 329200 }, { "epoch": 0.06651591342225688, "grad_norm": 4.560763835906982, "learning_rate": 1.86696892738047e-05, "loss": 1.7534, "step": 329300 }, { "epoch": 0.06653611260641183, "grad_norm": 6.3566508293151855, "learning_rate": 1.8669285289958395e-05, "loss": 1.8617, "step": 329400 }, { "epoch": 0.06655631179056679, "grad_norm": 6.1395792961120605, "learning_rate": 1.8668881306112094e-05, "loss": 1.858, "step": 329500 }, { "epoch": 0.06657651097472173, "grad_norm": 5.806092262268066, "learning_rate": 1.8668477322265794e-05, "loss": 1.7935, "step": 329600 }, { "epoch": 0.06659671015887668, "grad_norm": 11.953869819641113, "learning_rate": 1.8668073338419493e-05, "loss": 1.7925, "step": 329700 }, { "epoch": 0.06661690934303163, "grad_norm": 6.949769496917725, "learning_rate": 1.8667669354573192e-05, "loss": 1.8058, "step": 329800 }, { "epoch": 0.06663710852718659, "grad_norm": 4.378473281860352, "learning_rate": 1.8667265370726888e-05, "loss": 1.8516, "step": 329900 }, { "epoch": 0.06665730771134154, "grad_norm": 20.701711654663086, "learning_rate": 1.8666861386880587e-05, "loss": 1.9139, "step": 330000 }, { "epoch": 0.06665730771134154, "eval_calculated_loss": 8.897851943969727, "eval_loss": 2.1471481323242188, "eval_perplexity": 7316.240952469624, "eval_runtime": 122.4033, "eval_samples_per_second": 8.153, "eval_steps_per_second": 2.042, "step": 330000 }, { "epoch": 0.0666775068954965, "grad_norm": 8.274892807006836, "learning_rate": 1.8666457403034282e-05, "loss": 1.811, "step": 330100 }, { "epoch": 0.06669770607965145, "grad_norm": 10.262772560119629, "learning_rate": 1.866605341918798e-05, "loss": 1.8665, "step": 330200 }, { "epoch": 0.0667179052638064, "grad_norm": 7.604287147521973, "learning_rate": 1.866564943534168e-05, "loss": 1.8355, "step": 330300 }, { "epoch": 0.06673810444796134, "grad_norm": 10.373831748962402, "learning_rate": 1.8665245451495376e-05, "loss": 1.8421, "step": 330400 }, { "epoch": 0.0667583036321163, "grad_norm": 8.127327919006348, "learning_rate": 1.8664841467649076e-05, "loss": 1.859, "step": 330500 }, { "epoch": 0.06677850281627125, "grad_norm": 15.282692909240723, "learning_rate": 1.8664437483802775e-05, "loss": 1.8217, "step": 330600 }, { "epoch": 0.0667987020004262, "grad_norm": 6.548064231872559, "learning_rate": 1.8664033499956474e-05, "loss": 1.7881, "step": 330700 }, { "epoch": 0.06681890118458116, "grad_norm": 9.742500305175781, "learning_rate": 1.866362951611017e-05, "loss": 1.8318, "step": 330800 }, { "epoch": 0.06683910036873611, "grad_norm": 5.52107048034668, "learning_rate": 1.866322553226387e-05, "loss": 1.8403, "step": 330900 }, { "epoch": 0.06685929955289106, "grad_norm": 5.484245777130127, "learning_rate": 1.8662821548417568e-05, "loss": 1.8675, "step": 331000 }, { "epoch": 0.06685929955289106, "eval_calculated_loss": 8.85827922821045, "eval_loss": 2.159273147583008, "eval_perplexity": 7032.371220959185, "eval_runtime": 125.136, "eval_samples_per_second": 7.975, "eval_steps_per_second": 1.998, "step": 331000 }, { "epoch": 0.06687949873704602, "grad_norm": 6.903822422027588, "learning_rate": 1.8662417564571264e-05, "loss": 1.8047, "step": 331100 }, { "epoch": 0.06689969792120096, "grad_norm": 9.767412185668945, "learning_rate": 1.8662013580724963e-05, "loss": 1.779, "step": 331200 }, { "epoch": 0.06691989710535591, "grad_norm": 6.476258277893066, "learning_rate": 1.8661609596878662e-05, "loss": 1.8761, "step": 331300 }, { "epoch": 0.06694009628951086, "grad_norm": 5.922285556793213, "learning_rate": 1.8661205613032358e-05, "loss": 1.7574, "step": 331400 }, { "epoch": 0.06696029547366582, "grad_norm": 6.382295608520508, "learning_rate": 1.8660801629186057e-05, "loss": 1.8634, "step": 331500 }, { "epoch": 0.06698049465782077, "grad_norm": 8.849844932556152, "learning_rate": 1.8660397645339752e-05, "loss": 1.8188, "step": 331600 }, { "epoch": 0.06700069384197573, "grad_norm": 9.001785278320312, "learning_rate": 1.8659993661493455e-05, "loss": 1.8198, "step": 331700 }, { "epoch": 0.06702089302613068, "grad_norm": 6.502404689788818, "learning_rate": 1.865958967764715e-05, "loss": 1.7962, "step": 331800 }, { "epoch": 0.06704109221028563, "grad_norm": 5.607604503631592, "learning_rate": 1.865918569380085e-05, "loss": 1.8991, "step": 331900 }, { "epoch": 0.06706129139444057, "grad_norm": 7.185437202453613, "learning_rate": 1.865878170995455e-05, "loss": 1.7681, "step": 332000 }, { "epoch": 0.06706129139444057, "eval_calculated_loss": 9.0269193649292, "eval_loss": 2.153315305709839, "eval_perplexity": 8324.17628267467, "eval_runtime": 125.356, "eval_samples_per_second": 7.961, "eval_steps_per_second": 1.994, "step": 332000 }, { "epoch": 0.06708149057859553, "grad_norm": 7.87230110168457, "learning_rate": 1.8658377726108245e-05, "loss": 1.8495, "step": 332100 }, { "epoch": 0.06710168976275048, "grad_norm": 8.2437162399292, "learning_rate": 1.8657973742261944e-05, "loss": 1.8046, "step": 332200 }, { "epoch": 0.06712188894690543, "grad_norm": 10.118973731994629, "learning_rate": 1.865756975841564e-05, "loss": 1.7528, "step": 332300 }, { "epoch": 0.06714208813106039, "grad_norm": 5.516053676605225, "learning_rate": 1.865716577456934e-05, "loss": 1.9109, "step": 332400 }, { "epoch": 0.06716228731521534, "grad_norm": 8.445104598999023, "learning_rate": 1.8656761790723038e-05, "loss": 1.802, "step": 332500 }, { "epoch": 0.0671824864993703, "grad_norm": 5.669631481170654, "learning_rate": 1.8656357806876734e-05, "loss": 1.8742, "step": 332600 }, { "epoch": 0.06720268568352525, "grad_norm": 5.211068630218506, "learning_rate": 1.8655953823030436e-05, "loss": 1.8514, "step": 332700 }, { "epoch": 0.06722288486768019, "grad_norm": 7.139090538024902, "learning_rate": 1.8655549839184132e-05, "loss": 1.8948, "step": 332800 }, { "epoch": 0.06724308405183514, "grad_norm": 8.95274543762207, "learning_rate": 1.865514585533783e-05, "loss": 1.792, "step": 332900 }, { "epoch": 0.0672632832359901, "grad_norm": 6.532233715057373, "learning_rate": 1.865474187149153e-05, "loss": 1.8176, "step": 333000 }, { "epoch": 0.0672632832359901, "eval_calculated_loss": 9.158169746398926, "eval_loss": 2.149292230606079, "eval_perplexity": 9491.66900875139, "eval_runtime": 122.6861, "eval_samples_per_second": 8.135, "eval_steps_per_second": 2.038, "step": 333000 }, { "epoch": 0.06728348242014505, "grad_norm": 8.129182815551758, "learning_rate": 1.8654337887645226e-05, "loss": 1.8158, "step": 333100 }, { "epoch": 0.0673036816043, "grad_norm": 8.279404640197754, "learning_rate": 1.8653933903798925e-05, "loss": 1.9418, "step": 333200 }, { "epoch": 0.06732388078845496, "grad_norm": 10.075114250183105, "learning_rate": 1.865352991995262e-05, "loss": 1.9254, "step": 333300 }, { "epoch": 0.06734407997260991, "grad_norm": 11.528436660766602, "learning_rate": 1.865312593610632e-05, "loss": 1.8599, "step": 333400 }, { "epoch": 0.06736427915676486, "grad_norm": 8.502030372619629, "learning_rate": 1.865272195226002e-05, "loss": 1.8702, "step": 333500 }, { "epoch": 0.0673844783409198, "grad_norm": 7.947624206542969, "learning_rate": 1.8652317968413715e-05, "loss": 1.8684, "step": 333600 }, { "epoch": 0.06740467752507476, "grad_norm": 8.906253814697266, "learning_rate": 1.8651913984567414e-05, "loss": 1.7888, "step": 333700 }, { "epoch": 0.06742487670922971, "grad_norm": 4.653687477111816, "learning_rate": 1.8651510000721113e-05, "loss": 1.8347, "step": 333800 }, { "epoch": 0.06744507589338467, "grad_norm": 8.743926048278809, "learning_rate": 1.8651106016874812e-05, "loss": 1.7979, "step": 333900 }, { "epoch": 0.06746527507753962, "grad_norm": 6.782896995544434, "learning_rate": 1.8650702033028508e-05, "loss": 1.9499, "step": 334000 }, { "epoch": 0.06746527507753962, "eval_calculated_loss": 9.151371002197266, "eval_loss": 2.1488850116729736, "eval_perplexity": 9427.356449160885, "eval_runtime": 120.7691, "eval_samples_per_second": 8.264, "eval_steps_per_second": 2.07, "step": 334000 }, { "epoch": 0.06748547426169457, "grad_norm": 7.068550109863281, "learning_rate": 1.8650298049182207e-05, "loss": 1.8579, "step": 334100 }, { "epoch": 0.06750567344584953, "grad_norm": 8.053393363952637, "learning_rate": 1.8649894065335906e-05, "loss": 1.9265, "step": 334200 }, { "epoch": 0.06752587263000448, "grad_norm": 8.761486053466797, "learning_rate": 1.8649490081489602e-05, "loss": 1.7729, "step": 334300 }, { "epoch": 0.06754607181415942, "grad_norm": 8.14766788482666, "learning_rate": 1.86490860976433e-05, "loss": 1.909, "step": 334400 }, { "epoch": 0.06756627099831437, "grad_norm": 6.7443156242370605, "learning_rate": 1.8648682113797e-05, "loss": 1.788, "step": 334500 }, { "epoch": 0.06758647018246933, "grad_norm": 4.790045261383057, "learning_rate": 1.8648278129950696e-05, "loss": 1.8083, "step": 334600 }, { "epoch": 0.06760666936662428, "grad_norm": 9.05424976348877, "learning_rate": 1.8647874146104395e-05, "loss": 1.8551, "step": 334700 }, { "epoch": 0.06762686855077923, "grad_norm": 7.163120269775391, "learning_rate": 1.8647470162258094e-05, "loss": 1.8663, "step": 334800 }, { "epoch": 0.06764706773493419, "grad_norm": 10.19705581665039, "learning_rate": 1.8647066178411793e-05, "loss": 1.8408, "step": 334900 }, { "epoch": 0.06766726691908914, "grad_norm": 8.058040618896484, "learning_rate": 1.864666219456549e-05, "loss": 1.8956, "step": 335000 }, { "epoch": 0.06766726691908914, "eval_calculated_loss": 9.397428512573242, "eval_loss": 2.1477901935577393, "eval_perplexity": 12057.335544419153, "eval_runtime": 123.7392, "eval_samples_per_second": 8.065, "eval_steps_per_second": 2.02, "step": 335000 }, { "epoch": 0.0676874661032441, "grad_norm": 5.644222736358643, "learning_rate": 1.8646258210719188e-05, "loss": 1.8355, "step": 335100 }, { "epoch": 0.06770766528739904, "grad_norm": 4.8472161293029785, "learning_rate": 1.8645854226872887e-05, "loss": 1.764, "step": 335200 }, { "epoch": 0.06772786447155399, "grad_norm": 7.660960674285889, "learning_rate": 1.8645450243026583e-05, "loss": 1.8146, "step": 335300 }, { "epoch": 0.06774806365570894, "grad_norm": 7.652428150177002, "learning_rate": 1.8645046259180282e-05, "loss": 1.8036, "step": 335400 }, { "epoch": 0.0677682628398639, "grad_norm": 3.7926716804504395, "learning_rate": 1.864464227533398e-05, "loss": 1.8655, "step": 335500 }, { "epoch": 0.06778846202401885, "grad_norm": 5.855233669281006, "learning_rate": 1.8644238291487677e-05, "loss": 1.8586, "step": 335600 }, { "epoch": 0.0678086612081738, "grad_norm": 7.653592586517334, "learning_rate": 1.8643834307641376e-05, "loss": 1.8402, "step": 335700 }, { "epoch": 0.06782886039232876, "grad_norm": 8.461165428161621, "learning_rate": 1.8643430323795075e-05, "loss": 1.7665, "step": 335800 }, { "epoch": 0.06784905957648371, "grad_norm": 8.180864334106445, "learning_rate": 1.8643026339948774e-05, "loss": 1.9229, "step": 335900 }, { "epoch": 0.06786925876063865, "grad_norm": 6.23255729675293, "learning_rate": 1.864262235610247e-05, "loss": 1.7826, "step": 336000 }, { "epoch": 0.06786925876063865, "eval_calculated_loss": 9.125353813171387, "eval_loss": 2.1530556678771973, "eval_perplexity": 9185.246303873415, "eval_runtime": 122.1743, "eval_samples_per_second": 8.169, "eval_steps_per_second": 2.046, "step": 336000 }, { "epoch": 0.0678894579447936, "grad_norm": 8.61579418182373, "learning_rate": 1.864221837225617e-05, "loss": 1.7723, "step": 336100 }, { "epoch": 0.06790965712894856, "grad_norm": 8.955265998840332, "learning_rate": 1.8641814388409868e-05, "loss": 1.8203, "step": 336200 }, { "epoch": 0.06792985631310351, "grad_norm": 6.204704761505127, "learning_rate": 1.8641410404563564e-05, "loss": 1.7931, "step": 336300 }, { "epoch": 0.06795005549725847, "grad_norm": 3.51863431930542, "learning_rate": 1.8641006420717263e-05, "loss": 1.8467, "step": 336400 }, { "epoch": 0.06797025468141342, "grad_norm": 8.74953556060791, "learning_rate": 1.864060243687096e-05, "loss": 1.8377, "step": 336500 }, { "epoch": 0.06799045386556837, "grad_norm": 5.908233642578125, "learning_rate": 1.8640198453024658e-05, "loss": 1.8511, "step": 336600 }, { "epoch": 0.06801065304972333, "grad_norm": 7.493351459503174, "learning_rate": 1.8639794469178357e-05, "loss": 1.8127, "step": 336700 }, { "epoch": 0.06803085223387827, "grad_norm": 8.296442985534668, "learning_rate": 1.8639390485332053e-05, "loss": 1.8373, "step": 336800 }, { "epoch": 0.06805105141803322, "grad_norm": 5.0034589767456055, "learning_rate": 1.8638986501485755e-05, "loss": 1.8725, "step": 336900 }, { "epoch": 0.06807125060218817, "grad_norm": 7.133185386657715, "learning_rate": 1.863858251763945e-05, "loss": 1.8535, "step": 337000 }, { "epoch": 0.06807125060218817, "eval_calculated_loss": 9.148462295532227, "eval_loss": 2.1558637619018555, "eval_perplexity": 9399.974876410408, "eval_runtime": 123.6747, "eval_samples_per_second": 8.07, "eval_steps_per_second": 2.021, "step": 337000 }, { "epoch": 0.06809144978634313, "grad_norm": 7.875904560089111, "learning_rate": 1.863817853379315e-05, "loss": 1.9293, "step": 337100 }, { "epoch": 0.06811164897049808, "grad_norm": 7.559472560882568, "learning_rate": 1.8637774549946846e-05, "loss": 1.8633, "step": 337200 }, { "epoch": 0.06813184815465304, "grad_norm": 4.795033931732178, "learning_rate": 1.8637370566100545e-05, "loss": 1.8501, "step": 337300 }, { "epoch": 0.06815204733880799, "grad_norm": 5.764415740966797, "learning_rate": 1.8636966582254244e-05, "loss": 1.7977, "step": 337400 }, { "epoch": 0.06817224652296294, "grad_norm": 4.645477294921875, "learning_rate": 1.863656259840794e-05, "loss": 1.7999, "step": 337500 }, { "epoch": 0.0681924457071179, "grad_norm": 10.065559387207031, "learning_rate": 1.863615861456164e-05, "loss": 1.8738, "step": 337600 }, { "epoch": 0.06821264489127284, "grad_norm": 6.380617141723633, "learning_rate": 1.8635754630715338e-05, "loss": 1.8513, "step": 337700 }, { "epoch": 0.06823284407542779, "grad_norm": 5.791234493255615, "learning_rate": 1.8635350646869034e-05, "loss": 1.76, "step": 337800 }, { "epoch": 0.06825304325958274, "grad_norm": 4.9152703285217285, "learning_rate": 1.8634946663022737e-05, "loss": 1.8738, "step": 337900 }, { "epoch": 0.0682732424437377, "grad_norm": 6.076180458068848, "learning_rate": 1.8634542679176432e-05, "loss": 1.9129, "step": 338000 }, { "epoch": 0.0682732424437377, "eval_calculated_loss": 9.211630821228027, "eval_loss": 2.147207021713257, "eval_perplexity": 10012.912822397515, "eval_runtime": 123.4548, "eval_samples_per_second": 8.084, "eval_steps_per_second": 2.025, "step": 338000 }, { "epoch": 0.06829344162789265, "grad_norm": 7.582123756408691, "learning_rate": 1.863413869533013e-05, "loss": 1.8437, "step": 338100 }, { "epoch": 0.0683136408120476, "grad_norm": 6.3378520011901855, "learning_rate": 1.8633734711483827e-05, "loss": 1.8144, "step": 338200 }, { "epoch": 0.06833383999620256, "grad_norm": 5.450493335723877, "learning_rate": 1.8633330727637526e-05, "loss": 1.8338, "step": 338300 }, { "epoch": 0.06835403918035751, "grad_norm": 6.421515464782715, "learning_rate": 1.8632926743791225e-05, "loss": 1.8463, "step": 338400 }, { "epoch": 0.06837423836451245, "grad_norm": 13.073742866516113, "learning_rate": 1.863252275994492e-05, "loss": 1.8266, "step": 338500 }, { "epoch": 0.0683944375486674, "grad_norm": 3.9386167526245117, "learning_rate": 1.863211877609862e-05, "loss": 1.8468, "step": 338600 }, { "epoch": 0.06841463673282236, "grad_norm": 5.853858947753906, "learning_rate": 1.863171479225232e-05, "loss": 1.9004, "step": 338700 }, { "epoch": 0.06843483591697731, "grad_norm": 7.767780303955078, "learning_rate": 1.8631310808406015e-05, "loss": 1.8248, "step": 338800 }, { "epoch": 0.06845503510113227, "grad_norm": 5.747520446777344, "learning_rate": 1.8630906824559714e-05, "loss": 1.8107, "step": 338900 }, { "epoch": 0.06847523428528722, "grad_norm": 12.701042175292969, "learning_rate": 1.8630502840713413e-05, "loss": 1.9478, "step": 339000 }, { "epoch": 0.06847523428528722, "eval_calculated_loss": 9.104517936706543, "eval_loss": 2.1462202072143555, "eval_perplexity": 8995.84368350963, "eval_runtime": 122.8432, "eval_samples_per_second": 8.124, "eval_steps_per_second": 2.035, "step": 339000 }, { "epoch": 0.06849543346944217, "grad_norm": 7.234545707702637, "learning_rate": 1.8630098856867113e-05, "loss": 1.8065, "step": 339100 }, { "epoch": 0.06851563265359713, "grad_norm": 8.34564208984375, "learning_rate": 1.8629694873020808e-05, "loss": 1.8475, "step": 339200 }, { "epoch": 0.06853583183775207, "grad_norm": 2.770965099334717, "learning_rate": 1.8629290889174507e-05, "loss": 1.7967, "step": 339300 }, { "epoch": 0.06855603102190702, "grad_norm": 9.013152122497559, "learning_rate": 1.8628886905328207e-05, "loss": 1.8357, "step": 339400 }, { "epoch": 0.06857623020606197, "grad_norm": 9.449689865112305, "learning_rate": 1.8628482921481902e-05, "loss": 1.9289, "step": 339500 }, { "epoch": 0.06859642939021693, "grad_norm": 7.974453926086426, "learning_rate": 1.86280789376356e-05, "loss": 1.8883, "step": 339600 }, { "epoch": 0.06861662857437188, "grad_norm": 8.860564231872559, "learning_rate": 1.8627674953789297e-05, "loss": 1.8888, "step": 339700 }, { "epoch": 0.06863682775852684, "grad_norm": 9.055566787719727, "learning_rate": 1.8627270969942996e-05, "loss": 1.8662, "step": 339800 }, { "epoch": 0.06865702694268179, "grad_norm": 5.951841831207275, "learning_rate": 1.8626866986096695e-05, "loss": 1.807, "step": 339900 }, { "epoch": 0.06867722612683674, "grad_norm": 9.410130500793457, "learning_rate": 1.8626463002250395e-05, "loss": 1.8613, "step": 340000 }, { "epoch": 0.06867722612683674, "eval_calculated_loss": 9.038171768188477, "eval_loss": 2.1460988521575928, "eval_perplexity": 8418.372242576323, "eval_runtime": 123.2288, "eval_samples_per_second": 8.099, "eval_steps_per_second": 2.029, "step": 340000 }, { "epoch": 0.06869742531099168, "grad_norm": 8.75412654876709, "learning_rate": 1.8626059018404094e-05, "loss": 1.8357, "step": 340100 }, { "epoch": 0.06871762449514664, "grad_norm": 8.126481056213379, "learning_rate": 1.862565503455779e-05, "loss": 1.855, "step": 340200 }, { "epoch": 0.06873782367930159, "grad_norm": 7.4197468757629395, "learning_rate": 1.862525105071149e-05, "loss": 1.8373, "step": 340300 }, { "epoch": 0.06875802286345654, "grad_norm": 7.687781810760498, "learning_rate": 1.8624847066865188e-05, "loss": 1.846, "step": 340400 }, { "epoch": 0.0687782220476115, "grad_norm": 6.437399387359619, "learning_rate": 1.8624443083018883e-05, "loss": 1.8694, "step": 340500 }, { "epoch": 0.06879842123176645, "grad_norm": 4.681427001953125, "learning_rate": 1.8624039099172583e-05, "loss": 1.9086, "step": 340600 }, { "epoch": 0.0688186204159214, "grad_norm": 8.881457328796387, "learning_rate": 1.8623635115326278e-05, "loss": 1.9271, "step": 340700 }, { "epoch": 0.06883881960007636, "grad_norm": 8.801932334899902, "learning_rate": 1.8623231131479977e-05, "loss": 1.8389, "step": 340800 }, { "epoch": 0.0688590187842313, "grad_norm": 8.934637069702148, "learning_rate": 1.8622827147633676e-05, "loss": 1.8057, "step": 340900 }, { "epoch": 0.06887921796838625, "grad_norm": 6.352433204650879, "learning_rate": 1.8622423163787376e-05, "loss": 1.832, "step": 341000 }, { "epoch": 0.06887921796838625, "eval_calculated_loss": 9.173539161682129, "eval_loss": 2.1507656574249268, "eval_perplexity": 9638.677232868507, "eval_runtime": 121.0057, "eval_samples_per_second": 8.248, "eval_steps_per_second": 2.066, "step": 341000 }, { "epoch": 0.0688994171525412, "grad_norm": 5.921281814575195, "learning_rate": 1.8622019179941075e-05, "loss": 1.8382, "step": 341100 }, { "epoch": 0.06891961633669616, "grad_norm": 7.059607982635498, "learning_rate": 1.862161519609477e-05, "loss": 1.8528, "step": 341200 }, { "epoch": 0.06893981552085111, "grad_norm": 10.09842300415039, "learning_rate": 1.862121121224847e-05, "loss": 1.8988, "step": 341300 }, { "epoch": 0.06896001470500607, "grad_norm": 5.364804267883301, "learning_rate": 1.8620807228402165e-05, "loss": 1.9187, "step": 341400 }, { "epoch": 0.06898021388916102, "grad_norm": 5.6972761154174805, "learning_rate": 1.8620403244555864e-05, "loss": 1.8714, "step": 341500 }, { "epoch": 0.06900041307331597, "grad_norm": 9.771142959594727, "learning_rate": 1.8619999260709564e-05, "loss": 1.8611, "step": 341600 }, { "epoch": 0.06902061225747091, "grad_norm": 7.618026256561279, "learning_rate": 1.861959527686326e-05, "loss": 1.8958, "step": 341700 }, { "epoch": 0.06904081144162587, "grad_norm": 6.77806282043457, "learning_rate": 1.861919129301696e-05, "loss": 1.8622, "step": 341800 }, { "epoch": 0.06906101062578082, "grad_norm": 9.544051170349121, "learning_rate": 1.8618787309170658e-05, "loss": 1.7982, "step": 341900 }, { "epoch": 0.06908120980993578, "grad_norm": 9.320150375366211, "learning_rate": 1.8618383325324353e-05, "loss": 1.8482, "step": 342000 }, { "epoch": 0.06908120980993578, "eval_calculated_loss": 9.19436264038086, "eval_loss": 2.1693637371063232, "eval_perplexity": 9841.492352447087, "eval_runtime": 122.7134, "eval_samples_per_second": 8.133, "eval_steps_per_second": 2.037, "step": 342000 }, { "epoch": 0.06910140899409073, "grad_norm": 4.522839546203613, "learning_rate": 1.8617979341478052e-05, "loss": 1.8071, "step": 342100 }, { "epoch": 0.06912160817824568, "grad_norm": 7.0868611335754395, "learning_rate": 1.861757535763175e-05, "loss": 1.861, "step": 342200 }, { "epoch": 0.06914180736240064, "grad_norm": 9.479244232177734, "learning_rate": 1.861717137378545e-05, "loss": 1.8165, "step": 342300 }, { "epoch": 0.06916200654655559, "grad_norm": 7.682100296020508, "learning_rate": 1.8616767389939146e-05, "loss": 1.9207, "step": 342400 }, { "epoch": 0.06918220573071053, "grad_norm": 5.122099876403809, "learning_rate": 1.8616363406092846e-05, "loss": 1.8692, "step": 342500 }, { "epoch": 0.06920240491486548, "grad_norm": 7.9666829109191895, "learning_rate": 1.8615959422246545e-05, "loss": 1.913, "step": 342600 }, { "epoch": 0.06922260409902044, "grad_norm": 6.215741157531738, "learning_rate": 1.861555543840024e-05, "loss": 1.895, "step": 342700 }, { "epoch": 0.06924280328317539, "grad_norm": 7.832880973815918, "learning_rate": 1.861515145455394e-05, "loss": 1.7911, "step": 342800 }, { "epoch": 0.06926300246733035, "grad_norm": 8.017600059509277, "learning_rate": 1.8614747470707635e-05, "loss": 1.81, "step": 342900 }, { "epoch": 0.0692832016514853, "grad_norm": 8.839098930358887, "learning_rate": 1.8614343486861334e-05, "loss": 1.8357, "step": 343000 }, { "epoch": 0.0692832016514853, "eval_calculated_loss": 8.988046646118164, "eval_loss": 2.1612861156463623, "eval_perplexity": 8006.801493281256, "eval_runtime": 121.8032, "eval_samples_per_second": 8.194, "eval_steps_per_second": 2.052, "step": 343000 }, { "epoch": 0.06930340083564025, "grad_norm": 9.032402038574219, "learning_rate": 1.8613939503015034e-05, "loss": 1.8719, "step": 343100 }, { "epoch": 0.0693236000197952, "grad_norm": 8.93995189666748, "learning_rate": 1.8613535519168733e-05, "loss": 1.8221, "step": 343200 }, { "epoch": 0.06934379920395015, "grad_norm": 8.102910041809082, "learning_rate": 1.8613131535322432e-05, "loss": 1.9252, "step": 343300 }, { "epoch": 0.0693639983881051, "grad_norm": 8.75213623046875, "learning_rate": 1.8612727551476128e-05, "loss": 1.8871, "step": 343400 }, { "epoch": 0.06938419757226005, "grad_norm": 6.687302589416504, "learning_rate": 1.8612323567629827e-05, "loss": 1.855, "step": 343500 }, { "epoch": 0.06940439675641501, "grad_norm": 13.352339744567871, "learning_rate": 1.8611919583783526e-05, "loss": 1.8613, "step": 343600 }, { "epoch": 0.06942459594056996, "grad_norm": 6.663467884063721, "learning_rate": 1.861151559993722e-05, "loss": 1.7998, "step": 343700 }, { "epoch": 0.06944479512472491, "grad_norm": 6.451165676116943, "learning_rate": 1.861111161609092e-05, "loss": 1.8187, "step": 343800 }, { "epoch": 0.06946499430887987, "grad_norm": 9.153966903686523, "learning_rate": 1.8610707632244616e-05, "loss": 1.8009, "step": 343900 }, { "epoch": 0.06948519349303482, "grad_norm": 4.286011695861816, "learning_rate": 1.8610303648398316e-05, "loss": 1.7964, "step": 344000 }, { "epoch": 0.06948519349303482, "eval_calculated_loss": 9.183454513549805, "eval_loss": 2.1528704166412354, "eval_perplexity": 9734.723488490088, "eval_runtime": 125.1701, "eval_samples_per_second": 7.973, "eval_steps_per_second": 1.997, "step": 344000 }, { "epoch": 0.06950539267718976, "grad_norm": 8.502347946166992, "learning_rate": 1.8609899664552015e-05, "loss": 1.7784, "step": 344100 }, { "epoch": 0.06952559186134472, "grad_norm": 7.536856651306152, "learning_rate": 1.8609495680705714e-05, "loss": 1.8743, "step": 344200 }, { "epoch": 0.06954579104549967, "grad_norm": 8.510319709777832, "learning_rate": 1.8609091696859413e-05, "loss": 1.8021, "step": 344300 }, { "epoch": 0.06956599022965462, "grad_norm": 5.201436519622803, "learning_rate": 1.860868771301311e-05, "loss": 1.8516, "step": 344400 }, { "epoch": 0.06958618941380958, "grad_norm": 7.465777397155762, "learning_rate": 1.8608283729166808e-05, "loss": 1.794, "step": 344500 }, { "epoch": 0.06960638859796453, "grad_norm": 7.360861301422119, "learning_rate": 1.8607879745320504e-05, "loss": 1.7744, "step": 344600 }, { "epoch": 0.06962658778211948, "grad_norm": 4.471718788146973, "learning_rate": 1.8607475761474203e-05, "loss": 1.8287, "step": 344700 }, { "epoch": 0.06964678696627444, "grad_norm": 8.879964828491211, "learning_rate": 1.8607071777627902e-05, "loss": 1.7699, "step": 344800 }, { "epoch": 0.06966698615042938, "grad_norm": 9.488407135009766, "learning_rate": 1.8606667793781598e-05, "loss": 1.7458, "step": 344900 }, { "epoch": 0.06968718533458433, "grad_norm": 3.3425865173339844, "learning_rate": 1.8606263809935297e-05, "loss": 1.7678, "step": 345000 }, { "epoch": 0.06968718533458433, "eval_calculated_loss": 9.22260570526123, "eval_loss": 2.1564371585845947, "eval_perplexity": 10123.408609601707, "eval_runtime": 120.6747, "eval_samples_per_second": 8.27, "eval_steps_per_second": 2.072, "step": 345000 }, { "epoch": 0.06970738451873928, "grad_norm": 6.377954483032227, "learning_rate": 1.8605859826088996e-05, "loss": 1.8152, "step": 345100 }, { "epoch": 0.06972758370289424, "grad_norm": 5.628783702850342, "learning_rate": 1.8605455842242695e-05, "loss": 1.7985, "step": 345200 }, { "epoch": 0.06974778288704919, "grad_norm": 8.595008850097656, "learning_rate": 1.860505185839639e-05, "loss": 1.8848, "step": 345300 }, { "epoch": 0.06976798207120415, "grad_norm": 6.5008544921875, "learning_rate": 1.860464787455009e-05, "loss": 1.8887, "step": 345400 }, { "epoch": 0.0697881812553591, "grad_norm": 8.828642845153809, "learning_rate": 1.860424389070379e-05, "loss": 1.9186, "step": 345500 }, { "epoch": 0.06980838043951405, "grad_norm": 9.99146556854248, "learning_rate": 1.8603839906857485e-05, "loss": 1.7781, "step": 345600 }, { "epoch": 0.069828579623669, "grad_norm": 6.867810249328613, "learning_rate": 1.8603435923011184e-05, "loss": 1.8346, "step": 345700 }, { "epoch": 0.06984877880782395, "grad_norm": 9.729251861572266, "learning_rate": 1.8603031939164883e-05, "loss": 1.8813, "step": 345800 }, { "epoch": 0.0698689779919789, "grad_norm": 4.2174072265625, "learning_rate": 1.860262795531858e-05, "loss": 1.7812, "step": 345900 }, { "epoch": 0.06988917717613385, "grad_norm": 8.236374855041504, "learning_rate": 1.8602223971472278e-05, "loss": 1.8575, "step": 346000 }, { "epoch": 0.06988917717613385, "eval_calculated_loss": 9.131367683410645, "eval_loss": 2.170748710632324, "eval_perplexity": 9240.65161645554, "eval_runtime": 120.8883, "eval_samples_per_second": 8.256, "eval_steps_per_second": 2.068, "step": 346000 }, { "epoch": 0.06990937636028881, "grad_norm": 6.073222637176514, "learning_rate": 1.8601819987625977e-05, "loss": 1.8751, "step": 346100 }, { "epoch": 0.06992957554444376, "grad_norm": 7.34204626083374, "learning_rate": 1.8601416003779676e-05, "loss": 1.7926, "step": 346200 }, { "epoch": 0.06994977472859872, "grad_norm": 5.715147018432617, "learning_rate": 1.8601012019933372e-05, "loss": 1.8298, "step": 346300 }, { "epoch": 0.06996997391275367, "grad_norm": 5.053602695465088, "learning_rate": 1.860060803608707e-05, "loss": 1.8205, "step": 346400 }, { "epoch": 0.06999017309690862, "grad_norm": 6.408377170562744, "learning_rate": 1.860020405224077e-05, "loss": 1.9029, "step": 346500 }, { "epoch": 0.07001037228106356, "grad_norm": 7.689845085144043, "learning_rate": 1.8599800068394466e-05, "loss": 1.8412, "step": 346600 }, { "epoch": 0.07003057146521852, "grad_norm": 4.945607662200928, "learning_rate": 1.8599396084548165e-05, "loss": 1.7266, "step": 346700 }, { "epoch": 0.07005077064937347, "grad_norm": 7.932934284210205, "learning_rate": 1.8598992100701864e-05, "loss": 1.7946, "step": 346800 }, { "epoch": 0.07007096983352842, "grad_norm": 11.15888500213623, "learning_rate": 1.859858811685556e-05, "loss": 1.887, "step": 346900 }, { "epoch": 0.07009116901768338, "grad_norm": 8.876708030700684, "learning_rate": 1.859818413300926e-05, "loss": 1.8884, "step": 347000 }, { "epoch": 0.07009116901768338, "eval_calculated_loss": 9.055581092834473, "eval_loss": 2.165325164794922, "eval_perplexity": 8566.213592953256, "eval_runtime": 123.2436, "eval_samples_per_second": 8.098, "eval_steps_per_second": 2.029, "step": 347000 }, { "epoch": 0.07011136820183833, "grad_norm": 10.5900297164917, "learning_rate": 1.8597780149162955e-05, "loss": 1.8189, "step": 347100 }, { "epoch": 0.07013156738599328, "grad_norm": 10.196020126342773, "learning_rate": 1.8597376165316657e-05, "loss": 1.8028, "step": 347200 }, { "epoch": 0.07015176657014824, "grad_norm": 9.807719230651855, "learning_rate": 1.8596972181470353e-05, "loss": 1.9316, "step": 347300 }, { "epoch": 0.07017196575430318, "grad_norm": 6.71481466293335, "learning_rate": 1.8596568197624052e-05, "loss": 1.8033, "step": 347400 }, { "epoch": 0.07019216493845813, "grad_norm": 7.719909191131592, "learning_rate": 1.859616421377775e-05, "loss": 1.7627, "step": 347500 }, { "epoch": 0.07021236412261309, "grad_norm": 8.530470848083496, "learning_rate": 1.8595760229931447e-05, "loss": 1.9073, "step": 347600 }, { "epoch": 0.07023256330676804, "grad_norm": 5.383358001708984, "learning_rate": 1.8595356246085146e-05, "loss": 1.8921, "step": 347700 }, { "epoch": 0.070252762490923, "grad_norm": 8.212226867675781, "learning_rate": 1.8594952262238842e-05, "loss": 1.8546, "step": 347800 }, { "epoch": 0.07027296167507795, "grad_norm": 7.708028793334961, "learning_rate": 1.859454827839254e-05, "loss": 1.9667, "step": 347900 }, { "epoch": 0.0702931608592329, "grad_norm": 9.48918628692627, "learning_rate": 1.859414429454624e-05, "loss": 1.9097, "step": 348000 }, { "epoch": 0.0702931608592329, "eval_calculated_loss": 9.12944221496582, "eval_loss": 2.1503562927246094, "eval_perplexity": 9222.876151898261, "eval_runtime": 122.4853, "eval_samples_per_second": 8.148, "eval_steps_per_second": 2.041, "step": 348000 }, { "epoch": 0.07031336004338785, "grad_norm": 7.447887420654297, "learning_rate": 1.8593740310699936e-05, "loss": 1.7523, "step": 348100 }, { "epoch": 0.0703335592275428, "grad_norm": 7.648500919342041, "learning_rate": 1.8593336326853635e-05, "loss": 1.8535, "step": 348200 }, { "epoch": 0.07035375841169775, "grad_norm": 8.476592063903809, "learning_rate": 1.8592932343007334e-05, "loss": 1.8091, "step": 348300 }, { "epoch": 0.0703739575958527, "grad_norm": 9.702569961547852, "learning_rate": 1.8592528359161033e-05, "loss": 1.8196, "step": 348400 }, { "epoch": 0.07039415678000766, "grad_norm": 9.26444149017334, "learning_rate": 1.8592124375314732e-05, "loss": 1.7941, "step": 348500 }, { "epoch": 0.07041435596416261, "grad_norm": 4.976348876953125, "learning_rate": 1.8591720391468428e-05, "loss": 1.8084, "step": 348600 }, { "epoch": 0.07043455514831756, "grad_norm": 8.864372253417969, "learning_rate": 1.8591316407622127e-05, "loss": 1.8453, "step": 348700 }, { "epoch": 0.07045475433247252, "grad_norm": 7.5134687423706055, "learning_rate": 1.8590912423775823e-05, "loss": 1.7983, "step": 348800 }, { "epoch": 0.07047495351662747, "grad_norm": 7.6541571617126465, "learning_rate": 1.8590508439929522e-05, "loss": 1.8299, "step": 348900 }, { "epoch": 0.07049515270078241, "grad_norm": 4.2730183601379395, "learning_rate": 1.859010445608322e-05, "loss": 1.8386, "step": 349000 }, { "epoch": 0.07049515270078241, "eval_calculated_loss": 9.113224983215332, "eval_loss": 2.1554553508758545, "eval_perplexity": 9074.512904119472, "eval_runtime": 121.3266, "eval_samples_per_second": 8.226, "eval_steps_per_second": 2.061, "step": 349000 }, { "epoch": 0.07051535188493736, "grad_norm": 7.929989814758301, "learning_rate": 1.8589700472236917e-05, "loss": 1.8828, "step": 349100 }, { "epoch": 0.07053555106909232, "grad_norm": 9.707295417785645, "learning_rate": 1.8589296488390616e-05, "loss": 1.8632, "step": 349200 }, { "epoch": 0.07055575025324727, "grad_norm": 9.597343444824219, "learning_rate": 1.8588892504544315e-05, "loss": 1.786, "step": 349300 }, { "epoch": 0.07057594943740222, "grad_norm": 4.616622447967529, "learning_rate": 1.8588488520698014e-05, "loss": 1.8338, "step": 349400 }, { "epoch": 0.07059614862155718, "grad_norm": 8.571489334106445, "learning_rate": 1.858808453685171e-05, "loss": 1.7981, "step": 349500 }, { "epoch": 0.07061634780571213, "grad_norm": 6.758572101593018, "learning_rate": 1.858768055300541e-05, "loss": 1.8149, "step": 349600 }, { "epoch": 0.07063654698986709, "grad_norm": 6.28740930557251, "learning_rate": 1.8587276569159108e-05, "loss": 1.8797, "step": 349700 }, { "epoch": 0.07065674617402203, "grad_norm": 5.860518932342529, "learning_rate": 1.8586872585312804e-05, "loss": 1.8358, "step": 349800 }, { "epoch": 0.07067694535817698, "grad_norm": 8.907389640808105, "learning_rate": 1.8586468601466503e-05, "loss": 1.8316, "step": 349900 }, { "epoch": 0.07069714454233193, "grad_norm": 7.607327461242676, "learning_rate": 1.8586064617620202e-05, "loss": 1.8294, "step": 350000 }, { "epoch": 0.07069714454233193, "eval_calculated_loss": 9.096264839172363, "eval_loss": 2.173569679260254, "eval_perplexity": 8921.905636824633, "eval_runtime": 121.6573, "eval_samples_per_second": 8.203, "eval_steps_per_second": 2.055, "step": 350000 }, { "epoch": 0.07071734372648689, "grad_norm": 5.5535783767700195, "learning_rate": 1.8585660633773898e-05, "loss": 1.833, "step": 350100 }, { "epoch": 0.07073754291064184, "grad_norm": 3.506300210952759, "learning_rate": 1.8585256649927597e-05, "loss": 1.8013, "step": 350200 }, { "epoch": 0.0707577420947968, "grad_norm": 5.719552516937256, "learning_rate": 1.8584852666081296e-05, "loss": 1.8474, "step": 350300 }, { "epoch": 0.07077794127895175, "grad_norm": 5.29328727722168, "learning_rate": 1.8584448682234995e-05, "loss": 1.7024, "step": 350400 }, { "epoch": 0.0707981404631067, "grad_norm": 10.60397720336914, "learning_rate": 1.858404469838869e-05, "loss": 1.753, "step": 350500 }, { "epoch": 0.07081833964726164, "grad_norm": 5.180353164672852, "learning_rate": 1.858364071454239e-05, "loss": 1.808, "step": 350600 }, { "epoch": 0.0708385388314166, "grad_norm": 6.786312103271484, "learning_rate": 1.858323673069609e-05, "loss": 1.8225, "step": 350700 }, { "epoch": 0.07085873801557155, "grad_norm": 10.33619213104248, "learning_rate": 1.8582832746849785e-05, "loss": 1.8583, "step": 350800 }, { "epoch": 0.0708789371997265, "grad_norm": 4.851522922515869, "learning_rate": 1.8582428763003484e-05, "loss": 1.8638, "step": 350900 }, { "epoch": 0.07089913638388146, "grad_norm": 6.220249176025391, "learning_rate": 1.8582024779157183e-05, "loss": 1.8254, "step": 351000 }, { "epoch": 0.07089913638388146, "eval_calculated_loss": 9.190866470336914, "eval_loss": 2.156674385070801, "eval_perplexity": 9807.144898952065, "eval_runtime": 123.4635, "eval_samples_per_second": 8.083, "eval_steps_per_second": 2.025, "step": 351000 }, { "epoch": 0.07091933556803641, "grad_norm": 8.914176940917969, "learning_rate": 1.858162079531088e-05, "loss": 1.9078, "step": 351100 }, { "epoch": 0.07093953475219136, "grad_norm": 4.679831504821777, "learning_rate": 1.8581216811464578e-05, "loss": 1.845, "step": 351200 }, { "epoch": 0.07095973393634632, "grad_norm": 7.495253086090088, "learning_rate": 1.8580812827618274e-05, "loss": 1.8088, "step": 351300 }, { "epoch": 0.07097993312050126, "grad_norm": 7.770479202270508, "learning_rate": 1.8580408843771977e-05, "loss": 1.9244, "step": 351400 }, { "epoch": 0.07100013230465621, "grad_norm": 9.251537322998047, "learning_rate": 1.8580004859925672e-05, "loss": 1.9184, "step": 351500 }, { "epoch": 0.07102033148881116, "grad_norm": 6.90657901763916, "learning_rate": 1.857960087607937e-05, "loss": 1.7883, "step": 351600 }, { "epoch": 0.07104053067296612, "grad_norm": 8.140374183654785, "learning_rate": 1.857919689223307e-05, "loss": 1.8592, "step": 351700 }, { "epoch": 0.07106072985712107, "grad_norm": 8.931591987609863, "learning_rate": 1.8578792908386766e-05, "loss": 1.8753, "step": 351800 }, { "epoch": 0.07108092904127603, "grad_norm": 6.551872253417969, "learning_rate": 1.8578388924540465e-05, "loss": 1.879, "step": 351900 }, { "epoch": 0.07110112822543098, "grad_norm": 7.5000128746032715, "learning_rate": 1.857798494069416e-05, "loss": 1.8264, "step": 352000 }, { "epoch": 0.07110112822543098, "eval_calculated_loss": 9.375945091247559, "eval_loss": 2.1481759548187256, "eval_perplexity": 11801.065361456984, "eval_runtime": 123.5631, "eval_samples_per_second": 8.077, "eval_steps_per_second": 2.023, "step": 352000 }, { "epoch": 0.07112132740958593, "grad_norm": 7.02061653137207, "learning_rate": 1.857758095684786e-05, "loss": 1.7905, "step": 352100 }, { "epoch": 0.07114152659374087, "grad_norm": 4.717836856842041, "learning_rate": 1.857717697300156e-05, "loss": 1.7429, "step": 352200 }, { "epoch": 0.07116172577789583, "grad_norm": 9.60696792602539, "learning_rate": 1.8576772989155255e-05, "loss": 1.8613, "step": 352300 }, { "epoch": 0.07118192496205078, "grad_norm": 9.777852058410645, "learning_rate": 1.8576369005308958e-05, "loss": 1.8634, "step": 352400 }, { "epoch": 0.07120212414620573, "grad_norm": 8.372335433959961, "learning_rate": 1.8575965021462653e-05, "loss": 1.7722, "step": 352500 }, { "epoch": 0.07122232333036069, "grad_norm": 8.407658576965332, "learning_rate": 1.8575561037616353e-05, "loss": 1.9224, "step": 352600 }, { "epoch": 0.07124252251451564, "grad_norm": 9.033238410949707, "learning_rate": 1.8575157053770048e-05, "loss": 1.919, "step": 352700 }, { "epoch": 0.0712627216986706, "grad_norm": 12.692145347595215, "learning_rate": 1.8574753069923747e-05, "loss": 1.8693, "step": 352800 }, { "epoch": 0.07128292088282555, "grad_norm": 15.122175216674805, "learning_rate": 1.8574349086077447e-05, "loss": 1.8139, "step": 352900 }, { "epoch": 0.07130312006698049, "grad_norm": 8.933138847351074, "learning_rate": 1.8573945102231142e-05, "loss": 1.8582, "step": 353000 }, { "epoch": 0.07130312006698049, "eval_calculated_loss": 9.132935523986816, "eval_loss": 2.1573362350463867, "eval_perplexity": 9255.150848282, "eval_runtime": 124.3694, "eval_samples_per_second": 8.024, "eval_steps_per_second": 2.01, "step": 353000 }, { "epoch": 0.07132331925113544, "grad_norm": 13.468642234802246, "learning_rate": 1.857354111838484e-05, "loss": 1.8797, "step": 353100 }, { "epoch": 0.0713435184352904, "grad_norm": 8.805088996887207, "learning_rate": 1.857313713453854e-05, "loss": 1.8764, "step": 353200 }, { "epoch": 0.07136371761944535, "grad_norm": 3.6091396808624268, "learning_rate": 1.8572733150692236e-05, "loss": 1.8774, "step": 353300 }, { "epoch": 0.0713839168036003, "grad_norm": 3.860713005065918, "learning_rate": 1.857232916684594e-05, "loss": 1.8998, "step": 353400 }, { "epoch": 0.07140411598775526, "grad_norm": 3.621244430541992, "learning_rate": 1.8571925182999634e-05, "loss": 1.7969, "step": 353500 }, { "epoch": 0.07142431517191021, "grad_norm": 7.522816181182861, "learning_rate": 1.8571521199153334e-05, "loss": 1.7934, "step": 353600 }, { "epoch": 0.07144451435606516, "grad_norm": 7.693681716918945, "learning_rate": 1.857111721530703e-05, "loss": 1.7586, "step": 353700 }, { "epoch": 0.0714647135402201, "grad_norm": 7.374012470245361, "learning_rate": 1.857071323146073e-05, "loss": 1.8265, "step": 353800 }, { "epoch": 0.07148491272437506, "grad_norm": 10.543414115905762, "learning_rate": 1.8570309247614428e-05, "loss": 1.7379, "step": 353900 }, { "epoch": 0.07150511190853001, "grad_norm": 12.195913314819336, "learning_rate": 1.8569905263768123e-05, "loss": 1.8306, "step": 354000 }, { "epoch": 0.07150511190853001, "eval_calculated_loss": 9.319578170776367, "eval_loss": 2.147211790084839, "eval_perplexity": 11154.275661431198, "eval_runtime": 120.8531, "eval_samples_per_second": 8.258, "eval_steps_per_second": 2.069, "step": 354000 }, { "epoch": 0.07152531109268497, "grad_norm": 6.787940979003906, "learning_rate": 1.8569501279921822e-05, "loss": 1.8494, "step": 354100 }, { "epoch": 0.07154551027683992, "grad_norm": 12.485248565673828, "learning_rate": 1.856909729607552e-05, "loss": 1.8708, "step": 354200 }, { "epoch": 0.07156570946099487, "grad_norm": 7.578666687011719, "learning_rate": 1.8568693312229217e-05, "loss": 1.9573, "step": 354300 }, { "epoch": 0.07158590864514983, "grad_norm": 5.7110443115234375, "learning_rate": 1.8568289328382916e-05, "loss": 1.8282, "step": 354400 }, { "epoch": 0.07160610782930478, "grad_norm": 9.01564884185791, "learning_rate": 1.8567885344536616e-05, "loss": 1.808, "step": 354500 }, { "epoch": 0.07162630701345972, "grad_norm": 3.3478264808654785, "learning_rate": 1.8567481360690315e-05, "loss": 1.8623, "step": 354600 }, { "epoch": 0.07164650619761467, "grad_norm": 6.398139476776123, "learning_rate": 1.856707737684401e-05, "loss": 1.7699, "step": 354700 }, { "epoch": 0.07166670538176963, "grad_norm": 6.55301570892334, "learning_rate": 1.856667339299771e-05, "loss": 1.7952, "step": 354800 }, { "epoch": 0.07168690456592458, "grad_norm": 8.29326343536377, "learning_rate": 1.856626940915141e-05, "loss": 1.8605, "step": 354900 }, { "epoch": 0.07170710375007953, "grad_norm": 6.717408180236816, "learning_rate": 1.8565865425305104e-05, "loss": 1.865, "step": 355000 }, { "epoch": 0.07170710375007953, "eval_calculated_loss": 9.312995910644531, "eval_loss": 2.1504437923431396, "eval_perplexity": 11081.096424048155, "eval_runtime": 121.833, "eval_samples_per_second": 8.192, "eval_steps_per_second": 2.052, "step": 355000 }, { "epoch": 0.07172730293423449, "grad_norm": 5.7222700119018555, "learning_rate": 1.8565461441458804e-05, "loss": 1.8575, "step": 355100 }, { "epoch": 0.07174750211838944, "grad_norm": 6.8705267906188965, "learning_rate": 1.85650574576125e-05, "loss": 1.8343, "step": 355200 }, { "epoch": 0.0717677013025444, "grad_norm": 5.948390483856201, "learning_rate": 1.85646534737662e-05, "loss": 1.8594, "step": 355300 }, { "epoch": 0.07178790048669934, "grad_norm": 7.2619547843933105, "learning_rate": 1.8564249489919898e-05, "loss": 1.8632, "step": 355400 }, { "epoch": 0.07180809967085429, "grad_norm": 5.8478617668151855, "learning_rate": 1.8563845506073597e-05, "loss": 1.8808, "step": 355500 }, { "epoch": 0.07182829885500924, "grad_norm": 5.752320289611816, "learning_rate": 1.8563441522227296e-05, "loss": 1.7697, "step": 355600 }, { "epoch": 0.0718484980391642, "grad_norm": 4.47726583480835, "learning_rate": 1.856303753838099e-05, "loss": 1.8423, "step": 355700 }, { "epoch": 0.07186869722331915, "grad_norm": 5.227095603942871, "learning_rate": 1.856263355453469e-05, "loss": 1.8101, "step": 355800 }, { "epoch": 0.0718888964074741, "grad_norm": 4.255580902099609, "learning_rate": 1.8562229570688386e-05, "loss": 1.9325, "step": 355900 }, { "epoch": 0.07190909559162906, "grad_norm": 5.357397079467773, "learning_rate": 1.8561825586842086e-05, "loss": 1.8185, "step": 356000 }, { "epoch": 0.07190909559162906, "eval_calculated_loss": 8.945663452148438, "eval_loss": 2.151270866394043, "eval_perplexity": 7674.5385905142575, "eval_runtime": 122.4774, "eval_samples_per_second": 8.148, "eval_steps_per_second": 2.041, "step": 356000 }, { "epoch": 0.07192929477578401, "grad_norm": 9.850187301635742, "learning_rate": 1.8561421602995785e-05, "loss": 1.7817, "step": 356100 }, { "epoch": 0.07194949395993896, "grad_norm": 8.81491756439209, "learning_rate": 1.856101761914948e-05, "loss": 1.8312, "step": 356200 }, { "epoch": 0.0719696931440939, "grad_norm": 6.983784198760986, "learning_rate": 1.856061363530318e-05, "loss": 1.8708, "step": 356300 }, { "epoch": 0.07198989232824886, "grad_norm": 8.17747688293457, "learning_rate": 1.856020965145688e-05, "loss": 1.8316, "step": 356400 }, { "epoch": 0.07201009151240381, "grad_norm": 7.121781349182129, "learning_rate": 1.8559805667610574e-05, "loss": 1.8237, "step": 356500 }, { "epoch": 0.07203029069655877, "grad_norm": 5.342189788818359, "learning_rate": 1.8559401683764277e-05, "loss": 1.8596, "step": 356600 }, { "epoch": 0.07205048988071372, "grad_norm": 7.422841548919678, "learning_rate": 1.8558997699917973e-05, "loss": 1.8658, "step": 356700 }, { "epoch": 0.07207068906486867, "grad_norm": 5.6406025886535645, "learning_rate": 1.8558593716071672e-05, "loss": 1.7553, "step": 356800 }, { "epoch": 0.07209088824902363, "grad_norm": 11.113832473754883, "learning_rate": 1.8558189732225368e-05, "loss": 1.9201, "step": 356900 }, { "epoch": 0.07211108743317858, "grad_norm": 8.735001564025879, "learning_rate": 1.8557785748379067e-05, "loss": 1.8653, "step": 357000 }, { "epoch": 0.07211108743317858, "eval_calculated_loss": 9.05236530303955, "eval_loss": 2.1643261909484863, "eval_perplexity": 8538.710696168635, "eval_runtime": 120.2618, "eval_samples_per_second": 8.299, "eval_steps_per_second": 2.079, "step": 357000 }, { "epoch": 0.07213128661733352, "grad_norm": 6.423651695251465, "learning_rate": 1.8557381764532766e-05, "loss": 1.7681, "step": 357100 }, { "epoch": 0.07215148580148847, "grad_norm": 7.495262622833252, "learning_rate": 1.855697778068646e-05, "loss": 1.8476, "step": 357200 }, { "epoch": 0.07217168498564343, "grad_norm": 10.773580551147461, "learning_rate": 1.855657379684016e-05, "loss": 1.7759, "step": 357300 }, { "epoch": 0.07219188416979838, "grad_norm": 4.069754600524902, "learning_rate": 1.855616981299386e-05, "loss": 1.8926, "step": 357400 }, { "epoch": 0.07221208335395334, "grad_norm": 6.33791971206665, "learning_rate": 1.8555765829147556e-05, "loss": 1.8291, "step": 357500 }, { "epoch": 0.07223228253810829, "grad_norm": 6.954745292663574, "learning_rate": 1.8555361845301255e-05, "loss": 1.8971, "step": 357600 }, { "epoch": 0.07225248172226324, "grad_norm": 5.727682113647461, "learning_rate": 1.8554957861454954e-05, "loss": 1.8108, "step": 357700 }, { "epoch": 0.0722726809064182, "grad_norm": 6.227973461151123, "learning_rate": 1.8554553877608653e-05, "loss": 1.8308, "step": 357800 }, { "epoch": 0.07229288009057314, "grad_norm": 5.389007091522217, "learning_rate": 1.855414989376235e-05, "loss": 1.8228, "step": 357900 }, { "epoch": 0.07231307927472809, "grad_norm": 6.152331829071045, "learning_rate": 1.8553745909916048e-05, "loss": 1.8514, "step": 358000 }, { "epoch": 0.07231307927472809, "eval_calculated_loss": 9.28725528717041, "eval_loss": 2.1659555435180664, "eval_perplexity": 10799.501851316347, "eval_runtime": 124.2339, "eval_samples_per_second": 8.033, "eval_steps_per_second": 2.012, "step": 358000 }, { "epoch": 0.07233327845888304, "grad_norm": 6.078901290893555, "learning_rate": 1.8553341926069747e-05, "loss": 1.902, "step": 358100 }, { "epoch": 0.072353477643038, "grad_norm": 6.295683860778809, "learning_rate": 1.8552937942223443e-05, "loss": 1.9179, "step": 358200 }, { "epoch": 0.07237367682719295, "grad_norm": 3.339268445968628, "learning_rate": 1.8552533958377142e-05, "loss": 1.8356, "step": 358300 }, { "epoch": 0.0723938760113479, "grad_norm": 4.032576084136963, "learning_rate": 1.8552129974530838e-05, "loss": 1.833, "step": 358400 }, { "epoch": 0.07241407519550286, "grad_norm": 4.8992767333984375, "learning_rate": 1.8551725990684537e-05, "loss": 1.8772, "step": 358500 }, { "epoch": 0.07243427437965781, "grad_norm": 11.420063972473145, "learning_rate": 1.8551322006838236e-05, "loss": 1.8463, "step": 358600 }, { "epoch": 0.07245447356381275, "grad_norm": 7.287923812866211, "learning_rate": 1.8550918022991935e-05, "loss": 1.8239, "step": 358700 }, { "epoch": 0.0724746727479677, "grad_norm": 5.380286693572998, "learning_rate": 1.8550514039145634e-05, "loss": 1.8125, "step": 358800 }, { "epoch": 0.07249487193212266, "grad_norm": 6.247990608215332, "learning_rate": 1.855011005529933e-05, "loss": 1.895, "step": 358900 }, { "epoch": 0.07251507111627761, "grad_norm": 6.655818462371826, "learning_rate": 1.854970607145303e-05, "loss": 1.829, "step": 359000 }, { "epoch": 0.07251507111627761, "eval_calculated_loss": 9.226855278015137, "eval_loss": 2.1600730419158936, "eval_perplexity": 10166.520309278869, "eval_runtime": 122.5179, "eval_samples_per_second": 8.146, "eval_steps_per_second": 2.041, "step": 359000 }, { "epoch": 0.07253527030043257, "grad_norm": 6.102029800415039, "learning_rate": 1.8549302087606728e-05, "loss": 1.8176, "step": 359100 }, { "epoch": 0.07255546948458752, "grad_norm": 9.297236442565918, "learning_rate": 1.8548898103760424e-05, "loss": 1.7746, "step": 359200 }, { "epoch": 0.07257566866874247, "grad_norm": 8.229606628417969, "learning_rate": 1.8548494119914123e-05, "loss": 1.7683, "step": 359300 }, { "epoch": 0.07259586785289743, "grad_norm": 8.58607006072998, "learning_rate": 1.854809013606782e-05, "loss": 1.851, "step": 359400 }, { "epoch": 0.07261606703705237, "grad_norm": 8.948165893554688, "learning_rate": 1.8547686152221518e-05, "loss": 1.7992, "step": 359500 }, { "epoch": 0.07263626622120732, "grad_norm": 8.559492111206055, "learning_rate": 1.8547282168375217e-05, "loss": 1.8751, "step": 359600 }, { "epoch": 0.07265646540536227, "grad_norm": 5.95286226272583, "learning_rate": 1.8546878184528916e-05, "loss": 1.8436, "step": 359700 }, { "epoch": 0.07267666458951723, "grad_norm": 9.897032737731934, "learning_rate": 1.8546474200682615e-05, "loss": 1.8643, "step": 359800 }, { "epoch": 0.07269686377367218, "grad_norm": 8.847259521484375, "learning_rate": 1.854607021683631e-05, "loss": 1.9264, "step": 359900 }, { "epoch": 0.07271706295782714, "grad_norm": 8.311528205871582, "learning_rate": 1.854566623299001e-05, "loss": 1.7848, "step": 360000 }, { "epoch": 0.07271706295782714, "eval_calculated_loss": 9.30804443359375, "eval_loss": 2.170164108276367, "eval_perplexity": 11026.364243797141, "eval_runtime": 125.1556, "eval_samples_per_second": 7.974, "eval_steps_per_second": 1.998, "step": 360000 }, { "epoch": 0.07273726214198209, "grad_norm": 4.597968101501465, "learning_rate": 1.8545262249143706e-05, "loss": 1.8658, "step": 360100 }, { "epoch": 0.07275746132613704, "grad_norm": 8.466377258300781, "learning_rate": 1.8544858265297405e-05, "loss": 1.819, "step": 360200 }, { "epoch": 0.07277766051029198, "grad_norm": 4.743690490722656, "learning_rate": 1.8544454281451104e-05, "loss": 1.7587, "step": 360300 }, { "epoch": 0.07279785969444694, "grad_norm": 11.88215446472168, "learning_rate": 1.85440502976048e-05, "loss": 1.8915, "step": 360400 }, { "epoch": 0.07281805887860189, "grad_norm": 7.825565338134766, "learning_rate": 1.85436463137585e-05, "loss": 1.8436, "step": 360500 }, { "epoch": 0.07283825806275684, "grad_norm": 6.447007179260254, "learning_rate": 1.8543242329912198e-05, "loss": 1.8091, "step": 360600 }, { "epoch": 0.0728584572469118, "grad_norm": 6.96050500869751, "learning_rate": 1.8542838346065897e-05, "loss": 1.7706, "step": 360700 }, { "epoch": 0.07287865643106675, "grad_norm": 8.51883316040039, "learning_rate": 1.8542434362219593e-05, "loss": 1.7962, "step": 360800 }, { "epoch": 0.0728988556152217, "grad_norm": 7.541439056396484, "learning_rate": 1.8542030378373292e-05, "loss": 1.878, "step": 360900 }, { "epoch": 0.07291905479937666, "grad_norm": 8.272599220275879, "learning_rate": 1.854162639452699e-05, "loss": 1.9059, "step": 361000 }, { "epoch": 0.07291905479937666, "eval_calculated_loss": 9.071493148803711, "eval_loss": 2.1673178672790527, "eval_perplexity": 8703.609892920742, "eval_runtime": 121.6774, "eval_samples_per_second": 8.202, "eval_steps_per_second": 2.055, "step": 361000 }, { "epoch": 0.0729392539835316, "grad_norm": 9.12534236907959, "learning_rate": 1.8541222410680687e-05, "loss": 1.8258, "step": 361100 }, { "epoch": 0.07295945316768655, "grad_norm": 4.63268518447876, "learning_rate": 1.8540818426834386e-05, "loss": 1.8057, "step": 361200 }, { "epoch": 0.0729796523518415, "grad_norm": 7.0275983810424805, "learning_rate": 1.8540414442988085e-05, "loss": 1.824, "step": 361300 }, { "epoch": 0.07299985153599646, "grad_norm": 6.019340991973877, "learning_rate": 1.854001045914178e-05, "loss": 1.8155, "step": 361400 }, { "epoch": 0.07302005072015141, "grad_norm": 8.125801086425781, "learning_rate": 1.853960647529548e-05, "loss": 1.8312, "step": 361500 }, { "epoch": 0.07304024990430637, "grad_norm": 6.647726058959961, "learning_rate": 1.853920249144918e-05, "loss": 1.8539, "step": 361600 }, { "epoch": 0.07306044908846132, "grad_norm": 11.522717475891113, "learning_rate": 1.8538798507602878e-05, "loss": 1.7586, "step": 361700 }, { "epoch": 0.07308064827261627, "grad_norm": 8.886085510253906, "learning_rate": 1.8538394523756574e-05, "loss": 1.7386, "step": 361800 }, { "epoch": 0.07310084745677121, "grad_norm": 11.173982620239258, "learning_rate": 1.8537990539910273e-05, "loss": 1.9206, "step": 361900 }, { "epoch": 0.07312104664092617, "grad_norm": 4.281346797943115, "learning_rate": 1.8537586556063972e-05, "loss": 1.8119, "step": 362000 }, { "epoch": 0.07312104664092617, "eval_calculated_loss": 9.148346900939941, "eval_loss": 2.154778242111206, "eval_perplexity": 9398.890232724267, "eval_runtime": 124.211, "eval_samples_per_second": 8.035, "eval_steps_per_second": 2.013, "step": 362000 }, { "epoch": 0.07314124582508112, "grad_norm": 6.340549468994141, "learning_rate": 1.8537182572217668e-05, "loss": 1.886, "step": 362100 }, { "epoch": 0.07316144500923608, "grad_norm": 6.086480140686035, "learning_rate": 1.8536778588371367e-05, "loss": 1.7531, "step": 362200 }, { "epoch": 0.07318164419339103, "grad_norm": 4.26153039932251, "learning_rate": 1.8536374604525066e-05, "loss": 1.8452, "step": 362300 }, { "epoch": 0.07320184337754598, "grad_norm": 9.274605751037598, "learning_rate": 1.8535970620678762e-05, "loss": 1.7874, "step": 362400 }, { "epoch": 0.07322204256170094, "grad_norm": 8.291196823120117, "learning_rate": 1.853556663683246e-05, "loss": 1.8673, "step": 362500 }, { "epoch": 0.07324224174585589, "grad_norm": 4.41739559173584, "learning_rate": 1.8535162652986157e-05, "loss": 1.8719, "step": 362600 }, { "epoch": 0.07326244093001083, "grad_norm": 8.108845710754395, "learning_rate": 1.8534758669139856e-05, "loss": 1.7642, "step": 362700 }, { "epoch": 0.07328264011416578, "grad_norm": 7.3747639656066895, "learning_rate": 1.8534354685293555e-05, "loss": 1.7977, "step": 362800 }, { "epoch": 0.07330283929832074, "grad_norm": 7.02156925201416, "learning_rate": 1.8533950701447254e-05, "loss": 1.7127, "step": 362900 }, { "epoch": 0.07332303848247569, "grad_norm": 9.301231384277344, "learning_rate": 1.8533546717600953e-05, "loss": 1.8909, "step": 363000 }, { "epoch": 0.07332303848247569, "eval_calculated_loss": 9.153412818908691, "eval_loss": 2.15409779548645, "eval_perplexity": 9446.625047882531, "eval_runtime": 122.3597, "eval_samples_per_second": 8.156, "eval_steps_per_second": 2.043, "step": 363000 }, { "epoch": 0.07334323766663065, "grad_norm": 8.737717628479004, "learning_rate": 1.853314273375465e-05, "loss": 1.8145, "step": 363100 }, { "epoch": 0.0733634368507856, "grad_norm": 4.726940155029297, "learning_rate": 1.8532738749908348e-05, "loss": 1.8389, "step": 363200 }, { "epoch": 0.07338363603494055, "grad_norm": 8.353133201599121, "learning_rate": 1.8532334766062044e-05, "loss": 1.7743, "step": 363300 }, { "epoch": 0.0734038352190955, "grad_norm": 6.970937252044678, "learning_rate": 1.8531930782215743e-05, "loss": 1.808, "step": 363400 }, { "epoch": 0.07342403440325045, "grad_norm": 6.106738567352295, "learning_rate": 1.8531526798369442e-05, "loss": 1.7831, "step": 363500 }, { "epoch": 0.0734442335874054, "grad_norm": 5.140354156494141, "learning_rate": 1.8531122814523138e-05, "loss": 1.7289, "step": 363600 }, { "epoch": 0.07346443277156035, "grad_norm": 6.117851257324219, "learning_rate": 1.8530718830676837e-05, "loss": 1.872, "step": 363700 }, { "epoch": 0.07348463195571531, "grad_norm": 5.654204845428467, "learning_rate": 1.8530314846830536e-05, "loss": 1.8388, "step": 363800 }, { "epoch": 0.07350483113987026, "grad_norm": 5.0902605056762695, "learning_rate": 1.8529910862984235e-05, "loss": 1.8196, "step": 363900 }, { "epoch": 0.07352503032402521, "grad_norm": 7.795461654663086, "learning_rate": 1.8529506879137935e-05, "loss": 1.8498, "step": 364000 }, { "epoch": 0.07352503032402521, "eval_calculated_loss": 9.28377914428711, "eval_loss": 2.160064220428467, "eval_perplexity": 10762.026412539017, "eval_runtime": 122.4618, "eval_samples_per_second": 8.149, "eval_steps_per_second": 2.041, "step": 364000 }, { "epoch": 0.07354522950818017, "grad_norm": 8.091964721679688, "learning_rate": 1.852910289529163e-05, "loss": 1.8894, "step": 364100 }, { "epoch": 0.07356542869233512, "grad_norm": 9.231401443481445, "learning_rate": 1.852869891144533e-05, "loss": 1.8987, "step": 364200 }, { "epoch": 0.07358562787649006, "grad_norm": 8.536253929138184, "learning_rate": 1.8528294927599025e-05, "loss": 1.6805, "step": 364300 }, { "epoch": 0.07360582706064502, "grad_norm": 6.178476333618164, "learning_rate": 1.8527890943752724e-05, "loss": 1.8599, "step": 364400 }, { "epoch": 0.07362602624479997, "grad_norm": 10.127728462219238, "learning_rate": 1.8527486959906423e-05, "loss": 1.8423, "step": 364500 }, { "epoch": 0.07364622542895492, "grad_norm": 8.48193645477295, "learning_rate": 1.852708297606012e-05, "loss": 1.8165, "step": 364600 }, { "epoch": 0.07366642461310988, "grad_norm": 4.642056941986084, "learning_rate": 1.8526678992213818e-05, "loss": 1.7825, "step": 364700 }, { "epoch": 0.07368662379726483, "grad_norm": 4.88509464263916, "learning_rate": 1.8526275008367517e-05, "loss": 1.9283, "step": 364800 }, { "epoch": 0.07370682298141978, "grad_norm": 6.177083969116211, "learning_rate": 1.8525871024521217e-05, "loss": 1.8046, "step": 364900 }, { "epoch": 0.07372702216557474, "grad_norm": 4.645002841949463, "learning_rate": 1.8525467040674912e-05, "loss": 1.8576, "step": 365000 }, { "epoch": 0.07372702216557474, "eval_calculated_loss": 9.23946762084961, "eval_loss": 2.150618076324463, "eval_perplexity": 10295.555959297724, "eval_runtime": 122.2556, "eval_samples_per_second": 8.163, "eval_steps_per_second": 2.045, "step": 365000 }, { "epoch": 0.07374722134972969, "grad_norm": 8.311169624328613, "learning_rate": 1.852506305682861e-05, "loss": 1.839, "step": 365100 }, { "epoch": 0.07376742053388463, "grad_norm": 5.640664577484131, "learning_rate": 1.852465907298231e-05, "loss": 1.822, "step": 365200 }, { "epoch": 0.07378761971803958, "grad_norm": 5.188340663909912, "learning_rate": 1.8524255089136006e-05, "loss": 1.8175, "step": 365300 }, { "epoch": 0.07380781890219454, "grad_norm": 5.942427635192871, "learning_rate": 1.8523851105289705e-05, "loss": 1.8301, "step": 365400 }, { "epoch": 0.07382801808634949, "grad_norm": 11.974994659423828, "learning_rate": 1.8523447121443404e-05, "loss": 1.8461, "step": 365500 }, { "epoch": 0.07384821727050445, "grad_norm": 5.00408411026001, "learning_rate": 1.85230431375971e-05, "loss": 1.7591, "step": 365600 }, { "epoch": 0.0738684164546594, "grad_norm": 8.620485305786133, "learning_rate": 1.85226391537508e-05, "loss": 1.8605, "step": 365700 }, { "epoch": 0.07388861563881435, "grad_norm": 8.614998817443848, "learning_rate": 1.8522235169904495e-05, "loss": 1.8791, "step": 365800 }, { "epoch": 0.07390881482296931, "grad_norm": 4.672860145568848, "learning_rate": 1.8521831186058198e-05, "loss": 1.8429, "step": 365900 }, { "epoch": 0.07392901400712425, "grad_norm": 7.091451644897461, "learning_rate": 1.8521427202211893e-05, "loss": 1.8023, "step": 366000 }, { "epoch": 0.07392901400712425, "eval_calculated_loss": 9.22274398803711, "eval_loss": 2.162313938140869, "eval_perplexity": 10124.808599440608, "eval_runtime": 123.7553, "eval_samples_per_second": 8.064, "eval_steps_per_second": 2.02, "step": 366000 }, { "epoch": 0.0739492131912792, "grad_norm": 12.299976348876953, "learning_rate": 1.8521023218365592e-05, "loss": 1.8695, "step": 366100 }, { "epoch": 0.07396941237543415, "grad_norm": 5.176506519317627, "learning_rate": 1.852061923451929e-05, "loss": 1.7725, "step": 366200 }, { "epoch": 0.07398961155958911, "grad_norm": 5.545331001281738, "learning_rate": 1.8520215250672987e-05, "loss": 1.8489, "step": 366300 }, { "epoch": 0.07400981074374406, "grad_norm": 3.451465606689453, "learning_rate": 1.8519811266826686e-05, "loss": 1.8537, "step": 366400 }, { "epoch": 0.07403000992789902, "grad_norm": 9.01791763305664, "learning_rate": 1.8519407282980386e-05, "loss": 1.8181, "step": 366500 }, { "epoch": 0.07405020911205397, "grad_norm": 5.461636066436768, "learning_rate": 1.851900329913408e-05, "loss": 1.8584, "step": 366600 }, { "epoch": 0.07407040829620892, "grad_norm": 10.28196907043457, "learning_rate": 1.851859931528778e-05, "loss": 1.9768, "step": 366700 }, { "epoch": 0.07409060748036386, "grad_norm": 7.073184490203857, "learning_rate": 1.8518195331441476e-05, "loss": 1.8977, "step": 366800 }, { "epoch": 0.07411080666451882, "grad_norm": 8.197790145874023, "learning_rate": 1.851779134759518e-05, "loss": 1.7618, "step": 366900 }, { "epoch": 0.07413100584867377, "grad_norm": 5.181674480438232, "learning_rate": 1.8517387363748874e-05, "loss": 1.8723, "step": 367000 }, { "epoch": 0.07413100584867377, "eval_calculated_loss": 9.34266471862793, "eval_loss": 2.151585102081299, "eval_perplexity": 11414.784939590858, "eval_runtime": 123.8297, "eval_samples_per_second": 8.059, "eval_steps_per_second": 2.019, "step": 367000 }, { "epoch": 0.07415120503282872, "grad_norm": 5.05996036529541, "learning_rate": 1.8516983379902574e-05, "loss": 1.8469, "step": 367100 }, { "epoch": 0.07417140421698368, "grad_norm": 3.772759437561035, "learning_rate": 1.8516579396056273e-05, "loss": 1.7323, "step": 367200 }, { "epoch": 0.07419160340113863, "grad_norm": 7.059957981109619, "learning_rate": 1.851617541220997e-05, "loss": 1.8845, "step": 367300 }, { "epoch": 0.07421180258529358, "grad_norm": 10.97197437286377, "learning_rate": 1.8515771428363668e-05, "loss": 1.8527, "step": 367400 }, { "epoch": 0.07423200176944854, "grad_norm": 7.676540374755859, "learning_rate": 1.8515367444517363e-05, "loss": 1.8417, "step": 367500 }, { "epoch": 0.07425220095360348, "grad_norm": 5.926901817321777, "learning_rate": 1.8514963460671062e-05, "loss": 1.8045, "step": 367600 }, { "epoch": 0.07427240013775843, "grad_norm": 6.450165748596191, "learning_rate": 1.851455947682476e-05, "loss": 1.8279, "step": 367700 }, { "epoch": 0.07429259932191339, "grad_norm": 9.604448318481445, "learning_rate": 1.8514155492978457e-05, "loss": 1.844, "step": 367800 }, { "epoch": 0.07431279850606834, "grad_norm": 6.5752458572387695, "learning_rate": 1.851375150913216e-05, "loss": 1.7762, "step": 367900 }, { "epoch": 0.0743329976902233, "grad_norm": 6.412501335144043, "learning_rate": 1.8513347525285856e-05, "loss": 1.815, "step": 368000 }, { "epoch": 0.0743329976902233, "eval_calculated_loss": 9.516380310058594, "eval_loss": 2.154123544692993, "eval_perplexity": 13580.365428402036, "eval_runtime": 121.2134, "eval_samples_per_second": 8.233, "eval_steps_per_second": 2.062, "step": 368000 }, { "epoch": 0.07435319687437825, "grad_norm": 8.258987426757812, "learning_rate": 1.8512943541439555e-05, "loss": 1.9034, "step": 368100 }, { "epoch": 0.0743733960585332, "grad_norm": 9.565937042236328, "learning_rate": 1.851253955759325e-05, "loss": 1.8191, "step": 368200 }, { "epoch": 0.07439359524268815, "grad_norm": 9.501412391662598, "learning_rate": 1.851213557374695e-05, "loss": 1.841, "step": 368300 }, { "epoch": 0.0744137944268431, "grad_norm": 5.2012810707092285, "learning_rate": 1.851173158990065e-05, "loss": 1.8207, "step": 368400 }, { "epoch": 0.07443399361099805, "grad_norm": 8.137323379516602, "learning_rate": 1.8511327606054344e-05, "loss": 1.8614, "step": 368500 }, { "epoch": 0.074454192795153, "grad_norm": 9.89655876159668, "learning_rate": 1.8510923622208044e-05, "loss": 1.8525, "step": 368600 }, { "epoch": 0.07447439197930796, "grad_norm": 7.3488569259643555, "learning_rate": 1.8510519638361743e-05, "loss": 1.8423, "step": 368700 }, { "epoch": 0.07449459116346291, "grad_norm": 7.284970283508301, "learning_rate": 1.851011565451544e-05, "loss": 1.8537, "step": 368800 }, { "epoch": 0.07451479034761786, "grad_norm": 7.958863258361816, "learning_rate": 1.8509711670669138e-05, "loss": 1.8426, "step": 368900 }, { "epoch": 0.07453498953177282, "grad_norm": 7.271966457366943, "learning_rate": 1.8509307686822837e-05, "loss": 1.8445, "step": 369000 }, { "epoch": 0.07453498953177282, "eval_calculated_loss": 9.340601921081543, "eval_loss": 2.157438278198242, "eval_perplexity": 11391.262818252842, "eval_runtime": 123.1978, "eval_samples_per_second": 8.101, "eval_steps_per_second": 2.029, "step": 369000 }, { "epoch": 0.07455518871592777, "grad_norm": 6.001710414886475, "learning_rate": 1.8508903702976536e-05, "loss": 1.8037, "step": 369100 }, { "epoch": 0.07457538790008271, "grad_norm": 8.490416526794434, "learning_rate": 1.850849971913023e-05, "loss": 1.8272, "step": 369200 }, { "epoch": 0.07459558708423766, "grad_norm": 4.425375461578369, "learning_rate": 1.850809573528393e-05, "loss": 1.8235, "step": 369300 }, { "epoch": 0.07461578626839262, "grad_norm": 10.107687950134277, "learning_rate": 1.850769175143763e-05, "loss": 1.795, "step": 369400 }, { "epoch": 0.07463598545254757, "grad_norm": 7.4300971031188965, "learning_rate": 1.8507287767591326e-05, "loss": 1.9123, "step": 369500 }, { "epoch": 0.07465618463670252, "grad_norm": 8.412282943725586, "learning_rate": 1.8506883783745025e-05, "loss": 1.9213, "step": 369600 }, { "epoch": 0.07467638382085748, "grad_norm": 4.154993534088135, "learning_rate": 1.8506479799898724e-05, "loss": 1.7709, "step": 369700 }, { "epoch": 0.07469658300501243, "grad_norm": 8.020492553710938, "learning_rate": 1.850607581605242e-05, "loss": 1.8078, "step": 369800 }, { "epoch": 0.07471678218916739, "grad_norm": 8.160213470458984, "learning_rate": 1.850567183220612e-05, "loss": 1.9048, "step": 369900 }, { "epoch": 0.07473698137332233, "grad_norm": 7.619051933288574, "learning_rate": 1.8505267848359818e-05, "loss": 1.8885, "step": 370000 }, { "epoch": 0.07473698137332233, "eval_calculated_loss": 9.236297607421875, "eval_loss": 2.1656203269958496, "eval_perplexity": 10262.970583986893, "eval_runtime": 119.5458, "eval_samples_per_second": 8.348, "eval_steps_per_second": 2.091, "step": 370000 }, { "epoch": 0.07475718055747728, "grad_norm": 5.708563804626465, "learning_rate": 1.8504863864513517e-05, "loss": 1.8685, "step": 370100 }, { "epoch": 0.07477737974163223, "grad_norm": 6.880437850952148, "learning_rate": 1.8504459880667213e-05, "loss": 1.8213, "step": 370200 }, { "epoch": 0.07479757892578719, "grad_norm": 6.867125511169434, "learning_rate": 1.8504055896820912e-05, "loss": 1.8269, "step": 370300 }, { "epoch": 0.07481777810994214, "grad_norm": 11.903949737548828, "learning_rate": 1.850365191297461e-05, "loss": 1.8216, "step": 370400 }, { "epoch": 0.0748379772940971, "grad_norm": 7.057260990142822, "learning_rate": 1.8503247929128307e-05, "loss": 1.8632, "step": 370500 }, { "epoch": 0.07485817647825205, "grad_norm": 8.268577575683594, "learning_rate": 1.8502843945282006e-05, "loss": 1.9432, "step": 370600 }, { "epoch": 0.074878375662407, "grad_norm": 7.351163387298584, "learning_rate": 1.85024399614357e-05, "loss": 1.8272, "step": 370700 }, { "epoch": 0.07489857484656194, "grad_norm": 5.129861831665039, "learning_rate": 1.85020359775894e-05, "loss": 1.7764, "step": 370800 }, { "epoch": 0.0749187740307169, "grad_norm": 6.981326103210449, "learning_rate": 1.85016319937431e-05, "loss": 1.7455, "step": 370900 }, { "epoch": 0.07493897321487185, "grad_norm": 9.434586524963379, "learning_rate": 1.8501228009896796e-05, "loss": 1.8004, "step": 371000 }, { "epoch": 0.07493897321487185, "eval_calculated_loss": 9.27262020111084, "eval_loss": 2.1566898822784424, "eval_perplexity": 10642.601140505896, "eval_runtime": 122.4686, "eval_samples_per_second": 8.149, "eval_steps_per_second": 2.041, "step": 371000 }, { "epoch": 0.0749591723990268, "grad_norm": 7.979338645935059, "learning_rate": 1.8500824026050498e-05, "loss": 1.9072, "step": 371100 }, { "epoch": 0.07497937158318176, "grad_norm": 11.617944717407227, "learning_rate": 1.8500420042204194e-05, "loss": 1.8755, "step": 371200 }, { "epoch": 0.07499957076733671, "grad_norm": 6.432194232940674, "learning_rate": 1.8500016058357893e-05, "loss": 1.8373, "step": 371300 }, { "epoch": 0.07501976995149166, "grad_norm": 13.058634757995605, "learning_rate": 1.849961207451159e-05, "loss": 1.7905, "step": 371400 }, { "epoch": 0.07503996913564662, "grad_norm": 4.627424716949463, "learning_rate": 1.8499208090665288e-05, "loss": 1.7414, "step": 371500 }, { "epoch": 0.07506016831980156, "grad_norm": 3.894861936569214, "learning_rate": 1.8498804106818987e-05, "loss": 1.8433, "step": 371600 }, { "epoch": 0.07508036750395651, "grad_norm": 8.077101707458496, "learning_rate": 1.8498400122972683e-05, "loss": 1.8756, "step": 371700 }, { "epoch": 0.07510056668811146, "grad_norm": 5.090162754058838, "learning_rate": 1.8497996139126382e-05, "loss": 1.914, "step": 371800 }, { "epoch": 0.07512076587226642, "grad_norm": 10.7888765335083, "learning_rate": 1.849759215528008e-05, "loss": 1.8002, "step": 371900 }, { "epoch": 0.07514096505642137, "grad_norm": 6.903065204620361, "learning_rate": 1.8497188171433777e-05, "loss": 1.8379, "step": 372000 }, { "epoch": 0.07514096505642137, "eval_calculated_loss": 9.210734367370605, "eval_loss": 2.1556005477905273, "eval_perplexity": 10003.940730208025, "eval_runtime": 121.8754, "eval_samples_per_second": 8.189, "eval_steps_per_second": 2.051, "step": 372000 }, { "epoch": 0.07516116424057633, "grad_norm": 8.936508178710938, "learning_rate": 1.849678418758748e-05, "loss": 1.8832, "step": 372100 }, { "epoch": 0.07518136342473128, "grad_norm": 6.25187873840332, "learning_rate": 1.8496380203741175e-05, "loss": 1.7918, "step": 372200 }, { "epoch": 0.07520156260888623, "grad_norm": 5.258055210113525, "learning_rate": 1.8495976219894874e-05, "loss": 1.7674, "step": 372300 }, { "epoch": 0.07522176179304117, "grad_norm": 8.760602951049805, "learning_rate": 1.849557223604857e-05, "loss": 1.931, "step": 372400 }, { "epoch": 0.07524196097719613, "grad_norm": 7.458888053894043, "learning_rate": 1.849516825220227e-05, "loss": 1.8555, "step": 372500 }, { "epoch": 0.07526216016135108, "grad_norm": 6.076719284057617, "learning_rate": 1.8494764268355968e-05, "loss": 1.7745, "step": 372600 }, { "epoch": 0.07528235934550603, "grad_norm": 10.368979454040527, "learning_rate": 1.8494360284509664e-05, "loss": 1.8717, "step": 372700 }, { "epoch": 0.07530255852966099, "grad_norm": 9.608556747436523, "learning_rate": 1.8493956300663363e-05, "loss": 1.7534, "step": 372800 }, { "epoch": 0.07532275771381594, "grad_norm": 5.037866592407227, "learning_rate": 1.8493552316817062e-05, "loss": 1.8548, "step": 372900 }, { "epoch": 0.0753429568979709, "grad_norm": 9.11322021484375, "learning_rate": 1.8493148332970758e-05, "loss": 1.7876, "step": 373000 }, { "epoch": 0.0753429568979709, "eval_calculated_loss": 9.09738826751709, "eval_loss": 2.1575684547424316, "eval_perplexity": 8931.934390744458, "eval_runtime": 122.255, "eval_samples_per_second": 8.163, "eval_steps_per_second": 2.045, "step": 373000 }, { "epoch": 0.07536315608212585, "grad_norm": 7.235683441162109, "learning_rate": 1.8492744349124457e-05, "loss": 1.8959, "step": 373100 }, { "epoch": 0.07538335526628079, "grad_norm": 6.338716506958008, "learning_rate": 1.8492340365278156e-05, "loss": 1.8045, "step": 373200 }, { "epoch": 0.07540355445043574, "grad_norm": 9.020793914794922, "learning_rate": 1.8491936381431855e-05, "loss": 1.8591, "step": 373300 }, { "epoch": 0.0754237536345907, "grad_norm": 9.050629615783691, "learning_rate": 1.849153239758555e-05, "loss": 1.9597, "step": 373400 }, { "epoch": 0.07544395281874565, "grad_norm": 7.306633949279785, "learning_rate": 1.849112841373925e-05, "loss": 1.8134, "step": 373500 }, { "epoch": 0.0754641520029006, "grad_norm": 6.955211162567139, "learning_rate": 1.849072442989295e-05, "loss": 1.8902, "step": 373600 }, { "epoch": 0.07548435118705556, "grad_norm": 3.76177978515625, "learning_rate": 1.8490320446046645e-05, "loss": 1.8045, "step": 373700 }, { "epoch": 0.07550455037121051, "grad_norm": 8.428662300109863, "learning_rate": 1.8489916462200344e-05, "loss": 1.7702, "step": 373800 }, { "epoch": 0.07552474955536546, "grad_norm": 7.196755886077881, "learning_rate": 1.848951247835404e-05, "loss": 1.8929, "step": 373900 }, { "epoch": 0.0755449487395204, "grad_norm": 10.14197826385498, "learning_rate": 1.848910849450774e-05, "loss": 1.8262, "step": 374000 }, { "epoch": 0.0755449487395204, "eval_calculated_loss": 9.085657119750977, "eval_loss": 2.14973783493042, "eval_perplexity": 8827.764757895093, "eval_runtime": 123.2606, "eval_samples_per_second": 8.097, "eval_steps_per_second": 2.028, "step": 374000 }, { "epoch": 0.07556514792367536, "grad_norm": 7.030701637268066, "learning_rate": 1.8488704510661438e-05, "loss": 1.719, "step": 374100 }, { "epoch": 0.07558534710783031, "grad_norm": 6.204556941986084, "learning_rate": 1.8488300526815137e-05, "loss": 1.7825, "step": 374200 }, { "epoch": 0.07560554629198526, "grad_norm": 6.756950855255127, "learning_rate": 1.8487896542968836e-05, "loss": 1.9489, "step": 374300 }, { "epoch": 0.07562574547614022, "grad_norm": 7.227318286895752, "learning_rate": 1.8487492559122532e-05, "loss": 1.7833, "step": 374400 }, { "epoch": 0.07564594466029517, "grad_norm": 5.632149696350098, "learning_rate": 1.848708857527623e-05, "loss": 1.8626, "step": 374500 }, { "epoch": 0.07566614384445013, "grad_norm": 6.384549140930176, "learning_rate": 1.848668459142993e-05, "loss": 1.8365, "step": 374600 }, { "epoch": 0.07568634302860508, "grad_norm": 5.295100688934326, "learning_rate": 1.8486280607583626e-05, "loss": 1.8464, "step": 374700 }, { "epoch": 0.07570654221276003, "grad_norm": 5.7808308601379395, "learning_rate": 1.8485876623737325e-05, "loss": 1.8008, "step": 374800 }, { "epoch": 0.07572674139691497, "grad_norm": 7.861916542053223, "learning_rate": 1.848547263989102e-05, "loss": 1.7888, "step": 374900 }, { "epoch": 0.07574694058106993, "grad_norm": 7.813963413238525, "learning_rate": 1.848506865604472e-05, "loss": 1.7981, "step": 375000 }, { "epoch": 0.07574694058106993, "eval_calculated_loss": 9.271315574645996, "eval_loss": 2.1490211486816406, "eval_perplexity": 10628.725574586548, "eval_runtime": 121.5625, "eval_samples_per_second": 8.21, "eval_steps_per_second": 2.057, "step": 375000 }, { "epoch": 0.07576713976522488, "grad_norm": 5.321366786956787, "learning_rate": 1.848466467219842e-05, "loss": 1.8662, "step": 375100 }, { "epoch": 0.07578733894937983, "grad_norm": 8.393329620361328, "learning_rate": 1.8484260688352118e-05, "loss": 1.8539, "step": 375200 }, { "epoch": 0.07580753813353479, "grad_norm": 6.5441718101501465, "learning_rate": 1.8483856704505817e-05, "loss": 1.7218, "step": 375300 }, { "epoch": 0.07582773731768974, "grad_norm": 7.882169246673584, "learning_rate": 1.8483452720659513e-05, "loss": 1.8003, "step": 375400 }, { "epoch": 0.0758479365018447, "grad_norm": 10.45507526397705, "learning_rate": 1.8483048736813212e-05, "loss": 1.8217, "step": 375500 }, { "epoch": 0.07586813568599965, "grad_norm": 4.565847396850586, "learning_rate": 1.8482644752966908e-05, "loss": 1.8021, "step": 375600 }, { "epoch": 0.07588833487015459, "grad_norm": 6.892687797546387, "learning_rate": 1.8482240769120607e-05, "loss": 1.7758, "step": 375700 }, { "epoch": 0.07590853405430954, "grad_norm": 9.217846870422363, "learning_rate": 1.8481836785274306e-05, "loss": 1.817, "step": 375800 }, { "epoch": 0.0759287332384645, "grad_norm": 9.294034957885742, "learning_rate": 1.8481432801428002e-05, "loss": 1.795, "step": 375900 }, { "epoch": 0.07594893242261945, "grad_norm": 15.46816635131836, "learning_rate": 1.84810288175817e-05, "loss": 1.8088, "step": 376000 }, { "epoch": 0.07594893242261945, "eval_calculated_loss": 9.182243347167969, "eval_loss": 2.1558620929718018, "eval_perplexity": 9722.940255832418, "eval_runtime": 121.9963, "eval_samples_per_second": 8.181, "eval_steps_per_second": 2.049, "step": 376000 }, { "epoch": 0.0759691316067744, "grad_norm": 7.259715557098389, "learning_rate": 1.84806248337354e-05, "loss": 1.8712, "step": 376100 }, { "epoch": 0.07598933079092936, "grad_norm": 5.616568565368652, "learning_rate": 1.84802208498891e-05, "loss": 1.7765, "step": 376200 }, { "epoch": 0.07600952997508431, "grad_norm": 5.095566272735596, "learning_rate": 1.8479816866042795e-05, "loss": 1.8875, "step": 376300 }, { "epoch": 0.07602972915923926, "grad_norm": 5.459178447723389, "learning_rate": 1.8479412882196494e-05, "loss": 1.7392, "step": 376400 }, { "epoch": 0.0760499283433942, "grad_norm": 10.058938026428223, "learning_rate": 1.8479008898350193e-05, "loss": 1.7793, "step": 376500 }, { "epoch": 0.07607012752754916, "grad_norm": 6.212890148162842, "learning_rate": 1.847860491450389e-05, "loss": 1.8351, "step": 376600 }, { "epoch": 0.07609032671170411, "grad_norm": 5.951969146728516, "learning_rate": 1.8478200930657588e-05, "loss": 1.8975, "step": 376700 }, { "epoch": 0.07611052589585907, "grad_norm": 5.79859733581543, "learning_rate": 1.8477796946811287e-05, "loss": 1.8428, "step": 376800 }, { "epoch": 0.07613072508001402, "grad_norm": 11.652242660522461, "learning_rate": 1.8477392962964983e-05, "loss": 1.8783, "step": 376900 }, { "epoch": 0.07615092426416897, "grad_norm": 6.925556182861328, "learning_rate": 1.8476988979118682e-05, "loss": 1.7599, "step": 377000 }, { "epoch": 0.07615092426416897, "eval_calculated_loss": 9.125747680664062, "eval_loss": 2.162090301513672, "eval_perplexity": 9188.864786359265, "eval_runtime": 122.1157, "eval_samples_per_second": 8.173, "eval_steps_per_second": 2.047, "step": 377000 }, { "epoch": 0.07617112344832393, "grad_norm": 5.916322708129883, "learning_rate": 1.847658499527238e-05, "loss": 1.9067, "step": 377100 }, { "epoch": 0.07619132263247888, "grad_norm": 9.48071002960205, "learning_rate": 1.8476181011426077e-05, "loss": 1.8138, "step": 377200 }, { "epoch": 0.07621152181663382, "grad_norm": 7.016234397888184, "learning_rate": 1.8475777027579776e-05, "loss": 1.8044, "step": 377300 }, { "epoch": 0.07623172100078877, "grad_norm": 8.163068771362305, "learning_rate": 1.8475373043733475e-05, "loss": 1.7889, "step": 377400 }, { "epoch": 0.07625192018494373, "grad_norm": 9.383662223815918, "learning_rate": 1.8474969059887175e-05, "loss": 1.8845, "step": 377500 }, { "epoch": 0.07627211936909868, "grad_norm": 9.747446060180664, "learning_rate": 1.847456507604087e-05, "loss": 1.822, "step": 377600 }, { "epoch": 0.07629231855325364, "grad_norm": 7.445842742919922, "learning_rate": 1.847416109219457e-05, "loss": 1.7035, "step": 377700 }, { "epoch": 0.07631251773740859, "grad_norm": 4.336341857910156, "learning_rate": 1.847375710834827e-05, "loss": 1.7695, "step": 377800 }, { "epoch": 0.07633271692156354, "grad_norm": 5.502963542938232, "learning_rate": 1.8473353124501964e-05, "loss": 1.8525, "step": 377900 }, { "epoch": 0.0763529161057185, "grad_norm": 10.049403190612793, "learning_rate": 1.8472949140655663e-05, "loss": 1.8856, "step": 378000 }, { "epoch": 0.0763529161057185, "eval_calculated_loss": 9.315505027770996, "eval_loss": 2.1621525287628174, "eval_perplexity": 11108.935103514199, "eval_runtime": 123.2358, "eval_samples_per_second": 8.098, "eval_steps_per_second": 2.029, "step": 378000 }, { "epoch": 0.07637311528987344, "grad_norm": 8.512450218200684, "learning_rate": 1.847254515680936e-05, "loss": 1.8598, "step": 378100 }, { "epoch": 0.07639331447402839, "grad_norm": 5.703627109527588, "learning_rate": 1.8472141172963058e-05, "loss": 1.8197, "step": 378200 }, { "epoch": 0.07641351365818334, "grad_norm": 5.355831623077393, "learning_rate": 1.8471737189116757e-05, "loss": 1.8141, "step": 378300 }, { "epoch": 0.0764337128423383, "grad_norm": 5.707732200622559, "learning_rate": 1.8471333205270456e-05, "loss": 1.7639, "step": 378400 }, { "epoch": 0.07645391202649325, "grad_norm": 8.13870620727539, "learning_rate": 1.8470929221424156e-05, "loss": 1.8814, "step": 378500 }, { "epoch": 0.0764741112106482, "grad_norm": 6.614063739776611, "learning_rate": 1.847052523757785e-05, "loss": 1.8553, "step": 378600 }, { "epoch": 0.07649431039480316, "grad_norm": 5.831099510192871, "learning_rate": 1.847012125373155e-05, "loss": 1.8332, "step": 378700 }, { "epoch": 0.07651450957895811, "grad_norm": 6.586577415466309, "learning_rate": 1.8469717269885246e-05, "loss": 1.8473, "step": 378800 }, { "epoch": 0.07653470876311305, "grad_norm": 5.80102014541626, "learning_rate": 1.8469313286038945e-05, "loss": 1.7799, "step": 378900 }, { "epoch": 0.076554907947268, "grad_norm": 9.125856399536133, "learning_rate": 1.8468909302192644e-05, "loss": 1.8579, "step": 379000 }, { "epoch": 0.076554907947268, "eval_calculated_loss": 8.885652542114258, "eval_loss": 2.1620078086853027, "eval_perplexity": 7227.52940314701, "eval_runtime": 121.2075, "eval_samples_per_second": 8.234, "eval_steps_per_second": 2.063, "step": 379000 }, { "epoch": 0.07657510713142296, "grad_norm": 7.931296348571777, "learning_rate": 1.846850531834634e-05, "loss": 1.7619, "step": 379100 }, { "epoch": 0.07659530631557791, "grad_norm": 8.145037651062012, "learning_rate": 1.846810133450004e-05, "loss": 1.7809, "step": 379200 }, { "epoch": 0.07661550549973287, "grad_norm": 8.188270568847656, "learning_rate": 1.846769735065374e-05, "loss": 1.8633, "step": 379300 }, { "epoch": 0.07663570468388782, "grad_norm": 6.611621379852295, "learning_rate": 1.8467293366807438e-05, "loss": 1.7585, "step": 379400 }, { "epoch": 0.07665590386804277, "grad_norm": 6.918247222900391, "learning_rate": 1.8466889382961137e-05, "loss": 1.8268, "step": 379500 }, { "epoch": 0.07667610305219773, "grad_norm": 6.576334476470947, "learning_rate": 1.8466485399114832e-05, "loss": 1.8136, "step": 379600 }, { "epoch": 0.07669630223635267, "grad_norm": 9.831661224365234, "learning_rate": 1.846608141526853e-05, "loss": 1.8559, "step": 379700 }, { "epoch": 0.07671650142050762, "grad_norm": 11.31689453125, "learning_rate": 1.8465677431422227e-05, "loss": 1.8268, "step": 379800 }, { "epoch": 0.07673670060466257, "grad_norm": 8.519753456115723, "learning_rate": 1.8465273447575926e-05, "loss": 1.8444, "step": 379900 }, { "epoch": 0.07675689978881753, "grad_norm": 8.489996910095215, "learning_rate": 1.8464869463729626e-05, "loss": 1.9262, "step": 380000 }, { "epoch": 0.07675689978881753, "eval_calculated_loss": 8.931181907653809, "eval_loss": 2.1594791412353516, "eval_perplexity": 7564.200281301483, "eval_runtime": 125.7466, "eval_samples_per_second": 7.937, "eval_steps_per_second": 1.988, "step": 380000 }, { "epoch": 0.07677709897297248, "grad_norm": 11.60036849975586, "learning_rate": 1.846446547988332e-05, "loss": 1.7724, "step": 380100 }, { "epoch": 0.07679729815712744, "grad_norm": 5.7083330154418945, "learning_rate": 1.846406149603702e-05, "loss": 1.872, "step": 380200 }, { "epoch": 0.07681749734128239, "grad_norm": 8.215682983398438, "learning_rate": 1.846365751219072e-05, "loss": 1.836, "step": 380300 }, { "epoch": 0.07683769652543734, "grad_norm": 4.232137680053711, "learning_rate": 1.846325352834442e-05, "loss": 1.7559, "step": 380400 }, { "epoch": 0.07685789570959228, "grad_norm": 7.857476711273193, "learning_rate": 1.8462849544498114e-05, "loss": 1.8819, "step": 380500 }, { "epoch": 0.07687809489374724, "grad_norm": 7.039008617401123, "learning_rate": 1.8462445560651814e-05, "loss": 1.9232, "step": 380600 }, { "epoch": 0.07689829407790219, "grad_norm": 7.104013919830322, "learning_rate": 1.8462041576805513e-05, "loss": 1.793, "step": 380700 }, { "epoch": 0.07691849326205714, "grad_norm": 7.339169979095459, "learning_rate": 1.846163759295921e-05, "loss": 1.8221, "step": 380800 }, { "epoch": 0.0769386924462121, "grad_norm": 7.608441352844238, "learning_rate": 1.8461233609112908e-05, "loss": 1.8671, "step": 380900 }, { "epoch": 0.07695889163036705, "grad_norm": 6.654970645904541, "learning_rate": 1.8460829625266607e-05, "loss": 1.8821, "step": 381000 }, { "epoch": 0.07695889163036705, "eval_calculated_loss": 9.263461112976074, "eval_loss": 2.154331684112549, "eval_perplexity": 10545.56965695275, "eval_runtime": 121.8595, "eval_samples_per_second": 8.19, "eval_steps_per_second": 2.052, "step": 381000 }, { "epoch": 0.076979090814522, "grad_norm": 3.635413885116577, "learning_rate": 1.8460425641420302e-05, "loss": 1.8607, "step": 381100 }, { "epoch": 0.07699928999867696, "grad_norm": 8.103691101074219, "learning_rate": 1.8460021657574e-05, "loss": 1.9127, "step": 381200 }, { "epoch": 0.0770194891828319, "grad_norm": 7.582113265991211, "learning_rate": 1.8459617673727697e-05, "loss": 1.8088, "step": 381300 }, { "epoch": 0.07703968836698685, "grad_norm": 7.7193498611450195, "learning_rate": 1.84592136898814e-05, "loss": 1.7868, "step": 381400 }, { "epoch": 0.0770598875511418, "grad_norm": 8.64983081817627, "learning_rate": 1.8458809706035096e-05, "loss": 1.7939, "step": 381500 }, { "epoch": 0.07708008673529676, "grad_norm": 7.204223155975342, "learning_rate": 1.8458405722188795e-05, "loss": 1.7851, "step": 381600 }, { "epoch": 0.07710028591945171, "grad_norm": 6.001214981079102, "learning_rate": 1.8458001738342494e-05, "loss": 1.8197, "step": 381700 }, { "epoch": 0.07712048510360667, "grad_norm": 8.000415802001953, "learning_rate": 1.845759775449619e-05, "loss": 1.7746, "step": 381800 }, { "epoch": 0.07714068428776162, "grad_norm": 10.069757461547852, "learning_rate": 1.845719377064989e-05, "loss": 1.8677, "step": 381900 }, { "epoch": 0.07716088347191657, "grad_norm": 8.781767845153809, "learning_rate": 1.8456789786803584e-05, "loss": 1.8762, "step": 382000 }, { "epoch": 0.07716088347191657, "eval_calculated_loss": 9.052706718444824, "eval_loss": 2.1595959663391113, "eval_perplexity": 8541.626441253302, "eval_runtime": 122.7572, "eval_samples_per_second": 8.13, "eval_steps_per_second": 2.037, "step": 382000 }, { "epoch": 0.07718108265607151, "grad_norm": 6.081557273864746, "learning_rate": 1.8456385802957284e-05, "loss": 1.9017, "step": 382100 }, { "epoch": 0.07720128184022647, "grad_norm": 10.106014251708984, "learning_rate": 1.8455981819110983e-05, "loss": 1.8119, "step": 382200 }, { "epoch": 0.07722148102438142, "grad_norm": 6.639877796173096, "learning_rate": 1.845557783526468e-05, "loss": 1.8139, "step": 382300 }, { "epoch": 0.07724168020853638, "grad_norm": 9.455606460571289, "learning_rate": 1.845517385141838e-05, "loss": 1.831, "step": 382400 }, { "epoch": 0.07726187939269133, "grad_norm": 11.300556182861328, "learning_rate": 1.8454769867572077e-05, "loss": 1.8734, "step": 382500 }, { "epoch": 0.07728207857684628, "grad_norm": 10.113683700561523, "learning_rate": 1.8454365883725776e-05, "loss": 1.9146, "step": 382600 }, { "epoch": 0.07730227776100124, "grad_norm": 12.346861839294434, "learning_rate": 1.8453961899879475e-05, "loss": 1.8181, "step": 382700 }, { "epoch": 0.07732247694515619, "grad_norm": 10.081669807434082, "learning_rate": 1.845355791603317e-05, "loss": 1.8911, "step": 382800 }, { "epoch": 0.07734267612931113, "grad_norm": 6.359347343444824, "learning_rate": 1.845315393218687e-05, "loss": 1.8435, "step": 382900 }, { "epoch": 0.07736287531346608, "grad_norm": 5.223204612731934, "learning_rate": 1.8452749948340566e-05, "loss": 1.7772, "step": 383000 }, { "epoch": 0.07736287531346608, "eval_calculated_loss": 9.050971031188965, "eval_loss": 2.1459310054779053, "eval_perplexity": 8526.813707949867, "eval_runtime": 122.0069, "eval_samples_per_second": 8.18, "eval_steps_per_second": 2.049, "step": 383000 }, { "epoch": 0.07738307449762104, "grad_norm": 7.441074848175049, "learning_rate": 1.8452345964494265e-05, "loss": 1.7236, "step": 383100 }, { "epoch": 0.07740327368177599, "grad_norm": 7.553987979888916, "learning_rate": 1.8451941980647964e-05, "loss": 1.8435, "step": 383200 }, { "epoch": 0.07742347286593095, "grad_norm": 6.859163761138916, "learning_rate": 1.845153799680166e-05, "loss": 1.7918, "step": 383300 }, { "epoch": 0.0774436720500859, "grad_norm": 8.138772010803223, "learning_rate": 1.845113401295536e-05, "loss": 1.8293, "step": 383400 }, { "epoch": 0.07746387123424085, "grad_norm": 13.137247085571289, "learning_rate": 1.8450730029109058e-05, "loss": 1.8513, "step": 383500 }, { "epoch": 0.0774840704183958, "grad_norm": 5.440796375274658, "learning_rate": 1.8450326045262757e-05, "loss": 1.9069, "step": 383600 }, { "epoch": 0.07750426960255075, "grad_norm": 6.769591808319092, "learning_rate": 1.8449922061416453e-05, "loss": 1.8102, "step": 383700 }, { "epoch": 0.0775244687867057, "grad_norm": 6.317040920257568, "learning_rate": 1.8449518077570152e-05, "loss": 1.7907, "step": 383800 }, { "epoch": 0.07754466797086065, "grad_norm": 8.13633918762207, "learning_rate": 1.844911409372385e-05, "loss": 1.8014, "step": 383900 }, { "epoch": 0.07756486715501561, "grad_norm": 4.606566905975342, "learning_rate": 1.8448710109877547e-05, "loss": 1.8068, "step": 384000 }, { "epoch": 0.07756486715501561, "eval_calculated_loss": 9.130843162536621, "eval_loss": 2.1595544815063477, "eval_perplexity": 9235.805972724867, "eval_runtime": 121.3009, "eval_samples_per_second": 8.227, "eval_steps_per_second": 2.061, "step": 384000 }, { "epoch": 0.07758506633917056, "grad_norm": 7.706423759460449, "learning_rate": 1.8448306126031246e-05, "loss": 1.7952, "step": 384100 }, { "epoch": 0.07760526552332551, "grad_norm": 7.466434955596924, "learning_rate": 1.8447902142184945e-05, "loss": 1.8034, "step": 384200 }, { "epoch": 0.07762546470748047, "grad_norm": 9.844286918640137, "learning_rate": 1.844749815833864e-05, "loss": 1.766, "step": 384300 }, { "epoch": 0.07764566389163542, "grad_norm": 7.588119029998779, "learning_rate": 1.844709417449234e-05, "loss": 1.8336, "step": 384400 }, { "epoch": 0.07766586307579038, "grad_norm": 7.823384761810303, "learning_rate": 1.844669019064604e-05, "loss": 1.8135, "step": 384500 }, { "epoch": 0.07768606225994532, "grad_norm": 8.397918701171875, "learning_rate": 1.8446286206799738e-05, "loss": 1.8257, "step": 384600 }, { "epoch": 0.07770626144410027, "grad_norm": 5.723045349121094, "learning_rate": 1.8445882222953434e-05, "loss": 1.9161, "step": 384700 }, { "epoch": 0.07772646062825522, "grad_norm": 8.476702690124512, "learning_rate": 1.8445478239107133e-05, "loss": 1.9, "step": 384800 }, { "epoch": 0.07774665981241018, "grad_norm": 7.045224666595459, "learning_rate": 1.8445074255260832e-05, "loss": 1.6657, "step": 384900 }, { "epoch": 0.07776685899656513, "grad_norm": 8.839911460876465, "learning_rate": 1.8444670271414528e-05, "loss": 1.7848, "step": 385000 }, { "epoch": 0.07776685899656513, "eval_calculated_loss": 9.175065994262695, "eval_loss": 2.1529393196105957, "eval_perplexity": 9653.405119948975, "eval_runtime": 121.8628, "eval_samples_per_second": 8.19, "eval_steps_per_second": 2.051, "step": 385000 }, { "epoch": 0.07778705818072008, "grad_norm": 7.8235697746276855, "learning_rate": 1.8444266287568227e-05, "loss": 1.7862, "step": 385100 }, { "epoch": 0.07780725736487504, "grad_norm": 6.045807361602783, "learning_rate": 1.8443862303721926e-05, "loss": 1.8, "step": 385200 }, { "epoch": 0.07782745654902999, "grad_norm": 7.956810474395752, "learning_rate": 1.8443458319875622e-05, "loss": 1.8529, "step": 385300 }, { "epoch": 0.07784765573318493, "grad_norm": 6.7353901863098145, "learning_rate": 1.844305433602932e-05, "loss": 1.8689, "step": 385400 }, { "epoch": 0.07786785491733988, "grad_norm": 6.7284770011901855, "learning_rate": 1.8442650352183017e-05, "loss": 1.8792, "step": 385500 }, { "epoch": 0.07788805410149484, "grad_norm": 7.854054927825928, "learning_rate": 1.844224636833672e-05, "loss": 1.8259, "step": 385600 }, { "epoch": 0.07790825328564979, "grad_norm": 6.593098163604736, "learning_rate": 1.8441842384490415e-05, "loss": 1.791, "step": 385700 }, { "epoch": 0.07792845246980475, "grad_norm": 7.8149614334106445, "learning_rate": 1.8441438400644114e-05, "loss": 1.7335, "step": 385800 }, { "epoch": 0.0779486516539597, "grad_norm": 7.260552883148193, "learning_rate": 1.8441034416797813e-05, "loss": 1.7409, "step": 385900 }, { "epoch": 0.07796885083811465, "grad_norm": 5.175228118896484, "learning_rate": 1.844063043295151e-05, "loss": 1.7485, "step": 386000 }, { "epoch": 0.07796885083811465, "eval_calculated_loss": 8.996095657348633, "eval_loss": 2.1529009342193604, "eval_perplexity": 8071.508392354954, "eval_runtime": 123.2347, "eval_samples_per_second": 8.098, "eval_steps_per_second": 2.029, "step": 386000 }, { "epoch": 0.07798905002226961, "grad_norm": 6.925776958465576, "learning_rate": 1.8440226449105208e-05, "loss": 1.8218, "step": 386100 }, { "epoch": 0.07800924920642455, "grad_norm": 9.605204582214355, "learning_rate": 1.8439822465258904e-05, "loss": 1.7608, "step": 386200 }, { "epoch": 0.0780294483905795, "grad_norm": 4.622828960418701, "learning_rate": 1.8439418481412603e-05, "loss": 1.8721, "step": 386300 }, { "epoch": 0.07804964757473445, "grad_norm": 4.487117290496826, "learning_rate": 1.8439014497566302e-05, "loss": 1.8411, "step": 386400 }, { "epoch": 0.07806984675888941, "grad_norm": 4.337416172027588, "learning_rate": 1.8438610513719998e-05, "loss": 1.7858, "step": 386500 }, { "epoch": 0.07809004594304436, "grad_norm": 8.1255521774292, "learning_rate": 1.84382065298737e-05, "loss": 1.7878, "step": 386600 }, { "epoch": 0.07811024512719932, "grad_norm": 6.424166679382324, "learning_rate": 1.8437802546027396e-05, "loss": 1.7986, "step": 386700 }, { "epoch": 0.07813044431135427, "grad_norm": 8.665786743164062, "learning_rate": 1.8437398562181095e-05, "loss": 1.875, "step": 386800 }, { "epoch": 0.07815064349550922, "grad_norm": 4.4299211502075195, "learning_rate": 1.843699457833479e-05, "loss": 1.7534, "step": 386900 }, { "epoch": 0.07817084267966416, "grad_norm": 7.541343688964844, "learning_rate": 1.843659059448849e-05, "loss": 1.754, "step": 387000 }, { "epoch": 0.07817084267966416, "eval_calculated_loss": 9.158580780029297, "eval_loss": 2.14286732673645, "eval_perplexity": 9495.571205834516, "eval_runtime": 120.4675, "eval_samples_per_second": 8.284, "eval_steps_per_second": 2.075, "step": 387000 }, { "epoch": 0.07819104186381912, "grad_norm": 3.953704357147217, "learning_rate": 1.843618661064219e-05, "loss": 1.8275, "step": 387100 }, { "epoch": 0.07821124104797407, "grad_norm": 7.868585586547852, "learning_rate": 1.8435782626795885e-05, "loss": 1.9674, "step": 387200 }, { "epoch": 0.07823144023212902, "grad_norm": 8.10775089263916, "learning_rate": 1.8435378642949584e-05, "loss": 1.8718, "step": 387300 }, { "epoch": 0.07825163941628398, "grad_norm": 7.729784965515137, "learning_rate": 1.8434974659103283e-05, "loss": 1.8349, "step": 387400 }, { "epoch": 0.07827183860043893, "grad_norm": 8.221837043762207, "learning_rate": 1.843457067525698e-05, "loss": 1.8322, "step": 387500 }, { "epoch": 0.07829203778459388, "grad_norm": 8.563023567199707, "learning_rate": 1.843416669141068e-05, "loss": 1.8397, "step": 387600 }, { "epoch": 0.07831223696874884, "grad_norm": 3.5947091579437256, "learning_rate": 1.8433762707564377e-05, "loss": 1.7358, "step": 387700 }, { "epoch": 0.07833243615290378, "grad_norm": 8.633442878723145, "learning_rate": 1.8433358723718076e-05, "loss": 1.8969, "step": 387800 }, { "epoch": 0.07835263533705873, "grad_norm": 7.578847885131836, "learning_rate": 1.8432954739871772e-05, "loss": 1.7822, "step": 387900 }, { "epoch": 0.07837283452121369, "grad_norm": 9.397909164428711, "learning_rate": 1.843255075602547e-05, "loss": 1.8634, "step": 388000 }, { "epoch": 0.07837283452121369, "eval_calculated_loss": 8.991012573242188, "eval_loss": 2.1573171615600586, "eval_perplexity": 8030.58433465976, "eval_runtime": 121.1648, "eval_samples_per_second": 8.237, "eval_steps_per_second": 2.063, "step": 388000 }, { "epoch": 0.07839303370536864, "grad_norm": 7.604772090911865, "learning_rate": 1.843214677217917e-05, "loss": 1.9578, "step": 388100 }, { "epoch": 0.07841323288952359, "grad_norm": 7.133698463439941, "learning_rate": 1.8431742788332866e-05, "loss": 1.7482, "step": 388200 }, { "epoch": 0.07843343207367855, "grad_norm": 5.748772621154785, "learning_rate": 1.8431338804486565e-05, "loss": 1.8473, "step": 388300 }, { "epoch": 0.0784536312578335, "grad_norm": 4.815923690795898, "learning_rate": 1.8430934820640264e-05, "loss": 1.8296, "step": 388400 }, { "epoch": 0.07847383044198845, "grad_norm": 8.091704368591309, "learning_rate": 1.843053083679396e-05, "loss": 1.8186, "step": 388500 }, { "epoch": 0.0784940296261434, "grad_norm": 9.59970760345459, "learning_rate": 1.843012685294766e-05, "loss": 1.768, "step": 388600 }, { "epoch": 0.07851422881029835, "grad_norm": 8.650588035583496, "learning_rate": 1.8429722869101358e-05, "loss": 1.7575, "step": 388700 }, { "epoch": 0.0785344279944533, "grad_norm": 4.982900619506836, "learning_rate": 1.8429318885255057e-05, "loss": 1.7935, "step": 388800 }, { "epoch": 0.07855462717860826, "grad_norm": 3.38824200630188, "learning_rate": 1.8428914901408753e-05, "loss": 1.9291, "step": 388900 }, { "epoch": 0.07857482636276321, "grad_norm": 8.226973533630371, "learning_rate": 1.8428510917562452e-05, "loss": 1.8759, "step": 389000 }, { "epoch": 0.07857482636276321, "eval_calculated_loss": 9.095683097839355, "eval_loss": 2.1551826000213623, "eval_perplexity": 8916.716904942728, "eval_runtime": 120.6648, "eval_samples_per_second": 8.271, "eval_steps_per_second": 2.072, "step": 389000 }, { "epoch": 0.07859502554691816, "grad_norm": 6.632321357727051, "learning_rate": 1.842810693371615e-05, "loss": 1.901, "step": 389100 }, { "epoch": 0.07861522473107312, "grad_norm": 9.012633323669434, "learning_rate": 1.8427702949869847e-05, "loss": 1.8061, "step": 389200 }, { "epoch": 0.07863542391522807, "grad_norm": 6.81278657913208, "learning_rate": 1.8427298966023546e-05, "loss": 1.8606, "step": 389300 }, { "epoch": 0.07865562309938301, "grad_norm": 8.091257095336914, "learning_rate": 1.8426894982177242e-05, "loss": 1.8588, "step": 389400 }, { "epoch": 0.07867582228353796, "grad_norm": 5.139726638793945, "learning_rate": 1.842649099833094e-05, "loss": 1.6844, "step": 389500 }, { "epoch": 0.07869602146769292, "grad_norm": 8.37137222290039, "learning_rate": 1.842608701448464e-05, "loss": 1.9164, "step": 389600 }, { "epoch": 0.07871622065184787, "grad_norm": 6.052548885345459, "learning_rate": 1.842568303063834e-05, "loss": 1.9164, "step": 389700 }, { "epoch": 0.07873641983600282, "grad_norm": 5.997828483581543, "learning_rate": 1.842527904679204e-05, "loss": 1.7989, "step": 389800 }, { "epoch": 0.07875661902015778, "grad_norm": 10.14066219329834, "learning_rate": 1.8424875062945734e-05, "loss": 1.8349, "step": 389900 }, { "epoch": 0.07877681820431273, "grad_norm": 4.017516613006592, "learning_rate": 1.8424471079099433e-05, "loss": 1.8429, "step": 390000 }, { "epoch": 0.07877681820431273, "eval_calculated_loss": 9.056273460388184, "eval_loss": 2.150679588317871, "eval_perplexity": 8572.146614982137, "eval_runtime": 120.8014, "eval_samples_per_second": 8.261, "eval_steps_per_second": 2.07, "step": 390000 }, { "epoch": 0.07879701738846769, "grad_norm": 10.484025955200195, "learning_rate": 1.8424067095253133e-05, "loss": 1.8907, "step": 390100 }, { "epoch": 0.07881721657262263, "grad_norm": 4.054012775421143, "learning_rate": 1.8423663111406828e-05, "loss": 1.8249, "step": 390200 }, { "epoch": 0.07883741575677758, "grad_norm": 10.393050193786621, "learning_rate": 1.8423259127560527e-05, "loss": 1.7758, "step": 390300 }, { "epoch": 0.07885761494093253, "grad_norm": 7.573507785797119, "learning_rate": 1.8422855143714223e-05, "loss": 1.9245, "step": 390400 }, { "epoch": 0.07887781412508749, "grad_norm": 8.59049129486084, "learning_rate": 1.8422451159867922e-05, "loss": 1.903, "step": 390500 }, { "epoch": 0.07889801330924244, "grad_norm": 13.147334098815918, "learning_rate": 1.842204717602162e-05, "loss": 1.738, "step": 390600 }, { "epoch": 0.0789182124933974, "grad_norm": 8.584184646606445, "learning_rate": 1.842164319217532e-05, "loss": 1.7511, "step": 390700 }, { "epoch": 0.07893841167755235, "grad_norm": 9.11816692352295, "learning_rate": 1.842123920832902e-05, "loss": 1.8649, "step": 390800 }, { "epoch": 0.0789586108617073, "grad_norm": 9.415234565734863, "learning_rate": 1.8420835224482715e-05, "loss": 1.7869, "step": 390900 }, { "epoch": 0.07897881004586224, "grad_norm": 5.537289619445801, "learning_rate": 1.8420431240636414e-05, "loss": 1.8932, "step": 391000 }, { "epoch": 0.07897881004586224, "eval_calculated_loss": 9.069291114807129, "eval_loss": 2.1494834423065186, "eval_perplexity": 8684.46533426397, "eval_runtime": 119.6503, "eval_samples_per_second": 8.341, "eval_steps_per_second": 2.089, "step": 391000 }, { "epoch": 0.0789990092300172, "grad_norm": 6.990189075469971, "learning_rate": 1.842002725679011e-05, "loss": 1.8486, "step": 391100 }, { "epoch": 0.07901920841417215, "grad_norm": 6.912900924682617, "learning_rate": 1.841962327294381e-05, "loss": 1.8641, "step": 391200 }, { "epoch": 0.0790394075983271, "grad_norm": 5.210417747497559, "learning_rate": 1.841921928909751e-05, "loss": 1.7923, "step": 391300 }, { "epoch": 0.07905960678248206, "grad_norm": 5.518886566162109, "learning_rate": 1.8418815305251204e-05, "loss": 1.9343, "step": 391400 }, { "epoch": 0.07907980596663701, "grad_norm": 6.5677900314331055, "learning_rate": 1.8418411321404903e-05, "loss": 1.7512, "step": 391500 }, { "epoch": 0.07910000515079196, "grad_norm": 5.010310173034668, "learning_rate": 1.8418007337558602e-05, "loss": 1.7753, "step": 391600 }, { "epoch": 0.07912020433494692, "grad_norm": 8.764114379882812, "learning_rate": 1.8417603353712298e-05, "loss": 1.8137, "step": 391700 }, { "epoch": 0.07914040351910186, "grad_norm": 6.050848484039307, "learning_rate": 1.8417199369865997e-05, "loss": 1.9106, "step": 391800 }, { "epoch": 0.07916060270325681, "grad_norm": 9.08995532989502, "learning_rate": 1.8416795386019696e-05, "loss": 1.823, "step": 391900 }, { "epoch": 0.07918080188741176, "grad_norm": 5.258919715881348, "learning_rate": 1.8416391402173396e-05, "loss": 1.8516, "step": 392000 }, { "epoch": 0.07918080188741176, "eval_calculated_loss": 9.170306205749512, "eval_loss": 2.158487319946289, "eval_perplexity": 9607.566131633354, "eval_runtime": 120.945, "eval_samples_per_second": 8.252, "eval_steps_per_second": 2.067, "step": 392000 }, { "epoch": 0.07920100107156672, "grad_norm": 9.883459091186523, "learning_rate": 1.841598741832709e-05, "loss": 1.7864, "step": 392100 }, { "epoch": 0.07922120025572167, "grad_norm": 8.122231483459473, "learning_rate": 1.841558343448079e-05, "loss": 1.8832, "step": 392200 }, { "epoch": 0.07924139943987663, "grad_norm": 4.788127899169922, "learning_rate": 1.841517945063449e-05, "loss": 1.8231, "step": 392300 }, { "epoch": 0.07926159862403158, "grad_norm": 8.873268127441406, "learning_rate": 1.8414775466788185e-05, "loss": 1.8178, "step": 392400 }, { "epoch": 0.07928179780818653, "grad_norm": 7.817172527313232, "learning_rate": 1.8414371482941884e-05, "loss": 1.8659, "step": 392500 }, { "epoch": 0.07930199699234147, "grad_norm": 4.730844974517822, "learning_rate": 1.841396749909558e-05, "loss": 1.8058, "step": 392600 }, { "epoch": 0.07932219617649643, "grad_norm": 4.476061820983887, "learning_rate": 1.841356351524928e-05, "loss": 1.724, "step": 392700 }, { "epoch": 0.07934239536065138, "grad_norm": 11.80243968963623, "learning_rate": 1.841315953140298e-05, "loss": 1.8605, "step": 392800 }, { "epoch": 0.07936259454480633, "grad_norm": 10.81490421295166, "learning_rate": 1.8412755547556678e-05, "loss": 1.9518, "step": 392900 }, { "epoch": 0.07938279372896129, "grad_norm": 6.091230392456055, "learning_rate": 1.8412351563710377e-05, "loss": 1.8697, "step": 393000 }, { "epoch": 0.07938279372896129, "eval_calculated_loss": 8.978293418884277, "eval_loss": 2.1451430320739746, "eval_perplexity": 7929.08892908528, "eval_runtime": 120.3622, "eval_samples_per_second": 8.292, "eval_steps_per_second": 2.077, "step": 393000 }, { "epoch": 0.07940299291311624, "grad_norm": 6.819962024688721, "learning_rate": 1.8411947579864072e-05, "loss": 1.8725, "step": 393100 }, { "epoch": 0.0794231920972712, "grad_norm": 3.1387197971343994, "learning_rate": 1.841154359601777e-05, "loss": 1.8428, "step": 393200 }, { "epoch": 0.07944339128142615, "grad_norm": 5.4196929931640625, "learning_rate": 1.841113961217147e-05, "loss": 1.844, "step": 393300 }, { "epoch": 0.0794635904655811, "grad_norm": 7.981515407562256, "learning_rate": 1.8410735628325166e-05, "loss": 1.7779, "step": 393400 }, { "epoch": 0.07948378964973604, "grad_norm": 3.953084945678711, "learning_rate": 1.8410331644478866e-05, "loss": 1.8278, "step": 393500 }, { "epoch": 0.079503988833891, "grad_norm": 5.736674785614014, "learning_rate": 1.840992766063256e-05, "loss": 1.86, "step": 393600 }, { "epoch": 0.07952418801804595, "grad_norm": 5.559413909912109, "learning_rate": 1.840952367678626e-05, "loss": 1.9059, "step": 393700 }, { "epoch": 0.0795443872022009, "grad_norm": 8.427830696105957, "learning_rate": 1.840911969293996e-05, "loss": 1.8536, "step": 393800 }, { "epoch": 0.07956458638635586, "grad_norm": 7.526864051818848, "learning_rate": 1.840871570909366e-05, "loss": 1.828, "step": 393900 }, { "epoch": 0.07958478557051081, "grad_norm": 8.979958534240723, "learning_rate": 1.8408311725247358e-05, "loss": 1.8936, "step": 394000 }, { "epoch": 0.07958478557051081, "eval_calculated_loss": 8.989225387573242, "eval_loss": 2.1579647064208984, "eval_perplexity": 8016.245006760952, "eval_runtime": 117.8014, "eval_samples_per_second": 8.472, "eval_steps_per_second": 2.122, "step": 394000 }, { "epoch": 0.07960498475466576, "grad_norm": 7.202105522155762, "learning_rate": 1.8407907741401054e-05, "loss": 1.8385, "step": 394100 }, { "epoch": 0.07962518393882072, "grad_norm": 9.64968204498291, "learning_rate": 1.8407503757554753e-05, "loss": 1.8281, "step": 394200 }, { "epoch": 0.07964538312297566, "grad_norm": 4.55208683013916, "learning_rate": 1.840709977370845e-05, "loss": 1.8272, "step": 394300 }, { "epoch": 0.07966558230713061, "grad_norm": 3.465747356414795, "learning_rate": 1.8406695789862148e-05, "loss": 1.8683, "step": 394400 }, { "epoch": 0.07968578149128556, "grad_norm": 6.7357988357543945, "learning_rate": 1.8406291806015847e-05, "loss": 1.836, "step": 394500 }, { "epoch": 0.07970598067544052, "grad_norm": 8.264881134033203, "learning_rate": 1.8405887822169542e-05, "loss": 1.7488, "step": 394600 }, { "epoch": 0.07972617985959547, "grad_norm": 9.22222900390625, "learning_rate": 1.840548383832324e-05, "loss": 1.8083, "step": 394700 }, { "epoch": 0.07974637904375043, "grad_norm": 4.453227519989014, "learning_rate": 1.840507985447694e-05, "loss": 1.7764, "step": 394800 }, { "epoch": 0.07976657822790538, "grad_norm": 7.964577674865723, "learning_rate": 1.840467587063064e-05, "loss": 1.8713, "step": 394900 }, { "epoch": 0.07978677741206033, "grad_norm": 6.794923305511475, "learning_rate": 1.840427188678434e-05, "loss": 1.8202, "step": 395000 }, { "epoch": 0.07978677741206033, "eval_calculated_loss": 9.074106216430664, "eval_loss": 2.1572513580322266, "eval_perplexity": 8726.38275472555, "eval_runtime": 118.6675, "eval_samples_per_second": 8.41, "eval_steps_per_second": 2.107, "step": 395000 }, { "epoch": 0.07980697659621527, "grad_norm": 6.124146938323975, "learning_rate": 1.8403867902938035e-05, "loss": 1.8783, "step": 395100 }, { "epoch": 0.07982717578037023, "grad_norm": 8.674759864807129, "learning_rate": 1.8403463919091734e-05, "loss": 1.9043, "step": 395200 }, { "epoch": 0.07984737496452518, "grad_norm": 6.616066932678223, "learning_rate": 1.840305993524543e-05, "loss": 1.8241, "step": 395300 }, { "epoch": 0.07986757414868013, "grad_norm": 4.9110260009765625, "learning_rate": 1.840265595139913e-05, "loss": 1.8818, "step": 395400 }, { "epoch": 0.07988777333283509, "grad_norm": 3.347391128540039, "learning_rate": 1.8402251967552828e-05, "loss": 1.8146, "step": 395500 }, { "epoch": 0.07990797251699004, "grad_norm": 6.22420072555542, "learning_rate": 1.8401847983706524e-05, "loss": 1.9537, "step": 395600 }, { "epoch": 0.079928171701145, "grad_norm": 5.356933116912842, "learning_rate": 1.8401443999860223e-05, "loss": 1.8286, "step": 395700 }, { "epoch": 0.07994837088529995, "grad_norm": 8.676271438598633, "learning_rate": 1.8401040016013922e-05, "loss": 1.8693, "step": 395800 }, { "epoch": 0.07996857006945489, "grad_norm": 4.527159214019775, "learning_rate": 1.840063603216762e-05, "loss": 1.8062, "step": 395900 }, { "epoch": 0.07998876925360984, "grad_norm": 5.568970203399658, "learning_rate": 1.8400232048321317e-05, "loss": 1.7872, "step": 396000 }, { "epoch": 0.07998876925360984, "eval_calculated_loss": 9.047502517700195, "eval_loss": 2.152294874191284, "eval_perplexity": 8497.289571619553, "eval_runtime": 119.3729, "eval_samples_per_second": 8.36, "eval_steps_per_second": 2.094, "step": 396000 }, { "epoch": 0.0800089684377648, "grad_norm": 5.118333339691162, "learning_rate": 1.8399828064475016e-05, "loss": 1.8316, "step": 396100 }, { "epoch": 0.08002916762191975, "grad_norm": 7.674463748931885, "learning_rate": 1.8399424080628715e-05, "loss": 1.8673, "step": 396200 }, { "epoch": 0.0800493668060747, "grad_norm": 7.621123790740967, "learning_rate": 1.839902009678241e-05, "loss": 1.8142, "step": 396300 }, { "epoch": 0.08006956599022966, "grad_norm": 8.572112083435059, "learning_rate": 1.839861611293611e-05, "loss": 1.8282, "step": 396400 }, { "epoch": 0.08008976517438461, "grad_norm": 8.876689910888672, "learning_rate": 1.839821212908981e-05, "loss": 1.7394, "step": 396500 }, { "epoch": 0.08010996435853956, "grad_norm": 6.61820125579834, "learning_rate": 1.8397808145243505e-05, "loss": 1.7575, "step": 396600 }, { "epoch": 0.0801301635426945, "grad_norm": 6.804290771484375, "learning_rate": 1.8397404161397204e-05, "loss": 1.8239, "step": 396700 }, { "epoch": 0.08015036272684946, "grad_norm": 8.299723625183105, "learning_rate": 1.83970001775509e-05, "loss": 1.8544, "step": 396800 }, { "epoch": 0.08017056191100441, "grad_norm": 5.7295966148376465, "learning_rate": 1.83965961937046e-05, "loss": 1.8655, "step": 396900 }, { "epoch": 0.08019076109515937, "grad_norm": 5.643389701843262, "learning_rate": 1.8396192209858298e-05, "loss": 1.8542, "step": 397000 }, { "epoch": 0.08019076109515937, "eval_calculated_loss": 8.903852462768555, "eval_loss": 2.146611452102661, "eval_perplexity": 7360.2741728024375, "eval_runtime": 117.4375, "eval_samples_per_second": 8.498, "eval_steps_per_second": 2.129, "step": 397000 }, { "epoch": 0.08021096027931432, "grad_norm": 5.420555591583252, "learning_rate": 1.8395788226011997e-05, "loss": 1.8352, "step": 397100 }, { "epoch": 0.08023115946346927, "grad_norm": 7.185835361480713, "learning_rate": 1.8395384242165696e-05, "loss": 1.7936, "step": 397200 }, { "epoch": 0.08025135864762423, "grad_norm": 7.229457378387451, "learning_rate": 1.8394980258319392e-05, "loss": 1.889, "step": 397300 }, { "epoch": 0.08027155783177918, "grad_norm": 8.57788372039795, "learning_rate": 1.839457627447309e-05, "loss": 1.7514, "step": 397400 }, { "epoch": 0.08029175701593412, "grad_norm": 8.124770164489746, "learning_rate": 1.8394172290626787e-05, "loss": 1.7347, "step": 397500 }, { "epoch": 0.08031195620008907, "grad_norm": 9.51638126373291, "learning_rate": 1.8393768306780486e-05, "loss": 1.8886, "step": 397600 }, { "epoch": 0.08033215538424403, "grad_norm": 5.528271198272705, "learning_rate": 1.8393364322934185e-05, "loss": 1.7873, "step": 397700 }, { "epoch": 0.08035235456839898, "grad_norm": 6.335660934448242, "learning_rate": 1.839296033908788e-05, "loss": 1.8146, "step": 397800 }, { "epoch": 0.08037255375255394, "grad_norm": 5.784611225128174, "learning_rate": 1.839255635524158e-05, "loss": 1.7903, "step": 397900 }, { "epoch": 0.08039275293670889, "grad_norm": 10.501916885375977, "learning_rate": 1.839215237139528e-05, "loss": 1.8804, "step": 398000 }, { "epoch": 0.08039275293670889, "eval_calculated_loss": 8.977622985839844, "eval_loss": 2.154210329055786, "eval_perplexity": 7923.774787442099, "eval_runtime": 120.2815, "eval_samples_per_second": 8.297, "eval_steps_per_second": 2.078, "step": 398000 }, { "epoch": 0.08041295212086384, "grad_norm": 5.46207332611084, "learning_rate": 1.8391748387548978e-05, "loss": 1.8386, "step": 398100 }, { "epoch": 0.0804331513050188, "grad_norm": 9.209818840026855, "learning_rate": 1.8391344403702677e-05, "loss": 1.8685, "step": 398200 }, { "epoch": 0.08045335048917374, "grad_norm": 6.834765434265137, "learning_rate": 1.8390940419856373e-05, "loss": 1.9388, "step": 398300 }, { "epoch": 0.08047354967332869, "grad_norm": 9.667920112609863, "learning_rate": 1.8390536436010072e-05, "loss": 1.8332, "step": 398400 }, { "epoch": 0.08049374885748364, "grad_norm": 8.04558277130127, "learning_rate": 1.8390132452163768e-05, "loss": 1.8324, "step": 398500 }, { "epoch": 0.0805139480416386, "grad_norm": 7.397255897521973, "learning_rate": 1.8389728468317467e-05, "loss": 1.8966, "step": 398600 }, { "epoch": 0.08053414722579355, "grad_norm": 5.293362617492676, "learning_rate": 1.8389324484471166e-05, "loss": 1.8516, "step": 398700 }, { "epoch": 0.0805543464099485, "grad_norm": 9.136662483215332, "learning_rate": 1.8388920500624862e-05, "loss": 1.8552, "step": 398800 }, { "epoch": 0.08057454559410346, "grad_norm": 8.569327354431152, "learning_rate": 1.838851651677856e-05, "loss": 1.8608, "step": 398900 }, { "epoch": 0.08059474477825841, "grad_norm": 7.106268405914307, "learning_rate": 1.838811253293226e-05, "loss": 1.8096, "step": 399000 }, { "epoch": 0.08059474477825841, "eval_calculated_loss": 9.098999977111816, "eval_loss": 2.1468164920806885, "eval_perplexity": 8946.341682167693, "eval_runtime": 120.1552, "eval_samples_per_second": 8.306, "eval_steps_per_second": 2.081, "step": 399000 }, { "epoch": 0.08061494396241335, "grad_norm": 6.949513912200928, "learning_rate": 1.838770854908596e-05, "loss": 1.8753, "step": 399100 }, { "epoch": 0.0806351431465683, "grad_norm": 4.963296413421631, "learning_rate": 1.8387304565239655e-05, "loss": 1.8846, "step": 399200 }, { "epoch": 0.08065534233072326, "grad_norm": 6.850680828094482, "learning_rate": 1.8386900581393354e-05, "loss": 1.8, "step": 399300 }, { "epoch": 0.08067554151487821, "grad_norm": 6.530250549316406, "learning_rate": 1.8386496597547053e-05, "loss": 1.812, "step": 399400 }, { "epoch": 0.08069574069903317, "grad_norm": 8.754434585571289, "learning_rate": 1.838609261370075e-05, "loss": 1.9095, "step": 399500 }, { "epoch": 0.08071593988318812, "grad_norm": 9.656085014343262, "learning_rate": 1.8385688629854448e-05, "loss": 1.8897, "step": 399600 }, { "epoch": 0.08073613906734307, "grad_norm": 3.6430916786193848, "learning_rate": 1.8385284646008147e-05, "loss": 1.8991, "step": 399700 }, { "epoch": 0.08075633825149803, "grad_norm": 3.1433706283569336, "learning_rate": 1.8384880662161843e-05, "loss": 1.7926, "step": 399800 }, { "epoch": 0.08077653743565297, "grad_norm": 9.537979125976562, "learning_rate": 1.8384476678315542e-05, "loss": 1.8655, "step": 399900 }, { "epoch": 0.08079673661980792, "grad_norm": 6.52443790435791, "learning_rate": 1.8384072694469238e-05, "loss": 1.8515, "step": 400000 }, { "epoch": 0.08079673661980792, "eval_calculated_loss": 9.270938873291016, "eval_loss": 2.1477808952331543, "eval_perplexity": 10624.722473295064, "eval_runtime": 124.5533, "eval_samples_per_second": 8.013, "eval_steps_per_second": 2.007, "step": 400000 }, { "epoch": 0.08081693580396287, "grad_norm": 7.808268070220947, "learning_rate": 1.838366871062294e-05, "loss": 1.8135, "step": 400100 }, { "epoch": 0.08083713498811783, "grad_norm": 4.284387111663818, "learning_rate": 1.8383264726776636e-05, "loss": 1.8954, "step": 400200 }, { "epoch": 0.08085733417227278, "grad_norm": 10.422481536865234, "learning_rate": 1.8382860742930335e-05, "loss": 1.7916, "step": 400300 }, { "epoch": 0.08087753335642774, "grad_norm": 8.537772178649902, "learning_rate": 1.8382456759084034e-05, "loss": 1.8123, "step": 400400 }, { "epoch": 0.08089773254058269, "grad_norm": 12.79542064666748, "learning_rate": 1.838205277523773e-05, "loss": 1.804, "step": 400500 }, { "epoch": 0.08091793172473764, "grad_norm": 9.500804901123047, "learning_rate": 1.838164879139143e-05, "loss": 1.8261, "step": 400600 }, { "epoch": 0.08093813090889258, "grad_norm": 3.28655743598938, "learning_rate": 1.8381244807545128e-05, "loss": 1.8377, "step": 400700 }, { "epoch": 0.08095833009304754, "grad_norm": 8.43991470336914, "learning_rate": 1.8380840823698824e-05, "loss": 1.7923, "step": 400800 }, { "epoch": 0.08097852927720249, "grad_norm": 12.134124755859375, "learning_rate": 1.8380436839852523e-05, "loss": 1.8098, "step": 400900 }, { "epoch": 0.08099872846135744, "grad_norm": 6.724854946136475, "learning_rate": 1.838003285600622e-05, "loss": 1.8844, "step": 401000 }, { "epoch": 0.08099872846135744, "eval_calculated_loss": 9.349281311035156, "eval_loss": 2.145357608795166, "eval_perplexity": 11490.562336570476, "eval_runtime": 118.6592, "eval_samples_per_second": 8.411, "eval_steps_per_second": 2.107, "step": 401000 }, { "epoch": 0.0810189276455124, "grad_norm": 7.430522441864014, "learning_rate": 1.837962887215992e-05, "loss": 1.7587, "step": 401100 }, { "epoch": 0.08103912682966735, "grad_norm": 9.940062522888184, "learning_rate": 1.8379224888313617e-05, "loss": 1.7684, "step": 401200 }, { "epoch": 0.0810593260138223, "grad_norm": 8.394464492797852, "learning_rate": 1.8378820904467316e-05, "loss": 1.7937, "step": 401300 }, { "epoch": 0.08107952519797726, "grad_norm": 5.329605579376221, "learning_rate": 1.8378416920621015e-05, "loss": 1.9016, "step": 401400 }, { "epoch": 0.0810997243821322, "grad_norm": 4.088993549346924, "learning_rate": 1.837801293677471e-05, "loss": 1.759, "step": 401500 }, { "epoch": 0.08111992356628715, "grad_norm": 8.148797035217285, "learning_rate": 1.837760895292841e-05, "loss": 1.8157, "step": 401600 }, { "epoch": 0.0811401227504421, "grad_norm": 7.6573567390441895, "learning_rate": 1.8377204969082106e-05, "loss": 1.8064, "step": 401700 }, { "epoch": 0.08116032193459706, "grad_norm": 5.5708112716674805, "learning_rate": 1.8376800985235805e-05, "loss": 1.8477, "step": 401800 }, { "epoch": 0.08118052111875201, "grad_norm": 8.061396598815918, "learning_rate": 1.8376397001389504e-05, "loss": 1.8813, "step": 401900 }, { "epoch": 0.08120072030290697, "grad_norm": 9.693116188049316, "learning_rate": 1.83759930175432e-05, "loss": 1.7701, "step": 402000 }, { "epoch": 0.08120072030290697, "eval_calculated_loss": 9.24411678314209, "eval_loss": 2.1548590660095215, "eval_perplexity": 10343.533110206614, "eval_runtime": 121.675, "eval_samples_per_second": 8.202, "eval_steps_per_second": 2.055, "step": 402000 }, { "epoch": 0.08122091948706192, "grad_norm": 8.771495819091797, "learning_rate": 1.8375589033696903e-05, "loss": 1.8531, "step": 402100 }, { "epoch": 0.08124111867121687, "grad_norm": 5.582803726196289, "learning_rate": 1.8375185049850598e-05, "loss": 1.704, "step": 402200 }, { "epoch": 0.08126131785537181, "grad_norm": 6.409722805023193, "learning_rate": 1.8374781066004297e-05, "loss": 1.8171, "step": 402300 }, { "epoch": 0.08128151703952677, "grad_norm": 9.047882080078125, "learning_rate": 1.8374377082157993e-05, "loss": 1.8476, "step": 402400 }, { "epoch": 0.08130171622368172, "grad_norm": 5.700900554656982, "learning_rate": 1.8373973098311692e-05, "loss": 1.8167, "step": 402500 }, { "epoch": 0.08132191540783668, "grad_norm": 9.834999084472656, "learning_rate": 1.837356911446539e-05, "loss": 1.8669, "step": 402600 }, { "epoch": 0.08134211459199163, "grad_norm": 5.591362953186035, "learning_rate": 1.8373165130619087e-05, "loss": 1.9046, "step": 402700 }, { "epoch": 0.08136231377614658, "grad_norm": 7.634653091430664, "learning_rate": 1.8372761146772786e-05, "loss": 1.8053, "step": 402800 }, { "epoch": 0.08138251296030154, "grad_norm": 6.189750671386719, "learning_rate": 1.8372357162926485e-05, "loss": 1.8783, "step": 402900 }, { "epoch": 0.08140271214445649, "grad_norm": 5.255284309387207, "learning_rate": 1.837195317908018e-05, "loss": 1.7534, "step": 403000 }, { "epoch": 0.08140271214445649, "eval_calculated_loss": 9.233604431152344, "eval_loss": 2.1545093059539795, "eval_perplexity": 10235.367781445639, "eval_runtime": 120.1501, "eval_samples_per_second": 8.306, "eval_steps_per_second": 2.081, "step": 403000 }, { "epoch": 0.08142291132861144, "grad_norm": 4.957564353942871, "learning_rate": 1.837154919523388e-05, "loss": 1.76, "step": 403100 }, { "epoch": 0.08144311051276638, "grad_norm": 8.602977752685547, "learning_rate": 1.837114521138758e-05, "loss": 1.8409, "step": 403200 }, { "epoch": 0.08146330969692134, "grad_norm": 10.338513374328613, "learning_rate": 1.837074122754128e-05, "loss": 1.8401, "step": 403300 }, { "epoch": 0.08148350888107629, "grad_norm": 9.818755149841309, "learning_rate": 1.8370337243694974e-05, "loss": 1.7906, "step": 403400 }, { "epoch": 0.08150370806523125, "grad_norm": 5.810373783111572, "learning_rate": 1.8369933259848673e-05, "loss": 1.8342, "step": 403500 }, { "epoch": 0.0815239072493862, "grad_norm": 6.01206636428833, "learning_rate": 1.8369529276002372e-05, "loss": 1.7831, "step": 403600 }, { "epoch": 0.08154410643354115, "grad_norm": 14.717752456665039, "learning_rate": 1.8369125292156068e-05, "loss": 1.8376, "step": 403700 }, { "epoch": 0.0815643056176961, "grad_norm": 6.195648670196533, "learning_rate": 1.8368721308309767e-05, "loss": 1.784, "step": 403800 }, { "epoch": 0.08158450480185106, "grad_norm": 7.638695240020752, "learning_rate": 1.8368317324463466e-05, "loss": 1.8576, "step": 403900 }, { "epoch": 0.081604703986006, "grad_norm": 6.436873912811279, "learning_rate": 1.8367913340617162e-05, "loss": 1.8557, "step": 404000 }, { "epoch": 0.081604703986006, "eval_calculated_loss": 9.215670585632324, "eval_loss": 2.1565608978271484, "eval_perplexity": 10053.444445182284, "eval_runtime": 119.6852, "eval_samples_per_second": 8.339, "eval_steps_per_second": 2.089, "step": 404000 }, { "epoch": 0.08162490317016095, "grad_norm": 7.715814113616943, "learning_rate": 1.836750935677086e-05, "loss": 1.9059, "step": 404100 }, { "epoch": 0.08164510235431591, "grad_norm": 8.888496398925781, "learning_rate": 1.836710537292456e-05, "loss": 1.8192, "step": 404200 }, { "epoch": 0.08166530153847086, "grad_norm": 13.149572372436523, "learning_rate": 1.836670138907826e-05, "loss": 1.7927, "step": 404300 }, { "epoch": 0.08168550072262581, "grad_norm": 4.346386909484863, "learning_rate": 1.8366297405231955e-05, "loss": 1.8297, "step": 404400 }, { "epoch": 0.08170569990678077, "grad_norm": 4.453954696655273, "learning_rate": 1.8365893421385654e-05, "loss": 1.9031, "step": 404500 }, { "epoch": 0.08172589909093572, "grad_norm": 8.915035247802734, "learning_rate": 1.8365489437539354e-05, "loss": 1.859, "step": 404600 }, { "epoch": 0.08174609827509068, "grad_norm": 7.880258083343506, "learning_rate": 1.836508545369305e-05, "loss": 1.8429, "step": 404700 }, { "epoch": 0.08176629745924562, "grad_norm": 7.265096664428711, "learning_rate": 1.836468146984675e-05, "loss": 1.9361, "step": 404800 }, { "epoch": 0.08178649664340057, "grad_norm": 8.16903305053711, "learning_rate": 1.8364277486000444e-05, "loss": 1.8129, "step": 404900 }, { "epoch": 0.08180669582755552, "grad_norm": 4.945183277130127, "learning_rate": 1.8363873502154143e-05, "loss": 1.7934, "step": 405000 }, { "epoch": 0.08180669582755552, "eval_calculated_loss": 9.119121551513672, "eval_loss": 2.160757541656494, "eval_perplexity": 9128.179457986293, "eval_runtime": 120.1111, "eval_samples_per_second": 8.309, "eval_steps_per_second": 2.081, "step": 405000 }, { "epoch": 0.08182689501171048, "grad_norm": 4.3116841316223145, "learning_rate": 1.8363469518307842e-05, "loss": 1.8463, "step": 405100 }, { "epoch": 0.08184709419586543, "grad_norm": 4.293654918670654, "learning_rate": 1.836306553446154e-05, "loss": 1.8082, "step": 405200 }, { "epoch": 0.08186729338002038, "grad_norm": 5.163362503051758, "learning_rate": 1.836266155061524e-05, "loss": 1.8881, "step": 405300 }, { "epoch": 0.08188749256417534, "grad_norm": 6.968864917755127, "learning_rate": 1.8362257566768936e-05, "loss": 1.7668, "step": 405400 }, { "epoch": 0.08190769174833029, "grad_norm": 6.6363701820373535, "learning_rate": 1.8361853582922636e-05, "loss": 1.8112, "step": 405500 }, { "epoch": 0.08192789093248523, "grad_norm": 6.152227878570557, "learning_rate": 1.8361449599076335e-05, "loss": 1.8137, "step": 405600 }, { "epoch": 0.08194809011664018, "grad_norm": 6.719285011291504, "learning_rate": 1.836104561523003e-05, "loss": 1.8482, "step": 405700 }, { "epoch": 0.08196828930079514, "grad_norm": 3.5866756439208984, "learning_rate": 1.836064163138373e-05, "loss": 1.8428, "step": 405800 }, { "epoch": 0.08198848848495009, "grad_norm": 7.980559825897217, "learning_rate": 1.8360237647537425e-05, "loss": 1.8157, "step": 405900 }, { "epoch": 0.08200868766910505, "grad_norm": 8.286700248718262, "learning_rate": 1.8359833663691124e-05, "loss": 1.8947, "step": 406000 }, { "epoch": 0.08200868766910505, "eval_calculated_loss": 9.119991302490234, "eval_loss": 2.1561620235443115, "eval_perplexity": 9136.122154567469, "eval_runtime": 119.5087, "eval_samples_per_second": 8.351, "eval_steps_per_second": 2.092, "step": 406000 }, { "epoch": 0.08202888685326, "grad_norm": 8.093257904052734, "learning_rate": 1.8359429679844824e-05, "loss": 1.8007, "step": 406100 }, { "epoch": 0.08204908603741495, "grad_norm": 8.439946174621582, "learning_rate": 1.835902569599852e-05, "loss": 1.8025, "step": 406200 }, { "epoch": 0.08206928522156991, "grad_norm": 7.3863091468811035, "learning_rate": 1.8358621712152222e-05, "loss": 1.8274, "step": 406300 }, { "epoch": 0.08208948440572485, "grad_norm": 8.666865348815918, "learning_rate": 1.8358217728305918e-05, "loss": 1.7355, "step": 406400 }, { "epoch": 0.0821096835898798, "grad_norm": 12.35438060760498, "learning_rate": 1.8357813744459617e-05, "loss": 1.8443, "step": 406500 }, { "epoch": 0.08212988277403475, "grad_norm": 12.035587310791016, "learning_rate": 1.8357409760613312e-05, "loss": 1.8244, "step": 406600 }, { "epoch": 0.08215008195818971, "grad_norm": 8.379929542541504, "learning_rate": 1.835700577676701e-05, "loss": 1.7965, "step": 406700 }, { "epoch": 0.08217028114234466, "grad_norm": 4.570988178253174, "learning_rate": 1.835660179292071e-05, "loss": 1.767, "step": 406800 }, { "epoch": 0.08219048032649962, "grad_norm": 5.948405742645264, "learning_rate": 1.8356197809074406e-05, "loss": 1.7932, "step": 406900 }, { "epoch": 0.08221067951065457, "grad_norm": 5.167417526245117, "learning_rate": 1.8355793825228106e-05, "loss": 1.7411, "step": 407000 }, { "epoch": 0.08221067951065457, "eval_calculated_loss": 9.172855377197266, "eval_loss": 2.151095151901245, "eval_perplexity": 9632.088707744408, "eval_runtime": 117.6024, "eval_samples_per_second": 8.486, "eval_steps_per_second": 2.126, "step": 407000 }, { "epoch": 0.08223087869480952, "grad_norm": 9.1135835647583, "learning_rate": 1.8355389841381805e-05, "loss": 1.8799, "step": 407100 }, { "epoch": 0.08225107787896446, "grad_norm": 6.2787394523620605, "learning_rate": 1.83549858575355e-05, "loss": 1.846, "step": 407200 }, { "epoch": 0.08227127706311942, "grad_norm": 7.393254280090332, "learning_rate": 1.83545818736892e-05, "loss": 1.8194, "step": 407300 }, { "epoch": 0.08229147624727437, "grad_norm": 6.187542915344238, "learning_rate": 1.83541778898429e-05, "loss": 1.8702, "step": 407400 }, { "epoch": 0.08231167543142932, "grad_norm": 9.188738822937012, "learning_rate": 1.8353773905996598e-05, "loss": 1.7744, "step": 407500 }, { "epoch": 0.08233187461558428, "grad_norm": 5.8349609375, "learning_rate": 1.8353369922150294e-05, "loss": 1.7732, "step": 407600 }, { "epoch": 0.08235207379973923, "grad_norm": 5.107840061187744, "learning_rate": 1.8352965938303993e-05, "loss": 1.8463, "step": 407700 }, { "epoch": 0.08237227298389418, "grad_norm": 21.127429962158203, "learning_rate": 1.8352561954457692e-05, "loss": 1.7791, "step": 407800 }, { "epoch": 0.08239247216804914, "grad_norm": 9.074225425720215, "learning_rate": 1.8352157970611388e-05, "loss": 1.7791, "step": 407900 }, { "epoch": 0.08241267135220408, "grad_norm": 5.3416218757629395, "learning_rate": 1.8351753986765087e-05, "loss": 1.7914, "step": 408000 }, { "epoch": 0.08241267135220408, "eval_calculated_loss": 9.293042182922363, "eval_loss": 2.1547958850860596, "eval_perplexity": 10862.178619754732, "eval_runtime": 118.4003, "eval_samples_per_second": 8.429, "eval_steps_per_second": 2.111, "step": 408000 }, { "epoch": 0.08243287053635903, "grad_norm": 6.473891258239746, "learning_rate": 1.8351350002918782e-05, "loss": 1.9047, "step": 408100 }, { "epoch": 0.08245306972051399, "grad_norm": 9.919487953186035, "learning_rate": 1.835094601907248e-05, "loss": 1.7425, "step": 408200 }, { "epoch": 0.08247326890466894, "grad_norm": 7.697896480560303, "learning_rate": 1.835054203522618e-05, "loss": 1.8202, "step": 408300 }, { "epoch": 0.08249346808882389, "grad_norm": 4.806969165802002, "learning_rate": 1.835013805137988e-05, "loss": 1.802, "step": 408400 }, { "epoch": 0.08251366727297885, "grad_norm": 7.501895427703857, "learning_rate": 1.834973406753358e-05, "loss": 1.7791, "step": 408500 }, { "epoch": 0.0825338664571338, "grad_norm": 6.624790668487549, "learning_rate": 1.8349330083687275e-05, "loss": 1.8449, "step": 408600 }, { "epoch": 0.08255406564128875, "grad_norm": 8.383810043334961, "learning_rate": 1.8348926099840974e-05, "loss": 1.8637, "step": 408700 }, { "epoch": 0.0825742648254437, "grad_norm": 8.27509593963623, "learning_rate": 1.8348522115994673e-05, "loss": 1.8782, "step": 408800 }, { "epoch": 0.08259446400959865, "grad_norm": 4.683411598205566, "learning_rate": 1.834811813214837e-05, "loss": 1.8374, "step": 408900 }, { "epoch": 0.0826146631937536, "grad_norm": 4.151754379272461, "learning_rate": 1.8347714148302068e-05, "loss": 1.8658, "step": 409000 }, { "epoch": 0.0826146631937536, "eval_calculated_loss": 9.316883087158203, "eval_loss": 2.1577250957489014, "eval_perplexity": 11124.254428861175, "eval_runtime": 119.7635, "eval_samples_per_second": 8.333, "eval_steps_per_second": 2.087, "step": 409000 }, { "epoch": 0.08263486237790856, "grad_norm": 4.9033026695251465, "learning_rate": 1.8347310164455764e-05, "loss": 1.8087, "step": 409100 }, { "epoch": 0.08265506156206351, "grad_norm": 3.424278736114502, "learning_rate": 1.8346906180609463e-05, "loss": 1.9102, "step": 409200 }, { "epoch": 0.08267526074621846, "grad_norm": 4.852458477020264, "learning_rate": 1.8346502196763162e-05, "loss": 1.8587, "step": 409300 }, { "epoch": 0.08269545993037342, "grad_norm": 6.583441257476807, "learning_rate": 1.834609821291686e-05, "loss": 1.8122, "step": 409400 }, { "epoch": 0.08271565911452837, "grad_norm": 10.762838363647461, "learning_rate": 1.834569422907056e-05, "loss": 1.8962, "step": 409500 }, { "epoch": 0.08273585829868331, "grad_norm": 8.479572296142578, "learning_rate": 1.8345290245224256e-05, "loss": 1.8337, "step": 409600 }, { "epoch": 0.08275605748283826, "grad_norm": 5.637114524841309, "learning_rate": 1.8344886261377955e-05, "loss": 1.7978, "step": 409700 }, { "epoch": 0.08277625666699322, "grad_norm": 8.254775047302246, "learning_rate": 1.834448227753165e-05, "loss": 1.8589, "step": 409800 }, { "epoch": 0.08279645585114817, "grad_norm": 3.5891435146331787, "learning_rate": 1.834407829368535e-05, "loss": 1.8028, "step": 409900 }, { "epoch": 0.08281665503530312, "grad_norm": 6.968643665313721, "learning_rate": 1.834367430983905e-05, "loss": 1.8142, "step": 410000 }, { "epoch": 0.08281665503530312, "eval_calculated_loss": 9.193015098571777, "eval_loss": 2.1622018814086914, "eval_perplexity": 9828.239461456213, "eval_runtime": 120.1998, "eval_samples_per_second": 8.303, "eval_steps_per_second": 2.08, "step": 410000 }, { "epoch": 0.08283685421945808, "grad_norm": 6.232243061065674, "learning_rate": 1.8343270325992745e-05, "loss": 1.8406, "step": 410100 }, { "epoch": 0.08285705340361303, "grad_norm": 8.362969398498535, "learning_rate": 1.8342866342146444e-05, "loss": 1.8433, "step": 410200 }, { "epoch": 0.08287725258776799, "grad_norm": 5.544002056121826, "learning_rate": 1.8342462358300143e-05, "loss": 1.8332, "step": 410300 }, { "epoch": 0.08289745177192293, "grad_norm": 6.985019683837891, "learning_rate": 1.8342058374453842e-05, "loss": 1.9677, "step": 410400 }, { "epoch": 0.08291765095607788, "grad_norm": 6.206446647644043, "learning_rate": 1.834165439060754e-05, "loss": 1.7036, "step": 410500 }, { "epoch": 0.08293785014023283, "grad_norm": 7.265565872192383, "learning_rate": 1.8341250406761237e-05, "loss": 1.8439, "step": 410600 }, { "epoch": 0.08295804932438779, "grad_norm": 11.228412628173828, "learning_rate": 1.8340846422914936e-05, "loss": 1.7886, "step": 410700 }, { "epoch": 0.08297824850854274, "grad_norm": 6.535689353942871, "learning_rate": 1.8340442439068632e-05, "loss": 1.8567, "step": 410800 }, { "epoch": 0.0829984476926977, "grad_norm": 7.5399298667907715, "learning_rate": 1.834003845522233e-05, "loss": 1.7995, "step": 410900 }, { "epoch": 0.08301864687685265, "grad_norm": 7.615657329559326, "learning_rate": 1.833963447137603e-05, "loss": 1.8567, "step": 411000 }, { "epoch": 0.08301864687685265, "eval_calculated_loss": 9.231804847717285, "eval_loss": 2.1610569953918457, "eval_perplexity": 10216.964946819058, "eval_runtime": 119.0936, "eval_samples_per_second": 8.38, "eval_steps_per_second": 2.099, "step": 411000 }, { "epoch": 0.0830388460610076, "grad_norm": 9.348479270935059, "learning_rate": 1.8339230487529726e-05, "loss": 1.8569, "step": 411100 }, { "epoch": 0.08305904524516254, "grad_norm": 4.87159538269043, "learning_rate": 1.8338826503683425e-05, "loss": 1.8449, "step": 411200 }, { "epoch": 0.0830792444293175, "grad_norm": 6.147030830383301, "learning_rate": 1.8338422519837124e-05, "loss": 1.8731, "step": 411300 }, { "epoch": 0.08309944361347245, "grad_norm": 6.228244781494141, "learning_rate": 1.833801853599082e-05, "loss": 1.8063, "step": 411400 }, { "epoch": 0.0831196427976274, "grad_norm": 6.560942649841309, "learning_rate": 1.833761455214452e-05, "loss": 1.7784, "step": 411500 }, { "epoch": 0.08313984198178236, "grad_norm": 9.809507369995117, "learning_rate": 1.8337210568298218e-05, "loss": 1.8311, "step": 411600 }, { "epoch": 0.08316004116593731, "grad_norm": 9.956226348876953, "learning_rate": 1.8336806584451917e-05, "loss": 1.8135, "step": 411700 }, { "epoch": 0.08318024035009226, "grad_norm": 8.294797897338867, "learning_rate": 1.8336402600605613e-05, "loss": 1.794, "step": 411800 }, { "epoch": 0.08320043953424722, "grad_norm": 6.48022985458374, "learning_rate": 1.8335998616759312e-05, "loss": 1.7806, "step": 411900 }, { "epoch": 0.08322063871840217, "grad_norm": 5.242206573486328, "learning_rate": 1.833559463291301e-05, "loss": 1.8548, "step": 412000 }, { "epoch": 0.08322063871840217, "eval_calculated_loss": 9.131585121154785, "eval_loss": 2.166351795196533, "eval_perplexity": 9242.661101358424, "eval_runtime": 122.0293, "eval_samples_per_second": 8.178, "eval_steps_per_second": 2.049, "step": 412000 }, { "epoch": 0.08324083790255711, "grad_norm": 7.4920501708984375, "learning_rate": 1.8335190649066707e-05, "loss": 1.8424, "step": 412100 }, { "epoch": 0.08326103708671206, "grad_norm": 4.95317268371582, "learning_rate": 1.8334786665220406e-05, "loss": 1.8498, "step": 412200 }, { "epoch": 0.08328123627086702, "grad_norm": 8.700780868530273, "learning_rate": 1.8334382681374102e-05, "loss": 1.7557, "step": 412300 }, { "epoch": 0.08330143545502197, "grad_norm": 8.083622932434082, "learning_rate": 1.83339786975278e-05, "loss": 1.8652, "step": 412400 }, { "epoch": 0.08332163463917693, "grad_norm": 8.082448959350586, "learning_rate": 1.83335747136815e-05, "loss": 1.8034, "step": 412500 }, { "epoch": 0.08334183382333188, "grad_norm": 7.220121383666992, "learning_rate": 1.83331707298352e-05, "loss": 1.8107, "step": 412600 }, { "epoch": 0.08336203300748683, "grad_norm": 7.790622711181641, "learning_rate": 1.8332766745988898e-05, "loss": 1.7773, "step": 412700 }, { "epoch": 0.08338223219164179, "grad_norm": 4.764991760253906, "learning_rate": 1.8332362762142594e-05, "loss": 1.8366, "step": 412800 }, { "epoch": 0.08340243137579673, "grad_norm": 9.027872085571289, "learning_rate": 1.8331958778296293e-05, "loss": 1.8284, "step": 412900 }, { "epoch": 0.08342263055995168, "grad_norm": 7.291168689727783, "learning_rate": 1.833155479444999e-05, "loss": 1.7647, "step": 413000 }, { "epoch": 0.08342263055995168, "eval_calculated_loss": 9.080599784851074, "eval_loss": 2.1616337299346924, "eval_perplexity": 8783.232497290139, "eval_runtime": 121.1192, "eval_samples_per_second": 8.24, "eval_steps_per_second": 2.064, "step": 413000 }, { "epoch": 0.08344282974410663, "grad_norm": 4.920919895172119, "learning_rate": 1.8331150810603688e-05, "loss": 1.8074, "step": 413100 }, { "epoch": 0.08346302892826159, "grad_norm": 6.1620073318481445, "learning_rate": 1.8330746826757387e-05, "loss": 1.8674, "step": 413200 }, { "epoch": 0.08348322811241654, "grad_norm": 8.968664169311523, "learning_rate": 1.8330342842911083e-05, "loss": 1.8766, "step": 413300 }, { "epoch": 0.0835034272965715, "grad_norm": 7.475934028625488, "learning_rate": 1.8329938859064782e-05, "loss": 1.7777, "step": 413400 }, { "epoch": 0.08352362648072645, "grad_norm": 5.7123589515686035, "learning_rate": 1.832953487521848e-05, "loss": 1.7503, "step": 413500 }, { "epoch": 0.0835438256648814, "grad_norm": 3.9801735877990723, "learning_rate": 1.832913089137218e-05, "loss": 1.8154, "step": 413600 }, { "epoch": 0.08356402484903634, "grad_norm": 8.94515609741211, "learning_rate": 1.832872690752588e-05, "loss": 1.8272, "step": 413700 }, { "epoch": 0.0835842240331913, "grad_norm": 7.358917236328125, "learning_rate": 1.8328322923679575e-05, "loss": 1.855, "step": 413800 }, { "epoch": 0.08360442321734625, "grad_norm": 8.395889282226562, "learning_rate": 1.8327918939833274e-05, "loss": 1.911, "step": 413900 }, { "epoch": 0.0836246224015012, "grad_norm": 5.069589138031006, "learning_rate": 1.832751495598697e-05, "loss": 1.7949, "step": 414000 }, { "epoch": 0.0836246224015012, "eval_calculated_loss": 9.181689262390137, "eval_loss": 2.139704465866089, "eval_perplexity": 9717.554414884926, "eval_runtime": 125.5308, "eval_samples_per_second": 7.95, "eval_steps_per_second": 1.992, "step": 414000 }, { "epoch": 0.08364482158565616, "grad_norm": 9.519525527954102, "learning_rate": 1.832711097214067e-05, "loss": 1.7955, "step": 414100 }, { "epoch": 0.08366502076981111, "grad_norm": 6.76863431930542, "learning_rate": 1.8326706988294368e-05, "loss": 1.9001, "step": 414200 }, { "epoch": 0.08368521995396606, "grad_norm": 6.896796703338623, "learning_rate": 1.8326303004448064e-05, "loss": 1.8607, "step": 414300 }, { "epoch": 0.08370541913812102, "grad_norm": 8.885563850402832, "learning_rate": 1.8325899020601763e-05, "loss": 1.8922, "step": 414400 }, { "epoch": 0.08372561832227596, "grad_norm": 5.971884250640869, "learning_rate": 1.8325495036755462e-05, "loss": 1.8215, "step": 414500 }, { "epoch": 0.08374581750643091, "grad_norm": 11.921563148498535, "learning_rate": 1.832509105290916e-05, "loss": 1.7905, "step": 414600 }, { "epoch": 0.08376601669058586, "grad_norm": 7.815409183502197, "learning_rate": 1.8324687069062857e-05, "loss": 1.8113, "step": 414700 }, { "epoch": 0.08378621587474082, "grad_norm": 5.517616271972656, "learning_rate": 1.8324283085216556e-05, "loss": 1.8443, "step": 414800 }, { "epoch": 0.08380641505889577, "grad_norm": 5.939117908477783, "learning_rate": 1.8323879101370255e-05, "loss": 1.7448, "step": 414900 }, { "epoch": 0.08382661424305073, "grad_norm": 4.366371154785156, "learning_rate": 1.832347511752395e-05, "loss": 1.859, "step": 415000 }, { "epoch": 0.08382661424305073, "eval_calculated_loss": 9.210808753967285, "eval_loss": 2.1431407928466797, "eval_perplexity": 10004.68491699075, "eval_runtime": 121.4232, "eval_samples_per_second": 8.219, "eval_steps_per_second": 2.059, "step": 415000 }, { "epoch": 0.08384681342720568, "grad_norm": 6.6482110023498535, "learning_rate": 1.832307113367765e-05, "loss": 1.8265, "step": 415100 }, { "epoch": 0.08386701261136063, "grad_norm": 8.47255802154541, "learning_rate": 1.832266714983135e-05, "loss": 1.8453, "step": 415200 }, { "epoch": 0.08388721179551557, "grad_norm": 7.746036052703857, "learning_rate": 1.8322263165985045e-05, "loss": 1.8113, "step": 415300 }, { "epoch": 0.08390741097967053, "grad_norm": 7.141107559204102, "learning_rate": 1.8321859182138744e-05, "loss": 1.8424, "step": 415400 }, { "epoch": 0.08392761016382548, "grad_norm": 7.3766632080078125, "learning_rate": 1.832145519829244e-05, "loss": 1.8478, "step": 415500 }, { "epoch": 0.08394780934798043, "grad_norm": 5.821646690368652, "learning_rate": 1.8321051214446142e-05, "loss": 1.8372, "step": 415600 }, { "epoch": 0.08396800853213539, "grad_norm": 8.392439842224121, "learning_rate": 1.8320647230599838e-05, "loss": 1.7943, "step": 415700 }, { "epoch": 0.08398820771629034, "grad_norm": 9.018341064453125, "learning_rate": 1.8320243246753537e-05, "loss": 1.7746, "step": 415800 }, { "epoch": 0.0840084069004453, "grad_norm": 5.546571254730225, "learning_rate": 1.8319839262907236e-05, "loss": 1.8596, "step": 415900 }, { "epoch": 0.08402860608460025, "grad_norm": 7.279107570648193, "learning_rate": 1.8319435279060932e-05, "loss": 1.9107, "step": 416000 }, { "epoch": 0.08402860608460025, "eval_calculated_loss": 9.178197860717773, "eval_loss": 2.1484458446502686, "eval_perplexity": 9683.68568821863, "eval_runtime": 119.0536, "eval_samples_per_second": 8.383, "eval_steps_per_second": 2.1, "step": 416000 }, { "epoch": 0.08404880526875519, "grad_norm": 7.332847595214844, "learning_rate": 1.831903129521463e-05, "loss": 1.7986, "step": 416100 }, { "epoch": 0.08406900445291014, "grad_norm": 6.626583576202393, "learning_rate": 1.831862731136833e-05, "loss": 1.8227, "step": 416200 }, { "epoch": 0.0840892036370651, "grad_norm": 4.4493727684021, "learning_rate": 1.8318223327522026e-05, "loss": 1.8392, "step": 416300 }, { "epoch": 0.08410940282122005, "grad_norm": 6.410115718841553, "learning_rate": 1.8317819343675725e-05, "loss": 1.7998, "step": 416400 }, { "epoch": 0.084129602005375, "grad_norm": 7.831961154937744, "learning_rate": 1.831741535982942e-05, "loss": 1.7291, "step": 416500 }, { "epoch": 0.08414980118952996, "grad_norm": 5.93422269821167, "learning_rate": 1.8317011375983124e-05, "loss": 1.8577, "step": 416600 }, { "epoch": 0.08417000037368491, "grad_norm": 9.494799613952637, "learning_rate": 1.831660739213682e-05, "loss": 1.8205, "step": 416700 }, { "epoch": 0.08419019955783986, "grad_norm": 6.009932041168213, "learning_rate": 1.831620340829052e-05, "loss": 1.7694, "step": 416800 }, { "epoch": 0.0842103987419948, "grad_norm": 8.427569389343262, "learning_rate": 1.8315799424444218e-05, "loss": 1.784, "step": 416900 }, { "epoch": 0.08423059792614976, "grad_norm": 6.405014991760254, "learning_rate": 1.8315395440597913e-05, "loss": 1.8082, "step": 417000 }, { "epoch": 0.08423059792614976, "eval_calculated_loss": 9.29077434539795, "eval_loss": 2.15290904045105, "eval_perplexity": 10837.57287494545, "eval_runtime": 120.4644, "eval_samples_per_second": 8.285, "eval_steps_per_second": 2.075, "step": 417000 }, { "epoch": 0.08425079711030471, "grad_norm": 7.644352912902832, "learning_rate": 1.8314991456751612e-05, "loss": 1.7831, "step": 417100 }, { "epoch": 0.08427099629445967, "grad_norm": 6.26356315612793, "learning_rate": 1.8314587472905308e-05, "loss": 1.8455, "step": 417200 }, { "epoch": 0.08429119547861462, "grad_norm": 7.846127033233643, "learning_rate": 1.8314183489059007e-05, "loss": 1.8415, "step": 417300 }, { "epoch": 0.08431139466276957, "grad_norm": 7.160844326019287, "learning_rate": 1.8313779505212706e-05, "loss": 1.9099, "step": 417400 }, { "epoch": 0.08433159384692453, "grad_norm": 9.112675666809082, "learning_rate": 1.8313375521366402e-05, "loss": 1.7762, "step": 417500 }, { "epoch": 0.08435179303107948, "grad_norm": 7.911828517913818, "learning_rate": 1.83129715375201e-05, "loss": 1.8051, "step": 417600 }, { "epoch": 0.08437199221523442, "grad_norm": 7.9289703369140625, "learning_rate": 1.83125675536738e-05, "loss": 1.8482, "step": 417700 }, { "epoch": 0.08439219139938937, "grad_norm": 12.742350578308105, "learning_rate": 1.83121635698275e-05, "loss": 1.7145, "step": 417800 }, { "epoch": 0.08441239058354433, "grad_norm": 7.211028099060059, "learning_rate": 1.8311759585981195e-05, "loss": 1.7512, "step": 417900 }, { "epoch": 0.08443258976769928, "grad_norm": 8.87987232208252, "learning_rate": 1.8311355602134894e-05, "loss": 1.8669, "step": 418000 }, { "epoch": 0.08443258976769928, "eval_calculated_loss": 9.140390396118164, "eval_loss": 2.1506943702697754, "eval_perplexity": 9324.404632734151, "eval_runtime": 119.5094, "eval_samples_per_second": 8.351, "eval_steps_per_second": 2.092, "step": 418000 }, { "epoch": 0.08445278895185424, "grad_norm": 9.283807754516602, "learning_rate": 1.8310951618288594e-05, "loss": 1.697, "step": 418100 }, { "epoch": 0.08447298813600919, "grad_norm": 7.230323314666748, "learning_rate": 1.831054763444229e-05, "loss": 1.7571, "step": 418200 }, { "epoch": 0.08449318732016414, "grad_norm": 8.235542297363281, "learning_rate": 1.831014365059599e-05, "loss": 1.8471, "step": 418300 }, { "epoch": 0.0845133865043191, "grad_norm": 8.232141494750977, "learning_rate": 1.8309739666749688e-05, "loss": 1.8008, "step": 418400 }, { "epoch": 0.08453358568847404, "grad_norm": 6.491087436676025, "learning_rate": 1.8309335682903383e-05, "loss": 1.7641, "step": 418500 }, { "epoch": 0.08455378487262899, "grad_norm": 6.849950790405273, "learning_rate": 1.8308931699057082e-05, "loss": 1.8929, "step": 418600 }, { "epoch": 0.08457398405678394, "grad_norm": 6.340728759765625, "learning_rate": 1.830852771521078e-05, "loss": 1.8722, "step": 418700 }, { "epoch": 0.0845941832409389, "grad_norm": 6.702763080596924, "learning_rate": 1.830812373136448e-05, "loss": 1.7258, "step": 418800 }, { "epoch": 0.08461438242509385, "grad_norm": 9.978192329406738, "learning_rate": 1.8307719747518176e-05, "loss": 1.8401, "step": 418900 }, { "epoch": 0.0846345816092488, "grad_norm": 6.636532783508301, "learning_rate": 1.8307315763671876e-05, "loss": 1.8932, "step": 419000 }, { "epoch": 0.0846345816092488, "eval_calculated_loss": 8.994178771972656, "eval_loss": 2.157411813735962, "eval_perplexity": 8056.051055660007, "eval_runtime": 118.4595, "eval_samples_per_second": 8.425, "eval_steps_per_second": 2.11, "step": 419000 }, { "epoch": 0.08465478079340376, "grad_norm": 8.755831718444824, "learning_rate": 1.8306911779825575e-05, "loss": 1.7632, "step": 419100 }, { "epoch": 0.08467497997755871, "grad_norm": 5.848514080047607, "learning_rate": 1.830650779597927e-05, "loss": 1.8589, "step": 419200 }, { "epoch": 0.08469517916171365, "grad_norm": 4.427034854888916, "learning_rate": 1.830610381213297e-05, "loss": 1.8249, "step": 419300 }, { "epoch": 0.0847153783458686, "grad_norm": 7.695736408233643, "learning_rate": 1.830569982828667e-05, "loss": 1.748, "step": 419400 }, { "epoch": 0.08473557753002356, "grad_norm": 3.1310458183288574, "learning_rate": 1.8305295844440364e-05, "loss": 1.7741, "step": 419500 }, { "epoch": 0.08475577671417851, "grad_norm": 11.125128746032715, "learning_rate": 1.8304891860594064e-05, "loss": 1.7949, "step": 419600 }, { "epoch": 0.08477597589833347, "grad_norm": 9.05510425567627, "learning_rate": 1.8304487876747763e-05, "loss": 1.9178, "step": 419700 }, { "epoch": 0.08479617508248842, "grad_norm": 7.098130226135254, "learning_rate": 1.8304083892901462e-05, "loss": 1.7303, "step": 419800 }, { "epoch": 0.08481637426664337, "grad_norm": 7.98378324508667, "learning_rate": 1.8303679909055158e-05, "loss": 1.9111, "step": 419900 }, { "epoch": 0.08483657345079833, "grad_norm": 7.242863655090332, "learning_rate": 1.8303275925208857e-05, "loss": 1.8008, "step": 420000 }, { "epoch": 0.08483657345079833, "eval_calculated_loss": 8.993282318115234, "eval_loss": 2.161144256591797, "eval_perplexity": 8048.8324136887195, "eval_runtime": 120.2083, "eval_samples_per_second": 8.302, "eval_steps_per_second": 2.08, "step": 420000 }, { "epoch": 0.08485677263495327, "grad_norm": 6.413053035736084, "learning_rate": 1.8302871941362556e-05, "loss": 1.7171, "step": 420100 }, { "epoch": 0.08487697181910822, "grad_norm": 6.688105583190918, "learning_rate": 1.830246795751625e-05, "loss": 1.8015, "step": 420200 }, { "epoch": 0.08489717100326317, "grad_norm": 3.7928783893585205, "learning_rate": 1.830206397366995e-05, "loss": 1.8014, "step": 420300 }, { "epoch": 0.08491737018741813, "grad_norm": 4.180181980133057, "learning_rate": 1.8301659989823646e-05, "loss": 1.8149, "step": 420400 }, { "epoch": 0.08493756937157308, "grad_norm": 7.1839118003845215, "learning_rate": 1.8301256005977346e-05, "loss": 1.7863, "step": 420500 }, { "epoch": 0.08495776855572804, "grad_norm": 7.554237365722656, "learning_rate": 1.8300852022131045e-05, "loss": 1.6886, "step": 420600 }, { "epoch": 0.08497796773988299, "grad_norm": 7.224564075469971, "learning_rate": 1.830044803828474e-05, "loss": 1.8987, "step": 420700 }, { "epoch": 0.08499816692403794, "grad_norm": 4.308104991912842, "learning_rate": 1.8300044054438443e-05, "loss": 1.7918, "step": 420800 }, { "epoch": 0.08501836610819288, "grad_norm": 6.2921013832092285, "learning_rate": 1.829964007059214e-05, "loss": 1.7949, "step": 420900 }, { "epoch": 0.08503856529234784, "grad_norm": 7.541611671447754, "learning_rate": 1.8299236086745838e-05, "loss": 1.8439, "step": 421000 }, { "epoch": 0.08503856529234784, "eval_calculated_loss": 9.303682327270508, "eval_loss": 2.154186248779297, "eval_perplexity": 10978.370822910694, "eval_runtime": 116.8188, "eval_samples_per_second": 8.543, "eval_steps_per_second": 2.14, "step": 421000 }, { "epoch": 0.08505876447650279, "grad_norm": 8.937958717346191, "learning_rate": 1.8298832102899537e-05, "loss": 1.8998, "step": 421100 }, { "epoch": 0.08507896366065774, "grad_norm": 6.996360778808594, "learning_rate": 1.8298428119053233e-05, "loss": 1.7641, "step": 421200 }, { "epoch": 0.0850991628448127, "grad_norm": 5.88022518157959, "learning_rate": 1.8298024135206932e-05, "loss": 1.7682, "step": 421300 }, { "epoch": 0.08511936202896765, "grad_norm": 4.9054274559021, "learning_rate": 1.8297620151360628e-05, "loss": 1.7982, "step": 421400 }, { "epoch": 0.0851395612131226, "grad_norm": 8.138473510742188, "learning_rate": 1.8297216167514327e-05, "loss": 1.7551, "step": 421500 }, { "epoch": 0.08515976039727756, "grad_norm": 5.793388366699219, "learning_rate": 1.8296812183668026e-05, "loss": 1.6982, "step": 421600 }, { "epoch": 0.08517995958143251, "grad_norm": 8.462969779968262, "learning_rate": 1.829640819982172e-05, "loss": 1.7167, "step": 421700 }, { "epoch": 0.08520015876558745, "grad_norm": 7.344801902770996, "learning_rate": 1.8296004215975424e-05, "loss": 1.9117, "step": 421800 }, { "epoch": 0.0852203579497424, "grad_norm": 7.763269424438477, "learning_rate": 1.829560023212912e-05, "loss": 1.8082, "step": 421900 }, { "epoch": 0.08524055713389736, "grad_norm": 7.260557174682617, "learning_rate": 1.829519624828282e-05, "loss": 1.8075, "step": 422000 }, { "epoch": 0.08524055713389736, "eval_calculated_loss": 9.247808456420898, "eval_loss": 2.160222291946411, "eval_perplexity": 10381.788624981926, "eval_runtime": 119.7512, "eval_samples_per_second": 8.334, "eval_steps_per_second": 2.088, "step": 422000 }, { "epoch": 0.08526075631805231, "grad_norm": 9.569961547851562, "learning_rate": 1.8294792264436515e-05, "loss": 1.8264, "step": 422100 }, { "epoch": 0.08528095550220727, "grad_norm": 4.935611248016357, "learning_rate": 1.8294388280590214e-05, "loss": 1.8399, "step": 422200 }, { "epoch": 0.08530115468636222, "grad_norm": 6.584778785705566, "learning_rate": 1.8293984296743913e-05, "loss": 1.8551, "step": 422300 }, { "epoch": 0.08532135387051717, "grad_norm": 4.479078769683838, "learning_rate": 1.829358031289761e-05, "loss": 1.8224, "step": 422400 }, { "epoch": 0.08534155305467213, "grad_norm": 6.991487979888916, "learning_rate": 1.8293176329051308e-05, "loss": 1.7717, "step": 422500 }, { "epoch": 0.08536175223882707, "grad_norm": 5.194042682647705, "learning_rate": 1.8292772345205007e-05, "loss": 1.847, "step": 422600 }, { "epoch": 0.08538195142298202, "grad_norm": 7.741470813751221, "learning_rate": 1.8292368361358703e-05, "loss": 1.8057, "step": 422700 }, { "epoch": 0.08540215060713698, "grad_norm": 9.566228866577148, "learning_rate": 1.8291964377512402e-05, "loss": 1.7603, "step": 422800 }, { "epoch": 0.08542234979129193, "grad_norm": 8.853109359741211, "learning_rate": 1.82915603936661e-05, "loss": 1.8306, "step": 422900 }, { "epoch": 0.08544254897544688, "grad_norm": 8.92155647277832, "learning_rate": 1.82911564098198e-05, "loss": 1.6926, "step": 423000 }, { "epoch": 0.08544254897544688, "eval_calculated_loss": 9.461281776428223, "eval_loss": 2.1515486240386963, "eval_perplexity": 12852.347730889785, "eval_runtime": 118.0017, "eval_samples_per_second": 8.458, "eval_steps_per_second": 2.119, "step": 423000 }, { "epoch": 0.08546274815960184, "grad_norm": 8.60214614868164, "learning_rate": 1.8290752425973496e-05, "loss": 1.8419, "step": 423100 }, { "epoch": 0.08548294734375679, "grad_norm": 9.164385795593262, "learning_rate": 1.8290348442127195e-05, "loss": 1.7965, "step": 423200 }, { "epoch": 0.08550314652791174, "grad_norm": 3.2410502433776855, "learning_rate": 1.8289944458280894e-05, "loss": 1.7837, "step": 423300 }, { "epoch": 0.08552334571206668, "grad_norm": 9.676871299743652, "learning_rate": 1.828954047443459e-05, "loss": 1.8603, "step": 423400 }, { "epoch": 0.08554354489622164, "grad_norm": 6.3174848556518555, "learning_rate": 1.828913649058829e-05, "loss": 1.8575, "step": 423500 }, { "epoch": 0.08556374408037659, "grad_norm": 9.761197090148926, "learning_rate": 1.8288732506741985e-05, "loss": 1.9138, "step": 423600 }, { "epoch": 0.08558394326453155, "grad_norm": 6.450584888458252, "learning_rate": 1.8288328522895684e-05, "loss": 1.7795, "step": 423700 }, { "epoch": 0.0856041424486865, "grad_norm": 10.606514930725098, "learning_rate": 1.8287924539049383e-05, "loss": 1.845, "step": 423800 }, { "epoch": 0.08562434163284145, "grad_norm": 6.521526336669922, "learning_rate": 1.8287520555203082e-05, "loss": 1.7282, "step": 423900 }, { "epoch": 0.0856445408169964, "grad_norm": 4.706676959991455, "learning_rate": 1.828711657135678e-05, "loss": 1.7848, "step": 424000 }, { "epoch": 0.0856445408169964, "eval_calculated_loss": 9.247177124023438, "eval_loss": 2.1499366760253906, "eval_perplexity": 10375.236334033794, "eval_runtime": 120.5343, "eval_samples_per_second": 8.28, "eval_steps_per_second": 2.074, "step": 424000 }, { "epoch": 0.08566474000115136, "grad_norm": 2.536306619644165, "learning_rate": 1.8286712587510477e-05, "loss": 1.8405, "step": 424100 }, { "epoch": 0.0856849391853063, "grad_norm": 9.734230041503906, "learning_rate": 1.8286308603664176e-05, "loss": 1.8849, "step": 424200 }, { "epoch": 0.08570513836946125, "grad_norm": 8.468152046203613, "learning_rate": 1.8285904619817875e-05, "loss": 1.7155, "step": 424300 }, { "epoch": 0.08572533755361621, "grad_norm": 10.907402038574219, "learning_rate": 1.828550063597157e-05, "loss": 1.7401, "step": 424400 }, { "epoch": 0.08574553673777116, "grad_norm": 6.628477096557617, "learning_rate": 1.828509665212527e-05, "loss": 1.8147, "step": 424500 }, { "epoch": 0.08576573592192611, "grad_norm": 5.086821556091309, "learning_rate": 1.8284692668278966e-05, "loss": 1.8706, "step": 424600 }, { "epoch": 0.08578593510608107, "grad_norm": 7.067666530609131, "learning_rate": 1.8284288684432665e-05, "loss": 1.8452, "step": 424700 }, { "epoch": 0.08580613429023602, "grad_norm": 12.79640007019043, "learning_rate": 1.8283884700586364e-05, "loss": 1.856, "step": 424800 }, { "epoch": 0.08582633347439098, "grad_norm": 5.587308883666992, "learning_rate": 1.8283480716740063e-05, "loss": 1.8198, "step": 424900 }, { "epoch": 0.08584653265854592, "grad_norm": 5.471019268035889, "learning_rate": 1.8283076732893762e-05, "loss": 1.8294, "step": 425000 }, { "epoch": 0.08584653265854592, "eval_calculated_loss": 9.168128967285156, "eval_loss": 2.15287709236145, "eval_perplexity": 9586.67092428155, "eval_runtime": 118.9413, "eval_samples_per_second": 8.391, "eval_steps_per_second": 2.102, "step": 425000 }, { "epoch": 0.08586673184270087, "grad_norm": 8.728907585144043, "learning_rate": 1.8282672749047458e-05, "loss": 1.9171, "step": 425100 }, { "epoch": 0.08588693102685582, "grad_norm": 5.359127998352051, "learning_rate": 1.8282268765201157e-05, "loss": 1.8351, "step": 425200 }, { "epoch": 0.08590713021101078, "grad_norm": 4.222873210906982, "learning_rate": 1.8281864781354853e-05, "loss": 1.8234, "step": 425300 }, { "epoch": 0.08592732939516573, "grad_norm": 7.627676963806152, "learning_rate": 1.8281460797508552e-05, "loss": 1.8605, "step": 425400 }, { "epoch": 0.08594752857932068, "grad_norm": 6.692786693572998, "learning_rate": 1.828105681366225e-05, "loss": 1.8108, "step": 425500 }, { "epoch": 0.08596772776347564, "grad_norm": 9.308635711669922, "learning_rate": 1.8280652829815947e-05, "loss": 1.8585, "step": 425600 }, { "epoch": 0.08598792694763059, "grad_norm": 6.339076042175293, "learning_rate": 1.8280248845969646e-05, "loss": 1.771, "step": 425700 }, { "epoch": 0.08600812613178553, "grad_norm": 9.292741775512695, "learning_rate": 1.8279844862123345e-05, "loss": 1.7739, "step": 425800 }, { "epoch": 0.08602832531594048, "grad_norm": 5.4735541343688965, "learning_rate": 1.827944087827704e-05, "loss": 1.8324, "step": 425900 }, { "epoch": 0.08604852450009544, "grad_norm": 7.7512946128845215, "learning_rate": 1.8279036894430743e-05, "loss": 1.8606, "step": 426000 }, { "epoch": 0.08604852450009544, "eval_calculated_loss": 9.328418731689453, "eval_loss": 2.157205581665039, "eval_perplexity": 11253.322886279955, "eval_runtime": 124.6951, "eval_samples_per_second": 8.004, "eval_steps_per_second": 2.005, "step": 426000 }, { "epoch": 0.08606872368425039, "grad_norm": 4.879986763000488, "learning_rate": 1.827863291058444e-05, "loss": 1.8307, "step": 426100 }, { "epoch": 0.08608892286840535, "grad_norm": 7.70387077331543, "learning_rate": 1.8278228926738138e-05, "loss": 1.8381, "step": 426200 }, { "epoch": 0.0861091220525603, "grad_norm": 8.852095603942871, "learning_rate": 1.8277824942891834e-05, "loss": 1.7607, "step": 426300 }, { "epoch": 0.08612932123671525, "grad_norm": 9.934218406677246, "learning_rate": 1.8277420959045533e-05, "loss": 1.7648, "step": 426400 }, { "epoch": 0.08614952042087021, "grad_norm": 9.036935806274414, "learning_rate": 1.8277016975199232e-05, "loss": 1.8013, "step": 426500 }, { "epoch": 0.08616971960502515, "grad_norm": 9.266921043395996, "learning_rate": 1.8276612991352928e-05, "loss": 1.775, "step": 426600 }, { "epoch": 0.0861899187891801, "grad_norm": 11.959612846374512, "learning_rate": 1.8276209007506627e-05, "loss": 1.8671, "step": 426700 }, { "epoch": 0.08621011797333505, "grad_norm": 3.957697868347168, "learning_rate": 1.8275805023660326e-05, "loss": 1.8094, "step": 426800 }, { "epoch": 0.08623031715749001, "grad_norm": 9.138283729553223, "learning_rate": 1.8275401039814022e-05, "loss": 1.8212, "step": 426900 }, { "epoch": 0.08625051634164496, "grad_norm": 4.167767524719238, "learning_rate": 1.827499705596772e-05, "loss": 1.8583, "step": 427000 }, { "epoch": 0.08625051634164496, "eval_calculated_loss": 9.30738353729248, "eval_loss": 2.1598093509674072, "eval_perplexity": 11019.079367990864, "eval_runtime": 120.7324, "eval_samples_per_second": 8.266, "eval_steps_per_second": 2.071, "step": 427000 }, { "epoch": 0.08627071552579992, "grad_norm": 6.546739101409912, "learning_rate": 1.827459307212142e-05, "loss": 1.8322, "step": 427100 }, { "epoch": 0.08629091470995487, "grad_norm": 8.145934104919434, "learning_rate": 1.827418908827512e-05, "loss": 1.8225, "step": 427200 }, { "epoch": 0.08631111389410982, "grad_norm": 3.702028751373291, "learning_rate": 1.8273785104428815e-05, "loss": 1.8555, "step": 427300 }, { "epoch": 0.08633131307826476, "grad_norm": 6.30873966217041, "learning_rate": 1.8273381120582514e-05, "loss": 1.8848, "step": 427400 }, { "epoch": 0.08635151226241972, "grad_norm": 6.224644184112549, "learning_rate": 1.8272977136736213e-05, "loss": 1.7413, "step": 427500 }, { "epoch": 0.08637171144657467, "grad_norm": 11.107601165771484, "learning_rate": 1.827257315288991e-05, "loss": 1.8625, "step": 427600 }, { "epoch": 0.08639191063072962, "grad_norm": 4.994136333465576, "learning_rate": 1.8272169169043608e-05, "loss": 1.7677, "step": 427700 }, { "epoch": 0.08641210981488458, "grad_norm": 6.793715953826904, "learning_rate": 1.8271765185197304e-05, "loss": 1.8308, "step": 427800 }, { "epoch": 0.08643230899903953, "grad_norm": 10.218241691589355, "learning_rate": 1.8271361201351003e-05, "loss": 1.8048, "step": 427900 }, { "epoch": 0.08645250818319448, "grad_norm": 9.50942325592041, "learning_rate": 1.8270957217504702e-05, "loss": 1.9165, "step": 428000 }, { "epoch": 0.08645250818319448, "eval_calculated_loss": 9.34253978729248, "eval_loss": 2.1467504501342773, "eval_perplexity": 11413.35896434084, "eval_runtime": 121.7062, "eval_samples_per_second": 8.2, "eval_steps_per_second": 2.054, "step": 428000 }, { "epoch": 0.08647270736734944, "grad_norm": 6.610445499420166, "learning_rate": 1.82705532336584e-05, "loss": 1.7966, "step": 428100 }, { "epoch": 0.08649290655150438, "grad_norm": 6.609389781951904, "learning_rate": 1.82701492498121e-05, "loss": 1.8173, "step": 428200 }, { "epoch": 0.08651310573565933, "grad_norm": 10.08692455291748, "learning_rate": 1.8269745265965796e-05, "loss": 1.8384, "step": 428300 }, { "epoch": 0.08653330491981429, "grad_norm": 5.440232753753662, "learning_rate": 1.8269341282119495e-05, "loss": 1.7784, "step": 428400 }, { "epoch": 0.08655350410396924, "grad_norm": 7.416434288024902, "learning_rate": 1.826893729827319e-05, "loss": 1.8167, "step": 428500 }, { "epoch": 0.08657370328812419, "grad_norm": 6.3675537109375, "learning_rate": 1.826853331442689e-05, "loss": 1.844, "step": 428600 }, { "epoch": 0.08659390247227915, "grad_norm": 6.304159641265869, "learning_rate": 1.826812933058059e-05, "loss": 1.8466, "step": 428700 }, { "epoch": 0.0866141016564341, "grad_norm": 7.3371782302856445, "learning_rate": 1.8267725346734285e-05, "loss": 1.7166, "step": 428800 }, { "epoch": 0.08663430084058905, "grad_norm": 6.735725402832031, "learning_rate": 1.8267321362887984e-05, "loss": 1.8905, "step": 428900 }, { "epoch": 0.086654500024744, "grad_norm": 5.622129440307617, "learning_rate": 1.8266917379041683e-05, "loss": 1.85, "step": 429000 }, { "epoch": 0.086654500024744, "eval_calculated_loss": 9.373899459838867, "eval_loss": 2.159902334213257, "eval_perplexity": 11776.949406085098, "eval_runtime": 117.9465, "eval_samples_per_second": 8.461, "eval_steps_per_second": 2.12, "step": 429000 }, { "epoch": 0.08667469920889895, "grad_norm": 6.762901306152344, "learning_rate": 1.8266513395195382e-05, "loss": 1.8265, "step": 429100 }, { "epoch": 0.0866948983930539, "grad_norm": 8.692009925842285, "learning_rate": 1.826610941134908e-05, "loss": 1.8383, "step": 429200 }, { "epoch": 0.08671509757720886, "grad_norm": 5.679828643798828, "learning_rate": 1.8265705427502777e-05, "loss": 1.8476, "step": 429300 }, { "epoch": 0.08673529676136381, "grad_norm": 10.169102668762207, "learning_rate": 1.8265301443656476e-05, "loss": 1.8147, "step": 429400 }, { "epoch": 0.08675549594551876, "grad_norm": 5.833523273468018, "learning_rate": 1.8264897459810172e-05, "loss": 1.8876, "step": 429500 }, { "epoch": 0.08677569512967372, "grad_norm": 6.57623291015625, "learning_rate": 1.826449347596387e-05, "loss": 1.8491, "step": 429600 }, { "epoch": 0.08679589431382867, "grad_norm": 7.553040027618408, "learning_rate": 1.826408949211757e-05, "loss": 1.8982, "step": 429700 }, { "epoch": 0.08681609349798361, "grad_norm": 3.7053771018981934, "learning_rate": 1.8263685508271266e-05, "loss": 1.7308, "step": 429800 }, { "epoch": 0.08683629268213856, "grad_norm": 5.743793487548828, "learning_rate": 1.8263281524424965e-05, "loss": 1.8752, "step": 429900 }, { "epoch": 0.08685649186629352, "grad_norm": 9.06568717956543, "learning_rate": 1.8262877540578664e-05, "loss": 1.8599, "step": 430000 }, { "epoch": 0.08685649186629352, "eval_calculated_loss": 9.405138969421387, "eval_loss": 2.1570987701416016, "eval_perplexity": 12150.662443988485, "eval_runtime": 119.8972, "eval_samples_per_second": 8.324, "eval_steps_per_second": 2.085, "step": 430000 } ], "logging_steps": 100, "max_steps": 4950695, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 5.4467225665634304e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }