| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 1.9982003599280143, |
| "eval_steps": 500, |
| "global_step": 832, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.01199760047990402, |
| "grad_norm": 8.655085086138964, |
| "learning_rate": 2.9761904761904763e-06, |
| "loss": 1.9665, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.02399520095980804, |
| "grad_norm": 4.415158437946272, |
| "learning_rate": 5.9523809523809525e-06, |
| "loss": 1.6691, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.035992801439712056, |
| "grad_norm": 3.057605663016171, |
| "learning_rate": 8.92857142857143e-06, |
| "loss": 1.441, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.04799040191961608, |
| "grad_norm": 3.042012061427229, |
| "learning_rate": 1.1904761904761905e-05, |
| "loss": 1.3171, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.059988002399520096, |
| "grad_norm": 2.1436915691999014, |
| "learning_rate": 1.4880952380952381e-05, |
| "loss": 1.2588, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.07198560287942411, |
| "grad_norm": 2.03272405774148, |
| "learning_rate": 1.785714285714286e-05, |
| "loss": 1.2021, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.08398320335932813, |
| "grad_norm": 1.9317364452662562, |
| "learning_rate": 2.0833333333333336e-05, |
| "loss": 1.1645, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.09598080383923216, |
| "grad_norm": 1.9450167856079439, |
| "learning_rate": 2.380952380952381e-05, |
| "loss": 1.1652, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.10797840431913618, |
| "grad_norm": 2.2175764125145943, |
| "learning_rate": 2.6785714285714288e-05, |
| "loss": 1.1492, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.11997600479904019, |
| "grad_norm": 1.9900888357541513, |
| "learning_rate": 2.9761904761904762e-05, |
| "loss": 1.146, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.13197360527894422, |
| "grad_norm": 2.1707343241074715, |
| "learning_rate": 3.273809523809524e-05, |
| "loss": 1.1312, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.14397120575884823, |
| "grad_norm": 2.1008421926969874, |
| "learning_rate": 3.571428571428572e-05, |
| "loss": 1.1158, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.15596880623875226, |
| "grad_norm": 1.9739168298796461, |
| "learning_rate": 3.8690476190476195e-05, |
| "loss": 1.112, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.16796640671865626, |
| "grad_norm": 1.8793192108758627, |
| "learning_rate": 4.166666666666667e-05, |
| "loss": 1.1216, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.1799640071985603, |
| "grad_norm": 2.1680515905272393, |
| "learning_rate": 4.464285714285715e-05, |
| "loss": 1.1154, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.19196160767846432, |
| "grad_norm": 2.088302206757574, |
| "learning_rate": 4.761904761904762e-05, |
| "loss": 1.115, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.20395920815836832, |
| "grad_norm": 2.0040693696661096, |
| "learning_rate": 4.9999779501355384e-05, |
| "loss": 1.1166, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.21595680863827235, |
| "grad_norm": 1.8604233679285211, |
| "learning_rate": 4.9992062457191e-05, |
| "loss": 1.126, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.22795440911817635, |
| "grad_norm": 1.8293195388484245, |
| "learning_rate": 4.997332437005931e-05, |
| "loss": 1.1183, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.23995200959808038, |
| "grad_norm": 1.9227439225768022, |
| "learning_rate": 4.99435735031144e-05, |
| "loss": 1.1341, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.2519496100779844, |
| "grad_norm": 1.7353567528325675, |
| "learning_rate": 4.990282297594509e-05, |
| "loss": 1.1111, |
| "step": 105 |
| }, |
| { |
| "epoch": 0.26394721055788845, |
| "grad_norm": 1.6246909472281814, |
| "learning_rate": 4.98510907587894e-05, |
| "loss": 1.1085, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.27594481103779245, |
| "grad_norm": 1.5550505481063543, |
| "learning_rate": 4.9788399664609985e-05, |
| "loss": 1.1081, |
| "step": 115 |
| }, |
| { |
| "epoch": 0.28794241151769645, |
| "grad_norm": 1.6242317436366518, |
| "learning_rate": 4.97147773390341e-05, |
| "loss": 1.1187, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.29994001199760045, |
| "grad_norm": 1.6719313503244617, |
| "learning_rate": 4.963025624816232e-05, |
| "loss": 1.0934, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.3119376124775045, |
| "grad_norm": 1.7557488929934109, |
| "learning_rate": 4.953487366425163e-05, |
| "loss": 1.0939, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.3239352129574085, |
| "grad_norm": 1.846886767486188, |
| "learning_rate": 4.942867164927899e-05, |
| "loss": 1.1016, |
| "step": 135 |
| }, |
| { |
| "epoch": 0.3359328134373125, |
| "grad_norm": 1.564323242123012, |
| "learning_rate": 4.931169703639282e-05, |
| "loss": 1.0755, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.3479304139172166, |
| "grad_norm": 1.5659567424784107, |
| "learning_rate": 4.918400140926042e-05, |
| "loss": 1.0772, |
| "step": 145 |
| }, |
| { |
| "epoch": 0.3599280143971206, |
| "grad_norm": 1.6772397525925744, |
| "learning_rate": 4.9045641079320484e-05, |
| "loss": 1.0975, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.3719256148770246, |
| "grad_norm": 1.4057626611756922, |
| "learning_rate": 4.889667706095084e-05, |
| "loss": 1.0894, |
| "step": 155 |
| }, |
| { |
| "epoch": 0.38392321535692864, |
| "grad_norm": 1.3508523989809198, |
| "learning_rate": 4.873717504456219e-05, |
| "loss": 1.0899, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.39592081583683264, |
| "grad_norm": 1.4599727770410549, |
| "learning_rate": 4.8567205367629835e-05, |
| "loss": 1.0954, |
| "step": 165 |
| }, |
| { |
| "epoch": 0.40791841631673664, |
| "grad_norm": 1.557943385459695, |
| "learning_rate": 4.8386842983676164e-05, |
| "loss": 1.1022, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.41991601679664065, |
| "grad_norm": 1.4392521405523333, |
| "learning_rate": 4.8196167429217474e-05, |
| "loss": 1.0858, |
| "step": 175 |
| }, |
| { |
| "epoch": 0.4319136172765447, |
| "grad_norm": 1.4039232690942924, |
| "learning_rate": 4.799526278868987e-05, |
| "loss": 1.0776, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.4439112177564487, |
| "grad_norm": 1.3958637024927205, |
| "learning_rate": 4.778421765736951e-05, |
| "loss": 1.0656, |
| "step": 185 |
| }, |
| { |
| "epoch": 0.4559088182363527, |
| "grad_norm": 1.3165094843488907, |
| "learning_rate": 4.7563125102303766e-05, |
| "loss": 1.0701, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.46790641871625677, |
| "grad_norm": 1.4462822477245483, |
| "learning_rate": 4.7332082621270326e-05, |
| "loss": 1.078, |
| "step": 195 |
| }, |
| { |
| "epoch": 0.47990401919616077, |
| "grad_norm": 1.4177688615580946, |
| "learning_rate": 4.709119209978242e-05, |
| "loss": 1.0748, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.49190161967606477, |
| "grad_norm": 1.2487244087937184, |
| "learning_rate": 4.684055976615924e-05, |
| "loss": 1.0728, |
| "step": 205 |
| }, |
| { |
| "epoch": 0.5038992201559688, |
| "grad_norm": 1.2843589104522166, |
| "learning_rate": 4.6580296144681157e-05, |
| "loss": 1.0741, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.5158968206358728, |
| "grad_norm": 1.3423526627782842, |
| "learning_rate": 4.631051600685051e-05, |
| "loss": 1.0545, |
| "step": 215 |
| }, |
| { |
| "epoch": 0.5278944211157769, |
| "grad_norm": 1.3987872492900897, |
| "learning_rate": 4.6031338320779534e-05, |
| "loss": 1.0466, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.5398920215956808, |
| "grad_norm": 1.1228691632454497, |
| "learning_rate": 4.57428861987275e-05, |
| "loss": 1.0757, |
| "step": 225 |
| }, |
| { |
| "epoch": 0.5518896220755849, |
| "grad_norm": 1.4386106052561902, |
| "learning_rate": 4.544528684281056e-05, |
| "loss": 1.0659, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.563887222555489, |
| "grad_norm": 1.54532036456569, |
| "learning_rate": 4.513867148890788e-05, |
| "loss": 1.0731, |
| "step": 235 |
| }, |
| { |
| "epoch": 0.5758848230353929, |
| "grad_norm": 1.1946309045723265, |
| "learning_rate": 4.482317534878901e-05, |
| "loss": 1.0724, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.587882423515297, |
| "grad_norm": 1.2995921290959758, |
| "learning_rate": 4.449893755048799e-05, |
| "loss": 1.0538, |
| "step": 245 |
| }, |
| { |
| "epoch": 0.5998800239952009, |
| "grad_norm": 1.1629232540517822, |
| "learning_rate": 4.416610107695042e-05, |
| "loss": 1.0653, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.611877624475105, |
| "grad_norm": 1.141699716294497, |
| "learning_rate": 4.3824812702980595e-05, |
| "loss": 1.0593, |
| "step": 255 |
| }, |
| { |
| "epoch": 0.623875224955009, |
| "grad_norm": 1.1625584379097018, |
| "learning_rate": 4.347522293051648e-05, |
| "loss": 1.0728, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.635872825434913, |
| "grad_norm": 1.2139933398328908, |
| "learning_rate": 4.3117485922261136e-05, |
| "loss": 1.0683, |
| "step": 265 |
| }, |
| { |
| "epoch": 0.647870425914817, |
| "grad_norm": 1.1676365588651592, |
| "learning_rate": 4.275175943369975e-05, |
| "loss": 1.0493, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.6598680263947211, |
| "grad_norm": 1.2056917804998961, |
| "learning_rate": 4.2378204743532377e-05, |
| "loss": 1.0779, |
| "step": 275 |
| }, |
| { |
| "epoch": 0.671865626874625, |
| "grad_norm": 1.356550764377353, |
| "learning_rate": 4.199698658255298e-05, |
| "loss": 1.0507, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.6838632273545291, |
| "grad_norm": 1.1721573272728156, |
| "learning_rate": 4.160827306100611e-05, |
| "loss": 1.0541, |
| "step": 285 |
| }, |
| { |
| "epoch": 0.6958608278344331, |
| "grad_norm": 1.1802890937435413, |
| "learning_rate": 4.121223559445343e-05, |
| "loss": 1.0507, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.7078584283143371, |
| "grad_norm": 1.3591231013602916, |
| "learning_rate": 4.0809048828182534e-05, |
| "loss": 1.0529, |
| "step": 295 |
| }, |
| { |
| "epoch": 0.7198560287942412, |
| "grad_norm": 1.230075489545995, |
| "learning_rate": 4.039889056019159e-05, |
| "loss": 1.0611, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.7318536292741452, |
| "grad_norm": 1.3962403495748514, |
| "learning_rate": 3.9981941662783674e-05, |
| "loss": 1.0477, |
| "step": 305 |
| }, |
| { |
| "epoch": 0.7438512297540492, |
| "grad_norm": 1.2104058914840572, |
| "learning_rate": 3.955838600280535e-05, |
| "loss": 1.0425, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.7558488302339532, |
| "grad_norm": 1.243572445349006, |
| "learning_rate": 3.91284103605648e-05, |
| "loss": 1.053, |
| "step": 315 |
| }, |
| { |
| "epoch": 0.7678464307138573, |
| "grad_norm": 1.2315818992929564, |
| "learning_rate": 3.869220434746509e-05, |
| "loss": 1.0469, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.7798440311937612, |
| "grad_norm": 1.282062911918751, |
| "learning_rate": 3.8249960322389e-05, |
| "loss": 1.0507, |
| "step": 325 |
| }, |
| { |
| "epoch": 0.7918416316736653, |
| "grad_norm": 1.2926792700193703, |
| "learning_rate": 3.780187330687231e-05, |
| "loss": 1.0286, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.8038392321535693, |
| "grad_norm": 1.1789915377278588, |
| "learning_rate": 3.734814089910283e-05, |
| "loss": 1.0275, |
| "step": 335 |
| }, |
| { |
| "epoch": 0.8158368326334733, |
| "grad_norm": 1.2214772745932698, |
| "learning_rate": 3.6888963186783224e-05, |
| "loss": 1.0152, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.8278344331133773, |
| "grad_norm": 1.0572719303807887, |
| "learning_rate": 3.6424542658895944e-05, |
| "loss": 1.0329, |
| "step": 345 |
| }, |
| { |
| "epoch": 0.8398320335932813, |
| "grad_norm": 1.057795278450336, |
| "learning_rate": 3.5955084116409385e-05, |
| "loss": 1.0358, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.8518296340731853, |
| "grad_norm": 1.1933118891898369, |
| "learning_rate": 3.5480794581964304e-05, |
| "loss": 1.0257, |
| "step": 355 |
| }, |
| { |
| "epoch": 0.8638272345530894, |
| "grad_norm": 1.1930511530685082, |
| "learning_rate": 3.5001883208580665e-05, |
| "loss": 1.0462, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.8758248350329934, |
| "grad_norm": 1.206184339739026, |
| "learning_rate": 3.451856118742498e-05, |
| "loss": 1.0457, |
| "step": 365 |
| }, |
| { |
| "epoch": 0.8878224355128974, |
| "grad_norm": 1.2488722797945375, |
| "learning_rate": 3.403104165467883e-05, |
| "loss": 1.0273, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.8998200359928015, |
| "grad_norm": 1.1879602242889826, |
| "learning_rate": 3.353953959754973e-05, |
| "loss": 1.0063, |
| "step": 375 |
| }, |
| { |
| "epoch": 0.9118176364727054, |
| "grad_norm": 1.1859307890368762, |
| "learning_rate": 3.30442717594657e-05, |
| "loss": 1.0155, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.9238152369526095, |
| "grad_norm": 1.1938213692010637, |
| "learning_rate": 3.2545456544495365e-05, |
| "loss": 1.025, |
| "step": 385 |
| }, |
| { |
| "epoch": 0.9358128374325135, |
| "grad_norm": 1.1843748330164985, |
| "learning_rate": 3.2043313921035743e-05, |
| "loss": 1.0328, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.9478104379124175, |
| "grad_norm": 1.117527742694121, |
| "learning_rate": 3.1538065324810206e-05, |
| "loss": 1.0248, |
| "step": 395 |
| }, |
| { |
| "epoch": 0.9598080383923215, |
| "grad_norm": 1.1115869963103877, |
| "learning_rate": 3.1029933561219375e-05, |
| "loss": 1.0027, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.9718056388722256, |
| "grad_norm": 1.0498821054649277, |
| "learning_rate": 3.0519142707088026e-05, |
| "loss": 1.0116, |
| "step": 405 |
| }, |
| { |
| "epoch": 0.9838032393521295, |
| "grad_norm": 1.118470935254374, |
| "learning_rate": 3.000591801185124e-05, |
| "loss": 1.0147, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.9958008398320336, |
| "grad_norm": 1.1283471897635804, |
| "learning_rate": 2.9490485798223623e-05, |
| "loss": 1.0312, |
| "step": 415 |
| }, |
| { |
| "epoch": 1.0095980803839233, |
| "grad_norm": 1.0933704505218012, |
| "learning_rate": 2.8973073362394998e-05, |
| "loss": 1.0189, |
| "step": 420 |
| }, |
| { |
| "epoch": 1.0215956808638271, |
| "grad_norm": 1.2757088787270183, |
| "learning_rate": 2.8453908873797058e-05, |
| "loss": 0.8115, |
| "step": 425 |
| }, |
| { |
| "epoch": 1.0335932813437312, |
| "grad_norm": 1.2014338555644504, |
| "learning_rate": 2.7933221274484723e-05, |
| "loss": 0.8041, |
| "step": 430 |
| }, |
| { |
| "epoch": 1.0455908818236352, |
| "grad_norm": 1.136417187679828, |
| "learning_rate": 2.7411240178176927e-05, |
| "loss": 0.8078, |
| "step": 435 |
| }, |
| { |
| "epoch": 1.0575884823035393, |
| "grad_norm": 1.1276372766792626, |
| "learning_rate": 2.6888195769001146e-05, |
| "loss": 0.8179, |
| "step": 440 |
| }, |
| { |
| "epoch": 1.0695860827834434, |
| "grad_norm": 1.0072604306741504, |
| "learning_rate": 2.63643186999864e-05, |
| "loss": 0.8072, |
| "step": 445 |
| }, |
| { |
| "epoch": 1.0815836832633474, |
| "grad_norm": 1.089321595722244, |
| "learning_rate": 2.5839839991349506e-05, |
| "loss": 0.8099, |
| "step": 450 |
| }, |
| { |
| "epoch": 1.0935812837432513, |
| "grad_norm": 1.0056285329535712, |
| "learning_rate": 2.5314990928619337e-05, |
| "loss": 0.7985, |
| "step": 455 |
| }, |
| { |
| "epoch": 1.1055788842231553, |
| "grad_norm": 0.9752484724026733, |
| "learning_rate": 2.479000296064417e-05, |
| "loss": 0.8224, |
| "step": 460 |
| }, |
| { |
| "epoch": 1.1175764847030594, |
| "grad_norm": 1.0643013774775827, |
| "learning_rate": 2.4265107597526946e-05, |
| "loss": 0.8156, |
| "step": 465 |
| }, |
| { |
| "epoch": 1.1295740851829634, |
| "grad_norm": 1.1300899339981707, |
| "learning_rate": 2.374053630853358e-05, |
| "loss": 0.8113, |
| "step": 470 |
| }, |
| { |
| "epoch": 1.1415716856628675, |
| "grad_norm": 1.107379832173596, |
| "learning_rate": 2.3216520420019195e-05, |
| "loss": 0.8078, |
| "step": 475 |
| }, |
| { |
| "epoch": 1.1535692861427713, |
| "grad_norm": 1.1381056597190673, |
| "learning_rate": 2.2693291013417453e-05, |
| "loss": 0.7979, |
| "step": 480 |
| }, |
| { |
| "epoch": 1.1655668866226754, |
| "grad_norm": 1.0344901370146822, |
| "learning_rate": 2.2171078823337863e-05, |
| "loss": 0.8114, |
| "step": 485 |
| }, |
| { |
| "epoch": 1.1775644871025794, |
| "grad_norm": 1.1433623054658555, |
| "learning_rate": 2.165011413581605e-05, |
| "loss": 0.8068, |
| "step": 490 |
| }, |
| { |
| "epoch": 1.1895620875824835, |
| "grad_norm": 1.0574705191297809, |
| "learning_rate": 2.1130626686761762e-05, |
| "loss": 0.7931, |
| "step": 495 |
| }, |
| { |
| "epoch": 1.2015596880623876, |
| "grad_norm": 1.0955960697193892, |
| "learning_rate": 2.0612845560649603e-05, |
| "loss": 0.7961, |
| "step": 500 |
| }, |
| { |
| "epoch": 1.2135572885422916, |
| "grad_norm": 1.1142604155745692, |
| "learning_rate": 2.0096999089496913e-05, |
| "loss": 0.7949, |
| "step": 505 |
| }, |
| { |
| "epoch": 1.2255548890221957, |
| "grad_norm": 1.081207113394286, |
| "learning_rate": 1.958331475217357e-05, |
| "loss": 0.8081, |
| "step": 510 |
| }, |
| { |
| "epoch": 1.2375524895020995, |
| "grad_norm": 1.087424227244276, |
| "learning_rate": 1.9072019074087876e-05, |
| "loss": 0.7944, |
| "step": 515 |
| }, |
| { |
| "epoch": 1.2495500899820036, |
| "grad_norm": 1.1746075874674693, |
| "learning_rate": 1.856333752729311e-05, |
| "loss": 0.7982, |
| "step": 520 |
| }, |
| { |
| "epoch": 1.2615476904619076, |
| "grad_norm": 1.1261850868143939, |
| "learning_rate": 1.8057494431058365e-05, |
| "loss": 0.8013, |
| "step": 525 |
| }, |
| { |
| "epoch": 1.2735452909418117, |
| "grad_norm": 1.0858958449694596, |
| "learning_rate": 1.7554712852947913e-05, |
| "loss": 0.8031, |
| "step": 530 |
| }, |
| { |
| "epoch": 1.2855428914217157, |
| "grad_norm": 1.1039599626344443, |
| "learning_rate": 1.705521451045246e-05, |
| "loss": 0.7924, |
| "step": 535 |
| }, |
| { |
| "epoch": 1.2975404919016196, |
| "grad_norm": 1.0379577155650672, |
| "learning_rate": 1.6559219673215784e-05, |
| "loss": 0.784, |
| "step": 540 |
| }, |
| { |
| "epoch": 1.3095380923815236, |
| "grad_norm": 1.0180533259182083, |
| "learning_rate": 1.6066947065899847e-05, |
| "loss": 0.795, |
| "step": 545 |
| }, |
| { |
| "epoch": 1.3215356928614277, |
| "grad_norm": 1.1174277968751472, |
| "learning_rate": 1.5578613771731213e-05, |
| "loss": 0.7903, |
| "step": 550 |
| }, |
| { |
| "epoch": 1.3335332933413317, |
| "grad_norm": 1.113625762435688, |
| "learning_rate": 1.509443513677134e-05, |
| "loss": 0.7841, |
| "step": 555 |
| }, |
| { |
| "epoch": 1.3455308938212358, |
| "grad_norm": 1.046198809224864, |
| "learning_rate": 1.4614624674952842e-05, |
| "loss": 0.7932, |
| "step": 560 |
| }, |
| { |
| "epoch": 1.3575284943011399, |
| "grad_norm": 1.0511943871992961, |
| "learning_rate": 1.4139393973923798e-05, |
| "loss": 0.7815, |
| "step": 565 |
| }, |
| { |
| "epoch": 1.369526094781044, |
| "grad_norm": 1.012108314446409, |
| "learning_rate": 1.3668952601741441e-05, |
| "loss": 0.7899, |
| "step": 570 |
| }, |
| { |
| "epoch": 1.3815236952609478, |
| "grad_norm": 1.0590168334777237, |
| "learning_rate": 1.320350801445649e-05, |
| "loss": 0.7745, |
| "step": 575 |
| }, |
| { |
| "epoch": 1.3935212957408518, |
| "grad_norm": 1.0208126844360186, |
| "learning_rate": 1.2743265464628786e-05, |
| "loss": 0.7761, |
| "step": 580 |
| }, |
| { |
| "epoch": 1.4055188962207559, |
| "grad_norm": 1.1048330533675996, |
| "learning_rate": 1.2288427910814699e-05, |
| "loss": 0.7549, |
| "step": 585 |
| }, |
| { |
| "epoch": 1.41751649670066, |
| "grad_norm": 1.087557213168186, |
| "learning_rate": 1.1839195928066102e-05, |
| "loss": 0.7887, |
| "step": 590 |
| }, |
| { |
| "epoch": 1.4295140971805638, |
| "grad_norm": 1.066790026980483, |
| "learning_rate": 1.1395767619480451e-05, |
| "loss": 0.7891, |
| "step": 595 |
| }, |
| { |
| "epoch": 1.4415116976604678, |
| "grad_norm": 1.0511422556482182, |
| "learning_rate": 1.0958338528840893e-05, |
| "loss": 0.7828, |
| "step": 600 |
| }, |
| { |
| "epoch": 1.4535092981403719, |
| "grad_norm": 1.0811572565881435, |
| "learning_rate": 1.052710155438506e-05, |
| "loss": 0.7985, |
| "step": 605 |
| }, |
| { |
| "epoch": 1.465506898620276, |
| "grad_norm": 0.9838694729298545, |
| "learning_rate": 1.0102246863740496e-05, |
| "loss": 0.7712, |
| "step": 610 |
| }, |
| { |
| "epoch": 1.47750449910018, |
| "grad_norm": 1.039397195061213, |
| "learning_rate": 9.683961810064176e-06, |
| "loss": 0.7813, |
| "step": 615 |
| }, |
| { |
| "epoch": 1.489502099580084, |
| "grad_norm": 1.0652079719056124, |
| "learning_rate": 9.272430849423174e-06, |
| "loss": 0.774, |
| "step": 620 |
| }, |
| { |
| "epoch": 1.5014997000599881, |
| "grad_norm": 1.107963886239366, |
| "learning_rate": 8.867835459452925e-06, |
| "loss": 0.7756, |
| "step": 625 |
| }, |
| { |
| "epoch": 1.5134973005398922, |
| "grad_norm": 1.0470978361528913, |
| "learning_rate": 8.470354059328919e-06, |
| "loss": 0.7683, |
| "step": 630 |
| }, |
| { |
| "epoch": 1.525494901019796, |
| "grad_norm": 1.2007634054581608, |
| "learning_rate": 8.080161931087094e-06, |
| "loss": 0.7827, |
| "step": 635 |
| }, |
| { |
| "epoch": 1.5374925014997, |
| "grad_norm": 1.1547346563119187, |
| "learning_rate": 7.697431142327632e-06, |
| "loss": 0.7787, |
| "step": 640 |
| }, |
| { |
| "epoch": 1.5494901019796041, |
| "grad_norm": 1.17217513079497, |
| "learning_rate": 7.3223304703363135e-06, |
| "loss": 0.7858, |
| "step": 645 |
| }, |
| { |
| "epoch": 1.561487702459508, |
| "grad_norm": 1.0566429470152394, |
| "learning_rate": 6.955025327656839e-06, |
| "loss": 0.766, |
| "step": 650 |
| }, |
| { |
| "epoch": 1.573485302939412, |
| "grad_norm": 1.022660107263399, |
| "learning_rate": 6.5956776891468925e-06, |
| "loss": 0.7694, |
| "step": 655 |
| }, |
| { |
| "epoch": 1.585482903419316, |
| "grad_norm": 1.0475499247182583, |
| "learning_rate": 6.244446020550182e-06, |
| "loss": 0.7627, |
| "step": 660 |
| }, |
| { |
| "epoch": 1.5974805038992201, |
| "grad_norm": 1.0898061259359026, |
| "learning_rate": 5.901485208615948e-06, |
| "loss": 0.7672, |
| "step": 665 |
| }, |
| { |
| "epoch": 1.6094781043791242, |
| "grad_norm": 1.0594055484172262, |
| "learning_rate": 5.5669464927967655e-06, |
| "loss": 0.7783, |
| "step": 670 |
| }, |
| { |
| "epoch": 1.6214757048590283, |
| "grad_norm": 1.000104590500381, |
| "learning_rate": 5.240977398554673e-06, |
| "loss": 0.7578, |
| "step": 675 |
| }, |
| { |
| "epoch": 1.6334733053389323, |
| "grad_norm": 1.010755641973307, |
| "learning_rate": 4.9237216723051485e-06, |
| "loss": 0.7681, |
| "step": 680 |
| }, |
| { |
| "epoch": 1.6454709058188364, |
| "grad_norm": 1.0438538534480322, |
| "learning_rate": 4.615319218027561e-06, |
| "loss": 0.773, |
| "step": 685 |
| }, |
| { |
| "epoch": 1.6574685062987402, |
| "grad_norm": 1.00200099090404, |
| "learning_rate": 4.315906035570094e-06, |
| "loss": 0.7541, |
| "step": 690 |
| }, |
| { |
| "epoch": 1.6694661067786443, |
| "grad_norm": 0.9967511007903198, |
| "learning_rate": 4.0256141606762836e-06, |
| "loss": 0.7744, |
| "step": 695 |
| }, |
| { |
| "epoch": 1.6814637072585483, |
| "grad_norm": 0.954046235240287, |
| "learning_rate": 3.7445716067596503e-06, |
| "loss": 0.7472, |
| "step": 700 |
| }, |
| { |
| "epoch": 1.6934613077384522, |
| "grad_norm": 1.013820429314171, |
| "learning_rate": 3.4729023084521417e-06, |
| "loss": 0.7499, |
| "step": 705 |
| }, |
| { |
| "epoch": 1.7054589082183562, |
| "grad_norm": 1.0142864719521136, |
| "learning_rate": 3.2107260669512336e-06, |
| "loss": 0.7539, |
| "step": 710 |
| }, |
| { |
| "epoch": 1.7174565086982603, |
| "grad_norm": 1.036622595232617, |
| "learning_rate": 2.9581584971897697e-06, |
| "loss": 0.7731, |
| "step": 715 |
| }, |
| { |
| "epoch": 1.7294541091781643, |
| "grad_norm": 1.02617492688862, |
| "learning_rate": 2.7153109768518925e-06, |
| "loss": 0.7495, |
| "step": 720 |
| }, |
| { |
| "epoch": 1.7414517096580684, |
| "grad_norm": 0.9783666612285059, |
| "learning_rate": 2.4822905972575167e-06, |
| "loss": 0.7649, |
| "step": 725 |
| }, |
| { |
| "epoch": 1.7534493101379725, |
| "grad_norm": 0.9915953729109124, |
| "learning_rate": 2.2592001161370392e-06, |
| "loss": 0.7662, |
| "step": 730 |
| }, |
| { |
| "epoch": 1.7654469106178765, |
| "grad_norm": 0.9904152103445725, |
| "learning_rate": 2.0461379123170284e-06, |
| "loss": 0.7681, |
| "step": 735 |
| }, |
| { |
| "epoch": 1.7774445110977806, |
| "grad_norm": 1.0176103785435524, |
| "learning_rate": 1.8431979423369604e-06, |
| "loss": 0.7606, |
| "step": 740 |
| }, |
| { |
| "epoch": 1.7894421115776846, |
| "grad_norm": 1.0288460382306641, |
| "learning_rate": 1.650469699016116e-06, |
| "loss": 0.7704, |
| "step": 745 |
| }, |
| { |
| "epoch": 1.8014397120575885, |
| "grad_norm": 0.9904963634021251, |
| "learning_rate": 1.4680381719888807e-06, |
| "loss": 0.7502, |
| "step": 750 |
| }, |
| { |
| "epoch": 1.8134373125374925, |
| "grad_norm": 1.0256818925204132, |
| "learning_rate": 1.2959838102258536e-06, |
| "loss": 0.7713, |
| "step": 755 |
| }, |
| { |
| "epoch": 1.8254349130173964, |
| "grad_norm": 0.9849452100623455, |
| "learning_rate": 1.134382486557342e-06, |
| "loss": 0.758, |
| "step": 760 |
| }, |
| { |
| "epoch": 1.8374325134973004, |
| "grad_norm": 0.9397595503622964, |
| "learning_rate": 9.833054642148066e-07, |
| "loss": 0.7521, |
| "step": 765 |
| }, |
| { |
| "epoch": 1.8494301139772045, |
| "grad_norm": 1.0890895891545742, |
| "learning_rate": 8.428193654051036e-07, |
| "loss": 0.7509, |
| "step": 770 |
| }, |
| { |
| "epoch": 1.8614277144571085, |
| "grad_norm": 0.9884112088685092, |
| "learning_rate": 7.129861419312822e-07, |
| "loss": 0.7655, |
| "step": 775 |
| }, |
| { |
| "epoch": 1.8734253149370126, |
| "grad_norm": 0.9719952905492264, |
| "learning_rate": 5.938630478729917e-07, |
| "loss": 0.7498, |
| "step": 780 |
| }, |
| { |
| "epoch": 1.8854229154169166, |
| "grad_norm": 1.0012032904437846, |
| "learning_rate": 4.855026143384733e-07, |
| "loss": 0.7536, |
| "step": 785 |
| }, |
| { |
| "epoch": 1.8974205158968207, |
| "grad_norm": 0.9926612758167743, |
| "learning_rate": 3.8795262629928996e-07, |
| "loss": 0.7668, |
| "step": 790 |
| }, |
| { |
| "epoch": 1.9094181163767248, |
| "grad_norm": 0.9848391831057468, |
| "learning_rate": 3.0125610151804374e-07, |
| "loss": 0.7638, |
| "step": 795 |
| }, |
| { |
| "epoch": 1.9214157168566288, |
| "grad_norm": 1.0112841443251817, |
| "learning_rate": 2.2545127157831413e-07, |
| "loss": 0.755, |
| "step": 800 |
| }, |
| { |
| "epoch": 1.9334133173365327, |
| "grad_norm": 0.9951541506451825, |
| "learning_rate": 1.605715650252415e-07, |
| "loss": 0.7509, |
| "step": 805 |
| }, |
| { |
| "epoch": 1.9454109178164367, |
| "grad_norm": 1.0148344278049142, |
| "learning_rate": 1.0664559262413831e-07, |
| "loss": 0.7576, |
| "step": 810 |
| }, |
| { |
| "epoch": 1.9574085182963408, |
| "grad_norm": 1.0628673000773126, |
| "learning_rate": 6.369713474366212e-08, |
| "loss": 0.7668, |
| "step": 815 |
| }, |
| { |
| "epoch": 1.9694061187762446, |
| "grad_norm": 0.9218706861308771, |
| "learning_rate": 3.1745130869123566e-08, |
| "loss": 0.7593, |
| "step": 820 |
| }, |
| { |
| "epoch": 1.9814037192561487, |
| "grad_norm": 1.0123478252373648, |
| "learning_rate": 1.08036712505033e-08, |
| "loss": 0.7563, |
| "step": 825 |
| }, |
| { |
| "epoch": 1.9934013197360527, |
| "grad_norm": 1.0032109470714232, |
| "learning_rate": 8.819906889168117e-10, |
| "loss": 0.7589, |
| "step": 830 |
| }, |
| { |
| "epoch": 1.9982003599280143, |
| "step": 832, |
| "total_flos": 619672101191680.0, |
| "train_loss": 0.9412782498850272, |
| "train_runtime": 43950.8497, |
| "train_samples_per_second": 1.214, |
| "train_steps_per_second": 0.019 |
| } |
| ], |
| "logging_steps": 5, |
| "max_steps": 832, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 2, |
| "save_steps": 300, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 619672101191680.0, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|