| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.9998704383502484, |
| "eval_steps": 500, |
| "global_step": 2894, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.006909954653422587, |
| "grad_norm": 2.7603113651275635, |
| "learning_rate": 8.673533304426543e-06, |
| "loss": 3.6901, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.013819909306845174, |
| "grad_norm": 2.2209839820861816, |
| "learning_rate": 1.0680399942186417e-05, |
| "loss": 3.4962, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.020729863960267762, |
| "grad_norm": 2.200331687927246, |
| "learning_rate": 1.1854341669224292e-05, |
| "loss": 3.4824, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.02763981861369035, |
| "grad_norm": 2.167349100112915, |
| "learning_rate": 1.2687266579946291e-05, |
| "loss": 3.4759, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.034549773267112935, |
| "grad_norm": 2.0614750385284424, |
| "learning_rate": 1.3333333333333337e-05, |
| "loss": 3.4502, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.041459727920535525, |
| "grad_norm": 2.170149087905884, |
| "learning_rate": 1.3861208306984167e-05, |
| "loss": 3.4189, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.04836968257395811, |
| "grad_norm": 2.2909820079803467, |
| "learning_rate": 1.4307520237854922e-05, |
| "loss": 3.4281, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.0552796372273807, |
| "grad_norm": 2.159343719482422, |
| "learning_rate": 1.4694133217706166e-05, |
| "loss": 3.414, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.06218959188080328, |
| "grad_norm": 2.2137129306793213, |
| "learning_rate": 1.5035150034022042e-05, |
| "loss": 3.4163, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.06909954653422587, |
| "grad_norm": 1.9851800203323364, |
| "learning_rate": 1.5340199971093208e-05, |
| "loss": 3.4075, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.07600950118764846, |
| "grad_norm": 2.3020198345184326, |
| "learning_rate": 1.5616151205481378e-05, |
| "loss": 3.3834, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.08291945584107105, |
| "grad_norm": 2.0768489837646484, |
| "learning_rate": 1.586807494474404e-05, |
| "loss": 3.3947, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.08982941049449363, |
| "grad_norm": 2.0693891048431396, |
| "learning_rate": 1.6099822319805453e-05, |
| "loss": 3.4267, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.09673936514791621, |
| "grad_norm": 1.994624137878418, |
| "learning_rate": 1.6314386875614796e-05, |
| "loss": 3.402, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.1036493198013388, |
| "grad_norm": 2.0815863609313965, |
| "learning_rate": 1.6514141698131085e-05, |
| "loss": 3.4122, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.1105592744547614, |
| "grad_norm": 2.1544482707977295, |
| "learning_rate": 1.670099985546604e-05, |
| "loss": 3.4049, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.11746922910818398, |
| "grad_norm": 2.2229878902435303, |
| "learning_rate": 1.6876526113615038e-05, |
| "loss": 3.4034, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.12437918376160656, |
| "grad_norm": 2.2661125659942627, |
| "learning_rate": 1.704201667178192e-05, |
| "loss": 3.4268, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.13128913841502915, |
| "grad_norm": 2.1003775596618652, |
| "learning_rate": 1.7198557310778737e-05, |
| "loss": 3.3937, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.13819909306845174, |
| "grad_norm": 2.0601627826690674, |
| "learning_rate": 1.7347066608853085e-05, |
| "loss": 3.3605, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.14510904772187433, |
| "grad_norm": 1.9609519243240356, |
| "learning_rate": 1.748832860265267e-05, |
| "loss": 3.3977, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.15201900237529692, |
| "grad_norm": 2.147367000579834, |
| "learning_rate": 1.762301784324125e-05, |
| "loss": 3.407, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.1589289570287195, |
| "grad_norm": 2.011277675628662, |
| "learning_rate": 1.7751718877877165e-05, |
| "loss": 3.3841, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.1658389116821421, |
| "grad_norm": 2.076007127761841, |
| "learning_rate": 1.7874941582503917e-05, |
| "loss": 3.3784, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.17274886633556466, |
| "grad_norm": 2.187765121459961, |
| "learning_rate": 1.7993133362240127e-05, |
| "loss": 3.3801, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.17965882098898725, |
| "grad_norm": 2.0247693061828613, |
| "learning_rate": 1.810668895756533e-05, |
| "loss": 3.3751, |
| "step": 520 |
| }, |
| { |
| "epoch": 0.18656877564240984, |
| "grad_norm": 2.131352424621582, |
| "learning_rate": 1.8215958398819793e-05, |
| "loss": 3.3631, |
| "step": 540 |
| }, |
| { |
| "epoch": 0.19347873029583243, |
| "grad_norm": 2.178847312927246, |
| "learning_rate": 1.832125351337467e-05, |
| "loss": 3.3764, |
| "step": 560 |
| }, |
| { |
| "epoch": 0.20038868494925502, |
| "grad_norm": 2.2057948112487793, |
| "learning_rate": 1.8422853290419585e-05, |
| "loss": 3.3686, |
| "step": 580 |
| }, |
| { |
| "epoch": 0.2072986396026776, |
| "grad_norm": 2.5370497703552246, |
| "learning_rate": 1.852100833589096e-05, |
| "loss": 3.377, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.2142085942561002, |
| "grad_norm": 2.284680128097534, |
| "learning_rate": 1.861594459665503e-05, |
| "loss": 3.3723, |
| "step": 620 |
| }, |
| { |
| "epoch": 0.2211185489095228, |
| "grad_norm": 2.1865601539611816, |
| "learning_rate": 1.8707866493225918e-05, |
| "loss": 3.3216, |
| "step": 640 |
| }, |
| { |
| "epoch": 0.22802850356294538, |
| "grad_norm": 2.0960466861724854, |
| "learning_rate": 1.879695957027913e-05, |
| "loss": 3.3597, |
| "step": 660 |
| }, |
| { |
| "epoch": 0.23493845821636797, |
| "grad_norm": 2.184783935546875, |
| "learning_rate": 1.888339275137491e-05, |
| "loss": 3.3432, |
| "step": 680 |
| }, |
| { |
| "epoch": 0.24184841286979053, |
| "grad_norm": 2.191296100616455, |
| "learning_rate": 1.8967320266761712e-05, |
| "loss": 3.3668, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.24875836752321312, |
| "grad_norm": 2.0871424674987793, |
| "learning_rate": 1.9048883309541794e-05, |
| "loss": 3.3539, |
| "step": 720 |
| }, |
| { |
| "epoch": 0.2556683221766357, |
| "grad_norm": 2.1541998386383057, |
| "learning_rate": 1.9128211464873177e-05, |
| "loss": 3.3594, |
| "step": 740 |
| }, |
| { |
| "epoch": 0.2625782768300583, |
| "grad_norm": 2.0716910362243652, |
| "learning_rate": 1.920542394853861e-05, |
| "loss": 3.3173, |
| "step": 760 |
| }, |
| { |
| "epoch": 0.2694882314834809, |
| "grad_norm": 2.270064353942871, |
| "learning_rate": 1.9280630684603204e-05, |
| "loss": 3.3289, |
| "step": 780 |
| }, |
| { |
| "epoch": 0.2763981861369035, |
| "grad_norm": 2.1515491008758545, |
| "learning_rate": 1.935393324661296e-05, |
| "loss": 3.3316, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.28330814079032607, |
| "grad_norm": 2.1054258346557617, |
| "learning_rate": 1.9425425682558113e-05, |
| "loss": 3.3063, |
| "step": 820 |
| }, |
| { |
| "epoch": 0.29021809544374866, |
| "grad_norm": 2.2717254161834717, |
| "learning_rate": 1.9495195240412547e-05, |
| "loss": 3.3388, |
| "step": 840 |
| }, |
| { |
| "epoch": 0.29712805009717125, |
| "grad_norm": 2.2322545051574707, |
| "learning_rate": 1.9563323008290453e-05, |
| "loss": 3.3187, |
| "step": 860 |
| }, |
| { |
| "epoch": 0.30403800475059384, |
| "grad_norm": 2.102282762527466, |
| "learning_rate": 1.9629884481001123e-05, |
| "loss": 3.2985, |
| "step": 880 |
| }, |
| { |
| "epoch": 0.3109479594040164, |
| "grad_norm": 2.2856414318084717, |
| "learning_rate": 1.9694950062928836e-05, |
| "loss": 3.2925, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.317857914057439, |
| "grad_norm": 2.143099308013916, |
| "learning_rate": 1.975858551563704e-05, |
| "loss": 3.3045, |
| "step": 920 |
| }, |
| { |
| "epoch": 0.3247678687108616, |
| "grad_norm": 2.1752912998199463, |
| "learning_rate": 1.982085235733133e-05, |
| "loss": 3.3158, |
| "step": 940 |
| }, |
| { |
| "epoch": 0.3316778233642842, |
| "grad_norm": 2.247835159301758, |
| "learning_rate": 1.988180822026379e-05, |
| "loss": 3.3155, |
| "step": 960 |
| }, |
| { |
| "epoch": 0.3385877780177068, |
| "grad_norm": 2.0297396183013916, |
| "learning_rate": 1.99415071712833e-05, |
| "loss": 3.3113, |
| "step": 980 |
| }, |
| { |
| "epoch": 0.3454977326711293, |
| "grad_norm": 2.0661704540252686, |
| "learning_rate": 2e-05, |
| "loss": 3.2767, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.3524076873245519, |
| "grad_norm": 2.0839781761169434, |
| "learning_rate": 1.9799366420274552e-05, |
| "loss": 3.2869, |
| "step": 1020 |
| }, |
| { |
| "epoch": 0.3593176419779745, |
| "grad_norm": 2.1040310859680176, |
| "learning_rate": 1.958817317845829e-05, |
| "loss": 3.2935, |
| "step": 1040 |
| }, |
| { |
| "epoch": 0.3662275966313971, |
| "grad_norm": 2.2290778160095215, |
| "learning_rate": 1.937697993664203e-05, |
| "loss": 3.2958, |
| "step": 1060 |
| }, |
| { |
| "epoch": 0.3731375512848197, |
| "grad_norm": 2.12402081489563, |
| "learning_rate": 1.916578669482577e-05, |
| "loss": 3.2603, |
| "step": 1080 |
| }, |
| { |
| "epoch": 0.38004750593824227, |
| "grad_norm": 2.0378010272979736, |
| "learning_rate": 1.8954593453009504e-05, |
| "loss": 3.2961, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.38695746059166486, |
| "grad_norm": 2.16814923286438, |
| "learning_rate": 1.8743400211193243e-05, |
| "loss": 3.2748, |
| "step": 1120 |
| }, |
| { |
| "epoch": 0.39386741524508745, |
| "grad_norm": 2.180172920227051, |
| "learning_rate": 1.8532206969376982e-05, |
| "loss": 3.2758, |
| "step": 1140 |
| }, |
| { |
| "epoch": 0.40077736989851004, |
| "grad_norm": 2.0336861610412598, |
| "learning_rate": 1.832101372756072e-05, |
| "loss": 3.2804, |
| "step": 1160 |
| }, |
| { |
| "epoch": 0.40768732455193263, |
| "grad_norm": 2.077043056488037, |
| "learning_rate": 1.810982048574446e-05, |
| "loss": 3.2668, |
| "step": 1180 |
| }, |
| { |
| "epoch": 0.4145972792053552, |
| "grad_norm": 2.0104854106903076, |
| "learning_rate": 1.7898627243928195e-05, |
| "loss": 3.2707, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.4215072338587778, |
| "grad_norm": 2.3837151527404785, |
| "learning_rate": 1.7687434002111933e-05, |
| "loss": 3.2416, |
| "step": 1220 |
| }, |
| { |
| "epoch": 0.4284171885122004, |
| "grad_norm": 2.058412551879883, |
| "learning_rate": 1.7476240760295672e-05, |
| "loss": 3.2403, |
| "step": 1240 |
| }, |
| { |
| "epoch": 0.435327143165623, |
| "grad_norm": 2.1999926567077637, |
| "learning_rate": 1.726504751847941e-05, |
| "loss": 3.2177, |
| "step": 1260 |
| }, |
| { |
| "epoch": 0.4422370978190456, |
| "grad_norm": 2.096730947494507, |
| "learning_rate": 1.705385427666315e-05, |
| "loss": 3.2254, |
| "step": 1280 |
| }, |
| { |
| "epoch": 0.44914705247246817, |
| "grad_norm": 2.0574615001678467, |
| "learning_rate": 1.6842661034846885e-05, |
| "loss": 3.2167, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.45605700712589076, |
| "grad_norm": 2.133613348007202, |
| "learning_rate": 1.6631467793030624e-05, |
| "loss": 3.1911, |
| "step": 1320 |
| }, |
| { |
| "epoch": 0.46296696177931335, |
| "grad_norm": 2.3095550537109375, |
| "learning_rate": 1.6420274551214363e-05, |
| "loss": 3.2139, |
| "step": 1340 |
| }, |
| { |
| "epoch": 0.46987691643273594, |
| "grad_norm": 2.173490285873413, |
| "learning_rate": 1.62090813093981e-05, |
| "loss": 3.1964, |
| "step": 1360 |
| }, |
| { |
| "epoch": 0.47678687108615847, |
| "grad_norm": 2.0556015968322754, |
| "learning_rate": 1.599788806758184e-05, |
| "loss": 3.1824, |
| "step": 1380 |
| }, |
| { |
| "epoch": 0.48369682573958106, |
| "grad_norm": 2.140432357788086, |
| "learning_rate": 1.5786694825765576e-05, |
| "loss": 3.1972, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.49060678039300365, |
| "grad_norm": 2.209411859512329, |
| "learning_rate": 1.5575501583949314e-05, |
| "loss": 3.1942, |
| "step": 1420 |
| }, |
| { |
| "epoch": 0.49751673504642624, |
| "grad_norm": 2.1304755210876465, |
| "learning_rate": 1.5364308342133053e-05, |
| "loss": 3.1723, |
| "step": 1440 |
| }, |
| { |
| "epoch": 0.5044266896998488, |
| "grad_norm": 2.2864432334899902, |
| "learning_rate": 1.515311510031679e-05, |
| "loss": 3.1777, |
| "step": 1460 |
| }, |
| { |
| "epoch": 0.5113366443532714, |
| "grad_norm": 2.17474365234375, |
| "learning_rate": 1.4941921858500529e-05, |
| "loss": 3.1849, |
| "step": 1480 |
| }, |
| { |
| "epoch": 0.518246599006694, |
| "grad_norm": 2.0795702934265137, |
| "learning_rate": 1.4730728616684266e-05, |
| "loss": 3.1643, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.5251565536601166, |
| "grad_norm": 2.2711730003356934, |
| "learning_rate": 1.4519535374868005e-05, |
| "loss": 3.1572, |
| "step": 1520 |
| }, |
| { |
| "epoch": 0.5320665083135392, |
| "grad_norm": 2.0966429710388184, |
| "learning_rate": 1.4308342133051742e-05, |
| "loss": 3.1984, |
| "step": 1540 |
| }, |
| { |
| "epoch": 0.5389764629669618, |
| "grad_norm": 2.2726895809173584, |
| "learning_rate": 1.409714889123548e-05, |
| "loss": 3.1593, |
| "step": 1560 |
| }, |
| { |
| "epoch": 0.5458864176203844, |
| "grad_norm": 2.102625608444214, |
| "learning_rate": 1.3885955649419221e-05, |
| "loss": 3.1733, |
| "step": 1580 |
| }, |
| { |
| "epoch": 0.552796372273807, |
| "grad_norm": 2.2264513969421387, |
| "learning_rate": 1.3674762407602957e-05, |
| "loss": 3.1159, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.5597063269272295, |
| "grad_norm": 2.2396631240844727, |
| "learning_rate": 1.3463569165786697e-05, |
| "loss": 3.1444, |
| "step": 1620 |
| }, |
| { |
| "epoch": 0.5666162815806521, |
| "grad_norm": 2.4287407398223877, |
| "learning_rate": 1.3252375923970432e-05, |
| "loss": 3.1341, |
| "step": 1640 |
| }, |
| { |
| "epoch": 0.5735262362340747, |
| "grad_norm": 2.253844976425171, |
| "learning_rate": 1.3041182682154171e-05, |
| "loss": 3.114, |
| "step": 1660 |
| }, |
| { |
| "epoch": 0.5804361908874973, |
| "grad_norm": 2.08655047416687, |
| "learning_rate": 1.2829989440337912e-05, |
| "loss": 3.1117, |
| "step": 1680 |
| }, |
| { |
| "epoch": 0.5873461455409199, |
| "grad_norm": 2.4364819526672363, |
| "learning_rate": 1.2618796198521647e-05, |
| "loss": 3.1262, |
| "step": 1700 |
| }, |
| { |
| "epoch": 0.5942561001943425, |
| "grad_norm": 2.2320666313171387, |
| "learning_rate": 1.2407602956705388e-05, |
| "loss": 3.0833, |
| "step": 1720 |
| }, |
| { |
| "epoch": 0.6011660548477651, |
| "grad_norm": 2.156684160232544, |
| "learning_rate": 1.2196409714889123e-05, |
| "loss": 3.1382, |
| "step": 1740 |
| }, |
| { |
| "epoch": 0.6080760095011877, |
| "grad_norm": 2.1044089794158936, |
| "learning_rate": 1.1985216473072863e-05, |
| "loss": 3.1222, |
| "step": 1760 |
| }, |
| { |
| "epoch": 0.6149859641546103, |
| "grad_norm": 2.1616947650909424, |
| "learning_rate": 1.1774023231256602e-05, |
| "loss": 3.1213, |
| "step": 1780 |
| }, |
| { |
| "epoch": 0.6218959188080329, |
| "grad_norm": 2.161734104156494, |
| "learning_rate": 1.1562829989440338e-05, |
| "loss": 3.0898, |
| "step": 1800 |
| }, |
| { |
| "epoch": 0.6288058734614554, |
| "grad_norm": 2.0919594764709473, |
| "learning_rate": 1.1351636747624078e-05, |
| "loss": 3.0756, |
| "step": 1820 |
| }, |
| { |
| "epoch": 0.635715828114878, |
| "grad_norm": 2.4278299808502197, |
| "learning_rate": 1.1140443505807813e-05, |
| "loss": 3.0654, |
| "step": 1840 |
| }, |
| { |
| "epoch": 0.6426257827683006, |
| "grad_norm": 2.1781225204467773, |
| "learning_rate": 1.0929250263991554e-05, |
| "loss": 3.0855, |
| "step": 1860 |
| }, |
| { |
| "epoch": 0.6495357374217232, |
| "grad_norm": 2.1798110008239746, |
| "learning_rate": 1.0718057022175293e-05, |
| "loss": 3.0628, |
| "step": 1880 |
| }, |
| { |
| "epoch": 0.6564456920751458, |
| "grad_norm": 2.3901069164276123, |
| "learning_rate": 1.050686378035903e-05, |
| "loss": 3.0388, |
| "step": 1900 |
| }, |
| { |
| "epoch": 0.6633556467285684, |
| "grad_norm": 2.2091197967529297, |
| "learning_rate": 1.0295670538542769e-05, |
| "loss": 3.0418, |
| "step": 1920 |
| }, |
| { |
| "epoch": 0.670265601381991, |
| "grad_norm": 2.403480052947998, |
| "learning_rate": 1.0084477296726504e-05, |
| "loss": 3.063, |
| "step": 1940 |
| }, |
| { |
| "epoch": 0.6771755560354136, |
| "grad_norm": 2.185926914215088, |
| "learning_rate": 9.873284054910244e-06, |
| "loss": 3.044, |
| "step": 1960 |
| }, |
| { |
| "epoch": 0.684085510688836, |
| "grad_norm": 2.147468328475952, |
| "learning_rate": 9.662090813093982e-06, |
| "loss": 3.0328, |
| "step": 1980 |
| }, |
| { |
| "epoch": 0.6909954653422586, |
| "grad_norm": 2.153027057647705, |
| "learning_rate": 9.45089757127772e-06, |
| "loss": 3.0437, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.6979054199956812, |
| "grad_norm": 2.172102451324463, |
| "learning_rate": 9.239704329461457e-06, |
| "loss": 3.0516, |
| "step": 2020 |
| }, |
| { |
| "epoch": 0.7048153746491038, |
| "grad_norm": 2.4078903198242188, |
| "learning_rate": 9.028511087645196e-06, |
| "loss": 3.0235, |
| "step": 2040 |
| }, |
| { |
| "epoch": 0.7117253293025264, |
| "grad_norm": 2.3446578979492188, |
| "learning_rate": 8.817317845828935e-06, |
| "loss": 3.0051, |
| "step": 2060 |
| }, |
| { |
| "epoch": 0.718635283955949, |
| "grad_norm": 2.2547926902770996, |
| "learning_rate": 8.606124604012672e-06, |
| "loss": 3.0165, |
| "step": 2080 |
| }, |
| { |
| "epoch": 0.7255452386093716, |
| "grad_norm": 2.216155767440796, |
| "learning_rate": 8.39493136219641e-06, |
| "loss": 3.0411, |
| "step": 2100 |
| }, |
| { |
| "epoch": 0.7324551932627942, |
| "grad_norm": 2.2646193504333496, |
| "learning_rate": 8.183738120380148e-06, |
| "loss": 2.9969, |
| "step": 2120 |
| }, |
| { |
| "epoch": 0.7393651479162168, |
| "grad_norm": 2.208395004272461, |
| "learning_rate": 7.972544878563887e-06, |
| "loss": 2.9966, |
| "step": 2140 |
| }, |
| { |
| "epoch": 0.7462751025696394, |
| "grad_norm": 2.2717556953430176, |
| "learning_rate": 7.761351636747625e-06, |
| "loss": 2.9973, |
| "step": 2160 |
| }, |
| { |
| "epoch": 0.753185057223062, |
| "grad_norm": 2.19114351272583, |
| "learning_rate": 7.5501583949313625e-06, |
| "loss": 2.9948, |
| "step": 2180 |
| }, |
| { |
| "epoch": 0.7600950118764845, |
| "grad_norm": 2.1782710552215576, |
| "learning_rate": 7.3389651531151e-06, |
| "loss": 3.0065, |
| "step": 2200 |
| }, |
| { |
| "epoch": 0.7670049665299071, |
| "grad_norm": 2.2360358238220215, |
| "learning_rate": 7.127771911298838e-06, |
| "loss": 2.9865, |
| "step": 2220 |
| }, |
| { |
| "epoch": 0.7739149211833297, |
| "grad_norm": 2.233429193496704, |
| "learning_rate": 6.916578669482578e-06, |
| "loss": 2.9858, |
| "step": 2240 |
| }, |
| { |
| "epoch": 0.7808248758367523, |
| "grad_norm": 2.2624831199645996, |
| "learning_rate": 6.705385427666316e-06, |
| "loss": 2.9851, |
| "step": 2260 |
| }, |
| { |
| "epoch": 0.7877348304901749, |
| "grad_norm": 2.1775155067443848, |
| "learning_rate": 6.494192185850053e-06, |
| "loss": 2.9961, |
| "step": 2280 |
| }, |
| { |
| "epoch": 0.7946447851435975, |
| "grad_norm": 2.2491140365600586, |
| "learning_rate": 6.282998944033791e-06, |
| "loss": 2.9476, |
| "step": 2300 |
| }, |
| { |
| "epoch": 0.8015547397970201, |
| "grad_norm": 2.2396609783172607, |
| "learning_rate": 6.071805702217529e-06, |
| "loss": 2.9736, |
| "step": 2320 |
| }, |
| { |
| "epoch": 0.8084646944504427, |
| "grad_norm": 2.3773739337921143, |
| "learning_rate": 5.8606124604012685e-06, |
| "loss": 2.9598, |
| "step": 2340 |
| }, |
| { |
| "epoch": 0.8153746491038653, |
| "grad_norm": 2.2326018810272217, |
| "learning_rate": 5.649419218585006e-06, |
| "loss": 2.9475, |
| "step": 2360 |
| }, |
| { |
| "epoch": 0.8222846037572878, |
| "grad_norm": 2.466160535812378, |
| "learning_rate": 5.438225976768744e-06, |
| "loss": 2.9475, |
| "step": 2380 |
| }, |
| { |
| "epoch": 0.8291945584107104, |
| "grad_norm": 2.3290674686431885, |
| "learning_rate": 5.227032734952482e-06, |
| "loss": 2.9525, |
| "step": 2400 |
| }, |
| { |
| "epoch": 0.836104513064133, |
| "grad_norm": 2.1915531158447266, |
| "learning_rate": 5.01583949313622e-06, |
| "loss": 2.935, |
| "step": 2420 |
| }, |
| { |
| "epoch": 0.8430144677175556, |
| "grad_norm": 2.295179605484009, |
| "learning_rate": 4.804646251319958e-06, |
| "loss": 2.9198, |
| "step": 2440 |
| }, |
| { |
| "epoch": 0.8499244223709782, |
| "grad_norm": 2.103804111480713, |
| "learning_rate": 4.593453009503696e-06, |
| "loss": 2.9274, |
| "step": 2460 |
| }, |
| { |
| "epoch": 0.8568343770244008, |
| "grad_norm": 2.2191033363342285, |
| "learning_rate": 4.382259767687435e-06, |
| "loss": 2.9178, |
| "step": 2480 |
| }, |
| { |
| "epoch": 0.8637443316778234, |
| "grad_norm": 2.251919746398926, |
| "learning_rate": 4.171066525871173e-06, |
| "loss": 2.9324, |
| "step": 2500 |
| }, |
| { |
| "epoch": 0.870654286331246, |
| "grad_norm": 2.2608258724212646, |
| "learning_rate": 3.959873284054911e-06, |
| "loss": 2.912, |
| "step": 2520 |
| }, |
| { |
| "epoch": 0.8775642409846686, |
| "grad_norm": 2.294982433319092, |
| "learning_rate": 3.7486800422386486e-06, |
| "loss": 2.9023, |
| "step": 2540 |
| }, |
| { |
| "epoch": 0.8844741956380912, |
| "grad_norm": 2.280447006225586, |
| "learning_rate": 3.5374868004223865e-06, |
| "loss": 2.9203, |
| "step": 2560 |
| }, |
| { |
| "epoch": 0.8913841502915137, |
| "grad_norm": 2.182582378387451, |
| "learning_rate": 3.3262935586061253e-06, |
| "loss": 2.9146, |
| "step": 2580 |
| }, |
| { |
| "epoch": 0.8982941049449363, |
| "grad_norm": 2.1298370361328125, |
| "learning_rate": 3.1151003167898632e-06, |
| "loss": 2.875, |
| "step": 2600 |
| }, |
| { |
| "epoch": 0.9052040595983589, |
| "grad_norm": 2.147123336791992, |
| "learning_rate": 2.9039070749736007e-06, |
| "loss": 2.9039, |
| "step": 2620 |
| }, |
| { |
| "epoch": 0.9121140142517815, |
| "grad_norm": 2.381716728210449, |
| "learning_rate": 2.6927138331573395e-06, |
| "loss": 2.8772, |
| "step": 2640 |
| }, |
| { |
| "epoch": 0.9190239689052041, |
| "grad_norm": 2.1160385608673096, |
| "learning_rate": 2.4815205913410774e-06, |
| "loss": 2.8996, |
| "step": 2660 |
| }, |
| { |
| "epoch": 0.9259339235586267, |
| "grad_norm": 2.1785471439361572, |
| "learning_rate": 2.2703273495248154e-06, |
| "loss": 2.8793, |
| "step": 2680 |
| }, |
| { |
| "epoch": 0.9328438782120493, |
| "grad_norm": 2.3042492866516113, |
| "learning_rate": 2.0591341077085537e-06, |
| "loss": 2.8936, |
| "step": 2700 |
| }, |
| { |
| "epoch": 0.9397538328654719, |
| "grad_norm": 2.329930543899536, |
| "learning_rate": 1.8479408658922914e-06, |
| "loss": 2.8752, |
| "step": 2720 |
| }, |
| { |
| "epoch": 0.9466637875188945, |
| "grad_norm": 2.297677993774414, |
| "learning_rate": 1.6367476240760296e-06, |
| "loss": 2.8898, |
| "step": 2740 |
| }, |
| { |
| "epoch": 0.9535737421723169, |
| "grad_norm": 2.117172956466675, |
| "learning_rate": 1.425554382259768e-06, |
| "loss": 2.9107, |
| "step": 2760 |
| }, |
| { |
| "epoch": 0.9604836968257395, |
| "grad_norm": 2.2045278549194336, |
| "learning_rate": 1.2143611404435059e-06, |
| "loss": 2.8748, |
| "step": 2780 |
| }, |
| { |
| "epoch": 0.9673936514791621, |
| "grad_norm": 2.441049098968506, |
| "learning_rate": 1.003167898627244e-06, |
| "loss": 2.876, |
| "step": 2800 |
| }, |
| { |
| "epoch": 0.9743036061325847, |
| "grad_norm": 2.242483615875244, |
| "learning_rate": 7.91974656810982e-07, |
| "loss": 2.8553, |
| "step": 2820 |
| }, |
| { |
| "epoch": 0.9812135607860073, |
| "grad_norm": 2.046325206756592, |
| "learning_rate": 5.807814149947203e-07, |
| "loss": 2.8689, |
| "step": 2840 |
| }, |
| { |
| "epoch": 0.9881235154394299, |
| "grad_norm": 2.166343927383423, |
| "learning_rate": 3.6958817317845836e-07, |
| "loss": 2.8678, |
| "step": 2860 |
| }, |
| { |
| "epoch": 0.9950334700928525, |
| "grad_norm": 2.2947232723236084, |
| "learning_rate": 1.583949313621964e-07, |
| "loss": 2.8587, |
| "step": 2880 |
| }, |
| { |
| "epoch": 0.9998704383502484, |
| "step": 2894, |
| "total_flos": 4.190944749311492e+18, |
| "train_loss": 3.1699299486077237, |
| "train_runtime": 39562.1085, |
| "train_samples_per_second": 9.364, |
| "train_steps_per_second": 0.073 |
| } |
| ], |
| "logging_steps": 20, |
| "max_steps": 2894, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 200, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 4.190944749311492e+18, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|