{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 4.989025177533893,
"eval_steps": 500,
"global_step": 1935,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0025823111684958036,
"grad_norm": 18.261793337291987,
"learning_rate": 1.0309278350515465e-07,
"loss": 2.5303,
"step": 1
},
{
"epoch": 0.005164622336991607,
"grad_norm": 17.801147060406052,
"learning_rate": 2.061855670103093e-07,
"loss": 2.5624,
"step": 2
},
{
"epoch": 0.007746933505487412,
"grad_norm": 18.16995107477998,
"learning_rate": 3.0927835051546394e-07,
"loss": 2.5315,
"step": 3
},
{
"epoch": 0.010329244673983214,
"grad_norm": 18.01568136556937,
"learning_rate": 4.123711340206186e-07,
"loss": 2.4765,
"step": 4
},
{
"epoch": 0.012911555842479019,
"grad_norm": 17.515246452891258,
"learning_rate": 5.154639175257732e-07,
"loss": 2.5302,
"step": 5
},
{
"epoch": 0.015493867010974823,
"grad_norm": 17.590449970545567,
"learning_rate": 6.185567010309279e-07,
"loss": 2.5035,
"step": 6
},
{
"epoch": 0.018076178179470628,
"grad_norm": 17.860290443777686,
"learning_rate": 7.216494845360824e-07,
"loss": 2.4856,
"step": 7
},
{
"epoch": 0.02065848934796643,
"grad_norm": 16.457931144604892,
"learning_rate": 8.247422680412372e-07,
"loss": 2.5034,
"step": 8
},
{
"epoch": 0.023240800516462233,
"grad_norm": 15.470780109900105,
"learning_rate": 9.278350515463919e-07,
"loss": 2.4852,
"step": 9
},
{
"epoch": 0.025823111684958037,
"grad_norm": 15.512665472720686,
"learning_rate": 1.0309278350515464e-06,
"loss": 2.4699,
"step": 10
},
{
"epoch": 0.028405422853453842,
"grad_norm": 11.58357921033743,
"learning_rate": 1.134020618556701e-06,
"loss": 2.4699,
"step": 11
},
{
"epoch": 0.030987734021949646,
"grad_norm": 10.860729184020308,
"learning_rate": 1.2371134020618557e-06,
"loss": 2.4071,
"step": 12
},
{
"epoch": 0.03357004519044545,
"grad_norm": 9.989819033444546,
"learning_rate": 1.3402061855670104e-06,
"loss": 2.3805,
"step": 13
},
{
"epoch": 0.036152356358941255,
"grad_norm": 4.5174463153202415,
"learning_rate": 1.4432989690721649e-06,
"loss": 2.2546,
"step": 14
},
{
"epoch": 0.03873466752743705,
"grad_norm": 4.393268427169014,
"learning_rate": 1.5463917525773197e-06,
"loss": 2.2604,
"step": 15
},
{
"epoch": 0.04131697869593286,
"grad_norm": 4.427841764828794,
"learning_rate": 1.6494845360824744e-06,
"loss": 2.2923,
"step": 16
},
{
"epoch": 0.04389928986442866,
"grad_norm": 4.116274907311869,
"learning_rate": 1.7525773195876288e-06,
"loss": 2.2216,
"step": 17
},
{
"epoch": 0.046481601032924466,
"grad_norm": 4.133640482984593,
"learning_rate": 1.8556701030927837e-06,
"loss": 2.2106,
"step": 18
},
{
"epoch": 0.04906391220142027,
"grad_norm": 4.395953110017593,
"learning_rate": 1.9587628865979384e-06,
"loss": 2.0858,
"step": 19
},
{
"epoch": 0.051646223369916075,
"grad_norm": 4.474700218464569,
"learning_rate": 2.061855670103093e-06,
"loss": 2.0397,
"step": 20
},
{
"epoch": 0.05422853453841188,
"grad_norm": 4.4282827611154545,
"learning_rate": 2.1649484536082477e-06,
"loss": 1.9898,
"step": 21
},
{
"epoch": 0.056810845706907684,
"grad_norm": 4.327539626304312,
"learning_rate": 2.268041237113402e-06,
"loss": 1.9951,
"step": 22
},
{
"epoch": 0.05939315687540349,
"grad_norm": 4.120482191834098,
"learning_rate": 2.3711340206185566e-06,
"loss": 1.9155,
"step": 23
},
{
"epoch": 0.06197546804389929,
"grad_norm": 3.8070009573204358,
"learning_rate": 2.4742268041237115e-06,
"loss": 1.8995,
"step": 24
},
{
"epoch": 0.0645577792123951,
"grad_norm": 4.212221981103606,
"learning_rate": 2.577319587628866e-06,
"loss": 1.8671,
"step": 25
},
{
"epoch": 0.0671400903808909,
"grad_norm": 4.058515787932264,
"learning_rate": 2.680412371134021e-06,
"loss": 1.624,
"step": 26
},
{
"epoch": 0.0697224015493867,
"grad_norm": 2.6837920364601398,
"learning_rate": 2.7835051546391757e-06,
"loss": 1.6006,
"step": 27
},
{
"epoch": 0.07230471271788251,
"grad_norm": 2.3134143191011924,
"learning_rate": 2.8865979381443297e-06,
"loss": 1.5962,
"step": 28
},
{
"epoch": 0.07488702388637831,
"grad_norm": 2.10252758773887,
"learning_rate": 2.9896907216494846e-06,
"loss": 1.5671,
"step": 29
},
{
"epoch": 0.0774693350548741,
"grad_norm": 1.5299035293940784,
"learning_rate": 3.0927835051546395e-06,
"loss": 1.5472,
"step": 30
},
{
"epoch": 0.08005164622336991,
"grad_norm": 1.2378467992780195,
"learning_rate": 3.195876288659794e-06,
"loss": 1.5826,
"step": 31
},
{
"epoch": 0.08263395739186571,
"grad_norm": 1.1693886031886611,
"learning_rate": 3.298969072164949e-06,
"loss": 1.4891,
"step": 32
},
{
"epoch": 0.08521626856036152,
"grad_norm": 1.1401616730246562,
"learning_rate": 3.4020618556701037e-06,
"loss": 1.4982,
"step": 33
},
{
"epoch": 0.08779857972885732,
"grad_norm": 1.0954194107096582,
"learning_rate": 3.5051546391752577e-06,
"loss": 1.5325,
"step": 34
},
{
"epoch": 0.09038089089735313,
"grad_norm": 1.0579698812847145,
"learning_rate": 3.6082474226804126e-06,
"loss": 1.4814,
"step": 35
},
{
"epoch": 0.09296320206584893,
"grad_norm": 0.9653972853589035,
"learning_rate": 3.7113402061855674e-06,
"loss": 1.4494,
"step": 36
},
{
"epoch": 0.09554551323434474,
"grad_norm": 0.9645799739429384,
"learning_rate": 3.814432989690722e-06,
"loss": 1.5063,
"step": 37
},
{
"epoch": 0.09812782440284054,
"grad_norm": 1.00902463704413,
"learning_rate": 3.917525773195877e-06,
"loss": 1.4683,
"step": 38
},
{
"epoch": 0.10071013557133635,
"grad_norm": 0.8783590838731776,
"learning_rate": 4.020618556701032e-06,
"loss": 1.4453,
"step": 39
},
{
"epoch": 0.10329244673983215,
"grad_norm": 0.8107745608541321,
"learning_rate": 4.123711340206186e-06,
"loss": 1.4753,
"step": 40
},
{
"epoch": 0.10587475790832795,
"grad_norm": 0.789499449431658,
"learning_rate": 4.2268041237113405e-06,
"loss": 1.4376,
"step": 41
},
{
"epoch": 0.10845706907682376,
"grad_norm": 0.8025862979836251,
"learning_rate": 4.329896907216495e-06,
"loss": 1.4394,
"step": 42
},
{
"epoch": 0.11103938024531956,
"grad_norm": 0.7735255000850332,
"learning_rate": 4.4329896907216494e-06,
"loss": 1.4252,
"step": 43
},
{
"epoch": 0.11362169141381537,
"grad_norm": 0.655685351955956,
"learning_rate": 4.536082474226804e-06,
"loss": 1.4297,
"step": 44
},
{
"epoch": 0.11620400258231117,
"grad_norm": 0.6434305676734384,
"learning_rate": 4.639175257731959e-06,
"loss": 1.4067,
"step": 45
},
{
"epoch": 0.11878631375080698,
"grad_norm": 0.6261090972732297,
"learning_rate": 4.742268041237113e-06,
"loss": 1.4101,
"step": 46
},
{
"epoch": 0.12136862491930278,
"grad_norm": 0.6073212568723447,
"learning_rate": 4.845360824742268e-06,
"loss": 1.4172,
"step": 47
},
{
"epoch": 0.12395093608779859,
"grad_norm": 0.5593992251138936,
"learning_rate": 4.948453608247423e-06,
"loss": 1.3798,
"step": 48
},
{
"epoch": 0.1265332472562944,
"grad_norm": 0.5922984047206602,
"learning_rate": 5.051546391752578e-06,
"loss": 1.3858,
"step": 49
},
{
"epoch": 0.1291155584247902,
"grad_norm": 0.6051805656066229,
"learning_rate": 5.154639175257732e-06,
"loss": 1.3836,
"step": 50
},
{
"epoch": 0.131697869593286,
"grad_norm": 0.5691467307958845,
"learning_rate": 5.257731958762888e-06,
"loss": 1.3841,
"step": 51
},
{
"epoch": 0.1342801807617818,
"grad_norm": 0.5693752131599139,
"learning_rate": 5.360824742268042e-06,
"loss": 1.3529,
"step": 52
},
{
"epoch": 0.1368624919302776,
"grad_norm": 0.5695922879694713,
"learning_rate": 5.463917525773196e-06,
"loss": 1.3599,
"step": 53
},
{
"epoch": 0.1394448030987734,
"grad_norm": 0.5653409792135605,
"learning_rate": 5.567010309278351e-06,
"loss": 1.359,
"step": 54
},
{
"epoch": 0.14202711426726922,
"grad_norm": 0.5181379607950953,
"learning_rate": 5.670103092783505e-06,
"loss": 1.358,
"step": 55
},
{
"epoch": 0.14460942543576502,
"grad_norm": 0.5616919575863237,
"learning_rate": 5.7731958762886594e-06,
"loss": 1.3925,
"step": 56
},
{
"epoch": 0.14719173660426083,
"grad_norm": 0.5785012034706822,
"learning_rate": 5.876288659793815e-06,
"loss": 1.3902,
"step": 57
},
{
"epoch": 0.14977404777275663,
"grad_norm": 0.5304602501774517,
"learning_rate": 5.979381443298969e-06,
"loss": 1.3516,
"step": 58
},
{
"epoch": 0.15235635894125243,
"grad_norm": 0.530893892325567,
"learning_rate": 6.082474226804124e-06,
"loss": 1.3556,
"step": 59
},
{
"epoch": 0.1549386701097482,
"grad_norm": 0.5578192633717619,
"learning_rate": 6.185567010309279e-06,
"loss": 1.3609,
"step": 60
},
{
"epoch": 0.15752098127824402,
"grad_norm": 0.5513788256965562,
"learning_rate": 6.288659793814433e-06,
"loss": 1.3494,
"step": 61
},
{
"epoch": 0.16010329244673982,
"grad_norm": 0.5357427336170907,
"learning_rate": 6.391752577319588e-06,
"loss": 1.3338,
"step": 62
},
{
"epoch": 0.16268560361523562,
"grad_norm": 0.5023717761905121,
"learning_rate": 6.494845360824743e-06,
"loss": 1.3329,
"step": 63
},
{
"epoch": 0.16526791478373143,
"grad_norm": 0.5667763878793689,
"learning_rate": 6.597938144329898e-06,
"loss": 1.3091,
"step": 64
},
{
"epoch": 0.16785022595222723,
"grad_norm": 0.5321682781083598,
"learning_rate": 6.701030927835052e-06,
"loss": 1.4019,
"step": 65
},
{
"epoch": 0.17043253712072304,
"grad_norm": 0.5281078850382545,
"learning_rate": 6.804123711340207e-06,
"loss": 1.2957,
"step": 66
},
{
"epoch": 0.17301484828921884,
"grad_norm": 0.5517071277006239,
"learning_rate": 6.907216494845361e-06,
"loss": 1.3387,
"step": 67
},
{
"epoch": 0.17559715945771465,
"grad_norm": 0.49767879157016104,
"learning_rate": 7.010309278350515e-06,
"loss": 1.3428,
"step": 68
},
{
"epoch": 0.17817947062621045,
"grad_norm": 0.5333338011710018,
"learning_rate": 7.113402061855671e-06,
"loss": 1.3275,
"step": 69
},
{
"epoch": 0.18076178179470626,
"grad_norm": 0.5436738511723471,
"learning_rate": 7.216494845360825e-06,
"loss": 1.323,
"step": 70
},
{
"epoch": 0.18334409296320206,
"grad_norm": 0.510166704780262,
"learning_rate": 7.319587628865979e-06,
"loss": 1.3337,
"step": 71
},
{
"epoch": 0.18592640413169786,
"grad_norm": 0.5367005388657456,
"learning_rate": 7.422680412371135e-06,
"loss": 1.3157,
"step": 72
},
{
"epoch": 0.18850871530019367,
"grad_norm": 0.5448486484799029,
"learning_rate": 7.525773195876289e-06,
"loss": 1.3016,
"step": 73
},
{
"epoch": 0.19109102646868947,
"grad_norm": 0.5049919202829503,
"learning_rate": 7.628865979381444e-06,
"loss": 1.314,
"step": 74
},
{
"epoch": 0.19367333763718528,
"grad_norm": 0.5203343239140137,
"learning_rate": 7.731958762886599e-06,
"loss": 1.3402,
"step": 75
},
{
"epoch": 0.19625564880568108,
"grad_norm": 0.5295426487296165,
"learning_rate": 7.835051546391754e-06,
"loss": 1.2937,
"step": 76
},
{
"epoch": 0.1988379599741769,
"grad_norm": 0.516845033941673,
"learning_rate": 7.938144329896907e-06,
"loss": 1.3334,
"step": 77
},
{
"epoch": 0.2014202711426727,
"grad_norm": 0.5195834990483513,
"learning_rate": 8.041237113402063e-06,
"loss": 1.3398,
"step": 78
},
{
"epoch": 0.2040025823111685,
"grad_norm": 0.5437220214849503,
"learning_rate": 8.144329896907216e-06,
"loss": 1.3328,
"step": 79
},
{
"epoch": 0.2065848934796643,
"grad_norm": 0.5316348277354109,
"learning_rate": 8.247422680412371e-06,
"loss": 1.2668,
"step": 80
},
{
"epoch": 0.2091672046481601,
"grad_norm": 0.5367859922800738,
"learning_rate": 8.350515463917526e-06,
"loss": 1.3455,
"step": 81
},
{
"epoch": 0.2117495158166559,
"grad_norm": 0.5330729877188181,
"learning_rate": 8.453608247422681e-06,
"loss": 1.3012,
"step": 82
},
{
"epoch": 0.2143318269851517,
"grad_norm": 0.5289538334232236,
"learning_rate": 8.556701030927836e-06,
"loss": 1.2987,
"step": 83
},
{
"epoch": 0.21691413815364752,
"grad_norm": 0.5271186274617113,
"learning_rate": 8.65979381443299e-06,
"loss": 1.3091,
"step": 84
},
{
"epoch": 0.21949644932214332,
"grad_norm": 0.5425463627416961,
"learning_rate": 8.762886597938146e-06,
"loss": 1.3312,
"step": 85
},
{
"epoch": 0.22207876049063913,
"grad_norm": 0.5002639279766852,
"learning_rate": 8.865979381443299e-06,
"loss": 1.297,
"step": 86
},
{
"epoch": 0.22466107165913493,
"grad_norm": 0.5217277201869615,
"learning_rate": 8.969072164948455e-06,
"loss": 1.289,
"step": 87
},
{
"epoch": 0.22724338282763074,
"grad_norm": 0.5187640962084964,
"learning_rate": 9.072164948453609e-06,
"loss": 1.2987,
"step": 88
},
{
"epoch": 0.22982569399612654,
"grad_norm": 0.5338044768965542,
"learning_rate": 9.175257731958764e-06,
"loss": 1.3224,
"step": 89
},
{
"epoch": 0.23240800516462234,
"grad_norm": 0.5154318950249379,
"learning_rate": 9.278350515463918e-06,
"loss": 1.2765,
"step": 90
},
{
"epoch": 0.23499031633311815,
"grad_norm": 0.5528419081350959,
"learning_rate": 9.381443298969073e-06,
"loss": 1.2546,
"step": 91
},
{
"epoch": 0.23757262750161395,
"grad_norm": 0.5214303815188377,
"learning_rate": 9.484536082474226e-06,
"loss": 1.3196,
"step": 92
},
{
"epoch": 0.24015493867010976,
"grad_norm": 0.5526561516212597,
"learning_rate": 9.587628865979383e-06,
"loss": 1.3079,
"step": 93
},
{
"epoch": 0.24273724983860556,
"grad_norm": 0.5222706312908064,
"learning_rate": 9.690721649484536e-06,
"loss": 1.3066,
"step": 94
},
{
"epoch": 0.24531956100710137,
"grad_norm": 0.5012834034257991,
"learning_rate": 9.793814432989691e-06,
"loss": 1.2722,
"step": 95
},
{
"epoch": 0.24790187217559717,
"grad_norm": 0.546506717931034,
"learning_rate": 9.896907216494846e-06,
"loss": 1.2797,
"step": 96
},
{
"epoch": 0.25048418334409295,
"grad_norm": 0.5253068189240363,
"learning_rate": 1e-05,
"loss": 1.3317,
"step": 97
},
{
"epoch": 0.2530664945125888,
"grad_norm": 0.5611013083746247,
"learning_rate": 1.0103092783505156e-05,
"loss": 1.3025,
"step": 98
},
{
"epoch": 0.25564880568108456,
"grad_norm": 0.5298450611788376,
"learning_rate": 1.0206185567010309e-05,
"loss": 1.3036,
"step": 99
},
{
"epoch": 0.2582311168495804,
"grad_norm": 0.5099158069328582,
"learning_rate": 1.0309278350515464e-05,
"loss": 1.3022,
"step": 100
},
{
"epoch": 0.26081342801807617,
"grad_norm": 0.5293692963261973,
"learning_rate": 1.041237113402062e-05,
"loss": 1.3146,
"step": 101
},
{
"epoch": 0.263395739186572,
"grad_norm": 0.5244293197527762,
"learning_rate": 1.0515463917525775e-05,
"loss": 1.2885,
"step": 102
},
{
"epoch": 0.2659780503550678,
"grad_norm": 0.5033038451383925,
"learning_rate": 1.0618556701030928e-05,
"loss": 1.2578,
"step": 103
},
{
"epoch": 0.2685603615235636,
"grad_norm": 0.5274373232659619,
"learning_rate": 1.0721649484536083e-05,
"loss": 1.299,
"step": 104
},
{
"epoch": 0.2711426726920594,
"grad_norm": 0.5444726311980428,
"learning_rate": 1.0824742268041238e-05,
"loss": 1.2929,
"step": 105
},
{
"epoch": 0.2737249838605552,
"grad_norm": 0.5326713139943118,
"learning_rate": 1.0927835051546391e-05,
"loss": 1.2753,
"step": 106
},
{
"epoch": 0.276307295029051,
"grad_norm": 0.5380363136345329,
"learning_rate": 1.1030927835051548e-05,
"loss": 1.3309,
"step": 107
},
{
"epoch": 0.2788896061975468,
"grad_norm": 0.5326668305211691,
"learning_rate": 1.1134020618556703e-05,
"loss": 1.2572,
"step": 108
},
{
"epoch": 0.2814719173660426,
"grad_norm": 0.546468614229743,
"learning_rate": 1.1237113402061856e-05,
"loss": 1.3002,
"step": 109
},
{
"epoch": 0.28405422853453843,
"grad_norm": 0.5702827216999251,
"learning_rate": 1.134020618556701e-05,
"loss": 1.2785,
"step": 110
},
{
"epoch": 0.2866365397030342,
"grad_norm": 0.6279431033493682,
"learning_rate": 1.1443298969072166e-05,
"loss": 1.293,
"step": 111
},
{
"epoch": 0.28921885087153004,
"grad_norm": 0.531097886388855,
"learning_rate": 1.1546391752577319e-05,
"loss": 1.2583,
"step": 112
},
{
"epoch": 0.2918011620400258,
"grad_norm": 0.5599048942527884,
"learning_rate": 1.1649484536082475e-05,
"loss": 1.2812,
"step": 113
},
{
"epoch": 0.29438347320852165,
"grad_norm": 0.5156492032600126,
"learning_rate": 1.175257731958763e-05,
"loss": 1.2593,
"step": 114
},
{
"epoch": 0.2969657843770174,
"grad_norm": 0.5204640872347789,
"learning_rate": 1.1855670103092785e-05,
"loss": 1.2222,
"step": 115
},
{
"epoch": 0.29954809554551326,
"grad_norm": 0.5242605009448421,
"learning_rate": 1.1958762886597938e-05,
"loss": 1.2452,
"step": 116
},
{
"epoch": 0.30213040671400904,
"grad_norm": 0.5194114061316515,
"learning_rate": 1.2061855670103093e-05,
"loss": 1.2844,
"step": 117
},
{
"epoch": 0.30471271788250487,
"grad_norm": 0.517182795982599,
"learning_rate": 1.2164948453608248e-05,
"loss": 1.2494,
"step": 118
},
{
"epoch": 0.30729502905100065,
"grad_norm": 0.5322718855106631,
"learning_rate": 1.2268041237113405e-05,
"loss": 1.2853,
"step": 119
},
{
"epoch": 0.3098773402194964,
"grad_norm": 0.5477767784263254,
"learning_rate": 1.2371134020618558e-05,
"loss": 1.266,
"step": 120
},
{
"epoch": 0.31245965138799225,
"grad_norm": 0.5409220884054373,
"learning_rate": 1.2474226804123713e-05,
"loss": 1.2862,
"step": 121
},
{
"epoch": 0.31504196255648803,
"grad_norm": 0.5278483491044216,
"learning_rate": 1.2577319587628866e-05,
"loss": 1.2431,
"step": 122
},
{
"epoch": 0.31762427372498386,
"grad_norm": 0.5366419552263015,
"learning_rate": 1.268041237113402e-05,
"loss": 1.2665,
"step": 123
},
{
"epoch": 0.32020658489347964,
"grad_norm": 0.5484453793169465,
"learning_rate": 1.2783505154639176e-05,
"loss": 1.2466,
"step": 124
},
{
"epoch": 0.32278889606197547,
"grad_norm": 0.5707617704740593,
"learning_rate": 1.2886597938144332e-05,
"loss": 1.2801,
"step": 125
},
{
"epoch": 0.32537120723047125,
"grad_norm": 0.5545965695444451,
"learning_rate": 1.2989690721649485e-05,
"loss": 1.2627,
"step": 126
},
{
"epoch": 0.3279535183989671,
"grad_norm": 0.545566481033505,
"learning_rate": 1.309278350515464e-05,
"loss": 1.2662,
"step": 127
},
{
"epoch": 0.33053582956746286,
"grad_norm": 0.614998937339536,
"learning_rate": 1.3195876288659795e-05,
"loss": 1.278,
"step": 128
},
{
"epoch": 0.3331181407359587,
"grad_norm": 0.8515042792729816,
"learning_rate": 1.3298969072164948e-05,
"loss": 1.2528,
"step": 129
},
{
"epoch": 0.33570045190445447,
"grad_norm": 0.5478821425580377,
"learning_rate": 1.3402061855670103e-05,
"loss": 1.2626,
"step": 130
},
{
"epoch": 0.3382827630729503,
"grad_norm": 0.5793920120710241,
"learning_rate": 1.350515463917526e-05,
"loss": 1.229,
"step": 131
},
{
"epoch": 0.3408650742414461,
"grad_norm": 0.6386990352009216,
"learning_rate": 1.3608247422680415e-05,
"loss": 1.2371,
"step": 132
},
{
"epoch": 0.3434473854099419,
"grad_norm": 0.5565581664986918,
"learning_rate": 1.3711340206185568e-05,
"loss": 1.2155,
"step": 133
},
{
"epoch": 0.3460296965784377,
"grad_norm": 0.5946237867227266,
"learning_rate": 1.3814432989690723e-05,
"loss": 1.2841,
"step": 134
},
{
"epoch": 0.3486120077469335,
"grad_norm": 0.5521016353163,
"learning_rate": 1.3917525773195878e-05,
"loss": 1.2551,
"step": 135
},
{
"epoch": 0.3511943189154293,
"grad_norm": 0.5385185628561531,
"learning_rate": 1.402061855670103e-05,
"loss": 1.2419,
"step": 136
},
{
"epoch": 0.3537766300839251,
"grad_norm": 0.5193240629735151,
"learning_rate": 1.4123711340206187e-05,
"loss": 1.2539,
"step": 137
},
{
"epoch": 0.3563589412524209,
"grad_norm": 0.556121240094166,
"learning_rate": 1.4226804123711342e-05,
"loss": 1.2303,
"step": 138
},
{
"epoch": 0.35894125242091673,
"grad_norm": 0.5218247569723863,
"learning_rate": 1.4329896907216495e-05,
"loss": 1.2644,
"step": 139
},
{
"epoch": 0.3615235635894125,
"grad_norm": 0.530437093642517,
"learning_rate": 1.443298969072165e-05,
"loss": 1.2497,
"step": 140
},
{
"epoch": 0.36410587475790834,
"grad_norm": 0.5484440453142011,
"learning_rate": 1.4536082474226805e-05,
"loss": 1.2324,
"step": 141
},
{
"epoch": 0.3666881859264041,
"grad_norm": 0.5493692192470679,
"learning_rate": 1.4639175257731958e-05,
"loss": 1.2717,
"step": 142
},
{
"epoch": 0.36927049709489995,
"grad_norm": 0.5413123502333834,
"learning_rate": 1.4742268041237115e-05,
"loss": 1.2103,
"step": 143
},
{
"epoch": 0.37185280826339573,
"grad_norm": 0.556751121901872,
"learning_rate": 1.484536082474227e-05,
"loss": 1.2532,
"step": 144
},
{
"epoch": 0.37443511943189156,
"grad_norm": 0.5408139067043912,
"learning_rate": 1.4948453608247425e-05,
"loss": 1.2995,
"step": 145
},
{
"epoch": 0.37701743060038734,
"grad_norm": 0.567109158025857,
"learning_rate": 1.5051546391752578e-05,
"loss": 1.2412,
"step": 146
},
{
"epoch": 0.37959974176888317,
"grad_norm": 0.5489463445020752,
"learning_rate": 1.5154639175257733e-05,
"loss": 1.2601,
"step": 147
},
{
"epoch": 0.38218205293737895,
"grad_norm": 0.5269782212095577,
"learning_rate": 1.5257731958762888e-05,
"loss": 1.2105,
"step": 148
},
{
"epoch": 0.3847643641058748,
"grad_norm": 0.5476377594621987,
"learning_rate": 1.5360824742268042e-05,
"loss": 1.251,
"step": 149
},
{
"epoch": 0.38734667527437056,
"grad_norm": 0.5851100678927004,
"learning_rate": 1.5463917525773197e-05,
"loss": 1.2722,
"step": 150
},
{
"epoch": 0.3899289864428664,
"grad_norm": 0.5906959266553361,
"learning_rate": 1.5567010309278352e-05,
"loss": 1.2605,
"step": 151
},
{
"epoch": 0.39251129761136216,
"grad_norm": 0.5812493666723612,
"learning_rate": 1.5670103092783507e-05,
"loss": 1.278,
"step": 152
},
{
"epoch": 0.395093608779858,
"grad_norm": 0.6197633946415279,
"learning_rate": 1.5773195876288662e-05,
"loss": 1.2062,
"step": 153
},
{
"epoch": 0.3976759199483538,
"grad_norm": 0.5382604894375081,
"learning_rate": 1.5876288659793813e-05,
"loss": 1.1994,
"step": 154
},
{
"epoch": 0.4002582311168496,
"grad_norm": 0.5919569403332751,
"learning_rate": 1.597938144329897e-05,
"loss": 1.2151,
"step": 155
},
{
"epoch": 0.4028405422853454,
"grad_norm": 0.5540942763865888,
"learning_rate": 1.6082474226804127e-05,
"loss": 1.2245,
"step": 156
},
{
"epoch": 0.4054228534538412,
"grad_norm": 0.6456250154614755,
"learning_rate": 1.618556701030928e-05,
"loss": 1.2407,
"step": 157
},
{
"epoch": 0.408005164622337,
"grad_norm": 0.5495270617698018,
"learning_rate": 1.6288659793814433e-05,
"loss": 1.2319,
"step": 158
},
{
"epoch": 0.41058747579083277,
"grad_norm": 0.6456750468117451,
"learning_rate": 1.6391752577319588e-05,
"loss": 1.2314,
"step": 159
},
{
"epoch": 0.4131697869593286,
"grad_norm": 0.5361216417752698,
"learning_rate": 1.6494845360824743e-05,
"loss": 1.2252,
"step": 160
},
{
"epoch": 0.4157520981278244,
"grad_norm": 0.5740356212634748,
"learning_rate": 1.65979381443299e-05,
"loss": 1.2199,
"step": 161
},
{
"epoch": 0.4183344092963202,
"grad_norm": 0.5600044764299427,
"learning_rate": 1.6701030927835052e-05,
"loss": 1.2386,
"step": 162
},
{
"epoch": 0.420916720464816,
"grad_norm": 0.5752486574243302,
"learning_rate": 1.6804123711340207e-05,
"loss": 1.2745,
"step": 163
},
{
"epoch": 0.4234990316333118,
"grad_norm": 0.5653976115621461,
"learning_rate": 1.6907216494845362e-05,
"loss": 1.2294,
"step": 164
},
{
"epoch": 0.4260813428018076,
"grad_norm": 0.6075219487510116,
"learning_rate": 1.7010309278350517e-05,
"loss": 1.2719,
"step": 165
},
{
"epoch": 0.4286636539703034,
"grad_norm": 0.606371230053945,
"learning_rate": 1.7113402061855672e-05,
"loss": 1.2112,
"step": 166
},
{
"epoch": 0.4312459651387992,
"grad_norm": 0.6078879765053119,
"learning_rate": 1.7216494845360827e-05,
"loss": 1.264,
"step": 167
},
{
"epoch": 0.43382827630729504,
"grad_norm": 0.5767804822867032,
"learning_rate": 1.731958762886598e-05,
"loss": 1.2515,
"step": 168
},
{
"epoch": 0.4364105874757908,
"grad_norm": 0.6130923630273873,
"learning_rate": 1.7422680412371137e-05,
"loss": 1.2038,
"step": 169
},
{
"epoch": 0.43899289864428664,
"grad_norm": 0.5235102985682106,
"learning_rate": 1.752577319587629e-05,
"loss": 1.237,
"step": 170
},
{
"epoch": 0.4415752098127824,
"grad_norm": 0.6204241509904728,
"learning_rate": 1.7628865979381443e-05,
"loss": 1.2653,
"step": 171
},
{
"epoch": 0.44415752098127825,
"grad_norm": 0.5839862114241418,
"learning_rate": 1.7731958762886598e-05,
"loss": 1.2268,
"step": 172
},
{
"epoch": 0.44673983214977403,
"grad_norm": 0.5379286158874602,
"learning_rate": 1.7835051546391756e-05,
"loss": 1.2237,
"step": 173
},
{
"epoch": 0.44932214331826986,
"grad_norm": 0.5773716297805083,
"learning_rate": 1.793814432989691e-05,
"loss": 1.2563,
"step": 174
},
{
"epoch": 0.45190445448676564,
"grad_norm": 0.5310519709213946,
"learning_rate": 1.8041237113402062e-05,
"loss": 1.1999,
"step": 175
},
{
"epoch": 0.45448676565526147,
"grad_norm": 0.5849880582279767,
"learning_rate": 1.8144329896907217e-05,
"loss": 1.2286,
"step": 176
},
{
"epoch": 0.45706907682375725,
"grad_norm": 0.5412178590822653,
"learning_rate": 1.8247422680412372e-05,
"loss": 1.2008,
"step": 177
},
{
"epoch": 0.4596513879922531,
"grad_norm": 0.6133621047960811,
"learning_rate": 1.8350515463917527e-05,
"loss": 1.2378,
"step": 178
},
{
"epoch": 0.46223369916074886,
"grad_norm": 0.528699955767748,
"learning_rate": 1.8453608247422682e-05,
"loss": 1.2259,
"step": 179
},
{
"epoch": 0.4648160103292447,
"grad_norm": 0.5279317130021535,
"learning_rate": 1.8556701030927837e-05,
"loss": 1.2412,
"step": 180
},
{
"epoch": 0.46739832149774047,
"grad_norm": 0.5853559551993786,
"learning_rate": 1.865979381443299e-05,
"loss": 1.2214,
"step": 181
},
{
"epoch": 0.4699806326662363,
"grad_norm": 0.5424233833428941,
"learning_rate": 1.8762886597938147e-05,
"loss": 1.2458,
"step": 182
},
{
"epoch": 0.4725629438347321,
"grad_norm": 0.5106671423999047,
"learning_rate": 1.88659793814433e-05,
"loss": 1.2233,
"step": 183
},
{
"epoch": 0.4751452550032279,
"grad_norm": 0.5324533214858636,
"learning_rate": 1.8969072164948453e-05,
"loss": 1.2069,
"step": 184
},
{
"epoch": 0.4777275661717237,
"grad_norm": 0.5646853613253074,
"learning_rate": 1.907216494845361e-05,
"loss": 1.252,
"step": 185
},
{
"epoch": 0.4803098773402195,
"grad_norm": 0.5937004053571758,
"learning_rate": 1.9175257731958766e-05,
"loss": 1.2147,
"step": 186
},
{
"epoch": 0.4828921885087153,
"grad_norm": 0.5562505209372693,
"learning_rate": 1.927835051546392e-05,
"loss": 1.2675,
"step": 187
},
{
"epoch": 0.4854744996772111,
"grad_norm": 0.5563014952202203,
"learning_rate": 1.9381443298969072e-05,
"loss": 1.2189,
"step": 188
},
{
"epoch": 0.4880568108457069,
"grad_norm": 0.5264086037575215,
"learning_rate": 1.9484536082474227e-05,
"loss": 1.2233,
"step": 189
},
{
"epoch": 0.49063912201420273,
"grad_norm": 0.5151034326104902,
"learning_rate": 1.9587628865979382e-05,
"loss": 1.1885,
"step": 190
},
{
"epoch": 0.4932214331826985,
"grad_norm": 0.5034288685234395,
"learning_rate": 1.969072164948454e-05,
"loss": 1.1903,
"step": 191
},
{
"epoch": 0.49580374435119434,
"grad_norm": 0.5307684939002809,
"learning_rate": 1.9793814432989692e-05,
"loss": 1.1928,
"step": 192
},
{
"epoch": 0.4983860555196901,
"grad_norm": 0.5230091554872849,
"learning_rate": 1.9896907216494847e-05,
"loss": 1.2519,
"step": 193
},
{
"epoch": 0.5009683666881859,
"grad_norm": 0.5226317801919913,
"learning_rate": 2e-05,
"loss": 1.2413,
"step": 194
},
{
"epoch": 0.5035506778566817,
"grad_norm": 0.5482333261742753,
"learning_rate": 1.9999983719336895e-05,
"loss": 1.2165,
"step": 195
},
{
"epoch": 0.5061329890251776,
"grad_norm": 0.5162294083726316,
"learning_rate": 1.999993487740058e-05,
"loss": 1.241,
"step": 196
},
{
"epoch": 0.5087153001936734,
"grad_norm": 0.5914236651547329,
"learning_rate": 1.99998534743501e-05,
"loss": 1.2174,
"step": 197
},
{
"epoch": 0.5112976113621691,
"grad_norm": 0.5318708983837044,
"learning_rate": 1.9999739510450505e-05,
"loss": 1.2061,
"step": 198
},
{
"epoch": 0.513879922530665,
"grad_norm": 0.5320871675914947,
"learning_rate": 1.9999592986072886e-05,
"loss": 1.2102,
"step": 199
},
{
"epoch": 0.5164622336991608,
"grad_norm": 0.520490178789329,
"learning_rate": 1.999941390169434e-05,
"loss": 1.2041,
"step": 200
},
{
"epoch": 0.5190445448676565,
"grad_norm": 0.5771804416096503,
"learning_rate": 1.9999202257897994e-05,
"loss": 1.2208,
"step": 201
},
{
"epoch": 0.5216268560361523,
"grad_norm": 0.5820688393505574,
"learning_rate": 1.9998958055372984e-05,
"loss": 1.2535,
"step": 202
},
{
"epoch": 0.5242091672046482,
"grad_norm": 0.5725413352628053,
"learning_rate": 1.9998681294914463e-05,
"loss": 1.2003,
"step": 203
},
{
"epoch": 0.526791478373144,
"grad_norm": 0.5813889118499928,
"learning_rate": 1.999837197742361e-05,
"loss": 1.1826,
"step": 204
},
{
"epoch": 0.5293737895416397,
"grad_norm": 0.543697742052886,
"learning_rate": 1.9998030103907594e-05,
"loss": 1.2383,
"step": 205
},
{
"epoch": 0.5319561007101355,
"grad_norm": 0.5737878782200372,
"learning_rate": 1.9997655675479604e-05,
"loss": 1.2261,
"step": 206
},
{
"epoch": 0.5345384118786314,
"grad_norm": 0.5318306020135024,
"learning_rate": 1.999724869335883e-05,
"loss": 1.2087,
"step": 207
},
{
"epoch": 0.5371207230471272,
"grad_norm": 0.5577225248661078,
"learning_rate": 1.999680915887046e-05,
"loss": 1.1793,
"step": 208
},
{
"epoch": 0.5397030342156229,
"grad_norm": 0.5246527308813651,
"learning_rate": 1.9996337073445673e-05,
"loss": 1.1913,
"step": 209
},
{
"epoch": 0.5422853453841188,
"grad_norm": 0.5214804985936327,
"learning_rate": 1.9995832438621646e-05,
"loss": 1.2264,
"step": 210
},
{
"epoch": 0.5448676565526146,
"grad_norm": 0.5608653450795346,
"learning_rate": 1.9995295256041534e-05,
"loss": 1.2269,
"step": 211
},
{
"epoch": 0.5474499677211104,
"grad_norm": 0.516907670213089,
"learning_rate": 1.9994725527454476e-05,
"loss": 1.1963,
"step": 212
},
{
"epoch": 0.5500322788896062,
"grad_norm": 0.5315857426965214,
"learning_rate": 1.999412325471558e-05,
"loss": 1.2646,
"step": 213
},
{
"epoch": 0.552614590058102,
"grad_norm": 0.5281811102245024,
"learning_rate": 1.999348843978593e-05,
"loss": 1.2163,
"step": 214
},
{
"epoch": 0.5551969012265978,
"grad_norm": 0.5579573481195031,
"learning_rate": 1.9992821084732572e-05,
"loss": 1.2262,
"step": 215
},
{
"epoch": 0.5577792123950936,
"grad_norm": 0.5595800317478022,
"learning_rate": 1.9992121191728495e-05,
"loss": 1.1872,
"step": 216
},
{
"epoch": 0.5603615235635894,
"grad_norm": 0.5344069260281085,
"learning_rate": 1.9991388763052643e-05,
"loss": 1.2293,
"step": 217
},
{
"epoch": 0.5629438347320852,
"grad_norm": 0.549912373034949,
"learning_rate": 1.9990623801089908e-05,
"loss": 1.1958,
"step": 218
},
{
"epoch": 0.565526145900581,
"grad_norm": 0.5482064392691067,
"learning_rate": 1.9989826308331103e-05,
"loss": 1.249,
"step": 219
},
{
"epoch": 0.5681084570690769,
"grad_norm": 0.6017796916988709,
"learning_rate": 1.9988996287372967e-05,
"loss": 1.1591,
"step": 220
},
{
"epoch": 0.5706907682375726,
"grad_norm": 0.5616669264753023,
"learning_rate": 1.9988133740918167e-05,
"loss": 1.2029,
"step": 221
},
{
"epoch": 0.5732730794060684,
"grad_norm": 0.548070565923786,
"learning_rate": 1.998723867177526e-05,
"loss": 1.2113,
"step": 222
},
{
"epoch": 0.5758553905745643,
"grad_norm": 0.5579809466395939,
"learning_rate": 1.998631108285871e-05,
"loss": 1.1915,
"step": 223
},
{
"epoch": 0.5784377017430601,
"grad_norm": 0.5145023167575715,
"learning_rate": 1.9985350977188877e-05,
"loss": 1.2455,
"step": 224
},
{
"epoch": 0.5810200129115558,
"grad_norm": 0.5438281690868874,
"learning_rate": 1.998435835789199e-05,
"loss": 1.1996,
"step": 225
},
{
"epoch": 0.5836023240800516,
"grad_norm": 0.5062822089728274,
"learning_rate": 1.9983333228200145e-05,
"loss": 1.2267,
"step": 226
},
{
"epoch": 0.5861846352485475,
"grad_norm": 0.49455916062220207,
"learning_rate": 1.9982275591451304e-05,
"loss": 1.2234,
"step": 227
},
{
"epoch": 0.5887669464170433,
"grad_norm": 0.5230043234273385,
"learning_rate": 1.998118545108927e-05,
"loss": 1.2048,
"step": 228
},
{
"epoch": 0.591349257585539,
"grad_norm": 0.492411804242286,
"learning_rate": 1.998006281066369e-05,
"loss": 1.1567,
"step": 229
},
{
"epoch": 0.5939315687540349,
"grad_norm": 0.47969671152864785,
"learning_rate": 1.997890767383002e-05,
"loss": 1.1842,
"step": 230
},
{
"epoch": 0.5965138799225307,
"grad_norm": 0.5037463603875141,
"learning_rate": 1.9977720044349546e-05,
"loss": 1.2071,
"step": 231
},
{
"epoch": 0.5990961910910265,
"grad_norm": 0.5505833494748907,
"learning_rate": 1.997649992608935e-05,
"loss": 1.2215,
"step": 232
},
{
"epoch": 0.6016785022595222,
"grad_norm": 0.48300379957641953,
"learning_rate": 1.9975247323022286e-05,
"loss": 1.1522,
"step": 233
},
{
"epoch": 0.6042608134280181,
"grad_norm": 0.535926278201071,
"learning_rate": 1.9973962239227012e-05,
"loss": 1.187,
"step": 234
},
{
"epoch": 0.6068431245965139,
"grad_norm": 0.5053573787331682,
"learning_rate": 1.997264467888792e-05,
"loss": 1.2264,
"step": 235
},
{
"epoch": 0.6094254357650097,
"grad_norm": 0.5209920849371423,
"learning_rate": 1.9971294646295165e-05,
"loss": 1.1841,
"step": 236
},
{
"epoch": 0.6120077469335055,
"grad_norm": 0.540792437365881,
"learning_rate": 1.9969912145844633e-05,
"loss": 1.2543,
"step": 237
},
{
"epoch": 0.6145900581020013,
"grad_norm": 0.5212177041723032,
"learning_rate": 1.9968497182037926e-05,
"loss": 1.2561,
"step": 238
},
{
"epoch": 0.6171723692704971,
"grad_norm": 0.5304144527565653,
"learning_rate": 1.996704975948236e-05,
"loss": 1.1606,
"step": 239
},
{
"epoch": 0.6197546804389928,
"grad_norm": 0.5829183549634611,
"learning_rate": 1.9965569882890924e-05,
"loss": 1.196,
"step": 240
},
{
"epoch": 0.6223369916074887,
"grad_norm": 0.5345144481383324,
"learning_rate": 1.99640575570823e-05,
"loss": 1.1859,
"step": 241
},
{
"epoch": 0.6249193027759845,
"grad_norm": 0.5221286963848386,
"learning_rate": 1.9962512786980825e-05,
"loss": 1.1715,
"step": 242
},
{
"epoch": 0.6275016139444803,
"grad_norm": 0.5058627006232616,
"learning_rate": 1.9960935577616466e-05,
"loss": 1.1821,
"step": 243
},
{
"epoch": 0.6300839251129761,
"grad_norm": 0.5199433840698656,
"learning_rate": 1.9959325934124833e-05,
"loss": 1.1953,
"step": 244
},
{
"epoch": 0.6326662362814719,
"grad_norm": 0.5096245762495348,
"learning_rate": 1.9957683861747137e-05,
"loss": 1.1775,
"step": 245
},
{
"epoch": 0.6352485474499677,
"grad_norm": 0.491816602747724,
"learning_rate": 1.995600936583018e-05,
"loss": 1.1965,
"step": 246
},
{
"epoch": 0.6378308586184636,
"grad_norm": 0.539351334033109,
"learning_rate": 1.9954302451826343e-05,
"loss": 1.1902,
"step": 247
},
{
"epoch": 0.6404131697869593,
"grad_norm": 0.5695500043236047,
"learning_rate": 1.9952563125293572e-05,
"loss": 1.1805,
"step": 248
},
{
"epoch": 0.6429954809554551,
"grad_norm": 0.5279649835109442,
"learning_rate": 1.9950791391895335e-05,
"loss": 1.1736,
"step": 249
},
{
"epoch": 0.6455777921239509,
"grad_norm": 0.5082701944757544,
"learning_rate": 1.9948987257400637e-05,
"loss": 1.2334,
"step": 250
},
{
"epoch": 0.6481601032924468,
"grad_norm": 0.5482093109063493,
"learning_rate": 1.994715072768398e-05,
"loss": 1.1802,
"step": 251
},
{
"epoch": 0.6507424144609425,
"grad_norm": 0.50876456990971,
"learning_rate": 1.9945281808725342e-05,
"loss": 1.2399,
"step": 252
},
{
"epoch": 0.6533247256294383,
"grad_norm": 0.5541690057371409,
"learning_rate": 1.9943380506610177e-05,
"loss": 1.1826,
"step": 253
},
{
"epoch": 0.6559070367979342,
"grad_norm": 0.503268686466964,
"learning_rate": 1.9941446827529374e-05,
"loss": 1.1959,
"step": 254
},
{
"epoch": 0.65848934796643,
"grad_norm": 0.5437511832970794,
"learning_rate": 1.993948077777925e-05,
"loss": 1.1953,
"step": 255
},
{
"epoch": 0.6610716591349257,
"grad_norm": 0.4781802895283609,
"learning_rate": 1.9937482363761522e-05,
"loss": 1.1989,
"step": 256
},
{
"epoch": 0.6636539703034215,
"grad_norm": 0.5449342716639853,
"learning_rate": 1.9935451591983292e-05,
"loss": 1.2134,
"step": 257
},
{
"epoch": 0.6662362814719174,
"grad_norm": 0.5062143579422912,
"learning_rate": 1.9933388469057026e-05,
"loss": 1.2243,
"step": 258
},
{
"epoch": 0.6688185926404132,
"grad_norm": 0.47503990693298725,
"learning_rate": 1.9931293001700518e-05,
"loss": 1.1859,
"step": 259
},
{
"epoch": 0.6714009038089089,
"grad_norm": 0.5208679076366871,
"learning_rate": 1.9929165196736893e-05,
"loss": 1.1658,
"step": 260
},
{
"epoch": 0.6739832149774048,
"grad_norm": 0.5159438722195342,
"learning_rate": 1.9927005061094563e-05,
"loss": 1.1943,
"step": 261
},
{
"epoch": 0.6765655261459006,
"grad_norm": 0.51564921169161,
"learning_rate": 1.992481260180722e-05,
"loss": 1.2096,
"step": 262
},
{
"epoch": 0.6791478373143964,
"grad_norm": 0.5272888613016128,
"learning_rate": 1.99225878260138e-05,
"loss": 1.2247,
"step": 263
},
{
"epoch": 0.6817301484828922,
"grad_norm": 0.4937662552334185,
"learning_rate": 1.992033074095847e-05,
"loss": 1.1959,
"step": 264
},
{
"epoch": 0.684312459651388,
"grad_norm": 0.5156120724948244,
"learning_rate": 1.9918041353990593e-05,
"loss": 1.1896,
"step": 265
},
{
"epoch": 0.6868947708198838,
"grad_norm": 0.5029899107771948,
"learning_rate": 1.9915719672564724e-05,
"loss": 1.2029,
"step": 266
},
{
"epoch": 0.6894770819883796,
"grad_norm": 0.5021013897512888,
"learning_rate": 1.9913365704240562e-05,
"loss": 1.2001,
"step": 267
},
{
"epoch": 0.6920593931568754,
"grad_norm": 0.4874570592953185,
"learning_rate": 1.9910979456682935e-05,
"loss": 1.1909,
"step": 268
},
{
"epoch": 0.6946417043253712,
"grad_norm": 0.49498289469426227,
"learning_rate": 1.990856093766179e-05,
"loss": 1.1823,
"step": 269
},
{
"epoch": 0.697224015493867,
"grad_norm": 0.493423453437657,
"learning_rate": 1.9906110155052142e-05,
"loss": 1.226,
"step": 270
},
{
"epoch": 0.6998063266623629,
"grad_norm": 0.5122671842264414,
"learning_rate": 1.9903627116834064e-05,
"loss": 1.1651,
"step": 271
},
{
"epoch": 0.7023886378308586,
"grad_norm": 0.4937702040231816,
"learning_rate": 1.990111183109266e-05,
"loss": 1.1902,
"step": 272
},
{
"epoch": 0.7049709489993544,
"grad_norm": 0.5136007768359044,
"learning_rate": 1.989856430601803e-05,
"loss": 1.1999,
"step": 273
},
{
"epoch": 0.7075532601678503,
"grad_norm": 0.49202497646323023,
"learning_rate": 1.9895984549905255e-05,
"loss": 1.1814,
"step": 274
},
{
"epoch": 0.7101355713363461,
"grad_norm": 0.5009139995620645,
"learning_rate": 1.9893372571154362e-05,
"loss": 1.19,
"step": 275
},
{
"epoch": 0.7127178825048418,
"grad_norm": 0.5193484363230418,
"learning_rate": 1.9890728378270304e-05,
"loss": 1.2066,
"step": 276
},
{
"epoch": 0.7153001936733376,
"grad_norm": 0.49927361128296144,
"learning_rate": 1.9888051979862922e-05,
"loss": 1.2064,
"step": 277
},
{
"epoch": 0.7178825048418335,
"grad_norm": 0.48996929520751215,
"learning_rate": 1.988534338464692e-05,
"loss": 1.1653,
"step": 278
},
{
"epoch": 0.7204648160103292,
"grad_norm": 0.499684088085507,
"learning_rate": 1.988260260144185e-05,
"loss": 1.1654,
"step": 279
},
{
"epoch": 0.723047127178825,
"grad_norm": 0.48422890349549536,
"learning_rate": 1.987982963917206e-05,
"loss": 1.1554,
"step": 280
},
{
"epoch": 0.7256294383473209,
"grad_norm": 0.49199667307495154,
"learning_rate": 1.987702450686669e-05,
"loss": 1.1908,
"step": 281
},
{
"epoch": 0.7282117495158167,
"grad_norm": 0.4852284035856874,
"learning_rate": 1.9874187213659614e-05,
"loss": 1.1367,
"step": 282
},
{
"epoch": 0.7307940606843124,
"grad_norm": 0.49745524250401135,
"learning_rate": 1.987131776878944e-05,
"loss": 1.1801,
"step": 283
},
{
"epoch": 0.7333763718528082,
"grad_norm": 0.49039290208544745,
"learning_rate": 1.986841618159946e-05,
"loss": 1.1691,
"step": 284
},
{
"epoch": 0.7359586830213041,
"grad_norm": 0.4905834691968128,
"learning_rate": 1.986548246153763e-05,
"loss": 1.1752,
"step": 285
},
{
"epoch": 0.7385409941897999,
"grad_norm": 0.490506332640748,
"learning_rate": 1.9862516618156526e-05,
"loss": 1.1883,
"step": 286
},
{
"epoch": 0.7411233053582956,
"grad_norm": 0.5248431755020703,
"learning_rate": 1.9859518661113326e-05,
"loss": 1.205,
"step": 287
},
{
"epoch": 0.7437056165267915,
"grad_norm": 0.5057560855422101,
"learning_rate": 1.9856488600169785e-05,
"loss": 1.2279,
"step": 288
},
{
"epoch": 0.7462879276952873,
"grad_norm": 0.5025298659401831,
"learning_rate": 1.9853426445192175e-05,
"loss": 1.1631,
"step": 289
},
{
"epoch": 0.7488702388637831,
"grad_norm": 0.4851276896544048,
"learning_rate": 1.9850332206151285e-05,
"loss": 1.1626,
"step": 290
},
{
"epoch": 0.7514525500322788,
"grad_norm": 0.48026264033577865,
"learning_rate": 1.984720589312236e-05,
"loss": 1.2098,
"step": 291
},
{
"epoch": 0.7540348612007747,
"grad_norm": 0.5405394737905861,
"learning_rate": 1.9844047516285098e-05,
"loss": 1.2298,
"step": 292
},
{
"epoch": 0.7566171723692705,
"grad_norm": 0.48769934414927935,
"learning_rate": 1.9840857085923585e-05,
"loss": 1.196,
"step": 293
},
{
"epoch": 0.7591994835377663,
"grad_norm": 0.527078245348908,
"learning_rate": 1.9837634612426292e-05,
"loss": 1.1832,
"step": 294
},
{
"epoch": 0.7617817947062621,
"grad_norm": 0.47968694451872484,
"learning_rate": 1.983438010628602e-05,
"loss": 1.176,
"step": 295
},
{
"epoch": 0.7643641058747579,
"grad_norm": 0.5316988367330956,
"learning_rate": 1.9831093578099866e-05,
"loss": 1.215,
"step": 296
},
{
"epoch": 0.7669464170432537,
"grad_norm": 0.4839737165293382,
"learning_rate": 1.9827775038569203e-05,
"loss": 1.1483,
"step": 297
},
{
"epoch": 0.7695287282117496,
"grad_norm": 0.47298182478673473,
"learning_rate": 1.9824424498499644e-05,
"loss": 1.138,
"step": 298
},
{
"epoch": 0.7721110393802453,
"grad_norm": 0.5037524314864462,
"learning_rate": 1.9821041968800982e-05,
"loss": 1.1906,
"step": 299
},
{
"epoch": 0.7746933505487411,
"grad_norm": 0.4726414066930357,
"learning_rate": 1.981762746048719e-05,
"loss": 1.1872,
"step": 300
},
{
"epoch": 0.7772756617172369,
"grad_norm": 0.4775555799653867,
"learning_rate": 1.9814180984676353e-05,
"loss": 1.1741,
"step": 301
},
{
"epoch": 0.7798579728857328,
"grad_norm": 0.48936767771418854,
"learning_rate": 1.981070255259066e-05,
"loss": 1.1687,
"step": 302
},
{
"epoch": 0.7824402840542285,
"grad_norm": 0.5010983431343674,
"learning_rate": 1.9807192175556344e-05,
"loss": 1.1563,
"step": 303
},
{
"epoch": 0.7850225952227243,
"grad_norm": 0.49317307520583575,
"learning_rate": 1.9803649865003658e-05,
"loss": 1.1831,
"step": 304
},
{
"epoch": 0.7876049063912202,
"grad_norm": 0.5072717596433491,
"learning_rate": 1.9800075632466832e-05,
"loss": 1.1795,
"step": 305
},
{
"epoch": 0.790187217559716,
"grad_norm": 0.5044246633386273,
"learning_rate": 1.979646948958405e-05,
"loss": 1.1985,
"step": 306
},
{
"epoch": 0.7927695287282117,
"grad_norm": 0.5075154181174161,
"learning_rate": 1.979283144809738e-05,
"loss": 1.1955,
"step": 307
},
{
"epoch": 0.7953518398967075,
"grad_norm": 0.5199713188120779,
"learning_rate": 1.9789161519852777e-05,
"loss": 1.2114,
"step": 308
},
{
"epoch": 0.7979341510652034,
"grad_norm": 0.5261480930927327,
"learning_rate": 1.9785459716800005e-05,
"loss": 1.1582,
"step": 309
},
{
"epoch": 0.8005164622336992,
"grad_norm": 0.5271373406623951,
"learning_rate": 1.978172605099264e-05,
"loss": 1.1761,
"step": 310
},
{
"epoch": 0.8030987734021949,
"grad_norm": 0.5065930895833843,
"learning_rate": 1.9777960534587975e-05,
"loss": 1.1915,
"step": 311
},
{
"epoch": 0.8056810845706908,
"grad_norm": 0.5171670807714366,
"learning_rate": 1.9774163179847046e-05,
"loss": 1.1776,
"step": 312
},
{
"epoch": 0.8082633957391866,
"grad_norm": 0.49429669397671067,
"learning_rate": 1.9770333999134538e-05,
"loss": 1.2005,
"step": 313
},
{
"epoch": 0.8108457069076824,
"grad_norm": 0.506264732133423,
"learning_rate": 1.976647300491877e-05,
"loss": 1.1555,
"step": 314
},
{
"epoch": 0.8134280180761781,
"grad_norm": 0.4975188714683081,
"learning_rate": 1.9762580209771648e-05,
"loss": 1.1761,
"step": 315
},
{
"epoch": 0.816010329244674,
"grad_norm": 0.4917712576784755,
"learning_rate": 1.9758655626368635e-05,
"loss": 1.1769,
"step": 316
},
{
"epoch": 0.8185926404131698,
"grad_norm": 0.4954187885530158,
"learning_rate": 1.975469926748869e-05,
"loss": 1.169,
"step": 317
},
{
"epoch": 0.8211749515816655,
"grad_norm": 0.5115626137899685,
"learning_rate": 1.9750711146014254e-05,
"loss": 1.1737,
"step": 318
},
{
"epoch": 0.8237572627501614,
"grad_norm": 0.4748616428095463,
"learning_rate": 1.9746691274931168e-05,
"loss": 1.19,
"step": 319
},
{
"epoch": 0.8263395739186572,
"grad_norm": 0.4940492043780541,
"learning_rate": 1.9742639667328666e-05,
"loss": 1.1761,
"step": 320
},
{
"epoch": 0.828921885087153,
"grad_norm": 0.5422433717627166,
"learning_rate": 1.9738556336399322e-05,
"loss": 1.1573,
"step": 321
},
{
"epoch": 0.8315041962556488,
"grad_norm": 0.5003074749343184,
"learning_rate": 1.9734441295439004e-05,
"loss": 1.1777,
"step": 322
},
{
"epoch": 0.8340865074241446,
"grad_norm": 0.4892259502271507,
"learning_rate": 1.973029455784683e-05,
"loss": 1.1696,
"step": 323
},
{
"epoch": 0.8366688185926404,
"grad_norm": 0.5321024468246395,
"learning_rate": 1.9726116137125128e-05,
"loss": 1.1436,
"step": 324
},
{
"epoch": 0.8392511297611362,
"grad_norm": 0.5045646201138196,
"learning_rate": 1.9721906046879392e-05,
"loss": 1.1764,
"step": 325
},
{
"epoch": 0.841833440929632,
"grad_norm": 0.5281769148500022,
"learning_rate": 1.971766430081823e-05,
"loss": 1.1966,
"step": 326
},
{
"epoch": 0.8444157520981278,
"grad_norm": 0.5202036284793085,
"learning_rate": 1.971339091275333e-05,
"loss": 1.1929,
"step": 327
},
{
"epoch": 0.8469980632666236,
"grad_norm": 0.48805338005531557,
"learning_rate": 1.9709085896599414e-05,
"loss": 1.1713,
"step": 328
},
{
"epoch": 0.8495803744351195,
"grad_norm": 0.48834281779827204,
"learning_rate": 1.970474926637418e-05,
"loss": 1.1766,
"step": 329
},
{
"epoch": 0.8521626856036152,
"grad_norm": 0.47045086079257925,
"learning_rate": 1.9700381036198278e-05,
"loss": 1.1733,
"step": 330
},
{
"epoch": 0.854744996772111,
"grad_norm": 0.5046592134608324,
"learning_rate": 1.9695981220295242e-05,
"loss": 1.2065,
"step": 331
},
{
"epoch": 0.8573273079406069,
"grad_norm": 0.4815780307147996,
"learning_rate": 1.9691549832991455e-05,
"loss": 1.1641,
"step": 332
},
{
"epoch": 0.8599096191091027,
"grad_norm": 0.47306399270721433,
"learning_rate": 1.96870868887161e-05,
"loss": 1.2106,
"step": 333
},
{
"epoch": 0.8624919302775984,
"grad_norm": 0.48177902227189173,
"learning_rate": 1.968259240200112e-05,
"loss": 1.1867,
"step": 334
},
{
"epoch": 0.8650742414460942,
"grad_norm": 0.5109266879563596,
"learning_rate": 1.967806638748116e-05,
"loss": 1.1835,
"step": 335
},
{
"epoch": 0.8676565526145901,
"grad_norm": 0.4886245722535642,
"learning_rate": 1.9673508859893515e-05,
"loss": 1.1687,
"step": 336
},
{
"epoch": 0.8702388637830859,
"grad_norm": 0.5248768069436597,
"learning_rate": 1.966891983407811e-05,
"loss": 1.1984,
"step": 337
},
{
"epoch": 0.8728211749515816,
"grad_norm": 0.4943514825391058,
"learning_rate": 1.9664299324977412e-05,
"loss": 1.1891,
"step": 338
},
{
"epoch": 0.8754034861200775,
"grad_norm": 0.47129413485615096,
"learning_rate": 1.9659647347636422e-05,
"loss": 1.1586,
"step": 339
},
{
"epoch": 0.8779857972885733,
"grad_norm": 0.49462243772822845,
"learning_rate": 1.9654963917202586e-05,
"loss": 1.1558,
"step": 340
},
{
"epoch": 0.8805681084570691,
"grad_norm": 0.4861824324614408,
"learning_rate": 1.965024904892578e-05,
"loss": 1.1683,
"step": 341
},
{
"epoch": 0.8831504196255648,
"grad_norm": 0.4951569265192093,
"learning_rate": 1.9645502758158234e-05,
"loss": 1.2037,
"step": 342
},
{
"epoch": 0.8857327307940607,
"grad_norm": 0.46700115170322853,
"learning_rate": 1.9640725060354508e-05,
"loss": 1.1142,
"step": 343
},
{
"epoch": 0.8883150419625565,
"grad_norm": 0.5117387702694532,
"learning_rate": 1.963591597107142e-05,
"loss": 1.1944,
"step": 344
},
{
"epoch": 0.8908973531310523,
"grad_norm": 0.46783399641185575,
"learning_rate": 1.9631075505967993e-05,
"loss": 1.1802,
"step": 345
},
{
"epoch": 0.8934796642995481,
"grad_norm": 0.49795421380846105,
"learning_rate": 1.9626203680805432e-05,
"loss": 1.1814,
"step": 346
},
{
"epoch": 0.8960619754680439,
"grad_norm": 0.4861596444598828,
"learning_rate": 1.9621300511447043e-05,
"loss": 1.1825,
"step": 347
},
{
"epoch": 0.8986442866365397,
"grad_norm": 0.4953435099828471,
"learning_rate": 1.9616366013858195e-05,
"loss": 1.161,
"step": 348
},
{
"epoch": 0.9012265978050356,
"grad_norm": 0.5023878227402143,
"learning_rate": 1.961140020410627e-05,
"loss": 1.1885,
"step": 349
},
{
"epoch": 0.9038089089735313,
"grad_norm": 0.5007001061995038,
"learning_rate": 1.9606403098360597e-05,
"loss": 1.1989,
"step": 350
},
{
"epoch": 0.9063912201420271,
"grad_norm": 0.4635439197533225,
"learning_rate": 1.960137471289242e-05,
"loss": 1.1302,
"step": 351
},
{
"epoch": 0.9089735313105229,
"grad_norm": 0.49621446281583526,
"learning_rate": 1.9596315064074826e-05,
"loss": 1.1991,
"step": 352
},
{
"epoch": 0.9115558424790188,
"grad_norm": 0.48901012014264644,
"learning_rate": 1.9591224168382708e-05,
"loss": 1.1818,
"step": 353
},
{
"epoch": 0.9141381536475145,
"grad_norm": 0.4823284694023606,
"learning_rate": 1.958610204239269e-05,
"loss": 1.1464,
"step": 354
},
{
"epoch": 0.9167204648160103,
"grad_norm": 0.47062926472993427,
"learning_rate": 1.95809487027831e-05,
"loss": 1.1966,
"step": 355
},
{
"epoch": 0.9193027759845062,
"grad_norm": 0.4766355752313664,
"learning_rate": 1.9575764166333887e-05,
"loss": 1.1741,
"step": 356
},
{
"epoch": 0.921885087153002,
"grad_norm": 0.470588804103096,
"learning_rate": 1.95705484499266e-05,
"loss": 1.1425,
"step": 357
},
{
"epoch": 0.9244673983214977,
"grad_norm": 0.49029399234060356,
"learning_rate": 1.9565301570544297e-05,
"loss": 1.19,
"step": 358
},
{
"epoch": 0.9270497094899935,
"grad_norm": 0.4811321660417228,
"learning_rate": 1.9560023545271512e-05,
"loss": 1.1617,
"step": 359
},
{
"epoch": 0.9296320206584894,
"grad_norm": 0.47791045452781805,
"learning_rate": 1.9554714391294198e-05,
"loss": 1.1349,
"step": 360
},
{
"epoch": 0.9322143318269851,
"grad_norm": 0.45457900915984395,
"learning_rate": 1.9549374125899665e-05,
"loss": 1.1697,
"step": 361
},
{
"epoch": 0.9347966429954809,
"grad_norm": 0.4783672298922519,
"learning_rate": 1.9544002766476523e-05,
"loss": 1.1779,
"step": 362
},
{
"epoch": 0.9373789541639768,
"grad_norm": 0.5043177366580075,
"learning_rate": 1.953860033051463e-05,
"loss": 1.16,
"step": 363
},
{
"epoch": 0.9399612653324726,
"grad_norm": 0.5253476046397387,
"learning_rate": 1.953316683560504e-05,
"loss": 1.2074,
"step": 364
},
{
"epoch": 0.9425435765009683,
"grad_norm": 0.4830105807169101,
"learning_rate": 1.9527702299439925e-05,
"loss": 1.1598,
"step": 365
},
{
"epoch": 0.9451258876694641,
"grad_norm": 0.4603907599438401,
"learning_rate": 1.9522206739812546e-05,
"loss": 1.1511,
"step": 366
},
{
"epoch": 0.94770819883796,
"grad_norm": 0.45849247679852523,
"learning_rate": 1.9516680174617168e-05,
"loss": 1.1873,
"step": 367
},
{
"epoch": 0.9502905100064558,
"grad_norm": 0.4826500481926017,
"learning_rate": 1.9511122621849025e-05,
"loss": 1.187,
"step": 368
},
{
"epoch": 0.9528728211749515,
"grad_norm": 0.4577162756249727,
"learning_rate": 1.9505534099604245e-05,
"loss": 1.1611,
"step": 369
},
{
"epoch": 0.9554551323434474,
"grad_norm": 0.4674250946156371,
"learning_rate": 1.94999146260798e-05,
"loss": 1.164,
"step": 370
},
{
"epoch": 0.9580374435119432,
"grad_norm": 0.46337504643140265,
"learning_rate": 1.9494264219573433e-05,
"loss": 1.1898,
"step": 371
},
{
"epoch": 0.960619754680439,
"grad_norm": 0.48205345839710323,
"learning_rate": 1.9488582898483625e-05,
"loss": 1.1641,
"step": 372
},
{
"epoch": 0.9632020658489348,
"grad_norm": 0.4927156506373219,
"learning_rate": 1.9482870681309502e-05,
"loss": 1.1526,
"step": 373
},
{
"epoch": 0.9657843770174306,
"grad_norm": 0.49774685339253233,
"learning_rate": 1.9477127586650812e-05,
"loss": 1.1513,
"step": 374
},
{
"epoch": 0.9683666881859264,
"grad_norm": 0.5493094035957639,
"learning_rate": 1.9471353633207824e-05,
"loss": 1.2067,
"step": 375
},
{
"epoch": 0.9709489993544222,
"grad_norm": 0.4695400511675006,
"learning_rate": 1.94655488397813e-05,
"loss": 1.1575,
"step": 376
},
{
"epoch": 0.973531310522918,
"grad_norm": 0.5159667326407505,
"learning_rate": 1.9459713225272422e-05,
"loss": 1.1785,
"step": 377
},
{
"epoch": 0.9761136216914138,
"grad_norm": 0.5216341313782504,
"learning_rate": 1.9453846808682713e-05,
"loss": 1.1446,
"step": 378
},
{
"epoch": 0.9786959328599096,
"grad_norm": 0.5182964074330236,
"learning_rate": 1.9447949609114018e-05,
"loss": 1.1432,
"step": 379
},
{
"epoch": 0.9812782440284055,
"grad_norm": 0.485083144408532,
"learning_rate": 1.9442021645768383e-05,
"loss": 1.1275,
"step": 380
},
{
"epoch": 0.9838605551969012,
"grad_norm": 0.4599544335039025,
"learning_rate": 1.9436062937948058e-05,
"loss": 1.1151,
"step": 381
},
{
"epoch": 0.986442866365397,
"grad_norm": 0.521054277469576,
"learning_rate": 1.943007350505538e-05,
"loss": 1.2012,
"step": 382
},
{
"epoch": 0.9890251775338929,
"grad_norm": 0.4751977247155287,
"learning_rate": 1.942405336659274e-05,
"loss": 1.1797,
"step": 383
},
{
"epoch": 0.9916074887023887,
"grad_norm": 0.47658638371046963,
"learning_rate": 1.94180025421625e-05,
"loss": 1.1392,
"step": 384
},
{
"epoch": 0.9941897998708844,
"grad_norm": 0.4712331278213858,
"learning_rate": 1.9411921051466952e-05,
"loss": 1.141,
"step": 385
},
{
"epoch": 0.9967721110393802,
"grad_norm": 0.49755103398259565,
"learning_rate": 1.9405808914308236e-05,
"loss": 1.1328,
"step": 386
},
{
"epoch": 0.9993544222078761,
"grad_norm": 0.4847898684583998,
"learning_rate": 1.9399666150588286e-05,
"loss": 1.1669,
"step": 387
},
{
"epoch": 1.0,
"grad_norm": 0.4847898684583998,
"learning_rate": 1.9393492780308745e-05,
"loss": 1.1861,
"step": 388
},
{
"epoch": 1.0025823111684957,
"grad_norm": 1.1014826257650279,
"learning_rate": 1.938728882357093e-05,
"loss": 1.0917,
"step": 389
},
{
"epoch": 1.0051646223369917,
"grad_norm": 0.5639388688605913,
"learning_rate": 1.938105430057575e-05,
"loss": 1.0903,
"step": 390
},
{
"epoch": 1.0077469335054874,
"grad_norm": 0.6443566019067333,
"learning_rate": 1.9374789231623636e-05,
"loss": 1.1009,
"step": 391
},
{
"epoch": 1.010329244673983,
"grad_norm": 0.630435005940545,
"learning_rate": 1.9368493637114483e-05,
"loss": 1.1003,
"step": 392
},
{
"epoch": 1.012911555842479,
"grad_norm": 0.5108574939050173,
"learning_rate": 1.936216753754758e-05,
"loss": 1.0711,
"step": 393
},
{
"epoch": 1.0154938670109748,
"grad_norm": 0.608458106332031,
"learning_rate": 1.9355810953521556e-05,
"loss": 1.0595,
"step": 394
},
{
"epoch": 1.0180761781794707,
"grad_norm": 0.5458550950072169,
"learning_rate": 1.934942390573428e-05,
"loss": 1.0943,
"step": 395
},
{
"epoch": 1.0206584893479664,
"grad_norm": 0.5725197447406076,
"learning_rate": 1.9343006414982827e-05,
"loss": 1.0715,
"step": 396
},
{
"epoch": 1.0232408005164622,
"grad_norm": 0.5707836595715814,
"learning_rate": 1.9336558502163404e-05,
"loss": 1.0845,
"step": 397
},
{
"epoch": 1.025823111684958,
"grad_norm": 0.5436343954371967,
"learning_rate": 1.933008018827127e-05,
"loss": 1.1252,
"step": 398
},
{
"epoch": 1.0284054228534538,
"grad_norm": 0.5722847310225982,
"learning_rate": 1.932357149440067e-05,
"loss": 1.0698,
"step": 399
},
{
"epoch": 1.0309877340219495,
"grad_norm": 0.5620996777465161,
"learning_rate": 1.9317032441744778e-05,
"loss": 1.0999,
"step": 400
},
{
"epoch": 1.0335700451904455,
"grad_norm": 0.6088252142123551,
"learning_rate": 1.9310463051595612e-05,
"loss": 1.1462,
"step": 401
},
{
"epoch": 1.0361523563589412,
"grad_norm": 0.5076462836433528,
"learning_rate": 1.9303863345343985e-05,
"loss": 1.0755,
"step": 402
},
{
"epoch": 1.0387346675274371,
"grad_norm": 0.5109910409402665,
"learning_rate": 1.929723334447941e-05,
"loss": 1.0821,
"step": 403
},
{
"epoch": 1.0413169786959329,
"grad_norm": 0.5636026180024839,
"learning_rate": 1.9290573070590053e-05,
"loss": 1.079,
"step": 404
},
{
"epoch": 1.0438992898644286,
"grad_norm": 0.5112896124943734,
"learning_rate": 1.9283882545362642e-05,
"loss": 1.104,
"step": 405
},
{
"epoch": 1.0464816010329245,
"grad_norm": 0.5515513629476969,
"learning_rate": 1.9277161790582425e-05,
"loss": 1.094,
"step": 406
},
{
"epoch": 1.0490639122014203,
"grad_norm": 0.514886116372971,
"learning_rate": 1.9270410828133062e-05,
"loss": 1.085,
"step": 407
},
{
"epoch": 1.051646223369916,
"grad_norm": 0.5331151636377134,
"learning_rate": 1.9263629679996582e-05,
"loss": 1.1028,
"step": 408
},
{
"epoch": 1.054228534538412,
"grad_norm": 0.5165475698281132,
"learning_rate": 1.925681836825331e-05,
"loss": 1.0848,
"step": 409
},
{
"epoch": 1.0568108457069076,
"grad_norm": 0.5366595294372662,
"learning_rate": 1.9249976915081773e-05,
"loss": 1.1015,
"step": 410
},
{
"epoch": 1.0593931568754036,
"grad_norm": 0.5193038464689262,
"learning_rate": 1.9243105342758657e-05,
"loss": 1.0782,
"step": 411
},
{
"epoch": 1.0619754680438993,
"grad_norm": 0.5009182548303415,
"learning_rate": 1.923620367365871e-05,
"loss": 1.0516,
"step": 412
},
{
"epoch": 1.064557779212395,
"grad_norm": 0.5029748452840229,
"learning_rate": 1.922927193025468e-05,
"loss": 1.0781,
"step": 413
},
{
"epoch": 1.067140090380891,
"grad_norm": 0.5196114525480685,
"learning_rate": 1.922231013511724e-05,
"loss": 1.0873,
"step": 414
},
{
"epoch": 1.0697224015493867,
"grad_norm": 0.5294324303655081,
"learning_rate": 1.921531831091492e-05,
"loss": 1.0955,
"step": 415
},
{
"epoch": 1.0723047127178824,
"grad_norm": 0.5264181862085819,
"learning_rate": 1.9208296480414034e-05,
"loss": 1.0849,
"step": 416
},
{
"epoch": 1.0748870238863784,
"grad_norm": 0.5072255528456242,
"learning_rate": 1.9201244666478586e-05,
"loss": 1.0865,
"step": 417
},
{
"epoch": 1.077469335054874,
"grad_norm": 0.5043454850061722,
"learning_rate": 1.919416289207022e-05,
"loss": 1.1016,
"step": 418
},
{
"epoch": 1.08005164622337,
"grad_norm": 0.521923847407557,
"learning_rate": 1.9187051180248134e-05,
"loss": 1.1006,
"step": 419
},
{
"epoch": 1.0826339573918657,
"grad_norm": 0.5106595879796177,
"learning_rate": 1.9179909554169002e-05,
"loss": 1.0947,
"step": 420
},
{
"epoch": 1.0852162685603615,
"grad_norm": 0.5021486916963932,
"learning_rate": 1.9172738037086905e-05,
"loss": 1.0763,
"step": 421
},
{
"epoch": 1.0877985797288574,
"grad_norm": 0.5348145469187987,
"learning_rate": 1.9165536652353256e-05,
"loss": 1.1169,
"step": 422
},
{
"epoch": 1.0903808908973531,
"grad_norm": 0.49988703400762524,
"learning_rate": 1.915830542341672e-05,
"loss": 1.1116,
"step": 423
},
{
"epoch": 1.0929632020658488,
"grad_norm": 0.5234949485004682,
"learning_rate": 1.915104437382313e-05,
"loss": 1.088,
"step": 424
},
{
"epoch": 1.0955455132343448,
"grad_norm": 0.5079748564445703,
"learning_rate": 1.9143753527215437e-05,
"loss": 1.0716,
"step": 425
},
{
"epoch": 1.0981278244028405,
"grad_norm": 0.5084741143233746,
"learning_rate": 1.91364329073336e-05,
"loss": 1.0913,
"step": 426
},
{
"epoch": 1.1007101355713362,
"grad_norm": 0.5110683137296181,
"learning_rate": 1.912908253801453e-05,
"loss": 1.0376,
"step": 427
},
{
"epoch": 1.1032924467398322,
"grad_norm": 0.5104932914801271,
"learning_rate": 1.9121702443191994e-05,
"loss": 1.0499,
"step": 428
},
{
"epoch": 1.105874757908328,
"grad_norm": 0.5311882952530959,
"learning_rate": 1.9114292646896574e-05,
"loss": 1.0875,
"step": 429
},
{
"epoch": 1.1084570690768238,
"grad_norm": 0.4879682050554302,
"learning_rate": 1.910685317325554e-05,
"loss": 1.0851,
"step": 430
},
{
"epoch": 1.1110393802453196,
"grad_norm": 0.5107417160978968,
"learning_rate": 1.9099384046492807e-05,
"loss": 1.1234,
"step": 431
},
{
"epoch": 1.1136216914138153,
"grad_norm": 0.5323164257143238,
"learning_rate": 1.9091885290928846e-05,
"loss": 1.1051,
"step": 432
},
{
"epoch": 1.1162040025823112,
"grad_norm": 0.5062938072356562,
"learning_rate": 1.9084356930980593e-05,
"loss": 1.1316,
"step": 433
},
{
"epoch": 1.118786313750807,
"grad_norm": 0.5155486221780639,
"learning_rate": 1.9076798991161395e-05,
"loss": 1.1078,
"step": 434
},
{
"epoch": 1.121368624919303,
"grad_norm": 0.5225888914215889,
"learning_rate": 1.90692114960809e-05,
"loss": 1.0605,
"step": 435
},
{
"epoch": 1.1239509360877986,
"grad_norm": 0.5218586332404057,
"learning_rate": 1.9061594470445e-05,
"loss": 1.0544,
"step": 436
},
{
"epoch": 1.1265332472562943,
"grad_norm": 0.5063750610874502,
"learning_rate": 1.9053947939055737e-05,
"loss": 1.1166,
"step": 437
},
{
"epoch": 1.1291155584247903,
"grad_norm": 0.5243484649411593,
"learning_rate": 1.9046271926811238e-05,
"loss": 1.0726,
"step": 438
},
{
"epoch": 1.131697869593286,
"grad_norm": 0.5495963106537264,
"learning_rate": 1.9038566458705615e-05,
"loss": 1.1032,
"step": 439
},
{
"epoch": 1.1342801807617817,
"grad_norm": 0.4964414774064479,
"learning_rate": 1.903083155982889e-05,
"loss": 1.1079,
"step": 440
},
{
"epoch": 1.1368624919302777,
"grad_norm": 0.5147288498020974,
"learning_rate": 1.902306725536692e-05,
"loss": 1.1177,
"step": 441
},
{
"epoch": 1.1394448030987734,
"grad_norm": 0.5327839820642644,
"learning_rate": 1.9015273570601316e-05,
"loss": 1.075,
"step": 442
},
{
"epoch": 1.142027114267269,
"grad_norm": 0.528940491782197,
"learning_rate": 1.9007450530909345e-05,
"loss": 1.086,
"step": 443
},
{
"epoch": 1.144609425435765,
"grad_norm": 0.5495937382324539,
"learning_rate": 1.899959816176386e-05,
"loss": 1.0836,
"step": 444
},
{
"epoch": 1.1471917366042608,
"grad_norm": 0.5259398393873413,
"learning_rate": 1.899171648873322e-05,
"loss": 1.0777,
"step": 445
},
{
"epoch": 1.1497740477727567,
"grad_norm": 0.5301129579516534,
"learning_rate": 1.8983805537481196e-05,
"loss": 1.072,
"step": 446
},
{
"epoch": 1.1523563589412524,
"grad_norm": 0.5472484501403468,
"learning_rate": 1.8975865333766895e-05,
"loss": 1.053,
"step": 447
},
{
"epoch": 1.1549386701097482,
"grad_norm": 0.521949307121031,
"learning_rate": 1.8967895903444672e-05,
"loss": 1.0818,
"step": 448
},
{
"epoch": 1.157520981278244,
"grad_norm": 0.5182353015878332,
"learning_rate": 1.895989727246405e-05,
"loss": 1.1152,
"step": 449
},
{
"epoch": 1.1601032924467398,
"grad_norm": 0.5214919591138573,
"learning_rate": 1.895186946686964e-05,
"loss": 1.1376,
"step": 450
},
{
"epoch": 1.1626856036152355,
"grad_norm": 0.5173501815788256,
"learning_rate": 1.8943812512801023e-05,
"loss": 1.1067,
"step": 451
},
{
"epoch": 1.1652679147837315,
"grad_norm": 0.5017112933282827,
"learning_rate": 1.8935726436492724e-05,
"loss": 1.0956,
"step": 452
},
{
"epoch": 1.1678502259522272,
"grad_norm": 0.5054557050103587,
"learning_rate": 1.8927611264274078e-05,
"loss": 1.0743,
"step": 453
},
{
"epoch": 1.170432537120723,
"grad_norm": 0.5038866398853414,
"learning_rate": 1.8919467022569163e-05,
"loss": 1.0663,
"step": 454
},
{
"epoch": 1.1730148482892189,
"grad_norm": 0.5069505760103037,
"learning_rate": 1.8911293737896706e-05,
"loss": 1.0918,
"step": 455
},
{
"epoch": 1.1755971594577146,
"grad_norm": 0.49669266787942645,
"learning_rate": 1.890309143687001e-05,
"loss": 1.0768,
"step": 456
},
{
"epoch": 1.1781794706262105,
"grad_norm": 0.5059342818530757,
"learning_rate": 1.8894860146196848e-05,
"loss": 1.0606,
"step": 457
},
{
"epoch": 1.1807617817947063,
"grad_norm": 0.49213032174231053,
"learning_rate": 1.88865998926794e-05,
"loss": 1.0538,
"step": 458
},
{
"epoch": 1.183344092963202,
"grad_norm": 0.497950829610666,
"learning_rate": 1.8878310703214148e-05,
"loss": 1.0797,
"step": 459
},
{
"epoch": 1.185926404131698,
"grad_norm": 0.5085396242287622,
"learning_rate": 1.8869992604791786e-05,
"loss": 1.0652,
"step": 460
},
{
"epoch": 1.1885087153001936,
"grad_norm": 0.5050911514318284,
"learning_rate": 1.8861645624497154e-05,
"loss": 1.0744,
"step": 461
},
{
"epoch": 1.1910910264686896,
"grad_norm": 0.518765301787044,
"learning_rate": 1.885326978950912e-05,
"loss": 1.1027,
"step": 462
},
{
"epoch": 1.1936733376371853,
"grad_norm": 0.49638458680440195,
"learning_rate": 1.8844865127100517e-05,
"loss": 1.0804,
"step": 463
},
{
"epoch": 1.196255648805681,
"grad_norm": 0.5183830709936179,
"learning_rate": 1.883643166463804e-05,
"loss": 1.0966,
"step": 464
},
{
"epoch": 1.198837959974177,
"grad_norm": 0.49817074390497323,
"learning_rate": 1.882796942958216e-05,
"loss": 1.0786,
"step": 465
},
{
"epoch": 1.2014202711426727,
"grad_norm": 0.501779662690412,
"learning_rate": 1.8819478449487034e-05,
"loss": 1.0586,
"step": 466
},
{
"epoch": 1.2040025823111684,
"grad_norm": 0.5054852727426871,
"learning_rate": 1.8810958752000426e-05,
"loss": 1.0553,
"step": 467
},
{
"epoch": 1.2065848934796644,
"grad_norm": 0.5016427339348701,
"learning_rate": 1.8802410364863598e-05,
"loss": 1.0943,
"step": 468
},
{
"epoch": 1.20916720464816,
"grad_norm": 0.49091502171551293,
"learning_rate": 1.879383331591123e-05,
"loss": 1.1174,
"step": 469
},
{
"epoch": 1.2117495158166558,
"grad_norm": 0.5378342182934727,
"learning_rate": 1.8785227633071332e-05,
"loss": 1.0729,
"step": 470
},
{
"epoch": 1.2143318269851517,
"grad_norm": 0.5106316461830734,
"learning_rate": 1.877659334436515e-05,
"loss": 1.1085,
"step": 471
},
{
"epoch": 1.2169141381536475,
"grad_norm": 0.5098246784805628,
"learning_rate": 1.8767930477907074e-05,
"loss": 1.0957,
"step": 472
},
{
"epoch": 1.2194964493221434,
"grad_norm": 0.506477376675333,
"learning_rate": 1.875923906190454e-05,
"loss": 1.0918,
"step": 473
},
{
"epoch": 1.2220787604906391,
"grad_norm": 0.5004090128286083,
"learning_rate": 1.875051912465796e-05,
"loss": 1.1018,
"step": 474
},
{
"epoch": 1.2246610716591348,
"grad_norm": 0.5117533679075642,
"learning_rate": 1.8741770694560598e-05,
"loss": 1.0592,
"step": 475
},
{
"epoch": 1.2272433828276308,
"grad_norm": 0.5078739636230806,
"learning_rate": 1.873299380009851e-05,
"loss": 1.0705,
"step": 476
},
{
"epoch": 1.2298256939961265,
"grad_norm": 0.5143902933122632,
"learning_rate": 1.8724188469850423e-05,
"loss": 1.0774,
"step": 477
},
{
"epoch": 1.2324080051646225,
"grad_norm": 0.49232344984603593,
"learning_rate": 1.871535473248766e-05,
"loss": 1.0548,
"step": 478
},
{
"epoch": 1.2349903163331182,
"grad_norm": 0.49486310972586456,
"learning_rate": 1.8706492616774043e-05,
"loss": 1.084,
"step": 479
},
{
"epoch": 1.237572627501614,
"grad_norm": 0.5178421153087599,
"learning_rate": 1.86976021515658e-05,
"loss": 1.0907,
"step": 480
},
{
"epoch": 1.2401549386701098,
"grad_norm": 0.524983289209404,
"learning_rate": 1.8688683365811456e-05,
"loss": 1.0863,
"step": 481
},
{
"epoch": 1.2427372498386056,
"grad_norm": 0.5045481433976463,
"learning_rate": 1.867973628855177e-05,
"loss": 1.1053,
"step": 482
},
{
"epoch": 1.2453195610071013,
"grad_norm": 0.5091501092813715,
"learning_rate": 1.8670760948919608e-05,
"loss": 1.1194,
"step": 483
},
{
"epoch": 1.2479018721755972,
"grad_norm": 0.49967798365538585,
"learning_rate": 1.8661757376139858e-05,
"loss": 1.0845,
"step": 484
},
{
"epoch": 1.250484183344093,
"grad_norm": 0.4980457942019374,
"learning_rate": 1.865272559952936e-05,
"loss": 1.0601,
"step": 485
},
{
"epoch": 1.2530664945125887,
"grad_norm": 0.4972111559723752,
"learning_rate": 1.864366564849677e-05,
"loss": 1.0907,
"step": 486
},
{
"epoch": 1.2556488056810846,
"grad_norm": 0.5065337348249978,
"learning_rate": 1.8634577552542492e-05,
"loss": 1.0795,
"step": 487
},
{
"epoch": 1.2582311168495803,
"grad_norm": 0.513607026130961,
"learning_rate": 1.862546134125857e-05,
"loss": 1.0436,
"step": 488
},
{
"epoch": 1.2608134280180763,
"grad_norm": 0.5225707833426417,
"learning_rate": 1.86163170443286e-05,
"loss": 1.0857,
"step": 489
},
{
"epoch": 1.263395739186572,
"grad_norm": 0.5022229029307232,
"learning_rate": 1.860714469152762e-05,
"loss": 1.0872,
"step": 490
},
{
"epoch": 1.2659780503550677,
"grad_norm": 0.5010606447774606,
"learning_rate": 1.859794431272203e-05,
"loss": 1.1187,
"step": 491
},
{
"epoch": 1.2685603615235637,
"grad_norm": 0.5014036896762357,
"learning_rate": 1.8588715937869487e-05,
"loss": 1.0601,
"step": 492
},
{
"epoch": 1.2711426726920594,
"grad_norm": 0.5062378400840666,
"learning_rate": 1.8579459597018798e-05,
"loss": 1.0733,
"step": 493
},
{
"epoch": 1.2737249838605553,
"grad_norm": 0.49322055025801476,
"learning_rate": 1.857017532030984e-05,
"loss": 1.081,
"step": 494
},
{
"epoch": 1.276307295029051,
"grad_norm": 0.4985978212218064,
"learning_rate": 1.8560863137973447e-05,
"loss": 1.0824,
"step": 495
},
{
"epoch": 1.2788896061975468,
"grad_norm": 0.5035778345625428,
"learning_rate": 1.8551523080331324e-05,
"loss": 1.0875,
"step": 496
},
{
"epoch": 1.2814719173660425,
"grad_norm": 0.5035572503139835,
"learning_rate": 1.854215517779593e-05,
"loss": 1.085,
"step": 497
},
{
"epoch": 1.2840542285345384,
"grad_norm": 0.5464525852922026,
"learning_rate": 1.8532759460870407e-05,
"loss": 1.0964,
"step": 498
},
{
"epoch": 1.2866365397030342,
"grad_norm": 0.5069778816485006,
"learning_rate": 1.8523335960148446e-05,
"loss": 1.1123,
"step": 499
},
{
"epoch": 1.28921885087153,
"grad_norm": 0.5106529164099722,
"learning_rate": 1.8513884706314224e-05,
"loss": 1.0752,
"step": 500
},
{
"epoch": 1.2918011620400258,
"grad_norm": 0.5009571583608718,
"learning_rate": 1.8504405730142267e-05,
"loss": 1.0549,
"step": 501
},
{
"epoch": 1.2943834732085215,
"grad_norm": 0.4969158537205058,
"learning_rate": 1.849489906249739e-05,
"loss": 1.0756,
"step": 502
},
{
"epoch": 1.2969657843770175,
"grad_norm": 0.5336056726258084,
"learning_rate": 1.8485364734334555e-05,
"loss": 1.0765,
"step": 503
},
{
"epoch": 1.2995480955455132,
"grad_norm": 0.5171474777619819,
"learning_rate": 1.84758027766988e-05,
"loss": 1.0919,
"step": 504
},
{
"epoch": 1.3021304067140091,
"grad_norm": 0.5120748195398405,
"learning_rate": 1.8466213220725133e-05,
"loss": 1.092,
"step": 505
},
{
"epoch": 1.3047127178825049,
"grad_norm": 0.5018571042434719,
"learning_rate": 1.8456596097638414e-05,
"loss": 1.0857,
"step": 506
},
{
"epoch": 1.3072950290510006,
"grad_norm": 0.5230975347071961,
"learning_rate": 1.8446951438753272e-05,
"loss": 1.0498,
"step": 507
},
{
"epoch": 1.3098773402194963,
"grad_norm": 0.5217912907178898,
"learning_rate": 1.8437279275474e-05,
"loss": 1.0744,
"step": 508
},
{
"epoch": 1.3124596513879923,
"grad_norm": 0.5317696621552458,
"learning_rate": 1.8427579639294436e-05,
"loss": 1.0914,
"step": 509
},
{
"epoch": 1.315041962556488,
"grad_norm": 0.49394980658380616,
"learning_rate": 1.841785256179789e-05,
"loss": 1.1051,
"step": 510
},
{
"epoch": 1.317624273724984,
"grad_norm": 0.5217876781301438,
"learning_rate": 1.840809807465701e-05,
"loss": 1.1077,
"step": 511
},
{
"epoch": 1.3202065848934796,
"grad_norm": 0.49490066616909967,
"learning_rate": 1.839831620963371e-05,
"loss": 1.0961,
"step": 512
},
{
"epoch": 1.3227888960619754,
"grad_norm": 0.5083891511636687,
"learning_rate": 1.8388506998579025e-05,
"loss": 1.0532,
"step": 513
},
{
"epoch": 1.3253712072304713,
"grad_norm": 0.5242811566268283,
"learning_rate": 1.837867047343306e-05,
"loss": 1.1178,
"step": 514
},
{
"epoch": 1.327953518398967,
"grad_norm": 0.5281173798352883,
"learning_rate": 1.8368806666224836e-05,
"loss": 1.1301,
"step": 515
},
{
"epoch": 1.330535829567463,
"grad_norm": 0.5022194856315405,
"learning_rate": 1.8358915609072223e-05,
"loss": 1.0753,
"step": 516
},
{
"epoch": 1.3331181407359587,
"grad_norm": 0.5275458989077836,
"learning_rate": 1.8348997334181815e-05,
"loss": 1.0857,
"step": 517
},
{
"epoch": 1.3357004519044544,
"grad_norm": 0.5261446317192925,
"learning_rate": 1.833905187384883e-05,
"loss": 1.0724,
"step": 518
},
{
"epoch": 1.3382827630729504,
"grad_norm": 0.4909880164376135,
"learning_rate": 1.8329079260457e-05,
"loss": 1.0867,
"step": 519
},
{
"epoch": 1.340865074241446,
"grad_norm": 0.538007883447562,
"learning_rate": 1.8319079526478487e-05,
"loss": 1.1477,
"step": 520
},
{
"epoch": 1.343447385409942,
"grad_norm": 0.5259603760732874,
"learning_rate": 1.830905270447374e-05,
"loss": 1.0871,
"step": 521
},
{
"epoch": 1.3460296965784377,
"grad_norm": 0.5210088026374243,
"learning_rate": 1.829899882709143e-05,
"loss": 1.0548,
"step": 522
},
{
"epoch": 1.3486120077469335,
"grad_norm": 0.5274100435358815,
"learning_rate": 1.8288917927068315e-05,
"loss": 1.0874,
"step": 523
},
{
"epoch": 1.3511943189154292,
"grad_norm": 0.509881963592761,
"learning_rate": 1.8278810037229134e-05,
"loss": 1.0692,
"step": 524
},
{
"epoch": 1.3537766300839251,
"grad_norm": 0.5425562492866538,
"learning_rate": 1.8268675190486524e-05,
"loss": 1.0896,
"step": 525
},
{
"epoch": 1.3563589412524208,
"grad_norm": 0.5172299722903951,
"learning_rate": 1.825851341984089e-05,
"loss": 1.0332,
"step": 526
},
{
"epoch": 1.3589412524209168,
"grad_norm": 0.5113811516912794,
"learning_rate": 1.82483247583803e-05,
"loss": 1.1021,
"step": 527
},
{
"epoch": 1.3615235635894125,
"grad_norm": 0.4972938127748451,
"learning_rate": 1.8238109239280393e-05,
"loss": 1.0932,
"step": 528
},
{
"epoch": 1.3641058747579082,
"grad_norm": 0.516789056255083,
"learning_rate": 1.822786689580425e-05,
"loss": 1.0544,
"step": 529
},
{
"epoch": 1.3666881859264042,
"grad_norm": 0.55011290621819,
"learning_rate": 1.8217597761302298e-05,
"loss": 1.1075,
"step": 530
},
{
"epoch": 1.3692704970949,
"grad_norm": 0.5273282578927786,
"learning_rate": 1.8207301869212207e-05,
"loss": 1.0642,
"step": 531
},
{
"epoch": 1.3718528082633958,
"grad_norm": 0.5212309533973544,
"learning_rate": 1.8196979253058765e-05,
"loss": 1.1039,
"step": 532
},
{
"epoch": 1.3744351194318916,
"grad_norm": 0.5234164457393133,
"learning_rate": 1.8186629946453774e-05,
"loss": 1.0697,
"step": 533
},
{
"epoch": 1.3770174306003873,
"grad_norm": 0.5143378010325881,
"learning_rate": 1.8176253983095958e-05,
"loss": 1.059,
"step": 534
},
{
"epoch": 1.3795997417688832,
"grad_norm": 0.5202992098003197,
"learning_rate": 1.816585139677082e-05,
"loss": 1.0832,
"step": 535
},
{
"epoch": 1.382182052937379,
"grad_norm": 0.5137267251756589,
"learning_rate": 1.8155422221350566e-05,
"loss": 1.077,
"step": 536
},
{
"epoch": 1.384764364105875,
"grad_norm": 0.5206482372388472,
"learning_rate": 1.8144966490793973e-05,
"loss": 1.0808,
"step": 537
},
{
"epoch": 1.3873466752743706,
"grad_norm": 0.4977846119013132,
"learning_rate": 1.813448423914629e-05,
"loss": 1.0889,
"step": 538
},
{
"epoch": 1.3899289864428663,
"grad_norm": 0.5038953337541946,
"learning_rate": 1.8123975500539114e-05,
"loss": 1.0517,
"step": 539
},
{
"epoch": 1.392511297611362,
"grad_norm": 0.48358165460230973,
"learning_rate": 1.811344030919029e-05,
"loss": 1.0637,
"step": 540
},
{
"epoch": 1.395093608779858,
"grad_norm": 0.5010075364080836,
"learning_rate": 1.8102878699403804e-05,
"loss": 1.0718,
"step": 541
},
{
"epoch": 1.3976759199483537,
"grad_norm": 0.5031248131184225,
"learning_rate": 1.8092290705569655e-05,
"loss": 1.08,
"step": 542
},
{
"epoch": 1.4002582311168497,
"grad_norm": 0.4906664062118931,
"learning_rate": 1.8081676362163757e-05,
"loss": 1.0582,
"step": 543
},
{
"epoch": 1.4028405422853454,
"grad_norm": 0.5121501864467626,
"learning_rate": 1.8071035703747816e-05,
"loss": 1.0751,
"step": 544
},
{
"epoch": 1.405422853453841,
"grad_norm": 0.524910024204937,
"learning_rate": 1.806036876496923e-05,
"loss": 1.0547,
"step": 545
},
{
"epoch": 1.408005164622337,
"grad_norm": 0.5209029065997814,
"learning_rate": 1.8049675580560965e-05,
"loss": 1.1205,
"step": 546
},
{
"epoch": 1.4105874757908328,
"grad_norm": 0.48220028684533556,
"learning_rate": 1.8038956185341452e-05,
"loss": 1.0426,
"step": 547
},
{
"epoch": 1.4131697869593287,
"grad_norm": 0.503658857829128,
"learning_rate": 1.8028210614214458e-05,
"loss": 1.0772,
"step": 548
},
{
"epoch": 1.4157520981278244,
"grad_norm": 0.5055334860886664,
"learning_rate": 1.8017438902168987e-05,
"loss": 1.0702,
"step": 549
},
{
"epoch": 1.4183344092963202,
"grad_norm": 0.5171324204492811,
"learning_rate": 1.800664108427917e-05,
"loss": 1.0609,
"step": 550
},
{
"epoch": 1.4209167204648159,
"grad_norm": 0.5189722895903278,
"learning_rate": 1.799581719570412e-05,
"loss": 1.0755,
"step": 551
},
{
"epoch": 1.4234990316333118,
"grad_norm": 0.5243160862800866,
"learning_rate": 1.798496727168787e-05,
"loss": 1.0739,
"step": 552
},
{
"epoch": 1.4260813428018075,
"grad_norm": 0.5317337215399274,
"learning_rate": 1.7974091347559197e-05,
"loss": 1.0711,
"step": 553
},
{
"epoch": 1.4286636539703035,
"grad_norm": 0.49461538156213314,
"learning_rate": 1.796318945873156e-05,
"loss": 1.1056,
"step": 554
},
{
"epoch": 1.4312459651387992,
"grad_norm": 0.5212170098454869,
"learning_rate": 1.795226164070296e-05,
"loss": 1.1166,
"step": 555
},
{
"epoch": 1.433828276307295,
"grad_norm": 0.5384906886276548,
"learning_rate": 1.7941307929055813e-05,
"loss": 1.0836,
"step": 556
},
{
"epoch": 1.4364105874757909,
"grad_norm": 0.538099607308631,
"learning_rate": 1.7930328359456856e-05,
"loss": 1.0563,
"step": 557
},
{
"epoch": 1.4389928986442866,
"grad_norm": 0.508125640639657,
"learning_rate": 1.791932296765703e-05,
"loss": 1.0862,
"step": 558
},
{
"epoch": 1.4415752098127825,
"grad_norm": 0.5258022272020693,
"learning_rate": 1.7908291789491348e-05,
"loss": 1.0947,
"step": 559
},
{
"epoch": 1.4441575209812783,
"grad_norm": 0.5292716445438939,
"learning_rate": 1.7897234860878783e-05,
"loss": 1.0953,
"step": 560
},
{
"epoch": 1.446739832149774,
"grad_norm": 0.4919901732224947,
"learning_rate": 1.7886152217822173e-05,
"loss": 1.0589,
"step": 561
},
{
"epoch": 1.44932214331827,
"grad_norm": 0.540399625807644,
"learning_rate": 1.7875043896408065e-05,
"loss": 1.0868,
"step": 562
},
{
"epoch": 1.4519044544867656,
"grad_norm": 0.5157274204814702,
"learning_rate": 1.7863909932806632e-05,
"loss": 1.055,
"step": 563
},
{
"epoch": 1.4544867656552616,
"grad_norm": 0.5157064713943694,
"learning_rate": 1.785275036327153e-05,
"loss": 1.0743,
"step": 564
},
{
"epoch": 1.4570690768237573,
"grad_norm": 0.5093111094768367,
"learning_rate": 1.7841565224139798e-05,
"loss": 1.0885,
"step": 565
},
{
"epoch": 1.459651387992253,
"grad_norm": 0.48350148375931845,
"learning_rate": 1.783035455183174e-05,
"loss": 1.0795,
"step": 566
},
{
"epoch": 1.4622336991607487,
"grad_norm": 0.5104024923814483,
"learning_rate": 1.781911838285078e-05,
"loss": 1.0691,
"step": 567
},
{
"epoch": 1.4648160103292447,
"grad_norm": 0.5257774613135558,
"learning_rate": 1.7807856753783387e-05,
"loss": 1.0836,
"step": 568
},
{
"epoch": 1.4673983214977404,
"grad_norm": 0.5133170260226599,
"learning_rate": 1.7796569701298906e-05,
"loss": 1.111,
"step": 569
},
{
"epoch": 1.4699806326662364,
"grad_norm": 0.4960488771792151,
"learning_rate": 1.778525726214949e-05,
"loss": 1.0913,
"step": 570
},
{
"epoch": 1.472562943834732,
"grad_norm": 0.4923055735581868,
"learning_rate": 1.7773919473169933e-05,
"loss": 1.0585,
"step": 571
},
{
"epoch": 1.4751452550032278,
"grad_norm": 0.5127181992354112,
"learning_rate": 1.7762556371277578e-05,
"loss": 1.0647,
"step": 572
},
{
"epoch": 1.4777275661717237,
"grad_norm": 0.5282994414831952,
"learning_rate": 1.7751167993472198e-05,
"loss": 1.1137,
"step": 573
},
{
"epoch": 1.4803098773402195,
"grad_norm": 0.5248953393475492,
"learning_rate": 1.7739754376835858e-05,
"loss": 1.0999,
"step": 574
},
{
"epoch": 1.4828921885087154,
"grad_norm": 0.5281077411991068,
"learning_rate": 1.7728315558532806e-05,
"loss": 1.0953,
"step": 575
},
{
"epoch": 1.4854744996772111,
"grad_norm": 0.4914770889754758,
"learning_rate": 1.7716851575809354e-05,
"loss": 1.1072,
"step": 576
},
{
"epoch": 1.4880568108457068,
"grad_norm": 0.5218556469624681,
"learning_rate": 1.770536246599375e-05,
"loss": 1.0899,
"step": 577
},
{
"epoch": 1.4906391220142028,
"grad_norm": 0.5239346375890538,
"learning_rate": 1.769384826649606e-05,
"loss": 1.0779,
"step": 578
},
{
"epoch": 1.4932214331826985,
"grad_norm": 0.5538774540635639,
"learning_rate": 1.7682309014808043e-05,
"loss": 1.0503,
"step": 579
},
{
"epoch": 1.4958037443511945,
"grad_norm": 0.5212453237405811,
"learning_rate": 1.7670744748503033e-05,
"loss": 1.1206,
"step": 580
},
{
"epoch": 1.4983860555196902,
"grad_norm": 0.5221975743446008,
"learning_rate": 1.7659155505235812e-05,
"loss": 1.0712,
"step": 581
},
{
"epoch": 1.500968366688186,
"grad_norm": 0.49914883784122016,
"learning_rate": 1.76475413227425e-05,
"loss": 1.0649,
"step": 582
},
{
"epoch": 1.5035506778566816,
"grad_norm": 0.5000466728400638,
"learning_rate": 1.7635902238840408e-05,
"loss": 1.0621,
"step": 583
},
{
"epoch": 1.5061329890251776,
"grad_norm": 0.4994098902333712,
"learning_rate": 1.762423829142794e-05,
"loss": 1.0712,
"step": 584
},
{
"epoch": 1.5087153001936735,
"grad_norm": 0.5342342246357215,
"learning_rate": 1.7612549518484458e-05,
"loss": 1.1141,
"step": 585
},
{
"epoch": 1.5112976113621692,
"grad_norm": 0.5097744481641414,
"learning_rate": 1.7600835958070156e-05,
"loss": 1.1007,
"step": 586
},
{
"epoch": 1.513879922530665,
"grad_norm": 0.5079344339293564,
"learning_rate": 1.7589097648325936e-05,
"loss": 1.0814,
"step": 587
},
{
"epoch": 1.5164622336991607,
"grad_norm": 0.4902197192987524,
"learning_rate": 1.7577334627473295e-05,
"loss": 1.0589,
"step": 588
},
{
"epoch": 1.5190445448676564,
"grad_norm": 0.4893765216413381,
"learning_rate": 1.756554693381419e-05,
"loss": 1.0913,
"step": 589
},
{
"epoch": 1.5216268560361523,
"grad_norm": 0.5225764596558536,
"learning_rate": 1.755373460573091e-05,
"loss": 1.0923,
"step": 590
},
{
"epoch": 1.5242091672046483,
"grad_norm": 0.5152845982203591,
"learning_rate": 1.7541897681685967e-05,
"loss": 1.0946,
"step": 591
},
{
"epoch": 1.526791478373144,
"grad_norm": 0.5261227535805723,
"learning_rate": 1.7530036200221955e-05,
"loss": 1.1183,
"step": 592
},
{
"epoch": 1.5293737895416397,
"grad_norm": 0.49461625515890395,
"learning_rate": 1.7518150199961427e-05,
"loss": 1.0876,
"step": 593
},
{
"epoch": 1.5319561007101354,
"grad_norm": 0.5021228041031806,
"learning_rate": 1.7506239719606776e-05,
"loss": 1.0916,
"step": 594
},
{
"epoch": 1.5345384118786314,
"grad_norm": 0.503576565099223,
"learning_rate": 1.749430479794011e-05,
"loss": 1.0943,
"step": 595
},
{
"epoch": 1.5371207230471273,
"grad_norm": 0.5226270592481841,
"learning_rate": 1.7482345473823116e-05,
"loss": 1.1015,
"step": 596
},
{
"epoch": 1.539703034215623,
"grad_norm": 0.537294451707703,
"learning_rate": 1.7470361786196938e-05,
"loss": 1.0954,
"step": 597
},
{
"epoch": 1.5422853453841188,
"grad_norm": 0.504661881274588,
"learning_rate": 1.7458353774082052e-05,
"loss": 1.0821,
"step": 598
},
{
"epoch": 1.5448676565526145,
"grad_norm": 0.49601719518902315,
"learning_rate": 1.7446321476578138e-05,
"loss": 1.0721,
"step": 599
},
{
"epoch": 1.5474499677211104,
"grad_norm": 0.5187763220017648,
"learning_rate": 1.743426493286395e-05,
"loss": 1.0507,
"step": 600
},
{
"epoch": 1.5500322788896062,
"grad_norm": 0.4989292387853037,
"learning_rate": 1.7422184182197197e-05,
"loss": 1.0897,
"step": 601
},
{
"epoch": 1.552614590058102,
"grad_norm": 0.4843467686137247,
"learning_rate": 1.7410079263914406e-05,
"loss": 1.0631,
"step": 602
},
{
"epoch": 1.5551969012265978,
"grad_norm": 0.5271533786423233,
"learning_rate": 1.7397950217430794e-05,
"loss": 1.1022,
"step": 603
},
{
"epoch": 1.5577792123950935,
"grad_norm": 0.5088338681798537,
"learning_rate": 1.7385797082240147e-05,
"loss": 1.0839,
"step": 604
},
{
"epoch": 1.5603615235635893,
"grad_norm": 0.4853868656293585,
"learning_rate": 1.737361989791468e-05,
"loss": 1.0539,
"step": 605
},
{
"epoch": 1.5629438347320852,
"grad_norm": 0.49682074497284284,
"learning_rate": 1.7361418704104925e-05,
"loss": 1.089,
"step": 606
},
{
"epoch": 1.5655261459005811,
"grad_norm": 0.49391110999180715,
"learning_rate": 1.734919354053959e-05,
"loss": 1.0829,
"step": 607
},
{
"epoch": 1.5681084570690769,
"grad_norm": 0.5033926514159104,
"learning_rate": 1.733694444702542e-05,
"loss": 1.0882,
"step": 608
},
{
"epoch": 1.5706907682375726,
"grad_norm": 0.5120915845296581,
"learning_rate": 1.7324671463447092e-05,
"loss": 1.1071,
"step": 609
},
{
"epoch": 1.5732730794060683,
"grad_norm": 0.5027154616635228,
"learning_rate": 1.731237462976707e-05,
"loss": 1.0706,
"step": 610
},
{
"epoch": 1.5758553905745643,
"grad_norm": 0.5017997136995258,
"learning_rate": 1.7300053986025476e-05,
"loss": 1.0935,
"step": 611
},
{
"epoch": 1.5784377017430602,
"grad_norm": 0.4876968907410891,
"learning_rate": 1.7287709572339958e-05,
"loss": 1.0414,
"step": 612
},
{
"epoch": 1.581020012911556,
"grad_norm": 0.5124364267745315,
"learning_rate": 1.7275341428905564e-05,
"loss": 1.0569,
"step": 613
},
{
"epoch": 1.5836023240800516,
"grad_norm": 0.5192211728464061,
"learning_rate": 1.7262949595994606e-05,
"loss": 1.0761,
"step": 614
},
{
"epoch": 1.5861846352485474,
"grad_norm": 0.49024680943396437,
"learning_rate": 1.7250534113956543e-05,
"loss": 1.08,
"step": 615
},
{
"epoch": 1.5887669464170433,
"grad_norm": 0.4920460482587524,
"learning_rate": 1.7238095023217823e-05,
"loss": 1.0739,
"step": 616
},
{
"epoch": 1.591349257585539,
"grad_norm": 0.5124627096290069,
"learning_rate": 1.722563236428178e-05,
"loss": 1.0507,
"step": 617
},
{
"epoch": 1.593931568754035,
"grad_norm": 0.51198818397323,
"learning_rate": 1.721314617772849e-05,
"loss": 1.0922,
"step": 618
},
{
"epoch": 1.5965138799225307,
"grad_norm": 0.5062132347505444,
"learning_rate": 1.7200636504214618e-05,
"loss": 1.0374,
"step": 619
},
{
"epoch": 1.5990961910910264,
"grad_norm": 0.521687159299446,
"learning_rate": 1.7188103384473334e-05,
"loss": 1.064,
"step": 620
},
{
"epoch": 1.6016785022595221,
"grad_norm": 0.47867011946021426,
"learning_rate": 1.7175546859314126e-05,
"loss": 1.0988,
"step": 621
},
{
"epoch": 1.604260813428018,
"grad_norm": 0.4894260917344886,
"learning_rate": 1.7162966969622713e-05,
"loss": 1.0709,
"step": 622
},
{
"epoch": 1.606843124596514,
"grad_norm": 0.5354553757656224,
"learning_rate": 1.7150363756360886e-05,
"loss": 1.1033,
"step": 623
},
{
"epoch": 1.6094254357650097,
"grad_norm": 0.5039991951669948,
"learning_rate": 1.713773726056637e-05,
"loss": 1.1001,
"step": 624
},
{
"epoch": 1.6120077469335055,
"grad_norm": 0.4980182271238221,
"learning_rate": 1.7125087523352718e-05,
"loss": 1.0788,
"step": 625
},
{
"epoch": 1.6145900581020012,
"grad_norm": 0.5322347706228192,
"learning_rate": 1.7112414585909146e-05,
"loss": 1.0673,
"step": 626
},
{
"epoch": 1.6171723692704971,
"grad_norm": 0.49515788566577773,
"learning_rate": 1.7099718489500426e-05,
"loss": 1.0818,
"step": 627
},
{
"epoch": 1.6197546804389928,
"grad_norm": 0.49847544874894734,
"learning_rate": 1.7086999275466727e-05,
"loss": 1.071,
"step": 628
},
{
"epoch": 1.6223369916074888,
"grad_norm": 0.4852829304995305,
"learning_rate": 1.7074256985223496e-05,
"loss": 1.0631,
"step": 629
},
{
"epoch": 1.6249193027759845,
"grad_norm": 0.49795661914754413,
"learning_rate": 1.706149166026132e-05,
"loss": 1.0876,
"step": 630
},
{
"epoch": 1.6275016139444802,
"grad_norm": 0.502622123586083,
"learning_rate": 1.7048703342145793e-05,
"loss": 1.0846,
"step": 631
},
{
"epoch": 1.630083925112976,
"grad_norm": 0.49228802240874425,
"learning_rate": 1.7035892072517373e-05,
"loss": 1.1087,
"step": 632
},
{
"epoch": 1.632666236281472,
"grad_norm": 0.4971680566338004,
"learning_rate": 1.7023057893091254e-05,
"loss": 1.0768,
"step": 633
},
{
"epoch": 1.6352485474499678,
"grad_norm": 0.5131651796956291,
"learning_rate": 1.7010200845657222e-05,
"loss": 1.0899,
"step": 634
},
{
"epoch": 1.6378308586184636,
"grad_norm": 0.5074519176524721,
"learning_rate": 1.6997320972079536e-05,
"loss": 1.081,
"step": 635
},
{
"epoch": 1.6404131697869593,
"grad_norm": 0.5174545968485477,
"learning_rate": 1.6984418314296768e-05,
"loss": 1.0472,
"step": 636
},
{
"epoch": 1.642995480955455,
"grad_norm": 0.5077381746768771,
"learning_rate": 1.697149291432168e-05,
"loss": 1.0926,
"step": 637
},
{
"epoch": 1.645577792123951,
"grad_norm": 0.5354282615337868,
"learning_rate": 1.6958544814241094e-05,
"loss": 1.0414,
"step": 638
},
{
"epoch": 1.6481601032924469,
"grad_norm": 0.52898951516764,
"learning_rate": 1.6945574056215742e-05,
"loss": 1.0973,
"step": 639
},
{
"epoch": 1.6507424144609426,
"grad_norm": 0.5009163137242975,
"learning_rate": 1.6932580682480124e-05,
"loss": 1.0826,
"step": 640
},
{
"epoch": 1.6533247256294383,
"grad_norm": 0.5066860393622376,
"learning_rate": 1.6919564735342398e-05,
"loss": 1.0836,
"step": 641
},
{
"epoch": 1.655907036797934,
"grad_norm": 0.5418314242744041,
"learning_rate": 1.6906526257184206e-05,
"loss": 1.1132,
"step": 642
},
{
"epoch": 1.65848934796643,
"grad_norm": 0.4999534074892505,
"learning_rate": 1.689346529046057e-05,
"loss": 1.0818,
"step": 643
},
{
"epoch": 1.6610716591349257,
"grad_norm": 0.5017097366254959,
"learning_rate": 1.6880381877699717e-05,
"loss": 1.074,
"step": 644
},
{
"epoch": 1.6636539703034217,
"grad_norm": 0.5233206395612633,
"learning_rate": 1.686727606150299e-05,
"loss": 1.0628,
"step": 645
},
{
"epoch": 1.6662362814719174,
"grad_norm": 0.512780816400479,
"learning_rate": 1.6854147884544655e-05,
"loss": 1.0843,
"step": 646
},
{
"epoch": 1.668818592640413,
"grad_norm": 0.49275381075866503,
"learning_rate": 1.68409973895718e-05,
"loss": 1.0843,
"step": 647
},
{
"epoch": 1.6714009038089088,
"grad_norm": 0.5496131300871087,
"learning_rate": 1.682782461940418e-05,
"loss": 1.0836,
"step": 648
},
{
"epoch": 1.6739832149774048,
"grad_norm": 0.512860264741888,
"learning_rate": 1.6814629616934078e-05,
"loss": 1.0743,
"step": 649
},
{
"epoch": 1.6765655261459007,
"grad_norm": 0.4975144838656257,
"learning_rate": 1.6801412425126183e-05,
"loss": 1.0864,
"step": 650
},
{
"epoch": 1.6791478373143964,
"grad_norm": 0.5014057129631031,
"learning_rate": 1.678817308701741e-05,
"loss": 1.0427,
"step": 651
},
{
"epoch": 1.6817301484828922,
"grad_norm": 0.5234985943818525,
"learning_rate": 1.677491164571681e-05,
"loss": 1.1048,
"step": 652
},
{
"epoch": 1.6843124596513879,
"grad_norm": 0.5201679982476692,
"learning_rate": 1.6761628144405394e-05,
"loss": 1.064,
"step": 653
},
{
"epoch": 1.6868947708198838,
"grad_norm": 0.4846519175712527,
"learning_rate": 1.6748322626336e-05,
"loss": 1.0539,
"step": 654
},
{
"epoch": 1.6894770819883798,
"grad_norm": 0.5173646723613604,
"learning_rate": 1.6734995134833155e-05,
"loss": 1.1007,
"step": 655
},
{
"epoch": 1.6920593931568755,
"grad_norm": 0.5113183351556072,
"learning_rate": 1.6721645713292953e-05,
"loss": 1.0815,
"step": 656
},
{
"epoch": 1.6946417043253712,
"grad_norm": 0.5211205639308888,
"learning_rate": 1.670827440518287e-05,
"loss": 1.0837,
"step": 657
},
{
"epoch": 1.697224015493867,
"grad_norm": 0.5080702604570161,
"learning_rate": 1.6694881254041657e-05,
"loss": 1.1173,
"step": 658
},
{
"epoch": 1.6998063266623629,
"grad_norm": 0.4962653526436615,
"learning_rate": 1.6681466303479196e-05,
"loss": 1.0352,
"step": 659
},
{
"epoch": 1.7023886378308586,
"grad_norm": 0.5142414297852521,
"learning_rate": 1.6668029597176344e-05,
"loss": 1.0666,
"step": 660
},
{
"epoch": 1.7049709489993545,
"grad_norm": 0.4901838014123924,
"learning_rate": 1.66545711788848e-05,
"loss": 1.0816,
"step": 661
},
{
"epoch": 1.7075532601678503,
"grad_norm": 0.5141149184635171,
"learning_rate": 1.664109109242696e-05,
"loss": 1.0771,
"step": 662
},
{
"epoch": 1.710135571336346,
"grad_norm": 0.5172010165390014,
"learning_rate": 1.6627589381695763e-05,
"loss": 1.0752,
"step": 663
},
{
"epoch": 1.7127178825048417,
"grad_norm": 0.4963415308538906,
"learning_rate": 1.661406609065458e-05,
"loss": 1.1219,
"step": 664
},
{
"epoch": 1.7153001936733376,
"grad_norm": 0.4877763872520322,
"learning_rate": 1.6600521263337043e-05,
"loss": 1.058,
"step": 665
},
{
"epoch": 1.7178825048418336,
"grad_norm": 0.4969456781904556,
"learning_rate": 1.6586954943846895e-05,
"loss": 1.0834,
"step": 666
},
{
"epoch": 1.7204648160103293,
"grad_norm": 0.4815019660085988,
"learning_rate": 1.6573367176357876e-05,
"loss": 1.0618,
"step": 667
},
{
"epoch": 1.723047127178825,
"grad_norm": 0.4936696257730036,
"learning_rate": 1.6559758005113564e-05,
"loss": 1.0902,
"step": 668
},
{
"epoch": 1.7256294383473207,
"grad_norm": 0.4850729841607312,
"learning_rate": 1.6546127474427217e-05,
"loss": 1.0499,
"step": 669
},
{
"epoch": 1.7282117495158167,
"grad_norm": 0.48113300472686776,
"learning_rate": 1.653247562868166e-05,
"loss": 1.0682,
"step": 670
},
{
"epoch": 1.7307940606843124,
"grad_norm": 0.4814780954159902,
"learning_rate": 1.6518802512329105e-05,
"loss": 1.083,
"step": 671
},
{
"epoch": 1.7333763718528084,
"grad_norm": 0.5247380192600469,
"learning_rate": 1.6505108169891032e-05,
"loss": 1.093,
"step": 672
},
{
"epoch": 1.735958683021304,
"grad_norm": 0.4981848964288428,
"learning_rate": 1.6491392645958043e-05,
"loss": 1.0656,
"step": 673
},
{
"epoch": 1.7385409941897998,
"grad_norm": 0.5007815563313807,
"learning_rate": 1.6477655985189703e-05,
"loss": 1.0583,
"step": 674
},
{
"epoch": 1.7411233053582955,
"grad_norm": 0.4924655390382668,
"learning_rate": 1.6463898232314393e-05,
"loss": 1.0881,
"step": 675
},
{
"epoch": 1.7437056165267915,
"grad_norm": 0.4964112767549225,
"learning_rate": 1.6450119432129185e-05,
"loss": 1.0645,
"step": 676
},
{
"epoch": 1.7462879276952874,
"grad_norm": 0.48606768423741387,
"learning_rate": 1.6436319629499683e-05,
"loss": 1.0984,
"step": 677
},
{
"epoch": 1.7488702388637831,
"grad_norm": 0.5148244978248903,
"learning_rate": 1.642249886935987e-05,
"loss": 1.0668,
"step": 678
},
{
"epoch": 1.7514525500322788,
"grad_norm": 0.5005608398817017,
"learning_rate": 1.6408657196711977e-05,
"loss": 1.0253,
"step": 679
},
{
"epoch": 1.7540348612007746,
"grad_norm": 0.5231072514633008,
"learning_rate": 1.6394794656626325e-05,
"loss": 1.1069,
"step": 680
},
{
"epoch": 1.7566171723692705,
"grad_norm": 0.5158005487144547,
"learning_rate": 1.638091129424118e-05,
"loss": 1.1059,
"step": 681
},
{
"epoch": 1.7591994835377665,
"grad_norm": 0.5085221327854161,
"learning_rate": 1.6367007154762616e-05,
"loss": 1.0628,
"step": 682
},
{
"epoch": 1.7617817947062622,
"grad_norm": 0.49993960938301113,
"learning_rate": 1.6353082283464355e-05,
"loss": 1.0774,
"step": 683
},
{
"epoch": 1.764364105874758,
"grad_norm": 0.5136885732061924,
"learning_rate": 1.633913672568762e-05,
"loss": 1.0571,
"step": 684
},
{
"epoch": 1.7669464170432536,
"grad_norm": 0.5064861116191551,
"learning_rate": 1.6325170526841e-05,
"loss": 1.0927,
"step": 685
},
{
"epoch": 1.7695287282117496,
"grad_norm": 0.5065336598849988,
"learning_rate": 1.631118373240029e-05,
"loss": 1.0437,
"step": 686
},
{
"epoch": 1.7721110393802453,
"grad_norm": 0.5071797770884724,
"learning_rate": 1.629717638790835e-05,
"loss": 1.058,
"step": 687
},
{
"epoch": 1.7746933505487412,
"grad_norm": 0.5032716603331865,
"learning_rate": 1.6283148538974943e-05,
"loss": 1.108,
"step": 688
},
{
"epoch": 1.777275661717237,
"grad_norm": 0.5168971680331349,
"learning_rate": 1.6269100231276617e-05,
"loss": 1.0967,
"step": 689
},
{
"epoch": 1.7798579728857327,
"grad_norm": 0.5328504423513274,
"learning_rate": 1.6255031510556513e-05,
"loss": 1.0755,
"step": 690
},
{
"epoch": 1.7824402840542284,
"grad_norm": 0.474134415576521,
"learning_rate": 1.6240942422624264e-05,
"loss": 1.0433,
"step": 691
},
{
"epoch": 1.7850225952227243,
"grad_norm": 0.5062840676066106,
"learning_rate": 1.62268330133558e-05,
"loss": 1.0884,
"step": 692
},
{
"epoch": 1.7876049063912203,
"grad_norm": 0.5242892989776939,
"learning_rate": 1.6212703328693232e-05,
"loss": 1.0813,
"step": 693
},
{
"epoch": 1.790187217559716,
"grad_norm": 0.49294778164946207,
"learning_rate": 1.6198553414644687e-05,
"loss": 1.0589,
"step": 694
},
{
"epoch": 1.7927695287282117,
"grad_norm": 0.5333831319134179,
"learning_rate": 1.6184383317284163e-05,
"loss": 1.0803,
"step": 695
},
{
"epoch": 1.7953518398967074,
"grad_norm": 0.4848117805750976,
"learning_rate": 1.6170193082751372e-05,
"loss": 1.0651,
"step": 696
},
{
"epoch": 1.7979341510652034,
"grad_norm": 0.4912184014826424,
"learning_rate": 1.6155982757251605e-05,
"loss": 1.0805,
"step": 697
},
{
"epoch": 1.8005164622336993,
"grad_norm": 0.5209563606543747,
"learning_rate": 1.614175238705556e-05,
"loss": 1.0676,
"step": 698
},
{
"epoch": 1.803098773402195,
"grad_norm": 0.4983116660478031,
"learning_rate": 1.6127502018499216e-05,
"loss": 1.0523,
"step": 699
},
{
"epoch": 1.8056810845706908,
"grad_norm": 0.487693383153112,
"learning_rate": 1.6113231697983658e-05,
"loss": 1.0663,
"step": 700
},
{
"epoch": 1.8082633957391865,
"grad_norm": 0.5338041396304789,
"learning_rate": 1.6098941471974945e-05,
"loss": 1.1128,
"step": 701
},
{
"epoch": 1.8108457069076824,
"grad_norm": 0.5142298199439157,
"learning_rate": 1.608463138700395e-05,
"loss": 1.0712,
"step": 702
},
{
"epoch": 1.8134280180761781,
"grad_norm": 0.47630790480629104,
"learning_rate": 1.6070301489666203e-05,
"loss": 1.0988,
"step": 703
},
{
"epoch": 1.816010329244674,
"grad_norm": 0.4901104298299483,
"learning_rate": 1.6055951826621753e-05,
"loss": 1.0428,
"step": 704
},
{
"epoch": 1.8185926404131698,
"grad_norm": 0.5227604090573119,
"learning_rate": 1.6041582444595004e-05,
"loss": 1.0698,
"step": 705
},
{
"epoch": 1.8211749515816655,
"grad_norm": 0.5041405266487794,
"learning_rate": 1.602719339037457e-05,
"loss": 1.0753,
"step": 706
},
{
"epoch": 1.8237572627501613,
"grad_norm": 0.5093841266418548,
"learning_rate": 1.6012784710813122e-05,
"loss": 1.1189,
"step": 707
},
{
"epoch": 1.8263395739186572,
"grad_norm": 0.5166236437305157,
"learning_rate": 1.599835645282723e-05,
"loss": 1.07,
"step": 708
},
{
"epoch": 1.8289218850871531,
"grad_norm": 0.5238202604739227,
"learning_rate": 1.598390866339721e-05,
"loss": 1.0734,
"step": 709
},
{
"epoch": 1.8315041962556489,
"grad_norm": 0.5351507809671923,
"learning_rate": 1.5969441389566995e-05,
"loss": 1.0722,
"step": 710
},
{
"epoch": 1.8340865074241446,
"grad_norm": 0.48654357580846874,
"learning_rate": 1.5954954678443934e-05,
"loss": 1.0581,
"step": 711
},
{
"epoch": 1.8366688185926403,
"grad_norm": 0.48828499263183267,
"learning_rate": 1.5940448577198685e-05,
"loss": 1.0778,
"step": 712
},
{
"epoch": 1.8392511297611362,
"grad_norm": 0.49993618606120815,
"learning_rate": 1.5925923133065036e-05,
"loss": 1.0744,
"step": 713
},
{
"epoch": 1.841833440929632,
"grad_norm": 0.5060526028050449,
"learning_rate": 1.591137839333976e-05,
"loss": 1.0869,
"step": 714
},
{
"epoch": 1.844415752098128,
"grad_norm": 0.4854542346534205,
"learning_rate": 1.5896814405382455e-05,
"loss": 1.0734,
"step": 715
},
{
"epoch": 1.8469980632666236,
"grad_norm": 0.5120958440799618,
"learning_rate": 1.5882231216615405e-05,
"loss": 1.056,
"step": 716
},
{
"epoch": 1.8495803744351194,
"grad_norm": 0.4857556591454626,
"learning_rate": 1.58676288745234e-05,
"loss": 1.0502,
"step": 717
},
{
"epoch": 1.852162685603615,
"grad_norm": 0.49183704296233893,
"learning_rate": 1.5853007426653607e-05,
"loss": 1.116,
"step": 718
},
{
"epoch": 1.854744996772111,
"grad_norm": 0.4971020661357399,
"learning_rate": 1.5838366920615395e-05,
"loss": 1.0535,
"step": 719
},
{
"epoch": 1.857327307940607,
"grad_norm": 0.485071482170176,
"learning_rate": 1.5823707404080196e-05,
"loss": 1.0465,
"step": 720
},
{
"epoch": 1.8599096191091027,
"grad_norm": 0.48718149162761787,
"learning_rate": 1.5809028924781343e-05,
"loss": 1.0787,
"step": 721
},
{
"epoch": 1.8624919302775984,
"grad_norm": 0.4775709718268873,
"learning_rate": 1.5794331530513903e-05,
"loss": 1.0354,
"step": 722
},
{
"epoch": 1.8650742414460941,
"grad_norm": 0.5008952740743758,
"learning_rate": 1.577961526913455e-05,
"loss": 1.0602,
"step": 723
},
{
"epoch": 1.86765655261459,
"grad_norm": 0.5064643531485886,
"learning_rate": 1.5764880188561376e-05,
"loss": 1.1178,
"step": 724
},
{
"epoch": 1.870238863783086,
"grad_norm": 0.49848568260978515,
"learning_rate": 1.5750126336773755e-05,
"loss": 1.0422,
"step": 725
},
{
"epoch": 1.8728211749515817,
"grad_norm": 0.5220419002346904,
"learning_rate": 1.5735353761812197e-05,
"loss": 1.057,
"step": 726
},
{
"epoch": 1.8754034861200775,
"grad_norm": 0.4971228637528045,
"learning_rate": 1.5720562511778156e-05,
"loss": 1.0556,
"step": 727
},
{
"epoch": 1.8779857972885732,
"grad_norm": 0.48640138793502713,
"learning_rate": 1.5705752634833908e-05,
"loss": 1.0857,
"step": 728
},
{
"epoch": 1.8805681084570691,
"grad_norm": 0.5037467924591017,
"learning_rate": 1.5690924179202375e-05,
"loss": 1.0581,
"step": 729
},
{
"epoch": 1.8831504196255648,
"grad_norm": 0.5282496443218059,
"learning_rate": 1.5676077193166973e-05,
"loss": 1.0799,
"step": 730
},
{
"epoch": 1.8857327307940608,
"grad_norm": 0.47021596749068756,
"learning_rate": 1.5661211725071457e-05,
"loss": 1.0352,
"step": 731
},
{
"epoch": 1.8883150419625565,
"grad_norm": 0.513391496193585,
"learning_rate": 1.5646327823319765e-05,
"loss": 1.1031,
"step": 732
},
{
"epoch": 1.8908973531310522,
"grad_norm": 0.5186791214030437,
"learning_rate": 1.5631425536375858e-05,
"loss": 1.0849,
"step": 733
},
{
"epoch": 1.893479664299548,
"grad_norm": 0.4945851612291226,
"learning_rate": 1.5616504912763554e-05,
"loss": 1.0513,
"step": 734
},
{
"epoch": 1.896061975468044,
"grad_norm": 0.4805032305818217,
"learning_rate": 1.5601566001066384e-05,
"loss": 1.0388,
"step": 735
},
{
"epoch": 1.8986442866365398,
"grad_norm": 0.49355307137386584,
"learning_rate": 1.5586608849927424e-05,
"loss": 1.0729,
"step": 736
},
{
"epoch": 1.9012265978050356,
"grad_norm": 0.4868522623792675,
"learning_rate": 1.5571633508049148e-05,
"loss": 1.0472,
"step": 737
},
{
"epoch": 1.9038089089735313,
"grad_norm": 0.5064645951323721,
"learning_rate": 1.5556640024193245e-05,
"loss": 1.0592,
"step": 738
},
{
"epoch": 1.906391220142027,
"grad_norm": 0.4935670987750482,
"learning_rate": 1.5541628447180494e-05,
"loss": 1.0567,
"step": 739
},
{
"epoch": 1.908973531310523,
"grad_norm": 0.5087284962422527,
"learning_rate": 1.552659882589058e-05,
"loss": 1.0544,
"step": 740
},
{
"epoch": 1.9115558424790189,
"grad_norm": 0.503398066954607,
"learning_rate": 1.551155120926194e-05,
"loss": 1.0416,
"step": 741
},
{
"epoch": 1.9141381536475146,
"grad_norm": 0.5248315003621526,
"learning_rate": 1.5496485646291613e-05,
"loss": 1.0821,
"step": 742
},
{
"epoch": 1.9167204648160103,
"grad_norm": 0.509374102002012,
"learning_rate": 1.548140218603507e-05,
"loss": 1.1231,
"step": 743
},
{
"epoch": 1.919302775984506,
"grad_norm": 0.4763357005641916,
"learning_rate": 1.5466300877606054e-05,
"loss": 1.0557,
"step": 744
},
{
"epoch": 1.921885087153002,
"grad_norm": 0.5156095352624543,
"learning_rate": 1.5451181770176434e-05,
"loss": 1.102,
"step": 745
},
{
"epoch": 1.9244673983214977,
"grad_norm": 0.5116973644648233,
"learning_rate": 1.543604491297602e-05,
"loss": 1.1098,
"step": 746
},
{
"epoch": 1.9270497094899937,
"grad_norm": 0.5093190129624484,
"learning_rate": 1.5420890355292435e-05,
"loss": 1.0528,
"step": 747
},
{
"epoch": 1.9296320206584894,
"grad_norm": 0.506225349532536,
"learning_rate": 1.5405718146470926e-05,
"loss": 1.0607,
"step": 748
},
{
"epoch": 1.932214331826985,
"grad_norm": 0.5271647344306659,
"learning_rate": 1.5390528335914216e-05,
"loss": 1.1065,
"step": 749
},
{
"epoch": 1.9347966429954808,
"grad_norm": 0.5309546928129137,
"learning_rate": 1.5375320973082346e-05,
"loss": 1.0818,
"step": 750
},
{
"epoch": 1.9373789541639768,
"grad_norm": 0.5187838315898233,
"learning_rate": 1.53600961074925e-05,
"loss": 1.0614,
"step": 751
},
{
"epoch": 1.9399612653324727,
"grad_norm": 0.4999420933843893,
"learning_rate": 1.5344853788718867e-05,
"loss": 1.0385,
"step": 752
},
{
"epoch": 1.9425435765009684,
"grad_norm": 0.5449765150372478,
"learning_rate": 1.532959406639245e-05,
"loss": 1.0324,
"step": 753
},
{
"epoch": 1.9451258876694641,
"grad_norm": 0.48682953533866824,
"learning_rate": 1.5314316990200933e-05,
"loss": 1.0302,
"step": 754
},
{
"epoch": 1.9477081988379599,
"grad_norm": 0.49226909739556324,
"learning_rate": 1.5299022609888507e-05,
"loss": 1.1016,
"step": 755
},
{
"epoch": 1.9502905100064558,
"grad_norm": 0.5347878649288165,
"learning_rate": 1.5283710975255695e-05,
"loss": 1.0843,
"step": 756
},
{
"epoch": 1.9528728211749515,
"grad_norm": 0.5028835492330221,
"learning_rate": 1.5268382136159213e-05,
"loss": 1.0832,
"step": 757
},
{
"epoch": 1.9554551323434475,
"grad_norm": 0.5020788778708613,
"learning_rate": 1.5253036142511794e-05,
"loss": 1.0554,
"step": 758
},
{
"epoch": 1.9580374435119432,
"grad_norm": 0.5117826424662124,
"learning_rate": 1.5237673044282028e-05,
"loss": 1.0407,
"step": 759
},
{
"epoch": 1.960619754680439,
"grad_norm": 0.486751220436105,
"learning_rate": 1.5222292891494204e-05,
"loss": 1.1028,
"step": 760
},
{
"epoch": 1.9632020658489346,
"grad_norm": 0.5042620665613498,
"learning_rate": 1.5206895734228133e-05,
"loss": 1.1089,
"step": 761
},
{
"epoch": 1.9657843770174306,
"grad_norm": 0.4985230564456094,
"learning_rate": 1.5191481622619006e-05,
"loss": 1.0892,
"step": 762
},
{
"epoch": 1.9683666881859265,
"grad_norm": 0.46564788925434075,
"learning_rate": 1.5176050606857211e-05,
"loss": 1.0687,
"step": 763
},
{
"epoch": 1.9709489993544222,
"grad_norm": 0.4799444405216457,
"learning_rate": 1.5160602737188184e-05,
"loss": 1.0627,
"step": 764
},
{
"epoch": 1.973531310522918,
"grad_norm": 0.517680238719764,
"learning_rate": 1.514513806391224e-05,
"loss": 1.1087,
"step": 765
},
{
"epoch": 1.9761136216914137,
"grad_norm": 0.4945356852520951,
"learning_rate": 1.5129656637384398e-05,
"loss": 1.0333,
"step": 766
},
{
"epoch": 1.9786959328599096,
"grad_norm": 0.47246161399068515,
"learning_rate": 1.5114158508014244e-05,
"loss": 1.0622,
"step": 767
},
{
"epoch": 1.9812782440284056,
"grad_norm": 0.4792556964609251,
"learning_rate": 1.509864372626574e-05,
"loss": 1.0807,
"step": 768
},
{
"epoch": 1.9838605551969013,
"grad_norm": 0.4942144710838991,
"learning_rate": 1.5083112342657071e-05,
"loss": 1.088,
"step": 769
},
{
"epoch": 1.986442866365397,
"grad_norm": 0.5201249256236419,
"learning_rate": 1.5067564407760485e-05,
"loss": 1.0938,
"step": 770
},
{
"epoch": 1.9890251775338927,
"grad_norm": 0.4829123624901927,
"learning_rate": 1.5051999972202118e-05,
"loss": 1.0353,
"step": 771
},
{
"epoch": 1.9916074887023887,
"grad_norm": 0.5024498746492575,
"learning_rate": 1.5036419086661837e-05,
"loss": 1.0802,
"step": 772
},
{
"epoch": 1.9941897998708844,
"grad_norm": 0.4963413917672638,
"learning_rate": 1.5020821801873072e-05,
"loss": 1.0801,
"step": 773
},
{
"epoch": 1.9967721110393803,
"grad_norm": 0.4872864553127849,
"learning_rate": 1.5005208168622649e-05,
"loss": 1.0509,
"step": 774
},
{
"epoch": 1.999354422207876,
"grad_norm": 0.47707633602130245,
"learning_rate": 1.4989578237750628e-05,
"loss": 1.0485,
"step": 775
},
{
"epoch": 2.0,
"grad_norm": 0.47707633602130245,
"learning_rate": 1.4973932060150142e-05,
"loss": 1.0293,
"step": 776
},
{
"epoch": 2.0025823111684957,
"grad_norm": 1.1941485826343992,
"learning_rate": 1.4958269686767214e-05,
"loss": 0.9552,
"step": 777
},
{
"epoch": 2.0051646223369914,
"grad_norm": 0.7512477337089716,
"learning_rate": 1.4942591168600616e-05,
"loss": 0.9653,
"step": 778
},
{
"epoch": 2.0077469335054876,
"grad_norm": 0.6413401233120336,
"learning_rate": 1.4926896556701676e-05,
"loss": 0.9713,
"step": 779
},
{
"epoch": 2.0103292446739833,
"grad_norm": 0.8265225982878504,
"learning_rate": 1.4911185902174134e-05,
"loss": 0.9674,
"step": 780
},
{
"epoch": 2.012911555842479,
"grad_norm": 0.7690019968988396,
"learning_rate": 1.4895459256173966e-05,
"loss": 0.9701,
"step": 781
},
{
"epoch": 2.0154938670109748,
"grad_norm": 0.6695923734742643,
"learning_rate": 1.4879716669909215e-05,
"loss": 0.9262,
"step": 782
},
{
"epoch": 2.0180761781794705,
"grad_norm": 0.6968627838761038,
"learning_rate": 1.4863958194639828e-05,
"loss": 0.9738,
"step": 783
},
{
"epoch": 2.020658489347966,
"grad_norm": 0.7154538295447892,
"learning_rate": 1.4848183881677497e-05,
"loss": 0.9537,
"step": 784
},
{
"epoch": 2.0232408005164624,
"grad_norm": 0.6599822904047927,
"learning_rate": 1.4832393782385475e-05,
"loss": 0.9428,
"step": 785
},
{
"epoch": 2.025823111684958,
"grad_norm": 0.6785737464207784,
"learning_rate": 1.4816587948178411e-05,
"loss": 0.9377,
"step": 786
},
{
"epoch": 2.028405422853454,
"grad_norm": 0.6659927550215519,
"learning_rate": 1.4800766430522208e-05,
"loss": 0.9477,
"step": 787
},
{
"epoch": 2.0309877340219495,
"grad_norm": 0.6453008064623791,
"learning_rate": 1.4784929280933819e-05,
"loss": 0.9734,
"step": 788
},
{
"epoch": 2.0335700451904453,
"grad_norm": 0.6138501650256379,
"learning_rate": 1.4769076550981107e-05,
"loss": 0.9485,
"step": 789
},
{
"epoch": 2.0361523563589414,
"grad_norm": 0.628292728528221,
"learning_rate": 1.4753208292282666e-05,
"loss": 0.9373,
"step": 790
},
{
"epoch": 2.038734667527437,
"grad_norm": 0.6518720740962953,
"learning_rate": 1.4737324556507639e-05,
"loss": 0.9854,
"step": 791
},
{
"epoch": 2.041316978695933,
"grad_norm": 0.6417366086713475,
"learning_rate": 1.472142539537559e-05,
"loss": 0.9668,
"step": 792
},
{
"epoch": 2.0438992898644286,
"grad_norm": 0.6525413690852432,
"learning_rate": 1.4705510860656289e-05,
"loss": 0.9429,
"step": 793
},
{
"epoch": 2.0464816010329243,
"grad_norm": 0.6126084056396112,
"learning_rate": 1.4689581004169573e-05,
"loss": 0.9828,
"step": 794
},
{
"epoch": 2.0490639122014205,
"grad_norm": 0.6169432282421197,
"learning_rate": 1.4673635877785168e-05,
"loss": 0.9522,
"step": 795
},
{
"epoch": 2.051646223369916,
"grad_norm": 0.5922213822722046,
"learning_rate": 1.4657675533422517e-05,
"loss": 0.9478,
"step": 796
},
{
"epoch": 2.054228534538412,
"grad_norm": 0.6211700530426607,
"learning_rate": 1.4641700023050625e-05,
"loss": 0.9325,
"step": 797
},
{
"epoch": 2.0568108457069076,
"grad_norm": 0.6173048324016761,
"learning_rate": 1.4625709398687862e-05,
"loss": 0.9477,
"step": 798
},
{
"epoch": 2.0593931568754034,
"grad_norm": 0.5866193216584925,
"learning_rate": 1.4609703712401832e-05,
"loss": 0.9378,
"step": 799
},
{
"epoch": 2.061975468043899,
"grad_norm": 0.6128450747778748,
"learning_rate": 1.4593683016309168e-05,
"loss": 0.9785,
"step": 800
},
{
"epoch": 2.0645577792123952,
"grad_norm": 0.582652343191567,
"learning_rate": 1.4577647362575378e-05,
"loss": 0.9318,
"step": 801
},
{
"epoch": 2.067140090380891,
"grad_norm": 0.5589523161558311,
"learning_rate": 1.4561596803414681e-05,
"loss": 0.9295,
"step": 802
},
{
"epoch": 2.0697224015493867,
"grad_norm": 0.5777811559605781,
"learning_rate": 1.4545531391089826e-05,
"loss": 0.9606,
"step": 803
},
{
"epoch": 2.0723047127178824,
"grad_norm": 0.5932336435576502,
"learning_rate": 1.4529451177911926e-05,
"loss": 0.973,
"step": 804
},
{
"epoch": 2.074887023886378,
"grad_norm": 0.5467403526396694,
"learning_rate": 1.4513356216240287e-05,
"loss": 0.8862,
"step": 805
},
{
"epoch": 2.0774693350548743,
"grad_norm": 0.5667654235524354,
"learning_rate": 1.449724655848224e-05,
"loss": 0.9484,
"step": 806
},
{
"epoch": 2.08005164622337,
"grad_norm": 0.557027598681585,
"learning_rate": 1.4481122257092966e-05,
"loss": 0.9537,
"step": 807
},
{
"epoch": 2.0826339573918657,
"grad_norm": 0.5736006444800077,
"learning_rate": 1.4464983364575327e-05,
"loss": 0.9644,
"step": 808
},
{
"epoch": 2.0852162685603615,
"grad_norm": 0.5911533935183022,
"learning_rate": 1.44488299334797e-05,
"loss": 0.9547,
"step": 809
},
{
"epoch": 2.087798579728857,
"grad_norm": 0.5854788104570025,
"learning_rate": 1.44326620164038e-05,
"loss": 0.9316,
"step": 810
},
{
"epoch": 2.090380890897353,
"grad_norm": 0.5885109768704322,
"learning_rate": 1.4416479665992507e-05,
"loss": 0.9468,
"step": 811
},
{
"epoch": 2.092963202065849,
"grad_norm": 0.5860083582206628,
"learning_rate": 1.4400282934937702e-05,
"loss": 0.9597,
"step": 812
},
{
"epoch": 2.095545513234345,
"grad_norm": 0.5612799121488241,
"learning_rate": 1.4384071875978085e-05,
"loss": 0.9291,
"step": 813
},
{
"epoch": 2.0981278244028405,
"grad_norm": 0.5760413758972827,
"learning_rate": 1.4367846541899017e-05,
"loss": 0.9434,
"step": 814
},
{
"epoch": 2.1007101355713362,
"grad_norm": 0.5872031423934213,
"learning_rate": 1.4351606985532338e-05,
"loss": 0.9546,
"step": 815
},
{
"epoch": 2.103292446739832,
"grad_norm": 0.5875266718802965,
"learning_rate": 1.4335353259756199e-05,
"loss": 0.9739,
"step": 816
},
{
"epoch": 2.105874757908328,
"grad_norm": 0.5834229896061526,
"learning_rate": 1.4319085417494885e-05,
"loss": 0.936,
"step": 817
},
{
"epoch": 2.108457069076824,
"grad_norm": 0.5740341555688057,
"learning_rate": 1.430280351171864e-05,
"loss": 0.9295,
"step": 818
},
{
"epoch": 2.1110393802453196,
"grad_norm": 0.6028061663862296,
"learning_rate": 1.4286507595443527e-05,
"loss": 0.9475,
"step": 819
},
{
"epoch": 2.1136216914138153,
"grad_norm": 0.6066376132775557,
"learning_rate": 1.4270197721731192e-05,
"loss": 0.9748,
"step": 820
},
{
"epoch": 2.116204002582311,
"grad_norm": 0.6009913417618149,
"learning_rate": 1.4253873943688751e-05,
"loss": 0.9599,
"step": 821
},
{
"epoch": 2.118786313750807,
"grad_norm": 0.5983886456577467,
"learning_rate": 1.4237536314468602e-05,
"loss": 0.9594,
"step": 822
},
{
"epoch": 2.121368624919303,
"grad_norm": 0.617011626576933,
"learning_rate": 1.4221184887268218e-05,
"loss": 0.9498,
"step": 823
},
{
"epoch": 2.1239509360877986,
"grad_norm": 0.6005132003701584,
"learning_rate": 1.4204819715330026e-05,
"loss": 0.9503,
"step": 824
},
{
"epoch": 2.1265332472562943,
"grad_norm": 0.5741558367115511,
"learning_rate": 1.4188440851941185e-05,
"loss": 0.9587,
"step": 825
},
{
"epoch": 2.12911555842479,
"grad_norm": 0.6062156734819026,
"learning_rate": 1.4172048350433457e-05,
"loss": 0.969,
"step": 826
},
{
"epoch": 2.131697869593286,
"grad_norm": 0.6128646943053142,
"learning_rate": 1.4155642264182992e-05,
"loss": 0.9534,
"step": 827
},
{
"epoch": 2.134280180761782,
"grad_norm": 0.5828534204572827,
"learning_rate": 1.4139222646610185e-05,
"loss": 0.9388,
"step": 828
},
{
"epoch": 2.1368624919302777,
"grad_norm": 0.6060884386262935,
"learning_rate": 1.4122789551179495e-05,
"loss": 0.9884,
"step": 829
},
{
"epoch": 2.1394448030987734,
"grad_norm": 0.6061173547442686,
"learning_rate": 1.4106343031399252e-05,
"loss": 0.924,
"step": 830
},
{
"epoch": 2.142027114267269,
"grad_norm": 0.5851413898430766,
"learning_rate": 1.408988314082151e-05,
"loss": 0.9455,
"step": 831
},
{
"epoch": 2.144609425435765,
"grad_norm": 0.588388475305726,
"learning_rate": 1.4073409933041853e-05,
"loss": 0.9337,
"step": 832
},
{
"epoch": 2.147191736604261,
"grad_norm": 0.5838096533852828,
"learning_rate": 1.4056923461699232e-05,
"loss": 0.9392,
"step": 833
},
{
"epoch": 2.1497740477727567,
"grad_norm": 0.5997141349811622,
"learning_rate": 1.4040423780475787e-05,
"loss": 0.9593,
"step": 834
},
{
"epoch": 2.1523563589412524,
"grad_norm": 0.6020566174282612,
"learning_rate": 1.4023910943096662e-05,
"loss": 0.9616,
"step": 835
},
{
"epoch": 2.154938670109748,
"grad_norm": 0.6073751111196977,
"learning_rate": 1.4007385003329847e-05,
"loss": 0.9804,
"step": 836
},
{
"epoch": 2.157520981278244,
"grad_norm": 0.640691713500995,
"learning_rate": 1.3990846014985997e-05,
"loss": 0.9525,
"step": 837
},
{
"epoch": 2.16010329244674,
"grad_norm": 0.5749963474745784,
"learning_rate": 1.397429403191825e-05,
"loss": 0.9753,
"step": 838
},
{
"epoch": 2.1626856036152358,
"grad_norm": 0.6017911845722985,
"learning_rate": 1.3957729108022057e-05,
"loss": 0.9698,
"step": 839
},
{
"epoch": 2.1652679147837315,
"grad_norm": 0.598963393328458,
"learning_rate": 1.3941151297235007e-05,
"loss": 0.9828,
"step": 840
},
{
"epoch": 2.167850225952227,
"grad_norm": 0.5829192765375827,
"learning_rate": 1.3924560653536652e-05,
"loss": 0.9399,
"step": 841
},
{
"epoch": 2.170432537120723,
"grad_norm": 0.5736202743026629,
"learning_rate": 1.3907957230948328e-05,
"loss": 0.9414,
"step": 842
},
{
"epoch": 2.1730148482892186,
"grad_norm": 0.6048383091141705,
"learning_rate": 1.3891341083532979e-05,
"loss": 0.93,
"step": 843
},
{
"epoch": 2.175597159457715,
"grad_norm": 0.6058814179639644,
"learning_rate": 1.3874712265394984e-05,
"loss": 0.9625,
"step": 844
},
{
"epoch": 2.1781794706262105,
"grad_norm": 0.6048568085747608,
"learning_rate": 1.3858070830679987e-05,
"loss": 0.9325,
"step": 845
},
{
"epoch": 2.1807617817947063,
"grad_norm": 0.6166529166864086,
"learning_rate": 1.3841416833574696e-05,
"loss": 0.9991,
"step": 846
},
{
"epoch": 2.183344092963202,
"grad_norm": 0.5836884801008753,
"learning_rate": 1.3824750328306747e-05,
"loss": 0.9567,
"step": 847
},
{
"epoch": 2.1859264041316977,
"grad_norm": 0.5952429990454414,
"learning_rate": 1.3808071369144476e-05,
"loss": 0.9244,
"step": 848
},
{
"epoch": 2.188508715300194,
"grad_norm": 0.5857084084921026,
"learning_rate": 1.37913800103968e-05,
"loss": 0.9655,
"step": 849
},
{
"epoch": 2.1910910264686896,
"grad_norm": 0.6279175507836195,
"learning_rate": 1.3774676306412986e-05,
"loss": 0.9323,
"step": 850
},
{
"epoch": 2.1936733376371853,
"grad_norm": 0.5863735033805826,
"learning_rate": 1.3757960311582518e-05,
"loss": 0.961,
"step": 851
},
{
"epoch": 2.196255648805681,
"grad_norm": 0.5793193685107874,
"learning_rate": 1.3741232080334889e-05,
"loss": 0.9417,
"step": 852
},
{
"epoch": 2.1988379599741767,
"grad_norm": 0.5779370435007501,
"learning_rate": 1.3724491667139437e-05,
"loss": 0.9543,
"step": 853
},
{
"epoch": 2.2014202711426725,
"grad_norm": 0.5898150549054328,
"learning_rate": 1.3707739126505168e-05,
"loss": 0.9751,
"step": 854
},
{
"epoch": 2.2040025823111686,
"grad_norm": 0.5932733176039338,
"learning_rate": 1.3690974512980577e-05,
"loss": 0.9453,
"step": 855
},
{
"epoch": 2.2065848934796644,
"grad_norm": 0.5905918422617804,
"learning_rate": 1.3674197881153468e-05,
"loss": 0.9361,
"step": 856
},
{
"epoch": 2.20916720464816,
"grad_norm": 0.594528411021171,
"learning_rate": 1.365740928565078e-05,
"loss": 0.9781,
"step": 857
},
{
"epoch": 2.211749515816656,
"grad_norm": 0.5872952131266409,
"learning_rate": 1.3640608781138407e-05,
"loss": 0.9479,
"step": 858
},
{
"epoch": 2.2143318269851515,
"grad_norm": 0.5929574963797165,
"learning_rate": 1.3623796422321018e-05,
"loss": 0.9488,
"step": 859
},
{
"epoch": 2.2169141381536477,
"grad_norm": 0.6100088602969217,
"learning_rate": 1.3606972263941884e-05,
"loss": 0.93,
"step": 860
},
{
"epoch": 2.2194964493221434,
"grad_norm": 0.5757485969334069,
"learning_rate": 1.3590136360782697e-05,
"loss": 0.9167,
"step": 861
},
{
"epoch": 2.222078760490639,
"grad_norm": 0.5880881759424176,
"learning_rate": 1.3573288767663388e-05,
"loss": 0.9831,
"step": 862
},
{
"epoch": 2.224661071659135,
"grad_norm": 0.6101438672240849,
"learning_rate": 1.3556429539441957e-05,
"loss": 0.9425,
"step": 863
},
{
"epoch": 2.2272433828276306,
"grad_norm": 0.6032144416691072,
"learning_rate": 1.3539558731014285e-05,
"loss": 0.956,
"step": 864
},
{
"epoch": 2.2298256939961267,
"grad_norm": 0.5877358574038184,
"learning_rate": 1.3522676397313963e-05,
"loss": 0.9769,
"step": 865
},
{
"epoch": 2.2324080051646225,
"grad_norm": 0.6037905375839121,
"learning_rate": 1.3505782593312108e-05,
"loss": 0.9577,
"step": 866
},
{
"epoch": 2.234990316333118,
"grad_norm": 0.5826777668673346,
"learning_rate": 1.3488877374017189e-05,
"loss": 0.9514,
"step": 867
},
{
"epoch": 2.237572627501614,
"grad_norm": 0.591593499195398,
"learning_rate": 1.3471960794474837e-05,
"loss": 0.9563,
"step": 868
},
{
"epoch": 2.2401549386701096,
"grad_norm": 0.5972872893141782,
"learning_rate": 1.345503290976768e-05,
"loss": 0.9646,
"step": 869
},
{
"epoch": 2.242737249838606,
"grad_norm": 0.5695814980462333,
"learning_rate": 1.3438093775015157e-05,
"loss": 0.9295,
"step": 870
},
{
"epoch": 2.2453195610071015,
"grad_norm": 0.5950572680113415,
"learning_rate": 1.342114344537334e-05,
"loss": 0.9378,
"step": 871
},
{
"epoch": 2.2479018721755972,
"grad_norm": 0.645911801845914,
"learning_rate": 1.3404181976034743e-05,
"loss": 0.9889,
"step": 872
},
{
"epoch": 2.250484183344093,
"grad_norm": 0.5891952037473503,
"learning_rate": 1.3387209422228164e-05,
"loss": 0.9257,
"step": 873
},
{
"epoch": 2.2530664945125887,
"grad_norm": 0.6101696680348054,
"learning_rate": 1.3370225839218494e-05,
"loss": 0.9387,
"step": 874
},
{
"epoch": 2.2556488056810844,
"grad_norm": 0.6235755995527572,
"learning_rate": 1.3353231282306521e-05,
"loss": 0.9699,
"step": 875
},
{
"epoch": 2.2582311168495806,
"grad_norm": 0.6032240561162692,
"learning_rate": 1.3336225806828782e-05,
"loss": 0.9256,
"step": 876
},
{
"epoch": 2.2608134280180763,
"grad_norm": 0.6200539436633388,
"learning_rate": 1.3319209468157362e-05,
"loss": 0.977,
"step": 877
},
{
"epoch": 2.263395739186572,
"grad_norm": 0.6357789919117319,
"learning_rate": 1.3302182321699712e-05,
"loss": 0.9589,
"step": 878
},
{
"epoch": 2.2659780503550677,
"grad_norm": 0.6102482086269118,
"learning_rate": 1.3285144422898486e-05,
"loss": 0.9595,
"step": 879
},
{
"epoch": 2.2685603615235634,
"grad_norm": 0.6310634925304537,
"learning_rate": 1.3268095827231333e-05,
"loss": 0.9406,
"step": 880
},
{
"epoch": 2.2711426726920596,
"grad_norm": 0.6196741175987706,
"learning_rate": 1.3251036590210751e-05,
"loss": 0.9623,
"step": 881
},
{
"epoch": 2.2737249838605553,
"grad_norm": 0.5865418451174635,
"learning_rate": 1.323396676738387e-05,
"loss": 0.9618,
"step": 882
},
{
"epoch": 2.276307295029051,
"grad_norm": 0.5886489491664807,
"learning_rate": 1.3216886414332304e-05,
"loss": 0.9654,
"step": 883
},
{
"epoch": 2.2788896061975468,
"grad_norm": 0.6269313692986308,
"learning_rate": 1.319979558667194e-05,
"loss": 0.9648,
"step": 884
},
{
"epoch": 2.2814719173660425,
"grad_norm": 0.5950331112803471,
"learning_rate": 1.3182694340052785e-05,
"loss": 1.0065,
"step": 885
},
{
"epoch": 2.284054228534538,
"grad_norm": 0.5868804806129319,
"learning_rate": 1.3165582730158764e-05,
"loss": 0.9425,
"step": 886
},
{
"epoch": 2.2866365397030344,
"grad_norm": 0.585709126958065,
"learning_rate": 1.3148460812707549e-05,
"loss": 0.9866,
"step": 887
},
{
"epoch": 2.28921885087153,
"grad_norm": 0.5943971591153827,
"learning_rate": 1.3131328643450373e-05,
"loss": 0.928,
"step": 888
},
{
"epoch": 2.291801162040026,
"grad_norm": 0.6011485207920195,
"learning_rate": 1.3114186278171855e-05,
"loss": 0.9471,
"step": 889
},
{
"epoch": 2.2943834732085215,
"grad_norm": 0.6202130154424499,
"learning_rate": 1.3097033772689804e-05,
"loss": 0.9555,
"step": 890
},
{
"epoch": 2.2969657843770173,
"grad_norm": 0.601191279942045,
"learning_rate": 1.3079871182855056e-05,
"loss": 0.9763,
"step": 891
},
{
"epoch": 2.2995480955455134,
"grad_norm": 0.6091424415493963,
"learning_rate": 1.3062698564551277e-05,
"loss": 0.9564,
"step": 892
},
{
"epoch": 2.302130406714009,
"grad_norm": 0.6322044545300952,
"learning_rate": 1.3045515973694793e-05,
"loss": 0.9621,
"step": 893
},
{
"epoch": 2.304712717882505,
"grad_norm": 0.593976781762648,
"learning_rate": 1.3028323466234398e-05,
"loss": 0.9352,
"step": 894
},
{
"epoch": 2.3072950290510006,
"grad_norm": 0.6093135390414695,
"learning_rate": 1.3011121098151177e-05,
"loss": 0.9444,
"step": 895
},
{
"epoch": 2.3098773402194963,
"grad_norm": 0.6081280945984243,
"learning_rate": 1.2993908925458318e-05,
"loss": 0.9019,
"step": 896
},
{
"epoch": 2.312459651387992,
"grad_norm": 0.5965625320422764,
"learning_rate": 1.2976687004200941e-05,
"loss": 0.9504,
"step": 897
},
{
"epoch": 2.315041962556488,
"grad_norm": 0.6136358258415586,
"learning_rate": 1.2959455390455906e-05,
"loss": 0.9598,
"step": 898
},
{
"epoch": 2.317624273724984,
"grad_norm": 0.614066787514822,
"learning_rate": 1.294221414033163e-05,
"loss": 0.9151,
"step": 899
},
{
"epoch": 2.3202065848934796,
"grad_norm": 0.595393393778215,
"learning_rate": 1.2924963309967914e-05,
"loss": 0.9383,
"step": 900
},
{
"epoch": 2.3227888960619754,
"grad_norm": 0.6123276452590078,
"learning_rate": 1.2907702955535744e-05,
"loss": 0.9449,
"step": 901
},
{
"epoch": 2.325371207230471,
"grad_norm": 0.6002189347008143,
"learning_rate": 1.2890433133237129e-05,
"loss": 0.9648,
"step": 902
},
{
"epoch": 2.3279535183989672,
"grad_norm": 0.5948640736384636,
"learning_rate": 1.2873153899304898e-05,
"loss": 0.9654,
"step": 903
},
{
"epoch": 2.330535829567463,
"grad_norm": 0.6253192331451701,
"learning_rate": 1.2855865310002526e-05,
"loss": 0.9459,
"step": 904
},
{
"epoch": 2.3331181407359587,
"grad_norm": 0.6060085962717341,
"learning_rate": 1.2838567421623957e-05,
"loss": 0.9648,
"step": 905
},
{
"epoch": 2.3357004519044544,
"grad_norm": 0.5909129536256885,
"learning_rate": 1.2821260290493411e-05,
"loss": 0.9615,
"step": 906
},
{
"epoch": 2.33828276307295,
"grad_norm": 0.6033489652168267,
"learning_rate": 1.2803943972965193e-05,
"loss": 0.9822,
"step": 907
},
{
"epoch": 2.340865074241446,
"grad_norm": 0.6471948077451358,
"learning_rate": 1.278661852542354e-05,
"loss": 0.9372,
"step": 908
},
{
"epoch": 2.343447385409942,
"grad_norm": 0.5875321400886871,
"learning_rate": 1.2769284004282398e-05,
"loss": 0.9283,
"step": 909
},
{
"epoch": 2.3460296965784377,
"grad_norm": 0.57397296055963,
"learning_rate": 1.2751940465985273e-05,
"loss": 0.9443,
"step": 910
},
{
"epoch": 2.3486120077469335,
"grad_norm": 0.6083870147758043,
"learning_rate": 1.2734587967005025e-05,
"loss": 0.9911,
"step": 911
},
{
"epoch": 2.351194318915429,
"grad_norm": 0.5893684173951856,
"learning_rate": 1.2717226563843687e-05,
"loss": 0.9775,
"step": 912
},
{
"epoch": 2.3537766300839253,
"grad_norm": 0.6098963204737635,
"learning_rate": 1.26998563130323e-05,
"loss": 0.9352,
"step": 913
},
{
"epoch": 2.356358941252421,
"grad_norm": 0.6028323564667681,
"learning_rate": 1.268247727113069e-05,
"loss": 0.9535,
"step": 914
},
{
"epoch": 2.358941252420917,
"grad_norm": 0.6139836763290958,
"learning_rate": 1.2665089494727338e-05,
"loss": 0.9543,
"step": 915
},
{
"epoch": 2.3615235635894125,
"grad_norm": 0.5979010266216653,
"learning_rate": 1.2647693040439142e-05,
"loss": 0.9584,
"step": 916
},
{
"epoch": 2.3641058747579082,
"grad_norm": 0.6035572479241811,
"learning_rate": 1.2630287964911261e-05,
"loss": 0.958,
"step": 917
},
{
"epoch": 2.366688185926404,
"grad_norm": 0.5830490108904467,
"learning_rate": 1.2612874324816935e-05,
"loss": 0.9492,
"step": 918
},
{
"epoch": 2.3692704970949,
"grad_norm": 0.6049407013095448,
"learning_rate": 1.2595452176857283e-05,
"loss": 0.9215,
"step": 919
},
{
"epoch": 2.371852808263396,
"grad_norm": 0.5853837977544576,
"learning_rate": 1.2578021577761132e-05,
"loss": 0.9397,
"step": 920
},
{
"epoch": 2.3744351194318916,
"grad_norm": 0.6270536757002744,
"learning_rate": 1.2560582584284822e-05,
"loss": 0.9817,
"step": 921
},
{
"epoch": 2.3770174306003873,
"grad_norm": 0.6353840335416789,
"learning_rate": 1.2543135253212027e-05,
"loss": 0.9559,
"step": 922
},
{
"epoch": 2.379599741768883,
"grad_norm": 0.585963379760864,
"learning_rate": 1.2525679641353571e-05,
"loss": 0.9453,
"step": 923
},
{
"epoch": 2.382182052937379,
"grad_norm": 0.5894211852291655,
"learning_rate": 1.2508215805547246e-05,
"loss": 0.9251,
"step": 924
},
{
"epoch": 2.384764364105875,
"grad_norm": 0.572359971184135,
"learning_rate": 1.2490743802657614e-05,
"loss": 0.9564,
"step": 925
},
{
"epoch": 2.3873466752743706,
"grad_norm": 0.5786820742067271,
"learning_rate": 1.2473263689575835e-05,
"loss": 0.9291,
"step": 926
},
{
"epoch": 2.3899289864428663,
"grad_norm": 0.590281816939995,
"learning_rate": 1.2455775523219472e-05,
"loss": 0.9248,
"step": 927
},
{
"epoch": 2.392511297611362,
"grad_norm": 0.5851632591490395,
"learning_rate": 1.2438279360532317e-05,
"loss": 0.9558,
"step": 928
},
{
"epoch": 2.3950936087798578,
"grad_norm": 0.6073264082842632,
"learning_rate": 1.2420775258484194e-05,
"loss": 0.9152,
"step": 929
},
{
"epoch": 2.397675919948354,
"grad_norm": 0.5961742171533062,
"learning_rate": 1.2403263274070786e-05,
"loss": 0.9614,
"step": 930
},
{
"epoch": 2.4002582311168497,
"grad_norm": 0.6094671056323115,
"learning_rate": 1.238574346431343e-05,
"loss": 0.9478,
"step": 931
},
{
"epoch": 2.4028405422853454,
"grad_norm": 0.6141679641874132,
"learning_rate": 1.2368215886258952e-05,
"loss": 0.9588,
"step": 932
},
{
"epoch": 2.405422853453841,
"grad_norm": 0.5891888079618772,
"learning_rate": 1.2350680596979474e-05,
"loss": 0.9748,
"step": 933
},
{
"epoch": 2.408005164622337,
"grad_norm": 0.6220233686120056,
"learning_rate": 1.233313765357222e-05,
"loss": 0.9547,
"step": 934
},
{
"epoch": 2.410587475790833,
"grad_norm": 0.6111194107579635,
"learning_rate": 1.2315587113159342e-05,
"loss": 0.9374,
"step": 935
},
{
"epoch": 2.4131697869593287,
"grad_norm": 0.5816987304224244,
"learning_rate": 1.2298029032887725e-05,
"loss": 0.9611,
"step": 936
},
{
"epoch": 2.4157520981278244,
"grad_norm": 0.6103256535275182,
"learning_rate": 1.228046346992881e-05,
"loss": 0.9388,
"step": 937
},
{
"epoch": 2.41833440929632,
"grad_norm": 0.5861255113568193,
"learning_rate": 1.22628904814784e-05,
"loss": 0.9582,
"step": 938
},
{
"epoch": 2.420916720464816,
"grad_norm": 0.5823515712678948,
"learning_rate": 1.224531012475647e-05,
"loss": 0.9898,
"step": 939
},
{
"epoch": 2.4234990316333116,
"grad_norm": 0.609671663934881,
"learning_rate": 1.2227722457007e-05,
"loss": 0.9596,
"step": 940
},
{
"epoch": 2.4260813428018078,
"grad_norm": 0.5972948161325082,
"learning_rate": 1.221012753549776e-05,
"loss": 0.9955,
"step": 941
},
{
"epoch": 2.4286636539703035,
"grad_norm": 0.5879925977172995,
"learning_rate": 1.2192525417520159e-05,
"loss": 0.9615,
"step": 942
},
{
"epoch": 2.431245965138799,
"grad_norm": 0.6075551488590047,
"learning_rate": 1.2174916160389024e-05,
"loss": 0.9572,
"step": 943
},
{
"epoch": 2.433828276307295,
"grad_norm": 0.6113872256428539,
"learning_rate": 1.2157299821442424e-05,
"loss": 0.9671,
"step": 944
},
{
"epoch": 2.4364105874757906,
"grad_norm": 0.5838911691926075,
"learning_rate": 1.2139676458041505e-05,
"loss": 0.9352,
"step": 945
},
{
"epoch": 2.438992898644287,
"grad_norm": 0.604879771295695,
"learning_rate": 1.2122046127570268e-05,
"loss": 0.9541,
"step": 946
},
{
"epoch": 2.4415752098127825,
"grad_norm": 0.6008885632399309,
"learning_rate": 1.2104408887435413e-05,
"loss": 0.9633,
"step": 947
},
{
"epoch": 2.4441575209812783,
"grad_norm": 0.5834385132140035,
"learning_rate": 1.2086764795066128e-05,
"loss": 0.9455,
"step": 948
},
{
"epoch": 2.446739832149774,
"grad_norm": 0.6092567677261247,
"learning_rate": 1.2069113907913921e-05,
"loss": 0.9564,
"step": 949
},
{
"epoch": 2.4493221433182697,
"grad_norm": 0.5650318694461209,
"learning_rate": 1.2051456283452423e-05,
"loss": 0.97,
"step": 950
},
{
"epoch": 2.4519044544867654,
"grad_norm": 0.608288549791379,
"learning_rate": 1.2033791979177196e-05,
"loss": 0.9628,
"step": 951
},
{
"epoch": 2.4544867656552616,
"grad_norm": 0.6033407862962766,
"learning_rate": 1.2016121052605558e-05,
"loss": 0.9565,
"step": 952
},
{
"epoch": 2.4570690768237573,
"grad_norm": 0.6028336342782669,
"learning_rate": 1.1998443561276395e-05,
"loss": 0.9829,
"step": 953
},
{
"epoch": 2.459651387992253,
"grad_norm": 0.584653200324165,
"learning_rate": 1.1980759562749957e-05,
"loss": 0.9566,
"step": 954
},
{
"epoch": 2.4622336991607487,
"grad_norm": 0.6030118438156815,
"learning_rate": 1.1963069114607692e-05,
"loss": 0.9306,
"step": 955
},
{
"epoch": 2.464816010329245,
"grad_norm": 0.598687643898121,
"learning_rate": 1.1945372274452045e-05,
"loss": 0.9717,
"step": 956
},
{
"epoch": 2.4673983214977406,
"grad_norm": 0.6007026870754814,
"learning_rate": 1.1927669099906274e-05,
"loss": 0.9483,
"step": 957
},
{
"epoch": 2.4699806326662364,
"grad_norm": 0.5841035235550123,
"learning_rate": 1.1909959648614262e-05,
"loss": 0.9888,
"step": 958
},
{
"epoch": 2.472562943834732,
"grad_norm": 0.5950731809881308,
"learning_rate": 1.1892243978240332e-05,
"loss": 0.9442,
"step": 959
},
{
"epoch": 2.475145255003228,
"grad_norm": 0.6073950825590259,
"learning_rate": 1.1874522146469056e-05,
"loss": 0.9607,
"step": 960
},
{
"epoch": 2.4777275661717235,
"grad_norm": 0.5917705341695404,
"learning_rate": 1.1856794211005069e-05,
"loss": 0.9288,
"step": 961
},
{
"epoch": 2.4803098773402197,
"grad_norm": 0.5839083243509722,
"learning_rate": 1.183906022957288e-05,
"loss": 0.9676,
"step": 962
},
{
"epoch": 2.4828921885087154,
"grad_norm": 0.597493851436026,
"learning_rate": 1.182132025991669e-05,
"loss": 0.9598,
"step": 963
},
{
"epoch": 2.485474499677211,
"grad_norm": 0.5765315620556862,
"learning_rate": 1.1803574359800179e-05,
"loss": 0.9744,
"step": 964
},
{
"epoch": 2.488056810845707,
"grad_norm": 0.5912961838573095,
"learning_rate": 1.1785822587006362e-05,
"loss": 0.9847,
"step": 965
},
{
"epoch": 2.4906391220142026,
"grad_norm": 0.5816475691671312,
"learning_rate": 1.1768064999337364e-05,
"loss": 0.9411,
"step": 966
},
{
"epoch": 2.4932214331826987,
"grad_norm": 0.5846058314378276,
"learning_rate": 1.1750301654614242e-05,
"loss": 0.9693,
"step": 967
},
{
"epoch": 2.4958037443511945,
"grad_norm": 0.5830471307870174,
"learning_rate": 1.1732532610676808e-05,
"loss": 0.9354,
"step": 968
},
{
"epoch": 2.49838605551969,
"grad_norm": 0.5836564756949956,
"learning_rate": 1.1714757925383418e-05,
"loss": 0.9617,
"step": 969
},
{
"epoch": 2.500968366688186,
"grad_norm": 0.604259375943389,
"learning_rate": 1.1696977656610813e-05,
"loss": 0.9519,
"step": 970
},
{
"epoch": 2.5035506778566816,
"grad_norm": 0.5742797021433684,
"learning_rate": 1.1679191862253898e-05,
"loss": 0.9547,
"step": 971
},
{
"epoch": 2.5061329890251773,
"grad_norm": 0.5746553555926329,
"learning_rate": 1.1661400600225588e-05,
"loss": 0.9564,
"step": 972
},
{
"epoch": 2.5087153001936735,
"grad_norm": 0.5956191491364381,
"learning_rate": 1.1643603928456581e-05,
"loss": 0.9315,
"step": 973
},
{
"epoch": 2.5112976113621692,
"grad_norm": 0.5972863649697912,
"learning_rate": 1.1625801904895207e-05,
"loss": 0.9828,
"step": 974
},
{
"epoch": 2.513879922530665,
"grad_norm": 0.628215407427667,
"learning_rate": 1.1607994587507216e-05,
"loss": 0.9791,
"step": 975
},
{
"epoch": 2.5164622336991607,
"grad_norm": 0.5794209025299315,
"learning_rate": 1.1590182034275588e-05,
"loss": 0.9765,
"step": 976
},
{
"epoch": 2.5190445448676564,
"grad_norm": 0.5995540341976862,
"learning_rate": 1.157236430320037e-05,
"loss": 0.9425,
"step": 977
},
{
"epoch": 2.5216268560361526,
"grad_norm": 0.589687726091925,
"learning_rate": 1.155454145229845e-05,
"loss": 0.9269,
"step": 978
},
{
"epoch": 2.5242091672046483,
"grad_norm": 0.5903300924561746,
"learning_rate": 1.1536713539603392e-05,
"loss": 0.9515,
"step": 979
},
{
"epoch": 2.526791478373144,
"grad_norm": 0.5926698140037857,
"learning_rate": 1.1518880623165249e-05,
"loss": 0.9613,
"step": 980
},
{
"epoch": 2.5293737895416397,
"grad_norm": 0.5882141138215461,
"learning_rate": 1.1501042761050359e-05,
"loss": 0.9646,
"step": 981
},
{
"epoch": 2.5319561007101354,
"grad_norm": 0.5907114287701524,
"learning_rate": 1.1483200011341172e-05,
"loss": 0.9502,
"step": 982
},
{
"epoch": 2.534538411878631,
"grad_norm": 0.5796881924318279,
"learning_rate": 1.1465352432136041e-05,
"loss": 0.9337,
"step": 983
},
{
"epoch": 2.5371207230471273,
"grad_norm": 0.5738929122712656,
"learning_rate": 1.1447500081549054e-05,
"loss": 0.9405,
"step": 984
},
{
"epoch": 2.539703034215623,
"grad_norm": 0.580328116392153,
"learning_rate": 1.1429643017709833e-05,
"loss": 0.9539,
"step": 985
},
{
"epoch": 2.5422853453841188,
"grad_norm": 0.5881438247765939,
"learning_rate": 1.1411781298763343e-05,
"loss": 0.9313,
"step": 986
},
{
"epoch": 2.5448676565526145,
"grad_norm": 0.5885562040781032,
"learning_rate": 1.1393914982869711e-05,
"loss": 0.9425,
"step": 987
},
{
"epoch": 2.5474499677211107,
"grad_norm": 0.580594597098575,
"learning_rate": 1.1376044128204033e-05,
"loss": 0.9391,
"step": 988
},
{
"epoch": 2.5500322788896064,
"grad_norm": 0.5952429990647207,
"learning_rate": 1.1358168792956178e-05,
"loss": 0.9504,
"step": 989
},
{
"epoch": 2.552614590058102,
"grad_norm": 0.5970710014238076,
"learning_rate": 1.1340289035330614e-05,
"loss": 0.9878,
"step": 990
},
{
"epoch": 2.555196901226598,
"grad_norm": 0.6152662315238809,
"learning_rate": 1.1322404913546197e-05,
"loss": 0.9465,
"step": 991
},
{
"epoch": 2.5577792123950935,
"grad_norm": 0.6027222855714028,
"learning_rate": 1.1304516485836002e-05,
"loss": 0.971,
"step": 992
},
{
"epoch": 2.5603615235635893,
"grad_norm": 0.5918016876334783,
"learning_rate": 1.1286623810447122e-05,
"loss": 0.9652,
"step": 993
},
{
"epoch": 2.562943834732085,
"grad_norm": 0.5896636574102831,
"learning_rate": 1.1268726945640483e-05,
"loss": 0.9372,
"step": 994
},
{
"epoch": 2.565526145900581,
"grad_norm": 0.5824587400275631,
"learning_rate": 1.125082594969065e-05,
"loss": 0.9529,
"step": 995
},
{
"epoch": 2.568108457069077,
"grad_norm": 0.5697833980293927,
"learning_rate": 1.1232920880885632e-05,
"loss": 0.9554,
"step": 996
},
{
"epoch": 2.5706907682375726,
"grad_norm": 0.5801306805314953,
"learning_rate": 1.1215011797526716e-05,
"loss": 0.9268,
"step": 997
},
{
"epoch": 2.5732730794060683,
"grad_norm": 0.5965814716018379,
"learning_rate": 1.119709875792825e-05,
"loss": 0.962,
"step": 998
},
{
"epoch": 2.5758553905745645,
"grad_norm": 0.6086711337973163,
"learning_rate": 1.1179181820417469e-05,
"loss": 0.97,
"step": 999
},
{
"epoch": 2.57843770174306,
"grad_norm": 0.5785369755423095,
"learning_rate": 1.1161261043334296e-05,
"loss": 0.9495,
"step": 1000
},
{
"epoch": 2.581020012911556,
"grad_norm": 0.608023719014441,
"learning_rate": 1.1143336485031156e-05,
"loss": 0.9165,
"step": 1001
},
{
"epoch": 2.5836023240800516,
"grad_norm": 0.6320332520260791,
"learning_rate": 1.1125408203872793e-05,
"loss": 1.0028,
"step": 1002
},
{
"epoch": 2.5861846352485474,
"grad_norm": 0.5833673102474324,
"learning_rate": 1.1107476258236059e-05,
"loss": 0.942,
"step": 1003
},
{
"epoch": 2.588766946417043,
"grad_norm": 0.5888791372130312,
"learning_rate": 1.1089540706509757e-05,
"loss": 0.9548,
"step": 1004
},
{
"epoch": 2.591349257585539,
"grad_norm": 0.5862228558392754,
"learning_rate": 1.1071601607094416e-05,
"loss": 0.9096,
"step": 1005
},
{
"epoch": 2.593931568754035,
"grad_norm": 0.6178780038575998,
"learning_rate": 1.1053659018402123e-05,
"loss": 0.9539,
"step": 1006
},
{
"epoch": 2.5965138799225307,
"grad_norm": 0.6227035958216502,
"learning_rate": 1.1035712998856332e-05,
"loss": 0.9845,
"step": 1007
},
{
"epoch": 2.5990961910910264,
"grad_norm": 0.585793574816453,
"learning_rate": 1.1017763606891653e-05,
"loss": 0.9564,
"step": 1008
},
{
"epoch": 2.601678502259522,
"grad_norm": 0.6031850388726575,
"learning_rate": 1.0999810900953701e-05,
"loss": 0.966,
"step": 1009
},
{
"epoch": 2.6042608134280183,
"grad_norm": 0.6325995476999388,
"learning_rate": 1.0981854939498853e-05,
"loss": 0.934,
"step": 1010
},
{
"epoch": 2.606843124596514,
"grad_norm": 0.604370954178913,
"learning_rate": 1.0963895780994106e-05,
"loss": 0.962,
"step": 1011
},
{
"epoch": 2.6094254357650097,
"grad_norm": 0.6046507204858135,
"learning_rate": 1.0945933483916867e-05,
"loss": 0.9628,
"step": 1012
},
{
"epoch": 2.6120077469335055,
"grad_norm": 0.6055958607582257,
"learning_rate": 1.0927968106754747e-05,
"loss": 0.9724,
"step": 1013
},
{
"epoch": 2.614590058102001,
"grad_norm": 0.6142519834748665,
"learning_rate": 1.0909999708005407e-05,
"loss": 0.9859,
"step": 1014
},
{
"epoch": 2.617172369270497,
"grad_norm": 0.587585188897923,
"learning_rate": 1.0892028346176333e-05,
"loss": 0.9337,
"step": 1015
},
{
"epoch": 2.6197546804389926,
"grad_norm": 0.5775553775383109,
"learning_rate": 1.087405407978466e-05,
"loss": 0.9247,
"step": 1016
},
{
"epoch": 2.622336991607489,
"grad_norm": 0.5914086152693361,
"learning_rate": 1.0856076967356983e-05,
"loss": 0.9646,
"step": 1017
},
{
"epoch": 2.6249193027759845,
"grad_norm": 0.6252845963452488,
"learning_rate": 1.0838097067429168e-05,
"loss": 0.9783,
"step": 1018
},
{
"epoch": 2.6275016139444802,
"grad_norm": 0.5861511527646114,
"learning_rate": 1.0820114438546152e-05,
"loss": 0.9621,
"step": 1019
},
{
"epoch": 2.630083925112976,
"grad_norm": 0.5836312295046293,
"learning_rate": 1.080212913926176e-05,
"loss": 0.9554,
"step": 1020
},
{
"epoch": 2.632666236281472,
"grad_norm": 0.6040474003003209,
"learning_rate": 1.0784141228138507e-05,
"loss": 0.9516,
"step": 1021
},
{
"epoch": 2.635248547449968,
"grad_norm": 0.6355202880988752,
"learning_rate": 1.0766150763747423e-05,
"loss": 0.9789,
"step": 1022
},
{
"epoch": 2.6378308586184636,
"grad_norm": 0.5897510462672635,
"learning_rate": 1.0748157804667844e-05,
"loss": 0.9374,
"step": 1023
},
{
"epoch": 2.6404131697869593,
"grad_norm": 0.5899429810230572,
"learning_rate": 1.0730162409487233e-05,
"loss": 0.9329,
"step": 1024
},
{
"epoch": 2.642995480955455,
"grad_norm": 0.6030569126093994,
"learning_rate": 1.071216463680098e-05,
"loss": 0.9662,
"step": 1025
},
{
"epoch": 2.6455777921239507,
"grad_norm": 0.5981188227832869,
"learning_rate": 1.069416454521222e-05,
"loss": 0.9753,
"step": 1026
},
{
"epoch": 2.648160103292447,
"grad_norm": 0.618406401340536,
"learning_rate": 1.0676162193331642e-05,
"loss": 0.9729,
"step": 1027
},
{
"epoch": 2.6507424144609426,
"grad_norm": 0.5945181324122579,
"learning_rate": 1.0658157639777285e-05,
"loss": 0.9296,
"step": 1028
},
{
"epoch": 2.6533247256294383,
"grad_norm": 0.621876814177428,
"learning_rate": 1.0640150943174368e-05,
"loss": 0.9628,
"step": 1029
},
{
"epoch": 2.655907036797934,
"grad_norm": 0.5872555607480314,
"learning_rate": 1.0622142162155084e-05,
"loss": 0.9647,
"step": 1030
},
{
"epoch": 2.65848934796643,
"grad_norm": 0.6016180713767454,
"learning_rate": 1.060413135535841e-05,
"loss": 0.9489,
"step": 1031
},
{
"epoch": 2.661071659134926,
"grad_norm": 0.5963657410420156,
"learning_rate": 1.0586118581429923e-05,
"loss": 0.9476,
"step": 1032
},
{
"epoch": 2.6636539703034217,
"grad_norm": 0.5814763983307615,
"learning_rate": 1.05681038990216e-05,
"loss": 0.9463,
"step": 1033
},
{
"epoch": 2.6662362814719174,
"grad_norm": 0.5725192948619975,
"learning_rate": 1.0550087366791641e-05,
"loss": 0.9804,
"step": 1034
},
{
"epoch": 2.668818592640413,
"grad_norm": 0.5916916107783017,
"learning_rate": 1.053206904340426e-05,
"loss": 0.9629,
"step": 1035
},
{
"epoch": 2.671400903808909,
"grad_norm": 0.5904165915891584,
"learning_rate": 1.0514048987529515e-05,
"loss": 0.9579,
"step": 1036
},
{
"epoch": 2.6739832149774045,
"grad_norm": 0.5914405056148352,
"learning_rate": 1.0496027257843088e-05,
"loss": 0.9807,
"step": 1037
},
{
"epoch": 2.6765655261459007,
"grad_norm": 0.5846745644240308,
"learning_rate": 1.0478003913026125e-05,
"loss": 0.9679,
"step": 1038
},
{
"epoch": 2.6791478373143964,
"grad_norm": 0.6002766375251781,
"learning_rate": 1.045997901176503e-05,
"loss": 0.971,
"step": 1039
},
{
"epoch": 2.681730148482892,
"grad_norm": 0.5847650891279706,
"learning_rate": 1.0441952612751267e-05,
"loss": 0.9627,
"step": 1040
},
{
"epoch": 2.684312459651388,
"grad_norm": 0.6040931012169604,
"learning_rate": 1.0423924774681186e-05,
"loss": 0.9503,
"step": 1041
},
{
"epoch": 2.686894770819884,
"grad_norm": 0.5785542819032363,
"learning_rate": 1.0405895556255818e-05,
"loss": 0.9559,
"step": 1042
},
{
"epoch": 2.6894770819883798,
"grad_norm": 0.6052229883487668,
"learning_rate": 1.0387865016180688e-05,
"loss": 0.9622,
"step": 1043
},
{
"epoch": 2.6920593931568755,
"grad_norm": 0.5848263105245827,
"learning_rate": 1.0369833213165625e-05,
"loss": 0.9598,
"step": 1044
},
{
"epoch": 2.694641704325371,
"grad_norm": 0.5926309991366325,
"learning_rate": 1.035180020592457e-05,
"loss": 0.9372,
"step": 1045
},
{
"epoch": 2.697224015493867,
"grad_norm": 0.5844049554145337,
"learning_rate": 1.0333766053175391e-05,
"loss": 0.9439,
"step": 1046
},
{
"epoch": 2.6998063266623626,
"grad_norm": 0.6001743480120659,
"learning_rate": 1.031573081363968e-05,
"loss": 0.9346,
"step": 1047
},
{
"epoch": 2.7023886378308584,
"grad_norm": 0.5897380533051093,
"learning_rate": 1.0297694546042563e-05,
"loss": 0.9604,
"step": 1048
},
{
"epoch": 2.7049709489993545,
"grad_norm": 0.584956431101729,
"learning_rate": 1.0279657309112526e-05,
"loss": 0.9045,
"step": 1049
},
{
"epoch": 2.7075532601678503,
"grad_norm": 0.5712935010828868,
"learning_rate": 1.02616191615812e-05,
"loss": 0.9466,
"step": 1050
},
{
"epoch": 2.710135571336346,
"grad_norm": 0.583381386123002,
"learning_rate": 1.0243580162183189e-05,
"loss": 0.9838,
"step": 1051
},
{
"epoch": 2.7127178825048417,
"grad_norm": 0.5846652612272821,
"learning_rate": 1.0225540369655866e-05,
"loss": 0.9751,
"step": 1052
},
{
"epoch": 2.715300193673338,
"grad_norm": 0.5978067742385131,
"learning_rate": 1.0207499842739185e-05,
"loss": 0.9625,
"step": 1053
},
{
"epoch": 2.7178825048418336,
"grad_norm": 0.5853977002645502,
"learning_rate": 1.01894586401755e-05,
"loss": 0.9614,
"step": 1054
},
{
"epoch": 2.7204648160103293,
"grad_norm": 0.5983002966741684,
"learning_rate": 1.0171416820709356e-05,
"loss": 0.9373,
"step": 1055
},
{
"epoch": 2.723047127178825,
"grad_norm": 0.5856993759606652,
"learning_rate": 1.015337444308731e-05,
"loss": 0.9489,
"step": 1056
},
{
"epoch": 2.7256294383473207,
"grad_norm": 0.5901281403453162,
"learning_rate": 1.0135331566057735e-05,
"loss": 0.9332,
"step": 1057
},
{
"epoch": 2.7282117495158165,
"grad_norm": 0.5906660579573058,
"learning_rate": 1.0117288248370636e-05,
"loss": 0.9609,
"step": 1058
},
{
"epoch": 2.730794060684312,
"grad_norm": 0.6062946865104221,
"learning_rate": 1.0099244548777444e-05,
"loss": 0.9372,
"step": 1059
},
{
"epoch": 2.7333763718528084,
"grad_norm": 0.6025103390237757,
"learning_rate": 1.008120052603084e-05,
"loss": 0.9325,
"step": 1060
},
{
"epoch": 2.735958683021304,
"grad_norm": 0.6037740140636985,
"learning_rate": 1.006315623888455e-05,
"loss": 0.9407,
"step": 1061
},
{
"epoch": 2.7385409941898,
"grad_norm": 0.5818379563267816,
"learning_rate": 1.0045111746093174e-05,
"loss": 0.9565,
"step": 1062
},
{
"epoch": 2.7411233053582955,
"grad_norm": 0.5972098469584126,
"learning_rate": 1.0027067106411969e-05,
"loss": 0.9559,
"step": 1063
},
{
"epoch": 2.7437056165267917,
"grad_norm": 0.5921309288084705,
"learning_rate": 1.000902237859668e-05,
"loss": 0.9267,
"step": 1064
},
{
"epoch": 2.7462879276952874,
"grad_norm": 0.5858852838442818,
"learning_rate": 9.990977621403326e-06,
"loss": 0.9778,
"step": 1065
},
{
"epoch": 2.748870238863783,
"grad_norm": 0.5887566802759674,
"learning_rate": 9.972932893588033e-06,
"loss": 0.9054,
"step": 1066
},
{
"epoch": 2.751452550032279,
"grad_norm": 0.5706187383084692,
"learning_rate": 9.954888253906827e-06,
"loss": 0.9482,
"step": 1067
},
{
"epoch": 2.7540348612007746,
"grad_norm": 0.5737416712225011,
"learning_rate": 9.936843761115448e-06,
"loss": 0.9313,
"step": 1068
},
{
"epoch": 2.7566171723692703,
"grad_norm": 0.5618668457848085,
"learning_rate": 9.918799473969162e-06,
"loss": 0.9268,
"step": 1069
},
{
"epoch": 2.7591994835377665,
"grad_norm": 0.5945215622528138,
"learning_rate": 9.90075545122256e-06,
"loss": 0.9708,
"step": 1070
},
{
"epoch": 2.761781794706262,
"grad_norm": 0.5965929940159351,
"learning_rate": 9.882711751629368e-06,
"loss": 0.9618,
"step": 1071
},
{
"epoch": 2.764364105874758,
"grad_norm": 0.6238969650308814,
"learning_rate": 9.864668433942266e-06,
"loss": 0.9206,
"step": 1072
},
{
"epoch": 2.7669464170432536,
"grad_norm": 0.561902457075373,
"learning_rate": 9.84662555691269e-06,
"loss": 0.9762,
"step": 1073
},
{
"epoch": 2.76952872821175,
"grad_norm": 0.6148930289646558,
"learning_rate": 9.828583179290645e-06,
"loss": 0.9293,
"step": 1074
},
{
"epoch": 2.7721110393802455,
"grad_norm": 0.5816613771287756,
"learning_rate": 9.810541359824501e-06,
"loss": 0.9591,
"step": 1075
},
{
"epoch": 2.774693350548741,
"grad_norm": 0.6121639894598173,
"learning_rate": 9.792500157260816e-06,
"loss": 0.9727,
"step": 1076
},
{
"epoch": 2.777275661717237,
"grad_norm": 0.5738661064344951,
"learning_rate": 9.774459630344137e-06,
"loss": 0.9067,
"step": 1077
},
{
"epoch": 2.7798579728857327,
"grad_norm": 0.5745834880727902,
"learning_rate": 9.756419837816811e-06,
"loss": 0.9283,
"step": 1078
},
{
"epoch": 2.7824402840542284,
"grad_norm": 0.6019753697435574,
"learning_rate": 9.738380838418804e-06,
"loss": 0.9414,
"step": 1079
},
{
"epoch": 2.785022595222724,
"grad_norm": 0.5919495172527766,
"learning_rate": 9.720342690887477e-06,
"loss": 0.9464,
"step": 1080
},
{
"epoch": 2.7876049063912203,
"grad_norm": 0.5868873130752621,
"learning_rate": 9.702305453957439e-06,
"loss": 0.9589,
"step": 1081
},
{
"epoch": 2.790187217559716,
"grad_norm": 0.5951626550396919,
"learning_rate": 9.684269186360325e-06,
"loss": 0.9559,
"step": 1082
},
{
"epoch": 2.7927695287282117,
"grad_norm": 0.6004222716865213,
"learning_rate": 9.666233946824612e-06,
"loss": 0.9812,
"step": 1083
},
{
"epoch": 2.7953518398967074,
"grad_norm": 0.5696400324744211,
"learning_rate": 9.648199794075433e-06,
"loss": 0.9503,
"step": 1084
},
{
"epoch": 2.7979341510652036,
"grad_norm": 0.5743980783747284,
"learning_rate": 9.630166786834378e-06,
"loss": 0.935,
"step": 1085
},
{
"epoch": 2.8005164622336993,
"grad_norm": 0.5751575788089159,
"learning_rate": 9.612134983819316e-06,
"loss": 0.9294,
"step": 1086
},
{
"epoch": 2.803098773402195,
"grad_norm": 0.5735290035862011,
"learning_rate": 9.594104443744184e-06,
"loss": 0.9326,
"step": 1087
},
{
"epoch": 2.8056810845706908,
"grad_norm": 0.5850179347042352,
"learning_rate": 9.576075225318817e-06,
"loss": 0.9489,
"step": 1088
},
{
"epoch": 2.8082633957391865,
"grad_norm": 0.5751681929532767,
"learning_rate": 9.558047387248736e-06,
"loss": 0.933,
"step": 1089
},
{
"epoch": 2.810845706907682,
"grad_norm": 0.5767517016594284,
"learning_rate": 9.540020988234972e-06,
"loss": 0.9688,
"step": 1090
},
{
"epoch": 2.813428018076178,
"grad_norm": 0.5834104537340995,
"learning_rate": 9.521996086973877e-06,
"loss": 1.0005,
"step": 1091
},
{
"epoch": 2.816010329244674,
"grad_norm": 0.6121890401561288,
"learning_rate": 9.503972742156917e-06,
"loss": 0.9683,
"step": 1092
},
{
"epoch": 2.81859264041317,
"grad_norm": 0.5780502941320363,
"learning_rate": 9.485951012470491e-06,
"loss": 0.9651,
"step": 1093
},
{
"epoch": 2.8211749515816655,
"grad_norm": 0.5872895683685423,
"learning_rate": 9.467930956595742e-06,
"loss": 0.9497,
"step": 1094
},
{
"epoch": 2.8237572627501613,
"grad_norm": 0.6012252860645063,
"learning_rate": 9.449912633208362e-06,
"loss": 0.962,
"step": 1095
},
{
"epoch": 2.8263395739186574,
"grad_norm": 0.5812288173466004,
"learning_rate": 9.431896100978402e-06,
"loss": 0.9516,
"step": 1096
},
{
"epoch": 2.828921885087153,
"grad_norm": 0.5903667057899601,
"learning_rate": 9.413881418570082e-06,
"loss": 0.933,
"step": 1097
},
{
"epoch": 2.831504196255649,
"grad_norm": 0.574567344834327,
"learning_rate": 9.395868644641594e-06,
"loss": 0.9311,
"step": 1098
},
{
"epoch": 2.8340865074241446,
"grad_norm": 0.6029860483322287,
"learning_rate": 9.37785783784492e-06,
"loss": 0.9365,
"step": 1099
},
{
"epoch": 2.8366688185926403,
"grad_norm": 0.5885522147718864,
"learning_rate": 9.359849056825632e-06,
"loss": 0.9375,
"step": 1100
},
{
"epoch": 2.839251129761136,
"grad_norm": 0.5897382549514845,
"learning_rate": 9.341842360222717e-06,
"loss": 0.9568,
"step": 1101
},
{
"epoch": 2.8418334409296317,
"grad_norm": 0.5970771697415443,
"learning_rate": 9.323837806668363e-06,
"loss": 0.9544,
"step": 1102
},
{
"epoch": 2.844415752098128,
"grad_norm": 0.5998731951808198,
"learning_rate": 9.305835454787784e-06,
"loss": 0.9668,
"step": 1103
},
{
"epoch": 2.8469980632666236,
"grad_norm": 0.5820942005741839,
"learning_rate": 9.287835363199026e-06,
"loss": 0.9552,
"step": 1104
},
{
"epoch": 2.8495803744351194,
"grad_norm": 0.6084126869227644,
"learning_rate": 9.269837590512768e-06,
"loss": 0.9628,
"step": 1105
},
{
"epoch": 2.852162685603615,
"grad_norm": 0.66554698305709,
"learning_rate": 9.25184219533216e-06,
"loss": 0.9367,
"step": 1106
},
{
"epoch": 2.8547449967721112,
"grad_norm": 0.5807306091688449,
"learning_rate": 9.23384923625258e-06,
"loss": 0.9692,
"step": 1107
},
{
"epoch": 2.857327307940607,
"grad_norm": 0.5837765762229058,
"learning_rate": 9.215858771861495e-06,
"loss": 0.9355,
"step": 1108
},
{
"epoch": 2.8599096191091027,
"grad_norm": 0.6108951996108233,
"learning_rate": 9.197870860738245e-06,
"loss": 0.9618,
"step": 1109
},
{
"epoch": 2.8624919302775984,
"grad_norm": 0.6085207165359778,
"learning_rate": 9.17988556145385e-06,
"loss": 0.9749,
"step": 1110
},
{
"epoch": 2.865074241446094,
"grad_norm": 0.6014777439249565,
"learning_rate": 9.161902932570837e-06,
"loss": 0.9419,
"step": 1111
},
{
"epoch": 2.86765655261459,
"grad_norm": 0.5740295354518736,
"learning_rate": 9.143923032643019e-06,
"loss": 0.9325,
"step": 1112
},
{
"epoch": 2.870238863783086,
"grad_norm": 0.5824503091707712,
"learning_rate": 9.125945920215344e-06,
"loss": 0.9624,
"step": 1113
},
{
"epoch": 2.8728211749515817,
"grad_norm": 0.5881119183147646,
"learning_rate": 9.10797165382367e-06,
"loss": 0.9604,
"step": 1114
},
{
"epoch": 2.8754034861200775,
"grad_norm": 0.591602227679226,
"learning_rate": 9.090000291994596e-06,
"loss": 0.9522,
"step": 1115
},
{
"epoch": 2.877985797288573,
"grad_norm": 0.5894398262140761,
"learning_rate": 9.072031893245256e-06,
"loss": 0.9447,
"step": 1116
},
{
"epoch": 2.8805681084570693,
"grad_norm": 0.5843901076209989,
"learning_rate": 9.054066516083138e-06,
"loss": 0.9651,
"step": 1117
},
{
"epoch": 2.883150419625565,
"grad_norm": 0.5830155470269734,
"learning_rate": 9.036104219005895e-06,
"loss": 0.9391,
"step": 1118
},
{
"epoch": 2.885732730794061,
"grad_norm": 0.5795525849711025,
"learning_rate": 9.018145060501152e-06,
"loss": 0.9522,
"step": 1119
},
{
"epoch": 2.8883150419625565,
"grad_norm": 0.5722538427227781,
"learning_rate": 9.000189099046306e-06,
"loss": 0.9652,
"step": 1120
},
{
"epoch": 2.8908973531310522,
"grad_norm": 0.5834430509021916,
"learning_rate": 8.982236393108349e-06,
"loss": 0.9573,
"step": 1121
},
{
"epoch": 2.893479664299548,
"grad_norm": 0.5834678810709014,
"learning_rate": 8.964287001143672e-06,
"loss": 0.9901,
"step": 1122
},
{
"epoch": 2.8960619754680437,
"grad_norm": 0.6148204310068593,
"learning_rate": 8.946340981597879e-06,
"loss": 0.9392,
"step": 1123
},
{
"epoch": 2.89864428663654,
"grad_norm": 0.5823218513706327,
"learning_rate": 8.92839839290559e-06,
"loss": 0.9595,
"step": 1124
},
{
"epoch": 2.9012265978050356,
"grad_norm": 0.5970215090631561,
"learning_rate": 8.910459293490248e-06,
"loss": 0.9334,
"step": 1125
},
{
"epoch": 2.9038089089735313,
"grad_norm": 0.5998590562400262,
"learning_rate": 8.892523741763945e-06,
"loss": 0.9442,
"step": 1126
},
{
"epoch": 2.906391220142027,
"grad_norm": 0.6101328337460503,
"learning_rate": 8.874591796127213e-06,
"loss": 0.9584,
"step": 1127
},
{
"epoch": 2.908973531310523,
"grad_norm": 0.6129595613398248,
"learning_rate": 8.856663514968845e-06,
"loss": 0.9524,
"step": 1128
},
{
"epoch": 2.911555842479019,
"grad_norm": 0.603793328877133,
"learning_rate": 8.838738956665709e-06,
"loss": 0.9197,
"step": 1129
},
{
"epoch": 2.9141381536475146,
"grad_norm": 0.5909017292382529,
"learning_rate": 8.820818179582533e-06,
"loss": 0.9405,
"step": 1130
},
{
"epoch": 2.9167204648160103,
"grad_norm": 0.5831175832584677,
"learning_rate": 8.802901242071751e-06,
"loss": 0.9346,
"step": 1131
},
{
"epoch": 2.919302775984506,
"grad_norm": 0.5837762849349555,
"learning_rate": 8.784988202473284e-06,
"loss": 0.9333,
"step": 1132
},
{
"epoch": 2.9218850871530018,
"grad_norm": 0.5848385359972617,
"learning_rate": 8.76707911911437e-06,
"loss": 0.9551,
"step": 1133
},
{
"epoch": 2.9244673983214975,
"grad_norm": 0.6253939331925262,
"learning_rate": 8.749174050309357e-06,
"loss": 0.9813,
"step": 1134
},
{
"epoch": 2.9270497094899937,
"grad_norm": 0.5845874358674058,
"learning_rate": 8.73127305435952e-06,
"loss": 0.9567,
"step": 1135
},
{
"epoch": 2.9296320206584894,
"grad_norm": 0.5953343497217751,
"learning_rate": 8.71337618955288e-06,
"loss": 0.9953,
"step": 1136
},
{
"epoch": 2.932214331826985,
"grad_norm": 0.6030060364823723,
"learning_rate": 8.695483514163998e-06,
"loss": 0.9455,
"step": 1137
},
{
"epoch": 2.934796642995481,
"grad_norm": 0.5854700296562423,
"learning_rate": 8.677595086453808e-06,
"loss": 0.9408,
"step": 1138
},
{
"epoch": 2.937378954163977,
"grad_norm": 0.5891938385206548,
"learning_rate": 8.65971096466939e-06,
"loss": 0.9547,
"step": 1139
},
{
"epoch": 2.9399612653324727,
"grad_norm": 0.6001369316354844,
"learning_rate": 8.641831207043823e-06,
"loss": 0.9686,
"step": 1140
},
{
"epoch": 2.9425435765009684,
"grad_norm": 0.5815314396468559,
"learning_rate": 8.62395587179597e-06,
"loss": 0.9582,
"step": 1141
},
{
"epoch": 2.945125887669464,
"grad_norm": 0.5918140434178532,
"learning_rate": 8.606085017130289e-06,
"loss": 0.9825,
"step": 1142
},
{
"epoch": 2.94770819883796,
"grad_norm": 0.5892397810416701,
"learning_rate": 8.588218701236662e-06,
"loss": 0.944,
"step": 1143
},
{
"epoch": 2.9502905100064556,
"grad_norm": 0.5832582968436837,
"learning_rate": 8.570356982290172e-06,
"loss": 0.9375,
"step": 1144
},
{
"epoch": 2.9528728211749513,
"grad_norm": 0.6173280182644898,
"learning_rate": 8.552499918450949e-06,
"loss": 0.9782,
"step": 1145
},
{
"epoch": 2.9554551323434475,
"grad_norm": 0.584992585761828,
"learning_rate": 8.534647567863962e-06,
"loss": 0.9657,
"step": 1146
},
{
"epoch": 2.958037443511943,
"grad_norm": 0.6102553737687162,
"learning_rate": 8.516799988658833e-06,
"loss": 0.9371,
"step": 1147
},
{
"epoch": 2.960619754680439,
"grad_norm": 0.5813839505299179,
"learning_rate": 8.498957238949645e-06,
"loss": 0.9702,
"step": 1148
},
{
"epoch": 2.9632020658489346,
"grad_norm": 0.5873347150678367,
"learning_rate": 8.481119376834753e-06,
"loss": 0.9843,
"step": 1149
},
{
"epoch": 2.965784377017431,
"grad_norm": 0.6111402642438966,
"learning_rate": 8.46328646039661e-06,
"loss": 0.9697,
"step": 1150
},
{
"epoch": 2.9683666881859265,
"grad_norm": 0.5922931684597259,
"learning_rate": 8.445458547701555e-06,
"loss": 0.9627,
"step": 1151
},
{
"epoch": 2.9709489993544222,
"grad_norm": 0.5851245117575304,
"learning_rate": 8.427635696799636e-06,
"loss": 0.9215,
"step": 1152
},
{
"epoch": 2.973531310522918,
"grad_norm": 0.5804327721924878,
"learning_rate": 8.409817965724413e-06,
"loss": 0.9716,
"step": 1153
},
{
"epoch": 2.9761136216914137,
"grad_norm": 0.6003712250873723,
"learning_rate": 8.392005412492788e-06,
"loss": 0.9648,
"step": 1154
},
{
"epoch": 2.9786959328599094,
"grad_norm": 0.6082518706572542,
"learning_rate": 8.374198095104795e-06,
"loss": 0.95,
"step": 1155
},
{
"epoch": 2.9812782440284056,
"grad_norm": 0.5866011566920423,
"learning_rate": 8.356396071543422e-06,
"loss": 0.9444,
"step": 1156
},
{
"epoch": 2.9838605551969013,
"grad_norm": 0.6114880019803942,
"learning_rate": 8.338599399774417e-06,
"loss": 0.9693,
"step": 1157
},
{
"epoch": 2.986442866365397,
"grad_norm": 0.5927950336965607,
"learning_rate": 8.320808137746104e-06,
"loss": 0.9667,
"step": 1158
},
{
"epoch": 2.9890251775338927,
"grad_norm": 0.6169016547178486,
"learning_rate": 8.303022343389188e-06,
"loss": 0.9406,
"step": 1159
},
{
"epoch": 2.991607488702389,
"grad_norm": 0.582024586435705,
"learning_rate": 8.285242074616582e-06,
"loss": 0.9729,
"step": 1160
},
{
"epoch": 2.9941897998708846,
"grad_norm": 0.5944691597316901,
"learning_rate": 8.267467389323197e-06,
"loss": 0.9649,
"step": 1161
},
{
"epoch": 2.9967721110393803,
"grad_norm": 0.5916217281333404,
"learning_rate": 8.249698345385761e-06,
"loss": 0.9567,
"step": 1162
},
{
"epoch": 2.999354422207876,
"grad_norm": 0.5736806203482997,
"learning_rate": 8.231935000662641e-06,
"loss": 0.9526,
"step": 1163
},
{
"epoch": 3.0,
"grad_norm": 0.5736806203482997,
"learning_rate": 8.21417741299364e-06,
"loss": 0.8611,
"step": 1164
},
{
"epoch": 3.0025823111684957,
"grad_norm": 1.3479570480783307,
"learning_rate": 8.196425640199823e-06,
"loss": 0.8352,
"step": 1165
},
{
"epoch": 3.0051646223369914,
"grad_norm": 1.1968748953729984,
"learning_rate": 8.178679740083317e-06,
"loss": 0.8032,
"step": 1166
},
{
"epoch": 3.0077469335054876,
"grad_norm": 0.9744412992683603,
"learning_rate": 8.160939770427122e-06,
"loss": 0.811,
"step": 1167
},
{
"epoch": 3.0103292446739833,
"grad_norm": 0.8092124569788149,
"learning_rate": 8.143205788994933e-06,
"loss": 0.8442,
"step": 1168
},
{
"epoch": 3.012911555842479,
"grad_norm": 1.1536699236828805,
"learning_rate": 8.125477853530944e-06,
"loss": 0.8623,
"step": 1169
},
{
"epoch": 3.0154938670109748,
"grad_norm": 1.4343186656583924,
"learning_rate": 8.107756021759673e-06,
"loss": 0.7984,
"step": 1170
},
{
"epoch": 3.0180761781794705,
"grad_norm": 1.1928449593644268,
"learning_rate": 8.090040351385741e-06,
"loss": 0.8323,
"step": 1171
},
{
"epoch": 3.020658489347966,
"grad_norm": 0.9892158368981517,
"learning_rate": 8.072330900093728e-06,
"loss": 0.8219,
"step": 1172
},
{
"epoch": 3.0232408005164624,
"grad_norm": 0.9513434336373858,
"learning_rate": 8.054627725547958e-06,
"loss": 0.7942,
"step": 1173
},
{
"epoch": 3.025823111684958,
"grad_norm": 0.9160615847863206,
"learning_rate": 8.036930885392308e-06,
"loss": 0.824,
"step": 1174
},
{
"epoch": 3.028405422853454,
"grad_norm": 0.9712632670523301,
"learning_rate": 8.019240437250046e-06,
"loss": 0.8105,
"step": 1175
},
{
"epoch": 3.0309877340219495,
"grad_norm": 0.888585643517764,
"learning_rate": 8.001556438723608e-06,
"loss": 0.8133,
"step": 1176
},
{
"epoch": 3.0335700451904453,
"grad_norm": 0.8336632805823568,
"learning_rate": 7.983878947394444e-06,
"loss": 0.8087,
"step": 1177
},
{
"epoch": 3.0361523563589414,
"grad_norm": 0.9159983137263322,
"learning_rate": 7.966208020822808e-06,
"loss": 0.8458,
"step": 1178
},
{
"epoch": 3.038734667527437,
"grad_norm": 0.9601758597787429,
"learning_rate": 7.948543716547584e-06,
"loss": 0.8261,
"step": 1179
},
{
"epoch": 3.041316978695933,
"grad_norm": 0.876187258797956,
"learning_rate": 7.930886092086084e-06,
"loss": 0.8018,
"step": 1180
},
{
"epoch": 3.0438992898644286,
"grad_norm": 0.8319336547056765,
"learning_rate": 7.913235204933873e-06,
"loss": 0.8301,
"step": 1181
},
{
"epoch": 3.0464816010329243,
"grad_norm": 0.870478623487457,
"learning_rate": 7.895591112564588e-06,
"loss": 0.793,
"step": 1182
},
{
"epoch": 3.0490639122014205,
"grad_norm": 0.8628894314897713,
"learning_rate": 7.877953872429734e-06,
"loss": 0.8174,
"step": 1183
},
{
"epoch": 3.051646223369916,
"grad_norm": 0.7872403489116827,
"learning_rate": 7.8603235419585e-06,
"loss": 0.8163,
"step": 1184
},
{
"epoch": 3.054228534538412,
"grad_norm": 0.7851405920047361,
"learning_rate": 7.84270017855758e-06,
"loss": 0.8178,
"step": 1185
},
{
"epoch": 3.0568108457069076,
"grad_norm": 0.8157181746918352,
"learning_rate": 7.825083839610981e-06,
"loss": 0.7963,
"step": 1186
},
{
"epoch": 3.0593931568754034,
"grad_norm": 0.8180215928459832,
"learning_rate": 7.807474582479841e-06,
"loss": 0.8148,
"step": 1187
},
{
"epoch": 3.061975468043899,
"grad_norm": 0.8088389505024169,
"learning_rate": 7.789872464502241e-06,
"loss": 0.827,
"step": 1188
},
{
"epoch": 3.0645577792123952,
"grad_norm": 0.7907227679234932,
"learning_rate": 7.772277542993006e-06,
"loss": 0.8407,
"step": 1189
},
{
"epoch": 3.067140090380891,
"grad_norm": 0.8168926580368819,
"learning_rate": 7.754689875243536e-06,
"loss": 0.8252,
"step": 1190
},
{
"epoch": 3.0697224015493867,
"grad_norm": 0.8128108171059767,
"learning_rate": 7.737109518521604e-06,
"loss": 0.811,
"step": 1191
},
{
"epoch": 3.0723047127178824,
"grad_norm": 0.7723477729268966,
"learning_rate": 7.71953653007119e-06,
"loss": 0.8018,
"step": 1192
},
{
"epoch": 3.074887023886378,
"grad_norm": 0.7771679955089591,
"learning_rate": 7.701970967112278e-06,
"loss": 0.8206,
"step": 1193
},
{
"epoch": 3.0774693350548743,
"grad_norm": 0.7947531953649853,
"learning_rate": 7.684412886840662e-06,
"loss": 0.8374,
"step": 1194
},
{
"epoch": 3.08005164622337,
"grad_norm": 0.7797780708525804,
"learning_rate": 7.666862346427784e-06,
"loss": 0.809,
"step": 1195
},
{
"epoch": 3.0826339573918657,
"grad_norm": 0.7951018870568382,
"learning_rate": 7.649319403020528e-06,
"loss": 0.8148,
"step": 1196
},
{
"epoch": 3.0852162685603615,
"grad_norm": 0.7768045025376982,
"learning_rate": 7.631784113741048e-06,
"loss": 0.7905,
"step": 1197
},
{
"epoch": 3.087798579728857,
"grad_norm": 0.7380091530118719,
"learning_rate": 7.614256535686574e-06,
"loss": 0.8277,
"step": 1198
},
{
"epoch": 3.090380890897353,
"grad_norm": 0.8090369362037133,
"learning_rate": 7.596736725929218e-06,
"loss": 0.7897,
"step": 1199
},
{
"epoch": 3.092963202065849,
"grad_norm": 0.7816172334191853,
"learning_rate": 7.579224741515808e-06,
"loss": 0.801,
"step": 1200
},
{
"epoch": 3.095545513234345,
"grad_norm": 0.7716968909350221,
"learning_rate": 7.561720639467684e-06,
"loss": 0.8253,
"step": 1201
},
{
"epoch": 3.0981278244028405,
"grad_norm": 0.7767223781947307,
"learning_rate": 7.544224476780534e-06,
"loss": 0.8171,
"step": 1202
},
{
"epoch": 3.1007101355713362,
"grad_norm": 0.7780640346641391,
"learning_rate": 7.52673631042417e-06,
"loss": 0.8142,
"step": 1203
},
{
"epoch": 3.103292446739832,
"grad_norm": 0.7678875942864142,
"learning_rate": 7.509256197342389e-06,
"loss": 0.8437,
"step": 1204
},
{
"epoch": 3.105874757908328,
"grad_norm": 0.7741428737890553,
"learning_rate": 7.491784194452756e-06,
"loss": 0.7948,
"step": 1205
},
{
"epoch": 3.108457069076824,
"grad_norm": 0.7445434454135789,
"learning_rate": 7.4743203586464286e-06,
"loss": 0.8186,
"step": 1206
},
{
"epoch": 3.1110393802453196,
"grad_norm": 0.7308461534374082,
"learning_rate": 7.45686474678798e-06,
"loss": 0.8117,
"step": 1207
},
{
"epoch": 3.1136216914138153,
"grad_norm": 0.7624570651090968,
"learning_rate": 7.4394174157151826e-06,
"loss": 0.8184,
"step": 1208
},
{
"epoch": 3.116204002582311,
"grad_norm": 0.7787385810762857,
"learning_rate": 7.421978422238871e-06,
"loss": 0.8051,
"step": 1209
},
{
"epoch": 3.118786313750807,
"grad_norm": 0.7487622485166701,
"learning_rate": 7.404547823142718e-06,
"loss": 0.8065,
"step": 1210
},
{
"epoch": 3.121368624919303,
"grad_norm": 0.7700484213439688,
"learning_rate": 7.387125675183069e-06,
"loss": 0.7893,
"step": 1211
},
{
"epoch": 3.1239509360877986,
"grad_norm": 0.7498057989796449,
"learning_rate": 7.369712035088743e-06,
"loss": 0.8271,
"step": 1212
},
{
"epoch": 3.1265332472562943,
"grad_norm": 0.782447832053478,
"learning_rate": 7.352306959560862e-06,
"loss": 0.8177,
"step": 1213
},
{
"epoch": 3.12911555842479,
"grad_norm": 0.760945467135789,
"learning_rate": 7.3349105052726635e-06,
"loss": 0.8016,
"step": 1214
},
{
"epoch": 3.131697869593286,
"grad_norm": 0.7451691837423764,
"learning_rate": 7.317522728869308e-06,
"loss": 0.8292,
"step": 1215
},
{
"epoch": 3.134280180761782,
"grad_norm": 0.7594539784955314,
"learning_rate": 7.3001436869677056e-06,
"loss": 0.8363,
"step": 1216
},
{
"epoch": 3.1368624919302777,
"grad_norm": 0.7551959784047992,
"learning_rate": 7.2827734361563154e-06,
"loss": 0.8193,
"step": 1217
},
{
"epoch": 3.1394448030987734,
"grad_norm": 0.7523949203336101,
"learning_rate": 7.265412032994977e-06,
"loss": 0.8365,
"step": 1218
},
{
"epoch": 3.142027114267269,
"grad_norm": 0.7736463491191788,
"learning_rate": 7.248059534014728e-06,
"loss": 0.7735,
"step": 1219
},
{
"epoch": 3.144609425435765,
"grad_norm": 0.7260637259865401,
"learning_rate": 7.230715995717605e-06,
"loss": 0.816,
"step": 1220
},
{
"epoch": 3.147191736604261,
"grad_norm": 0.7971984476822972,
"learning_rate": 7.213381474576465e-06,
"loss": 0.844,
"step": 1221
},
{
"epoch": 3.1497740477727567,
"grad_norm": 0.7598850752183374,
"learning_rate": 7.19605602703481e-06,
"loss": 0.7923,
"step": 1222
},
{
"epoch": 3.1523563589412524,
"grad_norm": 0.7608019681518811,
"learning_rate": 7.178739709506592e-06,
"loss": 0.818,
"step": 1223
},
{
"epoch": 3.154938670109748,
"grad_norm": 0.7773577191907428,
"learning_rate": 7.161432578376042e-06,
"loss": 0.8353,
"step": 1224
},
{
"epoch": 3.157520981278244,
"grad_norm": 0.7551689941223817,
"learning_rate": 7.144134689997475e-06,
"loss": 0.8366,
"step": 1225
},
{
"epoch": 3.16010329244674,
"grad_norm": 0.7696289215692551,
"learning_rate": 7.126846100695105e-06,
"loss": 0.831,
"step": 1226
},
{
"epoch": 3.1626856036152358,
"grad_norm": 0.7600151124859899,
"learning_rate": 7.109566866762874e-06,
"loss": 0.8073,
"step": 1227
},
{
"epoch": 3.1652679147837315,
"grad_norm": 0.7520515666346982,
"learning_rate": 7.092297044464256e-06,
"loss": 0.8344,
"step": 1228
},
{
"epoch": 3.167850225952227,
"grad_norm": 0.7818985132603024,
"learning_rate": 7.075036690032088e-06,
"loss": 0.8273,
"step": 1229
},
{
"epoch": 3.170432537120723,
"grad_norm": 0.7438737448683109,
"learning_rate": 7.057785859668373e-06,
"loss": 0.8292,
"step": 1230
},
{
"epoch": 3.1730148482892186,
"grad_norm": 0.7604238311874598,
"learning_rate": 7.040544609544098e-06,
"loss": 0.806,
"step": 1231
},
{
"epoch": 3.175597159457715,
"grad_norm": 0.7739278944618028,
"learning_rate": 7.023312995799062e-06,
"loss": 0.8321,
"step": 1232
},
{
"epoch": 3.1781794706262105,
"grad_norm": 0.7829719049826178,
"learning_rate": 7.006091074541684e-06,
"loss": 0.8207,
"step": 1233
},
{
"epoch": 3.1807617817947063,
"grad_norm": 0.8051283397017396,
"learning_rate": 6.988878901848829e-06,
"loss": 0.7937,
"step": 1234
},
{
"epoch": 3.183344092963202,
"grad_norm": 0.7723245876655893,
"learning_rate": 6.9716765337656034e-06,
"loss": 0.7945,
"step": 1235
},
{
"epoch": 3.1859264041316977,
"grad_norm": 0.7838025063241568,
"learning_rate": 6.954484026305208e-06,
"loss": 0.7946,
"step": 1236
},
{
"epoch": 3.188508715300194,
"grad_norm": 0.7307107229399178,
"learning_rate": 6.937301435448725e-06,
"loss": 0.7995,
"step": 1237
},
{
"epoch": 3.1910910264686896,
"grad_norm": 0.8052398372954221,
"learning_rate": 6.920128817144946e-06,
"loss": 0.8201,
"step": 1238
},
{
"epoch": 3.1936733376371853,
"grad_norm": 0.760805158015623,
"learning_rate": 6.9029662273102015e-06,
"loss": 0.7999,
"step": 1239
},
{
"epoch": 3.196255648805681,
"grad_norm": 0.7483396811127839,
"learning_rate": 6.885813721828149e-06,
"loss": 0.7988,
"step": 1240
},
{
"epoch": 3.1988379599741767,
"grad_norm": 0.7404586071162459,
"learning_rate": 6.868671356549628e-06,
"loss": 0.8092,
"step": 1241
},
{
"epoch": 3.2014202711426725,
"grad_norm": 0.7813265679668377,
"learning_rate": 6.851539187292453e-06,
"loss": 0.8358,
"step": 1242
},
{
"epoch": 3.2040025823111686,
"grad_norm": 0.7759146212310302,
"learning_rate": 6.83441726984124e-06,
"loss": 0.8228,
"step": 1243
},
{
"epoch": 3.2065848934796644,
"grad_norm": 0.7818503076848575,
"learning_rate": 6.81730565994722e-06,
"loss": 0.8149,
"step": 1244
},
{
"epoch": 3.20916720464816,
"grad_norm": 0.7402110582844729,
"learning_rate": 6.800204413328062e-06,
"loss": 0.8388,
"step": 1245
},
{
"epoch": 3.211749515816656,
"grad_norm": 0.738425464653366,
"learning_rate": 6.7831135856677e-06,
"loss": 0.8089,
"step": 1246
},
{
"epoch": 3.2143318269851515,
"grad_norm": 0.777277503906415,
"learning_rate": 6.766033232616131e-06,
"loss": 0.8233,
"step": 1247
},
{
"epoch": 3.2169141381536477,
"grad_norm": 0.7530080273180854,
"learning_rate": 6.748963409789253e-06,
"loss": 0.82,
"step": 1248
},
{
"epoch": 3.2194964493221434,
"grad_norm": 0.7527460534251285,
"learning_rate": 6.731904172768668e-06,
"loss": 0.7935,
"step": 1249
},
{
"epoch": 3.222078760490639,
"grad_norm": 0.7417948760299368,
"learning_rate": 6.714855577101515e-06,
"loss": 0.81,
"step": 1250
},
{
"epoch": 3.224661071659135,
"grad_norm": 0.76497441596248,
"learning_rate": 6.697817678300287e-06,
"loss": 0.8134,
"step": 1251
},
{
"epoch": 3.2272433828276306,
"grad_norm": 0.7743095167259862,
"learning_rate": 6.680790531842641e-06,
"loss": 0.8158,
"step": 1252
},
{
"epoch": 3.2298256939961267,
"grad_norm": 0.7564105405805621,
"learning_rate": 6.6637741931712204e-06,
"loss": 0.8139,
"step": 1253
},
{
"epoch": 3.2324080051646225,
"grad_norm": 0.7569395840964698,
"learning_rate": 6.646768717693484e-06,
"loss": 0.8178,
"step": 1254
},
{
"epoch": 3.234990316333118,
"grad_norm": 0.7669325942851178,
"learning_rate": 6.629774160781511e-06,
"loss": 0.824,
"step": 1255
},
{
"epoch": 3.237572627501614,
"grad_norm": 0.7858446496283839,
"learning_rate": 6.612790577771835e-06,
"loss": 0.8176,
"step": 1256
},
{
"epoch": 3.2401549386701096,
"grad_norm": 0.756498502430699,
"learning_rate": 6.59581802396526e-06,
"loss": 0.8322,
"step": 1257
},
{
"epoch": 3.242737249838606,
"grad_norm": 0.7523880712468195,
"learning_rate": 6.578856554626665e-06,
"loss": 0.8179,
"step": 1258
},
{
"epoch": 3.2453195610071015,
"grad_norm": 0.7418381111698618,
"learning_rate": 6.561906224984844e-06,
"loss": 0.8214,
"step": 1259
},
{
"epoch": 3.2479018721755972,
"grad_norm": 0.748062534762086,
"learning_rate": 6.544967090232321e-06,
"loss": 0.8325,
"step": 1260
},
{
"epoch": 3.250484183344093,
"grad_norm": 0.7830260472124719,
"learning_rate": 6.5280392055251696e-06,
"loss": 0.8245,
"step": 1261
},
{
"epoch": 3.2530664945125887,
"grad_norm": 0.7580297797282579,
"learning_rate": 6.511122625982815e-06,
"loss": 0.8269,
"step": 1262
},
{
"epoch": 3.2556488056810844,
"grad_norm": 0.7545843411413197,
"learning_rate": 6.494217406687893e-06,
"loss": 0.8242,
"step": 1263
},
{
"epoch": 3.2582311168495806,
"grad_norm": 0.7439461629106354,
"learning_rate": 6.477323602686039e-06,
"loss": 0.8087,
"step": 1264
},
{
"epoch": 3.2608134280180763,
"grad_norm": 0.7693068257824085,
"learning_rate": 6.460441268985715e-06,
"loss": 0.8333,
"step": 1265
},
{
"epoch": 3.263395739186572,
"grad_norm": 0.7572186415123207,
"learning_rate": 6.443570460558048e-06,
"loss": 0.8085,
"step": 1266
},
{
"epoch": 3.2659780503550677,
"grad_norm": 0.7558961737811011,
"learning_rate": 6.426711232336613e-06,
"loss": 0.8068,
"step": 1267
},
{
"epoch": 3.2685603615235634,
"grad_norm": 0.7855400126793302,
"learning_rate": 6.409863639217306e-06,
"loss": 0.8147,
"step": 1268
},
{
"epoch": 3.2711426726920596,
"grad_norm": 0.7790255090638041,
"learning_rate": 6.393027736058117e-06,
"loss": 0.8256,
"step": 1269
},
{
"epoch": 3.2737249838605553,
"grad_norm": 0.7607044109115157,
"learning_rate": 6.376203577678981e-06,
"loss": 0.7971,
"step": 1270
},
{
"epoch": 3.276307295029051,
"grad_norm": 0.7553997691720208,
"learning_rate": 6.3593912188615966e-06,
"loss": 0.842,
"step": 1271
},
{
"epoch": 3.2788896061975468,
"grad_norm": 0.7889787626268039,
"learning_rate": 6.3425907143492216e-06,
"loss": 0.8183,
"step": 1272
},
{
"epoch": 3.2814719173660425,
"grad_norm": 0.7485790330649242,
"learning_rate": 6.325802118846533e-06,
"loss": 0.8185,
"step": 1273
},
{
"epoch": 3.284054228534538,
"grad_norm": 0.7536890088538672,
"learning_rate": 6.309025487019425e-06,
"loss": 0.8266,
"step": 1274
},
{
"epoch": 3.2866365397030344,
"grad_norm": 0.7550810533925633,
"learning_rate": 6.2922608734948355e-06,
"loss": 0.8079,
"step": 1275
},
{
"epoch": 3.28921885087153,
"grad_norm": 0.7471626158383303,
"learning_rate": 6.275508332860567e-06,
"loss": 0.8205,
"step": 1276
},
{
"epoch": 3.291801162040026,
"grad_norm": 0.7300314640072086,
"learning_rate": 6.258767919665113e-06,
"loss": 0.8021,
"step": 1277
},
{
"epoch": 3.2943834732085215,
"grad_norm": 0.7640658148164554,
"learning_rate": 6.242039688417483e-06,
"loss": 0.8132,
"step": 1278
},
{
"epoch": 3.2969657843770173,
"grad_norm": 0.7547528610145464,
"learning_rate": 6.225323693587014e-06,
"loss": 0.8287,
"step": 1279
},
{
"epoch": 3.2995480955455134,
"grad_norm": 0.7972400105837699,
"learning_rate": 6.208619989603205e-06,
"loss": 0.8315,
"step": 1280
},
{
"epoch": 3.302130406714009,
"grad_norm": 0.758612295379575,
"learning_rate": 6.191928630855527e-06,
"loss": 0.802,
"step": 1281
},
{
"epoch": 3.304712717882505,
"grad_norm": 0.7601220070386198,
"learning_rate": 6.1752496716932576e-06,
"loss": 0.834,
"step": 1282
},
{
"epoch": 3.3072950290510006,
"grad_norm": 0.7684262734009513,
"learning_rate": 6.158583166425304e-06,
"loss": 0.8481,
"step": 1283
},
{
"epoch": 3.3098773402194963,
"grad_norm": 0.7880034761965038,
"learning_rate": 6.141929169320018e-06,
"loss": 0.815,
"step": 1284
},
{
"epoch": 3.312459651387992,
"grad_norm": 0.7560178543170282,
"learning_rate": 6.125287734605018e-06,
"loss": 0.8129,
"step": 1285
},
{
"epoch": 3.315041962556488,
"grad_norm": 0.7706041266881096,
"learning_rate": 6.108658916467025e-06,
"loss": 0.8016,
"step": 1286
},
{
"epoch": 3.317624273724984,
"grad_norm": 0.7569765139274263,
"learning_rate": 6.092042769051674e-06,
"loss": 0.8273,
"step": 1287
},
{
"epoch": 3.3202065848934796,
"grad_norm": 0.7623292555288878,
"learning_rate": 6.075439346463349e-06,
"loss": 0.7931,
"step": 1288
},
{
"epoch": 3.3227888960619754,
"grad_norm": 0.7427549761512925,
"learning_rate": 6.0588487027649954e-06,
"loss": 0.7812,
"step": 1289
},
{
"epoch": 3.325371207230471,
"grad_norm": 0.7772449050888204,
"learning_rate": 6.042270891977946e-06,
"loss": 0.8305,
"step": 1290
},
{
"epoch": 3.3279535183989672,
"grad_norm": 0.7871222544756025,
"learning_rate": 6.025705968081753e-06,
"loss": 0.8387,
"step": 1291
},
{
"epoch": 3.330535829567463,
"grad_norm": 0.7583353530346796,
"learning_rate": 6.009153985014003e-06,
"loss": 0.8466,
"step": 1292
},
{
"epoch": 3.3331181407359587,
"grad_norm": 0.772127846582864,
"learning_rate": 5.992614996670156e-06,
"loss": 0.8155,
"step": 1293
},
{
"epoch": 3.3357004519044544,
"grad_norm": 0.7487201668100457,
"learning_rate": 5.976089056903342e-06,
"loss": 0.7953,
"step": 1294
},
{
"epoch": 3.33828276307295,
"grad_norm": 0.7614204718665639,
"learning_rate": 5.959576219524217e-06,
"loss": 0.8131,
"step": 1295
},
{
"epoch": 3.340865074241446,
"grad_norm": 0.7763835258668194,
"learning_rate": 5.94307653830077e-06,
"loss": 0.8198,
"step": 1296
},
{
"epoch": 3.343447385409942,
"grad_norm": 0.7753456222642561,
"learning_rate": 5.926590066958149e-06,
"loss": 0.8356,
"step": 1297
},
{
"epoch": 3.3460296965784377,
"grad_norm": 0.7516557804123375,
"learning_rate": 5.910116859178494e-06,
"loss": 0.7854,
"step": 1298
},
{
"epoch": 3.3486120077469335,
"grad_norm": 0.7503527754212284,
"learning_rate": 5.89365696860075e-06,
"loss": 0.8383,
"step": 1299
},
{
"epoch": 3.351194318915429,
"grad_norm": 0.8230846268240456,
"learning_rate": 5.877210448820508e-06,
"loss": 0.8282,
"step": 1300
},
{
"epoch": 3.3537766300839253,
"grad_norm": 0.7864548933883284,
"learning_rate": 5.860777353389816e-06,
"loss": 0.8201,
"step": 1301
},
{
"epoch": 3.356358941252421,
"grad_norm": 0.796951326601112,
"learning_rate": 5.844357735817012e-06,
"loss": 0.8124,
"step": 1302
},
{
"epoch": 3.358941252420917,
"grad_norm": 0.7449791284424515,
"learning_rate": 5.82795164956655e-06,
"loss": 0.8449,
"step": 1303
},
{
"epoch": 3.3615235635894125,
"grad_norm": 0.7629551074846378,
"learning_rate": 5.811559148058817e-06,
"loss": 0.787,
"step": 1304
},
{
"epoch": 3.3641058747579082,
"grad_norm": 0.7440812424379075,
"learning_rate": 5.795180284669981e-06,
"loss": 0.8282,
"step": 1305
},
{
"epoch": 3.366688185926404,
"grad_norm": 0.7744183120279426,
"learning_rate": 5.7788151127317825e-06,
"loss": 0.8258,
"step": 1306
},
{
"epoch": 3.3692704970949,
"grad_norm": 0.7418245554432372,
"learning_rate": 5.762463685531403e-06,
"loss": 0.8284,
"step": 1307
},
{
"epoch": 3.371852808263396,
"grad_norm": 0.7830933256063822,
"learning_rate": 5.746126056311248e-06,
"loss": 0.8452,
"step": 1308
},
{
"epoch": 3.3744351194318916,
"grad_norm": 0.7909760674112923,
"learning_rate": 5.729802278268813e-06,
"loss": 0.8168,
"step": 1309
},
{
"epoch": 3.3770174306003873,
"grad_norm": 0.8007258278890194,
"learning_rate": 5.713492404556477e-06,
"loss": 0.8027,
"step": 1310
},
{
"epoch": 3.379599741768883,
"grad_norm": 0.7758822514790055,
"learning_rate": 5.697196488281357e-06,
"loss": 0.8266,
"step": 1311
},
{
"epoch": 3.382182052937379,
"grad_norm": 0.765799650737426,
"learning_rate": 5.680914582505123e-06,
"loss": 0.8057,
"step": 1312
},
{
"epoch": 3.384764364105875,
"grad_norm": 0.7655979019681851,
"learning_rate": 5.6646467402438045e-06,
"loss": 0.8157,
"step": 1313
},
{
"epoch": 3.3873466752743706,
"grad_norm": 0.7766423523842311,
"learning_rate": 5.6483930144676616e-06,
"loss": 0.8162,
"step": 1314
},
{
"epoch": 3.3899289864428663,
"grad_norm": 0.7589364799160417,
"learning_rate": 5.632153458100985e-06,
"loss": 0.8321,
"step": 1315
},
{
"epoch": 3.392511297611362,
"grad_norm": 0.7788060942298414,
"learning_rate": 5.615928124021921e-06,
"loss": 0.837,
"step": 1316
},
{
"epoch": 3.3950936087798578,
"grad_norm": 0.7694554084656864,
"learning_rate": 5.599717065062302e-06,
"loss": 0.8438,
"step": 1317
},
{
"epoch": 3.397675919948354,
"grad_norm": 0.7631810781785031,
"learning_rate": 5.583520334007494e-06,
"loss": 0.8205,
"step": 1318
},
{
"epoch": 3.4002582311168497,
"grad_norm": 0.7854745994817811,
"learning_rate": 5.567337983596201e-06,
"loss": 0.8208,
"step": 1319
},
{
"epoch": 3.4028405422853454,
"grad_norm": 0.7690571440773396,
"learning_rate": 5.551170066520299e-06,
"loss": 0.814,
"step": 1320
},
{
"epoch": 3.405422853453841,
"grad_norm": 0.7681900860146816,
"learning_rate": 5.535016635424675e-06,
"loss": 0.822,
"step": 1321
},
{
"epoch": 3.408005164622337,
"grad_norm": 0.7811781678205161,
"learning_rate": 5.51887774290704e-06,
"loss": 0.818,
"step": 1322
},
{
"epoch": 3.410587475790833,
"grad_norm": 0.7664479268038544,
"learning_rate": 5.502753441517763e-06,
"loss": 0.8331,
"step": 1323
},
{
"epoch": 3.4131697869593287,
"grad_norm": 0.7775122726368401,
"learning_rate": 5.486643783759713e-06,
"loss": 0.8163,
"step": 1324
},
{
"epoch": 3.4157520981278244,
"grad_norm": 0.7851782250823803,
"learning_rate": 5.470548822088075e-06,
"loss": 0.833,
"step": 1325
},
{
"epoch": 3.41833440929632,
"grad_norm": 0.7722198216385613,
"learning_rate": 5.454468608910177e-06,
"loss": 0.8216,
"step": 1326
},
{
"epoch": 3.420916720464816,
"grad_norm": 0.7650125939985358,
"learning_rate": 5.43840319658532e-06,
"loss": 0.8195,
"step": 1327
},
{
"epoch": 3.4234990316333116,
"grad_norm": 0.7889877069137401,
"learning_rate": 5.422352637424623e-06,
"loss": 0.8356,
"step": 1328
},
{
"epoch": 3.4260813428018078,
"grad_norm": 0.759046090525073,
"learning_rate": 5.4063169836908355e-06,
"loss": 0.8281,
"step": 1329
},
{
"epoch": 3.4286636539703035,
"grad_norm": 0.7614819002487212,
"learning_rate": 5.390296287598173e-06,
"loss": 0.8176,
"step": 1330
},
{
"epoch": 3.431245965138799,
"grad_norm": 0.7733681640312509,
"learning_rate": 5.374290601312139e-06,
"loss": 0.8347,
"step": 1331
},
{
"epoch": 3.433828276307295,
"grad_norm": 0.7679752600393633,
"learning_rate": 5.3582999769493816e-06,
"loss": 0.8129,
"step": 1332
},
{
"epoch": 3.4364105874757906,
"grad_norm": 0.7666398230614995,
"learning_rate": 5.342324466577484e-06,
"loss": 0.8041,
"step": 1333
},
{
"epoch": 3.438992898644287,
"grad_norm": 0.757735562684295,
"learning_rate": 5.326364122214833e-06,
"loss": 0.832,
"step": 1334
},
{
"epoch": 3.4415752098127825,
"grad_norm": 0.7658776895218172,
"learning_rate": 5.310418995830429e-06,
"loss": 0.8127,
"step": 1335
},
{
"epoch": 3.4441575209812783,
"grad_norm": 0.7501769033527278,
"learning_rate": 5.2944891393437145e-06,
"loss": 0.8069,
"step": 1336
},
{
"epoch": 3.446739832149774,
"grad_norm": 0.7720443515660191,
"learning_rate": 5.278574604624411e-06,
"loss": 0.8031,
"step": 1337
},
{
"epoch": 3.4493221433182697,
"grad_norm": 0.7746952692548283,
"learning_rate": 5.262675443492359e-06,
"loss": 0.8212,
"step": 1338
},
{
"epoch": 3.4519044544867654,
"grad_norm": 0.7721583267540482,
"learning_rate": 5.246791707717343e-06,
"loss": 0.806,
"step": 1339
},
{
"epoch": 3.4544867656552616,
"grad_norm": 0.7656034684267539,
"learning_rate": 5.230923449018896e-06,
"loss": 0.815,
"step": 1340
},
{
"epoch": 3.4570690768237573,
"grad_norm": 0.7847317646384887,
"learning_rate": 5.215070719066182e-06,
"loss": 0.8406,
"step": 1341
},
{
"epoch": 3.459651387992253,
"grad_norm": 0.76108699723527,
"learning_rate": 5.199233569477796e-06,
"loss": 0.8535,
"step": 1342
},
{
"epoch": 3.4622336991607487,
"grad_norm": 0.7642688851829174,
"learning_rate": 5.183412051821591e-06,
"loss": 0.8082,
"step": 1343
},
{
"epoch": 3.464816010329245,
"grad_norm": 0.7767506589914347,
"learning_rate": 5.167606217614531e-06,
"loss": 0.8175,
"step": 1344
},
{
"epoch": 3.4673983214977406,
"grad_norm": 0.7584520457174025,
"learning_rate": 5.151816118322503e-06,
"loss": 0.8027,
"step": 1345
},
{
"epoch": 3.4699806326662364,
"grad_norm": 0.768873105060371,
"learning_rate": 5.136041805360172e-06,
"loss": 0.8109,
"step": 1346
},
{
"epoch": 3.472562943834732,
"grad_norm": 0.7839594886363217,
"learning_rate": 5.120283330090787e-06,
"loss": 0.8148,
"step": 1347
},
{
"epoch": 3.475145255003228,
"grad_norm": 0.773747886290827,
"learning_rate": 5.104540743826038e-06,
"loss": 0.8112,
"step": 1348
},
{
"epoch": 3.4777275661717235,
"grad_norm": 0.7739729084947021,
"learning_rate": 5.088814097825871e-06,
"loss": 0.809,
"step": 1349
},
{
"epoch": 3.4803098773402197,
"grad_norm": 0.7918183544657171,
"learning_rate": 5.073103443298326e-06,
"loss": 0.8455,
"step": 1350
},
{
"epoch": 3.4828921885087154,
"grad_norm": 0.7878706269294227,
"learning_rate": 5.057408831399385e-06,
"loss": 0.8308,
"step": 1351
},
{
"epoch": 3.485474499677211,
"grad_norm": 0.7823661607708897,
"learning_rate": 5.041730313232786e-06,
"loss": 0.8393,
"step": 1352
},
{
"epoch": 3.488056810845707,
"grad_norm": 0.7543319732416526,
"learning_rate": 5.026067939849864e-06,
"loss": 0.8318,
"step": 1353
},
{
"epoch": 3.4906391220142026,
"grad_norm": 0.769779193831718,
"learning_rate": 5.0104217622493736e-06,
"loss": 0.833,
"step": 1354
},
{
"epoch": 3.4932214331826987,
"grad_norm": 0.7716520294743638,
"learning_rate": 4.994791831377354e-06,
"loss": 0.8222,
"step": 1355
},
{
"epoch": 3.4958037443511945,
"grad_norm": 0.7736579686573494,
"learning_rate": 4.9791781981269326e-06,
"loss": 0.7974,
"step": 1356
},
{
"epoch": 3.49838605551969,
"grad_norm": 0.7302768684285259,
"learning_rate": 4.9635809133381685e-06,
"loss": 0.8207,
"step": 1357
},
{
"epoch": 3.500968366688186,
"grad_norm": 0.7621702814787035,
"learning_rate": 4.948000027797885e-06,
"loss": 0.8077,
"step": 1358
},
{
"epoch": 3.5035506778566816,
"grad_norm": 0.7638463102404097,
"learning_rate": 4.93243559223952e-06,
"loss": 0.7849,
"step": 1359
},
{
"epoch": 3.5061329890251773,
"grad_norm": 0.7525066168052732,
"learning_rate": 4.916887657342931e-06,
"loss": 0.8103,
"step": 1360
},
{
"epoch": 3.5087153001936735,
"grad_norm": 0.7465812252567701,
"learning_rate": 4.901356273734261e-06,
"loss": 0.8251,
"step": 1361
},
{
"epoch": 3.5112976113621692,
"grad_norm": 0.789989621898556,
"learning_rate": 4.885841491985758e-06,
"loss": 0.8156,
"step": 1362
},
{
"epoch": 3.513879922530665,
"grad_norm": 0.7567373410192682,
"learning_rate": 4.870343362615605e-06,
"loss": 0.8241,
"step": 1363
},
{
"epoch": 3.5164622336991607,
"grad_norm": 0.7748589228781302,
"learning_rate": 4.8548619360877635e-06,
"loss": 0.8061,
"step": 1364
},
{
"epoch": 3.5190445448676564,
"grad_norm": 0.7738666119944785,
"learning_rate": 4.839397262811814e-06,
"loss": 0.8101,
"step": 1365
},
{
"epoch": 3.5216268560361526,
"grad_norm": 0.7614113255993917,
"learning_rate": 4.823949393142791e-06,
"loss": 0.8237,
"step": 1366
},
{
"epoch": 3.5242091672046483,
"grad_norm": 0.776611038166,
"learning_rate": 4.808518377380999e-06,
"loss": 0.8334,
"step": 1367
},
{
"epoch": 3.526791478373144,
"grad_norm": 0.7600268059134173,
"learning_rate": 4.7931042657718685e-06,
"loss": 0.8221,
"step": 1368
},
{
"epoch": 3.5293737895416397,
"grad_norm": 0.7659298097233458,
"learning_rate": 4.777707108505801e-06,
"loss": 0.8374,
"step": 1369
},
{
"epoch": 3.5319561007101354,
"grad_norm": 0.740099038297969,
"learning_rate": 4.762326955717972e-06,
"loss": 0.8138,
"step": 1370
},
{
"epoch": 3.534538411878631,
"grad_norm": 0.7714772416888985,
"learning_rate": 4.746963857488208e-06,
"loss": 0.8288,
"step": 1371
},
{
"epoch": 3.5371207230471273,
"grad_norm": 0.7725845357201855,
"learning_rate": 4.7316178638407885e-06,
"loss": 0.822,
"step": 1372
},
{
"epoch": 3.539703034215623,
"grad_norm": 0.7621909389829162,
"learning_rate": 4.716289024744308e-06,
"loss": 0.8231,
"step": 1373
},
{
"epoch": 3.5422853453841188,
"grad_norm": 0.7607917831814722,
"learning_rate": 4.700977390111495e-06,
"loss": 0.8446,
"step": 1374
},
{
"epoch": 3.5448676565526145,
"grad_norm": 0.7656461760658241,
"learning_rate": 4.685683009799065e-06,
"loss": 0.8214,
"step": 1375
},
{
"epoch": 3.5474499677211107,
"grad_norm": 0.7752118246819907,
"learning_rate": 4.670405933607554e-06,
"loss": 0.8249,
"step": 1376
},
{
"epoch": 3.5500322788896064,
"grad_norm": 0.7402699684294802,
"learning_rate": 4.6551462112811384e-06,
"loss": 0.8409,
"step": 1377
},
{
"epoch": 3.552614590058102,
"grad_norm": 0.7778533292706469,
"learning_rate": 4.639903892507501e-06,
"loss": 0.7924,
"step": 1378
},
{
"epoch": 3.555196901226598,
"grad_norm": 0.7705963566454496,
"learning_rate": 4.624679026917658e-06,
"loss": 0.8203,
"step": 1379
},
{
"epoch": 3.5577792123950935,
"grad_norm": 0.7365633344886319,
"learning_rate": 4.609471664085787e-06,
"loss": 0.8123,
"step": 1380
},
{
"epoch": 3.5603615235635893,
"grad_norm": 0.7586569356393417,
"learning_rate": 4.594281853529076e-06,
"loss": 0.8299,
"step": 1381
},
{
"epoch": 3.562943834732085,
"grad_norm": 0.7652449035633111,
"learning_rate": 4.5791096447075645e-06,
"loss": 0.8141,
"step": 1382
},
{
"epoch": 3.565526145900581,
"grad_norm": 0.7582437627266295,
"learning_rate": 4.563955087023981e-06,
"loss": 0.805,
"step": 1383
},
{
"epoch": 3.568108457069077,
"grad_norm": 0.7826618805912787,
"learning_rate": 4.548818229823568e-06,
"loss": 0.8293,
"step": 1384
},
{
"epoch": 3.5706907682375726,
"grad_norm": 0.762954307030826,
"learning_rate": 4.5336991223939486e-06,
"loss": 0.8456,
"step": 1385
},
{
"epoch": 3.5732730794060683,
"grad_norm": 0.7623373985847602,
"learning_rate": 4.5185978139649355e-06,
"loss": 0.8192,
"step": 1386
},
{
"epoch": 3.5758553905745645,
"grad_norm": 0.7765489599431679,
"learning_rate": 4.503514353708389e-06,
"loss": 0.815,
"step": 1387
},
{
"epoch": 3.57843770174306,
"grad_norm": 0.7584186284661693,
"learning_rate": 4.488448790738059e-06,
"loss": 0.8301,
"step": 1388
},
{
"epoch": 3.581020012911556,
"grad_norm": 0.7763483912193899,
"learning_rate": 4.473401174109423e-06,
"loss": 0.8518,
"step": 1389
},
{
"epoch": 3.5836023240800516,
"grad_norm": 0.796977617965849,
"learning_rate": 4.45837155281951e-06,
"loss": 0.8258,
"step": 1390
},
{
"epoch": 3.5861846352485474,
"grad_norm": 0.7770662519335874,
"learning_rate": 4.443359975806757e-06,
"loss": 0.8068,
"step": 1391
},
{
"epoch": 3.588766946417043,
"grad_norm": 0.7790952514127689,
"learning_rate": 4.428366491950854e-06,
"loss": 0.8296,
"step": 1392
},
{
"epoch": 3.591349257585539,
"grad_norm": 0.7764896929937788,
"learning_rate": 4.413391150072577e-06,
"loss": 0.8007,
"step": 1393
},
{
"epoch": 3.593931568754035,
"grad_norm": 0.7607538258474698,
"learning_rate": 4.39843399893362e-06,
"loss": 0.8025,
"step": 1394
},
{
"epoch": 3.5965138799225307,
"grad_norm": 0.7492819656752748,
"learning_rate": 4.383495087236448e-06,
"loss": 0.8157,
"step": 1395
},
{
"epoch": 3.5990961910910264,
"grad_norm": 0.7596354310000931,
"learning_rate": 4.368574463624146e-06,
"loss": 0.8272,
"step": 1396
},
{
"epoch": 3.601678502259522,
"grad_norm": 0.7407159353225491,
"learning_rate": 4.353672176680236e-06,
"loss": 0.8123,
"step": 1397
},
{
"epoch": 3.6042608134280183,
"grad_norm": 0.7672926201622885,
"learning_rate": 4.338788274928544e-06,
"loss": 0.8086,
"step": 1398
},
{
"epoch": 3.606843124596514,
"grad_norm": 0.7636171894829668,
"learning_rate": 4.323922806833031e-06,
"loss": 0.8067,
"step": 1399
},
{
"epoch": 3.6094254357650097,
"grad_norm": 0.7677055145204084,
"learning_rate": 4.3090758207976305e-06,
"loss": 0.7908,
"step": 1400
},
{
"epoch": 3.6120077469335055,
"grad_norm": 0.748818246629621,
"learning_rate": 4.294247365166093e-06,
"loss": 0.8312,
"step": 1401
},
{
"epoch": 3.614590058102001,
"grad_norm": 0.7660560945307108,
"learning_rate": 4.279437488221843e-06,
"loss": 0.8022,
"step": 1402
},
{
"epoch": 3.617172369270497,
"grad_norm": 0.7838437600172277,
"learning_rate": 4.2646462381878076e-06,
"loss": 0.8377,
"step": 1403
},
{
"epoch": 3.6197546804389926,
"grad_norm": 0.8010185687327693,
"learning_rate": 4.249873663226245e-06,
"loss": 0.7993,
"step": 1404
},
{
"epoch": 3.622336991607489,
"grad_norm": 0.7553739164896088,
"learning_rate": 4.235119811438627e-06,
"loss": 0.8261,
"step": 1405
},
{
"epoch": 3.6249193027759845,
"grad_norm": 0.7716962358330001,
"learning_rate": 4.220384730865456e-06,
"loss": 0.8405,
"step": 1406
},
{
"epoch": 3.6275016139444802,
"grad_norm": 0.7536919382865905,
"learning_rate": 4.205668469486098e-06,
"loss": 0.8108,
"step": 1407
},
{
"epoch": 3.630083925112976,
"grad_norm": 0.7420774971418518,
"learning_rate": 4.190971075218662e-06,
"loss": 0.8065,
"step": 1408
},
{
"epoch": 3.632666236281472,
"grad_norm": 0.7632144438839245,
"learning_rate": 4.176292595919803e-06,
"loss": 0.7927,
"step": 1409
},
{
"epoch": 3.635248547449968,
"grad_norm": 0.7571064594393233,
"learning_rate": 4.1616330793846075e-06,
"loss": 0.8362,
"step": 1410
},
{
"epoch": 3.6378308586184636,
"grad_norm": 0.7622517676642483,
"learning_rate": 4.146992573346394e-06,
"loss": 0.8257,
"step": 1411
},
{
"epoch": 3.6404131697869593,
"grad_norm": 0.7835717386734106,
"learning_rate": 4.1323711254766015e-06,
"loss": 0.8223,
"step": 1412
},
{
"epoch": 3.642995480955455,
"grad_norm": 0.7478789535763465,
"learning_rate": 4.117768783384599e-06,
"loss": 0.7949,
"step": 1413
},
{
"epoch": 3.6455777921239507,
"grad_norm": 0.7767414531451252,
"learning_rate": 4.1031855946175455e-06,
"loss": 0.7961,
"step": 1414
},
{
"epoch": 3.648160103292447,
"grad_norm": 0.7840863973404958,
"learning_rate": 4.088621606660243e-06,
"loss": 0.7999,
"step": 1415
},
{
"epoch": 3.6507424144609426,
"grad_norm": 0.7547796898810605,
"learning_rate": 4.074076866934967e-06,
"loss": 0.818,
"step": 1416
},
{
"epoch": 3.6533247256294383,
"grad_norm": 0.7714902567445823,
"learning_rate": 4.05955142280132e-06,
"loss": 0.8105,
"step": 1417
},
{
"epoch": 3.655907036797934,
"grad_norm": 0.7532163616590967,
"learning_rate": 4.0450453215560684e-06,
"loss": 0.8049,
"step": 1418
},
{
"epoch": 3.65848934796643,
"grad_norm": 0.7773578119690996,
"learning_rate": 4.030558610433005e-06,
"loss": 0.7914,
"step": 1419
},
{
"epoch": 3.661071659134926,
"grad_norm": 0.763304292538588,
"learning_rate": 4.016091336602789e-06,
"loss": 0.8275,
"step": 1420
},
{
"epoch": 3.6636539703034217,
"grad_norm": 0.7804812337616664,
"learning_rate": 4.001643547172776e-06,
"loss": 0.8377,
"step": 1421
},
{
"epoch": 3.6662362814719174,
"grad_norm": 0.767412403491653,
"learning_rate": 3.987215289186881e-06,
"loss": 0.8256,
"step": 1422
},
{
"epoch": 3.668818592640413,
"grad_norm": 0.7617867902372989,
"learning_rate": 3.972806609625434e-06,
"loss": 0.8106,
"step": 1423
},
{
"epoch": 3.671400903808909,
"grad_norm": 0.7540382466723832,
"learning_rate": 3.958417555404999e-06,
"loss": 0.8074,
"step": 1424
},
{
"epoch": 3.6739832149774045,
"grad_norm": 0.7748311551652659,
"learning_rate": 3.9440481733782485e-06,
"loss": 0.8125,
"step": 1425
},
{
"epoch": 3.6765655261459007,
"grad_norm": 0.7623341018369493,
"learning_rate": 3.929698510333799e-06,
"loss": 0.8337,
"step": 1426
},
{
"epoch": 3.6791478373143964,
"grad_norm": 0.767758139184047,
"learning_rate": 3.915368612996055e-06,
"loss": 0.8341,
"step": 1427
},
{
"epoch": 3.681730148482892,
"grad_norm": 0.7519042843627542,
"learning_rate": 3.901058528025055e-06,
"loss": 0.8061,
"step": 1428
},
{
"epoch": 3.684312459651388,
"grad_norm": 0.7625093078444409,
"learning_rate": 3.8867683020163446e-06,
"loss": 0.822,
"step": 1429
},
{
"epoch": 3.686894770819884,
"grad_norm": 0.80028510095772,
"learning_rate": 3.872497981500787e-06,
"loss": 0.8502,
"step": 1430
},
{
"epoch": 3.6894770819883798,
"grad_norm": 0.7652238383245407,
"learning_rate": 3.8582476129444435e-06,
"loss": 0.8163,
"step": 1431
},
{
"epoch": 3.6920593931568755,
"grad_norm": 0.7842966117293941,
"learning_rate": 3.844017242748398e-06,
"loss": 0.7996,
"step": 1432
},
{
"epoch": 3.694641704325371,
"grad_norm": 0.7495726108816106,
"learning_rate": 3.829806917248631e-06,
"loss": 0.8061,
"step": 1433
},
{
"epoch": 3.697224015493867,
"grad_norm": 0.7579352515486196,
"learning_rate": 3.815616682715839e-06,
"loss": 0.7876,
"step": 1434
},
{
"epoch": 3.6998063266623626,
"grad_norm": 0.7788849623252266,
"learning_rate": 3.801446585355315e-06,
"loss": 0.8334,
"step": 1435
},
{
"epoch": 3.7023886378308584,
"grad_norm": 0.7445580252143607,
"learning_rate": 3.7872966713067683e-06,
"loss": 0.8182,
"step": 1436
},
{
"epoch": 3.7049709489993545,
"grad_norm": 0.7772631069572105,
"learning_rate": 3.773166986644202e-06,
"loss": 0.8149,
"step": 1437
},
{
"epoch": 3.7075532601678503,
"grad_norm": 0.7542443690987021,
"learning_rate": 3.7590575773757378e-06,
"loss": 0.8085,
"step": 1438
},
{
"epoch": 3.710135571336346,
"grad_norm": 0.7490006884397156,
"learning_rate": 3.744968489443488e-06,
"loss": 0.8364,
"step": 1439
},
{
"epoch": 3.7127178825048417,
"grad_norm": 0.7588964697785906,
"learning_rate": 3.7308997687233896e-06,
"loss": 0.8109,
"step": 1440
},
{
"epoch": 3.715300193673338,
"grad_norm": 0.7401504987559857,
"learning_rate": 3.7168514610250594e-06,
"loss": 0.8026,
"step": 1441
},
{
"epoch": 3.7178825048418336,
"grad_norm": 0.7554850061896863,
"learning_rate": 3.7028236120916537e-06,
"loss": 0.8315,
"step": 1442
},
{
"epoch": 3.7204648160103293,
"grad_norm": 0.7804604627439944,
"learning_rate": 3.688816267599713e-06,
"loss": 0.8317,
"step": 1443
},
{
"epoch": 3.723047127178825,
"grad_norm": 0.7748081363692426,
"learning_rate": 3.6748294731590038e-06,
"loss": 0.811,
"step": 1444
},
{
"epoch": 3.7256294383473207,
"grad_norm": 0.7657357348049666,
"learning_rate": 3.6608632743123827e-06,
"loss": 0.8244,
"step": 1445
},
{
"epoch": 3.7282117495158165,
"grad_norm": 0.775640736208981,
"learning_rate": 3.6469177165356493e-06,
"loss": 0.835,
"step": 1446
},
{
"epoch": 3.730794060684312,
"grad_norm": 0.7864122051723232,
"learning_rate": 3.6329928452373843e-06,
"loss": 0.8354,
"step": 1447
},
{
"epoch": 3.7333763718528084,
"grad_norm": 0.7862243619147634,
"learning_rate": 3.6190887057588185e-06,
"loss": 0.8311,
"step": 1448
},
{
"epoch": 3.735958683021304,
"grad_norm": 0.7454712765612813,
"learning_rate": 3.6052053433736777e-06,
"loss": 0.8061,
"step": 1449
},
{
"epoch": 3.7385409941898,
"grad_norm": 0.7477377098403756,
"learning_rate": 3.591342803288027e-06,
"loss": 0.7974,
"step": 1450
},
{
"epoch": 3.7411233053582955,
"grad_norm": 0.7414265044721174,
"learning_rate": 3.5775011306401317e-06,
"loss": 0.8101,
"step": 1451
},
{
"epoch": 3.7437056165267917,
"grad_norm": 0.7682965899056072,
"learning_rate": 3.5636803705003174e-06,
"loss": 0.8396,
"step": 1452
},
{
"epoch": 3.7462879276952874,
"grad_norm": 0.7638628497129718,
"learning_rate": 3.5498805678708172e-06,
"loss": 0.8086,
"step": 1453
},
{
"epoch": 3.748870238863783,
"grad_norm": 0.7987728579111757,
"learning_rate": 3.5361017676856114e-06,
"loss": 0.8301,
"step": 1454
},
{
"epoch": 3.751452550032279,
"grad_norm": 0.7642546226790663,
"learning_rate": 3.5223440148103017e-06,
"loss": 0.8127,
"step": 1455
},
{
"epoch": 3.7540348612007746,
"grad_norm": 0.7590358311679077,
"learning_rate": 3.5086073540419594e-06,
"loss": 0.8299,
"step": 1456
},
{
"epoch": 3.7566171723692703,
"grad_norm": 0.7737724761627253,
"learning_rate": 3.4948918301089687e-06,
"loss": 0.7995,
"step": 1457
},
{
"epoch": 3.7591994835377665,
"grad_norm": 0.7893128648030869,
"learning_rate": 3.481197487670901e-06,
"loss": 0.8304,
"step": 1458
},
{
"epoch": 3.761781794706262,
"grad_norm": 0.7672293433531253,
"learning_rate": 3.4675243713183436e-06,
"loss": 0.8271,
"step": 1459
},
{
"epoch": 3.764364105874758,
"grad_norm": 0.781703741527432,
"learning_rate": 3.4538725255727855e-06,
"loss": 0.8248,
"step": 1460
},
{
"epoch": 3.7669464170432536,
"grad_norm": 0.7580531483398701,
"learning_rate": 3.4402419948864384e-06,
"loss": 0.7916,
"step": 1461
},
{
"epoch": 3.76952872821175,
"grad_norm": 0.7486908586065847,
"learning_rate": 3.426632823642123e-06,
"loss": 0.8137,
"step": 1462
},
{
"epoch": 3.7721110393802455,
"grad_norm": 0.7644509690799265,
"learning_rate": 3.4130450561531102e-06,
"loss": 0.8355,
"step": 1463
},
{
"epoch": 3.774693350548741,
"grad_norm": 0.7534559123863085,
"learning_rate": 3.3994787366629623e-06,
"loss": 0.8255,
"step": 1464
},
{
"epoch": 3.777275661717237,
"grad_norm": 0.7614512432818442,
"learning_rate": 3.385933909345419e-06,
"loss": 0.8115,
"step": 1465
},
{
"epoch": 3.7798579728857327,
"grad_norm": 0.786112853584683,
"learning_rate": 3.372410618304238e-06,
"loss": 0.8559,
"step": 1466
},
{
"epoch": 3.7824402840542284,
"grad_norm": 0.7573955890285036,
"learning_rate": 3.3589089075730474e-06,
"loss": 0.8079,
"step": 1467
},
{
"epoch": 3.785022595222724,
"grad_norm": 0.7339247578928705,
"learning_rate": 3.345428821115202e-06,
"loss": 0.8239,
"step": 1468
},
{
"epoch": 3.7876049063912203,
"grad_norm": 0.8027337173972606,
"learning_rate": 3.3319704028236553e-06,
"loss": 0.8258,
"step": 1469
},
{
"epoch": 3.790187217559716,
"grad_norm": 0.7602204125813788,
"learning_rate": 3.3185336965208057e-06,
"loss": 0.8267,
"step": 1470
},
{
"epoch": 3.7927695287282117,
"grad_norm": 0.7661870955667278,
"learning_rate": 3.3051187459583454e-06,
"loss": 0.8059,
"step": 1471
},
{
"epoch": 3.7953518398967074,
"grad_norm": 0.7519474368509446,
"learning_rate": 3.2917255948171366e-06,
"loss": 0.8056,
"step": 1472
},
{
"epoch": 3.7979341510652036,
"grad_norm": 0.7657682155523916,
"learning_rate": 3.2783542867070538e-06,
"loss": 0.8293,
"step": 1473
},
{
"epoch": 3.8005164622336993,
"grad_norm": 0.76866731372553,
"learning_rate": 3.2650048651668463e-06,
"loss": 0.847,
"step": 1474
},
{
"epoch": 3.803098773402195,
"grad_norm": 0.7770729452000317,
"learning_rate": 3.251677373664004e-06,
"loss": 0.8026,
"step": 1475
},
{
"epoch": 3.8056810845706908,
"grad_norm": 0.7444822909996885,
"learning_rate": 3.2383718555946098e-06,
"loss": 0.8205,
"step": 1476
},
{
"epoch": 3.8082633957391865,
"grad_norm": 0.7522360723786109,
"learning_rate": 3.2250883542831933e-06,
"loss": 0.7975,
"step": 1477
},
{
"epoch": 3.810845706907682,
"grad_norm": 0.7476561413065432,
"learning_rate": 3.211826912982591e-06,
"loss": 0.8302,
"step": 1478
},
{
"epoch": 3.813428018076178,
"grad_norm": 0.7624889384036899,
"learning_rate": 3.1985875748738193e-06,
"loss": 0.8336,
"step": 1479
},
{
"epoch": 3.816010329244674,
"grad_norm": 0.7611831427412808,
"learning_rate": 3.1853703830659223e-06,
"loss": 0.8241,
"step": 1480
},
{
"epoch": 3.81859264041317,
"grad_norm": 0.7522692175193729,
"learning_rate": 3.1721753805958245e-06,
"loss": 0.8464,
"step": 1481
},
{
"epoch": 3.8211749515816655,
"grad_norm": 0.7971847308682318,
"learning_rate": 3.1590026104282024e-06,
"loss": 0.8315,
"step": 1482
},
{
"epoch": 3.8237572627501613,
"grad_norm": 0.7769310232315048,
"learning_rate": 3.145852115455348e-06,
"loss": 0.8264,
"step": 1483
},
{
"epoch": 3.8263395739186574,
"grad_norm": 0.7815145076904971,
"learning_rate": 3.132723938497011e-06,
"loss": 0.8103,
"step": 1484
},
{
"epoch": 3.828921885087153,
"grad_norm": 0.7394387659719708,
"learning_rate": 3.1196181223002842e-06,
"loss": 0.8057,
"step": 1485
},
{
"epoch": 3.831504196255649,
"grad_norm": 0.7552308531154759,
"learning_rate": 3.106534709539435e-06,
"loss": 0.8411,
"step": 1486
},
{
"epoch": 3.8340865074241446,
"grad_norm": 0.7493333935655804,
"learning_rate": 3.093473742815797e-06,
"loss": 0.8039,
"step": 1487
},
{
"epoch": 3.8366688185926403,
"grad_norm": 0.738912917115583,
"learning_rate": 3.0804352646576052e-06,
"loss": 0.8271,
"step": 1488
},
{
"epoch": 3.839251129761136,
"grad_norm": 0.7712798674043028,
"learning_rate": 3.067419317519875e-06,
"loss": 0.821,
"step": 1489
},
{
"epoch": 3.8418334409296317,
"grad_norm": 0.7673812637479288,
"learning_rate": 3.054425943784265e-06,
"loss": 0.8401,
"step": 1490
},
{
"epoch": 3.844415752098128,
"grad_norm": 0.7690243688879774,
"learning_rate": 3.041455185758908e-06,
"loss": 0.7975,
"step": 1491
},
{
"epoch": 3.8469980632666236,
"grad_norm": 0.7488704099566282,
"learning_rate": 3.0285070856783206e-06,
"loss": 0.793,
"step": 1492
},
{
"epoch": 3.8495803744351194,
"grad_norm": 0.7468016897185966,
"learning_rate": 3.015581685703237e-06,
"loss": 0.8109,
"step": 1493
},
{
"epoch": 3.852162685603615,
"grad_norm": 0.7504249930796498,
"learning_rate": 3.0026790279204664e-06,
"loss": 0.8314,
"step": 1494
},
{
"epoch": 3.8547449967721112,
"grad_norm": 0.7557407963240177,
"learning_rate": 2.9897991543427797e-06,
"loss": 0.8327,
"step": 1495
},
{
"epoch": 3.857327307940607,
"grad_norm": 0.7670324489918254,
"learning_rate": 2.976942106908749e-06,
"loss": 0.8292,
"step": 1496
},
{
"epoch": 3.8599096191091027,
"grad_norm": 0.7638293238485997,
"learning_rate": 2.9641079274826302e-06,
"loss": 0.8177,
"step": 1497
},
{
"epoch": 3.8624919302775984,
"grad_norm": 0.7712018435277216,
"learning_rate": 2.951296657854209e-06,
"loss": 0.8285,
"step": 1498
},
{
"epoch": 3.865074241446094,
"grad_norm": 0.7597575954985475,
"learning_rate": 2.938508339738683e-06,
"loss": 0.816,
"step": 1499
},
{
"epoch": 3.86765655261459,
"grad_norm": 0.7717927570202332,
"learning_rate": 2.9257430147765096e-06,
"loss": 0.8493,
"step": 1500
},
{
"epoch": 3.870238863783086,
"grad_norm": 0.7649795668645112,
"learning_rate": 2.913000724533277e-06,
"loss": 0.7985,
"step": 1501
},
{
"epoch": 3.8728211749515817,
"grad_norm": 0.7542531439383557,
"learning_rate": 2.900281510499575e-06,
"loss": 0.8093,
"step": 1502
},
{
"epoch": 3.8754034861200775,
"grad_norm": 0.7519132753628803,
"learning_rate": 2.8875854140908544e-06,
"loss": 0.8137,
"step": 1503
},
{
"epoch": 3.877985797288573,
"grad_norm": 0.7514626395050483,
"learning_rate": 2.8749124766472858e-06,
"loss": 0.8094,
"step": 1504
},
{
"epoch": 3.8805681084570693,
"grad_norm": 0.7683645531072449,
"learning_rate": 2.862262739433631e-06,
"loss": 0.8132,
"step": 1505
},
{
"epoch": 3.883150419625565,
"grad_norm": 0.7458416884658178,
"learning_rate": 2.8496362436391157e-06,
"loss": 0.8168,
"step": 1506
},
{
"epoch": 3.885732730794061,
"grad_norm": 0.7746722925229123,
"learning_rate": 2.8370330303772874e-06,
"loss": 0.7996,
"step": 1507
},
{
"epoch": 3.8883150419625565,
"grad_norm": 0.7553669840963009,
"learning_rate": 2.8244531406858765e-06,
"loss": 0.8288,
"step": 1508
},
{
"epoch": 3.8908973531310522,
"grad_norm": 0.7573845094295684,
"learning_rate": 2.81189661552667e-06,
"loss": 0.8374,
"step": 1509
},
{
"epoch": 3.893479664299548,
"grad_norm": 0.7400418799985741,
"learning_rate": 2.7993634957853843e-06,
"loss": 0.8375,
"step": 1510
},
{
"epoch": 3.8960619754680437,
"grad_norm": 0.7719250258407471,
"learning_rate": 2.7868538222715134e-06,
"loss": 0.826,
"step": 1511
},
{
"epoch": 3.89864428663654,
"grad_norm": 0.7563300858230262,
"learning_rate": 2.774367635718217e-06,
"loss": 0.7974,
"step": 1512
},
{
"epoch": 3.9012265978050356,
"grad_norm": 0.7458708691408243,
"learning_rate": 2.761904976782177e-06,
"loss": 0.8012,
"step": 1513
},
{
"epoch": 3.9038089089735313,
"grad_norm": 0.7442133907212378,
"learning_rate": 2.749465886043462e-06,
"loss": 0.8129,
"step": 1514
},
{
"epoch": 3.906391220142027,
"grad_norm": 0.7606057865370476,
"learning_rate": 2.7370504040053957e-06,
"loss": 0.7908,
"step": 1515
},
{
"epoch": 3.908973531310523,
"grad_norm": 0.7415475080933377,
"learning_rate": 2.7246585710944383e-06,
"loss": 0.8383,
"step": 1516
},
{
"epoch": 3.911555842479019,
"grad_norm": 0.7747366485131825,
"learning_rate": 2.7122904276600483e-06,
"loss": 0.8299,
"step": 1517
},
{
"epoch": 3.9141381536475146,
"grad_norm": 0.7470976085296888,
"learning_rate": 2.699946013974527e-06,
"loss": 0.8225,
"step": 1518
},
{
"epoch": 3.9167204648160103,
"grad_norm": 0.759056171874551,
"learning_rate": 2.68762537023293e-06,
"loss": 0.8079,
"step": 1519
},
{
"epoch": 3.919302775984506,
"grad_norm": 0.7626528987921768,
"learning_rate": 2.6753285365529103e-06,
"loss": 0.8272,
"step": 1520
},
{
"epoch": 3.9218850871530018,
"grad_norm": 0.7651769802741644,
"learning_rate": 2.6630555529745826e-06,
"loss": 0.8338,
"step": 1521
},
{
"epoch": 3.9244673983214975,
"grad_norm": 0.7774095564165737,
"learning_rate": 2.6508064594604157e-06,
"loss": 0.8203,
"step": 1522
},
{
"epoch": 3.9270497094899937,
"grad_norm": 0.7651143778092684,
"learning_rate": 2.638581295895075e-06,
"loss": 0.849,
"step": 1523
},
{
"epoch": 3.9296320206584894,
"grad_norm": 0.7903147596028517,
"learning_rate": 2.626380102085322e-06,
"loss": 0.8106,
"step": 1524
},
{
"epoch": 3.932214331826985,
"grad_norm": 0.7530887651415382,
"learning_rate": 2.614202917759855e-06,
"loss": 0.8333,
"step": 1525
},
{
"epoch": 3.934796642995481,
"grad_norm": 0.77568209115188,
"learning_rate": 2.602049782569206e-06,
"loss": 0.8137,
"step": 1526
},
{
"epoch": 3.937378954163977,
"grad_norm": 0.7855983310276548,
"learning_rate": 2.5899207360855984e-06,
"loss": 0.7917,
"step": 1527
},
{
"epoch": 3.9399612653324727,
"grad_norm": 0.7531574331667908,
"learning_rate": 2.5778158178028045e-06,
"loss": 0.8178,
"step": 1528
},
{
"epoch": 3.9425435765009684,
"grad_norm": 0.7469480826297799,
"learning_rate": 2.5657350671360514e-06,
"loss": 0.844,
"step": 1529
},
{
"epoch": 3.945125887669464,
"grad_norm": 0.7762679927670401,
"learning_rate": 2.5536785234218664e-06,
"loss": 0.8234,
"step": 1530
},
{
"epoch": 3.94770819883796,
"grad_norm": 0.7554503148115285,
"learning_rate": 2.541646225917954e-06,
"loss": 0.8214,
"step": 1531
},
{
"epoch": 3.9502905100064556,
"grad_norm": 0.7509360975231941,
"learning_rate": 2.529638213803065e-06,
"loss": 0.8096,
"step": 1532
},
{
"epoch": 3.9528728211749513,
"grad_norm": 0.71654004355707,
"learning_rate": 2.5176545261768847e-06,
"loss": 0.8168,
"step": 1533
},
{
"epoch": 3.9554551323434475,
"grad_norm": 0.7790891430190677,
"learning_rate": 2.5056952020598913e-06,
"loss": 0.8014,
"step": 1534
},
{
"epoch": 3.958037443511943,
"grad_norm": 0.7484517687807049,
"learning_rate": 2.4937602803932237e-06,
"loss": 0.8326,
"step": 1535
},
{
"epoch": 3.960619754680439,
"grad_norm": 0.7746336735865199,
"learning_rate": 2.481849800038577e-06,
"loss": 0.8329,
"step": 1536
},
{
"epoch": 3.9632020658489346,
"grad_norm": 0.7508850795911187,
"learning_rate": 2.4699637997780503e-06,
"loss": 0.8104,
"step": 1537
},
{
"epoch": 3.965784377017431,
"grad_norm": 0.7624683021848369,
"learning_rate": 2.458102318314034e-06,
"loss": 0.8195,
"step": 1538
},
{
"epoch": 3.9683666881859265,
"grad_norm": 0.7576649154014872,
"learning_rate": 2.4462653942690895e-06,
"loss": 0.8154,
"step": 1539
},
{
"epoch": 3.9709489993544222,
"grad_norm": 0.7548460407193209,
"learning_rate": 2.4344530661858123e-06,
"loss": 0.8193,
"step": 1540
},
{
"epoch": 3.973531310522918,
"grad_norm": 0.7602374654670865,
"learning_rate": 2.422665372526708e-06,
"loss": 0.8203,
"step": 1541
},
{
"epoch": 3.9761136216914137,
"grad_norm": 0.7675841326705145,
"learning_rate": 2.410902351674066e-06,
"loss": 0.8207,
"step": 1542
},
{
"epoch": 3.9786959328599094,
"grad_norm": 0.7514141818157982,
"learning_rate": 2.399164041929846e-06,
"loss": 0.7885,
"step": 1543
},
{
"epoch": 3.9812782440284056,
"grad_norm": 0.7642082673174962,
"learning_rate": 2.387450481515543e-06,
"loss": 0.799,
"step": 1544
},
{
"epoch": 3.9838605551969013,
"grad_norm": 0.7494471147548254,
"learning_rate": 2.3757617085720617e-06,
"loss": 0.8128,
"step": 1545
},
{
"epoch": 3.986442866365397,
"grad_norm": 0.7566122069144386,
"learning_rate": 2.364097761159594e-06,
"loss": 0.8212,
"step": 1546
},
{
"epoch": 3.9890251775338927,
"grad_norm": 0.7695050070788328,
"learning_rate": 2.3524586772575055e-06,
"loss": 0.8265,
"step": 1547
},
{
"epoch": 3.991607488702389,
"grad_norm": 0.767828875724427,
"learning_rate": 2.3408444947641897e-06,
"loss": 0.8107,
"step": 1548
},
{
"epoch": 3.9941897998708846,
"grad_norm": 0.7241738464533221,
"learning_rate": 2.3292552514969723e-06,
"loss": 0.8248,
"step": 1549
},
{
"epoch": 3.9967721110393803,
"grad_norm": 0.7625460770903586,
"learning_rate": 2.3176909851919593e-06,
"loss": 0.8179,
"step": 1550
},
{
"epoch": 3.999354422207876,
"grad_norm": 0.741225630820047,
"learning_rate": 2.306151733503943e-06,
"loss": 0.7945,
"step": 1551
},
{
"epoch": 4.0,
"grad_norm": 1.6164851247119985,
"learning_rate": 2.294637534006251e-06,
"loss": 0.789,
"step": 1552
},
{
"epoch": 4.002582311168496,
"grad_norm": 1.424317513923708,
"learning_rate": 2.2831484241906456e-06,
"loss": 0.7301,
"step": 1553
},
{
"epoch": 4.005164622336991,
"grad_norm": 1.3624952198303995,
"learning_rate": 2.271684441467198e-06,
"loss": 0.7151,
"step": 1554
},
{
"epoch": 4.007746933505487,
"grad_norm": 1.2756030345247056,
"learning_rate": 2.2602456231641457e-06,
"loss": 0.73,
"step": 1555
},
{
"epoch": 4.010329244673983,
"grad_norm": 1.0331700783629776,
"learning_rate": 2.2488320065278034e-06,
"loss": 0.6833,
"step": 1556
},
{
"epoch": 4.012911555842479,
"grad_norm": 0.9404904636726831,
"learning_rate": 2.2374436287224245e-06,
"loss": 0.728,
"step": 1557
},
{
"epoch": 4.015493867010975,
"grad_norm": 0.9727430065578684,
"learning_rate": 2.22608052683007e-06,
"loss": 0.7489,
"step": 1558
},
{
"epoch": 4.018076178179471,
"grad_norm": 1.098170239940058,
"learning_rate": 2.214742737850514e-06,
"loss": 0.7356,
"step": 1559
},
{
"epoch": 4.020658489347967,
"grad_norm": 1.2733638094374413,
"learning_rate": 2.2034302987010938e-06,
"loss": 0.7244,
"step": 1560
},
{
"epoch": 4.023240800516462,
"grad_norm": 1.2876487754543966,
"learning_rate": 2.192143246216618e-06,
"loss": 0.71,
"step": 1561
},
{
"epoch": 4.025823111684958,
"grad_norm": 1.2629073802929212,
"learning_rate": 2.180881617149221e-06,
"loss": 0.7205,
"step": 1562
},
{
"epoch": 4.028405422853454,
"grad_norm": 1.179837995515697,
"learning_rate": 2.169645448168265e-06,
"loss": 0.7329,
"step": 1563
},
{
"epoch": 4.0309877340219495,
"grad_norm": 1.0543627752747324,
"learning_rate": 2.158434775860205e-06,
"loss": 0.7173,
"step": 1564
},
{
"epoch": 4.033570045190445,
"grad_norm": 0.9861615682326618,
"learning_rate": 2.1472496367284746e-06,
"loss": 0.7369,
"step": 1565
},
{
"epoch": 4.036152356358941,
"grad_norm": 1.0060005644443055,
"learning_rate": 2.1360900671933703e-06,
"loss": 0.7039,
"step": 1566
},
{
"epoch": 4.038734667527437,
"grad_norm": 0.949784297792835,
"learning_rate": 2.1249561035919364e-06,
"loss": 0.7236,
"step": 1567
},
{
"epoch": 4.041316978695932,
"grad_norm": 0.9737652328085534,
"learning_rate": 2.113847782177829e-06,
"loss": 0.7088,
"step": 1568
},
{
"epoch": 4.043899289864429,
"grad_norm": 0.9725504207142661,
"learning_rate": 2.1027651391212158e-06,
"loss": 0.7366,
"step": 1569
},
{
"epoch": 4.046481601032925,
"grad_norm": 0.9215401564734376,
"learning_rate": 2.091708210508654e-06,
"loss": 0.7031,
"step": 1570
},
{
"epoch": 4.0490639122014205,
"grad_norm": 0.902423487206708,
"learning_rate": 2.0806770323429725e-06,
"loss": 0.7369,
"step": 1571
},
{
"epoch": 4.051646223369916,
"grad_norm": 0.9424761496048374,
"learning_rate": 2.069671640543147e-06,
"loss": 0.7624,
"step": 1572
},
{
"epoch": 4.054228534538412,
"grad_norm": 0.9147025826090719,
"learning_rate": 2.0586920709441916e-06,
"loss": 0.719,
"step": 1573
},
{
"epoch": 4.056810845706908,
"grad_norm": 0.8911633869358179,
"learning_rate": 2.0477383592970445e-06,
"loss": 0.6934,
"step": 1574
},
{
"epoch": 4.059393156875403,
"grad_norm": 1.0041409401766892,
"learning_rate": 2.0368105412684393e-06,
"loss": 0.7207,
"step": 1575
},
{
"epoch": 4.061975468043899,
"grad_norm": 1.0810423533153977,
"learning_rate": 2.0259086524408036e-06,
"loss": 0.7488,
"step": 1576
},
{
"epoch": 4.064557779212395,
"grad_norm": 1.081615913030172,
"learning_rate": 2.015032728312134e-06,
"loss": 0.7308,
"step": 1577
},
{
"epoch": 4.0671400903808905,
"grad_norm": 0.9264001598492572,
"learning_rate": 2.0041828042958823e-06,
"loss": 0.7099,
"step": 1578
},
{
"epoch": 4.069722401549387,
"grad_norm": 0.9108042452749365,
"learning_rate": 1.9933589157208356e-06,
"loss": 0.706,
"step": 1579
},
{
"epoch": 4.072304712717883,
"grad_norm": 0.8937819117625528,
"learning_rate": 1.9825610978310127e-06,
"loss": 0.7104,
"step": 1580
},
{
"epoch": 4.074887023886379,
"grad_norm": 0.8825092444237356,
"learning_rate": 1.9717893857855475e-06,
"loss": 0.7053,
"step": 1581
},
{
"epoch": 4.077469335054874,
"grad_norm": 0.8817000599546978,
"learning_rate": 1.961043814658552e-06,
"loss": 0.7098,
"step": 1582
},
{
"epoch": 4.08005164622337,
"grad_norm": 0.9005717565381924,
"learning_rate": 1.950324419439035e-06,
"loss": 0.6968,
"step": 1583
},
{
"epoch": 4.082633957391866,
"grad_norm": 0.9122929744904504,
"learning_rate": 1.9396312350307722e-06,
"loss": 0.7119,
"step": 1584
},
{
"epoch": 4.0852162685603615,
"grad_norm": 0.8952175519583466,
"learning_rate": 1.9289642962521847e-06,
"loss": 0.7177,
"step": 1585
},
{
"epoch": 4.087798579728857,
"grad_norm": 0.8962217979338948,
"learning_rate": 1.918323637836247e-06,
"loss": 0.7047,
"step": 1586
},
{
"epoch": 4.090380890897353,
"grad_norm": 0.898920603548723,
"learning_rate": 1.9077092944303453e-06,
"loss": 0.7328,
"step": 1587
},
{
"epoch": 4.092963202065849,
"grad_norm": 0.9407672412231067,
"learning_rate": 1.8971213005961985e-06,
"loss": 0.7244,
"step": 1588
},
{
"epoch": 4.095545513234344,
"grad_norm": 0.9175278092820267,
"learning_rate": 1.8865596908097105e-06,
"loss": 0.7076,
"step": 1589
},
{
"epoch": 4.098127824402841,
"grad_norm": 0.9219067326264855,
"learning_rate": 1.8760244994608911e-06,
"loss": 0.7205,
"step": 1590
},
{
"epoch": 4.100710135571337,
"grad_norm": 0.9313883405997428,
"learning_rate": 1.8655157608537156e-06,
"loss": 0.7329,
"step": 1591
},
{
"epoch": 4.103292446739832,
"grad_norm": 0.9069473980493022,
"learning_rate": 1.855033509206029e-06,
"loss": 0.7058,
"step": 1592
},
{
"epoch": 4.105874757908328,
"grad_norm": 0.9042132782800456,
"learning_rate": 1.8445777786494356e-06,
"loss": 0.722,
"step": 1593
},
{
"epoch": 4.108457069076824,
"grad_norm": 0.8984738476555618,
"learning_rate": 1.8341486032291834e-06,
"loss": 0.6965,
"step": 1594
},
{
"epoch": 4.11103938024532,
"grad_norm": 0.9033341125566603,
"learning_rate": 1.823746016904049e-06,
"loss": 0.7043,
"step": 1595
},
{
"epoch": 4.113621691413815,
"grad_norm": 0.9024091267438483,
"learning_rate": 1.8133700535462274e-06,
"loss": 0.7181,
"step": 1596
},
{
"epoch": 4.116204002582311,
"grad_norm": 0.8840547702331727,
"learning_rate": 1.8030207469412374e-06,
"loss": 0.7137,
"step": 1597
},
{
"epoch": 4.118786313750807,
"grad_norm": 0.8791770361734527,
"learning_rate": 1.7926981307877944e-06,
"loss": 0.707,
"step": 1598
},
{
"epoch": 4.1213686249193024,
"grad_norm": 0.8765500683094899,
"learning_rate": 1.7824022386977014e-06,
"loss": 0.7332,
"step": 1599
},
{
"epoch": 4.123950936087798,
"grad_norm": 0.8727571261897714,
"learning_rate": 1.7721331041957535e-06,
"loss": 0.7026,
"step": 1600
},
{
"epoch": 4.126533247256295,
"grad_norm": 0.8893215641197096,
"learning_rate": 1.7618907607196112e-06,
"loss": 0.699,
"step": 1601
},
{
"epoch": 4.1291155584247905,
"grad_norm": 0.8809282308536279,
"learning_rate": 1.7516752416197013e-06,
"loss": 0.6937,
"step": 1602
},
{
"epoch": 4.131697869593286,
"grad_norm": 0.9065700149312429,
"learning_rate": 1.741486580159112e-06,
"loss": 0.7156,
"step": 1603
},
{
"epoch": 4.134280180761782,
"grad_norm": 0.9172825352706213,
"learning_rate": 1.7313248095134772e-06,
"loss": 0.7224,
"step": 1604
},
{
"epoch": 4.136862491930278,
"grad_norm": 0.9359192197464888,
"learning_rate": 1.7211899627708694e-06,
"loss": 0.7159,
"step": 1605
},
{
"epoch": 4.139444803098773,
"grad_norm": 0.8923723735789315,
"learning_rate": 1.711082072931689e-06,
"loss": 0.7144,
"step": 1606
},
{
"epoch": 4.142027114267269,
"grad_norm": 0.8698447798036731,
"learning_rate": 1.7010011729085696e-06,
"loss": 0.7183,
"step": 1607
},
{
"epoch": 4.144609425435765,
"grad_norm": 0.9184478130391627,
"learning_rate": 1.6909472955262596e-06,
"loss": 0.7542,
"step": 1608
},
{
"epoch": 4.1471917366042605,
"grad_norm": 0.8850461139260019,
"learning_rate": 1.6809204735215179e-06,
"loss": 0.7186,
"step": 1609
},
{
"epoch": 4.149774047772756,
"grad_norm": 0.8889238278435273,
"learning_rate": 1.6709207395430005e-06,
"loss": 0.7405,
"step": 1610
},
{
"epoch": 4.152356358941253,
"grad_norm": 0.9208003199238048,
"learning_rate": 1.660948126151175e-06,
"loss": 0.7124,
"step": 1611
},
{
"epoch": 4.154938670109749,
"grad_norm": 0.8762844910372851,
"learning_rate": 1.6510026658181866e-06,
"loss": 0.7292,
"step": 1612
},
{
"epoch": 4.157520981278244,
"grad_norm": 0.8940625291851263,
"learning_rate": 1.6410843909277784e-06,
"loss": 0.7186,
"step": 1613
},
{
"epoch": 4.16010329244674,
"grad_norm": 0.8584435328232947,
"learning_rate": 1.6311933337751652e-06,
"loss": 0.7018,
"step": 1614
},
{
"epoch": 4.162685603615236,
"grad_norm": 0.8889295547847345,
"learning_rate": 1.6213295265669448e-06,
"loss": 0.713,
"step": 1615
},
{
"epoch": 4.1652679147837315,
"grad_norm": 0.8961567659168423,
"learning_rate": 1.6114930014209763e-06,
"loss": 0.716,
"step": 1616
},
{
"epoch": 4.167850225952227,
"grad_norm": 0.8623292807248303,
"learning_rate": 1.601683790366293e-06,
"loss": 0.7409,
"step": 1617
},
{
"epoch": 4.170432537120723,
"grad_norm": 0.9014567493180559,
"learning_rate": 1.5919019253429923e-06,
"loss": 0.7147,
"step": 1618
},
{
"epoch": 4.173014848289219,
"grad_norm": 0.8966269088663105,
"learning_rate": 1.5821474382021128e-06,
"loss": 0.7202,
"step": 1619
},
{
"epoch": 4.175597159457714,
"grad_norm": 0.9039554140412117,
"learning_rate": 1.5724203607055655e-06,
"loss": 0.7208,
"step": 1620
},
{
"epoch": 4.17817947062621,
"grad_norm": 0.9117935626371781,
"learning_rate": 1.5627207245260046e-06,
"loss": 0.7252,
"step": 1621
},
{
"epoch": 4.180761781794706,
"grad_norm": 0.8838537392487884,
"learning_rate": 1.5530485612467317e-06,
"loss": 0.7143,
"step": 1622
},
{
"epoch": 4.183344092963202,
"grad_norm": 0.8872948835258441,
"learning_rate": 1.54340390236159e-06,
"loss": 0.6962,
"step": 1623
},
{
"epoch": 4.185926404131698,
"grad_norm": 0.8960284062739021,
"learning_rate": 1.5337867792748694e-06,
"loss": 0.7195,
"step": 1624
},
{
"epoch": 4.188508715300194,
"grad_norm": 0.8995137169581848,
"learning_rate": 1.5241972233012015e-06,
"loss": 0.6987,
"step": 1625
},
{
"epoch": 4.19109102646869,
"grad_norm": 0.8725877233542136,
"learning_rate": 1.5146352656654473e-06,
"loss": 0.6822,
"step": 1626
},
{
"epoch": 4.193673337637185,
"grad_norm": 0.9022708955863336,
"learning_rate": 1.5051009375026127e-06,
"loss": 0.7124,
"step": 1627
},
{
"epoch": 4.196255648805681,
"grad_norm": 0.894833962035567,
"learning_rate": 1.4955942698577341e-06,
"loss": 0.7362,
"step": 1628
},
{
"epoch": 4.198837959974177,
"grad_norm": 0.9297395231684169,
"learning_rate": 1.4861152936857792e-06,
"loss": 0.7272,
"step": 1629
},
{
"epoch": 4.2014202711426725,
"grad_norm": 0.9212227444100315,
"learning_rate": 1.476664039851554e-06,
"loss": 0.7345,
"step": 1630
},
{
"epoch": 4.204002582311168,
"grad_norm": 0.9040538736898476,
"learning_rate": 1.4672405391295964e-06,
"loss": 0.7202,
"step": 1631
},
{
"epoch": 4.206584893479664,
"grad_norm": 0.90954184793202,
"learning_rate": 1.4578448222040708e-06,
"loss": 0.7144,
"step": 1632
},
{
"epoch": 4.2091672046481605,
"grad_norm": 0.8765308727348899,
"learning_rate": 1.4484769196686777e-06,
"loss": 0.6932,
"step": 1633
},
{
"epoch": 4.211749515816656,
"grad_norm": 0.8913725709084924,
"learning_rate": 1.4391368620265522e-06,
"loss": 0.6839,
"step": 1634
},
{
"epoch": 4.214331826985152,
"grad_norm": 0.8949496649062495,
"learning_rate": 1.4298246796901615e-06,
"loss": 0.7081,
"step": 1635
},
{
"epoch": 4.216914138153648,
"grad_norm": 0.8890323192323862,
"learning_rate": 1.4205404029812043e-06,
"loss": 0.7148,
"step": 1636
},
{
"epoch": 4.219496449322143,
"grad_norm": 0.8898844083009926,
"learning_rate": 1.4112840621305156e-06,
"loss": 0.7055,
"step": 1637
},
{
"epoch": 4.222078760490639,
"grad_norm": 0.8973037287194257,
"learning_rate": 1.4020556872779723e-06,
"loss": 0.7001,
"step": 1638
},
{
"epoch": 4.224661071659135,
"grad_norm": 0.8881050754779889,
"learning_rate": 1.3928553084723828e-06,
"loss": 0.7029,
"step": 1639
},
{
"epoch": 4.227243382827631,
"grad_norm": 0.9282818296375939,
"learning_rate": 1.3836829556714027e-06,
"loss": 0.7436,
"step": 1640
},
{
"epoch": 4.229825693996126,
"grad_norm": 0.896945078296962,
"learning_rate": 1.3745386587414312e-06,
"loss": 0.7051,
"step": 1641
},
{
"epoch": 4.232408005164622,
"grad_norm": 0.8904775792068886,
"learning_rate": 1.3654224474575105e-06,
"loss": 0.75,
"step": 1642
},
{
"epoch": 4.234990316333118,
"grad_norm": 0.9051407253126452,
"learning_rate": 1.3563343515032312e-06,
"loss": 0.7122,
"step": 1643
},
{
"epoch": 4.237572627501614,
"grad_norm": 0.875475228439986,
"learning_rate": 1.3472744004706406e-06,
"loss": 0.7138,
"step": 1644
},
{
"epoch": 4.24015493867011,
"grad_norm": 0.9170090364957498,
"learning_rate": 1.3382426238601443e-06,
"loss": 0.7209,
"step": 1645
},
{
"epoch": 4.242737249838606,
"grad_norm": 0.8963557044070765,
"learning_rate": 1.3292390510803987e-06,
"loss": 0.7207,
"step": 1646
},
{
"epoch": 4.2453195610071015,
"grad_norm": 0.8870017097792205,
"learning_rate": 1.320263711448232e-06,
"loss": 0.7344,
"step": 1647
},
{
"epoch": 4.247901872175597,
"grad_norm": 0.8643621623469757,
"learning_rate": 1.3113166341885453e-06,
"loss": 0.6909,
"step": 1648
},
{
"epoch": 4.250484183344093,
"grad_norm": 0.88693541516387,
"learning_rate": 1.3023978484342027e-06,
"loss": 0.7172,
"step": 1649
},
{
"epoch": 4.253066494512589,
"grad_norm": 0.8827410284281785,
"learning_rate": 1.293507383225958e-06,
"loss": 0.6974,
"step": 1650
},
{
"epoch": 4.255648805681084,
"grad_norm": 0.8921892267635948,
"learning_rate": 1.2846452675123412e-06,
"loss": 0.7198,
"step": 1651
},
{
"epoch": 4.25823111684958,
"grad_norm": 0.8699520076203895,
"learning_rate": 1.275811530149581e-06,
"loss": 0.712,
"step": 1652
},
{
"epoch": 4.260813428018076,
"grad_norm": 0.9053418709356683,
"learning_rate": 1.2670061999014926e-06,
"loss": 0.711,
"step": 1653
},
{
"epoch": 4.263395739186572,
"grad_norm": 0.8933588527941319,
"learning_rate": 1.2582293054394034e-06,
"loss": 0.7191,
"step": 1654
},
{
"epoch": 4.265978050355068,
"grad_norm": 0.8957606594695622,
"learning_rate": 1.249480875342044e-06,
"loss": 0.7013,
"step": 1655
},
{
"epoch": 4.268560361523564,
"grad_norm": 0.8896835872883254,
"learning_rate": 1.240760938095461e-06,
"loss": 0.6909,
"step": 1656
},
{
"epoch": 4.27114267269206,
"grad_norm": 0.902860652986643,
"learning_rate": 1.232069522092929e-06,
"loss": 0.7023,
"step": 1657
},
{
"epoch": 4.273724983860555,
"grad_norm": 0.8991644670101375,
"learning_rate": 1.2234066556348524e-06,
"loss": 0.7201,
"step": 1658
},
{
"epoch": 4.276307295029051,
"grad_norm": 0.871953669734789,
"learning_rate": 1.2147723669286703e-06,
"loss": 0.7135,
"step": 1659
},
{
"epoch": 4.278889606197547,
"grad_norm": 0.8976349496395944,
"learning_rate": 1.206166684088772e-06,
"loss": 0.7142,
"step": 1660
},
{
"epoch": 4.2814719173660425,
"grad_norm": 0.9150642282309124,
"learning_rate": 1.1975896351364036e-06,
"loss": 0.717,
"step": 1661
},
{
"epoch": 4.284054228534538,
"grad_norm": 0.89830091523198,
"learning_rate": 1.189041247999575e-06,
"loss": 0.7038,
"step": 1662
},
{
"epoch": 4.286636539703034,
"grad_norm": 0.9002768402003223,
"learning_rate": 1.1805215505129653e-06,
"loss": 0.7171,
"step": 1663
},
{
"epoch": 4.28921885087153,
"grad_norm": 0.9189338614452367,
"learning_rate": 1.1720305704178436e-06,
"loss": 0.7198,
"step": 1664
},
{
"epoch": 4.291801162040025,
"grad_norm": 0.9241169326437726,
"learning_rate": 1.1635683353619643e-06,
"loss": 0.733,
"step": 1665
},
{
"epoch": 4.294383473208522,
"grad_norm": 0.9074601151507607,
"learning_rate": 1.1551348728994849e-06,
"loss": 0.7251,
"step": 1666
},
{
"epoch": 4.296965784377018,
"grad_norm": 0.9201495613344312,
"learning_rate": 1.1467302104908796e-06,
"loss": 0.7261,
"step": 1667
},
{
"epoch": 4.299548095545513,
"grad_norm": 0.8938379902505227,
"learning_rate": 1.138354375502847e-06,
"loss": 0.6994,
"step": 1668
},
{
"epoch": 4.302130406714009,
"grad_norm": 0.9336658672822801,
"learning_rate": 1.1300073952082147e-06,
"loss": 0.7156,
"step": 1669
},
{
"epoch": 4.304712717882505,
"grad_norm": 0.8831440608193399,
"learning_rate": 1.121689296785854e-06,
"loss": 0.6995,
"step": 1670
},
{
"epoch": 4.307295029051001,
"grad_norm": 0.8799798301503751,
"learning_rate": 1.1134001073206025e-06,
"loss": 0.7193,
"step": 1671
},
{
"epoch": 4.309877340219496,
"grad_norm": 0.8804550173412993,
"learning_rate": 1.1051398538031544e-06,
"loss": 0.7258,
"step": 1672
},
{
"epoch": 4.312459651387992,
"grad_norm": 0.9036007303009939,
"learning_rate": 1.0969085631299946e-06,
"loss": 0.708,
"step": 1673
},
{
"epoch": 4.315041962556488,
"grad_norm": 0.9024030379125595,
"learning_rate": 1.0887062621032951e-06,
"loss": 0.7055,
"step": 1674
},
{
"epoch": 4.3176242737249835,
"grad_norm": 0.8894223361973113,
"learning_rate": 1.0805329774308392e-06,
"loss": 0.726,
"step": 1675
},
{
"epoch": 4.32020658489348,
"grad_norm": 0.885938538418052,
"learning_rate": 1.072388735725921e-06,
"loss": 0.703,
"step": 1676
},
{
"epoch": 4.322788896061976,
"grad_norm": 0.9305506401372774,
"learning_rate": 1.0642735635072764e-06,
"loss": 0.7266,
"step": 1677
},
{
"epoch": 4.3253712072304715,
"grad_norm": 0.9113060756686157,
"learning_rate": 1.0561874871989775e-06,
"loss": 0.7206,
"step": 1678
},
{
"epoch": 4.327953518398967,
"grad_norm": 0.889475937075751,
"learning_rate": 1.0481305331303659e-06,
"loss": 0.7145,
"step": 1679
},
{
"epoch": 4.330535829567463,
"grad_norm": 0.9193184675001664,
"learning_rate": 1.0401027275359487e-06,
"loss": 0.7213,
"step": 1680
},
{
"epoch": 4.333118140735959,
"grad_norm": 0.9057888219841576,
"learning_rate": 1.0321040965553286e-06,
"loss": 0.7142,
"step": 1681
},
{
"epoch": 4.335700451904454,
"grad_norm": 0.9181347146447232,
"learning_rate": 1.0241346662331075e-06,
"loss": 0.7001,
"step": 1682
},
{
"epoch": 4.33828276307295,
"grad_norm": 0.8761392579902068,
"learning_rate": 1.0161944625188046e-06,
"loss": 0.7144,
"step": 1683
},
{
"epoch": 4.340865074241446,
"grad_norm": 0.8957507000212769,
"learning_rate": 1.008283511266781e-06,
"loss": 0.72,
"step": 1684
},
{
"epoch": 4.343447385409942,
"grad_norm": 0.9048968155853336,
"learning_rate": 1.0004018382361414e-06,
"loss": 0.7154,
"step": 1685
},
{
"epoch": 4.346029696578437,
"grad_norm": 0.9061516663607375,
"learning_rate": 9.92549469090659e-07,
"loss": 0.7138,
"step": 1686
},
{
"epoch": 4.348612007746934,
"grad_norm": 0.907960683381014,
"learning_rate": 9.847264293986869e-07,
"loss": 0.6933,
"step": 1687
},
{
"epoch": 4.35119431891543,
"grad_norm": 0.9335500032392648,
"learning_rate": 9.769327446330802e-07,
"loss": 0.7203,
"step": 1688
},
{
"epoch": 4.353776630083925,
"grad_norm": 0.8979953039902708,
"learning_rate": 9.691684401711143e-07,
"loss": 0.7335,
"step": 1689
},
{
"epoch": 4.356358941252421,
"grad_norm": 0.884673020616097,
"learning_rate": 9.614335412943887e-07,
"loss": 0.7141,
"step": 1690
},
{
"epoch": 4.358941252420917,
"grad_norm": 0.8749160348750349,
"learning_rate": 9.537280731887644e-07,
"loss": 0.7038,
"step": 1691
},
{
"epoch": 4.3615235635894125,
"grad_norm": 0.8709729980037048,
"learning_rate": 9.460520609442647e-07,
"loss": 0.6812,
"step": 1692
},
{
"epoch": 4.364105874757908,
"grad_norm": 0.8750487222127739,
"learning_rate": 9.384055295550032e-07,
"loss": 0.69,
"step": 1693
},
{
"epoch": 4.366688185926404,
"grad_norm": 0.8996853241438628,
"learning_rate": 9.307885039191011e-07,
"loss": 0.7232,
"step": 1694
},
{
"epoch": 4.3692704970949,
"grad_norm": 0.908802912077574,
"learning_rate": 9.232010088386067e-07,
"loss": 0.7062,
"step": 1695
},
{
"epoch": 4.371852808263395,
"grad_norm": 0.9168462712654659,
"learning_rate": 9.156430690194074e-07,
"loss": 0.7084,
"step": 1696
},
{
"epoch": 4.374435119431892,
"grad_norm": 0.9259940687964294,
"learning_rate": 9.081147090711562e-07,
"loss": 0.742,
"step": 1697
},
{
"epoch": 4.377017430600388,
"grad_norm": 0.9383290055652561,
"learning_rate": 9.006159535071945e-07,
"loss": 0.7364,
"step": 1698
},
{
"epoch": 4.3795997417688834,
"grad_norm": 0.8900989908997234,
"learning_rate": 8.93146826744462e-07,
"loss": 0.6925,
"step": 1699
},
{
"epoch": 4.382182052937379,
"grad_norm": 0.906245275227718,
"learning_rate": 8.8570735310343e-07,
"loss": 0.7152,
"step": 1700
},
{
"epoch": 4.384764364105875,
"grad_norm": 0.8817115384410557,
"learning_rate": 8.782975568080066e-07,
"loss": 0.7119,
"step": 1701
},
{
"epoch": 4.387346675274371,
"grad_norm": 0.8936803543044946,
"learning_rate": 8.709174619854766e-07,
"loss": 0.7221,
"step": 1702
},
{
"epoch": 4.389928986442866,
"grad_norm": 0.9106707340778925,
"learning_rate": 8.635670926664019e-07,
"loss": 0.7159,
"step": 1703
},
{
"epoch": 4.392511297611362,
"grad_norm": 0.8995573561800408,
"learning_rate": 8.562464727845621e-07,
"loss": 0.7232,
"step": 1704
},
{
"epoch": 4.395093608779858,
"grad_norm": 0.9242674966931874,
"learning_rate": 8.489556261768694e-07,
"loss": 0.7511,
"step": 1705
},
{
"epoch": 4.3976759199483535,
"grad_norm": 0.8973937221578497,
"learning_rate": 8.41694576583284e-07,
"loss": 0.7151,
"step": 1706
},
{
"epoch": 4.400258231116849,
"grad_norm": 0.9016096213701568,
"learning_rate": 8.344633476467456e-07,
"loss": 0.7555,
"step": 1707
},
{
"epoch": 4.402840542285345,
"grad_norm": 0.9070638086527365,
"learning_rate": 8.272619629130984e-07,
"loss": 0.7405,
"step": 1708
},
{
"epoch": 4.4054228534538415,
"grad_norm": 0.8882275237620317,
"learning_rate": 8.200904458310022e-07,
"loss": 0.6947,
"step": 1709
},
{
"epoch": 4.408005164622337,
"grad_norm": 0.8804791482936096,
"learning_rate": 8.129488197518687e-07,
"loss": 0.6977,
"step": 1710
},
{
"epoch": 4.410587475790833,
"grad_norm": 0.9272005439255128,
"learning_rate": 8.0583710792978e-07,
"loss": 0.7132,
"step": 1711
},
{
"epoch": 4.413169786959329,
"grad_norm": 0.8899287069527094,
"learning_rate": 7.987553335214149e-07,
"loss": 0.731,
"step": 1712
},
{
"epoch": 4.415752098127824,
"grad_norm": 0.8808619191876175,
"learning_rate": 7.917035195859668e-07,
"loss": 0.7265,
"step": 1713
},
{
"epoch": 4.41833440929632,
"grad_norm": 0.8969831722003705,
"learning_rate": 7.846816890850806e-07,
"loss": 0.7116,
"step": 1714
},
{
"epoch": 4.420916720464816,
"grad_norm": 0.8924810647720324,
"learning_rate": 7.776898648827647e-07,
"loss": 0.7146,
"step": 1715
},
{
"epoch": 4.423499031633312,
"grad_norm": 1.1018976996499141,
"learning_rate": 7.707280697453256e-07,
"loss": 0.6941,
"step": 1716
},
{
"epoch": 4.426081342801807,
"grad_norm": 0.894570214234641,
"learning_rate": 7.637963263412929e-07,
"loss": 0.7145,
"step": 1717
},
{
"epoch": 4.428663653970303,
"grad_norm": 0.907267880331535,
"learning_rate": 7.568946572413438e-07,
"loss": 0.7239,
"step": 1718
},
{
"epoch": 4.4312459651388,
"grad_norm": 0.904439736544676,
"learning_rate": 7.500230849182278e-07,
"loss": 0.7148,
"step": 1719
},
{
"epoch": 4.433828276307295,
"grad_norm": 0.9165453956707031,
"learning_rate": 7.431816317466923e-07,
"loss": 0.7276,
"step": 1720
},
{
"epoch": 4.436410587475791,
"grad_norm": 0.8958325943917459,
"learning_rate": 7.363703200034177e-07,
"loss": 0.7121,
"step": 1721
},
{
"epoch": 4.438992898644287,
"grad_norm": 0.9140196598832558,
"learning_rate": 7.295891718669423e-07,
"loss": 0.7331,
"step": 1722
},
{
"epoch": 4.4415752098127825,
"grad_norm": 0.8872380269740475,
"learning_rate": 7.228382094175801e-07,
"loss": 0.7001,
"step": 1723
},
{
"epoch": 4.444157520981278,
"grad_norm": 0.9270216958684496,
"learning_rate": 7.161174546373595e-07,
"loss": 0.7181,
"step": 1724
},
{
"epoch": 4.446739832149774,
"grad_norm": 0.9132455796590981,
"learning_rate": 7.094269294099509e-07,
"loss": 0.731,
"step": 1725
},
{
"epoch": 4.44932214331827,
"grad_norm": 0.9227817170397697,
"learning_rate": 7.027666555205915e-07,
"loss": 0.7337,
"step": 1726
},
{
"epoch": 4.451904454486765,
"grad_norm": 0.9210558504510359,
"learning_rate": 6.961366546560156e-07,
"loss": 0.7291,
"step": 1727
},
{
"epoch": 4.454486765655261,
"grad_norm": 0.9075281818010581,
"learning_rate": 6.895369484043879e-07,
"loss": 0.7321,
"step": 1728
},
{
"epoch": 4.457069076823757,
"grad_norm": 1.0617614842864411,
"learning_rate": 6.829675582552253e-07,
"loss": 0.6943,
"step": 1729
},
{
"epoch": 4.4596513879922535,
"grad_norm": 0.9007403795148768,
"learning_rate": 6.764285055993313e-07,
"loss": 0.7094,
"step": 1730
},
{
"epoch": 4.462233699160749,
"grad_norm": 0.9117234111423507,
"learning_rate": 6.699198117287309e-07,
"loss": 0.7385,
"step": 1731
},
{
"epoch": 4.464816010329245,
"grad_norm": 0.8912704657574764,
"learning_rate": 6.634414978365978e-07,
"loss": 0.7145,
"step": 1732
},
{
"epoch": 4.467398321497741,
"grad_norm": 0.9112230980418938,
"learning_rate": 6.569935850171749e-07,
"loss": 0.7199,
"step": 1733
},
{
"epoch": 4.469980632666236,
"grad_norm": 0.9068319118246875,
"learning_rate": 6.505760942657235e-07,
"loss": 0.728,
"step": 1734
},
{
"epoch": 4.472562943834732,
"grad_norm": 0.8698085381695577,
"learning_rate": 6.441890464784473e-07,
"loss": 0.6873,
"step": 1735
},
{
"epoch": 4.475145255003228,
"grad_norm": 0.8820814075457806,
"learning_rate": 6.37832462452418e-07,
"loss": 0.7087,
"step": 1736
},
{
"epoch": 4.4777275661717235,
"grad_norm": 0.9076300266568639,
"learning_rate": 6.315063628855178e-07,
"loss": 0.7207,
"step": 1737
},
{
"epoch": 4.480309877340219,
"grad_norm": 0.8761294190152139,
"learning_rate": 6.252107683763642e-07,
"loss": 0.7028,
"step": 1738
},
{
"epoch": 4.482892188508715,
"grad_norm": 0.9023071618115094,
"learning_rate": 6.189456994242516e-07,
"loss": 0.7548,
"step": 1739
},
{
"epoch": 4.485474499677212,
"grad_norm": 0.9032118146111997,
"learning_rate": 6.127111764290694e-07,
"loss": 0.7198,
"step": 1740
},
{
"epoch": 4.488056810845707,
"grad_norm": 0.9423906564703732,
"learning_rate": 6.065072196912569e-07,
"loss": 0.7192,
"step": 1741
},
{
"epoch": 4.490639122014203,
"grad_norm": 0.8871664697024927,
"learning_rate": 6.003338494117183e-07,
"loss": 0.7261,
"step": 1742
},
{
"epoch": 4.493221433182699,
"grad_norm": 0.8723986193214446,
"learning_rate": 5.941910856917643e-07,
"loss": 0.6919,
"step": 1743
},
{
"epoch": 4.4958037443511945,
"grad_norm": 0.9160786881274069,
"learning_rate": 5.880789485330484e-07,
"loss": 0.7184,
"step": 1744
},
{
"epoch": 4.49838605551969,
"grad_norm": 0.8749173043898083,
"learning_rate": 5.81997457837502e-07,
"loss": 0.7038,
"step": 1745
},
{
"epoch": 4.500968366688186,
"grad_norm": 0.8836993274435628,
"learning_rate": 5.75946633407265e-07,
"loss": 0.7058,
"step": 1746
},
{
"epoch": 4.503550677856682,
"grad_norm": 0.9168059326784899,
"learning_rate": 5.699264949446215e-07,
"loss": 0.7576,
"step": 1747
},
{
"epoch": 4.506132989025177,
"grad_norm": 0.916608005705834,
"learning_rate": 5.639370620519424e-07,
"loss": 0.7176,
"step": 1748
},
{
"epoch": 4.508715300193673,
"grad_norm": 0.8792129119048742,
"learning_rate": 5.579783542316175e-07,
"loss": 0.7004,
"step": 1749
},
{
"epoch": 4.511297611362169,
"grad_norm": 0.9102642584599099,
"learning_rate": 5.520503908859876e-07,
"loss": 0.7296,
"step": 1750
},
{
"epoch": 4.5138799225306645,
"grad_norm": 0.8976181656024875,
"learning_rate": 5.461531913172869e-07,
"loss": 0.7137,
"step": 1751
},
{
"epoch": 4.516462233699161,
"grad_norm": 0.8955819573412418,
"learning_rate": 5.40286774727582e-07,
"loss": 0.7243,
"step": 1752
},
{
"epoch": 4.519044544867657,
"grad_norm": 0.8848415473903943,
"learning_rate": 5.344511602186986e-07,
"loss": 0.6937,
"step": 1753
},
{
"epoch": 4.5216268560361526,
"grad_norm": 0.8935210617924505,
"learning_rate": 5.28646366792176e-07,
"loss": 0.7115,
"step": 1754
},
{
"epoch": 4.524209167204648,
"grad_norm": 0.9277553981673546,
"learning_rate": 5.228724133491903e-07,
"loss": 0.7464,
"step": 1755
},
{
"epoch": 4.526791478373144,
"grad_norm": 0.8697756183845172,
"learning_rate": 5.171293186904991e-07,
"loss": 0.6713,
"step": 1756
},
{
"epoch": 4.52937378954164,
"grad_norm": 0.8699331724645898,
"learning_rate": 5.114171015163793e-07,
"loss": 0.6981,
"step": 1757
},
{
"epoch": 4.531956100710135,
"grad_norm": 0.8867759757326906,
"learning_rate": 5.057357804265695e-07,
"loss": 0.713,
"step": 1758
},
{
"epoch": 4.534538411878631,
"grad_norm": 0.8965559347020167,
"learning_rate": 5.000853739202039e-07,
"loss": 0.7084,
"step": 1759
},
{
"epoch": 4.537120723047127,
"grad_norm": 0.9176654044616335,
"learning_rate": 4.944659003957564e-07,
"loss": 0.7214,
"step": 1760
},
{
"epoch": 4.539703034215623,
"grad_norm": 0.9221143520181255,
"learning_rate": 4.888773781509748e-07,
"loss": 0.737,
"step": 1761
},
{
"epoch": 4.542285345384119,
"grad_norm": 0.9322259452120046,
"learning_rate": 4.833198253828331e-07,
"loss": 0.7416,
"step": 1762
},
{
"epoch": 4.544867656552615,
"grad_norm": 0.9208718551935449,
"learning_rate": 4.777932601874557e-07,
"loss": 0.7487,
"step": 1763
},
{
"epoch": 4.547449967721111,
"grad_norm": 0.8890662647225912,
"learning_rate": 4.7229770056007707e-07,
"loss": 0.6894,
"step": 1764
},
{
"epoch": 4.550032278889606,
"grad_norm": 0.887652550850276,
"learning_rate": 4.66833164394962e-07,
"loss": 0.7031,
"step": 1765
},
{
"epoch": 4.552614590058102,
"grad_norm": 0.8951195625274412,
"learning_rate": 4.6139966948537064e-07,
"loss": 0.7419,
"step": 1766
},
{
"epoch": 4.555196901226598,
"grad_norm": 0.9259533456732394,
"learning_rate": 4.5599723352347857e-07,
"loss": 0.6975,
"step": 1767
},
{
"epoch": 4.5577792123950935,
"grad_norm": 0.9119373622565761,
"learning_rate": 4.5062587410033663e-07,
"loss": 0.727,
"step": 1768
},
{
"epoch": 4.560361523563589,
"grad_norm": 0.8749876771396683,
"learning_rate": 4.452856087058044e-07,
"loss": 0.6747,
"step": 1769
},
{
"epoch": 4.562943834732085,
"grad_norm": 0.8781088555589601,
"learning_rate": 4.3997645472849016e-07,
"loss": 0.7024,
"step": 1770
},
{
"epoch": 4.565526145900581,
"grad_norm": 0.8811255741349678,
"learning_rate": 4.346984294557055e-07,
"loss": 0.7078,
"step": 1771
},
{
"epoch": 4.568108457069076,
"grad_norm": 0.9013328171452745,
"learning_rate": 4.29451550073402e-07,
"loss": 0.7471,
"step": 1772
},
{
"epoch": 4.570690768237572,
"grad_norm": 0.9233092729022429,
"learning_rate": 4.2423583366611345e-07,
"loss": 0.7443,
"step": 1773
},
{
"epoch": 4.573273079406069,
"grad_norm": 0.8783319514354955,
"learning_rate": 4.190512972169036e-07,
"loss": 0.7247,
"step": 1774
},
{
"epoch": 4.5758553905745645,
"grad_norm": 0.8856387585636677,
"learning_rate": 4.13897957607311e-07,
"loss": 0.701,
"step": 1775
},
{
"epoch": 4.57843770174306,
"grad_norm": 0.9032296044304654,
"learning_rate": 4.0877583161729406e-07,
"loss": 0.7,
"step": 1776
},
{
"epoch": 4.581020012911556,
"grad_norm": 0.8704929901157932,
"learning_rate": 4.036849359251738e-07,
"loss": 0.7071,
"step": 1777
},
{
"epoch": 4.583602324080052,
"grad_norm": 0.9068350583367365,
"learning_rate": 3.986252871075813e-07,
"loss": 0.6992,
"step": 1778
},
{
"epoch": 4.586184635248547,
"grad_norm": 0.874153868567516,
"learning_rate": 3.935969016394048e-07,
"loss": 0.708,
"step": 1779
},
{
"epoch": 4.588766946417043,
"grad_norm": 0.8976406116845383,
"learning_rate": 3.8859979589373265e-07,
"loss": 0.7182,
"step": 1780
},
{
"epoch": 4.591349257585539,
"grad_norm": 0.8973977227814353,
"learning_rate": 3.836339861418059e-07,
"loss": 0.6996,
"step": 1781
},
{
"epoch": 4.5939315687540345,
"grad_norm": 0.9057741077447837,
"learning_rate": 3.786994885529582e-07,
"loss": 0.707,
"step": 1782
},
{
"epoch": 4.596513879922531,
"grad_norm": 0.9248023645258433,
"learning_rate": 3.7379631919457036e-07,
"loss": 0.7433,
"step": 1783
},
{
"epoch": 4.599096191091027,
"grad_norm": 0.8815428248966636,
"learning_rate": 3.6892449403200805e-07,
"loss": 0.7049,
"step": 1784
},
{
"epoch": 4.601678502259523,
"grad_norm": 0.8865336637004515,
"learning_rate": 3.6408402892858297e-07,
"loss": 0.7074,
"step": 1785
},
{
"epoch": 4.604260813428018,
"grad_norm": 0.8908085795505662,
"learning_rate": 3.592749396454931e-07,
"loss": 0.7158,
"step": 1786
},
{
"epoch": 4.606843124596514,
"grad_norm": 0.895409574949093,
"learning_rate": 3.5449724184176695e-07,
"loss": 0.7006,
"step": 1787
},
{
"epoch": 4.60942543576501,
"grad_norm": 0.8953545545983936,
"learning_rate": 3.4975095107422473e-07,
"loss": 0.7043,
"step": 1788
},
{
"epoch": 4.6120077469335055,
"grad_norm": 0.9024082202649676,
"learning_rate": 3.450360827974175e-07,
"loss": 0.7188,
"step": 1789
},
{
"epoch": 4.614590058102001,
"grad_norm": 0.8901535896655877,
"learning_rate": 3.403526523635825e-07,
"loss": 0.7044,
"step": 1790
},
{
"epoch": 4.617172369270497,
"grad_norm": 0.8661077661763952,
"learning_rate": 3.3570067502258887e-07,
"loss": 0.6895,
"step": 1791
},
{
"epoch": 4.619754680438993,
"grad_norm": 0.8685402290165503,
"learning_rate": 3.310801659218943e-07,
"loss": 0.7115,
"step": 1792
},
{
"epoch": 4.622336991607488,
"grad_norm": 0.8964236761531204,
"learning_rate": 3.264911401064874e-07,
"loss": 0.7268,
"step": 1793
},
{
"epoch": 4.624919302775984,
"grad_norm": 0.8907216569293869,
"learning_rate": 3.219336125188455e-07,
"loss": 0.7009,
"step": 1794
},
{
"epoch": 4.627501613944481,
"grad_norm": 0.8752296130228919,
"learning_rate": 3.174075979988811e-07,
"loss": 0.7155,
"step": 1795
},
{
"epoch": 4.630083925112976,
"grad_norm": 0.8970700959270853,
"learning_rate": 3.1291311128390233e-07,
"loss": 0.7261,
"step": 1796
},
{
"epoch": 4.632666236281472,
"grad_norm": 0.8832170790109071,
"learning_rate": 3.0845016700854827e-07,
"loss": 0.6962,
"step": 1797
},
{
"epoch": 4.635248547449968,
"grad_norm": 0.8737294820452377,
"learning_rate": 3.0401877970476e-07,
"loss": 0.697,
"step": 1798
},
{
"epoch": 4.637830858618464,
"grad_norm": 0.8664266455857452,
"learning_rate": 2.996189638017233e-07,
"loss": 0.7204,
"step": 1799
},
{
"epoch": 4.640413169786959,
"grad_norm": 0.8875751806701481,
"learning_rate": 2.9525073362581924e-07,
"loss": 0.7349,
"step": 1800
},
{
"epoch": 4.642995480955455,
"grad_norm": 0.8976875904169599,
"learning_rate": 2.909141034005891e-07,
"loss": 0.7118,
"step": 1801
},
{
"epoch": 4.645577792123951,
"grad_norm": 0.875180687352795,
"learning_rate": 2.86609087246672e-07,
"loss": 0.7072,
"step": 1802
},
{
"epoch": 4.648160103292446,
"grad_norm": 0.8837922794467638,
"learning_rate": 2.8233569918177384e-07,
"loss": 0.6872,
"step": 1803
},
{
"epoch": 4.650742414460942,
"grad_norm": 0.8773613611606011,
"learning_rate": 2.780939531206106e-07,
"loss": 0.6997,
"step": 1804
},
{
"epoch": 4.653324725629439,
"grad_norm": 0.9021810787669173,
"learning_rate": 2.73883862874873e-07,
"loss": 0.7059,
"step": 1805
},
{
"epoch": 4.6559070367979345,
"grad_norm": 0.8823700361393249,
"learning_rate": 2.6970544215317197e-07,
"loss": 0.7047,
"step": 1806
},
{
"epoch": 4.65848934796643,
"grad_norm": 0.8783238825683962,
"learning_rate": 2.655587045609975e-07,
"loss": 0.7249,
"step": 1807
},
{
"epoch": 4.661071659134926,
"grad_norm": 0.8827143960172458,
"learning_rate": 2.6144366360067896e-07,
"loss": 0.7159,
"step": 1808
},
{
"epoch": 4.663653970303422,
"grad_norm": 0.8851244634156046,
"learning_rate": 2.57360332671337e-07,
"loss": 0.7181,
"step": 1809
},
{
"epoch": 4.666236281471917,
"grad_norm": 0.91360975849369,
"learning_rate": 2.5330872506883595e-07,
"loss": 0.7426,
"step": 1810
},
{
"epoch": 4.668818592640413,
"grad_norm": 0.9075412171714544,
"learning_rate": 2.492888539857485e-07,
"loss": 0.6909,
"step": 1811
},
{
"epoch": 4.671400903808909,
"grad_norm": 0.888744832476072,
"learning_rate": 2.453007325113077e-07,
"loss": 0.7102,
"step": 1812
},
{
"epoch": 4.6739832149774045,
"grad_norm": 0.890507374481685,
"learning_rate": 2.41344373631367e-07,
"loss": 0.727,
"step": 1813
},
{
"epoch": 4.6765655261459,
"grad_norm": 0.8866830934511432,
"learning_rate": 2.374197902283548e-07,
"loss": 0.7308,
"step": 1814
},
{
"epoch": 4.679147837314396,
"grad_norm": 0.8771540551708261,
"learning_rate": 2.3352699508123579e-07,
"loss": 0.6949,
"step": 1815
},
{
"epoch": 4.681730148482892,
"grad_norm": 1.0503180466316975,
"learning_rate": 2.296660008654661e-07,
"loss": 0.7213,
"step": 1816
},
{
"epoch": 4.684312459651388,
"grad_norm": 0.8959301940497476,
"learning_rate": 2.2583682015295593e-07,
"loss": 0.7101,
"step": 1817
},
{
"epoch": 4.686894770819884,
"grad_norm": 0.8581736544054299,
"learning_rate": 2.2203946541202392e-07,
"loss": 0.6836,
"step": 1818
},
{
"epoch": 4.68947708198838,
"grad_norm": 0.8933010730764249,
"learning_rate": 2.1827394900736377e-07,
"loss": 0.7032,
"step": 1819
},
{
"epoch": 4.6920593931568755,
"grad_norm": 0.8824184636051327,
"learning_rate": 2.145402831999943e-07,
"loss": 0.6866,
"step": 1820
},
{
"epoch": 4.694641704325371,
"grad_norm": 0.8962203196556846,
"learning_rate": 2.108384801472263e-07,
"loss": 0.6891,
"step": 1821
},
{
"epoch": 4.697224015493867,
"grad_norm": 0.8825956696146154,
"learning_rate": 2.0716855190262118e-07,
"loss": 0.7159,
"step": 1822
},
{
"epoch": 4.699806326662363,
"grad_norm": 0.8978568032331434,
"learning_rate": 2.035305104159546e-07,
"loss": 0.6948,
"step": 1823
},
{
"epoch": 4.702388637830858,
"grad_norm": 0.9066750181863855,
"learning_rate": 1.9992436753316967e-07,
"loss": 0.7321,
"step": 1824
},
{
"epoch": 4.704970948999354,
"grad_norm": 0.8862969137730125,
"learning_rate": 1.963501349963448e-07,
"loss": 0.7224,
"step": 1825
},
{
"epoch": 4.707553260167851,
"grad_norm": 0.8963573378284775,
"learning_rate": 1.928078244436582e-07,
"loss": 0.7078,
"step": 1826
},
{
"epoch": 4.710135571336346,
"grad_norm": 0.8951212547195972,
"learning_rate": 1.892974474093412e-07,
"loss": 0.7324,
"step": 1827
},
{
"epoch": 4.712717882504842,
"grad_norm": 0.8796409018946402,
"learning_rate": 1.8581901532364722e-07,
"loss": 0.6997,
"step": 1828
},
{
"epoch": 4.715300193673338,
"grad_norm": 0.9084865841892311,
"learning_rate": 1.8237253951281287e-07,
"loss": 0.7176,
"step": 1829
},
{
"epoch": 4.717882504841834,
"grad_norm": 0.8939350444456864,
"learning_rate": 1.789580311990191e-07,
"loss": 0.7273,
"step": 1830
},
{
"epoch": 4.720464816010329,
"grad_norm": 0.8897734672415167,
"learning_rate": 1.7557550150035906e-07,
"loss": 0.7311,
"step": 1831
},
{
"epoch": 4.723047127178825,
"grad_norm": 0.9004838760057952,
"learning_rate": 1.7222496143079803e-07,
"loss": 0.735,
"step": 1832
},
{
"epoch": 4.725629438347321,
"grad_norm": 0.8899859768430314,
"learning_rate": 1.6890642190013906e-07,
"loss": 0.7231,
"step": 1833
},
{
"epoch": 4.7282117495158165,
"grad_norm": 0.8945971398194342,
"learning_rate": 1.6561989371398523e-07,
"loss": 0.7242,
"step": 1834
},
{
"epoch": 4.730794060684312,
"grad_norm": 0.9022295113729779,
"learning_rate": 1.6236538757370967e-07,
"loss": 0.7124,
"step": 1835
},
{
"epoch": 4.733376371852808,
"grad_norm": 0.9072500620579464,
"learning_rate": 1.5914291407641668e-07,
"loss": 0.7252,
"step": 1836
},
{
"epoch": 4.735958683021304,
"grad_norm": 0.9078114931306495,
"learning_rate": 1.5595248371490512e-07,
"loss": 0.7252,
"step": 1837
},
{
"epoch": 4.7385409941898,
"grad_norm": 0.8961898314266026,
"learning_rate": 1.5279410687764173e-07,
"loss": 0.7436,
"step": 1838
},
{
"epoch": 4.741123305358296,
"grad_norm": 0.8983212101762444,
"learning_rate": 1.4966779384871789e-07,
"loss": 0.7123,
"step": 1839
},
{
"epoch": 4.743705616526792,
"grad_norm": 0.8867079379295172,
"learning_rate": 1.465735548078262e-07,
"loss": 0.7091,
"step": 1840
},
{
"epoch": 4.746287927695287,
"grad_norm": 0.9004558245837394,
"learning_rate": 1.4351139983021623e-07,
"loss": 0.7001,
"step": 1841
},
{
"epoch": 4.748870238863783,
"grad_norm": 0.9069705760502946,
"learning_rate": 1.4048133888667436e-07,
"loss": 0.7132,
"step": 1842
},
{
"epoch": 4.751452550032279,
"grad_norm": 0.8977843363832826,
"learning_rate": 1.3748338184347842e-07,
"loss": 0.7348,
"step": 1843
},
{
"epoch": 4.754034861200775,
"grad_norm": 0.9077365721137113,
"learning_rate": 1.3451753846237314e-07,
"loss": 0.7221,
"step": 1844
},
{
"epoch": 4.75661717236927,
"grad_norm": 0.8989075753789744,
"learning_rate": 1.3158381840054025e-07,
"loss": 0.7193,
"step": 1845
},
{
"epoch": 4.759199483537766,
"grad_norm": 0.9186459910068699,
"learning_rate": 1.2868223121056178e-07,
"loss": 0.722,
"step": 1846
},
{
"epoch": 4.761781794706262,
"grad_norm": 0.9020493854829156,
"learning_rate": 1.2581278634038795e-07,
"loss": 0.7148,
"step": 1847
},
{
"epoch": 4.764364105874758,
"grad_norm": 0.8768801704765319,
"learning_rate": 1.229754931333127e-07,
"loss": 0.7035,
"step": 1848
},
{
"epoch": 4.766946417043254,
"grad_norm": 0.8844151561998117,
"learning_rate": 1.2017036082793922e-07,
"loss": 0.7184,
"step": 1849
},
{
"epoch": 4.76952872821175,
"grad_norm": 0.8929803701346444,
"learning_rate": 1.1739739855815224e-07,
"loss": 0.7302,
"step": 1850
},
{
"epoch": 4.7721110393802455,
"grad_norm": 0.883991183642243,
"learning_rate": 1.1465661535308147e-07,
"loss": 0.7293,
"step": 1851
},
{
"epoch": 4.774693350548741,
"grad_norm": 0.8949880710889703,
"learning_rate": 1.1194802013708151e-07,
"loss": 0.723,
"step": 1852
},
{
"epoch": 4.777275661717237,
"grad_norm": 0.8764502668642703,
"learning_rate": 1.0927162172969852e-07,
"loss": 0.6951,
"step": 1853
},
{
"epoch": 4.779857972885733,
"grad_norm": 0.8800003007012933,
"learning_rate": 1.0662742884563926e-07,
"loss": 0.7233,
"step": 1854
},
{
"epoch": 4.782440284054228,
"grad_norm": 0.9056650063921504,
"learning_rate": 1.0401545009474768e-07,
"loss": 0.7303,
"step": 1855
},
{
"epoch": 4.785022595222724,
"grad_norm": 0.8936292984984834,
"learning_rate": 1.0143569398197384e-07,
"loss": 0.7165,
"step": 1856
},
{
"epoch": 4.78760490639122,
"grad_norm": 0.8930843203036322,
"learning_rate": 9.888816890734399e-08,
"loss": 0.6987,
"step": 1857
},
{
"epoch": 4.7901872175597155,
"grad_norm": 0.8914663323073332,
"learning_rate": 9.637288316593718e-08,
"loss": 0.7257,
"step": 1858
},
{
"epoch": 4.792769528728211,
"grad_norm": 0.8776073044944982,
"learning_rate": 9.388984494785869e-08,
"loss": 0.692,
"step": 1859
},
{
"epoch": 4.795351839896708,
"grad_norm": 0.8779266479863871,
"learning_rate": 9.14390623382111e-08,
"loss": 0.7133,
"step": 1860
},
{
"epoch": 4.797934151065204,
"grad_norm": 0.9264245878689963,
"learning_rate": 8.902054331706545e-08,
"loss": 0.7283,
"step": 1861
},
{
"epoch": 4.800516462233699,
"grad_norm": 0.8900359447271092,
"learning_rate": 8.663429575944126e-08,
"loss": 0.6855,
"step": 1862
},
{
"epoch": 4.803098773402195,
"grad_norm": 0.8817750073930983,
"learning_rate": 8.42803274352777e-08,
"loss": 0.6943,
"step": 1863
},
{
"epoch": 4.805681084570691,
"grad_norm": 0.8903555513223317,
"learning_rate": 8.195864600940684e-08,
"loss": 0.6981,
"step": 1864
},
{
"epoch": 4.8082633957391865,
"grad_norm": 0.9144780971305131,
"learning_rate": 7.966925904153156e-08,
"loss": 0.7352,
"step": 1865
},
{
"epoch": 4.810845706907682,
"grad_norm": 0.9009644315118552,
"learning_rate": 7.741217398619993e-08,
"loss": 0.6861,
"step": 1866
},
{
"epoch": 4.813428018076178,
"grad_norm": 0.9355145906326285,
"learning_rate": 7.518739819278087e-08,
"loss": 0.7482,
"step": 1867
},
{
"epoch": 4.816010329244674,
"grad_norm": 0.9073134543589949,
"learning_rate": 7.29949389054374e-08,
"loss": 0.7514,
"step": 1868
},
{
"epoch": 4.81859264041317,
"grad_norm": 0.9117075631720867,
"learning_rate": 7.08348032631101e-08,
"loss": 0.7251,
"step": 1869
},
{
"epoch": 4.821174951581666,
"grad_norm": 0.9074195781202343,
"learning_rate": 6.870699829948479e-08,
"loss": 0.7186,
"step": 1870
},
{
"epoch": 4.823757262750162,
"grad_norm": 0.8938435858944255,
"learning_rate": 6.661153094297823e-08,
"loss": 0.7074,
"step": 1871
},
{
"epoch": 4.826339573918657,
"grad_norm": 0.9003531915044409,
"learning_rate": 6.454840801670803e-08,
"loss": 0.7319,
"step": 1872
},
{
"epoch": 4.828921885087153,
"grad_norm": 0.8820415069306509,
"learning_rate": 6.25176362384794e-08,
"loss": 0.712,
"step": 1873
},
{
"epoch": 4.831504196255649,
"grad_norm": 0.8995431946768817,
"learning_rate": 6.051922222075179e-08,
"loss": 0.7358,
"step": 1874
},
{
"epoch": 4.834086507424145,
"grad_norm": 0.9146524578848008,
"learning_rate": 5.855317247062786e-08,
"loss": 0.708,
"step": 1875
},
{
"epoch": 4.83666881859264,
"grad_norm": 0.882452405450629,
"learning_rate": 5.6619493389824534e-08,
"loss": 0.6995,
"step": 1876
},
{
"epoch": 4.839251129761136,
"grad_norm": 0.903967291714597,
"learning_rate": 5.4718191274659716e-08,
"loss": 0.7299,
"step": 1877
},
{
"epoch": 4.841833440929632,
"grad_norm": 0.8789763679697778,
"learning_rate": 5.284927231602344e-08,
"loss": 0.6955,
"step": 1878
},
{
"epoch": 4.8444157520981275,
"grad_norm": 0.8987948069283417,
"learning_rate": 5.101274259936451e-08,
"loss": 0.7204,
"step": 1879
},
{
"epoch": 4.846998063266623,
"grad_norm": 0.9053366952624305,
"learning_rate": 4.92086081046661e-08,
"loss": 0.7246,
"step": 1880
},
{
"epoch": 4.84958037443512,
"grad_norm": 0.8949590025267126,
"learning_rate": 4.7436874706431324e-08,
"loss": 0.7101,
"step": 1881
},
{
"epoch": 4.8521626856036155,
"grad_norm": 0.9214195129359755,
"learning_rate": 4.569754817365657e-08,
"loss": 0.7246,
"step": 1882
},
{
"epoch": 4.854744996772111,
"grad_norm": 0.9089598728157908,
"learning_rate": 4.399063416982263e-08,
"loss": 0.7218,
"step": 1883
},
{
"epoch": 4.857327307940607,
"grad_norm": 0.901720805129725,
"learning_rate": 4.2316138252866954e-08,
"loss": 0.6872,
"step": 1884
},
{
"epoch": 4.859909619109103,
"grad_norm": 0.9104687149335616,
"learning_rate": 4.067406587516809e-08,
"loss": 0.7451,
"step": 1885
},
{
"epoch": 4.862491930277598,
"grad_norm": 0.9041958158253693,
"learning_rate": 3.9064422383534587e-08,
"loss": 0.7049,
"step": 1886
},
{
"epoch": 4.865074241446094,
"grad_norm": 0.8939398515698603,
"learning_rate": 3.748721301917724e-08,
"loss": 0.7071,
"step": 1887
},
{
"epoch": 4.86765655261459,
"grad_norm": 0.9161924402984971,
"learning_rate": 3.5942442917699107e-08,
"loss": 0.7442,
"step": 1888
},
{
"epoch": 4.870238863783086,
"grad_norm": 0.8963507291873432,
"learning_rate": 3.443011710907662e-08,
"loss": 0.726,
"step": 1889
},
{
"epoch": 4.872821174951581,
"grad_norm": 0.8879121159936676,
"learning_rate": 3.295024051764406e-08,
"loss": 0.6938,
"step": 1890
},
{
"epoch": 4.875403486120078,
"grad_norm": 0.875832944883481,
"learning_rate": 3.150281796207466e-08,
"loss": 0.6997,
"step": 1891
},
{
"epoch": 4.877985797288574,
"grad_norm": 0.8977974322344513,
"learning_rate": 3.008785415536841e-08,
"loss": 0.7159,
"step": 1892
},
{
"epoch": 4.880568108457069,
"grad_norm": 0.8748881414978601,
"learning_rate": 2.8705353704836515e-08,
"loss": 0.7027,
"step": 1893
},
{
"epoch": 4.883150419625565,
"grad_norm": 0.88445635221765,
"learning_rate": 2.73553211120825e-08,
"loss": 0.7245,
"step": 1894
},
{
"epoch": 4.885732730794061,
"grad_norm": 0.9134713261429174,
"learning_rate": 2.6037760772991138e-08,
"loss": 0.7239,
"step": 1895
},
{
"epoch": 4.8883150419625565,
"grad_norm": 0.8985059705700046,
"learning_rate": 2.4752676977713997e-08,
"loss": 0.7245,
"step": 1896
},
{
"epoch": 4.890897353131052,
"grad_norm": 0.8885305233658356,
"learning_rate": 2.3500073910655007e-08,
"loss": 0.74,
"step": 1897
},
{
"epoch": 4.893479664299548,
"grad_norm": 0.8823177052515677,
"learning_rate": 2.2279955650456043e-08,
"loss": 0.6953,
"step": 1898
},
{
"epoch": 4.896061975468044,
"grad_norm": 0.9143103720474103,
"learning_rate": 2.109232616998247e-08,
"loss": 0.7159,
"step": 1899
},
{
"epoch": 4.898644286636539,
"grad_norm": 0.9122168970845141,
"learning_rate": 1.993718933631428e-08,
"loss": 0.7356,
"step": 1900
},
{
"epoch": 4.901226597805035,
"grad_norm": 0.8855771980386666,
"learning_rate": 1.8814548910730535e-08,
"loss": 0.7129,
"step": 1901
},
{
"epoch": 4.903808908973531,
"grad_norm": 0.9057306349994061,
"learning_rate": 1.7724408548697168e-08,
"loss": 0.718,
"step": 1902
},
{
"epoch": 4.906391220142027,
"grad_norm": 0.9014437329192154,
"learning_rate": 1.6666771799855875e-08,
"loss": 0.7178,
"step": 1903
},
{
"epoch": 4.908973531310523,
"grad_norm": 0.8787325490430078,
"learning_rate": 1.5641642108011888e-08,
"loss": 0.698,
"step": 1904
},
{
"epoch": 4.911555842479019,
"grad_norm": 0.9212584947084932,
"learning_rate": 1.4649022811122904e-08,
"loss": 0.7462,
"step": 1905
},
{
"epoch": 4.914138153647515,
"grad_norm": 0.8828205242032983,
"learning_rate": 1.368891714129017e-08,
"loss": 0.6942,
"step": 1906
},
{
"epoch": 4.91672046481601,
"grad_norm": 0.9012685033665765,
"learning_rate": 1.2761328224744074e-08,
"loss": 0.711,
"step": 1907
},
{
"epoch": 4.919302775984506,
"grad_norm": 0.8922139187625531,
"learning_rate": 1.1866259081837473e-08,
"loss": 0.7111,
"step": 1908
},
{
"epoch": 4.921885087153002,
"grad_norm": 0.9214550856281982,
"learning_rate": 1.100371262703459e-08,
"loss": 0.7151,
"step": 1909
},
{
"epoch": 4.9244673983214975,
"grad_norm": 0.8940406772236208,
"learning_rate": 1.0173691668901031e-08,
"loss": 0.7364,
"step": 1910
},
{
"epoch": 4.927049709489993,
"grad_norm": 0.9145036949820405,
"learning_rate": 9.376198910094892e-09,
"loss": 0.7231,
"step": 1911
},
{
"epoch": 4.92963202065849,
"grad_norm": 0.8932180687439902,
"learning_rate": 8.611236947357881e-09,
"loss": 0.7347,
"step": 1912
},
{
"epoch": 4.9322143318269855,
"grad_norm": 0.9074743226465335,
"learning_rate": 7.878808271507554e-09,
"loss": 0.7205,
"step": 1913
},
{
"epoch": 4.934796642995481,
"grad_norm": 0.9340148563069326,
"learning_rate": 7.178915267429531e-09,
"loss": 0.7412,
"step": 1914
},
{
"epoch": 4.937378954163977,
"grad_norm": 0.9029720647465063,
"learning_rate": 6.5115602140686244e-09,
"loss": 0.706,
"step": 1915
},
{
"epoch": 4.939961265332473,
"grad_norm": 0.9024330652166745,
"learning_rate": 5.876745284421059e-09,
"loss": 0.7179,
"step": 1916
},
{
"epoch": 4.942543576500968,
"grad_norm": 0.905536650405652,
"learning_rate": 5.27447254552782e-09,
"loss": 0.7265,
"step": 1917
},
{
"epoch": 4.945125887669464,
"grad_norm": 0.9411331572422978,
"learning_rate": 4.704743958467984e-09,
"loss": 0.7411,
"step": 1918
},
{
"epoch": 4.94770819883796,
"grad_norm": 0.8844024471969056,
"learning_rate": 4.1675613783565e-09,
"loss": 0.7076,
"step": 1919
},
{
"epoch": 4.950290510006456,
"grad_norm": 0.8830101926273661,
"learning_rate": 3.6629265543275393e-09,
"loss": 0.7213,
"step": 1920
},
{
"epoch": 4.952872821174951,
"grad_norm": 0.9108539534557583,
"learning_rate": 3.190841129542266e-09,
"loss": 0.7139,
"step": 1921
},
{
"epoch": 4.955455132343447,
"grad_norm": 0.9188851667256828,
"learning_rate": 2.7513066411699597e-09,
"loss": 0.7048,
"step": 1922
},
{
"epoch": 4.958037443511943,
"grad_norm": 0.9011821980131381,
"learning_rate": 2.344324520396901e-09,
"loss": 0.7166,
"step": 1923
},
{
"epoch": 4.960619754680439,
"grad_norm": 0.9260828637057613,
"learning_rate": 1.9698960924074973e-09,
"loss": 0.7124,
"step": 1924
},
{
"epoch": 4.963202065848935,
"grad_norm": 0.9176208831919505,
"learning_rate": 1.6280225763931623e-09,
"loss": 0.7192,
"step": 1925
},
{
"epoch": 4.965784377017431,
"grad_norm": 0.8999670316918704,
"learning_rate": 1.3187050855367755e-09,
"loss": 0.7011,
"step": 1926
},
{
"epoch": 4.9683666881859265,
"grad_norm": 0.9002005545182482,
"learning_rate": 1.0419446270193423e-09,
"loss": 0.7271,
"step": 1927
},
{
"epoch": 4.970948999354422,
"grad_norm": 0.8792229281282367,
"learning_rate": 7.977421020088916e-10,
"loss": 0.6954,
"step": 1928
},
{
"epoch": 4.973531310522918,
"grad_norm": 0.9030675678611226,
"learning_rate": 5.860983056604763e-10,
"loss": 0.717,
"step": 1929
},
{
"epoch": 4.976113621691414,
"grad_norm": 0.8822396526894568,
"learning_rate": 4.0701392711506307e-10,
"loss": 0.6956,
"step": 1930
},
{
"epoch": 4.978695932859909,
"grad_norm": 0.8758112231639331,
"learning_rate": 2.60489549495091e-10,
"loss": 0.6989,
"step": 1931
},
{
"epoch": 4.981278244028405,
"grad_norm": 0.9008898902977136,
"learning_rate": 1.4652564990336183e-10,
"loss": 0.7116,
"step": 1932
},
{
"epoch": 4.983860555196901,
"grad_norm": 0.8627458403348608,
"learning_rate": 6.512259942192955e-11,
"loss": 0.7134,
"step": 1933
},
{
"epoch": 4.9864428663653975,
"grad_norm": 0.8843096004745279,
"learning_rate": 1.6280663108769745e-11,
"loss": 0.6674,
"step": 1934
},
{
"epoch": 4.989025177533893,
"grad_norm": 0.8707183390390197,
"learning_rate": 0.0,
"loss": 0.6915,
"step": 1935
},
{
"epoch": 4.989025177533893,
"step": 1935,
"total_flos": 3.212141825011745e+18,
"train_loss": 0.9760797875796178,
"train_runtime": 14099.2726,
"train_samples_per_second": 17.57,
"train_steps_per_second": 0.137
}
],
"logging_steps": 1,
"max_steps": 1935,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 3.212141825011745e+18,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}