{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.9962264150943394,
"eval_steps": 500,
"global_step": 1986,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0015094339622641509,
"grad_norm": 42.84351638901055,
"learning_rate": 2.5125628140703517e-07,
"loss": 11.7003,
"step": 1
},
{
"epoch": 0.0030188679245283017,
"grad_norm": 40.58796594063912,
"learning_rate": 5.025125628140703e-07,
"loss": 11.756,
"step": 2
},
{
"epoch": 0.004528301886792453,
"grad_norm": 42.144032983881885,
"learning_rate": 7.537688442211055e-07,
"loss": 11.7907,
"step": 3
},
{
"epoch": 0.0060377358490566035,
"grad_norm": 45.72964350820413,
"learning_rate": 1.0050251256281407e-06,
"loss": 11.6821,
"step": 4
},
{
"epoch": 0.007547169811320755,
"grad_norm": 40.81047058078221,
"learning_rate": 1.256281407035176e-06,
"loss": 11.8394,
"step": 5
},
{
"epoch": 0.009056603773584906,
"grad_norm": 42.899640519128475,
"learning_rate": 1.507537688442211e-06,
"loss": 11.6455,
"step": 6
},
{
"epoch": 0.010566037735849057,
"grad_norm": 40.231481376946206,
"learning_rate": 1.7587939698492463e-06,
"loss": 11.7821,
"step": 7
},
{
"epoch": 0.012075471698113207,
"grad_norm": 43.18420575417782,
"learning_rate": 2.0100502512562813e-06,
"loss": 11.51,
"step": 8
},
{
"epoch": 0.013584905660377358,
"grad_norm": 45.40072785894473,
"learning_rate": 2.261306532663317e-06,
"loss": 11.4937,
"step": 9
},
{
"epoch": 0.01509433962264151,
"grad_norm": 60.478523691312404,
"learning_rate": 2.512562814070352e-06,
"loss": 10.4386,
"step": 10
},
{
"epoch": 0.01660377358490566,
"grad_norm": 55.94613742853918,
"learning_rate": 2.7638190954773874e-06,
"loss": 10.2241,
"step": 11
},
{
"epoch": 0.018113207547169812,
"grad_norm": 54.818513224626024,
"learning_rate": 3.015075376884422e-06,
"loss": 10.6026,
"step": 12
},
{
"epoch": 0.019622641509433963,
"grad_norm": 84.73466989691137,
"learning_rate": 3.2663316582914575e-06,
"loss": 6.2547,
"step": 13
},
{
"epoch": 0.021132075471698115,
"grad_norm": 76.688984609838,
"learning_rate": 3.5175879396984926e-06,
"loss": 5.6541,
"step": 14
},
{
"epoch": 0.022641509433962263,
"grad_norm": 76.71708348082493,
"learning_rate": 3.7688442211055276e-06,
"loss": 5.5968,
"step": 15
},
{
"epoch": 0.024150943396226414,
"grad_norm": 57.53302315459275,
"learning_rate": 4.020100502512563e-06,
"loss": 4.4551,
"step": 16
},
{
"epoch": 0.025660377358490565,
"grad_norm": 20.381484799022463,
"learning_rate": 4.271356783919598e-06,
"loss": 2.6246,
"step": 17
},
{
"epoch": 0.027169811320754716,
"grad_norm": 7.4120018391528975,
"learning_rate": 4.522613065326634e-06,
"loss": 1.9439,
"step": 18
},
{
"epoch": 0.028679245283018868,
"grad_norm": 7.246473146417547,
"learning_rate": 4.773869346733668e-06,
"loss": 2.0671,
"step": 19
},
{
"epoch": 0.03018867924528302,
"grad_norm": 5.317897222021412,
"learning_rate": 5.025125628140704e-06,
"loss": 1.8909,
"step": 20
},
{
"epoch": 0.03169811320754717,
"grad_norm": 4.371583867174312,
"learning_rate": 5.276381909547739e-06,
"loss": 1.706,
"step": 21
},
{
"epoch": 0.03320754716981132,
"grad_norm": 3.7247484208320563,
"learning_rate": 5.527638190954775e-06,
"loss": 1.6542,
"step": 22
},
{
"epoch": 0.03471698113207547,
"grad_norm": 3.5432475784664277,
"learning_rate": 5.778894472361809e-06,
"loss": 1.8566,
"step": 23
},
{
"epoch": 0.036226415094339624,
"grad_norm": 2.2793446224909015,
"learning_rate": 6.030150753768844e-06,
"loss": 1.6225,
"step": 24
},
{
"epoch": 0.03773584905660377,
"grad_norm": 2.976629270506673,
"learning_rate": 6.2814070351758795e-06,
"loss": 1.8027,
"step": 25
},
{
"epoch": 0.03924528301886793,
"grad_norm": 1.9292710465468252,
"learning_rate": 6.532663316582915e-06,
"loss": 1.6694,
"step": 26
},
{
"epoch": 0.040754716981132075,
"grad_norm": 1.8235949344084434,
"learning_rate": 6.7839195979899505e-06,
"loss": 1.3449,
"step": 27
},
{
"epoch": 0.04226415094339623,
"grad_norm": 1.2509325845880814,
"learning_rate": 7.035175879396985e-06,
"loss": 1.4428,
"step": 28
},
{
"epoch": 0.04377358490566038,
"grad_norm": 1.1446202521586268,
"learning_rate": 7.28643216080402e-06,
"loss": 1.2865,
"step": 29
},
{
"epoch": 0.045283018867924525,
"grad_norm": 1.036741546377771,
"learning_rate": 7.537688442211055e-06,
"loss": 1.4755,
"step": 30
},
{
"epoch": 0.04679245283018868,
"grad_norm": 1.0321415589665388,
"learning_rate": 7.788944723618092e-06,
"loss": 1.5658,
"step": 31
},
{
"epoch": 0.04830188679245283,
"grad_norm": 0.8390420469610276,
"learning_rate": 8.040201005025125e-06,
"loss": 1.2749,
"step": 32
},
{
"epoch": 0.04981132075471698,
"grad_norm": 0.8773961154600026,
"learning_rate": 8.291457286432161e-06,
"loss": 1.3381,
"step": 33
},
{
"epoch": 0.05132075471698113,
"grad_norm": 0.688788854923463,
"learning_rate": 8.542713567839196e-06,
"loss": 1.2844,
"step": 34
},
{
"epoch": 0.052830188679245285,
"grad_norm": 0.697731142261381,
"learning_rate": 8.793969849246232e-06,
"loss": 1.0224,
"step": 35
},
{
"epoch": 0.05433962264150943,
"grad_norm": 0.7481698939939284,
"learning_rate": 9.045226130653267e-06,
"loss": 1.3223,
"step": 36
},
{
"epoch": 0.05584905660377359,
"grad_norm": 0.713480143085657,
"learning_rate": 9.296482412060301e-06,
"loss": 1.3433,
"step": 37
},
{
"epoch": 0.057358490566037736,
"grad_norm": 0.5516147418737927,
"learning_rate": 9.547738693467337e-06,
"loss": 1.1849,
"step": 38
},
{
"epoch": 0.05886792452830188,
"grad_norm": 0.5489764999546655,
"learning_rate": 9.798994974874372e-06,
"loss": 1.221,
"step": 39
},
{
"epoch": 0.06037735849056604,
"grad_norm": 0.5976907585489283,
"learning_rate": 1.0050251256281408e-05,
"loss": 1.1651,
"step": 40
},
{
"epoch": 0.061886792452830186,
"grad_norm": 0.5729987457779835,
"learning_rate": 1.0301507537688443e-05,
"loss": 1.2038,
"step": 41
},
{
"epoch": 0.06339622641509433,
"grad_norm": 0.849789141484363,
"learning_rate": 1.0552763819095479e-05,
"loss": 0.975,
"step": 42
},
{
"epoch": 0.0649056603773585,
"grad_norm": 0.5378335024110182,
"learning_rate": 1.0804020100502512e-05,
"loss": 1.1693,
"step": 43
},
{
"epoch": 0.06641509433962264,
"grad_norm": 0.4605169598599595,
"learning_rate": 1.105527638190955e-05,
"loss": 1.0198,
"step": 44
},
{
"epoch": 0.06792452830188679,
"grad_norm": 0.4716228872243182,
"learning_rate": 1.1306532663316583e-05,
"loss": 1.0925,
"step": 45
},
{
"epoch": 0.06943396226415094,
"grad_norm": 0.4669934577396619,
"learning_rate": 1.1557788944723619e-05,
"loss": 1.1274,
"step": 46
},
{
"epoch": 0.0709433962264151,
"grad_norm": 0.4621587559628666,
"learning_rate": 1.1809045226130654e-05,
"loss": 1.1123,
"step": 47
},
{
"epoch": 0.07245283018867925,
"grad_norm": 0.7302537885626854,
"learning_rate": 1.2060301507537688e-05,
"loss": 1.0177,
"step": 48
},
{
"epoch": 0.0739622641509434,
"grad_norm": 0.4731493256116344,
"learning_rate": 1.2311557788944725e-05,
"loss": 1.0597,
"step": 49
},
{
"epoch": 0.07547169811320754,
"grad_norm": 0.44703159598258635,
"learning_rate": 1.2562814070351759e-05,
"loss": 1.1184,
"step": 50
},
{
"epoch": 0.07698113207547169,
"grad_norm": 0.37947111567149666,
"learning_rate": 1.2814070351758795e-05,
"loss": 0.9152,
"step": 51
},
{
"epoch": 0.07849056603773585,
"grad_norm": 0.4138677678629633,
"learning_rate": 1.306532663316583e-05,
"loss": 0.9698,
"step": 52
},
{
"epoch": 0.08,
"grad_norm": 0.4210863662592338,
"learning_rate": 1.3316582914572864e-05,
"loss": 1.0264,
"step": 53
},
{
"epoch": 0.08150943396226415,
"grad_norm": 0.4657161680493391,
"learning_rate": 1.3567839195979901e-05,
"loss": 1.2491,
"step": 54
},
{
"epoch": 0.0830188679245283,
"grad_norm": 0.4322144912028561,
"learning_rate": 1.3819095477386935e-05,
"loss": 1.0837,
"step": 55
},
{
"epoch": 0.08452830188679246,
"grad_norm": 0.4895899695520348,
"learning_rate": 1.407035175879397e-05,
"loss": 0.9843,
"step": 56
},
{
"epoch": 0.0860377358490566,
"grad_norm": 0.38329697027276266,
"learning_rate": 1.4321608040201007e-05,
"loss": 0.9874,
"step": 57
},
{
"epoch": 0.08754716981132075,
"grad_norm": 0.3728506554128249,
"learning_rate": 1.457286432160804e-05,
"loss": 1.0505,
"step": 58
},
{
"epoch": 0.0890566037735849,
"grad_norm": 0.42582396506199066,
"learning_rate": 1.4824120603015077e-05,
"loss": 1.0177,
"step": 59
},
{
"epoch": 0.09056603773584905,
"grad_norm": 0.4039105071166847,
"learning_rate": 1.507537688442211e-05,
"loss": 1.0522,
"step": 60
},
{
"epoch": 0.09207547169811321,
"grad_norm": 0.39048018099504467,
"learning_rate": 1.5326633165829146e-05,
"loss": 1.1106,
"step": 61
},
{
"epoch": 0.09358490566037736,
"grad_norm": 0.3598639795497603,
"learning_rate": 1.5577889447236183e-05,
"loss": 0.9756,
"step": 62
},
{
"epoch": 0.09509433962264151,
"grad_norm": 0.33978834353660775,
"learning_rate": 1.5829145728643217e-05,
"loss": 0.9874,
"step": 63
},
{
"epoch": 0.09660377358490566,
"grad_norm": 0.3571676859401004,
"learning_rate": 1.608040201005025e-05,
"loss": 0.9647,
"step": 64
},
{
"epoch": 0.09811320754716982,
"grad_norm": 0.3713953220516257,
"learning_rate": 1.6331658291457288e-05,
"loss": 1.0445,
"step": 65
},
{
"epoch": 0.09962264150943397,
"grad_norm": 0.3708364360600743,
"learning_rate": 1.6582914572864322e-05,
"loss": 1.0168,
"step": 66
},
{
"epoch": 0.10113207547169811,
"grad_norm": 0.3673356575030503,
"learning_rate": 1.683417085427136e-05,
"loss": 1.0713,
"step": 67
},
{
"epoch": 0.10264150943396226,
"grad_norm": 0.3817704093554328,
"learning_rate": 1.7085427135678393e-05,
"loss": 1.1859,
"step": 68
},
{
"epoch": 0.10415094339622641,
"grad_norm": 0.35553437445641883,
"learning_rate": 1.7336683417085427e-05,
"loss": 0.9106,
"step": 69
},
{
"epoch": 0.10566037735849057,
"grad_norm": 0.3549271646935127,
"learning_rate": 1.7587939698492464e-05,
"loss": 0.9792,
"step": 70
},
{
"epoch": 0.10716981132075472,
"grad_norm": 0.34909558103973126,
"learning_rate": 1.7839195979899497e-05,
"loss": 0.9646,
"step": 71
},
{
"epoch": 0.10867924528301887,
"grad_norm": 0.3777705845997939,
"learning_rate": 1.8090452261306535e-05,
"loss": 1.0047,
"step": 72
},
{
"epoch": 0.11018867924528301,
"grad_norm": 0.34807538051544334,
"learning_rate": 1.834170854271357e-05,
"loss": 1.0855,
"step": 73
},
{
"epoch": 0.11169811320754718,
"grad_norm": 0.3253777447991679,
"learning_rate": 1.8592964824120602e-05,
"loss": 0.9121,
"step": 74
},
{
"epoch": 0.11320754716981132,
"grad_norm": 0.3734270413143541,
"learning_rate": 1.884422110552764e-05,
"loss": 1.0115,
"step": 75
},
{
"epoch": 0.11471698113207547,
"grad_norm": 0.3746609650254723,
"learning_rate": 1.9095477386934673e-05,
"loss": 1.0999,
"step": 76
},
{
"epoch": 0.11622641509433962,
"grad_norm": 0.3218308743491144,
"learning_rate": 1.934673366834171e-05,
"loss": 0.9628,
"step": 77
},
{
"epoch": 0.11773584905660377,
"grad_norm": 0.378328515287124,
"learning_rate": 1.9597989949748744e-05,
"loss": 0.8928,
"step": 78
},
{
"epoch": 0.11924528301886793,
"grad_norm": 0.385332164332375,
"learning_rate": 1.984924623115578e-05,
"loss": 1.11,
"step": 79
},
{
"epoch": 0.12075471698113208,
"grad_norm": 0.36828879580340973,
"learning_rate": 2.0100502512562815e-05,
"loss": 0.9687,
"step": 80
},
{
"epoch": 0.12226415094339622,
"grad_norm": 0.37180628983371017,
"learning_rate": 2.035175879396985e-05,
"loss": 0.9488,
"step": 81
},
{
"epoch": 0.12377358490566037,
"grad_norm": 0.3655463598030299,
"learning_rate": 2.0603015075376886e-05,
"loss": 0.9635,
"step": 82
},
{
"epoch": 0.12528301886792453,
"grad_norm": 0.3973839411165058,
"learning_rate": 2.085427135678392e-05,
"loss": 0.9539,
"step": 83
},
{
"epoch": 0.12679245283018867,
"grad_norm": 0.8889038300232109,
"learning_rate": 2.1105527638190957e-05,
"loss": 0.9591,
"step": 84
},
{
"epoch": 0.12830188679245283,
"grad_norm": 0.4159535642199707,
"learning_rate": 2.135678391959799e-05,
"loss": 0.9782,
"step": 85
},
{
"epoch": 0.129811320754717,
"grad_norm": 0.4055951184040692,
"learning_rate": 2.1608040201005025e-05,
"loss": 0.9885,
"step": 86
},
{
"epoch": 0.13132075471698113,
"grad_norm": 0.3740604222184423,
"learning_rate": 2.1859296482412062e-05,
"loss": 0.9783,
"step": 87
},
{
"epoch": 0.1328301886792453,
"grad_norm": 0.41043743276277883,
"learning_rate": 2.21105527638191e-05,
"loss": 0.9789,
"step": 88
},
{
"epoch": 0.13433962264150942,
"grad_norm": 0.3744677884389674,
"learning_rate": 2.2361809045226133e-05,
"loss": 0.8614,
"step": 89
},
{
"epoch": 0.13584905660377358,
"grad_norm": 0.3674741281297594,
"learning_rate": 2.2613065326633167e-05,
"loss": 0.8971,
"step": 90
},
{
"epoch": 0.13735849056603774,
"grad_norm": 0.41628170180596963,
"learning_rate": 2.28643216080402e-05,
"loss": 0.8112,
"step": 91
},
{
"epoch": 0.13886792452830188,
"grad_norm": 0.4508017917622187,
"learning_rate": 2.3115577889447238e-05,
"loss": 1.0037,
"step": 92
},
{
"epoch": 0.14037735849056604,
"grad_norm": 0.5170078422537966,
"learning_rate": 2.3366834170854275e-05,
"loss": 1.0547,
"step": 93
},
{
"epoch": 0.1418867924528302,
"grad_norm": 0.5212510340969881,
"learning_rate": 2.361809045226131e-05,
"loss": 1.0543,
"step": 94
},
{
"epoch": 0.14339622641509434,
"grad_norm": 0.6121067181622158,
"learning_rate": 2.3869346733668342e-05,
"loss": 0.9061,
"step": 95
},
{
"epoch": 0.1449056603773585,
"grad_norm": 0.5238865642185518,
"learning_rate": 2.4120603015075376e-05,
"loss": 0.9103,
"step": 96
},
{
"epoch": 0.14641509433962263,
"grad_norm": 0.4171766577781957,
"learning_rate": 2.4371859296482413e-05,
"loss": 0.8452,
"step": 97
},
{
"epoch": 0.1479245283018868,
"grad_norm": 0.5448433488826973,
"learning_rate": 2.462311557788945e-05,
"loss": 1.0022,
"step": 98
},
{
"epoch": 0.14943396226415095,
"grad_norm": 1.0734992777741617,
"learning_rate": 2.4874371859296484e-05,
"loss": 0.943,
"step": 99
},
{
"epoch": 0.1509433962264151,
"grad_norm": 0.49449168180642084,
"learning_rate": 2.5125628140703518e-05,
"loss": 0.9584,
"step": 100
},
{
"epoch": 0.15245283018867925,
"grad_norm": 0.5088881724419568,
"learning_rate": 2.5376884422110552e-05,
"loss": 0.8866,
"step": 101
},
{
"epoch": 0.15396226415094338,
"grad_norm": 0.508102690727942,
"learning_rate": 2.562814070351759e-05,
"loss": 0.9326,
"step": 102
},
{
"epoch": 0.15547169811320755,
"grad_norm": 0.4653650117166445,
"learning_rate": 2.5879396984924626e-05,
"loss": 0.953,
"step": 103
},
{
"epoch": 0.1569811320754717,
"grad_norm": 0.46750559798962293,
"learning_rate": 2.613065326633166e-05,
"loss": 0.8609,
"step": 104
},
{
"epoch": 0.15849056603773584,
"grad_norm": 0.6835796303360562,
"learning_rate": 2.6381909547738694e-05,
"loss": 0.8787,
"step": 105
},
{
"epoch": 0.16,
"grad_norm": 0.5141911707155195,
"learning_rate": 2.6633165829145728e-05,
"loss": 0.9585,
"step": 106
},
{
"epoch": 0.16150943396226414,
"grad_norm": 0.4883615708108046,
"learning_rate": 2.6884422110552765e-05,
"loss": 0.9979,
"step": 107
},
{
"epoch": 0.1630188679245283,
"grad_norm": 0.5190216260067821,
"learning_rate": 2.7135678391959802e-05,
"loss": 0.865,
"step": 108
},
{
"epoch": 0.16452830188679246,
"grad_norm": 0.408410057009239,
"learning_rate": 2.738693467336684e-05,
"loss": 0.9051,
"step": 109
},
{
"epoch": 0.1660377358490566,
"grad_norm": 0.4906888730107437,
"learning_rate": 2.763819095477387e-05,
"loss": 0.8904,
"step": 110
},
{
"epoch": 0.16754716981132076,
"grad_norm": 0.42357291845895245,
"learning_rate": 2.7889447236180903e-05,
"loss": 0.8314,
"step": 111
},
{
"epoch": 0.16905660377358492,
"grad_norm": 1.440741980111514,
"learning_rate": 2.814070351758794e-05,
"loss": 1.0156,
"step": 112
},
{
"epoch": 0.17056603773584905,
"grad_norm": 0.5124296806329448,
"learning_rate": 2.8391959798994978e-05,
"loss": 0.8618,
"step": 113
},
{
"epoch": 0.1720754716981132,
"grad_norm": 0.5963788620163426,
"learning_rate": 2.8643216080402015e-05,
"loss": 0.964,
"step": 114
},
{
"epoch": 0.17358490566037735,
"grad_norm": 0.45527147444306076,
"learning_rate": 2.8894472361809045e-05,
"loss": 0.8331,
"step": 115
},
{
"epoch": 0.1750943396226415,
"grad_norm": 0.5115219921915511,
"learning_rate": 2.914572864321608e-05,
"loss": 0.9331,
"step": 116
},
{
"epoch": 0.17660377358490567,
"grad_norm": 4.556437650890275,
"learning_rate": 2.9396984924623116e-05,
"loss": 0.962,
"step": 117
},
{
"epoch": 0.1781132075471698,
"grad_norm": 0.5312376654687548,
"learning_rate": 2.9648241206030153e-05,
"loss": 0.8834,
"step": 118
},
{
"epoch": 0.17962264150943397,
"grad_norm": 0.45200310784615344,
"learning_rate": 2.989949748743719e-05,
"loss": 0.9321,
"step": 119
},
{
"epoch": 0.1811320754716981,
"grad_norm": 0.49479911246054076,
"learning_rate": 3.015075376884422e-05,
"loss": 0.8612,
"step": 120
},
{
"epoch": 0.18264150943396226,
"grad_norm": 0.5356223053603791,
"learning_rate": 3.0402010050251255e-05,
"loss": 0.9651,
"step": 121
},
{
"epoch": 0.18415094339622642,
"grad_norm": 0.4775869273954035,
"learning_rate": 3.065326633165829e-05,
"loss": 0.9167,
"step": 122
},
{
"epoch": 0.18566037735849056,
"grad_norm": 0.38759830910589615,
"learning_rate": 3.0904522613065326e-05,
"loss": 0.7914,
"step": 123
},
{
"epoch": 0.18716981132075472,
"grad_norm": 0.505778754703735,
"learning_rate": 3.1155778894472366e-05,
"loss": 0.9004,
"step": 124
},
{
"epoch": 0.18867924528301888,
"grad_norm": 1.5610813600582665,
"learning_rate": 3.14070351758794e-05,
"loss": 0.823,
"step": 125
},
{
"epoch": 0.19018867924528302,
"grad_norm": 0.5033816078558214,
"learning_rate": 3.1658291457286434e-05,
"loss": 0.9111,
"step": 126
},
{
"epoch": 0.19169811320754718,
"grad_norm": 0.5411937400048007,
"learning_rate": 3.190954773869347e-05,
"loss": 0.8375,
"step": 127
},
{
"epoch": 0.1932075471698113,
"grad_norm": 0.4287407951685938,
"learning_rate": 3.21608040201005e-05,
"loss": 0.8805,
"step": 128
},
{
"epoch": 0.19471698113207547,
"grad_norm": 0.5109121770551573,
"learning_rate": 3.241206030150754e-05,
"loss": 0.9597,
"step": 129
},
{
"epoch": 0.19622641509433963,
"grad_norm": 0.4260153095022451,
"learning_rate": 3.2663316582914576e-05,
"loss": 0.998,
"step": 130
},
{
"epoch": 0.19773584905660377,
"grad_norm": 0.44701240738286824,
"learning_rate": 3.291457286432161e-05,
"loss": 0.8782,
"step": 131
},
{
"epoch": 0.19924528301886793,
"grad_norm": 0.44157622535461144,
"learning_rate": 3.3165829145728643e-05,
"loss": 0.8841,
"step": 132
},
{
"epoch": 0.20075471698113206,
"grad_norm": 0.4801398431121367,
"learning_rate": 3.341708542713568e-05,
"loss": 0.8644,
"step": 133
},
{
"epoch": 0.20226415094339623,
"grad_norm": 0.36530196405233833,
"learning_rate": 3.366834170854272e-05,
"loss": 0.8119,
"step": 134
},
{
"epoch": 0.2037735849056604,
"grad_norm": 0.5570490282592016,
"learning_rate": 3.391959798994975e-05,
"loss": 0.8855,
"step": 135
},
{
"epoch": 0.20528301886792452,
"grad_norm": 0.3986151374909903,
"learning_rate": 3.4170854271356785e-05,
"loss": 0.8876,
"step": 136
},
{
"epoch": 0.20679245283018868,
"grad_norm": 0.43926099950410996,
"learning_rate": 3.442211055276382e-05,
"loss": 0.8575,
"step": 137
},
{
"epoch": 0.20830188679245282,
"grad_norm": 0.45592611275226874,
"learning_rate": 3.467336683417085e-05,
"loss": 0.8576,
"step": 138
},
{
"epoch": 0.20981132075471698,
"grad_norm": 0.5049788202212411,
"learning_rate": 3.4924623115577894e-05,
"loss": 0.9399,
"step": 139
},
{
"epoch": 0.21132075471698114,
"grad_norm": 0.3968817831624458,
"learning_rate": 3.517587939698493e-05,
"loss": 0.884,
"step": 140
},
{
"epoch": 0.21283018867924527,
"grad_norm": 0.5034047348280332,
"learning_rate": 3.542713567839196e-05,
"loss": 0.9453,
"step": 141
},
{
"epoch": 0.21433962264150944,
"grad_norm": 0.38193650268073553,
"learning_rate": 3.5678391959798995e-05,
"loss": 0.8446,
"step": 142
},
{
"epoch": 0.2158490566037736,
"grad_norm": 0.47293401837423554,
"learning_rate": 3.592964824120603e-05,
"loss": 0.8357,
"step": 143
},
{
"epoch": 0.21735849056603773,
"grad_norm": 0.5128727207732378,
"learning_rate": 3.618090452261307e-05,
"loss": 0.8878,
"step": 144
},
{
"epoch": 0.2188679245283019,
"grad_norm": 0.4490745977510919,
"learning_rate": 3.64321608040201e-05,
"loss": 0.7525,
"step": 145
},
{
"epoch": 0.22037735849056603,
"grad_norm": 0.5285464668249072,
"learning_rate": 3.668341708542714e-05,
"loss": 0.8317,
"step": 146
},
{
"epoch": 0.2218867924528302,
"grad_norm": 0.4684615864196027,
"learning_rate": 3.693467336683417e-05,
"loss": 0.8679,
"step": 147
},
{
"epoch": 0.22339622641509435,
"grad_norm": 0.4384754233570824,
"learning_rate": 3.7185929648241204e-05,
"loss": 0.9187,
"step": 148
},
{
"epoch": 0.22490566037735849,
"grad_norm": 0.4959217477046417,
"learning_rate": 3.7437185929648245e-05,
"loss": 0.9176,
"step": 149
},
{
"epoch": 0.22641509433962265,
"grad_norm": 0.513351882366079,
"learning_rate": 3.768844221105528e-05,
"loss": 0.9131,
"step": 150
},
{
"epoch": 0.22792452830188678,
"grad_norm": 0.46013492291199676,
"learning_rate": 3.793969849246231e-05,
"loss": 0.8977,
"step": 151
},
{
"epoch": 0.22943396226415094,
"grad_norm": 0.4228108610077172,
"learning_rate": 3.8190954773869346e-05,
"loss": 0.8378,
"step": 152
},
{
"epoch": 0.2309433962264151,
"grad_norm": 0.5211386844607362,
"learning_rate": 3.844221105527639e-05,
"loss": 0.8609,
"step": 153
},
{
"epoch": 0.23245283018867924,
"grad_norm": 0.42815407260277805,
"learning_rate": 3.869346733668342e-05,
"loss": 0.797,
"step": 154
},
{
"epoch": 0.2339622641509434,
"grad_norm": 0.44022812612987366,
"learning_rate": 3.8944723618090455e-05,
"loss": 0.8717,
"step": 155
},
{
"epoch": 0.23547169811320753,
"grad_norm": 0.4131388371605724,
"learning_rate": 3.919597989949749e-05,
"loss": 0.812,
"step": 156
},
{
"epoch": 0.2369811320754717,
"grad_norm": 0.6710269473568021,
"learning_rate": 3.944723618090452e-05,
"loss": 0.8191,
"step": 157
},
{
"epoch": 0.23849056603773586,
"grad_norm": 0.4897114594610496,
"learning_rate": 3.969849246231156e-05,
"loss": 0.7659,
"step": 158
},
{
"epoch": 0.24,
"grad_norm": 0.5108095802087971,
"learning_rate": 3.9949748743718597e-05,
"loss": 0.8281,
"step": 159
},
{
"epoch": 0.24150943396226415,
"grad_norm": 0.6108421680937045,
"learning_rate": 4.020100502512563e-05,
"loss": 0.8847,
"step": 160
},
{
"epoch": 0.24301886792452831,
"grad_norm": 0.5616258226397572,
"learning_rate": 4.0452261306532664e-05,
"loss": 0.8054,
"step": 161
},
{
"epoch": 0.24452830188679245,
"grad_norm": 0.45309534243186583,
"learning_rate": 4.07035175879397e-05,
"loss": 0.8975,
"step": 162
},
{
"epoch": 0.2460377358490566,
"grad_norm": 0.5773610924314223,
"learning_rate": 4.095477386934674e-05,
"loss": 0.9968,
"step": 163
},
{
"epoch": 0.24754716981132074,
"grad_norm": 0.4469837688813638,
"learning_rate": 4.120603015075377e-05,
"loss": 0.8012,
"step": 164
},
{
"epoch": 0.2490566037735849,
"grad_norm": 2.054275267813225,
"learning_rate": 4.1457286432160806e-05,
"loss": 0.8895,
"step": 165
},
{
"epoch": 0.25056603773584907,
"grad_norm": 0.5614011880192892,
"learning_rate": 4.170854271356784e-05,
"loss": 0.9091,
"step": 166
},
{
"epoch": 0.25207547169811323,
"grad_norm": 0.5803759031771593,
"learning_rate": 4.1959798994974874e-05,
"loss": 0.9152,
"step": 167
},
{
"epoch": 0.25358490566037734,
"grad_norm": 0.6748698350397639,
"learning_rate": 4.2211055276381914e-05,
"loss": 0.8804,
"step": 168
},
{
"epoch": 0.2550943396226415,
"grad_norm": 0.5901722233952649,
"learning_rate": 4.246231155778895e-05,
"loss": 0.8415,
"step": 169
},
{
"epoch": 0.25660377358490566,
"grad_norm": 0.6343703843442664,
"learning_rate": 4.271356783919598e-05,
"loss": 0.8981,
"step": 170
},
{
"epoch": 0.2581132075471698,
"grad_norm": 0.5181992447670495,
"learning_rate": 4.2964824120603016e-05,
"loss": 0.8191,
"step": 171
},
{
"epoch": 0.259622641509434,
"grad_norm": 0.4542387485161782,
"learning_rate": 4.321608040201005e-05,
"loss": 0.8212,
"step": 172
},
{
"epoch": 0.2611320754716981,
"grad_norm": 0.5151193905845751,
"learning_rate": 4.346733668341709e-05,
"loss": 0.8486,
"step": 173
},
{
"epoch": 0.26264150943396225,
"grad_norm": 1.037000010215159,
"learning_rate": 4.3718592964824124e-05,
"loss": 0.813,
"step": 174
},
{
"epoch": 0.2641509433962264,
"grad_norm": 0.4911515809403663,
"learning_rate": 4.396984924623116e-05,
"loss": 0.8018,
"step": 175
},
{
"epoch": 0.2656603773584906,
"grad_norm": 0.4701828499446655,
"learning_rate": 4.42211055276382e-05,
"loss": 0.8715,
"step": 176
},
{
"epoch": 0.26716981132075474,
"grad_norm": 0.4950297728241695,
"learning_rate": 4.4472361809045225e-05,
"loss": 0.8019,
"step": 177
},
{
"epoch": 0.26867924528301884,
"grad_norm": 0.5589767820377305,
"learning_rate": 4.4723618090452266e-05,
"loss": 0.9524,
"step": 178
},
{
"epoch": 0.270188679245283,
"grad_norm": 0.571633016201735,
"learning_rate": 4.49748743718593e-05,
"loss": 0.8792,
"step": 179
},
{
"epoch": 0.27169811320754716,
"grad_norm": 0.476203248798549,
"learning_rate": 4.522613065326633e-05,
"loss": 0.8768,
"step": 180
},
{
"epoch": 0.2732075471698113,
"grad_norm": 0.48088169221041444,
"learning_rate": 4.5477386934673374e-05,
"loss": 0.7918,
"step": 181
},
{
"epoch": 0.2747169811320755,
"grad_norm": 0.5571799591230107,
"learning_rate": 4.57286432160804e-05,
"loss": 0.8038,
"step": 182
},
{
"epoch": 0.27622641509433965,
"grad_norm": 0.5527228247900622,
"learning_rate": 4.597989949748744e-05,
"loss": 0.8643,
"step": 183
},
{
"epoch": 0.27773584905660376,
"grad_norm": 0.5211423771128341,
"learning_rate": 4.6231155778894475e-05,
"loss": 0.8878,
"step": 184
},
{
"epoch": 0.2792452830188679,
"grad_norm": 0.6572932038519691,
"learning_rate": 4.648241206030151e-05,
"loss": 0.8722,
"step": 185
},
{
"epoch": 0.2807547169811321,
"grad_norm": 0.7101877955695006,
"learning_rate": 4.673366834170855e-05,
"loss": 0.725,
"step": 186
},
{
"epoch": 0.28226415094339624,
"grad_norm": 0.49463109890029217,
"learning_rate": 4.6984924623115577e-05,
"loss": 0.8044,
"step": 187
},
{
"epoch": 0.2837735849056604,
"grad_norm": 0.6761004422971036,
"learning_rate": 4.723618090452262e-05,
"loss": 0.8928,
"step": 188
},
{
"epoch": 0.2852830188679245,
"grad_norm": 0.7308836814149202,
"learning_rate": 4.748743718592965e-05,
"loss": 0.7881,
"step": 189
},
{
"epoch": 0.28679245283018867,
"grad_norm": 0.5752705782759212,
"learning_rate": 4.7738693467336685e-05,
"loss": 0.7849,
"step": 190
},
{
"epoch": 0.28830188679245283,
"grad_norm": 0.6124086779896891,
"learning_rate": 4.7989949748743725e-05,
"loss": 0.8595,
"step": 191
},
{
"epoch": 0.289811320754717,
"grad_norm": 0.9703844768396788,
"learning_rate": 4.824120603015075e-05,
"loss": 0.8965,
"step": 192
},
{
"epoch": 0.29132075471698116,
"grad_norm": 0.920385457687424,
"learning_rate": 4.849246231155779e-05,
"loss": 0.8885,
"step": 193
},
{
"epoch": 0.29283018867924526,
"grad_norm": 0.5451187115485865,
"learning_rate": 4.874371859296483e-05,
"loss": 0.8876,
"step": 194
},
{
"epoch": 0.2943396226415094,
"grad_norm": 0.8924359703337917,
"learning_rate": 4.899497487437186e-05,
"loss": 0.7749,
"step": 195
},
{
"epoch": 0.2958490566037736,
"grad_norm": 0.6987574115677371,
"learning_rate": 4.92462311557789e-05,
"loss": 0.8024,
"step": 196
},
{
"epoch": 0.29735849056603775,
"grad_norm": 1.0879879193939195,
"learning_rate": 4.949748743718593e-05,
"loss": 0.908,
"step": 197
},
{
"epoch": 0.2988679245283019,
"grad_norm": 0.7248543407154597,
"learning_rate": 4.974874371859297e-05,
"loss": 0.8631,
"step": 198
},
{
"epoch": 0.300377358490566,
"grad_norm": 0.8236665003667092,
"learning_rate": 5e-05,
"loss": 0.7335,
"step": 199
},
{
"epoch": 0.3018867924528302,
"grad_norm": 0.6243498075203732,
"learning_rate": 4.9972020145495246e-05,
"loss": 0.8225,
"step": 200
},
{
"epoch": 0.30339622641509434,
"grad_norm": 0.9010469648278001,
"learning_rate": 4.994404029099049e-05,
"loss": 0.8684,
"step": 201
},
{
"epoch": 0.3049056603773585,
"grad_norm": 0.8500224224154146,
"learning_rate": 4.991606043648573e-05,
"loss": 0.7941,
"step": 202
},
{
"epoch": 0.30641509433962266,
"grad_norm": 0.5361097858505313,
"learning_rate": 4.9888080581980976e-05,
"loss": 0.8696,
"step": 203
},
{
"epoch": 0.30792452830188677,
"grad_norm": 0.9288603951781687,
"learning_rate": 4.986010072747622e-05,
"loss": 0.9553,
"step": 204
},
{
"epoch": 0.30943396226415093,
"grad_norm": 0.7165689385459199,
"learning_rate": 4.983212087297146e-05,
"loss": 0.7807,
"step": 205
},
{
"epoch": 0.3109433962264151,
"grad_norm": 0.6324302878103103,
"learning_rate": 4.9804141018466706e-05,
"loss": 0.826,
"step": 206
},
{
"epoch": 0.31245283018867925,
"grad_norm": 1.0280953874798517,
"learning_rate": 4.977616116396195e-05,
"loss": 0.8772,
"step": 207
},
{
"epoch": 0.3139622641509434,
"grad_norm": 0.6365774522631158,
"learning_rate": 4.974818130945719e-05,
"loss": 0.7882,
"step": 208
},
{
"epoch": 0.3154716981132075,
"grad_norm": 0.7866464392944168,
"learning_rate": 4.9720201454952436e-05,
"loss": 0.8591,
"step": 209
},
{
"epoch": 0.3169811320754717,
"grad_norm": 0.9900793181165777,
"learning_rate": 4.969222160044768e-05,
"loss": 0.9166,
"step": 210
},
{
"epoch": 0.31849056603773584,
"grad_norm": 0.5027834608849903,
"learning_rate": 4.966424174594292e-05,
"loss": 0.8475,
"step": 211
},
{
"epoch": 0.32,
"grad_norm": 0.8149475773947513,
"learning_rate": 4.9636261891438166e-05,
"loss": 0.8877,
"step": 212
},
{
"epoch": 0.32150943396226417,
"grad_norm": 0.7936117976140553,
"learning_rate": 4.960828203693341e-05,
"loss": 0.7943,
"step": 213
},
{
"epoch": 0.3230188679245283,
"grad_norm": 0.6047908551122195,
"learning_rate": 4.958030218242865e-05,
"loss": 0.7487,
"step": 214
},
{
"epoch": 0.32452830188679244,
"grad_norm": 0.8210359919252752,
"learning_rate": 4.9552322327923896e-05,
"loss": 0.8052,
"step": 215
},
{
"epoch": 0.3260377358490566,
"grad_norm": 0.5010377931393522,
"learning_rate": 4.952434247341914e-05,
"loss": 0.8029,
"step": 216
},
{
"epoch": 0.32754716981132076,
"grad_norm": 0.6159395499082359,
"learning_rate": 4.949636261891438e-05,
"loss": 0.7816,
"step": 217
},
{
"epoch": 0.3290566037735849,
"grad_norm": 0.6021660774660482,
"learning_rate": 4.9468382764409626e-05,
"loss": 0.8034,
"step": 218
},
{
"epoch": 0.3305660377358491,
"grad_norm": 0.48862257577764984,
"learning_rate": 4.944040290990487e-05,
"loss": 0.7641,
"step": 219
},
{
"epoch": 0.3320754716981132,
"grad_norm": 0.7291022201204262,
"learning_rate": 4.941242305540011e-05,
"loss": 0.7622,
"step": 220
},
{
"epoch": 0.33358490566037735,
"grad_norm": 0.5680273830366244,
"learning_rate": 4.9384443200895356e-05,
"loss": 0.8032,
"step": 221
},
{
"epoch": 0.3350943396226415,
"grad_norm": 0.7388526494706458,
"learning_rate": 4.93564633463906e-05,
"loss": 0.7876,
"step": 222
},
{
"epoch": 0.3366037735849057,
"grad_norm": 0.8366607526648661,
"learning_rate": 4.932848349188584e-05,
"loss": 0.838,
"step": 223
},
{
"epoch": 0.33811320754716984,
"grad_norm": 0.5362665065766964,
"learning_rate": 4.930050363738109e-05,
"loss": 0.8709,
"step": 224
},
{
"epoch": 0.33962264150943394,
"grad_norm": 0.7046748659670787,
"learning_rate": 4.927252378287633e-05,
"loss": 0.8295,
"step": 225
},
{
"epoch": 0.3411320754716981,
"grad_norm": 0.5709018100667398,
"learning_rate": 4.924454392837157e-05,
"loss": 0.7931,
"step": 226
},
{
"epoch": 0.34264150943396227,
"grad_norm": 1.5265206975752903,
"learning_rate": 4.9216564073866817e-05,
"loss": 0.7451,
"step": 227
},
{
"epoch": 0.3441509433962264,
"grad_norm": 0.6830742695717387,
"learning_rate": 4.918858421936206e-05,
"loss": 0.8248,
"step": 228
},
{
"epoch": 0.3456603773584906,
"grad_norm": 0.7664933367072962,
"learning_rate": 4.91606043648573e-05,
"loss": 0.8626,
"step": 229
},
{
"epoch": 0.3471698113207547,
"grad_norm": 0.4891691600978454,
"learning_rate": 4.913262451035255e-05,
"loss": 0.8749,
"step": 230
},
{
"epoch": 0.34867924528301886,
"grad_norm": 1.1932498298234255,
"learning_rate": 4.910464465584779e-05,
"loss": 0.7708,
"step": 231
},
{
"epoch": 0.350188679245283,
"grad_norm": 0.6049277141915559,
"learning_rate": 4.907666480134303e-05,
"loss": 0.7572,
"step": 232
},
{
"epoch": 0.3516981132075472,
"grad_norm": 0.6315896027669823,
"learning_rate": 4.9048684946838284e-05,
"loss": 0.8363,
"step": 233
},
{
"epoch": 0.35320754716981134,
"grad_norm": 0.688434404877305,
"learning_rate": 4.902070509233353e-05,
"loss": 0.8554,
"step": 234
},
{
"epoch": 0.35471698113207545,
"grad_norm": 0.5422581659132807,
"learning_rate": 4.899272523782877e-05,
"loss": 0.8488,
"step": 235
},
{
"epoch": 0.3562264150943396,
"grad_norm": 0.591458349973424,
"learning_rate": 4.896474538332401e-05,
"loss": 0.8144,
"step": 236
},
{
"epoch": 0.35773584905660377,
"grad_norm": 0.5604699909131293,
"learning_rate": 4.893676552881925e-05,
"loss": 0.7237,
"step": 237
},
{
"epoch": 0.35924528301886793,
"grad_norm": 0.4673344116856225,
"learning_rate": 4.8908785674314494e-05,
"loss": 0.7766,
"step": 238
},
{
"epoch": 0.3607547169811321,
"grad_norm": 0.5436449382903117,
"learning_rate": 4.888080581980974e-05,
"loss": 0.8197,
"step": 239
},
{
"epoch": 0.3622641509433962,
"grad_norm": 0.479468226600264,
"learning_rate": 4.885282596530498e-05,
"loss": 0.7939,
"step": 240
},
{
"epoch": 0.36377358490566036,
"grad_norm": 0.6602471005804883,
"learning_rate": 4.8824846110800224e-05,
"loss": 0.8495,
"step": 241
},
{
"epoch": 0.3652830188679245,
"grad_norm": 0.4571917624290203,
"learning_rate": 4.879686625629547e-05,
"loss": 0.7913,
"step": 242
},
{
"epoch": 0.3667924528301887,
"grad_norm": 0.6232210769659097,
"learning_rate": 4.876888640179072e-05,
"loss": 0.7552,
"step": 243
},
{
"epoch": 0.36830188679245285,
"grad_norm": 0.5204436625742763,
"learning_rate": 4.874090654728596e-05,
"loss": 0.7701,
"step": 244
},
{
"epoch": 0.36981132075471695,
"grad_norm": 0.6612096193726886,
"learning_rate": 4.8712926692781204e-05,
"loss": 0.7847,
"step": 245
},
{
"epoch": 0.3713207547169811,
"grad_norm": 0.45379455740824437,
"learning_rate": 4.868494683827644e-05,
"loss": 0.8053,
"step": 246
},
{
"epoch": 0.3728301886792453,
"grad_norm": 0.5973811927067406,
"learning_rate": 4.8656966983771684e-05,
"loss": 0.8246,
"step": 247
},
{
"epoch": 0.37433962264150944,
"grad_norm": 0.5220120154434879,
"learning_rate": 4.862898712926693e-05,
"loss": 0.8486,
"step": 248
},
{
"epoch": 0.3758490566037736,
"grad_norm": 0.5431811554175676,
"learning_rate": 4.860100727476217e-05,
"loss": 0.8141,
"step": 249
},
{
"epoch": 0.37735849056603776,
"grad_norm": 0.5029866404820176,
"learning_rate": 4.8573027420257414e-05,
"loss": 0.7952,
"step": 250
},
{
"epoch": 0.37886792452830187,
"grad_norm": 0.5898909870291161,
"learning_rate": 4.854504756575266e-05,
"loss": 0.8547,
"step": 251
},
{
"epoch": 0.38037735849056603,
"grad_norm": 0.6238187627468014,
"learning_rate": 4.851706771124791e-05,
"loss": 0.7726,
"step": 252
},
{
"epoch": 0.3818867924528302,
"grad_norm": 0.42756782085548817,
"learning_rate": 4.848908785674315e-05,
"loss": 0.7988,
"step": 253
},
{
"epoch": 0.38339622641509435,
"grad_norm": 0.5943288699361647,
"learning_rate": 4.8461108002238394e-05,
"loss": 0.8483,
"step": 254
},
{
"epoch": 0.3849056603773585,
"grad_norm": 0.44627827351325955,
"learning_rate": 4.843312814773364e-05,
"loss": 0.7907,
"step": 255
},
{
"epoch": 0.3864150943396226,
"grad_norm": 0.6245548346580109,
"learning_rate": 4.840514829322888e-05,
"loss": 0.7789,
"step": 256
},
{
"epoch": 0.3879245283018868,
"grad_norm": 0.5052966445102851,
"learning_rate": 4.837716843872412e-05,
"loss": 0.8078,
"step": 257
},
{
"epoch": 0.38943396226415095,
"grad_norm": 0.5086194272663704,
"learning_rate": 4.834918858421936e-05,
"loss": 0.797,
"step": 258
},
{
"epoch": 0.3909433962264151,
"grad_norm": 0.4917833870150589,
"learning_rate": 4.8321208729714604e-05,
"loss": 0.7691,
"step": 259
},
{
"epoch": 0.39245283018867927,
"grad_norm": 0.5338995503228233,
"learning_rate": 4.829322887520985e-05,
"loss": 0.7981,
"step": 260
},
{
"epoch": 0.3939622641509434,
"grad_norm": 0.5412599367495976,
"learning_rate": 4.82652490207051e-05,
"loss": 0.8443,
"step": 261
},
{
"epoch": 0.39547169811320754,
"grad_norm": 0.580624564144757,
"learning_rate": 4.823726916620034e-05,
"loss": 0.7907,
"step": 262
},
{
"epoch": 0.3969811320754717,
"grad_norm": 0.48369587610402004,
"learning_rate": 4.8209289311695584e-05,
"loss": 0.794,
"step": 263
},
{
"epoch": 0.39849056603773586,
"grad_norm": 0.49030096209333235,
"learning_rate": 4.818130945719083e-05,
"loss": 0.7584,
"step": 264
},
{
"epoch": 0.4,
"grad_norm": 0.5061198889128261,
"learning_rate": 4.815332960268607e-05,
"loss": 0.7352,
"step": 265
},
{
"epoch": 0.40150943396226413,
"grad_norm": 0.44245029843710665,
"learning_rate": 4.8125349748181314e-05,
"loss": 0.7376,
"step": 266
},
{
"epoch": 0.4030188679245283,
"grad_norm": 0.5564346808520174,
"learning_rate": 4.809736989367655e-05,
"loss": 0.7939,
"step": 267
},
{
"epoch": 0.40452830188679245,
"grad_norm": 0.4231713875112469,
"learning_rate": 4.8069390039171794e-05,
"loss": 0.7596,
"step": 268
},
{
"epoch": 0.4060377358490566,
"grad_norm": 0.5072296368778142,
"learning_rate": 4.804141018466704e-05,
"loss": 0.9397,
"step": 269
},
{
"epoch": 0.4075471698113208,
"grad_norm": 1.3091372981391216,
"learning_rate": 4.801343033016228e-05,
"loss": 0.7321,
"step": 270
},
{
"epoch": 0.4090566037735849,
"grad_norm": 0.7068197146851543,
"learning_rate": 4.798545047565753e-05,
"loss": 0.8049,
"step": 271
},
{
"epoch": 0.41056603773584904,
"grad_norm": 0.39121782749476086,
"learning_rate": 4.7957470621152775e-05,
"loss": 0.707,
"step": 272
},
{
"epoch": 0.4120754716981132,
"grad_norm": 0.6092417359403263,
"learning_rate": 4.792949076664802e-05,
"loss": 0.7187,
"step": 273
},
{
"epoch": 0.41358490566037737,
"grad_norm": 0.44996301987627735,
"learning_rate": 4.790151091214326e-05,
"loss": 0.8322,
"step": 274
},
{
"epoch": 0.41509433962264153,
"grad_norm": 0.6626369105932329,
"learning_rate": 4.7873531057638505e-05,
"loss": 0.7857,
"step": 275
},
{
"epoch": 0.41660377358490563,
"grad_norm": 0.4967773382857501,
"learning_rate": 4.784555120313375e-05,
"loss": 0.746,
"step": 276
},
{
"epoch": 0.4181132075471698,
"grad_norm": 0.5435211685588449,
"learning_rate": 4.7817571348628985e-05,
"loss": 0.7965,
"step": 277
},
{
"epoch": 0.41962264150943396,
"grad_norm": 0.7022029449146439,
"learning_rate": 4.778959149412423e-05,
"loss": 0.8653,
"step": 278
},
{
"epoch": 0.4211320754716981,
"grad_norm": 0.45601467205881,
"learning_rate": 4.776161163961947e-05,
"loss": 0.777,
"step": 279
},
{
"epoch": 0.4226415094339623,
"grad_norm": 0.6644957716637454,
"learning_rate": 4.773363178511472e-05,
"loss": 0.8214,
"step": 280
},
{
"epoch": 0.4241509433962264,
"grad_norm": 0.6246544429191916,
"learning_rate": 4.7705651930609965e-05,
"loss": 0.8785,
"step": 281
},
{
"epoch": 0.42566037735849055,
"grad_norm": 0.5291406456848594,
"learning_rate": 4.767767207610521e-05,
"loss": 0.7691,
"step": 282
},
{
"epoch": 0.4271698113207547,
"grad_norm": 0.7456610522310195,
"learning_rate": 4.764969222160045e-05,
"loss": 0.8264,
"step": 283
},
{
"epoch": 0.4286792452830189,
"grad_norm": 0.6353984731391955,
"learning_rate": 4.7621712367095695e-05,
"loss": 0.7689,
"step": 284
},
{
"epoch": 0.43018867924528303,
"grad_norm": 0.5508771074957549,
"learning_rate": 4.759373251259094e-05,
"loss": 0.803,
"step": 285
},
{
"epoch": 0.4316981132075472,
"grad_norm": 0.6135542045293,
"learning_rate": 4.756575265808618e-05,
"loss": 0.7338,
"step": 286
},
{
"epoch": 0.4332075471698113,
"grad_norm": 0.5030983850050345,
"learning_rate": 4.7537772803581425e-05,
"loss": 0.818,
"step": 287
},
{
"epoch": 0.43471698113207546,
"grad_norm": 0.7052367727423026,
"learning_rate": 4.750979294907666e-05,
"loss": 0.8135,
"step": 288
},
{
"epoch": 0.4362264150943396,
"grad_norm": 0.533554107770091,
"learning_rate": 4.748181309457191e-05,
"loss": 0.9047,
"step": 289
},
{
"epoch": 0.4377358490566038,
"grad_norm": 0.881159613978913,
"learning_rate": 4.7453833240067155e-05,
"loss": 0.884,
"step": 290
},
{
"epoch": 0.43924528301886795,
"grad_norm": 0.7306080850175966,
"learning_rate": 4.74258533855624e-05,
"loss": 0.8131,
"step": 291
},
{
"epoch": 0.44075471698113206,
"grad_norm": 0.5044741998175036,
"learning_rate": 4.739787353105764e-05,
"loss": 0.7513,
"step": 292
},
{
"epoch": 0.4422641509433962,
"grad_norm": 0.7168412587080375,
"learning_rate": 4.7369893676552885e-05,
"loss": 0.7788,
"step": 293
},
{
"epoch": 0.4437735849056604,
"grad_norm": 0.4558320123761158,
"learning_rate": 4.734191382204813e-05,
"loss": 0.7893,
"step": 294
},
{
"epoch": 0.44528301886792454,
"grad_norm": 0.7940272573905822,
"learning_rate": 4.731393396754337e-05,
"loss": 0.8386,
"step": 295
},
{
"epoch": 0.4467924528301887,
"grad_norm": 0.5513668143022887,
"learning_rate": 4.7285954113038615e-05,
"loss": 0.7833,
"step": 296
},
{
"epoch": 0.4483018867924528,
"grad_norm": 0.7079029126017938,
"learning_rate": 4.725797425853386e-05,
"loss": 0.7196,
"step": 297
},
{
"epoch": 0.44981132075471697,
"grad_norm": 1.229514968171817,
"learning_rate": 4.72299944040291e-05,
"loss": 0.8989,
"step": 298
},
{
"epoch": 0.45132075471698113,
"grad_norm": 0.6404714941336582,
"learning_rate": 4.7202014549524345e-05,
"loss": 0.8317,
"step": 299
},
{
"epoch": 0.4528301886792453,
"grad_norm": 2.4628741313154188,
"learning_rate": 4.717403469501959e-05,
"loss": 0.8618,
"step": 300
},
{
"epoch": 0.45433962264150946,
"grad_norm": 0.8912363562097001,
"learning_rate": 4.714605484051483e-05,
"loss": 0.8371,
"step": 301
},
{
"epoch": 0.45584905660377356,
"grad_norm": 0.7676920170222373,
"learning_rate": 4.7118074986010076e-05,
"loss": 0.7677,
"step": 302
},
{
"epoch": 0.4573584905660377,
"grad_norm": 0.9101103649802816,
"learning_rate": 4.709009513150532e-05,
"loss": 0.7726,
"step": 303
},
{
"epoch": 0.4588679245283019,
"grad_norm": 0.865381015702614,
"learning_rate": 4.706211527700056e-05,
"loss": 0.77,
"step": 304
},
{
"epoch": 0.46037735849056605,
"grad_norm": 0.629343619599737,
"learning_rate": 4.7034135422495806e-05,
"loss": 0.7224,
"step": 305
},
{
"epoch": 0.4618867924528302,
"grad_norm": 0.9800276759621851,
"learning_rate": 4.700615556799105e-05,
"loss": 0.8232,
"step": 306
},
{
"epoch": 0.4633962264150943,
"grad_norm": 0.5404048701315444,
"learning_rate": 4.697817571348629e-05,
"loss": 0.7897,
"step": 307
},
{
"epoch": 0.4649056603773585,
"grad_norm": 1.0278186663808309,
"learning_rate": 4.6950195858981536e-05,
"loss": 0.8159,
"step": 308
},
{
"epoch": 0.46641509433962264,
"grad_norm": 0.5823665222431843,
"learning_rate": 4.692221600447678e-05,
"loss": 0.7906,
"step": 309
},
{
"epoch": 0.4679245283018868,
"grad_norm": 0.7252680198432144,
"learning_rate": 4.689423614997202e-05,
"loss": 0.764,
"step": 310
},
{
"epoch": 0.46943396226415096,
"grad_norm": 0.6570038646216206,
"learning_rate": 4.6866256295467266e-05,
"loss": 0.6926,
"step": 311
},
{
"epoch": 0.47094339622641507,
"grad_norm": 0.5621715975516278,
"learning_rate": 4.683827644096251e-05,
"loss": 0.6961,
"step": 312
},
{
"epoch": 0.47245283018867923,
"grad_norm": 0.4779432536942869,
"learning_rate": 4.681029658645775e-05,
"loss": 0.7378,
"step": 313
},
{
"epoch": 0.4739622641509434,
"grad_norm": 0.674459154650339,
"learning_rate": 4.6782316731952996e-05,
"loss": 0.7874,
"step": 314
},
{
"epoch": 0.47547169811320755,
"grad_norm": 0.5004466668430579,
"learning_rate": 4.675433687744824e-05,
"loss": 0.7406,
"step": 315
},
{
"epoch": 0.4769811320754717,
"grad_norm": 0.5395088155154139,
"learning_rate": 4.672635702294348e-05,
"loss": 0.7362,
"step": 316
},
{
"epoch": 0.4784905660377359,
"grad_norm": 0.6418834874570026,
"learning_rate": 4.6698377168438726e-05,
"loss": 0.8157,
"step": 317
},
{
"epoch": 0.48,
"grad_norm": 0.4797662965902652,
"learning_rate": 4.667039731393397e-05,
"loss": 0.7916,
"step": 318
},
{
"epoch": 0.48150943396226414,
"grad_norm": 0.6366002378504793,
"learning_rate": 4.664241745942921e-05,
"loss": 0.8928,
"step": 319
},
{
"epoch": 0.4830188679245283,
"grad_norm": 0.7131620418095839,
"learning_rate": 4.6614437604924456e-05,
"loss": 0.7556,
"step": 320
},
{
"epoch": 0.48452830188679247,
"grad_norm": 0.5851313011583172,
"learning_rate": 4.65864577504197e-05,
"loss": 0.8102,
"step": 321
},
{
"epoch": 0.48603773584905663,
"grad_norm": 0.5129745799858986,
"learning_rate": 4.655847789591494e-05,
"loss": 0.7388,
"step": 322
},
{
"epoch": 0.48754716981132074,
"grad_norm": 0.48227342480016644,
"learning_rate": 4.6530498041410186e-05,
"loss": 0.7968,
"step": 323
},
{
"epoch": 0.4890566037735849,
"grad_norm": 0.5765939446109118,
"learning_rate": 4.650251818690543e-05,
"loss": 0.8276,
"step": 324
},
{
"epoch": 0.49056603773584906,
"grad_norm": 0.45210550750524087,
"learning_rate": 4.647453833240067e-05,
"loss": 0.7838,
"step": 325
},
{
"epoch": 0.4920754716981132,
"grad_norm": 0.5235976941093462,
"learning_rate": 4.6446558477895916e-05,
"loss": 0.8524,
"step": 326
},
{
"epoch": 0.4935849056603774,
"grad_norm": 0.948045987049377,
"learning_rate": 4.641857862339116e-05,
"loss": 0.783,
"step": 327
},
{
"epoch": 0.4950943396226415,
"grad_norm": 0.8405622911075784,
"learning_rate": 4.63905987688864e-05,
"loss": 0.7448,
"step": 328
},
{
"epoch": 0.49660377358490565,
"grad_norm": 0.5163025902946453,
"learning_rate": 4.6362618914381646e-05,
"loss": 0.8237,
"step": 329
},
{
"epoch": 0.4981132075471698,
"grad_norm": 0.7136811122203169,
"learning_rate": 4.633463905987689e-05,
"loss": 0.8279,
"step": 330
},
{
"epoch": 0.499622641509434,
"grad_norm": 0.5478782108688454,
"learning_rate": 4.630665920537213e-05,
"loss": 0.7627,
"step": 331
},
{
"epoch": 0.5011320754716981,
"grad_norm": 0.5191383894592315,
"learning_rate": 4.6278679350867376e-05,
"loss": 0.7477,
"step": 332
},
{
"epoch": 0.5026415094339622,
"grad_norm": 0.7132287751115131,
"learning_rate": 4.625069949636262e-05,
"loss": 0.715,
"step": 333
},
{
"epoch": 0.5041509433962265,
"grad_norm": 0.5596228616176007,
"learning_rate": 4.622271964185786e-05,
"loss": 0.8015,
"step": 334
},
{
"epoch": 0.5056603773584906,
"grad_norm": 0.44498739497428585,
"learning_rate": 4.6194739787353107e-05,
"loss": 0.7529,
"step": 335
},
{
"epoch": 0.5071698113207547,
"grad_norm": 0.5546353422236862,
"learning_rate": 4.616675993284835e-05,
"loss": 0.7459,
"step": 336
},
{
"epoch": 0.5086792452830189,
"grad_norm": 0.45280156933343746,
"learning_rate": 4.613878007834359e-05,
"loss": 0.8087,
"step": 337
},
{
"epoch": 0.510188679245283,
"grad_norm": 0.6178648389283045,
"learning_rate": 4.611080022383884e-05,
"loss": 0.8791,
"step": 338
},
{
"epoch": 0.5116981132075472,
"grad_norm": 0.4595941605540381,
"learning_rate": 4.608282036933409e-05,
"loss": 0.7174,
"step": 339
},
{
"epoch": 0.5132075471698113,
"grad_norm": 0.5584877878966316,
"learning_rate": 4.605484051482932e-05,
"loss": 0.7559,
"step": 340
},
{
"epoch": 0.5147169811320754,
"grad_norm": 0.5173754218021847,
"learning_rate": 4.602686066032457e-05,
"loss": 0.7863,
"step": 341
},
{
"epoch": 0.5162264150943396,
"grad_norm": 0.5740142718980299,
"learning_rate": 4.599888080581981e-05,
"loss": 0.7521,
"step": 342
},
{
"epoch": 0.5177358490566037,
"grad_norm": 0.48616061117595205,
"learning_rate": 4.5970900951315053e-05,
"loss": 0.7239,
"step": 343
},
{
"epoch": 0.519245283018868,
"grad_norm": 0.5948531580198754,
"learning_rate": 4.59429210968103e-05,
"loss": 0.7209,
"step": 344
},
{
"epoch": 0.5207547169811321,
"grad_norm": 0.7290623947785152,
"learning_rate": 4.591494124230554e-05,
"loss": 0.759,
"step": 345
},
{
"epoch": 0.5222641509433962,
"grad_norm": 0.488056932359024,
"learning_rate": 4.5886961387800783e-05,
"loss": 0.7456,
"step": 346
},
{
"epoch": 0.5237735849056604,
"grad_norm": 0.47054544364109835,
"learning_rate": 4.5858981533296034e-05,
"loss": 0.7124,
"step": 347
},
{
"epoch": 0.5252830188679245,
"grad_norm": 0.50177713952636,
"learning_rate": 4.583100167879128e-05,
"loss": 0.7519,
"step": 348
},
{
"epoch": 0.5267924528301887,
"grad_norm": 0.6140162666917494,
"learning_rate": 4.580302182428652e-05,
"loss": 0.7098,
"step": 349
},
{
"epoch": 0.5283018867924528,
"grad_norm": 0.4266279539939302,
"learning_rate": 4.577504196978176e-05,
"loss": 0.7665,
"step": 350
},
{
"epoch": 0.5298113207547169,
"grad_norm": 0.9041107079579056,
"learning_rate": 4.5747062115277e-05,
"loss": 0.7677,
"step": 351
},
{
"epoch": 0.5313207547169811,
"grad_norm": 0.5038590825788706,
"learning_rate": 4.5719082260772244e-05,
"loss": 0.7628,
"step": 352
},
{
"epoch": 0.5328301886792453,
"grad_norm": 0.44535742550996604,
"learning_rate": 4.569110240626749e-05,
"loss": 0.7568,
"step": 353
},
{
"epoch": 0.5343396226415095,
"grad_norm": 0.4862471874939931,
"learning_rate": 4.566312255176273e-05,
"loss": 0.7696,
"step": 354
},
{
"epoch": 0.5358490566037736,
"grad_norm": 0.4386852797735026,
"learning_rate": 4.5635142697257974e-05,
"loss": 0.7495,
"step": 355
},
{
"epoch": 0.5373584905660377,
"grad_norm": 0.6316124118050598,
"learning_rate": 4.560716284275322e-05,
"loss": 0.8228,
"step": 356
},
{
"epoch": 0.5388679245283019,
"grad_norm": 0.41211245426770826,
"learning_rate": 4.557918298824847e-05,
"loss": 0.7613,
"step": 357
},
{
"epoch": 0.540377358490566,
"grad_norm": 0.5868461457556423,
"learning_rate": 4.555120313374371e-05,
"loss": 0.7506,
"step": 358
},
{
"epoch": 0.5418867924528302,
"grad_norm": 0.4717702823928158,
"learning_rate": 4.5523223279238954e-05,
"loss": 0.8838,
"step": 359
},
{
"epoch": 0.5433962264150943,
"grad_norm": 0.5003003408121309,
"learning_rate": 4.54952434247342e-05,
"loss": 0.8818,
"step": 360
},
{
"epoch": 0.5449056603773584,
"grad_norm": 0.4907938538260416,
"learning_rate": 4.5467263570229434e-05,
"loss": 0.7319,
"step": 361
},
{
"epoch": 0.5464150943396227,
"grad_norm": 0.4644540043260514,
"learning_rate": 4.543928371572468e-05,
"loss": 0.735,
"step": 362
},
{
"epoch": 0.5479245283018868,
"grad_norm": 0.5560732867320977,
"learning_rate": 4.541130386121992e-05,
"loss": 0.8294,
"step": 363
},
{
"epoch": 0.549433962264151,
"grad_norm": 0.41552200602803446,
"learning_rate": 4.5383324006715164e-05,
"loss": 0.8152,
"step": 364
},
{
"epoch": 0.5509433962264151,
"grad_norm": 0.539878788115719,
"learning_rate": 4.535534415221041e-05,
"loss": 0.7746,
"step": 365
},
{
"epoch": 0.5524528301886793,
"grad_norm": 0.4414859813055167,
"learning_rate": 4.532736429770566e-05,
"loss": 0.7332,
"step": 366
},
{
"epoch": 0.5539622641509434,
"grad_norm": 0.4321477596135969,
"learning_rate": 4.52993844432009e-05,
"loss": 0.7245,
"step": 367
},
{
"epoch": 0.5554716981132075,
"grad_norm": 1.2570879787382243,
"learning_rate": 4.5271404588696144e-05,
"loss": 0.7191,
"step": 368
},
{
"epoch": 0.5569811320754717,
"grad_norm": 0.5742427583402032,
"learning_rate": 4.524342473419139e-05,
"loss": 0.8317,
"step": 369
},
{
"epoch": 0.5584905660377358,
"grad_norm": 0.4741231113436004,
"learning_rate": 4.521544487968663e-05,
"loss": 0.6925,
"step": 370
},
{
"epoch": 0.56,
"grad_norm": 0.48896613714320825,
"learning_rate": 4.518746502518187e-05,
"loss": 0.6983,
"step": 371
},
{
"epoch": 0.5615094339622642,
"grad_norm": 0.42917322921737827,
"learning_rate": 4.515948517067711e-05,
"loss": 0.7469,
"step": 372
},
{
"epoch": 0.5630188679245283,
"grad_norm": 0.41025766821703724,
"learning_rate": 4.5131505316172354e-05,
"loss": 0.7545,
"step": 373
},
{
"epoch": 0.5645283018867925,
"grad_norm": 0.5635148674930043,
"learning_rate": 4.51035254616676e-05,
"loss": 0.8145,
"step": 374
},
{
"epoch": 0.5660377358490566,
"grad_norm": 0.40552510627195454,
"learning_rate": 4.507554560716285e-05,
"loss": 0.7851,
"step": 375
},
{
"epoch": 0.5675471698113208,
"grad_norm": 0.7307038202791475,
"learning_rate": 4.504756575265809e-05,
"loss": 0.8569,
"step": 376
},
{
"epoch": 0.5690566037735849,
"grad_norm": 1.0811129556259813,
"learning_rate": 4.5019585898153335e-05,
"loss": 0.9044,
"step": 377
},
{
"epoch": 0.570566037735849,
"grad_norm": 0.42008950694302577,
"learning_rate": 4.499160604364858e-05,
"loss": 0.8506,
"step": 378
},
{
"epoch": 0.5720754716981132,
"grad_norm": 0.42648022512202355,
"learning_rate": 4.496362618914382e-05,
"loss": 0.8346,
"step": 379
},
{
"epoch": 0.5735849056603773,
"grad_norm": 0.45926513207861036,
"learning_rate": 4.4935646334639065e-05,
"loss": 0.8785,
"step": 380
},
{
"epoch": 0.5750943396226416,
"grad_norm": 0.43277141797293067,
"learning_rate": 4.49076664801343e-05,
"loss": 0.7923,
"step": 381
},
{
"epoch": 0.5766037735849057,
"grad_norm": 0.4171395355778019,
"learning_rate": 4.4879686625629545e-05,
"loss": 0.7133,
"step": 382
},
{
"epoch": 0.5781132075471698,
"grad_norm": 0.49673115482652136,
"learning_rate": 4.485170677112479e-05,
"loss": 0.7325,
"step": 383
},
{
"epoch": 0.579622641509434,
"grad_norm": 0.7116762388295127,
"learning_rate": 4.482372691662003e-05,
"loss": 0.7799,
"step": 384
},
{
"epoch": 0.5811320754716981,
"grad_norm": 0.8174353919858245,
"learning_rate": 4.479574706211528e-05,
"loss": 0.8845,
"step": 385
},
{
"epoch": 0.5826415094339623,
"grad_norm": 0.8335644956676623,
"learning_rate": 4.4767767207610525e-05,
"loss": 0.7319,
"step": 386
},
{
"epoch": 0.5841509433962264,
"grad_norm": 0.6876169170804783,
"learning_rate": 4.473978735310577e-05,
"loss": 0.8176,
"step": 387
},
{
"epoch": 0.5856603773584905,
"grad_norm": 0.5536550742512403,
"learning_rate": 4.471180749860101e-05,
"loss": 0.8086,
"step": 388
},
{
"epoch": 0.5871698113207547,
"grad_norm": 0.5323019116550912,
"learning_rate": 4.4683827644096255e-05,
"loss": 0.7065,
"step": 389
},
{
"epoch": 0.5886792452830188,
"grad_norm": 0.5135931602373224,
"learning_rate": 4.46558477895915e-05,
"loss": 0.7253,
"step": 390
},
{
"epoch": 0.5901886792452831,
"grad_norm": 0.4559259275389328,
"learning_rate": 4.462786793508674e-05,
"loss": 0.7999,
"step": 391
},
{
"epoch": 0.5916981132075472,
"grad_norm": 0.44803810661731486,
"learning_rate": 4.459988808058198e-05,
"loss": 0.8239,
"step": 392
},
{
"epoch": 0.5932075471698113,
"grad_norm": 0.5069482759856541,
"learning_rate": 4.457190822607722e-05,
"loss": 0.8289,
"step": 393
},
{
"epoch": 0.5947169811320755,
"grad_norm": 0.5671863335272355,
"learning_rate": 4.454392837157247e-05,
"loss": 0.8187,
"step": 394
},
{
"epoch": 0.5962264150943396,
"grad_norm": 0.4606809083404723,
"learning_rate": 4.4515948517067715e-05,
"loss": 0.8192,
"step": 395
},
{
"epoch": 0.5977358490566038,
"grad_norm": 0.38018446697126707,
"learning_rate": 4.448796866256296e-05,
"loss": 0.7691,
"step": 396
},
{
"epoch": 0.5992452830188679,
"grad_norm": 0.4323225254781683,
"learning_rate": 4.44599888080582e-05,
"loss": 0.8939,
"step": 397
},
{
"epoch": 0.600754716981132,
"grad_norm": 0.428772129920215,
"learning_rate": 4.4432008953553445e-05,
"loss": 0.7648,
"step": 398
},
{
"epoch": 0.6022641509433962,
"grad_norm": 0.4106799432786821,
"learning_rate": 4.440402909904869e-05,
"loss": 0.7747,
"step": 399
},
{
"epoch": 0.6037735849056604,
"grad_norm": 0.9201724559585969,
"learning_rate": 4.437604924454393e-05,
"loss": 0.7752,
"step": 400
},
{
"epoch": 0.6052830188679246,
"grad_norm": 0.63142220777098,
"learning_rate": 4.4348069390039175e-05,
"loss": 0.7367,
"step": 401
},
{
"epoch": 0.6067924528301887,
"grad_norm": 0.5430276304717978,
"learning_rate": 4.432008953553441e-05,
"loss": 0.804,
"step": 402
},
{
"epoch": 0.6083018867924528,
"grad_norm": 0.439182439759478,
"learning_rate": 4.429210968102966e-05,
"loss": 0.8251,
"step": 403
},
{
"epoch": 0.609811320754717,
"grad_norm": 0.5486244365234464,
"learning_rate": 4.4264129826524905e-05,
"loss": 0.7075,
"step": 404
},
{
"epoch": 0.6113207547169811,
"grad_norm": 0.5277673001485089,
"learning_rate": 4.423614997202015e-05,
"loss": 0.7607,
"step": 405
},
{
"epoch": 0.6128301886792453,
"grad_norm": 0.5256998794976435,
"learning_rate": 4.420817011751539e-05,
"loss": 0.7948,
"step": 406
},
{
"epoch": 0.6143396226415094,
"grad_norm": 0.5122728349415228,
"learning_rate": 4.4180190263010635e-05,
"loss": 0.8268,
"step": 407
},
{
"epoch": 0.6158490566037735,
"grad_norm": 0.5089282410409556,
"learning_rate": 4.415221040850588e-05,
"loss": 0.7666,
"step": 408
},
{
"epoch": 0.6173584905660378,
"grad_norm": 1.4029487791422663,
"learning_rate": 4.412423055400112e-05,
"loss": 0.7687,
"step": 409
},
{
"epoch": 0.6188679245283019,
"grad_norm": 1.0081149280553756,
"learning_rate": 4.4096250699496365e-05,
"loss": 0.7524,
"step": 410
},
{
"epoch": 0.6203773584905661,
"grad_norm": 0.9006929253576336,
"learning_rate": 4.406827084499161e-05,
"loss": 0.8247,
"step": 411
},
{
"epoch": 0.6218867924528302,
"grad_norm": 0.8411745640797793,
"learning_rate": 4.4040290990486845e-05,
"loss": 0.8105,
"step": 412
},
{
"epoch": 0.6233962264150943,
"grad_norm": 0.7294710313107107,
"learning_rate": 4.4012311135982096e-05,
"loss": 0.748,
"step": 413
},
{
"epoch": 0.6249056603773585,
"grad_norm": 4.837670218659803,
"learning_rate": 4.398433128147734e-05,
"loss": 0.8315,
"step": 414
},
{
"epoch": 0.6264150943396226,
"grad_norm": 0.7291575178965143,
"learning_rate": 4.395635142697258e-05,
"loss": 0.7576,
"step": 415
},
{
"epoch": 0.6279245283018868,
"grad_norm": 0.67449080523453,
"learning_rate": 4.3928371572467826e-05,
"loss": 0.7537,
"step": 416
},
{
"epoch": 0.6294339622641509,
"grad_norm": 0.5951820027076258,
"learning_rate": 4.390039171796307e-05,
"loss": 0.7073,
"step": 417
},
{
"epoch": 0.630943396226415,
"grad_norm": 0.7759167017515514,
"learning_rate": 4.387241186345831e-05,
"loss": 0.7775,
"step": 418
},
{
"epoch": 0.6324528301886793,
"grad_norm": 0.65776925709979,
"learning_rate": 4.3844432008953556e-05,
"loss": 0.7754,
"step": 419
},
{
"epoch": 0.6339622641509434,
"grad_norm": 0.783091467326888,
"learning_rate": 4.38164521544488e-05,
"loss": 0.7448,
"step": 420
},
{
"epoch": 0.6354716981132076,
"grad_norm": 0.6111140171989418,
"learning_rate": 4.378847229994404e-05,
"loss": 0.7472,
"step": 421
},
{
"epoch": 0.6369811320754717,
"grad_norm": 0.6125601796793868,
"learning_rate": 4.3760492445439286e-05,
"loss": 0.7546,
"step": 422
},
{
"epoch": 0.6384905660377358,
"grad_norm": 0.5915644550948223,
"learning_rate": 4.373251259093453e-05,
"loss": 0.8102,
"step": 423
},
{
"epoch": 0.64,
"grad_norm": 4.073736000326432,
"learning_rate": 4.370453273642977e-05,
"loss": 0.7923,
"step": 424
},
{
"epoch": 0.6415094339622641,
"grad_norm": 0.8436199611615184,
"learning_rate": 4.3676552881925016e-05,
"loss": 0.8023,
"step": 425
},
{
"epoch": 0.6430188679245283,
"grad_norm": 0.5606909686356082,
"learning_rate": 4.364857302742026e-05,
"loss": 0.7346,
"step": 426
},
{
"epoch": 0.6445283018867924,
"grad_norm": 0.7771075034666999,
"learning_rate": 4.36205931729155e-05,
"loss": 0.6807,
"step": 427
},
{
"epoch": 0.6460377358490565,
"grad_norm": 0.5192219758704827,
"learning_rate": 4.3592613318410746e-05,
"loss": 0.7678,
"step": 428
},
{
"epoch": 0.6475471698113208,
"grad_norm": 0.6083104527276569,
"learning_rate": 4.356463346390599e-05,
"loss": 0.7552,
"step": 429
},
{
"epoch": 0.6490566037735849,
"grad_norm": 0.601019714720564,
"learning_rate": 4.353665360940123e-05,
"loss": 0.8074,
"step": 430
},
{
"epoch": 0.6505660377358491,
"grad_norm": 0.6089857562150989,
"learning_rate": 4.3508673754896476e-05,
"loss": 0.7508,
"step": 431
},
{
"epoch": 0.6520754716981132,
"grad_norm": 0.6249210482138133,
"learning_rate": 4.348069390039172e-05,
"loss": 0.7558,
"step": 432
},
{
"epoch": 0.6535849056603774,
"grad_norm": 2.537695860786402,
"learning_rate": 4.345271404588696e-05,
"loss": 0.817,
"step": 433
},
{
"epoch": 0.6550943396226415,
"grad_norm": 0.9217891149931049,
"learning_rate": 4.3424734191382206e-05,
"loss": 0.7971,
"step": 434
},
{
"epoch": 0.6566037735849056,
"grad_norm": 0.5519943971261175,
"learning_rate": 4.339675433687745e-05,
"loss": 0.8088,
"step": 435
},
{
"epoch": 0.6581132075471698,
"grad_norm": 0.7425761436709358,
"learning_rate": 4.336877448237269e-05,
"loss": 0.756,
"step": 436
},
{
"epoch": 0.659622641509434,
"grad_norm": 0.6942699916554731,
"learning_rate": 4.3340794627867936e-05,
"loss": 0.8032,
"step": 437
},
{
"epoch": 0.6611320754716982,
"grad_norm": 0.5334615246424551,
"learning_rate": 4.331281477336318e-05,
"loss": 0.7998,
"step": 438
},
{
"epoch": 0.6626415094339623,
"grad_norm": 0.8292482053092843,
"learning_rate": 4.328483491885842e-05,
"loss": 0.7431,
"step": 439
},
{
"epoch": 0.6641509433962264,
"grad_norm": 0.4530926505666787,
"learning_rate": 4.3256855064353666e-05,
"loss": 0.7596,
"step": 440
},
{
"epoch": 0.6656603773584906,
"grad_norm": 0.6624652358047928,
"learning_rate": 4.322887520984891e-05,
"loss": 0.7712,
"step": 441
},
{
"epoch": 0.6671698113207547,
"grad_norm": 0.5064411908735837,
"learning_rate": 4.320089535534415e-05,
"loss": 0.695,
"step": 442
},
{
"epoch": 0.6686792452830189,
"grad_norm": 0.5377369847550588,
"learning_rate": 4.31729155008394e-05,
"loss": 0.756,
"step": 443
},
{
"epoch": 0.670188679245283,
"grad_norm": 0.4983199069825207,
"learning_rate": 4.314493564633464e-05,
"loss": 0.7807,
"step": 444
},
{
"epoch": 0.6716981132075471,
"grad_norm": 0.44158119758359315,
"learning_rate": 4.311695579182988e-05,
"loss": 0.7754,
"step": 445
},
{
"epoch": 0.6732075471698113,
"grad_norm": 0.5596653554139499,
"learning_rate": 4.3088975937325127e-05,
"loss": 0.8195,
"step": 446
},
{
"epoch": 0.6747169811320755,
"grad_norm": 0.43436077791659994,
"learning_rate": 4.306099608282037e-05,
"loss": 0.7604,
"step": 447
},
{
"epoch": 0.6762264150943397,
"grad_norm": 0.5030810215462936,
"learning_rate": 4.303301622831561e-05,
"loss": 0.6785,
"step": 448
},
{
"epoch": 0.6777358490566038,
"grad_norm": 0.5482694189131312,
"learning_rate": 4.300503637381086e-05,
"loss": 0.7199,
"step": 449
},
{
"epoch": 0.6792452830188679,
"grad_norm": 0.5967150137777674,
"learning_rate": 4.29770565193061e-05,
"loss": 0.7983,
"step": 450
},
{
"epoch": 0.6807547169811321,
"grad_norm": 0.5940918534861581,
"learning_rate": 4.294907666480134e-05,
"loss": 0.7591,
"step": 451
},
{
"epoch": 0.6822641509433962,
"grad_norm": 0.5427565481245458,
"learning_rate": 4.2921096810296593e-05,
"loss": 0.8326,
"step": 452
},
{
"epoch": 0.6837735849056604,
"grad_norm": 2.638916003074341,
"learning_rate": 4.289311695579184e-05,
"loss": 0.7776,
"step": 453
},
{
"epoch": 0.6852830188679245,
"grad_norm": 0.8031629641842678,
"learning_rate": 4.2865137101287073e-05,
"loss": 0.7061,
"step": 454
},
{
"epoch": 0.6867924528301886,
"grad_norm": 0.4705714421622477,
"learning_rate": 4.283715724678232e-05,
"loss": 0.8115,
"step": 455
},
{
"epoch": 0.6883018867924529,
"grad_norm": 0.694102381672302,
"learning_rate": 4.280917739227756e-05,
"loss": 0.7455,
"step": 456
},
{
"epoch": 0.689811320754717,
"grad_norm": 0.7026076420751877,
"learning_rate": 4.2781197537772804e-05,
"loss": 0.7419,
"step": 457
},
{
"epoch": 0.6913207547169812,
"grad_norm": 0.5435116018268908,
"learning_rate": 4.275321768326805e-05,
"loss": 0.7141,
"step": 458
},
{
"epoch": 0.6928301886792453,
"grad_norm": 0.7695258633555748,
"learning_rate": 4.272523782876329e-05,
"loss": 0.7067,
"step": 459
},
{
"epoch": 0.6943396226415094,
"grad_norm": 1.7975424022373265,
"learning_rate": 4.2697257974258534e-05,
"loss": 0.8697,
"step": 460
},
{
"epoch": 0.6958490566037736,
"grad_norm": 0.6317051374255238,
"learning_rate": 4.2669278119753784e-05,
"loss": 0.7739,
"step": 461
},
{
"epoch": 0.6973584905660377,
"grad_norm": 0.6351687672902928,
"learning_rate": 4.264129826524903e-05,
"loss": 0.6983,
"step": 462
},
{
"epoch": 0.6988679245283019,
"grad_norm": 0.6244759183757971,
"learning_rate": 4.261331841074427e-05,
"loss": 0.7031,
"step": 463
},
{
"epoch": 0.700377358490566,
"grad_norm": 0.5514796773294356,
"learning_rate": 4.258533855623951e-05,
"loss": 0.6884,
"step": 464
},
{
"epoch": 0.7018867924528301,
"grad_norm": 0.4726505169399417,
"learning_rate": 4.255735870173475e-05,
"loss": 0.7382,
"step": 465
},
{
"epoch": 0.7033962264150944,
"grad_norm": 0.6739904189805587,
"learning_rate": 4.2529378847229994e-05,
"loss": 0.8122,
"step": 466
},
{
"epoch": 0.7049056603773585,
"grad_norm": 0.4821838176068059,
"learning_rate": 4.250139899272524e-05,
"loss": 0.8558,
"step": 467
},
{
"epoch": 0.7064150943396227,
"grad_norm": 0.44367485643876686,
"learning_rate": 4.247341913822048e-05,
"loss": 0.7216,
"step": 468
},
{
"epoch": 0.7079245283018868,
"grad_norm": 0.4529362658881182,
"learning_rate": 4.2445439283715724e-05,
"loss": 0.6532,
"step": 469
},
{
"epoch": 0.7094339622641509,
"grad_norm": 0.3961737139001265,
"learning_rate": 4.241745942921097e-05,
"loss": 0.7866,
"step": 470
},
{
"epoch": 0.7109433962264151,
"grad_norm": 0.5258997289167945,
"learning_rate": 4.238947957470622e-05,
"loss": 0.8513,
"step": 471
},
{
"epoch": 0.7124528301886792,
"grad_norm": 0.40132597205607645,
"learning_rate": 4.236149972020146e-05,
"loss": 0.8006,
"step": 472
},
{
"epoch": 0.7139622641509434,
"grad_norm": 0.4178549903280029,
"learning_rate": 4.2333519865696704e-05,
"loss": 0.7349,
"step": 473
},
{
"epoch": 0.7154716981132075,
"grad_norm": 0.6137285870140219,
"learning_rate": 4.230554001119195e-05,
"loss": 0.8465,
"step": 474
},
{
"epoch": 0.7169811320754716,
"grad_norm": 0.549511124878987,
"learning_rate": 4.2277560156687184e-05,
"loss": 0.6763,
"step": 475
},
{
"epoch": 0.7184905660377359,
"grad_norm": 0.46406879130022355,
"learning_rate": 4.224958030218243e-05,
"loss": 0.7497,
"step": 476
},
{
"epoch": 0.72,
"grad_norm": 0.4888047862828803,
"learning_rate": 4.222160044767767e-05,
"loss": 0.7669,
"step": 477
},
{
"epoch": 0.7215094339622642,
"grad_norm": 0.49711406759007537,
"learning_rate": 4.2193620593172914e-05,
"loss": 0.8006,
"step": 478
},
{
"epoch": 0.7230188679245283,
"grad_norm": 0.4176159200268463,
"learning_rate": 4.216564073866816e-05,
"loss": 0.7379,
"step": 479
},
{
"epoch": 0.7245283018867924,
"grad_norm": 0.5327893586007508,
"learning_rate": 4.213766088416341e-05,
"loss": 0.8138,
"step": 480
},
{
"epoch": 0.7260377358490566,
"grad_norm": 0.9142602019147515,
"learning_rate": 4.210968102965865e-05,
"loss": 0.7227,
"step": 481
},
{
"epoch": 0.7275471698113207,
"grad_norm": 0.8778784458357308,
"learning_rate": 4.2081701175153894e-05,
"loss": 0.7579,
"step": 482
},
{
"epoch": 0.7290566037735849,
"grad_norm": 0.6964388295748577,
"learning_rate": 4.205372132064914e-05,
"loss": 0.712,
"step": 483
},
{
"epoch": 0.730566037735849,
"grad_norm": 0.43020746781080765,
"learning_rate": 4.202574146614438e-05,
"loss": 0.7171,
"step": 484
},
{
"epoch": 0.7320754716981132,
"grad_norm": 0.5749225465192066,
"learning_rate": 4.199776161163962e-05,
"loss": 0.7308,
"step": 485
},
{
"epoch": 0.7335849056603774,
"grad_norm": 0.6045836791728121,
"learning_rate": 4.196978175713486e-05,
"loss": 0.7837,
"step": 486
},
{
"epoch": 0.7350943396226415,
"grad_norm": 0.853172024259124,
"learning_rate": 4.1941801902630104e-05,
"loss": 0.7823,
"step": 487
},
{
"epoch": 0.7366037735849057,
"grad_norm": 0.8328941937503014,
"learning_rate": 4.191382204812535e-05,
"loss": 0.7805,
"step": 488
},
{
"epoch": 0.7381132075471698,
"grad_norm": 0.5856813273142618,
"learning_rate": 4.18858421936206e-05,
"loss": 0.7613,
"step": 489
},
{
"epoch": 0.7396226415094339,
"grad_norm": 0.7017696574876483,
"learning_rate": 4.185786233911584e-05,
"loss": 0.7977,
"step": 490
},
{
"epoch": 0.7411320754716981,
"grad_norm": 0.5883283825983562,
"learning_rate": 4.1829882484611085e-05,
"loss": 0.6595,
"step": 491
},
{
"epoch": 0.7426415094339622,
"grad_norm": 0.9189100296218359,
"learning_rate": 4.180190263010633e-05,
"loss": 0.8225,
"step": 492
},
{
"epoch": 0.7441509433962264,
"grad_norm": 0.4621273821982856,
"learning_rate": 4.177392277560157e-05,
"loss": 0.7267,
"step": 493
},
{
"epoch": 0.7456603773584906,
"grad_norm": 0.7263731461661145,
"learning_rate": 4.1745942921096815e-05,
"loss": 0.8351,
"step": 494
},
{
"epoch": 0.7471698113207547,
"grad_norm": 0.4239681400253427,
"learning_rate": 4.171796306659206e-05,
"loss": 0.8602,
"step": 495
},
{
"epoch": 0.7486792452830189,
"grad_norm": 0.5587028545439432,
"learning_rate": 4.1689983212087295e-05,
"loss": 0.7781,
"step": 496
},
{
"epoch": 0.750188679245283,
"grad_norm": 0.43677162709699907,
"learning_rate": 4.166200335758254e-05,
"loss": 0.7885,
"step": 497
},
{
"epoch": 0.7516981132075472,
"grad_norm": 0.5956328426453763,
"learning_rate": 4.163402350307778e-05,
"loss": 0.7078,
"step": 498
},
{
"epoch": 0.7532075471698113,
"grad_norm": 0.44802871978378656,
"learning_rate": 4.160604364857303e-05,
"loss": 0.7486,
"step": 499
},
{
"epoch": 0.7547169811320755,
"grad_norm": 0.40451743130792983,
"learning_rate": 4.1578063794068275e-05,
"loss": 0.7091,
"step": 500
},
{
"epoch": 0.7562264150943396,
"grad_norm": 0.4482261465687673,
"learning_rate": 4.155008393956352e-05,
"loss": 0.708,
"step": 501
},
{
"epoch": 0.7577358490566037,
"grad_norm": 4.209187428133594,
"learning_rate": 4.152210408505876e-05,
"loss": 0.7711,
"step": 502
},
{
"epoch": 0.759245283018868,
"grad_norm": 0.6084246843903299,
"learning_rate": 4.1494124230554005e-05,
"loss": 0.7152,
"step": 503
},
{
"epoch": 0.7607547169811321,
"grad_norm": 0.5301187311477114,
"learning_rate": 4.146614437604925e-05,
"loss": 0.822,
"step": 504
},
{
"epoch": 0.7622641509433963,
"grad_norm": 0.5775612395962103,
"learning_rate": 4.143816452154449e-05,
"loss": 0.773,
"step": 505
},
{
"epoch": 0.7637735849056604,
"grad_norm": 0.5169776427958961,
"learning_rate": 4.141018466703973e-05,
"loss": 0.7756,
"step": 506
},
{
"epoch": 0.7652830188679245,
"grad_norm": 0.6175580792907378,
"learning_rate": 4.138220481253497e-05,
"loss": 0.7714,
"step": 507
},
{
"epoch": 0.7667924528301887,
"grad_norm": 0.5202456866228283,
"learning_rate": 4.135422495803022e-05,
"loss": 0.7569,
"step": 508
},
{
"epoch": 0.7683018867924528,
"grad_norm": 0.43230254405968893,
"learning_rate": 4.1326245103525465e-05,
"loss": 0.7701,
"step": 509
},
{
"epoch": 0.769811320754717,
"grad_norm": 0.6199856475998696,
"learning_rate": 4.129826524902071e-05,
"loss": 0.7365,
"step": 510
},
{
"epoch": 0.7713207547169811,
"grad_norm": 0.4590557643196903,
"learning_rate": 4.127028539451595e-05,
"loss": 0.7349,
"step": 511
},
{
"epoch": 0.7728301886792452,
"grad_norm": 0.5947228546936734,
"learning_rate": 4.1242305540011195e-05,
"loss": 0.7292,
"step": 512
},
{
"epoch": 0.7743396226415095,
"grad_norm": 0.503790040694598,
"learning_rate": 4.121432568550644e-05,
"loss": 0.7439,
"step": 513
},
{
"epoch": 0.7758490566037736,
"grad_norm": 0.6227639804784848,
"learning_rate": 4.118634583100168e-05,
"loss": 0.7778,
"step": 514
},
{
"epoch": 0.7773584905660378,
"grad_norm": 0.7695881075574827,
"learning_rate": 4.1158365976496925e-05,
"loss": 0.7474,
"step": 515
},
{
"epoch": 0.7788679245283019,
"grad_norm": 0.4832882825699558,
"learning_rate": 4.113038612199216e-05,
"loss": 0.7468,
"step": 516
},
{
"epoch": 0.780377358490566,
"grad_norm": 0.5553008775060718,
"learning_rate": 4.110240626748741e-05,
"loss": 0.6952,
"step": 517
},
{
"epoch": 0.7818867924528302,
"grad_norm": 0.5887331512376914,
"learning_rate": 4.1074426412982655e-05,
"loss": 0.753,
"step": 518
},
{
"epoch": 0.7833962264150943,
"grad_norm": 0.44943535047602,
"learning_rate": 4.10464465584779e-05,
"loss": 0.7084,
"step": 519
},
{
"epoch": 0.7849056603773585,
"grad_norm": 0.6695367567635757,
"learning_rate": 4.101846670397314e-05,
"loss": 0.8359,
"step": 520
},
{
"epoch": 0.7864150943396226,
"grad_norm": 0.4641289136731032,
"learning_rate": 4.0990486849468386e-05,
"loss": 0.727,
"step": 521
},
{
"epoch": 0.7879245283018868,
"grad_norm": 0.7406198663798486,
"learning_rate": 4.096250699496363e-05,
"loss": 0.7915,
"step": 522
},
{
"epoch": 0.789433962264151,
"grad_norm": 0.44176398853870474,
"learning_rate": 4.093452714045887e-05,
"loss": 0.6947,
"step": 523
},
{
"epoch": 0.7909433962264151,
"grad_norm": 0.48042954012719924,
"learning_rate": 4.0906547285954116e-05,
"loss": 0.7552,
"step": 524
},
{
"epoch": 0.7924528301886793,
"grad_norm": 0.5842724845306636,
"learning_rate": 4.087856743144936e-05,
"loss": 0.8175,
"step": 525
},
{
"epoch": 0.7939622641509434,
"grad_norm": 0.4284726371980202,
"learning_rate": 4.08505875769446e-05,
"loss": 0.7494,
"step": 526
},
{
"epoch": 0.7954716981132075,
"grad_norm": 0.5254149807506523,
"learning_rate": 4.0822607722439846e-05,
"loss": 0.7549,
"step": 527
},
{
"epoch": 0.7969811320754717,
"grad_norm": 0.48365346339231174,
"learning_rate": 4.079462786793509e-05,
"loss": 0.6465,
"step": 528
},
{
"epoch": 0.7984905660377358,
"grad_norm": 0.5361901921704368,
"learning_rate": 4.076664801343033e-05,
"loss": 0.7893,
"step": 529
},
{
"epoch": 0.8,
"grad_norm": 0.4450391820564859,
"learning_rate": 4.0738668158925576e-05,
"loss": 0.7596,
"step": 530
},
{
"epoch": 0.8015094339622642,
"grad_norm": 0.4238114156508658,
"learning_rate": 4.071068830442082e-05,
"loss": 0.7685,
"step": 531
},
{
"epoch": 0.8030188679245283,
"grad_norm": 0.48625846049027577,
"learning_rate": 4.068270844991606e-05,
"loss": 0.6887,
"step": 532
},
{
"epoch": 0.8045283018867925,
"grad_norm": 0.4417344646967358,
"learning_rate": 4.0654728595411306e-05,
"loss": 0.8144,
"step": 533
},
{
"epoch": 0.8060377358490566,
"grad_norm": 0.38616318769298813,
"learning_rate": 4.062674874090655e-05,
"loss": 0.7711,
"step": 534
},
{
"epoch": 0.8075471698113208,
"grad_norm": 0.4732687688025921,
"learning_rate": 4.059876888640179e-05,
"loss": 0.7262,
"step": 535
},
{
"epoch": 0.8090566037735849,
"grad_norm": 0.3662632839035369,
"learning_rate": 4.0570789031897036e-05,
"loss": 0.7762,
"step": 536
},
{
"epoch": 0.810566037735849,
"grad_norm": 0.41660011598030444,
"learning_rate": 4.054280917739228e-05,
"loss": 0.8063,
"step": 537
},
{
"epoch": 0.8120754716981132,
"grad_norm": 0.5061770396979196,
"learning_rate": 4.051482932288752e-05,
"loss": 0.8055,
"step": 538
},
{
"epoch": 0.8135849056603773,
"grad_norm": 0.4321594676145302,
"learning_rate": 4.0486849468382766e-05,
"loss": 0.6895,
"step": 539
},
{
"epoch": 0.8150943396226416,
"grad_norm": 0.4154273903737854,
"learning_rate": 4.045886961387801e-05,
"loss": 0.8246,
"step": 540
},
{
"epoch": 0.8166037735849057,
"grad_norm": 0.44089593064248783,
"learning_rate": 4.043088975937325e-05,
"loss": 0.6919,
"step": 541
},
{
"epoch": 0.8181132075471698,
"grad_norm": 0.3931592707406957,
"learning_rate": 4.0402909904868496e-05,
"loss": 0.7232,
"step": 542
},
{
"epoch": 0.819622641509434,
"grad_norm": 0.3824488662725976,
"learning_rate": 4.037493005036374e-05,
"loss": 0.678,
"step": 543
},
{
"epoch": 0.8211320754716981,
"grad_norm": 0.47459855214234453,
"learning_rate": 4.034695019585898e-05,
"loss": 0.6916,
"step": 544
},
{
"epoch": 0.8226415094339623,
"grad_norm": 0.4092162364813964,
"learning_rate": 4.0318970341354226e-05,
"loss": 0.7284,
"step": 545
},
{
"epoch": 0.8241509433962264,
"grad_norm": 0.40158642010989914,
"learning_rate": 4.029099048684947e-05,
"loss": 0.7858,
"step": 546
},
{
"epoch": 0.8256603773584905,
"grad_norm": 0.47547007845195716,
"learning_rate": 4.026301063234472e-05,
"loss": 0.7484,
"step": 547
},
{
"epoch": 0.8271698113207547,
"grad_norm": 0.41353726994458534,
"learning_rate": 4.0235030777839956e-05,
"loss": 0.6895,
"step": 548
},
{
"epoch": 0.8286792452830188,
"grad_norm": 0.4261847130879816,
"learning_rate": 4.02070509233352e-05,
"loss": 0.7615,
"step": 549
},
{
"epoch": 0.8301886792452831,
"grad_norm": 0.3713485609487883,
"learning_rate": 4.017907106883044e-05,
"loss": 0.702,
"step": 550
},
{
"epoch": 0.8316981132075472,
"grad_norm": 0.4940211687654344,
"learning_rate": 4.0151091214325686e-05,
"loss": 0.7926,
"step": 551
},
{
"epoch": 0.8332075471698113,
"grad_norm": 0.4418268573162059,
"learning_rate": 4.012311135982093e-05,
"loss": 0.7477,
"step": 552
},
{
"epoch": 0.8347169811320755,
"grad_norm": 0.4296889446715816,
"learning_rate": 4.009513150531617e-05,
"loss": 0.7066,
"step": 553
},
{
"epoch": 0.8362264150943396,
"grad_norm": 0.43481881270259315,
"learning_rate": 4.0067151650811416e-05,
"loss": 0.7104,
"step": 554
},
{
"epoch": 0.8377358490566038,
"grad_norm": 0.444949300217321,
"learning_rate": 4.003917179630666e-05,
"loss": 0.8165,
"step": 555
},
{
"epoch": 0.8392452830188679,
"grad_norm": 0.4167038227484225,
"learning_rate": 4.00111919418019e-05,
"loss": 0.7655,
"step": 556
},
{
"epoch": 0.840754716981132,
"grad_norm": 0.46442021729984984,
"learning_rate": 3.998321208729715e-05,
"loss": 0.8047,
"step": 557
},
{
"epoch": 0.8422641509433962,
"grad_norm": 0.452338392615249,
"learning_rate": 3.995523223279239e-05,
"loss": 0.6587,
"step": 558
},
{
"epoch": 0.8437735849056603,
"grad_norm": 0.41153779842520494,
"learning_rate": 3.992725237828763e-05,
"loss": 0.7026,
"step": 559
},
{
"epoch": 0.8452830188679246,
"grad_norm": 2.6556838900087136,
"learning_rate": 3.989927252378288e-05,
"loss": 0.7276,
"step": 560
},
{
"epoch": 0.8467924528301887,
"grad_norm": 0.4051057742589149,
"learning_rate": 3.987129266927812e-05,
"loss": 0.7598,
"step": 561
},
{
"epoch": 0.8483018867924528,
"grad_norm": 0.4783860215225303,
"learning_rate": 3.9843312814773363e-05,
"loss": 0.7517,
"step": 562
},
{
"epoch": 0.849811320754717,
"grad_norm": 0.379963266361194,
"learning_rate": 3.981533296026861e-05,
"loss": 0.7294,
"step": 563
},
{
"epoch": 0.8513207547169811,
"grad_norm": 0.3995738900162358,
"learning_rate": 3.978735310576385e-05,
"loss": 0.7339,
"step": 564
},
{
"epoch": 0.8528301886792453,
"grad_norm": 0.4429737030646909,
"learning_rate": 3.9759373251259093e-05,
"loss": 0.7687,
"step": 565
},
{
"epoch": 0.8543396226415094,
"grad_norm": 0.35025220758613634,
"learning_rate": 3.9731393396754344e-05,
"loss": 0.7217,
"step": 566
},
{
"epoch": 0.8558490566037736,
"grad_norm": 0.5313539267701104,
"learning_rate": 3.970341354224959e-05,
"loss": 0.6841,
"step": 567
},
{
"epoch": 0.8573584905660377,
"grad_norm": 0.44159011121842684,
"learning_rate": 3.9675433687744824e-05,
"loss": 0.7777,
"step": 568
},
{
"epoch": 0.8588679245283019,
"grad_norm": 0.5032221283954244,
"learning_rate": 3.964745383324007e-05,
"loss": 0.7903,
"step": 569
},
{
"epoch": 0.8603773584905661,
"grad_norm": 0.5209160538052351,
"learning_rate": 3.961947397873531e-05,
"loss": 0.7832,
"step": 570
},
{
"epoch": 0.8618867924528302,
"grad_norm": 0.4117034313241634,
"learning_rate": 3.9591494124230554e-05,
"loss": 0.7483,
"step": 571
},
{
"epoch": 0.8633962264150944,
"grad_norm": 0.6056396156171621,
"learning_rate": 3.95635142697258e-05,
"loss": 0.7845,
"step": 572
},
{
"epoch": 0.8649056603773585,
"grad_norm": 0.4032653916231986,
"learning_rate": 3.953553441522104e-05,
"loss": 0.7263,
"step": 573
},
{
"epoch": 0.8664150943396226,
"grad_norm": 0.45120438719371664,
"learning_rate": 3.9507554560716284e-05,
"loss": 0.6961,
"step": 574
},
{
"epoch": 0.8679245283018868,
"grad_norm": 0.4944595803156947,
"learning_rate": 3.9479574706211534e-05,
"loss": 0.7188,
"step": 575
},
{
"epoch": 0.8694339622641509,
"grad_norm": 0.46427671257440184,
"learning_rate": 3.945159485170678e-05,
"loss": 0.798,
"step": 576
},
{
"epoch": 0.8709433962264151,
"grad_norm": 0.6434219625626204,
"learning_rate": 3.942361499720202e-05,
"loss": 0.7815,
"step": 577
},
{
"epoch": 0.8724528301886793,
"grad_norm": 0.4620657198048205,
"learning_rate": 3.9395635142697264e-05,
"loss": 0.7327,
"step": 578
},
{
"epoch": 0.8739622641509434,
"grad_norm": 0.40262252638288304,
"learning_rate": 3.93676552881925e-05,
"loss": 0.7607,
"step": 579
},
{
"epoch": 0.8754716981132076,
"grad_norm": 0.45983570698328446,
"learning_rate": 3.9339675433687744e-05,
"loss": 0.7012,
"step": 580
},
{
"epoch": 0.8769811320754717,
"grad_norm": 0.40363702713690075,
"learning_rate": 3.931169557918299e-05,
"loss": 0.7372,
"step": 581
},
{
"epoch": 0.8784905660377359,
"grad_norm": 0.3907346317291026,
"learning_rate": 3.928371572467823e-05,
"loss": 0.7761,
"step": 582
},
{
"epoch": 0.88,
"grad_norm": 0.3990895685405329,
"learning_rate": 3.9255735870173474e-05,
"loss": 0.726,
"step": 583
},
{
"epoch": 0.8815094339622641,
"grad_norm": 0.3729973185332231,
"learning_rate": 3.922775601566872e-05,
"loss": 0.7677,
"step": 584
},
{
"epoch": 0.8830188679245283,
"grad_norm": 0.4089477576652253,
"learning_rate": 3.919977616116397e-05,
"loss": 0.7048,
"step": 585
},
{
"epoch": 0.8845283018867924,
"grad_norm": 0.3945532227748924,
"learning_rate": 3.917179630665921e-05,
"loss": 0.7714,
"step": 586
},
{
"epoch": 0.8860377358490567,
"grad_norm": 0.5140900259958995,
"learning_rate": 3.9143816452154454e-05,
"loss": 0.7382,
"step": 587
},
{
"epoch": 0.8875471698113208,
"grad_norm": 0.38891901989138217,
"learning_rate": 3.91158365976497e-05,
"loss": 0.8208,
"step": 588
},
{
"epoch": 0.8890566037735849,
"grad_norm": 0.49569695718152956,
"learning_rate": 3.9087856743144934e-05,
"loss": 0.7436,
"step": 589
},
{
"epoch": 0.8905660377358491,
"grad_norm": 0.43285829098870593,
"learning_rate": 3.905987688864018e-05,
"loss": 0.7365,
"step": 590
},
{
"epoch": 0.8920754716981132,
"grad_norm": 0.36885687866534356,
"learning_rate": 3.903189703413542e-05,
"loss": 0.7022,
"step": 591
},
{
"epoch": 0.8935849056603774,
"grad_norm": 0.43241043983514127,
"learning_rate": 3.9003917179630664e-05,
"loss": 0.8662,
"step": 592
},
{
"epoch": 0.8950943396226415,
"grad_norm": 0.3649761816139934,
"learning_rate": 3.897593732512591e-05,
"loss": 0.7368,
"step": 593
},
{
"epoch": 0.8966037735849056,
"grad_norm": 0.6268302532119931,
"learning_rate": 3.894795747062116e-05,
"loss": 0.7893,
"step": 594
},
{
"epoch": 0.8981132075471698,
"grad_norm": 0.3667724539461183,
"learning_rate": 3.89199776161164e-05,
"loss": 0.7488,
"step": 595
},
{
"epoch": 0.8996226415094339,
"grad_norm": 0.48371976093317937,
"learning_rate": 3.8891997761611645e-05,
"loss": 0.7535,
"step": 596
},
{
"epoch": 0.9011320754716982,
"grad_norm": 0.4239299088649613,
"learning_rate": 3.886401790710689e-05,
"loss": 0.7663,
"step": 597
},
{
"epoch": 0.9026415094339623,
"grad_norm": 0.3779199263406811,
"learning_rate": 3.883603805260213e-05,
"loss": 0.7457,
"step": 598
},
{
"epoch": 0.9041509433962264,
"grad_norm": 0.9168684585034359,
"learning_rate": 3.880805819809737e-05,
"loss": 0.7519,
"step": 599
},
{
"epoch": 0.9056603773584906,
"grad_norm": 0.3661469557119822,
"learning_rate": 3.878007834359261e-05,
"loss": 0.7733,
"step": 600
},
{
"epoch": 0.9071698113207547,
"grad_norm": 3.9823358965275806,
"learning_rate": 3.8752098489087855e-05,
"loss": 0.9001,
"step": 601
},
{
"epoch": 0.9086792452830189,
"grad_norm": 0.7852186633819904,
"learning_rate": 3.87241186345831e-05,
"loss": 0.7157,
"step": 602
},
{
"epoch": 0.910188679245283,
"grad_norm": 0.5408079896292302,
"learning_rate": 3.869613878007835e-05,
"loss": 0.7022,
"step": 603
},
{
"epoch": 0.9116981132075471,
"grad_norm": 0.7834797565859986,
"learning_rate": 3.866815892557359e-05,
"loss": 0.6932,
"step": 604
},
{
"epoch": 0.9132075471698113,
"grad_norm": 0.5709707609818889,
"learning_rate": 3.8640179071068835e-05,
"loss": 0.7641,
"step": 605
},
{
"epoch": 0.9147169811320754,
"grad_norm": 0.5337869168215333,
"learning_rate": 3.861219921656408e-05,
"loss": 0.7806,
"step": 606
},
{
"epoch": 0.9162264150943397,
"grad_norm": 0.45607562967499066,
"learning_rate": 3.858421936205932e-05,
"loss": 0.6848,
"step": 607
},
{
"epoch": 0.9177358490566038,
"grad_norm": 0.5120296828216754,
"learning_rate": 3.8556239507554565e-05,
"loss": 0.7603,
"step": 608
},
{
"epoch": 0.9192452830188679,
"grad_norm": 0.4344514448742209,
"learning_rate": 3.852825965304981e-05,
"loss": 0.6694,
"step": 609
},
{
"epoch": 0.9207547169811321,
"grad_norm": 0.5404883518478442,
"learning_rate": 3.8500279798545045e-05,
"loss": 0.7228,
"step": 610
},
{
"epoch": 0.9222641509433962,
"grad_norm": 0.4970271679168196,
"learning_rate": 3.847229994404029e-05,
"loss": 0.7797,
"step": 611
},
{
"epoch": 0.9237735849056604,
"grad_norm": 0.49505737921449006,
"learning_rate": 3.844432008953553e-05,
"loss": 0.7037,
"step": 612
},
{
"epoch": 0.9252830188679245,
"grad_norm": 0.7739566407768966,
"learning_rate": 3.841634023503078e-05,
"loss": 0.7542,
"step": 613
},
{
"epoch": 0.9267924528301886,
"grad_norm": 0.4238126871917205,
"learning_rate": 3.8388360380526025e-05,
"loss": 0.7202,
"step": 614
},
{
"epoch": 0.9283018867924528,
"grad_norm": 0.6151008080921576,
"learning_rate": 3.836038052602127e-05,
"loss": 0.7636,
"step": 615
},
{
"epoch": 0.929811320754717,
"grad_norm": 0.42881880105092096,
"learning_rate": 3.833240067151651e-05,
"loss": 0.7613,
"step": 616
},
{
"epoch": 0.9313207547169812,
"grad_norm": 0.5146237853082963,
"learning_rate": 3.8304420817011755e-05,
"loss": 0.715,
"step": 617
},
{
"epoch": 0.9328301886792453,
"grad_norm": 0.3714673908627866,
"learning_rate": 3.8276440962507e-05,
"loss": 0.745,
"step": 618
},
{
"epoch": 0.9343396226415094,
"grad_norm": 0.4650679347629378,
"learning_rate": 3.824846110800224e-05,
"loss": 0.7422,
"step": 619
},
{
"epoch": 0.9358490566037736,
"grad_norm": 1.0749664641896852,
"learning_rate": 3.822048125349748e-05,
"loss": 0.7985,
"step": 620
},
{
"epoch": 0.9373584905660377,
"grad_norm": 0.40863953645268636,
"learning_rate": 3.819250139899272e-05,
"loss": 0.7385,
"step": 621
},
{
"epoch": 0.9388679245283019,
"grad_norm": 0.40252351812274834,
"learning_rate": 3.816452154448797e-05,
"loss": 0.7643,
"step": 622
},
{
"epoch": 0.940377358490566,
"grad_norm": 0.43559572158086535,
"learning_rate": 3.8136541689983215e-05,
"loss": 0.7439,
"step": 623
},
{
"epoch": 0.9418867924528301,
"grad_norm": 0.9641369155300525,
"learning_rate": 3.810856183547846e-05,
"loss": 0.7718,
"step": 624
},
{
"epoch": 0.9433962264150944,
"grad_norm": 0.4310013200104187,
"learning_rate": 3.80805819809737e-05,
"loss": 0.7925,
"step": 625
},
{
"epoch": 0.9449056603773585,
"grad_norm": 0.4194039790316717,
"learning_rate": 3.8052602126468945e-05,
"loss": 0.7287,
"step": 626
},
{
"epoch": 0.9464150943396227,
"grad_norm": 0.4665354497735913,
"learning_rate": 3.802462227196419e-05,
"loss": 0.6991,
"step": 627
},
{
"epoch": 0.9479245283018868,
"grad_norm": 0.45730976704052134,
"learning_rate": 3.799664241745943e-05,
"loss": 0.7642,
"step": 628
},
{
"epoch": 0.9494339622641509,
"grad_norm": 0.3856324295687261,
"learning_rate": 3.7968662562954675e-05,
"loss": 0.6671,
"step": 629
},
{
"epoch": 0.9509433962264151,
"grad_norm": 0.4107928434108811,
"learning_rate": 3.794068270844992e-05,
"loss": 0.7561,
"step": 630
},
{
"epoch": 0.9524528301886792,
"grad_norm": 0.44719174208858464,
"learning_rate": 3.791270285394516e-05,
"loss": 0.8235,
"step": 631
},
{
"epoch": 0.9539622641509434,
"grad_norm": 0.4008367508350329,
"learning_rate": 3.7884722999440406e-05,
"loss": 0.7292,
"step": 632
},
{
"epoch": 0.9554716981132075,
"grad_norm": 0.4391202885533926,
"learning_rate": 3.785674314493565e-05,
"loss": 0.8191,
"step": 633
},
{
"epoch": 0.9569811320754718,
"grad_norm": 0.42891554570166274,
"learning_rate": 3.782876329043089e-05,
"loss": 0.8442,
"step": 634
},
{
"epoch": 0.9584905660377359,
"grad_norm": 0.43000876777383873,
"learning_rate": 3.7800783435926136e-05,
"loss": 0.7543,
"step": 635
},
{
"epoch": 0.96,
"grad_norm": 0.4155832959292569,
"learning_rate": 3.777280358142138e-05,
"loss": 0.729,
"step": 636
},
{
"epoch": 0.9615094339622642,
"grad_norm": 1.3022894310168054,
"learning_rate": 3.774482372691662e-05,
"loss": 0.7624,
"step": 637
},
{
"epoch": 0.9630188679245283,
"grad_norm": 0.39110065047725334,
"learning_rate": 3.7716843872411866e-05,
"loss": 0.7788,
"step": 638
},
{
"epoch": 0.9645283018867925,
"grad_norm": 0.4420204425883805,
"learning_rate": 3.768886401790711e-05,
"loss": 0.8003,
"step": 639
},
{
"epoch": 0.9660377358490566,
"grad_norm": 0.47846340432718537,
"learning_rate": 3.766088416340235e-05,
"loss": 0.661,
"step": 640
},
{
"epoch": 0.9675471698113207,
"grad_norm": 0.45386680890224157,
"learning_rate": 3.7632904308897596e-05,
"loss": 0.7336,
"step": 641
},
{
"epoch": 0.9690566037735849,
"grad_norm": 0.4666290433954947,
"learning_rate": 3.760492445439284e-05,
"loss": 0.8496,
"step": 642
},
{
"epoch": 0.970566037735849,
"grad_norm": 0.5022567954654142,
"learning_rate": 3.757694459988808e-05,
"loss": 0.7529,
"step": 643
},
{
"epoch": 0.9720754716981133,
"grad_norm": 0.49014277465359335,
"learning_rate": 3.7548964745383326e-05,
"loss": 0.7722,
"step": 644
},
{
"epoch": 0.9735849056603774,
"grad_norm": 0.5190484716624026,
"learning_rate": 3.752098489087857e-05,
"loss": 0.723,
"step": 645
},
{
"epoch": 0.9750943396226415,
"grad_norm": 0.41311399967316736,
"learning_rate": 3.749300503637381e-05,
"loss": 0.7089,
"step": 646
},
{
"epoch": 0.9766037735849057,
"grad_norm": 0.4396273185284637,
"learning_rate": 3.7465025181869056e-05,
"loss": 0.7921,
"step": 647
},
{
"epoch": 0.9781132075471698,
"grad_norm": 0.4228832100437926,
"learning_rate": 3.74370453273643e-05,
"loss": 0.7376,
"step": 648
},
{
"epoch": 0.979622641509434,
"grad_norm": 0.42404331809881607,
"learning_rate": 3.740906547285954e-05,
"loss": 0.8042,
"step": 649
},
{
"epoch": 0.9811320754716981,
"grad_norm": 0.43384836343190214,
"learning_rate": 3.7381085618354786e-05,
"loss": 0.7457,
"step": 650
},
{
"epoch": 0.9826415094339622,
"grad_norm": 0.40991552369681833,
"learning_rate": 3.735310576385003e-05,
"loss": 0.7271,
"step": 651
},
{
"epoch": 0.9841509433962264,
"grad_norm": 0.42720312031263036,
"learning_rate": 3.732512590934527e-05,
"loss": 0.6783,
"step": 652
},
{
"epoch": 0.9856603773584905,
"grad_norm": 0.3820997114893428,
"learning_rate": 3.7297146054840516e-05,
"loss": 0.741,
"step": 653
},
{
"epoch": 0.9871698113207548,
"grad_norm": 0.3859225460582329,
"learning_rate": 3.726916620033576e-05,
"loss": 0.7616,
"step": 654
},
{
"epoch": 0.9886792452830189,
"grad_norm": 0.3184257704684841,
"learning_rate": 3.7241186345831e-05,
"loss": 0.7018,
"step": 655
},
{
"epoch": 0.990188679245283,
"grad_norm": 5.0525018446915455,
"learning_rate": 3.7213206491326246e-05,
"loss": 0.6822,
"step": 656
},
{
"epoch": 0.9916981132075472,
"grad_norm": 0.4774160070976838,
"learning_rate": 3.718522663682149e-05,
"loss": 0.759,
"step": 657
},
{
"epoch": 0.9932075471698113,
"grad_norm": 0.38749071520309747,
"learning_rate": 3.715724678231673e-05,
"loss": 0.681,
"step": 658
},
{
"epoch": 0.9947169811320755,
"grad_norm": 0.3565441160393527,
"learning_rate": 3.7129266927811976e-05,
"loss": 0.7235,
"step": 659
},
{
"epoch": 0.9962264150943396,
"grad_norm": 0.3620064697768303,
"learning_rate": 3.710128707330722e-05,
"loss": 0.702,
"step": 660
},
{
"epoch": 0.9977358490566037,
"grad_norm": 0.3997483511508158,
"learning_rate": 3.707330721880247e-05,
"loss": 0.7241,
"step": 661
},
{
"epoch": 0.999245283018868,
"grad_norm": 0.9076134480414219,
"learning_rate": 3.7045327364297706e-05,
"loss": 0.7179,
"step": 662
},
{
"epoch": 1.0,
"grad_norm": 0.9076134480414219,
"learning_rate": 3.701734750979295e-05,
"loss": 0.7348,
"step": 663
},
{
"epoch": 1.001509433962264,
"grad_norm": 0.6627110764246336,
"learning_rate": 3.698936765528819e-05,
"loss": 0.6885,
"step": 664
},
{
"epoch": 1.0030188679245282,
"grad_norm": 10.266344586969133,
"learning_rate": 3.6961387800783437e-05,
"loss": 1.0522,
"step": 665
},
{
"epoch": 1.0045283018867925,
"grad_norm": 0.810534888309623,
"learning_rate": 3.693340794627868e-05,
"loss": 0.6195,
"step": 666
},
{
"epoch": 1.0060377358490566,
"grad_norm": 1.7755771547569201,
"learning_rate": 3.690542809177392e-05,
"loss": 0.6254,
"step": 667
},
{
"epoch": 1.0075471698113208,
"grad_norm": 0.6892452310322204,
"learning_rate": 3.687744823726917e-05,
"loss": 0.6086,
"step": 668
},
{
"epoch": 1.0090566037735849,
"grad_norm": 0.5926694715314103,
"learning_rate": 3.684946838276441e-05,
"loss": 0.5926,
"step": 669
},
{
"epoch": 1.010566037735849,
"grad_norm": 0.41200219649494063,
"learning_rate": 3.682148852825965e-05,
"loss": 0.6277,
"step": 670
},
{
"epoch": 1.0120754716981133,
"grad_norm": 0.8158461895891851,
"learning_rate": 3.6793508673754903e-05,
"loss": 0.6335,
"step": 671
},
{
"epoch": 1.0135849056603774,
"grad_norm": 0.601172077942669,
"learning_rate": 3.676552881925014e-05,
"loss": 0.6547,
"step": 672
},
{
"epoch": 1.0150943396226415,
"grad_norm": 0.4134386101441369,
"learning_rate": 3.6737548964745383e-05,
"loss": 0.6927,
"step": 673
},
{
"epoch": 1.0166037735849056,
"grad_norm": 0.5120306933334605,
"learning_rate": 3.670956911024063e-05,
"loss": 0.6646,
"step": 674
},
{
"epoch": 1.0181132075471697,
"grad_norm": 0.5525313287248008,
"learning_rate": 3.668158925573587e-05,
"loss": 0.6283,
"step": 675
},
{
"epoch": 1.019622641509434,
"grad_norm": 0.42590696095726077,
"learning_rate": 3.6653609401231114e-05,
"loss": 0.7062,
"step": 676
},
{
"epoch": 1.0211320754716982,
"grad_norm": 0.724226525712548,
"learning_rate": 3.662562954672636e-05,
"loss": 0.6758,
"step": 677
},
{
"epoch": 1.0226415094339623,
"grad_norm": 0.5674130150381019,
"learning_rate": 3.65976496922216e-05,
"loss": 0.6043,
"step": 678
},
{
"epoch": 1.0241509433962264,
"grad_norm": 0.4079281128046953,
"learning_rate": 3.6569669837716844e-05,
"loss": 0.6014,
"step": 679
},
{
"epoch": 1.0256603773584905,
"grad_norm": 0.46953990551509994,
"learning_rate": 3.6541689983212094e-05,
"loss": 0.7177,
"step": 680
},
{
"epoch": 1.0271698113207548,
"grad_norm": 0.4545817525144365,
"learning_rate": 3.651371012870734e-05,
"loss": 0.6133,
"step": 681
},
{
"epoch": 1.028679245283019,
"grad_norm": 0.38317634414943963,
"learning_rate": 3.648573027420258e-05,
"loss": 0.5863,
"step": 682
},
{
"epoch": 1.030188679245283,
"grad_norm": 0.4097662107255479,
"learning_rate": 3.645775041969782e-05,
"loss": 0.6544,
"step": 683
},
{
"epoch": 1.0316981132075471,
"grad_norm": 0.5667177083373588,
"learning_rate": 3.642977056519306e-05,
"loss": 0.6172,
"step": 684
},
{
"epoch": 1.0332075471698112,
"grad_norm": 0.3606672570163167,
"learning_rate": 3.6401790710688304e-05,
"loss": 0.6386,
"step": 685
},
{
"epoch": 1.0347169811320756,
"grad_norm": 0.4911306296070032,
"learning_rate": 3.637381085618355e-05,
"loss": 0.6652,
"step": 686
},
{
"epoch": 1.0362264150943397,
"grad_norm": 2.4217324647576763,
"learning_rate": 3.634583100167879e-05,
"loss": 0.7402,
"step": 687
},
{
"epoch": 1.0377358490566038,
"grad_norm": 0.4824187002755876,
"learning_rate": 3.6317851147174034e-05,
"loss": 0.711,
"step": 688
},
{
"epoch": 1.0392452830188679,
"grad_norm": 0.49721674246607617,
"learning_rate": 3.6289871292669284e-05,
"loss": 0.5908,
"step": 689
},
{
"epoch": 1.040754716981132,
"grad_norm": 0.3743482427307361,
"learning_rate": 3.626189143816453e-05,
"loss": 0.6319,
"step": 690
},
{
"epoch": 1.0422641509433963,
"grad_norm": 0.43291507293459125,
"learning_rate": 3.623391158365977e-05,
"loss": 0.6705,
"step": 691
},
{
"epoch": 1.0437735849056604,
"grad_norm": 0.5208705268592786,
"learning_rate": 3.6205931729155014e-05,
"loss": 0.6039,
"step": 692
},
{
"epoch": 1.0452830188679245,
"grad_norm": 0.410450044767452,
"learning_rate": 3.617795187465025e-05,
"loss": 0.6525,
"step": 693
},
{
"epoch": 1.0467924528301886,
"grad_norm": 0.4022445567989694,
"learning_rate": 3.6149972020145494e-05,
"loss": 0.6954,
"step": 694
},
{
"epoch": 1.0483018867924527,
"grad_norm": 0.47132444327306144,
"learning_rate": 3.612199216564074e-05,
"loss": 0.6521,
"step": 695
},
{
"epoch": 1.049811320754717,
"grad_norm": 0.3875966766440471,
"learning_rate": 3.609401231113598e-05,
"loss": 0.614,
"step": 696
},
{
"epoch": 1.0513207547169812,
"grad_norm": 0.451663223959601,
"learning_rate": 3.6066032456631224e-05,
"loss": 0.6445,
"step": 697
},
{
"epoch": 1.0528301886792453,
"grad_norm": 0.41172392506051253,
"learning_rate": 3.603805260212647e-05,
"loss": 0.6422,
"step": 698
},
{
"epoch": 1.0543396226415094,
"grad_norm": 0.375540946924188,
"learning_rate": 3.601007274762172e-05,
"loss": 0.652,
"step": 699
},
{
"epoch": 1.0558490566037735,
"grad_norm": 0.3717387172700223,
"learning_rate": 3.598209289311696e-05,
"loss": 0.6777,
"step": 700
},
{
"epoch": 1.0573584905660378,
"grad_norm": 0.424782758304706,
"learning_rate": 3.5954113038612204e-05,
"loss": 0.6228,
"step": 701
},
{
"epoch": 1.058867924528302,
"grad_norm": 0.3391925209354729,
"learning_rate": 3.592613318410745e-05,
"loss": 0.6024,
"step": 702
},
{
"epoch": 1.060377358490566,
"grad_norm": 0.32914720950015897,
"learning_rate": 3.5898153329602684e-05,
"loss": 0.6457,
"step": 703
},
{
"epoch": 1.0618867924528301,
"grad_norm": 1.0554743822729729,
"learning_rate": 3.587017347509793e-05,
"loss": 0.5799,
"step": 704
},
{
"epoch": 1.0633962264150942,
"grad_norm": 0.39339053656170486,
"learning_rate": 3.584219362059317e-05,
"loss": 0.5598,
"step": 705
},
{
"epoch": 1.0649056603773586,
"grad_norm": 0.40135937465077187,
"learning_rate": 3.5814213766088414e-05,
"loss": 0.6226,
"step": 706
},
{
"epoch": 1.0664150943396227,
"grad_norm": 0.43288336102917757,
"learning_rate": 3.578623391158366e-05,
"loss": 0.6274,
"step": 707
},
{
"epoch": 1.0679245283018868,
"grad_norm": 0.4521991854235038,
"learning_rate": 3.575825405707891e-05,
"loss": 0.6483,
"step": 708
},
{
"epoch": 1.0694339622641509,
"grad_norm": 0.402604567263997,
"learning_rate": 3.573027420257415e-05,
"loss": 0.6309,
"step": 709
},
{
"epoch": 1.070943396226415,
"grad_norm": 0.46402872137647033,
"learning_rate": 3.5702294348069395e-05,
"loss": 0.639,
"step": 710
},
{
"epoch": 1.0724528301886793,
"grad_norm": 0.4017030194373752,
"learning_rate": 3.567431449356464e-05,
"loss": 0.6329,
"step": 711
},
{
"epoch": 1.0739622641509434,
"grad_norm": 0.47618502583548106,
"learning_rate": 3.564633463905988e-05,
"loss": 0.7084,
"step": 712
},
{
"epoch": 1.0754716981132075,
"grad_norm": 0.41264204371650326,
"learning_rate": 3.5618354784555125e-05,
"loss": 0.6365,
"step": 713
},
{
"epoch": 1.0769811320754716,
"grad_norm": 0.38772697844532533,
"learning_rate": 3.559037493005036e-05,
"loss": 0.6308,
"step": 714
},
{
"epoch": 1.0784905660377357,
"grad_norm": 0.46449961773150183,
"learning_rate": 3.5562395075545605e-05,
"loss": 0.6279,
"step": 715
},
{
"epoch": 1.08,
"grad_norm": 0.45078141741988764,
"learning_rate": 3.553441522104085e-05,
"loss": 0.6256,
"step": 716
},
{
"epoch": 1.0815094339622642,
"grad_norm": 0.5161091002718486,
"learning_rate": 3.55064353665361e-05,
"loss": 0.6283,
"step": 717
},
{
"epoch": 1.0830188679245283,
"grad_norm": 0.3938545925062218,
"learning_rate": 3.547845551203134e-05,
"loss": 0.6029,
"step": 718
},
{
"epoch": 1.0845283018867924,
"grad_norm": 0.4535260490235984,
"learning_rate": 3.5450475657526585e-05,
"loss": 0.6482,
"step": 719
},
{
"epoch": 1.0860377358490565,
"grad_norm": 0.3970289231442372,
"learning_rate": 3.542249580302183e-05,
"loss": 0.5763,
"step": 720
},
{
"epoch": 1.0875471698113208,
"grad_norm": 0.4039865278069702,
"learning_rate": 3.539451594851707e-05,
"loss": 0.6558,
"step": 721
},
{
"epoch": 1.089056603773585,
"grad_norm": 0.4202122162673237,
"learning_rate": 3.5366536094012315e-05,
"loss": 0.6462,
"step": 722
},
{
"epoch": 1.090566037735849,
"grad_norm": 0.38039927746822294,
"learning_rate": 3.533855623950756e-05,
"loss": 0.6544,
"step": 723
},
{
"epoch": 1.0920754716981131,
"grad_norm": 0.40116562127860167,
"learning_rate": 3.5310576385002795e-05,
"loss": 0.6408,
"step": 724
},
{
"epoch": 1.0935849056603772,
"grad_norm": 0.48128273610391287,
"learning_rate": 3.528259653049804e-05,
"loss": 0.5859,
"step": 725
},
{
"epoch": 1.0950943396226416,
"grad_norm": 0.42443398500645513,
"learning_rate": 3.525461667599328e-05,
"loss": 0.6272,
"step": 726
},
{
"epoch": 1.0966037735849057,
"grad_norm": 0.8589027269475118,
"learning_rate": 3.522663682148853e-05,
"loss": 0.6466,
"step": 727
},
{
"epoch": 1.0981132075471698,
"grad_norm": 1.3107651423337432,
"learning_rate": 3.5198656966983775e-05,
"loss": 0.6948,
"step": 728
},
{
"epoch": 1.099622641509434,
"grad_norm": 0.46154805702038726,
"learning_rate": 3.517067711247902e-05,
"loss": 0.647,
"step": 729
},
{
"epoch": 1.101132075471698,
"grad_norm": 0.4215490457807108,
"learning_rate": 3.514269725797426e-05,
"loss": 0.639,
"step": 730
},
{
"epoch": 1.1026415094339623,
"grad_norm": 0.44910831262776585,
"learning_rate": 3.5114717403469505e-05,
"loss": 0.6416,
"step": 731
},
{
"epoch": 1.1041509433962264,
"grad_norm": 0.4180279321095757,
"learning_rate": 3.508673754896475e-05,
"loss": 0.687,
"step": 732
},
{
"epoch": 1.1056603773584905,
"grad_norm": 0.3787657588954383,
"learning_rate": 3.505875769445999e-05,
"loss": 0.6842,
"step": 733
},
{
"epoch": 1.1071698113207546,
"grad_norm": 0.4577454096053106,
"learning_rate": 3.5030777839955235e-05,
"loss": 0.6741,
"step": 734
},
{
"epoch": 1.1086792452830188,
"grad_norm": 1.4461079332215183,
"learning_rate": 3.500279798545047e-05,
"loss": 0.6817,
"step": 735
},
{
"epoch": 1.110188679245283,
"grad_norm": 0.4104193651538478,
"learning_rate": 3.497481813094572e-05,
"loss": 0.6317,
"step": 736
},
{
"epoch": 1.1116981132075472,
"grad_norm": 0.655866457602048,
"learning_rate": 3.4946838276440965e-05,
"loss": 0.6313,
"step": 737
},
{
"epoch": 1.1132075471698113,
"grad_norm": 0.4531623276746969,
"learning_rate": 3.491885842193621e-05,
"loss": 0.5892,
"step": 738
},
{
"epoch": 1.1147169811320754,
"grad_norm": 0.5709351123804889,
"learning_rate": 3.489087856743145e-05,
"loss": 0.5634,
"step": 739
},
{
"epoch": 1.1162264150943395,
"grad_norm": 0.48674031702120063,
"learning_rate": 3.4862898712926696e-05,
"loss": 0.6801,
"step": 740
},
{
"epoch": 1.1177358490566038,
"grad_norm": 0.4048608285674579,
"learning_rate": 3.483491885842194e-05,
"loss": 0.5654,
"step": 741
},
{
"epoch": 1.119245283018868,
"grad_norm": 0.4626449984604195,
"learning_rate": 3.480693900391718e-05,
"loss": 0.6544,
"step": 742
},
{
"epoch": 1.120754716981132,
"grad_norm": 0.5183643140468202,
"learning_rate": 3.4778959149412426e-05,
"loss": 0.6651,
"step": 743
},
{
"epoch": 1.1222641509433962,
"grad_norm": 0.49226691831400854,
"learning_rate": 3.475097929490767e-05,
"loss": 0.6862,
"step": 744
},
{
"epoch": 1.1237735849056603,
"grad_norm": 4.415706986920795,
"learning_rate": 3.472299944040291e-05,
"loss": 0.6689,
"step": 745
},
{
"epoch": 1.1252830188679246,
"grad_norm": 0.7573980083060947,
"learning_rate": 3.4695019585898156e-05,
"loss": 0.6373,
"step": 746
},
{
"epoch": 1.1267924528301887,
"grad_norm": 0.5906345275939948,
"learning_rate": 3.46670397313934e-05,
"loss": 0.6309,
"step": 747
},
{
"epoch": 1.1283018867924528,
"grad_norm": 0.3998318687851271,
"learning_rate": 3.463905987688864e-05,
"loss": 0.6802,
"step": 748
},
{
"epoch": 1.129811320754717,
"grad_norm": 0.5467978795675057,
"learning_rate": 3.4611080022383886e-05,
"loss": 0.562,
"step": 749
},
{
"epoch": 1.131320754716981,
"grad_norm": 0.5191137071980696,
"learning_rate": 3.458310016787913e-05,
"loss": 0.6156,
"step": 750
},
{
"epoch": 1.1328301886792453,
"grad_norm": 0.44633904601748514,
"learning_rate": 3.455512031337437e-05,
"loss": 0.5838,
"step": 751
},
{
"epoch": 1.1343396226415094,
"grad_norm": 0.5065992896082492,
"learning_rate": 3.4527140458869616e-05,
"loss": 0.6843,
"step": 752
},
{
"epoch": 1.1358490566037736,
"grad_norm": 0.4264224918247215,
"learning_rate": 3.449916060436486e-05,
"loss": 0.6091,
"step": 753
},
{
"epoch": 1.1373584905660377,
"grad_norm": 0.3992177248827404,
"learning_rate": 3.44711807498601e-05,
"loss": 0.6,
"step": 754
},
{
"epoch": 1.1388679245283018,
"grad_norm": 0.48903828473507216,
"learning_rate": 3.4443200895355346e-05,
"loss": 0.6459,
"step": 755
},
{
"epoch": 1.140377358490566,
"grad_norm": 0.4105745437343364,
"learning_rate": 3.441522104085059e-05,
"loss": 0.6526,
"step": 756
},
{
"epoch": 1.1418867924528302,
"grad_norm": 0.49384071168597404,
"learning_rate": 3.438724118634583e-05,
"loss": 0.685,
"step": 757
},
{
"epoch": 1.1433962264150943,
"grad_norm": 0.5413740841399795,
"learning_rate": 3.4359261331841076e-05,
"loss": 0.6377,
"step": 758
},
{
"epoch": 1.1449056603773584,
"grad_norm": 0.480269969475746,
"learning_rate": 3.433128147733632e-05,
"loss": 0.6463,
"step": 759
},
{
"epoch": 1.1464150943396225,
"grad_norm": 0.5646301785912243,
"learning_rate": 3.430330162283156e-05,
"loss": 0.6265,
"step": 760
},
{
"epoch": 1.1479245283018868,
"grad_norm": 0.8673433809554542,
"learning_rate": 3.4275321768326806e-05,
"loss": 0.642,
"step": 761
},
{
"epoch": 1.149433962264151,
"grad_norm": 0.4112877387329245,
"learning_rate": 3.424734191382205e-05,
"loss": 0.6082,
"step": 762
},
{
"epoch": 1.150943396226415,
"grad_norm": 0.6320032562966805,
"learning_rate": 3.421936205931729e-05,
"loss": 0.6219,
"step": 763
},
{
"epoch": 1.1524528301886792,
"grad_norm": 0.46715710458859117,
"learning_rate": 3.4191382204812536e-05,
"loss": 0.6477,
"step": 764
},
{
"epoch": 1.1539622641509433,
"grad_norm": 0.5192541163104857,
"learning_rate": 3.416340235030778e-05,
"loss": 0.6686,
"step": 765
},
{
"epoch": 1.1554716981132076,
"grad_norm": 0.5115037747798377,
"learning_rate": 3.413542249580302e-05,
"loss": 0.5957,
"step": 766
},
{
"epoch": 1.1569811320754717,
"grad_norm": 0.4631548895084437,
"learning_rate": 3.4107442641298266e-05,
"loss": 0.6691,
"step": 767
},
{
"epoch": 1.1584905660377358,
"grad_norm": 0.4315295867507691,
"learning_rate": 3.407946278679351e-05,
"loss": 0.5968,
"step": 768
},
{
"epoch": 1.16,
"grad_norm": 0.5489635360542999,
"learning_rate": 3.405148293228875e-05,
"loss": 0.6286,
"step": 769
},
{
"epoch": 1.161509433962264,
"grad_norm": 0.5260776713612061,
"learning_rate": 3.4023503077783996e-05,
"loss": 0.6876,
"step": 770
},
{
"epoch": 1.1630188679245284,
"grad_norm": 0.41102121571815237,
"learning_rate": 3.399552322327924e-05,
"loss": 0.6875,
"step": 771
},
{
"epoch": 1.1645283018867925,
"grad_norm": 0.5051554044354539,
"learning_rate": 3.396754336877448e-05,
"loss": 0.6126,
"step": 772
},
{
"epoch": 1.1660377358490566,
"grad_norm": 0.4562465913464345,
"learning_rate": 3.3939563514269726e-05,
"loss": 0.6137,
"step": 773
},
{
"epoch": 1.1675471698113207,
"grad_norm": 0.4743549865343238,
"learning_rate": 3.391158365976497e-05,
"loss": 0.6315,
"step": 774
},
{
"epoch": 1.169056603773585,
"grad_norm": 0.5247109156016383,
"learning_rate": 3.388360380526022e-05,
"loss": 0.6428,
"step": 775
},
{
"epoch": 1.170566037735849,
"grad_norm": 3.559423434494465,
"learning_rate": 3.3855623950755457e-05,
"loss": 0.6067,
"step": 776
},
{
"epoch": 1.1720754716981132,
"grad_norm": 0.5989095184362712,
"learning_rate": 3.38276440962507e-05,
"loss": 0.6457,
"step": 777
},
{
"epoch": 1.1735849056603773,
"grad_norm": 0.5188354410044358,
"learning_rate": 3.379966424174594e-05,
"loss": 0.5998,
"step": 778
},
{
"epoch": 1.1750943396226414,
"grad_norm": 0.4214267781907578,
"learning_rate": 3.377168438724119e-05,
"loss": 0.6579,
"step": 779
},
{
"epoch": 1.1766037735849058,
"grad_norm": 0.45186691120990086,
"learning_rate": 3.374370453273643e-05,
"loss": 0.6321,
"step": 780
},
{
"epoch": 1.1781132075471699,
"grad_norm": 0.5363268074933428,
"learning_rate": 3.371572467823167e-05,
"loss": 0.6145,
"step": 781
},
{
"epoch": 1.179622641509434,
"grad_norm": 0.49420994902734927,
"learning_rate": 3.368774482372692e-05,
"loss": 0.6015,
"step": 782
},
{
"epoch": 1.181132075471698,
"grad_norm": 0.3799463287008628,
"learning_rate": 3.365976496922216e-05,
"loss": 0.7114,
"step": 783
},
{
"epoch": 1.1826415094339622,
"grad_norm": 0.4873082496659108,
"learning_rate": 3.3631785114717403e-05,
"loss": 0.6194,
"step": 784
},
{
"epoch": 1.1841509433962265,
"grad_norm": 0.4207411818460765,
"learning_rate": 3.3603805260212654e-05,
"loss": 0.6103,
"step": 785
},
{
"epoch": 1.1856603773584906,
"grad_norm": 0.5223740890535683,
"learning_rate": 3.35758254057079e-05,
"loss": 0.6299,
"step": 786
},
{
"epoch": 1.1871698113207547,
"grad_norm": 0.3569381738413025,
"learning_rate": 3.3547845551203134e-05,
"loss": 0.6105,
"step": 787
},
{
"epoch": 1.1886792452830188,
"grad_norm": 4.0178075776210775,
"learning_rate": 3.351986569669838e-05,
"loss": 0.6551,
"step": 788
},
{
"epoch": 1.190188679245283,
"grad_norm": 0.9315621690551965,
"learning_rate": 3.349188584219362e-05,
"loss": 0.6726,
"step": 789
},
{
"epoch": 1.1916981132075473,
"grad_norm": 0.4663325979493853,
"learning_rate": 3.3463905987688864e-05,
"loss": 0.6479,
"step": 790
},
{
"epoch": 1.1932075471698114,
"grad_norm": 0.8155128870027623,
"learning_rate": 3.343592613318411e-05,
"loss": 0.6321,
"step": 791
},
{
"epoch": 1.1947169811320755,
"grad_norm": 0.5354713553205344,
"learning_rate": 3.340794627867935e-05,
"loss": 0.6267,
"step": 792
},
{
"epoch": 1.1962264150943396,
"grad_norm": 0.8995514690519233,
"learning_rate": 3.3379966424174594e-05,
"loss": 0.6668,
"step": 793
},
{
"epoch": 1.1977358490566037,
"grad_norm": 0.42325453026549564,
"learning_rate": 3.3351986569669844e-05,
"loss": 0.6892,
"step": 794
},
{
"epoch": 1.199245283018868,
"grad_norm": 0.9019398748449015,
"learning_rate": 3.332400671516509e-05,
"loss": 0.6142,
"step": 795
},
{
"epoch": 1.2007547169811321,
"grad_norm": 0.42650778637925774,
"learning_rate": 3.329602686066033e-05,
"loss": 0.9311,
"step": 796
},
{
"epoch": 1.2022641509433962,
"grad_norm": 5.832651725718385,
"learning_rate": 3.326804700615557e-05,
"loss": 0.5994,
"step": 797
},
{
"epoch": 1.2037735849056603,
"grad_norm": 1.4055840202932217,
"learning_rate": 3.324006715165081e-05,
"loss": 0.6326,
"step": 798
},
{
"epoch": 1.2052830188679244,
"grad_norm": 0.5916688857583293,
"learning_rate": 3.3212087297146054e-05,
"loss": 0.6772,
"step": 799
},
{
"epoch": 1.2067924528301888,
"grad_norm": 0.9928690203020595,
"learning_rate": 3.31841074426413e-05,
"loss": 0.595,
"step": 800
},
{
"epoch": 1.2083018867924529,
"grad_norm": 0.9349255830289889,
"learning_rate": 3.315612758813654e-05,
"loss": 0.7102,
"step": 801
},
{
"epoch": 1.209811320754717,
"grad_norm": 0.7686689517909183,
"learning_rate": 3.3128147733631784e-05,
"loss": 0.6264,
"step": 802
},
{
"epoch": 1.211320754716981,
"grad_norm": 0.9656464748523269,
"learning_rate": 3.3100167879127034e-05,
"loss": 0.6611,
"step": 803
},
{
"epoch": 1.2128301886792452,
"grad_norm": 0.5809016244321841,
"learning_rate": 3.307218802462228e-05,
"loss": 0.6674,
"step": 804
},
{
"epoch": 1.2143396226415095,
"grad_norm": 0.8488641757151281,
"learning_rate": 3.304420817011752e-05,
"loss": 0.6724,
"step": 805
},
{
"epoch": 1.2158490566037736,
"grad_norm": 0.5115352162894421,
"learning_rate": 3.3016228315612764e-05,
"loss": 0.6244,
"step": 806
},
{
"epoch": 1.2173584905660377,
"grad_norm": 0.6746402205487975,
"learning_rate": 3.2988248461108e-05,
"loss": 0.6216,
"step": 807
},
{
"epoch": 1.2188679245283018,
"grad_norm": 0.5947361094685111,
"learning_rate": 3.2960268606603244e-05,
"loss": 0.6407,
"step": 808
},
{
"epoch": 1.220377358490566,
"grad_norm": 0.6118537661273408,
"learning_rate": 3.293228875209849e-05,
"loss": 0.6642,
"step": 809
},
{
"epoch": 1.2218867924528303,
"grad_norm": 0.5853643244928053,
"learning_rate": 3.290430889759373e-05,
"loss": 0.6274,
"step": 810
},
{
"epoch": 1.2233962264150944,
"grad_norm": 0.4408870524517529,
"learning_rate": 3.2876329043088974e-05,
"loss": 0.6762,
"step": 811
},
{
"epoch": 1.2249056603773585,
"grad_norm": 0.5786375937277597,
"learning_rate": 3.284834918858422e-05,
"loss": 0.618,
"step": 812
},
{
"epoch": 1.2264150943396226,
"grad_norm": 0.37448594144435,
"learning_rate": 3.282036933407947e-05,
"loss": 0.6316,
"step": 813
},
{
"epoch": 1.2279245283018867,
"grad_norm": 0.4860905984518343,
"learning_rate": 3.279238947957471e-05,
"loss": 0.6299,
"step": 814
},
{
"epoch": 1.229433962264151,
"grad_norm": 0.5849544770920747,
"learning_rate": 3.2764409625069954e-05,
"loss": 0.5729,
"step": 815
},
{
"epoch": 1.2309433962264151,
"grad_norm": 0.4008195084296223,
"learning_rate": 3.27364297705652e-05,
"loss": 0.6682,
"step": 816
},
{
"epoch": 1.2324528301886792,
"grad_norm": 0.5272118883156421,
"learning_rate": 3.270844991606044e-05,
"loss": 0.6049,
"step": 817
},
{
"epoch": 1.2339622641509433,
"grad_norm": 0.550682484167377,
"learning_rate": 3.268047006155568e-05,
"loss": 0.5918,
"step": 818
},
{
"epoch": 1.2354716981132075,
"grad_norm": 0.45899566874631015,
"learning_rate": 3.265249020705092e-05,
"loss": 0.6293,
"step": 819
},
{
"epoch": 1.2369811320754718,
"grad_norm": 0.41053358276258073,
"learning_rate": 3.2624510352546165e-05,
"loss": 0.5974,
"step": 820
},
{
"epoch": 1.2384905660377359,
"grad_norm": 0.5673047046076692,
"learning_rate": 3.259653049804141e-05,
"loss": 0.6689,
"step": 821
},
{
"epoch": 1.24,
"grad_norm": 0.42001733257774354,
"learning_rate": 3.256855064353666e-05,
"loss": 0.5781,
"step": 822
},
{
"epoch": 1.241509433962264,
"grad_norm": 0.5274791837888488,
"learning_rate": 3.25405707890319e-05,
"loss": 0.6556,
"step": 823
},
{
"epoch": 1.2430188679245284,
"grad_norm": 0.40140251232729945,
"learning_rate": 3.2512590934527145e-05,
"loss": 0.5618,
"step": 824
},
{
"epoch": 1.2445283018867925,
"grad_norm": 0.38988606187868724,
"learning_rate": 3.248461108002239e-05,
"loss": 0.6696,
"step": 825
},
{
"epoch": 1.2460377358490566,
"grad_norm": 0.6304384074605135,
"learning_rate": 3.245663122551763e-05,
"loss": 0.6769,
"step": 826
},
{
"epoch": 1.2475471698113207,
"grad_norm": 0.3979395391659101,
"learning_rate": 3.2428651371012875e-05,
"loss": 0.6174,
"step": 827
},
{
"epoch": 1.2490566037735849,
"grad_norm": 0.5329831109280307,
"learning_rate": 3.240067151650811e-05,
"loss": 0.6181,
"step": 828
},
{
"epoch": 1.2505660377358492,
"grad_norm": 0.4319961747815377,
"learning_rate": 3.2372691662003355e-05,
"loss": 0.6993,
"step": 829
},
{
"epoch": 1.2520754716981133,
"grad_norm": 0.4920198115969687,
"learning_rate": 3.23447118074986e-05,
"loss": 0.6775,
"step": 830
},
{
"epoch": 1.2535849056603774,
"grad_norm": 0.5327815925026391,
"learning_rate": 3.231673195299385e-05,
"loss": 0.6518,
"step": 831
},
{
"epoch": 1.2550943396226415,
"grad_norm": 0.4389886233997492,
"learning_rate": 3.228875209848909e-05,
"loss": 0.6365,
"step": 832
},
{
"epoch": 1.2566037735849056,
"grad_norm": 0.5047415384999543,
"learning_rate": 3.2260772243984335e-05,
"loss": 0.7062,
"step": 833
},
{
"epoch": 1.25811320754717,
"grad_norm": 0.4396225841431117,
"learning_rate": 3.223279238947958e-05,
"loss": 0.61,
"step": 834
},
{
"epoch": 1.259622641509434,
"grad_norm": 0.4564667026923815,
"learning_rate": 3.220481253497482e-05,
"loss": 0.6335,
"step": 835
},
{
"epoch": 1.2611320754716981,
"grad_norm": 0.39053767513919807,
"learning_rate": 3.2176832680470065e-05,
"loss": 0.6055,
"step": 836
},
{
"epoch": 1.2626415094339623,
"grad_norm": 0.3783353602690344,
"learning_rate": 3.214885282596531e-05,
"loss": 0.7125,
"step": 837
},
{
"epoch": 1.2641509433962264,
"grad_norm": 0.3553163046641145,
"learning_rate": 3.2120872971460545e-05,
"loss": 0.6143,
"step": 838
},
{
"epoch": 1.2656603773584907,
"grad_norm": 0.35917670960749537,
"learning_rate": 3.209289311695579e-05,
"loss": 0.5769,
"step": 839
},
{
"epoch": 1.2671698113207548,
"grad_norm": 0.36467830677027235,
"learning_rate": 3.206491326245103e-05,
"loss": 0.5699,
"step": 840
},
{
"epoch": 1.268679245283019,
"grad_norm": 0.3773609262039195,
"learning_rate": 3.203693340794628e-05,
"loss": 0.6271,
"step": 841
},
{
"epoch": 1.270188679245283,
"grad_norm": 0.3795609120240298,
"learning_rate": 3.2008953553441525e-05,
"loss": 0.6725,
"step": 842
},
{
"epoch": 1.271698113207547,
"grad_norm": 0.37050676683076367,
"learning_rate": 3.198097369893677e-05,
"loss": 0.5933,
"step": 843
},
{
"epoch": 1.2732075471698114,
"grad_norm": 0.45887192265355964,
"learning_rate": 3.195299384443201e-05,
"loss": 0.6373,
"step": 844
},
{
"epoch": 1.2747169811320755,
"grad_norm": 1.4950560043816827,
"learning_rate": 3.1925013989927255e-05,
"loss": 0.9561,
"step": 845
},
{
"epoch": 1.2762264150943397,
"grad_norm": 0.39850378462049746,
"learning_rate": 3.18970341354225e-05,
"loss": 0.6525,
"step": 846
},
{
"epoch": 1.2777358490566038,
"grad_norm": 0.4135301305547025,
"learning_rate": 3.186905428091774e-05,
"loss": 0.6623,
"step": 847
},
{
"epoch": 1.2792452830188679,
"grad_norm": 0.42557572265505506,
"learning_rate": 3.1841074426412985e-05,
"loss": 0.6964,
"step": 848
},
{
"epoch": 1.2807547169811322,
"grad_norm": 0.39760056122405496,
"learning_rate": 3.181309457190822e-05,
"loss": 0.6106,
"step": 849
},
{
"epoch": 1.2822641509433963,
"grad_norm": 0.40798606836706397,
"learning_rate": 3.178511471740347e-05,
"loss": 0.5632,
"step": 850
},
{
"epoch": 1.2837735849056604,
"grad_norm": 0.3462421636576025,
"learning_rate": 3.1757134862898716e-05,
"loss": 0.5894,
"step": 851
},
{
"epoch": 1.2852830188679245,
"grad_norm": 0.3722881735233624,
"learning_rate": 3.172915500839396e-05,
"loss": 0.6412,
"step": 852
},
{
"epoch": 1.2867924528301886,
"grad_norm": 0.37960556416454605,
"learning_rate": 3.17011751538892e-05,
"loss": 0.6001,
"step": 853
},
{
"epoch": 1.288301886792453,
"grad_norm": 0.4276713553028228,
"learning_rate": 3.1673195299384446e-05,
"loss": 0.6603,
"step": 854
},
{
"epoch": 1.289811320754717,
"grad_norm": 0.7095036362422655,
"learning_rate": 3.164521544487969e-05,
"loss": 0.6137,
"step": 855
},
{
"epoch": 1.2913207547169812,
"grad_norm": 0.4807408403609567,
"learning_rate": 3.161723559037493e-05,
"loss": 0.6588,
"step": 856
},
{
"epoch": 1.2928301886792453,
"grad_norm": 0.4358649697165436,
"learning_rate": 3.1589255735870176e-05,
"loss": 0.5926,
"step": 857
},
{
"epoch": 1.2943396226415094,
"grad_norm": 0.41013182956481836,
"learning_rate": 3.156127588136542e-05,
"loss": 0.6665,
"step": 858
},
{
"epoch": 1.2958490566037737,
"grad_norm": 0.41515671451727343,
"learning_rate": 3.153329602686066e-05,
"loss": 0.7209,
"step": 859
},
{
"epoch": 1.2973584905660378,
"grad_norm": 0.3942619832225865,
"learning_rate": 3.1505316172355906e-05,
"loss": 0.6667,
"step": 860
},
{
"epoch": 1.298867924528302,
"grad_norm": 0.4021542796788365,
"learning_rate": 3.147733631785115e-05,
"loss": 0.6465,
"step": 861
},
{
"epoch": 1.300377358490566,
"grad_norm": 0.37633875180949317,
"learning_rate": 3.144935646334639e-05,
"loss": 0.6343,
"step": 862
},
{
"epoch": 1.3018867924528301,
"grad_norm": 0.7184734573887464,
"learning_rate": 3.1421376608841636e-05,
"loss": 0.6515,
"step": 863
},
{
"epoch": 1.3033962264150944,
"grad_norm": 0.36958219984109786,
"learning_rate": 3.139339675433688e-05,
"loss": 0.6474,
"step": 864
},
{
"epoch": 1.3049056603773586,
"grad_norm": 0.41188446823897223,
"learning_rate": 3.136541689983212e-05,
"loss": 0.6052,
"step": 865
},
{
"epoch": 1.3064150943396227,
"grad_norm": 0.42281806811295863,
"learning_rate": 3.1337437045327366e-05,
"loss": 0.6652,
"step": 866
},
{
"epoch": 1.3079245283018868,
"grad_norm": 0.33175743854835926,
"learning_rate": 3.130945719082261e-05,
"loss": 0.5977,
"step": 867
},
{
"epoch": 1.3094339622641509,
"grad_norm": 0.37734563826001954,
"learning_rate": 3.128147733631785e-05,
"loss": 0.5966,
"step": 868
},
{
"epoch": 1.3109433962264152,
"grad_norm": 0.4182778269406502,
"learning_rate": 3.1253497481813096e-05,
"loss": 0.6583,
"step": 869
},
{
"epoch": 1.3124528301886793,
"grad_norm": 0.36140009515229593,
"learning_rate": 3.122551762730834e-05,
"loss": 0.6068,
"step": 870
},
{
"epoch": 1.3139622641509434,
"grad_norm": 0.37977481362373233,
"learning_rate": 3.119753777280358e-05,
"loss": 0.654,
"step": 871
},
{
"epoch": 1.3154716981132075,
"grad_norm": 0.3416259213239559,
"learning_rate": 3.1169557918298826e-05,
"loss": 0.6387,
"step": 872
},
{
"epoch": 1.3169811320754716,
"grad_norm": 0.3564805566672587,
"learning_rate": 3.114157806379407e-05,
"loss": 0.6369,
"step": 873
},
{
"epoch": 1.318490566037736,
"grad_norm": 0.3803523160244477,
"learning_rate": 3.111359820928931e-05,
"loss": 0.7093,
"step": 874
},
{
"epoch": 1.32,
"grad_norm": 0.39166045934086346,
"learning_rate": 3.1085618354784556e-05,
"loss": 0.6496,
"step": 875
},
{
"epoch": 1.3215094339622642,
"grad_norm": 0.4353767626286411,
"learning_rate": 3.10576385002798e-05,
"loss": 0.6266,
"step": 876
},
{
"epoch": 1.3230188679245283,
"grad_norm": 0.3593032604836283,
"learning_rate": 3.102965864577504e-05,
"loss": 0.6782,
"step": 877
},
{
"epoch": 1.3245283018867924,
"grad_norm": 0.4168829319681385,
"learning_rate": 3.1001678791270286e-05,
"loss": 0.6282,
"step": 878
},
{
"epoch": 1.3260377358490567,
"grad_norm": 0.37757583489653257,
"learning_rate": 3.097369893676553e-05,
"loss": 0.6298,
"step": 879
},
{
"epoch": 1.3275471698113208,
"grad_norm": 0.3501040816376274,
"learning_rate": 3.094571908226077e-05,
"loss": 0.6461,
"step": 880
},
{
"epoch": 1.329056603773585,
"grad_norm": 0.4721309371418326,
"learning_rate": 3.0917739227756016e-05,
"loss": 0.6283,
"step": 881
},
{
"epoch": 1.330566037735849,
"grad_norm": 0.40370139757838286,
"learning_rate": 3.088975937325126e-05,
"loss": 0.6533,
"step": 882
},
{
"epoch": 1.3320754716981131,
"grad_norm": 0.4191434348168408,
"learning_rate": 3.08617795187465e-05,
"loss": 0.5634,
"step": 883
},
{
"epoch": 1.3335849056603775,
"grad_norm": 0.47048915913727934,
"learning_rate": 3.0833799664241747e-05,
"loss": 0.6498,
"step": 884
},
{
"epoch": 1.3350943396226416,
"grad_norm": 0.5386238703131466,
"learning_rate": 3.080581980973699e-05,
"loss": 0.6598,
"step": 885
},
{
"epoch": 1.3366037735849057,
"grad_norm": 0.4046153720620124,
"learning_rate": 3.077783995523223e-05,
"loss": 0.6303,
"step": 886
},
{
"epoch": 1.3381132075471698,
"grad_norm": 0.5092275981590267,
"learning_rate": 3.0749860100727477e-05,
"loss": 0.5994,
"step": 887
},
{
"epoch": 1.3396226415094339,
"grad_norm": 0.4201270256233553,
"learning_rate": 3.072188024622272e-05,
"loss": 0.6035,
"step": 888
},
{
"epoch": 1.3411320754716982,
"grad_norm": 0.44363066935554674,
"learning_rate": 3.069390039171797e-05,
"loss": 0.6149,
"step": 889
},
{
"epoch": 1.3426415094339623,
"grad_norm": 0.45931419806615414,
"learning_rate": 3.066592053721321e-05,
"loss": 0.6853,
"step": 890
},
{
"epoch": 1.3441509433962264,
"grad_norm": 0.3981069141574624,
"learning_rate": 3.063794068270845e-05,
"loss": 0.6445,
"step": 891
},
{
"epoch": 1.3456603773584905,
"grad_norm": 0.4033897537692928,
"learning_rate": 3.0609960828203693e-05,
"loss": 0.6657,
"step": 892
},
{
"epoch": 1.3471698113207546,
"grad_norm": 0.3858298350639289,
"learning_rate": 3.058198097369894e-05,
"loss": 0.6255,
"step": 893
},
{
"epoch": 1.348679245283019,
"grad_norm": 0.47123781695477013,
"learning_rate": 3.055400111919418e-05,
"loss": 0.6509,
"step": 894
},
{
"epoch": 1.350188679245283,
"grad_norm": 0.4029191937376374,
"learning_rate": 3.0526021264689424e-05,
"loss": 0.6651,
"step": 895
},
{
"epoch": 1.3516981132075472,
"grad_norm": 0.388896779384108,
"learning_rate": 3.049804141018467e-05,
"loss": 0.6395,
"step": 896
},
{
"epoch": 1.3532075471698113,
"grad_norm": 0.3768031336195982,
"learning_rate": 3.0470061555679914e-05,
"loss": 0.655,
"step": 897
},
{
"epoch": 1.3547169811320754,
"grad_norm": 0.3984983631125007,
"learning_rate": 3.0442081701175157e-05,
"loss": 0.6149,
"step": 898
},
{
"epoch": 1.3562264150943397,
"grad_norm": 0.4066202145440965,
"learning_rate": 3.04141018466704e-05,
"loss": 0.6486,
"step": 899
},
{
"epoch": 1.3577358490566038,
"grad_norm": 0.44059085553416244,
"learning_rate": 3.0386121992165644e-05,
"loss": 0.676,
"step": 900
},
{
"epoch": 1.359245283018868,
"grad_norm": 0.41928112497983105,
"learning_rate": 3.0358142137660884e-05,
"loss": 0.6166,
"step": 901
},
{
"epoch": 1.360754716981132,
"grad_norm": 0.4484411056202494,
"learning_rate": 3.0330162283156127e-05,
"loss": 0.5916,
"step": 902
},
{
"epoch": 1.3622641509433961,
"grad_norm": 0.39462759464635006,
"learning_rate": 3.030218242865137e-05,
"loss": 0.6252,
"step": 903
},
{
"epoch": 1.3637735849056605,
"grad_norm": 0.3064423590338561,
"learning_rate": 3.0274202574146614e-05,
"loss": 0.7015,
"step": 904
},
{
"epoch": 1.3652830188679246,
"grad_norm": 0.4187213380492406,
"learning_rate": 3.024622271964186e-05,
"loss": 0.6168,
"step": 905
},
{
"epoch": 1.3667924528301887,
"grad_norm": 0.40261471242008323,
"learning_rate": 3.0218242865137104e-05,
"loss": 0.7055,
"step": 906
},
{
"epoch": 1.3683018867924528,
"grad_norm": 0.3481493833030105,
"learning_rate": 3.0190263010632347e-05,
"loss": 0.6208,
"step": 907
},
{
"epoch": 1.369811320754717,
"grad_norm": 0.35049655031964855,
"learning_rate": 3.016228315612759e-05,
"loss": 0.5781,
"step": 908
},
{
"epoch": 1.3713207547169812,
"grad_norm": 0.42278059968973397,
"learning_rate": 3.0134303301622834e-05,
"loss": 0.5904,
"step": 909
},
{
"epoch": 1.3728301886792453,
"grad_norm": 0.3445994928179493,
"learning_rate": 3.0106323447118077e-05,
"loss": 0.6288,
"step": 910
},
{
"epoch": 1.3743396226415094,
"grad_norm": 0.38967053457550976,
"learning_rate": 3.0078343592613317e-05,
"loss": 0.6392,
"step": 911
},
{
"epoch": 1.3758490566037735,
"grad_norm": 0.39707064143261184,
"learning_rate": 3.005036373810856e-05,
"loss": 0.6279,
"step": 912
},
{
"epoch": 1.3773584905660377,
"grad_norm": 0.35444427397617145,
"learning_rate": 3.0022383883603804e-05,
"loss": 0.5976,
"step": 913
},
{
"epoch": 1.378867924528302,
"grad_norm": 0.34690507904913026,
"learning_rate": 2.999440402909905e-05,
"loss": 0.634,
"step": 914
},
{
"epoch": 1.380377358490566,
"grad_norm": 0.4626788655800128,
"learning_rate": 2.9966424174594294e-05,
"loss": 0.6583,
"step": 915
},
{
"epoch": 1.3818867924528302,
"grad_norm": 0.3671421219021182,
"learning_rate": 2.9938444320089538e-05,
"loss": 0.6514,
"step": 916
},
{
"epoch": 1.3833962264150943,
"grad_norm": 0.37784633929271194,
"learning_rate": 2.991046446558478e-05,
"loss": 0.601,
"step": 917
},
{
"epoch": 1.3849056603773584,
"grad_norm": 0.4089029572046124,
"learning_rate": 2.9882484611080024e-05,
"loss": 0.6588,
"step": 918
},
{
"epoch": 1.3864150943396227,
"grad_norm": 0.3455888490666691,
"learning_rate": 2.9854504756575268e-05,
"loss": 0.6022,
"step": 919
},
{
"epoch": 1.3879245283018868,
"grad_norm": 0.3654741927522717,
"learning_rate": 2.9826524902070514e-05,
"loss": 0.6638,
"step": 920
},
{
"epoch": 1.389433962264151,
"grad_norm": 0.3502350361285747,
"learning_rate": 2.9798545047565758e-05,
"loss": 0.6645,
"step": 921
},
{
"epoch": 1.390943396226415,
"grad_norm": 0.37331731488300535,
"learning_rate": 2.9770565193060994e-05,
"loss": 0.6074,
"step": 922
},
{
"epoch": 1.3924528301886792,
"grad_norm": 0.3185944385198641,
"learning_rate": 2.9742585338556238e-05,
"loss": 0.5764,
"step": 923
},
{
"epoch": 1.3939622641509435,
"grad_norm": 0.37558027024168333,
"learning_rate": 2.9714605484051484e-05,
"loss": 0.6027,
"step": 924
},
{
"epoch": 1.3954716981132076,
"grad_norm": 0.3494928337929872,
"learning_rate": 2.9686625629546728e-05,
"loss": 0.6464,
"step": 925
},
{
"epoch": 1.3969811320754717,
"grad_norm": 0.41717609426235447,
"learning_rate": 2.965864577504197e-05,
"loss": 0.6136,
"step": 926
},
{
"epoch": 1.3984905660377358,
"grad_norm": 0.3690138657188448,
"learning_rate": 2.9630665920537215e-05,
"loss": 0.6544,
"step": 927
},
{
"epoch": 1.4,
"grad_norm": 0.36363869250987335,
"learning_rate": 2.9602686066032458e-05,
"loss": 0.6072,
"step": 928
},
{
"epoch": 1.4015094339622642,
"grad_norm": 0.327574112494754,
"learning_rate": 2.9574706211527705e-05,
"loss": 0.6352,
"step": 929
},
{
"epoch": 1.4030188679245283,
"grad_norm": 0.35727876020347676,
"learning_rate": 2.9546726357022948e-05,
"loss": 0.5951,
"step": 930
},
{
"epoch": 1.4045283018867925,
"grad_norm": 0.31716305744497264,
"learning_rate": 2.951874650251819e-05,
"loss": 0.6039,
"step": 931
},
{
"epoch": 1.4060377358490566,
"grad_norm": 0.33093980503518344,
"learning_rate": 2.9490766648013428e-05,
"loss": 0.6163,
"step": 932
},
{
"epoch": 1.4075471698113207,
"grad_norm": 0.3861900118247219,
"learning_rate": 2.9462786793508675e-05,
"loss": 0.6153,
"step": 933
},
{
"epoch": 1.409056603773585,
"grad_norm": 0.3277916345887553,
"learning_rate": 2.9434806939003918e-05,
"loss": 0.6284,
"step": 934
},
{
"epoch": 1.410566037735849,
"grad_norm": 0.3780457505042723,
"learning_rate": 2.940682708449916e-05,
"loss": 0.6452,
"step": 935
},
{
"epoch": 1.4120754716981132,
"grad_norm": 0.3465233045348986,
"learning_rate": 2.9378847229994405e-05,
"loss": 0.7393,
"step": 936
},
{
"epoch": 1.4135849056603773,
"grad_norm": 2.5533756746713117,
"learning_rate": 2.9350867375489648e-05,
"loss": 0.5907,
"step": 937
},
{
"epoch": 1.4150943396226414,
"grad_norm": 0.4089686961924739,
"learning_rate": 2.932288752098489e-05,
"loss": 0.6067,
"step": 938
},
{
"epoch": 1.4166037735849057,
"grad_norm": 0.35586033724923993,
"learning_rate": 2.9294907666480138e-05,
"loss": 0.5488,
"step": 939
},
{
"epoch": 1.4181132075471699,
"grad_norm": 0.4380246394519367,
"learning_rate": 2.926692781197538e-05,
"loss": 0.6399,
"step": 940
},
{
"epoch": 1.419622641509434,
"grad_norm": 0.4249638666583559,
"learning_rate": 2.9238947957470625e-05,
"loss": 0.6233,
"step": 941
},
{
"epoch": 1.421132075471698,
"grad_norm": 0.4153579482995771,
"learning_rate": 2.9210968102965865e-05,
"loss": 0.6602,
"step": 942
},
{
"epoch": 1.4226415094339622,
"grad_norm": 0.4754922849198537,
"learning_rate": 2.918298824846111e-05,
"loss": 0.7016,
"step": 943
},
{
"epoch": 1.4241509433962265,
"grad_norm": 0.3236274287786117,
"learning_rate": 2.915500839395635e-05,
"loss": 0.5863,
"step": 944
},
{
"epoch": 1.4256603773584906,
"grad_norm": 0.41992181352774643,
"learning_rate": 2.9127028539451595e-05,
"loss": 0.6258,
"step": 945
},
{
"epoch": 1.4271698113207547,
"grad_norm": 0.8115843394380751,
"learning_rate": 2.909904868494684e-05,
"loss": 0.6465,
"step": 946
},
{
"epoch": 1.4286792452830188,
"grad_norm": 0.39731341593956065,
"learning_rate": 2.9071068830442082e-05,
"loss": 0.6443,
"step": 947
},
{
"epoch": 1.430188679245283,
"grad_norm": 0.36644813089178,
"learning_rate": 2.904308897593733e-05,
"loss": 0.5728,
"step": 948
},
{
"epoch": 1.4316981132075473,
"grad_norm": 0.438411822701408,
"learning_rate": 2.9015109121432572e-05,
"loss": 0.5963,
"step": 949
},
{
"epoch": 1.4332075471698114,
"grad_norm": 0.38295490840225677,
"learning_rate": 2.8987129266927815e-05,
"loss": 0.6152,
"step": 950
},
{
"epoch": 1.4347169811320755,
"grad_norm": 0.39618114403032584,
"learning_rate": 2.895914941242306e-05,
"loss": 0.6637,
"step": 951
},
{
"epoch": 1.4362264150943396,
"grad_norm": 0.3815428588638221,
"learning_rate": 2.8931169557918302e-05,
"loss": 0.5864,
"step": 952
},
{
"epoch": 1.4377358490566037,
"grad_norm": 0.3663074285581354,
"learning_rate": 2.8903189703413542e-05,
"loss": 0.6733,
"step": 953
},
{
"epoch": 1.439245283018868,
"grad_norm": 0.3668247689466776,
"learning_rate": 2.8875209848908785e-05,
"loss": 0.6375,
"step": 954
},
{
"epoch": 1.440754716981132,
"grad_norm": 0.3535138691082089,
"learning_rate": 2.884722999440403e-05,
"loss": 0.5939,
"step": 955
},
{
"epoch": 1.4422641509433962,
"grad_norm": 0.3634202864100825,
"learning_rate": 2.8819250139899272e-05,
"loss": 0.6292,
"step": 956
},
{
"epoch": 1.4437735849056603,
"grad_norm": 0.3572638644281544,
"learning_rate": 2.879127028539452e-05,
"loss": 0.6664,
"step": 957
},
{
"epoch": 1.4452830188679244,
"grad_norm": 0.3508425660348941,
"learning_rate": 2.8763290430889762e-05,
"loss": 0.6468,
"step": 958
},
{
"epoch": 1.4467924528301888,
"grad_norm": 0.3707956017303932,
"learning_rate": 2.8735310576385005e-05,
"loss": 0.6495,
"step": 959
},
{
"epoch": 1.4483018867924529,
"grad_norm": 0.3278213890755489,
"learning_rate": 2.870733072188025e-05,
"loss": 0.6217,
"step": 960
},
{
"epoch": 1.449811320754717,
"grad_norm": 0.40970965375073787,
"learning_rate": 2.8679350867375492e-05,
"loss": 0.6451,
"step": 961
},
{
"epoch": 1.451320754716981,
"grad_norm": 0.3688583638649611,
"learning_rate": 2.8651371012870736e-05,
"loss": 0.6251,
"step": 962
},
{
"epoch": 1.4528301886792452,
"grad_norm": 0.34179313589597693,
"learning_rate": 2.8623391158365976e-05,
"loss": 0.6409,
"step": 963
},
{
"epoch": 1.4543396226415095,
"grad_norm": 0.3448261436866798,
"learning_rate": 2.859541130386122e-05,
"loss": 0.5905,
"step": 964
},
{
"epoch": 1.4558490566037736,
"grad_norm": 0.33694291297212864,
"learning_rate": 2.8567431449356462e-05,
"loss": 0.5761,
"step": 965
},
{
"epoch": 1.4573584905660377,
"grad_norm": 0.3118187273291105,
"learning_rate": 2.8539451594851706e-05,
"loss": 0.6088,
"step": 966
},
{
"epoch": 1.4588679245283018,
"grad_norm": 0.32045778178994183,
"learning_rate": 2.8511471740346952e-05,
"loss": 0.6205,
"step": 967
},
{
"epoch": 1.460377358490566,
"grad_norm": 0.45338400028752146,
"learning_rate": 2.8483491885842196e-05,
"loss": 0.5963,
"step": 968
},
{
"epoch": 1.4618867924528303,
"grad_norm": 0.32689068860548115,
"learning_rate": 2.845551203133744e-05,
"loss": 0.5745,
"step": 969
},
{
"epoch": 1.4633962264150944,
"grad_norm": 0.38005063017031954,
"learning_rate": 2.8427532176832682e-05,
"loss": 0.6426,
"step": 970
},
{
"epoch": 1.4649056603773585,
"grad_norm": 0.3309282292367092,
"learning_rate": 2.8399552322327926e-05,
"loss": 0.669,
"step": 971
},
{
"epoch": 1.4664150943396226,
"grad_norm": 1.4917946740992876,
"learning_rate": 2.8371572467823173e-05,
"loss": 0.6343,
"step": 972
},
{
"epoch": 1.4679245283018867,
"grad_norm": 0.3233023167238395,
"learning_rate": 2.8343592613318416e-05,
"loss": 0.6077,
"step": 973
},
{
"epoch": 1.469433962264151,
"grad_norm": 0.3262290819657421,
"learning_rate": 2.8315612758813653e-05,
"loss": 0.6578,
"step": 974
},
{
"epoch": 1.4709433962264151,
"grad_norm": 0.3339350102619771,
"learning_rate": 2.8287632904308896e-05,
"loss": 0.621,
"step": 975
},
{
"epoch": 1.4724528301886792,
"grad_norm": 0.3192961373970712,
"learning_rate": 2.8259653049804143e-05,
"loss": 0.6236,
"step": 976
},
{
"epoch": 1.4739622641509433,
"grad_norm": 0.49845825196565774,
"learning_rate": 2.8231673195299386e-05,
"loss": 0.6254,
"step": 977
},
{
"epoch": 1.4754716981132074,
"grad_norm": 0.3485802760303236,
"learning_rate": 2.820369334079463e-05,
"loss": 0.6046,
"step": 978
},
{
"epoch": 1.4769811320754718,
"grad_norm": 0.3598346283166558,
"learning_rate": 2.8175713486289873e-05,
"loss": 0.6621,
"step": 979
},
{
"epoch": 1.4784905660377359,
"grad_norm": 0.7079216014041246,
"learning_rate": 2.8147733631785116e-05,
"loss": 0.6659,
"step": 980
},
{
"epoch": 1.48,
"grad_norm": 0.3996327762211729,
"learning_rate": 2.811975377728036e-05,
"loss": 0.5864,
"step": 981
},
{
"epoch": 1.481509433962264,
"grad_norm": 0.4468257832623057,
"learning_rate": 2.8091773922775606e-05,
"loss": 0.629,
"step": 982
},
{
"epoch": 1.4830188679245282,
"grad_norm": 0.35338065855143863,
"learning_rate": 2.806379406827085e-05,
"loss": 0.5689,
"step": 983
},
{
"epoch": 1.4845283018867925,
"grad_norm": 0.3447521513306188,
"learning_rate": 2.8035814213766086e-05,
"loss": 0.5519,
"step": 984
},
{
"epoch": 1.4860377358490566,
"grad_norm": 0.34657588388233884,
"learning_rate": 2.8007834359261333e-05,
"loss": 0.6423,
"step": 985
},
{
"epoch": 1.4875471698113207,
"grad_norm": 0.34128797853204484,
"learning_rate": 2.7979854504756576e-05,
"loss": 0.6009,
"step": 986
},
{
"epoch": 1.4890566037735848,
"grad_norm": 0.8214335030714001,
"learning_rate": 2.795187465025182e-05,
"loss": 0.5986,
"step": 987
},
{
"epoch": 1.490566037735849,
"grad_norm": 9.613076505885713,
"learning_rate": 2.7923894795747063e-05,
"loss": 0.8501,
"step": 988
},
{
"epoch": 1.4920754716981133,
"grad_norm": 0.5243283003121673,
"learning_rate": 2.7895914941242306e-05,
"loss": 0.6533,
"step": 989
},
{
"epoch": 1.4935849056603774,
"grad_norm": 0.4045389908997858,
"learning_rate": 2.786793508673755e-05,
"loss": 0.6253,
"step": 990
},
{
"epoch": 1.4950943396226415,
"grad_norm": 0.4281589533591875,
"learning_rate": 2.7839955232232796e-05,
"loss": 0.6287,
"step": 991
},
{
"epoch": 1.4966037735849056,
"grad_norm": 0.38192093777254366,
"learning_rate": 2.781197537772804e-05,
"loss": 0.6681,
"step": 992
},
{
"epoch": 1.4981132075471697,
"grad_norm": 0.47848727765965543,
"learning_rate": 2.7783995523223283e-05,
"loss": 0.6019,
"step": 993
},
{
"epoch": 1.499622641509434,
"grad_norm": 0.3985883342403703,
"learning_rate": 2.775601566871852e-05,
"loss": 0.6599,
"step": 994
},
{
"epoch": 1.5011320754716981,
"grad_norm": 0.3434562172845685,
"learning_rate": 2.7728035814213767e-05,
"loss": 0.6081,
"step": 995
},
{
"epoch": 1.5026415094339622,
"grad_norm": 0.3980403992472545,
"learning_rate": 2.770005595970901e-05,
"loss": 0.6172,
"step": 996
},
{
"epoch": 1.5041509433962266,
"grad_norm": 0.3763385571813578,
"learning_rate": 2.7672076105204253e-05,
"loss": 0.6792,
"step": 997
},
{
"epoch": 1.5056603773584905,
"grad_norm": 0.40146854330428877,
"learning_rate": 2.7644096250699497e-05,
"loss": 0.6224,
"step": 998
},
{
"epoch": 1.5071698113207548,
"grad_norm": 0.33301446846068716,
"learning_rate": 2.761611639619474e-05,
"loss": 0.6887,
"step": 999
},
{
"epoch": 1.5086792452830189,
"grad_norm": 0.38371280593670326,
"learning_rate": 2.7588136541689987e-05,
"loss": 0.6132,
"step": 1000
},
{
"epoch": 1.510188679245283,
"grad_norm": 0.37104373493288634,
"learning_rate": 2.756015668718523e-05,
"loss": 0.6067,
"step": 1001
},
{
"epoch": 1.5116981132075473,
"grad_norm": 0.47232500165135083,
"learning_rate": 2.7532176832680473e-05,
"loss": 0.6102,
"step": 1002
},
{
"epoch": 1.5132075471698112,
"grad_norm": 0.33043113034271476,
"learning_rate": 2.7504196978175717e-05,
"loss": 0.6384,
"step": 1003
},
{
"epoch": 1.5147169811320755,
"grad_norm": 0.3448322098677679,
"learning_rate": 2.747621712367096e-05,
"loss": 0.6998,
"step": 1004
},
{
"epoch": 1.5162264150943396,
"grad_norm": 0.40306936202383525,
"learning_rate": 2.74482372691662e-05,
"loss": 0.6676,
"step": 1005
},
{
"epoch": 1.5177358490566037,
"grad_norm": 0.32212746145926247,
"learning_rate": 2.7420257414661444e-05,
"loss": 0.7054,
"step": 1006
},
{
"epoch": 1.519245283018868,
"grad_norm": 0.3630795836083071,
"learning_rate": 2.7392277560156687e-05,
"loss": 0.6069,
"step": 1007
},
{
"epoch": 1.520754716981132,
"grad_norm": 0.35892977901686324,
"learning_rate": 2.736429770565193e-05,
"loss": 0.6303,
"step": 1008
},
{
"epoch": 1.5222641509433963,
"grad_norm": 0.34033834989241846,
"learning_rate": 2.7336317851147174e-05,
"loss": 0.6073,
"step": 1009
},
{
"epoch": 1.5237735849056604,
"grad_norm": 0.3606987139401938,
"learning_rate": 2.730833799664242e-05,
"loss": 0.6533,
"step": 1010
},
{
"epoch": 1.5252830188679245,
"grad_norm": 0.35197301530590785,
"learning_rate": 2.7280358142137664e-05,
"loss": 0.6774,
"step": 1011
},
{
"epoch": 1.5267924528301888,
"grad_norm": 0.3409953188778291,
"learning_rate": 2.7252378287632907e-05,
"loss": 0.6111,
"step": 1012
},
{
"epoch": 1.5283018867924527,
"grad_norm": 0.35710307374963385,
"learning_rate": 2.722439843312815e-05,
"loss": 0.5887,
"step": 1013
},
{
"epoch": 1.529811320754717,
"grad_norm": 0.39361169119324246,
"learning_rate": 2.7196418578623394e-05,
"loss": 0.6824,
"step": 1014
},
{
"epoch": 1.5313207547169811,
"grad_norm": 0.42089198638548375,
"learning_rate": 2.7168438724118634e-05,
"loss": 0.6084,
"step": 1015
},
{
"epoch": 1.5328301886792453,
"grad_norm": 0.4002037847566263,
"learning_rate": 2.7140458869613877e-05,
"loss": 0.6164,
"step": 1016
},
{
"epoch": 1.5343396226415096,
"grad_norm": 0.4279739458874451,
"learning_rate": 2.711247901510912e-05,
"loss": 0.602,
"step": 1017
},
{
"epoch": 1.5358490566037735,
"grad_norm": 0.3741065695840471,
"learning_rate": 2.7084499160604364e-05,
"loss": 0.5819,
"step": 1018
},
{
"epoch": 1.5373584905660378,
"grad_norm": 0.4279718365040865,
"learning_rate": 2.705651930609961e-05,
"loss": 0.6296,
"step": 1019
},
{
"epoch": 1.538867924528302,
"grad_norm": 0.448203149422696,
"learning_rate": 2.7028539451594854e-05,
"loss": 0.5985,
"step": 1020
},
{
"epoch": 1.540377358490566,
"grad_norm": 0.39251867440111504,
"learning_rate": 2.7000559597090097e-05,
"loss": 0.6152,
"step": 1021
},
{
"epoch": 1.5418867924528303,
"grad_norm": 0.5318193655500686,
"learning_rate": 2.697257974258534e-05,
"loss": 0.6533,
"step": 1022
},
{
"epoch": 1.5433962264150942,
"grad_norm": 0.36282379983998825,
"learning_rate": 2.6944599888080584e-05,
"loss": 0.6422,
"step": 1023
},
{
"epoch": 1.5449056603773585,
"grad_norm": 0.4679464718781621,
"learning_rate": 2.6916620033575827e-05,
"loss": 0.6914,
"step": 1024
},
{
"epoch": 1.5464150943396227,
"grad_norm": 0.390523648842573,
"learning_rate": 2.6888640179071067e-05,
"loss": 0.6562,
"step": 1025
},
{
"epoch": 1.5479245283018868,
"grad_norm": 0.3704983255661872,
"learning_rate": 2.686066032456631e-05,
"loss": 0.6257,
"step": 1026
},
{
"epoch": 1.549433962264151,
"grad_norm": 0.41154051577412576,
"learning_rate": 2.6832680470061554e-05,
"loss": 0.6028,
"step": 1027
},
{
"epoch": 1.550943396226415,
"grad_norm": 0.33250241430063915,
"learning_rate": 2.68047006155568e-05,
"loss": 0.6491,
"step": 1028
},
{
"epoch": 1.5524528301886793,
"grad_norm": 0.4012914420070519,
"learning_rate": 2.6776720761052044e-05,
"loss": 0.6053,
"step": 1029
},
{
"epoch": 1.5539622641509434,
"grad_norm": 0.47506656031425826,
"learning_rate": 2.6748740906547288e-05,
"loss": 0.6044,
"step": 1030
},
{
"epoch": 1.5554716981132075,
"grad_norm": 0.3707491589866621,
"learning_rate": 2.672076105204253e-05,
"loss": 0.6703,
"step": 1031
},
{
"epoch": 1.5569811320754718,
"grad_norm": 0.34186133320917916,
"learning_rate": 2.6692781197537774e-05,
"loss": 0.6596,
"step": 1032
},
{
"epoch": 1.5584905660377357,
"grad_norm": 0.3491021187362257,
"learning_rate": 2.6664801343033018e-05,
"loss": 0.5844,
"step": 1033
},
{
"epoch": 1.56,
"grad_norm": 0.7155210370076859,
"learning_rate": 2.6636821488528264e-05,
"loss": 0.6393,
"step": 1034
},
{
"epoch": 1.5615094339622642,
"grad_norm": 0.37905853694030756,
"learning_rate": 2.6608841634023508e-05,
"loss": 0.5804,
"step": 1035
},
{
"epoch": 1.5630188679245283,
"grad_norm": 0.39249010794322703,
"learning_rate": 2.6580861779518744e-05,
"loss": 0.6622,
"step": 1036
},
{
"epoch": 1.5645283018867926,
"grad_norm": 0.3177813349439448,
"learning_rate": 2.6552881925013988e-05,
"loss": 0.6893,
"step": 1037
},
{
"epoch": 1.5660377358490565,
"grad_norm": 0.3578675379157616,
"learning_rate": 2.6524902070509235e-05,
"loss": 0.6743,
"step": 1038
},
{
"epoch": 1.5675471698113208,
"grad_norm": 0.38432047726232615,
"learning_rate": 2.6496922216004478e-05,
"loss": 0.5848,
"step": 1039
},
{
"epoch": 1.569056603773585,
"grad_norm": 0.38664999715982123,
"learning_rate": 2.646894236149972e-05,
"loss": 0.6501,
"step": 1040
},
{
"epoch": 1.570566037735849,
"grad_norm": 0.3030876699883648,
"learning_rate": 2.6440962506994965e-05,
"loss": 0.6048,
"step": 1041
},
{
"epoch": 1.5720754716981133,
"grad_norm": 0.3282480223581648,
"learning_rate": 2.6412982652490208e-05,
"loss": 0.5784,
"step": 1042
},
{
"epoch": 1.5735849056603772,
"grad_norm": 0.34287712921209795,
"learning_rate": 2.638500279798545e-05,
"loss": 0.5587,
"step": 1043
},
{
"epoch": 1.5750943396226416,
"grad_norm": 0.4249512336319463,
"learning_rate": 2.6357022943480698e-05,
"loss": 0.6872,
"step": 1044
},
{
"epoch": 1.5766037735849057,
"grad_norm": 0.36483078091450444,
"learning_rate": 2.632904308897594e-05,
"loss": 0.6272,
"step": 1045
},
{
"epoch": 1.5781132075471698,
"grad_norm": 0.33642660623413323,
"learning_rate": 2.6301063234471178e-05,
"loss": 0.6152,
"step": 1046
},
{
"epoch": 1.579622641509434,
"grad_norm": 0.3795901995942625,
"learning_rate": 2.6273083379966425e-05,
"loss": 0.6457,
"step": 1047
},
{
"epoch": 1.581132075471698,
"grad_norm": 0.4999474857747963,
"learning_rate": 2.6245103525461668e-05,
"loss": 0.6492,
"step": 1048
},
{
"epoch": 1.5826415094339623,
"grad_norm": 0.31486204696092474,
"learning_rate": 2.621712367095691e-05,
"loss": 0.6266,
"step": 1049
},
{
"epoch": 1.5841509433962264,
"grad_norm": 0.3742304773869301,
"learning_rate": 2.6189143816452155e-05,
"loss": 0.6116,
"step": 1050
},
{
"epoch": 1.5856603773584905,
"grad_norm": 0.41017165219333346,
"learning_rate": 2.6161163961947398e-05,
"loss": 0.6209,
"step": 1051
},
{
"epoch": 1.5871698113207549,
"grad_norm": 0.32510313559158954,
"learning_rate": 2.613318410744264e-05,
"loss": 0.5938,
"step": 1052
},
{
"epoch": 1.5886792452830187,
"grad_norm": 0.34975228019540705,
"learning_rate": 2.610520425293789e-05,
"loss": 0.6052,
"step": 1053
},
{
"epoch": 1.590188679245283,
"grad_norm": 0.3593828761751314,
"learning_rate": 2.6077224398433132e-05,
"loss": 0.5994,
"step": 1054
},
{
"epoch": 1.5916981132075472,
"grad_norm": 0.3379735940847511,
"learning_rate": 2.6049244543928375e-05,
"loss": 0.6682,
"step": 1055
},
{
"epoch": 1.5932075471698113,
"grad_norm": 0.3688041258843097,
"learning_rate": 2.602126468942362e-05,
"loss": 0.7183,
"step": 1056
},
{
"epoch": 1.5947169811320756,
"grad_norm": 0.4216449339373789,
"learning_rate": 2.599328483491886e-05,
"loss": 0.7043,
"step": 1057
},
{
"epoch": 1.5962264150943395,
"grad_norm": 2.267511774259029,
"learning_rate": 2.5965304980414102e-05,
"loss": 0.683,
"step": 1058
},
{
"epoch": 1.5977358490566038,
"grad_norm": 0.3895472450306408,
"learning_rate": 2.5937325125909345e-05,
"loss": 0.6855,
"step": 1059
},
{
"epoch": 1.599245283018868,
"grad_norm": 0.41188597998599974,
"learning_rate": 2.590934527140459e-05,
"loss": 0.6083,
"step": 1060
},
{
"epoch": 1.600754716981132,
"grad_norm": 0.3059560204490495,
"learning_rate": 2.5881365416899832e-05,
"loss": 0.5571,
"step": 1061
},
{
"epoch": 1.6022641509433964,
"grad_norm": 0.34622860668786465,
"learning_rate": 2.585338556239508e-05,
"loss": 0.5771,
"step": 1062
},
{
"epoch": 1.6037735849056602,
"grad_norm": 0.34479433863653974,
"learning_rate": 2.5825405707890322e-05,
"loss": 0.6081,
"step": 1063
},
{
"epoch": 1.6052830188679246,
"grad_norm": 0.36272238390017086,
"learning_rate": 2.5797425853385565e-05,
"loss": 0.6959,
"step": 1064
},
{
"epoch": 1.6067924528301887,
"grad_norm": 0.4094790107793089,
"learning_rate": 2.576944599888081e-05,
"loss": 0.6309,
"step": 1065
},
{
"epoch": 1.6083018867924528,
"grad_norm": 0.3788756955199532,
"learning_rate": 2.5741466144376052e-05,
"loss": 0.5994,
"step": 1066
},
{
"epoch": 1.6098113207547171,
"grad_norm": 0.363138476827497,
"learning_rate": 2.5713486289871292e-05,
"loss": 0.6657,
"step": 1067
},
{
"epoch": 1.611320754716981,
"grad_norm": 0.35600650897268843,
"learning_rate": 2.5685506435366535e-05,
"loss": 0.6238,
"step": 1068
},
{
"epoch": 1.6128301886792453,
"grad_norm": 0.31529869654986054,
"learning_rate": 2.565752658086178e-05,
"loss": 0.6157,
"step": 1069
},
{
"epoch": 1.6143396226415094,
"grad_norm": 0.41343618537042476,
"learning_rate": 2.5629546726357022e-05,
"loss": 0.6847,
"step": 1070
},
{
"epoch": 1.6158490566037735,
"grad_norm": 0.36948590263079245,
"learning_rate": 2.560156687185227e-05,
"loss": 0.6373,
"step": 1071
},
{
"epoch": 1.6173584905660379,
"grad_norm": 0.3405631397524817,
"learning_rate": 2.5573587017347512e-05,
"loss": 0.6454,
"step": 1072
},
{
"epoch": 1.6188679245283017,
"grad_norm": 0.3876899509134273,
"learning_rate": 2.5545607162842756e-05,
"loss": 0.6897,
"step": 1073
},
{
"epoch": 1.620377358490566,
"grad_norm": 0.36648572255209627,
"learning_rate": 2.5517627308338e-05,
"loss": 0.6366,
"step": 1074
},
{
"epoch": 1.6218867924528302,
"grad_norm": 0.39363149190822344,
"learning_rate": 2.5489647453833242e-05,
"loss": 0.6312,
"step": 1075
},
{
"epoch": 1.6233962264150943,
"grad_norm": 0.393083017888767,
"learning_rate": 2.5461667599328486e-05,
"loss": 0.621,
"step": 1076
},
{
"epoch": 1.6249056603773586,
"grad_norm": 0.42789295095166247,
"learning_rate": 2.5433687744823726e-05,
"loss": 0.6328,
"step": 1077
},
{
"epoch": 1.6264150943396225,
"grad_norm": 0.34143614581342263,
"learning_rate": 2.540570789031897e-05,
"loss": 0.5833,
"step": 1078
},
{
"epoch": 1.6279245283018868,
"grad_norm": 0.48993095092983346,
"learning_rate": 2.5377728035814212e-05,
"loss": 0.6499,
"step": 1079
},
{
"epoch": 1.629433962264151,
"grad_norm": 0.38482007687414094,
"learning_rate": 2.5349748181309456e-05,
"loss": 0.624,
"step": 1080
},
{
"epoch": 1.630943396226415,
"grad_norm": 0.3620296499170073,
"learning_rate": 2.5321768326804703e-05,
"loss": 0.6196,
"step": 1081
},
{
"epoch": 1.6324528301886794,
"grad_norm": 0.4413925417638725,
"learning_rate": 2.5293788472299946e-05,
"loss": 0.6562,
"step": 1082
},
{
"epoch": 1.6339622641509433,
"grad_norm": 0.31728101683770593,
"learning_rate": 2.526580861779519e-05,
"loss": 0.5931,
"step": 1083
},
{
"epoch": 1.6354716981132076,
"grad_norm": 0.3242208210115639,
"learning_rate": 2.5237828763290433e-05,
"loss": 0.6428,
"step": 1084
},
{
"epoch": 1.6369811320754717,
"grad_norm": 0.34313826598204494,
"learning_rate": 2.5209848908785676e-05,
"loss": 0.5892,
"step": 1085
},
{
"epoch": 1.6384905660377358,
"grad_norm": 0.27873747077218364,
"learning_rate": 2.518186905428092e-05,
"loss": 0.6435,
"step": 1086
},
{
"epoch": 1.6400000000000001,
"grad_norm": 0.34391194671727837,
"learning_rate": 2.5153889199776166e-05,
"loss": 0.6208,
"step": 1087
},
{
"epoch": 1.641509433962264,
"grad_norm": 0.3202174999860832,
"learning_rate": 2.5125909345271403e-05,
"loss": 0.5982,
"step": 1088
},
{
"epoch": 1.6430188679245283,
"grad_norm": 0.353421412519148,
"learning_rate": 2.5097929490766646e-05,
"loss": 0.6558,
"step": 1089
},
{
"epoch": 1.6445283018867924,
"grad_norm": 0.33590448835443065,
"learning_rate": 2.5069949636261893e-05,
"loss": 0.5849,
"step": 1090
},
{
"epoch": 1.6460377358490565,
"grad_norm": 0.3049009352630597,
"learning_rate": 2.5041969781757136e-05,
"loss": 0.6268,
"step": 1091
},
{
"epoch": 1.6475471698113209,
"grad_norm": 0.32814946592178634,
"learning_rate": 2.501398992725238e-05,
"loss": 0.571,
"step": 1092
},
{
"epoch": 1.6490566037735848,
"grad_norm": 0.32851218281151046,
"learning_rate": 2.4986010072747623e-05,
"loss": 0.5919,
"step": 1093
},
{
"epoch": 1.650566037735849,
"grad_norm": 0.29866330007475594,
"learning_rate": 2.4958030218242866e-05,
"loss": 0.609,
"step": 1094
},
{
"epoch": 1.6520754716981132,
"grad_norm": 0.2903575786430782,
"learning_rate": 2.493005036373811e-05,
"loss": 0.633,
"step": 1095
},
{
"epoch": 1.6535849056603773,
"grad_norm": 0.34536143070473535,
"learning_rate": 2.4902070509233353e-05,
"loss": 0.6005,
"step": 1096
},
{
"epoch": 1.6550943396226416,
"grad_norm": 0.3026599205345073,
"learning_rate": 2.4874090654728596e-05,
"loss": 0.6478,
"step": 1097
},
{
"epoch": 1.6566037735849055,
"grad_norm": 0.306175601002009,
"learning_rate": 2.484611080022384e-05,
"loss": 0.6195,
"step": 1098
},
{
"epoch": 1.6581132075471698,
"grad_norm": 0.3710617396392141,
"learning_rate": 2.4818130945719083e-05,
"loss": 0.6228,
"step": 1099
},
{
"epoch": 1.659622641509434,
"grad_norm": 0.3377466953849946,
"learning_rate": 2.4790151091214326e-05,
"loss": 0.6511,
"step": 1100
},
{
"epoch": 1.661132075471698,
"grad_norm": 0.31549413830814466,
"learning_rate": 2.476217123670957e-05,
"loss": 0.5633,
"step": 1101
},
{
"epoch": 1.6626415094339624,
"grad_norm": 0.357754789091578,
"learning_rate": 2.4734191382204813e-05,
"loss": 0.6337,
"step": 1102
},
{
"epoch": 1.6641509433962263,
"grad_norm": 0.3306548434162944,
"learning_rate": 2.4706211527700057e-05,
"loss": 0.5843,
"step": 1103
},
{
"epoch": 1.6656603773584906,
"grad_norm": 6.054734215990575,
"learning_rate": 2.46782316731953e-05,
"loss": 0.661,
"step": 1104
},
{
"epoch": 1.6671698113207547,
"grad_norm": 0.4115492900895262,
"learning_rate": 2.4650251818690547e-05,
"loss": 0.6434,
"step": 1105
},
{
"epoch": 1.6686792452830188,
"grad_norm": 0.822362998347998,
"learning_rate": 2.4622271964185787e-05,
"loss": 0.6846,
"step": 1106
},
{
"epoch": 1.6701886792452831,
"grad_norm": 0.4448253668967885,
"learning_rate": 2.459429210968103e-05,
"loss": 0.5628,
"step": 1107
},
{
"epoch": 1.671698113207547,
"grad_norm": 0.3770953555568863,
"learning_rate": 2.4566312255176273e-05,
"loss": 0.6269,
"step": 1108
},
{
"epoch": 1.6732075471698113,
"grad_norm": 0.5299343272830751,
"learning_rate": 2.4538332400671517e-05,
"loss": 0.6237,
"step": 1109
},
{
"epoch": 1.6747169811320755,
"grad_norm": 0.3702282688776482,
"learning_rate": 2.4510352546166763e-05,
"loss": 0.6586,
"step": 1110
},
{
"epoch": 1.6762264150943396,
"grad_norm": 0.39024405698123815,
"learning_rate": 2.4482372691662003e-05,
"loss": 0.6429,
"step": 1111
},
{
"epoch": 1.677735849056604,
"grad_norm": 0.4537391331752708,
"learning_rate": 2.4454392837157247e-05,
"loss": 0.639,
"step": 1112
},
{
"epoch": 1.6792452830188678,
"grad_norm": 0.3149708264349129,
"learning_rate": 2.442641298265249e-05,
"loss": 0.5799,
"step": 1113
},
{
"epoch": 1.680754716981132,
"grad_norm": 0.332009587191511,
"learning_rate": 2.4398433128147733e-05,
"loss": 0.5921,
"step": 1114
},
{
"epoch": 1.6822641509433962,
"grad_norm": 0.4987660993726321,
"learning_rate": 2.437045327364298e-05,
"loss": 0.5617,
"step": 1115
},
{
"epoch": 1.6837735849056603,
"grad_norm": 0.38823515270339287,
"learning_rate": 2.434247341913822e-05,
"loss": 0.6943,
"step": 1116
},
{
"epoch": 1.6852830188679246,
"grad_norm": 0.41948680248507986,
"learning_rate": 2.4314493564633464e-05,
"loss": 0.6714,
"step": 1117
},
{
"epoch": 1.6867924528301885,
"grad_norm": 0.41536448849439084,
"learning_rate": 2.4286513710128707e-05,
"loss": 0.6236,
"step": 1118
},
{
"epoch": 1.6883018867924529,
"grad_norm": 0.38203854076076366,
"learning_rate": 2.4258533855623954e-05,
"loss": 0.6081,
"step": 1119
},
{
"epoch": 1.689811320754717,
"grad_norm": 0.3167935649962668,
"learning_rate": 2.4230554001119197e-05,
"loss": 0.6062,
"step": 1120
},
{
"epoch": 1.691320754716981,
"grad_norm": 0.39343182435379037,
"learning_rate": 2.420257414661444e-05,
"loss": 0.6464,
"step": 1121
},
{
"epoch": 1.6928301886792454,
"grad_norm": 0.38741191325073565,
"learning_rate": 2.417459429210968e-05,
"loss": 0.5858,
"step": 1122
},
{
"epoch": 1.6943396226415093,
"grad_norm": 0.3110251540082128,
"learning_rate": 2.4146614437604924e-05,
"loss": 0.6742,
"step": 1123
},
{
"epoch": 1.6958490566037736,
"grad_norm": 0.4205404270555864,
"learning_rate": 2.411863458310017e-05,
"loss": 0.6817,
"step": 1124
},
{
"epoch": 1.6973584905660377,
"grad_norm": 0.3370144854328815,
"learning_rate": 2.4090654728595414e-05,
"loss": 0.6306,
"step": 1125
},
{
"epoch": 1.6988679245283018,
"grad_norm": 0.31428186192021673,
"learning_rate": 2.4062674874090657e-05,
"loss": 0.6701,
"step": 1126
},
{
"epoch": 1.7003773584905661,
"grad_norm": 0.35546961013819645,
"learning_rate": 2.4034695019585897e-05,
"loss": 0.5598,
"step": 1127
},
{
"epoch": 1.70188679245283,
"grad_norm": 0.30842915313942293,
"learning_rate": 2.400671516508114e-05,
"loss": 0.6555,
"step": 1128
},
{
"epoch": 1.7033962264150944,
"grad_norm": 0.3503526140590797,
"learning_rate": 2.3978735310576387e-05,
"loss": 0.6268,
"step": 1129
},
{
"epoch": 1.7049056603773585,
"grad_norm": 0.3525131623141097,
"learning_rate": 2.395075545607163e-05,
"loss": 0.651,
"step": 1130
},
{
"epoch": 1.7064150943396226,
"grad_norm": 0.362462685268756,
"learning_rate": 2.3922775601566874e-05,
"loss": 0.654,
"step": 1131
},
{
"epoch": 1.707924528301887,
"grad_norm": 0.35270111632126117,
"learning_rate": 2.3894795747062114e-05,
"loss": 0.6304,
"step": 1132
},
{
"epoch": 1.7094339622641508,
"grad_norm": 0.35642789633544897,
"learning_rate": 2.386681589255736e-05,
"loss": 0.569,
"step": 1133
},
{
"epoch": 1.7109433962264151,
"grad_norm": 0.3548672747435073,
"learning_rate": 2.3838836038052604e-05,
"loss": 0.6078,
"step": 1134
},
{
"epoch": 1.7124528301886792,
"grad_norm": 0.41883631687319944,
"learning_rate": 2.3810856183547848e-05,
"loss": 0.6137,
"step": 1135
},
{
"epoch": 1.7139622641509433,
"grad_norm": 0.37255769585734255,
"learning_rate": 2.378287632904309e-05,
"loss": 0.6341,
"step": 1136
},
{
"epoch": 1.7154716981132077,
"grad_norm": 0.343650289950691,
"learning_rate": 2.375489647453833e-05,
"loss": 0.6816,
"step": 1137
},
{
"epoch": 1.7169811320754715,
"grad_norm": 0.3135936860965841,
"learning_rate": 2.3726916620033578e-05,
"loss": 0.661,
"step": 1138
},
{
"epoch": 1.7184905660377359,
"grad_norm": 0.3354145369379278,
"learning_rate": 2.369893676552882e-05,
"loss": 0.6131,
"step": 1139
},
{
"epoch": 1.72,
"grad_norm": 0.37591403645106586,
"learning_rate": 2.3670956911024064e-05,
"loss": 0.7752,
"step": 1140
},
{
"epoch": 1.721509433962264,
"grad_norm": 3.276557779806758,
"learning_rate": 2.3642977056519308e-05,
"loss": 0.6548,
"step": 1141
},
{
"epoch": 1.7230188679245284,
"grad_norm": 0.40078418503283786,
"learning_rate": 2.361499720201455e-05,
"loss": 0.6305,
"step": 1142
},
{
"epoch": 1.7245283018867923,
"grad_norm": 0.3490281170396962,
"learning_rate": 2.3587017347509794e-05,
"loss": 0.6477,
"step": 1143
},
{
"epoch": 1.7260377358490566,
"grad_norm": 0.3130044965355929,
"learning_rate": 2.3559037493005038e-05,
"loss": 0.6318,
"step": 1144
},
{
"epoch": 1.7275471698113207,
"grad_norm": 0.31341978043560415,
"learning_rate": 2.353105763850028e-05,
"loss": 0.5936,
"step": 1145
},
{
"epoch": 1.7290566037735848,
"grad_norm": 0.4007373637031972,
"learning_rate": 2.3503077783995524e-05,
"loss": 0.6289,
"step": 1146
},
{
"epoch": 1.7305660377358492,
"grad_norm": 0.3145194497545859,
"learning_rate": 2.3475097929490768e-05,
"loss": 0.5794,
"step": 1147
},
{
"epoch": 1.732075471698113,
"grad_norm": 0.41907955688569665,
"learning_rate": 2.344711807498601e-05,
"loss": 0.6204,
"step": 1148
},
{
"epoch": 1.7335849056603774,
"grad_norm": 0.3368506672709832,
"learning_rate": 2.3419138220481255e-05,
"loss": 0.6453,
"step": 1149
},
{
"epoch": 1.7350943396226415,
"grad_norm": 0.3578725045779005,
"learning_rate": 2.3391158365976498e-05,
"loss": 0.6131,
"step": 1150
},
{
"epoch": 1.7366037735849056,
"grad_norm": 0.387646182019032,
"learning_rate": 2.336317851147174e-05,
"loss": 0.6591,
"step": 1151
},
{
"epoch": 1.73811320754717,
"grad_norm": 0.370813360320474,
"learning_rate": 2.3335198656966985e-05,
"loss": 0.6132,
"step": 1152
},
{
"epoch": 1.7396226415094338,
"grad_norm": 0.35325542219207184,
"learning_rate": 2.3307218802462228e-05,
"loss": 0.6152,
"step": 1153
},
{
"epoch": 1.7411320754716981,
"grad_norm": 0.34775995134307486,
"learning_rate": 2.327923894795747e-05,
"loss": 0.696,
"step": 1154
},
{
"epoch": 1.7426415094339622,
"grad_norm": 0.431893921598768,
"learning_rate": 2.3251259093452715e-05,
"loss": 0.5584,
"step": 1155
},
{
"epoch": 1.7441509433962263,
"grad_norm": 0.4818685295387096,
"learning_rate": 2.3223279238947958e-05,
"loss": 0.648,
"step": 1156
},
{
"epoch": 1.7456603773584907,
"grad_norm": 0.3148571492339726,
"learning_rate": 2.31952993844432e-05,
"loss": 0.6053,
"step": 1157
},
{
"epoch": 1.7471698113207546,
"grad_norm": 0.49474292238819984,
"learning_rate": 2.3167319529938445e-05,
"loss": 0.6756,
"step": 1158
},
{
"epoch": 1.7486792452830189,
"grad_norm": 0.42970659132808886,
"learning_rate": 2.3139339675433688e-05,
"loss": 0.6378,
"step": 1159
},
{
"epoch": 1.750188679245283,
"grad_norm": 0.3467646221311692,
"learning_rate": 2.311135982092893e-05,
"loss": 0.6767,
"step": 1160
},
{
"epoch": 1.751698113207547,
"grad_norm": 0.49298106645190726,
"learning_rate": 2.3083379966424175e-05,
"loss": 0.6482,
"step": 1161
},
{
"epoch": 1.7532075471698114,
"grad_norm": 0.38860132343453263,
"learning_rate": 2.305540011191942e-05,
"loss": 0.6828,
"step": 1162
},
{
"epoch": 1.7547169811320755,
"grad_norm": 0.36904627662475137,
"learning_rate": 2.302742025741466e-05,
"loss": 0.5228,
"step": 1163
},
{
"epoch": 1.7562264150943396,
"grad_norm": 0.3699438829613781,
"learning_rate": 2.2999440402909905e-05,
"loss": 0.7172,
"step": 1164
},
{
"epoch": 1.7577358490566037,
"grad_norm": 0.46842274714832544,
"learning_rate": 2.297146054840515e-05,
"loss": 0.6039,
"step": 1165
},
{
"epoch": 1.7592452830188678,
"grad_norm": 0.33529915395896437,
"learning_rate": 2.2943480693900392e-05,
"loss": 0.6698,
"step": 1166
},
{
"epoch": 1.7607547169811322,
"grad_norm": 1.0369116423976208,
"learning_rate": 2.291550083939564e-05,
"loss": 0.632,
"step": 1167
},
{
"epoch": 1.7622641509433963,
"grad_norm": 0.40116881234876345,
"learning_rate": 2.288752098489088e-05,
"loss": 0.5547,
"step": 1168
},
{
"epoch": 1.7637735849056604,
"grad_norm": 0.32276429823331587,
"learning_rate": 2.2859541130386122e-05,
"loss": 0.6588,
"step": 1169
},
{
"epoch": 1.7652830188679245,
"grad_norm": 0.43292042576567075,
"learning_rate": 2.2831561275881365e-05,
"loss": 0.6191,
"step": 1170
},
{
"epoch": 1.7667924528301886,
"grad_norm": 0.3272559617175695,
"learning_rate": 2.280358142137661e-05,
"loss": 0.6573,
"step": 1171
},
{
"epoch": 1.768301886792453,
"grad_norm": 0.3315275623212996,
"learning_rate": 2.2775601566871855e-05,
"loss": 0.5584,
"step": 1172
},
{
"epoch": 1.769811320754717,
"grad_norm": 0.3839260631685633,
"learning_rate": 2.27476217123671e-05,
"loss": 0.6137,
"step": 1173
},
{
"epoch": 1.7713207547169811,
"grad_norm": 0.29910749329288777,
"learning_rate": 2.271964185786234e-05,
"loss": 0.593,
"step": 1174
},
{
"epoch": 1.7728301886792452,
"grad_norm": 0.3966476953266179,
"learning_rate": 2.2691662003357582e-05,
"loss": 0.5881,
"step": 1175
},
{
"epoch": 1.7743396226415094,
"grad_norm": 0.3834221903344162,
"learning_rate": 2.266368214885283e-05,
"loss": 0.652,
"step": 1176
},
{
"epoch": 1.7758490566037737,
"grad_norm": 0.3086183820074408,
"learning_rate": 2.2635702294348072e-05,
"loss": 0.6265,
"step": 1177
},
{
"epoch": 1.7773584905660378,
"grad_norm": 0.3760096804332004,
"learning_rate": 2.2607722439843315e-05,
"loss": 0.5765,
"step": 1178
},
{
"epoch": 1.778867924528302,
"grad_norm": 0.28869156048512906,
"learning_rate": 2.2579742585338555e-05,
"loss": 0.6032,
"step": 1179
},
{
"epoch": 1.780377358490566,
"grad_norm": 0.3723280148627601,
"learning_rate": 2.25517627308338e-05,
"loss": 0.6364,
"step": 1180
},
{
"epoch": 1.78188679245283,
"grad_norm": 0.3073245665980812,
"learning_rate": 2.2523782876329046e-05,
"loss": 0.5801,
"step": 1181
},
{
"epoch": 1.7833962264150944,
"grad_norm": 0.30275210444327105,
"learning_rate": 2.249580302182429e-05,
"loss": 0.6272,
"step": 1182
},
{
"epoch": 1.7849056603773585,
"grad_norm": 0.3482396962287078,
"learning_rate": 2.2467823167319532e-05,
"loss": 0.6292,
"step": 1183
},
{
"epoch": 1.7864150943396226,
"grad_norm": 0.3036844211927974,
"learning_rate": 2.2439843312814772e-05,
"loss": 0.6301,
"step": 1184
},
{
"epoch": 1.7879245283018868,
"grad_norm": 0.36892257197797645,
"learning_rate": 2.2411863458310016e-05,
"loss": 0.601,
"step": 1185
},
{
"epoch": 1.7894339622641509,
"grad_norm": 0.30478398813561736,
"learning_rate": 2.2383883603805262e-05,
"loss": 0.5778,
"step": 1186
},
{
"epoch": 1.7909433962264152,
"grad_norm": 0.41357451178016175,
"learning_rate": 2.2355903749300506e-05,
"loss": 0.6415,
"step": 1187
},
{
"epoch": 1.7924528301886793,
"grad_norm": 0.40936383915996677,
"learning_rate": 2.232792389479575e-05,
"loss": 0.6318,
"step": 1188
},
{
"epoch": 1.7939622641509434,
"grad_norm": 0.3305974308413721,
"learning_rate": 2.229994404029099e-05,
"loss": 0.6276,
"step": 1189
},
{
"epoch": 1.7954716981132075,
"grad_norm": 0.44756079605901494,
"learning_rate": 2.2271964185786236e-05,
"loss": 0.5627,
"step": 1190
},
{
"epoch": 1.7969811320754716,
"grad_norm": 0.32668591688032894,
"learning_rate": 2.224398433128148e-05,
"loss": 0.5553,
"step": 1191
},
{
"epoch": 1.798490566037736,
"grad_norm": 0.4056153389625477,
"learning_rate": 2.2216004476776723e-05,
"loss": 0.6964,
"step": 1192
},
{
"epoch": 1.8,
"grad_norm": 0.36423242716401244,
"learning_rate": 2.2188024622271966e-05,
"loss": 0.6385,
"step": 1193
},
{
"epoch": 1.8015094339622642,
"grad_norm": 0.3714533894188977,
"learning_rate": 2.2160044767767206e-05,
"loss": 0.5732,
"step": 1194
},
{
"epoch": 1.8030188679245283,
"grad_norm": 0.4092859262603853,
"learning_rate": 2.2132064913262453e-05,
"loss": 0.6767,
"step": 1195
},
{
"epoch": 1.8045283018867924,
"grad_norm": 0.4137797947792045,
"learning_rate": 2.2104085058757696e-05,
"loss": 0.5931,
"step": 1196
},
{
"epoch": 1.8060377358490567,
"grad_norm": 0.34744989842603025,
"learning_rate": 2.207610520425294e-05,
"loss": 0.6201,
"step": 1197
},
{
"epoch": 1.8075471698113208,
"grad_norm": 0.3331997614705039,
"learning_rate": 2.2048125349748183e-05,
"loss": 0.6613,
"step": 1198
},
{
"epoch": 1.809056603773585,
"grad_norm": 0.511957469574061,
"learning_rate": 2.2020145495243423e-05,
"loss": 0.5911,
"step": 1199
},
{
"epoch": 1.810566037735849,
"grad_norm": 0.33360941658253085,
"learning_rate": 2.199216564073867e-05,
"loss": 0.5849,
"step": 1200
},
{
"epoch": 1.8120754716981131,
"grad_norm": 0.3741495659072703,
"learning_rate": 2.1964185786233913e-05,
"loss": 0.609,
"step": 1201
},
{
"epoch": 1.8135849056603774,
"grad_norm": 0.4742678582136245,
"learning_rate": 2.1936205931729156e-05,
"loss": 0.6193,
"step": 1202
},
{
"epoch": 1.8150943396226416,
"grad_norm": 0.44917206739099247,
"learning_rate": 2.19082260772244e-05,
"loss": 0.692,
"step": 1203
},
{
"epoch": 1.8166037735849057,
"grad_norm": 0.394176326005825,
"learning_rate": 2.1880246222719643e-05,
"loss": 0.6738,
"step": 1204
},
{
"epoch": 1.8181132075471698,
"grad_norm": 0.43066185484760017,
"learning_rate": 2.1852266368214886e-05,
"loss": 0.642,
"step": 1205
},
{
"epoch": 1.8196226415094339,
"grad_norm": 0.3418450949122748,
"learning_rate": 2.182428651371013e-05,
"loss": 0.6076,
"step": 1206
},
{
"epoch": 1.8211320754716982,
"grad_norm": 0.3652282110342225,
"learning_rate": 2.1796306659205373e-05,
"loss": 0.6063,
"step": 1207
},
{
"epoch": 1.8226415094339623,
"grad_norm": 0.30250644712093494,
"learning_rate": 2.1768326804700616e-05,
"loss": 0.5631,
"step": 1208
},
{
"epoch": 1.8241509433962264,
"grad_norm": 0.3288737231812076,
"learning_rate": 2.174034695019586e-05,
"loss": 0.592,
"step": 1209
},
{
"epoch": 1.8256603773584905,
"grad_norm": 0.33567232845354483,
"learning_rate": 2.1712367095691103e-05,
"loss": 0.6278,
"step": 1210
},
{
"epoch": 1.8271698113207546,
"grad_norm": 0.3277915063716108,
"learning_rate": 2.1684387241186346e-05,
"loss": 0.6181,
"step": 1211
},
{
"epoch": 1.828679245283019,
"grad_norm": 0.3265756953441019,
"learning_rate": 2.165640738668159e-05,
"loss": 0.6163,
"step": 1212
},
{
"epoch": 1.830188679245283,
"grad_norm": 0.36101523587088497,
"learning_rate": 2.1628427532176833e-05,
"loss": 0.6528,
"step": 1213
},
{
"epoch": 1.8316981132075472,
"grad_norm": 0.3559998541778918,
"learning_rate": 2.1600447677672077e-05,
"loss": 0.5998,
"step": 1214
},
{
"epoch": 1.8332075471698113,
"grad_norm": 0.293141053328167,
"learning_rate": 2.157246782316732e-05,
"loss": 0.5898,
"step": 1215
},
{
"epoch": 1.8347169811320754,
"grad_norm": 0.29122255758943166,
"learning_rate": 2.1544487968662563e-05,
"loss": 0.6354,
"step": 1216
},
{
"epoch": 1.8362264150943397,
"grad_norm": 1.045084927109783,
"learning_rate": 2.1516508114157807e-05,
"loss": 0.5911,
"step": 1217
},
{
"epoch": 1.8377358490566038,
"grad_norm": 0.31757940698589965,
"learning_rate": 2.148852825965305e-05,
"loss": 0.6768,
"step": 1218
},
{
"epoch": 1.839245283018868,
"grad_norm": 0.32235934503323127,
"learning_rate": 2.1460548405148297e-05,
"loss": 0.6418,
"step": 1219
},
{
"epoch": 1.840754716981132,
"grad_norm": 0.3428069649816916,
"learning_rate": 2.1432568550643537e-05,
"loss": 0.6437,
"step": 1220
},
{
"epoch": 1.8422641509433961,
"grad_norm": 0.3059598449926168,
"learning_rate": 2.140458869613878e-05,
"loss": 0.6399,
"step": 1221
},
{
"epoch": 1.8437735849056605,
"grad_norm": 0.29285459882489406,
"learning_rate": 2.1376608841634023e-05,
"loss": 0.6112,
"step": 1222
},
{
"epoch": 1.8452830188679246,
"grad_norm": 1.0373182604259723,
"learning_rate": 2.1348628987129267e-05,
"loss": 0.6441,
"step": 1223
},
{
"epoch": 1.8467924528301887,
"grad_norm": 0.2824305236439785,
"learning_rate": 2.1320649132624514e-05,
"loss": 0.6141,
"step": 1224
},
{
"epoch": 1.8483018867924528,
"grad_norm": 0.40148675247914556,
"learning_rate": 2.1292669278119754e-05,
"loss": 0.6118,
"step": 1225
},
{
"epoch": 1.8498113207547169,
"grad_norm": 0.2981810451237384,
"learning_rate": 2.1264689423614997e-05,
"loss": 0.6731,
"step": 1226
},
{
"epoch": 1.8513207547169812,
"grad_norm": 2.584492019717478,
"learning_rate": 2.123670956911024e-05,
"loss": 0.616,
"step": 1227
},
{
"epoch": 1.8528301886792453,
"grad_norm": 0.3591861226985177,
"learning_rate": 2.1208729714605484e-05,
"loss": 0.6236,
"step": 1228
},
{
"epoch": 1.8543396226415094,
"grad_norm": 0.3095222847651958,
"learning_rate": 2.118074986010073e-05,
"loss": 0.6523,
"step": 1229
},
{
"epoch": 1.8558490566037738,
"grad_norm": 0.3424147285170496,
"learning_rate": 2.1152770005595974e-05,
"loss": 0.6396,
"step": 1230
},
{
"epoch": 1.8573584905660376,
"grad_norm": 0.314080508242106,
"learning_rate": 2.1124790151091214e-05,
"loss": 0.5968,
"step": 1231
},
{
"epoch": 1.858867924528302,
"grad_norm": 0.29525316784574474,
"learning_rate": 2.1096810296586457e-05,
"loss": 0.6179,
"step": 1232
},
{
"epoch": 1.860377358490566,
"grad_norm": 0.3262826338661236,
"learning_rate": 2.1068830442081704e-05,
"loss": 0.6361,
"step": 1233
},
{
"epoch": 1.8618867924528302,
"grad_norm": 0.30483158690049045,
"learning_rate": 2.1040850587576947e-05,
"loss": 0.5972,
"step": 1234
},
{
"epoch": 1.8633962264150945,
"grad_norm": 0.2998154483089429,
"learning_rate": 2.101287073307219e-05,
"loss": 0.6319,
"step": 1235
},
{
"epoch": 1.8649056603773584,
"grad_norm": 0.3132929823968501,
"learning_rate": 2.098489087856743e-05,
"loss": 0.6272,
"step": 1236
},
{
"epoch": 1.8664150943396227,
"grad_norm": 0.3644256019767862,
"learning_rate": 2.0956911024062674e-05,
"loss": 0.6275,
"step": 1237
},
{
"epoch": 1.8679245283018868,
"grad_norm": 0.3619907633958512,
"learning_rate": 2.092893116955792e-05,
"loss": 0.6324,
"step": 1238
},
{
"epoch": 1.869433962264151,
"grad_norm": 0.30644024116691077,
"learning_rate": 2.0900951315053164e-05,
"loss": 0.6487,
"step": 1239
},
{
"epoch": 1.8709433962264153,
"grad_norm": 0.32101407266001664,
"learning_rate": 2.0872971460548407e-05,
"loss": 0.5942,
"step": 1240
},
{
"epoch": 1.8724528301886791,
"grad_norm": 0.3574797990980977,
"learning_rate": 2.0844991606043647e-05,
"loss": 0.6058,
"step": 1241
},
{
"epoch": 1.8739622641509435,
"grad_norm": 0.2731432397008649,
"learning_rate": 2.081701175153889e-05,
"loss": 0.5838,
"step": 1242
},
{
"epoch": 1.8754716981132076,
"grad_norm": 0.3081971583676486,
"learning_rate": 2.0789031897034137e-05,
"loss": 0.5985,
"step": 1243
},
{
"epoch": 1.8769811320754717,
"grad_norm": 0.4638755248661489,
"learning_rate": 2.076105204252938e-05,
"loss": 0.5448,
"step": 1244
},
{
"epoch": 1.878490566037736,
"grad_norm": 0.36428569342156447,
"learning_rate": 2.0733072188024624e-05,
"loss": 0.6677,
"step": 1245
},
{
"epoch": 1.88,
"grad_norm": 0.32794651307072614,
"learning_rate": 2.0705092333519864e-05,
"loss": 0.6155,
"step": 1246
},
{
"epoch": 1.8815094339622642,
"grad_norm": 0.817788237288649,
"learning_rate": 2.067711247901511e-05,
"loss": 0.6651,
"step": 1247
},
{
"epoch": 1.8830188679245283,
"grad_norm": 0.32900653987481365,
"learning_rate": 2.0649132624510354e-05,
"loss": 0.6534,
"step": 1248
},
{
"epoch": 1.8845283018867924,
"grad_norm": 0.354259188328193,
"learning_rate": 2.0621152770005598e-05,
"loss": 0.5931,
"step": 1249
},
{
"epoch": 1.8860377358490568,
"grad_norm": 0.3280985985620716,
"learning_rate": 2.059317291550084e-05,
"loss": 0.6111,
"step": 1250
},
{
"epoch": 1.8875471698113206,
"grad_norm": 0.3875058797193474,
"learning_rate": 2.056519306099608e-05,
"loss": 0.5952,
"step": 1251
},
{
"epoch": 1.889056603773585,
"grad_norm": 0.37671901941800245,
"learning_rate": 2.0537213206491328e-05,
"loss": 0.6522,
"step": 1252
},
{
"epoch": 1.890566037735849,
"grad_norm": 0.4128531980352486,
"learning_rate": 2.050923335198657e-05,
"loss": 0.6864,
"step": 1253
},
{
"epoch": 1.8920754716981132,
"grad_norm": 0.3164903698692737,
"learning_rate": 2.0481253497481814e-05,
"loss": 0.5493,
"step": 1254
},
{
"epoch": 1.8935849056603775,
"grad_norm": 0.3960462076340831,
"learning_rate": 2.0453273642977058e-05,
"loss": 0.5976,
"step": 1255
},
{
"epoch": 1.8950943396226414,
"grad_norm": 0.35480881312131624,
"learning_rate": 2.04252937884723e-05,
"loss": 0.5845,
"step": 1256
},
{
"epoch": 1.8966037735849057,
"grad_norm": 0.34522057536500367,
"learning_rate": 2.0397313933967545e-05,
"loss": 0.6556,
"step": 1257
},
{
"epoch": 1.8981132075471698,
"grad_norm": 0.337296443279804,
"learning_rate": 2.0369334079462788e-05,
"loss": 0.6814,
"step": 1258
},
{
"epoch": 1.899622641509434,
"grad_norm": 0.2805168811056917,
"learning_rate": 2.034135422495803e-05,
"loss": 0.6272,
"step": 1259
},
{
"epoch": 1.9011320754716983,
"grad_norm": 0.3449624394911455,
"learning_rate": 2.0313374370453275e-05,
"loss": 0.6474,
"step": 1260
},
{
"epoch": 1.9026415094339622,
"grad_norm": 0.3280572211846723,
"learning_rate": 2.0285394515948518e-05,
"loss": 0.6347,
"step": 1261
},
{
"epoch": 1.9041509433962265,
"grad_norm": 0.35714732927573933,
"learning_rate": 2.025741466144376e-05,
"loss": 0.5295,
"step": 1262
},
{
"epoch": 1.9056603773584906,
"grad_norm": 0.35049276342588254,
"learning_rate": 2.0229434806939005e-05,
"loss": 0.586,
"step": 1263
},
{
"epoch": 1.9071698113207547,
"grad_norm": 0.36879482492260923,
"learning_rate": 2.0201454952434248e-05,
"loss": 0.5682,
"step": 1264
},
{
"epoch": 1.908679245283019,
"grad_norm": 0.300565774847212,
"learning_rate": 2.017347509792949e-05,
"loss": 0.6407,
"step": 1265
},
{
"epoch": 1.910188679245283,
"grad_norm": 0.29997892836529605,
"learning_rate": 2.0145495243424735e-05,
"loss": 0.5913,
"step": 1266
},
{
"epoch": 1.9116981132075472,
"grad_norm": 0.3222923618904655,
"learning_rate": 2.0117515388919978e-05,
"loss": 0.6426,
"step": 1267
},
{
"epoch": 1.9132075471698113,
"grad_norm": 0.34653857778460406,
"learning_rate": 2.008953553441522e-05,
"loss": 0.629,
"step": 1268
},
{
"epoch": 1.9147169811320754,
"grad_norm": 0.36204978834352647,
"learning_rate": 2.0061555679910465e-05,
"loss": 0.6626,
"step": 1269
},
{
"epoch": 1.9162264150943398,
"grad_norm": 0.30703816406770273,
"learning_rate": 2.0033575825405708e-05,
"loss": 0.6269,
"step": 1270
},
{
"epoch": 1.9177358490566037,
"grad_norm": 0.45526307662284454,
"learning_rate": 2.000559597090095e-05,
"loss": 0.6394,
"step": 1271
},
{
"epoch": 1.919245283018868,
"grad_norm": 0.3428934663134498,
"learning_rate": 1.9977616116396195e-05,
"loss": 0.5814,
"step": 1272
},
{
"epoch": 1.920754716981132,
"grad_norm": 0.3483198311610776,
"learning_rate": 1.994963626189144e-05,
"loss": 0.6114,
"step": 1273
},
{
"epoch": 1.9222641509433962,
"grad_norm": 0.4806434783200171,
"learning_rate": 1.9921656407386682e-05,
"loss": 0.6376,
"step": 1274
},
{
"epoch": 1.9237735849056605,
"grad_norm": 0.3746332153159746,
"learning_rate": 1.9893676552881925e-05,
"loss": 0.6103,
"step": 1275
},
{
"epoch": 1.9252830188679244,
"grad_norm": 0.330995548312209,
"learning_rate": 1.9865696698377172e-05,
"loss": 0.5888,
"step": 1276
},
{
"epoch": 1.9267924528301887,
"grad_norm": 0.33991119023837807,
"learning_rate": 1.9837716843872412e-05,
"loss": 0.6318,
"step": 1277
},
{
"epoch": 1.9283018867924528,
"grad_norm": 0.7848019038955028,
"learning_rate": 1.9809736989367655e-05,
"loss": 0.6823,
"step": 1278
},
{
"epoch": 1.929811320754717,
"grad_norm": 0.36581755414914946,
"learning_rate": 1.97817571348629e-05,
"loss": 0.631,
"step": 1279
},
{
"epoch": 1.9313207547169813,
"grad_norm": 0.3516181876894466,
"learning_rate": 1.9753777280358142e-05,
"loss": 0.623,
"step": 1280
},
{
"epoch": 1.9328301886792452,
"grad_norm": 0.34076006303037526,
"learning_rate": 1.972579742585339e-05,
"loss": 0.6362,
"step": 1281
},
{
"epoch": 1.9343396226415095,
"grad_norm": 0.38953097373298856,
"learning_rate": 1.9697817571348632e-05,
"loss": 0.5828,
"step": 1282
},
{
"epoch": 1.9358490566037736,
"grad_norm": 0.30915722554617087,
"learning_rate": 1.9669837716843872e-05,
"loss": 0.6231,
"step": 1283
},
{
"epoch": 1.9373584905660377,
"grad_norm": 0.4125242098883407,
"learning_rate": 1.9641857862339115e-05,
"loss": 0.6137,
"step": 1284
},
{
"epoch": 1.938867924528302,
"grad_norm": 0.306469481947522,
"learning_rate": 1.961387800783436e-05,
"loss": 0.6435,
"step": 1285
},
{
"epoch": 1.940377358490566,
"grad_norm": 0.3902186395516134,
"learning_rate": 1.9585898153329605e-05,
"loss": 0.5786,
"step": 1286
},
{
"epoch": 1.9418867924528302,
"grad_norm": 0.42033083017228917,
"learning_rate": 1.955791829882485e-05,
"loss": 0.6479,
"step": 1287
},
{
"epoch": 1.9433962264150944,
"grad_norm": 0.376407214561354,
"learning_rate": 1.952993844432009e-05,
"loss": 0.5883,
"step": 1288
},
{
"epoch": 1.9449056603773585,
"grad_norm": 0.3254588344744923,
"learning_rate": 1.9501958589815332e-05,
"loss": 0.6866,
"step": 1289
},
{
"epoch": 1.9464150943396228,
"grad_norm": 0.5212042890070583,
"learning_rate": 1.947397873531058e-05,
"loss": 0.667,
"step": 1290
},
{
"epoch": 1.9479245283018867,
"grad_norm": 0.48459611656974566,
"learning_rate": 1.9445998880805822e-05,
"loss": 0.5971,
"step": 1291
},
{
"epoch": 1.949433962264151,
"grad_norm": 0.3517856389474132,
"learning_rate": 1.9418019026301066e-05,
"loss": 0.5731,
"step": 1292
},
{
"epoch": 1.950943396226415,
"grad_norm": 0.37784085824797087,
"learning_rate": 1.9390039171796306e-05,
"loss": 0.6935,
"step": 1293
},
{
"epoch": 1.9524528301886792,
"grad_norm": 0.4003678989850066,
"learning_rate": 1.936205931729155e-05,
"loss": 0.5787,
"step": 1294
},
{
"epoch": 1.9539622641509435,
"grad_norm": 0.38383200350522656,
"learning_rate": 1.9334079462786796e-05,
"loss": 0.5821,
"step": 1295
},
{
"epoch": 1.9554716981132074,
"grad_norm": 0.38567119387350274,
"learning_rate": 1.930609960828204e-05,
"loss": 0.6339,
"step": 1296
},
{
"epoch": 1.9569811320754718,
"grad_norm": 0.3505634513365029,
"learning_rate": 1.9278119753777282e-05,
"loss": 0.6306,
"step": 1297
},
{
"epoch": 1.9584905660377359,
"grad_norm": 0.36433000051048686,
"learning_rate": 1.9250139899272522e-05,
"loss": 0.6311,
"step": 1298
},
{
"epoch": 1.96,
"grad_norm": 0.32889466881528095,
"learning_rate": 1.9222160044767766e-05,
"loss": 0.5904,
"step": 1299
},
{
"epoch": 1.9615094339622643,
"grad_norm": 0.31496527251241535,
"learning_rate": 1.9194180190263013e-05,
"loss": 0.628,
"step": 1300
},
{
"epoch": 1.9630188679245282,
"grad_norm": 3.183835781238465,
"learning_rate": 1.9166200335758256e-05,
"loss": 0.6688,
"step": 1301
},
{
"epoch": 1.9645283018867925,
"grad_norm": 2.11988404901896,
"learning_rate": 1.91382204812535e-05,
"loss": 0.5751,
"step": 1302
},
{
"epoch": 1.9660377358490566,
"grad_norm": 0.39102169211455845,
"learning_rate": 1.911024062674874e-05,
"loss": 0.6105,
"step": 1303
},
{
"epoch": 1.9675471698113207,
"grad_norm": 0.3356373344184646,
"learning_rate": 1.9082260772243986e-05,
"loss": 0.6691,
"step": 1304
},
{
"epoch": 1.969056603773585,
"grad_norm": 0.33936850902738813,
"learning_rate": 1.905428091773923e-05,
"loss": 0.5818,
"step": 1305
},
{
"epoch": 1.970566037735849,
"grad_norm": 0.3125127190755324,
"learning_rate": 1.9026301063234473e-05,
"loss": 0.6903,
"step": 1306
},
{
"epoch": 1.9720754716981133,
"grad_norm": 0.37220176903618524,
"learning_rate": 1.8998321208729716e-05,
"loss": 0.6372,
"step": 1307
},
{
"epoch": 1.9735849056603774,
"grad_norm": 0.333323524072569,
"learning_rate": 1.897034135422496e-05,
"loss": 0.5685,
"step": 1308
},
{
"epoch": 1.9750943396226415,
"grad_norm": 0.30576012281180814,
"learning_rate": 1.8942361499720203e-05,
"loss": 0.6038,
"step": 1309
},
{
"epoch": 1.9766037735849058,
"grad_norm": 0.3945529751240445,
"learning_rate": 1.8914381645215446e-05,
"loss": 0.6457,
"step": 1310
},
{
"epoch": 1.9781132075471697,
"grad_norm": 0.2774316641209328,
"learning_rate": 1.888640179071069e-05,
"loss": 0.6567,
"step": 1311
},
{
"epoch": 1.979622641509434,
"grad_norm": 0.35117826532025936,
"learning_rate": 1.8858421936205933e-05,
"loss": 0.6308,
"step": 1312
},
{
"epoch": 1.9811320754716981,
"grad_norm": 0.3419364217067856,
"learning_rate": 1.8830442081701176e-05,
"loss": 0.6162,
"step": 1313
},
{
"epoch": 1.9826415094339622,
"grad_norm": 0.36693025313968536,
"learning_rate": 1.880246222719642e-05,
"loss": 0.634,
"step": 1314
},
{
"epoch": 1.9841509433962266,
"grad_norm": 0.3183259269090567,
"learning_rate": 1.8774482372691663e-05,
"loss": 0.5759,
"step": 1315
},
{
"epoch": 1.9856603773584904,
"grad_norm": 2.9550667184423682,
"learning_rate": 1.8746502518186906e-05,
"loss": 0.712,
"step": 1316
},
{
"epoch": 1.9871698113207548,
"grad_norm": 0.36436478637302866,
"learning_rate": 1.871852266368215e-05,
"loss": 0.6046,
"step": 1317
},
{
"epoch": 1.9886792452830189,
"grad_norm": 0.3513792552765972,
"learning_rate": 1.8690542809177393e-05,
"loss": 0.6677,
"step": 1318
},
{
"epoch": 1.990188679245283,
"grad_norm": 0.32038155032867643,
"learning_rate": 1.8662562954672636e-05,
"loss": 0.6549,
"step": 1319
},
{
"epoch": 1.9916981132075473,
"grad_norm": 0.31445105810605595,
"learning_rate": 1.863458310016788e-05,
"loss": 0.6003,
"step": 1320
},
{
"epoch": 1.9932075471698112,
"grad_norm": 0.4328766003560714,
"learning_rate": 1.8606603245663123e-05,
"loss": 0.6476,
"step": 1321
},
{
"epoch": 1.9947169811320755,
"grad_norm": 0.32805033665845823,
"learning_rate": 1.8578623391158366e-05,
"loss": 0.6121,
"step": 1322
},
{
"epoch": 1.9962264150943396,
"grad_norm": 0.3016162458656436,
"learning_rate": 1.855064353665361e-05,
"loss": 0.5952,
"step": 1323
},
{
"epoch": 1.9977358490566037,
"grad_norm": 0.3131440934220455,
"learning_rate": 1.8522663682148853e-05,
"loss": 0.5831,
"step": 1324
},
{
"epoch": 1.999245283018868,
"grad_norm": 0.29401127063820814,
"learning_rate": 1.8494683827644097e-05,
"loss": 0.5822,
"step": 1325
},
{
"epoch": 2.0,
"grad_norm": 0.49733230922246346,
"learning_rate": 1.846670397313934e-05,
"loss": 0.6198,
"step": 1326
},
{
"epoch": 2.0015094339622643,
"grad_norm": 0.35646209321907746,
"learning_rate": 1.8438724118634583e-05,
"loss": 0.4999,
"step": 1327
},
{
"epoch": 2.003018867924528,
"grad_norm": 0.3981125944090188,
"learning_rate": 1.8410744264129827e-05,
"loss": 0.4977,
"step": 1328
},
{
"epoch": 2.0045283018867925,
"grad_norm": 0.3736032043806837,
"learning_rate": 1.838276440962507e-05,
"loss": 0.541,
"step": 1329
},
{
"epoch": 2.0060377358490564,
"grad_norm": 0.30212541569145873,
"learning_rate": 1.8354784555120313e-05,
"loss": 0.4858,
"step": 1330
},
{
"epoch": 2.0075471698113208,
"grad_norm": 0.3073556357585776,
"learning_rate": 1.8326804700615557e-05,
"loss": 0.5015,
"step": 1331
},
{
"epoch": 2.009056603773585,
"grad_norm": 0.34113812336109905,
"learning_rate": 1.82988248461108e-05,
"loss": 0.5172,
"step": 1332
},
{
"epoch": 2.010566037735849,
"grad_norm": 4.487795810249564,
"learning_rate": 1.8270844991606047e-05,
"loss": 0.5514,
"step": 1333
},
{
"epoch": 2.0120754716981133,
"grad_norm": 0.42900541776859996,
"learning_rate": 1.824286513710129e-05,
"loss": 0.4934,
"step": 1334
},
{
"epoch": 2.013584905660377,
"grad_norm": 3.065346721241103,
"learning_rate": 1.821488528259653e-05,
"loss": 0.5091,
"step": 1335
},
{
"epoch": 2.0150943396226415,
"grad_norm": 0.40144469758266016,
"learning_rate": 1.8186905428091774e-05,
"loss": 0.5088,
"step": 1336
},
{
"epoch": 2.016603773584906,
"grad_norm": 0.40845248070738877,
"learning_rate": 1.8158925573587017e-05,
"loss": 0.5574,
"step": 1337
},
{
"epoch": 2.0181132075471697,
"grad_norm": 3.392575632261363,
"learning_rate": 1.8130945719082264e-05,
"loss": 0.5533,
"step": 1338
},
{
"epoch": 2.019622641509434,
"grad_norm": 0.45092263682500694,
"learning_rate": 1.8102965864577507e-05,
"loss": 0.5609,
"step": 1339
},
{
"epoch": 2.021132075471698,
"grad_norm": 0.37318211497573206,
"learning_rate": 1.8074986010072747e-05,
"loss": 0.5026,
"step": 1340
},
{
"epoch": 2.0226415094339623,
"grad_norm": 0.34400471516145126,
"learning_rate": 1.804700615556799e-05,
"loss": 0.4826,
"step": 1341
},
{
"epoch": 2.0241509433962266,
"grad_norm": 0.3500144022243311,
"learning_rate": 1.8019026301063234e-05,
"loss": 0.4663,
"step": 1342
},
{
"epoch": 2.0256603773584905,
"grad_norm": 0.34048486473138423,
"learning_rate": 1.799104644655848e-05,
"loss": 0.4835,
"step": 1343
},
{
"epoch": 2.027169811320755,
"grad_norm": 0.33541429775955595,
"learning_rate": 1.7963066592053724e-05,
"loss": 0.5427,
"step": 1344
},
{
"epoch": 2.0286792452830187,
"grad_norm": 0.3600896985895227,
"learning_rate": 1.7935086737548964e-05,
"loss": 0.516,
"step": 1345
},
{
"epoch": 2.030188679245283,
"grad_norm": 0.3429326727185839,
"learning_rate": 1.7907106883044207e-05,
"loss": 0.5144,
"step": 1346
},
{
"epoch": 2.0316981132075473,
"grad_norm": 0.31543021450047887,
"learning_rate": 1.7879127028539454e-05,
"loss": 0.5449,
"step": 1347
},
{
"epoch": 2.0332075471698112,
"grad_norm": 0.27664831389978795,
"learning_rate": 1.7851147174034697e-05,
"loss": 0.4857,
"step": 1348
},
{
"epoch": 2.0347169811320756,
"grad_norm": 0.32371945804571334,
"learning_rate": 1.782316731952994e-05,
"loss": 0.5512,
"step": 1349
},
{
"epoch": 2.0362264150943394,
"grad_norm": 0.3123789070192699,
"learning_rate": 1.779518746502518e-05,
"loss": 0.5339,
"step": 1350
},
{
"epoch": 2.0377358490566038,
"grad_norm": 0.3018082555969256,
"learning_rate": 1.7767207610520424e-05,
"loss": 0.5477,
"step": 1351
},
{
"epoch": 2.039245283018868,
"grad_norm": 0.330825753840397,
"learning_rate": 1.773922775601567e-05,
"loss": 0.5226,
"step": 1352
},
{
"epoch": 2.040754716981132,
"grad_norm": 0.3064556391674686,
"learning_rate": 1.7711247901510914e-05,
"loss": 0.5009,
"step": 1353
},
{
"epoch": 2.0422641509433963,
"grad_norm": 0.3279356419954503,
"learning_rate": 1.7683268047006157e-05,
"loss": 0.5007,
"step": 1354
},
{
"epoch": 2.04377358490566,
"grad_norm": 0.3562043309709785,
"learning_rate": 1.7655288192501397e-05,
"loss": 0.4928,
"step": 1355
},
{
"epoch": 2.0452830188679245,
"grad_norm": 0.2992958064668747,
"learning_rate": 1.762730833799664e-05,
"loss": 0.513,
"step": 1356
},
{
"epoch": 2.046792452830189,
"grad_norm": 0.36710409137930194,
"learning_rate": 1.7599328483491888e-05,
"loss": 0.5065,
"step": 1357
},
{
"epoch": 2.0483018867924527,
"grad_norm": 0.3134601371847729,
"learning_rate": 1.757134862898713e-05,
"loss": 0.4913,
"step": 1358
},
{
"epoch": 2.049811320754717,
"grad_norm": 0.27689129271088303,
"learning_rate": 1.7543368774482374e-05,
"loss": 0.4884,
"step": 1359
},
{
"epoch": 2.051320754716981,
"grad_norm": 0.3714023246669663,
"learning_rate": 1.7515388919977618e-05,
"loss": 0.5384,
"step": 1360
},
{
"epoch": 2.0528301886792453,
"grad_norm": 0.2847785796252443,
"learning_rate": 1.748740906547286e-05,
"loss": 0.503,
"step": 1361
},
{
"epoch": 2.0543396226415096,
"grad_norm": 0.31030468078337337,
"learning_rate": 1.7459429210968104e-05,
"loss": 0.5054,
"step": 1362
},
{
"epoch": 2.0558490566037735,
"grad_norm": 0.2888211469303781,
"learning_rate": 1.7431449356463348e-05,
"loss": 0.5146,
"step": 1363
},
{
"epoch": 2.057358490566038,
"grad_norm": 0.3059017261234641,
"learning_rate": 1.740346950195859e-05,
"loss": 0.5527,
"step": 1364
},
{
"epoch": 2.0588679245283017,
"grad_norm": 0.2880026611275554,
"learning_rate": 1.7375489647453834e-05,
"loss": 0.5871,
"step": 1365
},
{
"epoch": 2.060377358490566,
"grad_norm": 0.3435683903553664,
"learning_rate": 1.7347509792949078e-05,
"loss": 0.4751,
"step": 1366
},
{
"epoch": 2.0618867924528304,
"grad_norm": 0.2577466623620189,
"learning_rate": 1.731952993844432e-05,
"loss": 0.5072,
"step": 1367
},
{
"epoch": 2.0633962264150942,
"grad_norm": 0.5951239656292413,
"learning_rate": 1.7291550083939565e-05,
"loss": 0.5067,
"step": 1368
},
{
"epoch": 2.0649056603773586,
"grad_norm": 0.3091376294357986,
"learning_rate": 1.7263570229434808e-05,
"loss": 0.5628,
"step": 1369
},
{
"epoch": 2.0664150943396224,
"grad_norm": 0.3448984710683876,
"learning_rate": 1.723559037493005e-05,
"loss": 0.5315,
"step": 1370
},
{
"epoch": 2.0679245283018868,
"grad_norm": 0.2883682281289665,
"learning_rate": 1.7207610520425295e-05,
"loss": 0.4979,
"step": 1371
},
{
"epoch": 2.069433962264151,
"grad_norm": 0.3162892758521297,
"learning_rate": 1.7179630665920538e-05,
"loss": 0.5156,
"step": 1372
},
{
"epoch": 2.070943396226415,
"grad_norm": 9.124432005211863,
"learning_rate": 1.715165081141578e-05,
"loss": 0.6299,
"step": 1373
},
{
"epoch": 2.0724528301886793,
"grad_norm": 0.3426920377467487,
"learning_rate": 1.7123670956911025e-05,
"loss": 0.5106,
"step": 1374
},
{
"epoch": 2.073962264150943,
"grad_norm": 0.3252637771238837,
"learning_rate": 1.7095691102406268e-05,
"loss": 0.465,
"step": 1375
},
{
"epoch": 2.0754716981132075,
"grad_norm": 0.27946931242846335,
"learning_rate": 1.706771124790151e-05,
"loss": 0.5466,
"step": 1376
},
{
"epoch": 2.076981132075472,
"grad_norm": 0.2950006671302379,
"learning_rate": 1.7039731393396755e-05,
"loss": 0.5233,
"step": 1377
},
{
"epoch": 2.0784905660377357,
"grad_norm": 0.3071238734202378,
"learning_rate": 1.7011751538891998e-05,
"loss": 0.4996,
"step": 1378
},
{
"epoch": 2.08,
"grad_norm": 0.29000161966129556,
"learning_rate": 1.698377168438724e-05,
"loss": 0.5456,
"step": 1379
},
{
"epoch": 2.081509433962264,
"grad_norm": 0.3638090048005039,
"learning_rate": 1.6955791829882485e-05,
"loss": 0.5342,
"step": 1380
},
{
"epoch": 2.0830188679245283,
"grad_norm": 0.3221376007630064,
"learning_rate": 1.6927811975377728e-05,
"loss": 0.506,
"step": 1381
},
{
"epoch": 2.0845283018867926,
"grad_norm": 0.2882987076412252,
"learning_rate": 1.689983212087297e-05,
"loss": 0.5291,
"step": 1382
},
{
"epoch": 2.0860377358490565,
"grad_norm": 0.28696681372347427,
"learning_rate": 1.6871852266368215e-05,
"loss": 0.4838,
"step": 1383
},
{
"epoch": 2.087547169811321,
"grad_norm": 0.38854977880255626,
"learning_rate": 1.684387241186346e-05,
"loss": 0.5473,
"step": 1384
},
{
"epoch": 2.0890566037735847,
"grad_norm": 0.29092276215854995,
"learning_rate": 1.6815892557358702e-05,
"loss": 0.4947,
"step": 1385
},
{
"epoch": 2.090566037735849,
"grad_norm": 0.3087900936237078,
"learning_rate": 1.678791270285395e-05,
"loss": 0.4894,
"step": 1386
},
{
"epoch": 2.0920754716981134,
"grad_norm": 0.3447103919070566,
"learning_rate": 1.675993284834919e-05,
"loss": 0.4929,
"step": 1387
},
{
"epoch": 2.0935849056603772,
"grad_norm": 0.2823804807438117,
"learning_rate": 1.6731952993844432e-05,
"loss": 0.5266,
"step": 1388
},
{
"epoch": 2.0950943396226416,
"grad_norm": 2.227830939004405,
"learning_rate": 1.6703973139339675e-05,
"loss": 0.4667,
"step": 1389
},
{
"epoch": 2.0966037735849055,
"grad_norm": 0.36027609216088624,
"learning_rate": 1.6675993284834922e-05,
"loss": 0.5089,
"step": 1390
},
{
"epoch": 2.09811320754717,
"grad_norm": 0.30939324253732786,
"learning_rate": 1.6648013430330165e-05,
"loss": 0.509,
"step": 1391
},
{
"epoch": 2.099622641509434,
"grad_norm": 0.2940056905558936,
"learning_rate": 1.6620033575825405e-05,
"loss": 0.5088,
"step": 1392
},
{
"epoch": 2.101132075471698,
"grad_norm": 0.2906000756858416,
"learning_rate": 1.659205372132065e-05,
"loss": 0.496,
"step": 1393
},
{
"epoch": 2.1026415094339623,
"grad_norm": 0.2869459547310367,
"learning_rate": 1.6564073866815892e-05,
"loss": 0.5428,
"step": 1394
},
{
"epoch": 2.104150943396226,
"grad_norm": 0.2999230365705766,
"learning_rate": 1.653609401231114e-05,
"loss": 0.4922,
"step": 1395
},
{
"epoch": 2.1056603773584905,
"grad_norm": 0.2962964554482559,
"learning_rate": 1.6508114157806382e-05,
"loss": 0.507,
"step": 1396
},
{
"epoch": 2.107169811320755,
"grad_norm": 0.2795439894392488,
"learning_rate": 1.6480134303301622e-05,
"loss": 0.5061,
"step": 1397
},
{
"epoch": 2.1086792452830188,
"grad_norm": 0.31442101897336405,
"learning_rate": 1.6452154448796865e-05,
"loss": 0.5429,
"step": 1398
},
{
"epoch": 2.110188679245283,
"grad_norm": 0.29732567961373857,
"learning_rate": 1.642417459429211e-05,
"loss": 0.5197,
"step": 1399
},
{
"epoch": 2.111698113207547,
"grad_norm": 0.2869692773071042,
"learning_rate": 1.6396194739787356e-05,
"loss": 0.5009,
"step": 1400
},
{
"epoch": 2.1132075471698113,
"grad_norm": 0.31504920038578266,
"learning_rate": 1.63682148852826e-05,
"loss": 0.5181,
"step": 1401
},
{
"epoch": 2.1147169811320756,
"grad_norm": 0.31546568417414367,
"learning_rate": 1.634023503077784e-05,
"loss": 0.4921,
"step": 1402
},
{
"epoch": 2.1162264150943395,
"grad_norm": 0.5023731953716996,
"learning_rate": 1.6312255176273082e-05,
"loss": 0.5294,
"step": 1403
},
{
"epoch": 2.117735849056604,
"grad_norm": 0.2997062635797708,
"learning_rate": 1.628427532176833e-05,
"loss": 0.5035,
"step": 1404
},
{
"epoch": 2.1192452830188677,
"grad_norm": 6.197559051784822,
"learning_rate": 1.6256295467263572e-05,
"loss": 1.1389,
"step": 1405
},
{
"epoch": 2.120754716981132,
"grad_norm": 0.37051838525151753,
"learning_rate": 1.6228315612758816e-05,
"loss": 0.4735,
"step": 1406
},
{
"epoch": 2.1222641509433964,
"grad_norm": 0.34610400457497426,
"learning_rate": 1.6200335758254056e-05,
"loss": 0.5028,
"step": 1407
},
{
"epoch": 2.1237735849056603,
"grad_norm": 0.29036868542009187,
"learning_rate": 1.61723559037493e-05,
"loss": 0.5034,
"step": 1408
},
{
"epoch": 2.1252830188679246,
"grad_norm": 0.2800323997242073,
"learning_rate": 1.6144376049244546e-05,
"loss": 0.502,
"step": 1409
},
{
"epoch": 2.1267924528301885,
"grad_norm": 0.38872365691156985,
"learning_rate": 1.611639619473979e-05,
"loss": 0.5192,
"step": 1410
},
{
"epoch": 2.128301886792453,
"grad_norm": 0.3370959225184659,
"learning_rate": 1.6088416340235033e-05,
"loss": 0.6035,
"step": 1411
},
{
"epoch": 2.129811320754717,
"grad_norm": 0.27435668970687543,
"learning_rate": 1.6060436485730273e-05,
"loss": 0.51,
"step": 1412
},
{
"epoch": 2.131320754716981,
"grad_norm": 1.7820096334751305,
"learning_rate": 1.6032456631225516e-05,
"loss": 0.5151,
"step": 1413
},
{
"epoch": 2.1328301886792453,
"grad_norm": 0.383674413890331,
"learning_rate": 1.6004476776720763e-05,
"loss": 0.5579,
"step": 1414
},
{
"epoch": 2.1343396226415092,
"grad_norm": 0.328537968912913,
"learning_rate": 1.5976496922216006e-05,
"loss": 0.495,
"step": 1415
},
{
"epoch": 2.1358490566037736,
"grad_norm": 0.3243298723276784,
"learning_rate": 1.594851706771125e-05,
"loss": 0.5264,
"step": 1416
},
{
"epoch": 2.137358490566038,
"grad_norm": 0.32700388956318677,
"learning_rate": 1.5920537213206493e-05,
"loss": 0.4996,
"step": 1417
},
{
"epoch": 2.1388679245283018,
"grad_norm": 0.3322839065657285,
"learning_rate": 1.5892557358701736e-05,
"loss": 0.5223,
"step": 1418
},
{
"epoch": 2.140377358490566,
"grad_norm": 0.3004552967389372,
"learning_rate": 1.586457750419698e-05,
"loss": 0.5064,
"step": 1419
},
{
"epoch": 2.14188679245283,
"grad_norm": 0.5763848814349466,
"learning_rate": 1.5836597649692223e-05,
"loss": 0.5033,
"step": 1420
},
{
"epoch": 2.1433962264150943,
"grad_norm": 0.32960695702396287,
"learning_rate": 1.5808617795187466e-05,
"loss": 0.5373,
"step": 1421
},
{
"epoch": 2.1449056603773586,
"grad_norm": 0.3107640748412959,
"learning_rate": 1.578063794068271e-05,
"loss": 0.5038,
"step": 1422
},
{
"epoch": 2.1464150943396225,
"grad_norm": 0.30299740916002305,
"learning_rate": 1.5752658086177953e-05,
"loss": 0.4905,
"step": 1423
},
{
"epoch": 2.147924528301887,
"grad_norm": 0.3175005717166028,
"learning_rate": 1.5724678231673196e-05,
"loss": 0.5247,
"step": 1424
},
{
"epoch": 2.149433962264151,
"grad_norm": 0.38504686240959507,
"learning_rate": 1.569669837716844e-05,
"loss": 0.4848,
"step": 1425
},
{
"epoch": 2.150943396226415,
"grad_norm": 0.29861363410964187,
"learning_rate": 1.5668718522663683e-05,
"loss": 0.5272,
"step": 1426
},
{
"epoch": 2.1524528301886794,
"grad_norm": 0.3048000067062521,
"learning_rate": 1.5640738668158926e-05,
"loss": 0.523,
"step": 1427
},
{
"epoch": 2.1539622641509433,
"grad_norm": 0.29199245961301584,
"learning_rate": 1.561275881365417e-05,
"loss": 0.4737,
"step": 1428
},
{
"epoch": 2.1554716981132076,
"grad_norm": 0.324024535729314,
"learning_rate": 1.5584778959149413e-05,
"loss": 0.5564,
"step": 1429
},
{
"epoch": 2.1569811320754715,
"grad_norm": 0.2706042276628025,
"learning_rate": 1.5556799104644656e-05,
"loss": 0.4991,
"step": 1430
},
{
"epoch": 2.158490566037736,
"grad_norm": 0.31292251156779505,
"learning_rate": 1.55288192501399e-05,
"loss": 0.5495,
"step": 1431
},
{
"epoch": 2.16,
"grad_norm": 0.28435967603261125,
"learning_rate": 1.5500839395635143e-05,
"loss": 0.5318,
"step": 1432
},
{
"epoch": 2.161509433962264,
"grad_norm": 0.28290949197621923,
"learning_rate": 1.5472859541130387e-05,
"loss": 0.475,
"step": 1433
},
{
"epoch": 2.1630188679245284,
"grad_norm": 0.27419716811272765,
"learning_rate": 1.544487968662563e-05,
"loss": 0.512,
"step": 1434
},
{
"epoch": 2.1645283018867927,
"grad_norm": 0.33229780717562507,
"learning_rate": 1.5416899832120873e-05,
"loss": 0.5253,
"step": 1435
},
{
"epoch": 2.1660377358490566,
"grad_norm": 0.2635435711478846,
"learning_rate": 1.5388919977616117e-05,
"loss": 0.4922,
"step": 1436
},
{
"epoch": 2.167547169811321,
"grad_norm": 0.3354287803738388,
"learning_rate": 1.536094012311136e-05,
"loss": 0.5607,
"step": 1437
},
{
"epoch": 2.169056603773585,
"grad_norm": 0.2982433296429279,
"learning_rate": 1.5332960268606603e-05,
"loss": 0.53,
"step": 1438
},
{
"epoch": 2.170566037735849,
"grad_norm": 0.2744678686241482,
"learning_rate": 1.5304980414101847e-05,
"loss": 0.4821,
"step": 1439
},
{
"epoch": 2.172075471698113,
"grad_norm": 0.30144942338602565,
"learning_rate": 1.527700055959709e-05,
"loss": 0.5545,
"step": 1440
},
{
"epoch": 2.1735849056603773,
"grad_norm": 0.2890165107307089,
"learning_rate": 1.5249020705092335e-05,
"loss": 0.4918,
"step": 1441
},
{
"epoch": 2.1750943396226416,
"grad_norm": 0.27397501117313733,
"learning_rate": 1.5221040850587578e-05,
"loss": 0.5342,
"step": 1442
},
{
"epoch": 2.1766037735849055,
"grad_norm": 0.2948193831715891,
"learning_rate": 1.5193060996082822e-05,
"loss": 0.493,
"step": 1443
},
{
"epoch": 2.17811320754717,
"grad_norm": 0.31953210199754906,
"learning_rate": 1.5165081141578064e-05,
"loss": 0.5324,
"step": 1444
},
{
"epoch": 2.179622641509434,
"grad_norm": 0.34514765305750156,
"learning_rate": 1.5137101287073307e-05,
"loss": 0.4684,
"step": 1445
},
{
"epoch": 2.181132075471698,
"grad_norm": 0.30888012716661545,
"learning_rate": 1.5109121432568552e-05,
"loss": 0.568,
"step": 1446
},
{
"epoch": 2.1826415094339624,
"grad_norm": 0.31339189400441597,
"learning_rate": 1.5081141578063795e-05,
"loss": 0.5351,
"step": 1447
},
{
"epoch": 2.1841509433962263,
"grad_norm": 0.2850461935374051,
"learning_rate": 1.5053161723559039e-05,
"loss": 0.4853,
"step": 1448
},
{
"epoch": 2.1856603773584906,
"grad_norm": 0.2827499266089179,
"learning_rate": 1.502518186905428e-05,
"loss": 0.5234,
"step": 1449
},
{
"epoch": 2.1871698113207545,
"grad_norm": 0.3015886512697336,
"learning_rate": 1.4997202014549525e-05,
"loss": 0.5494,
"step": 1450
},
{
"epoch": 2.188679245283019,
"grad_norm": 0.2849817847705387,
"learning_rate": 1.4969222160044769e-05,
"loss": 0.5304,
"step": 1451
},
{
"epoch": 2.190188679245283,
"grad_norm": 0.40838242894053084,
"learning_rate": 1.4941242305540012e-05,
"loss": 0.5282,
"step": 1452
},
{
"epoch": 2.191698113207547,
"grad_norm": 0.3487341672894736,
"learning_rate": 1.4913262451035257e-05,
"loss": 0.5272,
"step": 1453
},
{
"epoch": 2.1932075471698114,
"grad_norm": 0.28796501873449454,
"learning_rate": 1.4885282596530497e-05,
"loss": 0.4951,
"step": 1454
},
{
"epoch": 2.1947169811320757,
"grad_norm": 0.2684130709861942,
"learning_rate": 1.4857302742025742e-05,
"loss": 0.4948,
"step": 1455
},
{
"epoch": 2.1962264150943396,
"grad_norm": 0.27600668574894804,
"learning_rate": 1.4829322887520986e-05,
"loss": 0.4908,
"step": 1456
},
{
"epoch": 2.197735849056604,
"grad_norm": 0.2588348566679071,
"learning_rate": 1.4801343033016229e-05,
"loss": 0.491,
"step": 1457
},
{
"epoch": 2.199245283018868,
"grad_norm": 0.2987400337756697,
"learning_rate": 1.4773363178511474e-05,
"loss": 0.5446,
"step": 1458
},
{
"epoch": 2.200754716981132,
"grad_norm": 0.25621718687856143,
"learning_rate": 1.4745383324006714e-05,
"loss": 0.4764,
"step": 1459
},
{
"epoch": 2.202264150943396,
"grad_norm": 0.7981297027382572,
"learning_rate": 1.4717403469501959e-05,
"loss": 0.4999,
"step": 1460
},
{
"epoch": 2.2037735849056603,
"grad_norm": 0.2965666251899607,
"learning_rate": 1.4689423614997202e-05,
"loss": 0.535,
"step": 1461
},
{
"epoch": 2.2052830188679247,
"grad_norm": 0.2804521739277098,
"learning_rate": 1.4661443760492446e-05,
"loss": 0.4918,
"step": 1462
},
{
"epoch": 2.2067924528301885,
"grad_norm": 0.276142140919494,
"learning_rate": 1.463346390598769e-05,
"loss": 0.4702,
"step": 1463
},
{
"epoch": 2.208301886792453,
"grad_norm": 0.2820984265166286,
"learning_rate": 1.4605484051482932e-05,
"loss": 0.4939,
"step": 1464
},
{
"epoch": 2.209811320754717,
"grad_norm": 0.7192982932338521,
"learning_rate": 1.4577504196978176e-05,
"loss": 0.5609,
"step": 1465
},
{
"epoch": 2.211320754716981,
"grad_norm": 0.2937186576584232,
"learning_rate": 1.454952434247342e-05,
"loss": 0.5353,
"step": 1466
},
{
"epoch": 2.2128301886792454,
"grad_norm": 0.31277570600962407,
"learning_rate": 1.4521544487968664e-05,
"loss": 0.4838,
"step": 1467
},
{
"epoch": 2.2143396226415093,
"grad_norm": 0.3055256840051491,
"learning_rate": 1.4493564633463908e-05,
"loss": 0.5433,
"step": 1468
},
{
"epoch": 2.2158490566037736,
"grad_norm": 0.31560711080139303,
"learning_rate": 1.4465584778959151e-05,
"loss": 0.5337,
"step": 1469
},
{
"epoch": 2.2173584905660375,
"grad_norm": 0.29251628786569595,
"learning_rate": 1.4437604924454393e-05,
"loss": 0.5144,
"step": 1470
},
{
"epoch": 2.218867924528302,
"grad_norm": 0.3049591268418919,
"learning_rate": 1.4409625069949636e-05,
"loss": 0.5322,
"step": 1471
},
{
"epoch": 2.220377358490566,
"grad_norm": 0.3094888112301788,
"learning_rate": 1.4381645215444881e-05,
"loss": 0.5531,
"step": 1472
},
{
"epoch": 2.22188679245283,
"grad_norm": 0.272447846375912,
"learning_rate": 1.4353665360940124e-05,
"loss": 0.4913,
"step": 1473
},
{
"epoch": 2.2233962264150944,
"grad_norm": 0.3123501475624154,
"learning_rate": 1.4325685506435368e-05,
"loss": 0.5543,
"step": 1474
},
{
"epoch": 2.2249056603773587,
"grad_norm": 0.272171738262368,
"learning_rate": 1.429770565193061e-05,
"loss": 0.5062,
"step": 1475
},
{
"epoch": 2.2264150943396226,
"grad_norm": 0.34071458995993736,
"learning_rate": 1.4269725797425853e-05,
"loss": 0.4612,
"step": 1476
},
{
"epoch": 2.227924528301887,
"grad_norm": 0.3207863817121142,
"learning_rate": 1.4241745942921098e-05,
"loss": 0.5399,
"step": 1477
},
{
"epoch": 2.229433962264151,
"grad_norm": 0.3312395334585876,
"learning_rate": 1.4213766088416341e-05,
"loss": 0.5551,
"step": 1478
},
{
"epoch": 2.230943396226415,
"grad_norm": 0.3078119559299569,
"learning_rate": 1.4185786233911586e-05,
"loss": 0.5399,
"step": 1479
},
{
"epoch": 2.232452830188679,
"grad_norm": 0.3677416007012839,
"learning_rate": 1.4157806379406826e-05,
"loss": 0.4692,
"step": 1480
},
{
"epoch": 2.2339622641509433,
"grad_norm": 0.29294788473317285,
"learning_rate": 1.4129826524902071e-05,
"loss": 0.5189,
"step": 1481
},
{
"epoch": 2.2354716981132077,
"grad_norm": 0.2967231058765099,
"learning_rate": 1.4101846670397315e-05,
"loss": 0.4883,
"step": 1482
},
{
"epoch": 2.2369811320754716,
"grad_norm": 0.28070084795251554,
"learning_rate": 1.4073866815892558e-05,
"loss": 0.5129,
"step": 1483
},
{
"epoch": 2.238490566037736,
"grad_norm": 0.27025440257233807,
"learning_rate": 1.4045886961387803e-05,
"loss": 0.5129,
"step": 1484
},
{
"epoch": 2.24,
"grad_norm": 0.3278016709506503,
"learning_rate": 1.4017907106883043e-05,
"loss": 0.5552,
"step": 1485
},
{
"epoch": 2.241509433962264,
"grad_norm": 0.2677486192645805,
"learning_rate": 1.3989927252378288e-05,
"loss": 0.505,
"step": 1486
},
{
"epoch": 2.2430188679245284,
"grad_norm": 0.27640060062320465,
"learning_rate": 1.3961947397873532e-05,
"loss": 0.5228,
"step": 1487
},
{
"epoch": 2.2445283018867923,
"grad_norm": 0.32687355578285887,
"learning_rate": 1.3933967543368775e-05,
"loss": 0.5329,
"step": 1488
},
{
"epoch": 2.2460377358490566,
"grad_norm": 0.28290727880482414,
"learning_rate": 1.390598768886402e-05,
"loss": 0.5653,
"step": 1489
},
{
"epoch": 2.2475471698113205,
"grad_norm": 0.2945089980775346,
"learning_rate": 1.387800783435926e-05,
"loss": 0.5558,
"step": 1490
},
{
"epoch": 2.249056603773585,
"grad_norm": 0.24883863133395934,
"learning_rate": 1.3850027979854505e-05,
"loss": 0.4777,
"step": 1491
},
{
"epoch": 2.250566037735849,
"grad_norm": 0.2864405466136228,
"learning_rate": 1.3822048125349748e-05,
"loss": 0.5316,
"step": 1492
},
{
"epoch": 2.252075471698113,
"grad_norm": 0.29305064929033103,
"learning_rate": 1.3794068270844993e-05,
"loss": 0.5115,
"step": 1493
},
{
"epoch": 2.2535849056603774,
"grad_norm": 0.2657764273856441,
"learning_rate": 1.3766088416340237e-05,
"loss": 0.496,
"step": 1494
},
{
"epoch": 2.2550943396226417,
"grad_norm": 0.29966606901721604,
"learning_rate": 1.373810856183548e-05,
"loss": 0.5369,
"step": 1495
},
{
"epoch": 2.2566037735849056,
"grad_norm": 0.32608189723105613,
"learning_rate": 1.3710128707330722e-05,
"loss": 0.5373,
"step": 1496
},
{
"epoch": 2.25811320754717,
"grad_norm": 0.2698321482337212,
"learning_rate": 1.3682148852825965e-05,
"loss": 0.5642,
"step": 1497
},
{
"epoch": 2.259622641509434,
"grad_norm": 0.2862557953990325,
"learning_rate": 1.365416899832121e-05,
"loss": 0.5419,
"step": 1498
},
{
"epoch": 2.261132075471698,
"grad_norm": 0.2610202337731504,
"learning_rate": 1.3626189143816454e-05,
"loss": 0.4617,
"step": 1499
},
{
"epoch": 2.262641509433962,
"grad_norm": 0.2756743186352737,
"learning_rate": 1.3598209289311697e-05,
"loss": 0.5105,
"step": 1500
},
{
"epoch": 2.2641509433962264,
"grad_norm": 0.318550644447325,
"learning_rate": 1.3570229434806939e-05,
"loss": 0.5459,
"step": 1501
},
{
"epoch": 2.2656603773584907,
"grad_norm": 0.27546152249243994,
"learning_rate": 1.3542249580302182e-05,
"loss": 0.5206,
"step": 1502
},
{
"epoch": 2.2671698113207546,
"grad_norm": 0.29381799157195004,
"learning_rate": 1.3514269725797427e-05,
"loss": 0.4987,
"step": 1503
},
{
"epoch": 2.268679245283019,
"grad_norm": 0.295754784857654,
"learning_rate": 1.348628987129267e-05,
"loss": 0.5064,
"step": 1504
},
{
"epoch": 2.2701886792452832,
"grad_norm": 0.2918834331491631,
"learning_rate": 1.3458310016787914e-05,
"loss": 0.571,
"step": 1505
},
{
"epoch": 2.271698113207547,
"grad_norm": 0.256431144636905,
"learning_rate": 1.3430330162283155e-05,
"loss": 0.4934,
"step": 1506
},
{
"epoch": 2.2732075471698114,
"grad_norm": 0.2836824188188069,
"learning_rate": 1.34023503077784e-05,
"loss": 0.5082,
"step": 1507
},
{
"epoch": 2.2747169811320753,
"grad_norm": 0.29318207704716076,
"learning_rate": 1.3374370453273644e-05,
"loss": 0.5,
"step": 1508
},
{
"epoch": 2.2762264150943397,
"grad_norm": 0.271941956506678,
"learning_rate": 1.3346390598768887e-05,
"loss": 0.5325,
"step": 1509
},
{
"epoch": 2.2777358490566035,
"grad_norm": 0.2855337211319555,
"learning_rate": 1.3318410744264132e-05,
"loss": 0.4959,
"step": 1510
},
{
"epoch": 2.279245283018868,
"grad_norm": 0.2949051630826,
"learning_rate": 1.3290430889759372e-05,
"loss": 0.5202,
"step": 1511
},
{
"epoch": 2.280754716981132,
"grad_norm": 0.27050450657637826,
"learning_rate": 1.3262451035254617e-05,
"loss": 0.5023,
"step": 1512
},
{
"epoch": 2.282264150943396,
"grad_norm": 0.291946921230413,
"learning_rate": 1.323447118074986e-05,
"loss": 0.5177,
"step": 1513
},
{
"epoch": 2.2837735849056604,
"grad_norm": 0.27140194708974613,
"learning_rate": 1.3206491326245104e-05,
"loss": 0.4802,
"step": 1514
},
{
"epoch": 2.2852830188679247,
"grad_norm": 0.28632708768258713,
"learning_rate": 1.3178511471740349e-05,
"loss": 0.5031,
"step": 1515
},
{
"epoch": 2.2867924528301886,
"grad_norm": 0.2653378913781436,
"learning_rate": 1.3150531617235589e-05,
"loss": 0.5348,
"step": 1516
},
{
"epoch": 2.288301886792453,
"grad_norm": 0.3416254176656584,
"learning_rate": 1.3122551762730834e-05,
"loss": 0.5115,
"step": 1517
},
{
"epoch": 2.289811320754717,
"grad_norm": 0.3132158552150094,
"learning_rate": 1.3094571908226077e-05,
"loss": 0.5507,
"step": 1518
},
{
"epoch": 2.291320754716981,
"grad_norm": 0.2587415321407086,
"learning_rate": 1.306659205372132e-05,
"loss": 0.471,
"step": 1519
},
{
"epoch": 2.292830188679245,
"grad_norm": 0.2813684595304604,
"learning_rate": 1.3038612199216566e-05,
"loss": 0.5033,
"step": 1520
},
{
"epoch": 2.2943396226415094,
"grad_norm": 0.34086813878117894,
"learning_rate": 1.301063234471181e-05,
"loss": 0.5173,
"step": 1521
},
{
"epoch": 2.2958490566037737,
"grad_norm": 0.29897003760539104,
"learning_rate": 1.2982652490207051e-05,
"loss": 0.4927,
"step": 1522
},
{
"epoch": 2.2973584905660376,
"grad_norm": 0.3875603639423126,
"learning_rate": 1.2954672635702294e-05,
"loss": 0.5581,
"step": 1523
},
{
"epoch": 2.298867924528302,
"grad_norm": 0.2862074867727448,
"learning_rate": 1.292669278119754e-05,
"loss": 0.4862,
"step": 1524
},
{
"epoch": 2.3003773584905662,
"grad_norm": 0.30868691166763873,
"learning_rate": 1.2898712926692783e-05,
"loss": 0.5272,
"step": 1525
},
{
"epoch": 2.30188679245283,
"grad_norm": 0.29790893971786336,
"learning_rate": 1.2870733072188026e-05,
"loss": 0.5266,
"step": 1526
},
{
"epoch": 2.3033962264150944,
"grad_norm": 0.2782090398414401,
"learning_rate": 1.2842753217683268e-05,
"loss": 0.5462,
"step": 1527
},
{
"epoch": 2.3049056603773583,
"grad_norm": 0.2854745590256956,
"learning_rate": 1.2814773363178511e-05,
"loss": 0.4805,
"step": 1528
},
{
"epoch": 2.3064150943396227,
"grad_norm": 0.3113454727191614,
"learning_rate": 1.2786793508673756e-05,
"loss": 0.4902,
"step": 1529
},
{
"epoch": 2.3079245283018865,
"grad_norm": 0.25928474454077677,
"learning_rate": 1.2758813654169e-05,
"loss": 0.4906,
"step": 1530
},
{
"epoch": 2.309433962264151,
"grad_norm": 0.26698105372178027,
"learning_rate": 1.2730833799664243e-05,
"loss": 0.5508,
"step": 1531
},
{
"epoch": 2.310943396226415,
"grad_norm": 0.26530252126413506,
"learning_rate": 1.2702853945159485e-05,
"loss": 0.5375,
"step": 1532
},
{
"epoch": 2.312452830188679,
"grad_norm": 0.32824150490138687,
"learning_rate": 1.2674874090654728e-05,
"loss": 0.5316,
"step": 1533
},
{
"epoch": 2.3139622641509434,
"grad_norm": 0.2616604148241911,
"learning_rate": 1.2646894236149973e-05,
"loss": 0.4693,
"step": 1534
},
{
"epoch": 2.3154716981132077,
"grad_norm": 0.2937527365395457,
"learning_rate": 1.2618914381645216e-05,
"loss": 0.5329,
"step": 1535
},
{
"epoch": 2.3169811320754716,
"grad_norm": 0.28549646821620334,
"learning_rate": 1.259093452714046e-05,
"loss": 0.5417,
"step": 1536
},
{
"epoch": 2.318490566037736,
"grad_norm": 0.29605947776580616,
"learning_rate": 1.2562954672635701e-05,
"loss": 0.5188,
"step": 1537
},
{
"epoch": 2.32,
"grad_norm": 0.28290540315403917,
"learning_rate": 1.2534974818130946e-05,
"loss": 0.4817,
"step": 1538
},
{
"epoch": 2.321509433962264,
"grad_norm": 0.28346799307216214,
"learning_rate": 1.250699496362619e-05,
"loss": 0.5291,
"step": 1539
},
{
"epoch": 2.323018867924528,
"grad_norm": 0.3025578648182897,
"learning_rate": 1.2479015109121433e-05,
"loss": 0.5549,
"step": 1540
},
{
"epoch": 2.3245283018867924,
"grad_norm": 0.28767635571955014,
"learning_rate": 1.2451035254616676e-05,
"loss": 0.5334,
"step": 1541
},
{
"epoch": 2.3260377358490567,
"grad_norm": 0.31251235625172796,
"learning_rate": 1.242305540011192e-05,
"loss": 0.5222,
"step": 1542
},
{
"epoch": 2.3275471698113206,
"grad_norm": 0.3280951350127823,
"learning_rate": 1.2395075545607163e-05,
"loss": 0.5397,
"step": 1543
},
{
"epoch": 2.329056603773585,
"grad_norm": 0.313854810082151,
"learning_rate": 1.2367095691102407e-05,
"loss": 0.5483,
"step": 1544
},
{
"epoch": 2.3305660377358492,
"grad_norm": 0.2670608264932217,
"learning_rate": 1.233911583659765e-05,
"loss": 0.5254,
"step": 1545
},
{
"epoch": 2.332075471698113,
"grad_norm": 0.2719825460240819,
"learning_rate": 1.2311135982092893e-05,
"loss": 0.5459,
"step": 1546
},
{
"epoch": 2.3335849056603775,
"grad_norm": 0.31088912379982053,
"learning_rate": 1.2283156127588137e-05,
"loss": 0.5487,
"step": 1547
},
{
"epoch": 2.3350943396226413,
"grad_norm": 0.27061842798171537,
"learning_rate": 1.2255176273083382e-05,
"loss": 0.4935,
"step": 1548
},
{
"epoch": 2.3366037735849057,
"grad_norm": 0.2831678674322194,
"learning_rate": 1.2227196418578623e-05,
"loss": 0.5351,
"step": 1549
},
{
"epoch": 2.33811320754717,
"grad_norm": 0.49601884749761044,
"learning_rate": 1.2199216564073867e-05,
"loss": 0.5137,
"step": 1550
},
{
"epoch": 2.339622641509434,
"grad_norm": 0.27586411056769244,
"learning_rate": 1.217123670956911e-05,
"loss": 0.5412,
"step": 1551
},
{
"epoch": 2.341132075471698,
"grad_norm": 0.29114042947531105,
"learning_rate": 1.2143256855064353e-05,
"loss": 0.5458,
"step": 1552
},
{
"epoch": 2.342641509433962,
"grad_norm": 0.2689747977811522,
"learning_rate": 1.2115277000559599e-05,
"loss": 0.5441,
"step": 1553
},
{
"epoch": 2.3441509433962264,
"grad_norm": 0.261267562119711,
"learning_rate": 1.208729714605484e-05,
"loss": 0.4996,
"step": 1554
},
{
"epoch": 2.3456603773584908,
"grad_norm": 0.2630661794726856,
"learning_rate": 1.2059317291550085e-05,
"loss": 0.4945,
"step": 1555
},
{
"epoch": 2.3471698113207546,
"grad_norm": 0.2714886371024407,
"learning_rate": 1.2031337437045329e-05,
"loss": 0.5419,
"step": 1556
},
{
"epoch": 2.348679245283019,
"grad_norm": 0.27322996123705456,
"learning_rate": 1.200335758254057e-05,
"loss": 0.5344,
"step": 1557
},
{
"epoch": 2.350188679245283,
"grad_norm": 0.27446564225924486,
"learning_rate": 1.1975377728035815e-05,
"loss": 0.4914,
"step": 1558
},
{
"epoch": 2.351698113207547,
"grad_norm": 0.2786018549729709,
"learning_rate": 1.1947397873531057e-05,
"loss": 0.5524,
"step": 1559
},
{
"epoch": 2.3532075471698115,
"grad_norm": 0.2744379654868362,
"learning_rate": 1.1919418019026302e-05,
"loss": 0.5057,
"step": 1560
},
{
"epoch": 2.3547169811320754,
"grad_norm": 0.29383230395925714,
"learning_rate": 1.1891438164521545e-05,
"loss": 0.4995,
"step": 1561
},
{
"epoch": 2.3562264150943397,
"grad_norm": 0.27567156486489924,
"learning_rate": 1.1863458310016789e-05,
"loss": 0.5059,
"step": 1562
},
{
"epoch": 2.3577358490566036,
"grad_norm": 0.2965755545135667,
"learning_rate": 1.1835478455512032e-05,
"loss": 0.5183,
"step": 1563
},
{
"epoch": 2.359245283018868,
"grad_norm": 0.2942882557087409,
"learning_rate": 1.1807498601007276e-05,
"loss": 0.4978,
"step": 1564
},
{
"epoch": 2.3607547169811323,
"grad_norm": 0.3612767923459085,
"learning_rate": 1.1779518746502519e-05,
"loss": 0.5482,
"step": 1565
},
{
"epoch": 2.362264150943396,
"grad_norm": 0.37068931043688397,
"learning_rate": 1.1751538891997762e-05,
"loss": 0.5417,
"step": 1566
},
{
"epoch": 2.3637735849056605,
"grad_norm": 0.3009994319645029,
"learning_rate": 1.1723559037493006e-05,
"loss": 0.5241,
"step": 1567
},
{
"epoch": 2.3652830188679244,
"grad_norm": 0.2982051870389813,
"learning_rate": 1.1695579182988249e-05,
"loss": 0.5138,
"step": 1568
},
{
"epoch": 2.3667924528301887,
"grad_norm": 0.273290340770851,
"learning_rate": 1.1667599328483492e-05,
"loss": 0.5425,
"step": 1569
},
{
"epoch": 2.368301886792453,
"grad_norm": 0.30521472956958234,
"learning_rate": 1.1639619473978736e-05,
"loss": 0.503,
"step": 1570
},
{
"epoch": 2.369811320754717,
"grad_norm": 0.2733183011713467,
"learning_rate": 1.1611639619473979e-05,
"loss": 0.4991,
"step": 1571
},
{
"epoch": 2.3713207547169812,
"grad_norm": 0.2700309326901011,
"learning_rate": 1.1583659764969222e-05,
"loss": 0.4871,
"step": 1572
},
{
"epoch": 2.372830188679245,
"grad_norm": 0.3165698192789348,
"learning_rate": 1.1555679910464466e-05,
"loss": 0.5259,
"step": 1573
},
{
"epoch": 2.3743396226415094,
"grad_norm": 0.32213328390083884,
"learning_rate": 1.152770005595971e-05,
"loss": 0.4876,
"step": 1574
},
{
"epoch": 2.3758490566037738,
"grad_norm": 0.3212920255017883,
"learning_rate": 1.1499720201454953e-05,
"loss": 0.5213,
"step": 1575
},
{
"epoch": 2.3773584905660377,
"grad_norm": 0.28550611630396816,
"learning_rate": 1.1471740346950196e-05,
"loss": 0.5245,
"step": 1576
},
{
"epoch": 2.378867924528302,
"grad_norm": 0.33575387480012625,
"learning_rate": 1.144376049244544e-05,
"loss": 0.5325,
"step": 1577
},
{
"epoch": 2.380377358490566,
"grad_norm": 0.3209791568084635,
"learning_rate": 1.1415780637940683e-05,
"loss": 0.5203,
"step": 1578
},
{
"epoch": 2.38188679245283,
"grad_norm": 0.3068339238371063,
"learning_rate": 1.1387800783435928e-05,
"loss": 0.5432,
"step": 1579
},
{
"epoch": 2.3833962264150945,
"grad_norm": 0.29288109611440577,
"learning_rate": 1.135982092893117e-05,
"loss": 0.4908,
"step": 1580
},
{
"epoch": 2.3849056603773584,
"grad_norm": 0.2823808739473944,
"learning_rate": 1.1331841074426414e-05,
"loss": 0.5247,
"step": 1581
},
{
"epoch": 2.3864150943396227,
"grad_norm": 0.2970923159171053,
"learning_rate": 1.1303861219921658e-05,
"loss": 0.5034,
"step": 1582
},
{
"epoch": 2.3879245283018866,
"grad_norm": 0.2811796099376647,
"learning_rate": 1.12758813654169e-05,
"loss": 0.513,
"step": 1583
},
{
"epoch": 2.389433962264151,
"grad_norm": 0.28383899311229177,
"learning_rate": 1.1247901510912144e-05,
"loss": 0.4913,
"step": 1584
},
{
"epoch": 2.3909433962264153,
"grad_norm": 0.2965229723431742,
"learning_rate": 1.1219921656407386e-05,
"loss": 0.5565,
"step": 1585
},
{
"epoch": 2.392452830188679,
"grad_norm": 0.28613474578656095,
"learning_rate": 1.1191941801902631e-05,
"loss": 0.5035,
"step": 1586
},
{
"epoch": 2.3939622641509435,
"grad_norm": 0.333638319152662,
"learning_rate": 1.1163961947397875e-05,
"loss": 0.5502,
"step": 1587
},
{
"epoch": 2.3954716981132074,
"grad_norm": 0.2762683785182854,
"learning_rate": 1.1135982092893118e-05,
"loss": 0.4957,
"step": 1588
},
{
"epoch": 2.3969811320754717,
"grad_norm": 0.2841965209758776,
"learning_rate": 1.1108002238388361e-05,
"loss": 0.5323,
"step": 1589
},
{
"epoch": 2.398490566037736,
"grad_norm": 0.30355635904838757,
"learning_rate": 1.1080022383883603e-05,
"loss": 0.5362,
"step": 1590
},
{
"epoch": 2.4,
"grad_norm": 0.2613635701106073,
"learning_rate": 1.1052042529378848e-05,
"loss": 0.5163,
"step": 1591
},
{
"epoch": 2.4015094339622642,
"grad_norm": 0.616566657926194,
"learning_rate": 1.1024062674874091e-05,
"loss": 0.5233,
"step": 1592
},
{
"epoch": 2.403018867924528,
"grad_norm": 0.2693893123464824,
"learning_rate": 1.0996082820369335e-05,
"loss": 0.5075,
"step": 1593
},
{
"epoch": 2.4045283018867925,
"grad_norm": 0.27548371010084427,
"learning_rate": 1.0968102965864578e-05,
"loss": 0.5055,
"step": 1594
},
{
"epoch": 2.406037735849057,
"grad_norm": 0.2781662462502358,
"learning_rate": 1.0940123111359821e-05,
"loss": 0.5539,
"step": 1595
},
{
"epoch": 2.4075471698113207,
"grad_norm": 0.30912389736469736,
"learning_rate": 1.0912143256855065e-05,
"loss": 0.5981,
"step": 1596
},
{
"epoch": 2.409056603773585,
"grad_norm": 0.28742057157673107,
"learning_rate": 1.0884163402350308e-05,
"loss": 0.4773,
"step": 1597
},
{
"epoch": 2.410566037735849,
"grad_norm": 0.2544732632143813,
"learning_rate": 1.0856183547845552e-05,
"loss": 0.4747,
"step": 1598
},
{
"epoch": 2.412075471698113,
"grad_norm": 0.2542075611364493,
"learning_rate": 1.0828203693340795e-05,
"loss": 0.4653,
"step": 1599
},
{
"epoch": 2.4135849056603775,
"grad_norm": 0.28973359121279074,
"learning_rate": 1.0800223838836038e-05,
"loss": 0.5229,
"step": 1600
},
{
"epoch": 2.4150943396226414,
"grad_norm": 0.36347700962188983,
"learning_rate": 1.0772243984331282e-05,
"loss": 0.5372,
"step": 1601
},
{
"epoch": 2.4166037735849057,
"grad_norm": 0.27163267397417085,
"learning_rate": 1.0744264129826525e-05,
"loss": 0.5022,
"step": 1602
},
{
"epoch": 2.4181132075471696,
"grad_norm": 3.40159737994843,
"learning_rate": 1.0716284275321768e-05,
"loss": 0.5791,
"step": 1603
},
{
"epoch": 2.419622641509434,
"grad_norm": 0.3024980193028065,
"learning_rate": 1.0688304420817012e-05,
"loss": 0.5413,
"step": 1604
},
{
"epoch": 2.4211320754716983,
"grad_norm": 0.3191908802604592,
"learning_rate": 1.0660324566312257e-05,
"loss": 0.5505,
"step": 1605
},
{
"epoch": 2.422641509433962,
"grad_norm": 0.34247393903147477,
"learning_rate": 1.0632344711807498e-05,
"loss": 0.5156,
"step": 1606
},
{
"epoch": 2.4241509433962265,
"grad_norm": 0.30555621465573174,
"learning_rate": 1.0604364857302742e-05,
"loss": 0.5164,
"step": 1607
},
{
"epoch": 2.4256603773584904,
"grad_norm": 0.28883349420538895,
"learning_rate": 1.0576385002797987e-05,
"loss": 0.5196,
"step": 1608
},
{
"epoch": 2.4271698113207547,
"grad_norm": 0.29370588202466086,
"learning_rate": 1.0548405148293229e-05,
"loss": 0.4992,
"step": 1609
},
{
"epoch": 2.428679245283019,
"grad_norm": 0.2992724879265412,
"learning_rate": 1.0520425293788474e-05,
"loss": 0.5085,
"step": 1610
},
{
"epoch": 2.430188679245283,
"grad_norm": 0.32945758350811966,
"learning_rate": 1.0492445439283715e-05,
"loss": 0.552,
"step": 1611
},
{
"epoch": 2.4316981132075473,
"grad_norm": 0.2682269673076967,
"learning_rate": 1.046446558477896e-05,
"loss": 0.5156,
"step": 1612
},
{
"epoch": 2.433207547169811,
"grad_norm": 0.28647715743947366,
"learning_rate": 1.0436485730274204e-05,
"loss": 0.4993,
"step": 1613
},
{
"epoch": 2.4347169811320755,
"grad_norm": 0.2744939023949034,
"learning_rate": 1.0408505875769445e-05,
"loss": 0.5253,
"step": 1614
},
{
"epoch": 2.43622641509434,
"grad_norm": 0.35250397665004307,
"learning_rate": 1.038052602126469e-05,
"loss": 0.5396,
"step": 1615
},
{
"epoch": 2.4377358490566037,
"grad_norm": 0.3123726343205459,
"learning_rate": 1.0352546166759932e-05,
"loss": 0.5134,
"step": 1616
},
{
"epoch": 2.439245283018868,
"grad_norm": 0.2764691102160436,
"learning_rate": 1.0324566312255177e-05,
"loss": 0.5091,
"step": 1617
},
{
"epoch": 2.440754716981132,
"grad_norm": 0.2814627012575424,
"learning_rate": 1.029658645775042e-05,
"loss": 0.489,
"step": 1618
},
{
"epoch": 2.442264150943396,
"grad_norm": 0.39565086571218255,
"learning_rate": 1.0268606603245664e-05,
"loss": 0.5338,
"step": 1619
},
{
"epoch": 2.4437735849056605,
"grad_norm": 0.3089370742923853,
"learning_rate": 1.0240626748740907e-05,
"loss": 0.5511,
"step": 1620
},
{
"epoch": 2.4452830188679244,
"grad_norm": 0.27889082409535804,
"learning_rate": 1.021264689423615e-05,
"loss": 0.4989,
"step": 1621
},
{
"epoch": 2.4467924528301888,
"grad_norm": 0.2801027230431505,
"learning_rate": 1.0184667039731394e-05,
"loss": 0.5151,
"step": 1622
},
{
"epoch": 2.4483018867924526,
"grad_norm": 0.2864728776570272,
"learning_rate": 1.0156687185226637e-05,
"loss": 0.5427,
"step": 1623
},
{
"epoch": 2.449811320754717,
"grad_norm": 0.2870692848584596,
"learning_rate": 1.012870733072188e-05,
"loss": 0.5044,
"step": 1624
},
{
"epoch": 2.4513207547169813,
"grad_norm": 0.27728518831547827,
"learning_rate": 1.0100727476217124e-05,
"loss": 0.5263,
"step": 1625
},
{
"epoch": 2.452830188679245,
"grad_norm": 0.28108945164897814,
"learning_rate": 1.0072747621712367e-05,
"loss": 0.5278,
"step": 1626
},
{
"epoch": 2.4543396226415095,
"grad_norm": 0.281028089568473,
"learning_rate": 1.004476776720761e-05,
"loss": 0.526,
"step": 1627
},
{
"epoch": 2.4558490566037734,
"grad_norm": 0.29992491532026355,
"learning_rate": 1.0016787912702854e-05,
"loss": 0.5543,
"step": 1628
},
{
"epoch": 2.4573584905660377,
"grad_norm": 0.2446925829790535,
"learning_rate": 9.988808058198097e-06,
"loss": 0.4827,
"step": 1629
},
{
"epoch": 2.458867924528302,
"grad_norm": 0.29289490874253116,
"learning_rate": 9.960828203693341e-06,
"loss": 0.5348,
"step": 1630
},
{
"epoch": 2.460377358490566,
"grad_norm": 0.3010609740305946,
"learning_rate": 9.932848349188586e-06,
"loss": 0.5043,
"step": 1631
},
{
"epoch": 2.4618867924528303,
"grad_norm": 0.28789020180830055,
"learning_rate": 9.904868494683828e-06,
"loss": 0.5147,
"step": 1632
},
{
"epoch": 2.463396226415094,
"grad_norm": 0.26705365000795894,
"learning_rate": 9.876888640179071e-06,
"loss": 0.5287,
"step": 1633
},
{
"epoch": 2.4649056603773585,
"grad_norm": 0.2798599523967274,
"learning_rate": 9.848908785674316e-06,
"loss": 0.4933,
"step": 1634
},
{
"epoch": 2.466415094339623,
"grad_norm": 0.2733384516616599,
"learning_rate": 9.820928931169558e-06,
"loss": 0.5029,
"step": 1635
},
{
"epoch": 2.4679245283018867,
"grad_norm": 0.29643592834837407,
"learning_rate": 9.792949076664803e-06,
"loss": 0.5158,
"step": 1636
},
{
"epoch": 2.469433962264151,
"grad_norm": 0.28526464289507686,
"learning_rate": 9.764969222160044e-06,
"loss": 0.55,
"step": 1637
},
{
"epoch": 2.470943396226415,
"grad_norm": 0.26357930501597526,
"learning_rate": 9.73698936765529e-06,
"loss": 0.4749,
"step": 1638
},
{
"epoch": 2.4724528301886792,
"grad_norm": 0.27127677302581304,
"learning_rate": 9.709009513150533e-06,
"loss": 0.4896,
"step": 1639
},
{
"epoch": 2.4739622641509436,
"grad_norm": 0.3258038736676753,
"learning_rate": 9.681029658645774e-06,
"loss": 0.5453,
"step": 1640
},
{
"epoch": 2.4754716981132074,
"grad_norm": 0.2721708903510538,
"learning_rate": 9.65304980414102e-06,
"loss": 0.5088,
"step": 1641
},
{
"epoch": 2.4769811320754718,
"grad_norm": 0.25031552664361656,
"learning_rate": 9.625069949636261e-06,
"loss": 0.4432,
"step": 1642
},
{
"epoch": 2.4784905660377357,
"grad_norm": 0.27170615085235295,
"learning_rate": 9.597090095131506e-06,
"loss": 0.5401,
"step": 1643
},
{
"epoch": 2.48,
"grad_norm": 0.25077059492786125,
"learning_rate": 9.56911024062675e-06,
"loss": 0.5061,
"step": 1644
},
{
"epoch": 2.4815094339622643,
"grad_norm": 0.4381733597945453,
"learning_rate": 9.541130386121993e-06,
"loss": 0.5121,
"step": 1645
},
{
"epoch": 2.483018867924528,
"grad_norm": 0.30828080858038703,
"learning_rate": 9.513150531617236e-06,
"loss": 0.4767,
"step": 1646
},
{
"epoch": 2.4845283018867925,
"grad_norm": 0.2679178849366846,
"learning_rate": 9.48517067711248e-06,
"loss": 0.5115,
"step": 1647
},
{
"epoch": 2.486037735849057,
"grad_norm": 0.25498782051352026,
"learning_rate": 9.457190822607723e-06,
"loss": 0.4925,
"step": 1648
},
{
"epoch": 2.4875471698113207,
"grad_norm": 0.2712046865589304,
"learning_rate": 9.429210968102966e-06,
"loss": 0.5239,
"step": 1649
},
{
"epoch": 2.489056603773585,
"grad_norm": 0.2841429645079444,
"learning_rate": 9.40123111359821e-06,
"loss": 0.5038,
"step": 1650
},
{
"epoch": 2.490566037735849,
"grad_norm": 0.2922250090980482,
"learning_rate": 9.373251259093453e-06,
"loss": 0.5323,
"step": 1651
},
{
"epoch": 2.4920754716981133,
"grad_norm": 0.3490130576354896,
"learning_rate": 9.345271404588697e-06,
"loss": 0.5219,
"step": 1652
},
{
"epoch": 2.493584905660377,
"grad_norm": 0.2911807875623749,
"learning_rate": 9.31729155008394e-06,
"loss": 0.5356,
"step": 1653
},
{
"epoch": 2.4950943396226415,
"grad_norm": 0.26818440305605495,
"learning_rate": 9.289311695579183e-06,
"loss": 0.4687,
"step": 1654
},
{
"epoch": 2.496603773584906,
"grad_norm": 0.2629570306134703,
"learning_rate": 9.261331841074427e-06,
"loss": 0.5058,
"step": 1655
},
{
"epoch": 2.4981132075471697,
"grad_norm": 0.27394341115092274,
"learning_rate": 9.23335198656967e-06,
"loss": 0.5189,
"step": 1656
},
{
"epoch": 2.499622641509434,
"grad_norm": 0.31719664729022873,
"learning_rate": 9.205372132064913e-06,
"loss": 0.5479,
"step": 1657
},
{
"epoch": 2.5011320754716984,
"grad_norm": 0.28948224769873077,
"learning_rate": 9.177392277560157e-06,
"loss": 0.5195,
"step": 1658
},
{
"epoch": 2.5026415094339622,
"grad_norm": 0.27872732971809544,
"learning_rate": 9.1494124230554e-06,
"loss": 0.4954,
"step": 1659
},
{
"epoch": 2.5041509433962266,
"grad_norm": 0.2972680704163209,
"learning_rate": 9.121432568550645e-06,
"loss": 0.5492,
"step": 1660
},
{
"epoch": 2.5056603773584905,
"grad_norm": 0.258478206277154,
"learning_rate": 9.093452714045887e-06,
"loss": 0.4811,
"step": 1661
},
{
"epoch": 2.507169811320755,
"grad_norm": 0.27135076957243554,
"learning_rate": 9.065472859541132e-06,
"loss": 0.5241,
"step": 1662
},
{
"epoch": 2.5086792452830187,
"grad_norm": 0.25049127152785716,
"learning_rate": 9.037493005036374e-06,
"loss": 0.5322,
"step": 1663
},
{
"epoch": 2.510188679245283,
"grad_norm": 0.2414933347105079,
"learning_rate": 9.009513150531617e-06,
"loss": 0.5065,
"step": 1664
},
{
"epoch": 2.5116981132075473,
"grad_norm": 0.26905976255414465,
"learning_rate": 8.981533296026862e-06,
"loss": 0.5204,
"step": 1665
},
{
"epoch": 2.513207547169811,
"grad_norm": 0.2574263160663239,
"learning_rate": 8.953553441522104e-06,
"loss": 0.4995,
"step": 1666
},
{
"epoch": 2.5147169811320755,
"grad_norm": 0.2412927664002052,
"learning_rate": 8.925573587017349e-06,
"loss": 0.5189,
"step": 1667
},
{
"epoch": 2.51622641509434,
"grad_norm": 0.2724697409186211,
"learning_rate": 8.89759373251259e-06,
"loss": 0.4707,
"step": 1668
},
{
"epoch": 2.5177358490566037,
"grad_norm": 0.2649030391477551,
"learning_rate": 8.869613878007835e-06,
"loss": 0.5068,
"step": 1669
},
{
"epoch": 2.519245283018868,
"grad_norm": 1.937782774908197,
"learning_rate": 8.841634023503079e-06,
"loss": 0.4939,
"step": 1670
},
{
"epoch": 2.520754716981132,
"grad_norm": 0.26438563805690896,
"learning_rate": 8.81365416899832e-06,
"loss": 0.5031,
"step": 1671
},
{
"epoch": 2.5222641509433963,
"grad_norm": 0.254419310430134,
"learning_rate": 8.785674314493565e-06,
"loss": 0.5183,
"step": 1672
},
{
"epoch": 2.52377358490566,
"grad_norm": 0.27032143419025384,
"learning_rate": 8.757694459988809e-06,
"loss": 0.4891,
"step": 1673
},
{
"epoch": 2.5252830188679245,
"grad_norm": 0.2714305046812353,
"learning_rate": 8.729714605484052e-06,
"loss": 0.5136,
"step": 1674
},
{
"epoch": 2.526792452830189,
"grad_norm": 0.27145056603532564,
"learning_rate": 8.701734750979296e-06,
"loss": 0.5275,
"step": 1675
},
{
"epoch": 2.5283018867924527,
"grad_norm": 0.2636503580959262,
"learning_rate": 8.673754896474539e-06,
"loss": 0.5661,
"step": 1676
},
{
"epoch": 2.529811320754717,
"grad_norm": 0.2862829068236602,
"learning_rate": 8.645775041969782e-06,
"loss": 0.5069,
"step": 1677
},
{
"epoch": 2.5313207547169814,
"grad_norm": 0.2629202015341607,
"learning_rate": 8.617795187465026e-06,
"loss": 0.5216,
"step": 1678
},
{
"epoch": 2.5328301886792453,
"grad_norm": 0.2512663214374365,
"learning_rate": 8.589815332960269e-06,
"loss": 0.4779,
"step": 1679
},
{
"epoch": 2.5343396226415096,
"grad_norm": 0.2582674957787444,
"learning_rate": 8.561835478455512e-06,
"loss": 0.5012,
"step": 1680
},
{
"epoch": 2.5358490566037735,
"grad_norm": 0.26493998715379696,
"learning_rate": 8.533855623950756e-06,
"loss": 0.5263,
"step": 1681
},
{
"epoch": 2.537358490566038,
"grad_norm": 0.2451699637847086,
"learning_rate": 8.505875769445999e-06,
"loss": 0.4737,
"step": 1682
},
{
"epoch": 2.5388679245283017,
"grad_norm": 0.2646435180201724,
"learning_rate": 8.477895914941242e-06,
"loss": 0.5208,
"step": 1683
},
{
"epoch": 2.540377358490566,
"grad_norm": 0.2582198344619996,
"learning_rate": 8.449916060436486e-06,
"loss": 0.4994,
"step": 1684
},
{
"epoch": 2.5418867924528303,
"grad_norm": 0.24851559763189898,
"learning_rate": 8.42193620593173e-06,
"loss": 0.4999,
"step": 1685
},
{
"epoch": 2.543396226415094,
"grad_norm": 0.2554578588043926,
"learning_rate": 8.393956351426974e-06,
"loss": 0.5173,
"step": 1686
},
{
"epoch": 2.5449056603773585,
"grad_norm": 0.29270235106198766,
"learning_rate": 8.365976496922216e-06,
"loss": 0.5509,
"step": 1687
},
{
"epoch": 2.546415094339623,
"grad_norm": 0.2521719040156708,
"learning_rate": 8.337996642417461e-06,
"loss": 0.4927,
"step": 1688
},
{
"epoch": 2.5479245283018868,
"grad_norm": 0.24926253146252536,
"learning_rate": 8.310016787912703e-06,
"loss": 0.5119,
"step": 1689
},
{
"epoch": 2.549433962264151,
"grad_norm": 0.24764507855189977,
"learning_rate": 8.282036933407946e-06,
"loss": 0.5169,
"step": 1690
},
{
"epoch": 2.550943396226415,
"grad_norm": 0.2846935555038941,
"learning_rate": 8.254057078903191e-06,
"loss": 0.5303,
"step": 1691
},
{
"epoch": 2.5524528301886793,
"grad_norm": 1.3298175058900719,
"learning_rate": 8.226077224398433e-06,
"loss": 0.4987,
"step": 1692
},
{
"epoch": 2.553962264150943,
"grad_norm": 0.26585384992525096,
"learning_rate": 8.198097369893678e-06,
"loss": 0.533,
"step": 1693
},
{
"epoch": 2.5554716981132075,
"grad_norm": 0.26892517763510493,
"learning_rate": 8.17011751538892e-06,
"loss": 0.5505,
"step": 1694
},
{
"epoch": 2.556981132075472,
"grad_norm": 0.27708427633239685,
"learning_rate": 8.142137660884165e-06,
"loss": 0.4835,
"step": 1695
},
{
"epoch": 2.5584905660377357,
"grad_norm": 0.25860083912017584,
"learning_rate": 8.114157806379408e-06,
"loss": 0.4909,
"step": 1696
},
{
"epoch": 2.56,
"grad_norm": 0.28186533953501614,
"learning_rate": 8.08617795187465e-06,
"loss": 0.5547,
"step": 1697
},
{
"epoch": 2.5615094339622644,
"grad_norm": 0.3569206208752698,
"learning_rate": 8.058198097369895e-06,
"loss": 0.5023,
"step": 1698
},
{
"epoch": 2.5630188679245283,
"grad_norm": 0.26678040300011296,
"learning_rate": 8.030218242865136e-06,
"loss": 0.514,
"step": 1699
},
{
"epoch": 2.5645283018867926,
"grad_norm": 0.27519779245794745,
"learning_rate": 8.002238388360381e-06,
"loss": 0.499,
"step": 1700
},
{
"epoch": 2.5660377358490565,
"grad_norm": 0.28184342859209355,
"learning_rate": 7.974258533855625e-06,
"loss": 0.5079,
"step": 1701
},
{
"epoch": 2.567547169811321,
"grad_norm": 0.2696907607563735,
"learning_rate": 7.946278679350868e-06,
"loss": 0.5349,
"step": 1702
},
{
"epoch": 2.5690566037735847,
"grad_norm": 0.34249892968895884,
"learning_rate": 7.918298824846111e-06,
"loss": 0.4951,
"step": 1703
},
{
"epoch": 2.570566037735849,
"grad_norm": 0.28104386328396547,
"learning_rate": 7.890318970341355e-06,
"loss": 0.4748,
"step": 1704
},
{
"epoch": 2.5720754716981133,
"grad_norm": 0.27841856070995974,
"learning_rate": 7.862339115836598e-06,
"loss": 0.5164,
"step": 1705
},
{
"epoch": 2.5735849056603772,
"grad_norm": 0.26103373360378673,
"learning_rate": 7.834359261331841e-06,
"loss": 0.5196,
"step": 1706
},
{
"epoch": 2.5750943396226416,
"grad_norm": 0.28226689926781434,
"learning_rate": 7.806379406827085e-06,
"loss": 0.5471,
"step": 1707
},
{
"epoch": 2.576603773584906,
"grad_norm": 0.260925718946685,
"learning_rate": 7.778399552322328e-06,
"loss": 0.5245,
"step": 1708
},
{
"epoch": 2.5781132075471698,
"grad_norm": 0.2651170853166187,
"learning_rate": 7.750419697817572e-06,
"loss": 0.5191,
"step": 1709
},
{
"epoch": 2.579622641509434,
"grad_norm": 0.2878754716574323,
"learning_rate": 7.722439843312815e-06,
"loss": 0.5276,
"step": 1710
},
{
"epoch": 2.581132075471698,
"grad_norm": 0.38149999629802744,
"learning_rate": 7.694459988808058e-06,
"loss": 0.5063,
"step": 1711
},
{
"epoch": 2.5826415094339623,
"grad_norm": 0.25769713900097513,
"learning_rate": 7.666480134303302e-06,
"loss": 0.4829,
"step": 1712
},
{
"epoch": 2.584150943396226,
"grad_norm": 0.2537097287006656,
"learning_rate": 7.638500279798545e-06,
"loss": 0.5452,
"step": 1713
},
{
"epoch": 2.5856603773584905,
"grad_norm": 0.26625886535127746,
"learning_rate": 7.610520425293789e-06,
"loss": 0.4987,
"step": 1714
},
{
"epoch": 2.587169811320755,
"grad_norm": 0.27567634014604986,
"learning_rate": 7.582540570789032e-06,
"loss": 0.5292,
"step": 1715
},
{
"epoch": 2.5886792452830187,
"grad_norm": 0.2745735348694907,
"learning_rate": 7.554560716284276e-06,
"loss": 0.5124,
"step": 1716
},
{
"epoch": 2.590188679245283,
"grad_norm": 0.27869253594934607,
"learning_rate": 7.526580861779519e-06,
"loss": 0.5011,
"step": 1717
},
{
"epoch": 2.5916981132075474,
"grad_norm": 0.26083708585981563,
"learning_rate": 7.498601007274763e-06,
"loss": 0.5063,
"step": 1718
},
{
"epoch": 2.5932075471698113,
"grad_norm": 0.25898145180183996,
"learning_rate": 7.470621152770006e-06,
"loss": 0.5263,
"step": 1719
},
{
"epoch": 2.5947169811320756,
"grad_norm": 0.25116990389413063,
"learning_rate": 7.4426412982652486e-06,
"loss": 0.5004,
"step": 1720
},
{
"epoch": 2.5962264150943395,
"grad_norm": 0.26255445723201376,
"learning_rate": 7.414661443760493e-06,
"loss": 0.4588,
"step": 1721
},
{
"epoch": 2.597735849056604,
"grad_norm": 0.29059302191645914,
"learning_rate": 7.386681589255737e-06,
"loss": 0.5715,
"step": 1722
},
{
"epoch": 2.5992452830188677,
"grad_norm": 0.259169324744869,
"learning_rate": 7.3587017347509795e-06,
"loss": 0.5066,
"step": 1723
},
{
"epoch": 2.600754716981132,
"grad_norm": 0.2550956884217296,
"learning_rate": 7.330721880246223e-06,
"loss": 0.4929,
"step": 1724
},
{
"epoch": 2.6022641509433964,
"grad_norm": 0.26605119100893077,
"learning_rate": 7.302742025741466e-06,
"loss": 0.5148,
"step": 1725
},
{
"epoch": 2.6037735849056602,
"grad_norm": 0.2839684129223697,
"learning_rate": 7.27476217123671e-06,
"loss": 0.5379,
"step": 1726
},
{
"epoch": 2.6052830188679246,
"grad_norm": 0.24671746161838828,
"learning_rate": 7.246782316731954e-06,
"loss": 0.5344,
"step": 1727
},
{
"epoch": 2.606792452830189,
"grad_norm": 0.26950728167251553,
"learning_rate": 7.218802462227196e-06,
"loss": 0.5269,
"step": 1728
},
{
"epoch": 2.608301886792453,
"grad_norm": 0.30410413396696345,
"learning_rate": 7.1908226077224405e-06,
"loss": 0.5198,
"step": 1729
},
{
"epoch": 2.609811320754717,
"grad_norm": 0.3012042476084446,
"learning_rate": 7.162842753217684e-06,
"loss": 0.5261,
"step": 1730
},
{
"epoch": 2.611320754716981,
"grad_norm": 0.25613913625539647,
"learning_rate": 7.134862898712926e-06,
"loss": 0.5445,
"step": 1731
},
{
"epoch": 2.6128301886792453,
"grad_norm": 0.24351065578422196,
"learning_rate": 7.106883044208171e-06,
"loss": 0.4737,
"step": 1732
},
{
"epoch": 2.614339622641509,
"grad_norm": 0.27310531472703253,
"learning_rate": 7.078903189703413e-06,
"loss": 0.5484,
"step": 1733
},
{
"epoch": 2.6158490566037735,
"grad_norm": 0.27115505717512334,
"learning_rate": 7.050923335198657e-06,
"loss": 0.5029,
"step": 1734
},
{
"epoch": 2.617358490566038,
"grad_norm": 0.3029677136637512,
"learning_rate": 7.0229434806939016e-06,
"loss": 0.5418,
"step": 1735
},
{
"epoch": 2.6188679245283017,
"grad_norm": 0.27908867772659823,
"learning_rate": 6.994963626189144e-06,
"loss": 0.5047,
"step": 1736
},
{
"epoch": 2.620377358490566,
"grad_norm": 0.26058238891559354,
"learning_rate": 6.9669837716843874e-06,
"loss": 0.4792,
"step": 1737
},
{
"epoch": 2.6218867924528304,
"grad_norm": 0.2675541705765002,
"learning_rate": 6.93900391717963e-06,
"loss": 0.5317,
"step": 1738
},
{
"epoch": 2.6233962264150943,
"grad_norm": 0.27497640444418847,
"learning_rate": 6.911024062674874e-06,
"loss": 0.5155,
"step": 1739
},
{
"epoch": 2.6249056603773586,
"grad_norm": 0.2744416701414958,
"learning_rate": 6.883044208170118e-06,
"loss": 0.5331,
"step": 1740
},
{
"epoch": 2.6264150943396225,
"grad_norm": 0.2601570495959637,
"learning_rate": 6.855064353665361e-06,
"loss": 0.5181,
"step": 1741
},
{
"epoch": 2.627924528301887,
"grad_norm": 0.5745643045304843,
"learning_rate": 6.827084499160605e-06,
"loss": 0.543,
"step": 1742
},
{
"epoch": 2.6294339622641507,
"grad_norm": 0.2511923804891276,
"learning_rate": 6.7991046446558485e-06,
"loss": 0.5087,
"step": 1743
},
{
"epoch": 2.630943396226415,
"grad_norm": 0.28546674221830076,
"learning_rate": 6.771124790151091e-06,
"loss": 0.5444,
"step": 1744
},
{
"epoch": 2.6324528301886794,
"grad_norm": 0.28540740645360846,
"learning_rate": 6.743144935646335e-06,
"loss": 0.5114,
"step": 1745
},
{
"epoch": 2.6339622641509433,
"grad_norm": 0.2930716219200601,
"learning_rate": 6.715165081141578e-06,
"loss": 0.4819,
"step": 1746
},
{
"epoch": 2.6354716981132076,
"grad_norm": 0.25044896688250096,
"learning_rate": 6.687185226636822e-06,
"loss": 0.5132,
"step": 1747
},
{
"epoch": 2.636981132075472,
"grad_norm": 0.27658711363810984,
"learning_rate": 6.659205372132066e-06,
"loss": 0.5184,
"step": 1748
},
{
"epoch": 2.638490566037736,
"grad_norm": 1.2715522760039857,
"learning_rate": 6.631225517627309e-06,
"loss": 0.4769,
"step": 1749
},
{
"epoch": 2.64,
"grad_norm": 0.2606537545098945,
"learning_rate": 6.603245663122552e-06,
"loss": 0.5156,
"step": 1750
},
{
"epoch": 2.641509433962264,
"grad_norm": 0.25476970965477846,
"learning_rate": 6.5752658086177945e-06,
"loss": 0.5238,
"step": 1751
},
{
"epoch": 2.6430188679245283,
"grad_norm": 0.2760824398454106,
"learning_rate": 6.547285954113039e-06,
"loss": 0.4519,
"step": 1752
},
{
"epoch": 2.644528301886792,
"grad_norm": 0.2552261697445966,
"learning_rate": 6.519306099608283e-06,
"loss": 0.5238,
"step": 1753
},
{
"epoch": 2.6460377358490565,
"grad_norm": 0.2505994172404276,
"learning_rate": 6.4913262451035254e-06,
"loss": 0.4914,
"step": 1754
},
{
"epoch": 2.647547169811321,
"grad_norm": 0.2622458535402264,
"learning_rate": 6.46334639059877e-06,
"loss": 0.5338,
"step": 1755
},
{
"epoch": 2.6490566037735848,
"grad_norm": 0.2678028391066474,
"learning_rate": 6.435366536094013e-06,
"loss": 0.5065,
"step": 1756
},
{
"epoch": 2.650566037735849,
"grad_norm": 0.2484890494798296,
"learning_rate": 6.4073866815892555e-06,
"loss": 0.4955,
"step": 1757
},
{
"epoch": 2.6520754716981134,
"grad_norm": 0.2400998987991155,
"learning_rate": 6.3794068270845e-06,
"loss": 0.479,
"step": 1758
},
{
"epoch": 2.6535849056603773,
"grad_norm": 0.32685735246268977,
"learning_rate": 6.351426972579742e-06,
"loss": 0.538,
"step": 1759
},
{
"epoch": 2.6550943396226416,
"grad_norm": 0.2603240897335215,
"learning_rate": 6.3234471180749865e-06,
"loss": 0.498,
"step": 1760
},
{
"epoch": 2.6566037735849055,
"grad_norm": 0.26860424402573896,
"learning_rate": 6.29546726357023e-06,
"loss": 0.5033,
"step": 1761
},
{
"epoch": 2.65811320754717,
"grad_norm": 0.27530236656485146,
"learning_rate": 6.267487409065473e-06,
"loss": 0.4868,
"step": 1762
},
{
"epoch": 2.6596226415094337,
"grad_norm": 0.24655990905403674,
"learning_rate": 6.2395075545607166e-06,
"loss": 0.4981,
"step": 1763
},
{
"epoch": 2.661132075471698,
"grad_norm": 0.2620387036360828,
"learning_rate": 6.21152770005596e-06,
"loss": 0.5074,
"step": 1764
},
{
"epoch": 2.6626415094339624,
"grad_norm": 0.2680227306633879,
"learning_rate": 6.183547845551203e-06,
"loss": 0.5265,
"step": 1765
},
{
"epoch": 2.6641509433962263,
"grad_norm": 0.30502521082126244,
"learning_rate": 6.155567991046447e-06,
"loss": 0.5369,
"step": 1766
},
{
"epoch": 2.6656603773584906,
"grad_norm": 0.2600029610076752,
"learning_rate": 6.127588136541691e-06,
"loss": 0.496,
"step": 1767
},
{
"epoch": 2.667169811320755,
"grad_norm": 0.31160692938753265,
"learning_rate": 6.099608282036933e-06,
"loss": 0.5235,
"step": 1768
},
{
"epoch": 2.668679245283019,
"grad_norm": 0.25597688257703527,
"learning_rate": 6.071628427532177e-06,
"loss": 0.5042,
"step": 1769
},
{
"epoch": 2.670188679245283,
"grad_norm": 0.2713273198602419,
"learning_rate": 6.04364857302742e-06,
"loss": 0.5259,
"step": 1770
},
{
"epoch": 2.671698113207547,
"grad_norm": 0.2665123339756921,
"learning_rate": 6.015668718522664e-06,
"loss": 0.4882,
"step": 1771
},
{
"epoch": 2.6732075471698113,
"grad_norm": 0.25296669006459144,
"learning_rate": 5.987688864017908e-06,
"loss": 0.5413,
"step": 1772
},
{
"epoch": 2.6747169811320752,
"grad_norm": 0.2429367748155575,
"learning_rate": 5.959709009513151e-06,
"loss": 0.518,
"step": 1773
},
{
"epoch": 2.6762264150943396,
"grad_norm": 0.23850623638622218,
"learning_rate": 5.931729155008394e-06,
"loss": 0.4992,
"step": 1774
},
{
"epoch": 2.677735849056604,
"grad_norm": 0.2749471413882996,
"learning_rate": 5.903749300503638e-06,
"loss": 0.5218,
"step": 1775
},
{
"epoch": 2.6792452830188678,
"grad_norm": 0.27456504003042126,
"learning_rate": 5.875769445998881e-06,
"loss": 0.471,
"step": 1776
},
{
"epoch": 2.680754716981132,
"grad_norm": 6.661889115901275,
"learning_rate": 5.8477895914941245e-06,
"loss": 0.8345,
"step": 1777
},
{
"epoch": 2.6822641509433964,
"grad_norm": 0.3348543340219312,
"learning_rate": 5.819809736989368e-06,
"loss": 0.529,
"step": 1778
},
{
"epoch": 2.6837735849056603,
"grad_norm": 0.4156712628420884,
"learning_rate": 5.791829882484611e-06,
"loss": 0.5019,
"step": 1779
},
{
"epoch": 2.6852830188679246,
"grad_norm": 0.3163158572906154,
"learning_rate": 5.763850027979855e-06,
"loss": 0.5285,
"step": 1780
},
{
"epoch": 2.6867924528301885,
"grad_norm": 0.2574033108522731,
"learning_rate": 5.735870173475098e-06,
"loss": 0.5036,
"step": 1781
},
{
"epoch": 2.688301886792453,
"grad_norm": 0.2579216297103016,
"learning_rate": 5.707890318970341e-06,
"loss": 0.5149,
"step": 1782
},
{
"epoch": 2.6898113207547167,
"grad_norm": 0.2566957267530941,
"learning_rate": 5.679910464465585e-06,
"loss": 0.5001,
"step": 1783
},
{
"epoch": 2.691320754716981,
"grad_norm": 0.2440800355297943,
"learning_rate": 5.651930609960829e-06,
"loss": 0.4883,
"step": 1784
},
{
"epoch": 2.6928301886792454,
"grad_norm": 0.2474228533524819,
"learning_rate": 5.623950755456072e-06,
"loss": 0.51,
"step": 1785
},
{
"epoch": 2.6943396226415093,
"grad_norm": 0.2482868350432357,
"learning_rate": 5.595970900951316e-06,
"loss": 0.5291,
"step": 1786
},
{
"epoch": 2.6958490566037736,
"grad_norm": 0.23259366850007013,
"learning_rate": 5.567991046446559e-06,
"loss": 0.4855,
"step": 1787
},
{
"epoch": 2.697358490566038,
"grad_norm": 0.2460435686988639,
"learning_rate": 5.5400111919418015e-06,
"loss": 0.5084,
"step": 1788
},
{
"epoch": 2.698867924528302,
"grad_norm": 0.26038359055504035,
"learning_rate": 5.512031337437046e-06,
"loss": 0.5132,
"step": 1789
},
{
"epoch": 2.700377358490566,
"grad_norm": 0.28067308819480147,
"learning_rate": 5.484051482932289e-06,
"loss": 0.567,
"step": 1790
},
{
"epoch": 2.70188679245283,
"grad_norm": 0.2471009110626984,
"learning_rate": 5.456071628427532e-06,
"loss": 0.516,
"step": 1791
},
{
"epoch": 2.7033962264150944,
"grad_norm": 0.26721610477068264,
"learning_rate": 5.428091773922776e-06,
"loss": 0.5093,
"step": 1792
},
{
"epoch": 2.7049056603773582,
"grad_norm": 0.2498621325235465,
"learning_rate": 5.400111919418019e-06,
"loss": 0.4712,
"step": 1793
},
{
"epoch": 2.7064150943396226,
"grad_norm": 0.25672107157026186,
"learning_rate": 5.3721320649132625e-06,
"loss": 0.4877,
"step": 1794
},
{
"epoch": 2.707924528301887,
"grad_norm": 0.22665414380543533,
"learning_rate": 5.344152210408506e-06,
"loss": 0.4315,
"step": 1795
},
{
"epoch": 2.709433962264151,
"grad_norm": 0.24408552016267313,
"learning_rate": 5.316172355903749e-06,
"loss": 0.5067,
"step": 1796
},
{
"epoch": 2.710943396226415,
"grad_norm": 0.24060193772424485,
"learning_rate": 5.2881925013989934e-06,
"loss": 0.4887,
"step": 1797
},
{
"epoch": 2.7124528301886794,
"grad_norm": 0.2626077977831592,
"learning_rate": 5.260212646894237e-06,
"loss": 0.5321,
"step": 1798
},
{
"epoch": 2.7139622641509433,
"grad_norm": 0.26635629306405656,
"learning_rate": 5.23223279238948e-06,
"loss": 0.4984,
"step": 1799
},
{
"epoch": 2.7154716981132077,
"grad_norm": 0.27439116564294674,
"learning_rate": 5.204252937884723e-06,
"loss": 0.5383,
"step": 1800
},
{
"epoch": 2.7169811320754715,
"grad_norm": 0.27336316693975615,
"learning_rate": 5.176273083379966e-06,
"loss": 0.4991,
"step": 1801
},
{
"epoch": 2.718490566037736,
"grad_norm": 0.25078792605060657,
"learning_rate": 5.14829322887521e-06,
"loss": 0.4746,
"step": 1802
},
{
"epoch": 2.7199999999999998,
"grad_norm": 0.2579469072246058,
"learning_rate": 5.120313374370454e-06,
"loss": 0.528,
"step": 1803
},
{
"epoch": 2.721509433962264,
"grad_norm": 0.3410333484332067,
"learning_rate": 5.092333519865697e-06,
"loss": 0.4824,
"step": 1804
},
{
"epoch": 2.7230188679245284,
"grad_norm": 0.34511657220725606,
"learning_rate": 5.06435366536094e-06,
"loss": 0.4832,
"step": 1805
},
{
"epoch": 2.7245283018867923,
"grad_norm": 0.23917828132388028,
"learning_rate": 5.036373810856184e-06,
"loss": 0.5131,
"step": 1806
},
{
"epoch": 2.7260377358490566,
"grad_norm": 0.250710939200575,
"learning_rate": 5.008393956351427e-06,
"loss": 0.5168,
"step": 1807
},
{
"epoch": 2.727547169811321,
"grad_norm": 0.25569387237719443,
"learning_rate": 4.9804141018466704e-06,
"loss": 0.5251,
"step": 1808
},
{
"epoch": 2.729056603773585,
"grad_norm": 0.2595212700539186,
"learning_rate": 4.952434247341914e-06,
"loss": 0.4766,
"step": 1809
},
{
"epoch": 2.730566037735849,
"grad_norm": 0.3827330155254612,
"learning_rate": 4.924454392837158e-06,
"loss": 0.5103,
"step": 1810
},
{
"epoch": 2.732075471698113,
"grad_norm": 0.24296190605611276,
"learning_rate": 4.896474538332401e-06,
"loss": 0.4906,
"step": 1811
},
{
"epoch": 2.7335849056603774,
"grad_norm": 0.4549554603421421,
"learning_rate": 4.868494683827645e-06,
"loss": 0.4825,
"step": 1812
},
{
"epoch": 2.7350943396226413,
"grad_norm": 0.27193474503030435,
"learning_rate": 4.840514829322887e-06,
"loss": 0.4838,
"step": 1813
},
{
"epoch": 2.7366037735849056,
"grad_norm": 0.2563880809210197,
"learning_rate": 4.812534974818131e-06,
"loss": 0.5064,
"step": 1814
},
{
"epoch": 2.73811320754717,
"grad_norm": 0.2548907465195013,
"learning_rate": 4.784555120313375e-06,
"loss": 0.5096,
"step": 1815
},
{
"epoch": 2.739622641509434,
"grad_norm": 0.2535545744550968,
"learning_rate": 4.756575265808618e-06,
"loss": 0.5075,
"step": 1816
},
{
"epoch": 2.741132075471698,
"grad_norm": 0.2538458515041986,
"learning_rate": 4.7285954113038615e-06,
"loss": 0.5233,
"step": 1817
},
{
"epoch": 2.7426415094339625,
"grad_norm": 0.2449829091378601,
"learning_rate": 4.700615556799105e-06,
"loss": 0.4963,
"step": 1818
},
{
"epoch": 2.7441509433962263,
"grad_norm": 0.2583453664244264,
"learning_rate": 4.672635702294348e-06,
"loss": 0.5075,
"step": 1819
},
{
"epoch": 2.7456603773584907,
"grad_norm": 0.24305960423400333,
"learning_rate": 4.644655847789592e-06,
"loss": 0.5265,
"step": 1820
},
{
"epoch": 2.7471698113207546,
"grad_norm": 0.24200712952451542,
"learning_rate": 4.616675993284835e-06,
"loss": 0.4775,
"step": 1821
},
{
"epoch": 2.748679245283019,
"grad_norm": 0.28201193390392365,
"learning_rate": 4.588696138780078e-06,
"loss": 0.4837,
"step": 1822
},
{
"epoch": 2.7501886792452828,
"grad_norm": 0.26394748624366776,
"learning_rate": 4.5607162842753226e-06,
"loss": 0.5334,
"step": 1823
},
{
"epoch": 2.751698113207547,
"grad_norm": 0.24720195678002388,
"learning_rate": 4.532736429770566e-06,
"loss": 0.5021,
"step": 1824
},
{
"epoch": 2.7532075471698114,
"grad_norm": 0.30947073261476093,
"learning_rate": 4.5047565752658084e-06,
"loss": 0.5546,
"step": 1825
},
{
"epoch": 2.7547169811320753,
"grad_norm": 0.2988805165162394,
"learning_rate": 4.476776720761052e-06,
"loss": 0.5042,
"step": 1826
},
{
"epoch": 2.7562264150943396,
"grad_norm": 0.2462987932828045,
"learning_rate": 4.448796866256295e-06,
"loss": 0.5357,
"step": 1827
},
{
"epoch": 2.757735849056604,
"grad_norm": 0.2415826373536316,
"learning_rate": 4.420817011751539e-06,
"loss": 0.4834,
"step": 1828
},
{
"epoch": 2.759245283018868,
"grad_norm": 0.2535333115257667,
"learning_rate": 4.392837157246783e-06,
"loss": 0.5412,
"step": 1829
},
{
"epoch": 2.760754716981132,
"grad_norm": 0.24596768950408637,
"learning_rate": 4.364857302742026e-06,
"loss": 0.5119,
"step": 1830
},
{
"epoch": 2.7622641509433965,
"grad_norm": 0.24845301619797933,
"learning_rate": 4.3368774482372695e-06,
"loss": 0.5117,
"step": 1831
},
{
"epoch": 2.7637735849056604,
"grad_norm": 2.3475646101301897,
"learning_rate": 4.308897593732513e-06,
"loss": 0.509,
"step": 1832
},
{
"epoch": 2.7652830188679243,
"grad_norm": 0.2544934651097907,
"learning_rate": 4.280917739227756e-06,
"loss": 0.5145,
"step": 1833
},
{
"epoch": 2.7667924528301886,
"grad_norm": 0.2634896263966863,
"learning_rate": 4.2529378847229995e-06,
"loss": 0.5485,
"step": 1834
},
{
"epoch": 2.768301886792453,
"grad_norm": 0.25651084274796315,
"learning_rate": 4.224958030218243e-06,
"loss": 0.5134,
"step": 1835
},
{
"epoch": 2.769811320754717,
"grad_norm": 0.2646923542784634,
"learning_rate": 4.196978175713487e-06,
"loss": 0.5164,
"step": 1836
},
{
"epoch": 2.771320754716981,
"grad_norm": 0.26044964960565437,
"learning_rate": 4.1689983212087305e-06,
"loss": 0.5161,
"step": 1837
},
{
"epoch": 2.7728301886792455,
"grad_norm": 0.231525130140077,
"learning_rate": 4.141018466703973e-06,
"loss": 0.4982,
"step": 1838
},
{
"epoch": 2.7743396226415094,
"grad_norm": 0.25942703847568144,
"learning_rate": 4.113038612199216e-06,
"loss": 0.5105,
"step": 1839
},
{
"epoch": 2.7758490566037737,
"grad_norm": 0.26605983694964946,
"learning_rate": 4.08505875769446e-06,
"loss": 0.5111,
"step": 1840
},
{
"epoch": 2.777358490566038,
"grad_norm": 0.24395856253509673,
"learning_rate": 4.057078903189704e-06,
"loss": 0.4747,
"step": 1841
},
{
"epoch": 2.778867924528302,
"grad_norm": 0.26184813710048754,
"learning_rate": 4.029099048684947e-06,
"loss": 0.5157,
"step": 1842
},
{
"epoch": 2.7803773584905658,
"grad_norm": 0.26268875875244085,
"learning_rate": 4.001119194180191e-06,
"loss": 0.5598,
"step": 1843
},
{
"epoch": 2.78188679245283,
"grad_norm": 0.23009142240299044,
"learning_rate": 3.973139339675434e-06,
"loss": 0.4734,
"step": 1844
},
{
"epoch": 2.7833962264150944,
"grad_norm": 0.2458756232178251,
"learning_rate": 3.945159485170677e-06,
"loss": 0.4777,
"step": 1845
},
{
"epoch": 2.7849056603773583,
"grad_norm": 0.24794152608284992,
"learning_rate": 3.917179630665921e-06,
"loss": 0.4422,
"step": 1846
},
{
"epoch": 2.7864150943396226,
"grad_norm": 0.24897274231650662,
"learning_rate": 3.889199776161164e-06,
"loss": 0.5042,
"step": 1847
},
{
"epoch": 2.787924528301887,
"grad_norm": 0.26060926813065055,
"learning_rate": 3.8612199216564075e-06,
"loss": 0.4873,
"step": 1848
},
{
"epoch": 2.789433962264151,
"grad_norm": 0.23848484427152364,
"learning_rate": 3.833240067151651e-06,
"loss": 0.5234,
"step": 1849
},
{
"epoch": 2.790943396226415,
"grad_norm": 0.250353041090764,
"learning_rate": 3.8052602126468946e-06,
"loss": 0.5231,
"step": 1850
},
{
"epoch": 2.7924528301886795,
"grad_norm": 0.24066679801616733,
"learning_rate": 3.777280358142138e-06,
"loss": 0.5212,
"step": 1851
},
{
"epoch": 2.7939622641509434,
"grad_norm": 0.2600941674746975,
"learning_rate": 3.7493005036373813e-06,
"loss": 0.5084,
"step": 1852
},
{
"epoch": 2.7954716981132073,
"grad_norm": 0.2510555438696239,
"learning_rate": 3.7213206491326243e-06,
"loss": 0.5511,
"step": 1853
},
{
"epoch": 2.7969811320754716,
"grad_norm": 0.24160412240772206,
"learning_rate": 3.6933407946278685e-06,
"loss": 0.5095,
"step": 1854
},
{
"epoch": 2.798490566037736,
"grad_norm": 0.2748568322627208,
"learning_rate": 3.6653609401231114e-06,
"loss": 0.5496,
"step": 1855
},
{
"epoch": 2.8,
"grad_norm": 0.2584358586285045,
"learning_rate": 3.637381085618355e-06,
"loss": 0.5463,
"step": 1856
},
{
"epoch": 2.801509433962264,
"grad_norm": 0.2498517036300968,
"learning_rate": 3.609401231113598e-06,
"loss": 0.5415,
"step": 1857
},
{
"epoch": 2.8030188679245285,
"grad_norm": 0.24524690085464163,
"learning_rate": 3.581421376608842e-06,
"loss": 0.5375,
"step": 1858
},
{
"epoch": 2.8045283018867924,
"grad_norm": 0.2769513128398129,
"learning_rate": 3.5534415221040853e-06,
"loss": 0.552,
"step": 1859
},
{
"epoch": 2.8060377358490567,
"grad_norm": 0.2506846446168906,
"learning_rate": 3.5254616675993287e-06,
"loss": 0.5459,
"step": 1860
},
{
"epoch": 2.807547169811321,
"grad_norm": 0.2554286512869799,
"learning_rate": 3.497481813094572e-06,
"loss": 0.4944,
"step": 1861
},
{
"epoch": 2.809056603773585,
"grad_norm": 0.23601391002584438,
"learning_rate": 3.469501958589815e-06,
"loss": 0.4922,
"step": 1862
},
{
"epoch": 2.810566037735849,
"grad_norm": 0.26302780246456897,
"learning_rate": 3.441522104085059e-06,
"loss": 0.5044,
"step": 1863
},
{
"epoch": 2.812075471698113,
"grad_norm": 0.2577873934898228,
"learning_rate": 3.4135422495803025e-06,
"loss": 0.5334,
"step": 1864
},
{
"epoch": 2.8135849056603774,
"grad_norm": 0.23924590337032847,
"learning_rate": 3.3855623950755455e-06,
"loss": 0.5229,
"step": 1865
},
{
"epoch": 2.8150943396226413,
"grad_norm": 0.24147365730794057,
"learning_rate": 3.357582540570789e-06,
"loss": 0.5122,
"step": 1866
},
{
"epoch": 2.8166037735849057,
"grad_norm": 0.2327138561482277,
"learning_rate": 3.329602686066033e-06,
"loss": 0.4637,
"step": 1867
},
{
"epoch": 2.81811320754717,
"grad_norm": 0.2750595632862016,
"learning_rate": 3.301622831561276e-06,
"loss": 0.4651,
"step": 1868
},
{
"epoch": 2.819622641509434,
"grad_norm": 0.25380188643044477,
"learning_rate": 3.2736429770565194e-06,
"loss": 0.5683,
"step": 1869
},
{
"epoch": 2.821132075471698,
"grad_norm": 0.24310252434490498,
"learning_rate": 3.2456631225517627e-06,
"loss": 0.5079,
"step": 1870
},
{
"epoch": 2.8226415094339625,
"grad_norm": 0.22968025157830738,
"learning_rate": 3.2176832680470065e-06,
"loss": 0.4756,
"step": 1871
},
{
"epoch": 2.8241509433962264,
"grad_norm": 0.23406333934374154,
"learning_rate": 3.18970341354225e-06,
"loss": 0.508,
"step": 1872
},
{
"epoch": 2.8256603773584903,
"grad_norm": 3.331648704661487,
"learning_rate": 3.1617235590374932e-06,
"loss": 0.5314,
"step": 1873
},
{
"epoch": 2.8271698113207546,
"grad_norm": 0.2695647874070789,
"learning_rate": 3.1337437045327366e-06,
"loss": 0.5471,
"step": 1874
},
{
"epoch": 2.828679245283019,
"grad_norm": 0.25565203431365335,
"learning_rate": 3.10576385002798e-06,
"loss": 0.4956,
"step": 1875
},
{
"epoch": 2.830188679245283,
"grad_norm": 0.24812316022229003,
"learning_rate": 3.0777839955232233e-06,
"loss": 0.5063,
"step": 1876
},
{
"epoch": 2.831698113207547,
"grad_norm": 0.2512898425536453,
"learning_rate": 3.0498041410184667e-06,
"loss": 0.5132,
"step": 1877
},
{
"epoch": 2.8332075471698115,
"grad_norm": 0.25905997230865613,
"learning_rate": 3.02182428651371e-06,
"loss": 0.536,
"step": 1878
},
{
"epoch": 2.8347169811320754,
"grad_norm": 0.2527084751717376,
"learning_rate": 2.993844432008954e-06,
"loss": 0.5058,
"step": 1879
},
{
"epoch": 2.8362264150943397,
"grad_norm": 0.25014337600940817,
"learning_rate": 2.965864577504197e-06,
"loss": 0.5141,
"step": 1880
},
{
"epoch": 2.837735849056604,
"grad_norm": 0.26771202447798464,
"learning_rate": 2.9378847229994406e-06,
"loss": 0.4997,
"step": 1881
},
{
"epoch": 2.839245283018868,
"grad_norm": 0.3542671459269332,
"learning_rate": 2.909904868494684e-06,
"loss": 0.5118,
"step": 1882
},
{
"epoch": 2.840754716981132,
"grad_norm": 0.29850083758472223,
"learning_rate": 2.8819250139899277e-06,
"loss": 0.5432,
"step": 1883
},
{
"epoch": 2.842264150943396,
"grad_norm": 0.23709430578888918,
"learning_rate": 2.8539451594851706e-06,
"loss": 0.5221,
"step": 1884
},
{
"epoch": 2.8437735849056605,
"grad_norm": 0.26493114426048814,
"learning_rate": 2.8259653049804144e-06,
"loss": 0.5103,
"step": 1885
},
{
"epoch": 2.8452830188679243,
"grad_norm": 0.23960026530705572,
"learning_rate": 2.797985450475658e-06,
"loss": 0.4965,
"step": 1886
},
{
"epoch": 2.8467924528301887,
"grad_norm": 0.251820373185035,
"learning_rate": 2.7700055959709007e-06,
"loss": 0.512,
"step": 1887
},
{
"epoch": 2.848301886792453,
"grad_norm": 0.2716135014490187,
"learning_rate": 2.7420257414661445e-06,
"loss": 0.5373,
"step": 1888
},
{
"epoch": 2.849811320754717,
"grad_norm": 0.3050812952561557,
"learning_rate": 2.714045886961388e-06,
"loss": 0.4955,
"step": 1889
},
{
"epoch": 2.851320754716981,
"grad_norm": 0.23801738006267384,
"learning_rate": 2.6860660324566312e-06,
"loss": 0.4919,
"step": 1890
},
{
"epoch": 2.8528301886792455,
"grad_norm": 0.23926038110207012,
"learning_rate": 2.6580861779518746e-06,
"loss": 0.486,
"step": 1891
},
{
"epoch": 2.8543396226415094,
"grad_norm": 0.24625086888055978,
"learning_rate": 2.6301063234471184e-06,
"loss": 0.563,
"step": 1892
},
{
"epoch": 2.8558490566037738,
"grad_norm": 0.25559463779086583,
"learning_rate": 2.6021264689423613e-06,
"loss": 0.4613,
"step": 1893
},
{
"epoch": 2.8573584905660376,
"grad_norm": 0.2437794999089875,
"learning_rate": 2.574146614437605e-06,
"loss": 0.5394,
"step": 1894
},
{
"epoch": 2.858867924528302,
"grad_norm": 0.24054866492710766,
"learning_rate": 2.5461667599328485e-06,
"loss": 0.5187,
"step": 1895
},
{
"epoch": 2.860377358490566,
"grad_norm": 0.24376318441166045,
"learning_rate": 2.518186905428092e-06,
"loss": 0.4887,
"step": 1896
},
{
"epoch": 2.86188679245283,
"grad_norm": 0.2410895191336604,
"learning_rate": 2.4902070509233352e-06,
"loss": 0.4956,
"step": 1897
},
{
"epoch": 2.8633962264150945,
"grad_norm": 0.2389979905294272,
"learning_rate": 2.462227196418579e-06,
"loss": 0.5055,
"step": 1898
},
{
"epoch": 2.8649056603773584,
"grad_norm": 0.24099313691698937,
"learning_rate": 2.4342473419138224e-06,
"loss": 0.4998,
"step": 1899
},
{
"epoch": 2.8664150943396227,
"grad_norm": 0.24527579020611284,
"learning_rate": 2.4062674874090653e-06,
"loss": 0.5264,
"step": 1900
},
{
"epoch": 2.867924528301887,
"grad_norm": 0.23933465605234516,
"learning_rate": 2.378287632904309e-06,
"loss": 0.4959,
"step": 1901
},
{
"epoch": 2.869433962264151,
"grad_norm": 0.24622168013990314,
"learning_rate": 2.3503077783995524e-06,
"loss": 0.5173,
"step": 1902
},
{
"epoch": 2.8709433962264153,
"grad_norm": 0.23919977406233273,
"learning_rate": 2.322327923894796e-06,
"loss": 0.4966,
"step": 1903
},
{
"epoch": 2.872452830188679,
"grad_norm": 0.261296807681531,
"learning_rate": 2.294348069390039e-06,
"loss": 0.5598,
"step": 1904
},
{
"epoch": 2.8739622641509435,
"grad_norm": 0.2799512370951401,
"learning_rate": 2.266368214885283e-06,
"loss": 0.4819,
"step": 1905
},
{
"epoch": 2.8754716981132074,
"grad_norm": 0.23214062234311064,
"learning_rate": 2.238388360380526e-06,
"loss": 0.5261,
"step": 1906
},
{
"epoch": 2.8769811320754717,
"grad_norm": 0.24329861142019843,
"learning_rate": 2.2104085058757697e-06,
"loss": 0.5273,
"step": 1907
},
{
"epoch": 2.878490566037736,
"grad_norm": 0.24170422215332774,
"learning_rate": 2.182428651371013e-06,
"loss": 0.4899,
"step": 1908
},
{
"epoch": 2.88,
"grad_norm": 0.21557021451090852,
"learning_rate": 2.1544487968662564e-06,
"loss": 0.4499,
"step": 1909
},
{
"epoch": 2.881509433962264,
"grad_norm": 0.24761336600665296,
"learning_rate": 2.1264689423614998e-06,
"loss": 0.5349,
"step": 1910
},
{
"epoch": 2.8830188679245285,
"grad_norm": 0.28587028512046125,
"learning_rate": 2.0984890878567436e-06,
"loss": 0.4769,
"step": 1911
},
{
"epoch": 2.8845283018867924,
"grad_norm": 0.23215345407826837,
"learning_rate": 2.0705092333519865e-06,
"loss": 0.5057,
"step": 1912
},
{
"epoch": 2.8860377358490568,
"grad_norm": 0.23557540955317524,
"learning_rate": 2.04252937884723e-06,
"loss": 0.4983,
"step": 1913
},
{
"epoch": 2.8875471698113206,
"grad_norm": 0.24826597774453552,
"learning_rate": 2.0145495243424736e-06,
"loss": 0.506,
"step": 1914
},
{
"epoch": 2.889056603773585,
"grad_norm": 0.27097286067099186,
"learning_rate": 1.986569669837717e-06,
"loss": 0.4824,
"step": 1915
},
{
"epoch": 2.890566037735849,
"grad_norm": 0.2323466023436389,
"learning_rate": 1.9585898153329604e-06,
"loss": 0.49,
"step": 1916
},
{
"epoch": 2.892075471698113,
"grad_norm": 0.24530440733332493,
"learning_rate": 1.9306099608282037e-06,
"loss": 0.5128,
"step": 1917
},
{
"epoch": 2.8935849056603775,
"grad_norm": 0.2217381642822851,
"learning_rate": 1.9026301063234473e-06,
"loss": 0.4941,
"step": 1918
},
{
"epoch": 2.8950943396226414,
"grad_norm": 0.3033421523357643,
"learning_rate": 1.8746502518186907e-06,
"loss": 0.5488,
"step": 1919
},
{
"epoch": 2.8966037735849057,
"grad_norm": 0.22952196943803127,
"learning_rate": 1.8466703973139342e-06,
"loss": 0.4873,
"step": 1920
},
{
"epoch": 2.89811320754717,
"grad_norm": 0.24646380697980175,
"learning_rate": 1.8186905428091774e-06,
"loss": 0.5466,
"step": 1921
},
{
"epoch": 2.899622641509434,
"grad_norm": 0.2931648546342374,
"learning_rate": 1.790710688304421e-06,
"loss": 0.494,
"step": 1922
},
{
"epoch": 2.9011320754716983,
"grad_norm": 0.6019713876730823,
"learning_rate": 1.7627308337996643e-06,
"loss": 0.5361,
"step": 1923
},
{
"epoch": 2.902641509433962,
"grad_norm": 0.2643446098419106,
"learning_rate": 1.7347509792949075e-06,
"loss": 0.51,
"step": 1924
},
{
"epoch": 2.9041509433962265,
"grad_norm": 0.24476304649977285,
"learning_rate": 1.7067711247901513e-06,
"loss": 0.5391,
"step": 1925
},
{
"epoch": 2.9056603773584904,
"grad_norm": 0.24501363596775003,
"learning_rate": 1.6787912702853944e-06,
"loss": 0.5126,
"step": 1926
},
{
"epoch": 2.9071698113207547,
"grad_norm": 0.23738294037707477,
"learning_rate": 1.650811415780638e-06,
"loss": 0.4981,
"step": 1927
},
{
"epoch": 2.908679245283019,
"grad_norm": 0.2353858028048875,
"learning_rate": 1.6228315612758814e-06,
"loss": 0.5393,
"step": 1928
},
{
"epoch": 2.910188679245283,
"grad_norm": 0.24557576052226965,
"learning_rate": 1.594851706771125e-06,
"loss": 0.5146,
"step": 1929
},
{
"epoch": 2.9116981132075472,
"grad_norm": 0.23598740859198736,
"learning_rate": 1.5668718522663683e-06,
"loss": 0.5069,
"step": 1930
},
{
"epoch": 2.9132075471698116,
"grad_norm": 0.23965223087346654,
"learning_rate": 1.5388919977616117e-06,
"loss": 0.5293,
"step": 1931
},
{
"epoch": 2.9147169811320754,
"grad_norm": 0.23578049631296016,
"learning_rate": 1.510912143256855e-06,
"loss": 0.5237,
"step": 1932
},
{
"epoch": 2.9162264150943398,
"grad_norm": 0.2512094274756097,
"learning_rate": 1.4829322887520986e-06,
"loss": 0.5305,
"step": 1933
},
{
"epoch": 2.9177358490566037,
"grad_norm": 0.2419640704406858,
"learning_rate": 1.454952434247342e-06,
"loss": 0.5019,
"step": 1934
},
{
"epoch": 2.919245283018868,
"grad_norm": 0.2341231605907135,
"learning_rate": 1.4269725797425853e-06,
"loss": 0.493,
"step": 1935
},
{
"epoch": 2.920754716981132,
"grad_norm": 0.22368464083267006,
"learning_rate": 1.398992725237829e-06,
"loss": 0.479,
"step": 1936
},
{
"epoch": 2.922264150943396,
"grad_norm": 0.23934904699725973,
"learning_rate": 1.3710128707330723e-06,
"loss": 0.5182,
"step": 1937
},
{
"epoch": 2.9237735849056605,
"grad_norm": 0.2423304768056013,
"learning_rate": 1.3430330162283156e-06,
"loss": 0.5287,
"step": 1938
},
{
"epoch": 2.9252830188679244,
"grad_norm": 0.24220829389874426,
"learning_rate": 1.3150531617235592e-06,
"loss": 0.5131,
"step": 1939
},
{
"epoch": 2.9267924528301887,
"grad_norm": 0.2408649756673509,
"learning_rate": 1.2870733072188026e-06,
"loss": 0.5256,
"step": 1940
},
{
"epoch": 2.928301886792453,
"grad_norm": 0.26970041615884593,
"learning_rate": 1.259093452714046e-06,
"loss": 0.5247,
"step": 1941
},
{
"epoch": 2.929811320754717,
"grad_norm": 0.2462934836700248,
"learning_rate": 1.2311135982092895e-06,
"loss": 0.5274,
"step": 1942
},
{
"epoch": 2.9313207547169813,
"grad_norm": 0.24346784211748765,
"learning_rate": 1.2031337437045327e-06,
"loss": 0.5353,
"step": 1943
},
{
"epoch": 2.932830188679245,
"grad_norm": 0.2308252685210892,
"learning_rate": 1.1751538891997762e-06,
"loss": 0.4403,
"step": 1944
},
{
"epoch": 2.9343396226415095,
"grad_norm": 0.24611093220055966,
"learning_rate": 1.1471740346950196e-06,
"loss": 0.4911,
"step": 1945
},
{
"epoch": 2.9358490566037734,
"grad_norm": 0.23677755966512776,
"learning_rate": 1.119194180190263e-06,
"loss": 0.4889,
"step": 1946
},
{
"epoch": 2.9373584905660377,
"grad_norm": 0.2390396184181226,
"learning_rate": 1.0912143256855065e-06,
"loss": 0.4938,
"step": 1947
},
{
"epoch": 2.938867924528302,
"grad_norm": 0.2298114470810896,
"learning_rate": 1.0632344711807499e-06,
"loss": 0.5164,
"step": 1948
},
{
"epoch": 2.940377358490566,
"grad_norm": 0.23867767626825967,
"learning_rate": 1.0352546166759932e-06,
"loss": 0.5339,
"step": 1949
},
{
"epoch": 2.9418867924528302,
"grad_norm": 0.2273239656737244,
"learning_rate": 1.0072747621712368e-06,
"loss": 0.4873,
"step": 1950
},
{
"epoch": 2.9433962264150946,
"grad_norm": 0.2510529360776652,
"learning_rate": 9.792949076664802e-07,
"loss": 0.541,
"step": 1951
},
{
"epoch": 2.9449056603773585,
"grad_norm": 0.2420207367421682,
"learning_rate": 9.513150531617237e-07,
"loss": 0.5357,
"step": 1952
},
{
"epoch": 2.946415094339623,
"grad_norm": 0.23866840294873037,
"learning_rate": 9.233351986569671e-07,
"loss": 0.516,
"step": 1953
},
{
"epoch": 2.9479245283018867,
"grad_norm": 0.23833048632270618,
"learning_rate": 8.953553441522105e-07,
"loss": 0.5001,
"step": 1954
},
{
"epoch": 2.949433962264151,
"grad_norm": 0.23240358196803712,
"learning_rate": 8.673754896474537e-07,
"loss": 0.5263,
"step": 1955
},
{
"epoch": 2.950943396226415,
"grad_norm": 0.27536994744227083,
"learning_rate": 8.393956351426972e-07,
"loss": 0.4919,
"step": 1956
},
{
"epoch": 2.952452830188679,
"grad_norm": 0.23738342408866947,
"learning_rate": 8.114157806379407e-07,
"loss": 0.4924,
"step": 1957
},
{
"epoch": 2.9539622641509435,
"grad_norm": 0.2227032657641613,
"learning_rate": 7.834359261331841e-07,
"loss": 0.4912,
"step": 1958
},
{
"epoch": 2.9554716981132074,
"grad_norm": 0.23463528292989802,
"learning_rate": 7.554560716284275e-07,
"loss": 0.4838,
"step": 1959
},
{
"epoch": 2.9569811320754718,
"grad_norm": 0.2351782144870339,
"learning_rate": 7.27476217123671e-07,
"loss": 0.5385,
"step": 1960
},
{
"epoch": 2.958490566037736,
"grad_norm": 0.30869561893156994,
"learning_rate": 6.994963626189144e-07,
"loss": 0.5375,
"step": 1961
},
{
"epoch": 2.96,
"grad_norm": 0.2507617576529779,
"learning_rate": 6.715165081141578e-07,
"loss": 0.5086,
"step": 1962
},
{
"epoch": 2.9615094339622643,
"grad_norm": 0.26263822536508746,
"learning_rate": 6.435366536094013e-07,
"loss": 0.5411,
"step": 1963
},
{
"epoch": 2.963018867924528,
"grad_norm": 0.22776692640780824,
"learning_rate": 6.155567991046447e-07,
"loss": 0.4882,
"step": 1964
},
{
"epoch": 2.9645283018867925,
"grad_norm": 0.23187613900036108,
"learning_rate": 5.875769445998881e-07,
"loss": 0.5031,
"step": 1965
},
{
"epoch": 2.9660377358490564,
"grad_norm": 0.23980976100159304,
"learning_rate": 5.595970900951315e-07,
"loss": 0.5111,
"step": 1966
},
{
"epoch": 2.9675471698113207,
"grad_norm": 0.23454625576676985,
"learning_rate": 5.316172355903749e-07,
"loss": 0.4886,
"step": 1967
},
{
"epoch": 2.969056603773585,
"grad_norm": 0.22466469775472325,
"learning_rate": 5.036373810856184e-07,
"loss": 0.4927,
"step": 1968
},
{
"epoch": 2.970566037735849,
"grad_norm": 0.2424670877215881,
"learning_rate": 4.7565752658086183e-07,
"loss": 0.5188,
"step": 1969
},
{
"epoch": 2.9720754716981133,
"grad_norm": 0.2634277432088806,
"learning_rate": 4.4767767207610524e-07,
"loss": 0.4995,
"step": 1970
},
{
"epoch": 2.9735849056603776,
"grad_norm": 0.24871804164038816,
"learning_rate": 4.196978175713486e-07,
"loss": 0.4975,
"step": 1971
},
{
"epoch": 2.9750943396226415,
"grad_norm": 0.22918011887355255,
"learning_rate": 3.917179630665921e-07,
"loss": 0.5146,
"step": 1972
},
{
"epoch": 2.976603773584906,
"grad_norm": 0.24494562015577118,
"learning_rate": 3.637381085618355e-07,
"loss": 0.5191,
"step": 1973
},
{
"epoch": 2.9781132075471697,
"grad_norm": 0.24554508049209534,
"learning_rate": 3.357582540570789e-07,
"loss": 0.5134,
"step": 1974
},
{
"epoch": 2.979622641509434,
"grad_norm": 0.24686957526571518,
"learning_rate": 3.077783995523224e-07,
"loss": 0.4754,
"step": 1975
},
{
"epoch": 2.981132075471698,
"grad_norm": 0.2309101984662458,
"learning_rate": 2.7979854504756574e-07,
"loss": 0.4805,
"step": 1976
},
{
"epoch": 2.9826415094339622,
"grad_norm": 0.23759627074403436,
"learning_rate": 2.518186905428092e-07,
"loss": 0.5359,
"step": 1977
},
{
"epoch": 2.9841509433962266,
"grad_norm": 0.25434227095914247,
"learning_rate": 2.2383883603805262e-07,
"loss": 0.5258,
"step": 1978
},
{
"epoch": 2.9856603773584904,
"grad_norm": 0.24774479007599787,
"learning_rate": 1.9585898153329604e-07,
"loss": 0.5029,
"step": 1979
},
{
"epoch": 2.9871698113207548,
"grad_norm": 0.2562930857219345,
"learning_rate": 1.6787912702853945e-07,
"loss": 0.5213,
"step": 1980
},
{
"epoch": 2.988679245283019,
"grad_norm": 0.24488783175852774,
"learning_rate": 1.3989927252378287e-07,
"loss": 0.5062,
"step": 1981
},
{
"epoch": 2.990188679245283,
"grad_norm": 0.24061088726491453,
"learning_rate": 1.1191941801902631e-07,
"loss": 0.5221,
"step": 1982
},
{
"epoch": 2.9916981132075473,
"grad_norm": 0.23745813306754143,
"learning_rate": 8.393956351426973e-08,
"loss": 0.4621,
"step": 1983
},
{
"epoch": 2.993207547169811,
"grad_norm": 0.24766707785226677,
"learning_rate": 5.5959709009513155e-08,
"loss": 0.5439,
"step": 1984
},
{
"epoch": 2.9947169811320755,
"grad_norm": 0.22755699352891728,
"learning_rate": 2.7979854504756578e-08,
"loss": 0.4968,
"step": 1985
},
{
"epoch": 2.9962264150943394,
"grad_norm": 0.2639171679918981,
"learning_rate": 0.0,
"loss": 0.5176,
"step": 1986
},
{
"epoch": 2.9962264150943394,
"step": 1986,
"total_flos": 1.6837767016679997e+18,
"train_loss": 0.7350575219978619,
"train_runtime": 115424.0134,
"train_samples_per_second": 0.275,
"train_steps_per_second": 0.017
}
],
"logging_steps": 1,
"max_steps": 1986,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.6837767016679997e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}