rulins's picture
Upload folder using huggingface_hub
a03f093 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 13.0,
"eval_steps": 500,
"global_step": 1807,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.03597122302158273,
"grad_norm": 2.5657113026110396,
"learning_rate": 2.2099447513812156e-07,
"loss": 1.0307,
"step": 5
},
{
"epoch": 0.07194244604316546,
"grad_norm": 2.4709120913894567,
"learning_rate": 4.972375690607735e-07,
"loss": 0.999,
"step": 10
},
{
"epoch": 0.1079136690647482,
"grad_norm": 2.4404034208735927,
"learning_rate": 7.734806629834254e-07,
"loss": 0.9694,
"step": 15
},
{
"epoch": 0.14388489208633093,
"grad_norm": 2.441872944336153,
"learning_rate": 1.0497237569060774e-06,
"loss": 1.0269,
"step": 20
},
{
"epoch": 0.17985611510791366,
"grad_norm": 1.6991451599323688,
"learning_rate": 1.3259668508287293e-06,
"loss": 0.957,
"step": 25
},
{
"epoch": 0.2158273381294964,
"grad_norm": 1.3307484799446745,
"learning_rate": 1.6022099447513815e-06,
"loss": 0.9355,
"step": 30
},
{
"epoch": 0.2517985611510791,
"grad_norm": 1.2496799611826757,
"learning_rate": 1.8784530386740332e-06,
"loss": 0.9666,
"step": 35
},
{
"epoch": 0.28776978417266186,
"grad_norm": 1.1356552936883524,
"learning_rate": 2.1546961325966854e-06,
"loss": 0.921,
"step": 40
},
{
"epoch": 0.3237410071942446,
"grad_norm": 1.1527057549470892,
"learning_rate": 2.430939226519337e-06,
"loss": 0.852,
"step": 45
},
{
"epoch": 0.3597122302158273,
"grad_norm": 1.0249903652912056,
"learning_rate": 2.707182320441989e-06,
"loss": 0.8769,
"step": 50
},
{
"epoch": 0.39568345323741005,
"grad_norm": 1.1043714979095989,
"learning_rate": 2.983425414364641e-06,
"loss": 0.8719,
"step": 55
},
{
"epoch": 0.4316546762589928,
"grad_norm": 1.0874576236402984,
"learning_rate": 3.2596685082872933e-06,
"loss": 0.8416,
"step": 60
},
{
"epoch": 0.4676258992805755,
"grad_norm": 1.127326980338214,
"learning_rate": 3.535911602209945e-06,
"loss": 0.8216,
"step": 65
},
{
"epoch": 0.5035971223021583,
"grad_norm": 0.9717363036593373,
"learning_rate": 3.812154696132597e-06,
"loss": 0.8,
"step": 70
},
{
"epoch": 0.539568345323741,
"grad_norm": 1.0927788503087883,
"learning_rate": 4.088397790055249e-06,
"loss": 0.8177,
"step": 75
},
{
"epoch": 0.5755395683453237,
"grad_norm": 1.0523015167479852,
"learning_rate": 4.364640883977901e-06,
"loss": 0.8068,
"step": 80
},
{
"epoch": 0.6115107913669064,
"grad_norm": 1.0693768198193012,
"learning_rate": 4.640883977900552e-06,
"loss": 0.7852,
"step": 85
},
{
"epoch": 0.6474820143884892,
"grad_norm": 1.046777484175068,
"learning_rate": 4.9171270718232054e-06,
"loss": 0.7906,
"step": 90
},
{
"epoch": 0.6834532374100719,
"grad_norm": 1.0197896252518375,
"learning_rate": 5.193370165745857e-06,
"loss": 0.7802,
"step": 95
},
{
"epoch": 0.7194244604316546,
"grad_norm": 1.0435304344679766,
"learning_rate": 5.469613259668509e-06,
"loss": 0.7939,
"step": 100
},
{
"epoch": 0.7553956834532374,
"grad_norm": 1.1192401846344067,
"learning_rate": 5.74585635359116e-06,
"loss": 0.7666,
"step": 105
},
{
"epoch": 0.7913669064748201,
"grad_norm": 1.0649149685291524,
"learning_rate": 6.0220994475138124e-06,
"loss": 0.7701,
"step": 110
},
{
"epoch": 0.8273381294964028,
"grad_norm": 0.9973750867415139,
"learning_rate": 6.298342541436464e-06,
"loss": 0.7771,
"step": 115
},
{
"epoch": 0.8633093525179856,
"grad_norm": 0.9416210367748032,
"learning_rate": 6.574585635359117e-06,
"loss": 0.7638,
"step": 120
},
{
"epoch": 0.8992805755395683,
"grad_norm": 0.8869763185560768,
"learning_rate": 6.850828729281769e-06,
"loss": 0.7391,
"step": 125
},
{
"epoch": 0.935251798561151,
"grad_norm": 0.9496059833390906,
"learning_rate": 7.12707182320442e-06,
"loss": 0.7351,
"step": 130
},
{
"epoch": 0.9712230215827338,
"grad_norm": 0.9400545014058165,
"learning_rate": 7.4033149171270724e-06,
"loss": 0.7645,
"step": 135
},
{
"epoch": 1.0071942446043165,
"grad_norm": 0.839356585689632,
"learning_rate": 7.679558011049725e-06,
"loss": 0.7526,
"step": 140
},
{
"epoch": 1.0431654676258992,
"grad_norm": 0.808772401804446,
"learning_rate": 7.955801104972377e-06,
"loss": 0.7108,
"step": 145
},
{
"epoch": 1.079136690647482,
"grad_norm": 0.9195777885535735,
"learning_rate": 8.232044198895029e-06,
"loss": 0.7375,
"step": 150
},
{
"epoch": 1.1151079136690647,
"grad_norm": 0.80938147816933,
"learning_rate": 8.508287292817681e-06,
"loss": 0.7267,
"step": 155
},
{
"epoch": 1.1510791366906474,
"grad_norm": 0.8267082776521085,
"learning_rate": 8.784530386740332e-06,
"loss": 0.6438,
"step": 160
},
{
"epoch": 1.1870503597122302,
"grad_norm": 0.8810718593300568,
"learning_rate": 9.060773480662984e-06,
"loss": 0.6825,
"step": 165
},
{
"epoch": 1.223021582733813,
"grad_norm": 0.8978348613093966,
"learning_rate": 9.337016574585636e-06,
"loss": 0.6717,
"step": 170
},
{
"epoch": 1.2589928057553956,
"grad_norm": 1.652966094402784,
"learning_rate": 9.613259668508288e-06,
"loss": 0.6801,
"step": 175
},
{
"epoch": 1.2949640287769784,
"grad_norm": 0.8344736270472325,
"learning_rate": 9.88950276243094e-06,
"loss": 0.6605,
"step": 180
},
{
"epoch": 1.330935251798561,
"grad_norm": 0.7122691201582964,
"learning_rate": 9.999916007605012e-06,
"loss": 0.6418,
"step": 185
},
{
"epoch": 1.3669064748201438,
"grad_norm": 0.8256873807773062,
"learning_rate": 9.999402730965894e-06,
"loss": 0.6919,
"step": 190
},
{
"epoch": 1.4028776978417266,
"grad_norm": 1.007866860643835,
"learning_rate": 9.9984228879725e-06,
"loss": 0.6635,
"step": 195
},
{
"epoch": 1.4388489208633093,
"grad_norm": 0.9234823911775631,
"learning_rate": 9.99697657006811e-06,
"loss": 0.6714,
"step": 200
},
{
"epoch": 1.474820143884892,
"grad_norm": 0.8634620569129478,
"learning_rate": 9.995063912229499e-06,
"loss": 0.6859,
"step": 205
},
{
"epoch": 1.5107913669064748,
"grad_norm": 0.9085461363247348,
"learning_rate": 9.992685092954347e-06,
"loss": 0.6493,
"step": 210
},
{
"epoch": 1.5467625899280577,
"grad_norm": 0.8821359520671455,
"learning_rate": 9.989840334244583e-06,
"loss": 0.6507,
"step": 215
},
{
"epoch": 1.5827338129496402,
"grad_norm": 0.8741181648998291,
"learning_rate": 9.98652990158566e-06,
"loss": 0.6321,
"step": 220
},
{
"epoch": 1.6187050359712232,
"grad_norm": 0.8647283657532671,
"learning_rate": 9.98275410392178e-06,
"loss": 0.623,
"step": 225
},
{
"epoch": 1.6546762589928057,
"grad_norm": 0.8020886077383842,
"learning_rate": 9.978513293627068e-06,
"loss": 0.652,
"step": 230
},
{
"epoch": 1.6906474820143886,
"grad_norm": 0.9087355156431727,
"learning_rate": 9.973807866472679e-06,
"loss": 0.6684,
"step": 235
},
{
"epoch": 1.7266187050359711,
"grad_norm": 0.9656812695394805,
"learning_rate": 9.968638261589866e-06,
"loss": 0.6981,
"step": 240
},
{
"epoch": 1.762589928057554,
"grad_norm": 0.825927566467652,
"learning_rate": 9.963004961429004e-06,
"loss": 0.6539,
"step": 245
},
{
"epoch": 1.7985611510791366,
"grad_norm": 0.8013671458538759,
"learning_rate": 9.956908491714552e-06,
"loss": 0.6622,
"step": 250
},
{
"epoch": 1.8345323741007196,
"grad_norm": 0.8459870643511322,
"learning_rate": 9.950349421396004e-06,
"loss": 0.666,
"step": 255
},
{
"epoch": 1.870503597122302,
"grad_norm": 0.8165693120838837,
"learning_rate": 9.943328362594788e-06,
"loss": 0.6513,
"step": 260
},
{
"epoch": 1.906474820143885,
"grad_norm": 0.7543912284760205,
"learning_rate": 9.935845970547133e-06,
"loss": 0.6335,
"step": 265
},
{
"epoch": 1.9424460431654675,
"grad_norm": 0.8724447270007475,
"learning_rate": 9.927902943542932e-06,
"loss": 0.6575,
"step": 270
},
{
"epoch": 1.9784172661870505,
"grad_norm": 0.8973865160814858,
"learning_rate": 9.919500022860559e-06,
"loss": 0.6434,
"step": 275
},
{
"epoch": 2.014388489208633,
"grad_norm": 0.8246254108343064,
"learning_rate": 9.910637992697707e-06,
"loss": 0.6219,
"step": 280
},
{
"epoch": 2.050359712230216,
"grad_norm": 0.9316036149555057,
"learning_rate": 9.901317680098187e-06,
"loss": 0.552,
"step": 285
},
{
"epoch": 2.0863309352517985,
"grad_norm": 0.7747424028512833,
"learning_rate": 9.891539954874758e-06,
"loss": 0.5254,
"step": 290
},
{
"epoch": 2.1223021582733814,
"grad_norm": 0.765023536746088,
"learning_rate": 9.881305729527944e-06,
"loss": 0.5417,
"step": 295
},
{
"epoch": 2.158273381294964,
"grad_norm": 0.7589194364572505,
"learning_rate": 9.870615959160876e-06,
"loss": 0.5683,
"step": 300
},
{
"epoch": 2.194244604316547,
"grad_norm": 0.6742604327945994,
"learning_rate": 9.859471641390161e-06,
"loss": 0.5241,
"step": 305
},
{
"epoch": 2.2302158273381294,
"grad_norm": 0.8215048068090698,
"learning_rate": 9.84787381625278e-06,
"loss": 0.5223,
"step": 310
},
{
"epoch": 2.2661870503597124,
"grad_norm": 0.7846331525765837,
"learning_rate": 9.83582356610902e-06,
"loss": 0.5091,
"step": 315
},
{
"epoch": 2.302158273381295,
"grad_norm": 0.839958631503289,
"learning_rate": 9.823322015541474e-06,
"loss": 0.5768,
"step": 320
},
{
"epoch": 2.338129496402878,
"grad_norm": 0.7523314646946071,
"learning_rate": 9.810370331250082e-06,
"loss": 0.534,
"step": 325
},
{
"epoch": 2.3741007194244603,
"grad_norm": 0.8094520756368656,
"learning_rate": 9.796969721943257e-06,
"loss": 0.5416,
"step": 330
},
{
"epoch": 2.4100719424460433,
"grad_norm": 0.8079898872263164,
"learning_rate": 9.783121438225069e-06,
"loss": 0.5407,
"step": 335
},
{
"epoch": 2.446043165467626,
"grad_norm": 0.7534024146698997,
"learning_rate": 9.76882677247855e-06,
"loss": 0.5129,
"step": 340
},
{
"epoch": 2.4820143884892087,
"grad_norm": 0.6770277707875069,
"learning_rate": 9.754087058745074e-06,
"loss": 0.5538,
"step": 345
},
{
"epoch": 2.5179856115107913,
"grad_norm": 0.7346186223847041,
"learning_rate": 9.738903672599858e-06,
"loss": 0.547,
"step": 350
},
{
"epoch": 2.553956834532374,
"grad_norm": 0.8072986015854965,
"learning_rate": 9.723278031023587e-06,
"loss": 0.5206,
"step": 355
},
{
"epoch": 2.5899280575539567,
"grad_norm": 0.747891183883854,
"learning_rate": 9.707211592270183e-06,
"loss": 0.5204,
"step": 360
},
{
"epoch": 2.6258992805755397,
"grad_norm": 0.7849096675174403,
"learning_rate": 9.690705855730704e-06,
"loss": 0.5519,
"step": 365
},
{
"epoch": 2.661870503597122,
"grad_norm": 0.8069083086893393,
"learning_rate": 9.673762361793418e-06,
"loss": 0.5368,
"step": 370
},
{
"epoch": 2.697841726618705,
"grad_norm": 0.740470534449088,
"learning_rate": 9.656382691700053e-06,
"loss": 0.5352,
"step": 375
},
{
"epoch": 2.7338129496402876,
"grad_norm": 0.837590597245089,
"learning_rate": 9.638568467398215e-06,
"loss": 0.5254,
"step": 380
},
{
"epoch": 2.7697841726618706,
"grad_norm": 0.8025317262533782,
"learning_rate": 9.620321351390037e-06,
"loss": 0.5187,
"step": 385
},
{
"epoch": 2.805755395683453,
"grad_norm": 0.7569128415023125,
"learning_rate": 9.601643046577014e-06,
"loss": 0.5164,
"step": 390
},
{
"epoch": 2.841726618705036,
"grad_norm": 0.7785055236138303,
"learning_rate": 9.582535296101088e-06,
"loss": 0.4887,
"step": 395
},
{
"epoch": 2.8776978417266186,
"grad_norm": 0.7719458883382703,
"learning_rate": 9.562999883181968e-06,
"loss": 0.5173,
"step": 400
},
{
"epoch": 2.9136690647482015,
"grad_norm": 0.8209235363580544,
"learning_rate": 9.543038630950706e-06,
"loss": 0.511,
"step": 405
},
{
"epoch": 2.949640287769784,
"grad_norm": 0.7253247761806964,
"learning_rate": 9.52265340227957e-06,
"loss": 0.5388,
"step": 410
},
{
"epoch": 2.985611510791367,
"grad_norm": 0.8010150911294615,
"learning_rate": 9.501846099608178e-06,
"loss": 0.5279,
"step": 415
},
{
"epoch": 3.0215827338129495,
"grad_norm": 0.7799763170983607,
"learning_rate": 9.480618664765956e-06,
"loss": 0.444,
"step": 420
},
{
"epoch": 3.0575539568345325,
"grad_norm": 0.8473689333796085,
"learning_rate": 9.458973078790925e-06,
"loss": 0.4316,
"step": 425
},
{
"epoch": 3.093525179856115,
"grad_norm": 0.7435775172722132,
"learning_rate": 9.436911361744817e-06,
"loss": 0.3797,
"step": 430
},
{
"epoch": 3.129496402877698,
"grad_norm": 0.8351745729159173,
"learning_rate": 9.414435572524551e-06,
"loss": 0.3939,
"step": 435
},
{
"epoch": 3.1654676258992804,
"grad_norm": 0.7390963756725633,
"learning_rate": 9.391547808670097e-06,
"loss": 0.3908,
"step": 440
},
{
"epoch": 3.2014388489208634,
"grad_norm": 0.750139478955199,
"learning_rate": 9.368250206168712e-06,
"loss": 0.3841,
"step": 445
},
{
"epoch": 3.237410071942446,
"grad_norm": 0.8335176332743932,
"learning_rate": 9.344544939255608e-06,
"loss": 0.4053,
"step": 450
},
{
"epoch": 3.273381294964029,
"grad_norm": 0.9346298960554003,
"learning_rate": 9.320434220211046e-06,
"loss": 0.4098,
"step": 455
},
{
"epoch": 3.3093525179856114,
"grad_norm": 0.82446257107031,
"learning_rate": 9.295920299153863e-06,
"loss": 0.4098,
"step": 460
},
{
"epoch": 3.3453237410071943,
"grad_norm": 0.8355789027376981,
"learning_rate": 9.2710054638315e-06,
"loss": 0.4131,
"step": 465
},
{
"epoch": 3.381294964028777,
"grad_norm": 0.7578251758484424,
"learning_rate": 9.24569203940648e-06,
"loss": 0.4199,
"step": 470
},
{
"epoch": 3.41726618705036,
"grad_norm": 2.2996955508669257,
"learning_rate": 9.219982388239426e-06,
"loss": 0.4351,
"step": 475
},
{
"epoch": 3.4532374100719423,
"grad_norm": 0.7779867034535775,
"learning_rate": 9.193878909668591e-06,
"loss": 0.407,
"step": 480
},
{
"epoch": 3.4892086330935252,
"grad_norm": 0.7862144605775059,
"learning_rate": 9.167384039785943e-06,
"loss": 0.3808,
"step": 485
},
{
"epoch": 3.5251798561151078,
"grad_norm": 0.7666253828866978,
"learning_rate": 9.140500251209813e-06,
"loss": 0.3926,
"step": 490
},
{
"epoch": 3.5611510791366907,
"grad_norm": 0.7839175938965846,
"learning_rate": 9.113230052854148e-06,
"loss": 0.4075,
"step": 495
},
{
"epoch": 3.597122302158273,
"grad_norm": 0.7521082431006533,
"learning_rate": 9.085575989694358e-06,
"loss": 0.4119,
"step": 500
},
{
"epoch": 3.633093525179856,
"grad_norm": 0.7916504768929141,
"learning_rate": 9.057540642529816e-06,
"loss": 0.4028,
"step": 505
},
{
"epoch": 3.6690647482014387,
"grad_norm": 0.8092849334210818,
"learning_rate": 9.029126627743003e-06,
"loss": 0.4045,
"step": 510
},
{
"epoch": 3.7050359712230216,
"grad_norm": 0.8376631711045777,
"learning_rate": 9.000336597055335e-06,
"loss": 0.4067,
"step": 515
},
{
"epoch": 3.741007194244604,
"grad_norm": 0.7860913819423667,
"learning_rate": 8.971173237279693e-06,
"loss": 0.3975,
"step": 520
},
{
"epoch": 3.776978417266187,
"grad_norm": 0.7642723973824054,
"learning_rate": 8.941639270069678e-06,
"loss": 0.3805,
"step": 525
},
{
"epoch": 3.81294964028777,
"grad_norm": 0.7998252232015214,
"learning_rate": 8.911737451665616e-06,
"loss": 0.3919,
"step": 530
},
{
"epoch": 3.8489208633093526,
"grad_norm": 0.8048664536386205,
"learning_rate": 8.881470572637331e-06,
"loss": 0.4037,
"step": 535
},
{
"epoch": 3.884892086330935,
"grad_norm": 0.7500962887822149,
"learning_rate": 8.85084145762372e-06,
"loss": 0.399,
"step": 540
},
{
"epoch": 3.920863309352518,
"grad_norm": 0.9025166412441733,
"learning_rate": 8.819852965069135e-06,
"loss": 0.3896,
"step": 545
},
{
"epoch": 3.956834532374101,
"grad_norm": 0.8301594036050014,
"learning_rate": 8.788507986956639e-06,
"loss": 0.4238,
"step": 550
},
{
"epoch": 3.9928057553956835,
"grad_norm": 0.787139826401505,
"learning_rate": 8.756809448538091e-06,
"loss": 0.4173,
"step": 555
},
{
"epoch": 4.028776978417266,
"grad_norm": 0.7382914056190027,
"learning_rate": 8.724760308061172e-06,
"loss": 0.2962,
"step": 560
},
{
"epoch": 4.0647482014388485,
"grad_norm": 0.8624964895203346,
"learning_rate": 8.692363556493288e-06,
"loss": 0.2789,
"step": 565
},
{
"epoch": 4.100719424460432,
"grad_norm": 0.8180011592364103,
"learning_rate": 8.65962221724245e-06,
"loss": 0.2595,
"step": 570
},
{
"epoch": 4.136690647482014,
"grad_norm": 0.9258676683243418,
"learning_rate": 8.626539345875114e-06,
"loss": 0.2799,
"step": 575
},
{
"epoch": 4.172661870503597,
"grad_norm": 0.7487403598173822,
"learning_rate": 8.593118029831025e-06,
"loss": 0.271,
"step": 580
},
{
"epoch": 4.2086330935251794,
"grad_norm": 0.8833477103510577,
"learning_rate": 8.559361388135079e-06,
"loss": 0.282,
"step": 585
},
{
"epoch": 4.244604316546763,
"grad_norm": 0.7782248198105488,
"learning_rate": 8.525272571106242e-06,
"loss": 0.289,
"step": 590
},
{
"epoch": 4.280575539568345,
"grad_norm": 0.8034461020363071,
"learning_rate": 8.490854760063551e-06,
"loss": 0.2616,
"step": 595
},
{
"epoch": 4.316546762589928,
"grad_norm": 0.793816496134317,
"learning_rate": 8.456111167029219e-06,
"loss": 0.2813,
"step": 600
},
{
"epoch": 4.35251798561151,
"grad_norm": 0.8190581140556358,
"learning_rate": 8.421045034428871e-06,
"loss": 0.3232,
"step": 605
},
{
"epoch": 4.388489208633094,
"grad_norm": 0.8718874337244386,
"learning_rate": 8.385659634788959e-06,
"loss": 0.2989,
"step": 610
},
{
"epoch": 4.424460431654676,
"grad_norm": 0.7812963827584564,
"learning_rate": 8.349958270431331e-06,
"loss": 0.252,
"step": 615
},
{
"epoch": 4.460431654676259,
"grad_norm": 0.7877942550600477,
"learning_rate": 8.313944273165068e-06,
"loss": 0.2898,
"step": 620
},
{
"epoch": 4.496402877697841,
"grad_norm": 0.9163627849897873,
"learning_rate": 8.277621003975538e-06,
"loss": 0.2677,
"step": 625
},
{
"epoch": 4.532374100719425,
"grad_norm": 0.7629297663386118,
"learning_rate": 8.240991852710724e-06,
"loss": 0.2932,
"step": 630
},
{
"epoch": 4.568345323741007,
"grad_norm": 0.8142905446688702,
"learning_rate": 8.204060237764881e-06,
"loss": 0.2975,
"step": 635
},
{
"epoch": 4.60431654676259,
"grad_norm": 0.8952608329134608,
"learning_rate": 8.166829605759507e-06,
"loss": 0.2833,
"step": 640
},
{
"epoch": 4.640287769784173,
"grad_norm": 0.7138017190167328,
"learning_rate": 8.1293034312217e-06,
"loss": 0.2578,
"step": 645
},
{
"epoch": 4.676258992805756,
"grad_norm": 0.8550352663430013,
"learning_rate": 8.091485216259886e-06,
"loss": 0.2962,
"step": 650
},
{
"epoch": 4.712230215827338,
"grad_norm": 0.8260458421966028,
"learning_rate": 8.053378490236998e-06,
"loss": 0.3018,
"step": 655
},
{
"epoch": 4.748201438848921,
"grad_norm": 0.7578084834418747,
"learning_rate": 8.014986809441093e-06,
"loss": 0.3008,
"step": 660
},
{
"epoch": 4.784172661870503,
"grad_norm": 0.8541676466900879,
"learning_rate": 7.976313756753474e-06,
"loss": 0.2947,
"step": 665
},
{
"epoch": 4.820143884892087,
"grad_norm": 0.771785543725891,
"learning_rate": 7.9373629413143e-06,
"loss": 0.2777,
"step": 670
},
{
"epoch": 4.856115107913669,
"grad_norm": 0.7674809820247032,
"learning_rate": 7.898137998185788e-06,
"loss": 0.2772,
"step": 675
},
{
"epoch": 4.892086330935252,
"grad_norm": 0.8225717543186113,
"learning_rate": 7.858642588012957e-06,
"loss": 0.2989,
"step": 680
},
{
"epoch": 4.928057553956835,
"grad_norm": 0.7245903343028117,
"learning_rate": 7.818880396682007e-06,
"loss": 0.2796,
"step": 685
},
{
"epoch": 4.9640287769784175,
"grad_norm": 0.8489860259719797,
"learning_rate": 7.778855134976334e-06,
"loss": 0.2796,
"step": 690
},
{
"epoch": 5.0,
"grad_norm": 0.7746624242136518,
"learning_rate": 7.73857053823023e-06,
"loss": 0.2891,
"step": 695
},
{
"epoch": 5.0359712230215825,
"grad_norm": 1.0291353857874774,
"learning_rate": 7.698030365980265e-06,
"loss": 0.2075,
"step": 700
},
{
"epoch": 5.071942446043165,
"grad_norm": 0.9042435786419071,
"learning_rate": 7.657238401614459e-06,
"loss": 0.1749,
"step": 705
},
{
"epoch": 5.107913669064748,
"grad_norm": 0.895579817335833,
"learning_rate": 7.616198452019176e-06,
"loss": 0.1904,
"step": 710
},
{
"epoch": 5.143884892086331,
"grad_norm": 0.790487071528411,
"learning_rate": 7.57491434722386e-06,
"loss": 0.172,
"step": 715
},
{
"epoch": 5.179856115107913,
"grad_norm": 1.0172179899933327,
"learning_rate": 7.5333899400435986e-06,
"loss": 0.2023,
"step": 720
},
{
"epoch": 5.215827338129497,
"grad_norm": 0.8101299548770012,
"learning_rate": 7.491629105719559e-06,
"loss": 0.1732,
"step": 725
},
{
"epoch": 5.251798561151079,
"grad_norm": 0.8041008787827414,
"learning_rate": 7.44963574155733e-06,
"loss": 0.1901,
"step": 730
},
{
"epoch": 5.287769784172662,
"grad_norm": 0.7679456105085443,
"learning_rate": 7.407413766563218e-06,
"loss": 0.1987,
"step": 735
},
{
"epoch": 5.323741007194244,
"grad_norm": 0.9189882418553729,
"learning_rate": 7.3649671210785024e-06,
"loss": 0.1756,
"step": 740
},
{
"epoch": 5.359712230215827,
"grad_norm": 0.8739708430814714,
"learning_rate": 7.322299766411702e-06,
"loss": 0.1858,
"step": 745
},
{
"epoch": 5.39568345323741,
"grad_norm": 0.7930479843078312,
"learning_rate": 7.279415684468893e-06,
"loss": 0.1985,
"step": 750
},
{
"epoch": 5.431654676258993,
"grad_norm": 0.8224192025147354,
"learning_rate": 7.236318877382098e-06,
"loss": 0.1773,
"step": 755
},
{
"epoch": 5.467625899280575,
"grad_norm": 0.8202045692138589,
"learning_rate": 7.1930133671357915e-06,
"loss": 0.1917,
"step": 760
},
{
"epoch": 5.503597122302159,
"grad_norm": 0.9216022917795681,
"learning_rate": 7.149503195191553e-06,
"loss": 0.1868,
"step": 765
},
{
"epoch": 5.539568345323741,
"grad_norm": 0.8282102212656368,
"learning_rate": 7.1057924221108856e-06,
"loss": 0.1793,
"step": 770
},
{
"epoch": 5.575539568345324,
"grad_norm": 0.9944130581062844,
"learning_rate": 7.061885127176285e-06,
"loss": 0.195,
"step": 775
},
{
"epoch": 5.611510791366906,
"grad_norm": 0.8384690598165658,
"learning_rate": 7.017785408010533e-06,
"loss": 0.1733,
"step": 780
},
{
"epoch": 5.647482014388489,
"grad_norm": 0.8126766061557951,
"learning_rate": 6.973497380194292e-06,
"loss": 0.1893,
"step": 785
},
{
"epoch": 5.683453237410072,
"grad_norm": 0.7781648778019523,
"learning_rate": 6.929025176882016e-06,
"loss": 0.1671,
"step": 790
},
{
"epoch": 5.719424460431655,
"grad_norm": 0.8950618409041426,
"learning_rate": 6.884372948416232e-06,
"loss": 0.1844,
"step": 795
},
{
"epoch": 5.755395683453237,
"grad_norm": 0.8460453893006227,
"learning_rate": 6.839544861940214e-06,
"loss": 0.1927,
"step": 800
},
{
"epoch": 5.7913669064748206,
"grad_norm": 0.7684170747964488,
"learning_rate": 6.794545101009074e-06,
"loss": 0.1856,
"step": 805
},
{
"epoch": 5.827338129496403,
"grad_norm": 0.7353554154462942,
"learning_rate": 6.74937786519935e-06,
"loss": 0.1827,
"step": 810
},
{
"epoch": 5.863309352517986,
"grad_norm": 1.2438204201134049,
"learning_rate": 6.704047369717075e-06,
"loss": 0.1789,
"step": 815
},
{
"epoch": 5.899280575539568,
"grad_norm": 0.8460577382893343,
"learning_rate": 6.65855784500439e-06,
"loss": 0.1892,
"step": 820
},
{
"epoch": 5.935251798561151,
"grad_norm": 0.7772437606617016,
"learning_rate": 6.612913536344755e-06,
"loss": 0.2007,
"step": 825
},
{
"epoch": 5.971223021582734,
"grad_norm": 0.8274969656085451,
"learning_rate": 6.5671187034667465e-06,
"loss": 0.1804,
"step": 830
},
{
"epoch": 6.0071942446043165,
"grad_norm": 0.8049472498410428,
"learning_rate": 6.521177620146525e-06,
"loss": 0.1698,
"step": 835
},
{
"epoch": 6.043165467625899,
"grad_norm": 0.8195061687563975,
"learning_rate": 6.475094573808994e-06,
"loss": 0.1189,
"step": 840
},
{
"epoch": 6.079136690647482,
"grad_norm": 0.7117012258140467,
"learning_rate": 6.42887386512767e-06,
"loss": 0.1092,
"step": 845
},
{
"epoch": 6.115107913669065,
"grad_norm": 0.8585323527604425,
"learning_rate": 6.3825198076233255e-06,
"loss": 0.1172,
"step": 850
},
{
"epoch": 6.151079136690647,
"grad_norm": 0.719310146465657,
"learning_rate": 6.336036727261438e-06,
"loss": 0.1284,
"step": 855
},
{
"epoch": 6.18705035971223,
"grad_norm": 0.7584056325173733,
"learning_rate": 6.289428962048467e-06,
"loss": 0.0971,
"step": 860
},
{
"epoch": 6.223021582733813,
"grad_norm": 0.7585595993348573,
"learning_rate": 6.242700861627015e-06,
"loss": 0.1151,
"step": 865
},
{
"epoch": 6.258992805755396,
"grad_norm": 0.7847975616110491,
"learning_rate": 6.195856786869893e-06,
"loss": 0.1069,
"step": 870
},
{
"epoch": 6.294964028776978,
"grad_norm": 0.8845845513208976,
"learning_rate": 6.148901109473153e-06,
"loss": 0.1069,
"step": 875
},
{
"epoch": 6.330935251798561,
"grad_norm": 0.755964158625991,
"learning_rate": 6.101838211548099e-06,
"loss": 0.113,
"step": 880
},
{
"epoch": 6.366906474820144,
"grad_norm": 0.7457589645048261,
"learning_rate": 6.054672485212327e-06,
"loss": 0.1206,
"step": 885
},
{
"epoch": 6.402877697841727,
"grad_norm": 0.8137239720399341,
"learning_rate": 6.007408332179836e-06,
"loss": 0.1096,
"step": 890
},
{
"epoch": 6.438848920863309,
"grad_norm": 0.7371197763823699,
"learning_rate": 5.960050163350235e-06,
"loss": 0.1028,
"step": 895
},
{
"epoch": 6.474820143884892,
"grad_norm": 0.7124693107117961,
"learning_rate": 5.9126023983971114e-06,
"loss": 0.1128,
"step": 900
},
{
"epoch": 6.510791366906475,
"grad_norm": 0.7750858789601416,
"learning_rate": 5.865069465355551e-06,
"loss": 0.1297,
"step": 905
},
{
"epoch": 6.546762589928058,
"grad_norm": 0.8213615097610627,
"learning_rate": 5.817455800208901e-06,
"loss": 0.1277,
"step": 910
},
{
"epoch": 6.58273381294964,
"grad_norm": 0.7304317963820385,
"learning_rate": 5.769765846474794e-06,
"loss": 0.1125,
"step": 915
},
{
"epoch": 6.618705035971223,
"grad_norm": 0.8292934519593511,
"learning_rate": 5.722004054790442e-06,
"loss": 0.1238,
"step": 920
},
{
"epoch": 6.654676258992806,
"grad_norm": 0.7036770476021397,
"learning_rate": 5.674174882497297e-06,
"loss": 0.1105,
"step": 925
},
{
"epoch": 6.690647482014389,
"grad_norm": 0.8766920039902681,
"learning_rate": 5.626282793225066e-06,
"loss": 0.1134,
"step": 930
},
{
"epoch": 6.726618705035971,
"grad_norm": 0.8803786508866736,
"learning_rate": 5.578332256475144e-06,
"loss": 0.116,
"step": 935
},
{
"epoch": 6.762589928057554,
"grad_norm": 0.8141270371349804,
"learning_rate": 5.530327747203507e-06,
"loss": 0.1175,
"step": 940
},
{
"epoch": 6.798561151079137,
"grad_norm": 0.7327559370982355,
"learning_rate": 5.482273745403082e-06,
"loss": 0.1084,
"step": 945
},
{
"epoch": 6.83453237410072,
"grad_norm": 0.6975665306165219,
"learning_rate": 5.434174735685658e-06,
"loss": 0.125,
"step": 950
},
{
"epoch": 6.870503597122302,
"grad_norm": 0.7213427193750023,
"learning_rate": 5.3860352068633635e-06,
"loss": 0.1078,
"step": 955
},
{
"epoch": 6.906474820143885,
"grad_norm": 0.7912624637000346,
"learning_rate": 5.337859651529747e-06,
"loss": 0.1089,
"step": 960
},
{
"epoch": 6.942446043165468,
"grad_norm": 0.8661381989776245,
"learning_rate": 5.289652565640513e-06,
"loss": 0.1201,
"step": 965
},
{
"epoch": 6.9784172661870505,
"grad_norm": 0.8889995059859764,
"learning_rate": 5.241418448093931e-06,
"loss": 0.1045,
"step": 970
},
{
"epoch": 7.014388489208633,
"grad_norm": 0.6848728107959661,
"learning_rate": 5.193161800310991e-06,
"loss": 0.0868,
"step": 975
},
{
"epoch": 7.0503597122302155,
"grad_norm": 0.8191951918685765,
"learning_rate": 5.144887125815301e-06,
"loss": 0.0688,
"step": 980
},
{
"epoch": 7.086330935251799,
"grad_norm": 0.7083429689515068,
"learning_rate": 5.0965989298128e-06,
"loss": 0.0717,
"step": 985
},
{
"epoch": 7.122302158273381,
"grad_norm": 0.7638610219919097,
"learning_rate": 5.048301718771317e-06,
"loss": 0.0695,
"step": 990
},
{
"epoch": 7.158273381294964,
"grad_norm": 0.7485748648028515,
"learning_rate": 5e-06,
"loss": 0.0687,
"step": 995
},
{
"epoch": 7.194244604316546,
"grad_norm": 0.7050027098157597,
"learning_rate": 4.951698281228686e-06,
"loss": 0.0729,
"step": 1000
},
{
"epoch": 7.23021582733813,
"grad_norm": 0.657912930608705,
"learning_rate": 4.903401070187201e-06,
"loss": 0.0704,
"step": 1005
},
{
"epoch": 7.266187050359712,
"grad_norm": 0.5398206206893736,
"learning_rate": 4.855112874184701e-06,
"loss": 0.058,
"step": 1010
},
{
"epoch": 7.302158273381295,
"grad_norm": 0.5495044551987519,
"learning_rate": 4.806838199689009e-06,
"loss": 0.0684,
"step": 1015
},
{
"epoch": 7.338129496402877,
"grad_norm": 0.661299793154578,
"learning_rate": 4.75858155190607e-06,
"loss": 0.067,
"step": 1020
},
{
"epoch": 7.374100719424461,
"grad_norm": 0.6484353795344787,
"learning_rate": 4.710347434359489e-06,
"loss": 0.0649,
"step": 1025
},
{
"epoch": 7.410071942446043,
"grad_norm": 0.6667433332665962,
"learning_rate": 4.662140348470253e-06,
"loss": 0.066,
"step": 1030
},
{
"epoch": 7.446043165467626,
"grad_norm": 0.6727307298334666,
"learning_rate": 4.613964793136637e-06,
"loss": 0.0677,
"step": 1035
},
{
"epoch": 7.482014388489208,
"grad_norm": 0.6730404475842242,
"learning_rate": 4.565825264314344e-06,
"loss": 0.072,
"step": 1040
},
{
"epoch": 7.517985611510792,
"grad_norm": 0.5740431223544631,
"learning_rate": 4.51772625459692e-06,
"loss": 0.0679,
"step": 1045
},
{
"epoch": 7.553956834532374,
"grad_norm": 0.6895089066435401,
"learning_rate": 4.469672252796495e-06,
"loss": 0.0689,
"step": 1050
},
{
"epoch": 7.589928057553957,
"grad_norm": 0.781118218856585,
"learning_rate": 4.421667743524856e-06,
"loss": 0.0698,
"step": 1055
},
{
"epoch": 7.625899280575539,
"grad_norm": 0.5852743633262958,
"learning_rate": 4.373717206774935e-06,
"loss": 0.0661,
"step": 1060
},
{
"epoch": 7.661870503597123,
"grad_norm": 0.8465001999729229,
"learning_rate": 4.3258251175027036e-06,
"loss": 0.0699,
"step": 1065
},
{
"epoch": 7.697841726618705,
"grad_norm": 0.6063388727288704,
"learning_rate": 4.277995945209558e-06,
"loss": 0.0627,
"step": 1070
},
{
"epoch": 7.733812949640288,
"grad_norm": 0.6267594676130663,
"learning_rate": 4.230234153525207e-06,
"loss": 0.0729,
"step": 1075
},
{
"epoch": 7.76978417266187,
"grad_norm": 0.9079983203188293,
"learning_rate": 4.182544199791102e-06,
"loss": 0.0584,
"step": 1080
},
{
"epoch": 7.805755395683454,
"grad_norm": 0.7011901007125456,
"learning_rate": 4.1349305346444515e-06,
"loss": 0.072,
"step": 1085
},
{
"epoch": 7.841726618705036,
"grad_norm": 0.5894372195866899,
"learning_rate": 4.08739760160289e-06,
"loss": 0.0681,
"step": 1090
},
{
"epoch": 7.877697841726619,
"grad_norm": 0.7312384684995438,
"learning_rate": 4.039949836649765e-06,
"loss": 0.0652,
"step": 1095
},
{
"epoch": 7.913669064748201,
"grad_norm": 0.7889140825844604,
"learning_rate": 3.992591667820166e-06,
"loss": 0.0675,
"step": 1100
},
{
"epoch": 7.9496402877697845,
"grad_norm": 0.566611163942455,
"learning_rate": 3.945327514787676e-06,
"loss": 0.0624,
"step": 1105
},
{
"epoch": 7.985611510791367,
"grad_norm": 0.5476455607834616,
"learning_rate": 3.8981617884519015e-06,
"loss": 0.0747,
"step": 1110
},
{
"epoch": 8.02158273381295,
"grad_norm": 0.5147724688346386,
"learning_rate": 3.851098890526848e-06,
"loss": 0.0548,
"step": 1115
},
{
"epoch": 8.057553956834532,
"grad_norm": 0.6656915502998506,
"learning_rate": 3.80414321313011e-06,
"loss": 0.0439,
"step": 1120
},
{
"epoch": 8.093525179856115,
"grad_norm": 0.5212083633131025,
"learning_rate": 3.7572991383729855e-06,
"loss": 0.0416,
"step": 1125
},
{
"epoch": 8.129496402877697,
"grad_norm": 0.7168392620555522,
"learning_rate": 3.7105710379515335e-06,
"loss": 0.0489,
"step": 1130
},
{
"epoch": 8.16546762589928,
"grad_norm": 0.5342340402870694,
"learning_rate": 3.6639632727385616e-06,
"loss": 0.0453,
"step": 1135
},
{
"epoch": 8.201438848920864,
"grad_norm": 0.4424742834921625,
"learning_rate": 3.6174801923766762e-06,
"loss": 0.0438,
"step": 1140
},
{
"epoch": 8.237410071942445,
"grad_norm": 0.4371446386393137,
"learning_rate": 3.5711261348723327e-06,
"loss": 0.0421,
"step": 1145
},
{
"epoch": 8.273381294964029,
"grad_norm": 0.44606778279830905,
"learning_rate": 3.5249054261910067e-06,
"loss": 0.0449,
"step": 1150
},
{
"epoch": 8.309352517985612,
"grad_norm": 0.4196749502255892,
"learning_rate": 3.478822379853477e-06,
"loss": 0.0456,
"step": 1155
},
{
"epoch": 8.345323741007194,
"grad_norm": 0.45404549332701183,
"learning_rate": 3.432881296533257e-06,
"loss": 0.0446,
"step": 1160
},
{
"epoch": 8.381294964028777,
"grad_norm": 0.47041138885456785,
"learning_rate": 3.3870864636552468e-06,
"loss": 0.0429,
"step": 1165
},
{
"epoch": 8.417266187050359,
"grad_norm": 0.4574468532910372,
"learning_rate": 3.3414421549956115e-06,
"loss": 0.0485,
"step": 1170
},
{
"epoch": 8.453237410071942,
"grad_norm": 0.5297931614784003,
"learning_rate": 3.2959526302829257e-06,
"loss": 0.0361,
"step": 1175
},
{
"epoch": 8.489208633093526,
"grad_norm": 0.49346890066053606,
"learning_rate": 3.250622134800651e-06,
"loss": 0.0352,
"step": 1180
},
{
"epoch": 8.525179856115107,
"grad_norm": 0.5225771130962942,
"learning_rate": 3.205454898990928e-06,
"loss": 0.0481,
"step": 1185
},
{
"epoch": 8.56115107913669,
"grad_norm": 0.44436107460781493,
"learning_rate": 3.160455138059788e-06,
"loss": 0.0405,
"step": 1190
},
{
"epoch": 8.597122302158274,
"grad_norm": 0.44898570613304095,
"learning_rate": 3.115627051583768e-06,
"loss": 0.0438,
"step": 1195
},
{
"epoch": 8.633093525179856,
"grad_norm": 0.5492491751490948,
"learning_rate": 3.070974823117986e-06,
"loss": 0.0396,
"step": 1200
},
{
"epoch": 8.66906474820144,
"grad_norm": 0.565956815075197,
"learning_rate": 3.026502619805709e-06,
"loss": 0.048,
"step": 1205
},
{
"epoch": 8.70503597122302,
"grad_norm": 0.5284087475275612,
"learning_rate": 2.9822145919894676e-06,
"loss": 0.0384,
"step": 1210
},
{
"epoch": 8.741007194244604,
"grad_norm": 0.5103352561656974,
"learning_rate": 2.938114872823716e-06,
"loss": 0.0407,
"step": 1215
},
{
"epoch": 8.776978417266188,
"grad_norm": 0.6098332646000372,
"learning_rate": 2.8942075778891153e-06,
"loss": 0.0414,
"step": 1220
},
{
"epoch": 8.81294964028777,
"grad_norm": 0.5038330297794213,
"learning_rate": 2.8504968048084492e-06,
"loss": 0.0414,
"step": 1225
},
{
"epoch": 8.848920863309353,
"grad_norm": 0.5302618793861339,
"learning_rate": 2.806986632864208e-06,
"loss": 0.0377,
"step": 1230
},
{
"epoch": 8.884892086330936,
"grad_norm": 0.5568603382860081,
"learning_rate": 2.7636811226179027e-06,
"loss": 0.0433,
"step": 1235
},
{
"epoch": 8.920863309352518,
"grad_norm": 0.43414177083233707,
"learning_rate": 2.7205843155311098e-06,
"loss": 0.0407,
"step": 1240
},
{
"epoch": 8.956834532374101,
"grad_norm": 0.4216923605954666,
"learning_rate": 2.6777002335882996e-06,
"loss": 0.0432,
"step": 1245
},
{
"epoch": 8.992805755395683,
"grad_norm": 0.37758354262761024,
"learning_rate": 2.6350328789215e-06,
"loss": 0.0437,
"step": 1250
},
{
"epoch": 9.028776978417266,
"grad_norm": 0.3581546154815773,
"learning_rate": 2.5925862334367813e-06,
"loss": 0.035,
"step": 1255
},
{
"epoch": 9.06474820143885,
"grad_norm": 0.3170867641515182,
"learning_rate": 2.550364258442671e-06,
"loss": 0.0329,
"step": 1260
},
{
"epoch": 9.100719424460431,
"grad_norm": 0.38183737001651963,
"learning_rate": 2.5083708942804446e-06,
"loss": 0.0313,
"step": 1265
},
{
"epoch": 9.136690647482014,
"grad_norm": 0.2622381036741509,
"learning_rate": 2.466610059956401e-06,
"loss": 0.0299,
"step": 1270
},
{
"epoch": 9.172661870503598,
"grad_norm": 0.3649107007802287,
"learning_rate": 2.425085652776141e-06,
"loss": 0.0298,
"step": 1275
},
{
"epoch": 9.20863309352518,
"grad_norm": 0.3672806177221026,
"learning_rate": 2.383801547980826e-06,
"loss": 0.0313,
"step": 1280
},
{
"epoch": 9.244604316546763,
"grad_norm": 0.2692943801711555,
"learning_rate": 2.342761598385543e-06,
"loss": 0.0299,
"step": 1285
},
{
"epoch": 9.280575539568344,
"grad_norm": 0.5182589947207659,
"learning_rate": 2.3019696340197358e-06,
"loss": 0.0315,
"step": 1290
},
{
"epoch": 9.316546762589928,
"grad_norm": 0.3084054383118723,
"learning_rate": 2.2614294617697718e-06,
"loss": 0.031,
"step": 1295
},
{
"epoch": 9.352517985611511,
"grad_norm": 0.31260300284142956,
"learning_rate": 2.221144865023666e-06,
"loss": 0.0355,
"step": 1300
},
{
"epoch": 9.388489208633093,
"grad_norm": 0.41419458958369776,
"learning_rate": 2.181119603317994e-06,
"loss": 0.0303,
"step": 1305
},
{
"epoch": 9.424460431654676,
"grad_norm": 0.3777963995786534,
"learning_rate": 2.141357411987044e-06,
"loss": 0.0346,
"step": 1310
},
{
"epoch": 9.46043165467626,
"grad_norm": 0.36001008845633387,
"learning_rate": 2.1018620018142145e-06,
"loss": 0.0304,
"step": 1315
},
{
"epoch": 9.496402877697841,
"grad_norm": 0.3374166061680025,
"learning_rate": 2.062637058685701e-06,
"loss": 0.0321,
"step": 1320
},
{
"epoch": 9.532374100719425,
"grad_norm": 0.3122428132703056,
"learning_rate": 2.023686243246527e-06,
"loss": 0.0311,
"step": 1325
},
{
"epoch": 9.568345323741006,
"grad_norm": 0.31657772369937653,
"learning_rate": 1.9850131905589065e-06,
"loss": 0.0352,
"step": 1330
},
{
"epoch": 9.60431654676259,
"grad_norm": 0.3556383226917608,
"learning_rate": 1.9466215097630027e-06,
"loss": 0.0307,
"step": 1335
},
{
"epoch": 9.640287769784173,
"grad_norm": 0.3018207938690679,
"learning_rate": 1.908514783740114e-06,
"loss": 0.0304,
"step": 1340
},
{
"epoch": 9.676258992805755,
"grad_norm": 0.33472427448796227,
"learning_rate": 1.8706965687783013e-06,
"loss": 0.0318,
"step": 1345
},
{
"epoch": 9.712230215827338,
"grad_norm": 0.3014237413240292,
"learning_rate": 1.8331703942404932e-06,
"loss": 0.0345,
"step": 1350
},
{
"epoch": 9.748201438848922,
"grad_norm": 0.2795954718654732,
"learning_rate": 1.7959397622351199e-06,
"loss": 0.0321,
"step": 1355
},
{
"epoch": 9.784172661870503,
"grad_norm": 0.3404458491839953,
"learning_rate": 1.7590081472892779e-06,
"loss": 0.026,
"step": 1360
},
{
"epoch": 9.820143884892087,
"grad_norm": 0.27845491491806207,
"learning_rate": 1.7223789960244636e-06,
"loss": 0.0296,
"step": 1365
},
{
"epoch": 9.85611510791367,
"grad_norm": 0.3612513367851547,
"learning_rate": 1.686055726834932e-06,
"loss": 0.03,
"step": 1370
},
{
"epoch": 9.892086330935252,
"grad_norm": 0.4797636206277336,
"learning_rate": 1.6500417295686705e-06,
"loss": 0.03,
"step": 1375
},
{
"epoch": 9.928057553956835,
"grad_norm": 0.36211393385671814,
"learning_rate": 1.614340365211044e-06,
"loss": 0.0317,
"step": 1380
},
{
"epoch": 9.964028776978417,
"grad_norm": 0.4460467422949885,
"learning_rate": 1.5789549655711283e-06,
"loss": 0.0308,
"step": 1385
},
{
"epoch": 10.0,
"grad_norm": 0.3268330088868262,
"learning_rate": 1.5438888329707824e-06,
"loss": 0.0315,
"step": 1390
},
{
"epoch": 10.035971223021583,
"grad_norm": 0.25099763594150687,
"learning_rate": 1.5091452399364514e-06,
"loss": 0.0259,
"step": 1395
},
{
"epoch": 10.071942446043165,
"grad_norm": 0.21916155615378147,
"learning_rate": 1.4747274288937597e-06,
"loss": 0.0262,
"step": 1400
},
{
"epoch": 10.107913669064748,
"grad_norm": 0.1703480324817164,
"learning_rate": 1.4406386118649219e-06,
"loss": 0.0264,
"step": 1405
},
{
"epoch": 10.14388489208633,
"grad_norm": 0.19666841057864862,
"learning_rate": 1.4068819701689761e-06,
"loss": 0.0271,
"step": 1410
},
{
"epoch": 10.179856115107913,
"grad_norm": 0.22016830192324838,
"learning_rate": 1.3734606541248868e-06,
"loss": 0.0251,
"step": 1415
},
{
"epoch": 10.215827338129497,
"grad_norm": 0.2717923478176765,
"learning_rate": 1.3403777827575515e-06,
"loss": 0.0275,
"step": 1420
},
{
"epoch": 10.251798561151078,
"grad_norm": 0.37675768752343775,
"learning_rate": 1.3076364435067145e-06,
"loss": 0.026,
"step": 1425
},
{
"epoch": 10.287769784172662,
"grad_norm": 0.21221208648318335,
"learning_rate": 1.2752396919388293e-06,
"loss": 0.0273,
"step": 1430
},
{
"epoch": 10.323741007194245,
"grad_norm": 0.19655351295102655,
"learning_rate": 1.2431905514619092e-06,
"loss": 0.0282,
"step": 1435
},
{
"epoch": 10.359712230215827,
"grad_norm": 0.20261972203630707,
"learning_rate": 1.2114920130433644e-06,
"loss": 0.0246,
"step": 1440
},
{
"epoch": 10.39568345323741,
"grad_norm": 0.2010958074650419,
"learning_rate": 1.1801470349308664e-06,
"loss": 0.0252,
"step": 1445
},
{
"epoch": 10.431654676258994,
"grad_norm": 0.19932681605206862,
"learning_rate": 1.1491585423762818e-06,
"loss": 0.0251,
"step": 1450
},
{
"epoch": 10.467625899280575,
"grad_norm": 0.20849253670352091,
"learning_rate": 1.1185294273626685e-06,
"loss": 0.0261,
"step": 1455
},
{
"epoch": 10.503597122302159,
"grad_norm": 0.1639683839169035,
"learning_rate": 1.0882625483343846e-06,
"loss": 0.0257,
"step": 1460
},
{
"epoch": 10.53956834532374,
"grad_norm": 0.2736139772755203,
"learning_rate": 1.0583607299303245e-06,
"loss": 0.0274,
"step": 1465
},
{
"epoch": 10.575539568345324,
"grad_norm": 0.22918950850642805,
"learning_rate": 1.028826762720308e-06,
"loss": 0.0275,
"step": 1470
},
{
"epoch": 10.611510791366907,
"grad_norm": 0.314745772340147,
"learning_rate": 9.996634029446672e-07,
"loss": 0.0272,
"step": 1475
},
{
"epoch": 10.647482014388489,
"grad_norm": 0.17606113078594732,
"learning_rate": 9.708733722569996e-07,
"loss": 0.0239,
"step": 1480
},
{
"epoch": 10.683453237410072,
"grad_norm": 0.17429980721967395,
"learning_rate": 9.424593574701845e-07,
"loss": 0.0301,
"step": 1485
},
{
"epoch": 10.719424460431654,
"grad_norm": 0.1852236456926952,
"learning_rate": 9.144240103056439e-07,
"loss": 0.0246,
"step": 1490
},
{
"epoch": 10.755395683453237,
"grad_norm": 0.22805051647340693,
"learning_rate": 8.867699471458541e-07,
"loss": 0.0235,
"step": 1495
},
{
"epoch": 10.79136690647482,
"grad_norm": 0.19411564486758986,
"learning_rate": 8.59499748790188e-07,
"loss": 0.0273,
"step": 1500
},
{
"epoch": 10.827338129496402,
"grad_norm": 0.241482451282526,
"learning_rate": 8.326159602140594e-07,
"loss": 0.0283,
"step": 1505
},
{
"epoch": 10.863309352517986,
"grad_norm": 0.2345245699793013,
"learning_rate": 8.061210903314104e-07,
"loss": 0.0291,
"step": 1510
},
{
"epoch": 10.899280575539569,
"grad_norm": 0.47802038072543573,
"learning_rate": 7.800176117605762e-07,
"loss": 0.0272,
"step": 1515
},
{
"epoch": 10.93525179856115,
"grad_norm": 0.19335201088686405,
"learning_rate": 7.543079605935222e-07,
"loss": 0.0255,
"step": 1520
},
{
"epoch": 10.971223021582734,
"grad_norm": 0.16860350286261763,
"learning_rate": 7.289945361685013e-07,
"loss": 0.0257,
"step": 1525
},
{
"epoch": 11.007194244604317,
"grad_norm": 0.2725776570715531,
"learning_rate": 7.040797008461386e-07,
"loss": 0.0253,
"step": 1530
},
{
"epoch": 11.043165467625899,
"grad_norm": 0.19705015027897385,
"learning_rate": 6.795657797889555e-07,
"loss": 0.0248,
"step": 1535
},
{
"epoch": 11.079136690647482,
"grad_norm": 0.18399504559736504,
"learning_rate": 6.554550607443932e-07,
"loss": 0.0259,
"step": 1540
},
{
"epoch": 11.115107913669064,
"grad_norm": 0.22286270188259397,
"learning_rate": 6.317497938312905e-07,
"loss": 0.0265,
"step": 1545
},
{
"epoch": 11.151079136690647,
"grad_norm": 0.16493904987971855,
"learning_rate": 6.08452191329903e-07,
"loss": 0.0261,
"step": 1550
},
{
"epoch": 11.18705035971223,
"grad_norm": 0.1274225765011938,
"learning_rate": 5.855644274754485e-07,
"loss": 0.0231,
"step": 1555
},
{
"epoch": 11.223021582733812,
"grad_norm": 0.17013387891479795,
"learning_rate": 5.630886382551843e-07,
"loss": 0.0258,
"step": 1560
},
{
"epoch": 11.258992805755396,
"grad_norm": 0.16983637374927996,
"learning_rate": 5.410269212090757e-07,
"loss": 0.023,
"step": 1565
},
{
"epoch": 11.29496402877698,
"grad_norm": 0.1623168682381631,
"learning_rate": 5.193813352340448e-07,
"loss": 0.0227,
"step": 1570
},
{
"epoch": 11.33093525179856,
"grad_norm": 0.15643198279648218,
"learning_rate": 4.981539003918235e-07,
"loss": 0.0264,
"step": 1575
},
{
"epoch": 11.366906474820144,
"grad_norm": 0.15647879083727156,
"learning_rate": 4.773465977204311e-07,
"loss": 0.0241,
"step": 1580
},
{
"epoch": 11.402877697841726,
"grad_norm": 0.33238336836597426,
"learning_rate": 4.5696136904929464e-07,
"loss": 0.025,
"step": 1585
},
{
"epoch": 11.43884892086331,
"grad_norm": 0.17002067755615763,
"learning_rate": 4.3700011681803436e-07,
"loss": 0.0236,
"step": 1590
},
{
"epoch": 11.474820143884893,
"grad_norm": 0.32709296265366894,
"learning_rate": 4.1746470389891323e-07,
"loss": 0.0234,
"step": 1595
},
{
"epoch": 11.510791366906474,
"grad_norm": 0.1902707074643716,
"learning_rate": 3.9835695342298643e-07,
"loss": 0.025,
"step": 1600
},
{
"epoch": 11.546762589928058,
"grad_norm": 0.35433032752427657,
"learning_rate": 3.796786486099635e-07,
"loss": 0.0261,
"step": 1605
},
{
"epoch": 11.582733812949641,
"grad_norm": 0.16376293850363066,
"learning_rate": 3.6143153260178586e-07,
"loss": 0.025,
"step": 1610
},
{
"epoch": 11.618705035971223,
"grad_norm": 0.21222268474299327,
"learning_rate": 3.436173082999489e-07,
"loss": 0.0253,
"step": 1615
},
{
"epoch": 11.654676258992806,
"grad_norm": 0.24007081237773362,
"learning_rate": 3.262376382065824e-07,
"loss": 0.0257,
"step": 1620
},
{
"epoch": 11.690647482014388,
"grad_norm": 0.17120988541747317,
"learning_rate": 3.092941442692976e-07,
"loss": 0.0248,
"step": 1625
},
{
"epoch": 11.726618705035971,
"grad_norm": 0.1887089507451267,
"learning_rate": 2.927884077298182e-07,
"loss": 0.0231,
"step": 1630
},
{
"epoch": 11.762589928057555,
"grad_norm": 0.16792432822698705,
"learning_rate": 2.7672196897641336e-07,
"loss": 0.0262,
"step": 1635
},
{
"epoch": 11.798561151079136,
"grad_norm": 0.17705501184744799,
"learning_rate": 2.610963274001438e-07,
"loss": 0.0243,
"step": 1640
},
{
"epoch": 11.83453237410072,
"grad_norm": 0.16452397650807815,
"learning_rate": 2.459129412549266e-07,
"loss": 0.023,
"step": 1645
},
{
"epoch": 11.870503597122303,
"grad_norm": 0.1649667452272,
"learning_rate": 2.311732275214501e-07,
"loss": 0.0224,
"step": 1650
},
{
"epoch": 11.906474820143885,
"grad_norm": 0.17302393323353582,
"learning_rate": 2.1687856177493137e-07,
"loss": 0.0241,
"step": 1655
},
{
"epoch": 11.942446043165468,
"grad_norm": 0.16106116378893284,
"learning_rate": 2.0303027805674447e-07,
"loss": 0.0218,
"step": 1660
},
{
"epoch": 11.97841726618705,
"grad_norm": 0.18701658079265462,
"learning_rate": 1.8962966874991773e-07,
"loss": 0.0232,
"step": 1665
},
{
"epoch": 12.014388489208633,
"grad_norm": 0.13530271509575517,
"learning_rate": 1.7667798445852703e-07,
"loss": 0.0216,
"step": 1670
},
{
"epoch": 12.050359712230216,
"grad_norm": 0.19532436363008074,
"learning_rate": 1.6417643389098182e-07,
"loss": 0.0221,
"step": 1675
},
{
"epoch": 12.086330935251798,
"grad_norm": 0.1412049337424891,
"learning_rate": 1.5212618374722155e-07,
"loss": 0.0204,
"step": 1680
},
{
"epoch": 12.122302158273381,
"grad_norm": 0.15939325045792674,
"learning_rate": 1.4052835860983937e-07,
"loss": 0.0248,
"step": 1685
},
{
"epoch": 12.158273381294965,
"grad_norm": 0.15143043177780832,
"learning_rate": 1.2938404083912502e-07,
"loss": 0.0226,
"step": 1690
},
{
"epoch": 12.194244604316546,
"grad_norm": 0.1658493522364899,
"learning_rate": 1.1869427047205673e-07,
"loss": 0.0223,
"step": 1695
},
{
"epoch": 12.23021582733813,
"grad_norm": 0.17505367364240096,
"learning_rate": 1.084600451252421e-07,
"loss": 0.0261,
"step": 1700
},
{
"epoch": 12.266187050359711,
"grad_norm": 0.14470298755684347,
"learning_rate": 9.868231990181332e-08,
"loss": 0.0225,
"step": 1705
},
{
"epoch": 12.302158273381295,
"grad_norm": 0.1492823965956767,
"learning_rate": 8.936200730229439e-08,
"loss": 0.0253,
"step": 1710
},
{
"epoch": 12.338129496402878,
"grad_norm": 0.19794064778842438,
"learning_rate": 8.049997713944158e-08,
"loss": 0.0262,
"step": 1715
},
{
"epoch": 12.37410071942446,
"grad_norm": 0.14141923113991967,
"learning_rate": 7.209705645706944e-08,
"loss": 0.0229,
"step": 1720
},
{
"epoch": 12.410071942446043,
"grad_norm": 0.1710020025729561,
"learning_rate": 6.415402945286698e-08,
"loss": 0.0245,
"step": 1725
},
{
"epoch": 12.446043165467627,
"grad_norm": 0.14036902238695775,
"learning_rate": 5.6671637405212865e-08,
"loss": 0.021,
"step": 1730
},
{
"epoch": 12.482014388489208,
"grad_norm": 0.18979536208011222,
"learning_rate": 4.9650578603996355e-08,
"loss": 0.0227,
"step": 1735
},
{
"epoch": 12.517985611510792,
"grad_norm": 0.16609096182431196,
"learning_rate": 4.309150828544939e-08,
"loss": 0.0237,
"step": 1740
},
{
"epoch": 12.553956834532373,
"grad_norm": 0.16297966249868379,
"learning_rate": 3.699503857099829e-08,
"loss": 0.0226,
"step": 1745
},
{
"epoch": 12.589928057553957,
"grad_norm": 0.14779137892803101,
"learning_rate": 3.1361738410133905e-08,
"loss": 0.0236,
"step": 1750
},
{
"epoch": 12.62589928057554,
"grad_norm": 0.15356487001181282,
"learning_rate": 2.619213352732186e-08,
"loss": 0.0256,
"step": 1755
},
{
"epoch": 12.661870503597122,
"grad_norm": 0.16653231778754538,
"learning_rate": 2.1486706372932375e-08,
"loss": 0.0229,
"step": 1760
},
{
"epoch": 12.697841726618705,
"grad_norm": 0.15764673944415686,
"learning_rate": 1.7245896078220135e-08,
"loss": 0.022,
"step": 1765
},
{
"epoch": 12.733812949640289,
"grad_norm": 0.16016051420414304,
"learning_rate": 1.3470098414340993e-08,
"loss": 0.0226,
"step": 1770
},
{
"epoch": 12.76978417266187,
"grad_norm": 0.24481997751094148,
"learning_rate": 1.0159665755417603e-08,
"loss": 0.0264,
"step": 1775
},
{
"epoch": 12.805755395683454,
"grad_norm": 0.22542723588692715,
"learning_rate": 7.314907045653519e-09,
"loss": 0.0253,
"step": 1780
},
{
"epoch": 12.841726618705035,
"grad_norm": 0.157525745290468,
"learning_rate": 4.936087770502917e-09,
"loss": 0.0239,
"step": 1785
},
{
"epoch": 12.877697841726619,
"grad_norm": 0.17750732441905473,
"learning_rate": 3.0234299318909755e-09,
"loss": 0.022,
"step": 1790
},
{
"epoch": 12.913669064748202,
"grad_norm": 0.16333347276082377,
"learning_rate": 1.5771120274993278e-09,
"loss": 0.0255,
"step": 1795
},
{
"epoch": 12.949640287769784,
"grad_norm": 0.14651237067290176,
"learning_rate": 5.972690341066178e-10,
"loss": 0.0251,
"step": 1800
},
{
"epoch": 12.985611510791367,
"grad_norm": 0.15396045493776508,
"learning_rate": 8.39923949891297e-11,
"loss": 0.0252,
"step": 1805
},
{
"epoch": 13.0,
"step": 1807,
"total_flos": 722931058671616.0,
"train_loss": 0.25000700822350524,
"train_runtime": 34410.5906,
"train_samples_per_second": 1.679,
"train_steps_per_second": 0.053
}
],
"logging_steps": 5,
"max_steps": 1807,
"num_input_tokens_seen": 0,
"num_train_epochs": 13,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 722931058671616.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}