{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 4.0,
"eval_steps": 500,
"global_step": 1308,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0061162079510703364,
"grad_norm": 0.796875,
"learning_rate": 1.5151515151515152e-07,
"loss": 1.9698597192764282,
"step": 2
},
{
"epoch": 0.012232415902140673,
"grad_norm": 1.625,
"learning_rate": 4.5454545454545457e-07,
"loss": 2.038839101791382,
"step": 4
},
{
"epoch": 0.01834862385321101,
"grad_norm": 0.70703125,
"learning_rate": 7.575757575757576e-07,
"loss": 2.013974189758301,
"step": 6
},
{
"epoch": 0.024464831804281346,
"grad_norm": 0.83984375,
"learning_rate": 1.0606060606060608e-06,
"loss": 2.130162239074707,
"step": 8
},
{
"epoch": 0.03058103975535168,
"grad_norm": 0.7890625,
"learning_rate": 1.3636363636363636e-06,
"loss": 2.004484176635742,
"step": 10
},
{
"epoch": 0.03669724770642202,
"grad_norm": 0.8671875,
"learning_rate": 1.6666666666666667e-06,
"loss": 2.008946180343628,
"step": 12
},
{
"epoch": 0.04281345565749235,
"grad_norm": 0.74609375,
"learning_rate": 1.96969696969697e-06,
"loss": 2.0183446407318115,
"step": 14
},
{
"epoch": 0.04892966360856269,
"grad_norm": 1.078125,
"learning_rate": 2.2727272727272728e-06,
"loss": 1.9834390878677368,
"step": 16
},
{
"epoch": 0.05504587155963303,
"grad_norm": 0.75390625,
"learning_rate": 2.575757575757576e-06,
"loss": 2.044725179672241,
"step": 18
},
{
"epoch": 0.06116207951070336,
"grad_norm": 0.8828125,
"learning_rate": 2.8787878787878793e-06,
"loss": 2.3765246868133545,
"step": 20
},
{
"epoch": 0.0672782874617737,
"grad_norm": 0.62890625,
"learning_rate": 3.181818181818182e-06,
"loss": 2.135927438735962,
"step": 22
},
{
"epoch": 0.07339449541284404,
"grad_norm": 1.1484375,
"learning_rate": 3.4848484848484854e-06,
"loss": 2.1705098152160645,
"step": 24
},
{
"epoch": 0.07951070336391437,
"grad_norm": 1.2421875,
"learning_rate": 3.7878787878787882e-06,
"loss": 2.0761096477508545,
"step": 26
},
{
"epoch": 0.0856269113149847,
"grad_norm": 2.03125,
"learning_rate": 4.0909090909090915e-06,
"loss": 2.0886521339416504,
"step": 28
},
{
"epoch": 0.09174311926605505,
"grad_norm": 0.921875,
"learning_rate": 4.393939393939394e-06,
"loss": 2.201143980026245,
"step": 30
},
{
"epoch": 0.09785932721712538,
"grad_norm": 8.625,
"learning_rate": 4.696969696969698e-06,
"loss": 2.045245885848999,
"step": 32
},
{
"epoch": 0.10397553516819572,
"grad_norm": 0.65234375,
"learning_rate": 5e-06,
"loss": 1.907004475593567,
"step": 34
},
{
"epoch": 0.11009174311926606,
"grad_norm": 1.2265625,
"learning_rate": 5.303030303030303e-06,
"loss": 1.9484505653381348,
"step": 36
},
{
"epoch": 0.1162079510703364,
"grad_norm": 0.77734375,
"learning_rate": 5.606060606060606e-06,
"loss": 1.963153600692749,
"step": 38
},
{
"epoch": 0.12232415902140673,
"grad_norm": 0.51953125,
"learning_rate": 5.90909090909091e-06,
"loss": 2.0666375160217285,
"step": 40
},
{
"epoch": 0.12844036697247707,
"grad_norm": 0.4453125,
"learning_rate": 6.212121212121213e-06,
"loss": 1.9147648811340332,
"step": 42
},
{
"epoch": 0.1345565749235474,
"grad_norm": 0.59765625,
"learning_rate": 6.515151515151516e-06,
"loss": 1.8978981971740723,
"step": 44
},
{
"epoch": 0.14067278287461774,
"grad_norm": 0.61328125,
"learning_rate": 6.818181818181818e-06,
"loss": 1.8800417184829712,
"step": 46
},
{
"epoch": 0.14678899082568808,
"grad_norm": 0.78125,
"learning_rate": 7.121212121212122e-06,
"loss": 2.1724555492401123,
"step": 48
},
{
"epoch": 0.1529051987767584,
"grad_norm": 0.8046875,
"learning_rate": 7.424242424242425e-06,
"loss": 2.0132710933685303,
"step": 50
},
{
"epoch": 0.15902140672782875,
"grad_norm": 0.53515625,
"learning_rate": 7.727272727272727e-06,
"loss": 1.9249347448349,
"step": 52
},
{
"epoch": 0.1651376146788991,
"grad_norm": 0.5625,
"learning_rate": 8.03030303030303e-06,
"loss": 2.0230674743652344,
"step": 54
},
{
"epoch": 0.1712538226299694,
"grad_norm": 0.46484375,
"learning_rate": 8.333333333333334e-06,
"loss": 1.8684461116790771,
"step": 56
},
{
"epoch": 0.17737003058103976,
"grad_norm": 0.68359375,
"learning_rate": 8.636363636363637e-06,
"loss": 1.9816838502883911,
"step": 58
},
{
"epoch": 0.1834862385321101,
"grad_norm": 0.70703125,
"learning_rate": 8.93939393939394e-06,
"loss": 1.8070955276489258,
"step": 60
},
{
"epoch": 0.18960244648318042,
"grad_norm": 0.71875,
"learning_rate": 9.242424242424244e-06,
"loss": 1.9102303981781006,
"step": 62
},
{
"epoch": 0.19571865443425077,
"grad_norm": 0.703125,
"learning_rate": 9.545454545454547e-06,
"loss": 1.761095643043518,
"step": 64
},
{
"epoch": 0.2018348623853211,
"grad_norm": 0.9765625,
"learning_rate": 9.84848484848485e-06,
"loss": 1.8172270059585571,
"step": 66
},
{
"epoch": 0.20795107033639143,
"grad_norm": 0.69921875,
"learning_rate": 9.99998560409937e-06,
"loss": 1.7951654195785522,
"step": 68
},
{
"epoch": 0.21406727828746178,
"grad_norm": 0.59375,
"learning_rate": 9.999870437446959e-06,
"loss": 1.7246266603469849,
"step": 70
},
{
"epoch": 0.22018348623853212,
"grad_norm": 0.61328125,
"learning_rate": 9.99964010708956e-06,
"loss": 1.7382261753082275,
"step": 72
},
{
"epoch": 0.22629969418960244,
"grad_norm": 0.93359375,
"learning_rate": 9.999294618921943e-06,
"loss": 1.8094028234481812,
"step": 74
},
{
"epoch": 0.2324159021406728,
"grad_norm": 0.69140625,
"learning_rate": 9.998833981786072e-06,
"loss": 1.7889823913574219,
"step": 76
},
{
"epoch": 0.23853211009174313,
"grad_norm": 0.515625,
"learning_rate": 9.998258207470882e-06,
"loss": 1.7645984888076782,
"step": 78
},
{
"epoch": 0.24464831804281345,
"grad_norm": 1.6328125,
"learning_rate": 9.997567310711977e-06,
"loss": 1.692162275314331,
"step": 80
},
{
"epoch": 0.25076452599388377,
"grad_norm": 0.38671875,
"learning_rate": 9.996761309191248e-06,
"loss": 1.6656694412231445,
"step": 82
},
{
"epoch": 0.25688073394495414,
"grad_norm": 1.0859375,
"learning_rate": 9.995840223536428e-06,
"loss": 1.69821035861969,
"step": 84
},
{
"epoch": 0.26299694189602446,
"grad_norm": 0.55078125,
"learning_rate": 9.99480407732056e-06,
"loss": 1.693019986152649,
"step": 86
},
{
"epoch": 0.2691131498470948,
"grad_norm": 0.66796875,
"learning_rate": 9.993652897061394e-06,
"loss": 1.585938572883606,
"step": 88
},
{
"epoch": 0.27522935779816515,
"grad_norm": 0.55859375,
"learning_rate": 9.99238671222071e-06,
"loss": 1.5834678411483765,
"step": 90
},
{
"epoch": 0.28134556574923547,
"grad_norm": 0.72265625,
"learning_rate": 9.991005555203553e-06,
"loss": 1.5904253721237183,
"step": 92
},
{
"epoch": 0.2874617737003058,
"grad_norm": 0.93359375,
"learning_rate": 9.989509461357428e-06,
"loss": 1.7213293313980103,
"step": 94
},
{
"epoch": 0.29357798165137616,
"grad_norm": 0.96875,
"learning_rate": 9.98789846897137e-06,
"loss": 1.59124755859375,
"step": 96
},
{
"epoch": 0.2996941896024465,
"grad_norm": 0.328125,
"learning_rate": 9.986172619274977e-06,
"loss": 1.4882735013961792,
"step": 98
},
{
"epoch": 0.3058103975535168,
"grad_norm": 0.490234375,
"learning_rate": 9.984331956437354e-06,
"loss": 1.6401163339614868,
"step": 100
},
{
"epoch": 0.3119266055045872,
"grad_norm": 0.484375,
"learning_rate": 9.982376527565981e-06,
"loss": 1.6229268312454224,
"step": 102
},
{
"epoch": 0.3180428134556575,
"grad_norm": 0.6953125,
"learning_rate": 9.980306382705504e-06,
"loss": 1.6486362218856812,
"step": 104
},
{
"epoch": 0.3241590214067278,
"grad_norm": 0.4921875,
"learning_rate": 9.978121574836463e-06,
"loss": 1.7563343048095703,
"step": 106
},
{
"epoch": 0.3302752293577982,
"grad_norm": 0.55078125,
"learning_rate": 9.975822159873925e-06,
"loss": 1.5931520462036133,
"step": 108
},
{
"epoch": 0.3363914373088685,
"grad_norm": 0.490234375,
"learning_rate": 9.973408196666062e-06,
"loss": 1.6376924514770508,
"step": 110
},
{
"epoch": 0.3425076452599388,
"grad_norm": 1.0234375,
"learning_rate": 9.970879746992641e-06,
"loss": 1.6083383560180664,
"step": 112
},
{
"epoch": 0.3486238532110092,
"grad_norm": 0.408203125,
"learning_rate": 9.968236875563444e-06,
"loss": 1.5672008991241455,
"step": 114
},
{
"epoch": 0.3547400611620795,
"grad_norm": 0.400390625,
"learning_rate": 9.965479650016611e-06,
"loss": 1.5744966268539429,
"step": 116
},
{
"epoch": 0.36085626911314983,
"grad_norm": 0.51171875,
"learning_rate": 9.962608140916906e-06,
"loss": 1.6350196599960327,
"step": 118
},
{
"epoch": 0.3669724770642202,
"grad_norm": 0.4609375,
"learning_rate": 9.959622421753922e-06,
"loss": 1.4963032007217407,
"step": 120
},
{
"epoch": 0.3730886850152905,
"grad_norm": 0.486328125,
"learning_rate": 9.956522568940185e-06,
"loss": 1.5451488494873047,
"step": 122
},
{
"epoch": 0.37920489296636084,
"grad_norm": 0.439453125,
"learning_rate": 9.953308661809209e-06,
"loss": 1.599358320236206,
"step": 124
},
{
"epoch": 0.3853211009174312,
"grad_norm": 0.37890625,
"learning_rate": 9.949980782613466e-06,
"loss": 1.5644880533218384,
"step": 126
},
{
"epoch": 0.39143730886850153,
"grad_norm": 0.6171875,
"learning_rate": 9.94653901652227e-06,
"loss": 1.6034414768218994,
"step": 128
},
{
"epoch": 0.39755351681957185,
"grad_norm": 0.55078125,
"learning_rate": 9.942983451619614e-06,
"loss": 1.6047066450119019,
"step": 130
},
{
"epoch": 0.4036697247706422,
"grad_norm": 0.4140625,
"learning_rate": 9.939314178901898e-06,
"loss": 1.5338762998580933,
"step": 132
},
{
"epoch": 0.40978593272171254,
"grad_norm": 0.60546875,
"learning_rate": 9.935531292275615e-06,
"loss": 1.5983346700668335,
"step": 134
},
{
"epoch": 0.41590214067278286,
"grad_norm": 0.390625,
"learning_rate": 9.931634888554937e-06,
"loss": 1.4490175247192383,
"step": 136
},
{
"epoch": 0.42201834862385323,
"grad_norm": 0.86328125,
"learning_rate": 9.927625067459245e-06,
"loss": 1.43030846118927,
"step": 138
},
{
"epoch": 0.42813455657492355,
"grad_norm": 0.369140625,
"learning_rate": 9.923501931610571e-06,
"loss": 1.5441913604736328,
"step": 140
},
{
"epoch": 0.43425076452599387,
"grad_norm": 1.296875,
"learning_rate": 9.919265586530977e-06,
"loss": 1.5886114835739136,
"step": 142
},
{
"epoch": 0.44036697247706424,
"grad_norm": 0.8203125,
"learning_rate": 9.914916140639849e-06,
"loss": 1.5252549648284912,
"step": 144
},
{
"epoch": 0.44648318042813456,
"grad_norm": 0.353515625,
"learning_rate": 9.910453705251127e-06,
"loss": 1.4197413921356201,
"step": 146
},
{
"epoch": 0.4525993883792049,
"grad_norm": 0.5390625,
"learning_rate": 9.905878394570453e-06,
"loss": 1.5738030672073364,
"step": 148
},
{
"epoch": 0.45871559633027525,
"grad_norm": 0.55859375,
"learning_rate": 9.90119032569225e-06,
"loss": 1.595241904258728,
"step": 150
},
{
"epoch": 0.4648318042813456,
"grad_norm": 0.515625,
"learning_rate": 9.89638961859672e-06,
"loss": 1.5898534059524536,
"step": 152
},
{
"epoch": 0.4709480122324159,
"grad_norm": 0.5078125,
"learning_rate": 9.891476396146785e-06,
"loss": 1.5508402585983276,
"step": 154
},
{
"epoch": 0.47706422018348627,
"grad_norm": 0.404296875,
"learning_rate": 9.886450784084934e-06,
"loss": 1.5691711902618408,
"step": 156
},
{
"epoch": 0.4831804281345566,
"grad_norm": 0.85546875,
"learning_rate": 9.88131291103e-06,
"loss": 1.5895097255706787,
"step": 158
},
{
"epoch": 0.4892966360856269,
"grad_norm": 0.58984375,
"learning_rate": 9.876062908473883e-06,
"loss": 1.5543285608291626,
"step": 160
},
{
"epoch": 0.4954128440366973,
"grad_norm": 0.375,
"learning_rate": 9.870700910778169e-06,
"loss": 1.4683598279953003,
"step": 162
},
{
"epoch": 0.5015290519877675,
"grad_norm": 1.328125,
"learning_rate": 9.865227055170706e-06,
"loss": 1.4957642555236816,
"step": 164
},
{
"epoch": 0.5076452599388379,
"grad_norm": 0.54296875,
"learning_rate": 9.85964148174208e-06,
"loss": 1.448598027229309,
"step": 166
},
{
"epoch": 0.5137614678899083,
"grad_norm": 0.54296875,
"learning_rate": 9.853944333442036e-06,
"loss": 1.4433187246322632,
"step": 168
},
{
"epoch": 0.5198776758409785,
"grad_norm": 5.46875,
"learning_rate": 9.848135756075816e-06,
"loss": 1.500611424446106,
"step": 170
},
{
"epoch": 0.5259938837920489,
"grad_norm": 0.435546875,
"learning_rate": 9.842215898300434e-06,
"loss": 1.4782170057296753,
"step": 172
},
{
"epoch": 0.5321100917431193,
"grad_norm": 0.35546875,
"learning_rate": 9.836184911620863e-06,
"loss": 1.485479712486267,
"step": 174
},
{
"epoch": 0.5382262996941896,
"grad_norm": 1.53125,
"learning_rate": 9.830042950386162e-06,
"loss": 1.5060051679611206,
"step": 176
},
{
"epoch": 0.5443425076452599,
"grad_norm": 0.3671875,
"learning_rate": 9.823790171785527e-06,
"loss": 1.4704962968826294,
"step": 178
},
{
"epoch": 0.5504587155963303,
"grad_norm": 0.4765625,
"learning_rate": 9.817426735844265e-06,
"loss": 1.4355278015136719,
"step": 180
},
{
"epoch": 0.5565749235474006,
"grad_norm": 0.55078125,
"learning_rate": 9.810952805419701e-06,
"loss": 1.5194344520568848,
"step": 182
},
{
"epoch": 0.5626911314984709,
"grad_norm": 0.455078125,
"learning_rate": 9.804368546197007e-06,
"loss": 1.5073320865631104,
"step": 184
},
{
"epoch": 0.5688073394495413,
"grad_norm": 0.6640625,
"learning_rate": 9.797674126684967e-06,
"loss": 1.522252082824707,
"step": 186
},
{
"epoch": 0.5749235474006116,
"grad_norm": 0.5390625,
"learning_rate": 9.790869718211657e-06,
"loss": 1.6073163747787476,
"step": 188
},
{
"epoch": 0.581039755351682,
"grad_norm": 0.416015625,
"learning_rate": 9.783955494920067e-06,
"loss": 1.4052844047546387,
"step": 190
},
{
"epoch": 0.5871559633027523,
"grad_norm": 0.3515625,
"learning_rate": 9.77693163376364e-06,
"loss": 1.4193068742752075,
"step": 192
},
{
"epoch": 0.5932721712538226,
"grad_norm": 0.4609375,
"learning_rate": 9.76979831450175e-06,
"loss": 1.5307352542877197,
"step": 194
},
{
"epoch": 0.599388379204893,
"grad_norm": 0.5625,
"learning_rate": 9.76255571969509e-06,
"loss": 1.424899697303772,
"step": 196
},
{
"epoch": 0.6055045871559633,
"grad_norm": 0.52734375,
"learning_rate": 9.755204034701004e-06,
"loss": 1.359844446182251,
"step": 198
},
{
"epoch": 0.6116207951070336,
"grad_norm": 0.5078125,
"learning_rate": 9.747743447668755e-06,
"loss": 1.582168459892273,
"step": 200
},
{
"epoch": 0.617737003058104,
"grad_norm": 0.470703125,
"learning_rate": 9.740174149534694e-06,
"loss": 1.488830327987671,
"step": 202
},
{
"epoch": 0.6238532110091743,
"grad_norm": 0.5,
"learning_rate": 9.732496334017376e-06,
"loss": 1.4927191734313965,
"step": 204
},
{
"epoch": 0.6299694189602446,
"grad_norm": 0.45703125,
"learning_rate": 9.724710197612615e-06,
"loss": 1.4716768264770508,
"step": 206
},
{
"epoch": 0.636085626911315,
"grad_norm": 0.408203125,
"learning_rate": 9.716815939588437e-06,
"loss": 1.3903311491012573,
"step": 208
},
{
"epoch": 0.6422018348623854,
"grad_norm": 0.341796875,
"learning_rate": 9.708813761979992e-06,
"loss": 1.5344760417938232,
"step": 210
},
{
"epoch": 0.6483180428134556,
"grad_norm": 0.73046875,
"learning_rate": 9.700703869584386e-06,
"loss": 1.4522379636764526,
"step": 212
},
{
"epoch": 0.654434250764526,
"grad_norm": 0.35546875,
"learning_rate": 9.692486469955425e-06,
"loss": 1.3874422311782837,
"step": 214
},
{
"epoch": 0.6605504587155964,
"grad_norm": 0.2734375,
"learning_rate": 9.684161773398321e-06,
"loss": 1.3861643075942993,
"step": 216
},
{
"epoch": 0.6666666666666666,
"grad_norm": 0.55859375,
"learning_rate": 9.675729992964292e-06,
"loss": 1.5152150392532349,
"step": 218
},
{
"epoch": 0.672782874617737,
"grad_norm": 0.98828125,
"learning_rate": 9.667191344445123e-06,
"loss": 1.3514238595962524,
"step": 220
},
{
"epoch": 0.6788990825688074,
"grad_norm": 0.70703125,
"learning_rate": 9.658546046367646e-06,
"loss": 1.39436936378479,
"step": 222
},
{
"epoch": 0.6850152905198776,
"grad_norm": 0.69140625,
"learning_rate": 9.649794319988121e-06,
"loss": 1.4995126724243164,
"step": 224
},
{
"epoch": 0.691131498470948,
"grad_norm": 0.490234375,
"learning_rate": 9.640936389286617e-06,
"loss": 1.4583836793899536,
"step": 226
},
{
"epoch": 0.6972477064220184,
"grad_norm": 0.61328125,
"learning_rate": 9.631972480961235e-06,
"loss": 1.4303733110427856,
"step": 228
},
{
"epoch": 0.7033639143730887,
"grad_norm": 0.421875,
"learning_rate": 9.622902824422336e-06,
"loss": 1.393810749053955,
"step": 230
},
{
"epoch": 0.709480122324159,
"grad_norm": 0.453125,
"learning_rate": 9.613727651786659e-06,
"loss": 1.51703679561615,
"step": 232
},
{
"epoch": 0.7155963302752294,
"grad_norm": 0.5,
"learning_rate": 9.604447197871382e-06,
"loss": 1.373485803604126,
"step": 234
},
{
"epoch": 0.7217125382262997,
"grad_norm": 0.384765625,
"learning_rate": 9.59506170018811e-06,
"loss": 1.4396356344223022,
"step": 236
},
{
"epoch": 0.72782874617737,
"grad_norm": 0.5859375,
"learning_rate": 9.5855713989368e-06,
"loss": 1.5568106174468994,
"step": 238
},
{
"epoch": 0.7339449541284404,
"grad_norm": 0.94140625,
"learning_rate": 9.575976536999616e-06,
"loss": 1.4187113046646118,
"step": 240
},
{
"epoch": 0.7400611620795107,
"grad_norm": 0.423828125,
"learning_rate": 9.566277359934703e-06,
"loss": 1.4353150129318237,
"step": 242
},
{
"epoch": 0.746177370030581,
"grad_norm": 0.62109375,
"learning_rate": 9.556474115969911e-06,
"loss": 1.5181076526641846,
"step": 244
},
{
"epoch": 0.7522935779816514,
"grad_norm": 0.408203125,
"learning_rate": 9.546567055996441e-06,
"loss": 1.4269428253173828,
"step": 246
},
{
"epoch": 0.7584097859327217,
"grad_norm": 0.4765625,
"learning_rate": 9.536556433562422e-06,
"loss": 1.4407360553741455,
"step": 248
},
{
"epoch": 0.764525993883792,
"grad_norm": 0.373046875,
"learning_rate": 9.526442504866427e-06,
"loss": 1.3571839332580566,
"step": 250
},
{
"epoch": 0.7706422018348624,
"grad_norm": 0.41796875,
"learning_rate": 9.516225528750904e-06,
"loss": 1.4300589561462402,
"step": 252
},
{
"epoch": 0.7767584097859327,
"grad_norm": 0.59375,
"learning_rate": 9.505905766695564e-06,
"loss": 1.5078905820846558,
"step": 254
},
{
"epoch": 0.7828746177370031,
"grad_norm": 0.64453125,
"learning_rate": 9.495483482810688e-06,
"loss": 1.456427812576294,
"step": 256
},
{
"epoch": 0.7889908256880734,
"grad_norm": 0.546875,
"learning_rate": 9.484958943830363e-06,
"loss": 1.4158270359039307,
"step": 258
},
{
"epoch": 0.7951070336391437,
"grad_norm": 0.3984375,
"learning_rate": 9.474332419105652e-06,
"loss": 1.3977278470993042,
"step": 260
},
{
"epoch": 0.8012232415902141,
"grad_norm": 0.51953125,
"learning_rate": 9.463604180597712e-06,
"loss": 1.3898099660873413,
"step": 262
},
{
"epoch": 0.8073394495412844,
"grad_norm": 0.58203125,
"learning_rate": 9.452774502870822e-06,
"loss": 1.4355534315109253,
"step": 264
},
{
"epoch": 0.8134556574923547,
"grad_norm": 0.59375,
"learning_rate": 9.441843663085368e-06,
"loss": 1.454459309577942,
"step": 266
},
{
"epoch": 0.8195718654434251,
"grad_norm": 0.4453125,
"learning_rate": 9.430811940990736e-06,
"loss": 1.4455972909927368,
"step": 268
},
{
"epoch": 0.8256880733944955,
"grad_norm": 0.57421875,
"learning_rate": 9.419679618918164e-06,
"loss": 1.381105661392212,
"step": 270
},
{
"epoch": 0.8318042813455657,
"grad_norm": 0.46484375,
"learning_rate": 9.408446981773514e-06,
"loss": 1.4196290969848633,
"step": 272
},
{
"epoch": 0.8379204892966361,
"grad_norm": 1.1328125,
"learning_rate": 9.397114317029975e-06,
"loss": 1.4939439296722412,
"step": 274
},
{
"epoch": 0.8440366972477065,
"grad_norm": 0.5859375,
"learning_rate": 9.38568191472071e-06,
"loss": 1.4450997114181519,
"step": 276
},
{
"epoch": 0.8501529051987767,
"grad_norm": 0.6640625,
"learning_rate": 9.374150067431433e-06,
"loss": 1.4556881189346313,
"step": 278
},
{
"epoch": 0.8562691131498471,
"grad_norm": 0.36328125,
"learning_rate": 9.362519070292924e-06,
"loss": 1.3958441019058228,
"step": 280
},
{
"epoch": 0.8623853211009175,
"grad_norm": 0.55078125,
"learning_rate": 9.350789220973468e-06,
"loss": 1.492562174797058,
"step": 282
},
{
"epoch": 0.8685015290519877,
"grad_norm": 0.33203125,
"learning_rate": 9.33896081967124e-06,
"loss": 1.3768280744552612,
"step": 284
},
{
"epoch": 0.8746177370030581,
"grad_norm": 0.94140625,
"learning_rate": 9.32703416910663e-06,
"loss": 1.3635163307189941,
"step": 286
},
{
"epoch": 0.8807339449541285,
"grad_norm": 0.703125,
"learning_rate": 9.315009574514487e-06,
"loss": 1.3402776718139648,
"step": 288
},
{
"epoch": 0.8868501529051988,
"grad_norm": 0.328125,
"learning_rate": 9.302887343636305e-06,
"loss": 1.4155652523040771,
"step": 290
},
{
"epoch": 0.8929663608562691,
"grad_norm": 0.453125,
"learning_rate": 9.290667786712354e-06,
"loss": 1.5360904932022095,
"step": 292
},
{
"epoch": 0.8990825688073395,
"grad_norm": 0.447265625,
"learning_rate": 9.278351216473737e-06,
"loss": 1.4269368648529053,
"step": 294
},
{
"epoch": 0.9051987767584098,
"grad_norm": 2.40625,
"learning_rate": 9.265937948134393e-06,
"loss": 1.4990252256393433,
"step": 296
},
{
"epoch": 0.9113149847094801,
"grad_norm": 0.37890625,
"learning_rate": 9.253428299383013e-06,
"loss": 1.4629848003387451,
"step": 298
},
{
"epoch": 0.9174311926605505,
"grad_norm": 0.66015625,
"learning_rate": 9.240822590374927e-06,
"loss": 1.3986918926239014,
"step": 300
},
{
"epoch": 0.9235474006116208,
"grad_norm": 0.40234375,
"learning_rate": 9.228121143723901e-06,
"loss": 1.374011754989624,
"step": 302
},
{
"epoch": 0.9296636085626911,
"grad_norm": 0.33203125,
"learning_rate": 9.215324284493888e-06,
"loss": 1.3964948654174805,
"step": 304
},
{
"epoch": 0.9357798165137615,
"grad_norm": 0.43359375,
"learning_rate": 9.202432340190696e-06,
"loss": 1.3667786121368408,
"step": 306
},
{
"epoch": 0.9418960244648318,
"grad_norm": 0.380859375,
"learning_rate": 9.18944564075362e-06,
"loss": 1.3175499439239502,
"step": 308
},
{
"epoch": 0.9480122324159022,
"grad_norm": 0.376953125,
"learning_rate": 9.17636451854699e-06,
"loss": 1.3974062204360962,
"step": 310
},
{
"epoch": 0.9541284403669725,
"grad_norm": 0.578125,
"learning_rate": 9.163189308351666e-06,
"loss": 1.405277132987976,
"step": 312
},
{
"epoch": 0.9602446483180428,
"grad_norm": 0.59375,
"learning_rate": 9.149920347356472e-06,
"loss": 1.4029018878936768,
"step": 314
},
{
"epoch": 0.9663608562691132,
"grad_norm": 0.41015625,
"learning_rate": 9.136557975149563e-06,
"loss": 1.3701725006103516,
"step": 316
},
{
"epoch": 0.9724770642201835,
"grad_norm": 0.5859375,
"learning_rate": 9.12310253370974e-06,
"loss": 1.4639108180999756,
"step": 318
},
{
"epoch": 0.9785932721712538,
"grad_norm": 1.140625,
"learning_rate": 9.109554367397699e-06,
"loss": 1.3428951501846313,
"step": 320
},
{
"epoch": 0.9847094801223242,
"grad_norm": 0.9296875,
"learning_rate": 9.095913822947197e-06,
"loss": 1.2543865442276,
"step": 322
},
{
"epoch": 0.9908256880733946,
"grad_norm": 0.4296875,
"learning_rate": 9.082181249456211e-06,
"loss": 1.287245512008667,
"step": 324
},
{
"epoch": 0.9969418960244648,
"grad_norm": 0.404296875,
"learning_rate": 9.06835699837798e-06,
"loss": 1.3998972177505493,
"step": 326
},
{
"epoch": 1.003058103975535,
"grad_norm": 0.4921875,
"learning_rate": 9.054441423512015e-06,
"loss": 1.381530523300171,
"step": 328
},
{
"epoch": 1.0091743119266054,
"grad_norm": 0.380859375,
"learning_rate": 9.040434880995052e-06,
"loss": 1.388320803642273,
"step": 330
},
{
"epoch": 1.0152905198776758,
"grad_norm": 0.478515625,
"learning_rate": 9.026337729291927e-06,
"loss": 1.3628325462341309,
"step": 332
},
{
"epoch": 1.0214067278287462,
"grad_norm": 0.447265625,
"learning_rate": 9.012150329186412e-06,
"loss": 1.3321391344070435,
"step": 334
},
{
"epoch": 1.0275229357798166,
"grad_norm": 0.53515625,
"learning_rate": 8.997873043771974e-06,
"loss": 1.4002933502197266,
"step": 336
},
{
"epoch": 1.033639143730887,
"grad_norm": 0.55859375,
"learning_rate": 8.983506238442486e-06,
"loss": 1.3850795030593872,
"step": 338
},
{
"epoch": 1.039755351681957,
"grad_norm": 0.8984375,
"learning_rate": 8.969050280882873e-06,
"loss": 1.3575987815856934,
"step": 340
},
{
"epoch": 1.0458715596330275,
"grad_norm": 1.671875,
"learning_rate": 8.954505541059707e-06,
"loss": 1.448436975479126,
"step": 342
},
{
"epoch": 1.0519877675840978,
"grad_norm": 0.51953125,
"learning_rate": 8.939872391211732e-06,
"loss": 1.3202804327011108,
"step": 344
},
{
"epoch": 1.0581039755351682,
"grad_norm": 0.5703125,
"learning_rate": 8.925151205840343e-06,
"loss": 1.335026741027832,
"step": 346
},
{
"epoch": 1.0642201834862386,
"grad_norm": 0.412109375,
"learning_rate": 8.910342361699996e-06,
"loss": 1.3237738609313965,
"step": 348
},
{
"epoch": 1.070336391437309,
"grad_norm": 0.51171875,
"learning_rate": 8.895446237788574e-06,
"loss": 1.3708387613296509,
"step": 350
},
{
"epoch": 1.0764525993883791,
"grad_norm": 0.400390625,
"learning_rate": 8.88046321533768e-06,
"loss": 1.3443958759307861,
"step": 352
},
{
"epoch": 1.0825688073394495,
"grad_norm": 1.2578125,
"learning_rate": 8.865393677802882e-06,
"loss": 1.231789231300354,
"step": 354
},
{
"epoch": 1.0886850152905199,
"grad_norm": 0.455078125,
"learning_rate": 8.850238010853902e-06,
"loss": 1.3118000030517578,
"step": 356
},
{
"epoch": 1.0948012232415902,
"grad_norm": 0.46484375,
"learning_rate": 8.834996602364738e-06,
"loss": 1.449766993522644,
"step": 358
},
{
"epoch": 1.1009174311926606,
"grad_norm": 0.53515625,
"learning_rate": 8.81966984240375e-06,
"loss": 1.3435068130493164,
"step": 360
},
{
"epoch": 1.107033639143731,
"grad_norm": 5.9375,
"learning_rate": 8.80425812322367e-06,
"loss": 1.2645937204360962,
"step": 362
},
{
"epoch": 1.1131498470948011,
"grad_norm": 0.64453125,
"learning_rate": 8.78876183925156e-06,
"loss": 1.2984048128128052,
"step": 364
},
{
"epoch": 1.1192660550458715,
"grad_norm": 0.5546875,
"learning_rate": 8.77318138707872e-06,
"loss": 1.3319660425186157,
"step": 366
},
{
"epoch": 1.1253822629969419,
"grad_norm": 0.97265625,
"learning_rate": 8.757517165450543e-06,
"loss": 1.3149017095565796,
"step": 368
},
{
"epoch": 1.1314984709480123,
"grad_norm": 0.4375,
"learning_rate": 8.741769575256306e-06,
"loss": 1.3030190467834473,
"step": 370
},
{
"epoch": 1.1376146788990826,
"grad_norm": 0.45703125,
"learning_rate": 8.725939019518902e-06,
"loss": 1.2816126346588135,
"step": 372
},
{
"epoch": 1.143730886850153,
"grad_norm": 0.640625,
"learning_rate": 8.710025903384548e-06,
"loss": 1.3657718896865845,
"step": 374
},
{
"epoch": 1.1498470948012232,
"grad_norm": 0.408203125,
"learning_rate": 8.69403063411239e-06,
"loss": 1.2346255779266357,
"step": 376
},
{
"epoch": 1.1559633027522935,
"grad_norm": 0.35546875,
"learning_rate": 8.6779536210641e-06,
"loss": 1.2943626642227173,
"step": 378
},
{
"epoch": 1.162079510703364,
"grad_norm": 0.515625,
"learning_rate": 8.661795275693385e-06,
"loss": 1.3616715669631958,
"step": 380
},
{
"epoch": 1.1681957186544343,
"grad_norm": 0.447265625,
"learning_rate": 8.64555601153547e-06,
"loss": 1.2533824443817139,
"step": 382
},
{
"epoch": 1.1743119266055047,
"grad_norm": 0.56640625,
"learning_rate": 8.629236244196502e-06,
"loss": 1.287404179573059,
"step": 384
},
{
"epoch": 1.1804281345565748,
"grad_norm": 0.62890625,
"learning_rate": 8.612836391342925e-06,
"loss": 1.3631038665771484,
"step": 386
},
{
"epoch": 1.1865443425076452,
"grad_norm": 0.41015625,
"learning_rate": 8.596356872690779e-06,
"loss": 1.3277571201324463,
"step": 388
},
{
"epoch": 1.1926605504587156,
"grad_norm": 0.50390625,
"learning_rate": 8.579798109994968e-06,
"loss": 1.3345115184783936,
"step": 390
},
{
"epoch": 1.198776758409786,
"grad_norm": 0.458984375,
"learning_rate": 8.563160527038467e-06,
"loss": 1.2454558610916138,
"step": 392
},
{
"epoch": 1.2048929663608563,
"grad_norm": 0.53125,
"learning_rate": 8.546444549621467e-06,
"loss": 1.3097434043884277,
"step": 394
},
{
"epoch": 1.2110091743119267,
"grad_norm": 0.470703125,
"learning_rate": 8.529650605550478e-06,
"loss": 1.2673131227493286,
"step": 396
},
{
"epoch": 1.217125382262997,
"grad_norm": 0.55859375,
"learning_rate": 8.512779124627395e-06,
"loss": 1.4371856451034546,
"step": 398
},
{
"epoch": 1.2232415902140672,
"grad_norm": 0.392578125,
"learning_rate": 8.495830538638481e-06,
"loss": 1.2818241119384766,
"step": 400
},
{
"epoch": 1.2293577981651376,
"grad_norm": 0.48046875,
"learning_rate": 8.478805281343335e-06,
"loss": 1.215641736984253,
"step": 402
},
{
"epoch": 1.235474006116208,
"grad_norm": 2.296875,
"learning_rate": 8.461703788463757e-06,
"loss": 1.2823781967163086,
"step": 404
},
{
"epoch": 1.2415902140672783,
"grad_norm": 0.671875,
"learning_rate": 8.44452649767264e-06,
"loss": 1.3114620447158813,
"step": 406
},
{
"epoch": 1.2477064220183487,
"grad_norm": 0.62890625,
"learning_rate": 8.427273848582744e-06,
"loss": 1.2511239051818848,
"step": 408
},
{
"epoch": 1.2538226299694188,
"grad_norm": 0.5234375,
"learning_rate": 8.40994628273544e-06,
"loss": 1.2478758096694946,
"step": 410
},
{
"epoch": 1.2599388379204892,
"grad_norm": 0.5390625,
"learning_rate": 8.392544243589428e-06,
"loss": 1.3285698890686035,
"step": 412
},
{
"epoch": 1.2660550458715596,
"grad_norm": 0.74609375,
"learning_rate": 8.375068176509375e-06,
"loss": 1.3709665536880493,
"step": 414
},
{
"epoch": 1.27217125382263,
"grad_norm": 0.703125,
"learning_rate": 8.357518528754524e-06,
"loss": 1.3329336643218994,
"step": 416
},
{
"epoch": 1.2782874617737003,
"grad_norm": 0.53515625,
"learning_rate": 8.339895749467238e-06,
"loss": 1.2674789428710938,
"step": 418
},
{
"epoch": 1.2844036697247707,
"grad_norm": 0.80859375,
"learning_rate": 8.322200289661517e-06,
"loss": 1.152662992477417,
"step": 420
},
{
"epoch": 1.290519877675841,
"grad_norm": 0.45703125,
"learning_rate": 8.304432602211446e-06,
"loss": 1.3445444107055664,
"step": 422
},
{
"epoch": 1.2966360856269112,
"grad_norm": 0.578125,
"learning_rate": 8.28659314183961e-06,
"loss": 1.3826080560684204,
"step": 424
},
{
"epoch": 1.3027522935779816,
"grad_norm": 0.3828125,
"learning_rate": 8.268682365105453e-06,
"loss": 1.3560914993286133,
"step": 426
},
{
"epoch": 1.308868501529052,
"grad_norm": 0.62890625,
"learning_rate": 8.250700730393599e-06,
"loss": 1.2076865434646606,
"step": 428
},
{
"epoch": 1.3149847094801224,
"grad_norm": 0.408203125,
"learning_rate": 8.232648697902113e-06,
"loss": 1.3048980236053467,
"step": 430
},
{
"epoch": 1.3211009174311927,
"grad_norm": 0.4921875,
"learning_rate": 8.21452672963073e-06,
"loss": 1.352384328842163,
"step": 432
},
{
"epoch": 1.3272171253822629,
"grad_norm": 0.3984375,
"learning_rate": 8.196335289369027e-06,
"loss": 1.390981674194336,
"step": 434
},
{
"epoch": 1.3333333333333333,
"grad_norm": 1.0625,
"learning_rate": 8.178074842684554e-06,
"loss": 1.32779860496521,
"step": 436
},
{
"epoch": 1.3394495412844036,
"grad_norm": 1.0390625,
"learning_rate": 8.159745856910922e-06,
"loss": 1.2868674993515015,
"step": 438
},
{
"epoch": 1.345565749235474,
"grad_norm": 0.419921875,
"learning_rate": 8.14134880113584e-06,
"loss": 1.305415153503418,
"step": 440
},
{
"epoch": 1.3516819571865444,
"grad_norm": 0.53125,
"learning_rate": 8.122884146189104e-06,
"loss": 1.3808095455169678,
"step": 442
},
{
"epoch": 1.3577981651376148,
"grad_norm": 0.4921875,
"learning_rate": 8.104352364630565e-06,
"loss": 1.2937378883361816,
"step": 444
},
{
"epoch": 1.3639143730886851,
"grad_norm": 0.32421875,
"learning_rate": 8.085753930738013e-06,
"loss": 1.2958605289459229,
"step": 446
},
{
"epoch": 1.3700305810397553,
"grad_norm": 0.60546875,
"learning_rate": 8.067089320495057e-06,
"loss": 1.3038794994354248,
"step": 448
},
{
"epoch": 1.3761467889908257,
"grad_norm": 0.52734375,
"learning_rate": 8.048359011578927e-06,
"loss": 1.2670778036117554,
"step": 450
},
{
"epoch": 1.382262996941896,
"grad_norm": 1.390625,
"learning_rate": 8.029563483348268e-06,
"loss": 1.3002293109893799,
"step": 452
},
{
"epoch": 1.3883792048929664,
"grad_norm": 0.73046875,
"learning_rate": 8.010703216830852e-06,
"loss": 1.3091164827346802,
"step": 454
},
{
"epoch": 1.3944954128440368,
"grad_norm": 0.4921875,
"learning_rate": 7.991778694711278e-06,
"loss": 1.2860240936279297,
"step": 456
},
{
"epoch": 1.400611620795107,
"grad_norm": 0.421875,
"learning_rate": 7.972790401318627e-06,
"loss": 1.2974958419799805,
"step": 458
},
{
"epoch": 1.4067278287461773,
"grad_norm": 1.4609375,
"learning_rate": 7.953738822614048e-06,
"loss": 1.3687572479248047,
"step": 460
},
{
"epoch": 1.4128440366972477,
"grad_norm": 0.447265625,
"learning_rate": 7.934624446178328e-06,
"loss": 1.2588635683059692,
"step": 462
},
{
"epoch": 1.418960244648318,
"grad_norm": 0.427734375,
"learning_rate": 7.915447761199427e-06,
"loss": 1.3145904541015625,
"step": 464
},
{
"epoch": 1.4250764525993884,
"grad_norm": 0.318359375,
"learning_rate": 7.896209258459934e-06,
"loss": 1.2143771648406982,
"step": 466
},
{
"epoch": 1.4311926605504588,
"grad_norm": 0.40625,
"learning_rate": 7.876909430324527e-06,
"loss": 1.2713569402694702,
"step": 468
},
{
"epoch": 1.4373088685015292,
"grad_norm": 0.64453125,
"learning_rate": 7.85754877072737e-06,
"loss": 1.3136000633239746,
"step": 470
},
{
"epoch": 1.4434250764525993,
"grad_norm": 0.46484375,
"learning_rate": 7.838127775159451e-06,
"loss": 1.2473974227905273,
"step": 472
},
{
"epoch": 1.4495412844036697,
"grad_norm": 0.5078125,
"learning_rate": 7.818646940655933e-06,
"loss": 1.3004451990127563,
"step": 474
},
{
"epoch": 1.45565749235474,
"grad_norm": 1.4296875,
"learning_rate": 7.799106765783407e-06,
"loss": 1.3775520324707031,
"step": 476
},
{
"epoch": 1.4617737003058104,
"grad_norm": 0.515625,
"learning_rate": 7.779507750627145e-06,
"loss": 1.409247875213623,
"step": 478
},
{
"epoch": 1.4678899082568808,
"grad_norm": 3.953125,
"learning_rate": 7.7598503967783e-06,
"loss": 1.282897710800171,
"step": 480
},
{
"epoch": 1.474006116207951,
"grad_norm": 0.4453125,
"learning_rate": 7.74013520732107e-06,
"loss": 1.2685235738754272,
"step": 482
},
{
"epoch": 1.4801223241590213,
"grad_norm": 0.54296875,
"learning_rate": 7.720362686819814e-06,
"loss": 1.202805995941162,
"step": 484
},
{
"epoch": 1.4862385321100917,
"grad_norm": 0.359375,
"learning_rate": 7.700533341306155e-06,
"loss": 1.3179457187652588,
"step": 486
},
{
"epoch": 1.492354740061162,
"grad_norm": 0.4140625,
"learning_rate": 7.680647678266011e-06,
"loss": 1.3416056632995605,
"step": 488
},
{
"epoch": 1.4984709480122325,
"grad_norm": 0.609375,
"learning_rate": 7.66070620662662e-06,
"loss": 1.2907155752182007,
"step": 490
},
{
"epoch": 1.5045871559633026,
"grad_norm": 0.6640625,
"learning_rate": 7.640709436743512e-06,
"loss": 1.2985384464263916,
"step": 492
},
{
"epoch": 1.5107033639143732,
"grad_norm": 0.54296875,
"learning_rate": 7.620657880387448e-06,
"loss": 1.2733287811279297,
"step": 494
},
{
"epoch": 1.5168195718654434,
"grad_norm": 0.55859375,
"learning_rate": 7.600552050731315e-06,
"loss": 1.2120338678359985,
"step": 496
},
{
"epoch": 1.5229357798165137,
"grad_norm": 0.859375,
"learning_rate": 7.5803924623370025e-06,
"loss": 1.2848923206329346,
"step": 498
},
{
"epoch": 1.529051987767584,
"grad_norm": 0.69140625,
"learning_rate": 7.5601796311422325e-06,
"loss": 1.3336488008499146,
"step": 500
},
{
"epoch": 1.5351681957186545,
"grad_norm": 0.45703125,
"learning_rate": 7.539914074447349e-06,
"loss": 1.2442420721054077,
"step": 502
},
{
"epoch": 1.5412844036697249,
"grad_norm": 0.68359375,
"learning_rate": 7.519596310902081e-06,
"loss": 1.266619324684143,
"step": 504
},
{
"epoch": 1.547400611620795,
"grad_norm": 0.65625,
"learning_rate": 7.499226860492273e-06,
"loss": 1.374267816543579,
"step": 506
},
{
"epoch": 1.5535168195718656,
"grad_norm": 0.46484375,
"learning_rate": 7.478806244526576e-06,
"loss": 1.3529757261276245,
"step": 508
},
{
"epoch": 1.5596330275229358,
"grad_norm": 0.455078125,
"learning_rate": 7.458334985623102e-06,
"loss": 1.2986624240875244,
"step": 510
},
{
"epoch": 1.5657492354740061,
"grad_norm": 0.37109375,
"learning_rate": 7.437813607696049e-06,
"loss": 1.2934763431549072,
"step": 512
},
{
"epoch": 1.5718654434250765,
"grad_norm": 0.90234375,
"learning_rate": 7.4172426359422976e-06,
"loss": 1.3502346277236938,
"step": 514
},
{
"epoch": 1.5779816513761467,
"grad_norm": 0.58203125,
"learning_rate": 7.396622596827967e-06,
"loss": 1.2319389581680298,
"step": 516
},
{
"epoch": 1.5840978593272173,
"grad_norm": 0.625,
"learning_rate": 7.375954018074941e-06,
"loss": 1.3282928466796875,
"step": 518
},
{
"epoch": 1.5902140672782874,
"grad_norm": 0.6875,
"learning_rate": 7.3552374286473595e-06,
"loss": 1.3678048849105835,
"step": 520
},
{
"epoch": 1.5963302752293578,
"grad_norm": 0.30859375,
"learning_rate": 7.3344733587380875e-06,
"loss": 1.2744084596633911,
"step": 522
},
{
"epoch": 1.6024464831804281,
"grad_norm": 0.84375,
"learning_rate": 7.31366233975514e-06,
"loss": 1.281977891921997,
"step": 524
},
{
"epoch": 1.6085626911314985,
"grad_norm": 0.61328125,
"learning_rate": 7.292804904308087e-06,
"loss": 1.2926934957504272,
"step": 526
},
{
"epoch": 1.614678899082569,
"grad_norm": 0.484375,
"learning_rate": 7.271901586194417e-06,
"loss": 1.3355308771133423,
"step": 528
},
{
"epoch": 1.620795107033639,
"grad_norm": 0.62890625,
"learning_rate": 7.2509529203858794e-06,
"loss": 1.2734055519104004,
"step": 530
},
{
"epoch": 1.6269113149847096,
"grad_norm": 0.412109375,
"learning_rate": 7.229959443014793e-06,
"loss": 1.2471139430999756,
"step": 532
},
{
"epoch": 1.6330275229357798,
"grad_norm": 0.376953125,
"learning_rate": 7.208921691360323e-06,
"loss": 1.3476160764694214,
"step": 534
},
{
"epoch": 1.6391437308868502,
"grad_norm": 0.408203125,
"learning_rate": 7.187840203834732e-06,
"loss": 1.2233093976974487,
"step": 536
},
{
"epoch": 1.6452599388379205,
"grad_norm": 0.66015625,
"learning_rate": 7.166715519969601e-06,
"loss": 1.2761595249176025,
"step": 538
},
{
"epoch": 1.6513761467889907,
"grad_norm": 1.84375,
"learning_rate": 7.145548180402021e-06,
"loss": 1.3554096221923828,
"step": 540
},
{
"epoch": 1.6574923547400613,
"grad_norm": 0.474609375,
"learning_rate": 7.124338726860755e-06,
"loss": 1.3470004796981812,
"step": 542
},
{
"epoch": 1.6636085626911314,
"grad_norm": 0.796875,
"learning_rate": 7.103087702152377e-06,
"loss": 1.312508225440979,
"step": 544
},
{
"epoch": 1.6697247706422018,
"grad_norm": 2.484375,
"learning_rate": 7.081795650147375e-06,
"loss": 1.2889965772628784,
"step": 546
},
{
"epoch": 1.6758409785932722,
"grad_norm": 0.75,
"learning_rate": 7.060463115766239e-06,
"loss": 1.3792515993118286,
"step": 548
},
{
"epoch": 1.6819571865443423,
"grad_norm": 1.96875,
"learning_rate": 7.0390906449655104e-06,
"loss": 1.321378469467163,
"step": 550
},
{
"epoch": 1.688073394495413,
"grad_norm": 0.515625,
"learning_rate": 7.017678784723806e-06,
"loss": 1.3485661745071411,
"step": 552
},
{
"epoch": 1.694189602446483,
"grad_norm": 0.5546875,
"learning_rate": 6.99622808302783e-06,
"loss": 1.3221888542175293,
"step": 554
},
{
"epoch": 1.7003058103975535,
"grad_norm": 0.46875,
"learning_rate": 6.974739088858338e-06,
"loss": 1.3821053504943848,
"step": 556
},
{
"epoch": 1.7064220183486238,
"grad_norm": 0.466796875,
"learning_rate": 6.9532123521760944e-06,
"loss": 1.272276759147644,
"step": 558
},
{
"epoch": 1.7125382262996942,
"grad_norm": 0.4375,
"learning_rate": 6.931648423907796e-06,
"loss": 1.2930102348327637,
"step": 560
},
{
"epoch": 1.7186544342507646,
"grad_norm": 0.361328125,
"learning_rate": 6.91004785593197e-06,
"loss": 1.2617864608764648,
"step": 562
},
{
"epoch": 1.7247706422018347,
"grad_norm": 0.640625,
"learning_rate": 6.888411201064854e-06,
"loss": 1.3153817653656006,
"step": 564
},
{
"epoch": 1.7308868501529053,
"grad_norm": 0.412109375,
"learning_rate": 6.866739013046243e-06,
"loss": 1.2653061151504517,
"step": 566
},
{
"epoch": 1.7370030581039755,
"grad_norm": 0.5078125,
"learning_rate": 6.845031846525322e-06,
"loss": 1.2796239852905273,
"step": 568
},
{
"epoch": 1.7431192660550459,
"grad_norm": 0.328125,
"learning_rate": 6.823290257046467e-06,
"loss": 1.2797678709030151,
"step": 570
},
{
"epoch": 1.7492354740061162,
"grad_norm": 0.671875,
"learning_rate": 6.801514801035031e-06,
"loss": 1.2564300298690796,
"step": 572
},
{
"epoch": 1.7553516819571864,
"grad_norm": 0.53515625,
"learning_rate": 6.7797060357831045e-06,
"loss": 1.3716152906417847,
"step": 574
},
{
"epoch": 1.761467889908257,
"grad_norm": 0.64453125,
"learning_rate": 6.757864519435245e-06,
"loss": 1.3831623792648315,
"step": 576
},
{
"epoch": 1.7675840978593271,
"grad_norm": 1.4765625,
"learning_rate": 6.735990810974205e-06,
"loss": 1.3119230270385742,
"step": 578
},
{
"epoch": 1.7737003058103975,
"grad_norm": 0.69140625,
"learning_rate": 6.71408547020661e-06,
"loss": 1.2804102897644043,
"step": 580
},
{
"epoch": 1.7798165137614679,
"grad_norm": 0.578125,
"learning_rate": 6.6921490577486495e-06,
"loss": 1.403084635734558,
"step": 582
},
{
"epoch": 1.7859327217125383,
"grad_norm": 0.78125,
"learning_rate": 6.6701821350117155e-06,
"loss": 1.2526099681854248,
"step": 584
},
{
"epoch": 1.7920489296636086,
"grad_norm": 0.7890625,
"learning_rate": 6.648185264188043e-06,
"loss": 1.2811146974563599,
"step": 586
},
{
"epoch": 1.7981651376146788,
"grad_norm": 0.6171875,
"learning_rate": 6.626159008236316e-06,
"loss": 1.2454664707183838,
"step": 588
},
{
"epoch": 1.8042813455657494,
"grad_norm": 0.71875,
"learning_rate": 6.60410393086726e-06,
"loss": 1.2602325677871704,
"step": 590
},
{
"epoch": 1.8103975535168195,
"grad_norm": 0.421875,
"learning_rate": 6.582020596529224e-06,
"loss": 1.2364270687103271,
"step": 592
},
{
"epoch": 1.81651376146789,
"grad_norm": 0.462890625,
"learning_rate": 6.559909570393723e-06,
"loss": 1.2236618995666504,
"step": 594
},
{
"epoch": 1.8226299694189603,
"grad_norm": 1.109375,
"learning_rate": 6.537771418340981e-06,
"loss": 1.3950483798980713,
"step": 596
},
{
"epoch": 1.8287461773700304,
"grad_norm": 0.578125,
"learning_rate": 6.515606706945448e-06,
"loss": 1.2344207763671875,
"step": 598
},
{
"epoch": 1.834862385321101,
"grad_norm": 0.62890625,
"learning_rate": 6.493416003461296e-06,
"loss": 1.335288643836975,
"step": 600
},
{
"epoch": 1.8409785932721712,
"grad_norm": 0.515625,
"learning_rate": 6.4711998758079064e-06,
"loss": 1.255522608757019,
"step": 602
},
{
"epoch": 1.8470948012232415,
"grad_norm": 0.392578125,
"learning_rate": 6.448958892555332e-06,
"loss": 1.2738847732543945,
"step": 604
},
{
"epoch": 1.853211009174312,
"grad_norm": 0.37890625,
"learning_rate": 6.426693622909742e-06,
"loss": 1.2251421213150024,
"step": 606
},
{
"epoch": 1.8593272171253823,
"grad_norm": 0.58203125,
"learning_rate": 6.404404636698869e-06,
"loss": 1.1613845825195312,
"step": 608
},
{
"epoch": 1.8654434250764527,
"grad_norm": 0.47265625,
"learning_rate": 6.3820925043574074e-06,
"loss": 1.288172721862793,
"step": 610
},
{
"epoch": 1.8715596330275228,
"grad_norm": 1.2109375,
"learning_rate": 6.35975779691243e-06,
"loss": 1.2886998653411865,
"step": 612
},
{
"epoch": 1.8776758409785934,
"grad_norm": 0.72265625,
"learning_rate": 6.337401085968759e-06,
"loss": 1.286860466003418,
"step": 614
},
{
"epoch": 1.8837920489296636,
"grad_norm": 0.46484375,
"learning_rate": 6.3150229436943514e-06,
"loss": 1.2472259998321533,
"step": 616
},
{
"epoch": 1.889908256880734,
"grad_norm": 0.796875,
"learning_rate": 6.2926239428056456e-06,
"loss": 1.309545874595642,
"step": 618
},
{
"epoch": 1.8960244648318043,
"grad_norm": 0.546875,
"learning_rate": 6.270204656552908e-06,
"loss": 1.2884358167648315,
"step": 620
},
{
"epoch": 1.9021406727828745,
"grad_norm": 0.61328125,
"learning_rate": 6.247765658705564e-06,
"loss": 1.2543675899505615,
"step": 622
},
{
"epoch": 1.908256880733945,
"grad_norm": 0.3515625,
"learning_rate": 6.225307523537509e-06,
"loss": 1.1704795360565186,
"step": 624
},
{
"epoch": 1.9143730886850152,
"grad_norm": 0.921875,
"learning_rate": 6.2028308258124135e-06,
"loss": 1.362220048904419,
"step": 626
},
{
"epoch": 1.9204892966360856,
"grad_norm": 1.078125,
"learning_rate": 6.180336140769015e-06,
"loss": 1.3805466890335083,
"step": 628
},
{
"epoch": 1.926605504587156,
"grad_norm": 0.60546875,
"learning_rate": 6.157824044106394e-06,
"loss": 1.3186891078948975,
"step": 630
},
{
"epoch": 1.9327217125382263,
"grad_norm": 1.0625,
"learning_rate": 6.13529511196924e-06,
"loss": 1.1534855365753174,
"step": 632
},
{
"epoch": 1.9388379204892967,
"grad_norm": 0.80859375,
"learning_rate": 6.112749920933111e-06,
"loss": 1.2515051364898682,
"step": 634
},
{
"epoch": 1.9449541284403669,
"grad_norm": 1.4140625,
"learning_rate": 6.090189047989665e-06,
"loss": 1.4018653631210327,
"step": 636
},
{
"epoch": 1.9510703363914375,
"grad_norm": 0.5625,
"learning_rate": 6.067613070531912e-06,
"loss": 1.300402283668518,
"step": 638
},
{
"epoch": 1.9571865443425076,
"grad_norm": 0.6015625,
"learning_rate": 6.045022566339419e-06,
"loss": 1.3779313564300537,
"step": 640
},
{
"epoch": 1.963302752293578,
"grad_norm": 0.439453125,
"learning_rate": 6.022418113563536e-06,
"loss": 1.2664169073104858,
"step": 642
},
{
"epoch": 1.9694189602446484,
"grad_norm": 0.72265625,
"learning_rate": 5.999800290712594e-06,
"loss": 1.2255876064300537,
"step": 644
},
{
"epoch": 1.9755351681957185,
"grad_norm": 0.34375,
"learning_rate": 5.9771696766370965e-06,
"loss": 1.3016749620437622,
"step": 646
},
{
"epoch": 1.981651376146789,
"grad_norm": 0.443359375,
"learning_rate": 5.9545268505149114e-06,
"loss": 1.2409298419952393,
"step": 648
},
{
"epoch": 1.9877675840978593,
"grad_norm": 0.85546875,
"learning_rate": 5.931872391836446e-06,
"loss": 1.3296973705291748,
"step": 650
},
{
"epoch": 1.9938837920489296,
"grad_norm": 0.5703125,
"learning_rate": 5.909206880389813e-06,
"loss": 1.376185655593872,
"step": 652
},
{
"epoch": 2.0,
"grad_norm": 0.80078125,
"learning_rate": 5.8865308962459976e-06,
"loss": 1.2528204917907715,
"step": 654
},
{
"epoch": 2.00611620795107,
"grad_norm": 0.45703125,
"learning_rate": 5.863845019744007e-06,
"loss": 1.1687815189361572,
"step": 656
},
{
"epoch": 2.0122324159021407,
"grad_norm": 0.7734375,
"learning_rate": 5.841149831476024e-06,
"loss": 1.2196176052093506,
"step": 658
},
{
"epoch": 2.018348623853211,
"grad_norm": 0.4453125,
"learning_rate": 5.81844591227254e-06,
"loss": 1.261337399482727,
"step": 660
},
{
"epoch": 2.0244648318042815,
"grad_norm": 0.490234375,
"learning_rate": 5.795733843187496e-06,
"loss": 1.2313090562820435,
"step": 662
},
{
"epoch": 2.0305810397553516,
"grad_norm": 1.1640625,
"learning_rate": 5.773014205483414e-06,
"loss": 1.2076407670974731,
"step": 664
},
{
"epoch": 2.036697247706422,
"grad_norm": 0.5859375,
"learning_rate": 5.750287580616511e-06,
"loss": 1.1940546035766602,
"step": 666
},
{
"epoch": 2.0428134556574924,
"grad_norm": 1.3203125,
"learning_rate": 5.7275545502218274e-06,
"loss": 1.0421754121780396,
"step": 668
},
{
"epoch": 2.0489296636085625,
"grad_norm": 0.54296875,
"learning_rate": 5.704815696098337e-06,
"loss": 1.2445980310440063,
"step": 670
},
{
"epoch": 2.055045871559633,
"grad_norm": 0.50390625,
"learning_rate": 5.68207160019406e-06,
"loss": 1.2573648691177368,
"step": 672
},
{
"epoch": 2.0611620795107033,
"grad_norm": 0.498046875,
"learning_rate": 5.659322844591166e-06,
"loss": 1.3194655179977417,
"step": 674
},
{
"epoch": 2.067278287461774,
"grad_norm": 0.51171875,
"learning_rate": 5.636570011491082e-06,
"loss": 1.2315115928649902,
"step": 676
},
{
"epoch": 2.073394495412844,
"grad_norm": 0.486328125,
"learning_rate": 5.613813683199582e-06,
"loss": 1.1668107509613037,
"step": 678
},
{
"epoch": 2.079510703363914,
"grad_norm": 0.51953125,
"learning_rate": 5.591054442111901e-06,
"loss": 1.1951708793640137,
"step": 680
},
{
"epoch": 2.085626911314985,
"grad_norm": 0.384765625,
"learning_rate": 5.568292870697812e-06,
"loss": 1.1300991773605347,
"step": 682
},
{
"epoch": 2.091743119266055,
"grad_norm": 0.71484375,
"learning_rate": 5.545529551486731e-06,
"loss": 1.269416332244873,
"step": 684
},
{
"epoch": 2.0978593272171255,
"grad_norm": 0.81640625,
"learning_rate": 5.522765067052805e-06,
"loss": 1.1883726119995117,
"step": 686
},
{
"epoch": 2.1039755351681957,
"grad_norm": 0.482421875,
"learning_rate": 5.500000000000001e-06,
"loss": 1.172653317451477,
"step": 688
},
{
"epoch": 2.1100917431192663,
"grad_norm": 0.6796875,
"learning_rate": 5.477234932947196e-06,
"loss": 1.2290334701538086,
"step": 690
},
{
"epoch": 2.1162079510703364,
"grad_norm": 1.4453125,
"learning_rate": 5.45447044851327e-06,
"loss": 1.1191812753677368,
"step": 692
},
{
"epoch": 2.1223241590214066,
"grad_norm": 0.43359375,
"learning_rate": 5.431707129302188e-06,
"loss": 1.3137654066085815,
"step": 694
},
{
"epoch": 2.128440366972477,
"grad_norm": 0.734375,
"learning_rate": 5.4089455578881005e-06,
"loss": 1.1786179542541504,
"step": 696
},
{
"epoch": 2.1345565749235473,
"grad_norm": 0.419921875,
"learning_rate": 5.386186316800418e-06,
"loss": 1.1776201725006104,
"step": 698
},
{
"epoch": 2.140672782874618,
"grad_norm": 0.482421875,
"learning_rate": 5.36342998850892e-06,
"loss": 1.18330979347229,
"step": 700
},
{
"epoch": 2.146788990825688,
"grad_norm": 0.68359375,
"learning_rate": 5.340677155408835e-06,
"loss": 1.2524994611740112,
"step": 702
},
{
"epoch": 2.1529051987767582,
"grad_norm": 0.62109375,
"learning_rate": 5.317928399805943e-06,
"loss": 1.2536473274230957,
"step": 704
},
{
"epoch": 2.159021406727829,
"grad_norm": 0.46875,
"learning_rate": 5.295184303901665e-06,
"loss": 1.1864341497421265,
"step": 706
},
{
"epoch": 2.165137614678899,
"grad_norm": 0.439453125,
"learning_rate": 5.272445449778175e-06,
"loss": 1.2302113771438599,
"step": 708
},
{
"epoch": 2.1712538226299696,
"grad_norm": 0.447265625,
"learning_rate": 5.249712419383492e-06,
"loss": 1.153498888015747,
"step": 710
},
{
"epoch": 2.1773700305810397,
"grad_norm": 0.4765625,
"learning_rate": 5.226985794516587e-06,
"loss": 1.2334654331207275,
"step": 712
},
{
"epoch": 2.18348623853211,
"grad_norm": 0.4453125,
"learning_rate": 5.204266156812504e-06,
"loss": 1.1027376651763916,
"step": 714
},
{
"epoch": 2.1896024464831805,
"grad_norm": 4.0625,
"learning_rate": 5.181554087727462e-06,
"loss": 1.2752158641815186,
"step": 716
},
{
"epoch": 2.1957186544342506,
"grad_norm": 0.5625,
"learning_rate": 5.158850168523979e-06,
"loss": 1.2342238426208496,
"step": 718
},
{
"epoch": 2.2018348623853212,
"grad_norm": 0.7421875,
"learning_rate": 5.136154980255995e-06,
"loss": 1.2153668403625488,
"step": 720
},
{
"epoch": 2.2079510703363914,
"grad_norm": 0.6796875,
"learning_rate": 5.1134691037540055e-06,
"loss": 1.2085171937942505,
"step": 722
},
{
"epoch": 2.214067278287462,
"grad_norm": 0.80859375,
"learning_rate": 5.090793119610189e-06,
"loss": 1.230190634727478,
"step": 724
},
{
"epoch": 2.220183486238532,
"grad_norm": 0.65625,
"learning_rate": 5.068127608163557e-06,
"loss": 1.1547964811325073,
"step": 726
},
{
"epoch": 2.2262996941896023,
"grad_norm": 0.57421875,
"learning_rate": 5.045473149485091e-06,
"loss": 1.2784456014633179,
"step": 728
},
{
"epoch": 2.232415902140673,
"grad_norm": 0.734375,
"learning_rate": 5.022830323362905e-06,
"loss": 1.1994041204452515,
"step": 730
},
{
"epoch": 2.238532110091743,
"grad_norm": 0.59375,
"learning_rate": 5.000199709287408e-06,
"loss": 1.1957271099090576,
"step": 732
},
{
"epoch": 2.2446483180428136,
"grad_norm": 0.70703125,
"learning_rate": 4.9775818864364635e-06,
"loss": 1.2446789741516113,
"step": 734
},
{
"epoch": 2.2507645259938838,
"grad_norm": 0.486328125,
"learning_rate": 4.954977433660583e-06,
"loss": 1.1822783946990967,
"step": 736
},
{
"epoch": 2.2568807339449544,
"grad_norm": 0.60546875,
"learning_rate": 4.9323869294680915e-06,
"loss": 1.1413577795028687,
"step": 738
},
{
"epoch": 2.2629969418960245,
"grad_norm": 0.49609375,
"learning_rate": 4.909810952010336e-06,
"loss": 1.1892144680023193,
"step": 740
},
{
"epoch": 2.2691131498470947,
"grad_norm": 0.498046875,
"learning_rate": 4.887250079066892e-06,
"loss": 1.2589919567108154,
"step": 742
},
{
"epoch": 2.2752293577981653,
"grad_norm": 0.9375,
"learning_rate": 4.86470488803076e-06,
"loss": 1.2584980726242065,
"step": 744
},
{
"epoch": 2.2813455657492354,
"grad_norm": 0.435546875,
"learning_rate": 4.842175955893608e-06,
"loss": 1.1710209846496582,
"step": 746
},
{
"epoch": 2.287461773700306,
"grad_norm": 0.59765625,
"learning_rate": 4.819663859230986e-06,
"loss": 1.2968641519546509,
"step": 748
},
{
"epoch": 2.293577981651376,
"grad_norm": 1.2109375,
"learning_rate": 4.797169174187588e-06,
"loss": 1.198433756828308,
"step": 750
},
{
"epoch": 2.2996941896024463,
"grad_norm": 0.6875,
"learning_rate": 4.774692476462493e-06,
"loss": 1.296976089477539,
"step": 752
},
{
"epoch": 2.305810397553517,
"grad_norm": 0.63671875,
"learning_rate": 4.752234341294438e-06,
"loss": 1.2286152839660645,
"step": 754
},
{
"epoch": 2.311926605504587,
"grad_norm": 0.87890625,
"learning_rate": 4.729795343447093e-06,
"loss": 1.2850275039672852,
"step": 756
},
{
"epoch": 2.3180428134556577,
"grad_norm": 0.578125,
"learning_rate": 4.707376057194356e-06,
"loss": 1.2537508010864258,
"step": 758
},
{
"epoch": 2.324159021406728,
"grad_norm": 0.58984375,
"learning_rate": 4.68497705630565e-06,
"loss": 1.1948941946029663,
"step": 760
},
{
"epoch": 2.330275229357798,
"grad_norm": 0.51171875,
"learning_rate": 4.662598914031241e-06,
"loss": 1.2438340187072754,
"step": 762
},
{
"epoch": 2.3363914373088686,
"grad_norm": 0.62890625,
"learning_rate": 4.6402422030875704e-06,
"loss": 1.3103235960006714,
"step": 764
},
{
"epoch": 2.3425076452599387,
"grad_norm": 1.1328125,
"learning_rate": 4.617907495642594e-06,
"loss": 1.1827704906463623,
"step": 766
},
{
"epoch": 2.3486238532110093,
"grad_norm": 0.515625,
"learning_rate": 4.595595363301133e-06,
"loss": 1.1387625932693481,
"step": 768
},
{
"epoch": 2.3547400611620795,
"grad_norm": 0.68359375,
"learning_rate": 4.5733063770902595e-06,
"loss": 1.2371636629104614,
"step": 770
},
{
"epoch": 2.3608562691131496,
"grad_norm": 0.54296875,
"learning_rate": 4.551041107444671e-06,
"loss": 1.1606448888778687,
"step": 772
},
{
"epoch": 2.36697247706422,
"grad_norm": 0.60546875,
"learning_rate": 4.528800124192095e-06,
"loss": 1.3499796390533447,
"step": 774
},
{
"epoch": 2.3730886850152904,
"grad_norm": 0.388671875,
"learning_rate": 4.506583996538705e-06,
"loss": 1.1447316408157349,
"step": 776
},
{
"epoch": 2.379204892966361,
"grad_norm": 0.6640625,
"learning_rate": 4.484393293054553e-06,
"loss": 1.190900444984436,
"step": 778
},
{
"epoch": 2.385321100917431,
"grad_norm": 0.54296875,
"learning_rate": 4.462228581659019e-06,
"loss": 1.2503337860107422,
"step": 780
},
{
"epoch": 2.3914373088685017,
"grad_norm": 0.609375,
"learning_rate": 4.440090429606278e-06,
"loss": 1.1737557649612427,
"step": 782
},
{
"epoch": 2.397553516819572,
"grad_norm": 0.75390625,
"learning_rate": 4.417979403470778e-06,
"loss": 1.239940881729126,
"step": 784
},
{
"epoch": 2.4036697247706424,
"grad_norm": 0.419921875,
"learning_rate": 4.3958960691327425e-06,
"loss": 1.1777243614196777,
"step": 786
},
{
"epoch": 2.4097859327217126,
"grad_norm": 0.5859375,
"learning_rate": 4.373840991763686e-06,
"loss": 1.1661309003829956,
"step": 788
},
{
"epoch": 2.4159021406727827,
"grad_norm": 0.466796875,
"learning_rate": 4.3518147358119575e-06,
"loss": 1.2908847332000732,
"step": 790
},
{
"epoch": 2.4220183486238533,
"grad_norm": 0.58984375,
"learning_rate": 4.329817864988285e-06,
"loss": 1.261257290840149,
"step": 792
},
{
"epoch": 2.4281345565749235,
"grad_norm": 0.486328125,
"learning_rate": 4.307850942251351e-06,
"loss": 1.0505046844482422,
"step": 794
},
{
"epoch": 2.434250764525994,
"grad_norm": 0.68359375,
"learning_rate": 4.285914529793392e-06,
"loss": 1.260128378868103,
"step": 796
},
{
"epoch": 2.4403669724770642,
"grad_norm": 0.72265625,
"learning_rate": 4.2640091890257984e-06,
"loss": 1.336702823638916,
"step": 798
},
{
"epoch": 2.4464831804281344,
"grad_norm": 0.50390625,
"learning_rate": 4.242135480564756e-06,
"loss": 1.2336891889572144,
"step": 800
},
{
"epoch": 2.452599388379205,
"grad_norm": 0.75390625,
"learning_rate": 4.220293964216899e-06,
"loss": 1.1661975383758545,
"step": 802
},
{
"epoch": 2.458715596330275,
"grad_norm": 0.59765625,
"learning_rate": 4.198485198964971e-06,
"loss": 1.2408455610275269,
"step": 804
},
{
"epoch": 2.4648318042813457,
"grad_norm": 0.48828125,
"learning_rate": 4.176709742953536e-06,
"loss": 1.1344859600067139,
"step": 806
},
{
"epoch": 2.470948012232416,
"grad_norm": 5.84375,
"learning_rate": 4.15496815347468e-06,
"loss": 1.1564085483551025,
"step": 808
},
{
"epoch": 2.477064220183486,
"grad_norm": 0.69921875,
"learning_rate": 4.133260986953759e-06,
"loss": 1.3386648893356323,
"step": 810
},
{
"epoch": 2.4831804281345566,
"grad_norm": 0.423828125,
"learning_rate": 4.111588798935146e-06,
"loss": 1.1828325986862183,
"step": 812
},
{
"epoch": 2.489296636085627,
"grad_norm": 0.703125,
"learning_rate": 4.089952144068031e-06,
"loss": 1.1244158744812012,
"step": 814
},
{
"epoch": 2.4954128440366974,
"grad_norm": 0.68359375,
"learning_rate": 4.068351576092204e-06,
"loss": 1.2698228359222412,
"step": 816
},
{
"epoch": 2.5015290519877675,
"grad_norm": 0.74609375,
"learning_rate": 4.046787647823906e-06,
"loss": 1.2006717920303345,
"step": 818
},
{
"epoch": 2.5076452599388377,
"grad_norm": 1.375,
"learning_rate": 4.025260911141664e-06,
"loss": 1.217053771018982,
"step": 820
},
{
"epoch": 2.5137614678899083,
"grad_norm": 0.53125,
"learning_rate": 4.003771916972171e-06,
"loss": 1.2399015426635742,
"step": 822
},
{
"epoch": 2.5198776758409784,
"grad_norm": 0.59765625,
"learning_rate": 3.982321215276195e-06,
"loss": 1.1872673034667969,
"step": 824
},
{
"epoch": 2.525993883792049,
"grad_norm": 1.2421875,
"learning_rate": 3.960909355034491e-06,
"loss": 1.2071783542633057,
"step": 826
},
{
"epoch": 2.532110091743119,
"grad_norm": 0.58984375,
"learning_rate": 3.939536884233762e-06,
"loss": 1.2099813222885132,
"step": 828
},
{
"epoch": 2.5382262996941893,
"grad_norm": 0.8203125,
"learning_rate": 3.918204349852626e-06,
"loss": 1.2038205862045288,
"step": 830
},
{
"epoch": 2.54434250764526,
"grad_norm": 1.1015625,
"learning_rate": 3.896912297847626e-06,
"loss": 1.1809529066085815,
"step": 832
},
{
"epoch": 2.5504587155963305,
"grad_norm": 0.57421875,
"learning_rate": 3.875661273139246e-06,
"loss": 1.1591264009475708,
"step": 834
},
{
"epoch": 2.5565749235474007,
"grad_norm": 0.8359375,
"learning_rate": 3.854451819597981e-06,
"loss": 1.0593103170394897,
"step": 836
},
{
"epoch": 2.562691131498471,
"grad_norm": 0.466796875,
"learning_rate": 3.833284480030401e-06,
"loss": 1.2778112888336182,
"step": 838
},
{
"epoch": 2.5688073394495414,
"grad_norm": 0.6015625,
"learning_rate": 3.81215979616527e-06,
"loss": 1.153441309928894,
"step": 840
},
{
"epoch": 2.5749235474006116,
"grad_norm": 0.71875,
"learning_rate": 3.79107830863968e-06,
"loss": 1.251842975616455,
"step": 842
},
{
"epoch": 2.581039755351682,
"grad_norm": 0.77734375,
"learning_rate": 3.7700405569852082e-06,
"loss": 1.1608760356903076,
"step": 844
},
{
"epoch": 2.5871559633027523,
"grad_norm": 0.96875,
"learning_rate": 3.749047079614121e-06,
"loss": 1.1455830335617065,
"step": 846
},
{
"epoch": 2.5932721712538225,
"grad_norm": 0.390625,
"learning_rate": 3.7280984138055842e-06,
"loss": 1.201966643333435,
"step": 848
},
{
"epoch": 2.599388379204893,
"grad_norm": 0.62890625,
"learning_rate": 3.707195095691913e-06,
"loss": 1.232427954673767,
"step": 850
},
{
"epoch": 2.6055045871559632,
"grad_norm": 2.890625,
"learning_rate": 3.6863376602448607e-06,
"loss": 1.257423758506775,
"step": 852
},
{
"epoch": 2.611620795107034,
"grad_norm": 0.50390625,
"learning_rate": 3.665526641261914e-06,
"loss": 1.154307246208191,
"step": 854
},
{
"epoch": 2.617737003058104,
"grad_norm": 0.56640625,
"learning_rate": 3.6447625713526415e-06,
"loss": 1.3352923393249512,
"step": 856
},
{
"epoch": 2.623853211009174,
"grad_norm": 0.59765625,
"learning_rate": 3.6240459819250605e-06,
"loss": 1.2940092086791992,
"step": 858
},
{
"epoch": 2.6299694189602447,
"grad_norm": 1.984375,
"learning_rate": 3.603377403172035e-06,
"loss": 1.3129587173461914,
"step": 860
},
{
"epoch": 2.636085626911315,
"grad_norm": 0.66015625,
"learning_rate": 3.582757364057704e-06,
"loss": 1.1294050216674805,
"step": 862
},
{
"epoch": 2.6422018348623855,
"grad_norm": 1.0625,
"learning_rate": 3.5621863923039533e-06,
"loss": 1.1990245580673218,
"step": 864
},
{
"epoch": 2.6483180428134556,
"grad_norm": 0.609375,
"learning_rate": 3.5416650143768994e-06,
"loss": 1.1870311498641968,
"step": 866
},
{
"epoch": 2.6544342507645258,
"grad_norm": 0.44921875,
"learning_rate": 3.5211937554734234e-06,
"loss": 1.199330449104309,
"step": 868
},
{
"epoch": 2.6605504587155964,
"grad_norm": 0.609375,
"learning_rate": 3.5007731395077273e-06,
"loss": 1.247740387916565,
"step": 870
},
{
"epoch": 2.6666666666666665,
"grad_norm": 0.447265625,
"learning_rate": 3.4804036890979207e-06,
"loss": 1.1606550216674805,
"step": 872
},
{
"epoch": 2.672782874617737,
"grad_norm": 0.48046875,
"learning_rate": 3.460085925552653e-06,
"loss": 1.1833080053329468,
"step": 874
},
{
"epoch": 2.6788990825688073,
"grad_norm": 0.8984375,
"learning_rate": 3.439820368857768e-06,
"loss": 1.199750304222107,
"step": 876
},
{
"epoch": 2.6850152905198774,
"grad_norm": 0.69140625,
"learning_rate": 3.4196075376629976e-06,
"loss": 1.1525050401687622,
"step": 878
},
{
"epoch": 2.691131498470948,
"grad_norm": 1.7578125,
"learning_rate": 3.3994479492686867e-06,
"loss": 1.251511812210083,
"step": 880
},
{
"epoch": 2.6972477064220186,
"grad_norm": 1.1640625,
"learning_rate": 3.379342119612553e-06,
"loss": 1.2259825468063354,
"step": 882
},
{
"epoch": 2.7033639143730888,
"grad_norm": 0.39453125,
"learning_rate": 3.3592905632564874e-06,
"loss": 1.2564154863357544,
"step": 884
},
{
"epoch": 2.709480122324159,
"grad_norm": 0.75,
"learning_rate": 3.3392937933733804e-06,
"loss": 1.1841342449188232,
"step": 886
},
{
"epoch": 2.7155963302752295,
"grad_norm": 0.76953125,
"learning_rate": 3.319352321733989e-06,
"loss": 1.194476842880249,
"step": 888
},
{
"epoch": 2.7217125382262997,
"grad_norm": 0.59765625,
"learning_rate": 3.2994666586938473e-06,
"loss": 1.254859209060669,
"step": 890
},
{
"epoch": 2.7278287461773703,
"grad_norm": 0.58203125,
"learning_rate": 3.2796373131801873e-06,
"loss": 1.255743384361267,
"step": 892
},
{
"epoch": 2.7339449541284404,
"grad_norm": 0.86328125,
"learning_rate": 3.259864792678933e-06,
"loss": 1.2186676263809204,
"step": 894
},
{
"epoch": 2.7400611620795106,
"grad_norm": 0.6015625,
"learning_rate": 3.2401496032217017e-06,
"loss": 1.3046661615371704,
"step": 896
},
{
"epoch": 2.746177370030581,
"grad_norm": 0.50390625,
"learning_rate": 3.2204922493728576e-06,
"loss": 1.3042587041854858,
"step": 898
},
{
"epoch": 2.7522935779816513,
"grad_norm": 0.5234375,
"learning_rate": 3.200893234216596e-06,
"loss": 1.179953694343567,
"step": 900
},
{
"epoch": 2.758409785932722,
"grad_norm": 0.515625,
"learning_rate": 3.1813530593440693e-06,
"loss": 1.2110344171524048,
"step": 902
},
{
"epoch": 2.764525993883792,
"grad_norm": 0.8984375,
"learning_rate": 3.1618722248405504e-06,
"loss": 1.155335783958435,
"step": 904
},
{
"epoch": 2.770642201834862,
"grad_norm": 0.578125,
"learning_rate": 3.1424512292726315e-06,
"loss": 1.1856063604354858,
"step": 906
},
{
"epoch": 2.776758409785933,
"grad_norm": 1.1796875,
"learning_rate": 3.123090569675472e-06,
"loss": 1.142336368560791,
"step": 908
},
{
"epoch": 2.782874617737003,
"grad_norm": 0.93359375,
"learning_rate": 3.1037907415400674e-06,
"loss": 1.143799066543579,
"step": 910
},
{
"epoch": 2.7889908256880735,
"grad_norm": 1.0,
"learning_rate": 3.0845522388005756e-06,
"loss": 1.2811185121536255,
"step": 912
},
{
"epoch": 2.7951070336391437,
"grad_norm": 0.81640625,
"learning_rate": 3.0653755538216724e-06,
"loss": 1.099307894706726,
"step": 914
},
{
"epoch": 2.801223241590214,
"grad_norm": 0.58984375,
"learning_rate": 3.046261177385954e-06,
"loss": 1.1932672262191772,
"step": 916
},
{
"epoch": 2.8073394495412844,
"grad_norm": 0.5859375,
"learning_rate": 3.027209598681373e-06,
"loss": 1.1614950895309448,
"step": 918
},
{
"epoch": 2.8134556574923546,
"grad_norm": 1.625,
"learning_rate": 3.008221305288722e-06,
"loss": 1.3066401481628418,
"step": 920
},
{
"epoch": 2.819571865443425,
"grad_norm": 0.71484375,
"learning_rate": 2.9892967831691506e-06,
"loss": 1.261734962463379,
"step": 922
},
{
"epoch": 2.8256880733944953,
"grad_norm": 0.87109375,
"learning_rate": 2.9704365166517337e-06,
"loss": 1.2576831579208374,
"step": 924
},
{
"epoch": 2.8318042813455655,
"grad_norm": 0.578125,
"learning_rate": 2.9516409884210726e-06,
"loss": 1.1941940784454346,
"step": 926
},
{
"epoch": 2.837920489296636,
"grad_norm": 0.78515625,
"learning_rate": 2.9329106795049445e-06,
"loss": 1.2333204746246338,
"step": 928
},
{
"epoch": 2.8440366972477067,
"grad_norm": 0.5234375,
"learning_rate": 2.914246069261988e-06,
"loss": 1.2176916599273682,
"step": 930
},
{
"epoch": 2.850152905198777,
"grad_norm": 0.75390625,
"learning_rate": 2.8956476353694368e-06,
"loss": 1.2780966758728027,
"step": 932
},
{
"epoch": 2.856269113149847,
"grad_norm": 1.09375,
"learning_rate": 2.877115853810898e-06,
"loss": 1.2115226984024048,
"step": 934
},
{
"epoch": 2.8623853211009176,
"grad_norm": 0.65625,
"learning_rate": 2.8586511988641634e-06,
"loss": 1.189244031906128,
"step": 936
},
{
"epoch": 2.8685015290519877,
"grad_norm": 0.56640625,
"learning_rate": 2.8402541430890794e-06,
"loss": 1.2004551887512207,
"step": 938
},
{
"epoch": 2.8746177370030583,
"grad_norm": 0.84375,
"learning_rate": 2.821925157315447e-06,
"loss": 1.228663682937622,
"step": 940
},
{
"epoch": 2.8807339449541285,
"grad_norm": 0.5703125,
"learning_rate": 2.8036647106309744e-06,
"loss": 1.2045689821243286,
"step": 942
},
{
"epoch": 2.8868501529051986,
"grad_norm": 0.5859375,
"learning_rate": 2.78547327036927e-06,
"loss": 1.2150650024414062,
"step": 944
},
{
"epoch": 2.8929663608562692,
"grad_norm": 0.65625,
"learning_rate": 2.767351302097887e-06,
"loss": 1.199387788772583,
"step": 946
},
{
"epoch": 2.8990825688073394,
"grad_norm": 0.62109375,
"learning_rate": 2.7492992696064013e-06,
"loss": 1.2142434120178223,
"step": 948
},
{
"epoch": 2.90519877675841,
"grad_norm": 0.76953125,
"learning_rate": 2.731317634894548e-06,
"loss": 1.2693067789077759,
"step": 950
},
{
"epoch": 2.91131498470948,
"grad_norm": 1.4921875,
"learning_rate": 2.7134068581603936e-06,
"loss": 1.2424131631851196,
"step": 952
},
{
"epoch": 2.9174311926605503,
"grad_norm": 0.625,
"learning_rate": 2.6955673977885566e-06,
"loss": 1.2381134033203125,
"step": 954
},
{
"epoch": 2.923547400611621,
"grad_norm": 0.8046875,
"learning_rate": 2.677799710338486e-06,
"loss": 1.2375258207321167,
"step": 956
},
{
"epoch": 2.929663608562691,
"grad_norm": 1.125,
"learning_rate": 2.660104250532764e-06,
"loss": 1.129172921180725,
"step": 958
},
{
"epoch": 2.9357798165137616,
"grad_norm": 0.5,
"learning_rate": 2.6424814712454773e-06,
"loss": 1.1203192472457886,
"step": 960
},
{
"epoch": 2.941896024464832,
"grad_norm": 1.046875,
"learning_rate": 2.624931823490625e-06,
"loss": 1.2383675575256348,
"step": 962
},
{
"epoch": 2.948012232415902,
"grad_norm": 0.6171875,
"learning_rate": 2.607455756410573e-06,
"loss": 1.1556285619735718,
"step": 964
},
{
"epoch": 2.9541284403669725,
"grad_norm": 0.90625,
"learning_rate": 2.5900537172645624e-06,
"loss": 1.211835503578186,
"step": 966
},
{
"epoch": 2.9602446483180427,
"grad_norm": 0.8046875,
"learning_rate": 2.5727261514172586e-06,
"loss": 1.1909599304199219,
"step": 968
},
{
"epoch": 2.9663608562691133,
"grad_norm": 0.7265625,
"learning_rate": 2.55547350232736e-06,
"loss": 1.2073407173156738,
"step": 970
},
{
"epoch": 2.9724770642201834,
"grad_norm": 0.498046875,
"learning_rate": 2.5382962115362454e-06,
"loss": 1.202832818031311,
"step": 972
},
{
"epoch": 2.9785932721712536,
"grad_norm": 0.62109375,
"learning_rate": 2.521194718656669e-06,
"loss": 1.2078254222869873,
"step": 974
},
{
"epoch": 2.984709480122324,
"grad_norm": 0.498046875,
"learning_rate": 2.504169461361518e-06,
"loss": 1.1780730485916138,
"step": 976
},
{
"epoch": 2.9908256880733948,
"grad_norm": 1.0546875,
"learning_rate": 2.487220875372606e-06,
"loss": 1.1677711009979248,
"step": 978
},
{
"epoch": 2.996941896024465,
"grad_norm": 0.984375,
"learning_rate": 2.470349394449524e-06,
"loss": 1.2224700450897217,
"step": 980
},
{
"epoch": 3.003058103975535,
"grad_norm": 0.5859375,
"learning_rate": 2.453555450378535e-06,
"loss": 1.2254760265350342,
"step": 982
},
{
"epoch": 3.0091743119266057,
"grad_norm": 1.359375,
"learning_rate": 2.436839472961534e-06,
"loss": 1.235056757926941,
"step": 984
},
{
"epoch": 3.015290519877676,
"grad_norm": 0.53125,
"learning_rate": 2.4202018900050327e-06,
"loss": 1.2022202014923096,
"step": 986
},
{
"epoch": 3.021406727828746,
"grad_norm": 0.470703125,
"learning_rate": 2.4036431273092238e-06,
"loss": 1.2913790941238403,
"step": 988
},
{
"epoch": 3.0275229357798166,
"grad_norm": 0.6640625,
"learning_rate": 2.387163608657078e-06,
"loss": 1.2257859706878662,
"step": 990
},
{
"epoch": 3.0336391437308867,
"grad_norm": 0.453125,
"learning_rate": 2.3707637558034994e-06,
"loss": 1.173649787902832,
"step": 992
},
{
"epoch": 3.0397553516819573,
"grad_norm": 0.89453125,
"learning_rate": 2.3544439884645317e-06,
"loss": 1.2261406183242798,
"step": 994
},
{
"epoch": 3.0458715596330275,
"grad_norm": 1.421875,
"learning_rate": 2.3382047243066163e-06,
"loss": 1.132150650024414,
"step": 996
},
{
"epoch": 3.051987767584098,
"grad_norm": 0.470703125,
"learning_rate": 2.3220463789359014e-06,
"loss": 1.1366033554077148,
"step": 998
},
{
"epoch": 3.058103975535168,
"grad_norm": 0.953125,
"learning_rate": 2.30596936588761e-06,
"loss": 1.1769903898239136,
"step": 1000
},
{
"epoch": 3.0642201834862384,
"grad_norm": 0.427734375,
"learning_rate": 2.2899740966154526e-06,
"loss": 1.2203010320663452,
"step": 1002
},
{
"epoch": 3.070336391437309,
"grad_norm": 0.578125,
"learning_rate": 2.274060980481098e-06,
"loss": 1.077088475227356,
"step": 1004
},
{
"epoch": 3.076452599388379,
"grad_norm": 0.384765625,
"learning_rate": 2.2582304247436963e-06,
"loss": 1.177517056465149,
"step": 1006
},
{
"epoch": 3.0825688073394497,
"grad_norm": 0.408203125,
"learning_rate": 2.2424828345494575e-06,
"loss": 1.0615501403808594,
"step": 1008
},
{
"epoch": 3.08868501529052,
"grad_norm": 0.51953125,
"learning_rate": 2.226818612921281e-06,
"loss": 1.257022738456726,
"step": 1010
},
{
"epoch": 3.09480122324159,
"grad_norm": 0.57421875,
"learning_rate": 2.2112381607484417e-06,
"loss": 1.3333863019943237,
"step": 1012
},
{
"epoch": 3.1009174311926606,
"grad_norm": 0.53125,
"learning_rate": 2.195741876776331e-06,
"loss": 1.116982102394104,
"step": 1014
},
{
"epoch": 3.1070336391437308,
"grad_norm": 0.484375,
"learning_rate": 2.180330157596251e-06,
"loss": 1.1025663614273071,
"step": 1016
},
{
"epoch": 3.1131498470948014,
"grad_norm": 0.84375,
"learning_rate": 2.1650033976352645e-06,
"loss": 1.1931098699569702,
"step": 1018
},
{
"epoch": 3.1192660550458715,
"grad_norm": 0.58984375,
"learning_rate": 2.1497619891461016e-06,
"loss": 1.2750816345214844,
"step": 1020
},
{
"epoch": 3.1253822629969417,
"grad_norm": 0.578125,
"learning_rate": 2.134606322197119e-06,
"loss": 1.200748324394226,
"step": 1022
},
{
"epoch": 3.1314984709480123,
"grad_norm": 0.9375,
"learning_rate": 2.119536784662321e-06,
"loss": 1.1820026636123657,
"step": 1024
},
{
"epoch": 3.1376146788990824,
"grad_norm": 0.50390625,
"learning_rate": 2.1045537622114265e-06,
"loss": 1.072840929031372,
"step": 1026
},
{
"epoch": 3.143730886850153,
"grad_norm": 0.7265625,
"learning_rate": 2.089657638300005e-06,
"loss": 1.1731314659118652,
"step": 1028
},
{
"epoch": 3.149847094801223,
"grad_norm": 0.55859375,
"learning_rate": 2.0748487941596596e-06,
"loss": 1.1329575777053833,
"step": 1030
},
{
"epoch": 3.1559633027522938,
"grad_norm": 0.6328125,
"learning_rate": 2.06012760878827e-06,
"loss": 1.2068923711776733,
"step": 1032
},
{
"epoch": 3.162079510703364,
"grad_norm": 0.5703125,
"learning_rate": 2.045494458940295e-06,
"loss": 1.1488394737243652,
"step": 1034
},
{
"epoch": 3.168195718654434,
"grad_norm": 0.65625,
"learning_rate": 2.0309497191171285e-06,
"loss": 1.1287355422973633,
"step": 1036
},
{
"epoch": 3.1743119266055047,
"grad_norm": 0.453125,
"learning_rate": 2.0164937615575148e-06,
"loss": 1.182981014251709,
"step": 1038
},
{
"epoch": 3.180428134556575,
"grad_norm": 0.66015625,
"learning_rate": 2.002126956228026e-06,
"loss": 1.159349799156189,
"step": 1040
},
{
"epoch": 3.1865443425076454,
"grad_norm": 0.89453125,
"learning_rate": 1.9878496708135885e-06,
"loss": 1.1993876695632935,
"step": 1042
},
{
"epoch": 3.1926605504587156,
"grad_norm": 0.435546875,
"learning_rate": 1.973662270708074e-06,
"loss": 1.1298656463623047,
"step": 1044
},
{
"epoch": 3.198776758409786,
"grad_norm": 0.5078125,
"learning_rate": 1.959565119004951e-06,
"loss": 1.1985409259796143,
"step": 1046
},
{
"epoch": 3.2048929663608563,
"grad_norm": 0.9453125,
"learning_rate": 1.9455585764879877e-06,
"loss": 1.1955678462982178,
"step": 1048
},
{
"epoch": 3.2110091743119265,
"grad_norm": 0.5703125,
"learning_rate": 1.9316430016220223e-06,
"loss": 1.1202224493026733,
"step": 1050
},
{
"epoch": 3.217125382262997,
"grad_norm": 0.72265625,
"learning_rate": 1.91781875054379e-06,
"loss": 1.157238245010376,
"step": 1052
},
{
"epoch": 3.223241590214067,
"grad_norm": 1.0390625,
"learning_rate": 1.9040861770528047e-06,
"loss": 1.1316120624542236,
"step": 1054
},
{
"epoch": 3.229357798165138,
"grad_norm": 0.58984375,
"learning_rate": 1.890445632602303e-06,
"loss": 1.1989833116531372,
"step": 1056
},
{
"epoch": 3.235474006116208,
"grad_norm": 1.828125,
"learning_rate": 1.876897466290259e-06,
"loss": 1.222222089767456,
"step": 1058
},
{
"epoch": 3.241590214067278,
"grad_norm": 0.63671875,
"learning_rate": 1.8634420248504382e-06,
"loss": 1.2111024856567383,
"step": 1060
},
{
"epoch": 3.2477064220183487,
"grad_norm": 0.5859375,
"learning_rate": 1.8500796526435305e-06,
"loss": 1.172393560409546,
"step": 1062
},
{
"epoch": 3.253822629969419,
"grad_norm": 2.328125,
"learning_rate": 1.8368106916483358e-06,
"loss": 1.1863235235214233,
"step": 1064
},
{
"epoch": 3.2599388379204894,
"grad_norm": 0.53125,
"learning_rate": 1.8236354814530113e-06,
"loss": 1.29865562915802,
"step": 1066
},
{
"epoch": 3.2660550458715596,
"grad_norm": 0.5,
"learning_rate": 1.8105543592463803e-06,
"loss": 1.261027455329895,
"step": 1068
},
{
"epoch": 3.2721712538226297,
"grad_norm": 0.421875,
"learning_rate": 1.7975676598093042e-06,
"loss": 1.2421050071716309,
"step": 1070
},
{
"epoch": 3.2782874617737003,
"grad_norm": 1.2578125,
"learning_rate": 1.784675715506113e-06,
"loss": 1.274834394454956,
"step": 1072
},
{
"epoch": 3.2844036697247705,
"grad_norm": 0.87890625,
"learning_rate": 1.7718788562760992e-06,
"loss": 1.2069604396820068,
"step": 1074
},
{
"epoch": 3.290519877675841,
"grad_norm": 0.63671875,
"learning_rate": 1.7591774096250736e-06,
"loss": 1.1289021968841553,
"step": 1076
},
{
"epoch": 3.2966360856269112,
"grad_norm": 0.79296875,
"learning_rate": 1.7465717006169887e-06,
"loss": 1.2350070476531982,
"step": 1078
},
{
"epoch": 3.302752293577982,
"grad_norm": 0.69921875,
"learning_rate": 1.734062051865609e-06,
"loss": 1.1916759014129639,
"step": 1080
},
{
"epoch": 3.308868501529052,
"grad_norm": 0.5390625,
"learning_rate": 1.7216487835262635e-06,
"loss": 1.1767183542251587,
"step": 1082
},
{
"epoch": 3.314984709480122,
"grad_norm": 1.0859375,
"learning_rate": 1.7093322132876485e-06,
"loss": 1.1724700927734375,
"step": 1084
},
{
"epoch": 3.3211009174311927,
"grad_norm": 0.494140625,
"learning_rate": 1.6971126563636977e-06,
"loss": 1.1266517639160156,
"step": 1086
},
{
"epoch": 3.327217125382263,
"grad_norm": 0.61328125,
"learning_rate": 1.6849904254855151e-06,
"loss": 1.211061716079712,
"step": 1088
},
{
"epoch": 3.3333333333333335,
"grad_norm": 1.609375,
"learning_rate": 1.6729658308933706e-06,
"loss": 1.2213722467422485,
"step": 1090
},
{
"epoch": 3.3394495412844036,
"grad_norm": 0.419921875,
"learning_rate": 1.6610391803287611e-06,
"loss": 1.1516450643539429,
"step": 1092
},
{
"epoch": 3.3455657492354742,
"grad_norm": 1.03125,
"learning_rate": 1.6492107790265338e-06,
"loss": 1.1679214239120483,
"step": 1094
},
{
"epoch": 3.3516819571865444,
"grad_norm": 0.796875,
"learning_rate": 1.6374809297070766e-06,
"loss": 1.2308049201965332,
"step": 1096
},
{
"epoch": 3.3577981651376145,
"grad_norm": 0.53125,
"learning_rate": 1.6258499325685673e-06,
"loss": 1.181188941001892,
"step": 1098
},
{
"epoch": 3.363914373088685,
"grad_norm": 0.9609375,
"learning_rate": 1.6143180852792911e-06,
"loss": 1.2255089282989502,
"step": 1100
},
{
"epoch": 3.3700305810397553,
"grad_norm": 0.68359375,
"learning_rate": 1.602885682970026e-06,
"loss": 1.2569115161895752,
"step": 1102
},
{
"epoch": 3.376146788990826,
"grad_norm": 0.92578125,
"learning_rate": 1.5915530182264868e-06,
"loss": 1.2300969362258911,
"step": 1104
},
{
"epoch": 3.382262996941896,
"grad_norm": 2.84375,
"learning_rate": 1.5803203810818366e-06,
"loss": 1.2431167364120483,
"step": 1106
},
{
"epoch": 3.388379204892966,
"grad_norm": 0.7890625,
"learning_rate": 1.5691880590092671e-06,
"loss": 1.2489876747131348,
"step": 1108
},
{
"epoch": 3.3944954128440368,
"grad_norm": 0.5703125,
"learning_rate": 1.558156336914634e-06,
"loss": 1.1550531387329102,
"step": 1110
},
{
"epoch": 3.400611620795107,
"grad_norm": 0.62890625,
"learning_rate": 1.547225497129179e-06,
"loss": 1.1249154806137085,
"step": 1112
},
{
"epoch": 3.4067278287461775,
"grad_norm": 1.84375,
"learning_rate": 1.5363958194022896e-06,
"loss": 1.1979522705078125,
"step": 1114
},
{
"epoch": 3.4128440366972477,
"grad_norm": 0.89453125,
"learning_rate": 1.5256675808943488e-06,
"loss": 1.1001931428909302,
"step": 1116
},
{
"epoch": 3.418960244648318,
"grad_norm": 0.6484375,
"learning_rate": 1.5150410561696382e-06,
"loss": 1.1855971813201904,
"step": 1118
},
{
"epoch": 3.4250764525993884,
"grad_norm": 0.69140625,
"learning_rate": 1.5045165171893117e-06,
"loss": 1.197637677192688,
"step": 1120
},
{
"epoch": 3.4311926605504586,
"grad_norm": 0.61328125,
"learning_rate": 1.4940942333044367e-06,
"loss": 1.1402236223220825,
"step": 1122
},
{
"epoch": 3.437308868501529,
"grad_norm": 0.50390625,
"learning_rate": 1.4837744712490983e-06,
"loss": 1.1059956550598145,
"step": 1124
},
{
"epoch": 3.4434250764525993,
"grad_norm": 0.7734375,
"learning_rate": 1.4735574951335752e-06,
"loss": 1.1585502624511719,
"step": 1126
},
{
"epoch": 3.44954128440367,
"grad_norm": 0.55859375,
"learning_rate": 1.4634435664375784e-06,
"loss": 1.2298681735992432,
"step": 1128
},
{
"epoch": 3.45565749235474,
"grad_norm": 0.4765625,
"learning_rate": 1.4534329440035599e-06,
"loss": 1.1276212930679321,
"step": 1130
},
{
"epoch": 3.46177370030581,
"grad_norm": 0.7421875,
"learning_rate": 1.4435258840300897e-06,
"loss": 1.1073015928268433,
"step": 1132
},
{
"epoch": 3.467889908256881,
"grad_norm": 0.5703125,
"learning_rate": 1.4337226400652977e-06,
"loss": 1.1824053525924683,
"step": 1134
},
{
"epoch": 3.474006116207951,
"grad_norm": 1.2265625,
"learning_rate": 1.424023463000384e-06,
"loss": 1.2478643655776978,
"step": 1136
},
{
"epoch": 3.4801223241590216,
"grad_norm": 0.458984375,
"learning_rate": 1.4144286010631993e-06,
"loss": 1.2114766836166382,
"step": 1138
},
{
"epoch": 3.4862385321100917,
"grad_norm": 0.57421875,
"learning_rate": 1.4049382998118919e-06,
"loss": 1.2164137363433838,
"step": 1140
},
{
"epoch": 3.4923547400611623,
"grad_norm": 0.8046875,
"learning_rate": 1.3955528021286208e-06,
"loss": 1.115936517715454,
"step": 1142
},
{
"epoch": 3.4984709480122325,
"grad_norm": 0.7421875,
"learning_rate": 1.3862723482133437e-06,
"loss": 1.1582000255584717,
"step": 1144
},
{
"epoch": 3.5045871559633026,
"grad_norm": 1.203125,
"learning_rate": 1.3770971755776667e-06,
"loss": 1.1616395711898804,
"step": 1146
},
{
"epoch": 3.510703363914373,
"grad_norm": 0.73828125,
"learning_rate": 1.3680275190387677e-06,
"loss": 1.20869779586792,
"step": 1148
},
{
"epoch": 3.5168195718654434,
"grad_norm": 0.5859375,
"learning_rate": 1.3590636107133849e-06,
"loss": 1.2474617958068848,
"step": 1150
},
{
"epoch": 3.522935779816514,
"grad_norm": 1.2578125,
"learning_rate": 1.3502056800118784e-06,
"loss": 1.2327600717544556,
"step": 1152
},
{
"epoch": 3.529051987767584,
"grad_norm": 0.46484375,
"learning_rate": 1.3414539536323568e-06,
"loss": 1.1355574131011963,
"step": 1154
},
{
"epoch": 3.5351681957186543,
"grad_norm": 1.0625,
"learning_rate": 1.3328086555548764e-06,
"loss": 1.1376428604125977,
"step": 1156
},
{
"epoch": 3.541284403669725,
"grad_norm": 0.48828125,
"learning_rate": 1.3242700070357098e-06,
"loss": 1.128600001335144,
"step": 1158
},
{
"epoch": 3.547400611620795,
"grad_norm": 1.65625,
"learning_rate": 1.3158382266016803e-06,
"loss": 1.2273775339126587,
"step": 1160
},
{
"epoch": 3.5535168195718656,
"grad_norm": 0.53125,
"learning_rate": 1.3075135300445746e-06,
"loss": 1.1972393989562988,
"step": 1162
},
{
"epoch": 3.5596330275229358,
"grad_norm": 0.62890625,
"learning_rate": 1.2992961304156146e-06,
"loss": 1.2698583602905273,
"step": 1164
},
{
"epoch": 3.565749235474006,
"grad_norm": 0.54296875,
"learning_rate": 1.2911862380200076e-06,
"loss": 1.215325117111206,
"step": 1166
},
{
"epoch": 3.5718654434250765,
"grad_norm": 0.53125,
"learning_rate": 1.2831840604115647e-06,
"loss": 1.1836117506027222,
"step": 1168
},
{
"epoch": 3.5779816513761467,
"grad_norm": 1.0859375,
"learning_rate": 1.2752898023873873e-06,
"loss": 1.1673725843429565,
"step": 1170
},
{
"epoch": 3.5840978593272173,
"grad_norm": 0.6171875,
"learning_rate": 1.2675036659826251e-06,
"loss": 1.1013611555099487,
"step": 1172
},
{
"epoch": 3.5902140672782874,
"grad_norm": 0.55859375,
"learning_rate": 1.2598258504653082e-06,
"loss": 1.2070239782333374,
"step": 1174
},
{
"epoch": 3.5963302752293576,
"grad_norm": 0.5234375,
"learning_rate": 1.2522565523312456e-06,
"loss": 1.1760621070861816,
"step": 1176
},
{
"epoch": 3.602446483180428,
"grad_norm": 1.0625,
"learning_rate": 1.2447959652989963e-06,
"loss": 1.2546082735061646,
"step": 1178
},
{
"epoch": 3.6085626911314987,
"grad_norm": 1.40625,
"learning_rate": 1.2374442803049125e-06,
"loss": 1.11211359500885,
"step": 1180
},
{
"epoch": 3.614678899082569,
"grad_norm": 0.67578125,
"learning_rate": 1.2302016854982504e-06,
"loss": 1.1653016805648804,
"step": 1182
},
{
"epoch": 3.620795107033639,
"grad_norm": 0.76953125,
"learning_rate": 1.2230683662363599e-06,
"loss": 1.0931107997894287,
"step": 1184
},
{
"epoch": 3.6269113149847096,
"grad_norm": 0.53125,
"learning_rate": 1.2160445050799346e-06,
"loss": 1.1593706607818604,
"step": 1186
},
{
"epoch": 3.63302752293578,
"grad_norm": 0.5859375,
"learning_rate": 1.2091302817883444e-06,
"loss": 1.2466744184494019,
"step": 1188
},
{
"epoch": 3.6391437308868504,
"grad_norm": 0.69921875,
"learning_rate": 1.2023258733150345e-06,
"loss": 1.1520183086395264,
"step": 1190
},
{
"epoch": 3.6452599388379205,
"grad_norm": 0.6875,
"learning_rate": 1.195631453802994e-06,
"loss": 1.1501617431640625,
"step": 1192
},
{
"epoch": 3.6513761467889907,
"grad_norm": 0.5078125,
"learning_rate": 1.1890471945803e-06,
"loss": 1.1947115659713745,
"step": 1194
},
{
"epoch": 3.6574923547400613,
"grad_norm": 0.82421875,
"learning_rate": 1.1825732641557358e-06,
"loss": 1.090171217918396,
"step": 1196
},
{
"epoch": 3.6636085626911314,
"grad_norm": 0.486328125,
"learning_rate": 1.1762098282144735e-06,
"loss": 1.231759786605835,
"step": 1198
},
{
"epoch": 3.669724770642202,
"grad_norm": 0.5859375,
"learning_rate": 1.169957049613839e-06,
"loss": 1.2382960319519043,
"step": 1200
},
{
"epoch": 3.675840978593272,
"grad_norm": 0.46484375,
"learning_rate": 1.1638150883791386e-06,
"loss": 1.1713348627090454,
"step": 1202
},
{
"epoch": 3.6819571865443423,
"grad_norm": 0.625,
"learning_rate": 1.157784101699567e-06,
"loss": 1.1755608320236206,
"step": 1204
},
{
"epoch": 3.688073394495413,
"grad_norm": 0.609375,
"learning_rate": 1.1518642439241849e-06,
"loss": 1.2025344371795654,
"step": 1206
},
{
"epoch": 3.694189602446483,
"grad_norm": 1.2265625,
"learning_rate": 1.146055666557966e-06,
"loss": 1.2071685791015625,
"step": 1208
},
{
"epoch": 3.7003058103975537,
"grad_norm": 0.703125,
"learning_rate": 1.140358518257922e-06,
"loss": 1.1952728033065796,
"step": 1210
},
{
"epoch": 3.706422018348624,
"grad_norm": 0.60546875,
"learning_rate": 1.1347729448292953e-06,
"loss": 1.21987783908844,
"step": 1212
},
{
"epoch": 3.712538226299694,
"grad_norm": 0.7421875,
"learning_rate": 1.129299089221832e-06,
"loss": 1.2178161144256592,
"step": 1214
},
{
"epoch": 3.7186544342507646,
"grad_norm": 0.68359375,
"learning_rate": 1.1239370915261196e-06,
"loss": 1.1406751871109009,
"step": 1216
},
{
"epoch": 3.7247706422018347,
"grad_norm": 1.6484375,
"learning_rate": 1.1186870889700013e-06,
"loss": 1.1654596328735352,
"step": 1218
},
{
"epoch": 3.7308868501529053,
"grad_norm": 2.328125,
"learning_rate": 1.1135492159150676e-06,
"loss": 1.2073957920074463,
"step": 1220
},
{
"epoch": 3.7370030581039755,
"grad_norm": 0.609375,
"learning_rate": 1.108523603853215e-06,
"loss": 1.1250100135803223,
"step": 1222
},
{
"epoch": 3.7431192660550456,
"grad_norm": 0.60546875,
"learning_rate": 1.1036103814032804e-06,
"loss": 1.2246984243392944,
"step": 1224
},
{
"epoch": 3.7492354740061162,
"grad_norm": 0.58203125,
"learning_rate": 1.0988096743077513e-06,
"loss": 1.1390925645828247,
"step": 1226
},
{
"epoch": 3.7553516819571864,
"grad_norm": 0.58203125,
"learning_rate": 1.094121605429547e-06,
"loss": 1.1992175579071045,
"step": 1228
},
{
"epoch": 3.761467889908257,
"grad_norm": 0.4609375,
"learning_rate": 1.089546294748873e-06,
"loss": 1.0999352931976318,
"step": 1230
},
{
"epoch": 3.767584097859327,
"grad_norm": 0.451171875,
"learning_rate": 1.085083859360151e-06,
"loss": 1.1122483015060425,
"step": 1232
},
{
"epoch": 3.7737003058103973,
"grad_norm": 1.0625,
"learning_rate": 1.0807344134690236e-06,
"loss": 1.1888892650604248,
"step": 1234
},
{
"epoch": 3.779816513761468,
"grad_norm": 0.59765625,
"learning_rate": 1.0764980683894297e-06,
"loss": 1.1580041646957397,
"step": 1236
},
{
"epoch": 3.7859327217125385,
"grad_norm": 0.76171875,
"learning_rate": 1.0723749325407564e-06,
"loss": 1.176745891571045,
"step": 1238
},
{
"epoch": 3.7920489296636086,
"grad_norm": 0.65234375,
"learning_rate": 1.0683651114450641e-06,
"loss": 1.1710706949234009,
"step": 1240
},
{
"epoch": 3.7981651376146788,
"grad_norm": 2.578125,
"learning_rate": 1.0644687077243864e-06,
"loss": 1.1870887279510498,
"step": 1242
},
{
"epoch": 3.8042813455657494,
"grad_norm": 0.76953125,
"learning_rate": 1.0606858210981025e-06,
"loss": 1.169495940208435,
"step": 1244
},
{
"epoch": 3.8103975535168195,
"grad_norm": 0.66015625,
"learning_rate": 1.0570165483803867e-06,
"loss": 1.1190178394317627,
"step": 1246
},
{
"epoch": 3.81651376146789,
"grad_norm": 1.15625,
"learning_rate": 1.05346098347773e-06,
"loss": 1.1436067819595337,
"step": 1248
},
{
"epoch": 3.8226299694189603,
"grad_norm": 0.66796875,
"learning_rate": 1.050019217386535e-06,
"loss": 1.2288410663604736,
"step": 1250
},
{
"epoch": 3.8287461773700304,
"grad_norm": 1.25,
"learning_rate": 1.0466913381907914e-06,
"loss": 1.2218413352966309,
"step": 1252
},
{
"epoch": 3.834862385321101,
"grad_norm": 0.53515625,
"learning_rate": 1.0434774310598166e-06,
"loss": 1.208377480506897,
"step": 1254
},
{
"epoch": 3.840978593272171,
"grad_norm": 0.5859375,
"learning_rate": 1.04037757824608e-06,
"loss": 1.1784342527389526,
"step": 1256
},
{
"epoch": 3.8470948012232418,
"grad_norm": 1.0703125,
"learning_rate": 1.0373918590830952e-06,
"loss": 1.2136183977127075,
"step": 1258
},
{
"epoch": 3.853211009174312,
"grad_norm": 0.5390625,
"learning_rate": 1.0345203499833913e-06,
"loss": 1.2747994661331177,
"step": 1260
},
{
"epoch": 3.859327217125382,
"grad_norm": 0.73828125,
"learning_rate": 1.0317631244365575e-06,
"loss": 1.1638200283050537,
"step": 1262
},
{
"epoch": 3.8654434250764527,
"grad_norm": 0.466796875,
"learning_rate": 1.0291202530073602e-06,
"loss": 1.2454450130462646,
"step": 1264
},
{
"epoch": 3.871559633027523,
"grad_norm": 0.64453125,
"learning_rate": 1.0265918033339392e-06,
"loss": 1.1502002477645874,
"step": 1266
},
{
"epoch": 3.8776758409785934,
"grad_norm": 0.9609375,
"learning_rate": 1.0241778401260764e-06,
"loss": 1.1322892904281616,
"step": 1268
},
{
"epoch": 3.8837920489296636,
"grad_norm": 0.61328125,
"learning_rate": 1.0218784251635382e-06,
"loss": 1.1245934963226318,
"step": 1270
},
{
"epoch": 3.8899082568807337,
"grad_norm": 0.82421875,
"learning_rate": 1.0196936172944962e-06,
"loss": 1.2275093793869019,
"step": 1272
},
{
"epoch": 3.8960244648318043,
"grad_norm": 0.6328125,
"learning_rate": 1.0176234724340201e-06,
"loss": 1.2591514587402344,
"step": 1274
},
{
"epoch": 3.9021406727828745,
"grad_norm": 0.46484375,
"learning_rate": 1.0156680435626468e-06,
"loss": 1.1828017234802246,
"step": 1276
},
{
"epoch": 3.908256880733945,
"grad_norm": 0.6875,
"learning_rate": 1.0138273807250244e-06,
"loss": 1.1989636421203613,
"step": 1278
},
{
"epoch": 3.914373088685015,
"grad_norm": 0.5234375,
"learning_rate": 1.0121015310286318e-06,
"loss": 1.1318210363388062,
"step": 1280
},
{
"epoch": 3.9204892966360854,
"grad_norm": 0.51953125,
"learning_rate": 1.0104905386425735e-06,
"loss": 1.1387715339660645,
"step": 1282
},
{
"epoch": 3.926605504587156,
"grad_norm": 0.486328125,
"learning_rate": 1.0089944447964479e-06,
"loss": 1.0994793176651,
"step": 1284
},
{
"epoch": 3.9327217125382266,
"grad_norm": 0.5078125,
"learning_rate": 1.0076132877792933e-06,
"loss": 1.2001361846923828,
"step": 1286
},
{
"epoch": 3.9388379204892967,
"grad_norm": 2.203125,
"learning_rate": 1.0063471029386065e-06,
"loss": 1.1622974872589111,
"step": 1288
},
{
"epoch": 3.944954128440367,
"grad_norm": 1.015625,
"learning_rate": 1.0051959226794407e-06,
"loss": 1.170785903930664,
"step": 1290
},
{
"epoch": 3.9510703363914375,
"grad_norm": 0.546875,
"learning_rate": 1.004159776463573e-06,
"loss": 1.1379996538162231,
"step": 1292
},
{
"epoch": 3.9571865443425076,
"grad_norm": 1.046875,
"learning_rate": 1.003238690808754e-06,
"loss": 1.2565704584121704,
"step": 1294
},
{
"epoch": 3.963302752293578,
"grad_norm": 0.609375,
"learning_rate": 1.0024326892880253e-06,
"loss": 1.1255217790603638,
"step": 1296
},
{
"epoch": 3.9694189602446484,
"grad_norm": 0.58984375,
"learning_rate": 1.0017417925291187e-06,
"loss": 1.149346113204956,
"step": 1298
},
{
"epoch": 3.9755351681957185,
"grad_norm": 0.46875,
"learning_rate": 1.001166018213929e-06,
"loss": 1.1946812868118286,
"step": 1300
},
{
"epoch": 3.981651376146789,
"grad_norm": 1.3828125,
"learning_rate": 1.0007053810780578e-06,
"loss": 1.1319454908370972,
"step": 1302
},
{
"epoch": 3.9877675840978593,
"grad_norm": 0.8203125,
"learning_rate": 1.0003598929104407e-06,
"loss": 1.1686453819274902,
"step": 1304
},
{
"epoch": 3.99388379204893,
"grad_norm": 0.52734375,
"learning_rate": 1.0001295625530423e-06,
"loss": 1.130082607269287,
"step": 1306
},
{
"epoch": 4.0,
"grad_norm": 0.78515625,
"learning_rate": 1.0000143959006323e-06,
"loss": 1.2041049003601074,
"step": 1308
},
{
"epoch": 4.0,
"step": 1308,
"total_flos": 3.2734142949973033e+18,
"train_loss": 1.3248716153500641,
"train_runtime": 16065.8623,
"train_samples_per_second": 2.605,
"train_steps_per_second": 0.081
}
],
"logging_steps": 2,
"max_steps": 1308,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 9999999,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.2734142949973033e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}