{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.091190108191654,
"eval_steps": 500,
"global_step": 2000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0015455950540958269,
"grad_norm": 472.0,
"learning_rate": 2.0000000000000002e-07,
"loss": 26.6042,
"step": 1
},
{
"epoch": 0.0030911901081916537,
"grad_norm": 490.0,
"learning_rate": 4.0000000000000003e-07,
"loss": 27.2068,
"step": 2
},
{
"epoch": 0.00463678516228748,
"grad_norm": 480.0,
"learning_rate": 6.000000000000001e-07,
"loss": 26.9311,
"step": 3
},
{
"epoch": 0.0061823802163833074,
"grad_norm": 472.0,
"learning_rate": 8.000000000000001e-07,
"loss": 26.4758,
"step": 4
},
{
"epoch": 0.0077279752704791345,
"grad_norm": 504.0,
"learning_rate": 1.0000000000000002e-06,
"loss": 27.7404,
"step": 5
},
{
"epoch": 0.00927357032457496,
"grad_norm": 482.0,
"learning_rate": 1.2000000000000002e-06,
"loss": 26.5912,
"step": 6
},
{
"epoch": 0.010819165378670788,
"grad_norm": 484.0,
"learning_rate": 1.4000000000000001e-06,
"loss": 27.0497,
"step": 7
},
{
"epoch": 0.012364760432766615,
"grad_norm": 482.0,
"learning_rate": 1.6000000000000001e-06,
"loss": 26.375,
"step": 8
},
{
"epoch": 0.013910355486862442,
"grad_norm": 470.0,
"learning_rate": 1.8000000000000001e-06,
"loss": 24.9608,
"step": 9
},
{
"epoch": 0.015455950540958269,
"grad_norm": 484.0,
"learning_rate": 2.0000000000000003e-06,
"loss": 24.9394,
"step": 10
},
{
"epoch": 0.017001545595054096,
"grad_norm": 464.0,
"learning_rate": 2.2e-06,
"loss": 23.9378,
"step": 11
},
{
"epoch": 0.01854714064914992,
"grad_norm": 480.0,
"learning_rate": 2.4000000000000003e-06,
"loss": 23.005,
"step": 12
},
{
"epoch": 0.02009273570324575,
"grad_norm": 448.0,
"learning_rate": 2.6e-06,
"loss": 21.464,
"step": 13
},
{
"epoch": 0.021638330757341576,
"grad_norm": 430.0,
"learning_rate": 2.8000000000000003e-06,
"loss": 20.2247,
"step": 14
},
{
"epoch": 0.023183925811437404,
"grad_norm": 416.0,
"learning_rate": 3e-06,
"loss": 18.9013,
"step": 15
},
{
"epoch": 0.02472952086553323,
"grad_norm": 426.0,
"learning_rate": 3.2000000000000003e-06,
"loss": 17.6419,
"step": 16
},
{
"epoch": 0.02627511591962906,
"grad_norm": 428.0,
"learning_rate": 3.4000000000000005e-06,
"loss": 16.2712,
"step": 17
},
{
"epoch": 0.027820710973724884,
"grad_norm": 426.0,
"learning_rate": 3.6000000000000003e-06,
"loss": 14.3173,
"step": 18
},
{
"epoch": 0.02936630602782071,
"grad_norm": 416.0,
"learning_rate": 3.8000000000000005e-06,
"loss": 13.1013,
"step": 19
},
{
"epoch": 0.030911901081916538,
"grad_norm": 366.0,
"learning_rate": 4.000000000000001e-06,
"loss": 11.5202,
"step": 20
},
{
"epoch": 0.03245749613601236,
"grad_norm": 322.0,
"learning_rate": 4.2000000000000004e-06,
"loss": 10.0744,
"step": 21
},
{
"epoch": 0.03400309119010819,
"grad_norm": 282.0,
"learning_rate": 4.4e-06,
"loss": 9.0714,
"step": 22
},
{
"epoch": 0.03554868624420402,
"grad_norm": 207.0,
"learning_rate": 4.600000000000001e-06,
"loss": 8.0312,
"step": 23
},
{
"epoch": 0.03709428129829984,
"grad_norm": 178.0,
"learning_rate": 4.800000000000001e-06,
"loss": 7.7583,
"step": 24
},
{
"epoch": 0.03863987635239567,
"grad_norm": 190.0,
"learning_rate": 5e-06,
"loss": 7.3539,
"step": 25
},
{
"epoch": 0.0401854714064915,
"grad_norm": 136.0,
"learning_rate": 5.2e-06,
"loss": 6.9746,
"step": 26
},
{
"epoch": 0.04173106646058733,
"grad_norm": 121.5,
"learning_rate": 5.400000000000001e-06,
"loss": 6.638,
"step": 27
},
{
"epoch": 0.04327666151468315,
"grad_norm": 112.0,
"learning_rate": 5.600000000000001e-06,
"loss": 6.4899,
"step": 28
},
{
"epoch": 0.04482225656877898,
"grad_norm": 102.5,
"learning_rate": 5.8e-06,
"loss": 6.2407,
"step": 29
},
{
"epoch": 0.04636785162287481,
"grad_norm": 87.0,
"learning_rate": 6e-06,
"loss": 5.9863,
"step": 30
},
{
"epoch": 0.04791344667697063,
"grad_norm": 84.0,
"learning_rate": 6.200000000000001e-06,
"loss": 5.822,
"step": 31
},
{
"epoch": 0.04945904173106646,
"grad_norm": 73.5,
"learning_rate": 6.4000000000000006e-06,
"loss": 5.7442,
"step": 32
},
{
"epoch": 0.05100463678516229,
"grad_norm": 72.0,
"learning_rate": 6.600000000000001e-06,
"loss": 5.4181,
"step": 33
},
{
"epoch": 0.05255023183925812,
"grad_norm": 67.5,
"learning_rate": 6.800000000000001e-06,
"loss": 5.2732,
"step": 34
},
{
"epoch": 0.05409582689335394,
"grad_norm": 62.5,
"learning_rate": 7e-06,
"loss": 5.0897,
"step": 35
},
{
"epoch": 0.05564142194744977,
"grad_norm": 55.5,
"learning_rate": 7.2000000000000005e-06,
"loss": 4.8823,
"step": 36
},
{
"epoch": 0.0571870170015456,
"grad_norm": 62.5,
"learning_rate": 7.4e-06,
"loss": 4.7194,
"step": 37
},
{
"epoch": 0.05873261205564142,
"grad_norm": 59.0,
"learning_rate": 7.600000000000001e-06,
"loss": 4.5549,
"step": 38
},
{
"epoch": 0.06027820710973725,
"grad_norm": 70.5,
"learning_rate": 7.800000000000002e-06,
"loss": 4.3491,
"step": 39
},
{
"epoch": 0.061823802163833076,
"grad_norm": 54.25,
"learning_rate": 8.000000000000001e-06,
"loss": 4.1664,
"step": 40
},
{
"epoch": 0.0633693972179289,
"grad_norm": 53.5,
"learning_rate": 8.2e-06,
"loss": 3.9786,
"step": 41
},
{
"epoch": 0.06491499227202473,
"grad_norm": 52.25,
"learning_rate": 8.400000000000001e-06,
"loss": 3.718,
"step": 42
},
{
"epoch": 0.06646058732612056,
"grad_norm": 60.0,
"learning_rate": 8.6e-06,
"loss": 3.57,
"step": 43
},
{
"epoch": 0.06800618238021638,
"grad_norm": 44.5,
"learning_rate": 8.8e-06,
"loss": 3.4842,
"step": 44
},
{
"epoch": 0.0695517774343122,
"grad_norm": 39.0,
"learning_rate": 9e-06,
"loss": 3.0784,
"step": 45
},
{
"epoch": 0.07109737248840804,
"grad_norm": 86.5,
"learning_rate": 9.200000000000002e-06,
"loss": 3.1007,
"step": 46
},
{
"epoch": 0.07264296754250386,
"grad_norm": 74.5,
"learning_rate": 9.4e-06,
"loss": 2.9358,
"step": 47
},
{
"epoch": 0.07418856259659969,
"grad_norm": 39.0,
"learning_rate": 9.600000000000001e-06,
"loss": 2.932,
"step": 48
},
{
"epoch": 0.07573415765069552,
"grad_norm": 35.75,
"learning_rate": 9.800000000000001e-06,
"loss": 2.7691,
"step": 49
},
{
"epoch": 0.07727975270479134,
"grad_norm": 32.5,
"learning_rate": 1e-05,
"loss": 2.6701,
"step": 50
},
{
"epoch": 0.07882534775888717,
"grad_norm": 28.875,
"learning_rate": 1.02e-05,
"loss": 2.4431,
"step": 51
},
{
"epoch": 0.080370942812983,
"grad_norm": 30.375,
"learning_rate": 1.04e-05,
"loss": 2.5914,
"step": 52
},
{
"epoch": 0.08191653786707882,
"grad_norm": 28.875,
"learning_rate": 1.0600000000000002e-05,
"loss": 2.4222,
"step": 53
},
{
"epoch": 0.08346213292117466,
"grad_norm": 27.375,
"learning_rate": 1.0800000000000002e-05,
"loss": 2.1741,
"step": 54
},
{
"epoch": 0.08500772797527048,
"grad_norm": 55.75,
"learning_rate": 1.1000000000000001e-05,
"loss": 2.1799,
"step": 55
},
{
"epoch": 0.0865533230293663,
"grad_norm": 22.75,
"learning_rate": 1.1200000000000001e-05,
"loss": 2.0037,
"step": 56
},
{
"epoch": 0.08809891808346214,
"grad_norm": 23.0,
"learning_rate": 1.14e-05,
"loss": 2.0929,
"step": 57
},
{
"epoch": 0.08964451313755796,
"grad_norm": 22.75,
"learning_rate": 1.16e-05,
"loss": 1.9233,
"step": 58
},
{
"epoch": 0.09119010819165378,
"grad_norm": 22.625,
"learning_rate": 1.18e-05,
"loss": 1.8583,
"step": 59
},
{
"epoch": 0.09273570324574962,
"grad_norm": 27.0,
"learning_rate": 1.2e-05,
"loss": 1.9135,
"step": 60
},
{
"epoch": 0.09428129829984544,
"grad_norm": 20.875,
"learning_rate": 1.22e-05,
"loss": 1.7249,
"step": 61
},
{
"epoch": 0.09582689335394126,
"grad_norm": 20.625,
"learning_rate": 1.2400000000000002e-05,
"loss": 1.6811,
"step": 62
},
{
"epoch": 0.0973724884080371,
"grad_norm": 26.5,
"learning_rate": 1.2600000000000001e-05,
"loss": 1.72,
"step": 63
},
{
"epoch": 0.09891808346213292,
"grad_norm": 18.0,
"learning_rate": 1.2800000000000001e-05,
"loss": 1.6351,
"step": 64
},
{
"epoch": 0.10046367851622875,
"grad_norm": 18.0,
"learning_rate": 1.3000000000000001e-05,
"loss": 1.6122,
"step": 65
},
{
"epoch": 0.10200927357032458,
"grad_norm": 17.625,
"learning_rate": 1.3200000000000002e-05,
"loss": 1.5638,
"step": 66
},
{
"epoch": 0.1035548686244204,
"grad_norm": 17.875,
"learning_rate": 1.3400000000000002e-05,
"loss": 1.489,
"step": 67
},
{
"epoch": 0.10510046367851623,
"grad_norm": 18.75,
"learning_rate": 1.3600000000000002e-05,
"loss": 1.5691,
"step": 68
},
{
"epoch": 0.10664605873261206,
"grad_norm": 20.5,
"learning_rate": 1.38e-05,
"loss": 1.4363,
"step": 69
},
{
"epoch": 0.10819165378670788,
"grad_norm": 17.125,
"learning_rate": 1.4e-05,
"loss": 1.4473,
"step": 70
},
{
"epoch": 0.10973724884080371,
"grad_norm": 17.625,
"learning_rate": 1.4200000000000001e-05,
"loss": 1.4345,
"step": 71
},
{
"epoch": 0.11128284389489954,
"grad_norm": 27.5,
"learning_rate": 1.4400000000000001e-05,
"loss": 1.5177,
"step": 72
},
{
"epoch": 0.11282843894899536,
"grad_norm": 16.625,
"learning_rate": 1.46e-05,
"loss": 1.3802,
"step": 73
},
{
"epoch": 0.1143740340030912,
"grad_norm": 19.625,
"learning_rate": 1.48e-05,
"loss": 1.3202,
"step": 74
},
{
"epoch": 0.11591962905718702,
"grad_norm": 27.25,
"learning_rate": 1.5000000000000002e-05,
"loss": 1.3449,
"step": 75
},
{
"epoch": 0.11746522411128284,
"grad_norm": 16.375,
"learning_rate": 1.5200000000000002e-05,
"loss": 1.3505,
"step": 76
},
{
"epoch": 0.11901081916537867,
"grad_norm": 16.25,
"learning_rate": 1.54e-05,
"loss": 1.2425,
"step": 77
},
{
"epoch": 0.1205564142194745,
"grad_norm": 15.25,
"learning_rate": 1.5600000000000003e-05,
"loss": 1.2216,
"step": 78
},
{
"epoch": 0.12210200927357033,
"grad_norm": 15.4375,
"learning_rate": 1.58e-05,
"loss": 1.2965,
"step": 79
},
{
"epoch": 0.12364760432766615,
"grad_norm": 15.6875,
"learning_rate": 1.6000000000000003e-05,
"loss": 1.3441,
"step": 80
},
{
"epoch": 0.125193199381762,
"grad_norm": 43.25,
"learning_rate": 1.62e-05,
"loss": 1.2836,
"step": 81
},
{
"epoch": 0.1267387944358578,
"grad_norm": 18.375,
"learning_rate": 1.64e-05,
"loss": 1.2163,
"step": 82
},
{
"epoch": 0.12828438948995363,
"grad_norm": 38.25,
"learning_rate": 1.66e-05,
"loss": 1.1794,
"step": 83
},
{
"epoch": 0.12982998454404945,
"grad_norm": 14.8125,
"learning_rate": 1.6800000000000002e-05,
"loss": 1.1239,
"step": 84
},
{
"epoch": 0.13137557959814528,
"grad_norm": 13.5625,
"learning_rate": 1.7e-05,
"loss": 1.0771,
"step": 85
},
{
"epoch": 0.13292117465224113,
"grad_norm": 14.5625,
"learning_rate": 1.72e-05,
"loss": 1.0916,
"step": 86
},
{
"epoch": 0.13446676970633695,
"grad_norm": 14.5625,
"learning_rate": 1.7400000000000003e-05,
"loss": 1.0952,
"step": 87
},
{
"epoch": 0.13601236476043277,
"grad_norm": 13.75,
"learning_rate": 1.76e-05,
"loss": 1.0659,
"step": 88
},
{
"epoch": 0.1375579598145286,
"grad_norm": 13.5625,
"learning_rate": 1.7800000000000002e-05,
"loss": 0.9821,
"step": 89
},
{
"epoch": 0.1391035548686244,
"grad_norm": 14.75,
"learning_rate": 1.8e-05,
"loss": 1.079,
"step": 90
},
{
"epoch": 0.14064914992272023,
"grad_norm": 13.5625,
"learning_rate": 1.8200000000000002e-05,
"loss": 1.003,
"step": 91
},
{
"epoch": 0.14219474497681608,
"grad_norm": 19.875,
"learning_rate": 1.8400000000000003e-05,
"loss": 1.1328,
"step": 92
},
{
"epoch": 0.1437403400309119,
"grad_norm": 16.875,
"learning_rate": 1.86e-05,
"loss": 1.0903,
"step": 93
},
{
"epoch": 0.14528593508500773,
"grad_norm": 15.5,
"learning_rate": 1.88e-05,
"loss": 1.0609,
"step": 94
},
{
"epoch": 0.14683153013910355,
"grad_norm": 13.8125,
"learning_rate": 1.9e-05,
"loss": 1.0328,
"step": 95
},
{
"epoch": 0.14837712519319937,
"grad_norm": 13.375,
"learning_rate": 1.9200000000000003e-05,
"loss": 0.9742,
"step": 96
},
{
"epoch": 0.14992272024729522,
"grad_norm": 14.3125,
"learning_rate": 1.94e-05,
"loss": 1.0703,
"step": 97
},
{
"epoch": 0.15146831530139104,
"grad_norm": 14.3125,
"learning_rate": 1.9600000000000002e-05,
"loss": 1.034,
"step": 98
},
{
"epoch": 0.15301391035548687,
"grad_norm": 13.3125,
"learning_rate": 1.98e-05,
"loss": 0.9953,
"step": 99
},
{
"epoch": 0.1545595054095827,
"grad_norm": 13.5,
"learning_rate": 2e-05,
"loss": 0.9211,
"step": 100
},
{
"epoch": 0.1561051004636785,
"grad_norm": 13.9375,
"learning_rate": 1.9989473684210526e-05,
"loss": 1.0385,
"step": 101
},
{
"epoch": 0.15765069551777433,
"grad_norm": 13.375,
"learning_rate": 1.9978947368421054e-05,
"loss": 0.9443,
"step": 102
},
{
"epoch": 0.15919629057187018,
"grad_norm": 23.625,
"learning_rate": 1.9968421052631582e-05,
"loss": 0.9293,
"step": 103
},
{
"epoch": 0.160741885625966,
"grad_norm": 15.0,
"learning_rate": 1.9957894736842107e-05,
"loss": 1.0052,
"step": 104
},
{
"epoch": 0.16228748068006182,
"grad_norm": 18.125,
"learning_rate": 1.994736842105263e-05,
"loss": 0.9428,
"step": 105
},
{
"epoch": 0.16383307573415765,
"grad_norm": 14.1875,
"learning_rate": 1.993684210526316e-05,
"loss": 0.9634,
"step": 106
},
{
"epoch": 0.16537867078825347,
"grad_norm": 14.875,
"learning_rate": 1.9926315789473688e-05,
"loss": 1.0048,
"step": 107
},
{
"epoch": 0.16692426584234932,
"grad_norm": 12.875,
"learning_rate": 1.9915789473684212e-05,
"loss": 0.9729,
"step": 108
},
{
"epoch": 0.16846986089644514,
"grad_norm": 12.6875,
"learning_rate": 1.990526315789474e-05,
"loss": 0.9173,
"step": 109
},
{
"epoch": 0.17001545595054096,
"grad_norm": 18.0,
"learning_rate": 1.9894736842105265e-05,
"loss": 1.0079,
"step": 110
},
{
"epoch": 0.17156105100463678,
"grad_norm": 12.9375,
"learning_rate": 1.988421052631579e-05,
"loss": 0.9352,
"step": 111
},
{
"epoch": 0.1731066460587326,
"grad_norm": 14.125,
"learning_rate": 1.9873684210526318e-05,
"loss": 0.9911,
"step": 112
},
{
"epoch": 0.17465224111282843,
"grad_norm": 15.25,
"learning_rate": 1.9863157894736846e-05,
"loss": 0.9601,
"step": 113
},
{
"epoch": 0.17619783616692428,
"grad_norm": 13.0625,
"learning_rate": 1.985263157894737e-05,
"loss": 0.8756,
"step": 114
},
{
"epoch": 0.1777434312210201,
"grad_norm": 16.75,
"learning_rate": 1.9842105263157895e-05,
"loss": 0.9709,
"step": 115
},
{
"epoch": 0.17928902627511592,
"grad_norm": 12.9375,
"learning_rate": 1.9831578947368423e-05,
"loss": 0.9185,
"step": 116
},
{
"epoch": 0.18083462132921174,
"grad_norm": 14.25,
"learning_rate": 1.982105263157895e-05,
"loss": 0.98,
"step": 117
},
{
"epoch": 0.18238021638330756,
"grad_norm": 13.5,
"learning_rate": 1.9810526315789476e-05,
"loss": 1.0079,
"step": 118
},
{
"epoch": 0.1839258114374034,
"grad_norm": 12.125,
"learning_rate": 1.98e-05,
"loss": 0.9213,
"step": 119
},
{
"epoch": 0.18547140649149924,
"grad_norm": 12.125,
"learning_rate": 1.9789473684210528e-05,
"loss": 0.9095,
"step": 120
},
{
"epoch": 0.18701700154559506,
"grad_norm": 13.75,
"learning_rate": 1.9778947368421056e-05,
"loss": 0.8693,
"step": 121
},
{
"epoch": 0.18856259659969088,
"grad_norm": 12.8125,
"learning_rate": 1.976842105263158e-05,
"loss": 0.897,
"step": 122
},
{
"epoch": 0.1901081916537867,
"grad_norm": 11.875,
"learning_rate": 1.9757894736842105e-05,
"loss": 0.7896,
"step": 123
},
{
"epoch": 0.19165378670788252,
"grad_norm": 12.25,
"learning_rate": 1.9747368421052633e-05,
"loss": 0.8458,
"step": 124
},
{
"epoch": 0.19319938176197837,
"grad_norm": 11.0625,
"learning_rate": 1.9736842105263158e-05,
"loss": 0.8362,
"step": 125
},
{
"epoch": 0.1947449768160742,
"grad_norm": 11.0,
"learning_rate": 1.9726315789473686e-05,
"loss": 0.8465,
"step": 126
},
{
"epoch": 0.19629057187017002,
"grad_norm": 11.0625,
"learning_rate": 1.9715789473684214e-05,
"loss": 0.7972,
"step": 127
},
{
"epoch": 0.19783616692426584,
"grad_norm": 12.3125,
"learning_rate": 1.970526315789474e-05,
"loss": 0.9165,
"step": 128
},
{
"epoch": 0.19938176197836166,
"grad_norm": 12.9375,
"learning_rate": 1.9694736842105263e-05,
"loss": 0.9013,
"step": 129
},
{
"epoch": 0.2009273570324575,
"grad_norm": 12.5,
"learning_rate": 1.968421052631579e-05,
"loss": 0.8593,
"step": 130
},
{
"epoch": 0.20247295208655333,
"grad_norm": 13.125,
"learning_rate": 1.967368421052632e-05,
"loss": 0.8728,
"step": 131
},
{
"epoch": 0.20401854714064915,
"grad_norm": 12.5625,
"learning_rate": 1.9663157894736844e-05,
"loss": 0.8588,
"step": 132
},
{
"epoch": 0.20556414219474498,
"grad_norm": 13.125,
"learning_rate": 1.965263157894737e-05,
"loss": 0.837,
"step": 133
},
{
"epoch": 0.2071097372488408,
"grad_norm": 12.875,
"learning_rate": 1.9642105263157897e-05,
"loss": 0.8504,
"step": 134
},
{
"epoch": 0.20865533230293662,
"grad_norm": 15.5625,
"learning_rate": 1.9631578947368425e-05,
"loss": 0.86,
"step": 135
},
{
"epoch": 0.21020092735703247,
"grad_norm": 13.0,
"learning_rate": 1.962105263157895e-05,
"loss": 0.7569,
"step": 136
},
{
"epoch": 0.2117465224111283,
"grad_norm": 12.75,
"learning_rate": 1.9610526315789474e-05,
"loss": 0.8461,
"step": 137
},
{
"epoch": 0.2132921174652241,
"grad_norm": 29.25,
"learning_rate": 1.9600000000000002e-05,
"loss": 0.7988,
"step": 138
},
{
"epoch": 0.21483771251931993,
"grad_norm": 13.5625,
"learning_rate": 1.9589473684210527e-05,
"loss": 0.8452,
"step": 139
},
{
"epoch": 0.21638330757341576,
"grad_norm": 14.625,
"learning_rate": 1.9578947368421055e-05,
"loss": 0.8358,
"step": 140
},
{
"epoch": 0.21792890262751158,
"grad_norm": 11.625,
"learning_rate": 1.956842105263158e-05,
"loss": 0.7932,
"step": 141
},
{
"epoch": 0.21947449768160743,
"grad_norm": 12.75,
"learning_rate": 1.9557894736842107e-05,
"loss": 0.817,
"step": 142
},
{
"epoch": 0.22102009273570325,
"grad_norm": 16.875,
"learning_rate": 1.9547368421052632e-05,
"loss": 0.795,
"step": 143
},
{
"epoch": 0.22256568778979907,
"grad_norm": 13.0625,
"learning_rate": 1.953684210526316e-05,
"loss": 0.7426,
"step": 144
},
{
"epoch": 0.2241112828438949,
"grad_norm": 11.3125,
"learning_rate": 1.9526315789473688e-05,
"loss": 0.76,
"step": 145
},
{
"epoch": 0.22565687789799072,
"grad_norm": 11.625,
"learning_rate": 1.9515789473684213e-05,
"loss": 0.8191,
"step": 146
},
{
"epoch": 0.22720247295208656,
"grad_norm": 12.3125,
"learning_rate": 1.9505263157894737e-05,
"loss": 0.8164,
"step": 147
},
{
"epoch": 0.2287480680061824,
"grad_norm": 11.5,
"learning_rate": 1.9494736842105265e-05,
"loss": 0.8299,
"step": 148
},
{
"epoch": 0.2302936630602782,
"grad_norm": 16.125,
"learning_rate": 1.9484210526315793e-05,
"loss": 0.8695,
"step": 149
},
{
"epoch": 0.23183925811437403,
"grad_norm": 12.5625,
"learning_rate": 1.9473684210526318e-05,
"loss": 0.8691,
"step": 150
},
{
"epoch": 0.23338485316846985,
"grad_norm": 12.125,
"learning_rate": 1.9463157894736843e-05,
"loss": 0.8504,
"step": 151
},
{
"epoch": 0.23493044822256567,
"grad_norm": 15.5625,
"learning_rate": 1.945263157894737e-05,
"loss": 0.8583,
"step": 152
},
{
"epoch": 0.23647604327666152,
"grad_norm": 11.0625,
"learning_rate": 1.9442105263157895e-05,
"loss": 0.9378,
"step": 153
},
{
"epoch": 0.23802163833075735,
"grad_norm": 12.6875,
"learning_rate": 1.9431578947368423e-05,
"loss": 0.9021,
"step": 154
},
{
"epoch": 0.23956723338485317,
"grad_norm": 12.625,
"learning_rate": 1.9421052631578948e-05,
"loss": 0.8527,
"step": 155
},
{
"epoch": 0.241112828438949,
"grad_norm": 11.6875,
"learning_rate": 1.9410526315789476e-05,
"loss": 0.8327,
"step": 156
},
{
"epoch": 0.2426584234930448,
"grad_norm": 11.9375,
"learning_rate": 1.94e-05,
"loss": 0.8003,
"step": 157
},
{
"epoch": 0.24420401854714066,
"grad_norm": 13.375,
"learning_rate": 1.9389473684210525e-05,
"loss": 0.6912,
"step": 158
},
{
"epoch": 0.24574961360123648,
"grad_norm": 10.6875,
"learning_rate": 1.9378947368421053e-05,
"loss": 0.7362,
"step": 159
},
{
"epoch": 0.2472952086553323,
"grad_norm": 11.0,
"learning_rate": 1.936842105263158e-05,
"loss": 0.7379,
"step": 160
},
{
"epoch": 0.24884080370942813,
"grad_norm": 25.625,
"learning_rate": 1.9357894736842106e-05,
"loss": 0.7803,
"step": 161
},
{
"epoch": 0.250386398763524,
"grad_norm": 13.125,
"learning_rate": 1.9347368421052634e-05,
"loss": 0.8244,
"step": 162
},
{
"epoch": 0.25193199381761977,
"grad_norm": 13.625,
"learning_rate": 1.9336842105263162e-05,
"loss": 0.773,
"step": 163
},
{
"epoch": 0.2534775888717156,
"grad_norm": 11.625,
"learning_rate": 1.9326315789473687e-05,
"loss": 0.7818,
"step": 164
},
{
"epoch": 0.2550231839258114,
"grad_norm": 11.5625,
"learning_rate": 1.931578947368421e-05,
"loss": 0.825,
"step": 165
},
{
"epoch": 0.25656877897990726,
"grad_norm": 10.75,
"learning_rate": 1.930526315789474e-05,
"loss": 0.7203,
"step": 166
},
{
"epoch": 0.2581143740340031,
"grad_norm": 13.0625,
"learning_rate": 1.9294736842105264e-05,
"loss": 0.8392,
"step": 167
},
{
"epoch": 0.2596599690880989,
"grad_norm": 12.875,
"learning_rate": 1.9284210526315792e-05,
"loss": 0.8234,
"step": 168
},
{
"epoch": 0.26120556414219476,
"grad_norm": 12.5625,
"learning_rate": 1.9273684210526317e-05,
"loss": 0.8638,
"step": 169
},
{
"epoch": 0.26275115919629055,
"grad_norm": 10.5625,
"learning_rate": 1.9263157894736845e-05,
"loss": 0.7969,
"step": 170
},
{
"epoch": 0.2642967542503864,
"grad_norm": 11.375,
"learning_rate": 1.925263157894737e-05,
"loss": 0.7549,
"step": 171
},
{
"epoch": 0.26584234930448225,
"grad_norm": 11.25,
"learning_rate": 1.9242105263157894e-05,
"loss": 0.7519,
"step": 172
},
{
"epoch": 0.26738794435857804,
"grad_norm": 11.9375,
"learning_rate": 1.9231578947368422e-05,
"loss": 0.7572,
"step": 173
},
{
"epoch": 0.2689335394126739,
"grad_norm": 10.75,
"learning_rate": 1.922105263157895e-05,
"loss": 0.8011,
"step": 174
},
{
"epoch": 0.2704791344667697,
"grad_norm": 11.5625,
"learning_rate": 1.9210526315789474e-05,
"loss": 0.7712,
"step": 175
},
{
"epoch": 0.27202472952086554,
"grad_norm": 11.5625,
"learning_rate": 1.9200000000000003e-05,
"loss": 0.6951,
"step": 176
},
{
"epoch": 0.2735703245749614,
"grad_norm": 11.0,
"learning_rate": 1.918947368421053e-05,
"loss": 0.7982,
"step": 177
},
{
"epoch": 0.2751159196290572,
"grad_norm": 11.9375,
"learning_rate": 1.9178947368421055e-05,
"loss": 0.6785,
"step": 178
},
{
"epoch": 0.27666151468315303,
"grad_norm": 11.875,
"learning_rate": 1.916842105263158e-05,
"loss": 0.7832,
"step": 179
},
{
"epoch": 0.2782071097372488,
"grad_norm": 12.4375,
"learning_rate": 1.9157894736842108e-05,
"loss": 0.7041,
"step": 180
},
{
"epoch": 0.2797527047913447,
"grad_norm": 10.625,
"learning_rate": 1.9147368421052632e-05,
"loss": 0.7545,
"step": 181
},
{
"epoch": 0.28129829984544047,
"grad_norm": 11.6875,
"learning_rate": 1.913684210526316e-05,
"loss": 0.7533,
"step": 182
},
{
"epoch": 0.2828438948995363,
"grad_norm": 17.75,
"learning_rate": 1.9126315789473685e-05,
"loss": 0.8242,
"step": 183
},
{
"epoch": 0.28438948995363217,
"grad_norm": 10.8125,
"learning_rate": 1.9115789473684213e-05,
"loss": 0.6681,
"step": 184
},
{
"epoch": 0.28593508500772796,
"grad_norm": 11.5,
"learning_rate": 1.9105263157894738e-05,
"loss": 0.6771,
"step": 185
},
{
"epoch": 0.2874806800618238,
"grad_norm": 12.125,
"learning_rate": 1.9094736842105262e-05,
"loss": 0.7475,
"step": 186
},
{
"epoch": 0.2890262751159196,
"grad_norm": 10.9375,
"learning_rate": 1.908421052631579e-05,
"loss": 0.7511,
"step": 187
},
{
"epoch": 0.29057187017001546,
"grad_norm": 15.6875,
"learning_rate": 1.907368421052632e-05,
"loss": 0.7622,
"step": 188
},
{
"epoch": 0.2921174652241113,
"grad_norm": 14.0625,
"learning_rate": 1.9063157894736843e-05,
"loss": 0.7271,
"step": 189
},
{
"epoch": 0.2936630602782071,
"grad_norm": 11.5625,
"learning_rate": 1.9052631578947368e-05,
"loss": 0.7669,
"step": 190
},
{
"epoch": 0.29520865533230295,
"grad_norm": 10.8125,
"learning_rate": 1.9042105263157896e-05,
"loss": 0.6624,
"step": 191
},
{
"epoch": 0.29675425038639874,
"grad_norm": 11.875,
"learning_rate": 1.9031578947368424e-05,
"loss": 0.7241,
"step": 192
},
{
"epoch": 0.2982998454404946,
"grad_norm": 12.5,
"learning_rate": 1.902105263157895e-05,
"loss": 0.6986,
"step": 193
},
{
"epoch": 0.29984544049459044,
"grad_norm": 13.5625,
"learning_rate": 1.9010526315789476e-05,
"loss": 0.7209,
"step": 194
},
{
"epoch": 0.30139103554868624,
"grad_norm": 12.3125,
"learning_rate": 1.9e-05,
"loss": 0.7389,
"step": 195
},
{
"epoch": 0.3029366306027821,
"grad_norm": 16.125,
"learning_rate": 1.898947368421053e-05,
"loss": 0.7554,
"step": 196
},
{
"epoch": 0.3044822256568779,
"grad_norm": 12.5,
"learning_rate": 1.8978947368421054e-05,
"loss": 0.7488,
"step": 197
},
{
"epoch": 0.30602782071097373,
"grad_norm": 11.8125,
"learning_rate": 1.8968421052631582e-05,
"loss": 0.6887,
"step": 198
},
{
"epoch": 0.3075734157650695,
"grad_norm": 34.25,
"learning_rate": 1.8957894736842106e-05,
"loss": 0.6583,
"step": 199
},
{
"epoch": 0.3091190108191654,
"grad_norm": 10.6875,
"learning_rate": 1.894736842105263e-05,
"loss": 0.6745,
"step": 200
},
{
"epoch": 0.3106646058732612,
"grad_norm": 14.0625,
"learning_rate": 1.893684210526316e-05,
"loss": 0.7117,
"step": 201
},
{
"epoch": 0.312210200927357,
"grad_norm": 12.25,
"learning_rate": 1.8926315789473687e-05,
"loss": 0.7723,
"step": 202
},
{
"epoch": 0.31375579598145287,
"grad_norm": 12.0,
"learning_rate": 1.891578947368421e-05,
"loss": 0.7528,
"step": 203
},
{
"epoch": 0.31530139103554866,
"grad_norm": 14.5625,
"learning_rate": 1.8905263157894736e-05,
"loss": 0.7505,
"step": 204
},
{
"epoch": 0.3168469860896445,
"grad_norm": 60.0,
"learning_rate": 1.8894736842105264e-05,
"loss": 0.7367,
"step": 205
},
{
"epoch": 0.31839258114374036,
"grad_norm": 11.4375,
"learning_rate": 1.8884210526315792e-05,
"loss": 0.7468,
"step": 206
},
{
"epoch": 0.31993817619783615,
"grad_norm": 11.0,
"learning_rate": 1.8873684210526317e-05,
"loss": 0.6802,
"step": 207
},
{
"epoch": 0.321483771251932,
"grad_norm": 11.625,
"learning_rate": 1.886315789473684e-05,
"loss": 0.8019,
"step": 208
},
{
"epoch": 0.3230293663060278,
"grad_norm": 10.4375,
"learning_rate": 1.885263157894737e-05,
"loss": 0.6416,
"step": 209
},
{
"epoch": 0.32457496136012365,
"grad_norm": 11.25,
"learning_rate": 1.8842105263157898e-05,
"loss": 0.7199,
"step": 210
},
{
"epoch": 0.3261205564142195,
"grad_norm": 12.5,
"learning_rate": 1.8831578947368422e-05,
"loss": 0.7252,
"step": 211
},
{
"epoch": 0.3276661514683153,
"grad_norm": 12.6875,
"learning_rate": 1.882105263157895e-05,
"loss": 0.7604,
"step": 212
},
{
"epoch": 0.32921174652241114,
"grad_norm": 12.0625,
"learning_rate": 1.8810526315789475e-05,
"loss": 0.694,
"step": 213
},
{
"epoch": 0.33075734157650694,
"grad_norm": 13.9375,
"learning_rate": 1.88e-05,
"loss": 0.7057,
"step": 214
},
{
"epoch": 0.3323029366306028,
"grad_norm": 10.9375,
"learning_rate": 1.8789473684210528e-05,
"loss": 0.7296,
"step": 215
},
{
"epoch": 0.33384853168469864,
"grad_norm": 11.5,
"learning_rate": 1.8778947368421056e-05,
"loss": 0.6646,
"step": 216
},
{
"epoch": 0.33539412673879443,
"grad_norm": 15.3125,
"learning_rate": 1.876842105263158e-05,
"loss": 0.7514,
"step": 217
},
{
"epoch": 0.3369397217928903,
"grad_norm": 10.125,
"learning_rate": 1.8757894736842105e-05,
"loss": 0.6781,
"step": 218
},
{
"epoch": 0.3384853168469861,
"grad_norm": 11.375,
"learning_rate": 1.8747368421052633e-05,
"loss": 0.7358,
"step": 219
},
{
"epoch": 0.3400309119010819,
"grad_norm": 22.5,
"learning_rate": 1.873684210526316e-05,
"loss": 0.7296,
"step": 220
},
{
"epoch": 0.3415765069551777,
"grad_norm": 11.3125,
"learning_rate": 1.8726315789473686e-05,
"loss": 0.7116,
"step": 221
},
{
"epoch": 0.34312210200927357,
"grad_norm": 12.5625,
"learning_rate": 1.871578947368421e-05,
"loss": 0.6634,
"step": 222
},
{
"epoch": 0.3446676970633694,
"grad_norm": 13.375,
"learning_rate": 1.8705263157894738e-05,
"loss": 0.7326,
"step": 223
},
{
"epoch": 0.3462132921174652,
"grad_norm": 10.75,
"learning_rate": 1.8694736842105266e-05,
"loss": 0.7085,
"step": 224
},
{
"epoch": 0.34775888717156106,
"grad_norm": 11.6875,
"learning_rate": 1.868421052631579e-05,
"loss": 0.7484,
"step": 225
},
{
"epoch": 0.34930448222565685,
"grad_norm": 9.9375,
"learning_rate": 1.8673684210526316e-05,
"loss": 0.649,
"step": 226
},
{
"epoch": 0.3508500772797527,
"grad_norm": 11.25,
"learning_rate": 1.8663157894736844e-05,
"loss": 0.71,
"step": 227
},
{
"epoch": 0.35239567233384855,
"grad_norm": 10.9375,
"learning_rate": 1.8652631578947368e-05,
"loss": 0.6652,
"step": 228
},
{
"epoch": 0.35394126738794435,
"grad_norm": 11.6875,
"learning_rate": 1.8642105263157896e-05,
"loss": 0.6122,
"step": 229
},
{
"epoch": 0.3554868624420402,
"grad_norm": 11.3125,
"learning_rate": 1.8631578947368424e-05,
"loss": 0.6739,
"step": 230
},
{
"epoch": 0.357032457496136,
"grad_norm": 11.8125,
"learning_rate": 1.862105263157895e-05,
"loss": 0.7886,
"step": 231
},
{
"epoch": 0.35857805255023184,
"grad_norm": 11.9375,
"learning_rate": 1.8610526315789473e-05,
"loss": 0.7362,
"step": 232
},
{
"epoch": 0.3601236476043277,
"grad_norm": 13.8125,
"learning_rate": 1.86e-05,
"loss": 0.7711,
"step": 233
},
{
"epoch": 0.3616692426584235,
"grad_norm": 13.0625,
"learning_rate": 1.858947368421053e-05,
"loss": 0.7046,
"step": 234
},
{
"epoch": 0.36321483771251933,
"grad_norm": 11.3125,
"learning_rate": 1.8578947368421054e-05,
"loss": 0.6497,
"step": 235
},
{
"epoch": 0.36476043276661513,
"grad_norm": 11.0625,
"learning_rate": 1.856842105263158e-05,
"loss": 0.6476,
"step": 236
},
{
"epoch": 0.366306027820711,
"grad_norm": 13.375,
"learning_rate": 1.8557894736842107e-05,
"loss": 0.7496,
"step": 237
},
{
"epoch": 0.3678516228748068,
"grad_norm": 12.3125,
"learning_rate": 1.8547368421052635e-05,
"loss": 0.7072,
"step": 238
},
{
"epoch": 0.3693972179289026,
"grad_norm": 13.0625,
"learning_rate": 1.853684210526316e-05,
"loss": 0.7318,
"step": 239
},
{
"epoch": 0.37094281298299847,
"grad_norm": 12.3125,
"learning_rate": 1.8526315789473684e-05,
"loss": 0.7002,
"step": 240
},
{
"epoch": 0.37248840803709427,
"grad_norm": 13.0,
"learning_rate": 1.8515789473684212e-05,
"loss": 0.7398,
"step": 241
},
{
"epoch": 0.3740340030911901,
"grad_norm": 12.375,
"learning_rate": 1.8505263157894737e-05,
"loss": 0.7129,
"step": 242
},
{
"epoch": 0.3755795981452859,
"grad_norm": 12.75,
"learning_rate": 1.8494736842105265e-05,
"loss": 0.6971,
"step": 243
},
{
"epoch": 0.37712519319938176,
"grad_norm": 11.375,
"learning_rate": 1.8484210526315793e-05,
"loss": 0.6709,
"step": 244
},
{
"epoch": 0.3786707882534776,
"grad_norm": 12.4375,
"learning_rate": 1.8473684210526317e-05,
"loss": 0.6225,
"step": 245
},
{
"epoch": 0.3802163833075734,
"grad_norm": 13.1875,
"learning_rate": 1.8463157894736842e-05,
"loss": 0.6671,
"step": 246
},
{
"epoch": 0.38176197836166925,
"grad_norm": 12.5625,
"learning_rate": 1.845263157894737e-05,
"loss": 0.6965,
"step": 247
},
{
"epoch": 0.38330757341576505,
"grad_norm": 15.125,
"learning_rate": 1.8442105263157898e-05,
"loss": 0.6453,
"step": 248
},
{
"epoch": 0.3848531684698609,
"grad_norm": 12.125,
"learning_rate": 1.8431578947368423e-05,
"loss": 0.7017,
"step": 249
},
{
"epoch": 0.38639876352395675,
"grad_norm": 11.875,
"learning_rate": 1.8421052631578947e-05,
"loss": 0.6827,
"step": 250
},
{
"epoch": 0.38794435857805254,
"grad_norm": 12.3125,
"learning_rate": 1.8410526315789475e-05,
"loss": 0.6371,
"step": 251
},
{
"epoch": 0.3894899536321484,
"grad_norm": 10.8125,
"learning_rate": 1.8400000000000003e-05,
"loss": 0.681,
"step": 252
},
{
"epoch": 0.3910355486862442,
"grad_norm": 11.0625,
"learning_rate": 1.8389473684210528e-05,
"loss": 0.6629,
"step": 253
},
{
"epoch": 0.39258114374034003,
"grad_norm": 10.1875,
"learning_rate": 1.8378947368421053e-05,
"loss": 0.6976,
"step": 254
},
{
"epoch": 0.3941267387944359,
"grad_norm": 12.5625,
"learning_rate": 1.836842105263158e-05,
"loss": 0.6869,
"step": 255
},
{
"epoch": 0.3956723338485317,
"grad_norm": 20.75,
"learning_rate": 1.8357894736842105e-05,
"loss": 0.6835,
"step": 256
},
{
"epoch": 0.3972179289026275,
"grad_norm": 12.3125,
"learning_rate": 1.8347368421052633e-05,
"loss": 0.7118,
"step": 257
},
{
"epoch": 0.3987635239567233,
"grad_norm": 12.75,
"learning_rate": 1.8336842105263158e-05,
"loss": 0.7664,
"step": 258
},
{
"epoch": 0.40030911901081917,
"grad_norm": 10.875,
"learning_rate": 1.8326315789473686e-05,
"loss": 0.653,
"step": 259
},
{
"epoch": 0.401854714064915,
"grad_norm": 10.9375,
"learning_rate": 1.831578947368421e-05,
"loss": 0.6779,
"step": 260
},
{
"epoch": 0.4034003091190108,
"grad_norm": 10.9375,
"learning_rate": 1.830526315789474e-05,
"loss": 0.611,
"step": 261
},
{
"epoch": 0.40494590417310666,
"grad_norm": 9.75,
"learning_rate": 1.8294736842105267e-05,
"loss": 0.6194,
"step": 262
},
{
"epoch": 0.40649149922720246,
"grad_norm": 11.25,
"learning_rate": 1.828421052631579e-05,
"loss": 0.7023,
"step": 263
},
{
"epoch": 0.4080370942812983,
"grad_norm": 10.375,
"learning_rate": 1.8273684210526316e-05,
"loss": 0.6753,
"step": 264
},
{
"epoch": 0.4095826893353941,
"grad_norm": 10.75,
"learning_rate": 1.8263157894736844e-05,
"loss": 0.7059,
"step": 265
},
{
"epoch": 0.41112828438948995,
"grad_norm": 10.5625,
"learning_rate": 1.8252631578947372e-05,
"loss": 0.688,
"step": 266
},
{
"epoch": 0.4126738794435858,
"grad_norm": 11.25,
"learning_rate": 1.8242105263157897e-05,
"loss": 0.7569,
"step": 267
},
{
"epoch": 0.4142194744976816,
"grad_norm": 9.75,
"learning_rate": 1.823157894736842e-05,
"loss": 0.66,
"step": 268
},
{
"epoch": 0.41576506955177744,
"grad_norm": 11.25,
"learning_rate": 1.822105263157895e-05,
"loss": 0.5989,
"step": 269
},
{
"epoch": 0.41731066460587324,
"grad_norm": 10.1875,
"learning_rate": 1.8210526315789477e-05,
"loss": 0.6503,
"step": 270
},
{
"epoch": 0.4188562596599691,
"grad_norm": 12.9375,
"learning_rate": 1.8200000000000002e-05,
"loss": 0.7197,
"step": 271
},
{
"epoch": 0.42040185471406494,
"grad_norm": 11.3125,
"learning_rate": 1.8189473684210527e-05,
"loss": 0.6916,
"step": 272
},
{
"epoch": 0.42194744976816073,
"grad_norm": 11.4375,
"learning_rate": 1.8178947368421055e-05,
"loss": 0.658,
"step": 273
},
{
"epoch": 0.4234930448222566,
"grad_norm": 10.5625,
"learning_rate": 1.816842105263158e-05,
"loss": 0.6581,
"step": 274
},
{
"epoch": 0.4250386398763524,
"grad_norm": 12.375,
"learning_rate": 1.8157894736842107e-05,
"loss": 0.6791,
"step": 275
},
{
"epoch": 0.4265842349304482,
"grad_norm": 12.1875,
"learning_rate": 1.8147368421052632e-05,
"loss": 0.6634,
"step": 276
},
{
"epoch": 0.4281298299845441,
"grad_norm": 21.375,
"learning_rate": 1.813684210526316e-05,
"loss": 0.743,
"step": 277
},
{
"epoch": 0.42967542503863987,
"grad_norm": 11.6875,
"learning_rate": 1.8126315789473685e-05,
"loss": 0.7256,
"step": 278
},
{
"epoch": 0.4312210200927357,
"grad_norm": 10.6875,
"learning_rate": 1.8115789473684213e-05,
"loss": 0.6024,
"step": 279
},
{
"epoch": 0.4327666151468315,
"grad_norm": 11.375,
"learning_rate": 1.810526315789474e-05,
"loss": 0.7624,
"step": 280
},
{
"epoch": 0.43431221020092736,
"grad_norm": 12.0625,
"learning_rate": 1.8094736842105265e-05,
"loss": 0.6995,
"step": 281
},
{
"epoch": 0.43585780525502316,
"grad_norm": 11.1875,
"learning_rate": 1.808421052631579e-05,
"loss": 0.6469,
"step": 282
},
{
"epoch": 0.437403400309119,
"grad_norm": 11.3125,
"learning_rate": 1.8073684210526318e-05,
"loss": 0.629,
"step": 283
},
{
"epoch": 0.43894899536321486,
"grad_norm": 13.5625,
"learning_rate": 1.8063157894736846e-05,
"loss": 0.7263,
"step": 284
},
{
"epoch": 0.44049459041731065,
"grad_norm": 10.125,
"learning_rate": 1.805263157894737e-05,
"loss": 0.6615,
"step": 285
},
{
"epoch": 0.4420401854714065,
"grad_norm": 9.875,
"learning_rate": 1.8042105263157895e-05,
"loss": 0.6439,
"step": 286
},
{
"epoch": 0.4435857805255023,
"grad_norm": 10.625,
"learning_rate": 1.8031578947368423e-05,
"loss": 0.571,
"step": 287
},
{
"epoch": 0.44513137557959814,
"grad_norm": 10.875,
"learning_rate": 1.8021052631578948e-05,
"loss": 0.703,
"step": 288
},
{
"epoch": 0.446676970633694,
"grad_norm": 10.875,
"learning_rate": 1.8010526315789476e-05,
"loss": 0.6262,
"step": 289
},
{
"epoch": 0.4482225656877898,
"grad_norm": 11.8125,
"learning_rate": 1.8e-05,
"loss": 0.6794,
"step": 290
},
{
"epoch": 0.44976816074188564,
"grad_norm": 11.6875,
"learning_rate": 1.798947368421053e-05,
"loss": 0.6678,
"step": 291
},
{
"epoch": 0.45131375579598143,
"grad_norm": 10.6875,
"learning_rate": 1.7978947368421053e-05,
"loss": 0.641,
"step": 292
},
{
"epoch": 0.4528593508500773,
"grad_norm": 9.625,
"learning_rate": 1.7968421052631578e-05,
"loss": 0.6154,
"step": 293
},
{
"epoch": 0.45440494590417313,
"grad_norm": 11.6875,
"learning_rate": 1.795789473684211e-05,
"loss": 0.7398,
"step": 294
},
{
"epoch": 0.4559505409582689,
"grad_norm": 10.875,
"learning_rate": 1.7947368421052634e-05,
"loss": 0.7055,
"step": 295
},
{
"epoch": 0.4574961360123648,
"grad_norm": 10.4375,
"learning_rate": 1.793684210526316e-05,
"loss": 0.6077,
"step": 296
},
{
"epoch": 0.45904173106646057,
"grad_norm": 12.375,
"learning_rate": 1.7926315789473686e-05,
"loss": 0.736,
"step": 297
},
{
"epoch": 0.4605873261205564,
"grad_norm": 11.25,
"learning_rate": 1.7915789473684214e-05,
"loss": 0.6597,
"step": 298
},
{
"epoch": 0.46213292117465227,
"grad_norm": 10.4375,
"learning_rate": 1.790526315789474e-05,
"loss": 0.5764,
"step": 299
},
{
"epoch": 0.46367851622874806,
"grad_norm": 11.875,
"learning_rate": 1.7894736842105264e-05,
"loss": 0.6833,
"step": 300
},
{
"epoch": 0.4652241112828439,
"grad_norm": 14.0625,
"learning_rate": 1.7884210526315792e-05,
"loss": 0.5959,
"step": 301
},
{
"epoch": 0.4667697063369397,
"grad_norm": 13.5625,
"learning_rate": 1.7873684210526316e-05,
"loss": 0.6949,
"step": 302
},
{
"epoch": 0.46831530139103555,
"grad_norm": 10.875,
"learning_rate": 1.7863157894736844e-05,
"loss": 0.6948,
"step": 303
},
{
"epoch": 0.46986089644513135,
"grad_norm": 11.0625,
"learning_rate": 1.785263157894737e-05,
"loss": 0.6381,
"step": 304
},
{
"epoch": 0.4714064914992272,
"grad_norm": 10.4375,
"learning_rate": 1.7842105263157897e-05,
"loss": 0.6298,
"step": 305
},
{
"epoch": 0.47295208655332305,
"grad_norm": 12.9375,
"learning_rate": 1.7831578947368422e-05,
"loss": 0.6889,
"step": 306
},
{
"epoch": 0.47449768160741884,
"grad_norm": 11.0,
"learning_rate": 1.7821052631578946e-05,
"loss": 0.6346,
"step": 307
},
{
"epoch": 0.4760432766615147,
"grad_norm": 10.0625,
"learning_rate": 1.7810526315789474e-05,
"loss": 0.6656,
"step": 308
},
{
"epoch": 0.4775888717156105,
"grad_norm": 12.125,
"learning_rate": 1.7800000000000002e-05,
"loss": 0.635,
"step": 309
},
{
"epoch": 0.47913446676970634,
"grad_norm": 11.375,
"learning_rate": 1.7789473684210527e-05,
"loss": 0.6428,
"step": 310
},
{
"epoch": 0.4806800618238022,
"grad_norm": 10.375,
"learning_rate": 1.7778947368421055e-05,
"loss": 0.6268,
"step": 311
},
{
"epoch": 0.482225656877898,
"grad_norm": 16.75,
"learning_rate": 1.7768421052631583e-05,
"loss": 0.7004,
"step": 312
},
{
"epoch": 0.48377125193199383,
"grad_norm": 11.5,
"learning_rate": 1.7757894736842108e-05,
"loss": 0.6226,
"step": 313
},
{
"epoch": 0.4853168469860896,
"grad_norm": 10.0,
"learning_rate": 1.7747368421052632e-05,
"loss": 0.5717,
"step": 314
},
{
"epoch": 0.4868624420401855,
"grad_norm": 15.4375,
"learning_rate": 1.773684210526316e-05,
"loss": 0.5794,
"step": 315
},
{
"epoch": 0.4884080370942813,
"grad_norm": 9.9375,
"learning_rate": 1.7726315789473685e-05,
"loss": 0.6375,
"step": 316
},
{
"epoch": 0.4899536321483771,
"grad_norm": 10.8125,
"learning_rate": 1.7715789473684213e-05,
"loss": 0.6641,
"step": 317
},
{
"epoch": 0.49149922720247297,
"grad_norm": 9.9375,
"learning_rate": 1.7705263157894738e-05,
"loss": 0.7167,
"step": 318
},
{
"epoch": 0.49304482225656876,
"grad_norm": 10.0625,
"learning_rate": 1.7694736842105266e-05,
"loss": 0.6788,
"step": 319
},
{
"epoch": 0.4945904173106646,
"grad_norm": 10.6875,
"learning_rate": 1.768421052631579e-05,
"loss": 0.6849,
"step": 320
},
{
"epoch": 0.49613601236476046,
"grad_norm": 9.8125,
"learning_rate": 1.7673684210526315e-05,
"loss": 0.6135,
"step": 321
},
{
"epoch": 0.49768160741885625,
"grad_norm": 10.25,
"learning_rate": 1.7663157894736843e-05,
"loss": 0.6388,
"step": 322
},
{
"epoch": 0.4992272024729521,
"grad_norm": 10.9375,
"learning_rate": 1.765263157894737e-05,
"loss": 0.5765,
"step": 323
},
{
"epoch": 0.500772797527048,
"grad_norm": 10.625,
"learning_rate": 1.7642105263157896e-05,
"loss": 0.6513,
"step": 324
},
{
"epoch": 0.5023183925811437,
"grad_norm": 11.4375,
"learning_rate": 1.763157894736842e-05,
"loss": 0.7304,
"step": 325
},
{
"epoch": 0.5038639876352395,
"grad_norm": 10.875,
"learning_rate": 1.7621052631578948e-05,
"loss": 0.6222,
"step": 326
},
{
"epoch": 0.5054095826893354,
"grad_norm": 10.1875,
"learning_rate": 1.7610526315789476e-05,
"loss": 0.6632,
"step": 327
},
{
"epoch": 0.5069551777434312,
"grad_norm": 10.9375,
"learning_rate": 1.76e-05,
"loss": 0.6116,
"step": 328
},
{
"epoch": 0.508500772797527,
"grad_norm": 15.4375,
"learning_rate": 1.758947368421053e-05,
"loss": 0.7024,
"step": 329
},
{
"epoch": 0.5100463678516228,
"grad_norm": 10.75,
"learning_rate": 1.7578947368421054e-05,
"loss": 0.599,
"step": 330
},
{
"epoch": 0.5115919629057187,
"grad_norm": 11.9375,
"learning_rate": 1.756842105263158e-05,
"loss": 0.6844,
"step": 331
},
{
"epoch": 0.5131375579598145,
"grad_norm": 12.125,
"learning_rate": 1.7557894736842106e-05,
"loss": 0.7101,
"step": 332
},
{
"epoch": 0.5146831530139103,
"grad_norm": 9.875,
"learning_rate": 1.7547368421052634e-05,
"loss": 0.6213,
"step": 333
},
{
"epoch": 0.5162287480680062,
"grad_norm": 10.5625,
"learning_rate": 1.753684210526316e-05,
"loss": 0.565,
"step": 334
},
{
"epoch": 0.517774343122102,
"grad_norm": 10.1875,
"learning_rate": 1.7526315789473683e-05,
"loss": 0.6775,
"step": 335
},
{
"epoch": 0.5193199381761978,
"grad_norm": 10.25,
"learning_rate": 1.751578947368421e-05,
"loss": 0.6364,
"step": 336
},
{
"epoch": 0.5208655332302936,
"grad_norm": 11.375,
"learning_rate": 1.750526315789474e-05,
"loss": 0.7518,
"step": 337
},
{
"epoch": 0.5224111282843895,
"grad_norm": 10.5,
"learning_rate": 1.7494736842105264e-05,
"loss": 0.6518,
"step": 338
},
{
"epoch": 0.5239567233384853,
"grad_norm": 12.75,
"learning_rate": 1.748421052631579e-05,
"loss": 0.6856,
"step": 339
},
{
"epoch": 0.5255023183925811,
"grad_norm": 10.25,
"learning_rate": 1.7473684210526317e-05,
"loss": 0.6262,
"step": 340
},
{
"epoch": 0.527047913446677,
"grad_norm": 10.9375,
"learning_rate": 1.7463157894736845e-05,
"loss": 0.6176,
"step": 341
},
{
"epoch": 0.5285935085007728,
"grad_norm": 9.375,
"learning_rate": 1.745263157894737e-05,
"loss": 0.5422,
"step": 342
},
{
"epoch": 0.5301391035548686,
"grad_norm": 11.0,
"learning_rate": 1.7442105263157894e-05,
"loss": 0.6596,
"step": 343
},
{
"epoch": 0.5316846986089645,
"grad_norm": 14.5,
"learning_rate": 1.7431578947368422e-05,
"loss": 0.672,
"step": 344
},
{
"epoch": 0.5332302936630603,
"grad_norm": 10.3125,
"learning_rate": 1.742105263157895e-05,
"loss": 0.6334,
"step": 345
},
{
"epoch": 0.5347758887171561,
"grad_norm": 12.0625,
"learning_rate": 1.7410526315789475e-05,
"loss": 0.7048,
"step": 346
},
{
"epoch": 0.5363214837712519,
"grad_norm": 11.0,
"learning_rate": 1.7400000000000003e-05,
"loss": 0.6699,
"step": 347
},
{
"epoch": 0.5378670788253478,
"grad_norm": 10.9375,
"learning_rate": 1.7389473684210527e-05,
"loss": 0.682,
"step": 348
},
{
"epoch": 0.5394126738794436,
"grad_norm": 11.0,
"learning_rate": 1.7378947368421052e-05,
"loss": 0.624,
"step": 349
},
{
"epoch": 0.5409582689335394,
"grad_norm": 11.5,
"learning_rate": 1.736842105263158e-05,
"loss": 0.5733,
"step": 350
},
{
"epoch": 0.5425038639876353,
"grad_norm": 10.9375,
"learning_rate": 1.7357894736842108e-05,
"loss": 0.6272,
"step": 351
},
{
"epoch": 0.5440494590417311,
"grad_norm": 12.5,
"learning_rate": 1.7347368421052633e-05,
"loss": 0.6308,
"step": 352
},
{
"epoch": 0.5455950540958269,
"grad_norm": 9.25,
"learning_rate": 1.7336842105263157e-05,
"loss": 0.559,
"step": 353
},
{
"epoch": 0.5471406491499228,
"grad_norm": 13.375,
"learning_rate": 1.7326315789473685e-05,
"loss": 0.6935,
"step": 354
},
{
"epoch": 0.5486862442040186,
"grad_norm": 12.1875,
"learning_rate": 1.7315789473684213e-05,
"loss": 0.6228,
"step": 355
},
{
"epoch": 0.5502318392581144,
"grad_norm": 11.5625,
"learning_rate": 1.7305263157894738e-05,
"loss": 0.5491,
"step": 356
},
{
"epoch": 0.5517774343122102,
"grad_norm": 11.625,
"learning_rate": 1.7294736842105263e-05,
"loss": 0.6608,
"step": 357
},
{
"epoch": 0.5533230293663061,
"grad_norm": 11.875,
"learning_rate": 1.728421052631579e-05,
"loss": 0.6535,
"step": 358
},
{
"epoch": 0.5548686244204019,
"grad_norm": 11.25,
"learning_rate": 1.727368421052632e-05,
"loss": 0.6686,
"step": 359
},
{
"epoch": 0.5564142194744977,
"grad_norm": 16.875,
"learning_rate": 1.7263157894736843e-05,
"loss": 0.6258,
"step": 360
},
{
"epoch": 0.5579598145285936,
"grad_norm": 11.25,
"learning_rate": 1.725263157894737e-05,
"loss": 0.6164,
"step": 361
},
{
"epoch": 0.5595054095826894,
"grad_norm": 10.75,
"learning_rate": 1.7242105263157896e-05,
"loss": 0.6558,
"step": 362
},
{
"epoch": 0.5610510046367851,
"grad_norm": 16.375,
"learning_rate": 1.723157894736842e-05,
"loss": 0.6748,
"step": 363
},
{
"epoch": 0.5625965996908809,
"grad_norm": 10.875,
"learning_rate": 1.722105263157895e-05,
"loss": 0.6224,
"step": 364
},
{
"epoch": 0.5641421947449768,
"grad_norm": 11.0,
"learning_rate": 1.7210526315789477e-05,
"loss": 0.5964,
"step": 365
},
{
"epoch": 0.5656877897990726,
"grad_norm": 11.25,
"learning_rate": 1.72e-05,
"loss": 0.5954,
"step": 366
},
{
"epoch": 0.5672333848531684,
"grad_norm": 11.3125,
"learning_rate": 1.7189473684210526e-05,
"loss": 0.703,
"step": 367
},
{
"epoch": 0.5687789799072643,
"grad_norm": 10.1875,
"learning_rate": 1.7178947368421054e-05,
"loss": 0.5393,
"step": 368
},
{
"epoch": 0.5703245749613601,
"grad_norm": 9.625,
"learning_rate": 1.7168421052631582e-05,
"loss": 0.5729,
"step": 369
},
{
"epoch": 0.5718701700154559,
"grad_norm": 11.9375,
"learning_rate": 1.7157894736842107e-05,
"loss": 0.5776,
"step": 370
},
{
"epoch": 0.5734157650695518,
"grad_norm": 10.5625,
"learning_rate": 1.714736842105263e-05,
"loss": 0.6248,
"step": 371
},
{
"epoch": 0.5749613601236476,
"grad_norm": 11.5,
"learning_rate": 1.713684210526316e-05,
"loss": 0.6055,
"step": 372
},
{
"epoch": 0.5765069551777434,
"grad_norm": 9.9375,
"learning_rate": 1.7126315789473687e-05,
"loss": 0.5462,
"step": 373
},
{
"epoch": 0.5780525502318392,
"grad_norm": 12.0,
"learning_rate": 1.7115789473684212e-05,
"loss": 0.6324,
"step": 374
},
{
"epoch": 0.5795981452859351,
"grad_norm": 10.0625,
"learning_rate": 1.7105263157894737e-05,
"loss": 0.5597,
"step": 375
},
{
"epoch": 0.5811437403400309,
"grad_norm": 10.5625,
"learning_rate": 1.7094736842105265e-05,
"loss": 0.5971,
"step": 376
},
{
"epoch": 0.5826893353941267,
"grad_norm": 10.1875,
"learning_rate": 1.708421052631579e-05,
"loss": 0.592,
"step": 377
},
{
"epoch": 0.5842349304482226,
"grad_norm": 9.125,
"learning_rate": 1.7073684210526317e-05,
"loss": 0.5512,
"step": 378
},
{
"epoch": 0.5857805255023184,
"grad_norm": 10.4375,
"learning_rate": 1.7063157894736845e-05,
"loss": 0.6129,
"step": 379
},
{
"epoch": 0.5873261205564142,
"grad_norm": 11.0,
"learning_rate": 1.705263157894737e-05,
"loss": 0.6398,
"step": 380
},
{
"epoch": 0.58887171561051,
"grad_norm": 13.375,
"learning_rate": 1.7042105263157895e-05,
"loss": 0.6193,
"step": 381
},
{
"epoch": 0.5904173106646059,
"grad_norm": 12.4375,
"learning_rate": 1.7031578947368423e-05,
"loss": 0.5899,
"step": 382
},
{
"epoch": 0.5919629057187017,
"grad_norm": 11.625,
"learning_rate": 1.702105263157895e-05,
"loss": 0.6047,
"step": 383
},
{
"epoch": 0.5935085007727975,
"grad_norm": 12.3125,
"learning_rate": 1.7010526315789475e-05,
"loss": 0.6289,
"step": 384
},
{
"epoch": 0.5950540958268934,
"grad_norm": 14.0,
"learning_rate": 1.7e-05,
"loss": 0.7105,
"step": 385
},
{
"epoch": 0.5965996908809892,
"grad_norm": 10.8125,
"learning_rate": 1.6989473684210528e-05,
"loss": 0.6865,
"step": 386
},
{
"epoch": 0.598145285935085,
"grad_norm": 10.0625,
"learning_rate": 1.6978947368421056e-05,
"loss": 0.5632,
"step": 387
},
{
"epoch": 0.5996908809891809,
"grad_norm": 10.625,
"learning_rate": 1.696842105263158e-05,
"loss": 0.6353,
"step": 388
},
{
"epoch": 0.6012364760432767,
"grad_norm": 10.4375,
"learning_rate": 1.6957894736842105e-05,
"loss": 0.5294,
"step": 389
},
{
"epoch": 0.6027820710973725,
"grad_norm": 9.8125,
"learning_rate": 1.6947368421052633e-05,
"loss": 0.5474,
"step": 390
},
{
"epoch": 0.6043276661514683,
"grad_norm": 11.4375,
"learning_rate": 1.6936842105263158e-05,
"loss": 0.6746,
"step": 391
},
{
"epoch": 0.6058732612055642,
"grad_norm": 10.5,
"learning_rate": 1.6926315789473686e-05,
"loss": 0.6166,
"step": 392
},
{
"epoch": 0.60741885625966,
"grad_norm": 10.9375,
"learning_rate": 1.691578947368421e-05,
"loss": 0.6486,
"step": 393
},
{
"epoch": 0.6089644513137558,
"grad_norm": 11.375,
"learning_rate": 1.690526315789474e-05,
"loss": 0.6609,
"step": 394
},
{
"epoch": 0.6105100463678517,
"grad_norm": 10.6875,
"learning_rate": 1.6894736842105263e-05,
"loss": 0.63,
"step": 395
},
{
"epoch": 0.6120556414219475,
"grad_norm": 11.3125,
"learning_rate": 1.688421052631579e-05,
"loss": 0.6215,
"step": 396
},
{
"epoch": 0.6136012364760433,
"grad_norm": 20.0,
"learning_rate": 1.687368421052632e-05,
"loss": 0.6329,
"step": 397
},
{
"epoch": 0.615146831530139,
"grad_norm": 14.3125,
"learning_rate": 1.6863157894736844e-05,
"loss": 0.6106,
"step": 398
},
{
"epoch": 0.616692426584235,
"grad_norm": 10.25,
"learning_rate": 1.685263157894737e-05,
"loss": 0.6392,
"step": 399
},
{
"epoch": 0.6182380216383307,
"grad_norm": 12.5625,
"learning_rate": 1.6842105263157896e-05,
"loss": 0.6487,
"step": 400
},
{
"epoch": 0.6197836166924265,
"grad_norm": 11.0,
"learning_rate": 1.6831578947368424e-05,
"loss": 0.6431,
"step": 401
},
{
"epoch": 0.6213292117465224,
"grad_norm": 11.3125,
"learning_rate": 1.682105263157895e-05,
"loss": 0.6677,
"step": 402
},
{
"epoch": 0.6228748068006182,
"grad_norm": 16.0,
"learning_rate": 1.6810526315789474e-05,
"loss": 0.5728,
"step": 403
},
{
"epoch": 0.624420401854714,
"grad_norm": 9.8125,
"learning_rate": 1.6800000000000002e-05,
"loss": 0.6026,
"step": 404
},
{
"epoch": 0.6259659969088099,
"grad_norm": 9.875,
"learning_rate": 1.6789473684210526e-05,
"loss": 0.631,
"step": 405
},
{
"epoch": 0.6275115919629057,
"grad_norm": 11.3125,
"learning_rate": 1.6778947368421054e-05,
"loss": 0.6257,
"step": 406
},
{
"epoch": 0.6290571870170015,
"grad_norm": 9.75,
"learning_rate": 1.676842105263158e-05,
"loss": 0.6249,
"step": 407
},
{
"epoch": 0.6306027820710973,
"grad_norm": 11.125,
"learning_rate": 1.6757894736842107e-05,
"loss": 0.5465,
"step": 408
},
{
"epoch": 0.6321483771251932,
"grad_norm": 11.8125,
"learning_rate": 1.6747368421052632e-05,
"loss": 0.5982,
"step": 409
},
{
"epoch": 0.633693972179289,
"grad_norm": 10.25,
"learning_rate": 1.673684210526316e-05,
"loss": 0.6516,
"step": 410
},
{
"epoch": 0.6352395672333848,
"grad_norm": 11.9375,
"learning_rate": 1.6726315789473684e-05,
"loss": 0.6128,
"step": 411
},
{
"epoch": 0.6367851622874807,
"grad_norm": 10.8125,
"learning_rate": 1.6715789473684212e-05,
"loss": 0.6512,
"step": 412
},
{
"epoch": 0.6383307573415765,
"grad_norm": 10.625,
"learning_rate": 1.6705263157894737e-05,
"loss": 0.6355,
"step": 413
},
{
"epoch": 0.6398763523956723,
"grad_norm": 11.6875,
"learning_rate": 1.6694736842105265e-05,
"loss": 0.6905,
"step": 414
},
{
"epoch": 0.6414219474497682,
"grad_norm": 10.6875,
"learning_rate": 1.6684210526315793e-05,
"loss": 0.6242,
"step": 415
},
{
"epoch": 0.642967542503864,
"grad_norm": 10.3125,
"learning_rate": 1.6673684210526318e-05,
"loss": 0.588,
"step": 416
},
{
"epoch": 0.6445131375579598,
"grad_norm": 10.5625,
"learning_rate": 1.6663157894736842e-05,
"loss": 0.6352,
"step": 417
},
{
"epoch": 0.6460587326120556,
"grad_norm": 11.8125,
"learning_rate": 1.665263157894737e-05,
"loss": 0.5938,
"step": 418
},
{
"epoch": 0.6476043276661515,
"grad_norm": 11.125,
"learning_rate": 1.66421052631579e-05,
"loss": 0.6157,
"step": 419
},
{
"epoch": 0.6491499227202473,
"grad_norm": 10.625,
"learning_rate": 1.6631578947368423e-05,
"loss": 0.5659,
"step": 420
},
{
"epoch": 0.6506955177743431,
"grad_norm": 10.8125,
"learning_rate": 1.6621052631578948e-05,
"loss": 0.6424,
"step": 421
},
{
"epoch": 0.652241112828439,
"grad_norm": 10.875,
"learning_rate": 1.6610526315789476e-05,
"loss": 0.5827,
"step": 422
},
{
"epoch": 0.6537867078825348,
"grad_norm": 11.5,
"learning_rate": 1.66e-05,
"loss": 0.6227,
"step": 423
},
{
"epoch": 0.6553323029366306,
"grad_norm": 10.5,
"learning_rate": 1.658947368421053e-05,
"loss": 0.6027,
"step": 424
},
{
"epoch": 0.6568778979907264,
"grad_norm": 11.0,
"learning_rate": 1.6578947368421053e-05,
"loss": 0.5889,
"step": 425
},
{
"epoch": 0.6584234930448223,
"grad_norm": 9.875,
"learning_rate": 1.656842105263158e-05,
"loss": 0.6242,
"step": 426
},
{
"epoch": 0.6599690880989181,
"grad_norm": 10.4375,
"learning_rate": 1.6557894736842106e-05,
"loss": 0.5866,
"step": 427
},
{
"epoch": 0.6615146831530139,
"grad_norm": 11.0625,
"learning_rate": 1.6547368421052634e-05,
"loss": 0.596,
"step": 428
},
{
"epoch": 0.6630602782071098,
"grad_norm": 14.0,
"learning_rate": 1.653684210526316e-05,
"loss": 0.5661,
"step": 429
},
{
"epoch": 0.6646058732612056,
"grad_norm": 11.25,
"learning_rate": 1.6526315789473686e-05,
"loss": 0.5982,
"step": 430
},
{
"epoch": 0.6661514683153014,
"grad_norm": 11.75,
"learning_rate": 1.651578947368421e-05,
"loss": 0.5538,
"step": 431
},
{
"epoch": 0.6676970633693973,
"grad_norm": 10.4375,
"learning_rate": 1.650526315789474e-05,
"loss": 0.648,
"step": 432
},
{
"epoch": 0.6692426584234931,
"grad_norm": 14.9375,
"learning_rate": 1.6494736842105267e-05,
"loss": 0.5278,
"step": 433
},
{
"epoch": 0.6707882534775889,
"grad_norm": 10.0,
"learning_rate": 1.648421052631579e-05,
"loss": 0.5821,
"step": 434
},
{
"epoch": 0.6723338485316847,
"grad_norm": 13.1875,
"learning_rate": 1.6473684210526316e-05,
"loss": 0.6284,
"step": 435
},
{
"epoch": 0.6738794435857806,
"grad_norm": 10.625,
"learning_rate": 1.6463157894736844e-05,
"loss": 0.6116,
"step": 436
},
{
"epoch": 0.6754250386398764,
"grad_norm": 9.9375,
"learning_rate": 1.645263157894737e-05,
"loss": 0.5687,
"step": 437
},
{
"epoch": 0.6769706336939721,
"grad_norm": 10.0625,
"learning_rate": 1.6442105263157897e-05,
"loss": 0.5742,
"step": 438
},
{
"epoch": 0.678516228748068,
"grad_norm": 11.375,
"learning_rate": 1.643157894736842e-05,
"loss": 0.5422,
"step": 439
},
{
"epoch": 0.6800618238021638,
"grad_norm": 11.625,
"learning_rate": 1.642105263157895e-05,
"loss": 0.6112,
"step": 440
},
{
"epoch": 0.6816074188562596,
"grad_norm": 12.0,
"learning_rate": 1.6410526315789474e-05,
"loss": 0.5813,
"step": 441
},
{
"epoch": 0.6831530139103554,
"grad_norm": 12.4375,
"learning_rate": 1.64e-05,
"loss": 0.5414,
"step": 442
},
{
"epoch": 0.6846986089644513,
"grad_norm": 13.125,
"learning_rate": 1.6389473684210527e-05,
"loss": 0.6109,
"step": 443
},
{
"epoch": 0.6862442040185471,
"grad_norm": 10.75,
"learning_rate": 1.6378947368421055e-05,
"loss": 0.6202,
"step": 444
},
{
"epoch": 0.6877897990726429,
"grad_norm": 10.625,
"learning_rate": 1.636842105263158e-05,
"loss": 0.5505,
"step": 445
},
{
"epoch": 0.6893353941267388,
"grad_norm": 10.75,
"learning_rate": 1.6357894736842108e-05,
"loss": 0.5557,
"step": 446
},
{
"epoch": 0.6908809891808346,
"grad_norm": 10.0625,
"learning_rate": 1.6347368421052636e-05,
"loss": 0.5864,
"step": 447
},
{
"epoch": 0.6924265842349304,
"grad_norm": 11.125,
"learning_rate": 1.633684210526316e-05,
"loss": 0.6437,
"step": 448
},
{
"epoch": 0.6939721792890263,
"grad_norm": 12.5,
"learning_rate": 1.6326315789473685e-05,
"loss": 0.6422,
"step": 449
},
{
"epoch": 0.6955177743431221,
"grad_norm": 15.5,
"learning_rate": 1.6315789473684213e-05,
"loss": 0.5523,
"step": 450
},
{
"epoch": 0.6970633693972179,
"grad_norm": 11.0,
"learning_rate": 1.6305263157894737e-05,
"loss": 0.5723,
"step": 451
},
{
"epoch": 0.6986089644513137,
"grad_norm": 12.0625,
"learning_rate": 1.6294736842105265e-05,
"loss": 0.6847,
"step": 452
},
{
"epoch": 0.7001545595054096,
"grad_norm": 10.0,
"learning_rate": 1.628421052631579e-05,
"loss": 0.606,
"step": 453
},
{
"epoch": 0.7017001545595054,
"grad_norm": 13.1875,
"learning_rate": 1.6273684210526318e-05,
"loss": 0.6227,
"step": 454
},
{
"epoch": 0.7032457496136012,
"grad_norm": 10.75,
"learning_rate": 1.6263157894736843e-05,
"loss": 0.5474,
"step": 455
},
{
"epoch": 0.7047913446676971,
"grad_norm": 12.375,
"learning_rate": 1.6252631578947367e-05,
"loss": 0.6253,
"step": 456
},
{
"epoch": 0.7063369397217929,
"grad_norm": 11.5,
"learning_rate": 1.6242105263157895e-05,
"loss": 0.5415,
"step": 457
},
{
"epoch": 0.7078825347758887,
"grad_norm": 12.8125,
"learning_rate": 1.6231578947368423e-05,
"loss": 0.5887,
"step": 458
},
{
"epoch": 0.7094281298299846,
"grad_norm": 11.0625,
"learning_rate": 1.6221052631578948e-05,
"loss": 0.5866,
"step": 459
},
{
"epoch": 0.7109737248840804,
"grad_norm": 11.1875,
"learning_rate": 1.6210526315789473e-05,
"loss": 0.6069,
"step": 460
},
{
"epoch": 0.7125193199381762,
"grad_norm": 10.75,
"learning_rate": 1.62e-05,
"loss": 0.5322,
"step": 461
},
{
"epoch": 0.714064914992272,
"grad_norm": 10.375,
"learning_rate": 1.618947368421053e-05,
"loss": 0.5771,
"step": 462
},
{
"epoch": 0.7156105100463679,
"grad_norm": 10.5625,
"learning_rate": 1.6178947368421053e-05,
"loss": 0.5291,
"step": 463
},
{
"epoch": 0.7171561051004637,
"grad_norm": 10.9375,
"learning_rate": 1.616842105263158e-05,
"loss": 0.5948,
"step": 464
},
{
"epoch": 0.7187017001545595,
"grad_norm": 13.0625,
"learning_rate": 1.6157894736842106e-05,
"loss": 0.5395,
"step": 465
},
{
"epoch": 0.7202472952086554,
"grad_norm": 9.875,
"learning_rate": 1.6147368421052634e-05,
"loss": 0.524,
"step": 466
},
{
"epoch": 0.7217928902627512,
"grad_norm": 9.625,
"learning_rate": 1.613684210526316e-05,
"loss": 0.5742,
"step": 467
},
{
"epoch": 0.723338485316847,
"grad_norm": 11.0,
"learning_rate": 1.6126315789473687e-05,
"loss": 0.5524,
"step": 468
},
{
"epoch": 0.7248840803709428,
"grad_norm": 10.25,
"learning_rate": 1.611578947368421e-05,
"loss": 0.601,
"step": 469
},
{
"epoch": 0.7264296754250387,
"grad_norm": 10.5625,
"learning_rate": 1.6105263157894736e-05,
"loss": 0.528,
"step": 470
},
{
"epoch": 0.7279752704791345,
"grad_norm": 10.6875,
"learning_rate": 1.6094736842105264e-05,
"loss": 0.6095,
"step": 471
},
{
"epoch": 0.7295208655332303,
"grad_norm": 9.625,
"learning_rate": 1.6084210526315792e-05,
"loss": 0.5767,
"step": 472
},
{
"epoch": 0.7310664605873262,
"grad_norm": 10.9375,
"learning_rate": 1.6073684210526317e-05,
"loss": 0.6444,
"step": 473
},
{
"epoch": 0.732612055641422,
"grad_norm": 11.0,
"learning_rate": 1.606315789473684e-05,
"loss": 0.5567,
"step": 474
},
{
"epoch": 0.7341576506955177,
"grad_norm": 10.4375,
"learning_rate": 1.605263157894737e-05,
"loss": 0.5302,
"step": 475
},
{
"epoch": 0.7357032457496137,
"grad_norm": 11.0,
"learning_rate": 1.6042105263157897e-05,
"loss": 0.6625,
"step": 476
},
{
"epoch": 0.7372488408037094,
"grad_norm": 10.0,
"learning_rate": 1.6031578947368422e-05,
"loss": 0.5529,
"step": 477
},
{
"epoch": 0.7387944358578052,
"grad_norm": 9.5625,
"learning_rate": 1.6021052631578947e-05,
"loss": 0.5262,
"step": 478
},
{
"epoch": 0.740340030911901,
"grad_norm": 11.9375,
"learning_rate": 1.6010526315789475e-05,
"loss": 0.6064,
"step": 479
},
{
"epoch": 0.7418856259659969,
"grad_norm": 10.9375,
"learning_rate": 1.6000000000000003e-05,
"loss": 0.5494,
"step": 480
},
{
"epoch": 0.7434312210200927,
"grad_norm": 10.5625,
"learning_rate": 1.5989473684210527e-05,
"loss": 0.6263,
"step": 481
},
{
"epoch": 0.7449768160741885,
"grad_norm": 21.75,
"learning_rate": 1.5978947368421055e-05,
"loss": 0.5636,
"step": 482
},
{
"epoch": 0.7465224111282844,
"grad_norm": 10.375,
"learning_rate": 1.596842105263158e-05,
"loss": 0.5776,
"step": 483
},
{
"epoch": 0.7480680061823802,
"grad_norm": 10.5,
"learning_rate": 1.5957894736842105e-05,
"loss": 0.4751,
"step": 484
},
{
"epoch": 0.749613601236476,
"grad_norm": 10.625,
"learning_rate": 1.5947368421052633e-05,
"loss": 0.5703,
"step": 485
},
{
"epoch": 0.7511591962905718,
"grad_norm": 10.8125,
"learning_rate": 1.593684210526316e-05,
"loss": 0.5683,
"step": 486
},
{
"epoch": 0.7527047913446677,
"grad_norm": 11.625,
"learning_rate": 1.5926315789473685e-05,
"loss": 0.6533,
"step": 487
},
{
"epoch": 0.7542503863987635,
"grad_norm": 9.3125,
"learning_rate": 1.591578947368421e-05,
"loss": 0.563,
"step": 488
},
{
"epoch": 0.7557959814528593,
"grad_norm": 10.4375,
"learning_rate": 1.5905263157894738e-05,
"loss": 0.6214,
"step": 489
},
{
"epoch": 0.7573415765069552,
"grad_norm": 12.0,
"learning_rate": 1.5894736842105266e-05,
"loss": 0.5328,
"step": 490
},
{
"epoch": 0.758887171561051,
"grad_norm": 10.375,
"learning_rate": 1.588421052631579e-05,
"loss": 0.6254,
"step": 491
},
{
"epoch": 0.7604327666151468,
"grad_norm": 10.875,
"learning_rate": 1.5873684210526315e-05,
"loss": 0.6091,
"step": 492
},
{
"epoch": 0.7619783616692427,
"grad_norm": 9.9375,
"learning_rate": 1.5863157894736843e-05,
"loss": 0.6014,
"step": 493
},
{
"epoch": 0.7635239567233385,
"grad_norm": 10.0,
"learning_rate": 1.585263157894737e-05,
"loss": 0.5707,
"step": 494
},
{
"epoch": 0.7650695517774343,
"grad_norm": 11.125,
"learning_rate": 1.5842105263157896e-05,
"loss": 0.6192,
"step": 495
},
{
"epoch": 0.7666151468315301,
"grad_norm": 10.9375,
"learning_rate": 1.5831578947368424e-05,
"loss": 0.666,
"step": 496
},
{
"epoch": 0.768160741885626,
"grad_norm": 9.4375,
"learning_rate": 1.582105263157895e-05,
"loss": 0.6,
"step": 497
},
{
"epoch": 0.7697063369397218,
"grad_norm": 10.125,
"learning_rate": 1.5810526315789473e-05,
"loss": 0.5649,
"step": 498
},
{
"epoch": 0.7712519319938176,
"grad_norm": 13.0625,
"learning_rate": 1.58e-05,
"loss": 0.5569,
"step": 499
},
{
"epoch": 0.7727975270479135,
"grad_norm": 11.0625,
"learning_rate": 1.578947368421053e-05,
"loss": 0.5915,
"step": 500
},
{
"epoch": 0.7743431221020093,
"grad_norm": 10.4375,
"learning_rate": 1.5778947368421054e-05,
"loss": 0.5871,
"step": 501
},
{
"epoch": 0.7758887171561051,
"grad_norm": 12.0625,
"learning_rate": 1.576842105263158e-05,
"loss": 0.566,
"step": 502
},
{
"epoch": 0.7774343122102009,
"grad_norm": 12.75,
"learning_rate": 1.5757894736842107e-05,
"loss": 0.5629,
"step": 503
},
{
"epoch": 0.7789799072642968,
"grad_norm": 11.25,
"learning_rate": 1.5747368421052635e-05,
"loss": 0.6066,
"step": 504
},
{
"epoch": 0.7805255023183926,
"grad_norm": 10.9375,
"learning_rate": 1.573684210526316e-05,
"loss": 0.5439,
"step": 505
},
{
"epoch": 0.7820710973724884,
"grad_norm": 13.875,
"learning_rate": 1.5726315789473684e-05,
"loss": 0.5939,
"step": 506
},
{
"epoch": 0.7836166924265843,
"grad_norm": 10.0625,
"learning_rate": 1.5715789473684212e-05,
"loss": 0.568,
"step": 507
},
{
"epoch": 0.7851622874806801,
"grad_norm": 10.375,
"learning_rate": 1.570526315789474e-05,
"loss": 0.5633,
"step": 508
},
{
"epoch": 0.7867078825347759,
"grad_norm": 9.75,
"learning_rate": 1.5694736842105264e-05,
"loss": 0.4905,
"step": 509
},
{
"epoch": 0.7882534775888718,
"grad_norm": 11.9375,
"learning_rate": 1.568421052631579e-05,
"loss": 0.5626,
"step": 510
},
{
"epoch": 0.7897990726429676,
"grad_norm": 11.5,
"learning_rate": 1.5673684210526317e-05,
"loss": 0.5532,
"step": 511
},
{
"epoch": 0.7913446676970634,
"grad_norm": 9.1875,
"learning_rate": 1.5663157894736842e-05,
"loss": 0.5102,
"step": 512
},
{
"epoch": 0.7928902627511591,
"grad_norm": 10.6875,
"learning_rate": 1.565263157894737e-05,
"loss": 0.6168,
"step": 513
},
{
"epoch": 0.794435857805255,
"grad_norm": 10.25,
"learning_rate": 1.5642105263157898e-05,
"loss": 0.5717,
"step": 514
},
{
"epoch": 0.7959814528593508,
"grad_norm": 10.5625,
"learning_rate": 1.5631578947368422e-05,
"loss": 0.5673,
"step": 515
},
{
"epoch": 0.7975270479134466,
"grad_norm": 9.375,
"learning_rate": 1.5621052631578947e-05,
"loss": 0.5007,
"step": 516
},
{
"epoch": 0.7990726429675425,
"grad_norm": 9.25,
"learning_rate": 1.5610526315789475e-05,
"loss": 0.603,
"step": 517
},
{
"epoch": 0.8006182380216383,
"grad_norm": 12.5,
"learning_rate": 1.5600000000000003e-05,
"loss": 0.5386,
"step": 518
},
{
"epoch": 0.8021638330757341,
"grad_norm": 13.25,
"learning_rate": 1.5589473684210528e-05,
"loss": 0.5935,
"step": 519
},
{
"epoch": 0.80370942812983,
"grad_norm": 10.375,
"learning_rate": 1.5578947368421052e-05,
"loss": 0.5865,
"step": 520
},
{
"epoch": 0.8052550231839258,
"grad_norm": 18.375,
"learning_rate": 1.556842105263158e-05,
"loss": 0.6485,
"step": 521
},
{
"epoch": 0.8068006182380216,
"grad_norm": 11.125,
"learning_rate": 1.555789473684211e-05,
"loss": 0.6273,
"step": 522
},
{
"epoch": 0.8083462132921174,
"grad_norm": 10.5,
"learning_rate": 1.5547368421052633e-05,
"loss": 0.5633,
"step": 523
},
{
"epoch": 0.8098918083462133,
"grad_norm": 11.0,
"learning_rate": 1.5536842105263158e-05,
"loss": 0.5802,
"step": 524
},
{
"epoch": 0.8114374034003091,
"grad_norm": 10.875,
"learning_rate": 1.5526315789473686e-05,
"loss": 0.5109,
"step": 525
},
{
"epoch": 0.8129829984544049,
"grad_norm": 9.9375,
"learning_rate": 1.551578947368421e-05,
"loss": 0.5406,
"step": 526
},
{
"epoch": 0.8145285935085008,
"grad_norm": 10.0625,
"learning_rate": 1.550526315789474e-05,
"loss": 0.5531,
"step": 527
},
{
"epoch": 0.8160741885625966,
"grad_norm": 11.0,
"learning_rate": 1.5494736842105263e-05,
"loss": 0.6518,
"step": 528
},
{
"epoch": 0.8176197836166924,
"grad_norm": 9.75,
"learning_rate": 1.548421052631579e-05,
"loss": 0.5383,
"step": 529
},
{
"epoch": 0.8191653786707882,
"grad_norm": 10.3125,
"learning_rate": 1.5473684210526316e-05,
"loss": 0.5739,
"step": 530
},
{
"epoch": 0.8207109737248841,
"grad_norm": 13.875,
"learning_rate": 1.5463157894736844e-05,
"loss": 0.5225,
"step": 531
},
{
"epoch": 0.8222565687789799,
"grad_norm": 9.3125,
"learning_rate": 1.545263157894737e-05,
"loss": 0.5946,
"step": 532
},
{
"epoch": 0.8238021638330757,
"grad_norm": 10.875,
"learning_rate": 1.5442105263157896e-05,
"loss": 0.5962,
"step": 533
},
{
"epoch": 0.8253477588871716,
"grad_norm": 10.6875,
"learning_rate": 1.543157894736842e-05,
"loss": 0.534,
"step": 534
},
{
"epoch": 0.8268933539412674,
"grad_norm": 10.1875,
"learning_rate": 1.542105263157895e-05,
"loss": 0.5131,
"step": 535
},
{
"epoch": 0.8284389489953632,
"grad_norm": 12.375,
"learning_rate": 1.5410526315789477e-05,
"loss": 0.5951,
"step": 536
},
{
"epoch": 0.8299845440494591,
"grad_norm": 14.375,
"learning_rate": 1.54e-05,
"loss": 0.5676,
"step": 537
},
{
"epoch": 0.8315301391035549,
"grad_norm": 11.875,
"learning_rate": 1.5389473684210526e-05,
"loss": 0.5749,
"step": 538
},
{
"epoch": 0.8330757341576507,
"grad_norm": 10.5625,
"learning_rate": 1.5378947368421054e-05,
"loss": 0.5402,
"step": 539
},
{
"epoch": 0.8346213292117465,
"grad_norm": 10.5625,
"learning_rate": 1.536842105263158e-05,
"loss": 0.5603,
"step": 540
},
{
"epoch": 0.8361669242658424,
"grad_norm": 11.625,
"learning_rate": 1.5357894736842107e-05,
"loss": 0.5902,
"step": 541
},
{
"epoch": 0.8377125193199382,
"grad_norm": 26.0,
"learning_rate": 1.534736842105263e-05,
"loss": 0.5891,
"step": 542
},
{
"epoch": 0.839258114374034,
"grad_norm": 8.625,
"learning_rate": 1.533684210526316e-05,
"loss": 0.5634,
"step": 543
},
{
"epoch": 0.8408037094281299,
"grad_norm": 10.125,
"learning_rate": 1.5326315789473684e-05,
"loss": 0.5918,
"step": 544
},
{
"epoch": 0.8423493044822257,
"grad_norm": 12.3125,
"learning_rate": 1.5315789473684212e-05,
"loss": 0.5903,
"step": 545
},
{
"epoch": 0.8438948995363215,
"grad_norm": 14.1875,
"learning_rate": 1.530526315789474e-05,
"loss": 0.5949,
"step": 546
},
{
"epoch": 0.8454404945904173,
"grad_norm": 11.3125,
"learning_rate": 1.5294736842105265e-05,
"loss": 0.5468,
"step": 547
},
{
"epoch": 0.8469860896445132,
"grad_norm": 62.5,
"learning_rate": 1.528421052631579e-05,
"loss": 0.5462,
"step": 548
},
{
"epoch": 0.848531684698609,
"grad_norm": 8.8125,
"learning_rate": 1.5273684210526318e-05,
"loss": 0.5228,
"step": 549
},
{
"epoch": 0.8500772797527048,
"grad_norm": 10.8125,
"learning_rate": 1.5263157894736846e-05,
"loss": 0.592,
"step": 550
},
{
"epoch": 0.8516228748068007,
"grad_norm": 11.1875,
"learning_rate": 1.525263157894737e-05,
"loss": 0.5414,
"step": 551
},
{
"epoch": 0.8531684698608965,
"grad_norm": 11.4375,
"learning_rate": 1.5242105263157897e-05,
"loss": 0.5329,
"step": 552
},
{
"epoch": 0.8547140649149922,
"grad_norm": 10.5,
"learning_rate": 1.5231578947368421e-05,
"loss": 0.5918,
"step": 553
},
{
"epoch": 0.8562596599690881,
"grad_norm": 10.4375,
"learning_rate": 1.5221052631578948e-05,
"loss": 0.5064,
"step": 554
},
{
"epoch": 0.8578052550231839,
"grad_norm": 10.8125,
"learning_rate": 1.5210526315789476e-05,
"loss": 0.5486,
"step": 555
},
{
"epoch": 0.8593508500772797,
"grad_norm": 12.0625,
"learning_rate": 1.5200000000000002e-05,
"loss": 0.5412,
"step": 556
},
{
"epoch": 0.8608964451313755,
"grad_norm": 9.875,
"learning_rate": 1.5189473684210526e-05,
"loss": 0.5417,
"step": 557
},
{
"epoch": 0.8624420401854714,
"grad_norm": 10.9375,
"learning_rate": 1.5178947368421053e-05,
"loss": 0.6457,
"step": 558
},
{
"epoch": 0.8639876352395672,
"grad_norm": 10.125,
"learning_rate": 1.516842105263158e-05,
"loss": 0.5393,
"step": 559
},
{
"epoch": 0.865533230293663,
"grad_norm": 10.25,
"learning_rate": 1.5157894736842107e-05,
"loss": 0.6142,
"step": 560
},
{
"epoch": 0.8670788253477589,
"grad_norm": 9.6875,
"learning_rate": 1.5147368421052633e-05,
"loss": 0.5455,
"step": 561
},
{
"epoch": 0.8686244204018547,
"grad_norm": 10.75,
"learning_rate": 1.5136842105263158e-05,
"loss": 0.5162,
"step": 562
},
{
"epoch": 0.8701700154559505,
"grad_norm": 10.875,
"learning_rate": 1.5126315789473684e-05,
"loss": 0.6003,
"step": 563
},
{
"epoch": 0.8717156105100463,
"grad_norm": 10.875,
"learning_rate": 1.5115789473684212e-05,
"loss": 0.5808,
"step": 564
},
{
"epoch": 0.8732612055641422,
"grad_norm": 10.3125,
"learning_rate": 1.5105263157894739e-05,
"loss": 0.6351,
"step": 565
},
{
"epoch": 0.874806800618238,
"grad_norm": 11.4375,
"learning_rate": 1.5094736842105263e-05,
"loss": 0.5996,
"step": 566
},
{
"epoch": 0.8763523956723338,
"grad_norm": 10.0625,
"learning_rate": 1.508421052631579e-05,
"loss": 0.5089,
"step": 567
},
{
"epoch": 0.8778979907264297,
"grad_norm": 11.6875,
"learning_rate": 1.5073684210526316e-05,
"loss": 0.4852,
"step": 568
},
{
"epoch": 0.8794435857805255,
"grad_norm": 10.25,
"learning_rate": 1.5063157894736844e-05,
"loss": 0.5551,
"step": 569
},
{
"epoch": 0.8809891808346213,
"grad_norm": 11.6875,
"learning_rate": 1.505263157894737e-05,
"loss": 0.5481,
"step": 570
},
{
"epoch": 0.8825347758887172,
"grad_norm": 11.375,
"learning_rate": 1.5042105263157895e-05,
"loss": 0.6355,
"step": 571
},
{
"epoch": 0.884080370942813,
"grad_norm": 11.375,
"learning_rate": 1.5031578947368421e-05,
"loss": 0.5246,
"step": 572
},
{
"epoch": 0.8856259659969088,
"grad_norm": 12.0,
"learning_rate": 1.502105263157895e-05,
"loss": 0.5812,
"step": 573
},
{
"epoch": 0.8871715610510046,
"grad_norm": 11.125,
"learning_rate": 1.5010526315789476e-05,
"loss": 0.58,
"step": 574
},
{
"epoch": 0.8887171561051005,
"grad_norm": 9.8125,
"learning_rate": 1.5000000000000002e-05,
"loss": 0.5265,
"step": 575
},
{
"epoch": 0.8902627511591963,
"grad_norm": 9.8125,
"learning_rate": 1.4989473684210527e-05,
"loss": 0.5676,
"step": 576
},
{
"epoch": 0.8918083462132921,
"grad_norm": 10.625,
"learning_rate": 1.4978947368421053e-05,
"loss": 0.5765,
"step": 577
},
{
"epoch": 0.893353941267388,
"grad_norm": 10.9375,
"learning_rate": 1.4968421052631581e-05,
"loss": 0.5414,
"step": 578
},
{
"epoch": 0.8948995363214838,
"grad_norm": 11.625,
"learning_rate": 1.4957894736842107e-05,
"loss": 0.5369,
"step": 579
},
{
"epoch": 0.8964451313755796,
"grad_norm": 9.75,
"learning_rate": 1.4947368421052632e-05,
"loss": 0.5327,
"step": 580
},
{
"epoch": 0.8979907264296755,
"grad_norm": 10.875,
"learning_rate": 1.4936842105263158e-05,
"loss": 0.5638,
"step": 581
},
{
"epoch": 0.8995363214837713,
"grad_norm": 15.625,
"learning_rate": 1.4926315789473686e-05,
"loss": 0.5382,
"step": 582
},
{
"epoch": 0.9010819165378671,
"grad_norm": 10.5625,
"learning_rate": 1.4915789473684213e-05,
"loss": 0.551,
"step": 583
},
{
"epoch": 0.9026275115919629,
"grad_norm": 12.5,
"learning_rate": 1.4905263157894739e-05,
"loss": 0.5302,
"step": 584
},
{
"epoch": 0.9041731066460588,
"grad_norm": 11.8125,
"learning_rate": 1.4894736842105264e-05,
"loss": 0.5138,
"step": 585
},
{
"epoch": 0.9057187017001546,
"grad_norm": 10.0625,
"learning_rate": 1.488421052631579e-05,
"loss": 0.5535,
"step": 586
},
{
"epoch": 0.9072642967542504,
"grad_norm": 11.125,
"learning_rate": 1.4873684210526318e-05,
"loss": 0.5963,
"step": 587
},
{
"epoch": 0.9088098918083463,
"grad_norm": 10.875,
"learning_rate": 1.4863157894736844e-05,
"loss": 0.577,
"step": 588
},
{
"epoch": 0.910355486862442,
"grad_norm": 13.1875,
"learning_rate": 1.4852631578947369e-05,
"loss": 0.5407,
"step": 589
},
{
"epoch": 0.9119010819165378,
"grad_norm": 10.875,
"learning_rate": 1.4842105263157895e-05,
"loss": 0.4994,
"step": 590
},
{
"epoch": 0.9134466769706336,
"grad_norm": 9.875,
"learning_rate": 1.4831578947368422e-05,
"loss": 0.5298,
"step": 591
},
{
"epoch": 0.9149922720247295,
"grad_norm": 10.75,
"learning_rate": 1.482105263157895e-05,
"loss": 0.5297,
"step": 592
},
{
"epoch": 0.9165378670788253,
"grad_norm": 10.9375,
"learning_rate": 1.4810526315789476e-05,
"loss": 0.5725,
"step": 593
},
{
"epoch": 0.9180834621329211,
"grad_norm": 10.375,
"learning_rate": 1.48e-05,
"loss": 0.4832,
"step": 594
},
{
"epoch": 0.919629057187017,
"grad_norm": 11.0,
"learning_rate": 1.4789473684210527e-05,
"loss": 0.5872,
"step": 595
},
{
"epoch": 0.9211746522411128,
"grad_norm": 11.0625,
"learning_rate": 1.4778947368421055e-05,
"loss": 0.5893,
"step": 596
},
{
"epoch": 0.9227202472952086,
"grad_norm": 12.8125,
"learning_rate": 1.4768421052631581e-05,
"loss": 0.5816,
"step": 597
},
{
"epoch": 0.9242658423493045,
"grad_norm": 10.5,
"learning_rate": 1.4757894736842106e-05,
"loss": 0.5638,
"step": 598
},
{
"epoch": 0.9258114374034003,
"grad_norm": 14.5,
"learning_rate": 1.4747368421052632e-05,
"loss": 0.5258,
"step": 599
},
{
"epoch": 0.9273570324574961,
"grad_norm": 10.0625,
"learning_rate": 1.4736842105263159e-05,
"loss": 0.5906,
"step": 600
},
{
"epoch": 0.9289026275115919,
"grad_norm": 9.5,
"learning_rate": 1.4726315789473687e-05,
"loss": 0.4826,
"step": 601
},
{
"epoch": 0.9304482225656878,
"grad_norm": 14.6875,
"learning_rate": 1.4715789473684213e-05,
"loss": 0.5613,
"step": 602
},
{
"epoch": 0.9319938176197836,
"grad_norm": 11.125,
"learning_rate": 1.4705263157894738e-05,
"loss": 0.5907,
"step": 603
},
{
"epoch": 0.9335394126738794,
"grad_norm": 10.4375,
"learning_rate": 1.4694736842105264e-05,
"loss": 0.5513,
"step": 604
},
{
"epoch": 0.9350850077279753,
"grad_norm": 9.6875,
"learning_rate": 1.468421052631579e-05,
"loss": 0.5197,
"step": 605
},
{
"epoch": 0.9366306027820711,
"grad_norm": 9.8125,
"learning_rate": 1.4673684210526318e-05,
"loss": 0.6047,
"step": 606
},
{
"epoch": 0.9381761978361669,
"grad_norm": 11.5,
"learning_rate": 1.4663157894736843e-05,
"loss": 0.6445,
"step": 607
},
{
"epoch": 0.9397217928902627,
"grad_norm": 8.625,
"learning_rate": 1.465263157894737e-05,
"loss": 0.5412,
"step": 608
},
{
"epoch": 0.9412673879443586,
"grad_norm": 9.4375,
"learning_rate": 1.4642105263157896e-05,
"loss": 0.5648,
"step": 609
},
{
"epoch": 0.9428129829984544,
"grad_norm": 9.5625,
"learning_rate": 1.4631578947368424e-05,
"loss": 0.6084,
"step": 610
},
{
"epoch": 0.9443585780525502,
"grad_norm": 9.875,
"learning_rate": 1.462105263157895e-05,
"loss": 0.5427,
"step": 611
},
{
"epoch": 0.9459041731066461,
"grad_norm": 11.5625,
"learning_rate": 1.4610526315789474e-05,
"loss": 0.5912,
"step": 612
},
{
"epoch": 0.9474497681607419,
"grad_norm": 10.8125,
"learning_rate": 1.46e-05,
"loss": 0.4611,
"step": 613
},
{
"epoch": 0.9489953632148377,
"grad_norm": 11.125,
"learning_rate": 1.4589473684210527e-05,
"loss": 0.5616,
"step": 614
},
{
"epoch": 0.9505409582689336,
"grad_norm": 9.9375,
"learning_rate": 1.4578947368421055e-05,
"loss": 0.6371,
"step": 615
},
{
"epoch": 0.9520865533230294,
"grad_norm": 11.1875,
"learning_rate": 1.456842105263158e-05,
"loss": 0.5843,
"step": 616
},
{
"epoch": 0.9536321483771252,
"grad_norm": 13.25,
"learning_rate": 1.4557894736842106e-05,
"loss": 0.5961,
"step": 617
},
{
"epoch": 0.955177743431221,
"grad_norm": 10.6875,
"learning_rate": 1.4547368421052632e-05,
"loss": 0.5279,
"step": 618
},
{
"epoch": 0.9567233384853169,
"grad_norm": 10.375,
"learning_rate": 1.4536842105263159e-05,
"loss": 0.6456,
"step": 619
},
{
"epoch": 0.9582689335394127,
"grad_norm": 9.625,
"learning_rate": 1.4526315789473687e-05,
"loss": 0.5538,
"step": 620
},
{
"epoch": 0.9598145285935085,
"grad_norm": 12.25,
"learning_rate": 1.4515789473684211e-05,
"loss": 0.5519,
"step": 621
},
{
"epoch": 0.9613601236476044,
"grad_norm": 9.8125,
"learning_rate": 1.4505263157894738e-05,
"loss": 0.5315,
"step": 622
},
{
"epoch": 0.9629057187017002,
"grad_norm": 12.4375,
"learning_rate": 1.4494736842105264e-05,
"loss": 0.5218,
"step": 623
},
{
"epoch": 0.964451313755796,
"grad_norm": 11.75,
"learning_rate": 1.4484210526315792e-05,
"loss": 0.5299,
"step": 624
},
{
"epoch": 0.9659969088098919,
"grad_norm": 15.4375,
"learning_rate": 1.4473684210526317e-05,
"loss": 0.5595,
"step": 625
},
{
"epoch": 0.9675425038639877,
"grad_norm": 10.375,
"learning_rate": 1.4463157894736843e-05,
"loss": 0.5253,
"step": 626
},
{
"epoch": 0.9690880989180835,
"grad_norm": 12.9375,
"learning_rate": 1.445263157894737e-05,
"loss": 0.5111,
"step": 627
},
{
"epoch": 0.9706336939721792,
"grad_norm": 11.375,
"learning_rate": 1.4442105263157896e-05,
"loss": 0.6317,
"step": 628
},
{
"epoch": 0.9721792890262752,
"grad_norm": 10.875,
"learning_rate": 1.4431578947368424e-05,
"loss": 0.6264,
"step": 629
},
{
"epoch": 0.973724884080371,
"grad_norm": 10.6875,
"learning_rate": 1.4421052631578948e-05,
"loss": 0.6112,
"step": 630
},
{
"epoch": 0.9752704791344667,
"grad_norm": 11.0,
"learning_rate": 1.4410526315789475e-05,
"loss": 0.6162,
"step": 631
},
{
"epoch": 0.9768160741885626,
"grad_norm": 10.4375,
"learning_rate": 1.4400000000000001e-05,
"loss": 0.5203,
"step": 632
},
{
"epoch": 0.9783616692426584,
"grad_norm": 9.25,
"learning_rate": 1.4389473684210526e-05,
"loss": 0.5198,
"step": 633
},
{
"epoch": 0.9799072642967542,
"grad_norm": 10.25,
"learning_rate": 1.4378947368421054e-05,
"loss": 0.5171,
"step": 634
},
{
"epoch": 0.98145285935085,
"grad_norm": 10.8125,
"learning_rate": 1.436842105263158e-05,
"loss": 0.5061,
"step": 635
},
{
"epoch": 0.9829984544049459,
"grad_norm": 8.875,
"learning_rate": 1.4357894736842106e-05,
"loss": 0.53,
"step": 636
},
{
"epoch": 0.9845440494590417,
"grad_norm": 11.5,
"learning_rate": 1.4347368421052633e-05,
"loss": 0.5439,
"step": 637
},
{
"epoch": 0.9860896445131375,
"grad_norm": 9.875,
"learning_rate": 1.433684210526316e-05,
"loss": 0.5757,
"step": 638
},
{
"epoch": 0.9876352395672334,
"grad_norm": 18.25,
"learning_rate": 1.4326315789473685e-05,
"loss": 0.5069,
"step": 639
},
{
"epoch": 0.9891808346213292,
"grad_norm": 12.125,
"learning_rate": 1.4315789473684212e-05,
"loss": 0.6396,
"step": 640
},
{
"epoch": 0.990726429675425,
"grad_norm": 10.1875,
"learning_rate": 1.4305263157894738e-05,
"loss": 0.5631,
"step": 641
},
{
"epoch": 0.9922720247295209,
"grad_norm": 11.0,
"learning_rate": 1.4294736842105263e-05,
"loss": 0.5161,
"step": 642
},
{
"epoch": 0.9938176197836167,
"grad_norm": 11.375,
"learning_rate": 1.4284210526315792e-05,
"loss": 0.5589,
"step": 643
},
{
"epoch": 0.9953632148377125,
"grad_norm": 12.25,
"learning_rate": 1.4273684210526317e-05,
"loss": 0.5475,
"step": 644
},
{
"epoch": 0.9969088098918083,
"grad_norm": 9.625,
"learning_rate": 1.4263157894736843e-05,
"loss": 0.5181,
"step": 645
},
{
"epoch": 0.9984544049459042,
"grad_norm": 10.625,
"learning_rate": 1.425263157894737e-05,
"loss": 0.4886,
"step": 646
},
{
"epoch": 1.0,
"grad_norm": 9.875,
"learning_rate": 1.4242105263157894e-05,
"loss": 0.5598,
"step": 647
},
{
"epoch": 1.001545595054096,
"grad_norm": 8.875,
"learning_rate": 1.4231578947368422e-05,
"loss": 0.4375,
"step": 648
},
{
"epoch": 1.0030911901081916,
"grad_norm": 9.4375,
"learning_rate": 1.4221052631578949e-05,
"loss": 0.4773,
"step": 649
},
{
"epoch": 1.0046367851622875,
"grad_norm": 9.875,
"learning_rate": 1.4210526315789475e-05,
"loss": 0.5076,
"step": 650
},
{
"epoch": 1.0061823802163834,
"grad_norm": 10.0,
"learning_rate": 1.4200000000000001e-05,
"loss": 0.5444,
"step": 651
},
{
"epoch": 1.007727975270479,
"grad_norm": 9.0,
"learning_rate": 1.418947368421053e-05,
"loss": 0.4753,
"step": 652
},
{
"epoch": 1.009273570324575,
"grad_norm": 8.625,
"learning_rate": 1.4178947368421054e-05,
"loss": 0.4429,
"step": 653
},
{
"epoch": 1.010819165378671,
"grad_norm": 9.9375,
"learning_rate": 1.416842105263158e-05,
"loss": 0.5413,
"step": 654
},
{
"epoch": 1.0123647604327666,
"grad_norm": 9.5625,
"learning_rate": 1.4157894736842107e-05,
"loss": 0.4887,
"step": 655
},
{
"epoch": 1.0139103554868625,
"grad_norm": 8.375,
"learning_rate": 1.4147368421052631e-05,
"loss": 0.4326,
"step": 656
},
{
"epoch": 1.0154559505409582,
"grad_norm": 10.1875,
"learning_rate": 1.413684210526316e-05,
"loss": 0.5413,
"step": 657
},
{
"epoch": 1.017001545595054,
"grad_norm": 11.125,
"learning_rate": 1.4126315789473686e-05,
"loss": 0.53,
"step": 658
},
{
"epoch": 1.01854714064915,
"grad_norm": 10.125,
"learning_rate": 1.4115789473684212e-05,
"loss": 0.4957,
"step": 659
},
{
"epoch": 1.0200927357032457,
"grad_norm": 9.25,
"learning_rate": 1.4105263157894738e-05,
"loss": 0.4954,
"step": 660
},
{
"epoch": 1.0216383307573416,
"grad_norm": 12.5625,
"learning_rate": 1.4094736842105263e-05,
"loss": 0.4645,
"step": 661
},
{
"epoch": 1.0231839258114375,
"grad_norm": 10.0,
"learning_rate": 1.4084210526315791e-05,
"loss": 0.4992,
"step": 662
},
{
"epoch": 1.0247295208655331,
"grad_norm": 12.5625,
"learning_rate": 1.4073684210526317e-05,
"loss": 0.5098,
"step": 663
},
{
"epoch": 1.026275115919629,
"grad_norm": 11.125,
"learning_rate": 1.4063157894736844e-05,
"loss": 0.5931,
"step": 664
},
{
"epoch": 1.027820710973725,
"grad_norm": 11.3125,
"learning_rate": 1.4052631578947368e-05,
"loss": 0.5139,
"step": 665
},
{
"epoch": 1.0293663060278206,
"grad_norm": 10.1875,
"learning_rate": 1.4042105263157896e-05,
"loss": 0.4894,
"step": 666
},
{
"epoch": 1.0309119010819165,
"grad_norm": 10.4375,
"learning_rate": 1.4031578947368423e-05,
"loss": 0.4719,
"step": 667
},
{
"epoch": 1.0324574961360125,
"grad_norm": 12.75,
"learning_rate": 1.4021052631578949e-05,
"loss": 0.5063,
"step": 668
},
{
"epoch": 1.0340030911901081,
"grad_norm": 9.5625,
"learning_rate": 1.4010526315789475e-05,
"loss": 0.4891,
"step": 669
},
{
"epoch": 1.035548686244204,
"grad_norm": 9.75,
"learning_rate": 1.4e-05,
"loss": 0.5401,
"step": 670
},
{
"epoch": 1.0370942812983,
"grad_norm": 9.8125,
"learning_rate": 1.3989473684210528e-05,
"loss": 0.4784,
"step": 671
},
{
"epoch": 1.0386398763523956,
"grad_norm": 11.8125,
"learning_rate": 1.3978947368421054e-05,
"loss": 0.5106,
"step": 672
},
{
"epoch": 1.0401854714064915,
"grad_norm": 10.6875,
"learning_rate": 1.396842105263158e-05,
"loss": 0.4898,
"step": 673
},
{
"epoch": 1.0417310664605872,
"grad_norm": 10.0,
"learning_rate": 1.3957894736842105e-05,
"loss": 0.5321,
"step": 674
},
{
"epoch": 1.0432766615146831,
"grad_norm": 10.4375,
"learning_rate": 1.3947368421052631e-05,
"loss": 0.5076,
"step": 675
},
{
"epoch": 1.044822256568779,
"grad_norm": 9.625,
"learning_rate": 1.393684210526316e-05,
"loss": 0.5249,
"step": 676
},
{
"epoch": 1.0463678516228747,
"grad_norm": 10.25,
"learning_rate": 1.3926315789473686e-05,
"loss": 0.5416,
"step": 677
},
{
"epoch": 1.0479134466769706,
"grad_norm": 10.3125,
"learning_rate": 1.3915789473684212e-05,
"loss": 0.5749,
"step": 678
},
{
"epoch": 1.0494590417310665,
"grad_norm": 11.0625,
"learning_rate": 1.3905263157894737e-05,
"loss": 0.4648,
"step": 679
},
{
"epoch": 1.0510046367851622,
"grad_norm": 10.625,
"learning_rate": 1.3894736842105265e-05,
"loss": 0.4699,
"step": 680
},
{
"epoch": 1.052550231839258,
"grad_norm": 10.8125,
"learning_rate": 1.3884210526315791e-05,
"loss": 0.4832,
"step": 681
},
{
"epoch": 1.054095826893354,
"grad_norm": 10.375,
"learning_rate": 1.3873684210526317e-05,
"loss": 0.5309,
"step": 682
},
{
"epoch": 1.0556414219474497,
"grad_norm": 10.0,
"learning_rate": 1.3863157894736842e-05,
"loss": 0.5179,
"step": 683
},
{
"epoch": 1.0571870170015456,
"grad_norm": 10.125,
"learning_rate": 1.3852631578947368e-05,
"loss": 0.5455,
"step": 684
},
{
"epoch": 1.0587326120556415,
"grad_norm": 11.75,
"learning_rate": 1.3842105263157896e-05,
"loss": 0.5686,
"step": 685
},
{
"epoch": 1.0602782071097372,
"grad_norm": 11.6875,
"learning_rate": 1.3831578947368423e-05,
"loss": 0.5095,
"step": 686
},
{
"epoch": 1.061823802163833,
"grad_norm": 10.8125,
"learning_rate": 1.3821052631578949e-05,
"loss": 0.4923,
"step": 687
},
{
"epoch": 1.063369397217929,
"grad_norm": 10.0,
"learning_rate": 1.3810526315789474e-05,
"loss": 0.4592,
"step": 688
},
{
"epoch": 1.0649149922720247,
"grad_norm": 10.5,
"learning_rate": 1.38e-05,
"loss": 0.456,
"step": 689
},
{
"epoch": 1.0664605873261206,
"grad_norm": 9.25,
"learning_rate": 1.3789473684210528e-05,
"loss": 0.4445,
"step": 690
},
{
"epoch": 1.0680061823802163,
"grad_norm": 9.4375,
"learning_rate": 1.3778947368421054e-05,
"loss": 0.4769,
"step": 691
},
{
"epoch": 1.0695517774343122,
"grad_norm": 13.5625,
"learning_rate": 1.3768421052631579e-05,
"loss": 0.5493,
"step": 692
},
{
"epoch": 1.071097372488408,
"grad_norm": 10.5625,
"learning_rate": 1.3757894736842105e-05,
"loss": 0.5005,
"step": 693
},
{
"epoch": 1.0726429675425038,
"grad_norm": 9.625,
"learning_rate": 1.3747368421052633e-05,
"loss": 0.5041,
"step": 694
},
{
"epoch": 1.0741885625965997,
"grad_norm": 11.375,
"learning_rate": 1.373684210526316e-05,
"loss": 0.5143,
"step": 695
},
{
"epoch": 1.0757341576506956,
"grad_norm": 13.6875,
"learning_rate": 1.3726315789473686e-05,
"loss": 0.4953,
"step": 696
},
{
"epoch": 1.0772797527047913,
"grad_norm": 10.9375,
"learning_rate": 1.371578947368421e-05,
"loss": 0.534,
"step": 697
},
{
"epoch": 1.0788253477588872,
"grad_norm": 9.125,
"learning_rate": 1.3705263157894737e-05,
"loss": 0.479,
"step": 698
},
{
"epoch": 1.080370942812983,
"grad_norm": 11.75,
"learning_rate": 1.3694736842105265e-05,
"loss": 0.5076,
"step": 699
},
{
"epoch": 1.0819165378670788,
"grad_norm": 10.125,
"learning_rate": 1.3684210526315791e-05,
"loss": 0.505,
"step": 700
},
{
"epoch": 1.0834621329211747,
"grad_norm": 9.875,
"learning_rate": 1.3673684210526316e-05,
"loss": 0.4913,
"step": 701
},
{
"epoch": 1.0850077279752706,
"grad_norm": 12.625,
"learning_rate": 1.3663157894736842e-05,
"loss": 0.4428,
"step": 702
},
{
"epoch": 1.0865533230293662,
"grad_norm": 10.8125,
"learning_rate": 1.3652631578947369e-05,
"loss": 0.5485,
"step": 703
},
{
"epoch": 1.0880989180834622,
"grad_norm": 10.25,
"learning_rate": 1.3642105263157897e-05,
"loss": 0.5158,
"step": 704
},
{
"epoch": 1.089644513137558,
"grad_norm": 30.125,
"learning_rate": 1.3631578947368423e-05,
"loss": 0.4655,
"step": 705
},
{
"epoch": 1.0911901081916537,
"grad_norm": 9.8125,
"learning_rate": 1.3621052631578948e-05,
"loss": 0.4943,
"step": 706
},
{
"epoch": 1.0927357032457496,
"grad_norm": 11.3125,
"learning_rate": 1.3610526315789474e-05,
"loss": 0.4817,
"step": 707
},
{
"epoch": 1.0942812982998453,
"grad_norm": 10.3125,
"learning_rate": 1.3600000000000002e-05,
"loss": 0.5063,
"step": 708
},
{
"epoch": 1.0958268933539412,
"grad_norm": 9.1875,
"learning_rate": 1.3589473684210528e-05,
"loss": 0.4958,
"step": 709
},
{
"epoch": 1.0973724884080371,
"grad_norm": 13.6875,
"learning_rate": 1.3578947368421055e-05,
"loss": 0.5026,
"step": 710
},
{
"epoch": 1.0989180834621328,
"grad_norm": 9.875,
"learning_rate": 1.356842105263158e-05,
"loss": 0.4928,
"step": 711
},
{
"epoch": 1.1004636785162287,
"grad_norm": 11.0625,
"learning_rate": 1.3557894736842106e-05,
"loss": 0.564,
"step": 712
},
{
"epoch": 1.1020092735703246,
"grad_norm": 10.0,
"learning_rate": 1.3547368421052634e-05,
"loss": 0.4681,
"step": 713
},
{
"epoch": 1.1035548686244203,
"grad_norm": 10.9375,
"learning_rate": 1.353684210526316e-05,
"loss": 0.5466,
"step": 714
},
{
"epoch": 1.1051004636785162,
"grad_norm": 11.0,
"learning_rate": 1.3526315789473685e-05,
"loss": 0.4717,
"step": 715
},
{
"epoch": 1.1066460587326121,
"grad_norm": 9.75,
"learning_rate": 1.3515789473684211e-05,
"loss": 0.4312,
"step": 716
},
{
"epoch": 1.1081916537867078,
"grad_norm": 10.1875,
"learning_rate": 1.3505263157894737e-05,
"loss": 0.5215,
"step": 717
},
{
"epoch": 1.1097372488408037,
"grad_norm": 10.75,
"learning_rate": 1.3494736842105265e-05,
"loss": 0.4868,
"step": 718
},
{
"epoch": 1.1112828438948996,
"grad_norm": 11.0,
"learning_rate": 1.3484210526315792e-05,
"loss": 0.4764,
"step": 719
},
{
"epoch": 1.1128284389489953,
"grad_norm": 10.5625,
"learning_rate": 1.3473684210526316e-05,
"loss": 0.5441,
"step": 720
},
{
"epoch": 1.1143740340030912,
"grad_norm": 9.3125,
"learning_rate": 1.3463157894736842e-05,
"loss": 0.4406,
"step": 721
},
{
"epoch": 1.1159196290571871,
"grad_norm": 10.6875,
"learning_rate": 1.345263157894737e-05,
"loss": 0.5739,
"step": 722
},
{
"epoch": 1.1174652241112828,
"grad_norm": 9.875,
"learning_rate": 1.3442105263157897e-05,
"loss": 0.5379,
"step": 723
},
{
"epoch": 1.1190108191653787,
"grad_norm": 9.625,
"learning_rate": 1.3431578947368421e-05,
"loss": 0.4805,
"step": 724
},
{
"epoch": 1.1205564142194744,
"grad_norm": 10.4375,
"learning_rate": 1.3421052631578948e-05,
"loss": 0.5088,
"step": 725
},
{
"epoch": 1.1221020092735703,
"grad_norm": 11.1875,
"learning_rate": 1.3410526315789474e-05,
"loss": 0.4453,
"step": 726
},
{
"epoch": 1.1236476043276662,
"grad_norm": 11.3125,
"learning_rate": 1.3400000000000002e-05,
"loss": 0.5294,
"step": 727
},
{
"epoch": 1.125193199381762,
"grad_norm": 8.875,
"learning_rate": 1.3389473684210528e-05,
"loss": 0.4508,
"step": 728
},
{
"epoch": 1.1267387944358578,
"grad_norm": 9.25,
"learning_rate": 1.3378947368421053e-05,
"loss": 0.4841,
"step": 729
},
{
"epoch": 1.1282843894899537,
"grad_norm": 9.4375,
"learning_rate": 1.336842105263158e-05,
"loss": 0.4764,
"step": 730
},
{
"epoch": 1.1298299845440494,
"grad_norm": 10.4375,
"learning_rate": 1.3357894736842106e-05,
"loss": 0.4555,
"step": 731
},
{
"epoch": 1.1313755795981453,
"grad_norm": 10.0,
"learning_rate": 1.3347368421052634e-05,
"loss": 0.5199,
"step": 732
},
{
"epoch": 1.1329211746522412,
"grad_norm": 10.5,
"learning_rate": 1.3336842105263158e-05,
"loss": 0.4891,
"step": 733
},
{
"epoch": 1.1344667697063369,
"grad_norm": 12.0,
"learning_rate": 1.3326315789473685e-05,
"loss": 0.4165,
"step": 734
},
{
"epoch": 1.1360123647604328,
"grad_norm": 9.3125,
"learning_rate": 1.3315789473684211e-05,
"loss": 0.4111,
"step": 735
},
{
"epoch": 1.1375579598145287,
"grad_norm": 12.75,
"learning_rate": 1.3305263157894739e-05,
"loss": 0.5086,
"step": 736
},
{
"epoch": 1.1391035548686244,
"grad_norm": 10.625,
"learning_rate": 1.3294736842105265e-05,
"loss": 0.5237,
"step": 737
},
{
"epoch": 1.1406491499227203,
"grad_norm": 13.0625,
"learning_rate": 1.328421052631579e-05,
"loss": 0.5114,
"step": 738
},
{
"epoch": 1.1421947449768162,
"grad_norm": 10.0,
"learning_rate": 1.3273684210526316e-05,
"loss": 0.4488,
"step": 739
},
{
"epoch": 1.1437403400309119,
"grad_norm": 9.9375,
"learning_rate": 1.3263157894736843e-05,
"loss": 0.5056,
"step": 740
},
{
"epoch": 1.1452859350850078,
"grad_norm": 9.125,
"learning_rate": 1.325263157894737e-05,
"loss": 0.4251,
"step": 741
},
{
"epoch": 1.1468315301391034,
"grad_norm": 20.625,
"learning_rate": 1.3242105263157895e-05,
"loss": 0.4894,
"step": 742
},
{
"epoch": 1.1483771251931993,
"grad_norm": 11.0625,
"learning_rate": 1.3231578947368422e-05,
"loss": 0.5005,
"step": 743
},
{
"epoch": 1.1499227202472952,
"grad_norm": 11.25,
"learning_rate": 1.3221052631578948e-05,
"loss": 0.5261,
"step": 744
},
{
"epoch": 1.1514683153013912,
"grad_norm": 16.375,
"learning_rate": 1.3210526315789476e-05,
"loss": 0.5253,
"step": 745
},
{
"epoch": 1.1530139103554868,
"grad_norm": 12.1875,
"learning_rate": 1.3200000000000002e-05,
"loss": 0.5434,
"step": 746
},
{
"epoch": 1.1545595054095827,
"grad_norm": 21.125,
"learning_rate": 1.3189473684210527e-05,
"loss": 0.4824,
"step": 747
},
{
"epoch": 1.1561051004636784,
"grad_norm": 10.8125,
"learning_rate": 1.3178947368421053e-05,
"loss": 0.4825,
"step": 748
},
{
"epoch": 1.1576506955177743,
"grad_norm": 11.0,
"learning_rate": 1.316842105263158e-05,
"loss": 0.5656,
"step": 749
},
{
"epoch": 1.1591962905718702,
"grad_norm": 10.6875,
"learning_rate": 1.3157894736842108e-05,
"loss": 0.4975,
"step": 750
},
{
"epoch": 1.160741885625966,
"grad_norm": 9.125,
"learning_rate": 1.3147368421052632e-05,
"loss": 0.4886,
"step": 751
},
{
"epoch": 1.1622874806800618,
"grad_norm": 8.9375,
"learning_rate": 1.3136842105263159e-05,
"loss": 0.4697,
"step": 752
},
{
"epoch": 1.1638330757341577,
"grad_norm": 10.5625,
"learning_rate": 1.3126315789473685e-05,
"loss": 0.516,
"step": 753
},
{
"epoch": 1.1653786707882534,
"grad_norm": 9.875,
"learning_rate": 1.3115789473684211e-05,
"loss": 0.4741,
"step": 754
},
{
"epoch": 1.1669242658423493,
"grad_norm": 9.125,
"learning_rate": 1.310526315789474e-05,
"loss": 0.4448,
"step": 755
},
{
"epoch": 1.1684698608964452,
"grad_norm": 9.75,
"learning_rate": 1.3094736842105264e-05,
"loss": 0.4573,
"step": 756
},
{
"epoch": 1.170015455950541,
"grad_norm": 10.5,
"learning_rate": 1.308421052631579e-05,
"loss": 0.4871,
"step": 757
},
{
"epoch": 1.1715610510046368,
"grad_norm": 9.6875,
"learning_rate": 1.3073684210526317e-05,
"loss": 0.4972,
"step": 758
},
{
"epoch": 1.1731066460587325,
"grad_norm": 10.4375,
"learning_rate": 1.3063157894736845e-05,
"loss": 0.5072,
"step": 759
},
{
"epoch": 1.1746522411128284,
"grad_norm": 9.875,
"learning_rate": 1.305263157894737e-05,
"loss": 0.508,
"step": 760
},
{
"epoch": 1.1761978361669243,
"grad_norm": 10.125,
"learning_rate": 1.3042105263157896e-05,
"loss": 0.449,
"step": 761
},
{
"epoch": 1.1777434312210202,
"grad_norm": 10.9375,
"learning_rate": 1.3031578947368422e-05,
"loss": 0.4341,
"step": 762
},
{
"epoch": 1.179289026275116,
"grad_norm": 10.625,
"learning_rate": 1.3021052631578948e-05,
"loss": 0.4597,
"step": 763
},
{
"epoch": 1.1808346213292118,
"grad_norm": 9.5625,
"learning_rate": 1.3010526315789476e-05,
"loss": 0.4595,
"step": 764
},
{
"epoch": 1.1823802163833075,
"grad_norm": 10.125,
"learning_rate": 1.3000000000000001e-05,
"loss": 0.4008,
"step": 765
},
{
"epoch": 1.1839258114374034,
"grad_norm": 9.8125,
"learning_rate": 1.2989473684210527e-05,
"loss": 0.4453,
"step": 766
},
{
"epoch": 1.1854714064914993,
"grad_norm": 10.0,
"learning_rate": 1.2978947368421054e-05,
"loss": 0.4873,
"step": 767
},
{
"epoch": 1.187017001545595,
"grad_norm": 11.1875,
"learning_rate": 1.2968421052631578e-05,
"loss": 0.4919,
"step": 768
},
{
"epoch": 1.1885625965996909,
"grad_norm": 9.625,
"learning_rate": 1.2957894736842108e-05,
"loss": 0.4375,
"step": 769
},
{
"epoch": 1.1901081916537868,
"grad_norm": 9.5,
"learning_rate": 1.2947368421052633e-05,
"loss": 0.5009,
"step": 770
},
{
"epoch": 1.1916537867078825,
"grad_norm": 9.8125,
"learning_rate": 1.2936842105263159e-05,
"loss": 0.483,
"step": 771
},
{
"epoch": 1.1931993817619784,
"grad_norm": 11.125,
"learning_rate": 1.2926315789473685e-05,
"loss": 0.4926,
"step": 772
},
{
"epoch": 1.1947449768160743,
"grad_norm": 9.25,
"learning_rate": 1.2915789473684213e-05,
"loss": 0.4601,
"step": 773
},
{
"epoch": 1.19629057187017,
"grad_norm": 12.1875,
"learning_rate": 1.2905263157894738e-05,
"loss": 0.4376,
"step": 774
},
{
"epoch": 1.1978361669242659,
"grad_norm": 10.875,
"learning_rate": 1.2894736842105264e-05,
"loss": 0.4916,
"step": 775
},
{
"epoch": 1.1993817619783615,
"grad_norm": 9.75,
"learning_rate": 1.288421052631579e-05,
"loss": 0.4897,
"step": 776
},
{
"epoch": 1.2009273570324575,
"grad_norm": 11.875,
"learning_rate": 1.2873684210526317e-05,
"loss": 0.5209,
"step": 777
},
{
"epoch": 1.2024729520865534,
"grad_norm": 9.25,
"learning_rate": 1.2863157894736845e-05,
"loss": 0.5088,
"step": 778
},
{
"epoch": 1.2040185471406493,
"grad_norm": 11.1875,
"learning_rate": 1.285263157894737e-05,
"loss": 0.4298,
"step": 779
},
{
"epoch": 1.205564142194745,
"grad_norm": 10.375,
"learning_rate": 1.2842105263157896e-05,
"loss": 0.4875,
"step": 780
},
{
"epoch": 1.2071097372488409,
"grad_norm": 12.5,
"learning_rate": 1.2831578947368422e-05,
"loss": 0.5777,
"step": 781
},
{
"epoch": 1.2086553323029365,
"grad_norm": 9.75,
"learning_rate": 1.2821052631578947e-05,
"loss": 0.4746,
"step": 782
},
{
"epoch": 1.2102009273570324,
"grad_norm": 20.75,
"learning_rate": 1.2810526315789475e-05,
"loss": 0.4538,
"step": 783
},
{
"epoch": 1.2117465224111283,
"grad_norm": 13.875,
"learning_rate": 1.2800000000000001e-05,
"loss": 0.4987,
"step": 784
},
{
"epoch": 1.213292117465224,
"grad_norm": 11.8125,
"learning_rate": 1.2789473684210527e-05,
"loss": 0.4823,
"step": 785
},
{
"epoch": 1.21483771251932,
"grad_norm": 14.75,
"learning_rate": 1.2778947368421054e-05,
"loss": 0.5438,
"step": 786
},
{
"epoch": 1.2163833075734158,
"grad_norm": 12.0625,
"learning_rate": 1.2768421052631582e-05,
"loss": 0.4995,
"step": 787
},
{
"epoch": 1.2179289026275115,
"grad_norm": 17.625,
"learning_rate": 1.2757894736842106e-05,
"loss": 0.5238,
"step": 788
},
{
"epoch": 1.2194744976816074,
"grad_norm": 16.875,
"learning_rate": 1.2747368421052633e-05,
"loss": 0.456,
"step": 789
},
{
"epoch": 1.2210200927357033,
"grad_norm": 13.3125,
"learning_rate": 1.2736842105263159e-05,
"loss": 0.5335,
"step": 790
},
{
"epoch": 1.222565687789799,
"grad_norm": 10.5625,
"learning_rate": 1.2726315789473684e-05,
"loss": 0.4543,
"step": 791
},
{
"epoch": 1.224111282843895,
"grad_norm": 10.1875,
"learning_rate": 1.2715789473684212e-05,
"loss": 0.5536,
"step": 792
},
{
"epoch": 1.2256568778979906,
"grad_norm": 11.0,
"learning_rate": 1.2705263157894738e-05,
"loss": 0.5724,
"step": 793
},
{
"epoch": 1.2272024729520865,
"grad_norm": 10.0625,
"learning_rate": 1.2694736842105264e-05,
"loss": 0.4694,
"step": 794
},
{
"epoch": 1.2287480680061824,
"grad_norm": 10.4375,
"learning_rate": 1.268421052631579e-05,
"loss": 0.5237,
"step": 795
},
{
"epoch": 1.2302936630602783,
"grad_norm": 16.75,
"learning_rate": 1.2673684210526315e-05,
"loss": 0.5096,
"step": 796
},
{
"epoch": 1.231839258114374,
"grad_norm": 10.125,
"learning_rate": 1.2663157894736843e-05,
"loss": 0.5019,
"step": 797
},
{
"epoch": 1.23338485316847,
"grad_norm": 12.0,
"learning_rate": 1.265263157894737e-05,
"loss": 0.4939,
"step": 798
},
{
"epoch": 1.2349304482225656,
"grad_norm": 11.375,
"learning_rate": 1.2642105263157896e-05,
"loss": 0.4893,
"step": 799
},
{
"epoch": 1.2364760432766615,
"grad_norm": 10.4375,
"learning_rate": 1.263157894736842e-05,
"loss": 0.466,
"step": 800
},
{
"epoch": 1.2380216383307574,
"grad_norm": 10.375,
"learning_rate": 1.2621052631578949e-05,
"loss": 0.4546,
"step": 801
},
{
"epoch": 1.239567233384853,
"grad_norm": 10.5,
"learning_rate": 1.2610526315789475e-05,
"loss": 0.4507,
"step": 802
},
{
"epoch": 1.241112828438949,
"grad_norm": 10.5,
"learning_rate": 1.2600000000000001e-05,
"loss": 0.4615,
"step": 803
},
{
"epoch": 1.242658423493045,
"grad_norm": 9.5625,
"learning_rate": 1.2589473684210528e-05,
"loss": 0.5071,
"step": 804
},
{
"epoch": 1.2442040185471406,
"grad_norm": 10.25,
"learning_rate": 1.2578947368421052e-05,
"loss": 0.4829,
"step": 805
},
{
"epoch": 1.2457496136012365,
"grad_norm": 9.8125,
"learning_rate": 1.256842105263158e-05,
"loss": 0.4414,
"step": 806
},
{
"epoch": 1.2472952086553324,
"grad_norm": 10.1875,
"learning_rate": 1.2557894736842107e-05,
"loss": 0.4959,
"step": 807
},
{
"epoch": 1.248840803709428,
"grad_norm": 9.875,
"learning_rate": 1.2547368421052633e-05,
"loss": 0.4745,
"step": 808
},
{
"epoch": 1.250386398763524,
"grad_norm": 9.625,
"learning_rate": 1.2536842105263158e-05,
"loss": 0.5139,
"step": 809
},
{
"epoch": 1.2519319938176197,
"grad_norm": 11.6875,
"learning_rate": 1.2526315789473684e-05,
"loss": 0.5002,
"step": 810
},
{
"epoch": 1.2534775888717156,
"grad_norm": 9.1875,
"learning_rate": 1.2515789473684212e-05,
"loss": 0.478,
"step": 811
},
{
"epoch": 1.2550231839258115,
"grad_norm": 11.5,
"learning_rate": 1.2505263157894738e-05,
"loss": 0.4273,
"step": 812
},
{
"epoch": 1.2565687789799074,
"grad_norm": 10.1875,
"learning_rate": 1.2494736842105265e-05,
"loss": 0.4785,
"step": 813
},
{
"epoch": 1.258114374034003,
"grad_norm": 9.9375,
"learning_rate": 1.248421052631579e-05,
"loss": 0.431,
"step": 814
},
{
"epoch": 1.259659969088099,
"grad_norm": 10.9375,
"learning_rate": 1.2473684210526317e-05,
"loss": 0.4266,
"step": 815
},
{
"epoch": 1.2612055641421946,
"grad_norm": 10.4375,
"learning_rate": 1.2463157894736844e-05,
"loss": 0.4777,
"step": 816
},
{
"epoch": 1.2627511591962906,
"grad_norm": 9.375,
"learning_rate": 1.245263157894737e-05,
"loss": 0.4552,
"step": 817
},
{
"epoch": 1.2642967542503865,
"grad_norm": 12.125,
"learning_rate": 1.2442105263157895e-05,
"loss": 0.4703,
"step": 818
},
{
"epoch": 1.2658423493044824,
"grad_norm": 8.875,
"learning_rate": 1.2431578947368421e-05,
"loss": 0.4642,
"step": 819
},
{
"epoch": 1.267387944358578,
"grad_norm": 10.1875,
"learning_rate": 1.2421052631578949e-05,
"loss": 0.4594,
"step": 820
},
{
"epoch": 1.268933539412674,
"grad_norm": 31.0,
"learning_rate": 1.2410526315789475e-05,
"loss": 0.5015,
"step": 821
},
{
"epoch": 1.2704791344667696,
"grad_norm": 13.1875,
"learning_rate": 1.2400000000000002e-05,
"loss": 0.5548,
"step": 822
},
{
"epoch": 1.2720247295208655,
"grad_norm": 10.25,
"learning_rate": 1.2389473684210526e-05,
"loss": 0.4303,
"step": 823
},
{
"epoch": 1.2735703245749614,
"grad_norm": 9.375,
"learning_rate": 1.2378947368421053e-05,
"loss": 0.4304,
"step": 824
},
{
"epoch": 1.2751159196290571,
"grad_norm": 9.6875,
"learning_rate": 1.236842105263158e-05,
"loss": 0.4988,
"step": 825
},
{
"epoch": 1.276661514683153,
"grad_norm": 9.75,
"learning_rate": 1.2357894736842107e-05,
"loss": 0.4577,
"step": 826
},
{
"epoch": 1.2782071097372487,
"grad_norm": 11.375,
"learning_rate": 1.2347368421052631e-05,
"loss": 0.422,
"step": 827
},
{
"epoch": 1.2797527047913446,
"grad_norm": 11.125,
"learning_rate": 1.2336842105263158e-05,
"loss": 0.4912,
"step": 828
},
{
"epoch": 1.2812982998454405,
"grad_norm": 10.25,
"learning_rate": 1.2326315789473686e-05,
"loss": 0.4765,
"step": 829
},
{
"epoch": 1.2828438948995364,
"grad_norm": 9.625,
"learning_rate": 1.2315789473684212e-05,
"loss": 0.5089,
"step": 830
},
{
"epoch": 1.2843894899536321,
"grad_norm": 10.0625,
"learning_rate": 1.2305263157894739e-05,
"loss": 0.4665,
"step": 831
},
{
"epoch": 1.285935085007728,
"grad_norm": 10.4375,
"learning_rate": 1.2294736842105263e-05,
"loss": 0.4573,
"step": 832
},
{
"epoch": 1.2874806800618237,
"grad_norm": 11.9375,
"learning_rate": 1.228421052631579e-05,
"loss": 0.517,
"step": 833
},
{
"epoch": 1.2890262751159196,
"grad_norm": 9.3125,
"learning_rate": 1.2273684210526317e-05,
"loss": 0.4762,
"step": 834
},
{
"epoch": 1.2905718701700155,
"grad_norm": 13.25,
"learning_rate": 1.2263157894736844e-05,
"loss": 0.4777,
"step": 835
},
{
"epoch": 1.2921174652241114,
"grad_norm": 10.9375,
"learning_rate": 1.225263157894737e-05,
"loss": 0.5324,
"step": 836
},
{
"epoch": 1.293663060278207,
"grad_norm": 10.4375,
"learning_rate": 1.2242105263157895e-05,
"loss": 0.4963,
"step": 837
},
{
"epoch": 1.295208655332303,
"grad_norm": 9.6875,
"learning_rate": 1.2231578947368421e-05,
"loss": 0.422,
"step": 838
},
{
"epoch": 1.2967542503863987,
"grad_norm": 11.25,
"learning_rate": 1.2221052631578949e-05,
"loss": 0.4485,
"step": 839
},
{
"epoch": 1.2982998454404946,
"grad_norm": 10.625,
"learning_rate": 1.2210526315789475e-05,
"loss": 0.4733,
"step": 840
},
{
"epoch": 1.2998454404945905,
"grad_norm": 11.9375,
"learning_rate": 1.22e-05,
"loss": 0.4719,
"step": 841
},
{
"epoch": 1.3013910355486862,
"grad_norm": 9.625,
"learning_rate": 1.2189473684210526e-05,
"loss": 0.4764,
"step": 842
},
{
"epoch": 1.302936630602782,
"grad_norm": 9.4375,
"learning_rate": 1.2178947368421054e-05,
"loss": 0.4021,
"step": 843
},
{
"epoch": 1.3044822256568778,
"grad_norm": 9.75,
"learning_rate": 1.216842105263158e-05,
"loss": 0.4517,
"step": 844
},
{
"epoch": 1.3060278207109737,
"grad_norm": 17.625,
"learning_rate": 1.2157894736842107e-05,
"loss": 0.4449,
"step": 845
},
{
"epoch": 1.3075734157650696,
"grad_norm": 10.9375,
"learning_rate": 1.2147368421052632e-05,
"loss": 0.5565,
"step": 846
},
{
"epoch": 1.3091190108191655,
"grad_norm": 10.125,
"learning_rate": 1.2136842105263158e-05,
"loss": 0.4427,
"step": 847
},
{
"epoch": 1.3106646058732612,
"grad_norm": 11.4375,
"learning_rate": 1.2126315789473686e-05,
"loss": 0.467,
"step": 848
},
{
"epoch": 1.312210200927357,
"grad_norm": 11.1875,
"learning_rate": 1.2115789473684212e-05,
"loss": 0.5061,
"step": 849
},
{
"epoch": 1.3137557959814528,
"grad_norm": 9.0625,
"learning_rate": 1.2105263157894737e-05,
"loss": 0.3927,
"step": 850
},
{
"epoch": 1.3153013910355487,
"grad_norm": 14.875,
"learning_rate": 1.2094736842105263e-05,
"loss": 0.4699,
"step": 851
},
{
"epoch": 1.3168469860896446,
"grad_norm": 10.5625,
"learning_rate": 1.208421052631579e-05,
"loss": 0.4375,
"step": 852
},
{
"epoch": 1.3183925811437405,
"grad_norm": 9.125,
"learning_rate": 1.2073684210526318e-05,
"loss": 0.4619,
"step": 853
},
{
"epoch": 1.3199381761978362,
"grad_norm": 9.75,
"learning_rate": 1.2063157894736844e-05,
"loss": 0.4803,
"step": 854
},
{
"epoch": 1.321483771251932,
"grad_norm": 10.1875,
"learning_rate": 1.2052631578947369e-05,
"loss": 0.5041,
"step": 855
},
{
"epoch": 1.3230293663060277,
"grad_norm": 12.125,
"learning_rate": 1.2042105263157895e-05,
"loss": 0.4666,
"step": 856
},
{
"epoch": 1.3245749613601236,
"grad_norm": 10.0,
"learning_rate": 1.2031578947368423e-05,
"loss": 0.5002,
"step": 857
},
{
"epoch": 1.3261205564142196,
"grad_norm": 12.1875,
"learning_rate": 1.202105263157895e-05,
"loss": 0.5174,
"step": 858
},
{
"epoch": 1.3276661514683152,
"grad_norm": 14.0,
"learning_rate": 1.2010526315789474e-05,
"loss": 0.4862,
"step": 859
},
{
"epoch": 1.3292117465224111,
"grad_norm": 11.0,
"learning_rate": 1.2e-05,
"loss": 0.4956,
"step": 860
},
{
"epoch": 1.3307573415765068,
"grad_norm": 11.3125,
"learning_rate": 1.1989473684210527e-05,
"loss": 0.4812,
"step": 861
},
{
"epoch": 1.3323029366306027,
"grad_norm": 9.375,
"learning_rate": 1.1978947368421055e-05,
"loss": 0.4597,
"step": 862
},
{
"epoch": 1.3338485316846986,
"grad_norm": 21.25,
"learning_rate": 1.1968421052631581e-05,
"loss": 0.4589,
"step": 863
},
{
"epoch": 1.3353941267387945,
"grad_norm": 10.0625,
"learning_rate": 1.1957894736842106e-05,
"loss": 0.4872,
"step": 864
},
{
"epoch": 1.3369397217928902,
"grad_norm": 11.8125,
"learning_rate": 1.1947368421052632e-05,
"loss": 0.448,
"step": 865
},
{
"epoch": 1.3384853168469861,
"grad_norm": 13.0,
"learning_rate": 1.1936842105263158e-05,
"loss": 0.4954,
"step": 866
},
{
"epoch": 1.3400309119010818,
"grad_norm": 10.4375,
"learning_rate": 1.1926315789473686e-05,
"loss": 0.5019,
"step": 867
},
{
"epoch": 1.3415765069551777,
"grad_norm": 12.6875,
"learning_rate": 1.1915789473684211e-05,
"loss": 0.4909,
"step": 868
},
{
"epoch": 1.3431221020092736,
"grad_norm": 9.6875,
"learning_rate": 1.1905263157894737e-05,
"loss": 0.4753,
"step": 869
},
{
"epoch": 1.3446676970633695,
"grad_norm": 15.5625,
"learning_rate": 1.1894736842105264e-05,
"loss": 0.4639,
"step": 870
},
{
"epoch": 1.3462132921174652,
"grad_norm": 10.5,
"learning_rate": 1.1884210526315792e-05,
"loss": 0.459,
"step": 871
},
{
"epoch": 1.3477588871715611,
"grad_norm": 9.8125,
"learning_rate": 1.1873684210526318e-05,
"loss": 0.4603,
"step": 872
},
{
"epoch": 1.3493044822256568,
"grad_norm": 11.0625,
"learning_rate": 1.1863157894736843e-05,
"loss": 0.493,
"step": 873
},
{
"epoch": 1.3508500772797527,
"grad_norm": 9.3125,
"learning_rate": 1.1852631578947369e-05,
"loss": 0.4714,
"step": 874
},
{
"epoch": 1.3523956723338486,
"grad_norm": 9.3125,
"learning_rate": 1.1842105263157895e-05,
"loss": 0.4595,
"step": 875
},
{
"epoch": 1.3539412673879443,
"grad_norm": 10.9375,
"learning_rate": 1.1831578947368423e-05,
"loss": 0.5351,
"step": 876
},
{
"epoch": 1.3554868624420402,
"grad_norm": 9.8125,
"learning_rate": 1.1821052631578948e-05,
"loss": 0.5331,
"step": 877
},
{
"epoch": 1.3570324574961359,
"grad_norm": 9.6875,
"learning_rate": 1.1810526315789474e-05,
"loss": 0.4049,
"step": 878
},
{
"epoch": 1.3585780525502318,
"grad_norm": 11.4375,
"learning_rate": 1.18e-05,
"loss": 0.4966,
"step": 879
},
{
"epoch": 1.3601236476043277,
"grad_norm": 10.6875,
"learning_rate": 1.1789473684210527e-05,
"loss": 0.5107,
"step": 880
},
{
"epoch": 1.3616692426584236,
"grad_norm": 13.1875,
"learning_rate": 1.1778947368421055e-05,
"loss": 0.5277,
"step": 881
},
{
"epoch": 1.3632148377125193,
"grad_norm": 21.75,
"learning_rate": 1.176842105263158e-05,
"loss": 0.454,
"step": 882
},
{
"epoch": 1.3647604327666152,
"grad_norm": 10.125,
"learning_rate": 1.1757894736842106e-05,
"loss": 0.4219,
"step": 883
},
{
"epoch": 1.3663060278207109,
"grad_norm": 10.5,
"learning_rate": 1.1747368421052632e-05,
"loss": 0.5257,
"step": 884
},
{
"epoch": 1.3678516228748068,
"grad_norm": 10.875,
"learning_rate": 1.173684210526316e-05,
"loss": 0.4527,
"step": 885
},
{
"epoch": 1.3693972179289027,
"grad_norm": 11.875,
"learning_rate": 1.1726315789473685e-05,
"loss": 0.5218,
"step": 886
},
{
"epoch": 1.3709428129829986,
"grad_norm": 27.125,
"learning_rate": 1.1715789473684211e-05,
"loss": 0.5205,
"step": 887
},
{
"epoch": 1.3724884080370943,
"grad_norm": 10.75,
"learning_rate": 1.1705263157894737e-05,
"loss": 0.535,
"step": 888
},
{
"epoch": 1.3740340030911902,
"grad_norm": 10.1875,
"learning_rate": 1.1694736842105264e-05,
"loss": 0.5003,
"step": 889
},
{
"epoch": 1.3755795981452859,
"grad_norm": 11.875,
"learning_rate": 1.1684210526315792e-05,
"loss": 0.5055,
"step": 890
},
{
"epoch": 1.3771251931993818,
"grad_norm": 10.8125,
"learning_rate": 1.1673684210526316e-05,
"loss": 0.4783,
"step": 891
},
{
"epoch": 1.3786707882534777,
"grad_norm": 9.5625,
"learning_rate": 1.1663157894736843e-05,
"loss": 0.4727,
"step": 892
},
{
"epoch": 1.3802163833075733,
"grad_norm": 9.0625,
"learning_rate": 1.1652631578947369e-05,
"loss": 0.4669,
"step": 893
},
{
"epoch": 1.3817619783616693,
"grad_norm": 10.5,
"learning_rate": 1.1642105263157897e-05,
"loss": 0.509,
"step": 894
},
{
"epoch": 1.383307573415765,
"grad_norm": 12.1875,
"learning_rate": 1.1631578947368423e-05,
"loss": 0.5069,
"step": 895
},
{
"epoch": 1.3848531684698608,
"grad_norm": 12.25,
"learning_rate": 1.1621052631578948e-05,
"loss": 0.551,
"step": 896
},
{
"epoch": 1.3863987635239567,
"grad_norm": 14.875,
"learning_rate": 1.1610526315789474e-05,
"loss": 0.5221,
"step": 897
},
{
"epoch": 1.3879443585780527,
"grad_norm": 11.3125,
"learning_rate": 1.16e-05,
"loss": 0.4923,
"step": 898
},
{
"epoch": 1.3894899536321483,
"grad_norm": 9.5625,
"learning_rate": 1.1589473684210529e-05,
"loss": 0.501,
"step": 899
},
{
"epoch": 1.3910355486862442,
"grad_norm": 10.4375,
"learning_rate": 1.1578947368421053e-05,
"loss": 0.4895,
"step": 900
},
{
"epoch": 1.39258114374034,
"grad_norm": 11.25,
"learning_rate": 1.156842105263158e-05,
"loss": 0.4939,
"step": 901
},
{
"epoch": 1.3941267387944358,
"grad_norm": 9.8125,
"learning_rate": 1.1557894736842106e-05,
"loss": 0.4997,
"step": 902
},
{
"epoch": 1.3956723338485317,
"grad_norm": 10.8125,
"learning_rate": 1.1547368421052632e-05,
"loss": 0.4545,
"step": 903
},
{
"epoch": 1.3972179289026276,
"grad_norm": 12.3125,
"learning_rate": 1.153684210526316e-05,
"loss": 0.4652,
"step": 904
},
{
"epoch": 1.3987635239567233,
"grad_norm": 10.625,
"learning_rate": 1.1526315789473685e-05,
"loss": 0.4821,
"step": 905
},
{
"epoch": 1.4003091190108192,
"grad_norm": 8.875,
"learning_rate": 1.1515789473684211e-05,
"loss": 0.4238,
"step": 906
},
{
"epoch": 1.401854714064915,
"grad_norm": 10.25,
"learning_rate": 1.1505263157894738e-05,
"loss": 0.4316,
"step": 907
},
{
"epoch": 1.4034003091190108,
"grad_norm": 12.625,
"learning_rate": 1.1494736842105266e-05,
"loss": 0.5027,
"step": 908
},
{
"epoch": 1.4049459041731067,
"grad_norm": 10.5,
"learning_rate": 1.148421052631579e-05,
"loss": 0.4796,
"step": 909
},
{
"epoch": 1.4064914992272024,
"grad_norm": 11.0,
"learning_rate": 1.1473684210526317e-05,
"loss": 0.4792,
"step": 910
},
{
"epoch": 1.4080370942812983,
"grad_norm": 8.75,
"learning_rate": 1.1463157894736843e-05,
"loss": 0.3923,
"step": 911
},
{
"epoch": 1.409582689335394,
"grad_norm": 10.8125,
"learning_rate": 1.145263157894737e-05,
"loss": 0.516,
"step": 912
},
{
"epoch": 1.41112828438949,
"grad_norm": 9.0,
"learning_rate": 1.1442105263157897e-05,
"loss": 0.4059,
"step": 913
},
{
"epoch": 1.4126738794435858,
"grad_norm": 10.625,
"learning_rate": 1.1431578947368422e-05,
"loss": 0.509,
"step": 914
},
{
"epoch": 1.4142194744976817,
"grad_norm": 10.1875,
"learning_rate": 1.1421052631578948e-05,
"loss": 0.4528,
"step": 915
},
{
"epoch": 1.4157650695517774,
"grad_norm": 12.3125,
"learning_rate": 1.1410526315789475e-05,
"loss": 0.4544,
"step": 916
},
{
"epoch": 1.4173106646058733,
"grad_norm": 11.5,
"learning_rate": 1.14e-05,
"loss": 0.4991,
"step": 917
},
{
"epoch": 1.418856259659969,
"grad_norm": 13.0,
"learning_rate": 1.1389473684210527e-05,
"loss": 0.5215,
"step": 918
},
{
"epoch": 1.4204018547140649,
"grad_norm": 10.5,
"learning_rate": 1.1378947368421054e-05,
"loss": 0.4418,
"step": 919
},
{
"epoch": 1.4219474497681608,
"grad_norm": 11.8125,
"learning_rate": 1.136842105263158e-05,
"loss": 0.4987,
"step": 920
},
{
"epoch": 1.4234930448222567,
"grad_norm": 11.8125,
"learning_rate": 1.1357894736842106e-05,
"loss": 0.5373,
"step": 921
},
{
"epoch": 1.4250386398763524,
"grad_norm": 10.3125,
"learning_rate": 1.1347368421052634e-05,
"loss": 0.4254,
"step": 922
},
{
"epoch": 1.4265842349304483,
"grad_norm": 11.5625,
"learning_rate": 1.1336842105263159e-05,
"loss": 0.4403,
"step": 923
},
{
"epoch": 1.428129829984544,
"grad_norm": 10.1875,
"learning_rate": 1.1326315789473685e-05,
"loss": 0.4446,
"step": 924
},
{
"epoch": 1.4296754250386399,
"grad_norm": 10.125,
"learning_rate": 1.1315789473684212e-05,
"loss": 0.4583,
"step": 925
},
{
"epoch": 1.4312210200927358,
"grad_norm": 10.875,
"learning_rate": 1.1305263157894736e-05,
"loss": 0.465,
"step": 926
},
{
"epoch": 1.4327666151468315,
"grad_norm": 9.9375,
"learning_rate": 1.1294736842105264e-05,
"loss": 0.4704,
"step": 927
},
{
"epoch": 1.4343122102009274,
"grad_norm": 15.3125,
"learning_rate": 1.128421052631579e-05,
"loss": 0.4738,
"step": 928
},
{
"epoch": 1.435857805255023,
"grad_norm": 10.625,
"learning_rate": 1.1273684210526317e-05,
"loss": 0.4362,
"step": 929
},
{
"epoch": 1.437403400309119,
"grad_norm": 9.6875,
"learning_rate": 1.1263157894736843e-05,
"loss": 0.4921,
"step": 930
},
{
"epoch": 1.4389489953632149,
"grad_norm": 11.0625,
"learning_rate": 1.1252631578947368e-05,
"loss": 0.4173,
"step": 931
},
{
"epoch": 1.4404945904173108,
"grad_norm": 10.5,
"learning_rate": 1.1242105263157896e-05,
"loss": 0.477,
"step": 932
},
{
"epoch": 1.4420401854714064,
"grad_norm": 10.5625,
"learning_rate": 1.1231578947368422e-05,
"loss": 0.428,
"step": 933
},
{
"epoch": 1.4435857805255023,
"grad_norm": 10.6875,
"learning_rate": 1.1221052631578949e-05,
"loss": 0.5187,
"step": 934
},
{
"epoch": 1.445131375579598,
"grad_norm": 10.5625,
"learning_rate": 1.1210526315789473e-05,
"loss": 0.463,
"step": 935
},
{
"epoch": 1.446676970633694,
"grad_norm": 11.3125,
"learning_rate": 1.1200000000000001e-05,
"loss": 0.478,
"step": 936
},
{
"epoch": 1.4482225656877898,
"grad_norm": 10.75,
"learning_rate": 1.1189473684210528e-05,
"loss": 0.4547,
"step": 937
},
{
"epoch": 1.4497681607418857,
"grad_norm": 10.4375,
"learning_rate": 1.1178947368421054e-05,
"loss": 0.5182,
"step": 938
},
{
"epoch": 1.4513137557959814,
"grad_norm": 12.0625,
"learning_rate": 1.116842105263158e-05,
"loss": 0.4846,
"step": 939
},
{
"epoch": 1.4528593508500773,
"grad_norm": 11.0,
"learning_rate": 1.1157894736842105e-05,
"loss": 0.4685,
"step": 940
},
{
"epoch": 1.454404945904173,
"grad_norm": 11.125,
"learning_rate": 1.1147368421052633e-05,
"loss": 0.5032,
"step": 941
},
{
"epoch": 1.455950540958269,
"grad_norm": 19.125,
"learning_rate": 1.1136842105263159e-05,
"loss": 0.4597,
"step": 942
},
{
"epoch": 1.4574961360123648,
"grad_norm": 9.125,
"learning_rate": 1.1126315789473685e-05,
"loss": 0.5173,
"step": 943
},
{
"epoch": 1.4590417310664605,
"grad_norm": 11.75,
"learning_rate": 1.111578947368421e-05,
"loss": 0.4817,
"step": 944
},
{
"epoch": 1.4605873261205564,
"grad_norm": 11.625,
"learning_rate": 1.1105263157894736e-05,
"loss": 0.5123,
"step": 945
},
{
"epoch": 1.4621329211746523,
"grad_norm": 10.875,
"learning_rate": 1.1094736842105264e-05,
"loss": 0.4537,
"step": 946
},
{
"epoch": 1.463678516228748,
"grad_norm": 10.3125,
"learning_rate": 1.108421052631579e-05,
"loss": 0.5061,
"step": 947
},
{
"epoch": 1.465224111282844,
"grad_norm": 9.875,
"learning_rate": 1.1073684210526317e-05,
"loss": 0.4714,
"step": 948
},
{
"epoch": 1.4667697063369398,
"grad_norm": 10.1875,
"learning_rate": 1.1063157894736842e-05,
"loss": 0.4595,
"step": 949
},
{
"epoch": 1.4683153013910355,
"grad_norm": 9.5625,
"learning_rate": 1.105263157894737e-05,
"loss": 0.485,
"step": 950
},
{
"epoch": 1.4698608964451314,
"grad_norm": 11.3125,
"learning_rate": 1.1042105263157896e-05,
"loss": 0.4696,
"step": 951
},
{
"epoch": 1.471406491499227,
"grad_norm": 10.6875,
"learning_rate": 1.1031578947368422e-05,
"loss": 0.5382,
"step": 952
},
{
"epoch": 1.472952086553323,
"grad_norm": 11.125,
"learning_rate": 1.1021052631578947e-05,
"loss": 0.5138,
"step": 953
},
{
"epoch": 1.474497681607419,
"grad_norm": 10.6875,
"learning_rate": 1.1010526315789473e-05,
"loss": 0.4927,
"step": 954
},
{
"epoch": 1.4760432766615148,
"grad_norm": 8.875,
"learning_rate": 1.1000000000000001e-05,
"loss": 0.4716,
"step": 955
},
{
"epoch": 1.4775888717156105,
"grad_norm": 11.375,
"learning_rate": 1.0989473684210528e-05,
"loss": 0.4962,
"step": 956
},
{
"epoch": 1.4791344667697064,
"grad_norm": 10.9375,
"learning_rate": 1.0978947368421054e-05,
"loss": 0.4632,
"step": 957
},
{
"epoch": 1.480680061823802,
"grad_norm": 9.8125,
"learning_rate": 1.0968421052631579e-05,
"loss": 0.4632,
"step": 958
},
{
"epoch": 1.482225656877898,
"grad_norm": 9.75,
"learning_rate": 1.0957894736842105e-05,
"loss": 0.4837,
"step": 959
},
{
"epoch": 1.4837712519319939,
"grad_norm": 12.6875,
"learning_rate": 1.0947368421052633e-05,
"loss": 0.4365,
"step": 960
},
{
"epoch": 1.4853168469860896,
"grad_norm": 26.5,
"learning_rate": 1.093684210526316e-05,
"loss": 0.5004,
"step": 961
},
{
"epoch": 1.4868624420401855,
"grad_norm": 9.75,
"learning_rate": 1.0926315789473686e-05,
"loss": 0.4456,
"step": 962
},
{
"epoch": 1.4884080370942814,
"grad_norm": 12.0625,
"learning_rate": 1.091578947368421e-05,
"loss": 0.5036,
"step": 963
},
{
"epoch": 1.489953632148377,
"grad_norm": 10.125,
"learning_rate": 1.0905263157894738e-05,
"loss": 0.4448,
"step": 964
},
{
"epoch": 1.491499227202473,
"grad_norm": 10.6875,
"learning_rate": 1.0894736842105265e-05,
"loss": 0.3977,
"step": 965
},
{
"epoch": 1.4930448222565689,
"grad_norm": 9.6875,
"learning_rate": 1.0884210526315791e-05,
"loss": 0.5065,
"step": 966
},
{
"epoch": 1.4945904173106646,
"grad_norm": 12.125,
"learning_rate": 1.0873684210526316e-05,
"loss": 0.4816,
"step": 967
},
{
"epoch": 1.4961360123647605,
"grad_norm": 11.4375,
"learning_rate": 1.0863157894736842e-05,
"loss": 0.486,
"step": 968
},
{
"epoch": 1.4976816074188561,
"grad_norm": 10.875,
"learning_rate": 1.085263157894737e-05,
"loss": 0.4613,
"step": 969
},
{
"epoch": 1.499227202472952,
"grad_norm": 11.9375,
"learning_rate": 1.0842105263157896e-05,
"loss": 0.4749,
"step": 970
},
{
"epoch": 1.500772797527048,
"grad_norm": 37.0,
"learning_rate": 1.0831578947368423e-05,
"loss": 0.431,
"step": 971
},
{
"epoch": 1.5023183925811439,
"grad_norm": 9.5625,
"learning_rate": 1.0821052631578947e-05,
"loss": 0.4514,
"step": 972
},
{
"epoch": 1.5038639876352395,
"grad_norm": 11.0,
"learning_rate": 1.0810526315789474e-05,
"loss": 0.4485,
"step": 973
},
{
"epoch": 1.5054095826893354,
"grad_norm": 10.5,
"learning_rate": 1.0800000000000002e-05,
"loss": 0.4487,
"step": 974
},
{
"epoch": 1.5069551777434311,
"grad_norm": 10.3125,
"learning_rate": 1.0789473684210528e-05,
"loss": 0.5331,
"step": 975
},
{
"epoch": 1.508500772797527,
"grad_norm": 10.375,
"learning_rate": 1.0778947368421053e-05,
"loss": 0.4387,
"step": 976
},
{
"epoch": 1.510046367851623,
"grad_norm": 9.875,
"learning_rate": 1.0768421052631579e-05,
"loss": 0.4714,
"step": 977
},
{
"epoch": 1.5115919629057188,
"grad_norm": 10.5,
"learning_rate": 1.0757894736842107e-05,
"loss": 0.4552,
"step": 978
},
{
"epoch": 1.5131375579598145,
"grad_norm": 9.375,
"learning_rate": 1.0747368421052633e-05,
"loss": 0.4952,
"step": 979
},
{
"epoch": 1.5146831530139102,
"grad_norm": 10.0625,
"learning_rate": 1.073684210526316e-05,
"loss": 0.4592,
"step": 980
},
{
"epoch": 1.5162287480680061,
"grad_norm": 11.75,
"learning_rate": 1.0726315789473684e-05,
"loss": 0.5091,
"step": 981
},
{
"epoch": 1.517774343122102,
"grad_norm": 10.125,
"learning_rate": 1.071578947368421e-05,
"loss": 0.4525,
"step": 982
},
{
"epoch": 1.519319938176198,
"grad_norm": 11.0625,
"learning_rate": 1.0705263157894739e-05,
"loss": 0.4868,
"step": 983
},
{
"epoch": 1.5208655332302936,
"grad_norm": 16.875,
"learning_rate": 1.0694736842105265e-05,
"loss": 0.4312,
"step": 984
},
{
"epoch": 1.5224111282843895,
"grad_norm": 11.1875,
"learning_rate": 1.068421052631579e-05,
"loss": 0.4687,
"step": 985
},
{
"epoch": 1.5239567233384852,
"grad_norm": 10.75,
"learning_rate": 1.0673684210526316e-05,
"loss": 0.5132,
"step": 986
},
{
"epoch": 1.525502318392581,
"grad_norm": 10.4375,
"learning_rate": 1.0663157894736842e-05,
"loss": 0.4795,
"step": 987
},
{
"epoch": 1.527047913446677,
"grad_norm": 11.3125,
"learning_rate": 1.065263157894737e-05,
"loss": 0.5154,
"step": 988
},
{
"epoch": 1.528593508500773,
"grad_norm": 10.3125,
"learning_rate": 1.0642105263157897e-05,
"loss": 0.4829,
"step": 989
},
{
"epoch": 1.5301391035548686,
"grad_norm": 9.75,
"learning_rate": 1.0631578947368421e-05,
"loss": 0.4962,
"step": 990
},
{
"epoch": 1.5316846986089645,
"grad_norm": 10.6875,
"learning_rate": 1.0621052631578948e-05,
"loss": 0.529,
"step": 991
},
{
"epoch": 1.5332302936630602,
"grad_norm": 9.375,
"learning_rate": 1.0610526315789476e-05,
"loss": 0.462,
"step": 992
},
{
"epoch": 1.534775888717156,
"grad_norm": 15.4375,
"learning_rate": 1.0600000000000002e-05,
"loss": 0.4847,
"step": 993
},
{
"epoch": 1.536321483771252,
"grad_norm": 9.125,
"learning_rate": 1.0589473684210526e-05,
"loss": 0.4255,
"step": 994
},
{
"epoch": 1.537867078825348,
"grad_norm": 9.6875,
"learning_rate": 1.0578947368421053e-05,
"loss": 0.4902,
"step": 995
},
{
"epoch": 1.5394126738794436,
"grad_norm": 13.9375,
"learning_rate": 1.0568421052631579e-05,
"loss": 0.5037,
"step": 996
},
{
"epoch": 1.5409582689335393,
"grad_norm": 11.125,
"learning_rate": 1.0557894736842107e-05,
"loss": 0.5112,
"step": 997
},
{
"epoch": 1.5425038639876352,
"grad_norm": 10.3125,
"learning_rate": 1.0547368421052633e-05,
"loss": 0.4961,
"step": 998
},
{
"epoch": 1.544049459041731,
"grad_norm": 10.0,
"learning_rate": 1.0536842105263158e-05,
"loss": 0.4942,
"step": 999
},
{
"epoch": 1.545595054095827,
"grad_norm": 11.0,
"learning_rate": 1.0526315789473684e-05,
"loss": 0.4709,
"step": 1000
},
{
"epoch": 1.5471406491499229,
"grad_norm": 9.625,
"learning_rate": 1.051578947368421e-05,
"loss": 0.4633,
"step": 1001
},
{
"epoch": 1.5486862442040186,
"grad_norm": 16.25,
"learning_rate": 1.0505263157894739e-05,
"loss": 0.4135,
"step": 1002
},
{
"epoch": 1.5502318392581143,
"grad_norm": 9.8125,
"learning_rate": 1.0494736842105263e-05,
"loss": 0.5058,
"step": 1003
},
{
"epoch": 1.5517774343122102,
"grad_norm": 11.4375,
"learning_rate": 1.048421052631579e-05,
"loss": 0.4899,
"step": 1004
},
{
"epoch": 1.553323029366306,
"grad_norm": 11.0625,
"learning_rate": 1.0473684210526316e-05,
"loss": 0.4655,
"step": 1005
},
{
"epoch": 1.554868624420402,
"grad_norm": 9.6875,
"learning_rate": 1.0463157894736844e-05,
"loss": 0.4379,
"step": 1006
},
{
"epoch": 1.5564142194744977,
"grad_norm": 11.625,
"learning_rate": 1.045263157894737e-05,
"loss": 0.5125,
"step": 1007
},
{
"epoch": 1.5579598145285936,
"grad_norm": 9.5,
"learning_rate": 1.0442105263157895e-05,
"loss": 0.4641,
"step": 1008
},
{
"epoch": 1.5595054095826892,
"grad_norm": 9.625,
"learning_rate": 1.0431578947368421e-05,
"loss": 0.439,
"step": 1009
},
{
"epoch": 1.5610510046367851,
"grad_norm": 10.6875,
"learning_rate": 1.0421052631578948e-05,
"loss": 0.462,
"step": 1010
},
{
"epoch": 1.562596599690881,
"grad_norm": 10.0625,
"learning_rate": 1.0410526315789476e-05,
"loss": 0.4458,
"step": 1011
},
{
"epoch": 1.564142194744977,
"grad_norm": 11.0,
"learning_rate": 1.04e-05,
"loss": 0.4508,
"step": 1012
},
{
"epoch": 1.5656877897990726,
"grad_norm": 11.0625,
"learning_rate": 1.0389473684210527e-05,
"loss": 0.4882,
"step": 1013
},
{
"epoch": 1.5672333848531683,
"grad_norm": 9.4375,
"learning_rate": 1.0378947368421053e-05,
"loss": 0.5,
"step": 1014
},
{
"epoch": 1.5687789799072642,
"grad_norm": 11.6875,
"learning_rate": 1.036842105263158e-05,
"loss": 0.4609,
"step": 1015
},
{
"epoch": 1.5703245749613601,
"grad_norm": 11.0,
"learning_rate": 1.0357894736842107e-05,
"loss": 0.4355,
"step": 1016
},
{
"epoch": 1.571870170015456,
"grad_norm": 10.375,
"learning_rate": 1.0347368421052632e-05,
"loss": 0.4909,
"step": 1017
},
{
"epoch": 1.573415765069552,
"grad_norm": 10.5625,
"learning_rate": 1.0336842105263158e-05,
"loss": 0.4217,
"step": 1018
},
{
"epoch": 1.5749613601236476,
"grad_norm": 11.375,
"learning_rate": 1.0326315789473685e-05,
"loss": 0.4935,
"step": 1019
},
{
"epoch": 1.5765069551777433,
"grad_norm": 13.125,
"learning_rate": 1.0315789473684213e-05,
"loss": 0.4512,
"step": 1020
},
{
"epoch": 1.5780525502318392,
"grad_norm": 9.3125,
"learning_rate": 1.0305263157894739e-05,
"loss": 0.4376,
"step": 1021
},
{
"epoch": 1.5795981452859351,
"grad_norm": 10.9375,
"learning_rate": 1.0294736842105264e-05,
"loss": 0.5034,
"step": 1022
},
{
"epoch": 1.581143740340031,
"grad_norm": 10.625,
"learning_rate": 1.028421052631579e-05,
"loss": 0.4651,
"step": 1023
},
{
"epoch": 1.5826893353941267,
"grad_norm": 10.0,
"learning_rate": 1.0273684210526316e-05,
"loss": 0.4074,
"step": 1024
},
{
"epoch": 1.5842349304482226,
"grad_norm": 11.0625,
"learning_rate": 1.0263157894736844e-05,
"loss": 0.474,
"step": 1025
},
{
"epoch": 1.5857805255023183,
"grad_norm": 9.25,
"learning_rate": 1.0252631578947369e-05,
"loss": 0.4211,
"step": 1026
},
{
"epoch": 1.5873261205564142,
"grad_norm": 11.1875,
"learning_rate": 1.0242105263157895e-05,
"loss": 0.4565,
"step": 1027
},
{
"epoch": 1.58887171561051,
"grad_norm": 9.0,
"learning_rate": 1.0231578947368422e-05,
"loss": 0.4297,
"step": 1028
},
{
"epoch": 1.590417310664606,
"grad_norm": 9.9375,
"learning_rate": 1.0221052631578948e-05,
"loss": 0.5189,
"step": 1029
},
{
"epoch": 1.5919629057187017,
"grad_norm": 9.125,
"learning_rate": 1.0210526315789476e-05,
"loss": 0.4161,
"step": 1030
},
{
"epoch": 1.5935085007727974,
"grad_norm": 9.9375,
"learning_rate": 1.02e-05,
"loss": 0.4562,
"step": 1031
},
{
"epoch": 1.5950540958268933,
"grad_norm": 9.1875,
"learning_rate": 1.0189473684210527e-05,
"loss": 0.4355,
"step": 1032
},
{
"epoch": 1.5965996908809892,
"grad_norm": 9.375,
"learning_rate": 1.0178947368421053e-05,
"loss": 0.407,
"step": 1033
},
{
"epoch": 1.598145285935085,
"grad_norm": 10.375,
"learning_rate": 1.0168421052631581e-05,
"loss": 0.4643,
"step": 1034
},
{
"epoch": 1.599690880989181,
"grad_norm": 11.125,
"learning_rate": 1.0157894736842106e-05,
"loss": 0.4079,
"step": 1035
},
{
"epoch": 1.6012364760432767,
"grad_norm": 10.1875,
"learning_rate": 1.0147368421052632e-05,
"loss": 0.4584,
"step": 1036
},
{
"epoch": 1.6027820710973724,
"grad_norm": 8.875,
"learning_rate": 1.0136842105263159e-05,
"loss": 0.4669,
"step": 1037
},
{
"epoch": 1.6043276661514683,
"grad_norm": 10.3125,
"learning_rate": 1.0126315789473685e-05,
"loss": 0.5169,
"step": 1038
},
{
"epoch": 1.6058732612055642,
"grad_norm": 9.6875,
"learning_rate": 1.0115789473684213e-05,
"loss": 0.4227,
"step": 1039
},
{
"epoch": 1.60741885625966,
"grad_norm": 10.0,
"learning_rate": 1.0105263157894738e-05,
"loss": 0.4692,
"step": 1040
},
{
"epoch": 1.6089644513137558,
"grad_norm": 10.6875,
"learning_rate": 1.0094736842105264e-05,
"loss": 0.4665,
"step": 1041
},
{
"epoch": 1.6105100463678517,
"grad_norm": 10.4375,
"learning_rate": 1.008421052631579e-05,
"loss": 0.465,
"step": 1042
},
{
"epoch": 1.6120556414219473,
"grad_norm": 12.0625,
"learning_rate": 1.0073684210526315e-05,
"loss": 0.4639,
"step": 1043
},
{
"epoch": 1.6136012364760433,
"grad_norm": 10.25,
"learning_rate": 1.0063157894736843e-05,
"loss": 0.3995,
"step": 1044
},
{
"epoch": 1.6151468315301392,
"grad_norm": 9.9375,
"learning_rate": 1.005263157894737e-05,
"loss": 0.4406,
"step": 1045
},
{
"epoch": 1.616692426584235,
"grad_norm": 10.5,
"learning_rate": 1.0042105263157896e-05,
"loss": 0.4526,
"step": 1046
},
{
"epoch": 1.6182380216383307,
"grad_norm": 12.1875,
"learning_rate": 1.0031578947368422e-05,
"loss": 0.428,
"step": 1047
},
{
"epoch": 1.6197836166924264,
"grad_norm": 10.625,
"learning_rate": 1.002105263157895e-05,
"loss": 0.4434,
"step": 1048
},
{
"epoch": 1.6213292117465223,
"grad_norm": 10.1875,
"learning_rate": 1.0010526315789474e-05,
"loss": 0.4815,
"step": 1049
},
{
"epoch": 1.6228748068006182,
"grad_norm": 11.0625,
"learning_rate": 1e-05,
"loss": 0.4534,
"step": 1050
},
{
"epoch": 1.6244204018547141,
"grad_norm": 9.5625,
"learning_rate": 9.989473684210527e-06,
"loss": 0.432,
"step": 1051
},
{
"epoch": 1.62596599690881,
"grad_norm": 10.1875,
"learning_rate": 9.978947368421053e-06,
"loss": 0.4792,
"step": 1052
},
{
"epoch": 1.6275115919629057,
"grad_norm": 14.625,
"learning_rate": 9.96842105263158e-06,
"loss": 0.5092,
"step": 1053
},
{
"epoch": 1.6290571870170014,
"grad_norm": 8.6875,
"learning_rate": 9.957894736842106e-06,
"loss": 0.4296,
"step": 1054
},
{
"epoch": 1.6306027820710973,
"grad_norm": 9.25,
"learning_rate": 9.947368421052632e-06,
"loss": 0.4024,
"step": 1055
},
{
"epoch": 1.6321483771251932,
"grad_norm": 11.3125,
"learning_rate": 9.936842105263159e-06,
"loss": 0.4677,
"step": 1056
},
{
"epoch": 1.6336939721792891,
"grad_norm": 10.25,
"learning_rate": 9.926315789473685e-06,
"loss": 0.4696,
"step": 1057
},
{
"epoch": 1.6352395672333848,
"grad_norm": 10.875,
"learning_rate": 9.915789473684211e-06,
"loss": 0.4547,
"step": 1058
},
{
"epoch": 1.6367851622874807,
"grad_norm": 11.5,
"learning_rate": 9.905263157894738e-06,
"loss": 0.4883,
"step": 1059
},
{
"epoch": 1.6383307573415764,
"grad_norm": 10.0,
"learning_rate": 9.894736842105264e-06,
"loss": 0.4173,
"step": 1060
},
{
"epoch": 1.6398763523956723,
"grad_norm": 10.9375,
"learning_rate": 9.88421052631579e-06,
"loss": 0.4968,
"step": 1061
},
{
"epoch": 1.6414219474497682,
"grad_norm": 12.5625,
"learning_rate": 9.873684210526317e-06,
"loss": 0.3705,
"step": 1062
},
{
"epoch": 1.6429675425038641,
"grad_norm": 11.0625,
"learning_rate": 9.863157894736843e-06,
"loss": 0.4502,
"step": 1063
},
{
"epoch": 1.6445131375579598,
"grad_norm": 10.8125,
"learning_rate": 9.85263157894737e-06,
"loss": 0.4769,
"step": 1064
},
{
"epoch": 1.6460587326120555,
"grad_norm": 9.1875,
"learning_rate": 9.842105263157896e-06,
"loss": 0.4284,
"step": 1065
},
{
"epoch": 1.6476043276661514,
"grad_norm": 9.6875,
"learning_rate": 9.831578947368422e-06,
"loss": 0.4869,
"step": 1066
},
{
"epoch": 1.6491499227202473,
"grad_norm": 10.375,
"learning_rate": 9.821052631578948e-06,
"loss": 0.4387,
"step": 1067
},
{
"epoch": 1.6506955177743432,
"grad_norm": 11.375,
"learning_rate": 9.810526315789475e-06,
"loss": 0.478,
"step": 1068
},
{
"epoch": 1.652241112828439,
"grad_norm": 9.75,
"learning_rate": 9.800000000000001e-06,
"loss": 0.3812,
"step": 1069
},
{
"epoch": 1.6537867078825348,
"grad_norm": 9.3125,
"learning_rate": 9.789473684210527e-06,
"loss": 0.421,
"step": 1070
},
{
"epoch": 1.6553323029366305,
"grad_norm": 10.8125,
"learning_rate": 9.778947368421054e-06,
"loss": 0.441,
"step": 1071
},
{
"epoch": 1.6568778979907264,
"grad_norm": 10.0625,
"learning_rate": 9.76842105263158e-06,
"loss": 0.4801,
"step": 1072
},
{
"epoch": 1.6584234930448223,
"grad_norm": 10.875,
"learning_rate": 9.757894736842106e-06,
"loss": 0.4156,
"step": 1073
},
{
"epoch": 1.6599690880989182,
"grad_norm": 11.4375,
"learning_rate": 9.747368421052633e-06,
"loss": 0.4873,
"step": 1074
},
{
"epoch": 1.6615146831530139,
"grad_norm": 24.375,
"learning_rate": 9.736842105263159e-06,
"loss": 0.4652,
"step": 1075
},
{
"epoch": 1.6630602782071098,
"grad_norm": 9.9375,
"learning_rate": 9.726315789473685e-06,
"loss": 0.4408,
"step": 1076
},
{
"epoch": 1.6646058732612055,
"grad_norm": 9.1875,
"learning_rate": 9.715789473684212e-06,
"loss": 0.4339,
"step": 1077
},
{
"epoch": 1.6661514683153014,
"grad_norm": 9.625,
"learning_rate": 9.705263157894738e-06,
"loss": 0.4899,
"step": 1078
},
{
"epoch": 1.6676970633693973,
"grad_norm": 10.1875,
"learning_rate": 9.694736842105263e-06,
"loss": 0.4612,
"step": 1079
},
{
"epoch": 1.6692426584234932,
"grad_norm": 10.0,
"learning_rate": 9.68421052631579e-06,
"loss": 0.4764,
"step": 1080
},
{
"epoch": 1.6707882534775889,
"grad_norm": 11.1875,
"learning_rate": 9.673684210526317e-06,
"loss": 0.4575,
"step": 1081
},
{
"epoch": 1.6723338485316845,
"grad_norm": 12.1875,
"learning_rate": 9.663157894736843e-06,
"loss": 0.4651,
"step": 1082
},
{
"epoch": 1.6738794435857804,
"grad_norm": 9.5625,
"learning_rate": 9.65263157894737e-06,
"loss": 0.4052,
"step": 1083
},
{
"epoch": 1.6754250386398764,
"grad_norm": 11.4375,
"learning_rate": 9.642105263157896e-06,
"loss": 0.4619,
"step": 1084
},
{
"epoch": 1.6769706336939723,
"grad_norm": 20.375,
"learning_rate": 9.631578947368422e-06,
"loss": 0.4103,
"step": 1085
},
{
"epoch": 1.6785162287480682,
"grad_norm": 10.625,
"learning_rate": 9.621052631578947e-06,
"loss": 0.4498,
"step": 1086
},
{
"epoch": 1.6800618238021638,
"grad_norm": 14.5625,
"learning_rate": 9.610526315789475e-06,
"loss": 0.5129,
"step": 1087
},
{
"epoch": 1.6816074188562595,
"grad_norm": 10.625,
"learning_rate": 9.600000000000001e-06,
"loss": 0.4292,
"step": 1088
},
{
"epoch": 1.6831530139103554,
"grad_norm": 13.875,
"learning_rate": 9.589473684210528e-06,
"loss": 0.4277,
"step": 1089
},
{
"epoch": 1.6846986089644513,
"grad_norm": 10.1875,
"learning_rate": 9.578947368421054e-06,
"loss": 0.5244,
"step": 1090
},
{
"epoch": 1.6862442040185472,
"grad_norm": 12.6875,
"learning_rate": 9.56842105263158e-06,
"loss": 0.5629,
"step": 1091
},
{
"epoch": 1.687789799072643,
"grad_norm": 10.75,
"learning_rate": 9.557894736842107e-06,
"loss": 0.4874,
"step": 1092
},
{
"epoch": 1.6893353941267388,
"grad_norm": 9.6875,
"learning_rate": 9.547368421052631e-06,
"loss": 0.436,
"step": 1093
},
{
"epoch": 1.6908809891808345,
"grad_norm": 10.5625,
"learning_rate": 9.53684210526316e-06,
"loss": 0.4772,
"step": 1094
},
{
"epoch": 1.6924265842349304,
"grad_norm": 10.4375,
"learning_rate": 9.526315789473684e-06,
"loss": 0.4902,
"step": 1095
},
{
"epoch": 1.6939721792890263,
"grad_norm": 9.9375,
"learning_rate": 9.515789473684212e-06,
"loss": 0.4513,
"step": 1096
},
{
"epoch": 1.6955177743431222,
"grad_norm": 12.1875,
"learning_rate": 9.505263157894738e-06,
"loss": 0.5156,
"step": 1097
},
{
"epoch": 1.697063369397218,
"grad_norm": 9.5625,
"learning_rate": 9.494736842105265e-06,
"loss": 0.4841,
"step": 1098
},
{
"epoch": 1.6986089644513136,
"grad_norm": 11.4375,
"learning_rate": 9.484210526315791e-06,
"loss": 0.4655,
"step": 1099
},
{
"epoch": 1.7001545595054095,
"grad_norm": 12.0625,
"learning_rate": 9.473684210526315e-06,
"loss": 0.4568,
"step": 1100
},
{
"epoch": 1.7017001545595054,
"grad_norm": 12.375,
"learning_rate": 9.463157894736844e-06,
"loss": 0.4388,
"step": 1101
},
{
"epoch": 1.7032457496136013,
"grad_norm": 9.5,
"learning_rate": 9.452631578947368e-06,
"loss": 0.4388,
"step": 1102
},
{
"epoch": 1.7047913446676972,
"grad_norm": 11.1875,
"learning_rate": 9.442105263157896e-06,
"loss": 0.4927,
"step": 1103
},
{
"epoch": 1.706336939721793,
"grad_norm": 9.625,
"learning_rate": 9.43157894736842e-06,
"loss": 0.4393,
"step": 1104
},
{
"epoch": 1.7078825347758886,
"grad_norm": 10.3125,
"learning_rate": 9.421052631578949e-06,
"loss": 0.5402,
"step": 1105
},
{
"epoch": 1.7094281298299845,
"grad_norm": 13.5625,
"learning_rate": 9.410526315789475e-06,
"loss": 0.4338,
"step": 1106
},
{
"epoch": 1.7109737248840804,
"grad_norm": 22.0,
"learning_rate": 9.4e-06,
"loss": 0.4686,
"step": 1107
},
{
"epoch": 1.7125193199381763,
"grad_norm": 9.5625,
"learning_rate": 9.389473684210528e-06,
"loss": 0.4635,
"step": 1108
},
{
"epoch": 1.714064914992272,
"grad_norm": 10.5625,
"learning_rate": 9.378947368421052e-06,
"loss": 0.4477,
"step": 1109
},
{
"epoch": 1.7156105100463679,
"grad_norm": 14.5,
"learning_rate": 9.36842105263158e-06,
"loss": 0.5234,
"step": 1110
},
{
"epoch": 1.7171561051004636,
"grad_norm": 9.875,
"learning_rate": 9.357894736842105e-06,
"loss": 0.4074,
"step": 1111
},
{
"epoch": 1.7187017001545595,
"grad_norm": 10.0625,
"learning_rate": 9.347368421052633e-06,
"loss": 0.4459,
"step": 1112
},
{
"epoch": 1.7202472952086554,
"grad_norm": 13.75,
"learning_rate": 9.336842105263158e-06,
"loss": 0.49,
"step": 1113
},
{
"epoch": 1.7217928902627513,
"grad_norm": 9.5625,
"learning_rate": 9.326315789473684e-06,
"loss": 0.4651,
"step": 1114
},
{
"epoch": 1.723338485316847,
"grad_norm": 13.1875,
"learning_rate": 9.315789473684212e-06,
"loss": 0.4525,
"step": 1115
},
{
"epoch": 1.7248840803709427,
"grad_norm": 9.1875,
"learning_rate": 9.305263157894737e-06,
"loss": 0.4321,
"step": 1116
},
{
"epoch": 1.7264296754250386,
"grad_norm": 12.375,
"learning_rate": 9.294736842105265e-06,
"loss": 0.4772,
"step": 1117
},
{
"epoch": 1.7279752704791345,
"grad_norm": 9.6875,
"learning_rate": 9.28421052631579e-06,
"loss": 0.4411,
"step": 1118
},
{
"epoch": 1.7295208655332304,
"grad_norm": 12.1875,
"learning_rate": 9.273684210526317e-06,
"loss": 0.4489,
"step": 1119
},
{
"epoch": 1.7310664605873263,
"grad_norm": 13.6875,
"learning_rate": 9.263157894736842e-06,
"loss": 0.4343,
"step": 1120
},
{
"epoch": 1.732612055641422,
"grad_norm": 10.0,
"learning_rate": 9.252631578947368e-06,
"loss": 0.3969,
"step": 1121
},
{
"epoch": 1.7341576506955176,
"grad_norm": 10.6875,
"learning_rate": 9.242105263157896e-06,
"loss": 0.4903,
"step": 1122
},
{
"epoch": 1.7357032457496135,
"grad_norm": 9.0625,
"learning_rate": 9.231578947368421e-06,
"loss": 0.4758,
"step": 1123
},
{
"epoch": 1.7372488408037094,
"grad_norm": 10.3125,
"learning_rate": 9.221052631578949e-06,
"loss": 0.3913,
"step": 1124
},
{
"epoch": 1.7387944358578054,
"grad_norm": 10.5,
"learning_rate": 9.210526315789474e-06,
"loss": 0.4722,
"step": 1125
},
{
"epoch": 1.740340030911901,
"grad_norm": 9.8125,
"learning_rate": 9.200000000000002e-06,
"loss": 0.4681,
"step": 1126
},
{
"epoch": 1.741885625965997,
"grad_norm": 14.4375,
"learning_rate": 9.189473684210526e-06,
"loss": 0.3733,
"step": 1127
},
{
"epoch": 1.7434312210200926,
"grad_norm": 16.0,
"learning_rate": 9.178947368421053e-06,
"loss": 0.4727,
"step": 1128
},
{
"epoch": 1.7449768160741885,
"grad_norm": 9.9375,
"learning_rate": 9.168421052631579e-06,
"loss": 0.467,
"step": 1129
},
{
"epoch": 1.7465224111282844,
"grad_norm": 8.9375,
"learning_rate": 9.157894736842105e-06,
"loss": 0.471,
"step": 1130
},
{
"epoch": 1.7480680061823803,
"grad_norm": 9.5,
"learning_rate": 9.147368421052633e-06,
"loss": 0.4584,
"step": 1131
},
{
"epoch": 1.749613601236476,
"grad_norm": 12.125,
"learning_rate": 9.136842105263158e-06,
"loss": 0.4868,
"step": 1132
},
{
"epoch": 1.7511591962905717,
"grad_norm": 11.5,
"learning_rate": 9.126315789473686e-06,
"loss": 0.4431,
"step": 1133
},
{
"epoch": 1.7527047913446676,
"grad_norm": 11.5625,
"learning_rate": 9.11578947368421e-06,
"loss": 0.4722,
"step": 1134
},
{
"epoch": 1.7542503863987635,
"grad_norm": 10.9375,
"learning_rate": 9.105263157894739e-06,
"loss": 0.4667,
"step": 1135
},
{
"epoch": 1.7557959814528594,
"grad_norm": 10.625,
"learning_rate": 9.094736842105263e-06,
"loss": 0.4947,
"step": 1136
},
{
"epoch": 1.7573415765069553,
"grad_norm": 10.125,
"learning_rate": 9.08421052631579e-06,
"loss": 0.4612,
"step": 1137
},
{
"epoch": 1.758887171561051,
"grad_norm": 11.5,
"learning_rate": 9.073684210526316e-06,
"loss": 0.5586,
"step": 1138
},
{
"epoch": 1.7604327666151467,
"grad_norm": 10.1875,
"learning_rate": 9.063157894736842e-06,
"loss": 0.4374,
"step": 1139
},
{
"epoch": 1.7619783616692426,
"grad_norm": 10.25,
"learning_rate": 9.05263157894737e-06,
"loss": 0.4535,
"step": 1140
},
{
"epoch": 1.7635239567233385,
"grad_norm": 10.0,
"learning_rate": 9.042105263157895e-06,
"loss": 0.436,
"step": 1141
},
{
"epoch": 1.7650695517774344,
"grad_norm": 11.9375,
"learning_rate": 9.031578947368423e-06,
"loss": 0.5334,
"step": 1142
},
{
"epoch": 1.76661514683153,
"grad_norm": 10.4375,
"learning_rate": 9.021052631578948e-06,
"loss": 0.5028,
"step": 1143
},
{
"epoch": 1.768160741885626,
"grad_norm": 9.25,
"learning_rate": 9.010526315789474e-06,
"loss": 0.4424,
"step": 1144
},
{
"epoch": 1.7697063369397217,
"grad_norm": 9.75,
"learning_rate": 9e-06,
"loss": 0.4092,
"step": 1145
},
{
"epoch": 1.7712519319938176,
"grad_norm": 9.3125,
"learning_rate": 8.989473684210527e-06,
"loss": 0.4267,
"step": 1146
},
{
"epoch": 1.7727975270479135,
"grad_norm": 10.0625,
"learning_rate": 8.978947368421055e-06,
"loss": 0.4288,
"step": 1147
},
{
"epoch": 1.7743431221020094,
"grad_norm": 11.125,
"learning_rate": 8.96842105263158e-06,
"loss": 0.5062,
"step": 1148
},
{
"epoch": 1.775888717156105,
"grad_norm": 10.0,
"learning_rate": 8.957894736842107e-06,
"loss": 0.4726,
"step": 1149
},
{
"epoch": 1.7774343122102008,
"grad_norm": 10.25,
"learning_rate": 8.947368421052632e-06,
"loss": 0.4466,
"step": 1150
},
{
"epoch": 1.7789799072642967,
"grad_norm": 9.6875,
"learning_rate": 8.936842105263158e-06,
"loss": 0.5098,
"step": 1151
},
{
"epoch": 1.7805255023183926,
"grad_norm": 10.25,
"learning_rate": 8.926315789473685e-06,
"loss": 0.4475,
"step": 1152
},
{
"epoch": 1.7820710973724885,
"grad_norm": 9.75,
"learning_rate": 8.915789473684211e-06,
"loss": 0.425,
"step": 1153
},
{
"epoch": 1.7836166924265844,
"grad_norm": 11.1875,
"learning_rate": 8.905263157894737e-06,
"loss": 0.5059,
"step": 1154
},
{
"epoch": 1.78516228748068,
"grad_norm": 13.75,
"learning_rate": 8.894736842105264e-06,
"loss": 0.4619,
"step": 1155
},
{
"epoch": 1.7867078825347757,
"grad_norm": 10.0,
"learning_rate": 8.884210526315792e-06,
"loss": 0.4803,
"step": 1156
},
{
"epoch": 1.7882534775888717,
"grad_norm": 11.875,
"learning_rate": 8.873684210526316e-06,
"loss": 0.4309,
"step": 1157
},
{
"epoch": 1.7897990726429676,
"grad_norm": 10.0,
"learning_rate": 8.863157894736842e-06,
"loss": 0.4585,
"step": 1158
},
{
"epoch": 1.7913446676970635,
"grad_norm": 12.0625,
"learning_rate": 8.852631578947369e-06,
"loss": 0.4533,
"step": 1159
},
{
"epoch": 1.7928902627511591,
"grad_norm": 10.0625,
"learning_rate": 8.842105263157895e-06,
"loss": 0.4266,
"step": 1160
},
{
"epoch": 1.794435857805255,
"grad_norm": 10.125,
"learning_rate": 8.831578947368421e-06,
"loss": 0.4381,
"step": 1161
},
{
"epoch": 1.7959814528593507,
"grad_norm": 10.0,
"learning_rate": 8.821052631578948e-06,
"loss": 0.4596,
"step": 1162
},
{
"epoch": 1.7975270479134466,
"grad_norm": 9.8125,
"learning_rate": 8.810526315789474e-06,
"loss": 0.5047,
"step": 1163
},
{
"epoch": 1.7990726429675425,
"grad_norm": 11.3125,
"learning_rate": 8.8e-06,
"loss": 0.4673,
"step": 1164
},
{
"epoch": 1.8006182380216385,
"grad_norm": 10.3125,
"learning_rate": 8.789473684210527e-06,
"loss": 0.4945,
"step": 1165
},
{
"epoch": 1.8021638330757341,
"grad_norm": 9.5,
"learning_rate": 8.778947368421053e-06,
"loss": 0.4753,
"step": 1166
},
{
"epoch": 1.80370942812983,
"grad_norm": 9.375,
"learning_rate": 8.76842105263158e-06,
"loss": 0.4579,
"step": 1167
},
{
"epoch": 1.8052550231839257,
"grad_norm": 23.75,
"learning_rate": 8.757894736842106e-06,
"loss": 0.4629,
"step": 1168
},
{
"epoch": 1.8068006182380216,
"grad_norm": 10.25,
"learning_rate": 8.747368421052632e-06,
"loss": 0.4488,
"step": 1169
},
{
"epoch": 1.8083462132921175,
"grad_norm": 9.5,
"learning_rate": 8.736842105263158e-06,
"loss": 0.4859,
"step": 1170
},
{
"epoch": 1.8098918083462134,
"grad_norm": 9.4375,
"learning_rate": 8.726315789473685e-06,
"loss": 0.4414,
"step": 1171
},
{
"epoch": 1.8114374034003091,
"grad_norm": 18.125,
"learning_rate": 8.715789473684211e-06,
"loss": 0.4149,
"step": 1172
},
{
"epoch": 1.8129829984544048,
"grad_norm": 10.3125,
"learning_rate": 8.705263157894737e-06,
"loss": 0.4415,
"step": 1173
},
{
"epoch": 1.8145285935085007,
"grad_norm": 11.0625,
"learning_rate": 8.694736842105264e-06,
"loss": 0.4448,
"step": 1174
},
{
"epoch": 1.8160741885625966,
"grad_norm": 10.3125,
"learning_rate": 8.68421052631579e-06,
"loss": 0.4808,
"step": 1175
},
{
"epoch": 1.8176197836166925,
"grad_norm": 10.6875,
"learning_rate": 8.673684210526316e-06,
"loss": 0.4208,
"step": 1176
},
{
"epoch": 1.8191653786707882,
"grad_norm": 12.6875,
"learning_rate": 8.663157894736843e-06,
"loss": 0.4413,
"step": 1177
},
{
"epoch": 1.820710973724884,
"grad_norm": 24.125,
"learning_rate": 8.652631578947369e-06,
"loss": 0.4286,
"step": 1178
},
{
"epoch": 1.8222565687789798,
"grad_norm": 10.3125,
"learning_rate": 8.642105263157895e-06,
"loss": 0.4778,
"step": 1179
},
{
"epoch": 1.8238021638330757,
"grad_norm": 12.9375,
"learning_rate": 8.631578947368422e-06,
"loss": 0.3949,
"step": 1180
},
{
"epoch": 1.8253477588871716,
"grad_norm": 11.6875,
"learning_rate": 8.621052631578948e-06,
"loss": 0.482,
"step": 1181
},
{
"epoch": 1.8268933539412675,
"grad_norm": 9.25,
"learning_rate": 8.610526315789474e-06,
"loss": 0.3963,
"step": 1182
},
{
"epoch": 1.8284389489953632,
"grad_norm": 12.0,
"learning_rate": 8.6e-06,
"loss": 0.4687,
"step": 1183
},
{
"epoch": 1.829984544049459,
"grad_norm": 8.6875,
"learning_rate": 8.589473684210527e-06,
"loss": 0.4124,
"step": 1184
},
{
"epoch": 1.8315301391035548,
"grad_norm": 9.0625,
"learning_rate": 8.578947368421053e-06,
"loss": 0.4552,
"step": 1185
},
{
"epoch": 1.8330757341576507,
"grad_norm": 9.25,
"learning_rate": 8.56842105263158e-06,
"loss": 0.4079,
"step": 1186
},
{
"epoch": 1.8346213292117466,
"grad_norm": 9.6875,
"learning_rate": 8.557894736842106e-06,
"loss": 0.4101,
"step": 1187
},
{
"epoch": 1.8361669242658425,
"grad_norm": 9.5625,
"learning_rate": 8.547368421052632e-06,
"loss": 0.4624,
"step": 1188
},
{
"epoch": 1.8377125193199382,
"grad_norm": 10.125,
"learning_rate": 8.536842105263159e-06,
"loss": 0.4779,
"step": 1189
},
{
"epoch": 1.8392581143740339,
"grad_norm": 11.9375,
"learning_rate": 8.526315789473685e-06,
"loss": 0.5607,
"step": 1190
},
{
"epoch": 1.8408037094281298,
"grad_norm": 11.8125,
"learning_rate": 8.515789473684211e-06,
"loss": 0.4958,
"step": 1191
},
{
"epoch": 1.8423493044822257,
"grad_norm": 10.125,
"learning_rate": 8.505263157894738e-06,
"loss": 0.3887,
"step": 1192
},
{
"epoch": 1.8438948995363216,
"grad_norm": 10.1875,
"learning_rate": 8.494736842105264e-06,
"loss": 0.4486,
"step": 1193
},
{
"epoch": 1.8454404945904173,
"grad_norm": 10.9375,
"learning_rate": 8.48421052631579e-06,
"loss": 0.4227,
"step": 1194
},
{
"epoch": 1.8469860896445132,
"grad_norm": 9.625,
"learning_rate": 8.473684210526317e-06,
"loss": 0.4628,
"step": 1195
},
{
"epoch": 1.8485316846986088,
"grad_norm": 10.5625,
"learning_rate": 8.463157894736843e-06,
"loss": 0.4385,
"step": 1196
},
{
"epoch": 1.8500772797527048,
"grad_norm": 9.875,
"learning_rate": 8.45263157894737e-06,
"loss": 0.4347,
"step": 1197
},
{
"epoch": 1.8516228748068007,
"grad_norm": 10.25,
"learning_rate": 8.442105263157896e-06,
"loss": 0.4574,
"step": 1198
},
{
"epoch": 1.8531684698608966,
"grad_norm": 10.875,
"learning_rate": 8.431578947368422e-06,
"loss": 0.4808,
"step": 1199
},
{
"epoch": 1.8547140649149922,
"grad_norm": 9.8125,
"learning_rate": 8.421052631578948e-06,
"loss": 0.4708,
"step": 1200
},
{
"epoch": 1.8562596599690881,
"grad_norm": 10.9375,
"learning_rate": 8.410526315789475e-06,
"loss": 0.5147,
"step": 1201
},
{
"epoch": 1.8578052550231838,
"grad_norm": 9.5625,
"learning_rate": 8.400000000000001e-06,
"loss": 0.4379,
"step": 1202
},
{
"epoch": 1.8593508500772797,
"grad_norm": 11.625,
"learning_rate": 8.389473684210527e-06,
"loss": 0.4416,
"step": 1203
},
{
"epoch": 1.8608964451313756,
"grad_norm": 10.625,
"learning_rate": 8.378947368421054e-06,
"loss": 0.4611,
"step": 1204
},
{
"epoch": 1.8624420401854715,
"grad_norm": 10.8125,
"learning_rate": 8.36842105263158e-06,
"loss": 0.4531,
"step": 1205
},
{
"epoch": 1.8639876352395672,
"grad_norm": 10.75,
"learning_rate": 8.357894736842106e-06,
"loss": 0.4236,
"step": 1206
},
{
"epoch": 1.865533230293663,
"grad_norm": 11.6875,
"learning_rate": 8.347368421052633e-06,
"loss": 0.4655,
"step": 1207
},
{
"epoch": 1.8670788253477588,
"grad_norm": 10.9375,
"learning_rate": 8.336842105263159e-06,
"loss": 0.4587,
"step": 1208
},
{
"epoch": 1.8686244204018547,
"grad_norm": 10.75,
"learning_rate": 8.326315789473685e-06,
"loss": 0.5585,
"step": 1209
},
{
"epoch": 1.8701700154559506,
"grad_norm": 9.9375,
"learning_rate": 8.315789473684212e-06,
"loss": 0.3895,
"step": 1210
},
{
"epoch": 1.8717156105100463,
"grad_norm": 9.5,
"learning_rate": 8.305263157894738e-06,
"loss": 0.3855,
"step": 1211
},
{
"epoch": 1.8732612055641422,
"grad_norm": 10.75,
"learning_rate": 8.294736842105264e-06,
"loss": 0.4555,
"step": 1212
},
{
"epoch": 1.874806800618238,
"grad_norm": 10.8125,
"learning_rate": 8.28421052631579e-06,
"loss": 0.5291,
"step": 1213
},
{
"epoch": 1.8763523956723338,
"grad_norm": 10.125,
"learning_rate": 8.273684210526317e-06,
"loss": 0.4472,
"step": 1214
},
{
"epoch": 1.8778979907264297,
"grad_norm": 10.9375,
"learning_rate": 8.263157894736843e-06,
"loss": 0.463,
"step": 1215
},
{
"epoch": 1.8794435857805256,
"grad_norm": 19.625,
"learning_rate": 8.25263157894737e-06,
"loss": 0.484,
"step": 1216
},
{
"epoch": 1.8809891808346213,
"grad_norm": 9.875,
"learning_rate": 8.242105263157896e-06,
"loss": 0.4934,
"step": 1217
},
{
"epoch": 1.8825347758887172,
"grad_norm": 11.9375,
"learning_rate": 8.231578947368422e-06,
"loss": 0.417,
"step": 1218
},
{
"epoch": 1.8840803709428129,
"grad_norm": 10.0,
"learning_rate": 8.221052631578948e-06,
"loss": 0.4737,
"step": 1219
},
{
"epoch": 1.8856259659969088,
"grad_norm": 11.75,
"learning_rate": 8.210526315789475e-06,
"loss": 0.4513,
"step": 1220
},
{
"epoch": 1.8871715610510047,
"grad_norm": 10.5,
"learning_rate": 8.2e-06,
"loss": 0.4611,
"step": 1221
},
{
"epoch": 1.8887171561051006,
"grad_norm": 11.4375,
"learning_rate": 8.189473684210527e-06,
"loss": 0.4833,
"step": 1222
},
{
"epoch": 1.8902627511591963,
"grad_norm": 12.0,
"learning_rate": 8.178947368421054e-06,
"loss": 0.4304,
"step": 1223
},
{
"epoch": 1.891808346213292,
"grad_norm": 10.8125,
"learning_rate": 8.16842105263158e-06,
"loss": 0.4075,
"step": 1224
},
{
"epoch": 1.8933539412673879,
"grad_norm": 13.5625,
"learning_rate": 8.157894736842106e-06,
"loss": 0.5219,
"step": 1225
},
{
"epoch": 1.8948995363214838,
"grad_norm": 8.0,
"learning_rate": 8.147368421052633e-06,
"loss": 0.3928,
"step": 1226
},
{
"epoch": 1.8964451313755797,
"grad_norm": 15.6875,
"learning_rate": 8.136842105263159e-06,
"loss": 0.398,
"step": 1227
},
{
"epoch": 1.8979907264296756,
"grad_norm": 9.8125,
"learning_rate": 8.126315789473684e-06,
"loss": 0.4388,
"step": 1228
},
{
"epoch": 1.8995363214837713,
"grad_norm": 10.6875,
"learning_rate": 8.115789473684212e-06,
"loss": 0.4244,
"step": 1229
},
{
"epoch": 1.901081916537867,
"grad_norm": 9.125,
"learning_rate": 8.105263157894736e-06,
"loss": 0.3926,
"step": 1230
},
{
"epoch": 1.9026275115919629,
"grad_norm": 10.75,
"learning_rate": 8.094736842105264e-06,
"loss": 0.4463,
"step": 1231
},
{
"epoch": 1.9041731066460588,
"grad_norm": 13.375,
"learning_rate": 8.08421052631579e-06,
"loss": 0.4225,
"step": 1232
},
{
"epoch": 1.9057187017001547,
"grad_norm": 10.6875,
"learning_rate": 8.073684210526317e-06,
"loss": 0.4547,
"step": 1233
},
{
"epoch": 1.9072642967542504,
"grad_norm": 10.5,
"learning_rate": 8.063157894736843e-06,
"loss": 0.427,
"step": 1234
},
{
"epoch": 1.9088098918083463,
"grad_norm": 16.75,
"learning_rate": 8.052631578947368e-06,
"loss": 0.45,
"step": 1235
},
{
"epoch": 1.910355486862442,
"grad_norm": 9.75,
"learning_rate": 8.042105263157896e-06,
"loss": 0.4475,
"step": 1236
},
{
"epoch": 1.9119010819165378,
"grad_norm": 9.875,
"learning_rate": 8.03157894736842e-06,
"loss": 0.4848,
"step": 1237
},
{
"epoch": 1.9134466769706338,
"grad_norm": 12.0,
"learning_rate": 8.021052631578949e-06,
"loss": 0.485,
"step": 1238
},
{
"epoch": 1.9149922720247297,
"grad_norm": 10.5,
"learning_rate": 8.010526315789473e-06,
"loss": 0.4884,
"step": 1239
},
{
"epoch": 1.9165378670788253,
"grad_norm": 10.5,
"learning_rate": 8.000000000000001e-06,
"loss": 0.4178,
"step": 1240
},
{
"epoch": 1.918083462132921,
"grad_norm": 11.25,
"learning_rate": 7.989473684210528e-06,
"loss": 0.4832,
"step": 1241
},
{
"epoch": 1.919629057187017,
"grad_norm": 11.625,
"learning_rate": 7.978947368421052e-06,
"loss": 0.4736,
"step": 1242
},
{
"epoch": 1.9211746522411128,
"grad_norm": 11.3125,
"learning_rate": 7.96842105263158e-06,
"loss": 0.4225,
"step": 1243
},
{
"epoch": 1.9227202472952087,
"grad_norm": 8.9375,
"learning_rate": 7.957894736842105e-06,
"loss": 0.4381,
"step": 1244
},
{
"epoch": 1.9242658423493046,
"grad_norm": 11.375,
"learning_rate": 7.947368421052633e-06,
"loss": 0.4732,
"step": 1245
},
{
"epoch": 1.9258114374034003,
"grad_norm": 10.375,
"learning_rate": 7.936842105263158e-06,
"loss": 0.4806,
"step": 1246
},
{
"epoch": 1.927357032457496,
"grad_norm": 10.375,
"learning_rate": 7.926315789473686e-06,
"loss": 0.371,
"step": 1247
},
{
"epoch": 1.928902627511592,
"grad_norm": 9.75,
"learning_rate": 7.915789473684212e-06,
"loss": 0.4921,
"step": 1248
},
{
"epoch": 1.9304482225656878,
"grad_norm": 14.8125,
"learning_rate": 7.905263157894737e-06,
"loss": 0.5108,
"step": 1249
},
{
"epoch": 1.9319938176197837,
"grad_norm": 12.5625,
"learning_rate": 7.894736842105265e-06,
"loss": 0.4646,
"step": 1250
},
{
"epoch": 1.9335394126738794,
"grad_norm": 10.0,
"learning_rate": 7.88421052631579e-06,
"loss": 0.4252,
"step": 1251
},
{
"epoch": 1.9350850077279753,
"grad_norm": 9.6875,
"learning_rate": 7.873684210526317e-06,
"loss": 0.4493,
"step": 1252
},
{
"epoch": 1.936630602782071,
"grad_norm": 10.875,
"learning_rate": 7.863157894736842e-06,
"loss": 0.4008,
"step": 1253
},
{
"epoch": 1.938176197836167,
"grad_norm": 15.5,
"learning_rate": 7.85263157894737e-06,
"loss": 0.4658,
"step": 1254
},
{
"epoch": 1.9397217928902628,
"grad_norm": 10.375,
"learning_rate": 7.842105263157895e-06,
"loss": 0.4306,
"step": 1255
},
{
"epoch": 1.9412673879443587,
"grad_norm": 10.375,
"learning_rate": 7.831578947368421e-06,
"loss": 0.5087,
"step": 1256
},
{
"epoch": 1.9428129829984544,
"grad_norm": 9.6875,
"learning_rate": 7.821052631578949e-06,
"loss": 0.4678,
"step": 1257
},
{
"epoch": 1.94435857805255,
"grad_norm": 9.3125,
"learning_rate": 7.810526315789474e-06,
"loss": 0.4189,
"step": 1258
},
{
"epoch": 1.945904173106646,
"grad_norm": 10.6875,
"learning_rate": 7.800000000000002e-06,
"loss": 0.4527,
"step": 1259
},
{
"epoch": 1.947449768160742,
"grad_norm": 11.75,
"learning_rate": 7.789473684210526e-06,
"loss": 0.499,
"step": 1260
},
{
"epoch": 1.9489953632148378,
"grad_norm": 9.25,
"learning_rate": 7.778947368421054e-06,
"loss": 0.4036,
"step": 1261
},
{
"epoch": 1.9505409582689337,
"grad_norm": 10.25,
"learning_rate": 7.768421052631579e-06,
"loss": 0.4725,
"step": 1262
},
{
"epoch": 1.9520865533230294,
"grad_norm": 8.5,
"learning_rate": 7.757894736842105e-06,
"loss": 0.4144,
"step": 1263
},
{
"epoch": 1.953632148377125,
"grad_norm": 13.0,
"learning_rate": 7.747368421052631e-06,
"loss": 0.4833,
"step": 1264
},
{
"epoch": 1.955177743431221,
"grad_norm": 8.4375,
"learning_rate": 7.736842105263158e-06,
"loss": 0.4006,
"step": 1265
},
{
"epoch": 1.9567233384853169,
"grad_norm": 10.875,
"learning_rate": 7.726315789473686e-06,
"loss": 0.4265,
"step": 1266
},
{
"epoch": 1.9582689335394128,
"grad_norm": 10.9375,
"learning_rate": 7.71578947368421e-06,
"loss": 0.4581,
"step": 1267
},
{
"epoch": 1.9598145285935085,
"grad_norm": 13.0625,
"learning_rate": 7.705263157894738e-06,
"loss": 0.4954,
"step": 1268
},
{
"epoch": 1.9613601236476044,
"grad_norm": 9.875,
"learning_rate": 7.694736842105263e-06,
"loss": 0.4877,
"step": 1269
},
{
"epoch": 1.9629057187017,
"grad_norm": 11.8125,
"learning_rate": 7.68421052631579e-06,
"loss": 0.4149,
"step": 1270
},
{
"epoch": 1.964451313755796,
"grad_norm": 10.125,
"learning_rate": 7.673684210526316e-06,
"loss": 0.4674,
"step": 1271
},
{
"epoch": 1.9659969088098919,
"grad_norm": 11.0,
"learning_rate": 7.663157894736842e-06,
"loss": 0.447,
"step": 1272
},
{
"epoch": 1.9675425038639878,
"grad_norm": 11.5,
"learning_rate": 7.65263157894737e-06,
"loss": 0.4988,
"step": 1273
},
{
"epoch": 1.9690880989180835,
"grad_norm": 12.625,
"learning_rate": 7.642105263157895e-06,
"loss": 0.4138,
"step": 1274
},
{
"epoch": 1.9706336939721791,
"grad_norm": 9.6875,
"learning_rate": 7.631578947368423e-06,
"loss": 0.4018,
"step": 1275
},
{
"epoch": 1.972179289026275,
"grad_norm": 9.6875,
"learning_rate": 7.621052631578948e-06,
"loss": 0.4473,
"step": 1276
},
{
"epoch": 1.973724884080371,
"grad_norm": 10.5625,
"learning_rate": 7.610526315789474e-06,
"loss": 0.4526,
"step": 1277
},
{
"epoch": 1.9752704791344669,
"grad_norm": 11.8125,
"learning_rate": 7.600000000000001e-06,
"loss": 0.4457,
"step": 1278
},
{
"epoch": 1.9768160741885628,
"grad_norm": 8.8125,
"learning_rate": 7.589473684210526e-06,
"loss": 0.4082,
"step": 1279
},
{
"epoch": 1.9783616692426584,
"grad_norm": 11.5625,
"learning_rate": 7.578947368421054e-06,
"loss": 0.4891,
"step": 1280
},
{
"epoch": 1.9799072642967541,
"grad_norm": 11.0625,
"learning_rate": 7.568421052631579e-06,
"loss": 0.4421,
"step": 1281
},
{
"epoch": 1.98145285935085,
"grad_norm": 10.1875,
"learning_rate": 7.557894736842106e-06,
"loss": 0.4449,
"step": 1282
},
{
"epoch": 1.982998454404946,
"grad_norm": 11.875,
"learning_rate": 7.547368421052632e-06,
"loss": 0.5123,
"step": 1283
},
{
"epoch": 1.9845440494590418,
"grad_norm": 9.625,
"learning_rate": 7.536842105263158e-06,
"loss": 0.4362,
"step": 1284
},
{
"epoch": 1.9860896445131375,
"grad_norm": 12.375,
"learning_rate": 7.526315789473685e-06,
"loss": 0.446,
"step": 1285
},
{
"epoch": 1.9876352395672334,
"grad_norm": 10.5625,
"learning_rate": 7.515789473684211e-06,
"loss": 0.4251,
"step": 1286
},
{
"epoch": 1.989180834621329,
"grad_norm": 11.875,
"learning_rate": 7.505263157894738e-06,
"loss": 0.4227,
"step": 1287
},
{
"epoch": 1.990726429675425,
"grad_norm": 11.75,
"learning_rate": 7.494736842105263e-06,
"loss": 0.4794,
"step": 1288
},
{
"epoch": 1.992272024729521,
"grad_norm": 10.625,
"learning_rate": 7.4842105263157905e-06,
"loss": 0.4814,
"step": 1289
},
{
"epoch": 1.9938176197836168,
"grad_norm": 10.3125,
"learning_rate": 7.473684210526316e-06,
"loss": 0.421,
"step": 1290
},
{
"epoch": 1.9953632148377125,
"grad_norm": 10.125,
"learning_rate": 7.463157894736843e-06,
"loss": 0.4335,
"step": 1291
},
{
"epoch": 1.9969088098918082,
"grad_norm": 10.4375,
"learning_rate": 7.4526315789473695e-06,
"loss": 0.4998,
"step": 1292
},
{
"epoch": 1.998454404945904,
"grad_norm": 13.1875,
"learning_rate": 7.442105263157895e-06,
"loss": 0.3654,
"step": 1293
},
{
"epoch": 2.0,
"grad_norm": 9.5,
"learning_rate": 7.431578947368422e-06,
"loss": 0.421,
"step": 1294
},
{
"epoch": 2.001545595054096,
"grad_norm": 9.5625,
"learning_rate": 7.421052631578948e-06,
"loss": 0.3409,
"step": 1295
},
{
"epoch": 2.003091190108192,
"grad_norm": 8.4375,
"learning_rate": 7.410526315789475e-06,
"loss": 0.4153,
"step": 1296
},
{
"epoch": 2.0046367851622873,
"grad_norm": 8.0,
"learning_rate": 7.4e-06,
"loss": 0.4277,
"step": 1297
},
{
"epoch": 2.006182380216383,
"grad_norm": 11.1875,
"learning_rate": 7.3894736842105275e-06,
"loss": 0.4332,
"step": 1298
},
{
"epoch": 2.007727975270479,
"grad_norm": 8.3125,
"learning_rate": 7.378947368421053e-06,
"loss": 0.3774,
"step": 1299
},
{
"epoch": 2.009273570324575,
"grad_norm": 9.375,
"learning_rate": 7.368421052631579e-06,
"loss": 0.4583,
"step": 1300
},
{
"epoch": 2.010819165378671,
"grad_norm": 9.75,
"learning_rate": 7.3578947368421065e-06,
"loss": 0.4037,
"step": 1301
},
{
"epoch": 2.012364760432767,
"grad_norm": 9.4375,
"learning_rate": 7.347368421052632e-06,
"loss": 0.4613,
"step": 1302
},
{
"epoch": 2.0139103554868623,
"grad_norm": 10.6875,
"learning_rate": 7.336842105263159e-06,
"loss": 0.45,
"step": 1303
},
{
"epoch": 2.015455950540958,
"grad_norm": 9.6875,
"learning_rate": 7.326315789473685e-06,
"loss": 0.478,
"step": 1304
},
{
"epoch": 2.017001545595054,
"grad_norm": 7.5625,
"learning_rate": 7.315789473684212e-06,
"loss": 0.4099,
"step": 1305
},
{
"epoch": 2.01854714064915,
"grad_norm": 17.5,
"learning_rate": 7.305263157894737e-06,
"loss": 0.4187,
"step": 1306
},
{
"epoch": 2.020092735703246,
"grad_norm": 10.1875,
"learning_rate": 7.2947368421052636e-06,
"loss": 0.429,
"step": 1307
},
{
"epoch": 2.021638330757342,
"grad_norm": 12.375,
"learning_rate": 7.28421052631579e-06,
"loss": 0.4693,
"step": 1308
},
{
"epoch": 2.0231839258114372,
"grad_norm": 10.5625,
"learning_rate": 7.273684210526316e-06,
"loss": 0.4918,
"step": 1309
},
{
"epoch": 2.024729520865533,
"grad_norm": 10.9375,
"learning_rate": 7.263157894736843e-06,
"loss": 0.4465,
"step": 1310
},
{
"epoch": 2.026275115919629,
"grad_norm": 10.625,
"learning_rate": 7.252631578947369e-06,
"loss": 0.4254,
"step": 1311
},
{
"epoch": 2.027820710973725,
"grad_norm": 9.5625,
"learning_rate": 7.242105263157896e-06,
"loss": 0.4116,
"step": 1312
},
{
"epoch": 2.029366306027821,
"grad_norm": 37.75,
"learning_rate": 7.2315789473684215e-06,
"loss": 0.4356,
"step": 1313
},
{
"epoch": 2.0309119010819163,
"grad_norm": 9.5625,
"learning_rate": 7.221052631578948e-06,
"loss": 0.4526,
"step": 1314
},
{
"epoch": 2.0324574961360122,
"grad_norm": 14.5625,
"learning_rate": 7.210526315789474e-06,
"loss": 0.4862,
"step": 1315
},
{
"epoch": 2.034003091190108,
"grad_norm": 8.75,
"learning_rate": 7.2000000000000005e-06,
"loss": 0.3943,
"step": 1316
},
{
"epoch": 2.035548686244204,
"grad_norm": 8.625,
"learning_rate": 7.189473684210527e-06,
"loss": 0.3862,
"step": 1317
},
{
"epoch": 2.0370942812983,
"grad_norm": 10.0,
"learning_rate": 7.178947368421053e-06,
"loss": 0.3895,
"step": 1318
},
{
"epoch": 2.038639876352396,
"grad_norm": 9.0625,
"learning_rate": 7.16842105263158e-06,
"loss": 0.3919,
"step": 1319
},
{
"epoch": 2.0401854714064913,
"grad_norm": 8.3125,
"learning_rate": 7.157894736842106e-06,
"loss": 0.4206,
"step": 1320
},
{
"epoch": 2.041731066460587,
"grad_norm": 9.125,
"learning_rate": 7.147368421052631e-06,
"loss": 0.4216,
"step": 1321
},
{
"epoch": 2.043276661514683,
"grad_norm": 12.5625,
"learning_rate": 7.1368421052631585e-06,
"loss": 0.3875,
"step": 1322
},
{
"epoch": 2.044822256568779,
"grad_norm": 8.625,
"learning_rate": 7.126315789473685e-06,
"loss": 0.394,
"step": 1323
},
{
"epoch": 2.046367851622875,
"grad_norm": 10.0,
"learning_rate": 7.115789473684211e-06,
"loss": 0.398,
"step": 1324
},
{
"epoch": 2.047913446676971,
"grad_norm": 9.5,
"learning_rate": 7.1052631578947375e-06,
"loss": 0.3825,
"step": 1325
},
{
"epoch": 2.0494590417310663,
"grad_norm": 9.625,
"learning_rate": 7.094736842105265e-06,
"loss": 0.4619,
"step": 1326
},
{
"epoch": 2.051004636785162,
"grad_norm": 9.8125,
"learning_rate": 7.08421052631579e-06,
"loss": 0.3744,
"step": 1327
},
{
"epoch": 2.052550231839258,
"grad_norm": 8.625,
"learning_rate": 7.073684210526316e-06,
"loss": 0.3775,
"step": 1328
},
{
"epoch": 2.054095826893354,
"grad_norm": 14.3125,
"learning_rate": 7.063157894736843e-06,
"loss": 0.4167,
"step": 1329
},
{
"epoch": 2.05564142194745,
"grad_norm": 11.625,
"learning_rate": 7.052631578947369e-06,
"loss": 0.3926,
"step": 1330
},
{
"epoch": 2.0571870170015454,
"grad_norm": 14.25,
"learning_rate": 7.0421052631578954e-06,
"loss": 0.4458,
"step": 1331
},
{
"epoch": 2.0587326120556413,
"grad_norm": 9.5625,
"learning_rate": 7.031578947368422e-06,
"loss": 0.3743,
"step": 1332
},
{
"epoch": 2.060278207109737,
"grad_norm": 9.4375,
"learning_rate": 7.021052631578948e-06,
"loss": 0.4417,
"step": 1333
},
{
"epoch": 2.061823802163833,
"grad_norm": 8.9375,
"learning_rate": 7.010526315789474e-06,
"loss": 0.4122,
"step": 1334
},
{
"epoch": 2.063369397217929,
"grad_norm": 9.625,
"learning_rate": 7e-06,
"loss": 0.4043,
"step": 1335
},
{
"epoch": 2.064914992272025,
"grad_norm": 9.625,
"learning_rate": 6.989473684210527e-06,
"loss": 0.3708,
"step": 1336
},
{
"epoch": 2.0664605873261204,
"grad_norm": 10.9375,
"learning_rate": 6.9789473684210525e-06,
"loss": 0.4429,
"step": 1337
},
{
"epoch": 2.0680061823802163,
"grad_norm": 10.25,
"learning_rate": 6.96842105263158e-06,
"loss": 0.392,
"step": 1338
},
{
"epoch": 2.069551777434312,
"grad_norm": 11.375,
"learning_rate": 6.957894736842106e-06,
"loss": 0.3632,
"step": 1339
},
{
"epoch": 2.071097372488408,
"grad_norm": 10.6875,
"learning_rate": 6.947368421052632e-06,
"loss": 0.4258,
"step": 1340
},
{
"epoch": 2.072642967542504,
"grad_norm": 11.8125,
"learning_rate": 6.936842105263159e-06,
"loss": 0.4532,
"step": 1341
},
{
"epoch": 2.0741885625966,
"grad_norm": 10.75,
"learning_rate": 6.926315789473684e-06,
"loss": 0.3734,
"step": 1342
},
{
"epoch": 2.0757341576506954,
"grad_norm": 8.6875,
"learning_rate": 6.915789473684211e-06,
"loss": 0.4601,
"step": 1343
},
{
"epoch": 2.0772797527047913,
"grad_norm": 10.125,
"learning_rate": 6.905263157894737e-06,
"loss": 0.3973,
"step": 1344
},
{
"epoch": 2.078825347758887,
"grad_norm": 10.0,
"learning_rate": 6.894736842105264e-06,
"loss": 0.3799,
"step": 1345
},
{
"epoch": 2.080370942812983,
"grad_norm": 9.4375,
"learning_rate": 6.8842105263157895e-06,
"loss": 0.3657,
"step": 1346
},
{
"epoch": 2.081916537867079,
"grad_norm": 9.1875,
"learning_rate": 6.873684210526317e-06,
"loss": 0.3858,
"step": 1347
},
{
"epoch": 2.0834621329211744,
"grad_norm": 11.6875,
"learning_rate": 6.863157894736843e-06,
"loss": 0.4277,
"step": 1348
},
{
"epoch": 2.0850077279752703,
"grad_norm": 14.125,
"learning_rate": 6.8526315789473685e-06,
"loss": 0.3472,
"step": 1349
},
{
"epoch": 2.0865533230293662,
"grad_norm": 12.8125,
"learning_rate": 6.842105263157896e-06,
"loss": 0.3654,
"step": 1350
},
{
"epoch": 2.088098918083462,
"grad_norm": 8.75,
"learning_rate": 6.831578947368421e-06,
"loss": 0.3987,
"step": 1351
},
{
"epoch": 2.089644513137558,
"grad_norm": 10.8125,
"learning_rate": 6.821052631578948e-06,
"loss": 0.3386,
"step": 1352
},
{
"epoch": 2.091190108191654,
"grad_norm": 10.9375,
"learning_rate": 6.810526315789474e-06,
"loss": 0.466,
"step": 1353
},
{
"epoch": 2.0927357032457494,
"grad_norm": 10.0,
"learning_rate": 6.800000000000001e-06,
"loss": 0.4296,
"step": 1354
},
{
"epoch": 2.0942812982998453,
"grad_norm": 11.1875,
"learning_rate": 6.789473684210527e-06,
"loss": 0.3884,
"step": 1355
},
{
"epoch": 2.0958268933539412,
"grad_norm": 10.75,
"learning_rate": 6.778947368421053e-06,
"loss": 0.3552,
"step": 1356
},
{
"epoch": 2.097372488408037,
"grad_norm": 10.375,
"learning_rate": 6.76842105263158e-06,
"loss": 0.4134,
"step": 1357
},
{
"epoch": 2.098918083462133,
"grad_norm": 10.0625,
"learning_rate": 6.7578947368421054e-06,
"loss": 0.429,
"step": 1358
},
{
"epoch": 2.100463678516229,
"grad_norm": 9.6875,
"learning_rate": 6.747368421052633e-06,
"loss": 0.3613,
"step": 1359
},
{
"epoch": 2.1020092735703244,
"grad_norm": 9.9375,
"learning_rate": 6.736842105263158e-06,
"loss": 0.3704,
"step": 1360
},
{
"epoch": 2.1035548686244203,
"grad_norm": 11.625,
"learning_rate": 6.726315789473685e-06,
"loss": 0.4295,
"step": 1361
},
{
"epoch": 2.105100463678516,
"grad_norm": 9.75,
"learning_rate": 6.715789473684211e-06,
"loss": 0.4071,
"step": 1362
},
{
"epoch": 2.106646058732612,
"grad_norm": 10.0,
"learning_rate": 6.705263157894737e-06,
"loss": 0.447,
"step": 1363
},
{
"epoch": 2.108191653786708,
"grad_norm": 10.5625,
"learning_rate": 6.694736842105264e-06,
"loss": 0.4299,
"step": 1364
},
{
"epoch": 2.109737248840804,
"grad_norm": 8.6875,
"learning_rate": 6.68421052631579e-06,
"loss": 0.3642,
"step": 1365
},
{
"epoch": 2.1112828438948994,
"grad_norm": 11.1875,
"learning_rate": 6.673684210526317e-06,
"loss": 0.4116,
"step": 1366
},
{
"epoch": 2.1128284389489953,
"grad_norm": 10.6875,
"learning_rate": 6.663157894736842e-06,
"loss": 0.4034,
"step": 1367
},
{
"epoch": 2.114374034003091,
"grad_norm": 10.0625,
"learning_rate": 6.6526315789473695e-06,
"loss": 0.3993,
"step": 1368
},
{
"epoch": 2.115919629057187,
"grad_norm": 11.4375,
"learning_rate": 6.642105263157895e-06,
"loss": 0.4173,
"step": 1369
},
{
"epoch": 2.117465224111283,
"grad_norm": 12.6875,
"learning_rate": 6.631578947368421e-06,
"loss": 0.4035,
"step": 1370
},
{
"epoch": 2.1190108191653785,
"grad_norm": 10.4375,
"learning_rate": 6.621052631578948e-06,
"loss": 0.383,
"step": 1371
},
{
"epoch": 2.1205564142194744,
"grad_norm": 9.125,
"learning_rate": 6.610526315789474e-06,
"loss": 0.3614,
"step": 1372
},
{
"epoch": 2.1221020092735703,
"grad_norm": 10.3125,
"learning_rate": 6.600000000000001e-06,
"loss": 0.4334,
"step": 1373
},
{
"epoch": 2.123647604327666,
"grad_norm": 9.5,
"learning_rate": 6.589473684210527e-06,
"loss": 0.4083,
"step": 1374
},
{
"epoch": 2.125193199381762,
"grad_norm": 9.0,
"learning_rate": 6.578947368421054e-06,
"loss": 0.4057,
"step": 1375
},
{
"epoch": 2.126738794435858,
"grad_norm": 9.75,
"learning_rate": 6.568421052631579e-06,
"loss": 0.3865,
"step": 1376
},
{
"epoch": 2.1282843894899535,
"grad_norm": 9.1875,
"learning_rate": 6.557894736842106e-06,
"loss": 0.3658,
"step": 1377
},
{
"epoch": 2.1298299845440494,
"grad_norm": 9.25,
"learning_rate": 6.547368421052632e-06,
"loss": 0.379,
"step": 1378
},
{
"epoch": 2.1313755795981453,
"grad_norm": 9.0,
"learning_rate": 6.536842105263158e-06,
"loss": 0.3727,
"step": 1379
},
{
"epoch": 2.132921174652241,
"grad_norm": 10.3125,
"learning_rate": 6.526315789473685e-06,
"loss": 0.4024,
"step": 1380
},
{
"epoch": 2.134466769706337,
"grad_norm": 9.375,
"learning_rate": 6.515789473684211e-06,
"loss": 0.372,
"step": 1381
},
{
"epoch": 2.1360123647604325,
"grad_norm": 9.9375,
"learning_rate": 6.505263157894738e-06,
"loss": 0.3966,
"step": 1382
},
{
"epoch": 2.1375579598145285,
"grad_norm": 18.5,
"learning_rate": 6.494736842105264e-06,
"loss": 0.396,
"step": 1383
},
{
"epoch": 2.1391035548686244,
"grad_norm": 10.1875,
"learning_rate": 6.484210526315789e-06,
"loss": 0.4434,
"step": 1384
},
{
"epoch": 2.1406491499227203,
"grad_norm": 9.875,
"learning_rate": 6.473684210526316e-06,
"loss": 0.4041,
"step": 1385
},
{
"epoch": 2.142194744976816,
"grad_norm": 9.4375,
"learning_rate": 6.463157894736843e-06,
"loss": 0.3848,
"step": 1386
},
{
"epoch": 2.143740340030912,
"grad_norm": 9.8125,
"learning_rate": 6.452631578947369e-06,
"loss": 0.4139,
"step": 1387
},
{
"epoch": 2.1452859350850075,
"grad_norm": 13.375,
"learning_rate": 6.442105263157895e-06,
"loss": 0.42,
"step": 1388
},
{
"epoch": 2.1468315301391034,
"grad_norm": 10.75,
"learning_rate": 6.431578947368422e-06,
"loss": 0.4103,
"step": 1389
},
{
"epoch": 2.1483771251931993,
"grad_norm": 20.125,
"learning_rate": 6.421052631578948e-06,
"loss": 0.4167,
"step": 1390
},
{
"epoch": 2.1499227202472952,
"grad_norm": 12.6875,
"learning_rate": 6.410526315789473e-06,
"loss": 0.3943,
"step": 1391
},
{
"epoch": 2.151468315301391,
"grad_norm": 10.0,
"learning_rate": 6.4000000000000006e-06,
"loss": 0.4198,
"step": 1392
},
{
"epoch": 2.153013910355487,
"grad_norm": 11.3125,
"learning_rate": 6.389473684210527e-06,
"loss": 0.3931,
"step": 1393
},
{
"epoch": 2.1545595054095825,
"grad_norm": 11.0625,
"learning_rate": 6.378947368421053e-06,
"loss": 0.4473,
"step": 1394
},
{
"epoch": 2.1561051004636784,
"grad_norm": 10.8125,
"learning_rate": 6.3684210526315795e-06,
"loss": 0.4545,
"step": 1395
},
{
"epoch": 2.1576506955177743,
"grad_norm": 10.625,
"learning_rate": 6.357894736842106e-06,
"loss": 0.4511,
"step": 1396
},
{
"epoch": 2.1591962905718702,
"grad_norm": 9.5,
"learning_rate": 6.347368421052632e-06,
"loss": 0.3953,
"step": 1397
},
{
"epoch": 2.160741885625966,
"grad_norm": 9.5,
"learning_rate": 6.336842105263158e-06,
"loss": 0.4376,
"step": 1398
},
{
"epoch": 2.162287480680062,
"grad_norm": 10.125,
"learning_rate": 6.326315789473685e-06,
"loss": 0.3919,
"step": 1399
},
{
"epoch": 2.1638330757341575,
"grad_norm": 11.625,
"learning_rate": 6.31578947368421e-06,
"loss": 0.4433,
"step": 1400
},
{
"epoch": 2.1653786707882534,
"grad_norm": 10.75,
"learning_rate": 6.3052631578947375e-06,
"loss": 0.4774,
"step": 1401
},
{
"epoch": 2.1669242658423493,
"grad_norm": 11.5625,
"learning_rate": 6.294736842105264e-06,
"loss": 0.3942,
"step": 1402
},
{
"epoch": 2.1684698608964452,
"grad_norm": 11.3125,
"learning_rate": 6.28421052631579e-06,
"loss": 0.4453,
"step": 1403
},
{
"epoch": 2.170015455950541,
"grad_norm": 10.0625,
"learning_rate": 6.2736842105263165e-06,
"loss": 0.3921,
"step": 1404
},
{
"epoch": 2.1715610510046366,
"grad_norm": 11.625,
"learning_rate": 6.263157894736842e-06,
"loss": 0.405,
"step": 1405
},
{
"epoch": 2.1731066460587325,
"grad_norm": 13.3125,
"learning_rate": 6.252631578947369e-06,
"loss": 0.3684,
"step": 1406
},
{
"epoch": 2.1746522411128284,
"grad_norm": 11.8125,
"learning_rate": 6.242105263157895e-06,
"loss": 0.4626,
"step": 1407
},
{
"epoch": 2.1761978361669243,
"grad_norm": 10.25,
"learning_rate": 6.231578947368422e-06,
"loss": 0.422,
"step": 1408
},
{
"epoch": 2.17774343122102,
"grad_norm": 11.6875,
"learning_rate": 6.221052631578947e-06,
"loss": 0.4048,
"step": 1409
},
{
"epoch": 2.179289026275116,
"grad_norm": 10.1875,
"learning_rate": 6.2105263157894745e-06,
"loss": 0.4063,
"step": 1410
},
{
"epoch": 2.1808346213292116,
"grad_norm": 11.1875,
"learning_rate": 6.200000000000001e-06,
"loss": 0.4196,
"step": 1411
},
{
"epoch": 2.1823802163833075,
"grad_norm": 9.0,
"learning_rate": 6.189473684210526e-06,
"loss": 0.3823,
"step": 1412
},
{
"epoch": 2.1839258114374034,
"grad_norm": 9.75,
"learning_rate": 6.1789473684210534e-06,
"loss": 0.4392,
"step": 1413
},
{
"epoch": 2.1854714064914993,
"grad_norm": 13.6875,
"learning_rate": 6.168421052631579e-06,
"loss": 0.4116,
"step": 1414
},
{
"epoch": 2.187017001545595,
"grad_norm": 9.0,
"learning_rate": 6.157894736842106e-06,
"loss": 0.4159,
"step": 1415
},
{
"epoch": 2.1885625965996907,
"grad_norm": 11.125,
"learning_rate": 6.1473684210526316e-06,
"loss": 0.3939,
"step": 1416
},
{
"epoch": 2.1901081916537866,
"grad_norm": 10.5,
"learning_rate": 6.136842105263159e-06,
"loss": 0.3893,
"step": 1417
},
{
"epoch": 2.1916537867078825,
"grad_norm": 9.25,
"learning_rate": 6.126315789473685e-06,
"loss": 0.3817,
"step": 1418
},
{
"epoch": 2.1931993817619784,
"grad_norm": 10.3125,
"learning_rate": 6.1157894736842106e-06,
"loss": 0.4388,
"step": 1419
},
{
"epoch": 2.1947449768160743,
"grad_norm": 12.125,
"learning_rate": 6.105263157894738e-06,
"loss": 0.5682,
"step": 1420
},
{
"epoch": 2.19629057187017,
"grad_norm": 10.8125,
"learning_rate": 6.094736842105263e-06,
"loss": 0.3891,
"step": 1421
},
{
"epoch": 2.1978361669242656,
"grad_norm": 9.9375,
"learning_rate": 6.08421052631579e-06,
"loss": 0.3766,
"step": 1422
},
{
"epoch": 2.1993817619783615,
"grad_norm": 13.0625,
"learning_rate": 6.073684210526316e-06,
"loss": 0.4692,
"step": 1423
},
{
"epoch": 2.2009273570324575,
"grad_norm": 9.6875,
"learning_rate": 6.063157894736843e-06,
"loss": 0.3734,
"step": 1424
},
{
"epoch": 2.2024729520865534,
"grad_norm": 9.3125,
"learning_rate": 6.0526315789473685e-06,
"loss": 0.3428,
"step": 1425
},
{
"epoch": 2.2040185471406493,
"grad_norm": 10.3125,
"learning_rate": 6.042105263157895e-06,
"loss": 0.4315,
"step": 1426
},
{
"epoch": 2.205564142194745,
"grad_norm": 15.125,
"learning_rate": 6.031578947368422e-06,
"loss": 0.4135,
"step": 1427
},
{
"epoch": 2.2071097372488406,
"grad_norm": 10.25,
"learning_rate": 6.0210526315789475e-06,
"loss": 0.4742,
"step": 1428
},
{
"epoch": 2.2086553323029365,
"grad_norm": 8.9375,
"learning_rate": 6.010526315789475e-06,
"loss": 0.3849,
"step": 1429
},
{
"epoch": 2.2102009273570324,
"grad_norm": 10.0,
"learning_rate": 6e-06,
"loss": 0.4307,
"step": 1430
},
{
"epoch": 2.2117465224111283,
"grad_norm": 9.25,
"learning_rate": 5.989473684210527e-06,
"loss": 0.3806,
"step": 1431
},
{
"epoch": 2.2132921174652243,
"grad_norm": 9.875,
"learning_rate": 5.978947368421053e-06,
"loss": 0.3976,
"step": 1432
},
{
"epoch": 2.21483771251932,
"grad_norm": 10.125,
"learning_rate": 5.968421052631579e-06,
"loss": 0.4277,
"step": 1433
},
{
"epoch": 2.2163833075734156,
"grad_norm": 10.1875,
"learning_rate": 5.9578947368421055e-06,
"loss": 0.3941,
"step": 1434
},
{
"epoch": 2.2179289026275115,
"grad_norm": 11.75,
"learning_rate": 5.947368421052632e-06,
"loss": 0.4576,
"step": 1435
},
{
"epoch": 2.2194744976816074,
"grad_norm": 9.5,
"learning_rate": 5.936842105263159e-06,
"loss": 0.4679,
"step": 1436
},
{
"epoch": 2.2210200927357033,
"grad_norm": 9.4375,
"learning_rate": 5.9263157894736844e-06,
"loss": 0.4179,
"step": 1437
},
{
"epoch": 2.2225656877897992,
"grad_norm": 10.75,
"learning_rate": 5.915789473684212e-06,
"loss": 0.4054,
"step": 1438
},
{
"epoch": 2.2241112828438947,
"grad_norm": 9.375,
"learning_rate": 5.905263157894737e-06,
"loss": 0.4408,
"step": 1439
},
{
"epoch": 2.2256568778979906,
"grad_norm": 8.75,
"learning_rate": 5.8947368421052634e-06,
"loss": 0.3918,
"step": 1440
},
{
"epoch": 2.2272024729520865,
"grad_norm": 10.6875,
"learning_rate": 5.88421052631579e-06,
"loss": 0.3736,
"step": 1441
},
{
"epoch": 2.2287480680061824,
"grad_norm": 12.625,
"learning_rate": 5.873684210526316e-06,
"loss": 0.445,
"step": 1442
},
{
"epoch": 2.2302936630602783,
"grad_norm": 9.5625,
"learning_rate": 5.863157894736842e-06,
"loss": 0.4658,
"step": 1443
},
{
"epoch": 2.2318392581143742,
"grad_norm": 9.375,
"learning_rate": 5.852631578947369e-06,
"loss": 0.3555,
"step": 1444
},
{
"epoch": 2.2333848531684697,
"grad_norm": 9.8125,
"learning_rate": 5.842105263157896e-06,
"loss": 0.4339,
"step": 1445
},
{
"epoch": 2.2349304482225656,
"grad_norm": 10.4375,
"learning_rate": 5.831578947368421e-06,
"loss": 0.4169,
"step": 1446
},
{
"epoch": 2.2364760432766615,
"grad_norm": 9.5625,
"learning_rate": 5.8210526315789486e-06,
"loss": 0.4089,
"step": 1447
},
{
"epoch": 2.2380216383307574,
"grad_norm": 10.5625,
"learning_rate": 5.810526315789474e-06,
"loss": 0.451,
"step": 1448
},
{
"epoch": 2.2395672333848533,
"grad_norm": 11.4375,
"learning_rate": 5.8e-06,
"loss": 0.4551,
"step": 1449
},
{
"epoch": 2.2411128284389488,
"grad_norm": 10.3125,
"learning_rate": 5.789473684210527e-06,
"loss": 0.405,
"step": 1450
},
{
"epoch": 2.2426584234930447,
"grad_norm": 9.875,
"learning_rate": 5.778947368421053e-06,
"loss": 0.4629,
"step": 1451
},
{
"epoch": 2.2442040185471406,
"grad_norm": 8.9375,
"learning_rate": 5.76842105263158e-06,
"loss": 0.3815,
"step": 1452
},
{
"epoch": 2.2457496136012365,
"grad_norm": 10.375,
"learning_rate": 5.757894736842106e-06,
"loss": 0.4429,
"step": 1453
},
{
"epoch": 2.2472952086553324,
"grad_norm": 10.125,
"learning_rate": 5.747368421052633e-06,
"loss": 0.4238,
"step": 1454
},
{
"epoch": 2.2488408037094283,
"grad_norm": 10.125,
"learning_rate": 5.736842105263158e-06,
"loss": 0.3564,
"step": 1455
},
{
"epoch": 2.250386398763524,
"grad_norm": 12.1875,
"learning_rate": 5.726315789473685e-06,
"loss": 0.4415,
"step": 1456
},
{
"epoch": 2.2519319938176197,
"grad_norm": 9.125,
"learning_rate": 5.715789473684211e-06,
"loss": 0.3871,
"step": 1457
},
{
"epoch": 2.2534775888717156,
"grad_norm": 10.125,
"learning_rate": 5.705263157894737e-06,
"loss": 0.4082,
"step": 1458
},
{
"epoch": 2.2550231839258115,
"grad_norm": 11.375,
"learning_rate": 5.694736842105264e-06,
"loss": 0.5043,
"step": 1459
},
{
"epoch": 2.2565687789799074,
"grad_norm": 9.25,
"learning_rate": 5.68421052631579e-06,
"loss": 0.414,
"step": 1460
},
{
"epoch": 2.2581143740340033,
"grad_norm": 11.25,
"learning_rate": 5.673684210526317e-06,
"loss": 0.4273,
"step": 1461
},
{
"epoch": 2.2596599690880987,
"grad_norm": 8.4375,
"learning_rate": 5.663157894736843e-06,
"loss": 0.366,
"step": 1462
},
{
"epoch": 2.2612055641421946,
"grad_norm": 28.375,
"learning_rate": 5.652631578947368e-06,
"loss": 0.4265,
"step": 1463
},
{
"epoch": 2.2627511591962906,
"grad_norm": 12.0625,
"learning_rate": 5.642105263157895e-06,
"loss": 0.417,
"step": 1464
},
{
"epoch": 2.2642967542503865,
"grad_norm": 8.875,
"learning_rate": 5.631578947368422e-06,
"loss": 0.4107,
"step": 1465
},
{
"epoch": 2.2658423493044824,
"grad_norm": 9.4375,
"learning_rate": 5.621052631578948e-06,
"loss": 0.4519,
"step": 1466
},
{
"epoch": 2.2673879443585783,
"grad_norm": 9.8125,
"learning_rate": 5.610526315789474e-06,
"loss": 0.4128,
"step": 1467
},
{
"epoch": 2.2689335394126737,
"grad_norm": 10.75,
"learning_rate": 5.600000000000001e-06,
"loss": 0.4795,
"step": 1468
},
{
"epoch": 2.2704791344667696,
"grad_norm": 11.0625,
"learning_rate": 5.589473684210527e-06,
"loss": 0.3894,
"step": 1469
},
{
"epoch": 2.2720247295208655,
"grad_norm": 10.0625,
"learning_rate": 5.578947368421052e-06,
"loss": 0.4256,
"step": 1470
},
{
"epoch": 2.2735703245749614,
"grad_norm": 10.75,
"learning_rate": 5.5684210526315796e-06,
"loss": 0.397,
"step": 1471
},
{
"epoch": 2.2751159196290573,
"grad_norm": 10.1875,
"learning_rate": 5.557894736842105e-06,
"loss": 0.4466,
"step": 1472
},
{
"epoch": 2.276661514683153,
"grad_norm": 12.125,
"learning_rate": 5.547368421052632e-06,
"loss": 0.4147,
"step": 1473
},
{
"epoch": 2.2782071097372487,
"grad_norm": 9.6875,
"learning_rate": 5.5368421052631586e-06,
"loss": 0.3968,
"step": 1474
},
{
"epoch": 2.2797527047913446,
"grad_norm": 21.375,
"learning_rate": 5.526315789473685e-06,
"loss": 0.4401,
"step": 1475
},
{
"epoch": 2.2812982998454405,
"grad_norm": 11.75,
"learning_rate": 5.515789473684211e-06,
"loss": 0.3951,
"step": 1476
},
{
"epoch": 2.2828438948995364,
"grad_norm": 8.3125,
"learning_rate": 5.505263157894737e-06,
"loss": 0.3654,
"step": 1477
},
{
"epoch": 2.2843894899536323,
"grad_norm": 9.9375,
"learning_rate": 5.494736842105264e-06,
"loss": 0.4121,
"step": 1478
},
{
"epoch": 2.285935085007728,
"grad_norm": 10.0625,
"learning_rate": 5.484210526315789e-06,
"loss": 0.4137,
"step": 1479
},
{
"epoch": 2.2874806800618237,
"grad_norm": 10.0,
"learning_rate": 5.4736842105263165e-06,
"loss": 0.508,
"step": 1480
},
{
"epoch": 2.2890262751159196,
"grad_norm": 14.5625,
"learning_rate": 5.463157894736843e-06,
"loss": 0.436,
"step": 1481
},
{
"epoch": 2.2905718701700155,
"grad_norm": 9.9375,
"learning_rate": 5.452631578947369e-06,
"loss": 0.4118,
"step": 1482
},
{
"epoch": 2.2921174652241114,
"grad_norm": 11.375,
"learning_rate": 5.4421052631578955e-06,
"loss": 0.4587,
"step": 1483
},
{
"epoch": 2.293663060278207,
"grad_norm": 10.4375,
"learning_rate": 5.431578947368421e-06,
"loss": 0.3737,
"step": 1484
},
{
"epoch": 2.295208655332303,
"grad_norm": 10.25,
"learning_rate": 5.421052631578948e-06,
"loss": 0.4325,
"step": 1485
},
{
"epoch": 2.2967542503863987,
"grad_norm": 10.25,
"learning_rate": 5.410526315789474e-06,
"loss": 0.4284,
"step": 1486
},
{
"epoch": 2.2982998454404946,
"grad_norm": 9.3125,
"learning_rate": 5.400000000000001e-06,
"loss": 0.4085,
"step": 1487
},
{
"epoch": 2.2998454404945905,
"grad_norm": 10.8125,
"learning_rate": 5.389473684210526e-06,
"loss": 0.374,
"step": 1488
},
{
"epoch": 2.3013910355486864,
"grad_norm": 9.4375,
"learning_rate": 5.3789473684210535e-06,
"loss": 0.3844,
"step": 1489
},
{
"epoch": 2.3029366306027823,
"grad_norm": 10.4375,
"learning_rate": 5.36842105263158e-06,
"loss": 0.4483,
"step": 1490
},
{
"epoch": 2.3044822256568778,
"grad_norm": 9.375,
"learning_rate": 5.357894736842105e-06,
"loss": 0.3997,
"step": 1491
},
{
"epoch": 2.3060278207109737,
"grad_norm": 9.5,
"learning_rate": 5.3473684210526325e-06,
"loss": 0.3746,
"step": 1492
},
{
"epoch": 2.3075734157650696,
"grad_norm": 11.375,
"learning_rate": 5.336842105263158e-06,
"loss": 0.4513,
"step": 1493
},
{
"epoch": 2.3091190108191655,
"grad_norm": 12.125,
"learning_rate": 5.326315789473685e-06,
"loss": 0.4056,
"step": 1494
},
{
"epoch": 2.3106646058732614,
"grad_norm": 8.6875,
"learning_rate": 5.315789473684211e-06,
"loss": 0.3634,
"step": 1495
},
{
"epoch": 2.312210200927357,
"grad_norm": 9.5625,
"learning_rate": 5.305263157894738e-06,
"loss": 0.4042,
"step": 1496
},
{
"epoch": 2.3137557959814528,
"grad_norm": 9.3125,
"learning_rate": 5.294736842105263e-06,
"loss": 0.3689,
"step": 1497
},
{
"epoch": 2.3153013910355487,
"grad_norm": 13.1875,
"learning_rate": 5.2842105263157896e-06,
"loss": 0.3948,
"step": 1498
},
{
"epoch": 2.3168469860896446,
"grad_norm": 9.8125,
"learning_rate": 5.273684210526317e-06,
"loss": 0.4147,
"step": 1499
},
{
"epoch": 2.3183925811437405,
"grad_norm": 33.0,
"learning_rate": 5.263157894736842e-06,
"loss": 0.4441,
"step": 1500
},
{
"epoch": 2.3199381761978364,
"grad_norm": 10.5,
"learning_rate": 5.252631578947369e-06,
"loss": 0.4294,
"step": 1501
},
{
"epoch": 2.321483771251932,
"grad_norm": 9.0625,
"learning_rate": 5.242105263157895e-06,
"loss": 0.364,
"step": 1502
},
{
"epoch": 2.3230293663060277,
"grad_norm": 11.0625,
"learning_rate": 5.231578947368422e-06,
"loss": 0.4008,
"step": 1503
},
{
"epoch": 2.3245749613601236,
"grad_norm": 12.0625,
"learning_rate": 5.2210526315789475e-06,
"loss": 0.4188,
"step": 1504
},
{
"epoch": 2.3261205564142196,
"grad_norm": 22.375,
"learning_rate": 5.210526315789474e-06,
"loss": 0.4373,
"step": 1505
},
{
"epoch": 2.3276661514683155,
"grad_norm": 12.0,
"learning_rate": 5.2e-06,
"loss": 0.4485,
"step": 1506
},
{
"epoch": 2.329211746522411,
"grad_norm": 11.125,
"learning_rate": 5.1894736842105265e-06,
"loss": 0.3989,
"step": 1507
},
{
"epoch": 2.330757341576507,
"grad_norm": 11.0,
"learning_rate": 5.178947368421054e-06,
"loss": 0.4409,
"step": 1508
},
{
"epoch": 2.3323029366306027,
"grad_norm": 10.625,
"learning_rate": 5.168421052631579e-06,
"loss": 0.4522,
"step": 1509
},
{
"epoch": 2.3338485316846986,
"grad_norm": 9.875,
"learning_rate": 5.157894736842106e-06,
"loss": 0.4109,
"step": 1510
},
{
"epoch": 2.3353941267387945,
"grad_norm": 9.25,
"learning_rate": 5.147368421052632e-06,
"loss": 0.3738,
"step": 1511
},
{
"epoch": 2.3369397217928904,
"grad_norm": 10.0625,
"learning_rate": 5.136842105263158e-06,
"loss": 0.4172,
"step": 1512
},
{
"epoch": 2.338485316846986,
"grad_norm": 9.625,
"learning_rate": 5.1263157894736845e-06,
"loss": 0.4146,
"step": 1513
},
{
"epoch": 2.340030911901082,
"grad_norm": 9.4375,
"learning_rate": 5.115789473684211e-06,
"loss": 0.3463,
"step": 1514
},
{
"epoch": 2.3415765069551777,
"grad_norm": 10.375,
"learning_rate": 5.105263157894738e-06,
"loss": 0.4149,
"step": 1515
},
{
"epoch": 2.3431221020092736,
"grad_norm": 14.375,
"learning_rate": 5.0947368421052635e-06,
"loss": 0.4231,
"step": 1516
},
{
"epoch": 2.3446676970633695,
"grad_norm": 10.125,
"learning_rate": 5.084210526315791e-06,
"loss": 0.4468,
"step": 1517
},
{
"epoch": 2.346213292117465,
"grad_norm": 12.5625,
"learning_rate": 5.073684210526316e-06,
"loss": 0.3793,
"step": 1518
},
{
"epoch": 2.347758887171561,
"grad_norm": 11.5625,
"learning_rate": 5.0631578947368424e-06,
"loss": 0.4346,
"step": 1519
},
{
"epoch": 2.349304482225657,
"grad_norm": 10.0,
"learning_rate": 5.052631578947369e-06,
"loss": 0.3638,
"step": 1520
},
{
"epoch": 2.3508500772797527,
"grad_norm": 11.6875,
"learning_rate": 5.042105263157895e-06,
"loss": 0.462,
"step": 1521
},
{
"epoch": 2.3523956723338486,
"grad_norm": 10.0,
"learning_rate": 5.0315789473684214e-06,
"loss": 0.3755,
"step": 1522
},
{
"epoch": 2.3539412673879445,
"grad_norm": 9.5,
"learning_rate": 5.021052631578948e-06,
"loss": 0.4176,
"step": 1523
},
{
"epoch": 2.3554868624420404,
"grad_norm": 16.625,
"learning_rate": 5.010526315789475e-06,
"loss": 0.427,
"step": 1524
},
{
"epoch": 2.357032457496136,
"grad_norm": 12.25,
"learning_rate": 5e-06,
"loss": 0.4444,
"step": 1525
},
{
"epoch": 2.358578052550232,
"grad_norm": 10.625,
"learning_rate": 4.989473684210527e-06,
"loss": 0.4319,
"step": 1526
},
{
"epoch": 2.3601236476043277,
"grad_norm": 10.6875,
"learning_rate": 4.978947368421053e-06,
"loss": 0.406,
"step": 1527
},
{
"epoch": 2.3616692426584236,
"grad_norm": 12.125,
"learning_rate": 4.968421052631579e-06,
"loss": 0.4427,
"step": 1528
},
{
"epoch": 2.3632148377125195,
"grad_norm": 18.25,
"learning_rate": 4.957894736842106e-06,
"loss": 0.4267,
"step": 1529
},
{
"epoch": 2.364760432766615,
"grad_norm": 11.9375,
"learning_rate": 4.947368421052632e-06,
"loss": 0.4236,
"step": 1530
},
{
"epoch": 2.366306027820711,
"grad_norm": 13.75,
"learning_rate": 4.936842105263158e-06,
"loss": 0.3631,
"step": 1531
},
{
"epoch": 2.3678516228748068,
"grad_norm": 9.3125,
"learning_rate": 4.926315789473685e-06,
"loss": 0.4081,
"step": 1532
},
{
"epoch": 2.3693972179289027,
"grad_norm": 10.3125,
"learning_rate": 4.915789473684211e-06,
"loss": 0.431,
"step": 1533
},
{
"epoch": 2.3709428129829986,
"grad_norm": 11.0,
"learning_rate": 4.905263157894737e-06,
"loss": 0.3904,
"step": 1534
},
{
"epoch": 2.3724884080370945,
"grad_norm": 10.0,
"learning_rate": 4.894736842105264e-06,
"loss": 0.3249,
"step": 1535
},
{
"epoch": 2.37403400309119,
"grad_norm": 10.25,
"learning_rate": 4.88421052631579e-06,
"loss": 0.4481,
"step": 1536
},
{
"epoch": 2.375579598145286,
"grad_norm": 10.8125,
"learning_rate": 4.873684210526316e-06,
"loss": 0.4045,
"step": 1537
},
{
"epoch": 2.3771251931993818,
"grad_norm": 11.125,
"learning_rate": 4.863157894736843e-06,
"loss": 0.4266,
"step": 1538
},
{
"epoch": 2.3786707882534777,
"grad_norm": 11.25,
"learning_rate": 4.852631578947369e-06,
"loss": 0.4105,
"step": 1539
},
{
"epoch": 2.3802163833075736,
"grad_norm": 11.0,
"learning_rate": 4.842105263157895e-06,
"loss": 0.3739,
"step": 1540
},
{
"epoch": 2.381761978361669,
"grad_norm": 9.6875,
"learning_rate": 4.831578947368422e-06,
"loss": 0.367,
"step": 1541
},
{
"epoch": 2.383307573415765,
"grad_norm": 13.6875,
"learning_rate": 4.821052631578948e-06,
"loss": 0.4097,
"step": 1542
},
{
"epoch": 2.384853168469861,
"grad_norm": 11.5,
"learning_rate": 4.8105263157894735e-06,
"loss": 0.3812,
"step": 1543
},
{
"epoch": 2.3863987635239567,
"grad_norm": 10.4375,
"learning_rate": 4.800000000000001e-06,
"loss": 0.3696,
"step": 1544
},
{
"epoch": 2.3879443585780527,
"grad_norm": 16.375,
"learning_rate": 4.789473684210527e-06,
"loss": 0.4267,
"step": 1545
},
{
"epoch": 2.3894899536321486,
"grad_norm": 11.125,
"learning_rate": 4.778947368421053e-06,
"loss": 0.445,
"step": 1546
},
{
"epoch": 2.391035548686244,
"grad_norm": 9.625,
"learning_rate": 4.76842105263158e-06,
"loss": 0.4179,
"step": 1547
},
{
"epoch": 2.39258114374034,
"grad_norm": 12.4375,
"learning_rate": 4.757894736842106e-06,
"loss": 0.3801,
"step": 1548
},
{
"epoch": 2.394126738794436,
"grad_norm": 12.8125,
"learning_rate": 4.747368421052632e-06,
"loss": 0.3343,
"step": 1549
},
{
"epoch": 2.3956723338485317,
"grad_norm": 24.875,
"learning_rate": 4.736842105263158e-06,
"loss": 0.4098,
"step": 1550
},
{
"epoch": 2.3972179289026276,
"grad_norm": 11.75,
"learning_rate": 4.726315789473684e-06,
"loss": 0.4387,
"step": 1551
},
{
"epoch": 2.398763523956723,
"grad_norm": 9.125,
"learning_rate": 4.71578947368421e-06,
"loss": 0.4161,
"step": 1552
},
{
"epoch": 2.400309119010819,
"grad_norm": 11.25,
"learning_rate": 4.705263157894738e-06,
"loss": 0.4809,
"step": 1553
},
{
"epoch": 2.401854714064915,
"grad_norm": 11.25,
"learning_rate": 4.694736842105264e-06,
"loss": 0.4091,
"step": 1554
},
{
"epoch": 2.403400309119011,
"grad_norm": 10.5625,
"learning_rate": 4.68421052631579e-06,
"loss": 0.3778,
"step": 1555
},
{
"epoch": 2.4049459041731067,
"grad_norm": 10.3125,
"learning_rate": 4.6736842105263166e-06,
"loss": 0.3738,
"step": 1556
},
{
"epoch": 2.4064914992272026,
"grad_norm": 11.375,
"learning_rate": 4.663157894736842e-06,
"loss": 0.4912,
"step": 1557
},
{
"epoch": 2.4080370942812985,
"grad_norm": 9.8125,
"learning_rate": 4.652631578947368e-06,
"loss": 0.3971,
"step": 1558
},
{
"epoch": 2.409582689335394,
"grad_norm": 11.625,
"learning_rate": 4.642105263157895e-06,
"loss": 0.3892,
"step": 1559
},
{
"epoch": 2.41112828438949,
"grad_norm": 9.875,
"learning_rate": 4.631578947368421e-06,
"loss": 0.4003,
"step": 1560
},
{
"epoch": 2.412673879443586,
"grad_norm": 10.1875,
"learning_rate": 4.621052631578948e-06,
"loss": 0.4324,
"step": 1561
},
{
"epoch": 2.4142194744976817,
"grad_norm": 9.375,
"learning_rate": 4.6105263157894745e-06,
"loss": 0.3877,
"step": 1562
},
{
"epoch": 2.4157650695517776,
"grad_norm": 10.375,
"learning_rate": 4.600000000000001e-06,
"loss": 0.4197,
"step": 1563
},
{
"epoch": 2.417310664605873,
"grad_norm": 8.5625,
"learning_rate": 4.589473684210526e-06,
"loss": 0.3632,
"step": 1564
},
{
"epoch": 2.418856259659969,
"grad_norm": 9.875,
"learning_rate": 4.578947368421053e-06,
"loss": 0.3915,
"step": 1565
},
{
"epoch": 2.420401854714065,
"grad_norm": 8.625,
"learning_rate": 4.568421052631579e-06,
"loss": 0.3877,
"step": 1566
},
{
"epoch": 2.421947449768161,
"grad_norm": 10.125,
"learning_rate": 4.557894736842105e-06,
"loss": 0.4359,
"step": 1567
},
{
"epoch": 2.4234930448222567,
"grad_norm": 9.75,
"learning_rate": 4.547368421052632e-06,
"loss": 0.3725,
"step": 1568
},
{
"epoch": 2.4250386398763526,
"grad_norm": 10.3125,
"learning_rate": 4.536842105263158e-06,
"loss": 0.3291,
"step": 1569
},
{
"epoch": 2.426584234930448,
"grad_norm": 10.75,
"learning_rate": 4.526315789473685e-06,
"loss": 0.3561,
"step": 1570
},
{
"epoch": 2.428129829984544,
"grad_norm": 9.3125,
"learning_rate": 4.5157894736842115e-06,
"loss": 0.4158,
"step": 1571
},
{
"epoch": 2.42967542503864,
"grad_norm": 10.0625,
"learning_rate": 4.505263157894737e-06,
"loss": 0.3719,
"step": 1572
},
{
"epoch": 2.4312210200927358,
"grad_norm": 9.5,
"learning_rate": 4.494736842105263e-06,
"loss": 0.3725,
"step": 1573
},
{
"epoch": 2.4327666151468317,
"grad_norm": 10.6875,
"learning_rate": 4.48421052631579e-06,
"loss": 0.4742,
"step": 1574
},
{
"epoch": 2.434312210200927,
"grad_norm": 9.9375,
"learning_rate": 4.473684210526316e-06,
"loss": 0.4109,
"step": 1575
},
{
"epoch": 2.435857805255023,
"grad_norm": 11.8125,
"learning_rate": 4.463157894736842e-06,
"loss": 0.3921,
"step": 1576
},
{
"epoch": 2.437403400309119,
"grad_norm": 10.625,
"learning_rate": 4.452631578947369e-06,
"loss": 0.4003,
"step": 1577
},
{
"epoch": 2.438948995363215,
"grad_norm": 10.125,
"learning_rate": 4.442105263157896e-06,
"loss": 0.3946,
"step": 1578
},
{
"epoch": 2.4404945904173108,
"grad_norm": 9.75,
"learning_rate": 4.431578947368421e-06,
"loss": 0.394,
"step": 1579
},
{
"epoch": 2.4420401854714067,
"grad_norm": 10.875,
"learning_rate": 4.4210526315789476e-06,
"loss": 0.4385,
"step": 1580
},
{
"epoch": 2.443585780525502,
"grad_norm": 11.5,
"learning_rate": 4.410526315789474e-06,
"loss": 0.4159,
"step": 1581
},
{
"epoch": 2.445131375579598,
"grad_norm": 10.4375,
"learning_rate": 4.4e-06,
"loss": 0.3802,
"step": 1582
},
{
"epoch": 2.446676970633694,
"grad_norm": 10.375,
"learning_rate": 4.3894736842105266e-06,
"loss": 0.4597,
"step": 1583
},
{
"epoch": 2.44822256568779,
"grad_norm": 10.25,
"learning_rate": 4.378947368421053e-06,
"loss": 0.4427,
"step": 1584
},
{
"epoch": 2.4497681607418857,
"grad_norm": 10.75,
"learning_rate": 4.368421052631579e-06,
"loss": 0.4613,
"step": 1585
},
{
"epoch": 2.451313755795981,
"grad_norm": 10.75,
"learning_rate": 4.3578947368421055e-06,
"loss": 0.4316,
"step": 1586
},
{
"epoch": 2.452859350850077,
"grad_norm": 11.0625,
"learning_rate": 4.347368421052632e-06,
"loss": 0.4197,
"step": 1587
},
{
"epoch": 2.454404945904173,
"grad_norm": 14.75,
"learning_rate": 4.336842105263158e-06,
"loss": 0.411,
"step": 1588
},
{
"epoch": 2.455950540958269,
"grad_norm": 12.4375,
"learning_rate": 4.3263157894736845e-06,
"loss": 0.4625,
"step": 1589
},
{
"epoch": 2.457496136012365,
"grad_norm": 9.8125,
"learning_rate": 4.315789473684211e-06,
"loss": 0.418,
"step": 1590
},
{
"epoch": 2.4590417310664607,
"grad_norm": 20.125,
"learning_rate": 4.305263157894737e-06,
"loss": 0.379,
"step": 1591
},
{
"epoch": 2.4605873261205566,
"grad_norm": 11.0625,
"learning_rate": 4.2947368421052635e-06,
"loss": 0.3886,
"step": 1592
},
{
"epoch": 2.462132921174652,
"grad_norm": 10.8125,
"learning_rate": 4.28421052631579e-06,
"loss": 0.4662,
"step": 1593
},
{
"epoch": 2.463678516228748,
"grad_norm": 12.75,
"learning_rate": 4.273684210526316e-06,
"loss": 0.4249,
"step": 1594
},
{
"epoch": 2.465224111282844,
"grad_norm": 10.75,
"learning_rate": 4.2631578947368425e-06,
"loss": 0.3885,
"step": 1595
},
{
"epoch": 2.46676970633694,
"grad_norm": 9.3125,
"learning_rate": 4.252631578947369e-06,
"loss": 0.3949,
"step": 1596
},
{
"epoch": 2.4683153013910357,
"grad_norm": 10.0625,
"learning_rate": 4.242105263157895e-06,
"loss": 0.4085,
"step": 1597
},
{
"epoch": 2.469860896445131,
"grad_norm": 9.875,
"learning_rate": 4.2315789473684215e-06,
"loss": 0.392,
"step": 1598
},
{
"epoch": 2.471406491499227,
"grad_norm": 11.125,
"learning_rate": 4.221052631578948e-06,
"loss": 0.4615,
"step": 1599
},
{
"epoch": 2.472952086553323,
"grad_norm": 10.5,
"learning_rate": 4.210526315789474e-06,
"loss": 0.3662,
"step": 1600
},
{
"epoch": 2.474497681607419,
"grad_norm": 10.1875,
"learning_rate": 4.2000000000000004e-06,
"loss": 0.4281,
"step": 1601
},
{
"epoch": 2.476043276661515,
"grad_norm": 10.875,
"learning_rate": 4.189473684210527e-06,
"loss": 0.382,
"step": 1602
},
{
"epoch": 2.4775888717156107,
"grad_norm": 9.8125,
"learning_rate": 4.178947368421053e-06,
"loss": 0.378,
"step": 1603
},
{
"epoch": 2.479134466769706,
"grad_norm": 11.75,
"learning_rate": 4.1684210526315794e-06,
"loss": 0.436,
"step": 1604
},
{
"epoch": 2.480680061823802,
"grad_norm": 10.75,
"learning_rate": 4.157894736842106e-06,
"loss": 0.3976,
"step": 1605
},
{
"epoch": 2.482225656877898,
"grad_norm": 10.4375,
"learning_rate": 4.147368421052632e-06,
"loss": 0.4102,
"step": 1606
},
{
"epoch": 2.483771251931994,
"grad_norm": 8.5625,
"learning_rate": 4.136842105263158e-06,
"loss": 0.3513,
"step": 1607
},
{
"epoch": 2.48531684698609,
"grad_norm": 12.3125,
"learning_rate": 4.126315789473685e-06,
"loss": 0.4326,
"step": 1608
},
{
"epoch": 2.4868624420401853,
"grad_norm": 10.1875,
"learning_rate": 4.115789473684211e-06,
"loss": 0.3978,
"step": 1609
},
{
"epoch": 2.488408037094281,
"grad_norm": 9.625,
"learning_rate": 4.105263157894737e-06,
"loss": 0.3764,
"step": 1610
},
{
"epoch": 2.489953632148377,
"grad_norm": 11.5625,
"learning_rate": 4.094736842105264e-06,
"loss": 0.4472,
"step": 1611
},
{
"epoch": 2.491499227202473,
"grad_norm": 9.6875,
"learning_rate": 4.08421052631579e-06,
"loss": 0.3672,
"step": 1612
},
{
"epoch": 2.493044822256569,
"grad_norm": 20.75,
"learning_rate": 4.073684210526316e-06,
"loss": 0.4568,
"step": 1613
},
{
"epoch": 2.4945904173106648,
"grad_norm": 10.0625,
"learning_rate": 4.063157894736842e-06,
"loss": 0.3847,
"step": 1614
},
{
"epoch": 2.4961360123647607,
"grad_norm": 9.9375,
"learning_rate": 4.052631578947368e-06,
"loss": 0.3891,
"step": 1615
},
{
"epoch": 2.497681607418856,
"grad_norm": 8.75,
"learning_rate": 4.042105263157895e-06,
"loss": 0.3774,
"step": 1616
},
{
"epoch": 2.499227202472952,
"grad_norm": 9.8125,
"learning_rate": 4.031578947368422e-06,
"loss": 0.3916,
"step": 1617
},
{
"epoch": 2.500772797527048,
"grad_norm": 10.125,
"learning_rate": 4.021052631578948e-06,
"loss": 0.4479,
"step": 1618
},
{
"epoch": 2.502318392581144,
"grad_norm": 12.8125,
"learning_rate": 4.010526315789474e-06,
"loss": 0.4328,
"step": 1619
},
{
"epoch": 2.5038639876352393,
"grad_norm": 11.625,
"learning_rate": 4.000000000000001e-06,
"loss": 0.4016,
"step": 1620
},
{
"epoch": 2.5054095826893352,
"grad_norm": 9.375,
"learning_rate": 3.989473684210526e-06,
"loss": 0.3279,
"step": 1621
},
{
"epoch": 2.506955177743431,
"grad_norm": 10.5625,
"learning_rate": 3.9789473684210525e-06,
"loss": 0.418,
"step": 1622
},
{
"epoch": 2.508500772797527,
"grad_norm": 13.375,
"learning_rate": 3.968421052631579e-06,
"loss": 0.4089,
"step": 1623
},
{
"epoch": 2.510046367851623,
"grad_norm": 9.1875,
"learning_rate": 3.957894736842106e-06,
"loss": 0.4321,
"step": 1624
},
{
"epoch": 2.511591962905719,
"grad_norm": 12.875,
"learning_rate": 3.947368421052632e-06,
"loss": 0.4205,
"step": 1625
},
{
"epoch": 2.5131375579598147,
"grad_norm": 9.3125,
"learning_rate": 3.936842105263159e-06,
"loss": 0.4202,
"step": 1626
},
{
"epoch": 2.51468315301391,
"grad_norm": 11.9375,
"learning_rate": 3.926315789473685e-06,
"loss": 0.4315,
"step": 1627
},
{
"epoch": 2.516228748068006,
"grad_norm": 10.375,
"learning_rate": 3.9157894736842104e-06,
"loss": 0.4529,
"step": 1628
},
{
"epoch": 2.517774343122102,
"grad_norm": 17.125,
"learning_rate": 3.905263157894737e-06,
"loss": 0.4297,
"step": 1629
},
{
"epoch": 2.519319938176198,
"grad_norm": 11.9375,
"learning_rate": 3.894736842105263e-06,
"loss": 0.3866,
"step": 1630
},
{
"epoch": 2.5208655332302934,
"grad_norm": 9.5625,
"learning_rate": 3.884210526315789e-06,
"loss": 0.3415,
"step": 1631
},
{
"epoch": 2.5224111282843893,
"grad_norm": 9.3125,
"learning_rate": 3.873684210526316e-06,
"loss": 0.3847,
"step": 1632
},
{
"epoch": 2.523956723338485,
"grad_norm": 10.8125,
"learning_rate": 3.863157894736843e-06,
"loss": 0.4469,
"step": 1633
},
{
"epoch": 2.525502318392581,
"grad_norm": 9.5625,
"learning_rate": 3.852631578947369e-06,
"loss": 0.3824,
"step": 1634
},
{
"epoch": 2.527047913446677,
"grad_norm": 9.3125,
"learning_rate": 3.842105263157895e-06,
"loss": 0.3743,
"step": 1635
},
{
"epoch": 2.528593508500773,
"grad_norm": 10.6875,
"learning_rate": 3.831578947368421e-06,
"loss": 0.445,
"step": 1636
},
{
"epoch": 2.530139103554869,
"grad_norm": 10.875,
"learning_rate": 3.821052631578947e-06,
"loss": 0.4259,
"step": 1637
},
{
"epoch": 2.5316846986089647,
"grad_norm": 9.5,
"learning_rate": 3.810526315789474e-06,
"loss": 0.4229,
"step": 1638
},
{
"epoch": 2.53323029366306,
"grad_norm": 9.5625,
"learning_rate": 3.8000000000000005e-06,
"loss": 0.4429,
"step": 1639
},
{
"epoch": 2.534775888717156,
"grad_norm": 10.8125,
"learning_rate": 3.789473684210527e-06,
"loss": 0.372,
"step": 1640
},
{
"epoch": 2.536321483771252,
"grad_norm": 10.25,
"learning_rate": 3.778947368421053e-06,
"loss": 0.4234,
"step": 1641
},
{
"epoch": 2.537867078825348,
"grad_norm": 9.0625,
"learning_rate": 3.768421052631579e-06,
"loss": 0.4577,
"step": 1642
},
{
"epoch": 2.5394126738794434,
"grad_norm": 10.8125,
"learning_rate": 3.7578947368421053e-06,
"loss": 0.4178,
"step": 1643
},
{
"epoch": 2.5409582689335393,
"grad_norm": 10.0625,
"learning_rate": 3.7473684210526317e-06,
"loss": 0.374,
"step": 1644
},
{
"epoch": 2.542503863987635,
"grad_norm": 10.4375,
"learning_rate": 3.736842105263158e-06,
"loss": 0.444,
"step": 1645
},
{
"epoch": 2.544049459041731,
"grad_norm": 12.625,
"learning_rate": 3.7263157894736848e-06,
"loss": 0.4005,
"step": 1646
},
{
"epoch": 2.545595054095827,
"grad_norm": 8.875,
"learning_rate": 3.715789473684211e-06,
"loss": 0.3939,
"step": 1647
},
{
"epoch": 2.547140649149923,
"grad_norm": 10.4375,
"learning_rate": 3.7052631578947374e-06,
"loss": 0.4416,
"step": 1648
},
{
"epoch": 2.548686244204019,
"grad_norm": 11.0625,
"learning_rate": 3.6947368421052637e-06,
"loss": 0.3953,
"step": 1649
},
{
"epoch": 2.5502318392581143,
"grad_norm": 10.375,
"learning_rate": 3.6842105263157896e-06,
"loss": 0.3872,
"step": 1650
},
{
"epoch": 2.55177743431221,
"grad_norm": 10.0,
"learning_rate": 3.673684210526316e-06,
"loss": 0.4056,
"step": 1651
},
{
"epoch": 2.553323029366306,
"grad_norm": 10.125,
"learning_rate": 3.6631578947368423e-06,
"loss": 0.4306,
"step": 1652
},
{
"epoch": 2.554868624420402,
"grad_norm": 9.0,
"learning_rate": 3.6526315789473686e-06,
"loss": 0.4008,
"step": 1653
},
{
"epoch": 2.5564142194744974,
"grad_norm": 9.625,
"learning_rate": 3.642105263157895e-06,
"loss": 0.3627,
"step": 1654
},
{
"epoch": 2.5579598145285933,
"grad_norm": 10.3125,
"learning_rate": 3.6315789473684217e-06,
"loss": 0.4194,
"step": 1655
},
{
"epoch": 2.5595054095826892,
"grad_norm": 11.75,
"learning_rate": 3.621052631578948e-06,
"loss": 0.4478,
"step": 1656
},
{
"epoch": 2.561051004636785,
"grad_norm": 11.0,
"learning_rate": 3.610526315789474e-06,
"loss": 0.4145,
"step": 1657
},
{
"epoch": 2.562596599690881,
"grad_norm": 10.1875,
"learning_rate": 3.6000000000000003e-06,
"loss": 0.4391,
"step": 1658
},
{
"epoch": 2.564142194744977,
"grad_norm": 9.9375,
"learning_rate": 3.5894736842105266e-06,
"loss": 0.4615,
"step": 1659
},
{
"epoch": 2.565687789799073,
"grad_norm": 10.4375,
"learning_rate": 3.578947368421053e-06,
"loss": 0.3776,
"step": 1660
},
{
"epoch": 2.5672333848531683,
"grad_norm": 11.1875,
"learning_rate": 3.5684210526315792e-06,
"loss": 0.3906,
"step": 1661
},
{
"epoch": 2.5687789799072642,
"grad_norm": 10.8125,
"learning_rate": 3.5578947368421056e-06,
"loss": 0.4325,
"step": 1662
},
{
"epoch": 2.57032457496136,
"grad_norm": 11.4375,
"learning_rate": 3.5473684210526323e-06,
"loss": 0.3989,
"step": 1663
},
{
"epoch": 2.571870170015456,
"grad_norm": 11.5,
"learning_rate": 3.536842105263158e-06,
"loss": 0.3671,
"step": 1664
},
{
"epoch": 2.573415765069552,
"grad_norm": 9.25,
"learning_rate": 3.5263157894736846e-06,
"loss": 0.4,
"step": 1665
},
{
"epoch": 2.5749613601236474,
"grad_norm": 9.3125,
"learning_rate": 3.515789473684211e-06,
"loss": 0.3852,
"step": 1666
},
{
"epoch": 2.5765069551777433,
"grad_norm": 10.8125,
"learning_rate": 3.505263157894737e-06,
"loss": 0.4481,
"step": 1667
},
{
"epoch": 2.578052550231839,
"grad_norm": 17.5,
"learning_rate": 3.4947368421052635e-06,
"loss": 0.411,
"step": 1668
},
{
"epoch": 2.579598145285935,
"grad_norm": 9.9375,
"learning_rate": 3.48421052631579e-06,
"loss": 0.4385,
"step": 1669
},
{
"epoch": 2.581143740340031,
"grad_norm": 10.6875,
"learning_rate": 3.473684210526316e-06,
"loss": 0.4663,
"step": 1670
},
{
"epoch": 2.582689335394127,
"grad_norm": 27.375,
"learning_rate": 3.463157894736842e-06,
"loss": 0.367,
"step": 1671
},
{
"epoch": 2.584234930448223,
"grad_norm": 14.0625,
"learning_rate": 3.4526315789473684e-06,
"loss": 0.393,
"step": 1672
},
{
"epoch": 2.5857805255023183,
"grad_norm": 12.75,
"learning_rate": 3.4421052631578947e-06,
"loss": 0.4084,
"step": 1673
},
{
"epoch": 2.587326120556414,
"grad_norm": 10.0625,
"learning_rate": 3.4315789473684215e-06,
"loss": 0.3922,
"step": 1674
},
{
"epoch": 2.58887171561051,
"grad_norm": 9.4375,
"learning_rate": 3.421052631578948e-06,
"loss": 0.3793,
"step": 1675
},
{
"epoch": 2.590417310664606,
"grad_norm": 10.25,
"learning_rate": 3.410526315789474e-06,
"loss": 0.3873,
"step": 1676
},
{
"epoch": 2.5919629057187015,
"grad_norm": 10.125,
"learning_rate": 3.4000000000000005e-06,
"loss": 0.4841,
"step": 1677
},
{
"epoch": 2.5935085007727974,
"grad_norm": 9.8125,
"learning_rate": 3.3894736842105264e-06,
"loss": 0.4,
"step": 1678
},
{
"epoch": 2.5950540958268933,
"grad_norm": 10.625,
"learning_rate": 3.3789473684210527e-06,
"loss": 0.3864,
"step": 1679
},
{
"epoch": 2.596599690880989,
"grad_norm": 10.5625,
"learning_rate": 3.368421052631579e-06,
"loss": 0.4561,
"step": 1680
},
{
"epoch": 2.598145285935085,
"grad_norm": 10.75,
"learning_rate": 3.3578947368421054e-06,
"loss": 0.4422,
"step": 1681
},
{
"epoch": 2.599690880989181,
"grad_norm": 11.6875,
"learning_rate": 3.347368421052632e-06,
"loss": 0.3638,
"step": 1682
},
{
"epoch": 2.601236476043277,
"grad_norm": 10.0625,
"learning_rate": 3.3368421052631584e-06,
"loss": 0.3278,
"step": 1683
},
{
"epoch": 2.6027820710973724,
"grad_norm": 9.9375,
"learning_rate": 3.3263157894736848e-06,
"loss": 0.3749,
"step": 1684
},
{
"epoch": 2.6043276661514683,
"grad_norm": 11.625,
"learning_rate": 3.3157894736842107e-06,
"loss": 0.4204,
"step": 1685
},
{
"epoch": 2.605873261205564,
"grad_norm": 9.75,
"learning_rate": 3.305263157894737e-06,
"loss": 0.4236,
"step": 1686
},
{
"epoch": 2.60741885625966,
"grad_norm": 10.875,
"learning_rate": 3.2947368421052633e-06,
"loss": 0.3948,
"step": 1687
},
{
"epoch": 2.6089644513137555,
"grad_norm": 12.9375,
"learning_rate": 3.2842105263157897e-06,
"loss": 0.41,
"step": 1688
},
{
"epoch": 2.6105100463678514,
"grad_norm": 11.0,
"learning_rate": 3.273684210526316e-06,
"loss": 0.3952,
"step": 1689
},
{
"epoch": 2.6120556414219473,
"grad_norm": 10.3125,
"learning_rate": 3.2631578947368423e-06,
"loss": 0.402,
"step": 1690
},
{
"epoch": 2.6136012364760433,
"grad_norm": 11.25,
"learning_rate": 3.252631578947369e-06,
"loss": 0.3283,
"step": 1691
},
{
"epoch": 2.615146831530139,
"grad_norm": 9.6875,
"learning_rate": 3.2421052631578945e-06,
"loss": 0.3604,
"step": 1692
},
{
"epoch": 2.616692426584235,
"grad_norm": 12.5625,
"learning_rate": 3.2315789473684213e-06,
"loss": 0.4216,
"step": 1693
},
{
"epoch": 2.618238021638331,
"grad_norm": 9.3125,
"learning_rate": 3.2210526315789476e-06,
"loss": 0.4181,
"step": 1694
},
{
"epoch": 2.6197836166924264,
"grad_norm": 10.5625,
"learning_rate": 3.210526315789474e-06,
"loss": 0.4043,
"step": 1695
},
{
"epoch": 2.6213292117465223,
"grad_norm": 9.0,
"learning_rate": 3.2000000000000003e-06,
"loss": 0.3961,
"step": 1696
},
{
"epoch": 2.6228748068006182,
"grad_norm": 15.6875,
"learning_rate": 3.1894736842105266e-06,
"loss": 0.4476,
"step": 1697
},
{
"epoch": 2.624420401854714,
"grad_norm": 11.0625,
"learning_rate": 3.178947368421053e-06,
"loss": 0.4221,
"step": 1698
},
{
"epoch": 2.62596599690881,
"grad_norm": 11.0625,
"learning_rate": 3.168421052631579e-06,
"loss": 0.4244,
"step": 1699
},
{
"epoch": 2.6275115919629055,
"grad_norm": 10.0,
"learning_rate": 3.157894736842105e-06,
"loss": 0.3973,
"step": 1700
},
{
"epoch": 2.6290571870170014,
"grad_norm": 11.375,
"learning_rate": 3.147368421052632e-06,
"loss": 0.4377,
"step": 1701
},
{
"epoch": 2.6306027820710973,
"grad_norm": 11.3125,
"learning_rate": 3.1368421052631582e-06,
"loss": 0.4146,
"step": 1702
},
{
"epoch": 2.6321483771251932,
"grad_norm": 10.4375,
"learning_rate": 3.1263157894736846e-06,
"loss": 0.4678,
"step": 1703
},
{
"epoch": 2.633693972179289,
"grad_norm": 11.1875,
"learning_rate": 3.115789473684211e-06,
"loss": 0.4078,
"step": 1704
},
{
"epoch": 2.635239567233385,
"grad_norm": 11.125,
"learning_rate": 3.1052631578947372e-06,
"loss": 0.3784,
"step": 1705
},
{
"epoch": 2.636785162287481,
"grad_norm": 9.9375,
"learning_rate": 3.094736842105263e-06,
"loss": 0.4102,
"step": 1706
},
{
"epoch": 2.6383307573415764,
"grad_norm": 9.3125,
"learning_rate": 3.0842105263157895e-06,
"loss": 0.4125,
"step": 1707
},
{
"epoch": 2.6398763523956723,
"grad_norm": 11.0625,
"learning_rate": 3.0736842105263158e-06,
"loss": 0.3951,
"step": 1708
},
{
"epoch": 2.641421947449768,
"grad_norm": 9.875,
"learning_rate": 3.0631578947368425e-06,
"loss": 0.3627,
"step": 1709
},
{
"epoch": 2.642967542503864,
"grad_norm": 14.4375,
"learning_rate": 3.052631578947369e-06,
"loss": 0.406,
"step": 1710
},
{
"epoch": 2.6445131375579596,
"grad_norm": 10.125,
"learning_rate": 3.042105263157895e-06,
"loss": 0.4151,
"step": 1711
},
{
"epoch": 2.6460587326120555,
"grad_norm": 9.4375,
"learning_rate": 3.0315789473684215e-06,
"loss": 0.3784,
"step": 1712
},
{
"epoch": 2.6476043276661514,
"grad_norm": 10.3125,
"learning_rate": 3.0210526315789474e-06,
"loss": 0.4338,
"step": 1713
},
{
"epoch": 2.6491499227202473,
"grad_norm": 10.3125,
"learning_rate": 3.0105263157894737e-06,
"loss": 0.3633,
"step": 1714
},
{
"epoch": 2.650695517774343,
"grad_norm": 9.375,
"learning_rate": 3e-06,
"loss": 0.4379,
"step": 1715
},
{
"epoch": 2.652241112828439,
"grad_norm": 11.0,
"learning_rate": 2.9894736842105264e-06,
"loss": 0.3851,
"step": 1716
},
{
"epoch": 2.653786707882535,
"grad_norm": 10.5,
"learning_rate": 2.9789473684210527e-06,
"loss": 0.4064,
"step": 1717
},
{
"epoch": 2.6553323029366305,
"grad_norm": 9.5,
"learning_rate": 2.9684210526315795e-06,
"loss": 0.3912,
"step": 1718
},
{
"epoch": 2.6568778979907264,
"grad_norm": 10.75,
"learning_rate": 2.957894736842106e-06,
"loss": 0.381,
"step": 1719
},
{
"epoch": 2.6584234930448223,
"grad_norm": 12.0,
"learning_rate": 2.9473684210526317e-06,
"loss": 0.4228,
"step": 1720
},
{
"epoch": 2.659969088098918,
"grad_norm": 23.375,
"learning_rate": 2.936842105263158e-06,
"loss": 0.3884,
"step": 1721
},
{
"epoch": 2.6615146831530136,
"grad_norm": 10.5625,
"learning_rate": 2.9263157894736844e-06,
"loss": 0.4685,
"step": 1722
},
{
"epoch": 2.6630602782071096,
"grad_norm": 10.125,
"learning_rate": 2.9157894736842107e-06,
"loss": 0.3619,
"step": 1723
},
{
"epoch": 2.6646058732612055,
"grad_norm": 9.5625,
"learning_rate": 2.905263157894737e-06,
"loss": 0.3738,
"step": 1724
},
{
"epoch": 2.6661514683153014,
"grad_norm": 9.1875,
"learning_rate": 2.8947368421052634e-06,
"loss": 0.3707,
"step": 1725
},
{
"epoch": 2.6676970633693973,
"grad_norm": 11.8125,
"learning_rate": 2.88421052631579e-06,
"loss": 0.3928,
"step": 1726
},
{
"epoch": 2.669242658423493,
"grad_norm": 13.0,
"learning_rate": 2.8736842105263164e-06,
"loss": 0.3825,
"step": 1727
},
{
"epoch": 2.670788253477589,
"grad_norm": 10.375,
"learning_rate": 2.8631578947368423e-06,
"loss": 0.3854,
"step": 1728
},
{
"epoch": 2.6723338485316845,
"grad_norm": 10.75,
"learning_rate": 2.8526315789473687e-06,
"loss": 0.3673,
"step": 1729
},
{
"epoch": 2.6738794435857804,
"grad_norm": 9.3125,
"learning_rate": 2.842105263157895e-06,
"loss": 0.4554,
"step": 1730
},
{
"epoch": 2.6754250386398764,
"grad_norm": 9.5,
"learning_rate": 2.8315789473684213e-06,
"loss": 0.3797,
"step": 1731
},
{
"epoch": 2.6769706336939723,
"grad_norm": 9.375,
"learning_rate": 2.8210526315789476e-06,
"loss": 0.4412,
"step": 1732
},
{
"epoch": 2.678516228748068,
"grad_norm": 10.0625,
"learning_rate": 2.810526315789474e-06,
"loss": 0.4124,
"step": 1733
},
{
"epoch": 2.6800618238021636,
"grad_norm": 10.8125,
"learning_rate": 2.8000000000000003e-06,
"loss": 0.3944,
"step": 1734
},
{
"epoch": 2.6816074188562595,
"grad_norm": 9.25,
"learning_rate": 2.789473684210526e-06,
"loss": 0.3976,
"step": 1735
},
{
"epoch": 2.6831530139103554,
"grad_norm": 13.1875,
"learning_rate": 2.7789473684210525e-06,
"loss": 0.3777,
"step": 1736
},
{
"epoch": 2.6846986089644513,
"grad_norm": 12.5,
"learning_rate": 2.7684210526315793e-06,
"loss": 0.4489,
"step": 1737
},
{
"epoch": 2.6862442040185472,
"grad_norm": 14.625,
"learning_rate": 2.7578947368421056e-06,
"loss": 0.4448,
"step": 1738
},
{
"epoch": 2.687789799072643,
"grad_norm": 10.5,
"learning_rate": 2.747368421052632e-06,
"loss": 0.3733,
"step": 1739
},
{
"epoch": 2.689335394126739,
"grad_norm": 9.125,
"learning_rate": 2.7368421052631583e-06,
"loss": 0.3982,
"step": 1740
},
{
"epoch": 2.6908809891808345,
"grad_norm": 9.8125,
"learning_rate": 2.7263157894736846e-06,
"loss": 0.4048,
"step": 1741
},
{
"epoch": 2.6924265842349304,
"grad_norm": 9.75,
"learning_rate": 2.7157894736842105e-06,
"loss": 0.408,
"step": 1742
},
{
"epoch": 2.6939721792890263,
"grad_norm": 9.6875,
"learning_rate": 2.705263157894737e-06,
"loss": 0.4068,
"step": 1743
},
{
"epoch": 2.6955177743431222,
"grad_norm": 10.125,
"learning_rate": 2.694736842105263e-06,
"loss": 0.4276,
"step": 1744
},
{
"epoch": 2.6970633693972177,
"grad_norm": 10.8125,
"learning_rate": 2.68421052631579e-06,
"loss": 0.4537,
"step": 1745
},
{
"epoch": 2.6986089644513136,
"grad_norm": 10.3125,
"learning_rate": 2.6736842105263162e-06,
"loss": 0.4682,
"step": 1746
},
{
"epoch": 2.7001545595054095,
"grad_norm": 10.125,
"learning_rate": 2.6631578947368426e-06,
"loss": 0.3858,
"step": 1747
},
{
"epoch": 2.7017001545595054,
"grad_norm": 9.0,
"learning_rate": 2.652631578947369e-06,
"loss": 0.3793,
"step": 1748
},
{
"epoch": 2.7032457496136013,
"grad_norm": 10.75,
"learning_rate": 2.6421052631578948e-06,
"loss": 0.395,
"step": 1749
},
{
"epoch": 2.704791344667697,
"grad_norm": 10.75,
"learning_rate": 2.631578947368421e-06,
"loss": 0.3849,
"step": 1750
},
{
"epoch": 2.706336939721793,
"grad_norm": 9.8125,
"learning_rate": 2.6210526315789474e-06,
"loss": 0.4047,
"step": 1751
},
{
"epoch": 2.7078825347758886,
"grad_norm": 10.6875,
"learning_rate": 2.6105263157894738e-06,
"loss": 0.4146,
"step": 1752
},
{
"epoch": 2.7094281298299845,
"grad_norm": 11.375,
"learning_rate": 2.6e-06,
"loss": 0.4149,
"step": 1753
},
{
"epoch": 2.7109737248840804,
"grad_norm": 9.5625,
"learning_rate": 2.589473684210527e-06,
"loss": 0.3694,
"step": 1754
},
{
"epoch": 2.7125193199381763,
"grad_norm": 9.8125,
"learning_rate": 2.578947368421053e-06,
"loss": 0.4454,
"step": 1755
},
{
"epoch": 2.7140649149922718,
"grad_norm": 9.3125,
"learning_rate": 2.568421052631579e-06,
"loss": 0.3473,
"step": 1756
},
{
"epoch": 2.7156105100463677,
"grad_norm": 9.875,
"learning_rate": 2.5578947368421054e-06,
"loss": 0.3919,
"step": 1757
},
{
"epoch": 2.7171561051004636,
"grad_norm": 9.25,
"learning_rate": 2.5473684210526317e-06,
"loss": 0.3689,
"step": 1758
},
{
"epoch": 2.7187017001545595,
"grad_norm": 10.9375,
"learning_rate": 2.536842105263158e-06,
"loss": 0.3767,
"step": 1759
},
{
"epoch": 2.7202472952086554,
"grad_norm": 11.5625,
"learning_rate": 2.5263157894736844e-06,
"loss": 0.4031,
"step": 1760
},
{
"epoch": 2.7217928902627513,
"grad_norm": 10.75,
"learning_rate": 2.5157894736842107e-06,
"loss": 0.4077,
"step": 1761
},
{
"epoch": 2.723338485316847,
"grad_norm": 11.0625,
"learning_rate": 2.5052631578947375e-06,
"loss": 0.4016,
"step": 1762
},
{
"epoch": 2.7248840803709427,
"grad_norm": 9.625,
"learning_rate": 2.4947368421052634e-06,
"loss": 0.3557,
"step": 1763
},
{
"epoch": 2.7264296754250386,
"grad_norm": 10.625,
"learning_rate": 2.4842105263157897e-06,
"loss": 0.3676,
"step": 1764
},
{
"epoch": 2.7279752704791345,
"grad_norm": 9.25,
"learning_rate": 2.473684210526316e-06,
"loss": 0.4012,
"step": 1765
},
{
"epoch": 2.7295208655332304,
"grad_norm": 10.0625,
"learning_rate": 2.4631578947368424e-06,
"loss": 0.3387,
"step": 1766
},
{
"epoch": 2.7310664605873263,
"grad_norm": 9.6875,
"learning_rate": 2.4526315789473687e-06,
"loss": 0.3777,
"step": 1767
},
{
"epoch": 2.7326120556414217,
"grad_norm": 11.0,
"learning_rate": 2.442105263157895e-06,
"loss": 0.4259,
"step": 1768
},
{
"epoch": 2.7341576506955176,
"grad_norm": 11.0,
"learning_rate": 2.4315789473684213e-06,
"loss": 0.3763,
"step": 1769
},
{
"epoch": 2.7357032457496135,
"grad_norm": 11.9375,
"learning_rate": 2.4210526315789477e-06,
"loss": 0.4146,
"step": 1770
},
{
"epoch": 2.7372488408037094,
"grad_norm": 10.3125,
"learning_rate": 2.410526315789474e-06,
"loss": 0.4534,
"step": 1771
},
{
"epoch": 2.7387944358578054,
"grad_norm": 9.0,
"learning_rate": 2.4000000000000003e-06,
"loss": 0.3773,
"step": 1772
},
{
"epoch": 2.7403400309119013,
"grad_norm": 10.9375,
"learning_rate": 2.3894736842105266e-06,
"loss": 0.3715,
"step": 1773
},
{
"epoch": 2.741885625965997,
"grad_norm": 11.5625,
"learning_rate": 2.378947368421053e-06,
"loss": 0.4526,
"step": 1774
},
{
"epoch": 2.7434312210200926,
"grad_norm": 8.75,
"learning_rate": 2.368421052631579e-06,
"loss": 0.3926,
"step": 1775
},
{
"epoch": 2.7449768160741885,
"grad_norm": 10.75,
"learning_rate": 2.357894736842105e-06,
"loss": 0.434,
"step": 1776
},
{
"epoch": 2.7465224111282844,
"grad_norm": 20.625,
"learning_rate": 2.347368421052632e-06,
"loss": 0.4384,
"step": 1777
},
{
"epoch": 2.7480680061823803,
"grad_norm": 8.8125,
"learning_rate": 2.3368421052631583e-06,
"loss": 0.372,
"step": 1778
},
{
"epoch": 2.749613601236476,
"grad_norm": 10.3125,
"learning_rate": 2.326315789473684e-06,
"loss": 0.4376,
"step": 1779
},
{
"epoch": 2.7511591962905717,
"grad_norm": 10.0625,
"learning_rate": 2.3157894736842105e-06,
"loss": 0.3777,
"step": 1780
},
{
"epoch": 2.7527047913446676,
"grad_norm": 11.375,
"learning_rate": 2.3052631578947373e-06,
"loss": 0.4423,
"step": 1781
},
{
"epoch": 2.7542503863987635,
"grad_norm": 9.9375,
"learning_rate": 2.294736842105263e-06,
"loss": 0.3894,
"step": 1782
},
{
"epoch": 2.7557959814528594,
"grad_norm": 9.6875,
"learning_rate": 2.2842105263157895e-06,
"loss": 0.3711,
"step": 1783
},
{
"epoch": 2.7573415765069553,
"grad_norm": 9.6875,
"learning_rate": 2.273684210526316e-06,
"loss": 0.3663,
"step": 1784
},
{
"epoch": 2.7588871715610512,
"grad_norm": 10.125,
"learning_rate": 2.2631578947368426e-06,
"loss": 0.3994,
"step": 1785
},
{
"epoch": 2.7604327666151467,
"grad_norm": 9.125,
"learning_rate": 2.2526315789473685e-06,
"loss": 0.3846,
"step": 1786
},
{
"epoch": 2.7619783616692426,
"grad_norm": 8.8125,
"learning_rate": 2.242105263157895e-06,
"loss": 0.3311,
"step": 1787
},
{
"epoch": 2.7635239567233385,
"grad_norm": 12.125,
"learning_rate": 2.231578947368421e-06,
"loss": 0.4519,
"step": 1788
},
{
"epoch": 2.7650695517774344,
"grad_norm": 10.875,
"learning_rate": 2.221052631578948e-06,
"loss": 0.4599,
"step": 1789
},
{
"epoch": 2.76661514683153,
"grad_norm": 31.625,
"learning_rate": 2.2105263157894738e-06,
"loss": 0.3943,
"step": 1790
},
{
"epoch": 2.7681607418856258,
"grad_norm": 9.8125,
"learning_rate": 2.2e-06,
"loss": 0.4059,
"step": 1791
},
{
"epoch": 2.7697063369397217,
"grad_norm": 10.1875,
"learning_rate": 2.1894736842105264e-06,
"loss": 0.3567,
"step": 1792
},
{
"epoch": 2.7712519319938176,
"grad_norm": 10.125,
"learning_rate": 2.1789473684210528e-06,
"loss": 0.3779,
"step": 1793
},
{
"epoch": 2.7727975270479135,
"grad_norm": 9.375,
"learning_rate": 2.168421052631579e-06,
"loss": 0.3605,
"step": 1794
},
{
"epoch": 2.7743431221020094,
"grad_norm": 9.0,
"learning_rate": 2.1578947368421054e-06,
"loss": 0.3392,
"step": 1795
},
{
"epoch": 2.7758887171561053,
"grad_norm": 11.1875,
"learning_rate": 2.1473684210526317e-06,
"loss": 0.3849,
"step": 1796
},
{
"epoch": 2.7774343122102008,
"grad_norm": 9.25,
"learning_rate": 2.136842105263158e-06,
"loss": 0.4181,
"step": 1797
},
{
"epoch": 2.7789799072642967,
"grad_norm": 12.25,
"learning_rate": 2.1263157894736844e-06,
"loss": 0.4537,
"step": 1798
},
{
"epoch": 2.7805255023183926,
"grad_norm": 11.0,
"learning_rate": 2.1157894736842107e-06,
"loss": 0.3912,
"step": 1799
},
{
"epoch": 2.7820710973724885,
"grad_norm": 10.6875,
"learning_rate": 2.105263157894737e-06,
"loss": 0.4458,
"step": 1800
},
{
"epoch": 2.7836166924265844,
"grad_norm": 11.0,
"learning_rate": 2.0947368421052634e-06,
"loss": 0.3801,
"step": 1801
},
{
"epoch": 2.78516228748068,
"grad_norm": 9.8125,
"learning_rate": 2.0842105263157897e-06,
"loss": 0.3761,
"step": 1802
},
{
"epoch": 2.7867078825347757,
"grad_norm": 8.9375,
"learning_rate": 2.073684210526316e-06,
"loss": 0.4046,
"step": 1803
},
{
"epoch": 2.7882534775888717,
"grad_norm": 10.3125,
"learning_rate": 2.0631578947368424e-06,
"loss": 0.3733,
"step": 1804
},
{
"epoch": 2.7897990726429676,
"grad_norm": 10.4375,
"learning_rate": 2.0526315789473687e-06,
"loss": 0.4184,
"step": 1805
},
{
"epoch": 2.7913446676970635,
"grad_norm": 12.6875,
"learning_rate": 2.042105263157895e-06,
"loss": 0.4501,
"step": 1806
},
{
"epoch": 2.7928902627511594,
"grad_norm": 10.6875,
"learning_rate": 2.031578947368421e-06,
"loss": 0.4095,
"step": 1807
},
{
"epoch": 2.7944358578052553,
"grad_norm": 13.8125,
"learning_rate": 2.0210526315789477e-06,
"loss": 0.4088,
"step": 1808
},
{
"epoch": 2.7959814528593507,
"grad_norm": 11.875,
"learning_rate": 2.010526315789474e-06,
"loss": 0.4242,
"step": 1809
},
{
"epoch": 2.7975270479134466,
"grad_norm": 9.1875,
"learning_rate": 2.0000000000000003e-06,
"loss": 0.3917,
"step": 1810
},
{
"epoch": 2.7990726429675425,
"grad_norm": 13.1875,
"learning_rate": 1.9894736842105262e-06,
"loss": 0.4334,
"step": 1811
},
{
"epoch": 2.8006182380216385,
"grad_norm": 10.9375,
"learning_rate": 1.978947368421053e-06,
"loss": 0.483,
"step": 1812
},
{
"epoch": 2.802163833075734,
"grad_norm": 10.5625,
"learning_rate": 1.9684210526315793e-06,
"loss": 0.4119,
"step": 1813
},
{
"epoch": 2.80370942812983,
"grad_norm": 12.125,
"learning_rate": 1.9578947368421052e-06,
"loss": 0.4372,
"step": 1814
},
{
"epoch": 2.8052550231839257,
"grad_norm": 10.0625,
"learning_rate": 1.9473684210526315e-06,
"loss": 0.3909,
"step": 1815
},
{
"epoch": 2.8068006182380216,
"grad_norm": 16.375,
"learning_rate": 1.936842105263158e-06,
"loss": 0.4327,
"step": 1816
},
{
"epoch": 2.8083462132921175,
"grad_norm": 9.9375,
"learning_rate": 1.9263157894736846e-06,
"loss": 0.4093,
"step": 1817
},
{
"epoch": 2.8098918083462134,
"grad_norm": 13.625,
"learning_rate": 1.9157894736842105e-06,
"loss": 0.4482,
"step": 1818
},
{
"epoch": 2.8114374034003093,
"grad_norm": 11.25,
"learning_rate": 1.905263157894737e-06,
"loss": 0.3899,
"step": 1819
},
{
"epoch": 2.812982998454405,
"grad_norm": 10.375,
"learning_rate": 1.8947368421052634e-06,
"loss": 0.4006,
"step": 1820
},
{
"epoch": 2.8145285935085007,
"grad_norm": 11.75,
"learning_rate": 1.8842105263157895e-06,
"loss": 0.3468,
"step": 1821
},
{
"epoch": 2.8160741885625966,
"grad_norm": 10.8125,
"learning_rate": 1.8736842105263158e-06,
"loss": 0.4039,
"step": 1822
},
{
"epoch": 2.8176197836166925,
"grad_norm": 10.375,
"learning_rate": 1.8631578947368424e-06,
"loss": 0.3988,
"step": 1823
},
{
"epoch": 2.819165378670788,
"grad_norm": 9.625,
"learning_rate": 1.8526315789473687e-06,
"loss": 0.4123,
"step": 1824
},
{
"epoch": 2.820710973724884,
"grad_norm": 9.5,
"learning_rate": 1.8421052631578948e-06,
"loss": 0.4064,
"step": 1825
},
{
"epoch": 2.82225656877898,
"grad_norm": 10.6875,
"learning_rate": 1.8315789473684211e-06,
"loss": 0.4234,
"step": 1826
},
{
"epoch": 2.8238021638330757,
"grad_norm": 9.4375,
"learning_rate": 1.8210526315789475e-06,
"loss": 0.3731,
"step": 1827
},
{
"epoch": 2.8253477588871716,
"grad_norm": 9.0,
"learning_rate": 1.810526315789474e-06,
"loss": 0.3949,
"step": 1828
},
{
"epoch": 2.8268933539412675,
"grad_norm": 13.375,
"learning_rate": 1.8000000000000001e-06,
"loss": 0.3817,
"step": 1829
},
{
"epoch": 2.8284389489953634,
"grad_norm": 14.5625,
"learning_rate": 1.7894736842105265e-06,
"loss": 0.4308,
"step": 1830
},
{
"epoch": 2.8299845440494593,
"grad_norm": 8.6875,
"learning_rate": 1.7789473684210528e-06,
"loss": 0.3637,
"step": 1831
},
{
"epoch": 2.8315301391035548,
"grad_norm": 10.3125,
"learning_rate": 1.768421052631579e-06,
"loss": 0.3711,
"step": 1832
},
{
"epoch": 2.8330757341576507,
"grad_norm": 9.375,
"learning_rate": 1.7578947368421054e-06,
"loss": 0.4294,
"step": 1833
},
{
"epoch": 2.8346213292117466,
"grad_norm": 10.6875,
"learning_rate": 1.7473684210526318e-06,
"loss": 0.3973,
"step": 1834
},
{
"epoch": 2.8361669242658425,
"grad_norm": 12.5625,
"learning_rate": 1.736842105263158e-06,
"loss": 0.4143,
"step": 1835
},
{
"epoch": 2.837712519319938,
"grad_norm": 10.0625,
"learning_rate": 1.7263157894736842e-06,
"loss": 0.4742,
"step": 1836
},
{
"epoch": 2.839258114374034,
"grad_norm": 17.0,
"learning_rate": 1.7157894736842107e-06,
"loss": 0.4588,
"step": 1837
},
{
"epoch": 2.8408037094281298,
"grad_norm": 13.0625,
"learning_rate": 1.705263157894737e-06,
"loss": 0.4121,
"step": 1838
},
{
"epoch": 2.8423493044822257,
"grad_norm": 12.3125,
"learning_rate": 1.6947368421052632e-06,
"loss": 0.4282,
"step": 1839
},
{
"epoch": 2.8438948995363216,
"grad_norm": 10.1875,
"learning_rate": 1.6842105263157895e-06,
"loss": 0.4076,
"step": 1840
},
{
"epoch": 2.8454404945904175,
"grad_norm": 9.4375,
"learning_rate": 1.673684210526316e-06,
"loss": 0.3933,
"step": 1841
},
{
"epoch": 2.8469860896445134,
"grad_norm": 10.5,
"learning_rate": 1.6631578947368424e-06,
"loss": 0.4362,
"step": 1842
},
{
"epoch": 2.848531684698609,
"grad_norm": 12.1875,
"learning_rate": 1.6526315789473685e-06,
"loss": 0.4491,
"step": 1843
},
{
"epoch": 2.8500772797527048,
"grad_norm": 9.125,
"learning_rate": 1.6421052631578948e-06,
"loss": 0.3365,
"step": 1844
},
{
"epoch": 2.8516228748068007,
"grad_norm": 10.5625,
"learning_rate": 1.6315789473684212e-06,
"loss": 0.392,
"step": 1845
},
{
"epoch": 2.8531684698608966,
"grad_norm": 10.3125,
"learning_rate": 1.6210526315789473e-06,
"loss": 0.4085,
"step": 1846
},
{
"epoch": 2.854714064914992,
"grad_norm": 9.5,
"learning_rate": 1.6105263157894738e-06,
"loss": 0.3727,
"step": 1847
},
{
"epoch": 2.856259659969088,
"grad_norm": 48.25,
"learning_rate": 1.6000000000000001e-06,
"loss": 0.4389,
"step": 1848
},
{
"epoch": 2.857805255023184,
"grad_norm": 10.9375,
"learning_rate": 1.5894736842105265e-06,
"loss": 0.4017,
"step": 1849
},
{
"epoch": 2.8593508500772797,
"grad_norm": 9.8125,
"learning_rate": 1.5789473684210526e-06,
"loss": 0.3863,
"step": 1850
},
{
"epoch": 2.8608964451313756,
"grad_norm": 9.375,
"learning_rate": 1.5684210526315791e-06,
"loss": 0.3702,
"step": 1851
},
{
"epoch": 2.8624420401854715,
"grad_norm": 9.4375,
"learning_rate": 1.5578947368421054e-06,
"loss": 0.4564,
"step": 1852
},
{
"epoch": 2.8639876352395675,
"grad_norm": 13.5,
"learning_rate": 1.5473684210526316e-06,
"loss": 0.3896,
"step": 1853
},
{
"epoch": 2.865533230293663,
"grad_norm": 9.5625,
"learning_rate": 1.5368421052631579e-06,
"loss": 0.4136,
"step": 1854
},
{
"epoch": 2.867078825347759,
"grad_norm": 13.8125,
"learning_rate": 1.5263157894736844e-06,
"loss": 0.4244,
"step": 1855
},
{
"epoch": 2.8686244204018547,
"grad_norm": 9.9375,
"learning_rate": 1.5157894736842108e-06,
"loss": 0.4011,
"step": 1856
},
{
"epoch": 2.8701700154559506,
"grad_norm": 12.0625,
"learning_rate": 1.5052631578947369e-06,
"loss": 0.4049,
"step": 1857
},
{
"epoch": 2.871715610510046,
"grad_norm": 9.6875,
"learning_rate": 1.4947368421052632e-06,
"loss": 0.399,
"step": 1858
},
{
"epoch": 2.873261205564142,
"grad_norm": 13.3125,
"learning_rate": 1.4842105263157897e-06,
"loss": 0.4574,
"step": 1859
},
{
"epoch": 2.874806800618238,
"grad_norm": 9.5,
"learning_rate": 1.4736842105263159e-06,
"loss": 0.3981,
"step": 1860
},
{
"epoch": 2.876352395672334,
"grad_norm": 12.5625,
"learning_rate": 1.4631578947368422e-06,
"loss": 0.458,
"step": 1861
},
{
"epoch": 2.8778979907264297,
"grad_norm": 11.5625,
"learning_rate": 1.4526315789473685e-06,
"loss": 0.4053,
"step": 1862
},
{
"epoch": 2.8794435857805256,
"grad_norm": 9.125,
"learning_rate": 1.442105263157895e-06,
"loss": 0.4028,
"step": 1863
},
{
"epoch": 2.8809891808346215,
"grad_norm": 10.875,
"learning_rate": 1.4315789473684212e-06,
"loss": 0.453,
"step": 1864
},
{
"epoch": 2.8825347758887174,
"grad_norm": 9.875,
"learning_rate": 1.4210526315789475e-06,
"loss": 0.4118,
"step": 1865
},
{
"epoch": 2.884080370942813,
"grad_norm": 9.75,
"learning_rate": 1.4105263157894738e-06,
"loss": 0.3685,
"step": 1866
},
{
"epoch": 2.885625965996909,
"grad_norm": 11.8125,
"learning_rate": 1.4000000000000001e-06,
"loss": 0.3809,
"step": 1867
},
{
"epoch": 2.8871715610510047,
"grad_norm": 9.6875,
"learning_rate": 1.3894736842105263e-06,
"loss": 0.4276,
"step": 1868
},
{
"epoch": 2.8887171561051006,
"grad_norm": 9.625,
"learning_rate": 1.3789473684210528e-06,
"loss": 0.4094,
"step": 1869
},
{
"epoch": 2.890262751159196,
"grad_norm": 14.625,
"learning_rate": 1.3684210526315791e-06,
"loss": 0.4412,
"step": 1870
},
{
"epoch": 2.891808346213292,
"grad_norm": 11.5625,
"learning_rate": 1.3578947368421052e-06,
"loss": 0.3844,
"step": 1871
},
{
"epoch": 2.893353941267388,
"grad_norm": 10.5625,
"learning_rate": 1.3473684210526316e-06,
"loss": 0.4251,
"step": 1872
},
{
"epoch": 2.894899536321484,
"grad_norm": 9.1875,
"learning_rate": 1.3368421052631581e-06,
"loss": 0.371,
"step": 1873
},
{
"epoch": 2.8964451313755797,
"grad_norm": 10.1875,
"learning_rate": 1.3263157894736844e-06,
"loss": 0.4371,
"step": 1874
},
{
"epoch": 2.8979907264296756,
"grad_norm": 10.125,
"learning_rate": 1.3157894736842106e-06,
"loss": 0.427,
"step": 1875
},
{
"epoch": 2.8995363214837715,
"grad_norm": 11.375,
"learning_rate": 1.3052631578947369e-06,
"loss": 0.4068,
"step": 1876
},
{
"epoch": 2.901081916537867,
"grad_norm": 12.0,
"learning_rate": 1.2947368421052634e-06,
"loss": 0.3963,
"step": 1877
},
{
"epoch": 2.902627511591963,
"grad_norm": 8.875,
"learning_rate": 1.2842105263157895e-06,
"loss": 0.3563,
"step": 1878
},
{
"epoch": 2.9041731066460588,
"grad_norm": 8.625,
"learning_rate": 1.2736842105263159e-06,
"loss": 0.4034,
"step": 1879
},
{
"epoch": 2.9057187017001547,
"grad_norm": 9.375,
"learning_rate": 1.2631578947368422e-06,
"loss": 0.3989,
"step": 1880
},
{
"epoch": 2.90726429675425,
"grad_norm": 9.3125,
"learning_rate": 1.2526315789473687e-06,
"loss": 0.3692,
"step": 1881
},
{
"epoch": 2.908809891808346,
"grad_norm": 10.1875,
"learning_rate": 1.2421052631578948e-06,
"loss": 0.4374,
"step": 1882
},
{
"epoch": 2.910355486862442,
"grad_norm": 10.125,
"learning_rate": 1.2315789473684212e-06,
"loss": 0.4132,
"step": 1883
},
{
"epoch": 2.911901081916538,
"grad_norm": 13.25,
"learning_rate": 1.2210526315789475e-06,
"loss": 0.3918,
"step": 1884
},
{
"epoch": 2.9134466769706338,
"grad_norm": 9.3125,
"learning_rate": 1.2105263157894738e-06,
"loss": 0.385,
"step": 1885
},
{
"epoch": 2.9149922720247297,
"grad_norm": 9.5,
"learning_rate": 1.2000000000000002e-06,
"loss": 0.3525,
"step": 1886
},
{
"epoch": 2.9165378670788256,
"grad_norm": 9.125,
"learning_rate": 1.1894736842105265e-06,
"loss": 0.3566,
"step": 1887
},
{
"epoch": 2.918083462132921,
"grad_norm": 11.5625,
"learning_rate": 1.1789473684210526e-06,
"loss": 0.4376,
"step": 1888
},
{
"epoch": 2.919629057187017,
"grad_norm": 9.6875,
"learning_rate": 1.1684210526315791e-06,
"loss": 0.4202,
"step": 1889
},
{
"epoch": 2.921174652241113,
"grad_norm": 8.75,
"learning_rate": 1.1578947368421053e-06,
"loss": 0.3859,
"step": 1890
},
{
"epoch": 2.9227202472952087,
"grad_norm": 9.875,
"learning_rate": 1.1473684210526316e-06,
"loss": 0.4488,
"step": 1891
},
{
"epoch": 2.9242658423493046,
"grad_norm": 11.3125,
"learning_rate": 1.136842105263158e-06,
"loss": 0.4885,
"step": 1892
},
{
"epoch": 2.9258114374034,
"grad_norm": 10.8125,
"learning_rate": 1.1263157894736842e-06,
"loss": 0.4429,
"step": 1893
},
{
"epoch": 2.927357032457496,
"grad_norm": 9.625,
"learning_rate": 1.1157894736842106e-06,
"loss": 0.4645,
"step": 1894
},
{
"epoch": 2.928902627511592,
"grad_norm": 11.125,
"learning_rate": 1.1052631578947369e-06,
"loss": 0.3797,
"step": 1895
},
{
"epoch": 2.930448222565688,
"grad_norm": 11.8125,
"learning_rate": 1.0947368421052632e-06,
"loss": 0.4121,
"step": 1896
},
{
"epoch": 2.9319938176197837,
"grad_norm": 10.6875,
"learning_rate": 1.0842105263157895e-06,
"loss": 0.4085,
"step": 1897
},
{
"epoch": 2.9335394126738796,
"grad_norm": 12.5,
"learning_rate": 1.0736842105263159e-06,
"loss": 0.3847,
"step": 1898
},
{
"epoch": 2.9350850077279755,
"grad_norm": 12.75,
"learning_rate": 1.0631578947368422e-06,
"loss": 0.4126,
"step": 1899
},
{
"epoch": 2.936630602782071,
"grad_norm": 10.75,
"learning_rate": 1.0526315789473685e-06,
"loss": 0.4209,
"step": 1900
},
{
"epoch": 2.938176197836167,
"grad_norm": 12.125,
"learning_rate": 1.0421052631578949e-06,
"loss": 0.4829,
"step": 1901
},
{
"epoch": 2.939721792890263,
"grad_norm": 12.875,
"learning_rate": 1.0315789473684212e-06,
"loss": 0.4663,
"step": 1902
},
{
"epoch": 2.9412673879443587,
"grad_norm": 13.0625,
"learning_rate": 1.0210526315789475e-06,
"loss": 0.4614,
"step": 1903
},
{
"epoch": 2.942812982998454,
"grad_norm": 10.375,
"learning_rate": 1.0105263157894738e-06,
"loss": 0.4151,
"step": 1904
},
{
"epoch": 2.94435857805255,
"grad_norm": 9.75,
"learning_rate": 1.0000000000000002e-06,
"loss": 0.4042,
"step": 1905
},
{
"epoch": 2.945904173106646,
"grad_norm": 9.4375,
"learning_rate": 9.894736842105265e-07,
"loss": 0.4206,
"step": 1906
},
{
"epoch": 2.947449768160742,
"grad_norm": 14.8125,
"learning_rate": 9.789473684210526e-07,
"loss": 0.4037,
"step": 1907
},
{
"epoch": 2.948995363214838,
"grad_norm": 9.75,
"learning_rate": 9.68421052631579e-07,
"loss": 0.3797,
"step": 1908
},
{
"epoch": 2.9505409582689337,
"grad_norm": 9.1875,
"learning_rate": 9.578947368421053e-07,
"loss": 0.3993,
"step": 1909
},
{
"epoch": 2.9520865533230296,
"grad_norm": 11.5,
"learning_rate": 9.473684210526317e-07,
"loss": 0.4062,
"step": 1910
},
{
"epoch": 2.953632148377125,
"grad_norm": 9.6875,
"learning_rate": 9.368421052631579e-07,
"loss": 0.3637,
"step": 1911
},
{
"epoch": 2.955177743431221,
"grad_norm": 9.1875,
"learning_rate": 9.263157894736844e-07,
"loss": 0.3892,
"step": 1912
},
{
"epoch": 2.956723338485317,
"grad_norm": 9.375,
"learning_rate": 9.157894736842106e-07,
"loss": 0.4085,
"step": 1913
},
{
"epoch": 2.958268933539413,
"grad_norm": 10.0625,
"learning_rate": 9.05263157894737e-07,
"loss": 0.4408,
"step": 1914
},
{
"epoch": 2.9598145285935082,
"grad_norm": 10.5,
"learning_rate": 8.947368421052632e-07,
"loss": 0.3937,
"step": 1915
},
{
"epoch": 2.961360123647604,
"grad_norm": 10.0625,
"learning_rate": 8.842105263157895e-07,
"loss": 0.43,
"step": 1916
},
{
"epoch": 2.9629057187017,
"grad_norm": 9.75,
"learning_rate": 8.736842105263159e-07,
"loss": 0.3885,
"step": 1917
},
{
"epoch": 2.964451313755796,
"grad_norm": 13.875,
"learning_rate": 8.631578947368421e-07,
"loss": 0.3249,
"step": 1918
},
{
"epoch": 2.965996908809892,
"grad_norm": 9.9375,
"learning_rate": 8.526315789473685e-07,
"loss": 0.4174,
"step": 1919
},
{
"epoch": 2.9675425038639878,
"grad_norm": 10.5625,
"learning_rate": 8.421052631578948e-07,
"loss": 0.3626,
"step": 1920
},
{
"epoch": 2.9690880989180837,
"grad_norm": 9.625,
"learning_rate": 8.315789473684212e-07,
"loss": 0.3434,
"step": 1921
},
{
"epoch": 2.970633693972179,
"grad_norm": 9.6875,
"learning_rate": 8.210526315789474e-07,
"loss": 0.3301,
"step": 1922
},
{
"epoch": 2.972179289026275,
"grad_norm": 10.3125,
"learning_rate": 8.105263157894736e-07,
"loss": 0.3664,
"step": 1923
},
{
"epoch": 2.973724884080371,
"grad_norm": 11.625,
"learning_rate": 8.000000000000001e-07,
"loss": 0.3901,
"step": 1924
},
{
"epoch": 2.975270479134467,
"grad_norm": 10.375,
"learning_rate": 7.894736842105263e-07,
"loss": 0.3904,
"step": 1925
},
{
"epoch": 2.9768160741885628,
"grad_norm": 9.875,
"learning_rate": 7.789473684210527e-07,
"loss": 0.4489,
"step": 1926
},
{
"epoch": 2.978361669242658,
"grad_norm": 11.0625,
"learning_rate": 7.684210526315789e-07,
"loss": 0.3721,
"step": 1927
},
{
"epoch": 2.979907264296754,
"grad_norm": 17.875,
"learning_rate": 7.578947368421054e-07,
"loss": 0.3797,
"step": 1928
},
{
"epoch": 2.98145285935085,
"grad_norm": 10.6875,
"learning_rate": 7.473684210526316e-07,
"loss": 0.4435,
"step": 1929
},
{
"epoch": 2.982998454404946,
"grad_norm": 9.8125,
"learning_rate": 7.368421052631579e-07,
"loss": 0.3561,
"step": 1930
},
{
"epoch": 2.984544049459042,
"grad_norm": 9.3125,
"learning_rate": 7.263157894736843e-07,
"loss": 0.3817,
"step": 1931
},
{
"epoch": 2.9860896445131377,
"grad_norm": 9.875,
"learning_rate": 7.157894736842106e-07,
"loss": 0.4161,
"step": 1932
},
{
"epoch": 2.9876352395672336,
"grad_norm": 10.5,
"learning_rate": 7.052631578947369e-07,
"loss": 0.3882,
"step": 1933
},
{
"epoch": 2.989180834621329,
"grad_norm": 9.3125,
"learning_rate": 6.947368421052631e-07,
"loss": 0.3813,
"step": 1934
},
{
"epoch": 2.990726429675425,
"grad_norm": 10.75,
"learning_rate": 6.842105263157896e-07,
"loss": 0.4273,
"step": 1935
},
{
"epoch": 2.992272024729521,
"grad_norm": 11.25,
"learning_rate": 6.736842105263158e-07,
"loss": 0.4464,
"step": 1936
},
{
"epoch": 2.993817619783617,
"grad_norm": 9.75,
"learning_rate": 6.631578947368422e-07,
"loss": 0.4175,
"step": 1937
},
{
"epoch": 2.9953632148377123,
"grad_norm": 10.625,
"learning_rate": 6.526315789473684e-07,
"loss": 0.4584,
"step": 1938
},
{
"epoch": 2.996908809891808,
"grad_norm": 9.875,
"learning_rate": 6.421052631578948e-07,
"loss": 0.4013,
"step": 1939
},
{
"epoch": 2.998454404945904,
"grad_norm": 8.6875,
"learning_rate": 6.315789473684211e-07,
"loss": 0.4026,
"step": 1940
},
{
"epoch": 3.0,
"grad_norm": 8.625,
"learning_rate": 6.210526315789474e-07,
"loss": 0.3594,
"step": 1941
},
{
"epoch": 3.001545595054096,
"grad_norm": 9.25,
"learning_rate": 6.105263157894738e-07,
"loss": 0.3907,
"step": 1942
},
{
"epoch": 3.003091190108192,
"grad_norm": 8.5,
"learning_rate": 6.000000000000001e-07,
"loss": 0.3545,
"step": 1943
},
{
"epoch": 3.0046367851622873,
"grad_norm": 10.375,
"learning_rate": 5.894736842105263e-07,
"loss": 0.3784,
"step": 1944
},
{
"epoch": 3.006182380216383,
"grad_norm": 10.1875,
"learning_rate": 5.789473684210526e-07,
"loss": 0.3808,
"step": 1945
},
{
"epoch": 3.007727975270479,
"grad_norm": 14.125,
"learning_rate": 5.68421052631579e-07,
"loss": 0.4312,
"step": 1946
},
{
"epoch": 3.009273570324575,
"grad_norm": 10.5,
"learning_rate": 5.578947368421053e-07,
"loss": 0.3974,
"step": 1947
},
{
"epoch": 3.010819165378671,
"grad_norm": 28.125,
"learning_rate": 5.473684210526316e-07,
"loss": 0.4432,
"step": 1948
},
{
"epoch": 3.012364760432767,
"grad_norm": 9.0625,
"learning_rate": 5.368421052631579e-07,
"loss": 0.3728,
"step": 1949
},
{
"epoch": 3.0139103554868623,
"grad_norm": 9.6875,
"learning_rate": 5.263157894736843e-07,
"loss": 0.3761,
"step": 1950
},
{
"epoch": 3.015455950540958,
"grad_norm": 9.4375,
"learning_rate": 5.157894736842106e-07,
"loss": 0.3817,
"step": 1951
},
{
"epoch": 3.017001545595054,
"grad_norm": 8.875,
"learning_rate": 5.052631578947369e-07,
"loss": 0.3743,
"step": 1952
},
{
"epoch": 3.01854714064915,
"grad_norm": 10.3125,
"learning_rate": 4.947368421052632e-07,
"loss": 0.4283,
"step": 1953
},
{
"epoch": 3.020092735703246,
"grad_norm": 8.9375,
"learning_rate": 4.842105263157895e-07,
"loss": 0.4342,
"step": 1954
},
{
"epoch": 3.021638330757342,
"grad_norm": 9.6875,
"learning_rate": 4.7368421052631585e-07,
"loss": 0.3495,
"step": 1955
},
{
"epoch": 3.0231839258114372,
"grad_norm": 12.5625,
"learning_rate": 4.631578947368422e-07,
"loss": 0.4606,
"step": 1956
},
{
"epoch": 3.024729520865533,
"grad_norm": 11.75,
"learning_rate": 4.526315789473685e-07,
"loss": 0.3993,
"step": 1957
},
{
"epoch": 3.026275115919629,
"grad_norm": 11.375,
"learning_rate": 4.421052631578947e-07,
"loss": 0.4741,
"step": 1958
},
{
"epoch": 3.027820710973725,
"grad_norm": 9.3125,
"learning_rate": 4.3157894736842105e-07,
"loss": 0.381,
"step": 1959
},
{
"epoch": 3.029366306027821,
"grad_norm": 9.25,
"learning_rate": 4.210526315789474e-07,
"loss": 0.3736,
"step": 1960
},
{
"epoch": 3.0309119010819163,
"grad_norm": 11.75,
"learning_rate": 4.105263157894737e-07,
"loss": 0.4511,
"step": 1961
},
{
"epoch": 3.0324574961360122,
"grad_norm": 10.5,
"learning_rate": 4.0000000000000003e-07,
"loss": 0.4024,
"step": 1962
},
{
"epoch": 3.034003091190108,
"grad_norm": 12.875,
"learning_rate": 3.8947368421052636e-07,
"loss": 0.4278,
"step": 1963
},
{
"epoch": 3.035548686244204,
"grad_norm": 10.0625,
"learning_rate": 3.789473684210527e-07,
"loss": 0.3991,
"step": 1964
},
{
"epoch": 3.0370942812983,
"grad_norm": 9.875,
"learning_rate": 3.6842105263157896e-07,
"loss": 0.4595,
"step": 1965
},
{
"epoch": 3.038639876352396,
"grad_norm": 8.875,
"learning_rate": 3.578947368421053e-07,
"loss": 0.3653,
"step": 1966
},
{
"epoch": 3.0401854714064913,
"grad_norm": 9.8125,
"learning_rate": 3.4736842105263157e-07,
"loss": 0.408,
"step": 1967
},
{
"epoch": 3.041731066460587,
"grad_norm": 9.6875,
"learning_rate": 3.368421052631579e-07,
"loss": 0.3697,
"step": 1968
},
{
"epoch": 3.043276661514683,
"grad_norm": 9.0,
"learning_rate": 3.263157894736842e-07,
"loss": 0.4205,
"step": 1969
},
{
"epoch": 3.044822256568779,
"grad_norm": 9.625,
"learning_rate": 3.1578947368421055e-07,
"loss": 0.3571,
"step": 1970
},
{
"epoch": 3.046367851622875,
"grad_norm": 11.5625,
"learning_rate": 3.052631578947369e-07,
"loss": 0.4104,
"step": 1971
},
{
"epoch": 3.047913446676971,
"grad_norm": 9.875,
"learning_rate": 2.9473684210526315e-07,
"loss": 0.3788,
"step": 1972
},
{
"epoch": 3.0494590417310663,
"grad_norm": 9.4375,
"learning_rate": 2.842105263157895e-07,
"loss": 0.4585,
"step": 1973
},
{
"epoch": 3.051004636785162,
"grad_norm": 10.0,
"learning_rate": 2.736842105263158e-07,
"loss": 0.4223,
"step": 1974
},
{
"epoch": 3.052550231839258,
"grad_norm": 9.6875,
"learning_rate": 2.6315789473684213e-07,
"loss": 0.3643,
"step": 1975
},
{
"epoch": 3.054095826893354,
"grad_norm": 9.6875,
"learning_rate": 2.5263157894736846e-07,
"loss": 0.4294,
"step": 1976
},
{
"epoch": 3.05564142194745,
"grad_norm": 7.9375,
"learning_rate": 2.4210526315789473e-07,
"loss": 0.3805,
"step": 1977
},
{
"epoch": 3.0571870170015454,
"grad_norm": 9.125,
"learning_rate": 2.315789473684211e-07,
"loss": 0.4199,
"step": 1978
},
{
"epoch": 3.0587326120556413,
"grad_norm": 11.1875,
"learning_rate": 2.2105263157894736e-07,
"loss": 0.471,
"step": 1979
},
{
"epoch": 3.060278207109737,
"grad_norm": 13.5,
"learning_rate": 2.105263157894737e-07,
"loss": 0.4051,
"step": 1980
},
{
"epoch": 3.061823802163833,
"grad_norm": 10.9375,
"learning_rate": 2.0000000000000002e-07,
"loss": 0.3503,
"step": 1981
},
{
"epoch": 3.063369397217929,
"grad_norm": 11.25,
"learning_rate": 1.8947368421052634e-07,
"loss": 0.3959,
"step": 1982
},
{
"epoch": 3.064914992272025,
"grad_norm": 9.5625,
"learning_rate": 1.7894736842105265e-07,
"loss": 0.3787,
"step": 1983
},
{
"epoch": 3.0664605873261204,
"grad_norm": 10.0,
"learning_rate": 1.6842105263157895e-07,
"loss": 0.3268,
"step": 1984
},
{
"epoch": 3.0680061823802163,
"grad_norm": 9.8125,
"learning_rate": 1.5789473684210527e-07,
"loss": 0.3751,
"step": 1985
},
{
"epoch": 3.069551777434312,
"grad_norm": 10.4375,
"learning_rate": 1.4736842105263158e-07,
"loss": 0.3919,
"step": 1986
},
{
"epoch": 3.071097372488408,
"grad_norm": 10.1875,
"learning_rate": 1.368421052631579e-07,
"loss": 0.3795,
"step": 1987
},
{
"epoch": 3.072642967542504,
"grad_norm": 9.125,
"learning_rate": 1.2631578947368423e-07,
"loss": 0.3608,
"step": 1988
},
{
"epoch": 3.0741885625966,
"grad_norm": 13.5,
"learning_rate": 1.1578947368421054e-07,
"loss": 0.4163,
"step": 1989
},
{
"epoch": 3.0757341576506954,
"grad_norm": 9.9375,
"learning_rate": 1.0526315789473685e-07,
"loss": 0.4502,
"step": 1990
},
{
"epoch": 3.0772797527047913,
"grad_norm": 10.125,
"learning_rate": 9.473684210526317e-08,
"loss": 0.4118,
"step": 1991
},
{
"epoch": 3.078825347758887,
"grad_norm": 11.25,
"learning_rate": 8.421052631578947e-08,
"loss": 0.3723,
"step": 1992
},
{
"epoch": 3.080370942812983,
"grad_norm": 9.4375,
"learning_rate": 7.368421052631579e-08,
"loss": 0.4009,
"step": 1993
},
{
"epoch": 3.081916537867079,
"grad_norm": 8.9375,
"learning_rate": 6.315789473684211e-08,
"loss": 0.3519,
"step": 1994
},
{
"epoch": 3.0834621329211744,
"grad_norm": 9.625,
"learning_rate": 5.263157894736842e-08,
"loss": 0.3874,
"step": 1995
},
{
"epoch": 3.0850077279752703,
"grad_norm": 11.375,
"learning_rate": 4.2105263157894737e-08,
"loss": 0.4415,
"step": 1996
},
{
"epoch": 3.0865533230293662,
"grad_norm": 10.0,
"learning_rate": 3.157894736842106e-08,
"loss": 0.3404,
"step": 1997
},
{
"epoch": 3.088098918083462,
"grad_norm": 10.9375,
"learning_rate": 2.1052631578947368e-08,
"loss": 0.3715,
"step": 1998
},
{
"epoch": 3.089644513137558,
"grad_norm": 9.4375,
"learning_rate": 1.0526315789473684e-08,
"loss": 0.3938,
"step": 1999
},
{
"epoch": 3.091190108191654,
"grad_norm": 8.8125,
"learning_rate": 0.0,
"loss": 0.3763,
"step": 2000
}
],
"logging_steps": 1,
"max_steps": 2000,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 128,
"trial_name": null,
"trial_params": null
}