{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 2077,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00048155059290916753,
"grad_norm": 9.47897720336914,
"learning_rate": 0.0,
"loss": 2.5979,
"step": 1
},
{
"epoch": 0.0009631011858183351,
"grad_norm": 13.405498504638672,
"learning_rate": 3.1746031746031746e-06,
"loss": 2.8498,
"step": 2
},
{
"epoch": 0.0014446517787275025,
"grad_norm": 6.596500873565674,
"learning_rate": 6.349206349206349e-06,
"loss": 1.8474,
"step": 3
},
{
"epoch": 0.0019262023716366701,
"grad_norm": 8.559901237487793,
"learning_rate": 9.523809523809523e-06,
"loss": 2.3071,
"step": 4
},
{
"epoch": 0.0024077529645458377,
"grad_norm": 7.220107555389404,
"learning_rate": 1.2698412698412699e-05,
"loss": 2.4772,
"step": 5
},
{
"epoch": 0.002889303557455005,
"grad_norm": 9.46921157836914,
"learning_rate": 1.5873015873015872e-05,
"loss": 2.0344,
"step": 6
},
{
"epoch": 0.0033708541503641725,
"grad_norm": 7.574646949768066,
"learning_rate": 1.9047619047619046e-05,
"loss": 2.9056,
"step": 7
},
{
"epoch": 0.0038524047432733403,
"grad_norm": 10.66491985321045,
"learning_rate": 2.2222222222222223e-05,
"loss": 2.5144,
"step": 8
},
{
"epoch": 0.004333955336182508,
"grad_norm": 5.920387268066406,
"learning_rate": 2.5396825396825397e-05,
"loss": 2.1156,
"step": 9
},
{
"epoch": 0.004815505929091675,
"grad_norm": 5.609206676483154,
"learning_rate": 2.857142857142857e-05,
"loss": 2.2243,
"step": 10
},
{
"epoch": 0.005297056522000842,
"grad_norm": 6.140780448913574,
"learning_rate": 3.1746031746031745e-05,
"loss": 2.0492,
"step": 11
},
{
"epoch": 0.00577860711491001,
"grad_norm": 6.842236042022705,
"learning_rate": 3.492063492063492e-05,
"loss": 1.2736,
"step": 12
},
{
"epoch": 0.006260157707819178,
"grad_norm": 9.14393424987793,
"learning_rate": 3.809523809523809e-05,
"loss": 1.8261,
"step": 13
},
{
"epoch": 0.006741708300728345,
"grad_norm": 8.761028289794922,
"learning_rate": 4.126984126984127e-05,
"loss": 1.5156,
"step": 14
},
{
"epoch": 0.007223258893637513,
"grad_norm": 7.065422534942627,
"learning_rate": 4.4444444444444447e-05,
"loss": 2.0811,
"step": 15
},
{
"epoch": 0.0077048094865466805,
"grad_norm": 5.040136814117432,
"learning_rate": 4.761904761904762e-05,
"loss": 1.8507,
"step": 16
},
{
"epoch": 0.008186360079455848,
"grad_norm": 5.030274868011475,
"learning_rate": 5.0793650793650794e-05,
"loss": 1.3307,
"step": 17
},
{
"epoch": 0.008667910672365015,
"grad_norm": 6.842209339141846,
"learning_rate": 5.396825396825397e-05,
"loss": 1.6775,
"step": 18
},
{
"epoch": 0.009149461265274182,
"grad_norm": 3.107877016067505,
"learning_rate": 5.714285714285714e-05,
"loss": 1.6994,
"step": 19
},
{
"epoch": 0.00963101185818335,
"grad_norm": 8.357791900634766,
"learning_rate": 6.0317460317460316e-05,
"loss": 2.017,
"step": 20
},
{
"epoch": 0.010112562451092518,
"grad_norm": 5.375106334686279,
"learning_rate": 6.349206349206349e-05,
"loss": 1.2195,
"step": 21
},
{
"epoch": 0.010594113044001685,
"grad_norm": 8.254910469055176,
"learning_rate": 6.666666666666667e-05,
"loss": 1.7287,
"step": 22
},
{
"epoch": 0.011075663636910853,
"grad_norm": 1.862160086631775,
"learning_rate": 6.984126984126984e-05,
"loss": 0.9883,
"step": 23
},
{
"epoch": 0.01155721422982002,
"grad_norm": 4.388360500335693,
"learning_rate": 7.301587301587302e-05,
"loss": 0.8999,
"step": 24
},
{
"epoch": 0.012038764822729187,
"grad_norm": 3.909996509552002,
"learning_rate": 7.619047619047618e-05,
"loss": 1.0851,
"step": 25
},
{
"epoch": 0.012520315415638356,
"grad_norm": 6.695411205291748,
"learning_rate": 7.936507936507937e-05,
"loss": 1.3302,
"step": 26
},
{
"epoch": 0.013001866008547523,
"grad_norm": 5.960219383239746,
"learning_rate": 8.253968253968255e-05,
"loss": 1.111,
"step": 27
},
{
"epoch": 0.01348341660145669,
"grad_norm": 4.400773525238037,
"learning_rate": 8.571428571428571e-05,
"loss": 0.7887,
"step": 28
},
{
"epoch": 0.013964967194365858,
"grad_norm": 5.381070137023926,
"learning_rate": 8.888888888888889e-05,
"loss": 2.0415,
"step": 29
},
{
"epoch": 0.014446517787275025,
"grad_norm": 2.605618715286255,
"learning_rate": 9.206349206349206e-05,
"loss": 1.0637,
"step": 30
},
{
"epoch": 0.014928068380184192,
"grad_norm": 4.758686065673828,
"learning_rate": 9.523809523809524e-05,
"loss": 0.815,
"step": 31
},
{
"epoch": 0.015409618973093361,
"grad_norm": 3.952995777130127,
"learning_rate": 9.841269841269841e-05,
"loss": 1.5256,
"step": 32
},
{
"epoch": 0.01589116956600253,
"grad_norm": 6.927408695220947,
"learning_rate": 0.00010158730158730159,
"loss": 1.2863,
"step": 33
},
{
"epoch": 0.016372720158911697,
"grad_norm": 3.77854323387146,
"learning_rate": 0.00010476190476190477,
"loss": 1.2303,
"step": 34
},
{
"epoch": 0.016854270751820864,
"grad_norm": 3.4671757221221924,
"learning_rate": 0.00010793650793650794,
"loss": 0.8535,
"step": 35
},
{
"epoch": 0.01733582134473003,
"grad_norm": 3.3394312858581543,
"learning_rate": 0.00011111111111111112,
"loss": 1.7921,
"step": 36
},
{
"epoch": 0.017817371937639197,
"grad_norm": 3.3127732276916504,
"learning_rate": 0.00011428571428571428,
"loss": 1.0765,
"step": 37
},
{
"epoch": 0.018298922530548364,
"grad_norm": 3.850238800048828,
"learning_rate": 0.00011746031746031746,
"loss": 1.2267,
"step": 38
},
{
"epoch": 0.018780473123457535,
"grad_norm": 2.822495460510254,
"learning_rate": 0.00012063492063492063,
"loss": 0.9256,
"step": 39
},
{
"epoch": 0.0192620237163667,
"grad_norm": 3.967958688735962,
"learning_rate": 0.0001238095238095238,
"loss": 1.9019,
"step": 40
},
{
"epoch": 0.01974357430927587,
"grad_norm": 3.5611379146575928,
"learning_rate": 0.00012698412698412698,
"loss": 0.8901,
"step": 41
},
{
"epoch": 0.020225124902185036,
"grad_norm": 40.8494758605957,
"learning_rate": 0.00013015873015873017,
"loss": 1.0527,
"step": 42
},
{
"epoch": 0.020706675495094203,
"grad_norm": 3.99414324760437,
"learning_rate": 0.00013333333333333334,
"loss": 1.1458,
"step": 43
},
{
"epoch": 0.02118822608800337,
"grad_norm": 3.9491536617279053,
"learning_rate": 0.0001365079365079365,
"loss": 1.125,
"step": 44
},
{
"epoch": 0.02166977668091254,
"grad_norm": 2.9987173080444336,
"learning_rate": 0.00013968253968253967,
"loss": 1.0546,
"step": 45
},
{
"epoch": 0.022151327273821707,
"grad_norm": 3.976832628250122,
"learning_rate": 0.00014285714285714287,
"loss": 1.1407,
"step": 46
},
{
"epoch": 0.022632877866730874,
"grad_norm": 10.798648834228516,
"learning_rate": 0.00014603174603174603,
"loss": 0.6169,
"step": 47
},
{
"epoch": 0.02311442845964004,
"grad_norm": 3.4334845542907715,
"learning_rate": 0.00014920634920634923,
"loss": 1.1717,
"step": 48
},
{
"epoch": 0.023595979052549208,
"grad_norm": 4.682596206665039,
"learning_rate": 0.00015238095238095237,
"loss": 0.9471,
"step": 49
},
{
"epoch": 0.024077529645458375,
"grad_norm": 6.495236396789551,
"learning_rate": 0.00015555555555555556,
"loss": 1.2659,
"step": 50
},
{
"epoch": 0.024559080238367545,
"grad_norm": 4.563192367553711,
"learning_rate": 0.00015873015873015873,
"loss": 1.5277,
"step": 51
},
{
"epoch": 0.025040630831276712,
"grad_norm": 2.5679526329040527,
"learning_rate": 0.00016190476190476192,
"loss": 1.5084,
"step": 52
},
{
"epoch": 0.02552218142418588,
"grad_norm": 3.2736315727233887,
"learning_rate": 0.0001650793650793651,
"loss": 0.925,
"step": 53
},
{
"epoch": 0.026003732017095046,
"grad_norm": 3.326249837875366,
"learning_rate": 0.00016825396825396826,
"loss": 0.7348,
"step": 54
},
{
"epoch": 0.026485282610004213,
"grad_norm": 4.137823104858398,
"learning_rate": 0.00017142857142857143,
"loss": 1.5321,
"step": 55
},
{
"epoch": 0.02696683320291338,
"grad_norm": 5.634565353393555,
"learning_rate": 0.00017460317460317462,
"loss": 0.8261,
"step": 56
},
{
"epoch": 0.02744838379582255,
"grad_norm": 5.409355163574219,
"learning_rate": 0.00017777777777777779,
"loss": 1.1813,
"step": 57
},
{
"epoch": 0.027929934388731717,
"grad_norm": 3.174149513244629,
"learning_rate": 0.00018095238095238095,
"loss": 0.979,
"step": 58
},
{
"epoch": 0.028411484981640884,
"grad_norm": 3.773308038711548,
"learning_rate": 0.00018412698412698412,
"loss": 0.9456,
"step": 59
},
{
"epoch": 0.02889303557455005,
"grad_norm": 4.3084306716918945,
"learning_rate": 0.00018730158730158731,
"loss": 1.0249,
"step": 60
},
{
"epoch": 0.029374586167459218,
"grad_norm": 1.6852308511734009,
"learning_rate": 0.00019047619047619048,
"loss": 0.9553,
"step": 61
},
{
"epoch": 0.029856136760368385,
"grad_norm": 4.570802211761475,
"learning_rate": 0.00019365079365079365,
"loss": 0.8408,
"step": 62
},
{
"epoch": 0.030337687353277555,
"grad_norm": 4.909348011016846,
"learning_rate": 0.00019682539682539682,
"loss": 1.1777,
"step": 63
},
{
"epoch": 0.030819237946186722,
"grad_norm": 3.654968500137329,
"learning_rate": 0.0002,
"loss": 1.04,
"step": 64
},
{
"epoch": 0.03130078853909589,
"grad_norm": 3.1528286933898926,
"learning_rate": 0.00019999987833918285,
"loss": 0.824,
"step": 65
},
{
"epoch": 0.03178233913200506,
"grad_norm": 4.7957024574279785,
"learning_rate": 0.00019999951335702735,
"loss": 1.0617,
"step": 66
},
{
"epoch": 0.03226388972491422,
"grad_norm": 3.9064583778381348,
"learning_rate": 0.00019999890505442158,
"loss": 0.854,
"step": 67
},
{
"epoch": 0.03274544031782339,
"grad_norm": 4.40252161026001,
"learning_rate": 0.0001999980534328457,
"loss": 1.219,
"step": 68
},
{
"epoch": 0.03322699091073256,
"grad_norm": 10.936585426330566,
"learning_rate": 0.0001999969584943719,
"loss": 1.3314,
"step": 69
},
{
"epoch": 0.03370854150364173,
"grad_norm": 5.040441036224365,
"learning_rate": 0.00019999562024166438,
"loss": 0.7201,
"step": 70
},
{
"epoch": 0.03419009209655089,
"grad_norm": 3.9674477577209473,
"learning_rate": 0.0001999940386779794,
"loss": 0.6603,
"step": 71
},
{
"epoch": 0.03467164268946006,
"grad_norm": 2.814729928970337,
"learning_rate": 0.00019999221380716527,
"loss": 0.829,
"step": 72
},
{
"epoch": 0.03515319328236923,
"grad_norm": 4.869168281555176,
"learning_rate": 0.00019999014563366226,
"loss": 1.014,
"step": 73
},
{
"epoch": 0.035634743875278395,
"grad_norm": 6.23966121673584,
"learning_rate": 0.00019998783416250268,
"loss": 1.0555,
"step": 74
},
{
"epoch": 0.036116294468187565,
"grad_norm": 4.9502458572387695,
"learning_rate": 0.0001999852793993109,
"loss": 1.2713,
"step": 75
},
{
"epoch": 0.03659784506109673,
"grad_norm": 4.012925148010254,
"learning_rate": 0.00019998248135030315,
"loss": 1.0993,
"step": 76
},
{
"epoch": 0.0370793956540059,
"grad_norm": 7.407429218292236,
"learning_rate": 0.00019997944002228774,
"loss": 1.2082,
"step": 77
},
{
"epoch": 0.03756094624691507,
"grad_norm": 6.095324993133545,
"learning_rate": 0.00019997615542266482,
"loss": 0.9925,
"step": 78
},
{
"epoch": 0.03804249683982423,
"grad_norm": 3.40785813331604,
"learning_rate": 0.00019997262755942655,
"loss": 0.7746,
"step": 79
},
{
"epoch": 0.0385240474327334,
"grad_norm": 9.24764633178711,
"learning_rate": 0.000199968856441157,
"loss": 0.7769,
"step": 80
},
{
"epoch": 0.03900559802564257,
"grad_norm": 7.706933975219727,
"learning_rate": 0.0001999648420770321,
"loss": 1.0516,
"step": 81
},
{
"epoch": 0.03948714861855174,
"grad_norm": 3.0405354499816895,
"learning_rate": 0.0001999605844768197,
"loss": 1.1086,
"step": 82
},
{
"epoch": 0.0399686992114609,
"grad_norm": 5.7575225830078125,
"learning_rate": 0.00019995608365087946,
"loss": 1.1402,
"step": 83
},
{
"epoch": 0.04045024980437007,
"grad_norm": 4.292261600494385,
"learning_rate": 0.0001999513396101628,
"loss": 1.0188,
"step": 84
},
{
"epoch": 0.04093180039727924,
"grad_norm": 4.21382474899292,
"learning_rate": 0.00019994635236621306,
"loss": 0.82,
"step": 85
},
{
"epoch": 0.041413350990188405,
"grad_norm": 2.9905776977539062,
"learning_rate": 0.00019994112193116528,
"loss": 1.0823,
"step": 86
},
{
"epoch": 0.041894901583097575,
"grad_norm": 2.917043685913086,
"learning_rate": 0.00019993564831774618,
"loss": 0.9047,
"step": 87
},
{
"epoch": 0.04237645217600674,
"grad_norm": 2.6178548336029053,
"learning_rate": 0.00019992993153927432,
"loss": 0.8579,
"step": 88
},
{
"epoch": 0.04285800276891591,
"grad_norm": 5.876251220703125,
"learning_rate": 0.00019992397160965982,
"loss": 0.8726,
"step": 89
},
{
"epoch": 0.04333955336182508,
"grad_norm": 15.610270500183105,
"learning_rate": 0.0001999177685434045,
"loss": 1.3112,
"step": 90
},
{
"epoch": 0.04382110395473424,
"grad_norm": 3.1741201877593994,
"learning_rate": 0.00019991132235560176,
"loss": 1.0837,
"step": 91
},
{
"epoch": 0.044302654547643414,
"grad_norm": 3.8556501865386963,
"learning_rate": 0.00019990463306193652,
"loss": 0.947,
"step": 92
},
{
"epoch": 0.04478420514055258,
"grad_norm": 4.010826110839844,
"learning_rate": 0.00019989770067868533,
"loss": 0.726,
"step": 93
},
{
"epoch": 0.04526575573346175,
"grad_norm": 3.4739131927490234,
"learning_rate": 0.00019989052522271622,
"loss": 0.9653,
"step": 94
},
{
"epoch": 0.04574730632637091,
"grad_norm": 2.507850170135498,
"learning_rate": 0.00019988310671148848,
"loss": 0.8077,
"step": 95
},
{
"epoch": 0.04622885691928008,
"grad_norm": 3.4519567489624023,
"learning_rate": 0.00019987544516305311,
"loss": 0.897,
"step": 96
},
{
"epoch": 0.04671040751218925,
"grad_norm": 3.6631219387054443,
"learning_rate": 0.00019986754059605222,
"loss": 0.912,
"step": 97
},
{
"epoch": 0.047191958105098415,
"grad_norm": 3.770810842514038,
"learning_rate": 0.00019985939302971938,
"loss": 0.8759,
"step": 98
},
{
"epoch": 0.047673508698007586,
"grad_norm": 5.926509857177734,
"learning_rate": 0.00019985100248387933,
"loss": 1.2971,
"step": 99
},
{
"epoch": 0.04815505929091675,
"grad_norm": 3.8625011444091797,
"learning_rate": 0.00019984236897894816,
"loss": 0.5941,
"step": 100
},
{
"epoch": 0.04863660988382592,
"grad_norm": 5.259921073913574,
"learning_rate": 0.000199833492535933,
"loss": 0.8503,
"step": 101
},
{
"epoch": 0.04911816047673509,
"grad_norm": 4.988558769226074,
"learning_rate": 0.00019982437317643217,
"loss": 1.1631,
"step": 102
},
{
"epoch": 0.04959971106964425,
"grad_norm": 7.225802898406982,
"learning_rate": 0.00019981501092263503,
"loss": 1.3616,
"step": 103
},
{
"epoch": 0.050081261662553424,
"grad_norm": 3.8858981132507324,
"learning_rate": 0.00019980540579732196,
"loss": 0.6591,
"step": 104
},
{
"epoch": 0.05056281225546259,
"grad_norm": 3.4471282958984375,
"learning_rate": 0.00019979555782386434,
"loss": 0.7594,
"step": 105
},
{
"epoch": 0.05104436284837176,
"grad_norm": 4.011205196380615,
"learning_rate": 0.00019978546702622443,
"loss": 1.0898,
"step": 106
},
{
"epoch": 0.05152591344128093,
"grad_norm": 3.6832756996154785,
"learning_rate": 0.00019977513342895532,
"loss": 0.6194,
"step": 107
},
{
"epoch": 0.05200746403419009,
"grad_norm": 6.4003376960754395,
"learning_rate": 0.00019976455705720083,
"loss": 0.7407,
"step": 108
},
{
"epoch": 0.05248901462709926,
"grad_norm": 2.832329034805298,
"learning_rate": 0.0001997537379366956,
"loss": 1.4024,
"step": 109
},
{
"epoch": 0.052970565220008425,
"grad_norm": 4.148341178894043,
"learning_rate": 0.00019974267609376494,
"loss": 1.3058,
"step": 110
},
{
"epoch": 0.053452115812917596,
"grad_norm": 6.119814872741699,
"learning_rate": 0.00019973137155532462,
"loss": 0.8581,
"step": 111
},
{
"epoch": 0.05393366640582676,
"grad_norm": 8.218791961669922,
"learning_rate": 0.00019971982434888107,
"loss": 0.9871,
"step": 112
},
{
"epoch": 0.05441521699873593,
"grad_norm": 2.336301326751709,
"learning_rate": 0.00019970803450253114,
"loss": 1.2561,
"step": 113
},
{
"epoch": 0.0548967675916451,
"grad_norm": 10.881673812866211,
"learning_rate": 0.0001996960020449621,
"loss": 1.3601,
"step": 114
},
{
"epoch": 0.055378318184554264,
"grad_norm": 1.6918381452560425,
"learning_rate": 0.00019968372700545145,
"loss": 0.9491,
"step": 115
},
{
"epoch": 0.055859868777463434,
"grad_norm": 3.8625855445861816,
"learning_rate": 0.00019967120941386709,
"loss": 1.3005,
"step": 116
},
{
"epoch": 0.0563414193703726,
"grad_norm": 4.829530715942383,
"learning_rate": 0.000199658449300667,
"loss": 0.9137,
"step": 117
},
{
"epoch": 0.05682296996328177,
"grad_norm": 6.8195719718933105,
"learning_rate": 0.00019964544669689928,
"loss": 1.6891,
"step": 118
},
{
"epoch": 0.05730452055619094,
"grad_norm": 4.785418510437012,
"learning_rate": 0.00019963220163420214,
"loss": 1.4208,
"step": 119
},
{
"epoch": 0.0577860711491001,
"grad_norm": 3.4616124629974365,
"learning_rate": 0.0001996187141448036,
"loss": 0.6869,
"step": 120
},
{
"epoch": 0.05826762174200927,
"grad_norm": 4.4392924308776855,
"learning_rate": 0.0001996049842615217,
"loss": 0.7986,
"step": 121
},
{
"epoch": 0.058749172334918436,
"grad_norm": 3.8779265880584717,
"learning_rate": 0.0001995910120177642,
"loss": 0.7087,
"step": 122
},
{
"epoch": 0.059230722927827606,
"grad_norm": 6.253561496734619,
"learning_rate": 0.00019957679744752859,
"loss": 1.2118,
"step": 123
},
{
"epoch": 0.05971227352073677,
"grad_norm": 5.026823043823242,
"learning_rate": 0.00019956234058540195,
"loss": 0.8501,
"step": 124
},
{
"epoch": 0.06019382411364594,
"grad_norm": 4.044717788696289,
"learning_rate": 0.00019954764146656105,
"loss": 0.5874,
"step": 125
},
{
"epoch": 0.06067537470655511,
"grad_norm": 3.3693103790283203,
"learning_rate": 0.00019953270012677195,
"loss": 0.5689,
"step": 126
},
{
"epoch": 0.061156925299464274,
"grad_norm": 3.7752881050109863,
"learning_rate": 0.00019951751660239015,
"loss": 0.9354,
"step": 127
},
{
"epoch": 0.061638475892373444,
"grad_norm": 4.32492208480835,
"learning_rate": 0.00019950209093036052,
"loss": 0.8582,
"step": 128
},
{
"epoch": 0.06212002648528261,
"grad_norm": 4.800130367279053,
"learning_rate": 0.000199486423148217,
"loss": 1.0017,
"step": 129
},
{
"epoch": 0.06260157707819178,
"grad_norm": 2.9463722705841064,
"learning_rate": 0.00019947051329408276,
"loss": 1.076,
"step": 130
},
{
"epoch": 0.06308312767110094,
"grad_norm": 1.6262086629867554,
"learning_rate": 0.00019945436140666981,
"loss": 1.2582,
"step": 131
},
{
"epoch": 0.06356467826401012,
"grad_norm": 7.490293979644775,
"learning_rate": 0.0001994379675252793,
"loss": 1.4111,
"step": 132
},
{
"epoch": 0.06404622885691928,
"grad_norm": 2.8741366863250732,
"learning_rate": 0.00019942133168980103,
"loss": 1.4064,
"step": 133
},
{
"epoch": 0.06452777944982845,
"grad_norm": 4.327383995056152,
"learning_rate": 0.00019940445394071355,
"loss": 1.0118,
"step": 134
},
{
"epoch": 0.06500933004273761,
"grad_norm": 2.9505062103271484,
"learning_rate": 0.0001993873343190842,
"loss": 0.9643,
"step": 135
},
{
"epoch": 0.06549088063564679,
"grad_norm": 3.9040603637695312,
"learning_rate": 0.00019936997286656855,
"loss": 1.075,
"step": 136
},
{
"epoch": 0.06597243122855595,
"grad_norm": 4.2163896560668945,
"learning_rate": 0.00019935236962541092,
"loss": 1.1182,
"step": 137
},
{
"epoch": 0.06645398182146511,
"grad_norm": 3.8160250186920166,
"learning_rate": 0.00019933452463844376,
"loss": 1.1374,
"step": 138
},
{
"epoch": 0.06693553241437429,
"grad_norm": 4.615425109863281,
"learning_rate": 0.00019931643794908772,
"loss": 0.7229,
"step": 139
},
{
"epoch": 0.06741708300728345,
"grad_norm": 5.407428741455078,
"learning_rate": 0.00019929810960135172,
"loss": 1.258,
"step": 140
},
{
"epoch": 0.06789863360019262,
"grad_norm": 2.296764612197876,
"learning_rate": 0.00019927953963983254,
"loss": 0.8528,
"step": 141
},
{
"epoch": 0.06838018419310178,
"grad_norm": 2.644909381866455,
"learning_rate": 0.00019926072810971492,
"loss": 1.2323,
"step": 142
},
{
"epoch": 0.06886173478601096,
"grad_norm": 3.273026943206787,
"learning_rate": 0.00019924167505677137,
"loss": 1.1116,
"step": 143
},
{
"epoch": 0.06934328537892012,
"grad_norm": 2.8930141925811768,
"learning_rate": 0.00019922238052736215,
"loss": 0.5674,
"step": 144
},
{
"epoch": 0.06982483597182929,
"grad_norm": 11.064878463745117,
"learning_rate": 0.00019920284456843498,
"loss": 1.054,
"step": 145
},
{
"epoch": 0.07030638656473846,
"grad_norm": 7.081421375274658,
"learning_rate": 0.00019918306722752505,
"loss": 1.2431,
"step": 146
},
{
"epoch": 0.07078793715764763,
"grad_norm": 7.764263153076172,
"learning_rate": 0.00019916304855275497,
"loss": 1.4613,
"step": 147
},
{
"epoch": 0.07126948775055679,
"grad_norm": 8.821840286254883,
"learning_rate": 0.00019914278859283445,
"loss": 1.0356,
"step": 148
},
{
"epoch": 0.07175103834346597,
"grad_norm": 4.817983150482178,
"learning_rate": 0.0001991222873970604,
"loss": 0.6992,
"step": 149
},
{
"epoch": 0.07223258893637513,
"grad_norm": 2.4728589057922363,
"learning_rate": 0.00019910154501531663,
"loss": 0.7112,
"step": 150
},
{
"epoch": 0.0727141395292843,
"grad_norm": 9.176413536071777,
"learning_rate": 0.0001990805614980739,
"loss": 0.8131,
"step": 151
},
{
"epoch": 0.07319569012219346,
"grad_norm": 4.006031036376953,
"learning_rate": 0.00019905933689638955,
"loss": 0.9721,
"step": 152
},
{
"epoch": 0.07367724071510263,
"grad_norm": 3.409881114959717,
"learning_rate": 0.00019903787126190772,
"loss": 0.9142,
"step": 153
},
{
"epoch": 0.0741587913080118,
"grad_norm": 4.4478373527526855,
"learning_rate": 0.00019901616464685888,
"loss": 0.9991,
"step": 154
},
{
"epoch": 0.07464034190092096,
"grad_norm": 5.569812774658203,
"learning_rate": 0.00019899421710405996,
"loss": 0.9713,
"step": 155
},
{
"epoch": 0.07512189249383014,
"grad_norm": 3.1611955165863037,
"learning_rate": 0.00019897202868691407,
"loss": 0.839,
"step": 156
},
{
"epoch": 0.0756034430867393,
"grad_norm": 3.547825574874878,
"learning_rate": 0.00019894959944941038,
"loss": 0.6568,
"step": 157
},
{
"epoch": 0.07608499367964847,
"grad_norm": 4.14447546005249,
"learning_rate": 0.0001989269294461242,
"loss": 0.7445,
"step": 158
},
{
"epoch": 0.07656654427255763,
"grad_norm": 3.5642924308776855,
"learning_rate": 0.0001989040187322164,
"loss": 1.1176,
"step": 159
},
{
"epoch": 0.0770480948654668,
"grad_norm": 3.2829089164733887,
"learning_rate": 0.00019888086736343384,
"loss": 1.2296,
"step": 160
},
{
"epoch": 0.07752964545837597,
"grad_norm": 3.4885025024414062,
"learning_rate": 0.0001988574753961087,
"loss": 0.827,
"step": 161
},
{
"epoch": 0.07801119605128513,
"grad_norm": 4.431697845458984,
"learning_rate": 0.00019883384288715874,
"loss": 0.9405,
"step": 162
},
{
"epoch": 0.07849274664419431,
"grad_norm": 4.44993782043457,
"learning_rate": 0.000198809969894087,
"loss": 0.9186,
"step": 163
},
{
"epoch": 0.07897429723710347,
"grad_norm": 1.9886109828948975,
"learning_rate": 0.0001987858564749816,
"loss": 1.2175,
"step": 164
},
{
"epoch": 0.07945584783001264,
"grad_norm": 1.904360055923462,
"learning_rate": 0.00019876150268851572,
"loss": 0.9003,
"step": 165
},
{
"epoch": 0.0799373984229218,
"grad_norm": 1.732532024383545,
"learning_rate": 0.00019873690859394737,
"loss": 0.8792,
"step": 166
},
{
"epoch": 0.08041894901583098,
"grad_norm": 5.374270439147949,
"learning_rate": 0.0001987120742511193,
"loss": 1.2873,
"step": 167
},
{
"epoch": 0.08090049960874014,
"grad_norm": 4.2816572189331055,
"learning_rate": 0.0001986869997204589,
"loss": 0.855,
"step": 168
},
{
"epoch": 0.0813820502016493,
"grad_norm": 2.6494359970092773,
"learning_rate": 0.00019866168506297788,
"loss": 0.9457,
"step": 169
},
{
"epoch": 0.08186360079455848,
"grad_norm": 4.345731258392334,
"learning_rate": 0.00019863613034027224,
"loss": 0.8582,
"step": 170
},
{
"epoch": 0.08234515138746765,
"grad_norm": 4.3999342918396,
"learning_rate": 0.00019861033561452223,
"loss": 0.7079,
"step": 171
},
{
"epoch": 0.08282670198037681,
"grad_norm": 4.363504409790039,
"learning_rate": 0.00019858430094849195,
"loss": 0.808,
"step": 172
},
{
"epoch": 0.08330825257328599,
"grad_norm": 2.4355199337005615,
"learning_rate": 0.0001985580264055294,
"loss": 0.833,
"step": 173
},
{
"epoch": 0.08378980316619515,
"grad_norm": 3.622019052505493,
"learning_rate": 0.00019853151204956616,
"loss": 0.8358,
"step": 174
},
{
"epoch": 0.08427135375910431,
"grad_norm": 4.48416805267334,
"learning_rate": 0.00019850475794511749,
"loss": 0.9484,
"step": 175
},
{
"epoch": 0.08475290435201348,
"grad_norm": 4.607672214508057,
"learning_rate": 0.00019847776415728185,
"loss": 0.7357,
"step": 176
},
{
"epoch": 0.08523445494492266,
"grad_norm": 2.3138136863708496,
"learning_rate": 0.000198450530751741,
"loss": 1.2186,
"step": 177
},
{
"epoch": 0.08571600553783182,
"grad_norm": 3.8758018016815186,
"learning_rate": 0.00019842305779475968,
"loss": 0.9932,
"step": 178
},
{
"epoch": 0.08619755613074098,
"grad_norm": 3.34867000579834,
"learning_rate": 0.00019839534535318558,
"loss": 1.1485,
"step": 179
},
{
"epoch": 0.08667910672365016,
"grad_norm": 1.9022514820098877,
"learning_rate": 0.00019836739349444899,
"loss": 0.7949,
"step": 180
},
{
"epoch": 0.08716065731655932,
"grad_norm": 3.0250067710876465,
"learning_rate": 0.00019833920228656292,
"loss": 0.9396,
"step": 181
},
{
"epoch": 0.08764220790946849,
"grad_norm": 3.734602689743042,
"learning_rate": 0.0001983107717981226,
"loss": 1.1539,
"step": 182
},
{
"epoch": 0.08812375850237765,
"grad_norm": 2.532639741897583,
"learning_rate": 0.00019828210209830562,
"loss": 0.836,
"step": 183
},
{
"epoch": 0.08860530909528683,
"grad_norm": 3.711615800857544,
"learning_rate": 0.00019825319325687154,
"loss": 0.9482,
"step": 184
},
{
"epoch": 0.08908685968819599,
"grad_norm": 4.490362167358398,
"learning_rate": 0.00019822404534416182,
"loss": 1.0918,
"step": 185
},
{
"epoch": 0.08956841028110515,
"grad_norm": 2.6988461017608643,
"learning_rate": 0.00019819465843109963,
"loss": 0.9532,
"step": 186
},
{
"epoch": 0.09004996087401433,
"grad_norm": 4.821238040924072,
"learning_rate": 0.00019816503258918969,
"loss": 0.6697,
"step": 187
},
{
"epoch": 0.0905315114669235,
"grad_norm": 5.357283592224121,
"learning_rate": 0.00019813516789051808,
"loss": 0.8587,
"step": 188
},
{
"epoch": 0.09101306205983266,
"grad_norm": 4.648171901702881,
"learning_rate": 0.0001981050644077521,
"loss": 1.11,
"step": 189
},
{
"epoch": 0.09149461265274182,
"grad_norm": 3.6666064262390137,
"learning_rate": 0.00019807472221414002,
"loss": 0.6605,
"step": 190
},
{
"epoch": 0.091976163245651,
"grad_norm": 5.507065773010254,
"learning_rate": 0.00019804414138351094,
"loss": 0.9696,
"step": 191
},
{
"epoch": 0.09245771383856016,
"grad_norm": 2.8789749145507812,
"learning_rate": 0.00019801332199027467,
"loss": 1.1445,
"step": 192
},
{
"epoch": 0.09293926443146933,
"grad_norm": 3.538658380508423,
"learning_rate": 0.00019798226410942146,
"loss": 1.2154,
"step": 193
},
{
"epoch": 0.0934208150243785,
"grad_norm": 5.083014011383057,
"learning_rate": 0.00019795096781652182,
"loss": 1.3403,
"step": 194
},
{
"epoch": 0.09390236561728767,
"grad_norm": 4.016115665435791,
"learning_rate": 0.00019791943318772643,
"loss": 1.0433,
"step": 195
},
{
"epoch": 0.09438391621019683,
"grad_norm": 3.3291447162628174,
"learning_rate": 0.00019788766029976587,
"loss": 1.0221,
"step": 196
},
{
"epoch": 0.09486546680310601,
"grad_norm": 4.776575565338135,
"learning_rate": 0.0001978556492299504,
"loss": 0.9671,
"step": 197
},
{
"epoch": 0.09534701739601517,
"grad_norm": 2.6845924854278564,
"learning_rate": 0.00019782340005616996,
"loss": 0.9646,
"step": 198
},
{
"epoch": 0.09582856798892433,
"grad_norm": 3.3309099674224854,
"learning_rate": 0.0001977909128568937,
"loss": 0.8776,
"step": 199
},
{
"epoch": 0.0963101185818335,
"grad_norm": 5.5021586418151855,
"learning_rate": 0.00019775818771117,
"loss": 1.0165,
"step": 200
},
{
"epoch": 0.09679166917474268,
"grad_norm": 4.549814224243164,
"learning_rate": 0.00019772522469862626,
"loss": 0.803,
"step": 201
},
{
"epoch": 0.09727321976765184,
"grad_norm": 4.1988067626953125,
"learning_rate": 0.00019769202389946863,
"loss": 1.2152,
"step": 202
},
{
"epoch": 0.097754770360561,
"grad_norm": 2.6205949783325195,
"learning_rate": 0.0001976585853944818,
"loss": 0.8637,
"step": 203
},
{
"epoch": 0.09823632095347018,
"grad_norm": 2.774397850036621,
"learning_rate": 0.0001976249092650289,
"loss": 1.0912,
"step": 204
},
{
"epoch": 0.09871787154637934,
"grad_norm": 1.7028182744979858,
"learning_rate": 0.00019759099559305124,
"loss": 0.8919,
"step": 205
},
{
"epoch": 0.0991994221392885,
"grad_norm": 3.3020026683807373,
"learning_rate": 0.00019755684446106812,
"loss": 1.0172,
"step": 206
},
{
"epoch": 0.09968097273219767,
"grad_norm": 3.2804148197174072,
"learning_rate": 0.00019752245595217662,
"loss": 1.3593,
"step": 207
},
{
"epoch": 0.10016252332510685,
"grad_norm": 2.095794677734375,
"learning_rate": 0.00019748783015005144,
"loss": 1.0033,
"step": 208
},
{
"epoch": 0.10064407391801601,
"grad_norm": 5.280991077423096,
"learning_rate": 0.00019745296713894465,
"loss": 0.8974,
"step": 209
},
{
"epoch": 0.10112562451092517,
"grad_norm": 2.9938979148864746,
"learning_rate": 0.00019741786700368548,
"loss": 0.8715,
"step": 210
},
{
"epoch": 0.10160717510383435,
"grad_norm": 6.764945030212402,
"learning_rate": 0.00019738252982968017,
"loss": 0.8115,
"step": 211
},
{
"epoch": 0.10208872569674352,
"grad_norm": 2.588499069213867,
"learning_rate": 0.00019734695570291168,
"loss": 0.9158,
"step": 212
},
{
"epoch": 0.10257027628965268,
"grad_norm": 3.7589855194091797,
"learning_rate": 0.00019731114470993962,
"loss": 1.2896,
"step": 213
},
{
"epoch": 0.10305182688256186,
"grad_norm": 6.584591865539551,
"learning_rate": 0.0001972750969378998,
"loss": 1.2588,
"step": 214
},
{
"epoch": 0.10353337747547102,
"grad_norm": 4.74730110168457,
"learning_rate": 0.00019723881247450434,
"loss": 0.8416,
"step": 215
},
{
"epoch": 0.10401492806838018,
"grad_norm": 3.6451597213745117,
"learning_rate": 0.0001972022914080411,
"loss": 1.3575,
"step": 216
},
{
"epoch": 0.10449647866128935,
"grad_norm": 4.290178298950195,
"learning_rate": 0.00019716553382737379,
"loss": 0.8757,
"step": 217
},
{
"epoch": 0.10497802925419852,
"grad_norm": 4.238255023956299,
"learning_rate": 0.00019712853982194152,
"loss": 0.928,
"step": 218
},
{
"epoch": 0.10545957984710769,
"grad_norm": 4.077386856079102,
"learning_rate": 0.00019709130948175876,
"loss": 1.1232,
"step": 219
},
{
"epoch": 0.10594113044001685,
"grad_norm": 2.5770697593688965,
"learning_rate": 0.0001970538428974149,
"loss": 0.867,
"step": 220
},
{
"epoch": 0.10642268103292603,
"grad_norm": 3.7551944255828857,
"learning_rate": 0.00019701614016007436,
"loss": 1.0874,
"step": 221
},
{
"epoch": 0.10690423162583519,
"grad_norm": 3.312821626663208,
"learning_rate": 0.00019697820136147597,
"loss": 0.7091,
"step": 222
},
{
"epoch": 0.10738578221874436,
"grad_norm": 2.8703348636627197,
"learning_rate": 0.00019694002659393305,
"loss": 0.8432,
"step": 223
},
{
"epoch": 0.10786733281165352,
"grad_norm": 2.452773332595825,
"learning_rate": 0.0001969016159503331,
"loss": 1.2779,
"step": 224
},
{
"epoch": 0.1083488834045627,
"grad_norm": 2.704692840576172,
"learning_rate": 0.00019686296952413747,
"loss": 0.651,
"step": 225
},
{
"epoch": 0.10883043399747186,
"grad_norm": 3.8419394493103027,
"learning_rate": 0.0001968240874093813,
"loss": 0.741,
"step": 226
},
{
"epoch": 0.10931198459038102,
"grad_norm": 4.401157379150391,
"learning_rate": 0.00019678496970067325,
"loss": 0.8972,
"step": 227
},
{
"epoch": 0.1097935351832902,
"grad_norm": 3.6739308834075928,
"learning_rate": 0.0001967456164931951,
"loss": 0.4634,
"step": 228
},
{
"epoch": 0.11027508577619936,
"grad_norm": 5.284419059753418,
"learning_rate": 0.0001967060278827017,
"loss": 0.9738,
"step": 229
},
{
"epoch": 0.11075663636910853,
"grad_norm": 4.1912360191345215,
"learning_rate": 0.00019666620396552076,
"loss": 1.0792,
"step": 230
},
{
"epoch": 0.11123818696201769,
"grad_norm": 3.6482601165771484,
"learning_rate": 0.00019662614483855246,
"loss": 1.3046,
"step": 231
},
{
"epoch": 0.11171973755492687,
"grad_norm": 6.06255578994751,
"learning_rate": 0.00019658585059926934,
"loss": 0.796,
"step": 232
},
{
"epoch": 0.11220128814783603,
"grad_norm": 2.2104992866516113,
"learning_rate": 0.00019654532134571594,
"loss": 0.8634,
"step": 233
},
{
"epoch": 0.1126828387407452,
"grad_norm": 5.140758991241455,
"learning_rate": 0.00019650455717650878,
"loss": 0.6197,
"step": 234
},
{
"epoch": 0.11316438933365437,
"grad_norm": 3.7386419773101807,
"learning_rate": 0.00019646355819083589,
"loss": 0.8885,
"step": 235
},
{
"epoch": 0.11364593992656354,
"grad_norm": 4.816273212432861,
"learning_rate": 0.0001964223244884566,
"loss": 0.9825,
"step": 236
},
{
"epoch": 0.1141274905194727,
"grad_norm": 5.175439834594727,
"learning_rate": 0.00019638085616970153,
"loss": 1.0349,
"step": 237
},
{
"epoch": 0.11460904111238188,
"grad_norm": 7.430823802947998,
"learning_rate": 0.00019633915333547202,
"loss": 0.7472,
"step": 238
},
{
"epoch": 0.11509059170529104,
"grad_norm": 5.103323936462402,
"learning_rate": 0.00019629721608724004,
"loss": 1.0827,
"step": 239
},
{
"epoch": 0.1155721422982002,
"grad_norm": 3.144728660583496,
"learning_rate": 0.0001962550445270481,
"loss": 0.6904,
"step": 240
},
{
"epoch": 0.11605369289110937,
"grad_norm": 3.113306760787964,
"learning_rate": 0.00019621263875750864,
"loss": 1.2587,
"step": 241
},
{
"epoch": 0.11653524348401854,
"grad_norm": 3.700697898864746,
"learning_rate": 0.00019616999888180406,
"loss": 0.8636,
"step": 242
},
{
"epoch": 0.11701679407692771,
"grad_norm": 3.9976747035980225,
"learning_rate": 0.0001961271250036865,
"loss": 0.8097,
"step": 243
},
{
"epoch": 0.11749834466983687,
"grad_norm": 3.022249937057495,
"learning_rate": 0.0001960840172274773,
"loss": 0.616,
"step": 244
},
{
"epoch": 0.11797989526274605,
"grad_norm": 5.003868579864502,
"learning_rate": 0.00019604067565806704,
"loss": 0.9634,
"step": 245
},
{
"epoch": 0.11846144585565521,
"grad_norm": 3.217082977294922,
"learning_rate": 0.00019599710040091512,
"loss": 0.8464,
"step": 246
},
{
"epoch": 0.11894299644856438,
"grad_norm": 3.279885768890381,
"learning_rate": 0.00019595329156204955,
"loss": 0.9137,
"step": 247
},
{
"epoch": 0.11942454704147354,
"grad_norm": 7.89387845993042,
"learning_rate": 0.00019590924924806676,
"loss": 0.7351,
"step": 248
},
{
"epoch": 0.11990609763438272,
"grad_norm": 3.444643020629883,
"learning_rate": 0.0001958649735661312,
"loss": 1.1217,
"step": 249
},
{
"epoch": 0.12038764822729188,
"grad_norm": 3.371429443359375,
"learning_rate": 0.00019582046462397515,
"loss": 0.6736,
"step": 250
},
{
"epoch": 0.12086919882020104,
"grad_norm": 2.9329636096954346,
"learning_rate": 0.00019577572252989854,
"loss": 0.91,
"step": 251
},
{
"epoch": 0.12135074941311022,
"grad_norm": 4.010715007781982,
"learning_rate": 0.00019573074739276858,
"loss": 1.1179,
"step": 252
},
{
"epoch": 0.12183230000601938,
"grad_norm": 3.4689531326293945,
"learning_rate": 0.00019568553932201947,
"loss": 0.8237,
"step": 253
},
{
"epoch": 0.12231385059892855,
"grad_norm": 2.893638849258423,
"learning_rate": 0.00019564009842765225,
"loss": 1.3797,
"step": 254
},
{
"epoch": 0.12279540119183771,
"grad_norm": 3.4823315143585205,
"learning_rate": 0.00019559442482023444,
"loss": 1.0072,
"step": 255
},
{
"epoch": 0.12327695178474689,
"grad_norm": 4.071844100952148,
"learning_rate": 0.0001955485186108998,
"loss": 0.9674,
"step": 256
},
{
"epoch": 0.12375850237765605,
"grad_norm": 3.1438372135162354,
"learning_rate": 0.00019550237991134805,
"loss": 0.8143,
"step": 257
},
{
"epoch": 0.12424005297056522,
"grad_norm": 1.9269888401031494,
"learning_rate": 0.00019545600883384467,
"loss": 0.9445,
"step": 258
},
{
"epoch": 0.12472160356347439,
"grad_norm": 4.249316215515137,
"learning_rate": 0.0001954094054912205,
"loss": 0.9279,
"step": 259
},
{
"epoch": 0.12520315415638356,
"grad_norm": 2.100592851638794,
"learning_rate": 0.00019536256999687157,
"loss": 0.566,
"step": 260
},
{
"epoch": 0.12568470474929272,
"grad_norm": 2.685699462890625,
"learning_rate": 0.00019531550246475876,
"loss": 0.6129,
"step": 261
},
{
"epoch": 0.12616625534220188,
"grad_norm": 4.864408493041992,
"learning_rate": 0.00019526820300940756,
"loss": 1.4781,
"step": 262
},
{
"epoch": 0.12664780593511105,
"grad_norm": 5.040529727935791,
"learning_rate": 0.00019522067174590778,
"loss": 0.9867,
"step": 263
},
{
"epoch": 0.12712935652802024,
"grad_norm": 4.969310760498047,
"learning_rate": 0.00019517290878991324,
"loss": 0.8467,
"step": 264
},
{
"epoch": 0.1276109071209294,
"grad_norm": 4.031760215759277,
"learning_rate": 0.0001951249142576416,
"loss": 1.3096,
"step": 265
},
{
"epoch": 0.12809245771383856,
"grad_norm": 4.020783424377441,
"learning_rate": 0.00019507668826587387,
"loss": 1.1767,
"step": 266
},
{
"epoch": 0.12857400830674773,
"grad_norm": 4.138341426849365,
"learning_rate": 0.0001950282309319544,
"loss": 1.2766,
"step": 267
},
{
"epoch": 0.1290555588996569,
"grad_norm": 2.7963201999664307,
"learning_rate": 0.0001949795423737903,
"loss": 0.9197,
"step": 268
},
{
"epoch": 0.12953710949256605,
"grad_norm": 4.995326042175293,
"learning_rate": 0.00019493062270985144,
"loss": 1.0874,
"step": 269
},
{
"epoch": 0.13001866008547522,
"grad_norm": 2.315774917602539,
"learning_rate": 0.00019488147205916985,
"loss": 0.7577,
"step": 270
},
{
"epoch": 0.1305002106783844,
"grad_norm": 6.237738132476807,
"learning_rate": 0.00019483209054133976,
"loss": 1.0659,
"step": 271
},
{
"epoch": 0.13098176127129357,
"grad_norm": 2.8713154792785645,
"learning_rate": 0.00019478247827651708,
"loss": 0.965,
"step": 272
},
{
"epoch": 0.13146331186420274,
"grad_norm": 3.7494618892669678,
"learning_rate": 0.00019473263538541914,
"loss": 0.708,
"step": 273
},
{
"epoch": 0.1319448624571119,
"grad_norm": 5.115126132965088,
"learning_rate": 0.00019468256198932455,
"loss": 1.1001,
"step": 274
},
{
"epoch": 0.13242641305002106,
"grad_norm": 1.991768479347229,
"learning_rate": 0.00019463225821007268,
"loss": 1.5559,
"step": 275
},
{
"epoch": 0.13290796364293023,
"grad_norm": 2.3148343563079834,
"learning_rate": 0.00019458172417006347,
"loss": 0.7299,
"step": 276
},
{
"epoch": 0.1333895142358394,
"grad_norm": 5.019519329071045,
"learning_rate": 0.00019453095999225726,
"loss": 0.9006,
"step": 277
},
{
"epoch": 0.13387106482874858,
"grad_norm": 2.933354616165161,
"learning_rate": 0.0001944799658001742,
"loss": 0.8045,
"step": 278
},
{
"epoch": 0.13435261542165775,
"grad_norm": 3.3015851974487305,
"learning_rate": 0.00019442874171789418,
"loss": 0.8641,
"step": 279
},
{
"epoch": 0.1348341660145669,
"grad_norm": 3.103376626968384,
"learning_rate": 0.00019437728787005657,
"loss": 1.2227,
"step": 280
},
{
"epoch": 0.13531571660747607,
"grad_norm": 3.754467725753784,
"learning_rate": 0.00019432560438185963,
"loss": 0.6292,
"step": 281
},
{
"epoch": 0.13579726720038524,
"grad_norm": 3.083435297012329,
"learning_rate": 0.00019427369137906046,
"loss": 1.0215,
"step": 282
},
{
"epoch": 0.1362788177932944,
"grad_norm": 2.066274404525757,
"learning_rate": 0.00019422154898797472,
"loss": 0.9194,
"step": 283
},
{
"epoch": 0.13676036838620356,
"grad_norm": 2.431302785873413,
"learning_rate": 0.00019416917733547603,
"loss": 1.2826,
"step": 284
},
{
"epoch": 0.13724191897911275,
"grad_norm": 2.993353843688965,
"learning_rate": 0.00019411657654899597,
"loss": 0.8271,
"step": 285
},
{
"epoch": 0.13772346957202192,
"grad_norm": 5.987199783325195,
"learning_rate": 0.0001940637467565237,
"loss": 1.0192,
"step": 286
},
{
"epoch": 0.13820502016493108,
"grad_norm": 1.7869144678115845,
"learning_rate": 0.00019401068808660546,
"loss": 1.1415,
"step": 287
},
{
"epoch": 0.13868657075784024,
"grad_norm": 1.75114905834198,
"learning_rate": 0.0001939574006683445,
"loss": 0.9718,
"step": 288
},
{
"epoch": 0.1391681213507494,
"grad_norm": 3.034210443496704,
"learning_rate": 0.00019390388463140065,
"loss": 0.4041,
"step": 289
},
{
"epoch": 0.13964967194365857,
"grad_norm": 7.397146224975586,
"learning_rate": 0.00019385014010598998,
"loss": 0.9913,
"step": 290
},
{
"epoch": 0.14013122253656773,
"grad_norm": 2.123467206954956,
"learning_rate": 0.00019379616722288456,
"loss": 0.8688,
"step": 291
},
{
"epoch": 0.14061277312947693,
"grad_norm": 3.545257329940796,
"learning_rate": 0.0001937419661134121,
"loss": 1.1841,
"step": 292
},
{
"epoch": 0.1410943237223861,
"grad_norm": 3.6970181465148926,
"learning_rate": 0.0001936875369094556,
"loss": 1.241,
"step": 293
},
{
"epoch": 0.14157587431529525,
"grad_norm": 2.9844279289245605,
"learning_rate": 0.0001936328797434531,
"loss": 0.9453,
"step": 294
},
{
"epoch": 0.14205742490820442,
"grad_norm": 1.6827529668807983,
"learning_rate": 0.00019357799474839735,
"loss": 0.9734,
"step": 295
},
{
"epoch": 0.14253897550111358,
"grad_norm": 3.7799909114837646,
"learning_rate": 0.00019352288205783536,
"loss": 0.7606,
"step": 296
},
{
"epoch": 0.14302052609402274,
"grad_norm": 3.0586202144622803,
"learning_rate": 0.00019346754180586825,
"loss": 0.6152,
"step": 297
},
{
"epoch": 0.14350207668693193,
"grad_norm": 1.6172605752944946,
"learning_rate": 0.00019341197412715082,
"loss": 0.7054,
"step": 298
},
{
"epoch": 0.1439836272798411,
"grad_norm": 2.8775248527526855,
"learning_rate": 0.00019335617915689128,
"loss": 1.1397,
"step": 299
},
{
"epoch": 0.14446517787275026,
"grad_norm": 3.2396507263183594,
"learning_rate": 0.00019330015703085082,
"loss": 0.6854,
"step": 300
},
{
"epoch": 0.14494672846565942,
"grad_norm": 4.7253289222717285,
"learning_rate": 0.00019324390788534343,
"loss": 0.9446,
"step": 301
},
{
"epoch": 0.1454282790585686,
"grad_norm": 5.362252712249756,
"learning_rate": 0.00019318743185723546,
"loss": 0.8447,
"step": 302
},
{
"epoch": 0.14590982965147775,
"grad_norm": 2.6400959491729736,
"learning_rate": 0.00019313072908394525,
"loss": 0.8309,
"step": 303
},
{
"epoch": 0.14639138024438691,
"grad_norm": 2.0676944255828857,
"learning_rate": 0.00019307379970344294,
"loss": 1.0386,
"step": 304
},
{
"epoch": 0.1468729308372961,
"grad_norm": 2.806190013885498,
"learning_rate": 0.00019301664385425004,
"loss": 0.8906,
"step": 305
},
{
"epoch": 0.14735448143020527,
"grad_norm": 2.275996446609497,
"learning_rate": 0.0001929592616754391,
"loss": 0.6389,
"step": 306
},
{
"epoch": 0.14783603202311443,
"grad_norm": 1.520571231842041,
"learning_rate": 0.00019290165330663336,
"loss": 1.0456,
"step": 307
},
{
"epoch": 0.1483175826160236,
"grad_norm": 2.8603453636169434,
"learning_rate": 0.00019284381888800647,
"loss": 0.9411,
"step": 308
},
{
"epoch": 0.14879913320893276,
"grad_norm": 3.3105409145355225,
"learning_rate": 0.00019278575856028206,
"loss": 0.9477,
"step": 309
},
{
"epoch": 0.14928068380184192,
"grad_norm": 2.5645644664764404,
"learning_rate": 0.00019272747246473345,
"loss": 0.54,
"step": 310
},
{
"epoch": 0.1497622343947511,
"grad_norm": 10.714997291564941,
"learning_rate": 0.00019266896074318334,
"loss": 1.3273,
"step": 311
},
{
"epoch": 0.15024378498766028,
"grad_norm": 3.5276308059692383,
"learning_rate": 0.00019261022353800344,
"loss": 0.9848,
"step": 312
},
{
"epoch": 0.15072533558056944,
"grad_norm": 4.048092365264893,
"learning_rate": 0.00019255126099211402,
"loss": 0.7335,
"step": 313
},
{
"epoch": 0.1512068861734786,
"grad_norm": 1.8157590627670288,
"learning_rate": 0.00019249207324898376,
"loss": 1.0381,
"step": 314
},
{
"epoch": 0.15168843676638777,
"grad_norm": 3.7452712059020996,
"learning_rate": 0.0001924326604526292,
"loss": 0.6327,
"step": 315
},
{
"epoch": 0.15216998735929693,
"grad_norm": 3.773587942123413,
"learning_rate": 0.00019237302274761458,
"loss": 0.5525,
"step": 316
},
{
"epoch": 0.1526515379522061,
"grad_norm": 3.6880061626434326,
"learning_rate": 0.0001923131602790513,
"loss": 1.2273,
"step": 317
},
{
"epoch": 0.15313308854511526,
"grad_norm": 3.083782196044922,
"learning_rate": 0.00019225307319259768,
"loss": 1.0963,
"step": 318
},
{
"epoch": 0.15361463913802445,
"grad_norm": 5.022973537445068,
"learning_rate": 0.00019219276163445862,
"loss": 0.7697,
"step": 319
},
{
"epoch": 0.1540961897309336,
"grad_norm": 3.2828280925750732,
"learning_rate": 0.00019213222575138522,
"loss": 1.069,
"step": 320
},
{
"epoch": 0.15457774032384278,
"grad_norm": 4.293641567230225,
"learning_rate": 0.00019207146569067435,
"loss": 0.6866,
"step": 321
},
{
"epoch": 0.15505929091675194,
"grad_norm": 4.453739166259766,
"learning_rate": 0.00019201048160016838,
"loss": 0.939,
"step": 322
},
{
"epoch": 0.1555408415096611,
"grad_norm": 2.3290882110595703,
"learning_rate": 0.00019194927362825478,
"loss": 0.8467,
"step": 323
},
{
"epoch": 0.15602239210257027,
"grad_norm": 4.609375476837158,
"learning_rate": 0.00019188784192386587,
"loss": 1.064,
"step": 324
},
{
"epoch": 0.15650394269547943,
"grad_norm": 2.483145236968994,
"learning_rate": 0.00019182618663647817,
"loss": 0.6174,
"step": 325
},
{
"epoch": 0.15698549328838862,
"grad_norm": 4.485541820526123,
"learning_rate": 0.0001917643079161124,
"loss": 0.9137,
"step": 326
},
{
"epoch": 0.15746704388129779,
"grad_norm": 4.135148525238037,
"learning_rate": 0.00019170220591333283,
"loss": 0.7697,
"step": 327
},
{
"epoch": 0.15794859447420695,
"grad_norm": 4.6411333084106445,
"learning_rate": 0.00019163988077924713,
"loss": 1.1936,
"step": 328
},
{
"epoch": 0.1584301450671161,
"grad_norm": 3.8218069076538086,
"learning_rate": 0.00019157733266550575,
"loss": 0.7865,
"step": 329
},
{
"epoch": 0.15891169566002528,
"grad_norm": 5.044341564178467,
"learning_rate": 0.00019151456172430183,
"loss": 1.0328,
"step": 330
},
{
"epoch": 0.15939324625293444,
"grad_norm": 5.211885929107666,
"learning_rate": 0.0001914515681083707,
"loss": 1.096,
"step": 331
},
{
"epoch": 0.1598747968458436,
"grad_norm": 4.778816223144531,
"learning_rate": 0.00019138835197098937,
"loss": 0.7164,
"step": 332
},
{
"epoch": 0.1603563474387528,
"grad_norm": 3.1588540077209473,
"learning_rate": 0.00019132491346597643,
"loss": 1.1062,
"step": 333
},
{
"epoch": 0.16083789803166196,
"grad_norm": 2.6961734294891357,
"learning_rate": 0.00019126125274769145,
"loss": 0.7453,
"step": 334
},
{
"epoch": 0.16131944862457112,
"grad_norm": 2.4815406799316406,
"learning_rate": 0.00019119736997103476,
"loss": 0.9451,
"step": 335
},
{
"epoch": 0.16180099921748028,
"grad_norm": 1.7432652711868286,
"learning_rate": 0.000191133265291447,
"loss": 0.8675,
"step": 336
},
{
"epoch": 0.16228254981038945,
"grad_norm": 2.9783449172973633,
"learning_rate": 0.00019106893886490864,
"loss": 1.1438,
"step": 337
},
{
"epoch": 0.1627641004032986,
"grad_norm": 3.031538248062134,
"learning_rate": 0.00019100439084793989,
"loss": 0.7219,
"step": 338
},
{
"epoch": 0.1632456509962078,
"grad_norm": 6.661623001098633,
"learning_rate": 0.00019093962139759998,
"loss": 1.3528,
"step": 339
},
{
"epoch": 0.16372720158911697,
"grad_norm": 3.402763843536377,
"learning_rate": 0.000190874630671487,
"loss": 0.9113,
"step": 340
},
{
"epoch": 0.16420875218202613,
"grad_norm": 3.233682632446289,
"learning_rate": 0.00019080941882773745,
"loss": 1.1422,
"step": 341
},
{
"epoch": 0.1646903027749353,
"grad_norm": 3.8870816230773926,
"learning_rate": 0.00019074398602502584,
"loss": 0.6947,
"step": 342
},
{
"epoch": 0.16517185336784446,
"grad_norm": 4.976253509521484,
"learning_rate": 0.00019067833242256442,
"loss": 1.066,
"step": 343
},
{
"epoch": 0.16565340396075362,
"grad_norm": 3.8390371799468994,
"learning_rate": 0.0001906124581801025,
"loss": 0.7719,
"step": 344
},
{
"epoch": 0.16613495455366278,
"grad_norm": 6.181506633758545,
"learning_rate": 0.0001905463634579264,
"loss": 0.6594,
"step": 345
},
{
"epoch": 0.16661650514657197,
"grad_norm": 2.2047243118286133,
"learning_rate": 0.00019048004841685888,
"loss": 0.8504,
"step": 346
},
{
"epoch": 0.16709805573948114,
"grad_norm": 7.163036346435547,
"learning_rate": 0.00019041351321825883,
"loss": 1.1279,
"step": 347
},
{
"epoch": 0.1675796063323903,
"grad_norm": 3.1026716232299805,
"learning_rate": 0.00019034675802402068,
"loss": 1.1557,
"step": 348
},
{
"epoch": 0.16806115692529947,
"grad_norm": 2.942558526992798,
"learning_rate": 0.00019027978299657436,
"loss": 0.4541,
"step": 349
},
{
"epoch": 0.16854270751820863,
"grad_norm": 2.2551536560058594,
"learning_rate": 0.00019021258829888456,
"loss": 0.7569,
"step": 350
},
{
"epoch": 0.1690242581111178,
"grad_norm": 3.777118682861328,
"learning_rate": 0.00019014517409445052,
"loss": 1.0268,
"step": 351
},
{
"epoch": 0.16950580870402696,
"grad_norm": 2.533811330795288,
"learning_rate": 0.00019007754054730554,
"loss": 0.6828,
"step": 352
},
{
"epoch": 0.16998735929693615,
"grad_norm": 3.8600101470947266,
"learning_rate": 0.00019000968782201675,
"loss": 0.5744,
"step": 353
},
{
"epoch": 0.1704689098898453,
"grad_norm": 3.22756028175354,
"learning_rate": 0.00018994161608368448,
"loss": 1.3307,
"step": 354
},
{
"epoch": 0.17095046048275447,
"grad_norm": 2.4903550148010254,
"learning_rate": 0.00018987332549794196,
"loss": 1.0393,
"step": 355
},
{
"epoch": 0.17143201107566364,
"grad_norm": 2.447472333908081,
"learning_rate": 0.00018980481623095502,
"loss": 0.7863,
"step": 356
},
{
"epoch": 0.1719135616685728,
"grad_norm": 4.140078544616699,
"learning_rate": 0.00018973608844942148,
"loss": 0.8217,
"step": 357
},
{
"epoch": 0.17239511226148196,
"grad_norm": 2.152505874633789,
"learning_rate": 0.00018966714232057094,
"loss": 1.1535,
"step": 358
},
{
"epoch": 0.17287666285439113,
"grad_norm": 3.840864419937134,
"learning_rate": 0.00018959797801216418,
"loss": 0.6864,
"step": 359
},
{
"epoch": 0.17335821344730032,
"grad_norm": 4.183706283569336,
"learning_rate": 0.000189528595692493,
"loss": 0.7509,
"step": 360
},
{
"epoch": 0.17383976404020948,
"grad_norm": 3.3056836128234863,
"learning_rate": 0.00018945899553037956,
"loss": 0.8942,
"step": 361
},
{
"epoch": 0.17432131463311865,
"grad_norm": 4.057851314544678,
"learning_rate": 0.00018938917769517613,
"loss": 1.4257,
"step": 362
},
{
"epoch": 0.1748028652260278,
"grad_norm": 2.97546648979187,
"learning_rate": 0.00018931914235676458,
"loss": 0.9936,
"step": 363
},
{
"epoch": 0.17528441581893697,
"grad_norm": 0.8970528841018677,
"learning_rate": 0.00018924888968555606,
"loss": 0.6008,
"step": 364
},
{
"epoch": 0.17576596641184614,
"grad_norm": 4.94012975692749,
"learning_rate": 0.00018917841985249055,
"loss": 0.985,
"step": 365
},
{
"epoch": 0.1762475170047553,
"grad_norm": 3.416455030441284,
"learning_rate": 0.0001891077330290363,
"loss": 0.9864,
"step": 366
},
{
"epoch": 0.1767290675976645,
"grad_norm": 2.498899221420288,
"learning_rate": 0.00018903682938718977,
"loss": 0.8499,
"step": 367
},
{
"epoch": 0.17721061819057365,
"grad_norm": 3.770181655883789,
"learning_rate": 0.00018896570909947475,
"loss": 0.8731,
"step": 368
},
{
"epoch": 0.17769216878348282,
"grad_norm": 4.896841049194336,
"learning_rate": 0.00018889437233894234,
"loss": 1.0454,
"step": 369
},
{
"epoch": 0.17817371937639198,
"grad_norm": 2.264261245727539,
"learning_rate": 0.0001888228192791703,
"loss": 1.0061,
"step": 370
},
{
"epoch": 0.17865526996930114,
"grad_norm": 3.097073793411255,
"learning_rate": 0.00018875105009426272,
"loss": 0.6645,
"step": 371
},
{
"epoch": 0.1791368205622103,
"grad_norm": 1.4488924741744995,
"learning_rate": 0.00018867906495884955,
"loss": 1.1148,
"step": 372
},
{
"epoch": 0.17961837115511947,
"grad_norm": 3.17714524269104,
"learning_rate": 0.0001886068640480862,
"loss": 0.7299,
"step": 373
},
{
"epoch": 0.18009992174802866,
"grad_norm": 5.30600118637085,
"learning_rate": 0.00018853444753765306,
"loss": 0.797,
"step": 374
},
{
"epoch": 0.18058147234093783,
"grad_norm": 2.218719959259033,
"learning_rate": 0.00018846181560375525,
"loss": 0.6802,
"step": 375
},
{
"epoch": 0.181063022933847,
"grad_norm": 1.5951119661331177,
"learning_rate": 0.0001883889684231219,
"loss": 0.9983,
"step": 376
},
{
"epoch": 0.18154457352675615,
"grad_norm": 4.742170810699463,
"learning_rate": 0.000188315906173006,
"loss": 1.3679,
"step": 377
},
{
"epoch": 0.18202612411966532,
"grad_norm": 4.083590507507324,
"learning_rate": 0.0001882426290311838,
"loss": 0.6986,
"step": 378
},
{
"epoch": 0.18250767471257448,
"grad_norm": 4.625772476196289,
"learning_rate": 0.00018816913717595445,
"loss": 0.9917,
"step": 379
},
{
"epoch": 0.18298922530548364,
"grad_norm": 3.811450958251953,
"learning_rate": 0.00018809543078613953,
"loss": 0.7278,
"step": 380
},
{
"epoch": 0.18347077589839283,
"grad_norm": 2.0530614852905273,
"learning_rate": 0.00018802151004108263,
"loss": 0.574,
"step": 381
},
{
"epoch": 0.183952326491302,
"grad_norm": 3.4029970169067383,
"learning_rate": 0.0001879473751206489,
"loss": 0.8849,
"step": 382
},
{
"epoch": 0.18443387708421116,
"grad_norm": 5.864663124084473,
"learning_rate": 0.00018787302620522467,
"loss": 0.5902,
"step": 383
},
{
"epoch": 0.18491542767712033,
"grad_norm": 2.628844976425171,
"learning_rate": 0.00018779846347571693,
"loss": 0.9543,
"step": 384
},
{
"epoch": 0.1853969782700295,
"grad_norm": 5.040539741516113,
"learning_rate": 0.0001877236871135529,
"loss": 0.7266,
"step": 385
},
{
"epoch": 0.18587852886293865,
"grad_norm": 3.538259983062744,
"learning_rate": 0.00018764869730067968,
"loss": 1.0102,
"step": 386
},
{
"epoch": 0.18636007945584784,
"grad_norm": 3.6347954273223877,
"learning_rate": 0.0001875734942195637,
"loss": 0.4102,
"step": 387
},
{
"epoch": 0.186841630048757,
"grad_norm": 3.6020264625549316,
"learning_rate": 0.0001874980780531903,
"loss": 1.0311,
"step": 388
},
{
"epoch": 0.18732318064166617,
"grad_norm": 3.3939337730407715,
"learning_rate": 0.00018742244898506337,
"loss": 0.8185,
"step": 389
},
{
"epoch": 0.18780473123457533,
"grad_norm": 3.194336175918579,
"learning_rate": 0.00018734660719920475,
"loss": 1.0069,
"step": 390
},
{
"epoch": 0.1882862818274845,
"grad_norm": 3.444998025894165,
"learning_rate": 0.00018727055288015397,
"loss": 1.0042,
"step": 391
},
{
"epoch": 0.18876783242039366,
"grad_norm": 1.6734000444412231,
"learning_rate": 0.00018719428621296764,
"loss": 1.1157,
"step": 392
},
{
"epoch": 0.18924938301330282,
"grad_norm": 3.978752374649048,
"learning_rate": 0.00018711780738321897,
"loss": 0.5787,
"step": 393
},
{
"epoch": 0.18973093360621202,
"grad_norm": 3.7722957134246826,
"learning_rate": 0.00018704111657699758,
"loss": 0.7111,
"step": 394
},
{
"epoch": 0.19021248419912118,
"grad_norm": 1.650201678276062,
"learning_rate": 0.0001869642139809088,
"loss": 1.1438,
"step": 395
},
{
"epoch": 0.19069403479203034,
"grad_norm": 8.692386627197266,
"learning_rate": 0.00018688709978207323,
"loss": 0.8174,
"step": 396
},
{
"epoch": 0.1911755853849395,
"grad_norm": 1.9491546154022217,
"learning_rate": 0.00018680977416812644,
"loss": 1.0687,
"step": 397
},
{
"epoch": 0.19165713597784867,
"grad_norm": 2.265296459197998,
"learning_rate": 0.00018673223732721837,
"loss": 0.739,
"step": 398
},
{
"epoch": 0.19213868657075783,
"grad_norm": 3.7249464988708496,
"learning_rate": 0.0001866544894480129,
"loss": 1.0551,
"step": 399
},
{
"epoch": 0.192620237163667,
"grad_norm": 2.0633394718170166,
"learning_rate": 0.00018657653071968747,
"loss": 0.8448,
"step": 400
},
{
"epoch": 0.1931017877565762,
"grad_norm": 3.448591947555542,
"learning_rate": 0.00018649836133193253,
"loss": 0.7965,
"step": 401
},
{
"epoch": 0.19358333834948535,
"grad_norm": 1.4229322671890259,
"learning_rate": 0.00018641998147495112,
"loss": 0.4359,
"step": 402
},
{
"epoch": 0.19406488894239451,
"grad_norm": 2.978297233581543,
"learning_rate": 0.00018634139133945837,
"loss": 0.632,
"step": 403
},
{
"epoch": 0.19454643953530368,
"grad_norm": 2.407181978225708,
"learning_rate": 0.00018626259111668105,
"loss": 1.0891,
"step": 404
},
{
"epoch": 0.19502799012821284,
"grad_norm": 3.066446542739868,
"learning_rate": 0.00018618358099835723,
"loss": 1.3327,
"step": 405
},
{
"epoch": 0.195509540721122,
"grad_norm": 1.5488284826278687,
"learning_rate": 0.00018610436117673555,
"loss": 0.42,
"step": 406
},
{
"epoch": 0.19599109131403117,
"grad_norm": 1.5752339363098145,
"learning_rate": 0.00018602493184457505,
"loss": 0.6942,
"step": 407
},
{
"epoch": 0.19647264190694036,
"grad_norm": 3.5753307342529297,
"learning_rate": 0.00018594529319514437,
"loss": 0.9352,
"step": 408
},
{
"epoch": 0.19695419249984952,
"grad_norm": 2.1090338230133057,
"learning_rate": 0.00018586544542222169,
"loss": 0.8635,
"step": 409
},
{
"epoch": 0.1974357430927587,
"grad_norm": 4.924689292907715,
"learning_rate": 0.00018578538872009384,
"loss": 0.6627,
"step": 410
},
{
"epoch": 0.19791729368566785,
"grad_norm": 0.7094942331314087,
"learning_rate": 0.00018570512328355612,
"loss": 0.4438,
"step": 411
},
{
"epoch": 0.198398844278577,
"grad_norm": 3.230691909790039,
"learning_rate": 0.00018562464930791167,
"loss": 0.7968,
"step": 412
},
{
"epoch": 0.19888039487148618,
"grad_norm": 3.516850709915161,
"learning_rate": 0.00018554396698897116,
"loss": 0.7121,
"step": 413
},
{
"epoch": 0.19936194546439534,
"grad_norm": 2.1137967109680176,
"learning_rate": 0.00018546307652305205,
"loss": 0.8463,
"step": 414
},
{
"epoch": 0.19984349605730453,
"grad_norm": 2.3233392238616943,
"learning_rate": 0.00018538197810697842,
"loss": 0.8193,
"step": 415
},
{
"epoch": 0.2003250466502137,
"grad_norm": 6.277778625488281,
"learning_rate": 0.0001853006719380802,
"loss": 0.8697,
"step": 416
},
{
"epoch": 0.20080659724312286,
"grad_norm": 3.4568729400634766,
"learning_rate": 0.00018521915821419284,
"loss": 0.5947,
"step": 417
},
{
"epoch": 0.20128814783603202,
"grad_norm": 2.8246490955352783,
"learning_rate": 0.00018513743713365698,
"loss": 0.8121,
"step": 418
},
{
"epoch": 0.20176969842894119,
"grad_norm": 3.8738162517547607,
"learning_rate": 0.00018505550889531765,
"loss": 0.7239,
"step": 419
},
{
"epoch": 0.20225124902185035,
"grad_norm": 3.2353687286376953,
"learning_rate": 0.00018497337369852395,
"loss": 0.6751,
"step": 420
},
{
"epoch": 0.2027327996147595,
"grad_norm": 5.0770039558410645,
"learning_rate": 0.0001848910317431286,
"loss": 1.2165,
"step": 421
},
{
"epoch": 0.2032143502076687,
"grad_norm": 3.751051902770996,
"learning_rate": 0.00018480848322948739,
"loss": 0.661,
"step": 422
},
{
"epoch": 0.20369590080057787,
"grad_norm": 3.767159938812256,
"learning_rate": 0.00018472572835845873,
"loss": 0.7486,
"step": 423
},
{
"epoch": 0.20417745139348703,
"grad_norm": 2.8906075954437256,
"learning_rate": 0.00018464276733140306,
"loss": 0.7135,
"step": 424
},
{
"epoch": 0.2046590019863962,
"grad_norm": 2.5436739921569824,
"learning_rate": 0.0001845596003501826,
"loss": 0.7165,
"step": 425
},
{
"epoch": 0.20514055257930536,
"grad_norm": 2.0007944107055664,
"learning_rate": 0.00018447622761716057,
"loss": 0.8495,
"step": 426
},
{
"epoch": 0.20562210317221452,
"grad_norm": 1.651319146156311,
"learning_rate": 0.00018439264933520084,
"loss": 0.9067,
"step": 427
},
{
"epoch": 0.2061036537651237,
"grad_norm": 1.483290433883667,
"learning_rate": 0.00018430886570766747,
"loss": 0.7198,
"step": 428
},
{
"epoch": 0.20658520435803288,
"grad_norm": 4.108277797698975,
"learning_rate": 0.0001842248769384242,
"loss": 1.5626,
"step": 429
},
{
"epoch": 0.20706675495094204,
"grad_norm": 2.4852449893951416,
"learning_rate": 0.00018414068323183375,
"loss": 0.7769,
"step": 430
},
{
"epoch": 0.2075483055438512,
"grad_norm": 4.069541931152344,
"learning_rate": 0.00018405628479275775,
"loss": 0.663,
"step": 431
},
{
"epoch": 0.20802985613676037,
"grad_norm": 2.137789011001587,
"learning_rate": 0.00018397168182655583,
"loss": 0.5468,
"step": 432
},
{
"epoch": 0.20851140672966953,
"grad_norm": 3.0423202514648438,
"learning_rate": 0.00018388687453908527,
"loss": 0.8064,
"step": 433
},
{
"epoch": 0.2089929573225787,
"grad_norm": 2.8734021186828613,
"learning_rate": 0.00018380186313670058,
"loss": 1.0275,
"step": 434
},
{
"epoch": 0.20947450791548788,
"grad_norm": 2.459599733352661,
"learning_rate": 0.00018371664782625287,
"loss": 0.7337,
"step": 435
},
{
"epoch": 0.20995605850839705,
"grad_norm": 4.528045177459717,
"learning_rate": 0.00018363122881508945,
"loss": 1.0966,
"step": 436
},
{
"epoch": 0.2104376091013062,
"grad_norm": 2.942962169647217,
"learning_rate": 0.00018354560631105328,
"loss": 0.6714,
"step": 437
},
{
"epoch": 0.21091915969421537,
"grad_norm": 5.867403507232666,
"learning_rate": 0.00018345978052248233,
"loss": 0.9835,
"step": 438
},
{
"epoch": 0.21140071028712454,
"grad_norm": 1.642223834991455,
"learning_rate": 0.00018337375165820944,
"loss": 0.8807,
"step": 439
},
{
"epoch": 0.2118822608800337,
"grad_norm": 5.436519145965576,
"learning_rate": 0.00018328751992756137,
"loss": 0.8824,
"step": 440
},
{
"epoch": 0.21236381147294286,
"grad_norm": 5.564542770385742,
"learning_rate": 0.0001832010855403586,
"loss": 1.0045,
"step": 441
},
{
"epoch": 0.21284536206585206,
"grad_norm": 2.1191842555999756,
"learning_rate": 0.0001831144487069147,
"loss": 0.7834,
"step": 442
},
{
"epoch": 0.21332691265876122,
"grad_norm": 2.1237781047821045,
"learning_rate": 0.0001830276096380358,
"loss": 0.3141,
"step": 443
},
{
"epoch": 0.21380846325167038,
"grad_norm": 4.568416118621826,
"learning_rate": 0.0001829405685450202,
"loss": 1.2394,
"step": 444
},
{
"epoch": 0.21429001384457955,
"grad_norm": 2.3887364864349365,
"learning_rate": 0.00018285332563965765,
"loss": 1.1355,
"step": 445
},
{
"epoch": 0.2147715644374887,
"grad_norm": 4.617615222930908,
"learning_rate": 0.00018276588113422905,
"loss": 0.8803,
"step": 446
},
{
"epoch": 0.21525311503039787,
"grad_norm": 2.1679558753967285,
"learning_rate": 0.00018267823524150575,
"loss": 0.9606,
"step": 447
},
{
"epoch": 0.21573466562330704,
"grad_norm": 2.1338422298431396,
"learning_rate": 0.00018259038817474923,
"loss": 0.9403,
"step": 448
},
{
"epoch": 0.21621621621621623,
"grad_norm": 2.041907548904419,
"learning_rate": 0.0001825023401477104,
"loss": 0.6543,
"step": 449
},
{
"epoch": 0.2166977668091254,
"grad_norm": 1.8490098714828491,
"learning_rate": 0.0001824140913746291,
"loss": 0.9038,
"step": 450
},
{
"epoch": 0.21717931740203456,
"grad_norm": 3.1015329360961914,
"learning_rate": 0.00018232564207023376,
"loss": 0.6252,
"step": 451
},
{
"epoch": 0.21766086799494372,
"grad_norm": 2.5276334285736084,
"learning_rate": 0.00018223699244974064,
"loss": 0.653,
"step": 452
},
{
"epoch": 0.21814241858785288,
"grad_norm": 2.063218116760254,
"learning_rate": 0.00018214814272885343,
"loss": 0.8085,
"step": 453
},
{
"epoch": 0.21862396918076205,
"grad_norm": 2.227787494659424,
"learning_rate": 0.00018205909312376276,
"loss": 0.9719,
"step": 454
},
{
"epoch": 0.2191055197736712,
"grad_norm": 2.54950213432312,
"learning_rate": 0.00018196984385114554,
"loss": 0.9854,
"step": 455
},
{
"epoch": 0.2195870703665804,
"grad_norm": 2.7499783039093018,
"learning_rate": 0.0001818803951281646,
"loss": 0.7189,
"step": 456
},
{
"epoch": 0.22006862095948956,
"grad_norm": 4.1418070793151855,
"learning_rate": 0.000181790747172468,
"loss": 0.7065,
"step": 457
},
{
"epoch": 0.22055017155239873,
"grad_norm": 1.8730262517929077,
"learning_rate": 0.00018170090020218864,
"loss": 1.164,
"step": 458
},
{
"epoch": 0.2210317221453079,
"grad_norm": 2.4339802265167236,
"learning_rate": 0.00018161085443594365,
"loss": 0.614,
"step": 459
},
{
"epoch": 0.22151327273821705,
"grad_norm": 1.8113713264465332,
"learning_rate": 0.00018152061009283382,
"loss": 0.5136,
"step": 460
},
{
"epoch": 0.22199482333112622,
"grad_norm": 4.9851250648498535,
"learning_rate": 0.00018143016739244314,
"loss": 0.9962,
"step": 461
},
{
"epoch": 0.22247637392403538,
"grad_norm": 3.5249273777008057,
"learning_rate": 0.0001813395265548383,
"loss": 0.7784,
"step": 462
},
{
"epoch": 0.22295792451694457,
"grad_norm": 4.574691295623779,
"learning_rate": 0.00018124868780056814,
"loss": 0.9796,
"step": 463
},
{
"epoch": 0.22343947510985374,
"grad_norm": 4.666108131408691,
"learning_rate": 0.0001811576513506629,
"loss": 0.749,
"step": 464
},
{
"epoch": 0.2239210257027629,
"grad_norm": 2.94307541847229,
"learning_rate": 0.00018106641742663397,
"loss": 0.9141,
"step": 465
},
{
"epoch": 0.22440257629567206,
"grad_norm": 3.553006887435913,
"learning_rate": 0.00018097498625047328,
"loss": 0.7818,
"step": 466
},
{
"epoch": 0.22488412688858123,
"grad_norm": 3.156838893890381,
"learning_rate": 0.00018088335804465258,
"loss": 0.8416,
"step": 467
},
{
"epoch": 0.2253656774814904,
"grad_norm": 3.0316860675811768,
"learning_rate": 0.00018079153303212318,
"loss": 0.777,
"step": 468
},
{
"epoch": 0.22584722807439955,
"grad_norm": 4.4010443687438965,
"learning_rate": 0.0001806995114363152,
"loss": 1.5936,
"step": 469
},
{
"epoch": 0.22632877866730874,
"grad_norm": 2.7999866008758545,
"learning_rate": 0.00018060729348113707,
"loss": 0.662,
"step": 470
},
{
"epoch": 0.2268103292602179,
"grad_norm": 1.6529977321624756,
"learning_rate": 0.00018051487939097505,
"loss": 0.6979,
"step": 471
},
{
"epoch": 0.22729187985312707,
"grad_norm": 9.427469253540039,
"learning_rate": 0.00018042226939069255,
"loss": 1.0829,
"step": 472
},
{
"epoch": 0.22777343044603623,
"grad_norm": 3.0257017612457275,
"learning_rate": 0.00018032946370562982,
"loss": 0.8094,
"step": 473
},
{
"epoch": 0.2282549810389454,
"grad_norm": 3.00710391998291,
"learning_rate": 0.00018023646256160313,
"loss": 0.4987,
"step": 474
},
{
"epoch": 0.22873653163185456,
"grad_norm": 3.28983473777771,
"learning_rate": 0.00018014326618490437,
"loss": 0.7542,
"step": 475
},
{
"epoch": 0.22921808222476375,
"grad_norm": 4.3024210929870605,
"learning_rate": 0.0001800498748023005,
"loss": 1.2465,
"step": 476
},
{
"epoch": 0.22969963281767292,
"grad_norm": 1.8468772172927856,
"learning_rate": 0.000179956288641033,
"loss": 0.5073,
"step": 477
},
{
"epoch": 0.23018118341058208,
"grad_norm": 6.304190635681152,
"learning_rate": 0.00017986250792881718,
"loss": 0.4624,
"step": 478
},
{
"epoch": 0.23066273400349124,
"grad_norm": 2.0171825885772705,
"learning_rate": 0.00017976853289384184,
"loss": 0.886,
"step": 479
},
{
"epoch": 0.2311442845964004,
"grad_norm": 3.8221256732940674,
"learning_rate": 0.00017967436376476855,
"loss": 1.182,
"step": 480
},
{
"epoch": 0.23162583518930957,
"grad_norm": 2.4394781589508057,
"learning_rate": 0.0001795800007707312,
"loss": 0.8758,
"step": 481
},
{
"epoch": 0.23210738578221873,
"grad_norm": 3.8338496685028076,
"learning_rate": 0.00017948544414133534,
"loss": 0.4596,
"step": 482
},
{
"epoch": 0.23258893637512792,
"grad_norm": 6.3291239738464355,
"learning_rate": 0.00017939069410665773,
"loss": 0.7862,
"step": 483
},
{
"epoch": 0.2330704869680371,
"grad_norm": 3.5340969562530518,
"learning_rate": 0.0001792957508972457,
"loss": 0.857,
"step": 484
},
{
"epoch": 0.23355203756094625,
"grad_norm": 3.2382187843322754,
"learning_rate": 0.00017920061474411658,
"loss": 0.4476,
"step": 485
},
{
"epoch": 0.23403358815385542,
"grad_norm": 2.6530380249023438,
"learning_rate": 0.00017910528587875729,
"loss": 0.7092,
"step": 486
},
{
"epoch": 0.23451513874676458,
"grad_norm": 2.1890785694122314,
"learning_rate": 0.00017900976453312352,
"loss": 0.6607,
"step": 487
},
{
"epoch": 0.23499668933967374,
"grad_norm": 3.7793309688568115,
"learning_rate": 0.00017891405093963938,
"loss": 0.805,
"step": 488
},
{
"epoch": 0.2354782399325829,
"grad_norm": 3.355741500854492,
"learning_rate": 0.00017881814533119675,
"loss": 1.1384,
"step": 489
},
{
"epoch": 0.2359597905254921,
"grad_norm": 4.145727157592773,
"learning_rate": 0.00017872204794115474,
"loss": 0.8834,
"step": 490
},
{
"epoch": 0.23644134111840126,
"grad_norm": 2.3569817543029785,
"learning_rate": 0.0001786257590033391,
"loss": 0.678,
"step": 491
},
{
"epoch": 0.23692289171131042,
"grad_norm": 3.5754706859588623,
"learning_rate": 0.00017852927875204163,
"loss": 1.1182,
"step": 492
},
{
"epoch": 0.2374044423042196,
"grad_norm": 4.857983112335205,
"learning_rate": 0.00017843260742201963,
"loss": 0.8987,
"step": 493
},
{
"epoch": 0.23788599289712875,
"grad_norm": 1.9549206495285034,
"learning_rate": 0.00017833574524849535,
"loss": 0.8174,
"step": 494
},
{
"epoch": 0.23836754349003791,
"grad_norm": 6.188207626342773,
"learning_rate": 0.00017823869246715553,
"loss": 1.0608,
"step": 495
},
{
"epoch": 0.23884909408294708,
"grad_norm": 5.574507713317871,
"learning_rate": 0.00017814144931415043,
"loss": 1.1371,
"step": 496
},
{
"epoch": 0.23933064467585627,
"grad_norm": 4.148524284362793,
"learning_rate": 0.0001780440160260938,
"loss": 1.213,
"step": 497
},
{
"epoch": 0.23981219526876543,
"grad_norm": 1.1119147539138794,
"learning_rate": 0.00017794639284006184,
"loss": 0.9442,
"step": 498
},
{
"epoch": 0.2402937458616746,
"grad_norm": 2.266169548034668,
"learning_rate": 0.0001778485799935929,
"loss": 0.8459,
"step": 499
},
{
"epoch": 0.24077529645458376,
"grad_norm": 3.1551809310913086,
"learning_rate": 0.00017775057772468679,
"loss": 1.3422,
"step": 500
},
{
"epoch": 0.24125684704749292,
"grad_norm": 1.7529497146606445,
"learning_rate": 0.00017765238627180424,
"loss": 0.5388,
"step": 501
},
{
"epoch": 0.2417383976404021,
"grad_norm": 1.8274682760238647,
"learning_rate": 0.00017755400587386632,
"loss": 0.679,
"step": 502
},
{
"epoch": 0.24221994823331125,
"grad_norm": 3.1747913360595703,
"learning_rate": 0.00017745543677025378,
"loss": 1.0639,
"step": 503
},
{
"epoch": 0.24270149882622044,
"grad_norm": 3.80859637260437,
"learning_rate": 0.00017735667920080661,
"loss": 0.9085,
"step": 504
},
{
"epoch": 0.2431830494191296,
"grad_norm": 3.507260322570801,
"learning_rate": 0.0001772577334058233,
"loss": 1.2649,
"step": 505
},
{
"epoch": 0.24366460001203877,
"grad_norm": 2.216651201248169,
"learning_rate": 0.00017715859962606043,
"loss": 0.6616,
"step": 506
},
{
"epoch": 0.24414615060494793,
"grad_norm": 2.3983209133148193,
"learning_rate": 0.00017705927810273187,
"loss": 0.5882,
"step": 507
},
{
"epoch": 0.2446277011978571,
"grad_norm": 2.6623141765594482,
"learning_rate": 0.00017695976907750844,
"loss": 0.5488,
"step": 508
},
{
"epoch": 0.24510925179076626,
"grad_norm": 2.3083388805389404,
"learning_rate": 0.00017686007279251706,
"loss": 0.5942,
"step": 509
},
{
"epoch": 0.24559080238367542,
"grad_norm": 2.294562578201294,
"learning_rate": 0.00017676018949034045,
"loss": 0.9633,
"step": 510
},
{
"epoch": 0.2460723529765846,
"grad_norm": 3.9046053886413574,
"learning_rate": 0.0001766601194140162,
"loss": 1.0614,
"step": 511
},
{
"epoch": 0.24655390356949378,
"grad_norm": 2.0208675861358643,
"learning_rate": 0.0001765598628070365,
"loss": 0.8143,
"step": 512
},
{
"epoch": 0.24703545416240294,
"grad_norm": 4.851849555969238,
"learning_rate": 0.00017645941991334732,
"loss": 1.0167,
"step": 513
},
{
"epoch": 0.2475170047553121,
"grad_norm": 2.7783102989196777,
"learning_rate": 0.00017635879097734804,
"loss": 0.9836,
"step": 514
},
{
"epoch": 0.24799855534822127,
"grad_norm": 3.420109510421753,
"learning_rate": 0.00017625797624389055,
"loss": 0.8772,
"step": 515
},
{
"epoch": 0.24848010594113043,
"grad_norm": 2.0368940830230713,
"learning_rate": 0.00017615697595827897,
"loss": 0.6702,
"step": 516
},
{
"epoch": 0.2489616565340396,
"grad_norm": 3.138749599456787,
"learning_rate": 0.0001760557903662688,
"loss": 0.8044,
"step": 517
},
{
"epoch": 0.24944320712694878,
"grad_norm": 3.4814579486846924,
"learning_rate": 0.00017595441971406648,
"loss": 1.1824,
"step": 518
},
{
"epoch": 0.24992475771985795,
"grad_norm": 2.961376190185547,
"learning_rate": 0.00017585286424832874,
"loss": 0.9286,
"step": 519
},
{
"epoch": 0.2504063083127671,
"grad_norm": 2.2734553813934326,
"learning_rate": 0.00017575112421616202,
"loss": 0.8274,
"step": 520
},
{
"epoch": 0.2508878589056763,
"grad_norm": 2.522198438644409,
"learning_rate": 0.0001756491998651218,
"loss": 0.8287,
"step": 521
},
{
"epoch": 0.25136940949858544,
"grad_norm": 3.1421101093292236,
"learning_rate": 0.0001755470914432121,
"loss": 1.0712,
"step": 522
},
{
"epoch": 0.2518509600914946,
"grad_norm": 2.7851064205169678,
"learning_rate": 0.0001754447991988848,
"loss": 0.7893,
"step": 523
},
{
"epoch": 0.25233251068440377,
"grad_norm": 3.5308754444122314,
"learning_rate": 0.00017534232338103903,
"loss": 1.0271,
"step": 524
},
{
"epoch": 0.25281406127731293,
"grad_norm": 2.5698599815368652,
"learning_rate": 0.0001752396642390207,
"loss": 1.2494,
"step": 525
},
{
"epoch": 0.2532956118702221,
"grad_norm": 3.077817440032959,
"learning_rate": 0.00017513682202262163,
"loss": 1.2176,
"step": 526
},
{
"epoch": 0.25377716246313126,
"grad_norm": 2.7685060501098633,
"learning_rate": 0.00017503379698207918,
"loss": 1.017,
"step": 527
},
{
"epoch": 0.2542587130560405,
"grad_norm": 1.3066401481628418,
"learning_rate": 0.00017493058936807562,
"loss": 0.9528,
"step": 528
},
{
"epoch": 0.25474026364894964,
"grad_norm": 2.54870343208313,
"learning_rate": 0.00017482719943173739,
"loss": 0.8665,
"step": 529
},
{
"epoch": 0.2552218142418588,
"grad_norm": 1.8381497859954834,
"learning_rate": 0.00017472362742463455,
"loss": 0.6502,
"step": 530
},
{
"epoch": 0.25570336483476797,
"grad_norm": 3.0634922981262207,
"learning_rate": 0.0001746198735987802,
"loss": 0.7654,
"step": 531
},
{
"epoch": 0.25618491542767713,
"grad_norm": 2.2630629539489746,
"learning_rate": 0.00017451593820662988,
"loss": 0.6992,
"step": 532
},
{
"epoch": 0.2566664660205863,
"grad_norm": 2.220201015472412,
"learning_rate": 0.00017441182150108086,
"loss": 1.0391,
"step": 533
},
{
"epoch": 0.25714801661349546,
"grad_norm": 1.3255223035812378,
"learning_rate": 0.0001743075237354716,
"loss": 1.1717,
"step": 534
},
{
"epoch": 0.2576295672064046,
"grad_norm": 0.7325202822685242,
"learning_rate": 0.00017420304516358113,
"loss": 0.6,
"step": 535
},
{
"epoch": 0.2581111177993138,
"grad_norm": 2.993429660797119,
"learning_rate": 0.00017409838603962843,
"loss": 0.8133,
"step": 536
},
{
"epoch": 0.25859266839222295,
"grad_norm": 1.611694097518921,
"learning_rate": 0.00017399354661827178,
"loss": 0.8367,
"step": 537
},
{
"epoch": 0.2590742189851321,
"grad_norm": 4.344441890716553,
"learning_rate": 0.00017388852715460819,
"loss": 0.7335,
"step": 538
},
{
"epoch": 0.2595557695780413,
"grad_norm": 2.118799924850464,
"learning_rate": 0.00017378332790417273,
"loss": 0.8942,
"step": 539
},
{
"epoch": 0.26003732017095044,
"grad_norm": 3.4612903594970703,
"learning_rate": 0.00017367794912293794,
"loss": 0.997,
"step": 540
},
{
"epoch": 0.26051887076385966,
"grad_norm": 2.6911776065826416,
"learning_rate": 0.00017357239106731317,
"loss": 0.59,
"step": 541
},
{
"epoch": 0.2610004213567688,
"grad_norm": 3.0154871940612793,
"learning_rate": 0.00017346665399414405,
"loss": 0.8628,
"step": 542
},
{
"epoch": 0.261481971949678,
"grad_norm": 2.9015610218048096,
"learning_rate": 0.00017336073816071168,
"loss": 0.7398,
"step": 543
},
{
"epoch": 0.26196352254258715,
"grad_norm": 3.7456789016723633,
"learning_rate": 0.00017325464382473226,
"loss": 1.2309,
"step": 544
},
{
"epoch": 0.2624450731354963,
"grad_norm": 3.10002064704895,
"learning_rate": 0.00017314837124435622,
"loss": 0.8035,
"step": 545
},
{
"epoch": 0.2629266237284055,
"grad_norm": 3.767289876937866,
"learning_rate": 0.00017304192067816782,
"loss": 0.9462,
"step": 546
},
{
"epoch": 0.26340817432131464,
"grad_norm": 1.6517798900604248,
"learning_rate": 0.00017293529238518422,
"loss": 0.7511,
"step": 547
},
{
"epoch": 0.2638897249142238,
"grad_norm": 3.3719449043273926,
"learning_rate": 0.0001728284866248552,
"loss": 0.8092,
"step": 548
},
{
"epoch": 0.26437127550713296,
"grad_norm": 2.3722736835479736,
"learning_rate": 0.00017272150365706224,
"loss": 0.8951,
"step": 549
},
{
"epoch": 0.2648528261000421,
"grad_norm": 4.863512992858887,
"learning_rate": 0.00017261434374211802,
"loss": 0.6823,
"step": 550
},
{
"epoch": 0.2653343766929513,
"grad_norm": 2.9136412143707275,
"learning_rate": 0.00017250700714076586,
"loss": 0.7394,
"step": 551
},
{
"epoch": 0.26581592728586045,
"grad_norm": 1.59123957157135,
"learning_rate": 0.00017239949411417888,
"loss": 0.9012,
"step": 552
},
{
"epoch": 0.2662974778787696,
"grad_norm": 3.566239356994629,
"learning_rate": 0.0001722918049239596,
"loss": 1.3038,
"step": 553
},
{
"epoch": 0.2667790284716788,
"grad_norm": 1.6607437133789062,
"learning_rate": 0.00017218393983213902,
"loss": 0.5825,
"step": 554
},
{
"epoch": 0.267260579064588,
"grad_norm": 2.2083866596221924,
"learning_rate": 0.00017207589910117634,
"loss": 0.6033,
"step": 555
},
{
"epoch": 0.26774212965749716,
"grad_norm": 6.2903523445129395,
"learning_rate": 0.00017196768299395797,
"loss": 0.9105,
"step": 556
},
{
"epoch": 0.2682236802504063,
"grad_norm": 1.9703174829483032,
"learning_rate": 0.00017185929177379714,
"loss": 0.7654,
"step": 557
},
{
"epoch": 0.2687052308433155,
"grad_norm": 3.087989568710327,
"learning_rate": 0.00017175072570443312,
"loss": 0.9272,
"step": 558
},
{
"epoch": 0.26918678143622465,
"grad_norm": 4.169714450836182,
"learning_rate": 0.00017164198505003066,
"loss": 0.6082,
"step": 559
},
{
"epoch": 0.2696683320291338,
"grad_norm": 2.0831711292266846,
"learning_rate": 0.0001715330700751793,
"loss": 0.5273,
"step": 560
},
{
"epoch": 0.270149882622043,
"grad_norm": 2.3498497009277344,
"learning_rate": 0.00017142398104489273,
"loss": 0.6253,
"step": 561
},
{
"epoch": 0.27063143321495214,
"grad_norm": 3.0425565242767334,
"learning_rate": 0.00017131471822460814,
"loss": 0.8767,
"step": 562
},
{
"epoch": 0.2711129838078613,
"grad_norm": 3.1900765895843506,
"learning_rate": 0.00017120528188018565,
"loss": 0.8782,
"step": 563
},
{
"epoch": 0.27159453440077047,
"grad_norm": 4.279084205627441,
"learning_rate": 0.00017109567227790754,
"loss": 0.5321,
"step": 564
},
{
"epoch": 0.27207608499367963,
"grad_norm": 1.9123119115829468,
"learning_rate": 0.00017098588968447766,
"loss": 1.0175,
"step": 565
},
{
"epoch": 0.2725576355865888,
"grad_norm": 3.689532995223999,
"learning_rate": 0.00017087593436702084,
"loss": 0.9917,
"step": 566
},
{
"epoch": 0.27303918617949796,
"grad_norm": 2.9351844787597656,
"learning_rate": 0.00017076580659308222,
"loss": 1.1911,
"step": 567
},
{
"epoch": 0.2735207367724071,
"grad_norm": 1.8407542705535889,
"learning_rate": 0.00017065550663062634,
"loss": 0.6958,
"step": 568
},
{
"epoch": 0.27400228736531634,
"grad_norm": 3.2096939086914062,
"learning_rate": 0.00017054503474803702,
"loss": 1.0163,
"step": 569
},
{
"epoch": 0.2744838379582255,
"grad_norm": 4.783024311065674,
"learning_rate": 0.00017043439121411618,
"loss": 0.9086,
"step": 570
},
{
"epoch": 0.27496538855113467,
"grad_norm": 3.336801290512085,
"learning_rate": 0.0001703235762980835,
"loss": 0.6395,
"step": 571
},
{
"epoch": 0.27544693914404383,
"grad_norm": 6.958205223083496,
"learning_rate": 0.00017021259026957567,
"loss": 1.0467,
"step": 572
},
{
"epoch": 0.275928489736953,
"grad_norm": 3.0236618518829346,
"learning_rate": 0.00017010143339864562,
"loss": 0.8194,
"step": 573
},
{
"epoch": 0.27641004032986216,
"grad_norm": 2.2934484481811523,
"learning_rate": 0.0001699901059557621,
"loss": 0.7926,
"step": 574
},
{
"epoch": 0.2768915909227713,
"grad_norm": 3.157127618789673,
"learning_rate": 0.00016987860821180895,
"loss": 0.849,
"step": 575
},
{
"epoch": 0.2773731415156805,
"grad_norm": 2.1495730876922607,
"learning_rate": 0.00016976694043808416,
"loss": 0.9138,
"step": 576
},
{
"epoch": 0.27785469210858965,
"grad_norm": 2.7366628646850586,
"learning_rate": 0.00016965510290629972,
"loss": 0.7798,
"step": 577
},
{
"epoch": 0.2783362427014988,
"grad_norm": 1.7467342615127563,
"learning_rate": 0.00016954309588858044,
"loss": 0.5792,
"step": 578
},
{
"epoch": 0.278817793294408,
"grad_norm": 2.1908276081085205,
"learning_rate": 0.00016943091965746366,
"loss": 0.6596,
"step": 579
},
{
"epoch": 0.27929934388731714,
"grad_norm": 4.674194812774658,
"learning_rate": 0.00016931857448589845,
"loss": 0.9044,
"step": 580
},
{
"epoch": 0.2797808944802263,
"grad_norm": 1.4668383598327637,
"learning_rate": 0.00016920606064724488,
"loss": 0.5977,
"step": 581
},
{
"epoch": 0.28026244507313547,
"grad_norm": 1.8525112867355347,
"learning_rate": 0.00016909337841527344,
"loss": 0.7542,
"step": 582
},
{
"epoch": 0.2807439956660447,
"grad_norm": 2.2113821506500244,
"learning_rate": 0.00016898052806416444,
"loss": 0.8215,
"step": 583
},
{
"epoch": 0.28122554625895385,
"grad_norm": 3.6429834365844727,
"learning_rate": 0.00016886750986850718,
"loss": 1.0825,
"step": 584
},
{
"epoch": 0.281707096851863,
"grad_norm": 2.043597459793091,
"learning_rate": 0.00016875432410329934,
"loss": 0.5168,
"step": 585
},
{
"epoch": 0.2821886474447722,
"grad_norm": 3.5646705627441406,
"learning_rate": 0.0001686409710439464,
"loss": 0.8117,
"step": 586
},
{
"epoch": 0.28267019803768134,
"grad_norm": 2.080101251602173,
"learning_rate": 0.00016852745096626088,
"loss": 1.0391,
"step": 587
},
{
"epoch": 0.2831517486305905,
"grad_norm": 4.150976181030273,
"learning_rate": 0.0001684137641464617,
"loss": 1.3853,
"step": 588
},
{
"epoch": 0.28363329922349967,
"grad_norm": 3.107977867126465,
"learning_rate": 0.0001682999108611735,
"loss": 0.6732,
"step": 589
},
{
"epoch": 0.28411484981640883,
"grad_norm": 1.3921109437942505,
"learning_rate": 0.00016818589138742587,
"loss": 0.5883,
"step": 590
},
{
"epoch": 0.284596400409318,
"grad_norm": 3.5491769313812256,
"learning_rate": 0.00016807170600265296,
"loss": 0.7489,
"step": 591
},
{
"epoch": 0.28507795100222716,
"grad_norm": 2.4986155033111572,
"learning_rate": 0.00016795735498469246,
"loss": 0.9951,
"step": 592
},
{
"epoch": 0.2855595015951363,
"grad_norm": 2.9514119625091553,
"learning_rate": 0.00016784283861178513,
"loss": 0.9067,
"step": 593
},
{
"epoch": 0.2860410521880455,
"grad_norm": 2.5975046157836914,
"learning_rate": 0.00016772815716257412,
"loss": 0.6998,
"step": 594
},
{
"epoch": 0.28652260278095465,
"grad_norm": 3.2922167778015137,
"learning_rate": 0.00016761331091610416,
"loss": 1.1803,
"step": 595
},
{
"epoch": 0.28700415337386387,
"grad_norm": 2.982027053833008,
"learning_rate": 0.00016749830015182107,
"loss": 0.9353,
"step": 596
},
{
"epoch": 0.28748570396677303,
"grad_norm": 1.3692221641540527,
"learning_rate": 0.00016738312514957086,
"loss": 0.5095,
"step": 597
},
{
"epoch": 0.2879672545596822,
"grad_norm": 2.186056613922119,
"learning_rate": 0.00016726778618959926,
"loss": 0.8501,
"step": 598
},
{
"epoch": 0.28844880515259136,
"grad_norm": 1.2103948593139648,
"learning_rate": 0.00016715228355255093,
"loss": 0.5927,
"step": 599
},
{
"epoch": 0.2889303557455005,
"grad_norm": 2.340742349624634,
"learning_rate": 0.00016703661751946874,
"loss": 0.7416,
"step": 600
},
{
"epoch": 0.2894119063384097,
"grad_norm": 3.421393871307373,
"learning_rate": 0.00016692078837179318,
"loss": 1.001,
"step": 601
},
{
"epoch": 0.28989345693131885,
"grad_norm": 2.6955082416534424,
"learning_rate": 0.00016680479639136163,
"loss": 0.7825,
"step": 602
},
{
"epoch": 0.290375007524228,
"grad_norm": 2.7069554328918457,
"learning_rate": 0.0001666886418604077,
"loss": 0.5969,
"step": 603
},
{
"epoch": 0.2908565581171372,
"grad_norm": 2.855012893676758,
"learning_rate": 0.0001665723250615604,
"loss": 0.6626,
"step": 604
},
{
"epoch": 0.29133810871004634,
"grad_norm": 3.311091423034668,
"learning_rate": 0.00016645584627784381,
"loss": 1.0357,
"step": 605
},
{
"epoch": 0.2918196593029555,
"grad_norm": 3.6477015018463135,
"learning_rate": 0.0001663392057926759,
"loss": 0.8369,
"step": 606
},
{
"epoch": 0.29230120989586467,
"grad_norm": 4.4921345710754395,
"learning_rate": 0.00016622240388986824,
"loss": 0.6446,
"step": 607
},
{
"epoch": 0.29278276048877383,
"grad_norm": 7.8272013664245605,
"learning_rate": 0.0001661054408536251,
"loss": 1.0073,
"step": 608
},
{
"epoch": 0.293264311081683,
"grad_norm": 3.6183552742004395,
"learning_rate": 0.00016598831696854288,
"loss": 1.0384,
"step": 609
},
{
"epoch": 0.2937458616745922,
"grad_norm": 3.8852734565734863,
"learning_rate": 0.00016587103251960937,
"loss": 0.9137,
"step": 610
},
{
"epoch": 0.2942274122675014,
"grad_norm": 4.892505645751953,
"learning_rate": 0.00016575358779220294,
"loss": 0.8421,
"step": 611
},
{
"epoch": 0.29470896286041054,
"grad_norm": 5.816843032836914,
"learning_rate": 0.00016563598307209204,
"loss": 0.9884,
"step": 612
},
{
"epoch": 0.2951905134533197,
"grad_norm": 3.2219488620758057,
"learning_rate": 0.0001655182186454344,
"loss": 0.549,
"step": 613
},
{
"epoch": 0.29567206404622887,
"grad_norm": 1.8293180465698242,
"learning_rate": 0.00016540029479877638,
"loss": 1.0032,
"step": 614
},
{
"epoch": 0.29615361463913803,
"grad_norm": 2.564122200012207,
"learning_rate": 0.00016528221181905217,
"loss": 0.8074,
"step": 615
},
{
"epoch": 0.2966351652320472,
"grad_norm": 2.1164186000823975,
"learning_rate": 0.00016516396999358322,
"loss": 0.7476,
"step": 616
},
{
"epoch": 0.29711671582495636,
"grad_norm": 1.6424773931503296,
"learning_rate": 0.00016504556961007748,
"loss": 0.505,
"step": 617
},
{
"epoch": 0.2975982664178655,
"grad_norm": 2.669262170791626,
"learning_rate": 0.00016492701095662866,
"loss": 0.5681,
"step": 618
},
{
"epoch": 0.2980798170107747,
"grad_norm": 3.94612455368042,
"learning_rate": 0.00016480829432171564,
"loss": 1.0503,
"step": 619
},
{
"epoch": 0.29856136760368385,
"grad_norm": 1.6237881183624268,
"learning_rate": 0.0001646894199942017,
"loss": 0.7969,
"step": 620
},
{
"epoch": 0.299042918196593,
"grad_norm": 1.9715096950531006,
"learning_rate": 0.0001645703882633338,
"loss": 0.9146,
"step": 621
},
{
"epoch": 0.2995244687895022,
"grad_norm": 4.666619300842285,
"learning_rate": 0.00016445119941874183,
"loss": 0.4521,
"step": 622
},
{
"epoch": 0.30000601938241134,
"grad_norm": 5.420642375946045,
"learning_rate": 0.00016433185375043809,
"loss": 0.8339,
"step": 623
},
{
"epoch": 0.30048756997532056,
"grad_norm": 3.869925022125244,
"learning_rate": 0.00016421235154881638,
"loss": 0.9278,
"step": 624
},
{
"epoch": 0.3009691205682297,
"grad_norm": 2.435767889022827,
"learning_rate": 0.00016409269310465146,
"loss": 0.4431,
"step": 625
},
{
"epoch": 0.3014506711611389,
"grad_norm": 3.733633041381836,
"learning_rate": 0.00016397287870909813,
"loss": 1.1122,
"step": 626
},
{
"epoch": 0.30193222175404805,
"grad_norm": 2.7327606678009033,
"learning_rate": 0.00016385290865369079,
"loss": 1.8192,
"step": 627
},
{
"epoch": 0.3024137723469572,
"grad_norm": 2.777545213699341,
"learning_rate": 0.00016373278323034255,
"loss": 0.762,
"step": 628
},
{
"epoch": 0.3028953229398664,
"grad_norm": 3.036078691482544,
"learning_rate": 0.0001636125027313445,
"loss": 0.9722,
"step": 629
},
{
"epoch": 0.30337687353277554,
"grad_norm": 1.9548509120941162,
"learning_rate": 0.00016349206744936518,
"loss": 0.909,
"step": 630
},
{
"epoch": 0.3038584241256847,
"grad_norm": 1.862438678741455,
"learning_rate": 0.00016337147767744967,
"loss": 0.9632,
"step": 631
},
{
"epoch": 0.30433997471859386,
"grad_norm": 3.5347414016723633,
"learning_rate": 0.0001632507337090189,
"loss": 0.6586,
"step": 632
},
{
"epoch": 0.304821525311503,
"grad_norm": 3.137190818786621,
"learning_rate": 0.0001631298358378692,
"loss": 0.8476,
"step": 633
},
{
"epoch": 0.3053030759044122,
"grad_norm": 4.065056800842285,
"learning_rate": 0.00016300878435817113,
"loss": 0.9919,
"step": 634
},
{
"epoch": 0.30578462649732135,
"grad_norm": 3.0076253414154053,
"learning_rate": 0.00016288757956446918,
"loss": 0.8693,
"step": 635
},
{
"epoch": 0.3062661770902305,
"grad_norm": 4.386903762817383,
"learning_rate": 0.00016276622175168083,
"loss": 0.5319,
"step": 636
},
{
"epoch": 0.30674772768313974,
"grad_norm": 2.739208459854126,
"learning_rate": 0.0001626447112150959,
"loss": 1.0968,
"step": 637
},
{
"epoch": 0.3072292782760489,
"grad_norm": 2.428248882293701,
"learning_rate": 0.00016252304825037576,
"loss": 0.9611,
"step": 638
},
{
"epoch": 0.30771082886895806,
"grad_norm": 4.173031330108643,
"learning_rate": 0.0001624012331535528,
"loss": 1.0045,
"step": 639
},
{
"epoch": 0.3081923794618672,
"grad_norm": 2.0858750343322754,
"learning_rate": 0.00016227926622102947,
"loss": 0.4524,
"step": 640
},
{
"epoch": 0.3086739300547764,
"grad_norm": 3.544163703918457,
"learning_rate": 0.00016215714774957772,
"loss": 1.0543,
"step": 641
},
{
"epoch": 0.30915548064768555,
"grad_norm": 5.18956995010376,
"learning_rate": 0.00016203487803633822,
"loss": 1.7808,
"step": 642
},
{
"epoch": 0.3096370312405947,
"grad_norm": 4.177835464477539,
"learning_rate": 0.00016191245737881956,
"loss": 0.6678,
"step": 643
},
{
"epoch": 0.3101185818335039,
"grad_norm": 2.31697678565979,
"learning_rate": 0.00016178988607489777,
"loss": 0.8606,
"step": 644
},
{
"epoch": 0.31060013242641304,
"grad_norm": 3.908705949783325,
"learning_rate": 0.00016166716442281528,
"loss": 0.9769,
"step": 645
},
{
"epoch": 0.3110816830193222,
"grad_norm": 3.6292724609375,
"learning_rate": 0.0001615442927211805,
"loss": 0.7383,
"step": 646
},
{
"epoch": 0.31156323361223137,
"grad_norm": 1.9753597974777222,
"learning_rate": 0.0001614212712689668,
"loss": 0.6893,
"step": 647
},
{
"epoch": 0.31204478420514054,
"grad_norm": 2.6053380966186523,
"learning_rate": 0.00016129810036551198,
"loss": 0.8903,
"step": 648
},
{
"epoch": 0.3125263347980497,
"grad_norm": 4.439826965332031,
"learning_rate": 0.00016117478031051755,
"loss": 0.6233,
"step": 649
},
{
"epoch": 0.31300788539095886,
"grad_norm": 2.590153694152832,
"learning_rate": 0.00016105131140404787,
"loss": 0.8171,
"step": 650
},
{
"epoch": 0.3134894359838681,
"grad_norm": 2.094128370285034,
"learning_rate": 0.00016092769394652947,
"loss": 0.9461,
"step": 651
},
{
"epoch": 0.31397098657677724,
"grad_norm": 2.2599637508392334,
"learning_rate": 0.0001608039282387504,
"loss": 0.7661,
"step": 652
},
{
"epoch": 0.3144525371696864,
"grad_norm": 2.008302927017212,
"learning_rate": 0.00016068001458185936,
"loss": 0.7038,
"step": 653
},
{
"epoch": 0.31493408776259557,
"grad_norm": 1.2377398014068604,
"learning_rate": 0.0001605559532773651,
"loss": 0.7293,
"step": 654
},
{
"epoch": 0.31541563835550473,
"grad_norm": 1.3407477140426636,
"learning_rate": 0.00016043174462713566,
"loss": 0.4857,
"step": 655
},
{
"epoch": 0.3158971889484139,
"grad_norm": 1.853411078453064,
"learning_rate": 0.00016030738893339753,
"loss": 0.8002,
"step": 656
},
{
"epoch": 0.31637873954132306,
"grad_norm": 2.6167731285095215,
"learning_rate": 0.00016018288649873497,
"loss": 0.8817,
"step": 657
},
{
"epoch": 0.3168602901342322,
"grad_norm": 2.0071139335632324,
"learning_rate": 0.0001600582376260894,
"loss": 0.7619,
"step": 658
},
{
"epoch": 0.3173418407271414,
"grad_norm": 1.7096420526504517,
"learning_rate": 0.00015993344261875847,
"loss": 0.3737,
"step": 659
},
{
"epoch": 0.31782339132005055,
"grad_norm": 1.3771498203277588,
"learning_rate": 0.00015980850178039547,
"loss": 0.8174,
"step": 660
},
{
"epoch": 0.3183049419129597,
"grad_norm": 1.803589105606079,
"learning_rate": 0.00015968341541500842,
"loss": 0.886,
"step": 661
},
{
"epoch": 0.3187864925058689,
"grad_norm": 2.595926523208618,
"learning_rate": 0.00015955818382695953,
"loss": 0.6081,
"step": 662
},
{
"epoch": 0.31926804309877804,
"grad_norm": 3.571012496948242,
"learning_rate": 0.00015943280732096438,
"loss": 0.8304,
"step": 663
},
{
"epoch": 0.3197495936916872,
"grad_norm": 3.7733047008514404,
"learning_rate": 0.00015930728620209113,
"loss": 0.9931,
"step": 664
},
{
"epoch": 0.3202311442845964,
"grad_norm": 2.0570201873779297,
"learning_rate": 0.00015918162077575976,
"loss": 0.6084,
"step": 665
},
{
"epoch": 0.3207126948775056,
"grad_norm": 2.1481802463531494,
"learning_rate": 0.00015905581134774153,
"loss": 0.637,
"step": 666
},
{
"epoch": 0.32119424547041475,
"grad_norm": 2.172410488128662,
"learning_rate": 0.0001589298582241579,
"loss": 0.8891,
"step": 667
},
{
"epoch": 0.3216757960633239,
"grad_norm": 1.3450566530227661,
"learning_rate": 0.00015880376171148014,
"loss": 0.743,
"step": 668
},
{
"epoch": 0.3221573466562331,
"grad_norm": 2.8985812664031982,
"learning_rate": 0.00015867752211652831,
"loss": 1.0136,
"step": 669
},
{
"epoch": 0.32263889724914224,
"grad_norm": 2.9778859615325928,
"learning_rate": 0.00015855113974647068,
"loss": 1.1762,
"step": 670
},
{
"epoch": 0.3231204478420514,
"grad_norm": 2.839142322540283,
"learning_rate": 0.0001584246149088229,
"loss": 0.8616,
"step": 671
},
{
"epoch": 0.32360199843496057,
"grad_norm": 2.5461061000823975,
"learning_rate": 0.0001582979479114472,
"loss": 0.883,
"step": 672
},
{
"epoch": 0.32408354902786973,
"grad_norm": 2.5526607036590576,
"learning_rate": 0.0001581711390625519,
"loss": 1.0749,
"step": 673
},
{
"epoch": 0.3245650996207789,
"grad_norm": 1.8686296939849854,
"learning_rate": 0.0001580441886706903,
"loss": 0.5283,
"step": 674
},
{
"epoch": 0.32504665021368806,
"grad_norm": 3.071974277496338,
"learning_rate": 0.00015791709704476015,
"loss": 0.8488,
"step": 675
},
{
"epoch": 0.3255282008065972,
"grad_norm": 2.5691075325012207,
"learning_rate": 0.00015778986449400292,
"loss": 0.6839,
"step": 676
},
{
"epoch": 0.3260097513995064,
"grad_norm": 3.5297348499298096,
"learning_rate": 0.00015766249132800292,
"loss": 0.7555,
"step": 677
},
{
"epoch": 0.3264913019924156,
"grad_norm": 1.9047622680664062,
"learning_rate": 0.00015753497785668663,
"loss": 0.8286,
"step": 678
},
{
"epoch": 0.32697285258532477,
"grad_norm": 1.899084210395813,
"learning_rate": 0.00015740732439032187,
"loss": 0.8663,
"step": 679
},
{
"epoch": 0.32745440317823393,
"grad_norm": 1.7221845388412476,
"learning_rate": 0.00015727953123951716,
"loss": 1.0307,
"step": 680
},
{
"epoch": 0.3279359537711431,
"grad_norm": 2.8478286266326904,
"learning_rate": 0.00015715159871522086,
"loss": 0.9206,
"step": 681
},
{
"epoch": 0.32841750436405226,
"grad_norm": 2.7343266010284424,
"learning_rate": 0.00015702352712872056,
"loss": 0.7745,
"step": 682
},
{
"epoch": 0.3288990549569614,
"grad_norm": 2.272082567214966,
"learning_rate": 0.00015689531679164204,
"loss": 0.7247,
"step": 683
},
{
"epoch": 0.3293806055498706,
"grad_norm": 3.404125690460205,
"learning_rate": 0.00015676696801594886,
"loss": 1.185,
"step": 684
},
{
"epoch": 0.32986215614277975,
"grad_norm": 2.005213499069214,
"learning_rate": 0.00015663848111394132,
"loss": 1.0382,
"step": 685
},
{
"epoch": 0.3303437067356889,
"grad_norm": 3.435062885284424,
"learning_rate": 0.00015650985639825585,
"loss": 0.937,
"step": 686
},
{
"epoch": 0.3308252573285981,
"grad_norm": 2.289677858352661,
"learning_rate": 0.00015638109418186424,
"loss": 0.5748,
"step": 687
},
{
"epoch": 0.33130680792150724,
"grad_norm": 3.6423025131225586,
"learning_rate": 0.00015625219477807277,
"loss": 0.9767,
"step": 688
},
{
"epoch": 0.3317883585144164,
"grad_norm": 2.872910737991333,
"learning_rate": 0.00015612315850052166,
"loss": 0.6958,
"step": 689
},
{
"epoch": 0.33226990910732557,
"grad_norm": 3.16129469871521,
"learning_rate": 0.00015599398566318396,
"loss": 1.0489,
"step": 690
},
{
"epoch": 0.33275145970023473,
"grad_norm": 2.4782161712646484,
"learning_rate": 0.00015586467658036524,
"loss": 0.6644,
"step": 691
},
{
"epoch": 0.33323301029314395,
"grad_norm": 4.447420597076416,
"learning_rate": 0.00015573523156670244,
"loss": 1.2536,
"step": 692
},
{
"epoch": 0.3337145608860531,
"grad_norm": 4.194264888763428,
"learning_rate": 0.0001556056509371633,
"loss": 0.9997,
"step": 693
},
{
"epoch": 0.3341961114789623,
"grad_norm": 3.0863115787506104,
"learning_rate": 0.00015547593500704547,
"loss": 0.9827,
"step": 694
},
{
"epoch": 0.33467766207187144,
"grad_norm": 7.232437610626221,
"learning_rate": 0.00015534608409197592,
"loss": 0.5336,
"step": 695
},
{
"epoch": 0.3351592126647806,
"grad_norm": 2.40484881401062,
"learning_rate": 0.00015521609850791004,
"loss": 0.4391,
"step": 696
},
{
"epoch": 0.33564076325768977,
"grad_norm": 2.6389102935791016,
"learning_rate": 0.0001550859785711308,
"loss": 0.8676,
"step": 697
},
{
"epoch": 0.33612231385059893,
"grad_norm": 1.548851490020752,
"learning_rate": 0.0001549557245982482,
"loss": 0.8892,
"step": 698
},
{
"epoch": 0.3366038644435081,
"grad_norm": 1.7563083171844482,
"learning_rate": 0.00015482533690619837,
"loss": 0.755,
"step": 699
},
{
"epoch": 0.33708541503641726,
"grad_norm": 4.792996883392334,
"learning_rate": 0.00015469481581224272,
"loss": 0.6721,
"step": 700
},
{
"epoch": 0.3375669656293264,
"grad_norm": 2.4070699214935303,
"learning_rate": 0.0001545641616339673,
"loss": 0.8127,
"step": 701
},
{
"epoch": 0.3380485162222356,
"grad_norm": 3.3054072856903076,
"learning_rate": 0.00015443337468928206,
"loss": 0.5389,
"step": 702
},
{
"epoch": 0.33853006681514475,
"grad_norm": 2.826061725616455,
"learning_rate": 0.00015430245529641986,
"loss": 0.58,
"step": 703
},
{
"epoch": 0.3390116174080539,
"grad_norm": 2.2573275566101074,
"learning_rate": 0.00015417140377393596,
"loss": 0.9465,
"step": 704
},
{
"epoch": 0.3394931680009631,
"grad_norm": 2.5977699756622314,
"learning_rate": 0.00015404022044070704,
"loss": 0.707,
"step": 705
},
{
"epoch": 0.3399747185938723,
"grad_norm": 4.191378116607666,
"learning_rate": 0.00015390890561593052,
"loss": 0.8705,
"step": 706
},
{
"epoch": 0.34045626918678146,
"grad_norm": 2.229558229446411,
"learning_rate": 0.0001537774596191238,
"loss": 0.6722,
"step": 707
},
{
"epoch": 0.3409378197796906,
"grad_norm": 3.5231106281280518,
"learning_rate": 0.00015364588277012344,
"loss": 0.6765,
"step": 708
},
{
"epoch": 0.3414193703725998,
"grad_norm": 2.406374216079712,
"learning_rate": 0.00015351417538908435,
"loss": 1.1367,
"step": 709
},
{
"epoch": 0.34190092096550895,
"grad_norm": 2.4399731159210205,
"learning_rate": 0.0001533823377964791,
"loss": 0.8311,
"step": 710
},
{
"epoch": 0.3423824715584181,
"grad_norm": 3.170764207839966,
"learning_rate": 0.00015325037031309704,
"loss": 1.1677,
"step": 711
},
{
"epoch": 0.3428640221513273,
"grad_norm": 2.4215619564056396,
"learning_rate": 0.00015311827326004363,
"loss": 0.7897,
"step": 712
},
{
"epoch": 0.34334557274423644,
"grad_norm": 2.092327117919922,
"learning_rate": 0.0001529860469587396,
"loss": 0.6389,
"step": 713
},
{
"epoch": 0.3438271233371456,
"grad_norm": 2.5935378074645996,
"learning_rate": 0.00015285369173092015,
"loss": 0.6437,
"step": 714
},
{
"epoch": 0.34430867393005476,
"grad_norm": 2.7213246822357178,
"learning_rate": 0.00015272120789863413,
"loss": 1.2337,
"step": 715
},
{
"epoch": 0.34479022452296393,
"grad_norm": 2.3775582313537598,
"learning_rate": 0.00015258859578424342,
"loss": 0.9354,
"step": 716
},
{
"epoch": 0.3452717751158731,
"grad_norm": 2.3401012420654297,
"learning_rate": 0.00015245585571042194,
"loss": 0.573,
"step": 717
},
{
"epoch": 0.34575332570878226,
"grad_norm": 2.1838347911834717,
"learning_rate": 0.00015232298800015506,
"loss": 0.5484,
"step": 718
},
{
"epoch": 0.3462348763016914,
"grad_norm": 2.1311140060424805,
"learning_rate": 0.00015218999297673862,
"loss": 0.695,
"step": 719
},
{
"epoch": 0.34671642689460064,
"grad_norm": 3.2175323963165283,
"learning_rate": 0.0001520568709637783,
"loss": 0.8665,
"step": 720
},
{
"epoch": 0.3471979774875098,
"grad_norm": 2.6574606895446777,
"learning_rate": 0.00015192362228518875,
"loss": 0.6068,
"step": 721
},
{
"epoch": 0.34767952808041896,
"grad_norm": 3.131312847137451,
"learning_rate": 0.00015179024726519284,
"loss": 1.0317,
"step": 722
},
{
"epoch": 0.34816107867332813,
"grad_norm": 2.886371612548828,
"learning_rate": 0.00015165674622832085,
"loss": 0.6881,
"step": 723
},
{
"epoch": 0.3486426292662373,
"grad_norm": 2.9966607093811035,
"learning_rate": 0.0001515231194994097,
"loss": 1.6059,
"step": 724
},
{
"epoch": 0.34912417985914646,
"grad_norm": 5.8844404220581055,
"learning_rate": 0.00015138936740360207,
"loss": 0.8733,
"step": 725
},
{
"epoch": 0.3496057304520556,
"grad_norm": 2.432682752609253,
"learning_rate": 0.00015125549026634585,
"loss": 0.4045,
"step": 726
},
{
"epoch": 0.3500872810449648,
"grad_norm": 2.9346506595611572,
"learning_rate": 0.00015112148841339295,
"loss": 0.6577,
"step": 727
},
{
"epoch": 0.35056883163787395,
"grad_norm": 3.6856017112731934,
"learning_rate": 0.000150987362170799,
"loss": 0.7203,
"step": 728
},
{
"epoch": 0.3510503822307831,
"grad_norm": 3.772768974304199,
"learning_rate": 0.00015085311186492206,
"loss": 0.961,
"step": 729
},
{
"epoch": 0.3515319328236923,
"grad_norm": 2.937117576599121,
"learning_rate": 0.00015071873782242223,
"loss": 0.5519,
"step": 730
},
{
"epoch": 0.35201348341660144,
"grad_norm": 3.9652099609375,
"learning_rate": 0.0001505842403702606,
"loss": 0.9024,
"step": 731
},
{
"epoch": 0.3524950340095106,
"grad_norm": 2.1614396572113037,
"learning_rate": 0.00015044961983569856,
"loss": 0.6737,
"step": 732
},
{
"epoch": 0.3529765846024198,
"grad_norm": 2.625931978225708,
"learning_rate": 0.00015031487654629702,
"loss": 0.6265,
"step": 733
},
{
"epoch": 0.353458135195329,
"grad_norm": 3.378445863723755,
"learning_rate": 0.00015018001082991553,
"loss": 0.6916,
"step": 734
},
{
"epoch": 0.35393968578823815,
"grad_norm": 1.6671521663665771,
"learning_rate": 0.0001500450230147116,
"loss": 0.5809,
"step": 735
},
{
"epoch": 0.3544212363811473,
"grad_norm": 2.095771074295044,
"learning_rate": 0.00014990991342913974,
"loss": 1.0634,
"step": 736
},
{
"epoch": 0.35490278697405647,
"grad_norm": 2.0476694107055664,
"learning_rate": 0.00014977468240195084,
"loss": 0.7652,
"step": 737
},
{
"epoch": 0.35538433756696564,
"grad_norm": 2.9106428623199463,
"learning_rate": 0.0001496393302621912,
"loss": 1.1553,
"step": 738
},
{
"epoch": 0.3558658881598748,
"grad_norm": 2.1478304862976074,
"learning_rate": 0.00014950385733920188,
"loss": 0.7608,
"step": 739
},
{
"epoch": 0.35634743875278396,
"grad_norm": 1.8038551807403564,
"learning_rate": 0.00014936826396261783,
"loss": 0.6694,
"step": 740
},
{
"epoch": 0.3568289893456931,
"grad_norm": 3.3769569396972656,
"learning_rate": 0.00014923255046236705,
"loss": 1.2689,
"step": 741
},
{
"epoch": 0.3573105399386023,
"grad_norm": 4.4860334396362305,
"learning_rate": 0.00014909671716866984,
"loss": 0.852,
"step": 742
},
{
"epoch": 0.35779209053151145,
"grad_norm": 4.017233371734619,
"learning_rate": 0.00014896076441203802,
"loss": 0.8332,
"step": 743
},
{
"epoch": 0.3582736411244206,
"grad_norm": 2.824586868286133,
"learning_rate": 0.000148824692523274,
"loss": 1.215,
"step": 744
},
{
"epoch": 0.3587551917173298,
"grad_norm": 3.6129872798919678,
"learning_rate": 0.0001486885018334702,
"loss": 1.1055,
"step": 745
},
{
"epoch": 0.35923674231023894,
"grad_norm": 3.1985294818878174,
"learning_rate": 0.00014855219267400797,
"loss": 0.8963,
"step": 746
},
{
"epoch": 0.35971829290314816,
"grad_norm": 3.0053601264953613,
"learning_rate": 0.00014841576537655705,
"loss": 0.8728,
"step": 747
},
{
"epoch": 0.3601998434960573,
"grad_norm": 2.2497479915618896,
"learning_rate": 0.00014827922027307451,
"loss": 0.9084,
"step": 748
},
{
"epoch": 0.3606813940889665,
"grad_norm": 3.9402804374694824,
"learning_rate": 0.00014814255769580415,
"loss": 0.609,
"step": 749
},
{
"epoch": 0.36116294468187565,
"grad_norm": 2.3622281551361084,
"learning_rate": 0.00014800577797727558,
"loss": 1.0189,
"step": 750
},
{
"epoch": 0.3616444952747848,
"grad_norm": 1.9683716297149658,
"learning_rate": 0.00014786888145030343,
"loss": 0.8275,
"step": 751
},
{
"epoch": 0.362126045867694,
"grad_norm": 0.9872303009033203,
"learning_rate": 0.0001477318684479866,
"loss": 0.3827,
"step": 752
},
{
"epoch": 0.36260759646060314,
"grad_norm": 3.7244014739990234,
"learning_rate": 0.00014759473930370736,
"loss": 0.6359,
"step": 753
},
{
"epoch": 0.3630891470535123,
"grad_norm": 1.6438435316085815,
"learning_rate": 0.0001474574943511306,
"loss": 0.879,
"step": 754
},
{
"epoch": 0.36357069764642147,
"grad_norm": 2.410429000854492,
"learning_rate": 0.0001473201339242029,
"loss": 1.2406,
"step": 755
},
{
"epoch": 0.36405224823933063,
"grad_norm": 5.812607765197754,
"learning_rate": 0.000147182658357152,
"loss": 0.4273,
"step": 756
},
{
"epoch": 0.3645337988322398,
"grad_norm": 3.4380412101745605,
"learning_rate": 0.00014704506798448566,
"loss": 0.5765,
"step": 757
},
{
"epoch": 0.36501534942514896,
"grad_norm": 10.264464378356934,
"learning_rate": 0.00014690736314099101,
"loss": 0.8553,
"step": 758
},
{
"epoch": 0.3654969000180581,
"grad_norm": 2.479084014892578,
"learning_rate": 0.00014676954416173373,
"loss": 0.8062,
"step": 759
},
{
"epoch": 0.3659784506109673,
"grad_norm": 3.285261392593384,
"learning_rate": 0.00014663161138205724,
"loss": 0.9467,
"step": 760
},
{
"epoch": 0.3664600012038765,
"grad_norm": 2.4864413738250732,
"learning_rate": 0.00014649356513758176,
"loss": 0.8893,
"step": 761
},
{
"epoch": 0.36694155179678567,
"grad_norm": 1.9362248182296753,
"learning_rate": 0.00014635540576420374,
"loss": 0.6744,
"step": 762
},
{
"epoch": 0.36742310238969483,
"grad_norm": 2.7063558101654053,
"learning_rate": 0.0001462171335980948,
"loss": 0.4627,
"step": 763
},
{
"epoch": 0.367904652982604,
"grad_norm": 1.6518278121948242,
"learning_rate": 0.00014607874897570105,
"loss": 0.6235,
"step": 764
},
{
"epoch": 0.36838620357551316,
"grad_norm": 1.9559590816497803,
"learning_rate": 0.0001459402522337422,
"loss": 0.6709,
"step": 765
},
{
"epoch": 0.3688677541684223,
"grad_norm": 2.930201292037964,
"learning_rate": 0.00014580164370921078,
"loss": 0.5976,
"step": 766
},
{
"epoch": 0.3693493047613315,
"grad_norm": 2.146150588989258,
"learning_rate": 0.0001456629237393713,
"loss": 0.7809,
"step": 767
},
{
"epoch": 0.36983085535424065,
"grad_norm": 3.9922261238098145,
"learning_rate": 0.00014552409266175952,
"loss": 0.6659,
"step": 768
},
{
"epoch": 0.3703124059471498,
"grad_norm": 1.5614209175109863,
"learning_rate": 0.00014538515081418142,
"loss": 0.6743,
"step": 769
},
{
"epoch": 0.370793956540059,
"grad_norm": 4.6989970207214355,
"learning_rate": 0.00014524609853471264,
"loss": 0.8936,
"step": 770
},
{
"epoch": 0.37127550713296814,
"grad_norm": 3.576082229614258,
"learning_rate": 0.00014510693616169741,
"loss": 0.9577,
"step": 771
},
{
"epoch": 0.3717570577258773,
"grad_norm": 0.9595773816108704,
"learning_rate": 0.0001449676640337479,
"loss": 0.7148,
"step": 772
},
{
"epoch": 0.37223860831878647,
"grad_norm": 4.762831687927246,
"learning_rate": 0.00014482828248974335,
"loss": 0.4595,
"step": 773
},
{
"epoch": 0.3727201589116957,
"grad_norm": 3.842872381210327,
"learning_rate": 0.00014468879186882916,
"loss": 1.3252,
"step": 774
},
{
"epoch": 0.37320170950460485,
"grad_norm": 1.4984766244888306,
"learning_rate": 0.00014454919251041622,
"loss": 0.5666,
"step": 775
},
{
"epoch": 0.373683260097514,
"grad_norm": 2.2089576721191406,
"learning_rate": 0.00014440948475418,
"loss": 1.341,
"step": 776
},
{
"epoch": 0.3741648106904232,
"grad_norm": 1.6905694007873535,
"learning_rate": 0.00014426966894005966,
"loss": 0.6712,
"step": 777
},
{
"epoch": 0.37464636128333234,
"grad_norm": 2.767066478729248,
"learning_rate": 0.0001441297454082573,
"loss": 0.9909,
"step": 778
},
{
"epoch": 0.3751279118762415,
"grad_norm": 2.3299660682678223,
"learning_rate": 0.00014398971449923722,
"loss": 0.5103,
"step": 779
},
{
"epoch": 0.37560946246915067,
"grad_norm": 1.6824946403503418,
"learning_rate": 0.00014384957655372483,
"loss": 0.6759,
"step": 780
},
{
"epoch": 0.37609101306205983,
"grad_norm": 1.1821125745773315,
"learning_rate": 0.00014370933191270617,
"loss": 0.5259,
"step": 781
},
{
"epoch": 0.376572563654969,
"grad_norm": 1.904584288597107,
"learning_rate": 0.0001435689809174267,
"loss": 0.9894,
"step": 782
},
{
"epoch": 0.37705411424787816,
"grad_norm": 2.2008063793182373,
"learning_rate": 0.0001434285239093908,
"loss": 1.3456,
"step": 783
},
{
"epoch": 0.3775356648407873,
"grad_norm": 2.202802896499634,
"learning_rate": 0.00014328796123036071,
"loss": 0.4833,
"step": 784
},
{
"epoch": 0.3780172154336965,
"grad_norm": 1.7282605171203613,
"learning_rate": 0.0001431472932223559,
"loss": 0.9194,
"step": 785
},
{
"epoch": 0.37849876602660565,
"grad_norm": 1.2373805046081543,
"learning_rate": 0.00014300652022765207,
"loss": 0.466,
"step": 786
},
{
"epoch": 0.3789803166195148,
"grad_norm": 4.87825870513916,
"learning_rate": 0.00014286564258878033,
"loss": 0.9176,
"step": 787
},
{
"epoch": 0.37946186721242403,
"grad_norm": 3.325873613357544,
"learning_rate": 0.00014272466064852644,
"loss": 0.4595,
"step": 788
},
{
"epoch": 0.3799434178053332,
"grad_norm": 5.119507789611816,
"learning_rate": 0.00014258357474993,
"loss": 0.8462,
"step": 789
},
{
"epoch": 0.38042496839824236,
"grad_norm": 4.062798976898193,
"learning_rate": 0.0001424423852362835,
"loss": 0.7553,
"step": 790
},
{
"epoch": 0.3809065189911515,
"grad_norm": 1.8997843265533447,
"learning_rate": 0.00014230109245113158,
"loss": 0.968,
"step": 791
},
{
"epoch": 0.3813880695840607,
"grad_norm": 3.648345470428467,
"learning_rate": 0.00014215969673827018,
"loss": 0.7866,
"step": 792
},
{
"epoch": 0.38186962017696985,
"grad_norm": 3.1891438961029053,
"learning_rate": 0.00014201819844174564,
"loss": 0.7841,
"step": 793
},
{
"epoch": 0.382351170769879,
"grad_norm": 3.960712432861328,
"learning_rate": 0.0001418765979058539,
"loss": 0.8922,
"step": 794
},
{
"epoch": 0.3828327213627882,
"grad_norm": 1.958216667175293,
"learning_rate": 0.00014173489547513973,
"loss": 0.9929,
"step": 795
},
{
"epoch": 0.38331427195569734,
"grad_norm": 2.851674795150757,
"learning_rate": 0.00014159309149439582,
"loss": 0.7668,
"step": 796
},
{
"epoch": 0.3837958225486065,
"grad_norm": 2.4043354988098145,
"learning_rate": 0.00014145118630866187,
"loss": 0.5076,
"step": 797
},
{
"epoch": 0.38427737314151567,
"grad_norm": 2.548334836959839,
"learning_rate": 0.000141309180263224,
"loss": 0.5664,
"step": 798
},
{
"epoch": 0.38475892373442483,
"grad_norm": 7.332959175109863,
"learning_rate": 0.0001411670737036135,
"loss": 0.6663,
"step": 799
},
{
"epoch": 0.385240474327334,
"grad_norm": 2.5418202877044678,
"learning_rate": 0.0001410248669756065,
"loss": 0.6912,
"step": 800
},
{
"epoch": 0.38572202492024316,
"grad_norm": 1.9166046380996704,
"learning_rate": 0.00014088256042522264,
"loss": 0.5785,
"step": 801
},
{
"epoch": 0.3862035755131524,
"grad_norm": 1.9484670162200928,
"learning_rate": 0.00014074015439872458,
"loss": 0.7789,
"step": 802
},
{
"epoch": 0.38668512610606154,
"grad_norm": 1.2977544069290161,
"learning_rate": 0.00014059764924261703,
"loss": 0.448,
"step": 803
},
{
"epoch": 0.3871666766989707,
"grad_norm": 3.7243142127990723,
"learning_rate": 0.00014045504530364584,
"loss": 0.3638,
"step": 804
},
{
"epoch": 0.38764822729187987,
"grad_norm": 2.931234836578369,
"learning_rate": 0.00014031234292879725,
"loss": 0.6048,
"step": 805
},
{
"epoch": 0.38812977788478903,
"grad_norm": 2.235635757446289,
"learning_rate": 0.00014016954246529696,
"loss": 0.741,
"step": 806
},
{
"epoch": 0.3886113284776982,
"grad_norm": 2.4995760917663574,
"learning_rate": 0.00014002664426060942,
"loss": 0.8794,
"step": 807
},
{
"epoch": 0.38909287907060736,
"grad_norm": 5.880919456481934,
"learning_rate": 0.00013988364866243693,
"loss": 0.8441,
"step": 808
},
{
"epoch": 0.3895744296635165,
"grad_norm": 2.27232027053833,
"learning_rate": 0.00013974055601871868,
"loss": 0.5837,
"step": 809
},
{
"epoch": 0.3900559802564257,
"grad_norm": 1.846911907196045,
"learning_rate": 0.00013959736667762998,
"loss": 0.9346,
"step": 810
},
{
"epoch": 0.39053753084933485,
"grad_norm": 1.3983654975891113,
"learning_rate": 0.00013945408098758156,
"loss": 1.0296,
"step": 811
},
{
"epoch": 0.391019081442244,
"grad_norm": 1.4359188079833984,
"learning_rate": 0.0001393106992972184,
"loss": 0.5791,
"step": 812
},
{
"epoch": 0.3915006320351532,
"grad_norm": 2.0418739318847656,
"learning_rate": 0.00013916722195541926,
"loss": 0.5045,
"step": 813
},
{
"epoch": 0.39198218262806234,
"grad_norm": 3.6216964721679688,
"learning_rate": 0.00013902364931129557,
"loss": 0.748,
"step": 814
},
{
"epoch": 0.39246373322097156,
"grad_norm": 2.5726840496063232,
"learning_rate": 0.00013887998171419058,
"loss": 0.8588,
"step": 815
},
{
"epoch": 0.3929452838138807,
"grad_norm": 2.4166088104248047,
"learning_rate": 0.00013873621951367862,
"loss": 0.8306,
"step": 816
},
{
"epoch": 0.3934268344067899,
"grad_norm": 2.8156697750091553,
"learning_rate": 0.00013859236305956425,
"loss": 0.7893,
"step": 817
},
{
"epoch": 0.39390838499969905,
"grad_norm": 2.6449344158172607,
"learning_rate": 0.00013844841270188132,
"loss": 0.7843,
"step": 818
},
{
"epoch": 0.3943899355926082,
"grad_norm": 1.73700749874115,
"learning_rate": 0.00013830436879089228,
"loss": 0.7855,
"step": 819
},
{
"epoch": 0.3948714861855174,
"grad_norm": 2.2804300785064697,
"learning_rate": 0.00013816023167708704,
"loss": 0.6568,
"step": 820
},
{
"epoch": 0.39535303677842654,
"grad_norm": 2.1226370334625244,
"learning_rate": 0.00013801600171118244,
"loss": 0.5294,
"step": 821
},
{
"epoch": 0.3958345873713357,
"grad_norm": 2.894469738006592,
"learning_rate": 0.00013787167924412112,
"loss": 0.8773,
"step": 822
},
{
"epoch": 0.39631613796424486,
"grad_norm": 2.5202245712280273,
"learning_rate": 0.0001377272646270709,
"loss": 0.381,
"step": 823
},
{
"epoch": 0.396797688557154,
"grad_norm": 3.8328442573547363,
"learning_rate": 0.00013758275821142382,
"loss": 0.5329,
"step": 824
},
{
"epoch": 0.3972792391500632,
"grad_norm": 2.541353940963745,
"learning_rate": 0.00013743816034879523,
"loss": 0.5578,
"step": 825
},
{
"epoch": 0.39776078974297235,
"grad_norm": 2.1383888721466064,
"learning_rate": 0.000137293471391023,
"loss": 0.8876,
"step": 826
},
{
"epoch": 0.3982423403358815,
"grad_norm": 2.633044719696045,
"learning_rate": 0.00013714869169016667,
"loss": 0.4708,
"step": 827
},
{
"epoch": 0.3987238909287907,
"grad_norm": 4.365309715270996,
"learning_rate": 0.00013700382159850656,
"loss": 0.4944,
"step": 828
},
{
"epoch": 0.3992054415216999,
"grad_norm": 3.652635097503662,
"learning_rate": 0.00013685886146854297,
"loss": 0.8842,
"step": 829
},
{
"epoch": 0.39968699211460906,
"grad_norm": 2.1585693359375,
"learning_rate": 0.00013671381165299525,
"loss": 0.7996,
"step": 830
},
{
"epoch": 0.4001685427075182,
"grad_norm": 2.429353952407837,
"learning_rate": 0.00013656867250480098,
"loss": 0.8529,
"step": 831
},
{
"epoch": 0.4006500933004274,
"grad_norm": 2.0044384002685547,
"learning_rate": 0.00013642344437711512,
"loss": 0.5831,
"step": 832
},
{
"epoch": 0.40113164389333655,
"grad_norm": 3.845720052719116,
"learning_rate": 0.00013627812762330912,
"loss": 1.0989,
"step": 833
},
{
"epoch": 0.4016131944862457,
"grad_norm": 2.4020707607269287,
"learning_rate": 0.00013613272259697007,
"loss": 0.734,
"step": 834
},
{
"epoch": 0.4020947450791549,
"grad_norm": 1.2454372644424438,
"learning_rate": 0.00013598722965189986,
"loss": 1.1921,
"step": 835
},
{
"epoch": 0.40257629567206404,
"grad_norm": 2.901397228240967,
"learning_rate": 0.0001358416491421143,
"loss": 0.919,
"step": 836
},
{
"epoch": 0.4030578462649732,
"grad_norm": 2.4206631183624268,
"learning_rate": 0.00013569598142184225,
"loss": 0.8408,
"step": 837
},
{
"epoch": 0.40353939685788237,
"grad_norm": 3.595640182495117,
"learning_rate": 0.00013555022684552483,
"loss": 1.245,
"step": 838
},
{
"epoch": 0.40402094745079153,
"grad_norm": 1.4770573377609253,
"learning_rate": 0.00013540438576781441,
"loss": 0.4859,
"step": 839
},
{
"epoch": 0.4045024980437007,
"grad_norm": 2.7039146423339844,
"learning_rate": 0.0001352584585435739,
"loss": 0.9747,
"step": 840
},
{
"epoch": 0.40498404863660986,
"grad_norm": 2.615344524383545,
"learning_rate": 0.00013511244552787583,
"loss": 0.6801,
"step": 841
},
{
"epoch": 0.405465599229519,
"grad_norm": 3.7409796714782715,
"learning_rate": 0.00013496634707600147,
"loss": 1.1876,
"step": 842
},
{
"epoch": 0.40594714982242824,
"grad_norm": 2.508939743041992,
"learning_rate": 0.0001348201635434399,
"loss": 0.7794,
"step": 843
},
{
"epoch": 0.4064287004153374,
"grad_norm": 3.5549421310424805,
"learning_rate": 0.0001346738952858873,
"loss": 1.2157,
"step": 844
},
{
"epoch": 0.40691025100824657,
"grad_norm": 0.9292539954185486,
"learning_rate": 0.000134527542659246,
"loss": 1.4636,
"step": 845
},
{
"epoch": 0.40739180160115573,
"grad_norm": 2.5853211879730225,
"learning_rate": 0.00013438110601962362,
"loss": 0.6864,
"step": 846
},
{
"epoch": 0.4078733521940649,
"grad_norm": 2.0483453273773193,
"learning_rate": 0.00013423458572333214,
"loss": 0.816,
"step": 847
},
{
"epoch": 0.40835490278697406,
"grad_norm": 1.1379551887512207,
"learning_rate": 0.0001340879821268872,
"loss": 0.3914,
"step": 848
},
{
"epoch": 0.4088364533798832,
"grad_norm": 4.26398229598999,
"learning_rate": 0.000133941295587007,
"loss": 0.7028,
"step": 849
},
{
"epoch": 0.4093180039727924,
"grad_norm": 2.3732380867004395,
"learning_rate": 0.00013379452646061164,
"loss": 0.5584,
"step": 850
},
{
"epoch": 0.40979955456570155,
"grad_norm": 2.4338433742523193,
"learning_rate": 0.0001336476751048222,
"loss": 0.9941,
"step": 851
},
{
"epoch": 0.4102811051586107,
"grad_norm": 2.9080970287323,
"learning_rate": 0.00013350074187695979,
"loss": 1.3025,
"step": 852
},
{
"epoch": 0.4107626557515199,
"grad_norm": 2.6538901329040527,
"learning_rate": 0.00013335372713454467,
"loss": 0.7301,
"step": 853
},
{
"epoch": 0.41124420634442904,
"grad_norm": 2.5008554458618164,
"learning_rate": 0.0001332066312352956,
"loss": 0.7566,
"step": 854
},
{
"epoch": 0.4117257569373382,
"grad_norm": 2.0874619483947754,
"learning_rate": 0.00013305945453712868,
"loss": 0.9365,
"step": 855
},
{
"epoch": 0.4122073075302474,
"grad_norm": 2.2397348880767822,
"learning_rate": 0.0001329121973981567,
"loss": 0.9617,
"step": 856
},
{
"epoch": 0.4126888581231566,
"grad_norm": 2.2722036838531494,
"learning_rate": 0.00013276486017668807,
"loss": 0.4104,
"step": 857
},
{
"epoch": 0.41317040871606575,
"grad_norm": 2.11865234375,
"learning_rate": 0.0001326174432312262,
"loss": 0.7596,
"step": 858
},
{
"epoch": 0.4136519593089749,
"grad_norm": 2.2710089683532715,
"learning_rate": 0.00013246994692046836,
"loss": 0.9763,
"step": 859
},
{
"epoch": 0.4141335099018841,
"grad_norm": 3.1512913703918457,
"learning_rate": 0.000132322371603305,
"loss": 0.8637,
"step": 860
},
{
"epoch": 0.41461506049479324,
"grad_norm": 5.3608622550964355,
"learning_rate": 0.0001321747176388188,
"loss": 0.4573,
"step": 861
},
{
"epoch": 0.4150966110877024,
"grad_norm": 1.7726064920425415,
"learning_rate": 0.00013202698538628376,
"loss": 1.0072,
"step": 862
},
{
"epoch": 0.41557816168061157,
"grad_norm": 1.9994593858718872,
"learning_rate": 0.00013187917520516448,
"loss": 0.6097,
"step": 863
},
{
"epoch": 0.41605971227352073,
"grad_norm": 2.665196418762207,
"learning_rate": 0.00013173128745511508,
"loss": 0.8823,
"step": 864
},
{
"epoch": 0.4165412628664299,
"grad_norm": 1.9170902967453003,
"learning_rate": 0.0001315833224959784,
"loss": 0.7834,
"step": 865
},
{
"epoch": 0.41702281345933906,
"grad_norm": 1.7230511903762817,
"learning_rate": 0.00013143528068778525,
"loss": 1.2682,
"step": 866
},
{
"epoch": 0.4175043640522482,
"grad_norm": 1.5421873331069946,
"learning_rate": 0.00013128716239075338,
"loss": 0.4533,
"step": 867
},
{
"epoch": 0.4179859146451574,
"grad_norm": 2.3257434368133545,
"learning_rate": 0.00013113896796528664,
"loss": 0.7117,
"step": 868
},
{
"epoch": 0.41846746523806655,
"grad_norm": 2.053032398223877,
"learning_rate": 0.00013099069777197412,
"loss": 0.8121,
"step": 869
},
{
"epoch": 0.41894901583097577,
"grad_norm": 5.06425142288208,
"learning_rate": 0.0001308423521715893,
"loss": 0.7408,
"step": 870
},
{
"epoch": 0.41943056642388493,
"grad_norm": 1.0767431259155273,
"learning_rate": 0.00013069393152508906,
"loss": 0.6687,
"step": 871
},
{
"epoch": 0.4199121170167941,
"grad_norm": 1.6607890129089355,
"learning_rate": 0.00013054543619361303,
"loss": 0.6322,
"step": 872
},
{
"epoch": 0.42039366760970326,
"grad_norm": 1.7440071105957031,
"learning_rate": 0.0001303968665384824,
"loss": 0.889,
"step": 873
},
{
"epoch": 0.4208752182026124,
"grad_norm": 1.4886302947998047,
"learning_rate": 0.00013024822292119934,
"loss": 0.7009,
"step": 874
},
{
"epoch": 0.4213567687955216,
"grad_norm": 3.8570821285247803,
"learning_rate": 0.0001300995057034459,
"loss": 0.7772,
"step": 875
},
{
"epoch": 0.42183831938843075,
"grad_norm": 2.4680871963500977,
"learning_rate": 0.00012995071524708325,
"loss": 0.7877,
"step": 876
},
{
"epoch": 0.4223198699813399,
"grad_norm": 2.5244038105010986,
"learning_rate": 0.00012980185191415074,
"loss": 0.5928,
"step": 877
},
{
"epoch": 0.4228014205742491,
"grad_norm": 1.7207748889923096,
"learning_rate": 0.0001296529160668651,
"loss": 0.7075,
"step": 878
},
{
"epoch": 0.42328297116715824,
"grad_norm": 1.7171252965927124,
"learning_rate": 0.00012950390806761944,
"loss": 0.8689,
"step": 879
},
{
"epoch": 0.4237645217600674,
"grad_norm": 1.759418249130249,
"learning_rate": 0.0001293548282789825,
"loss": 0.4545,
"step": 880
},
{
"epoch": 0.42424607235297657,
"grad_norm": 1.7909297943115234,
"learning_rate": 0.00012920567706369758,
"loss": 1.3034,
"step": 881
},
{
"epoch": 0.42472762294588573,
"grad_norm": 1.4877880811691284,
"learning_rate": 0.00012905645478468192,
"loss": 0.3629,
"step": 882
},
{
"epoch": 0.4252091735387949,
"grad_norm": 3.6452713012695312,
"learning_rate": 0.00012890716180502564,
"loss": 0.6314,
"step": 883
},
{
"epoch": 0.4256907241317041,
"grad_norm": 2.424837589263916,
"learning_rate": 0.00012875779848799078,
"loss": 0.9437,
"step": 884
},
{
"epoch": 0.4261722747246133,
"grad_norm": 2.7232539653778076,
"learning_rate": 0.00012860836519701063,
"loss": 0.9839,
"step": 885
},
{
"epoch": 0.42665382531752244,
"grad_norm": 1.2569257020950317,
"learning_rate": 0.00012845886229568873,
"loss": 0.8196,
"step": 886
},
{
"epoch": 0.4271353759104316,
"grad_norm": 2.5531201362609863,
"learning_rate": 0.00012830929014779797,
"loss": 0.8545,
"step": 887
},
{
"epoch": 0.42761692650334077,
"grad_norm": 6.38144588470459,
"learning_rate": 0.0001281596491172797,
"loss": 0.5451,
"step": 888
},
{
"epoch": 0.42809847709624993,
"grad_norm": 1.4901047945022583,
"learning_rate": 0.00012800993956824303,
"loss": 0.9357,
"step": 889
},
{
"epoch": 0.4285800276891591,
"grad_norm": 2.5083794593811035,
"learning_rate": 0.00012786016186496358,
"loss": 0.9034,
"step": 890
},
{
"epoch": 0.42906157828206826,
"grad_norm": 2.4690587520599365,
"learning_rate": 0.000127710316371883,
"loss": 0.7518,
"step": 891
},
{
"epoch": 0.4295431288749774,
"grad_norm": 1.5724667310714722,
"learning_rate": 0.0001275604034536077,
"loss": 0.4883,
"step": 892
},
{
"epoch": 0.4300246794678866,
"grad_norm": 1.8662641048431396,
"learning_rate": 0.0001274104234749083,
"loss": 0.6713,
"step": 893
},
{
"epoch": 0.43050623006079575,
"grad_norm": 2.4959278106689453,
"learning_rate": 0.00012726037680071853,
"loss": 0.6975,
"step": 894
},
{
"epoch": 0.4309877806537049,
"grad_norm": 2.854491710662842,
"learning_rate": 0.00012711026379613434,
"loss": 0.5982,
"step": 895
},
{
"epoch": 0.4314693312466141,
"grad_norm": 2.513932704925537,
"learning_rate": 0.00012696008482641325,
"loss": 0.6691,
"step": 896
},
{
"epoch": 0.43195088183952324,
"grad_norm": 2.1765429973602295,
"learning_rate": 0.00012680984025697313,
"loss": 0.4283,
"step": 897
},
{
"epoch": 0.43243243243243246,
"grad_norm": 3.172271966934204,
"learning_rate": 0.00012665953045339152,
"loss": 1.1573,
"step": 898
},
{
"epoch": 0.4329139830253416,
"grad_norm": 1.5411570072174072,
"learning_rate": 0.0001265091557814047,
"loss": 0.603,
"step": 899
},
{
"epoch": 0.4333955336182508,
"grad_norm": 4.4379496574401855,
"learning_rate": 0.00012635871660690676,
"loss": 0.4462,
"step": 900
},
{
"epoch": 0.43387708421115995,
"grad_norm": 4.627134799957275,
"learning_rate": 0.0001262082132959488,
"loss": 0.7033,
"step": 901
},
{
"epoch": 0.4343586348040691,
"grad_norm": 5.416947364807129,
"learning_rate": 0.00012605764621473792,
"loss": 1.0499,
"step": 902
},
{
"epoch": 0.4348401853969783,
"grad_norm": 2.7960314750671387,
"learning_rate": 0.00012590701572963642,
"loss": 1.0619,
"step": 903
},
{
"epoch": 0.43532173598988744,
"grad_norm": 4.81326961517334,
"learning_rate": 0.00012575632220716078,
"loss": 0.925,
"step": 904
},
{
"epoch": 0.4358032865827966,
"grad_norm": 1.967151403427124,
"learning_rate": 0.000125605566013981,
"loss": 0.8593,
"step": 905
},
{
"epoch": 0.43628483717570576,
"grad_norm": 3.3491618633270264,
"learning_rate": 0.00012545474751691953,
"loss": 0.884,
"step": 906
},
{
"epoch": 0.4367663877686149,
"grad_norm": 1.8660701513290405,
"learning_rate": 0.00012530386708295036,
"loss": 0.7831,
"step": 907
},
{
"epoch": 0.4372479383615241,
"grad_norm": 2.808755874633789,
"learning_rate": 0.00012515292507919829,
"loss": 0.8822,
"step": 908
},
{
"epoch": 0.43772948895443325,
"grad_norm": 1.8173489570617676,
"learning_rate": 0.0001250019218729378,
"loss": 0.6888,
"step": 909
},
{
"epoch": 0.4382110395473424,
"grad_norm": 2.033569097518921,
"learning_rate": 0.00012485085783159238,
"loss": 0.3951,
"step": 910
},
{
"epoch": 0.43869259014025164,
"grad_norm": 2.698009729385376,
"learning_rate": 0.00012469973332273354,
"loss": 0.5259,
"step": 911
},
{
"epoch": 0.4391741407331608,
"grad_norm": 1.5774399042129517,
"learning_rate": 0.00012454854871407994,
"loss": 0.4619,
"step": 912
},
{
"epoch": 0.43965569132606996,
"grad_norm": 1.6870514154434204,
"learning_rate": 0.00012439730437349635,
"loss": 0.4305,
"step": 913
},
{
"epoch": 0.4401372419189791,
"grad_norm": 1.871408224105835,
"learning_rate": 0.00012424600066899302,
"loss": 0.6,
"step": 914
},
{
"epoch": 0.4406187925118883,
"grad_norm": 2.036029577255249,
"learning_rate": 0.00012409463796872464,
"loss": 1.0179,
"step": 915
},
{
"epoch": 0.44110034310479745,
"grad_norm": 1.8108471632003784,
"learning_rate": 0.0001239432166409893,
"loss": 0.8426,
"step": 916
},
{
"epoch": 0.4415818936977066,
"grad_norm": 3.0200490951538086,
"learning_rate": 0.00012379173705422795,
"loss": 0.9675,
"step": 917
},
{
"epoch": 0.4420634442906158,
"grad_norm": 2.3675003051757812,
"learning_rate": 0.00012364019957702315,
"loss": 0.6689,
"step": 918
},
{
"epoch": 0.44254499488352494,
"grad_norm": 3.1368930339813232,
"learning_rate": 0.00012348860457809838,
"loss": 0.8873,
"step": 919
},
{
"epoch": 0.4430265454764341,
"grad_norm": 1.4245840311050415,
"learning_rate": 0.00012333695242631705,
"loss": 0.8424,
"step": 920
},
{
"epoch": 0.44350809606934327,
"grad_norm": 3.4238197803497314,
"learning_rate": 0.0001231852434906817,
"loss": 1.2067,
"step": 921
},
{
"epoch": 0.44398964666225244,
"grad_norm": 2.398881196975708,
"learning_rate": 0.00012303347814033292,
"loss": 0.7952,
"step": 922
},
{
"epoch": 0.4444711972551616,
"grad_norm": 2.153909206390381,
"learning_rate": 0.0001228816567445487,
"loss": 0.9368,
"step": 923
},
{
"epoch": 0.44495274784807076,
"grad_norm": 2.978900194168091,
"learning_rate": 0.0001227297796727433,
"loss": 0.6853,
"step": 924
},
{
"epoch": 0.44543429844098,
"grad_norm": 1.9314982891082764,
"learning_rate": 0.00012257784729446656,
"loss": 0.7932,
"step": 925
},
{
"epoch": 0.44591584903388914,
"grad_norm": 2.3480403423309326,
"learning_rate": 0.00012242585997940275,
"loss": 0.7998,
"step": 926
},
{
"epoch": 0.4463973996267983,
"grad_norm": 4.056048393249512,
"learning_rate": 0.0001222738180973699,
"loss": 0.6552,
"step": 927
},
{
"epoch": 0.44687895021970747,
"grad_norm": 3.2654383182525635,
"learning_rate": 0.00012212172201831885,
"loss": 0.6561,
"step": 928
},
{
"epoch": 0.44736050081261663,
"grad_norm": 3.2897989749908447,
"learning_rate": 0.00012196957211233222,
"loss": 1.0814,
"step": 929
},
{
"epoch": 0.4478420514055258,
"grad_norm": 1.9905271530151367,
"learning_rate": 0.00012181736874962371,
"loss": 1.0158,
"step": 930
},
{
"epoch": 0.44832360199843496,
"grad_norm": 2.3979315757751465,
"learning_rate": 0.00012166511230053696,
"loss": 1.0173,
"step": 931
},
{
"epoch": 0.4488051525913441,
"grad_norm": 2.0289697647094727,
"learning_rate": 0.00012151280313554486,
"loss": 0.9401,
"step": 932
},
{
"epoch": 0.4492867031842533,
"grad_norm": 2.8876144886016846,
"learning_rate": 0.00012136044162524858,
"loss": 0.8686,
"step": 933
},
{
"epoch": 0.44976825377716245,
"grad_norm": 3.0945162773132324,
"learning_rate": 0.00012120802814037663,
"loss": 0.3943,
"step": 934
},
{
"epoch": 0.4502498043700716,
"grad_norm": 2.6051065921783447,
"learning_rate": 0.00012105556305178399,
"loss": 0.5688,
"step": 935
},
{
"epoch": 0.4507313549629808,
"grad_norm": 3.5704925060272217,
"learning_rate": 0.00012090304673045123,
"loss": 0.627,
"step": 936
},
{
"epoch": 0.45121290555588994,
"grad_norm": 4.448431491851807,
"learning_rate": 0.00012075047954748353,
"loss": 1.0867,
"step": 937
},
{
"epoch": 0.4516944561487991,
"grad_norm": 3.0634100437164307,
"learning_rate": 0.00012059786187410984,
"loss": 0.7893,
"step": 938
},
{
"epoch": 0.4521760067417083,
"grad_norm": 2.6939537525177,
"learning_rate": 0.000120445194081682,
"loss": 0.4777,
"step": 939
},
{
"epoch": 0.4526575573346175,
"grad_norm": 1.9854243993759155,
"learning_rate": 0.00012029247654167379,
"loss": 0.4645,
"step": 940
},
{
"epoch": 0.45313910792752665,
"grad_norm": 2.8124136924743652,
"learning_rate": 0.00012013970962568002,
"loss": 0.6528,
"step": 941
},
{
"epoch": 0.4536206585204358,
"grad_norm": 2.101633310317993,
"learning_rate": 0.00011998689370541562,
"loss": 0.72,
"step": 942
},
{
"epoch": 0.454102209113345,
"grad_norm": 2.0931015014648438,
"learning_rate": 0.00011983402915271478,
"loss": 0.6483,
"step": 943
},
{
"epoch": 0.45458375970625414,
"grad_norm": 3.0055315494537354,
"learning_rate": 0.00011968111633953007,
"loss": 0.9383,
"step": 944
},
{
"epoch": 0.4550653102991633,
"grad_norm": 2.681931734085083,
"learning_rate": 0.0001195281556379314,
"loss": 0.7556,
"step": 945
},
{
"epoch": 0.45554686089207247,
"grad_norm": 7.055514335632324,
"learning_rate": 0.0001193751474201053,
"loss": 0.4541,
"step": 946
},
{
"epoch": 0.45602841148498163,
"grad_norm": 2.157724618911743,
"learning_rate": 0.00011922209205835382,
"loss": 0.5006,
"step": 947
},
{
"epoch": 0.4565099620778908,
"grad_norm": 1.7895299196243286,
"learning_rate": 0.0001190689899250938,
"loss": 0.8087,
"step": 948
},
{
"epoch": 0.45699151267079996,
"grad_norm": 2.361053228378296,
"learning_rate": 0.00011891584139285582,
"loss": 1.002,
"step": 949
},
{
"epoch": 0.4574730632637091,
"grad_norm": 2.3107917308807373,
"learning_rate": 0.00011876264683428344,
"loss": 0.5038,
"step": 950
},
{
"epoch": 0.4579546138566183,
"grad_norm": 2.193125009536743,
"learning_rate": 0.00011860940662213211,
"loss": 0.5385,
"step": 951
},
{
"epoch": 0.4584361644495275,
"grad_norm": 2.572331190109253,
"learning_rate": 0.00011845612112926843,
"loss": 0.7153,
"step": 952
},
{
"epoch": 0.45891771504243667,
"grad_norm": 4.683573246002197,
"learning_rate": 0.00011830279072866921,
"loss": 0.7073,
"step": 953
},
{
"epoch": 0.45939926563534583,
"grad_norm": 2.2095165252685547,
"learning_rate": 0.00011814941579342044,
"loss": 0.706,
"step": 954
},
{
"epoch": 0.459880816228255,
"grad_norm": 1.5698320865631104,
"learning_rate": 0.00011799599669671654,
"loss": 0.364,
"step": 955
},
{
"epoch": 0.46036236682116416,
"grad_norm": 1.8080724477767944,
"learning_rate": 0.00011784253381185937,
"loss": 0.8959,
"step": 956
},
{
"epoch": 0.4608439174140733,
"grad_norm": 1.1480696201324463,
"learning_rate": 0.0001176890275122573,
"loss": 0.522,
"step": 957
},
{
"epoch": 0.4613254680069825,
"grad_norm": 2.714405059814453,
"learning_rate": 0.0001175354781714244,
"loss": 0.4145,
"step": 958
},
{
"epoch": 0.46180701859989165,
"grad_norm": 3.001786470413208,
"learning_rate": 0.0001173818861629794,
"loss": 0.9095,
"step": 959
},
{
"epoch": 0.4622885691928008,
"grad_norm": 1.477941870689392,
"learning_rate": 0.00011722825186064494,
"loss": 0.3998,
"step": 960
},
{
"epoch": 0.46277011978571,
"grad_norm": 2.0230369567871094,
"learning_rate": 0.00011707457563824646,
"loss": 0.7196,
"step": 961
},
{
"epoch": 0.46325167037861914,
"grad_norm": 2.4260199069976807,
"learning_rate": 0.00011692085786971149,
"loss": 0.5469,
"step": 962
},
{
"epoch": 0.4637332209715283,
"grad_norm": 2.5478789806365967,
"learning_rate": 0.00011676709892906858,
"loss": 0.4603,
"step": 963
},
{
"epoch": 0.46421477156443747,
"grad_norm": 2.9565882682800293,
"learning_rate": 0.00011661329919044656,
"loss": 0.8948,
"step": 964
},
{
"epoch": 0.46469632215734663,
"grad_norm": 2.064451217651367,
"learning_rate": 0.00011645945902807341,
"loss": 0.4803,
"step": 965
},
{
"epoch": 0.46517787275025585,
"grad_norm": 2.4505929946899414,
"learning_rate": 0.00011630557881627553,
"loss": 0.6063,
"step": 966
},
{
"epoch": 0.465659423343165,
"grad_norm": 2.06998348236084,
"learning_rate": 0.0001161516589294768,
"loss": 1.0691,
"step": 967
},
{
"epoch": 0.4661409739360742,
"grad_norm": 2.934462785720825,
"learning_rate": 0.00011599769974219757,
"loss": 0.5514,
"step": 968
},
{
"epoch": 0.46662252452898334,
"grad_norm": 3.6781392097473145,
"learning_rate": 0.0001158437016290539,
"loss": 0.7883,
"step": 969
},
{
"epoch": 0.4671040751218925,
"grad_norm": 5.928903102874756,
"learning_rate": 0.00011568966496475649,
"loss": 0.7908,
"step": 970
},
{
"epoch": 0.46758562571480167,
"grad_norm": 4.225213050842285,
"learning_rate": 0.00011553559012410984,
"loss": 0.642,
"step": 971
},
{
"epoch": 0.46806717630771083,
"grad_norm": 4.230667591094971,
"learning_rate": 0.00011538147748201138,
"loss": 1.0245,
"step": 972
},
{
"epoch": 0.46854872690062,
"grad_norm": 2.045747995376587,
"learning_rate": 0.00011522732741345053,
"loss": 0.8693,
"step": 973
},
{
"epoch": 0.46903027749352916,
"grad_norm": 2.93965744972229,
"learning_rate": 0.00011507314029350776,
"loss": 0.7032,
"step": 974
},
{
"epoch": 0.4695118280864383,
"grad_norm": 2.0694057941436768,
"learning_rate": 0.00011491891649735366,
"loss": 0.7536,
"step": 975
},
{
"epoch": 0.4699933786793475,
"grad_norm": 2.6590757369995117,
"learning_rate": 0.00011476465640024814,
"loss": 0.8599,
"step": 976
},
{
"epoch": 0.47047492927225665,
"grad_norm": 2.7925920486450195,
"learning_rate": 0.00011461036037753934,
"loss": 0.4626,
"step": 977
},
{
"epoch": 0.4709564798651658,
"grad_norm": 1.8391474485397339,
"learning_rate": 0.00011445602880466288,
"loss": 0.5219,
"step": 978
},
{
"epoch": 0.471438030458075,
"grad_norm": 1.3266628980636597,
"learning_rate": 0.00011430166205714088,
"loss": 0.6874,
"step": 979
},
{
"epoch": 0.4719195810509842,
"grad_norm": 2.141636848449707,
"learning_rate": 0.00011414726051058102,
"loss": 0.6873,
"step": 980
},
{
"epoch": 0.47240113164389336,
"grad_norm": 2.55141019821167,
"learning_rate": 0.0001139928245406757,
"loss": 0.6919,
"step": 981
},
{
"epoch": 0.4728826822368025,
"grad_norm": 1.6124935150146484,
"learning_rate": 0.00011383835452320097,
"loss": 0.982,
"step": 982
},
{
"epoch": 0.4733642328297117,
"grad_norm": 2.0726158618927,
"learning_rate": 0.00011368385083401585,
"loss": 1.022,
"step": 983
},
{
"epoch": 0.47384578342262085,
"grad_norm": 3.476106882095337,
"learning_rate": 0.00011352931384906125,
"loss": 0.5655,
"step": 984
},
{
"epoch": 0.47432733401553,
"grad_norm": 2.5833818912506104,
"learning_rate": 0.00011337474394435908,
"loss": 0.7119,
"step": 985
},
{
"epoch": 0.4748088846084392,
"grad_norm": 1.4606103897094727,
"learning_rate": 0.00011322014149601136,
"loss": 0.6343,
"step": 986
},
{
"epoch": 0.47529043520134834,
"grad_norm": 3.8119499683380127,
"learning_rate": 0.00011306550688019926,
"loss": 0.7238,
"step": 987
},
{
"epoch": 0.4757719857942575,
"grad_norm": 2.314828872680664,
"learning_rate": 0.0001129108404731823,
"loss": 0.7181,
"step": 988
},
{
"epoch": 0.47625353638716666,
"grad_norm": 1.842475175857544,
"learning_rate": 0.0001127561426512973,
"loss": 0.7928,
"step": 989
},
{
"epoch": 0.47673508698007583,
"grad_norm": 2.3919920921325684,
"learning_rate": 0.0001126014137909575,
"loss": 0.6528,
"step": 990
},
{
"epoch": 0.477216637572985,
"grad_norm": 1.3240762948989868,
"learning_rate": 0.00011244665426865174,
"loss": 0.543,
"step": 991
},
{
"epoch": 0.47769818816589416,
"grad_norm": 2.4185304641723633,
"learning_rate": 0.00011229186446094338,
"loss": 0.5988,
"step": 992
},
{
"epoch": 0.4781797387588034,
"grad_norm": 1.5180091857910156,
"learning_rate": 0.00011213704474446951,
"loss": 0.8106,
"step": 993
},
{
"epoch": 0.47866128935171254,
"grad_norm": 3.8423268795013428,
"learning_rate": 0.00011198219549594,
"loss": 0.7134,
"step": 994
},
{
"epoch": 0.4791428399446217,
"grad_norm": 4.835480213165283,
"learning_rate": 0.00011182731709213659,
"loss": 0.5784,
"step": 995
},
{
"epoch": 0.47962439053753086,
"grad_norm": 4.3305511474609375,
"learning_rate": 0.00011167240990991192,
"loss": 0.6444,
"step": 996
},
{
"epoch": 0.48010594113044003,
"grad_norm": 1.8703162670135498,
"learning_rate": 0.00011151747432618871,
"loss": 0.6062,
"step": 997
},
{
"epoch": 0.4805874917233492,
"grad_norm": 3.5100109577178955,
"learning_rate": 0.00011136251071795871,
"loss": 0.5488,
"step": 998
},
{
"epoch": 0.48106904231625836,
"grad_norm": 2.39043927192688,
"learning_rate": 0.00011120751946228197,
"loss": 0.7438,
"step": 999
},
{
"epoch": 0.4815505929091675,
"grad_norm": 3.630051612854004,
"learning_rate": 0.00011105250093628565,
"loss": 0.8574,
"step": 1000
},
{
"epoch": 0.4820321435020767,
"grad_norm": 2.519382953643799,
"learning_rate": 0.00011089745551716344,
"loss": 0.8414,
"step": 1001
},
{
"epoch": 0.48251369409498585,
"grad_norm": 2.2886648178100586,
"learning_rate": 0.00011074238358217437,
"loss": 0.9677,
"step": 1002
},
{
"epoch": 0.482995244687895,
"grad_norm": 1.639683723449707,
"learning_rate": 0.00011058728550864197,
"loss": 0.4151,
"step": 1003
},
{
"epoch": 0.4834767952808042,
"grad_norm": 2.246243715286255,
"learning_rate": 0.00011043216167395344,
"loss": 0.8334,
"step": 1004
},
{
"epoch": 0.48395834587371334,
"grad_norm": 4.043992519378662,
"learning_rate": 0.00011027701245555865,
"loss": 1.1405,
"step": 1005
},
{
"epoch": 0.4844398964666225,
"grad_norm": 5.069822788238525,
"learning_rate": 0.00011012183823096917,
"loss": 0.7535,
"step": 1006
},
{
"epoch": 0.4849214470595317,
"grad_norm": 1.6714848279953003,
"learning_rate": 0.00010996663937775751,
"loss": 0.4408,
"step": 1007
},
{
"epoch": 0.4854029976524409,
"grad_norm": 1.6782582998275757,
"learning_rate": 0.000109811416273556,
"loss": 1.055,
"step": 1008
},
{
"epoch": 0.48588454824535005,
"grad_norm": 2.3483331203460693,
"learning_rate": 0.00010965616929605609,
"loss": 0.7248,
"step": 1009
},
{
"epoch": 0.4863660988382592,
"grad_norm": 2.866668701171875,
"learning_rate": 0.0001095008988230072,
"loss": 0.8629,
"step": 1010
},
{
"epoch": 0.48684764943116837,
"grad_norm": 3.7616584300994873,
"learning_rate": 0.00010934560523221602,
"loss": 0.952,
"step": 1011
},
{
"epoch": 0.48732920002407754,
"grad_norm": 5.05987548828125,
"learning_rate": 0.00010919028890154543,
"loss": 0.7482,
"step": 1012
},
{
"epoch": 0.4878107506169867,
"grad_norm": 3.6084094047546387,
"learning_rate": 0.00010903495020891375,
"loss": 0.8013,
"step": 1013
},
{
"epoch": 0.48829230120989586,
"grad_norm": 2.3544795513153076,
"learning_rate": 0.00010887958953229349,
"loss": 0.9513,
"step": 1014
},
{
"epoch": 0.488773851802805,
"grad_norm": 4.078423500061035,
"learning_rate": 0.00010872420724971088,
"loss": 0.8901,
"step": 1015
},
{
"epoch": 0.4892554023957142,
"grad_norm": 3.262572765350342,
"learning_rate": 0.0001085688037392446,
"loss": 0.7107,
"step": 1016
},
{
"epoch": 0.48973695298862335,
"grad_norm": 3.4895589351654053,
"learning_rate": 0.000108413379379025,
"loss": 0.5975,
"step": 1017
},
{
"epoch": 0.4902185035815325,
"grad_norm": 2.3548641204833984,
"learning_rate": 0.00010825793454723325,
"loss": 0.71,
"step": 1018
},
{
"epoch": 0.4907000541744417,
"grad_norm": 2.5070619583129883,
"learning_rate": 0.00010810246962210018,
"loss": 0.8754,
"step": 1019
},
{
"epoch": 0.49118160476735084,
"grad_norm": 2.6222572326660156,
"learning_rate": 0.00010794698498190557,
"loss": 0.7779,
"step": 1020
},
{
"epoch": 0.49166315536026006,
"grad_norm": 2.8594443798065186,
"learning_rate": 0.00010779148100497722,
"loss": 0.5911,
"step": 1021
},
{
"epoch": 0.4921447059531692,
"grad_norm": 3.380793333053589,
"learning_rate": 0.00010763595806968996,
"loss": 0.8463,
"step": 1022
},
{
"epoch": 0.4926262565460784,
"grad_norm": 3.048558235168457,
"learning_rate": 0.00010748041655446473,
"loss": 1.1503,
"step": 1023
},
{
"epoch": 0.49310780713898755,
"grad_norm": 3.162221670150757,
"learning_rate": 0.00010732485683776768,
"loss": 0.9634,
"step": 1024
},
{
"epoch": 0.4935893577318967,
"grad_norm": 1.7662273645401,
"learning_rate": 0.00010716927929810925,
"loss": 0.9218,
"step": 1025
},
{
"epoch": 0.4940709083248059,
"grad_norm": 6.701080322265625,
"learning_rate": 0.00010701368431404326,
"loss": 0.6088,
"step": 1026
},
{
"epoch": 0.49455245891771504,
"grad_norm": 1.6572067737579346,
"learning_rate": 0.00010685807226416598,
"loss": 0.4006,
"step": 1027
},
{
"epoch": 0.4950340095106242,
"grad_norm": 2.3362746238708496,
"learning_rate": 0.00010670244352711518,
"loss": 0.4711,
"step": 1028
},
{
"epoch": 0.49551556010353337,
"grad_norm": 3.27119779586792,
"learning_rate": 0.00010654679848156925,
"loss": 0.5751,
"step": 1029
},
{
"epoch": 0.49599711069644253,
"grad_norm": 2.6703665256500244,
"learning_rate": 0.00010639113750624625,
"loss": 0.3203,
"step": 1030
},
{
"epoch": 0.4964786612893517,
"grad_norm": 2.749845027923584,
"learning_rate": 0.00010623546097990303,
"loss": 0.7552,
"step": 1031
},
{
"epoch": 0.49696021188226086,
"grad_norm": 1.693564772605896,
"learning_rate": 0.00010607976928133423,
"loss": 0.3451,
"step": 1032
},
{
"epoch": 0.49744176247517,
"grad_norm": 2.492354154586792,
"learning_rate": 0.00010592406278937144,
"loss": 0.6278,
"step": 1033
},
{
"epoch": 0.4979233130680792,
"grad_norm": 3.982508897781372,
"learning_rate": 0.00010576834188288226,
"loss": 0.9494,
"step": 1034
},
{
"epoch": 0.4984048636609884,
"grad_norm": 3.6745517253875732,
"learning_rate": 0.00010561260694076935,
"loss": 0.8115,
"step": 1035
},
{
"epoch": 0.49888641425389757,
"grad_norm": 1.9711278676986694,
"learning_rate": 0.00010545685834196948,
"loss": 0.7224,
"step": 1036
},
{
"epoch": 0.49936796484680673,
"grad_norm": 3.948199510574341,
"learning_rate": 0.00010530109646545272,
"loss": 0.7509,
"step": 1037
},
{
"epoch": 0.4998495154397159,
"grad_norm": 4.0536041259765625,
"learning_rate": 0.0001051453216902214,
"loss": 0.8095,
"step": 1038
},
{
"epoch": 0.500331066032625,
"grad_norm": 3.6049885749816895,
"learning_rate": 0.00010498953439530925,
"loss": 0.8699,
"step": 1039
},
{
"epoch": 0.5008126166255342,
"grad_norm": 1.3765301704406738,
"learning_rate": 0.00010483373495978046,
"loss": 0.6613,
"step": 1040
},
{
"epoch": 0.5012941672184434,
"grad_norm": 1.5237274169921875,
"learning_rate": 0.00010467792376272877,
"loss": 0.8436,
"step": 1041
},
{
"epoch": 0.5017757178113526,
"grad_norm": 2.1992526054382324,
"learning_rate": 0.00010452210118327652,
"loss": 0.429,
"step": 1042
},
{
"epoch": 0.5022572684042618,
"grad_norm": 4.125129222869873,
"learning_rate": 0.00010436626760057378,
"loss": 0.7708,
"step": 1043
},
{
"epoch": 0.5027388189971709,
"grad_norm": 2.204009771347046,
"learning_rate": 0.00010421042339379732,
"loss": 0.5653,
"step": 1044
},
{
"epoch": 0.5032203695900801,
"grad_norm": 4.470865726470947,
"learning_rate": 0.00010405456894214987,
"loss": 0.7858,
"step": 1045
},
{
"epoch": 0.5037019201829892,
"grad_norm": 1.1038165092468262,
"learning_rate": 0.00010389870462485902,
"loss": 1.4328,
"step": 1046
},
{
"epoch": 0.5041834707758984,
"grad_norm": 1.9314682483673096,
"learning_rate": 0.00010374283082117635,
"loss": 0.3706,
"step": 1047
},
{
"epoch": 0.5046650213688075,
"grad_norm": 2.6393468379974365,
"learning_rate": 0.00010358694791037653,
"loss": 1.1257,
"step": 1048
},
{
"epoch": 0.5051465719617168,
"grad_norm": 3.338649034500122,
"learning_rate": 0.00010343105627175644,
"loss": 0.8054,
"step": 1049
},
{
"epoch": 0.5056281225546259,
"grad_norm": 1.6628873348236084,
"learning_rate": 0.00010327515628463415,
"loss": 0.4518,
"step": 1050
},
{
"epoch": 0.5061096731475351,
"grad_norm": 3.3070363998413086,
"learning_rate": 0.00010311924832834808,
"loss": 1.2035,
"step": 1051
},
{
"epoch": 0.5065912237404442,
"grad_norm": 2.4879815578460693,
"learning_rate": 0.00010296333278225599,
"loss": 0.5938,
"step": 1052
},
{
"epoch": 0.5070727743333534,
"grad_norm": 3.5677437782287598,
"learning_rate": 0.00010280741002573413,
"loss": 0.3152,
"step": 1053
},
{
"epoch": 0.5075543249262625,
"grad_norm": 2.475534200668335,
"learning_rate": 0.00010265148043817632,
"loss": 0.789,
"step": 1054
},
{
"epoch": 0.5080358755191717,
"grad_norm": 3.422375202178955,
"learning_rate": 0.00010249554439899298,
"loss": 0.8623,
"step": 1055
},
{
"epoch": 0.508517426112081,
"grad_norm": 1.3005175590515137,
"learning_rate": 0.00010233960228761022,
"loss": 0.6675,
"step": 1056
},
{
"epoch": 0.5089989767049901,
"grad_norm": 1.275846004486084,
"learning_rate": 0.00010218365448346893,
"loss": 0.7612,
"step": 1057
},
{
"epoch": 0.5094805272978993,
"grad_norm": 3.3997249603271484,
"learning_rate": 0.00010202770136602388,
"loss": 0.839,
"step": 1058
},
{
"epoch": 0.5099620778908084,
"grad_norm": 1.7658668756484985,
"learning_rate": 0.00010187174331474271,
"loss": 0.4518,
"step": 1059
},
{
"epoch": 0.5104436284837176,
"grad_norm": 2.3334131240844727,
"learning_rate": 0.00010171578070910512,
"loss": 0.4001,
"step": 1060
},
{
"epoch": 0.5109251790766267,
"grad_norm": 2.203070878982544,
"learning_rate": 0.00010155981392860185,
"loss": 0.8666,
"step": 1061
},
{
"epoch": 0.5114067296695359,
"grad_norm": 1.5210875272750854,
"learning_rate": 0.00010140384335273386,
"loss": 0.8547,
"step": 1062
},
{
"epoch": 0.511888280262445,
"grad_norm": 2.5150206089019775,
"learning_rate": 0.00010124786936101127,
"loss": 0.6131,
"step": 1063
},
{
"epoch": 0.5123698308553543,
"grad_norm": 2.087355852127075,
"learning_rate": 0.00010109189233295255,
"loss": 0.7018,
"step": 1064
},
{
"epoch": 0.5128513814482634,
"grad_norm": 3.2802398204803467,
"learning_rate": 0.00010093591264808358,
"loss": 0.6533,
"step": 1065
},
{
"epoch": 0.5133329320411726,
"grad_norm": 2.5115907192230225,
"learning_rate": 0.00010077993068593663,
"loss": 0.8199,
"step": 1066
},
{
"epoch": 0.5138144826340817,
"grad_norm": 3.236037015914917,
"learning_rate": 0.00010062394682604963,
"loss": 0.649,
"step": 1067
},
{
"epoch": 0.5142960332269909,
"grad_norm": 2.0290400981903076,
"learning_rate": 0.00010046796144796497,
"loss": 0.5048,
"step": 1068
},
{
"epoch": 0.5147775838199001,
"grad_norm": 2.3132944107055664,
"learning_rate": 0.0001003119749312289,
"loss": 0.8111,
"step": 1069
},
{
"epoch": 0.5152591344128092,
"grad_norm": 3.8110101222991943,
"learning_rate": 0.00010015598765539031,
"loss": 0.831,
"step": 1070
},
{
"epoch": 0.5157406850057185,
"grad_norm": 2.6409425735473633,
"learning_rate": 0.0001,
"loss": 0.5713,
"step": 1071
},
{
"epoch": 0.5162222355986276,
"grad_norm": 2.540348768234253,
"learning_rate": 9.984401234460971e-05,
"loss": 0.896,
"step": 1072
},
{
"epoch": 0.5167037861915368,
"grad_norm": 1.605286717414856,
"learning_rate": 9.968802506877111e-05,
"loss": 0.7826,
"step": 1073
},
{
"epoch": 0.5171853367844459,
"grad_norm": 1.9852914810180664,
"learning_rate": 9.953203855203504e-05,
"loss": 0.484,
"step": 1074
},
{
"epoch": 0.5176668873773551,
"grad_norm": 2.697453498840332,
"learning_rate": 9.93760531739504e-05,
"loss": 0.5764,
"step": 1075
},
{
"epoch": 0.5181484379702642,
"grad_norm": 3.257183074951172,
"learning_rate": 9.922006931406338e-05,
"loss": 0.8809,
"step": 1076
},
{
"epoch": 0.5186299885631734,
"grad_norm": 1.4868934154510498,
"learning_rate": 9.906408735191643e-05,
"loss": 0.3895,
"step": 1077
},
{
"epoch": 0.5191115391560825,
"grad_norm": 2.801379919052124,
"learning_rate": 9.890810766704745e-05,
"loss": 0.6476,
"step": 1078
},
{
"epoch": 0.5195930897489918,
"grad_norm": 5.3302321434021,
"learning_rate": 9.875213063898875e-05,
"loss": 0.7303,
"step": 1079
},
{
"epoch": 0.5200746403419009,
"grad_norm": 8.286264419555664,
"learning_rate": 9.859615664726615e-05,
"loss": 0.8864,
"step": 1080
},
{
"epoch": 0.5205561909348101,
"grad_norm": 2.5525264739990234,
"learning_rate": 9.844018607139818e-05,
"loss": 1.2073,
"step": 1081
},
{
"epoch": 0.5210377415277193,
"grad_norm": 3.0127081871032715,
"learning_rate": 9.828421929089493e-05,
"loss": 0.7991,
"step": 1082
},
{
"epoch": 0.5215192921206284,
"grad_norm": 3.983294725418091,
"learning_rate": 9.812825668525733e-05,
"loss": 0.821,
"step": 1083
},
{
"epoch": 0.5220008427135376,
"grad_norm": 7.565732479095459,
"learning_rate": 9.797229863397615e-05,
"loss": 1.2835,
"step": 1084
},
{
"epoch": 0.5224823933064467,
"grad_norm": 2.560930013656616,
"learning_rate": 9.781634551653108e-05,
"loss": 0.7872,
"step": 1085
},
{
"epoch": 0.522963943899356,
"grad_norm": 3.7823336124420166,
"learning_rate": 9.766039771238982e-05,
"loss": 0.9539,
"step": 1086
},
{
"epoch": 0.5234454944922651,
"grad_norm": 1.7851648330688477,
"learning_rate": 9.750445560100706e-05,
"loss": 0.7338,
"step": 1087
},
{
"epoch": 0.5239270450851743,
"grad_norm": 2.3627817630767822,
"learning_rate": 9.73485195618237e-05,
"loss": 0.87,
"step": 1088
},
{
"epoch": 0.5244085956780834,
"grad_norm": 3.479341745376587,
"learning_rate": 9.719258997426588e-05,
"loss": 0.872,
"step": 1089
},
{
"epoch": 0.5248901462709926,
"grad_norm": 3.8782670497894287,
"learning_rate": 9.703666721774402e-05,
"loss": 0.4405,
"step": 1090
},
{
"epoch": 0.5253716968639017,
"grad_norm": 1.5596513748168945,
"learning_rate": 9.688075167165194e-05,
"loss": 0.5061,
"step": 1091
},
{
"epoch": 0.525853247456811,
"grad_norm": 2.221703290939331,
"learning_rate": 9.672484371536586e-05,
"loss": 0.4747,
"step": 1092
},
{
"epoch": 0.52633479804972,
"grad_norm": 5.022744178771973,
"learning_rate": 9.656894372824358e-05,
"loss": 1.0149,
"step": 1093
},
{
"epoch": 0.5268163486426293,
"grad_norm": 1.9501501321792603,
"learning_rate": 9.64130520896235e-05,
"loss": 0.7204,
"step": 1094
},
{
"epoch": 0.5272978992355384,
"grad_norm": 1.2803010940551758,
"learning_rate": 9.625716917882367e-05,
"loss": 0.5088,
"step": 1095
},
{
"epoch": 0.5277794498284476,
"grad_norm": 1.8832592964172363,
"learning_rate": 9.6101295375141e-05,
"loss": 0.921,
"step": 1096
},
{
"epoch": 0.5282610004213568,
"grad_norm": 2.0987727642059326,
"learning_rate": 9.594543105785013e-05,
"loss": 0.8486,
"step": 1097
},
{
"epoch": 0.5287425510142659,
"grad_norm": 3.9266583919525146,
"learning_rate": 9.578957660620267e-05,
"loss": 0.5983,
"step": 1098
},
{
"epoch": 0.5292241016071751,
"grad_norm": 2.6706717014312744,
"learning_rate": 9.563373239942623e-05,
"loss": 0.617,
"step": 1099
},
{
"epoch": 0.5297056522000843,
"grad_norm": 1.8419418334960938,
"learning_rate": 9.547789881672348e-05,
"loss": 0.4538,
"step": 1100
},
{
"epoch": 0.5301872027929935,
"grad_norm": 1.9119439125061035,
"learning_rate": 9.532207623727126e-05,
"loss": 0.7275,
"step": 1101
},
{
"epoch": 0.5306687533859026,
"grad_norm": 3.1396830081939697,
"learning_rate": 9.516626504021957e-05,
"loss": 0.6206,
"step": 1102
},
{
"epoch": 0.5311503039788118,
"grad_norm": 2.5384531021118164,
"learning_rate": 9.501046560469079e-05,
"loss": 1.0057,
"step": 1103
},
{
"epoch": 0.5316318545717209,
"grad_norm": 3.143725872039795,
"learning_rate": 9.485467830977864e-05,
"loss": 1.1685,
"step": 1104
},
{
"epoch": 0.5321134051646301,
"grad_norm": 4.282426357269287,
"learning_rate": 9.469890353454732e-05,
"loss": 0.6259,
"step": 1105
},
{
"epoch": 0.5325949557575392,
"grad_norm": 2.5603525638580322,
"learning_rate": 9.454314165803054e-05,
"loss": 0.6818,
"step": 1106
},
{
"epoch": 0.5330765063504485,
"grad_norm": 2.4884443283081055,
"learning_rate": 9.438739305923067e-05,
"loss": 0.7338,
"step": 1107
},
{
"epoch": 0.5335580569433576,
"grad_norm": 3.4453368186950684,
"learning_rate": 9.423165811711777e-05,
"loss": 0.8649,
"step": 1108
},
{
"epoch": 0.5340396075362668,
"grad_norm": 2.136265516281128,
"learning_rate": 9.407593721062859e-05,
"loss": 0.5932,
"step": 1109
},
{
"epoch": 0.534521158129176,
"grad_norm": 2.1778738498687744,
"learning_rate": 9.39202307186658e-05,
"loss": 0.3037,
"step": 1110
},
{
"epoch": 0.5350027087220851,
"grad_norm": 1.9080350399017334,
"learning_rate": 9.3764539020097e-05,
"loss": 0.7092,
"step": 1111
},
{
"epoch": 0.5354842593149943,
"grad_norm": 2.198676824569702,
"learning_rate": 9.360886249375376e-05,
"loss": 1.0817,
"step": 1112
},
{
"epoch": 0.5359658099079034,
"grad_norm": 1.1177189350128174,
"learning_rate": 9.345320151843078e-05,
"loss": 0.5078,
"step": 1113
},
{
"epoch": 0.5364473605008127,
"grad_norm": 3.8491134643554688,
"learning_rate": 9.329755647288485e-05,
"loss": 0.9873,
"step": 1114
},
{
"epoch": 0.5369289110937218,
"grad_norm": 4.839039325714111,
"learning_rate": 9.314192773583403e-05,
"loss": 0.8585,
"step": 1115
},
{
"epoch": 0.537410461686631,
"grad_norm": 3.628781795501709,
"learning_rate": 9.298631568595674e-05,
"loss": 0.9069,
"step": 1116
},
{
"epoch": 0.5378920122795401,
"grad_norm": 1.936665415763855,
"learning_rate": 9.283072070189075e-05,
"loss": 0.6665,
"step": 1117
},
{
"epoch": 0.5383735628724493,
"grad_norm": 1.807746171951294,
"learning_rate": 9.267514316223234e-05,
"loss": 0.7337,
"step": 1118
},
{
"epoch": 0.5388551134653584,
"grad_norm": 2.178152322769165,
"learning_rate": 9.251958344553528e-05,
"loss": 0.701,
"step": 1119
},
{
"epoch": 0.5393366640582676,
"grad_norm": 2.263169765472412,
"learning_rate": 9.23640419303101e-05,
"loss": 0.6482,
"step": 1120
},
{
"epoch": 0.5398182146511767,
"grad_norm": 2.1158978939056396,
"learning_rate": 9.220851899502283e-05,
"loss": 0.9083,
"step": 1121
},
{
"epoch": 0.540299765244086,
"grad_norm": 2.0844359397888184,
"learning_rate": 9.205301501809448e-05,
"loss": 0.9297,
"step": 1122
},
{
"epoch": 0.5407813158369951,
"grad_norm": 1.7951438426971436,
"learning_rate": 9.189753037789987e-05,
"loss": 0.7921,
"step": 1123
},
{
"epoch": 0.5412628664299043,
"grad_norm": 2.2726521492004395,
"learning_rate": 9.174206545276677e-05,
"loss": 0.8874,
"step": 1124
},
{
"epoch": 0.5417444170228135,
"grad_norm": 3.393622875213623,
"learning_rate": 9.158662062097501e-05,
"loss": 1.0911,
"step": 1125
},
{
"epoch": 0.5422259676157226,
"grad_norm": 1.4040405750274658,
"learning_rate": 9.143119626075542e-05,
"loss": 0.5292,
"step": 1126
},
{
"epoch": 0.5427075182086318,
"grad_norm": 1.302949070930481,
"learning_rate": 9.127579275028914e-05,
"loss": 1.026,
"step": 1127
},
{
"epoch": 0.5431890688015409,
"grad_norm": 2.1794188022613525,
"learning_rate": 9.112041046770653e-05,
"loss": 0.8072,
"step": 1128
},
{
"epoch": 0.5436706193944502,
"grad_norm": 2.3347835540771484,
"learning_rate": 9.096504979108629e-05,
"loss": 0.8512,
"step": 1129
},
{
"epoch": 0.5441521699873593,
"grad_norm": 2.353959083557129,
"learning_rate": 9.080971109845458e-05,
"loss": 0.9363,
"step": 1130
},
{
"epoch": 0.5446337205802685,
"grad_norm": 2.1609857082366943,
"learning_rate": 9.0654394767784e-05,
"loss": 0.3391,
"step": 1131
},
{
"epoch": 0.5451152711731776,
"grad_norm": 3.1173667907714844,
"learning_rate": 9.049910117699281e-05,
"loss": 0.5835,
"step": 1132
},
{
"epoch": 0.5455968217660868,
"grad_norm": 3.2934017181396484,
"learning_rate": 9.034383070394393e-05,
"loss": 0.9396,
"step": 1133
},
{
"epoch": 0.5460783723589959,
"grad_norm": 3.2750277519226074,
"learning_rate": 9.0188583726444e-05,
"loss": 0.8517,
"step": 1134
},
{
"epoch": 0.5465599229519051,
"grad_norm": 0.8598925471305847,
"learning_rate": 9.00333606222425e-05,
"loss": 0.5358,
"step": 1135
},
{
"epoch": 0.5470414735448142,
"grad_norm": 2.2244086265563965,
"learning_rate": 8.987816176903082e-05,
"loss": 0.3203,
"step": 1136
},
{
"epoch": 0.5475230241377235,
"grad_norm": 2.2185556888580322,
"learning_rate": 8.972298754444136e-05,
"loss": 1.0547,
"step": 1137
},
{
"epoch": 0.5480045747306327,
"grad_norm": 1.47505784034729,
"learning_rate": 8.956783832604654e-05,
"loss": 0.4243,
"step": 1138
},
{
"epoch": 0.5484861253235418,
"grad_norm": 3.940340757369995,
"learning_rate": 8.941271449135806e-05,
"loss": 0.8955,
"step": 1139
},
{
"epoch": 0.548967675916451,
"grad_norm": 2.3822591304779053,
"learning_rate": 8.925761641782567e-05,
"loss": 0.6393,
"step": 1140
},
{
"epoch": 0.5494492265093601,
"grad_norm": 1.8161604404449463,
"learning_rate": 8.910254448283659e-05,
"loss": 0.4928,
"step": 1141
},
{
"epoch": 0.5499307771022693,
"grad_norm": 3.2381865978240967,
"learning_rate": 8.894749906371439e-05,
"loss": 0.862,
"step": 1142
},
{
"epoch": 0.5504123276951784,
"grad_norm": 2.2119295597076416,
"learning_rate": 8.87924805377181e-05,
"loss": 0.4778,
"step": 1143
},
{
"epoch": 0.5508938782880877,
"grad_norm": 2.175503730773926,
"learning_rate": 8.863748928204131e-05,
"loss": 0.3811,
"step": 1144
},
{
"epoch": 0.5513754288809968,
"grad_norm": 3.0040910243988037,
"learning_rate": 8.848252567381131e-05,
"loss": 0.5659,
"step": 1145
},
{
"epoch": 0.551856979473906,
"grad_norm": 2.7777600288391113,
"learning_rate": 8.83275900900881e-05,
"loss": 0.5106,
"step": 1146
},
{
"epoch": 0.5523385300668151,
"grad_norm": 4.878298282623291,
"learning_rate": 8.817268290786343e-05,
"loss": 0.5554,
"step": 1147
},
{
"epoch": 0.5528200806597243,
"grad_norm": 5.596711158752441,
"learning_rate": 8.801780450406002e-05,
"loss": 0.4911,
"step": 1148
},
{
"epoch": 0.5533016312526334,
"grad_norm": 3.1435718536376953,
"learning_rate": 8.786295525553053e-05,
"loss": 0.3324,
"step": 1149
},
{
"epoch": 0.5537831818455426,
"grad_norm": 3.044595956802368,
"learning_rate": 8.770813553905664e-05,
"loss": 0.6101,
"step": 1150
},
{
"epoch": 0.5542647324384519,
"grad_norm": 2.739715576171875,
"learning_rate": 8.755334573134829e-05,
"loss": 0.6972,
"step": 1151
},
{
"epoch": 0.554746283031361,
"grad_norm": 1.3641911745071411,
"learning_rate": 8.739858620904251e-05,
"loss": 0.4947,
"step": 1152
},
{
"epoch": 0.5552278336242702,
"grad_norm": 1.8812917470932007,
"learning_rate": 8.724385734870271e-05,
"loss": 0.8228,
"step": 1153
},
{
"epoch": 0.5557093842171793,
"grad_norm": 3.0910966396331787,
"learning_rate": 8.708915952681769e-05,
"loss": 0.5776,
"step": 1154
},
{
"epoch": 0.5561909348100885,
"grad_norm": 2.192817449569702,
"learning_rate": 8.693449311980074e-05,
"loss": 0.935,
"step": 1155
},
{
"epoch": 0.5566724854029976,
"grad_norm": 2.3270866870880127,
"learning_rate": 8.677985850398866e-05,
"loss": 0.5251,
"step": 1156
},
{
"epoch": 0.5571540359959068,
"grad_norm": 3.0047972202301025,
"learning_rate": 8.662525605564093e-05,
"loss": 0.9796,
"step": 1157
},
{
"epoch": 0.557635586588816,
"grad_norm": 2.3164725303649902,
"learning_rate": 8.647068615093875e-05,
"loss": 1.551,
"step": 1158
},
{
"epoch": 0.5581171371817252,
"grad_norm": 3.4601895809173584,
"learning_rate": 8.631614916598419e-05,
"loss": 0.7455,
"step": 1159
},
{
"epoch": 0.5585986877746343,
"grad_norm": 3.388256549835205,
"learning_rate": 8.616164547679906e-05,
"loss": 0.5484,
"step": 1160
},
{
"epoch": 0.5590802383675435,
"grad_norm": 2.2302229404449463,
"learning_rate": 8.600717545932435e-05,
"loss": 0.5789,
"step": 1161
},
{
"epoch": 0.5595617889604526,
"grad_norm": 2.5507445335388184,
"learning_rate": 8.5852739489419e-05,
"loss": 0.2962,
"step": 1162
},
{
"epoch": 0.5600433395533618,
"grad_norm": 2.2931394577026367,
"learning_rate": 8.569833794285915e-05,
"loss": 0.9057,
"step": 1163
},
{
"epoch": 0.5605248901462709,
"grad_norm": 2.3694357872009277,
"learning_rate": 8.554397119533714e-05,
"loss": 0.9051,
"step": 1164
},
{
"epoch": 0.5610064407391802,
"grad_norm": 2.3861167430877686,
"learning_rate": 8.538963962246069e-05,
"loss": 0.6481,
"step": 1165
},
{
"epoch": 0.5614879913320894,
"grad_norm": 3.3635268211364746,
"learning_rate": 8.523534359975189e-05,
"loss": 0.6873,
"step": 1166
},
{
"epoch": 0.5619695419249985,
"grad_norm": 2.3280465602874756,
"learning_rate": 8.508108350264635e-05,
"loss": 0.4409,
"step": 1167
},
{
"epoch": 0.5624510925179077,
"grad_norm": 2.3025784492492676,
"learning_rate": 8.492685970649228e-05,
"loss": 0.3629,
"step": 1168
},
{
"epoch": 0.5629326431108168,
"grad_norm": 4.259292125701904,
"learning_rate": 8.477267258654949e-05,
"loss": 0.7646,
"step": 1169
},
{
"epoch": 0.563414193703726,
"grad_norm": 3.045973777770996,
"learning_rate": 8.461852251798866e-05,
"loss": 0.8309,
"step": 1170
},
{
"epoch": 0.5638957442966351,
"grad_norm": 2.590165376663208,
"learning_rate": 8.44644098758902e-05,
"loss": 0.4435,
"step": 1171
},
{
"epoch": 0.5643772948895444,
"grad_norm": 2.0724105834960938,
"learning_rate": 8.431033503524354e-05,
"loss": 0.4976,
"step": 1172
},
{
"epoch": 0.5648588454824535,
"grad_norm": 3.144411087036133,
"learning_rate": 8.415629837094611e-05,
"loss": 0.9775,
"step": 1173
},
{
"epoch": 0.5653403960753627,
"grad_norm": 2.584644079208374,
"learning_rate": 8.400230025780243e-05,
"loss": 0.6065,
"step": 1174
},
{
"epoch": 0.5658219466682718,
"grad_norm": 1.8154007196426392,
"learning_rate": 8.384834107052321e-05,
"loss": 0.3035,
"step": 1175
},
{
"epoch": 0.566303497261181,
"grad_norm": 3.097371816635132,
"learning_rate": 8.369442118372447e-05,
"loss": 0.6747,
"step": 1176
},
{
"epoch": 0.5667850478540901,
"grad_norm": 1.322751522064209,
"learning_rate": 8.35405409719266e-05,
"loss": 0.6194,
"step": 1177
},
{
"epoch": 0.5672665984469993,
"grad_norm": 2.8619985580444336,
"learning_rate": 8.338670080955349e-05,
"loss": 0.9159,
"step": 1178
},
{
"epoch": 0.5677481490399086,
"grad_norm": 1.4597111940383911,
"learning_rate": 8.323290107093143e-05,
"loss": 0.5528,
"step": 1179
},
{
"epoch": 0.5682296996328177,
"grad_norm": 1.321386694908142,
"learning_rate": 8.307914213028856e-05,
"loss": 0.5454,
"step": 1180
},
{
"epoch": 0.5687112502257269,
"grad_norm": 2.653350591659546,
"learning_rate": 8.292542436175356e-05,
"loss": 0.6959,
"step": 1181
},
{
"epoch": 0.569192800818636,
"grad_norm": 3.2664124965667725,
"learning_rate": 8.277174813935508e-05,
"loss": 0.9298,
"step": 1182
},
{
"epoch": 0.5696743514115452,
"grad_norm": 3.0547754764556885,
"learning_rate": 8.261811383702061e-05,
"loss": 0.7422,
"step": 1183
},
{
"epoch": 0.5701559020044543,
"grad_norm": 2.19242000579834,
"learning_rate": 8.246452182857562e-05,
"loss": 0.7436,
"step": 1184
},
{
"epoch": 0.5706374525973635,
"grad_norm": 4.479813098907471,
"learning_rate": 8.231097248774274e-05,
"loss": 1.1304,
"step": 1185
},
{
"epoch": 0.5711190031902726,
"grad_norm": 2.662180185317993,
"learning_rate": 8.215746618814067e-05,
"loss": 0.6066,
"step": 1186
},
{
"epoch": 0.5716005537831819,
"grad_norm": 3.7930872440338135,
"learning_rate": 8.200400330328348e-05,
"loss": 0.7421,
"step": 1187
},
{
"epoch": 0.572082104376091,
"grad_norm": 2.9955811500549316,
"learning_rate": 8.185058420657957e-05,
"loss": 1.1659,
"step": 1188
},
{
"epoch": 0.5725636549690002,
"grad_norm": 1.8238601684570312,
"learning_rate": 8.16972092713308e-05,
"loss": 0.636,
"step": 1189
},
{
"epoch": 0.5730452055619093,
"grad_norm": 2.3906164169311523,
"learning_rate": 8.154387887073158e-05,
"loss": 0.4951,
"step": 1190
},
{
"epoch": 0.5735267561548185,
"grad_norm": 1.853758692741394,
"learning_rate": 8.139059337786792e-05,
"loss": 0.7715,
"step": 1191
},
{
"epoch": 0.5740083067477277,
"grad_norm": 2.8323585987091064,
"learning_rate": 8.12373531657166e-05,
"loss": 1.0706,
"step": 1192
},
{
"epoch": 0.5744898573406368,
"grad_norm": 1.3406877517700195,
"learning_rate": 8.108415860714418e-05,
"loss": 0.3461,
"step": 1193
},
{
"epoch": 0.5749714079335461,
"grad_norm": 2.031278371810913,
"learning_rate": 8.093101007490622e-05,
"loss": 0.8868,
"step": 1194
},
{
"epoch": 0.5754529585264552,
"grad_norm": 3.345834255218506,
"learning_rate": 8.077790794164619e-05,
"loss": 0.4278,
"step": 1195
},
{
"epoch": 0.5759345091193644,
"grad_norm": 2.130840301513672,
"learning_rate": 8.062485257989471e-05,
"loss": 1.0242,
"step": 1196
},
{
"epoch": 0.5764160597122735,
"grad_norm": 2.4846746921539307,
"learning_rate": 8.047184436206864e-05,
"loss": 0.7,
"step": 1197
},
{
"epoch": 0.5768976103051827,
"grad_norm": 2.193743944168091,
"learning_rate": 8.031888366046998e-05,
"loss": 0.6467,
"step": 1198
},
{
"epoch": 0.5773791608980918,
"grad_norm": 1.9895037412643433,
"learning_rate": 8.016597084728526e-05,
"loss": 0.7244,
"step": 1199
},
{
"epoch": 0.577860711491001,
"grad_norm": 2.5619122982025146,
"learning_rate": 8.001310629458443e-05,
"loss": 0.9385,
"step": 1200
},
{
"epoch": 0.5783422620839102,
"grad_norm": 1.9781352281570435,
"learning_rate": 7.986029037432002e-05,
"loss": 0.8172,
"step": 1201
},
{
"epoch": 0.5788238126768194,
"grad_norm": 3.2591843605041504,
"learning_rate": 7.970752345832623e-05,
"loss": 0.7278,
"step": 1202
},
{
"epoch": 0.5793053632697285,
"grad_norm": 1.6107450723648071,
"learning_rate": 7.9554805918318e-05,
"loss": 0.3799,
"step": 1203
},
{
"epoch": 0.5797869138626377,
"grad_norm": 3.175673484802246,
"learning_rate": 7.940213812589018e-05,
"loss": 0.5979,
"step": 1204
},
{
"epoch": 0.5802684644555468,
"grad_norm": 2.0690531730651855,
"learning_rate": 7.92495204525165e-05,
"loss": 0.6842,
"step": 1205
},
{
"epoch": 0.580750015048456,
"grad_norm": 3.2871673107147217,
"learning_rate": 7.909695326954878e-05,
"loss": 1.0002,
"step": 1206
},
{
"epoch": 0.5812315656413652,
"grad_norm": 1.0341432094573975,
"learning_rate": 7.894443694821602e-05,
"loss": 0.516,
"step": 1207
},
{
"epoch": 0.5817131162342744,
"grad_norm": 2.580730676651001,
"learning_rate": 7.879197185962339e-05,
"loss": 0.7898,
"step": 1208
},
{
"epoch": 0.5821946668271836,
"grad_norm": 4.864838600158691,
"learning_rate": 7.863955837475144e-05,
"loss": 0.8172,
"step": 1209
},
{
"epoch": 0.5826762174200927,
"grad_norm": 1.060434341430664,
"learning_rate": 7.848719686445515e-05,
"loss": 0.3784,
"step": 1210
},
{
"epoch": 0.5831577680130019,
"grad_norm": 3.191971778869629,
"learning_rate": 7.833488769946306e-05,
"loss": 0.8063,
"step": 1211
},
{
"epoch": 0.583639318605911,
"grad_norm": 2.525768518447876,
"learning_rate": 7.818263125037633e-05,
"loss": 0.6985,
"step": 1212
},
{
"epoch": 0.5841208691988202,
"grad_norm": 2.8149242401123047,
"learning_rate": 7.803042788766777e-05,
"loss": 1.064,
"step": 1213
},
{
"epoch": 0.5846024197917293,
"grad_norm": 3.0168797969818115,
"learning_rate": 7.787827798168115e-05,
"loss": 0.387,
"step": 1214
},
{
"epoch": 0.5850839703846386,
"grad_norm": 2.1874630451202393,
"learning_rate": 7.772618190263009e-05,
"loss": 0.5811,
"step": 1215
},
{
"epoch": 0.5855655209775477,
"grad_norm": 1.8578369617462158,
"learning_rate": 7.757414002059726e-05,
"loss": 0.5424,
"step": 1216
},
{
"epoch": 0.5860470715704569,
"grad_norm": 3.507887840270996,
"learning_rate": 7.742215270553349e-05,
"loss": 0.5704,
"step": 1217
},
{
"epoch": 0.586528622163366,
"grad_norm": 1.8217120170593262,
"learning_rate": 7.727022032725672e-05,
"loss": 0.72,
"step": 1218
},
{
"epoch": 0.5870101727562752,
"grad_norm": 1.2904176712036133,
"learning_rate": 7.711834325545135e-05,
"loss": 0.4966,
"step": 1219
},
{
"epoch": 0.5874917233491844,
"grad_norm": 1.9854986667633057,
"learning_rate": 7.696652185966711e-05,
"loss": 0.6202,
"step": 1220
},
{
"epoch": 0.5879732739420935,
"grad_norm": 2.831481456756592,
"learning_rate": 7.681475650931834e-05,
"loss": 0.568,
"step": 1221
},
{
"epoch": 0.5884548245350028,
"grad_norm": 2.528315305709839,
"learning_rate": 7.666304757368297e-05,
"loss": 0.9762,
"step": 1222
},
{
"epoch": 0.5889363751279119,
"grad_norm": 1.8668657541275024,
"learning_rate": 7.651139542190164e-05,
"loss": 0.7539,
"step": 1223
},
{
"epoch": 0.5894179257208211,
"grad_norm": 2.6514816284179688,
"learning_rate": 7.635980042297687e-05,
"loss": 0.6104,
"step": 1224
},
{
"epoch": 0.5898994763137302,
"grad_norm": 2.8228659629821777,
"learning_rate": 7.620826294577208e-05,
"loss": 0.5398,
"step": 1225
},
{
"epoch": 0.5903810269066394,
"grad_norm": 1.8238624334335327,
"learning_rate": 7.605678335901071e-05,
"loss": 0.4965,
"step": 1226
},
{
"epoch": 0.5908625774995485,
"grad_norm": 2.332958221435547,
"learning_rate": 7.59053620312754e-05,
"loss": 1.0528,
"step": 1227
},
{
"epoch": 0.5913441280924577,
"grad_norm": 3.5963058471679688,
"learning_rate": 7.575399933100697e-05,
"loss": 0.5706,
"step": 1228
},
{
"epoch": 0.5918256786853668,
"grad_norm": 3.345517873764038,
"learning_rate": 7.560269562650368e-05,
"loss": 1.0137,
"step": 1229
},
{
"epoch": 0.5923072292782761,
"grad_norm": 5.635433673858643,
"learning_rate": 7.54514512859201e-05,
"loss": 0.4088,
"step": 1230
},
{
"epoch": 0.5927887798711852,
"grad_norm": 2.0128109455108643,
"learning_rate": 7.530026667726645e-05,
"loss": 0.5574,
"step": 1231
},
{
"epoch": 0.5932703304640944,
"grad_norm": 2.09451961517334,
"learning_rate": 7.51491421684076e-05,
"loss": 0.9239,
"step": 1232
},
{
"epoch": 0.5937518810570036,
"grad_norm": 3.1210248470306396,
"learning_rate": 7.49980781270622e-05,
"loss": 0.9972,
"step": 1233
},
{
"epoch": 0.5942334316499127,
"grad_norm": 2.9423022270202637,
"learning_rate": 7.484707492080172e-05,
"loss": 0.9545,
"step": 1234
},
{
"epoch": 0.5947149822428219,
"grad_norm": 8.651620864868164,
"learning_rate": 7.469613291704962e-05,
"loss": 0.9859,
"step": 1235
},
{
"epoch": 0.595196532835731,
"grad_norm": 2.376633644104004,
"learning_rate": 7.45452524830805e-05,
"loss": 0.6038,
"step": 1236
},
{
"epoch": 0.5956780834286403,
"grad_norm": 1.0200681686401367,
"learning_rate": 7.439443398601903e-05,
"loss": 0.4735,
"step": 1237
},
{
"epoch": 0.5961596340215494,
"grad_norm": 3.08100962638855,
"learning_rate": 7.424367779283926e-05,
"loss": 0.9614,
"step": 1238
},
{
"epoch": 0.5966411846144586,
"grad_norm": 1.8693490028381348,
"learning_rate": 7.409298427036364e-05,
"loss": 0.4885,
"step": 1239
},
{
"epoch": 0.5971227352073677,
"grad_norm": 3.25297474861145,
"learning_rate": 7.39423537852621e-05,
"loss": 0.7068,
"step": 1240
},
{
"epoch": 0.5976042858002769,
"grad_norm": 2.2627036571502686,
"learning_rate": 7.379178670405123e-05,
"loss": 0.9651,
"step": 1241
},
{
"epoch": 0.598085836393186,
"grad_norm": 3.4235429763793945,
"learning_rate": 7.364128339309326e-05,
"loss": 0.9293,
"step": 1242
},
{
"epoch": 0.5985673869860952,
"grad_norm": 2.710484743118286,
"learning_rate": 7.349084421859533e-05,
"loss": 0.6263,
"step": 1243
},
{
"epoch": 0.5990489375790043,
"grad_norm": 2.2872800827026367,
"learning_rate": 7.334046954660852e-05,
"loss": 0.4224,
"step": 1244
},
{
"epoch": 0.5995304881719136,
"grad_norm": 1.7476857900619507,
"learning_rate": 7.31901597430269e-05,
"loss": 0.7634,
"step": 1245
},
{
"epoch": 0.6000120387648227,
"grad_norm": 2.7267417907714844,
"learning_rate": 7.303991517358678e-05,
"loss": 0.845,
"step": 1246
},
{
"epoch": 0.6004935893577319,
"grad_norm": 2.053980827331543,
"learning_rate": 7.288973620386568e-05,
"loss": 0.8618,
"step": 1247
},
{
"epoch": 0.6009751399506411,
"grad_norm": 1.9945694208145142,
"learning_rate": 7.273962319928151e-05,
"loss": 0.7425,
"step": 1248
},
{
"epoch": 0.6014566905435502,
"grad_norm": 1.3418025970458984,
"learning_rate": 7.258957652509171e-05,
"loss": 0.6352,
"step": 1249
},
{
"epoch": 0.6019382411364594,
"grad_norm": 2.392909288406372,
"learning_rate": 7.24395965463923e-05,
"loss": 0.4533,
"step": 1250
},
{
"epoch": 0.6024197917293685,
"grad_norm": 2.3010342121124268,
"learning_rate": 7.228968362811702e-05,
"loss": 0.4342,
"step": 1251
},
{
"epoch": 0.6029013423222778,
"grad_norm": 3.485913038253784,
"learning_rate": 7.21398381350364e-05,
"loss": 0.9649,
"step": 1252
},
{
"epoch": 0.6033828929151869,
"grad_norm": 0.9633323550224304,
"learning_rate": 7.199006043175698e-05,
"loss": 0.7225,
"step": 1253
},
{
"epoch": 0.6038644435080961,
"grad_norm": 2.767479658126831,
"learning_rate": 7.184035088272028e-05,
"loss": 0.5086,
"step": 1254
},
{
"epoch": 0.6043459941010052,
"grad_norm": 3.417263984680176,
"learning_rate": 7.169070985220208e-05,
"loss": 0.7542,
"step": 1255
},
{
"epoch": 0.6048275446939144,
"grad_norm": 1.0457793474197388,
"learning_rate": 7.154113770431132e-05,
"loss": 1.0051,
"step": 1256
},
{
"epoch": 0.6053090952868235,
"grad_norm": 1.780932068824768,
"learning_rate": 7.13916348029894e-05,
"loss": 0.7171,
"step": 1257
},
{
"epoch": 0.6057906458797327,
"grad_norm": 3.2504794597625732,
"learning_rate": 7.124220151200926e-05,
"loss": 0.4477,
"step": 1258
},
{
"epoch": 0.6062721964726419,
"grad_norm": 3.2658979892730713,
"learning_rate": 7.10928381949744e-05,
"loss": 0.5208,
"step": 1259
},
{
"epoch": 0.6067537470655511,
"grad_norm": 2.36083984375,
"learning_rate": 7.094354521531807e-05,
"loss": 0.8187,
"step": 1260
},
{
"epoch": 0.6072352976584603,
"grad_norm": 1.9257503747940063,
"learning_rate": 7.079432293630244e-05,
"loss": 0.9669,
"step": 1261
},
{
"epoch": 0.6077168482513694,
"grad_norm": 3.070887804031372,
"learning_rate": 7.064517172101753e-05,
"loss": 0.8842,
"step": 1262
},
{
"epoch": 0.6081983988442786,
"grad_norm": 2.950284481048584,
"learning_rate": 7.04960919323806e-05,
"loss": 0.6997,
"step": 1263
},
{
"epoch": 0.6086799494371877,
"grad_norm": 3.656165838241577,
"learning_rate": 7.034708393313493e-05,
"loss": 0.7774,
"step": 1264
},
{
"epoch": 0.609161500030097,
"grad_norm": 3.879746198654175,
"learning_rate": 7.019814808584928e-05,
"loss": 0.6871,
"step": 1265
},
{
"epoch": 0.609643050623006,
"grad_norm": 2.684112310409546,
"learning_rate": 7.004928475291678e-05,
"loss": 0.36,
"step": 1266
},
{
"epoch": 0.6101246012159153,
"grad_norm": 4.9579644203186035,
"learning_rate": 6.990049429655412e-05,
"loss": 0.888,
"step": 1267
},
{
"epoch": 0.6106061518088244,
"grad_norm": 2.2652103900909424,
"learning_rate": 6.97517770788007e-05,
"loss": 0.6242,
"step": 1268
},
{
"epoch": 0.6110877024017336,
"grad_norm": 2.9428718090057373,
"learning_rate": 6.960313346151761e-05,
"loss": 0.5431,
"step": 1269
},
{
"epoch": 0.6115692529946427,
"grad_norm": 3.530306339263916,
"learning_rate": 6.9454563806387e-05,
"loss": 1.0434,
"step": 1270
},
{
"epoch": 0.6120508035875519,
"grad_norm": 1.2338889837265015,
"learning_rate": 6.930606847491094e-05,
"loss": 0.7309,
"step": 1271
},
{
"epoch": 0.612532354180461,
"grad_norm": 2.873732328414917,
"learning_rate": 6.915764782841072e-05,
"loss": 0.8321,
"step": 1272
},
{
"epoch": 0.6130139047733703,
"grad_norm": 2.2155025005340576,
"learning_rate": 6.900930222802588e-05,
"loss": 0.3917,
"step": 1273
},
{
"epoch": 0.6134954553662795,
"grad_norm": 1.7441500425338745,
"learning_rate": 6.886103203471337e-05,
"loss": 0.587,
"step": 1274
},
{
"epoch": 0.6139770059591886,
"grad_norm": 3.1032984256744385,
"learning_rate": 6.871283760924665e-05,
"loss": 0.6219,
"step": 1275
},
{
"epoch": 0.6144585565520978,
"grad_norm": 1.5018118619918823,
"learning_rate": 6.856471931221478e-05,
"loss": 0.8532,
"step": 1276
},
{
"epoch": 0.6149401071450069,
"grad_norm": 2.236863851547241,
"learning_rate": 6.841667750402162e-05,
"loss": 0.4704,
"step": 1277
},
{
"epoch": 0.6154216577379161,
"grad_norm": 2.4868059158325195,
"learning_rate": 6.826871254488496e-05,
"loss": 0.688,
"step": 1278
},
{
"epoch": 0.6159032083308252,
"grad_norm": 1.9790339469909668,
"learning_rate": 6.812082479483553e-05,
"loss": 0.2572,
"step": 1279
},
{
"epoch": 0.6163847589237345,
"grad_norm": 8.326330184936523,
"learning_rate": 6.797301461371625e-05,
"loss": 0.7398,
"step": 1280
},
{
"epoch": 0.6168663095166436,
"grad_norm": 2.0423824787139893,
"learning_rate": 6.782528236118124e-05,
"loss": 0.6242,
"step": 1281
},
{
"epoch": 0.6173478601095528,
"grad_norm": 5.033036708831787,
"learning_rate": 6.767762839669503e-05,
"loss": 0.9255,
"step": 1282
},
{
"epoch": 0.6178294107024619,
"grad_norm": 1.7515184879302979,
"learning_rate": 6.753005307953167e-05,
"loss": 0.536,
"step": 1283
},
{
"epoch": 0.6183109612953711,
"grad_norm": 3.3410611152648926,
"learning_rate": 6.738255676877381e-05,
"loss": 0.6655,
"step": 1284
},
{
"epoch": 0.6187925118882802,
"grad_norm": 2.5926554203033447,
"learning_rate": 6.723513982331195e-05,
"loss": 0.7555,
"step": 1285
},
{
"epoch": 0.6192740624811894,
"grad_norm": 3.253159761428833,
"learning_rate": 6.708780260184333e-05,
"loss": 0.5316,
"step": 1286
},
{
"epoch": 0.6197556130740985,
"grad_norm": 3.1270864009857178,
"learning_rate": 6.694054546287132e-05,
"loss": 0.6255,
"step": 1287
},
{
"epoch": 0.6202371636670078,
"grad_norm": 5.128495216369629,
"learning_rate": 6.679336876470441e-05,
"loss": 0.7771,
"step": 1288
},
{
"epoch": 0.620718714259917,
"grad_norm": 1.7940768003463745,
"learning_rate": 6.664627286545535e-05,
"loss": 0.7788,
"step": 1289
},
{
"epoch": 0.6212002648528261,
"grad_norm": 2.9516167640686035,
"learning_rate": 6.649925812304025e-05,
"loss": 0.3909,
"step": 1290
},
{
"epoch": 0.6216818154457353,
"grad_norm": 1.452250599861145,
"learning_rate": 6.635232489517782e-05,
"loss": 0.6476,
"step": 1291
},
{
"epoch": 0.6221633660386444,
"grad_norm": 1.570677638053894,
"learning_rate": 6.620547353938836e-05,
"loss": 0.4986,
"step": 1292
},
{
"epoch": 0.6226449166315536,
"grad_norm": 2.5657029151916504,
"learning_rate": 6.605870441299302e-05,
"loss": 0.7346,
"step": 1293
},
{
"epoch": 0.6231264672244627,
"grad_norm": 3.6876044273376465,
"learning_rate": 6.591201787311285e-05,
"loss": 1.2753,
"step": 1294
},
{
"epoch": 0.623608017817372,
"grad_norm": 2.211846113204956,
"learning_rate": 6.57654142766679e-05,
"loss": 0.8497,
"step": 1295
},
{
"epoch": 0.6240895684102811,
"grad_norm": 2.1840100288391113,
"learning_rate": 6.561889398037643e-05,
"loss": 0.4188,
"step": 1296
},
{
"epoch": 0.6245711190031903,
"grad_norm": 1.7685606479644775,
"learning_rate": 6.547245734075403e-05,
"loss": 0.6529,
"step": 1297
},
{
"epoch": 0.6250526695960994,
"grad_norm": 1.6937798261642456,
"learning_rate": 6.532610471411274e-05,
"loss": 0.5592,
"step": 1298
},
{
"epoch": 0.6255342201890086,
"grad_norm": 2.8329896926879883,
"learning_rate": 6.517983645656014e-05,
"loss": 0.6343,
"step": 1299
},
{
"epoch": 0.6260157707819177,
"grad_norm": 2.330113172531128,
"learning_rate": 6.503365292399857e-05,
"loss": 1.0539,
"step": 1300
},
{
"epoch": 0.6264973213748269,
"grad_norm": 2.074939489364624,
"learning_rate": 6.488755447212418e-05,
"loss": 0.7005,
"step": 1301
},
{
"epoch": 0.6269788719677362,
"grad_norm": 1.091862678527832,
"learning_rate": 6.474154145642612e-05,
"loss": 0.9088,
"step": 1302
},
{
"epoch": 0.6274604225606453,
"grad_norm": 2.839646577835083,
"learning_rate": 6.459561423218561e-05,
"loss": 0.8255,
"step": 1303
},
{
"epoch": 0.6279419731535545,
"grad_norm": 2.919734477996826,
"learning_rate": 6.444977315447521e-05,
"loss": 0.5693,
"step": 1304
},
{
"epoch": 0.6284235237464636,
"grad_norm": 3.9232397079467773,
"learning_rate": 6.430401857815776e-05,
"loss": 0.9091,
"step": 1305
},
{
"epoch": 0.6289050743393728,
"grad_norm": 2.6297950744628906,
"learning_rate": 6.415835085788575e-05,
"loss": 0.6015,
"step": 1306
},
{
"epoch": 0.6293866249322819,
"grad_norm": 4.350391387939453,
"learning_rate": 6.401277034810017e-05,
"loss": 0.4089,
"step": 1307
},
{
"epoch": 0.6298681755251911,
"grad_norm": 2.0582656860351562,
"learning_rate": 6.386727740302994e-05,
"loss": 0.5737,
"step": 1308
},
{
"epoch": 0.6303497261181003,
"grad_norm": 2.8593883514404297,
"learning_rate": 6.37218723766909e-05,
"loss": 0.5349,
"step": 1309
},
{
"epoch": 0.6308312767110095,
"grad_norm": 2.048414945602417,
"learning_rate": 6.357655562288488e-05,
"loss": 0.928,
"step": 1310
},
{
"epoch": 0.6313128273039186,
"grad_norm": 2.7405006885528564,
"learning_rate": 6.343132749519902e-05,
"loss": 0.9519,
"step": 1311
},
{
"epoch": 0.6317943778968278,
"grad_norm": 1.9664356708526611,
"learning_rate": 6.328618834700474e-05,
"loss": 0.5531,
"step": 1312
},
{
"epoch": 0.6322759284897369,
"grad_norm": 3.6196768283843994,
"learning_rate": 6.314113853145703e-05,
"loss": 0.9089,
"step": 1313
},
{
"epoch": 0.6327574790826461,
"grad_norm": 2.040229082107544,
"learning_rate": 6.299617840149349e-05,
"loss": 0.7539,
"step": 1314
},
{
"epoch": 0.6332390296755553,
"grad_norm": 3.340404748916626,
"learning_rate": 6.285130830983339e-05,
"loss": 0.4569,
"step": 1315
},
{
"epoch": 0.6337205802684645,
"grad_norm": 1.9362350702285767,
"learning_rate": 6.270652860897704e-05,
"loss": 0.6094,
"step": 1316
},
{
"epoch": 0.6342021308613737,
"grad_norm": 3.0248935222625732,
"learning_rate": 6.25618396512048e-05,
"loss": 0.9306,
"step": 1317
},
{
"epoch": 0.6346836814542828,
"grad_norm": 4.514108657836914,
"learning_rate": 6.24172417885762e-05,
"loss": 0.672,
"step": 1318
},
{
"epoch": 0.635165232047192,
"grad_norm": 1.5764905214309692,
"learning_rate": 6.227273537292911e-05,
"loss": 0.8099,
"step": 1319
},
{
"epoch": 0.6356467826401011,
"grad_norm": 1.775439739227295,
"learning_rate": 6.212832075587891e-05,
"loss": 0.6518,
"step": 1320
},
{
"epoch": 0.6361283332330103,
"grad_norm": 2.8492040634155273,
"learning_rate": 6.19839982888176e-05,
"loss": 0.6318,
"step": 1321
},
{
"epoch": 0.6366098838259194,
"grad_norm": 3.787897825241089,
"learning_rate": 6.183976832291296e-05,
"loss": 0.546,
"step": 1322
},
{
"epoch": 0.6370914344188287,
"grad_norm": 1.812030553817749,
"learning_rate": 6.169563120910775e-05,
"loss": 0.9272,
"step": 1323
},
{
"epoch": 0.6375729850117378,
"grad_norm": 3.6685822010040283,
"learning_rate": 6.155158729811867e-05,
"loss": 0.9627,
"step": 1324
},
{
"epoch": 0.638054535604647,
"grad_norm": 2.6122891902923584,
"learning_rate": 6.140763694043578e-05,
"loss": 0.6109,
"step": 1325
},
{
"epoch": 0.6385360861975561,
"grad_norm": 1.1181974411010742,
"learning_rate": 6.126378048632139e-05,
"loss": 0.6805,
"step": 1326
},
{
"epoch": 0.6390176367904653,
"grad_norm": 2.0275444984436035,
"learning_rate": 6.112001828580944e-05,
"loss": 0.9841,
"step": 1327
},
{
"epoch": 0.6394991873833744,
"grad_norm": 2.3112661838531494,
"learning_rate": 6.0976350688704455e-05,
"loss": 0.4051,
"step": 1328
},
{
"epoch": 0.6399807379762836,
"grad_norm": 2.2878177165985107,
"learning_rate": 6.083277804458072e-05,
"loss": 0.6933,
"step": 1329
},
{
"epoch": 0.6404622885691929,
"grad_norm": 2.3939578533172607,
"learning_rate": 6.068930070278159e-05,
"loss": 0.7104,
"step": 1330
},
{
"epoch": 0.640943839162102,
"grad_norm": 2.7695722579956055,
"learning_rate": 6.054591901241846e-05,
"loss": 0.592,
"step": 1331
},
{
"epoch": 0.6414253897550112,
"grad_norm": 1.4362547397613525,
"learning_rate": 6.040263332237002e-05,
"loss": 0.7355,
"step": 1332
},
{
"epoch": 0.6419069403479203,
"grad_norm": 1.8200515508651733,
"learning_rate": 6.025944398128137e-05,
"loss": 0.6226,
"step": 1333
},
{
"epoch": 0.6423884909408295,
"grad_norm": 3.0382542610168457,
"learning_rate": 6.011635133756309e-05,
"loss": 0.5577,
"step": 1334
},
{
"epoch": 0.6428700415337386,
"grad_norm": 1.6607255935668945,
"learning_rate": 5.99733557393906e-05,
"loss": 0.7285,
"step": 1335
},
{
"epoch": 0.6433515921266478,
"grad_norm": 5.5869951248168945,
"learning_rate": 5.983045753470308e-05,
"loss": 1.0281,
"step": 1336
},
{
"epoch": 0.6438331427195569,
"grad_norm": 1.662786841392517,
"learning_rate": 5.96876570712028e-05,
"loss": 0.9,
"step": 1337
},
{
"epoch": 0.6443146933124662,
"grad_norm": 1.6657735109329224,
"learning_rate": 5.954495469635417e-05,
"loss": 0.3676,
"step": 1338
},
{
"epoch": 0.6447962439053753,
"grad_norm": 2.297683000564575,
"learning_rate": 5.940235075738296e-05,
"loss": 0.8609,
"step": 1339
},
{
"epoch": 0.6452777944982845,
"grad_norm": 3.4080722332000732,
"learning_rate": 5.925984560127542e-05,
"loss": 1.11,
"step": 1340
},
{
"epoch": 0.6457593450911936,
"grad_norm": 5.633896350860596,
"learning_rate": 5.911743957477739e-05,
"loss": 1.1069,
"step": 1341
},
{
"epoch": 0.6462408956841028,
"grad_norm": 3.4594554901123047,
"learning_rate": 5.897513302439355e-05,
"loss": 0.5313,
"step": 1342
},
{
"epoch": 0.646722446277012,
"grad_norm": 2.593113660812378,
"learning_rate": 5.883292629638651e-05,
"loss": 0.7902,
"step": 1343
},
{
"epoch": 0.6472039968699211,
"grad_norm": 1.7572481632232666,
"learning_rate": 5.869081973677604e-05,
"loss": 0.6139,
"step": 1344
},
{
"epoch": 0.6476855474628304,
"grad_norm": 2.4023494720458984,
"learning_rate": 5.8548813691338134e-05,
"loss": 0.9859,
"step": 1345
},
{
"epoch": 0.6481670980557395,
"grad_norm": 2.5072319507598877,
"learning_rate": 5.84069085056042e-05,
"loss": 1.0116,
"step": 1346
},
{
"epoch": 0.6486486486486487,
"grad_norm": 4.375543117523193,
"learning_rate": 5.826510452486027e-05,
"loss": 0.4556,
"step": 1347
},
{
"epoch": 0.6491301992415578,
"grad_norm": 2.4621849060058594,
"learning_rate": 5.81234020941461e-05,
"loss": 0.422,
"step": 1348
},
{
"epoch": 0.649611749834467,
"grad_norm": 3.9239776134490967,
"learning_rate": 5.798180155825437e-05,
"loss": 0.8455,
"step": 1349
},
{
"epoch": 0.6500933004273761,
"grad_norm": 2.0162253379821777,
"learning_rate": 5.784030326172981e-05,
"loss": 0.9106,
"step": 1350
},
{
"epoch": 0.6505748510202853,
"grad_norm": 2.1763408184051514,
"learning_rate": 5.7698907548868395e-05,
"loss": 0.3975,
"step": 1351
},
{
"epoch": 0.6510564016131944,
"grad_norm": 1.5609049797058105,
"learning_rate": 5.755761476371653e-05,
"loss": 0.4149,
"step": 1352
},
{
"epoch": 0.6515379522061037,
"grad_norm": 2.1389899253845215,
"learning_rate": 5.741642525007003e-05,
"loss": 1.0683,
"step": 1353
},
{
"epoch": 0.6520195027990128,
"grad_norm": 2.574646472930908,
"learning_rate": 5.727533935147359e-05,
"loss": 0.6677,
"step": 1354
},
{
"epoch": 0.652501053391922,
"grad_norm": 3.0155136585235596,
"learning_rate": 5.713435741121975e-05,
"loss": 0.5586,
"step": 1355
},
{
"epoch": 0.6529826039848312,
"grad_norm": 2.190906524658203,
"learning_rate": 5.699347977234799e-05,
"loss": 0.7389,
"step": 1356
},
{
"epoch": 0.6534641545777403,
"grad_norm": 3.7313098907470703,
"learning_rate": 5.685270677764412e-05,
"loss": 0.6318,
"step": 1357
},
{
"epoch": 0.6539457051706495,
"grad_norm": 1.6453399658203125,
"learning_rate": 5.671203876963931e-05,
"loss": 0.6455,
"step": 1358
},
{
"epoch": 0.6544272557635586,
"grad_norm": 2.063249111175537,
"learning_rate": 5.657147609060924e-05,
"loss": 0.7916,
"step": 1359
},
{
"epoch": 0.6549088063564679,
"grad_norm": 2.5297508239746094,
"learning_rate": 5.643101908257333e-05,
"loss": 0.7939,
"step": 1360
},
{
"epoch": 0.655390356949377,
"grad_norm": 3.1960248947143555,
"learning_rate": 5.629066808729385e-05,
"loss": 0.4917,
"step": 1361
},
{
"epoch": 0.6558719075422862,
"grad_norm": 2.5569260120391846,
"learning_rate": 5.6150423446275144e-05,
"loss": 0.54,
"step": 1362
},
{
"epoch": 0.6563534581351953,
"grad_norm": 2.1092185974121094,
"learning_rate": 5.601028550076277e-05,
"loss": 0.5214,
"step": 1363
},
{
"epoch": 0.6568350087281045,
"grad_norm": 1.965410828590393,
"learning_rate": 5.587025459174271e-05,
"loss": 0.5952,
"step": 1364
},
{
"epoch": 0.6573165593210136,
"grad_norm": 1.7591605186462402,
"learning_rate": 5.573033105994038e-05,
"loss": 0.7113,
"step": 1365
},
{
"epoch": 0.6577981099139228,
"grad_norm": 1.566158652305603,
"learning_rate": 5.559051524582002e-05,
"loss": 0.7087,
"step": 1366
},
{
"epoch": 0.658279660506832,
"grad_norm": 2.3267264366149902,
"learning_rate": 5.5450807489583777e-05,
"loss": 0.673,
"step": 1367
},
{
"epoch": 0.6587612110997412,
"grad_norm": 2.1096274852752686,
"learning_rate": 5.531120813117085e-05,
"loss": 0.6511,
"step": 1368
},
{
"epoch": 0.6592427616926503,
"grad_norm": 2.117785692214966,
"learning_rate": 5.517171751025667e-05,
"loss": 0.6863,
"step": 1369
},
{
"epoch": 0.6597243122855595,
"grad_norm": 2.8077008724212646,
"learning_rate": 5.5032335966252103e-05,
"loss": 0.3785,
"step": 1370
},
{
"epoch": 0.6602058628784687,
"grad_norm": 3.291800022125244,
"learning_rate": 5.489306383830258e-05,
"loss": 0.4787,
"step": 1371
},
{
"epoch": 0.6606874134713778,
"grad_norm": 2.1516401767730713,
"learning_rate": 5.475390146528738e-05,
"loss": 0.6011,
"step": 1372
},
{
"epoch": 0.661168964064287,
"grad_norm": 1.5693684816360474,
"learning_rate": 5.461484918581858e-05,
"loss": 0.4216,
"step": 1373
},
{
"epoch": 0.6616505146571962,
"grad_norm": 3.2931742668151855,
"learning_rate": 5.4475907338240494e-05,
"loss": 0.4253,
"step": 1374
},
{
"epoch": 0.6621320652501054,
"grad_norm": 4.190717697143555,
"learning_rate": 5.43370762606287e-05,
"loss": 1.0326,
"step": 1375
},
{
"epoch": 0.6626136158430145,
"grad_norm": 2.5259127616882324,
"learning_rate": 5.4198356290789276e-05,
"loss": 0.738,
"step": 1376
},
{
"epoch": 0.6630951664359237,
"grad_norm": 3.3138790130615234,
"learning_rate": 5.405974776625785e-05,
"loss": 0.4473,
"step": 1377
},
{
"epoch": 0.6635767170288328,
"grad_norm": 1.9713718891143799,
"learning_rate": 5.392125102429899e-05,
"loss": 0.6931,
"step": 1378
},
{
"epoch": 0.664058267621742,
"grad_norm": 1.5703426599502563,
"learning_rate": 5.378286640190522e-05,
"loss": 0.6073,
"step": 1379
},
{
"epoch": 0.6645398182146511,
"grad_norm": 2.653319835662842,
"learning_rate": 5.364459423579629e-05,
"loss": 0.7751,
"step": 1380
},
{
"epoch": 0.6650213688075604,
"grad_norm": 1.566805124282837,
"learning_rate": 5.350643486241825e-05,
"loss": 0.4636,
"step": 1381
},
{
"epoch": 0.6655029194004695,
"grad_norm": 1.8555259704589844,
"learning_rate": 5.33683886179428e-05,
"loss": 0.4216,
"step": 1382
},
{
"epoch": 0.6659844699933787,
"grad_norm": 5.087174892425537,
"learning_rate": 5.3230455838266266e-05,
"loss": 0.5842,
"step": 1383
},
{
"epoch": 0.6664660205862879,
"grad_norm": 3.006080150604248,
"learning_rate": 5.309263685900898e-05,
"loss": 0.4825,
"step": 1384
},
{
"epoch": 0.666947571179197,
"grad_norm": 1.3487999439239502,
"learning_rate": 5.295493201551433e-05,
"loss": 0.4206,
"step": 1385
},
{
"epoch": 0.6674291217721062,
"grad_norm": 3.11458683013916,
"learning_rate": 5.281734164284802e-05,
"loss": 0.7871,
"step": 1386
},
{
"epoch": 0.6679106723650153,
"grad_norm": 2.8389225006103516,
"learning_rate": 5.26798660757971e-05,
"loss": 0.7236,
"step": 1387
},
{
"epoch": 0.6683922229579246,
"grad_norm": 2.2241663932800293,
"learning_rate": 5.2542505648869434e-05,
"loss": 0.7008,
"step": 1388
},
{
"epoch": 0.6688737735508337,
"grad_norm": 1.7703624963760376,
"learning_rate": 5.240526069629265e-05,
"loss": 0.7416,
"step": 1389
},
{
"epoch": 0.6693553241437429,
"grad_norm": 2.581017017364502,
"learning_rate": 5.22681315520134e-05,
"loss": 1.014,
"step": 1390
},
{
"epoch": 0.669836874736652,
"grad_norm": 1.8126165866851807,
"learning_rate": 5.213111854969661e-05,
"loss": 0.7268,
"step": 1391
},
{
"epoch": 0.6703184253295612,
"grad_norm": 2.5835635662078857,
"learning_rate": 5.199422202272448e-05,
"loss": 0.6623,
"step": 1392
},
{
"epoch": 0.6707999759224703,
"grad_norm": 1.6041021347045898,
"learning_rate": 5.185744230419589e-05,
"loss": 0.6665,
"step": 1393
},
{
"epoch": 0.6712815265153795,
"grad_norm": 1.2473118305206299,
"learning_rate": 5.172077972692553e-05,
"loss": 0.5992,
"step": 1394
},
{
"epoch": 0.6717630771082886,
"grad_norm": 2.415090560913086,
"learning_rate": 5.1584234623442974e-05,
"loss": 0.7947,
"step": 1395
},
{
"epoch": 0.6722446277011979,
"grad_norm": 1.156660795211792,
"learning_rate": 5.1447807325992025e-05,
"loss": 0.4969,
"step": 1396
},
{
"epoch": 0.6727261782941071,
"grad_norm": 1.7031502723693848,
"learning_rate": 5.13114981665298e-05,
"loss": 0.7214,
"step": 1397
},
{
"epoch": 0.6732077288870162,
"grad_norm": 1.3150161504745483,
"learning_rate": 5.117530747672603e-05,
"loss": 0.6689,
"step": 1398
},
{
"epoch": 0.6736892794799254,
"grad_norm": 4.105159282684326,
"learning_rate": 5.103923558796203e-05,
"loss": 0.8401,
"step": 1399
},
{
"epoch": 0.6741708300728345,
"grad_norm": 1.6519443988800049,
"learning_rate": 5.090328283133019e-05,
"loss": 0.603,
"step": 1400
},
{
"epoch": 0.6746523806657437,
"grad_norm": 2.9713187217712402,
"learning_rate": 5.0767449537632986e-05,
"loss": 0.5331,
"step": 1401
},
{
"epoch": 0.6751339312586528,
"grad_norm": 2.6898117065429688,
"learning_rate": 5.06317360373822e-05,
"loss": 0.7912,
"step": 1402
},
{
"epoch": 0.6756154818515621,
"grad_norm": 2.4432663917541504,
"learning_rate": 5.049614266079813e-05,
"loss": 0.5751,
"step": 1403
},
{
"epoch": 0.6760970324444712,
"grad_norm": 2.470055103302002,
"learning_rate": 5.036066973780882e-05,
"loss": 0.483,
"step": 1404
},
{
"epoch": 0.6765785830373804,
"grad_norm": 1.4328922033309937,
"learning_rate": 5.022531759804918e-05,
"loss": 0.6776,
"step": 1405
},
{
"epoch": 0.6770601336302895,
"grad_norm": 3.8581573963165283,
"learning_rate": 5.009008657086025e-05,
"loss": 1.1248,
"step": 1406
},
{
"epoch": 0.6775416842231987,
"grad_norm": 2.339750289916992,
"learning_rate": 4.9954976985288395e-05,
"loss": 0.644,
"step": 1407
},
{
"epoch": 0.6780232348161078,
"grad_norm": 4.777864456176758,
"learning_rate": 4.981998917008448e-05,
"loss": 0.6807,
"step": 1408
},
{
"epoch": 0.678504785409017,
"grad_norm": 1.9307043552398682,
"learning_rate": 4.9685123453703e-05,
"loss": 0.6934,
"step": 1409
},
{
"epoch": 0.6789863360019261,
"grad_norm": 2.9564454555511475,
"learning_rate": 4.955038016430149e-05,
"loss": 0.6737,
"step": 1410
},
{
"epoch": 0.6794678865948354,
"grad_norm": 2.540689468383789,
"learning_rate": 4.9415759629739455e-05,
"loss": 0.5258,
"step": 1411
},
{
"epoch": 0.6799494371877446,
"grad_norm": 1.9452462196350098,
"learning_rate": 4.928126217757782e-05,
"loss": 0.9632,
"step": 1412
},
{
"epoch": 0.6804309877806537,
"grad_norm": 2.1210763454437256,
"learning_rate": 4.914688813507797e-05,
"loss": 0.981,
"step": 1413
},
{
"epoch": 0.6809125383735629,
"grad_norm": 2.85506010055542,
"learning_rate": 4.901263782920105e-05,
"loss": 0.6188,
"step": 1414
},
{
"epoch": 0.681394088966472,
"grad_norm": 2.3669497966766357,
"learning_rate": 4.887851158660706e-05,
"loss": 0.4394,
"step": 1415
},
{
"epoch": 0.6818756395593812,
"grad_norm": 1.358422040939331,
"learning_rate": 4.8744509733654184e-05,
"loss": 0.6346,
"step": 1416
},
{
"epoch": 0.6823571901522903,
"grad_norm": 1.982836127281189,
"learning_rate": 4.861063259639793e-05,
"loss": 0.5481,
"step": 1417
},
{
"epoch": 0.6828387407451996,
"grad_norm": 2.6741561889648438,
"learning_rate": 4.847688050059033e-05,
"loss": 0.7687,
"step": 1418
},
{
"epoch": 0.6833202913381087,
"grad_norm": 2.1041994094848633,
"learning_rate": 4.8343253771679155e-05,
"loss": 0.6466,
"step": 1419
},
{
"epoch": 0.6838018419310179,
"grad_norm": 1.9289573431015015,
"learning_rate": 4.82097527348072e-05,
"loss": 0.8381,
"step": 1420
},
{
"epoch": 0.684283392523927,
"grad_norm": 1.3552838563919067,
"learning_rate": 4.8076377714811284e-05,
"loss": 0.654,
"step": 1421
},
{
"epoch": 0.6847649431168362,
"grad_norm": 2.0205588340759277,
"learning_rate": 4.7943129036221735e-05,
"loss": 0.6172,
"step": 1422
},
{
"epoch": 0.6852464937097453,
"grad_norm": 1.8615128993988037,
"learning_rate": 4.781000702326142e-05,
"loss": 0.502,
"step": 1423
},
{
"epoch": 0.6857280443026545,
"grad_norm": 3.413642406463623,
"learning_rate": 4.767701199984497e-05,
"loss": 0.7401,
"step": 1424
},
{
"epoch": 0.6862095948955638,
"grad_norm": 1.818594217300415,
"learning_rate": 4.7544144289578066e-05,
"loss": 0.3476,
"step": 1425
},
{
"epoch": 0.6866911454884729,
"grad_norm": 2.0355210304260254,
"learning_rate": 4.7411404215756594e-05,
"loss": 0.6143,
"step": 1426
},
{
"epoch": 0.6871726960813821,
"grad_norm": 2.3590846061706543,
"learning_rate": 4.7278792101365866e-05,
"loss": 0.7234,
"step": 1427
},
{
"epoch": 0.6876542466742912,
"grad_norm": 4.059364318847656,
"learning_rate": 4.714630826907985e-05,
"loss": 0.8293,
"step": 1428
},
{
"epoch": 0.6881357972672004,
"grad_norm": 5.450283050537109,
"learning_rate": 4.701395304126038e-05,
"loss": 0.9174,
"step": 1429
},
{
"epoch": 0.6886173478601095,
"grad_norm": 1.6425899267196655,
"learning_rate": 4.6881726739956375e-05,
"loss": 0.482,
"step": 1430
},
{
"epoch": 0.6890988984530187,
"grad_norm": 1.5643469095230103,
"learning_rate": 4.6749629686902984e-05,
"loss": 0.8827,
"step": 1431
},
{
"epoch": 0.6895804490459279,
"grad_norm": 4.321664333343506,
"learning_rate": 4.661766220352097e-05,
"loss": 0.5119,
"step": 1432
},
{
"epoch": 0.6900619996388371,
"grad_norm": 3.4666759967803955,
"learning_rate": 4.64858246109157e-05,
"loss": 1.1991,
"step": 1433
},
{
"epoch": 0.6905435502317462,
"grad_norm": 4.904367923736572,
"learning_rate": 4.63541172298766e-05,
"loss": 0.8087,
"step": 1434
},
{
"epoch": 0.6910251008246554,
"grad_norm": 2.414597272872925,
"learning_rate": 4.622254038087622e-05,
"loss": 0.9569,
"step": 1435
},
{
"epoch": 0.6915066514175645,
"grad_norm": 2.7692532539367676,
"learning_rate": 4.60910943840695e-05,
"loss": 0.705,
"step": 1436
},
{
"epoch": 0.6919882020104737,
"grad_norm": 1.4934767484664917,
"learning_rate": 4.5959779559292985e-05,
"loss": 0.3923,
"step": 1437
},
{
"epoch": 0.6924697526033828,
"grad_norm": 1.7524209022521973,
"learning_rate": 4.582859622606406e-05,
"loss": 0.4808,
"step": 1438
},
{
"epoch": 0.6929513031962921,
"grad_norm": 1.5376161336898804,
"learning_rate": 4.569754470358014e-05,
"loss": 0.7108,
"step": 1439
},
{
"epoch": 0.6934328537892013,
"grad_norm": 2.8887248039245605,
"learning_rate": 4.556662531071796e-05,
"loss": 0.7365,
"step": 1440
},
{
"epoch": 0.6939144043821104,
"grad_norm": 1.2393244504928589,
"learning_rate": 4.54358383660327e-05,
"loss": 0.6929,
"step": 1441
},
{
"epoch": 0.6943959549750196,
"grad_norm": 1.984318494796753,
"learning_rate": 4.530518418775733e-05,
"loss": 0.5759,
"step": 1442
},
{
"epoch": 0.6948775055679287,
"grad_norm": 3.4806067943573,
"learning_rate": 4.5174663093801674e-05,
"loss": 0.963,
"step": 1443
},
{
"epoch": 0.6953590561608379,
"grad_norm": 0.9544948935508728,
"learning_rate": 4.504427540175181e-05,
"loss": 0.4253,
"step": 1444
},
{
"epoch": 0.695840606753747,
"grad_norm": 1.5310953855514526,
"learning_rate": 4.491402142886922e-05,
"loss": 0.6396,
"step": 1445
},
{
"epoch": 0.6963221573466563,
"grad_norm": 1.5412194728851318,
"learning_rate": 4.4783901492089984e-05,
"loss": 0.8048,
"step": 1446
},
{
"epoch": 0.6968037079395654,
"grad_norm": 0.9687153100967407,
"learning_rate": 4.465391590802407e-05,
"loss": 0.3689,
"step": 1447
},
{
"epoch": 0.6972852585324746,
"grad_norm": 2.8478314876556396,
"learning_rate": 4.4524064992954516e-05,
"loss": 0.6788,
"step": 1448
},
{
"epoch": 0.6977668091253837,
"grad_norm": 4.530721187591553,
"learning_rate": 4.4394349062836736e-05,
"loss": 0.7302,
"step": 1449
},
{
"epoch": 0.6982483597182929,
"grad_norm": 2.6478817462921143,
"learning_rate": 4.4264768433297565e-05,
"loss": 0.8899,
"step": 1450
},
{
"epoch": 0.698729910311202,
"grad_norm": 2.0213184356689453,
"learning_rate": 4.4135323419634766e-05,
"loss": 0.9633,
"step": 1451
},
{
"epoch": 0.6992114609041112,
"grad_norm": 3.5041468143463135,
"learning_rate": 4.4006014336816035e-05,
"loss": 1.0225,
"step": 1452
},
{
"epoch": 0.6996930114970205,
"grad_norm": 1.1038386821746826,
"learning_rate": 4.387684149947837e-05,
"loss": 0.3247,
"step": 1453
},
{
"epoch": 0.7001745620899296,
"grad_norm": 2.288525342941284,
"learning_rate": 4.374780522192726e-05,
"loss": 1.0528,
"step": 1454
},
{
"epoch": 0.7006561126828388,
"grad_norm": 4.361599922180176,
"learning_rate": 4.3618905818135805e-05,
"loss": 0.5694,
"step": 1455
},
{
"epoch": 0.7011376632757479,
"grad_norm": 2.766280174255371,
"learning_rate": 4.349014360174417e-05,
"loss": 0.5461,
"step": 1456
},
{
"epoch": 0.7016192138686571,
"grad_norm": 2.252498149871826,
"learning_rate": 4.336151888605871e-05,
"loss": 0.6858,
"step": 1457
},
{
"epoch": 0.7021007644615662,
"grad_norm": 2.4805188179016113,
"learning_rate": 4.323303198405117e-05,
"loss": 0.9368,
"step": 1458
},
{
"epoch": 0.7025823150544754,
"grad_norm": 3.281759738922119,
"learning_rate": 4.310468320835796e-05,
"loss": 0.9059,
"step": 1459
},
{
"epoch": 0.7030638656473845,
"grad_norm": 1.8752919435501099,
"learning_rate": 4.297647287127946e-05,
"loss": 0.3884,
"step": 1460
},
{
"epoch": 0.7035454162402938,
"grad_norm": 2.1307055950164795,
"learning_rate": 4.284840128477913e-05,
"loss": 0.8951,
"step": 1461
},
{
"epoch": 0.7040269668332029,
"grad_norm": 1.1077980995178223,
"learning_rate": 4.2720468760482854e-05,
"loss": 0.5871,
"step": 1462
},
{
"epoch": 0.7045085174261121,
"grad_norm": 1.9439555406570435,
"learning_rate": 4.2592675609678135e-05,
"loss": 0.5813,
"step": 1463
},
{
"epoch": 0.7049900680190212,
"grad_norm": 2.4993746280670166,
"learning_rate": 4.24650221433134e-05,
"loss": 0.671,
"step": 1464
},
{
"epoch": 0.7054716186119304,
"grad_norm": 2.181760787963867,
"learning_rate": 4.2337508671997086e-05,
"loss": 0.4199,
"step": 1465
},
{
"epoch": 0.7059531692048396,
"grad_norm": 2.025681495666504,
"learning_rate": 4.221013550599707e-05,
"loss": 0.4202,
"step": 1466
},
{
"epoch": 0.7064347197977487,
"grad_norm": 3.3956892490386963,
"learning_rate": 4.208290295523984e-05,
"loss": 0.8027,
"step": 1467
},
{
"epoch": 0.706916270390658,
"grad_norm": 2.7422797679901123,
"learning_rate": 4.1955811329309746e-05,
"loss": 1.2046,
"step": 1468
},
{
"epoch": 0.7073978209835671,
"grad_norm": 2.5726048946380615,
"learning_rate": 4.182886093744813e-05,
"loss": 1.1736,
"step": 1469
},
{
"epoch": 0.7078793715764763,
"grad_norm": 2.2286531925201416,
"learning_rate": 4.170205208855281e-05,
"loss": 0.5392,
"step": 1470
},
{
"epoch": 0.7083609221693854,
"grad_norm": 1.7045278549194336,
"learning_rate": 4.157538509117714e-05,
"loss": 0.7592,
"step": 1471
},
{
"epoch": 0.7088424727622946,
"grad_norm": 3.2489068508148193,
"learning_rate": 4.144886025352934e-05,
"loss": 0.6095,
"step": 1472
},
{
"epoch": 0.7093240233552037,
"grad_norm": 2.089620351791382,
"learning_rate": 4.13224778834717e-05,
"loss": 0.4248,
"step": 1473
},
{
"epoch": 0.7098055739481129,
"grad_norm": 1.5423088073730469,
"learning_rate": 4.1196238288519874e-05,
"loss": 0.2669,
"step": 1474
},
{
"epoch": 0.710287124541022,
"grad_norm": 1.412553071975708,
"learning_rate": 4.107014177584211e-05,
"loss": 0.3754,
"step": 1475
},
{
"epoch": 0.7107686751339313,
"grad_norm": 1.9968864917755127,
"learning_rate": 4.094418865225853e-05,
"loss": 0.5111,
"step": 1476
},
{
"epoch": 0.7112502257268404,
"grad_norm": 2.4750101566314697,
"learning_rate": 4.081837922424027e-05,
"loss": 0.6448,
"step": 1477
},
{
"epoch": 0.7117317763197496,
"grad_norm": 2.239208221435547,
"learning_rate": 4.069271379790891e-05,
"loss": 0.4287,
"step": 1478
},
{
"epoch": 0.7122133269126587,
"grad_norm": 7.368317604064941,
"learning_rate": 4.0567192679035636e-05,
"loss": 1.0325,
"step": 1479
},
{
"epoch": 0.7126948775055679,
"grad_norm": 1.8279064893722534,
"learning_rate": 4.044181617304048e-05,
"loss": 0.2988,
"step": 1480
},
{
"epoch": 0.7131764280984771,
"grad_norm": 1.781420111656189,
"learning_rate": 4.03165845849916e-05,
"loss": 0.5195,
"step": 1481
},
{
"epoch": 0.7136579786913863,
"grad_norm": 4.007209777832031,
"learning_rate": 4.019149821960455e-05,
"loss": 0.6838,
"step": 1482
},
{
"epoch": 0.7141395292842955,
"grad_norm": 2.4389944076538086,
"learning_rate": 4.006655738124152e-05,
"loss": 0.773,
"step": 1483
},
{
"epoch": 0.7146210798772046,
"grad_norm": 1.686699628829956,
"learning_rate": 3.9941762373910586e-05,
"loss": 0.3119,
"step": 1484
},
{
"epoch": 0.7151026304701138,
"grad_norm": 3.3148396015167236,
"learning_rate": 3.9817113501265016e-05,
"loss": 0.5543,
"step": 1485
},
{
"epoch": 0.7155841810630229,
"grad_norm": 1.4443938732147217,
"learning_rate": 3.9692611066602516e-05,
"loss": 0.6294,
"step": 1486
},
{
"epoch": 0.7160657316559321,
"grad_norm": 1.5491441488265991,
"learning_rate": 3.956825537286436e-05,
"loss": 0.312,
"step": 1487
},
{
"epoch": 0.7165472822488412,
"grad_norm": 2.059058427810669,
"learning_rate": 3.944404672263494e-05,
"loss": 0.7961,
"step": 1488
},
{
"epoch": 0.7170288328417505,
"grad_norm": 2.7234976291656494,
"learning_rate": 3.931998541814069e-05,
"loss": 0.7906,
"step": 1489
},
{
"epoch": 0.7175103834346596,
"grad_norm": 3.2691609859466553,
"learning_rate": 3.919607176124966e-05,
"loss": 0.9895,
"step": 1490
},
{
"epoch": 0.7179919340275688,
"grad_norm": 4.118697166442871,
"learning_rate": 3.9072306053470566e-05,
"loss": 0.5686,
"step": 1491
},
{
"epoch": 0.7184734846204779,
"grad_norm": 0.7744289040565491,
"learning_rate": 3.8948688595952164e-05,
"loss": 0.3498,
"step": 1492
},
{
"epoch": 0.7189550352133871,
"grad_norm": 2.5896894931793213,
"learning_rate": 3.882521968948246e-05,
"loss": 0.9491,
"step": 1493
},
{
"epoch": 0.7194365858062963,
"grad_norm": 3.2697811126708984,
"learning_rate": 3.8701899634488014e-05,
"loss": 0.6669,
"step": 1494
},
{
"epoch": 0.7199181363992054,
"grad_norm": 1.5220059156417847,
"learning_rate": 3.857872873103322e-05,
"loss": 0.5544,
"step": 1495
},
{
"epoch": 0.7203996869921147,
"grad_norm": 2.07112717628479,
"learning_rate": 3.8455707278819507e-05,
"loss": 0.5567,
"step": 1496
},
{
"epoch": 0.7208812375850238,
"grad_norm": 2.2222588062286377,
"learning_rate": 3.833283557718471e-05,
"loss": 0.4158,
"step": 1497
},
{
"epoch": 0.721362788177933,
"grad_norm": 2.1287026405334473,
"learning_rate": 3.821011392510228e-05,
"loss": 0.4907,
"step": 1498
},
{
"epoch": 0.7218443387708421,
"grad_norm": 3.9196484088897705,
"learning_rate": 3.808754262118046e-05,
"loss": 0.3054,
"step": 1499
},
{
"epoch": 0.7223258893637513,
"grad_norm": 1.4229110479354858,
"learning_rate": 3.796512196366182e-05,
"loss": 0.7671,
"step": 1500
},
{
"epoch": 0.7228074399566604,
"grad_norm": 1.968087911605835,
"learning_rate": 3.784285225042229e-05,
"loss": 0.9449,
"step": 1501
},
{
"epoch": 0.7232889905495696,
"grad_norm": 3.0822694301605225,
"learning_rate": 3.772073377897052e-05,
"loss": 0.8556,
"step": 1502
},
{
"epoch": 0.7237705411424787,
"grad_norm": 2.186847686767578,
"learning_rate": 3.7598766846447184e-05,
"loss": 0.364,
"step": 1503
},
{
"epoch": 0.724252091735388,
"grad_norm": 2.3544540405273438,
"learning_rate": 3.747695174962423e-05,
"loss": 0.9461,
"step": 1504
},
{
"epoch": 0.7247336423282971,
"grad_norm": 2.437185525894165,
"learning_rate": 3.7355288784904116e-05,
"loss": 0.6636,
"step": 1505
},
{
"epoch": 0.7252151929212063,
"grad_norm": 4.94728422164917,
"learning_rate": 3.7233778248319176e-05,
"loss": 0.4558,
"step": 1506
},
{
"epoch": 0.7256967435141155,
"grad_norm": 1.2178354263305664,
"learning_rate": 3.7112420435530845e-05,
"loss": 0.4775,
"step": 1507
},
{
"epoch": 0.7261782941070246,
"grad_norm": 2.4430301189422607,
"learning_rate": 3.69912156418289e-05,
"loss": 0.385,
"step": 1508
},
{
"epoch": 0.7266598446999338,
"grad_norm": 1.444451093673706,
"learning_rate": 3.687016416213084e-05,
"loss": 0.5812,
"step": 1509
},
{
"epoch": 0.7271413952928429,
"grad_norm": 1.2562270164489746,
"learning_rate": 3.674926629098113e-05,
"loss": 0.2545,
"step": 1510
},
{
"epoch": 0.7276229458857522,
"grad_norm": 3.6411561965942383,
"learning_rate": 3.6628522322550394e-05,
"loss": 0.4228,
"step": 1511
},
{
"epoch": 0.7281044964786613,
"grad_norm": 2.335496425628662,
"learning_rate": 3.6507932550634846e-05,
"loss": 0.4863,
"step": 1512
},
{
"epoch": 0.7285860470715705,
"grad_norm": 2.558220386505127,
"learning_rate": 3.638749726865552e-05,
"loss": 0.31,
"step": 1513
},
{
"epoch": 0.7290675976644796,
"grad_norm": 1.8757025003433228,
"learning_rate": 3.6267216769657485e-05,
"loss": 0.722,
"step": 1514
},
{
"epoch": 0.7295491482573888,
"grad_norm": 1.8420372009277344,
"learning_rate": 3.6147091346309224e-05,
"loss": 0.7818,
"step": 1515
},
{
"epoch": 0.7300306988502979,
"grad_norm": 3.078178644180298,
"learning_rate": 3.602712129090189e-05,
"loss": 0.5627,
"step": 1516
},
{
"epoch": 0.7305122494432071,
"grad_norm": 1.5147814750671387,
"learning_rate": 3.590730689534857e-05,
"loss": 0.6291,
"step": 1517
},
{
"epoch": 0.7309938000361162,
"grad_norm": 1.8767939805984497,
"learning_rate": 3.578764845118362e-05,
"loss": 0.3796,
"step": 1518
},
{
"epoch": 0.7314753506290255,
"grad_norm": 3.034921646118164,
"learning_rate": 3.566814624956194e-05,
"loss": 0.5662,
"step": 1519
},
{
"epoch": 0.7319569012219346,
"grad_norm": 0.8887065649032593,
"learning_rate": 3.554880058125819e-05,
"loss": 0.4554,
"step": 1520
},
{
"epoch": 0.7324384518148438,
"grad_norm": 3.390536308288574,
"learning_rate": 3.5429611736666235e-05,
"loss": 0.543,
"step": 1521
},
{
"epoch": 0.732920002407753,
"grad_norm": 1.751465082168579,
"learning_rate": 3.53105800057983e-05,
"loss": 0.4838,
"step": 1522
},
{
"epoch": 0.7334015530006621,
"grad_norm": 1.9569932222366333,
"learning_rate": 3.519170567828435e-05,
"loss": 0.557,
"step": 1523
},
{
"epoch": 0.7338831035935713,
"grad_norm": 1.79653799533844,
"learning_rate": 3.507298904337134e-05,
"loss": 0.7246,
"step": 1524
},
{
"epoch": 0.7343646541864804,
"grad_norm": 1.8159929513931274,
"learning_rate": 3.495443038992253e-05,
"loss": 0.3555,
"step": 1525
},
{
"epoch": 0.7348462047793897,
"grad_norm": 3.199507474899292,
"learning_rate": 3.4836030006416775e-05,
"loss": 0.5046,
"step": 1526
},
{
"epoch": 0.7353277553722988,
"grad_norm": 2.1682991981506348,
"learning_rate": 3.471778818094785e-05,
"loss": 0.7456,
"step": 1527
},
{
"epoch": 0.735809305965208,
"grad_norm": 1.0989952087402344,
"learning_rate": 3.459970520122364e-05,
"loss": 0.5804,
"step": 1528
},
{
"epoch": 0.7362908565581171,
"grad_norm": 1.0416721105575562,
"learning_rate": 3.44817813545656e-05,
"loss": 0.3529,
"step": 1529
},
{
"epoch": 0.7367724071510263,
"grad_norm": 4.703090667724609,
"learning_rate": 3.4364016927907974e-05,
"loss": 0.5961,
"step": 1530
},
{
"epoch": 0.7372539577439354,
"grad_norm": 2.4348576068878174,
"learning_rate": 3.424641220779711e-05,
"loss": 1.0376,
"step": 1531
},
{
"epoch": 0.7377355083368446,
"grad_norm": 1.9386546611785889,
"learning_rate": 3.412896748039067e-05,
"loss": 0.5634,
"step": 1532
},
{
"epoch": 0.7382170589297538,
"grad_norm": 1.232994556427002,
"learning_rate": 3.401168303145713e-05,
"loss": 0.4579,
"step": 1533
},
{
"epoch": 0.738698609522663,
"grad_norm": 1.5528676509857178,
"learning_rate": 3.3894559146374924e-05,
"loss": 0.4419,
"step": 1534
},
{
"epoch": 0.7391801601155722,
"grad_norm": 2.1310155391693115,
"learning_rate": 3.37775961101318e-05,
"loss": 0.5981,
"step": 1535
},
{
"epoch": 0.7396617107084813,
"grad_norm": 3.4211032390594482,
"learning_rate": 3.366079420732413e-05,
"loss": 1.0065,
"step": 1536
},
{
"epoch": 0.7401432613013905,
"grad_norm": 2.1246776580810547,
"learning_rate": 3.3544153722156216e-05,
"loss": 0.4723,
"step": 1537
},
{
"epoch": 0.7406248118942996,
"grad_norm": 3.7902305126190186,
"learning_rate": 3.3427674938439594e-05,
"loss": 0.7686,
"step": 1538
},
{
"epoch": 0.7411063624872088,
"grad_norm": 1.1903222799301147,
"learning_rate": 3.3311358139592317e-05,
"loss": 0.425,
"step": 1539
},
{
"epoch": 0.741587913080118,
"grad_norm": 2.2395949363708496,
"learning_rate": 3.319520360863837e-05,
"loss": 0.511,
"step": 1540
},
{
"epoch": 0.7420694636730272,
"grad_norm": 3.4029898643493652,
"learning_rate": 3.3079211628206854e-05,
"loss": 0.3296,
"step": 1541
},
{
"epoch": 0.7425510142659363,
"grad_norm": 2.042520523071289,
"learning_rate": 3.296338248053129e-05,
"loss": 0.3447,
"step": 1542
},
{
"epoch": 0.7430325648588455,
"grad_norm": 1.8394999504089355,
"learning_rate": 3.2847716447449096e-05,
"loss": 0.8341,
"step": 1543
},
{
"epoch": 0.7435141154517546,
"grad_norm": 1.8122001886367798,
"learning_rate": 3.2732213810400745e-05,
"loss": 0.5026,
"step": 1544
},
{
"epoch": 0.7439956660446638,
"grad_norm": 1.094519853591919,
"learning_rate": 3.261687485042915e-05,
"loss": 0.3282,
"step": 1545
},
{
"epoch": 0.7444772166375729,
"grad_norm": 1.8451745510101318,
"learning_rate": 3.250169984817897e-05,
"loss": 0.598,
"step": 1546
},
{
"epoch": 0.7449587672304822,
"grad_norm": 2.1285202503204346,
"learning_rate": 3.238668908389586e-05,
"loss": 0.5509,
"step": 1547
},
{
"epoch": 0.7454403178233914,
"grad_norm": 5.398340702056885,
"learning_rate": 3.227184283742591e-05,
"loss": 0.8054,
"step": 1548
},
{
"epoch": 0.7459218684163005,
"grad_norm": 4.164507865905762,
"learning_rate": 3.215716138821488e-05,
"loss": 0.6616,
"step": 1549
},
{
"epoch": 0.7464034190092097,
"grad_norm": 2.449247360229492,
"learning_rate": 3.204264501530756e-05,
"loss": 0.5928,
"step": 1550
},
{
"epoch": 0.7468849696021188,
"grad_norm": 2.73966121673584,
"learning_rate": 3.192829399734706e-05,
"loss": 0.8957,
"step": 1551
},
{
"epoch": 0.747366520195028,
"grad_norm": 1.7563470602035522,
"learning_rate": 3.181410861257413e-05,
"loss": 0.7951,
"step": 1552
},
{
"epoch": 0.7478480707879371,
"grad_norm": 2.797558307647705,
"learning_rate": 3.170008913882656e-05,
"loss": 0.4921,
"step": 1553
},
{
"epoch": 0.7483296213808464,
"grad_norm": 0.7847899794578552,
"learning_rate": 3.1586235853538325e-05,
"loss": 0.6741,
"step": 1554
},
{
"epoch": 0.7488111719737555,
"grad_norm": 3.5709075927734375,
"learning_rate": 3.1472549033739126e-05,
"loss": 0.3847,
"step": 1555
},
{
"epoch": 0.7492927225666647,
"grad_norm": 2.81365704536438,
"learning_rate": 3.1359028956053615e-05,
"loss": 0.4534,
"step": 1556
},
{
"epoch": 0.7497742731595738,
"grad_norm": 2.762085199356079,
"learning_rate": 3.1245675896700685e-05,
"loss": 1.0397,
"step": 1557
},
{
"epoch": 0.750255823752483,
"grad_norm": 1.423474907875061,
"learning_rate": 3.113249013149284e-05,
"loss": 0.3242,
"step": 1558
},
{
"epoch": 0.7507373743453921,
"grad_norm": 2.8285672664642334,
"learning_rate": 3.101947193583557e-05,
"loss": 0.7633,
"step": 1559
},
{
"epoch": 0.7512189249383013,
"grad_norm": 1.8334670066833496,
"learning_rate": 3.0906621584726546e-05,
"loss": 0.9668,
"step": 1560
},
{
"epoch": 0.7517004755312104,
"grad_norm": 4.472631454467773,
"learning_rate": 3.079393935275513e-05,
"loss": 0.752,
"step": 1561
},
{
"epoch": 0.7521820261241197,
"grad_norm": 1.2868741750717163,
"learning_rate": 3.068142551410155e-05,
"loss": 0.2786,
"step": 1562
},
{
"epoch": 0.7526635767170289,
"grad_norm": 1.9108686447143555,
"learning_rate": 3.0569080342536347e-05,
"loss": 0.4188,
"step": 1563
},
{
"epoch": 0.753145127309938,
"grad_norm": 3.5402438640594482,
"learning_rate": 3.0456904111419572e-05,
"loss": 0.7682,
"step": 1564
},
{
"epoch": 0.7536266779028472,
"grad_norm": 2.0236620903015137,
"learning_rate": 3.034489709370033e-05,
"loss": 0.4578,
"step": 1565
},
{
"epoch": 0.7541082284957563,
"grad_norm": 1.6213475465774536,
"learning_rate": 3.0233059561915855e-05,
"loss": 0.625,
"step": 1566
},
{
"epoch": 0.7545897790886655,
"grad_norm": 2.4623217582702637,
"learning_rate": 3.01213917881911e-05,
"loss": 0.6995,
"step": 1567
},
{
"epoch": 0.7550713296815746,
"grad_norm": 7.693182468414307,
"learning_rate": 3.0009894044237907e-05,
"loss": 0.5778,
"step": 1568
},
{
"epoch": 0.7555528802744839,
"grad_norm": 1.2127306461334229,
"learning_rate": 2.9898566601354418e-05,
"loss": 0.7593,
"step": 1569
},
{
"epoch": 0.756034430867393,
"grad_norm": 2.333127021789551,
"learning_rate": 2.9787409730424374e-05,
"loss": 0.5861,
"step": 1570
},
{
"epoch": 0.7565159814603022,
"grad_norm": 2.391322612762451,
"learning_rate": 2.96764237019165e-05,
"loss": 0.6207,
"step": 1571
},
{
"epoch": 0.7569975320532113,
"grad_norm": 2.2498581409454346,
"learning_rate": 2.9565608785883815e-05,
"loss": 0.7908,
"step": 1572
},
{
"epoch": 0.7574790826461205,
"grad_norm": 1.9790698289871216,
"learning_rate": 2.9454965251962973e-05,
"loss": 0.9975,
"step": 1573
},
{
"epoch": 0.7579606332390296,
"grad_norm": 1.974587321281433,
"learning_rate": 2.9344493369373637e-05,
"loss": 0.4825,
"step": 1574
},
{
"epoch": 0.7584421838319388,
"grad_norm": 4.309758186340332,
"learning_rate": 2.9234193406917833e-05,
"loss": 0.6722,
"step": 1575
},
{
"epoch": 0.7589237344248481,
"grad_norm": 8.978368759155273,
"learning_rate": 2.912406563297916e-05,
"loss": 1.2436,
"step": 1576
},
{
"epoch": 0.7594052850177572,
"grad_norm": 2.3713653087615967,
"learning_rate": 2.901411031552236e-05,
"loss": 0.6931,
"step": 1577
},
{
"epoch": 0.7598868356106664,
"grad_norm": 2.390103340148926,
"learning_rate": 2.8904327722092495e-05,
"loss": 0.7875,
"step": 1578
},
{
"epoch": 0.7603683862035755,
"grad_norm": 2.7292089462280273,
"learning_rate": 2.879471811981437e-05,
"loss": 0.9226,
"step": 1579
},
{
"epoch": 0.7608499367964847,
"grad_norm": 1.6569634675979614,
"learning_rate": 2.868528177539187e-05,
"loss": 0.9006,
"step": 1580
},
{
"epoch": 0.7613314873893938,
"grad_norm": 2.9300882816314697,
"learning_rate": 2.8576018955107285e-05,
"loss": 0.7848,
"step": 1581
},
{
"epoch": 0.761813037982303,
"grad_norm": 3.6729085445404053,
"learning_rate": 2.8466929924820705e-05,
"loss": 0.9429,
"step": 1582
},
{
"epoch": 0.7622945885752122,
"grad_norm": 2.4138500690460205,
"learning_rate": 2.8358014949969334e-05,
"loss": 0.8423,
"step": 1583
},
{
"epoch": 0.7627761391681214,
"grad_norm": 2.518306255340576,
"learning_rate": 2.8249274295566864e-05,
"loss": 0.5066,
"step": 1584
},
{
"epoch": 0.7632576897610305,
"grad_norm": 2.244164228439331,
"learning_rate": 2.8140708226202884e-05,
"loss": 0.5005,
"step": 1585
},
{
"epoch": 0.7637392403539397,
"grad_norm": 2.3682243824005127,
"learning_rate": 2.803231700604204e-05,
"loss": 0.5431,
"step": 1586
},
{
"epoch": 0.7642207909468488,
"grad_norm": 2.6888134479522705,
"learning_rate": 2.7924100898823702e-05,
"loss": 0.6596,
"step": 1587
},
{
"epoch": 0.764702341539758,
"grad_norm": 2.3632571697235107,
"learning_rate": 2.7816060167861002e-05,
"loss": 0.6924,
"step": 1588
},
{
"epoch": 0.7651838921326672,
"grad_norm": 1.6576805114746094,
"learning_rate": 2.7708195076040445e-05,
"loss": 0.5694,
"step": 1589
},
{
"epoch": 0.7656654427255764,
"grad_norm": 1.40910005569458,
"learning_rate": 2.760050588582114e-05,
"loss": 0.6316,
"step": 1590
},
{
"epoch": 0.7661469933184856,
"grad_norm": 4.426036834716797,
"learning_rate": 2.749299285923417e-05,
"loss": 0.7936,
"step": 1591
},
{
"epoch": 0.7666285439113947,
"grad_norm": 2.89945387840271,
"learning_rate": 2.7385656257881997e-05,
"loss": 0.4305,
"step": 1592
},
{
"epoch": 0.7671100945043039,
"grad_norm": 7.421449184417725,
"learning_rate": 2.7278496342937788e-05,
"loss": 0.4538,
"step": 1593
},
{
"epoch": 0.767591645097213,
"grad_norm": 1.8523188829421997,
"learning_rate": 2.717151337514482e-05,
"loss": 0.4911,
"step": 1594
},
{
"epoch": 0.7680731956901222,
"grad_norm": 1.7590643167495728,
"learning_rate": 2.7064707614815776e-05,
"loss": 0.5798,
"step": 1595
},
{
"epoch": 0.7685547462830313,
"grad_norm": 2.223667860031128,
"learning_rate": 2.6958079321832185e-05,
"loss": 0.4897,
"step": 1596
},
{
"epoch": 0.7690362968759406,
"grad_norm": 2.7905259132385254,
"learning_rate": 2.6851628755643776e-05,
"loss": 0.5537,
"step": 1597
},
{
"epoch": 0.7695178474688497,
"grad_norm": 4.461797714233398,
"learning_rate": 2.6745356175267765e-05,
"loss": 0.4501,
"step": 1598
},
{
"epoch": 0.7699993980617589,
"grad_norm": 4.621416091918945,
"learning_rate": 2.6639261839288343e-05,
"loss": 0.5464,
"step": 1599
},
{
"epoch": 0.770480948654668,
"grad_norm": 6.821358680725098,
"learning_rate": 2.6533346005855987e-05,
"loss": 0.824,
"step": 1600
},
{
"epoch": 0.7709624992475772,
"grad_norm": 1.811643123626709,
"learning_rate": 2.6427608932686843e-05,
"loss": 0.43,
"step": 1601
},
{
"epoch": 0.7714440498404863,
"grad_norm": 4.649533271789551,
"learning_rate": 2.6322050877062064e-05,
"loss": 0.7568,
"step": 1602
},
{
"epoch": 0.7719256004333955,
"grad_norm": 4.929988861083984,
"learning_rate": 2.6216672095827266e-05,
"loss": 0.8517,
"step": 1603
},
{
"epoch": 0.7724071510263048,
"grad_norm": 1.7828702926635742,
"learning_rate": 2.6111472845391827e-05,
"loss": 0.5364,
"step": 1604
},
{
"epoch": 0.7728887016192139,
"grad_norm": 0.7335327863693237,
"learning_rate": 2.6006453381728236e-05,
"loss": 0.306,
"step": 1605
},
{
"epoch": 0.7733702522121231,
"grad_norm": 2.5753631591796875,
"learning_rate": 2.5901613960371585e-05,
"loss": 0.3027,
"step": 1606
},
{
"epoch": 0.7738518028050322,
"grad_norm": 2.3606786727905273,
"learning_rate": 2.5796954836418884e-05,
"loss": 0.6378,
"step": 1607
},
{
"epoch": 0.7743333533979414,
"grad_norm": 2.3741252422332764,
"learning_rate": 2.569247626452842e-05,
"loss": 0.7027,
"step": 1608
},
{
"epoch": 0.7748149039908505,
"grad_norm": 3.609069347381592,
"learning_rate": 2.558817849891918e-05,
"loss": 0.5579,
"step": 1609
},
{
"epoch": 0.7752964545837597,
"grad_norm": 3.4135854244232178,
"learning_rate": 2.548406179337015e-05,
"loss": 0.6868,
"step": 1610
},
{
"epoch": 0.7757780051766688,
"grad_norm": 1.7354921102523804,
"learning_rate": 2.5380126401219807e-05,
"loss": 0.4444,
"step": 1611
},
{
"epoch": 0.7762595557695781,
"grad_norm": 3.9232022762298584,
"learning_rate": 2.527637257536547e-05,
"loss": 0.5952,
"step": 1612
},
{
"epoch": 0.7767411063624872,
"grad_norm": 3.5635814666748047,
"learning_rate": 2.517280056826262e-05,
"loss": 0.6536,
"step": 1613
},
{
"epoch": 0.7772226569553964,
"grad_norm": 2.9686269760131836,
"learning_rate": 2.5069410631924385e-05,
"loss": 0.8749,
"step": 1614
},
{
"epoch": 0.7777042075483055,
"grad_norm": 3.6469061374664307,
"learning_rate": 2.4966203017920818e-05,
"loss": 0.6617,
"step": 1615
},
{
"epoch": 0.7781857581412147,
"grad_norm": 2.2301876544952393,
"learning_rate": 2.4863177977378392e-05,
"loss": 0.5759,
"step": 1616
},
{
"epoch": 0.7786673087341239,
"grad_norm": 1.627908706665039,
"learning_rate": 2.4760335760979312e-05,
"loss": 0.6987,
"step": 1617
},
{
"epoch": 0.779148859327033,
"grad_norm": 3.955803632736206,
"learning_rate": 2.4657676618960944e-05,
"loss": 0.8698,
"step": 1618
},
{
"epoch": 0.7796304099199423,
"grad_norm": 2.231527328491211,
"learning_rate": 2.455520080111522e-05,
"loss": 0.996,
"step": 1619
},
{
"epoch": 0.7801119605128514,
"grad_norm": 3.9988443851470947,
"learning_rate": 2.4452908556787912e-05,
"loss": 0.771,
"step": 1620
},
{
"epoch": 0.7805935111057606,
"grad_norm": 2.5822057723999023,
"learning_rate": 2.4350800134878203e-05,
"loss": 0.6595,
"step": 1621
},
{
"epoch": 0.7810750616986697,
"grad_norm": 2.1049022674560547,
"learning_rate": 2.4248875783837987e-05,
"loss": 0.2905,
"step": 1622
},
{
"epoch": 0.7815566122915789,
"grad_norm": 3.057828426361084,
"learning_rate": 2.414713575167129e-05,
"loss": 0.455,
"step": 1623
},
{
"epoch": 0.782038162884488,
"grad_norm": 3.8854563236236572,
"learning_rate": 2.4045580285933557e-05,
"loss": 0.5088,
"step": 1624
},
{
"epoch": 0.7825197134773972,
"grad_norm": 3.774930715560913,
"learning_rate": 2.3944209633731242e-05,
"loss": 0.828,
"step": 1625
},
{
"epoch": 0.7830012640703063,
"grad_norm": 2.168914794921875,
"learning_rate": 2.3843024041721053e-05,
"loss": 0.499,
"step": 1626
},
{
"epoch": 0.7834828146632156,
"grad_norm": 4.615421772003174,
"learning_rate": 2.3742023756109456e-05,
"loss": 0.579,
"step": 1627
},
{
"epoch": 0.7839643652561247,
"grad_norm": 2.246866226196289,
"learning_rate": 2.3641209022651976e-05,
"loss": 0.7131,
"step": 1628
},
{
"epoch": 0.7844459158490339,
"grad_norm": 2.1677560806274414,
"learning_rate": 2.3540580086652675e-05,
"loss": 0.7653,
"step": 1629
},
{
"epoch": 0.7849274664419431,
"grad_norm": 2.1894803047180176,
"learning_rate": 2.344013719296353e-05,
"loss": 0.5755,
"step": 1630
},
{
"epoch": 0.7854090170348522,
"grad_norm": 1.5638082027435303,
"learning_rate": 2.3339880585983842e-05,
"loss": 0.8477,
"step": 1631
},
{
"epoch": 0.7858905676277614,
"grad_norm": 0.8193626999855042,
"learning_rate": 2.3239810509659597e-05,
"loss": 0.5981,
"step": 1632
},
{
"epoch": 0.7863721182206705,
"grad_norm": 2.181163787841797,
"learning_rate": 2.313992720748295e-05,
"loss": 0.6006,
"step": 1633
},
{
"epoch": 0.7868536688135798,
"grad_norm": 2.4740288257598877,
"learning_rate": 2.304023092249159e-05,
"loss": 0.7376,
"step": 1634
},
{
"epoch": 0.7873352194064889,
"grad_norm": 3.2138454914093018,
"learning_rate": 2.2940721897268136e-05,
"loss": 1.0772,
"step": 1635
},
{
"epoch": 0.7878167699993981,
"grad_norm": 3.062891960144043,
"learning_rate": 2.2841400373939592e-05,
"loss": 0.9387,
"step": 1636
},
{
"epoch": 0.7882983205923072,
"grad_norm": 1.856158971786499,
"learning_rate": 2.274226659417671e-05,
"loss": 0.891,
"step": 1637
},
{
"epoch": 0.7887798711852164,
"grad_norm": 1.4928967952728271,
"learning_rate": 2.2643320799193402e-05,
"loss": 0.3832,
"step": 1638
},
{
"epoch": 0.7892614217781255,
"grad_norm": 2.0665640830993652,
"learning_rate": 2.2544563229746218e-05,
"loss": 0.5602,
"step": 1639
},
{
"epoch": 0.7897429723710347,
"grad_norm": 2.119544506072998,
"learning_rate": 2.2445994126133708e-05,
"loss": 0.8366,
"step": 1640
},
{
"epoch": 0.7902245229639439,
"grad_norm": 1.5489791631698608,
"learning_rate": 2.234761372819577e-05,
"loss": 0.5582,
"step": 1641
},
{
"epoch": 0.7907060735568531,
"grad_norm": 1.4361308813095093,
"learning_rate": 2.2249422275313214e-05,
"loss": 0.3052,
"step": 1642
},
{
"epoch": 0.7911876241497622,
"grad_norm": 2.6924610137939453,
"learning_rate": 2.215142000640714e-05,
"loss": 0.4776,
"step": 1643
},
{
"epoch": 0.7916691747426714,
"grad_norm": 2.889003038406372,
"learning_rate": 2.2053607159938195e-05,
"loss": 0.5702,
"step": 1644
},
{
"epoch": 0.7921507253355806,
"grad_norm": 3.6985273361206055,
"learning_rate": 2.1955983973906236e-05,
"loss": 0.6528,
"step": 1645
},
{
"epoch": 0.7926322759284897,
"grad_norm": 1.6268264055252075,
"learning_rate": 2.1858550685849578e-05,
"loss": 0.7206,
"step": 1646
},
{
"epoch": 0.793113826521399,
"grad_norm": 2.64440655708313,
"learning_rate": 2.17613075328445e-05,
"loss": 0.742,
"step": 1647
},
{
"epoch": 0.793595377114308,
"grad_norm": 2.0996243953704834,
"learning_rate": 2.1664254751504642e-05,
"loss": 0.6661,
"step": 1648
},
{
"epoch": 0.7940769277072173,
"grad_norm": 1.764198660850525,
"learning_rate": 2.1567392577980393e-05,
"loss": 0.3963,
"step": 1649
},
{
"epoch": 0.7945584783001264,
"grad_norm": 2.0087742805480957,
"learning_rate": 2.1470721247958404e-05,
"loss": 0.906,
"step": 1650
},
{
"epoch": 0.7950400288930356,
"grad_norm": 2.0296401977539062,
"learning_rate": 2.137424099666091e-05,
"loss": 0.7582,
"step": 1651
},
{
"epoch": 0.7955215794859447,
"grad_norm": 4.00960636138916,
"learning_rate": 2.1277952058845284e-05,
"loss": 0.5171,
"step": 1652
},
{
"epoch": 0.7960031300788539,
"grad_norm": 1.5805654525756836,
"learning_rate": 2.118185466880327e-05,
"loss": 0.867,
"step": 1653
},
{
"epoch": 0.796484680671763,
"grad_norm": 1.1289174556732178,
"learning_rate": 2.1085949060360654e-05,
"loss": 0.7591,
"step": 1654
},
{
"epoch": 0.7969662312646723,
"grad_norm": 2.465733528137207,
"learning_rate": 2.0990235466876517e-05,
"loss": 0.7738,
"step": 1655
},
{
"epoch": 0.7974477818575814,
"grad_norm": 1.9576550722122192,
"learning_rate": 2.089471412124274e-05,
"loss": 0.5989,
"step": 1656
},
{
"epoch": 0.7979293324504906,
"grad_norm": 2.245087146759033,
"learning_rate": 2.079938525588342e-05,
"loss": 0.6204,
"step": 1657
},
{
"epoch": 0.7984108830433998,
"grad_norm": 2.1866295337677,
"learning_rate": 2.0704249102754324e-05,
"loss": 0.986,
"step": 1658
},
{
"epoch": 0.7988924336363089,
"grad_norm": 2.3913660049438477,
"learning_rate": 2.0609305893342278e-05,
"loss": 0.4221,
"step": 1659
},
{
"epoch": 0.7993739842292181,
"grad_norm": 2.4920084476470947,
"learning_rate": 2.0514555858664663e-05,
"loss": 0.7775,
"step": 1660
},
{
"epoch": 0.7998555348221272,
"grad_norm": 1.5726318359375,
"learning_rate": 2.0419999229268805e-05,
"loss": 0.3526,
"step": 1661
},
{
"epoch": 0.8003370854150365,
"grad_norm": 3.00834584236145,
"learning_rate": 2.032563623523147e-05,
"loss": 0.7506,
"step": 1662
},
{
"epoch": 0.8008186360079456,
"grad_norm": 2.3384523391723633,
"learning_rate": 2.0231467106158186e-05,
"loss": 0.5321,
"step": 1663
},
{
"epoch": 0.8013001866008548,
"grad_norm": 2.5204219818115234,
"learning_rate": 2.0137492071182863e-05,
"loss": 0.8753,
"step": 1664
},
{
"epoch": 0.8017817371937639,
"grad_norm": 1.7203079462051392,
"learning_rate": 2.0043711358967043e-05,
"loss": 1.6074,
"step": 1665
},
{
"epoch": 0.8022632877866731,
"grad_norm": 3.3237717151641846,
"learning_rate": 1.9950125197699508e-05,
"loss": 0.5971,
"step": 1666
},
{
"epoch": 0.8027448383795822,
"grad_norm": 3.3281543254852295,
"learning_rate": 1.985673381509565e-05,
"loss": 0.6587,
"step": 1667
},
{
"epoch": 0.8032263889724914,
"grad_norm": 3.3492562770843506,
"learning_rate": 1.9763537438396894e-05,
"loss": 0.9115,
"step": 1668
},
{
"epoch": 0.8037079395654005,
"grad_norm": 1.2501789331436157,
"learning_rate": 1.96705362943702e-05,
"loss": 0.4328,
"step": 1669
},
{
"epoch": 0.8041894901583098,
"grad_norm": 1.199196219444275,
"learning_rate": 1.9577730609307454e-05,
"loss": 0.283,
"step": 1670
},
{
"epoch": 0.804671040751219,
"grad_norm": 2.56499981880188,
"learning_rate": 1.9485120609024975e-05,
"loss": 0.6122,
"step": 1671
},
{
"epoch": 0.8051525913441281,
"grad_norm": 2.273665189743042,
"learning_rate": 1.9392706518862935e-05,
"loss": 0.9137,
"step": 1672
},
{
"epoch": 0.8056341419370373,
"grad_norm": 2.3677797317504883,
"learning_rate": 1.9300488563684804e-05,
"loss": 0.398,
"step": 1673
},
{
"epoch": 0.8061156925299464,
"grad_norm": 1.3238352537155151,
"learning_rate": 1.920846696787684e-05,
"loss": 0.8935,
"step": 1674
},
{
"epoch": 0.8065972431228556,
"grad_norm": 2.1932382583618164,
"learning_rate": 1.9116641955347446e-05,
"loss": 0.5614,
"step": 1675
},
{
"epoch": 0.8070787937157647,
"grad_norm": 1.0041841268539429,
"learning_rate": 1.9025013749526767e-05,
"loss": 0.5811,
"step": 1676
},
{
"epoch": 0.807560344308674,
"grad_norm": 3.299774169921875,
"learning_rate": 1.8933582573366036e-05,
"loss": 0.8145,
"step": 1677
},
{
"epoch": 0.8080418949015831,
"grad_norm": 2.4343202114105225,
"learning_rate": 1.8842348649337116e-05,
"loss": 0.6614,
"step": 1678
},
{
"epoch": 0.8085234454944923,
"grad_norm": 2.8237218856811523,
"learning_rate": 1.875131219943187e-05,
"loss": 1.1075,
"step": 1679
},
{
"epoch": 0.8090049960874014,
"grad_norm": 2.5553195476531982,
"learning_rate": 1.8660473445161663e-05,
"loss": 0.604,
"step": 1680
},
{
"epoch": 0.8094865466803106,
"grad_norm": 1.5066719055175781,
"learning_rate": 1.856983260755686e-05,
"loss": 0.6099,
"step": 1681
},
{
"epoch": 0.8099680972732197,
"grad_norm": 1.4844105243682861,
"learning_rate": 1.8479389907166223e-05,
"loss": 0.2163,
"step": 1682
},
{
"epoch": 0.8104496478661289,
"grad_norm": 2.3299903869628906,
"learning_rate": 1.8389145564056387e-05,
"loss": 0.4654,
"step": 1683
},
{
"epoch": 0.810931198459038,
"grad_norm": 1.9507116079330444,
"learning_rate": 1.829909979781137e-05,
"loss": 0.3359,
"step": 1684
},
{
"epoch": 0.8114127490519473,
"grad_norm": 2.243999481201172,
"learning_rate": 1.820925282753201e-05,
"loss": 0.5519,
"step": 1685
},
{
"epoch": 0.8118942996448565,
"grad_norm": 1.5406464338302612,
"learning_rate": 1.8119604871835437e-05,
"loss": 0.6571,
"step": 1686
},
{
"epoch": 0.8123758502377656,
"grad_norm": 2.3631114959716797,
"learning_rate": 1.8030156148854492e-05,
"loss": 0.7404,
"step": 1687
},
{
"epoch": 0.8128574008306748,
"grad_norm": 5.411351680755615,
"learning_rate": 1.7940906876237284e-05,
"loss": 0.4588,
"step": 1688
},
{
"epoch": 0.8133389514235839,
"grad_norm": 1.9725611209869385,
"learning_rate": 1.78518572711466e-05,
"loss": 0.6191,
"step": 1689
},
{
"epoch": 0.8138205020164931,
"grad_norm": 2.150035858154297,
"learning_rate": 1.776300755025939e-05,
"loss": 1.0107,
"step": 1690
},
{
"epoch": 0.8143020526094022,
"grad_norm": 1.634321928024292,
"learning_rate": 1.767435792976626e-05,
"loss": 0.42,
"step": 1691
},
{
"epoch": 0.8147836032023115,
"grad_norm": 1.756266474723816,
"learning_rate": 1.7585908625370905e-05,
"loss": 0.7629,
"step": 1692
},
{
"epoch": 0.8152651537952206,
"grad_norm": 2.3073647022247314,
"learning_rate": 1.749765985228963e-05,
"loss": 0.8333,
"step": 1693
},
{
"epoch": 0.8157467043881298,
"grad_norm": 2.3972604274749756,
"learning_rate": 1.740961182525077e-05,
"loss": 0.4132,
"step": 1694
},
{
"epoch": 0.8162282549810389,
"grad_norm": 2.476473569869995,
"learning_rate": 1.7321764758494252e-05,
"loss": 0.8872,
"step": 1695
},
{
"epoch": 0.8167098055739481,
"grad_norm": 2.487661361694336,
"learning_rate": 1.7234118865770987e-05,
"loss": 0.685,
"step": 1696
},
{
"epoch": 0.8171913561668572,
"grad_norm": 1.8796521425247192,
"learning_rate": 1.7146674360342373e-05,
"loss": 0.6886,
"step": 1697
},
{
"epoch": 0.8176729067597664,
"grad_norm": 1.8261488676071167,
"learning_rate": 1.7059431454979824e-05,
"loss": 0.8278,
"step": 1698
},
{
"epoch": 0.8181544573526757,
"grad_norm": 4.842952728271484,
"learning_rate": 1.6972390361964195e-05,
"loss": 1.1315,
"step": 1699
},
{
"epoch": 0.8186360079455848,
"grad_norm": 1.3907809257507324,
"learning_rate": 1.688555129308531e-05,
"loss": 0.5094,
"step": 1700
},
{
"epoch": 0.819117558538494,
"grad_norm": 2.8094334602355957,
"learning_rate": 1.6798914459641434e-05,
"loss": 0.6765,
"step": 1701
},
{
"epoch": 0.8195991091314031,
"grad_norm": 1.7565284967422485,
"learning_rate": 1.6712480072438662e-05,
"loss": 0.8474,
"step": 1702
},
{
"epoch": 0.8200806597243123,
"grad_norm": 3.5661885738372803,
"learning_rate": 1.6626248341790596e-05,
"loss": 1.1639,
"step": 1703
},
{
"epoch": 0.8205622103172214,
"grad_norm": 2.280489683151245,
"learning_rate": 1.6540219477517684e-05,
"loss": 0.6265,
"step": 1704
},
{
"epoch": 0.8210437609101306,
"grad_norm": 1.485849380493164,
"learning_rate": 1.6454393688946767e-05,
"loss": 0.4715,
"step": 1705
},
{
"epoch": 0.8215253115030398,
"grad_norm": 2.863246202468872,
"learning_rate": 1.6368771184910557e-05,
"loss": 0.4076,
"step": 1706
},
{
"epoch": 0.822006862095949,
"grad_norm": 1.4689639806747437,
"learning_rate": 1.6283352173747145e-05,
"loss": 0.6784,
"step": 1707
},
{
"epoch": 0.8224884126888581,
"grad_norm": 1.777126431465149,
"learning_rate": 1.619813686329946e-05,
"loss": 0.6577,
"step": 1708
},
{
"epoch": 0.8229699632817673,
"grad_norm": 2.7305054664611816,
"learning_rate": 1.611312546091476e-05,
"loss": 0.8134,
"step": 1709
},
{
"epoch": 0.8234515138746764,
"grad_norm": 1.6367037296295166,
"learning_rate": 1.6028318173444202e-05,
"loss": 0.7774,
"step": 1710
},
{
"epoch": 0.8239330644675856,
"grad_norm": 1.054276943206787,
"learning_rate": 1.594371520724226e-05,
"loss": 0.6494,
"step": 1711
},
{
"epoch": 0.8244146150604948,
"grad_norm": 1.8237320184707642,
"learning_rate": 1.5859316768166244e-05,
"loss": 0.784,
"step": 1712
},
{
"epoch": 0.824896165653404,
"grad_norm": 2.3219659328460693,
"learning_rate": 1.5775123061575836e-05,
"loss": 0.8381,
"step": 1713
},
{
"epoch": 0.8253777162463132,
"grad_norm": 2.7865233421325684,
"learning_rate": 1.569113429233252e-05,
"loss": 0.4768,
"step": 1714
},
{
"epoch": 0.8258592668392223,
"grad_norm": 2.3359994888305664,
"learning_rate": 1.5607350664799157e-05,
"loss": 0.7649,
"step": 1715
},
{
"epoch": 0.8263408174321315,
"grad_norm": 4.821800708770752,
"learning_rate": 1.552377238283943e-05,
"loss": 0.5414,
"step": 1716
},
{
"epoch": 0.8268223680250406,
"grad_norm": 1.802925944328308,
"learning_rate": 1.5440399649817385e-05,
"loss": 0.2233,
"step": 1717
},
{
"epoch": 0.8273039186179498,
"grad_norm": 3.2591664791107178,
"learning_rate": 1.5357232668596933e-05,
"loss": 1.2976,
"step": 1718
},
{
"epoch": 0.8277854692108589,
"grad_norm": 2.02854585647583,
"learning_rate": 1.5274271641541295e-05,
"loss": 0.5666,
"step": 1719
},
{
"epoch": 0.8282670198037682,
"grad_norm": 2.4530251026153564,
"learning_rate": 1.5191516770512649e-05,
"loss": 0.7718,
"step": 1720
},
{
"epoch": 0.8287485703966773,
"grad_norm": 2.288060188293457,
"learning_rate": 1.5108968256871437e-05,
"loss": 0.333,
"step": 1721
},
{
"epoch": 0.8292301209895865,
"grad_norm": 1.380737543106079,
"learning_rate": 1.5026626301476087e-05,
"loss": 0.4997,
"step": 1722
},
{
"epoch": 0.8297116715824956,
"grad_norm": 3.533025026321411,
"learning_rate": 1.4944491104682379e-05,
"loss": 0.7909,
"step": 1723
},
{
"epoch": 0.8301932221754048,
"grad_norm": 2.4573423862457275,
"learning_rate": 1.4862562866343034e-05,
"loss": 0.6396,
"step": 1724
},
{
"epoch": 0.8306747727683139,
"grad_norm": 2.839277744293213,
"learning_rate": 1.4780841785807164e-05,
"loss": 0.6966,
"step": 1725
},
{
"epoch": 0.8311563233612231,
"grad_norm": 1.8692930936813354,
"learning_rate": 1.4699328061919848e-05,
"loss": 0.5262,
"step": 1726
},
{
"epoch": 0.8316378739541324,
"grad_norm": 2.2407124042510986,
"learning_rate": 1.4618021893021605e-05,
"loss": 0.3409,
"step": 1727
},
{
"epoch": 0.8321194245470415,
"grad_norm": 1.3630995750427246,
"learning_rate": 1.453692347694794e-05,
"loss": 0.3524,
"step": 1728
},
{
"epoch": 0.8326009751399507,
"grad_norm": 2.1543772220611572,
"learning_rate": 1.4456033011028835e-05,
"loss": 0.442,
"step": 1729
},
{
"epoch": 0.8330825257328598,
"grad_norm": 3.9747824668884277,
"learning_rate": 1.437535069208833e-05,
"loss": 0.8306,
"step": 1730
},
{
"epoch": 0.833564076325769,
"grad_norm": 2.9707400798797607,
"learning_rate": 1.4294876716443906e-05,
"loss": 0.3712,
"step": 1731
},
{
"epoch": 0.8340456269186781,
"grad_norm": 1.9354028701782227,
"learning_rate": 1.4214611279906187e-05,
"loss": 0.2021,
"step": 1732
},
{
"epoch": 0.8345271775115873,
"grad_norm": 2.2027204036712646,
"learning_rate": 1.4134554577778337e-05,
"loss": 0.7172,
"step": 1733
},
{
"epoch": 0.8350087281044964,
"grad_norm": 1.4204658269882202,
"learning_rate": 1.4054706804855634e-05,
"loss": 0.8734,
"step": 1734
},
{
"epoch": 0.8354902786974057,
"grad_norm": 2.0968925952911377,
"learning_rate": 1.3975068155424976e-05,
"loss": 0.9967,
"step": 1735
},
{
"epoch": 0.8359718292903148,
"grad_norm": 2.840298891067505,
"learning_rate": 1.3895638823264446e-05,
"loss": 0.6381,
"step": 1736
},
{
"epoch": 0.836453379883224,
"grad_norm": 1.5930904150009155,
"learning_rate": 1.3816419001642777e-05,
"loss": 0.5605,
"step": 1737
},
{
"epoch": 0.8369349304761331,
"grad_norm": 4.177980899810791,
"learning_rate": 1.3737408883318948e-05,
"loss": 0.5922,
"step": 1738
},
{
"epoch": 0.8374164810690423,
"grad_norm": 1.7408493757247925,
"learning_rate": 1.365860866054165e-05,
"loss": 0.4055,
"step": 1739
},
{
"epoch": 0.8378980316619515,
"grad_norm": 1.257311224937439,
"learning_rate": 1.358001852504891e-05,
"loss": 0.2734,
"step": 1740
},
{
"epoch": 0.8383795822548606,
"grad_norm": 1.8963124752044678,
"learning_rate": 1.3501638668067485e-05,
"loss": 0.7453,
"step": 1741
},
{
"epoch": 0.8388611328477699,
"grad_norm": 1.7414535284042358,
"learning_rate": 1.3423469280312562e-05,
"loss": 0.6258,
"step": 1742
},
{
"epoch": 0.839342683440679,
"grad_norm": 1.7837656736373901,
"learning_rate": 1.3345510551987128e-05,
"loss": 0.3573,
"step": 1743
},
{
"epoch": 0.8398242340335882,
"grad_norm": 2.218170404434204,
"learning_rate": 1.326776267278167e-05,
"loss": 0.6641,
"step": 1744
},
{
"epoch": 0.8403057846264973,
"grad_norm": 4.239348411560059,
"learning_rate": 1.3190225831873581e-05,
"loss": 0.7345,
"step": 1745
},
{
"epoch": 0.8407873352194065,
"grad_norm": 1.7612202167510986,
"learning_rate": 1.3112900217926782e-05,
"loss": 0.6602,
"step": 1746
},
{
"epoch": 0.8412688858123156,
"grad_norm": 5.180617332458496,
"learning_rate": 1.3035786019091223e-05,
"loss": 0.7354,
"step": 1747
},
{
"epoch": 0.8417504364052248,
"grad_norm": 2.2071621417999268,
"learning_rate": 1.2958883423002422e-05,
"loss": 0.93,
"step": 1748
},
{
"epoch": 0.842231986998134,
"grad_norm": 2.929159164428711,
"learning_rate": 1.288219261678103e-05,
"loss": 0.7676,
"step": 1749
},
{
"epoch": 0.8427135375910432,
"grad_norm": 1.524143934249878,
"learning_rate": 1.2805713787032381e-05,
"loss": 0.269,
"step": 1750
},
{
"epoch": 0.8431950881839523,
"grad_norm": 1.5855472087860107,
"learning_rate": 1.2729447119846016e-05,
"loss": 0.4037,
"step": 1751
},
{
"epoch": 0.8436766387768615,
"grad_norm": 2.4679388999938965,
"learning_rate": 1.265339280079525e-05,
"loss": 0.7235,
"step": 1752
},
{
"epoch": 0.8441581893697706,
"grad_norm": 3.3254940509796143,
"learning_rate": 1.257755101493665e-05,
"loss": 0.6291,
"step": 1753
},
{
"epoch": 0.8446397399626798,
"grad_norm": 1.8408324718475342,
"learning_rate": 1.2501921946809714e-05,
"loss": 0.3552,
"step": 1754
},
{
"epoch": 0.845121290555589,
"grad_norm": 2.5367562770843506,
"learning_rate": 1.2426505780436326e-05,
"loss": 0.8439,
"step": 1755
},
{
"epoch": 0.8456028411484982,
"grad_norm": 2.7886507511138916,
"learning_rate": 1.2351302699320332e-05,
"loss": 0.7676,
"step": 1756
},
{
"epoch": 0.8460843917414074,
"grad_norm": 4.4448628425598145,
"learning_rate": 1.2276312886447106e-05,
"loss": 0.8199,
"step": 1757
},
{
"epoch": 0.8465659423343165,
"grad_norm": 2.484957218170166,
"learning_rate": 1.2201536524283074e-05,
"loss": 0.6567,
"step": 1758
},
{
"epoch": 0.8470474929272257,
"grad_norm": 1.730948805809021,
"learning_rate": 1.2126973794775343e-05,
"loss": 0.4775,
"step": 1759
},
{
"epoch": 0.8475290435201348,
"grad_norm": 3.7568891048431396,
"learning_rate": 1.2052624879351104e-05,
"loss": 0.8885,
"step": 1760
},
{
"epoch": 0.848010594113044,
"grad_norm": 2.2398736476898193,
"learning_rate": 1.1978489958917382e-05,
"loss": 0.6513,
"step": 1761
},
{
"epoch": 0.8484921447059531,
"grad_norm": 2.391688346862793,
"learning_rate": 1.1904569213860472e-05,
"loss": 0.7705,
"step": 1762
},
{
"epoch": 0.8489736952988624,
"grad_norm": 1.2668508291244507,
"learning_rate": 1.1830862824045552e-05,
"loss": 0.7412,
"step": 1763
},
{
"epoch": 0.8494552458917715,
"grad_norm": 2.4677586555480957,
"learning_rate": 1.1757370968816217e-05,
"loss": 0.4662,
"step": 1764
},
{
"epoch": 0.8499367964846807,
"grad_norm": 1.866142988204956,
"learning_rate": 1.1684093826994024e-05,
"loss": 0.4521,
"step": 1765
},
{
"epoch": 0.8504183470775898,
"grad_norm": 2.4763362407684326,
"learning_rate": 1.1611031576878117e-05,
"loss": 0.601,
"step": 1766
},
{
"epoch": 0.850899897670499,
"grad_norm": 1.6049933433532715,
"learning_rate": 1.1538184396244778e-05,
"loss": 0.2667,
"step": 1767
},
{
"epoch": 0.8513814482634082,
"grad_norm": 2.2135348320007324,
"learning_rate": 1.146555246234694e-05,
"loss": 0.6749,
"step": 1768
},
{
"epoch": 0.8518629988563173,
"grad_norm": 3.2478649616241455,
"learning_rate": 1.1393135951913824e-05,
"loss": 0.6464,
"step": 1769
},
{
"epoch": 0.8523445494492266,
"grad_norm": 1.7736784219741821,
"learning_rate": 1.132093504115046e-05,
"loss": 0.5814,
"step": 1770
},
{
"epoch": 0.8528261000421357,
"grad_norm": 4.978511333465576,
"learning_rate": 1.1248949905737283e-05,
"loss": 0.5157,
"step": 1771
},
{
"epoch": 0.8533076506350449,
"grad_norm": 0.9166672229766846,
"learning_rate": 1.1177180720829694e-05,
"loss": 0.1709,
"step": 1772
},
{
"epoch": 0.853789201227954,
"grad_norm": 1.008035659790039,
"learning_rate": 1.1105627661057671e-05,
"loss": 0.4628,
"step": 1773
},
{
"epoch": 0.8542707518208632,
"grad_norm": 2.2385506629943848,
"learning_rate": 1.103429090052528e-05,
"loss": 1.1356,
"step": 1774
},
{
"epoch": 0.8547523024137723,
"grad_norm": 3.872480630874634,
"learning_rate": 1.096317061281027e-05,
"loss": 0.5905,
"step": 1775
},
{
"epoch": 0.8552338530066815,
"grad_norm": 1.684135913848877,
"learning_rate": 1.0892266970963704e-05,
"loss": 0.5081,
"step": 1776
},
{
"epoch": 0.8557154035995906,
"grad_norm": 3.901571035385132,
"learning_rate": 1.082158014750948e-05,
"loss": 0.9633,
"step": 1777
},
{
"epoch": 0.8561969541924999,
"grad_norm": 2.208216905593872,
"learning_rate": 1.0751110314443958e-05,
"loss": 0.7731,
"step": 1778
},
{
"epoch": 0.856678504785409,
"grad_norm": 2.4418656826019287,
"learning_rate": 1.0680857643235431e-05,
"loss": 0.6918,
"step": 1779
},
{
"epoch": 0.8571600553783182,
"grad_norm": 1.6305257081985474,
"learning_rate": 1.0610822304823887e-05,
"loss": 0.5908,
"step": 1780
},
{
"epoch": 0.8576416059712274,
"grad_norm": 1.6165392398834229,
"learning_rate": 1.0541004469620452e-05,
"loss": 0.6767,
"step": 1781
},
{
"epoch": 0.8581231565641365,
"grad_norm": 2.7876946926116943,
"learning_rate": 1.0471404307507016e-05,
"loss": 1.0515,
"step": 1782
},
{
"epoch": 0.8586047071570457,
"grad_norm": 3.3915517330169678,
"learning_rate": 1.0402021987835831e-05,
"loss": 0.8213,
"step": 1783
},
{
"epoch": 0.8590862577499548,
"grad_norm": 3.31449031829834,
"learning_rate": 1.0332857679429098e-05,
"loss": 0.4672,
"step": 1784
},
{
"epoch": 0.8595678083428641,
"grad_norm": 3.6324501037597656,
"learning_rate": 1.0263911550578531e-05,
"loss": 1.0045,
"step": 1785
},
{
"epoch": 0.8600493589357732,
"grad_norm": 1.995388150215149,
"learning_rate": 1.0195183769045013e-05,
"loss": 0.3615,
"step": 1786
},
{
"epoch": 0.8605309095286824,
"grad_norm": 3.063302993774414,
"learning_rate": 1.0126674502058054e-05,
"loss": 0.8128,
"step": 1787
},
{
"epoch": 0.8610124601215915,
"grad_norm": 1.1554100513458252,
"learning_rate": 1.005838391631555e-05,
"loss": 0.6989,
"step": 1788
},
{
"epoch": 0.8614940107145007,
"grad_norm": 1.5388157367706299,
"learning_rate": 9.990312177983263e-06,
"loss": 0.6449,
"step": 1789
},
{
"epoch": 0.8619755613074098,
"grad_norm": 1.3487037420272827,
"learning_rate": 9.922459452694466e-06,
"loss": 0.7874,
"step": 1790
},
{
"epoch": 0.862457111900319,
"grad_norm": 1.2846475839614868,
"learning_rate": 9.854825905549503e-06,
"loss": 0.6292,
"step": 1791
},
{
"epoch": 0.8629386624932281,
"grad_norm": 2.74332594871521,
"learning_rate": 9.787411701115456e-06,
"loss": 0.5169,
"step": 1792
},
{
"epoch": 0.8634202130861374,
"grad_norm": 1.1050007343292236,
"learning_rate": 9.720217003425647e-06,
"loss": 0.6624,
"step": 1793
},
{
"epoch": 0.8639017636790465,
"grad_norm": 2.6142866611480713,
"learning_rate": 9.65324197597931e-06,
"loss": 0.5766,
"step": 1794
},
{
"epoch": 0.8643833142719557,
"grad_norm": 1.3400239944458008,
"learning_rate": 9.58648678174121e-06,
"loss": 0.8115,
"step": 1795
},
{
"epoch": 0.8648648648648649,
"grad_norm": 1.4871549606323242,
"learning_rate": 9.51995158314113e-06,
"loss": 0.7247,
"step": 1796
},
{
"epoch": 0.865346415457774,
"grad_norm": 3.412703037261963,
"learning_rate": 9.45363654207363e-06,
"loss": 0.4651,
"step": 1797
},
{
"epoch": 0.8658279660506832,
"grad_norm": 1.186317801475525,
"learning_rate": 9.387541819897549e-06,
"loss": 0.504,
"step": 1798
},
{
"epoch": 0.8663095166435923,
"grad_norm": 3.1554412841796875,
"learning_rate": 9.321667577435634e-06,
"loss": 0.6253,
"step": 1799
},
{
"epoch": 0.8667910672365016,
"grad_norm": 2.272794246673584,
"learning_rate": 9.256013974974175e-06,
"loss": 0.5426,
"step": 1800
},
{
"epoch": 0.8672726178294107,
"grad_norm": 1.4032080173492432,
"learning_rate": 9.19058117226258e-06,
"loss": 0.4761,
"step": 1801
},
{
"epoch": 0.8677541684223199,
"grad_norm": 2.6849613189697266,
"learning_rate": 9.125369328513034e-06,
"loss": 0.7514,
"step": 1802
},
{
"epoch": 0.868235719015229,
"grad_norm": 4.494041442871094,
"learning_rate": 9.060378602400054e-06,
"loss": 0.5857,
"step": 1803
},
{
"epoch": 0.8687172696081382,
"grad_norm": 1.7596466541290283,
"learning_rate": 8.995609152060136e-06,
"loss": 0.7958,
"step": 1804
},
{
"epoch": 0.8691988202010473,
"grad_norm": 1.3782743215560913,
"learning_rate": 8.931061135091357e-06,
"loss": 0.7378,
"step": 1805
},
{
"epoch": 0.8696803707939565,
"grad_norm": 2.1466805934906006,
"learning_rate": 8.866734708553015e-06,
"loss": 1.0608,
"step": 1806
},
{
"epoch": 0.8701619213868657,
"grad_norm": 3.3685004711151123,
"learning_rate": 8.802630028965242e-06,
"loss": 0.4598,
"step": 1807
},
{
"epoch": 0.8706434719797749,
"grad_norm": 2.506319999694824,
"learning_rate": 8.738747252308555e-06,
"loss": 0.5106,
"step": 1808
},
{
"epoch": 0.8711250225726841,
"grad_norm": 0.8574779629707336,
"learning_rate": 8.675086534023591e-06,
"loss": 0.3669,
"step": 1809
},
{
"epoch": 0.8716065731655932,
"grad_norm": 2.1670174598693848,
"learning_rate": 8.611648029010643e-06,
"loss": 0.33,
"step": 1810
},
{
"epoch": 0.8720881237585024,
"grad_norm": 3.5678937435150146,
"learning_rate": 8.548431891629316e-06,
"loss": 0.7334,
"step": 1811
},
{
"epoch": 0.8725696743514115,
"grad_norm": 2.3840737342834473,
"learning_rate": 8.485438275698154e-06,
"loss": 0.3852,
"step": 1812
},
{
"epoch": 0.8730512249443207,
"grad_norm": 7.21331262588501,
"learning_rate": 8.422667334494249e-06,
"loss": 0.5615,
"step": 1813
},
{
"epoch": 0.8735327755372299,
"grad_norm": 5.849119186401367,
"learning_rate": 8.360119220752893e-06,
"loss": 0.4217,
"step": 1814
},
{
"epoch": 0.8740143261301391,
"grad_norm": 2.1622002124786377,
"learning_rate": 8.297794086667165e-06,
"loss": 0.8654,
"step": 1815
},
{
"epoch": 0.8744958767230482,
"grad_norm": 1.9863747358322144,
"learning_rate": 8.235692083887613e-06,
"loss": 0.5413,
"step": 1816
},
{
"epoch": 0.8749774273159574,
"grad_norm": 2.883000135421753,
"learning_rate": 8.173813363521843e-06,
"loss": 1.2884,
"step": 1817
},
{
"epoch": 0.8754589779088665,
"grad_norm": 2.683244466781616,
"learning_rate": 8.112158076134157e-06,
"loss": 0.6079,
"step": 1818
},
{
"epoch": 0.8759405285017757,
"grad_norm": 1.6069995164871216,
"learning_rate": 8.05072637174522e-06,
"loss": 0.56,
"step": 1819
},
{
"epoch": 0.8764220790946848,
"grad_norm": 3.1434903144836426,
"learning_rate": 7.989518399831641e-06,
"loss": 0.5649,
"step": 1820
},
{
"epoch": 0.876903629687594,
"grad_norm": 3.7238409519195557,
"learning_rate": 7.928534309325675e-06,
"loss": 0.729,
"step": 1821
},
{
"epoch": 0.8773851802805033,
"grad_norm": 1.1204873323440552,
"learning_rate": 7.8677742486148e-06,
"loss": 0.4625,
"step": 1822
},
{
"epoch": 0.8778667308734124,
"grad_norm": 2.0625314712524414,
"learning_rate": 7.807238365541391e-06,
"loss": 0.4157,
"step": 1823
},
{
"epoch": 0.8783482814663216,
"grad_norm": 2.398089647293091,
"learning_rate": 7.746926807402344e-06,
"loss": 0.611,
"step": 1824
},
{
"epoch": 0.8788298320592307,
"grad_norm": 4.007481098175049,
"learning_rate": 7.686839720948736e-06,
"loss": 1.2355,
"step": 1825
},
{
"epoch": 0.8793113826521399,
"grad_norm": 3.5721206665039062,
"learning_rate": 7.6269772523854365e-06,
"loss": 0.4283,
"step": 1826
},
{
"epoch": 0.879792933245049,
"grad_norm": 2.9199283123016357,
"learning_rate": 7.567339547370789e-06,
"loss": 0.4685,
"step": 1827
},
{
"epoch": 0.8802744838379583,
"grad_norm": 1.7368232011795044,
"learning_rate": 7.507926751016248e-06,
"loss": 0.5865,
"step": 1828
},
{
"epoch": 0.8807560344308674,
"grad_norm": 2.2206578254699707,
"learning_rate": 7.4487390078859855e-06,
"loss": 0.7996,
"step": 1829
},
{
"epoch": 0.8812375850237766,
"grad_norm": 1.2094279527664185,
"learning_rate": 7.389776461996578e-06,
"loss": 0.5491,
"step": 1830
},
{
"epoch": 0.8817191356166857,
"grad_norm": 1.2842280864715576,
"learning_rate": 7.331039256816663e-06,
"loss": 0.9658,
"step": 1831
},
{
"epoch": 0.8822006862095949,
"grad_norm": 3.4786460399627686,
"learning_rate": 7.27252753526656e-06,
"loss": 1.0196,
"step": 1832
},
{
"epoch": 0.882682236802504,
"grad_norm": 3.5382659435272217,
"learning_rate": 7.214241439717962e-06,
"loss": 1.0331,
"step": 1833
},
{
"epoch": 0.8831637873954132,
"grad_norm": 1.174157738685608,
"learning_rate": 7.1561811119935425e-06,
"loss": 0.4535,
"step": 1834
},
{
"epoch": 0.8836453379883223,
"grad_norm": 2.7539329528808594,
"learning_rate": 7.098346693366642e-06,
"loss": 0.5103,
"step": 1835
},
{
"epoch": 0.8841268885812316,
"grad_norm": 2.5821847915649414,
"learning_rate": 7.0407383245609136e-06,
"loss": 0.4046,
"step": 1836
},
{
"epoch": 0.8846084391741408,
"grad_norm": 1.5927815437316895,
"learning_rate": 6.983356145749975e-06,
"loss": 0.3623,
"step": 1837
},
{
"epoch": 0.8850899897670499,
"grad_norm": 2.2626142501831055,
"learning_rate": 6.9262002965570835e-06,
"loss": 0.6639,
"step": 1838
},
{
"epoch": 0.8855715403599591,
"grad_norm": 3.217414617538452,
"learning_rate": 6.869270916054782e-06,
"loss": 0.519,
"step": 1839
},
{
"epoch": 0.8860530909528682,
"grad_norm": 1.9122174978256226,
"learning_rate": 6.812568142764575e-06,
"loss": 0.5984,
"step": 1840
},
{
"epoch": 0.8865346415457774,
"grad_norm": 2.481517791748047,
"learning_rate": 6.756092114656587e-06,
"loss": 0.8017,
"step": 1841
},
{
"epoch": 0.8870161921386865,
"grad_norm": 2.714883327484131,
"learning_rate": 6.699842969149195e-06,
"loss": 0.5422,
"step": 1842
},
{
"epoch": 0.8874977427315958,
"grad_norm": 2.3089113235473633,
"learning_rate": 6.64382084310875e-06,
"loss": 0.5783,
"step": 1843
},
{
"epoch": 0.8879792933245049,
"grad_norm": 2.165722608566284,
"learning_rate": 6.5880258728491905e-06,
"loss": 0.354,
"step": 1844
},
{
"epoch": 0.8884608439174141,
"grad_norm": 4.5781426429748535,
"learning_rate": 6.532458194131763e-06,
"loss": 0.8101,
"step": 1845
},
{
"epoch": 0.8889423945103232,
"grad_norm": 3.533600091934204,
"learning_rate": 6.477117942164657e-06,
"loss": 0.9167,
"step": 1846
},
{
"epoch": 0.8894239451032324,
"grad_norm": 1.031320333480835,
"learning_rate": 6.422005251602658e-06,
"loss": 0.4298,
"step": 1847
},
{
"epoch": 0.8899054956961415,
"grad_norm": 2.306194543838501,
"learning_rate": 6.367120256546888e-06,
"loss": 0.4655,
"step": 1848
},
{
"epoch": 0.8903870462890507,
"grad_norm": 1.633102297782898,
"learning_rate": 6.312463090544396e-06,
"loss": 0.4393,
"step": 1849
},
{
"epoch": 0.89086859688196,
"grad_norm": 1.7181764841079712,
"learning_rate": 6.258033886587911e-06,
"loss": 0.8858,
"step": 1850
},
{
"epoch": 0.8913501474748691,
"grad_norm": 2.9164364337921143,
"learning_rate": 6.2038327771154485e-06,
"loss": 0.4769,
"step": 1851
},
{
"epoch": 0.8918316980677783,
"grad_norm": 3.113100290298462,
"learning_rate": 6.1498598940100346e-06,
"loss": 0.7217,
"step": 1852
},
{
"epoch": 0.8923132486606874,
"grad_norm": 2.327969551086426,
"learning_rate": 6.0961153685993646e-06,
"loss": 0.5315,
"step": 1853
},
{
"epoch": 0.8927947992535966,
"grad_norm": 2.7124183177948,
"learning_rate": 6.0425993316554965e-06,
"loss": 0.5386,
"step": 1854
},
{
"epoch": 0.8932763498465057,
"grad_norm": 2.0107133388519287,
"learning_rate": 5.989311913394546e-06,
"loss": 0.4306,
"step": 1855
},
{
"epoch": 0.8937579004394149,
"grad_norm": 1.8808348178863525,
"learning_rate": 5.93625324347632e-06,
"loss": 0.4791,
"step": 1856
},
{
"epoch": 0.894239451032324,
"grad_norm": 3.451119899749756,
"learning_rate": 5.8834234510040335e-06,
"loss": 0.8056,
"step": 1857
},
{
"epoch": 0.8947210016252333,
"grad_norm": 3.1976735591888428,
"learning_rate": 5.830822664523994e-06,
"loss": 0.3994,
"step": 1858
},
{
"epoch": 0.8952025522181424,
"grad_norm": 2.1719510555267334,
"learning_rate": 5.77845101202531e-06,
"loss": 0.738,
"step": 1859
},
{
"epoch": 0.8956841028110516,
"grad_norm": 4.383519172668457,
"learning_rate": 5.726308620939536e-06,
"loss": 0.5899,
"step": 1860
},
{
"epoch": 0.8961656534039607,
"grad_norm": 2.9527339935302734,
"learning_rate": 5.674395618140393e-06,
"loss": 0.3893,
"step": 1861
},
{
"epoch": 0.8966472039968699,
"grad_norm": 4.863363742828369,
"learning_rate": 5.622712129943453e-06,
"loss": 0.6031,
"step": 1862
},
{
"epoch": 0.8971287545897791,
"grad_norm": 2.3513472080230713,
"learning_rate": 5.571258282105829e-06,
"loss": 0.8987,
"step": 1863
},
{
"epoch": 0.8976103051826883,
"grad_norm": 4.060399055480957,
"learning_rate": 5.520034199825841e-06,
"loss": 0.9195,
"step": 1864
},
{
"epoch": 0.8980918557755975,
"grad_norm": 3.192730188369751,
"learning_rate": 5.469040007742776e-06,
"loss": 0.6854,
"step": 1865
},
{
"epoch": 0.8985734063685066,
"grad_norm": 3.811521530151367,
"learning_rate": 5.418275829936537e-06,
"loss": 1.2028,
"step": 1866
},
{
"epoch": 0.8990549569614158,
"grad_norm": 2.9288651943206787,
"learning_rate": 5.36774178992735e-06,
"loss": 0.6378,
"step": 1867
},
{
"epoch": 0.8995365075543249,
"grad_norm": 2.91579008102417,
"learning_rate": 5.317438010675469e-06,
"loss": 0.5374,
"step": 1868
},
{
"epoch": 0.9000180581472341,
"grad_norm": 2.687274217605591,
"learning_rate": 5.267364614580861e-06,
"loss": 0.4775,
"step": 1869
},
{
"epoch": 0.9004996087401432,
"grad_norm": 6.526017189025879,
"learning_rate": 5.217521723482943e-06,
"loss": 0.6156,
"step": 1870
},
{
"epoch": 0.9009811593330525,
"grad_norm": 2.754613161087036,
"learning_rate": 5.167909458660258e-06,
"loss": 0.9845,
"step": 1871
},
{
"epoch": 0.9014627099259616,
"grad_norm": 3.1940438747406006,
"learning_rate": 5.118527940830165e-06,
"loss": 1.0082,
"step": 1872
},
{
"epoch": 0.9019442605188708,
"grad_norm": 1.7706068754196167,
"learning_rate": 5.069377290148602e-06,
"loss": 0.3283,
"step": 1873
},
{
"epoch": 0.9024258111117799,
"grad_norm": 1.1937077045440674,
"learning_rate": 5.020457626209707e-06,
"loss": 0.252,
"step": 1874
},
{
"epoch": 0.9029073617046891,
"grad_norm": 1.5468496084213257,
"learning_rate": 4.971769068045628e-06,
"loss": 0.6309,
"step": 1875
},
{
"epoch": 0.9033889122975982,
"grad_norm": 2.2661030292510986,
"learning_rate": 4.923311734126135e-06,
"loss": 0.6594,
"step": 1876
},
{
"epoch": 0.9038704628905074,
"grad_norm": 1.7421146631240845,
"learning_rate": 4.875085742358432e-06,
"loss": 0.6087,
"step": 1877
},
{
"epoch": 0.9043520134834167,
"grad_norm": 2.4468908309936523,
"learning_rate": 4.827091210086776e-06,
"loss": 0.7757,
"step": 1878
},
{
"epoch": 0.9048335640763258,
"grad_norm": 2.273754596710205,
"learning_rate": 4.779328254092252e-06,
"loss": 1.082,
"step": 1879
},
{
"epoch": 0.905315114669235,
"grad_norm": 2.547562837600708,
"learning_rate": 4.731796990592452e-06,
"loss": 0.5908,
"step": 1880
},
{
"epoch": 0.9057966652621441,
"grad_norm": 1.9437663555145264,
"learning_rate": 4.68449753524125e-06,
"loss": 0.9443,
"step": 1881
},
{
"epoch": 0.9062782158550533,
"grad_norm": 1.4457284212112427,
"learning_rate": 4.637430003128429e-06,
"loss": 0.5939,
"step": 1882
},
{
"epoch": 0.9067597664479624,
"grad_norm": 4.274806976318359,
"learning_rate": 4.5905945087794996e-06,
"loss": 0.756,
"step": 1883
},
{
"epoch": 0.9072413170408716,
"grad_norm": 1.607393503189087,
"learning_rate": 4.543991166155337e-06,
"loss": 0.6922,
"step": 1884
},
{
"epoch": 0.9077228676337807,
"grad_norm": 3.3161227703094482,
"learning_rate": 4.497620088651966e-06,
"loss": 0.7247,
"step": 1885
},
{
"epoch": 0.90820441822669,
"grad_norm": 3.3778584003448486,
"learning_rate": 4.451481389100232e-06,
"loss": 0.5756,
"step": 1886
},
{
"epoch": 0.9086859688195991,
"grad_norm": 4.653063774108887,
"learning_rate": 4.405575179765586e-06,
"loss": 0.6268,
"step": 1887
},
{
"epoch": 0.9091675194125083,
"grad_norm": 1.695256233215332,
"learning_rate": 4.359901572347758e-06,
"loss": 0.8092,
"step": 1888
},
{
"epoch": 0.9096490700054174,
"grad_norm": 2.5265443325042725,
"learning_rate": 4.314460677980537e-06,
"loss": 0.5014,
"step": 1889
},
{
"epoch": 0.9101306205983266,
"grad_norm": 1.132360816001892,
"learning_rate": 4.269252607231422e-06,
"loss": 0.418,
"step": 1890
},
{
"epoch": 0.9106121711912358,
"grad_norm": 1.7842097282409668,
"learning_rate": 4.224277470101445e-06,
"loss": 0.8378,
"step": 1891
},
{
"epoch": 0.9110937217841449,
"grad_norm": 1.7560157775878906,
"learning_rate": 4.179535376024857e-06,
"loss": 0.7296,
"step": 1892
},
{
"epoch": 0.9115752723770542,
"grad_norm": 1.5116153955459595,
"learning_rate": 4.135026433868827e-06,
"loss": 0.7794,
"step": 1893
},
{
"epoch": 0.9120568229699633,
"grad_norm": 2.2078778743743896,
"learning_rate": 4.090750751933248e-06,
"loss": 0.9489,
"step": 1894
},
{
"epoch": 0.9125383735628725,
"grad_norm": 2.9103267192840576,
"learning_rate": 4.046708437950464e-06,
"loss": 0.733,
"step": 1895
},
{
"epoch": 0.9130199241557816,
"grad_norm": 2.259371280670166,
"learning_rate": 4.0028995990849084e-06,
"loss": 0.476,
"step": 1896
},
{
"epoch": 0.9135014747486908,
"grad_norm": 2.4890007972717285,
"learning_rate": 3.95932434193299e-06,
"loss": 0.4656,
"step": 1897
},
{
"epoch": 0.9139830253415999,
"grad_norm": 3.960632562637329,
"learning_rate": 3.915982772522719e-06,
"loss": 0.74,
"step": 1898
},
{
"epoch": 0.9144645759345091,
"grad_norm": 1.7056382894515991,
"learning_rate": 3.872874996313513e-06,
"loss": 0.5293,
"step": 1899
},
{
"epoch": 0.9149461265274182,
"grad_norm": 2.551649332046509,
"learning_rate": 3.830001118195936e-06,
"loss": 0.5079,
"step": 1900
},
{
"epoch": 0.9154276771203275,
"grad_norm": 1.9389688968658447,
"learning_rate": 3.787361242491394e-06,
"loss": 0.3823,
"step": 1901
},
{
"epoch": 0.9159092277132366,
"grad_norm": 1.6648590564727783,
"learning_rate": 3.744955472951928e-06,
"loss": 0.3093,
"step": 1902
},
{
"epoch": 0.9163907783061458,
"grad_norm": 3.3412230014801025,
"learning_rate": 3.702783912759955e-06,
"loss": 0.8416,
"step": 1903
},
{
"epoch": 0.916872328899055,
"grad_norm": 1.401397943496704,
"learning_rate": 3.660846664528006e-06,
"loss": 0.544,
"step": 1904
},
{
"epoch": 0.9173538794919641,
"grad_norm": 3.2457292079925537,
"learning_rate": 3.6191438302984772e-06,
"loss": 0.7385,
"step": 1905
},
{
"epoch": 0.9178354300848733,
"grad_norm": 2.4073991775512695,
"learning_rate": 3.577675511543388e-06,
"loss": 0.5313,
"step": 1906
},
{
"epoch": 0.9183169806777824,
"grad_norm": 1.6431150436401367,
"learning_rate": 3.5364418091641373e-06,
"loss": 0.9428,
"step": 1907
},
{
"epoch": 0.9187985312706917,
"grad_norm": 2.662550926208496,
"learning_rate": 3.495442823491224e-06,
"loss": 0.8064,
"step": 1908
},
{
"epoch": 0.9192800818636008,
"grad_norm": 2.9634780883789062,
"learning_rate": 3.4546786542840605e-06,
"loss": 0.646,
"step": 1909
},
{
"epoch": 0.91976163245651,
"grad_norm": 2.133837938308716,
"learning_rate": 3.4141494007306816e-06,
"loss": 0.419,
"step": 1910
},
{
"epoch": 0.9202431830494191,
"grad_norm": 1.918086051940918,
"learning_rate": 3.373855161447548e-06,
"loss": 0.8063,
"step": 1911
},
{
"epoch": 0.9207247336423283,
"grad_norm": 1.7618401050567627,
"learning_rate": 3.333796034479242e-06,
"loss": 0.5835,
"step": 1912
},
{
"epoch": 0.9212062842352374,
"grad_norm": 3.4800868034362793,
"learning_rate": 3.293972117298294e-06,
"loss": 0.7598,
"step": 1913
},
{
"epoch": 0.9216878348281466,
"grad_norm": 2.787062168121338,
"learning_rate": 3.2543835068049255e-06,
"loss": 0.4116,
"step": 1914
},
{
"epoch": 0.9221693854210558,
"grad_norm": 2.82818341255188,
"learning_rate": 3.21503029932676e-06,
"loss": 0.7228,
"step": 1915
},
{
"epoch": 0.922650936013965,
"grad_norm": 1.5212979316711426,
"learning_rate": 3.1759125906186793e-06,
"loss": 0.4513,
"step": 1916
},
{
"epoch": 0.9231324866068741,
"grad_norm": 3.6214406490325928,
"learning_rate": 3.137030475862535e-06,
"loss": 1.0936,
"step": 1917
},
{
"epoch": 0.9236140371997833,
"grad_norm": 2.9912490844726562,
"learning_rate": 3.098384049666925e-06,
"loss": 0.3953,
"step": 1918
},
{
"epoch": 0.9240955877926925,
"grad_norm": 1.5457743406295776,
"learning_rate": 3.059973406066963e-06,
"loss": 0.4376,
"step": 1919
},
{
"epoch": 0.9245771383856016,
"grad_norm": 3.1872262954711914,
"learning_rate": 3.0217986385240537e-06,
"loss": 0.5669,
"step": 1920
},
{
"epoch": 0.9250586889785108,
"grad_norm": 2.594231605529785,
"learning_rate": 2.983859839925662e-06,
"loss": 0.6722,
"step": 1921
},
{
"epoch": 0.92554023957142,
"grad_norm": 2.3845884799957275,
"learning_rate": 2.94615710258509e-06,
"loss": 0.8501,
"step": 1922
},
{
"epoch": 0.9260217901643292,
"grad_norm": 3.532987117767334,
"learning_rate": 2.908690518241275e-06,
"loss": 0.544,
"step": 1923
},
{
"epoch": 0.9265033407572383,
"grad_norm": 2.7089667320251465,
"learning_rate": 2.8714601780584937e-06,
"loss": 0.2766,
"step": 1924
},
{
"epoch": 0.9269848913501475,
"grad_norm": 2.490257740020752,
"learning_rate": 2.834466172626238e-06,
"loss": 0.2841,
"step": 1925
},
{
"epoch": 0.9274664419430566,
"grad_norm": 1.7176368236541748,
"learning_rate": 2.7977085919589254e-06,
"loss": 0.3851,
"step": 1926
},
{
"epoch": 0.9279479925359658,
"grad_norm": 2.444751024246216,
"learning_rate": 2.76118752549569e-06,
"loss": 0.4477,
"step": 1927
},
{
"epoch": 0.9284295431288749,
"grad_norm": 1.8582123517990112,
"learning_rate": 2.7249030621001924e-06,
"loss": 0.6806,
"step": 1928
},
{
"epoch": 0.9289110937217842,
"grad_norm": 1.904004454612732,
"learning_rate": 2.688855290060399e-06,
"loss": 0.619,
"step": 1929
},
{
"epoch": 0.9293926443146933,
"grad_norm": 2.0747194290161133,
"learning_rate": 2.653044297088314e-06,
"loss": 0.7597,
"step": 1930
},
{
"epoch": 0.9298741949076025,
"grad_norm": 2.5333268642425537,
"learning_rate": 2.6174701703198468e-06,
"loss": 0.792,
"step": 1931
},
{
"epoch": 0.9303557455005117,
"grad_norm": 2.685360908508301,
"learning_rate": 2.5821329963145347e-06,
"loss": 0.7001,
"step": 1932
},
{
"epoch": 0.9308372960934208,
"grad_norm": 3.6770102977752686,
"learning_rate": 2.547032861055376e-06,
"loss": 0.6245,
"step": 1933
},
{
"epoch": 0.93131884668633,
"grad_norm": 1.6112256050109863,
"learning_rate": 2.5121698499485757e-06,
"loss": 0.6619,
"step": 1934
},
{
"epoch": 0.9318003972792391,
"grad_norm": 2.173598289489746,
"learning_rate": 2.4775440478233993e-06,
"loss": 0.7539,
"step": 1935
},
{
"epoch": 0.9322819478721484,
"grad_norm": 2.8748226165771484,
"learning_rate": 2.4431555389319074e-06,
"loss": 0.6625,
"step": 1936
},
{
"epoch": 0.9327634984650575,
"grad_norm": 2.1532397270202637,
"learning_rate": 2.4090044069487784e-06,
"loss": 0.9355,
"step": 1937
},
{
"epoch": 0.9332450490579667,
"grad_norm": 4.027218341827393,
"learning_rate": 2.3750907349711084e-06,
"loss": 0.5421,
"step": 1938
},
{
"epoch": 0.9337265996508758,
"grad_norm": 2.222975015640259,
"learning_rate": 2.3414146055182106e-06,
"loss": 0.8391,
"step": 1939
},
{
"epoch": 0.934208150243785,
"grad_norm": 3.7854230403900146,
"learning_rate": 2.307976100531384e-06,
"loss": 0.4316,
"step": 1940
},
{
"epoch": 0.9346897008366941,
"grad_norm": 1.5810474157333374,
"learning_rate": 2.274775301373744e-06,
"loss": 0.5887,
"step": 1941
},
{
"epoch": 0.9351712514296033,
"grad_norm": 1.0357911586761475,
"learning_rate": 2.241812288830003e-06,
"loss": 0.594,
"step": 1942
},
{
"epoch": 0.9356528020225124,
"grad_norm": 3.268486261367798,
"learning_rate": 2.2090871431063253e-06,
"loss": 0.4582,
"step": 1943
},
{
"epoch": 0.9361343526154217,
"grad_norm": 2.9726314544677734,
"learning_rate": 2.176599943830071e-06,
"loss": 0.8042,
"step": 1944
},
{
"epoch": 0.9366159032083309,
"grad_norm": 1.949102520942688,
"learning_rate": 2.144350770049597e-06,
"loss": 0.4591,
"step": 1945
},
{
"epoch": 0.93709745380124,
"grad_norm": 2.1186540126800537,
"learning_rate": 2.112339700234156e-06,
"loss": 0.8347,
"step": 1946
},
{
"epoch": 0.9375790043941492,
"grad_norm": 4.00067138671875,
"learning_rate": 2.0805668122735767e-06,
"loss": 0.5937,
"step": 1947
},
{
"epoch": 0.9380605549870583,
"grad_norm": 2.5911052227020264,
"learning_rate": 2.0490321834781833e-06,
"loss": 0.5266,
"step": 1948
},
{
"epoch": 0.9385421055799675,
"grad_norm": 1.883213758468628,
"learning_rate": 2.0177358905785537e-06,
"loss": 0.6082,
"step": 1949
},
{
"epoch": 0.9390236561728766,
"grad_norm": 1.9823429584503174,
"learning_rate": 1.986678009725329e-06,
"loss": 0.5017,
"step": 1950
},
{
"epoch": 0.9395052067657859,
"grad_norm": 1.3896251916885376,
"learning_rate": 1.955858616489059e-06,
"loss": 0.8347,
"step": 1951
},
{
"epoch": 0.939986757358695,
"grad_norm": 6.104365348815918,
"learning_rate": 1.9252777858599915e-06,
"loss": 0.7993,
"step": 1952
},
{
"epoch": 0.9404683079516042,
"grad_norm": 2.0376524925231934,
"learning_rate": 1.8949355922479151e-06,
"loss": 0.4812,
"step": 1953
},
{
"epoch": 0.9409498585445133,
"grad_norm": 4.846323013305664,
"learning_rate": 1.8648321094819287e-06,
"loss": 0.5424,
"step": 1954
},
{
"epoch": 0.9414314091374225,
"grad_norm": 1.7472106218338013,
"learning_rate": 1.8349674108103288e-06,
"loss": 0.56,
"step": 1955
},
{
"epoch": 0.9419129597303316,
"grad_norm": 1.900530219078064,
"learning_rate": 1.8053415689003872e-06,
"loss": 0.6446,
"step": 1956
},
{
"epoch": 0.9423945103232408,
"grad_norm": 4.014410972595215,
"learning_rate": 1.7759546558381967e-06,
"loss": 0.4733,
"step": 1957
},
{
"epoch": 0.94287606091615,
"grad_norm": 1.2827335596084595,
"learning_rate": 1.7468067431284707e-06,
"loss": 0.4226,
"step": 1958
},
{
"epoch": 0.9433576115090592,
"grad_norm": 1.8222554922103882,
"learning_rate": 1.7178979016943764e-06,
"loss": 0.3871,
"step": 1959
},
{
"epoch": 0.9438391621019684,
"grad_norm": 2.0032083988189697,
"learning_rate": 1.6892282018773908e-06,
"loss": 0.6295,
"step": 1960
},
{
"epoch": 0.9443207126948775,
"grad_norm": 3.710329055786133,
"learning_rate": 1.6607977134370789e-06,
"loss": 1.0069,
"step": 1961
},
{
"epoch": 0.9448022632877867,
"grad_norm": 1.8269236087799072,
"learning_rate": 1.6326065055510043e-06,
"loss": 0.8347,
"step": 1962
},
{
"epoch": 0.9452838138806958,
"grad_norm": 2.4813623428344727,
"learning_rate": 1.6046546468144407e-06,
"loss": 0.7641,
"step": 1963
},
{
"epoch": 0.945765364473605,
"grad_norm": 3.3731300830841064,
"learning_rate": 1.576942205240317e-06,
"loss": 0.5967,
"step": 1964
},
{
"epoch": 0.9462469150665141,
"grad_norm": 2.570126533508301,
"learning_rate": 1.5494692482590057e-06,
"loss": 0.5784,
"step": 1965
},
{
"epoch": 0.9467284656594234,
"grad_norm": 2.100484609603882,
"learning_rate": 1.522235842718156e-06,
"loss": 0.4698,
"step": 1966
},
{
"epoch": 0.9472100162523325,
"grad_norm": 2.475597620010376,
"learning_rate": 1.4952420548825285e-06,
"loss": 0.4489,
"step": 1967
},
{
"epoch": 0.9476915668452417,
"grad_norm": 2.690720796585083,
"learning_rate": 1.468487950433839e-06,
"loss": 0.7515,
"step": 1968
},
{
"epoch": 0.9481731174381508,
"grad_norm": 1.925948977470398,
"learning_rate": 1.441973594470636e-06,
"loss": 0.5974,
"step": 1969
},
{
"epoch": 0.94865466803106,
"grad_norm": 3.1811184883117676,
"learning_rate": 1.415699051508068e-06,
"loss": 0.4421,
"step": 1970
},
{
"epoch": 0.9491362186239691,
"grad_norm": 1.8990108966827393,
"learning_rate": 1.3896643854777847e-06,
"loss": 0.641,
"step": 1971
},
{
"epoch": 0.9496177692168783,
"grad_norm": 0.7047016024589539,
"learning_rate": 1.3638696597277679e-06,
"loss": 0.273,
"step": 1972
},
{
"epoch": 0.9500993198097876,
"grad_norm": 1.2902841567993164,
"learning_rate": 1.3383149370221449e-06,
"loss": 0.382,
"step": 1973
},
{
"epoch": 0.9505808704026967,
"grad_norm": 2.200690984725952,
"learning_rate": 1.313000279541121e-06,
"loss": 0.677,
"step": 1974
},
{
"epoch": 0.9510624209956059,
"grad_norm": 1.2776658535003662,
"learning_rate": 1.287925748880703e-06,
"loss": 0.4875,
"step": 1975
},
{
"epoch": 0.951543971588515,
"grad_norm": 3.179433822631836,
"learning_rate": 1.2630914060526522e-06,
"loss": 0.7287,
"step": 1976
},
{
"epoch": 0.9520255221814242,
"grad_norm": 1.7882678508758545,
"learning_rate": 1.2384973114843101e-06,
"loss": 0.6366,
"step": 1977
},
{
"epoch": 0.9525070727743333,
"grad_norm": 2.128645420074463,
"learning_rate": 1.2141435250184185e-06,
"loss": 0.3159,
"step": 1978
},
{
"epoch": 0.9529886233672425,
"grad_norm": 2.0305099487304688,
"learning_rate": 1.1900301059130093e-06,
"loss": 0.7256,
"step": 1979
},
{
"epoch": 0.9534701739601517,
"grad_norm": 3.4359750747680664,
"learning_rate": 1.1661571128412596e-06,
"loss": 0.5136,
"step": 1980
},
{
"epoch": 0.9539517245530609,
"grad_norm": 2.7383172512054443,
"learning_rate": 1.142524603891315e-06,
"loss": 1.0318,
"step": 1981
},
{
"epoch": 0.95443327514597,
"grad_norm": 2.489351511001587,
"learning_rate": 1.1191326365661892e-06,
"loss": 0.6141,
"step": 1982
},
{
"epoch": 0.9549148257388792,
"grad_norm": 2.3964600563049316,
"learning_rate": 1.0959812677835968e-06,
"loss": 0.4736,
"step": 1983
},
{
"epoch": 0.9553963763317883,
"grad_norm": 5.277529716491699,
"learning_rate": 1.0730705538758322e-06,
"loss": 0.8058,
"step": 1984
},
{
"epoch": 0.9558779269246975,
"grad_norm": 2.238236904144287,
"learning_rate": 1.0504005505896141e-06,
"loss": 0.5026,
"step": 1985
},
{
"epoch": 0.9563594775176067,
"grad_norm": 1.928312063217163,
"learning_rate": 1.0279713130859514e-06,
"loss": 0.8141,
"step": 1986
},
{
"epoch": 0.9568410281105159,
"grad_norm": 2.1432981491088867,
"learning_rate": 1.005782895940055e-06,
"loss": 1.2314,
"step": 1987
},
{
"epoch": 0.9573225787034251,
"grad_norm": 5.187903881072998,
"learning_rate": 9.838353531411272e-07,
"loss": 0.5629,
"step": 1988
},
{
"epoch": 0.9578041292963342,
"grad_norm": 2.8405728340148926,
"learning_rate": 9.62128738092294e-07,
"loss": 0.5505,
"step": 1989
},
{
"epoch": 0.9582856798892434,
"grad_norm": 2.8619463443756104,
"learning_rate": 9.406631036104508e-07,
"loss": 0.3591,
"step": 1990
},
{
"epoch": 0.9587672304821525,
"grad_norm": 2.144484758377075,
"learning_rate": 9.194385019261287e-07,
"loss": 0.7098,
"step": 1991
},
{
"epoch": 0.9592487810750617,
"grad_norm": 1.5289490222930908,
"learning_rate": 8.984549846833612e-07,
"loss": 0.5621,
"step": 1992
},
{
"epoch": 0.9597303316679708,
"grad_norm": 1.5724802017211914,
"learning_rate": 8.777126029396065e-07,
"loss": 0.3373,
"step": 1993
},
{
"epoch": 0.9602118822608801,
"grad_norm": 3.027939796447754,
"learning_rate": 8.572114071655479e-07,
"loss": 0.4875,
"step": 1994
},
{
"epoch": 0.9606934328537892,
"grad_norm": 2.6502504348754883,
"learning_rate": 8.369514472450379e-07,
"loss": 0.5928,
"step": 1995
},
{
"epoch": 0.9611749834466984,
"grad_norm": 1.704206943511963,
"learning_rate": 8.169327724749543e-07,
"loss": 0.642,
"step": 1996
},
{
"epoch": 0.9616565340396075,
"grad_norm": 2.1913046836853027,
"learning_rate": 7.971554315650442e-07,
"loss": 0.6125,
"step": 1997
},
{
"epoch": 0.9621380846325167,
"grad_norm": 2.5087268352508545,
"learning_rate": 7.776194726378583e-07,
"loss": 0.7641,
"step": 1998
},
{
"epoch": 0.9626196352254258,
"grad_norm": 2.2763671875,
"learning_rate": 7.583249432286277e-07,
"loss": 0.7061,
"step": 1999
},
{
"epoch": 0.963101185818335,
"grad_norm": 3.4482977390289307,
"learning_rate": 7.392718902850981e-07,
"loss": 0.7786,
"step": 2000
},
{
"epoch": 0.9635827364112443,
"grad_norm": 3.177415132522583,
"learning_rate": 7.204603601674853e-07,
"loss": 0.6386,
"step": 2001
},
{
"epoch": 0.9640642870041534,
"grad_norm": 3.915245771408081,
"learning_rate": 7.018903986483083e-07,
"loss": 0.5658,
"step": 2002
},
{
"epoch": 0.9645458375970626,
"grad_norm": 1.8997553586959839,
"learning_rate": 6.835620509122897e-07,
"loss": 0.571,
"step": 2003
},
{
"epoch": 0.9650273881899717,
"grad_norm": 3.1082587242126465,
"learning_rate": 6.65475361556267e-07,
"loss": 0.6074,
"step": 2004
},
{
"epoch": 0.9655089387828809,
"grad_norm": 2.994706153869629,
"learning_rate": 6.47630374589081e-07,
"loss": 0.7855,
"step": 2005
},
{
"epoch": 0.96599048937579,
"grad_norm": 1.8663792610168457,
"learning_rate": 6.300271334314434e-07,
"loss": 0.2545,
"step": 2006
},
{
"epoch": 0.9664720399686992,
"grad_norm": 1.586185097694397,
"learning_rate": 6.126656809158359e-07,
"loss": 0.8023,
"step": 2007
},
{
"epoch": 0.9669535905616083,
"grad_norm": 1.375166416168213,
"learning_rate": 5.955460592864337e-07,
"loss": 0.4675,
"step": 2008
},
{
"epoch": 0.9674351411545176,
"grad_norm": 2.125596761703491,
"learning_rate": 5.78668310198982e-07,
"loss": 0.7644,
"step": 2009
},
{
"epoch": 0.9679166917474267,
"grad_norm": 1.8324317932128906,
"learning_rate": 5.620324747207084e-07,
"loss": 0.4852,
"step": 2010
},
{
"epoch": 0.9683982423403359,
"grad_norm": 1.296980619430542,
"learning_rate": 5.456385933301777e-07,
"loss": 0.1305,
"step": 2011
},
{
"epoch": 0.968879792933245,
"grad_norm": 3.3267123699188232,
"learning_rate": 5.294867059172592e-07,
"loss": 0.744,
"step": 2012
},
{
"epoch": 0.9693613435261542,
"grad_norm": 1.4626914262771606,
"learning_rate": 5.135768517829819e-07,
"loss": 0.6947,
"step": 2013
},
{
"epoch": 0.9698428941190634,
"grad_norm": 2.6075901985168457,
"learning_rate": 4.979090696394795e-07,
"loss": 0.4833,
"step": 2014
},
{
"epoch": 0.9703244447119725,
"grad_norm": 1.679829478263855,
"learning_rate": 4.824833976098453e-07,
"loss": 0.6612,
"step": 2015
},
{
"epoch": 0.9708059953048818,
"grad_norm": 3.0018815994262695,
"learning_rate": 4.6729987322807757e-07,
"loss": 0.8277,
"step": 2016
},
{
"epoch": 0.9712875458977909,
"grad_norm": 1.4263994693756104,
"learning_rate": 4.523585334389679e-07,
"loss": 0.6912,
"step": 2017
},
{
"epoch": 0.9717690964907001,
"grad_norm": 3.329765558242798,
"learning_rate": 4.3765941459804614e-07,
"loss": 0.7018,
"step": 2018
},
{
"epoch": 0.9722506470836092,
"grad_norm": 3.2026407718658447,
"learning_rate": 4.232025524714356e-07,
"loss": 0.7325,
"step": 2019
},
{
"epoch": 0.9727321976765184,
"grad_norm": 2.716898202896118,
"learning_rate": 4.0898798223582e-07,
"loss": 0.6388,
"step": 2020
},
{
"epoch": 0.9732137482694275,
"grad_norm": 1.2587398290634155,
"learning_rate": 3.950157384783104e-07,
"loss": 0.7916,
"step": 2021
},
{
"epoch": 0.9736952988623367,
"grad_norm": 1.3127981424331665,
"learning_rate": 3.8128585519640046e-07,
"loss": 0.4851,
"step": 2022
},
{
"epoch": 0.9741768494552459,
"grad_norm": 2.5416173934936523,
"learning_rate": 3.677983657978779e-07,
"loss": 0.6746,
"step": 2023
},
{
"epoch": 0.9746584000481551,
"grad_norm": 2.9663803577423096,
"learning_rate": 3.545533031007131e-07,
"loss": 0.7857,
"step": 2024
},
{
"epoch": 0.9751399506410642,
"grad_norm": 4.189087867736816,
"learning_rate": 3.415506993330153e-07,
"loss": 0.4633,
"step": 2025
},
{
"epoch": 0.9756215012339734,
"grad_norm": 3.007153034210205,
"learning_rate": 3.2879058613292105e-07,
"loss": 0.64,
"step": 2026
},
{
"epoch": 0.9761030518268826,
"grad_norm": 1.7615995407104492,
"learning_rate": 3.1627299454856095e-07,
"loss": 0.6468,
"step": 2027
},
{
"epoch": 0.9765846024197917,
"grad_norm": 3.9861655235290527,
"learning_rate": 3.0399795503793793e-07,
"loss": 1.0444,
"step": 2028
},
{
"epoch": 0.9770661530127009,
"grad_norm": 1.9470893144607544,
"learning_rate": 2.9196549746888235e-07,
"loss": 0.516,
"step": 2029
},
{
"epoch": 0.97754770360561,
"grad_norm": 1.5506056547164917,
"learning_rate": 2.801756511189524e-07,
"loss": 0.2129,
"step": 2030
},
{
"epoch": 0.9780292541985193,
"grad_norm": 4.4033331871032715,
"learning_rate": 2.686284446754006e-07,
"loss": 0.7624,
"step": 2031
},
{
"epoch": 0.9785108047914284,
"grad_norm": 2.394306182861328,
"learning_rate": 2.573239062350963e-07,
"loss": 0.8467,
"step": 2032
},
{
"epoch": 0.9789923553843376,
"grad_norm": 2.18649959564209,
"learning_rate": 2.4626206330440326e-07,
"loss": 0.9826,
"step": 2033
},
{
"epoch": 0.9794739059772467,
"grad_norm": 2.4696927070617676,
"learning_rate": 2.3544294279918e-07,
"loss": 0.6252,
"step": 2034
},
{
"epoch": 0.9799554565701559,
"grad_norm": 0.9971011877059937,
"learning_rate": 2.2486657104471286e-07,
"loss": 0.6913,
"step": 2035
},
{
"epoch": 0.980437007163065,
"grad_norm": 2.627206563949585,
"learning_rate": 2.1453297377557191e-07,
"loss": 0.3573,
"step": 2036
},
{
"epoch": 0.9809185577559743,
"grad_norm": 2.772587299346924,
"learning_rate": 2.044421761356552e-07,
"loss": 0.8796,
"step": 2037
},
{
"epoch": 0.9814001083488834,
"grad_norm": 1.3121188879013062,
"learning_rate": 1.9459420267804452e-07,
"loss": 0.7171,
"step": 2038
},
{
"epoch": 0.9818816589417926,
"grad_norm": 3.698637008666992,
"learning_rate": 1.8498907736499426e-07,
"loss": 0.6321,
"step": 2039
},
{
"epoch": 0.9823632095347017,
"grad_norm": 2.19526743888855,
"learning_rate": 1.7562682356786487e-07,
"loss": 0.961,
"step": 2040
},
{
"epoch": 0.9828447601276109,
"grad_norm": 3.142582893371582,
"learning_rate": 1.665074640670228e-07,
"loss": 0.9126,
"step": 2041
},
{
"epoch": 0.9833263107205201,
"grad_norm": 2.962191343307495,
"learning_rate": 1.576310210518517e-07,
"loss": 0.566,
"step": 2042
},
{
"epoch": 0.9838078613134292,
"grad_norm": 2.7862355709075928,
"learning_rate": 1.489975161206636e-07,
"loss": 0.5524,
"step": 2043
},
{
"epoch": 0.9842894119063385,
"grad_norm": 2.1677451133728027,
"learning_rate": 1.406069702806323e-07,
"loss": 0.4326,
"step": 2044
},
{
"epoch": 0.9847709624992476,
"grad_norm": 2.6661462783813477,
"learning_rate": 1.324594039477822e-07,
"loss": 1.088,
"step": 2045
},
{
"epoch": 0.9852525130921568,
"grad_norm": 1.7038089036941528,
"learning_rate": 1.2455483694689962e-07,
"loss": 0.8435,
"step": 2046
},
{
"epoch": 0.9857340636850659,
"grad_norm": 2.608567237854004,
"learning_rate": 1.1689328851151038e-07,
"loss": 0.7363,
"step": 2047
},
{
"epoch": 0.9862156142779751,
"grad_norm": 1.127517580986023,
"learning_rate": 1.0947477728381339e-07,
"loss": 0.9246,
"step": 2048
},
{
"epoch": 0.9866971648708842,
"grad_norm": 3.015097141265869,
"learning_rate": 1.0229932131465836e-07,
"loss": 0.8423,
"step": 2049
},
{
"epoch": 0.9871787154637934,
"grad_norm": 2.7446672916412354,
"learning_rate": 9.536693806347919e-08,
"loss": 1.2947,
"step": 2050
},
{
"epoch": 0.9876602660567025,
"grad_norm": 2.5686793327331543,
"learning_rate": 8.867764439826065e-08,
"loss": 0.6687,
"step": 2051
},
{
"epoch": 0.9881418166496118,
"grad_norm": 1.1108747720718384,
"learning_rate": 8.223145659550513e-08,
"loss": 0.7631,
"step": 2052
},
{
"epoch": 0.9886233672425209,
"grad_norm": 2.3981330394744873,
"learning_rate": 7.602839034017706e-08,
"loss": 0.6197,
"step": 2053
},
{
"epoch": 0.9891049178354301,
"grad_norm": 3.0359103679656982,
"learning_rate": 7.006846072568074e-08,
"loss": 0.5404,
"step": 2054
},
{
"epoch": 0.9895864684283393,
"grad_norm": 2.437429189682007,
"learning_rate": 6.435168225381594e-08,
"loss": 0.5185,
"step": 2055
},
{
"epoch": 0.9900680190212484,
"grad_norm": 1.1326676607131958,
"learning_rate": 5.887806883474456e-08,
"loss": 0.2804,
"step": 2056
},
{
"epoch": 0.9905495696141576,
"grad_norm": 3.8812267780303955,
"learning_rate": 5.364763378694626e-08,
"loss": 0.6195,
"step": 2057
},
{
"epoch": 0.9910311202070667,
"grad_norm": 2.3206794261932373,
"learning_rate": 4.8660389837207334e-08,
"loss": 0.9294,
"step": 2058
},
{
"epoch": 0.991512670799976,
"grad_norm": 1.1168718338012695,
"learning_rate": 4.391634912056519e-08,
"loss": 0.8106,
"step": 2059
},
{
"epoch": 0.9919942213928851,
"grad_norm": 1.8295259475708008,
"learning_rate": 3.9415523180297286e-08,
"loss": 0.4507,
"step": 2060
},
{
"epoch": 0.9924757719857943,
"grad_norm": 2.970205068588257,
"learning_rate": 3.515792296789888e-08,
"loss": 0.3523,
"step": 2061
},
{
"epoch": 0.9929573225787034,
"grad_norm": 1.4164469242095947,
"learning_rate": 3.114355884301645e-08,
"loss": 0.7157,
"step": 2062
},
{
"epoch": 0.9934388731716126,
"grad_norm": 2.132336378097534,
"learning_rate": 2.7372440573469883e-08,
"loss": 0.5431,
"step": 2063
},
{
"epoch": 0.9939204237645217,
"grad_norm": 1.5810383558273315,
"learning_rate": 2.384457733520806e-08,
"loss": 0.2308,
"step": 2064
},
{
"epoch": 0.9944019743574309,
"grad_norm": 1.8615809679031372,
"learning_rate": 2.0559977712297785e-08,
"loss": 0.3219,
"step": 2065
},
{
"epoch": 0.99488352495034,
"grad_norm": 3.2156498432159424,
"learning_rate": 1.7518649696857126e-08,
"loss": 0.835,
"step": 2066
},
{
"epoch": 0.9953650755432493,
"grad_norm": 1.9598007202148438,
"learning_rate": 1.4720600689110963e-08,
"loss": 0.536,
"step": 2067
},
{
"epoch": 0.9958466261361584,
"grad_norm": 0.9099422097206116,
"learning_rate": 1.216583749731326e-08,
"loss": 0.3856,
"step": 2068
},
{
"epoch": 0.9963281767290676,
"grad_norm": 2.4464566707611084,
"learning_rate": 9.854366337758159e-09,
"loss": 0.9042,
"step": 2069
},
{
"epoch": 0.9968097273219768,
"grad_norm": 1.8802378177642822,
"learning_rate": 7.786192834746686e-09,
"loss": 0.8939,
"step": 2070
},
{
"epoch": 0.9972912779148859,
"grad_norm": 2.2916386127471924,
"learning_rate": 5.961322020608951e-09,
"loss": 0.7196,
"step": 2071
},
{
"epoch": 0.9977728285077951,
"grad_norm": 0.9568644762039185,
"learning_rate": 4.3797583356264275e-09,
"loss": 0.1672,
"step": 2072
},
{
"epoch": 0.9982543791007042,
"grad_norm": 1.4971035718917847,
"learning_rate": 3.0415056281096755e-09,
"loss": 0.4794,
"step": 2073
},
{
"epoch": 0.9987359296936135,
"grad_norm": 3.3616390228271484,
"learning_rate": 1.9465671543095197e-09,
"loss": 1.0098,
"step": 2074
},
{
"epoch": 0.9992174802865226,
"grad_norm": 3.415937662124634,
"learning_rate": 1.094945578439255e-09,
"loss": 1.0445,
"step": 2075
},
{
"epoch": 0.9996990308794318,
"grad_norm": 3.9460153579711914,
"learning_rate": 4.866429726857469e-10,
"loss": 1.0316,
"step": 2076
},
{
"epoch": 1.0,
"grad_norm": 2.570456027984619,
"learning_rate": 1.2166081717612797e-10,
"loss": 0.9203,
"step": 2077
},
{
"epoch": 1.0,
"step": 2077,
"total_flos": 6.565805779711427e+17,
"train_loss": 0.7756532824952038,
"train_runtime": 4587.6806,
"train_samples_per_second": 7.242,
"train_steps_per_second": 0.453
}
],
"logging_steps": 1,
"max_steps": 2077,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 2400000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 6.565805779711427e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}