{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 2406,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0012476606363069245,
"grad_norm": 9.34911917109977,
"learning_rate": 0.0,
"loss": 2.4049,
"num_tokens": 79590.0,
"step": 1
},
{
"epoch": 0.002495321272613849,
"grad_norm": 9.364768973646816,
"learning_rate": 1.36986301369863e-07,
"loss": 2.4134,
"num_tokens": 159103.0,
"step": 2
},
{
"epoch": 0.0037429819089207735,
"grad_norm": 9.456536966619405,
"learning_rate": 2.73972602739726e-07,
"loss": 2.4175,
"num_tokens": 237799.0,
"step": 3
},
{
"epoch": 0.004990642545227698,
"grad_norm": 9.426354062444751,
"learning_rate": 4.1095890410958903e-07,
"loss": 2.4288,
"num_tokens": 316493.0,
"step": 4
},
{
"epoch": 0.006238303181534623,
"grad_norm": 9.24236293603858,
"learning_rate": 5.47945205479452e-07,
"loss": 2.3781,
"num_tokens": 396816.0,
"step": 5
},
{
"epoch": 0.007485963817841547,
"grad_norm": 9.125681681424858,
"learning_rate": 6.849315068493151e-07,
"loss": 2.3607,
"num_tokens": 477827.0,
"step": 6
},
{
"epoch": 0.008733624454148471,
"grad_norm": 9.096924762031028,
"learning_rate": 8.219178082191781e-07,
"loss": 2.3668,
"num_tokens": 557522.0,
"step": 7
},
{
"epoch": 0.009981285090455396,
"grad_norm": 9.119042405502713,
"learning_rate": 9.589041095890411e-07,
"loss": 2.3608,
"num_tokens": 636975.0,
"step": 8
},
{
"epoch": 0.011228945726762321,
"grad_norm": 8.92660720049316,
"learning_rate": 1.095890410958904e-06,
"loss": 2.3189,
"num_tokens": 715688.0,
"step": 9
},
{
"epoch": 0.012476606363069246,
"grad_norm": 8.341929678010592,
"learning_rate": 1.2328767123287673e-06,
"loss": 2.2439,
"num_tokens": 796141.0,
"step": 10
},
{
"epoch": 0.01372426699937617,
"grad_norm": 8.261055619190312,
"learning_rate": 1.3698630136986302e-06,
"loss": 2.2317,
"num_tokens": 875804.0,
"step": 11
},
{
"epoch": 0.014971927635683094,
"grad_norm": 8.045610516077126,
"learning_rate": 1.5068493150684932e-06,
"loss": 2.1818,
"num_tokens": 957377.0,
"step": 12
},
{
"epoch": 0.016219588271990017,
"grad_norm": 6.709132143557291,
"learning_rate": 1.6438356164383561e-06,
"loss": 1.9262,
"num_tokens": 1038132.0,
"step": 13
},
{
"epoch": 0.017467248908296942,
"grad_norm": 6.562072632018573,
"learning_rate": 1.7808219178082193e-06,
"loss": 1.8848,
"num_tokens": 1119984.0,
"step": 14
},
{
"epoch": 0.018714909544603867,
"grad_norm": 6.461084025016272,
"learning_rate": 1.9178082191780823e-06,
"loss": 1.8372,
"num_tokens": 1200787.0,
"step": 15
},
{
"epoch": 0.019962570180910792,
"grad_norm": 6.432679419207876,
"learning_rate": 2.0547945205479454e-06,
"loss": 1.8135,
"num_tokens": 1281286.0,
"step": 16
},
{
"epoch": 0.021210230817217717,
"grad_norm": 6.869674718914819,
"learning_rate": 2.191780821917808e-06,
"loss": 1.2571,
"num_tokens": 1360214.0,
"step": 17
},
{
"epoch": 0.022457891453524642,
"grad_norm": 6.548310858256031,
"learning_rate": 2.3287671232876713e-06,
"loss": 1.2354,
"num_tokens": 1442460.0,
"step": 18
},
{
"epoch": 0.023705552089831567,
"grad_norm": 7.14680245757635,
"learning_rate": 2.4657534246575345e-06,
"loss": 1.1882,
"num_tokens": 1522593.0,
"step": 19
},
{
"epoch": 0.024953212726138492,
"grad_norm": 7.930859427939441,
"learning_rate": 2.6027397260273973e-06,
"loss": 1.0258,
"num_tokens": 1603158.0,
"step": 20
},
{
"epoch": 0.026200873362445413,
"grad_norm": 7.129618686033834,
"learning_rate": 2.7397260273972604e-06,
"loss": 0.9557,
"num_tokens": 1682215.0,
"step": 21
},
{
"epoch": 0.02744853399875234,
"grad_norm": 8.351617658742393,
"learning_rate": 2.876712328767123e-06,
"loss": 0.8064,
"num_tokens": 1762875.0,
"step": 22
},
{
"epoch": 0.028696194635059263,
"grad_norm": 6.202115696809869,
"learning_rate": 3.0136986301369864e-06,
"loss": 0.4053,
"num_tokens": 1842280.0,
"step": 23
},
{
"epoch": 0.02994385527136619,
"grad_norm": 3.015415758599779,
"learning_rate": 3.1506849315068495e-06,
"loss": 0.2977,
"num_tokens": 1922017.0,
"step": 24
},
{
"epoch": 0.031191515907673113,
"grad_norm": 1.7548078686272264,
"learning_rate": 3.2876712328767123e-06,
"loss": 0.2561,
"num_tokens": 2001171.0,
"step": 25
},
{
"epoch": 0.032439176543980035,
"grad_norm": 1.2570888602541386,
"learning_rate": 3.4246575342465754e-06,
"loss": 0.243,
"num_tokens": 2082788.0,
"step": 26
},
{
"epoch": 0.03368683718028696,
"grad_norm": 0.9858243334964526,
"learning_rate": 3.5616438356164386e-06,
"loss": 0.2084,
"num_tokens": 2163533.0,
"step": 27
},
{
"epoch": 0.034934497816593885,
"grad_norm": 0.8941120107804094,
"learning_rate": 3.6986301369863014e-06,
"loss": 0.2072,
"num_tokens": 2242151.0,
"step": 28
},
{
"epoch": 0.03618215845290081,
"grad_norm": 0.834578793765266,
"learning_rate": 3.8356164383561645e-06,
"loss": 0.2113,
"num_tokens": 2321655.0,
"step": 29
},
{
"epoch": 0.037429819089207735,
"grad_norm": 0.7958460007482222,
"learning_rate": 3.972602739726027e-06,
"loss": 0.1935,
"num_tokens": 2400547.0,
"step": 30
},
{
"epoch": 0.03867747972551466,
"grad_norm": 0.7476709118804037,
"learning_rate": 4.109589041095891e-06,
"loss": 0.1932,
"num_tokens": 2481040.0,
"step": 31
},
{
"epoch": 0.039925140361821584,
"grad_norm": 0.7336501580269786,
"learning_rate": 4.246575342465754e-06,
"loss": 0.1869,
"num_tokens": 2561815.0,
"step": 32
},
{
"epoch": 0.041172800998128506,
"grad_norm": 0.7276050156656503,
"learning_rate": 4.383561643835616e-06,
"loss": 0.1819,
"num_tokens": 2642280.0,
"step": 33
},
{
"epoch": 0.042420461634435434,
"grad_norm": 0.7320713436530883,
"learning_rate": 4.52054794520548e-06,
"loss": 0.1776,
"num_tokens": 2722505.0,
"step": 34
},
{
"epoch": 0.043668122270742356,
"grad_norm": 0.7211851177445922,
"learning_rate": 4.657534246575343e-06,
"loss": 0.1626,
"num_tokens": 2801483.0,
"step": 35
},
{
"epoch": 0.044915782907049284,
"grad_norm": 0.7706879408631792,
"learning_rate": 4.7945205479452054e-06,
"loss": 0.1652,
"num_tokens": 2880749.0,
"step": 36
},
{
"epoch": 0.046163443543356206,
"grad_norm": 0.6310470242010754,
"learning_rate": 4.931506849315069e-06,
"loss": 0.1578,
"num_tokens": 2960025.0,
"step": 37
},
{
"epoch": 0.047411104179663134,
"grad_norm": 0.5719157846400421,
"learning_rate": 5.068493150684932e-06,
"loss": 0.1559,
"num_tokens": 3040006.0,
"step": 38
},
{
"epoch": 0.048658764815970056,
"grad_norm": 0.5556127515496487,
"learning_rate": 5.2054794520547945e-06,
"loss": 0.1445,
"num_tokens": 3118779.0,
"step": 39
},
{
"epoch": 0.049906425452276984,
"grad_norm": 0.47419374873726,
"learning_rate": 5.342465753424658e-06,
"loss": 0.1494,
"num_tokens": 3200031.0,
"step": 40
},
{
"epoch": 0.051154086088583905,
"grad_norm": 0.4421370793504471,
"learning_rate": 5.479452054794521e-06,
"loss": 0.1523,
"num_tokens": 3281300.0,
"step": 41
},
{
"epoch": 0.05240174672489083,
"grad_norm": 0.43669518921865735,
"learning_rate": 5.6164383561643845e-06,
"loss": 0.1403,
"num_tokens": 3361534.0,
"step": 42
},
{
"epoch": 0.053649407361197755,
"grad_norm": 0.4277843089034562,
"learning_rate": 5.753424657534246e-06,
"loss": 0.1432,
"num_tokens": 3441863.0,
"step": 43
},
{
"epoch": 0.05489706799750468,
"grad_norm": 0.41630114035277377,
"learning_rate": 5.89041095890411e-06,
"loss": 0.1315,
"num_tokens": 3521074.0,
"step": 44
},
{
"epoch": 0.056144728633811605,
"grad_norm": 0.41370843939103547,
"learning_rate": 6.027397260273973e-06,
"loss": 0.1387,
"num_tokens": 3602411.0,
"step": 45
},
{
"epoch": 0.05739238927011853,
"grad_norm": 0.43876538412653404,
"learning_rate": 6.164383561643836e-06,
"loss": 0.1361,
"num_tokens": 3682928.0,
"step": 46
},
{
"epoch": 0.058640049906425455,
"grad_norm": 0.34801092467574696,
"learning_rate": 6.301369863013699e-06,
"loss": 0.1258,
"num_tokens": 3763667.0,
"step": 47
},
{
"epoch": 0.05988771054273238,
"grad_norm": 0.2939900530144378,
"learning_rate": 6.438356164383563e-06,
"loss": 0.1322,
"num_tokens": 3845378.0,
"step": 48
},
{
"epoch": 0.0611353711790393,
"grad_norm": 0.24786140273528653,
"learning_rate": 6.5753424657534245e-06,
"loss": 0.1267,
"num_tokens": 3925502.0,
"step": 49
},
{
"epoch": 0.06238303181534623,
"grad_norm": 0.20173255384894515,
"learning_rate": 6.712328767123288e-06,
"loss": 0.1208,
"num_tokens": 4004936.0,
"step": 50
},
{
"epoch": 0.06363069245165315,
"grad_norm": 0.18388818103544632,
"learning_rate": 6.849315068493151e-06,
"loss": 0.1225,
"num_tokens": 4084695.0,
"step": 51
},
{
"epoch": 0.06487835308796007,
"grad_norm": 0.21045301465823563,
"learning_rate": 6.9863013698630145e-06,
"loss": 0.1216,
"num_tokens": 4163992.0,
"step": 52
},
{
"epoch": 0.066126013724267,
"grad_norm": 0.18341190230418852,
"learning_rate": 7.123287671232877e-06,
"loss": 0.124,
"num_tokens": 4244593.0,
"step": 53
},
{
"epoch": 0.06737367436057393,
"grad_norm": 0.19088058421518178,
"learning_rate": 7.260273972602741e-06,
"loss": 0.1145,
"num_tokens": 4324390.0,
"step": 54
},
{
"epoch": 0.06862133499688085,
"grad_norm": 0.19103912807016007,
"learning_rate": 7.397260273972603e-06,
"loss": 0.1271,
"num_tokens": 4405698.0,
"step": 55
},
{
"epoch": 0.06986899563318777,
"grad_norm": 0.18970020622501074,
"learning_rate": 7.534246575342466e-06,
"loss": 0.112,
"num_tokens": 4485053.0,
"step": 56
},
{
"epoch": 0.07111665626949469,
"grad_norm": 0.19164305174626092,
"learning_rate": 7.671232876712329e-06,
"loss": 0.1098,
"num_tokens": 4565926.0,
"step": 57
},
{
"epoch": 0.07236431690580163,
"grad_norm": 0.19460729939643484,
"learning_rate": 7.808219178082192e-06,
"loss": 0.1087,
"num_tokens": 4644896.0,
"step": 58
},
{
"epoch": 0.07361197754210855,
"grad_norm": 0.18559860342670173,
"learning_rate": 7.945205479452055e-06,
"loss": 0.1101,
"num_tokens": 4724338.0,
"step": 59
},
{
"epoch": 0.07485963817841547,
"grad_norm": 0.20159007460588826,
"learning_rate": 8.082191780821919e-06,
"loss": 0.1194,
"num_tokens": 4805035.0,
"step": 60
},
{
"epoch": 0.07610729881472239,
"grad_norm": 0.19163951373300692,
"learning_rate": 8.219178082191782e-06,
"loss": 0.1101,
"num_tokens": 4886497.0,
"step": 61
},
{
"epoch": 0.07735495945102933,
"grad_norm": 0.1792411438909808,
"learning_rate": 8.356164383561644e-06,
"loss": 0.1052,
"num_tokens": 4966066.0,
"step": 62
},
{
"epoch": 0.07860262008733625,
"grad_norm": 0.19871949594852764,
"learning_rate": 8.493150684931507e-06,
"loss": 0.1107,
"num_tokens": 5046294.0,
"step": 63
},
{
"epoch": 0.07985028072364317,
"grad_norm": 0.2009426482501604,
"learning_rate": 8.63013698630137e-06,
"loss": 0.111,
"num_tokens": 5127200.0,
"step": 64
},
{
"epoch": 0.08109794135995009,
"grad_norm": 0.18854634659363081,
"learning_rate": 8.767123287671233e-06,
"loss": 0.1062,
"num_tokens": 5207375.0,
"step": 65
},
{
"epoch": 0.08234560199625701,
"grad_norm": 0.1909614838823602,
"learning_rate": 8.904109589041097e-06,
"loss": 0.1055,
"num_tokens": 5288077.0,
"step": 66
},
{
"epoch": 0.08359326263256395,
"grad_norm": 0.18966488054923936,
"learning_rate": 9.04109589041096e-06,
"loss": 0.0965,
"num_tokens": 5368986.0,
"step": 67
},
{
"epoch": 0.08484092326887087,
"grad_norm": 0.19600407468281897,
"learning_rate": 9.178082191780823e-06,
"loss": 0.0989,
"num_tokens": 5448866.0,
"step": 68
},
{
"epoch": 0.08608858390517779,
"grad_norm": 0.19248220731163004,
"learning_rate": 9.315068493150685e-06,
"loss": 0.1021,
"num_tokens": 5529761.0,
"step": 69
},
{
"epoch": 0.08733624454148471,
"grad_norm": 0.1790396798178214,
"learning_rate": 9.452054794520548e-06,
"loss": 0.0977,
"num_tokens": 5610079.0,
"step": 70
},
{
"epoch": 0.08858390517779165,
"grad_norm": 0.20227512996028574,
"learning_rate": 9.589041095890411e-06,
"loss": 0.103,
"num_tokens": 5690121.0,
"step": 71
},
{
"epoch": 0.08983156581409857,
"grad_norm": 0.1804340417389163,
"learning_rate": 9.726027397260275e-06,
"loss": 0.0988,
"num_tokens": 5770807.0,
"step": 72
},
{
"epoch": 0.09107922645040549,
"grad_norm": 0.18708792441875738,
"learning_rate": 9.863013698630138e-06,
"loss": 0.1007,
"num_tokens": 5851162.0,
"step": 73
},
{
"epoch": 0.09232688708671241,
"grad_norm": 0.18580134464623252,
"learning_rate": 1e-05,
"loss": 0.1015,
"num_tokens": 5933144.0,
"step": 74
},
{
"epoch": 0.09357454772301933,
"grad_norm": 0.1777511395438126,
"learning_rate": 9.999995920069922e-06,
"loss": 0.0925,
"num_tokens": 6013167.0,
"step": 75
},
{
"epoch": 0.09482220835932627,
"grad_norm": 0.18453230882301527,
"learning_rate": 9.999983680287084e-06,
"loss": 0.0995,
"num_tokens": 6092429.0,
"step": 76
},
{
"epoch": 0.09606986899563319,
"grad_norm": 0.17990626138096272,
"learning_rate": 9.99996328067368e-06,
"loss": 0.0894,
"num_tokens": 6171395.0,
"step": 77
},
{
"epoch": 0.09731752963194011,
"grad_norm": 0.18671247100201174,
"learning_rate": 9.999934721266702e-06,
"loss": 0.0954,
"num_tokens": 6252161.0,
"step": 78
},
{
"epoch": 0.09856519026824703,
"grad_norm": 0.17367676216662678,
"learning_rate": 9.999898002117937e-06,
"loss": 0.0882,
"num_tokens": 6331946.0,
"step": 79
},
{
"epoch": 0.09981285090455397,
"grad_norm": 0.17790787231369953,
"learning_rate": 9.999853123293967e-06,
"loss": 0.0948,
"num_tokens": 6412878.0,
"step": 80
},
{
"epoch": 0.10106051154086089,
"grad_norm": 0.17825720009899704,
"learning_rate": 9.99980008487617e-06,
"loss": 0.0883,
"num_tokens": 6492625.0,
"step": 81
},
{
"epoch": 0.10230817217716781,
"grad_norm": 0.177633671805786,
"learning_rate": 9.999738886960724e-06,
"loss": 0.0958,
"num_tokens": 6572706.0,
"step": 82
},
{
"epoch": 0.10355583281347473,
"grad_norm": 0.1781514598121185,
"learning_rate": 9.999669529658596e-06,
"loss": 0.1016,
"num_tokens": 6654066.0,
"step": 83
},
{
"epoch": 0.10480349344978165,
"grad_norm": 0.1797123476723708,
"learning_rate": 9.999592013095553e-06,
"loss": 0.0889,
"num_tokens": 6733703.0,
"step": 84
},
{
"epoch": 0.10605115408608859,
"grad_norm": 0.17668230595097628,
"learning_rate": 9.999506337412157e-06,
"loss": 0.0905,
"num_tokens": 6813311.0,
"step": 85
},
{
"epoch": 0.10729881472239551,
"grad_norm": 0.1635782657626139,
"learning_rate": 9.99941250276376e-06,
"loss": 0.0891,
"num_tokens": 6892900.0,
"step": 86
},
{
"epoch": 0.10854647535870243,
"grad_norm": 0.17152001517029147,
"learning_rate": 9.999310509320518e-06,
"loss": 0.0852,
"num_tokens": 6971684.0,
"step": 87
},
{
"epoch": 0.10979413599500935,
"grad_norm": 0.16674351199678547,
"learning_rate": 9.999200357267373e-06,
"loss": 0.0844,
"num_tokens": 7050727.0,
"step": 88
},
{
"epoch": 0.11104179663131628,
"grad_norm": 0.16284064715514623,
"learning_rate": 9.999082046804062e-06,
"loss": 0.0894,
"num_tokens": 7130506.0,
"step": 89
},
{
"epoch": 0.11228945726762321,
"grad_norm": 0.17699815453244996,
"learning_rate": 9.998955578145124e-06,
"loss": 0.0896,
"num_tokens": 7210539.0,
"step": 90
},
{
"epoch": 0.11353711790393013,
"grad_norm": 0.17175993286806868,
"learning_rate": 9.998820951519877e-06,
"loss": 0.0909,
"num_tokens": 7291662.0,
"step": 91
},
{
"epoch": 0.11478477854023705,
"grad_norm": 0.16895654572874105,
"learning_rate": 9.998678167172446e-06,
"loss": 0.0866,
"num_tokens": 7371708.0,
"step": 92
},
{
"epoch": 0.11603243917654397,
"grad_norm": 0.15898095818806993,
"learning_rate": 9.99852722536174e-06,
"loss": 0.0891,
"num_tokens": 7451637.0,
"step": 93
},
{
"epoch": 0.11728009981285091,
"grad_norm": 0.1655714312412077,
"learning_rate": 9.998368126361459e-06,
"loss": 0.0855,
"num_tokens": 7532024.0,
"step": 94
},
{
"epoch": 0.11852776044915783,
"grad_norm": 0.16278257907586435,
"learning_rate": 9.998200870460103e-06,
"loss": 0.0855,
"num_tokens": 7611489.0,
"step": 95
},
{
"epoch": 0.11977542108546475,
"grad_norm": 0.17618441557702208,
"learning_rate": 9.998025457960955e-06,
"loss": 0.0963,
"num_tokens": 7693716.0,
"step": 96
},
{
"epoch": 0.12102308172177167,
"grad_norm": 0.15790344775477788,
"learning_rate": 9.997841889182091e-06,
"loss": 0.0856,
"num_tokens": 7774645.0,
"step": 97
},
{
"epoch": 0.1222707423580786,
"grad_norm": 0.16185678156409003,
"learning_rate": 9.997650164456375e-06,
"loss": 0.0793,
"num_tokens": 7855390.0,
"step": 98
},
{
"epoch": 0.12351840299438553,
"grad_norm": 0.16681775638759538,
"learning_rate": 9.997450284131465e-06,
"loss": 0.0826,
"num_tokens": 7935403.0,
"step": 99
},
{
"epoch": 0.12476606363069245,
"grad_norm": 0.16550888127393662,
"learning_rate": 9.997242248569802e-06,
"loss": 0.0836,
"num_tokens": 8015879.0,
"step": 100
},
{
"epoch": 0.1260137242669994,
"grad_norm": 0.1771249194917155,
"learning_rate": 9.997026058148617e-06,
"loss": 0.0863,
"num_tokens": 8096492.0,
"step": 101
},
{
"epoch": 0.1272613849033063,
"grad_norm": 0.18698309428988277,
"learning_rate": 9.996801713259933e-06,
"loss": 0.0949,
"num_tokens": 8177949.0,
"step": 102
},
{
"epoch": 0.12850904553961323,
"grad_norm": 0.15901318998957345,
"learning_rate": 9.996569214310549e-06,
"loss": 0.0819,
"num_tokens": 8256684.0,
"step": 103
},
{
"epoch": 0.12975670617592014,
"grad_norm": 0.1659574005563422,
"learning_rate": 9.99632856172206e-06,
"loss": 0.0755,
"num_tokens": 8335236.0,
"step": 104
},
{
"epoch": 0.13100436681222707,
"grad_norm": 0.16236837894961287,
"learning_rate": 9.99607975593084e-06,
"loss": 0.0837,
"num_tokens": 8414924.0,
"step": 105
},
{
"epoch": 0.132252027448534,
"grad_norm": 0.16857841750351044,
"learning_rate": 9.995822797388052e-06,
"loss": 0.0832,
"num_tokens": 8494719.0,
"step": 106
},
{
"epoch": 0.13349968808484092,
"grad_norm": 0.16077102501626883,
"learning_rate": 9.995557686559635e-06,
"loss": 0.0825,
"num_tokens": 8574385.0,
"step": 107
},
{
"epoch": 0.13474734872114785,
"grad_norm": 0.16504694201517098,
"learning_rate": 9.995284423926318e-06,
"loss": 0.0823,
"num_tokens": 8655947.0,
"step": 108
},
{
"epoch": 0.13599500935745476,
"grad_norm": 0.16670618256633662,
"learning_rate": 9.995003009983608e-06,
"loss": 0.0859,
"num_tokens": 8735660.0,
"step": 109
},
{
"epoch": 0.1372426699937617,
"grad_norm": 0.18503209596000766,
"learning_rate": 9.994713445241793e-06,
"loss": 0.087,
"num_tokens": 8816074.0,
"step": 110
},
{
"epoch": 0.13849033063006863,
"grad_norm": 0.16851469586707762,
"learning_rate": 9.994415730225943e-06,
"loss": 0.0949,
"num_tokens": 8897733.0,
"step": 111
},
{
"epoch": 0.13973799126637554,
"grad_norm": 0.16695446108820516,
"learning_rate": 9.994109865475903e-06,
"loss": 0.0848,
"num_tokens": 8977866.0,
"step": 112
},
{
"epoch": 0.14098565190268247,
"grad_norm": 0.16886821971011903,
"learning_rate": 9.993795851546302e-06,
"loss": 0.0847,
"num_tokens": 9059979.0,
"step": 113
},
{
"epoch": 0.14223331253898938,
"grad_norm": 0.15825031266233286,
"learning_rate": 9.993473689006538e-06,
"loss": 0.0797,
"num_tokens": 9139827.0,
"step": 114
},
{
"epoch": 0.14348097317529632,
"grad_norm": 0.1628042779419236,
"learning_rate": 9.99314337844079e-06,
"loss": 0.0866,
"num_tokens": 9220225.0,
"step": 115
},
{
"epoch": 0.14472863381160325,
"grad_norm": 0.17318475399282052,
"learning_rate": 9.992804920448013e-06,
"loss": 0.0835,
"num_tokens": 9300879.0,
"step": 116
},
{
"epoch": 0.14597629444791016,
"grad_norm": 0.15657831342283274,
"learning_rate": 9.992458315641932e-06,
"loss": 0.0763,
"num_tokens": 9380164.0,
"step": 117
},
{
"epoch": 0.1472239550842171,
"grad_norm": 0.1716611418125263,
"learning_rate": 9.992103564651048e-06,
"loss": 0.0864,
"num_tokens": 9460543.0,
"step": 118
},
{
"epoch": 0.14847161572052403,
"grad_norm": 0.15729587536249917,
"learning_rate": 9.991740668118629e-06,
"loss": 0.078,
"num_tokens": 9540063.0,
"step": 119
},
{
"epoch": 0.14971927635683094,
"grad_norm": 0.17115153204306796,
"learning_rate": 9.991369626702717e-06,
"loss": 0.0838,
"num_tokens": 9620959.0,
"step": 120
},
{
"epoch": 0.15096693699313787,
"grad_norm": 0.16228121857689287,
"learning_rate": 9.990990441076125e-06,
"loss": 0.082,
"num_tokens": 9702803.0,
"step": 121
},
{
"epoch": 0.15221459762944478,
"grad_norm": 0.1635063935531281,
"learning_rate": 9.990603111926424e-06,
"loss": 0.0788,
"num_tokens": 9782410.0,
"step": 122
},
{
"epoch": 0.15346225826575172,
"grad_norm": 0.17181078017382084,
"learning_rate": 9.990207639955969e-06,
"loss": 0.0819,
"num_tokens": 9863350.0,
"step": 123
},
{
"epoch": 0.15470991890205865,
"grad_norm": 0.1643811410282833,
"learning_rate": 9.989804025881862e-06,
"loss": 0.077,
"num_tokens": 9942485.0,
"step": 124
},
{
"epoch": 0.15595757953836556,
"grad_norm": 0.1659276911169034,
"learning_rate": 9.98939227043598e-06,
"loss": 0.0785,
"num_tokens": 10022555.0,
"step": 125
},
{
"epoch": 0.1572052401746725,
"grad_norm": 0.1652036681004015,
"learning_rate": 9.988972374364961e-06,
"loss": 0.0802,
"num_tokens": 10102391.0,
"step": 126
},
{
"epoch": 0.1584529008109794,
"grad_norm": 0.17240153860438123,
"learning_rate": 9.988544338430203e-06,
"loss": 0.0796,
"num_tokens": 10183708.0,
"step": 127
},
{
"epoch": 0.15970056144728634,
"grad_norm": 0.16529523837288534,
"learning_rate": 9.988108163407865e-06,
"loss": 0.0809,
"num_tokens": 10265029.0,
"step": 128
},
{
"epoch": 0.16094822208359327,
"grad_norm": 0.1684552284249565,
"learning_rate": 9.987663850088862e-06,
"loss": 0.0787,
"num_tokens": 10344489.0,
"step": 129
},
{
"epoch": 0.16219588271990018,
"grad_norm": 0.15609693921109477,
"learning_rate": 9.987211399278871e-06,
"loss": 0.0765,
"num_tokens": 10423520.0,
"step": 130
},
{
"epoch": 0.16344354335620712,
"grad_norm": 0.16318291937030596,
"learning_rate": 9.98675081179832e-06,
"loss": 0.071,
"num_tokens": 10502900.0,
"step": 131
},
{
"epoch": 0.16469120399251402,
"grad_norm": 0.2007608468972681,
"learning_rate": 9.986282088482397e-06,
"loss": 0.0767,
"num_tokens": 10583282.0,
"step": 132
},
{
"epoch": 0.16593886462882096,
"grad_norm": 0.16935616163873396,
"learning_rate": 9.985805230181031e-06,
"loss": 0.0749,
"num_tokens": 10662589.0,
"step": 133
},
{
"epoch": 0.1671865252651279,
"grad_norm": 0.171740825216556,
"learning_rate": 9.985320237758918e-06,
"loss": 0.0775,
"num_tokens": 10742307.0,
"step": 134
},
{
"epoch": 0.1684341859014348,
"grad_norm": 0.15828176697546295,
"learning_rate": 9.984827112095495e-06,
"loss": 0.0753,
"num_tokens": 10821872.0,
"step": 135
},
{
"epoch": 0.16968184653774174,
"grad_norm": 0.16873525559334726,
"learning_rate": 9.984325854084946e-06,
"loss": 0.0786,
"num_tokens": 10907937.0,
"step": 136
},
{
"epoch": 0.17092950717404864,
"grad_norm": 0.16380536060958564,
"learning_rate": 9.983816464636203e-06,
"loss": 0.0784,
"num_tokens": 10988958.0,
"step": 137
},
{
"epoch": 0.17217716781035558,
"grad_norm": 0.160963929320833,
"learning_rate": 9.983298944672942e-06,
"loss": 0.0817,
"num_tokens": 11070498.0,
"step": 138
},
{
"epoch": 0.17342482844666252,
"grad_norm": 0.164166377631778,
"learning_rate": 9.982773295133585e-06,
"loss": 0.0754,
"num_tokens": 11150195.0,
"step": 139
},
{
"epoch": 0.17467248908296942,
"grad_norm": 0.18357918297872183,
"learning_rate": 9.982239516971295e-06,
"loss": 0.0783,
"num_tokens": 11231131.0,
"step": 140
},
{
"epoch": 0.17592014971927636,
"grad_norm": 0.15755220760124228,
"learning_rate": 9.98169761115397e-06,
"loss": 0.0786,
"num_tokens": 11311740.0,
"step": 141
},
{
"epoch": 0.1771678103555833,
"grad_norm": 0.16447695668867712,
"learning_rate": 9.98114757866425e-06,
"loss": 0.0784,
"num_tokens": 11392420.0,
"step": 142
},
{
"epoch": 0.1784154709918902,
"grad_norm": 0.15694055102021415,
"learning_rate": 9.980589420499512e-06,
"loss": 0.0841,
"num_tokens": 11472491.0,
"step": 143
},
{
"epoch": 0.17966313162819714,
"grad_norm": 0.1614277416464966,
"learning_rate": 9.980023137671862e-06,
"loss": 0.072,
"num_tokens": 11552759.0,
"step": 144
},
{
"epoch": 0.18091079226450404,
"grad_norm": 0.1581923947848541,
"learning_rate": 9.979448731208145e-06,
"loss": 0.0711,
"num_tokens": 11632535.0,
"step": 145
},
{
"epoch": 0.18215845290081098,
"grad_norm": 0.16298717987923186,
"learning_rate": 9.978866202149931e-06,
"loss": 0.0731,
"num_tokens": 11712101.0,
"step": 146
},
{
"epoch": 0.18340611353711792,
"grad_norm": 0.17368683828180292,
"learning_rate": 9.978275551553526e-06,
"loss": 0.0791,
"num_tokens": 11791963.0,
"step": 147
},
{
"epoch": 0.18465377417342482,
"grad_norm": 0.16144191529310298,
"learning_rate": 9.977676780489953e-06,
"loss": 0.0777,
"num_tokens": 11872345.0,
"step": 148
},
{
"epoch": 0.18590143480973176,
"grad_norm": 0.1651729281394997,
"learning_rate": 9.977069890044965e-06,
"loss": 0.0809,
"num_tokens": 11954469.0,
"step": 149
},
{
"epoch": 0.18714909544603867,
"grad_norm": 0.16467083171937716,
"learning_rate": 9.976454881319041e-06,
"loss": 0.0724,
"num_tokens": 12033673.0,
"step": 150
},
{
"epoch": 0.1883967560823456,
"grad_norm": 0.15270182358741702,
"learning_rate": 9.975831755427376e-06,
"loss": 0.0719,
"num_tokens": 12113393.0,
"step": 151
},
{
"epoch": 0.18964441671865254,
"grad_norm": 0.1552933576757993,
"learning_rate": 9.975200513499886e-06,
"loss": 0.0769,
"num_tokens": 12194535.0,
"step": 152
},
{
"epoch": 0.19089207735495944,
"grad_norm": 0.15708292865325066,
"learning_rate": 9.974561156681203e-06,
"loss": 0.076,
"num_tokens": 12275521.0,
"step": 153
},
{
"epoch": 0.19213973799126638,
"grad_norm": 0.15744491473566746,
"learning_rate": 9.973913686130674e-06,
"loss": 0.0718,
"num_tokens": 12355487.0,
"step": 154
},
{
"epoch": 0.1933873986275733,
"grad_norm": 0.1684494744054102,
"learning_rate": 9.973258103022361e-06,
"loss": 0.077,
"num_tokens": 12435557.0,
"step": 155
},
{
"epoch": 0.19463505926388022,
"grad_norm": 0.17092559295973017,
"learning_rate": 9.97259440854503e-06,
"loss": 0.078,
"num_tokens": 12516421.0,
"step": 156
},
{
"epoch": 0.19588271990018716,
"grad_norm": 0.16467948975450056,
"learning_rate": 9.971922603902164e-06,
"loss": 0.0792,
"num_tokens": 12596956.0,
"step": 157
},
{
"epoch": 0.19713038053649407,
"grad_norm": 0.16129266645735674,
"learning_rate": 9.971242690311944e-06,
"loss": 0.0715,
"num_tokens": 12677329.0,
"step": 158
},
{
"epoch": 0.198378041172801,
"grad_norm": 0.15566121358014243,
"learning_rate": 9.970554669007264e-06,
"loss": 0.071,
"num_tokens": 12757136.0,
"step": 159
},
{
"epoch": 0.19962570180910794,
"grad_norm": 0.15418083568009736,
"learning_rate": 9.969858541235708e-06,
"loss": 0.0707,
"num_tokens": 12837208.0,
"step": 160
},
{
"epoch": 0.20087336244541484,
"grad_norm": 0.15281662930143763,
"learning_rate": 9.969154308259572e-06,
"loss": 0.072,
"num_tokens": 12916423.0,
"step": 161
},
{
"epoch": 0.20212102308172178,
"grad_norm": 0.15806417128594089,
"learning_rate": 9.968441971355839e-06,
"loss": 0.0697,
"num_tokens": 12995763.0,
"step": 162
},
{
"epoch": 0.2033686837180287,
"grad_norm": 0.16247347666054926,
"learning_rate": 9.967721531816194e-06,
"loss": 0.069,
"num_tokens": 13075036.0,
"step": 163
},
{
"epoch": 0.20461634435433562,
"grad_norm": 0.16120489496865675,
"learning_rate": 9.96699299094701e-06,
"loss": 0.0657,
"num_tokens": 13153140.0,
"step": 164
},
{
"epoch": 0.20586400499064256,
"grad_norm": 0.1654367494643062,
"learning_rate": 9.966256350069355e-06,
"loss": 0.0719,
"num_tokens": 13233496.0,
"step": 165
},
{
"epoch": 0.20711166562694946,
"grad_norm": 0.15797090901409847,
"learning_rate": 9.965511610518975e-06,
"loss": 0.0741,
"num_tokens": 13313688.0,
"step": 166
},
{
"epoch": 0.2083593262632564,
"grad_norm": 0.17318092994565018,
"learning_rate": 9.964758773646314e-06,
"loss": 0.0705,
"num_tokens": 13392817.0,
"step": 167
},
{
"epoch": 0.2096069868995633,
"grad_norm": 0.16326342203924862,
"learning_rate": 9.963997840816491e-06,
"loss": 0.0694,
"num_tokens": 13472052.0,
"step": 168
},
{
"epoch": 0.21085464753587024,
"grad_norm": 0.1649032874821036,
"learning_rate": 9.963228813409307e-06,
"loss": 0.0718,
"num_tokens": 13552229.0,
"step": 169
},
{
"epoch": 0.21210230817217718,
"grad_norm": 0.14746862120200496,
"learning_rate": 9.962451692819238e-06,
"loss": 0.0674,
"num_tokens": 13631487.0,
"step": 170
},
{
"epoch": 0.2133499688084841,
"grad_norm": 0.16140552300758973,
"learning_rate": 9.961666480455445e-06,
"loss": 0.0711,
"num_tokens": 13710876.0,
"step": 171
},
{
"epoch": 0.21459762944479102,
"grad_norm": 0.1561744394885393,
"learning_rate": 9.96087317774175e-06,
"loss": 0.0678,
"num_tokens": 13790236.0,
"step": 172
},
{
"epoch": 0.21584529008109793,
"grad_norm": 0.1532654032002478,
"learning_rate": 9.960071786116652e-06,
"loss": 0.0701,
"num_tokens": 13869459.0,
"step": 173
},
{
"epoch": 0.21709295071740486,
"grad_norm": 0.16418271391935854,
"learning_rate": 9.959262307033318e-06,
"loss": 0.0702,
"num_tokens": 13949850.0,
"step": 174
},
{
"epoch": 0.2183406113537118,
"grad_norm": 0.16417748724525755,
"learning_rate": 9.958444741959577e-06,
"loss": 0.0794,
"num_tokens": 14030886.0,
"step": 175
},
{
"epoch": 0.2195882719900187,
"grad_norm": 0.1475411082063503,
"learning_rate": 9.957619092377921e-06,
"loss": 0.0697,
"num_tokens": 14110548.0,
"step": 176
},
{
"epoch": 0.22083593262632564,
"grad_norm": 0.15200242109370143,
"learning_rate": 9.956785359785501e-06,
"loss": 0.0725,
"num_tokens": 14191162.0,
"step": 177
},
{
"epoch": 0.22208359326263255,
"grad_norm": 0.14951724955021337,
"learning_rate": 9.95594354569413e-06,
"loss": 0.0713,
"num_tokens": 14270373.0,
"step": 178
},
{
"epoch": 0.22333125389893949,
"grad_norm": 0.15437761740995362,
"learning_rate": 9.955093651630271e-06,
"loss": 0.0694,
"num_tokens": 14350521.0,
"step": 179
},
{
"epoch": 0.22457891453524642,
"grad_norm": 0.16318758670731984,
"learning_rate": 9.954235679135035e-06,
"loss": 0.0648,
"num_tokens": 14430117.0,
"step": 180
},
{
"epoch": 0.22582657517155333,
"grad_norm": 0.1509213212454131,
"learning_rate": 9.953369629764187e-06,
"loss": 0.0659,
"num_tokens": 14510184.0,
"step": 181
},
{
"epoch": 0.22707423580786026,
"grad_norm": 0.15811847612755178,
"learning_rate": 9.952495505088138e-06,
"loss": 0.069,
"num_tokens": 14589696.0,
"step": 182
},
{
"epoch": 0.2283218964441672,
"grad_norm": 0.17426294289031205,
"learning_rate": 9.95161330669194e-06,
"loss": 0.0721,
"num_tokens": 14669642.0,
"step": 183
},
{
"epoch": 0.2295695570804741,
"grad_norm": 0.17769882981769738,
"learning_rate": 9.950723036175282e-06,
"loss": 0.0703,
"num_tokens": 14749847.0,
"step": 184
},
{
"epoch": 0.23081721771678104,
"grad_norm": 0.1674097835401589,
"learning_rate": 9.9498246951525e-06,
"loss": 0.0747,
"num_tokens": 14829647.0,
"step": 185
},
{
"epoch": 0.23206487835308795,
"grad_norm": 0.15100509686307984,
"learning_rate": 9.948918285252551e-06,
"loss": 0.0745,
"num_tokens": 14910343.0,
"step": 186
},
{
"epoch": 0.23331253898939489,
"grad_norm": 0.15360581031528767,
"learning_rate": 9.948003808119034e-06,
"loss": 0.0714,
"num_tokens": 14990734.0,
"step": 187
},
{
"epoch": 0.23456019962570182,
"grad_norm": 0.16940538023361718,
"learning_rate": 9.94708126541017e-06,
"loss": 0.0684,
"num_tokens": 15070750.0,
"step": 188
},
{
"epoch": 0.23580786026200873,
"grad_norm": 0.15548216290846645,
"learning_rate": 9.94615065879881e-06,
"loss": 0.069,
"num_tokens": 15150020.0,
"step": 189
},
{
"epoch": 0.23705552089831566,
"grad_norm": 0.14993741595806287,
"learning_rate": 9.945211989972425e-06,
"loss": 0.0703,
"num_tokens": 15231989.0,
"step": 190
},
{
"epoch": 0.23830318153462257,
"grad_norm": 0.1474419032756179,
"learning_rate": 9.944265260633105e-06,
"loss": 0.0711,
"num_tokens": 15312107.0,
"step": 191
},
{
"epoch": 0.2395508421709295,
"grad_norm": 0.1601090310495295,
"learning_rate": 9.943310472497556e-06,
"loss": 0.0696,
"num_tokens": 15391755.0,
"step": 192
},
{
"epoch": 0.24079850280723644,
"grad_norm": 0.16372600424581513,
"learning_rate": 9.942347627297095e-06,
"loss": 0.0719,
"num_tokens": 15472703.0,
"step": 193
},
{
"epoch": 0.24204616344354335,
"grad_norm": 0.15885711149872717,
"learning_rate": 9.941376726777656e-06,
"loss": 0.0703,
"num_tokens": 15552902.0,
"step": 194
},
{
"epoch": 0.24329382407985028,
"grad_norm": 0.14374818433177697,
"learning_rate": 9.940397772699773e-06,
"loss": 0.0674,
"num_tokens": 15633405.0,
"step": 195
},
{
"epoch": 0.2445414847161572,
"grad_norm": 0.14857277972441196,
"learning_rate": 9.939410766838586e-06,
"loss": 0.0715,
"num_tokens": 15714298.0,
"step": 196
},
{
"epoch": 0.24578914535246413,
"grad_norm": 0.15728113670824326,
"learning_rate": 9.938415710983834e-06,
"loss": 0.0655,
"num_tokens": 15793415.0,
"step": 197
},
{
"epoch": 0.24703680598877106,
"grad_norm": 0.15942515909588884,
"learning_rate": 9.937412606939854e-06,
"loss": 0.0725,
"num_tokens": 15874136.0,
"step": 198
},
{
"epoch": 0.24828446662507797,
"grad_norm": 0.13933287189995225,
"learning_rate": 9.936401456525578e-06,
"loss": 0.0686,
"num_tokens": 15953965.0,
"step": 199
},
{
"epoch": 0.2495321272613849,
"grad_norm": 0.14826313565340965,
"learning_rate": 9.935382261574527e-06,
"loss": 0.0646,
"num_tokens": 16034405.0,
"step": 200
},
{
"epoch": 0.25077978789769184,
"grad_norm": 0.14090118375202848,
"learning_rate": 9.934355023934808e-06,
"loss": 0.0595,
"num_tokens": 16112634.0,
"step": 201
},
{
"epoch": 0.2520274485339988,
"grad_norm": 0.15692035946489286,
"learning_rate": 9.933319745469117e-06,
"loss": 0.0713,
"num_tokens": 16193908.0,
"step": 202
},
{
"epoch": 0.25327510917030566,
"grad_norm": 0.1656646737047489,
"learning_rate": 9.932276428054723e-06,
"loss": 0.0748,
"num_tokens": 16275142.0,
"step": 203
},
{
"epoch": 0.2545227698066126,
"grad_norm": 0.1680255993051908,
"learning_rate": 9.931225073583476e-06,
"loss": 0.0718,
"num_tokens": 16355412.0,
"step": 204
},
{
"epoch": 0.2557704304429195,
"grad_norm": 0.1646716198896169,
"learning_rate": 9.930165683961803e-06,
"loss": 0.0661,
"num_tokens": 16435568.0,
"step": 205
},
{
"epoch": 0.25701809107922646,
"grad_norm": 0.1565441365205928,
"learning_rate": 9.929098261110694e-06,
"loss": 0.0653,
"num_tokens": 16516116.0,
"step": 206
},
{
"epoch": 0.2582657517155334,
"grad_norm": 0.15611179423469423,
"learning_rate": 9.92802280696571e-06,
"loss": 0.0689,
"num_tokens": 16596327.0,
"step": 207
},
{
"epoch": 0.2595134123518403,
"grad_norm": 0.16424170862613105,
"learning_rate": 9.926939323476976e-06,
"loss": 0.0714,
"num_tokens": 16675904.0,
"step": 208
},
{
"epoch": 0.2607610729881472,
"grad_norm": 0.1496338642109506,
"learning_rate": 9.925847812609174e-06,
"loss": 0.0647,
"num_tokens": 16754483.0,
"step": 209
},
{
"epoch": 0.26200873362445415,
"grad_norm": 0.14882331263651497,
"learning_rate": 9.924748276341541e-06,
"loss": 0.0667,
"num_tokens": 16834373.0,
"step": 210
},
{
"epoch": 0.2632563942607611,
"grad_norm": 0.14757123106619455,
"learning_rate": 9.923640716667872e-06,
"loss": 0.0624,
"num_tokens": 16914378.0,
"step": 211
},
{
"epoch": 0.264504054897068,
"grad_norm": 0.15327214179668633,
"learning_rate": 9.922525135596507e-06,
"loss": 0.0731,
"num_tokens": 16995299.0,
"step": 212
},
{
"epoch": 0.2657517155333749,
"grad_norm": 0.15329303036152214,
"learning_rate": 9.92140153515033e-06,
"loss": 0.0699,
"num_tokens": 17074695.0,
"step": 213
},
{
"epoch": 0.26699937616968183,
"grad_norm": 0.14973454931210878,
"learning_rate": 9.92026991736677e-06,
"loss": 0.0644,
"num_tokens": 17154211.0,
"step": 214
},
{
"epoch": 0.26824703680598877,
"grad_norm": 0.15013258739242202,
"learning_rate": 9.919130284297791e-06,
"loss": 0.0661,
"num_tokens": 17234490.0,
"step": 215
},
{
"epoch": 0.2694946974422957,
"grad_norm": 0.15536001180184006,
"learning_rate": 9.917982638009891e-06,
"loss": 0.0727,
"num_tokens": 17314715.0,
"step": 216
},
{
"epoch": 0.27074235807860264,
"grad_norm": 0.15435069913626115,
"learning_rate": 9.916826980584103e-06,
"loss": 0.0657,
"num_tokens": 17395497.0,
"step": 217
},
{
"epoch": 0.2719900187149095,
"grad_norm": 0.15469096812383332,
"learning_rate": 9.91566331411598e-06,
"loss": 0.0626,
"num_tokens": 17474577.0,
"step": 218
},
{
"epoch": 0.27323767935121646,
"grad_norm": 0.1600274953046015,
"learning_rate": 9.914491640715603e-06,
"loss": 0.0676,
"num_tokens": 17555477.0,
"step": 219
},
{
"epoch": 0.2744853399875234,
"grad_norm": 0.1406463527512994,
"learning_rate": 9.913311962507569e-06,
"loss": 0.0592,
"num_tokens": 17635011.0,
"step": 220
},
{
"epoch": 0.2757330006238303,
"grad_norm": 0.16023616480511607,
"learning_rate": 9.912124281630991e-06,
"loss": 0.069,
"num_tokens": 17714394.0,
"step": 221
},
{
"epoch": 0.27698066126013726,
"grad_norm": 0.14706360693121406,
"learning_rate": 9.910928600239493e-06,
"loss": 0.0672,
"num_tokens": 17795018.0,
"step": 222
},
{
"epoch": 0.27822832189644414,
"grad_norm": 0.15220399575934448,
"learning_rate": 9.909724920501207e-06,
"loss": 0.0657,
"num_tokens": 17874644.0,
"step": 223
},
{
"epoch": 0.2794759825327511,
"grad_norm": 0.15590253629720327,
"learning_rate": 9.90851324459877e-06,
"loss": 0.0692,
"num_tokens": 17954798.0,
"step": 224
},
{
"epoch": 0.280723643169058,
"grad_norm": 0.15484431097082246,
"learning_rate": 9.907293574729317e-06,
"loss": 0.0645,
"num_tokens": 18034230.0,
"step": 225
},
{
"epoch": 0.28197130380536495,
"grad_norm": 0.1449336777934227,
"learning_rate": 9.906065913104474e-06,
"loss": 0.0665,
"num_tokens": 18113515.0,
"step": 226
},
{
"epoch": 0.2832189644416719,
"grad_norm": 0.14201172287080596,
"learning_rate": 9.904830261950366e-06,
"loss": 0.0615,
"num_tokens": 18193510.0,
"step": 227
},
{
"epoch": 0.28446662507797876,
"grad_norm": 0.1422769788921952,
"learning_rate": 9.903586623507603e-06,
"loss": 0.0584,
"num_tokens": 18273926.0,
"step": 228
},
{
"epoch": 0.2857142857142857,
"grad_norm": 0.1567259496080909,
"learning_rate": 9.902335000031273e-06,
"loss": 0.0622,
"num_tokens": 18354062.0,
"step": 229
},
{
"epoch": 0.28696194635059263,
"grad_norm": 0.1534870201147952,
"learning_rate": 9.901075393790953e-06,
"loss": 0.0666,
"num_tokens": 18434373.0,
"step": 230
},
{
"epoch": 0.28820960698689957,
"grad_norm": 0.14650419642450943,
"learning_rate": 9.899807807070684e-06,
"loss": 0.0635,
"num_tokens": 18513274.0,
"step": 231
},
{
"epoch": 0.2894572676232065,
"grad_norm": 0.14778200561533844,
"learning_rate": 9.898532242168987e-06,
"loss": 0.0615,
"num_tokens": 18592541.0,
"step": 232
},
{
"epoch": 0.29070492825951344,
"grad_norm": 0.14114387988548585,
"learning_rate": 9.897248701398848e-06,
"loss": 0.0588,
"num_tokens": 18671892.0,
"step": 233
},
{
"epoch": 0.2919525888958203,
"grad_norm": 0.16178673205752156,
"learning_rate": 9.895957187087713e-06,
"loss": 0.0635,
"num_tokens": 18751886.0,
"step": 234
},
{
"epoch": 0.29320024953212726,
"grad_norm": 0.14871524378309123,
"learning_rate": 9.894657701577488e-06,
"loss": 0.0658,
"num_tokens": 18832351.0,
"step": 235
},
{
"epoch": 0.2944479101684342,
"grad_norm": 0.1548609260643329,
"learning_rate": 9.893350247224532e-06,
"loss": 0.0675,
"num_tokens": 18912756.0,
"step": 236
},
{
"epoch": 0.2956955708047411,
"grad_norm": 0.1507526419619637,
"learning_rate": 9.892034826399657e-06,
"loss": 0.0601,
"num_tokens": 18993197.0,
"step": 237
},
{
"epoch": 0.29694323144104806,
"grad_norm": 0.156742989410175,
"learning_rate": 9.890711441488117e-06,
"loss": 0.0669,
"num_tokens": 19074321.0,
"step": 238
},
{
"epoch": 0.29819089207735494,
"grad_norm": 0.15435161952476084,
"learning_rate": 9.889380094889609e-06,
"loss": 0.0659,
"num_tokens": 19153748.0,
"step": 239
},
{
"epoch": 0.2994385527136619,
"grad_norm": 0.14843331611676852,
"learning_rate": 9.888040789018267e-06,
"loss": 0.0608,
"num_tokens": 19232902.0,
"step": 240
},
{
"epoch": 0.3006862133499688,
"grad_norm": 0.15991403846542185,
"learning_rate": 9.886693526302657e-06,
"loss": 0.0699,
"num_tokens": 19314211.0,
"step": 241
},
{
"epoch": 0.30193387398627575,
"grad_norm": 0.14343130811237734,
"learning_rate": 9.885338309185775e-06,
"loss": 0.0593,
"num_tokens": 19393486.0,
"step": 242
},
{
"epoch": 0.3031815346225827,
"grad_norm": 0.14976887951301165,
"learning_rate": 9.883975140125035e-06,
"loss": 0.063,
"num_tokens": 19474289.0,
"step": 243
},
{
"epoch": 0.30442919525888956,
"grad_norm": 0.15391003667967385,
"learning_rate": 9.88260402159228e-06,
"loss": 0.0682,
"num_tokens": 19554117.0,
"step": 244
},
{
"epoch": 0.3056768558951965,
"grad_norm": 0.1558428875604292,
"learning_rate": 9.88122495607376e-06,
"loss": 0.0653,
"num_tokens": 19634331.0,
"step": 245
},
{
"epoch": 0.30692451653150343,
"grad_norm": 0.15426290191639358,
"learning_rate": 9.879837946070138e-06,
"loss": 0.0593,
"num_tokens": 19713085.0,
"step": 246
},
{
"epoch": 0.30817217716781037,
"grad_norm": 0.14767406997007507,
"learning_rate": 9.878442994096481e-06,
"loss": 0.0578,
"num_tokens": 19792400.0,
"step": 247
},
{
"epoch": 0.3094198378041173,
"grad_norm": 0.14741065221457425,
"learning_rate": 9.87704010268226e-06,
"loss": 0.0644,
"num_tokens": 19873279.0,
"step": 248
},
{
"epoch": 0.3106674984404242,
"grad_norm": 0.1529316875881466,
"learning_rate": 9.87562927437134e-06,
"loss": 0.0628,
"num_tokens": 19953306.0,
"step": 249
},
{
"epoch": 0.3119151590767311,
"grad_norm": 0.15478552257088712,
"learning_rate": 9.87421051172198e-06,
"loss": 0.0656,
"num_tokens": 20033613.0,
"step": 250
},
{
"epoch": 0.31316281971303805,
"grad_norm": 0.1469468822446913,
"learning_rate": 9.872783817306827e-06,
"loss": 0.0617,
"num_tokens": 20113360.0,
"step": 251
},
{
"epoch": 0.314410480349345,
"grad_norm": 0.1539836863937035,
"learning_rate": 9.871349193712905e-06,
"loss": 0.0654,
"num_tokens": 20192829.0,
"step": 252
},
{
"epoch": 0.3156581409856519,
"grad_norm": 0.15325635042763217,
"learning_rate": 9.869906643541625e-06,
"loss": 0.0577,
"num_tokens": 20271785.0,
"step": 253
},
{
"epoch": 0.3169058016219588,
"grad_norm": 0.14852576321041067,
"learning_rate": 9.868456169408763e-06,
"loss": 0.0591,
"num_tokens": 20351250.0,
"step": 254
},
{
"epoch": 0.31815346225826574,
"grad_norm": 0.14853921696894695,
"learning_rate": 9.866997773944469e-06,
"loss": 0.0649,
"num_tokens": 20432591.0,
"step": 255
},
{
"epoch": 0.3194011228945727,
"grad_norm": 0.15093199632607562,
"learning_rate": 9.865531459793254e-06,
"loss": 0.0616,
"num_tokens": 20511910.0,
"step": 256
},
{
"epoch": 0.3206487835308796,
"grad_norm": 0.14360562555088788,
"learning_rate": 9.864057229613988e-06,
"loss": 0.0587,
"num_tokens": 20591960.0,
"step": 257
},
{
"epoch": 0.32189644416718655,
"grad_norm": 0.15453328885789705,
"learning_rate": 9.862575086079897e-06,
"loss": 0.0646,
"num_tokens": 20673004.0,
"step": 258
},
{
"epoch": 0.3231441048034934,
"grad_norm": 0.14025760262079745,
"learning_rate": 9.861085031878556e-06,
"loss": 0.0566,
"num_tokens": 20751778.0,
"step": 259
},
{
"epoch": 0.32439176543980036,
"grad_norm": 0.154221736145205,
"learning_rate": 9.859587069711883e-06,
"loss": 0.0677,
"num_tokens": 20833796.0,
"step": 260
},
{
"epoch": 0.3256394260761073,
"grad_norm": 0.16473194510891925,
"learning_rate": 9.858081202296133e-06,
"loss": 0.0628,
"num_tokens": 20913685.0,
"step": 261
},
{
"epoch": 0.32688708671241423,
"grad_norm": 0.1477312778238374,
"learning_rate": 9.856567432361903e-06,
"loss": 0.0608,
"num_tokens": 20995255.0,
"step": 262
},
{
"epoch": 0.32813474734872117,
"grad_norm": 0.15295805096246043,
"learning_rate": 9.855045762654115e-06,
"loss": 0.0631,
"num_tokens": 21077034.0,
"step": 263
},
{
"epoch": 0.32938240798502805,
"grad_norm": 0.1447398547624195,
"learning_rate": 9.853516195932014e-06,
"loss": 0.0579,
"num_tokens": 21156349.0,
"step": 264
},
{
"epoch": 0.330630068621335,
"grad_norm": 0.14589297009135765,
"learning_rate": 9.851978734969168e-06,
"loss": 0.0583,
"num_tokens": 21236413.0,
"step": 265
},
{
"epoch": 0.3318777292576419,
"grad_norm": 0.143764406382231,
"learning_rate": 9.850433382553457e-06,
"loss": 0.062,
"num_tokens": 21318138.0,
"step": 266
},
{
"epoch": 0.33312538989394885,
"grad_norm": 0.16011843739089876,
"learning_rate": 9.848880141487076e-06,
"loss": 0.0695,
"num_tokens": 21399813.0,
"step": 267
},
{
"epoch": 0.3343730505302558,
"grad_norm": 0.14438380491620342,
"learning_rate": 9.847319014586517e-06,
"loss": 0.0598,
"num_tokens": 21482627.0,
"step": 268
},
{
"epoch": 0.33562071116656267,
"grad_norm": 0.14004050344567048,
"learning_rate": 9.845750004682576e-06,
"loss": 0.0591,
"num_tokens": 21561686.0,
"step": 269
},
{
"epoch": 0.3368683718028696,
"grad_norm": 0.15403062197709771,
"learning_rate": 9.844173114620342e-06,
"loss": 0.0595,
"num_tokens": 21641830.0,
"step": 270
},
{
"epoch": 0.33811603243917654,
"grad_norm": 0.15466717690067627,
"learning_rate": 9.842588347259192e-06,
"loss": 0.0568,
"num_tokens": 21721224.0,
"step": 271
},
{
"epoch": 0.3393636930754835,
"grad_norm": 0.13856966189349137,
"learning_rate": 9.84099570547279e-06,
"loss": 0.0598,
"num_tokens": 21801699.0,
"step": 272
},
{
"epoch": 0.3406113537117904,
"grad_norm": 0.15061899204640403,
"learning_rate": 9.839395192149077e-06,
"loss": 0.0591,
"num_tokens": 21881771.0,
"step": 273
},
{
"epoch": 0.3418590143480973,
"grad_norm": 0.14228872056371486,
"learning_rate": 9.837786810190268e-06,
"loss": 0.0608,
"num_tokens": 21961323.0,
"step": 274
},
{
"epoch": 0.3431066749844042,
"grad_norm": 0.16567790027277843,
"learning_rate": 9.836170562512844e-06,
"loss": 0.0608,
"num_tokens": 22041013.0,
"step": 275
},
{
"epoch": 0.34435433562071116,
"grad_norm": 0.14395086051397554,
"learning_rate": 9.83454645204755e-06,
"loss": 0.0581,
"num_tokens": 22119949.0,
"step": 276
},
{
"epoch": 0.3456019962570181,
"grad_norm": 0.1516359303870716,
"learning_rate": 9.832914481739391e-06,
"loss": 0.0634,
"num_tokens": 22200178.0,
"step": 277
},
{
"epoch": 0.34684965689332503,
"grad_norm": 0.16105443817398513,
"learning_rate": 9.831274654547623e-06,
"loss": 0.0626,
"num_tokens": 22279674.0,
"step": 278
},
{
"epoch": 0.34809731752963197,
"grad_norm": 0.1590038648782175,
"learning_rate": 9.829626973445745e-06,
"loss": 0.0635,
"num_tokens": 22360285.0,
"step": 279
},
{
"epoch": 0.34934497816593885,
"grad_norm": 0.14937561999750967,
"learning_rate": 9.827971441421504e-06,
"loss": 0.0613,
"num_tokens": 22442292.0,
"step": 280
},
{
"epoch": 0.3505926388022458,
"grad_norm": 0.1474165532101385,
"learning_rate": 9.826308061476878e-06,
"loss": 0.0565,
"num_tokens": 22521661.0,
"step": 281
},
{
"epoch": 0.3518402994385527,
"grad_norm": 0.15035582630080754,
"learning_rate": 9.824636836628078e-06,
"loss": 0.0624,
"num_tokens": 22601095.0,
"step": 282
},
{
"epoch": 0.35308796007485965,
"grad_norm": 0.14438687507583545,
"learning_rate": 9.822957769905544e-06,
"loss": 0.0573,
"num_tokens": 22681539.0,
"step": 283
},
{
"epoch": 0.3543356207111666,
"grad_norm": 0.14358016685030078,
"learning_rate": 9.821270864353924e-06,
"loss": 0.0532,
"num_tokens": 22760996.0,
"step": 284
},
{
"epoch": 0.35558328134747347,
"grad_norm": 0.14707615741317387,
"learning_rate": 9.819576123032092e-06,
"loss": 0.0603,
"num_tokens": 22841748.0,
"step": 285
},
{
"epoch": 0.3568309419837804,
"grad_norm": 0.14812718809993014,
"learning_rate": 9.817873549013127e-06,
"loss": 0.0592,
"num_tokens": 22922667.0,
"step": 286
},
{
"epoch": 0.35807860262008734,
"grad_norm": 0.1649467927673721,
"learning_rate": 9.816163145384308e-06,
"loss": 0.0554,
"num_tokens": 23002249.0,
"step": 287
},
{
"epoch": 0.3593262632563943,
"grad_norm": 0.15358093663493297,
"learning_rate": 9.814444915247115e-06,
"loss": 0.0632,
"num_tokens": 23083209.0,
"step": 288
},
{
"epoch": 0.3605739238927012,
"grad_norm": 0.15692040771846372,
"learning_rate": 9.81271886171722e-06,
"loss": 0.0592,
"num_tokens": 23163343.0,
"step": 289
},
{
"epoch": 0.3618215845290081,
"grad_norm": 0.1580641913112163,
"learning_rate": 9.810984987924477e-06,
"loss": 0.0579,
"num_tokens": 23243258.0,
"step": 290
},
{
"epoch": 0.363069245165315,
"grad_norm": 0.147171020680957,
"learning_rate": 9.809243297012923e-06,
"loss": 0.0622,
"num_tokens": 23325453.0,
"step": 291
},
{
"epoch": 0.36431690580162196,
"grad_norm": 0.15029752697042956,
"learning_rate": 9.807493792140774e-06,
"loss": 0.0575,
"num_tokens": 23406618.0,
"step": 292
},
{
"epoch": 0.3655645664379289,
"grad_norm": 0.1394169020427776,
"learning_rate": 9.805736476480407e-06,
"loss": 0.0565,
"num_tokens": 23485923.0,
"step": 293
},
{
"epoch": 0.36681222707423583,
"grad_norm": 0.14818497445465476,
"learning_rate": 9.803971353218367e-06,
"loss": 0.0526,
"num_tokens": 23565444.0,
"step": 294
},
{
"epoch": 0.3680598877105427,
"grad_norm": 0.14971154575642173,
"learning_rate": 9.802198425555358e-06,
"loss": 0.0636,
"num_tokens": 23647126.0,
"step": 295
},
{
"epoch": 0.36930754834684965,
"grad_norm": 0.16149541778586904,
"learning_rate": 9.800417696706234e-06,
"loss": 0.0636,
"num_tokens": 23727284.0,
"step": 296
},
{
"epoch": 0.3705552089831566,
"grad_norm": 0.14820265948993624,
"learning_rate": 9.798629169899992e-06,
"loss": 0.056,
"num_tokens": 23807739.0,
"step": 297
},
{
"epoch": 0.3718028696194635,
"grad_norm": 0.15131350009349487,
"learning_rate": 9.796832848379775e-06,
"loss": 0.0591,
"num_tokens": 23887839.0,
"step": 298
},
{
"epoch": 0.37305053025577045,
"grad_norm": 0.14986708525327327,
"learning_rate": 9.795028735402853e-06,
"loss": 0.0598,
"num_tokens": 23968437.0,
"step": 299
},
{
"epoch": 0.37429819089207733,
"grad_norm": 0.15808081173486005,
"learning_rate": 9.79321683424063e-06,
"loss": 0.0571,
"num_tokens": 24047656.0,
"step": 300
},
{
"epoch": 0.37554585152838427,
"grad_norm": 0.14958052977119102,
"learning_rate": 9.791397148178632e-06,
"loss": 0.0554,
"num_tokens": 24127038.0,
"step": 301
},
{
"epoch": 0.3767935121646912,
"grad_norm": 0.152499732455963,
"learning_rate": 9.789569680516497e-06,
"loss": 0.057,
"num_tokens": 24207038.0,
"step": 302
},
{
"epoch": 0.37804117280099814,
"grad_norm": 0.14559783006117594,
"learning_rate": 9.78773443456798e-06,
"loss": 0.0621,
"num_tokens": 24287925.0,
"step": 303
},
{
"epoch": 0.3792888334373051,
"grad_norm": 0.1452527426722152,
"learning_rate": 9.785891413660931e-06,
"loss": 0.0548,
"num_tokens": 24367076.0,
"step": 304
},
{
"epoch": 0.38053649407361195,
"grad_norm": 0.14896918649468194,
"learning_rate": 9.784040621137308e-06,
"loss": 0.0547,
"num_tokens": 24446865.0,
"step": 305
},
{
"epoch": 0.3817841547099189,
"grad_norm": 0.1390970861000212,
"learning_rate": 9.78218206035316e-06,
"loss": 0.0545,
"num_tokens": 24526102.0,
"step": 306
},
{
"epoch": 0.3830318153462258,
"grad_norm": 0.15617459548241075,
"learning_rate": 9.780315734678612e-06,
"loss": 0.0619,
"num_tokens": 24606039.0,
"step": 307
},
{
"epoch": 0.38427947598253276,
"grad_norm": 0.14418794697812448,
"learning_rate": 9.778441647497882e-06,
"loss": 0.0565,
"num_tokens": 24685400.0,
"step": 308
},
{
"epoch": 0.3855271366188397,
"grad_norm": 0.14701646779160854,
"learning_rate": 9.776559802209255e-06,
"loss": 0.0579,
"num_tokens": 24765381.0,
"step": 309
},
{
"epoch": 0.3867747972551466,
"grad_norm": 0.1631595272698652,
"learning_rate": 9.774670202225084e-06,
"loss": 0.0615,
"num_tokens": 24845699.0,
"step": 310
},
{
"epoch": 0.3880224578914535,
"grad_norm": 0.1545918579556817,
"learning_rate": 9.772772850971788e-06,
"loss": 0.0645,
"num_tokens": 24925987.0,
"step": 311
},
{
"epoch": 0.38927011852776044,
"grad_norm": 0.1441116111201304,
"learning_rate": 9.770867751889837e-06,
"loss": 0.0572,
"num_tokens": 25005272.0,
"step": 312
},
{
"epoch": 0.3905177791640674,
"grad_norm": 0.13781567161395594,
"learning_rate": 9.76895490843375e-06,
"loss": 0.0557,
"num_tokens": 25085235.0,
"step": 313
},
{
"epoch": 0.3917654398003743,
"grad_norm": 0.14477971945325008,
"learning_rate": 9.767034324072091e-06,
"loss": 0.0574,
"num_tokens": 25164489.0,
"step": 314
},
{
"epoch": 0.3930131004366812,
"grad_norm": 0.15372533695861323,
"learning_rate": 9.76510600228746e-06,
"loss": 0.0573,
"num_tokens": 25243328.0,
"step": 315
},
{
"epoch": 0.39426076107298813,
"grad_norm": 0.1501403033694261,
"learning_rate": 9.763169946576488e-06,
"loss": 0.0612,
"num_tokens": 25323644.0,
"step": 316
},
{
"epoch": 0.39550842170929507,
"grad_norm": 0.15008446129724604,
"learning_rate": 9.76122616044983e-06,
"loss": 0.0601,
"num_tokens": 25402837.0,
"step": 317
},
{
"epoch": 0.396756082345602,
"grad_norm": 0.15958664875841405,
"learning_rate": 9.759274647432156e-06,
"loss": 0.055,
"num_tokens": 25481850.0,
"step": 318
},
{
"epoch": 0.39800374298190894,
"grad_norm": 0.15101688409063593,
"learning_rate": 9.75731541106215e-06,
"loss": 0.0561,
"num_tokens": 25562517.0,
"step": 319
},
{
"epoch": 0.39925140361821587,
"grad_norm": 0.15012352120224384,
"learning_rate": 9.755348454892498e-06,
"loss": 0.0559,
"num_tokens": 25642299.0,
"step": 320
},
{
"epoch": 0.40049906425452275,
"grad_norm": 0.15137439858210344,
"learning_rate": 9.753373782489887e-06,
"loss": 0.0576,
"num_tokens": 25722205.0,
"step": 321
},
{
"epoch": 0.4017467248908297,
"grad_norm": 0.1506690147271384,
"learning_rate": 9.751391397434996e-06,
"loss": 0.0577,
"num_tokens": 25801893.0,
"step": 322
},
{
"epoch": 0.4029943855271366,
"grad_norm": 0.16103781809588258,
"learning_rate": 9.74940130332249e-06,
"loss": 0.0591,
"num_tokens": 25882061.0,
"step": 323
},
{
"epoch": 0.40424204616344356,
"grad_norm": 0.14444822336784965,
"learning_rate": 9.747403503761006e-06,
"loss": 0.0525,
"num_tokens": 25960184.0,
"step": 324
},
{
"epoch": 0.4054897067997505,
"grad_norm": 0.15166828709851793,
"learning_rate": 9.74539800237316e-06,
"loss": 0.0565,
"num_tokens": 26040338.0,
"step": 325
},
{
"epoch": 0.4067373674360574,
"grad_norm": 0.1504474187463855,
"learning_rate": 9.743384802795535e-06,
"loss": 0.0597,
"num_tokens": 26121905.0,
"step": 326
},
{
"epoch": 0.4079850280723643,
"grad_norm": 0.13344139836523075,
"learning_rate": 9.741363908678669e-06,
"loss": 0.0546,
"num_tokens": 26202255.0,
"step": 327
},
{
"epoch": 0.40923268870867124,
"grad_norm": 0.14377591000899273,
"learning_rate": 9.739335323687052e-06,
"loss": 0.0628,
"num_tokens": 26283068.0,
"step": 328
},
{
"epoch": 0.4104803493449782,
"grad_norm": 0.15940785319665565,
"learning_rate": 9.737299051499125e-06,
"loss": 0.0563,
"num_tokens": 26363408.0,
"step": 329
},
{
"epoch": 0.4117280099812851,
"grad_norm": 0.14599902917885316,
"learning_rate": 9.735255095807263e-06,
"loss": 0.059,
"num_tokens": 26444199.0,
"step": 330
},
{
"epoch": 0.412975670617592,
"grad_norm": 0.1351984864753291,
"learning_rate": 9.733203460317777e-06,
"loss": 0.0584,
"num_tokens": 26524527.0,
"step": 331
},
{
"epoch": 0.41422333125389893,
"grad_norm": 0.1397398203048671,
"learning_rate": 9.731144148750898e-06,
"loss": 0.0502,
"num_tokens": 26604842.0,
"step": 332
},
{
"epoch": 0.41547099189020587,
"grad_norm": 0.1502653858474547,
"learning_rate": 9.729077164840784e-06,
"loss": 0.0617,
"num_tokens": 26685166.0,
"step": 333
},
{
"epoch": 0.4167186525265128,
"grad_norm": 0.15884597841687997,
"learning_rate": 9.727002512335502e-06,
"loss": 0.0579,
"num_tokens": 26766562.0,
"step": 334
},
{
"epoch": 0.41796631316281974,
"grad_norm": 0.15085968982512207,
"learning_rate": 9.724920194997022e-06,
"loss": 0.0563,
"num_tokens": 26845821.0,
"step": 335
},
{
"epoch": 0.4192139737991266,
"grad_norm": 0.14636626725897792,
"learning_rate": 9.722830216601217e-06,
"loss": 0.0573,
"num_tokens": 26925802.0,
"step": 336
},
{
"epoch": 0.42046163443543355,
"grad_norm": 0.15213895413855882,
"learning_rate": 9.720732580937848e-06,
"loss": 0.0573,
"num_tokens": 27004757.0,
"step": 337
},
{
"epoch": 0.4217092950717405,
"grad_norm": 0.15478235225939369,
"learning_rate": 9.718627291810561e-06,
"loss": 0.0558,
"num_tokens": 27085488.0,
"step": 338
},
{
"epoch": 0.4229569557080474,
"grad_norm": 0.14333668524575108,
"learning_rate": 9.716514353036884e-06,
"loss": 0.0529,
"num_tokens": 27165382.0,
"step": 339
},
{
"epoch": 0.42420461634435436,
"grad_norm": 0.13816120021039435,
"learning_rate": 9.714393768448214e-06,
"loss": 0.055,
"num_tokens": 27244847.0,
"step": 340
},
{
"epoch": 0.42545227698066124,
"grad_norm": 0.1516209836461398,
"learning_rate": 9.712265541889809e-06,
"loss": 0.0571,
"num_tokens": 27326623.0,
"step": 341
},
{
"epoch": 0.4266999376169682,
"grad_norm": 0.14387239194414297,
"learning_rate": 9.710129677220788e-06,
"loss": 0.057,
"num_tokens": 27408759.0,
"step": 342
},
{
"epoch": 0.4279475982532751,
"grad_norm": 0.14913332717460548,
"learning_rate": 9.707986178314123e-06,
"loss": 0.0604,
"num_tokens": 27489615.0,
"step": 343
},
{
"epoch": 0.42919525888958204,
"grad_norm": 0.14600137001097122,
"learning_rate": 9.705835049056621e-06,
"loss": 0.0577,
"num_tokens": 27569276.0,
"step": 344
},
{
"epoch": 0.430442919525889,
"grad_norm": 0.16253517587195007,
"learning_rate": 9.70367629334893e-06,
"loss": 0.0593,
"num_tokens": 27650089.0,
"step": 345
},
{
"epoch": 0.43169058016219586,
"grad_norm": 0.15144799683797194,
"learning_rate": 9.701509915105527e-06,
"loss": 0.0551,
"num_tokens": 27731133.0,
"step": 346
},
{
"epoch": 0.4329382407985028,
"grad_norm": 0.1457681651429185,
"learning_rate": 9.699335918254714e-06,
"loss": 0.0539,
"num_tokens": 27810902.0,
"step": 347
},
{
"epoch": 0.43418590143480973,
"grad_norm": 0.1463053074053871,
"learning_rate": 9.6971543067386e-06,
"loss": 0.0588,
"num_tokens": 27892340.0,
"step": 348
},
{
"epoch": 0.43543356207111666,
"grad_norm": 0.1520505948725208,
"learning_rate": 9.694965084513106e-06,
"loss": 0.0562,
"num_tokens": 27973118.0,
"step": 349
},
{
"epoch": 0.4366812227074236,
"grad_norm": 0.15820851987269624,
"learning_rate": 9.692768255547957e-06,
"loss": 0.0567,
"num_tokens": 28053723.0,
"step": 350
},
{
"epoch": 0.4379288833437305,
"grad_norm": 0.1515133004587777,
"learning_rate": 9.690563823826666e-06,
"loss": 0.06,
"num_tokens": 28134449.0,
"step": 351
},
{
"epoch": 0.4391765439800374,
"grad_norm": 0.14702095630330445,
"learning_rate": 9.688351793346533e-06,
"loss": 0.0561,
"num_tokens": 28214373.0,
"step": 352
},
{
"epoch": 0.44042420461634435,
"grad_norm": 0.14263520426345258,
"learning_rate": 9.68613216811864e-06,
"loss": 0.0581,
"num_tokens": 28294827.0,
"step": 353
},
{
"epoch": 0.4416718652526513,
"grad_norm": 0.15433805512072737,
"learning_rate": 9.683904952167837e-06,
"loss": 0.0536,
"num_tokens": 28374441.0,
"step": 354
},
{
"epoch": 0.4429195258889582,
"grad_norm": 0.14704915295293733,
"learning_rate": 9.681670149532739e-06,
"loss": 0.0535,
"num_tokens": 28452997.0,
"step": 355
},
{
"epoch": 0.4441671865252651,
"grad_norm": 0.14891895587500484,
"learning_rate": 9.67942776426572e-06,
"loss": 0.0576,
"num_tokens": 28533011.0,
"step": 356
},
{
"epoch": 0.44541484716157204,
"grad_norm": 0.14800382217071825,
"learning_rate": 9.677177800432903e-06,
"loss": 0.056,
"num_tokens": 28612714.0,
"step": 357
},
{
"epoch": 0.44666250779787897,
"grad_norm": 0.1484302906683845,
"learning_rate": 9.67492026211415e-06,
"loss": 0.0541,
"num_tokens": 28692174.0,
"step": 358
},
{
"epoch": 0.4479101684341859,
"grad_norm": 0.1473133403449387,
"learning_rate": 9.672655153403064e-06,
"loss": 0.0556,
"num_tokens": 28771843.0,
"step": 359
},
{
"epoch": 0.44915782907049284,
"grad_norm": 0.13463944988028403,
"learning_rate": 9.670382478406967e-06,
"loss": 0.0562,
"num_tokens": 28851787.0,
"step": 360
},
{
"epoch": 0.4504054897067998,
"grad_norm": 0.14571672370885358,
"learning_rate": 9.66810224124691e-06,
"loss": 0.0561,
"num_tokens": 28931886.0,
"step": 361
},
{
"epoch": 0.45165315034310666,
"grad_norm": 0.13982592828240858,
"learning_rate": 9.665814446057652e-06,
"loss": 0.0527,
"num_tokens": 29011364.0,
"step": 362
},
{
"epoch": 0.4529008109794136,
"grad_norm": 0.1468605138111856,
"learning_rate": 9.663519096987653e-06,
"loss": 0.0569,
"num_tokens": 29091203.0,
"step": 363
},
{
"epoch": 0.45414847161572053,
"grad_norm": 0.15284313337192354,
"learning_rate": 9.661216198199078e-06,
"loss": 0.0553,
"num_tokens": 29172304.0,
"step": 364
},
{
"epoch": 0.45539613225202746,
"grad_norm": 0.15300814933585585,
"learning_rate": 9.658905753867778e-06,
"loss": 0.0608,
"num_tokens": 29252117.0,
"step": 365
},
{
"epoch": 0.4566437928883344,
"grad_norm": 0.16189213439882416,
"learning_rate": 9.656587768183287e-06,
"loss": 0.0546,
"num_tokens": 29332772.0,
"step": 366
},
{
"epoch": 0.4578914535246413,
"grad_norm": 0.14196790848023397,
"learning_rate": 9.654262245348813e-06,
"loss": 0.0517,
"num_tokens": 29414702.0,
"step": 367
},
{
"epoch": 0.4591391141609482,
"grad_norm": 0.14672195499400306,
"learning_rate": 9.651929189581233e-06,
"loss": 0.0595,
"num_tokens": 29495754.0,
"step": 368
},
{
"epoch": 0.46038677479725515,
"grad_norm": 0.14633512516697522,
"learning_rate": 9.649588605111082e-06,
"loss": 0.0554,
"num_tokens": 29575118.0,
"step": 369
},
{
"epoch": 0.4616344354335621,
"grad_norm": 0.14073644904651714,
"learning_rate": 9.647240496182545e-06,
"loss": 0.0556,
"num_tokens": 29655784.0,
"step": 370
},
{
"epoch": 0.462882096069869,
"grad_norm": 0.15125788764011439,
"learning_rate": 9.644884867053455e-06,
"loss": 0.0549,
"num_tokens": 29736212.0,
"step": 371
},
{
"epoch": 0.4641297567061759,
"grad_norm": 0.1498730140799902,
"learning_rate": 9.64252172199528e-06,
"loss": 0.0567,
"num_tokens": 29816732.0,
"step": 372
},
{
"epoch": 0.46537741734248284,
"grad_norm": 0.13559146848095813,
"learning_rate": 9.640151065293117e-06,
"loss": 0.0546,
"num_tokens": 29896880.0,
"step": 373
},
{
"epoch": 0.46662507797878977,
"grad_norm": 0.1640344913303149,
"learning_rate": 9.63777290124568e-06,
"loss": 0.0701,
"num_tokens": 29980295.0,
"step": 374
},
{
"epoch": 0.4678727386150967,
"grad_norm": 0.14536897470480084,
"learning_rate": 9.635387234165303e-06,
"loss": 0.0564,
"num_tokens": 30061645.0,
"step": 375
},
{
"epoch": 0.46912039925140364,
"grad_norm": 0.13332481481430616,
"learning_rate": 9.632994068377916e-06,
"loss": 0.0485,
"num_tokens": 30141179.0,
"step": 376
},
{
"epoch": 0.4703680598877105,
"grad_norm": 0.15492389456107522,
"learning_rate": 9.63059340822306e-06,
"loss": 0.0559,
"num_tokens": 30220539.0,
"step": 377
},
{
"epoch": 0.47161572052401746,
"grad_norm": 0.13840934949647143,
"learning_rate": 9.628185258053852e-06,
"loss": 0.0566,
"num_tokens": 30301422.0,
"step": 378
},
{
"epoch": 0.4728633811603244,
"grad_norm": 0.14291506474305504,
"learning_rate": 9.625769622236995e-06,
"loss": 0.0545,
"num_tokens": 30381142.0,
"step": 379
},
{
"epoch": 0.4741110417966313,
"grad_norm": 0.14000086512375753,
"learning_rate": 9.623346505152771e-06,
"loss": 0.0508,
"num_tokens": 30460521.0,
"step": 380
},
{
"epoch": 0.47535870243293826,
"grad_norm": 0.14400483775828635,
"learning_rate": 9.620915911195021e-06,
"loss": 0.0504,
"num_tokens": 30539451.0,
"step": 381
},
{
"epoch": 0.47660636306924514,
"grad_norm": 0.15315155933409735,
"learning_rate": 9.618477844771147e-06,
"loss": 0.0558,
"num_tokens": 30618847.0,
"step": 382
},
{
"epoch": 0.4778540237055521,
"grad_norm": 0.13374765928710877,
"learning_rate": 9.6160323103021e-06,
"loss": 0.0584,
"num_tokens": 30697880.0,
"step": 383
},
{
"epoch": 0.479101684341859,
"grad_norm": 0.13578880192469092,
"learning_rate": 9.613579312222377e-06,
"loss": 0.0495,
"num_tokens": 30776740.0,
"step": 384
},
{
"epoch": 0.48034934497816595,
"grad_norm": 0.14830879266525465,
"learning_rate": 9.611118854979998e-06,
"loss": 0.0588,
"num_tokens": 30858807.0,
"step": 385
},
{
"epoch": 0.4815970056144729,
"grad_norm": 0.13811959093377515,
"learning_rate": 9.608650943036522e-06,
"loss": 0.0563,
"num_tokens": 30938478.0,
"step": 386
},
{
"epoch": 0.48284466625077976,
"grad_norm": 0.152755680501879,
"learning_rate": 9.606175580867016e-06,
"loss": 0.0564,
"num_tokens": 31019990.0,
"step": 387
},
{
"epoch": 0.4840923268870867,
"grad_norm": 0.14052581035511705,
"learning_rate": 9.60369277296006e-06,
"loss": 0.0559,
"num_tokens": 31100307.0,
"step": 388
},
{
"epoch": 0.48533998752339363,
"grad_norm": 0.1486718670745852,
"learning_rate": 9.601202523817735e-06,
"loss": 0.0573,
"num_tokens": 31179820.0,
"step": 389
},
{
"epoch": 0.48658764815970057,
"grad_norm": 0.14296008439795943,
"learning_rate": 9.598704837955618e-06,
"loss": 0.0492,
"num_tokens": 31258626.0,
"step": 390
},
{
"epoch": 0.4878353087960075,
"grad_norm": 0.15097962419838978,
"learning_rate": 9.596199719902765e-06,
"loss": 0.0541,
"num_tokens": 31339814.0,
"step": 391
},
{
"epoch": 0.4890829694323144,
"grad_norm": 0.1379963039474492,
"learning_rate": 9.593687174201715e-06,
"loss": 0.053,
"num_tokens": 31419517.0,
"step": 392
},
{
"epoch": 0.4903306300686213,
"grad_norm": 0.13238251476670143,
"learning_rate": 9.59116720540847e-06,
"loss": 0.0492,
"num_tokens": 31498667.0,
"step": 393
},
{
"epoch": 0.49157829070492826,
"grad_norm": 0.1577098734056635,
"learning_rate": 9.588639818092498e-06,
"loss": 0.0574,
"num_tokens": 31579341.0,
"step": 394
},
{
"epoch": 0.4928259513412352,
"grad_norm": 0.1400848617210015,
"learning_rate": 9.586105016836713e-06,
"loss": 0.0547,
"num_tokens": 31660051.0,
"step": 395
},
{
"epoch": 0.4940736119775421,
"grad_norm": 0.14347417674815124,
"learning_rate": 9.58356280623748e-06,
"loss": 0.0561,
"num_tokens": 31740543.0,
"step": 396
},
{
"epoch": 0.495321272613849,
"grad_norm": 0.146326148226546,
"learning_rate": 9.58101319090459e-06,
"loss": 0.0534,
"num_tokens": 31821119.0,
"step": 397
},
{
"epoch": 0.49656893325015594,
"grad_norm": 0.13794827612031768,
"learning_rate": 9.578456175461272e-06,
"loss": 0.0517,
"num_tokens": 31901059.0,
"step": 398
},
{
"epoch": 0.4978165938864629,
"grad_norm": 0.1335465918733181,
"learning_rate": 9.575891764544162e-06,
"loss": 0.0501,
"num_tokens": 31981392.0,
"step": 399
},
{
"epoch": 0.4990642545227698,
"grad_norm": 0.14668973583020378,
"learning_rate": 9.573319962803317e-06,
"loss": 0.0534,
"num_tokens": 32061494.0,
"step": 400
},
{
"epoch": 0.5003119151590767,
"grad_norm": 0.15441275896158793,
"learning_rate": 9.570740774902189e-06,
"loss": 0.0616,
"num_tokens": 32142304.0,
"step": 401
},
{
"epoch": 0.5015595757953837,
"grad_norm": 0.14955081145369936,
"learning_rate": 9.568154205517623e-06,
"loss": 0.0522,
"num_tokens": 32222526.0,
"step": 402
},
{
"epoch": 0.5028072364316906,
"grad_norm": 0.1403875094386206,
"learning_rate": 9.565560259339856e-06,
"loss": 0.0485,
"num_tokens": 32302513.0,
"step": 403
},
{
"epoch": 0.5040548970679976,
"grad_norm": 0.13683297238023728,
"learning_rate": 9.562958941072491e-06,
"loss": 0.0536,
"num_tokens": 32382934.0,
"step": 404
},
{
"epoch": 0.5053025577043044,
"grad_norm": 0.14711235416131968,
"learning_rate": 9.560350255432508e-06,
"loss": 0.0542,
"num_tokens": 32464107.0,
"step": 405
},
{
"epoch": 0.5065502183406113,
"grad_norm": 0.14916320835261607,
"learning_rate": 9.557734207150243e-06,
"loss": 0.0539,
"num_tokens": 32543923.0,
"step": 406
},
{
"epoch": 0.5077978789769183,
"grad_norm": 0.13980467839315625,
"learning_rate": 9.55511080096938e-06,
"loss": 0.049,
"num_tokens": 32623438.0,
"step": 407
},
{
"epoch": 0.5090455396132252,
"grad_norm": 0.15152035984072354,
"learning_rate": 9.552480041646949e-06,
"loss": 0.055,
"num_tokens": 32703162.0,
"step": 408
},
{
"epoch": 0.5102932002495322,
"grad_norm": 0.1388088923573849,
"learning_rate": 9.549841933953308e-06,
"loss": 0.0487,
"num_tokens": 32782526.0,
"step": 409
},
{
"epoch": 0.511540860885839,
"grad_norm": 0.1384190124870507,
"learning_rate": 9.547196482672148e-06,
"loss": 0.0571,
"num_tokens": 32863740.0,
"step": 410
},
{
"epoch": 0.5127885215221459,
"grad_norm": 0.14052473157074324,
"learning_rate": 9.544543692600473e-06,
"loss": 0.0534,
"num_tokens": 32942880.0,
"step": 411
},
{
"epoch": 0.5140361821584529,
"grad_norm": 0.13353121925025832,
"learning_rate": 9.541883568548588e-06,
"loss": 0.0504,
"num_tokens": 33022952.0,
"step": 412
},
{
"epoch": 0.5152838427947598,
"grad_norm": 0.14750782511179328,
"learning_rate": 9.539216115340106e-06,
"loss": 0.0523,
"num_tokens": 33101857.0,
"step": 413
},
{
"epoch": 0.5165315034310668,
"grad_norm": 0.14466406906288587,
"learning_rate": 9.536541337811923e-06,
"loss": 0.0558,
"num_tokens": 33181714.0,
"step": 414
},
{
"epoch": 0.5177791640673737,
"grad_norm": 0.15553123717162787,
"learning_rate": 9.533859240814221e-06,
"loss": 0.055,
"num_tokens": 33261500.0,
"step": 415
},
{
"epoch": 0.5190268247036806,
"grad_norm": 0.13310331465324876,
"learning_rate": 9.531169829210452e-06,
"loss": 0.0501,
"num_tokens": 33340399.0,
"step": 416
},
{
"epoch": 0.5202744853399875,
"grad_norm": 0.14079149042098654,
"learning_rate": 9.528473107877333e-06,
"loss": 0.0497,
"num_tokens": 33420893.0,
"step": 417
},
{
"epoch": 0.5215221459762944,
"grad_norm": 0.1401444099939849,
"learning_rate": 9.525769081704835e-06,
"loss": 0.0526,
"num_tokens": 33500831.0,
"step": 418
},
{
"epoch": 0.5227698066126014,
"grad_norm": 0.13674566510520145,
"learning_rate": 9.523057755596174e-06,
"loss": 0.0524,
"num_tokens": 33581334.0,
"step": 419
},
{
"epoch": 0.5240174672489083,
"grad_norm": 0.14204922606844075,
"learning_rate": 9.520339134467803e-06,
"loss": 0.0538,
"num_tokens": 33661326.0,
"step": 420
},
{
"epoch": 0.5252651278852152,
"grad_norm": 0.1558664826050407,
"learning_rate": 9.517613223249402e-06,
"loss": 0.0572,
"num_tokens": 33741261.0,
"step": 421
},
{
"epoch": 0.5265127885215222,
"grad_norm": 0.15461038514035907,
"learning_rate": 9.514880026883877e-06,
"loss": 0.0575,
"num_tokens": 33822933.0,
"step": 422
},
{
"epoch": 0.527760449157829,
"grad_norm": 0.1607295696047527,
"learning_rate": 9.512139550327338e-06,
"loss": 0.0586,
"num_tokens": 33903895.0,
"step": 423
},
{
"epoch": 0.529008109794136,
"grad_norm": 0.1498493476829081,
"learning_rate": 9.509391798549091e-06,
"loss": 0.0579,
"num_tokens": 33984607.0,
"step": 424
},
{
"epoch": 0.5302557704304429,
"grad_norm": 0.14848887896859914,
"learning_rate": 9.50663677653165e-06,
"loss": 0.0526,
"num_tokens": 34064151.0,
"step": 425
},
{
"epoch": 0.5315034310667498,
"grad_norm": 0.14388277136012045,
"learning_rate": 9.503874489270697e-06,
"loss": 0.0507,
"num_tokens": 34144372.0,
"step": 426
},
{
"epoch": 0.5327510917030568,
"grad_norm": 0.14645441650969096,
"learning_rate": 9.501104941775094e-06,
"loss": 0.0582,
"num_tokens": 34224279.0,
"step": 427
},
{
"epoch": 0.5339987523393637,
"grad_norm": 0.14459393929071007,
"learning_rate": 9.49832813906687e-06,
"loss": 0.0536,
"num_tokens": 34303390.0,
"step": 428
},
{
"epoch": 0.5352464129756707,
"grad_norm": 0.12857512112686587,
"learning_rate": 9.495544086181204e-06,
"loss": 0.0485,
"num_tokens": 34383555.0,
"step": 429
},
{
"epoch": 0.5364940736119775,
"grad_norm": 0.1314484816771736,
"learning_rate": 9.49275278816643e-06,
"loss": 0.0498,
"num_tokens": 34463812.0,
"step": 430
},
{
"epoch": 0.5377417342482844,
"grad_norm": 0.13947556078476245,
"learning_rate": 9.489954250084011e-06,
"loss": 0.0545,
"num_tokens": 34544321.0,
"step": 431
},
{
"epoch": 0.5389893948845914,
"grad_norm": 0.1436721443948016,
"learning_rate": 9.487148477008545e-06,
"loss": 0.0525,
"num_tokens": 34624266.0,
"step": 432
},
{
"epoch": 0.5402370555208983,
"grad_norm": 0.13998074131625612,
"learning_rate": 9.484335474027744e-06,
"loss": 0.0514,
"num_tokens": 34704134.0,
"step": 433
},
{
"epoch": 0.5414847161572053,
"grad_norm": 0.14871361303836866,
"learning_rate": 9.481515246242435e-06,
"loss": 0.053,
"num_tokens": 34784190.0,
"step": 434
},
{
"epoch": 0.5427323767935122,
"grad_norm": 0.14624886222954445,
"learning_rate": 9.478687798766544e-06,
"loss": 0.0531,
"num_tokens": 34864130.0,
"step": 435
},
{
"epoch": 0.543980037429819,
"grad_norm": 0.13641332408321277,
"learning_rate": 9.475853136727086e-06,
"loss": 0.0492,
"num_tokens": 34944204.0,
"step": 436
},
{
"epoch": 0.545227698066126,
"grad_norm": 0.14134926386463173,
"learning_rate": 9.473011265264159e-06,
"loss": 0.0526,
"num_tokens": 35024607.0,
"step": 437
},
{
"epoch": 0.5464753587024329,
"grad_norm": 0.14553055880700336,
"learning_rate": 9.470162189530938e-06,
"loss": 0.0534,
"num_tokens": 35104390.0,
"step": 438
},
{
"epoch": 0.5477230193387399,
"grad_norm": 0.1437000622603961,
"learning_rate": 9.467305914693658e-06,
"loss": 0.0513,
"num_tokens": 35184822.0,
"step": 439
},
{
"epoch": 0.5489706799750468,
"grad_norm": 0.14612828451027657,
"learning_rate": 9.464442445931605e-06,
"loss": 0.0497,
"num_tokens": 35264499.0,
"step": 440
},
{
"epoch": 0.5502183406113537,
"grad_norm": 0.14828985114792528,
"learning_rate": 9.461571788437119e-06,
"loss": 0.0533,
"num_tokens": 35345078.0,
"step": 441
},
{
"epoch": 0.5514660012476607,
"grad_norm": 0.1374551082004164,
"learning_rate": 9.458693947415564e-06,
"loss": 0.0466,
"num_tokens": 35424570.0,
"step": 442
},
{
"epoch": 0.5527136618839675,
"grad_norm": 0.147677054111561,
"learning_rate": 9.455808928085339e-06,
"loss": 0.0542,
"num_tokens": 35505946.0,
"step": 443
},
{
"epoch": 0.5539613225202745,
"grad_norm": 0.13849172697877132,
"learning_rate": 9.452916735677857e-06,
"loss": 0.0485,
"num_tokens": 35586346.0,
"step": 444
},
{
"epoch": 0.5552089831565814,
"grad_norm": 0.15297999265119042,
"learning_rate": 9.450017375437534e-06,
"loss": 0.0531,
"num_tokens": 35665891.0,
"step": 445
},
{
"epoch": 0.5564566437928883,
"grad_norm": 0.15501802210849228,
"learning_rate": 9.44711085262179e-06,
"loss": 0.0573,
"num_tokens": 35746479.0,
"step": 446
},
{
"epoch": 0.5577043044291953,
"grad_norm": 0.13088367378601265,
"learning_rate": 9.444197172501025e-06,
"loss": 0.0484,
"num_tokens": 35826159.0,
"step": 447
},
{
"epoch": 0.5589519650655022,
"grad_norm": 0.16063425009788931,
"learning_rate": 9.441276340358624e-06,
"loss": 0.0576,
"num_tokens": 35906837.0,
"step": 448
},
{
"epoch": 0.5601996257018091,
"grad_norm": 0.1307271068905107,
"learning_rate": 9.438348361490938e-06,
"loss": 0.0525,
"num_tokens": 35987331.0,
"step": 449
},
{
"epoch": 0.561447286338116,
"grad_norm": 0.13171725020454944,
"learning_rate": 9.43541324120728e-06,
"loss": 0.0474,
"num_tokens": 36066069.0,
"step": 450
},
{
"epoch": 0.5626949469744229,
"grad_norm": 0.1309581854874849,
"learning_rate": 9.432470984829908e-06,
"loss": 0.0465,
"num_tokens": 36144807.0,
"step": 451
},
{
"epoch": 0.5639426076107299,
"grad_norm": 0.14475267528300384,
"learning_rate": 9.429521597694023e-06,
"loss": 0.0553,
"num_tokens": 36226188.0,
"step": 452
},
{
"epoch": 0.5651902682470368,
"grad_norm": 0.1425625122178402,
"learning_rate": 9.426565085147755e-06,
"loss": 0.0511,
"num_tokens": 36305107.0,
"step": 453
},
{
"epoch": 0.5664379288833438,
"grad_norm": 0.13616136813599408,
"learning_rate": 9.423601452552153e-06,
"loss": 0.0555,
"num_tokens": 36386076.0,
"step": 454
},
{
"epoch": 0.5676855895196506,
"grad_norm": 0.14376718432927843,
"learning_rate": 9.420630705281182e-06,
"loss": 0.0501,
"num_tokens": 36466132.0,
"step": 455
},
{
"epoch": 0.5689332501559575,
"grad_norm": 0.13639207933313066,
"learning_rate": 9.417652848721704e-06,
"loss": 0.0549,
"num_tokens": 36547317.0,
"step": 456
},
{
"epoch": 0.5701809107922645,
"grad_norm": 0.13858764737031773,
"learning_rate": 9.41466788827347e-06,
"loss": 0.0545,
"num_tokens": 36626984.0,
"step": 457
},
{
"epoch": 0.5714285714285714,
"grad_norm": 0.14505131114240344,
"learning_rate": 9.411675829349119e-06,
"loss": 0.0521,
"num_tokens": 36707170.0,
"step": 458
},
{
"epoch": 0.5726762320648784,
"grad_norm": 0.1475234995649541,
"learning_rate": 9.408676677374158e-06,
"loss": 0.0506,
"num_tokens": 36787101.0,
"step": 459
},
{
"epoch": 0.5739238927011853,
"grad_norm": 0.13633501610880747,
"learning_rate": 9.405670437786953e-06,
"loss": 0.0493,
"num_tokens": 36867353.0,
"step": 460
},
{
"epoch": 0.5751715533374921,
"grad_norm": 0.14436718039231583,
"learning_rate": 9.402657116038728e-06,
"loss": 0.0546,
"num_tokens": 36948883.0,
"step": 461
},
{
"epoch": 0.5764192139737991,
"grad_norm": 0.14203475439871666,
"learning_rate": 9.399636717593545e-06,
"loss": 0.0476,
"num_tokens": 37028885.0,
"step": 462
},
{
"epoch": 0.577666874610106,
"grad_norm": 0.14533730733608163,
"learning_rate": 9.3966092479283e-06,
"loss": 0.0516,
"num_tokens": 37108694.0,
"step": 463
},
{
"epoch": 0.578914535246413,
"grad_norm": 0.13808604624352258,
"learning_rate": 9.39357471253271e-06,
"loss": 0.0488,
"num_tokens": 37188517.0,
"step": 464
},
{
"epoch": 0.5801621958827199,
"grad_norm": 0.1461795781132845,
"learning_rate": 9.390533116909305e-06,
"loss": 0.054,
"num_tokens": 37269409.0,
"step": 465
},
{
"epoch": 0.5814098565190269,
"grad_norm": 0.14456406121879972,
"learning_rate": 9.387484466573417e-06,
"loss": 0.0513,
"num_tokens": 37349786.0,
"step": 466
},
{
"epoch": 0.5826575171553338,
"grad_norm": 0.15015842796580345,
"learning_rate": 9.38442876705317e-06,
"loss": 0.0488,
"num_tokens": 37428973.0,
"step": 467
},
{
"epoch": 0.5839051777916406,
"grad_norm": 0.14230447835176302,
"learning_rate": 9.381366023889475e-06,
"loss": 0.052,
"num_tokens": 37509348.0,
"step": 468
},
{
"epoch": 0.5851528384279476,
"grad_norm": 0.1450811926352791,
"learning_rate": 9.378296242636002e-06,
"loss": 0.0545,
"num_tokens": 37590500.0,
"step": 469
},
{
"epoch": 0.5864004990642545,
"grad_norm": 0.13517059402851564,
"learning_rate": 9.375219428859202e-06,
"loss": 0.05,
"num_tokens": 37672765.0,
"step": 470
},
{
"epoch": 0.5876481597005615,
"grad_norm": 0.14229031493976188,
"learning_rate": 9.372135588138262e-06,
"loss": 0.0508,
"num_tokens": 37753260.0,
"step": 471
},
{
"epoch": 0.5888958203368684,
"grad_norm": 0.13875205292970882,
"learning_rate": 9.369044726065121e-06,
"loss": 0.0515,
"num_tokens": 37832754.0,
"step": 472
},
{
"epoch": 0.5901434809731753,
"grad_norm": 0.13790837237053832,
"learning_rate": 9.365946848244445e-06,
"loss": 0.0508,
"num_tokens": 37911845.0,
"step": 473
},
{
"epoch": 0.5913911416094823,
"grad_norm": 0.140360358316561,
"learning_rate": 9.362841960293622e-06,
"loss": 0.0499,
"num_tokens": 37991693.0,
"step": 474
},
{
"epoch": 0.5926388022457891,
"grad_norm": 0.13689888549603996,
"learning_rate": 9.359730067842753e-06,
"loss": 0.0528,
"num_tokens": 38072283.0,
"step": 475
},
{
"epoch": 0.5938864628820961,
"grad_norm": 0.13291064397987987,
"learning_rate": 9.35661117653464e-06,
"loss": 0.0473,
"num_tokens": 38150942.0,
"step": 476
},
{
"epoch": 0.595134123518403,
"grad_norm": 0.1404556622746765,
"learning_rate": 9.353485292024775e-06,
"loss": 0.0554,
"num_tokens": 38232536.0,
"step": 477
},
{
"epoch": 0.5963817841547099,
"grad_norm": 0.13890649535466057,
"learning_rate": 9.35035241998133e-06,
"loss": 0.0487,
"num_tokens": 38312747.0,
"step": 478
},
{
"epoch": 0.5976294447910169,
"grad_norm": 0.13585175607106556,
"learning_rate": 9.347212566085153e-06,
"loss": 0.0512,
"num_tokens": 38392202.0,
"step": 479
},
{
"epoch": 0.5988771054273238,
"grad_norm": 0.14380652025332846,
"learning_rate": 9.344065736029746e-06,
"loss": 0.0542,
"num_tokens": 38472582.0,
"step": 480
},
{
"epoch": 0.6001247660636307,
"grad_norm": 0.1524558297733905,
"learning_rate": 9.34091193552126e-06,
"loss": 0.0522,
"num_tokens": 38553322.0,
"step": 481
},
{
"epoch": 0.6013724266999376,
"grad_norm": 0.1568778299979335,
"learning_rate": 9.337751170278495e-06,
"loss": 0.0541,
"num_tokens": 38633844.0,
"step": 482
},
{
"epoch": 0.6026200873362445,
"grad_norm": 0.14243398281550446,
"learning_rate": 9.334583446032866e-06,
"loss": 0.0528,
"num_tokens": 38713546.0,
"step": 483
},
{
"epoch": 0.6038677479725515,
"grad_norm": 0.14281972240999116,
"learning_rate": 9.331408768528423e-06,
"loss": 0.0504,
"num_tokens": 38792792.0,
"step": 484
},
{
"epoch": 0.6051154086088584,
"grad_norm": 0.1357329955012292,
"learning_rate": 9.328227143521809e-06,
"loss": 0.0511,
"num_tokens": 38872893.0,
"step": 485
},
{
"epoch": 0.6063630692451654,
"grad_norm": 0.12679897769025358,
"learning_rate": 9.325038576782275e-06,
"loss": 0.0476,
"num_tokens": 38952415.0,
"step": 486
},
{
"epoch": 0.6076107298814722,
"grad_norm": 0.1482843741539947,
"learning_rate": 9.321843074091654e-06,
"loss": 0.0524,
"num_tokens": 39033272.0,
"step": 487
},
{
"epoch": 0.6088583905177791,
"grad_norm": 0.14975071143827606,
"learning_rate": 9.318640641244362e-06,
"loss": 0.0488,
"num_tokens": 39111891.0,
"step": 488
},
{
"epoch": 0.6101060511540861,
"grad_norm": 0.1342258373618748,
"learning_rate": 9.315431284047375e-06,
"loss": 0.0505,
"num_tokens": 39192041.0,
"step": 489
},
{
"epoch": 0.611353711790393,
"grad_norm": 0.1285046989054189,
"learning_rate": 9.312215008320228e-06,
"loss": 0.0497,
"num_tokens": 39271930.0,
"step": 490
},
{
"epoch": 0.6126013724267,
"grad_norm": 0.1379379252162495,
"learning_rate": 9.308991819895001e-06,
"loss": 0.0529,
"num_tokens": 39352986.0,
"step": 491
},
{
"epoch": 0.6138490330630069,
"grad_norm": 0.14610092835159,
"learning_rate": 9.30576172461631e-06,
"loss": 0.0642,
"num_tokens": 39433237.0,
"step": 492
},
{
"epoch": 0.6150966936993137,
"grad_norm": 0.13695664278195094,
"learning_rate": 9.302524728341292e-06,
"loss": 0.0526,
"num_tokens": 39514407.0,
"step": 493
},
{
"epoch": 0.6163443543356207,
"grad_norm": 0.14695174818928397,
"learning_rate": 9.299280836939602e-06,
"loss": 0.0527,
"num_tokens": 39594636.0,
"step": 494
},
{
"epoch": 0.6175920149719276,
"grad_norm": 0.13598009895197866,
"learning_rate": 9.296030056293394e-06,
"loss": 0.0464,
"num_tokens": 39674169.0,
"step": 495
},
{
"epoch": 0.6188396756082346,
"grad_norm": 0.14132411207723297,
"learning_rate": 9.292772392297316e-06,
"loss": 0.0518,
"num_tokens": 39754876.0,
"step": 496
},
{
"epoch": 0.6200873362445415,
"grad_norm": 0.13790154255779888,
"learning_rate": 9.289507850858498e-06,
"loss": 0.0481,
"num_tokens": 39835143.0,
"step": 497
},
{
"epoch": 0.6213349968808484,
"grad_norm": 0.136274128237269,
"learning_rate": 9.286236437896538e-06,
"loss": 0.0487,
"num_tokens": 39914731.0,
"step": 498
},
{
"epoch": 0.6225826575171554,
"grad_norm": 0.13960062998109465,
"learning_rate": 9.282958159343502e-06,
"loss": 0.051,
"num_tokens": 39994714.0,
"step": 499
},
{
"epoch": 0.6238303181534622,
"grad_norm": 0.13253521456358003,
"learning_rate": 9.279673021143895e-06,
"loss": 0.0476,
"num_tokens": 40075018.0,
"step": 500
},
{
"epoch": 0.6250779787897692,
"grad_norm": 0.13428832406148858,
"learning_rate": 9.276381029254668e-06,
"loss": 0.0488,
"num_tokens": 40155183.0,
"step": 501
},
{
"epoch": 0.6263256394260761,
"grad_norm": 0.14564770785457162,
"learning_rate": 9.273082189645197e-06,
"loss": 0.053,
"num_tokens": 40236110.0,
"step": 502
},
{
"epoch": 0.627573300062383,
"grad_norm": 0.13321390812262743,
"learning_rate": 9.269776508297272e-06,
"loss": 0.0494,
"num_tokens": 40316306.0,
"step": 503
},
{
"epoch": 0.62882096069869,
"grad_norm": 0.13863622088015634,
"learning_rate": 9.266463991205096e-06,
"loss": 0.0507,
"num_tokens": 40396697.0,
"step": 504
},
{
"epoch": 0.6300686213349969,
"grad_norm": 0.12373924281291247,
"learning_rate": 9.263144644375264e-06,
"loss": 0.0453,
"num_tokens": 40476696.0,
"step": 505
},
{
"epoch": 0.6313162819713038,
"grad_norm": 0.14046491029490663,
"learning_rate": 9.259818473826753e-06,
"loss": 0.0463,
"num_tokens": 40555802.0,
"step": 506
},
{
"epoch": 0.6325639426076107,
"grad_norm": 0.14152092405005381,
"learning_rate": 9.256485485590916e-06,
"loss": 0.0485,
"num_tokens": 40636230.0,
"step": 507
},
{
"epoch": 0.6338116032439176,
"grad_norm": 0.1417103083447363,
"learning_rate": 9.25314568571147e-06,
"loss": 0.0475,
"num_tokens": 40716545.0,
"step": 508
},
{
"epoch": 0.6350592638802246,
"grad_norm": 0.14299937687535086,
"learning_rate": 9.24979908024448e-06,
"loss": 0.05,
"num_tokens": 40795828.0,
"step": 509
},
{
"epoch": 0.6363069245165315,
"grad_norm": 0.17469460318552915,
"learning_rate": 9.246445675258353e-06,
"loss": 0.0485,
"num_tokens": 40876181.0,
"step": 510
},
{
"epoch": 0.6375545851528385,
"grad_norm": 0.1416974505661574,
"learning_rate": 9.243085476833827e-06,
"loss": 0.0486,
"num_tokens": 40956815.0,
"step": 511
},
{
"epoch": 0.6388022457891454,
"grad_norm": 0.13184766297464506,
"learning_rate": 9.239718491063956e-06,
"loss": 0.0452,
"num_tokens": 41035477.0,
"step": 512
},
{
"epoch": 0.6400499064254522,
"grad_norm": 0.14293335757694378,
"learning_rate": 9.236344724054104e-06,
"loss": 0.0481,
"num_tokens": 41114840.0,
"step": 513
},
{
"epoch": 0.6412975670617592,
"grad_norm": 0.1346618868296363,
"learning_rate": 9.232964181921931e-06,
"loss": 0.0469,
"num_tokens": 41194660.0,
"step": 514
},
{
"epoch": 0.6425452276980661,
"grad_norm": 0.14380477524244686,
"learning_rate": 9.22957687079738e-06,
"loss": 0.0503,
"num_tokens": 41274732.0,
"step": 515
},
{
"epoch": 0.6437928883343731,
"grad_norm": 0.1548153138877275,
"learning_rate": 9.22618279682267e-06,
"loss": 0.0534,
"num_tokens": 41355231.0,
"step": 516
},
{
"epoch": 0.64504054897068,
"grad_norm": 0.12901632746395453,
"learning_rate": 9.222781966152284e-06,
"loss": 0.0487,
"num_tokens": 41435607.0,
"step": 517
},
{
"epoch": 0.6462882096069869,
"grad_norm": 0.13407720979842325,
"learning_rate": 9.219374384952955e-06,
"loss": 0.0459,
"num_tokens": 41514560.0,
"step": 518
},
{
"epoch": 0.6475358702432938,
"grad_norm": 0.12463947466497512,
"learning_rate": 9.215960059403657e-06,
"loss": 0.0473,
"num_tokens": 41594491.0,
"step": 519
},
{
"epoch": 0.6487835308796007,
"grad_norm": 0.1279220794254339,
"learning_rate": 9.212538995695597e-06,
"loss": 0.0445,
"num_tokens": 41673375.0,
"step": 520
},
{
"epoch": 0.6500311915159077,
"grad_norm": 0.1422667742985825,
"learning_rate": 9.209111200032197e-06,
"loss": 0.0457,
"num_tokens": 41752931.0,
"step": 521
},
{
"epoch": 0.6512788521522146,
"grad_norm": 0.13547316557303143,
"learning_rate": 9.205676678629084e-06,
"loss": 0.0514,
"num_tokens": 41833139.0,
"step": 522
},
{
"epoch": 0.6525265127885215,
"grad_norm": 0.1294378459458407,
"learning_rate": 9.202235437714085e-06,
"loss": 0.0452,
"num_tokens": 41912284.0,
"step": 523
},
{
"epoch": 0.6537741734248285,
"grad_norm": 0.13400192606267253,
"learning_rate": 9.198787483527211e-06,
"loss": 0.0499,
"num_tokens": 41993387.0,
"step": 524
},
{
"epoch": 0.6550218340611353,
"grad_norm": 0.1374789626351036,
"learning_rate": 9.195332822320643e-06,
"loss": 0.0507,
"num_tokens": 42073397.0,
"step": 525
},
{
"epoch": 0.6562694946974423,
"grad_norm": 0.14254850808958336,
"learning_rate": 9.191871460358727e-06,
"loss": 0.0498,
"num_tokens": 42153736.0,
"step": 526
},
{
"epoch": 0.6575171553337492,
"grad_norm": 0.1492013090788707,
"learning_rate": 9.188403403917959e-06,
"loss": 0.0508,
"num_tokens": 42233576.0,
"step": 527
},
{
"epoch": 0.6587648159700561,
"grad_norm": 0.13528472298759806,
"learning_rate": 9.184928659286972e-06,
"loss": 0.0461,
"num_tokens": 42312824.0,
"step": 528
},
{
"epoch": 0.6600124766063631,
"grad_norm": 0.14462582085339648,
"learning_rate": 9.181447232766531e-06,
"loss": 0.0541,
"num_tokens": 42394280.0,
"step": 529
},
{
"epoch": 0.66126013724267,
"grad_norm": 0.1363211647052413,
"learning_rate": 9.177959130669512e-06,
"loss": 0.0489,
"num_tokens": 42473681.0,
"step": 530
},
{
"epoch": 0.662507797878977,
"grad_norm": 0.13485083339287968,
"learning_rate": 9.174464359320898e-06,
"loss": 0.0471,
"num_tokens": 42552772.0,
"step": 531
},
{
"epoch": 0.6637554585152838,
"grad_norm": 0.1255665390873853,
"learning_rate": 9.170962925057769e-06,
"loss": 0.0472,
"num_tokens": 42632415.0,
"step": 532
},
{
"epoch": 0.6650031191515907,
"grad_norm": 0.1355451583887428,
"learning_rate": 9.167454834229281e-06,
"loss": 0.048,
"num_tokens": 42712289.0,
"step": 533
},
{
"epoch": 0.6662507797878977,
"grad_norm": 0.1350858530193199,
"learning_rate": 9.163940093196663e-06,
"loss": 0.0506,
"num_tokens": 42792392.0,
"step": 534
},
{
"epoch": 0.6674984404242046,
"grad_norm": 0.136136562989076,
"learning_rate": 9.160418708333203e-06,
"loss": 0.0478,
"num_tokens": 42872872.0,
"step": 535
},
{
"epoch": 0.6687461010605116,
"grad_norm": 0.146077765520201,
"learning_rate": 9.156890686024239e-06,
"loss": 0.0498,
"num_tokens": 42953883.0,
"step": 536
},
{
"epoch": 0.6699937616968185,
"grad_norm": 0.12797176999362384,
"learning_rate": 9.153356032667138e-06,
"loss": 0.046,
"num_tokens": 43033437.0,
"step": 537
},
{
"epoch": 0.6712414223331253,
"grad_norm": 0.12545897578255247,
"learning_rate": 9.149814754671296e-06,
"loss": 0.0495,
"num_tokens": 43113703.0,
"step": 538
},
{
"epoch": 0.6724890829694323,
"grad_norm": 0.13008384107108037,
"learning_rate": 9.14626685845812e-06,
"loss": 0.045,
"num_tokens": 43192777.0,
"step": 539
},
{
"epoch": 0.6737367436057392,
"grad_norm": 0.13576570983520106,
"learning_rate": 9.142712350461021e-06,
"loss": 0.0504,
"num_tokens": 43272684.0,
"step": 540
},
{
"epoch": 0.6749844042420462,
"grad_norm": 0.1429459018435017,
"learning_rate": 9.139151237125393e-06,
"loss": 0.052,
"num_tokens": 43354053.0,
"step": 541
},
{
"epoch": 0.6762320648783531,
"grad_norm": 0.11983657237309615,
"learning_rate": 9.135583524908614e-06,
"loss": 0.0441,
"num_tokens": 43435103.0,
"step": 542
},
{
"epoch": 0.67747972551466,
"grad_norm": 0.14035766138431027,
"learning_rate": 9.132009220280021e-06,
"loss": 0.0498,
"num_tokens": 43514717.0,
"step": 543
},
{
"epoch": 0.678727386150967,
"grad_norm": 0.1374244928438365,
"learning_rate": 9.128428329720911e-06,
"loss": 0.0507,
"num_tokens": 43595017.0,
"step": 544
},
{
"epoch": 0.6799750467872738,
"grad_norm": 0.14354836890668102,
"learning_rate": 9.12484085972452e-06,
"loss": 0.0477,
"num_tokens": 43674479.0,
"step": 545
},
{
"epoch": 0.6812227074235808,
"grad_norm": 0.14516354150067737,
"learning_rate": 9.121246816796017e-06,
"loss": 0.0507,
"num_tokens": 43755079.0,
"step": 546
},
{
"epoch": 0.6824703680598877,
"grad_norm": 0.1400453547205689,
"learning_rate": 9.117646207452487e-06,
"loss": 0.0465,
"num_tokens": 43834370.0,
"step": 547
},
{
"epoch": 0.6837180286961946,
"grad_norm": 0.14040265790324466,
"learning_rate": 9.114039038222922e-06,
"loss": 0.045,
"num_tokens": 43914052.0,
"step": 548
},
{
"epoch": 0.6849656893325016,
"grad_norm": 0.14521374141163426,
"learning_rate": 9.110425315648212e-06,
"loss": 0.0489,
"num_tokens": 43993792.0,
"step": 549
},
{
"epoch": 0.6862133499688085,
"grad_norm": 0.142707572861237,
"learning_rate": 9.106805046281127e-06,
"loss": 0.047,
"num_tokens": 44073401.0,
"step": 550
},
{
"epoch": 0.6874610106051154,
"grad_norm": 0.13023090116652908,
"learning_rate": 9.103178236686309e-06,
"loss": 0.0465,
"num_tokens": 44152510.0,
"step": 551
},
{
"epoch": 0.6887086712414223,
"grad_norm": 0.12890465812404028,
"learning_rate": 9.099544893440265e-06,
"loss": 0.0489,
"num_tokens": 44233765.0,
"step": 552
},
{
"epoch": 0.6899563318777293,
"grad_norm": 0.13134548120051465,
"learning_rate": 9.095905023131337e-06,
"loss": 0.0464,
"num_tokens": 44313193.0,
"step": 553
},
{
"epoch": 0.6912039925140362,
"grad_norm": 0.1392402498661132,
"learning_rate": 9.092258632359714e-06,
"loss": 0.0523,
"num_tokens": 44393864.0,
"step": 554
},
{
"epoch": 0.6924516531503431,
"grad_norm": 0.150967684448626,
"learning_rate": 9.088605727737405e-06,
"loss": 0.0491,
"num_tokens": 44472774.0,
"step": 555
},
{
"epoch": 0.6936993137866501,
"grad_norm": 0.13327113149933534,
"learning_rate": 9.08494631588823e-06,
"loss": 0.0479,
"num_tokens": 44552481.0,
"step": 556
},
{
"epoch": 0.6949469744229569,
"grad_norm": 0.15639101173467318,
"learning_rate": 9.08128040344781e-06,
"loss": 0.0481,
"num_tokens": 44632296.0,
"step": 557
},
{
"epoch": 0.6961946350592639,
"grad_norm": 0.15006175503991592,
"learning_rate": 9.077607997063546e-06,
"loss": 0.0513,
"num_tokens": 44711761.0,
"step": 558
},
{
"epoch": 0.6974422956955708,
"grad_norm": 0.13626963289805946,
"learning_rate": 9.073929103394627e-06,
"loss": 0.045,
"num_tokens": 44790965.0,
"step": 559
},
{
"epoch": 0.6986899563318777,
"grad_norm": 0.13584036137479072,
"learning_rate": 9.070243729111998e-06,
"loss": 0.0499,
"num_tokens": 44871764.0,
"step": 560
},
{
"epoch": 0.6999376169681847,
"grad_norm": 0.12333589383269866,
"learning_rate": 9.066551880898356e-06,
"loss": 0.0449,
"num_tokens": 44951455.0,
"step": 561
},
{
"epoch": 0.7011852776044916,
"grad_norm": 0.13494757593024734,
"learning_rate": 9.062853565448137e-06,
"loss": 0.0468,
"num_tokens": 45030780.0,
"step": 562
},
{
"epoch": 0.7024329382407986,
"grad_norm": 0.13272773332650525,
"learning_rate": 9.059148789467508e-06,
"loss": 0.0471,
"num_tokens": 45110544.0,
"step": 563
},
{
"epoch": 0.7036805988771054,
"grad_norm": 0.13404989182349922,
"learning_rate": 9.055437559674343e-06,
"loss": 0.05,
"num_tokens": 45190997.0,
"step": 564
},
{
"epoch": 0.7049282595134123,
"grad_norm": 0.13081012181879706,
"learning_rate": 9.051719882798226e-06,
"loss": 0.0466,
"num_tokens": 45270643.0,
"step": 565
},
{
"epoch": 0.7061759201497193,
"grad_norm": 0.125779618664369,
"learning_rate": 9.047995765580428e-06,
"loss": 0.0464,
"num_tokens": 45351906.0,
"step": 566
},
{
"epoch": 0.7074235807860262,
"grad_norm": 0.1318104484558839,
"learning_rate": 9.044265214773901e-06,
"loss": 0.0485,
"num_tokens": 45431479.0,
"step": 567
},
{
"epoch": 0.7086712414223332,
"grad_norm": 0.13586766601016315,
"learning_rate": 9.040528237143258e-06,
"loss": 0.0508,
"num_tokens": 45511600.0,
"step": 568
},
{
"epoch": 0.7099189020586401,
"grad_norm": 0.1493946566599566,
"learning_rate": 9.036784839464771e-06,
"loss": 0.0482,
"num_tokens": 45591552.0,
"step": 569
},
{
"epoch": 0.7111665626949469,
"grad_norm": 0.13417312397284684,
"learning_rate": 9.033035028526352e-06,
"loss": 0.0424,
"num_tokens": 45670440.0,
"step": 570
},
{
"epoch": 0.7124142233312539,
"grad_norm": 0.13228632139209826,
"learning_rate": 9.029278811127539e-06,
"loss": 0.0462,
"num_tokens": 45750706.0,
"step": 571
},
{
"epoch": 0.7136618839675608,
"grad_norm": 0.12684341302244245,
"learning_rate": 9.025516194079493e-06,
"loss": 0.0447,
"num_tokens": 45830615.0,
"step": 572
},
{
"epoch": 0.7149095446038678,
"grad_norm": 0.14484313305712285,
"learning_rate": 9.021747184204974e-06,
"loss": 0.0502,
"num_tokens": 45912425.0,
"step": 573
},
{
"epoch": 0.7161572052401747,
"grad_norm": 0.13886894398435345,
"learning_rate": 9.017971788338338e-06,
"loss": 0.0519,
"num_tokens": 45994773.0,
"step": 574
},
{
"epoch": 0.7174048658764816,
"grad_norm": 0.13735085430234897,
"learning_rate": 9.014190013325514e-06,
"loss": 0.0486,
"num_tokens": 46075176.0,
"step": 575
},
{
"epoch": 0.7186525265127885,
"grad_norm": 0.14128392290391406,
"learning_rate": 9.010401866024007e-06,
"loss": 0.0463,
"num_tokens": 46155352.0,
"step": 576
},
{
"epoch": 0.7199001871490954,
"grad_norm": 0.129131515512014,
"learning_rate": 9.006607353302874e-06,
"loss": 0.0489,
"num_tokens": 46236369.0,
"step": 577
},
{
"epoch": 0.7211478477854024,
"grad_norm": 0.1388535780094133,
"learning_rate": 9.00280648204271e-06,
"loss": 0.0457,
"num_tokens": 46315492.0,
"step": 578
},
{
"epoch": 0.7223955084217093,
"grad_norm": 0.13207054260238488,
"learning_rate": 8.998999259135648e-06,
"loss": 0.0484,
"num_tokens": 46395923.0,
"step": 579
},
{
"epoch": 0.7236431690580162,
"grad_norm": 0.13220982688358388,
"learning_rate": 8.99518569148533e-06,
"loss": 0.0477,
"num_tokens": 46476015.0,
"step": 580
},
{
"epoch": 0.7248908296943232,
"grad_norm": 0.13584168006721206,
"learning_rate": 8.991365786006908e-06,
"loss": 0.0457,
"num_tokens": 46555654.0,
"step": 581
},
{
"epoch": 0.72613849033063,
"grad_norm": 0.13797132921454844,
"learning_rate": 8.987539549627026e-06,
"loss": 0.0488,
"num_tokens": 46635834.0,
"step": 582
},
{
"epoch": 0.727386150966937,
"grad_norm": 0.12983941108044414,
"learning_rate": 8.983706989283804e-06,
"loss": 0.0439,
"num_tokens": 46715099.0,
"step": 583
},
{
"epoch": 0.7286338116032439,
"grad_norm": 0.12724057093818048,
"learning_rate": 8.979868111926836e-06,
"loss": 0.0453,
"num_tokens": 46794497.0,
"step": 584
},
{
"epoch": 0.7298814722395508,
"grad_norm": 0.14712794756597866,
"learning_rate": 8.976022924517167e-06,
"loss": 0.0523,
"num_tokens": 46875764.0,
"step": 585
},
{
"epoch": 0.7311291328758578,
"grad_norm": 0.1260016012184683,
"learning_rate": 8.972171434027283e-06,
"loss": 0.0467,
"num_tokens": 46954348.0,
"step": 586
},
{
"epoch": 0.7323767935121647,
"grad_norm": 0.13638875135524814,
"learning_rate": 8.968313647441098e-06,
"loss": 0.0495,
"num_tokens": 47035156.0,
"step": 587
},
{
"epoch": 0.7336244541484717,
"grad_norm": 0.13598723506181007,
"learning_rate": 8.964449571753949e-06,
"loss": 0.048,
"num_tokens": 47114933.0,
"step": 588
},
{
"epoch": 0.7348721147847785,
"grad_norm": 0.13957131093949476,
"learning_rate": 8.96057921397257e-06,
"loss": 0.0476,
"num_tokens": 47195699.0,
"step": 589
},
{
"epoch": 0.7361197754210854,
"grad_norm": 0.13018188693022992,
"learning_rate": 8.95670258111509e-06,
"loss": 0.0463,
"num_tokens": 47275340.0,
"step": 590
},
{
"epoch": 0.7373674360573924,
"grad_norm": 0.12927934262951574,
"learning_rate": 8.95281968021102e-06,
"loss": 0.0472,
"num_tokens": 47355190.0,
"step": 591
},
{
"epoch": 0.7386150966936993,
"grad_norm": 0.13407866372594857,
"learning_rate": 8.948930518301228e-06,
"loss": 0.047,
"num_tokens": 47435372.0,
"step": 592
},
{
"epoch": 0.7398627573300063,
"grad_norm": 0.13953419467602407,
"learning_rate": 8.945035102437943e-06,
"loss": 0.0457,
"num_tokens": 47515076.0,
"step": 593
},
{
"epoch": 0.7411104179663132,
"grad_norm": 0.13257678892969102,
"learning_rate": 8.94113343968473e-06,
"loss": 0.0472,
"num_tokens": 47594913.0,
"step": 594
},
{
"epoch": 0.74235807860262,
"grad_norm": 0.13956156798849514,
"learning_rate": 8.937225537116482e-06,
"loss": 0.0499,
"num_tokens": 47674616.0,
"step": 595
},
{
"epoch": 0.743605739238927,
"grad_norm": 0.12720381017536367,
"learning_rate": 8.93331140181941e-06,
"loss": 0.0488,
"num_tokens": 47754372.0,
"step": 596
},
{
"epoch": 0.7448533998752339,
"grad_norm": 0.13600897445694435,
"learning_rate": 8.929391040891022e-06,
"loss": 0.0521,
"num_tokens": 47834920.0,
"step": 597
},
{
"epoch": 0.7461010605115409,
"grad_norm": 0.1392796000732699,
"learning_rate": 8.92546446144012e-06,
"loss": 0.0472,
"num_tokens": 47914382.0,
"step": 598
},
{
"epoch": 0.7473487211478478,
"grad_norm": 0.1385112153330734,
"learning_rate": 8.921531670586778e-06,
"loss": 0.0447,
"num_tokens": 47993846.0,
"step": 599
},
{
"epoch": 0.7485963817841547,
"grad_norm": 0.12658122859507898,
"learning_rate": 8.917592675462333e-06,
"loss": 0.0464,
"num_tokens": 48073646.0,
"step": 600
},
{
"epoch": 0.7498440424204617,
"grad_norm": 0.1419041010382823,
"learning_rate": 8.913647483209376e-06,
"loss": 0.0466,
"num_tokens": 48153764.0,
"step": 601
},
{
"epoch": 0.7510917030567685,
"grad_norm": 0.12852069420000675,
"learning_rate": 8.909696100981734e-06,
"loss": 0.046,
"num_tokens": 48235033.0,
"step": 602
},
{
"epoch": 0.7523393636930755,
"grad_norm": 0.13929453816389586,
"learning_rate": 8.905738535944453e-06,
"loss": 0.0456,
"num_tokens": 48315153.0,
"step": 603
},
{
"epoch": 0.7535870243293824,
"grad_norm": 0.1294892482761023,
"learning_rate": 8.901774795273799e-06,
"loss": 0.0487,
"num_tokens": 48394989.0,
"step": 604
},
{
"epoch": 0.7548346849656893,
"grad_norm": 0.1265516497561457,
"learning_rate": 8.897804886157229e-06,
"loss": 0.0447,
"num_tokens": 48475313.0,
"step": 605
},
{
"epoch": 0.7560823456019963,
"grad_norm": 0.1250026876119754,
"learning_rate": 8.893828815793389e-06,
"loss": 0.0445,
"num_tokens": 48554294.0,
"step": 606
},
{
"epoch": 0.7573300062383032,
"grad_norm": 0.1395621069701436,
"learning_rate": 8.889846591392097e-06,
"loss": 0.045,
"num_tokens": 48633837.0,
"step": 607
},
{
"epoch": 0.7585776668746101,
"grad_norm": 0.1292703654270298,
"learning_rate": 8.88585822017433e-06,
"loss": 0.0438,
"num_tokens": 48712902.0,
"step": 608
},
{
"epoch": 0.759825327510917,
"grad_norm": 0.12675734039745323,
"learning_rate": 8.881863709372207e-06,
"loss": 0.0461,
"num_tokens": 48792522.0,
"step": 609
},
{
"epoch": 0.7610729881472239,
"grad_norm": 0.12830442879917106,
"learning_rate": 8.877863066228987e-06,
"loss": 0.0481,
"num_tokens": 48873543.0,
"step": 610
},
{
"epoch": 0.7623206487835309,
"grad_norm": 0.13244561051443332,
"learning_rate": 8.873856297999045e-06,
"loss": 0.047,
"num_tokens": 48952745.0,
"step": 611
},
{
"epoch": 0.7635683094198378,
"grad_norm": 0.1615375485783518,
"learning_rate": 8.869843411947862e-06,
"loss": 0.0496,
"num_tokens": 49033599.0,
"step": 612
},
{
"epoch": 0.7648159700561448,
"grad_norm": 0.13780128644772976,
"learning_rate": 8.865824415352014e-06,
"loss": 0.0461,
"num_tokens": 49113085.0,
"step": 613
},
{
"epoch": 0.7660636306924516,
"grad_norm": 0.13586985951384964,
"learning_rate": 8.861799315499157e-06,
"loss": 0.046,
"num_tokens": 49191966.0,
"step": 614
},
{
"epoch": 0.7673112913287585,
"grad_norm": 0.1271021845898844,
"learning_rate": 8.85776811968801e-06,
"loss": 0.0435,
"num_tokens": 49271798.0,
"step": 615
},
{
"epoch": 0.7685589519650655,
"grad_norm": 0.13241677295783946,
"learning_rate": 8.853730835228354e-06,
"loss": 0.0462,
"num_tokens": 49351085.0,
"step": 616
},
{
"epoch": 0.7698066126013724,
"grad_norm": 0.1473649979705641,
"learning_rate": 8.849687469441003e-06,
"loss": 0.0512,
"num_tokens": 49432116.0,
"step": 617
},
{
"epoch": 0.7710542732376794,
"grad_norm": 0.13824389423293423,
"learning_rate": 8.845638029657804e-06,
"loss": 0.0437,
"num_tokens": 49511692.0,
"step": 618
},
{
"epoch": 0.7723019338739863,
"grad_norm": 0.14209836646808868,
"learning_rate": 8.841582523221614e-06,
"loss": 0.0491,
"num_tokens": 49592183.0,
"step": 619
},
{
"epoch": 0.7735495945102931,
"grad_norm": 0.12494073093259925,
"learning_rate": 8.83752095748629e-06,
"loss": 0.0456,
"num_tokens": 49672496.0,
"step": 620
},
{
"epoch": 0.7747972551466001,
"grad_norm": 0.1494311981433102,
"learning_rate": 8.833453339816682e-06,
"loss": 0.0544,
"num_tokens": 49752281.0,
"step": 621
},
{
"epoch": 0.776044915782907,
"grad_norm": 0.14004390380525242,
"learning_rate": 8.829379677588607e-06,
"loss": 0.0486,
"num_tokens": 49831426.0,
"step": 622
},
{
"epoch": 0.777292576419214,
"grad_norm": 0.1339521325499603,
"learning_rate": 8.825299978188847e-06,
"loss": 0.0462,
"num_tokens": 49911746.0,
"step": 623
},
{
"epoch": 0.7785402370555209,
"grad_norm": 0.12799070765367507,
"learning_rate": 8.821214249015133e-06,
"loss": 0.0427,
"num_tokens": 49990124.0,
"step": 624
},
{
"epoch": 0.7797878976918278,
"grad_norm": 0.14563763920223902,
"learning_rate": 8.817122497476122e-06,
"loss": 0.0433,
"num_tokens": 50069850.0,
"step": 625
},
{
"epoch": 0.7810355583281348,
"grad_norm": 0.1412962344681427,
"learning_rate": 8.8130247309914e-06,
"loss": 0.0465,
"num_tokens": 50148928.0,
"step": 626
},
{
"epoch": 0.7822832189644416,
"grad_norm": 0.12673793649670753,
"learning_rate": 8.808920956991455e-06,
"loss": 0.0486,
"num_tokens": 50229320.0,
"step": 627
},
{
"epoch": 0.7835308796007486,
"grad_norm": 0.12112230001153174,
"learning_rate": 8.80481118291767e-06,
"loss": 0.0467,
"num_tokens": 50310309.0,
"step": 628
},
{
"epoch": 0.7847785402370555,
"grad_norm": 0.12671902753289974,
"learning_rate": 8.800695416222305e-06,
"loss": 0.0455,
"num_tokens": 50389538.0,
"step": 629
},
{
"epoch": 0.7860262008733624,
"grad_norm": 0.12959073046474556,
"learning_rate": 8.796573664368492e-06,
"loss": 0.0461,
"num_tokens": 50469795.0,
"step": 630
},
{
"epoch": 0.7872738615096694,
"grad_norm": 0.13262046702793368,
"learning_rate": 8.792445934830215e-06,
"loss": 0.0475,
"num_tokens": 50550218.0,
"step": 631
},
{
"epoch": 0.7885215221459763,
"grad_norm": 0.13536729420156757,
"learning_rate": 8.78831223509229e-06,
"loss": 0.0448,
"num_tokens": 50630135.0,
"step": 632
},
{
"epoch": 0.7897691827822833,
"grad_norm": 0.1342625728159916,
"learning_rate": 8.784172572650366e-06,
"loss": 0.0425,
"num_tokens": 50709708.0,
"step": 633
},
{
"epoch": 0.7910168434185901,
"grad_norm": 0.1359058372010772,
"learning_rate": 8.780026955010903e-06,
"loss": 0.0441,
"num_tokens": 50789219.0,
"step": 634
},
{
"epoch": 0.7922645040548971,
"grad_norm": 0.12874679521506371,
"learning_rate": 8.77587538969116e-06,
"loss": 0.0437,
"num_tokens": 50869389.0,
"step": 635
},
{
"epoch": 0.793512164691204,
"grad_norm": 0.1369083708111066,
"learning_rate": 8.771717884219177e-06,
"loss": 0.0493,
"num_tokens": 50949396.0,
"step": 636
},
{
"epoch": 0.7947598253275109,
"grad_norm": 0.14129983597974075,
"learning_rate": 8.767554446133771e-06,
"loss": 0.0451,
"num_tokens": 51029592.0,
"step": 637
},
{
"epoch": 0.7960074859638179,
"grad_norm": 0.1446719706863878,
"learning_rate": 8.763385082984511e-06,
"loss": 0.0506,
"num_tokens": 51110547.0,
"step": 638
},
{
"epoch": 0.7972551466001248,
"grad_norm": 0.13560344797018603,
"learning_rate": 8.759209802331714e-06,
"loss": 0.0472,
"num_tokens": 51191617.0,
"step": 639
},
{
"epoch": 0.7985028072364317,
"grad_norm": 0.15251529644301726,
"learning_rate": 8.755028611746426e-06,
"loss": 0.048,
"num_tokens": 51271860.0,
"step": 640
},
{
"epoch": 0.7997504678727386,
"grad_norm": 0.12215214673056587,
"learning_rate": 8.750841518810407e-06,
"loss": 0.0476,
"num_tokens": 51351464.0,
"step": 641
},
{
"epoch": 0.8009981285090455,
"grad_norm": 0.1295074754396241,
"learning_rate": 8.746648531116126e-06,
"loss": 0.0443,
"num_tokens": 51432841.0,
"step": 642
},
{
"epoch": 0.8022457891453525,
"grad_norm": 0.12895613658486638,
"learning_rate": 8.742449656266733e-06,
"loss": 0.0456,
"num_tokens": 51512108.0,
"step": 643
},
{
"epoch": 0.8034934497816594,
"grad_norm": 0.12440364391857861,
"learning_rate": 8.738244901876061e-06,
"loss": 0.0447,
"num_tokens": 51592279.0,
"step": 644
},
{
"epoch": 0.8047411104179664,
"grad_norm": 0.1326702881926524,
"learning_rate": 8.7340342755686e-06,
"loss": 0.0447,
"num_tokens": 51672173.0,
"step": 645
},
{
"epoch": 0.8059887710542732,
"grad_norm": 0.13915957948227095,
"learning_rate": 8.729817784979485e-06,
"loss": 0.049,
"num_tokens": 51753477.0,
"step": 646
},
{
"epoch": 0.8072364316905801,
"grad_norm": 0.14093605623183406,
"learning_rate": 8.725595437754489e-06,
"loss": 0.0649,
"num_tokens": 51834370.0,
"step": 647
},
{
"epoch": 0.8084840923268871,
"grad_norm": 0.1257616499498055,
"learning_rate": 8.721367241550007e-06,
"loss": 0.0433,
"num_tokens": 51913279.0,
"step": 648
},
{
"epoch": 0.809731752963194,
"grad_norm": 0.14095744719546538,
"learning_rate": 8.717133204033034e-06,
"loss": 0.0418,
"num_tokens": 51991782.0,
"step": 649
},
{
"epoch": 0.810979413599501,
"grad_norm": 0.13700683189419194,
"learning_rate": 8.71289333288116e-06,
"loss": 0.0469,
"num_tokens": 52071863.0,
"step": 650
},
{
"epoch": 0.8122270742358079,
"grad_norm": 0.13167335028118232,
"learning_rate": 8.708647635782553e-06,
"loss": 0.0469,
"num_tokens": 52151659.0,
"step": 651
},
{
"epoch": 0.8134747348721147,
"grad_norm": 0.13212682360066533,
"learning_rate": 8.704396120435944e-06,
"loss": 0.0418,
"num_tokens": 52230329.0,
"step": 652
},
{
"epoch": 0.8147223955084217,
"grad_norm": 0.13460259814153436,
"learning_rate": 8.700138794550617e-06,
"loss": 0.0477,
"num_tokens": 52310621.0,
"step": 653
},
{
"epoch": 0.8159700561447286,
"grad_norm": 0.1262576220568161,
"learning_rate": 8.695875665846392e-06,
"loss": 0.043,
"num_tokens": 52390363.0,
"step": 654
},
{
"epoch": 0.8172177167810356,
"grad_norm": 0.13451324588776792,
"learning_rate": 8.691606742053608e-06,
"loss": 0.0445,
"num_tokens": 52470407.0,
"step": 655
},
{
"epoch": 0.8184653774173425,
"grad_norm": 0.13838060943839892,
"learning_rate": 8.687332030913114e-06,
"loss": 0.0455,
"num_tokens": 52550801.0,
"step": 656
},
{
"epoch": 0.8197130380536494,
"grad_norm": 0.12544846461435052,
"learning_rate": 8.683051540176252e-06,
"loss": 0.0453,
"num_tokens": 52630184.0,
"step": 657
},
{
"epoch": 0.8209606986899564,
"grad_norm": 0.1227925470813221,
"learning_rate": 8.67876527760485e-06,
"loss": 0.0449,
"num_tokens": 52710226.0,
"step": 658
},
{
"epoch": 0.8222083593262632,
"grad_norm": 0.14125944393926942,
"learning_rate": 8.674473250971194e-06,
"loss": 0.0479,
"num_tokens": 52789646.0,
"step": 659
},
{
"epoch": 0.8234560199625702,
"grad_norm": 0.12898991776625454,
"learning_rate": 8.670175468058027e-06,
"loss": 0.0453,
"num_tokens": 52870777.0,
"step": 660
},
{
"epoch": 0.8247036805988771,
"grad_norm": 0.12662076248706405,
"learning_rate": 8.665871936658525e-06,
"loss": 0.0464,
"num_tokens": 52950874.0,
"step": 661
},
{
"epoch": 0.825951341235184,
"grad_norm": 0.1246338732142322,
"learning_rate": 8.661562664576297e-06,
"loss": 0.0449,
"num_tokens": 53030308.0,
"step": 662
},
{
"epoch": 0.827199001871491,
"grad_norm": 0.12386450841807353,
"learning_rate": 8.65724765962535e-06,
"loss": 0.0453,
"num_tokens": 53110905.0,
"step": 663
},
{
"epoch": 0.8284466625077979,
"grad_norm": 0.1252925833827586,
"learning_rate": 8.652926929630097e-06,
"loss": 0.0448,
"num_tokens": 53190924.0,
"step": 664
},
{
"epoch": 0.8296943231441049,
"grad_norm": 0.13199276345367864,
"learning_rate": 8.648600482425325e-06,
"loss": 0.0469,
"num_tokens": 53271193.0,
"step": 665
},
{
"epoch": 0.8309419837804117,
"grad_norm": 0.1226282221225014,
"learning_rate": 8.644268325856193e-06,
"loss": 0.0434,
"num_tokens": 53350537.0,
"step": 666
},
{
"epoch": 0.8321896444167186,
"grad_norm": 0.12053686374626792,
"learning_rate": 8.639930467778206e-06,
"loss": 0.0438,
"num_tokens": 53432655.0,
"step": 667
},
{
"epoch": 0.8334373050530256,
"grad_norm": 0.12451975349671367,
"learning_rate": 8.635586916057214e-06,
"loss": 0.0445,
"num_tokens": 53512112.0,
"step": 668
},
{
"epoch": 0.8346849656893325,
"grad_norm": 0.12083730426276222,
"learning_rate": 8.631237678569391e-06,
"loss": 0.0462,
"num_tokens": 53593075.0,
"step": 669
},
{
"epoch": 0.8359326263256395,
"grad_norm": 0.12769247867470077,
"learning_rate": 8.626882763201215e-06,
"loss": 0.0429,
"num_tokens": 53672672.0,
"step": 670
},
{
"epoch": 0.8371802869619464,
"grad_norm": 0.1238630262805468,
"learning_rate": 8.62252217784947e-06,
"loss": 0.0427,
"num_tokens": 53751669.0,
"step": 671
},
{
"epoch": 0.8384279475982532,
"grad_norm": 0.1391950829315506,
"learning_rate": 8.61815593042121e-06,
"loss": 0.0437,
"num_tokens": 53831916.0,
"step": 672
},
{
"epoch": 0.8396756082345602,
"grad_norm": 0.12832813628154105,
"learning_rate": 8.61378402883376e-06,
"loss": 0.0454,
"num_tokens": 53911127.0,
"step": 673
},
{
"epoch": 0.8409232688708671,
"grad_norm": 0.12869289586194874,
"learning_rate": 8.609406481014704e-06,
"loss": 0.0493,
"num_tokens": 53992015.0,
"step": 674
},
{
"epoch": 0.8421709295071741,
"grad_norm": 0.1295243174606551,
"learning_rate": 8.605023294901857e-06,
"loss": 0.0453,
"num_tokens": 54074688.0,
"step": 675
},
{
"epoch": 0.843418590143481,
"grad_norm": 0.14327157160951173,
"learning_rate": 8.600634478443262e-06,
"loss": 0.0475,
"num_tokens": 54154762.0,
"step": 676
},
{
"epoch": 0.8446662507797879,
"grad_norm": 0.13689413605347608,
"learning_rate": 8.596240039597168e-06,
"loss": 0.0487,
"num_tokens": 54234083.0,
"step": 677
},
{
"epoch": 0.8459139114160948,
"grad_norm": 0.12476227243203361,
"learning_rate": 8.59183998633202e-06,
"loss": 0.043,
"num_tokens": 54313838.0,
"step": 678
},
{
"epoch": 0.8471615720524017,
"grad_norm": 0.12843919174639454,
"learning_rate": 8.587434326626446e-06,
"loss": 0.0439,
"num_tokens": 54393140.0,
"step": 679
},
{
"epoch": 0.8484092326887087,
"grad_norm": 0.13686412036721382,
"learning_rate": 8.58302306846924e-06,
"loss": 0.0501,
"num_tokens": 54474299.0,
"step": 680
},
{
"epoch": 0.8496568933250156,
"grad_norm": 0.13617664891691592,
"learning_rate": 8.57860621985934e-06,
"loss": 0.0445,
"num_tokens": 54553494.0,
"step": 681
},
{
"epoch": 0.8509045539613225,
"grad_norm": 0.1271657490150743,
"learning_rate": 8.574183788805838e-06,
"loss": 0.044,
"num_tokens": 54633507.0,
"step": 682
},
{
"epoch": 0.8521522145976295,
"grad_norm": 0.13384106872272458,
"learning_rate": 8.56975578332793e-06,
"loss": 0.0443,
"num_tokens": 54713372.0,
"step": 683
},
{
"epoch": 0.8533998752339363,
"grad_norm": 0.11560773243774969,
"learning_rate": 8.56532221145493e-06,
"loss": 0.0429,
"num_tokens": 54792488.0,
"step": 684
},
{
"epoch": 0.8546475358702433,
"grad_norm": 0.1212428276738136,
"learning_rate": 8.560883081226246e-06,
"loss": 0.0443,
"num_tokens": 54873959.0,
"step": 685
},
{
"epoch": 0.8558951965065502,
"grad_norm": 0.14025896849225925,
"learning_rate": 8.55643840069136e-06,
"loss": 0.0466,
"num_tokens": 54954820.0,
"step": 686
},
{
"epoch": 0.8571428571428571,
"grad_norm": 0.13808021632034995,
"learning_rate": 8.551988177909825e-06,
"loss": 0.0462,
"num_tokens": 55035474.0,
"step": 687
},
{
"epoch": 0.8583905177791641,
"grad_norm": 0.14688685012575073,
"learning_rate": 8.547532420951236e-06,
"loss": 0.0474,
"num_tokens": 55115987.0,
"step": 688
},
{
"epoch": 0.859638178415471,
"grad_norm": 0.13519377075289518,
"learning_rate": 8.543071137895231e-06,
"loss": 0.0486,
"num_tokens": 55195473.0,
"step": 689
},
{
"epoch": 0.860885839051778,
"grad_norm": 0.1383342982212882,
"learning_rate": 8.538604336831463e-06,
"loss": 0.0456,
"num_tokens": 55274897.0,
"step": 690
},
{
"epoch": 0.8621334996880848,
"grad_norm": 0.12302270516387563,
"learning_rate": 8.53413202585959e-06,
"loss": 0.042,
"num_tokens": 55354684.0,
"step": 691
},
{
"epoch": 0.8633811603243917,
"grad_norm": 0.12034339097373062,
"learning_rate": 8.529654213089266e-06,
"loss": 0.0416,
"num_tokens": 55434479.0,
"step": 692
},
{
"epoch": 0.8646288209606987,
"grad_norm": 0.12441949815381428,
"learning_rate": 8.52517090664012e-06,
"loss": 0.0434,
"num_tokens": 55513816.0,
"step": 693
},
{
"epoch": 0.8658764815970056,
"grad_norm": 0.13357252884483028,
"learning_rate": 8.520682114641739e-06,
"loss": 0.0441,
"num_tokens": 55593435.0,
"step": 694
},
{
"epoch": 0.8671241422333126,
"grad_norm": 0.12424156938040647,
"learning_rate": 8.51618784523366e-06,
"loss": 0.0445,
"num_tokens": 55673866.0,
"step": 695
},
{
"epoch": 0.8683718028696195,
"grad_norm": 0.13488286343188147,
"learning_rate": 8.511688106565356e-06,
"loss": 0.0462,
"num_tokens": 55754183.0,
"step": 696
},
{
"epoch": 0.8696194635059263,
"grad_norm": 0.12902239020926456,
"learning_rate": 8.507182906796209e-06,
"loss": 0.0479,
"num_tokens": 55835122.0,
"step": 697
},
{
"epoch": 0.8708671241422333,
"grad_norm": 0.12700265076298542,
"learning_rate": 8.50267225409551e-06,
"loss": 0.0487,
"num_tokens": 55916059.0,
"step": 698
},
{
"epoch": 0.8721147847785402,
"grad_norm": 0.12439806501667146,
"learning_rate": 8.498156156642434e-06,
"loss": 0.0482,
"num_tokens": 55996309.0,
"step": 699
},
{
"epoch": 0.8733624454148472,
"grad_norm": 0.13662835946719407,
"learning_rate": 8.493634622626031e-06,
"loss": 0.0487,
"num_tokens": 56076989.0,
"step": 700
},
{
"epoch": 0.8746101060511541,
"grad_norm": 0.13097355494242618,
"learning_rate": 8.489107660245208e-06,
"loss": 0.0455,
"num_tokens": 56156600.0,
"step": 701
},
{
"epoch": 0.875857766687461,
"grad_norm": 0.12619811815079715,
"learning_rate": 8.484575277708718e-06,
"loss": 0.0482,
"num_tokens": 56237662.0,
"step": 702
},
{
"epoch": 0.877105427323768,
"grad_norm": 0.13272981560160393,
"learning_rate": 8.480037483235142e-06,
"loss": 0.0443,
"num_tokens": 56318037.0,
"step": 703
},
{
"epoch": 0.8783530879600748,
"grad_norm": 0.13876316053346943,
"learning_rate": 8.475494285052873e-06,
"loss": 0.0433,
"num_tokens": 56397397.0,
"step": 704
},
{
"epoch": 0.8796007485963818,
"grad_norm": 0.12990102861221764,
"learning_rate": 8.470945691400095e-06,
"loss": 0.0491,
"num_tokens": 56478651.0,
"step": 705
},
{
"epoch": 0.8808484092326887,
"grad_norm": 0.12177730095213436,
"learning_rate": 8.466391710524792e-06,
"loss": 0.0454,
"num_tokens": 56559439.0,
"step": 706
},
{
"epoch": 0.8820960698689956,
"grad_norm": 0.13073319119141943,
"learning_rate": 8.461832350684701e-06,
"loss": 0.044,
"num_tokens": 56639419.0,
"step": 707
},
{
"epoch": 0.8833437305053026,
"grad_norm": 0.13039116346051194,
"learning_rate": 8.457267620147326e-06,
"loss": 0.0479,
"num_tokens": 56720177.0,
"step": 708
},
{
"epoch": 0.8845913911416095,
"grad_norm": 0.11392658067382258,
"learning_rate": 8.452697527189901e-06,
"loss": 0.0417,
"num_tokens": 56799870.0,
"step": 709
},
{
"epoch": 0.8858390517779164,
"grad_norm": 0.12172332991038214,
"learning_rate": 8.448122080099384e-06,
"loss": 0.0436,
"num_tokens": 56879737.0,
"step": 710
},
{
"epoch": 0.8870867124142233,
"grad_norm": 0.12348030891715099,
"learning_rate": 8.443541287172443e-06,
"loss": 0.0464,
"num_tokens": 56960161.0,
"step": 711
},
{
"epoch": 0.8883343730505302,
"grad_norm": 0.1367197777768623,
"learning_rate": 8.438955156715443e-06,
"loss": 0.0472,
"num_tokens": 57040569.0,
"step": 712
},
{
"epoch": 0.8895820336868372,
"grad_norm": 0.12968055372030557,
"learning_rate": 8.434363697044423e-06,
"loss": 0.0475,
"num_tokens": 57120700.0,
"step": 713
},
{
"epoch": 0.8908296943231441,
"grad_norm": 0.10708595724786292,
"learning_rate": 8.429766916485087e-06,
"loss": 0.0398,
"num_tokens": 57199169.0,
"step": 714
},
{
"epoch": 0.8920773549594511,
"grad_norm": 0.1289490659534409,
"learning_rate": 8.42516482337279e-06,
"loss": 0.0419,
"num_tokens": 57278512.0,
"step": 715
},
{
"epoch": 0.8933250155957579,
"grad_norm": 0.13480571497635496,
"learning_rate": 8.420557426052513e-06,
"loss": 0.0432,
"num_tokens": 57358212.0,
"step": 716
},
{
"epoch": 0.8945726762320648,
"grad_norm": 0.1402455870237538,
"learning_rate": 8.415944732878863e-06,
"loss": 0.0413,
"num_tokens": 57437516.0,
"step": 717
},
{
"epoch": 0.8958203368683718,
"grad_norm": 0.13195092592426141,
"learning_rate": 8.411326752216048e-06,
"loss": 0.0469,
"num_tokens": 57518129.0,
"step": 718
},
{
"epoch": 0.8970679975046787,
"grad_norm": 0.12111826822571906,
"learning_rate": 8.406703492437863e-06,
"loss": 0.0464,
"num_tokens": 57598680.0,
"step": 719
},
{
"epoch": 0.8983156581409857,
"grad_norm": 0.1316599285653508,
"learning_rate": 8.402074961927674e-06,
"loss": 0.0467,
"num_tokens": 57679535.0,
"step": 720
},
{
"epoch": 0.8995633187772926,
"grad_norm": 0.12648224011362696,
"learning_rate": 8.397441169078404e-06,
"loss": 0.0474,
"num_tokens": 57761803.0,
"step": 721
},
{
"epoch": 0.9008109794135996,
"grad_norm": 0.13349369075280076,
"learning_rate": 8.392802122292522e-06,
"loss": 0.0453,
"num_tokens": 57841711.0,
"step": 722
},
{
"epoch": 0.9020586400499064,
"grad_norm": 0.12412030984325427,
"learning_rate": 8.388157829982023e-06,
"loss": 0.0462,
"num_tokens": 57921862.0,
"step": 723
},
{
"epoch": 0.9033063006862133,
"grad_norm": 0.12191290188001579,
"learning_rate": 8.383508300568409e-06,
"loss": 0.0423,
"num_tokens": 58001170.0,
"step": 724
},
{
"epoch": 0.9045539613225203,
"grad_norm": 0.12947888801849444,
"learning_rate": 8.378853542482687e-06,
"loss": 0.0444,
"num_tokens": 58078720.0,
"step": 725
},
{
"epoch": 0.9058016219588272,
"grad_norm": 0.12187109531791596,
"learning_rate": 8.374193564165338e-06,
"loss": 0.0417,
"num_tokens": 58158057.0,
"step": 726
},
{
"epoch": 0.9070492825951342,
"grad_norm": 0.1303865282661451,
"learning_rate": 8.36952837406631e-06,
"loss": 0.044,
"num_tokens": 58237021.0,
"step": 727
},
{
"epoch": 0.9082969432314411,
"grad_norm": 0.12384626240849539,
"learning_rate": 8.364857980645006e-06,
"loss": 0.0436,
"num_tokens": 58318537.0,
"step": 728
},
{
"epoch": 0.9095446038677479,
"grad_norm": 0.123878366797376,
"learning_rate": 8.360182392370258e-06,
"loss": 0.0463,
"num_tokens": 58398712.0,
"step": 729
},
{
"epoch": 0.9107922645040549,
"grad_norm": 0.1252594849614702,
"learning_rate": 8.355501617720321e-06,
"loss": 0.0469,
"num_tokens": 58480120.0,
"step": 730
},
{
"epoch": 0.9120399251403618,
"grad_norm": 0.12212319000991989,
"learning_rate": 8.350815665182855e-06,
"loss": 0.0415,
"num_tokens": 58559337.0,
"step": 731
},
{
"epoch": 0.9132875857766688,
"grad_norm": 0.13115254226617012,
"learning_rate": 8.34612454325491e-06,
"loss": 0.045,
"num_tokens": 58639403.0,
"step": 732
},
{
"epoch": 0.9145352464129757,
"grad_norm": 0.1153903682923885,
"learning_rate": 8.341428260442907e-06,
"loss": 0.0421,
"num_tokens": 58719196.0,
"step": 733
},
{
"epoch": 0.9157829070492826,
"grad_norm": 0.1350993442001407,
"learning_rate": 8.336726825262622e-06,
"loss": 0.0458,
"num_tokens": 58798792.0,
"step": 734
},
{
"epoch": 0.9170305676855895,
"grad_norm": 0.116827202506226,
"learning_rate": 8.332020246239183e-06,
"loss": 0.0454,
"num_tokens": 58878585.0,
"step": 735
},
{
"epoch": 0.9182782283218964,
"grad_norm": 0.13063470525394222,
"learning_rate": 8.327308531907039e-06,
"loss": 0.0429,
"num_tokens": 58957398.0,
"step": 736
},
{
"epoch": 0.9195258889582034,
"grad_norm": 0.11943276059553698,
"learning_rate": 8.322591690809952e-06,
"loss": 0.0436,
"num_tokens": 59036436.0,
"step": 737
},
{
"epoch": 0.9207735495945103,
"grad_norm": 0.13311888551517942,
"learning_rate": 8.317869731500981e-06,
"loss": 0.0472,
"num_tokens": 59117727.0,
"step": 738
},
{
"epoch": 0.9220212102308172,
"grad_norm": 0.13441754023620334,
"learning_rate": 8.313142662542465e-06,
"loss": 0.0427,
"num_tokens": 59198600.0,
"step": 739
},
{
"epoch": 0.9232688708671242,
"grad_norm": 0.12903006803949557,
"learning_rate": 8.30841049250601e-06,
"loss": 0.043,
"num_tokens": 59276825.0,
"step": 740
},
{
"epoch": 0.924516531503431,
"grad_norm": 0.11892900054116198,
"learning_rate": 8.303673229972468e-06,
"loss": 0.0428,
"num_tokens": 59356479.0,
"step": 741
},
{
"epoch": 0.925764192139738,
"grad_norm": 0.11014129765319786,
"learning_rate": 8.298930883531932e-06,
"loss": 0.0402,
"num_tokens": 59435634.0,
"step": 742
},
{
"epoch": 0.9270118527760449,
"grad_norm": 0.12790379344343886,
"learning_rate": 8.294183461783704e-06,
"loss": 0.0479,
"num_tokens": 59518043.0,
"step": 743
},
{
"epoch": 0.9282595134123518,
"grad_norm": 0.1271886081838466,
"learning_rate": 8.2894309733363e-06,
"loss": 0.0449,
"num_tokens": 59598023.0,
"step": 744
},
{
"epoch": 0.9295071740486588,
"grad_norm": 0.12608804603183602,
"learning_rate": 8.284673426807413e-06,
"loss": 0.0442,
"num_tokens": 59677990.0,
"step": 745
},
{
"epoch": 0.9307548346849657,
"grad_norm": 0.12197353808866439,
"learning_rate": 8.279910830823917e-06,
"loss": 0.0428,
"num_tokens": 59757003.0,
"step": 746
},
{
"epoch": 0.9320024953212727,
"grad_norm": 0.1183743614396383,
"learning_rate": 8.275143194021837e-06,
"loss": 0.0421,
"num_tokens": 59835942.0,
"step": 747
},
{
"epoch": 0.9332501559575795,
"grad_norm": 0.11788779371119068,
"learning_rate": 8.270370525046338e-06,
"loss": 0.0387,
"num_tokens": 59915228.0,
"step": 748
},
{
"epoch": 0.9344978165938864,
"grad_norm": 0.12838666597907625,
"learning_rate": 8.265592832551714e-06,
"loss": 0.0459,
"num_tokens": 59997067.0,
"step": 749
},
{
"epoch": 0.9357454772301934,
"grad_norm": 0.12163620589626947,
"learning_rate": 8.260810125201363e-06,
"loss": 0.0441,
"num_tokens": 60076744.0,
"step": 750
},
{
"epoch": 0.9369931378665003,
"grad_norm": 0.1208216904975396,
"learning_rate": 8.25602241166778e-06,
"loss": 0.0434,
"num_tokens": 60156803.0,
"step": 751
},
{
"epoch": 0.9382407985028073,
"grad_norm": 0.12583135298289463,
"learning_rate": 8.251229700632536e-06,
"loss": 0.0439,
"num_tokens": 60237132.0,
"step": 752
},
{
"epoch": 0.9394884591391142,
"grad_norm": 0.12011312406139918,
"learning_rate": 8.246432000786267e-06,
"loss": 0.0409,
"num_tokens": 60316759.0,
"step": 753
},
{
"epoch": 0.940736119775421,
"grad_norm": 0.1257483923597958,
"learning_rate": 8.241629320828652e-06,
"loss": 0.0431,
"num_tokens": 60395865.0,
"step": 754
},
{
"epoch": 0.941983780411728,
"grad_norm": 0.12598050416310153,
"learning_rate": 8.2368216694684e-06,
"loss": 0.0448,
"num_tokens": 60476033.0,
"step": 755
},
{
"epoch": 0.9432314410480349,
"grad_norm": 0.1371990547888002,
"learning_rate": 8.232009055423236e-06,
"loss": 0.0429,
"num_tokens": 60555833.0,
"step": 756
},
{
"epoch": 0.9444791016843419,
"grad_norm": 0.12968495911088507,
"learning_rate": 8.227191487419887e-06,
"loss": 0.0431,
"num_tokens": 60635691.0,
"step": 757
},
{
"epoch": 0.9457267623206488,
"grad_norm": 0.1222666448004795,
"learning_rate": 8.222368974194057e-06,
"loss": 0.0423,
"num_tokens": 60715830.0,
"step": 758
},
{
"epoch": 0.9469744229569557,
"grad_norm": 0.13249737757353294,
"learning_rate": 8.217541524490422e-06,
"loss": 0.0504,
"num_tokens": 60796607.0,
"step": 759
},
{
"epoch": 0.9482220835932627,
"grad_norm": 0.12046136010955967,
"learning_rate": 8.212709147062604e-06,
"loss": 0.0407,
"num_tokens": 60875388.0,
"step": 760
},
{
"epoch": 0.9494697442295695,
"grad_norm": 0.1347014144528205,
"learning_rate": 8.207871850673168e-06,
"loss": 0.0418,
"num_tokens": 60954745.0,
"step": 761
},
{
"epoch": 0.9507174048658765,
"grad_norm": 0.13217879759164464,
"learning_rate": 8.203029644093593e-06,
"loss": 0.0473,
"num_tokens": 61035577.0,
"step": 762
},
{
"epoch": 0.9519650655021834,
"grad_norm": 0.12642961452954143,
"learning_rate": 8.198182536104262e-06,
"loss": 0.0438,
"num_tokens": 61114939.0,
"step": 763
},
{
"epoch": 0.9532127261384903,
"grad_norm": 0.12699546277574592,
"learning_rate": 8.193330535494448e-06,
"loss": 0.0433,
"num_tokens": 61194941.0,
"step": 764
},
{
"epoch": 0.9544603867747973,
"grad_norm": 0.1248723416107508,
"learning_rate": 8.188473651062296e-06,
"loss": 0.0444,
"num_tokens": 61275037.0,
"step": 765
},
{
"epoch": 0.9557080474111042,
"grad_norm": 0.1224075580549069,
"learning_rate": 8.183611891614803e-06,
"loss": 0.0413,
"num_tokens": 61354585.0,
"step": 766
},
{
"epoch": 0.9569557080474111,
"grad_norm": 0.12023974168888206,
"learning_rate": 8.178745265967808e-06,
"loss": 0.0413,
"num_tokens": 61434070.0,
"step": 767
},
{
"epoch": 0.958203368683718,
"grad_norm": 0.11768365656858075,
"learning_rate": 8.173873782945976e-06,
"loss": 0.044,
"num_tokens": 61513480.0,
"step": 768
},
{
"epoch": 0.9594510293200249,
"grad_norm": 0.12252686093661061,
"learning_rate": 8.168997451382778e-06,
"loss": 0.0466,
"num_tokens": 61593111.0,
"step": 769
},
{
"epoch": 0.9606986899563319,
"grad_norm": 0.12444242831191968,
"learning_rate": 8.164116280120478e-06,
"loss": 0.0467,
"num_tokens": 61673108.0,
"step": 770
},
{
"epoch": 0.9619463505926388,
"grad_norm": 0.1170074311782113,
"learning_rate": 8.159230278010113e-06,
"loss": 0.0408,
"num_tokens": 61752840.0,
"step": 771
},
{
"epoch": 0.9631940112289458,
"grad_norm": 0.11877585272337694,
"learning_rate": 8.154339453911483e-06,
"loss": 0.0429,
"num_tokens": 61832555.0,
"step": 772
},
{
"epoch": 0.9644416718652526,
"grad_norm": 0.12480036823767689,
"learning_rate": 8.14944381669313e-06,
"loss": 0.0452,
"num_tokens": 61912425.0,
"step": 773
},
{
"epoch": 0.9656893325015595,
"grad_norm": 0.11828195611774657,
"learning_rate": 8.144543375232322e-06,
"loss": 0.0408,
"num_tokens": 61992459.0,
"step": 774
},
{
"epoch": 0.9669369931378665,
"grad_norm": 0.1326374355179406,
"learning_rate": 8.139638138415041e-06,
"loss": 0.0431,
"num_tokens": 62073365.0,
"step": 775
},
{
"epoch": 0.9681846537741734,
"grad_norm": 0.13258216878493195,
"learning_rate": 8.134728115135967e-06,
"loss": 0.042,
"num_tokens": 62153344.0,
"step": 776
},
{
"epoch": 0.9694323144104804,
"grad_norm": 0.11425515280955807,
"learning_rate": 8.129813314298457e-06,
"loss": 0.0429,
"num_tokens": 62233445.0,
"step": 777
},
{
"epoch": 0.9706799750467873,
"grad_norm": 0.12422677837162528,
"learning_rate": 8.124893744814524e-06,
"loss": 0.0483,
"num_tokens": 62313446.0,
"step": 778
},
{
"epoch": 0.9719276356830941,
"grad_norm": 0.12548120606989233,
"learning_rate": 8.11996941560484e-06,
"loss": 0.0423,
"num_tokens": 62392794.0,
"step": 779
},
{
"epoch": 0.9731752963194011,
"grad_norm": 0.11634689195370958,
"learning_rate": 8.115040335598701e-06,
"loss": 0.0441,
"num_tokens": 62472294.0,
"step": 780
},
{
"epoch": 0.974422956955708,
"grad_norm": 0.12855102743082578,
"learning_rate": 8.110106513734019e-06,
"loss": 0.0467,
"num_tokens": 62552135.0,
"step": 781
},
{
"epoch": 0.975670617592015,
"grad_norm": 0.12511638236191897,
"learning_rate": 8.105167958957302e-06,
"loss": 0.0425,
"num_tokens": 62632686.0,
"step": 782
},
{
"epoch": 0.9769182782283219,
"grad_norm": 0.1235968437568558,
"learning_rate": 8.100224680223647e-06,
"loss": 0.0458,
"num_tokens": 62712757.0,
"step": 783
},
{
"epoch": 0.9781659388646288,
"grad_norm": 0.12105321822365409,
"learning_rate": 8.09527668649671e-06,
"loss": 0.0455,
"num_tokens": 62792452.0,
"step": 784
},
{
"epoch": 0.9794135995009358,
"grad_norm": 0.12052561809276435,
"learning_rate": 8.090323986748696e-06,
"loss": 0.0433,
"num_tokens": 62872571.0,
"step": 785
},
{
"epoch": 0.9806612601372426,
"grad_norm": 0.11630478819939606,
"learning_rate": 8.085366589960353e-06,
"loss": 0.0434,
"num_tokens": 62954093.0,
"step": 786
},
{
"epoch": 0.9819089207735496,
"grad_norm": 0.12090710790599722,
"learning_rate": 8.080404505120936e-06,
"loss": 0.0415,
"num_tokens": 63033837.0,
"step": 787
},
{
"epoch": 0.9831565814098565,
"grad_norm": 0.11999218162825294,
"learning_rate": 8.075437741228205e-06,
"loss": 0.044,
"num_tokens": 63113726.0,
"step": 788
},
{
"epoch": 0.9844042420461634,
"grad_norm": 0.12012264791201754,
"learning_rate": 8.070466307288404e-06,
"loss": 0.045,
"num_tokens": 63195437.0,
"step": 789
},
{
"epoch": 0.9856519026824704,
"grad_norm": 0.10297347591984726,
"learning_rate": 8.065490212316245e-06,
"loss": 0.0385,
"num_tokens": 63274120.0,
"step": 790
},
{
"epoch": 0.9868995633187773,
"grad_norm": 0.1159425491489533,
"learning_rate": 8.060509465334895e-06,
"loss": 0.0421,
"num_tokens": 63353768.0,
"step": 791
},
{
"epoch": 0.9881472239550843,
"grad_norm": 0.12583816039244836,
"learning_rate": 8.055524075375951e-06,
"loss": 0.044,
"num_tokens": 63433687.0,
"step": 792
},
{
"epoch": 0.9893948845913911,
"grad_norm": 0.1300665820006064,
"learning_rate": 8.050534051479432e-06,
"loss": 0.0634,
"num_tokens": 63514928.0,
"step": 793
},
{
"epoch": 0.990642545227698,
"grad_norm": 0.1198989461072627,
"learning_rate": 8.045539402693759e-06,
"loss": 0.0435,
"num_tokens": 63594238.0,
"step": 794
},
{
"epoch": 0.991890205864005,
"grad_norm": 0.12871787019951422,
"learning_rate": 8.040540138075743e-06,
"loss": 0.044,
"num_tokens": 63674352.0,
"step": 795
},
{
"epoch": 0.9931378665003119,
"grad_norm": 0.14279234821369916,
"learning_rate": 8.035536266690561e-06,
"loss": 0.0437,
"num_tokens": 63755466.0,
"step": 796
},
{
"epoch": 0.9943855271366189,
"grad_norm": 0.12668717371746102,
"learning_rate": 8.030527797611742e-06,
"loss": 0.0472,
"num_tokens": 63835541.0,
"step": 797
},
{
"epoch": 0.9956331877729258,
"grad_norm": 0.12504493154631438,
"learning_rate": 8.025514739921155e-06,
"loss": 0.0439,
"num_tokens": 63914999.0,
"step": 798
},
{
"epoch": 0.9968808484092326,
"grad_norm": 0.12250309041796634,
"learning_rate": 8.02049710270899e-06,
"loss": 0.0408,
"num_tokens": 63995141.0,
"step": 799
},
{
"epoch": 0.9981285090455396,
"grad_norm": 0.11677292603996331,
"learning_rate": 8.015474895073739e-06,
"loss": 0.042,
"num_tokens": 64074961.0,
"step": 800
},
{
"epoch": 0.9993761696818465,
"grad_norm": 0.11305611872055077,
"learning_rate": 8.010448126122183e-06,
"loss": 0.0424,
"num_tokens": 64156160.0,
"step": 801
},
{
"epoch": 1.0,
"grad_norm": 0.11305611872055077,
"learning_rate": 8.005416804969374e-06,
"loss": 0.0453,
"num_tokens": 64196778.0,
"step": 802
},
{
"epoch": 1.001247660636307,
"grad_norm": 0.19297241332841056,
"learning_rate": 8.000380940738616e-06,
"loss": 0.0365,
"num_tokens": 64276858.0,
"step": 803
},
{
"epoch": 1.0024953212726138,
"grad_norm": 0.12339605814727976,
"learning_rate": 7.995340542561453e-06,
"loss": 0.0365,
"num_tokens": 64356382.0,
"step": 804
},
{
"epoch": 1.0037429819089208,
"grad_norm": 0.10655882436094036,
"learning_rate": 7.990295619577653e-06,
"loss": 0.0403,
"num_tokens": 64437312.0,
"step": 805
},
{
"epoch": 1.0049906425452277,
"grad_norm": 0.11516009325890297,
"learning_rate": 7.985246180935184e-06,
"loss": 0.0414,
"num_tokens": 64518748.0,
"step": 806
},
{
"epoch": 1.0062383031815347,
"grad_norm": 0.11139160659856995,
"learning_rate": 7.980192235790207e-06,
"loss": 0.0367,
"num_tokens": 64598711.0,
"step": 807
},
{
"epoch": 1.0074859638178415,
"grad_norm": 0.12064118588838565,
"learning_rate": 7.97513379330705e-06,
"loss": 0.0375,
"num_tokens": 64678489.0,
"step": 808
},
{
"epoch": 1.0087336244541485,
"grad_norm": 0.11389522139854903,
"learning_rate": 7.970070862658198e-06,
"loss": 0.036,
"num_tokens": 64758290.0,
"step": 809
},
{
"epoch": 1.0099812850904555,
"grad_norm": 0.11582742904776548,
"learning_rate": 7.965003453024273e-06,
"loss": 0.0387,
"num_tokens": 64838965.0,
"step": 810
},
{
"epoch": 1.0112289457267623,
"grad_norm": 0.11758744952524129,
"learning_rate": 7.959931573594025e-06,
"loss": 0.0373,
"num_tokens": 64919317.0,
"step": 811
},
{
"epoch": 1.0124766063630692,
"grad_norm": 0.1087404021707414,
"learning_rate": 7.954855233564301e-06,
"loss": 0.036,
"num_tokens": 65000106.0,
"step": 812
},
{
"epoch": 1.0137242669993762,
"grad_norm": 0.13115802969604487,
"learning_rate": 7.949774442140043e-06,
"loss": 0.0387,
"num_tokens": 65079674.0,
"step": 813
},
{
"epoch": 1.014971927635683,
"grad_norm": 0.1266899812719372,
"learning_rate": 7.944689208534257e-06,
"loss": 0.0383,
"num_tokens": 65160522.0,
"step": 814
},
{
"epoch": 1.01621958827199,
"grad_norm": 0.12285583224697105,
"learning_rate": 7.939599541968012e-06,
"loss": 0.0383,
"num_tokens": 65240870.0,
"step": 815
},
{
"epoch": 1.017467248908297,
"grad_norm": 0.1190497077658083,
"learning_rate": 7.93450545167041e-06,
"loss": 0.0352,
"num_tokens": 65319646.0,
"step": 816
},
{
"epoch": 1.018714909544604,
"grad_norm": 0.12563652698818017,
"learning_rate": 7.929406946878576e-06,
"loss": 0.0388,
"num_tokens": 65400645.0,
"step": 817
},
{
"epoch": 1.0199625701809107,
"grad_norm": 0.1234239723405108,
"learning_rate": 7.924304036837643e-06,
"loss": 0.0373,
"num_tokens": 65479114.0,
"step": 818
},
{
"epoch": 1.0212102308172177,
"grad_norm": 0.13005122038538955,
"learning_rate": 7.919196730800727e-06,
"loss": 0.0373,
"num_tokens": 65559516.0,
"step": 819
},
{
"epoch": 1.0224578914535247,
"grad_norm": 0.11725724234923955,
"learning_rate": 7.914085038028918e-06,
"loss": 0.0357,
"num_tokens": 65639488.0,
"step": 820
},
{
"epoch": 1.0237055520898315,
"grad_norm": 0.11014632164428752,
"learning_rate": 7.908968967791262e-06,
"loss": 0.0358,
"num_tokens": 65719075.0,
"step": 821
},
{
"epoch": 1.0249532127261385,
"grad_norm": 0.11721508584086246,
"learning_rate": 7.903848529364738e-06,
"loss": 0.0373,
"num_tokens": 65799000.0,
"step": 822
},
{
"epoch": 1.0262008733624455,
"grad_norm": 0.11517241066377722,
"learning_rate": 7.89872373203425e-06,
"loss": 0.0389,
"num_tokens": 65878502.0,
"step": 823
},
{
"epoch": 1.0274485339987522,
"grad_norm": 0.12088239271003214,
"learning_rate": 7.893594585092601e-06,
"loss": 0.0374,
"num_tokens": 65959219.0,
"step": 824
},
{
"epoch": 1.0286961946350592,
"grad_norm": 0.12193974539111146,
"learning_rate": 7.888461097840494e-06,
"loss": 0.0358,
"num_tokens": 66039818.0,
"step": 825
},
{
"epoch": 1.0299438552713662,
"grad_norm": 0.12182216187687943,
"learning_rate": 7.883323279586483e-06,
"loss": 0.0374,
"num_tokens": 66119441.0,
"step": 826
},
{
"epoch": 1.0311915159076732,
"grad_norm": 0.11841925845138539,
"learning_rate": 7.87818113964699e-06,
"loss": 0.0377,
"num_tokens": 66199667.0,
"step": 827
},
{
"epoch": 1.03243917654398,
"grad_norm": 0.12421286713634912,
"learning_rate": 7.873034687346268e-06,
"loss": 0.0371,
"num_tokens": 66279952.0,
"step": 828
},
{
"epoch": 1.033686837180287,
"grad_norm": 0.11425196856934319,
"learning_rate": 7.86788393201639e-06,
"loss": 0.0366,
"num_tokens": 66359882.0,
"step": 829
},
{
"epoch": 1.034934497816594,
"grad_norm": 0.12393074676184707,
"learning_rate": 7.862728882997236e-06,
"loss": 0.0372,
"num_tokens": 66439831.0,
"step": 830
},
{
"epoch": 1.0361821584529007,
"grad_norm": 0.11799704195610591,
"learning_rate": 7.857569549636462e-06,
"loss": 0.0416,
"num_tokens": 66519952.0,
"step": 831
},
{
"epoch": 1.0374298190892077,
"grad_norm": 0.12409491334059666,
"learning_rate": 7.852405941289503e-06,
"loss": 0.0372,
"num_tokens": 66598948.0,
"step": 832
},
{
"epoch": 1.0386774797255147,
"grad_norm": 0.12220754153408792,
"learning_rate": 7.847238067319542e-06,
"loss": 0.0372,
"num_tokens": 66680599.0,
"step": 833
},
{
"epoch": 1.0399251403618215,
"grad_norm": 0.12805122770002453,
"learning_rate": 7.842065937097495e-06,
"loss": 0.0357,
"num_tokens": 66759842.0,
"step": 834
},
{
"epoch": 1.0411728009981285,
"grad_norm": 0.11193160429999634,
"learning_rate": 7.836889560001997e-06,
"loss": 0.0372,
"num_tokens": 66839975.0,
"step": 835
},
{
"epoch": 1.0424204616344355,
"grad_norm": 0.11705280167012153,
"learning_rate": 7.831708945419383e-06,
"loss": 0.0354,
"num_tokens": 66919473.0,
"step": 836
},
{
"epoch": 1.0436681222707425,
"grad_norm": 0.11882428882714241,
"learning_rate": 7.826524102743678e-06,
"loss": 0.0469,
"num_tokens": 67000971.0,
"step": 837
},
{
"epoch": 1.0449157829070492,
"grad_norm": 0.11949649165411871,
"learning_rate": 7.821335041376565e-06,
"loss": 0.0399,
"num_tokens": 67082398.0,
"step": 838
},
{
"epoch": 1.0461634435433562,
"grad_norm": 0.13222049645536524,
"learning_rate": 7.816141770727381e-06,
"loss": 0.038,
"num_tokens": 67162807.0,
"step": 839
},
{
"epoch": 1.0474111041796632,
"grad_norm": 0.1287747273510056,
"learning_rate": 7.810944300213095e-06,
"loss": 0.0365,
"num_tokens": 67243036.0,
"step": 840
},
{
"epoch": 1.04865876481597,
"grad_norm": 0.12359681273704244,
"learning_rate": 7.805742639258297e-06,
"loss": 0.0377,
"num_tokens": 67324171.0,
"step": 841
},
{
"epoch": 1.049906425452277,
"grad_norm": 0.11791544460990037,
"learning_rate": 7.800536797295164e-06,
"loss": 0.0385,
"num_tokens": 67406821.0,
"step": 842
},
{
"epoch": 1.051154086088584,
"grad_norm": 0.11669683358736094,
"learning_rate": 7.795326783763463e-06,
"loss": 0.036,
"num_tokens": 67486421.0,
"step": 843
},
{
"epoch": 1.0524017467248907,
"grad_norm": 0.1145704300717487,
"learning_rate": 7.790112608110523e-06,
"loss": 0.0375,
"num_tokens": 67566397.0,
"step": 844
},
{
"epoch": 1.0536494073611977,
"grad_norm": 0.11192160356785108,
"learning_rate": 7.784894279791224e-06,
"loss": 0.0355,
"num_tokens": 67646842.0,
"step": 845
},
{
"epoch": 1.0548970679975047,
"grad_norm": 0.11562580889859264,
"learning_rate": 7.779671808267968e-06,
"loss": 0.0378,
"num_tokens": 67727482.0,
"step": 846
},
{
"epoch": 1.0561447286338117,
"grad_norm": 0.12153279759646565,
"learning_rate": 7.774445203010676e-06,
"loss": 0.0359,
"num_tokens": 67807939.0,
"step": 847
},
{
"epoch": 1.0573923892701185,
"grad_norm": 0.11470566792658915,
"learning_rate": 7.769214473496766e-06,
"loss": 0.0407,
"num_tokens": 67888052.0,
"step": 848
},
{
"epoch": 1.0586400499064255,
"grad_norm": 0.12383359576287097,
"learning_rate": 7.763979629211127e-06,
"loss": 0.038,
"num_tokens": 67968032.0,
"step": 849
},
{
"epoch": 1.0598877105427325,
"grad_norm": 0.11881454127188035,
"learning_rate": 7.758740679646115e-06,
"loss": 0.0371,
"num_tokens": 68046937.0,
"step": 850
},
{
"epoch": 1.0611353711790392,
"grad_norm": 0.11464903476697275,
"learning_rate": 7.753497634301532e-06,
"loss": 0.0364,
"num_tokens": 68127203.0,
"step": 851
},
{
"epoch": 1.0623830318153462,
"grad_norm": 0.12119441827724774,
"learning_rate": 7.748250502684601e-06,
"loss": 0.0367,
"num_tokens": 68207782.0,
"step": 852
},
{
"epoch": 1.0636306924516532,
"grad_norm": 0.13058645569693353,
"learning_rate": 7.742999294309959e-06,
"loss": 0.037,
"num_tokens": 68287141.0,
"step": 853
},
{
"epoch": 1.06487835308796,
"grad_norm": 0.12137449605751832,
"learning_rate": 7.737744018699634e-06,
"loss": 0.0386,
"num_tokens": 68367278.0,
"step": 854
},
{
"epoch": 1.066126013724267,
"grad_norm": 0.1175051411674423,
"learning_rate": 7.732484685383027e-06,
"loss": 0.0378,
"num_tokens": 68448023.0,
"step": 855
},
{
"epoch": 1.067373674360574,
"grad_norm": 0.11774104211220471,
"learning_rate": 7.7272213038969e-06,
"loss": 0.0362,
"num_tokens": 68527504.0,
"step": 856
},
{
"epoch": 1.068621334996881,
"grad_norm": 0.12107302317693551,
"learning_rate": 7.72195388378536e-06,
"loss": 0.0364,
"num_tokens": 68607013.0,
"step": 857
},
{
"epoch": 1.0698689956331877,
"grad_norm": 0.1227886444188267,
"learning_rate": 7.716682434599823e-06,
"loss": 0.0379,
"num_tokens": 68687882.0,
"step": 858
},
{
"epoch": 1.0711166562694947,
"grad_norm": 0.11421078198972955,
"learning_rate": 7.711406965899026e-06,
"loss": 0.0357,
"num_tokens": 68767520.0,
"step": 859
},
{
"epoch": 1.0723643169058017,
"grad_norm": 0.13139694750578262,
"learning_rate": 7.706127487248984e-06,
"loss": 0.0397,
"num_tokens": 68848548.0,
"step": 860
},
{
"epoch": 1.0736119775421085,
"grad_norm": 0.12048027098934615,
"learning_rate": 7.70084400822299e-06,
"loss": 0.0394,
"num_tokens": 68929107.0,
"step": 861
},
{
"epoch": 1.0748596381784155,
"grad_norm": 0.11688196802345287,
"learning_rate": 7.695556538401588e-06,
"loss": 0.0337,
"num_tokens": 69008833.0,
"step": 862
},
{
"epoch": 1.0761072988147224,
"grad_norm": 0.1178101906162924,
"learning_rate": 7.690265087372559e-06,
"loss": 0.0392,
"num_tokens": 69089477.0,
"step": 863
},
{
"epoch": 1.0773549594510294,
"grad_norm": 0.11901289710556308,
"learning_rate": 7.684969664730903e-06,
"loss": 0.0395,
"num_tokens": 69170299.0,
"step": 864
},
{
"epoch": 1.0786026200873362,
"grad_norm": 0.1131133914969235,
"learning_rate": 7.679670280078823e-06,
"loss": 0.0381,
"num_tokens": 69251770.0,
"step": 865
},
{
"epoch": 1.0798502807236432,
"grad_norm": 0.11744420013358728,
"learning_rate": 7.674366943025705e-06,
"loss": 0.0363,
"num_tokens": 69331910.0,
"step": 866
},
{
"epoch": 1.0810979413599502,
"grad_norm": 0.11404106643899889,
"learning_rate": 7.669059663188099e-06,
"loss": 0.037,
"num_tokens": 69411473.0,
"step": 867
},
{
"epoch": 1.082345601996257,
"grad_norm": 0.11319859025103475,
"learning_rate": 7.66374845018971e-06,
"loss": 0.0348,
"num_tokens": 69491703.0,
"step": 868
},
{
"epoch": 1.083593262632564,
"grad_norm": 0.112683453177615,
"learning_rate": 7.658433313661372e-06,
"loss": 0.0392,
"num_tokens": 69572447.0,
"step": 869
},
{
"epoch": 1.084840923268871,
"grad_norm": 0.1276133280655265,
"learning_rate": 7.653114263241034e-06,
"loss": 0.0388,
"num_tokens": 69653823.0,
"step": 870
},
{
"epoch": 1.0860885839051777,
"grad_norm": 0.1240984988092244,
"learning_rate": 7.647791308573744e-06,
"loss": 0.0387,
"num_tokens": 69734055.0,
"step": 871
},
{
"epoch": 1.0873362445414847,
"grad_norm": 0.12356452504447533,
"learning_rate": 7.642464459311623e-06,
"loss": 0.0347,
"num_tokens": 69813965.0,
"step": 872
},
{
"epoch": 1.0885839051777917,
"grad_norm": 0.11514715620013154,
"learning_rate": 7.637133725113864e-06,
"loss": 0.0366,
"num_tokens": 69894363.0,
"step": 873
},
{
"epoch": 1.0898315658140985,
"grad_norm": 0.12779474123132986,
"learning_rate": 7.631799115646697e-06,
"loss": 0.0357,
"num_tokens": 69973323.0,
"step": 874
},
{
"epoch": 1.0910792264504054,
"grad_norm": 0.11790975872627733,
"learning_rate": 7.6264606405833805e-06,
"loss": 0.0363,
"num_tokens": 70054250.0,
"step": 875
},
{
"epoch": 1.0923268870867124,
"grad_norm": 0.11641886596060783,
"learning_rate": 7.621118309604186e-06,
"loss": 0.0422,
"num_tokens": 70133988.0,
"step": 876
},
{
"epoch": 1.0935745477230194,
"grad_norm": 0.11751888094011356,
"learning_rate": 7.615772132396373e-06,
"loss": 0.035,
"num_tokens": 70213674.0,
"step": 877
},
{
"epoch": 1.0948222083593262,
"grad_norm": 0.10553263064141878,
"learning_rate": 7.6104221186541745e-06,
"loss": 0.0356,
"num_tokens": 70292896.0,
"step": 878
},
{
"epoch": 1.0960698689956332,
"grad_norm": 0.12384471705795567,
"learning_rate": 7.6050682780787865e-06,
"loss": 0.0397,
"num_tokens": 70372902.0,
"step": 879
},
{
"epoch": 1.0973175296319402,
"grad_norm": 0.1290651741195708,
"learning_rate": 7.599710620378337e-06,
"loss": 0.0362,
"num_tokens": 70453829.0,
"step": 880
},
{
"epoch": 1.098565190268247,
"grad_norm": 0.12342303328201779,
"learning_rate": 7.594349155267879e-06,
"loss": 0.0362,
"num_tokens": 70533001.0,
"step": 881
},
{
"epoch": 1.099812850904554,
"grad_norm": 0.12141677579551534,
"learning_rate": 7.588983892469372e-06,
"loss": 0.0367,
"num_tokens": 70613271.0,
"step": 882
},
{
"epoch": 1.101060511540861,
"grad_norm": 0.12196685769280396,
"learning_rate": 7.583614841711657e-06,
"loss": 0.0365,
"num_tokens": 70692565.0,
"step": 883
},
{
"epoch": 1.102308172177168,
"grad_norm": 0.12354073131658985,
"learning_rate": 7.5782420127304466e-06,
"loss": 0.0384,
"num_tokens": 70772857.0,
"step": 884
},
{
"epoch": 1.1035558328134747,
"grad_norm": 0.11720083779315267,
"learning_rate": 7.572865415268303e-06,
"loss": 0.0355,
"num_tokens": 70852777.0,
"step": 885
},
{
"epoch": 1.1048034934497817,
"grad_norm": 0.11247611644561628,
"learning_rate": 7.567485059074623e-06,
"loss": 0.0367,
"num_tokens": 70933435.0,
"step": 886
},
{
"epoch": 1.1060511540860887,
"grad_norm": 0.12333260571429218,
"learning_rate": 7.5621009539056175e-06,
"loss": 0.0374,
"num_tokens": 71013136.0,
"step": 887
},
{
"epoch": 1.1072988147223954,
"grad_norm": 0.12451469169995692,
"learning_rate": 7.556713109524301e-06,
"loss": 0.0372,
"num_tokens": 71093256.0,
"step": 888
},
{
"epoch": 1.1085464753587024,
"grad_norm": 0.13359967657120056,
"learning_rate": 7.551321535700456e-06,
"loss": 0.0358,
"num_tokens": 71172930.0,
"step": 889
},
{
"epoch": 1.1097941359950094,
"grad_norm": 0.11691502751945572,
"learning_rate": 7.545926242210643e-06,
"loss": 0.0362,
"num_tokens": 71252476.0,
"step": 890
},
{
"epoch": 1.1110417966313162,
"grad_norm": 0.12287538546531206,
"learning_rate": 7.540527238838156e-06,
"loss": 0.0352,
"num_tokens": 71331645.0,
"step": 891
},
{
"epoch": 1.1122894572676232,
"grad_norm": 0.12355870380253296,
"learning_rate": 7.535124535373019e-06,
"loss": 0.0352,
"num_tokens": 71410967.0,
"step": 892
},
{
"epoch": 1.1135371179039302,
"grad_norm": 0.11947008075731919,
"learning_rate": 7.529718141611972e-06,
"loss": 0.0369,
"num_tokens": 71491316.0,
"step": 893
},
{
"epoch": 1.114784778540237,
"grad_norm": 0.11449370018994913,
"learning_rate": 7.5243080673584345e-06,
"loss": 0.0338,
"num_tokens": 71572312.0,
"step": 894
},
{
"epoch": 1.116032439176544,
"grad_norm": 0.10417902495607685,
"learning_rate": 7.51889432242251e-06,
"loss": 0.0351,
"num_tokens": 71652300.0,
"step": 895
},
{
"epoch": 1.117280099812851,
"grad_norm": 0.11908753377459247,
"learning_rate": 7.513476916620952e-06,
"loss": 0.0387,
"num_tokens": 71733471.0,
"step": 896
},
{
"epoch": 1.118527760449158,
"grad_norm": 0.11776124309241255,
"learning_rate": 7.508055859777157e-06,
"loss": 0.0347,
"num_tokens": 71812889.0,
"step": 897
},
{
"epoch": 1.1197754210854647,
"grad_norm": 0.11842774970819654,
"learning_rate": 7.502631161721139e-06,
"loss": 0.0361,
"num_tokens": 71892941.0,
"step": 898
},
{
"epoch": 1.1210230817217717,
"grad_norm": 0.11809125594779071,
"learning_rate": 7.497202832289514e-06,
"loss": 0.0376,
"num_tokens": 71972718.0,
"step": 899
},
{
"epoch": 1.1222707423580787,
"grad_norm": 0.11729161979135838,
"learning_rate": 7.4917708813254865e-06,
"loss": 0.0387,
"num_tokens": 72054222.0,
"step": 900
},
{
"epoch": 1.1235184029943854,
"grad_norm": 0.11425658550363159,
"learning_rate": 7.4863353186788234e-06,
"loss": 0.0363,
"num_tokens": 72133579.0,
"step": 901
},
{
"epoch": 1.1247660636306924,
"grad_norm": 0.11436527897915852,
"learning_rate": 7.480896154205844e-06,
"loss": 0.0362,
"num_tokens": 72213206.0,
"step": 902
},
{
"epoch": 1.1260137242669994,
"grad_norm": 0.11730584452702315,
"learning_rate": 7.475453397769396e-06,
"loss": 0.0391,
"num_tokens": 72292700.0,
"step": 903
},
{
"epoch": 1.1272613849033064,
"grad_norm": 0.11830533017335358,
"learning_rate": 7.470007059238842e-06,
"loss": 0.0351,
"num_tokens": 72371412.0,
"step": 904
},
{
"epoch": 1.1285090455396132,
"grad_norm": 0.11213629131896848,
"learning_rate": 7.464557148490041e-06,
"loss": 0.035,
"num_tokens": 72451362.0,
"step": 905
},
{
"epoch": 1.1297567061759202,
"grad_norm": 0.14744282107332352,
"learning_rate": 7.459103675405328e-06,
"loss": 0.0384,
"num_tokens": 72531571.0,
"step": 906
},
{
"epoch": 1.1310043668122272,
"grad_norm": 0.11322054203080899,
"learning_rate": 7.4536466498735e-06,
"loss": 0.0362,
"num_tokens": 72611893.0,
"step": 907
},
{
"epoch": 1.132252027448534,
"grad_norm": 0.10786998375465344,
"learning_rate": 7.44818608178979e-06,
"loss": 0.0368,
"num_tokens": 72691853.0,
"step": 908
},
{
"epoch": 1.133499688084841,
"grad_norm": 0.1213664887001441,
"learning_rate": 7.442721981055862e-06,
"loss": 0.04,
"num_tokens": 72773392.0,
"step": 909
},
{
"epoch": 1.134747348721148,
"grad_norm": 0.11598720811101557,
"learning_rate": 7.43725435757978e-06,
"loss": 0.0359,
"num_tokens": 72852913.0,
"step": 910
},
{
"epoch": 1.1359950093574547,
"grad_norm": 0.11009547084534818,
"learning_rate": 7.431783221275997e-06,
"loss": 0.0372,
"num_tokens": 72932495.0,
"step": 911
},
{
"epoch": 1.1372426699937617,
"grad_norm": 0.13290296128464088,
"learning_rate": 7.426308582065339e-06,
"loss": 0.0375,
"num_tokens": 73013678.0,
"step": 912
},
{
"epoch": 1.1384903306300687,
"grad_norm": 0.12616925947987406,
"learning_rate": 7.4208304498749825e-06,
"loss": 0.0379,
"num_tokens": 73095054.0,
"step": 913
},
{
"epoch": 1.1397379912663754,
"grad_norm": 0.11425464035364839,
"learning_rate": 7.415348834638433e-06,
"loss": 0.0372,
"num_tokens": 73175046.0,
"step": 914
},
{
"epoch": 1.1409856519026824,
"grad_norm": 0.1245543910672972,
"learning_rate": 7.40986374629552e-06,
"loss": 0.0369,
"num_tokens": 73254678.0,
"step": 915
},
{
"epoch": 1.1422333125389894,
"grad_norm": 0.1179981926555406,
"learning_rate": 7.404375194792365e-06,
"loss": 0.0374,
"num_tokens": 73334623.0,
"step": 916
},
{
"epoch": 1.1434809731752964,
"grad_norm": 0.11126346028333928,
"learning_rate": 7.398883190081368e-06,
"loss": 0.0353,
"num_tokens": 73414712.0,
"step": 917
},
{
"epoch": 1.1447286338116032,
"grad_norm": 0.11760773052330453,
"learning_rate": 7.3933877421211986e-06,
"loss": 0.0356,
"num_tokens": 73495679.0,
"step": 918
},
{
"epoch": 1.1459762944479102,
"grad_norm": 0.11803137365247664,
"learning_rate": 7.387888860876763e-06,
"loss": 0.0362,
"num_tokens": 73575931.0,
"step": 919
},
{
"epoch": 1.1472239550842172,
"grad_norm": 0.11691088438783126,
"learning_rate": 7.382386556319193e-06,
"loss": 0.0357,
"num_tokens": 73656607.0,
"step": 920
},
{
"epoch": 1.1484716157205241,
"grad_norm": 0.11592560296885844,
"learning_rate": 7.376880838425832e-06,
"loss": 0.0366,
"num_tokens": 73736234.0,
"step": 921
},
{
"epoch": 1.149719276356831,
"grad_norm": 0.1211126889771943,
"learning_rate": 7.3713717171802106e-06,
"loss": 0.0354,
"num_tokens": 73816380.0,
"step": 922
},
{
"epoch": 1.150966936993138,
"grad_norm": 0.11810769499767688,
"learning_rate": 7.3658592025720285e-06,
"loss": 0.04,
"num_tokens": 73897698.0,
"step": 923
},
{
"epoch": 1.152214597629445,
"grad_norm": 0.11484038275799247,
"learning_rate": 7.360343304597144e-06,
"loss": 0.0354,
"num_tokens": 73977453.0,
"step": 924
},
{
"epoch": 1.1534622582657517,
"grad_norm": 0.10850730121509818,
"learning_rate": 7.354824033257546e-06,
"loss": 0.0365,
"num_tokens": 74056422.0,
"step": 925
},
{
"epoch": 1.1547099189020587,
"grad_norm": 0.11974147584903187,
"learning_rate": 7.349301398561342e-06,
"loss": 0.0351,
"num_tokens": 74136845.0,
"step": 926
},
{
"epoch": 1.1559575795383656,
"grad_norm": 0.11829958356397421,
"learning_rate": 7.3437754105227365e-06,
"loss": 0.0358,
"num_tokens": 74217377.0,
"step": 927
},
{
"epoch": 1.1572052401746724,
"grad_norm": 0.1119802739133821,
"learning_rate": 7.3382460791620165e-06,
"loss": 0.0365,
"num_tokens": 74299632.0,
"step": 928
},
{
"epoch": 1.1584529008109794,
"grad_norm": 0.11642309337985378,
"learning_rate": 7.332713414505534e-06,
"loss": 0.0373,
"num_tokens": 74379725.0,
"step": 929
},
{
"epoch": 1.1597005614472864,
"grad_norm": 0.12355861507215243,
"learning_rate": 7.32717742658568e-06,
"loss": 0.0355,
"num_tokens": 74459397.0,
"step": 930
},
{
"epoch": 1.1609482220835932,
"grad_norm": 0.11263553582694089,
"learning_rate": 7.321638125440872e-06,
"loss": 0.0338,
"num_tokens": 74539162.0,
"step": 931
},
{
"epoch": 1.1621958827199002,
"grad_norm": 0.11483802072503761,
"learning_rate": 7.316095521115541e-06,
"loss": 0.0395,
"num_tokens": 74619166.0,
"step": 932
},
{
"epoch": 1.1634435433562071,
"grad_norm": 0.12246171184256209,
"learning_rate": 7.310549623660101e-06,
"loss": 0.0378,
"num_tokens": 74699097.0,
"step": 933
},
{
"epoch": 1.164691203992514,
"grad_norm": 0.10951266608078988,
"learning_rate": 7.305000443130943e-06,
"loss": 0.0359,
"num_tokens": 74778723.0,
"step": 934
},
{
"epoch": 1.165938864628821,
"grad_norm": 0.1296103420820968,
"learning_rate": 7.299447989590406e-06,
"loss": 0.0379,
"num_tokens": 74857957.0,
"step": 935
},
{
"epoch": 1.167186525265128,
"grad_norm": 0.1106739790929476,
"learning_rate": 7.293892273106768e-06,
"loss": 0.0339,
"num_tokens": 74937533.0,
"step": 936
},
{
"epoch": 1.1684341859014349,
"grad_norm": 0.1159745943163753,
"learning_rate": 7.2883333037542205e-06,
"loss": 0.0361,
"num_tokens": 75017116.0,
"step": 937
},
{
"epoch": 1.1696818465377417,
"grad_norm": 0.11565795695947093,
"learning_rate": 7.282771091612858e-06,
"loss": 0.037,
"num_tokens": 75097805.0,
"step": 938
},
{
"epoch": 1.1709295071740486,
"grad_norm": 0.12436659888574522,
"learning_rate": 7.27720564676865e-06,
"loss": 0.0358,
"num_tokens": 75177008.0,
"step": 939
},
{
"epoch": 1.1721771678103556,
"grad_norm": 0.11305973816370699,
"learning_rate": 7.271636979313432e-06,
"loss": 0.0338,
"num_tokens": 75256060.0,
"step": 940
},
{
"epoch": 1.1734248284466626,
"grad_norm": 0.11468359036363136,
"learning_rate": 7.266065099344881e-06,
"loss": 0.0371,
"num_tokens": 75336135.0,
"step": 941
},
{
"epoch": 1.1746724890829694,
"grad_norm": 0.11929128098454908,
"learning_rate": 7.260490016966497e-06,
"loss": 0.0373,
"num_tokens": 75416812.0,
"step": 942
},
{
"epoch": 1.1759201497192764,
"grad_norm": 0.130142686177145,
"learning_rate": 7.2549117422875925e-06,
"loss": 0.0396,
"num_tokens": 75496993.0,
"step": 943
},
{
"epoch": 1.1771678103555834,
"grad_norm": 0.12008704233696307,
"learning_rate": 7.249330285423265e-06,
"loss": 0.0389,
"num_tokens": 75577776.0,
"step": 944
},
{
"epoch": 1.1784154709918901,
"grad_norm": 0.1220975661519173,
"learning_rate": 7.243745656494382e-06,
"loss": 0.038,
"num_tokens": 75657576.0,
"step": 945
},
{
"epoch": 1.1796631316281971,
"grad_norm": 0.12744332708746905,
"learning_rate": 7.238157865627562e-06,
"loss": 0.0364,
"num_tokens": 75737747.0,
"step": 946
},
{
"epoch": 1.1809107922645041,
"grad_norm": 0.11326510598111456,
"learning_rate": 7.2325669229551636e-06,
"loss": 0.0364,
"num_tokens": 75819395.0,
"step": 947
},
{
"epoch": 1.182158452900811,
"grad_norm": 0.1213820127222462,
"learning_rate": 7.226972838615251e-06,
"loss": 0.0384,
"num_tokens": 75898696.0,
"step": 948
},
{
"epoch": 1.1834061135371179,
"grad_norm": 0.1226667585479789,
"learning_rate": 7.221375622751593e-06,
"loss": 0.0407,
"num_tokens": 75978883.0,
"step": 949
},
{
"epoch": 1.1846537741734249,
"grad_norm": 0.1193275551364265,
"learning_rate": 7.215775285513633e-06,
"loss": 0.037,
"num_tokens": 76058141.0,
"step": 950
},
{
"epoch": 1.1859014348097316,
"grad_norm": 0.12089332711649621,
"learning_rate": 7.210171837056474e-06,
"loss": 0.0373,
"num_tokens": 76138153.0,
"step": 951
},
{
"epoch": 1.1871490954460386,
"grad_norm": 0.12206397250048755,
"learning_rate": 7.2045652875408614e-06,
"loss": 0.0362,
"num_tokens": 76218972.0,
"step": 952
},
{
"epoch": 1.1883967560823456,
"grad_norm": 0.11543786673607577,
"learning_rate": 7.198955647133167e-06,
"loss": 0.0364,
"num_tokens": 76298129.0,
"step": 953
},
{
"epoch": 1.1896444167186526,
"grad_norm": 0.1170017252794963,
"learning_rate": 7.193342926005362e-06,
"loss": 0.0359,
"num_tokens": 76377939.0,
"step": 954
},
{
"epoch": 1.1908920773549594,
"grad_norm": 0.12112960322552498,
"learning_rate": 7.187727134335006e-06,
"loss": 0.0386,
"num_tokens": 76458143.0,
"step": 955
},
{
"epoch": 1.1921397379912664,
"grad_norm": 0.12276416511613138,
"learning_rate": 7.182108282305231e-06,
"loss": 0.0366,
"num_tokens": 76537173.0,
"step": 956
},
{
"epoch": 1.1933873986275734,
"grad_norm": 0.12474176247002251,
"learning_rate": 7.176486380104707e-06,
"loss": 0.0372,
"num_tokens": 76617763.0,
"step": 957
},
{
"epoch": 1.1946350592638801,
"grad_norm": 0.12126829791671767,
"learning_rate": 7.1708614379276485e-06,
"loss": 0.0374,
"num_tokens": 76698109.0,
"step": 958
},
{
"epoch": 1.1958827199001871,
"grad_norm": 0.1248231725076693,
"learning_rate": 7.165233465973771e-06,
"loss": 0.0375,
"num_tokens": 76777864.0,
"step": 959
},
{
"epoch": 1.1971303805364941,
"grad_norm": 0.12569048840462274,
"learning_rate": 7.159602474448292e-06,
"loss": 0.0369,
"num_tokens": 76857197.0,
"step": 960
},
{
"epoch": 1.1983780411728011,
"grad_norm": 0.12332798310694695,
"learning_rate": 7.1539684735618995e-06,
"loss": 0.0364,
"num_tokens": 76937469.0,
"step": 961
},
{
"epoch": 1.1996257018091079,
"grad_norm": 0.11706852204502789,
"learning_rate": 7.148331473530741e-06,
"loss": 0.0383,
"num_tokens": 77017833.0,
"step": 962
},
{
"epoch": 1.2008733624454149,
"grad_norm": 0.12424916506541904,
"learning_rate": 7.142691484576399e-06,
"loss": 0.0361,
"num_tokens": 77097219.0,
"step": 963
},
{
"epoch": 1.2021210230817219,
"grad_norm": 0.10771753138390724,
"learning_rate": 7.137048516925882e-06,
"loss": 0.0335,
"num_tokens": 77176147.0,
"step": 964
},
{
"epoch": 1.2033686837180286,
"grad_norm": 0.11534715229087208,
"learning_rate": 7.131402580811593e-06,
"loss": 0.0373,
"num_tokens": 77256126.0,
"step": 965
},
{
"epoch": 1.2046163443543356,
"grad_norm": 0.12314227105562962,
"learning_rate": 7.125753686471322e-06,
"loss": 0.0361,
"num_tokens": 77335910.0,
"step": 966
},
{
"epoch": 1.2058640049906426,
"grad_norm": 0.12340324578863827,
"learning_rate": 7.120101844148222e-06,
"loss": 0.0358,
"num_tokens": 77416109.0,
"step": 967
},
{
"epoch": 1.2071116656269494,
"grad_norm": 0.12119177535967239,
"learning_rate": 7.1144470640907906e-06,
"loss": 0.0418,
"num_tokens": 77497402.0,
"step": 968
},
{
"epoch": 1.2083593262632564,
"grad_norm": 0.12173802469100313,
"learning_rate": 7.1087893565528545e-06,
"loss": 0.0372,
"num_tokens": 77577501.0,
"step": 969
},
{
"epoch": 1.2096069868995634,
"grad_norm": 0.11142850106634652,
"learning_rate": 7.103128731793546e-06,
"loss": 0.0365,
"num_tokens": 77657098.0,
"step": 970
},
{
"epoch": 1.2108546475358701,
"grad_norm": 0.11827410294214857,
"learning_rate": 7.097465200077289e-06,
"loss": 0.0358,
"num_tokens": 77735468.0,
"step": 971
},
{
"epoch": 1.2121023081721771,
"grad_norm": 0.12499420528672263,
"learning_rate": 7.0917987716737795e-06,
"loss": 0.0384,
"num_tokens": 77815844.0,
"step": 972
},
{
"epoch": 1.2133499688084841,
"grad_norm": 0.12225834238347827,
"learning_rate": 7.086129456857963e-06,
"loss": 0.0358,
"num_tokens": 77895631.0,
"step": 973
},
{
"epoch": 1.214597629444791,
"grad_norm": 0.11545282158638288,
"learning_rate": 7.080457265910022e-06,
"loss": 0.0364,
"num_tokens": 77976656.0,
"step": 974
},
{
"epoch": 1.2158452900810979,
"grad_norm": 0.12462671166142329,
"learning_rate": 7.074782209115356e-06,
"loss": 0.036,
"num_tokens": 78058340.0,
"step": 975
},
{
"epoch": 1.2170929507174049,
"grad_norm": 0.11756188080542514,
"learning_rate": 7.069104296764553e-06,
"loss": 0.0382,
"num_tokens": 78139204.0,
"step": 976
},
{
"epoch": 1.2183406113537119,
"grad_norm": 0.12216323940445717,
"learning_rate": 7.0634235391533874e-06,
"loss": 0.0383,
"num_tokens": 78219057.0,
"step": 977
},
{
"epoch": 1.2195882719900186,
"grad_norm": 0.12667133266309244,
"learning_rate": 7.05773994658279e-06,
"loss": 0.0365,
"num_tokens": 78299547.0,
"step": 978
},
{
"epoch": 1.2208359326263256,
"grad_norm": 0.13620206727181836,
"learning_rate": 7.052053529358831e-06,
"loss": 0.0352,
"num_tokens": 78378421.0,
"step": 979
},
{
"epoch": 1.2220835932626326,
"grad_norm": 0.11745647885605227,
"learning_rate": 7.046364297792703e-06,
"loss": 0.0348,
"num_tokens": 78458843.0,
"step": 980
},
{
"epoch": 1.2233312538989396,
"grad_norm": 0.11499104455275264,
"learning_rate": 7.040672262200705e-06,
"loss": 0.0407,
"num_tokens": 78539916.0,
"step": 981
},
{
"epoch": 1.2245789145352464,
"grad_norm": 0.11717217325806314,
"learning_rate": 7.0349774329042135e-06,
"loss": 0.0355,
"num_tokens": 78619130.0,
"step": 982
},
{
"epoch": 1.2258265751715534,
"grad_norm": 0.12516375230993684,
"learning_rate": 7.02927982022968e-06,
"loss": 0.0377,
"num_tokens": 78699034.0,
"step": 983
},
{
"epoch": 1.2270742358078603,
"grad_norm": 0.15353076061722065,
"learning_rate": 7.023579434508596e-06,
"loss": 0.0345,
"num_tokens": 78777947.0,
"step": 984
},
{
"epoch": 1.2283218964441671,
"grad_norm": 0.1197038982406782,
"learning_rate": 7.017876286077484e-06,
"loss": 0.0557,
"num_tokens": 78859554.0,
"step": 985
},
{
"epoch": 1.229569557080474,
"grad_norm": 0.13053665449746876,
"learning_rate": 7.012170385277877e-06,
"loss": 0.0347,
"num_tokens": 78939749.0,
"step": 986
},
{
"epoch": 1.230817217716781,
"grad_norm": 0.11902958957924019,
"learning_rate": 7.006461742456297e-06,
"loss": 0.0356,
"num_tokens": 79019918.0,
"step": 987
},
{
"epoch": 1.2320648783530879,
"grad_norm": 0.11340880688683018,
"learning_rate": 7.000750367964239e-06,
"loss": 0.0379,
"num_tokens": 79099464.0,
"step": 988
},
{
"epoch": 1.2333125389893949,
"grad_norm": 0.11875742504896669,
"learning_rate": 6.99503627215815e-06,
"loss": 0.0349,
"num_tokens": 79178900.0,
"step": 989
},
{
"epoch": 1.2345601996257018,
"grad_norm": 0.11112107319089891,
"learning_rate": 6.989319465399415e-06,
"loss": 0.0368,
"num_tokens": 79258330.0,
"step": 990
},
{
"epoch": 1.2358078602620086,
"grad_norm": 0.1127792821863249,
"learning_rate": 6.983599958054331e-06,
"loss": 0.0377,
"num_tokens": 79337995.0,
"step": 991
},
{
"epoch": 1.2370555208983156,
"grad_norm": 0.115241478566488,
"learning_rate": 6.977877760494094e-06,
"loss": 0.0348,
"num_tokens": 79419296.0,
"step": 992
},
{
"epoch": 1.2383031815346226,
"grad_norm": 0.12502572470635195,
"learning_rate": 6.972152883094778e-06,
"loss": 0.0355,
"num_tokens": 79498279.0,
"step": 993
},
{
"epoch": 1.2395508421709296,
"grad_norm": 0.12440205175969768,
"learning_rate": 6.966425336237317e-06,
"loss": 0.037,
"num_tokens": 79578630.0,
"step": 994
},
{
"epoch": 1.2407985028072364,
"grad_norm": 0.1251089531609647,
"learning_rate": 6.960695130307484e-06,
"loss": 0.039,
"num_tokens": 79659951.0,
"step": 995
},
{
"epoch": 1.2420461634435433,
"grad_norm": 0.12293216602459224,
"learning_rate": 6.954962275695871e-06,
"loss": 0.0372,
"num_tokens": 79740063.0,
"step": 996
},
{
"epoch": 1.2432938240798503,
"grad_norm": 0.12394497640341638,
"learning_rate": 6.9492267827978824e-06,
"loss": 0.0374,
"num_tokens": 79821223.0,
"step": 997
},
{
"epoch": 1.244541484716157,
"grad_norm": 0.1129659823407307,
"learning_rate": 6.943488662013697e-06,
"loss": 0.0355,
"num_tokens": 79901255.0,
"step": 998
},
{
"epoch": 1.245789145352464,
"grad_norm": 0.1191576382470901,
"learning_rate": 6.93774792374826e-06,
"loss": 0.0366,
"num_tokens": 79981164.0,
"step": 999
},
{
"epoch": 1.247036805988771,
"grad_norm": 0.12363634596962561,
"learning_rate": 6.93200457841127e-06,
"loss": 0.0345,
"num_tokens": 80060941.0,
"step": 1000
},
{
"epoch": 1.248284466625078,
"grad_norm": 0.11752492427361626,
"learning_rate": 6.9262586364171455e-06,
"loss": 0.0355,
"num_tokens": 80140169.0,
"step": 1001
},
{
"epoch": 1.2495321272613849,
"grad_norm": 0.12154555071949472,
"learning_rate": 6.920510108185016e-06,
"loss": 0.0398,
"num_tokens": 80219606.0,
"step": 1002
},
{
"epoch": 1.2507797878976918,
"grad_norm": 0.13878040487943977,
"learning_rate": 6.9147590041387e-06,
"loss": 0.0393,
"num_tokens": 80300978.0,
"step": 1003
},
{
"epoch": 1.2520274485339988,
"grad_norm": 0.11693892617963454,
"learning_rate": 6.909005334706688e-06,
"loss": 0.0357,
"num_tokens": 80380717.0,
"step": 1004
},
{
"epoch": 1.2532751091703056,
"grad_norm": 0.11483400032407586,
"learning_rate": 6.903249110322123e-06,
"loss": 0.0393,
"num_tokens": 80461525.0,
"step": 1005
},
{
"epoch": 1.2545227698066126,
"grad_norm": 0.1176675268573737,
"learning_rate": 6.897490341422779e-06,
"loss": 0.0337,
"num_tokens": 80540527.0,
"step": 1006
},
{
"epoch": 1.2557704304429196,
"grad_norm": 0.12167676830713421,
"learning_rate": 6.8917290384510435e-06,
"loss": 0.0375,
"num_tokens": 80619663.0,
"step": 1007
},
{
"epoch": 1.2570180910792264,
"grad_norm": 0.12421371269284932,
"learning_rate": 6.885965211853902e-06,
"loss": 0.035,
"num_tokens": 80700557.0,
"step": 1008
},
{
"epoch": 1.2582657517155333,
"grad_norm": 0.11922957080231958,
"learning_rate": 6.8801988720829134e-06,
"loss": 0.0369,
"num_tokens": 80780369.0,
"step": 1009
},
{
"epoch": 1.2595134123518403,
"grad_norm": 0.12633099452240243,
"learning_rate": 6.874430029594194e-06,
"loss": 0.0393,
"num_tokens": 80859727.0,
"step": 1010
},
{
"epoch": 1.260761072988147,
"grad_norm": 0.10977750675097879,
"learning_rate": 6.8686586948483995e-06,
"loss": 0.0385,
"num_tokens": 80940815.0,
"step": 1011
},
{
"epoch": 1.262008733624454,
"grad_norm": 0.11227691696830043,
"learning_rate": 6.862884878310705e-06,
"loss": 0.0361,
"num_tokens": 81019729.0,
"step": 1012
},
{
"epoch": 1.263256394260761,
"grad_norm": 0.11693513455086187,
"learning_rate": 6.8571085904507825e-06,
"loss": 0.0344,
"num_tokens": 81099372.0,
"step": 1013
},
{
"epoch": 1.264504054897068,
"grad_norm": 0.11256114481937485,
"learning_rate": 6.8513298417427895e-06,
"loss": 0.0363,
"num_tokens": 81179368.0,
"step": 1014
},
{
"epoch": 1.2657517155333748,
"grad_norm": 0.11521215818800695,
"learning_rate": 6.845548642665347e-06,
"loss": 0.0342,
"num_tokens": 81257916.0,
"step": 1015
},
{
"epoch": 1.2669993761696818,
"grad_norm": 0.10695999901129719,
"learning_rate": 6.839765003701511e-06,
"loss": 0.037,
"num_tokens": 81337952.0,
"step": 1016
},
{
"epoch": 1.2682470368059888,
"grad_norm": 0.12118215671319715,
"learning_rate": 6.833978935338772e-06,
"loss": 0.0363,
"num_tokens": 81416824.0,
"step": 1017
},
{
"epoch": 1.2694946974422958,
"grad_norm": 0.10660530253287213,
"learning_rate": 6.828190448069016e-06,
"loss": 0.035,
"num_tokens": 81496879.0,
"step": 1018
},
{
"epoch": 1.2707423580786026,
"grad_norm": 0.11394529803643212,
"learning_rate": 6.822399552388523e-06,
"loss": 0.0363,
"num_tokens": 81576199.0,
"step": 1019
},
{
"epoch": 1.2719900187149096,
"grad_norm": 0.12045967866524018,
"learning_rate": 6.816606258797936e-06,
"loss": 0.0347,
"num_tokens": 81655945.0,
"step": 1020
},
{
"epoch": 1.2732376793512166,
"grad_norm": 0.12925247193109857,
"learning_rate": 6.810810577802249e-06,
"loss": 0.0403,
"num_tokens": 81736714.0,
"step": 1021
},
{
"epoch": 1.2744853399875233,
"grad_norm": 0.12389301275829777,
"learning_rate": 6.8050125199107835e-06,
"loss": 0.038,
"num_tokens": 81816119.0,
"step": 1022
},
{
"epoch": 1.2757330006238303,
"grad_norm": 0.1167932325177084,
"learning_rate": 6.799212095637169e-06,
"loss": 0.0363,
"num_tokens": 81896630.0,
"step": 1023
},
{
"epoch": 1.2769806612601373,
"grad_norm": 0.11848171062553153,
"learning_rate": 6.7934093154993285e-06,
"loss": 0.0366,
"num_tokens": 81977134.0,
"step": 1024
},
{
"epoch": 1.278228321896444,
"grad_norm": 0.12149988422451896,
"learning_rate": 6.787604190019456e-06,
"loss": 0.0349,
"num_tokens": 82057209.0,
"step": 1025
},
{
"epoch": 1.279475982532751,
"grad_norm": 0.12548818987766705,
"learning_rate": 6.781796729724001e-06,
"loss": 0.0352,
"num_tokens": 82136947.0,
"step": 1026
},
{
"epoch": 1.280723643169058,
"grad_norm": 0.13025681794055913,
"learning_rate": 6.775986945143641e-06,
"loss": 0.0366,
"num_tokens": 82217559.0,
"step": 1027
},
{
"epoch": 1.2819713038053648,
"grad_norm": 0.12011534736589388,
"learning_rate": 6.770174846813273e-06,
"loss": 0.0372,
"num_tokens": 82296722.0,
"step": 1028
},
{
"epoch": 1.2832189644416718,
"grad_norm": 0.11817718721722607,
"learning_rate": 6.7643604452719894e-06,
"loss": 0.0366,
"num_tokens": 82376898.0,
"step": 1029
},
{
"epoch": 1.2844666250779788,
"grad_norm": 0.11923904159473753,
"learning_rate": 6.758543751063055e-06,
"loss": 0.0349,
"num_tokens": 82457760.0,
"step": 1030
},
{
"epoch": 1.2857142857142856,
"grad_norm": 0.11244501964085687,
"learning_rate": 6.752724774733899e-06,
"loss": 0.0348,
"num_tokens": 82537611.0,
"step": 1031
},
{
"epoch": 1.2869619463505926,
"grad_norm": 0.115025011194697,
"learning_rate": 6.746903526836079e-06,
"loss": 0.0359,
"num_tokens": 82618396.0,
"step": 1032
},
{
"epoch": 1.2882096069868996,
"grad_norm": 0.12002180306264341,
"learning_rate": 6.741080017925279e-06,
"loss": 0.0363,
"num_tokens": 82698629.0,
"step": 1033
},
{
"epoch": 1.2894572676232066,
"grad_norm": 0.1226967210363595,
"learning_rate": 6.735254258561281e-06,
"loss": 0.0376,
"num_tokens": 82777184.0,
"step": 1034
},
{
"epoch": 1.2907049282595136,
"grad_norm": 0.10358231550003588,
"learning_rate": 6.729426259307948e-06,
"loss": 0.0333,
"num_tokens": 82856190.0,
"step": 1035
},
{
"epoch": 1.2919525888958203,
"grad_norm": 0.11578090273117891,
"learning_rate": 6.723596030733204e-06,
"loss": 0.0347,
"num_tokens": 82936794.0,
"step": 1036
},
{
"epoch": 1.2932002495321273,
"grad_norm": 0.10081923729427175,
"learning_rate": 6.717763583409016e-06,
"loss": 0.0346,
"num_tokens": 83016097.0,
"step": 1037
},
{
"epoch": 1.2944479101684343,
"grad_norm": 0.12233503099838965,
"learning_rate": 6.711928927911373e-06,
"loss": 0.0376,
"num_tokens": 83095632.0,
"step": 1038
},
{
"epoch": 1.295695570804741,
"grad_norm": 0.11462973160105773,
"learning_rate": 6.7060920748202674e-06,
"loss": 0.0369,
"num_tokens": 83177302.0,
"step": 1039
},
{
"epoch": 1.296943231441048,
"grad_norm": 0.11946539311970528,
"learning_rate": 6.700253034719684e-06,
"loss": 0.0386,
"num_tokens": 83258689.0,
"step": 1040
},
{
"epoch": 1.298190892077355,
"grad_norm": 0.12359336236001878,
"learning_rate": 6.694411818197561e-06,
"loss": 0.0351,
"num_tokens": 83338185.0,
"step": 1041
},
{
"epoch": 1.2994385527136618,
"grad_norm": 0.1139374127755781,
"learning_rate": 6.688568435845792e-06,
"loss": 0.0347,
"num_tokens": 83417497.0,
"step": 1042
},
{
"epoch": 1.3006862133499688,
"grad_norm": 0.11205952042329616,
"learning_rate": 6.682722898260195e-06,
"loss": 0.0378,
"num_tokens": 83498065.0,
"step": 1043
},
{
"epoch": 1.3019338739862758,
"grad_norm": 0.1083104938066509,
"learning_rate": 6.676875216040498e-06,
"loss": 0.0339,
"num_tokens": 83577372.0,
"step": 1044
},
{
"epoch": 1.3031815346225826,
"grad_norm": 0.10828476825816279,
"learning_rate": 6.671025399790315e-06,
"loss": 0.0385,
"num_tokens": 83657938.0,
"step": 1045
},
{
"epoch": 1.3044291952588896,
"grad_norm": 0.11019045987410468,
"learning_rate": 6.66517346011713e-06,
"loss": 0.0365,
"num_tokens": 83738524.0,
"step": 1046
},
{
"epoch": 1.3056768558951966,
"grad_norm": 0.12276236466671721,
"learning_rate": 6.659319407632282e-06,
"loss": 0.0398,
"num_tokens": 83818548.0,
"step": 1047
},
{
"epoch": 1.3069245165315033,
"grad_norm": 0.1263772382661588,
"learning_rate": 6.653463252950933e-06,
"loss": 0.0378,
"num_tokens": 83898937.0,
"step": 1048
},
{
"epoch": 1.3081721771678103,
"grad_norm": 0.12027367867659687,
"learning_rate": 6.647605006692066e-06,
"loss": 0.037,
"num_tokens": 83979503.0,
"step": 1049
},
{
"epoch": 1.3094198378041173,
"grad_norm": 0.11001899385006926,
"learning_rate": 6.641744679478448e-06,
"loss": 0.0352,
"num_tokens": 84058957.0,
"step": 1050
},
{
"epoch": 1.310667498440424,
"grad_norm": 0.11280548803095132,
"learning_rate": 6.635882281936625e-06,
"loss": 0.0354,
"num_tokens": 84138073.0,
"step": 1051
},
{
"epoch": 1.311915159076731,
"grad_norm": 0.11780560554733235,
"learning_rate": 6.630017824696898e-06,
"loss": 0.0347,
"num_tokens": 84218047.0,
"step": 1052
},
{
"epoch": 1.313162819713038,
"grad_norm": 0.12058333417344305,
"learning_rate": 6.624151318393298e-06,
"loss": 0.0373,
"num_tokens": 84298783.0,
"step": 1053
},
{
"epoch": 1.314410480349345,
"grad_norm": 0.11911044876914684,
"learning_rate": 6.618282773663576e-06,
"loss": 0.0356,
"num_tokens": 84378667.0,
"step": 1054
},
{
"epoch": 1.315658140985652,
"grad_norm": 0.11647453576092717,
"learning_rate": 6.612412201149175e-06,
"loss": 0.037,
"num_tokens": 84459833.0,
"step": 1055
},
{
"epoch": 1.3169058016219588,
"grad_norm": 0.1381556034687667,
"learning_rate": 6.6065396114952195e-06,
"loss": 0.0345,
"num_tokens": 84538415.0,
"step": 1056
},
{
"epoch": 1.3181534622582658,
"grad_norm": 0.11291795464976989,
"learning_rate": 6.600665015350487e-06,
"loss": 0.0358,
"num_tokens": 84618356.0,
"step": 1057
},
{
"epoch": 1.3194011228945728,
"grad_norm": 0.11428531136103644,
"learning_rate": 6.594788423367399e-06,
"loss": 0.0379,
"num_tokens": 84699392.0,
"step": 1058
},
{
"epoch": 1.3206487835308796,
"grad_norm": 0.12559230820363362,
"learning_rate": 6.588909846201992e-06,
"loss": 0.0357,
"num_tokens": 84780174.0,
"step": 1059
},
{
"epoch": 1.3218964441671865,
"grad_norm": 0.11157729667744716,
"learning_rate": 6.583029294513902e-06,
"loss": 0.0374,
"num_tokens": 84861023.0,
"step": 1060
},
{
"epoch": 1.3231441048034935,
"grad_norm": 0.10532360001946126,
"learning_rate": 6.577146778966347e-06,
"loss": 0.0347,
"num_tokens": 84941536.0,
"step": 1061
},
{
"epoch": 1.3243917654398003,
"grad_norm": 0.11380661535983909,
"learning_rate": 6.571262310226108e-06,
"loss": 0.0361,
"num_tokens": 85021693.0,
"step": 1062
},
{
"epoch": 1.3256394260761073,
"grad_norm": 0.11751173019320106,
"learning_rate": 6.565375898963503e-06,
"loss": 0.0367,
"num_tokens": 85101820.0,
"step": 1063
},
{
"epoch": 1.3268870867124143,
"grad_norm": 0.11556897548657508,
"learning_rate": 6.5594875558523755e-06,
"loss": 0.0366,
"num_tokens": 85182245.0,
"step": 1064
},
{
"epoch": 1.328134747348721,
"grad_norm": 0.12290166180442655,
"learning_rate": 6.553597291570071e-06,
"loss": 0.034,
"num_tokens": 85261741.0,
"step": 1065
},
{
"epoch": 1.329382407985028,
"grad_norm": 0.11300496986222,
"learning_rate": 6.547705116797422e-06,
"loss": 0.0385,
"num_tokens": 85341243.0,
"step": 1066
},
{
"epoch": 1.330630068621335,
"grad_norm": 0.12054222224762111,
"learning_rate": 6.5418110422187156e-06,
"loss": 0.037,
"num_tokens": 85421844.0,
"step": 1067
},
{
"epoch": 1.3318777292576418,
"grad_norm": 0.11600190728796984,
"learning_rate": 6.535915078521697e-06,
"loss": 0.0364,
"num_tokens": 85500760.0,
"step": 1068
},
{
"epoch": 1.3331253898939488,
"grad_norm": 0.12283600030048854,
"learning_rate": 6.530017236397529e-06,
"loss": 0.0365,
"num_tokens": 85580795.0,
"step": 1069
},
{
"epoch": 1.3343730505302558,
"grad_norm": 0.11450545067136071,
"learning_rate": 6.52411752654078e-06,
"loss": 0.0347,
"num_tokens": 85661620.0,
"step": 1070
},
{
"epoch": 1.3356207111665626,
"grad_norm": 0.1114808904800754,
"learning_rate": 6.518215959649409e-06,
"loss": 0.0363,
"num_tokens": 85742723.0,
"step": 1071
},
{
"epoch": 1.3368683718028695,
"grad_norm": 0.12107476538251831,
"learning_rate": 6.512312546424739e-06,
"loss": 0.0332,
"num_tokens": 85821771.0,
"step": 1072
},
{
"epoch": 1.3381160324391765,
"grad_norm": 0.11709745580529317,
"learning_rate": 6.506407297571445e-06,
"loss": 0.0385,
"num_tokens": 85901823.0,
"step": 1073
},
{
"epoch": 1.3393636930754835,
"grad_norm": 0.12275293011077212,
"learning_rate": 6.500500223797526e-06,
"loss": 0.0357,
"num_tokens": 85981915.0,
"step": 1074
},
{
"epoch": 1.3406113537117905,
"grad_norm": 0.11678308605913426,
"learning_rate": 6.494591335814292e-06,
"loss": 0.0329,
"num_tokens": 86061069.0,
"step": 1075
},
{
"epoch": 1.3418590143480973,
"grad_norm": 0.11176605025879648,
"learning_rate": 6.488680644336344e-06,
"loss": 0.0379,
"num_tokens": 86141683.0,
"step": 1076
},
{
"epoch": 1.3431066749844043,
"grad_norm": 0.11363151432726346,
"learning_rate": 6.482768160081553e-06,
"loss": 0.0348,
"num_tokens": 86222086.0,
"step": 1077
},
{
"epoch": 1.3443543356207113,
"grad_norm": 0.11445806649743549,
"learning_rate": 6.4768538937710364e-06,
"loss": 0.0382,
"num_tokens": 86302703.0,
"step": 1078
},
{
"epoch": 1.345601996257018,
"grad_norm": 0.13464785228692652,
"learning_rate": 6.470937856129152e-06,
"loss": 0.0347,
"num_tokens": 86382450.0,
"step": 1079
},
{
"epoch": 1.346849656893325,
"grad_norm": 0.1280907328510425,
"learning_rate": 6.465020057883461e-06,
"loss": 0.0337,
"num_tokens": 86462384.0,
"step": 1080
},
{
"epoch": 1.348097317529632,
"grad_norm": 0.10759760692353552,
"learning_rate": 6.45910050976472e-06,
"loss": 0.0343,
"num_tokens": 86541989.0,
"step": 1081
},
{
"epoch": 1.3493449781659388,
"grad_norm": 0.11223790651092914,
"learning_rate": 6.45317922250686e-06,
"loss": 0.0383,
"num_tokens": 86623690.0,
"step": 1082
},
{
"epoch": 1.3505926388022458,
"grad_norm": 0.11168721545726104,
"learning_rate": 6.447256206846963e-06,
"loss": 0.0356,
"num_tokens": 86703459.0,
"step": 1083
},
{
"epoch": 1.3518402994385528,
"grad_norm": 0.11578985731239276,
"learning_rate": 6.44133147352525e-06,
"loss": 0.0351,
"num_tokens": 86787758.0,
"step": 1084
},
{
"epoch": 1.3530879600748595,
"grad_norm": 0.11806982316043722,
"learning_rate": 6.4354050332850505e-06,
"loss": 0.0351,
"num_tokens": 86868231.0,
"step": 1085
},
{
"epoch": 1.3543356207111665,
"grad_norm": 0.13776869218741045,
"learning_rate": 6.429476896872793e-06,
"loss": 0.0355,
"num_tokens": 86948392.0,
"step": 1086
},
{
"epoch": 1.3555832813474735,
"grad_norm": 0.1143170779368215,
"learning_rate": 6.4235470750379794e-06,
"loss": 0.0352,
"num_tokens": 87028370.0,
"step": 1087
},
{
"epoch": 1.3568309419837803,
"grad_norm": 0.10825819045832565,
"learning_rate": 6.4176155785331705e-06,
"loss": 0.0363,
"num_tokens": 87109397.0,
"step": 1088
},
{
"epoch": 1.3580786026200873,
"grad_norm": 0.12553276062809313,
"learning_rate": 6.411682418113961e-06,
"loss": 0.0401,
"num_tokens": 87191884.0,
"step": 1089
},
{
"epoch": 1.3593262632563943,
"grad_norm": 0.1190623682061882,
"learning_rate": 6.405747604538965e-06,
"loss": 0.036,
"num_tokens": 87272846.0,
"step": 1090
},
{
"epoch": 1.3605739238927013,
"grad_norm": 0.13120836782794848,
"learning_rate": 6.399811148569794e-06,
"loss": 0.0387,
"num_tokens": 87354474.0,
"step": 1091
},
{
"epoch": 1.361821584529008,
"grad_norm": 0.12749448720764697,
"learning_rate": 6.393873060971036e-06,
"loss": 0.0354,
"num_tokens": 87434034.0,
"step": 1092
},
{
"epoch": 1.363069245165315,
"grad_norm": 0.11265346187198873,
"learning_rate": 6.3879333525102375e-06,
"loss": 0.0383,
"num_tokens": 87514380.0,
"step": 1093
},
{
"epoch": 1.364316905801622,
"grad_norm": 0.10396098070933056,
"learning_rate": 6.381992033957889e-06,
"loss": 0.0374,
"num_tokens": 87594266.0,
"step": 1094
},
{
"epoch": 1.365564566437929,
"grad_norm": 0.12007723700394293,
"learning_rate": 6.376049116087393e-06,
"loss": 0.0358,
"num_tokens": 87675016.0,
"step": 1095
},
{
"epoch": 1.3668122270742358,
"grad_norm": 0.11088738920455023,
"learning_rate": 6.370104609675058e-06,
"loss": 0.0365,
"num_tokens": 87755275.0,
"step": 1096
},
{
"epoch": 1.3680598877105428,
"grad_norm": 0.12947725342366095,
"learning_rate": 6.364158525500069e-06,
"loss": 0.0386,
"num_tokens": 87835968.0,
"step": 1097
},
{
"epoch": 1.3693075483468498,
"grad_norm": 0.10743601396658202,
"learning_rate": 6.358210874344476e-06,
"loss": 0.0359,
"num_tokens": 87916756.0,
"step": 1098
},
{
"epoch": 1.3705552089831565,
"grad_norm": 0.10538537489377399,
"learning_rate": 6.352261666993167e-06,
"loss": 0.0344,
"num_tokens": 87997097.0,
"step": 1099
},
{
"epoch": 1.3718028696194635,
"grad_norm": 0.10920904020216766,
"learning_rate": 6.346310914233854e-06,
"loss": 0.0337,
"num_tokens": 88075564.0,
"step": 1100
},
{
"epoch": 1.3730505302557705,
"grad_norm": 0.12064678113049096,
"learning_rate": 6.340358626857049e-06,
"loss": 0.0374,
"num_tokens": 88155637.0,
"step": 1101
},
{
"epoch": 1.3742981908920773,
"grad_norm": 0.11366242121995487,
"learning_rate": 6.334404815656049e-06,
"loss": 0.034,
"num_tokens": 88234184.0,
"step": 1102
},
{
"epoch": 1.3755458515283843,
"grad_norm": 0.10541533458396993,
"learning_rate": 6.328449491426914e-06,
"loss": 0.0334,
"num_tokens": 88313988.0,
"step": 1103
},
{
"epoch": 1.3767935121646913,
"grad_norm": 0.1126212487247659,
"learning_rate": 6.322492664968446e-06,
"loss": 0.0361,
"num_tokens": 88394035.0,
"step": 1104
},
{
"epoch": 1.378041172800998,
"grad_norm": 0.10415858729571198,
"learning_rate": 6.316534347082173e-06,
"loss": 0.0353,
"num_tokens": 88473457.0,
"step": 1105
},
{
"epoch": 1.379288833437305,
"grad_norm": 0.11311924737695511,
"learning_rate": 6.310574548572325e-06,
"loss": 0.0396,
"num_tokens": 88554028.0,
"step": 1106
},
{
"epoch": 1.380536494073612,
"grad_norm": 0.11641164289365231,
"learning_rate": 6.304613280245816e-06,
"loss": 0.0351,
"num_tokens": 88633282.0,
"step": 1107
},
{
"epoch": 1.3817841547099188,
"grad_norm": 0.1238656036368708,
"learning_rate": 6.298650552912233e-06,
"loss": 0.0369,
"num_tokens": 88713446.0,
"step": 1108
},
{
"epoch": 1.3830318153462258,
"grad_norm": 0.10828413683130844,
"learning_rate": 6.292686377383797e-06,
"loss": 0.0366,
"num_tokens": 88793591.0,
"step": 1109
},
{
"epoch": 1.3842794759825328,
"grad_norm": 0.11180313140533374,
"learning_rate": 6.286720764475365e-06,
"loss": 0.0355,
"num_tokens": 88872762.0,
"step": 1110
},
{
"epoch": 1.3855271366188397,
"grad_norm": 0.11370487600194723,
"learning_rate": 6.280753725004395e-06,
"loss": 0.0362,
"num_tokens": 88955457.0,
"step": 1111
},
{
"epoch": 1.3867747972551465,
"grad_norm": 0.09617328020740629,
"learning_rate": 6.274785269790932e-06,
"loss": 0.0336,
"num_tokens": 89035406.0,
"step": 1112
},
{
"epoch": 1.3880224578914535,
"grad_norm": 0.11176348044222259,
"learning_rate": 6.268815409657592e-06,
"loss": 0.0342,
"num_tokens": 89116507.0,
"step": 1113
},
{
"epoch": 1.3892701185277605,
"grad_norm": 0.11325296873554795,
"learning_rate": 6.262844155429533e-06,
"loss": 0.0374,
"num_tokens": 89199614.0,
"step": 1114
},
{
"epoch": 1.3905177791640675,
"grad_norm": 0.11704030265578563,
"learning_rate": 6.256871517934445e-06,
"loss": 0.0357,
"num_tokens": 89279144.0,
"step": 1115
},
{
"epoch": 1.3917654398003743,
"grad_norm": 0.11952799829012306,
"learning_rate": 6.2508975080025254e-06,
"loss": 0.0353,
"num_tokens": 89359708.0,
"step": 1116
},
{
"epoch": 1.3930131004366813,
"grad_norm": 0.11502731870682785,
"learning_rate": 6.24492213646646e-06,
"loss": 0.0368,
"num_tokens": 89439326.0,
"step": 1117
},
{
"epoch": 1.3942607610729882,
"grad_norm": 0.12252081179674802,
"learning_rate": 6.2389454141614024e-06,
"loss": 0.0345,
"num_tokens": 89518867.0,
"step": 1118
},
{
"epoch": 1.395508421709295,
"grad_norm": 0.18070152071501802,
"learning_rate": 6.232967351924959e-06,
"loss": 0.0355,
"num_tokens": 89598243.0,
"step": 1119
},
{
"epoch": 1.396756082345602,
"grad_norm": 0.1228366552270244,
"learning_rate": 6.226987960597161e-06,
"loss": 0.0363,
"num_tokens": 89678232.0,
"step": 1120
},
{
"epoch": 1.398003742981909,
"grad_norm": 0.13722290208459134,
"learning_rate": 6.22100725102045e-06,
"loss": 0.0347,
"num_tokens": 89758532.0,
"step": 1121
},
{
"epoch": 1.3992514036182158,
"grad_norm": 0.11700538827862798,
"learning_rate": 6.215025234039667e-06,
"loss": 0.0371,
"num_tokens": 89838761.0,
"step": 1122
},
{
"epoch": 1.4004990642545228,
"grad_norm": 0.11749579319296499,
"learning_rate": 6.209041920502012e-06,
"loss": 0.0362,
"num_tokens": 89919068.0,
"step": 1123
},
{
"epoch": 1.4017467248908297,
"grad_norm": 0.1239771498120876,
"learning_rate": 6.203057321257041e-06,
"loss": 0.0347,
"num_tokens": 89999454.0,
"step": 1124
},
{
"epoch": 1.4029943855271365,
"grad_norm": 0.10457854348428894,
"learning_rate": 6.197071447156643e-06,
"loss": 0.0369,
"num_tokens": 90079489.0,
"step": 1125
},
{
"epoch": 1.4042420461634435,
"grad_norm": 0.12132963451268676,
"learning_rate": 6.191084309055018e-06,
"loss": 0.0359,
"num_tokens": 90160079.0,
"step": 1126
},
{
"epoch": 1.4054897067997505,
"grad_norm": 0.1149651839306163,
"learning_rate": 6.185095917808654e-06,
"loss": 0.0367,
"num_tokens": 90239067.0,
"step": 1127
},
{
"epoch": 1.4067373674360573,
"grad_norm": 0.11185660034071362,
"learning_rate": 6.179106284276315e-06,
"loss": 0.0345,
"num_tokens": 90320588.0,
"step": 1128
},
{
"epoch": 1.4079850280723643,
"grad_norm": 0.11791295713574668,
"learning_rate": 6.173115419319019e-06,
"loss": 0.0357,
"num_tokens": 90400737.0,
"step": 1129
},
{
"epoch": 1.4092326887086712,
"grad_norm": 0.12005035519918307,
"learning_rate": 6.167123333800014e-06,
"loss": 0.0372,
"num_tokens": 90481553.0,
"step": 1130
},
{
"epoch": 1.4104803493449782,
"grad_norm": 0.11280371649250795,
"learning_rate": 6.161130038584762e-06,
"loss": 0.0359,
"num_tokens": 90560492.0,
"step": 1131
},
{
"epoch": 1.4117280099812852,
"grad_norm": 0.11483105002319907,
"learning_rate": 6.155135544540917e-06,
"loss": 0.0329,
"num_tokens": 90640526.0,
"step": 1132
},
{
"epoch": 1.412975670617592,
"grad_norm": 0.10610511431168547,
"learning_rate": 6.1491398625383116e-06,
"loss": 0.0365,
"num_tokens": 90720931.0,
"step": 1133
},
{
"epoch": 1.414223331253899,
"grad_norm": 0.11403045781580023,
"learning_rate": 6.143143003448929e-06,
"loss": 0.0334,
"num_tokens": 90799876.0,
"step": 1134
},
{
"epoch": 1.415470991890206,
"grad_norm": 0.1128226484451764,
"learning_rate": 6.1371449781468835e-06,
"loss": 0.036,
"num_tokens": 90879955.0,
"step": 1135
},
{
"epoch": 1.4167186525265127,
"grad_norm": 0.1339643064995737,
"learning_rate": 6.131145797508414e-06,
"loss": 0.0362,
"num_tokens": 90960140.0,
"step": 1136
},
{
"epoch": 1.4179663131628197,
"grad_norm": 0.11977396725939997,
"learning_rate": 6.125145472411845e-06,
"loss": 0.0408,
"num_tokens": 91040880.0,
"step": 1137
},
{
"epoch": 1.4192139737991267,
"grad_norm": 0.1304144374986512,
"learning_rate": 6.1191440137375775e-06,
"loss": 0.0356,
"num_tokens": 91120578.0,
"step": 1138
},
{
"epoch": 1.4204616344354335,
"grad_norm": 0.10952816992518201,
"learning_rate": 6.113141432368075e-06,
"loss": 0.0342,
"num_tokens": 91199968.0,
"step": 1139
},
{
"epoch": 1.4217092950717405,
"grad_norm": 0.11923470924014716,
"learning_rate": 6.107137739187827e-06,
"loss": 0.0382,
"num_tokens": 91280534.0,
"step": 1140
},
{
"epoch": 1.4229569557080475,
"grad_norm": 0.13414248321277958,
"learning_rate": 6.101132945083347e-06,
"loss": 0.0353,
"num_tokens": 91359718.0,
"step": 1141
},
{
"epoch": 1.4242046163443542,
"grad_norm": 0.11441510307386335,
"learning_rate": 6.095127060943141e-06,
"loss": 0.0355,
"num_tokens": 91439021.0,
"step": 1142
},
{
"epoch": 1.4254522769806612,
"grad_norm": 0.11361047483263015,
"learning_rate": 6.089120097657692e-06,
"loss": 0.0399,
"num_tokens": 91520278.0,
"step": 1143
},
{
"epoch": 1.4266999376169682,
"grad_norm": 0.11528784116675486,
"learning_rate": 6.083112066119439e-06,
"loss": 0.0379,
"num_tokens": 91600857.0,
"step": 1144
},
{
"epoch": 1.427947598253275,
"grad_norm": 0.12339012360670368,
"learning_rate": 6.077102977222763e-06,
"loss": 0.0363,
"num_tokens": 91681068.0,
"step": 1145
},
{
"epoch": 1.429195258889582,
"grad_norm": 0.11822266945602426,
"learning_rate": 6.0710928418639515e-06,
"loss": 0.0369,
"num_tokens": 91762429.0,
"step": 1146
},
{
"epoch": 1.430442919525889,
"grad_norm": 0.10934047203309372,
"learning_rate": 6.065081670941204e-06,
"loss": 0.0343,
"num_tokens": 91842442.0,
"step": 1147
},
{
"epoch": 1.4316905801621957,
"grad_norm": 0.11563946305646088,
"learning_rate": 6.059069475354586e-06,
"loss": 0.0371,
"num_tokens": 91921815.0,
"step": 1148
},
{
"epoch": 1.4329382407985027,
"grad_norm": 0.13902235644402097,
"learning_rate": 6.0530562660060276e-06,
"loss": 0.035,
"num_tokens": 92001388.0,
"step": 1149
},
{
"epoch": 1.4341859014348097,
"grad_norm": 0.11365381032155394,
"learning_rate": 6.0470420537992915e-06,
"loss": 0.0361,
"num_tokens": 92080682.0,
"step": 1150
},
{
"epoch": 1.4354335620711167,
"grad_norm": 0.1215204822957711,
"learning_rate": 6.041026849639966e-06,
"loss": 0.0367,
"num_tokens": 92160919.0,
"step": 1151
},
{
"epoch": 1.4366812227074237,
"grad_norm": 0.10228268537749387,
"learning_rate": 6.035010664435434e-06,
"loss": 0.0361,
"num_tokens": 92241085.0,
"step": 1152
},
{
"epoch": 1.4379288833437305,
"grad_norm": 0.11290899169878665,
"learning_rate": 6.0289935090948536e-06,
"loss": 0.0339,
"num_tokens": 92320066.0,
"step": 1153
},
{
"epoch": 1.4391765439800375,
"grad_norm": 0.12929287703536887,
"learning_rate": 6.022975394529149e-06,
"loss": 0.0344,
"num_tokens": 92399898.0,
"step": 1154
},
{
"epoch": 1.4404242046163445,
"grad_norm": 0.10350159451251419,
"learning_rate": 6.016956331650984e-06,
"loss": 0.0338,
"num_tokens": 92479871.0,
"step": 1155
},
{
"epoch": 1.4416718652526512,
"grad_norm": 0.11525710135617265,
"learning_rate": 6.010936331374735e-06,
"loss": 0.0359,
"num_tokens": 92560206.0,
"step": 1156
},
{
"epoch": 1.4429195258889582,
"grad_norm": 0.11743945426514996,
"learning_rate": 6.00491540461648e-06,
"loss": 0.034,
"num_tokens": 92639628.0,
"step": 1157
},
{
"epoch": 1.4441671865252652,
"grad_norm": 0.10670559077717189,
"learning_rate": 5.998893562293986e-06,
"loss": 0.0377,
"num_tokens": 92719681.0,
"step": 1158
},
{
"epoch": 1.445414847161572,
"grad_norm": 0.11601172863515272,
"learning_rate": 5.992870815326667e-06,
"loss": 0.0366,
"num_tokens": 92799584.0,
"step": 1159
},
{
"epoch": 1.446662507797879,
"grad_norm": 0.11460386722320819,
"learning_rate": 5.986847174635586e-06,
"loss": 0.0332,
"num_tokens": 92879565.0,
"step": 1160
},
{
"epoch": 1.447910168434186,
"grad_norm": 0.10697055130942412,
"learning_rate": 5.980822651143426e-06,
"loss": 0.0365,
"num_tokens": 92959785.0,
"step": 1161
},
{
"epoch": 1.4491578290704927,
"grad_norm": 0.12723407944880333,
"learning_rate": 5.9747972557744675e-06,
"loss": 0.0382,
"num_tokens": 93040108.0,
"step": 1162
},
{
"epoch": 1.4504054897067997,
"grad_norm": 0.11841204743560867,
"learning_rate": 5.968770999454572e-06,
"loss": 0.036,
"num_tokens": 93121058.0,
"step": 1163
},
{
"epoch": 1.4516531503431067,
"grad_norm": 0.11569156196248792,
"learning_rate": 5.962743893111165e-06,
"loss": 0.0353,
"num_tokens": 93200814.0,
"step": 1164
},
{
"epoch": 1.4529008109794135,
"grad_norm": 0.10777418932429268,
"learning_rate": 5.956715947673212e-06,
"loss": 0.0348,
"num_tokens": 93281213.0,
"step": 1165
},
{
"epoch": 1.4541484716157205,
"grad_norm": 0.10994616619439264,
"learning_rate": 5.950687174071201e-06,
"loss": 0.0356,
"num_tokens": 93360403.0,
"step": 1166
},
{
"epoch": 1.4553961322520275,
"grad_norm": 0.10558957609563456,
"learning_rate": 5.944657583237119e-06,
"loss": 0.0359,
"num_tokens": 93440112.0,
"step": 1167
},
{
"epoch": 1.4566437928883345,
"grad_norm": 0.10340454479634237,
"learning_rate": 5.938627186104438e-06,
"loss": 0.0341,
"num_tokens": 93519997.0,
"step": 1168
},
{
"epoch": 1.4578914535246412,
"grad_norm": 0.11216617986802824,
"learning_rate": 5.932595993608092e-06,
"loss": 0.0367,
"num_tokens": 93601531.0,
"step": 1169
},
{
"epoch": 1.4591391141609482,
"grad_norm": 0.11118613571047507,
"learning_rate": 5.926564016684453e-06,
"loss": 0.0369,
"num_tokens": 93684506.0,
"step": 1170
},
{
"epoch": 1.4603867747972552,
"grad_norm": 0.11180759900830808,
"learning_rate": 5.920531266271317e-06,
"loss": 0.0345,
"num_tokens": 93765144.0,
"step": 1171
},
{
"epoch": 1.4616344354335622,
"grad_norm": 0.10849133876831726,
"learning_rate": 5.9144977533078885e-06,
"loss": 0.0338,
"num_tokens": 93844866.0,
"step": 1172
},
{
"epoch": 1.462882096069869,
"grad_norm": 0.10146908320691236,
"learning_rate": 5.90846348873475e-06,
"loss": 0.0376,
"num_tokens": 93925678.0,
"step": 1173
},
{
"epoch": 1.464129756706176,
"grad_norm": 0.11406035504111364,
"learning_rate": 5.902428483493845e-06,
"loss": 0.0354,
"num_tokens": 94005650.0,
"step": 1174
},
{
"epoch": 1.465377417342483,
"grad_norm": 0.10629587740727439,
"learning_rate": 5.89639274852846e-06,
"loss": 0.0338,
"num_tokens": 94085102.0,
"step": 1175
},
{
"epoch": 1.4666250779787897,
"grad_norm": 0.10967958676982019,
"learning_rate": 5.890356294783213e-06,
"loss": 0.0346,
"num_tokens": 94163932.0,
"step": 1176
},
{
"epoch": 1.4678727386150967,
"grad_norm": 0.1110479738340064,
"learning_rate": 5.8843191332040125e-06,
"loss": 0.0366,
"num_tokens": 94244813.0,
"step": 1177
},
{
"epoch": 1.4691203992514037,
"grad_norm": 0.11487895996237814,
"learning_rate": 5.878281274738061e-06,
"loss": 0.044,
"num_tokens": 94326986.0,
"step": 1178
},
{
"epoch": 1.4703680598877105,
"grad_norm": 0.12426013311708847,
"learning_rate": 5.872242730333822e-06,
"loss": 0.0373,
"num_tokens": 94407345.0,
"step": 1179
},
{
"epoch": 1.4716157205240175,
"grad_norm": 0.10392684893138183,
"learning_rate": 5.866203510940998e-06,
"loss": 0.0341,
"num_tokens": 94486495.0,
"step": 1180
},
{
"epoch": 1.4728633811603244,
"grad_norm": 0.11624920561782613,
"learning_rate": 5.860163627510521e-06,
"loss": 0.0354,
"num_tokens": 94566567.0,
"step": 1181
},
{
"epoch": 1.4741110417966312,
"grad_norm": 0.11632421080295033,
"learning_rate": 5.854123090994524e-06,
"loss": 0.0351,
"num_tokens": 94646328.0,
"step": 1182
},
{
"epoch": 1.4753587024329382,
"grad_norm": 0.1085474029571278,
"learning_rate": 5.848081912346329e-06,
"loss": 0.0357,
"num_tokens": 94726254.0,
"step": 1183
},
{
"epoch": 1.4766063630692452,
"grad_norm": 0.13928705664750285,
"learning_rate": 5.842040102520416e-06,
"loss": 0.0345,
"num_tokens": 94806492.0,
"step": 1184
},
{
"epoch": 1.477854023705552,
"grad_norm": 0.11956422118762197,
"learning_rate": 5.8359976724724146e-06,
"loss": 0.0373,
"num_tokens": 94888343.0,
"step": 1185
},
{
"epoch": 1.479101684341859,
"grad_norm": 0.10678348906652803,
"learning_rate": 5.829954633159073e-06,
"loss": 0.0365,
"num_tokens": 94968750.0,
"step": 1186
},
{
"epoch": 1.480349344978166,
"grad_norm": 0.10950504983347997,
"learning_rate": 5.823910995538251e-06,
"loss": 0.0363,
"num_tokens": 95048007.0,
"step": 1187
},
{
"epoch": 1.481597005614473,
"grad_norm": 0.10871513034226521,
"learning_rate": 5.8178667705688895e-06,
"loss": 0.034,
"num_tokens": 95127214.0,
"step": 1188
},
{
"epoch": 1.4828446662507797,
"grad_norm": 0.11813024759210294,
"learning_rate": 5.811821969210995e-06,
"loss": 0.0378,
"num_tokens": 95207769.0,
"step": 1189
},
{
"epoch": 1.4840923268870867,
"grad_norm": 0.12334121149788137,
"learning_rate": 5.8057766024256205e-06,
"loss": 0.0339,
"num_tokens": 95286918.0,
"step": 1190
},
{
"epoch": 1.4853399875233937,
"grad_norm": 0.11137548828632113,
"learning_rate": 5.799730681174842e-06,
"loss": 0.0345,
"num_tokens": 95367783.0,
"step": 1191
},
{
"epoch": 1.4865876481597007,
"grad_norm": 0.11258420176536157,
"learning_rate": 5.793684216421744e-06,
"loss": 0.0379,
"num_tokens": 95449134.0,
"step": 1192
},
{
"epoch": 1.4878353087960074,
"grad_norm": 0.1283974468707094,
"learning_rate": 5.787637219130392e-06,
"loss": 0.0373,
"num_tokens": 95528492.0,
"step": 1193
},
{
"epoch": 1.4890829694323144,
"grad_norm": 0.10883979183962499,
"learning_rate": 5.781589700265823e-06,
"loss": 0.0343,
"num_tokens": 95608208.0,
"step": 1194
},
{
"epoch": 1.4903306300686214,
"grad_norm": 0.10270640891807907,
"learning_rate": 5.7755416707940135e-06,
"loss": 0.0348,
"num_tokens": 95687611.0,
"step": 1195
},
{
"epoch": 1.4915782907049282,
"grad_norm": 0.12695450443859627,
"learning_rate": 5.76949314168187e-06,
"loss": 0.0359,
"num_tokens": 95767108.0,
"step": 1196
},
{
"epoch": 1.4928259513412352,
"grad_norm": 0.11114393977783613,
"learning_rate": 5.763444123897206e-06,
"loss": 0.0345,
"num_tokens": 95846696.0,
"step": 1197
},
{
"epoch": 1.4940736119775422,
"grad_norm": 0.10051611126921174,
"learning_rate": 5.757394628408716e-06,
"loss": 0.0355,
"num_tokens": 95927423.0,
"step": 1198
},
{
"epoch": 1.495321272613849,
"grad_norm": 0.10698918904729461,
"learning_rate": 5.7513446661859664e-06,
"loss": 0.0334,
"num_tokens": 96008401.0,
"step": 1199
},
{
"epoch": 1.496568933250156,
"grad_norm": 0.11515052950318992,
"learning_rate": 5.7452942481993655e-06,
"loss": 0.0329,
"num_tokens": 96087128.0,
"step": 1200
},
{
"epoch": 1.497816593886463,
"grad_norm": 0.10667262101407343,
"learning_rate": 5.739243385420151e-06,
"loss": 0.0358,
"num_tokens": 96167367.0,
"step": 1201
},
{
"epoch": 1.4990642545227697,
"grad_norm": 0.12405579945874308,
"learning_rate": 5.7331920888203655e-06,
"loss": 0.0346,
"num_tokens": 96246922.0,
"step": 1202
},
{
"epoch": 1.5003119151590767,
"grad_norm": 0.10489860643653304,
"learning_rate": 5.727140369372838e-06,
"loss": 0.0376,
"num_tokens": 96327807.0,
"step": 1203
},
{
"epoch": 1.5015595757953837,
"grad_norm": 0.12443731067183812,
"learning_rate": 5.721088238051168e-06,
"loss": 0.0356,
"num_tokens": 96408288.0,
"step": 1204
},
{
"epoch": 1.5028072364316905,
"grad_norm": 0.10422931405567512,
"learning_rate": 5.715035705829696e-06,
"loss": 0.0378,
"num_tokens": 96488747.0,
"step": 1205
},
{
"epoch": 1.5040548970679977,
"grad_norm": 0.11583240842588108,
"learning_rate": 5.708982783683492e-06,
"loss": 0.0351,
"num_tokens": 96567394.0,
"step": 1206
},
{
"epoch": 1.5053025577043044,
"grad_norm": 0.11679015463888166,
"learning_rate": 5.7029294825883365e-06,
"loss": 0.035,
"num_tokens": 96646566.0,
"step": 1207
},
{
"epoch": 1.5065502183406112,
"grad_norm": 0.11437606283844103,
"learning_rate": 5.696875813520691e-06,
"loss": 0.0392,
"num_tokens": 96727492.0,
"step": 1208
},
{
"epoch": 1.5077978789769184,
"grad_norm": 0.11691760587370684,
"learning_rate": 5.69082178745769e-06,
"loss": 0.0352,
"num_tokens": 96807931.0,
"step": 1209
},
{
"epoch": 1.5090455396132252,
"grad_norm": 0.10574935510817819,
"learning_rate": 5.68476741537711e-06,
"loss": 0.0346,
"num_tokens": 96887154.0,
"step": 1210
},
{
"epoch": 1.5102932002495322,
"grad_norm": 0.11625074019798143,
"learning_rate": 5.678712708257358e-06,
"loss": 0.039,
"num_tokens": 96969029.0,
"step": 1211
},
{
"epoch": 1.5115408608858392,
"grad_norm": 0.11630274180853753,
"learning_rate": 5.672657677077449e-06,
"loss": 0.0363,
"num_tokens": 97049726.0,
"step": 1212
},
{
"epoch": 1.512788521522146,
"grad_norm": 0.1127526825270542,
"learning_rate": 5.666602332816985e-06,
"loss": 0.0372,
"num_tokens": 97130469.0,
"step": 1213
},
{
"epoch": 1.514036182158453,
"grad_norm": 0.11863761031881935,
"learning_rate": 5.6605466864561344e-06,
"loss": 0.0344,
"num_tokens": 97210798.0,
"step": 1214
},
{
"epoch": 1.51528384279476,
"grad_norm": 0.11747556688292905,
"learning_rate": 5.654490748975615e-06,
"loss": 0.0368,
"num_tokens": 97290820.0,
"step": 1215
},
{
"epoch": 1.5165315034310667,
"grad_norm": 0.11740884642076882,
"learning_rate": 5.648434531356671e-06,
"loss": 0.0341,
"num_tokens": 97370963.0,
"step": 1216
},
{
"epoch": 1.5177791640673737,
"grad_norm": 0.1121718619003604,
"learning_rate": 5.642378044581057e-06,
"loss": 0.0372,
"num_tokens": 97451787.0,
"step": 1217
},
{
"epoch": 1.5190268247036807,
"grad_norm": 0.11028715028041086,
"learning_rate": 5.636321299631015e-06,
"loss": 0.0355,
"num_tokens": 97531107.0,
"step": 1218
},
{
"epoch": 1.5202744853399874,
"grad_norm": 0.11526797879935653,
"learning_rate": 5.630264307489251e-06,
"loss": 0.0356,
"num_tokens": 97610596.0,
"step": 1219
},
{
"epoch": 1.5215221459762944,
"grad_norm": 0.11697834812620382,
"learning_rate": 5.624207079138922e-06,
"loss": 0.0372,
"num_tokens": 97692010.0,
"step": 1220
},
{
"epoch": 1.5227698066126014,
"grad_norm": 0.11361340804878213,
"learning_rate": 5.6181496255636195e-06,
"loss": 0.038,
"num_tokens": 97771259.0,
"step": 1221
},
{
"epoch": 1.5240174672489082,
"grad_norm": 0.11948073802819091,
"learning_rate": 5.612091957747333e-06,
"loss": 0.0362,
"num_tokens": 97851776.0,
"step": 1222
},
{
"epoch": 1.5252651278852152,
"grad_norm": 0.11933952019877841,
"learning_rate": 5.606034086674447e-06,
"loss": 0.0347,
"num_tokens": 97931323.0,
"step": 1223
},
{
"epoch": 1.5265127885215222,
"grad_norm": 0.1119341984186723,
"learning_rate": 5.5999760233297115e-06,
"loss": 0.0355,
"num_tokens": 98012414.0,
"step": 1224
},
{
"epoch": 1.527760449157829,
"grad_norm": 0.12570139447794026,
"learning_rate": 5.593917778698227e-06,
"loss": 0.0351,
"num_tokens": 98092865.0,
"step": 1225
},
{
"epoch": 1.5290081097941361,
"grad_norm": 0.12255701579140926,
"learning_rate": 5.5878593637654226e-06,
"loss": 0.0372,
"num_tokens": 98173575.0,
"step": 1226
},
{
"epoch": 1.530255770430443,
"grad_norm": 0.12045998667186913,
"learning_rate": 5.581800789517036e-06,
"loss": 0.0338,
"num_tokens": 98253478.0,
"step": 1227
},
{
"epoch": 1.5315034310667497,
"grad_norm": 0.10599310040291675,
"learning_rate": 5.5757420669390925e-06,
"loss": 0.0333,
"num_tokens": 98331761.0,
"step": 1228
},
{
"epoch": 1.532751091703057,
"grad_norm": 0.1189636777802236,
"learning_rate": 5.5696832070178885e-06,
"loss": 0.0353,
"num_tokens": 98412258.0,
"step": 1229
},
{
"epoch": 1.5339987523393637,
"grad_norm": 0.11265547375888633,
"learning_rate": 5.563624220739969e-06,
"loss": 0.0369,
"num_tokens": 98492861.0,
"step": 1230
},
{
"epoch": 1.5352464129756707,
"grad_norm": 0.1135501412896236,
"learning_rate": 5.557565119092106e-06,
"loss": 0.034,
"num_tokens": 98572091.0,
"step": 1231
},
{
"epoch": 1.5364940736119777,
"grad_norm": 0.12706095636153494,
"learning_rate": 5.551505913061281e-06,
"loss": 0.0386,
"num_tokens": 98652747.0,
"step": 1232
},
{
"epoch": 1.5377417342482844,
"grad_norm": 0.11767249883496335,
"learning_rate": 5.54544661363467e-06,
"loss": 0.0356,
"num_tokens": 98732307.0,
"step": 1233
},
{
"epoch": 1.5389893948845914,
"grad_norm": 0.11861760143450235,
"learning_rate": 5.53938723179961e-06,
"loss": 0.0337,
"num_tokens": 98811668.0,
"step": 1234
},
{
"epoch": 1.5402370555208984,
"grad_norm": 0.1110579082090557,
"learning_rate": 5.533327778543593e-06,
"loss": 0.0356,
"num_tokens": 98890773.0,
"step": 1235
},
{
"epoch": 1.5414847161572052,
"grad_norm": 0.10918115524283789,
"learning_rate": 5.527268264854241e-06,
"loss": 0.0354,
"num_tokens": 98970768.0,
"step": 1236
},
{
"epoch": 1.5427323767935122,
"grad_norm": 0.1120508500986546,
"learning_rate": 5.521208701719284e-06,
"loss": 0.0371,
"num_tokens": 99052179.0,
"step": 1237
},
{
"epoch": 1.5439800374298192,
"grad_norm": 0.11689356960540866,
"learning_rate": 5.515149100126539e-06,
"loss": 0.0364,
"num_tokens": 99132493.0,
"step": 1238
},
{
"epoch": 1.545227698066126,
"grad_norm": 0.10663726507181617,
"learning_rate": 5.509089471063897e-06,
"loss": 0.0338,
"num_tokens": 99212713.0,
"step": 1239
},
{
"epoch": 1.546475358702433,
"grad_norm": 0.10825998214250622,
"learning_rate": 5.503029825519296e-06,
"loss": 0.0346,
"num_tokens": 99292651.0,
"step": 1240
},
{
"epoch": 1.54772301933874,
"grad_norm": 0.11313988408089551,
"learning_rate": 5.496970174480706e-06,
"loss": 0.0339,
"num_tokens": 99372261.0,
"step": 1241
},
{
"epoch": 1.5489706799750467,
"grad_norm": 0.12649997908530414,
"learning_rate": 5.4909105289361055e-06,
"loss": 0.0539,
"num_tokens": 99453192.0,
"step": 1242
},
{
"epoch": 1.5502183406113537,
"grad_norm": 0.1186247978835761,
"learning_rate": 5.4848508998734626e-06,
"loss": 0.0362,
"num_tokens": 99534693.0,
"step": 1243
},
{
"epoch": 1.5514660012476607,
"grad_norm": 0.11032319195795326,
"learning_rate": 5.478791298280719e-06,
"loss": 0.0325,
"num_tokens": 99613614.0,
"step": 1244
},
{
"epoch": 1.5527136618839674,
"grad_norm": 0.10331362127146462,
"learning_rate": 5.47273173514576e-06,
"loss": 0.0366,
"num_tokens": 99694144.0,
"step": 1245
},
{
"epoch": 1.5539613225202746,
"grad_norm": 0.11441502157583171,
"learning_rate": 5.466672221456408e-06,
"loss": 0.0352,
"num_tokens": 99772396.0,
"step": 1246
},
{
"epoch": 1.5552089831565814,
"grad_norm": 0.1384519685906425,
"learning_rate": 5.4606127682003915e-06,
"loss": 0.0364,
"num_tokens": 99853878.0,
"step": 1247
},
{
"epoch": 1.5564566437928882,
"grad_norm": 0.1187428026312172,
"learning_rate": 5.454553386365333e-06,
"loss": 0.0362,
"num_tokens": 99933199.0,
"step": 1248
},
{
"epoch": 1.5577043044291954,
"grad_norm": 0.1154288559693241,
"learning_rate": 5.44849408693872e-06,
"loss": 0.0355,
"num_tokens": 100013822.0,
"step": 1249
},
{
"epoch": 1.5589519650655022,
"grad_norm": 0.10939017703608667,
"learning_rate": 5.4424348809078974e-06,
"loss": 0.0364,
"num_tokens": 100093850.0,
"step": 1250
},
{
"epoch": 1.5601996257018091,
"grad_norm": 0.11577706451313442,
"learning_rate": 5.436375779260034e-06,
"loss": 0.0348,
"num_tokens": 100174014.0,
"step": 1251
},
{
"epoch": 1.5614472863381161,
"grad_norm": 0.11552930897371735,
"learning_rate": 5.430316792982112e-06,
"loss": 0.0364,
"num_tokens": 100254096.0,
"step": 1252
},
{
"epoch": 1.562694946974423,
"grad_norm": 0.12091413731054657,
"learning_rate": 5.424257933060908e-06,
"loss": 0.036,
"num_tokens": 100335736.0,
"step": 1253
},
{
"epoch": 1.56394260761073,
"grad_norm": 0.11349954972674088,
"learning_rate": 5.418199210482965e-06,
"loss": 0.0339,
"num_tokens": 100415770.0,
"step": 1254
},
{
"epoch": 1.5651902682470369,
"grad_norm": 0.11257539896050413,
"learning_rate": 5.412140636234579e-06,
"loss": 0.0365,
"num_tokens": 100496239.0,
"step": 1255
},
{
"epoch": 1.5664379288833437,
"grad_norm": 0.10433088685065614,
"learning_rate": 5.4060822213017745e-06,
"loss": 0.033,
"num_tokens": 100575751.0,
"step": 1256
},
{
"epoch": 1.5676855895196506,
"grad_norm": 0.11128220666515805,
"learning_rate": 5.400023976670291e-06,
"loss": 0.0362,
"num_tokens": 100655896.0,
"step": 1257
},
{
"epoch": 1.5689332501559576,
"grad_norm": 0.11665278503733945,
"learning_rate": 5.393965913325555e-06,
"loss": 0.036,
"num_tokens": 100736726.0,
"step": 1258
},
{
"epoch": 1.5701809107922644,
"grad_norm": 0.11372703831180105,
"learning_rate": 5.387908042252667e-06,
"loss": 0.0521,
"num_tokens": 100817144.0,
"step": 1259
},
{
"epoch": 1.5714285714285714,
"grad_norm": 0.11403138474441783,
"learning_rate": 5.381850374436383e-06,
"loss": 0.0349,
"num_tokens": 100896712.0,
"step": 1260
},
{
"epoch": 1.5726762320648784,
"grad_norm": 0.11114301741148946,
"learning_rate": 5.3757929208610784e-06,
"loss": 0.0338,
"num_tokens": 100975384.0,
"step": 1261
},
{
"epoch": 1.5739238927011852,
"grad_norm": 0.15901465459498454,
"learning_rate": 5.3697356925107514e-06,
"loss": 0.0353,
"num_tokens": 101055790.0,
"step": 1262
},
{
"epoch": 1.5751715533374921,
"grad_norm": 0.10627021014563413,
"learning_rate": 5.363678700368987e-06,
"loss": 0.0369,
"num_tokens": 101136479.0,
"step": 1263
},
{
"epoch": 1.5764192139737991,
"grad_norm": 0.12456078400858275,
"learning_rate": 5.3576219554189445e-06,
"loss": 0.0401,
"num_tokens": 101217634.0,
"step": 1264
},
{
"epoch": 1.577666874610106,
"grad_norm": 0.1210041167801264,
"learning_rate": 5.35156546864333e-06,
"loss": 0.037,
"num_tokens": 101297872.0,
"step": 1265
},
{
"epoch": 1.5789145352464131,
"grad_norm": 0.1070252120913601,
"learning_rate": 5.345509251024387e-06,
"loss": 0.0374,
"num_tokens": 101380176.0,
"step": 1266
},
{
"epoch": 1.5801621958827199,
"grad_norm": 0.10983324132682443,
"learning_rate": 5.339453313543868e-06,
"loss": 0.0331,
"num_tokens": 101459436.0,
"step": 1267
},
{
"epoch": 1.5814098565190269,
"grad_norm": 0.10244262198712618,
"learning_rate": 5.3333976671830165e-06,
"loss": 0.0343,
"num_tokens": 101538263.0,
"step": 1268
},
{
"epoch": 1.5826575171553339,
"grad_norm": 0.10678644753096563,
"learning_rate": 5.327342322922553e-06,
"loss": 0.0333,
"num_tokens": 101618020.0,
"step": 1269
},
{
"epoch": 1.5839051777916406,
"grad_norm": 0.10303177226381595,
"learning_rate": 5.321287291742645e-06,
"loss": 0.0335,
"num_tokens": 101696955.0,
"step": 1270
},
{
"epoch": 1.5851528384279476,
"grad_norm": 0.10922923647766163,
"learning_rate": 5.315232584622893e-06,
"loss": 0.0332,
"num_tokens": 101776118.0,
"step": 1271
},
{
"epoch": 1.5864004990642546,
"grad_norm": 0.11106303953873586,
"learning_rate": 5.309178212542313e-06,
"loss": 0.0342,
"num_tokens": 101855486.0,
"step": 1272
},
{
"epoch": 1.5876481597005614,
"grad_norm": 0.11146880211359997,
"learning_rate": 5.303124186479309e-06,
"loss": 0.0325,
"num_tokens": 101933520.0,
"step": 1273
},
{
"epoch": 1.5888958203368684,
"grad_norm": 0.11066657207520725,
"learning_rate": 5.297070517411664e-06,
"loss": 0.037,
"num_tokens": 102015666.0,
"step": 1274
},
{
"epoch": 1.5901434809731754,
"grad_norm": 0.12070337142463965,
"learning_rate": 5.2910172163165096e-06,
"loss": 0.0355,
"num_tokens": 102095242.0,
"step": 1275
},
{
"epoch": 1.5913911416094821,
"grad_norm": 0.11240092383283054,
"learning_rate": 5.284964294170306e-06,
"loss": 0.0362,
"num_tokens": 102174205.0,
"step": 1276
},
{
"epoch": 1.5926388022457891,
"grad_norm": 0.12576136429390572,
"learning_rate": 5.278911761948834e-06,
"loss": 0.0355,
"num_tokens": 102255437.0,
"step": 1277
},
{
"epoch": 1.5938864628820961,
"grad_norm": 0.11710384991621879,
"learning_rate": 5.272859630627164e-06,
"loss": 0.0347,
"num_tokens": 102334946.0,
"step": 1278
},
{
"epoch": 1.595134123518403,
"grad_norm": 0.10521059038838904,
"learning_rate": 5.266807911179638e-06,
"loss": 0.0354,
"num_tokens": 102414226.0,
"step": 1279
},
{
"epoch": 1.5963817841547099,
"grad_norm": 0.11308055443975817,
"learning_rate": 5.260756614579851e-06,
"loss": 0.0327,
"num_tokens": 102493991.0,
"step": 1280
},
{
"epoch": 1.5976294447910169,
"grad_norm": 0.1186031956772661,
"learning_rate": 5.254705751800636e-06,
"loss": 0.0348,
"num_tokens": 102574527.0,
"step": 1281
},
{
"epoch": 1.5988771054273236,
"grad_norm": 0.10866075596998348,
"learning_rate": 5.248655333814036e-06,
"loss": 0.0333,
"num_tokens": 102654519.0,
"step": 1282
},
{
"epoch": 1.6001247660636309,
"grad_norm": 0.10696137668864042,
"learning_rate": 5.242605371591286e-06,
"loss": 0.0352,
"num_tokens": 102734795.0,
"step": 1283
},
{
"epoch": 1.6013724266999376,
"grad_norm": 0.10337102406850418,
"learning_rate": 5.236555876102797e-06,
"loss": 0.0375,
"num_tokens": 102814405.0,
"step": 1284
},
{
"epoch": 1.6026200873362444,
"grad_norm": 0.12757465999262096,
"learning_rate": 5.2305068583181314e-06,
"loss": 0.036,
"num_tokens": 102894980.0,
"step": 1285
},
{
"epoch": 1.6038677479725516,
"grad_norm": 0.13690644302329974,
"learning_rate": 5.2244583292059896e-06,
"loss": 0.0366,
"num_tokens": 102977017.0,
"step": 1286
},
{
"epoch": 1.6051154086088584,
"grad_norm": 0.11930944617293818,
"learning_rate": 5.218410299734181e-06,
"loss": 0.034,
"num_tokens": 103056779.0,
"step": 1287
},
{
"epoch": 1.6063630692451654,
"grad_norm": 0.11012326303635499,
"learning_rate": 5.2123627808696084e-06,
"loss": 0.0339,
"num_tokens": 103137283.0,
"step": 1288
},
{
"epoch": 1.6076107298814724,
"grad_norm": 0.10957703717797697,
"learning_rate": 5.206315783578258e-06,
"loss": 0.0336,
"num_tokens": 103216776.0,
"step": 1289
},
{
"epoch": 1.6088583905177791,
"grad_norm": 0.10906712477554352,
"learning_rate": 5.20026931882516e-06,
"loss": 0.0341,
"num_tokens": 103296399.0,
"step": 1290
},
{
"epoch": 1.6101060511540861,
"grad_norm": 0.11440229595698308,
"learning_rate": 5.194223397574381e-06,
"loss": 0.0376,
"num_tokens": 103376203.0,
"step": 1291
},
{
"epoch": 1.611353711790393,
"grad_norm": 0.12588773683905502,
"learning_rate": 5.188178030789008e-06,
"loss": 0.0346,
"num_tokens": 103456735.0,
"step": 1292
},
{
"epoch": 1.6126013724266999,
"grad_norm": 0.12042374456895473,
"learning_rate": 5.1821332294311136e-06,
"loss": 0.0356,
"num_tokens": 103537257.0,
"step": 1293
},
{
"epoch": 1.6138490330630069,
"grad_norm": 0.11853035765383917,
"learning_rate": 5.176089004461752e-06,
"loss": 0.0361,
"num_tokens": 103617737.0,
"step": 1294
},
{
"epoch": 1.6150966936993139,
"grad_norm": 0.1084054829939777,
"learning_rate": 5.170045366840929e-06,
"loss": 0.0373,
"num_tokens": 103698813.0,
"step": 1295
},
{
"epoch": 1.6163443543356206,
"grad_norm": 0.12273703655358631,
"learning_rate": 5.164002327527588e-06,
"loss": 0.0374,
"num_tokens": 103782172.0,
"step": 1296
},
{
"epoch": 1.6175920149719276,
"grad_norm": 0.10996795677144779,
"learning_rate": 5.157959897479587e-06,
"loss": 0.0359,
"num_tokens": 103862312.0,
"step": 1297
},
{
"epoch": 1.6188396756082346,
"grad_norm": 0.11103120135245384,
"learning_rate": 5.151918087653672e-06,
"loss": 0.0336,
"num_tokens": 103943466.0,
"step": 1298
},
{
"epoch": 1.6200873362445414,
"grad_norm": 0.10467769656583091,
"learning_rate": 5.145876909005477e-06,
"loss": 0.0335,
"num_tokens": 104023154.0,
"step": 1299
},
{
"epoch": 1.6213349968808484,
"grad_norm": 0.10318413558631827,
"learning_rate": 5.139836372489481e-06,
"loss": 0.0318,
"num_tokens": 104102426.0,
"step": 1300
},
{
"epoch": 1.6225826575171554,
"grad_norm": 0.10919271565885186,
"learning_rate": 5.133796489059005e-06,
"loss": 0.0355,
"num_tokens": 104182188.0,
"step": 1301
},
{
"epoch": 1.6238303181534621,
"grad_norm": 0.11310842463169009,
"learning_rate": 5.1277572696661806e-06,
"loss": 0.0351,
"num_tokens": 104261762.0,
"step": 1302
},
{
"epoch": 1.6250779787897693,
"grad_norm": 0.1165076029660535,
"learning_rate": 5.12171872526194e-06,
"loss": 0.0355,
"num_tokens": 104342372.0,
"step": 1303
},
{
"epoch": 1.626325639426076,
"grad_norm": 0.11376695615925705,
"learning_rate": 5.115680866795989e-06,
"loss": 0.0342,
"num_tokens": 104422382.0,
"step": 1304
},
{
"epoch": 1.6275733000623829,
"grad_norm": 0.11767626403403031,
"learning_rate": 5.109643705216789e-06,
"loss": 0.0354,
"num_tokens": 104503015.0,
"step": 1305
},
{
"epoch": 1.62882096069869,
"grad_norm": 0.1181534867539231,
"learning_rate": 5.103607251471541e-06,
"loss": 0.0321,
"num_tokens": 104582079.0,
"step": 1306
},
{
"epoch": 1.6300686213349969,
"grad_norm": 0.09517674089409651,
"learning_rate": 5.097571516506158e-06,
"loss": 0.0331,
"num_tokens": 104661780.0,
"step": 1307
},
{
"epoch": 1.6313162819713038,
"grad_norm": 0.11931275868780851,
"learning_rate": 5.091536511265253e-06,
"loss": 0.0348,
"num_tokens": 104741752.0,
"step": 1308
},
{
"epoch": 1.6325639426076108,
"grad_norm": 0.10940449179708552,
"learning_rate": 5.085502246692111e-06,
"loss": 0.0354,
"num_tokens": 104822380.0,
"step": 1309
},
{
"epoch": 1.6338116032439176,
"grad_norm": 0.12053843559094413,
"learning_rate": 5.079468733728684e-06,
"loss": 0.0369,
"num_tokens": 104902595.0,
"step": 1310
},
{
"epoch": 1.6350592638802246,
"grad_norm": 0.11265796315863995,
"learning_rate": 5.07343598331555e-06,
"loss": 0.0336,
"num_tokens": 104981340.0,
"step": 1311
},
{
"epoch": 1.6363069245165316,
"grad_norm": 0.1109231726527133,
"learning_rate": 5.0674040063919114e-06,
"loss": 0.0349,
"num_tokens": 105060850.0,
"step": 1312
},
{
"epoch": 1.6375545851528384,
"grad_norm": 0.11588144786485258,
"learning_rate": 5.0613728138955644e-06,
"loss": 0.0343,
"num_tokens": 105140234.0,
"step": 1313
},
{
"epoch": 1.6388022457891454,
"grad_norm": 0.10922220587388409,
"learning_rate": 5.055342416762883e-06,
"loss": 0.0333,
"num_tokens": 105219670.0,
"step": 1314
},
{
"epoch": 1.6400499064254523,
"grad_norm": 0.10863070032451541,
"learning_rate": 5.0493128259288025e-06,
"loss": 0.0348,
"num_tokens": 105300783.0,
"step": 1315
},
{
"epoch": 1.641297567061759,
"grad_norm": 0.10544356260119463,
"learning_rate": 5.043284052326789e-06,
"loss": 0.0337,
"num_tokens": 105380432.0,
"step": 1316
},
{
"epoch": 1.642545227698066,
"grad_norm": 0.10945480005987275,
"learning_rate": 5.037256106888837e-06,
"loss": 0.0337,
"num_tokens": 105459281.0,
"step": 1317
},
{
"epoch": 1.643792888334373,
"grad_norm": 0.10513668226551459,
"learning_rate": 5.03122900054543e-06,
"loss": 0.0364,
"num_tokens": 105539043.0,
"step": 1318
},
{
"epoch": 1.6450405489706799,
"grad_norm": 0.11889358390219724,
"learning_rate": 5.025202744225535e-06,
"loss": 0.0323,
"num_tokens": 105618097.0,
"step": 1319
},
{
"epoch": 1.6462882096069869,
"grad_norm": 0.10819389235655742,
"learning_rate": 5.019177348856576e-06,
"loss": 0.0384,
"num_tokens": 105700246.0,
"step": 1320
},
{
"epoch": 1.6475358702432938,
"grad_norm": 0.12285064335570973,
"learning_rate": 5.013152825364416e-06,
"loss": 0.0335,
"num_tokens": 105779956.0,
"step": 1321
},
{
"epoch": 1.6487835308796006,
"grad_norm": 0.10457229323362498,
"learning_rate": 5.007129184673335e-06,
"loss": 0.0342,
"num_tokens": 105859422.0,
"step": 1322
},
{
"epoch": 1.6500311915159078,
"grad_norm": 0.10418232593169255,
"learning_rate": 5.001106437706016e-06,
"loss": 0.0354,
"num_tokens": 105939798.0,
"step": 1323
},
{
"epoch": 1.6512788521522146,
"grad_norm": 0.10621297870744945,
"learning_rate": 4.99508459538352e-06,
"loss": 0.035,
"num_tokens": 106019008.0,
"step": 1324
},
{
"epoch": 1.6525265127885214,
"grad_norm": 0.11055886736084558,
"learning_rate": 4.989063668625267e-06,
"loss": 0.0331,
"num_tokens": 106099014.0,
"step": 1325
},
{
"epoch": 1.6537741734248286,
"grad_norm": 0.12055990828923857,
"learning_rate": 4.983043668349018e-06,
"loss": 0.035,
"num_tokens": 106178538.0,
"step": 1326
},
{
"epoch": 1.6550218340611353,
"grad_norm": 0.11615411814602732,
"learning_rate": 4.977024605470851e-06,
"loss": 0.0342,
"num_tokens": 106257051.0,
"step": 1327
},
{
"epoch": 1.6562694946974423,
"grad_norm": 0.11242812128073622,
"learning_rate": 4.971006490905148e-06,
"loss": 0.0326,
"num_tokens": 106337027.0,
"step": 1328
},
{
"epoch": 1.6575171553337493,
"grad_norm": 0.10451594006379315,
"learning_rate": 4.964989335564571e-06,
"loss": 0.035,
"num_tokens": 106415989.0,
"step": 1329
},
{
"epoch": 1.658764815970056,
"grad_norm": 0.12008210058517821,
"learning_rate": 4.958973150360034e-06,
"loss": 0.0338,
"num_tokens": 106496902.0,
"step": 1330
},
{
"epoch": 1.660012476606363,
"grad_norm": 0.10283103060613702,
"learning_rate": 4.952957946200709e-06,
"loss": 0.0316,
"num_tokens": 106576276.0,
"step": 1331
},
{
"epoch": 1.66126013724267,
"grad_norm": 0.10840588788185893,
"learning_rate": 4.946943733993974e-06,
"loss": 0.0342,
"num_tokens": 106656074.0,
"step": 1332
},
{
"epoch": 1.6625077978789768,
"grad_norm": 0.1151122733175202,
"learning_rate": 4.940930524645414e-06,
"loss": 0.0359,
"num_tokens": 106737048.0,
"step": 1333
},
{
"epoch": 1.6637554585152838,
"grad_norm": 0.12138327404493408,
"learning_rate": 4.934918329058798e-06,
"loss": 0.0329,
"num_tokens": 106817115.0,
"step": 1334
},
{
"epoch": 1.6650031191515908,
"grad_norm": 0.10347193468215583,
"learning_rate": 4.928907158136049e-06,
"loss": 0.0351,
"num_tokens": 106896600.0,
"step": 1335
},
{
"epoch": 1.6662507797878976,
"grad_norm": 0.10818319318418472,
"learning_rate": 4.922897022777241e-06,
"loss": 0.0342,
"num_tokens": 106976257.0,
"step": 1336
},
{
"epoch": 1.6674984404242046,
"grad_norm": 0.125943792765203,
"learning_rate": 4.916887933880562e-06,
"loss": 0.0364,
"num_tokens": 107056103.0,
"step": 1337
},
{
"epoch": 1.6687461010605116,
"grad_norm": 0.10714932245614212,
"learning_rate": 4.910879902342309e-06,
"loss": 0.0328,
"num_tokens": 107135177.0,
"step": 1338
},
{
"epoch": 1.6699937616968183,
"grad_norm": 0.10022422578809588,
"learning_rate": 4.904872939056859e-06,
"loss": 0.0327,
"num_tokens": 107215076.0,
"step": 1339
},
{
"epoch": 1.6712414223331253,
"grad_norm": 0.11904997859265425,
"learning_rate": 4.898867054916655e-06,
"loss": 0.0331,
"num_tokens": 107294670.0,
"step": 1340
},
{
"epoch": 1.6724890829694323,
"grad_norm": 0.10346330508751622,
"learning_rate": 4.892862260812174e-06,
"loss": 0.0355,
"num_tokens": 107375743.0,
"step": 1341
},
{
"epoch": 1.673736743605739,
"grad_norm": 0.10320313489980291,
"learning_rate": 4.886858567631927e-06,
"loss": 0.037,
"num_tokens": 107456245.0,
"step": 1342
},
{
"epoch": 1.6749844042420463,
"grad_norm": 0.11376647833132077,
"learning_rate": 4.880855986262424e-06,
"loss": 0.0336,
"num_tokens": 107535076.0,
"step": 1343
},
{
"epoch": 1.676232064878353,
"grad_norm": 0.11251959987852904,
"learning_rate": 4.874854527588159e-06,
"loss": 0.0348,
"num_tokens": 107615448.0,
"step": 1344
},
{
"epoch": 1.6774797255146598,
"grad_norm": 0.11732474632394792,
"learning_rate": 4.868854202491587e-06,
"loss": 0.0343,
"num_tokens": 107695214.0,
"step": 1345
},
{
"epoch": 1.678727386150967,
"grad_norm": 0.11672350989227481,
"learning_rate": 4.862855021853117e-06,
"loss": 0.0352,
"num_tokens": 107775647.0,
"step": 1346
},
{
"epoch": 1.6799750467872738,
"grad_norm": 0.11684779926230927,
"learning_rate": 4.856856996551074e-06,
"loss": 0.0343,
"num_tokens": 107855586.0,
"step": 1347
},
{
"epoch": 1.6812227074235808,
"grad_norm": 0.11194528364428014,
"learning_rate": 4.850860137461691e-06,
"loss": 0.0349,
"num_tokens": 107933910.0,
"step": 1348
},
{
"epoch": 1.6824703680598878,
"grad_norm": 0.12241649866099316,
"learning_rate": 4.844864455459085e-06,
"loss": 0.0331,
"num_tokens": 108013285.0,
"step": 1349
},
{
"epoch": 1.6837180286961946,
"grad_norm": 0.10701404402743815,
"learning_rate": 4.83886996141524e-06,
"loss": 0.0355,
"num_tokens": 108093741.0,
"step": 1350
},
{
"epoch": 1.6849656893325016,
"grad_norm": 0.12427674118245957,
"learning_rate": 4.8328766661999885e-06,
"loss": 0.0351,
"num_tokens": 108174452.0,
"step": 1351
},
{
"epoch": 1.6862133499688086,
"grad_norm": 0.11109278194963701,
"learning_rate": 4.826884580680981e-06,
"loss": 0.0363,
"num_tokens": 108255274.0,
"step": 1352
},
{
"epoch": 1.6874610106051153,
"grad_norm": 0.11471172964381411,
"learning_rate": 4.8208937157236855e-06,
"loss": 0.033,
"num_tokens": 108335095.0,
"step": 1353
},
{
"epoch": 1.6887086712414223,
"grad_norm": 0.12341901134238756,
"learning_rate": 4.814904082191349e-06,
"loss": 0.0342,
"num_tokens": 108415877.0,
"step": 1354
},
{
"epoch": 1.6899563318777293,
"grad_norm": 0.1050413188206562,
"learning_rate": 4.8089156909449845e-06,
"loss": 0.0342,
"num_tokens": 108495262.0,
"step": 1355
},
{
"epoch": 1.691203992514036,
"grad_norm": 0.10900307986971748,
"learning_rate": 4.802928552843358e-06,
"loss": 0.0351,
"num_tokens": 108574803.0,
"step": 1356
},
{
"epoch": 1.692451653150343,
"grad_norm": 0.12184670852461642,
"learning_rate": 4.79694267874296e-06,
"loss": 0.0373,
"num_tokens": 108655482.0,
"step": 1357
},
{
"epoch": 1.69369931378665,
"grad_norm": 0.10975468138719802,
"learning_rate": 4.790958079497991e-06,
"loss": 0.0342,
"num_tokens": 108735411.0,
"step": 1358
},
{
"epoch": 1.6949469744229568,
"grad_norm": 0.10818231872378661,
"learning_rate": 4.784974765960335e-06,
"loss": 0.0361,
"num_tokens": 108815263.0,
"step": 1359
},
{
"epoch": 1.696194635059264,
"grad_norm": 0.10258324984418032,
"learning_rate": 4.77899274897955e-06,
"loss": 0.0342,
"num_tokens": 108894762.0,
"step": 1360
},
{
"epoch": 1.6974422956955708,
"grad_norm": 0.11081946376785735,
"learning_rate": 4.773012039402841e-06,
"loss": 0.0368,
"num_tokens": 108975844.0,
"step": 1361
},
{
"epoch": 1.6986899563318776,
"grad_norm": 0.10763088262915656,
"learning_rate": 4.767032648075043e-06,
"loss": 0.0352,
"num_tokens": 109056110.0,
"step": 1362
},
{
"epoch": 1.6999376169681848,
"grad_norm": 0.10826684816224101,
"learning_rate": 4.761054585838599e-06,
"loss": 0.0341,
"num_tokens": 109136888.0,
"step": 1363
},
{
"epoch": 1.7011852776044916,
"grad_norm": 0.11349832649415174,
"learning_rate": 4.755077863533541e-06,
"loss": 0.0342,
"num_tokens": 109216345.0,
"step": 1364
},
{
"epoch": 1.7024329382407986,
"grad_norm": 0.10721063931317605,
"learning_rate": 4.749102491997476e-06,
"loss": 0.0304,
"num_tokens": 109297726.0,
"step": 1365
},
{
"epoch": 1.7036805988771055,
"grad_norm": 0.11814753856914108,
"learning_rate": 4.743128482065555e-06,
"loss": 0.0368,
"num_tokens": 109377771.0,
"step": 1366
},
{
"epoch": 1.7049282595134123,
"grad_norm": 0.10923227600327748,
"learning_rate": 4.737155844570468e-06,
"loss": 0.0358,
"num_tokens": 109457147.0,
"step": 1367
},
{
"epoch": 1.7061759201497193,
"grad_norm": 0.1180774047748652,
"learning_rate": 4.7311845903424104e-06,
"loss": 0.0326,
"num_tokens": 109536870.0,
"step": 1368
},
{
"epoch": 1.7074235807860263,
"grad_norm": 0.10414052812314961,
"learning_rate": 4.725214730209069e-06,
"loss": 0.0348,
"num_tokens": 109617166.0,
"step": 1369
},
{
"epoch": 1.708671241422333,
"grad_norm": 0.11325120099720434,
"learning_rate": 4.719246274995607e-06,
"loss": 0.0373,
"num_tokens": 109697533.0,
"step": 1370
},
{
"epoch": 1.70991890205864,
"grad_norm": 0.10237282354895524,
"learning_rate": 4.713279235524637e-06,
"loss": 0.0339,
"num_tokens": 109778192.0,
"step": 1371
},
{
"epoch": 1.711166562694947,
"grad_norm": 0.1495589045820829,
"learning_rate": 4.707313622616205e-06,
"loss": 0.0348,
"num_tokens": 109858008.0,
"step": 1372
},
{
"epoch": 1.7124142233312538,
"grad_norm": 0.1155569950547421,
"learning_rate": 4.701349447087769e-06,
"loss": 0.0338,
"num_tokens": 109938620.0,
"step": 1373
},
{
"epoch": 1.7136618839675608,
"grad_norm": 0.1085633556890907,
"learning_rate": 4.695386719754184e-06,
"loss": 0.0344,
"num_tokens": 110018354.0,
"step": 1374
},
{
"epoch": 1.7149095446038678,
"grad_norm": 0.11234843873728942,
"learning_rate": 4.689425451427677e-06,
"loss": 0.0359,
"num_tokens": 110098749.0,
"step": 1375
},
{
"epoch": 1.7161572052401746,
"grad_norm": 0.11707214723093819,
"learning_rate": 4.683465652917828e-06,
"loss": 0.0358,
"num_tokens": 110178574.0,
"step": 1376
},
{
"epoch": 1.7174048658764816,
"grad_norm": 0.11575033343150132,
"learning_rate": 4.677507335031555e-06,
"loss": 0.0387,
"num_tokens": 110258909.0,
"step": 1377
},
{
"epoch": 1.7186525265127885,
"grad_norm": 0.1219047452029518,
"learning_rate": 4.671550508573087e-06,
"loss": 0.0364,
"num_tokens": 110338876.0,
"step": 1378
},
{
"epoch": 1.7199001871490953,
"grad_norm": 0.11606054592764724,
"learning_rate": 4.6655951843439514e-06,
"loss": 0.034,
"num_tokens": 110417753.0,
"step": 1379
},
{
"epoch": 1.7211478477854025,
"grad_norm": 0.11058642589780793,
"learning_rate": 4.659641373142953e-06,
"loss": 0.0369,
"num_tokens": 110497691.0,
"step": 1380
},
{
"epoch": 1.7223955084217093,
"grad_norm": 0.10502695534031538,
"learning_rate": 4.653689085766147e-06,
"loss": 0.0338,
"num_tokens": 110578084.0,
"step": 1381
},
{
"epoch": 1.723643169058016,
"grad_norm": 0.10033432947560307,
"learning_rate": 4.6477383330068335e-06,
"loss": 0.0324,
"num_tokens": 110656605.0,
"step": 1382
},
{
"epoch": 1.7248908296943233,
"grad_norm": 0.10347951460818469,
"learning_rate": 4.641789125655526e-06,
"loss": 0.0343,
"num_tokens": 110736629.0,
"step": 1383
},
{
"epoch": 1.72613849033063,
"grad_norm": 0.10404769094997354,
"learning_rate": 4.6358414744999324e-06,
"loss": 0.0322,
"num_tokens": 110816051.0,
"step": 1384
},
{
"epoch": 1.727386150966937,
"grad_norm": 0.10284553692602667,
"learning_rate": 4.6298953903249455e-06,
"loss": 0.0331,
"num_tokens": 110894941.0,
"step": 1385
},
{
"epoch": 1.728633811603244,
"grad_norm": 0.10288209629612745,
"learning_rate": 4.623950883912609e-06,
"loss": 0.0353,
"num_tokens": 110975460.0,
"step": 1386
},
{
"epoch": 1.7298814722395508,
"grad_norm": 0.10789958526274712,
"learning_rate": 4.618007966042114e-06,
"loss": 0.0342,
"num_tokens": 111054782.0,
"step": 1387
},
{
"epoch": 1.7311291328758578,
"grad_norm": 0.10876892596056022,
"learning_rate": 4.612066647489762e-06,
"loss": 0.0355,
"num_tokens": 111135170.0,
"step": 1388
},
{
"epoch": 1.7323767935121648,
"grad_norm": 0.12136812485887387,
"learning_rate": 4.606126939028965e-06,
"loss": 0.0338,
"num_tokens": 111214768.0,
"step": 1389
},
{
"epoch": 1.7336244541484715,
"grad_norm": 0.10679814813875112,
"learning_rate": 4.600188851430206e-06,
"loss": 0.0324,
"num_tokens": 111294207.0,
"step": 1390
},
{
"epoch": 1.7348721147847785,
"grad_norm": 0.1032000235074082,
"learning_rate": 4.594252395461036e-06,
"loss": 0.034,
"num_tokens": 111374632.0,
"step": 1391
},
{
"epoch": 1.7361197754210855,
"grad_norm": 0.10918063307642732,
"learning_rate": 4.588317581886041e-06,
"loss": 0.0344,
"num_tokens": 111454535.0,
"step": 1392
},
{
"epoch": 1.7373674360573923,
"grad_norm": 0.10999773787711306,
"learning_rate": 4.5823844214668326e-06,
"loss": 0.0352,
"num_tokens": 111534940.0,
"step": 1393
},
{
"epoch": 1.7386150966936993,
"grad_norm": 0.12310258864133658,
"learning_rate": 4.576452924962024e-06,
"loss": 0.0379,
"num_tokens": 111616583.0,
"step": 1394
},
{
"epoch": 1.7398627573300063,
"grad_norm": 0.11330748509366423,
"learning_rate": 4.570523103127209e-06,
"loss": 0.0334,
"num_tokens": 111694802.0,
"step": 1395
},
{
"epoch": 1.741110417966313,
"grad_norm": 0.11642883285973465,
"learning_rate": 4.564594966714952e-06,
"loss": 0.0344,
"num_tokens": 111774583.0,
"step": 1396
},
{
"epoch": 1.74235807860262,
"grad_norm": 0.11628781515429629,
"learning_rate": 4.558668526474751e-06,
"loss": 0.0361,
"num_tokens": 111854564.0,
"step": 1397
},
{
"epoch": 1.743605739238927,
"grad_norm": 0.11553952117653499,
"learning_rate": 4.552743793153037e-06,
"loss": 0.0342,
"num_tokens": 111934145.0,
"step": 1398
},
{
"epoch": 1.7448533998752338,
"grad_norm": 0.1130454484695648,
"learning_rate": 4.5468207774931414e-06,
"loss": 0.0362,
"num_tokens": 112014599.0,
"step": 1399
},
{
"epoch": 1.746101060511541,
"grad_norm": 0.10424941012798582,
"learning_rate": 4.540899490235282e-06,
"loss": 0.0353,
"num_tokens": 112095898.0,
"step": 1400
},
{
"epoch": 1.7473487211478478,
"grad_norm": 0.12176082233686349,
"learning_rate": 4.534979942116542e-06,
"loss": 0.0335,
"num_tokens": 112174805.0,
"step": 1401
},
{
"epoch": 1.7485963817841546,
"grad_norm": 0.10725290738018774,
"learning_rate": 4.529062143870849e-06,
"loss": 0.035,
"num_tokens": 112254844.0,
"step": 1402
},
{
"epoch": 1.7498440424204618,
"grad_norm": 0.11613612803991162,
"learning_rate": 4.5231461062289624e-06,
"loss": 0.0364,
"num_tokens": 112336687.0,
"step": 1403
},
{
"epoch": 1.7510917030567685,
"grad_norm": 0.11248286120432541,
"learning_rate": 4.5172318399184485e-06,
"loss": 0.0335,
"num_tokens": 112416437.0,
"step": 1404
},
{
"epoch": 1.7523393636930755,
"grad_norm": 0.11498685047252416,
"learning_rate": 4.511319355663657e-06,
"loss": 0.0363,
"num_tokens": 112496544.0,
"step": 1405
},
{
"epoch": 1.7535870243293825,
"grad_norm": 0.11584733211628963,
"learning_rate": 4.50540866418571e-06,
"loss": 0.0385,
"num_tokens": 112577418.0,
"step": 1406
},
{
"epoch": 1.7548346849656893,
"grad_norm": 0.10971791007225389,
"learning_rate": 4.499499776202476e-06,
"loss": 0.0327,
"num_tokens": 112655726.0,
"step": 1407
},
{
"epoch": 1.7560823456019963,
"grad_norm": 0.11202865802168581,
"learning_rate": 4.493592702428558e-06,
"loss": 0.0365,
"num_tokens": 112736130.0,
"step": 1408
},
{
"epoch": 1.7573300062383033,
"grad_norm": 0.11590791221388194,
"learning_rate": 4.487687453575261e-06,
"loss": 0.0359,
"num_tokens": 112816354.0,
"step": 1409
},
{
"epoch": 1.75857766687461,
"grad_norm": 0.11455918942737528,
"learning_rate": 4.481784040350593e-06,
"loss": 0.0357,
"num_tokens": 112895741.0,
"step": 1410
},
{
"epoch": 1.759825327510917,
"grad_norm": 0.11184953434294027,
"learning_rate": 4.475882473459221e-06,
"loss": 0.0323,
"num_tokens": 112975009.0,
"step": 1411
},
{
"epoch": 1.761072988147224,
"grad_norm": 0.1043909165249291,
"learning_rate": 4.469982763602473e-06,
"loss": 0.0376,
"num_tokens": 113056299.0,
"step": 1412
},
{
"epoch": 1.7623206487835308,
"grad_norm": 0.11796014666013199,
"learning_rate": 4.464084921478303e-06,
"loss": 0.0345,
"num_tokens": 113135339.0,
"step": 1413
},
{
"epoch": 1.7635683094198378,
"grad_norm": 0.10655530028085919,
"learning_rate": 4.458188957781285e-06,
"loss": 0.0332,
"num_tokens": 113215391.0,
"step": 1414
},
{
"epoch": 1.7648159700561448,
"grad_norm": 0.1137944705272993,
"learning_rate": 4.452294883202581e-06,
"loss": 0.0326,
"num_tokens": 113294604.0,
"step": 1415
},
{
"epoch": 1.7660636306924515,
"grad_norm": 0.10386535837108864,
"learning_rate": 4.44640270842993e-06,
"loss": 0.0355,
"num_tokens": 113373702.0,
"step": 1416
},
{
"epoch": 1.7673112913287585,
"grad_norm": 0.1124153813219037,
"learning_rate": 4.440512444147626e-06,
"loss": 0.0338,
"num_tokens": 113454253.0,
"step": 1417
},
{
"epoch": 1.7685589519650655,
"grad_norm": 0.11173982165813126,
"learning_rate": 4.434624101036498e-06,
"loss": 0.034,
"num_tokens": 113534170.0,
"step": 1418
},
{
"epoch": 1.7698066126013723,
"grad_norm": 0.10766159191175759,
"learning_rate": 4.4287376897738945e-06,
"loss": 0.0337,
"num_tokens": 113613505.0,
"step": 1419
},
{
"epoch": 1.7710542732376795,
"grad_norm": 0.11662200449928516,
"learning_rate": 4.4228532210336535e-06,
"loss": 0.0351,
"num_tokens": 113694356.0,
"step": 1420
},
{
"epoch": 1.7723019338739863,
"grad_norm": 0.12239528048779576,
"learning_rate": 4.4169707054861e-06,
"loss": 0.0348,
"num_tokens": 113773772.0,
"step": 1421
},
{
"epoch": 1.773549594510293,
"grad_norm": 0.11495501032768879,
"learning_rate": 4.411090153798011e-06,
"loss": 0.0361,
"num_tokens": 113854394.0,
"step": 1422
},
{
"epoch": 1.7747972551466002,
"grad_norm": 0.10793305404835776,
"learning_rate": 4.405211576632602e-06,
"loss": 0.0331,
"num_tokens": 113934601.0,
"step": 1423
},
{
"epoch": 1.776044915782907,
"grad_norm": 0.10521475892016557,
"learning_rate": 4.3993349846495136e-06,
"loss": 0.0336,
"num_tokens": 114014077.0,
"step": 1424
},
{
"epoch": 1.777292576419214,
"grad_norm": 0.11667480494909614,
"learning_rate": 4.393460388504784e-06,
"loss": 0.0364,
"num_tokens": 114095540.0,
"step": 1425
},
{
"epoch": 1.778540237055521,
"grad_norm": 0.11816708505706751,
"learning_rate": 4.387587798850826e-06,
"loss": 0.036,
"num_tokens": 114175449.0,
"step": 1426
},
{
"epoch": 1.7797878976918278,
"grad_norm": 0.11547381713071538,
"learning_rate": 4.381717226336426e-06,
"loss": 0.033,
"num_tokens": 114255254.0,
"step": 1427
},
{
"epoch": 1.7810355583281348,
"grad_norm": 0.11558449024822219,
"learning_rate": 4.375848681606704e-06,
"loss": 0.0355,
"num_tokens": 114335582.0,
"step": 1428
},
{
"epoch": 1.7822832189644418,
"grad_norm": 0.11501299427526183,
"learning_rate": 4.369982175303104e-06,
"loss": 0.0356,
"num_tokens": 114417492.0,
"step": 1429
},
{
"epoch": 1.7835308796007485,
"grad_norm": 0.11575029405727906,
"learning_rate": 4.364117718063375e-06,
"loss": 0.0342,
"num_tokens": 114498411.0,
"step": 1430
},
{
"epoch": 1.7847785402370555,
"grad_norm": 0.10471073262325635,
"learning_rate": 4.358255320521553e-06,
"loss": 0.0335,
"num_tokens": 114579592.0,
"step": 1431
},
{
"epoch": 1.7860262008733625,
"grad_norm": 0.10800367249362049,
"learning_rate": 4.352394993307935e-06,
"loss": 0.0346,
"num_tokens": 114660132.0,
"step": 1432
},
{
"epoch": 1.7872738615096693,
"grad_norm": 0.11784628615301575,
"learning_rate": 4.346536747049068e-06,
"loss": 0.035,
"num_tokens": 114742718.0,
"step": 1433
},
{
"epoch": 1.7885215221459763,
"grad_norm": 0.11047252826803172,
"learning_rate": 4.340680592367721e-06,
"loss": 0.0353,
"num_tokens": 114823729.0,
"step": 1434
},
{
"epoch": 1.7897691827822833,
"grad_norm": 0.10863771684339983,
"learning_rate": 4.33482653988287e-06,
"loss": 0.0352,
"num_tokens": 114903277.0,
"step": 1435
},
{
"epoch": 1.79101684341859,
"grad_norm": 0.11748528208682778,
"learning_rate": 4.328974600209687e-06,
"loss": 0.0333,
"num_tokens": 114983776.0,
"step": 1436
},
{
"epoch": 1.7922645040548972,
"grad_norm": 0.10380189194122497,
"learning_rate": 4.3231247839595045e-06,
"loss": 0.0334,
"num_tokens": 115063423.0,
"step": 1437
},
{
"epoch": 1.793512164691204,
"grad_norm": 0.1021667642296284,
"learning_rate": 4.317277101739806e-06,
"loss": 0.0338,
"num_tokens": 115143251.0,
"step": 1438
},
{
"epoch": 1.7947598253275108,
"grad_norm": 0.10887837383935073,
"learning_rate": 4.3114315641542105e-06,
"loss": 0.0342,
"num_tokens": 115224078.0,
"step": 1439
},
{
"epoch": 1.796007485963818,
"grad_norm": 0.12498802533911925,
"learning_rate": 4.305588181802441e-06,
"loss": 0.0333,
"num_tokens": 115304392.0,
"step": 1440
},
{
"epoch": 1.7972551466001248,
"grad_norm": 0.10562913529548969,
"learning_rate": 4.2997469652803185e-06,
"loss": 0.0359,
"num_tokens": 115384699.0,
"step": 1441
},
{
"epoch": 1.7985028072364317,
"grad_norm": 0.10968417412970875,
"learning_rate": 4.293907925179733e-06,
"loss": 0.0343,
"num_tokens": 115465341.0,
"step": 1442
},
{
"epoch": 1.7997504678727387,
"grad_norm": 0.10397756346348985,
"learning_rate": 4.28807107208863e-06,
"loss": 0.0328,
"num_tokens": 115544709.0,
"step": 1443
},
{
"epoch": 1.8009981285090455,
"grad_norm": 0.11506875540961697,
"learning_rate": 4.282236416590986e-06,
"loss": 0.0375,
"num_tokens": 115625949.0,
"step": 1444
},
{
"epoch": 1.8022457891453525,
"grad_norm": 0.10835660954726115,
"learning_rate": 4.276403969266797e-06,
"loss": 0.0336,
"num_tokens": 115705144.0,
"step": 1445
},
{
"epoch": 1.8034934497816595,
"grad_norm": 0.1125488424807153,
"learning_rate": 4.270573740692053e-06,
"loss": 0.0359,
"num_tokens": 115786232.0,
"step": 1446
},
{
"epoch": 1.8047411104179663,
"grad_norm": 0.11179280220174558,
"learning_rate": 4.2647457414387205e-06,
"loss": 0.0324,
"num_tokens": 115865733.0,
"step": 1447
},
{
"epoch": 1.8059887710542732,
"grad_norm": 0.10438418516083571,
"learning_rate": 4.2589199820747226e-06,
"loss": 0.0335,
"num_tokens": 115946117.0,
"step": 1448
},
{
"epoch": 1.8072364316905802,
"grad_norm": 0.12496207865854495,
"learning_rate": 4.253096473163923e-06,
"loss": 0.038,
"num_tokens": 116028253.0,
"step": 1449
},
{
"epoch": 1.808484092326887,
"grad_norm": 0.10562424147281688,
"learning_rate": 4.247275225266103e-06,
"loss": 0.0347,
"num_tokens": 116109099.0,
"step": 1450
},
{
"epoch": 1.809731752963194,
"grad_norm": 0.11476927473343183,
"learning_rate": 4.241456248936946e-06,
"loss": 0.0326,
"num_tokens": 116189316.0,
"step": 1451
},
{
"epoch": 1.810979413599501,
"grad_norm": 0.10456867355165096,
"learning_rate": 4.23563955472801e-06,
"loss": 0.035,
"num_tokens": 116270657.0,
"step": 1452
},
{
"epoch": 1.8122270742358078,
"grad_norm": 0.10972404579793639,
"learning_rate": 4.229825153186727e-06,
"loss": 0.035,
"num_tokens": 116351453.0,
"step": 1453
},
{
"epoch": 1.8134747348721147,
"grad_norm": 0.11614985559784176,
"learning_rate": 4.22401305485636e-06,
"loss": 0.035,
"num_tokens": 116432720.0,
"step": 1454
},
{
"epoch": 1.8147223955084217,
"grad_norm": 0.10838486449752817,
"learning_rate": 4.218203270276e-06,
"loss": 0.035,
"num_tokens": 116512799.0,
"step": 1455
},
{
"epoch": 1.8159700561447285,
"grad_norm": 0.11198739360311537,
"learning_rate": 4.2123958099805466e-06,
"loss": 0.0334,
"num_tokens": 116592644.0,
"step": 1456
},
{
"epoch": 1.8172177167810357,
"grad_norm": 0.11215477815079397,
"learning_rate": 4.206590684500675e-06,
"loss": 0.0348,
"num_tokens": 116672720.0,
"step": 1457
},
{
"epoch": 1.8184653774173425,
"grad_norm": 0.1130464870503315,
"learning_rate": 4.200787904362833e-06,
"loss": 0.0344,
"num_tokens": 116753067.0,
"step": 1458
},
{
"epoch": 1.8197130380536493,
"grad_norm": 0.10319308148117898,
"learning_rate": 4.194987480089218e-06,
"loss": 0.0333,
"num_tokens": 116832122.0,
"step": 1459
},
{
"epoch": 1.8209606986899565,
"grad_norm": 0.10789295427643053,
"learning_rate": 4.189189422197751e-06,
"loss": 0.0349,
"num_tokens": 116911422.0,
"step": 1460
},
{
"epoch": 1.8222083593262632,
"grad_norm": 0.10741052441160895,
"learning_rate": 4.183393741202065e-06,
"loss": 0.0346,
"num_tokens": 116991316.0,
"step": 1461
},
{
"epoch": 1.8234560199625702,
"grad_norm": 0.1149851515920513,
"learning_rate": 4.177600447611478e-06,
"loss": 0.0335,
"num_tokens": 117069959.0,
"step": 1462
},
{
"epoch": 1.8247036805988772,
"grad_norm": 0.11670396653884813,
"learning_rate": 4.171809551930985e-06,
"loss": 0.0344,
"num_tokens": 117149818.0,
"step": 1463
},
{
"epoch": 1.825951341235184,
"grad_norm": 0.11711057074934993,
"learning_rate": 4.166021064661231e-06,
"loss": 0.035,
"num_tokens": 117230247.0,
"step": 1464
},
{
"epoch": 1.827199001871491,
"grad_norm": 0.1163081603973351,
"learning_rate": 4.160234996298491e-06,
"loss": 0.0341,
"num_tokens": 117310424.0,
"step": 1465
},
{
"epoch": 1.828446662507798,
"grad_norm": 0.10458937874158106,
"learning_rate": 4.154451357334654e-06,
"loss": 0.0344,
"num_tokens": 117389859.0,
"step": 1466
},
{
"epoch": 1.8296943231441047,
"grad_norm": 0.10574896551270638,
"learning_rate": 4.148670158257211e-06,
"loss": 0.0341,
"num_tokens": 117469420.0,
"step": 1467
},
{
"epoch": 1.8309419837804117,
"grad_norm": 0.10899400352647608,
"learning_rate": 4.142891409549219e-06,
"loss": 0.0337,
"num_tokens": 117548721.0,
"step": 1468
},
{
"epoch": 1.8321896444167187,
"grad_norm": 0.10611750813836142,
"learning_rate": 4.137115121689297e-06,
"loss": 0.0348,
"num_tokens": 117628895.0,
"step": 1469
},
{
"epoch": 1.8334373050530255,
"grad_norm": 0.1111805298125393,
"learning_rate": 4.131341305151603e-06,
"loss": 0.0348,
"num_tokens": 117710011.0,
"step": 1470
},
{
"epoch": 1.8346849656893325,
"grad_norm": 0.10266067632336437,
"learning_rate": 4.1255699704058085e-06,
"loss": 0.0332,
"num_tokens": 117790110.0,
"step": 1471
},
{
"epoch": 1.8359326263256395,
"grad_norm": 0.12013831863359381,
"learning_rate": 4.119801127917089e-06,
"loss": 0.0329,
"num_tokens": 117870583.0,
"step": 1472
},
{
"epoch": 1.8371802869619462,
"grad_norm": 0.10173281550009515,
"learning_rate": 4.114034788146101e-06,
"loss": 0.0344,
"num_tokens": 117949961.0,
"step": 1473
},
{
"epoch": 1.8384279475982532,
"grad_norm": 0.10191141790449527,
"learning_rate": 4.108270961548957e-06,
"loss": 0.033,
"num_tokens": 118028504.0,
"step": 1474
},
{
"epoch": 1.8396756082345602,
"grad_norm": 0.12183190605596082,
"learning_rate": 4.102509658577223e-06,
"loss": 0.0338,
"num_tokens": 118108384.0,
"step": 1475
},
{
"epoch": 1.840923268870867,
"grad_norm": 0.10416883300068461,
"learning_rate": 4.096750889677878e-06,
"loss": 0.0349,
"num_tokens": 118188162.0,
"step": 1476
},
{
"epoch": 1.8421709295071742,
"grad_norm": 0.11986832551794753,
"learning_rate": 4.090994665293313e-06,
"loss": 0.0346,
"num_tokens": 118268028.0,
"step": 1477
},
{
"epoch": 1.843418590143481,
"grad_norm": 0.10172966968166275,
"learning_rate": 4.085240995861301e-06,
"loss": 0.0342,
"num_tokens": 118348423.0,
"step": 1478
},
{
"epoch": 1.8446662507797877,
"grad_norm": 0.11569226583939839,
"learning_rate": 4.079489891814986e-06,
"loss": 0.0352,
"num_tokens": 118429210.0,
"step": 1479
},
{
"epoch": 1.845913911416095,
"grad_norm": 0.10916278732424804,
"learning_rate": 4.073741363582856e-06,
"loss": 0.0352,
"num_tokens": 118508578.0,
"step": 1480
},
{
"epoch": 1.8471615720524017,
"grad_norm": 0.11293663723584749,
"learning_rate": 4.06799542158873e-06,
"loss": 0.0355,
"num_tokens": 118588744.0,
"step": 1481
},
{
"epoch": 1.8484092326887087,
"grad_norm": 0.11722344024028651,
"learning_rate": 4.062252076251739e-06,
"loss": 0.0328,
"num_tokens": 118667906.0,
"step": 1482
},
{
"epoch": 1.8496568933250157,
"grad_norm": 0.10122769351963658,
"learning_rate": 4.056511337986304e-06,
"loss": 0.0318,
"num_tokens": 118746761.0,
"step": 1483
},
{
"epoch": 1.8509045539613225,
"grad_norm": 0.10229816908665479,
"learning_rate": 4.05077321720212e-06,
"loss": 0.032,
"num_tokens": 118826981.0,
"step": 1484
},
{
"epoch": 1.8521522145976295,
"grad_norm": 0.11918185823359788,
"learning_rate": 4.045037724304129e-06,
"loss": 0.0338,
"num_tokens": 118906395.0,
"step": 1485
},
{
"epoch": 1.8533998752339365,
"grad_norm": 0.10337247858601341,
"learning_rate": 4.039304869692518e-06,
"loss": 0.0333,
"num_tokens": 118985392.0,
"step": 1486
},
{
"epoch": 1.8546475358702432,
"grad_norm": 0.1019281333615369,
"learning_rate": 4.033574663762685e-06,
"loss": 0.0354,
"num_tokens": 119065923.0,
"step": 1487
},
{
"epoch": 1.8558951965065502,
"grad_norm": 0.11895729478200164,
"learning_rate": 4.0278471169052224e-06,
"loss": 0.0333,
"num_tokens": 119144893.0,
"step": 1488
},
{
"epoch": 1.8571428571428572,
"grad_norm": 0.10167459971375013,
"learning_rate": 4.022122239505906e-06,
"loss": 0.0335,
"num_tokens": 119224641.0,
"step": 1489
},
{
"epoch": 1.858390517779164,
"grad_norm": 0.10149185182575064,
"learning_rate": 4.0164000419456715e-06,
"loss": 0.0341,
"num_tokens": 119305051.0,
"step": 1490
},
{
"epoch": 1.859638178415471,
"grad_norm": 0.11077520841456237,
"learning_rate": 4.010680534600587e-06,
"loss": 0.034,
"num_tokens": 119385471.0,
"step": 1491
},
{
"epoch": 1.860885839051778,
"grad_norm": 0.1120329287780735,
"learning_rate": 4.004963727841852e-06,
"loss": 0.0358,
"num_tokens": 119465985.0,
"step": 1492
},
{
"epoch": 1.8621334996880847,
"grad_norm": 0.11813850687069626,
"learning_rate": 3.9992496320357645e-06,
"loss": 0.0351,
"num_tokens": 119546271.0,
"step": 1493
},
{
"epoch": 1.8633811603243917,
"grad_norm": 0.11886113375271443,
"learning_rate": 3.993538257543706e-06,
"loss": 0.0316,
"num_tokens": 119627060.0,
"step": 1494
},
{
"epoch": 1.8646288209606987,
"grad_norm": 0.09874498532250679,
"learning_rate": 3.987829614722124e-06,
"loss": 0.0335,
"num_tokens": 119707551.0,
"step": 1495
},
{
"epoch": 1.8658764815970055,
"grad_norm": 0.10786226744861373,
"learning_rate": 3.982123713922517e-06,
"loss": 0.0344,
"num_tokens": 119787384.0,
"step": 1496
},
{
"epoch": 1.8671241422333127,
"grad_norm": 0.10917423418357436,
"learning_rate": 3.976420565491404e-06,
"loss": 0.0322,
"num_tokens": 119866556.0,
"step": 1497
},
{
"epoch": 1.8683718028696195,
"grad_norm": 0.10509609392688785,
"learning_rate": 3.970720179770322e-06,
"loss": 0.032,
"num_tokens": 119946177.0,
"step": 1498
},
{
"epoch": 1.8696194635059262,
"grad_norm": 0.11086500849159564,
"learning_rate": 3.965022567095788e-06,
"loss": 0.0382,
"num_tokens": 120026777.0,
"step": 1499
},
{
"epoch": 1.8708671241422334,
"grad_norm": 0.1026552922683715,
"learning_rate": 3.959327737799298e-06,
"loss": 0.0349,
"num_tokens": 120106915.0,
"step": 1500
},
{
"epoch": 1.8721147847785402,
"grad_norm": 0.10134968732729031,
"learning_rate": 3.953635702207299e-06,
"loss": 0.0316,
"num_tokens": 120186044.0,
"step": 1501
},
{
"epoch": 1.8733624454148472,
"grad_norm": 0.09477996979178789,
"learning_rate": 3.947946470641169e-06,
"loss": 0.0314,
"num_tokens": 120265104.0,
"step": 1502
},
{
"epoch": 1.8746101060511542,
"grad_norm": 0.10341753477077234,
"learning_rate": 3.9422600534172105e-06,
"loss": 0.0347,
"num_tokens": 120345399.0,
"step": 1503
},
{
"epoch": 1.875857766687461,
"grad_norm": 0.109955990836735,
"learning_rate": 3.936576460846614e-06,
"loss": 0.0343,
"num_tokens": 120424888.0,
"step": 1504
},
{
"epoch": 1.877105427323768,
"grad_norm": 0.1067874469666549,
"learning_rate": 3.930895703235448e-06,
"loss": 0.0316,
"num_tokens": 120504351.0,
"step": 1505
},
{
"epoch": 1.878353087960075,
"grad_norm": 0.10876424328262234,
"learning_rate": 3.925217790884646e-06,
"loss": 0.0325,
"num_tokens": 120583924.0,
"step": 1506
},
{
"epoch": 1.8796007485963817,
"grad_norm": 0.11262768159157971,
"learning_rate": 3.919542734089978e-06,
"loss": 0.0354,
"num_tokens": 120664606.0,
"step": 1507
},
{
"epoch": 1.8808484092326887,
"grad_norm": 0.10063627694670048,
"learning_rate": 3.913870543142038e-06,
"loss": 0.0359,
"num_tokens": 120744208.0,
"step": 1508
},
{
"epoch": 1.8820960698689957,
"grad_norm": 0.12382898842560677,
"learning_rate": 3.908201228326222e-06,
"loss": 0.0328,
"num_tokens": 120824125.0,
"step": 1509
},
{
"epoch": 1.8833437305053025,
"grad_norm": 0.10492818441358094,
"learning_rate": 3.902534799922713e-06,
"loss": 0.0318,
"num_tokens": 120902940.0,
"step": 1510
},
{
"epoch": 1.8845913911416095,
"grad_norm": 0.10403111315609859,
"learning_rate": 3.896871268206456e-06,
"loss": 0.0364,
"num_tokens": 120985013.0,
"step": 1511
},
{
"epoch": 1.8858390517779164,
"grad_norm": 0.11769575525186173,
"learning_rate": 3.8912106434471486e-06,
"loss": 0.0325,
"num_tokens": 121064553.0,
"step": 1512
},
{
"epoch": 1.8870867124142232,
"grad_norm": 0.10389306641364818,
"learning_rate": 3.885552935909212e-06,
"loss": 0.0344,
"num_tokens": 121144893.0,
"step": 1513
},
{
"epoch": 1.8883343730505302,
"grad_norm": 0.10981301470560886,
"learning_rate": 3.879898155851779e-06,
"loss": 0.0337,
"num_tokens": 121225640.0,
"step": 1514
},
{
"epoch": 1.8895820336868372,
"grad_norm": 0.10151306602448709,
"learning_rate": 3.874246313528679e-06,
"loss": 0.034,
"num_tokens": 121307302.0,
"step": 1515
},
{
"epoch": 1.890829694323144,
"grad_norm": 0.11075900606639615,
"learning_rate": 3.868597419188409e-06,
"loss": 0.0356,
"num_tokens": 121388078.0,
"step": 1516
},
{
"epoch": 1.8920773549594512,
"grad_norm": 0.11142810402559525,
"learning_rate": 3.862951483074119e-06,
"loss": 0.0329,
"num_tokens": 121467954.0,
"step": 1517
},
{
"epoch": 1.893325015595758,
"grad_norm": 0.10044572617648408,
"learning_rate": 3.857308515423601e-06,
"loss": 0.0354,
"num_tokens": 121547246.0,
"step": 1518
},
{
"epoch": 1.8945726762320647,
"grad_norm": 0.12058986292961471,
"learning_rate": 3.851668526469261e-06,
"loss": 0.0361,
"num_tokens": 121628571.0,
"step": 1519
},
{
"epoch": 1.895820336868372,
"grad_norm": 0.10836004623444401,
"learning_rate": 3.846031526438102e-06,
"loss": 0.0332,
"num_tokens": 121709487.0,
"step": 1520
},
{
"epoch": 1.8970679975046787,
"grad_norm": 0.10878927022806904,
"learning_rate": 3.84039752555171e-06,
"loss": 0.0332,
"num_tokens": 121789703.0,
"step": 1521
},
{
"epoch": 1.8983156581409857,
"grad_norm": 0.10824955993736991,
"learning_rate": 3.834766534026231e-06,
"loss": 0.0332,
"num_tokens": 121869985.0,
"step": 1522
},
{
"epoch": 1.8995633187772927,
"grad_norm": 0.10955981205776032,
"learning_rate": 3.829138562072353e-06,
"loss": 0.0335,
"num_tokens": 121948903.0,
"step": 1523
},
{
"epoch": 1.9008109794135994,
"grad_norm": 0.10837260665067358,
"learning_rate": 3.823513619895293e-06,
"loss": 0.034,
"num_tokens": 122028802.0,
"step": 1524
},
{
"epoch": 1.9020586400499064,
"grad_norm": 0.11392485556540904,
"learning_rate": 3.81789171769477e-06,
"loss": 0.0339,
"num_tokens": 122110650.0,
"step": 1525
},
{
"epoch": 1.9033063006862134,
"grad_norm": 0.10471350453492868,
"learning_rate": 3.812272865664994e-06,
"loss": 0.0349,
"num_tokens": 122190750.0,
"step": 1526
},
{
"epoch": 1.9045539613225202,
"grad_norm": 0.11294540539518536,
"learning_rate": 3.8066570739946394e-06,
"loss": 0.0346,
"num_tokens": 122271273.0,
"step": 1527
},
{
"epoch": 1.9058016219588272,
"grad_norm": 0.11267496722591189,
"learning_rate": 3.801044352866834e-06,
"loss": 0.0358,
"num_tokens": 122351564.0,
"step": 1528
},
{
"epoch": 1.9070492825951342,
"grad_norm": 0.11401020747834047,
"learning_rate": 3.7954347124591395e-06,
"loss": 0.0344,
"num_tokens": 122431176.0,
"step": 1529
},
{
"epoch": 1.908296943231441,
"grad_norm": 0.10875035338383428,
"learning_rate": 3.7898281629435286e-06,
"loss": 0.0348,
"num_tokens": 122512805.0,
"step": 1530
},
{
"epoch": 1.909544603867748,
"grad_norm": 0.1104918835133121,
"learning_rate": 3.7842247144863686e-06,
"loss": 0.0321,
"num_tokens": 122592405.0,
"step": 1531
},
{
"epoch": 1.910792264504055,
"grad_norm": 0.10420246611454374,
"learning_rate": 3.778624377248409e-06,
"loss": 0.0336,
"num_tokens": 122672154.0,
"step": 1532
},
{
"epoch": 1.9120399251403617,
"grad_norm": 0.10975968736912756,
"learning_rate": 3.77302716138475e-06,
"loss": 0.0355,
"num_tokens": 122753896.0,
"step": 1533
},
{
"epoch": 1.913287585776669,
"grad_norm": 0.11562096811664528,
"learning_rate": 3.7674330770448374e-06,
"loss": 0.0339,
"num_tokens": 122833339.0,
"step": 1534
},
{
"epoch": 1.9145352464129757,
"grad_norm": 0.09559889402594825,
"learning_rate": 3.7618421343724386e-06,
"loss": 0.0343,
"num_tokens": 122913131.0,
"step": 1535
},
{
"epoch": 1.9157829070492824,
"grad_norm": 0.12632567082780488,
"learning_rate": 3.756254343505621e-06,
"loss": 0.0326,
"num_tokens": 122992473.0,
"step": 1536
},
{
"epoch": 1.9170305676855897,
"grad_norm": 0.10173152865169789,
"learning_rate": 3.7506697145767367e-06,
"loss": 0.0334,
"num_tokens": 123072318.0,
"step": 1537
},
{
"epoch": 1.9182782283218964,
"grad_norm": 0.11406574908341054,
"learning_rate": 3.745088257712408e-06,
"loss": 0.0337,
"num_tokens": 123151348.0,
"step": 1538
},
{
"epoch": 1.9195258889582034,
"grad_norm": 0.10892304795841887,
"learning_rate": 3.7395099830335034e-06,
"loss": 0.0342,
"num_tokens": 123231936.0,
"step": 1539
},
{
"epoch": 1.9207735495945104,
"grad_norm": 0.10694417012259536,
"learning_rate": 3.7339349006551193e-06,
"loss": 0.0337,
"num_tokens": 123312950.0,
"step": 1540
},
{
"epoch": 1.9220212102308172,
"grad_norm": 0.10017582315819759,
"learning_rate": 3.7283630206865696e-06,
"loss": 0.0333,
"num_tokens": 123392780.0,
"step": 1541
},
{
"epoch": 1.9232688708671242,
"grad_norm": 0.10747239898877138,
"learning_rate": 3.7227943532313504e-06,
"loss": 0.0341,
"num_tokens": 123472235.0,
"step": 1542
},
{
"epoch": 1.9245165315034312,
"grad_norm": 0.11544815418654572,
"learning_rate": 3.7172289083871436e-06,
"loss": 0.0362,
"num_tokens": 123552579.0,
"step": 1543
},
{
"epoch": 1.925764192139738,
"grad_norm": 0.11739967817563109,
"learning_rate": 3.7116666962457813e-06,
"loss": 0.033,
"num_tokens": 123631233.0,
"step": 1544
},
{
"epoch": 1.927011852776045,
"grad_norm": 0.10369089138250741,
"learning_rate": 3.7061077268932333e-06,
"loss": 0.0344,
"num_tokens": 123711026.0,
"step": 1545
},
{
"epoch": 1.928259513412352,
"grad_norm": 0.11479151752423149,
"learning_rate": 3.700552010409596e-06,
"loss": 0.0358,
"num_tokens": 123790909.0,
"step": 1546
},
{
"epoch": 1.9295071740486587,
"grad_norm": 0.1028773833805819,
"learning_rate": 3.694999556869059e-06,
"loss": 0.0351,
"num_tokens": 123872098.0,
"step": 1547
},
{
"epoch": 1.9307548346849657,
"grad_norm": 0.10697413999677229,
"learning_rate": 3.6894503763399003e-06,
"loss": 0.033,
"num_tokens": 123952070.0,
"step": 1548
},
{
"epoch": 1.9320024953212727,
"grad_norm": 0.10029417018146669,
"learning_rate": 3.683904478884461e-06,
"loss": 0.0324,
"num_tokens": 124032234.0,
"step": 1549
},
{
"epoch": 1.9332501559575794,
"grad_norm": 0.11027618652006071,
"learning_rate": 3.67836187455913e-06,
"loss": 0.0326,
"num_tokens": 124111121.0,
"step": 1550
},
{
"epoch": 1.9344978165938864,
"grad_norm": 0.1056614113686575,
"learning_rate": 3.672822573414323e-06,
"loss": 0.0369,
"num_tokens": 124191867.0,
"step": 1551
},
{
"epoch": 1.9357454772301934,
"grad_norm": 0.11091294820691895,
"learning_rate": 3.6672865854944673e-06,
"loss": 0.0356,
"num_tokens": 124272599.0,
"step": 1552
},
{
"epoch": 1.9369931378665002,
"grad_norm": 0.1084522698863579,
"learning_rate": 3.6617539208379836e-06,
"loss": 0.0336,
"num_tokens": 124352094.0,
"step": 1553
},
{
"epoch": 1.9382407985028074,
"grad_norm": 0.10585047276937648,
"learning_rate": 3.656224589477264e-06,
"loss": 0.0352,
"num_tokens": 124433325.0,
"step": 1554
},
{
"epoch": 1.9394884591391142,
"grad_norm": 0.10935995370656627,
"learning_rate": 3.65069860143866e-06,
"loss": 0.0312,
"num_tokens": 124511945.0,
"step": 1555
},
{
"epoch": 1.940736119775421,
"grad_norm": 0.10900465709590437,
"learning_rate": 3.645175966742456e-06,
"loss": 0.0355,
"num_tokens": 124592313.0,
"step": 1556
},
{
"epoch": 1.9419837804117281,
"grad_norm": 0.11473015794303712,
"learning_rate": 3.639656695402858e-06,
"loss": 0.0349,
"num_tokens": 124672598.0,
"step": 1557
},
{
"epoch": 1.943231441048035,
"grad_norm": 0.10421687435100782,
"learning_rate": 3.634140797427974e-06,
"loss": 0.0343,
"num_tokens": 124752029.0,
"step": 1558
},
{
"epoch": 1.944479101684342,
"grad_norm": 0.11638448293053776,
"learning_rate": 3.6286282828197904e-06,
"loss": 0.0361,
"num_tokens": 124832634.0,
"step": 1559
},
{
"epoch": 1.945726762320649,
"grad_norm": 0.11391845107383987,
"learning_rate": 3.623119161574169e-06,
"loss": 0.0327,
"num_tokens": 124912364.0,
"step": 1560
},
{
"epoch": 1.9469744229569557,
"grad_norm": 0.10616429489708827,
"learning_rate": 3.6176134436808074e-06,
"loss": 0.0344,
"num_tokens": 124991270.0,
"step": 1561
},
{
"epoch": 1.9482220835932627,
"grad_norm": 0.11167759034779555,
"learning_rate": 3.612111139123239e-06,
"loss": 0.0348,
"num_tokens": 125070833.0,
"step": 1562
},
{
"epoch": 1.9494697442295696,
"grad_norm": 0.1127208715547816,
"learning_rate": 3.6066122578788033e-06,
"loss": 0.0381,
"num_tokens": 125152435.0,
"step": 1563
},
{
"epoch": 1.9507174048658764,
"grad_norm": 0.10729694888144387,
"learning_rate": 3.6011168099186322e-06,
"loss": 0.0335,
"num_tokens": 125233146.0,
"step": 1564
},
{
"epoch": 1.9519650655021834,
"grad_norm": 0.11577479373886601,
"learning_rate": 3.5956248052076383e-06,
"loss": 0.0332,
"num_tokens": 125312477.0,
"step": 1565
},
{
"epoch": 1.9532127261384904,
"grad_norm": 0.10816544241229983,
"learning_rate": 3.5901362537044826e-06,
"loss": 0.0353,
"num_tokens": 125393204.0,
"step": 1566
},
{
"epoch": 1.9544603867747972,
"grad_norm": 0.1144921783827139,
"learning_rate": 3.584651165361568e-06,
"loss": 0.0339,
"num_tokens": 125473311.0,
"step": 1567
},
{
"epoch": 1.9557080474111042,
"grad_norm": 0.1023826741374561,
"learning_rate": 3.579169550125019e-06,
"loss": 0.0314,
"num_tokens": 125553173.0,
"step": 1568
},
{
"epoch": 1.9569557080474111,
"grad_norm": 0.1081903875181614,
"learning_rate": 3.5736914179346626e-06,
"loss": 0.0359,
"num_tokens": 125633344.0,
"step": 1569
},
{
"epoch": 1.958203368683718,
"grad_norm": 0.11814223600506968,
"learning_rate": 3.5682167787240053e-06,
"loss": 0.0333,
"num_tokens": 125713700.0,
"step": 1570
},
{
"epoch": 1.959451029320025,
"grad_norm": 0.11106552128597881,
"learning_rate": 3.5627456424202223e-06,
"loss": 0.0336,
"num_tokens": 125793802.0,
"step": 1571
},
{
"epoch": 1.960698689956332,
"grad_norm": 0.11282020876952419,
"learning_rate": 3.55727801894414e-06,
"loss": 0.0328,
"num_tokens": 125873850.0,
"step": 1572
},
{
"epoch": 1.9619463505926387,
"grad_norm": 0.10618294835959388,
"learning_rate": 3.5518139182102106e-06,
"loss": 0.033,
"num_tokens": 125953640.0,
"step": 1573
},
{
"epoch": 1.9631940112289459,
"grad_norm": 0.09834808850932374,
"learning_rate": 3.5463533501265e-06,
"loss": 0.032,
"num_tokens": 126033564.0,
"step": 1574
},
{
"epoch": 1.9644416718652526,
"grad_norm": 0.11284124916966394,
"learning_rate": 3.5408963245946714e-06,
"loss": 0.0348,
"num_tokens": 126114330.0,
"step": 1575
},
{
"epoch": 1.9656893325015594,
"grad_norm": 0.09978958639176037,
"learning_rate": 3.53544285150996e-06,
"loss": 0.0329,
"num_tokens": 126194064.0,
"step": 1576
},
{
"epoch": 1.9669369931378666,
"grad_norm": 0.11717035362702442,
"learning_rate": 3.529992940761159e-06,
"loss": 0.0397,
"num_tokens": 126274303.0,
"step": 1577
},
{
"epoch": 1.9681846537741734,
"grad_norm": 0.11953639846307562,
"learning_rate": 3.524546602230606e-06,
"loss": 0.0351,
"num_tokens": 126355527.0,
"step": 1578
},
{
"epoch": 1.9694323144104804,
"grad_norm": 0.10799542403102128,
"learning_rate": 3.5191038457941596e-06,
"loss": 0.0324,
"num_tokens": 126434438.0,
"step": 1579
},
{
"epoch": 1.9706799750467874,
"grad_norm": 0.11164008102679486,
"learning_rate": 3.5136646813211784e-06,
"loss": 0.0338,
"num_tokens": 126513806.0,
"step": 1580
},
{
"epoch": 1.9719276356830941,
"grad_norm": 0.11042889340863964,
"learning_rate": 3.5082291186745145e-06,
"loss": 0.0342,
"num_tokens": 126594160.0,
"step": 1581
},
{
"epoch": 1.9731752963194011,
"grad_norm": 0.11255018277654384,
"learning_rate": 3.5027971677104867e-06,
"loss": 0.0349,
"num_tokens": 126674625.0,
"step": 1582
},
{
"epoch": 1.9744229569557081,
"grad_norm": 0.11010249880686576,
"learning_rate": 3.497368838278862e-06,
"loss": 0.0343,
"num_tokens": 126754334.0,
"step": 1583
},
{
"epoch": 1.975670617592015,
"grad_norm": 0.10466844945374801,
"learning_rate": 3.491944140222845e-06,
"loss": 0.0327,
"num_tokens": 126834485.0,
"step": 1584
},
{
"epoch": 1.976918278228322,
"grad_norm": 0.10973920592988463,
"learning_rate": 3.486523083379051e-06,
"loss": 0.0336,
"num_tokens": 126913919.0,
"step": 1585
},
{
"epoch": 1.9781659388646289,
"grad_norm": 0.09624968559215073,
"learning_rate": 3.481105677577493e-06,
"loss": 0.0322,
"num_tokens": 126992768.0,
"step": 1586
},
{
"epoch": 1.9794135995009356,
"grad_norm": 0.10348850398918999,
"learning_rate": 3.475691932641569e-06,
"loss": 0.0327,
"num_tokens": 127073558.0,
"step": 1587
},
{
"epoch": 1.9806612601372426,
"grad_norm": 0.11008522746530837,
"learning_rate": 3.4702818583880305e-06,
"loss": 0.0329,
"num_tokens": 127154390.0,
"step": 1588
},
{
"epoch": 1.9819089207735496,
"grad_norm": 0.10576324521968579,
"learning_rate": 3.46487546462698e-06,
"loss": 0.0335,
"num_tokens": 127233684.0,
"step": 1589
},
{
"epoch": 1.9831565814098564,
"grad_norm": 0.10909609236650647,
"learning_rate": 3.4594727611618462e-06,
"loss": 0.0354,
"num_tokens": 127314072.0,
"step": 1590
},
{
"epoch": 1.9844042420461634,
"grad_norm": 0.10226819882059832,
"learning_rate": 3.454073757789359e-06,
"loss": 0.0344,
"num_tokens": 127393809.0,
"step": 1591
},
{
"epoch": 1.9856519026824704,
"grad_norm": 0.11124791645732714,
"learning_rate": 3.4486784642995442e-06,
"loss": 0.0338,
"num_tokens": 127474232.0,
"step": 1592
},
{
"epoch": 1.9868995633187772,
"grad_norm": 0.10680204301961628,
"learning_rate": 3.4432868904757024e-06,
"loss": 0.0342,
"num_tokens": 127554705.0,
"step": 1593
},
{
"epoch": 1.9881472239550844,
"grad_norm": 0.10744418843654158,
"learning_rate": 3.437899046094384e-06,
"loss": 0.0334,
"num_tokens": 127634236.0,
"step": 1594
},
{
"epoch": 1.9893948845913911,
"grad_norm": 0.10963223394836877,
"learning_rate": 3.432514940925378e-06,
"loss": 0.0344,
"num_tokens": 127714557.0,
"step": 1595
},
{
"epoch": 1.990642545227698,
"grad_norm": 0.11012697826983461,
"learning_rate": 3.4271345847316974e-06,
"loss": 0.0364,
"num_tokens": 127795159.0,
"step": 1596
},
{
"epoch": 1.9918902058640051,
"grad_norm": 0.11016272568594698,
"learning_rate": 3.421757987269554e-06,
"loss": 0.0362,
"num_tokens": 127875081.0,
"step": 1597
},
{
"epoch": 1.9931378665003119,
"grad_norm": 0.10065770585753678,
"learning_rate": 3.416385158288343e-06,
"loss": 0.0327,
"num_tokens": 127954573.0,
"step": 1598
},
{
"epoch": 1.9943855271366189,
"grad_norm": 0.10974734302658783,
"learning_rate": 3.411016107530628e-06,
"loss": 0.033,
"num_tokens": 128034668.0,
"step": 1599
},
{
"epoch": 1.9956331877729259,
"grad_norm": 0.11061221640528077,
"learning_rate": 3.405650844732122e-06,
"loss": 0.0351,
"num_tokens": 128114461.0,
"step": 1600
},
{
"epoch": 1.9968808484092326,
"grad_norm": 0.09898172330167038,
"learning_rate": 3.400289379621664e-06,
"loss": 0.0334,
"num_tokens": 128194681.0,
"step": 1601
},
{
"epoch": 1.9981285090455396,
"grad_norm": 0.10720532173976896,
"learning_rate": 3.394931721921214e-06,
"loss": 0.0323,
"num_tokens": 128274005.0,
"step": 1602
},
{
"epoch": 1.9993761696818466,
"grad_norm": 0.10715909655170884,
"learning_rate": 3.3895778813458256e-06,
"loss": 0.0339,
"num_tokens": 128353693.0,
"step": 1603
},
{
"epoch": 2.0,
"grad_norm": 0.15351115677037117,
"learning_rate": 3.3842278676036293e-06,
"loss": 0.0295,
"num_tokens": 128394204.0,
"step": 1604
},
{
"epoch": 2.0012476606363068,
"grad_norm": 0.09668414870141348,
"learning_rate": 3.3788816903958145e-06,
"loss": 0.0292,
"num_tokens": 128474132.0,
"step": 1605
},
{
"epoch": 2.002495321272614,
"grad_norm": 0.09284121837219293,
"learning_rate": 3.37353935941662e-06,
"loss": 0.0277,
"num_tokens": 128554051.0,
"step": 1606
},
{
"epoch": 2.0037429819089208,
"grad_norm": 0.09339384544922638,
"learning_rate": 3.3682008843533055e-06,
"loss": 0.029,
"num_tokens": 128634209.0,
"step": 1607
},
{
"epoch": 2.0049906425452275,
"grad_norm": 0.09435077752847291,
"learning_rate": 3.3628662748861374e-06,
"loss": 0.0282,
"num_tokens": 128715069.0,
"step": 1608
},
{
"epoch": 2.0062383031815347,
"grad_norm": 0.09614281235112,
"learning_rate": 3.357535540688379e-06,
"loss": 0.0278,
"num_tokens": 128795504.0,
"step": 1609
},
{
"epoch": 2.0074859638178415,
"grad_norm": 0.09264669561245602,
"learning_rate": 3.3522086914262585e-06,
"loss": 0.0271,
"num_tokens": 128876123.0,
"step": 1610
},
{
"epoch": 2.0087336244541483,
"grad_norm": 0.11140936885389302,
"learning_rate": 3.3468857367589665e-06,
"loss": 0.0275,
"num_tokens": 128955991.0,
"step": 1611
},
{
"epoch": 2.0099812850904555,
"grad_norm": 0.09169587047179813,
"learning_rate": 3.3415666863386298e-06,
"loss": 0.0275,
"num_tokens": 129039016.0,
"step": 1612
},
{
"epoch": 2.0112289457267623,
"grad_norm": 0.1042070178139685,
"learning_rate": 3.3362515498102934e-06,
"loss": 0.0275,
"num_tokens": 129120041.0,
"step": 1613
},
{
"epoch": 2.0124766063630695,
"grad_norm": 0.11233635245764659,
"learning_rate": 3.330940336811903e-06,
"loss": 0.0281,
"num_tokens": 129200867.0,
"step": 1614
},
{
"epoch": 2.0137242669993762,
"grad_norm": 0.09294626630897553,
"learning_rate": 3.325633056974298e-06,
"loss": 0.0259,
"num_tokens": 129279446.0,
"step": 1615
},
{
"epoch": 2.014971927635683,
"grad_norm": 0.10425624673925903,
"learning_rate": 3.3203297199211794e-06,
"loss": 0.0273,
"num_tokens": 129359517.0,
"step": 1616
},
{
"epoch": 2.01621958827199,
"grad_norm": 0.10772022702288936,
"learning_rate": 3.315030335269096e-06,
"loss": 0.0272,
"num_tokens": 129439678.0,
"step": 1617
},
{
"epoch": 2.017467248908297,
"grad_norm": 0.11804003684908118,
"learning_rate": 3.309734912627441e-06,
"loss": 0.0282,
"num_tokens": 129519376.0,
"step": 1618
},
{
"epoch": 2.0187149095446038,
"grad_norm": 0.12381607955705823,
"learning_rate": 3.304443461598413e-06,
"loss": 0.0288,
"num_tokens": 129600651.0,
"step": 1619
},
{
"epoch": 2.019962570180911,
"grad_norm": 0.13319111452989643,
"learning_rate": 3.299155991777011e-06,
"loss": 0.0297,
"num_tokens": 129680386.0,
"step": 1620
},
{
"epoch": 2.0212102308172177,
"grad_norm": 0.11432348701501038,
"learning_rate": 3.2938725127510185e-06,
"loss": 0.0282,
"num_tokens": 129760859.0,
"step": 1621
},
{
"epoch": 2.0224578914535245,
"grad_norm": 0.10791791050792644,
"learning_rate": 3.2885930341009774e-06,
"loss": 0.0277,
"num_tokens": 129840051.0,
"step": 1622
},
{
"epoch": 2.0237055520898317,
"grad_norm": 0.11686951819864136,
"learning_rate": 3.2833175654001787e-06,
"loss": 0.0283,
"num_tokens": 129919701.0,
"step": 1623
},
{
"epoch": 2.0249532127261385,
"grad_norm": 0.11359351664175828,
"learning_rate": 3.278046116214642e-06,
"loss": 0.0269,
"num_tokens": 129999877.0,
"step": 1624
},
{
"epoch": 2.0262008733624453,
"grad_norm": 0.1023939359316259,
"learning_rate": 3.272778696103099e-06,
"loss": 0.0275,
"num_tokens": 130079968.0,
"step": 1625
},
{
"epoch": 2.0274485339987525,
"grad_norm": 0.10892544064999989,
"learning_rate": 3.2675153146169736e-06,
"loss": 0.0275,
"num_tokens": 130160624.0,
"step": 1626
},
{
"epoch": 2.0286961946350592,
"grad_norm": 0.10430749700068655,
"learning_rate": 3.2622559813003684e-06,
"loss": 0.0288,
"num_tokens": 130239925.0,
"step": 1627
},
{
"epoch": 2.029943855271366,
"grad_norm": 0.10317672945567567,
"learning_rate": 3.2570007056900437e-06,
"loss": 0.0271,
"num_tokens": 130320799.0,
"step": 1628
},
{
"epoch": 2.031191515907673,
"grad_norm": 0.10963736701978923,
"learning_rate": 3.2517494973154008e-06,
"loss": 0.0277,
"num_tokens": 130400099.0,
"step": 1629
},
{
"epoch": 2.03243917654398,
"grad_norm": 0.12238499931140737,
"learning_rate": 3.2465023656984707e-06,
"loss": 0.0292,
"num_tokens": 130480729.0,
"step": 1630
},
{
"epoch": 2.0336868371802868,
"grad_norm": 0.10971492853164319,
"learning_rate": 3.2412593203538857e-06,
"loss": 0.0297,
"num_tokens": 130560371.0,
"step": 1631
},
{
"epoch": 2.034934497816594,
"grad_norm": 0.1230436141198275,
"learning_rate": 3.236020370788876e-06,
"loss": 0.0273,
"num_tokens": 130639731.0,
"step": 1632
},
{
"epoch": 2.0361821584529007,
"grad_norm": 0.10170244549018313,
"learning_rate": 3.230785526503236e-06,
"loss": 0.0269,
"num_tokens": 130719595.0,
"step": 1633
},
{
"epoch": 2.037429819089208,
"grad_norm": 0.11160175846037224,
"learning_rate": 3.225554796989325e-06,
"loss": 0.0279,
"num_tokens": 130800181.0,
"step": 1634
},
{
"epoch": 2.0386774797255147,
"grad_norm": 0.09813935826310065,
"learning_rate": 3.2203281917320328e-06,
"loss": 0.0261,
"num_tokens": 130879513.0,
"step": 1635
},
{
"epoch": 2.0399251403618215,
"grad_norm": 0.11117192931618461,
"learning_rate": 3.2151057202087783e-06,
"loss": 0.0283,
"num_tokens": 130958850.0,
"step": 1636
},
{
"epoch": 2.0411728009981287,
"grad_norm": 0.10698263215463055,
"learning_rate": 3.209887391889479e-06,
"loss": 0.027,
"num_tokens": 131038985.0,
"step": 1637
},
{
"epoch": 2.0424204616344355,
"grad_norm": 0.10820076619681371,
"learning_rate": 3.204673216236539e-06,
"loss": 0.0275,
"num_tokens": 131118757.0,
"step": 1638
},
{
"epoch": 2.0436681222707422,
"grad_norm": 0.11467484503330413,
"learning_rate": 3.199463202704838e-06,
"loss": 0.0273,
"num_tokens": 131201221.0,
"step": 1639
},
{
"epoch": 2.0449157829070495,
"grad_norm": 0.12247988636033476,
"learning_rate": 3.194257360741706e-06,
"loss": 0.0288,
"num_tokens": 131282719.0,
"step": 1640
},
{
"epoch": 2.046163443543356,
"grad_norm": 0.1057593800426446,
"learning_rate": 3.189055699786906e-06,
"loss": 0.0274,
"num_tokens": 131362232.0,
"step": 1641
},
{
"epoch": 2.047411104179663,
"grad_norm": 0.12364375320242353,
"learning_rate": 3.1838582292726206e-06,
"loss": 0.0289,
"num_tokens": 131442046.0,
"step": 1642
},
{
"epoch": 2.04865876481597,
"grad_norm": 0.1145129297133959,
"learning_rate": 3.1786649586234373e-06,
"loss": 0.0282,
"num_tokens": 131523135.0,
"step": 1643
},
{
"epoch": 2.049906425452277,
"grad_norm": 0.10215949526592619,
"learning_rate": 3.173475897256325e-06,
"loss": 0.0266,
"num_tokens": 131603284.0,
"step": 1644
},
{
"epoch": 2.0511540860885837,
"grad_norm": 0.10761560661797925,
"learning_rate": 3.1682910545806167e-06,
"loss": 0.028,
"num_tokens": 131683908.0,
"step": 1645
},
{
"epoch": 2.052401746724891,
"grad_norm": 0.11121482292743783,
"learning_rate": 3.1631104399980053e-06,
"loss": 0.0267,
"num_tokens": 131762852.0,
"step": 1646
},
{
"epoch": 2.0536494073611977,
"grad_norm": 0.11153961622318409,
"learning_rate": 3.157934062902508e-06,
"loss": 0.0276,
"num_tokens": 131842654.0,
"step": 1647
},
{
"epoch": 2.0548970679975045,
"grad_norm": 0.11202792323594137,
"learning_rate": 3.1527619326804594e-06,
"loss": 0.0278,
"num_tokens": 131921403.0,
"step": 1648
},
{
"epoch": 2.0561447286338117,
"grad_norm": 0.11878747584208416,
"learning_rate": 3.147594058710498e-06,
"loss": 0.0279,
"num_tokens": 132003424.0,
"step": 1649
},
{
"epoch": 2.0573923892701185,
"grad_norm": 0.11600844083004452,
"learning_rate": 3.14243045036354e-06,
"loss": 0.0276,
"num_tokens": 132082840.0,
"step": 1650
},
{
"epoch": 2.0586400499064252,
"grad_norm": 0.11293539536069673,
"learning_rate": 3.1372711170027666e-06,
"loss": 0.0275,
"num_tokens": 132162541.0,
"step": 1651
},
{
"epoch": 2.0598877105427325,
"grad_norm": 0.11026505266606129,
"learning_rate": 3.13211606798361e-06,
"loss": 0.0274,
"num_tokens": 132242631.0,
"step": 1652
},
{
"epoch": 2.061135371179039,
"grad_norm": 0.12188319048183703,
"learning_rate": 3.1269653126537344e-06,
"loss": 0.0278,
"num_tokens": 132323488.0,
"step": 1653
},
{
"epoch": 2.0623830318153464,
"grad_norm": 0.12082041385469403,
"learning_rate": 3.121818860353011e-06,
"loss": 0.0275,
"num_tokens": 132403983.0,
"step": 1654
},
{
"epoch": 2.063630692451653,
"grad_norm": 0.10629866280671935,
"learning_rate": 3.116676720413519e-06,
"loss": 0.0267,
"num_tokens": 132483686.0,
"step": 1655
},
{
"epoch": 2.06487835308796,
"grad_norm": 0.10618934252287057,
"learning_rate": 3.11153890215951e-06,
"loss": 0.028,
"num_tokens": 132562728.0,
"step": 1656
},
{
"epoch": 2.066126013724267,
"grad_norm": 0.1179037811248608,
"learning_rate": 3.1064054149073984e-06,
"loss": 0.0287,
"num_tokens": 132643301.0,
"step": 1657
},
{
"epoch": 2.067373674360574,
"grad_norm": 0.1068883102584125,
"learning_rate": 3.1012762679657525e-06,
"loss": 0.0271,
"num_tokens": 132722314.0,
"step": 1658
},
{
"epoch": 2.0686213349968807,
"grad_norm": 0.10906974578938657,
"learning_rate": 3.0961514706352654e-06,
"loss": 0.028,
"num_tokens": 132801325.0,
"step": 1659
},
{
"epoch": 2.069868995633188,
"grad_norm": 0.11050438055442657,
"learning_rate": 3.09103103220874e-06,
"loss": 0.0273,
"num_tokens": 132881040.0,
"step": 1660
},
{
"epoch": 2.0711166562694947,
"grad_norm": 0.11682521580597607,
"learning_rate": 3.085914961971082e-06,
"loss": 0.0283,
"num_tokens": 132960595.0,
"step": 1661
},
{
"epoch": 2.0723643169058015,
"grad_norm": 0.11450793460923425,
"learning_rate": 3.080803269199275e-06,
"loss": 0.0277,
"num_tokens": 133039931.0,
"step": 1662
},
{
"epoch": 2.0736119775421087,
"grad_norm": 0.11467759010106195,
"learning_rate": 3.0756959631623583e-06,
"loss": 0.0276,
"num_tokens": 133120290.0,
"step": 1663
},
{
"epoch": 2.0748596381784155,
"grad_norm": 0.11713560955703137,
"learning_rate": 3.0705930531214255e-06,
"loss": 0.0284,
"num_tokens": 133201093.0,
"step": 1664
},
{
"epoch": 2.0761072988147222,
"grad_norm": 0.11798027964717171,
"learning_rate": 3.065494548329594e-06,
"loss": 0.0283,
"num_tokens": 133280732.0,
"step": 1665
},
{
"epoch": 2.0773549594510294,
"grad_norm": 0.11006281058166327,
"learning_rate": 3.060400458031991e-06,
"loss": 0.0267,
"num_tokens": 133360752.0,
"step": 1666
},
{
"epoch": 2.078602620087336,
"grad_norm": 0.10693659229092088,
"learning_rate": 3.055310791465744e-06,
"loss": 0.0281,
"num_tokens": 133440361.0,
"step": 1667
},
{
"epoch": 2.079850280723643,
"grad_norm": 0.10276904415759595,
"learning_rate": 3.0502255578599594e-06,
"loss": 0.0266,
"num_tokens": 133520074.0,
"step": 1668
},
{
"epoch": 2.08109794135995,
"grad_norm": 0.10682226267908494,
"learning_rate": 3.0451447664357005e-06,
"loss": 0.0274,
"num_tokens": 133599454.0,
"step": 1669
},
{
"epoch": 2.082345601996257,
"grad_norm": 0.11642340085158942,
"learning_rate": 3.040068426405976e-06,
"loss": 0.028,
"num_tokens": 133679535.0,
"step": 1670
},
{
"epoch": 2.083593262632564,
"grad_norm": 0.10439274580853032,
"learning_rate": 3.0349965469757283e-06,
"loss": 0.0276,
"num_tokens": 133759276.0,
"step": 1671
},
{
"epoch": 2.084840923268871,
"grad_norm": 0.10178900758944974,
"learning_rate": 3.0299291373418038e-06,
"loss": 0.0271,
"num_tokens": 133839238.0,
"step": 1672
},
{
"epoch": 2.0860885839051777,
"grad_norm": 0.10108576053246532,
"learning_rate": 3.024866206692953e-06,
"loss": 0.0266,
"num_tokens": 133918423.0,
"step": 1673
},
{
"epoch": 2.087336244541485,
"grad_norm": 0.10890004660532689,
"learning_rate": 3.0198077642097945e-06,
"loss": 0.0281,
"num_tokens": 133999314.0,
"step": 1674
},
{
"epoch": 2.0885839051777917,
"grad_norm": 0.11949459892569592,
"learning_rate": 3.014753819064817e-06,
"loss": 0.0285,
"num_tokens": 134079417.0,
"step": 1675
},
{
"epoch": 2.0898315658140985,
"grad_norm": 0.11019307261367436,
"learning_rate": 3.009704380422348e-06,
"loss": 0.0268,
"num_tokens": 134159336.0,
"step": 1676
},
{
"epoch": 2.0910792264504057,
"grad_norm": 0.1094076338739407,
"learning_rate": 3.004659457438548e-06,
"loss": 0.0282,
"num_tokens": 134238687.0,
"step": 1677
},
{
"epoch": 2.0923268870867124,
"grad_norm": 0.11203035809174719,
"learning_rate": 2.999619059261387e-06,
"loss": 0.0277,
"num_tokens": 134319431.0,
"step": 1678
},
{
"epoch": 2.093574547723019,
"grad_norm": 0.10125109772296438,
"learning_rate": 2.9945831950306285e-06,
"loss": 0.0267,
"num_tokens": 134399747.0,
"step": 1679
},
{
"epoch": 2.0948222083593264,
"grad_norm": 0.10479161190075127,
"learning_rate": 2.9895518738778196e-06,
"loss": 0.0271,
"num_tokens": 134479438.0,
"step": 1680
},
{
"epoch": 2.096069868995633,
"grad_norm": 0.11262694348153912,
"learning_rate": 2.984525104926262e-06,
"loss": 0.0285,
"num_tokens": 134559824.0,
"step": 1681
},
{
"epoch": 2.09731752963194,
"grad_norm": 0.11572630284775189,
"learning_rate": 2.97950289729101e-06,
"loss": 0.0286,
"num_tokens": 134639963.0,
"step": 1682
},
{
"epoch": 2.098565190268247,
"grad_norm": 0.11746731799240802,
"learning_rate": 2.974485260078846e-06,
"loss": 0.0277,
"num_tokens": 134719925.0,
"step": 1683
},
{
"epoch": 2.099812850904554,
"grad_norm": 0.121581880657992,
"learning_rate": 2.9694722023882607e-06,
"loss": 0.0269,
"num_tokens": 134799277.0,
"step": 1684
},
{
"epoch": 2.1010605115408607,
"grad_norm": 0.11598556908383914,
"learning_rate": 2.9644637333094404e-06,
"loss": 0.0284,
"num_tokens": 134879892.0,
"step": 1685
},
{
"epoch": 2.102308172177168,
"grad_norm": 0.14223573137275083,
"learning_rate": 2.959459861924258e-06,
"loss": 0.028,
"num_tokens": 134959367.0,
"step": 1686
},
{
"epoch": 2.1035558328134747,
"grad_norm": 0.12003504700797399,
"learning_rate": 2.954460597306242e-06,
"loss": 0.0291,
"num_tokens": 135039815.0,
"step": 1687
},
{
"epoch": 2.1048034934497815,
"grad_norm": 0.12915254648172153,
"learning_rate": 2.9494659485205683e-06,
"loss": 0.0459,
"num_tokens": 135121468.0,
"step": 1688
},
{
"epoch": 2.1060511540860887,
"grad_norm": 0.10985724087657672,
"learning_rate": 2.9444759246240505e-06,
"loss": 0.0271,
"num_tokens": 135200652.0,
"step": 1689
},
{
"epoch": 2.1072988147223954,
"grad_norm": 0.11936091256114177,
"learning_rate": 2.939490534665107e-06,
"loss": 0.0278,
"num_tokens": 135281590.0,
"step": 1690
},
{
"epoch": 2.108546475358702,
"grad_norm": 0.11222455840445227,
"learning_rate": 2.934509787683755e-06,
"loss": 0.0281,
"num_tokens": 135361375.0,
"step": 1691
},
{
"epoch": 2.1097941359950094,
"grad_norm": 0.11147040706020958,
"learning_rate": 2.929533692711598e-06,
"loss": 0.0274,
"num_tokens": 135442925.0,
"step": 1692
},
{
"epoch": 2.111041796631316,
"grad_norm": 0.10467852886453667,
"learning_rate": 2.9245622587717982e-06,
"loss": 0.0275,
"num_tokens": 135523906.0,
"step": 1693
},
{
"epoch": 2.1122894572676234,
"grad_norm": 0.10869697183862845,
"learning_rate": 2.919595494879065e-06,
"loss": 0.0276,
"num_tokens": 135603783.0,
"step": 1694
},
{
"epoch": 2.11353711790393,
"grad_norm": 0.1155482691293836,
"learning_rate": 2.9146334100396474e-06,
"loss": 0.0282,
"num_tokens": 135684084.0,
"step": 1695
},
{
"epoch": 2.114784778540237,
"grad_norm": 0.11252805380717851,
"learning_rate": 2.9096760132513036e-06,
"loss": 0.0286,
"num_tokens": 135765048.0,
"step": 1696
},
{
"epoch": 2.116032439176544,
"grad_norm": 0.11033818934026106,
"learning_rate": 2.9047233135032927e-06,
"loss": 0.0275,
"num_tokens": 135845335.0,
"step": 1697
},
{
"epoch": 2.117280099812851,
"grad_norm": 0.13859448472332372,
"learning_rate": 2.8997753197763532e-06,
"loss": 0.03,
"num_tokens": 135925643.0,
"step": 1698
},
{
"epoch": 2.1185277604491577,
"grad_norm": 0.11224407005748382,
"learning_rate": 2.894832041042699e-06,
"loss": 0.0288,
"num_tokens": 136005781.0,
"step": 1699
},
{
"epoch": 2.119775421085465,
"grad_norm": 0.10019855101966049,
"learning_rate": 2.8898934862659823e-06,
"loss": 0.0266,
"num_tokens": 136085338.0,
"step": 1700
},
{
"epoch": 2.1210230817217717,
"grad_norm": 0.12238851973022535,
"learning_rate": 2.8849596644013e-06,
"loss": 0.0277,
"num_tokens": 136166090.0,
"step": 1701
},
{
"epoch": 2.1222707423580784,
"grad_norm": 0.11577197457908402,
"learning_rate": 2.880030584395162e-06,
"loss": 0.0281,
"num_tokens": 136246455.0,
"step": 1702
},
{
"epoch": 2.1235184029943857,
"grad_norm": 0.11575314696378225,
"learning_rate": 2.8751062551854775e-06,
"loss": 0.0292,
"num_tokens": 136326020.0,
"step": 1703
},
{
"epoch": 2.1247660636306924,
"grad_norm": 0.1236602811227467,
"learning_rate": 2.870186685701545e-06,
"loss": 0.0278,
"num_tokens": 136405902.0,
"step": 1704
},
{
"epoch": 2.126013724266999,
"grad_norm": 0.10698357238776467,
"learning_rate": 2.8652718848640337e-06,
"loss": 0.0271,
"num_tokens": 136484304.0,
"step": 1705
},
{
"epoch": 2.1272613849033064,
"grad_norm": 0.11565417571999578,
"learning_rate": 2.8603618615849603e-06,
"loss": 0.0289,
"num_tokens": 136563783.0,
"step": 1706
},
{
"epoch": 2.128509045539613,
"grad_norm": 0.11178198694056778,
"learning_rate": 2.8554566247676806e-06,
"loss": 0.027,
"num_tokens": 136643585.0,
"step": 1707
},
{
"epoch": 2.12975670617592,
"grad_norm": 0.10484411194496365,
"learning_rate": 2.850556183306874e-06,
"loss": 0.0277,
"num_tokens": 136723910.0,
"step": 1708
},
{
"epoch": 2.131004366812227,
"grad_norm": 0.11008252356620116,
"learning_rate": 2.845660546088519e-06,
"loss": 0.0271,
"num_tokens": 136802746.0,
"step": 1709
},
{
"epoch": 2.132252027448534,
"grad_norm": 0.1244585389706024,
"learning_rate": 2.8407697219898865e-06,
"loss": 0.0277,
"num_tokens": 136883051.0,
"step": 1710
},
{
"epoch": 2.133499688084841,
"grad_norm": 0.11761109934284099,
"learning_rate": 2.8358837198795223e-06,
"loss": 0.0285,
"num_tokens": 136963474.0,
"step": 1711
},
{
"epoch": 2.134747348721148,
"grad_norm": 0.11248525039399156,
"learning_rate": 2.8310025486172223e-06,
"loss": 0.0288,
"num_tokens": 137043833.0,
"step": 1712
},
{
"epoch": 2.1359950093574547,
"grad_norm": 0.11548415099531208,
"learning_rate": 2.8261262170540242e-06,
"loss": 0.0274,
"num_tokens": 137123180.0,
"step": 1713
},
{
"epoch": 2.137242669993762,
"grad_norm": 0.10709230843633225,
"learning_rate": 2.821254734032194e-06,
"loss": 0.0275,
"num_tokens": 137203580.0,
"step": 1714
},
{
"epoch": 2.1384903306300687,
"grad_norm": 0.11759009663698383,
"learning_rate": 2.8163881083852e-06,
"loss": 0.0299,
"num_tokens": 137284710.0,
"step": 1715
},
{
"epoch": 2.1397379912663754,
"grad_norm": 0.10669561409225682,
"learning_rate": 2.811526348937706e-06,
"loss": 0.0275,
"num_tokens": 137363810.0,
"step": 1716
},
{
"epoch": 2.1409856519026826,
"grad_norm": 0.10185154190042917,
"learning_rate": 2.806669464505552e-06,
"loss": 0.0274,
"num_tokens": 137443227.0,
"step": 1717
},
{
"epoch": 2.1422333125389894,
"grad_norm": 0.11279381093375922,
"learning_rate": 2.80181746389574e-06,
"loss": 0.0277,
"num_tokens": 137522857.0,
"step": 1718
},
{
"epoch": 2.143480973175296,
"grad_norm": 0.11315608721853433,
"learning_rate": 2.7969703559064076e-06,
"loss": 0.0278,
"num_tokens": 137602845.0,
"step": 1719
},
{
"epoch": 2.1447286338116034,
"grad_norm": 0.11922283733338566,
"learning_rate": 2.792128149326833e-06,
"loss": 0.0289,
"num_tokens": 137683630.0,
"step": 1720
},
{
"epoch": 2.14597629444791,
"grad_norm": 0.115407862616771,
"learning_rate": 2.7872908529373976e-06,
"loss": 0.0286,
"num_tokens": 137765456.0,
"step": 1721
},
{
"epoch": 2.147223955084217,
"grad_norm": 0.11208353296927773,
"learning_rate": 2.782458475509581e-06,
"loss": 0.0271,
"num_tokens": 137845544.0,
"step": 1722
},
{
"epoch": 2.148471615720524,
"grad_norm": 0.1068207673083175,
"learning_rate": 2.7776310258059447e-06,
"loss": 0.0271,
"num_tokens": 137924567.0,
"step": 1723
},
{
"epoch": 2.149719276356831,
"grad_norm": 0.1144766801038503,
"learning_rate": 2.772808512580114e-06,
"loss": 0.0311,
"num_tokens": 138004671.0,
"step": 1724
},
{
"epoch": 2.1509669369931377,
"grad_norm": 0.11230257045378114,
"learning_rate": 2.767990944576763e-06,
"loss": 0.0281,
"num_tokens": 138085655.0,
"step": 1725
},
{
"epoch": 2.152214597629445,
"grad_norm": 0.10991165872168095,
"learning_rate": 2.7631783305316017e-06,
"loss": 0.0268,
"num_tokens": 138166694.0,
"step": 1726
},
{
"epoch": 2.1534622582657517,
"grad_norm": 0.10971797219708118,
"learning_rate": 2.7583706791713503e-06,
"loss": 0.0273,
"num_tokens": 138246414.0,
"step": 1727
},
{
"epoch": 2.154709918902059,
"grad_norm": 0.11270715704229847,
"learning_rate": 2.7535679992137338e-06,
"loss": 0.0271,
"num_tokens": 138326768.0,
"step": 1728
},
{
"epoch": 2.1559575795383656,
"grad_norm": 0.12328542886456291,
"learning_rate": 2.7487702993674647e-06,
"loss": 0.029,
"num_tokens": 138406486.0,
"step": 1729
},
{
"epoch": 2.1572052401746724,
"grad_norm": 0.1190445079709797,
"learning_rate": 2.7439775883322228e-06,
"loss": 0.0284,
"num_tokens": 138488315.0,
"step": 1730
},
{
"epoch": 2.158452900810979,
"grad_norm": 0.11340655933746487,
"learning_rate": 2.739189874798639e-06,
"loss": 0.0286,
"num_tokens": 138569358.0,
"step": 1731
},
{
"epoch": 2.1597005614472864,
"grad_norm": 0.11303296924070988,
"learning_rate": 2.7344071674482874e-06,
"loss": 0.0281,
"num_tokens": 138648911.0,
"step": 1732
},
{
"epoch": 2.160948222083593,
"grad_norm": 0.1034480761800008,
"learning_rate": 2.729629474953662e-06,
"loss": 0.0266,
"num_tokens": 138727334.0,
"step": 1733
},
{
"epoch": 2.1621958827199004,
"grad_norm": 0.11156242541516115,
"learning_rate": 2.7248568059781654e-06,
"loss": 0.0263,
"num_tokens": 138807463.0,
"step": 1734
},
{
"epoch": 2.163443543356207,
"grad_norm": 0.10227201302760246,
"learning_rate": 2.7200891691760838e-06,
"loss": 0.0272,
"num_tokens": 138888573.0,
"step": 1735
},
{
"epoch": 2.164691203992514,
"grad_norm": 0.11926979754093582,
"learning_rate": 2.715326573192588e-06,
"loss": 0.028,
"num_tokens": 138967599.0,
"step": 1736
},
{
"epoch": 2.165938864628821,
"grad_norm": 0.10590335435742637,
"learning_rate": 2.710569026663702e-06,
"loss": 0.027,
"num_tokens": 139048151.0,
"step": 1737
},
{
"epoch": 2.167186525265128,
"grad_norm": 0.1148169854925881,
"learning_rate": 2.705816538216296e-06,
"loss": 0.0272,
"num_tokens": 139128924.0,
"step": 1738
},
{
"epoch": 2.1684341859014347,
"grad_norm": 0.1019205633049057,
"learning_rate": 2.7010691164680696e-06,
"loss": 0.0268,
"num_tokens": 139210922.0,
"step": 1739
},
{
"epoch": 2.169681846537742,
"grad_norm": 0.12286991978648502,
"learning_rate": 2.696326770027533e-06,
"loss": 0.0352,
"num_tokens": 139292094.0,
"step": 1740
},
{
"epoch": 2.1709295071740486,
"grad_norm": 0.11680777294553368,
"learning_rate": 2.6915895074939912e-06,
"loss": 0.0274,
"num_tokens": 139372386.0,
"step": 1741
},
{
"epoch": 2.1721771678103554,
"grad_norm": 0.11434415621297753,
"learning_rate": 2.6868573374575356e-06,
"loss": 0.028,
"num_tokens": 139451840.0,
"step": 1742
},
{
"epoch": 2.1734248284466626,
"grad_norm": 0.11068475985426603,
"learning_rate": 2.6821302684990204e-06,
"loss": 0.0282,
"num_tokens": 139531179.0,
"step": 1743
},
{
"epoch": 2.1746724890829694,
"grad_norm": 0.10765013051927665,
"learning_rate": 2.677408309190049e-06,
"loss": 0.0273,
"num_tokens": 139611340.0,
"step": 1744
},
{
"epoch": 2.175920149719276,
"grad_norm": 0.1047064628578538,
"learning_rate": 2.672691468092963e-06,
"loss": 0.0266,
"num_tokens": 139690494.0,
"step": 1745
},
{
"epoch": 2.1771678103555834,
"grad_norm": 0.12852423753495817,
"learning_rate": 2.6679797537608184e-06,
"loss": 0.0283,
"num_tokens": 139773131.0,
"step": 1746
},
{
"epoch": 2.17841547099189,
"grad_norm": 0.10882310129634558,
"learning_rate": 2.6632731747373785e-06,
"loss": 0.0281,
"num_tokens": 139853287.0,
"step": 1747
},
{
"epoch": 2.179663131628197,
"grad_norm": 0.11131589806757092,
"learning_rate": 2.658571739557096e-06,
"loss": 0.0278,
"num_tokens": 139934168.0,
"step": 1748
},
{
"epoch": 2.180910792264504,
"grad_norm": 0.11163023096979792,
"learning_rate": 2.653875456745092e-06,
"loss": 0.0274,
"num_tokens": 140014255.0,
"step": 1749
},
{
"epoch": 2.182158452900811,
"grad_norm": 0.10614675078257936,
"learning_rate": 2.6491843348171455e-06,
"loss": 0.0275,
"num_tokens": 140094036.0,
"step": 1750
},
{
"epoch": 2.183406113537118,
"grad_norm": 0.11255684585168828,
"learning_rate": 2.644498382279681e-06,
"loss": 0.0279,
"num_tokens": 140174691.0,
"step": 1751
},
{
"epoch": 2.184653774173425,
"grad_norm": 0.11526184776808238,
"learning_rate": 2.639817607629745e-06,
"loss": 0.028,
"num_tokens": 140254358.0,
"step": 1752
},
{
"epoch": 2.1859014348097316,
"grad_norm": 0.11014270003322352,
"learning_rate": 2.635142019354995e-06,
"loss": 0.0279,
"num_tokens": 140335203.0,
"step": 1753
},
{
"epoch": 2.187149095446039,
"grad_norm": 0.11726190195059955,
"learning_rate": 2.6304716259336903e-06,
"loss": 0.027,
"num_tokens": 140415001.0,
"step": 1754
},
{
"epoch": 2.1883967560823456,
"grad_norm": 0.11551789258082427,
"learning_rate": 2.6258064358346642e-06,
"loss": 0.0276,
"num_tokens": 140495012.0,
"step": 1755
},
{
"epoch": 2.1896444167186524,
"grad_norm": 0.12911904078324782,
"learning_rate": 2.621146457517314e-06,
"loss": 0.0298,
"num_tokens": 140577112.0,
"step": 1756
},
{
"epoch": 2.1908920773549596,
"grad_norm": 0.10893674109666582,
"learning_rate": 2.6164916994315916e-06,
"loss": 0.0269,
"num_tokens": 140657288.0,
"step": 1757
},
{
"epoch": 2.1921397379912664,
"grad_norm": 0.11561232996328151,
"learning_rate": 2.6118421700179795e-06,
"loss": 0.0275,
"num_tokens": 140737502.0,
"step": 1758
},
{
"epoch": 2.193387398627573,
"grad_norm": 0.11811526803531466,
"learning_rate": 2.6071978777074796e-06,
"loss": 0.0278,
"num_tokens": 140816956.0,
"step": 1759
},
{
"epoch": 2.1946350592638804,
"grad_norm": 0.10604896561401701,
"learning_rate": 2.6025588309215975e-06,
"loss": 0.0277,
"num_tokens": 140898099.0,
"step": 1760
},
{
"epoch": 2.195882719900187,
"grad_norm": 0.11100890813179569,
"learning_rate": 2.5979250380723287e-06,
"loss": 0.0281,
"num_tokens": 140977581.0,
"step": 1761
},
{
"epoch": 2.197130380536494,
"grad_norm": 0.12574865966231316,
"learning_rate": 2.5932965075621376e-06,
"loss": 0.0291,
"num_tokens": 141058023.0,
"step": 1762
},
{
"epoch": 2.198378041172801,
"grad_norm": 0.11094390713862573,
"learning_rate": 2.5886732477839514e-06,
"loss": 0.0278,
"num_tokens": 141137869.0,
"step": 1763
},
{
"epoch": 2.199625701809108,
"grad_norm": 0.10875999873552089,
"learning_rate": 2.584055267121137e-06,
"loss": 0.0264,
"num_tokens": 141216853.0,
"step": 1764
},
{
"epoch": 2.2008733624454146,
"grad_norm": 0.11359059997751579,
"learning_rate": 2.579442573947488e-06,
"loss": 0.0302,
"num_tokens": 141298589.0,
"step": 1765
},
{
"epoch": 2.202121023081722,
"grad_norm": 0.12697222632331867,
"learning_rate": 2.5748351766272127e-06,
"loss": 0.0289,
"num_tokens": 141378590.0,
"step": 1766
},
{
"epoch": 2.2033686837180286,
"grad_norm": 0.10545644958622358,
"learning_rate": 2.5702330835149137e-06,
"loss": 0.0263,
"num_tokens": 141457825.0,
"step": 1767
},
{
"epoch": 2.204616344354336,
"grad_norm": 0.11016392505811934,
"learning_rate": 2.5656363029555788e-06,
"loss": 0.0289,
"num_tokens": 141538133.0,
"step": 1768
},
{
"epoch": 2.2058640049906426,
"grad_norm": 0.10965895021224552,
"learning_rate": 2.561044843284558e-06,
"loss": 0.0281,
"num_tokens": 141617180.0,
"step": 1769
},
{
"epoch": 2.2071116656269494,
"grad_norm": 0.1133356513749835,
"learning_rate": 2.556458712827558e-06,
"loss": 0.0287,
"num_tokens": 141695468.0,
"step": 1770
},
{
"epoch": 2.2083593262632566,
"grad_norm": 0.11395886557137493,
"learning_rate": 2.551877919900619e-06,
"loss": 0.0282,
"num_tokens": 141775903.0,
"step": 1771
},
{
"epoch": 2.2096069868995634,
"grad_norm": 0.11822802385416521,
"learning_rate": 2.5473024728101004e-06,
"loss": 0.0278,
"num_tokens": 141856125.0,
"step": 1772
},
{
"epoch": 2.21085464753587,
"grad_norm": 0.11611618297885314,
"learning_rate": 2.5427323798526747e-06,
"loss": 0.0277,
"num_tokens": 141936379.0,
"step": 1773
},
{
"epoch": 2.2121023081721773,
"grad_norm": 0.11559137208466257,
"learning_rate": 2.538167649315298e-06,
"loss": 0.0287,
"num_tokens": 142018427.0,
"step": 1774
},
{
"epoch": 2.213349968808484,
"grad_norm": 0.11659924816734041,
"learning_rate": 2.5336082894752084e-06,
"loss": 0.0285,
"num_tokens": 142098768.0,
"step": 1775
},
{
"epoch": 2.214597629444791,
"grad_norm": 0.11963881975246271,
"learning_rate": 2.529054308599906e-06,
"loss": 0.0308,
"num_tokens": 142178577.0,
"step": 1776
},
{
"epoch": 2.215845290081098,
"grad_norm": 0.11090511560379994,
"learning_rate": 2.524505714947131e-06,
"loss": 0.0281,
"num_tokens": 142258299.0,
"step": 1777
},
{
"epoch": 2.217092950717405,
"grad_norm": 0.11296744141624795,
"learning_rate": 2.5199625167648576e-06,
"loss": 0.028,
"num_tokens": 142339109.0,
"step": 1778
},
{
"epoch": 2.2183406113537116,
"grad_norm": 0.1257848652662134,
"learning_rate": 2.515424722291282e-06,
"loss": 0.0268,
"num_tokens": 142421110.0,
"step": 1779
},
{
"epoch": 2.219588271990019,
"grad_norm": 0.10900698032162932,
"learning_rate": 2.5108923397547934e-06,
"loss": 0.027,
"num_tokens": 142501998.0,
"step": 1780
},
{
"epoch": 2.2208359326263256,
"grad_norm": 0.11575161471868187,
"learning_rate": 2.5063653773739705e-06,
"loss": 0.0278,
"num_tokens": 142581395.0,
"step": 1781
},
{
"epoch": 2.2220835932626324,
"grad_norm": 0.10090543842040466,
"learning_rate": 2.501843843357568e-06,
"loss": 0.0263,
"num_tokens": 142659673.0,
"step": 1782
},
{
"epoch": 2.2233312538989396,
"grad_norm": 0.1115252060659669,
"learning_rate": 2.4973277459044927e-06,
"loss": 0.0282,
"num_tokens": 142741046.0,
"step": 1783
},
{
"epoch": 2.2245789145352464,
"grad_norm": 0.1133673733213164,
"learning_rate": 2.4928170932037916e-06,
"loss": 0.0277,
"num_tokens": 142820299.0,
"step": 1784
},
{
"epoch": 2.225826575171553,
"grad_norm": 0.1127136967529174,
"learning_rate": 2.4883118934346446e-06,
"loss": 0.0273,
"num_tokens": 142900381.0,
"step": 1785
},
{
"epoch": 2.2270742358078603,
"grad_norm": 0.10491586464405452,
"learning_rate": 2.48381215476634e-06,
"loss": 0.0265,
"num_tokens": 142980799.0,
"step": 1786
},
{
"epoch": 2.228321896444167,
"grad_norm": 0.11469618848304418,
"learning_rate": 2.4793178853582624e-06,
"loss": 0.0273,
"num_tokens": 143061287.0,
"step": 1787
},
{
"epoch": 2.229569557080474,
"grad_norm": 0.1087127525924716,
"learning_rate": 2.474829093359881e-06,
"loss": 0.0275,
"num_tokens": 143141303.0,
"step": 1788
},
{
"epoch": 2.230817217716781,
"grad_norm": 0.10390821955698,
"learning_rate": 2.4703457869107346e-06,
"loss": 0.0272,
"num_tokens": 143221934.0,
"step": 1789
},
{
"epoch": 2.232064878353088,
"grad_norm": 0.11473477655662885,
"learning_rate": 2.4658679741404106e-06,
"loss": 0.0287,
"num_tokens": 143303459.0,
"step": 1790
},
{
"epoch": 2.233312538989395,
"grad_norm": 0.11105580740416413,
"learning_rate": 2.461395663168539e-06,
"loss": 0.0281,
"num_tokens": 143383014.0,
"step": 1791
},
{
"epoch": 2.234560199625702,
"grad_norm": 0.11200067622309487,
"learning_rate": 2.4569288621047704e-06,
"loss": 0.0284,
"num_tokens": 143462866.0,
"step": 1792
},
{
"epoch": 2.2358078602620086,
"grad_norm": 0.10838875549676315,
"learning_rate": 2.452467579048764e-06,
"loss": 0.0267,
"num_tokens": 143541856.0,
"step": 1793
},
{
"epoch": 2.237055520898316,
"grad_norm": 0.11004549583890312,
"learning_rate": 2.4480118220901764e-06,
"loss": 0.0268,
"num_tokens": 143621766.0,
"step": 1794
},
{
"epoch": 2.2383031815346226,
"grad_norm": 0.10774371386318345,
"learning_rate": 2.4435615993086414e-06,
"loss": 0.0281,
"num_tokens": 143700863.0,
"step": 1795
},
{
"epoch": 2.2395508421709294,
"grad_norm": 0.10262726213219055,
"learning_rate": 2.4391169187737555e-06,
"loss": 0.0264,
"num_tokens": 143780027.0,
"step": 1796
},
{
"epoch": 2.2407985028072366,
"grad_norm": 0.10594867992251468,
"learning_rate": 2.434677788545071e-06,
"loss": 0.0278,
"num_tokens": 143859671.0,
"step": 1797
},
{
"epoch": 2.2420461634435433,
"grad_norm": 0.10587436147761728,
"learning_rate": 2.4302442166720723e-06,
"loss": 0.0275,
"num_tokens": 143940423.0,
"step": 1798
},
{
"epoch": 2.24329382407985,
"grad_norm": 0.11934655967345954,
"learning_rate": 2.4258162111941634e-06,
"loss": 0.0276,
"num_tokens": 144021103.0,
"step": 1799
},
{
"epoch": 2.2445414847161573,
"grad_norm": 0.11108721213520488,
"learning_rate": 2.42139378014066e-06,
"loss": 0.028,
"num_tokens": 144101887.0,
"step": 1800
},
{
"epoch": 2.245789145352464,
"grad_norm": 0.11656736214212844,
"learning_rate": 2.416976931530764e-06,
"loss": 0.0287,
"num_tokens": 144182215.0,
"step": 1801
},
{
"epoch": 2.247036805988771,
"grad_norm": 0.11510680943141609,
"learning_rate": 2.4125656733735554e-06,
"loss": 0.0281,
"num_tokens": 144263091.0,
"step": 1802
},
{
"epoch": 2.248284466625078,
"grad_norm": 0.10345650824569999,
"learning_rate": 2.4081600136679805e-06,
"loss": 0.0276,
"num_tokens": 144342322.0,
"step": 1803
},
{
"epoch": 2.249532127261385,
"grad_norm": 0.11225842570308209,
"learning_rate": 2.403759960402834e-06,
"loss": 0.028,
"num_tokens": 144422662.0,
"step": 1804
},
{
"epoch": 2.2507797878976916,
"grad_norm": 0.14563905711788522,
"learning_rate": 2.39936552155674e-06,
"loss": 0.0276,
"num_tokens": 144502890.0,
"step": 1805
},
{
"epoch": 2.252027448533999,
"grad_norm": 0.11458859468044612,
"learning_rate": 2.394976705098143e-06,
"loss": 0.0277,
"num_tokens": 144583307.0,
"step": 1806
},
{
"epoch": 2.2532751091703056,
"grad_norm": 0.11282706392587076,
"learning_rate": 2.3905935189852967e-06,
"loss": 0.0286,
"num_tokens": 144664063.0,
"step": 1807
},
{
"epoch": 2.254522769806613,
"grad_norm": 0.10716579111382028,
"learning_rate": 2.386215971166242e-06,
"loss": 0.0273,
"num_tokens": 144744193.0,
"step": 1808
},
{
"epoch": 2.2557704304429196,
"grad_norm": 0.10816884263366056,
"learning_rate": 2.381844069578793e-06,
"loss": 0.0273,
"num_tokens": 144825420.0,
"step": 1809
},
{
"epoch": 2.2570180910792264,
"grad_norm": 0.11079261493952917,
"learning_rate": 2.3774778221505316e-06,
"loss": 0.0269,
"num_tokens": 144904705.0,
"step": 1810
},
{
"epoch": 2.2582657517155336,
"grad_norm": 0.12304447536252638,
"learning_rate": 2.3731172367987856e-06,
"loss": 0.0279,
"num_tokens": 144984409.0,
"step": 1811
},
{
"epoch": 2.2595134123518403,
"grad_norm": 0.12073914506900349,
"learning_rate": 2.3687623214306096e-06,
"loss": 0.0281,
"num_tokens": 145065436.0,
"step": 1812
},
{
"epoch": 2.260761072988147,
"grad_norm": 0.11025114886878515,
"learning_rate": 2.364413083942787e-06,
"loss": 0.0278,
"num_tokens": 145145726.0,
"step": 1813
},
{
"epoch": 2.2620087336244543,
"grad_norm": 0.10678041974245804,
"learning_rate": 2.3600695322217965e-06,
"loss": 0.0278,
"num_tokens": 145225716.0,
"step": 1814
},
{
"epoch": 2.263256394260761,
"grad_norm": 0.1094478383949689,
"learning_rate": 2.355731674143809e-06,
"loss": 0.0266,
"num_tokens": 145304672.0,
"step": 1815
},
{
"epoch": 2.264504054897068,
"grad_norm": 0.11066469315589064,
"learning_rate": 2.3513995175746757e-06,
"loss": 0.0276,
"num_tokens": 145383597.0,
"step": 1816
},
{
"epoch": 2.265751715533375,
"grad_norm": 0.11081072087024889,
"learning_rate": 2.3470730703699034e-06,
"loss": 0.0264,
"num_tokens": 145463533.0,
"step": 1817
},
{
"epoch": 2.266999376169682,
"grad_norm": 0.12335653809318686,
"learning_rate": 2.3427523403746496e-06,
"loss": 0.0286,
"num_tokens": 145543691.0,
"step": 1818
},
{
"epoch": 2.2682470368059886,
"grad_norm": 0.10000215802185779,
"learning_rate": 2.338437335423705e-06,
"loss": 0.0264,
"num_tokens": 145622759.0,
"step": 1819
},
{
"epoch": 2.269494697442296,
"grad_norm": 0.11700316815610115,
"learning_rate": 2.3341280633414763e-06,
"loss": 0.0286,
"num_tokens": 145703874.0,
"step": 1820
},
{
"epoch": 2.2707423580786026,
"grad_norm": 0.11974933764453737,
"learning_rate": 2.3298245319419755e-06,
"loss": 0.0281,
"num_tokens": 145784643.0,
"step": 1821
},
{
"epoch": 2.2719900187149094,
"grad_norm": 0.11752237392767895,
"learning_rate": 2.325526749028808e-06,
"loss": 0.0278,
"num_tokens": 145865466.0,
"step": 1822
},
{
"epoch": 2.2732376793512166,
"grad_norm": 0.12283401566553695,
"learning_rate": 2.321234722395152e-06,
"loss": 0.0282,
"num_tokens": 145944342.0,
"step": 1823
},
{
"epoch": 2.2744853399875233,
"grad_norm": 0.11195250793318184,
"learning_rate": 2.3169484598237484e-06,
"loss": 0.0276,
"num_tokens": 146023270.0,
"step": 1824
},
{
"epoch": 2.2757330006238305,
"grad_norm": 0.11013517177727546,
"learning_rate": 2.312667969086887e-06,
"loss": 0.0267,
"num_tokens": 146102006.0,
"step": 1825
},
{
"epoch": 2.2769806612601373,
"grad_norm": 0.11787583993905608,
"learning_rate": 2.308393257946393e-06,
"loss": 0.0274,
"num_tokens": 146181867.0,
"step": 1826
},
{
"epoch": 2.278228321896444,
"grad_norm": 0.11819294612785929,
"learning_rate": 2.304124334153608e-06,
"loss": 0.0269,
"num_tokens": 146261476.0,
"step": 1827
},
{
"epoch": 2.279475982532751,
"grad_norm": 0.11184828699005617,
"learning_rate": 2.2998612054493827e-06,
"loss": 0.0284,
"num_tokens": 146341515.0,
"step": 1828
},
{
"epoch": 2.280723643169058,
"grad_norm": 0.10777522190355769,
"learning_rate": 2.2956038795640573e-06,
"loss": 0.0274,
"num_tokens": 146420733.0,
"step": 1829
},
{
"epoch": 2.281971303805365,
"grad_norm": 0.10954932409855346,
"learning_rate": 2.291352364217449e-06,
"loss": 0.028,
"num_tokens": 146501895.0,
"step": 1830
},
{
"epoch": 2.283218964441672,
"grad_norm": 0.10884043485738872,
"learning_rate": 2.287106667118841e-06,
"loss": 0.0281,
"num_tokens": 146581796.0,
"step": 1831
},
{
"epoch": 2.284466625077979,
"grad_norm": 0.11517754082110432,
"learning_rate": 2.2828667959669674e-06,
"loss": 0.0286,
"num_tokens": 146663043.0,
"step": 1832
},
{
"epoch": 2.2857142857142856,
"grad_norm": 0.1259652230677902,
"learning_rate": 2.2786327584499944e-06,
"loss": 0.0291,
"num_tokens": 146744082.0,
"step": 1833
},
{
"epoch": 2.286961946350593,
"grad_norm": 0.11385926033954893,
"learning_rate": 2.2744045622455112e-06,
"loss": 0.0278,
"num_tokens": 146824514.0,
"step": 1834
},
{
"epoch": 2.2882096069868996,
"grad_norm": 0.10823713522415669,
"learning_rate": 2.270182215020517e-06,
"loss": 0.0275,
"num_tokens": 146905003.0,
"step": 1835
},
{
"epoch": 2.2894572676232063,
"grad_norm": 0.10607107777899051,
"learning_rate": 2.2659657244314017e-06,
"loss": 0.0274,
"num_tokens": 146984485.0,
"step": 1836
},
{
"epoch": 2.2907049282595136,
"grad_norm": 0.10959122120451005,
"learning_rate": 2.26175509812394e-06,
"loss": 0.0276,
"num_tokens": 147064157.0,
"step": 1837
},
{
"epoch": 2.2919525888958203,
"grad_norm": 0.10057700457691329,
"learning_rate": 2.2575503437332677e-06,
"loss": 0.0273,
"num_tokens": 147143970.0,
"step": 1838
},
{
"epoch": 2.293200249532127,
"grad_norm": 0.10943750560043476,
"learning_rate": 2.2533514688838755e-06,
"loss": 0.028,
"num_tokens": 147225213.0,
"step": 1839
},
{
"epoch": 2.2944479101684343,
"grad_norm": 0.1101560015216109,
"learning_rate": 2.2491584811895927e-06,
"loss": 0.0272,
"num_tokens": 147305029.0,
"step": 1840
},
{
"epoch": 2.295695570804741,
"grad_norm": 0.1108231639983885,
"learning_rate": 2.244971388253576e-06,
"loss": 0.0261,
"num_tokens": 147385812.0,
"step": 1841
},
{
"epoch": 2.2969432314410483,
"grad_norm": 0.11679304596070587,
"learning_rate": 2.2407901976682884e-06,
"loss": 0.0294,
"num_tokens": 147466071.0,
"step": 1842
},
{
"epoch": 2.298190892077355,
"grad_norm": 0.09947228828930545,
"learning_rate": 2.2366149170154907e-06,
"loss": 0.0262,
"num_tokens": 147544270.0,
"step": 1843
},
{
"epoch": 2.299438552713662,
"grad_norm": 0.11014547069179065,
"learning_rate": 2.232445553866231e-06,
"loss": 0.0278,
"num_tokens": 147624344.0,
"step": 1844
},
{
"epoch": 2.3006862133499686,
"grad_norm": 0.11618854645406114,
"learning_rate": 2.228282115780824e-06,
"loss": 0.0283,
"num_tokens": 147703871.0,
"step": 1845
},
{
"epoch": 2.301933873986276,
"grad_norm": 0.11530991453976354,
"learning_rate": 2.22412461030884e-06,
"loss": 0.0277,
"num_tokens": 147783896.0,
"step": 1846
},
{
"epoch": 2.3031815346225826,
"grad_norm": 0.10820957500421617,
"learning_rate": 2.2199730449890964e-06,
"loss": 0.027,
"num_tokens": 147864352.0,
"step": 1847
},
{
"epoch": 2.30442919525889,
"grad_norm": 0.11573884363992257,
"learning_rate": 2.215827427349635e-06,
"loss": 0.0282,
"num_tokens": 147944345.0,
"step": 1848
},
{
"epoch": 2.3056768558951966,
"grad_norm": 0.11777830036437165,
"learning_rate": 2.211687764907711e-06,
"loss": 0.0272,
"num_tokens": 148026015.0,
"step": 1849
},
{
"epoch": 2.3069245165315033,
"grad_norm": 0.11222239508770851,
"learning_rate": 2.2075540651697873e-06,
"loss": 0.0275,
"num_tokens": 148106133.0,
"step": 1850
},
{
"epoch": 2.3081721771678105,
"grad_norm": 0.10933938525610572,
"learning_rate": 2.2034263356315087e-06,
"loss": 0.0278,
"num_tokens": 148186803.0,
"step": 1851
},
{
"epoch": 2.3094198378041173,
"grad_norm": 0.11453260431350368,
"learning_rate": 2.1993045837776957e-06,
"loss": 0.0278,
"num_tokens": 148267327.0,
"step": 1852
},
{
"epoch": 2.310667498440424,
"grad_norm": 0.11836630922316033,
"learning_rate": 2.195188817082331e-06,
"loss": 0.0283,
"num_tokens": 148349060.0,
"step": 1853
},
{
"epoch": 2.3119151590767313,
"grad_norm": 0.12681294550992456,
"learning_rate": 2.1910790430085465e-06,
"loss": 0.0275,
"num_tokens": 148428476.0,
"step": 1854
},
{
"epoch": 2.313162819713038,
"grad_norm": 0.10886143919786201,
"learning_rate": 2.1869752690086e-06,
"loss": 0.0269,
"num_tokens": 148507956.0,
"step": 1855
},
{
"epoch": 2.314410480349345,
"grad_norm": 0.1123737906437985,
"learning_rate": 2.1828775025238787e-06,
"loss": 0.027,
"num_tokens": 148587206.0,
"step": 1856
},
{
"epoch": 2.315658140985652,
"grad_norm": 0.1087200879265718,
"learning_rate": 2.1787857509848693e-06,
"loss": 0.027,
"num_tokens": 148666530.0,
"step": 1857
},
{
"epoch": 2.316905801621959,
"grad_norm": 0.12845500495545087,
"learning_rate": 2.174700021811153e-06,
"loss": 0.0288,
"num_tokens": 148745506.0,
"step": 1858
},
{
"epoch": 2.3181534622582656,
"grad_norm": 0.10251335702347915,
"learning_rate": 2.1706203224113944e-06,
"loss": 0.0273,
"num_tokens": 148826801.0,
"step": 1859
},
{
"epoch": 2.319401122894573,
"grad_norm": 0.13207173451085918,
"learning_rate": 2.1665466601833197e-06,
"loss": 0.0282,
"num_tokens": 148907396.0,
"step": 1860
},
{
"epoch": 2.3206487835308796,
"grad_norm": 0.11045087789527284,
"learning_rate": 2.162479042513711e-06,
"loss": 0.0275,
"num_tokens": 148987223.0,
"step": 1861
},
{
"epoch": 2.3218964441671863,
"grad_norm": 0.10466374938904822,
"learning_rate": 2.158417476778388e-06,
"loss": 0.0276,
"num_tokens": 149068080.0,
"step": 1862
},
{
"epoch": 2.3231441048034935,
"grad_norm": 0.11264871081312319,
"learning_rate": 2.1543619703421975e-06,
"loss": 0.0278,
"num_tokens": 149147571.0,
"step": 1863
},
{
"epoch": 2.3243917654398003,
"grad_norm": 0.104606366045223,
"learning_rate": 2.1503125305589976e-06,
"loss": 0.0272,
"num_tokens": 149227191.0,
"step": 1864
},
{
"epoch": 2.3256394260761075,
"grad_norm": 0.12583404589107572,
"learning_rate": 2.146269164771648e-06,
"loss": 0.0292,
"num_tokens": 149308765.0,
"step": 1865
},
{
"epoch": 2.3268870867124143,
"grad_norm": 0.11179688039422762,
"learning_rate": 2.142231880311992e-06,
"loss": 0.0274,
"num_tokens": 149388926.0,
"step": 1866
},
{
"epoch": 2.328134747348721,
"grad_norm": 0.10361239593188457,
"learning_rate": 2.1382006845008456e-06,
"loss": 0.0267,
"num_tokens": 149468409.0,
"step": 1867
},
{
"epoch": 2.329382407985028,
"grad_norm": 0.10285293247205865,
"learning_rate": 2.1341755846479868e-06,
"loss": 0.0274,
"num_tokens": 149548190.0,
"step": 1868
},
{
"epoch": 2.330630068621335,
"grad_norm": 0.11770503261290141,
"learning_rate": 2.1301565880521387e-06,
"loss": 0.0273,
"num_tokens": 149628012.0,
"step": 1869
},
{
"epoch": 2.331877729257642,
"grad_norm": 0.1074927375632693,
"learning_rate": 2.1261437020009565e-06,
"loss": 0.0271,
"num_tokens": 149708217.0,
"step": 1870
},
{
"epoch": 2.333125389893949,
"grad_norm": 0.11030656423571202,
"learning_rate": 2.122136933771014e-06,
"loss": 0.0272,
"num_tokens": 149788673.0,
"step": 1871
},
{
"epoch": 2.334373050530256,
"grad_norm": 0.12162150889745205,
"learning_rate": 2.118136290627795e-06,
"loss": 0.0287,
"num_tokens": 149868514.0,
"step": 1872
},
{
"epoch": 2.3356207111665626,
"grad_norm": 0.126632958015695,
"learning_rate": 2.114141779825674e-06,
"loss": 0.0282,
"num_tokens": 149948614.0,
"step": 1873
},
{
"epoch": 2.3368683718028698,
"grad_norm": 0.10058350258071957,
"learning_rate": 2.110153408607904e-06,
"loss": 0.0262,
"num_tokens": 150028919.0,
"step": 1874
},
{
"epoch": 2.3381160324391765,
"grad_norm": 0.11778995124698656,
"learning_rate": 2.1061711842066124e-06,
"loss": 0.028,
"num_tokens": 150108918.0,
"step": 1875
},
{
"epoch": 2.3393636930754833,
"grad_norm": 0.11526712748316235,
"learning_rate": 2.1021951138427736e-06,
"loss": 0.0275,
"num_tokens": 150188111.0,
"step": 1876
},
{
"epoch": 2.3406113537117905,
"grad_norm": 0.10576774424191199,
"learning_rate": 2.0982252047262025e-06,
"loss": 0.0277,
"num_tokens": 150267512.0,
"step": 1877
},
{
"epoch": 2.3418590143480973,
"grad_norm": 0.11922101045159628,
"learning_rate": 2.094261464055548e-06,
"loss": 0.028,
"num_tokens": 150349541.0,
"step": 1878
},
{
"epoch": 2.343106674984404,
"grad_norm": 0.11472447341513481,
"learning_rate": 2.0903038990182684e-06,
"loss": 0.0281,
"num_tokens": 150429122.0,
"step": 1879
},
{
"epoch": 2.3443543356207113,
"grad_norm": 0.11865849401366725,
"learning_rate": 2.086352516790624e-06,
"loss": 0.0282,
"num_tokens": 150509545.0,
"step": 1880
},
{
"epoch": 2.345601996257018,
"grad_norm": 0.11861011227322517,
"learning_rate": 2.082407324537668e-06,
"loss": 0.028,
"num_tokens": 150589757.0,
"step": 1881
},
{
"epoch": 2.3468496568933253,
"grad_norm": 0.1151057044148094,
"learning_rate": 2.078468329413223e-06,
"loss": 0.0278,
"num_tokens": 150671338.0,
"step": 1882
},
{
"epoch": 2.348097317529632,
"grad_norm": 0.10255007625515707,
"learning_rate": 2.07453553855988e-06,
"loss": 0.0264,
"num_tokens": 150751508.0,
"step": 1883
},
{
"epoch": 2.349344978165939,
"grad_norm": 0.11660533332586445,
"learning_rate": 2.0706089591089785e-06,
"loss": 0.0282,
"num_tokens": 150832075.0,
"step": 1884
},
{
"epoch": 2.3505926388022456,
"grad_norm": 0.1155557180421997,
"learning_rate": 2.0666885981805916e-06,
"loss": 0.0273,
"num_tokens": 150912548.0,
"step": 1885
},
{
"epoch": 2.3518402994385528,
"grad_norm": 0.11188233909320953,
"learning_rate": 2.0627744628835196e-06,
"loss": 0.0278,
"num_tokens": 150991832.0,
"step": 1886
},
{
"epoch": 2.3530879600748595,
"grad_norm": 0.11350157709518613,
"learning_rate": 2.058866560315273e-06,
"loss": 0.0272,
"num_tokens": 151071514.0,
"step": 1887
},
{
"epoch": 2.3543356207111668,
"grad_norm": 0.11410767768820493,
"learning_rate": 2.054964897562061e-06,
"loss": 0.0276,
"num_tokens": 151150714.0,
"step": 1888
},
{
"epoch": 2.3555832813474735,
"grad_norm": 0.10872089005911116,
"learning_rate": 2.0510694816987724e-06,
"loss": 0.0279,
"num_tokens": 151231361.0,
"step": 1889
},
{
"epoch": 2.3568309419837803,
"grad_norm": 0.1196042703091023,
"learning_rate": 2.047180319788981e-06,
"loss": 0.0276,
"num_tokens": 151311684.0,
"step": 1890
},
{
"epoch": 2.3580786026200875,
"grad_norm": 0.11841309111635819,
"learning_rate": 2.0432974188849103e-06,
"loss": 0.0275,
"num_tokens": 151392783.0,
"step": 1891
},
{
"epoch": 2.3593262632563943,
"grad_norm": 0.11255039160218248,
"learning_rate": 2.0394207860274304e-06,
"loss": 0.0277,
"num_tokens": 151472580.0,
"step": 1892
},
{
"epoch": 2.360573923892701,
"grad_norm": 0.10746347740509427,
"learning_rate": 2.035550428246053e-06,
"loss": 0.0272,
"num_tokens": 151552266.0,
"step": 1893
},
{
"epoch": 2.3618215845290083,
"grad_norm": 0.11470087718795037,
"learning_rate": 2.0316863525589037e-06,
"loss": 0.0279,
"num_tokens": 151631911.0,
"step": 1894
},
{
"epoch": 2.363069245165315,
"grad_norm": 0.12367004565272316,
"learning_rate": 2.0278285659727187e-06,
"loss": 0.0273,
"num_tokens": 151710855.0,
"step": 1895
},
{
"epoch": 2.364316905801622,
"grad_norm": 0.12984514692522087,
"learning_rate": 2.023977075482833e-06,
"loss": 0.0307,
"num_tokens": 151793755.0,
"step": 1896
},
{
"epoch": 2.365564566437929,
"grad_norm": 0.12148820532988883,
"learning_rate": 2.0201318880731633e-06,
"loss": 0.0283,
"num_tokens": 151875625.0,
"step": 1897
},
{
"epoch": 2.3668122270742358,
"grad_norm": 0.11585673908956987,
"learning_rate": 2.0162930107161963e-06,
"loss": 0.0273,
"num_tokens": 151955662.0,
"step": 1898
},
{
"epoch": 2.3680598877105425,
"grad_norm": 0.12778131669298992,
"learning_rate": 2.012460450372976e-06,
"loss": 0.0296,
"num_tokens": 152037665.0,
"step": 1899
},
{
"epoch": 2.3693075483468498,
"grad_norm": 0.12449191375414341,
"learning_rate": 2.0086342139930932e-06,
"loss": 0.031,
"num_tokens": 152116842.0,
"step": 1900
},
{
"epoch": 2.3705552089831565,
"grad_norm": 0.12060369090810642,
"learning_rate": 2.004814308514671e-06,
"loss": 0.0287,
"num_tokens": 152196571.0,
"step": 1901
},
{
"epoch": 2.3718028696194633,
"grad_norm": 0.12138740721784497,
"learning_rate": 2.001000740864353e-06,
"loss": 0.0276,
"num_tokens": 152276228.0,
"step": 1902
},
{
"epoch": 2.3730505302557705,
"grad_norm": 0.10235549260740247,
"learning_rate": 1.9971935179572893e-06,
"loss": 0.0268,
"num_tokens": 152355915.0,
"step": 1903
},
{
"epoch": 2.3742981908920773,
"grad_norm": 0.1156977150303195,
"learning_rate": 1.993392646697127e-06,
"loss": 0.0276,
"num_tokens": 152435603.0,
"step": 1904
},
{
"epoch": 2.3755458515283845,
"grad_norm": 0.1134522871431333,
"learning_rate": 1.9895981339759927e-06,
"loss": 0.0276,
"num_tokens": 152516244.0,
"step": 1905
},
{
"epoch": 2.3767935121646913,
"grad_norm": 0.116802365651326,
"learning_rate": 1.985809986674487e-06,
"loss": 0.0281,
"num_tokens": 152595457.0,
"step": 1906
},
{
"epoch": 2.378041172800998,
"grad_norm": 0.1162946923631464,
"learning_rate": 1.982028211661665e-06,
"loss": 0.0275,
"num_tokens": 152675867.0,
"step": 1907
},
{
"epoch": 2.3792888334373052,
"grad_norm": 0.12149058207102886,
"learning_rate": 1.9782528157950266e-06,
"loss": 0.0285,
"num_tokens": 152756569.0,
"step": 1908
},
{
"epoch": 2.380536494073612,
"grad_norm": 0.1201975730835599,
"learning_rate": 1.974483805920508e-06,
"loss": 0.0285,
"num_tokens": 152836657.0,
"step": 1909
},
{
"epoch": 2.3817841547099188,
"grad_norm": 0.11308003280924744,
"learning_rate": 1.970721188872461e-06,
"loss": 0.0283,
"num_tokens": 152918160.0,
"step": 1910
},
{
"epoch": 2.383031815346226,
"grad_norm": 0.10654932419802848,
"learning_rate": 1.966964971473649e-06,
"loss": 0.0264,
"num_tokens": 153002339.0,
"step": 1911
},
{
"epoch": 2.3842794759825328,
"grad_norm": 0.10620407558672333,
"learning_rate": 1.9632151605352296e-06,
"loss": 0.0279,
"num_tokens": 153081791.0,
"step": 1912
},
{
"epoch": 2.3855271366188395,
"grad_norm": 0.11368468414357902,
"learning_rate": 1.9594717628567432e-06,
"loss": 0.0274,
"num_tokens": 153162006.0,
"step": 1913
},
{
"epoch": 2.3867747972551467,
"grad_norm": 0.11000047901719937,
"learning_rate": 1.9557347852261007e-06,
"loss": 0.0274,
"num_tokens": 153242632.0,
"step": 1914
},
{
"epoch": 2.3880224578914535,
"grad_norm": 0.1089512194404227,
"learning_rate": 1.9520042344195727e-06,
"loss": 0.0272,
"num_tokens": 153323418.0,
"step": 1915
},
{
"epoch": 2.3892701185277603,
"grad_norm": 0.10861808273136546,
"learning_rate": 1.9482801172017758e-06,
"loss": 0.0266,
"num_tokens": 153403974.0,
"step": 1916
},
{
"epoch": 2.3905177791640675,
"grad_norm": 0.11473505086721252,
"learning_rate": 1.9445624403256576e-06,
"loss": 0.0267,
"num_tokens": 153483141.0,
"step": 1917
},
{
"epoch": 2.3917654398003743,
"grad_norm": 0.10654799873398078,
"learning_rate": 1.940851210532493e-06,
"loss": 0.0279,
"num_tokens": 153563318.0,
"step": 1918
},
{
"epoch": 2.393013100436681,
"grad_norm": 0.10839257143529775,
"learning_rate": 1.937146434551863e-06,
"loss": 0.0269,
"num_tokens": 153643226.0,
"step": 1919
},
{
"epoch": 2.3942607610729882,
"grad_norm": 0.10860934191124491,
"learning_rate": 1.933448119101644e-06,
"loss": 0.0286,
"num_tokens": 153722527.0,
"step": 1920
},
{
"epoch": 2.395508421709295,
"grad_norm": 0.10443243666752638,
"learning_rate": 1.929756270888003e-06,
"loss": 0.0265,
"num_tokens": 153802225.0,
"step": 1921
},
{
"epoch": 2.3967560823456022,
"grad_norm": 0.10427755534598981,
"learning_rate": 1.9260708966053744e-06,
"loss": 0.0271,
"num_tokens": 153881820.0,
"step": 1922
},
{
"epoch": 2.398003742981909,
"grad_norm": 0.1177864618305411,
"learning_rate": 1.9223920029364555e-06,
"loss": 0.0278,
"num_tokens": 153961515.0,
"step": 1923
},
{
"epoch": 2.3992514036182158,
"grad_norm": 0.11765676375572733,
"learning_rate": 1.9187195965521934e-06,
"loss": 0.028,
"num_tokens": 154041531.0,
"step": 1924
},
{
"epoch": 2.4004990642545225,
"grad_norm": 0.10500942947449511,
"learning_rate": 1.9150536841117713e-06,
"loss": 0.027,
"num_tokens": 154121639.0,
"step": 1925
},
{
"epoch": 2.4017467248908297,
"grad_norm": 0.12216098631005727,
"learning_rate": 1.911394272262595e-06,
"loss": 0.0298,
"num_tokens": 154203019.0,
"step": 1926
},
{
"epoch": 2.4029943855271365,
"grad_norm": 0.10777756704694372,
"learning_rate": 1.907741367640286e-06,
"loss": 0.0273,
"num_tokens": 154283769.0,
"step": 1927
},
{
"epoch": 2.4042420461634437,
"grad_norm": 0.12546585298956595,
"learning_rate": 1.9040949768686646e-06,
"loss": 0.0282,
"num_tokens": 154364349.0,
"step": 1928
},
{
"epoch": 2.4054897067997505,
"grad_norm": 0.11144911675809406,
"learning_rate": 1.900455106559737e-06,
"loss": 0.0275,
"num_tokens": 154444248.0,
"step": 1929
},
{
"epoch": 2.4067373674360573,
"grad_norm": 0.10425862497450274,
"learning_rate": 1.8968217633136909e-06,
"loss": 0.0266,
"num_tokens": 154523672.0,
"step": 1930
},
{
"epoch": 2.4079850280723645,
"grad_norm": 0.10633358249196864,
"learning_rate": 1.893194953718875e-06,
"loss": 0.0267,
"num_tokens": 154603383.0,
"step": 1931
},
{
"epoch": 2.4092326887086712,
"grad_norm": 0.11025002525121674,
"learning_rate": 1.8895746843517892e-06,
"loss": 0.0273,
"num_tokens": 154683826.0,
"step": 1932
},
{
"epoch": 2.410480349344978,
"grad_norm": 0.12840305730755552,
"learning_rate": 1.8859609617770786e-06,
"loss": 0.0436,
"num_tokens": 154764256.0,
"step": 1933
},
{
"epoch": 2.4117280099812852,
"grad_norm": 0.10790452835307801,
"learning_rate": 1.8823537925475143e-06,
"loss": 0.0272,
"num_tokens": 154843472.0,
"step": 1934
},
{
"epoch": 2.412975670617592,
"grad_norm": 0.10970134188031404,
"learning_rate": 1.8787531832039846e-06,
"loss": 0.0278,
"num_tokens": 154923415.0,
"step": 1935
},
{
"epoch": 2.4142233312538988,
"grad_norm": 0.10844976031889222,
"learning_rate": 1.8751591402754802e-06,
"loss": 0.0271,
"num_tokens": 155001644.0,
"step": 1936
},
{
"epoch": 2.415470991890206,
"grad_norm": 0.10790804890822218,
"learning_rate": 1.8715716702790903e-06,
"loss": 0.0278,
"num_tokens": 155082689.0,
"step": 1937
},
{
"epoch": 2.4167186525265127,
"grad_norm": 0.10777373189548671,
"learning_rate": 1.8679907797199798e-06,
"loss": 0.0271,
"num_tokens": 155161299.0,
"step": 1938
},
{
"epoch": 2.41796631316282,
"grad_norm": 0.10873821982168631,
"learning_rate": 1.8644164750913868e-06,
"loss": 0.0274,
"num_tokens": 155240482.0,
"step": 1939
},
{
"epoch": 2.4192139737991267,
"grad_norm": 0.10820841964710465,
"learning_rate": 1.8608487628746072e-06,
"loss": 0.0272,
"num_tokens": 155320020.0,
"step": 1940
},
{
"epoch": 2.4204616344354335,
"grad_norm": 0.11112717770569587,
"learning_rate": 1.8572876495389808e-06,
"loss": 0.0272,
"num_tokens": 155400618.0,
"step": 1941
},
{
"epoch": 2.4217092950717403,
"grad_norm": 0.11217938609690513,
"learning_rate": 1.8537331415418802e-06,
"loss": 0.0274,
"num_tokens": 155480528.0,
"step": 1942
},
{
"epoch": 2.4229569557080475,
"grad_norm": 0.11097473906028339,
"learning_rate": 1.8501852453287056e-06,
"loss": 0.0272,
"num_tokens": 155562620.0,
"step": 1943
},
{
"epoch": 2.4242046163443542,
"grad_norm": 0.10818902006610549,
"learning_rate": 1.846643967332865e-06,
"loss": 0.0274,
"num_tokens": 155641589.0,
"step": 1944
},
{
"epoch": 2.4254522769806615,
"grad_norm": 0.11177617772827625,
"learning_rate": 1.8431093139757635e-06,
"loss": 0.028,
"num_tokens": 155723189.0,
"step": 1945
},
{
"epoch": 2.4266999376169682,
"grad_norm": 0.11904079761014202,
"learning_rate": 1.8395812916667974e-06,
"loss": 0.0276,
"num_tokens": 155802507.0,
"step": 1946
},
{
"epoch": 2.427947598253275,
"grad_norm": 0.10796895127972975,
"learning_rate": 1.836059906803339e-06,
"loss": 0.0275,
"num_tokens": 155882396.0,
"step": 1947
},
{
"epoch": 2.429195258889582,
"grad_norm": 0.11670361221308223,
"learning_rate": 1.832545165770721e-06,
"loss": 0.0283,
"num_tokens": 155962352.0,
"step": 1948
},
{
"epoch": 2.430442919525889,
"grad_norm": 0.11461518816560916,
"learning_rate": 1.8290370749422327e-06,
"loss": 0.0279,
"num_tokens": 156043890.0,
"step": 1949
},
{
"epoch": 2.4316905801621957,
"grad_norm": 0.10679786955474743,
"learning_rate": 1.8255356406791036e-06,
"loss": 0.0272,
"num_tokens": 156123445.0,
"step": 1950
},
{
"epoch": 2.432938240798503,
"grad_norm": 0.10688075962587382,
"learning_rate": 1.82204086933049e-06,
"loss": 0.0268,
"num_tokens": 156203255.0,
"step": 1951
},
{
"epoch": 2.4341859014348097,
"grad_norm": 0.11372182933826729,
"learning_rate": 1.8185527672334712e-06,
"loss": 0.0265,
"num_tokens": 156282984.0,
"step": 1952
},
{
"epoch": 2.4354335620711165,
"grad_norm": 0.11467488601047325,
"learning_rate": 1.8150713407130283e-06,
"loss": 0.0278,
"num_tokens": 156362864.0,
"step": 1953
},
{
"epoch": 2.4366812227074237,
"grad_norm": 0.11881203317305625,
"learning_rate": 1.8115965960820414e-06,
"loss": 0.0284,
"num_tokens": 156443925.0,
"step": 1954
},
{
"epoch": 2.4379288833437305,
"grad_norm": 0.11291508916496522,
"learning_rate": 1.8081285396412738e-06,
"loss": 0.0275,
"num_tokens": 156526214.0,
"step": 1955
},
{
"epoch": 2.4391765439800372,
"grad_norm": 0.1127357370742545,
"learning_rate": 1.8046671776793584e-06,
"loss": 0.028,
"num_tokens": 156606671.0,
"step": 1956
},
{
"epoch": 2.4404242046163445,
"grad_norm": 0.11312286561244898,
"learning_rate": 1.80121251647279e-06,
"loss": 0.0274,
"num_tokens": 156686884.0,
"step": 1957
},
{
"epoch": 2.4416718652526512,
"grad_norm": 0.11771902116431116,
"learning_rate": 1.7977645622859157e-06,
"loss": 0.0285,
"num_tokens": 156767153.0,
"step": 1958
},
{
"epoch": 2.442919525888958,
"grad_norm": 0.11286969218942611,
"learning_rate": 1.7943233213709173e-06,
"loss": 0.0276,
"num_tokens": 156848405.0,
"step": 1959
},
{
"epoch": 2.444167186525265,
"grad_norm": 0.10775615689943846,
"learning_rate": 1.7908887999678046e-06,
"loss": 0.0269,
"num_tokens": 156927640.0,
"step": 1960
},
{
"epoch": 2.445414847161572,
"grad_norm": 0.12023464215065506,
"learning_rate": 1.7874610043044027e-06,
"loss": 0.0288,
"num_tokens": 157009867.0,
"step": 1961
},
{
"epoch": 2.446662507797879,
"grad_norm": 0.1109760515355431,
"learning_rate": 1.7840399405963432e-06,
"loss": 0.027,
"num_tokens": 157090040.0,
"step": 1962
},
{
"epoch": 2.447910168434186,
"grad_norm": 0.11510300631971997,
"learning_rate": 1.7806256150470472e-06,
"loss": 0.0278,
"num_tokens": 157171270.0,
"step": 1963
},
{
"epoch": 2.4491578290704927,
"grad_norm": 0.113356826538854,
"learning_rate": 1.7772180338477173e-06,
"loss": 0.0269,
"num_tokens": 157250589.0,
"step": 1964
},
{
"epoch": 2.4504054897068,
"grad_norm": 0.11020161557209646,
"learning_rate": 1.7738172031773322e-06,
"loss": 0.0272,
"num_tokens": 157332005.0,
"step": 1965
},
{
"epoch": 2.4516531503431067,
"grad_norm": 0.10920573621039224,
"learning_rate": 1.7704231292026219e-06,
"loss": 0.0274,
"num_tokens": 157412201.0,
"step": 1966
},
{
"epoch": 2.4529008109794135,
"grad_norm": 0.11239699652286433,
"learning_rate": 1.76703581807807e-06,
"loss": 0.0272,
"num_tokens": 157493943.0,
"step": 1967
},
{
"epoch": 2.4541484716157207,
"grad_norm": 0.1082221654475512,
"learning_rate": 1.7636552759458963e-06,
"loss": 0.0269,
"num_tokens": 157572738.0,
"step": 1968
},
{
"epoch": 2.4553961322520275,
"grad_norm": 0.11287162279835761,
"learning_rate": 1.760281508936045e-06,
"loss": 0.0274,
"num_tokens": 157653339.0,
"step": 1969
},
{
"epoch": 2.4566437928883342,
"grad_norm": 0.11440457618886876,
"learning_rate": 1.7569145231661738e-06,
"loss": 0.028,
"num_tokens": 157734700.0,
"step": 1970
},
{
"epoch": 2.4578914535246414,
"grad_norm": 0.11118819360031555,
"learning_rate": 1.753554324741648e-06,
"loss": 0.0272,
"num_tokens": 157815324.0,
"step": 1971
},
{
"epoch": 2.459139114160948,
"grad_norm": 0.11370081079972753,
"learning_rate": 1.7502009197555215e-06,
"loss": 0.0287,
"num_tokens": 157896290.0,
"step": 1972
},
{
"epoch": 2.460386774797255,
"grad_norm": 0.13188166291275744,
"learning_rate": 1.7468543142885308e-06,
"loss": 0.0291,
"num_tokens": 157978183.0,
"step": 1973
},
{
"epoch": 2.461634435433562,
"grad_norm": 0.10901400798519152,
"learning_rate": 1.7435145144090852e-06,
"loss": 0.0281,
"num_tokens": 158056882.0,
"step": 1974
},
{
"epoch": 2.462882096069869,
"grad_norm": 0.10230118141977014,
"learning_rate": 1.740181526173248e-06,
"loss": 0.0271,
"num_tokens": 158136794.0,
"step": 1975
},
{
"epoch": 2.4641297567061757,
"grad_norm": 0.11403183233637615,
"learning_rate": 1.736855355624737e-06,
"loss": 0.0278,
"num_tokens": 158216836.0,
"step": 1976
},
{
"epoch": 2.465377417342483,
"grad_norm": 0.10903076514079771,
"learning_rate": 1.7335360087949048e-06,
"loss": 0.0275,
"num_tokens": 158297798.0,
"step": 1977
},
{
"epoch": 2.4666250779787897,
"grad_norm": 0.10466959609108883,
"learning_rate": 1.73022349170273e-06,
"loss": 0.0271,
"num_tokens": 158378165.0,
"step": 1978
},
{
"epoch": 2.467872738615097,
"grad_norm": 0.11147187512684335,
"learning_rate": 1.7269178103548057e-06,
"loss": 0.0271,
"num_tokens": 158459178.0,
"step": 1979
},
{
"epoch": 2.4691203992514037,
"grad_norm": 0.11409682525660257,
"learning_rate": 1.723618970745334e-06,
"loss": 0.0281,
"num_tokens": 158540418.0,
"step": 1980
},
{
"epoch": 2.4703680598877105,
"grad_norm": 0.10477788112973571,
"learning_rate": 1.7203269788561067e-06,
"loss": 0.027,
"num_tokens": 158619445.0,
"step": 1981
},
{
"epoch": 2.4716157205240172,
"grad_norm": 0.1074788211668467,
"learning_rate": 1.7170418406564982e-06,
"loss": 0.027,
"num_tokens": 158700296.0,
"step": 1982
},
{
"epoch": 2.4728633811603244,
"grad_norm": 0.11129094430099397,
"learning_rate": 1.7137635621034614e-06,
"loss": 0.0277,
"num_tokens": 158780253.0,
"step": 1983
},
{
"epoch": 2.474111041796631,
"grad_norm": 0.10164355680147183,
"learning_rate": 1.7104921491415038e-06,
"loss": 0.0268,
"num_tokens": 158860384.0,
"step": 1984
},
{
"epoch": 2.4753587024329384,
"grad_norm": 0.11686566136668367,
"learning_rate": 1.7072276077026856e-06,
"loss": 0.0283,
"num_tokens": 158939479.0,
"step": 1985
},
{
"epoch": 2.476606363069245,
"grad_norm": 0.11938137497936563,
"learning_rate": 1.7039699437066076e-06,
"loss": 0.0274,
"num_tokens": 159021019.0,
"step": 1986
},
{
"epoch": 2.477854023705552,
"grad_norm": 0.110769383334913,
"learning_rate": 1.7007191630604003e-06,
"loss": 0.0269,
"num_tokens": 159100911.0,
"step": 1987
},
{
"epoch": 2.479101684341859,
"grad_norm": 0.1066179656774663,
"learning_rate": 1.6974752716587092e-06,
"loss": 0.0276,
"num_tokens": 159180930.0,
"step": 1988
},
{
"epoch": 2.480349344978166,
"grad_norm": 0.12059952000176534,
"learning_rate": 1.6942382753836912e-06,
"loss": 0.0287,
"num_tokens": 159260601.0,
"step": 1989
},
{
"epoch": 2.4815970056144727,
"grad_norm": 0.11227342940000516,
"learning_rate": 1.691008180105e-06,
"loss": 0.0273,
"num_tokens": 159340555.0,
"step": 1990
},
{
"epoch": 2.48284466625078,
"grad_norm": 0.1098479649477286,
"learning_rate": 1.6877849916797728e-06,
"loss": 0.0288,
"num_tokens": 159420038.0,
"step": 1991
},
{
"epoch": 2.4840923268870867,
"grad_norm": 0.1111828471383589,
"learning_rate": 1.684568715952626e-06,
"loss": 0.0277,
"num_tokens": 159499863.0,
"step": 1992
},
{
"epoch": 2.4853399875233935,
"grad_norm": 0.11053778696407449,
"learning_rate": 1.6813593587556392e-06,
"loss": 0.0272,
"num_tokens": 159579187.0,
"step": 1993
},
{
"epoch": 2.4865876481597007,
"grad_norm": 0.11439406236806961,
"learning_rate": 1.6781569259083463e-06,
"loss": 0.0282,
"num_tokens": 159659711.0,
"step": 1994
},
{
"epoch": 2.4878353087960074,
"grad_norm": 0.10690423131408978,
"learning_rate": 1.6749614232177273e-06,
"loss": 0.0281,
"num_tokens": 159740822.0,
"step": 1995
},
{
"epoch": 2.489082969432314,
"grad_norm": 0.11272934482085559,
"learning_rate": 1.6717728564781927e-06,
"loss": 0.0277,
"num_tokens": 159820583.0,
"step": 1996
},
{
"epoch": 2.4903306300686214,
"grad_norm": 0.11712009141925543,
"learning_rate": 1.6685912314715797e-06,
"loss": 0.0276,
"num_tokens": 159901927.0,
"step": 1997
},
{
"epoch": 2.491578290704928,
"grad_norm": 0.11201642856684556,
"learning_rate": 1.6654165539671342e-06,
"loss": 0.0273,
"num_tokens": 159982551.0,
"step": 1998
},
{
"epoch": 2.492825951341235,
"grad_norm": 0.12217007620784424,
"learning_rate": 1.6622488297215079e-06,
"loss": 0.0281,
"num_tokens": 160063000.0,
"step": 1999
},
{
"epoch": 2.494073611977542,
"grad_norm": 0.10737268262091881,
"learning_rate": 1.6590880644787407e-06,
"loss": 0.0268,
"num_tokens": 160142294.0,
"step": 2000
},
{
"epoch": 2.495321272613849,
"grad_norm": 0.1250676932249431,
"learning_rate": 1.6559342639702563e-06,
"loss": 0.0306,
"num_tokens": 160220912.0,
"step": 2001
},
{
"epoch": 2.496568933250156,
"grad_norm": 0.11211010682205132,
"learning_rate": 1.6527874339148484e-06,
"loss": 0.0264,
"num_tokens": 160301952.0,
"step": 2002
},
{
"epoch": 2.497816593886463,
"grad_norm": 0.11786513203484543,
"learning_rate": 1.6496475800186702e-06,
"loss": 0.0281,
"num_tokens": 160381913.0,
"step": 2003
},
{
"epoch": 2.4990642545227697,
"grad_norm": 0.10893692645197386,
"learning_rate": 1.6465147079752264e-06,
"loss": 0.0275,
"num_tokens": 160462281.0,
"step": 2004
},
{
"epoch": 2.5003119151590765,
"grad_norm": 0.11526335706838876,
"learning_rate": 1.6433888234653614e-06,
"loss": 0.0278,
"num_tokens": 160542998.0,
"step": 2005
},
{
"epoch": 2.5015595757953837,
"grad_norm": 0.1094817219693971,
"learning_rate": 1.6402699321572485e-06,
"loss": 0.0267,
"num_tokens": 160621811.0,
"step": 2006
},
{
"epoch": 2.5028072364316905,
"grad_norm": 0.11249905162283616,
"learning_rate": 1.6371580397063788e-06,
"loss": 0.0277,
"num_tokens": 160702177.0,
"step": 2007
},
{
"epoch": 2.5040548970679977,
"grad_norm": 0.1218621001840924,
"learning_rate": 1.6340531517555563e-06,
"loss": 0.0294,
"num_tokens": 160783219.0,
"step": 2008
},
{
"epoch": 2.5053025577043044,
"grad_norm": 0.10986240798677817,
"learning_rate": 1.6309552739348804e-06,
"loss": 0.0268,
"num_tokens": 160862773.0,
"step": 2009
},
{
"epoch": 2.506550218340611,
"grad_norm": 0.1207523920480209,
"learning_rate": 1.6278644118617375e-06,
"loss": 0.0275,
"num_tokens": 160942474.0,
"step": 2010
},
{
"epoch": 2.5077978789769184,
"grad_norm": 0.11803187004719243,
"learning_rate": 1.6247805711407993e-06,
"loss": 0.0279,
"num_tokens": 161021964.0,
"step": 2011
},
{
"epoch": 2.509045539613225,
"grad_norm": 0.11522862481685167,
"learning_rate": 1.6217037573639983e-06,
"loss": 0.0278,
"num_tokens": 161101620.0,
"step": 2012
},
{
"epoch": 2.5102932002495324,
"grad_norm": 0.11566448225959479,
"learning_rate": 1.6186339761105275e-06,
"loss": 0.0288,
"num_tokens": 161180903.0,
"step": 2013
},
{
"epoch": 2.511540860885839,
"grad_norm": 0.10653159823487614,
"learning_rate": 1.6155712329468305e-06,
"loss": 0.0267,
"num_tokens": 161260133.0,
"step": 2014
},
{
"epoch": 2.512788521522146,
"grad_norm": 0.11974291187007381,
"learning_rate": 1.6125155334265846e-06,
"loss": 0.0289,
"num_tokens": 161340599.0,
"step": 2015
},
{
"epoch": 2.5140361821584527,
"grad_norm": 0.10950555678047849,
"learning_rate": 1.6094668830906959e-06,
"loss": 0.0281,
"num_tokens": 161421054.0,
"step": 2016
},
{
"epoch": 2.51528384279476,
"grad_norm": 0.11950555449558682,
"learning_rate": 1.6064252874672904e-06,
"loss": 0.028,
"num_tokens": 161501219.0,
"step": 2017
},
{
"epoch": 2.5165315034310667,
"grad_norm": 0.11235098261736175,
"learning_rate": 1.6033907520717008e-06,
"loss": 0.0274,
"num_tokens": 161580744.0,
"step": 2018
},
{
"epoch": 2.517779164067374,
"grad_norm": 0.1093365486795711,
"learning_rate": 1.6003632824064553e-06,
"loss": 0.0267,
"num_tokens": 161660539.0,
"step": 2019
},
{
"epoch": 2.5190268247036807,
"grad_norm": 0.11976522826322118,
"learning_rate": 1.5973428839612727e-06,
"loss": 0.028,
"num_tokens": 161741138.0,
"step": 2020
},
{
"epoch": 2.5202744853399874,
"grad_norm": 0.1156287357078057,
"learning_rate": 1.5943295622130483e-06,
"loss": 0.028,
"num_tokens": 161821837.0,
"step": 2021
},
{
"epoch": 2.521522145976294,
"grad_norm": 0.11441831410599412,
"learning_rate": 1.5913233226258437e-06,
"loss": 0.0271,
"num_tokens": 161900788.0,
"step": 2022
},
{
"epoch": 2.5227698066126014,
"grad_norm": 0.1112970332304599,
"learning_rate": 1.5883241706508823e-06,
"loss": 0.0274,
"num_tokens": 161980892.0,
"step": 2023
},
{
"epoch": 2.524017467248908,
"grad_norm": 0.12122029274976619,
"learning_rate": 1.5853321117265317e-06,
"loss": 0.0278,
"num_tokens": 162060107.0,
"step": 2024
},
{
"epoch": 2.5252651278852154,
"grad_norm": 0.1129961042647627,
"learning_rate": 1.5823471512782983e-06,
"loss": 0.0276,
"num_tokens": 162140320.0,
"step": 2025
},
{
"epoch": 2.526512788521522,
"grad_norm": 0.10694528173242357,
"learning_rate": 1.579369294718819e-06,
"loss": 0.0273,
"num_tokens": 162220263.0,
"step": 2026
},
{
"epoch": 2.527760449157829,
"grad_norm": 0.10649726142962046,
"learning_rate": 1.5763985474478483e-06,
"loss": 0.0265,
"num_tokens": 162301940.0,
"step": 2027
},
{
"epoch": 2.529008109794136,
"grad_norm": 0.11052731141682264,
"learning_rate": 1.5734349148522471e-06,
"loss": 0.0266,
"num_tokens": 162381737.0,
"step": 2028
},
{
"epoch": 2.530255770430443,
"grad_norm": 0.11827410827331811,
"learning_rate": 1.5704784023059788e-06,
"loss": 0.0283,
"num_tokens": 162461425.0,
"step": 2029
},
{
"epoch": 2.5315034310667497,
"grad_norm": 0.11322497252667106,
"learning_rate": 1.5675290151700937e-06,
"loss": 0.0272,
"num_tokens": 162541913.0,
"step": 2030
},
{
"epoch": 2.532751091703057,
"grad_norm": 0.1169346308768019,
"learning_rate": 1.5645867587927208e-06,
"loss": 0.0276,
"num_tokens": 162623234.0,
"step": 2031
},
{
"epoch": 2.5339987523393637,
"grad_norm": 0.12004814618546314,
"learning_rate": 1.561651638509062e-06,
"loss": 0.0274,
"num_tokens": 162702087.0,
"step": 2032
},
{
"epoch": 2.5352464129756704,
"grad_norm": 0.10868378154550547,
"learning_rate": 1.5587236596413773e-06,
"loss": 0.0276,
"num_tokens": 162782960.0,
"step": 2033
},
{
"epoch": 2.5364940736119777,
"grad_norm": 0.11068877689549345,
"learning_rate": 1.5558028274989778e-06,
"loss": 0.028,
"num_tokens": 162862837.0,
"step": 2034
},
{
"epoch": 2.5377417342482844,
"grad_norm": 0.1438726002320652,
"learning_rate": 1.5528891473782126e-06,
"loss": 0.027,
"num_tokens": 162941638.0,
"step": 2035
},
{
"epoch": 2.5389893948845916,
"grad_norm": 0.11403604077645453,
"learning_rate": 1.5499826245624674e-06,
"loss": 0.0285,
"num_tokens": 163021811.0,
"step": 2036
},
{
"epoch": 2.5402370555208984,
"grad_norm": 0.11230917197373902,
"learning_rate": 1.547083264322145e-06,
"loss": 0.0278,
"num_tokens": 163101898.0,
"step": 2037
},
{
"epoch": 2.541484716157205,
"grad_norm": 0.11526915003925865,
"learning_rate": 1.5441910719146616e-06,
"loss": 0.0273,
"num_tokens": 163182128.0,
"step": 2038
},
{
"epoch": 2.542732376793512,
"grad_norm": 0.11514119570048018,
"learning_rate": 1.541306052584437e-06,
"loss": 0.0277,
"num_tokens": 163263096.0,
"step": 2039
},
{
"epoch": 2.543980037429819,
"grad_norm": 0.11954122082354755,
"learning_rate": 1.5384282115628834e-06,
"loss": 0.0275,
"num_tokens": 163343548.0,
"step": 2040
},
{
"epoch": 2.545227698066126,
"grad_norm": 0.11066461352482032,
"learning_rate": 1.5355575540683953e-06,
"loss": 0.0278,
"num_tokens": 163423064.0,
"step": 2041
},
{
"epoch": 2.546475358702433,
"grad_norm": 0.11137639959635122,
"learning_rate": 1.5326940853063443e-06,
"loss": 0.0282,
"num_tokens": 163502397.0,
"step": 2042
},
{
"epoch": 2.54772301933874,
"grad_norm": 0.11324148029107538,
"learning_rate": 1.5298378104690636e-06,
"loss": 0.027,
"num_tokens": 163581986.0,
"step": 2043
},
{
"epoch": 2.5489706799750467,
"grad_norm": 0.12329020795643553,
"learning_rate": 1.5269887347358414e-06,
"loss": 0.0279,
"num_tokens": 163661772.0,
"step": 2044
},
{
"epoch": 2.5502183406113534,
"grad_norm": 0.11352719316396652,
"learning_rate": 1.5241468632729161e-06,
"loss": 0.028,
"num_tokens": 163742085.0,
"step": 2045
},
{
"epoch": 2.5514660012476607,
"grad_norm": 0.11431548220101798,
"learning_rate": 1.5213122012334572e-06,
"loss": 0.0277,
"num_tokens": 163823576.0,
"step": 2046
},
{
"epoch": 2.5527136618839674,
"grad_norm": 0.1151041171348859,
"learning_rate": 1.5184847537575647e-06,
"loss": 0.0273,
"num_tokens": 163903399.0,
"step": 2047
},
{
"epoch": 2.5539613225202746,
"grad_norm": 0.11215314990558678,
"learning_rate": 1.5156645259722565e-06,
"loss": 0.0276,
"num_tokens": 163984511.0,
"step": 2048
},
{
"epoch": 2.5552089831565814,
"grad_norm": 0.11132081944460806,
"learning_rate": 1.5128515229914568e-06,
"loss": 0.0288,
"num_tokens": 164064533.0,
"step": 2049
},
{
"epoch": 2.556456643792888,
"grad_norm": 0.11240110122489118,
"learning_rate": 1.5100457499159897e-06,
"loss": 0.0276,
"num_tokens": 164145208.0,
"step": 2050
},
{
"epoch": 2.5577043044291954,
"grad_norm": 0.12087806227209612,
"learning_rate": 1.507247211833572e-06,
"loss": 0.0276,
"num_tokens": 164226437.0,
"step": 2051
},
{
"epoch": 2.558951965065502,
"grad_norm": 0.11949968556805396,
"learning_rate": 1.5044559138187967e-06,
"loss": 0.0273,
"num_tokens": 164306701.0,
"step": 2052
},
{
"epoch": 2.5601996257018094,
"grad_norm": 0.10637452873416925,
"learning_rate": 1.5016718609331315e-06,
"loss": 0.0269,
"num_tokens": 164386138.0,
"step": 2053
},
{
"epoch": 2.561447286338116,
"grad_norm": 0.10741900874283838,
"learning_rate": 1.4988950582249061e-06,
"loss": 0.0262,
"num_tokens": 164466296.0,
"step": 2054
},
{
"epoch": 2.562694946974423,
"grad_norm": 0.10847356735320615,
"learning_rate": 1.4961255107293044e-06,
"loss": 0.0273,
"num_tokens": 164546034.0,
"step": 2055
},
{
"epoch": 2.5639426076107297,
"grad_norm": 0.10867775728226359,
"learning_rate": 1.4933632234683506e-06,
"loss": 0.0275,
"num_tokens": 164627152.0,
"step": 2056
},
{
"epoch": 2.565190268247037,
"grad_norm": 0.10863086055220095,
"learning_rate": 1.4906082014509088e-06,
"loss": 0.0275,
"num_tokens": 164706832.0,
"step": 2057
},
{
"epoch": 2.5664379288833437,
"grad_norm": 0.11004843698519694,
"learning_rate": 1.4878604496726653e-06,
"loss": 0.0281,
"num_tokens": 164786256.0,
"step": 2058
},
{
"epoch": 2.567685589519651,
"grad_norm": 0.11568288570947931,
"learning_rate": 1.4851199731161243e-06,
"loss": 0.027,
"num_tokens": 164865074.0,
"step": 2059
},
{
"epoch": 2.5689332501559576,
"grad_norm": 0.10186204742079674,
"learning_rate": 1.4823867767505981e-06,
"loss": 0.0265,
"num_tokens": 164943814.0,
"step": 2060
},
{
"epoch": 2.5701809107922644,
"grad_norm": 0.10521205156822269,
"learning_rate": 1.4796608655322001e-06,
"loss": 0.0265,
"num_tokens": 165022913.0,
"step": 2061
},
{
"epoch": 2.571428571428571,
"grad_norm": 0.11589068913118131,
"learning_rate": 1.476942244403829e-06,
"loss": 0.0289,
"num_tokens": 165102030.0,
"step": 2062
},
{
"epoch": 2.5726762320648784,
"grad_norm": 0.11233950248375833,
"learning_rate": 1.4742309182951663e-06,
"loss": 0.0273,
"num_tokens": 165182179.0,
"step": 2063
},
{
"epoch": 2.573923892701185,
"grad_norm": 0.11565035317539887,
"learning_rate": 1.4715268921226677e-06,
"loss": 0.0279,
"num_tokens": 165262779.0,
"step": 2064
},
{
"epoch": 2.5751715533374924,
"grad_norm": 0.11808315287951174,
"learning_rate": 1.468830170789548e-06,
"loss": 0.0278,
"num_tokens": 165344308.0,
"step": 2065
},
{
"epoch": 2.576419213973799,
"grad_norm": 0.11443778145606423,
"learning_rate": 1.4661407591857795e-06,
"loss": 0.0276,
"num_tokens": 165423934.0,
"step": 2066
},
{
"epoch": 2.577666874610106,
"grad_norm": 0.11190198300939089,
"learning_rate": 1.4634586621880786e-06,
"loss": 0.0273,
"num_tokens": 165503345.0,
"step": 2067
},
{
"epoch": 2.578914535246413,
"grad_norm": 0.11593353577598203,
"learning_rate": 1.4607838846598959e-06,
"loss": 0.0263,
"num_tokens": 165583590.0,
"step": 2068
},
{
"epoch": 2.58016219588272,
"grad_norm": 0.11449567615373328,
"learning_rate": 1.4581164314514127e-06,
"loss": 0.0279,
"num_tokens": 165663697.0,
"step": 2069
},
{
"epoch": 2.581409856519027,
"grad_norm": 0.10372245010081463,
"learning_rate": 1.4554563073995284e-06,
"loss": 0.027,
"num_tokens": 165744637.0,
"step": 2070
},
{
"epoch": 2.582657517155334,
"grad_norm": 0.10739901271925287,
"learning_rate": 1.452803517327852e-06,
"loss": 0.0269,
"num_tokens": 165825845.0,
"step": 2071
},
{
"epoch": 2.5839051777916406,
"grad_norm": 0.10419262399020085,
"learning_rate": 1.450158066046692e-06,
"loss": 0.0267,
"num_tokens": 165906115.0,
"step": 2072
},
{
"epoch": 2.5851528384279474,
"grad_norm": 0.10832482949706342,
"learning_rate": 1.4475199583530536e-06,
"loss": 0.0272,
"num_tokens": 165986633.0,
"step": 2073
},
{
"epoch": 2.5864004990642546,
"grad_norm": 0.11424246115691489,
"learning_rate": 1.444889199030622e-06,
"loss": 0.0284,
"num_tokens": 166066387.0,
"step": 2074
},
{
"epoch": 2.5876481597005614,
"grad_norm": 0.11100527216764833,
"learning_rate": 1.4422657928497572e-06,
"loss": 0.0265,
"num_tokens": 166148319.0,
"step": 2075
},
{
"epoch": 2.5888958203368686,
"grad_norm": 0.12099284205512863,
"learning_rate": 1.4396497445674917e-06,
"loss": 0.0296,
"num_tokens": 166228531.0,
"step": 2076
},
{
"epoch": 2.5901434809731754,
"grad_norm": 0.10891165530512759,
"learning_rate": 1.4370410589275096e-06,
"loss": 0.0279,
"num_tokens": 166307862.0,
"step": 2077
},
{
"epoch": 2.591391141609482,
"grad_norm": 0.12679386624360375,
"learning_rate": 1.4344397406601454e-06,
"loss": 0.0293,
"num_tokens": 166388213.0,
"step": 2078
},
{
"epoch": 2.592638802245789,
"grad_norm": 0.11178974797840838,
"learning_rate": 1.4318457944823775e-06,
"loss": 0.028,
"num_tokens": 166468608.0,
"step": 2079
},
{
"epoch": 2.593886462882096,
"grad_norm": 0.10698791554236746,
"learning_rate": 1.4292592250978137e-06,
"loss": 0.0268,
"num_tokens": 166548055.0,
"step": 2080
},
{
"epoch": 2.595134123518403,
"grad_norm": 0.10402086737935964,
"learning_rate": 1.4266800371966844e-06,
"loss": 0.0265,
"num_tokens": 166627914.0,
"step": 2081
},
{
"epoch": 2.59638178415471,
"grad_norm": 0.11256053343287431,
"learning_rate": 1.424108235455838e-06,
"loss": 0.0283,
"num_tokens": 166708046.0,
"step": 2082
},
{
"epoch": 2.597629444791017,
"grad_norm": 0.11648920137262059,
"learning_rate": 1.4215438245387303e-06,
"loss": 0.0278,
"num_tokens": 166788951.0,
"step": 2083
},
{
"epoch": 2.5988771054273236,
"grad_norm": 0.11412389464634487,
"learning_rate": 1.41898680909541e-06,
"loss": 0.0279,
"num_tokens": 166867908.0,
"step": 2084
},
{
"epoch": 2.600124766063631,
"grad_norm": 0.14616857946554868,
"learning_rate": 1.4164371937625222e-06,
"loss": 0.0274,
"num_tokens": 166947938.0,
"step": 2085
},
{
"epoch": 2.6013724266999376,
"grad_norm": 0.10641921005233325,
"learning_rate": 1.4138949831632879e-06,
"loss": 0.027,
"num_tokens": 167028248.0,
"step": 2086
},
{
"epoch": 2.6026200873362444,
"grad_norm": 0.11340157643915183,
"learning_rate": 1.4113601819075037e-06,
"loss": 0.0275,
"num_tokens": 167107712.0,
"step": 2087
},
{
"epoch": 2.6038677479725516,
"grad_norm": 0.11403401971381442,
"learning_rate": 1.4088327945915315e-06,
"loss": 0.0268,
"num_tokens": 167188583.0,
"step": 2088
},
{
"epoch": 2.6051154086088584,
"grad_norm": 0.10625095668404386,
"learning_rate": 1.4063128257982867e-06,
"loss": 0.0267,
"num_tokens": 167268518.0,
"step": 2089
},
{
"epoch": 2.606363069245165,
"grad_norm": 0.1171168338572617,
"learning_rate": 1.4038002800972362e-06,
"loss": 0.0275,
"num_tokens": 167348636.0,
"step": 2090
},
{
"epoch": 2.6076107298814724,
"grad_norm": 0.11542335973973553,
"learning_rate": 1.401295162044383e-06,
"loss": 0.0274,
"num_tokens": 167429322.0,
"step": 2091
},
{
"epoch": 2.608858390517779,
"grad_norm": 0.11727306920785573,
"learning_rate": 1.3987974761822656e-06,
"loss": 0.0275,
"num_tokens": 167510283.0,
"step": 2092
},
{
"epoch": 2.6101060511540863,
"grad_norm": 0.11766850450132074,
"learning_rate": 1.3963072270399411e-06,
"loss": 0.0273,
"num_tokens": 167590184.0,
"step": 2093
},
{
"epoch": 2.611353711790393,
"grad_norm": 0.11918576619487399,
"learning_rate": 1.393824419132986e-06,
"loss": 0.0279,
"num_tokens": 167669021.0,
"step": 2094
},
{
"epoch": 2.6126013724267,
"grad_norm": 0.11307732162265724,
"learning_rate": 1.3913490569634796e-06,
"loss": 0.0277,
"num_tokens": 167748127.0,
"step": 2095
},
{
"epoch": 2.6138490330630066,
"grad_norm": 0.12446227031099523,
"learning_rate": 1.388881145020002e-06,
"loss": 0.0297,
"num_tokens": 167828350.0,
"step": 2096
},
{
"epoch": 2.615096693699314,
"grad_norm": 0.11026469374300878,
"learning_rate": 1.3864206877776245e-06,
"loss": 0.0273,
"num_tokens": 167908530.0,
"step": 2097
},
{
"epoch": 2.6163443543356206,
"grad_norm": 0.11295221539796105,
"learning_rate": 1.3839676896978997e-06,
"loss": 0.0275,
"num_tokens": 167989313.0,
"step": 2098
},
{
"epoch": 2.617592014971928,
"grad_norm": 0.11298609559496968,
"learning_rate": 1.3815221552288541e-06,
"loss": 0.0276,
"num_tokens": 168069559.0,
"step": 2099
},
{
"epoch": 2.6188396756082346,
"grad_norm": 0.1141025393450742,
"learning_rate": 1.3790840888049802e-06,
"loss": 0.0272,
"num_tokens": 168148187.0,
"step": 2100
},
{
"epoch": 2.6200873362445414,
"grad_norm": 0.11929918883054025,
"learning_rate": 1.3766534948472307e-06,
"loss": 0.0271,
"num_tokens": 168228189.0,
"step": 2101
},
{
"epoch": 2.621334996880848,
"grad_norm": 0.11988289797660306,
"learning_rate": 1.3742303777630057e-06,
"loss": 0.0276,
"num_tokens": 168308932.0,
"step": 2102
},
{
"epoch": 2.6225826575171554,
"grad_norm": 0.11436560741581663,
"learning_rate": 1.3718147419461497e-06,
"loss": 0.0282,
"num_tokens": 168388848.0,
"step": 2103
},
{
"epoch": 2.623830318153462,
"grad_norm": 0.1056496402693744,
"learning_rate": 1.3694065917769414e-06,
"loss": 0.027,
"num_tokens": 168467832.0,
"step": 2104
},
{
"epoch": 2.6250779787897693,
"grad_norm": 0.1104079059611424,
"learning_rate": 1.367005931622084e-06,
"loss": 0.0282,
"num_tokens": 168547708.0,
"step": 2105
},
{
"epoch": 2.626325639426076,
"grad_norm": 0.10429616151130164,
"learning_rate": 1.3646127658346992e-06,
"loss": 0.0267,
"num_tokens": 168627496.0,
"step": 2106
},
{
"epoch": 2.627573300062383,
"grad_norm": 0.11808419996882835,
"learning_rate": 1.3622270987543215e-06,
"loss": 0.0275,
"num_tokens": 168706999.0,
"step": 2107
},
{
"epoch": 2.62882096069869,
"grad_norm": 0.11162808996935646,
"learning_rate": 1.3598489347068858e-06,
"loss": 0.0265,
"num_tokens": 168787359.0,
"step": 2108
},
{
"epoch": 2.630068621334997,
"grad_norm": 0.11556801052351962,
"learning_rate": 1.357478278004721e-06,
"loss": 0.027,
"num_tokens": 168868083.0,
"step": 2109
},
{
"epoch": 2.631316281971304,
"grad_norm": 0.11227131668663762,
"learning_rate": 1.3551151329465462e-06,
"loss": 0.0272,
"num_tokens": 168946701.0,
"step": 2110
},
{
"epoch": 2.632563942607611,
"grad_norm": 0.1145171495962,
"learning_rate": 1.3527595038174566e-06,
"loss": 0.0269,
"num_tokens": 169026964.0,
"step": 2111
},
{
"epoch": 2.6338116032439176,
"grad_norm": 0.11258767236811885,
"learning_rate": 1.35041139488892e-06,
"loss": 0.0269,
"num_tokens": 169106455.0,
"step": 2112
},
{
"epoch": 2.6350592638802244,
"grad_norm": 0.1269169979219692,
"learning_rate": 1.3480708104187685e-06,
"loss": 0.0278,
"num_tokens": 169187782.0,
"step": 2113
},
{
"epoch": 2.6363069245165316,
"grad_norm": 0.10759670867627756,
"learning_rate": 1.3457377546511882e-06,
"loss": 0.0274,
"num_tokens": 169268014.0,
"step": 2114
},
{
"epoch": 2.6375545851528384,
"grad_norm": 0.10677322152837186,
"learning_rate": 1.3434122318167142e-06,
"loss": 0.0267,
"num_tokens": 169347680.0,
"step": 2115
},
{
"epoch": 2.6388022457891456,
"grad_norm": 0.1116462775595202,
"learning_rate": 1.3410942461322236e-06,
"loss": 0.0273,
"num_tokens": 169428596.0,
"step": 2116
},
{
"epoch": 2.6400499064254523,
"grad_norm": 0.11029769450411615,
"learning_rate": 1.3387838018009239e-06,
"loss": 0.0267,
"num_tokens": 169509363.0,
"step": 2117
},
{
"epoch": 2.641297567061759,
"grad_norm": 0.10789263559722907,
"learning_rate": 1.3364809030123477e-06,
"loss": 0.0265,
"num_tokens": 169589358.0,
"step": 2118
},
{
"epoch": 2.642545227698066,
"grad_norm": 0.11544419735850721,
"learning_rate": 1.3341855539423499e-06,
"loss": 0.0272,
"num_tokens": 169669893.0,
"step": 2119
},
{
"epoch": 2.643792888334373,
"grad_norm": 0.13240480297882418,
"learning_rate": 1.3318977587530907e-06,
"loss": 0.0441,
"num_tokens": 169750499.0,
"step": 2120
},
{
"epoch": 2.64504054897068,
"grad_norm": 0.10788331716169906,
"learning_rate": 1.3296175215930326e-06,
"loss": 0.0266,
"num_tokens": 169829770.0,
"step": 2121
},
{
"epoch": 2.646288209606987,
"grad_norm": 0.10324597149645515,
"learning_rate": 1.3273448465969376e-06,
"loss": 0.0273,
"num_tokens": 169909873.0,
"step": 2122
},
{
"epoch": 2.647535870243294,
"grad_norm": 0.1083672497231884,
"learning_rate": 1.3250797378858507e-06,
"loss": 0.0273,
"num_tokens": 169990577.0,
"step": 2123
},
{
"epoch": 2.6487835308796006,
"grad_norm": 0.10796760309733955,
"learning_rate": 1.3228221995670987e-06,
"loss": 0.0272,
"num_tokens": 170069752.0,
"step": 2124
},
{
"epoch": 2.650031191515908,
"grad_norm": 0.11470562508991235,
"learning_rate": 1.3205722357342807e-06,
"loss": 0.0281,
"num_tokens": 170150197.0,
"step": 2125
},
{
"epoch": 2.6512788521522146,
"grad_norm": 0.10707306739767271,
"learning_rate": 1.3183298504672626e-06,
"loss": 0.0276,
"num_tokens": 170229547.0,
"step": 2126
},
{
"epoch": 2.6525265127885214,
"grad_norm": 0.10919949194770662,
"learning_rate": 1.316095047832166e-06,
"loss": 0.0274,
"num_tokens": 170309378.0,
"step": 2127
},
{
"epoch": 2.6537741734248286,
"grad_norm": 0.11957546966930166,
"learning_rate": 1.3138678318813618e-06,
"loss": 0.0285,
"num_tokens": 170389032.0,
"step": 2128
},
{
"epoch": 2.6550218340611353,
"grad_norm": 0.11617481255786126,
"learning_rate": 1.3116482066534686e-06,
"loss": 0.027,
"num_tokens": 170468247.0,
"step": 2129
},
{
"epoch": 2.656269494697442,
"grad_norm": 0.11357476091195207,
"learning_rate": 1.3094361761733356e-06,
"loss": 0.0275,
"num_tokens": 170548512.0,
"step": 2130
},
{
"epoch": 2.6575171553337493,
"grad_norm": 0.10925157715831366,
"learning_rate": 1.3072317444520449e-06,
"loss": 0.027,
"num_tokens": 170630459.0,
"step": 2131
},
{
"epoch": 2.658764815970056,
"grad_norm": 0.1169285409735411,
"learning_rate": 1.3050349154868946e-06,
"loss": 0.037,
"num_tokens": 170711970.0,
"step": 2132
},
{
"epoch": 2.6600124766063633,
"grad_norm": 0.11614548941267049,
"learning_rate": 1.3028456932614019e-06,
"loss": 0.027,
"num_tokens": 170791598.0,
"step": 2133
},
{
"epoch": 2.66126013724267,
"grad_norm": 0.12300026143621852,
"learning_rate": 1.3006640817452873e-06,
"loss": 0.0278,
"num_tokens": 170870743.0,
"step": 2134
},
{
"epoch": 2.662507797878977,
"grad_norm": 0.10949480726257627,
"learning_rate": 1.2984900848944727e-06,
"loss": 0.0268,
"num_tokens": 170950664.0,
"step": 2135
},
{
"epoch": 2.6637554585152836,
"grad_norm": 0.11076298472497624,
"learning_rate": 1.2963237066510715e-06,
"loss": 0.0279,
"num_tokens": 171030340.0,
"step": 2136
},
{
"epoch": 2.665003119151591,
"grad_norm": 0.11388272829849243,
"learning_rate": 1.2941649509433808e-06,
"loss": 0.0269,
"num_tokens": 171109325.0,
"step": 2137
},
{
"epoch": 2.6662507797878976,
"grad_norm": 0.12341205651411256,
"learning_rate": 1.2920138216858791e-06,
"loss": 0.0286,
"num_tokens": 171189575.0,
"step": 2138
},
{
"epoch": 2.667498440424205,
"grad_norm": 0.11331697834247403,
"learning_rate": 1.289870322779212e-06,
"loss": 0.0285,
"num_tokens": 171268927.0,
"step": 2139
},
{
"epoch": 2.6687461010605116,
"grad_norm": 0.11882020374000711,
"learning_rate": 1.2877344581101922e-06,
"loss": 0.0272,
"num_tokens": 171347477.0,
"step": 2140
},
{
"epoch": 2.6699937616968183,
"grad_norm": 0.10972444404388286,
"learning_rate": 1.2856062315517885e-06,
"loss": 0.0278,
"num_tokens": 171427407.0,
"step": 2141
},
{
"epoch": 2.671241422333125,
"grad_norm": 0.11946680862932044,
"learning_rate": 1.2834856469631174e-06,
"loss": 0.028,
"num_tokens": 171507650.0,
"step": 2142
},
{
"epoch": 2.6724890829694323,
"grad_norm": 0.10858577662792648,
"learning_rate": 1.28137270818944e-06,
"loss": 0.0276,
"num_tokens": 171588030.0,
"step": 2143
},
{
"epoch": 2.673736743605739,
"grad_norm": 0.11241775634496111,
"learning_rate": 1.279267419062155e-06,
"loss": 0.0272,
"num_tokens": 171668622.0,
"step": 2144
},
{
"epoch": 2.6749844042420463,
"grad_norm": 0.1138275210262026,
"learning_rate": 1.2771697833987852e-06,
"loss": 0.0274,
"num_tokens": 171747879.0,
"step": 2145
},
{
"epoch": 2.676232064878353,
"grad_norm": 0.12107882370247575,
"learning_rate": 1.2750798050029782e-06,
"loss": 0.0301,
"num_tokens": 171828495.0,
"step": 2146
},
{
"epoch": 2.67747972551466,
"grad_norm": 0.11812881048310148,
"learning_rate": 1.272997487664499e-06,
"loss": 0.0289,
"num_tokens": 171908174.0,
"step": 2147
},
{
"epoch": 2.678727386150967,
"grad_norm": 0.10269224386186213,
"learning_rate": 1.2709228351592167e-06,
"loss": 0.0261,
"num_tokens": 171988943.0,
"step": 2148
},
{
"epoch": 2.679975046787274,
"grad_norm": 0.10927504867119865,
"learning_rate": 1.2688558512491032e-06,
"loss": 0.028,
"num_tokens": 172068842.0,
"step": 2149
},
{
"epoch": 2.681222707423581,
"grad_norm": 0.11683785756855145,
"learning_rate": 1.2667965396822257e-06,
"loss": 0.0275,
"num_tokens": 172148664.0,
"step": 2150
},
{
"epoch": 2.682470368059888,
"grad_norm": 0.12167175622894594,
"learning_rate": 1.2647449041927385e-06,
"loss": 0.0278,
"num_tokens": 172229020.0,
"step": 2151
},
{
"epoch": 2.6837180286961946,
"grad_norm": 0.10888309628100702,
"learning_rate": 1.2627009485008754e-06,
"loss": 0.0272,
"num_tokens": 172309720.0,
"step": 2152
},
{
"epoch": 2.6849656893325013,
"grad_norm": 0.122912436571624,
"learning_rate": 1.2606646763129476e-06,
"loss": 0.0284,
"num_tokens": 172389961.0,
"step": 2153
},
{
"epoch": 2.6862133499688086,
"grad_norm": 0.10857265378270686,
"learning_rate": 1.2586360913213315e-06,
"loss": 0.0262,
"num_tokens": 172471162.0,
"step": 2154
},
{
"epoch": 2.6874610106051153,
"grad_norm": 0.11669471057841353,
"learning_rate": 1.256615197204465e-06,
"loss": 0.0283,
"num_tokens": 172551948.0,
"step": 2155
},
{
"epoch": 2.6887086712414225,
"grad_norm": 0.13329440954951594,
"learning_rate": 1.2546019976268403e-06,
"loss": 0.0292,
"num_tokens": 172632487.0,
"step": 2156
},
{
"epoch": 2.6899563318777293,
"grad_norm": 0.11522808892441781,
"learning_rate": 1.2525964962389961e-06,
"loss": 0.027,
"num_tokens": 172713350.0,
"step": 2157
},
{
"epoch": 2.691203992514036,
"grad_norm": 0.11229194690407834,
"learning_rate": 1.250598696677512e-06,
"loss": 0.0278,
"num_tokens": 172792803.0,
"step": 2158
},
{
"epoch": 2.692451653150343,
"grad_norm": 0.10819150987759615,
"learning_rate": 1.2486086025650045e-06,
"loss": 0.0269,
"num_tokens": 172873683.0,
"step": 2159
},
{
"epoch": 2.69369931378665,
"grad_norm": 0.10976961567370531,
"learning_rate": 1.246626217510114e-06,
"loss": 0.0273,
"num_tokens": 172953708.0,
"step": 2160
},
{
"epoch": 2.694946974422957,
"grad_norm": 0.10593975910980304,
"learning_rate": 1.244651545107503e-06,
"loss": 0.0275,
"num_tokens": 173034948.0,
"step": 2161
},
{
"epoch": 2.696194635059264,
"grad_norm": 0.10752240766626842,
"learning_rate": 1.2426845889378516e-06,
"loss": 0.0271,
"num_tokens": 173114456.0,
"step": 2162
},
{
"epoch": 2.697442295695571,
"grad_norm": 0.11100439809268697,
"learning_rate": 1.2407253525678453e-06,
"loss": 0.027,
"num_tokens": 173193677.0,
"step": 2163
},
{
"epoch": 2.6986899563318776,
"grad_norm": 0.11124936556530347,
"learning_rate": 1.2387738395501714e-06,
"loss": 0.028,
"num_tokens": 173273829.0,
"step": 2164
},
{
"epoch": 2.699937616968185,
"grad_norm": 0.11341877617750057,
"learning_rate": 1.236830053423512e-06,
"loss": 0.0279,
"num_tokens": 173354403.0,
"step": 2165
},
{
"epoch": 2.7011852776044916,
"grad_norm": 0.10432445599121043,
"learning_rate": 1.2348939977125412e-06,
"loss": 0.0273,
"num_tokens": 173434752.0,
"step": 2166
},
{
"epoch": 2.7024329382407988,
"grad_norm": 0.10349647102790291,
"learning_rate": 1.2329656759279108e-06,
"loss": 0.0265,
"num_tokens": 173513891.0,
"step": 2167
},
{
"epoch": 2.7036805988771055,
"grad_norm": 0.10730067378099671,
"learning_rate": 1.2310450915662516e-06,
"loss": 0.0263,
"num_tokens": 173593347.0,
"step": 2168
},
{
"epoch": 2.7049282595134123,
"grad_norm": 0.12358364979067543,
"learning_rate": 1.229132248110165e-06,
"loss": 0.0289,
"num_tokens": 173673697.0,
"step": 2169
},
{
"epoch": 2.706175920149719,
"grad_norm": 0.121969574680686,
"learning_rate": 1.2272271490282134e-06,
"loss": 0.0273,
"num_tokens": 173753524.0,
"step": 2170
},
{
"epoch": 2.7074235807860263,
"grad_norm": 0.10988278388736815,
"learning_rate": 1.2253297977749163e-06,
"loss": 0.0267,
"num_tokens": 173833499.0,
"step": 2171
},
{
"epoch": 2.708671241422333,
"grad_norm": 0.11127655885847154,
"learning_rate": 1.2234401977907468e-06,
"loss": 0.0276,
"num_tokens": 173913319.0,
"step": 2172
},
{
"epoch": 2.7099189020586403,
"grad_norm": 0.11247307459253601,
"learning_rate": 1.2215583525021203e-06,
"loss": 0.0273,
"num_tokens": 173992389.0,
"step": 2173
},
{
"epoch": 2.711166562694947,
"grad_norm": 0.11437743854655531,
"learning_rate": 1.2196842653213896e-06,
"loss": 0.0294,
"num_tokens": 174072025.0,
"step": 2174
},
{
"epoch": 2.712414223331254,
"grad_norm": 0.11260264853485703,
"learning_rate": 1.2178179396468428e-06,
"loss": 0.0284,
"num_tokens": 174152930.0,
"step": 2175
},
{
"epoch": 2.7136618839675606,
"grad_norm": 0.12047852452761347,
"learning_rate": 1.215959378862692e-06,
"loss": 0.0276,
"num_tokens": 174232157.0,
"step": 2176
},
{
"epoch": 2.714909544603868,
"grad_norm": 0.10620348737338733,
"learning_rate": 1.2141085863390696e-06,
"loss": 0.0267,
"num_tokens": 174312869.0,
"step": 2177
},
{
"epoch": 2.7161572052401746,
"grad_norm": 0.10200387743067355,
"learning_rate": 1.2122655654320225e-06,
"loss": 0.026,
"num_tokens": 174391850.0,
"step": 2178
},
{
"epoch": 2.717404865876482,
"grad_norm": 0.11792058541690678,
"learning_rate": 1.210430319483504e-06,
"loss": 0.0279,
"num_tokens": 174472240.0,
"step": 2179
},
{
"epoch": 2.7186525265127885,
"grad_norm": 0.11352708440547712,
"learning_rate": 1.2086028518213694e-06,
"loss": 0.0273,
"num_tokens": 174551750.0,
"step": 2180
},
{
"epoch": 2.7199001871490953,
"grad_norm": 0.11403171270711979,
"learning_rate": 1.206783165759371e-06,
"loss": 0.0273,
"num_tokens": 174631491.0,
"step": 2181
},
{
"epoch": 2.7211478477854025,
"grad_norm": 0.1196489902512615,
"learning_rate": 1.204971264597148e-06,
"loss": 0.0275,
"num_tokens": 174711527.0,
"step": 2182
},
{
"epoch": 2.7223955084217093,
"grad_norm": 0.1081192153362361,
"learning_rate": 1.2031671516202263e-06,
"loss": 0.0272,
"num_tokens": 174790208.0,
"step": 2183
},
{
"epoch": 2.723643169058016,
"grad_norm": 0.11976247939116517,
"learning_rate": 1.2013708301000082e-06,
"loss": 0.028,
"num_tokens": 174870561.0,
"step": 2184
},
{
"epoch": 2.7248908296943233,
"grad_norm": 0.11442096061556735,
"learning_rate": 1.199582303293767e-06,
"loss": 0.0274,
"num_tokens": 174951422.0,
"step": 2185
},
{
"epoch": 2.72613849033063,
"grad_norm": 0.11617050642869951,
"learning_rate": 1.1978015744446417e-06,
"loss": 0.0271,
"num_tokens": 175031463.0,
"step": 2186
},
{
"epoch": 2.727386150966937,
"grad_norm": 0.12293559783309263,
"learning_rate": 1.1960286467816331e-06,
"loss": 0.0278,
"num_tokens": 175112098.0,
"step": 2187
},
{
"epoch": 2.728633811603244,
"grad_norm": 0.11221752709437237,
"learning_rate": 1.1942635235195949e-06,
"loss": 0.0285,
"num_tokens": 175192431.0,
"step": 2188
},
{
"epoch": 2.729881472239551,
"grad_norm": 0.11210559785351049,
"learning_rate": 1.1925062078592279e-06,
"loss": 0.0267,
"num_tokens": 175272604.0,
"step": 2189
},
{
"epoch": 2.731129132875858,
"grad_norm": 0.10832068295079063,
"learning_rate": 1.190756702987077e-06,
"loss": 0.0269,
"num_tokens": 175352453.0,
"step": 2190
},
{
"epoch": 2.732376793512165,
"grad_norm": 0.12338756235671419,
"learning_rate": 1.1890150120755244e-06,
"loss": 0.0284,
"num_tokens": 175432528.0,
"step": 2191
},
{
"epoch": 2.7336244541484715,
"grad_norm": 0.11219089612196873,
"learning_rate": 1.1872811382827811e-06,
"loss": 0.027,
"num_tokens": 175512749.0,
"step": 2192
},
{
"epoch": 2.7348721147847783,
"grad_norm": 0.12417263033167164,
"learning_rate": 1.1855550847528849e-06,
"loss": 0.0271,
"num_tokens": 175593593.0,
"step": 2193
},
{
"epoch": 2.7361197754210855,
"grad_norm": 0.11709495692614103,
"learning_rate": 1.1838368546156924e-06,
"loss": 0.0266,
"num_tokens": 175674330.0,
"step": 2194
},
{
"epoch": 2.7373674360573923,
"grad_norm": 0.11715755751386901,
"learning_rate": 1.182126450986874e-06,
"loss": 0.028,
"num_tokens": 175754689.0,
"step": 2195
},
{
"epoch": 2.7386150966936995,
"grad_norm": 0.11080050252306903,
"learning_rate": 1.1804238769679077e-06,
"loss": 0.0273,
"num_tokens": 175834619.0,
"step": 2196
},
{
"epoch": 2.7398627573300063,
"grad_norm": 0.11676541458720631,
"learning_rate": 1.178729135646077e-06,
"loss": 0.0269,
"num_tokens": 175914942.0,
"step": 2197
},
{
"epoch": 2.741110417966313,
"grad_norm": 0.11457451396067345,
"learning_rate": 1.1770422300944586e-06,
"loss": 0.028,
"num_tokens": 175996002.0,
"step": 2198
},
{
"epoch": 2.74235807860262,
"grad_norm": 0.10756801142995977,
"learning_rate": 1.1753631633719217e-06,
"loss": 0.0281,
"num_tokens": 176076559.0,
"step": 2199
},
{
"epoch": 2.743605739238927,
"grad_norm": 0.12404043322828145,
"learning_rate": 1.1736919385231236e-06,
"loss": 0.028,
"num_tokens": 176156992.0,
"step": 2200
},
{
"epoch": 2.744853399875234,
"grad_norm": 0.10783216704576966,
"learning_rate": 1.1720285585784983e-06,
"loss": 0.0258,
"num_tokens": 176235147.0,
"step": 2201
},
{
"epoch": 2.746101060511541,
"grad_norm": 0.11227645559258542,
"learning_rate": 1.1703730265542569e-06,
"loss": 0.0275,
"num_tokens": 176315166.0,
"step": 2202
},
{
"epoch": 2.747348721147848,
"grad_norm": 0.11576428333299565,
"learning_rate": 1.16872534545238e-06,
"loss": 0.028,
"num_tokens": 176395152.0,
"step": 2203
},
{
"epoch": 2.7485963817841546,
"grad_norm": 0.11841116972911843,
"learning_rate": 1.1670855182606106e-06,
"loss": 0.0274,
"num_tokens": 176476772.0,
"step": 2204
},
{
"epoch": 2.7498440424204618,
"grad_norm": 0.10483183967952564,
"learning_rate": 1.1654535479524511e-06,
"loss": 0.0269,
"num_tokens": 176556474.0,
"step": 2205
},
{
"epoch": 2.7510917030567685,
"grad_norm": 0.11471768456732195,
"learning_rate": 1.163829437487158e-06,
"loss": 0.0277,
"num_tokens": 176637858.0,
"step": 2206
},
{
"epoch": 2.7523393636930757,
"grad_norm": 0.108264123677895,
"learning_rate": 1.162213189809734e-06,
"loss": 0.0271,
"num_tokens": 176717271.0,
"step": 2207
},
{
"epoch": 2.7535870243293825,
"grad_norm": 0.15282329766969122,
"learning_rate": 1.1606048078509235e-06,
"loss": 0.0346,
"num_tokens": 176797686.0,
"step": 2208
},
{
"epoch": 2.7548346849656893,
"grad_norm": 0.11649281003810305,
"learning_rate": 1.1590042945272108e-06,
"loss": 0.0272,
"num_tokens": 176876658.0,
"step": 2209
},
{
"epoch": 2.756082345601996,
"grad_norm": 0.10842683509303165,
"learning_rate": 1.1574116527408093e-06,
"loss": 0.0267,
"num_tokens": 176956269.0,
"step": 2210
},
{
"epoch": 2.7573300062383033,
"grad_norm": 0.11131932466818467,
"learning_rate": 1.1558268853796597e-06,
"loss": 0.0269,
"num_tokens": 177036068.0,
"step": 2211
},
{
"epoch": 2.75857766687461,
"grad_norm": 0.12181987186342921,
"learning_rate": 1.1542499953174257e-06,
"loss": 0.0282,
"num_tokens": 177117826.0,
"step": 2212
},
{
"epoch": 2.7598253275109172,
"grad_norm": 0.11099109738309532,
"learning_rate": 1.1526809854134844e-06,
"loss": 0.0281,
"num_tokens": 177198291.0,
"step": 2213
},
{
"epoch": 2.761072988147224,
"grad_norm": 0.11310340011164033,
"learning_rate": 1.151119858512925e-06,
"loss": 0.0268,
"num_tokens": 177278136.0,
"step": 2214
},
{
"epoch": 2.762320648783531,
"grad_norm": 0.12040084886173383,
"learning_rate": 1.149566617446543e-06,
"loss": 0.0273,
"num_tokens": 177358685.0,
"step": 2215
},
{
"epoch": 2.7635683094198376,
"grad_norm": 0.11023952574462588,
"learning_rate": 1.1480212650308337e-06,
"loss": 0.0277,
"num_tokens": 177439052.0,
"step": 2216
},
{
"epoch": 2.7648159700561448,
"grad_norm": 0.10245924815558206,
"learning_rate": 1.1464838040679876e-06,
"loss": 0.0265,
"num_tokens": 177518673.0,
"step": 2217
},
{
"epoch": 2.7660636306924515,
"grad_norm": 0.11896518323741509,
"learning_rate": 1.1449542373458867e-06,
"loss": 0.028,
"num_tokens": 177599813.0,
"step": 2218
},
{
"epoch": 2.7673112913287587,
"grad_norm": 0.11721840542278891,
"learning_rate": 1.1434325676380983e-06,
"loss": 0.0275,
"num_tokens": 177680047.0,
"step": 2219
},
{
"epoch": 2.7685589519650655,
"grad_norm": 0.11242112554545239,
"learning_rate": 1.141918797703868e-06,
"loss": 0.0273,
"num_tokens": 177759176.0,
"step": 2220
},
{
"epoch": 2.7698066126013723,
"grad_norm": 0.12213920330413136,
"learning_rate": 1.1404129302881193e-06,
"loss": 0.0276,
"num_tokens": 177840002.0,
"step": 2221
},
{
"epoch": 2.7710542732376795,
"grad_norm": 0.11684412169186235,
"learning_rate": 1.1389149681214456e-06,
"loss": 0.0285,
"num_tokens": 177920088.0,
"step": 2222
},
{
"epoch": 2.7723019338739863,
"grad_norm": 0.1119053015497875,
"learning_rate": 1.1374249139201035e-06,
"loss": 0.0274,
"num_tokens": 177999727.0,
"step": 2223
},
{
"epoch": 2.773549594510293,
"grad_norm": 0.11111524937162807,
"learning_rate": 1.135942770386013e-06,
"loss": 0.0268,
"num_tokens": 178078737.0,
"step": 2224
},
{
"epoch": 2.7747972551466002,
"grad_norm": 0.10700815561780962,
"learning_rate": 1.1344685402067475e-06,
"loss": 0.0269,
"num_tokens": 178157602.0,
"step": 2225
},
{
"epoch": 2.776044915782907,
"grad_norm": 0.12352603858436706,
"learning_rate": 1.1330022260555321e-06,
"loss": 0.0284,
"num_tokens": 178238153.0,
"step": 2226
},
{
"epoch": 2.777292576419214,
"grad_norm": 0.10878032533922675,
"learning_rate": 1.1315438305912377e-06,
"loss": 0.0265,
"num_tokens": 178317857.0,
"step": 2227
},
{
"epoch": 2.778540237055521,
"grad_norm": 0.11929846361121192,
"learning_rate": 1.1300933564583764e-06,
"loss": 0.0275,
"num_tokens": 178398144.0,
"step": 2228
},
{
"epoch": 2.7797878976918278,
"grad_norm": 0.11575836382461366,
"learning_rate": 1.1286508062870952e-06,
"loss": 0.0277,
"num_tokens": 178477765.0,
"step": 2229
},
{
"epoch": 2.781035558328135,
"grad_norm": 0.10742117792303336,
"learning_rate": 1.1272161826931745e-06,
"loss": 0.0265,
"num_tokens": 178558431.0,
"step": 2230
},
{
"epoch": 2.7822832189644418,
"grad_norm": 0.11418872199176901,
"learning_rate": 1.1257894882780206e-06,
"loss": 0.0283,
"num_tokens": 178639242.0,
"step": 2231
},
{
"epoch": 2.7835308796007485,
"grad_norm": 0.10723962416090788,
"learning_rate": 1.1243707256286606e-06,
"loss": 0.0271,
"num_tokens": 178719072.0,
"step": 2232
},
{
"epoch": 2.7847785402370553,
"grad_norm": 0.10881136487138986,
"learning_rate": 1.1229598973177407e-06,
"loss": 0.0272,
"num_tokens": 178797551.0,
"step": 2233
},
{
"epoch": 2.7860262008733625,
"grad_norm": 0.1033276997183416,
"learning_rate": 1.1215570059035199e-06,
"loss": 0.0261,
"num_tokens": 178876643.0,
"step": 2234
},
{
"epoch": 2.7872738615096693,
"grad_norm": 0.11221713077894142,
"learning_rate": 1.1201620539298636e-06,
"loss": 0.0277,
"num_tokens": 178956890.0,
"step": 2235
},
{
"epoch": 2.7885215221459765,
"grad_norm": 0.11497597690368146,
"learning_rate": 1.1187750439262405e-06,
"loss": 0.0274,
"num_tokens": 179036190.0,
"step": 2236
},
{
"epoch": 2.7897691827822833,
"grad_norm": 0.11685097421442404,
"learning_rate": 1.1173959784077207e-06,
"loss": 0.0292,
"num_tokens": 179117241.0,
"step": 2237
},
{
"epoch": 2.79101684341859,
"grad_norm": 0.11901364215641137,
"learning_rate": 1.1160248598749652e-06,
"loss": 0.0286,
"num_tokens": 179197576.0,
"step": 2238
},
{
"epoch": 2.7922645040548972,
"grad_norm": 0.10306800380453014,
"learning_rate": 1.114661690814227e-06,
"loss": 0.0266,
"num_tokens": 179276756.0,
"step": 2239
},
{
"epoch": 2.793512164691204,
"grad_norm": 0.11195164030232999,
"learning_rate": 1.1133064736973443e-06,
"loss": 0.0265,
"num_tokens": 179357775.0,
"step": 2240
},
{
"epoch": 2.7947598253275108,
"grad_norm": 0.11752175021287195,
"learning_rate": 1.1119592109817346e-06,
"loss": 0.0275,
"num_tokens": 179438867.0,
"step": 2241
},
{
"epoch": 2.796007485963818,
"grad_norm": 0.1140881376229996,
"learning_rate": 1.1106199051103922e-06,
"loss": 0.0271,
"num_tokens": 179518359.0,
"step": 2242
},
{
"epoch": 2.7972551466001248,
"grad_norm": 0.11251686289836357,
"learning_rate": 1.109288558511884e-06,
"loss": 0.0268,
"num_tokens": 179597988.0,
"step": 2243
},
{
"epoch": 2.7985028072364315,
"grad_norm": 0.10788731583211555,
"learning_rate": 1.1079651736003441e-06,
"loss": 0.0273,
"num_tokens": 179678144.0,
"step": 2244
},
{
"epoch": 2.7997504678727387,
"grad_norm": 0.11520086050466517,
"learning_rate": 1.106649752775468e-06,
"loss": 0.0267,
"num_tokens": 179758675.0,
"step": 2245
},
{
"epoch": 2.8009981285090455,
"grad_norm": 0.12597826940633167,
"learning_rate": 1.1053422984225127e-06,
"loss": 0.027,
"num_tokens": 179839093.0,
"step": 2246
},
{
"epoch": 2.8022457891453527,
"grad_norm": 0.11239946957555676,
"learning_rate": 1.1040428129122873e-06,
"loss": 0.0265,
"num_tokens": 179920234.0,
"step": 2247
},
{
"epoch": 2.8034934497816595,
"grad_norm": 0.11485169689579346,
"learning_rate": 1.102751298601152e-06,
"loss": 0.0272,
"num_tokens": 179999475.0,
"step": 2248
},
{
"epoch": 2.8047411104179663,
"grad_norm": 0.11199877474314972,
"learning_rate": 1.1014677578310128e-06,
"loss": 0.0277,
"num_tokens": 180078857.0,
"step": 2249
},
{
"epoch": 2.805988771054273,
"grad_norm": 0.11085777355981787,
"learning_rate": 1.1001921929293172e-06,
"loss": 0.0281,
"num_tokens": 180157620.0,
"step": 2250
},
{
"epoch": 2.8072364316905802,
"grad_norm": 0.11095268337610681,
"learning_rate": 1.0989246062090495e-06,
"loss": 0.0269,
"num_tokens": 180237202.0,
"step": 2251
},
{
"epoch": 2.808484092326887,
"grad_norm": 0.1314904657457488,
"learning_rate": 1.0976649999687282e-06,
"loss": 0.0273,
"num_tokens": 180316592.0,
"step": 2252
},
{
"epoch": 2.809731752963194,
"grad_norm": 0.10715724234388715,
"learning_rate": 1.096413376492399e-06,
"loss": 0.0272,
"num_tokens": 180396786.0,
"step": 2253
},
{
"epoch": 2.810979413599501,
"grad_norm": 0.11446876586604864,
"learning_rate": 1.0951697380496343e-06,
"loss": 0.0267,
"num_tokens": 180477208.0,
"step": 2254
},
{
"epoch": 2.8122270742358078,
"grad_norm": 0.10000594303521496,
"learning_rate": 1.093934086895526e-06,
"loss": 0.0262,
"num_tokens": 180557503.0,
"step": 2255
},
{
"epoch": 2.8134747348721145,
"grad_norm": 0.11468231454145983,
"learning_rate": 1.0927064252706845e-06,
"loss": 0.0264,
"num_tokens": 180636993.0,
"step": 2256
},
{
"epoch": 2.8147223955084217,
"grad_norm": 0.11326974384224862,
"learning_rate": 1.0914867554012297e-06,
"loss": 0.028,
"num_tokens": 180717357.0,
"step": 2257
},
{
"epoch": 2.8159700561447285,
"grad_norm": 0.12011090419081122,
"learning_rate": 1.090275079498793e-06,
"loss": 0.0286,
"num_tokens": 180796981.0,
"step": 2258
},
{
"epoch": 2.8172177167810357,
"grad_norm": 0.11208373588605422,
"learning_rate": 1.0890713997605085e-06,
"loss": 0.0276,
"num_tokens": 180876805.0,
"step": 2259
},
{
"epoch": 2.8184653774173425,
"grad_norm": 0.11929554034691514,
"learning_rate": 1.0878757183690112e-06,
"loss": 0.0276,
"num_tokens": 180956790.0,
"step": 2260
},
{
"epoch": 2.8197130380536493,
"grad_norm": 0.10883696590219329,
"learning_rate": 1.086688037492433e-06,
"loss": 0.027,
"num_tokens": 181037554.0,
"step": 2261
},
{
"epoch": 2.8209606986899565,
"grad_norm": 0.11656014791821229,
"learning_rate": 1.0855083592843985e-06,
"loss": 0.028,
"num_tokens": 181117563.0,
"step": 2262
},
{
"epoch": 2.8222083593262632,
"grad_norm": 0.10887563809750479,
"learning_rate": 1.0843366858840209e-06,
"loss": 0.0269,
"num_tokens": 181197130.0,
"step": 2263
},
{
"epoch": 2.8234560199625705,
"grad_norm": 0.10788088292257987,
"learning_rate": 1.0831730194158982e-06,
"loss": 0.0266,
"num_tokens": 181276375.0,
"step": 2264
},
{
"epoch": 2.824703680598877,
"grad_norm": 0.10930839605540808,
"learning_rate": 1.0820173619901093e-06,
"loss": 0.0271,
"num_tokens": 181356017.0,
"step": 2265
},
{
"epoch": 2.825951341235184,
"grad_norm": 0.12259984441092837,
"learning_rate": 1.08086971570221e-06,
"loss": 0.0292,
"num_tokens": 181436275.0,
"step": 2266
},
{
"epoch": 2.8271990018714908,
"grad_norm": 0.10639541929699158,
"learning_rate": 1.0797300826332307e-06,
"loss": 0.0268,
"num_tokens": 181516434.0,
"step": 2267
},
{
"epoch": 2.828446662507798,
"grad_norm": 0.12324754295042131,
"learning_rate": 1.07859846484967e-06,
"loss": 0.028,
"num_tokens": 181597476.0,
"step": 2268
},
{
"epoch": 2.8296943231441047,
"grad_norm": 0.10801092400541287,
"learning_rate": 1.0774748644034936e-06,
"loss": 0.0271,
"num_tokens": 181677449.0,
"step": 2269
},
{
"epoch": 2.830941983780412,
"grad_norm": 0.11429743139755069,
"learning_rate": 1.0763592833321277e-06,
"loss": 0.0269,
"num_tokens": 181757429.0,
"step": 2270
},
{
"epoch": 2.8321896444167187,
"grad_norm": 0.11295710267647807,
"learning_rate": 1.0752517236584595e-06,
"loss": 0.027,
"num_tokens": 181836252.0,
"step": 2271
},
{
"epoch": 2.8334373050530255,
"grad_norm": 0.11139902033265346,
"learning_rate": 1.0741521873908283e-06,
"loss": 0.0268,
"num_tokens": 181916218.0,
"step": 2272
},
{
"epoch": 2.8346849656893323,
"grad_norm": 0.1054002144624115,
"learning_rate": 1.0730606765230257e-06,
"loss": 0.0269,
"num_tokens": 181995689.0,
"step": 2273
},
{
"epoch": 2.8359326263256395,
"grad_norm": 0.10302028960792996,
"learning_rate": 1.0719771930342913e-06,
"loss": 0.0261,
"num_tokens": 182075038.0,
"step": 2274
},
{
"epoch": 2.8371802869619462,
"grad_norm": 0.11798628087878685,
"learning_rate": 1.0709017388893075e-06,
"loss": 0.0283,
"num_tokens": 182154597.0,
"step": 2275
},
{
"epoch": 2.8384279475982535,
"grad_norm": 0.11276374561736073,
"learning_rate": 1.0698343160381987e-06,
"loss": 0.0266,
"num_tokens": 182233887.0,
"step": 2276
},
{
"epoch": 2.8396756082345602,
"grad_norm": 0.1174576965951362,
"learning_rate": 1.0687749264165248e-06,
"loss": 0.028,
"num_tokens": 182314132.0,
"step": 2277
},
{
"epoch": 2.840923268870867,
"grad_norm": 0.10814129453586245,
"learning_rate": 1.067723571945279e-06,
"loss": 0.0261,
"num_tokens": 182394712.0,
"step": 2278
},
{
"epoch": 2.842170929507174,
"grad_norm": 0.11723270887984091,
"learning_rate": 1.0666802545308847e-06,
"loss": 0.0281,
"num_tokens": 182476047.0,
"step": 2279
},
{
"epoch": 2.843418590143481,
"grad_norm": 0.11081835835199184,
"learning_rate": 1.065644976065193e-06,
"loss": 0.0279,
"num_tokens": 182554762.0,
"step": 2280
},
{
"epoch": 2.8446662507797877,
"grad_norm": 0.10738520261965014,
"learning_rate": 1.0646177384254747e-06,
"loss": 0.027,
"num_tokens": 182633545.0,
"step": 2281
},
{
"epoch": 2.845913911416095,
"grad_norm": 0.10253227627415959,
"learning_rate": 1.063598543474423e-06,
"loss": 0.0263,
"num_tokens": 182711682.0,
"step": 2282
},
{
"epoch": 2.8471615720524017,
"grad_norm": 0.11632474054875933,
"learning_rate": 1.062587393060147e-06,
"loss": 0.0277,
"num_tokens": 182791996.0,
"step": 2283
},
{
"epoch": 2.8484092326887085,
"grad_norm": 0.11239301355373442,
"learning_rate": 1.0615842890161675e-06,
"loss": 0.0267,
"num_tokens": 182871570.0,
"step": 2284
},
{
"epoch": 2.8496568933250157,
"grad_norm": 0.1260833333963082,
"learning_rate": 1.0605892331614158e-06,
"loss": 0.0305,
"num_tokens": 182951531.0,
"step": 2285
},
{
"epoch": 2.8509045539613225,
"grad_norm": 0.10700130560287861,
"learning_rate": 1.0596022273002282e-06,
"loss": 0.0268,
"num_tokens": 183030679.0,
"step": 2286
},
{
"epoch": 2.8521522145976297,
"grad_norm": 0.11364031790238421,
"learning_rate": 1.0586232732223446e-06,
"loss": 0.0271,
"num_tokens": 183112076.0,
"step": 2287
},
{
"epoch": 2.8533998752339365,
"grad_norm": 0.10879548973912555,
"learning_rate": 1.0576523727029053e-06,
"loss": 0.0273,
"num_tokens": 183192328.0,
"step": 2288
},
{
"epoch": 2.8546475358702432,
"grad_norm": 0.11352263022080847,
"learning_rate": 1.0566895275024458e-06,
"loss": 0.0297,
"num_tokens": 183271794.0,
"step": 2289
},
{
"epoch": 2.85589519650655,
"grad_norm": 0.11082853865055407,
"learning_rate": 1.0557347393668966e-06,
"loss": 0.027,
"num_tokens": 183350787.0,
"step": 2290
},
{
"epoch": 2.857142857142857,
"grad_norm": 0.11456889825294071,
"learning_rate": 1.0547880100275755e-06,
"loss": 0.0275,
"num_tokens": 183431435.0,
"step": 2291
},
{
"epoch": 2.858390517779164,
"grad_norm": 0.11016237704435282,
"learning_rate": 1.0538493412011901e-06,
"loss": 0.0267,
"num_tokens": 183510859.0,
"step": 2292
},
{
"epoch": 2.859638178415471,
"grad_norm": 0.11417004760246512,
"learning_rate": 1.0529187345898304e-06,
"loss": 0.0277,
"num_tokens": 183591361.0,
"step": 2293
},
{
"epoch": 2.860885839051778,
"grad_norm": 0.11122299199182507,
"learning_rate": 1.0519961918809675e-06,
"loss": 0.0271,
"num_tokens": 183671105.0,
"step": 2294
},
{
"epoch": 2.8621334996880847,
"grad_norm": 0.11426790938884109,
"learning_rate": 1.05108171474745e-06,
"loss": 0.0275,
"num_tokens": 183751497.0,
"step": 2295
},
{
"epoch": 2.8633811603243915,
"grad_norm": 0.1132178603222439,
"learning_rate": 1.050175304847502e-06,
"loss": 0.0276,
"num_tokens": 183831206.0,
"step": 2296
},
{
"epoch": 2.8646288209606987,
"grad_norm": 0.11247491256932537,
"learning_rate": 1.0492769638247177e-06,
"loss": 0.0278,
"num_tokens": 183912383.0,
"step": 2297
},
{
"epoch": 2.8658764815970055,
"grad_norm": 0.10746653770671626,
"learning_rate": 1.0483866933080611e-06,
"loss": 0.0265,
"num_tokens": 183992514.0,
"step": 2298
},
{
"epoch": 2.8671241422333127,
"grad_norm": 0.11160208763303599,
"learning_rate": 1.0475044949118624e-06,
"loss": 0.0276,
"num_tokens": 184072119.0,
"step": 2299
},
{
"epoch": 2.8683718028696195,
"grad_norm": 0.11850309053487157,
"learning_rate": 1.0466303702358139e-06,
"loss": 0.0279,
"num_tokens": 184155215.0,
"step": 2300
},
{
"epoch": 2.8696194635059262,
"grad_norm": 0.12631852786380027,
"learning_rate": 1.0457643208649665e-06,
"loss": 0.0288,
"num_tokens": 184235940.0,
"step": 2301
},
{
"epoch": 2.8708671241422334,
"grad_norm": 0.11138287151881872,
"learning_rate": 1.044906348369731e-06,
"loss": 0.0276,
"num_tokens": 184316322.0,
"step": 2302
},
{
"epoch": 2.87211478477854,
"grad_norm": 0.10931209504132226,
"learning_rate": 1.0440564543058703e-06,
"loss": 0.0268,
"num_tokens": 184396446.0,
"step": 2303
},
{
"epoch": 2.8733624454148474,
"grad_norm": 0.11012682455226674,
"learning_rate": 1.0432146402144986e-06,
"loss": 0.0278,
"num_tokens": 184475613.0,
"step": 2304
},
{
"epoch": 2.874610106051154,
"grad_norm": 0.10769527893932587,
"learning_rate": 1.0423809076220805e-06,
"loss": 0.0269,
"num_tokens": 184555834.0,
"step": 2305
},
{
"epoch": 2.875857766687461,
"grad_norm": 0.12501985655306355,
"learning_rate": 1.041555258040425e-06,
"loss": 0.0289,
"num_tokens": 184637311.0,
"step": 2306
},
{
"epoch": 2.8771054273237677,
"grad_norm": 0.10611283848990914,
"learning_rate": 1.0407376929666833e-06,
"loss": 0.0272,
"num_tokens": 184717540.0,
"step": 2307
},
{
"epoch": 2.878353087960075,
"grad_norm": 0.11684908656337946,
"learning_rate": 1.0399282138833488e-06,
"loss": 0.0275,
"num_tokens": 184796663.0,
"step": 2308
},
{
"epoch": 2.8796007485963817,
"grad_norm": 0.12163581971776054,
"learning_rate": 1.039126822258252e-06,
"loss": 0.0277,
"num_tokens": 184877517.0,
"step": 2309
},
{
"epoch": 2.880848409232689,
"grad_norm": 0.11561664029854038,
"learning_rate": 1.0383335195445573e-06,
"loss": 0.0271,
"num_tokens": 184956700.0,
"step": 2310
},
{
"epoch": 2.8820960698689957,
"grad_norm": 0.11377476242109719,
"learning_rate": 1.0375483071807626e-06,
"loss": 0.0277,
"num_tokens": 185036719.0,
"step": 2311
},
{
"epoch": 2.8833437305053025,
"grad_norm": 0.10571550440670317,
"learning_rate": 1.036771186590696e-06,
"loss": 0.0268,
"num_tokens": 185116185.0,
"step": 2312
},
{
"epoch": 2.8845913911416092,
"grad_norm": 0.11078801585715989,
"learning_rate": 1.0360021591835108e-06,
"loss": 0.0274,
"num_tokens": 185195865.0,
"step": 2313
},
{
"epoch": 2.8858390517779164,
"grad_norm": 0.10587436172290575,
"learning_rate": 1.0352412263536868e-06,
"loss": 0.0258,
"num_tokens": 185275227.0,
"step": 2314
},
{
"epoch": 2.887086712414223,
"grad_norm": 0.10645437206340462,
"learning_rate": 1.0344883894810257e-06,
"loss": 0.0274,
"num_tokens": 185354901.0,
"step": 2315
},
{
"epoch": 2.8883343730505304,
"grad_norm": 0.11587554997670732,
"learning_rate": 1.033743649930647e-06,
"loss": 0.0263,
"num_tokens": 185433853.0,
"step": 2316
},
{
"epoch": 2.889582033686837,
"grad_norm": 0.11506509844410158,
"learning_rate": 1.03300700905299e-06,
"loss": 0.0274,
"num_tokens": 185514489.0,
"step": 2317
},
{
"epoch": 2.890829694323144,
"grad_norm": 0.10249391579856434,
"learning_rate": 1.0322784681838062e-06,
"loss": 0.0266,
"num_tokens": 185595544.0,
"step": 2318
},
{
"epoch": 2.892077354959451,
"grad_norm": 0.1244108209218656,
"learning_rate": 1.0315580286441616e-06,
"loss": 0.027,
"num_tokens": 185675597.0,
"step": 2319
},
{
"epoch": 2.893325015595758,
"grad_norm": 0.11187955851184811,
"learning_rate": 1.0308456917404294e-06,
"loss": 0.0269,
"num_tokens": 185755490.0,
"step": 2320
},
{
"epoch": 2.8945726762320647,
"grad_norm": 0.1118035236277393,
"learning_rate": 1.0301414587642926e-06,
"loss": 0.0267,
"num_tokens": 185833837.0,
"step": 2321
},
{
"epoch": 2.895820336868372,
"grad_norm": 0.11431007300598617,
"learning_rate": 1.029445330992738e-06,
"loss": 0.0285,
"num_tokens": 185915064.0,
"step": 2322
},
{
"epoch": 2.8970679975046787,
"grad_norm": 0.10462947510709711,
"learning_rate": 1.0287573096880566e-06,
"loss": 0.0267,
"num_tokens": 185995076.0,
"step": 2323
},
{
"epoch": 2.8983156581409855,
"grad_norm": 0.11570282963133109,
"learning_rate": 1.028077396097838e-06,
"loss": 0.0269,
"num_tokens": 186074872.0,
"step": 2324
},
{
"epoch": 2.8995633187772927,
"grad_norm": 0.12626629625181418,
"learning_rate": 1.0274055914549708e-06,
"loss": 0.0281,
"num_tokens": 186156148.0,
"step": 2325
},
{
"epoch": 2.9008109794135994,
"grad_norm": 0.11032478054432032,
"learning_rate": 1.0267418969776405e-06,
"loss": 0.0264,
"num_tokens": 186236795.0,
"step": 2326
},
{
"epoch": 2.9020586400499067,
"grad_norm": 0.108031229594549,
"learning_rate": 1.0260863138693264e-06,
"loss": 0.0282,
"num_tokens": 186316351.0,
"step": 2327
},
{
"epoch": 2.9033063006862134,
"grad_norm": 0.11591386223810314,
"learning_rate": 1.0254388433187975e-06,
"loss": 0.0279,
"num_tokens": 186396106.0,
"step": 2328
},
{
"epoch": 2.90455396132252,
"grad_norm": 0.10622785774173073,
"learning_rate": 1.0247994865001147e-06,
"loss": 0.0259,
"num_tokens": 186475119.0,
"step": 2329
},
{
"epoch": 2.905801621958827,
"grad_norm": 0.12280744075841177,
"learning_rate": 1.0241682445726246e-06,
"loss": 0.0279,
"num_tokens": 186556403.0,
"step": 2330
},
{
"epoch": 2.907049282595134,
"grad_norm": 0.12850490414436164,
"learning_rate": 1.0235451186809596e-06,
"loss": 0.0281,
"num_tokens": 186636091.0,
"step": 2331
},
{
"epoch": 2.908296943231441,
"grad_norm": 0.12129199138278511,
"learning_rate": 1.0229301099550352e-06,
"loss": 0.0263,
"num_tokens": 186718680.0,
"step": 2332
},
{
"epoch": 2.909544603867748,
"grad_norm": 0.10134475681869966,
"learning_rate": 1.0223232195100485e-06,
"loss": 0.0267,
"num_tokens": 186797276.0,
"step": 2333
},
{
"epoch": 2.910792264504055,
"grad_norm": 0.10862663209574905,
"learning_rate": 1.0217244484464758e-06,
"loss": 0.0269,
"num_tokens": 186877678.0,
"step": 2334
},
{
"epoch": 2.9120399251403617,
"grad_norm": 0.10595868824858773,
"learning_rate": 1.0211337978500687e-06,
"loss": 0.0269,
"num_tokens": 186956753.0,
"step": 2335
},
{
"epoch": 2.913287585776669,
"grad_norm": 0.11109899490724645,
"learning_rate": 1.0205512687918558e-06,
"loss": 0.0275,
"num_tokens": 187037968.0,
"step": 2336
},
{
"epoch": 2.9145352464129757,
"grad_norm": 0.10985537743814915,
"learning_rate": 1.0199768623281388e-06,
"loss": 0.028,
"num_tokens": 187117786.0,
"step": 2337
},
{
"epoch": 2.9157829070492824,
"grad_norm": 0.11212443137177852,
"learning_rate": 1.0194105795004896e-06,
"loss": 0.0273,
"num_tokens": 187197753.0,
"step": 2338
},
{
"epoch": 2.9170305676855897,
"grad_norm": 0.11357099125731306,
"learning_rate": 1.0188524213357507e-06,
"loss": 0.027,
"num_tokens": 187278814.0,
"step": 2339
},
{
"epoch": 2.9182782283218964,
"grad_norm": 0.11863761901671414,
"learning_rate": 1.0183023888460312e-06,
"loss": 0.0278,
"num_tokens": 187359939.0,
"step": 2340
},
{
"epoch": 2.919525888958203,
"grad_norm": 0.11740995441233278,
"learning_rate": 1.017760483028706e-06,
"loss": 0.0276,
"num_tokens": 187439262.0,
"step": 2341
},
{
"epoch": 2.9207735495945104,
"grad_norm": 0.10961685956776607,
"learning_rate": 1.017226704866415e-06,
"loss": 0.0273,
"num_tokens": 187519426.0,
"step": 2342
},
{
"epoch": 2.922021210230817,
"grad_norm": 0.13193569549980025,
"learning_rate": 1.0167010553270588e-06,
"loss": 0.0275,
"num_tokens": 187599140.0,
"step": 2343
},
{
"epoch": 2.9232688708671244,
"grad_norm": 0.1090653954416858,
"learning_rate": 1.016183535363799e-06,
"loss": 0.0263,
"num_tokens": 187679734.0,
"step": 2344
},
{
"epoch": 2.924516531503431,
"grad_norm": 0.11264135405112981,
"learning_rate": 1.0156741459150556e-06,
"loss": 0.0271,
"num_tokens": 187760016.0,
"step": 2345
},
{
"epoch": 2.925764192139738,
"grad_norm": 0.1127907828444805,
"learning_rate": 1.0151728879045057e-06,
"loss": 0.0273,
"num_tokens": 187840030.0,
"step": 2346
},
{
"epoch": 2.9270118527760447,
"grad_norm": 0.11654407524405933,
"learning_rate": 1.0146797622410813e-06,
"loss": 0.0266,
"num_tokens": 187918803.0,
"step": 2347
},
{
"epoch": 2.928259513412352,
"grad_norm": 0.11309647290317737,
"learning_rate": 1.0141947698189684e-06,
"loss": 0.0272,
"num_tokens": 187998146.0,
"step": 2348
},
{
"epoch": 2.9295071740486587,
"grad_norm": 0.11280706058466917,
"learning_rate": 1.0137179115176055e-06,
"loss": 0.0265,
"num_tokens": 188077340.0,
"step": 2349
},
{
"epoch": 2.930754834684966,
"grad_norm": 0.13076601205501484,
"learning_rate": 1.0132491882016805e-06,
"loss": 0.0262,
"num_tokens": 188157146.0,
"step": 2350
},
{
"epoch": 2.9320024953212727,
"grad_norm": 0.11012339063784357,
"learning_rate": 1.0127886007211298e-06,
"loss": 0.0275,
"num_tokens": 188237197.0,
"step": 2351
},
{
"epoch": 2.9332501559575794,
"grad_norm": 0.11371206516952323,
"learning_rate": 1.0123361499111383e-06,
"loss": 0.0272,
"num_tokens": 188316620.0,
"step": 2352
},
{
"epoch": 2.934497816593886,
"grad_norm": 0.10964761702423996,
"learning_rate": 1.011891836592136e-06,
"loss": 0.0278,
"num_tokens": 188397001.0,
"step": 2353
},
{
"epoch": 2.9357454772301934,
"grad_norm": 0.1117067743384703,
"learning_rate": 1.0114556615697971e-06,
"loss": 0.027,
"num_tokens": 188476537.0,
"step": 2354
},
{
"epoch": 2.9369931378665,
"grad_norm": 0.10532925727660913,
"learning_rate": 1.0110276256350393e-06,
"loss": 0.0265,
"num_tokens": 188554894.0,
"step": 2355
},
{
"epoch": 2.9382407985028074,
"grad_norm": 0.11464738528815109,
"learning_rate": 1.010607729564021e-06,
"loss": 0.0273,
"num_tokens": 188635007.0,
"step": 2356
},
{
"epoch": 2.939488459139114,
"grad_norm": 0.11411346562493145,
"learning_rate": 1.0101959741181396e-06,
"loss": 0.0273,
"num_tokens": 188714832.0,
"step": 2357
},
{
"epoch": 2.940736119775421,
"grad_norm": 0.11297091154604039,
"learning_rate": 1.0097923600440335e-06,
"loss": 0.0266,
"num_tokens": 188794100.0,
"step": 2358
},
{
"epoch": 2.941983780411728,
"grad_norm": 0.11419809264174352,
"learning_rate": 1.0093968880735762e-06,
"loss": 0.0277,
"num_tokens": 188875432.0,
"step": 2359
},
{
"epoch": 2.943231441048035,
"grad_norm": 0.10794783898522096,
"learning_rate": 1.009009558923878e-06,
"loss": 0.027,
"num_tokens": 188954916.0,
"step": 2360
},
{
"epoch": 2.944479101684342,
"grad_norm": 0.10544977589684877,
"learning_rate": 1.0086303732972843e-06,
"loss": 0.0266,
"num_tokens": 189035103.0,
"step": 2361
},
{
"epoch": 2.945726762320649,
"grad_norm": 0.12035892909369736,
"learning_rate": 1.0082593318813728e-06,
"loss": 0.027,
"num_tokens": 189114556.0,
"step": 2362
},
{
"epoch": 2.9469744229569557,
"grad_norm": 0.11275061926774883,
"learning_rate": 1.0078964353489536e-06,
"loss": 0.0268,
"num_tokens": 189194415.0,
"step": 2363
},
{
"epoch": 2.9482220835932624,
"grad_norm": 0.11213359019218412,
"learning_rate": 1.0075416843580687e-06,
"loss": 0.0272,
"num_tokens": 189275904.0,
"step": 2364
},
{
"epoch": 2.9494697442295696,
"grad_norm": 0.11713393130799933,
"learning_rate": 1.0071950795519873e-06,
"loss": 0.0279,
"num_tokens": 189355944.0,
"step": 2365
},
{
"epoch": 2.9507174048658764,
"grad_norm": 0.10806295055772125,
"learning_rate": 1.00685662155921e-06,
"loss": 0.0274,
"num_tokens": 189436064.0,
"step": 2366
},
{
"epoch": 2.9519650655021836,
"grad_norm": 0.11788280963029622,
"learning_rate": 1.0065263109934633e-06,
"loss": 0.0277,
"num_tokens": 189516906.0,
"step": 2367
},
{
"epoch": 2.9532127261384904,
"grad_norm": 0.11439835587046002,
"learning_rate": 1.0062041484536994e-06,
"loss": 0.0292,
"num_tokens": 189597299.0,
"step": 2368
},
{
"epoch": 2.954460386774797,
"grad_norm": 0.11618035546784151,
"learning_rate": 1.0058901345240967e-06,
"loss": 0.0274,
"num_tokens": 189677346.0,
"step": 2369
},
{
"epoch": 2.955708047411104,
"grad_norm": 0.10998886233872446,
"learning_rate": 1.0055842697740576e-06,
"loss": 0.0268,
"num_tokens": 189756509.0,
"step": 2370
},
{
"epoch": 2.956955708047411,
"grad_norm": 0.1102297346203138,
"learning_rate": 1.0052865547582074e-06,
"loss": 0.0273,
"num_tokens": 189837787.0,
"step": 2371
},
{
"epoch": 2.958203368683718,
"grad_norm": 0.11011616094968564,
"learning_rate": 1.004996990016393e-06,
"loss": 0.0278,
"num_tokens": 189916820.0,
"step": 2372
},
{
"epoch": 2.959451029320025,
"grad_norm": 0.1058190186745868,
"learning_rate": 1.0047155760736828e-06,
"loss": 0.0263,
"num_tokens": 189996448.0,
"step": 2373
},
{
"epoch": 2.960698689956332,
"grad_norm": 0.09902032964536187,
"learning_rate": 1.004442313440366e-06,
"loss": 0.0261,
"num_tokens": 190075935.0,
"step": 2374
},
{
"epoch": 2.9619463505926387,
"grad_norm": 0.12154175766740306,
"learning_rate": 1.0041772026119493e-06,
"loss": 0.027,
"num_tokens": 190156058.0,
"step": 2375
},
{
"epoch": 2.963194011228946,
"grad_norm": 0.11633889607609949,
"learning_rate": 1.0039202440691598e-06,
"loss": 0.0271,
"num_tokens": 190236260.0,
"step": 2376
},
{
"epoch": 2.9644416718652526,
"grad_norm": 0.10715021652513443,
"learning_rate": 1.0036714382779405e-06,
"loss": 0.0263,
"num_tokens": 190315801.0,
"step": 2377
},
{
"epoch": 2.9656893325015594,
"grad_norm": 0.11833748571681439,
"learning_rate": 1.0034307856894511e-06,
"loss": 0.0268,
"num_tokens": 190394686.0,
"step": 2378
},
{
"epoch": 2.9669369931378666,
"grad_norm": 0.11390670772540895,
"learning_rate": 1.0031982867400683e-06,
"loss": 0.0274,
"num_tokens": 190474200.0,
"step": 2379
},
{
"epoch": 2.9681846537741734,
"grad_norm": 0.09835116296798709,
"learning_rate": 1.0029739418513825e-06,
"loss": 0.0262,
"num_tokens": 190553083.0,
"step": 2380
},
{
"epoch": 2.96943231441048,
"grad_norm": 0.10781406391922578,
"learning_rate": 1.0027577514301988e-06,
"loss": 0.0264,
"num_tokens": 190632255.0,
"step": 2381
},
{
"epoch": 2.9706799750467874,
"grad_norm": 0.10062575360038996,
"learning_rate": 1.002549715868536e-06,
"loss": 0.0262,
"num_tokens": 190712052.0,
"step": 2382
},
{
"epoch": 2.971927635683094,
"grad_norm": 0.10840533051873137,
"learning_rate": 1.0023498355436255e-06,
"loss": 0.0269,
"num_tokens": 190791575.0,
"step": 2383
},
{
"epoch": 2.9731752963194014,
"grad_norm": 0.10941923321723614,
"learning_rate": 1.0021581108179105e-06,
"loss": 0.026,
"num_tokens": 190870712.0,
"step": 2384
},
{
"epoch": 2.974422956955708,
"grad_norm": 0.10761702827903147,
"learning_rate": 1.0019745420390455e-06,
"loss": 0.027,
"num_tokens": 190951038.0,
"step": 2385
},
{
"epoch": 2.975670617592015,
"grad_norm": 0.11368723139537162,
"learning_rate": 1.001799129539897e-06,
"loss": 0.0276,
"num_tokens": 191030954.0,
"step": 2386
},
{
"epoch": 2.9769182782283217,
"grad_norm": 0.10855427538159527,
"learning_rate": 1.0016318736385406e-06,
"loss": 0.0268,
"num_tokens": 191110413.0,
"step": 2387
},
{
"epoch": 2.978165938864629,
"grad_norm": 0.1068843734320981,
"learning_rate": 1.0014727746382615e-06,
"loss": 0.0259,
"num_tokens": 191189445.0,
"step": 2388
},
{
"epoch": 2.9794135995009356,
"grad_norm": 0.1257994774418835,
"learning_rate": 1.0013218328275544e-06,
"loss": 0.0279,
"num_tokens": 191270715.0,
"step": 2389
},
{
"epoch": 2.980661260137243,
"grad_norm": 0.11714566321070789,
"learning_rate": 1.0011790484801231e-06,
"loss": 0.0281,
"num_tokens": 191350789.0,
"step": 2390
},
{
"epoch": 2.9819089207735496,
"grad_norm": 0.11060106527704666,
"learning_rate": 1.0010444218548777e-06,
"loss": 0.0272,
"num_tokens": 191432051.0,
"step": 2391
},
{
"epoch": 2.9831565814098564,
"grad_norm": 0.11345865119840931,
"learning_rate": 1.0009179531959374e-06,
"loss": 0.0274,
"num_tokens": 191514006.0,
"step": 2392
},
{
"epoch": 2.984404242046163,
"grad_norm": 0.11834001788647562,
"learning_rate": 1.0007996427326282e-06,
"loss": 0.0274,
"num_tokens": 191595045.0,
"step": 2393
},
{
"epoch": 2.9856519026824704,
"grad_norm": 0.11736078471554756,
"learning_rate": 1.0006894906794828e-06,
"loss": 0.0281,
"num_tokens": 191674983.0,
"step": 2394
},
{
"epoch": 2.986899563318777,
"grad_norm": 0.10662448160761655,
"learning_rate": 1.0005874972362403e-06,
"loss": 0.0271,
"num_tokens": 191754802.0,
"step": 2395
},
{
"epoch": 2.9881472239550844,
"grad_norm": 0.10732557554988709,
"learning_rate": 1.000493662587845e-06,
"loss": 0.0265,
"num_tokens": 191833832.0,
"step": 2396
},
{
"epoch": 2.989394884591391,
"grad_norm": 0.11470784626798826,
"learning_rate": 1.0004079869044482e-06,
"loss": 0.0274,
"num_tokens": 191913223.0,
"step": 2397
},
{
"epoch": 2.990642545227698,
"grad_norm": 0.10039798181941487,
"learning_rate": 1.0003304703414053e-06,
"loss": 0.0255,
"num_tokens": 191993666.0,
"step": 2398
},
{
"epoch": 2.991890205864005,
"grad_norm": 0.10361335698424663,
"learning_rate": 1.0002611130392772e-06,
"loss": 0.0263,
"num_tokens": 192074190.0,
"step": 2399
},
{
"epoch": 2.993137866500312,
"grad_norm": 0.10857365381947398,
"learning_rate": 1.0001999151238303e-06,
"loss": 0.0271,
"num_tokens": 192153740.0,
"step": 2400
},
{
"epoch": 2.994385527136619,
"grad_norm": 0.10539286028474344,
"learning_rate": 1.0001468767060341e-06,
"loss": 0.0264,
"num_tokens": 192232957.0,
"step": 2401
},
{
"epoch": 2.995633187772926,
"grad_norm": 0.1103209776499511,
"learning_rate": 1.000101997882064e-06,
"loss": 0.0273,
"num_tokens": 192312730.0,
"step": 2402
},
{
"epoch": 2.9968808484092326,
"grad_norm": 0.11105172947776741,
"learning_rate": 1.0000652787332984e-06,
"loss": 0.0274,
"num_tokens": 192392230.0,
"step": 2403
},
{
"epoch": 2.9981285090455394,
"grad_norm": 0.11159122867179921,
"learning_rate": 1.0000367193263206e-06,
"loss": 0.0275,
"num_tokens": 192471730.0,
"step": 2404
},
{
"epoch": 2.9993761696818466,
"grad_norm": 0.10531323399329573,
"learning_rate": 1.000016319712917e-06,
"loss": 0.0264,
"num_tokens": 192551044.0,
"step": 2405
},
{
"epoch": 3.0,
"grad_norm": 0.10531323399329573,
"learning_rate": 1.0000040799300788e-06,
"loss": 0.0257,
"num_tokens": 192590850.0,
"step": 2406
},
{
"epoch": 3.0,
"step": 2406,
"total_flos": 3.699807586474721e+17,
"train_loss": 0.0587594091221814,
"train_runtime": 5162.578,
"train_samples_per_second": 59.587,
"train_steps_per_second": 0.466
}
],
"logging_steps": 1,
"max_steps": 2406,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.699807586474721e+17,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}